aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile7
-rw-r--r--kernel/acct.c6
-rw-r--r--kernel/audit.c146
-rw-r--r--kernel/audit.h43
-rw-r--r--kernel/audit_tree.c72
-rw-r--r--kernel/audit_watch.c543
-rw-r--r--kernel/auditfilter.c518
-rw-r--r--kernel/auditsc.c33
-rw-r--r--kernel/cgroup.c168
-rw-r--r--kernel/compat.c11
-rw-r--r--kernel/cpu.c13
-rw-r--r--kernel/cpuset.c260
-rw-r--r--kernel/cred.c4
-rw-r--r--kernel/exit.c317
-rw-r--r--kernel/fork.c64
-rw-r--r--kernel/freezer.c7
-rw-r--r--kernel/futex.c1246
-rw-r--r--kernel/futex_compat.c6
-rw-r--r--kernel/gcov/Kconfig48
-rw-r--r--kernel/gcov/Makefile3
-rw-r--r--kernel/gcov/base.c148
-rw-r--r--kernel/gcov/fs.c673
-rw-r--r--kernel/gcov/gcc_3_4.c447
-rw-r--r--kernel/gcov/gcov.h128
-rw-r--r--kernel/groups.c288
-rw-r--r--kernel/hrtimer.c86
-rw-r--r--kernel/irq/Makefile2
-rw-r--r--kernel/irq/chip.c12
-rw-r--r--kernel/irq/handle.c74
-rw-r--r--kernel/irq/internals.h4
-rw-r--r--kernel/irq/manage.c95
-rw-r--r--kernel/irq/migration.c14
-rw-r--r--kernel/irq/numa_migrate.c42
-rw-r--r--kernel/kallsyms.c134
-rw-r--r--kernel/kexec.c16
-rw-r--r--kernel/kfifo.c4
-rw-r--r--kernel/kmod.c1
-rw-r--r--kernel/kprobes.c8
-rw-r--r--kernel/kthread.c97
-rw-r--r--kernel/lockdep.c16
-rw-r--r--kernel/lockdep_proc.c3
-rw-r--r--kernel/module.c117
-rw-r--r--kernel/mutex.c31
-rw-r--r--kernel/nsproxy.c19
-rw-r--r--kernel/panic.c1
-rw-r--r--kernel/params.c46
-rw-r--r--kernel/perf_counter.c4860
-rw-r--r--kernel/pid.c17
-rw-r--r--kernel/pid_namespace.c24
-rw-r--r--kernel/posix-cpu-timers.c7
-rw-r--r--kernel/posix-timers.c7
-rw-r--r--kernel/power/Kconfig4
-rw-r--r--kernel/power/Makefile5
-rw-r--r--kernel/power/hibernate.c (renamed from kernel/power/disk.c)34
-rw-r--r--kernel/power/hibernate_nvs.c135
-rw-r--r--kernel/power/main.c521
-rw-r--r--kernel/power/power.h25
-rw-r--r--kernel/power/poweroff.c2
-rw-r--r--kernel/power/process.c5
-rw-r--r--kernel/power/snapshot.c80
-rw-r--r--kernel/power/suspend.c300
-rw-r--r--kernel/power/suspend_test.c187
-rw-r--r--kernel/power/swsusp.c198
-rw-r--r--kernel/power/user.c1
-rw-r--r--kernel/printk.c33
-rw-r--r--kernel/profile.c19
-rw-r--r--kernel/ptrace.c178
-rw-r--r--kernel/rcupreempt.c8
-rw-r--r--kernel/rcutree.c28
-rw-r--r--kernel/rcutree_trace.c64
-rw-r--r--kernel/res_counter.c12
-rw-r--r--kernel/resource.c2
-rw-r--r--kernel/rtmutex.c248
-rw-r--r--kernel/rtmutex_common.h8
-rw-r--r--kernel/sched.c530
-rw-r--r--kernel/sched_cpupri.c25
-rw-r--r--kernel/sched_debug.c6
-rw-r--r--kernel/sched_fair.c61
-rw-r--r--kernel/sched_idletask.c3
-rw-r--r--kernel/sched_rt.c18
-rw-r--r--kernel/signal.c119
-rw-r--r--kernel/slow-work.c27
-rw-r--r--kernel/smp.c2
-rw-r--r--kernel/softirq.c85
-rw-r--r--kernel/sys.c290
-rw-r--r--kernel/sys_ni.c3
-rw-r--r--kernel/sysctl.c93
-rw-r--r--kernel/time/clockevents.c3
-rw-r--r--kernel/time/clocksource.c23
-rw-r--r--kernel/time/tick-broadcast.c2
-rw-r--r--kernel/time/tick-oneshot.c17
-rw-r--r--kernel/time/tick-sched.c19
-rw-r--r--kernel/time/timekeeping.c9
-rw-r--r--kernel/time/timer_stats.c16
-rw-r--r--kernel/timer.c145
-rw-r--r--kernel/trace/Kconfig167
-rw-r--r--kernel/trace/Makefile20
-rw-r--r--kernel/trace/blktrace.c296
-rw-r--r--kernel/trace/events.c14
-rw-r--r--kernel/trace/ftrace.c884
-rw-r--r--kernel/trace/kmemtrace.c12
-rw-r--r--kernel/trace/ring_buffer.c1043
-rw-r--r--kernel/trace/ring_buffer_benchmark.c419
-rw-r--r--kernel/trace/trace.c449
-rw-r--r--kernel/trace/trace.h254
-rw-r--r--kernel/trace/trace_boot.c5
-rw-r--r--kernel/trace/trace_branch.c8
-rw-r--r--kernel/trace/trace_event_profile.c24
-rw-r--r--kernel/trace/trace_event_types.h15
-rw-r--r--kernel/trace/trace_events.c861
-rw-r--r--kernel/trace/trace_events_filter.c1213
-rw-r--r--kernel/trace/trace_events_stage_1.h39
-rw-r--r--kernel/trace/trace_events_stage_2.h176
-rw-r--r--kernel/trace/trace_events_stage_3.h281
-rw-r--r--kernel/trace/trace_export.c110
-rw-r--r--kernel/trace/trace_functions.c13
-rw-r--r--kernel/trace/trace_functions_graph.c78
-rw-r--r--kernel/trace/trace_hw_branches.c203
-rw-r--r--kernel/trace/trace_mmiotrace.c6
-rw-r--r--kernel/trace/trace_output.c239
-rw-r--r--kernel/trace/trace_output.h34
-rw-r--r--kernel/trace/trace_power.c8
-rw-r--r--kernel/trace/trace_printk.c34
-rw-r--r--kernel/trace/trace_sched_switch.c12
-rw-r--r--kernel/trace/trace_sched_wakeup.c8
-rw-r--r--kernel/trace/trace_selftest.c58
-rw-r--r--kernel/trace/trace_stack.c26
-rw-r--r--kernel/trace/trace_stat.c232
-rw-r--r--kernel/trace/trace_stat.h2
-rw-r--r--kernel/trace/trace_sysprof.c9
-rw-r--r--kernel/trace/trace_workqueue.c25
-rw-r--r--kernel/user.c67
-rw-r--r--kernel/utsname.c13
-rw-r--r--kernel/wait.c7
-rw-r--r--kernel/workqueue.c11
135 files changed, 16499 insertions, 5405 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 42423665660..2093a691f1c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -11,6 +11,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
async.o
+obj-y += groups.o
ifdef CONFIG_FUNCTION_TRACER
# Do not trace debug files and internal ftrace files
@@ -68,8 +69,9 @@ obj-$(CONFIG_IKCONFIG) += configs.o
obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
-obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
+obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o
obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
+obj-$(CONFIG_GCOV_KERNEL) += gcov/
obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_KGDB) += kgdb.o
@@ -93,8 +95,11 @@ obj-$(CONFIG_LATENCYTOP) += latencytop.o
obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
obj-$(CONFIG_FUNCTION_TRACER) += trace/
obj-$(CONFIG_TRACING) += trace/
+obj-$(CONFIG_X86_DS) += trace/
+obj-$(CONFIG_RING_BUFFER) += trace/
obj-$(CONFIG_SMP) += sched_cpupri.o
obj-$(CONFIG_SLOW_WORK) += slow-work.o
+obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/acct.c b/kernel/acct.c
index 7afa3156416..9f3391090b3 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -215,6 +215,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
static int acct_on(char *name)
{
struct file *file;
+ struct vfsmount *mnt;
int error;
struct pid_namespace *ns;
struct bsd_acct_struct *acct = NULL;
@@ -256,11 +257,12 @@ static int acct_on(char *name)
acct = NULL;
}
- mnt_pin(file->f_path.mnt);
+ mnt = file->f_path.mnt;
+ mnt_pin(mnt);
acct_file_reopen(ns->bacct, file, ns);
spin_unlock(&acct_lock);
- mntput(file->f_path.mnt); /* it's pinned, now give up active reference */
+ mntput(mnt); /* it's pinned, now give up active reference */
kfree(acct);
return 0;
diff --git a/kernel/audit.c b/kernel/audit.c
index 9442c3533ba..defc2e6f1e3 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -115,9 +115,6 @@ static atomic_t audit_lost = ATOMIC_INIT(0);
/* The netlink socket. */
static struct sock *audit_sock;
-/* Inotify handle. */
-struct inotify_handle *audit_ih;
-
/* Hash for inode-based rules */
struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
@@ -136,7 +133,7 @@ static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
/* Serialize requests from userspace. */
-static DEFINE_MUTEX(audit_cmd_mutex);
+DEFINE_MUTEX(audit_cmd_mutex);
/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
* audit records. Since printk uses a 1024 byte buffer, this buffer
@@ -375,6 +372,25 @@ static void audit_hold_skb(struct sk_buff *skb)
kfree_skb(skb);
}
+/*
+ * For one reason or another this nlh isn't getting delivered to the userspace
+ * audit daemon, just send it to printk.
+ */
+static void audit_printk_skb(struct sk_buff *skb)
+{
+ struct nlmsghdr *nlh = nlmsg_hdr(skb);
+ char *data = NLMSG_DATA(nlh);
+
+ if (nlh->nlmsg_type != AUDIT_EOE) {
+ if (printk_ratelimit())
+ printk(KERN_NOTICE "type=%d %s\n", nlh->nlmsg_type, data);
+ else
+ audit_log_lost("printk limit exceeded\n");
+ }
+
+ audit_hold_skb(skb);
+}
+
static void kauditd_send_skb(struct sk_buff *skb)
{
int err;
@@ -427,14 +443,8 @@ static int kauditd_thread(void *dummy)
if (skb) {
if (audit_pid)
kauditd_send_skb(skb);
- else {
- if (printk_ratelimit())
- printk(KERN_NOTICE "%s\n", skb->data + NLMSG_SPACE(0));
- else
- audit_log_lost("printk limit exceeded\n");
-
- audit_hold_skb(skb);
- }
+ else
+ audit_printk_skb(skb);
} else {
DECLARE_WAITQUEUE(wait, current);
set_current_state(TASK_INTERRUPTIBLE);
@@ -495,42 +505,25 @@ int audit_send_list(void *_dest)
return 0;
}
-#ifdef CONFIG_AUDIT_TREE
-static int prune_tree_thread(void *unused)
-{
- mutex_lock(&audit_cmd_mutex);
- audit_prune_trees();
- mutex_unlock(&audit_cmd_mutex);
- return 0;
-}
-
-void audit_schedule_prune(void)
-{
- kthread_run(prune_tree_thread, NULL, "audit_prune_tree");
-}
-#endif
-
struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
int multi, void *payload, int size)
{
struct sk_buff *skb;
struct nlmsghdr *nlh;
- int len = NLMSG_SPACE(size);
void *data;
int flags = multi ? NLM_F_MULTI : 0;
int t = done ? NLMSG_DONE : type;
- skb = alloc_skb(len, GFP_KERNEL);
+ skb = nlmsg_new(size, GFP_KERNEL);
if (!skb)
return NULL;
- nlh = NLMSG_PUT(skb, pid, seq, t, size);
- nlh->nlmsg_flags = flags;
- data = NLMSG_DATA(nlh);
+ nlh = NLMSG_NEW(skb, pid, seq, t, size, flags);
+ data = NLMSG_DATA(nlh);
memcpy(data, payload, size);
return skb;
-nlmsg_failure: /* Used by NLMSG_PUT */
+nlmsg_failure: /* Used by NLMSG_NEW */
if (skb)
kfree_skb(skb);
return NULL;
@@ -926,28 +919,29 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
}
/*
- * Get message from skb (based on rtnetlink_rcv_skb). Each message is
- * processed by audit_receive_msg. Malformed skbs with wrong length are
- * discarded silently.
+ * Get message from skb. Each message is processed by audit_receive_msg.
+ * Malformed skbs with wrong length are discarded silently.
*/
static void audit_receive_skb(struct sk_buff *skb)
{
- int err;
- struct nlmsghdr *nlh;
- u32 rlen;
+ struct nlmsghdr *nlh;
+ /*
+ * len MUST be signed for NLMSG_NEXT to be able to dec it below 0
+ * if the nlmsg_len was not aligned
+ */
+ int len;
+ int err;
- while (skb->len >= NLMSG_SPACE(0)) {
- nlh = nlmsg_hdr(skb);
- if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
- return;
- rlen = NLMSG_ALIGN(nlh->nlmsg_len);
- if (rlen > skb->len)
- rlen = skb->len;
- if ((err = audit_receive_msg(skb, nlh))) {
+ nlh = nlmsg_hdr(skb);
+ len = skb->len;
+
+ while (NLMSG_OK(nlh, len)) {
+ err = audit_receive_msg(skb, nlh);
+ /* if err or if this message says it wants a response */
+ if (err || (nlh->nlmsg_flags & NLM_F_ACK))
netlink_ack(skb, nlh, err);
- } else if (nlh->nlmsg_flags & NLM_F_ACK)
- netlink_ack(skb, nlh, 0);
- skb_pull(skb, rlen);
+
+ nlh = NLMSG_NEXT(nlh, len);
}
}
@@ -959,13 +953,6 @@ static void audit_receive(struct sk_buff *skb)
mutex_unlock(&audit_cmd_mutex);
}
-#ifdef CONFIG_AUDITSYSCALL
-static const struct inotify_operations audit_inotify_ops = {
- .handle_event = audit_handle_ievent,
- .destroy_watch = audit_free_parent,
-};
-#endif
-
/* Initialize audit support at boot time. */
static int __init audit_init(void)
{
@@ -991,12 +978,6 @@ static int __init audit_init(void)
audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
-#ifdef CONFIG_AUDITSYSCALL
- audit_ih = inotify_init(&audit_inotify_ops);
- if (IS_ERR(audit_ih))
- audit_panic("cannot initialize inotify handle");
-#endif
-
for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
INIT_LIST_HEAD(&audit_inode_hash[i]);
@@ -1070,18 +1051,20 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
goto err;
}
- ab->skb = alloc_skb(AUDIT_BUFSIZ, gfp_mask);
- if (!ab->skb)
- goto err;
-
ab->ctx = ctx;
ab->gfp_mask = gfp_mask;
- nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0));
- nlh->nlmsg_type = type;
- nlh->nlmsg_flags = 0;
- nlh->nlmsg_pid = 0;
- nlh->nlmsg_seq = 0;
+
+ ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask);
+ if (!ab->skb)
+ goto nlmsg_failure;
+
+ nlh = NLMSG_NEW(ab->skb, 0, 0, type, 0, 0);
+
return ab;
+
+nlmsg_failure: /* Used by NLMSG_NEW */
+ kfree_skb(ab->skb);
+ ab->skb = NULL;
err:
audit_buffer_free(ab);
return NULL;
@@ -1452,6 +1435,15 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
kfree(pathname);
}
+void audit_log_key(struct audit_buffer *ab, char *key)
+{
+ audit_log_format(ab, " key=");
+ if (key)
+ audit_log_untrustedstring(ab, key);
+ else
+ audit_log_format(ab, "(null)");
+}
+
/**
* audit_log_end - end one audit record
* @ab: the audit_buffer
@@ -1475,15 +1467,7 @@ void audit_log_end(struct audit_buffer *ab)
skb_queue_tail(&audit_skb_queue, ab->skb);
wake_up_interruptible(&kauditd_wait);
} else {
- if (nlh->nlmsg_type != AUDIT_EOE) {
- if (printk_ratelimit()) {
- printk(KERN_NOTICE "type=%d %s\n",
- nlh->nlmsg_type,
- ab->skb->data + NLMSG_SPACE(0));
- } else
- audit_log_lost("printk limit exceeded\n");
- }
- audit_hold_skb(ab->skb);
+ audit_printk_skb(ab->skb);
}
ab->skb = NULL;
}
diff --git a/kernel/audit.h b/kernel/audit.h
index 16f18cac661..208687be4f3 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -53,18 +53,7 @@ enum audit_state {
};
/* Rule lists */
-struct audit_parent;
-
-struct audit_watch {
- atomic_t count; /* reference count */
- char *path; /* insertion path */
- dev_t dev; /* associated superblock device */
- unsigned long ino; /* associated inode number */
- struct audit_parent *parent; /* associated parent */
- struct list_head wlist; /* entry in parent->watches list */
- struct list_head rules; /* associated rules */
-};
-
+struct audit_watch;
struct audit_tree;
struct audit_chunk;
@@ -108,19 +97,28 @@ struct audit_netlink_list {
int audit_send_list(void *);
-struct inotify_watch;
-/* Inotify handle */
-extern struct inotify_handle *audit_ih;
-
-extern void audit_free_parent(struct inotify_watch *);
-extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32,
- const char *, struct inode *);
extern int selinux_audit_rule_update(void);
extern struct mutex audit_filter_mutex;
extern void audit_free_rule_rcu(struct rcu_head *);
extern struct list_head audit_filter_list[];
+/* audit watch functions */
+extern unsigned long audit_watch_inode(struct audit_watch *watch);
+extern dev_t audit_watch_dev(struct audit_watch *watch);
+extern void audit_put_watch(struct audit_watch *watch);
+extern void audit_get_watch(struct audit_watch *watch);
+extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op);
+extern int audit_add_watch(struct audit_krule *krule);
+extern void audit_remove_watch(struct audit_watch *watch);
+extern void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list);
+extern void audit_inotify_unregister(struct list_head *in_list);
+extern char *audit_watch_path(struct audit_watch *watch);
+extern struct list_head *audit_watch_rules(struct audit_watch *watch);
+
+extern struct audit_entry *audit_dupe_rule(struct audit_krule *old,
+ struct audit_watch *watch);
+
#ifdef CONFIG_AUDIT_TREE
extern struct audit_chunk *audit_tree_lookup(const struct inode *);
extern void audit_put_chunk(struct audit_chunk *);
@@ -130,10 +128,9 @@ extern int audit_add_tree_rule(struct audit_krule *);
extern int audit_remove_tree_rule(struct audit_krule *);
extern void audit_trim_trees(void);
extern int audit_tag_tree(char *old, char *new);
-extern void audit_schedule_prune(void);
-extern void audit_prune_trees(void);
extern const char *audit_tree_path(struct audit_tree *);
extern void audit_put_tree(struct audit_tree *);
+extern void audit_kill_trees(struct list_head *);
#else
#define audit_remove_tree_rule(rule) BUG()
#define audit_add_tree_rule(rule) -EINVAL
@@ -142,6 +139,7 @@ extern void audit_put_tree(struct audit_tree *);
#define audit_put_tree(tree) (void)0
#define audit_tag_tree(old, new) -EINVAL
#define audit_tree_path(rule) "" /* never called */
+#define audit_kill_trees(list) BUG()
#endif
extern char *audit_unpack_string(void **, size_t *, size_t);
@@ -160,7 +158,10 @@ static inline int audit_signal_info(int sig, struct task_struct *t)
return 0;
}
extern void audit_filter_inodes(struct task_struct *, struct audit_context *);
+extern struct list_head *audit_killed_trees(void);
#else
#define audit_signal_info(s,t) AUDIT_DISABLED
#define audit_filter_inodes(t,c) AUDIT_DISABLED
#endif
+
+extern struct mutex audit_cmd_mutex;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 6e7351739a8..2451dc6f328 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -2,6 +2,7 @@
#include <linux/inotify.h>
#include <linux/namei.h>
#include <linux/mount.h>
+#include <linux/kthread.h>
struct audit_tree;
struct audit_chunk;
@@ -441,13 +442,11 @@ static void kill_rules(struct audit_tree *tree)
if (rule->tree) {
/* not a half-baked one */
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
- audit_log_format(ab, "op=remove rule dir=");
+ audit_log_format(ab, "op=");
+ audit_log_string(ab, "remove rule");
+ audit_log_format(ab, " dir=");
audit_log_untrustedstring(ab, rule->tree->pathname);
- if (rule->filterkey) {
- audit_log_format(ab, " key=");
- audit_log_untrustedstring(ab, rule->filterkey);
- } else
- audit_log_format(ab, " key=(null)");
+ audit_log_key(ab, rule->filterkey);
audit_log_format(ab, " list=%d res=1", rule->listnr);
audit_log_end(ab);
rule->tree = NULL;
@@ -519,6 +518,8 @@ static void trim_marked(struct audit_tree *tree)
}
}
+static void audit_schedule_prune(void);
+
/* called with audit_filter_mutex */
int audit_remove_tree_rule(struct audit_krule *rule)
{
@@ -568,7 +569,7 @@ void audit_trim_trees(void)
if (err)
goto skip_it;
- root_mnt = collect_mounts(path.mnt, path.dentry);
+ root_mnt = collect_mounts(&path);
path_put(&path);
if (!root_mnt)
goto skip_it;
@@ -660,7 +661,7 @@ int audit_add_tree_rule(struct audit_krule *rule)
err = kern_path(tree->pathname, 0, &path);
if (err)
goto Err;
- mnt = collect_mounts(path.mnt, path.dentry);
+ mnt = collect_mounts(&path);
path_put(&path);
if (!mnt) {
err = -ENOMEM;
@@ -720,7 +721,7 @@ int audit_tag_tree(char *old, char *new)
err = kern_path(new, 0, &path);
if (err)
return err;
- tagged = collect_mounts(path.mnt, path.dentry);
+ tagged = collect_mounts(&path);
path_put(&path);
if (!tagged)
return -ENOMEM;
@@ -824,10 +825,11 @@ int audit_tag_tree(char *old, char *new)
/*
* That gets run when evict_chunk() ends up needing to kill audit_tree.
- * Runs from a separate thread, with audit_cmd_mutex held.
+ * Runs from a separate thread.
*/
-void audit_prune_trees(void)
+static int prune_tree_thread(void *unused)
{
+ mutex_lock(&audit_cmd_mutex);
mutex_lock(&audit_filter_mutex);
while (!list_empty(&prune_list)) {
@@ -844,6 +846,40 @@ void audit_prune_trees(void)
}
mutex_unlock(&audit_filter_mutex);
+ mutex_unlock(&audit_cmd_mutex);
+ return 0;
+}
+
+static void audit_schedule_prune(void)
+{
+ kthread_run(prune_tree_thread, NULL, "audit_prune_tree");
+}
+
+/*
+ * ... and that one is done if evict_chunk() decides to delay until the end
+ * of syscall. Runs synchronously.
+ */
+void audit_kill_trees(struct list_head *list)
+{
+ mutex_lock(&audit_cmd_mutex);
+ mutex_lock(&audit_filter_mutex);
+
+ while (!list_empty(list)) {
+ struct audit_tree *victim;
+
+ victim = list_entry(list->next, struct audit_tree, list);
+ kill_rules(victim);
+ list_del_init(&victim->list);
+
+ mutex_unlock(&audit_filter_mutex);
+
+ prune_one(victim);
+
+ mutex_lock(&audit_filter_mutex);
+ }
+
+ mutex_unlock(&audit_filter_mutex);
+ mutex_unlock(&audit_cmd_mutex);
}
/*
@@ -854,6 +890,8 @@ void audit_prune_trees(void)
static void evict_chunk(struct audit_chunk *chunk)
{
struct audit_tree *owner;
+ struct list_head *postponed = audit_killed_trees();
+ int need_prune = 0;
int n;
if (chunk->dead)
@@ -869,15 +907,21 @@ static void evict_chunk(struct audit_chunk *chunk)
owner->root = NULL;
list_del_init(&owner->same_root);
spin_unlock(&hash_lock);
- kill_rules(owner);
- list_move(&owner->list, &prune_list);
- audit_schedule_prune();
+ if (!postponed) {
+ kill_rules(owner);
+ list_move(&owner->list, &prune_list);
+ need_prune = 1;
+ } else {
+ list_move(&owner->list, postponed);
+ }
spin_lock(&hash_lock);
}
list_del_rcu(&chunk->hash);
for (n = 0; n < chunk->count; n++)
list_del_init(&chunk->owners[n].list);
spin_unlock(&hash_lock);
+ if (need_prune)
+ audit_schedule_prune();
mutex_unlock(&audit_filter_mutex);
}
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
new file mode 100644
index 00000000000..0e96dbc60ea
--- /dev/null
+++ b/kernel/audit_watch.c
@@ -0,0 +1,543 @@
+/* audit_watch.c -- watching inodes
+ *
+ * Copyright 2003-2009 Red Hat, Inc.
+ * Copyright 2005 Hewlett-Packard Development Company, L.P.
+ * Copyright 2005 IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/audit.h>
+#include <linux/kthread.h>
+#include <linux/mutex.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/netlink.h>
+#include <linux/sched.h>
+#include <linux/inotify.h>
+#include <linux/security.h>
+#include "audit.h"
+
+/*
+ * Reference counting:
+ *
+ * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED
+ * event. Each audit_watch holds a reference to its associated parent.
+ *
+ * audit_watch: if added to lists, lifetime is from audit_init_watch() to
+ * audit_remove_watch(). Additionally, an audit_watch may exist
+ * temporarily to assist in searching existing filter data. Each
+ * audit_krule holds a reference to its associated watch.
+ */
+
+struct audit_watch {
+ atomic_t count; /* reference count */
+ char *path; /* insertion path */
+ dev_t dev; /* associated superblock device */
+ unsigned long ino; /* associated inode number */
+ struct audit_parent *parent; /* associated parent */
+ struct list_head wlist; /* entry in parent->watches list */
+ struct list_head rules; /* associated rules */
+};
+
+struct audit_parent {
+ struct list_head ilist; /* entry in inotify registration list */
+ struct list_head watches; /* associated watches */
+ struct inotify_watch wdata; /* inotify watch data */
+ unsigned flags; /* status flags */
+};
+
+/* Inotify handle. */
+struct inotify_handle *audit_ih;
+
+/*
+ * audit_parent status flags:
+ *
+ * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
+ * a filesystem event to ensure we're adding audit watches to a valid parent.
+ * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
+ * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
+ * we can receive while holding nameidata.
+ */
+#define AUDIT_PARENT_INVALID 0x001
+
+/* Inotify events we care about. */
+#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
+
+static void audit_free_parent(struct inotify_watch *i_watch)
+{
+ struct audit_parent *parent;
+
+ parent = container_of(i_watch, struct audit_parent, wdata);
+ WARN_ON(!list_empty(&parent->watches));
+ kfree(parent);
+}
+
+void audit_get_watch(struct audit_watch *watch)
+{
+ atomic_inc(&watch->count);
+}
+
+void audit_put_watch(struct audit_watch *watch)
+{
+ if (atomic_dec_and_test(&watch->count)) {
+ WARN_ON(watch->parent);
+ WARN_ON(!list_empty(&watch->rules));
+ kfree(watch->path);
+ kfree(watch);
+ }
+}
+
+void audit_remove_watch(struct audit_watch *watch)
+{
+ list_del(&watch->wlist);
+ put_inotify_watch(&watch->parent->wdata);
+ watch->parent = NULL;
+ audit_put_watch(watch); /* match initial get */
+}
+
+char *audit_watch_path(struct audit_watch *watch)
+{
+ return watch->path;
+}
+
+struct list_head *audit_watch_rules(struct audit_watch *watch)
+{
+ return &watch->rules;
+}
+
+unsigned long audit_watch_inode(struct audit_watch *watch)
+{
+ return watch->ino;
+}
+
+dev_t audit_watch_dev(struct audit_watch *watch)
+{
+ return watch->dev;
+}
+
+/* Initialize a parent watch entry. */
+static struct audit_parent *audit_init_parent(struct nameidata *ndp)
+{
+ struct audit_parent *parent;
+ s32 wd;
+
+ parent = kzalloc(sizeof(*parent), GFP_KERNEL);
+ if (unlikely(!parent))
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&parent->watches);
+ parent->flags = 0;
+
+ inotify_init_watch(&parent->wdata);
+ /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
+ get_inotify_watch(&parent->wdata);
+ wd = inotify_add_watch(audit_ih, &parent->wdata,
+ ndp->path.dentry->d_inode, AUDIT_IN_WATCH);
+ if (wd < 0) {
+ audit_free_parent(&parent->wdata);
+ return ERR_PTR(wd);
+ }
+
+ return parent;
+}
+
+/* Initialize a watch entry. */
+static struct audit_watch *audit_init_watch(char *path)
+{
+ struct audit_watch *watch;
+
+ watch = kzalloc(sizeof(*watch), GFP_KERNEL);
+ if (unlikely(!watch))
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&watch->rules);
+ atomic_set(&watch->count, 1);
+ watch->path = path;
+ watch->dev = (dev_t)-1;
+ watch->ino = (unsigned long)-1;
+
+ return watch;
+}
+
+/* Translate a watch string to kernel respresentation. */
+int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
+{
+ struct audit_watch *watch;
+
+ if (!audit_ih)
+ return -EOPNOTSUPP;
+
+ if (path[0] != '/' || path[len-1] == '/' ||
+ krule->listnr != AUDIT_FILTER_EXIT ||
+ op != Audit_equal ||
+ krule->inode_f || krule->watch || krule->tree)
+ return -EINVAL;
+
+ watch = audit_init_watch(path);
+ if (IS_ERR(watch))
+ return PTR_ERR(watch);
+
+ audit_get_watch(watch);
+ krule->watch = watch;
+
+ return 0;
+}
+
+/* Duplicate the given audit watch. The new watch's rules list is initialized
+ * to an empty list and wlist is undefined. */
+static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
+{
+ char *path;
+ struct audit_watch *new;
+
+ path = kstrdup(old->path, GFP_KERNEL);
+ if (unlikely(!path))
+ return ERR_PTR(-ENOMEM);
+
+ new = audit_init_watch(path);
+ if (IS_ERR(new)) {
+ kfree(path);
+ goto out;
+ }
+
+ new->dev = old->dev;
+ new->ino = old->ino;
+ get_inotify_watch(&old->parent->wdata);
+ new->parent = old->parent;
+
+out:
+ return new;
+}
+
+static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watch *w, char *op)
+{
+ if (audit_enabled) {
+ struct audit_buffer *ab;
+ ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
+ audit_log_format(ab, "auid=%u ses=%u op=",
+ audit_get_loginuid(current),
+ audit_get_sessionid(current));
+ audit_log_string(ab, op);
+ audit_log_format(ab, " path=");
+ audit_log_untrustedstring(ab, w->path);
+ audit_log_key(ab, r->filterkey);
+ audit_log_format(ab, " list=%d res=1", r->listnr);
+ audit_log_end(ab);
+ }
+}
+
+/* Update inode info in audit rules based on filesystem event. */
+static void audit_update_watch(struct audit_parent *parent,
+ const char *dname, dev_t dev,
+ unsigned long ino, unsigned invalidating)
+{
+ struct audit_watch *owatch, *nwatch, *nextw;
+ struct audit_krule *r, *nextr;
+ struct audit_entry *oentry, *nentry;
+
+ mutex_lock(&audit_filter_mutex);
+ list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
+ if (audit_compare_dname_path(dname, owatch->path, NULL))
+ continue;
+
+ /* If the update involves invalidating rules, do the inode-based
+ * filtering now, so we don't omit records. */
+ if (invalidating && current->audit_context)
+ audit_filter_inodes(current, current->audit_context);
+
+ nwatch = audit_dupe_watch(owatch);
+ if (IS_ERR(nwatch)) {
+ mutex_unlock(&audit_filter_mutex);
+ audit_panic("error updating watch, skipping");
+ return;
+ }
+ nwatch->dev = dev;
+ nwatch->ino = ino;
+
+ list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) {
+
+ oentry = container_of(r, struct audit_entry, rule);
+ list_del(&oentry->rule.rlist);
+ list_del_rcu(&oentry->list);
+
+ nentry = audit_dupe_rule(&oentry->rule, nwatch);
+ if (IS_ERR(nentry)) {
+ list_del(&oentry->rule.list);
+ audit_panic("error updating watch, removing");
+ } else {
+ int h = audit_hash_ino((u32)ino);
+ list_add(&nentry->rule.rlist, &nwatch->rules);
+ list_add_rcu(&nentry->list, &audit_inode_hash[h]);
+ list_replace(&oentry->rule.list,
+ &nentry->rule.list);
+ }
+
+ audit_watch_log_rule_change(r, owatch, "updated rules");
+
+ call_rcu(&oentry->rcu, audit_free_rule_rcu);
+ }
+
+ audit_remove_watch(owatch);
+ goto add_watch_to_parent; /* event applies to a single watch */
+ }
+ mutex_unlock(&audit_filter_mutex);
+ return;
+
+add_watch_to_parent:
+ list_add(&nwatch->wlist, &parent->watches);
+ mutex_unlock(&audit_filter_mutex);
+ return;
+}
+
+/* Remove all watches & rules associated with a parent that is going away. */
+static void audit_remove_parent_watches(struct audit_parent *parent)
+{
+ struct audit_watch *w, *nextw;
+ struct audit_krule *r, *nextr;
+ struct audit_entry *e;
+
+ mutex_lock(&audit_filter_mutex);
+ parent->flags |= AUDIT_PARENT_INVALID;
+ list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
+ list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
+ e = container_of(r, struct audit_entry, rule);
+ audit_watch_log_rule_change(r, w, "remove rule");
+ list_del(&r->rlist);
+ list_del(&r->list);
+ list_del_rcu(&e->list);
+ call_rcu(&e->rcu, audit_free_rule_rcu);
+ }
+ audit_remove_watch(w);
+ }
+ mutex_unlock(&audit_filter_mutex);
+}
+
+/* Unregister inotify watches for parents on in_list.
+ * Generates an IN_IGNORED event. */
+void audit_inotify_unregister(struct list_head *in_list)
+{
+ struct audit_parent *p, *n;
+
+ list_for_each_entry_safe(p, n, in_list, ilist) {
+ list_del(&p->ilist);
+ inotify_rm_watch(audit_ih, &p->wdata);
+ /* the unpin matching the pin in audit_do_del_rule() */
+ unpin_inotify_watch(&p->wdata);
+ }
+}
+
+/* Get path information necessary for adding watches. */
+static int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw)
+{
+ struct nameidata *ndparent, *ndwatch;
+ int err;
+
+ ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL);
+ if (unlikely(!ndparent))
+ return -ENOMEM;
+
+ ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL);
+ if (unlikely(!ndwatch)) {
+ kfree(ndparent);
+ return -ENOMEM;
+ }
+
+ err = path_lookup(path, LOOKUP_PARENT, ndparent);
+ if (err) {
+ kfree(ndparent);
+ kfree(ndwatch);
+ return err;
+ }
+
+ err = path_lookup(path, 0, ndwatch);
+ if (err) {
+ kfree(ndwatch);
+ ndwatch = NULL;
+ }
+
+ *ndp = ndparent;
+ *ndw = ndwatch;
+
+ return 0;
+}
+
+/* Release resources used for watch path information. */
+static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
+{
+ if (ndp) {
+ path_put(&ndp->path);
+ kfree(ndp);
+ }
+ if (ndw) {
+ path_put(&ndw->path);
+ kfree(ndw);
+ }
+}
+
+/* Associate the given rule with an existing parent inotify_watch.
+ * Caller must hold audit_filter_mutex. */
+static void audit_add_to_parent(struct audit_krule *krule,
+ struct audit_parent *parent)
+{
+ struct audit_watch *w, *watch = krule->watch;
+ int watch_found = 0;
+
+ list_for_each_entry(w, &parent->watches, wlist) {
+ if (strcmp(watch->path, w->path))
+ continue;
+
+ watch_found = 1;
+
+ /* put krule's and initial refs to temporary watch */
+ audit_put_watch(watch);
+ audit_put_watch(watch);
+
+ audit_get_watch(w);
+ krule->watch = watch = w;
+ break;
+ }
+
+ if (!watch_found) {
+ get_inotify_watch(&parent->wdata);
+ watch->parent = parent;
+
+ list_add(&watch->wlist, &parent->watches);
+ }
+ list_add(&krule->rlist, &watch->rules);
+}
+
+/* Find a matching watch entry, or add this one.
+ * Caller must hold audit_filter_mutex. */
+int audit_add_watch(struct audit_krule *krule)
+{
+ struct audit_watch *watch = krule->watch;
+ struct inotify_watch *i_watch;
+ struct audit_parent *parent;
+ struct nameidata *ndp = NULL, *ndw = NULL;
+ int ret = 0;
+
+ mutex_unlock(&audit_filter_mutex);
+
+ /* Avoid calling path_lookup under audit_filter_mutex. */
+ ret = audit_get_nd(watch->path, &ndp, &ndw);
+ if (ret) {
+ /* caller expects mutex locked */
+ mutex_lock(&audit_filter_mutex);
+ goto error;
+ }
+
+ /* update watch filter fields */
+ if (ndw) {
+ watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
+ watch->ino = ndw->path.dentry->d_inode->i_ino;
+ }
+
+ /* The audit_filter_mutex must not be held during inotify calls because
+ * we hold it during inotify event callback processing. If an existing
+ * inotify watch is found, inotify_find_watch() grabs a reference before
+ * returning.
+ */
+ if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
+ &i_watch) < 0) {
+ parent = audit_init_parent(ndp);
+ if (IS_ERR(parent)) {
+ /* caller expects mutex locked */
+ mutex_lock(&audit_filter_mutex);
+ ret = PTR_ERR(parent);
+ goto error;
+ }
+ } else
+ parent = container_of(i_watch, struct audit_parent, wdata);
+
+ mutex_lock(&audit_filter_mutex);
+
+ /* parent was moved before we took audit_filter_mutex */
+ if (parent->flags & AUDIT_PARENT_INVALID)
+ ret = -ENOENT;
+ else
+ audit_add_to_parent(krule, parent);
+
+ /* match get in audit_init_parent or inotify_find_watch */
+ put_inotify_watch(&parent->wdata);
+
+error:
+ audit_put_nd(ndp, ndw); /* NULL args OK */
+ return ret;
+
+}
+
+void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list)
+{
+ struct audit_watch *watch = krule->watch;
+ struct audit_parent *parent = watch->parent;
+
+ list_del(&krule->rlist);
+
+ if (list_empty(&watch->rules)) {
+ audit_remove_watch(watch);
+
+ if (list_empty(&parent->watches)) {
+ /* Put parent on the inotify un-registration
+ * list. Grab a reference before releasing
+ * audit_filter_mutex, to be released in
+ * audit_inotify_unregister().
+ * If filesystem is going away, just leave
+ * the sucker alone, eviction will take
+ * care of it. */
+ if (pin_inotify_watch(&parent->wdata))
+ list_add(&parent->ilist, list);
+ }
+ }
+}
+
+/* Update watch data in audit rules based on inotify events. */
+static void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask,
+ u32 cookie, const char *dname, struct inode *inode)
+{
+ struct audit_parent *parent;
+
+ parent = container_of(i_watch, struct audit_parent, wdata);
+
+ if (mask & (IN_CREATE|IN_MOVED_TO) && inode)
+ audit_update_watch(parent, dname, inode->i_sb->s_dev,
+ inode->i_ino, 0);
+ else if (mask & (IN_DELETE|IN_MOVED_FROM))
+ audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
+ /* inotify automatically removes the watch and sends IN_IGNORED */
+ else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
+ audit_remove_parent_watches(parent);
+ /* inotify does not remove the watch, so remove it manually */
+ else if(mask & IN_MOVE_SELF) {
+ audit_remove_parent_watches(parent);
+ inotify_remove_watch_locked(audit_ih, i_watch);
+ } else if (mask & IN_IGNORED)
+ put_inotify_watch(i_watch);
+}
+
+static const struct inotify_operations audit_inotify_ops = {
+ .handle_event = audit_handle_ievent,
+ .destroy_watch = audit_free_parent,
+};
+
+static int __init audit_watch_init(void)
+{
+ audit_ih = inotify_init(&audit_inotify_ops);
+ if (IS_ERR(audit_ih))
+ audit_panic("cannot initialize inotify handle");
+ return 0;
+}
+subsys_initcall(audit_watch_init);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 713098ee5a0..a70604047f3 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -27,7 +27,6 @@
#include <linux/namei.h>
#include <linux/netlink.h>
#include <linux/sched.h>
-#include <linux/inotify.h>
#include <linux/security.h>
#include "audit.h"
@@ -44,36 +43,6 @@
* be written directly provided audit_filter_mutex is held.
*/
-/*
- * Reference counting:
- *
- * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED
- * event. Each audit_watch holds a reference to its associated parent.
- *
- * audit_watch: if added to lists, lifetime is from audit_init_watch() to
- * audit_remove_watch(). Additionally, an audit_watch may exist
- * temporarily to assist in searching existing filter data. Each
- * audit_krule holds a reference to its associated watch.
- */
-
-struct audit_parent {
- struct list_head ilist; /* entry in inotify registration list */
- struct list_head watches; /* associated watches */
- struct inotify_watch wdata; /* inotify watch data */
- unsigned flags; /* status flags */
-};
-
-/*
- * audit_parent status flags:
- *
- * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
- * a filesystem event to ensure we're adding audit watches to a valid parent.
- * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
- * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
- * we can receive while holding nameidata.
- */
-#define AUDIT_PARENT_INVALID 0x001
-
/* Audit filter lists, defined in <linux/audit.h> */
struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
LIST_HEAD_INIT(audit_filter_list[0]),
@@ -97,41 +66,6 @@ static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = {
DEFINE_MUTEX(audit_filter_mutex);
-/* Inotify events we care about. */
-#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
-
-void audit_free_parent(struct inotify_watch *i_watch)
-{
- struct audit_parent *parent;
-
- parent = container_of(i_watch, struct audit_parent, wdata);
- WARN_ON(!list_empty(&parent->watches));
- kfree(parent);
-}
-
-static inline void audit_get_watch(struct audit_watch *watch)
-{
- atomic_inc(&watch->count);
-}
-
-static void audit_put_watch(struct audit_watch *watch)
-{
- if (atomic_dec_and_test(&watch->count)) {
- WARN_ON(watch->parent);
- WARN_ON(!list_empty(&watch->rules));
- kfree(watch->path);
- kfree(watch);
- }
-}
-
-static void audit_remove_watch(struct audit_watch *watch)
-{
- list_del(&watch->wlist);
- put_inotify_watch(&watch->parent->wdata);
- watch->parent = NULL;
- audit_put_watch(watch); /* match initial get */
-}
-
static inline void audit_free_rule(struct audit_entry *e)
{
int i;
@@ -156,50 +90,6 @@ void audit_free_rule_rcu(struct rcu_head *head)
audit_free_rule(e);
}
-/* Initialize a parent watch entry. */
-static struct audit_parent *audit_init_parent(struct nameidata *ndp)
-{
- struct audit_parent *parent;
- s32 wd;
-
- parent = kzalloc(sizeof(*parent), GFP_KERNEL);
- if (unlikely(!parent))
- return ERR_PTR(-ENOMEM);
-
- INIT_LIST_HEAD(&parent->watches);
- parent->flags = 0;
-
- inotify_init_watch(&parent->wdata);
- /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
- get_inotify_watch(&parent->wdata);
- wd = inotify_add_watch(audit_ih, &parent->wdata,
- ndp->path.dentry->d_inode, AUDIT_IN_WATCH);
- if (wd < 0) {
- audit_free_parent(&parent->wdata);
- return ERR_PTR(wd);
- }
-
- return parent;
-}
-
-/* Initialize a watch entry. */
-static struct audit_watch *audit_init_watch(char *path)
-{
- struct audit_watch *watch;
-
- watch = kzalloc(sizeof(*watch), GFP_KERNEL);
- if (unlikely(!watch))
- return ERR_PTR(-ENOMEM);
-
- INIT_LIST_HEAD(&watch->rules);
- atomic_set(&watch->count, 1);
- watch->path = path;
- watch->dev = (dev_t)-1;
- watch->ino = (unsigned long)-1;
-
- return watch;
-}
-
/* Initialize an audit filterlist entry. */
static inline struct audit_entry *audit_init_entry(u32 field_count)
{
@@ -260,31 +150,6 @@ static inline int audit_to_inode(struct audit_krule *krule,
return 0;
}
-/* Translate a watch string to kernel respresentation. */
-static int audit_to_watch(struct audit_krule *krule, char *path, int len,
- u32 op)
-{
- struct audit_watch *watch;
-
- if (!audit_ih)
- return -EOPNOTSUPP;
-
- if (path[0] != '/' || path[len-1] == '/' ||
- krule->listnr != AUDIT_FILTER_EXIT ||
- op != Audit_equal ||
- krule->inode_f || krule->watch || krule->tree)
- return -EINVAL;
-
- watch = audit_init_watch(path);
- if (IS_ERR(watch))
- return PTR_ERR(watch);
-
- audit_get_watch(watch);
- krule->watch = watch;
-
- return 0;
-}
-
static __u32 *classes[AUDIT_SYSCALL_CLASSES];
int __init audit_register_class(int class, unsigned *list)
@@ -766,7 +631,8 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
break;
case AUDIT_WATCH:
data->buflen += data->values[i] =
- audit_pack_string(&bufp, krule->watch->path);
+ audit_pack_string(&bufp,
+ audit_watch_path(krule->watch));
break;
case AUDIT_DIR:
data->buflen += data->values[i] =
@@ -818,7 +684,8 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
return 1;
break;
case AUDIT_WATCH:
- if (strcmp(a->watch->path, b->watch->path))
+ if (strcmp(audit_watch_path(a->watch),
+ audit_watch_path(b->watch)))
return 1;
break;
case AUDIT_DIR:
@@ -844,32 +711,6 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
return 0;
}
-/* Duplicate the given audit watch. The new watch's rules list is initialized
- * to an empty list and wlist is undefined. */
-static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
-{
- char *path;
- struct audit_watch *new;
-
- path = kstrdup(old->path, GFP_KERNEL);
- if (unlikely(!path))
- return ERR_PTR(-ENOMEM);
-
- new = audit_init_watch(path);
- if (IS_ERR(new)) {
- kfree(path);
- goto out;
- }
-
- new->dev = old->dev;
- new->ino = old->ino;
- get_inotify_watch(&old->parent->wdata);
- new->parent = old->parent;
-
-out:
- return new;
-}
-
/* Duplicate LSM field information. The lsm_rule is opaque, so must be
* re-initialized. */
static inline int audit_dupe_lsm_field(struct audit_field *df,
@@ -904,8 +745,8 @@ static inline int audit_dupe_lsm_field(struct audit_field *df,
* rule with the new rule in the filterlist, then free the old rule.
* The rlist element is undefined; list manipulations are handled apart from
* the initial copy. */
-static struct audit_entry *audit_dupe_rule(struct audit_krule *old,
- struct audit_watch *watch)
+struct audit_entry *audit_dupe_rule(struct audit_krule *old,
+ struct audit_watch *watch)
{
u32 fcount = old->field_count;
struct audit_entry *entry;
@@ -977,137 +818,6 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old,
return entry;
}
-/* Update inode info in audit rules based on filesystem event. */
-static void audit_update_watch(struct audit_parent *parent,
- const char *dname, dev_t dev,
- unsigned long ino, unsigned invalidating)
-{
- struct audit_watch *owatch, *nwatch, *nextw;
- struct audit_krule *r, *nextr;
- struct audit_entry *oentry, *nentry;
-
- mutex_lock(&audit_filter_mutex);
- list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
- if (audit_compare_dname_path(dname, owatch->path, NULL))
- continue;
-
- /* If the update involves invalidating rules, do the inode-based
- * filtering now, so we don't omit records. */
- if (invalidating && current->audit_context)
- audit_filter_inodes(current, current->audit_context);
-
- nwatch = audit_dupe_watch(owatch);
- if (IS_ERR(nwatch)) {
- mutex_unlock(&audit_filter_mutex);
- audit_panic("error updating watch, skipping");
- return;
- }
- nwatch->dev = dev;
- nwatch->ino = ino;
-
- list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) {
-
- oentry = container_of(r, struct audit_entry, rule);
- list_del(&oentry->rule.rlist);
- list_del_rcu(&oentry->list);
-
- nentry = audit_dupe_rule(&oentry->rule, nwatch);
- if (IS_ERR(nentry)) {
- list_del(&oentry->rule.list);
- audit_panic("error updating watch, removing");
- } else {
- int h = audit_hash_ino((u32)ino);
- list_add(&nentry->rule.rlist, &nwatch->rules);
- list_add_rcu(&nentry->list, &audit_inode_hash[h]);
- list_replace(&oentry->rule.list,
- &nentry->rule.list);
- }
-
- call_rcu(&oentry->rcu, audit_free_rule_rcu);
- }
-
- if (audit_enabled) {
- struct audit_buffer *ab;
- ab = audit_log_start(NULL, GFP_NOFS,
- AUDIT_CONFIG_CHANGE);
- audit_log_format(ab, "auid=%u ses=%u",
- audit_get_loginuid(current),
- audit_get_sessionid(current));
- audit_log_format(ab,
- " op=updated rules specifying path=");
- audit_log_untrustedstring(ab, owatch->path);
- audit_log_format(ab, " with dev=%u ino=%lu\n",
- dev, ino);
- audit_log_format(ab, " list=%d res=1", r->listnr);
- audit_log_end(ab);
- }
- audit_remove_watch(owatch);
- goto add_watch_to_parent; /* event applies to a single watch */
- }
- mutex_unlock(&audit_filter_mutex);
- return;
-
-add_watch_to_parent:
- list_add(&nwatch->wlist, &parent->watches);
- mutex_unlock(&audit_filter_mutex);
- return;
-}
-
-/* Remove all watches & rules associated with a parent that is going away. */
-static void audit_remove_parent_watches(struct audit_parent *parent)
-{
- struct audit_watch *w, *nextw;
- struct audit_krule *r, *nextr;
- struct audit_entry *e;
-
- mutex_lock(&audit_filter_mutex);
- parent->flags |= AUDIT_PARENT_INVALID;
- list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
- list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
- e = container_of(r, struct audit_entry, rule);
- if (audit_enabled) {
- struct audit_buffer *ab;
- ab = audit_log_start(NULL, GFP_NOFS,
- AUDIT_CONFIG_CHANGE);
- audit_log_format(ab, "auid=%u ses=%u",
- audit_get_loginuid(current),
- audit_get_sessionid(current));
- audit_log_format(ab, " op=remove rule path=");
- audit_log_untrustedstring(ab, w->path);
- if (r->filterkey) {
- audit_log_format(ab, " key=");
- audit_log_untrustedstring(ab,
- r->filterkey);
- } else
- audit_log_format(ab, " key=(null)");
- audit_log_format(ab, " list=%d res=1",
- r->listnr);
- audit_log_end(ab);
- }
- list_del(&r->rlist);
- list_del(&r->list);
- list_del_rcu(&e->list);
- call_rcu(&e->rcu, audit_free_rule_rcu);
- }
- audit_remove_watch(w);
- }
- mutex_unlock(&audit_filter_mutex);
-}
-
-/* Unregister inotify watches for parents on in_list.
- * Generates an IN_IGNORED event. */
-static void audit_inotify_unregister(struct list_head *in_list)
-{
- struct audit_parent *p, *n;
-
- list_for_each_entry_safe(p, n, in_list, ilist) {
- list_del(&p->ilist);
- inotify_rm_watch(audit_ih, &p->wdata);
- /* the unpin matching the pin in audit_do_del_rule() */
- unpin_inotify_watch(&p->wdata);
- }
-}
-
/* Find an existing audit rule.
* Caller must hold audit_filter_mutex to prevent stale rule data. */
static struct audit_entry *audit_find_rule(struct audit_entry *entry,
@@ -1145,134 +855,6 @@ out:
return found;
}
-/* Get path information necessary for adding watches. */
-static int audit_get_nd(char *path, struct nameidata **ndp,
- struct nameidata **ndw)
-{
- struct nameidata *ndparent, *ndwatch;
- int err;
-
- ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL);
- if (unlikely(!ndparent))
- return -ENOMEM;
-
- ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL);
- if (unlikely(!ndwatch)) {
- kfree(ndparent);
- return -ENOMEM;
- }
-
- err = path_lookup(path, LOOKUP_PARENT, ndparent);
- if (err) {
- kfree(ndparent);
- kfree(ndwatch);
- return err;
- }
-
- err = path_lookup(path, 0, ndwatch);
- if (err) {
- kfree(ndwatch);
- ndwatch = NULL;
- }
-
- *ndp = ndparent;
- *ndw = ndwatch;
-
- return 0;
-}
-
-/* Release resources used for watch path information. */
-static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
-{
- if (ndp) {
- path_put(&ndp->path);
- kfree(ndp);
- }
- if (ndw) {
- path_put(&ndw->path);
- kfree(ndw);
- }
-}
-
-/* Associate the given rule with an existing parent inotify_watch.
- * Caller must hold audit_filter_mutex. */
-static void audit_add_to_parent(struct audit_krule *krule,
- struct audit_parent *parent)
-{
- struct audit_watch *w, *watch = krule->watch;
- int watch_found = 0;
-
- list_for_each_entry(w, &parent->watches, wlist) {
- if (strcmp(watch->path, w->path))
- continue;
-
- watch_found = 1;
-
- /* put krule's and initial refs to temporary watch */
- audit_put_watch(watch);
- audit_put_watch(watch);
-
- audit_get_watch(w);
- krule->watch = watch = w;
- break;
- }
-
- if (!watch_found) {
- get_inotify_watch(&parent->wdata);
- watch->parent = parent;
-
- list_add(&watch->wlist, &parent->watches);
- }
- list_add(&krule->rlist, &watch->rules);
-}
-
-/* Find a matching watch entry, or add this one.
- * Caller must hold audit_filter_mutex. */
-static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp,
- struct nameidata *ndw)
-{
- struct audit_watch *watch = krule->watch;
- struct inotify_watch *i_watch;
- struct audit_parent *parent;
- int ret = 0;
-
- /* update watch filter fields */
- if (ndw) {
- watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
- watch->ino = ndw->path.dentry->d_inode->i_ino;
- }
-
- /* The audit_filter_mutex must not be held during inotify calls because
- * we hold it during inotify event callback processing. If an existing
- * inotify watch is found, inotify_find_watch() grabs a reference before
- * returning.
- */
- mutex_unlock(&audit_filter_mutex);
-
- if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
- &i_watch) < 0) {
- parent = audit_init_parent(ndp);
- if (IS_ERR(parent)) {
- /* caller expects mutex locked */
- mutex_lock(&audit_filter_mutex);
- return PTR_ERR(parent);
- }
- } else
- parent = container_of(i_watch, struct audit_parent, wdata);
-
- mutex_lock(&audit_filter_mutex);
-
- /* parent was moved before we took audit_filter_mutex */
- if (parent->flags & AUDIT_PARENT_INVALID)
- ret = -ENOENT;
- else
- audit_add_to_parent(krule, parent);
-
- /* match get in audit_init_parent or inotify_find_watch */
- put_inotify_watch(&parent->wdata);
- return ret;
-}
-
static u64 prio_low = ~0ULL/2;
static u64 prio_high = ~0ULL/2 - 1;
@@ -1282,7 +864,6 @@ static inline int audit_add_rule(struct audit_entry *entry)
struct audit_entry *e;
struct audit_watch *watch = entry->rule.watch;
struct audit_tree *tree = entry->rule.tree;
- struct nameidata *ndp = NULL, *ndw = NULL;
struct list_head *list;
int h, err;
#ifdef CONFIG_AUDITSYSCALL
@@ -1296,8 +877,8 @@ static inline int audit_add_rule(struct audit_entry *entry)
mutex_lock(&audit_filter_mutex);
e = audit_find_rule(entry, &list);
- mutex_unlock(&audit_filter_mutex);
if (e) {
+ mutex_unlock(&audit_filter_mutex);
err = -EEXIST;
/* normally audit_add_tree_rule() will free it on failure */
if (tree)
@@ -1305,22 +886,16 @@ static inline int audit_add_rule(struct audit_entry *entry)
goto error;
}
- /* Avoid calling path_lookup under audit_filter_mutex. */
- if (watch) {
- err = audit_get_nd(watch->path, &ndp, &ndw);
- if (err)
- goto error;
- }
-
- mutex_lock(&audit_filter_mutex);
if (watch) {
/* audit_filter_mutex is dropped and re-taken during this call */
- err = audit_add_watch(&entry->rule, ndp, ndw);
+ err = audit_add_watch(&entry->rule);
if (err) {
mutex_unlock(&audit_filter_mutex);
goto error;
}
- h = audit_hash_ino((u32)watch->ino);
+ /* entry->rule.watch may have changed during audit_add_watch() */
+ watch = entry->rule.watch;
+ h = audit_hash_ino((u32)audit_watch_inode(watch));
list = &audit_inode_hash[h];
}
if (tree) {
@@ -1358,11 +933,9 @@ static inline int audit_add_rule(struct audit_entry *entry)
#endif
mutex_unlock(&audit_filter_mutex);
- audit_put_nd(ndp, ndw); /* NULL args OK */
return 0;
error:
- audit_put_nd(ndp, ndw); /* NULL args OK */
if (watch)
audit_put_watch(watch); /* tmp watch, matches initial get */
return err;
@@ -1372,7 +945,7 @@ error:
static inline int audit_del_rule(struct audit_entry *entry)
{
struct audit_entry *e;
- struct audit_watch *watch, *tmp_watch = entry->rule.watch;
+ struct audit_watch *watch = entry->rule.watch;
struct audit_tree *tree = entry->rule.tree;
struct list_head *list;
LIST_HEAD(inotify_list);
@@ -1394,29 +967,8 @@ static inline int audit_del_rule(struct audit_entry *entry)
goto out;
}
- watch = e->rule.watch;
- if (watch) {
- struct audit_parent *parent = watch->parent;
-
- list_del(&e->rule.rlist);
-
- if (list_empty(&watch->rules)) {
- audit_remove_watch(watch);
-
- if (list_empty(&parent->watches)) {
- /* Put parent on the inotify un-registration
- * list. Grab a reference before releasing
- * audit_filter_mutex, to be released in
- * audit_inotify_unregister().
- * If filesystem is going away, just leave
- * the sucker alone, eviction will take
- * care of it.
- */
- if (pin_inotify_watch(&parent->wdata))
- list_add(&parent->ilist, &inotify_list);
- }
- }
- }
+ if (e->rule.watch)
+ audit_remove_watch_rule(&e->rule, &inotify_list);
if (e->rule.tree)
audit_remove_tree_rule(&e->rule);
@@ -1438,8 +990,8 @@ static inline int audit_del_rule(struct audit_entry *entry)
audit_inotify_unregister(&inotify_list);
out:
- if (tmp_watch)
- audit_put_watch(tmp_watch); /* match initial get */
+ if (watch)
+ audit_put_watch(watch); /* match initial get */
if (tree)
audit_put_tree(tree); /* that's the temporary one */
@@ -1527,11 +1079,9 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
security_release_secctx(ctx, len);
}
}
- audit_log_format(ab, " op=%s rule key=", action);
- if (rule->filterkey)
- audit_log_untrustedstring(ab, rule->filterkey);
- else
- audit_log_format(ab, "(null)");
+ audit_log_format(ab, " op=");
+ audit_log_string(ab, action);
+ audit_log_key(ab, rule->filterkey);
audit_log_format(ab, " list=%d res=%d", rule->listnr, res);
audit_log_end(ab);
}
@@ -1595,7 +1145,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
return PTR_ERR(entry);
err = audit_add_rule(entry);
- audit_log_rule_change(loginuid, sessionid, sid, "add",
+ audit_log_rule_change(loginuid, sessionid, sid, "add rule",
&entry->rule, !err);
if (err)
@@ -1611,7 +1161,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
return PTR_ERR(entry);
err = audit_del_rule(entry);
- audit_log_rule_change(loginuid, sessionid, sid, "remove",
+ audit_log_rule_change(loginuid, sessionid, sid, "remove rule",
&entry->rule, !err);
audit_free_rule(entry);
@@ -1793,7 +1343,7 @@ static int update_lsm_rule(struct audit_krule *r)
list_del(&r->list);
} else {
if (watch) {
- list_add(&nentry->rule.rlist, &watch->rules);
+ list_add(&nentry->rule.rlist, audit_watch_rules(watch));
list_del(&r->rlist);
} else if (tree)
list_replace_init(&r->rlist, &nentry->rule.rlist);
@@ -1829,27 +1379,3 @@ int audit_update_lsm_rules(void)
return err;
}
-
-/* Update watch data in audit rules based on inotify events. */
-void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask,
- u32 cookie, const char *dname, struct inode *inode)
-{
- struct audit_parent *parent;
-
- parent = container_of(i_watch, struct audit_parent, wdata);
-
- if (mask & (IN_CREATE|IN_MOVED_TO) && inode)
- audit_update_watch(parent, dname, inode->i_sb->s_dev,
- inode->i_ino, 0);
- else if (mask & (IN_DELETE|IN_MOVED_FROM))
- audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
- /* inotify automatically removes the watch and sends IN_IGNORED */
- else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
- audit_remove_parent_watches(parent);
- /* inotify does not remove the watch, so remove it manually */
- else if(mask & IN_MOVE_SELF) {
- audit_remove_parent_watches(parent);
- inotify_remove_watch_locked(audit_ih, i_watch);
- } else if (mask & IN_IGNORED)
- put_inotify_watch(i_watch);
-}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 7d6ac7c1f41..68d3c6a0ecd 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -199,6 +199,7 @@ struct audit_context {
struct audit_tree_refs *trees, *first_trees;
int tree_count;
+ struct list_head killed_trees;
int type;
union {
@@ -548,9 +549,9 @@ static int audit_filter_rules(struct task_struct *tsk,
}
break;
case AUDIT_WATCH:
- if (name && rule->watch->ino != (unsigned long)-1)
- result = (name->dev == rule->watch->dev &&
- name->ino == rule->watch->ino);
+ if (name && audit_watch_inode(rule->watch) != (unsigned long)-1)
+ result = (name->dev == audit_watch_dev(rule->watch) &&
+ name->ino == audit_watch_inode(rule->watch));
break;
case AUDIT_DIR:
if (ctx)
@@ -853,6 +854,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state)
if (!(context = kmalloc(sizeof(*context), GFP_KERNEL)))
return NULL;
audit_zero_context(context, state);
+ INIT_LIST_HEAD(&context->killed_trees);
return context;
}
@@ -1024,8 +1026,8 @@ static int audit_log_single_execve_arg(struct audit_context *context,
{
char arg_num_len_buf[12];
const char __user *tmp_p = p;
- /* how many digits are in arg_num? 3 is the length of " a=" */
- size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 3;
+ /* how many digits are in arg_num? 5 is the length of ' a=""' */
+ size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 5;
size_t len, len_left, to_send;
size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN;
unsigned int i, has_cntl = 0, too_long = 0;
@@ -1137,7 +1139,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
if (has_cntl)
audit_log_n_hex(*ab, buf, to_send);
else
- audit_log_format(*ab, "\"%s\"", buf);
+ audit_log_string(*ab, buf);
p += to_send;
len_left -= to_send;
@@ -1372,11 +1374,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
audit_log_task_info(ab, tsk);
- if (context->filterkey) {
- audit_log_format(ab, " key=");
- audit_log_untrustedstring(ab, context->filterkey);
- } else
- audit_log_format(ab, " key=(null)");
+ audit_log_key(ab, context->filterkey);
audit_log_end(ab);
for (aux = context->aux; aux; aux = aux->next) {
@@ -1549,6 +1547,8 @@ void audit_free(struct task_struct *tsk)
/* that can happen only if we are called from do_exit() */
if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
audit_log_exit(context, tsk);
+ if (!list_empty(&context->killed_trees))
+ audit_kill_trees(&context->killed_trees);
audit_free_context(context);
}
@@ -1692,6 +1692,9 @@ void audit_syscall_exit(int valid, long return_code)
context->in_syscall = 0;
context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
+ if (!list_empty(&context->killed_trees))
+ audit_kill_trees(&context->killed_trees);
+
if (context->previous) {
struct audit_context *new_context = context->previous;
context->previous = NULL;
@@ -2525,3 +2528,11 @@ void audit_core_dumps(long signr)
audit_log_format(ab, " sig=%ld", signr);
audit_log_end(ab);
}
+
+struct list_head *audit_killed_trees(void)
+{
+ struct audit_context *ctx = current->audit_context;
+ if (likely(!ctx || !ctx->in_syscall))
+ return NULL;
+ return &ctx->killed_trees;
+}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a7267bfd376..b6eadfe30e7 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -46,6 +46,8 @@
#include <linux/cgroupstats.h>
#include <linux/hash.h>
#include <linux/namei.h>
+#include <linux/smp_lock.h>
+#include <linux/pid_namespace.h>
#include <asm/atomic.h>
@@ -733,16 +735,28 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
* reference to css->refcnt. In general, this refcnt is expected to goes down
* to zero, soon.
*
- * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex;
+ * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
*/
DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
-static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp)
+static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
{
- if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
+ if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
wake_up_all(&cgroup_rmdir_waitq);
}
+void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
+{
+ css_get(css);
+}
+
+void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
+{
+ cgroup_wakeup_rmdir_waiter(css->cgroup);
+ css_put(css);
+}
+
+
static int rebind_subsystems(struct cgroupfs_root *root,
unsigned long final_bits)
{
@@ -842,6 +856,11 @@ static int parse_cgroupfs_options(char *data,
struct cgroup_sb_opts *opts)
{
char *token, *o = data ?: "all";
+ unsigned long mask = (unsigned long)-1;
+
+#ifdef CONFIG_CPUSETS
+ mask = ~(1UL << cpuset_subsys_id);
+#endif
opts->subsys_bits = 0;
opts->flags = 0;
@@ -886,6 +905,15 @@ static int parse_cgroupfs_options(char *data,
}
}
+ /*
+ * Option noprefix was introduced just for backward compatibility
+ * with the old cpuset, so we allow noprefix only if mounting just
+ * the cpuset subsystem.
+ */
+ if (test_bit(ROOT_NOPREFIX, &opts->flags) &&
+ (opts->subsys_bits & mask))
+ return -EINVAL;
+
/* We can't have an empty hierarchy */
if (!opts->subsys_bits)
return -EINVAL;
@@ -900,6 +928,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
struct cgroup *cgrp = &root->top_cgroup;
struct cgroup_sb_opts opts;
+ lock_kernel();
mutex_lock(&cgrp->dentry->d_inode->i_mutex);
mutex_lock(&cgroup_mutex);
@@ -927,6 +956,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
kfree(opts.release_agent);
mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
+ unlock_kernel();
return ret;
}
@@ -943,6 +973,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->children);
INIT_LIST_HEAD(&cgrp->css_sets);
INIT_LIST_HEAD(&cgrp->release_list);
+ INIT_LIST_HEAD(&cgrp->pids_list);
init_rwsem(&cgrp->pids_mutex);
}
static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1340,7 +1371,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
* wake up rmdir() waiter. the rmdir should fail since the cgroup
* is no longer empty.
*/
- cgroup_wakeup_rmdir_waiters(cgrp);
+ cgroup_wakeup_rmdir_waiter(cgrp);
return 0;
}
@@ -2184,12 +2215,30 @@ err:
return ret;
}
+/*
+ * Cache pids for all threads in the same pid namespace that are
+ * opening the same "tasks" file.
+ */
+struct cgroup_pids {
+ /* The node in cgrp->pids_list */
+ struct list_head list;
+ /* The cgroup those pids belong to */
+ struct cgroup *cgrp;
+ /* The namepsace those pids belong to */
+ struct pid_namespace *ns;
+ /* Array of process ids in the cgroup */
+ pid_t *tasks_pids;
+ /* How many files are using the this tasks_pids array */
+ int use_count;
+ /* Length of the current tasks_pids array */
+ int length;
+};
+
static int cmppid(const void *a, const void *b)
{
return *(pid_t *)a - *(pid_t *)b;
}
-
/*
* seq_file methods for the "tasks" file. The seq_file position is the
* next pid to display; the seq_file iterator is a pointer to the pid
@@ -2204,45 +2253,47 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
* after a seek to the start). Use a binary-search to find the
* next pid to display, if any
*/
- struct cgroup *cgrp = s->private;
+ struct cgroup_pids *cp = s->private;
+ struct cgroup *cgrp = cp->cgrp;
int index = 0, pid = *pos;
int *iter;
down_read(&cgrp->pids_mutex);
if (pid) {
- int end = cgrp->pids_length;
+ int end = cp->length;
while (index < end) {
int mid = (index + end) / 2;
- if (cgrp->tasks_pids[mid] == pid) {
+ if (cp->tasks_pids[mid] == pid) {
index = mid;
break;
- } else if (cgrp->tasks_pids[mid] <= pid)
+ } else if (cp->tasks_pids[mid] <= pid)
index = mid + 1;
else
end = mid;
}
}
/* If we're off the end of the array, we're done */
- if (index >= cgrp->pids_length)
+ if (index >= cp->length)
return NULL;
/* Update the abstract position to be the actual pid that we found */
- iter = cgrp->tasks_pids + index;
+ iter = cp->tasks_pids + index;
*pos = *iter;
return iter;
}
static void cgroup_tasks_stop(struct seq_file *s, void *v)
{
- struct cgroup *cgrp = s->private;
+ struct cgroup_pids *cp = s->private;
+ struct cgroup *cgrp = cp->cgrp;
up_read(&cgrp->pids_mutex);
}
static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
{
- struct cgroup *cgrp = s->private;
+ struct cgroup_pids *cp = s->private;
int *p = v;
- int *end = cgrp->tasks_pids + cgrp->pids_length;
+ int *end = cp->tasks_pids + cp->length;
/*
* Advance to the next pid in the array. If this goes off the
@@ -2269,26 +2320,33 @@ static struct seq_operations cgroup_tasks_seq_operations = {
.show = cgroup_tasks_show,
};
-static void release_cgroup_pid_array(struct cgroup *cgrp)
+static void release_cgroup_pid_array(struct cgroup_pids *cp)
{
+ struct cgroup *cgrp = cp->cgrp;
+
down_write(&cgrp->pids_mutex);
- BUG_ON(!cgrp->pids_use_count);
- if (!--cgrp->pids_use_count) {
- kfree(cgrp->tasks_pids);
- cgrp->tasks_pids = NULL;
- cgrp->pids_length = 0;
+ BUG_ON(!cp->use_count);
+ if (!--cp->use_count) {
+ list_del(&cp->list);
+ put_pid_ns(cp->ns);
+ kfree(cp->tasks_pids);
+ kfree(cp);
}
up_write(&cgrp->pids_mutex);
}
static int cgroup_tasks_release(struct inode *inode, struct file *file)
{
- struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
+ struct seq_file *seq;
+ struct cgroup_pids *cp;
if (!(file->f_mode & FMODE_READ))
return 0;
- release_cgroup_pid_array(cgrp);
+ seq = file->private_data;
+ cp = seq->private;
+
+ release_cgroup_pid_array(cp);
return seq_release(inode, file);
}
@@ -2307,6 +2365,8 @@ static struct file_operations cgroup_tasks_operations = {
static int cgroup_tasks_open(struct inode *unused, struct file *file)
{
struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
+ struct pid_namespace *ns = current->nsproxy->pid_ns;
+ struct cgroup_pids *cp;
pid_t *pidarray;
int npids;
int retval;
@@ -2333,20 +2393,37 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
* array if necessary
*/
down_write(&cgrp->pids_mutex);
- kfree(cgrp->tasks_pids);
- cgrp->tasks_pids = pidarray;
- cgrp->pids_length = npids;
- cgrp->pids_use_count++;
+
+ list_for_each_entry(cp, &cgrp->pids_list, list) {
+ if (ns == cp->ns)
+ goto found;
+ }
+
+ cp = kzalloc(sizeof(*cp), GFP_KERNEL);
+ if (!cp) {
+ up_write(&cgrp->pids_mutex);
+ kfree(pidarray);
+ return -ENOMEM;
+ }
+ cp->cgrp = cgrp;
+ cp->ns = ns;
+ get_pid_ns(ns);
+ list_add(&cp->list, &cgrp->pids_list);
+found:
+ kfree(cp->tasks_pids);
+ cp->tasks_pids = pidarray;
+ cp->length = npids;
+ cp->use_count++;
up_write(&cgrp->pids_mutex);
file->f_op = &cgroup_tasks_operations;
retval = seq_open(file, &cgroup_tasks_seq_operations);
if (retval) {
- release_cgroup_pid_array(cgrp);
+ release_cgroup_pid_array(cp);
return retval;
}
- ((struct seq_file *)file->private_data)->private = cgrp;
+ ((struct seq_file *)file->private_data)->private = cp;
return 0;
}
@@ -2679,33 +2756,42 @@ again:
mutex_unlock(&cgroup_mutex);
/*
+ * In general, subsystem has no css->refcnt after pre_destroy(). But
+ * in racy cases, subsystem may have to get css->refcnt after
+ * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
+ * make rmdir return -EBUSY too often. To avoid that, we use waitqueue
+ * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
+ * and subsystem's reference count handling. Please see css_get/put
+ * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
+ */
+ set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+
+ /*
* Call pre_destroy handlers of subsys. Notify subsystems
* that rmdir() request comes.
*/
ret = cgroup_call_pre_destroy(cgrp);
- if (ret)
+ if (ret) {
+ clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
return ret;
+ }
mutex_lock(&cgroup_mutex);
parent = cgrp->parent;
if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
+ clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
mutex_unlock(&cgroup_mutex);
return -EBUSY;
}
- /*
- * css_put/get is provided for subsys to grab refcnt to css. In typical
- * case, subsystem has no reference after pre_destroy(). But, under
- * hierarchy management, some *temporal* refcnt can be hold.
- * To avoid returning -EBUSY to a user, waitqueue is used. If subsys
- * is really busy, it should return -EBUSY at pre_destroy(). wake_up
- * is called when css_put() is called and refcnt goes down to 0.
- */
- set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
-
if (!cgroup_clear_css_refs(cgrp)) {
mutex_unlock(&cgroup_mutex);
- schedule();
+ /*
+ * Because someone may call cgroup_wakeup_rmdir_waiter() before
+ * prepare_to_wait(), we need to check this flag.
+ */
+ if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
+ schedule();
finish_wait(&cgroup_rmdir_waitq, &wait);
clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
if (signal_pending(current))
@@ -3277,7 +3363,7 @@ void __css_put(struct cgroup_subsys_state *css)
set_bit(CGRP_RELEASABLE, &cgrp->flags);
check_for_release(cgrp);
}
- cgroup_wakeup_rmdir_waiters(cgrp);
+ cgroup_wakeup_rmdir_waiter(cgrp);
}
rcu_read_unlock();
}
diff --git a/kernel/compat.c b/kernel/compat.c
index 42d56544460..f6c204f07ea 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -882,6 +882,17 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
}
+asmlinkage long
+compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig,
+ struct compat_siginfo __user *uinfo)
+{
+ siginfo_t info;
+
+ if (copy_siginfo_from_user32(&info, uinfo))
+ return -EFAULT;
+ return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
+}
+
#ifdef __ARCH_WANT_COMPAT_SYS_TIME
/* compat_time_t is a 32 bit "long" and needs to get converted. */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 395b6974dc8..8ce10043e4a 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -34,14 +34,11 @@ static struct {
* an ongoing cpu hotplug operation.
*/
int refcount;
-} cpu_hotplug;
-
-void __init cpu_hotplug_init(void)
-{
- cpu_hotplug.active_writer = NULL;
- mutex_init(&cpu_hotplug.lock);
- cpu_hotplug.refcount = 0;
-}
+} cpu_hotplug = {
+ .active_writer = NULL,
+ .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
+ .refcount = 0,
+};
#ifdef CONFIG_HOTPLUG_CPU
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 026faccca86..7e75a41bd50 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -97,12 +97,6 @@ struct cpuset {
struct cpuset *parent; /* my parent */
- /*
- * Copy of global cpuset_mems_generation as of the most
- * recent time this cpuset changed its mems_allowed.
- */
- int mems_generation;
-
struct fmeter fmeter; /* memory_pressure filter */
/* partition number for rebuild_sched_domains() */
@@ -176,27 +170,6 @@ static inline int is_spread_slab(const struct cpuset *cs)
return test_bit(CS_SPREAD_SLAB, &cs->flags);
}
-/*
- * Increment this integer everytime any cpuset changes its
- * mems_allowed value. Users of cpusets can track this generation
- * number, and avoid having to lock and reload mems_allowed unless
- * the cpuset they're using changes generation.
- *
- * A single, global generation is needed because cpuset_attach_task() could
- * reattach a task to a different cpuset, which must not have its
- * generation numbers aliased with those of that tasks previous cpuset.
- *
- * Generations are needed for mems_allowed because one task cannot
- * modify another's memory placement. So we must enable every task,
- * on every visit to __alloc_pages(), to efficiently check whether
- * its current->cpuset->mems_allowed has changed, requiring an update
- * of its current->mems_allowed.
- *
- * Since writes to cpuset_mems_generation are guarded by the cgroup lock
- * there is no need to mark it atomic.
- */
-static int cpuset_mems_generation;
-
static struct cpuset top_cpuset = {
.flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
};
@@ -228,8 +201,9 @@ static struct cpuset top_cpuset = {
* If a task is only holding callback_mutex, then it has read-only
* access to cpusets.
*
- * The task_struct fields mems_allowed and mems_generation may only
- * be accessed in the context of that task, so require no locks.
+ * Now, the task_struct fields mems_allowed and mempolicy may be changed
+ * by other task, we use alloc_lock in the task_struct fields to protect
+ * them.
*
* The cpuset_common_file_read() handlers only hold callback_mutex across
* small pieces of code, such as when reading out possibly multi-word
@@ -331,75 +305,22 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY]));
}
-/**
- * cpuset_update_task_memory_state - update task memory placement
- *
- * If the current tasks cpusets mems_allowed changed behind our
- * backs, update current->mems_allowed, mems_generation and task NUMA
- * mempolicy to the new value.
- *
- * Task mempolicy is updated by rebinding it relative to the
- * current->cpuset if a task has its memory placement changed.
- * Do not call this routine if in_interrupt().
- *
- * Call without callback_mutex or task_lock() held. May be
- * called with or without cgroup_mutex held. Thanks in part to
- * 'the_top_cpuset_hack', the task's cpuset pointer will never
- * be NULL. This routine also might acquire callback_mutex during
- * call.
- *
- * Reading current->cpuset->mems_generation doesn't need task_lock
- * to guard the current->cpuset derefence, because it is guarded
- * from concurrent freeing of current->cpuset using RCU.
- *
- * The rcu_dereference() is technically probably not needed,
- * as I don't actually mind if I see a new cpuset pointer but
- * an old value of mems_generation. However this really only
- * matters on alpha systems using cpusets heavily. If I dropped
- * that rcu_dereference(), it would save them a memory barrier.
- * For all other arch's, rcu_dereference is a no-op anyway, and for
- * alpha systems not using cpusets, another planned optimization,
- * avoiding the rcu critical section for tasks in the root cpuset
- * which is statically allocated, so can't vanish, will make this
- * irrelevant. Better to use RCU as intended, than to engage in
- * some cute trick to save a memory barrier that is impossible to
- * test, for alpha systems using cpusets heavily, which might not
- * even exist.
- *
- * This routine is needed to update the per-task mems_allowed data,
- * within the tasks context, when it is trying to allocate memory
- * (in various mm/mempolicy.c routines) and notices that some other
- * task has been modifying its cpuset.
+/*
+ * update task's spread flag if cpuset's page/slab spread flag is set
+ *
+ * Called with callback_mutex/cgroup_mutex held
*/
-
-void cpuset_update_task_memory_state(void)
+static void cpuset_update_task_spread_flag(struct cpuset *cs,
+ struct task_struct *tsk)
{
- int my_cpusets_mem_gen;
- struct task_struct *tsk = current;
- struct cpuset *cs;
-
- rcu_read_lock();
- my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
- rcu_read_unlock();
-
- if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
- mutex_lock(&callback_mutex);
- task_lock(tsk);
- cs = task_cs(tsk); /* Maybe changed when task not locked */
- guarantee_online_mems(cs, &tsk->mems_allowed);
- tsk->cpuset_mems_generation = cs->mems_generation;
- if (is_spread_page(cs))
- tsk->flags |= PF_SPREAD_PAGE;
- else
- tsk->flags &= ~PF_SPREAD_PAGE;
- if (is_spread_slab(cs))
- tsk->flags |= PF_SPREAD_SLAB;
- else
- tsk->flags &= ~PF_SPREAD_SLAB;
- task_unlock(tsk);
- mutex_unlock(&callback_mutex);
- mpol_rebind_task(tsk, &tsk->mems_allowed);
- }
+ if (is_spread_page(cs))
+ tsk->flags |= PF_SPREAD_PAGE;
+ else
+ tsk->flags &= ~PF_SPREAD_PAGE;
+ if (is_spread_slab(cs))
+ tsk->flags |= PF_SPREAD_SLAB;
+ else
+ tsk->flags &= ~PF_SPREAD_SLAB;
}
/*
@@ -1007,14 +928,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
* other task, the task_struct mems_allowed that we are hacking
* is for our current task, which must allocate new pages for that
* migrating memory region.
- *
- * We call cpuset_update_task_memory_state() before hacking
- * our tasks mems_allowed, so that we are assured of being in
- * sync with our tasks cpuset, and in particular, callbacks to
- * cpuset_update_task_memory_state() from nested page allocations
- * won't see any mismatch of our cpuset and task mems_generation
- * values, so won't overwrite our hacked tasks mems_allowed
- * nodemask.
*/
static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
@@ -1022,22 +935,37 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
{
struct task_struct *tsk = current;
- cpuset_update_task_memory_state();
-
- mutex_lock(&callback_mutex);
tsk->mems_allowed = *to;
- mutex_unlock(&callback_mutex);
do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
- mutex_lock(&callback_mutex);
guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed);
- mutex_unlock(&callback_mutex);
}
/*
- * Rebind task's vmas to cpuset's new mems_allowed, and migrate pages to new
- * nodes if memory_migrate flag is set. Called with cgroup_mutex held.
+ * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
+ * @tsk: the task to change
+ * @newmems: new nodes that the task will be set
+ *
+ * In order to avoid seeing no nodes if the old and new nodes are disjoint,
+ * we structure updates as setting all new allowed nodes, then clearing newly
+ * disallowed ones.
+ *
+ * Called with task's alloc_lock held
+ */
+static void cpuset_change_task_nodemask(struct task_struct *tsk,
+ nodemask_t *newmems)
+{
+ nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
+ mpol_rebind_task(tsk, &tsk->mems_allowed);
+ mpol_rebind_task(tsk, newmems);
+ tsk->mems_allowed = *newmems;
+}
+
+/*
+ * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
+ * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
+ * memory_migrate flag is set. Called with cgroup_mutex held.
*/
static void cpuset_change_nodemask(struct task_struct *p,
struct cgroup_scanner *scan)
@@ -1046,12 +974,19 @@ static void cpuset_change_nodemask(struct task_struct *p,
struct cpuset *cs;
int migrate;
const nodemask_t *oldmem = scan->data;
+ nodemask_t newmems;
+
+ cs = cgroup_cs(scan->cg);
+ guarantee_online_mems(cs, &newmems);
+
+ task_lock(p);
+ cpuset_change_task_nodemask(p, &newmems);
+ task_unlock(p);
mm = get_task_mm(p);
if (!mm)
return;
- cs = cgroup_cs(scan->cg);
migrate = is_memory_migrate(cs);
mpol_rebind_mm(mm, &cs->mems_allowed);
@@ -1104,10 +1039,10 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
/*
* Handle user request to change the 'mems' memory placement
* of a cpuset. Needs to validate the request, update the
- * cpusets mems_allowed and mems_generation, and for each
- * task in the cpuset, rebind any vma mempolicies and if
- * the cpuset is marked 'memory_migrate', migrate the tasks
- * pages to the new memory.
+ * cpusets mems_allowed, and for each task in the cpuset,
+ * update mems_allowed and rebind task's mempolicy and any vma
+ * mempolicies and if the cpuset is marked 'memory_migrate',
+ * migrate the tasks pages to the new memory.
*
* Call with cgroup_mutex held. May take callback_mutex during call.
* Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
@@ -1160,7 +1095,6 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
mutex_lock(&callback_mutex);
cs->mems_allowed = trialcs->mems_allowed;
- cs->mems_generation = cpuset_mems_generation++;
mutex_unlock(&callback_mutex);
update_tasks_nodemask(cs, &oldmem, &heap);
@@ -1193,6 +1127,46 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
}
/*
+ * cpuset_change_flag - make a task's spread flags the same as its cpuset's
+ * @tsk: task to be updated
+ * @scan: struct cgroup_scanner containing the cgroup of the task
+ *
+ * Called by cgroup_scan_tasks() for each task in a cgroup.
+ *
+ * We don't need to re-check for the cgroup/cpuset membership, since we're
+ * holding cgroup_lock() at this point.
+ */
+static void cpuset_change_flag(struct task_struct *tsk,
+ struct cgroup_scanner *scan)
+{
+ cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk);
+}
+
+/*
+ * update_tasks_flags - update the spread flags of tasks in the cpuset.
+ * @cs: the cpuset in which each task's spread flags needs to be changed
+ * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
+ *
+ * Called with cgroup_mutex held
+ *
+ * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
+ * calling callback functions for each.
+ *
+ * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
+ * if @heap != NULL.
+ */
+static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
+{
+ struct cgroup_scanner scan;
+
+ scan.cg = cs->css.cgroup;
+ scan.test_task = NULL;
+ scan.process_task = cpuset_change_flag;
+ scan.heap = heap;
+ cgroup_scan_tasks(&scan);
+}
+
+/*
* update_flag - read a 0 or a 1 in a file and update associated flag
* bit: the bit to update (see cpuset_flagbits_t)
* cs: the cpuset to update
@@ -1205,8 +1179,10 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
int turning_on)
{
struct cpuset *trialcs;
- int err;
int balance_flag_changed;
+ int spread_flag_changed;
+ struct ptr_heap heap;
+ int err;
trialcs = alloc_trial_cpuset(cs);
if (!trialcs)
@@ -1221,9 +1197,16 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
if (err < 0)
goto out;
+ err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
+ if (err < 0)
+ goto out;
+
balance_flag_changed = (is_sched_load_balance(cs) !=
is_sched_load_balance(trialcs));
+ spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
+ || (is_spread_page(cs) != is_spread_page(trialcs)));
+
mutex_lock(&callback_mutex);
cs->flags = trialcs->flags;
mutex_unlock(&callback_mutex);
@@ -1231,6 +1214,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
async_rebuild_sched_domains();
+ if (spread_flag_changed)
+ update_tasks_flags(cs, &heap);
+ heap_free(&heap);
out:
free_trial_cpuset(trialcs);
return err;
@@ -1372,15 +1358,20 @@ static void cpuset_attach(struct cgroup_subsys *ss,
if (cs == &top_cpuset) {
cpumask_copy(cpus_attach, cpu_possible_mask);
+ to = node_possible_map;
} else {
- mutex_lock(&callback_mutex);
guarantee_online_cpus(cs, cpus_attach);
- mutex_unlock(&callback_mutex);
+ guarantee_online_mems(cs, &to);
}
err = set_cpus_allowed_ptr(tsk, cpus_attach);
if (err)
return;
+ task_lock(tsk);
+ cpuset_change_task_nodemask(tsk, &to);
+ task_unlock(tsk);
+ cpuset_update_task_spread_flag(cs, tsk);
+
from = oldcs->mems_allowed;
to = cs->mems_allowed;
mm = get_task_mm(tsk);
@@ -1442,11 +1433,9 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
break;
case FILE_SPREAD_PAGE:
retval = update_flag(CS_SPREAD_PAGE, cs, val);
- cs->mems_generation = cpuset_mems_generation++;
break;
case FILE_SPREAD_SLAB:
retval = update_flag(CS_SPREAD_SLAB, cs, val);
- cs->mems_generation = cpuset_mems_generation++;
break;
default:
retval = -EINVAL;
@@ -1786,8 +1775,6 @@ static struct cgroup_subsys_state *cpuset_create(
struct cpuset *parent;
if (!cont->parent) {
- /* This is early initialization for the top cgroup */
- top_cpuset.mems_generation = cpuset_mems_generation++;
return &top_cpuset.css;
}
parent = cgroup_cs(cont->parent);
@@ -1799,7 +1786,6 @@ static struct cgroup_subsys_state *cpuset_create(
return ERR_PTR(-ENOMEM);
}
- cpuset_update_task_memory_state();
cs->flags = 0;
if (is_spread_page(parent))
set_bit(CS_SPREAD_PAGE, &cs->flags);
@@ -1808,7 +1794,6 @@ static struct cgroup_subsys_state *cpuset_create(
set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
cpumask_clear(cs->cpus_allowed);
nodes_clear(cs->mems_allowed);
- cs->mems_generation = cpuset_mems_generation++;
fmeter_init(&cs->fmeter);
cs->relax_domain_level = -1;
@@ -1827,8 +1812,6 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
{
struct cpuset *cs = cgroup_cs(cont);
- cpuset_update_task_memory_state();
-
if (is_sched_load_balance(cs))
update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
@@ -1849,21 +1832,6 @@ struct cgroup_subsys cpuset_subsys = {
.early_init = 1,
};
-/*
- * cpuset_init_early - just enough so that the calls to
- * cpuset_update_task_memory_state() in early init code
- * are harmless.
- */
-
-int __init cpuset_init_early(void)
-{
- alloc_bootmem_cpumask_var(&top_cpuset.cpus_allowed);
-
- top_cpuset.mems_generation = cpuset_mems_generation++;
- return 0;
-}
-
-
/**
* cpuset_init - initialize cpusets at system boot
*
@@ -1874,11 +1842,13 @@ int __init cpuset_init(void)
{
int err = 0;
+ if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
+ BUG();
+
cpumask_setall(top_cpuset.cpus_allowed);
nodes_setall(top_cpuset.mems_allowed);
fmeter_init(&top_cpuset.fmeter);
- top_cpuset.mems_generation = cpuset_mems_generation++;
set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
top_cpuset.relax_domain_level = -1;
diff --git a/kernel/cred.c b/kernel/cred.c
index 3a039189d70..1bb4d7e5d61 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -167,7 +167,7 @@ EXPORT_SYMBOL(prepare_creds);
/*
* Prepare credentials for current to perform an execve()
- * - The caller must hold current->cred_exec_mutex
+ * - The caller must hold current->cred_guard_mutex
*/
struct cred *prepare_exec_creds(void)
{
@@ -276,7 +276,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
struct cred *new;
int ret;
- mutex_init(&p->cred_exec_mutex);
+ mutex_init(&p->cred_guard_mutex);
if (
#ifdef CONFIG_KEYS
diff --git a/kernel/exit.c b/kernel/exit.c
index abf9cf3b95c..869dc221733 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -12,7 +12,6 @@
#include <linux/completion.h>
#include <linux/personality.h>
#include <linux/tty.h>
-#include <linux/mnt_namespace.h>
#include <linux/iocontext.h>
#include <linux/key.h>
#include <linux/security.h>
@@ -48,7 +47,8 @@
#include <linux/tracehook.h>
#include <linux/fs_struct.h>
#include <linux/init_task.h>
-#include <trace/sched.h>
+#include <linux/perf_counter.h>
+#include <trace/events/sched.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -56,10 +56,6 @@
#include <asm/mmu_context.h>
#include "cred-internals.h"
-DEFINE_TRACE(sched_process_free);
-DEFINE_TRACE(sched_process_exit);
-DEFINE_TRACE(sched_process_wait);
-
static void exit_mm(struct task_struct * tsk);
static void __unhash_process(struct task_struct *p)
@@ -158,6 +154,9 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
{
struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
+#ifdef CONFIG_PERF_COUNTERS
+ WARN_ON_ONCE(tsk->perf_counter_ctxp);
+#endif
trace_sched_process_free(tsk);
put_task_struct(tsk);
}
@@ -174,6 +173,7 @@ repeat:
atomic_dec(&__task_cred(p)->user->processes);
proc_flush_task(p);
+
write_lock_irq(&tasklist_lock);
tracehook_finish_release_task(p);
__exit_signal(p);
@@ -374,9 +374,8 @@ static void set_special_pids(struct pid *pid)
}
/*
- * Let kernel threads use this to say that they
- * allow a certain signal (since daemonize() will
- * have disabled all of them by default).
+ * Let kernel threads use this to say that they allow a certain signal.
+ * Must not be used if kthread was cloned with CLONE_SIGHAND.
*/
int allow_signal(int sig)
{
@@ -384,14 +383,14 @@ int allow_signal(int sig)
return -EINVAL;
spin_lock_irq(&current->sighand->siglock);
+ /* This is only needed for daemonize()'ed kthreads */
sigdelset(&current->blocked, sig);
- if (!current->mm) {
- /* Kernel threads handle their own signals.
- Let the signal code know it'll be handled, so
- that they don't get converted to SIGKILL or
- just silently dropped */
- current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2;
- }
+ /*
+ * Kernel threads handle their own signals. Let the signal code
+ * know it'll be handled, so that they don't get converted to
+ * SIGKILL or just silently dropped.
+ */
+ current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2;
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
return 0;
@@ -590,7 +589,7 @@ retry:
/*
* Search in the siblings
*/
- list_for_each_entry(c, &p->parent->children, sibling) {
+ list_for_each_entry(c, &p->real_parent->children, sibling) {
if (c->mm == mm)
goto assign_new_owner;
}
@@ -757,7 +756,7 @@ static void reparent_thread(struct task_struct *father, struct task_struct *p,
p->exit_signal = SIGCHLD;
/* If it has exited notify the new parent about this child's death. */
- if (!p->ptrace &&
+ if (!task_ptrace(p) &&
p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
do_notify_parent(p, p->exit_signal);
if (task_detached(p)) {
@@ -782,7 +781,7 @@ static void forget_original_parent(struct task_struct *father)
list_for_each_entry_safe(p, n, &father->children, sibling) {
p->real_parent = reaper;
if (p->parent == father) {
- BUG_ON(p->ptrace);
+ BUG_ON(task_ptrace(p));
p->parent = p->real_parent;
}
reparent_thread(father, p, &dead_children);
@@ -975,16 +974,19 @@ NORET_TYPE void do_exit(long code)
module_put(tsk->binfmt->module);
proc_exit_connector(tsk);
+
+ /*
+ * Flush inherited counters to the parent - before the parent
+ * gets woken up by child-exit notifications.
+ */
+ perf_counter_exit_task(tsk);
+
exit_notify(tsk, group_dead);
#ifdef CONFIG_NUMA
mpol_put(tsk->mempolicy);
tsk->mempolicy = NULL;
#endif
#ifdef CONFIG_FUTEX
- /*
- * This must happen late, after the PID is not
- * hashed anymore:
- */
if (unlikely(!list_empty(&tsk->pi_state_list)))
exit_pi_state_list(tsk);
if (unlikely(current->pi_state_cache))
@@ -1077,6 +1079,18 @@ SYSCALL_DEFINE1(exit_group, int, error_code)
return 0;
}
+struct wait_opts {
+ enum pid_type wo_type;
+ int wo_flags;
+ struct pid *wo_pid;
+
+ struct siginfo __user *wo_info;
+ int __user *wo_stat;
+ struct rusage __user *wo_rusage;
+
+ int notask_error;
+};
+
static struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
{
struct pid *pid = NULL;
@@ -1087,13 +1101,12 @@ static struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
return pid;
}
-static int eligible_child(enum pid_type type, struct pid *pid, int options,
- struct task_struct *p)
+static int eligible_child(struct wait_opts *wo, struct task_struct *p)
{
int err;
- if (type < PIDTYPE_MAX) {
- if (task_pid_type(p, type) != pid)
+ if (wo->wo_type < PIDTYPE_MAX) {
+ if (task_pid_type(p, wo->wo_type) != wo->wo_pid)
return 0;
}
@@ -1102,8 +1115,8 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
* set; otherwise, wait for non-clone children *only*. (Note:
* A "clone" child here is one that reports to its parent
* using a signal other than SIGCHLD.) */
- if (((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0))
- && !(options & __WALL))
+ if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
+ && !(wo->wo_flags & __WALL))
return 0;
err = security_task_wait(p);
@@ -1113,14 +1126,15 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
return 1;
}
-static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
- int why, int status,
- struct siginfo __user *infop,
- struct rusage __user *rusagep)
+static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
+ pid_t pid, uid_t uid, int why, int status)
{
- int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0;
+ struct siginfo __user *infop;
+ int retval = wo->wo_rusage
+ ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
put_task_struct(p);
+ infop = wo->wo_info;
if (!retval)
retval = put_user(SIGCHLD, &infop->si_signo);
if (!retval)
@@ -1144,19 +1158,18 @@ static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
* the lock and this task is uninteresting. If we return nonzero, we have
* released the lock and the system call should return.
*/
-static int wait_task_zombie(struct task_struct *p, int options,
- struct siginfo __user *infop,
- int __user *stat_addr, struct rusage __user *ru)
+static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
{
unsigned long state;
int retval, status, traced;
pid_t pid = task_pid_vnr(p);
uid_t uid = __task_cred(p)->uid;
+ struct siginfo __user *infop;
- if (!likely(options & WEXITED))
+ if (!likely(wo->wo_flags & WEXITED))
return 0;
- if (unlikely(options & WNOWAIT)) {
+ if (unlikely(wo->wo_flags & WNOWAIT)) {
int exit_code = p->exit_code;
int why, status;
@@ -1169,8 +1182,7 @@ static int wait_task_zombie(struct task_struct *p, int options,
why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
status = exit_code & 0x7f;
}
- return wait_noreap_copyout(p, pid, uid, why,
- status, infop, ru);
+ return wait_noreap_copyout(wo, p, pid, uid, why, status);
}
/*
@@ -1184,11 +1196,13 @@ static int wait_task_zombie(struct task_struct *p, int options,
}
traced = ptrace_reparented(p);
-
- if (likely(!traced)) {
+ /*
+ * It can be ptraced but not reparented, check
+ * !task_detached() to filter out sub-threads.
+ */
+ if (likely(!traced) && likely(!task_detached(p))) {
struct signal_struct *psig;
struct signal_struct *sig;
- struct task_cputime cputime;
/*
* The resource counters for the group leader are in its
@@ -1201,26 +1215,23 @@ static int wait_task_zombie(struct task_struct *p, int options,
* p->signal fields, because they are only touched by
* __exit_signal, which runs with tasklist_lock
* write-locked anyway, and so is excluded here. We do
- * need to protect the access to p->parent->signal fields,
+ * need to protect the access to parent->signal fields,
* as other threads in the parent group can be right
* here reaping other children at the same time.
- *
- * We use thread_group_cputime() to get times for the thread
- * group, which consolidates times for all threads in the
- * group including the group leader.
*/
- thread_group_cputime(p, &cputime);
- spin_lock_irq(&p->parent->sighand->siglock);
- psig = p->parent->signal;
+ spin_lock_irq(&p->real_parent->sighand->siglock);
+ psig = p->real_parent->signal;
sig = p->signal;
psig->cutime =
cputime_add(psig->cutime,
- cputime_add(cputime.utime,
- sig->cutime));
+ cputime_add(p->utime,
+ cputime_add(sig->utime,
+ sig->cutime)));
psig->cstime =
cputime_add(psig->cstime,
- cputime_add(cputime.stime,
- sig->cstime));
+ cputime_add(p->stime,
+ cputime_add(sig->stime,
+ sig->cstime)));
psig->cgtime =
cputime_add(psig->cgtime,
cputime_add(p->gtime,
@@ -1242,7 +1253,7 @@ static int wait_task_zombie(struct task_struct *p, int options,
sig->oublock + sig->coublock;
task_io_accounting_add(&psig->ioac, &p->ioac);
task_io_accounting_add(&psig->ioac, &sig->ioac);
- spin_unlock_irq(&p->parent->sighand->siglock);
+ spin_unlock_irq(&p->real_parent->sighand->siglock);
}
/*
@@ -1251,11 +1262,14 @@ static int wait_task_zombie(struct task_struct *p, int options,
*/
read_unlock(&tasklist_lock);
- retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
+ retval = wo->wo_rusage
+ ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
status = (p->signal->flags & SIGNAL_GROUP_EXIT)
? p->signal->group_exit_code : p->exit_code;
- if (!retval && stat_addr)
- retval = put_user(status, stat_addr);
+ if (!retval && wo->wo_stat)
+ retval = put_user(status, wo->wo_stat);
+
+ infop = wo->wo_info;
if (!retval && infop)
retval = put_user(SIGCHLD, &infop->si_signo);
if (!retval && infop)
@@ -1323,15 +1337,18 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace)
* the lock and this task is uninteresting. If we return nonzero, we have
* released the lock and the system call should return.
*/
-static int wait_task_stopped(int ptrace, struct task_struct *p,
- int options, struct siginfo __user *infop,
- int __user *stat_addr, struct rusage __user *ru)
+static int wait_task_stopped(struct wait_opts *wo,
+ int ptrace, struct task_struct *p)
{
+ struct siginfo __user *infop;
int retval, exit_code, *p_code, why;
uid_t uid = 0; /* unneeded, required by compiler */
pid_t pid;
- if (!(options & WUNTRACED))
+ /*
+ * Traditionally we see ptrace'd stopped tasks regardless of options.
+ */
+ if (!ptrace && !(wo->wo_flags & WUNTRACED))
return 0;
exit_code = 0;
@@ -1345,7 +1362,7 @@ static int wait_task_stopped(int ptrace, struct task_struct *p,
if (!exit_code)
goto unlock_sig;
- if (!unlikely(options & WNOWAIT))
+ if (!unlikely(wo->wo_flags & WNOWAIT))
*p_code = 0;
/* don't need the RCU readlock here as we're holding a spinlock */
@@ -1367,14 +1384,15 @@ unlock_sig:
why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
read_unlock(&tasklist_lock);
- if (unlikely(options & WNOWAIT))
- return wait_noreap_copyout(p, pid, uid,
- why, exit_code,
- infop, ru);
+ if (unlikely(wo->wo_flags & WNOWAIT))
+ return wait_noreap_copyout(wo, p, pid, uid, why, exit_code);
+
+ retval = wo->wo_rusage
+ ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
+ if (!retval && wo->wo_stat)
+ retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat);
- retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
- if (!retval && stat_addr)
- retval = put_user((exit_code << 8) | 0x7f, stat_addr);
+ infop = wo->wo_info;
if (!retval && infop)
retval = put_user(SIGCHLD, &infop->si_signo);
if (!retval && infop)
@@ -1401,15 +1419,13 @@ unlock_sig:
* the lock and this task is uninteresting. If we return nonzero, we have
* released the lock and the system call should return.
*/
-static int wait_task_continued(struct task_struct *p, int options,
- struct siginfo __user *infop,
- int __user *stat_addr, struct rusage __user *ru)
+static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
{
int retval;
pid_t pid;
uid_t uid;
- if (!unlikely(options & WCONTINUED))
+ if (!unlikely(wo->wo_flags & WCONTINUED))
return 0;
if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
@@ -1421,7 +1437,7 @@ static int wait_task_continued(struct task_struct *p, int options,
spin_unlock_irq(&p->sighand->siglock);
return 0;
}
- if (!unlikely(options & WNOWAIT))
+ if (!unlikely(wo->wo_flags & WNOWAIT))
p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
uid = __task_cred(p)->uid;
spin_unlock_irq(&p->sighand->siglock);
@@ -1430,17 +1446,17 @@ static int wait_task_continued(struct task_struct *p, int options,
get_task_struct(p);
read_unlock(&tasklist_lock);
- if (!infop) {
- retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
+ if (!wo->wo_info) {
+ retval = wo->wo_rusage
+ ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
put_task_struct(p);
- if (!retval && stat_addr)
- retval = put_user(0xffff, stat_addr);
+ if (!retval && wo->wo_stat)
+ retval = put_user(0xffff, wo->wo_stat);
if (!retval)
retval = pid;
} else {
- retval = wait_noreap_copyout(p, pid, uid,
- CLD_CONTINUED, SIGCONT,
- infop, ru);
+ retval = wait_noreap_copyout(wo, p, pid, uid,
+ CLD_CONTINUED, SIGCONT);
BUG_ON(retval == 0);
}
@@ -1450,19 +1466,16 @@ static int wait_task_continued(struct task_struct *p, int options,
/*
* Consider @p for a wait by @parent.
*
- * -ECHILD should be in *@notask_error before the first call.
+ * -ECHILD should be in ->notask_error before the first call.
* Returns nonzero for a final return, when we have unlocked tasklist_lock.
* Returns zero if the search for a child should continue;
- * then *@notask_error is 0 if @p is an eligible child,
+ * then ->notask_error is 0 if @p is an eligible child,
* or another error from security_task_wait(), or still -ECHILD.
*/
-static int wait_consider_task(struct task_struct *parent, int ptrace,
- struct task_struct *p, int *notask_error,
- enum pid_type type, struct pid *pid, int options,
- struct siginfo __user *infop,
- int __user *stat_addr, struct rusage __user *ru)
+static int wait_consider_task(struct wait_opts *wo, struct task_struct *parent,
+ int ptrace, struct task_struct *p)
{
- int ret = eligible_child(type, pid, options, p);
+ int ret = eligible_child(wo, p);
if (!ret)
return ret;
@@ -1474,16 +1487,17 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
* to look for security policy problems, rather
* than for mysterious wait bugs.
*/
- if (*notask_error)
- *notask_error = ret;
+ if (wo->notask_error)
+ wo->notask_error = ret;
+ return 0;
}
- if (likely(!ptrace) && unlikely(p->ptrace)) {
+ if (likely(!ptrace) && unlikely(task_ptrace(p))) {
/*
* This child is hidden by ptrace.
* We aren't allowed to see it now, but eventually we will.
*/
- *notask_error = 0;
+ wo->notask_error = 0;
return 0;
}
@@ -1494,34 +1508,30 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
* We don't reap group leaders with subthreads.
*/
if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
- return wait_task_zombie(p, options, infop, stat_addr, ru);
+ return wait_task_zombie(wo, p);
/*
* It's stopped or running now, so it might
* later continue, exit, or stop again.
*/
- *notask_error = 0;
+ wo->notask_error = 0;
if (task_stopped_code(p, ptrace))
- return wait_task_stopped(ptrace, p, options,
- infop, stat_addr, ru);
+ return wait_task_stopped(wo, ptrace, p);
- return wait_task_continued(p, options, infop, stat_addr, ru);
+ return wait_task_continued(wo, p);
}
/*
* Do the work of do_wait() for one thread in the group, @tsk.
*
- * -ECHILD should be in *@notask_error before the first call.
+ * -ECHILD should be in ->notask_error before the first call.
* Returns nonzero for a final return, when we have unlocked tasklist_lock.
* Returns zero if the search for a child should continue; then
- * *@notask_error is 0 if there were any eligible children,
+ * ->notask_error is 0 if there were any eligible children,
* or another error from security_task_wait(), or still -ECHILD.
*/
-static int do_wait_thread(struct task_struct *tsk, int *notask_error,
- enum pid_type type, struct pid *pid, int options,
- struct siginfo __user *infop, int __user *stat_addr,
- struct rusage __user *ru)
+static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
{
struct task_struct *p;
@@ -1530,9 +1540,7 @@ static int do_wait_thread(struct task_struct *tsk, int *notask_error,
* Do not consider detached threads.
*/
if (!task_detached(p)) {
- int ret = wait_consider_task(tsk, 0, p, notask_error,
- type, pid, options,
- infop, stat_addr, ru);
+ int ret = wait_consider_task(wo, tsk, 0, p);
if (ret)
return ret;
}
@@ -1541,22 +1549,12 @@ static int do_wait_thread(struct task_struct *tsk, int *notask_error,
return 0;
}
-static int ptrace_do_wait(struct task_struct *tsk, int *notask_error,
- enum pid_type type, struct pid *pid, int options,
- struct siginfo __user *infop, int __user *stat_addr,
- struct rusage __user *ru)
+static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
{
struct task_struct *p;
- /*
- * Traditionally we see ptrace'd stopped tasks regardless of options.
- */
- options |= WUNTRACED;
-
list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
- int ret = wait_consider_task(tsk, 1, p, notask_error,
- type, pid, options,
- infop, stat_addr, ru);
+ int ret = wait_consider_task(wo, tsk, 1, p);
if (ret)
return ret;
}
@@ -1564,65 +1562,59 @@ static int ptrace_do_wait(struct task_struct *tsk, int *notask_error,
return 0;
}
-static long do_wait(enum pid_type type, struct pid *pid, int options,
- struct siginfo __user *infop, int __user *stat_addr,
- struct rusage __user *ru)
+static long do_wait(struct wait_opts *wo)
{
DECLARE_WAITQUEUE(wait, current);
struct task_struct *tsk;
int retval;
- trace_sched_process_wait(pid);
+ trace_sched_process_wait(wo->wo_pid);
add_wait_queue(&current->signal->wait_chldexit,&wait);
repeat:
/*
* If there is nothing that can match our critiera just get out.
- * We will clear @retval to zero if we see any child that might later
- * match our criteria, even if we are not able to reap it yet.
+ * We will clear ->notask_error to zero if we see any child that
+ * might later match our criteria, even if we are not able to reap
+ * it yet.
*/
- retval = -ECHILD;
- if ((type < PIDTYPE_MAX) && (!pid || hlist_empty(&pid->tasks[type])))
- goto end;
+ wo->notask_error = -ECHILD;
+ if ((wo->wo_type < PIDTYPE_MAX) &&
+ (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type])))
+ goto notask;
- current->state = TASK_INTERRUPTIBLE;
+ set_current_state(TASK_INTERRUPTIBLE);
read_lock(&tasklist_lock);
tsk = current;
do {
- int tsk_result = do_wait_thread(tsk, &retval,
- type, pid, options,
- infop, stat_addr, ru);
- if (!tsk_result)
- tsk_result = ptrace_do_wait(tsk, &retval,
- type, pid, options,
- infop, stat_addr, ru);
- if (tsk_result) {
- /*
- * tasklist_lock is unlocked and we have a final result.
- */
- retval = tsk_result;
+ retval = do_wait_thread(wo, tsk);
+ if (retval)
goto end;
- }
- if (options & __WNOTHREAD)
+ retval = ptrace_do_wait(wo, tsk);
+ if (retval)
+ goto end;
+
+ if (wo->wo_flags & __WNOTHREAD)
break;
- tsk = next_thread(tsk);
- BUG_ON(tsk->signal != current->signal);
- } while (tsk != current);
+ } while_each_thread(current, tsk);
read_unlock(&tasklist_lock);
- if (!retval && !(options & WNOHANG)) {
+notask:
+ retval = wo->notask_error;
+ if (!retval && !(wo->wo_flags & WNOHANG)) {
retval = -ERESTARTSYS;
if (!signal_pending(current)) {
schedule();
goto repeat;
}
}
-
end:
- current->state = TASK_RUNNING;
+ __set_current_state(TASK_RUNNING);
remove_wait_queue(&current->signal->wait_chldexit,&wait);
- if (infop) {
+ if (wo->wo_info) {
+ struct siginfo __user *infop = wo->wo_info;
+
if (retval > 0)
retval = 0;
else {
@@ -1651,6 +1643,7 @@ end:
SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
infop, int, options, struct rusage __user *, ru)
{
+ struct wait_opts wo;
struct pid *pid = NULL;
enum pid_type type;
long ret;
@@ -1680,7 +1673,14 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
if (type < PIDTYPE_MAX)
pid = find_get_pid(upid);
- ret = do_wait(type, pid, options, infop, NULL, ru);
+
+ wo.wo_type = type;
+ wo.wo_pid = pid;
+ wo.wo_flags = options;
+ wo.wo_info = infop;
+ wo.wo_stat = NULL;
+ wo.wo_rusage = ru;
+ ret = do_wait(&wo);
put_pid(pid);
/* avoid REGPARM breakage on x86: */
@@ -1691,6 +1691,7 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
int, options, struct rusage __user *, ru)
{
+ struct wait_opts wo;
struct pid *pid = NULL;
enum pid_type type;
long ret;
@@ -1712,7 +1713,13 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
pid = find_get_pid(upid);
}
- ret = do_wait(type, pid, options | WEXITED, NULL, stat_addr, ru);
+ wo.wo_type = type;
+ wo.wo_pid = pid;
+ wo.wo_flags = options | WEXITED;
+ wo.wo_info = NULL;
+ wo.wo_stat = stat_addr;
+ wo.wo_rusage = ru;
+ ret = do_wait(&wo);
put_pid(pid);
/* avoid REGPARM breakage on x86: */
diff --git a/kernel/fork.c b/kernel/fork.c
index 875ffbdd96d..144326b7af5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -17,7 +17,6 @@
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/completion.h>
-#include <linux/mnt_namespace.h>
#include <linux/personality.h>
#include <linux/mempolicy.h>
#include <linux/sem.h>
@@ -61,8 +60,8 @@
#include <linux/proc_fs.h>
#include <linux/blkdev.h>
#include <linux/fs_struct.h>
-#include <trace/sched.h>
#include <linux/magic.h>
+#include <linux/perf_counter.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -71,6 +70,8 @@
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
+#include <trace/events/sched.h>
+
/*
* Protected counters by write_lock_irq(&tasklist_lock)
*/
@@ -83,8 +84,6 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0;
__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
-DEFINE_TRACE(sched_process_fork);
-
int nr_processes(void)
{
int cpu;
@@ -178,7 +177,7 @@ void __init fork_init(unsigned long mempages)
/* create a slab on which task_structs can be allocated */
task_struct_cachep =
kmem_cache_create("task_struct", sizeof(struct task_struct),
- ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL);
+ ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
#endif
/* do the arch specific task caches init */
@@ -568,18 +567,18 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
* the value intact in a core dump, and to save the unnecessary
* trouble otherwise. Userland only wants this done for a sys_exit.
*/
- if (tsk->clear_child_tid
- && !(tsk->flags & PF_SIGNALED)
- && atomic_read(&mm->mm_users) > 1) {
- u32 __user * tidptr = tsk->clear_child_tid;
+ if (tsk->clear_child_tid) {
+ if (!(tsk->flags & PF_SIGNALED) &&
+ atomic_read(&mm->mm_users) > 1) {
+ /*
+ * We don't check the error code - if userspace has
+ * not set up a proper pointer then tough luck.
+ */
+ put_user(0, tsk->clear_child_tid);
+ sys_futex(tsk->clear_child_tid, FUTEX_WAKE,
+ 1, NULL, NULL, 0);
+ }
tsk->clear_child_tid = NULL;
-
- /*
- * We don't check the error code - if userspace has
- * not set up a proper pointer then tough luck.
- */
- put_user(0, tidptr);
- sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0);
}
}
@@ -982,6 +981,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if (!p)
goto fork_out;
+ ftrace_graph_init_task(p);
+
rt_mutex_init_task(p);
#ifdef CONFIG_PROVE_LOCKING
@@ -1027,7 +1028,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->vfork_done = NULL;
spin_lock_init(&p->alloc_lock);
- clear_tsk_thread_flag(p, TIF_SIGPENDING);
init_sigpending(&p->pending);
p->utime = cputime_zero;
@@ -1089,12 +1089,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
#ifdef CONFIG_DEBUG_MUTEXES
p->blocked_on = NULL; /* not blocked yet */
#endif
- if (unlikely(current->ptrace))
- ptrace_fork(p, clone_flags);
+
+ p->bts = NULL;
/* Perform scheduler related setup. Assign this task to a CPU. */
sched_fork(p, clone_flags);
+ retval = perf_counter_init_task(p);
+ if (retval)
+ goto bad_fork_cleanup_policy;
+
if ((retval = audit_alloc(p)))
goto bad_fork_cleanup_policy;
/* copy all the process information */
@@ -1131,8 +1135,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
}
}
- ftrace_graph_init_task(p);
-
p->pid = pid_nr(pid);
p->tgid = p->pid;
if (clone_flags & CLONE_THREAD)
@@ -1141,7 +1143,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if (current->nsproxy != p->nsproxy) {
retval = ns_cgroup_clone(p, pid);
if (retval)
- goto bad_fork_free_graph;
+ goto bad_fork_free_pid;
}
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
@@ -1233,7 +1235,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
spin_unlock(&current->sighand->siglock);
write_unlock_irq(&tasklist_lock);
retval = -ERESTARTNOINTR;
- goto bad_fork_free_graph;
+ goto bad_fork_free_pid;
}
if (clone_flags & CLONE_THREAD) {
@@ -1266,10 +1268,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
cgroup_post_fork(p);
+ perf_counter_fork(p);
return p;
-bad_fork_free_graph:
- ftrace_graph_exit_task(p);
bad_fork_free_pid:
if (pid != &init_struct_pid)
free_pid(pid);
@@ -1293,6 +1294,7 @@ bad_fork_cleanup_semundo:
bad_fork_cleanup_audit:
audit_free(p);
bad_fork_cleanup_policy:
+ perf_counter_free_task(p);
#ifdef CONFIG_NUMA
mpol_put(p->mempolicy);
bad_fork_cleanup_cgroup:
@@ -1461,20 +1463,20 @@ void __init proc_caches_init(void)
{
sighand_cachep = kmem_cache_create("sighand_cache",
sizeof(struct sighand_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU,
- sighand_ctor);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU|
+ SLAB_NOTRACK, sighand_ctor);
signal_cachep = kmem_cache_create("signal_cache",
sizeof(struct signal_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
files_cachep = kmem_cache_create("files_cache",
sizeof(struct files_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
fs_cachep = kmem_cache_create("fs_cache",
sizeof(struct fs_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
mm_cachep = kmem_cache_create("mm_struct",
sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
mmap_init();
}
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 2f4936cf708..bd1d42b17cb 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -44,12 +44,19 @@ void refrigerator(void)
recalc_sigpending(); /* We sent fake signal, clean it up */
spin_unlock_irq(&current->sighand->siglock);
+ /* prevent accounting of that task to load */
+ current->flags |= PF_FREEZING;
+
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
if (!frozen(current))
break;
schedule();
}
+
+ /* Remove the accounting blocker */
+ current->flags &= ~PF_FREEZING;
+
pr_debug("%s left refrigerator\n", current->comm);
__set_current_state(save);
}
diff --git a/kernel/futex.c b/kernel/futex.c
index d546b2d53a6..e18cfbdc719 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -19,6 +19,10 @@
* PRIVATE futexes by Eric Dumazet
* Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
*
+ * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
+ * Copyright (C) IBM Corporation, 2009
+ * Thanks to Thomas Gleixner for conceptual design and careful reviews.
+ *
* Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
* enough at me, Linus for the original (flawed) idea, Matthew
* Kirkwood for proof-of-concept implementation.
@@ -96,8 +100,8 @@ struct futex_pi_state {
*/
struct futex_q {
struct plist_node list;
- /* There can only be a single waiter */
- wait_queue_head_t waiter;
+ /* Waiter reference */
+ struct task_struct *task;
/* Which hash list lock to use: */
spinlock_t *lock_ptr;
@@ -107,7 +111,9 @@ struct futex_q {
/* Optional priority inheritance state: */
struct futex_pi_state *pi_state;
- struct task_struct *task;
+
+ /* rt_waiter storage for requeue_pi: */
+ struct rt_mutex_waiter *rt_waiter;
/* Bitset for the optional bitmasked wakeup */
u32 bitset;
@@ -241,6 +247,7 @@ again:
if (err < 0)
return err;
+ page = compound_head(page);
lock_page(page);
if (!page->mapping) {
unlock_page(page);
@@ -278,6 +285,44 @@ void put_futex_key(int fshared, union futex_key *key)
drop_futex_key_refs(key);
}
+/*
+ * fault_in_user_writeable - fault in user address and verify RW access
+ * @uaddr: pointer to faulting user space address
+ *
+ * Slow path to fixup the fault we just took in the atomic write
+ * access to @uaddr.
+ *
+ * We have no generic implementation of a non destructive write to the
+ * user address. We know that we faulted in the atomic pagefault
+ * disabled section so we can as well avoid the #PF overhead by
+ * calling get_user_pages() right away.
+ */
+static int fault_in_user_writeable(u32 __user *uaddr)
+{
+ int ret = get_user_pages(current, current->mm, (unsigned long)uaddr,
+ 1, 1, 0, NULL, NULL);
+ return ret < 0 ? ret : 0;
+}
+
+/**
+ * futex_top_waiter() - Return the highest priority waiter on a futex
+ * @hb: the hash bucket the futex_q's reside in
+ * @key: the futex key (to distinguish it from other futex futex_q's)
+ *
+ * Must be called with the hb lock held.
+ */
+static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
+ union futex_key *key)
+{
+ struct futex_q *this;
+
+ plist_for_each_entry(this, &hb->chain, list) {
+ if (match_futex(&this->key, key))
+ return this;
+ }
+ return NULL;
+}
+
static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
{
u32 curval;
@@ -539,28 +584,160 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
return 0;
}
+/**
+ * futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex
+ * @uaddr: the pi futex user address
+ * @hb: the pi futex hash bucket
+ * @key: the futex key associated with uaddr and hb
+ * @ps: the pi_state pointer where we store the result of the
+ * lookup
+ * @task: the task to perform the atomic lock work for. This will
+ * be "current" except in the case of requeue pi.
+ * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
+ *
+ * Returns:
+ * 0 - ready to wait
+ * 1 - acquired the lock
+ * <0 - error
+ *
+ * The hb->lock and futex_key refs shall be held by the caller.
+ */
+static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
+ union futex_key *key,
+ struct futex_pi_state **ps,
+ struct task_struct *task, int set_waiters)
+{
+ int lock_taken, ret, ownerdied = 0;
+ u32 uval, newval, curval;
+
+retry:
+ ret = lock_taken = 0;
+
+ /*
+ * To avoid races, we attempt to take the lock here again
+ * (by doing a 0 -> TID atomic cmpxchg), while holding all
+ * the locks. It will most likely not succeed.
+ */
+ newval = task_pid_vnr(task);
+ if (set_waiters)
+ newval |= FUTEX_WAITERS;
+
+ curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
+
+ if (unlikely(curval == -EFAULT))
+ return -EFAULT;
+
+ /*
+ * Detect deadlocks.
+ */
+ if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task))))
+ return -EDEADLK;
+
+ /*
+ * Surprise - we got the lock. Just return to userspace:
+ */
+ if (unlikely(!curval))
+ return 1;
+
+ uval = curval;
+
+ /*
+ * Set the FUTEX_WAITERS flag, so the owner will know it has someone
+ * to wake at the next unlock.
+ */
+ newval = curval | FUTEX_WAITERS;
+
+ /*
+ * There are two cases, where a futex might have no owner (the
+ * owner TID is 0): OWNER_DIED. We take over the futex in this
+ * case. We also do an unconditional take over, when the owner
+ * of the futex died.
+ *
+ * This is safe as we are protected by the hash bucket lock !
+ */
+ if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
+ /* Keep the OWNER_DIED bit */
+ newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task);
+ ownerdied = 0;
+ lock_taken = 1;
+ }
+
+ curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
+
+ if (unlikely(curval == -EFAULT))
+ return -EFAULT;
+ if (unlikely(curval != uval))
+ goto retry;
+
+ /*
+ * We took the lock due to owner died take over.
+ */
+ if (unlikely(lock_taken))
+ return 1;
+
+ /*
+ * We dont have the lock. Look up the PI state (or create it if
+ * we are the first waiter):
+ */
+ ret = lookup_pi_state(uval, hb, key, ps);
+
+ if (unlikely(ret)) {
+ switch (ret) {
+ case -ESRCH:
+ /*
+ * No owner found for this futex. Check if the
+ * OWNER_DIED bit is set to figure out whether
+ * this is a robust futex or not.
+ */
+ if (get_futex_value_locked(&curval, uaddr))
+ return -EFAULT;
+
+ /*
+ * We simply start over in case of a robust
+ * futex. The code above will take the futex
+ * and return happy.
+ */
+ if (curval & FUTEX_OWNER_DIED) {
+ ownerdied = 1;
+ goto retry;
+ }
+ default:
+ break;
+ }
+ }
+
+ return ret;
+}
+
/*
* The hash bucket lock must be held when this is called.
* Afterwards, the futex_q must not be accessed.
*/
static void wake_futex(struct futex_q *q)
{
- plist_del(&q->list, &q->list.plist);
+ struct task_struct *p = q->task;
+
/*
- * The lock in wake_up_all() is a crucial memory barrier after the
- * plist_del() and also before assigning to q->lock_ptr.
+ * We set q->lock_ptr = NULL _before_ we wake up the task. If
+ * a non futex wake up happens on another CPU then the task
+ * might exit and p would dereference a non existing task
+ * struct. Prevent this by holding a reference on p across the
+ * wake up.
*/
- wake_up(&q->waiter);
+ get_task_struct(p);
+
+ plist_del(&q->list, &q->list.plist);
/*
- * The waiting task can free the futex_q as soon as this is written,
- * without taking any locks. This must come last.
- *
- * A memory barrier is required here to prevent the following store to
- * lock_ptr from getting ahead of the wakeup. Clearing the lock at the
- * end of wake_up() does not prevent this store from moving.
+ * The waiting task can free the futex_q as soon as
+ * q->lock_ptr = NULL is written, without taking any locks. A
+ * memory barrier is required here to prevent the following
+ * store to lock_ptr from getting ahead of the plist_del.
*/
smp_wmb();
q->lock_ptr = NULL;
+
+ wake_up_state(p, TASK_NORMAL);
+ put_task_struct(p);
}
static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
@@ -689,7 +866,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
plist_for_each_entry_safe(this, next, head, list) {
if (match_futex (&this->key, &key)) {
- if (this->pi_state) {
+ if (this->pi_state || this->rt_waiter) {
ret = -EINVAL;
break;
}
@@ -739,7 +916,6 @@ retry:
retry_private:
op_ret = futex_atomic_op_inuser(op, uaddr2);
if (unlikely(op_ret < 0)) {
- u32 dummy;
double_unlock_hb(hb1, hb2);
@@ -757,7 +933,7 @@ retry_private:
goto out_put_keys;
}
- ret = get_user(dummy, uaddr2);
+ ret = fault_in_user_writeable(uaddr2);
if (ret)
goto out_put_keys;
@@ -802,24 +978,194 @@ out:
return ret;
}
-/*
- * Requeue all waiters hashed on one physical page to another
- * physical page.
+/**
+ * requeue_futex() - Requeue a futex_q from one hb to another
+ * @q: the futex_q to requeue
+ * @hb1: the source hash_bucket
+ * @hb2: the target hash_bucket
+ * @key2: the new key for the requeued futex_q
+ */
+static inline
+void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
+ struct futex_hash_bucket *hb2, union futex_key *key2)
+{
+
+ /*
+ * If key1 and key2 hash to the same bucket, no need to
+ * requeue.
+ */
+ if (likely(&hb1->chain != &hb2->chain)) {
+ plist_del(&q->list, &hb1->chain);
+ plist_add(&q->list, &hb2->chain);
+ q->lock_ptr = &hb2->lock;
+#ifdef CONFIG_DEBUG_PI_LIST
+ q->list.plist.lock = &hb2->lock;
+#endif
+ }
+ get_futex_key_refs(key2);
+ q->key = *key2;
+}
+
+/**
+ * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
+ * q: the futex_q
+ * key: the key of the requeue target futex
+ * hb: the hash_bucket of the requeue target futex
+ *
+ * During futex_requeue, with requeue_pi=1, it is possible to acquire the
+ * target futex if it is uncontended or via a lock steal. Set the futex_q key
+ * to the requeue target futex so the waiter can detect the wakeup on the right
+ * futex, but remove it from the hb and NULL the rt_waiter so it can detect
+ * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock
+ * to protect access to the pi_state to fixup the owner later. Must be called
+ * with both q->lock_ptr and hb->lock held.
+ */
+static inline
+void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
+ struct futex_hash_bucket *hb)
+{
+ drop_futex_key_refs(&q->key);
+ get_futex_key_refs(key);
+ q->key = *key;
+
+ WARN_ON(plist_node_empty(&q->list));
+ plist_del(&q->list, &q->list.plist);
+
+ WARN_ON(!q->rt_waiter);
+ q->rt_waiter = NULL;
+
+ q->lock_ptr = &hb->lock;
+#ifdef CONFIG_DEBUG_PI_LIST
+ q->list.plist.lock = &hb->lock;
+#endif
+
+ wake_up_state(q->task, TASK_NORMAL);
+}
+
+/**
+ * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
+ * @pifutex: the user address of the to futex
+ * @hb1: the from futex hash bucket, must be locked by the caller
+ * @hb2: the to futex hash bucket, must be locked by the caller
+ * @key1: the from futex key
+ * @key2: the to futex key
+ * @ps: address to store the pi_state pointer
+ * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
+ *
+ * Try and get the lock on behalf of the top waiter if we can do it atomically.
+ * Wake the top waiter if we succeed. If the caller specified set_waiters,
+ * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
+ * hb1 and hb2 must be held by the caller.
+ *
+ * Returns:
+ * 0 - failed to acquire the lock atomicly
+ * 1 - acquired the lock
+ * <0 - error
+ */
+static int futex_proxy_trylock_atomic(u32 __user *pifutex,
+ struct futex_hash_bucket *hb1,
+ struct futex_hash_bucket *hb2,
+ union futex_key *key1, union futex_key *key2,
+ struct futex_pi_state **ps, int set_waiters)
+{
+ struct futex_q *top_waiter = NULL;
+ u32 curval;
+ int ret;
+
+ if (get_futex_value_locked(&curval, pifutex))
+ return -EFAULT;
+
+ /*
+ * Find the top_waiter and determine if there are additional waiters.
+ * If the caller intends to requeue more than 1 waiter to pifutex,
+ * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
+ * as we have means to handle the possible fault. If not, don't set
+ * the bit unecessarily as it will force the subsequent unlock to enter
+ * the kernel.
+ */
+ top_waiter = futex_top_waiter(hb1, key1);
+
+ /* There are no waiters, nothing for us to do. */
+ if (!top_waiter)
+ return 0;
+
+ /*
+ * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in
+ * the contended case or if set_waiters is 1. The pi_state is returned
+ * in ps in contended cases.
+ */
+ ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
+ set_waiters);
+ if (ret == 1)
+ requeue_pi_wake_futex(top_waiter, key2, hb2);
+
+ return ret;
+}
+
+/**
+ * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
+ * uaddr1: source futex user address
+ * uaddr2: target futex user address
+ * nr_wake: number of waiters to wake (must be 1 for requeue_pi)
+ * nr_requeue: number of waiters to requeue (0-INT_MAX)
+ * requeue_pi: if we are attempting to requeue from a non-pi futex to a
+ * pi futex (pi to pi requeue is not supported)
+ *
+ * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
+ * uaddr2 atomically on behalf of the top waiter.
+ *
+ * Returns:
+ * >=0 - on success, the number of tasks requeued or woken
+ * <0 - on error
*/
static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
- int nr_wake, int nr_requeue, u32 *cmpval)
+ int nr_wake, int nr_requeue, u32 *cmpval,
+ int requeue_pi)
{
union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
+ int drop_count = 0, task_count = 0, ret;
+ struct futex_pi_state *pi_state = NULL;
struct futex_hash_bucket *hb1, *hb2;
struct plist_head *head1;
struct futex_q *this, *next;
- int ret, drop_count = 0;
+ u32 curval2;
+
+ if (requeue_pi) {
+ /*
+ * requeue_pi requires a pi_state, try to allocate it now
+ * without any locks in case it fails.
+ */
+ if (refill_pi_state_cache())
+ return -ENOMEM;
+ /*
+ * requeue_pi must wake as many tasks as it can, up to nr_wake
+ * + nr_requeue, since it acquires the rt_mutex prior to
+ * returning to userspace, so as to not leave the rt_mutex with
+ * waiters and no owner. However, second and third wake-ups
+ * cannot be predicted as they involve race conditions with the
+ * first wake and a fault while looking up the pi_state. Both
+ * pthread_cond_signal() and pthread_cond_broadcast() should
+ * use nr_wake=1.
+ */
+ if (nr_wake != 1)
+ return -EINVAL;
+ }
retry:
+ if (pi_state != NULL) {
+ /*
+ * We will have to lookup the pi_state again, so free this one
+ * to keep the accounting correct.
+ */
+ free_pi_state(pi_state);
+ pi_state = NULL;
+ }
+
ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
if (unlikely(ret != 0))
goto out;
- ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_READ);
+ ret = get_futex_key(uaddr2, fshared, &key2,
+ requeue_pi ? VERIFY_WRITE : VERIFY_READ);
if (unlikely(ret != 0))
goto out_put_key1;
@@ -854,32 +1200,106 @@ retry_private:
}
}
+ if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
+ /*
+ * Attempt to acquire uaddr2 and wake the top waiter. If we
+ * intend to requeue waiters, force setting the FUTEX_WAITERS
+ * bit. We force this here where we are able to easily handle
+ * faults rather in the requeue loop below.
+ */
+ ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
+ &key2, &pi_state, nr_requeue);
+
+ /*
+ * At this point the top_waiter has either taken uaddr2 or is
+ * waiting on it. If the former, then the pi_state will not
+ * exist yet, look it up one more time to ensure we have a
+ * reference to it.
+ */
+ if (ret == 1) {
+ WARN_ON(pi_state);
+ task_count++;
+ ret = get_futex_value_locked(&curval2, uaddr2);
+ if (!ret)
+ ret = lookup_pi_state(curval2, hb2, &key2,
+ &pi_state);
+ }
+
+ switch (ret) {
+ case 0:
+ break;
+ case -EFAULT:
+ double_unlock_hb(hb1, hb2);
+ put_futex_key(fshared, &key2);
+ put_futex_key(fshared, &key1);
+ ret = fault_in_user_writeable(uaddr2);
+ if (!ret)
+ goto retry;
+ goto out;
+ case -EAGAIN:
+ /* The owner was exiting, try again. */
+ double_unlock_hb(hb1, hb2);
+ put_futex_key(fshared, &key2);
+ put_futex_key(fshared, &key1);
+ cond_resched();
+ goto retry;
+ default:
+ goto out_unlock;
+ }
+ }
+
head1 = &hb1->chain;
plist_for_each_entry_safe(this, next, head1, list) {
- if (!match_futex (&this->key, &key1))
+ if (task_count - nr_wake >= nr_requeue)
+ break;
+
+ if (!match_futex(&this->key, &key1))
continue;
- if (++ret <= nr_wake) {
+
+ /*
+ * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always
+ * be paired with each other and no other futex ops.
+ */
+ if ((requeue_pi && !this->rt_waiter) ||
+ (!requeue_pi && this->rt_waiter)) {
+ ret = -EINVAL;
+ break;
+ }
+
+ /*
+ * Wake nr_wake waiters. For requeue_pi, if we acquired the
+ * lock, we already woke the top_waiter. If not, it will be
+ * woken by futex_unlock_pi().
+ */
+ if (++task_count <= nr_wake && !requeue_pi) {
wake_futex(this);
- } else {
- /*
- * If key1 and key2 hash to the same bucket, no need to
- * requeue.
- */
- if (likely(head1 != &hb2->chain)) {
- plist_del(&this->list, &hb1->chain);
- plist_add(&this->list, &hb2->chain);
- this->lock_ptr = &hb2->lock;
-#ifdef CONFIG_DEBUG_PI_LIST
- this->list.plist.lock = &hb2->lock;
-#endif
- }
- this->key = key2;
- get_futex_key_refs(&key2);
- drop_count++;
+ continue;
+ }
- if (ret - nr_wake >= nr_requeue)
- break;
+ /*
+ * Requeue nr_requeue waiters and possibly one more in the case
+ * of requeue_pi if we couldn't acquire the lock atomically.
+ */
+ if (requeue_pi) {
+ /* Prepare the waiter to take the rt_mutex. */
+ atomic_inc(&pi_state->refcount);
+ this->pi_state = pi_state;
+ ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
+ this->rt_waiter,
+ this->task, 1);
+ if (ret == 1) {
+ /* We got the lock. */
+ requeue_pi_wake_futex(this, &key2, hb2);
+ continue;
+ } else if (ret) {
+ /* -EDEADLK */
+ this->pi_state = NULL;
+ free_pi_state(pi_state);
+ goto out_unlock;
+ }
}
+ requeue_futex(this, hb1, hb2, &key2);
+ drop_count++;
}
out_unlock:
@@ -899,7 +1319,9 @@ out_put_keys:
out_put_key1:
put_futex_key(fshared, &key1);
out:
- return ret;
+ if (pi_state != NULL)
+ free_pi_state(pi_state);
+ return ret ? ret : task_count;
}
/* The key must be already stored in q->key. */
@@ -907,8 +1329,6 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
{
struct futex_hash_bucket *hb;
- init_waitqueue_head(&q->waiter);
-
get_futex_key_refs(&q->key);
hb = hash_futex(&q->key);
q->lock_ptr = &hb->lock;
@@ -1097,7 +1517,7 @@ retry:
handle_fault:
spin_unlock(q->lock_ptr);
- ret = get_user(uval, uaddr);
+ ret = fault_in_user_writeable(uaddr);
spin_lock(q->lock_ptr);
@@ -1119,35 +1539,149 @@ handle_fault:
*/
#define FLAGS_SHARED 0x01
#define FLAGS_CLOCKRT 0x02
+#define FLAGS_HAS_TIMEOUT 0x04
static long futex_wait_restart(struct restart_block *restart);
-static int futex_wait(u32 __user *uaddr, int fshared,
- u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
+/**
+ * fixup_owner() - Post lock pi_state and corner case management
+ * @uaddr: user address of the futex
+ * @fshared: whether the futex is shared (1) or not (0)
+ * @q: futex_q (contains pi_state and access to the rt_mutex)
+ * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
+ *
+ * After attempting to lock an rt_mutex, this function is called to cleanup
+ * the pi_state owner as well as handle race conditions that may allow us to
+ * acquire the lock. Must be called with the hb lock held.
+ *
+ * Returns:
+ * 1 - success, lock taken
+ * 0 - success, lock not taken
+ * <0 - on error (-EFAULT)
+ */
+static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
+ int locked)
{
- struct task_struct *curr = current;
- struct restart_block *restart;
- DECLARE_WAITQUEUE(wait, curr);
- struct futex_hash_bucket *hb;
- struct futex_q q;
- u32 uval;
- int ret;
- struct hrtimer_sleeper t;
- int rem = 0;
+ struct task_struct *owner;
+ int ret = 0;
- if (!bitset)
- return -EINVAL;
+ if (locked) {
+ /*
+ * Got the lock. We might not be the anticipated owner if we
+ * did a lock-steal - fix up the PI-state in that case:
+ */
+ if (q->pi_state->owner != current)
+ ret = fixup_pi_state_owner(uaddr, q, current, fshared);
+ goto out;
+ }
- q.pi_state = NULL;
- q.bitset = bitset;
-retry:
- q.key = FUTEX_KEY_INIT;
- ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_READ);
- if (unlikely(ret != 0))
+ /*
+ * Catch the rare case, where the lock was released when we were on the
+ * way back before we locked the hash bucket.
+ */
+ if (q->pi_state->owner == current) {
+ /*
+ * Try to get the rt_mutex now. This might fail as some other
+ * task acquired the rt_mutex after we removed ourself from the
+ * rt_mutex waiters list.
+ */
+ if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
+ locked = 1;
+ goto out;
+ }
+
+ /*
+ * pi_state is incorrect, some other task did a lock steal and
+ * we returned due to timeout or signal without taking the
+ * rt_mutex. Too late. We can access the rt_mutex_owner without
+ * locking, as the other task is now blocked on the hash bucket
+ * lock. Fix the state up.
+ */
+ owner = rt_mutex_owner(&q->pi_state->pi_mutex);
+ ret = fixup_pi_state_owner(uaddr, q, owner, fshared);
goto out;
+ }
-retry_private:
- hb = queue_lock(&q);
+ /*
+ * Paranoia check. If we did not take the lock, then we should not be
+ * the owner, nor the pending owner, of the rt_mutex.
+ */
+ if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
+ printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
+ "pi-state %p\n", ret,
+ q->pi_state->pi_mutex.owner,
+ q->pi_state->owner);
+
+out:
+ return ret ? ret : locked;
+}
+
+/**
+ * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
+ * @hb: the futex hash bucket, must be locked by the caller
+ * @q: the futex_q to queue up on
+ * @timeout: the prepared hrtimer_sleeper, or null for no timeout
+ */
+static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
+ struct hrtimer_sleeper *timeout)
+{
+ queue_me(q, hb);
+
+ /*
+ * There might have been scheduling since the queue_me(), as we
+ * cannot hold a spinlock across the get_user() in case it
+ * faults, and we cannot just set TASK_INTERRUPTIBLE state when
+ * queueing ourselves into the futex hash. This code thus has to
+ * rely on the futex_wake() code removing us from hash when it
+ * wakes us up.
+ */
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ /* Arm the timer */
+ if (timeout) {
+ hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
+ if (!hrtimer_active(&timeout->timer))
+ timeout->task = NULL;
+ }
+
+ /*
+ * !plist_node_empty() is safe here without any lock.
+ * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
+ */
+ if (likely(!plist_node_empty(&q->list))) {
+ /*
+ * If the timer has already expired, current will already be
+ * flagged for rescheduling. Only call schedule if there
+ * is no timeout, or if it has yet to expire.
+ */
+ if (!timeout || timeout->task)
+ schedule();
+ }
+ __set_current_state(TASK_RUNNING);
+}
+
+/**
+ * futex_wait_setup() - Prepare to wait on a futex
+ * @uaddr: the futex userspace address
+ * @val: the expected value
+ * @fshared: whether the futex is shared (1) or not (0)
+ * @q: the associated futex_q
+ * @hb: storage for hash_bucket pointer to be returned to caller
+ *
+ * Setup the futex_q and locate the hash_bucket. Get the futex value and
+ * compare it with the expected value. Handle atomic faults internally.
+ * Return with the hb lock held and a q.key reference on success, and unlocked
+ * with no q.key reference on failure.
+ *
+ * Returns:
+ * 0 - uaddr contains val and hb has been locked
+ * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked
+ */
+static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
+ struct futex_q *q, struct futex_hash_bucket **hb)
+{
+ u32 uval;
+ int ret;
/*
* Access the page AFTER the hash-bucket is locked.
@@ -1165,95 +1699,83 @@ retry_private:
* A consequence is that futex_wait() can return zero and absorb
* a wakeup when *uaddr != val on entry to the syscall. This is
* rare, but normal.
- *
- * For shared futexes, we hold the mmap semaphore, so the mapping
- * cannot have changed since we looked it up in get_futex_key.
*/
+retry:
+ q->key = FUTEX_KEY_INIT;
+ ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ);
+ if (unlikely(ret != 0))
+ return ret;
+
+retry_private:
+ *hb = queue_lock(q);
+
ret = get_futex_value_locked(&uval, uaddr);
- if (unlikely(ret)) {
- queue_unlock(&q, hb);
+ if (ret) {
+ queue_unlock(q, *hb);
ret = get_user(uval, uaddr);
if (ret)
- goto out_put_key;
+ goto out;
if (!fshared)
goto retry_private;
- put_futex_key(fshared, &q.key);
+ put_futex_key(fshared, &q->key);
goto retry;
}
- ret = -EWOULDBLOCK;
- if (unlikely(uval != val)) {
- queue_unlock(&q, hb);
- goto out_put_key;
- }
- /* Only actually queue if *uaddr contained val. */
- queue_me(&q, hb);
+ if (uval != val) {
+ queue_unlock(q, *hb);
+ ret = -EWOULDBLOCK;
+ }
- /*
- * There might have been scheduling since the queue_me(), as we
- * cannot hold a spinlock across the get_user() in case it
- * faults, and we cannot just set TASK_INTERRUPTIBLE state when
- * queueing ourselves into the futex hash. This code thus has to
- * rely on the futex_wake() code removing us from hash when it
- * wakes us up.
- */
+out:
+ if (ret)
+ put_futex_key(fshared, &q->key);
+ return ret;
+}
- /* add_wait_queue is the barrier after __set_current_state. */
- __set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue(&q.waiter, &wait);
- /*
- * !plist_node_empty() is safe here without any lock.
- * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
- */
- if (likely(!plist_node_empty(&q.list))) {
- if (!abs_time)
- schedule();
- else {
- hrtimer_init_on_stack(&t.timer,
- clockrt ? CLOCK_REALTIME :
- CLOCK_MONOTONIC,
- HRTIMER_MODE_ABS);
- hrtimer_init_sleeper(&t, current);
- hrtimer_set_expires_range_ns(&t.timer, *abs_time,
- current->timer_slack_ns);
-
- hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
- if (!hrtimer_active(&t.timer))
- t.task = NULL;
+static int futex_wait(u32 __user *uaddr, int fshared,
+ u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
+{
+ struct hrtimer_sleeper timeout, *to = NULL;
+ struct restart_block *restart;
+ struct futex_hash_bucket *hb;
+ struct futex_q q;
+ int ret;
- /*
- * the timer could have already expired, in which
- * case current would be flagged for rescheduling.
- * Don't bother calling schedule.
- */
- if (likely(t.task))
- schedule();
+ if (!bitset)
+ return -EINVAL;
- hrtimer_cancel(&t.timer);
+ q.pi_state = NULL;
+ q.bitset = bitset;
+ q.rt_waiter = NULL;
- /* Flag if a timeout occured */
- rem = (t.task == NULL);
+ if (abs_time) {
+ to = &timeout;
- destroy_hrtimer_on_stack(&t.timer);
- }
+ hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
+ CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ hrtimer_init_sleeper(to, current);
+ hrtimer_set_expires_range_ns(&to->timer, *abs_time,
+ current->timer_slack_ns);
}
- __set_current_state(TASK_RUNNING);
- /*
- * NOTE: we don't remove ourselves from the waitqueue because
- * we are the only user of it.
- */
+ /* Prepare to wait on uaddr. */
+ ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
+ if (ret)
+ goto out;
+
+ /* queue_me and wait for wakeup, timeout, or a signal. */
+ futex_wait_queue_me(hb, &q, to);
/* If we were woken (and unqueued), we succeeded, whatever. */
ret = 0;
if (!unqueue_me(&q))
goto out_put_key;
ret = -ETIMEDOUT;
- if (rem)
+ if (to && !to->task)
goto out_put_key;
/*
@@ -1270,7 +1792,7 @@ retry_private:
restart->futex.val = val;
restart->futex.time = abs_time->tv64;
restart->futex.bitset = bitset;
- restart->futex.flags = 0;
+ restart->futex.flags = FLAGS_HAS_TIMEOUT;
if (fshared)
restart->futex.flags |= FLAGS_SHARED;
@@ -1282,6 +1804,10 @@ retry_private:
out_put_key:
put_futex_key(fshared, &q.key);
out:
+ if (to) {
+ hrtimer_cancel(&to->timer);
+ destroy_hrtimer_on_stack(&to->timer);
+ }
return ret;
}
@@ -1290,13 +1816,16 @@ static long futex_wait_restart(struct restart_block *restart)
{
u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
int fshared = 0;
- ktime_t t;
+ ktime_t t, *tp = NULL;
- t.tv64 = restart->futex.time;
+ if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
+ t.tv64 = restart->futex.time;
+ tp = &t;
+ }
restart->fn = do_no_restart_syscall;
if (restart->futex.flags & FLAGS_SHARED)
fshared = 1;
- return (long)futex_wait(uaddr, fshared, restart->futex.val, &t,
+ return (long)futex_wait(uaddr, fshared, restart->futex.val, tp,
restart->futex.bitset,
restart->futex.flags & FLAGS_CLOCKRT);
}
@@ -1312,11 +1841,9 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
int detect, ktime_t *time, int trylock)
{
struct hrtimer_sleeper timeout, *to = NULL;
- struct task_struct *curr = current;
struct futex_hash_bucket *hb;
- u32 uval, newval, curval;
struct futex_q q;
- int ret, lock_taken, ownerdied = 0;
+ int res, ret;
if (refill_pi_state_cache())
return -ENOMEM;
@@ -1330,6 +1857,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
}
q.pi_state = NULL;
+ q.rt_waiter = NULL;
retry:
q.key = FUTEX_KEY_INIT;
ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE);
@@ -1339,81 +1867,15 @@ retry:
retry_private:
hb = queue_lock(&q);
-retry_locked:
- ret = lock_taken = 0;
-
- /*
- * To avoid races, we attempt to take the lock here again
- * (by doing a 0 -> TID atomic cmpxchg), while holding all
- * the locks. It will most likely not succeed.
- */
- newval = task_pid_vnr(current);
-
- curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
-
- if (unlikely(curval == -EFAULT))
- goto uaddr_faulted;
-
- /*
- * Detect deadlocks. In case of REQUEUE_PI this is a valid
- * situation and we return success to user space.
- */
- if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) {
- ret = -EDEADLK;
- goto out_unlock_put_key;
- }
-
- /*
- * Surprise - we got the lock. Just return to userspace:
- */
- if (unlikely(!curval))
- goto out_unlock_put_key;
-
- uval = curval;
-
- /*
- * Set the WAITERS flag, so the owner will know it has someone
- * to wake at next unlock
- */
- newval = curval | FUTEX_WAITERS;
-
- /*
- * There are two cases, where a futex might have no owner (the
- * owner TID is 0): OWNER_DIED. We take over the futex in this
- * case. We also do an unconditional take over, when the owner
- * of the futex died.
- *
- * This is safe as we are protected by the hash bucket lock !
- */
- if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
- /* Keep the OWNER_DIED bit */
- newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(current);
- ownerdied = 0;
- lock_taken = 1;
- }
-
- curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
-
- if (unlikely(curval == -EFAULT))
- goto uaddr_faulted;
- if (unlikely(curval != uval))
- goto retry_locked;
-
- /*
- * We took the lock due to owner died take over.
- */
- if (unlikely(lock_taken))
- goto out_unlock_put_key;
-
- /*
- * We dont have the lock. Look up the PI state (or create it if
- * we are the first waiter):
- */
- ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state);
-
+ ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
if (unlikely(ret)) {
switch (ret) {
-
+ case 1:
+ /* We got the lock. */
+ ret = 0;
+ goto out_unlock_put_key;
+ case -EFAULT:
+ goto uaddr_faulted;
case -EAGAIN:
/*
* Task is exiting and we just wait for the
@@ -1423,25 +1885,6 @@ retry_locked:
put_futex_key(fshared, &q.key);
cond_resched();
goto retry;
-
- case -ESRCH:
- /*
- * No owner found for this futex. Check if the
- * OWNER_DIED bit is set to figure out whether
- * this is a robust futex or not.
- */
- if (get_futex_value_locked(&curval, uaddr))
- goto uaddr_faulted;
-
- /*
- * We simply start over in case of a robust
- * futex. The code above will take the futex
- * and return happy.
- */
- if (curval & FUTEX_OWNER_DIED) {
- ownerdied = 1;
- goto retry_locked;
- }
default:
goto out_unlock_put_key;
}
@@ -1465,71 +1908,21 @@ retry_locked:
}
spin_lock(q.lock_ptr);
-
- if (!ret) {
- /*
- * Got the lock. We might not be the anticipated owner
- * if we did a lock-steal - fix up the PI-state in
- * that case:
- */
- if (q.pi_state->owner != curr)
- ret = fixup_pi_state_owner(uaddr, &q, curr, fshared);
- } else {
- /*
- * Catch the rare case, where the lock was released
- * when we were on the way back before we locked the
- * hash bucket.
- */
- if (q.pi_state->owner == curr) {
- /*
- * Try to get the rt_mutex now. This might
- * fail as some other task acquired the
- * rt_mutex after we removed ourself from the
- * rt_mutex waiters list.
- */
- if (rt_mutex_trylock(&q.pi_state->pi_mutex))
- ret = 0;
- else {
- /*
- * pi_state is incorrect, some other
- * task did a lock steal and we
- * returned due to timeout or signal
- * without taking the rt_mutex. Too
- * late. We can access the
- * rt_mutex_owner without locking, as
- * the other task is now blocked on
- * the hash bucket lock. Fix the state
- * up.
- */
- struct task_struct *owner;
- int res;
-
- owner = rt_mutex_owner(&q.pi_state->pi_mutex);
- res = fixup_pi_state_owner(uaddr, &q, owner,
- fshared);
-
- /* propagate -EFAULT, if the fixup failed */
- if (res)
- ret = res;
- }
- } else {
- /*
- * Paranoia check. If we did not take the lock
- * in the trylock above, then we should not be
- * the owner of the rtmutex, neither the real
- * nor the pending one:
- */
- if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr)
- printk(KERN_ERR "futex_lock_pi: ret = %d "
- "pi-mutex: %p pi-state %p\n", ret,
- q.pi_state->pi_mutex.owner,
- q.pi_state->owner);
- }
- }
+ /*
+ * Fixup the pi_state owner and possibly acquire the lock if we
+ * haven't already.
+ */
+ res = fixup_owner(uaddr, fshared, &q, !ret);
+ /*
+ * If fixup_owner() returned an error, proprogate that. If it acquired
+ * the lock, clear our -ETIMEDOUT or -EINTR.
+ */
+ if (res)
+ ret = (res < 0) ? res : 0;
/*
- * If fixup_pi_state_owner() faulted and was unable to handle the
- * fault, unlock it and return the fault to userspace.
+ * If fixup_owner() faulted and was unable to handle the fault, unlock
+ * it and return the fault to userspace.
*/
if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
rt_mutex_unlock(&q.pi_state->pi_mutex);
@@ -1537,9 +1930,7 @@ retry_locked:
/* Unqueue and drop the lock */
unqueue_me_pi(&q);
- if (to)
- destroy_hrtimer_on_stack(&to->timer);
- return ret != -EINTR ? ret : -ERESTARTNOINTR;
+ goto out;
out_unlock_put_key:
queue_unlock(&q, hb);
@@ -1549,19 +1940,12 @@ out_put_key:
out:
if (to)
destroy_hrtimer_on_stack(&to->timer);
- return ret;
+ return ret != -EINTR ? ret : -ERESTARTNOINTR;
uaddr_faulted:
- /*
- * We have to r/w *(int __user *)uaddr, and we have to modify it
- * atomically. Therefore, if we continue to fault after get_user()
- * below, we need to handle the fault ourselves, while still holding
- * the mmap_sem. This can occur if the uaddr is under contention as
- * we have to drop the mmap_sem in order to call get_user().
- */
queue_unlock(&q, hb);
- ret = get_user(uval, uaddr);
+ ret = fault_in_user_writeable(uaddr);
if (ret)
goto out_put_key;
@@ -1572,7 +1956,6 @@ uaddr_faulted:
goto retry;
}
-
/*
* Userspace attempted a TID -> 0 atomic transition, and failed.
* This is the in-kernel slowpath: we look up the PI state (if any),
@@ -1657,23 +2040,239 @@ out:
return ret;
pi_faulted:
- /*
- * We have to r/w *(int __user *)uaddr, and we have to modify it
- * atomically. Therefore, if we continue to fault after get_user()
- * below, we need to handle the fault ourselves, while still holding
- * the mmap_sem. This can occur if the uaddr is under contention as
- * we have to drop the mmap_sem in order to call get_user().
- */
spin_unlock(&hb->lock);
put_futex_key(fshared, &key);
- ret = get_user(uval, uaddr);
+ ret = fault_in_user_writeable(uaddr);
if (!ret)
goto retry;
return ret;
}
+/**
+ * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex
+ * @hb: the hash_bucket futex_q was original enqueued on
+ * @q: the futex_q woken while waiting to be requeued
+ * @key2: the futex_key of the requeue target futex
+ * @timeout: the timeout associated with the wait (NULL if none)
+ *
+ * Detect if the task was woken on the initial futex as opposed to the requeue
+ * target futex. If so, determine if it was a timeout or a signal that caused
+ * the wakeup and return the appropriate error code to the caller. Must be
+ * called with the hb lock held.
+ *
+ * Returns
+ * 0 - no early wakeup detected
+ * <0 - -ETIMEDOUT or -ERESTARTNOINTR
+ */
+static inline
+int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
+ struct futex_q *q, union futex_key *key2,
+ struct hrtimer_sleeper *timeout)
+{
+ int ret = 0;
+
+ /*
+ * With the hb lock held, we avoid races while we process the wakeup.
+ * We only need to hold hb (and not hb2) to ensure atomicity as the
+ * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
+ * It can't be requeued from uaddr2 to something else since we don't
+ * support a PI aware source futex for requeue.
+ */
+ if (!match_futex(&q->key, key2)) {
+ WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr));
+ /*
+ * We were woken prior to requeue by a timeout or a signal.
+ * Unqueue the futex_q and determine which it was.
+ */
+ plist_del(&q->list, &q->list.plist);
+ drop_futex_key_refs(&q->key);
+
+ if (timeout && !timeout->task)
+ ret = -ETIMEDOUT;
+ else
+ ret = -ERESTARTNOINTR;
+ }
+ return ret;
+}
+
+/**
+ * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
+ * @uaddr: the futex we initialyl wait on (non-pi)
+ * @fshared: whether the futexes are shared (1) or not (0). They must be
+ * the same type, no requeueing from private to shared, etc.
+ * @val: the expected value of uaddr
+ * @abs_time: absolute timeout
+ * @bitset: 32 bit wakeup bitset set by userspace, defaults to all.
+ * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)
+ * @uaddr2: the pi futex we will take prior to returning to user-space
+ *
+ * The caller will wait on uaddr and will be requeued by futex_requeue() to
+ * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and
+ * complete the acquisition of the rt_mutex prior to returning to userspace.
+ * This ensures the rt_mutex maintains an owner when it has waiters; without
+ * one, the pi logic wouldn't know which task to boost/deboost, if there was a
+ * need to.
+ *
+ * We call schedule in futex_wait_queue_me() when we enqueue and return there
+ * via the following:
+ * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
+ * 2) wakeup on uaddr2 after a requeue and subsequent unlock
+ * 3) signal (before or after requeue)
+ * 4) timeout (before or after requeue)
+ *
+ * If 3, we setup a restart_block with futex_wait_requeue_pi() as the function.
+ *
+ * If 2, we may then block on trying to take the rt_mutex and return via:
+ * 5) successful lock
+ * 6) signal
+ * 7) timeout
+ * 8) other lock acquisition failure
+ *
+ * If 6, we setup a restart_block with futex_lock_pi() as the function.
+ *
+ * If 4 or 7, we cleanup and return with -ETIMEDOUT.
+ *
+ * Returns:
+ * 0 - On success
+ * <0 - On error
+ */
+static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
+ u32 val, ktime_t *abs_time, u32 bitset,
+ int clockrt, u32 __user *uaddr2)
+{
+ struct hrtimer_sleeper timeout, *to = NULL;
+ struct rt_mutex_waiter rt_waiter;
+ struct rt_mutex *pi_mutex = NULL;
+ struct futex_hash_bucket *hb;
+ union futex_key key2;
+ struct futex_q q;
+ int res, ret;
+
+ if (!bitset)
+ return -EINVAL;
+
+ if (abs_time) {
+ to = &timeout;
+ hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
+ CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ hrtimer_init_sleeper(to, current);
+ hrtimer_set_expires_range_ns(&to->timer, *abs_time,
+ current->timer_slack_ns);
+ }
+
+ /*
+ * The waiter is allocated on our stack, manipulated by the requeue
+ * code while we sleep on uaddr.
+ */
+ debug_rt_mutex_init_waiter(&rt_waiter);
+ rt_waiter.task = NULL;
+
+ q.pi_state = NULL;
+ q.bitset = bitset;
+ q.rt_waiter = &rt_waiter;
+
+ key2 = FUTEX_KEY_INIT;
+ ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
+ if (unlikely(ret != 0))
+ goto out;
+
+ /* Prepare to wait on uaddr. */
+ ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
+ if (ret)
+ goto out_key2;
+
+ /* Queue the futex_q, drop the hb lock, wait for wakeup. */
+ futex_wait_queue_me(hb, &q, to);
+
+ spin_lock(&hb->lock);
+ ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
+ spin_unlock(&hb->lock);
+ if (ret)
+ goto out_put_keys;
+
+ /*
+ * In order for us to be here, we know our q.key == key2, and since
+ * we took the hb->lock above, we also know that futex_requeue() has
+ * completed and we no longer have to concern ourselves with a wakeup
+ * race with the atomic proxy lock acquition by the requeue code.
+ */
+
+ /* Check if the requeue code acquired the second futex for us. */
+ if (!q.rt_waiter) {
+ /*
+ * Got the lock. We might not be the anticipated owner if we
+ * did a lock-steal - fix up the PI-state in that case.
+ */
+ if (q.pi_state && (q.pi_state->owner != current)) {
+ spin_lock(q.lock_ptr);
+ ret = fixup_pi_state_owner(uaddr2, &q, current,
+ fshared);
+ spin_unlock(q.lock_ptr);
+ }
+ } else {
+ /*
+ * We have been woken up by futex_unlock_pi(), a timeout, or a
+ * signal. futex_unlock_pi() will not destroy the lock_ptr nor
+ * the pi_state.
+ */
+ WARN_ON(!&q.pi_state);
+ pi_mutex = &q.pi_state->pi_mutex;
+ ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
+ debug_rt_mutex_free_waiter(&rt_waiter);
+
+ spin_lock(q.lock_ptr);
+ /*
+ * Fixup the pi_state owner and possibly acquire the lock if we
+ * haven't already.
+ */
+ res = fixup_owner(uaddr2, fshared, &q, !ret);
+ /*
+ * If fixup_owner() returned an error, proprogate that. If it
+ * acquired the lock, clear our -ETIMEDOUT or -EINTR.
+ */
+ if (res)
+ ret = (res < 0) ? res : 0;
+
+ /* Unqueue and drop the lock. */
+ unqueue_me_pi(&q);
+ }
+
+ /*
+ * If fixup_pi_state_owner() faulted and was unable to handle the
+ * fault, unlock the rt_mutex and return the fault to userspace.
+ */
+ if (ret == -EFAULT) {
+ if (rt_mutex_owner(pi_mutex) == current)
+ rt_mutex_unlock(pi_mutex);
+ } else if (ret == -EINTR) {
+ /*
+ * We've already been requeued, but we have no way to
+ * restart by calling futex_lock_pi() directly. We
+ * could restart the syscall, but that will look at
+ * the user space value and return right away. So we
+ * drop back with EWOULDBLOCK to tell user space that
+ * "val" has been changed. That's the same what the
+ * restart of the syscall would do in
+ * futex_wait_setup().
+ */
+ ret = -EWOULDBLOCK;
+ }
+
+out_put_keys:
+ put_futex_key(fshared, &q.key);
+out_key2:
+ put_futex_key(fshared, &key2);
+
+out:
+ if (to) {
+ hrtimer_cancel(&to->timer);
+ destroy_hrtimer_on_stack(&to->timer);
+ }
+ return ret;
+}
+
/*
* Support for robust futexes: the kernel cleans up held futexes at
* thread exit time.
@@ -1896,7 +2495,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
fshared = 1;
clockrt = op & FUTEX_CLOCK_REALTIME;
- if (clockrt && cmd != FUTEX_WAIT_BITSET)
+ if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
return -ENOSYS;
switch (cmd) {
@@ -1911,10 +2510,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
ret = futex_wake(uaddr, fshared, val, val3);
break;
case FUTEX_REQUEUE:
- ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL);
+ ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0);
break;
case FUTEX_CMP_REQUEUE:
- ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3);
+ ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
+ 0);
break;
case FUTEX_WAKE_OP:
ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);
@@ -1931,6 +2531,15 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
if (futex_cmpxchg_enabled)
ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1);
break;
+ case FUTEX_WAIT_REQUEUE_PI:
+ val3 = FUTEX_BITSET_MATCH_ANY;
+ ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3,
+ clockrt, uaddr2);
+ break;
+ case FUTEX_CMP_REQUEUE_PI:
+ ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
+ 1);
+ break;
default:
ret = -ENOSYS;
}
@@ -1948,7 +2557,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
int cmd = op & FUTEX_CMD_MASK;
if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
- cmd == FUTEX_WAIT_BITSET)) {
+ cmd == FUTEX_WAIT_BITSET ||
+ cmd == FUTEX_WAIT_REQUEUE_PI)) {
if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
return -EFAULT;
if (!timespec_valid(&ts))
@@ -1960,11 +2570,11 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
tp = &t;
}
/*
- * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE.
+ * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*.
* number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
*/
if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
- cmd == FUTEX_WAKE_OP)
+ cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
val2 = (u32) (unsigned long) utime;
return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index d607a5b9ee2..235716556bf 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -180,7 +180,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
int cmd = op & FUTEX_CMD_MASK;
if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
- cmd == FUTEX_WAIT_BITSET)) {
+ cmd == FUTEX_WAIT_BITSET ||
+ cmd == FUTEX_WAIT_REQUEUE_PI)) {
if (get_compat_timespec(&ts, utime))
return -EFAULT;
if (!timespec_valid(&ts))
@@ -191,7 +192,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
t = ktime_add_safe(ktime_get(), t);
tp = &t;
}
- if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE)
+ if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
+ cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
val2 = (int) (unsigned long) utime;
return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
new file mode 100644
index 00000000000..22e9dcfaa3d
--- /dev/null
+++ b/kernel/gcov/Kconfig
@@ -0,0 +1,48 @@
+menu "GCOV-based kernel profiling"
+
+config GCOV_KERNEL
+ bool "Enable gcov-based kernel profiling"
+ depends on DEBUG_FS && CONSTRUCTORS
+ default n
+ ---help---
+ This option enables gcov-based code profiling (e.g. for code coverage
+ measurements).
+
+ If unsure, say N.
+
+ Additionally specify CONFIG_GCOV_PROFILE_ALL=y to get profiling data
+ for the entire kernel. To enable profiling for specific files or
+ directories, add a line similar to the following to the respective
+ Makefile:
+
+ For a single file (e.g. main.o):
+ GCOV_PROFILE_main.o := y
+
+ For all files in one directory:
+ GCOV_PROFILE := y
+
+ To exclude files from being profiled even when CONFIG_GCOV_PROFILE_ALL
+ is specified, use:
+
+ GCOV_PROFILE_main.o := n
+ and:
+ GCOV_PROFILE := n
+
+ Note that the debugfs filesystem has to be mounted to access
+ profiling data.
+
+config GCOV_PROFILE_ALL
+ bool "Profile entire Kernel"
+ depends on GCOV_KERNEL
+ depends on S390 || X86
+ default n
+ ---help---
+ This options activates profiling for the entire kernel.
+
+ If unsure, say N.
+
+ Note that a kernel compiled with profiling flags will be significantly
+ larger and run slower. Also be sure to exclude files from profiling
+ which are not linked to the kernel image to prevent linker errors.
+
+endmenu
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
new file mode 100644
index 00000000000..3f761001d51
--- /dev/null
+++ b/kernel/gcov/Makefile
@@ -0,0 +1,3 @@
+EXTRA_CFLAGS := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
+
+obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
new file mode 100644
index 00000000000..9b22d03cc58
--- /dev/null
+++ b/kernel/gcov/base.c
@@ -0,0 +1,148 @@
+/*
+ * This code maintains a list of active profiling data structures.
+ *
+ * Copyright IBM Corp. 2009
+ * Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
+ *
+ * Uses gcc-internal data definitions.
+ * Based on the gcov-kernel patch by:
+ * Hubertus Franke <frankeh@us.ibm.com>
+ * Nigel Hinds <nhinds@us.ibm.com>
+ * Rajan Ravindran <rajancr@us.ibm.com>
+ * Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
+ * Paul Larson
+ */
+
+#define pr_fmt(fmt) "gcov: " fmt
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include "gcov.h"
+
+static struct gcov_info *gcov_info_head;
+static int gcov_events_enabled;
+static DEFINE_MUTEX(gcov_lock);
+
+/*
+ * __gcov_init is called by gcc-generated constructor code for each object
+ * file compiled with -fprofile-arcs.
+ */
+void __gcov_init(struct gcov_info *info)
+{
+ static unsigned int gcov_version;
+
+ mutex_lock(&gcov_lock);
+ if (gcov_version == 0) {
+ gcov_version = info->version;
+ /*
+ * Printing gcc's version magic may prove useful for debugging
+ * incompatibility reports.
+ */
+ pr_info("version magic: 0x%x\n", gcov_version);
+ }
+ /*
+ * Add new profiling data structure to list and inform event
+ * listener.
+ */
+ info->next = gcov_info_head;
+ gcov_info_head = info;
+ if (gcov_events_enabled)
+ gcov_event(GCOV_ADD, info);
+ mutex_unlock(&gcov_lock);
+}
+EXPORT_SYMBOL(__gcov_init);
+
+/*
+ * These functions may be referenced by gcc-generated profiling code but serve
+ * no function for kernel profiling.
+ */
+void __gcov_flush(void)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_flush);
+
+void __gcov_merge_add(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_add);
+
+void __gcov_merge_single(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_single);
+
+void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_delta);
+
+/**
+ * gcov_enable_events - enable event reporting through gcov_event()
+ *
+ * Turn on reporting of profiling data load/unload-events through the
+ * gcov_event() callback. Also replay all previous events once. This function
+ * is needed because some events are potentially generated too early for the
+ * callback implementation to handle them initially.
+ */
+void gcov_enable_events(void)
+{
+ struct gcov_info *info;
+
+ mutex_lock(&gcov_lock);
+ gcov_events_enabled = 1;
+ /* Perform event callback for previously registered entries. */
+ for (info = gcov_info_head; info; info = info->next)
+ gcov_event(GCOV_ADD, info);
+ mutex_unlock(&gcov_lock);
+}
+
+#ifdef CONFIG_MODULES
+static inline int within(void *addr, void *start, unsigned long size)
+{
+ return ((addr >= start) && (addr < start + size));
+}
+
+/* Update list and generate events when modules are unloaded. */
+static int gcov_module_notifier(struct notifier_block *nb, unsigned long event,
+ void *data)
+{
+ struct module *mod = data;
+ struct gcov_info *info;
+ struct gcov_info *prev;
+
+ if (event != MODULE_STATE_GOING)
+ return NOTIFY_OK;
+ mutex_lock(&gcov_lock);
+ prev = NULL;
+ /* Remove entries located in module from linked list. */
+ for (info = gcov_info_head; info; info = info->next) {
+ if (within(info, mod->module_core, mod->core_size)) {
+ if (prev)
+ prev->next = info->next;
+ else
+ gcov_info_head = info->next;
+ if (gcov_events_enabled)
+ gcov_event(GCOV_REMOVE, info);
+ } else
+ prev = info;
+ }
+ mutex_unlock(&gcov_lock);
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block gcov_nb = {
+ .notifier_call = gcov_module_notifier,
+};
+
+static int __init gcov_init(void)
+{
+ return register_module_notifier(&gcov_nb);
+}
+device_initcall(gcov_init);
+#endif /* CONFIG_MODULES */
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
new file mode 100644
index 00000000000..ef3c3f88a7a
--- /dev/null
+++ b/kernel/gcov/fs.c
@@ -0,0 +1,673 @@
+/*
+ * This code exports profiling data as debugfs files to userspace.
+ *
+ * Copyright IBM Corp. 2009
+ * Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
+ *
+ * Uses gcc-internal data definitions.
+ * Based on the gcov-kernel patch by:
+ * Hubertus Franke <frankeh@us.ibm.com>
+ * Nigel Hinds <nhinds@us.ibm.com>
+ * Rajan Ravindran <rajancr@us.ibm.com>
+ * Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
+ * Paul Larson
+ * Yi CDL Yang
+ */
+
+#define pr_fmt(fmt) "gcov: " fmt
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/fs.h>
+#include <linux/list.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/seq_file.h>
+#include "gcov.h"
+
+/**
+ * struct gcov_node - represents a debugfs entry
+ * @list: list head for child node list
+ * @children: child nodes
+ * @all: list head for list of all nodes
+ * @parent: parent node
+ * @info: associated profiling data structure if not a directory
+ * @ghost: when an object file containing profiling data is unloaded we keep a
+ * copy of the profiling data here to allow collecting coverage data
+ * for cleanup code. Such a node is called a "ghost".
+ * @dentry: main debugfs entry, either a directory or data file
+ * @links: associated symbolic links
+ * @name: data file basename
+ *
+ * struct gcov_node represents an entity within the gcov/ subdirectory
+ * of debugfs. There are directory and data file nodes. The latter represent
+ * the actual synthesized data file plus any associated symbolic links which
+ * are needed by the gcov tool to work correctly.
+ */
+struct gcov_node {
+ struct list_head list;
+ struct list_head children;
+ struct list_head all;
+ struct gcov_node *parent;
+ struct gcov_info *info;
+ struct gcov_info *ghost;
+ struct dentry *dentry;
+ struct dentry **links;
+ char name[0];
+};
+
+static const char objtree[] = OBJTREE;
+static const char srctree[] = SRCTREE;
+static struct gcov_node root_node;
+static struct dentry *reset_dentry;
+static LIST_HEAD(all_head);
+static DEFINE_MUTEX(node_lock);
+
+/* If non-zero, keep copies of profiling data for unloaded modules. */
+static int gcov_persist = 1;
+
+static int __init gcov_persist_setup(char *str)
+{
+ unsigned long val;
+
+ if (strict_strtoul(str, 0, &val)) {
+ pr_warning("invalid gcov_persist parameter '%s'\n", str);
+ return 0;
+ }
+ gcov_persist = val;
+ pr_info("setting gcov_persist to %d\n", gcov_persist);
+
+ return 1;
+}
+__setup("gcov_persist=", gcov_persist_setup);
+
+/*
+ * seq_file.start() implementation for gcov data files. Note that the
+ * gcov_iterator interface is designed to be more restrictive than seq_file
+ * (no start from arbitrary position, etc.), to simplify the iterator
+ * implementation.
+ */
+static void *gcov_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ loff_t i;
+
+ gcov_iter_start(seq->private);
+ for (i = 0; i < *pos; i++) {
+ if (gcov_iter_next(seq->private))
+ return NULL;
+ }
+ return seq->private;
+}
+
+/* seq_file.next() implementation for gcov data files. */
+static void *gcov_seq_next(struct seq_file *seq, void *data, loff_t *pos)
+{
+ struct gcov_iterator *iter = data;
+
+ if (gcov_iter_next(iter))
+ return NULL;
+ (*pos)++;
+
+ return iter;
+}
+
+/* seq_file.show() implementation for gcov data files. */
+static int gcov_seq_show(struct seq_file *seq, void *data)
+{
+ struct gcov_iterator *iter = data;
+
+ if (gcov_iter_write(iter, seq))
+ return -EINVAL;
+ return 0;
+}
+
+static void gcov_seq_stop(struct seq_file *seq, void *data)
+{
+ /* Unused. */
+}
+
+static const struct seq_operations gcov_seq_ops = {
+ .start = gcov_seq_start,
+ .next = gcov_seq_next,
+ .show = gcov_seq_show,
+ .stop = gcov_seq_stop,
+};
+
+/*
+ * Return the profiling data set for a given node. This can either be the
+ * original profiling data structure or a duplicate (also called "ghost")
+ * in case the associated object file has been unloaded.
+ */
+static struct gcov_info *get_node_info(struct gcov_node *node)
+{
+ if (node->info)
+ return node->info;
+
+ return node->ghost;
+}
+
+/*
+ * open() implementation for gcov data files. Create a copy of the profiling
+ * data set and initialize the iterator and seq_file interface.
+ */
+static int gcov_seq_open(struct inode *inode, struct file *file)
+{
+ struct gcov_node *node = inode->i_private;
+ struct gcov_iterator *iter;
+ struct seq_file *seq;
+ struct gcov_info *info;
+ int rc = -ENOMEM;
+
+ mutex_lock(&node_lock);
+ /*
+ * Read from a profiling data copy to minimize reference tracking
+ * complexity and concurrent access.
+ */
+ info = gcov_info_dup(get_node_info(node));
+ if (!info)
+ goto out_unlock;
+ iter = gcov_iter_new(info);
+ if (!iter)
+ goto err_free_info;
+ rc = seq_open(file, &gcov_seq_ops);
+ if (rc)
+ goto err_free_iter_info;
+ seq = file->private_data;
+ seq->private = iter;
+out_unlock:
+ mutex_unlock(&node_lock);
+ return rc;
+
+err_free_iter_info:
+ gcov_iter_free(iter);
+err_free_info:
+ gcov_info_free(info);
+ goto out_unlock;
+}
+
+/*
+ * release() implementation for gcov data files. Release resources allocated
+ * by open().
+ */
+static int gcov_seq_release(struct inode *inode, struct file *file)
+{
+ struct gcov_iterator *iter;
+ struct gcov_info *info;
+ struct seq_file *seq;
+
+ seq = file->private_data;
+ iter = seq->private;
+ info = gcov_iter_get_info(iter);
+ gcov_iter_free(iter);
+ gcov_info_free(info);
+ seq_release(inode, file);
+
+ return 0;
+}
+
+/*
+ * Find a node by the associated data file name. Needs to be called with
+ * node_lock held.
+ */
+static struct gcov_node *get_node_by_name(const char *name)
+{
+ struct gcov_node *node;
+ struct gcov_info *info;
+
+ list_for_each_entry(node, &all_head, all) {
+ info = get_node_info(node);
+ if (info && (strcmp(info->filename, name) == 0))
+ return node;
+ }
+
+ return NULL;
+}
+
+static void remove_node(struct gcov_node *node);
+
+/*
+ * write() implementation for gcov data files. Reset profiling data for the
+ * associated file. If the object file has been unloaded (i.e. this is
+ * a "ghost" node), remove the debug fs node as well.
+ */
+static ssize_t gcov_seq_write(struct file *file, const char __user *addr,
+ size_t len, loff_t *pos)
+{
+ struct seq_file *seq;
+ struct gcov_info *info;
+ struct gcov_node *node;
+
+ seq = file->private_data;
+ info = gcov_iter_get_info(seq->private);
+ mutex_lock(&node_lock);
+ node = get_node_by_name(info->filename);
+ if (node) {
+ /* Reset counts or remove node for unloaded modules. */
+ if (node->ghost)
+ remove_node(node);
+ else
+ gcov_info_reset(node->info);
+ }
+ /* Reset counts for open file. */
+ gcov_info_reset(info);
+ mutex_unlock(&node_lock);
+
+ return len;
+}
+
+/*
+ * Given a string <path> representing a file path of format:
+ * path/to/file.gcda
+ * construct and return a new string:
+ * <dir/>path/to/file.<ext>
+ */
+static char *link_target(const char *dir, const char *path, const char *ext)
+{
+ char *target;
+ char *old_ext;
+ char *copy;
+
+ copy = kstrdup(path, GFP_KERNEL);
+ if (!copy)
+ return NULL;
+ old_ext = strrchr(copy, '.');
+ if (old_ext)
+ *old_ext = '\0';
+ if (dir)
+ target = kasprintf(GFP_KERNEL, "%s/%s.%s", dir, copy, ext);
+ else
+ target = kasprintf(GFP_KERNEL, "%s.%s", copy, ext);
+ kfree(copy);
+
+ return target;
+}
+
+/*
+ * Construct a string representing the symbolic link target for the given
+ * gcov data file name and link type. Depending on the link type and the
+ * location of the data file, the link target can either point to a
+ * subdirectory of srctree, objtree or in an external location.
+ */
+static char *get_link_target(const char *filename, const struct gcov_link *ext)
+{
+ const char *rel;
+ char *result;
+
+ if (strncmp(filename, objtree, strlen(objtree)) == 0) {
+ rel = filename + strlen(objtree) + 1;
+ if (ext->dir == SRC_TREE)
+ result = link_target(srctree, rel, ext->ext);
+ else
+ result = link_target(objtree, rel, ext->ext);
+ } else {
+ /* External compilation. */
+ result = link_target(NULL, filename, ext->ext);
+ }
+
+ return result;
+}
+
+#define SKEW_PREFIX ".tmp_"
+
+/*
+ * For a filename .tmp_filename.ext return filename.ext. Needed to compensate
+ * for filename skewing caused by the mod-versioning mechanism.
+ */
+static const char *deskew(const char *basename)
+{
+ if (strncmp(basename, SKEW_PREFIX, sizeof(SKEW_PREFIX) - 1) == 0)
+ return basename + sizeof(SKEW_PREFIX) - 1;
+ return basename;
+}
+
+/*
+ * Create links to additional files (usually .c and .gcno files) which the
+ * gcov tool expects to find in the same directory as the gcov data file.
+ */
+static void add_links(struct gcov_node *node, struct dentry *parent)
+{
+ char *basename;
+ char *target;
+ int num;
+ int i;
+
+ for (num = 0; gcov_link[num].ext; num++)
+ /* Nothing. */;
+ node->links = kcalloc(num, sizeof(struct dentry *), GFP_KERNEL);
+ if (!node->links)
+ return;
+ for (i = 0; i < num; i++) {
+ target = get_link_target(get_node_info(node)->filename,
+ &gcov_link[i]);
+ if (!target)
+ goto out_err;
+ basename = strrchr(target, '/');
+ if (!basename)
+ goto out_err;
+ basename++;
+ node->links[i] = debugfs_create_symlink(deskew(basename),
+ parent, target);
+ if (!node->links[i])
+ goto out_err;
+ kfree(target);
+ }
+
+ return;
+out_err:
+ kfree(target);
+ while (i-- > 0)
+ debugfs_remove(node->links[i]);
+ kfree(node->links);
+ node->links = NULL;
+}
+
+static const struct file_operations gcov_data_fops = {
+ .open = gcov_seq_open,
+ .release = gcov_seq_release,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .write = gcov_seq_write,
+};
+
+/* Basic initialization of a new node. */
+static void init_node(struct gcov_node *node, struct gcov_info *info,
+ const char *name, struct gcov_node *parent)
+{
+ INIT_LIST_HEAD(&node->list);
+ INIT_LIST_HEAD(&node->children);
+ INIT_LIST_HEAD(&node->all);
+ node->info = info;
+ node->parent = parent;
+ if (name)
+ strcpy(node->name, name);
+}
+
+/*
+ * Create a new node and associated debugfs entry. Needs to be called with
+ * node_lock held.
+ */
+static struct gcov_node *new_node(struct gcov_node *parent,
+ struct gcov_info *info, const char *name)
+{
+ struct gcov_node *node;
+
+ node = kzalloc(sizeof(struct gcov_node) + strlen(name) + 1, GFP_KERNEL);
+ if (!node) {
+ pr_warning("out of memory\n");
+ return NULL;
+ }
+ init_node(node, info, name, parent);
+ /* Differentiate between gcov data file nodes and directory nodes. */
+ if (info) {
+ node->dentry = debugfs_create_file(deskew(node->name), 0600,
+ parent->dentry, node, &gcov_data_fops);
+ } else
+ node->dentry = debugfs_create_dir(node->name, parent->dentry);
+ if (!node->dentry) {
+ pr_warning("could not create file\n");
+ kfree(node);
+ return NULL;
+ }
+ if (info)
+ add_links(node, parent->dentry);
+ list_add(&node->list, &parent->children);
+ list_add(&node->all, &all_head);
+
+ return node;
+}
+
+/* Remove symbolic links associated with node. */
+static void remove_links(struct gcov_node *node)
+{
+ int i;
+
+ if (!node->links)
+ return;
+ for (i = 0; gcov_link[i].ext; i++)
+ debugfs_remove(node->links[i]);
+ kfree(node->links);
+ node->links = NULL;
+}
+
+/*
+ * Remove node from all lists and debugfs and release associated resources.
+ * Needs to be called with node_lock held.
+ */
+static void release_node(struct gcov_node *node)
+{
+ list_del(&node->list);
+ list_del(&node->all);
+ debugfs_remove(node->dentry);
+ remove_links(node);
+ if (node->ghost)
+ gcov_info_free(node->ghost);
+ kfree(node);
+}
+
+/* Release node and empty parents. Needs to be called with node_lock held. */
+static void remove_node(struct gcov_node *node)
+{
+ struct gcov_node *parent;
+
+ while ((node != &root_node) && list_empty(&node->children)) {
+ parent = node->parent;
+ release_node(node);
+ node = parent;
+ }
+}
+
+/*
+ * Find child node with given basename. Needs to be called with node_lock
+ * held.
+ */
+static struct gcov_node *get_child_by_name(struct gcov_node *parent,
+ const char *name)
+{
+ struct gcov_node *node;
+
+ list_for_each_entry(node, &parent->children, list) {
+ if (strcmp(node->name, name) == 0)
+ return node;
+ }
+
+ return NULL;
+}
+
+/*
+ * write() implementation for reset file. Reset all profiling data to zero
+ * and remove ghost nodes.
+ */
+static ssize_t reset_write(struct file *file, const char __user *addr,
+ size_t len, loff_t *pos)
+{
+ struct gcov_node *node;
+
+ mutex_lock(&node_lock);
+restart:
+ list_for_each_entry(node, &all_head, all) {
+ if (node->info)
+ gcov_info_reset(node->info);
+ else if (list_empty(&node->children)) {
+ remove_node(node);
+ /* Several nodes may have gone - restart loop. */
+ goto restart;
+ }
+ }
+ mutex_unlock(&node_lock);
+
+ return len;
+}
+
+/* read() implementation for reset file. Unused. */
+static ssize_t reset_read(struct file *file, char __user *addr, size_t len,
+ loff_t *pos)
+{
+ /* Allow read operation so that a recursive copy won't fail. */
+ return 0;
+}
+
+static const struct file_operations gcov_reset_fops = {
+ .write = reset_write,
+ .read = reset_read,
+};
+
+/*
+ * Create a node for a given profiling data set and add it to all lists and
+ * debugfs. Needs to be called with node_lock held.
+ */
+static void add_node(struct gcov_info *info)
+{
+ char *filename;
+ char *curr;
+ char *next;
+ struct gcov_node *parent;
+ struct gcov_node *node;
+
+ filename = kstrdup(info->filename, GFP_KERNEL);
+ if (!filename)
+ return;
+ parent = &root_node;
+ /* Create directory nodes along the path. */
+ for (curr = filename; (next = strchr(curr, '/')); curr = next + 1) {
+ if (curr == next)
+ continue;
+ *next = 0;
+ if (strcmp(curr, ".") == 0)
+ continue;
+ if (strcmp(curr, "..") == 0) {
+ if (!parent->parent)
+ goto err_remove;
+ parent = parent->parent;
+ continue;
+ }
+ node = get_child_by_name(parent, curr);
+ if (!node) {
+ node = new_node(parent, NULL, curr);
+ if (!node)
+ goto err_remove;
+ }
+ parent = node;
+ }
+ /* Create file node. */
+ node = new_node(parent, info, curr);
+ if (!node)
+ goto err_remove;
+out:
+ kfree(filename);
+ return;
+
+err_remove:
+ remove_node(parent);
+ goto out;
+}
+
+/*
+ * The profiling data set associated with this node is being unloaded. Store a
+ * copy of the profiling data and turn this node into a "ghost".
+ */
+static int ghost_node(struct gcov_node *node)
+{
+ node->ghost = gcov_info_dup(node->info);
+ if (!node->ghost) {
+ pr_warning("could not save data for '%s' (out of memory)\n",
+ node->info->filename);
+ return -ENOMEM;
+ }
+ node->info = NULL;
+
+ return 0;
+}
+
+/*
+ * Profiling data for this node has been loaded again. Add profiling data
+ * from previous instantiation and turn this node into a regular node.
+ */
+static void revive_node(struct gcov_node *node, struct gcov_info *info)
+{
+ if (gcov_info_is_compatible(node->ghost, info))
+ gcov_info_add(info, node->ghost);
+ else {
+ pr_warning("discarding saved data for '%s' (version changed)\n",
+ info->filename);
+ }
+ gcov_info_free(node->ghost);
+ node->ghost = NULL;
+ node->info = info;
+}
+
+/*
+ * Callback to create/remove profiling files when code compiled with
+ * -fprofile-arcs is loaded/unloaded.
+ */
+void gcov_event(enum gcov_action action, struct gcov_info *info)
+{
+ struct gcov_node *node;
+
+ mutex_lock(&node_lock);
+ node = get_node_by_name(info->filename);
+ switch (action) {
+ case GCOV_ADD:
+ /* Add new node or revive ghost. */
+ if (!node) {
+ add_node(info);
+ break;
+ }
+ if (gcov_persist)
+ revive_node(node, info);
+ else {
+ pr_warning("could not add '%s' (already exists)\n",
+ info->filename);
+ }
+ break;
+ case GCOV_REMOVE:
+ /* Remove node or turn into ghost. */
+ if (!node) {
+ pr_warning("could not remove '%s' (not found)\n",
+ info->filename);
+ break;
+ }
+ if (gcov_persist) {
+ if (!ghost_node(node))
+ break;
+ }
+ remove_node(node);
+ break;
+ }
+ mutex_unlock(&node_lock);
+}
+
+/* Create debugfs entries. */
+static __init int gcov_fs_init(void)
+{
+ int rc = -EIO;
+
+ init_node(&root_node, NULL, NULL, NULL);
+ /*
+ * /sys/kernel/debug/gcov will be parent for the reset control file
+ * and all profiling files.
+ */
+ root_node.dentry = debugfs_create_dir("gcov", NULL);
+ if (!root_node.dentry)
+ goto err_remove;
+ /*
+ * Create reset file which resets all profiling counts when written
+ * to.
+ */
+ reset_dentry = debugfs_create_file("reset", 0600, root_node.dentry,
+ NULL, &gcov_reset_fops);
+ if (!reset_dentry)
+ goto err_remove;
+ /* Replay previous events to get our fs hierarchy up-to-date. */
+ gcov_enable_events();
+ return 0;
+
+err_remove:
+ pr_err("init failed\n");
+ if (root_node.dentry)
+ debugfs_remove(root_node.dentry);
+
+ return rc;
+}
+device_initcall(gcov_fs_init);
diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c
new file mode 100644
index 00000000000..ae5bb426003
--- /dev/null
+++ b/kernel/gcov/gcc_3_4.c
@@ -0,0 +1,447 @@
+/*
+ * This code provides functions to handle gcc's profiling data format
+ * introduced with gcc 3.4. Future versions of gcc may change the gcov
+ * format (as happened before), so all format-specific information needs
+ * to be kept modular and easily exchangeable.
+ *
+ * This file is based on gcc-internal definitions. Functions and data
+ * structures are defined to be compatible with gcc counterparts.
+ * For a better understanding, refer to gcc source: gcc/gcov-io.h.
+ *
+ * Copyright IBM Corp. 2009
+ * Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
+ *
+ * Uses gcc-internal data definitions.
+ */
+
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/seq_file.h>
+#include <linux/vmalloc.h>
+#include "gcov.h"
+
+/* Symbolic links to be created for each profiling data file. */
+const struct gcov_link gcov_link[] = {
+ { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
+ { 0, NULL},
+};
+
+/*
+ * Determine whether a counter is active. Based on gcc magic. Doesn't change
+ * at run-time.
+ */
+static int counter_active(struct gcov_info *info, unsigned int type)
+{
+ return (1 << type) & info->ctr_mask;
+}
+
+/* Determine number of active counters. Based on gcc magic. */
+static unsigned int num_counter_active(struct gcov_info *info)
+{
+ unsigned int i;
+ unsigned int result = 0;
+
+ for (i = 0; i < GCOV_COUNTERS; i++) {
+ if (counter_active(info, i))
+ result++;
+ }
+ return result;
+}
+
+/**
+ * gcov_info_reset - reset profiling data to zero
+ * @info: profiling data set
+ */
+void gcov_info_reset(struct gcov_info *info)
+{
+ unsigned int active = num_counter_active(info);
+ unsigned int i;
+
+ for (i = 0; i < active; i++) {
+ memset(info->counts[i].values, 0,
+ info->counts[i].num * sizeof(gcov_type));
+ }
+}
+
+/**
+ * gcov_info_is_compatible - check if profiling data can be added
+ * @info1: first profiling data set
+ * @info2: second profiling data set
+ *
+ * Returns non-zero if profiling data can be added, zero otherwise.
+ */
+int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
+{
+ return (info1->stamp == info2->stamp);
+}
+
+/**
+ * gcov_info_add - add up profiling data
+ * @dest: profiling data set to which data is added
+ * @source: profiling data set which is added
+ *
+ * Adds profiling counts of @source to @dest.
+ */
+void gcov_info_add(struct gcov_info *dest, struct gcov_info *source)
+{
+ unsigned int i;
+ unsigned int j;
+
+ for (i = 0; i < num_counter_active(dest); i++) {
+ for (j = 0; j < dest->counts[i].num; j++) {
+ dest->counts[i].values[j] +=
+ source->counts[i].values[j];
+ }
+ }
+}
+
+/* Get size of function info entry. Based on gcc magic. */
+static size_t get_fn_size(struct gcov_info *info)
+{
+ size_t size;
+
+ size = sizeof(struct gcov_fn_info) + num_counter_active(info) *
+ sizeof(unsigned int);
+ if (__alignof__(struct gcov_fn_info) > sizeof(unsigned int))
+ size = ALIGN(size, __alignof__(struct gcov_fn_info));
+ return size;
+}
+
+/* Get address of function info entry. Based on gcc magic. */
+static struct gcov_fn_info *get_fn_info(struct gcov_info *info, unsigned int fn)
+{
+ return (struct gcov_fn_info *)
+ ((char *) info->functions + fn * get_fn_size(info));
+}
+
+/**
+ * gcov_info_dup - duplicate profiling data set
+ * @info: profiling data set to duplicate
+ *
+ * Return newly allocated duplicate on success, %NULL on error.
+ */
+struct gcov_info *gcov_info_dup(struct gcov_info *info)
+{
+ struct gcov_info *dup;
+ unsigned int i;
+ unsigned int active;
+
+ /* Duplicate gcov_info. */
+ active = num_counter_active(info);
+ dup = kzalloc(sizeof(struct gcov_info) +
+ sizeof(struct gcov_ctr_info) * active, GFP_KERNEL);
+ if (!dup)
+ return NULL;
+ dup->version = info->version;
+ dup->stamp = info->stamp;
+ dup->n_functions = info->n_functions;
+ dup->ctr_mask = info->ctr_mask;
+ /* Duplicate filename. */
+ dup->filename = kstrdup(info->filename, GFP_KERNEL);
+ if (!dup->filename)
+ goto err_free;
+ /* Duplicate table of functions. */
+ dup->functions = kmemdup(info->functions, info->n_functions *
+ get_fn_size(info), GFP_KERNEL);
+ if (!dup->functions)
+ goto err_free;
+ /* Duplicate counter arrays. */
+ for (i = 0; i < active ; i++) {
+ struct gcov_ctr_info *ctr = &info->counts[i];
+ size_t size = ctr->num * sizeof(gcov_type);
+
+ dup->counts[i].num = ctr->num;
+ dup->counts[i].merge = ctr->merge;
+ dup->counts[i].values = vmalloc(size);
+ if (!dup->counts[i].values)
+ goto err_free;
+ memcpy(dup->counts[i].values, ctr->values, size);
+ }
+ return dup;
+
+err_free:
+ gcov_info_free(dup);
+ return NULL;
+}
+
+/**
+ * gcov_info_free - release memory for profiling data set duplicate
+ * @info: profiling data set duplicate to free
+ */
+void gcov_info_free(struct gcov_info *info)
+{
+ unsigned int active = num_counter_active(info);
+ unsigned int i;
+
+ for (i = 0; i < active ; i++)
+ vfree(info->counts[i].values);
+ kfree(info->functions);
+ kfree(info->filename);
+ kfree(info);
+}
+
+/**
+ * struct type_info - iterator helper array
+ * @ctr_type: counter type
+ * @offset: index of the first value of the current function for this type
+ *
+ * This array is needed to convert the in-memory data format into the in-file
+ * data format:
+ *
+ * In-memory:
+ * for each counter type
+ * for each function
+ * values
+ *
+ * In-file:
+ * for each function
+ * for each counter type
+ * values
+ *
+ * See gcc source gcc/gcov-io.h for more information on data organization.
+ */
+struct type_info {
+ int ctr_type;
+ unsigned int offset;
+};
+
+/**
+ * struct gcov_iterator - specifies current file position in logical records
+ * @info: associated profiling data
+ * @record: record type
+ * @function: function number
+ * @type: counter type
+ * @count: index into values array
+ * @num_types: number of counter types
+ * @type_info: helper array to get values-array offset for current function
+ */
+struct gcov_iterator {
+ struct gcov_info *info;
+
+ int record;
+ unsigned int function;
+ unsigned int type;
+ unsigned int count;
+
+ int num_types;
+ struct type_info type_info[0];
+};
+
+static struct gcov_fn_info *get_func(struct gcov_iterator *iter)
+{
+ return get_fn_info(iter->info, iter->function);
+}
+
+static struct type_info *get_type(struct gcov_iterator *iter)
+{
+ return &iter->type_info[iter->type];
+}
+
+/**
+ * gcov_iter_new - allocate and initialize profiling data iterator
+ * @info: profiling data set to be iterated
+ *
+ * Return file iterator on success, %NULL otherwise.
+ */
+struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
+{
+ struct gcov_iterator *iter;
+
+ iter = kzalloc(sizeof(struct gcov_iterator) +
+ num_counter_active(info) * sizeof(struct type_info),
+ GFP_KERNEL);
+ if (iter)
+ iter->info = info;
+
+ return iter;
+}
+
+/**
+ * gcov_iter_free - release memory for iterator
+ * @iter: file iterator to free
+ */
+void gcov_iter_free(struct gcov_iterator *iter)
+{
+ kfree(iter);
+}
+
+/**
+ * gcov_iter_get_info - return profiling data set for given file iterator
+ * @iter: file iterator
+ */
+struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
+{
+ return iter->info;
+}
+
+/**
+ * gcov_iter_start - reset file iterator to starting position
+ * @iter: file iterator
+ */
+void gcov_iter_start(struct gcov_iterator *iter)
+{
+ int i;
+
+ iter->record = 0;
+ iter->function = 0;
+ iter->type = 0;
+ iter->count = 0;
+ iter->num_types = 0;
+ for (i = 0; i < GCOV_COUNTERS; i++) {
+ if (counter_active(iter->info, i)) {
+ iter->type_info[iter->num_types].ctr_type = i;
+ iter->type_info[iter->num_types++].offset = 0;
+ }
+ }
+}
+
+/* Mapping of logical record number to actual file content. */
+#define RECORD_FILE_MAGIC 0
+#define RECORD_GCOV_VERSION 1
+#define RECORD_TIME_STAMP 2
+#define RECORD_FUNCTION_TAG 3
+#define RECORD_FUNCTON_TAG_LEN 4
+#define RECORD_FUNCTION_IDENT 5
+#define RECORD_FUNCTION_CHECK 6
+#define RECORD_COUNT_TAG 7
+#define RECORD_COUNT_LEN 8
+#define RECORD_COUNT 9
+
+/**
+ * gcov_iter_next - advance file iterator to next logical record
+ * @iter: file iterator
+ *
+ * Return zero if new position is valid, non-zero if iterator has reached end.
+ */
+int gcov_iter_next(struct gcov_iterator *iter)
+{
+ switch (iter->record) {
+ case RECORD_FILE_MAGIC:
+ case RECORD_GCOV_VERSION:
+ case RECORD_FUNCTION_TAG:
+ case RECORD_FUNCTON_TAG_LEN:
+ case RECORD_FUNCTION_IDENT:
+ case RECORD_COUNT_TAG:
+ /* Advance to next record */
+ iter->record++;
+ break;
+ case RECORD_COUNT:
+ /* Advance to next count */
+ iter->count++;
+ /* fall through */
+ case RECORD_COUNT_LEN:
+ if (iter->count < get_func(iter)->n_ctrs[iter->type]) {
+ iter->record = 9;
+ break;
+ }
+ /* Advance to next counter type */
+ get_type(iter)->offset += iter->count;
+ iter->count = 0;
+ iter->type++;
+ /* fall through */
+ case RECORD_FUNCTION_CHECK:
+ if (iter->type < iter->num_types) {
+ iter->record = 7;
+ break;
+ }
+ /* Advance to next function */
+ iter->type = 0;
+ iter->function++;
+ /* fall through */
+ case RECORD_TIME_STAMP:
+ if (iter->function < iter->info->n_functions)
+ iter->record = 3;
+ else
+ iter->record = -1;
+ break;
+ }
+ /* Check for EOF. */
+ if (iter->record == -1)
+ return -EINVAL;
+ else
+ return 0;
+}
+
+/**
+ * seq_write_gcov_u32 - write 32 bit number in gcov format to seq_file
+ * @seq: seq_file handle
+ * @v: value to be stored
+ *
+ * Number format defined by gcc: numbers are recorded in the 32 bit
+ * unsigned binary form of the endianness of the machine generating the
+ * file.
+ */
+static int seq_write_gcov_u32(struct seq_file *seq, u32 v)
+{
+ return seq_write(seq, &v, sizeof(v));
+}
+
+/**
+ * seq_write_gcov_u64 - write 64 bit number in gcov format to seq_file
+ * @seq: seq_file handle
+ * @v: value to be stored
+ *
+ * Number format defined by gcc: numbers are recorded in the 32 bit
+ * unsigned binary form of the endianness of the machine generating the
+ * file. 64 bit numbers are stored as two 32 bit numbers, the low part
+ * first.
+ */
+static int seq_write_gcov_u64(struct seq_file *seq, u64 v)
+{
+ u32 data[2];
+
+ data[0] = (v & 0xffffffffUL);
+ data[1] = (v >> 32);
+ return seq_write(seq, data, sizeof(data));
+}
+
+/**
+ * gcov_iter_write - write data for current pos to seq_file
+ * @iter: file iterator
+ * @seq: seq_file handle
+ *
+ * Return zero on success, non-zero otherwise.
+ */
+int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
+{
+ int rc = -EINVAL;
+
+ switch (iter->record) {
+ case RECORD_FILE_MAGIC:
+ rc = seq_write_gcov_u32(seq, GCOV_DATA_MAGIC);
+ break;
+ case RECORD_GCOV_VERSION:
+ rc = seq_write_gcov_u32(seq, iter->info->version);
+ break;
+ case RECORD_TIME_STAMP:
+ rc = seq_write_gcov_u32(seq, iter->info->stamp);
+ break;
+ case RECORD_FUNCTION_TAG:
+ rc = seq_write_gcov_u32(seq, GCOV_TAG_FUNCTION);
+ break;
+ case RECORD_FUNCTON_TAG_LEN:
+ rc = seq_write_gcov_u32(seq, 2);
+ break;
+ case RECORD_FUNCTION_IDENT:
+ rc = seq_write_gcov_u32(seq, get_func(iter)->ident);
+ break;
+ case RECORD_FUNCTION_CHECK:
+ rc = seq_write_gcov_u32(seq, get_func(iter)->checksum);
+ break;
+ case RECORD_COUNT_TAG:
+ rc = seq_write_gcov_u32(seq,
+ GCOV_TAG_FOR_COUNTER(get_type(iter)->ctr_type));
+ break;
+ case RECORD_COUNT_LEN:
+ rc = seq_write_gcov_u32(seq,
+ get_func(iter)->n_ctrs[iter->type] * 2);
+ break;
+ case RECORD_COUNT:
+ rc = seq_write_gcov_u64(seq,
+ iter->info->counts[iter->type].
+ values[iter->count + get_type(iter)->offset]);
+ break;
+ }
+ return rc;
+}
diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h
new file mode 100644
index 00000000000..060073ebf7a
--- /dev/null
+++ b/kernel/gcov/gcov.h
@@ -0,0 +1,128 @@
+/*
+ * Profiling infrastructure declarations.
+ *
+ * This file is based on gcc-internal definitions. Data structures are
+ * defined to be compatible with gcc counterparts. For a better
+ * understanding, refer to gcc source: gcc/gcov-io.h.
+ *
+ * Copyright IBM Corp. 2009
+ * Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
+ *
+ * Uses gcc-internal data definitions.
+ */
+
+#ifndef GCOV_H
+#define GCOV_H GCOV_H
+
+#include <linux/types.h>
+
+/*
+ * Profiling data types used for gcc 3.4 and above - these are defined by
+ * gcc and need to be kept as close to the original definition as possible to
+ * remain compatible.
+ */
+#define GCOV_COUNTERS 5
+#define GCOV_DATA_MAGIC ((unsigned int) 0x67636461)
+#define GCOV_TAG_FUNCTION ((unsigned int) 0x01000000)
+#define GCOV_TAG_COUNTER_BASE ((unsigned int) 0x01a10000)
+#define GCOV_TAG_FOR_COUNTER(count) \
+ (GCOV_TAG_COUNTER_BASE + ((unsigned int) (count) << 17))
+
+#if BITS_PER_LONG >= 64
+typedef long gcov_type;
+#else
+typedef long long gcov_type;
+#endif
+
+/**
+ * struct gcov_fn_info - profiling meta data per function
+ * @ident: object file-unique function identifier
+ * @checksum: function checksum
+ * @n_ctrs: number of values per counter type belonging to this function
+ *
+ * This data is generated by gcc during compilation and doesn't change
+ * at run-time.
+ */
+struct gcov_fn_info {
+ unsigned int ident;
+ unsigned int checksum;
+ unsigned int n_ctrs[0];
+};
+
+/**
+ * struct gcov_ctr_info - profiling data per counter type
+ * @num: number of counter values for this type
+ * @values: array of counter values for this type
+ * @merge: merge function for counter values of this type (unused)
+ *
+ * This data is generated by gcc during compilation and doesn't change
+ * at run-time with the exception of the values array.
+ */
+struct gcov_ctr_info {
+ unsigned int num;
+ gcov_type *values;
+ void (*merge)(gcov_type *, unsigned int);
+};
+
+/**
+ * struct gcov_info - profiling data per object file
+ * @version: gcov version magic indicating the gcc version used for compilation
+ * @next: list head for a singly-linked list
+ * @stamp: time stamp
+ * @filename: name of the associated gcov data file
+ * @n_functions: number of instrumented functions
+ * @functions: function data
+ * @ctr_mask: mask specifying which counter types are active
+ * @counts: counter data per counter type
+ *
+ * This data is generated by gcc during compilation and doesn't change
+ * at run-time with the exception of the next pointer.
+ */
+struct gcov_info {
+ unsigned int version;
+ struct gcov_info *next;
+ unsigned int stamp;
+ const char *filename;
+ unsigned int n_functions;
+ const struct gcov_fn_info *functions;
+ unsigned int ctr_mask;
+ struct gcov_ctr_info counts[0];
+};
+
+/* Base interface. */
+enum gcov_action {
+ GCOV_ADD,
+ GCOV_REMOVE,
+};
+
+void gcov_event(enum gcov_action action, struct gcov_info *info);
+void gcov_enable_events(void);
+
+/* Iterator control. */
+struct seq_file;
+struct gcov_iterator;
+
+struct gcov_iterator *gcov_iter_new(struct gcov_info *info);
+void gcov_iter_free(struct gcov_iterator *iter);
+void gcov_iter_start(struct gcov_iterator *iter);
+int gcov_iter_next(struct gcov_iterator *iter);
+int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq);
+struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter);
+
+/* gcov_info control. */
+void gcov_info_reset(struct gcov_info *info);
+int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2);
+void gcov_info_add(struct gcov_info *dest, struct gcov_info *source);
+struct gcov_info *gcov_info_dup(struct gcov_info *info);
+void gcov_info_free(struct gcov_info *info);
+
+struct gcov_link {
+ enum {
+ OBJ_TREE,
+ SRC_TREE,
+ } dir;
+ const char *ext;
+};
+extern const struct gcov_link gcov_link[];
+
+#endif /* GCOV_H */
diff --git a/kernel/groups.c b/kernel/groups.c
new file mode 100644
index 00000000000..2b45b2ee396
--- /dev/null
+++ b/kernel/groups.c
@@ -0,0 +1,288 @@
+/*
+ * Supplementary group IDs
+ */
+#include <linux/cred.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <asm/uaccess.h>
+
+/* init to 2 - one for init_task, one to ensure it is never freed */
+struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
+
+struct group_info *groups_alloc(int gidsetsize)
+{
+ struct group_info *group_info;
+ int nblocks;
+ int i;
+
+ nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK;
+ /* Make sure we always allocate at least one indirect block pointer */
+ nblocks = nblocks ? : 1;
+ group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER);
+ if (!group_info)
+ return NULL;
+ group_info->ngroups = gidsetsize;
+ group_info->nblocks = nblocks;
+ atomic_set(&group_info->usage, 1);
+
+ if (gidsetsize <= NGROUPS_SMALL)
+ group_info->blocks[0] = group_info->small_block;
+ else {
+ for (i = 0; i < nblocks; i++) {
+ gid_t *b;
+ b = (void *)__get_free_page(GFP_USER);
+ if (!b)
+ goto out_undo_partial_alloc;
+ group_info->blocks[i] = b;
+ }
+ }
+ return group_info;
+
+out_undo_partial_alloc:
+ while (--i >= 0) {
+ free_page((unsigned long)group_info->blocks[i]);
+ }
+ kfree(group_info);
+ return NULL;
+}
+
+EXPORT_SYMBOL(groups_alloc);
+
+void groups_free(struct group_info *group_info)
+{
+ if (group_info->blocks[0] != group_info->small_block) {
+ int i;
+ for (i = 0; i < group_info->nblocks; i++)
+ free_page((unsigned long)group_info->blocks[i]);
+ }
+ kfree(group_info);
+}
+
+EXPORT_SYMBOL(groups_free);
+
+/* export the group_info to a user-space array */
+static int groups_to_user(gid_t __user *grouplist,
+ const struct group_info *group_info)
+{
+ int i;
+ unsigned int count = group_info->ngroups;
+
+ for (i = 0; i < group_info->nblocks; i++) {
+ unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
+ unsigned int len = cp_count * sizeof(*grouplist);
+
+ if (copy_to_user(grouplist, group_info->blocks[i], len))
+ return -EFAULT;
+
+ grouplist += NGROUPS_PER_BLOCK;
+ count -= cp_count;
+ }
+ return 0;
+}
+
+/* fill a group_info from a user-space array - it must be allocated already */
+static int groups_from_user(struct group_info *group_info,
+ gid_t __user *grouplist)
+{
+ int i;
+ unsigned int count = group_info->ngroups;
+
+ for (i = 0; i < group_info->nblocks; i++) {
+ unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
+ unsigned int len = cp_count * sizeof(*grouplist);
+
+ if (copy_from_user(group_info->blocks[i], grouplist, len))
+ return -EFAULT;
+
+ grouplist += NGROUPS_PER_BLOCK;
+ count -= cp_count;
+ }
+ return 0;
+}
+
+/* a simple Shell sort */
+static void groups_sort(struct group_info *group_info)
+{
+ int base, max, stride;
+ int gidsetsize = group_info->ngroups;
+
+ for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1)
+ ; /* nothing */
+ stride /= 3;
+
+ while (stride) {
+ max = gidsetsize - stride;
+ for (base = 0; base < max; base++) {
+ int left = base;
+ int right = left + stride;
+ gid_t tmp = GROUP_AT(group_info, right);
+
+ while (left >= 0 && GROUP_AT(group_info, left) > tmp) {
+ GROUP_AT(group_info, right) =
+ GROUP_AT(group_info, left);
+ right = left;
+ left -= stride;
+ }
+ GROUP_AT(group_info, right) = tmp;
+ }
+ stride /= 3;
+ }
+}
+
+/* a simple bsearch */
+int groups_search(const struct group_info *group_info, gid_t grp)
+{
+ unsigned int left, right;
+
+ if (!group_info)
+ return 0;
+
+ left = 0;
+ right = group_info->ngroups;
+ while (left < right) {
+ unsigned int mid = (left+right)/2;
+ int cmp = grp - GROUP_AT(group_info, mid);
+ if (cmp > 0)
+ left = mid + 1;
+ else if (cmp < 0)
+ right = mid;
+ else
+ return 1;
+ }
+ return 0;
+}
+
+/**
+ * set_groups - Change a group subscription in a set of credentials
+ * @new: The newly prepared set of credentials to alter
+ * @group_info: The group list to install
+ *
+ * Validate a group subscription and, if valid, insert it into a set
+ * of credentials.
+ */
+int set_groups(struct cred *new, struct group_info *group_info)
+{
+ int retval;
+
+ retval = security_task_setgroups(group_info);
+ if (retval)
+ return retval;
+
+ put_group_info(new->group_info);
+ groups_sort(group_info);
+ get_group_info(group_info);
+ new->group_info = group_info;
+ return 0;
+}
+
+EXPORT_SYMBOL(set_groups);
+
+/**
+ * set_current_groups - Change current's group subscription
+ * @group_info: The group list to impose
+ *
+ * Validate a group subscription and, if valid, impose it upon current's task
+ * security record.
+ */
+int set_current_groups(struct group_info *group_info)
+{
+ struct cred *new;
+ int ret;
+
+ new = prepare_creds();
+ if (!new)
+ return -ENOMEM;
+
+ ret = set_groups(new, group_info);
+ if (ret < 0) {
+ abort_creds(new);
+ return ret;
+ }
+
+ return commit_creds(new);
+}
+
+EXPORT_SYMBOL(set_current_groups);
+
+SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist)
+{
+ const struct cred *cred = current_cred();
+ int i;
+
+ if (gidsetsize < 0)
+ return -EINVAL;
+
+ /* no need to grab task_lock here; it cannot change */
+ i = cred->group_info->ngroups;
+ if (gidsetsize) {
+ if (i > gidsetsize) {
+ i = -EINVAL;
+ goto out;
+ }
+ if (groups_to_user(grouplist, cred->group_info)) {
+ i = -EFAULT;
+ goto out;
+ }
+ }
+out:
+ return i;
+}
+
+/*
+ * SMP: Our groups are copy-on-write. We can set them safely
+ * without another task interfering.
+ */
+
+SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
+{
+ struct group_info *group_info;
+ int retval;
+
+ if (!capable(CAP_SETGID))
+ return -EPERM;
+ if ((unsigned)gidsetsize > NGROUPS_MAX)
+ return -EINVAL;
+
+ group_info = groups_alloc(gidsetsize);
+ if (!group_info)
+ return -ENOMEM;
+ retval = groups_from_user(group_info, grouplist);
+ if (retval) {
+ put_group_info(group_info);
+ return retval;
+ }
+
+ retval = set_current_groups(group_info);
+ put_group_info(group_info);
+
+ return retval;
+}
+
+/*
+ * Check whether we're fsgid/egid or in the supplemental group..
+ */
+int in_group_p(gid_t grp)
+{
+ const struct cred *cred = current_cred();
+ int retval = 1;
+
+ if (grp != cred->fsgid)
+ retval = groups_search(cred->group_info, grp);
+ return retval;
+}
+
+EXPORT_SYMBOL(in_group_p);
+
+int in_egroup_p(gid_t grp)
+{
+ const struct cred *cred = current_cred();
+ int retval = 1;
+
+ if (grp != cred->egid)
+ retval = groups_search(cred->group_info, grp);
+ return retval;
+}
+
+EXPORT_SYMBOL(in_egroup_p);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index cb8a15c1958..49da79ab848 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -43,6 +43,8 @@
#include <linux/seq_file.h>
#include <linux/err.h>
#include <linux/debugobjects.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
#include <asm/uaccess.h>
@@ -189,21 +191,65 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
}
}
+
+/*
+ * Get the preferred target CPU for NOHZ
+ */
+static int hrtimer_get_target(int this_cpu, int pinned)
+{
+#ifdef CONFIG_NO_HZ
+ if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) {
+ int preferred_cpu = get_nohz_load_balancer();
+
+ if (preferred_cpu >= 0)
+ return preferred_cpu;
+ }
+#endif
+ return this_cpu;
+}
+
+/*
+ * With HIGHRES=y we do not migrate the timer when it is expiring
+ * before the next event on the target cpu because we cannot reprogram
+ * the target cpu hardware and we would cause it to fire late.
+ *
+ * Called with cpu_base->lock of target cpu held.
+ */
+static int
+hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
+{
+#ifdef CONFIG_HIGH_RES_TIMERS
+ ktime_t expires;
+
+ if (!new_base->cpu_base->hres_active)
+ return 0;
+
+ expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
+ return expires.tv64 <= new_base->cpu_base->expires_next.tv64;
+#else
+ return 0;
+#endif
+}
+
/*
* Switch the timer base to the current CPU when possible.
*/
static inline struct hrtimer_clock_base *
-switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base)
+switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
+ int pinned)
{
struct hrtimer_clock_base *new_base;
struct hrtimer_cpu_base *new_cpu_base;
+ int this_cpu = smp_processor_id();
+ int cpu = hrtimer_get_target(this_cpu, pinned);
- new_cpu_base = &__get_cpu_var(hrtimer_bases);
+again:
+ new_cpu_base = &per_cpu(hrtimer_bases, cpu);
new_base = &new_cpu_base->clock_base[base->index];
if (base != new_base) {
/*
- * We are trying to schedule the timer on the local CPU.
+ * We are trying to move timer to new_base.
* However we can't change timer's base while it is running,
* so we keep it on the same CPU. No hassle vs. reprogramming
* the event source in the high resolution case. The softirq
@@ -218,6 +264,14 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base)
timer->base = NULL;
spin_unlock(&base->cpu_base->lock);
spin_lock(&new_base->cpu_base->lock);
+
+ if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
+ cpu = this_cpu;
+ spin_unlock(&new_base->cpu_base->lock);
+ spin_lock(&base->cpu_base->lock);
+ timer->base = base;
+ goto again;
+ }
timer->base = new_base;
}
return new_base;
@@ -235,7 +289,7 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
return base;
}
-# define switch_hrtimer_base(t, b) (b)
+# define switch_hrtimer_base(t, b, p) (b)
#endif /* !CONFIG_SMP */
@@ -332,6 +386,8 @@ ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs)
return res;
}
+EXPORT_SYMBOL_GPL(ktime_add_safe);
+
#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
static struct debug_obj_descr hrtimer_debug_descr;
@@ -907,9 +963,9 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
ret = remove_hrtimer(timer, base);
/* Switch the timer base, if necessary: */
- new_base = switch_hrtimer_base(timer, base);
+ new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
- if (mode == HRTIMER_MODE_REL) {
+ if (mode & HRTIMER_MODE_REL) {
tim = ktime_add_safe(tim, new_base->get_time());
/*
* CONFIG_TIME_LOW_RES is a temporary way for architectures
@@ -1226,14 +1282,22 @@ void hrtimer_interrupt(struct clock_event_device *dev)
expires_next.tv64 = KTIME_MAX;
+ spin_lock(&cpu_base->lock);
+ /*
+ * We set expires_next to KTIME_MAX here with cpu_base->lock
+ * held to prevent that a timer is enqueued in our queue via
+ * the migration code. This does not affect enqueueing of
+ * timers which run their callback and need to be requeued on
+ * this CPU.
+ */
+ cpu_base->expires_next.tv64 = KTIME_MAX;
+
base = cpu_base->clock_base;
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
ktime_t basenow;
struct rb_node *node;
- spin_lock(&cpu_base->lock);
-
basenow = ktime_add(now, base->offset);
while ((node = base->first)) {
@@ -1266,11 +1330,15 @@ void hrtimer_interrupt(struct clock_event_device *dev)
__run_hrtimer(timer);
}
- spin_unlock(&cpu_base->lock);
base++;
}
+ /*
+ * Store the new expiry value so the migration code can verify
+ * against it.
+ */
cpu_base->expires_next = expires_next;
+ spin_unlock(&cpu_base->lock);
/* Reprogramming necessary ? */
if (expires_next.tv64 != KTIME_MAX) {
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 3394f8f5296..7d047808419 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -3,5 +3,5 @@ obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o
obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
-obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o
+obj-$(CONFIG_NUMA_IRQ_DESC) += numa_migrate.o
obj-$(CONFIG_PM_SLEEP) += pm.o
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index c687ba4363f..13c68e71b72 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -359,7 +359,6 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
spin_lock(&desc->lock);
mask_ack_irq(desc, irq);
- desc = irq_remap_to_desc(irq, desc);
if (unlikely(desc->status & IRQ_INPROGRESS))
goto out_unlock;
@@ -438,7 +437,6 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
desc->status &= ~IRQ_INPROGRESS;
out:
desc->chip->eoi(irq);
- desc = irq_remap_to_desc(irq, desc);
spin_unlock(&desc->lock);
}
@@ -475,7 +473,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
!desc->action)) {
desc->status |= (IRQ_PENDING | IRQ_MASKED);
mask_ack_irq(desc, irq);
- desc = irq_remap_to_desc(irq, desc);
goto out_unlock;
}
kstat_incr_irqs_this_cpu(irq, desc);
@@ -483,7 +480,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
/* Start handling the irq */
if (desc->chip->ack)
desc->chip->ack(irq);
- desc = irq_remap_to_desc(irq, desc);
/* Mark the IRQ currently in progress.*/
desc->status |= IRQ_INPROGRESS;
@@ -544,10 +540,8 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
if (!noirqdebug)
note_interrupt(irq, desc, action_ret);
- if (desc->chip->eoi) {
+ if (desc->chip->eoi)
desc->chip->eoi(irq);
- desc = irq_remap_to_desc(irq, desc);
- }
}
void
@@ -582,10 +576,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
/* Uninstall? */
if (handle == handle_bad_irq) {
- if (desc->chip != &no_irq_chip) {
+ if (desc->chip != &no_irq_chip)
mask_ack_irq(desc, irq);
- desc = irq_remap_to_desc(irq, desc);
- }
desc->status |= IRQ_DISABLED;
desc->depth = 1;
}
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 26e08754744..065205bdd92 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -11,14 +11,15 @@
*/
#include <linux/irq.h>
+#include <linux/slab.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
#include <linux/rculist.h>
#include <linux/hash.h>
-#include <trace/irq.h>
#include <linux/bootmem.h>
+#include <trace/events/irq.h>
#include "internals.h"
@@ -44,7 +45,7 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
static void __init init_irq_default_affinity(void)
{
- alloc_bootmem_cpumask_var(&irq_default_affinity);
+ alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
cpumask_setall(irq_default_affinity);
}
#else
@@ -81,45 +82,48 @@ static struct irq_desc irq_desc_init = {
.lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
};
-void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
+void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
{
- int node;
void *ptr;
- node = cpu_to_node(cpu);
- ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), GFP_ATOMIC, node);
+ if (slab_is_available())
+ ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs),
+ GFP_ATOMIC, node);
+ else
+ ptr = alloc_bootmem_node(NODE_DATA(node),
+ nr * sizeof(*desc->kstat_irqs));
/*
* don't overwite if can not get new one
* init_copy_kstat_irqs() could still use old one
*/
if (ptr) {
- printk(KERN_DEBUG " alloc kstat_irqs on cpu %d node %d\n",
- cpu, node);
+ printk(KERN_DEBUG " alloc kstat_irqs on node %d\n", node);
desc->kstat_irqs = ptr;
}
}
-static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
+static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
{
memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
spin_lock_init(&desc->lock);
desc->irq = irq;
#ifdef CONFIG_SMP
- desc->cpu = cpu;
+ desc->node = node;
#endif
lockdep_set_class(&desc->lock, &irq_desc_lock_class);
- init_kstat_irqs(desc, cpu, nr_cpu_ids);
+ init_kstat_irqs(desc, node, nr_cpu_ids);
if (!desc->kstat_irqs) {
printk(KERN_ERR "can not alloc kstat_irqs\n");
BUG_ON(1);
}
- if (!init_alloc_desc_masks(desc, cpu, false)) {
+ if (!alloc_desc_masks(desc, node, false)) {
printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
BUG_ON(1);
}
- arch_init_chip_data(desc, cpu);
+ init_desc_masks(desc);
+ arch_init_chip_data(desc, node);
}
/*
@@ -146,6 +150,7 @@ int __init early_irq_init(void)
{
struct irq_desc *desc;
int legacy_count;
+ int node;
int i;
init_irq_default_affinity();
@@ -156,20 +161,21 @@ int __init early_irq_init(void)
desc = irq_desc_legacy;
legacy_count = ARRAY_SIZE(irq_desc_legacy);
+ node = first_online_node;
/* allocate irq_desc_ptrs array based on nr_irqs */
- irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *));
+ irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT);
/* allocate based on nr_cpu_ids */
- /* FIXME: invert kstat_irgs, and it'd be a per_cpu_alloc'd thing */
- kstat_irqs_legacy = alloc_bootmem(NR_IRQS_LEGACY * nr_cpu_ids *
- sizeof(int));
+ kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids *
+ sizeof(int), GFP_NOWAIT, node);
for (i = 0; i < legacy_count; i++) {
desc[i].irq = i;
desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
- init_alloc_desc_masks(&desc[i], 0, true);
+ alloc_desc_masks(&desc[i], node, true);
+ init_desc_masks(&desc[i]);
irq_desc_ptrs[i] = desc + i;
}
@@ -187,11 +193,10 @@ struct irq_desc *irq_to_desc(unsigned int irq)
return NULL;
}
-struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
+struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
{
struct irq_desc *desc;
unsigned long flags;
- int node;
if (irq >= nr_irqs) {
WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n",
@@ -210,15 +215,17 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
if (desc)
goto out_unlock;
- node = cpu_to_node(cpu);
- desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
- printk(KERN_DEBUG " alloc irq_desc for %d on cpu %d node %d\n",
- irq, cpu, node);
+ if (slab_is_available())
+ desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
+ else
+ desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc));
+
+ printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node);
if (!desc) {
printk(KERN_ERR "can not alloc irq_desc\n");
BUG_ON(1);
}
- init_one_irq_desc(irq, desc, cpu);
+ init_one_irq_desc(irq, desc, node);
irq_desc_ptrs[irq] = desc;
@@ -256,7 +263,8 @@ int __init early_irq_init(void)
for (i = 0; i < count; i++) {
desc[i].irq = i;
- init_alloc_desc_masks(&desc[i], 0, true);
+ alloc_desc_masks(&desc[i], 0, true);
+ init_desc_masks(&desc[i]);
desc[i].kstat_irqs = kstat_irqs_all[i];
}
return arch_early_irq_init();
@@ -267,7 +275,7 @@ struct irq_desc *irq_to_desc(unsigned int irq)
return (irq < NR_IRQS) ? irq_desc + irq : NULL;
}
-struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
+struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
{
return irq_to_desc(irq);
}
@@ -348,9 +356,6 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action)
"but no thread function available.", irq, action->name);
}
-DEFINE_TRACE(irq_handler_entry);
-DEFINE_TRACE(irq_handler_exit);
-
/**
* handle_IRQ_event - irq action chain handler
* @irq: the interrupt number
@@ -453,11 +458,8 @@ unsigned int __do_IRQ(unsigned int irq)
/*
* No locking required for CPU-local interrupts:
*/
- if (desc->chip->ack) {
+ if (desc->chip->ack)
desc->chip->ack(irq);
- /* get new one */
- desc = irq_remap_to_desc(irq, desc);
- }
if (likely(!(desc->status & IRQ_DISABLED))) {
action_ret = handle_IRQ_event(irq, desc->action);
if (!noirqdebug)
@@ -468,10 +470,8 @@ unsigned int __do_IRQ(unsigned int irq)
}
spin_lock(&desc->lock);
- if (desc->chip->ack) {
+ if (desc->chip->ack)
desc->chip->ack(irq);
- desc = irq_remap_to_desc(irq, desc);
- }
/*
* REPLAY is when Linux resends an IRQ that was dropped earlier
* WAITING is used by probe to mark irqs that are being tested
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 01ce20eab38..e70ed5592eb 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -16,7 +16,7 @@ extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
extern struct lock_class_key irq_desc_lock_class;
-extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr);
+extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
extern void clear_kstat_irqs(struct irq_desc *desc);
extern spinlock_t sparse_irq_lock;
@@ -42,6 +42,8 @@ static inline void unregister_handler_proc(unsigned int irq,
extern int irq_select_affinity_usr(unsigned int irq);
+extern void irq_set_thread_affinity(struct irq_desc *desc);
+
/*
* Debugging printout:
*/
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 2734eca5924..0ec9ed83173 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -80,14 +80,22 @@ int irq_can_set_affinity(unsigned int irq)
return 1;
}
-static void
-irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask)
+/**
+ * irq_set_thread_affinity - Notify irq threads to adjust affinity
+ * @desc: irq descriptor which has affitnity changed
+ *
+ * We just set IRQTF_AFFINITY and delegate the affinity setting
+ * to the interrupt thread itself. We can not call
+ * set_cpus_allowed_ptr() here as we hold desc->lock and this
+ * code can be called from hard interrupt context.
+ */
+void irq_set_thread_affinity(struct irq_desc *desc)
{
struct irqaction *action = desc->action;
while (action) {
if (action->thread)
- set_cpus_allowed_ptr(action->thread, cpumask);
+ set_bit(IRQTF_AFFINITY, &action->thread_flags);
action = action->next;
}
}
@@ -109,17 +117,22 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
spin_lock_irqsave(&desc->lock, flags);
#ifdef CONFIG_GENERIC_PENDING_IRQ
- if (desc->status & IRQ_MOVE_PCNTXT)
- desc->chip->set_affinity(irq, cpumask);
+ if (desc->status & IRQ_MOVE_PCNTXT) {
+ if (!desc->chip->set_affinity(irq, cpumask)) {
+ cpumask_copy(desc->affinity, cpumask);
+ irq_set_thread_affinity(desc);
+ }
+ }
else {
desc->status |= IRQ_MOVE_PENDING;
cpumask_copy(desc->pending_mask, cpumask);
}
#else
- cpumask_copy(desc->affinity, cpumask);
- desc->chip->set_affinity(irq, cpumask);
+ if (!desc->chip->set_affinity(irq, cpumask)) {
+ cpumask_copy(desc->affinity, cpumask);
+ irq_set_thread_affinity(desc);
+ }
#endif
- irq_set_thread_affinity(desc, cpumask);
desc->status |= IRQ_AFFINITY_SET;
spin_unlock_irqrestore(&desc->lock, flags);
return 0;
@@ -171,7 +184,7 @@ int irq_select_affinity_usr(unsigned int irq)
spin_lock_irqsave(&desc->lock, flags);
ret = setup_affinity(irq, desc);
if (!ret)
- irq_set_thread_affinity(desc, desc->affinity);
+ irq_set_thread_affinity(desc);
spin_unlock_irqrestore(&desc->lock, flags);
return ret;
@@ -438,6 +451,39 @@ static int irq_wait_for_interrupt(struct irqaction *action)
return -1;
}
+#ifdef CONFIG_SMP
+/*
+ * Check whether we need to change the affinity of the interrupt thread.
+ */
+static void
+irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
+{
+ cpumask_var_t mask;
+
+ if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags))
+ return;
+
+ /*
+ * In case we are out of memory we set IRQTF_AFFINITY again and
+ * try again next time
+ */
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
+ set_bit(IRQTF_AFFINITY, &action->thread_flags);
+ return;
+ }
+
+ spin_lock_irq(&desc->lock);
+ cpumask_copy(mask, desc->affinity);
+ spin_unlock_irq(&desc->lock);
+
+ set_cpus_allowed_ptr(current, mask);
+ free_cpumask_var(mask);
+}
+#else
+static inline void
+irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
+#endif
+
/*
* Interrupt handler thread
*/
@@ -453,6 +499,8 @@ static int irq_thread(void *data)
while (!irq_wait_for_interrupt(action)) {
+ irq_thread_check_affinity(desc, action);
+
atomic_inc(&desc->threads_active);
spin_lock_irq(&desc->lock);
@@ -559,7 +607,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
*/
get_task_struct(t);
new->thread = t;
- wake_up_process(t);
}
/*
@@ -642,6 +689,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
(int)(new->flags & IRQF_TRIGGER_MASK));
}
+ new->irq = irq;
*old_ptr = new;
/* Reset broken irq detection when installing new handler */
@@ -659,7 +707,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
spin_unlock_irqrestore(&desc->lock, flags);
- new->irq = irq;
+ /*
+ * Strictly no need to wake it up, but hung_task complains
+ * when no hard interrupt wakes the thread up.
+ */
+ if (new->thread)
+ wake_up_process(new->thread);
+
register_irq_proc(irq, desc);
new->dir = NULL;
register_handler_proc(irq, new);
@@ -713,7 +767,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
{
struct irq_desc *desc = irq_to_desc(irq);
struct irqaction *action, **action_ptr;
- struct task_struct *irqthread;
unsigned long flags;
WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
@@ -761,9 +814,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
desc->chip->disable(irq);
}
- irqthread = action->thread;
- action->thread = NULL;
-
spin_unlock_irqrestore(&desc->lock, flags);
unregister_handler_proc(irq, action);
@@ -771,12 +821,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
/* Make sure it's not being used on another CPU: */
synchronize_irq(irq);
- if (irqthread) {
- if (!test_bit(IRQTF_DIED, &action->thread_flags))
- kthread_stop(irqthread);
- put_task_struct(irqthread);
- }
-
#ifdef CONFIG_DEBUG_SHIRQ
/*
* It's a shared IRQ -- the driver ought to be prepared for an IRQ
@@ -792,6 +836,13 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
local_irq_restore(flags);
}
#endif
+
+ if (action->thread) {
+ if (!test_bit(IRQTF_DIED, &action->thread_flags))
+ kthread_stop(action->thread);
+ put_task_struct(action->thread);
+ }
+
return action;
}
@@ -851,7 +902,7 @@ EXPORT_SYMBOL(free_irq);
* still called in hard interrupt context and has to check
* whether the interrupt originates from the device. If yes it
* needs to disable the interrupt on the device and return
- * IRQ_THREAD_WAKE which will wake up the handler thread and run
+ * IRQ_WAKE_THREAD which will wake up the handler thread and run
* @thread_fn. This split handler design is necessary to support
* shared interrupts.
*
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index e05ad9be43b..fcb6c96f262 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -1,5 +1,8 @@
#include <linux/irq.h>
+#include <linux/interrupt.h>
+
+#include "internals.h"
void move_masked_irq(int irq)
{
@@ -39,11 +42,12 @@ void move_masked_irq(int irq)
* masking the irqs.
*/
if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
- < nr_cpu_ids)) {
- cpumask_and(desc->affinity,
- desc->pending_mask, cpu_online_mask);
- desc->chip->set_affinity(irq, desc->affinity);
- }
+ < nr_cpu_ids))
+ if (!desc->chip->set_affinity(irq, desc->pending_mask)) {
+ cpumask_copy(desc->affinity, desc->pending_mask);
+ irq_set_thread_affinity(desc);
+ }
+
cpumask_clear(desc->pending_mask);
}
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 44bbdcbaf8d..3fd30197da2 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -15,9 +15,9 @@
static void init_copy_kstat_irqs(struct irq_desc *old_desc,
struct irq_desc *desc,
- int cpu, int nr)
+ int node, int nr)
{
- init_kstat_irqs(desc, cpu, nr);
+ init_kstat_irqs(desc, node, nr);
if (desc->kstat_irqs != old_desc->kstat_irqs)
memcpy(desc->kstat_irqs, old_desc->kstat_irqs,
@@ -34,20 +34,20 @@ static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
}
static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
- struct irq_desc *desc, int cpu)
+ struct irq_desc *desc, int node)
{
memcpy(desc, old_desc, sizeof(struct irq_desc));
- if (!init_alloc_desc_masks(desc, cpu, false)) {
+ if (!alloc_desc_masks(desc, node, false)) {
printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "
"for migration.\n", irq);
return false;
}
spin_lock_init(&desc->lock);
- desc->cpu = cpu;
+ desc->node = node;
lockdep_set_class(&desc->lock, &irq_desc_lock_class);
- init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids);
+ init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids);
init_copy_desc_masks(old_desc, desc);
- arch_init_copy_chip_data(old_desc, desc, cpu);
+ arch_init_copy_chip_data(old_desc, desc, node);
return true;
}
@@ -59,12 +59,11 @@ static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
}
static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
- int cpu)
+ int node)
{
struct irq_desc *desc;
unsigned int irq;
unsigned long flags;
- int node;
irq = old_desc->irq;
@@ -76,7 +75,6 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
if (desc && old_desc != desc)
goto out_unlock;
- node = cpu_to_node(cpu);
desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
if (!desc) {
printk(KERN_ERR "irq %d: can not get new irq_desc "
@@ -85,7 +83,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
desc = old_desc;
goto out_unlock;
}
- if (!init_copy_one_irq_desc(irq, old_desc, desc, cpu)) {
+ if (!init_copy_one_irq_desc(irq, old_desc, desc, node)) {
/* still use old one */
kfree(desc);
desc = old_desc;
@@ -97,9 +95,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
/* free the old one */
free_one_irq_desc(old_desc, desc);
- spin_unlock(&old_desc->lock);
kfree(old_desc);
- spin_lock(&desc->lock);
return desc;
@@ -109,24 +105,14 @@ out_unlock:
return desc;
}
-struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu)
+struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
{
- int old_cpu;
- int node, old_node;
-
- /* those all static, do move them */
- if (desc->irq < NR_IRQS_LEGACY)
+ /* those static or target node is -1, do not move them */
+ if (desc->irq < NR_IRQS_LEGACY || node == -1)
return desc;
- old_cpu = desc->cpu;
- if (old_cpu != cpu) {
- node = cpu_to_node(cpu);
- old_node = cpu_to_node(old_cpu);
- if (old_node != node)
- desc = __real_move_irq_desc(desc, cpu);
- else
- desc->cpu = cpu;
- }
+ if (desc->node != node)
+ desc = __real_move_irq_desc(desc, node);
return desc;
}
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 374faf9bfdc..3a29dbe7898 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -30,12 +30,16 @@
#define all_var 0
#endif
-/* These will be re-linked against their real values during the second link stage */
+/*
+ * These will be re-linked against their real values
+ * during the second link stage.
+ */
extern const unsigned long kallsyms_addresses[] __attribute__((weak));
extern const u8 kallsyms_names[] __attribute__((weak));
-/* tell the compiler that the count isn't in the small data section if the arch
- * has one (eg: FRV)
+/*
+ * Tell the compiler that the count isn't in the small data section if the arch
+ * has one (eg: FRV).
*/
extern const unsigned long kallsyms_num_syms
__attribute__((weak, section(".rodata")));
@@ -75,31 +79,37 @@ static int is_ksym_addr(unsigned long addr)
return is_kernel_text(addr) || is_kernel_inittext(addr);
}
-/* expand a compressed symbol data into the resulting uncompressed string,
- given the offset to where the symbol is in the compressed stream */
+/*
+ * Expand a compressed symbol data into the resulting uncompressed string,
+ * given the offset to where the symbol is in the compressed stream.
+ */
static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
{
int len, skipped_first = 0;
const u8 *tptr, *data;
- /* get the compressed symbol length from the first symbol byte */
+ /* Get the compressed symbol length from the first symbol byte. */
data = &kallsyms_names[off];
len = *data;
data++;
- /* update the offset to return the offset for the next symbol on
- * the compressed stream */
+ /*
+ * Update the offset to return the offset for the next symbol on
+ * the compressed stream.
+ */
off += len + 1;
- /* for every byte on the compressed symbol data, copy the table
- entry for that byte */
- while(len) {
- tptr = &kallsyms_token_table[ kallsyms_token_index[*data] ];
+ /*
+ * For every byte on the compressed symbol data, copy the table
+ * entry for that byte.
+ */
+ while (len) {
+ tptr = &kallsyms_token_table[kallsyms_token_index[*data]];
data++;
len--;
while (*tptr) {
- if(skipped_first) {
+ if (skipped_first) {
*result = *tptr;
result++;
} else
@@ -110,36 +120,46 @@ static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
*result = '\0';
- /* return to offset to the next symbol */
+ /* Return to offset to the next symbol. */
return off;
}
-/* get symbol type information. This is encoded as a single char at the
- * begining of the symbol name */
+/*
+ * Get symbol type information. This is encoded as a single char at the
+ * beginning of the symbol name.
+ */
static char kallsyms_get_symbol_type(unsigned int off)
{
- /* get just the first code, look it up in the token table, and return the
- * first char from this token */
- return kallsyms_token_table[ kallsyms_token_index[ kallsyms_names[off+1] ] ];
+ /*
+ * Get just the first code, look it up in the token table,
+ * and return the first char from this token.
+ */
+ return kallsyms_token_table[kallsyms_token_index[kallsyms_names[off + 1]]];
}
-/* find the offset on the compressed stream given and index in the
- * kallsyms array */
+/*
+ * Find the offset on the compressed stream given and index in the
+ * kallsyms array.
+ */
static unsigned int get_symbol_offset(unsigned long pos)
{
const u8 *name;
int i;
- /* use the closest marker we have. We have markers every 256 positions,
- * so that should be close enough */
- name = &kallsyms_names[ kallsyms_markers[pos>>8] ];
+ /*
+ * Use the closest marker we have. We have markers every 256 positions,
+ * so that should be close enough.
+ */
+ name = &kallsyms_names[kallsyms_markers[pos >> 8]];
- /* sequentially scan all the symbols up to the point we're searching for.
- * Every symbol is stored in a [<len>][<len> bytes of data] format, so we
- * just need to add the len to the current pointer for every symbol we
- * wish to skip */
- for(i = 0; i < (pos&0xFF); i++)
+ /*
+ * Sequentially scan all the symbols up to the point we're searching
+ * for. Every symbol is stored in a [<len>][<len> bytes of data] format,
+ * so we just need to add the len to the current pointer for every
+ * symbol we wish to skip.
+ */
+ for (i = 0; i < (pos & 0xFF); i++)
name = name + (*name) + 1;
return name - kallsyms_names;
@@ -190,7 +210,7 @@ static unsigned long get_symbol_pos(unsigned long addr,
/* This kernel should never had been booted. */
BUG_ON(!kallsyms_addresses);
- /* do a binary search on the sorted kallsyms_addresses array */
+ /* Do a binary search on the sorted kallsyms_addresses array. */
low = 0;
high = kallsyms_num_syms;
@@ -203,15 +223,15 @@ static unsigned long get_symbol_pos(unsigned long addr,
}
/*
- * search for the first aliased symbol. Aliased
- * symbols are symbols with the same address
+ * Search for the first aliased symbol. Aliased
+ * symbols are symbols with the same address.
*/
while (low && kallsyms_addresses[low-1] == kallsyms_addresses[low])
--low;
symbol_start = kallsyms_addresses[low];
- /* Search for next non-aliased symbol */
+ /* Search for next non-aliased symbol. */
for (i = low + 1; i < kallsyms_num_syms; i++) {
if (kallsyms_addresses[i] > symbol_start) {
symbol_end = kallsyms_addresses[i];
@@ -219,7 +239,7 @@ static unsigned long get_symbol_pos(unsigned long addr,
}
}
- /* if we found no next symbol, we use the end of the section */
+ /* If we found no next symbol, we use the end of the section. */
if (!symbol_end) {
if (is_kernel_inittext(addr))
symbol_end = (unsigned long)_einittext;
@@ -252,10 +272,10 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize,
/*
* Lookup an address
- * - modname is set to NULL if it's in the kernel
- * - we guarantee that the returned name is valid until we reschedule even if
- * it resides in a module
- * - we also guarantee that modname will be valid until rescheduled
+ * - modname is set to NULL if it's in the kernel.
+ * - We guarantee that the returned name is valid until we reschedule even if.
+ * It resides in a module.
+ * - We also guarantee that modname will be valid until rescheduled.
*/
const char *kallsyms_lookup(unsigned long addr,
unsigned long *symbolsize,
@@ -276,7 +296,7 @@ const char *kallsyms_lookup(unsigned long addr,
return namebuf;
}
- /* see if it's in a module */
+ /* See if it's in a module. */
return module_address_lookup(addr, symbolsize, offset, modname,
namebuf);
}
@@ -294,7 +314,7 @@ int lookup_symbol_name(unsigned long addr, char *symname)
kallsyms_expand_symbol(get_symbol_offset(pos), symname);
return 0;
}
- /* see if it's in a module */
+ /* See if it's in a module. */
return lookup_module_symbol_name(addr, symname);
}
@@ -313,7 +333,7 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
modname[0] = '\0';
return 0;
}
- /* see if it's in a module */
+ /* See if it's in a module. */
return lookup_module_symbol_attrs(addr, size, offset, modname, name);
}
@@ -342,6 +362,7 @@ int sprint_symbol(char *buffer, unsigned long address)
return len;
}
+EXPORT_SYMBOL_GPL(sprint_symbol);
/* Look up a kernel symbol and print it to the kernel messages. */
void __print_symbol(const char *fmt, unsigned long address)
@@ -352,13 +373,13 @@ void __print_symbol(const char *fmt, unsigned long address)
printk(fmt, buffer);
}
+EXPORT_SYMBOL(__print_symbol);
/* To avoid using get_symbol_offset for every symbol, we carry prefix along. */
-struct kallsym_iter
-{
+struct kallsym_iter {
loff_t pos;
unsigned long value;
- unsigned int nameoff; /* If iterating in core kernel symbols */
+ unsigned int nameoff; /* If iterating in core kernel symbols. */
char type;
char name[KSYM_NAME_LEN];
char module_name[MODULE_NAME_LEN];
@@ -404,7 +425,7 @@ static int update_iter(struct kallsym_iter *iter, loff_t pos)
iter->pos = pos;
return get_ksymbol_mod(iter);
}
-
+
/* If we're not on the desired position, reset to new position. */
if (pos != iter->pos)
reset_iter(iter, pos);
@@ -439,23 +460,25 @@ static int s_show(struct seq_file *m, void *p)
{
struct kallsym_iter *iter = m->private;
- /* Some debugging symbols have no name. Ignore them. */
+ /* Some debugging symbols have no name. Ignore them. */
if (!iter->name[0])
return 0;
if (iter->module_name[0]) {
char type;
- /* Label it "global" if it is exported,
- * "local" if not exported. */
+ /*
+ * Label it "global" if it is exported,
+ * "local" if not exported.
+ */
type = iter->exported ? toupper(iter->type) :
tolower(iter->type);
seq_printf(m, "%0*lx %c %s\t[%s]\n",
- (int)(2*sizeof(void*)),
+ (int)(2 * sizeof(void *)),
iter->value, type, iter->name, iter->module_name);
} else
seq_printf(m, "%0*lx %c %s\n",
- (int)(2*sizeof(void*)),
+ (int)(2 * sizeof(void *)),
iter->value, iter->type, iter->name);
return 0;
}
@@ -469,9 +492,11 @@ static const struct seq_operations kallsyms_op = {
static int kallsyms_open(struct inode *inode, struct file *file)
{
- /* We keep iterator in m->private, since normal case is to
+ /*
+ * We keep iterator in m->private, since normal case is to
* s_start from where we left off, so we avoid doing
- * using get_symbol_offset for every symbol */
+ * using get_symbol_offset for every symbol.
+ */
struct kallsym_iter *iter;
int ret;
@@ -500,7 +525,4 @@ static int __init kallsyms_init(void)
proc_create("kallsyms", 0444, NULL, &kallsyms_operations);
return 0;
}
-__initcall(kallsyms_init);
-
-EXPORT_SYMBOL(__print_symbol);
-EXPORT_SYMBOL_GPL(sprint_symbol);
+device_initcall(kallsyms_init);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index e4983770913..f336e2107f9 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1228,7 +1228,7 @@ static int __init parse_crashkernel_mem(char *cmdline,
} while (*cur++ == ',');
if (*crash_size > 0) {
- while (*cur != ' ' && *cur != '@')
+ while (*cur && *cur != ' ' && *cur != '@')
cur++;
if (*cur == '@') {
cur++;
@@ -1448,17 +1448,17 @@ int kernel_kexec(void)
goto Restore_console;
}
suspend_console();
- error = device_suspend(PMSG_FREEZE);
+ error = dpm_suspend_start(PMSG_FREEZE);
if (error)
goto Resume_console;
- /* At this point, device_suspend() has been called,
- * but *not* device_power_down(). We *must*
- * device_power_down() now. Otherwise, drivers for
+ /* At this point, dpm_suspend_start() has been called,
+ * but *not* dpm_suspend_noirq(). We *must* call
+ * dpm_suspend_noirq() now. Otherwise, drivers for
* some devices (e.g. interrupt controllers) become
* desynchronized with the actual state of the
* hardware at resume time, and evil weirdness ensues.
*/
- error = device_power_down(PMSG_FREEZE);
+ error = dpm_suspend_noirq(PMSG_FREEZE);
if (error)
goto Resume_devices;
error = disable_nonboot_cpus();
@@ -1486,9 +1486,9 @@ int kernel_kexec(void)
local_irq_enable();
Enable_cpus:
enable_nonboot_cpus();
- device_power_up(PMSG_RESTORE);
+ dpm_resume_noirq(PMSG_RESTORE);
Resume_devices:
- device_resume(PMSG_RESTORE);
+ dpm_resume_end(PMSG_RESTORE);
Resume_console:
resume_console();
thaw_processes();
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index bc41ad0f24f..26539e3228e 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -72,9 +72,9 @@ struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock)
/*
* round up to the next power of 2, since our 'let the indices
- * wrap' tachnique works only in this case.
+ * wrap' technique works only in this case.
*/
- if (size & (size - 1)) {
+ if (!is_power_of_2(size)) {
BUG_ON(size > 0x80000000);
size = roundup_pow_of_two(size);
}
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 7e95bedb2bf..385c31a1bdb 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -24,7 +24,6 @@
#include <linux/unistd.h>
#include <linux/kmod.h>
#include <linux/slab.h>
-#include <linux/mnt_namespace.h>
#include <linux/completion.h>
#include <linux/file.h>
#include <linux/fdtable.h>
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index c0fa54b276d..0540948e29a 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -237,13 +237,9 @@ static int __kprobes collect_garbage_slots(void)
{
struct kprobe_insn_page *kip;
struct hlist_node *pos, *next;
- int safety;
/* Ensure no-one is preepmted on the garbages */
- mutex_unlock(&kprobe_insn_mutex);
- safety = check_safety();
- mutex_lock(&kprobe_insn_mutex);
- if (safety != 0)
+ if (check_safety())
return -EAGAIN;
hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) {
@@ -698,7 +694,7 @@ int __kprobes register_kprobe(struct kprobe *p)
p->addr = addr;
preempt_disable();
- if (!__kernel_text_address((unsigned long) p->addr) ||
+ if (!kernel_text_address((unsigned long) p->addr) ||
in_kprobes_functions((unsigned long) p->addr)) {
preempt_enable();
return -EINVAL;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 4ebaf8519ab..eb8751aa041 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -9,11 +9,12 @@
#include <linux/kthread.h>
#include <linux/completion.h>
#include <linux/err.h>
+#include <linux/cpuset.h>
#include <linux/unistd.h>
#include <linux/file.h>
#include <linux/module.h>
#include <linux/mutex.h>
-#include <trace/sched.h>
+#include <trace/events/sched.h>
#define KTHREAD_NICE_LEVEL (-5)
@@ -21,15 +22,11 @@ static DEFINE_SPINLOCK(kthread_create_lock);
static LIST_HEAD(kthread_create_list);
struct task_struct *kthreadd_task;
-DEFINE_TRACE(sched_kthread_stop);
-DEFINE_TRACE(sched_kthread_stop_ret);
-
struct kthread_create_info
{
/* Information passed to kthread() from kthreadd. */
int (*threadfn)(void *data);
void *data;
- struct completion started;
/* Result passed back to kthread_create() from kthreadd. */
struct task_struct *result;
@@ -38,17 +35,13 @@ struct kthread_create_info
struct list_head list;
};
-struct kthread_stop_info
-{
- struct task_struct *k;
- int err;
- struct completion done;
+struct kthread {
+ int should_stop;
+ struct completion exited;
};
-/* Thread stopping is done by setthing this var: lock serializes
- * multiple kthread_stop calls. */
-static DEFINE_MUTEX(kthread_stop_lock);
-static struct kthread_stop_info kthread_stop_info;
+#define to_kthread(tsk) \
+ container_of((tsk)->vfork_done, struct kthread, exited)
/**
* kthread_should_stop - should this kthread return now?
@@ -59,36 +52,35 @@ static struct kthread_stop_info kthread_stop_info;
*/
int kthread_should_stop(void)
{
- return (kthread_stop_info.k == current);
+ return to_kthread(current)->should_stop;
}
EXPORT_SYMBOL(kthread_should_stop);
static int kthread(void *_create)
{
+ /* Copy data: it's on kthread's stack */
struct kthread_create_info *create = _create;
- int (*threadfn)(void *data);
- void *data;
- int ret = -EINTR;
+ int (*threadfn)(void *data) = create->threadfn;
+ void *data = create->data;
+ struct kthread self;
+ int ret;
- /* Copy data: it's on kthread's stack */
- threadfn = create->threadfn;
- data = create->data;
+ self.should_stop = 0;
+ init_completion(&self.exited);
+ current->vfork_done = &self.exited;
/* OK, tell user we're spawned, wait for stop or wakeup */
__set_current_state(TASK_UNINTERRUPTIBLE);
create->result = current;
- complete(&create->started);
+ complete(&create->done);
schedule();
- if (!kthread_should_stop())
+ ret = -EINTR;
+ if (!self.should_stop)
ret = threadfn(data);
- /* It might have exited on its own, w/o kthread_stop. Check. */
- if (kthread_should_stop()) {
- kthread_stop_info.err = ret;
- complete(&kthread_stop_info.done);
- }
- return 0;
+ /* we can't just return, we must preserve "self" on stack */
+ do_exit(ret);
}
static void create_kthread(struct kthread_create_info *create)
@@ -97,11 +89,10 @@ static void create_kthread(struct kthread_create_info *create)
/* We want our own signal handler (we take no signals by default). */
pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
- if (pid < 0)
+ if (pid < 0) {
create->result = ERR_PTR(pid);
- else
- wait_for_completion(&create->started);
- complete(&create->done);
+ complete(&create->done);
+ }
}
/**
@@ -132,7 +123,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
create.threadfn = threadfn;
create.data = data;
- init_completion(&create.started);
init_completion(&create.done);
spin_lock(&kthread_create_lock);
@@ -190,40 +180,34 @@ EXPORT_SYMBOL(kthread_bind);
* @k: thread created by kthread_create().
*
* Sets kthread_should_stop() for @k to return true, wakes it, and
- * waits for it to exit. Your threadfn() must not call do_exit()
- * itself if you use this function! This can also be called after
- * kthread_create() instead of calling wake_up_process(): the thread
- * will exit without calling threadfn().
+ * waits for it to exit. This can also be called after kthread_create()
+ * instead of calling wake_up_process(): the thread will exit without
+ * calling threadfn().
+ *
+ * If threadfn() may call do_exit() itself, the caller must ensure
+ * task_struct can't go away.
*
* Returns the result of threadfn(), or %-EINTR if wake_up_process()
* was never called.
*/
int kthread_stop(struct task_struct *k)
{
+ struct kthread *kthread;
int ret;
- mutex_lock(&kthread_stop_lock);
-
- /* It could exit after stop_info.k set, but before wake_up_process. */
- get_task_struct(k);
-
trace_sched_kthread_stop(k);
+ get_task_struct(k);
- /* Must init completion *before* thread sees kthread_stop_info.k */
- init_completion(&kthread_stop_info.done);
- smp_wmb();
+ kthread = to_kthread(k);
+ barrier(); /* it might have exited */
+ if (k->vfork_done != NULL) {
+ kthread->should_stop = 1;
+ wake_up_process(k);
+ wait_for_completion(&kthread->exited);
+ }
+ ret = k->exit_code;
- /* Now set kthread_should_stop() to true, and wake it up. */
- kthread_stop_info.k = k;
- wake_up_process(k);
put_task_struct(k);
-
- /* Once it dies, reset stop ptr, gather result and we're done. */
- wait_for_completion(&kthread_stop_info.done);
- kthread_stop_info.k = NULL;
- ret = kthread_stop_info.err;
- mutex_unlock(&kthread_stop_lock);
-
trace_sched_kthread_stop_ret(ret);
return ret;
@@ -239,6 +223,7 @@ int kthreadd(void *unused)
ignore_signals(tsk);
set_user_nice(tsk, KTHREAD_NICE_LEVEL);
set_cpus_allowed_ptr(tsk, cpu_all_mask);
+ set_mems_allowed(node_possible_map);
current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index accb40cdb12..8bbeef996c7 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -42,12 +42,14 @@
#include <linux/hash.h>
#include <linux/ftrace.h>
#include <linux/stringify.h>
-#include <trace/lockdep.h>
#include <asm/sections.h>
#include "lockdep_internals.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/lockdep.h>
+
#ifdef CONFIG_PROVE_LOCKING
int prove_locking = 1;
module_param(prove_locking, int, 0644);
@@ -2935,8 +2937,6 @@ void lock_set_class(struct lockdep_map *lock, const char *name,
}
EXPORT_SYMBOL_GPL(lock_set_class);
-DEFINE_TRACE(lock_acquire);
-
/*
* We are not always called with irqs disabled - do that here,
* and also avoid lockdep recursion:
@@ -2963,8 +2963,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
}
EXPORT_SYMBOL_GPL(lock_acquire);
-DEFINE_TRACE(lock_release);
-
void lock_release(struct lockdep_map *lock, int nested,
unsigned long ip)
{
@@ -3105,6 +3103,8 @@ found_it:
hlock->holdtime_stamp = now;
}
+ trace_lock_acquired(lock, ip, waittime);
+
stats = get_lock_stats(hlock_class(hlock));
if (waittime) {
if (hlock->read)
@@ -3120,8 +3120,6 @@ found_it:
lock->ip = ip;
}
-DEFINE_TRACE(lock_contended);
-
void lock_contended(struct lockdep_map *lock, unsigned long ip)
{
unsigned long flags;
@@ -3143,14 +3141,10 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
}
EXPORT_SYMBOL_GPL(lock_contended);
-DEFINE_TRACE(lock_acquired);
-
void lock_acquired(struct lockdep_map *lock, unsigned long ip)
{
unsigned long flags;
- trace_lock_acquired(lock, ip);
-
if (unlikely(!lock_stat))
return;
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index d7135aa2d2c..e94caa666db 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -758,7 +758,8 @@ static int __init lockdep_proc_init(void)
&proc_lockdep_stats_operations);
#ifdef CONFIG_LOCK_STAT
- proc_create("lock_stat", S_IRUSR, NULL, &proc_lock_stat_operations);
+ proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL,
+ &proc_lock_stat_operations);
#endif
return 0;
diff --git a/kernel/module.c b/kernel/module.c
index e797812a4d9..fd141140355 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -18,6 +18,7 @@
*/
#include <linux/module.h>
#include <linux/moduleloader.h>
+#include <linux/ftrace_event.h>
#include <linux/init.h>
#include <linux/kallsyms.h>
#include <linux/fs.h>
@@ -52,6 +53,7 @@
#include <linux/ftrace.h>
#include <linux/async.h>
#include <linux/percpu.h>
+#include <linux/kmemleak.h>
#if 0
#define DEBUGP printk
@@ -72,6 +74,9 @@ DEFINE_MUTEX(module_mutex);
EXPORT_SYMBOL_GPL(module_mutex);
static LIST_HEAD(modules);
+/* Block module loading/unloading? */
+int modules_disabled = 0;
+
/* Waiting for a module to finish initializing? */
static DECLARE_WAIT_QUEUE_HEAD(module_wq);
@@ -429,6 +434,7 @@ static void *percpu_modalloc(unsigned long size, unsigned long align,
unsigned long extra;
unsigned int i;
void *ptr;
+ int cpu;
if (align > PAGE_SIZE) {
printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
@@ -458,6 +464,11 @@ static void *percpu_modalloc(unsigned long size, unsigned long align,
if (!split_block(i, size))
return NULL;
+ /* add the per-cpu scanning areas */
+ for_each_possible_cpu(cpu)
+ kmemleak_alloc(ptr + per_cpu_offset(cpu), size, 0,
+ GFP_KERNEL);
+
/* Mark allocated */
pcpu_size[i] = -pcpu_size[i];
return ptr;
@@ -472,6 +483,7 @@ static void percpu_modfree(void *freeme)
{
unsigned int i;
void *ptr = __per_cpu_start + block_size(pcpu_size[0]);
+ int cpu;
/* First entry is core kernel percpu data. */
for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
@@ -483,6 +495,10 @@ static void percpu_modfree(void *freeme)
BUG();
free:
+ /* remove the per-cpu scanning areas */
+ for_each_possible_cpu(cpu)
+ kmemleak_free(freeme + per_cpu_offset(cpu));
+
/* Merge with previous? */
if (pcpu_size[i-1] >= 0) {
pcpu_size[i-1] += pcpu_size[i];
@@ -777,7 +793,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
char name[MODULE_NAME_LEN];
int ret, forced = 0;
- if (!capable(CAP_SYS_MODULE))
+ if (!capable(CAP_SYS_MODULE) || modules_disabled)
return -EPERM;
if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0)
@@ -1052,7 +1068,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
{
const unsigned long *crc;
- if (!find_symbol("module_layout", NULL, &crc, true, false))
+ if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL,
+ &crc, true, false))
BUG();
return check_version(sechdrs, versindex, "module_layout", mod, crc);
}
@@ -1489,9 +1506,6 @@ static void free_module(struct module *mod)
/* Free any allocated parameters. */
destroy_params(mod->kp, mod->num_kp);
- /* release any pointers to mcount in this module */
- ftrace_release(mod->module_core, mod->core_size);
-
/* This may be NULL, but that's OK */
module_free(mod, mod->module_init);
kfree(mod->args);
@@ -1878,6 +1892,36 @@ static void *module_alloc_update_bounds(unsigned long size)
return ret;
}
+#ifdef CONFIG_DEBUG_KMEMLEAK
+static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr,
+ Elf_Shdr *sechdrs, char *secstrings)
+{
+ unsigned int i;
+
+ /* only scan the sections containing data */
+ kmemleak_scan_area(mod->module_core, (unsigned long)mod -
+ (unsigned long)mod->module_core,
+ sizeof(struct module), GFP_KERNEL);
+
+ for (i = 1; i < hdr->e_shnum; i++) {
+ if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+ continue;
+ if (strncmp(secstrings + sechdrs[i].sh_name, ".data", 5) != 0
+ && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0)
+ continue;
+
+ kmemleak_scan_area(mod->module_core, sechdrs[i].sh_addr -
+ (unsigned long)mod->module_core,
+ sechdrs[i].sh_size, GFP_KERNEL);
+ }
+}
+#else
+static inline void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr,
+ Elf_Shdr *sechdrs, char *secstrings)
+{
+}
+#endif
+
/* Allocate and load the module: note that size of section 0 is always
zero, and we rely on this for optional sections. */
static noinline struct module *load_module(void __user *umod,
@@ -1892,11 +1936,9 @@ static noinline struct module *load_module(void __user *umod,
unsigned int symindex = 0;
unsigned int strindex = 0;
unsigned int modindex, versindex, infoindex, pcpuindex;
- unsigned int num_mcount;
struct module *mod;
long err = 0;
void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
- unsigned long *mseg;
mm_segment_t old_fs;
DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
@@ -2050,6 +2092,12 @@ static noinline struct module *load_module(void __user *umod,
/* Do the allocs. */
ptr = module_alloc_update_bounds(mod->core_size);
+ /*
+ * The pointer to this block is stored in the module structure
+ * which is inside the block. Just mark it as not being a
+ * leak.
+ */
+ kmemleak_not_leak(ptr);
if (!ptr) {
err = -ENOMEM;
goto free_percpu;
@@ -2058,6 +2106,13 @@ static noinline struct module *load_module(void __user *umod,
mod->module_core = ptr;
ptr = module_alloc_update_bounds(mod->init_size);
+ /*
+ * The pointer to this block is stored in the module structure
+ * which is inside the block. This block doesn't need to be
+ * scanned as it contains data and code that will be freed
+ * after the module is initialized.
+ */
+ kmemleak_ignore(ptr);
if (!ptr && mod->init_size) {
err = -ENOMEM;
goto free_core;
@@ -2088,6 +2143,7 @@ static noinline struct module *load_module(void __user *umod,
}
/* Module has been moved. */
mod = (void *)sechdrs[modindex].sh_addr;
+ kmemleak_load_module(mod, hdr, sechdrs, secstrings);
#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t),
@@ -2161,6 +2217,10 @@ static noinline struct module *load_module(void __user *umod,
mod->unused_gpl_crcs = section_addr(hdr, sechdrs, secstrings,
"__kcrctab_unused_gpl");
#endif
+#ifdef CONFIG_CONSTRUCTORS
+ mod->ctors = section_objs(hdr, sechdrs, secstrings, ".ctors",
+ sizeof(*mod->ctors), &mod->num_ctors);
+#endif
#ifdef CONFIG_MARKERS
mod->markers = section_objs(hdr, sechdrs, secstrings, "__markers",
@@ -2172,7 +2232,19 @@ static noinline struct module *load_module(void __user *umod,
sizeof(*mod->tracepoints),
&mod->num_tracepoints);
#endif
-
+#ifdef CONFIG_EVENT_TRACING
+ mod->trace_events = section_objs(hdr, sechdrs, secstrings,
+ "_ftrace_events",
+ sizeof(*mod->trace_events),
+ &mod->num_trace_events);
+#endif
+#ifdef CONFIG_FTRACE_MCOUNT_RECORD
+ /* sechdrs[0].sh_size is always zero */
+ mod->ftrace_callsites = section_objs(hdr, sechdrs, secstrings,
+ "__mcount_loc",
+ sizeof(*mod->ftrace_callsites),
+ &mod->num_ftrace_callsites);
+#endif
#ifdef CONFIG_MODVERSIONS
if ((mod->num_syms && !mod->crcs)
|| (mod->num_gpl_syms && !mod->gpl_crcs)
@@ -2237,11 +2309,6 @@ static noinline struct module *load_module(void __user *umod,
dynamic_debug_setup(debug, num_debug);
}
- /* sechdrs[0].sh_size is always zero */
- mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc",
- sizeof(*mseg), &num_mcount);
- ftrace_init_module(mod, mseg, mseg + num_mcount);
-
err = module_finalize(hdr, sechdrs, mod);
if (err < 0)
goto cleanup;
@@ -2302,7 +2369,6 @@ static noinline struct module *load_module(void __user *umod,
cleanup:
kobject_del(&mod->mkobj.kobj);
kobject_put(&mod->mkobj.kobj);
- ftrace_release(mod->module_core, mod->core_size);
free_unload:
module_unload_free(mod);
#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
@@ -2328,6 +2394,17 @@ static noinline struct module *load_module(void __user *umod,
goto free_hdr;
}
+/* Call module constructors. */
+static void do_mod_ctors(struct module *mod)
+{
+#ifdef CONFIG_CONSTRUCTORS
+ unsigned long i;
+
+ for (i = 0; i < mod->num_ctors; i++)
+ mod->ctors[i]();
+#endif
+}
+
/* This is where the real work happens */
SYSCALL_DEFINE3(init_module, void __user *, umod,
unsigned long, len, const char __user *, uargs)
@@ -2336,7 +2413,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
int ret = 0;
/* Must have permission */
- if (!capable(CAP_SYS_MODULE))
+ if (!capable(CAP_SYS_MODULE) || modules_disabled)
return -EPERM;
/* Only one module load at a time, please */
@@ -2356,6 +2433,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_COMING, mod);
+ do_mod_ctors(mod);
/* Start the module */
if (mod->init != NULL)
ret = do_one_initcall(mod->init);
@@ -2374,9 +2452,9 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
return ret;
}
if (ret > 0) {
- printk(KERN_WARNING "%s: '%s'->init suspiciously returned %d, "
- "it should follow 0/-E convention\n"
- KERN_WARNING "%s: loading module anyway...\n",
+ printk(KERN_WARNING
+"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n"
+"%s: loading module anyway...\n",
__func__, mod->name, ret,
__func__);
dump_stack();
@@ -2394,6 +2472,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
mutex_lock(&module_mutex);
/* Drop initial reference. */
module_put(mod);
+ trim_init_extable(mod);
module_free(mod, mod->module_init);
mod->module_init = NULL;
mod->init_size = 0;
@@ -2837,7 +2916,7 @@ void print_modules(void)
struct module *mod;
char buf[8];
- printk("Modules linked in:");
+ printk(KERN_DEFAULT "Modules linked in:");
/* Most callers should already have preempt disabled, but make sure */
preempt_disable();
list_for_each_entry_rcu(mod, &modules, list)
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 507cf2b5e9f..947b3ad551f 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -89,7 +89,7 @@ __mutex_lock_slowpath(atomic_t *lock_count);
*
* This function is similar to (but not equivalent to) down().
*/
-void inline __sched mutex_lock(struct mutex *lock)
+void __sched mutex_lock(struct mutex *lock)
{
might_sleep();
/*
@@ -249,7 +249,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
/* didnt get the lock, go to sleep: */
spin_unlock_mutex(&lock->wait_lock, flags);
- __schedule();
+ preempt_enable_no_resched();
+ schedule();
+ preempt_disable();
spin_lock_mutex(&lock->wait_lock, flags);
}
@@ -471,5 +473,28 @@ int __sched mutex_trylock(struct mutex *lock)
return ret;
}
-
EXPORT_SYMBOL(mutex_trylock);
+
+/**
+ * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
+ * @cnt: the atomic which we are to dec
+ * @lock: the mutex to return holding if we dec to 0
+ *
+ * return true and hold lock if we dec to 0, return false otherwise
+ */
+int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
+{
+ /* dec if we can't possibly hit 0 */
+ if (atomic_add_unless(cnt, -1, 1))
+ return 0;
+ /* we might hit 0, so take the lock */
+ mutex_lock(lock);
+ if (!atomic_dec_and_test(cnt)) {
+ /* when we actually did the dec, we didn't hit 0 */
+ mutex_unlock(lock);
+ return 0;
+ }
+ /* we hit 0, and we hold the lock */
+ return 1;
+}
+EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 63598dca2d0..09b4ff9711b 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -26,19 +26,14 @@ static struct kmem_cache *nsproxy_cachep;
struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
-/*
- * creates a copy of "orig" with refcount 1.
- */
-static inline struct nsproxy *clone_nsproxy(struct nsproxy *orig)
+static inline struct nsproxy *create_nsproxy(void)
{
- struct nsproxy *ns;
+ struct nsproxy *nsproxy;
- ns = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL);
- if (ns) {
- memcpy(ns, orig, sizeof(struct nsproxy));
- atomic_set(&ns->count, 1);
- }
- return ns;
+ nsproxy = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL);
+ if (nsproxy)
+ atomic_set(&nsproxy->count, 1);
+ return nsproxy;
}
/*
@@ -52,7 +47,7 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
struct nsproxy *new_nsp;
int err;
- new_nsp = clone_nsproxy(tsk->nsproxy);
+ new_nsp = create_nsproxy();
if (!new_nsp)
return ERR_PTR(-ENOMEM);
diff --git a/kernel/panic.c b/kernel/panic.c
index 984b3ecbd72..512ab73b0ca 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -301,6 +301,7 @@ int oops_may_print(void)
*/
void oops_enter(void)
{
+ tracing_off();
/* can't trust the integrity of the kernel anymore: */
debug_locks_off();
do_oops_enter_exit();
diff --git a/kernel/params.c b/kernel/params.c
index de273ec85bd..7f6912ced2b 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -24,9 +24,6 @@
#include <linux/err.h>
#include <linux/slab.h>
-/* We abuse the high bits of "perm" to record whether we kmalloc'ed. */
-#define KPARAM_KMALLOCED 0x80000000
-
#if 0
#define DEBUGP printk
#else
@@ -220,13 +217,13 @@ int param_set_charp(const char *val, struct kernel_param *kp)
return -ENOSPC;
}
- if (kp->perm & KPARAM_KMALLOCED)
+ if (kp->flags & KPARAM_KMALLOCED)
kfree(*(char **)kp->arg);
/* This is a hack. We can't need to strdup in early boot, and we
* don't need to; this mangled commandline is preserved. */
if (slab_is_available()) {
- kp->perm |= KPARAM_KMALLOCED;
+ kp->flags |= KPARAM_KMALLOCED;
*(char **)kp->arg = kstrdup(val, GFP_KERNEL);
if (!kp->arg)
return -ENOMEM;
@@ -241,44 +238,63 @@ int param_get_charp(char *buffer, struct kernel_param *kp)
return sprintf(buffer, "%s", *((char **)kp->arg));
}
+/* Actually could be a bool or an int, for historical reasons. */
int param_set_bool(const char *val, struct kernel_param *kp)
{
+ bool v;
+
/* No equals means "set"... */
if (!val) val = "1";
/* One of =[yYnN01] */
switch (val[0]) {
case 'y': case 'Y': case '1':
- *(int *)kp->arg = 1;
- return 0;
+ v = true;
+ break;
case 'n': case 'N': case '0':
- *(int *)kp->arg = 0;
- return 0;
+ v = false;
+ break;
+ default:
+ return -EINVAL;
}
- return -EINVAL;
+
+ if (kp->flags & KPARAM_ISBOOL)
+ *(bool *)kp->arg = v;
+ else
+ *(int *)kp->arg = v;
+ return 0;
}
int param_get_bool(char *buffer, struct kernel_param *kp)
{
+ bool val;
+ if (kp->flags & KPARAM_ISBOOL)
+ val = *(bool *)kp->arg;
+ else
+ val = *(int *)kp->arg;
+
/* Y and N chosen as being relatively non-coder friendly */
- return sprintf(buffer, "%c", (*(int *)kp->arg) ? 'Y' : 'N');
+ return sprintf(buffer, "%c", val ? 'Y' : 'N');
}
+/* This one must be bool. */
int param_set_invbool(const char *val, struct kernel_param *kp)
{
- int boolval, ret;
+ int ret;
+ bool boolval;
struct kernel_param dummy;
dummy.arg = &boolval;
+ dummy.flags = KPARAM_ISBOOL;
ret = param_set_bool(val, &dummy);
if (ret == 0)
- *(int *)kp->arg = !boolval;
+ *(bool *)kp->arg = !boolval;
return ret;
}
int param_get_invbool(char *buffer, struct kernel_param *kp)
{
- return sprintf(buffer, "%c", (*(int *)kp->arg) ? 'N' : 'Y');
+ return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y');
}
/* We break the rule and mangle the string. */
@@ -591,7 +607,7 @@ void destroy_params(const struct kernel_param *params, unsigned num)
unsigned int i;
for (i = 0; i < num; i++)
- if (params[i].perm & KPARAM_KMALLOCED)
+ if (params[i].flags & KPARAM_KMALLOCED)
kfree(*(char **)params[i].arg);
}
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
new file mode 100644
index 00000000000..36f65e2b8b5
--- /dev/null
+++ b/kernel/perf_counter.c
@@ -0,0 +1,4860 @@
+/*
+ * Performance counter core code
+ *
+ * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
+ * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ * For licensing details see kernel-base/COPYING
+ */
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+#include <linux/file.h>
+#include <linux/poll.h>
+#include <linux/sysfs.h>
+#include <linux/dcache.h>
+#include <linux/percpu.h>
+#include <linux/ptrace.h>
+#include <linux/vmstat.h>
+#include <linux/hardirq.h>
+#include <linux/rculist.h>
+#include <linux/uaccess.h>
+#include <linux/syscalls.h>
+#include <linux/anon_inodes.h>
+#include <linux/kernel_stat.h>
+#include <linux/perf_counter.h>
+
+#include <asm/irq_regs.h>
+
+/*
+ * Each CPU has a list of per CPU counters:
+ */
+DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
+
+int perf_max_counters __read_mostly = 1;
+static int perf_reserved_percpu __read_mostly;
+static int perf_overcommit __read_mostly = 1;
+
+static atomic_t nr_counters __read_mostly;
+static atomic_t nr_mmap_counters __read_mostly;
+static atomic_t nr_comm_counters __read_mostly;
+static atomic_t nr_task_counters __read_mostly;
+
+/*
+ * perf counter paranoia level:
+ * 0 - not paranoid
+ * 1 - disallow cpu counters to unpriv
+ * 2 - disallow kernel profiling to unpriv
+ */
+int sysctl_perf_counter_paranoid __read_mostly;
+
+static inline bool perf_paranoid_cpu(void)
+{
+ return sysctl_perf_counter_paranoid > 0;
+}
+
+static inline bool perf_paranoid_kernel(void)
+{
+ return sysctl_perf_counter_paranoid > 1;
+}
+
+int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */
+
+/*
+ * max perf counter sample rate
+ */
+int sysctl_perf_counter_sample_rate __read_mostly = 100000;
+
+static atomic64_t perf_counter_id;
+
+/*
+ * Lock for (sysadmin-configurable) counter reservations:
+ */
+static DEFINE_SPINLOCK(perf_resource_lock);
+
+/*
+ * Architecture provided APIs - weak aliases:
+ */
+extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
+{
+ return NULL;
+}
+
+void __weak hw_perf_disable(void) { barrier(); }
+void __weak hw_perf_enable(void) { barrier(); }
+
+void __weak hw_perf_counter_setup(int cpu) { barrier(); }
+void __weak hw_perf_counter_setup_online(int cpu) { barrier(); }
+
+int __weak
+hw_perf_group_sched_in(struct perf_counter *group_leader,
+ struct perf_cpu_context *cpuctx,
+ struct perf_counter_context *ctx, int cpu)
+{
+ return 0;
+}
+
+void __weak perf_counter_print_debug(void) { }
+
+static DEFINE_PER_CPU(int, disable_count);
+
+void __perf_disable(void)
+{
+ __get_cpu_var(disable_count)++;
+}
+
+bool __perf_enable(void)
+{
+ return !--__get_cpu_var(disable_count);
+}
+
+void perf_disable(void)
+{
+ __perf_disable();
+ hw_perf_disable();
+}
+
+void perf_enable(void)
+{
+ if (__perf_enable())
+ hw_perf_enable();
+}
+
+static void get_ctx(struct perf_counter_context *ctx)
+{
+ WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
+}
+
+static void free_ctx(struct rcu_head *head)
+{
+ struct perf_counter_context *ctx;
+
+ ctx = container_of(head, struct perf_counter_context, rcu_head);
+ kfree(ctx);
+}
+
+static void put_ctx(struct perf_counter_context *ctx)
+{
+ if (atomic_dec_and_test(&ctx->refcount)) {
+ if (ctx->parent_ctx)
+ put_ctx(ctx->parent_ctx);
+ if (ctx->task)
+ put_task_struct(ctx->task);
+ call_rcu(&ctx->rcu_head, free_ctx);
+ }
+}
+
+static void unclone_ctx(struct perf_counter_context *ctx)
+{
+ if (ctx->parent_ctx) {
+ put_ctx(ctx->parent_ctx);
+ ctx->parent_ctx = NULL;
+ }
+}
+
+/*
+ * If we inherit counters we want to return the parent counter id
+ * to userspace.
+ */
+static u64 primary_counter_id(struct perf_counter *counter)
+{
+ u64 id = counter->id;
+
+ if (counter->parent)
+ id = counter->parent->id;
+
+ return id;
+}
+
+/*
+ * Get the perf_counter_context for a task and lock it.
+ * This has to cope with with the fact that until it is locked,
+ * the context could get moved to another task.
+ */
+static struct perf_counter_context *
+perf_lock_task_context(struct task_struct *task, unsigned long *flags)
+{
+ struct perf_counter_context *ctx;
+
+ rcu_read_lock();
+ retry:
+ ctx = rcu_dereference(task->perf_counter_ctxp);
+ if (ctx) {
+ /*
+ * If this context is a clone of another, it might
+ * get swapped for another underneath us by
+ * perf_counter_task_sched_out, though the
+ * rcu_read_lock() protects us from any context
+ * getting freed. Lock the context and check if it
+ * got swapped before we could get the lock, and retry
+ * if so. If we locked the right context, then it
+ * can't get swapped on us any more.
+ */
+ spin_lock_irqsave(&ctx->lock, *flags);
+ if (ctx != rcu_dereference(task->perf_counter_ctxp)) {
+ spin_unlock_irqrestore(&ctx->lock, *flags);
+ goto retry;
+ }
+
+ if (!atomic_inc_not_zero(&ctx->refcount)) {
+ spin_unlock_irqrestore(&ctx->lock, *flags);
+ ctx = NULL;
+ }
+ }
+ rcu_read_unlock();
+ return ctx;
+}
+
+/*
+ * Get the context for a task and increment its pin_count so it
+ * can't get swapped to another task. This also increments its
+ * reference count so that the context can't get freed.
+ */
+static struct perf_counter_context *perf_pin_task_context(struct task_struct *task)
+{
+ struct perf_counter_context *ctx;
+ unsigned long flags;
+
+ ctx = perf_lock_task_context(task, &flags);
+ if (ctx) {
+ ++ctx->pin_count;
+ spin_unlock_irqrestore(&ctx->lock, flags);
+ }
+ return ctx;
+}
+
+static void perf_unpin_context(struct perf_counter_context *ctx)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->lock, flags);
+ --ctx->pin_count;
+ spin_unlock_irqrestore(&ctx->lock, flags);
+ put_ctx(ctx);
+}
+
+/*
+ * Add a counter from the lists for its context.
+ * Must be called with ctx->mutex and ctx->lock held.
+ */
+static void
+list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
+{
+ struct perf_counter *group_leader = counter->group_leader;
+
+ /*
+ * Depending on whether it is a standalone or sibling counter,
+ * add it straight to the context's counter list, or to the group
+ * leader's sibling list:
+ */
+ if (group_leader == counter)
+ list_add_tail(&counter->list_entry, &ctx->counter_list);
+ else {
+ list_add_tail(&counter->list_entry, &group_leader->sibling_list);
+ group_leader->nr_siblings++;
+ }
+
+ list_add_rcu(&counter->event_entry, &ctx->event_list);
+ ctx->nr_counters++;
+ if (counter->attr.inherit_stat)
+ ctx->nr_stat++;
+}
+
+/*
+ * Remove a counter from the lists for its context.
+ * Must be called with ctx->mutex and ctx->lock held.
+ */
+static void
+list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
+{
+ struct perf_counter *sibling, *tmp;
+
+ if (list_empty(&counter->list_entry))
+ return;
+ ctx->nr_counters--;
+ if (counter->attr.inherit_stat)
+ ctx->nr_stat--;
+
+ list_del_init(&counter->list_entry);
+ list_del_rcu(&counter->event_entry);
+
+ if (counter->group_leader != counter)
+ counter->group_leader->nr_siblings--;
+
+ /*
+ * If this was a group counter with sibling counters then
+ * upgrade the siblings to singleton counters by adding them
+ * to the context list directly:
+ */
+ list_for_each_entry_safe(sibling, tmp,
+ &counter->sibling_list, list_entry) {
+
+ list_move_tail(&sibling->list_entry, &ctx->counter_list);
+ sibling->group_leader = sibling;
+ }
+}
+
+static void
+counter_sched_out(struct perf_counter *counter,
+ struct perf_cpu_context *cpuctx,
+ struct perf_counter_context *ctx)
+{
+ if (counter->state != PERF_COUNTER_STATE_ACTIVE)
+ return;
+
+ counter->state = PERF_COUNTER_STATE_INACTIVE;
+ if (counter->pending_disable) {
+ counter->pending_disable = 0;
+ counter->state = PERF_COUNTER_STATE_OFF;
+ }
+ counter->tstamp_stopped = ctx->time;
+ counter->pmu->disable(counter);
+ counter->oncpu = -1;
+
+ if (!is_software_counter(counter))
+ cpuctx->active_oncpu--;
+ ctx->nr_active--;
+ if (counter->attr.exclusive || !cpuctx->active_oncpu)
+ cpuctx->exclusive = 0;
+}
+
+static void
+group_sched_out(struct perf_counter *group_counter,
+ struct perf_cpu_context *cpuctx,
+ struct perf_counter_context *ctx)
+{
+ struct perf_counter *counter;
+
+ if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
+ return;
+
+ counter_sched_out(group_counter, cpuctx, ctx);
+
+ /*
+ * Schedule out siblings (if any):
+ */
+ list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
+ counter_sched_out(counter, cpuctx, ctx);
+
+ if (group_counter->attr.exclusive)
+ cpuctx->exclusive = 0;
+}
+
+/*
+ * Cross CPU call to remove a performance counter
+ *
+ * We disable the counter on the hardware level first. After that we
+ * remove it from the context list.
+ */
+static void __perf_counter_remove_from_context(void *info)
+{
+ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+ struct perf_counter *counter = info;
+ struct perf_counter_context *ctx = counter->ctx;
+
+ /*
+ * If this is a task context, we need to check whether it is
+ * the current task context of this cpu. If not it has been
+ * scheduled out before the smp call arrived.
+ */
+ if (ctx->task && cpuctx->task_ctx != ctx)
+ return;
+
+ spin_lock(&ctx->lock);
+ /*
+ * Protect the list operation against NMI by disabling the
+ * counters on a global level.
+ */
+ perf_disable();
+
+ counter_sched_out(counter, cpuctx, ctx);
+
+ list_del_counter(counter, ctx);
+
+ if (!ctx->task) {
+ /*
+ * Allow more per task counters with respect to the
+ * reservation:
+ */
+ cpuctx->max_pertask =
+ min(perf_max_counters - ctx->nr_counters,
+ perf_max_counters - perf_reserved_percpu);
+ }
+
+ perf_enable();
+ spin_unlock(&ctx->lock);
+}
+
+
+/*
+ * Remove the counter from a task's (or a CPU's) list of counters.
+ *
+ * Must be called with ctx->mutex held.
+ *
+ * CPU counters are removed with a smp call. For task counters we only
+ * call when the task is on a CPU.
+ *
+ * If counter->ctx is a cloned context, callers must make sure that
+ * every task struct that counter->ctx->task could possibly point to
+ * remains valid. This is OK when called from perf_release since
+ * that only calls us on the top-level context, which can't be a clone.
+ * When called from perf_counter_exit_task, it's OK because the
+ * context has been detached from its task.
+ */
+static void perf_counter_remove_from_context(struct perf_counter *counter)
+{
+ struct perf_counter_context *ctx = counter->ctx;
+ struct task_struct *task = ctx->task;
+
+ if (!task) {
+ /*
+ * Per cpu counters are removed via an smp call and
+ * the removal is always sucessful.
+ */
+ smp_call_function_single(counter->cpu,
+ __perf_counter_remove_from_context,
+ counter, 1);
+ return;
+ }
+
+retry:
+ task_oncpu_function_call(task, __perf_counter_remove_from_context,
+ counter);
+
+ spin_lock_irq(&ctx->lock);
+ /*
+ * If the context is active we need to retry the smp call.
+ */
+ if (ctx->nr_active && !list_empty(&counter->list_entry)) {
+ spin_unlock_irq(&ctx->lock);
+ goto retry;
+ }
+
+ /*
+ * The lock prevents that this context is scheduled in so we
+ * can remove the counter safely, if the call above did not
+ * succeed.
+ */
+ if (!list_empty(&counter->list_entry)) {
+ list_del_counter(counter, ctx);
+ }
+ spin_unlock_irq(&ctx->lock);
+}
+
+static inline u64 perf_clock(void)
+{
+ return cpu_clock(smp_processor_id());
+}
+
+/*
+ * Update the record of the current time in a context.
+ */
+static void update_context_time(struct perf_counter_context *ctx)
+{
+ u64 now = perf_clock();
+
+ ctx->time += now - ctx->timestamp;
+ ctx->timestamp = now;
+}
+
+/*
+ * Update the total_time_enabled and total_time_running fields for a counter.
+ */
+static void update_counter_times(struct perf_counter *counter)
+{
+ struct perf_counter_context *ctx = counter->ctx;
+ u64 run_end;
+
+ if (counter->state < PERF_COUNTER_STATE_INACTIVE)
+ return;
+
+ counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
+
+ if (counter->state == PERF_COUNTER_STATE_INACTIVE)
+ run_end = counter->tstamp_stopped;
+ else
+ run_end = ctx->time;
+
+ counter->total_time_running = run_end - counter->tstamp_running;
+}
+
+/*
+ * Update total_time_enabled and total_time_running for all counters in a group.
+ */
+static void update_group_times(struct perf_counter *leader)
+{
+ struct perf_counter *counter;
+
+ update_counter_times(leader);
+ list_for_each_entry(counter, &leader->sibling_list, list_entry)
+ update_counter_times(counter);
+}
+
+/*
+ * Cross CPU call to disable a performance counter
+ */
+static void __perf_counter_disable(void *info)
+{
+ struct perf_counter *counter = info;
+ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+ struct perf_counter_context *ctx = counter->ctx;
+
+ /*
+ * If this is a per-task counter, need to check whether this
+ * counter's task is the current task on this cpu.
+ */
+ if (ctx->task && cpuctx->task_ctx != ctx)
+ return;
+
+ spin_lock(&ctx->lock);
+
+ /*
+ * If the counter is on, turn it off.
+ * If it is in error state, leave it in error state.
+ */
+ if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
+ update_context_time(ctx);
+ update_counter_times(counter);
+ if (counter == counter->group_leader)
+ group_sched_out(counter, cpuctx, ctx);
+ else
+ counter_sched_out(counter, cpuctx, ctx);
+ counter->state = PERF_COUNTER_STATE_OFF;
+ }
+
+ spin_unlock(&ctx->lock);
+}
+
+/*
+ * Disable a counter.
+ *
+ * If counter->ctx is a cloned context, callers must make sure that
+ * every task struct that counter->ctx->task could possibly point to
+ * remains valid. This condition is satisifed when called through
+ * perf_counter_for_each_child or perf_counter_for_each because they
+ * hold the top-level counter's child_mutex, so any descendant that
+ * goes to exit will block in sync_child_counter.
+ * When called from perf_pending_counter it's OK because counter->ctx
+ * is the current context on this CPU and preemption is disabled,
+ * hence we can't get into perf_counter_task_sched_out for this context.
+ */
+static void perf_counter_disable(struct perf_counter *counter)
+{
+ struct perf_counter_context *ctx = counter->ctx;
+ struct task_struct *task = ctx->task;
+
+ if (!task) {
+ /*
+ * Disable the counter on the cpu that it's on
+ */
+ smp_call_function_single(counter->cpu, __perf_counter_disable,
+ counter, 1);
+ return;
+ }
+
+ retry:
+ task_oncpu_function_call(task, __perf_counter_disable, counter);
+
+ spin_lock_irq(&ctx->lock);
+ /*
+ * If the counter is still active, we need to retry the cross-call.
+ */
+ if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
+ spin_unlock_irq(&ctx->lock);
+ goto retry;
+ }
+
+ /*
+ * Since we have the lock this context can't be scheduled
+ * in, so we can change the state safely.
+ */
+ if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
+ update_counter_times(counter);
+ counter->state = PERF_COUNTER_STATE_OFF;
+ }
+
+ spin_unlock_irq(&ctx->lock);
+}
+
+static int
+counter_sched_in(struct perf_counter *counter,
+ struct perf_cpu_context *cpuctx,
+ struct perf_counter_context *ctx,
+ int cpu)
+{
+ if (counter->state <= PERF_COUNTER_STATE_OFF)
+ return 0;
+
+ counter->state = PERF_COUNTER_STATE_ACTIVE;
+ counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
+ /*
+ * The new state must be visible before we turn it on in the hardware:
+ */
+ smp_wmb();
+
+ if (counter->pmu->enable(counter)) {
+ counter->state = PERF_COUNTER_STATE_INACTIVE;
+ counter->oncpu = -1;
+ return -EAGAIN;
+ }
+
+ counter->tstamp_running += ctx->time - counter->tstamp_stopped;
+
+ if (!is_software_counter(counter))
+ cpuctx->active_oncpu++;
+ ctx->nr_active++;
+
+ if (counter->attr.exclusive)
+ cpuctx->exclusive = 1;
+
+ return 0;
+}
+
+static int
+group_sched_in(struct perf_counter *group_counter,
+ struct perf_cpu_context *cpuctx,
+ struct perf_counter_context *ctx,
+ int cpu)
+{
+ struct perf_counter *counter, *partial_group;
+ int ret;
+
+ if (group_counter->state == PERF_COUNTER_STATE_OFF)
+ return 0;
+
+ ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
+ if (ret)
+ return ret < 0 ? ret : 0;
+
+ if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
+ return -EAGAIN;
+
+ /*
+ * Schedule in siblings as one group (if any):
+ */
+ list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
+ if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
+ partial_group = counter;
+ goto group_error;
+ }
+ }
+
+ return 0;
+
+group_error:
+ /*
+ * Groups can be scheduled in as one unit only, so undo any
+ * partial group before returning:
+ */
+ list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
+ if (counter == partial_group)
+ break;
+ counter_sched_out(counter, cpuctx, ctx);
+ }
+ counter_sched_out(group_counter, cpuctx, ctx);
+
+ return -EAGAIN;
+}
+
+/*
+ * Return 1 for a group consisting entirely of software counters,
+ * 0 if the group contains any hardware counters.
+ */
+static int is_software_only_group(struct perf_counter *leader)
+{
+ struct perf_counter *counter;
+
+ if (!is_software_counter(leader))
+ return 0;
+
+ list_for_each_entry(counter, &leader->sibling_list, list_entry)
+ if (!is_software_counter(counter))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * Work out whether we can put this counter group on the CPU now.
+ */
+static int group_can_go_on(struct perf_counter *counter,
+ struct perf_cpu_context *cpuctx,
+ int can_add_hw)
+{
+ /*
+ * Groups consisting entirely of software counters can always go on.
+ */
+ if (is_software_only_group(counter))
+ return 1;
+ /*
+ * If an exclusive group is already on, no other hardware
+ * counters can go on.
+ */
+ if (cpuctx->exclusive)
+ return 0;
+ /*
+ * If this group is exclusive and there are already
+ * counters on the CPU, it can't go on.
+ */
+ if (counter->attr.exclusive && cpuctx->active_oncpu)
+ return 0;
+ /*
+ * Otherwise, try to add it if all previous groups were able
+ * to go on.
+ */
+ return can_add_hw;
+}
+
+static void add_counter_to_ctx(struct perf_counter *counter,
+ struct perf_counter_context *ctx)
+{
+ list_add_counter(counter, ctx);
+ counter->tstamp_enabled = ctx->time;
+ counter->tstamp_running = ctx->time;
+ counter->tstamp_stopped = ctx->time;
+}
+
+/*
+ * Cross CPU call to install and enable a performance counter
+ *
+ * Must be called with ctx->mutex held
+ */
+static void __perf_install_in_context(void *info)
+{
+ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+ struct perf_counter *counter = info;
+ struct perf_counter_context *ctx = counter->ctx;
+ struct perf_counter *leader = counter->group_leader;
+ int cpu = smp_processor_id();
+ int err;
+
+ /*
+ * If this is a task context, we need to check whether it is
+ * the current task context of this cpu. If not it has been
+ * scheduled out before the smp call arrived.
+ * Or possibly this is the right context but it isn't
+ * on this cpu because it had no counters.
+ */
+ if (ctx->task && cpuctx->task_ctx != ctx) {
+ if (cpuctx->task_ctx || ctx->task != current)
+ return;
+ cpuctx->task_ctx = ctx;
+ }
+
+ spin_lock(&ctx->lock);
+ ctx->is_active = 1;
+ update_context_time(ctx);
+
+ /*
+ * Protect the list operation against NMI by disabling the
+ * counters on a global level. NOP for non NMI based counters.
+ */
+ perf_disable();
+
+ add_counter_to_ctx(counter, ctx);
+
+ /*
+ * Don't put the counter on if it is disabled or if
+ * it is in a group and the group isn't on.
+ */
+ if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
+ (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
+ goto unlock;
+
+ /*
+ * An exclusive counter can't go on if there are already active
+ * hardware counters, and no hardware counter can go on if there
+ * is already an exclusive counter on.
+ */
+ if (!group_can_go_on(counter, cpuctx, 1))
+ err = -EEXIST;
+ else
+ err = counter_sched_in(counter, cpuctx, ctx, cpu);
+
+ if (err) {
+ /*
+ * This counter couldn't go on. If it is in a group
+ * then we have to pull the whole group off.
+ * If the counter group is pinned then put it in error state.
+ */
+ if (leader != counter)
+ group_sched_out(leader, cpuctx, ctx);
+ if (leader->attr.pinned) {
+ update_group_times(leader);
+ leader->state = PERF_COUNTER_STATE_ERROR;
+ }
+ }
+
+ if (!err && !ctx->task && cpuctx->max_pertask)
+ cpuctx->max_pertask--;
+
+ unlock:
+ perf_enable();
+
+ spin_unlock(&ctx->lock);
+}
+
+/*
+ * Attach a performance counter to a context
+ *
+ * First we add the counter to the list with the hardware enable bit
+ * in counter->hw_config cleared.
+ *
+ * If the counter is attached to a task which is on a CPU we use a smp
+ * call to enable it in the task context. The task might have been
+ * scheduled away, but we check this in the smp call again.
+ *
+ * Must be called with ctx->mutex held.
+ */
+static void
+perf_install_in_context(struct perf_counter_context *ctx,
+ struct perf_counter *counter,
+ int cpu)
+{
+ struct task_struct *task = ctx->task;
+
+ if (!task) {
+ /*
+ * Per cpu counters are installed via an smp call and
+ * the install is always sucessful.
+ */
+ smp_call_function_single(cpu, __perf_install_in_context,
+ counter, 1);
+ return;
+ }
+
+retry:
+ task_oncpu_function_call(task, __perf_install_in_context,
+ counter);
+
+ spin_lock_irq(&ctx->lock);
+ /*
+ * we need to retry the smp call.
+ */
+ if (ctx->is_active && list_empty(&counter->list_entry)) {
+ spin_unlock_irq(&ctx->lock);
+ goto retry;
+ }
+
+ /*
+ * The lock prevents that this context is scheduled in so we
+ * can add the counter safely, if it the call above did not
+ * succeed.
+ */
+ if (list_empty(&counter->list_entry))
+ add_counter_to_ctx(counter, ctx);
+ spin_unlock_irq(&ctx->lock);
+}
+
+/*
+ * Cross CPU call to enable a performance counter
+ */
+static void __perf_counter_enable(void *info)
+{
+ struct perf_counter *counter = info;
+ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+ struct perf_counter_context *ctx = counter->ctx;
+ struct perf_counter *leader = counter->group_leader;
+ int err;
+
+ /*
+ * If this is a per-task counter, need to check whether this
+ * counter's task is the current task on this cpu.
+ */
+ if (ctx->task && cpuctx->task_ctx != ctx) {
+ if (cpuctx->task_ctx || ctx->task != current)
+ return;
+ cpuctx->task_ctx = ctx;
+ }
+
+ spin_lock(&ctx->lock);
+ ctx->is_active = 1;
+ update_context_time(ctx);
+
+ if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
+ goto unlock;
+ counter->state = PERF_COUNTER_STATE_INACTIVE;
+ counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
+
+ /*
+ * If the counter is in a group and isn't the group leader,
+ * then don't put it on unless the group is on.
+ */
+ if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
+ goto unlock;
+
+ if (!group_can_go_on(counter, cpuctx, 1)) {
+ err = -EEXIST;
+ } else {
+ perf_disable();
+ if (counter == leader)
+ err = group_sched_in(counter, cpuctx, ctx,
+ smp_processor_id());
+ else
+ err = counter_sched_in(counter, cpuctx, ctx,
+ smp_processor_id());
+ perf_enable();
+ }
+
+ if (err) {
+ /*
+ * If this counter can't go on and it's part of a
+ * group, then the whole group has to come off.
+ */
+ if (leader != counter)
+ group_sched_out(leader, cpuctx, ctx);
+ if (leader->attr.pinned) {
+ update_group_times(leader);
+ leader->state = PERF_COUNTER_STATE_ERROR;
+ }
+ }
+
+ unlock:
+ spin_unlock(&ctx->lock);
+}
+
+/*
+ * Enable a counter.
+ *
+ * If counter->ctx is a cloned context, callers must make sure that
+ * every task struct that counter->ctx->task could possibly point to
+ * remains valid. This condition is satisfied when called through
+ * perf_counter_for_each_child or perf_counter_for_each as described
+ * for perf_counter_disable.
+ */
+static void perf_counter_enable(struct perf_counter *counter)
+{
+ struct perf_counter_context *ctx = counter->ctx;
+ struct task_struct *task = ctx->task;
+
+ if (!task) {
+ /*
+ * Enable the counter on the cpu that it's on
+ */
+ smp_call_function_single(counter->cpu, __perf_counter_enable,
+ counter, 1);
+ return;
+ }
+
+ spin_lock_irq(&ctx->lock);
+ if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
+ goto out;
+
+ /*
+ * If the counter is in error state, clear that first.
+ * That way, if we see the counter in error state below, we
+ * know that it has gone back into error state, as distinct
+ * from the task having been scheduled away before the
+ * cross-call arrived.
+ */
+ if (counter->state == PERF_COUNTER_STATE_ERROR)
+ counter->state = PERF_COUNTER_STATE_OFF;
+
+ retry:
+ spin_unlock_irq(&ctx->lock);
+ task_oncpu_function_call(task, __perf_counter_enable, counter);
+
+ spin_lock_irq(&ctx->lock);
+
+ /*
+ * If the context is active and the counter is still off,
+ * we need to retry the cross-call.
+ */
+ if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
+ goto retry;
+
+ /*
+ * Since we have the lock this context can't be scheduled
+ * in, so we can change the state safely.
+ */
+ if (counter->state == PERF_COUNTER_STATE_OFF) {
+ counter->state = PERF_COUNTER_STATE_INACTIVE;
+ counter->tstamp_enabled =
+ ctx->time - counter->total_time_enabled;
+ }
+ out:
+ spin_unlock_irq(&ctx->lock);
+}
+
+static int perf_counter_refresh(struct perf_counter *counter, int refresh)
+{
+ /*
+ * not supported on inherited counters
+ */
+ if (counter->attr.inherit)
+ return -EINVAL;
+
+ atomic_add(refresh, &counter->event_limit);
+ perf_counter_enable(counter);
+
+ return 0;
+}
+
+void __perf_counter_sched_out(struct perf_counter_context *ctx,
+ struct perf_cpu_context *cpuctx)
+{
+ struct perf_counter *counter;
+
+ spin_lock(&ctx->lock);
+ ctx->is_active = 0;
+ if (likely(!ctx->nr_counters))
+ goto out;
+ update_context_time(ctx);
+
+ perf_disable();
+ if (ctx->nr_active) {
+ list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+ if (counter != counter->group_leader)
+ counter_sched_out(counter, cpuctx, ctx);
+ else
+ group_sched_out(counter, cpuctx, ctx);
+ }
+ }
+ perf_enable();
+ out:
+ spin_unlock(&ctx->lock);
+}
+
+/*
+ * Test whether two contexts are equivalent, i.e. whether they
+ * have both been cloned from the same version of the same context
+ * and they both have the same number of enabled counters.
+ * If the number of enabled counters is the same, then the set
+ * of enabled counters should be the same, because these are both
+ * inherited contexts, therefore we can't access individual counters
+ * in them directly with an fd; we can only enable/disable all
+ * counters via prctl, or enable/disable all counters in a family
+ * via ioctl, which will have the same effect on both contexts.
+ */
+static int context_equiv(struct perf_counter_context *ctx1,
+ struct perf_counter_context *ctx2)
+{
+ return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
+ && ctx1->parent_gen == ctx2->parent_gen
+ && !ctx1->pin_count && !ctx2->pin_count;
+}
+
+static void __perf_counter_read(void *counter);
+
+static void __perf_counter_sync_stat(struct perf_counter *counter,
+ struct perf_counter *next_counter)
+{
+ u64 value;
+
+ if (!counter->attr.inherit_stat)
+ return;
+
+ /*
+ * Update the counter value, we cannot use perf_counter_read()
+ * because we're in the middle of a context switch and have IRQs
+ * disabled, which upsets smp_call_function_single(), however
+ * we know the counter must be on the current CPU, therefore we
+ * don't need to use it.
+ */
+ switch (counter->state) {
+ case PERF_COUNTER_STATE_ACTIVE:
+ __perf_counter_read(counter);
+ break;
+
+ case PERF_COUNTER_STATE_INACTIVE:
+ update_counter_times(counter);
+ break;
+
+ default:
+ break;
+ }
+
+ /*
+ * In order to keep per-task stats reliable we need to flip the counter
+ * values when we flip the contexts.
+ */
+ value = atomic64_read(&next_counter->count);
+ value = atomic64_xchg(&counter->count, value);
+ atomic64_set(&next_counter->count, value);
+
+ swap(counter->total_time_enabled, next_counter->total_time_enabled);
+ swap(counter->total_time_running, next_counter->total_time_running);
+
+ /*
+ * Since we swizzled the values, update the user visible data too.
+ */
+ perf_counter_update_userpage(counter);
+ perf_counter_update_userpage(next_counter);
+}
+
+#define list_next_entry(pos, member) \
+ list_entry(pos->member.next, typeof(*pos), member)
+
+static void perf_counter_sync_stat(struct perf_counter_context *ctx,
+ struct perf_counter_context *next_ctx)
+{
+ struct perf_counter *counter, *next_counter;
+
+ if (!ctx->nr_stat)
+ return;
+
+ counter = list_first_entry(&ctx->event_list,
+ struct perf_counter, event_entry);
+
+ next_counter = list_first_entry(&next_ctx->event_list,
+ struct perf_counter, event_entry);
+
+ while (&counter->event_entry != &ctx->event_list &&
+ &next_counter->event_entry != &next_ctx->event_list) {
+
+ __perf_counter_sync_stat(counter, next_counter);
+
+ counter = list_next_entry(counter, event_entry);
+ next_counter = list_next_entry(next_counter, event_entry);
+ }
+}
+
+/*
+ * Called from scheduler to remove the counters of the current task,
+ * with interrupts disabled.
+ *
+ * We stop each counter and update the counter value in counter->count.
+ *
+ * This does not protect us against NMI, but disable()
+ * sets the disabled bit in the control field of counter _before_
+ * accessing the counter control register. If a NMI hits, then it will
+ * not restart the counter.
+ */
+void perf_counter_task_sched_out(struct task_struct *task,
+ struct task_struct *next, int cpu)
+{
+ struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+ struct perf_counter_context *ctx = task->perf_counter_ctxp;
+ struct perf_counter_context *next_ctx;
+ struct perf_counter_context *parent;
+ struct pt_regs *regs;
+ int do_switch = 1;
+
+ regs = task_pt_regs(task);
+ perf_swcounter_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
+
+ if (likely(!ctx || !cpuctx->task_ctx))
+ return;
+
+ update_context_time(ctx);
+
+ rcu_read_lock();
+ parent = rcu_dereference(ctx->parent_ctx);
+ next_ctx = next->perf_counter_ctxp;
+ if (parent && next_ctx &&
+ rcu_dereference(next_ctx->parent_ctx) == parent) {
+ /*
+ * Looks like the two contexts are clones, so we might be
+ * able to optimize the context switch. We lock both
+ * contexts and check that they are clones under the
+ * lock (including re-checking that neither has been
+ * uncloned in the meantime). It doesn't matter which
+ * order we take the locks because no other cpu could
+ * be trying to lock both of these tasks.
+ */
+ spin_lock(&ctx->lock);
+ spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
+ if (context_equiv(ctx, next_ctx)) {
+ /*
+ * XXX do we need a memory barrier of sorts
+ * wrt to rcu_dereference() of perf_counter_ctxp
+ */
+ task->perf_counter_ctxp = next_ctx;
+ next->perf_counter_ctxp = ctx;
+ ctx->task = next;
+ next_ctx->task = task;
+ do_switch = 0;
+
+ perf_counter_sync_stat(ctx, next_ctx);
+ }
+ spin_unlock(&next_ctx->lock);
+ spin_unlock(&ctx->lock);
+ }
+ rcu_read_unlock();
+
+ if (do_switch) {
+ __perf_counter_sched_out(ctx, cpuctx);
+ cpuctx->task_ctx = NULL;
+ }
+}
+
+/*
+ * Called with IRQs disabled
+ */
+static void __perf_counter_task_sched_out(struct perf_counter_context *ctx)
+{
+ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+
+ if (!cpuctx->task_ctx)
+ return;
+
+ if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
+ return;
+
+ __perf_counter_sched_out(ctx, cpuctx);
+ cpuctx->task_ctx = NULL;
+}
+
+/*
+ * Called with IRQs disabled
+ */
+static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
+{
+ __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
+}
+
+static void
+__perf_counter_sched_in(struct perf_counter_context *ctx,
+ struct perf_cpu_context *cpuctx, int cpu)
+{
+ struct perf_counter *counter;
+ int can_add_hw = 1;
+
+ spin_lock(&ctx->lock);
+ ctx->is_active = 1;
+ if (likely(!ctx->nr_counters))
+ goto out;
+
+ ctx->timestamp = perf_clock();
+
+ perf_disable();
+
+ /*
+ * First go through the list and put on any pinned groups
+ * in order to give them the best chance of going on.
+ */
+ list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+ if (counter->state <= PERF_COUNTER_STATE_OFF ||
+ !counter->attr.pinned)
+ continue;
+ if (counter->cpu != -1 && counter->cpu != cpu)
+ continue;
+
+ if (counter != counter->group_leader)
+ counter_sched_in(counter, cpuctx, ctx, cpu);
+ else {
+ if (group_can_go_on(counter, cpuctx, 1))
+ group_sched_in(counter, cpuctx, ctx, cpu);
+ }
+
+ /*
+ * If this pinned group hasn't been scheduled,
+ * put it in error state.
+ */
+ if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
+ update_group_times(counter);
+ counter->state = PERF_COUNTER_STATE_ERROR;
+ }
+ }
+
+ list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+ /*
+ * Ignore counters in OFF or ERROR state, and
+ * ignore pinned counters since we did them already.
+ */
+ if (counter->state <= PERF_COUNTER_STATE_OFF ||
+ counter->attr.pinned)
+ continue;
+
+ /*
+ * Listen to the 'cpu' scheduling filter constraint
+ * of counters:
+ */
+ if (counter->cpu != -1 && counter->cpu != cpu)
+ continue;
+
+ if (counter != counter->group_leader) {
+ if (counter_sched_in(counter, cpuctx, ctx, cpu))
+ can_add_hw = 0;
+ } else {
+ if (group_can_go_on(counter, cpuctx, can_add_hw)) {
+ if (group_sched_in(counter, cpuctx, ctx, cpu))
+ can_add_hw = 0;
+ }
+ }
+ }
+ perf_enable();
+ out:
+ spin_unlock(&ctx->lock);
+}
+
+/*
+ * Called from scheduler to add the counters of the current task
+ * with interrupts disabled.
+ *
+ * We restore the counter value and then enable it.
+ *
+ * This does not protect us against NMI, but enable()
+ * sets the enabled bit in the control field of counter _before_
+ * accessing the counter control register. If a NMI hits, then it will
+ * keep the counter running.
+ */
+void perf_counter_task_sched_in(struct task_struct *task, int cpu)
+{
+ struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+ struct perf_counter_context *ctx = task->perf_counter_ctxp;
+
+ if (likely(!ctx))
+ return;
+ if (cpuctx->task_ctx == ctx)
+ return;
+ __perf_counter_sched_in(ctx, cpuctx, cpu);
+ cpuctx->task_ctx = ctx;
+}
+
+static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
+{
+ struct perf_counter_context *ctx = &cpuctx->ctx;
+
+ __perf_counter_sched_in(ctx, cpuctx, cpu);
+}
+
+#define MAX_INTERRUPTS (~0ULL)
+
+static void perf_log_throttle(struct perf_counter *counter, int enable);
+
+static void perf_adjust_period(struct perf_counter *counter, u64 events)
+{
+ struct hw_perf_counter *hwc = &counter->hw;
+ u64 period, sample_period;
+ s64 delta;
+
+ events *= hwc->sample_period;
+ period = div64_u64(events, counter->attr.sample_freq);
+
+ delta = (s64)(period - hwc->sample_period);
+ delta = (delta + 7) / 8; /* low pass filter */
+
+ sample_period = hwc->sample_period + delta;
+
+ if (!sample_period)
+ sample_period = 1;
+
+ hwc->sample_period = sample_period;
+}
+
+static void perf_ctx_adjust_freq(struct perf_counter_context *ctx)
+{
+ struct perf_counter *counter;
+ struct hw_perf_counter *hwc;
+ u64 interrupts, freq;
+
+ spin_lock(&ctx->lock);
+ list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+ if (counter->state != PERF_COUNTER_STATE_ACTIVE)
+ continue;
+
+ hwc = &counter->hw;
+
+ interrupts = hwc->interrupts;
+ hwc->interrupts = 0;
+
+ /*
+ * unthrottle counters on the tick
+ */
+ if (interrupts == MAX_INTERRUPTS) {
+ perf_log_throttle(counter, 1);
+ counter->pmu->unthrottle(counter);
+ interrupts = 2*sysctl_perf_counter_sample_rate/HZ;
+ }
+
+ if (!counter->attr.freq || !counter->attr.sample_freq)
+ continue;
+
+ /*
+ * if the specified freq < HZ then we need to skip ticks
+ */
+ if (counter->attr.sample_freq < HZ) {
+ freq = counter->attr.sample_freq;
+
+ hwc->freq_count += freq;
+ hwc->freq_interrupts += interrupts;
+
+ if (hwc->freq_count < HZ)
+ continue;
+
+ interrupts = hwc->freq_interrupts;
+ hwc->freq_interrupts = 0;
+ hwc->freq_count -= HZ;
+ } else
+ freq = HZ;
+
+ perf_adjust_period(counter, freq * interrupts);
+
+ /*
+ * In order to avoid being stalled by an (accidental) huge
+ * sample period, force reset the sample period if we didn't
+ * get any events in this freq period.
+ */
+ if (!interrupts) {
+ perf_disable();
+ counter->pmu->disable(counter);
+ atomic64_set(&hwc->period_left, 0);
+ counter->pmu->enable(counter);
+ perf_enable();
+ }
+ }
+ spin_unlock(&ctx->lock);
+}
+
+/*
+ * Round-robin a context's counters:
+ */
+static void rotate_ctx(struct perf_counter_context *ctx)
+{
+ struct perf_counter *counter;
+
+ if (!ctx->nr_counters)
+ return;
+
+ spin_lock(&ctx->lock);
+ /*
+ * Rotate the first entry last (works just fine for group counters too):
+ */
+ perf_disable();
+ list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+ list_move_tail(&counter->list_entry, &ctx->counter_list);
+ break;
+ }
+ perf_enable();
+
+ spin_unlock(&ctx->lock);
+}
+
+void perf_counter_task_tick(struct task_struct *curr, int cpu)
+{
+ struct perf_cpu_context *cpuctx;
+ struct perf_counter_context *ctx;
+
+ if (!atomic_read(&nr_counters))
+ return;
+
+ cpuctx = &per_cpu(perf_cpu_context, cpu);
+ ctx = curr->perf_counter_ctxp;
+
+ perf_ctx_adjust_freq(&cpuctx->ctx);
+ if (ctx)
+ perf_ctx_adjust_freq(ctx);
+
+ perf_counter_cpu_sched_out(cpuctx);
+ if (ctx)
+ __perf_counter_task_sched_out(ctx);
+
+ rotate_ctx(&cpuctx->ctx);
+ if (ctx)
+ rotate_ctx(ctx);
+
+ perf_counter_cpu_sched_in(cpuctx, cpu);
+ if (ctx)
+ perf_counter_task_sched_in(curr, cpu);
+}
+
+/*
+ * Enable all of a task's counters that have been marked enable-on-exec.
+ * This expects task == current.
+ */
+static void perf_counter_enable_on_exec(struct task_struct *task)
+{
+ struct perf_counter_context *ctx;
+ struct perf_counter *counter;
+ unsigned long flags;
+ int enabled = 0;
+
+ local_irq_save(flags);
+ ctx = task->perf_counter_ctxp;
+ if (!ctx || !ctx->nr_counters)
+ goto out;
+
+ __perf_counter_task_sched_out(ctx);
+
+ spin_lock(&ctx->lock);
+
+ list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+ if (!counter->attr.enable_on_exec)
+ continue;
+ counter->attr.enable_on_exec = 0;
+ if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
+ continue;
+ counter->state = PERF_COUNTER_STATE_INACTIVE;
+ counter->tstamp_enabled =
+ ctx->time - counter->total_time_enabled;
+ enabled = 1;
+ }
+
+ /*
+ * Unclone this context if we enabled any counter.
+ */
+ if (enabled)
+ unclone_ctx(ctx);
+
+ spin_unlock(&ctx->lock);
+
+ perf_counter_task_sched_in(task, smp_processor_id());
+ out:
+ local_irq_restore(flags);
+}
+
+/*
+ * Cross CPU call to read the hardware counter
+ */
+static void __perf_counter_read(void *info)
+{
+ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+ struct perf_counter *counter = info;
+ struct perf_counter_context *ctx = counter->ctx;
+ unsigned long flags;
+
+ /*
+ * If this is a task context, we need to check whether it is
+ * the current task context of this cpu. If not it has been
+ * scheduled out before the smp call arrived. In that case
+ * counter->count would have been updated to a recent sample
+ * when the counter was scheduled out.
+ */
+ if (ctx->task && cpuctx->task_ctx != ctx)
+ return;
+
+ local_irq_save(flags);
+ if (ctx->is_active)
+ update_context_time(ctx);
+ counter->pmu->read(counter);
+ update_counter_times(counter);
+ local_irq_restore(flags);
+}
+
+static u64 perf_counter_read(struct perf_counter *counter)
+{
+ /*
+ * If counter is enabled and currently active on a CPU, update the
+ * value in the counter structure:
+ */
+ if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
+ smp_call_function_single(counter->oncpu,
+ __perf_counter_read, counter, 1);
+ } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
+ update_counter_times(counter);
+ }
+
+ return atomic64_read(&counter->count);
+}
+
+/*
+ * Initialize the perf_counter context in a task_struct:
+ */
+static void
+__perf_counter_init_context(struct perf_counter_context *ctx,
+ struct task_struct *task)
+{
+ memset(ctx, 0, sizeof(*ctx));
+ spin_lock_init(&ctx->lock);
+ mutex_init(&ctx->mutex);
+ INIT_LIST_HEAD(&ctx->counter_list);
+ INIT_LIST_HEAD(&ctx->event_list);
+ atomic_set(&ctx->refcount, 1);
+ ctx->task = task;
+}
+
+static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
+{
+ struct perf_counter_context *ctx;
+ struct perf_cpu_context *cpuctx;
+ struct task_struct *task;
+ unsigned long flags;
+ int err;
+
+ /*
+ * If cpu is not a wildcard then this is a percpu counter:
+ */
+ if (cpu != -1) {
+ /* Must be root to operate on a CPU counter: */
+ if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EACCES);
+
+ if (cpu < 0 || cpu > num_possible_cpus())
+ return ERR_PTR(-EINVAL);
+
+ /*
+ * We could be clever and allow to attach a counter to an
+ * offline CPU and activate it when the CPU comes up, but
+ * that's for later.
+ */
+ if (!cpu_isset(cpu, cpu_online_map))
+ return ERR_PTR(-ENODEV);
+
+ cpuctx = &per_cpu(perf_cpu_context, cpu);
+ ctx = &cpuctx->ctx;
+ get_ctx(ctx);
+
+ return ctx;
+ }
+
+ rcu_read_lock();
+ if (!pid)
+ task = current;
+ else
+ task = find_task_by_vpid(pid);
+ if (task)
+ get_task_struct(task);
+ rcu_read_unlock();
+
+ if (!task)
+ return ERR_PTR(-ESRCH);
+
+ /*
+ * Can't attach counters to a dying task.
+ */
+ err = -ESRCH;
+ if (task->flags & PF_EXITING)
+ goto errout;
+
+ /* Reuse ptrace permission checks for now. */
+ err = -EACCES;
+ if (!ptrace_may_access(task, PTRACE_MODE_READ))
+ goto errout;
+
+ retry:
+ ctx = perf_lock_task_context(task, &flags);
+ if (ctx) {
+ unclone_ctx(ctx);
+ spin_unlock_irqrestore(&ctx->lock, flags);
+ }
+
+ if (!ctx) {
+ ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
+ err = -ENOMEM;
+ if (!ctx)
+ goto errout;
+ __perf_counter_init_context(ctx, task);
+ get_ctx(ctx);
+ if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) {
+ /*
+ * We raced with some other task; use
+ * the context they set.
+ */
+ kfree(ctx);
+ goto retry;
+ }
+ get_task_struct(task);
+ }
+
+ put_task_struct(task);
+ return ctx;
+
+ errout:
+ put_task_struct(task);
+ return ERR_PTR(err);
+}
+
+static void free_counter_rcu(struct rcu_head *head)
+{
+ struct perf_counter *counter;
+
+ counter = container_of(head, struct perf_counter, rcu_head);
+ if (counter->ns)
+ put_pid_ns(counter->ns);
+ kfree(counter);
+}
+
+static void perf_pending_sync(struct perf_counter *counter);
+
+static void free_counter(struct perf_counter *counter)
+{
+ perf_pending_sync(counter);
+
+ if (!counter->parent) {
+ atomic_dec(&nr_counters);
+ if (counter->attr.mmap)
+ atomic_dec(&nr_mmap_counters);
+ if (counter->attr.comm)
+ atomic_dec(&nr_comm_counters);
+ if (counter->attr.task)
+ atomic_dec(&nr_task_counters);
+ }
+
+ if (counter->destroy)
+ counter->destroy(counter);
+
+ put_ctx(counter->ctx);
+ call_rcu(&counter->rcu_head, free_counter_rcu);
+}
+
+/*
+ * Called when the last reference to the file is gone.
+ */
+static int perf_release(struct inode *inode, struct file *file)
+{
+ struct perf_counter *counter = file->private_data;
+ struct perf_counter_context *ctx = counter->ctx;
+
+ file->private_data = NULL;
+
+ WARN_ON_ONCE(ctx->parent_ctx);
+ mutex_lock(&ctx->mutex);
+ perf_counter_remove_from_context(counter);
+ mutex_unlock(&ctx->mutex);
+
+ mutex_lock(&counter->owner->perf_counter_mutex);
+ list_del_init(&counter->owner_entry);
+ mutex_unlock(&counter->owner->perf_counter_mutex);
+ put_task_struct(counter->owner);
+
+ free_counter(counter);
+
+ return 0;
+}
+
+static int perf_counter_read_size(struct perf_counter *counter)
+{
+ int entry = sizeof(u64); /* value */
+ int size = 0;
+ int nr = 1;
+
+ if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+ size += sizeof(u64);
+
+ if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+ size += sizeof(u64);
+
+ if (counter->attr.read_format & PERF_FORMAT_ID)
+ entry += sizeof(u64);
+
+ if (counter->attr.read_format & PERF_FORMAT_GROUP) {
+ nr += counter->group_leader->nr_siblings;
+ size += sizeof(u64);
+ }
+
+ size += entry * nr;
+
+ return size;
+}
+
+static u64 perf_counter_read_value(struct perf_counter *counter)
+{
+ struct perf_counter *child;
+ u64 total = 0;
+
+ total += perf_counter_read(counter);
+ list_for_each_entry(child, &counter->child_list, child_list)
+ total += perf_counter_read(child);
+
+ return total;
+}
+
+static int perf_counter_read_entry(struct perf_counter *counter,
+ u64 read_format, char __user *buf)
+{
+ int n = 0, count = 0;
+ u64 values[2];
+
+ values[n++] = perf_counter_read_value(counter);
+ if (read_format & PERF_FORMAT_ID)
+ values[n++] = primary_counter_id(counter);
+
+ count = n * sizeof(u64);
+
+ if (copy_to_user(buf, values, count))
+ return -EFAULT;
+
+ return count;
+}
+
+static int perf_counter_read_group(struct perf_counter *counter,
+ u64 read_format, char __user *buf)
+{
+ struct perf_counter *leader = counter->group_leader, *sub;
+ int n = 0, size = 0, err = -EFAULT;
+ u64 values[3];
+
+ values[n++] = 1 + leader->nr_siblings;
+ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+ values[n++] = leader->total_time_enabled +
+ atomic64_read(&leader->child_total_time_enabled);
+ }
+ if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+ values[n++] = leader->total_time_running +
+ atomic64_read(&leader->child_total_time_running);
+ }
+
+ size = n * sizeof(u64);
+
+ if (copy_to_user(buf, values, size))
+ return -EFAULT;
+
+ err = perf_counter_read_entry(leader, read_format, buf + size);
+ if (err < 0)
+ return err;
+
+ size += err;
+
+ list_for_each_entry(sub, &leader->sibling_list, list_entry) {
+ err = perf_counter_read_entry(counter, read_format,
+ buf + size);
+ if (err < 0)
+ return err;
+
+ size += err;
+ }
+
+ return size;
+}
+
+static int perf_counter_read_one(struct perf_counter *counter,
+ u64 read_format, char __user *buf)
+{
+ u64 values[4];
+ int n = 0;
+
+ values[n++] = perf_counter_read_value(counter);
+ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+ values[n++] = counter->total_time_enabled +
+ atomic64_read(&counter->child_total_time_enabled);
+ }
+ if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+ values[n++] = counter->total_time_running +
+ atomic64_read(&counter->child_total_time_running);
+ }
+ if (read_format & PERF_FORMAT_ID)
+ values[n++] = primary_counter_id(counter);
+
+ if (copy_to_user(buf, values, n * sizeof(u64)))
+ return -EFAULT;
+
+ return n * sizeof(u64);
+}
+
+/*
+ * Read the performance counter - simple non blocking version for now
+ */
+static ssize_t
+perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
+{
+ u64 read_format = counter->attr.read_format;
+ int ret;
+
+ /*
+ * Return end-of-file for a read on a counter that is in
+ * error state (i.e. because it was pinned but it couldn't be
+ * scheduled on to the CPU at some point).
+ */
+ if (counter->state == PERF_COUNTER_STATE_ERROR)
+ return 0;
+
+ if (count < perf_counter_read_size(counter))
+ return -ENOSPC;
+
+ WARN_ON_ONCE(counter->ctx->parent_ctx);
+ mutex_lock(&counter->child_mutex);
+ if (read_format & PERF_FORMAT_GROUP)
+ ret = perf_counter_read_group(counter, read_format, buf);
+ else
+ ret = perf_counter_read_one(counter, read_format, buf);
+ mutex_unlock(&counter->child_mutex);
+
+ return ret;
+}
+
+static ssize_t
+perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+ struct perf_counter *counter = file->private_data;
+
+ return perf_read_hw(counter, buf, count);
+}
+
+static unsigned int perf_poll(struct file *file, poll_table *wait)
+{
+ struct perf_counter *counter = file->private_data;
+ struct perf_mmap_data *data;
+ unsigned int events = POLL_HUP;
+
+ rcu_read_lock();
+ data = rcu_dereference(counter->data);
+ if (data)
+ events = atomic_xchg(&data->poll, 0);
+ rcu_read_unlock();
+
+ poll_wait(file, &counter->waitq, wait);
+
+ return events;
+}
+
+static void perf_counter_reset(struct perf_counter *counter)
+{
+ (void)perf_counter_read(counter);
+ atomic64_set(&counter->count, 0);
+ perf_counter_update_userpage(counter);
+}
+
+/*
+ * Holding the top-level counter's child_mutex means that any
+ * descendant process that has inherited this counter will block
+ * in sync_child_counter if it goes to exit, thus satisfying the
+ * task existence requirements of perf_counter_enable/disable.
+ */
+static void perf_counter_for_each_child(struct perf_counter *counter,
+ void (*func)(struct perf_counter *))
+{
+ struct perf_counter *child;
+
+ WARN_ON_ONCE(counter->ctx->parent_ctx);
+ mutex_lock(&counter->child_mutex);
+ func(counter);
+ list_for_each_entry(child, &counter->child_list, child_list)
+ func(child);
+ mutex_unlock(&counter->child_mutex);
+}
+
+static void perf_counter_for_each(struct perf_counter *counter,
+ void (*func)(struct perf_counter *))
+{
+ struct perf_counter_context *ctx = counter->ctx;
+ struct perf_counter *sibling;
+
+ WARN_ON_ONCE(ctx->parent_ctx);
+ mutex_lock(&ctx->mutex);
+ counter = counter->group_leader;
+
+ perf_counter_for_each_child(counter, func);
+ func(counter);
+ list_for_each_entry(sibling, &counter->sibling_list, list_entry)
+ perf_counter_for_each_child(counter, func);
+ mutex_unlock(&ctx->mutex);
+}
+
+static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
+{
+ struct perf_counter_context *ctx = counter->ctx;
+ unsigned long size;
+ int ret = 0;
+ u64 value;
+
+ if (!counter->attr.sample_period)
+ return -EINVAL;
+
+ size = copy_from_user(&value, arg, sizeof(value));
+ if (size != sizeof(value))
+ return -EFAULT;
+
+ if (!value)
+ return -EINVAL;
+
+ spin_lock_irq(&ctx->lock);
+ if (counter->attr.freq) {
+ if (value > sysctl_perf_counter_sample_rate) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ counter->attr.sample_freq = value;
+ } else {
+ counter->attr.sample_period = value;
+ counter->hw.sample_period = value;
+ }
+unlock:
+ spin_unlock_irq(&ctx->lock);
+
+ return ret;
+}
+
+static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ struct perf_counter *counter = file->private_data;
+ void (*func)(struct perf_counter *);
+ u32 flags = arg;
+
+ switch (cmd) {
+ case PERF_COUNTER_IOC_ENABLE:
+ func = perf_counter_enable;
+ break;
+ case PERF_COUNTER_IOC_DISABLE:
+ func = perf_counter_disable;
+ break;
+ case PERF_COUNTER_IOC_RESET:
+ func = perf_counter_reset;
+ break;
+
+ case PERF_COUNTER_IOC_REFRESH:
+ return perf_counter_refresh(counter, arg);
+
+ case PERF_COUNTER_IOC_PERIOD:
+ return perf_counter_period(counter, (u64 __user *)arg);
+
+ default:
+ return -ENOTTY;
+ }
+
+ if (flags & PERF_IOC_FLAG_GROUP)
+ perf_counter_for_each(counter, func);
+ else
+ perf_counter_for_each_child(counter, func);
+
+ return 0;
+}
+
+int perf_counter_task_enable(void)
+{
+ struct perf_counter *counter;
+
+ mutex_lock(&current->perf_counter_mutex);
+ list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
+ perf_counter_for_each_child(counter, perf_counter_enable);
+ mutex_unlock(&current->perf_counter_mutex);
+
+ return 0;
+}
+
+int perf_counter_task_disable(void)
+{
+ struct perf_counter *counter;
+
+ mutex_lock(&current->perf_counter_mutex);
+ list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
+ perf_counter_for_each_child(counter, perf_counter_disable);
+ mutex_unlock(&current->perf_counter_mutex);
+
+ return 0;
+}
+
+#ifndef PERF_COUNTER_INDEX_OFFSET
+# define PERF_COUNTER_INDEX_OFFSET 0
+#endif
+
+static int perf_counter_index(struct perf_counter *counter)
+{
+ if (counter->state != PERF_COUNTER_STATE_ACTIVE)
+ return 0;
+
+ return counter->hw.idx + 1 - PERF_COUNTER_INDEX_OFFSET;
+}
+
+/*
+ * Callers need to ensure there can be no nesting of this function, otherwise
+ * the seqlock logic goes bad. We can not serialize this because the arch
+ * code calls this from NMI context.
+ */
+void perf_counter_update_userpage(struct perf_counter *counter)
+{
+ struct perf_counter_mmap_page *userpg;
+ struct perf_mmap_data *data;
+
+ rcu_read_lock();
+ data = rcu_dereference(counter->data);
+ if (!data)
+ goto unlock;
+
+ userpg = data->user_page;
+
+ /*
+ * Disable preemption so as to not let the corresponding user-space
+ * spin too long if we get preempted.
+ */
+ preempt_disable();
+ ++userpg->lock;
+ barrier();
+ userpg->index = perf_counter_index(counter);
+ userpg->offset = atomic64_read(&counter->count);
+ if (counter->state == PERF_COUNTER_STATE_ACTIVE)
+ userpg->offset -= atomic64_read(&counter->hw.prev_count);
+
+ userpg->time_enabled = counter->total_time_enabled +
+ atomic64_read(&counter->child_total_time_enabled);
+
+ userpg->time_running = counter->total_time_running +
+ atomic64_read(&counter->child_total_time_running);
+
+ barrier();
+ ++userpg->lock;
+ preempt_enable();
+unlock:
+ rcu_read_unlock();
+}
+
+static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct perf_counter *counter = vma->vm_file->private_data;
+ struct perf_mmap_data *data;
+ int ret = VM_FAULT_SIGBUS;
+
+ if (vmf->flags & FAULT_FLAG_MKWRITE) {
+ if (vmf->pgoff == 0)
+ ret = 0;
+ return ret;
+ }
+
+ rcu_read_lock();
+ data = rcu_dereference(counter->data);
+ if (!data)
+ goto unlock;
+
+ if (vmf->pgoff == 0) {
+ vmf->page = virt_to_page(data->user_page);
+ } else {
+ int nr = vmf->pgoff - 1;
+
+ if ((unsigned)nr > data->nr_pages)
+ goto unlock;
+
+ if (vmf->flags & FAULT_FLAG_WRITE)
+ goto unlock;
+
+ vmf->page = virt_to_page(data->data_pages[nr]);
+ }
+
+ get_page(vmf->page);
+ vmf->page->mapping = vma->vm_file->f_mapping;
+ vmf->page->index = vmf->pgoff;
+
+ ret = 0;
+unlock:
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
+{
+ struct perf_mmap_data *data;
+ unsigned long size;
+ int i;
+
+ WARN_ON(atomic_read(&counter->mmap_count));
+
+ size = sizeof(struct perf_mmap_data);
+ size += nr_pages * sizeof(void *);
+
+ data = kzalloc(size, GFP_KERNEL);
+ if (!data)
+ goto fail;
+
+ data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!data->user_page)
+ goto fail_user_page;
+
+ for (i = 0; i < nr_pages; i++) {
+ data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!data->data_pages[i])
+ goto fail_data_pages;
+ }
+
+ data->nr_pages = nr_pages;
+ atomic_set(&data->lock, -1);
+
+ rcu_assign_pointer(counter->data, data);
+
+ return 0;
+
+fail_data_pages:
+ for (i--; i >= 0; i--)
+ free_page((unsigned long)data->data_pages[i]);
+
+ free_page((unsigned long)data->user_page);
+
+fail_user_page:
+ kfree(data);
+
+fail:
+ return -ENOMEM;
+}
+
+static void perf_mmap_free_page(unsigned long addr)
+{
+ struct page *page = virt_to_page((void *)addr);
+
+ page->mapping = NULL;
+ __free_page(page);
+}
+
+static void __perf_mmap_data_free(struct rcu_head *rcu_head)
+{
+ struct perf_mmap_data *data;
+ int i;
+
+ data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
+
+ perf_mmap_free_page((unsigned long)data->user_page);
+ for (i = 0; i < data->nr_pages; i++)
+ perf_mmap_free_page((unsigned long)data->data_pages[i]);
+
+ kfree(data);
+}
+
+static void perf_mmap_data_free(struct perf_counter *counter)
+{
+ struct perf_mmap_data *data = counter->data;
+
+ WARN_ON(atomic_read(&counter->mmap_count));
+
+ rcu_assign_pointer(counter->data, NULL);
+ call_rcu(&data->rcu_head, __perf_mmap_data_free);
+}
+
+static void perf_mmap_open(struct vm_area_struct *vma)
+{
+ struct perf_counter *counter = vma->vm_file->private_data;
+
+ atomic_inc(&counter->mmap_count);
+}
+
+static void perf_mmap_close(struct vm_area_struct *vma)
+{
+ struct perf_counter *counter = vma->vm_file->private_data;
+
+ WARN_ON_ONCE(counter->ctx->parent_ctx);
+ if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) {
+ struct user_struct *user = current_user();
+
+ atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm);
+ vma->vm_mm->locked_vm -= counter->data->nr_locked;
+ perf_mmap_data_free(counter);
+ mutex_unlock(&counter->mmap_mutex);
+ }
+}
+
+static struct vm_operations_struct perf_mmap_vmops = {
+ .open = perf_mmap_open,
+ .close = perf_mmap_close,
+ .fault = perf_mmap_fault,
+ .page_mkwrite = perf_mmap_fault,
+};
+
+static int perf_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct perf_counter *counter = file->private_data;
+ unsigned long user_locked, user_lock_limit;
+ struct user_struct *user = current_user();
+ unsigned long locked, lock_limit;
+ unsigned long vma_size;
+ unsigned long nr_pages;
+ long user_extra, extra;
+ int ret = 0;
+
+ if (!(vma->vm_flags & VM_SHARED))
+ return -EINVAL;
+
+ vma_size = vma->vm_end - vma->vm_start;
+ nr_pages = (vma_size / PAGE_SIZE) - 1;
+
+ /*
+ * If we have data pages ensure they're a power-of-two number, so we
+ * can do bitmasks instead of modulo.
+ */
+ if (nr_pages != 0 && !is_power_of_2(nr_pages))
+ return -EINVAL;
+
+ if (vma_size != PAGE_SIZE * (1 + nr_pages))
+ return -EINVAL;
+
+ if (vma->vm_pgoff != 0)
+ return -EINVAL;
+
+ WARN_ON_ONCE(counter->ctx->parent_ctx);
+ mutex_lock(&counter->mmap_mutex);
+ if (atomic_inc_not_zero(&counter->mmap_count)) {
+ if (nr_pages != counter->data->nr_pages)
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ user_extra = nr_pages + 1;
+ user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10);
+
+ /*
+ * Increase the limit linearly with more CPUs:
+ */
+ user_lock_limit *= num_online_cpus();
+
+ user_locked = atomic_long_read(&user->locked_vm) + user_extra;
+
+ extra = 0;
+ if (user_locked > user_lock_limit)
+ extra = user_locked - user_lock_limit;
+
+ lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+ lock_limit >>= PAGE_SHIFT;
+ locked = vma->vm_mm->locked_vm + extra;
+
+ if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
+ ret = -EPERM;
+ goto unlock;
+ }
+
+ WARN_ON(counter->data);
+ ret = perf_mmap_data_alloc(counter, nr_pages);
+ if (ret)
+ goto unlock;
+
+ atomic_set(&counter->mmap_count, 1);
+ atomic_long_add(user_extra, &user->locked_vm);
+ vma->vm_mm->locked_vm += extra;
+ counter->data->nr_locked = extra;
+ if (vma->vm_flags & VM_WRITE)
+ counter->data->writable = 1;
+
+unlock:
+ mutex_unlock(&counter->mmap_mutex);
+
+ vma->vm_flags |= VM_RESERVED;
+ vma->vm_ops = &perf_mmap_vmops;
+
+ return ret;
+}
+
+static int perf_fasync(int fd, struct file *filp, int on)
+{
+ struct inode *inode = filp->f_path.dentry->d_inode;
+ struct perf_counter *counter = filp->private_data;
+ int retval;
+
+ mutex_lock(&inode->i_mutex);
+ retval = fasync_helper(fd, filp, on, &counter->fasync);
+ mutex_unlock(&inode->i_mutex);
+
+ if (retval < 0)
+ return retval;
+
+ return 0;
+}
+
+static const struct file_operations perf_fops = {
+ .release = perf_release,
+ .read = perf_read,
+ .poll = perf_poll,
+ .unlocked_ioctl = perf_ioctl,
+ .compat_ioctl = perf_ioctl,
+ .mmap = perf_mmap,
+ .fasync = perf_fasync,
+};
+
+/*
+ * Perf counter wakeup
+ *
+ * If there's data, ensure we set the poll() state and publish everything
+ * to user-space before waking everybody up.
+ */
+
+void perf_counter_wakeup(struct perf_counter *counter)
+{
+ wake_up_all(&counter->waitq);
+
+ if (counter->pending_kill) {
+ kill_fasync(&counter->fasync, SIGIO, counter->pending_kill);
+ counter->pending_kill = 0;
+ }
+}
+
+/*
+ * Pending wakeups
+ *
+ * Handle the case where we need to wakeup up from NMI (or rq->lock) context.
+ *
+ * The NMI bit means we cannot possibly take locks. Therefore, maintain a
+ * single linked list and use cmpxchg() to add entries lockless.
+ */
+
+static void perf_pending_counter(struct perf_pending_entry *entry)
+{
+ struct perf_counter *counter = container_of(entry,
+ struct perf_counter, pending);
+
+ if (counter->pending_disable) {
+ counter->pending_disable = 0;
+ __perf_counter_disable(counter);
+ }
+
+ if (counter->pending_wakeup) {
+ counter->pending_wakeup = 0;
+ perf_counter_wakeup(counter);
+ }
+}
+
+#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
+
+static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
+ PENDING_TAIL,
+};
+
+static void perf_pending_queue(struct perf_pending_entry *entry,
+ void (*func)(struct perf_pending_entry *))
+{
+ struct perf_pending_entry **head;
+
+ if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
+ return;
+
+ entry->func = func;
+
+ head = &get_cpu_var(perf_pending_head);
+
+ do {
+ entry->next = *head;
+ } while (cmpxchg(head, entry->next, entry) != entry->next);
+
+ set_perf_counter_pending();
+
+ put_cpu_var(perf_pending_head);
+}
+
+static int __perf_pending_run(void)
+{
+ struct perf_pending_entry *list;
+ int nr = 0;
+
+ list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
+ while (list != PENDING_TAIL) {
+ void (*func)(struct perf_pending_entry *);
+ struct perf_pending_entry *entry = list;
+
+ list = list->next;
+
+ func = entry->func;
+ entry->next = NULL;
+ /*
+ * Ensure we observe the unqueue before we issue the wakeup,
+ * so that we won't be waiting forever.
+ * -- see perf_not_pending().
+ */
+ smp_wmb();
+
+ func(entry);
+ nr++;
+ }
+
+ return nr;
+}
+
+static inline int perf_not_pending(struct perf_counter *counter)
+{
+ /*
+ * If we flush on whatever cpu we run, there is a chance we don't
+ * need to wait.
+ */
+ get_cpu();
+ __perf_pending_run();
+ put_cpu();
+
+ /*
+ * Ensure we see the proper queue state before going to sleep
+ * so that we do not miss the wakeup. -- see perf_pending_handle()
+ */
+ smp_rmb();
+ return counter->pending.next == NULL;
+}
+
+static void perf_pending_sync(struct perf_counter *counter)
+{
+ wait_event(counter->waitq, perf_not_pending(counter));
+}
+
+void perf_counter_do_pending(void)
+{
+ __perf_pending_run();
+}
+
+/*
+ * Callchain support -- arch specific
+ */
+
+__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+{
+ return NULL;
+}
+
+/*
+ * Output
+ */
+
+struct perf_output_handle {
+ struct perf_counter *counter;
+ struct perf_mmap_data *data;
+ unsigned long head;
+ unsigned long offset;
+ int nmi;
+ int sample;
+ int locked;
+ unsigned long flags;
+};
+
+static bool perf_output_space(struct perf_mmap_data *data,
+ unsigned int offset, unsigned int head)
+{
+ unsigned long tail;
+ unsigned long mask;
+
+ if (!data->writable)
+ return true;
+
+ mask = (data->nr_pages << PAGE_SHIFT) - 1;
+ /*
+ * Userspace could choose to issue a mb() before updating the tail
+ * pointer. So that all reads will be completed before the write is
+ * issued.
+ */
+ tail = ACCESS_ONCE(data->user_page->data_tail);
+ smp_rmb();
+
+ offset = (offset - tail) & mask;
+ head = (head - tail) & mask;
+
+ if ((int)(head - offset) < 0)
+ return false;
+
+ return true;
+}
+
+static void perf_output_wakeup(struct perf_output_handle *handle)
+{
+ atomic_set(&handle->data->poll, POLL_IN);
+
+ if (handle->nmi) {
+ handle->counter->pending_wakeup = 1;
+ perf_pending_queue(&handle->counter->pending,
+ perf_pending_counter);
+ } else
+ perf_counter_wakeup(handle->counter);
+}
+
+/*
+ * Curious locking construct.
+ *
+ * We need to ensure a later event doesn't publish a head when a former
+ * event isn't done writing. However since we need to deal with NMIs we
+ * cannot fully serialize things.
+ *
+ * What we do is serialize between CPUs so we only have to deal with NMI
+ * nesting on a single CPU.
+ *
+ * We only publish the head (and generate a wakeup) when the outer-most
+ * event completes.
+ */
+static void perf_output_lock(struct perf_output_handle *handle)
+{
+ struct perf_mmap_data *data = handle->data;
+ int cpu;
+
+ handle->locked = 0;
+
+ local_irq_save(handle->flags);
+ cpu = smp_processor_id();
+
+ if (in_nmi() && atomic_read(&data->lock) == cpu)
+ return;
+
+ while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
+ cpu_relax();
+
+ handle->locked = 1;
+}
+
+static void perf_output_unlock(struct perf_output_handle *handle)
+{
+ struct perf_mmap_data *data = handle->data;
+ unsigned long head;
+ int cpu;
+
+ data->done_head = data->head;
+
+ if (!handle->locked)
+ goto out;
+
+again:
+ /*
+ * The xchg implies a full barrier that ensures all writes are done
+ * before we publish the new head, matched by a rmb() in userspace when
+ * reading this position.
+ */
+ while ((head = atomic_long_xchg(&data->done_head, 0)))
+ data->user_page->data_head = head;
+
+ /*
+ * NMI can happen here, which means we can miss a done_head update.
+ */
+
+ cpu = atomic_xchg(&data->lock, -1);
+ WARN_ON_ONCE(cpu != smp_processor_id());
+
+ /*
+ * Therefore we have to validate we did not indeed do so.
+ */
+ if (unlikely(atomic_long_read(&data->done_head))) {
+ /*
+ * Since we had it locked, we can lock it again.
+ */
+ while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
+ cpu_relax();
+
+ goto again;
+ }
+
+ if (atomic_xchg(&data->wakeup, 0))
+ perf_output_wakeup(handle);
+out:
+ local_irq_restore(handle->flags);
+}
+
+static void perf_output_copy(struct perf_output_handle *handle,
+ const void *buf, unsigned int len)
+{
+ unsigned int pages_mask;
+ unsigned int offset;
+ unsigned int size;
+ void **pages;
+
+ offset = handle->offset;
+ pages_mask = handle->data->nr_pages - 1;
+ pages = handle->data->data_pages;
+
+ do {
+ unsigned int page_offset;
+ int nr;
+
+ nr = (offset >> PAGE_SHIFT) & pages_mask;
+ page_offset = offset & (PAGE_SIZE - 1);
+ size = min_t(unsigned int, PAGE_SIZE - page_offset, len);
+
+ memcpy(pages[nr] + page_offset, buf, size);
+
+ len -= size;
+ buf += size;
+ offset += size;
+ } while (len);
+
+ handle->offset = offset;
+
+ /*
+ * Check we didn't copy past our reservation window, taking the
+ * possible unsigned int wrap into account.
+ */
+ WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
+}
+
+#define perf_output_put(handle, x) \
+ perf_output_copy((handle), &(x), sizeof(x))
+
+static int perf_output_begin(struct perf_output_handle *handle,
+ struct perf_counter *counter, unsigned int size,
+ int nmi, int sample)
+{
+ struct perf_mmap_data *data;
+ unsigned int offset, head;
+ int have_lost;
+ struct {
+ struct perf_event_header header;
+ u64 id;
+ u64 lost;
+ } lost_event;
+
+ /*
+ * For inherited counters we send all the output towards the parent.
+ */
+ if (counter->parent)
+ counter = counter->parent;
+
+ rcu_read_lock();
+ data = rcu_dereference(counter->data);
+ if (!data)
+ goto out;
+
+ handle->data = data;
+ handle->counter = counter;
+ handle->nmi = nmi;
+ handle->sample = sample;
+
+ if (!data->nr_pages)
+ goto fail;
+
+ have_lost = atomic_read(&data->lost);
+ if (have_lost)
+ size += sizeof(lost_event);
+
+ perf_output_lock(handle);
+
+ do {
+ offset = head = atomic_long_read(&data->head);
+ head += size;
+ if (unlikely(!perf_output_space(data, offset, head)))
+ goto fail;
+ } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
+
+ handle->offset = offset;
+ handle->head = head;
+
+ if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
+ atomic_set(&data->wakeup, 1);
+
+ if (have_lost) {
+ lost_event.header.type = PERF_EVENT_LOST;
+ lost_event.header.misc = 0;
+ lost_event.header.size = sizeof(lost_event);
+ lost_event.id = counter->id;
+ lost_event.lost = atomic_xchg(&data->lost, 0);
+
+ perf_output_put(handle, lost_event);
+ }
+
+ return 0;
+
+fail:
+ atomic_inc(&data->lost);
+ perf_output_unlock(handle);
+out:
+ rcu_read_unlock();
+
+ return -ENOSPC;
+}
+
+static void perf_output_end(struct perf_output_handle *handle)
+{
+ struct perf_counter *counter = handle->counter;
+ struct perf_mmap_data *data = handle->data;
+
+ int wakeup_events = counter->attr.wakeup_events;
+
+ if (handle->sample && wakeup_events) {
+ int events = atomic_inc_return(&data->events);
+ if (events >= wakeup_events) {
+ atomic_sub(wakeup_events, &data->events);
+ atomic_set(&data->wakeup, 1);
+ }
+ }
+
+ perf_output_unlock(handle);
+ rcu_read_unlock();
+}
+
+static u32 perf_counter_pid(struct perf_counter *counter, struct task_struct *p)
+{
+ /*
+ * only top level counters have the pid namespace they were created in
+ */
+ if (counter->parent)
+ counter = counter->parent;
+
+ return task_tgid_nr_ns(p, counter->ns);
+}
+
+static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p)
+{
+ /*
+ * only top level counters have the pid namespace they were created in
+ */
+ if (counter->parent)
+ counter = counter->parent;
+
+ return task_pid_nr_ns(p, counter->ns);
+}
+
+static void perf_output_read_one(struct perf_output_handle *handle,
+ struct perf_counter *counter)
+{
+ u64 read_format = counter->attr.read_format;
+ u64 values[4];
+ int n = 0;
+
+ values[n++] = atomic64_read(&counter->count);
+ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+ values[n++] = counter->total_time_enabled +
+ atomic64_read(&counter->child_total_time_enabled);
+ }
+ if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+ values[n++] = counter->total_time_running +
+ atomic64_read(&counter->child_total_time_running);
+ }
+ if (read_format & PERF_FORMAT_ID)
+ values[n++] = primary_counter_id(counter);
+
+ perf_output_copy(handle, values, n * sizeof(u64));
+}
+
+/*
+ * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult.
+ */
+static void perf_output_read_group(struct perf_output_handle *handle,
+ struct perf_counter *counter)
+{
+ struct perf_counter *leader = counter->group_leader, *sub;
+ u64 read_format = counter->attr.read_format;
+ u64 values[5];
+ int n = 0;
+
+ values[n++] = 1 + leader->nr_siblings;
+
+ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+ values[n++] = leader->total_time_enabled;
+
+ if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+ values[n++] = leader->total_time_running;
+
+ if (leader != counter)
+ leader->pmu->read(leader);
+
+ values[n++] = atomic64_read(&leader->count);
+ if (read_format & PERF_FORMAT_ID)
+ values[n++] = primary_counter_id(leader);
+
+ perf_output_copy(handle, values, n * sizeof(u64));
+
+ list_for_each_entry(sub, &leader->sibling_list, list_entry) {
+ n = 0;
+
+ if (sub != counter)
+ sub->pmu->read(sub);
+
+ values[n++] = atomic64_read(&sub->count);
+ if (read_format & PERF_FORMAT_ID)
+ values[n++] = primary_counter_id(sub);
+
+ perf_output_copy(handle, values, n * sizeof(u64));
+ }
+}
+
+static void perf_output_read(struct perf_output_handle *handle,
+ struct perf_counter *counter)
+{
+ if (counter->attr.read_format & PERF_FORMAT_GROUP)
+ perf_output_read_group(handle, counter);
+ else
+ perf_output_read_one(handle, counter);
+}
+
+void perf_counter_output(struct perf_counter *counter, int nmi,
+ struct perf_sample_data *data)
+{
+ int ret;
+ u64 sample_type = counter->attr.sample_type;
+ struct perf_output_handle handle;
+ struct perf_event_header header;
+ u64 ip;
+ struct {
+ u32 pid, tid;
+ } tid_entry;
+ struct perf_callchain_entry *callchain = NULL;
+ int callchain_size = 0;
+ u64 time;
+ struct {
+ u32 cpu, reserved;
+ } cpu_entry;
+
+ header.type = PERF_EVENT_SAMPLE;
+ header.size = sizeof(header);
+
+ header.misc = 0;
+ header.misc |= perf_misc_flags(data->regs);
+
+ if (sample_type & PERF_SAMPLE_IP) {
+ ip = perf_instruction_pointer(data->regs);
+ header.size += sizeof(ip);
+ }
+
+ if (sample_type & PERF_SAMPLE_TID) {
+ /* namespace issues */
+ tid_entry.pid = perf_counter_pid(counter, current);
+ tid_entry.tid = perf_counter_tid(counter, current);
+
+ header.size += sizeof(tid_entry);
+ }
+
+ if (sample_type & PERF_SAMPLE_TIME) {
+ /*
+ * Maybe do better on x86 and provide cpu_clock_nmi()
+ */
+ time = sched_clock();
+
+ header.size += sizeof(u64);
+ }
+
+ if (sample_type & PERF_SAMPLE_ADDR)
+ header.size += sizeof(u64);
+
+ if (sample_type & PERF_SAMPLE_ID)
+ header.size += sizeof(u64);
+
+ if (sample_type & PERF_SAMPLE_STREAM_ID)
+ header.size += sizeof(u64);
+
+ if (sample_type & PERF_SAMPLE_CPU) {
+ header.size += sizeof(cpu_entry);
+
+ cpu_entry.cpu = raw_smp_processor_id();
+ cpu_entry.reserved = 0;
+ }
+
+ if (sample_type & PERF_SAMPLE_PERIOD)
+ header.size += sizeof(u64);
+
+ if (sample_type & PERF_SAMPLE_READ)
+ header.size += perf_counter_read_size(counter);
+
+ if (sample_type & PERF_SAMPLE_CALLCHAIN) {
+ callchain = perf_callchain(data->regs);
+
+ if (callchain) {
+ callchain_size = (1 + callchain->nr) * sizeof(u64);
+ header.size += callchain_size;
+ } else
+ header.size += sizeof(u64);
+ }
+
+ if (sample_type & PERF_SAMPLE_RAW) {
+ int size = sizeof(u32);
+
+ if (data->raw)
+ size += data->raw->size;
+ else
+ size += sizeof(u32);
+
+ WARN_ON_ONCE(size & (sizeof(u64)-1));
+ header.size += size;
+ }
+
+ ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
+ if (ret)
+ return;
+
+ perf_output_put(&handle, header);
+
+ if (sample_type & PERF_SAMPLE_IP)
+ perf_output_put(&handle, ip);
+
+ if (sample_type & PERF_SAMPLE_TID)
+ perf_output_put(&handle, tid_entry);
+
+ if (sample_type & PERF_SAMPLE_TIME)
+ perf_output_put(&handle, time);
+
+ if (sample_type & PERF_SAMPLE_ADDR)
+ perf_output_put(&handle, data->addr);
+
+ if (sample_type & PERF_SAMPLE_ID) {
+ u64 id = primary_counter_id(counter);
+
+ perf_output_put(&handle, id);
+ }
+
+ if (sample_type & PERF_SAMPLE_STREAM_ID)
+ perf_output_put(&handle, counter->id);
+
+ if (sample_type & PERF_SAMPLE_CPU)
+ perf_output_put(&handle, cpu_entry);
+
+ if (sample_type & PERF_SAMPLE_PERIOD)
+ perf_output_put(&handle, data->period);
+
+ if (sample_type & PERF_SAMPLE_READ)
+ perf_output_read(&handle, counter);
+
+ if (sample_type & PERF_SAMPLE_CALLCHAIN) {
+ if (callchain)
+ perf_output_copy(&handle, callchain, callchain_size);
+ else {
+ u64 nr = 0;
+ perf_output_put(&handle, nr);
+ }
+ }
+
+ if (sample_type & PERF_SAMPLE_RAW) {
+ if (data->raw) {
+ perf_output_put(&handle, data->raw->size);
+ perf_output_copy(&handle, data->raw->data, data->raw->size);
+ } else {
+ struct {
+ u32 size;
+ u32 data;
+ } raw = {
+ .size = sizeof(u32),
+ .data = 0,
+ };
+ perf_output_put(&handle, raw);
+ }
+ }
+
+ perf_output_end(&handle);
+}
+
+/*
+ * read event
+ */
+
+struct perf_read_event {
+ struct perf_event_header header;
+
+ u32 pid;
+ u32 tid;
+};
+
+static void
+perf_counter_read_event(struct perf_counter *counter,
+ struct task_struct *task)
+{
+ struct perf_output_handle handle;
+ struct perf_read_event event = {
+ .header = {
+ .type = PERF_EVENT_READ,
+ .misc = 0,
+ .size = sizeof(event) + perf_counter_read_size(counter),
+ },
+ .pid = perf_counter_pid(counter, task),
+ .tid = perf_counter_tid(counter, task),
+ };
+ int ret;
+
+ ret = perf_output_begin(&handle, counter, event.header.size, 0, 0);
+ if (ret)
+ return;
+
+ perf_output_put(&handle, event);
+ perf_output_read(&handle, counter);
+
+ perf_output_end(&handle);
+}
+
+/*
+ * task tracking -- fork/exit
+ *
+ * enabled by: attr.comm | attr.mmap | attr.task
+ */
+
+struct perf_task_event {
+ struct task_struct *task;
+ struct perf_counter_context *task_ctx;
+
+ struct {
+ struct perf_event_header header;
+
+ u32 pid;
+ u32 ppid;
+ u32 tid;
+ u32 ptid;
+ } event;
+};
+
+static void perf_counter_task_output(struct perf_counter *counter,
+ struct perf_task_event *task_event)
+{
+ struct perf_output_handle handle;
+ int size = task_event->event.header.size;
+ struct task_struct *task = task_event->task;
+ int ret = perf_output_begin(&handle, counter, size, 0, 0);
+
+ if (ret)
+ return;
+
+ task_event->event.pid = perf_counter_pid(counter, task);
+ task_event->event.ppid = perf_counter_pid(counter, current);
+
+ task_event->event.tid = perf_counter_tid(counter, task);
+ task_event->event.ptid = perf_counter_tid(counter, current);
+
+ perf_output_put(&handle, task_event->event);
+ perf_output_end(&handle);
+}
+
+static int perf_counter_task_match(struct perf_counter *counter)
+{
+ if (counter->attr.comm || counter->attr.mmap || counter->attr.task)
+ return 1;
+
+ return 0;
+}
+
+static void perf_counter_task_ctx(struct perf_counter_context *ctx,
+ struct perf_task_event *task_event)
+{
+ struct perf_counter *counter;
+
+ if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
+ return;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
+ if (perf_counter_task_match(counter))
+ perf_counter_task_output(counter, task_event);
+ }
+ rcu_read_unlock();
+}
+
+static void perf_counter_task_event(struct perf_task_event *task_event)
+{
+ struct perf_cpu_context *cpuctx;
+ struct perf_counter_context *ctx = task_event->task_ctx;
+
+ cpuctx = &get_cpu_var(perf_cpu_context);
+ perf_counter_task_ctx(&cpuctx->ctx, task_event);
+ put_cpu_var(perf_cpu_context);
+
+ rcu_read_lock();
+ if (!ctx)
+ ctx = rcu_dereference(task_event->task->perf_counter_ctxp);
+ if (ctx)
+ perf_counter_task_ctx(ctx, task_event);
+ rcu_read_unlock();
+}
+
+static void perf_counter_task(struct task_struct *task,
+ struct perf_counter_context *task_ctx,
+ int new)
+{
+ struct perf_task_event task_event;
+
+ if (!atomic_read(&nr_comm_counters) &&
+ !atomic_read(&nr_mmap_counters) &&
+ !atomic_read(&nr_task_counters))
+ return;
+
+ task_event = (struct perf_task_event){
+ .task = task,
+ .task_ctx = task_ctx,
+ .event = {
+ .header = {
+ .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT,
+ .misc = 0,
+ .size = sizeof(task_event.event),
+ },
+ /* .pid */
+ /* .ppid */
+ /* .tid */
+ /* .ptid */
+ },
+ };
+
+ perf_counter_task_event(&task_event);
+}
+
+void perf_counter_fork(struct task_struct *task)
+{
+ perf_counter_task(task, NULL, 1);
+}
+
+/*
+ * comm tracking
+ */
+
+struct perf_comm_event {
+ struct task_struct *task;
+ char *comm;
+ int comm_size;
+
+ struct {
+ struct perf_event_header header;
+
+ u32 pid;
+ u32 tid;
+ } event;
+};
+
+static void perf_counter_comm_output(struct perf_counter *counter,
+ struct perf_comm_event *comm_event)
+{
+ struct perf_output_handle handle;
+ int size = comm_event->event.header.size;
+ int ret = perf_output_begin(&handle, counter, size, 0, 0);
+
+ if (ret)
+ return;
+
+ comm_event->event.pid = perf_counter_pid(counter, comm_event->task);
+ comm_event->event.tid = perf_counter_tid(counter, comm_event->task);
+
+ perf_output_put(&handle, comm_event->event);
+ perf_output_copy(&handle, comm_event->comm,
+ comm_event->comm_size);
+ perf_output_end(&handle);
+}
+
+static int perf_counter_comm_match(struct perf_counter *counter)
+{
+ if (counter->attr.comm)
+ return 1;
+
+ return 0;
+}
+
+static void perf_counter_comm_ctx(struct perf_counter_context *ctx,
+ struct perf_comm_event *comm_event)
+{
+ struct perf_counter *counter;
+
+ if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
+ return;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
+ if (perf_counter_comm_match(counter))
+ perf_counter_comm_output(counter, comm_event);
+ }
+ rcu_read_unlock();
+}
+
+static void perf_counter_comm_event(struct perf_comm_event *comm_event)
+{
+ struct perf_cpu_context *cpuctx;
+ struct perf_counter_context *ctx;
+ unsigned int size;
+ char comm[TASK_COMM_LEN];
+
+ memset(comm, 0, sizeof(comm));
+ strncpy(comm, comm_event->task->comm, sizeof(comm));
+ size = ALIGN(strlen(comm)+1, sizeof(u64));
+
+ comm_event->comm = comm;
+ comm_event->comm_size = size;
+
+ comm_event->event.header.size = sizeof(comm_event->event) + size;
+
+ cpuctx = &get_cpu_var(perf_cpu_context);
+ perf_counter_comm_ctx(&cpuctx->ctx, comm_event);
+ put_cpu_var(perf_cpu_context);
+
+ rcu_read_lock();
+ /*
+ * doesn't really matter which of the child contexts the
+ * events ends up in.
+ */
+ ctx = rcu_dereference(current->perf_counter_ctxp);
+ if (ctx)
+ perf_counter_comm_ctx(ctx, comm_event);
+ rcu_read_unlock();
+}
+
+void perf_counter_comm(struct task_struct *task)
+{
+ struct perf_comm_event comm_event;
+
+ if (task->perf_counter_ctxp)
+ perf_counter_enable_on_exec(task);
+
+ if (!atomic_read(&nr_comm_counters))
+ return;
+
+ comm_event = (struct perf_comm_event){
+ .task = task,
+ /* .comm */
+ /* .comm_size */
+ .event = {
+ .header = {
+ .type = PERF_EVENT_COMM,
+ .misc = 0,
+ /* .size */
+ },
+ /* .pid */
+ /* .tid */
+ },
+ };
+
+ perf_counter_comm_event(&comm_event);
+}
+
+/*
+ * mmap tracking
+ */
+
+struct perf_mmap_event {
+ struct vm_area_struct *vma;
+
+ const char *file_name;
+ int file_size;
+
+ struct {
+ struct perf_event_header header;
+
+ u32 pid;
+ u32 tid;
+ u64 start;
+ u64 len;
+ u64 pgoff;
+ } event;
+};
+
+static void perf_counter_mmap_output(struct perf_counter *counter,
+ struct perf_mmap_event *mmap_event)
+{
+ struct perf_output_handle handle;
+ int size = mmap_event->event.header.size;
+ int ret = perf_output_begin(&handle, counter, size, 0, 0);
+
+ if (ret)
+ return;
+
+ mmap_event->event.pid = perf_counter_pid(counter, current);
+ mmap_event->event.tid = perf_counter_tid(counter, current);
+
+ perf_output_put(&handle, mmap_event->event);
+ perf_output_copy(&handle, mmap_event->file_name,
+ mmap_event->file_size);
+ perf_output_end(&handle);
+}
+
+static int perf_counter_mmap_match(struct perf_counter *counter,
+ struct perf_mmap_event *mmap_event)
+{
+ if (counter->attr.mmap)
+ return 1;
+
+ return 0;
+}
+
+static void perf_counter_mmap_ctx(struct perf_counter_context *ctx,
+ struct perf_mmap_event *mmap_event)
+{
+ struct perf_counter *counter;
+
+ if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
+ return;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
+ if (perf_counter_mmap_match(counter, mmap_event))
+ perf_counter_mmap_output(counter, mmap_event);
+ }
+ rcu_read_unlock();
+}
+
+static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
+{
+ struct perf_cpu_context *cpuctx;
+ struct perf_counter_context *ctx;
+ struct vm_area_struct *vma = mmap_event->vma;
+ struct file *file = vma->vm_file;
+ unsigned int size;
+ char tmp[16];
+ char *buf = NULL;
+ const char *name;
+
+ memset(tmp, 0, sizeof(tmp));
+
+ if (file) {
+ /*
+ * d_path works from the end of the buffer backwards, so we
+ * need to add enough zero bytes after the string to handle
+ * the 64bit alignment we do later.
+ */
+ buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
+ if (!buf) {
+ name = strncpy(tmp, "//enomem", sizeof(tmp));
+ goto got_name;
+ }
+ name = d_path(&file->f_path, buf, PATH_MAX);
+ if (IS_ERR(name)) {
+ name = strncpy(tmp, "//toolong", sizeof(tmp));
+ goto got_name;
+ }
+ } else {
+ if (arch_vma_name(mmap_event->vma)) {
+ name = strncpy(tmp, arch_vma_name(mmap_event->vma),
+ sizeof(tmp));
+ goto got_name;
+ }
+
+ if (!vma->vm_mm) {
+ name = strncpy(tmp, "[vdso]", sizeof(tmp));
+ goto got_name;
+ }
+
+ name = strncpy(tmp, "//anon", sizeof(tmp));
+ goto got_name;
+ }
+
+got_name:
+ size = ALIGN(strlen(name)+1, sizeof(u64));
+
+ mmap_event->file_name = name;
+ mmap_event->file_size = size;
+
+ mmap_event->event.header.size = sizeof(mmap_event->event) + size;
+
+ cpuctx = &get_cpu_var(perf_cpu_context);
+ perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event);
+ put_cpu_var(perf_cpu_context);
+
+ rcu_read_lock();
+ /*
+ * doesn't really matter which of the child contexts the
+ * events ends up in.
+ */
+ ctx = rcu_dereference(current->perf_counter_ctxp);
+ if (ctx)
+ perf_counter_mmap_ctx(ctx, mmap_event);
+ rcu_read_unlock();
+
+ kfree(buf);
+}
+
+void __perf_counter_mmap(struct vm_area_struct *vma)
+{
+ struct perf_mmap_event mmap_event;
+
+ if (!atomic_read(&nr_mmap_counters))
+ return;
+
+ mmap_event = (struct perf_mmap_event){
+ .vma = vma,
+ /* .file_name */
+ /* .file_size */
+ .event = {
+ .header = {
+ .type = PERF_EVENT_MMAP,
+ .misc = 0,
+ /* .size */
+ },
+ /* .pid */
+ /* .tid */
+ .start = vma->vm_start,
+ .len = vma->vm_end - vma->vm_start,
+ .pgoff = vma->vm_pgoff,
+ },
+ };
+
+ perf_counter_mmap_event(&mmap_event);
+}
+
+/*
+ * IRQ throttle logging
+ */
+
+static void perf_log_throttle(struct perf_counter *counter, int enable)
+{
+ struct perf_output_handle handle;
+ int ret;
+
+ struct {
+ struct perf_event_header header;
+ u64 time;
+ u64 id;
+ u64 stream_id;
+ } throttle_event = {
+ .header = {
+ .type = PERF_EVENT_THROTTLE,
+ .misc = 0,
+ .size = sizeof(throttle_event),
+ },
+ .time = sched_clock(),
+ .id = primary_counter_id(counter),
+ .stream_id = counter->id,
+ };
+
+ if (enable)
+ throttle_event.header.type = PERF_EVENT_UNTHROTTLE;
+
+ ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
+ if (ret)
+ return;
+
+ perf_output_put(&handle, throttle_event);
+ perf_output_end(&handle);
+}
+
+/*
+ * Generic counter overflow handling, sampling.
+ */
+
+int perf_counter_overflow(struct perf_counter *counter, int nmi,
+ struct perf_sample_data *data)
+{
+ int events = atomic_read(&counter->event_limit);
+ int throttle = counter->pmu->unthrottle != NULL;
+ struct hw_perf_counter *hwc = &counter->hw;
+ int ret = 0;
+
+ if (!throttle) {
+ hwc->interrupts++;
+ } else {
+ if (hwc->interrupts != MAX_INTERRUPTS) {
+ hwc->interrupts++;
+ if (HZ * hwc->interrupts >
+ (u64)sysctl_perf_counter_sample_rate) {
+ hwc->interrupts = MAX_INTERRUPTS;
+ perf_log_throttle(counter, 0);
+ ret = 1;
+ }
+ } else {
+ /*
+ * Keep re-disabling counters even though on the previous
+ * pass we disabled it - just in case we raced with a
+ * sched-in and the counter got enabled again:
+ */
+ ret = 1;
+ }
+ }
+
+ if (counter->attr.freq) {
+ u64 now = sched_clock();
+ s64 delta = now - hwc->freq_stamp;
+
+ hwc->freq_stamp = now;
+
+ if (delta > 0 && delta < TICK_NSEC)
+ perf_adjust_period(counter, NSEC_PER_SEC / (int)delta);
+ }
+
+ /*
+ * XXX event_limit might not quite work as expected on inherited
+ * counters
+ */
+
+ counter->pending_kill = POLL_IN;
+ if (events && atomic_dec_and_test(&counter->event_limit)) {
+ ret = 1;
+ counter->pending_kill = POLL_HUP;
+ if (nmi) {
+ counter->pending_disable = 1;
+ perf_pending_queue(&counter->pending,
+ perf_pending_counter);
+ } else
+ perf_counter_disable(counter);
+ }
+
+ perf_counter_output(counter, nmi, data);
+ return ret;
+}
+
+/*
+ * Generic software counter infrastructure
+ */
+
+/*
+ * We directly increment counter->count and keep a second value in
+ * counter->hw.period_left to count intervals. This period counter
+ * is kept in the range [-sample_period, 0] so that we can use the
+ * sign as trigger.
+ */
+
+static u64 perf_swcounter_set_period(struct perf_counter *counter)
+{
+ struct hw_perf_counter *hwc = &counter->hw;
+ u64 period = hwc->last_period;
+ u64 nr, offset;
+ s64 old, val;
+
+ hwc->last_period = hwc->sample_period;
+
+again:
+ old = val = atomic64_read(&hwc->period_left);
+ if (val < 0)
+ return 0;
+
+ nr = div64_u64(period + val, period);
+ offset = nr * period;
+ val -= offset;
+ if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
+ goto again;
+
+ return nr;
+}
+
+static void perf_swcounter_overflow(struct perf_counter *counter,
+ int nmi, struct perf_sample_data *data)
+{
+ struct hw_perf_counter *hwc = &counter->hw;
+ u64 overflow;
+
+ data->period = counter->hw.last_period;
+ overflow = perf_swcounter_set_period(counter);
+
+ if (hwc->interrupts == MAX_INTERRUPTS)
+ return;
+
+ for (; overflow; overflow--) {
+ if (perf_counter_overflow(counter, nmi, data)) {
+ /*
+ * We inhibit the overflow from happening when
+ * hwc->interrupts == MAX_INTERRUPTS.
+ */
+ break;
+ }
+ }
+}
+
+static void perf_swcounter_unthrottle(struct perf_counter *counter)
+{
+ /*
+ * Nothing to do, we already reset hwc->interrupts.
+ */
+}
+
+static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
+ int nmi, struct perf_sample_data *data)
+{
+ struct hw_perf_counter *hwc = &counter->hw;
+
+ atomic64_add(nr, &counter->count);
+
+ if (!hwc->sample_period)
+ return;
+
+ if (!data->regs)
+ return;
+
+ if (!atomic64_add_negative(nr, &hwc->period_left))
+ perf_swcounter_overflow(counter, nmi, data);
+}
+
+static int perf_swcounter_is_counting(struct perf_counter *counter)
+{
+ /*
+ * The counter is active, we're good!
+ */
+ if (counter->state == PERF_COUNTER_STATE_ACTIVE)
+ return 1;
+
+ /*
+ * The counter is off/error, not counting.
+ */
+ if (counter->state != PERF_COUNTER_STATE_INACTIVE)
+ return 0;
+
+ /*
+ * The counter is inactive, if the context is active
+ * we're part of a group that didn't make it on the 'pmu',
+ * not counting.
+ */
+ if (counter->ctx->is_active)
+ return 0;
+
+ /*
+ * We're inactive and the context is too, this means the
+ * task is scheduled out, we're counting events that happen
+ * to us, like migration events.
+ */
+ return 1;
+}
+
+static int perf_swcounter_match(struct perf_counter *counter,
+ enum perf_type_id type,
+ u32 event, struct pt_regs *regs)
+{
+ if (!perf_swcounter_is_counting(counter))
+ return 0;
+
+ if (counter->attr.type != type)
+ return 0;
+ if (counter->attr.config != event)
+ return 0;
+
+ if (regs) {
+ if (counter->attr.exclude_user && user_mode(regs))
+ return 0;
+
+ if (counter->attr.exclude_kernel && !user_mode(regs))
+ return 0;
+ }
+
+ return 1;
+}
+
+static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
+ enum perf_type_id type,
+ u32 event, u64 nr, int nmi,
+ struct perf_sample_data *data)
+{
+ struct perf_counter *counter;
+
+ if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
+ return;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
+ if (perf_swcounter_match(counter, type, event, data->regs))
+ perf_swcounter_add(counter, nr, nmi, data);
+ }
+ rcu_read_unlock();
+}
+
+static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
+{
+ if (in_nmi())
+ return &cpuctx->recursion[3];
+
+ if (in_irq())
+ return &cpuctx->recursion[2];
+
+ if (in_softirq())
+ return &cpuctx->recursion[1];
+
+ return &cpuctx->recursion[0];
+}
+
+static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
+ u64 nr, int nmi,
+ struct perf_sample_data *data)
+{
+ struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
+ int *recursion = perf_swcounter_recursion_context(cpuctx);
+ struct perf_counter_context *ctx;
+
+ if (*recursion)
+ goto out;
+
+ (*recursion)++;
+ barrier();
+
+ perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
+ nr, nmi, data);
+ rcu_read_lock();
+ /*
+ * doesn't really matter which of the child contexts the
+ * events ends up in.
+ */
+ ctx = rcu_dereference(current->perf_counter_ctxp);
+ if (ctx)
+ perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data);
+ rcu_read_unlock();
+
+ barrier();
+ (*recursion)--;
+
+out:
+ put_cpu_var(perf_cpu_context);
+}
+
+void __perf_swcounter_event(u32 event, u64 nr, int nmi,
+ struct pt_regs *regs, u64 addr)
+{
+ struct perf_sample_data data = {
+ .regs = regs,
+ .addr = addr,
+ };
+
+ do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, &data);
+}
+
+static void perf_swcounter_read(struct perf_counter *counter)
+{
+}
+
+static int perf_swcounter_enable(struct perf_counter *counter)
+{
+ struct hw_perf_counter *hwc = &counter->hw;
+
+ if (hwc->sample_period) {
+ hwc->last_period = hwc->sample_period;
+ perf_swcounter_set_period(counter);
+ }
+ return 0;
+}
+
+static void perf_swcounter_disable(struct perf_counter *counter)
+{
+}
+
+static const struct pmu perf_ops_generic = {
+ .enable = perf_swcounter_enable,
+ .disable = perf_swcounter_disable,
+ .read = perf_swcounter_read,
+ .unthrottle = perf_swcounter_unthrottle,
+};
+
+/*
+ * hrtimer based swcounter callback
+ */
+
+static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
+{
+ enum hrtimer_restart ret = HRTIMER_RESTART;
+ struct perf_sample_data data;
+ struct perf_counter *counter;
+ u64 period;
+
+ counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
+ counter->pmu->read(counter);
+
+ data.addr = 0;
+ data.regs = get_irq_regs();
+ /*
+ * In case we exclude kernel IPs or are somehow not in interrupt
+ * context, provide the next best thing, the user IP.
+ */
+ if ((counter->attr.exclude_kernel || !data.regs) &&
+ !counter->attr.exclude_user)
+ data.regs = task_pt_regs(current);
+
+ if (data.regs) {
+ if (perf_counter_overflow(counter, 0, &data))
+ ret = HRTIMER_NORESTART;
+ }
+
+ period = max_t(u64, 10000, counter->hw.sample_period);
+ hrtimer_forward_now(hrtimer, ns_to_ktime(period));
+
+ return ret;
+}
+
+/*
+ * Software counter: cpu wall time clock
+ */
+
+static void cpu_clock_perf_counter_update(struct perf_counter *counter)
+{
+ int cpu = raw_smp_processor_id();
+ s64 prev;
+ u64 now;
+
+ now = cpu_clock(cpu);
+ prev = atomic64_read(&counter->hw.prev_count);
+ atomic64_set(&counter->hw.prev_count, now);
+ atomic64_add(now - prev, &counter->count);
+}
+
+static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
+{
+ struct hw_perf_counter *hwc = &counter->hw;
+ int cpu = raw_smp_processor_id();
+
+ atomic64_set(&hwc->prev_count, cpu_clock(cpu));
+ hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hwc->hrtimer.function = perf_swcounter_hrtimer;
+ if (hwc->sample_period) {
+ u64 period = max_t(u64, 10000, hwc->sample_period);
+ __hrtimer_start_range_ns(&hwc->hrtimer,
+ ns_to_ktime(period), 0,
+ HRTIMER_MODE_REL, 0);
+ }
+
+ return 0;
+}
+
+static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
+{
+ if (counter->hw.sample_period)
+ hrtimer_cancel(&counter->hw.hrtimer);
+ cpu_clock_perf_counter_update(counter);
+}
+
+static void cpu_clock_perf_counter_read(struct perf_counter *counter)
+{
+ cpu_clock_perf_counter_update(counter);
+}
+
+static const struct pmu perf_ops_cpu_clock = {
+ .enable = cpu_clock_perf_counter_enable,
+ .disable = cpu_clock_perf_counter_disable,
+ .read = cpu_clock_perf_counter_read,
+};
+
+/*
+ * Software counter: task time clock
+ */
+
+static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
+{
+ u64 prev;
+ s64 delta;
+
+ prev = atomic64_xchg(&counter->hw.prev_count, now);
+ delta = now - prev;
+ atomic64_add(delta, &counter->count);
+}
+
+static int task_clock_perf_counter_enable(struct perf_counter *counter)
+{
+ struct hw_perf_counter *hwc = &counter->hw;
+ u64 now;
+
+ now = counter->ctx->time;
+
+ atomic64_set(&hwc->prev_count, now);
+ hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hwc->hrtimer.function = perf_swcounter_hrtimer;
+ if (hwc->sample_period) {
+ u64 period = max_t(u64, 10000, hwc->sample_period);
+ __hrtimer_start_range_ns(&hwc->hrtimer,
+ ns_to_ktime(period), 0,
+ HRTIMER_MODE_REL, 0);
+ }
+
+ return 0;
+}
+
+static void task_clock_perf_counter_disable(struct perf_counter *counter)
+{
+ if (counter->hw.sample_period)
+ hrtimer_cancel(&counter->hw.hrtimer);
+ task_clock_perf_counter_update(counter, counter->ctx->time);
+
+}
+
+static void task_clock_perf_counter_read(struct perf_counter *counter)
+{
+ u64 time;
+
+ if (!in_nmi()) {
+ update_context_time(counter->ctx);
+ time = counter->ctx->time;
+ } else {
+ u64 now = perf_clock();
+ u64 delta = now - counter->ctx->timestamp;
+ time = counter->ctx->time + delta;
+ }
+
+ task_clock_perf_counter_update(counter, time);
+}
+
+static const struct pmu perf_ops_task_clock = {
+ .enable = task_clock_perf_counter_enable,
+ .disable = task_clock_perf_counter_disable,
+ .read = task_clock_perf_counter_read,
+};
+
+#ifdef CONFIG_EVENT_PROFILE
+void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
+ int entry_size)
+{
+ struct perf_raw_record raw = {
+ .size = entry_size,
+ .data = record,
+ };
+
+ struct perf_sample_data data = {
+ .regs = get_irq_regs(),
+ .addr = addr,
+ .raw = &raw,
+ };
+
+ if (!data.regs)
+ data.regs = task_pt_regs(current);
+
+ do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data);
+}
+EXPORT_SYMBOL_GPL(perf_tpcounter_event);
+
+extern int ftrace_profile_enable(int);
+extern void ftrace_profile_disable(int);
+
+static void tp_perf_counter_destroy(struct perf_counter *counter)
+{
+ ftrace_profile_disable(counter->attr.config);
+}
+
+static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
+{
+ /*
+ * Raw tracepoint data is a severe data leak, only allow root to
+ * have these.
+ */
+ if ((counter->attr.sample_type & PERF_SAMPLE_RAW) &&
+ !capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ if (ftrace_profile_enable(counter->attr.config))
+ return NULL;
+
+ counter->destroy = tp_perf_counter_destroy;
+
+ return &perf_ops_generic;
+}
+#else
+static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
+{
+ return NULL;
+}
+#endif
+
+atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX];
+
+static void sw_perf_counter_destroy(struct perf_counter *counter)
+{
+ u64 event = counter->attr.config;
+
+ WARN_ON(counter->parent);
+
+ atomic_dec(&perf_swcounter_enabled[event]);
+}
+
+static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
+{
+ const struct pmu *pmu = NULL;
+ u64 event = counter->attr.config;
+
+ /*
+ * Software counters (currently) can't in general distinguish
+ * between user, kernel and hypervisor events.
+ * However, context switches and cpu migrations are considered
+ * to be kernel events, and page faults are never hypervisor
+ * events.
+ */
+ switch (event) {
+ case PERF_COUNT_SW_CPU_CLOCK:
+ pmu = &perf_ops_cpu_clock;
+
+ break;
+ case PERF_COUNT_SW_TASK_CLOCK:
+ /*
+ * If the user instantiates this as a per-cpu counter,
+ * use the cpu_clock counter instead.
+ */
+ if (counter->ctx->task)
+ pmu = &perf_ops_task_clock;
+ else
+ pmu = &perf_ops_cpu_clock;
+
+ break;
+ case PERF_COUNT_SW_PAGE_FAULTS:
+ case PERF_COUNT_SW_PAGE_FAULTS_MIN:
+ case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
+ case PERF_COUNT_SW_CONTEXT_SWITCHES:
+ case PERF_COUNT_SW_CPU_MIGRATIONS:
+ if (!counter->parent) {
+ atomic_inc(&perf_swcounter_enabled[event]);
+ counter->destroy = sw_perf_counter_destroy;
+ }
+ pmu = &perf_ops_generic;
+ break;
+ }
+
+ return pmu;
+}
+
+/*
+ * Allocate and initialize a counter structure
+ */
+static struct perf_counter *
+perf_counter_alloc(struct perf_counter_attr *attr,
+ int cpu,
+ struct perf_counter_context *ctx,
+ struct perf_counter *group_leader,
+ struct perf_counter *parent_counter,
+ gfp_t gfpflags)
+{
+ const struct pmu *pmu;
+ struct perf_counter *counter;
+ struct hw_perf_counter *hwc;
+ long err;
+
+ counter = kzalloc(sizeof(*counter), gfpflags);
+ if (!counter)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Single counters are their own group leaders, with an
+ * empty sibling list:
+ */
+ if (!group_leader)
+ group_leader = counter;
+
+ mutex_init(&counter->child_mutex);
+ INIT_LIST_HEAD(&counter->child_list);
+
+ INIT_LIST_HEAD(&counter->list_entry);
+ INIT_LIST_HEAD(&counter->event_entry);
+ INIT_LIST_HEAD(&counter->sibling_list);
+ init_waitqueue_head(&counter->waitq);
+
+ mutex_init(&counter->mmap_mutex);
+
+ counter->cpu = cpu;
+ counter->attr = *attr;
+ counter->group_leader = group_leader;
+ counter->pmu = NULL;
+ counter->ctx = ctx;
+ counter->oncpu = -1;
+
+ counter->parent = parent_counter;
+
+ counter->ns = get_pid_ns(current->nsproxy->pid_ns);
+ counter->id = atomic64_inc_return(&perf_counter_id);
+
+ counter->state = PERF_COUNTER_STATE_INACTIVE;
+
+ if (attr->disabled)
+ counter->state = PERF_COUNTER_STATE_OFF;
+
+ pmu = NULL;
+
+ hwc = &counter->hw;
+ hwc->sample_period = attr->sample_period;
+ if (attr->freq && attr->sample_freq)
+ hwc->sample_period = 1;
+
+ atomic64_set(&hwc->period_left, hwc->sample_period);
+
+ /*
+ * we currently do not support PERF_FORMAT_GROUP on inherited counters
+ */
+ if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
+ goto done;
+
+ switch (attr->type) {
+ case PERF_TYPE_RAW:
+ case PERF_TYPE_HARDWARE:
+ case PERF_TYPE_HW_CACHE:
+ pmu = hw_perf_counter_init(counter);
+ break;
+
+ case PERF_TYPE_SOFTWARE:
+ pmu = sw_perf_counter_init(counter);
+ break;
+
+ case PERF_TYPE_TRACEPOINT:
+ pmu = tp_perf_counter_init(counter);
+ break;
+
+ default:
+ break;
+ }
+done:
+ err = 0;
+ if (!pmu)
+ err = -EINVAL;
+ else if (IS_ERR(pmu))
+ err = PTR_ERR(pmu);
+
+ if (err) {
+ if (counter->ns)
+ put_pid_ns(counter->ns);
+ kfree(counter);
+ return ERR_PTR(err);
+ }
+
+ counter->pmu = pmu;
+
+ if (!counter->parent) {
+ atomic_inc(&nr_counters);
+ if (counter->attr.mmap)
+ atomic_inc(&nr_mmap_counters);
+ if (counter->attr.comm)
+ atomic_inc(&nr_comm_counters);
+ if (counter->attr.task)
+ atomic_inc(&nr_task_counters);
+ }
+
+ return counter;
+}
+
+static int perf_copy_attr(struct perf_counter_attr __user *uattr,
+ struct perf_counter_attr *attr)
+{
+ int ret;
+ u32 size;
+
+ if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
+ return -EFAULT;
+
+ /*
+ * zero the full structure, so that a short copy will be nice.
+ */
+ memset(attr, 0, sizeof(*attr));
+
+ ret = get_user(size, &uattr->size);
+ if (ret)
+ return ret;
+
+ if (size > PAGE_SIZE) /* silly large */
+ goto err_size;
+
+ if (!size) /* abi compat */
+ size = PERF_ATTR_SIZE_VER0;
+
+ if (size < PERF_ATTR_SIZE_VER0)
+ goto err_size;
+
+ /*
+ * If we're handed a bigger struct than we know of,
+ * ensure all the unknown bits are 0.
+ */
+ if (size > sizeof(*attr)) {
+ unsigned long val;
+ unsigned long __user *addr;
+ unsigned long __user *end;
+
+ addr = PTR_ALIGN((void __user *)uattr + sizeof(*attr),
+ sizeof(unsigned long));
+ end = PTR_ALIGN((void __user *)uattr + size,
+ sizeof(unsigned long));
+
+ for (; addr < end; addr += sizeof(unsigned long)) {
+ ret = get_user(val, addr);
+ if (ret)
+ return ret;
+ if (val)
+ goto err_size;
+ }
+ }
+
+ ret = copy_from_user(attr, uattr, size);
+ if (ret)
+ return -EFAULT;
+
+ /*
+ * If the type exists, the corresponding creation will verify
+ * the attr->config.
+ */
+ if (attr->type >= PERF_TYPE_MAX)
+ return -EINVAL;
+
+ if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
+ return -EINVAL;
+
+ if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
+ return -EINVAL;
+
+ if (attr->read_format & ~(PERF_FORMAT_MAX-1))
+ return -EINVAL;
+
+out:
+ return ret;
+
+err_size:
+ put_user(sizeof(*attr), &uattr->size);
+ ret = -E2BIG;
+ goto out;
+}
+
+/**
+ * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
+ *
+ * @attr_uptr: event type attributes for monitoring/sampling
+ * @pid: target pid
+ * @cpu: target cpu
+ * @group_fd: group leader counter fd
+ */
+SYSCALL_DEFINE5(perf_counter_open,
+ struct perf_counter_attr __user *, attr_uptr,
+ pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
+{
+ struct perf_counter *counter, *group_leader;
+ struct perf_counter_attr attr;
+ struct perf_counter_context *ctx;
+ struct file *counter_file = NULL;
+ struct file *group_file = NULL;
+ int fput_needed = 0;
+ int fput_needed2 = 0;
+ int ret;
+
+ /* for future expandability... */
+ if (flags)
+ return -EINVAL;
+
+ ret = perf_copy_attr(attr_uptr, &attr);
+ if (ret)
+ return ret;
+
+ if (!attr.exclude_kernel) {
+ if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+ return -EACCES;
+ }
+
+ if (attr.freq) {
+ if (attr.sample_freq > sysctl_perf_counter_sample_rate)
+ return -EINVAL;
+ }
+
+ /*
+ * Get the target context (task or percpu):
+ */
+ ctx = find_get_context(pid, cpu);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ /*
+ * Look up the group leader (we will attach this counter to it):
+ */
+ group_leader = NULL;
+ if (group_fd != -1) {
+ ret = -EINVAL;
+ group_file = fget_light(group_fd, &fput_needed);
+ if (!group_file)
+ goto err_put_context;
+ if (group_file->f_op != &perf_fops)
+ goto err_put_context;
+
+ group_leader = group_file->private_data;
+ /*
+ * Do not allow a recursive hierarchy (this new sibling
+ * becoming part of another group-sibling):
+ */
+ if (group_leader->group_leader != group_leader)
+ goto err_put_context;
+ /*
+ * Do not allow to attach to a group in a different
+ * task or CPU context:
+ */
+ if (group_leader->ctx != ctx)
+ goto err_put_context;
+ /*
+ * Only a group leader can be exclusive or pinned
+ */
+ if (attr.exclusive || attr.pinned)
+ goto err_put_context;
+ }
+
+ counter = perf_counter_alloc(&attr, cpu, ctx, group_leader,
+ NULL, GFP_KERNEL);
+ ret = PTR_ERR(counter);
+ if (IS_ERR(counter))
+ goto err_put_context;
+
+ ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
+ if (ret < 0)
+ goto err_free_put_context;
+
+ counter_file = fget_light(ret, &fput_needed2);
+ if (!counter_file)
+ goto err_free_put_context;
+
+ counter->filp = counter_file;
+ WARN_ON_ONCE(ctx->parent_ctx);
+ mutex_lock(&ctx->mutex);
+ perf_install_in_context(ctx, counter, cpu);
+ ++ctx->generation;
+ mutex_unlock(&ctx->mutex);
+
+ counter->owner = current;
+ get_task_struct(current);
+ mutex_lock(&current->perf_counter_mutex);
+ list_add_tail(&counter->owner_entry, &current->perf_counter_list);
+ mutex_unlock(&current->perf_counter_mutex);
+
+ fput_light(counter_file, fput_needed2);
+
+out_fput:
+ fput_light(group_file, fput_needed);
+
+ return ret;
+
+err_free_put_context:
+ kfree(counter);
+
+err_put_context:
+ put_ctx(ctx);
+
+ goto out_fput;
+}
+
+/*
+ * inherit a counter from parent task to child task:
+ */
+static struct perf_counter *
+inherit_counter(struct perf_counter *parent_counter,
+ struct task_struct *parent,
+ struct perf_counter_context *parent_ctx,
+ struct task_struct *child,
+ struct perf_counter *group_leader,
+ struct perf_counter_context *child_ctx)
+{
+ struct perf_counter *child_counter;
+
+ /*
+ * Instead of creating recursive hierarchies of counters,
+ * we link inherited counters back to the original parent,
+ * which has a filp for sure, which we use as the reference
+ * count:
+ */
+ if (parent_counter->parent)
+ parent_counter = parent_counter->parent;
+
+ child_counter = perf_counter_alloc(&parent_counter->attr,
+ parent_counter->cpu, child_ctx,
+ group_leader, parent_counter,
+ GFP_KERNEL);
+ if (IS_ERR(child_counter))
+ return child_counter;
+ get_ctx(child_ctx);
+
+ /*
+ * Make the child state follow the state of the parent counter,
+ * not its attr.disabled bit. We hold the parent's mutex,
+ * so we won't race with perf_counter_{en, dis}able_family.
+ */
+ if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
+ child_counter->state = PERF_COUNTER_STATE_INACTIVE;
+ else
+ child_counter->state = PERF_COUNTER_STATE_OFF;
+
+ if (parent_counter->attr.freq)
+ child_counter->hw.sample_period = parent_counter->hw.sample_period;
+
+ /*
+ * Link it up in the child's context:
+ */
+ add_counter_to_ctx(child_counter, child_ctx);
+
+ /*
+ * Get a reference to the parent filp - we will fput it
+ * when the child counter exits. This is safe to do because
+ * we are in the parent and we know that the filp still
+ * exists and has a nonzero count:
+ */
+ atomic_long_inc(&parent_counter->filp->f_count);
+
+ /*
+ * Link this into the parent counter's child list
+ */
+ WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
+ mutex_lock(&parent_counter->child_mutex);
+ list_add_tail(&child_counter->child_list, &parent_counter->child_list);
+ mutex_unlock(&parent_counter->child_mutex);
+
+ return child_counter;
+}
+
+static int inherit_group(struct perf_counter *parent_counter,
+ struct task_struct *parent,
+ struct perf_counter_context *parent_ctx,
+ struct task_struct *child,
+ struct perf_counter_context *child_ctx)
+{
+ struct perf_counter *leader;
+ struct perf_counter *sub;
+ struct perf_counter *child_ctr;
+
+ leader = inherit_counter(parent_counter, parent, parent_ctx,
+ child, NULL, child_ctx);
+ if (IS_ERR(leader))
+ return PTR_ERR(leader);
+ list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
+ child_ctr = inherit_counter(sub, parent, parent_ctx,
+ child, leader, child_ctx);
+ if (IS_ERR(child_ctr))
+ return PTR_ERR(child_ctr);
+ }
+ return 0;
+}
+
+static void sync_child_counter(struct perf_counter *child_counter,
+ struct task_struct *child)
+{
+ struct perf_counter *parent_counter = child_counter->parent;
+ u64 child_val;
+
+ if (child_counter->attr.inherit_stat)
+ perf_counter_read_event(child_counter, child);
+
+ child_val = atomic64_read(&child_counter->count);
+
+ /*
+ * Add back the child's count to the parent's count:
+ */
+ atomic64_add(child_val, &parent_counter->count);
+ atomic64_add(child_counter->total_time_enabled,
+ &parent_counter->child_total_time_enabled);
+ atomic64_add(child_counter->total_time_running,
+ &parent_counter->child_total_time_running);
+
+ /*
+ * Remove this counter from the parent's list
+ */
+ WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
+ mutex_lock(&parent_counter->child_mutex);
+ list_del_init(&child_counter->child_list);
+ mutex_unlock(&parent_counter->child_mutex);
+
+ /*
+ * Release the parent counter, if this was the last
+ * reference to it.
+ */
+ fput(parent_counter->filp);
+}
+
+static void
+__perf_counter_exit_task(struct perf_counter *child_counter,
+ struct perf_counter_context *child_ctx,
+ struct task_struct *child)
+{
+ struct perf_counter *parent_counter;
+
+ update_counter_times(child_counter);
+ perf_counter_remove_from_context(child_counter);
+
+ parent_counter = child_counter->parent;
+ /*
+ * It can happen that parent exits first, and has counters
+ * that are still around due to the child reference. These
+ * counters need to be zapped - but otherwise linger.
+ */
+ if (parent_counter) {
+ sync_child_counter(child_counter, child);
+ free_counter(child_counter);
+ }
+}
+
+/*
+ * When a child task exits, feed back counter values to parent counters.
+ */
+void perf_counter_exit_task(struct task_struct *child)
+{
+ struct perf_counter *child_counter, *tmp;
+ struct perf_counter_context *child_ctx;
+ unsigned long flags;
+
+ if (likely(!child->perf_counter_ctxp)) {
+ perf_counter_task(child, NULL, 0);
+ return;
+ }
+
+ local_irq_save(flags);
+ /*
+ * We can't reschedule here because interrupts are disabled,
+ * and either child is current or it is a task that can't be
+ * scheduled, so we are now safe from rescheduling changing
+ * our context.
+ */
+ child_ctx = child->perf_counter_ctxp;
+ __perf_counter_task_sched_out(child_ctx);
+
+ /*
+ * Take the context lock here so that if find_get_context is
+ * reading child->perf_counter_ctxp, we wait until it has
+ * incremented the context's refcount before we do put_ctx below.
+ */
+ spin_lock(&child_ctx->lock);
+ child->perf_counter_ctxp = NULL;
+ /*
+ * If this context is a clone; unclone it so it can't get
+ * swapped to another process while we're removing all
+ * the counters from it.
+ */
+ unclone_ctx(child_ctx);
+ spin_unlock_irqrestore(&child_ctx->lock, flags);
+
+ /*
+ * Report the task dead after unscheduling the counters so that we
+ * won't get any samples after PERF_EVENT_EXIT. We can however still
+ * get a few PERF_EVENT_READ events.
+ */
+ perf_counter_task(child, child_ctx, 0);
+
+ /*
+ * We can recurse on the same lock type through:
+ *
+ * __perf_counter_exit_task()
+ * sync_child_counter()
+ * fput(parent_counter->filp)
+ * perf_release()
+ * mutex_lock(&ctx->mutex)
+ *
+ * But since its the parent context it won't be the same instance.
+ */
+ mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
+
+again:
+ list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
+ list_entry)
+ __perf_counter_exit_task(child_counter, child_ctx, child);
+
+ /*
+ * If the last counter was a group counter, it will have appended all
+ * its siblings to the list, but we obtained 'tmp' before that which
+ * will still point to the list head terminating the iteration.
+ */
+ if (!list_empty(&child_ctx->counter_list))
+ goto again;
+
+ mutex_unlock(&child_ctx->mutex);
+
+ put_ctx(child_ctx);
+}
+
+/*
+ * free an unexposed, unused context as created by inheritance by
+ * init_task below, used by fork() in case of fail.
+ */
+void perf_counter_free_task(struct task_struct *task)
+{
+ struct perf_counter_context *ctx = task->perf_counter_ctxp;
+ struct perf_counter *counter, *tmp;
+
+ if (!ctx)
+ return;
+
+ mutex_lock(&ctx->mutex);
+again:
+ list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) {
+ struct perf_counter *parent = counter->parent;
+
+ if (WARN_ON_ONCE(!parent))
+ continue;
+
+ mutex_lock(&parent->child_mutex);
+ list_del_init(&counter->child_list);
+ mutex_unlock(&parent->child_mutex);
+
+ fput(parent->filp);
+
+ list_del_counter(counter, ctx);
+ free_counter(counter);
+ }
+
+ if (!list_empty(&ctx->counter_list))
+ goto again;
+
+ mutex_unlock(&ctx->mutex);
+
+ put_ctx(ctx);
+}
+
+/*
+ * Initialize the perf_counter context in task_struct
+ */
+int perf_counter_init_task(struct task_struct *child)
+{
+ struct perf_counter_context *child_ctx, *parent_ctx;
+ struct perf_counter_context *cloned_ctx;
+ struct perf_counter *counter;
+ struct task_struct *parent = current;
+ int inherited_all = 1;
+ int ret = 0;
+
+ child->perf_counter_ctxp = NULL;
+
+ mutex_init(&child->perf_counter_mutex);
+ INIT_LIST_HEAD(&child->perf_counter_list);
+
+ if (likely(!parent->perf_counter_ctxp))
+ return 0;
+
+ /*
+ * This is executed from the parent task context, so inherit
+ * counters that have been marked for cloning.
+ * First allocate and initialize a context for the child.
+ */
+
+ child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
+ if (!child_ctx)
+ return -ENOMEM;
+
+ __perf_counter_init_context(child_ctx, child);
+ child->perf_counter_ctxp = child_ctx;
+ get_task_struct(child);
+
+ /*
+ * If the parent's context is a clone, pin it so it won't get
+ * swapped under us.
+ */
+ parent_ctx = perf_pin_task_context(parent);
+
+ /*
+ * No need to check if parent_ctx != NULL here; since we saw
+ * it non-NULL earlier, the only reason for it to become NULL
+ * is if we exit, and since we're currently in the middle of
+ * a fork we can't be exiting at the same time.
+ */
+
+ /*
+ * Lock the parent list. No need to lock the child - not PID
+ * hashed yet and not running, so nobody can access it.
+ */
+ mutex_lock(&parent_ctx->mutex);
+
+ /*
+ * We dont have to disable NMIs - we are only looking at
+ * the list, not manipulating it:
+ */
+ list_for_each_entry_rcu(counter, &parent_ctx->event_list, event_entry) {
+ if (counter != counter->group_leader)
+ continue;
+
+ if (!counter->attr.inherit) {
+ inherited_all = 0;
+ continue;
+ }
+
+ ret = inherit_group(counter, parent, parent_ctx,
+ child, child_ctx);
+ if (ret) {
+ inherited_all = 0;
+ break;
+ }
+ }
+
+ if (inherited_all) {
+ /*
+ * Mark the child context as a clone of the parent
+ * context, or of whatever the parent is a clone of.
+ * Note that if the parent is a clone, it could get
+ * uncloned at any point, but that doesn't matter
+ * because the list of counters and the generation
+ * count can't have changed since we took the mutex.
+ */
+ cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
+ if (cloned_ctx) {
+ child_ctx->parent_ctx = cloned_ctx;
+ child_ctx->parent_gen = parent_ctx->parent_gen;
+ } else {
+ child_ctx->parent_ctx = parent_ctx;
+ child_ctx->parent_gen = parent_ctx->generation;
+ }
+ get_ctx(child_ctx->parent_ctx);
+ }
+
+ mutex_unlock(&parent_ctx->mutex);
+
+ perf_unpin_context(parent_ctx);
+
+ return ret;
+}
+
+static void __cpuinit perf_counter_init_cpu(int cpu)
+{
+ struct perf_cpu_context *cpuctx;
+
+ cpuctx = &per_cpu(perf_cpu_context, cpu);
+ __perf_counter_init_context(&cpuctx->ctx, NULL);
+
+ spin_lock(&perf_resource_lock);
+ cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
+ spin_unlock(&perf_resource_lock);
+
+ hw_perf_counter_setup(cpu);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void __perf_counter_exit_cpu(void *info)
+{
+ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+ struct perf_counter_context *ctx = &cpuctx->ctx;
+ struct perf_counter *counter, *tmp;
+
+ list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
+ __perf_counter_remove_from_context(counter);
+}
+static void perf_counter_exit_cpu(int cpu)
+{
+ struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+ struct perf_counter_context *ctx = &cpuctx->ctx;
+
+ mutex_lock(&ctx->mutex);
+ smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
+ mutex_unlock(&ctx->mutex);
+}
+#else
+static inline void perf_counter_exit_cpu(int cpu) { }
+#endif
+
+static int __cpuinit
+perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (long)hcpu;
+
+ switch (action) {
+
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ perf_counter_init_cpu(cpu);
+ break;
+
+ case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
+ hw_perf_counter_setup_online(cpu);
+ break;
+
+ case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
+ perf_counter_exit_cpu(cpu);
+ break;
+
+ default:
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+/*
+ * This has to have a higher priority than migration_notifier in sched.c.
+ */
+static struct notifier_block __cpuinitdata perf_cpu_nb = {
+ .notifier_call = perf_cpu_notify,
+ .priority = 20,
+};
+
+void __init perf_counter_init(void)
+{
+ perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
+ (void *)(long)smp_processor_id());
+ perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
+ (void *)(long)smp_processor_id());
+ register_cpu_notifier(&perf_cpu_nb);
+}
+
+static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
+{
+ return sprintf(buf, "%d\n", perf_reserved_percpu);
+}
+
+static ssize_t
+perf_set_reserve_percpu(struct sysdev_class *class,
+ const char *buf,
+ size_t count)
+{
+ struct perf_cpu_context *cpuctx;
+ unsigned long val;
+ int err, cpu, mpt;
+
+ err = strict_strtoul(buf, 10, &val);
+ if (err)
+ return err;
+ if (val > perf_max_counters)
+ return -EINVAL;
+
+ spin_lock(&perf_resource_lock);
+ perf_reserved_percpu = val;
+ for_each_online_cpu(cpu) {
+ cpuctx = &per_cpu(perf_cpu_context, cpu);
+ spin_lock_irq(&cpuctx->ctx.lock);
+ mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
+ perf_max_counters - perf_reserved_percpu);
+ cpuctx->max_pertask = mpt;
+ spin_unlock_irq(&cpuctx->ctx.lock);
+ }
+ spin_unlock(&perf_resource_lock);
+
+ return count;
+}
+
+static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
+{
+ return sprintf(buf, "%d\n", perf_overcommit);
+}
+
+static ssize_t
+perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
+{
+ unsigned long val;
+ int err;
+
+ err = strict_strtoul(buf, 10, &val);
+ if (err)
+ return err;
+ if (val > 1)
+ return -EINVAL;
+
+ spin_lock(&perf_resource_lock);
+ perf_overcommit = val;
+ spin_unlock(&perf_resource_lock);
+
+ return count;
+}
+
+static SYSDEV_CLASS_ATTR(
+ reserve_percpu,
+ 0644,
+ perf_show_reserve_percpu,
+ perf_set_reserve_percpu
+ );
+
+static SYSDEV_CLASS_ATTR(
+ overcommit,
+ 0644,
+ perf_show_overcommit,
+ perf_set_overcommit
+ );
+
+static struct attribute *perfclass_attrs[] = {
+ &attr_reserve_percpu.attr,
+ &attr_overcommit.attr,
+ NULL
+};
+
+static struct attribute_group perfclass_attr_group = {
+ .attrs = perfclass_attrs,
+ .name = "perf_counters",
+};
+
+static int __init perf_counter_sysfs_init(void)
+{
+ return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
+ &perfclass_attr_group);
+}
+device_initcall(perf_counter_sysfs_init);
diff --git a/kernel/pid.c b/kernel/pid.c
index b2e5f78fd28..31310b5d3f5 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -378,26 +378,15 @@ EXPORT_SYMBOL(pid_task);
/*
* Must be called under rcu_read_lock() or with tasklist_lock read-held.
*/
-struct task_struct *find_task_by_pid_type_ns(int type, int nr,
- struct pid_namespace *ns)
+struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
{
- return pid_task(find_pid_ns(nr, ns), type);
+ return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
}
-EXPORT_SYMBOL(find_task_by_pid_type_ns);
-
struct task_struct *find_task_by_vpid(pid_t vnr)
{
- return find_task_by_pid_type_ns(PIDTYPE_PID, vnr,
- current->nsproxy->pid_ns);
-}
-EXPORT_SYMBOL(find_task_by_vpid);
-
-struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
-{
- return find_task_by_pid_type_ns(PIDTYPE_PID, nr, ns);
+ return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns);
}
-EXPORT_SYMBOL(find_task_by_pid_ns);
struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
{
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 2d1001b4858..821722ae58a 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -67,9 +67,10 @@ err_alloc:
return NULL;
}
-static struct pid_namespace *create_pid_namespace(unsigned int level)
+static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns)
{
struct pid_namespace *ns;
+ unsigned int level = parent_pid_ns->level + 1;
int i;
ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
@@ -86,6 +87,7 @@ static struct pid_namespace *create_pid_namespace(unsigned int level)
kref_init(&ns->kref);
ns->level = level;
+ ns->parent = get_pid_ns(parent_pid_ns);
set_bit(0, ns->pidmap[0].page);
atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
@@ -114,25 +116,11 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
{
- struct pid_namespace *new_ns;
-
- BUG_ON(!old_ns);
- new_ns = get_pid_ns(old_ns);
if (!(flags & CLONE_NEWPID))
- goto out;
-
- new_ns = ERR_PTR(-EINVAL);
+ return get_pid_ns(old_ns);
if (flags & CLONE_THREAD)
- goto out_put;
-
- new_ns = create_pid_namespace(old_ns->level + 1);
- if (!IS_ERR(new_ns))
- new_ns->parent = get_pid_ns(old_ns);
-
-out_put:
- put_pid_ns(old_ns);
-out:
- return new_ns;
+ return ERR_PTR(-EINVAL);
+ return create_pid_namespace(old_ns);
}
void free_pid_ns(struct kref *kref)
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index bece7c0b67b..e33a21cb940 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -521,11 +521,12 @@ void posix_cpu_timers_exit(struct task_struct *tsk)
}
void posix_cpu_timers_exit_group(struct task_struct *tsk)
{
- struct task_cputime cputime;
+ struct signal_struct *const sig = tsk->signal;
- thread_group_cputimer(tsk, &cputime);
cleanup_timers(tsk->signal->cpu_timers,
- cputime.utime, cputime.stime, cputime.sum_exec_runtime);
+ cputime_add(tsk->utime, sig->utime),
+ cputime_add(tsk->stime, sig->stime),
+ tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
}
static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 052ec4d195c..d089d052c4a 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -202,6 +202,12 @@ static int no_timer_create(struct k_itimer *new_timer)
return -EOPNOTSUPP;
}
+static int no_nsleep(const clockid_t which_clock, int flags,
+ struct timespec *tsave, struct timespec __user *rmtp)
+{
+ return -EOPNOTSUPP;
+}
+
/*
* Return nonzero if we know a priori this clockid_t value is bogus.
*/
@@ -254,6 +260,7 @@ static __init int init_posix_timers(void)
.clock_get = posix_get_monotonic_raw,
.clock_set = do_posix_clock_nosettime,
.timer_create = no_timer_create,
+ .nsleep = no_nsleep,
};
register_posix_clock(CLOCK_REALTIME, &clock_realtime);
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 23bd4daeb96..72067cbdb37 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -116,9 +116,13 @@ config SUSPEND_FREEZER
Turning OFF this setting is NOT recommended! If in doubt, say Y.
+config HIBERNATION_NVS
+ bool
+
config HIBERNATION
bool "Hibernation (aka 'suspend to disk')"
depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE
+ select HIBERNATION_NVS if HAS_IOMEM
---help---
Enable the suspend to disk (STD) functionality, which is usually
called "hibernation" in user interfaces. STD checkpoints the
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 720ea4f781b..c3b81c30e5d 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -6,6 +6,9 @@ endif
obj-$(CONFIG_PM) += main.o
obj-$(CONFIG_PM_SLEEP) += console.o
obj-$(CONFIG_FREEZER) += process.o
-obj-$(CONFIG_HIBERNATION) += swsusp.o disk.o snapshot.o swap.o user.o
+obj-$(CONFIG_SUSPEND) += suspend.o
+obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
+obj-$(CONFIG_HIBERNATION) += swsusp.o hibernate.o snapshot.o swap.o user.o
+obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o
obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/disk.c b/kernel/power/hibernate.c
index 5cb080e7eeb..81d2e746489 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/hibernate.c
@@ -1,12 +1,12 @@
/*
- * kernel/power/disk.c - Suspend-to-disk support.
+ * kernel/power/hibernate.c - Hibernation (a.k.a suspend-to-disk) support.
*
* Copyright (c) 2003 Patrick Mochel
* Copyright (c) 2003 Open Source Development Lab
* Copyright (c) 2004 Pavel Machek <pavel@suse.cz>
+ * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc.
*
* This file is released under the GPLv2.
- *
*/
#include <linux/suspend.h>
@@ -215,13 +215,13 @@ static int create_image(int platform_mode)
if (error)
return error;
- /* At this point, device_suspend() has been called, but *not*
- * device_power_down(). We *must* call device_power_down() now.
+ /* At this point, dpm_suspend_start() has been called, but *not*
+ * dpm_suspend_noirq(). We *must* call dpm_suspend_noirq() now.
* Otherwise, drivers for some devices (e.g. interrupt controllers)
* become desynchronized with the actual state of the hardware
* at resume time, and evil weirdness ensues.
*/
- error = device_power_down(PMSG_FREEZE);
+ error = dpm_suspend_noirq(PMSG_FREEZE);
if (error) {
printk(KERN_ERR "PM: Some devices failed to power down, "
"aborting hibernation\n");
@@ -262,7 +262,7 @@ static int create_image(int platform_mode)
Power_up:
sysdev_resume();
- /* NOTE: device_power_up() is just a resume() for devices
+ /* NOTE: dpm_resume_noirq() is just a resume() for devices
* that suspended with irqs off ... no overall powerup.
*/
@@ -275,7 +275,7 @@ static int create_image(int platform_mode)
Platform_finish:
platform_finish(platform_mode);
- device_power_up(in_suspend ?
+ dpm_resume_noirq(in_suspend ?
(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
return error;
@@ -304,7 +304,7 @@ int hibernation_snapshot(int platform_mode)
goto Close;
suspend_console();
- error = device_suspend(PMSG_FREEZE);
+ error = dpm_suspend_start(PMSG_FREEZE);
if (error)
goto Recover_platform;
@@ -315,7 +315,7 @@ int hibernation_snapshot(int platform_mode)
/* Control returns here after successful restore */
Resume_devices:
- device_resume(in_suspend ?
+ dpm_resume_end(in_suspend ?
(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
resume_console();
Close:
@@ -339,7 +339,7 @@ static int resume_target_kernel(bool platform_mode)
{
int error;
- error = device_power_down(PMSG_QUIESCE);
+ error = dpm_suspend_noirq(PMSG_QUIESCE);
if (error) {
printk(KERN_ERR "PM: Some devices failed to power down, "
"aborting resume\n");
@@ -394,7 +394,7 @@ static int resume_target_kernel(bool platform_mode)
Cleanup:
platform_restore_cleanup(platform_mode);
- device_power_up(PMSG_RECOVER);
+ dpm_resume_noirq(PMSG_RECOVER);
return error;
}
@@ -414,10 +414,10 @@ int hibernation_restore(int platform_mode)
pm_prepare_console();
suspend_console();
- error = device_suspend(PMSG_QUIESCE);
+ error = dpm_suspend_start(PMSG_QUIESCE);
if (!error) {
error = resume_target_kernel(platform_mode);
- device_resume(PMSG_RECOVER);
+ dpm_resume_end(PMSG_RECOVER);
}
resume_console();
pm_restore_console();
@@ -447,14 +447,14 @@ int hibernation_platform_enter(void)
entering_platform_hibernation = true;
suspend_console();
- error = device_suspend(PMSG_HIBERNATE);
+ error = dpm_suspend_start(PMSG_HIBERNATE);
if (error) {
if (hibernation_ops->recover)
hibernation_ops->recover();
goto Resume_devices;
}
- error = device_power_down(PMSG_HIBERNATE);
+ error = dpm_suspend_noirq(PMSG_HIBERNATE);
if (error)
goto Resume_devices;
@@ -479,11 +479,11 @@ int hibernation_platform_enter(void)
Platofrm_finish:
hibernation_ops->finish();
- device_power_up(PMSG_RESTORE);
+ dpm_suspend_noirq(PMSG_RESTORE);
Resume_devices:
entering_platform_hibernation = false;
- device_resume(PMSG_RESTORE);
+ dpm_resume_end(PMSG_RESTORE);
resume_console();
Close:
diff --git a/kernel/power/hibernate_nvs.c b/kernel/power/hibernate_nvs.c
new file mode 100644
index 00000000000..39ac698ef83
--- /dev/null
+++ b/kernel/power/hibernate_nvs.c
@@ -0,0 +1,135 @@
+/*
+ * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory
+ *
+ * Copyright (C) 2008,2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/suspend.h>
+
+/*
+ * Platforms, like ACPI, may want us to save some memory used by them during
+ * hibernation and to restore the contents of this memory during the subsequent
+ * resume. The code below implements a mechanism allowing us to do that.
+ */
+
+struct nvs_page {
+ unsigned long phys_start;
+ unsigned int size;
+ void *kaddr;
+ void *data;
+ struct list_head node;
+};
+
+static LIST_HEAD(nvs_list);
+
+/**
+ * hibernate_nvs_register - register platform NVS memory region to save
+ * @start - physical address of the region
+ * @size - size of the region
+ *
+ * The NVS region need not be page-aligned (both ends) and we arrange
+ * things so that the data from page-aligned addresses in this region will
+ * be copied into separate RAM pages.
+ */
+int hibernate_nvs_register(unsigned long start, unsigned long size)
+{
+ struct nvs_page *entry, *next;
+
+ while (size > 0) {
+ unsigned int nr_bytes;
+
+ entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL);
+ if (!entry)
+ goto Error;
+
+ list_add_tail(&entry->node, &nvs_list);
+ entry->phys_start = start;
+ nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK);
+ entry->size = (size < nr_bytes) ? size : nr_bytes;
+
+ start += entry->size;
+ size -= entry->size;
+ }
+ return 0;
+
+ Error:
+ list_for_each_entry_safe(entry, next, &nvs_list, node) {
+ list_del(&entry->node);
+ kfree(entry);
+ }
+ return -ENOMEM;
+}
+
+/**
+ * hibernate_nvs_free - free data pages allocated for saving NVS regions
+ */
+void hibernate_nvs_free(void)
+{
+ struct nvs_page *entry;
+
+ list_for_each_entry(entry, &nvs_list, node)
+ if (entry->data) {
+ free_page((unsigned long)entry->data);
+ entry->data = NULL;
+ if (entry->kaddr) {
+ iounmap(entry->kaddr);
+ entry->kaddr = NULL;
+ }
+ }
+}
+
+/**
+ * hibernate_nvs_alloc - allocate memory necessary for saving NVS regions
+ */
+int hibernate_nvs_alloc(void)
+{
+ struct nvs_page *entry;
+
+ list_for_each_entry(entry, &nvs_list, node) {
+ entry->data = (void *)__get_free_page(GFP_KERNEL);
+ if (!entry->data) {
+ hibernate_nvs_free();
+ return -ENOMEM;
+ }
+ }
+ return 0;
+}
+
+/**
+ * hibernate_nvs_save - save NVS memory regions
+ */
+void hibernate_nvs_save(void)
+{
+ struct nvs_page *entry;
+
+ printk(KERN_INFO "PM: Saving platform NVS memory\n");
+
+ list_for_each_entry(entry, &nvs_list, node)
+ if (entry->data) {
+ entry->kaddr = ioremap(entry->phys_start, entry->size);
+ memcpy(entry->data, entry->kaddr, entry->size);
+ }
+}
+
+/**
+ * hibernate_nvs_restore - restore NVS memory regions
+ *
+ * This function is going to be called with interrupts disabled, so it
+ * cannot iounmap the virtual addresses used to access the NVS region.
+ */
+void hibernate_nvs_restore(void)
+{
+ struct nvs_page *entry;
+
+ printk(KERN_INFO "PM: Restoring platform NVS memory\n");
+
+ list_for_each_entry(entry, &nvs_list, node)
+ if (entry->data)
+ memcpy(entry->kaddr, entry->data, entry->size);
+}
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 868028280d1..f710e36930c 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -8,20 +8,9 @@
*
*/
-#include <linux/module.h>
-#include <linux/suspend.h>
#include <linux/kobject.h>
#include <linux/string.h>
-#include <linux/delay.h>
-#include <linux/errno.h>
-#include <linux/kmod.h>
-#include <linux/init.h>
-#include <linux/console.h>
-#include <linux/cpu.h>
#include <linux/resume-trace.h>
-#include <linux/freezer.h>
-#include <linux/vmstat.h>
-#include <linux/syscalls.h>
#include "power.h"
@@ -119,373 +108,6 @@ power_attr(pm_test);
#endif /* CONFIG_PM_SLEEP */
-#ifdef CONFIG_SUSPEND
-
-static int suspend_test(int level)
-{
-#ifdef CONFIG_PM_DEBUG
- if (pm_test_level == level) {
- printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n");
- mdelay(5000);
- return 1;
- }
-#endif /* !CONFIG_PM_DEBUG */
- return 0;
-}
-
-#ifdef CONFIG_PM_TEST_SUSPEND
-
-/*
- * We test the system suspend code by setting an RTC wakealarm a short
- * time in the future, then suspending. Suspending the devices won't
- * normally take long ... some systems only need a few milliseconds.
- *
- * The time it takes is system-specific though, so when we test this
- * during system bootup we allow a LOT of time.
- */
-#define TEST_SUSPEND_SECONDS 5
-
-static unsigned long suspend_test_start_time;
-
-static void suspend_test_start(void)
-{
- /* FIXME Use better timebase than "jiffies", ideally a clocksource.
- * What we want is a hardware counter that will work correctly even
- * during the irqs-are-off stages of the suspend/resume cycle...
- */
- suspend_test_start_time = jiffies;
-}
-
-static void suspend_test_finish(const char *label)
-{
- long nj = jiffies - suspend_test_start_time;
- unsigned msec;
-
- msec = jiffies_to_msecs(abs(nj));
- pr_info("PM: %s took %d.%03d seconds\n", label,
- msec / 1000, msec % 1000);
-
- /* Warning on suspend means the RTC alarm period needs to be
- * larger -- the system was sooo slooowwww to suspend that the
- * alarm (should have) fired before the system went to sleep!
- *
- * Warning on either suspend or resume also means the system
- * has some performance issues. The stack dump of a WARN_ON
- * is more likely to get the right attention than a printk...
- */
- WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label);
-}
-
-#else
-
-static void suspend_test_start(void)
-{
-}
-
-static void suspend_test_finish(const char *label)
-{
-}
-
-#endif
-
-/* This is just an arbitrary number */
-#define FREE_PAGE_NUMBER (100)
-
-static struct platform_suspend_ops *suspend_ops;
-
-/**
- * suspend_set_ops - Set the global suspend method table.
- * @ops: Pointer to ops structure.
- */
-
-void suspend_set_ops(struct platform_suspend_ops *ops)
-{
- mutex_lock(&pm_mutex);
- suspend_ops = ops;
- mutex_unlock(&pm_mutex);
-}
-
-/**
- * suspend_valid_only_mem - generic memory-only valid callback
- *
- * Platform drivers that implement mem suspend only and only need
- * to check for that in their .valid callback can use this instead
- * of rolling their own .valid callback.
- */
-int suspend_valid_only_mem(suspend_state_t state)
-{
- return state == PM_SUSPEND_MEM;
-}
-
-/**
- * suspend_prepare - Do prep work before entering low-power state.
- *
- * This is common code that is called for each state that we're entering.
- * Run suspend notifiers, allocate a console and stop all processes.
- */
-static int suspend_prepare(void)
-{
- int error;
- unsigned int free_pages;
-
- if (!suspend_ops || !suspend_ops->enter)
- return -EPERM;
-
- pm_prepare_console();
-
- error = pm_notifier_call_chain(PM_SUSPEND_PREPARE);
- if (error)
- goto Finish;
-
- error = usermodehelper_disable();
- if (error)
- goto Finish;
-
- if (suspend_freeze_processes()) {
- error = -EAGAIN;
- goto Thaw;
- }
-
- free_pages = global_page_state(NR_FREE_PAGES);
- if (free_pages < FREE_PAGE_NUMBER) {
- pr_debug("PM: free some memory\n");
- shrink_all_memory(FREE_PAGE_NUMBER - free_pages);
- if (nr_free_pages() < FREE_PAGE_NUMBER) {
- error = -ENOMEM;
- printk(KERN_ERR "PM: No enough memory\n");
- }
- }
- if (!error)
- return 0;
-
- Thaw:
- suspend_thaw_processes();
- usermodehelper_enable();
- Finish:
- pm_notifier_call_chain(PM_POST_SUSPEND);
- pm_restore_console();
- return error;
-}
-
-/* default implementation */
-void __attribute__ ((weak)) arch_suspend_disable_irqs(void)
-{
- local_irq_disable();
-}
-
-/* default implementation */
-void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
-{
- local_irq_enable();
-}
-
-/**
- * suspend_enter - enter the desired system sleep state.
- * @state: state to enter
- *
- * This function should be called after devices have been suspended.
- */
-static int suspend_enter(suspend_state_t state)
-{
- int error;
-
- if (suspend_ops->prepare) {
- error = suspend_ops->prepare();
- if (error)
- return error;
- }
-
- error = device_power_down(PMSG_SUSPEND);
- if (error) {
- printk(KERN_ERR "PM: Some devices failed to power down\n");
- goto Platfrom_finish;
- }
-
- if (suspend_ops->prepare_late) {
- error = suspend_ops->prepare_late();
- if (error)
- goto Power_up_devices;
- }
-
- if (suspend_test(TEST_PLATFORM))
- goto Platform_wake;
-
- error = disable_nonboot_cpus();
- if (error || suspend_test(TEST_CPUS))
- goto Enable_cpus;
-
- arch_suspend_disable_irqs();
- BUG_ON(!irqs_disabled());
-
- error = sysdev_suspend(PMSG_SUSPEND);
- if (!error) {
- if (!suspend_test(TEST_CORE))
- error = suspend_ops->enter(state);
- sysdev_resume();
- }
-
- arch_suspend_enable_irqs();
- BUG_ON(irqs_disabled());
-
- Enable_cpus:
- enable_nonboot_cpus();
-
- Platform_wake:
- if (suspend_ops->wake)
- suspend_ops->wake();
-
- Power_up_devices:
- device_power_up(PMSG_RESUME);
-
- Platfrom_finish:
- if (suspend_ops->finish)
- suspend_ops->finish();
-
- return error;
-}
-
-/**
- * suspend_devices_and_enter - suspend devices and enter the desired system
- * sleep state.
- * @state: state to enter
- */
-int suspend_devices_and_enter(suspend_state_t state)
-{
- int error;
-
- if (!suspend_ops)
- return -ENOSYS;
-
- if (suspend_ops->begin) {
- error = suspend_ops->begin(state);
- if (error)
- goto Close;
- }
- suspend_console();
- suspend_test_start();
- error = device_suspend(PMSG_SUSPEND);
- if (error) {
- printk(KERN_ERR "PM: Some devices failed to suspend\n");
- goto Recover_platform;
- }
- suspend_test_finish("suspend devices");
- if (suspend_test(TEST_DEVICES))
- goto Recover_platform;
-
- suspend_enter(state);
-
- Resume_devices:
- suspend_test_start();
- device_resume(PMSG_RESUME);
- suspend_test_finish("resume devices");
- resume_console();
- Close:
- if (suspend_ops->end)
- suspend_ops->end();
- return error;
-
- Recover_platform:
- if (suspend_ops->recover)
- suspend_ops->recover();
- goto Resume_devices;
-}
-
-/**
- * suspend_finish - Do final work before exiting suspend sequence.
- *
- * Call platform code to clean up, restart processes, and free the
- * console that we've allocated. This is not called for suspend-to-disk.
- */
-static void suspend_finish(void)
-{
- suspend_thaw_processes();
- usermodehelper_enable();
- pm_notifier_call_chain(PM_POST_SUSPEND);
- pm_restore_console();
-}
-
-
-
-
-static const char * const pm_states[PM_SUSPEND_MAX] = {
- [PM_SUSPEND_STANDBY] = "standby",
- [PM_SUSPEND_MEM] = "mem",
-};
-
-static inline int valid_state(suspend_state_t state)
-{
- /* All states need lowlevel support and need to be valid
- * to the lowlevel implementation, no valid callback
- * implies that none are valid. */
- if (!suspend_ops || !suspend_ops->valid || !suspend_ops->valid(state))
- return 0;
- return 1;
-}
-
-
-/**
- * enter_state - Do common work of entering low-power state.
- * @state: pm_state structure for state we're entering.
- *
- * Make sure we're the only ones trying to enter a sleep state. Fail
- * if someone has beat us to it, since we don't want anything weird to
- * happen when we wake up.
- * Then, do the setup for suspend, enter the state, and cleaup (after
- * we've woken up).
- */
-static int enter_state(suspend_state_t state)
-{
- int error;
-
- if (!valid_state(state))
- return -ENODEV;
-
- if (!mutex_trylock(&pm_mutex))
- return -EBUSY;
-
- printk(KERN_INFO "PM: Syncing filesystems ... ");
- sys_sync();
- printk("done.\n");
-
- pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
- error = suspend_prepare();
- if (error)
- goto Unlock;
-
- if (suspend_test(TEST_FREEZER))
- goto Finish;
-
- pr_debug("PM: Entering %s sleep\n", pm_states[state]);
- error = suspend_devices_and_enter(state);
-
- Finish:
- pr_debug("PM: Finishing wakeup.\n");
- suspend_finish();
- Unlock:
- mutex_unlock(&pm_mutex);
- return error;
-}
-
-
-/**
- * pm_suspend - Externally visible function for suspending system.
- * @state: Enumerated value of state to enter.
- *
- * Determine whether or not value is within range, get state
- * structure, and enter (above).
- */
-
-int pm_suspend(suspend_state_t state)
-{
- if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX)
- return enter_state(state);
- return -EINVAL;
-}
-
-EXPORT_SYMBOL(pm_suspend);
-
-#endif /* CONFIG_SUSPEND */
-
struct kobject *power_kobj;
/**
@@ -498,7 +120,6 @@ struct kobject *power_kobj;
* store() accepts one of those strings, translates it into the
* proper enumerated value, and initiates a suspend transition.
*/
-
static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
@@ -596,7 +217,6 @@ static struct attribute_group attr_group = {
.attrs = g,
};
-
static int __init pm_init(void)
{
power_kobj = kobject_create_and_add("power", NULL);
@@ -606,144 +226,3 @@ static int __init pm_init(void)
}
core_initcall(pm_init);
-
-
-#ifdef CONFIG_PM_TEST_SUSPEND
-
-#include <linux/rtc.h>
-
-/*
- * To test system suspend, we need a hands-off mechanism to resume the
- * system. RTCs wake alarms are a common self-contained mechanism.
- */
-
-static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
-{
- static char err_readtime[] __initdata =
- KERN_ERR "PM: can't read %s time, err %d\n";
- static char err_wakealarm [] __initdata =
- KERN_ERR "PM: can't set %s wakealarm, err %d\n";
- static char err_suspend[] __initdata =
- KERN_ERR "PM: suspend test failed, error %d\n";
- static char info_test[] __initdata =
- KERN_INFO "PM: test RTC wakeup from '%s' suspend\n";
-
- unsigned long now;
- struct rtc_wkalrm alm;
- int status;
-
- /* this may fail if the RTC hasn't been initialized */
- status = rtc_read_time(rtc, &alm.time);
- if (status < 0) {
- printk(err_readtime, dev_name(&rtc->dev), status);
- return;
- }
- rtc_tm_to_time(&alm.time, &now);
-
- memset(&alm, 0, sizeof alm);
- rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time);
- alm.enabled = true;
-
- status = rtc_set_alarm(rtc, &alm);
- if (status < 0) {
- printk(err_wakealarm, dev_name(&rtc->dev), status);
- return;
- }
-
- if (state == PM_SUSPEND_MEM) {
- printk(info_test, pm_states[state]);
- status = pm_suspend(state);
- if (status == -ENODEV)
- state = PM_SUSPEND_STANDBY;
- }
- if (state == PM_SUSPEND_STANDBY) {
- printk(info_test, pm_states[state]);
- status = pm_suspend(state);
- }
- if (status < 0)
- printk(err_suspend, status);
-
- /* Some platforms can't detect that the alarm triggered the
- * wakeup, or (accordingly) disable it after it afterwards.
- * It's supposed to give oneshot behavior; cope.
- */
- alm.enabled = false;
- rtc_set_alarm(rtc, &alm);
-}
-
-static int __init has_wakealarm(struct device *dev, void *name_ptr)
-{
- struct rtc_device *candidate = to_rtc_device(dev);
-
- if (!candidate->ops->set_alarm)
- return 0;
- if (!device_may_wakeup(candidate->dev.parent))
- return 0;
-
- *(const char **)name_ptr = dev_name(dev);
- return 1;
-}
-
-/*
- * Kernel options like "test_suspend=mem" force suspend/resume sanity tests
- * at startup time. They're normally disabled, for faster boot and because
- * we can't know which states really work on this particular system.
- */
-static suspend_state_t test_state __initdata = PM_SUSPEND_ON;
-
-static char warn_bad_state[] __initdata =
- KERN_WARNING "PM: can't test '%s' suspend state\n";
-
-static int __init setup_test_suspend(char *value)
-{
- unsigned i;
-
- /* "=mem" ==> "mem" */
- value++;
- for (i = 0; i < PM_SUSPEND_MAX; i++) {
- if (!pm_states[i])
- continue;
- if (strcmp(pm_states[i], value) != 0)
- continue;
- test_state = (__force suspend_state_t) i;
- return 0;
- }
- printk(warn_bad_state, value);
- return 0;
-}
-__setup("test_suspend", setup_test_suspend);
-
-static int __init test_suspend(void)
-{
- static char warn_no_rtc[] __initdata =
- KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n";
-
- char *pony = NULL;
- struct rtc_device *rtc = NULL;
-
- /* PM is initialized by now; is that state testable? */
- if (test_state == PM_SUSPEND_ON)
- goto done;
- if (!valid_state(test_state)) {
- printk(warn_bad_state, pm_states[test_state]);
- goto done;
- }
-
- /* RTCs have initialized by now too ... can we use one? */
- class_find_device(rtc_class, NULL, &pony, has_wakealarm);
- if (pony)
- rtc = rtc_class_open(pony);
- if (!rtc) {
- printk(warn_no_rtc);
- goto done;
- }
-
- /* go for it */
- test_wakealarm(rtc, test_state);
- rtc_class_close(rtc);
-done:
- return 0;
-}
-late_initcall(test_suspend);
-
-#endif /* CONFIG_PM_TEST_SUSPEND */
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 46b5ec7a3af..26d5a26f82e 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -45,7 +45,7 @@ static inline char *check_image_kernel(struct swsusp_info *info)
*/
#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT)
-/* kernel/power/disk.c */
+/* kernel/power/hibernate.c */
extern int hibernation_snapshot(int platform_mode);
extern int hibernation_restore(int platform_mode);
extern int hibernation_platform_enter(void);
@@ -74,7 +74,7 @@ extern asmlinkage int swsusp_arch_resume(void);
extern int create_basic_memory_bitmaps(void);
extern void free_basic_memory_bitmaps(void);
-extern unsigned int count_data_pages(void);
+extern int swsusp_shrink_memory(void);
/**
* Auxiliary structure used for reading the snapshot image data and
@@ -147,9 +147,8 @@ extern int swsusp_swap_in_use(void);
*/
#define SF_PLATFORM_MODE 1
-/* kernel/power/disk.c */
+/* kernel/power/hibernate.c */
extern int swsusp_check(void);
-extern int swsusp_shrink_memory(void);
extern void swsusp_free(void);
extern int swsusp_read(unsigned int *flags_p);
extern int swsusp_write(unsigned int flags);
@@ -161,22 +160,36 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *,
unsigned int, char *);
#ifdef CONFIG_SUSPEND
-/* kernel/power/main.c */
+/* kernel/power/suspend.c */
+extern const char *const pm_states[];
+
+extern bool valid_state(suspend_state_t state);
extern int suspend_devices_and_enter(suspend_state_t state);
+extern int enter_state(suspend_state_t state);
#else /* !CONFIG_SUSPEND */
static inline int suspend_devices_and_enter(suspend_state_t state)
{
return -ENOSYS;
}
+static inline int enter_state(suspend_state_t state) { return -ENOSYS; }
+static inline bool valid_state(suspend_state_t state) { return false; }
#endif /* !CONFIG_SUSPEND */
+#ifdef CONFIG_PM_TEST_SUSPEND
+/* kernel/power/suspend_test.c */
+extern void suspend_test_start(void);
+extern void suspend_test_finish(const char *label);
+#else /* !CONFIG_PM_TEST_SUSPEND */
+static inline void suspend_test_start(void) {}
+static inline void suspend_test_finish(const char *label) {}
+#endif /* !CONFIG_PM_TEST_SUSPEND */
+
#ifdef CONFIG_PM_SLEEP
/* kernel/power/main.c */
extern int pm_notifier_call_chain(unsigned long val);
#endif
#ifdef CONFIG_HIGHMEM
-unsigned int count_highmem_pages(void);
int restore_highmem(void);
#else
static inline unsigned int count_highmem_pages(void) { return 0; }
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 97890831e1b..e8b33700627 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -34,7 +34,7 @@ static struct sysrq_key_op sysrq_poweroff_op = {
.handler = handle_poweroff,
.help_msg = "powerOff",
.action_msg = "Power Off",
- .enable_mask = SYSRQ_ENABLE_BOOT,
+ .enable_mask = SYSRQ_ENABLE_BOOT,
};
static int pm_sysrq_init(void)
diff --git a/kernel/power/process.c b/kernel/power/process.c
index ca634019497..da2072d7381 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -117,9 +117,12 @@ int freeze_processes(void)
if (error)
goto Exit;
printk("done.");
+
+ oom_killer_disable();
Exit:
BUG_ON(in_atomic());
printk("\n");
+
return error;
}
@@ -145,6 +148,8 @@ static void thaw_tasks(bool nosig_only)
void thaw_processes(void)
{
+ oom_killer_enable();
+
printk("Restarting tasks ... ");
thaw_tasks(true);
thaw_tasks(false);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 33e2e4a819f..523a451b45d 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -39,6 +39,14 @@ static int swsusp_page_is_free(struct page *);
static void swsusp_set_page_forbidden(struct page *);
static void swsusp_unset_page_forbidden(struct page *);
+/*
+ * Preferred image size in bytes (tunable via /sys/power/image_size).
+ * When it is set to N, swsusp will do its best to ensure the image
+ * size will not exceed N bytes, but if that is impossible, it will
+ * try to create the smallest image possible.
+ */
+unsigned long image_size = 500 * 1024 * 1024;
+
/* List of PBEs needed for restoring the pages that were allocated before
* the suspend and included in the suspend image, but have also been
* allocated by the "resume" kernel, so their contents cannot be written
@@ -840,7 +848,7 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
* pages.
*/
-unsigned int count_highmem_pages(void)
+static unsigned int count_highmem_pages(void)
{
struct zone *zone;
unsigned int n = 0;
@@ -902,7 +910,7 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)
* pages.
*/
-unsigned int count_data_pages(void)
+static unsigned int count_data_pages(void)
{
struct zone *zone;
unsigned long pfn, max_zone_pfn;
@@ -1058,6 +1066,74 @@ void swsusp_free(void)
buffer = NULL;
}
+/**
+ * swsusp_shrink_memory - Try to free as much memory as needed
+ *
+ * ... but do not OOM-kill anyone
+ *
+ * Notice: all userland should be stopped before it is called, or
+ * livelock is possible.
+ */
+
+#define SHRINK_BITE 10000
+static inline unsigned long __shrink_memory(long tmp)
+{
+ if (tmp > SHRINK_BITE)
+ tmp = SHRINK_BITE;
+ return shrink_all_memory(tmp);
+}
+
+int swsusp_shrink_memory(void)
+{
+ long tmp;
+ struct zone *zone;
+ unsigned long pages = 0;
+ unsigned int i = 0;
+ char *p = "-\\|/";
+ struct timeval start, stop;
+
+ printk(KERN_INFO "PM: Shrinking memory... ");
+ do_gettimeofday(&start);
+ do {
+ long size, highmem_size;
+
+ highmem_size = count_highmem_pages();
+ size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES;
+ tmp = size;
+ size += highmem_size;
+ for_each_populated_zone(zone) {
+ tmp += snapshot_additional_pages(zone);
+ if (is_highmem(zone)) {
+ highmem_size -=
+ zone_page_state(zone, NR_FREE_PAGES);
+ } else {
+ tmp -= zone_page_state(zone, NR_FREE_PAGES);
+ tmp += zone->lowmem_reserve[ZONE_NORMAL];
+ }
+ }
+
+ if (highmem_size < 0)
+ highmem_size = 0;
+
+ tmp += highmem_size;
+ if (tmp > 0) {
+ tmp = __shrink_memory(tmp);
+ if (!tmp)
+ return -ENOMEM;
+ pages += tmp;
+ } else if (size > image_size / PAGE_SIZE) {
+ tmp = __shrink_memory(size - (image_size / PAGE_SIZE));
+ pages += tmp;
+ }
+ printk("\b%c", p[i++%4]);
+ } while (tmp > 0);
+ do_gettimeofday(&stop);
+ printk("\bdone (%lu pages freed)\n", pages);
+ swsusp_show_speed(&start, &stop, pages, "Freed");
+
+ return 0;
+}
+
#ifdef CONFIG_HIGHMEM
/**
* count_pages_for_highmem - compute the number of non-highmem pages
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
new file mode 100644
index 00000000000..6f10dfc2d3e
--- /dev/null
+++ b/kernel/power/suspend.c
@@ -0,0 +1,300 @@
+/*
+ * kernel/power/suspend.c - Suspend to RAM and standby functionality.
+ *
+ * Copyright (c) 2003 Patrick Mochel
+ * Copyright (c) 2003 Open Source Development Lab
+ * Copyright (c) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/string.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/console.h>
+#include <linux/cpu.h>
+#include <linux/syscalls.h>
+
+#include "power.h"
+
+const char *const pm_states[PM_SUSPEND_MAX] = {
+ [PM_SUSPEND_STANDBY] = "standby",
+ [PM_SUSPEND_MEM] = "mem",
+};
+
+static struct platform_suspend_ops *suspend_ops;
+
+/**
+ * suspend_set_ops - Set the global suspend method table.
+ * @ops: Pointer to ops structure.
+ */
+void suspend_set_ops(struct platform_suspend_ops *ops)
+{
+ mutex_lock(&pm_mutex);
+ suspend_ops = ops;
+ mutex_unlock(&pm_mutex);
+}
+
+bool valid_state(suspend_state_t state)
+{
+ /*
+ * All states need lowlevel support and need to be valid to the lowlevel
+ * implementation, no valid callback implies that none are valid.
+ */
+ return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
+}
+
+/**
+ * suspend_valid_only_mem - generic memory-only valid callback
+ *
+ * Platform drivers that implement mem suspend only and only need
+ * to check for that in their .valid callback can use this instead
+ * of rolling their own .valid callback.
+ */
+int suspend_valid_only_mem(suspend_state_t state)
+{
+ return state == PM_SUSPEND_MEM;
+}
+
+static int suspend_test(int level)
+{
+#ifdef CONFIG_PM_DEBUG
+ if (pm_test_level == level) {
+ printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n");
+ mdelay(5000);
+ return 1;
+ }
+#endif /* !CONFIG_PM_DEBUG */
+ return 0;
+}
+
+/**
+ * suspend_prepare - Do prep work before entering low-power state.
+ *
+ * This is common code that is called for each state that we're entering.
+ * Run suspend notifiers, allocate a console and stop all processes.
+ */
+static int suspend_prepare(void)
+{
+ int error;
+
+ if (!suspend_ops || !suspend_ops->enter)
+ return -EPERM;
+
+ pm_prepare_console();
+
+ error = pm_notifier_call_chain(PM_SUSPEND_PREPARE);
+ if (error)
+ goto Finish;
+
+ error = usermodehelper_disable();
+ if (error)
+ goto Finish;
+
+ error = suspend_freeze_processes();
+ if (!error)
+ return 0;
+
+ suspend_thaw_processes();
+ usermodehelper_enable();
+ Finish:
+ pm_notifier_call_chain(PM_POST_SUSPEND);
+ pm_restore_console();
+ return error;
+}
+
+/* default implementation */
+void __attribute__ ((weak)) arch_suspend_disable_irqs(void)
+{
+ local_irq_disable();
+}
+
+/* default implementation */
+void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
+{
+ local_irq_enable();
+}
+
+/**
+ * suspend_enter - enter the desired system sleep state.
+ * @state: state to enter
+ *
+ * This function should be called after devices have been suspended.
+ */
+static int suspend_enter(suspend_state_t state)
+{
+ int error;
+
+ if (suspend_ops->prepare) {
+ error = suspend_ops->prepare();
+ if (error)
+ return error;
+ }
+
+ error = dpm_suspend_noirq(PMSG_SUSPEND);
+ if (error) {
+ printk(KERN_ERR "PM: Some devices failed to power down\n");
+ goto Platfrom_finish;
+ }
+
+ if (suspend_ops->prepare_late) {
+ error = suspend_ops->prepare_late();
+ if (error)
+ goto Power_up_devices;
+ }
+
+ if (suspend_test(TEST_PLATFORM))
+ goto Platform_wake;
+
+ error = disable_nonboot_cpus();
+ if (error || suspend_test(TEST_CPUS))
+ goto Enable_cpus;
+
+ arch_suspend_disable_irqs();
+ BUG_ON(!irqs_disabled());
+
+ error = sysdev_suspend(PMSG_SUSPEND);
+ if (!error) {
+ if (!suspend_test(TEST_CORE))
+ error = suspend_ops->enter(state);
+ sysdev_resume();
+ }
+
+ arch_suspend_enable_irqs();
+ BUG_ON(irqs_disabled());
+
+ Enable_cpus:
+ enable_nonboot_cpus();
+
+ Platform_wake:
+ if (suspend_ops->wake)
+ suspend_ops->wake();
+
+ Power_up_devices:
+ dpm_resume_noirq(PMSG_RESUME);
+
+ Platfrom_finish:
+ if (suspend_ops->finish)
+ suspend_ops->finish();
+
+ return error;
+}
+
+/**
+ * suspend_devices_and_enter - suspend devices and enter the desired system
+ * sleep state.
+ * @state: state to enter
+ */
+int suspend_devices_and_enter(suspend_state_t state)
+{
+ int error;
+
+ if (!suspend_ops)
+ return -ENOSYS;
+
+ if (suspend_ops->begin) {
+ error = suspend_ops->begin(state);
+ if (error)
+ goto Close;
+ }
+ suspend_console();
+ suspend_test_start();
+ error = dpm_suspend_start(PMSG_SUSPEND);
+ if (error) {
+ printk(KERN_ERR "PM: Some devices failed to suspend\n");
+ goto Recover_platform;
+ }
+ suspend_test_finish("suspend devices");
+ if (suspend_test(TEST_DEVICES))
+ goto Recover_platform;
+
+ suspend_enter(state);
+
+ Resume_devices:
+ suspend_test_start();
+ dpm_resume_end(PMSG_RESUME);
+ suspend_test_finish("resume devices");
+ resume_console();
+ Close:
+ if (suspend_ops->end)
+ suspend_ops->end();
+ return error;
+
+ Recover_platform:
+ if (suspend_ops->recover)
+ suspend_ops->recover();
+ goto Resume_devices;
+}
+
+/**
+ * suspend_finish - Do final work before exiting suspend sequence.
+ *
+ * Call platform code to clean up, restart processes, and free the
+ * console that we've allocated. This is not called for suspend-to-disk.
+ */
+static void suspend_finish(void)
+{
+ suspend_thaw_processes();
+ usermodehelper_enable();
+ pm_notifier_call_chain(PM_POST_SUSPEND);
+ pm_restore_console();
+}
+
+/**
+ * enter_state - Do common work of entering low-power state.
+ * @state: pm_state structure for state we're entering.
+ *
+ * Make sure we're the only ones trying to enter a sleep state. Fail
+ * if someone has beat us to it, since we don't want anything weird to
+ * happen when we wake up.
+ * Then, do the setup for suspend, enter the state, and cleaup (after
+ * we've woken up).
+ */
+int enter_state(suspend_state_t state)
+{
+ int error;
+
+ if (!valid_state(state))
+ return -ENODEV;
+
+ if (!mutex_trylock(&pm_mutex))
+ return -EBUSY;
+
+ printk(KERN_INFO "PM: Syncing filesystems ... ");
+ sys_sync();
+ printk("done.\n");
+
+ pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
+ error = suspend_prepare();
+ if (error)
+ goto Unlock;
+
+ if (suspend_test(TEST_FREEZER))
+ goto Finish;
+
+ pr_debug("PM: Entering %s sleep\n", pm_states[state]);
+ error = suspend_devices_and_enter(state);
+
+ Finish:
+ pr_debug("PM: Finishing wakeup.\n");
+ suspend_finish();
+ Unlock:
+ mutex_unlock(&pm_mutex);
+ return error;
+}
+
+/**
+ * pm_suspend - Externally visible function for suspending system.
+ * @state: Enumerated value of state to enter.
+ *
+ * Determine whether or not value is within range, get state
+ * structure, and enter (above).
+ */
+int pm_suspend(suspend_state_t state)
+{
+ if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX)
+ return enter_state(state);
+ return -EINVAL;
+}
+EXPORT_SYMBOL(pm_suspend);
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
new file mode 100644
index 00000000000..17d8bb1acf9
--- /dev/null
+++ b/kernel/power/suspend_test.c
@@ -0,0 +1,187 @@
+/*
+ * kernel/power/suspend_test.c - Suspend to RAM and standby test facility.
+ *
+ * Copyright (c) 2009 Pavel Machek <pavel@ucw.cz>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/init.h>
+#include <linux/rtc.h>
+
+#include "power.h"
+
+/*
+ * We test the system suspend code by setting an RTC wakealarm a short
+ * time in the future, then suspending. Suspending the devices won't
+ * normally take long ... some systems only need a few milliseconds.
+ *
+ * The time it takes is system-specific though, so when we test this
+ * during system bootup we allow a LOT of time.
+ */
+#define TEST_SUSPEND_SECONDS 5
+
+static unsigned long suspend_test_start_time;
+
+void suspend_test_start(void)
+{
+ /* FIXME Use better timebase than "jiffies", ideally a clocksource.
+ * What we want is a hardware counter that will work correctly even
+ * during the irqs-are-off stages of the suspend/resume cycle...
+ */
+ suspend_test_start_time = jiffies;
+}
+
+void suspend_test_finish(const char *label)
+{
+ long nj = jiffies - suspend_test_start_time;
+ unsigned msec;
+
+ msec = jiffies_to_msecs(abs(nj));
+ pr_info("PM: %s took %d.%03d seconds\n", label,
+ msec / 1000, msec % 1000);
+
+ /* Warning on suspend means the RTC alarm period needs to be
+ * larger -- the system was sooo slooowwww to suspend that the
+ * alarm (should have) fired before the system went to sleep!
+ *
+ * Warning on either suspend or resume also means the system
+ * has some performance issues. The stack dump of a WARN_ON
+ * is more likely to get the right attention than a printk...
+ */
+ WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label);
+}
+
+/*
+ * To test system suspend, we need a hands-off mechanism to resume the
+ * system. RTCs wake alarms are a common self-contained mechanism.
+ */
+
+static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
+{
+ static char err_readtime[] __initdata =
+ KERN_ERR "PM: can't read %s time, err %d\n";
+ static char err_wakealarm [] __initdata =
+ KERN_ERR "PM: can't set %s wakealarm, err %d\n";
+ static char err_suspend[] __initdata =
+ KERN_ERR "PM: suspend test failed, error %d\n";
+ static char info_test[] __initdata =
+ KERN_INFO "PM: test RTC wakeup from '%s' suspend\n";
+
+ unsigned long now;
+ struct rtc_wkalrm alm;
+ int status;
+
+ /* this may fail if the RTC hasn't been initialized */
+ status = rtc_read_time(rtc, &alm.time);
+ if (status < 0) {
+ printk(err_readtime, dev_name(&rtc->dev), status);
+ return;
+ }
+ rtc_tm_to_time(&alm.time, &now);
+
+ memset(&alm, 0, sizeof alm);
+ rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time);
+ alm.enabled = true;
+
+ status = rtc_set_alarm(rtc, &alm);
+ if (status < 0) {
+ printk(err_wakealarm, dev_name(&rtc->dev), status);
+ return;
+ }
+
+ if (state == PM_SUSPEND_MEM) {
+ printk(info_test, pm_states[state]);
+ status = pm_suspend(state);
+ if (status == -ENODEV)
+ state = PM_SUSPEND_STANDBY;
+ }
+ if (state == PM_SUSPEND_STANDBY) {
+ printk(info_test, pm_states[state]);
+ status = pm_suspend(state);
+ }
+ if (status < 0)
+ printk(err_suspend, status);
+
+ /* Some platforms can't detect that the alarm triggered the
+ * wakeup, or (accordingly) disable it after it afterwards.
+ * It's supposed to give oneshot behavior; cope.
+ */
+ alm.enabled = false;
+ rtc_set_alarm(rtc, &alm);
+}
+
+static int __init has_wakealarm(struct device *dev, void *name_ptr)
+{
+ struct rtc_device *candidate = to_rtc_device(dev);
+
+ if (!candidate->ops->set_alarm)
+ return 0;
+ if (!device_may_wakeup(candidate->dev.parent))
+ return 0;
+
+ *(const char **)name_ptr = dev_name(dev);
+ return 1;
+}
+
+/*
+ * Kernel options like "test_suspend=mem" force suspend/resume sanity tests
+ * at startup time. They're normally disabled, for faster boot and because
+ * we can't know which states really work on this particular system.
+ */
+static suspend_state_t test_state __initdata = PM_SUSPEND_ON;
+
+static char warn_bad_state[] __initdata =
+ KERN_WARNING "PM: can't test '%s' suspend state\n";
+
+static int __init setup_test_suspend(char *value)
+{
+ unsigned i;
+
+ /* "=mem" ==> "mem" */
+ value++;
+ for (i = 0; i < PM_SUSPEND_MAX; i++) {
+ if (!pm_states[i])
+ continue;
+ if (strcmp(pm_states[i], value) != 0)
+ continue;
+ test_state = (__force suspend_state_t) i;
+ return 0;
+ }
+ printk(warn_bad_state, value);
+ return 0;
+}
+__setup("test_suspend", setup_test_suspend);
+
+static int __init test_suspend(void)
+{
+ static char warn_no_rtc[] __initdata =
+ KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n";
+
+ char *pony = NULL;
+ struct rtc_device *rtc = NULL;
+
+ /* PM is initialized by now; is that state testable? */
+ if (test_state == PM_SUSPEND_ON)
+ goto done;
+ if (!valid_state(test_state)) {
+ printk(warn_bad_state, pm_states[test_state]);
+ goto done;
+ }
+
+ /* RTCs have initialized by now too ... can we use one? */
+ class_find_device(rtc_class, NULL, &pony, has_wakealarm);
+ if (pony)
+ rtc = rtc_class_open(pony);
+ if (!rtc) {
+ printk(warn_no_rtc);
+ goto done;
+ }
+
+ /* go for it */
+ test_wakealarm(rtc, test_state);
+ rtc_class_close(rtc);
+done:
+ return 0;
+}
+late_initcall(test_suspend);
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 78c35047586..6a07f4dbf2f 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -55,14 +55,6 @@
#include "power.h"
-/*
- * Preferred image size in bytes (tunable via /sys/power/image_size).
- * When it is set to N, swsusp will do its best to ensure the image
- * size will not exceed N bytes, but if that is impossible, it will
- * try to create the smallest image possible.
- */
-unsigned long image_size = 500 * 1024 * 1024;
-
int in_suspend __nosavedata = 0;
/**
@@ -194,193 +186,3 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop,
centisecs / 100, centisecs % 100,
kps / 1000, (kps % 1000) / 10);
}
-
-/**
- * swsusp_shrink_memory - Try to free as much memory as needed
- *
- * ... but do not OOM-kill anyone
- *
- * Notice: all userland should be stopped before it is called, or
- * livelock is possible.
- */
-
-#define SHRINK_BITE 10000
-static inline unsigned long __shrink_memory(long tmp)
-{
- if (tmp > SHRINK_BITE)
- tmp = SHRINK_BITE;
- return shrink_all_memory(tmp);
-}
-
-int swsusp_shrink_memory(void)
-{
- long tmp;
- struct zone *zone;
- unsigned long pages = 0;
- unsigned int i = 0;
- char *p = "-\\|/";
- struct timeval start, stop;
-
- printk(KERN_INFO "PM: Shrinking memory... ");
- do_gettimeofday(&start);
- do {
- long size, highmem_size;
-
- highmem_size = count_highmem_pages();
- size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES;
- tmp = size;
- size += highmem_size;
- for_each_populated_zone(zone) {
- tmp += snapshot_additional_pages(zone);
- if (is_highmem(zone)) {
- highmem_size -=
- zone_page_state(zone, NR_FREE_PAGES);
- } else {
- tmp -= zone_page_state(zone, NR_FREE_PAGES);
- tmp += zone->lowmem_reserve[ZONE_NORMAL];
- }
- }
-
- if (highmem_size < 0)
- highmem_size = 0;
-
- tmp += highmem_size;
- if (tmp > 0) {
- tmp = __shrink_memory(tmp);
- if (!tmp)
- return -ENOMEM;
- pages += tmp;
- } else if (size > image_size / PAGE_SIZE) {
- tmp = __shrink_memory(size - (image_size / PAGE_SIZE));
- pages += tmp;
- }
- printk("\b%c", p[i++%4]);
- } while (tmp > 0);
- do_gettimeofday(&stop);
- printk("\bdone (%lu pages freed)\n", pages);
- swsusp_show_speed(&start, &stop, pages, "Freed");
-
- return 0;
-}
-
-/*
- * Platforms, like ACPI, may want us to save some memory used by them during
- * hibernation and to restore the contents of this memory during the subsequent
- * resume. The code below implements a mechanism allowing us to do that.
- */
-
-struct nvs_page {
- unsigned long phys_start;
- unsigned int size;
- void *kaddr;
- void *data;
- struct list_head node;
-};
-
-static LIST_HEAD(nvs_list);
-
-/**
- * hibernate_nvs_register - register platform NVS memory region to save
- * @start - physical address of the region
- * @size - size of the region
- *
- * The NVS region need not be page-aligned (both ends) and we arrange
- * things so that the data from page-aligned addresses in this region will
- * be copied into separate RAM pages.
- */
-int hibernate_nvs_register(unsigned long start, unsigned long size)
-{
- struct nvs_page *entry, *next;
-
- while (size > 0) {
- unsigned int nr_bytes;
-
- entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL);
- if (!entry)
- goto Error;
-
- list_add_tail(&entry->node, &nvs_list);
- entry->phys_start = start;
- nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK);
- entry->size = (size < nr_bytes) ? size : nr_bytes;
-
- start += entry->size;
- size -= entry->size;
- }
- return 0;
-
- Error:
- list_for_each_entry_safe(entry, next, &nvs_list, node) {
- list_del(&entry->node);
- kfree(entry);
- }
- return -ENOMEM;
-}
-
-/**
- * hibernate_nvs_free - free data pages allocated for saving NVS regions
- */
-void hibernate_nvs_free(void)
-{
- struct nvs_page *entry;
-
- list_for_each_entry(entry, &nvs_list, node)
- if (entry->data) {
- free_page((unsigned long)entry->data);
- entry->data = NULL;
- if (entry->kaddr) {
- iounmap(entry->kaddr);
- entry->kaddr = NULL;
- }
- }
-}
-
-/**
- * hibernate_nvs_alloc - allocate memory necessary for saving NVS regions
- */
-int hibernate_nvs_alloc(void)
-{
- struct nvs_page *entry;
-
- list_for_each_entry(entry, &nvs_list, node) {
- entry->data = (void *)__get_free_page(GFP_KERNEL);
- if (!entry->data) {
- hibernate_nvs_free();
- return -ENOMEM;
- }
- }
- return 0;
-}
-
-/**
- * hibernate_nvs_save - save NVS memory regions
- */
-void hibernate_nvs_save(void)
-{
- struct nvs_page *entry;
-
- printk(KERN_INFO "PM: Saving platform NVS memory\n");
-
- list_for_each_entry(entry, &nvs_list, node)
- if (entry->data) {
- entry->kaddr = ioremap(entry->phys_start, entry->size);
- memcpy(entry->data, entry->kaddr, entry->size);
- }
-}
-
-/**
- * hibernate_nvs_restore - restore NVS memory regions
- *
- * This function is going to be called with interrupts disabled, so it
- * cannot iounmap the virtual addresses used to access the NVS region.
- */
-void hibernate_nvs_restore(void)
-{
- struct nvs_page *entry;
-
- printk(KERN_INFO "PM: Restoring platform NVS memory\n");
-
- list_for_each_entry(entry, &nvs_list, node)
- if (entry->data)
- memcpy(entry->kaddr, entry->data, entry->size);
-}
diff --git a/kernel/power/user.c b/kernel/power/user.c
index ed97375daae..bf0014d6a5f 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -23,7 +23,6 @@
#include <linux/console.h>
#include <linux/cpu.h>
#include <linux/freezer.h>
-#include <linux/smp_lock.h>
#include <scsi/scsi_scan.h>
#include <asm/uaccess.h>
diff --git a/kernel/printk.c b/kernel/printk.c
index 5052b5497c6..b4d97b54c1e 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -687,20 +687,35 @@ asmlinkage int vprintk(const char *fmt, va_list args)
sizeof(printk_buf) - printed_len, fmt, args);
+ p = printk_buf;
+
+ /* Do we have a loglevel in the string? */
+ if (p[0] == '<') {
+ unsigned char c = p[1];
+ if (c && p[2] == '>') {
+ switch (c) {
+ case '0' ... '7': /* loglevel */
+ current_log_level = c - '0';
+ /* Fallthrough - make sure we're on a new line */
+ case 'd': /* KERN_DEFAULT */
+ if (!new_text_line) {
+ emit_log_char('\n');
+ new_text_line = 1;
+ }
+ /* Fallthrough - skip the loglevel */
+ case 'c': /* KERN_CONT */
+ p += 3;
+ break;
+ }
+ }
+ }
+
/*
* Copy the output into log_buf. If the caller didn't provide
* appropriate log level tags, we insert them here
*/
- for (p = printk_buf; *p; p++) {
+ for ( ; *p; p++) {
if (new_text_line) {
- /* If a token, set current_log_level and skip over */
- if (p[0] == '<' && p[1] >= '0' && p[1] <= '7' &&
- p[2] == '>') {
- current_log_level = p[1] - '0';
- p += 3;
- printed_len -= 3;
- }
-
/* Always output the token */
emit_log_char('<');
emit_log_char(current_log_level + '0');
diff --git a/kernel/profile.c b/kernel/profile.c
index 7724e0409ba..419250ebec4 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -111,23 +111,18 @@ int __ref profile_init(void)
/* only text is profiled */
prof_len = (_etext - _stext) >> prof_shift;
buffer_bytes = prof_len*sizeof(atomic_t);
- if (!slab_is_available()) {
- prof_buffer = alloc_bootmem(buffer_bytes);
- alloc_bootmem_cpumask_var(&prof_cpu_mask);
- cpumask_copy(prof_cpu_mask, cpu_possible_mask);
- return 0;
- }
if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL))
return -ENOMEM;
cpumask_copy(prof_cpu_mask, cpu_possible_mask);
- prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL);
+ prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN);
if (prof_buffer)
return 0;
- prof_buffer = alloc_pages_exact(buffer_bytes, GFP_KERNEL|__GFP_ZERO);
+ prof_buffer = alloc_pages_exact(buffer_bytes,
+ GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN);
if (prof_buffer)
return 0;
@@ -371,7 +366,7 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info,
node = cpu_to_node(cpu);
per_cpu(cpu_profile_flip, cpu) = 0;
if (!per_cpu(cpu_profile_hits, cpu)[1]) {
- page = alloc_pages_node(node,
+ page = alloc_pages_exact_node(node,
GFP_KERNEL | __GFP_ZERO,
0);
if (!page)
@@ -379,7 +374,7 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info,
per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
}
if (!per_cpu(cpu_profile_hits, cpu)[0]) {
- page = alloc_pages_node(node,
+ page = alloc_pages_exact_node(node,
GFP_KERNEL | __GFP_ZERO,
0);
if (!page)
@@ -570,14 +565,14 @@ static int create_hash_tables(void)
int node = cpu_to_node(cpu);
struct page *page;
- page = alloc_pages_node(node,
+ page = alloc_pages_exact_node(node,
GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
0);
if (!page)
goto out_cleanup;
per_cpu(cpu_profile_hits, cpu)[1]
= (struct profile_hit *)page_address(page);
- page = alloc_pages_node(node,
+ page = alloc_pages_exact_node(node,
GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
0);
if (!page)
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 42c317874cf..082c320e4db 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -25,16 +25,6 @@
/*
- * Initialize a new task whose father had been ptraced.
- *
- * Called from copy_process().
- */
-void ptrace_fork(struct task_struct *child, unsigned long clone_flags)
-{
- arch_ptrace_fork(child, clone_flags);
-}
-
-/*
* ptrace a task: make the debugger its new parent and
* move it to the ptrace list.
*
@@ -177,66 +167,82 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
int ptrace_attach(struct task_struct *task)
{
int retval;
- unsigned long flags;
audit_ptrace(task);
retval = -EPERM;
+ if (unlikely(task->flags & PF_KTHREAD))
+ goto out;
if (same_thread_group(task, current))
goto out;
- /* Protect exec's credential calculations against our interference;
- * SUID, SGID and LSM creds get determined differently under ptrace.
+ /*
+ * Protect exec's credential calculations against our interference;
+ * interference; SUID, SGID and LSM creds get determined differently
+ * under ptrace.
*/
- retval = mutex_lock_interruptible(&task->cred_exec_mutex);
- if (retval < 0)
+ retval = -ERESTARTNOINTR;
+ if (mutex_lock_interruptible(&task->cred_guard_mutex))
goto out;
- retval = -EPERM;
-repeat:
- /*
- * Nasty, nasty.
- *
- * We want to hold both the task-lock and the
- * tasklist_lock for writing at the same time.
- * But that's against the rules (tasklist_lock
- * is taken for reading by interrupts on other
- * cpu's that may have task_lock).
- */
task_lock(task);
- if (!write_trylock_irqsave(&tasklist_lock, flags)) {
- task_unlock(task);
- do {
- cpu_relax();
- } while (!write_can_lock(&tasklist_lock));
- goto repeat;
- }
-
- if (!task->mm)
- goto bad;
- /* the same process cannot be attached many times */
- if (task->ptrace & PT_PTRACED)
- goto bad;
retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH);
+ task_unlock(task);
if (retval)
- goto bad;
+ goto unlock_creds;
+
+ write_lock_irq(&tasklist_lock);
+ retval = -EPERM;
+ if (unlikely(task->exit_state))
+ goto unlock_tasklist;
+ if (task->ptrace)
+ goto unlock_tasklist;
- /* Go */
- task->ptrace |= PT_PTRACED;
+ task->ptrace = PT_PTRACED;
if (capable(CAP_SYS_PTRACE))
task->ptrace |= PT_PTRACE_CAP;
__ptrace_link(task, current);
-
send_sig_info(SIGSTOP, SEND_SIG_FORCED, task);
-bad:
- write_unlock_irqrestore(&tasklist_lock, flags);
- task_unlock(task);
- mutex_unlock(&task->cred_exec_mutex);
+
+ retval = 0;
+unlock_tasklist:
+ write_unlock_irq(&tasklist_lock);
+unlock_creds:
+ mutex_unlock(&task->cred_guard_mutex);
out:
return retval;
}
+/**
+ * ptrace_traceme -- helper for PTRACE_TRACEME
+ *
+ * Performs checks and sets PT_PTRACED.
+ * Should be used by all ptrace implementations for PTRACE_TRACEME.
+ */
+int ptrace_traceme(void)
+{
+ int ret = -EPERM;
+
+ write_lock_irq(&tasklist_lock);
+ /* Are we already being traced? */
+ if (!current->ptrace) {
+ ret = security_ptrace_traceme(current->parent);
+ /*
+ * Check PF_EXITING to ensure ->real_parent has not passed
+ * exit_ptrace(). Otherwise we don't report the error but
+ * pretend ->real_parent untraces us right after return.
+ */
+ if (!ret && !(current->real_parent->flags & PF_EXITING)) {
+ current->ptrace = PT_PTRACED;
+ __ptrace_link(current, current->real_parent);
+ }
+ }
+ write_unlock_irq(&tasklist_lock);
+
+ return ret;
+}
+
/*
* Called with irqs disabled, returns true if childs should reap themselves.
*/
@@ -418,37 +424,33 @@ static int ptrace_setoptions(struct task_struct *child, long data)
static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info)
{
+ unsigned long flags;
int error = -ESRCH;
- read_lock(&tasklist_lock);
- if (likely(child->sighand != NULL)) {
+ if (lock_task_sighand(child, &flags)) {
error = -EINVAL;
- spin_lock_irq(&child->sighand->siglock);
if (likely(child->last_siginfo != NULL)) {
*info = *child->last_siginfo;
error = 0;
}
- spin_unlock_irq(&child->sighand->siglock);
+ unlock_task_sighand(child, &flags);
}
- read_unlock(&tasklist_lock);
return error;
}
static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info)
{
+ unsigned long flags;
int error = -ESRCH;
- read_lock(&tasklist_lock);
- if (likely(child->sighand != NULL)) {
+ if (lock_task_sighand(child, &flags)) {
error = -EINVAL;
- spin_lock_irq(&child->sighand->siglock);
if (likely(child->last_siginfo != NULL)) {
*child->last_siginfo = *info;
error = 0;
}
- spin_unlock_irq(&child->sighand->siglock);
+ unlock_task_sighand(child, &flags);
}
- read_unlock(&tasklist_lock);
return error;
}
@@ -575,72 +577,16 @@ int ptrace_request(struct task_struct *child, long request,
return ret;
}
-/**
- * ptrace_traceme -- helper for PTRACE_TRACEME
- *
- * Performs checks and sets PT_PTRACED.
- * Should be used by all ptrace implementations for PTRACE_TRACEME.
- */
-int ptrace_traceme(void)
-{
- int ret = -EPERM;
-
- /*
- * Are we already being traced?
- */
-repeat:
- task_lock(current);
- if (!(current->ptrace & PT_PTRACED)) {
- /*
- * See ptrace_attach() comments about the locking here.
- */
- unsigned long flags;
- if (!write_trylock_irqsave(&tasklist_lock, flags)) {
- task_unlock(current);
- do {
- cpu_relax();
- } while (!write_can_lock(&tasklist_lock));
- goto repeat;
- }
-
- ret = security_ptrace_traceme(current->parent);
-
- /*
- * Check PF_EXITING to ensure ->real_parent has not passed
- * exit_ptrace(). Otherwise we don't report the error but
- * pretend ->real_parent untraces us right after return.
- */
- if (!ret && !(current->real_parent->flags & PF_EXITING)) {
- current->ptrace |= PT_PTRACED;
- __ptrace_link(current, current->real_parent);
- }
-
- write_unlock_irqrestore(&tasklist_lock, flags);
- }
- task_unlock(current);
- return ret;
-}
-
-/**
- * ptrace_get_task_struct -- grab a task struct reference for ptrace
- * @pid: process id to grab a task_struct reference of
- *
- * This function is a helper for ptrace implementations. It checks
- * permissions and then grabs a task struct for use of the actual
- * ptrace implementation.
- *
- * Returns the task_struct for @pid or an ERR_PTR() on failure.
- */
-struct task_struct *ptrace_get_task_struct(pid_t pid)
+static struct task_struct *ptrace_get_task_struct(pid_t pid)
{
struct task_struct *child;
- read_lock(&tasklist_lock);
+ rcu_read_lock();
child = find_task_by_vpid(pid);
if (child)
get_task_struct(child);
+ rcu_read_unlock();
- read_unlock(&tasklist_lock);
if (!child)
return ERR_PTR(-ESRCH);
return child;
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index ce97a4df64d..beb0e659adc 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -1356,17 +1356,11 @@ static int rcu_sched_grace_period(void *arg)
rcu_ctrlblk.sched_sleep = rcu_sched_sleeping;
spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
- ret = 0;
+ ret = 0; /* unused */
__wait_event_interruptible(rcu_ctrlblk.sched_wq,
rcu_ctrlblk.sched_sleep != rcu_sched_sleeping,
ret);
- /*
- * Signals would prevent us from sleeping, and we cannot
- * do much with them in any case. So flush them.
- */
- if (ret)
- flush_signals(current);
couldsleepnext = 0;
} while (!kthread_should_stop());
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d2a372fb0b9..7717b95c202 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1259,31 +1259,44 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
check_cpu_stall(rsp, rdp);
/* Is the RCU core waiting for a quiescent state from this CPU? */
- if (rdp->qs_pending)
+ if (rdp->qs_pending) {
+ rdp->n_rp_qs_pending++;
return 1;
+ }
/* Does this CPU have callbacks ready to invoke? */
- if (cpu_has_callbacks_ready_to_invoke(rdp))
+ if (cpu_has_callbacks_ready_to_invoke(rdp)) {
+ rdp->n_rp_cb_ready++;
return 1;
+ }
/* Has RCU gone idle with this CPU needing another grace period? */
- if (cpu_needs_another_gp(rsp, rdp))
+ if (cpu_needs_another_gp(rsp, rdp)) {
+ rdp->n_rp_cpu_needs_gp++;
return 1;
+ }
/* Has another RCU grace period completed? */
- if (ACCESS_ONCE(rsp->completed) != rdp->completed) /* outside of lock */
+ if (ACCESS_ONCE(rsp->completed) != rdp->completed) { /* outside lock */
+ rdp->n_rp_gp_completed++;
return 1;
+ }
/* Has a new RCU grace period started? */
- if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) /* outside of lock */
+ if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) { /* outside lock */
+ rdp->n_rp_gp_started++;
return 1;
+ }
/* Has an RCU GP gone long enough to send resched IPIs &c? */
if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) &&
- ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0))
+ ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) {
+ rdp->n_rp_need_fqs++;
return 1;
+ }
/* nothing to do */
+ rdp->n_rp_need_nothing++;
return 0;
}
@@ -1520,7 +1533,7 @@ void __init __rcu_init(void)
int j;
struct rcu_node *rnp;
- printk(KERN_WARNING "Experimental hierarchical RCU implementation.\n");
+ printk(KERN_INFO "Hierarchical RCU implementation.\n");
#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
@@ -1533,7 +1546,6 @@ void __init __rcu_init(void)
rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i);
/* Register notifier for non-boot CPUs */
register_cpu_notifier(&rcu_nb);
- printk(KERN_WARNING "Experimental hierarchical RCU init done.\n");
}
module_param(blimit, int, 0);
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 4b1875ba940..fe1dcdbf1ca 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -213,7 +213,63 @@ static struct file_operations rcugp_fops = {
.release = single_release,
};
-static struct dentry *rcudir, *datadir, *datadir_csv, *hierdir, *gpdir;
+static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
+{
+ seq_printf(m, "%3d%cnp=%ld "
+ "qsp=%ld cbr=%ld cng=%ld gpc=%ld gps=%ld nf=%ld nn=%ld\n",
+ rdp->cpu,
+ cpu_is_offline(rdp->cpu) ? '!' : ' ',
+ rdp->n_rcu_pending,
+ rdp->n_rp_qs_pending,
+ rdp->n_rp_cb_ready,
+ rdp->n_rp_cpu_needs_gp,
+ rdp->n_rp_gp_completed,
+ rdp->n_rp_gp_started,
+ rdp->n_rp_need_fqs,
+ rdp->n_rp_need_nothing);
+}
+
+static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp)
+{
+ int cpu;
+ struct rcu_data *rdp;
+
+ for_each_possible_cpu(cpu) {
+ rdp = rsp->rda[cpu];
+ if (rdp->beenonline)
+ print_one_rcu_pending(m, rdp);
+ }
+}
+
+static int show_rcu_pending(struct seq_file *m, void *unused)
+{
+ seq_puts(m, "rcu:\n");
+ print_rcu_pendings(m, &rcu_state);
+ seq_puts(m, "rcu_bh:\n");
+ print_rcu_pendings(m, &rcu_bh_state);
+ return 0;
+}
+
+static int rcu_pending_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, show_rcu_pending, NULL);
+}
+
+static struct file_operations rcu_pending_fops = {
+ .owner = THIS_MODULE,
+ .open = rcu_pending_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static struct dentry *rcudir;
+static struct dentry *datadir;
+static struct dentry *datadir_csv;
+static struct dentry *gpdir;
+static struct dentry *hierdir;
+static struct dentry *rcu_pendingdir;
+
static int __init rcuclassic_trace_init(void)
{
rcudir = debugfs_create_dir("rcu", NULL);
@@ -238,6 +294,11 @@ static int __init rcuclassic_trace_init(void)
NULL, &rcuhier_fops);
if (!hierdir)
goto free_out;
+
+ rcu_pendingdir = debugfs_create_file("rcu_pending", 0444, rcudir,
+ NULL, &rcu_pending_fops);
+ if (!rcu_pendingdir)
+ goto free_out;
return 0;
free_out:
if (datadir)
@@ -257,6 +318,7 @@ static void __exit rcuclassic_trace_cleanup(void)
debugfs_remove(datadir_csv);
debugfs_remove(gpdir);
debugfs_remove(hierdir);
+ debugfs_remove(rcu_pendingdir);
debugfs_remove(rcudir);
}
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index bf8e7534c80..e1338f07431 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -18,7 +18,7 @@
void res_counter_init(struct res_counter *counter, struct res_counter *parent)
{
spin_lock_init(&counter->lock);
- counter->limit = (unsigned long long)LLONG_MAX;
+ counter->limit = RESOURCE_MAX;
counter->parent = parent;
}
@@ -133,6 +133,16 @@ int res_counter_memparse_write_strategy(const char *buf,
unsigned long long *res)
{
char *end;
+
+ /* return RESOURCE_MAX(unlimited) if "-1" is specified */
+ if (*buf == '-') {
+ *res = simple_strtoull(buf + 1, &end, 10);
+ if (*res != 1 || *end != '\0')
+ return -EINVAL;
+ *res = RESOURCE_MAX;
+ return 0;
+ }
+
/* FIXME - make memparse() take const char* args */
*res = memparse((char *)buf, &end);
if (*end != '\0')
diff --git a/kernel/resource.c b/kernel/resource.c
index ac5f3a36923..78b087221c1 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -787,7 +787,7 @@ static int __init reserve_setup(char *str)
static struct resource reserve[MAXRESERVE];
for (;;) {
- int io_start, io_num;
+ unsigned int io_start, io_num;
int x = reserved;
if (get_option (&str, &io_start) != 2)
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 69d9cb921ff..29bd4baf9e7 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -300,7 +300,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* assigned pending owner [which might not have taken the
* lock yet]:
*/
-static inline int try_to_steal_lock(struct rt_mutex *lock)
+static inline int try_to_steal_lock(struct rt_mutex *lock,
+ struct task_struct *task)
{
struct task_struct *pendowner = rt_mutex_owner(lock);
struct rt_mutex_waiter *next;
@@ -309,11 +310,11 @@ static inline int try_to_steal_lock(struct rt_mutex *lock)
if (!rt_mutex_owner_pending(lock))
return 0;
- if (pendowner == current)
+ if (pendowner == task)
return 1;
spin_lock_irqsave(&pendowner->pi_lock, flags);
- if (current->prio >= pendowner->prio) {
+ if (task->prio >= pendowner->prio) {
spin_unlock_irqrestore(&pendowner->pi_lock, flags);
return 0;
}
@@ -338,21 +339,21 @@ static inline int try_to_steal_lock(struct rt_mutex *lock)
* We are going to steal the lock and a waiter was
* enqueued on the pending owners pi_waiters queue. So
* we have to enqueue this waiter into
- * current->pi_waiters list. This covers the case,
- * where current is boosted because it holds another
+ * task->pi_waiters list. This covers the case,
+ * where task is boosted because it holds another
* lock and gets unboosted because the booster is
* interrupted, so we would delay a waiter with higher
- * priority as current->normal_prio.
+ * priority as task->normal_prio.
*
* Note: in the rare case of a SCHED_OTHER task changing
* its priority and thus stealing the lock, next->task
- * might be current:
+ * might be task:
*/
- if (likely(next->task != current)) {
- spin_lock_irqsave(&current->pi_lock, flags);
- plist_add(&next->pi_list_entry, &current->pi_waiters);
- __rt_mutex_adjust_prio(current);
- spin_unlock_irqrestore(&current->pi_lock, flags);
+ if (likely(next->task != task)) {
+ spin_lock_irqsave(&task->pi_lock, flags);
+ plist_add(&next->pi_list_entry, &task->pi_waiters);
+ __rt_mutex_adjust_prio(task);
+ spin_unlock_irqrestore(&task->pi_lock, flags);
}
return 1;
}
@@ -389,7 +390,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock)
*/
mark_rt_mutex_waiters(lock);
- if (rt_mutex_owner(lock) && !try_to_steal_lock(lock))
+ if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current))
return 0;
/* We got the lock. */
@@ -411,6 +412,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock)
*/
static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
+ struct task_struct *task,
int detect_deadlock)
{
struct task_struct *owner = rt_mutex_owner(lock);
@@ -418,21 +420,21 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
unsigned long flags;
int chain_walk = 0, res;
- spin_lock_irqsave(&current->pi_lock, flags);
- __rt_mutex_adjust_prio(current);
- waiter->task = current;
+ spin_lock_irqsave(&task->pi_lock, flags);
+ __rt_mutex_adjust_prio(task);
+ waiter->task = task;
waiter->lock = lock;
- plist_node_init(&waiter->list_entry, current->prio);
- plist_node_init(&waiter->pi_list_entry, current->prio);
+ plist_node_init(&waiter->list_entry, task->prio);
+ plist_node_init(&waiter->pi_list_entry, task->prio);
/* Get the top priority waiter on the lock */
if (rt_mutex_has_waiters(lock))
top_waiter = rt_mutex_top_waiter(lock);
plist_add(&waiter->list_entry, &lock->wait_list);
- current->pi_blocked_on = waiter;
+ task->pi_blocked_on = waiter;
- spin_unlock_irqrestore(&current->pi_lock, flags);
+ spin_unlock_irqrestore(&task->pi_lock, flags);
if (waiter == rt_mutex_top_waiter(lock)) {
spin_lock_irqsave(&owner->pi_lock, flags);
@@ -460,7 +462,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
spin_unlock(&lock->wait_lock);
res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
- current);
+ task);
spin_lock(&lock->wait_lock);
@@ -605,37 +607,25 @@ void rt_mutex_adjust_pi(struct task_struct *task)
rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task);
}
-/*
- * Slow path lock function:
+/**
+ * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
+ * @lock: the rt_mutex to take
+ * @state: the state the task should block in (TASK_INTERRUPTIBLE
+ * or TASK_UNINTERRUPTIBLE)
+ * @timeout: the pre-initialized and started timer, or NULL for none
+ * @waiter: the pre-initialized rt_mutex_waiter
+ * @detect_deadlock: passed to task_blocks_on_rt_mutex
+ *
+ * lock->wait_lock must be held by the caller.
*/
static int __sched
-rt_mutex_slowlock(struct rt_mutex *lock, int state,
- struct hrtimer_sleeper *timeout,
- int detect_deadlock)
+__rt_mutex_slowlock(struct rt_mutex *lock, int state,
+ struct hrtimer_sleeper *timeout,
+ struct rt_mutex_waiter *waiter,
+ int detect_deadlock)
{
- struct rt_mutex_waiter waiter;
int ret = 0;
- debug_rt_mutex_init_waiter(&waiter);
- waiter.task = NULL;
-
- spin_lock(&lock->wait_lock);
-
- /* Try to acquire the lock again: */
- if (try_to_take_rt_mutex(lock)) {
- spin_unlock(&lock->wait_lock);
- return 0;
- }
-
- set_current_state(state);
-
- /* Setup the timer, when timeout != NULL */
- if (unlikely(timeout)) {
- hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
- if (!hrtimer_active(&timeout->timer))
- timeout->task = NULL;
- }
-
for (;;) {
/* Try to acquire the lock: */
if (try_to_take_rt_mutex(lock))
@@ -656,19 +646,19 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
}
/*
- * waiter.task is NULL the first time we come here and
+ * waiter->task is NULL the first time we come here and
* when we have been woken up by the previous owner
* but the lock got stolen by a higher prio task.
*/
- if (!waiter.task) {
- ret = task_blocks_on_rt_mutex(lock, &waiter,
+ if (!waiter->task) {
+ ret = task_blocks_on_rt_mutex(lock, waiter, current,
detect_deadlock);
/*
* If we got woken up by the owner then start loop
* all over without going into schedule to try
* to get the lock now:
*/
- if (unlikely(!waiter.task)) {
+ if (unlikely(!waiter->task)) {
/*
* Reset the return value. We might
* have returned with -EDEADLK and the
@@ -684,15 +674,52 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
spin_unlock(&lock->wait_lock);
- debug_rt_mutex_print_deadlock(&waiter);
+ debug_rt_mutex_print_deadlock(waiter);
- if (waiter.task)
+ if (waiter->task)
schedule_rt_mutex(lock);
spin_lock(&lock->wait_lock);
set_current_state(state);
}
+ return ret;
+}
+
+/*
+ * Slow path lock function:
+ */
+static int __sched
+rt_mutex_slowlock(struct rt_mutex *lock, int state,
+ struct hrtimer_sleeper *timeout,
+ int detect_deadlock)
+{
+ struct rt_mutex_waiter waiter;
+ int ret = 0;
+
+ debug_rt_mutex_init_waiter(&waiter);
+ waiter.task = NULL;
+
+ spin_lock(&lock->wait_lock);
+
+ /* Try to acquire the lock again: */
+ if (try_to_take_rt_mutex(lock)) {
+ spin_unlock(&lock->wait_lock);
+ return 0;
+ }
+
+ set_current_state(state);
+
+ /* Setup the timer, when timeout != NULL */
+ if (unlikely(timeout)) {
+ hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
+ if (!hrtimer_active(&timeout->timer))
+ timeout->task = NULL;
+ }
+
+ ret = __rt_mutex_slowlock(lock, state, timeout, &waiter,
+ detect_deadlock);
+
set_current_state(TASK_RUNNING);
if (unlikely(waiter.task))
@@ -864,9 +891,9 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock,
EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
/**
- * rt_mutex_lock_interruptible_ktime - lock a rt_mutex interruptible
- * the timeout structure is provided
- * by the caller
+ * rt_mutex_timed_lock - lock a rt_mutex interruptible
+ * the timeout structure is provided
+ * by the caller
*
* @lock: the rt_mutex to be locked
* @timeout: timeout structure or NULL (no timeout)
@@ -875,7 +902,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
* Returns:
* 0 on success
* -EINTR when interrupted by a signal
- * -ETIMEOUT when the timeout expired
+ * -ETIMEDOUT when the timeout expired
* -EDEADLK when the lock would deadlock (when deadlock detection is on)
*/
int
@@ -913,7 +940,7 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock)
}
EXPORT_SYMBOL_GPL(rt_mutex_unlock);
-/***
+/**
* rt_mutex_destroy - mark a mutex unusable
* @lock: the mutex to be destroyed
*
@@ -986,6 +1013,57 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
}
/**
+ * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
+ * @lock: the rt_mutex to take
+ * @waiter: the pre-initialized rt_mutex_waiter
+ * @task: the task to prepare
+ * @detect_deadlock: perform deadlock detection (1) or not (0)
+ *
+ * Returns:
+ * 0 - task blocked on lock
+ * 1 - acquired the lock for task, caller should wake it up
+ * <0 - error
+ *
+ * Special API call for FUTEX_REQUEUE_PI support.
+ */
+int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task, int detect_deadlock)
+{
+ int ret;
+
+ spin_lock(&lock->wait_lock);
+
+ mark_rt_mutex_waiters(lock);
+
+ if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) {
+ /* We got the lock for task. */
+ debug_rt_mutex_lock(lock);
+ rt_mutex_set_owner(lock, task, 0);
+ spin_unlock(&lock->wait_lock);
+ rt_mutex_deadlock_account_lock(lock, task);
+ return 1;
+ }
+
+ ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock);
+
+ if (ret && !waiter->task) {
+ /*
+ * Reset the return value. We might have
+ * returned with -EDEADLK and the owner
+ * released the lock while we were walking the
+ * pi chain. Let the waiter sort it out.
+ */
+ ret = 0;
+ }
+ spin_unlock(&lock->wait_lock);
+
+ debug_rt_mutex_print_deadlock(waiter);
+
+ return ret;
+}
+
+/**
* rt_mutex_next_owner - return the next owner of the lock
*
* @lock: the rt lock query
@@ -1004,3 +1082,57 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
return rt_mutex_top_waiter(lock)->task;
}
+
+/**
+ * rt_mutex_finish_proxy_lock() - Complete lock acquisition
+ * @lock: the rt_mutex we were woken on
+ * @to: the timeout, null if none. hrtimer should already have
+ * been started.
+ * @waiter: the pre-initialized rt_mutex_waiter
+ * @detect_deadlock: perform deadlock detection (1) or not (0)
+ *
+ * Complete the lock acquisition started our behalf by another thread.
+ *
+ * Returns:
+ * 0 - success
+ * <0 - error, one of -EINTR, -ETIMEDOUT, or -EDEADLK
+ *
+ * Special API call for PI-futex requeue support
+ */
+int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
+ struct hrtimer_sleeper *to,
+ struct rt_mutex_waiter *waiter,
+ int detect_deadlock)
+{
+ int ret;
+
+ spin_lock(&lock->wait_lock);
+
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter,
+ detect_deadlock);
+
+ set_current_state(TASK_RUNNING);
+
+ if (unlikely(waiter->task))
+ remove_waiter(lock, waiter);
+
+ /*
+ * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+ * have to fix that up.
+ */
+ fixup_rt_mutex_waiters(lock);
+
+ spin_unlock(&lock->wait_lock);
+
+ /*
+ * Readjust priority, when we did not get the lock. We might have been
+ * the pending owner and boosted. Since we did not take the lock, the
+ * PI boost has to go.
+ */
+ if (unlikely(ret))
+ rt_mutex_adjust_prio(current);
+
+ return ret;
+}
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index e124bf5800e..97a2f81866a 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -120,6 +120,14 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
struct task_struct *proxy_owner);
extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
struct task_struct *proxy_owner);
+extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task,
+ int detect_deadlock);
+extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
+ struct hrtimer_sleeper *to,
+ struct rt_mutex_waiter *waiter,
+ int detect_deadlock);
#ifdef CONFIG_DEBUG_RT_MUTEXES
# include "rtmutex-debug.h"
diff --git a/kernel/sched.c b/kernel/sched.c
index 26efa475bdc..1b59e265273 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -39,6 +39,7 @@
#include <linux/completion.h>
#include <linux/kernel_stat.h>
#include <linux/debug_locks.h>
+#include <linux/perf_counter.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/profile.h>
@@ -68,17 +69,18 @@
#include <linux/pagemap.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
-#include <linux/bootmem.h>
#include <linux/debugfs.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>
-#include <trace/sched.h>
#include <asm/tlb.h>
#include <asm/irq_regs.h>
#include "sched_cpupri.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/sched.h>
+
/*
* Convert user-nice values [ -20 ... 0 ... 19 ]
* to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -118,12 +120,6 @@
*/
#define RUNTIME_INF ((u64)~0ULL)
-DEFINE_TRACE(sched_wait_task);
-DEFINE_TRACE(sched_wakeup);
-DEFINE_TRACE(sched_wakeup_new);
-DEFINE_TRACE(sched_switch);
-DEFINE_TRACE(sched_migrate_task);
-
#ifdef CONFIG_SMP
static void double_rq_lock(struct rq *rq1, struct rq *rq2);
@@ -244,7 +240,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
hard = hrtimer_get_expires(&rt_b->rt_period_timer);
delta = ktime_to_ns(ktime_sub(hard, soft));
__hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
- HRTIMER_MODE_ABS, 0);
+ HRTIMER_MODE_ABS_PINNED, 0);
}
spin_unlock(&rt_b->rt_runtime_lock);
}
@@ -497,6 +493,7 @@ struct rt_rq {
#endif
#ifdef CONFIG_SMP
unsigned long rt_nr_migratory;
+ unsigned long rt_nr_total;
int overloaded;
struct plist_head pushable_tasks;
#endif
@@ -584,6 +581,7 @@ struct rq {
struct load_weight load;
unsigned long nr_load_updates;
u64 nr_switches;
+ u64 nr_migrations_in;
struct cfs_rq cfs;
struct rt_rq rt;
@@ -630,6 +628,10 @@ struct rq {
struct list_head migration_queue;
#endif
+ /* calc_load related fields */
+ unsigned long calc_load_update;
+ long calc_load_active;
+
#ifdef CONFIG_SCHED_HRTICK
#ifdef CONFIG_SMP
int hrtick_csd_pending;
@@ -692,7 +694,7 @@ static inline int cpu_of(struct rq *rq)
#define task_rq(p) cpu_rq(task_cpu(p))
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
-static inline void update_rq_clock(struct rq *rq)
+inline void update_rq_clock(struct rq *rq)
{
rq->clock = sched_clock_cpu(cpu_of(rq));
}
@@ -1154,7 +1156,7 @@ static __init void init_hrtick(void)
static void hrtick_start(struct rq *rq, u64 delay)
{
__hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
- HRTIMER_MODE_REL, 0);
+ HRTIMER_MODE_REL_PINNED, 0);
}
static inline void init_hrtick(void)
@@ -1728,6 +1730,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
}
#endif
+static void calc_load_account_active(struct rq *this_rq);
+
#include "sched_stats.h"
#include "sched_idletask.c"
#include "sched_fair.c"
@@ -1958,7 +1962,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
clock_offset = old_rq->clock - new_rq->clock;
- trace_sched_migrate_task(p, task_cpu(p), new_cpu);
+ trace_sched_migrate_task(p, new_cpu);
#ifdef CONFIG_SCHEDSTATS
if (p->se.wait_start)
@@ -1967,12 +1971,17 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
p->se.sleep_start -= clock_offset;
if (p->se.block_start)
p->se.block_start -= clock_offset;
+#endif
if (old_cpu != new_cpu) {
- schedstat_inc(p, se.nr_migrations);
+ p->se.nr_migrations++;
+ new_rq->nr_migrations_in++;
+#ifdef CONFIG_SCHEDSTATS
if (task_hot(p, old_rq->clock, NULL))
schedstat_inc(p, se.nr_forced2_migrations);
- }
#endif
+ perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS,
+ 1, 1, NULL, 0);
+ }
p->se.vruntime -= old_cfsrq->min_vruntime -
new_cfsrq->min_vruntime;
@@ -2015,6 +2024,49 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
}
/*
+ * wait_task_context_switch - wait for a thread to complete at least one
+ * context switch.
+ *
+ * @p must not be current.
+ */
+void wait_task_context_switch(struct task_struct *p)
+{
+ unsigned long nvcsw, nivcsw, flags;
+ int running;
+ struct rq *rq;
+
+ nvcsw = p->nvcsw;
+ nivcsw = p->nivcsw;
+ for (;;) {
+ /*
+ * The runqueue is assigned before the actual context
+ * switch. We need to take the runqueue lock.
+ *
+ * We could check initially without the lock but it is
+ * very likely that we need to take the lock in every
+ * iteration.
+ */
+ rq = task_rq_lock(p, &flags);
+ running = task_running(rq, p);
+ task_rq_unlock(rq, &flags);
+
+ if (likely(!running))
+ break;
+ /*
+ * The switch count is incremented before the actual
+ * context switch. We thus wait for two switches to be
+ * sure at least one completed.
+ */
+ if ((p->nvcsw - nvcsw) > 1)
+ break;
+ if ((p->nivcsw - nivcsw) > 1)
+ break;
+
+ cpu_relax();
+ }
+}
+
+/*
* wait_task_inactive - wait for a thread to unschedule.
*
* If @match_state is nonzero, it's the @p->state value just checked and
@@ -2142,6 +2194,7 @@ void kick_process(struct task_struct *p)
smp_send_reschedule(cpu);
preempt_enable();
}
+EXPORT_SYMBOL_GPL(kick_process);
/*
* Return a low guess at the load of a migration-source cpu weighted
@@ -2324,6 +2377,27 @@ static int sched_balance_self(int cpu, int flag)
#endif /* CONFIG_SMP */
+/**
+ * task_oncpu_function_call - call a function on the cpu on which a task runs
+ * @p: the task to evaluate
+ * @func: the function to be called
+ * @info: the function call argument
+ *
+ * Calls the function @func when the task is currently running. This might
+ * be on the current CPU, which just calls the function directly
+ */
+void task_oncpu_function_call(struct task_struct *p,
+ void (*func) (void *info), void *info)
+{
+ int cpu;
+
+ preempt_disable();
+ cpu = task_cpu(p);
+ if (task_curr(p))
+ smp_call_function_single(cpu, func, info, 1);
+ preempt_enable();
+}
+
/***
* try_to_wake_up - wake up a thread
* @p: the to-be-woken-up thread
@@ -2458,6 +2532,17 @@ out:
return success;
}
+/**
+ * wake_up_process - Wake up a specific process
+ * @p: The process to be woken up.
+ *
+ * Attempt to wake up the nominated process and move it to the set of runnable
+ * processes. Returns 1 if the process was woken up, 0 if it was already
+ * running.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
int wake_up_process(struct task_struct *p)
{
return try_to_wake_up(p, TASK_ALL, 0);
@@ -2480,21 +2565,44 @@ static void __sched_fork(struct task_struct *p)
p->se.exec_start = 0;
p->se.sum_exec_runtime = 0;
p->se.prev_sum_exec_runtime = 0;
+ p->se.nr_migrations = 0;
p->se.last_wakeup = 0;
p->se.avg_overlap = 0;
p->se.start_runtime = 0;
p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
#ifdef CONFIG_SCHEDSTATS
- p->se.wait_start = 0;
- p->se.sum_sleep_runtime = 0;
- p->se.sleep_start = 0;
- p->se.block_start = 0;
- p->se.sleep_max = 0;
- p->se.block_max = 0;
- p->se.exec_max = 0;
- p->se.slice_max = 0;
- p->se.wait_max = 0;
+ p->se.wait_start = 0;
+ p->se.wait_max = 0;
+ p->se.wait_count = 0;
+ p->se.wait_sum = 0;
+
+ p->se.sleep_start = 0;
+ p->se.sleep_max = 0;
+ p->se.sum_sleep_runtime = 0;
+
+ p->se.block_start = 0;
+ p->se.block_max = 0;
+ p->se.exec_max = 0;
+ p->se.slice_max = 0;
+
+ p->se.nr_migrations_cold = 0;
+ p->se.nr_failed_migrations_affine = 0;
+ p->se.nr_failed_migrations_running = 0;
+ p->se.nr_failed_migrations_hot = 0;
+ p->se.nr_forced_migrations = 0;
+ p->se.nr_forced2_migrations = 0;
+
+ p->se.nr_wakeups = 0;
+ p->se.nr_wakeups_sync = 0;
+ p->se.nr_wakeups_migrate = 0;
+ p->se.nr_wakeups_local = 0;
+ p->se.nr_wakeups_remote = 0;
+ p->se.nr_wakeups_affine = 0;
+ p->se.nr_wakeups_affine_attempts = 0;
+ p->se.nr_wakeups_passive = 0;
+ p->se.nr_wakeups_idle = 0;
+
#endif
INIT_LIST_HEAD(&p->rt.run_list);
@@ -2710,6 +2818,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
*/
prev_state = prev->state;
finish_arch_switch(prev);
+ perf_counter_task_sched_in(current, cpu_of(rq));
finish_lock_switch(rq, prev);
#ifdef CONFIG_SMP
if (post_schedule)
@@ -2766,7 +2875,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
* combine the page table reload and the switch backend into
* one hypercall.
*/
- arch_enter_lazy_cpu_mode();
+ arch_start_context_switch(prev);
if (unlikely(!mm)) {
next->active_mm = oldmm;
@@ -2856,19 +2965,81 @@ unsigned long nr_iowait(void)
return sum;
}
-unsigned long nr_active(void)
+/* Variables and functions for calc_load */
+static atomic_long_t calc_load_tasks;
+static unsigned long calc_load_update;
+unsigned long avenrun[3];
+EXPORT_SYMBOL(avenrun);
+
+/**
+ * get_avenrun - get the load average array
+ * @loads: pointer to dest load array
+ * @offset: offset to add
+ * @shift: shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
{
- unsigned long i, running = 0, uninterruptible = 0;
+ loads[0] = (avenrun[0] + offset) << shift;
+ loads[1] = (avenrun[1] + offset) << shift;
+ loads[2] = (avenrun[2] + offset) << shift;
+}
- for_each_online_cpu(i) {
- running += cpu_rq(i)->nr_running;
- uninterruptible += cpu_rq(i)->nr_uninterruptible;
- }
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+ load *= exp;
+ load += active * (FIXED_1 - exp);
+ return load >> FSHIFT;
+}
+
+/*
+ * calc_load - update the avenrun load estimates 10 ticks after the
+ * CPUs have updated calc_load_tasks.
+ */
+void calc_global_load(void)
+{
+ unsigned long upd = calc_load_update + 10;
+ long active;
+
+ if (time_before(jiffies, upd))
+ return;
+
+ active = atomic_long_read(&calc_load_tasks);
+ active = active > 0 ? active * FIXED_1 : 0;
- if (unlikely((long)uninterruptible < 0))
- uninterruptible = 0;
+ avenrun[0] = calc_load(avenrun[0], EXP_1, active);
+ avenrun[1] = calc_load(avenrun[1], EXP_5, active);
+ avenrun[2] = calc_load(avenrun[2], EXP_15, active);
- return running + uninterruptible;
+ calc_load_update += LOAD_FREQ;
+}
+
+/*
+ * Either called from update_cpu_load() or from a cpu going idle
+ */
+static void calc_load_account_active(struct rq *this_rq)
+{
+ long nr_active, delta;
+
+ nr_active = this_rq->nr_running;
+ nr_active += (long) this_rq->nr_uninterruptible;
+
+ if (nr_active != this_rq->calc_load_active) {
+ delta = nr_active - this_rq->calc_load_active;
+ this_rq->calc_load_active = nr_active;
+ atomic_long_add(delta, &calc_load_tasks);
+ }
+}
+
+/*
+ * Externally visible per-cpu scheduler statistics:
+ * cpu_nr_migrations(cpu) - number of migrations into that cpu
+ */
+u64 cpu_nr_migrations(int cpu)
+{
+ return cpu_rq(cpu)->nr_migrations_in;
}
/*
@@ -2899,6 +3070,11 @@ static void update_cpu_load(struct rq *this_rq)
new_load += scale-1;
this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
}
+
+ if (time_after_eq(jiffies, this_rq->calc_load_update)) {
+ this_rq->calc_load_update += LOAD_FREQ;
+ calc_load_account_active(this_rq);
+ }
}
#ifdef CONFIG_SMP
@@ -4240,10 +4416,131 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
static struct {
atomic_t load_balancer;
cpumask_var_t cpu_mask;
+ cpumask_var_t ilb_grp_nohz_mask;
} nohz ____cacheline_aligned = {
.load_balancer = ATOMIC_INIT(-1),
};
+int get_nohz_load_balancer(void)
+{
+ return atomic_read(&nohz.load_balancer);
+}
+
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * lowest_flag_domain - Return lowest sched_domain containing flag.
+ * @cpu: The cpu whose lowest level of sched domain is to
+ * be returned.
+ * @flag: The flag to check for the lowest sched_domain
+ * for the given cpu.
+ *
+ * Returns the lowest sched_domain of a cpu which contains the given flag.
+ */
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+ struct sched_domain *sd;
+
+ for_each_domain(cpu, sd)
+ if (sd && (sd->flags & flag))
+ break;
+
+ return sd;
+}
+
+/**
+ * for_each_flag_domain - Iterates over sched_domains containing the flag.
+ * @cpu: The cpu whose domains we're iterating over.
+ * @sd: variable holding the value of the power_savings_sd
+ * for cpu.
+ * @flag: The flag to filter the sched_domains to be iterated.
+ *
+ * Iterates over all the scheduler domains for a given cpu that has the 'flag'
+ * set, starting from the lowest sched_domain to the highest.
+ */
+#define for_each_flag_domain(cpu, sd, flag) \
+ for (sd = lowest_flag_domain(cpu, flag); \
+ (sd && (sd->flags & flag)); sd = sd->parent)
+
+/**
+ * is_semi_idle_group - Checks if the given sched_group is semi-idle.
+ * @ilb_group: group to be checked for semi-idleness
+ *
+ * Returns: 1 if the group is semi-idle. 0 otherwise.
+ *
+ * We define a sched_group to be semi idle if it has atleast one idle-CPU
+ * and atleast one non-idle CPU. This helper function checks if the given
+ * sched_group is semi-idle or not.
+ */
+static inline int is_semi_idle_group(struct sched_group *ilb_group)
+{
+ cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
+ sched_group_cpus(ilb_group));
+
+ /*
+ * A sched_group is semi-idle when it has atleast one busy cpu
+ * and atleast one idle cpu.
+ */
+ if (cpumask_empty(nohz.ilb_grp_nohz_mask))
+ return 0;
+
+ if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
+ return 0;
+
+ return 1;
+}
+/**
+ * find_new_ilb - Finds the optimum idle load balancer for nomination.
+ * @cpu: The cpu which is nominating a new idle_load_balancer.
+ *
+ * Returns: Returns the id of the idle load balancer if it exists,
+ * Else, returns >= nr_cpu_ids.
+ *
+ * This algorithm picks the idle load balancer such that it belongs to a
+ * semi-idle powersavings sched_domain. The idea is to try and avoid
+ * completely idle packages/cores just for the purpose of idle load balancing
+ * when there are other idle cpu's which are better suited for that job.
+ */
+static int find_new_ilb(int cpu)
+{
+ struct sched_domain *sd;
+ struct sched_group *ilb_group;
+
+ /*
+ * Have idle load balancer selection from semi-idle packages only
+ * when power-aware load balancing is enabled
+ */
+ if (!(sched_smt_power_savings || sched_mc_power_savings))
+ goto out_done;
+
+ /*
+ * Optimize for the case when we have no idle CPUs or only one
+ * idle CPU. Don't walk the sched_domain hierarchy in such cases
+ */
+ if (cpumask_weight(nohz.cpu_mask) < 2)
+ goto out_done;
+
+ for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
+ ilb_group = sd->groups;
+
+ do {
+ if (is_semi_idle_group(ilb_group))
+ return cpumask_first(nohz.ilb_grp_nohz_mask);
+
+ ilb_group = ilb_group->next;
+
+ } while (ilb_group != sd->groups);
+ }
+
+out_done:
+ return cpumask_first(nohz.cpu_mask);
+}
+#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
+static inline int find_new_ilb(int call_cpu)
+{
+ return cpumask_first(nohz.cpu_mask);
+}
+#endif
+
/*
* This routine will try to nominate the ilb (idle load balancing)
* owner among the cpus whose ticks are stopped. ilb owner will do the idle
@@ -4298,8 +4595,24 @@ int select_nohz_load_balancer(int stop_tick)
/* make me the ilb owner */
if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
return 1;
- } else if (atomic_read(&nohz.load_balancer) == cpu)
+ } else if (atomic_read(&nohz.load_balancer) == cpu) {
+ int new_ilb;
+
+ if (!(sched_smt_power_savings ||
+ sched_mc_power_savings))
+ return 1;
+ /*
+ * Check to see if there is a more power-efficient
+ * ilb.
+ */
+ new_ilb = find_new_ilb(cpu);
+ if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
+ atomic_set(&nohz.load_balancer, -1);
+ resched_cpu(new_ilb);
+ return 0;
+ }
return 1;
+ }
} else {
if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
return 0;
@@ -4468,15 +4781,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
}
if (atomic_read(&nohz.load_balancer) == -1) {
- /*
- * simple selection for now: Nominate the
- * first cpu in the nohz list to be the next
- * ilb owner.
- *
- * TBD: Traverse the sched domains and nominate
- * the nearest cpu in the nohz.cpu_mask.
- */
- int ilb = cpumask_first(nohz.cpu_mask);
+ int ilb = find_new_ilb(cpu);
if (ilb < nr_cpu_ids)
resched_cpu(ilb);
@@ -4840,6 +5145,8 @@ void scheduler_tick(void)
curr->sched_class->task_tick(rq, curr, 0);
spin_unlock(&rq->lock);
+ perf_counter_task_tick(curr, cpu);
+
#ifdef CONFIG_SMP
rq->idle_at_tick = idle_cpu(cpu);
trigger_load_balance(rq, cpu);
@@ -5007,13 +5314,15 @@ pick_next_task(struct rq *rq)
/*
* schedule() is the main scheduler function.
*/
-asmlinkage void __sched __schedule(void)
+asmlinkage void __sched schedule(void)
{
struct task_struct *prev, *next;
unsigned long *switch_count;
struct rq *rq;
int cpu;
+need_resched:
+ preempt_disable();
cpu = smp_processor_id();
rq = cpu_rq(cpu);
rcu_qsctr_inc(cpu);
@@ -5053,6 +5362,7 @@ need_resched_nonpreemptible:
if (likely(prev != next)) {
sched_info_switch(prev, next);
+ perf_counter_task_sched_out(prev, next, cpu);
rq->nr_switches++;
rq->curr = next;
@@ -5070,15 +5380,9 @@ need_resched_nonpreemptible:
if (unlikely(reacquire_kernel_lock(current) < 0))
goto need_resched_nonpreemptible;
-}
-asmlinkage void __sched schedule(void)
-{
-need_resched:
- preempt_disable();
- __schedule();
preempt_enable_no_resched();
- if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
+ if (need_resched())
goto need_resched;
}
EXPORT_SYMBOL(schedule);
@@ -5221,7 +5525,7 @@ EXPORT_SYMBOL(default_wake_function);
* started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
* zero in this (rare) case, and we handle it by continuing to scan the queue.
*/
-void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
int nr_exclusive, int sync, void *key)
{
wait_queue_t *curr, *next;
@@ -5241,6 +5545,9 @@ void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
* @mode: which threads
* @nr_exclusive: how many wake-one or wake-many threads to wake up
* @key: is directly passed to the wakeup function
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
*/
void __wake_up(wait_queue_head_t *q, unsigned int mode,
int nr_exclusive, void *key)
@@ -5279,6 +5586,9 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
* with each other. This can prevent needless bouncing between CPUs.
*
* On UP it can prevent extra preemption.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
*/
void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
int nr_exclusive, void *key)
@@ -5315,6 +5625,9 @@ EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
* awakened in the same order in which they were queued.
*
* See also complete_all(), wait_for_completion() and related routines.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
*/
void complete(struct completion *x)
{
@@ -5332,6 +5645,9 @@ EXPORT_SYMBOL(complete);
* @x: holds the state of this particular completion
*
* This will wake up all threads waiting on this particular completion event.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
*/
void complete_all(struct completion *x)
{
@@ -6248,6 +6564,11 @@ SYSCALL_DEFINE0(sched_yield)
return 0;
}
+static inline int should_resched(void)
+{
+ return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
+}
+
static void __cond_resched(void)
{
#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
@@ -6267,8 +6588,7 @@ static void __cond_resched(void)
int __sched _cond_resched(void)
{
- if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
- system_state == SYSTEM_RUNNING) {
+ if (should_resched()) {
__cond_resched();
return 1;
}
@@ -6286,12 +6606,12 @@ EXPORT_SYMBOL(_cond_resched);
*/
int cond_resched_lock(spinlock_t *lock)
{
- int resched = need_resched() && system_state == SYSTEM_RUNNING;
+ int resched = should_resched();
int ret = 0;
if (spin_needbreak(lock) || resched) {
spin_unlock(lock);
- if (resched && need_resched())
+ if (resched)
__cond_resched();
else
cpu_relax();
@@ -6306,7 +6626,7 @@ int __sched cond_resched_softirq(void)
{
BUG_ON(!in_softirq());
- if (need_resched() && system_state == SYSTEM_RUNNING) {
+ if (should_resched()) {
local_bh_enable();
__cond_resched();
local_bh_disable();
@@ -6490,8 +6810,9 @@ void sched_show_task(struct task_struct *p)
#ifdef CONFIG_DEBUG_STACK_USAGE
free = stack_not_used(p);
#endif
- printk(KERN_CONT "%5lu %5d %6d\n", free,
- task_pid_nr(p), task_pid_nr(p->real_parent));
+ printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
+ task_pid_nr(p), task_pid_nr(p->real_parent),
+ (unsigned long)task_thread_info(p)->flags);
show_stack(p, NULL);
}
@@ -6752,7 +7073,7 @@ static int migration_thread(void *data)
if (cpu_is_offline(cpu)) {
spin_unlock_irq(&rq->lock);
- goto wait_to_die;
+ break;
}
if (rq->active_balance) {
@@ -6778,16 +7099,7 @@ static int migration_thread(void *data)
complete(&req->done);
}
__set_current_state(TASK_RUNNING);
- return 0;
-wait_to_die:
- /* Wait for kthread_stop */
- set_current_state(TASK_INTERRUPTIBLE);
- while (!kthread_should_stop()) {
- schedule();
- set_current_state(TASK_INTERRUPTIBLE);
- }
- __set_current_state(TASK_RUNNING);
return 0;
}
@@ -6970,6 +7282,15 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
}
}
+
+/*
+ * remove the tasks which were accounted by rq from calc_load_tasks.
+ */
+static void calc_global_load_remove(struct rq *rq)
+{
+ atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+ rq->calc_load_active = 0;
+}
#endif /* CONFIG_HOTPLUG_CPU */
#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -7193,7 +7514,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
rq = task_rq_lock(p, &flags);
__setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
task_rq_unlock(rq, &flags);
+ get_task_struct(p);
cpu_rq(cpu)->migration_thread = p;
+ rq->calc_load_update = calc_load_update;
break;
case CPU_ONLINE:
@@ -7221,6 +7544,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
kthread_bind(cpu_rq(cpu)->migration_thread,
cpumask_any(cpu_online_mask));
kthread_stop(cpu_rq(cpu)->migration_thread);
+ put_task_struct(cpu_rq(cpu)->migration_thread);
cpu_rq(cpu)->migration_thread = NULL;
break;
@@ -7230,6 +7554,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
migrate_live_tasks(cpu);
rq = cpu_rq(cpu);
kthread_stop(rq->migration_thread);
+ put_task_struct(rq->migration_thread);
rq->migration_thread = NULL;
/* Idle task back to normal (off runqueue, low prio) */
spin_lock_irq(&rq->lock);
@@ -7243,7 +7568,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
cpuset_unlock();
migrate_nr_uninterruptible(rq);
BUG_ON(rq->nr_running != 0);
-
+ calc_global_load_remove(rq);
/*
* No need to migrate the tasks: it was best-effort if
* they didn't take sched_hotcpu_mutex. Just wake up
@@ -7279,8 +7604,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
return NOTIFY_OK;
}
-/* Register at highest priority so that task migration (migrate_all_tasks)
- * happens before everything else.
+/*
+ * Register at high priority so that task migration (migrate_all_tasks)
+ * happens before everything else. This has to be lower priority than
+ * the notifier in the perf_counter subsystem, though.
*/
static struct notifier_block __cpuinitdata migration_notifier = {
.notifier_call = migration_call,
@@ -7523,26 +7850,23 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
free_rootdomain(old_rd);
}
-static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
+static int init_rootdomain(struct root_domain *rd, bool bootmem)
{
+ gfp_t gfp = GFP_KERNEL;
+
memset(rd, 0, sizeof(*rd));
- if (bootmem) {
- alloc_bootmem_cpumask_var(&def_root_domain.span);
- alloc_bootmem_cpumask_var(&def_root_domain.online);
- alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
- cpupri_init(&rd->cpupri, true);
- return 0;
- }
+ if (bootmem)
+ gfp = GFP_NOWAIT;
- if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
+ if (!alloc_cpumask_var(&rd->span, gfp))
goto out;
- if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
+ if (!alloc_cpumask_var(&rd->online, gfp))
goto free_span;
- if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+ if (!alloc_cpumask_var(&rd->rto_mask, gfp))
goto free_online;
- if (cpupri_init(&rd->cpupri, false) != 0)
+ if (cpupri_init(&rd->cpupri, bootmem) != 0)
goto free_rto_mask;
return 0;
@@ -7753,8 +8077,9 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
/*
* The cpus mask in sched_group and sched_domain hangs off the end.
- * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
- * for nr_cpu_ids < CONFIG_NR_CPUS.
+ *
+ * ( See the the comments in include/linux/sched.h:struct sched_group
+ * and struct sched_domain. )
*/
struct static_sched_group {
struct sched_group sg;
@@ -7875,7 +8200,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
struct sched_domain *sd;
sd = &per_cpu(phys_domains, j).sd;
- if (j != cpumask_first(sched_group_cpus(sd->groups))) {
+ if (j != group_first_cpu(sd->groups)) {
/*
* Only add "power" once for each
* physical package.
@@ -7953,7 +8278,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
WARN_ON(!sd || !sd->groups);
- if (cpu != cpumask_first(sched_group_cpus(sd->groups)))
+ if (cpu != group_first_cpu(sd->groups))
return;
child = sd->child;
@@ -8731,6 +9056,8 @@ void __init sched_init_smp(void)
}
#endif /* CONFIG_SMP */
+const_debug unsigned int sysctl_timer_migration = 1;
+
int in_sched_functions(unsigned long addr)
{
return in_lock_functions(addr) ||
@@ -8770,7 +9097,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
#ifdef CONFIG_SMP
rt_rq->rt_nr_migratory = 0;
rt_rq->overloaded = 0;
- plist_head_init(&rq->rt.pushable_tasks, &rq->lock);
+ plist_head_init(&rt_rq->pushable_tasks, &rq->lock);
#endif
rt_rq->rt_time = 0;
@@ -8865,7 +9192,7 @@ void __init sched_init(void)
* we use alloc_bootmem().
*/
if (alloc_size) {
- ptr = (unsigned long)alloc_bootmem(alloc_size);
+ ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
#ifdef CONFIG_FAIR_GROUP_SCHED
init_task_group.se = (struct sched_entity **)ptr;
@@ -8938,6 +9265,8 @@ void __init sched_init(void)
rq = cpu_rq(i);
spin_lock_init(&rq->lock);
rq->nr_running = 0;
+ rq->calc_load_active = 0;
+ rq->calc_load_update = jiffies + LOAD_FREQ;
init_cfs_rq(&rq->cfs, rq);
init_rt_rq(&rq->rt, rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -8958,7 +9287,7 @@ void __init sched_init(void)
* 1024) and two child groups A0 and A1 (of weight 1024 each),
* then A0's share of the cpu resource is:
*
- * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
+ * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
*
* We achieve this by letting init_task_group's tasks sit
* directly in rq->cfs (i.e init_task_group->se[] = NULL).
@@ -9045,20 +9374,26 @@ void __init sched_init(void)
* when this runqueue becomes "idle".
*/
init_idle(current, smp_processor_id());
+
+ calc_load_update = jiffies + LOAD_FREQ;
+
/*
* During early bootup we pretend to be a normal task:
*/
current->sched_class = &fair_sched_class;
/* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
- alloc_bootmem_cpumask_var(&nohz_cpu_mask);
+ alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ
- alloc_bootmem_cpumask_var(&nohz.cpu_mask);
+ alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
+ alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
#endif
- alloc_bootmem_cpumask_var(&cpu_isolated_map);
+ alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
#endif /* SMP */
+ perf_counter_init();
+
scheduler_running = 1;
}
@@ -9800,6 +10135,13 @@ static int sched_rt_global_constraints(void)
if (sysctl_sched_rt_period <= 0)
return -EINVAL;
+ /*
+ * There's always some RT tasks in the root group
+ * -- migration, kstopmachine etc..
+ */
+ if (sysctl_sched_rt_runtime == 0)
+ return -EBUSY;
+
spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
for_each_possible_cpu(i) {
struct rt_rq *rt_rq = &cpu_rq(i)->rt;
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 344712a5e3e..d014efbf947 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -81,8 +81,21 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
continue;
- if (lowest_mask)
+ if (lowest_mask) {
cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
+
+ /*
+ * We have to ensure that we have at least one bit
+ * still set in the array, since the map could have
+ * been concurrently emptied between the first and
+ * second reads of vec->mask. If we hit this
+ * condition, simply act as though we never hit this
+ * priority level and continue on.
+ */
+ if (cpumask_any(lowest_mask) >= nr_cpu_ids)
+ continue;
+ }
+
return 1;
}
@@ -152,10 +165,14 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
*
* Returns: -ENOMEM if memory fails.
*/
-int __init_refok cpupri_init(struct cpupri *cp, bool bootmem)
+int cpupri_init(struct cpupri *cp, bool bootmem)
{
+ gfp_t gfp = GFP_KERNEL;
int i;
+ if (bootmem)
+ gfp = GFP_NOWAIT;
+
memset(cp, 0, sizeof(*cp));
for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
@@ -163,9 +180,7 @@ int __init_refok cpupri_init(struct cpupri *cp, bool bootmem)
spin_lock_init(&vec->lock);
vec->count = 0;
- if (bootmem)
- alloc_bootmem_cpumask_var(&vec->mask);
- else if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&vec->mask, gfp))
goto cleanup;
}
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 467ca72f165..70c7e0b7994 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -162,7 +162,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
{
s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
spread, rq0_min_vruntime, spread0;
- struct rq *rq = &per_cpu(runqueues, cpu);
+ struct rq *rq = cpu_rq(cpu);
struct sched_entity *last;
unsigned long flags;
@@ -191,7 +191,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
if (last)
max_vruntime = last->vruntime;
min_vruntime = cfs_rq->min_vruntime;
- rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime;
+ rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
spin_unlock_irqrestore(&rq->lock, flags);
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
SPLIT_NS(MIN_vruntime));
@@ -248,7 +248,7 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
static void print_cpu(struct seq_file *m, int cpu)
{
- struct rq *rq = &per_cpu(runqueues, cpu);
+ struct rq *rq = cpu_rq(cpu);
#ifdef CONFIG_X86
{
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 3816f217f11..652e8bdef9a 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -266,6 +266,12 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
return min_vruntime;
}
+static inline int entity_before(struct sched_entity *a,
+ struct sched_entity *b)
+{
+ return (s64)(a->vruntime - b->vruntime) < 0;
+}
+
static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
return se->vruntime - cfs_rq->min_vruntime;
@@ -430,12 +436,13 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
for_each_sched_entity(se) {
struct load_weight *load;
+ struct load_weight lw;
cfs_rq = cfs_rq_of(se);
load = &cfs_rq->load;
if (unlikely(!se->on_rq)) {
- struct load_weight lw = cfs_rq->load;
+ lw = cfs_rq->load;
update_load_add(&lw, se->load.weight);
load = &lw;
@@ -604,9 +611,13 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHEDSTATS
+ struct task_struct *tsk = NULL;
+
+ if (entity_is_task(se))
+ tsk = task_of(se);
+
if (se->sleep_start) {
u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
- struct task_struct *tsk = task_of(se);
if ((s64)delta < 0)
delta = 0;
@@ -617,11 +628,11 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
se->sleep_start = 0;
se->sum_sleep_runtime += delta;
- account_scheduler_latency(tsk, delta >> 10, 1);
+ if (tsk)
+ account_scheduler_latency(tsk, delta >> 10, 1);
}
if (se->block_start) {
u64 delta = rq_of(cfs_rq)->clock - se->block_start;
- struct task_struct *tsk = task_of(se);
if ((s64)delta < 0)
delta = 0;
@@ -632,17 +643,19 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
se->block_start = 0;
se->sum_sleep_runtime += delta;
- /*
- * Blocking time is in units of nanosecs, so shift by 20 to
- * get a milliseconds-range estimation of the amount of
- * time that the task spent sleeping:
- */
- if (unlikely(prof_on == SLEEP_PROFILING)) {
-
- profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk),
- delta >> 20);
+ if (tsk) {
+ /*
+ * Blocking time is in units of nanosecs, so shift by
+ * 20 to get a milliseconds-range estimation of the
+ * amount of time that the task spent sleeping:
+ */
+ if (unlikely(prof_on == SLEEP_PROFILING)) {
+ profile_hits(SLEEP_PROFILING,
+ (void *)get_wchan(tsk),
+ delta >> 20);
+ }
+ account_scheduler_latency(tsk, delta >> 10, 0);
}
- account_scheduler_latency(tsk, delta >> 10, 0);
}
#endif
}
@@ -686,7 +699,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
* all of which have the same weight.
*/
if (sched_feat(NORMALIZED_SLEEPER) &&
- task_of(se)->policy != SCHED_IDLE)
+ (!entity_is_task(se) ||
+ task_of(se)->policy != SCHED_IDLE))
thresh = calc_delta_fair(thresh, se);
vruntime -= thresh;
@@ -1015,7 +1029,7 @@ static void yield_task_fair(struct rq *rq)
/*
* Already in the rightmost position?
*/
- if (unlikely(!rightmost || rightmost->vruntime < se->vruntime))
+ if (unlikely(!rightmost || entity_before(rightmost, se)))
return;
/*
@@ -1487,17 +1501,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
find_matching_se(&se, &pse);
- while (se) {
- BUG_ON(!pse);
-
- if (wakeup_preempt_entity(se, pse) == 1) {
- resched_task(curr);
- break;
- }
+ BUG_ON(!pse);
- se = parent_entity(se);
- pse = parent_entity(pse);
- }
+ if (wakeup_preempt_entity(se, pse) == 1)
+ resched_task(curr);
}
static struct task_struct *pick_next_task_fair(struct rq *rq)
@@ -1718,7 +1725,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
/* 'curr' will be NULL if the child belongs to a different group */
if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
- curr && curr->vruntime < se->vruntime) {
+ curr && entity_before(curr, se)) {
/*
* Upon rescheduling, sched_class::put_prev_task() will place
* 'current' within the tree based on its new key value.
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 8a21a2e28c1..499672c10cb 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -22,7 +22,8 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sy
static struct task_struct *pick_next_task_idle(struct rq *rq)
{
schedstat_inc(rq, sched_goidle);
-
+ /* adjust the active tasks as we might go into a long sleep */
+ calc_load_account_active(rq);
return rq->idle;
}
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 9bf0d2a7304..3918e01994e 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -10,6 +10,8 @@ static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
#ifdef CONFIG_RT_GROUP_SCHED
+#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
+
static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
return rt_rq->rq;
@@ -22,6 +24,8 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
#else /* CONFIG_RT_GROUP_SCHED */
+#define rt_entity_is_task(rt_se) (1)
+
static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
return container_of(rt_rq, struct rq, rt);
@@ -73,7 +77,7 @@ static inline void rt_clear_overload(struct rq *rq)
static void update_rt_migration(struct rt_rq *rt_rq)
{
- if (rt_rq->rt_nr_migratory && (rt_rq->rt_nr_running > 1)) {
+ if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
if (!rt_rq->overloaded) {
rt_set_overload(rq_of_rt_rq(rt_rq));
rt_rq->overloaded = 1;
@@ -86,6 +90,12 @@ static void update_rt_migration(struct rt_rq *rt_rq)
static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
+ if (!rt_entity_is_task(rt_se))
+ return;
+
+ rt_rq = &rq_of_rt_rq(rt_rq)->rt;
+
+ rt_rq->rt_nr_total++;
if (rt_se->nr_cpus_allowed > 1)
rt_rq->rt_nr_migratory++;
@@ -94,6 +104,12 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
+ if (!rt_entity_is_task(rt_se))
+ return;
+
+ rt_rq = &rq_of_rt_rq(rt_rq)->rt;
+
+ rt_rq->rt_nr_total--;
if (rt_se->nr_cpus_allowed > 1)
rt_rq->rt_nr_migratory--;
diff --git a/kernel/signal.c b/kernel/signal.c
index d8034737db4..64c5deeaca5 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -27,7 +27,7 @@
#include <linux/freezer.h>
#include <linux/pid_namespace.h>
#include <linux/nsproxy.h>
-#include <trace/sched.h>
+#include <trace/events/sched.h>
#include <asm/param.h>
#include <asm/uaccess.h>
@@ -41,8 +41,6 @@
static struct kmem_cache *sigqueue_cachep;
-DEFINE_TRACE(sched_signal_send);
-
static void __user *sig_handler(struct task_struct *t, int sig)
{
return t->sighand->action[sig - 1].sa.sa_handler;
@@ -249,14 +247,19 @@ void flush_sigqueue(struct sigpending *queue)
/*
* Flush all pending signals for a task.
*/
+void __flush_signals(struct task_struct *t)
+{
+ clear_tsk_thread_flag(t, TIF_SIGPENDING);
+ flush_sigqueue(&t->pending);
+ flush_sigqueue(&t->signal->shared_pending);
+}
+
void flush_signals(struct task_struct *t)
{
unsigned long flags;
spin_lock_irqsave(&t->sighand->siglock, flags);
- clear_tsk_thread_flag(t, TIF_SIGPENDING);
- flush_sigqueue(&t->pending);
- flush_sigqueue(&t->signal->shared_pending);
+ __flush_signals(t);
spin_unlock_irqrestore(&t->sighand->siglock, flags);
}
@@ -829,6 +832,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
{
struct sigpending *pending;
struct sigqueue *q;
+ int override_rlimit;
trace_sched_signal_send(sig, t);
@@ -860,9 +864,13 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
make sure at least one signal gets delivered and don't
pass on the info struct. */
- q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN &&
- (is_si_special(info) ||
- info->si_code >= 0)));
+ if (sig < SIGRTMIN)
+ override_rlimit = (is_si_special(info) || info->si_code >= 0);
+ else
+ override_rlimit = 0;
+
+ q = __sigqueue_alloc(t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE,
+ override_rlimit);
if (q) {
list_add_tail(&q->list, &pending->list);
switch ((unsigned long) info) {
@@ -1402,7 +1410,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
/* do_notify_parent_cldstop should have been called instead. */
BUG_ON(task_is_stopped_or_traced(tsk));
- BUG_ON(!tsk->ptrace &&
+ BUG_ON(!task_ptrace(tsk) &&
(tsk->group_leader != tsk || !thread_group_empty(tsk)));
info.si_signo = sig;
@@ -1441,7 +1449,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
psig = tsk->parent->sighand;
spin_lock_irqsave(&psig->siglock, flags);
- if (!tsk->ptrace && sig == SIGCHLD &&
+ if (!task_ptrace(tsk) && sig == SIGCHLD &&
(psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
(psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) {
/*
@@ -1478,7 +1486,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
struct task_struct *parent;
struct sighand_struct *sighand;
- if (tsk->ptrace & PT_PTRACED)
+ if (task_ptrace(tsk))
parent = tsk->parent;
else {
tsk = tsk->group_leader;
@@ -1491,7 +1499,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
* see comment in do_notify_parent() abot the following 3 lines
*/
rcu_read_lock();
- info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
+ info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
info.si_uid = __task_cred(tsk)->uid;
rcu_read_unlock();
@@ -1527,7 +1535,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
static inline int may_ptrace_stop(void)
{
- if (!likely(current->ptrace & PT_PTRACED))
+ if (!likely(task_ptrace(current)))
return 0;
/*
* Are we in the middle of do_coredump?
@@ -1745,7 +1753,7 @@ static int do_signal_stop(int signr)
static int ptrace_signal(int signr, siginfo_t *info,
struct pt_regs *regs, void *cookie)
{
- if (!(current->ptrace & PT_PTRACED))
+ if (!task_ptrace(current))
return signr;
ptrace_signal_deliver(regs, cookie);
@@ -2278,24 +2286,17 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
return kill_something_info(sig, &info, pid);
}
-static int do_tkill(pid_t tgid, pid_t pid, int sig)
+static int
+do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
{
- int error;
- struct siginfo info;
struct task_struct *p;
unsigned long flags;
-
- error = -ESRCH;
- info.si_signo = sig;
- info.si_errno = 0;
- info.si_code = SI_TKILL;
- info.si_pid = task_tgid_vnr(current);
- info.si_uid = current_uid();
+ int error = -ESRCH;
rcu_read_lock();
p = find_task_by_vpid(pid);
if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) {
- error = check_kill_permission(sig, &info, p);
+ error = check_kill_permission(sig, info, p);
/*
* The null signal is a permissions and process existence
* probe. No signal is actually delivered.
@@ -2305,7 +2306,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
* signal is private anyway.
*/
if (!error && sig && lock_task_sighand(p, &flags)) {
- error = specific_send_sig_info(sig, &info, p);
+ error = specific_send_sig_info(sig, info, p);
unlock_task_sighand(p, &flags);
}
}
@@ -2314,6 +2315,19 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
return error;
}
+static int do_tkill(pid_t tgid, pid_t pid, int sig)
+{
+ struct siginfo info;
+
+ info.si_signo = sig;
+ info.si_errno = 0;
+ info.si_code = SI_TKILL;
+ info.si_pid = task_tgid_vnr(current);
+ info.si_uid = current_uid();
+
+ return do_send_specific(tgid, pid, sig, &info);
+}
+
/**
* sys_tgkill - send signal to one specific thread
* @tgid: the thread group ID of the thread
@@ -2363,6 +2377,32 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
return kill_proc_info(sig, &info, pid);
}
+long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
+{
+ /* This is only valid for single tasks */
+ if (pid <= 0 || tgid <= 0)
+ return -EINVAL;
+
+ /* Not even root can pretend to send signals from the kernel.
+ Nor can they impersonate a kill(), which adds source info. */
+ if (info->si_code >= 0)
+ return -EPERM;
+ info->si_signo = sig;
+
+ return do_send_specific(tgid, pid, sig, info);
+}
+
+SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig,
+ siginfo_t __user *, uinfo)
+{
+ siginfo_t info;
+
+ if (copy_from_user(&info, uinfo, sizeof(siginfo_t)))
+ return -EFAULT;
+
+ return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
+}
+
int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
{
struct task_struct *t = current;
@@ -2414,11 +2454,9 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
stack_t oss;
int error;
- if (uoss) {
- oss.ss_sp = (void __user *) current->sas_ss_sp;
- oss.ss_size = current->sas_ss_size;
- oss.ss_flags = sas_ss_flags(sp);
- }
+ oss.ss_sp = (void __user *) current->sas_ss_sp;
+ oss.ss_size = current->sas_ss_size;
+ oss.ss_flags = sas_ss_flags(sp);
if (uss) {
void __user *ss_sp;
@@ -2426,10 +2464,12 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
int ss_flags;
error = -EFAULT;
- if (!access_ok(VERIFY_READ, uss, sizeof(*uss))
- || __get_user(ss_sp, &uss->ss_sp)
- || __get_user(ss_flags, &uss->ss_flags)
- || __get_user(ss_size, &uss->ss_size))
+ if (!access_ok(VERIFY_READ, uss, sizeof(*uss)))
+ goto out;
+ error = __get_user(ss_sp, &uss->ss_sp) |
+ __get_user(ss_flags, &uss->ss_flags) |
+ __get_user(ss_size, &uss->ss_size);
+ if (error)
goto out;
error = -EPERM;
@@ -2461,13 +2501,16 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
current->sas_ss_size = ss_size;
}
+ error = 0;
if (uoss) {
error = -EFAULT;
- if (copy_to_user(uoss, &oss, sizeof(oss)))
+ if (!access_ok(VERIFY_WRITE, uoss, sizeof(*uoss)))
goto out;
+ error = __put_user(oss.ss_sp, &uoss->ss_sp) |
+ __put_user(oss.ss_size, &uoss->ss_size) |
+ __put_user(oss.ss_flags, &uoss->ss_flags);
}
- error = 0;
out:
return error;
}
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index b28d19135f4..09d7519557d 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -319,6 +319,15 @@ cant_get_ref:
EXPORT_SYMBOL(slow_work_enqueue);
/*
+ * Schedule a cull of the thread pool at some time in the near future
+ */
+static void slow_work_schedule_cull(void)
+{
+ mod_timer(&slow_work_cull_timer,
+ round_jiffies(jiffies + SLOW_WORK_CULL_TIMEOUT));
+}
+
+/*
* Worker thread culling algorithm
*/
static bool slow_work_cull_thread(void)
@@ -335,8 +344,7 @@ static bool slow_work_cull_thread(void)
list_empty(&vslow_work_queue) &&
atomic_read(&slow_work_thread_count) >
slow_work_min_threads) {
- mod_timer(&slow_work_cull_timer,
- jiffies + SLOW_WORK_CULL_TIMEOUT);
+ slow_work_schedule_cull();
do_cull = true;
}
}
@@ -372,8 +380,8 @@ static int slow_work_thread(void *_data)
vsmax *= atomic_read(&slow_work_thread_count);
vsmax /= 100;
- prepare_to_wait(&slow_work_thread_wq, &wait,
- TASK_INTERRUPTIBLE);
+ prepare_to_wait_exclusive(&slow_work_thread_wq, &wait,
+ TASK_INTERRUPTIBLE);
if (!freezing(current) &&
!slow_work_threads_should_exit &&
!slow_work_available(vsmax) &&
@@ -393,8 +401,7 @@ static int slow_work_thread(void *_data)
list_empty(&vslow_work_queue) &&
atomic_read(&slow_work_thread_count) >
slow_work_min_threads)
- mod_timer(&slow_work_cull_timer,
- jiffies + SLOW_WORK_CULL_TIMEOUT);
+ slow_work_schedule_cull();
continue;
}
@@ -458,7 +465,7 @@ static void slow_work_new_thread_execute(struct slow_work *work)
if (atomic_dec_and_test(&slow_work_thread_count))
BUG(); /* we're running on a slow work thread... */
mod_timer(&slow_work_oom_timer,
- jiffies + SLOW_WORK_OOM_TIMEOUT);
+ round_jiffies(jiffies + SLOW_WORK_OOM_TIMEOUT));
} else {
/* ratelimit the starting of new threads */
mod_timer(&slow_work_oom_timer, jiffies + 1);
@@ -502,8 +509,7 @@ static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
if (n < 0 && !slow_work_may_not_start_new_thread)
slow_work_enqueue(&slow_work_new_thread);
else if (n > 0)
- mod_timer(&slow_work_cull_timer,
- jiffies + SLOW_WORK_CULL_TIMEOUT);
+ slow_work_schedule_cull();
}
mutex_unlock(&slow_work_user_lock);
}
@@ -529,8 +535,7 @@ static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
atomic_read(&slow_work_thread_count);
if (n < 0)
- mod_timer(&slow_work_cull_timer,
- jiffies + SLOW_WORK_CULL_TIMEOUT);
+ slow_work_schedule_cull();
}
mutex_unlock(&slow_work_user_lock);
}
diff --git a/kernel/smp.c b/kernel/smp.c
index ad63d850120..94188b8ecc3 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -57,7 +57,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
return NOTIFY_BAD;
break;
-#ifdef CONFIG_CPU_HOTPLUG
+#ifdef CONFIG_HOTPLUG_CPU
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
diff --git a/kernel/softirq.c b/kernel/softirq.c
index b525dd34851..eb5e131a048 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -24,7 +24,9 @@
#include <linux/ftrace.h>
#include <linux/smp.h>
#include <linux/tick.h>
-#include <trace/irq.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/irq.h>
#include <asm/irq.h>
/*
@@ -186,9 +188,6 @@ EXPORT_SYMBOL(local_bh_enable_ip);
*/
#define MAX_SOFTIRQ_RESTART 10
-DEFINE_TRACE(softirq_entry);
-DEFINE_TRACE(softirq_exit);
-
asmlinkage void __do_softirq(void)
{
struct softirq_action *h;
@@ -214,6 +213,7 @@ restart:
do {
if (pending & 1) {
int prev_count = preempt_count();
+ kstat_incr_softirqs_this_cpu(h - softirq_vec);
trace_softirq_entry(h, softirq_vec);
h->action(h);
@@ -345,7 +345,9 @@ void open_softirq(int nr, void (*action)(struct softirq_action *))
softirq_vec[nr].action = action;
}
-/* Tasklets */
+/*
+ * Tasklets
+ */
struct tasklet_head
{
struct tasklet_struct *head;
@@ -383,6 +385,17 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
EXPORT_SYMBOL(__tasklet_hi_schedule);
+void __tasklet_hi_schedule_first(struct tasklet_struct *t)
+{
+ BUG_ON(!irqs_disabled());
+
+ t->next = __get_cpu_var(tasklet_hi_vec).head;
+ __get_cpu_var(tasklet_hi_vec).head = t;
+ __raise_softirq_irqoff(HI_SOFTIRQ);
+}
+
+EXPORT_SYMBOL(__tasklet_hi_schedule_first);
+
static void tasklet_action(struct softirq_action *a)
{
struct tasklet_struct *list;
@@ -482,6 +495,66 @@ void tasklet_kill(struct tasklet_struct *t)
EXPORT_SYMBOL(tasklet_kill);
+/*
+ * tasklet_hrtimer
+ */
+
+/*
+ * The trampoline is called when the hrtimer expires. If this is
+ * called from the hrtimer interrupt then we schedule the tasklet as
+ * the timer callback function expects to run in softirq context. If
+ * it's called in softirq context anyway (i.e. high resolution timers
+ * disabled) then the hrtimer callback is called right away.
+ */
+static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer)
+{
+ struct tasklet_hrtimer *ttimer =
+ container_of(timer, struct tasklet_hrtimer, timer);
+
+ if (hrtimer_is_hres_active(timer)) {
+ tasklet_hi_schedule(&ttimer->tasklet);
+ return HRTIMER_NORESTART;
+ }
+ return ttimer->function(timer);
+}
+
+/*
+ * Helper function which calls the hrtimer callback from
+ * tasklet/softirq context
+ */
+static void __tasklet_hrtimer_trampoline(unsigned long data)
+{
+ struct tasklet_hrtimer *ttimer = (void *)data;
+ enum hrtimer_restart restart;
+
+ restart = ttimer->function(&ttimer->timer);
+ if (restart != HRTIMER_NORESTART)
+ hrtimer_restart(&ttimer->timer);
+}
+
+/**
+ * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks
+ * @ttimer: tasklet_hrtimer which is initialized
+ * @function: hrtimer callback funtion which gets called from softirq context
+ * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME)
+ * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL)
+ */
+void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer,
+ enum hrtimer_restart (*function)(struct hrtimer *),
+ clockid_t which_clock, enum hrtimer_mode mode)
+{
+ hrtimer_init(&ttimer->timer, which_clock, mode);
+ ttimer->timer.function = __hrtimer_tasklet_trampoline;
+ tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline,
+ (unsigned long)ttimer);
+ ttimer->function = function;
+}
+EXPORT_SYMBOL_GPL(tasklet_hrtimer_init);
+
+/*
+ * Remote softirq bits
+ */
+
DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list);
EXPORT_PER_CPU_SYMBOL(softirq_work_list);
@@ -828,7 +901,7 @@ int __init __weak arch_early_irq_init(void)
return 0;
}
-int __weak arch_init_chip_data(struct irq_desc *desc, int cpu)
+int __weak arch_init_chip_data(struct irq_desc *desc, int node)
{
return 0;
}
diff --git a/kernel/sys.c b/kernel/sys.c
index e7998cf3149..b3f1097c76f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -14,6 +14,7 @@
#include <linux/prctl.h>
#include <linux/highuid.h>
#include <linux/fs.h>
+#include <linux/perf_counter.h>
#include <linux/resource.h>
#include <linux/kernel.h>
#include <linux/kexec.h>
@@ -1112,289 +1113,6 @@ out:
return err;
}
-/*
- * Supplementary group IDs
- */
-
-/* init to 2 - one for init_task, one to ensure it is never freed */
-struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
-
-struct group_info *groups_alloc(int gidsetsize)
-{
- struct group_info *group_info;
- int nblocks;
- int i;
-
- nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK;
- /* Make sure we always allocate at least one indirect block pointer */
- nblocks = nblocks ? : 1;
- group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER);
- if (!group_info)
- return NULL;
- group_info->ngroups = gidsetsize;
- group_info->nblocks = nblocks;
- atomic_set(&group_info->usage, 1);
-
- if (gidsetsize <= NGROUPS_SMALL)
- group_info->blocks[0] = group_info->small_block;
- else {
- for (i = 0; i < nblocks; i++) {
- gid_t *b;
- b = (void *)__get_free_page(GFP_USER);
- if (!b)
- goto out_undo_partial_alloc;
- group_info->blocks[i] = b;
- }
- }
- return group_info;
-
-out_undo_partial_alloc:
- while (--i >= 0) {
- free_page((unsigned long)group_info->blocks[i]);
- }
- kfree(group_info);
- return NULL;
-}
-
-EXPORT_SYMBOL(groups_alloc);
-
-void groups_free(struct group_info *group_info)
-{
- if (group_info->blocks[0] != group_info->small_block) {
- int i;
- for (i = 0; i < group_info->nblocks; i++)
- free_page((unsigned long)group_info->blocks[i]);
- }
- kfree(group_info);
-}
-
-EXPORT_SYMBOL(groups_free);
-
-/* export the group_info to a user-space array */
-static int groups_to_user(gid_t __user *grouplist,
- const struct group_info *group_info)
-{
- int i;
- unsigned int count = group_info->ngroups;
-
- for (i = 0; i < group_info->nblocks; i++) {
- unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
- unsigned int len = cp_count * sizeof(*grouplist);
-
- if (copy_to_user(grouplist, group_info->blocks[i], len))
- return -EFAULT;
-
- grouplist += NGROUPS_PER_BLOCK;
- count -= cp_count;
- }
- return 0;
-}
-
-/* fill a group_info from a user-space array - it must be allocated already */
-static int groups_from_user(struct group_info *group_info,
- gid_t __user *grouplist)
-{
- int i;
- unsigned int count = group_info->ngroups;
-
- for (i = 0; i < group_info->nblocks; i++) {
- unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
- unsigned int len = cp_count * sizeof(*grouplist);
-
- if (copy_from_user(group_info->blocks[i], grouplist, len))
- return -EFAULT;
-
- grouplist += NGROUPS_PER_BLOCK;
- count -= cp_count;
- }
- return 0;
-}
-
-/* a simple Shell sort */
-static void groups_sort(struct group_info *group_info)
-{
- int base, max, stride;
- int gidsetsize = group_info->ngroups;
-
- for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1)
- ; /* nothing */
- stride /= 3;
-
- while (stride) {
- max = gidsetsize - stride;
- for (base = 0; base < max; base++) {
- int left = base;
- int right = left + stride;
- gid_t tmp = GROUP_AT(group_info, right);
-
- while (left >= 0 && GROUP_AT(group_info, left) > tmp) {
- GROUP_AT(group_info, right) =
- GROUP_AT(group_info, left);
- right = left;
- left -= stride;
- }
- GROUP_AT(group_info, right) = tmp;
- }
- stride /= 3;
- }
-}
-
-/* a simple bsearch */
-int groups_search(const struct group_info *group_info, gid_t grp)
-{
- unsigned int left, right;
-
- if (!group_info)
- return 0;
-
- left = 0;
- right = group_info->ngroups;
- while (left < right) {
- unsigned int mid = (left+right)/2;
- int cmp = grp - GROUP_AT(group_info, mid);
- if (cmp > 0)
- left = mid + 1;
- else if (cmp < 0)
- right = mid;
- else
- return 1;
- }
- return 0;
-}
-
-/**
- * set_groups - Change a group subscription in a set of credentials
- * @new: The newly prepared set of credentials to alter
- * @group_info: The group list to install
- *
- * Validate a group subscription and, if valid, insert it into a set
- * of credentials.
- */
-int set_groups(struct cred *new, struct group_info *group_info)
-{
- int retval;
-
- retval = security_task_setgroups(group_info);
- if (retval)
- return retval;
-
- put_group_info(new->group_info);
- groups_sort(group_info);
- get_group_info(group_info);
- new->group_info = group_info;
- return 0;
-}
-
-EXPORT_SYMBOL(set_groups);
-
-/**
- * set_current_groups - Change current's group subscription
- * @group_info: The group list to impose
- *
- * Validate a group subscription and, if valid, impose it upon current's task
- * security record.
- */
-int set_current_groups(struct group_info *group_info)
-{
- struct cred *new;
- int ret;
-
- new = prepare_creds();
- if (!new)
- return -ENOMEM;
-
- ret = set_groups(new, group_info);
- if (ret < 0) {
- abort_creds(new);
- return ret;
- }
-
- return commit_creds(new);
-}
-
-EXPORT_SYMBOL(set_current_groups);
-
-SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist)
-{
- const struct cred *cred = current_cred();
- int i;
-
- if (gidsetsize < 0)
- return -EINVAL;
-
- /* no need to grab task_lock here; it cannot change */
- i = cred->group_info->ngroups;
- if (gidsetsize) {
- if (i > gidsetsize) {
- i = -EINVAL;
- goto out;
- }
- if (groups_to_user(grouplist, cred->group_info)) {
- i = -EFAULT;
- goto out;
- }
- }
-out:
- return i;
-}
-
-/*
- * SMP: Our groups are copy-on-write. We can set them safely
- * without another task interfering.
- */
-
-SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
-{
- struct group_info *group_info;
- int retval;
-
- if (!capable(CAP_SETGID))
- return -EPERM;
- if ((unsigned)gidsetsize > NGROUPS_MAX)
- return -EINVAL;
-
- group_info = groups_alloc(gidsetsize);
- if (!group_info)
- return -ENOMEM;
- retval = groups_from_user(group_info, grouplist);
- if (retval) {
- put_group_info(group_info);
- return retval;
- }
-
- retval = set_current_groups(group_info);
- put_group_info(group_info);
-
- return retval;
-}
-
-/*
- * Check whether we're fsgid/egid or in the supplemental group..
- */
-int in_group_p(gid_t grp)
-{
- const struct cred *cred = current_cred();
- int retval = 1;
-
- if (grp != cred->fsgid)
- retval = groups_search(cred->group_info, grp);
- return retval;
-}
-
-EXPORT_SYMBOL(in_group_p);
-
-int in_egroup_p(gid_t grp)
-{
- const struct cred *cred = current_cred();
- int retval = 1;
-
- if (grp != cred->egid)
- retval = groups_search(cred->group_info, grp);
- return retval;
-}
-
-EXPORT_SYMBOL(in_egroup_p);
-
DECLARE_RWSEM(uts_sem);
SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
@@ -1793,6 +1511,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_SET_TSC:
error = SET_TSC_CTL(arg2);
break;
+ case PR_TASK_PERF_COUNTERS_DISABLE:
+ error = perf_counter_task_disable();
+ break;
+ case PR_TASK_PERF_COUNTERS_ENABLE:
+ error = perf_counter_task_enable();
+ break;
case PR_GET_TIMERSLACK:
error = current->timer_slack_ns;
break;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 27dad296738..68320f6b07b 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -175,3 +175,6 @@ cond_syscall(compat_sys_timerfd_settime);
cond_syscall(compat_sys_timerfd_gettime);
cond_syscall(sys_eventfd);
cond_syscall(sys_eventfd2);
+
+/* performance counters: */
+cond_syscall(sys_perf_counter_open);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b2970d56fb7..58be76017fd 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -27,6 +27,7 @@
#include <linux/security.h>
#include <linux/ctype.h>
#include <linux/utsname.h>
+#include <linux/kmemcheck.h>
#include <linux/smp_lock.h>
#include <linux/fs.h>
#include <linux/init.h>
@@ -48,7 +49,9 @@
#include <linux/acpi.h>
#include <linux/reboot.h>
#include <linux/ftrace.h>
+#include <linux/security.h>
#include <linux/slow-work.h>
+#include <linux/perf_counter.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
@@ -114,6 +117,7 @@ static int ngroups_max = NGROUPS_MAX;
#ifdef CONFIG_MODULES
extern char modprobe_path[];
+extern int modules_disabled;
#endif
#ifdef CONFIG_CHR_DEV_SG
extern int sg_big_buff;
@@ -326,6 +330,17 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "timer_migration",
+ .data = &sysctl_timer_migration,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
#endif
{
.ctl_name = CTL_UNNUMBERED,
@@ -534,6 +549,17 @@ static struct ctl_table kern_table[] = {
.proc_handler = &proc_dostring,
.strategy = &sysctl_string,
},
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "modules_disabled",
+ .data = &modules_disabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ /* only handle a transition from default "0" to "1" */
+ .proc_handler = &proc_dointvec_minmax,
+ .extra1 = &one,
+ .extra2 = &one,
+ },
#endif
#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
{
@@ -722,6 +748,14 @@ static struct ctl_table kern_table[] = {
.proc_handler = &proc_dointvec,
},
{
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "panic_on_io_nmi",
+ .data = &panic_on_io_nmi,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
.ctl_name = KERN_BOOTLOADER_TYPE,
.procname = "bootloader_type",
.data = &bootloader_type,
@@ -731,6 +765,14 @@ static struct ctl_table kern_table[] = {
},
{
.ctl_name = CTL_UNNUMBERED,
+ .procname = "bootloader_version",
+ .data = &bootloader_version,
+ .maxlen = sizeof (int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
.procname = "kstack_depth_to_print",
.data = &kstack_depth_to_print,
.maxlen = sizeof(int),
@@ -912,6 +954,43 @@ static struct ctl_table kern_table[] = {
.child = slow_work_sysctls,
},
#endif
+#ifdef CONFIG_PERF_COUNTERS
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "perf_counter_paranoid",
+ .data = &sysctl_perf_counter_paranoid,
+ .maxlen = sizeof(sysctl_perf_counter_paranoid),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "perf_counter_mlock_kb",
+ .data = &sysctl_perf_counter_mlock,
+ .maxlen = sizeof(sysctl_perf_counter_mlock),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "perf_counter_max_sample_rate",
+ .data = &sysctl_perf_counter_sample_rate,
+ .maxlen = sizeof(sysctl_perf_counter_sample_rate),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
+#ifdef CONFIG_KMEMCHECK
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "kmemcheck",
+ .data = &kmemcheck_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
+
/*
* NOTE: do not add new entries to this table unless you have read
* Documentation/sysctl/ctl_unnumbered.txt
@@ -1225,16 +1304,14 @@ static struct ctl_table vm_table[] = {
.strategy = &sysctl_jiffies,
},
#endif
-#ifdef CONFIG_SECURITY
{
.ctl_name = CTL_UNNUMBERED,
.procname = "mmap_min_addr",
- .data = &mmap_min_addr,
- .maxlen = sizeof(unsigned long),
+ .data = &dac_mmap_min_addr,
+ .maxlen = sizeof(unsigned long),
.mode = 0644,
- .proc_handler = &proc_doulongvec_minmax,
+ .proc_handler = &mmap_min_addr_handler,
},
-#endif
#ifdef CONFIG_NUMA
{
.ctl_name = CTL_UNNUMBERED,
@@ -1272,7 +1349,6 @@ static struct ctl_table vm_table[] = {
.extra2 = &one,
},
#endif
-#ifdef CONFIG_UNEVICTABLE_LRU
{
.ctl_name = CTL_UNNUMBERED,
.procname = "scan_unevictable_pages",
@@ -1281,7 +1357,6 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = &scan_unevictable_handler,
},
-#endif
/*
* NOTE: do not add new entries to this table unless you have read
* Documentation/sysctl/ctl_unnumbered.txt
@@ -2220,7 +2295,7 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
void *data)
{
#define TMPBUFLEN 21
- int *i, vleft, first=1, neg, val;
+ int *i, vleft, first = 1, neg;
unsigned long lval;
size_t left, len;
@@ -2273,8 +2348,6 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
len = p-buf;
if ((len < left) && *p && !isspace(*p))
break;
- if (neg)
- val = -val;
s += len;
left -= len;
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index d13be216a79..a6dcd67b041 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -18,6 +18,7 @@
#include <linux/notifier.h>
#include <linux/smp.h>
#include <linux/sysdev.h>
+#include <linux/tick.h>
/* The registered clock event devices */
static LIST_HEAD(clockevent_devices);
@@ -54,6 +55,7 @@ unsigned long clockevent_delta2ns(unsigned long latch,
return (unsigned long) clc;
}
+EXPORT_SYMBOL_GPL(clockevent_delta2ns);
/**
* clockevents_set_mode - set the operating mode of a clock event device
@@ -187,6 +189,7 @@ void clockevents_register_device(struct clock_event_device *dev)
spin_unlock(&clockevents_lock);
}
+EXPORT_SYMBOL_GPL(clockevents_register_device);
/*
* Noop handler when we shut down an event device
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index ecfd7b5187e..7466cb81125 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -402,9 +402,6 @@ int clocksource_register(struct clocksource *c)
unsigned long flags;
int ret;
- /* save mult_orig on registration */
- c->mult_orig = c->mult;
-
spin_lock_irqsave(&clocksource_lock, flags);
ret = clocksource_enqueue(c);
if (!ret)
@@ -512,6 +509,18 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
}
}
+ /*
+ * Check to make sure we don't switch to a non-highres capable
+ * clocksource if the tick code is in oneshot mode (highres or nohz)
+ */
+ if (tick_oneshot_mode_active() && ovr &&
+ !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) {
+ printk(KERN_WARNING "%s clocksource is not HRT compatible. "
+ "Cannot switch while in HRT/NOHZ mode\n", ovr->name);
+ ovr = NULL;
+ override_name[0] = 0;
+ }
+
/* Reselect, when the override name has changed */
if (ovr != clocksource_override) {
clocksource_override = ovr;
@@ -540,7 +549,13 @@ sysfs_show_available_clocksources(struct sys_device *dev,
spin_lock_irq(&clocksource_lock);
list_for_each_entry(src, &clocksource_list, list) {
- count += snprintf(buf + count,
+ /*
+ * Don't show non-HRES clocksource if the tick code is
+ * in one shot mode (highres=on or nohz=on)
+ */
+ if (!tick_oneshot_mode_active() ||
+ (src->flags & CLOCK_SOURCE_VALID_FOR_HRES))
+ count += snprintf(buf + count,
max((ssize_t)PAGE_SIZE - count, (ssize_t)0),
"%s ", src->name);
}
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 118a3b3b3f9..877dbedc311 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -27,7 +27,7 @@
* timer stops in C3 state.
*/
-struct tick_device tick_broadcast_device;
+static struct tick_device tick_broadcast_device;
/* FIXME: Use cpumask_var_t. */
static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS);
static DECLARE_BITMAP(tmpmask, NR_CPUS);
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 2e8de678e76..a96c0e2b89c 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -128,6 +128,23 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
return 0;
}
+/**
+ * tick_check_oneshot_mode - check whether the system is in oneshot mode
+ *
+ * returns 1 when either nohz or highres are enabled. otherwise 0.
+ */
+int tick_oneshot_mode_active(void)
+{
+ unsigned long flags;
+ int ret;
+
+ local_irq_save(flags);
+ ret = __get_cpu_var(tick_cpu_device).mode == TICKDEV_MODE_ONESHOT;
+ local_irq_restore(flags);
+
+ return ret;
+}
+
#ifdef CONFIG_HIGH_RES_TIMERS
/**
* tick_init_highres - switch to high resolution mode
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index d3f1ef4d5cb..e0f59a21c06 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -222,6 +222,15 @@ void tick_nohz_stop_sched_tick(int inidle)
cpu = smp_processor_id();
ts = &per_cpu(tick_cpu_sched, cpu);
+
+ /*
+ * Call to tick_nohz_start_idle stops the last_update_time from being
+ * updated. Thus, it must not be called in the event we are called from
+ * irq_exit() with the prior state different than idle.
+ */
+ if (!inidle && !ts->inidle)
+ goto end;
+
now = tick_nohz_start_idle(ts);
/*
@@ -239,9 +248,6 @@ void tick_nohz_stop_sched_tick(int inidle)
if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
goto end;
- if (!inidle && !ts->inidle)
- goto end;
-
ts->inidle = 1;
if (need_resched())
@@ -349,7 +355,7 @@ void tick_nohz_stop_sched_tick(int inidle)
if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
hrtimer_start(&ts->sched_timer, expires,
- HRTIMER_MODE_ABS);
+ HRTIMER_MODE_ABS_PINNED);
/* Check, if the timer was already in the past */
if (hrtimer_active(&ts->sched_timer))
goto out;
@@ -395,7 +401,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
hrtimer_start_expires(&ts->sched_timer,
- HRTIMER_MODE_ABS);
+ HRTIMER_MODE_ABS_PINNED);
/* Check, if the timer was already in the past */
if (hrtimer_active(&ts->sched_timer))
break;
@@ -698,7 +704,8 @@ void tick_setup_sched_timer(void)
for (;;) {
hrtimer_forward(&ts->sched_timer, now, tick_period);
- hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS);
+ hrtimer_start_expires(&ts->sched_timer,
+ HRTIMER_MODE_ABS_PINNED);
/* Check, if the timer was already in the past */
if (hrtimer_active(&ts->sched_timer))
break;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 687dff49f6e..e8c77d9c633 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -22,7 +22,7 @@
/*
* This read-write spinlock protects us from races in SMP while
- * playing with xtime and avenrun.
+ * playing with xtime.
*/
__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
@@ -77,6 +77,10 @@ static void clocksource_forward_now(void)
clock->cycle_last = cycle_now;
nsec = cyc2ns(clock, cycle_delta);
+
+ /* If arch requires, add in gettimeoffset() */
+ nsec += arch_gettimeoffset();
+
timespec_add_ns(&xtime, nsec);
nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
@@ -111,6 +115,9 @@ void getnstimeofday(struct timespec *ts)
/* convert to nanoseconds: */
nsecs = cyc2ns(clock, cycle_delta);
+ /* If arch requires, add in gettimeoffset() */
+ nsecs += arch_gettimeoffset();
+
} while (read_seqretry(&xtime_lock, seq));
timespec_add_ns(ts, nsecs);
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index c994530d166..4cde8b9c716 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -96,7 +96,7 @@ static DEFINE_MUTEX(show_mutex);
/*
* Collection status, active/inactive:
*/
-static int __read_mostly active;
+int __read_mostly timer_stats_active;
/*
* Beginning/end timestamps of measurement:
@@ -242,7 +242,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
struct entry *entry, input;
unsigned long flags;
- if (likely(!active))
+ if (likely(!timer_stats_active))
return;
lock = &per_cpu(lookup_lock, raw_smp_processor_id());
@@ -254,7 +254,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
input.timer_flag = timer_flag;
spin_lock_irqsave(lock, flags);
- if (!active)
+ if (!timer_stats_active)
goto out_unlock;
entry = tstat_lookup(&input, comm);
@@ -290,7 +290,7 @@ static int tstats_show(struct seq_file *m, void *v)
/*
* If still active then calculate up to now:
*/
- if (active)
+ if (timer_stats_active)
time_stop = ktime_get();
time = ktime_sub(time_stop, time_start);
@@ -368,18 +368,18 @@ static ssize_t tstats_write(struct file *file, const char __user *buf,
mutex_lock(&show_mutex);
switch (ctl[0]) {
case '0':
- if (active) {
- active = 0;
+ if (timer_stats_active) {
+ timer_stats_active = 0;
time_stop = ktime_get();
sync_access();
}
break;
case '1':
- if (!active) {
+ if (!timer_stats_active) {
reset_entries();
time_start = ktime_get();
smp_mb();
- active = 1;
+ timer_stats_active = 1;
}
break;
default:
diff --git a/kernel/timer.c b/kernel/timer.c
index cffffad01c3..a7f07d5a624 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -37,6 +37,8 @@
#include <linux/delay.h>
#include <linux/tick.h>
#include <linux/kallsyms.h>
+#include <linux/perf_counter.h>
+#include <linux/sched.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -378,6 +380,8 @@ static void timer_stats_account_timer(struct timer_list *timer)
{
unsigned int flag = 0;
+ if (likely(!timer->start_site))
+ return;
if (unlikely(tbase_get_deferrable(timer->base)))
flag |= TIMER_STATS_FLAG_DEFERRABLE;
@@ -604,13 +608,12 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer,
}
static inline int
-__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
+__mod_timer(struct timer_list *timer, unsigned long expires,
+ bool pending_only, int pinned)
{
struct tvec_base *base, *new_base;
unsigned long flags;
- int ret;
-
- ret = 0;
+ int ret = 0 , cpu;
timer_stats_timer_set_start_info(timer);
BUG_ON(!timer->function);
@@ -629,6 +632,18 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
new_base = __get_cpu_var(tvec_bases);
+ cpu = smp_processor_id();
+
+#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
+ if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) {
+ int preferred_cpu = get_nohz_load_balancer();
+
+ if (preferred_cpu >= 0)
+ cpu = preferred_cpu;
+ }
+#endif
+ new_base = per_cpu(tvec_bases, cpu);
+
if (base != new_base) {
/*
* We are trying to schedule the timer on the local CPU.
@@ -668,7 +683,7 @@ out_unlock:
*/
int mod_timer_pending(struct timer_list *timer, unsigned long expires)
{
- return __mod_timer(timer, expires, true);
+ return __mod_timer(timer, expires, true, TIMER_NOT_PINNED);
}
EXPORT_SYMBOL(mod_timer_pending);
@@ -699,14 +714,36 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
* networking code - if the timer is re-modified
* to be the same thing then just return:
*/
- if (timer->expires == expires && timer_pending(timer))
+ if (timer_pending(timer) && timer->expires == expires)
return 1;
- return __mod_timer(timer, expires, false);
+ return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
}
EXPORT_SYMBOL(mod_timer);
/**
+ * mod_timer_pinned - modify a timer's timeout
+ * @timer: the timer to be modified
+ * @expires: new timeout in jiffies
+ *
+ * mod_timer_pinned() is a way to update the expire field of an
+ * active timer (if the timer is inactive it will be activated)
+ * and not allow the timer to be migrated to a different CPU.
+ *
+ * mod_timer_pinned(timer, expires) is equivalent to:
+ *
+ * del_timer(timer); timer->expires = expires; add_timer(timer);
+ */
+int mod_timer_pinned(struct timer_list *timer, unsigned long expires)
+{
+ if (timer->expires == expires && timer_pending(timer))
+ return 1;
+
+ return __mod_timer(timer, expires, false, TIMER_PINNED);
+}
+EXPORT_SYMBOL(mod_timer_pinned);
+
+/**
* add_timer - start a timer
* @timer: the timer to be added
*
@@ -756,6 +793,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
wake_up_idle_cpu(cpu);
spin_unlock_irqrestore(&base->lock, flags);
}
+EXPORT_SYMBOL_GPL(add_timer_on);
/**
* del_timer - deactive a timer.
@@ -1015,6 +1053,9 @@ cascade:
index = slot = timer_jiffies & TVN_MASK;
do {
list_for_each_entry(nte, varp->vec + slot, entry) {
+ if (tbase_get_deferrable(nte->base))
+ continue;
+
found = 1;
if (time_before(nte->expires, expires))
expires = nte->expires;
@@ -1123,53 +1164,14 @@ void update_process_times(int user_tick)
}
/*
- * Nr of active tasks - counted in fixed-point numbers
- */
-static unsigned long count_active_tasks(void)
-{
- return nr_active() * FIXED_1;
-}
-
-/*
- * Hmm.. Changed this, as the GNU make sources (load.c) seems to
- * imply that avenrun[] is the standard name for this kind of thing.
- * Nothing else seems to be standardized: the fractional size etc
- * all seem to differ on different machines.
- *
- * Requires xtime_lock to access.
- */
-unsigned long avenrun[3];
-
-EXPORT_SYMBOL(avenrun);
-
-/*
- * calc_load - given tick count, update the avenrun load estimates.
- * This is called while holding a write_lock on xtime_lock.
- */
-static inline void calc_load(unsigned long ticks)
-{
- unsigned long active_tasks; /* fixed-point */
- static int count = LOAD_FREQ;
-
- count -= ticks;
- if (unlikely(count < 0)) {
- active_tasks = count_active_tasks();
- do {
- CALC_LOAD(avenrun[0], EXP_1, active_tasks);
- CALC_LOAD(avenrun[1], EXP_5, active_tasks);
- CALC_LOAD(avenrun[2], EXP_15, active_tasks);
- count += LOAD_FREQ;
- } while (count < 0);
- }
-}
-
-/*
* This function runs timers and the timer-tq in bottom half context.
*/
static void run_timer_softirq(struct softirq_action *h)
{
struct tvec_base *base = __get_cpu_var(tvec_bases);
+ perf_counter_do_pending();
+
hrtimer_run_pending();
if (time_after_eq(jiffies, base->timer_jiffies))
@@ -1187,16 +1189,6 @@ void run_local_timers(void)
}
/*
- * Called by the timer interrupt. xtime_lock must already be taken
- * by the timer IRQ!
- */
-static inline void update_times(unsigned long ticks)
-{
- update_wall_time();
- calc_load(ticks);
-}
-
-/*
* The 64-bit jiffies value is not atomic - you MUST NOT read it
* without sampling the sequence number in xtime_lock.
* jiffies is defined in the linker script...
@@ -1205,7 +1197,8 @@ static inline void update_times(unsigned long ticks)
void do_timer(unsigned long ticks)
{
jiffies_64 += ticks;
- update_times(ticks);
+ update_wall_time();
+ calc_global_load();
}
#ifdef __ARCH_WANT_SYS_ALARM
@@ -1353,7 +1346,7 @@ signed long __sched schedule_timeout(signed long timeout)
expire = timeout + jiffies;
setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
- __mod_timer(&timer, expire, false);
+ __mod_timer(&timer, expire, false, TIMER_NOT_PINNED);
schedule();
del_singleshot_timer_sync(&timer);
@@ -1406,37 +1399,17 @@ int do_sysinfo(struct sysinfo *info)
{
unsigned long mem_total, sav_total;
unsigned int mem_unit, bitcount;
- unsigned long seq;
+ struct timespec tp;
memset(info, 0, sizeof(struct sysinfo));
- do {
- struct timespec tp;
- seq = read_seqbegin(&xtime_lock);
-
- /*
- * This is annoying. The below is the same thing
- * posix_get_clock_monotonic() does, but it wants to
- * take the lock which we want to cover the loads stuff
- * too.
- */
-
- getnstimeofday(&tp);
- tp.tv_sec += wall_to_monotonic.tv_sec;
- tp.tv_nsec += wall_to_monotonic.tv_nsec;
- monotonic_to_bootbased(&tp);
- if (tp.tv_nsec - NSEC_PER_SEC >= 0) {
- tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC;
- tp.tv_sec++;
- }
- info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
+ ktime_get_ts(&tp);
+ monotonic_to_bootbased(&tp);
+ info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
- info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
- info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
- info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
+ get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
- info->procs = nr_threads;
- } while (read_seqretry(&xtime_lock, seq));
+ info->procs = nr_threads;
si_meminfo(info);
si_swapinfo(info);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 417d1985e29..019f380fd76 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -18,6 +18,13 @@ config HAVE_FUNCTION_TRACER
config HAVE_FUNCTION_GRAPH_TRACER
bool
+config HAVE_FUNCTION_GRAPH_FP_TEST
+ bool
+ help
+ An arch may pass in a unique value (frame pointer) to both the
+ entering and exiting of a function. On exit, the value is compared
+ and if it does not match, then it will panic the kernel.
+
config HAVE_FUNCTION_TRACE_MCOUNT_TEST
bool
help
@@ -48,6 +55,21 @@ config FTRACE_NMI_ENTER
depends on HAVE_FTRACE_NMI_ENTER
default y
+config EVENT_TRACING
+ select CONTEXT_SWITCH_TRACER
+ bool
+
+config CONTEXT_SWITCH_TRACER
+ select MARKERS
+ bool
+
+# All tracer options should select GENERIC_TRACER. For those options that are
+# enabled by all tracers (context switch and event tracer) they select TRACING.
+# This allows those options to appear when no other tracer is selected. But the
+# options do not appear when something else selects it. We need the two options
+# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the
+# hidding of the automatic options options.
+
config TRACING
bool
select DEBUG_FS
@@ -56,6 +78,11 @@ config TRACING
select TRACEPOINTS
select NOP_TRACER
select BINARY_PRINTF
+ select EVENT_TRACING
+
+config GENERIC_TRACER
+ bool
+ select TRACING
#
# Minimum requirements an architecture has to meet for us to
@@ -73,14 +100,20 @@ config TRACING_SUPPORT
if TRACING_SUPPORT
-menu "Tracers"
+menuconfig FTRACE
+ bool "Tracers"
+ default y if DEBUG_KERNEL
+ help
+ Enable the kernel tracing infrastructure.
+
+if FTRACE
config FUNCTION_TRACER
bool "Kernel Function Tracer"
depends on HAVE_FUNCTION_TRACER
select FRAME_POINTER
select KALLSYMS
- select TRACING
+ select GENERIC_TRACER
select CONTEXT_SWITCH_TRACER
help
Enable the kernel to trace every kernel function. This is done
@@ -95,6 +128,7 @@ config FUNCTION_GRAPH_TRACER
bool "Kernel Function Graph Tracer"
depends on HAVE_FUNCTION_GRAPH_TRACER
depends on FUNCTION_TRACER
+ depends on !X86_32 || !CC_OPTIMIZE_FOR_SIZE
default y
help
Enable the kernel to trace a function at both its return
@@ -104,13 +138,14 @@ config FUNCTION_GRAPH_TRACER
the return value. This is done by setting the current return
address on the current task structure into a stack of calls.
+
config IRQSOFF_TRACER
bool "Interrupts-off Latency Tracer"
default n
depends on TRACE_IRQFLAGS_SUPPORT
depends on GENERIC_TIME
select TRACE_IRQFLAGS
- select TRACING
+ select GENERIC_TRACER
select TRACER_MAX_TRACE
help
This option measures the time spent in irqs-off critical
@@ -120,7 +155,7 @@ config IRQSOFF_TRACER
disabled by default and can be runtime (re-)started
via:
- echo 0 > /debugfs/tracing/tracing_max_latency
+ echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
(Note that kernel size and overhead increases with this option
enabled. This option and the preempt-off timing option can be
@@ -131,7 +166,7 @@ config PREEMPT_TRACER
default n
depends on GENERIC_TIME
depends on PREEMPT
- select TRACING
+ select GENERIC_TRACER
select TRACER_MAX_TRACE
help
This option measures the time spent in preemption off critical
@@ -141,7 +176,7 @@ config PREEMPT_TRACER
disabled by default and can be runtime (re-)started
via:
- echo 0 > /debugfs/tracing/tracing_max_latency
+ echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
(Note that kernel size and overhead increases with this option
enabled. This option and the irqs-off timing option can be
@@ -150,7 +185,7 @@ config PREEMPT_TRACER
config SYSPROF_TRACER
bool "Sysprof Tracer"
depends on X86
- select TRACING
+ select GENERIC_TRACER
select CONTEXT_SWITCH_TRACER
help
This tracer provides the trace needed by the 'Sysprof' userspace
@@ -158,83 +193,103 @@ config SYSPROF_TRACER
config SCHED_TRACER
bool "Scheduling Latency Tracer"
- select TRACING
+ select GENERIC_TRACER
select CONTEXT_SWITCH_TRACER
select TRACER_MAX_TRACE
help
This tracer tracks the latency of the highest priority task
to be scheduled in, starting from the point it has woken up.
-config CONTEXT_SWITCH_TRACER
- bool "Trace process context switches"
- select TRACING
- select MARKERS
- help
- This tracer gets called from the context switch and records
- all switching of tasks.
-
-config EVENT_TRACER
- bool "Trace various events in the kernel"
+config ENABLE_DEFAULT_TRACERS
+ bool "Trace process context switches and events"
+ depends on !GENERIC_TRACER
select TRACING
help
This tracer hooks to various trace points in the kernel
allowing the user to pick and choose which trace point they
- want to trace.
+ want to trace. It also includes the sched_switch tracer plugin.
config FTRACE_SYSCALLS
bool "Trace syscalls"
depends on HAVE_FTRACE_SYSCALLS
- select TRACING
+ select GENERIC_TRACER
select KALLSYMS
help
Basic tracer to catch the syscall entry and exit events.
config BOOT_TRACER
bool "Trace boot initcalls"
- select TRACING
+ select GENERIC_TRACER
select CONTEXT_SWITCH_TRACER
help
This tracer helps developers to optimize boot times: it records
the timings of the initcalls and traces key events and the identity
of tasks that can cause boot delays, such as context-switches.
- Its aim is to be parsed by the /scripts/bootgraph.pl tool to
+ Its aim is to be parsed by the scripts/bootgraph.pl tool to
produce pretty graphics about boot inefficiencies, giving a visual
representation of the delays during initcalls - but the raw
/debug/tracing/trace text output is readable too.
- You must pass in ftrace=initcall to the kernel command line
- to enable this on bootup.
+ You must pass in initcall_debug and ftrace=initcall to the kernel
+ command line to enable this on bootup.
config TRACE_BRANCH_PROFILING
+ bool
+ select GENERIC_TRACER
+
+choice
+ prompt "Branch Profiling"
+ default BRANCH_PROFILE_NONE
+ help
+ The branch profiling is a software profiler. It will add hooks
+ into the C conditionals to test which path a branch takes.
+
+ The likely/unlikely profiler only looks at the conditions that
+ are annotated with a likely or unlikely macro.
+
+ The "all branch" profiler will profile every if statement in the
+ kernel. This profiler will also enable the likely/unlikely
+ profiler as well.
+
+ Either of the above profilers add a bit of overhead to the system.
+ If unsure choose "No branch profiling".
+
+config BRANCH_PROFILE_NONE
+ bool "No branch profiling"
+ help
+ No branch profiling. Branch profiling adds a bit of overhead.
+ Only enable it if you want to analyse the branching behavior.
+ Otherwise keep it disabled.
+
+config PROFILE_ANNOTATED_BRANCHES
bool "Trace likely/unlikely profiler"
- select TRACING
+ select TRACE_BRANCH_PROFILING
help
This tracer profiles all the the likely and unlikely macros
in the kernel. It will display the results in:
- /debugfs/tracing/profile_annotated_branch
+ /sys/kernel/debug/tracing/profile_annotated_branch
Note: this will add a significant overhead, only turn this
on if you need to profile the system's use of these macros.
- Say N if unsure.
-
config PROFILE_ALL_BRANCHES
bool "Profile all if conditionals"
- depends on TRACE_BRANCH_PROFILING
+ select TRACE_BRANCH_PROFILING
help
This tracer profiles all branch conditions. Every if ()
taken in the kernel is recorded whether it hit or miss.
The results will be displayed in:
- /debugfs/tracing/profile_branch
+ /sys/kernel/debug/tracing/profile_branch
+
+ This option also enables the likely/unlikely profiler.
This configuration, when enabled, will impose a great overhead
on the system. This should only be enabled when the system
is to be analyzed
-
- Say N if unsure.
+endchoice
config TRACING_BRANCHES
bool
@@ -261,7 +316,7 @@ config BRANCH_TRACER
config POWER_TRACER
bool "Trace power consumption behavior"
depends on X86
- select TRACING
+ select GENERIC_TRACER
help
This tracer helps developers to analyze and optimize the kernels
power management decisions, specifically the C-state and P-state
@@ -276,7 +331,7 @@ config STACK_TRACER
select KALLSYMS
help
This special tracer records the maximum stack footprint of the
- kernel and displays it in debugfs/tracing/stack_trace.
+ kernel and displays it in /sys/kernel/debug/tracing/stack_trace.
This tracer works by hooking into every function call that the
kernel executes, and keeping a maximum stack depth value and
@@ -295,14 +350,14 @@ config STACK_TRACER
config HW_BRANCH_TRACER
depends on HAVE_HW_BRANCH_TRACER
bool "Trace hw branches"
- select TRACING
+ select GENERIC_TRACER
help
This tracer records all branches on the system in a circular
buffer giving access to the last N branches for each cpu.
config KMEMTRACE
bool "Trace SLAB allocations"
- select TRACING
+ select GENERIC_TRACER
help
kmemtrace provides tracing for slab allocator functions, such as
kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected
@@ -322,7 +377,7 @@ config KMEMTRACE
config WORKQUEUE_TRACER
bool "Trace workqueues"
- select TRACING
+ select GENERIC_TRACER
help
The workqueue tracer provides some statistical informations
about each cpu workqueue thread such as the number of the
@@ -338,7 +393,7 @@ config BLK_DEV_IO_TRACE
select RELAY
select DEBUG_FS
select TRACEPOINTS
- select TRACING
+ select GENERIC_TRACER
select STACKTRACE
help
Say Y here if you want to be able to trace the block layer actions
@@ -375,6 +430,20 @@ config DYNAMIC_FTRACE
were made. If so, it runs stop_machine (stops all CPUS)
and modifies the code to jump over the call to ftrace.
+config FUNCTION_PROFILER
+ bool "Kernel function profiler"
+ depends on FUNCTION_TRACER
+ default n
+ help
+ This option enables the kernel function profiler. A file is created
+ in debugfs called function_profile_enabled which defaults to zero.
+ When a 1 is echoed into this file profiling begins, and when a
+ zero is entered, profiling stops. A file in the trace_stats
+ directory called functions, that show the list of functions that
+ have been hit and their counters.
+
+ If in doubt, say N
+
config FTRACE_MCOUNT_RECORD
def_bool y
depends on DYNAMIC_FTRACE
@@ -385,7 +454,7 @@ config FTRACE_SELFTEST
config FTRACE_STARTUP_TEST
bool "Perform a startup test on ftrace"
- depends on TRACING
+ depends on GENERIC_TRACER
select FTRACE_SELFTEST
help
This option performs a series of startup tests on ftrace. On bootup
@@ -396,7 +465,7 @@ config FTRACE_STARTUP_TEST
config MMIOTRACE
bool "Memory mapped IO tracing"
depends on HAVE_MMIOTRACE_SUPPORT && PCI
- select TRACING
+ select GENERIC_TRACER
help
Mmiotrace traces Memory Mapped I/O access and is meant for
debugging and reverse engineering. It is called from the ioremap
@@ -416,7 +485,23 @@ config MMIOTRACE_TEST
Say N, unless you absolutely know what you are doing.
-endmenu
+config RING_BUFFER_BENCHMARK
+ tristate "Ring buffer benchmark stress tester"
+ depends on RING_BUFFER
+ help
+ This option creates a test to stress the ring buffer and bench mark it.
+ It creates its own ring buffer such that it will not interfer with
+ any other users of the ring buffer (such as ftrace). It then creates
+ a producer and consumer that will run for 10 seconds and sleep for
+ 10 seconds. Each interval it will print out the number of events
+ it recorded and give a rough estimate of how long each iteration took.
+
+ It does not disable interrupts or raise its priority, so it may be
+ affected by processes that are running.
+
+ If unsure, say N
+
+endif # FTRACE
endif # TRACING_SUPPORT
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 2630f5121ec..844164dca90 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -15,11 +15,17 @@ ifdef CONFIG_TRACING_BRANCHES
KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
endif
+#
+# Make the trace clocks available generally: it's infrastructure
+# relied on by ptrace for example:
+#
+obj-y += trace_clock.o
+
obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
+obj-$(CONFIG_RING_BUFFER_BENCHMARK) += ring_buffer_benchmark.o
obj-$(CONFIG_TRACING) += trace.o
-obj-$(CONFIG_TRACING) += trace_clock.o
obj-$(CONFIG_TRACING) += trace_output.o
obj-$(CONFIG_TRACING) += trace_stat.o
obj-$(CONFIG_TRACING) += trace_printk.o
@@ -39,12 +45,14 @@ obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
obj-$(CONFIG_POWER_TRACER) += trace_power.o
obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
-obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
-obj-$(CONFIG_EVENT_TRACER) += trace_events.o
-obj-$(CONFIG_EVENT_TRACER) += events.o
-obj-$(CONFIG_EVENT_TRACER) += trace_export.o
+obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
+ifeq ($(CONFIG_BLOCK),y)
+obj-$(CONFIG_EVENT_TRACING) += blktrace.o
+endif
+obj-$(CONFIG_EVENT_TRACING) += trace_events.o
+obj-$(CONFIG_EVENT_TRACING) += trace_export.o
obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
-obj-$(CONFIG_EVENT_TRACER) += trace_events_filter.o
+obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
libftrace-y := ftrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 921ef5d1f0b..7a34cb563fe 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -22,11 +22,16 @@
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/debugfs.h>
+#include <linux/smp_lock.h>
#include <linux/time.h>
-#include <trace/block.h>
#include <linux/uaccess.h>
+
+#include <trace/events/block.h>
+
#include "trace_output.h"
+#ifdef CONFIG_BLK_DEV_IO_TRACE
+
static unsigned int blktrace_seq __read_mostly = 1;
static struct trace_array *blk_tr;
@@ -147,7 +152,7 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
{
if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
return 1;
- if (sector < bt->start_lba || sector > bt->end_lba)
+ if (sector && (sector < bt->start_lba || sector > bt->end_lba))
return 1;
if (bt->pid && pid != bt->pid)
return 1;
@@ -192,7 +197,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
what |= MASK_TC_BIT(rw, DISCARD);
pid = tsk->pid;
- if (unlikely(act_log_check(bt, what, sector, pid)))
+ if (act_log_check(bt, what, sector, pid))
return;
cpu = raw_smp_processor_id();
@@ -263,6 +268,7 @@ static void blk_trace_free(struct blk_trace *bt)
debugfs_remove(bt->msg_file);
debugfs_remove(bt->dropped_file);
relay_close(bt->rchan);
+ debugfs_remove(bt->dir);
free_percpu(bt->sequence);
free_percpu(bt->msg_data);
kfree(bt);
@@ -372,18 +378,8 @@ static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
static int blk_remove_buf_file_callback(struct dentry *dentry)
{
- struct dentry *parent = dentry->d_parent;
debugfs_remove(dentry);
- /*
- * this will fail for all but the last file, but that is ok. what we
- * care about is the top level buts->name directory going away, when
- * the last trace file is gone. Then we don't have to rmdir() that
- * manually on trace stop, so it nicely solves the issue with
- * force killing of running traces.
- */
-
- debugfs_remove(parent);
return 0;
}
@@ -403,11 +399,29 @@ static struct rchan_callbacks blk_relay_callbacks = {
.remove_buf_file = blk_remove_buf_file_callback,
};
+static void blk_trace_setup_lba(struct blk_trace *bt,
+ struct block_device *bdev)
+{
+ struct hd_struct *part = NULL;
+
+ if (bdev)
+ part = bdev->bd_part;
+
+ if (part) {
+ bt->start_lba = part->start_sect;
+ bt->end_lba = part->start_sect + part->nr_sects;
+ } else {
+ bt->start_lba = 0;
+ bt->end_lba = -1ULL;
+ }
+}
+
/*
* Setup everything required to start tracing
*/
int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
- struct blk_user_trace_setup *buts)
+ struct block_device *bdev,
+ struct blk_user_trace_setup *buts)
{
struct blk_trace *old_bt, *bt = NULL;
struct dentry *dir = NULL;
@@ -480,10 +494,13 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
if (!bt->act_mask)
bt->act_mask = (u16) -1;
- bt->start_lba = buts->start_lba;
- bt->end_lba = buts->end_lba;
- if (!bt->end_lba)
- bt->end_lba = -1ULL;
+ blk_trace_setup_lba(bt, bdev);
+
+ /* overwrite with user settings */
+ if (buts->start_lba)
+ bt->start_lba = buts->start_lba;
+ if (buts->end_lba)
+ bt->end_lba = buts->end_lba;
bt->pid = buts->pid;
bt->trace_state = Blktrace_setup;
@@ -505,6 +522,7 @@ err:
}
int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
+ struct block_device *bdev,
char __user *arg)
{
struct blk_user_trace_setup buts;
@@ -514,7 +532,7 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
if (ret)
return -EFAULT;
- ret = do_blk_trace_setup(q, name, dev, &buts);
+ ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
if (ret)
return ret;
@@ -582,7 +600,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
switch (cmd) {
case BLKTRACESETUP:
bdevname(bdev, b);
- ret = blk_trace_setup(q, b, bdev->bd_dev, arg);
+ ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
break;
case BLKTRACESTART:
start = 1;
@@ -642,12 +660,12 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
if (blk_pc_request(rq)) {
what |= BLK_TC_ACT(BLK_TC_PC);
- __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors,
- rq->cmd_len, rq->cmd);
+ __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw,
+ what, rq->errors, rq->cmd_len, rq->cmd);
} else {
what |= BLK_TC_ACT(BLK_TC_FS);
- __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
- rw, what, rq->errors, 0, NULL);
+ __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rw,
+ what, rq->errors, 0, NULL);
}
}
@@ -809,7 +827,6 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
* @bio: the source bio
* @dev: target device
* @from: source sector
- * @to: target sector
*
* Description:
* Device mapper or raid target sometimes need to split a bio because
@@ -817,7 +834,7 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
*
**/
static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
- dev_t dev, sector_t from, sector_t to)
+ dev_t dev, sector_t from)
{
struct blk_trace *bt = q->blk_trace;
struct blk_io_trace_remap r;
@@ -825,12 +842,13 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
if (likely(!bt))
return;
- r.device = cpu_to_be32(dev);
- r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev);
- r.sector = cpu_to_be64(to);
+ r.device_from = cpu_to_be32(dev);
+ r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev);
+ r.sector_from = cpu_to_be64(from);
- __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP,
- !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
+ __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw,
+ BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE),
+ sizeof(r), &r);
}
/**
@@ -854,11 +872,11 @@ void blk_add_driver_data(struct request_queue *q,
return;
if (blk_pc_request(rq))
- __blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA,
- rq->errors, len, data);
+ __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0,
+ BLK_TA_DRV_DATA, rq->errors, len, data);
else
- __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
- 0, BLK_TA_DRV_DATA, rq->errors, len, data);
+ __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0,
+ BLK_TA_DRV_DATA, rq->errors, len, data);
}
EXPORT_SYMBOL_GPL(blk_add_driver_data);
@@ -971,6 +989,16 @@ static inline const void *pdu_start(const struct trace_entry *ent)
return te_blk_io_trace(ent) + 1;
}
+static inline u32 t_action(const struct trace_entry *ent)
+{
+ return te_blk_io_trace(ent)->action;
+}
+
+static inline u32 t_bytes(const struct trace_entry *ent)
+{
+ return te_blk_io_trace(ent)->bytes;
+}
+
static inline u32 t_sec(const struct trace_entry *ent)
{
return te_blk_io_trace(ent)->bytes >> 9;
@@ -996,11 +1024,11 @@ static void get_pdu_remap(const struct trace_entry *ent,
struct blk_io_trace_remap *r)
{
const struct blk_io_trace_remap *__r = pdu_start(ent);
- __u64 sector = __r->sector;
+ __u64 sector_from = __r->sector_from;
- r->device = be32_to_cpu(__r->device);
r->device_from = be32_to_cpu(__r->device_from);
- r->sector = be64_to_cpu(sector);
+ r->device_to = be32_to_cpu(__r->device_to);
+ r->sector_from = be64_to_cpu(sector_from);
}
typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act);
@@ -1031,36 +1059,98 @@ static int blk_log_action(struct trace_iterator *iter, const char *act)
MAJOR(t->device), MINOR(t->device), act, rwbs);
}
+static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
+{
+ const unsigned char *pdu_buf;
+ int pdu_len;
+ int i, end, ret;
+
+ pdu_buf = pdu_start(ent);
+ pdu_len = te_blk_io_trace(ent)->pdu_len;
+
+ if (!pdu_len)
+ return 1;
+
+ /* find the last zero that needs to be printed */
+ for (end = pdu_len - 1; end >= 0; end--)
+ if (pdu_buf[end])
+ break;
+ end++;
+
+ if (!trace_seq_putc(s, '('))
+ return 0;
+
+ for (i = 0; i < pdu_len; i++) {
+
+ ret = trace_seq_printf(s, "%s%02x",
+ i == 0 ? "" : " ", pdu_buf[i]);
+ if (!ret)
+ return ret;
+
+ /*
+ * stop when the rest is just zeroes and indicate so
+ * with a ".." appended
+ */
+ if (i == end && end != pdu_len - 1)
+ return trace_seq_puts(s, " ..) ");
+ }
+
+ return trace_seq_puts(s, ") ");
+}
+
static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
{
char cmd[TASK_COMM_LEN];
trace_find_cmdline(ent->pid, cmd);
- if (t_sec(ent))
- return trace_seq_printf(s, "%llu + %u [%s]\n",
- t_sector(ent), t_sec(ent), cmd);
- return trace_seq_printf(s, "[%s]\n", cmd);
+ if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
+ int ret;
+
+ ret = trace_seq_printf(s, "%u ", t_bytes(ent));
+ if (!ret)
+ return 0;
+ ret = blk_log_dump_pdu(s, ent);
+ if (!ret)
+ return 0;
+ return trace_seq_printf(s, "[%s]\n", cmd);
+ } else {
+ if (t_sec(ent))
+ return trace_seq_printf(s, "%llu + %u [%s]\n",
+ t_sector(ent), t_sec(ent), cmd);
+ return trace_seq_printf(s, "[%s]\n", cmd);
+ }
}
static int blk_log_with_error(struct trace_seq *s,
const struct trace_entry *ent)
{
- if (t_sec(ent))
- return trace_seq_printf(s, "%llu + %u [%d]\n", t_sector(ent),
- t_sec(ent), t_error(ent));
- return trace_seq_printf(s, "%llu [%d]\n", t_sector(ent), t_error(ent));
+ if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
+ int ret;
+
+ ret = blk_log_dump_pdu(s, ent);
+ if (ret)
+ return trace_seq_printf(s, "[%d]\n", t_error(ent));
+ return 0;
+ } else {
+ if (t_sec(ent))
+ return trace_seq_printf(s, "%llu + %u [%d]\n",
+ t_sector(ent),
+ t_sec(ent), t_error(ent));
+ return trace_seq_printf(s, "%llu [%d]\n",
+ t_sector(ent), t_error(ent));
+ }
}
static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
{
- struct blk_io_trace_remap r = { .device = 0, };
+ struct blk_io_trace_remap r = { .device_from = 0, };
get_pdu_remap(ent, &r);
return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
- t_sector(ent),
- t_sec(ent), MAJOR(r.device), MINOR(r.device),
- (unsigned long long)r.sector);
+ t_sector(ent), t_sec(ent),
+ MAJOR(r.device_from), MINOR(r.device_from),
+ (unsigned long long)r.sector_from);
}
static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
@@ -1117,7 +1207,6 @@ static void blk_tracer_print_header(struct seq_file *m)
static void blk_tracer_start(struct trace_array *tr)
{
blk_tracer_enabled = true;
- trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
}
static int blk_tracer_init(struct trace_array *tr)
@@ -1130,7 +1219,6 @@ static int blk_tracer_init(struct trace_array *tr)
static void blk_tracer_stop(struct trace_array *tr)
{
blk_tracer_enabled = false;
- trace_flags |= TRACE_ITER_CONTEXT_INFO;
}
static void blk_tracer_reset(struct trace_array *tr)
@@ -1182,7 +1270,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
}
if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
- ret = trace_seq_printf(s, "Bad pc action %x\n", what);
+ ret = trace_seq_printf(s, "Unknown action %x\n", what);
else {
ret = log_action(iter, what2act[what].act[long_act]);
if (ret)
@@ -1195,9 +1283,6 @@ out:
static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
int flags)
{
- if (!trace_print_context(iter))
- return TRACE_TYPE_PARTIAL_LINE;
-
return print_one_line(iter, false);
}
@@ -1232,6 +1317,18 @@ static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
return print_one_line(iter, true);
}
+static int blk_tracer_set_flag(u32 old_flags, u32 bit, int set)
+{
+ /* don't output context-info for blk_classic output */
+ if (bit == TRACE_BLK_OPT_CLASSIC) {
+ if (set)
+ trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
+ else
+ trace_flags |= TRACE_ITER_CONTEXT_INFO;
+ }
+ return 0;
+}
+
static struct tracer blk_tracer __read_mostly = {
.name = "blk",
.init = blk_tracer_init,
@@ -1241,6 +1338,7 @@ static struct tracer blk_tracer __read_mostly = {
.print_header = blk_tracer_print_header,
.print_line = blk_tracer_print_line,
.flags = &blk_tracer_flags,
+ .set_flag = blk_tracer_set_flag,
};
static struct trace_event trace_blk_event = {
@@ -1285,7 +1383,8 @@ static int blk_trace_remove_queue(struct request_queue *q)
/*
* Setup everything required to start tracing
*/
-static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)
+static int blk_trace_setup_queue(struct request_queue *q,
+ struct block_device *bdev)
{
struct blk_trace *old_bt, *bt = NULL;
int ret = -ENOMEM;
@@ -1298,9 +1397,10 @@ static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)
if (!bt->msg_data)
goto free_bt;
- bt->dev = dev;
+ bt->dev = bdev->bd_dev;
bt->act_mask = (u16)-1;
- bt->end_lba = -1ULL;
+
+ blk_trace_setup_lba(bt, bdev);
old_bt = xchg(&q->blk_trace, bt);
if (old_bt != NULL) {
@@ -1517,7 +1617,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
if (attr == &dev_attr_enable) {
if (value)
- ret = blk_trace_setup_queue(q, bdev->bd_dev);
+ ret = blk_trace_setup_queue(q, bdev);
else
ret = blk_trace_remove_queue(q);
goto out_unlock_bdev;
@@ -1525,7 +1625,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
ret = 0;
if (q->blk_trace == NULL)
- ret = blk_trace_setup_queue(q, bdev->bd_dev);
+ ret = blk_trace_setup_queue(q, bdev);
if (ret == 0) {
if (attr == &dev_attr_act_mask)
@@ -1548,3 +1648,77 @@ out:
return ret ? ret : count;
}
+int blk_trace_init_sysfs(struct device *dev)
+{
+ return sysfs_create_group(&dev->kobj, &blk_trace_attr_group);
+}
+
+#endif /* CONFIG_BLK_DEV_IO_TRACE */
+
+#ifdef CONFIG_EVENT_TRACING
+
+void blk_dump_cmd(char *buf, struct request *rq)
+{
+ int i, end;
+ int len = rq->cmd_len;
+ unsigned char *cmd = rq->cmd;
+
+ if (!blk_pc_request(rq)) {
+ buf[0] = '\0';
+ return;
+ }
+
+ for (end = len - 1; end >= 0; end--)
+ if (cmd[end])
+ break;
+ end++;
+
+ for (i = 0; i < len; i++) {
+ buf += sprintf(buf, "%s%02x", i == 0 ? "" : " ", cmd[i]);
+ if (i == end && end != len - 1) {
+ sprintf(buf, " ..");
+ break;
+ }
+ }
+}
+
+void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
+{
+ int i = 0;
+
+ if (rw & WRITE)
+ rwbs[i++] = 'W';
+ else if (rw & 1 << BIO_RW_DISCARD)
+ rwbs[i++] = 'D';
+ else if (bytes)
+ rwbs[i++] = 'R';
+ else
+ rwbs[i++] = 'N';
+
+ if (rw & 1 << BIO_RW_AHEAD)
+ rwbs[i++] = 'A';
+ if (rw & 1 << BIO_RW_BARRIER)
+ rwbs[i++] = 'B';
+ if (rw & 1 << BIO_RW_SYNCIO)
+ rwbs[i++] = 'S';
+ if (rw & 1 << BIO_RW_META)
+ rwbs[i++] = 'M';
+
+ rwbs[i] = '\0';
+}
+
+void blk_fill_rwbs_rq(char *rwbs, struct request *rq)
+{
+ int rw = rq->cmd_flags & 0x03;
+ int bytes;
+
+ if (blk_discard_rq(rq))
+ rw |= (1 << BIO_RW_DISCARD);
+
+ bytes = blk_rq_bytes(rq);
+
+ blk_fill_rwbs(rwbs, rw, bytes);
+}
+
+#endif /* CONFIG_EVENT_TRACING */
+
diff --git a/kernel/trace/events.c b/kernel/trace/events.c
deleted file mode 100644
index 246f2aa6dc4..00000000000
--- a/kernel/trace/events.c
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * This is the place to register all trace points as events.
- */
-
-#include <linux/stringify.h>
-
-#include <trace/trace_events.h>
-
-#include "trace_output.h"
-
-#include "trace_events_stage_1.h"
-#include "trace_events_stage_2.h"
-#include "trace_events_stage_3.h"
-
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f1ed080406c..1e1d23c2630 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -29,11 +29,13 @@
#include <linux/list.h>
#include <linux/hash.h>
-#include <trace/sched.h>
+#include <trace/events/sched.h>
#include <asm/ftrace.h>
+#include <asm/setup.h>
-#include "trace.h"
+#include "trace_output.h"
+#include "trace_stat.h"
#define FTRACE_WARN_ON(cond) \
do { \
@@ -68,7 +70,7 @@ static DEFINE_MUTEX(ftrace_lock);
static struct ftrace_ops ftrace_list_end __read_mostly =
{
- .func = ftrace_stub,
+ .func = ftrace_stub,
};
static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end;
@@ -240,6 +242,581 @@ static void ftrace_update_pid_func(void)
#endif
}
+#ifdef CONFIG_FUNCTION_PROFILER
+struct ftrace_profile {
+ struct hlist_node node;
+ unsigned long ip;
+ unsigned long counter;
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ unsigned long long time;
+#endif
+};
+
+struct ftrace_profile_page {
+ struct ftrace_profile_page *next;
+ unsigned long index;
+ struct ftrace_profile records[];
+};
+
+struct ftrace_profile_stat {
+ atomic_t disabled;
+ struct hlist_head *hash;
+ struct ftrace_profile_page *pages;
+ struct ftrace_profile_page *start;
+ struct tracer_stat stat;
+};
+
+#define PROFILE_RECORDS_SIZE \
+ (PAGE_SIZE - offsetof(struct ftrace_profile_page, records))
+
+#define PROFILES_PER_PAGE \
+ (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile))
+
+static int ftrace_profile_bits __read_mostly;
+static int ftrace_profile_enabled __read_mostly;
+
+/* ftrace_profile_lock - synchronize the enable and disable of the profiler */
+static DEFINE_MUTEX(ftrace_profile_lock);
+
+static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats);
+
+#define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */
+
+static void *
+function_stat_next(void *v, int idx)
+{
+ struct ftrace_profile *rec = v;
+ struct ftrace_profile_page *pg;
+
+ pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK);
+
+ again:
+ if (idx != 0)
+ rec++;
+
+ if ((void *)rec >= (void *)&pg->records[pg->index]) {
+ pg = pg->next;
+ if (!pg)
+ return NULL;
+ rec = &pg->records[0];
+ if (!rec->counter)
+ goto again;
+ }
+
+ return rec;
+}
+
+static void *function_stat_start(struct tracer_stat *trace)
+{
+ struct ftrace_profile_stat *stat =
+ container_of(trace, struct ftrace_profile_stat, stat);
+
+ if (!stat || !stat->start)
+ return NULL;
+
+ return function_stat_next(&stat->start->records[0], 0);
+}
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+/* function graph compares on total time */
+static int function_stat_cmp(void *p1, void *p2)
+{
+ struct ftrace_profile *a = p1;
+ struct ftrace_profile *b = p2;
+
+ if (a->time < b->time)
+ return -1;
+ if (a->time > b->time)
+ return 1;
+ else
+ return 0;
+}
+#else
+/* not function graph compares against hits */
+static int function_stat_cmp(void *p1, void *p2)
+{
+ struct ftrace_profile *a = p1;
+ struct ftrace_profile *b = p2;
+
+ if (a->counter < b->counter)
+ return -1;
+ if (a->counter > b->counter)
+ return 1;
+ else
+ return 0;
+}
+#endif
+
+static int function_stat_headers(struct seq_file *m)
+{
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ seq_printf(m, " Function "
+ "Hit Time Avg\n"
+ " -------- "
+ "--- ---- ---\n");
+#else
+ seq_printf(m, " Function Hit\n"
+ " -------- ---\n");
+#endif
+ return 0;
+}
+
+static int function_stat_show(struct seq_file *m, void *v)
+{
+ struct ftrace_profile *rec = v;
+ char str[KSYM_SYMBOL_LEN];
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ static DEFINE_MUTEX(mutex);
+ static struct trace_seq s;
+ unsigned long long avg;
+#endif
+
+ kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+ seq_printf(m, " %-30.30s %10lu", str, rec->counter);
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ seq_printf(m, " ");
+ avg = rec->time;
+ do_div(avg, rec->counter);
+
+ mutex_lock(&mutex);
+ trace_seq_init(&s);
+ trace_print_graph_duration(rec->time, &s);
+ trace_seq_puts(&s, " ");
+ trace_print_graph_duration(avg, &s);
+ trace_print_seq(m, &s);
+ mutex_unlock(&mutex);
+#endif
+ seq_putc(m, '\n');
+
+ return 0;
+}
+
+static void ftrace_profile_reset(struct ftrace_profile_stat *stat)
+{
+ struct ftrace_profile_page *pg;
+
+ pg = stat->pages = stat->start;
+
+ while (pg) {
+ memset(pg->records, 0, PROFILE_RECORDS_SIZE);
+ pg->index = 0;
+ pg = pg->next;
+ }
+
+ memset(stat->hash, 0,
+ FTRACE_PROFILE_HASH_SIZE * sizeof(struct hlist_head));
+}
+
+int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
+{
+ struct ftrace_profile_page *pg;
+ int functions;
+ int pages;
+ int i;
+
+ /* If we already allocated, do nothing */
+ if (stat->pages)
+ return 0;
+
+ stat->pages = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!stat->pages)
+ return -ENOMEM;
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+ functions = ftrace_update_tot_cnt;
+#else
+ /*
+ * We do not know the number of functions that exist because
+ * dynamic tracing is what counts them. With past experience
+ * we have around 20K functions. That should be more than enough.
+ * It is highly unlikely we will execute every function in
+ * the kernel.
+ */
+ functions = 20000;
+#endif
+
+ pg = stat->start = stat->pages;
+
+ pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE);
+
+ for (i = 0; i < pages; i++) {
+ pg->next = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!pg->next)
+ goto out_free;
+ pg = pg->next;
+ }
+
+ return 0;
+
+ out_free:
+ pg = stat->start;
+ while (pg) {
+ unsigned long tmp = (unsigned long)pg;
+
+ pg = pg->next;
+ free_page(tmp);
+ }
+
+ free_page((unsigned long)stat->pages);
+ stat->pages = NULL;
+ stat->start = NULL;
+
+ return -ENOMEM;
+}
+
+static int ftrace_profile_init_cpu(int cpu)
+{
+ struct ftrace_profile_stat *stat;
+ int size;
+
+ stat = &per_cpu(ftrace_profile_stats, cpu);
+
+ if (stat->hash) {
+ /* If the profile is already created, simply reset it */
+ ftrace_profile_reset(stat);
+ return 0;
+ }
+
+ /*
+ * We are profiling all functions, but usually only a few thousand
+ * functions are hit. We'll make a hash of 1024 items.
+ */
+ size = FTRACE_PROFILE_HASH_SIZE;
+
+ stat->hash = kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL);
+
+ if (!stat->hash)
+ return -ENOMEM;
+
+ if (!ftrace_profile_bits) {
+ size--;
+
+ for (; size; size >>= 1)
+ ftrace_profile_bits++;
+ }
+
+ /* Preallocate the function profiling pages */
+ if (ftrace_profile_pages_init(stat) < 0) {
+ kfree(stat->hash);
+ stat->hash = NULL;
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static int ftrace_profile_init(void)
+{
+ int cpu;
+ int ret = 0;
+
+ for_each_online_cpu(cpu) {
+ ret = ftrace_profile_init_cpu(cpu);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+/* interrupts must be disabled */
+static struct ftrace_profile *
+ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)
+{
+ struct ftrace_profile *rec;
+ struct hlist_head *hhd;
+ struct hlist_node *n;
+ unsigned long key;
+
+ key = hash_long(ip, ftrace_profile_bits);
+ hhd = &stat->hash[key];
+
+ if (hlist_empty(hhd))
+ return NULL;
+
+ hlist_for_each_entry_rcu(rec, n, hhd, node) {
+ if (rec->ip == ip)
+ return rec;
+ }
+
+ return NULL;
+}
+
+static void ftrace_add_profile(struct ftrace_profile_stat *stat,
+ struct ftrace_profile *rec)
+{
+ unsigned long key;
+
+ key = hash_long(rec->ip, ftrace_profile_bits);
+ hlist_add_head_rcu(&rec->node, &stat->hash[key]);
+}
+
+/*
+ * The memory is already allocated, this simply finds a new record to use.
+ */
+static struct ftrace_profile *
+ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip)
+{
+ struct ftrace_profile *rec = NULL;
+
+ /* prevent recursion (from NMIs) */
+ if (atomic_inc_return(&stat->disabled) != 1)
+ goto out;
+
+ /*
+ * Try to find the function again since an NMI
+ * could have added it
+ */
+ rec = ftrace_find_profiled_func(stat, ip);
+ if (rec)
+ goto out;
+
+ if (stat->pages->index == PROFILES_PER_PAGE) {
+ if (!stat->pages->next)
+ goto out;
+ stat->pages = stat->pages->next;
+ }
+
+ rec = &stat->pages->records[stat->pages->index++];
+ rec->ip = ip;
+ ftrace_add_profile(stat, rec);
+
+ out:
+ atomic_dec(&stat->disabled);
+
+ return rec;
+}
+
+static void
+function_profile_call(unsigned long ip, unsigned long parent_ip)
+{
+ struct ftrace_profile_stat *stat;
+ struct ftrace_profile *rec;
+ unsigned long flags;
+
+ if (!ftrace_profile_enabled)
+ return;
+
+ local_irq_save(flags);
+
+ stat = &__get_cpu_var(ftrace_profile_stats);
+ if (!stat->hash || !ftrace_profile_enabled)
+ goto out;
+
+ rec = ftrace_find_profiled_func(stat, ip);
+ if (!rec) {
+ rec = ftrace_profile_alloc(stat, ip);
+ if (!rec)
+ goto out;
+ }
+
+ rec->counter++;
+ out:
+ local_irq_restore(flags);
+}
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static int profile_graph_entry(struct ftrace_graph_ent *trace)
+{
+ function_profile_call(trace->func, 0);
+ return 1;
+}
+
+static void profile_graph_return(struct ftrace_graph_ret *trace)
+{
+ struct ftrace_profile_stat *stat;
+ unsigned long long calltime;
+ struct ftrace_profile *rec;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ stat = &__get_cpu_var(ftrace_profile_stats);
+ if (!stat->hash || !ftrace_profile_enabled)
+ goto out;
+
+ calltime = trace->rettime - trace->calltime;
+
+ if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) {
+ int index;
+
+ index = trace->depth;
+
+ /* Append this call time to the parent time to subtract */
+ if (index)
+ current->ret_stack[index - 1].subtime += calltime;
+
+ if (current->ret_stack[index].subtime < calltime)
+ calltime -= current->ret_stack[index].subtime;
+ else
+ calltime = 0;
+ }
+
+ rec = ftrace_find_profiled_func(stat, trace->func);
+ if (rec)
+ rec->time += calltime;
+
+ out:
+ local_irq_restore(flags);
+}
+
+static int register_ftrace_profiler(void)
+{
+ return register_ftrace_graph(&profile_graph_return,
+ &profile_graph_entry);
+}
+
+static void unregister_ftrace_profiler(void)
+{
+ unregister_ftrace_graph();
+}
+#else
+static struct ftrace_ops ftrace_profile_ops __read_mostly =
+{
+ .func = function_profile_call,
+};
+
+static int register_ftrace_profiler(void)
+{
+ return register_ftrace_function(&ftrace_profile_ops);
+}
+
+static void unregister_ftrace_profiler(void)
+{
+ unregister_ftrace_function(&ftrace_profile_ops);
+}
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
+static ssize_t
+ftrace_profile_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ unsigned long val;
+ char buf[64]; /* big enough to hold a number */
+ int ret;
+
+ if (cnt >= sizeof(buf))
+ return -EINVAL;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+
+ ret = strict_strtoul(buf, 10, &val);
+ if (ret < 0)
+ return ret;
+
+ val = !!val;
+
+ mutex_lock(&ftrace_profile_lock);
+ if (ftrace_profile_enabled ^ val) {
+ if (val) {
+ ret = ftrace_profile_init();
+ if (ret < 0) {
+ cnt = ret;
+ goto out;
+ }
+
+ ret = register_ftrace_profiler();
+ if (ret < 0) {
+ cnt = ret;
+ goto out;
+ }
+ ftrace_profile_enabled = 1;
+ } else {
+ ftrace_profile_enabled = 0;
+ /*
+ * unregister_ftrace_profiler calls stop_machine
+ * so this acts like an synchronize_sched.
+ */
+ unregister_ftrace_profiler();
+ }
+ }
+ out:
+ mutex_unlock(&ftrace_profile_lock);
+
+ filp->f_pos += cnt;
+
+ return cnt;
+}
+
+static ssize_t
+ftrace_profile_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[64]; /* big enough to hold a number */
+ int r;
+
+ r = sprintf(buf, "%u\n", ftrace_profile_enabled);
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static const struct file_operations ftrace_profile_fops = {
+ .open = tracing_open_generic,
+ .read = ftrace_profile_read,
+ .write = ftrace_profile_write,
+};
+
+/* used to initialize the real stat files */
+static struct tracer_stat function_stats __initdata = {
+ .name = "functions",
+ .stat_start = function_stat_start,
+ .stat_next = function_stat_next,
+ .stat_cmp = function_stat_cmp,
+ .stat_headers = function_stat_headers,
+ .stat_show = function_stat_show
+};
+
+static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
+{
+ struct ftrace_profile_stat *stat;
+ struct dentry *entry;
+ char *name;
+ int ret;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ stat = &per_cpu(ftrace_profile_stats, cpu);
+
+ /* allocate enough for function name + cpu number */
+ name = kmalloc(32, GFP_KERNEL);
+ if (!name) {
+ /*
+ * The files created are permanent, if something happens
+ * we still do not free memory.
+ */
+ WARN(1,
+ "Could not allocate stat file for cpu %d\n",
+ cpu);
+ return;
+ }
+ stat->stat = function_stats;
+ snprintf(name, 32, "function%d", cpu);
+ stat->stat.name = name;
+ ret = register_stat_tracer(&stat->stat);
+ if (ret) {
+ WARN(1,
+ "Could not register function stat for cpu %d\n",
+ cpu);
+ kfree(name);
+ return;
+ }
+ }
+
+ entry = debugfs_create_file("function_profile_enabled", 0644,
+ d_tracer, NULL, &ftrace_profile_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs "
+ "'function_profile_enabled' entry\n");
+}
+
+#else /* CONFIG_FUNCTION_PROFILER */
+static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
+{
+}
+#endif /* CONFIG_FUNCTION_PROFILER */
+
/* set when tracing only a pid */
struct pid *ftrace_pid_trace;
static struct pid * const ftrace_swapper_pid = &init_struct_pid;
@@ -261,7 +838,6 @@ struct ftrace_func_probe {
struct rcu_head rcu;
};
-
enum {
FTRACE_ENABLE_CALLS = (1 << 0),
FTRACE_DISABLE_CALLS = (1 << 1),
@@ -346,30 +922,6 @@ static void ftrace_free_rec(struct dyn_ftrace *rec)
rec->flags |= FTRACE_FL_FREE;
}
-void ftrace_release(void *start, unsigned long size)
-{
- struct dyn_ftrace *rec;
- struct ftrace_page *pg;
- unsigned long s = (unsigned long)start;
- unsigned long e = s + size;
-
- if (ftrace_disabled || !start)
- return;
-
- mutex_lock(&ftrace_lock);
- do_for_each_ftrace_rec(pg, rec) {
- if ((rec->ip >= s) && (rec->ip < e)) {
- /*
- * rec->ip is changed in ftrace_free_rec()
- * It should not between s and e if record was freed.
- */
- FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE);
- ftrace_free_rec(rec);
- }
- } while_for_each_ftrace_rec();
- mutex_unlock(&ftrace_lock);
-}
-
static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
{
struct dyn_ftrace *rec;
@@ -673,6 +1225,13 @@ static void ftrace_shutdown(int command)
return;
ftrace_start_up--;
+ /*
+ * Just warn in case of unbalance, no need to kill ftrace, it's not
+ * critical but the ftrace_call callers may be never nopped again after
+ * further ftrace uses.
+ */
+ WARN_ON_ONCE(ftrace_start_up < 0);
+
if (!ftrace_start_up)
command |= FTRACE_DISABLE_CALLS;
@@ -859,10 +1418,20 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)
{
struct ftrace_iterator *iter = m->private;
void *p = NULL;
+ loff_t l;
+
+ if (!(iter->flags & FTRACE_ITER_HASH))
+ *pos = 0;
iter->flags |= FTRACE_ITER_HASH;
- return t_hash_next(m, p, pos);
+ iter->hidx = 0;
+ for (l = 0; l <= *pos; ) {
+ p = t_hash_next(m, p, &l);
+ if (!p)
+ break;
+ }
+ return p;
}
static int t_hash_show(struct seq_file *m, void *v)
@@ -909,8 +1478,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
iter->pg = iter->pg->next;
iter->idx = 0;
goto retry;
- } else {
- iter->idx = -1;
}
} else {
rec = &iter->pg->records[iter->idx++];
@@ -939,6 +1506,7 @@ static void *t_start(struct seq_file *m, loff_t *pos)
{
struct ftrace_iterator *iter = m->private;
void *p = NULL;
+ loff_t l;
mutex_lock(&ftrace_lock);
/*
@@ -950,23 +1518,21 @@ static void *t_start(struct seq_file *m, loff_t *pos)
if (*pos > 0)
return t_hash_start(m, pos);
iter->flags |= FTRACE_ITER_PRINTALL;
- (*pos)++;
return iter;
}
if (iter->flags & FTRACE_ITER_HASH)
return t_hash_start(m, pos);
- if (*pos > 0) {
- if (iter->idx < 0)
- return p;
- (*pos)--;
- iter->idx--;
+ iter->pg = ftrace_pages_start;
+ iter->idx = 0;
+ for (l = 0; l <= *pos; ) {
+ p = t_next(m, p, &l);
+ if (!p)
+ break;
}
- p = t_next(m, p, pos);
-
- if (!p)
+ if (!p && iter->flags & FTRACE_ITER_FILTER)
return t_hash_start(m, pos);
return p;
@@ -1096,7 +1662,7 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
mutex_lock(&ftrace_regex_lock);
if ((file->f_mode & FMODE_WRITE) &&
- !(file->f_flags & O_APPEND))
+ (file->f_flags & O_TRUNC))
ftrace_filter_reset(enable);
if (file->f_mode & FMODE_READ) {
@@ -1408,7 +1974,7 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
static struct ftrace_ops trace_probe_ops __read_mostly =
{
- .func = function_trace_probe_call,
+ .func = function_trace_probe_call,
};
static int ftrace_probe_registered;
@@ -1823,6 +2389,45 @@ void ftrace_set_notrace(unsigned char *buf, int len, int reset)
ftrace_set_regex(buf, len, reset, 0);
}
+/*
+ * command line interface to allow users to set filters on boot up.
+ */
+#define FTRACE_FILTER_SIZE COMMAND_LINE_SIZE
+static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
+static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata;
+
+static int __init set_ftrace_notrace(char *str)
+{
+ strncpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
+ return 1;
+}
+__setup("ftrace_notrace=", set_ftrace_notrace);
+
+static int __init set_ftrace_filter(char *str)
+{
+ strncpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
+ return 1;
+}
+__setup("ftrace_filter=", set_ftrace_filter);
+
+static void __init set_ftrace_early_filter(char *buf, int enable)
+{
+ char *func;
+
+ while (buf) {
+ func = strsep(&buf, ",");
+ ftrace_set_regex(func, strlen(func), 0, enable);
+ }
+}
+
+static void __init set_ftrace_early_filters(void)
+{
+ if (ftrace_filter_buf[0])
+ set_ftrace_early_filter(ftrace_filter_buf, 1);
+ if (ftrace_notrace_buf[0])
+ set_ftrace_early_filter(ftrace_notrace_buf, 0);
+}
+
static int
ftrace_regex_release(struct inode *inode, struct file *file, int enable)
{
@@ -1903,32 +2508,31 @@ int ftrace_graph_count;
unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
static void *
-g_next(struct seq_file *m, void *v, loff_t *pos)
+__g_next(struct seq_file *m, loff_t *pos)
{
unsigned long *array = m->private;
- int index = *pos;
- (*pos)++;
-
- if (index >= ftrace_graph_count)
+ if (*pos >= ftrace_graph_count)
return NULL;
+ return &array[*pos];
+}
- return &array[index];
+static void *
+g_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ (*pos)++;
+ return __g_next(m, pos);
}
static void *g_start(struct seq_file *m, loff_t *pos)
{
- void *p = NULL;
-
mutex_lock(&graph_lock);
/* Nothing, tell g_show to print all functions are enabled */
if (!ftrace_graph_count && !*pos)
return (void *)1;
- p = g_next(m, p, pos);
-
- return p;
+ return __g_next(m, pos);
}
static void g_stop(struct seq_file *m, void *p)
@@ -1973,7 +2577,7 @@ ftrace_graph_open(struct inode *inode, struct file *file)
mutex_lock(&graph_lock);
if ((file->f_mode & FMODE_WRITE) &&
- !(file->f_flags & O_APPEND)) {
+ (file->f_flags & O_TRUNC)) {
ftrace_graph_count = 0;
memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
}
@@ -1992,6 +2596,14 @@ ftrace_graph_open(struct inode *inode, struct file *file)
}
static int
+ftrace_graph_release(struct inode *inode, struct file *file)
+{
+ if (file->f_mode & FMODE_READ)
+ seq_release(inode, file);
+ return 0;
+}
+
+static int
ftrace_set_func(unsigned long *array, int *idx, char *buffer)
{
struct dyn_ftrace *rec;
@@ -2120,46 +2732,32 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
}
static const struct file_operations ftrace_graph_fops = {
- .open = ftrace_graph_open,
- .read = seq_read,
- .write = ftrace_graph_write,
+ .open = ftrace_graph_open,
+ .read = seq_read,
+ .write = ftrace_graph_write,
+ .release = ftrace_graph_release,
};
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
{
- struct dentry *entry;
- entry = debugfs_create_file("available_filter_functions", 0444,
- d_tracer, NULL, &ftrace_avail_fops);
- if (!entry)
- pr_warning("Could not create debugfs "
- "'available_filter_functions' entry\n");
+ trace_create_file("available_filter_functions", 0444,
+ d_tracer, NULL, &ftrace_avail_fops);
- entry = debugfs_create_file("failures", 0444,
- d_tracer, NULL, &ftrace_failures_fops);
- if (!entry)
- pr_warning("Could not create debugfs 'failures' entry\n");
+ trace_create_file("failures", 0444,
+ d_tracer, NULL, &ftrace_failures_fops);
- entry = debugfs_create_file("set_ftrace_filter", 0644, d_tracer,
- NULL, &ftrace_filter_fops);
- if (!entry)
- pr_warning("Could not create debugfs "
- "'set_ftrace_filter' entry\n");
+ trace_create_file("set_ftrace_filter", 0644, d_tracer,
+ NULL, &ftrace_filter_fops);
- entry = debugfs_create_file("set_ftrace_notrace", 0644, d_tracer,
+ trace_create_file("set_ftrace_notrace", 0644, d_tracer,
NULL, &ftrace_notrace_fops);
- if (!entry)
- pr_warning("Could not create debugfs "
- "'set_ftrace_notrace' entry\n");
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- entry = debugfs_create_file("set_graph_function", 0444, d_tracer,
+ trace_create_file("set_graph_function", 0444, d_tracer,
NULL,
&ftrace_graph_fops);
- if (!entry)
- pr_warning("Could not create debugfs "
- "'set_graph_function' entry\n");
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
return 0;
@@ -2197,14 +2795,72 @@ static int ftrace_convert_nops(struct module *mod,
return 0;
}
-void ftrace_init_module(struct module *mod,
- unsigned long *start, unsigned long *end)
+#ifdef CONFIG_MODULES
+void ftrace_release(void *start, void *end)
+{
+ struct dyn_ftrace *rec;
+ struct ftrace_page *pg;
+ unsigned long s = (unsigned long)start;
+ unsigned long e = (unsigned long)end;
+
+ if (ftrace_disabled || !start || start == end)
+ return;
+
+ mutex_lock(&ftrace_lock);
+ do_for_each_ftrace_rec(pg, rec) {
+ if ((rec->ip >= s) && (rec->ip < e)) {
+ /*
+ * rec->ip is changed in ftrace_free_rec()
+ * It should not between s and e if record was freed.
+ */
+ FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE);
+ ftrace_free_rec(rec);
+ }
+ } while_for_each_ftrace_rec();
+ mutex_unlock(&ftrace_lock);
+}
+
+static void ftrace_init_module(struct module *mod,
+ unsigned long *start, unsigned long *end)
{
if (ftrace_disabled || start == end)
return;
ftrace_convert_nops(mod, start, end);
}
+static int ftrace_module_notify(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ struct module *mod = data;
+
+ switch (val) {
+ case MODULE_STATE_COMING:
+ ftrace_init_module(mod, mod->ftrace_callsites,
+ mod->ftrace_callsites +
+ mod->num_ftrace_callsites);
+ break;
+ case MODULE_STATE_GOING:
+ ftrace_release(mod->ftrace_callsites,
+ mod->ftrace_callsites +
+ mod->num_ftrace_callsites);
+ break;
+ }
+
+ return 0;
+}
+#else
+static int ftrace_module_notify(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ return 0;
+}
+#endif /* CONFIG_MODULES */
+
+struct notifier_block ftrace_module_nb = {
+ .notifier_call = ftrace_module_notify,
+ .priority = 0,
+};
+
extern unsigned long __start_mcount_loc[];
extern unsigned long __stop_mcount_loc[];
@@ -2236,6 +2892,12 @@ void __init ftrace_init(void)
__start_mcount_loc,
__stop_mcount_loc);
+ ret = register_module_notifier(&ftrace_module_nb);
+ if (ret)
+ pr_warning("Failed to register trace ftrace module notifier\n");
+
+ set_ftrace_early_filters();
+
return;
failed:
ftrace_disabled = 1;
@@ -2417,7 +3079,6 @@ static const struct file_operations ftrace_pid_fops = {
static __init int ftrace_init_debugfs(void)
{
struct dentry *d_tracer;
- struct dentry *entry;
d_tracer = tracing_init_dentry();
if (!d_tracer)
@@ -2425,11 +3086,11 @@ static __init int ftrace_init_debugfs(void)
ftrace_init_dyn_debugfs(d_tracer);
- entry = debugfs_create_file("set_ftrace_pid", 0644, d_tracer,
- NULL, &ftrace_pid_fops);
- if (!entry)
- pr_warning("Could not create debugfs "
- "'set_ftrace_pid' entry\n");
+ trace_create_file("set_ftrace_pid", 0644, d_tracer,
+ NULL, &ftrace_pid_fops);
+
+ ftrace_profile_debugfs(d_tracer);
+
return 0;
}
fs_initcall(ftrace_init_debugfs);
@@ -2507,10 +3168,10 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
- if (ret || !write || (last_ftrace_enabled == ftrace_enabled))
+ if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))
goto out;
- last_ftrace_enabled = ftrace_enabled;
+ last_ftrace_enabled = !!ftrace_enabled;
if (ftrace_enabled) {
@@ -2538,7 +3199,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-static atomic_t ftrace_graph_active;
+static int ftrace_graph_active;
static struct notifier_block ftrace_suspend_notifier;
int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
@@ -2580,12 +3241,12 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
}
if (t->ret_stack == NULL) {
- t->curr_ret_stack = -1;
- /* Make sure IRQs see the -1 first: */
- barrier();
- t->ret_stack = ret_stack_list[start++];
atomic_set(&t->tracing_graph_pause, 0);
atomic_set(&t->trace_overrun, 0);
+ t->curr_ret_stack = -1;
+ /* Make sure the tasks see the -1 first: */
+ smp_wmb();
+ t->ret_stack = ret_stack_list[start++];
}
} while_each_thread(g, t);
@@ -2643,8 +3304,10 @@ static int start_graph_tracing(void)
return -ENOMEM;
/* The cpu_boot init_task->ret_stack will never be freed */
- for_each_online_cpu(cpu)
- ftrace_graph_init_task(idle_task(cpu));
+ for_each_online_cpu(cpu) {
+ if (!idle_task(cpu)->ret_stack)
+ ftrace_graph_init_task(idle_task(cpu));
+ }
do {
ret = alloc_retstack_tasklist(ret_stack_list);
@@ -2690,7 +3353,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
mutex_lock(&ftrace_lock);
/* we currently allow only one tracer registered at a time */
- if (atomic_read(&ftrace_graph_active)) {
+ if (ftrace_graph_active) {
ret = -EBUSY;
goto out;
}
@@ -2698,10 +3361,10 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call;
register_pm_notifier(&ftrace_suspend_notifier);
- atomic_inc(&ftrace_graph_active);
+ ftrace_graph_active++;
ret = start_graph_tracing();
if (ret) {
- atomic_dec(&ftrace_graph_active);
+ ftrace_graph_active--;
goto out;
}
@@ -2719,10 +3382,10 @@ void unregister_ftrace_graph(void)
{
mutex_lock(&ftrace_lock);
- if (!unlikely(atomic_read(&ftrace_graph_active)))
+ if (unlikely(!ftrace_graph_active))
goto out;
- atomic_dec(&ftrace_graph_active);
+ ftrace_graph_active--;
unregister_trace_sched_switch(ftrace_graph_probe_sched_switch);
ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
ftrace_graph_entry = ftrace_graph_entry_stub;
@@ -2736,18 +3399,25 @@ void unregister_ftrace_graph(void)
/* Allocate a return stack for newly created task */
void ftrace_graph_init_task(struct task_struct *t)
{
- if (atomic_read(&ftrace_graph_active)) {
- t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
+ /* Make sure we do not use the parent ret_stack */
+ t->ret_stack = NULL;
+
+ if (ftrace_graph_active) {
+ struct ftrace_ret_stack *ret_stack;
+
+ ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
* sizeof(struct ftrace_ret_stack),
GFP_KERNEL);
- if (!t->ret_stack)
+ if (!ret_stack)
return;
t->curr_ret_stack = -1;
atomic_set(&t->tracing_graph_pause, 0);
atomic_set(&t->trace_overrun, 0);
t->ftrace_timestamp = 0;
- } else
- t->ret_stack = NULL;
+ /* make curr_ret_stack visable before we add the ret_stack */
+ smp_wmb();
+ t->ret_stack = ret_stack;
+ }
}
void ftrace_graph_exit_task(struct task_struct *t)
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index 5011f4d91e3..1edaa9516e8 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -12,7 +12,7 @@
#include <linux/dcache.h>
#include <linux/fs.h>
-#include <trace/kmemtrace.h>
+#include <linux/kmemtrace.h>
#include "trace_output.h"
#include "trace.h"
@@ -42,6 +42,7 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
gfp_t gfp_flags,
int node)
{
+ struct ftrace_event_call *call = &event_kmem_alloc;
struct trace_array *tr = kmemtrace_array;
struct kmemtrace_alloc_entry *entry;
struct ring_buffer_event *event;
@@ -62,7 +63,8 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
entry->gfp_flags = gfp_flags;
entry->node = node;
- ring_buffer_unlock_commit(tr->buffer, event);
+ if (!filter_check_discard(call, entry, tr->buffer, event))
+ ring_buffer_unlock_commit(tr->buffer, event);
trace_wake_up();
}
@@ -71,6 +73,7 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
unsigned long call_site,
const void *ptr)
{
+ struct ftrace_event_call *call = &event_kmem_free;
struct trace_array *tr = kmemtrace_array;
struct kmemtrace_free_entry *entry;
struct ring_buffer_event *event;
@@ -86,7 +89,8 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
entry->call_site = call_site;
entry->ptr = ptr;
- ring_buffer_unlock_commit(tr->buffer, event);
+ if (!filter_check_discard(call, entry, tr->buffer, event))
+ ring_buffer_unlock_commit(tr->buffer, event);
trace_wake_up();
}
@@ -182,7 +186,7 @@ static int kmem_trace_init(struct trace_array *tr)
int cpu;
kmemtrace_array = tr;
- for_each_cpu_mask(cpu, cpu_possible_map)
+ for_each_cpu(cpu, cpu_possible_mask)
tracing_reset(tr, cpu);
kmemtrace_start_probes();
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 960cbf44c84..a330513d96c 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -10,6 +10,7 @@
#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/hardirq.h>
+#include <linux/kmemcheck.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/mutex.h>
@@ -22,6 +23,28 @@
#include "trace.h"
/*
+ * The ring buffer header is special. We must manually up keep it.
+ */
+int ring_buffer_print_entry_header(struct trace_seq *s)
+{
+ int ret;
+
+ ret = trace_seq_printf(s, "# compressed entry header\n");
+ ret = trace_seq_printf(s, "\ttype_len : 5 bits\n");
+ ret = trace_seq_printf(s, "\ttime_delta : 27 bits\n");
+ ret = trace_seq_printf(s, "\tarray : 32 bits\n");
+ ret = trace_seq_printf(s, "\n");
+ ret = trace_seq_printf(s, "\tpadding : type == %d\n",
+ RINGBUF_TYPE_PADDING);
+ ret = trace_seq_printf(s, "\ttime_extend : type == %d\n",
+ RINGBUF_TYPE_TIME_EXTEND);
+ ret = trace_seq_printf(s, "\tdata max type_len == %d\n",
+ RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
+
+ return ret;
+}
+
+/*
* The ring buffer is made up of a list of pages. A separate list of pages is
* allocated for each CPU. A writer may only write to a buffer that is
* associated with the CPU it is currently executing on. A reader may read
@@ -182,7 +205,11 @@ EXPORT_SYMBOL_GPL(tracing_is_on);
#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
#define RB_ALIGNMENT 4U
-#define RB_MAX_SMALL_DATA 28
+#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
+#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
+
+/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
+#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
enum {
RB_LEN_TIME_EXTEND = 8,
@@ -191,48 +218,28 @@ enum {
static inline int rb_null_event(struct ring_buffer_event *event)
{
- return event->type == RINGBUF_TYPE_PADDING && event->time_delta == 0;
+ return event->type_len == RINGBUF_TYPE_PADDING
+ && event->time_delta == 0;
}
static inline int rb_discarded_event(struct ring_buffer_event *event)
{
- return event->type == RINGBUF_TYPE_PADDING && event->time_delta;
+ return event->type_len == RINGBUF_TYPE_PADDING && event->time_delta;
}
static void rb_event_set_padding(struct ring_buffer_event *event)
{
- event->type = RINGBUF_TYPE_PADDING;
+ event->type_len = RINGBUF_TYPE_PADDING;
event->time_delta = 0;
}
-/**
- * ring_buffer_event_discard - discard an event in the ring buffer
- * @buffer: the ring buffer
- * @event: the event to discard
- *
- * Sometimes a event that is in the ring buffer needs to be ignored.
- * This function lets the user discard an event in the ring buffer
- * and then that event will not be read later.
- *
- * Note, it is up to the user to be careful with this, and protect
- * against races. If the user discards an event that has been consumed
- * it is possible that it could corrupt the ring buffer.
- */
-void ring_buffer_event_discard(struct ring_buffer_event *event)
-{
- event->type = RINGBUF_TYPE_PADDING;
- /* time delta must be non zero */
- if (!event->time_delta)
- event->time_delta = 1;
-}
-
static unsigned
rb_event_data_length(struct ring_buffer_event *event)
{
unsigned length;
- if (event->len)
- length = event->len * RB_ALIGNMENT;
+ if (event->type_len)
+ length = event->type_len * RB_ALIGNMENT;
else
length = event->array[0];
return length + RB_EVNT_HDR_SIZE;
@@ -242,12 +249,12 @@ rb_event_data_length(struct ring_buffer_event *event)
static unsigned
rb_event_length(struct ring_buffer_event *event)
{
- switch (event->type) {
+ switch (event->type_len) {
case RINGBUF_TYPE_PADDING:
if (rb_null_event(event))
/* undefined */
return -1;
- return rb_event_data_length(event);
+ return event->array[0] + RB_EVNT_HDR_SIZE;
case RINGBUF_TYPE_TIME_EXTEND:
return RB_LEN_TIME_EXTEND;
@@ -271,7 +278,7 @@ rb_event_length(struct ring_buffer_event *event)
unsigned ring_buffer_event_length(struct ring_buffer_event *event)
{
unsigned length = rb_event_length(event);
- if (event->type != RINGBUF_TYPE_DATA)
+ if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
return length;
length -= RB_EVNT_HDR_SIZE;
if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0]))
@@ -284,9 +291,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length);
static void *
rb_event_data(struct ring_buffer_event *event)
{
- BUG_ON(event->type != RINGBUF_TYPE_DATA);
+ BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
/* If length is in len field, then array[0] has the data */
- if (event->len)
+ if (event->type_len)
return (void *)&event->array[0];
/* Otherwise length is in array[0] and array[1] has the data */
return (void *)&event->array[1];
@@ -316,9 +323,10 @@ struct buffer_data_page {
};
struct buffer_page {
+ struct list_head list; /* list of buffer pages */
local_t write; /* index for next write */
unsigned read; /* index for next read */
- struct list_head list; /* list of free pages */
+ local_t entries; /* entries on this page */
struct buffer_data_page *page; /* Actual data page */
};
@@ -361,6 +369,34 @@ static inline int test_time_stamp(u64 delta)
#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
+/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
+#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
+
+/* Max number of timestamps that can fit on a page */
+#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_STAMP)
+
+int ring_buffer_print_page_header(struct trace_seq *s)
+{
+ struct buffer_data_page field;
+ int ret;
+
+ ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t"
+ "offset:0;\tsize:%u;\n",
+ (unsigned int)sizeof(field.time_stamp));
+
+ ret = trace_seq_printf(s, "\tfield: local_t commit;\t"
+ "offset:%u;\tsize:%u;\n",
+ (unsigned int)offsetof(typeof(field), commit),
+ (unsigned int)sizeof(field.commit));
+
+ ret = trace_seq_printf(s, "\tfield: char data;\t"
+ "offset:%u;\tsize:%u;\n",
+ (unsigned int)offsetof(typeof(field), data),
+ (unsigned int)BUF_PAGE_SIZE);
+
+ return ret;
+}
+
/*
* head_page == tail_page && head == tail then buffer is empty.
*/
@@ -375,8 +411,13 @@ struct ring_buffer_per_cpu {
struct buffer_page *tail_page; /* write to tail */
struct buffer_page *commit_page; /* committed pages */
struct buffer_page *reader_page;
+ unsigned long nmi_dropped;
+ unsigned long commit_overrun;
unsigned long overrun;
- unsigned long entries;
+ unsigned long read;
+ local_t entries;
+ local_t committing;
+ local_t commits;
u64 write_stamp;
u64 read_stamp;
atomic_t record_disabled;
@@ -389,6 +430,8 @@ struct ring_buffer {
atomic_t record_disabled;
cpumask_var_t cpumask;
+ struct lock_class_key *reader_lock_key;
+
struct mutex mutex;
struct ring_buffer_per_cpu **buffers;
@@ -420,13 +463,18 @@ struct ring_buffer_iter {
/* Up this if you want to test the TIME_EXTENTS and normalization */
#define DEBUG_SHIFT 0
+static inline u64 rb_time_stamp(struct ring_buffer *buffer, int cpu)
+{
+ /* shift to debug/test normalization and TIME_EXTENTS */
+ return buffer->clock() << DEBUG_SHIFT;
+}
+
u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)
{
u64 time;
preempt_disable_notrace();
- /* shift to debug/test normalization and TIME_EXTENTS */
- time = buffer->clock() << DEBUG_SHIFT;
+ time = rb_time_stamp(buffer, cpu);
preempt_enable_no_resched_notrace();
return time;
@@ -523,6 +571,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
cpu_buffer->cpu = cpu;
cpu_buffer->buffer = buffer;
spin_lock_init(&cpu_buffer->reader_lock);
+ lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
INIT_LIST_HEAD(&cpu_buffer->pages);
@@ -572,12 +621,6 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
kfree(cpu_buffer);
}
-/*
- * Causes compile errors if the struct buffer_page gets bigger
- * than the struct page.
- */
-extern int ring_buffer_page_too_big(void);
-
#ifdef CONFIG_HOTPLUG_CPU
static int rb_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu);
@@ -593,17 +636,13 @@ static int rb_cpu_notify(struct notifier_block *self,
* when the buffer wraps. If this flag is not set, the buffer will
* drop data when the tail hits the head.
*/
-struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
+struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
+ struct lock_class_key *key)
{
struct ring_buffer *buffer;
int bsize;
int cpu;
- /* Paranoid! Optimizes out when all is well */
- if (sizeof(struct buffer_page) > sizeof(struct page))
- ring_buffer_page_too_big();
-
-
/* keep it in its own cache line */
buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
GFP_KERNEL);
@@ -616,10 +655,11 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
buffer->flags = flags;
buffer->clock = trace_clock_local;
+ buffer->reader_lock_key = key;
/* need at least two pages */
- if (buffer->pages == 1)
- buffer->pages++;
+ if (buffer->pages < 2)
+ buffer->pages = 2;
/*
* In case of non-hotplug cpu, if the ring-buffer is allocated
@@ -673,7 +713,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
kfree(buffer);
return NULL;
}
-EXPORT_SYMBOL_GPL(ring_buffer_alloc);
+EXPORT_SYMBOL_GPL(__ring_buffer_alloc);
/**
* ring_buffer_free - free a ring buffer.
@@ -695,6 +735,7 @@ ring_buffer_free(struct ring_buffer *buffer)
put_online_cpus();
+ kfree(buffer->buffers);
free_cpumask_var(buffer->cpumask);
kfree(buffer);
@@ -947,31 +988,6 @@ static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer)
return rb_page_commit(cpu_buffer->head_page);
}
-/*
- * When the tail hits the head and the buffer is in overwrite mode,
- * the head jumps to the next page and all content on the previous
- * page is discarded. But before doing so, we update the overrun
- * variable of the buffer.
- */
-static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
-{
- struct ring_buffer_event *event;
- unsigned long head;
-
- for (head = 0; head < rb_head_size(cpu_buffer);
- head += rb_event_length(event)) {
-
- event = __rb_page_index(cpu_buffer->head_page, head);
- if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
- return;
- /* Only count data entries */
- if (event->type != RINGBUF_TYPE_DATA)
- continue;
- cpu_buffer->overrun++;
- cpu_buffer->entries--;
- }
-}
-
static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
struct buffer_page **bpage)
{
@@ -988,12 +1004,12 @@ rb_event_index(struct ring_buffer_event *event)
{
unsigned long addr = (unsigned long)event;
- return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE);
+ return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE;
}
-static int
-rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
- struct ring_buffer_event *event)
+static inline int
+rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event)
{
unsigned long addr = (unsigned long)event;
unsigned long index;
@@ -1006,31 +1022,6 @@ rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
}
static void
-rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer,
- struct ring_buffer_event *event)
-{
- unsigned long addr = (unsigned long)event;
- unsigned long index;
-
- index = rb_event_index(event);
- addr &= PAGE_MASK;
-
- while (cpu_buffer->commit_page->page != (void *)addr) {
- if (RB_WARN_ON(cpu_buffer,
- cpu_buffer->commit_page == cpu_buffer->tail_page))
- return;
- cpu_buffer->commit_page->page->commit =
- cpu_buffer->commit_page->write;
- rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
- cpu_buffer->write_stamp =
- cpu_buffer->commit_page->page->time_stamp;
- }
-
- /* Now set the commit to the event's index */
- local_set(&cpu_buffer->commit_page->page->commit, index);
-}
-
-static void
rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
{
/*
@@ -1110,28 +1101,21 @@ static void
rb_update_event(struct ring_buffer_event *event,
unsigned type, unsigned length)
{
- event->type = type;
+ event->type_len = type;
switch (type) {
case RINGBUF_TYPE_PADDING:
- break;
-
case RINGBUF_TYPE_TIME_EXTEND:
- event->len = DIV_ROUND_UP(RB_LEN_TIME_EXTEND, RB_ALIGNMENT);
- break;
-
case RINGBUF_TYPE_TIME_STAMP:
- event->len = DIV_ROUND_UP(RB_LEN_TIME_STAMP, RB_ALIGNMENT);
break;
- case RINGBUF_TYPE_DATA:
+ case 0:
length -= RB_EVNT_HDR_SIZE;
- if (length > RB_MAX_SMALL_DATA) {
- event->len = 0;
+ if (length > RB_MAX_SMALL_DATA)
event->array[0] = length;
- } else
- event->len = DIV_ROUND_UP(length, RB_ALIGNMENT);
+ else
+ event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
break;
default:
BUG();
@@ -1155,158 +1139,241 @@ static unsigned rb_calculate_event_length(unsigned length)
return length;
}
-static struct ring_buffer_event *
-__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
- unsigned type, unsigned long length, u64 *ts)
+static inline void
+rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *tail_page,
+ unsigned long tail, unsigned long length)
{
- struct buffer_page *tail_page, *head_page, *reader_page, *commit_page;
- unsigned long tail, write;
- struct ring_buffer *buffer = cpu_buffer->buffer;
struct ring_buffer_event *event;
- unsigned long flags;
- bool lock_taken = false;
- commit_page = cpu_buffer->commit_page;
- /* we just need to protect against interrupts */
- barrier();
- tail_page = cpu_buffer->tail_page;
- write = local_add_return(length, &tail_page->write);
- tail = write - length;
+ /*
+ * Only the event that crossed the page boundary
+ * must fill the old tail_page with padding.
+ */
+ if (tail >= BUF_PAGE_SIZE) {
+ local_sub(length, &tail_page->write);
+ return;
+ }
- /* See if we shot pass the end of this buffer page */
- if (write > BUF_PAGE_SIZE) {
- struct buffer_page *next_page = tail_page;
+ event = __rb_page_index(tail_page, tail);
+ kmemcheck_annotate_bitfield(event, bitfield);
- local_irq_save(flags);
- /*
- * Since the write to the buffer is still not
- * fully lockless, we must be careful with NMIs.
- * The locks in the writers are taken when a write
- * crosses to a new page. The locks protect against
- * races with the readers (this will soon be fixed
- * with a lockless solution).
- *
- * Because we can not protect against NMIs, and we
- * want to keep traces reentrant, we need to manage
- * what happens when we are in an NMI.
- *
- * NMIs can happen after we take the lock.
- * If we are in an NMI, only take the lock
- * if it is not already taken. Otherwise
- * simply fail.
- */
- if (unlikely(in_nmi())) {
- if (!__raw_spin_trylock(&cpu_buffer->lock))
- goto out_reset;
- } else
- __raw_spin_lock(&cpu_buffer->lock);
+ /*
+ * If this event is bigger than the minimum size, then
+ * we need to be careful that we don't subtract the
+ * write counter enough to allow another writer to slip
+ * in on this page.
+ * We put in a discarded commit instead, to make sure
+ * that this space is not used again.
+ *
+ * If we are less than the minimum size, we don't need to
+ * worry about it.
+ */
+ if (tail > (BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE)) {
+ /* No room for any events */
- lock_taken = true;
+ /* Mark the rest of the page with padding */
+ rb_event_set_padding(event);
- rb_inc_page(cpu_buffer, &next_page);
+ /* Set the write back to the previous setting */
+ local_sub(length, &tail_page->write);
+ return;
+ }
- head_page = cpu_buffer->head_page;
- reader_page = cpu_buffer->reader_page;
+ /* Put in a discarded event */
+ event->array[0] = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE;
+ event->type_len = RINGBUF_TYPE_PADDING;
+ /* time delta must be non zero */
+ event->time_delta = 1;
+ /* Account for this as an entry */
+ local_inc(&tail_page->entries);
+ local_inc(&cpu_buffer->entries);
- /* we grabbed the lock before incrementing */
- if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
- goto out_reset;
+ /* Set write to end of buffer */
+ length = (tail + length) - BUF_PAGE_SIZE;
+ local_sub(length, &tail_page->write);
+}
- /*
- * If for some reason, we had an interrupt storm that made
- * it all the way around the buffer, bail, and warn
- * about it.
- */
- if (unlikely(next_page == commit_page)) {
- WARN_ON_ONCE(1);
+static struct ring_buffer_event *
+rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned long length, unsigned long tail,
+ struct buffer_page *commit_page,
+ struct buffer_page *tail_page, u64 *ts)
+{
+ struct buffer_page *next_page, *head_page, *reader_page;
+ struct ring_buffer *buffer = cpu_buffer->buffer;
+ bool lock_taken = false;
+ unsigned long flags;
+
+ next_page = tail_page;
+
+ local_irq_save(flags);
+ /*
+ * Since the write to the buffer is still not
+ * fully lockless, we must be careful with NMIs.
+ * The locks in the writers are taken when a write
+ * crosses to a new page. The locks protect against
+ * races with the readers (this will soon be fixed
+ * with a lockless solution).
+ *
+ * Because we can not protect against NMIs, and we
+ * want to keep traces reentrant, we need to manage
+ * what happens when we are in an NMI.
+ *
+ * NMIs can happen after we take the lock.
+ * If we are in an NMI, only take the lock
+ * if it is not already taken. Otherwise
+ * simply fail.
+ */
+ if (unlikely(in_nmi())) {
+ if (!__raw_spin_trylock(&cpu_buffer->lock)) {
+ cpu_buffer->nmi_dropped++;
goto out_reset;
}
+ } else
+ __raw_spin_lock(&cpu_buffer->lock);
- if (next_page == head_page) {
- if (!(buffer->flags & RB_FL_OVERWRITE))
- goto out_reset;
+ lock_taken = true;
- /* tail_page has not moved yet? */
- if (tail_page == cpu_buffer->tail_page) {
- /* count overflows */
- rb_update_overflow(cpu_buffer);
+ rb_inc_page(cpu_buffer, &next_page);
- rb_inc_page(cpu_buffer, &head_page);
- cpu_buffer->head_page = head_page;
- cpu_buffer->head_page->read = 0;
- }
- }
+ head_page = cpu_buffer->head_page;
+ reader_page = cpu_buffer->reader_page;
- /*
- * If the tail page is still the same as what we think
- * it is, then it is up to us to update the tail
- * pointer.
- */
+ /* we grabbed the lock before incrementing */
+ if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
+ goto out_reset;
+
+ /*
+ * If for some reason, we had an interrupt storm that made
+ * it all the way around the buffer, bail, and warn
+ * about it.
+ */
+ if (unlikely(next_page == commit_page)) {
+ cpu_buffer->commit_overrun++;
+ goto out_reset;
+ }
+
+ if (next_page == head_page) {
+ if (!(buffer->flags & RB_FL_OVERWRITE))
+ goto out_reset;
+
+ /* tail_page has not moved yet? */
if (tail_page == cpu_buffer->tail_page) {
- local_set(&next_page->write, 0);
- local_set(&next_page->page->commit, 0);
- cpu_buffer->tail_page = next_page;
+ /* count overflows */
+ cpu_buffer->overrun +=
+ local_read(&head_page->entries);
- /* reread the time stamp */
- *ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu);
- cpu_buffer->tail_page->page->time_stamp = *ts;
+ rb_inc_page(cpu_buffer, &head_page);
+ cpu_buffer->head_page = head_page;
+ cpu_buffer->head_page->read = 0;
}
+ }
- /*
- * The actual tail page has moved forward.
- */
- if (tail < BUF_PAGE_SIZE) {
- /* Mark the rest of the page with padding */
- event = __rb_page_index(tail_page, tail);
- rb_event_set_padding(event);
- }
+ /*
+ * If the tail page is still the same as what we think
+ * it is, then it is up to us to update the tail
+ * pointer.
+ */
+ if (tail_page == cpu_buffer->tail_page) {
+ local_set(&next_page->write, 0);
+ local_set(&next_page->entries, 0);
+ local_set(&next_page->page->commit, 0);
+ cpu_buffer->tail_page = next_page;
+
+ /* reread the time stamp */
+ *ts = rb_time_stamp(buffer, cpu_buffer->cpu);
+ cpu_buffer->tail_page->page->time_stamp = *ts;
+ }
- if (tail <= BUF_PAGE_SIZE)
- /* Set the write back to the previous setting */
- local_set(&tail_page->write, tail);
+ rb_reset_tail(cpu_buffer, tail_page, tail, length);
- /*
- * If this was a commit entry that failed,
- * increment that too
- */
- if (tail_page == cpu_buffer->commit_page &&
- tail == rb_commit_index(cpu_buffer)) {
- rb_set_commit_to_write(cpu_buffer);
- }
+ __raw_spin_unlock(&cpu_buffer->lock);
+ local_irq_restore(flags);
+
+ /* fail and let the caller try again */
+ return ERR_PTR(-EAGAIN);
+
+ out_reset:
+ /* reset write */
+ rb_reset_tail(cpu_buffer, tail_page, tail, length);
+ if (likely(lock_taken))
__raw_spin_unlock(&cpu_buffer->lock);
- local_irq_restore(flags);
+ local_irq_restore(flags);
+ return NULL;
+}
- /* fail and let the caller try again */
- return ERR_PTR(-EAGAIN);
- }
+static struct ring_buffer_event *
+__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned type, unsigned long length, u64 *ts)
+{
+ struct buffer_page *tail_page, *commit_page;
+ struct ring_buffer_event *event;
+ unsigned long tail, write;
- /* We reserved something on the buffer */
+ commit_page = cpu_buffer->commit_page;
+ /* we just need to protect against interrupts */
+ barrier();
+ tail_page = cpu_buffer->tail_page;
+ write = local_add_return(length, &tail_page->write);
+ tail = write - length;
- if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE))
- return NULL;
+ /* See if we shot pass the end of this buffer page */
+ if (write > BUF_PAGE_SIZE)
+ return rb_move_tail(cpu_buffer, length, tail,
+ commit_page, tail_page, ts);
+
+ /* We reserved something on the buffer */
event = __rb_page_index(tail_page, tail);
+ kmemcheck_annotate_bitfield(event, bitfield);
rb_update_event(event, type, length);
+ /* The passed in type is zero for DATA */
+ if (likely(!type))
+ local_inc(&tail_page->entries);
+
/*
- * If this is a commit and the tail is zero, then update
- * this page's time stamp.
+ * If this is the first commit on the page, then update
+ * its timestamp.
*/
- if (!tail && rb_is_commit(cpu_buffer, event))
- cpu_buffer->commit_page->page->time_stamp = *ts;
+ if (!tail)
+ tail_page->page->time_stamp = *ts;
return event;
+}
- out_reset:
- /* reset write */
- if (tail <= BUF_PAGE_SIZE)
- local_set(&tail_page->write, tail);
+static inline int
+rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event)
+{
+ unsigned long new_index, old_index;
+ struct buffer_page *bpage;
+ unsigned long index;
+ unsigned long addr;
- if (likely(lock_taken))
- __raw_spin_unlock(&cpu_buffer->lock);
- local_irq_restore(flags);
- return NULL;
+ new_index = rb_event_index(event);
+ old_index = new_index + rb_event_length(event);
+ addr = (unsigned long)event;
+ addr &= PAGE_MASK;
+
+ bpage = cpu_buffer->tail_page;
+
+ if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
+ /*
+ * This is on the tail page. It is possible that
+ * a write could come in and move the tail page
+ * and write to the next page. That is fine
+ * because we just shorten what is on this page.
+ */
+ index = local_cmpxchg(&bpage->write, old_index, new_index);
+ if (index == old_index)
+ return 1;
+ }
+
+ /* could not discard */
+ return 0;
}
static int
@@ -1341,26 +1408,33 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
return -EAGAIN;
/* Only a commited time event can update the write stamp */
- if (rb_is_commit(cpu_buffer, event)) {
+ if (rb_event_is_commit(cpu_buffer, event)) {
/*
- * If this is the first on the page, then we need to
- * update the page itself, and just put in a zero.
+ * If this is the first on the page, then it was
+ * updated with the page itself. Try to discard it
+ * and if we can't just make it zero.
*/
if (rb_event_index(event)) {
event->time_delta = *delta & TS_MASK;
event->array[0] = *delta >> TS_SHIFT;
} else {
- cpu_buffer->commit_page->page->time_stamp = *ts;
- event->time_delta = 0;
- event->array[0] = 0;
+ /* try to discard, since we do not need this */
+ if (!rb_try_to_discard(cpu_buffer, event)) {
+ /* nope, just zero it */
+ event->time_delta = 0;
+ event->array[0] = 0;
+ }
}
cpu_buffer->write_stamp = *ts;
/* let the caller know this was the commit */
ret = 1;
} else {
- /* Darn, this is just wasted space */
- event->time_delta = 0;
- event->array[0] = 0;
+ /* Try to discard the event */
+ if (!rb_try_to_discard(cpu_buffer, event)) {
+ /* Darn, this is just wasted space */
+ event->time_delta = 0;
+ event->array[0] = 0;
+ }
ret = 0;
}
@@ -1369,15 +1443,56 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
return ret;
}
+static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ local_inc(&cpu_buffer->committing);
+ local_inc(&cpu_buffer->commits);
+}
+
+static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ unsigned long commits;
+
+ if (RB_WARN_ON(cpu_buffer,
+ !local_read(&cpu_buffer->committing)))
+ return;
+
+ again:
+ commits = local_read(&cpu_buffer->commits);
+ /* synchronize with interrupts */
+ barrier();
+ if (local_read(&cpu_buffer->committing) == 1)
+ rb_set_commit_to_write(cpu_buffer);
+
+ local_dec(&cpu_buffer->committing);
+
+ /* synchronize with interrupts */
+ barrier();
+
+ /*
+ * Need to account for interrupts coming in between the
+ * updating of the commit page and the clearing of the
+ * committing counter.
+ */
+ if (unlikely(local_read(&cpu_buffer->commits) != commits) &&
+ !local_read(&cpu_buffer->committing)) {
+ local_inc(&cpu_buffer->committing);
+ goto again;
+ }
+}
+
static struct ring_buffer_event *
rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
- unsigned type, unsigned long length)
+ unsigned long length)
{
struct ring_buffer_event *event;
- u64 ts, delta;
+ u64 ts, delta = 0;
int commit = 0;
int nr_loops = 0;
+ rb_start_commit(cpu_buffer);
+
+ length = rb_calculate_event_length(length);
again:
/*
* We allow for interrupts to reenter here and do a trace.
@@ -1389,9 +1504,9 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
* Bail!
*/
if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
- return NULL;
+ goto out_fail;
- ts = ring_buffer_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu);
+ ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu);
/*
* Only the first commit can update the timestamp.
@@ -1401,63 +1516,93 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
* also be made. But only the entry that did the actual
* commit will be something other than zero.
*/
- if (cpu_buffer->tail_page == cpu_buffer->commit_page &&
- rb_page_write(cpu_buffer->tail_page) ==
- rb_commit_index(cpu_buffer)) {
+ if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page &&
+ rb_page_write(cpu_buffer->tail_page) ==
+ rb_commit_index(cpu_buffer))) {
+ u64 diff;
- delta = ts - cpu_buffer->write_stamp;
+ diff = ts - cpu_buffer->write_stamp;
- /* make sure this delta is calculated here */
+ /* make sure this diff is calculated here */
barrier();
/* Did the write stamp get updated already? */
if (unlikely(ts < cpu_buffer->write_stamp))
- delta = 0;
+ goto get_event;
- if (test_time_stamp(delta)) {
+ delta = diff;
+ if (unlikely(test_time_stamp(delta))) {
commit = rb_add_time_stamp(cpu_buffer, &ts, &delta);
-
if (commit == -EBUSY)
- return NULL;
+ goto out_fail;
if (commit == -EAGAIN)
goto again;
RB_WARN_ON(cpu_buffer, commit < 0);
}
- } else
- /* Non commits have zero deltas */
- delta = 0;
+ }
- event = __rb_reserve_next(cpu_buffer, type, length, &ts);
- if (PTR_ERR(event) == -EAGAIN)
+ get_event:
+ event = __rb_reserve_next(cpu_buffer, 0, length, &ts);
+ if (unlikely(PTR_ERR(event) == -EAGAIN))
goto again;
- if (!event) {
- if (unlikely(commit))
- /*
- * Ouch! We needed a timestamp and it was commited. But
- * we didn't get our event reserved.
- */
- rb_set_commit_to_write(cpu_buffer);
- return NULL;
- }
+ if (!event)
+ goto out_fail;
- /*
- * If the timestamp was commited, make the commit our entry
- * now so that we will update it when needed.
- */
- if (commit)
- rb_set_commit_event(cpu_buffer, event);
- else if (!rb_is_commit(cpu_buffer, event))
+ if (!rb_event_is_commit(cpu_buffer, event))
delta = 0;
event->time_delta = delta;
return event;
+
+ out_fail:
+ rb_end_commit(cpu_buffer);
+ return NULL;
+}
+
+#ifdef CONFIG_TRACING
+
+#define TRACE_RECURSIVE_DEPTH 16
+
+static int trace_recursive_lock(void)
+{
+ current->trace_recursion++;
+
+ if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
+ return 0;
+
+ /* Disable all tracing before we do anything else */
+ tracing_off_permanent();
+
+ printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:"
+ "HC[%lu]:SC[%lu]:NMI[%lu]\n",
+ current->trace_recursion,
+ hardirq_count() >> HARDIRQ_SHIFT,
+ softirq_count() >> SOFTIRQ_SHIFT,
+ in_nmi());
+
+ WARN_ON_ONCE(1);
+ return -1;
+}
+
+static void trace_recursive_unlock(void)
+{
+ WARN_ON_ONCE(!current->trace_recursion);
+
+ current->trace_recursion--;
}
+#else
+
+#define trace_recursive_lock() (0)
+#define trace_recursive_unlock() do { } while (0)
+
+#endif
+
static DEFINE_PER_CPU(int, rb_need_resched);
/**
@@ -1491,6 +1636,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
/* If we are tracing schedule, we don't want to recurse */
resched = ftrace_preempt_disable();
+ if (trace_recursive_lock())
+ goto out_nocheck;
+
cpu = raw_smp_processor_id();
if (!cpumask_test_cpu(cpu, buffer->cpumask))
@@ -1501,11 +1649,10 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
if (atomic_read(&cpu_buffer->record_disabled))
goto out;
- length = rb_calculate_event_length(length);
- if (length > BUF_PAGE_SIZE)
+ if (length > BUF_MAX_DATA_SIZE)
goto out;
- event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length);
+ event = rb_reserve_next_event(cpu_buffer, length);
if (!event)
goto out;
@@ -1520,6 +1667,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
return event;
out:
+ trace_recursive_unlock();
+
+ out_nocheck:
ftrace_preempt_enable(resched);
return NULL;
}
@@ -1528,15 +1678,16 @@ EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_event *event)
{
- cpu_buffer->entries++;
+ local_inc(&cpu_buffer->entries);
- /* Only process further if we own the commit */
- if (!rb_is_commit(cpu_buffer, event))
- return;
-
- cpu_buffer->write_stamp += event->time_delta;
+ /*
+ * The event first in the commit queue updates the
+ * time stamp.
+ */
+ if (rb_event_is_commit(cpu_buffer, event))
+ cpu_buffer->write_stamp += event->time_delta;
- rb_set_commit_to_write(cpu_buffer);
+ rb_end_commit(cpu_buffer);
}
/**
@@ -1558,6 +1709,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
rb_commit(cpu_buffer, event);
+ trace_recursive_unlock();
+
/*
* Only the last preempt count needs to restore preemption.
*/
@@ -1570,6 +1723,93 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
}
EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
+static inline void rb_event_discard(struct ring_buffer_event *event)
+{
+ /* array[0] holds the actual length for the discarded event */
+ event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
+ event->type_len = RINGBUF_TYPE_PADDING;
+ /* time delta must be non zero */
+ if (!event->time_delta)
+ event->time_delta = 1;
+}
+
+/**
+ * ring_buffer_event_discard - discard any event in the ring buffer
+ * @event: the event to discard
+ *
+ * Sometimes a event that is in the ring buffer needs to be ignored.
+ * This function lets the user discard an event in the ring buffer
+ * and then that event will not be read later.
+ *
+ * Note, it is up to the user to be careful with this, and protect
+ * against races. If the user discards an event that has been consumed
+ * it is possible that it could corrupt the ring buffer.
+ */
+void ring_buffer_event_discard(struct ring_buffer_event *event)
+{
+ rb_event_discard(event);
+}
+EXPORT_SYMBOL_GPL(ring_buffer_event_discard);
+
+/**
+ * ring_buffer_commit_discard - discard an event that has not been committed
+ * @buffer: the ring buffer
+ * @event: non committed event to discard
+ *
+ * This is similar to ring_buffer_event_discard but must only be
+ * performed on an event that has not been committed yet. The difference
+ * is that this will also try to free the event from the ring buffer
+ * if another event has not been added behind it.
+ *
+ * If another event has been added behind it, it will set the event
+ * up as discarded, and perform the commit.
+ *
+ * If this function is called, do not call ring_buffer_unlock_commit on
+ * the event.
+ */
+void ring_buffer_discard_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu;
+
+ /* The event is discarded regardless */
+ rb_event_discard(event);
+
+ cpu = smp_processor_id();
+ cpu_buffer = buffer->buffers[cpu];
+
+ /*
+ * This must only be called if the event has not been
+ * committed yet. Thus we can assume that preemption
+ * is still disabled.
+ */
+ RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing));
+
+ if (rb_try_to_discard(cpu_buffer, event))
+ goto out;
+
+ /*
+ * The commit is still visible by the reader, so we
+ * must increment entries.
+ */
+ local_inc(&cpu_buffer->entries);
+ out:
+ rb_end_commit(cpu_buffer);
+
+ trace_recursive_unlock();
+
+ /*
+ * Only the last preempt count needs to restore preemption.
+ */
+ if (preempt_count() == 1)
+ ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
+ else
+ preempt_enable_no_resched_notrace();
+
+}
+EXPORT_SYMBOL_GPL(ring_buffer_discard_commit);
+
/**
* ring_buffer_write - write data to the buffer without reserving
* @buffer: The ring buffer to write to.
@@ -1589,7 +1829,6 @@ int ring_buffer_write(struct ring_buffer *buffer,
{
struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_event *event;
- unsigned long event_length;
void *body;
int ret = -EBUSY;
int cpu, resched;
@@ -1612,9 +1851,10 @@ int ring_buffer_write(struct ring_buffer *buffer,
if (atomic_read(&cpu_buffer->record_disabled))
goto out;
- event_length = rb_calculate_event_length(length);
- event = rb_reserve_next_event(cpu_buffer,
- RINGBUF_TYPE_DATA, event_length);
+ if (length > BUF_MAX_DATA_SIZE)
+ goto out;
+
+ event = rb_reserve_next_event(cpu_buffer, length);
if (!event)
goto out;
@@ -1728,7 +1968,8 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
return 0;
cpu_buffer = buffer->buffers[cpu];
- ret = cpu_buffer->entries;
+ ret = (local_read(&cpu_buffer->entries) - cpu_buffer->overrun)
+ - cpu_buffer->read;
return ret;
}
@@ -1755,6 +1996,47 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
/**
+ * ring_buffer_nmi_dropped_cpu - get the number of nmis that were dropped
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the number of overruns from
+ */
+unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long ret;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return 0;
+
+ cpu_buffer = buffer->buffers[cpu];
+ ret = cpu_buffer->nmi_dropped;
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_nmi_dropped_cpu);
+
+/**
+ * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the number of overruns from
+ */
+unsigned long
+ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long ret;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return 0;
+
+ cpu_buffer = buffer->buffers[cpu];
+ ret = cpu_buffer->commit_overrun;
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu);
+
+/**
* ring_buffer_entries - get the number of entries in a buffer
* @buffer: The ring buffer
*
@@ -1770,7 +2052,8 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
/* if you care about this being correct, lock the buffer */
for_each_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
- entries += cpu_buffer->entries;
+ entries += (local_read(&cpu_buffer->entries) -
+ cpu_buffer->overrun) - cpu_buffer->read;
}
return entries;
@@ -1862,7 +2145,7 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
{
u64 delta;
- switch (event->type) {
+ switch (event->type_len) {
case RINGBUF_TYPE_PADDING:
return;
@@ -1893,7 +2176,7 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
{
u64 delta;
- switch (event->type) {
+ switch (event->type_len) {
case RINGBUF_TYPE_PADDING:
return;
@@ -1966,6 +2249,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->reader_page->list.prev = reader->list.prev;
local_set(&cpu_buffer->reader_page->write, 0);
+ local_set(&cpu_buffer->reader_page->entries, 0);
local_set(&cpu_buffer->reader_page->page->commit, 0);
/* Make the reader page now replace the head */
@@ -2008,8 +2292,9 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
event = rb_reader_event(cpu_buffer);
- if (event->type == RINGBUF_TYPE_DATA || rb_discarded_event(event))
- cpu_buffer->entries--;
+ if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX
+ || rb_discarded_event(event))
+ cpu_buffer->read++;
rb_update_read_stamp(cpu_buffer, event);
@@ -2031,8 +2316,8 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
* Check if we are at the end of the buffer.
*/
if (iter->head >= rb_page_size(iter->head_page)) {
- if (RB_WARN_ON(buffer,
- iter->head_page == cpu_buffer->commit_page))
+ /* discarded commits can make the page empty */
+ if (iter->head_page == cpu_buffer->commit_page)
return;
rb_inc_iter(iter);
return;
@@ -2075,12 +2360,10 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
/*
* We repeat when a timestamp is encountered. It is possible
* to get multiple timestamps from an interrupt entering just
- * as one timestamp is about to be written. The max times
- * that this can happen is the number of nested interrupts we
- * can have. Nesting 10 deep of interrupts is clearly
- * an anomaly.
+ * as one timestamp is about to be written, or from discarded
+ * commits. The most that we can have is the number on a single page.
*/
- if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10))
+ if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
return NULL;
reader = rb_get_reader_page(cpu_buffer);
@@ -2089,7 +2372,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
event = rb_reader_event(cpu_buffer);
- switch (event->type) {
+ switch (event->type_len) {
case RINGBUF_TYPE_PADDING:
if (rb_null_event(event))
RB_WARN_ON(cpu_buffer, 1);
@@ -2101,7 +2384,6 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
* the box. Return the padding, and we will release
* the current locks, and try again.
*/
- rb_advance_reader(cpu_buffer);
return event;
case RINGBUF_TYPE_TIME_EXTEND:
@@ -2146,14 +2428,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
again:
/*
- * We repeat when a timestamp is encountered. It is possible
- * to get multiple timestamps from an interrupt entering just
- * as one timestamp is about to be written. The max times
- * that this can happen is the number of nested interrupts we
- * can have. Nesting 10 deep of interrupts is clearly
- * an anomaly.
+ * We repeat when a timestamp is encountered.
+ * We can get multiple timestamps by nested interrupts or also
+ * if filtering is on (discarding commits). Since discarding
+ * commits can be frequent we can get a lot of timestamps.
+ * But we limit them by not adding timestamps if they begin
+ * at the start of a page.
*/
- if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10))
+ if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
return NULL;
if (rb_per_cpu_empty(cpu_buffer))
@@ -2161,7 +2443,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
event = rb_iter_head_event(iter);
- switch (event->type) {
+ switch (event->type_len) {
case RINGBUF_TYPE_PADDING:
if (rb_null_event(event)) {
rb_inc_iter(iter);
@@ -2196,6 +2478,21 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
}
EXPORT_SYMBOL_GPL(ring_buffer_iter_peek);
+static inline int rb_ok_to_lock(void)
+{
+ /*
+ * If an NMI die dumps out the content of the ring buffer
+ * do not grab locks. We also permanently disable the ring
+ * buffer too. A one time deal is all you get from reading
+ * the ring buffer from an NMI.
+ */
+ if (likely(!in_nmi()))
+ return 1;
+
+ tracing_off_permanent();
+ return 0;
+}
+
/**
* ring_buffer_peek - peek at the next event to be read
* @buffer: The ring buffer to read
@@ -2211,16 +2508,24 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
struct ring_buffer_event *event;
unsigned long flags;
+ int dolock;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return NULL;
+ dolock = rb_ok_to_lock();
again:
- spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ local_irq_save(flags);
+ if (dolock)
+ spin_lock(&cpu_buffer->reader_lock);
event = rb_buffer_peek(buffer, cpu, ts);
- spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ if (event && event->type_len == RINGBUF_TYPE_PADDING)
+ rb_advance_reader(cpu_buffer);
+ if (dolock)
+ spin_unlock(&cpu_buffer->reader_lock);
+ local_irq_restore(flags);
- if (event && event->type == RINGBUF_TYPE_PADDING) {
+ if (event && event->type_len == RINGBUF_TYPE_PADDING) {
cpu_relax();
goto again;
}
@@ -2248,7 +2553,7 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
event = rb_iter_peek(iter, ts);
spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
- if (event && event->type == RINGBUF_TYPE_PADDING) {
+ if (event && event->type_len == RINGBUF_TYPE_PADDING) {
cpu_relax();
goto again;
}
@@ -2270,6 +2575,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_event *event = NULL;
unsigned long flags;
+ int dolock;
+
+ dolock = rb_ok_to_lock();
again:
/* might be called in atomic */
@@ -2279,21 +2587,22 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
goto out;
cpu_buffer = buffer->buffers[cpu];
- spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ local_irq_save(flags);
+ if (dolock)
+ spin_lock(&cpu_buffer->reader_lock);
event = rb_buffer_peek(buffer, cpu, ts);
- if (!event)
- goto out_unlock;
-
- rb_advance_reader(cpu_buffer);
+ if (event)
+ rb_advance_reader(cpu_buffer);
- out_unlock:
- spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ if (dolock)
+ spin_unlock(&cpu_buffer->reader_lock);
+ local_irq_restore(flags);
out:
preempt_enable();
- if (event && event->type == RINGBUF_TYPE_PADDING) {
+ if (event && event->type_len == RINGBUF_TYPE_PADDING) {
cpu_relax();
goto again;
}
@@ -2386,7 +2695,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
out:
spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
- if (event && event->type == RINGBUF_TYPE_PADDING) {
+ if (event && event->type_len == RINGBUF_TYPE_PADDING) {
cpu_relax();
goto again;
}
@@ -2411,6 +2720,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->head_page
= list_entry(cpu_buffer->pages.next, struct buffer_page, list);
local_set(&cpu_buffer->head_page->write, 0);
+ local_set(&cpu_buffer->head_page->entries, 0);
local_set(&cpu_buffer->head_page->page->commit, 0);
cpu_buffer->head_page->read = 0;
@@ -2420,11 +2730,17 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
local_set(&cpu_buffer->reader_page->write, 0);
+ local_set(&cpu_buffer->reader_page->entries, 0);
local_set(&cpu_buffer->reader_page->page->commit, 0);
cpu_buffer->reader_page->read = 0;
+ cpu_buffer->nmi_dropped = 0;
+ cpu_buffer->commit_overrun = 0;
cpu_buffer->overrun = 0;
- cpu_buffer->entries = 0;
+ cpu_buffer->read = 0;
+ local_set(&cpu_buffer->entries, 0);
+ local_set(&cpu_buffer->committing, 0);
+ local_set(&cpu_buffer->commits, 0);
cpu_buffer->write_stamp = 0;
cpu_buffer->read_stamp = 0;
@@ -2443,6 +2759,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return;
+ atomic_inc(&cpu_buffer->record_disabled);
+
spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
__raw_spin_lock(&cpu_buffer->lock);
@@ -2452,6 +2770,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
__raw_spin_unlock(&cpu_buffer->lock);
spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+ atomic_dec(&cpu_buffer->record_disabled);
}
EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
@@ -2475,12 +2795,25 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset);
int ring_buffer_empty(struct ring_buffer *buffer)
{
struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long flags;
+ int dolock;
int cpu;
+ int ret;
+
+ dolock = rb_ok_to_lock();
/* yes this is racy, but if you don't like the race, lock the buffer */
for_each_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
- if (!rb_per_cpu_empty(cpu_buffer))
+ local_irq_save(flags);
+ if (dolock)
+ spin_lock(&cpu_buffer->reader_lock);
+ ret = rb_per_cpu_empty(cpu_buffer);
+ if (dolock)
+ spin_unlock(&cpu_buffer->reader_lock);
+ local_irq_restore(flags);
+
+ if (!ret)
return 0;
}
@@ -2496,14 +2829,23 @@ EXPORT_SYMBOL_GPL(ring_buffer_empty);
int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long flags;
+ int dolock;
int ret;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return 1;
+ dolock = rb_ok_to_lock();
+
cpu_buffer = buffer->buffers[cpu];
+ local_irq_save(flags);
+ if (dolock)
+ spin_lock(&cpu_buffer->reader_lock);
ret = rb_per_cpu_empty(cpu_buffer);
-
+ if (dolock)
+ spin_unlock(&cpu_buffer->reader_lock);
+ local_irq_restore(flags);
return ret;
}
@@ -2578,28 +2920,6 @@ out:
}
EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
-static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer,
- struct buffer_data_page *bpage,
- unsigned int offset)
-{
- struct ring_buffer_event *event;
- unsigned long head;
-
- __raw_spin_lock(&cpu_buffer->lock);
- for (head = offset; head < local_read(&bpage->commit);
- head += rb_event_length(event)) {
-
- event = __rb_data_page_index(bpage, head);
- if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
- return;
- /* Only count data entries */
- if (event->type != RINGBUF_TYPE_DATA)
- continue;
- cpu_buffer->entries--;
- }
- __raw_spin_unlock(&cpu_buffer->lock);
-}
-
/**
* ring_buffer_alloc_read_page - allocate a page to read from buffer
* @buffer: the buffer to allocate for.
@@ -2630,6 +2950,7 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer)
return bpage;
}
+EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page);
/**
* ring_buffer_free_read_page - free an allocated read page
@@ -2642,6 +2963,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
{
free_page((unsigned long)data);
}
+EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
/**
* ring_buffer_read_page - extract a page from the ring buffer
@@ -2768,16 +3090,17 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
/* we copied everything to the beginning */
read = 0;
} else {
+ /* update the entry counter */
+ cpu_buffer->read += local_read(&reader->entries);
+
/* swap the pages */
rb_init_page(bpage);
bpage = reader->page;
reader->page = *data_page;
local_set(&reader->write, 0);
+ local_set(&reader->entries, 0);
reader->read = 0;
*data_page = bpage;
-
- /* update the entry counter */
- rb_remove_entries(cpu_buffer, bpage, read);
}
ret = read;
@@ -2787,7 +3110,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
out:
return ret;
}
+EXPORT_SYMBOL_GPL(ring_buffer_read_page);
+#ifdef CONFIG_TRACING
static ssize_t
rb_simple_read(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
@@ -2845,19 +3170,17 @@ static const struct file_operations rb_simple_fops = {
static __init int rb_init_debugfs(void)
{
struct dentry *d_tracer;
- struct dentry *entry;
d_tracer = tracing_init_dentry();
- entry = debugfs_create_file("tracing_on", 0644, d_tracer,
- &ring_buffer_flags, &rb_simple_fops);
- if (!entry)
- pr_warning("Could not create debugfs 'tracing_on' entry\n");
+ trace_create_file("tracing_on", 0644, d_tracer,
+ &ring_buffer_flags, &rb_simple_fops);
return 0;
}
fs_initcall(rb_init_debugfs);
+#endif
#ifdef CONFIG_HOTPLUG_CPU
static int rb_cpu_notify(struct notifier_block *self,
@@ -2870,7 +3193,7 @@ static int rb_cpu_notify(struct notifier_block *self,
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
- if (cpu_isset(cpu, *buffer->cpumask))
+ if (cpumask_test_cpu(cpu, buffer->cpumask))
return NOTIFY_OK;
buffer->buffers[cpu] =
@@ -2881,7 +3204,7 @@ static int rb_cpu_notify(struct notifier_block *self,
return NOTIFY_OK;
}
smp_wmb();
- cpu_set(cpu, *buffer->cpumask);
+ cpumask_set_cpu(cpu, buffer->cpumask);
break;
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
new file mode 100644
index 00000000000..573d3cc762c
--- /dev/null
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -0,0 +1,419 @@
+/*
+ * ring buffer tester and benchmark
+ *
+ * Copyright (C) 2009 Steven Rostedt <srostedt@redhat.com>
+ */
+#include <linux/ring_buffer.h>
+#include <linux/completion.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/time.h>
+
+struct rb_page {
+ u64 ts;
+ local_t commit;
+ char data[4080];
+};
+
+/* run time and sleep time in seconds */
+#define RUN_TIME 10
+#define SLEEP_TIME 10
+
+/* number of events for writer to wake up the reader */
+static int wakeup_interval = 100;
+
+static int reader_finish;
+static struct completion read_start;
+static struct completion read_done;
+
+static struct ring_buffer *buffer;
+static struct task_struct *producer;
+static struct task_struct *consumer;
+static unsigned long read;
+
+static int disable_reader;
+module_param(disable_reader, uint, 0644);
+MODULE_PARM_DESC(disable_reader, "only run producer");
+
+static int read_events;
+
+static int kill_test;
+
+#define KILL_TEST() \
+ do { \
+ if (!kill_test) { \
+ kill_test = 1; \
+ WARN_ON(1); \
+ } \
+ } while (0)
+
+enum event_status {
+ EVENT_FOUND,
+ EVENT_DROPPED,
+};
+
+static enum event_status read_event(int cpu)
+{
+ struct ring_buffer_event *event;
+ int *entry;
+ u64 ts;
+
+ event = ring_buffer_consume(buffer, cpu, &ts);
+ if (!event)
+ return EVENT_DROPPED;
+
+ entry = ring_buffer_event_data(event);
+ if (*entry != cpu) {
+ KILL_TEST();
+ return EVENT_DROPPED;
+ }
+
+ read++;
+ return EVENT_FOUND;
+}
+
+static enum event_status read_page(int cpu)
+{
+ struct ring_buffer_event *event;
+ struct rb_page *rpage;
+ unsigned long commit;
+ void *bpage;
+ int *entry;
+ int ret;
+ int inc;
+ int i;
+
+ bpage = ring_buffer_alloc_read_page(buffer);
+ if (!bpage)
+ return EVENT_DROPPED;
+
+ ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
+ if (ret >= 0) {
+ rpage = bpage;
+ commit = local_read(&rpage->commit);
+ for (i = 0; i < commit && !kill_test; i += inc) {
+
+ if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) {
+ KILL_TEST();
+ break;
+ }
+
+ inc = -1;
+ event = (void *)&rpage->data[i];
+ switch (event->type_len) {
+ case RINGBUF_TYPE_PADDING:
+ /* failed writes may be discarded events */
+ if (!event->time_delta)
+ KILL_TEST();
+ inc = event->array[0] + 4;
+ break;
+ case RINGBUF_TYPE_TIME_EXTEND:
+ inc = 8;
+ break;
+ case 0:
+ entry = ring_buffer_event_data(event);
+ if (*entry != cpu) {
+ KILL_TEST();
+ break;
+ }
+ read++;
+ if (!event->array[0]) {
+ KILL_TEST();
+ break;
+ }
+ inc = event->array[0] + 4;
+ break;
+ default:
+ entry = ring_buffer_event_data(event);
+ if (*entry != cpu) {
+ KILL_TEST();
+ break;
+ }
+ read++;
+ inc = ((event->type_len + 1) * 4);
+ }
+ if (kill_test)
+ break;
+
+ if (inc <= 0) {
+ KILL_TEST();
+ break;
+ }
+ }
+ }
+ ring_buffer_free_read_page(buffer, bpage);
+
+ if (ret < 0)
+ return EVENT_DROPPED;
+ return EVENT_FOUND;
+}
+
+static void ring_buffer_consumer(void)
+{
+ /* toggle between reading pages and events */
+ read_events ^= 1;
+
+ read = 0;
+ while (!reader_finish && !kill_test) {
+ int found;
+
+ do {
+ int cpu;
+
+ found = 0;
+ for_each_online_cpu(cpu) {
+ enum event_status stat;
+
+ if (read_events)
+ stat = read_event(cpu);
+ else
+ stat = read_page(cpu);
+
+ if (kill_test)
+ break;
+ if (stat == EVENT_FOUND)
+ found = 1;
+ }
+ } while (found && !kill_test);
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (reader_finish)
+ break;
+
+ schedule();
+ __set_current_state(TASK_RUNNING);
+ }
+ reader_finish = 0;
+ complete(&read_done);
+}
+
+static void ring_buffer_producer(void)
+{
+ struct timeval start_tv;
+ struct timeval end_tv;
+ unsigned long long time;
+ unsigned long long entries;
+ unsigned long long overruns;
+ unsigned long missed = 0;
+ unsigned long hit = 0;
+ unsigned long avg;
+ int cnt = 0;
+
+ /*
+ * Hammer the buffer for 10 secs (this may
+ * make the system stall)
+ */
+ trace_printk("Starting ring buffer hammer\n");
+ do_gettimeofday(&start_tv);
+ do {
+ struct ring_buffer_event *event;
+ int *entry;
+
+ event = ring_buffer_lock_reserve(buffer, 10);
+ if (!event) {
+ missed++;
+ } else {
+ hit++;
+ entry = ring_buffer_event_data(event);
+ *entry = smp_processor_id();
+ ring_buffer_unlock_commit(buffer, event);
+ }
+ do_gettimeofday(&end_tv);
+
+ cnt++;
+ if (consumer && !(cnt % wakeup_interval))
+ wake_up_process(consumer);
+
+#ifndef CONFIG_PREEMPT
+ /*
+ * If we are a non preempt kernel, the 10 second run will
+ * stop everything while it runs. Instead, we will call
+ * cond_resched and also add any time that was lost by a
+ * rescedule.
+ *
+ * Do a cond resched at the same frequency we would wake up
+ * the reader.
+ */
+ if (cnt % wakeup_interval)
+ cond_resched();
+#endif
+
+ } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test);
+ trace_printk("End ring buffer hammer\n");
+
+ if (consumer) {
+ /* Init both completions here to avoid races */
+ init_completion(&read_start);
+ init_completion(&read_done);
+ /* the completions must be visible before the finish var */
+ smp_wmb();
+ reader_finish = 1;
+ /* finish var visible before waking up the consumer */
+ smp_wmb();
+ wake_up_process(consumer);
+ wait_for_completion(&read_done);
+ }
+
+ time = end_tv.tv_sec - start_tv.tv_sec;
+ time *= USEC_PER_SEC;
+ time += (long long)((long)end_tv.tv_usec - (long)start_tv.tv_usec);
+
+ entries = ring_buffer_entries(buffer);
+ overruns = ring_buffer_overruns(buffer);
+
+ if (kill_test)
+ trace_printk("ERROR!\n");
+ trace_printk("Time: %lld (usecs)\n", time);
+ trace_printk("Overruns: %lld\n", overruns);
+ if (disable_reader)
+ trace_printk("Read: (reader disabled)\n");
+ else
+ trace_printk("Read: %ld (by %s)\n", read,
+ read_events ? "events" : "pages");
+ trace_printk("Entries: %lld\n", entries);
+ trace_printk("Total: %lld\n", entries + overruns + read);
+ trace_printk("Missed: %ld\n", missed);
+ trace_printk("Hit: %ld\n", hit);
+
+ /* Convert time from usecs to millisecs */
+ do_div(time, USEC_PER_MSEC);
+ if (time)
+ hit /= (long)time;
+ else
+ trace_printk("TIME IS ZERO??\n");
+
+ trace_printk("Entries per millisec: %ld\n", hit);
+
+ if (hit) {
+ /* Calculate the average time in nanosecs */
+ avg = NSEC_PER_MSEC / hit;
+ trace_printk("%ld ns per entry\n", avg);
+ }
+
+ if (missed) {
+ if (time)
+ missed /= (long)time;
+
+ trace_printk("Total iterations per millisec: %ld\n",
+ hit + missed);
+
+ /* it is possible that hit + missed will overflow and be zero */
+ if (!(hit + missed)) {
+ trace_printk("hit + missed overflowed and totalled zero!\n");
+ hit--; /* make it non zero */
+ }
+
+ /* Caculate the average time in nanosecs */
+ avg = NSEC_PER_MSEC / (hit + missed);
+ trace_printk("%ld ns per entry\n", avg);
+ }
+}
+
+static void wait_to_die(void)
+{
+ set_current_state(TASK_INTERRUPTIBLE);
+ while (!kthread_should_stop()) {
+ schedule();
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+ __set_current_state(TASK_RUNNING);
+}
+
+static int ring_buffer_consumer_thread(void *arg)
+{
+ while (!kthread_should_stop() && !kill_test) {
+ complete(&read_start);
+
+ ring_buffer_consumer();
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (kthread_should_stop() || kill_test)
+ break;
+
+ schedule();
+ __set_current_state(TASK_RUNNING);
+ }
+ __set_current_state(TASK_RUNNING);
+
+ if (kill_test)
+ wait_to_die();
+
+ return 0;
+}
+
+static int ring_buffer_producer_thread(void *arg)
+{
+ init_completion(&read_start);
+
+ while (!kthread_should_stop() && !kill_test) {
+ ring_buffer_reset(buffer);
+
+ if (consumer) {
+ smp_wmb();
+ wake_up_process(consumer);
+ wait_for_completion(&read_start);
+ }
+
+ ring_buffer_producer();
+
+ trace_printk("Sleeping for 10 secs\n");
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(HZ * SLEEP_TIME);
+ __set_current_state(TASK_RUNNING);
+ }
+
+ if (kill_test)
+ wait_to_die();
+
+ return 0;
+}
+
+static int __init ring_buffer_benchmark_init(void)
+{
+ int ret;
+
+ /* make a one meg buffer in overwite mode */
+ buffer = ring_buffer_alloc(1000000, RB_FL_OVERWRITE);
+ if (!buffer)
+ return -ENOMEM;
+
+ if (!disable_reader) {
+ consumer = kthread_create(ring_buffer_consumer_thread,
+ NULL, "rb_consumer");
+ ret = PTR_ERR(consumer);
+ if (IS_ERR(consumer))
+ goto out_fail;
+ }
+
+ producer = kthread_run(ring_buffer_producer_thread,
+ NULL, "rb_producer");
+ ret = PTR_ERR(producer);
+
+ if (IS_ERR(producer))
+ goto out_kill;
+
+ return 0;
+
+ out_kill:
+ if (consumer)
+ kthread_stop(consumer);
+
+ out_fail:
+ ring_buffer_free(buffer);
+ return ret;
+}
+
+static void __exit ring_buffer_benchmark_exit(void)
+{
+ kthread_stop(producer);
+ if (consumer)
+ kthread_stop(consumer);
+ ring_buffer_free(buffer);
+}
+
+module_init(ring_buffer_benchmark_init);
+module_exit(ring_buffer_benchmark_exit);
+
+MODULE_AUTHOR("Steven Rostedt");
+MODULE_DESCRIPTION("ring_buffer_benchmark");
+MODULE_LICENSE("GPL");
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index cda81ec58d9..c22b40f8f57 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -17,6 +17,7 @@
#include <linux/writeback.h>
#include <linux/kallsyms.h>
#include <linux/seq_file.h>
+#include <linux/smp_lock.h>
#include <linux/notifier.h>
#include <linux/irqflags.h>
#include <linux/debugfs.h>
@@ -171,6 +172,13 @@ static struct trace_array global_trace;
static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu);
+int filter_current_check_discard(struct ftrace_event_call *call, void *rec,
+ struct ring_buffer_event *event)
+{
+ return filter_check_discard(call, rec, global_trace.buffer, event);
+}
+EXPORT_SYMBOL_GPL(filter_current_check_discard);
+
cycle_t ftrace_now(int cpu)
{
u64 ts;
@@ -255,7 +263,8 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
/* trace_flags holds trace_options default values */
unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
- TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME;
+ TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
+ TRACE_ITER_GRAPH_TIME;
/**
* trace_wake_up - wake up tasks waiting for trace input
@@ -276,13 +285,12 @@ void trace_wake_up(void)
static int __init set_buf_size(char *str)
{
unsigned long buf_size;
- int ret;
if (!str)
return 0;
- ret = strict_strtoul(str, 0, &buf_size);
+ buf_size = memparse(str, &str);
/* nr_entries can not be zero */
- if (ret < 0 || buf_size == 0)
+ if (buf_size == 0)
return 0;
trace_buf_size = buf_size;
return 1;
@@ -317,6 +325,7 @@ static const char *trace_options[] = {
"latency-format",
"global-clock",
"sleep-time",
+ "graph-time",
NULL
};
@@ -335,7 +344,7 @@ static raw_spinlock_t ftrace_max_lock =
/*
* Copy the new maximum trace into the separate maximum-trace
* structure. (this way the maximum trace is permanently saved,
- * for later retrieval via /debugfs/tracing/latency_trace)
+ * for later retrieval via /sys/kernel/debug/tracing/latency_trace)
*/
static void
__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
@@ -402,17 +411,6 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
return cnt;
}
-static void
-trace_print_seq(struct seq_file *m, struct trace_seq *s)
-{
- int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
-
- s->buffer[len] = 0;
- seq_puts(m, s->buffer);
-
- trace_seq_init(s);
-}
-
/**
* update_max_tr - snapshot all trace buffers from global_trace to max_tr
* @tr: tracer
@@ -641,6 +639,16 @@ void tracing_reset_online_cpus(struct trace_array *tr)
tracing_reset(tr, cpu);
}
+void tracing_reset_current(int cpu)
+{
+ tracing_reset(&global_trace, cpu);
+}
+
+void tracing_reset_current_online_cpus(void)
+{
+ tracing_reset_online_cpus(&global_trace);
+}
+
#define SAVED_CMDLINES 128
#define NO_CMDLINE_MAP UINT_MAX
static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
@@ -800,6 +808,7 @@ void trace_find_cmdline(int pid, char comm[])
return;
}
+ preempt_disable();
__raw_spin_lock(&trace_cmdline_lock);
map = map_pid_to_cmdline[pid];
if (map != NO_CMDLINE_MAP)
@@ -808,6 +817,7 @@ void trace_find_cmdline(int pid, char comm[])
strcpy(comm, "<...>");
__raw_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
}
void tracing_record_cmdline(struct task_struct *tsk)
@@ -838,9 +848,10 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
(need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
}
+EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
- unsigned char type,
+ int type,
unsigned long len,
unsigned long flags, int pc)
{
@@ -883,30 +894,40 @@ void trace_buffer_unlock_commit(struct trace_array *tr,
}
struct ring_buffer_event *
-trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
+trace_current_buffer_lock_reserve(int type, unsigned long len,
unsigned long flags, int pc)
{
return trace_buffer_lock_reserve(&global_trace,
type, len, flags, pc);
}
+EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve);
void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
unsigned long flags, int pc)
{
- return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1);
+ __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1);
}
+EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit);
void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
unsigned long flags, int pc)
{
- return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0);
+ __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0);
+}
+EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit);
+
+void trace_current_buffer_discard_commit(struct ring_buffer_event *event)
+{
+ ring_buffer_discard_commit(global_trace.buffer, event);
}
+EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit);
void
trace_function(struct trace_array *tr,
unsigned long ip, unsigned long parent_ip, unsigned long flags,
int pc)
{
+ struct ftrace_event_call *call = &event_function;
struct ring_buffer_event *event;
struct ftrace_entry *entry;
@@ -921,7 +942,9 @@ trace_function(struct trace_array *tr,
entry = ring_buffer_event_data(event);
entry->ip = ip;
entry->parent_ip = parent_ip;
- ring_buffer_unlock_commit(tr->buffer, event);
+
+ if (!filter_check_discard(call, entry, tr->buffer, event))
+ ring_buffer_unlock_commit(tr->buffer, event);
}
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -930,6 +953,7 @@ static int __trace_graph_entry(struct trace_array *tr,
unsigned long flags,
int pc)
{
+ struct ftrace_event_call *call = &event_funcgraph_entry;
struct ring_buffer_event *event;
struct ftrace_graph_ent_entry *entry;
@@ -942,7 +966,8 @@ static int __trace_graph_entry(struct trace_array *tr,
return 0;
entry = ring_buffer_event_data(event);
entry->graph_ent = *trace;
- ring_buffer_unlock_commit(global_trace.buffer, event);
+ if (!filter_current_check_discard(call, entry, event))
+ ring_buffer_unlock_commit(global_trace.buffer, event);
return 1;
}
@@ -952,6 +977,7 @@ static void __trace_graph_return(struct trace_array *tr,
unsigned long flags,
int pc)
{
+ struct ftrace_event_call *call = &event_funcgraph_exit;
struct ring_buffer_event *event;
struct ftrace_graph_ret_entry *entry;
@@ -964,7 +990,8 @@ static void __trace_graph_return(struct trace_array *tr,
return;
entry = ring_buffer_event_data(event);
entry->ret = *trace;
- ring_buffer_unlock_commit(global_trace.buffer, event);
+ if (!filter_current_check_discard(call, entry, event))
+ ring_buffer_unlock_commit(global_trace.buffer, event);
}
#endif
@@ -982,6 +1009,7 @@ static void __ftrace_trace_stack(struct trace_array *tr,
int skip, int pc)
{
#ifdef CONFIG_STACKTRACE
+ struct ftrace_event_call *call = &event_kernel_stack;
struct ring_buffer_event *event;
struct stack_entry *entry;
struct stack_trace trace;
@@ -999,7 +1027,8 @@ static void __ftrace_trace_stack(struct trace_array *tr,
trace.entries = entry->caller;
save_stack_trace(&trace);
- ring_buffer_unlock_commit(tr->buffer, event);
+ if (!filter_check_discard(call, entry, tr->buffer, event))
+ ring_buffer_unlock_commit(tr->buffer, event);
#endif
}
@@ -1024,6 +1053,7 @@ static void ftrace_trace_userstack(struct trace_array *tr,
unsigned long flags, int pc)
{
#ifdef CONFIG_STACKTRACE
+ struct ftrace_event_call *call = &event_user_stack;
struct ring_buffer_event *event;
struct userstack_entry *entry;
struct stack_trace trace;
@@ -1045,7 +1075,8 @@ static void ftrace_trace_userstack(struct trace_array *tr,
trace.entries = entry->caller;
save_stack_trace_user(&trace);
- ring_buffer_unlock_commit(tr->buffer, event);
+ if (!filter_check_discard(call, entry, tr->buffer, event))
+ ring_buffer_unlock_commit(tr->buffer, event);
#endif
}
@@ -1089,6 +1120,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
struct task_struct *next,
unsigned long flags, int pc)
{
+ struct ftrace_event_call *call = &event_context_switch;
struct ring_buffer_event *event;
struct ctx_switch_entry *entry;
@@ -1104,7 +1136,9 @@ tracing_sched_switch_trace(struct trace_array *tr,
entry->next_prio = next->prio;
entry->next_state = next->state;
entry->next_cpu = task_cpu(next);
- trace_buffer_unlock_commit(tr, event, flags, pc);
+
+ if (!filter_check_discard(call, entry, tr->buffer, event))
+ trace_buffer_unlock_commit(tr, event, flags, pc);
}
void
@@ -1113,6 +1147,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
struct task_struct *curr,
unsigned long flags, int pc)
{
+ struct ftrace_event_call *call = &event_wakeup;
struct ring_buffer_event *event;
struct ctx_switch_entry *entry;
@@ -1129,7 +1164,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
entry->next_state = wakee->state;
entry->next_cpu = task_cpu(wakee);
- ring_buffer_unlock_commit(tr->buffer, event);
+ if (!filter_check_discard(call, entry, tr->buffer, event))
+ ring_buffer_unlock_commit(tr->buffer, event);
ftrace_trace_stack(tr, flags, 6, pc);
ftrace_trace_userstack(tr, flags, pc);
}
@@ -1230,11 +1266,13 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
static u32 trace_buf[TRACE_BUF_SIZE];
+ struct ftrace_event_call *call = &event_bprint;
struct ring_buffer_event *event;
struct trace_array *tr = &global_trace;
struct trace_array_cpu *data;
struct bprint_entry *entry;
unsigned long flags;
+ int disable;
int resched;
int cpu, len = 0, size, pc;
@@ -1249,7 +1287,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
cpu = raw_smp_processor_id();
data = tr->data[cpu];
- if (unlikely(atomic_read(&data->disabled)))
+ disable = atomic_inc_return(&data->disabled);
+ if (unlikely(disable != 1))
goto out;
/* Lockdep uses trace_printk for lock tracing */
@@ -1269,13 +1308,15 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
entry->fmt = fmt;
memcpy(entry->buf, trace_buf, sizeof(u32) * len);
- ring_buffer_unlock_commit(tr->buffer, event);
+ if (!filter_check_discard(call, entry, tr->buffer, event))
+ ring_buffer_unlock_commit(tr->buffer, event);
out_unlock:
__raw_spin_unlock(&trace_buf_lock);
local_irq_restore(flags);
out:
+ atomic_dec_return(&data->disabled);
ftrace_preempt_enable(resched);
unpause_graph_tracing();
@@ -1288,12 +1329,14 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED;
static char trace_buf[TRACE_BUF_SIZE];
+ struct ftrace_event_call *call = &event_print;
struct ring_buffer_event *event;
struct trace_array *tr = &global_trace;
struct trace_array_cpu *data;
int cpu, len = 0, size, pc;
struct print_entry *entry;
unsigned long irq_flags;
+ int disable;
if (tracing_disabled || tracing_selftest_running)
return 0;
@@ -1303,7 +1346,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
cpu = raw_smp_processor_id();
data = tr->data[cpu];
- if (unlikely(atomic_read(&data->disabled)))
+ disable = atomic_inc_return(&data->disabled);
+ if (unlikely(disable != 1))
goto out;
pause_graph_tracing();
@@ -1323,13 +1367,15 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
memcpy(&entry->buf, trace_buf, len);
entry->buf[len] = 0;
- ring_buffer_unlock_commit(tr->buffer, event);
+ if (!filter_check_discard(call, entry, tr->buffer, event))
+ ring_buffer_unlock_commit(tr->buffer, event);
out_unlock:
__raw_spin_unlock(&trace_buf_lock);
raw_local_irq_restore(irq_flags);
unpause_graph_tracing();
out:
+ atomic_dec_return(&data->disabled);
preempt_enable_notrace();
return len;
@@ -1526,12 +1572,14 @@ static void *s_start(struct seq_file *m, loff_t *pos)
p = s_next(m, p, &l);
}
+ trace_event_read_lock();
return p;
}
static void s_stop(struct seq_file *m, void *p)
{
atomic_dec(&trace_record_cmdline_disabled);
+ trace_event_read_unlock();
}
static void print_lat_help_header(struct seq_file *m)
@@ -1774,6 +1822,7 @@ static int trace_empty(struct trace_iterator *iter)
return 1;
}
+/* Called with trace_event_read_lock() held. */
static enum print_line_t print_trace_line(struct trace_iterator *iter)
{
enum print_line_t ret;
@@ -1983,7 +2032,7 @@ static int tracing_open(struct inode *inode, struct file *file)
/* If this file was open for write, then erase contents */
if ((file->f_mode & FMODE_WRITE) &&
- !(file->f_flags & O_APPEND)) {
+ (file->f_flags & O_TRUNC)) {
long cpu = (long) inode->i_private;
if (cpu == TRACE_PIPE_ALL_CPU)
@@ -2005,25 +2054,23 @@ static int tracing_open(struct inode *inode, struct file *file)
static void *
t_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct tracer *t = m->private;
+ struct tracer *t = v;
(*pos)++;
if (t)
t = t->next;
- m->private = t;
-
return t;
}
static void *t_start(struct seq_file *m, loff_t *pos)
{
- struct tracer *t = m->private;
+ struct tracer *t;
loff_t l = 0;
mutex_lock(&trace_types_lock);
- for (; t && l < *pos; t = t_next(m, t, &l))
+ for (t = trace_types; t && l < *pos; t = t_next(m, t, &l))
;
return t;
@@ -2059,18 +2106,10 @@ static struct seq_operations show_traces_seq_ops = {
static int show_traces_open(struct inode *inode, struct file *file)
{
- int ret;
-
if (tracing_disabled)
return -ENODEV;
- ret = seq_open(file, &show_traces_seq_ops);
- if (!ret) {
- struct seq_file *m = file->private_data;
- m->private = trace_types;
- }
-
- return ret;
+ return seq_open(file, &show_traces_seq_ops);
}
static ssize_t
@@ -2143,11 +2182,12 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
return -ENOMEM;
- mutex_lock(&tracing_cpumask_update_lock);
err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
if (err)
goto err_unlock;
+ mutex_lock(&tracing_cpumask_update_lock);
+
local_irq_disable();
__raw_spin_lock(&ftrace_max_lock);
for_each_tracing_cpu(cpu) {
@@ -2175,8 +2215,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
return count;
err_unlock:
- mutex_unlock(&tracing_cpumask_update_lock);
- free_cpumask_var(tracing_cpumask);
+ free_cpumask_var(tracing_cpumask_new);
return err;
}
@@ -2366,21 +2405,20 @@ static const struct file_operations tracing_iter_fops = {
static const char readme_msg[] =
"tracing mini-HOWTO:\n\n"
- "# mkdir /debug\n"
- "# mount -t debugfs nodev /debug\n\n"
- "# cat /debug/tracing/available_tracers\n"
+ "# mount -t debugfs nodev /sys/kernel/debug\n\n"
+ "# cat /sys/kernel/debug/tracing/available_tracers\n"
"wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n"
- "# cat /debug/tracing/current_tracer\n"
+ "# cat /sys/kernel/debug/tracing/current_tracer\n"
"nop\n"
- "# echo sched_switch > /debug/tracing/current_tracer\n"
- "# cat /debug/tracing/current_tracer\n"
+ "# echo sched_switch > /sys/kernel/debug/tracing/current_tracer\n"
+ "# cat /sys/kernel/debug/tracing/current_tracer\n"
"sched_switch\n"
- "# cat /debug/tracing/trace_options\n"
+ "# cat /sys/kernel/debug/tracing/trace_options\n"
"noprint-parent nosym-offset nosym-addr noverbose\n"
- "# echo print-parent > /debug/tracing/trace_options\n"
- "# echo 1 > /debug/tracing/tracing_enabled\n"
- "# cat /debug/tracing/trace > /tmp/trace.txt\n"
- "# echo 0 > /debug/tracing/tracing_enabled\n"
+ "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n"
+ "# echo 1 > /sys/kernel/debug/tracing/tracing_enabled\n"
+ "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n"
+ "# echo 0 > /sys/kernel/debug/tracing/tracing_enabled\n"
;
static ssize_t
@@ -2397,6 +2435,56 @@ static const struct file_operations tracing_readme_fops = {
};
static ssize_t
+tracing_saved_cmdlines_read(struct file *file, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char *buf_comm;
+ char *file_buf;
+ char *buf;
+ int len = 0;
+ int pid;
+ int i;
+
+ file_buf = kmalloc(SAVED_CMDLINES*(16+TASK_COMM_LEN), GFP_KERNEL);
+ if (!file_buf)
+ return -ENOMEM;
+
+ buf_comm = kmalloc(TASK_COMM_LEN, GFP_KERNEL);
+ if (!buf_comm) {
+ kfree(file_buf);
+ return -ENOMEM;
+ }
+
+ buf = file_buf;
+
+ for (i = 0; i < SAVED_CMDLINES; i++) {
+ int r;
+
+ pid = map_cmdline_to_pid[i];
+ if (pid == -1 || pid == NO_CMDLINE_MAP)
+ continue;
+
+ trace_find_cmdline(pid, buf_comm);
+ r = sprintf(buf, "%d %s\n", pid, buf_comm);
+ buf += r;
+ len += r;
+ }
+
+ len = simple_read_from_buffer(ubuf, cnt, ppos,
+ file_buf, len);
+
+ kfree(file_buf);
+ kfree(buf_comm);
+
+ return len;
+}
+
+static const struct file_operations tracing_saved_cmdlines_fops = {
+ .open = tracing_open_generic,
+ .read = tracing_saved_cmdlines_read,
+};
+
+static ssize_t
tracing_ctrl_read(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
@@ -2728,6 +2816,9 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
/* trace pipe does not show start of buffer */
cpumask_setall(iter->started);
+ if (trace_flags & TRACE_ITER_LATENCY_FMT)
+ iter->iter_flags |= TRACE_FILE_LAT_FMT;
+
iter->cpu_file = cpu_file;
iter->tr = &global_trace;
mutex_init(&iter->mutex);
@@ -2915,6 +3006,7 @@ waitagain:
offsetof(struct trace_iterator, seq));
iter->pos = -1;
+ trace_event_read_lock();
while (find_next_entry_inc(iter) != NULL) {
enum print_line_t ret;
int len = iter->seq.len;
@@ -2931,6 +3023,7 @@ waitagain:
if (iter->seq.len >= cnt)
break;
}
+ trace_event_read_unlock();
/* Now copy what we have to the user */
sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
@@ -2993,7 +3086,8 @@ tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
break;
}
- trace_consume(iter);
+ if (ret != TRACE_TYPE_NO_CONSUME)
+ trace_consume(iter);
rem -= count;
if (!find_next_entry_inc(iter)) {
rem = 0;
@@ -3053,6 +3147,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
goto out_err;
}
+ trace_event_read_lock();
+
/* Fill as many pages as possible. */
for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {
pages[i] = alloc_page(GFP_KERNEL);
@@ -3075,6 +3171,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
trace_seq_init(&iter->seq);
}
+ trace_event_read_unlock();
mutex_unlock(&iter->mutex);
spd.nr_pages = i;
@@ -3425,7 +3522,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
.spd_release = buffer_spd_release,
};
struct buffer_ref *ref;
- int size, i;
+ int entries, size, i;
size_t ret;
if (*ppos & (PAGE_SIZE - 1)) {
@@ -3440,7 +3537,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
len &= PAGE_MASK;
}
- for (i = 0; i < PIPE_BUFFERS && len; i++, len -= PAGE_SIZE) {
+ entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
+
+ for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) {
struct page *page;
int r;
@@ -3457,7 +3556,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
}
r = ring_buffer_read_page(ref->buffer, &ref->page,
- len, info->cpu, 0);
+ len, info->cpu, 1);
if (r < 0) {
ring_buffer_free_read_page(ref->buffer,
ref->page);
@@ -3481,6 +3580,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
spd.partial[i].private = (unsigned long)ref;
spd.nr_pages++;
*ppos += PAGE_SIZE;
+
+ entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
}
spd.nr_pages = i;
@@ -3508,6 +3609,45 @@ static const struct file_operations tracing_buffers_fops = {
.llseek = no_llseek,
};
+static ssize_t
+tracing_stats_read(struct file *filp, char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ unsigned long cpu = (unsigned long)filp->private_data;
+ struct trace_array *tr = &global_trace;
+ struct trace_seq *s;
+ unsigned long cnt;
+
+ s = kmalloc(sizeof(*s), GFP_KERNEL);
+ if (!s)
+ return ENOMEM;
+
+ trace_seq_init(s);
+
+ cnt = ring_buffer_entries_cpu(tr->buffer, cpu);
+ trace_seq_printf(s, "entries: %ld\n", cnt);
+
+ cnt = ring_buffer_overrun_cpu(tr->buffer, cpu);
+ trace_seq_printf(s, "overrun: %ld\n", cnt);
+
+ cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu);
+ trace_seq_printf(s, "commit overrun: %ld\n", cnt);
+
+ cnt = ring_buffer_nmi_dropped_cpu(tr->buffer, cpu);
+ trace_seq_printf(s, "nmi dropped: %ld\n", cnt);
+
+ count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
+
+ kfree(s);
+
+ return count;
+}
+
+static const struct file_operations tracing_stats_fops = {
+ .open = tracing_open_generic,
+ .read = tracing_stats_read,
+};
+
#ifdef CONFIG_DYNAMIC_FTRACE
int __weak ftrace_arch_read_dyn_info(char *buf, int size)
@@ -3597,7 +3737,7 @@ struct dentry *tracing_dentry_percpu(void)
static void tracing_init_debugfs_percpu(long cpu)
{
struct dentry *d_percpu = tracing_dentry_percpu();
- struct dentry *entry, *d_cpu;
+ struct dentry *d_cpu;
/* strlen(cpu) + MAX(log10(cpu)) + '\0' */
char cpu_dir[7];
@@ -3612,21 +3752,18 @@ static void tracing_init_debugfs_percpu(long cpu)
}
/* per cpu trace_pipe */
- entry = debugfs_create_file("trace_pipe", 0444, d_cpu,
- (void *) cpu, &tracing_pipe_fops);
- if (!entry)
- pr_warning("Could not create debugfs 'trace_pipe' entry\n");
+ trace_create_file("trace_pipe", 0444, d_cpu,
+ (void *) cpu, &tracing_pipe_fops);
/* per cpu trace */
- entry = debugfs_create_file("trace", 0644, d_cpu,
- (void *) cpu, &tracing_fops);
- if (!entry)
- pr_warning("Could not create debugfs 'trace' entry\n");
+ trace_create_file("trace", 0644, d_cpu,
+ (void *) cpu, &tracing_fops);
- entry = debugfs_create_file("trace_pipe_raw", 0444, d_cpu,
- (void *) cpu, &tracing_buffers_fops);
- if (!entry)
- pr_warning("Could not create debugfs 'trace_pipe_raw' entry\n");
+ trace_create_file("trace_pipe_raw", 0444, d_cpu,
+ (void *) cpu, &tracing_buffers_fops);
+
+ trace_create_file("stats", 0444, d_cpu,
+ (void *) cpu, &tracing_stats_fops);
}
#ifdef CONFIG_FTRACE_SELFTEST
@@ -3782,6 +3919,22 @@ static const struct file_operations trace_options_core_fops = {
.write = trace_options_core_write,
};
+struct dentry *trace_create_file(const char *name,
+ mode_t mode,
+ struct dentry *parent,
+ void *data,
+ const struct file_operations *fops)
+{
+ struct dentry *ret;
+
+ ret = debugfs_create_file(name, mode, parent, data, fops);
+ if (!ret)
+ pr_warning("Could not create debugfs '%s' entry\n", name);
+
+ return ret;
+}
+
+
static struct dentry *trace_options_init_dentry(void)
{
struct dentry *d_tracer;
@@ -3809,7 +3962,6 @@ create_trace_option_file(struct trace_option_dentry *topt,
struct tracer_opt *opt)
{
struct dentry *t_options;
- struct dentry *entry;
t_options = trace_options_init_dentry();
if (!t_options)
@@ -3818,11 +3970,9 @@ create_trace_option_file(struct trace_option_dentry *topt,
topt->flags = flags;
topt->opt = opt;
- entry = debugfs_create_file(opt->name, 0644, t_options, topt,
+ topt->entry = trace_create_file(opt->name, 0644, t_options, topt,
&trace_options_fops);
- topt->entry = entry;
-
}
static struct trace_option_dentry *
@@ -3877,123 +4027,84 @@ static struct dentry *
create_trace_option_core_file(const char *option, long index)
{
struct dentry *t_options;
- struct dentry *entry;
t_options = trace_options_init_dentry();
if (!t_options)
return NULL;
- entry = debugfs_create_file(option, 0644, t_options, (void *)index,
+ return trace_create_file(option, 0644, t_options, (void *)index,
&trace_options_core_fops);
-
- return entry;
}
static __init void create_trace_options_dir(void)
{
struct dentry *t_options;
- struct dentry *entry;
int i;
t_options = trace_options_init_dentry();
if (!t_options)
return;
- for (i = 0; trace_options[i]; i++) {
- entry = create_trace_option_core_file(trace_options[i], i);
- if (!entry)
- pr_warning("Could not create debugfs %s entry\n",
- trace_options[i]);
- }
+ for (i = 0; trace_options[i]; i++)
+ create_trace_option_core_file(trace_options[i], i);
}
static __init int tracer_init_debugfs(void)
{
struct dentry *d_tracer;
- struct dentry *entry;
int cpu;
d_tracer = tracing_init_dentry();
- entry = debugfs_create_file("tracing_enabled", 0644, d_tracer,
- &global_trace, &tracing_ctrl_fops);
- if (!entry)
- pr_warning("Could not create debugfs 'tracing_enabled' entry\n");
+ trace_create_file("tracing_enabled", 0644, d_tracer,
+ &global_trace, &tracing_ctrl_fops);
- entry = debugfs_create_file("trace_options", 0644, d_tracer,
- NULL, &tracing_iter_fops);
- if (!entry)
- pr_warning("Could not create debugfs 'trace_options' entry\n");
+ trace_create_file("trace_options", 0644, d_tracer,
+ NULL, &tracing_iter_fops);
- create_trace_options_dir();
+ trace_create_file("tracing_cpumask", 0644, d_tracer,
+ NULL, &tracing_cpumask_fops);
- entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer,
- NULL, &tracing_cpumask_fops);
- if (!entry)
- pr_warning("Could not create debugfs 'tracing_cpumask' entry\n");
-
- entry = debugfs_create_file("trace", 0644, d_tracer,
- (void *) TRACE_PIPE_ALL_CPU, &tracing_fops);
- if (!entry)
- pr_warning("Could not create debugfs 'trace' entry\n");
-
- entry = debugfs_create_file("available_tracers", 0444, d_tracer,
- &global_trace, &show_traces_fops);
- if (!entry)
- pr_warning("Could not create debugfs 'available_tracers' entry\n");
-
- entry = debugfs_create_file("current_tracer", 0444, d_tracer,
- &global_trace, &set_tracer_fops);
- if (!entry)
- pr_warning("Could not create debugfs 'current_tracer' entry\n");
-
- entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer,
- &tracing_max_latency,
- &tracing_max_lat_fops);
- if (!entry)
- pr_warning("Could not create debugfs "
- "'tracing_max_latency' entry\n");
-
- entry = debugfs_create_file("tracing_thresh", 0644, d_tracer,
- &tracing_thresh, &tracing_max_lat_fops);
- if (!entry)
- pr_warning("Could not create debugfs "
- "'tracing_thresh' entry\n");
- entry = debugfs_create_file("README", 0644, d_tracer,
- NULL, &tracing_readme_fops);
- if (!entry)
- pr_warning("Could not create debugfs 'README' entry\n");
-
- entry = debugfs_create_file("trace_pipe", 0444, d_tracer,
+ trace_create_file("trace", 0644, d_tracer,
+ (void *) TRACE_PIPE_ALL_CPU, &tracing_fops);
+
+ trace_create_file("available_tracers", 0444, d_tracer,
+ &global_trace, &show_traces_fops);
+
+ trace_create_file("current_tracer", 0644, d_tracer,
+ &global_trace, &set_tracer_fops);
+
+ trace_create_file("tracing_max_latency", 0644, d_tracer,
+ &tracing_max_latency, &tracing_max_lat_fops);
+
+ trace_create_file("tracing_thresh", 0644, d_tracer,
+ &tracing_thresh, &tracing_max_lat_fops);
+
+ trace_create_file("README", 0444, d_tracer,
+ NULL, &tracing_readme_fops);
+
+ trace_create_file("trace_pipe", 0444, d_tracer,
(void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops);
- if (!entry)
- pr_warning("Could not create debugfs "
- "'trace_pipe' entry\n");
-
- entry = debugfs_create_file("buffer_size_kb", 0644, d_tracer,
- &global_trace, &tracing_entries_fops);
- if (!entry)
- pr_warning("Could not create debugfs "
- "'buffer_size_kb' entry\n");
-
- entry = debugfs_create_file("trace_marker", 0220, d_tracer,
- NULL, &tracing_mark_fops);
- if (!entry)
- pr_warning("Could not create debugfs "
- "'trace_marker' entry\n");
+
+ trace_create_file("buffer_size_kb", 0644, d_tracer,
+ &global_trace, &tracing_entries_fops);
+
+ trace_create_file("trace_marker", 0220, d_tracer,
+ NULL, &tracing_mark_fops);
+
+ trace_create_file("saved_cmdlines", 0444, d_tracer,
+ NULL, &tracing_saved_cmdlines_fops);
#ifdef CONFIG_DYNAMIC_FTRACE
- entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer,
- &ftrace_update_tot_cnt,
- &tracing_dyn_info_fops);
- if (!entry)
- pr_warning("Could not create debugfs "
- "'dyn_ftrace_total_info' entry\n");
+ trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
+ &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
#endif
#ifdef CONFIG_SYSPROF_TRACER
init_tracer_sysprof_debugfs(d_tracer);
#endif
+ create_trace_options_dir();
+
for_each_tracing_cpu(cpu)
tracing_init_debugfs_percpu(cpu);
@@ -4064,7 +4175,8 @@ trace_printk_seq(struct trace_seq *s)
static void __ftrace_dump(bool disable_tracing)
{
- static DEFINE_SPINLOCK(ftrace_dump_lock);
+ static raw_spinlock_t ftrace_dump_lock =
+ (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
/* use static because iter can be a bit big for the stack */
static struct trace_iterator iter;
unsigned int old_userobj;
@@ -4073,7 +4185,8 @@ static void __ftrace_dump(bool disable_tracing)
int cnt = 0, cpu;
/* only one dump */
- spin_lock_irqsave(&ftrace_dump_lock, flags);
+ local_irq_save(flags);
+ __raw_spin_lock(&ftrace_dump_lock);
if (dump_ran)
goto out;
@@ -4122,8 +4235,11 @@ static void __ftrace_dump(bool disable_tracing)
iter.pos = -1;
if (find_next_entry_inc(&iter) != NULL) {
- print_trace_line(&iter);
- trace_consume(&iter);
+ int ret;
+
+ ret = print_trace_line(&iter);
+ if (ret != TRACE_TYPE_NO_CONSUME)
+ trace_consume(&iter);
}
trace_printk_seq(&iter.seq);
@@ -4145,7 +4261,8 @@ static void __ftrace_dump(bool disable_tracing)
}
out:
- spin_unlock_irqrestore(&ftrace_dump_lock, flags);
+ __raw_spin_unlock(&ftrace_dump_lock);
+ local_irq_restore(flags);
}
/* By default: disable tracing after the dump */
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index e685ac2b2ba..8b9f4f6e955 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -9,9 +9,12 @@
#include <linux/mmiotrace.h>
#include <linux/ftrace.h>
#include <trace/boot.h>
-#include <trace/kmemtrace.h>
+#include <linux/kmemtrace.h>
#include <trace/power.h>
+#include <linux/trace_seq.h>
+#include <linux/ftrace_event.h>
+
enum trace_type {
__TRACE_FIRST_TYPE = 0,
@@ -42,20 +45,6 @@ enum trace_type {
};
/*
- * The trace entry - the most basic unit of tracing. This is what
- * is printed in the end as a single line in the trace output, such as:
- *
- * bash-15816 [01] 235.197585: idle_cpu <- irq_enter
- */
-struct trace_entry {
- unsigned char type;
- unsigned char flags;
- unsigned char preempt_count;
- int pid;
- int tgid;
-};
-
-/*
* Function trace entry - function address and parent function addres:
*/
struct ftrace_entry {
@@ -263,8 +252,6 @@ struct trace_array_cpu {
char comm[TASK_COMM_LEN];
};
-struct trace_iterator;
-
/*
* The trace array - an array of per-CPU trace arrays. This is the
* highest level data structure that individual tracers deal with.
@@ -339,15 +326,6 @@ extern void __ftrace_bad_type(void);
__ftrace_bad_type(); \
} while (0)
-/* Return values for print_line callback */
-enum print_line_t {
- TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */
- TRACE_TYPE_HANDLED = 1,
- TRACE_TYPE_UNHANDLED = 2, /* Relay to other output functions */
- TRACE_TYPE_NO_CONSUME = 3 /* Handled but ask to not consume */
-};
-
-
/*
* An option specific to a tracer. This is a boolean value.
* The bit is the bit index that sets its value on the
@@ -423,60 +401,30 @@ struct tracer {
struct tracer_stat *stats;
};
-struct trace_seq {
- unsigned char buffer[PAGE_SIZE];
- unsigned int len;
- unsigned int readpos;
-};
-
-static inline void
-trace_seq_init(struct trace_seq *s)
-{
- s->len = 0;
- s->readpos = 0;
-}
-
#define TRACE_PIPE_ALL_CPU -1
-/*
- * Trace iterator - used by printout routines who present trace
- * results to users and which routines might sleep, etc:
- */
-struct trace_iterator {
- struct trace_array *tr;
- struct tracer *trace;
- void *private;
- int cpu_file;
- struct mutex mutex;
- struct ring_buffer_iter *buffer_iter[NR_CPUS];
-
- /* The below is zeroed out in pipe_read */
- struct trace_seq seq;
- struct trace_entry *ent;
- int cpu;
- u64 ts;
-
- unsigned long iter_flags;
- loff_t pos;
- long idx;
-
- cpumask_var_t started;
-};
-
int tracer_init(struct tracer *t, struct trace_array *tr);
int tracing_is_enabled(void);
void trace_wake_up(void);
void tracing_reset(struct trace_array *tr, int cpu);
void tracing_reset_online_cpus(struct trace_array *tr);
+void tracing_reset_current(int cpu);
+void tracing_reset_current_online_cpus(void);
int tracing_open_generic(struct inode *inode, struct file *filp);
+struct dentry *trace_create_file(const char *name,
+ mode_t mode,
+ struct dentry *parent,
+ void *data,
+ const struct file_operations *fops);
+
struct dentry *tracing_init_dentry(void);
void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
struct ring_buffer_event;
struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
- unsigned char type,
+ int type,
unsigned long len,
unsigned long flags,
int pc);
@@ -484,24 +432,12 @@ void trace_buffer_unlock_commit(struct trace_array *tr,
struct ring_buffer_event *event,
unsigned long flags, int pc);
-struct ring_buffer_event *
-trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
- unsigned long flags, int pc);
-void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
- unsigned long flags, int pc);
-void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
- unsigned long flags, int pc);
-
struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
struct trace_array_cpu *data);
struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
int *ent_cpu, u64 *ent_ts);
-void tracing_generic_entry_update(struct trace_entry *entry,
- unsigned long flags,
- int pc);
-
void default_wait_pipe(struct trace_iterator *iter);
void poll_wait_pipe(struct trace_iterator *iter);
@@ -514,7 +450,6 @@ void tracing_sched_switch_trace(struct trace_array *tr,
struct task_struct *prev,
struct task_struct *next,
unsigned long flags, int pc);
-void tracing_record_cmdline(struct task_struct *tsk);
void tracing_sched_wakeup_trace(struct trace_array *tr,
struct task_struct *wakee,
@@ -599,6 +534,8 @@ extern int trace_selftest_startup_sysprof(struct tracer *trace,
struct trace_array *tr);
extern int trace_selftest_startup_branch(struct tracer *trace,
struct trace_array *tr);
+extern int trace_selftest_startup_hw_branches(struct tracer *trace,
+ struct trace_array *tr);
#endif /* CONFIG_FTRACE_STARTUP_TEST */
extern void *head_page(struct trace_array_cpu *data);
@@ -613,6 +550,8 @@ extern unsigned long trace_flags;
/* Standard output formatting function used for function return traces */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
extern enum print_line_t print_graph_function(struct trace_iterator *iter);
+extern enum print_line_t
+trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
#ifdef CONFIG_DYNAMIC_FTRACE
/* TODO: make this variable */
@@ -644,7 +583,6 @@ static inline int ftrace_graph_addr(unsigned long addr)
return 1;
}
#endif /* CONFIG_DYNAMIC_FTRACE */
-
#else /* CONFIG_FUNCTION_GRAPH_TRACER */
static inline enum print_line_t
print_graph_function(struct trace_iterator *iter)
@@ -655,6 +593,7 @@ print_graph_function(struct trace_iterator *iter)
extern struct pid *ftrace_pid_trace;
+#ifdef CONFIG_FUNCTION_TRACER
static inline int ftrace_trace_task(struct task_struct *task)
{
if (!ftrace_pid_trace)
@@ -662,6 +601,12 @@ static inline int ftrace_trace_task(struct task_struct *task)
return test_tsk_trace_trace(task);
}
+#else
+static inline int ftrace_trace_task(struct task_struct *task)
+{
+ return 1;
+}
+#endif
/*
* trace_iterator_flags is an enumeration that defines bit
@@ -692,6 +637,7 @@ enum trace_iterator_flags {
TRACE_ITER_LATENCY_FMT = 0x40000,
TRACE_ITER_GLOBAL_CLK = 0x80000,
TRACE_ITER_SLEEP_TIME = 0x100000,
+ TRACE_ITER_GRAPH_TIME = 0x200000,
};
/*
@@ -790,103 +736,113 @@ struct ftrace_event_field {
char *type;
int offset;
int size;
+ int is_signed;
};
-struct ftrace_event_call {
- char *name;
- char *system;
- struct dentry *dir;
- int enabled;
- int (*regfunc)(void);
- void (*unregfunc)(void);
- int id;
- int (*raw_init)(void);
- int (*show_format)(struct trace_seq *s);
- int (*define_fields)(void);
- struct list_head fields;
+struct event_filter {
+ int n_preds;
struct filter_pred **preds;
-
-#ifdef CONFIG_EVENT_PROFILE
- atomic_t profile_count;
- int (*profile_enable)(struct ftrace_event_call *);
- void (*profile_disable)(struct ftrace_event_call *);
-#endif
+ char *filter_string;
};
struct event_subsystem {
struct list_head list;
const char *name;
struct dentry *entry;
- struct filter_pred **preds;
+ void *filter;
};
-#define events_for_each(event) \
- for (event = __start_ftrace_events; \
- (unsigned long)event < (unsigned long)__stop_ftrace_events; \
- event++)
-
-#define MAX_FILTER_PRED 8
-
struct filter_pred;
-typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
+typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event,
+ int val1, int val2);
struct filter_pred {
filter_pred_fn_t fn;
u64 val;
- char *str_val;
+ char str_val[MAX_FILTER_STR_VAL];
int str_len;
char *field_name;
int offset;
int not;
- int or;
- int compound;
- int clear;
+ int op;
+ int pop_n;
};
-int trace_define_field(struct ftrace_event_call *call, char *type,
- char *name, int offset, int size);
-extern void filter_free_pred(struct filter_pred *pred);
-extern void filter_print_preds(struct filter_pred **preds,
+extern void print_event_filter(struct ftrace_event_call *call,
struct trace_seq *s);
-extern int filter_parse(char **pbuf, struct filter_pred *pred);
-extern int filter_add_pred(struct ftrace_event_call *call,
- struct filter_pred *pred);
-extern void filter_free_preds(struct ftrace_event_call *call);
-extern int filter_match_preds(struct ftrace_event_call *call, void *rec);
-extern void filter_free_subsystem_preds(struct event_subsystem *system);
-extern int filter_add_subsystem_pred(struct event_subsystem *system,
- struct filter_pred *pred);
-
-void event_trace_printk(unsigned long ip, const char *fmt, ...);
-extern struct ftrace_event_call __start_ftrace_events[];
-extern struct ftrace_event_call __stop_ftrace_events[];
-
-#define for_each_event(event) \
- for (event = __start_ftrace_events; \
- (unsigned long)event < (unsigned long)__stop_ftrace_events; \
- event++)
+extern int apply_event_filter(struct ftrace_event_call *call,
+ char *filter_string);
+extern int apply_subsystem_event_filter(struct event_subsystem *system,
+ char *filter_string);
+extern void print_subsystem_event_filter(struct event_subsystem *system,
+ struct trace_seq *s);
+
+static inline int
+filter_check_discard(struct ftrace_event_call *call, void *rec,
+ struct ring_buffer *buffer,
+ struct ring_buffer_event *event)
+{
+ if (unlikely(call->filter_active) && !filter_match_preds(call, rec)) {
+ ring_buffer_discard_commit(buffer, event);
+ return 1;
+ }
+
+ return 0;
+}
+
+#define DEFINE_COMPARISON_PRED(type) \
+static int filter_pred_##type(struct filter_pred *pred, void *event, \
+ int val1, int val2) \
+{ \
+ type *addr = (type *)(event + pred->offset); \
+ type val = (type)pred->val; \
+ int match = 0; \
+ \
+ switch (pred->op) { \
+ case OP_LT: \
+ match = (*addr < val); \
+ break; \
+ case OP_LE: \
+ match = (*addr <= val); \
+ break; \
+ case OP_GT: \
+ match = (*addr > val); \
+ break; \
+ case OP_GE: \
+ match = (*addr >= val); \
+ break; \
+ default: \
+ break; \
+ } \
+ \
+ return match; \
+}
+
+#define DEFINE_EQUALITY_PRED(size) \
+static int filter_pred_##size(struct filter_pred *pred, void *event, \
+ int val1, int val2) \
+{ \
+ u##size *addr = (u##size *)(event + pred->offset); \
+ u##size val = (u##size)pred->val; \
+ int match; \
+ \
+ match = (val == *addr) ^ pred->not; \
+ \
+ return match; \
+}
+
+extern struct mutex event_mutex;
+extern struct list_head ftrace_events;
extern const char *__start___trace_bprintk_fmt[];
extern const char *__stop___trace_bprintk_fmt[];
-/*
- * The double __builtin_constant_p is because gcc will give us an error
- * if we try to allocate the static variable to fmt if it is not a
- * constant. Even with the outer if statement optimizing out.
- */
-#define event_trace_printk(ip, fmt, args...) \
-do { \
- __trace_printk_check_format(fmt, ##args); \
- tracing_record_cmdline(current); \
- if (__builtin_constant_p(fmt)) { \
- static const char *trace_printk_fmt \
- __attribute__((section("__trace_printk_fmt"))) = \
- __builtin_constant_p(fmt) ? fmt : NULL; \
- \
- __trace_bprintk(ip, trace_printk_fmt, ##args); \
- } else \
- __trace_printk(ip, fmt, ##args); \
-} while (0)
+#undef TRACE_EVENT_FORMAT
+#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
+ extern struct ftrace_event_call event_##call;
+#undef TRACE_EVENT_FORMAT_NOFILTER
+#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, tpfmt)
+#include "trace_event_types.h"
#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 7a30fc4c364..a29ef23ffb4 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -9,6 +9,7 @@
#include <linux/debugfs.h>
#include <linux/ftrace.h>
#include <linux/kallsyms.h>
+#include <linux/time.h>
#include "trace.h"
#include "trace_output.h"
@@ -67,7 +68,7 @@ initcall_call_print_line(struct trace_iterator *iter)
trace_assign_type(field, entry);
call = &field->boot_call;
ts = iter->ts;
- nsec_rem = do_div(ts, 1000000000);
+ nsec_rem = do_div(ts, NSEC_PER_SEC);
ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n",
(unsigned long)ts, nsec_rem, call->func, call->caller);
@@ -92,7 +93,7 @@ initcall_ret_print_line(struct trace_iterator *iter)
trace_assign_type(field, entry);
init_ret = &field->boot_ret;
ts = iter->ts;
- nsec_rem = do_div(ts, 1000000000);
+ nsec_rem = do_div(ts, NSEC_PER_SEC);
ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s "
"returned %d after %llu msecs\n",
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 8333715e406..7a7a9fd249a 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -30,6 +30,7 @@ static struct trace_array *branch_tracer;
static void
probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
{
+ struct ftrace_event_call *call = &event_branch;
struct trace_array *tr = branch_tracer;
struct ring_buffer_event *event;
struct trace_branch *entry;
@@ -73,7 +74,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
entry->line = f->line;
entry->correct = val == expect;
- ring_buffer_unlock_commit(tr->buffer, event);
+ if (!filter_check_discard(call, entry, tr->buffer, event))
+ ring_buffer_unlock_commit(tr->buffer, event);
out:
atomic_dec(&tr->data[cpu]->disabled);
@@ -271,7 +273,7 @@ static int branch_stat_show(struct seq_file *m, void *v)
return 0;
}
-static void *annotated_branch_stat_start(void)
+static void *annotated_branch_stat_start(struct tracer_stat *trace)
{
return __start_annotated_branch_profile;
}
@@ -346,7 +348,7 @@ static int all_branch_stat_headers(struct seq_file *m)
return 0;
}
-static void *all_branch_stat_start(void)
+static void *all_branch_stat_start(struct tracer_stat *trace)
{
return __start_branch_profile;
}
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 22cba997077..11ba5bb4ed0 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -10,22 +10,30 @@
int ftrace_profile_enable(int event_id)
{
struct ftrace_event_call *event;
+ int ret = -EINVAL;
- for_each_event(event) {
- if (event->id == event_id)
- return event->profile_enable(event);
+ mutex_lock(&event_mutex);
+ list_for_each_entry(event, &ftrace_events, list) {
+ if (event->id == event_id && event->profile_enable) {
+ ret = event->profile_enable(event);
+ break;
+ }
}
+ mutex_unlock(&event_mutex);
- return -EINVAL;
+ return ret;
}
void ftrace_profile_disable(int event_id)
{
struct ftrace_event_call *event;
- for_each_event(event) {
- if (event->id == event_id)
- return event->profile_disable(event);
+ mutex_lock(&event_mutex);
+ list_for_each_entry(event, &ftrace_events, list) {
+ if (event->id == event_id) {
+ event->profile_disable(event);
+ break;
+ }
}
+ mutex_unlock(&event_mutex);
}
-
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
index fd78bee71dd..6db005e1248 100644
--- a/kernel/trace/trace_event_types.h
+++ b/kernel/trace/trace_event_types.h
@@ -26,6 +26,9 @@ TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET,
ftrace_graph_ret_entry, ignore,
TRACE_STRUCT(
TRACE_FIELD(unsigned long, ret.func, func)
+ TRACE_FIELD(unsigned long long, ret.calltime, calltime)
+ TRACE_FIELD(unsigned long long, ret.rettime, rettime)
+ TRACE_FIELD(unsigned long, ret.overrun, overrun)
TRACE_FIELD(int, ret.depth, depth)
),
TP_RAW_FMT("<-- %lx (%d)")
@@ -57,7 +60,7 @@ TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore,
TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]")
);
-TRACE_EVENT_FORMAT(special, TRACE_SPECIAL, special_entry, ignore,
+TRACE_EVENT_FORMAT_NOFILTER(special, TRACE_SPECIAL, special_entry, ignore,
TRACE_STRUCT(
TRACE_FIELD(unsigned long, arg1, arg1)
TRACE_FIELD(unsigned long, arg2, arg2)
@@ -122,8 +125,10 @@ TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore,
TRACE_STRUCT(
TRACE_FIELD(unsigned int, line, line)
- TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func, func)
- TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, file)
+ TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func,
+ TRACE_FUNC_SIZE+1, func)
+ TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file,
+ TRACE_FUNC_SIZE+1, file)
TRACE_FIELD(char, correct, correct)
),
TP_RAW_FMT("%u:%s:%s (%u)")
@@ -139,8 +144,8 @@ TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore,
TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore,
TRACE_STRUCT(
- TRACE_FIELD(ktime_t, state_data.stamp, stamp)
- TRACE_FIELD(ktime_t, state_data.end, end)
+ TRACE_FIELD_SIGN(ktime_t, state_data.stamp, stamp, 1)
+ TRACE_FIELD_SIGN(ktime_t, state_data.end, end, 1)
TRACE_FIELD(int, state_data.type, type)
TRACE_FIELD(int, state_data.state, state)
),
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 576f4fa2af0..e75276a49cf 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -8,19 +8,25 @@
*
*/
+#include <linux/workqueue.h>
+#include <linux/spinlock.h>
+#include <linux/kthread.h>
#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/ctype.h>
+#include <linux/delay.h>
#include "trace_output.h"
#define TRACE_SYSTEM "TRACE_SYSTEM"
-static DEFINE_MUTEX(event_mutex);
+DEFINE_MUTEX(event_mutex);
+
+LIST_HEAD(ftrace_events);
int trace_define_field(struct ftrace_event_call *call, char *type,
- char *name, int offset, int size)
+ char *name, int offset, int size, int is_signed)
{
struct ftrace_event_field *field;
@@ -38,6 +44,7 @@ int trace_define_field(struct ftrace_event_call *call, char *type,
field->offset = offset;
field->size = size;
+ field->is_signed = is_signed;
list_add(&field->link, &call->fields);
return 0;
@@ -51,47 +58,94 @@ err:
return -ENOMEM;
}
+EXPORT_SYMBOL_GPL(trace_define_field);
-static void ftrace_clear_events(void)
-{
- struct ftrace_event_call *call = (void *)__start_ftrace_events;
-
+#ifdef CONFIG_MODULES
- while ((unsigned long)call < (unsigned long)__stop_ftrace_events) {
+static void trace_destroy_fields(struct ftrace_event_call *call)
+{
+ struct ftrace_event_field *field, *next;
- if (call->enabled) {
- call->enabled = 0;
- call->unregfunc();
- }
- call++;
+ list_for_each_entry_safe(field, next, &call->fields, link) {
+ list_del(&field->link);
+ kfree(field->type);
+ kfree(field->name);
+ kfree(field);
}
}
+#endif /* CONFIG_MODULES */
+
static void ftrace_event_enable_disable(struct ftrace_event_call *call,
int enable)
{
-
switch (enable) {
case 0:
if (call->enabled) {
call->enabled = 0;
+ tracing_stop_cmdline_record();
call->unregfunc();
}
break;
case 1:
if (!call->enabled) {
call->enabled = 1;
+ tracing_start_cmdline_record();
call->regfunc();
}
break;
}
}
+static void ftrace_clear_events(void)
+{
+ struct ftrace_event_call *call;
+
+ mutex_lock(&event_mutex);
+ list_for_each_entry(call, &ftrace_events, list) {
+ ftrace_event_enable_disable(call, 0);
+ }
+ mutex_unlock(&event_mutex);
+}
+
+/*
+ * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
+ */
+static int __ftrace_set_clr_event(const char *match, const char *sub,
+ const char *event, int set)
+{
+ struct ftrace_event_call *call;
+ int ret = -EINVAL;
+
+ mutex_lock(&event_mutex);
+ list_for_each_entry(call, &ftrace_events, list) {
+
+ if (!call->name || !call->regfunc)
+ continue;
+
+ if (match &&
+ strcmp(match, call->name) != 0 &&
+ strcmp(match, call->system) != 0)
+ continue;
+
+ if (sub && strcmp(sub, call->system) != 0)
+ continue;
+
+ if (event && strcmp(event, call->name) != 0)
+ continue;
+
+ ftrace_event_enable_disable(call, set);
+
+ ret = 0;
+ }
+ mutex_unlock(&event_mutex);
+
+ return ret;
+}
+
static int ftrace_set_clr_event(char *buf, int set)
{
- struct ftrace_event_call *call = __start_ftrace_events;
char *event = NULL, *sub = NULL, *match;
- int ret = -EINVAL;
/*
* The buf format can be <subsystem>:<event-name>
@@ -117,30 +171,24 @@ static int ftrace_set_clr_event(char *buf, int set)
event = NULL;
}
- mutex_lock(&event_mutex);
- for_each_event(call) {
-
- if (!call->name || !call->regfunc)
- continue;
-
- if (match &&
- strcmp(match, call->name) != 0 &&
- strcmp(match, call->system) != 0)
- continue;
-
- if (sub && strcmp(sub, call->system) != 0)
- continue;
-
- if (event && strcmp(event, call->name) != 0)
- continue;
-
- ftrace_event_enable_disable(call, set);
-
- ret = 0;
- }
- mutex_unlock(&event_mutex);
+ return __ftrace_set_clr_event(match, sub, event, set);
+}
- return ret;
+/**
+ * trace_set_clr_event - enable or disable an event
+ * @system: system name to match (NULL for any system)
+ * @event: event name to match (NULL for all events, within system)
+ * @set: 1 to enable, 0 to disable
+ *
+ * This is a way for other parts of the kernel to enable or disable
+ * event recording.
+ *
+ * Returns 0 on success, -EINVAL if the parameters do not match any
+ * registered events.
+ */
+int trace_set_clr_event(const char *system, const char *event, int set)
+{
+ return __ftrace_set_clr_event(NULL, system, event, set);
}
/* 128 should be much more than enough */
@@ -224,15 +272,17 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
static void *
t_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct ftrace_event_call *call = m->private;
- struct ftrace_event_call *next = call;
+ struct list_head *list = m->private;
+ struct ftrace_event_call *call;
(*pos)++;
for (;;) {
- if ((unsigned long)call >= (unsigned long)__stop_ftrace_events)
+ if (list == &ftrace_events)
return NULL;
+ call = list_entry(list, struct ftrace_event_call, list);
+
/*
* The ftrace subsystem is for showing formats only.
* They can not be enabled or disabled via the event files.
@@ -240,46 +290,68 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
if (call->regfunc)
break;
- call++;
- next = call;
+ list = list->next;
}
- m->private = ++next;
+ m->private = list->next;
return call;
}
static void *t_start(struct seq_file *m, loff_t *pos)
{
- return t_next(m, NULL, pos);
+ struct ftrace_event_call *call = NULL;
+ loff_t l;
+
+ mutex_lock(&event_mutex);
+
+ m->private = ftrace_events.next;
+ for (l = 0; l <= *pos; ) {
+ call = t_next(m, NULL, &l);
+ if (!call)
+ break;
+ }
+ return call;
}
static void *
s_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct ftrace_event_call *call = m->private;
- struct ftrace_event_call *next;
+ struct list_head *list = m->private;
+ struct ftrace_event_call *call;
(*pos)++;
retry:
- if ((unsigned long)call >= (unsigned long)__stop_ftrace_events)
+ if (list == &ftrace_events)
return NULL;
+ call = list_entry(list, struct ftrace_event_call, list);
+
if (!call->enabled) {
- call++;
+ list = list->next;
goto retry;
}
- next = call;
- m->private = ++next;
+ m->private = list->next;
return call;
}
static void *s_start(struct seq_file *m, loff_t *pos)
{
- return s_next(m, NULL, pos);
+ struct ftrace_event_call *call = NULL;
+ loff_t l;
+
+ mutex_lock(&event_mutex);
+
+ m->private = ftrace_events.next;
+ for (l = 0; l <= *pos; ) {
+ call = s_next(m, NULL, &l);
+ if (!call)
+ break;
+ }
+ return call;
}
static int t_show(struct seq_file *m, void *v)
@@ -295,26 +367,20 @@ static int t_show(struct seq_file *m, void *v)
static void t_stop(struct seq_file *m, void *p)
{
+ mutex_unlock(&event_mutex);
}
static int
ftrace_event_seq_open(struct inode *inode, struct file *file)
{
- int ret;
const struct seq_operations *seq_ops;
if ((file->f_mode & FMODE_WRITE) &&
- !(file->f_flags & O_APPEND))
+ (file->f_flags & O_TRUNC))
ftrace_clear_events();
seq_ops = inode->i_private;
- ret = seq_open(file, seq_ops);
- if (!ret) {
- struct seq_file *m = file->private_data;
-
- m->private = __start_ftrace_events;
- }
- return ret;
+ return seq_open(file, seq_ops);
}
static ssize_t
@@ -374,8 +440,93 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
return cnt;
}
+static ssize_t
+system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ const char set_to_char[4] = { '?', '0', '1', 'X' };
+ const char *system = filp->private_data;
+ struct ftrace_event_call *call;
+ char buf[2];
+ int set = 0;
+ int ret;
+
+ mutex_lock(&event_mutex);
+ list_for_each_entry(call, &ftrace_events, list) {
+ if (!call->name || !call->regfunc)
+ continue;
+
+ if (system && strcmp(call->system, system) != 0)
+ continue;
+
+ /*
+ * We need to find out if all the events are set
+ * or if all events or cleared, or if we have
+ * a mixture.
+ */
+ set |= (1 << !!call->enabled);
+
+ /*
+ * If we have a mixture, no need to look further.
+ */
+ if (set == 3)
+ break;
+ }
+ mutex_unlock(&event_mutex);
+
+ buf[0] = set_to_char[set];
+ buf[1] = '\n';
+
+ ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
+
+ return ret;
+}
+
+static ssize_t
+system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ const char *system = filp->private_data;
+ unsigned long val;
+ char buf[64];
+ ssize_t ret;
+
+ if (cnt >= sizeof(buf))
+ return -EINVAL;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+
+ ret = strict_strtoul(buf, 10, &val);
+ if (ret < 0)
+ return ret;
+
+ ret = tracing_update_buffers();
+ if (ret < 0)
+ return ret;
+
+ if (val != 0 && val != 1)
+ return -EINVAL;
+
+ ret = __ftrace_set_clr_event(NULL, system, NULL, val);
+ if (ret)
+ goto out;
+
+ ret = cnt;
+
+out:
+ *ppos += cnt;
+
+ return ret;
+}
+
+extern char *__bad_type_size(void);
+
#undef FIELD
#define FIELD(type, name) \
+ sizeof(type) != sizeof(field.name) ? __bad_type_size() : \
#type, "common_" #name, offsetof(typeof(field), name), \
sizeof(field.name)
@@ -391,7 +542,7 @@ static int trace_write_header(struct trace_seq *s)
"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
"\n",
- FIELD(unsigned char, type),
+ FIELD(unsigned short, type),
FIELD(unsigned char, flags),
FIELD(unsigned char, preempt_count),
FIELD(int, pid),
@@ -481,7 +632,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
trace_seq_init(s);
- filter_print_preds(call->preds, s);
+ print_event_filter(call, s);
r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
kfree(s);
@@ -494,38 +645,26 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
{
struct ftrace_event_call *call = filp->private_data;
- char buf[64], *pbuf = buf;
- struct filter_pred *pred;
+ char *buf;
int err;
- if (cnt >= sizeof(buf))
+ if (cnt >= PAGE_SIZE)
return -EINVAL;
- if (copy_from_user(&buf, ubuf, cnt))
- return -EFAULT;
- buf[cnt] = '\0';
-
- pred = kzalloc(sizeof(*pred), GFP_KERNEL);
- if (!pred)
+ buf = (char *)__get_free_page(GFP_TEMPORARY);
+ if (!buf)
return -ENOMEM;
- err = filter_parse(&pbuf, pred);
- if (err < 0) {
- filter_free_pred(pred);
- return err;
- }
-
- if (pred->clear) {
- filter_free_preds(call);
- filter_free_pred(pred);
- return cnt;
+ if (copy_from_user(buf, ubuf, cnt)) {
+ free_page((unsigned long) buf);
+ return -EFAULT;
}
+ buf[cnt] = '\0';
- err = filter_add_pred(call, pred);
- if (err < 0) {
- filter_free_pred(pred);
+ err = apply_event_filter(call, buf);
+ free_page((unsigned long) buf);
+ if (err < 0)
return err;
- }
*ppos += cnt;
@@ -549,7 +688,7 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
trace_seq_init(s);
- filter_print_preds(system->preds, s);
+ print_subsystem_event_filter(system, s);
r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
kfree(s);
@@ -562,45 +701,56 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
{
struct event_subsystem *system = filp->private_data;
- char buf[64], *pbuf = buf;
- struct filter_pred *pred;
+ char *buf;
int err;
- if (cnt >= sizeof(buf))
+ if (cnt >= PAGE_SIZE)
return -EINVAL;
- if (copy_from_user(&buf, ubuf, cnt))
- return -EFAULT;
- buf[cnt] = '\0';
-
- pred = kzalloc(sizeof(*pred), GFP_KERNEL);
- if (!pred)
+ buf = (char *)__get_free_page(GFP_TEMPORARY);
+ if (!buf)
return -ENOMEM;
- err = filter_parse(&pbuf, pred);
- if (err < 0) {
- filter_free_pred(pred);
- return err;
- }
-
- if (pred->clear) {
- filter_free_subsystem_preds(system);
- filter_free_pred(pred);
- return cnt;
+ if (copy_from_user(buf, ubuf, cnt)) {
+ free_page((unsigned long) buf);
+ return -EFAULT;
}
+ buf[cnt] = '\0';
- err = filter_add_subsystem_pred(system, pred);
- if (err < 0) {
- filter_free_subsystem_preds(system);
- filter_free_pred(pred);
+ err = apply_subsystem_event_filter(system, buf);
+ free_page((unsigned long) buf);
+ if (err < 0)
return err;
- }
*ppos += cnt;
return cnt;
}
+static ssize_t
+show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ int (*func)(struct trace_seq *s) = filp->private_data;
+ struct trace_seq *s;
+ int r;
+
+ if (*ppos)
+ return 0;
+
+ s = kmalloc(sizeof(*s), GFP_KERNEL);
+ if (!s)
+ return -ENOMEM;
+
+ trace_seq_init(s);
+
+ func(s);
+ r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
+
+ kfree(s);
+
+ return r;
+}
+
static const struct seq_operations show_event_seq_ops = {
.start = t_start,
.next = t_next,
@@ -658,6 +808,17 @@ static const struct file_operations ftrace_subsystem_filter_fops = {
.write = subsystem_filter_write,
};
+static const struct file_operations ftrace_system_enable_fops = {
+ .open = tracing_open_generic,
+ .read = system_enable_read,
+ .write = system_enable_write,
+};
+
+static const struct file_operations ftrace_show_header_fops = {
+ .open = tracing_open_generic,
+ .read = show_header,
+};
+
static struct dentry *event_trace_events_dir(void)
{
static struct dentry *d_tracer;
@@ -684,6 +845,7 @@ static struct dentry *
event_subsystem_dir(const char *name, struct dentry *d_events)
{
struct event_subsystem *system;
+ struct dentry *entry;
/* First see if we did not already create this dir */
list_for_each_entry(system, &event_subsystems, list) {
@@ -707,16 +869,46 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
return d_events;
}
- system->name = name;
+ system->name = kstrdup(name, GFP_KERNEL);
+ if (!system->name) {
+ debugfs_remove(system->entry);
+ kfree(system);
+ return d_events;
+ }
+
list_add(&system->list, &event_subsystems);
- system->preds = NULL;
+ system->filter = NULL;
+
+ system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL);
+ if (!system->filter) {
+ pr_warning("Could not allocate filter for subsystem "
+ "'%s'\n", name);
+ return system->entry;
+ }
+
+ entry = debugfs_create_file("filter", 0644, system->entry, system,
+ &ftrace_subsystem_filter_fops);
+ if (!entry) {
+ kfree(system->filter);
+ system->filter = NULL;
+ pr_warning("Could not create debugfs "
+ "'%s/filter' entry\n", name);
+ }
+
+ entry = trace_create_file("enable", 0644, system->entry,
+ (void *)system->name,
+ &ftrace_system_enable_fops);
return system->entry;
}
static int
-event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
+event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
+ const struct file_operations *id,
+ const struct file_operations *enable,
+ const struct file_operations *filter,
+ const struct file_operations *format)
{
struct dentry *entry;
int ret;
@@ -725,7 +917,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
* If the trace point header did not define TRACE_SYSTEM
* then the system would be called "TRACE_SYSTEM".
*/
- if (strcmp(call->system, "TRACE_SYSTEM") != 0)
+ if (strcmp(call->system, TRACE_SYSTEM) != 0)
d_events = event_subsystem_dir(call->system, d_events);
if (call->raw_init) {
@@ -744,21 +936,13 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
return -1;
}
- if (call->regfunc) {
- entry = debugfs_create_file("enable", 0644, call->dir, call,
- &ftrace_enable_fops);
- if (!entry)
- pr_warning("Could not create debugfs "
- "'%s/enable' entry\n", call->name);
- }
+ if (call->regfunc)
+ entry = trace_create_file("enable", 0644, call->dir, call,
+ enable);
- if (call->id) {
- entry = debugfs_create_file("id", 0444, call->dir, call,
- &ftrace_event_id_fops);
- if (!entry)
- pr_warning("Could not create debugfs '%s/id' entry\n",
- call->name);
- }
+ if (call->id && call->profile_enable)
+ entry = trace_create_file("id", 0444, call->dir, call,
+ id);
if (call->define_fields) {
ret = call->define_fields();
@@ -767,32 +951,195 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
" events/%s\n", call->name);
return ret;
}
- entry = debugfs_create_file("filter", 0644, call->dir, call,
- &ftrace_event_filter_fops);
- if (!entry)
- pr_warning("Could not create debugfs "
- "'%s/filter' entry\n", call->name);
+ entry = trace_create_file("filter", 0644, call->dir, call,
+ filter);
}
/* A trace may not want to export its format */
if (!call->show_format)
return 0;
- entry = debugfs_create_file("format", 0444, call->dir, call,
- &ftrace_event_format_fops);
- if (!entry)
- pr_warning("Could not create debugfs "
- "'%s/format' entry\n", call->name);
+ entry = trace_create_file("format", 0444, call->dir, call,
+ format);
return 0;
}
+#define for_each_event(event, start, end) \
+ for (event = start; \
+ (unsigned long)event < (unsigned long)end; \
+ event++)
+
+#ifdef CONFIG_MODULES
+
+static LIST_HEAD(ftrace_module_file_list);
+
+/*
+ * Modules must own their file_operations to keep up with
+ * reference counting.
+ */
+struct ftrace_module_file_ops {
+ struct list_head list;
+ struct module *mod;
+ struct file_operations id;
+ struct file_operations enable;
+ struct file_operations format;
+ struct file_operations filter;
+};
+
+static struct ftrace_module_file_ops *
+trace_create_file_ops(struct module *mod)
+{
+ struct ftrace_module_file_ops *file_ops;
+
+ /*
+ * This is a bit of a PITA. To allow for correct reference
+ * counting, modules must "own" their file_operations.
+ * To do this, we allocate the file operations that will be
+ * used in the event directory.
+ */
+
+ file_ops = kmalloc(sizeof(*file_ops), GFP_KERNEL);
+ if (!file_ops)
+ return NULL;
+
+ file_ops->mod = mod;
+
+ file_ops->id = ftrace_event_id_fops;
+ file_ops->id.owner = mod;
+
+ file_ops->enable = ftrace_enable_fops;
+ file_ops->enable.owner = mod;
+
+ file_ops->filter = ftrace_event_filter_fops;
+ file_ops->filter.owner = mod;
+
+ file_ops->format = ftrace_event_format_fops;
+ file_ops->format.owner = mod;
+
+ list_add(&file_ops->list, &ftrace_module_file_list);
+
+ return file_ops;
+}
+
+static void trace_module_add_events(struct module *mod)
+{
+ struct ftrace_module_file_ops *file_ops = NULL;
+ struct ftrace_event_call *call, *start, *end;
+ struct dentry *d_events;
+
+ start = mod->trace_events;
+ end = mod->trace_events + mod->num_trace_events;
+
+ if (start == end)
+ return;
+
+ d_events = event_trace_events_dir();
+ if (!d_events)
+ return;
+
+ for_each_event(call, start, end) {
+ /* The linker may leave blanks */
+ if (!call->name)
+ continue;
+
+ /*
+ * This module has events, create file ops for this module
+ * if not already done.
+ */
+ if (!file_ops) {
+ file_ops = trace_create_file_ops(mod);
+ if (!file_ops)
+ return;
+ }
+ call->mod = mod;
+ list_add(&call->list, &ftrace_events);
+ event_create_dir(call, d_events,
+ &file_ops->id, &file_ops->enable,
+ &file_ops->filter, &file_ops->format);
+ }
+}
+
+static void trace_module_remove_events(struct module *mod)
+{
+ struct ftrace_module_file_ops *file_ops;
+ struct ftrace_event_call *call, *p;
+ bool found = false;
+
+ down_write(&trace_event_mutex);
+ list_for_each_entry_safe(call, p, &ftrace_events, list) {
+ if (call->mod == mod) {
+ found = true;
+ ftrace_event_enable_disable(call, 0);
+ if (call->event)
+ __unregister_ftrace_event(call->event);
+ debugfs_remove_recursive(call->dir);
+ list_del(&call->list);
+ trace_destroy_fields(call);
+ destroy_preds(call);
+ }
+ }
+
+ /* Now free the file_operations */
+ list_for_each_entry(file_ops, &ftrace_module_file_list, list) {
+ if (file_ops->mod == mod)
+ break;
+ }
+ if (&file_ops->list != &ftrace_module_file_list) {
+ list_del(&file_ops->list);
+ kfree(file_ops);
+ }
+
+ /*
+ * It is safest to reset the ring buffer if the module being unloaded
+ * registered any events.
+ */
+ if (found)
+ tracing_reset_current_online_cpus();
+ up_write(&trace_event_mutex);
+}
+
+static int trace_module_notify(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ struct module *mod = data;
+
+ mutex_lock(&event_mutex);
+ switch (val) {
+ case MODULE_STATE_COMING:
+ trace_module_add_events(mod);
+ break;
+ case MODULE_STATE_GOING:
+ trace_module_remove_events(mod);
+ break;
+ }
+ mutex_unlock(&event_mutex);
+
+ return 0;
+}
+#else
+static int trace_module_notify(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ return 0;
+}
+#endif /* CONFIG_MODULES */
+
+struct notifier_block trace_module_nb = {
+ .notifier_call = trace_module_notify,
+ .priority = 0,
+};
+
+extern struct ftrace_event_call __start_ftrace_events[];
+extern struct ftrace_event_call __stop_ftrace_events[];
+
static __init int event_trace_init(void)
{
- struct ftrace_event_call *call = __start_ftrace_events;
+ struct ftrace_event_call *call;
struct dentry *d_tracer;
struct dentry *entry;
struct dentry *d_events;
+ int ret;
d_tracer = tracing_init_dentry();
if (!d_tracer)
@@ -816,13 +1163,243 @@ static __init int event_trace_init(void)
if (!d_events)
return 0;
- for_each_event(call) {
+ /* ring buffer internal formats */
+ trace_create_file("header_page", 0444, d_events,
+ ring_buffer_print_page_header,
+ &ftrace_show_header_fops);
+
+ trace_create_file("header_event", 0444, d_events,
+ ring_buffer_print_entry_header,
+ &ftrace_show_header_fops);
+
+ trace_create_file("enable", 0644, d_events,
+ NULL, &ftrace_system_enable_fops);
+
+ for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
/* The linker may leave blanks */
if (!call->name)
continue;
- event_create_dir(call, d_events);
+ list_add(&call->list, &ftrace_events);
+ event_create_dir(call, d_events, &ftrace_event_id_fops,
+ &ftrace_enable_fops, &ftrace_event_filter_fops,
+ &ftrace_event_format_fops);
}
+ ret = register_module_notifier(&trace_module_nb);
+ if (ret)
+ pr_warning("Failed to register trace events module notifier\n");
+
return 0;
}
fs_initcall(event_trace_init);
+
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+
+static DEFINE_SPINLOCK(test_spinlock);
+static DEFINE_SPINLOCK(test_spinlock_irq);
+static DEFINE_MUTEX(test_mutex);
+
+static __init void test_work(struct work_struct *dummy)
+{
+ spin_lock(&test_spinlock);
+ spin_lock_irq(&test_spinlock_irq);
+ udelay(1);
+ spin_unlock_irq(&test_spinlock_irq);
+ spin_unlock(&test_spinlock);
+
+ mutex_lock(&test_mutex);
+ msleep(1);
+ mutex_unlock(&test_mutex);
+}
+
+static __init int event_test_thread(void *unused)
+{
+ void *test_malloc;
+
+ test_malloc = kmalloc(1234, GFP_KERNEL);
+ if (!test_malloc)
+ pr_info("failed to kmalloc\n");
+
+ schedule_on_each_cpu(test_work);
+
+ kfree(test_malloc);
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ while (!kthread_should_stop())
+ schedule();
+
+ return 0;
+}
+
+/*
+ * Do various things that may trigger events.
+ */
+static __init void event_test_stuff(void)
+{
+ struct task_struct *test_thread;
+
+ test_thread = kthread_run(event_test_thread, NULL, "test-events");
+ msleep(1);
+ kthread_stop(test_thread);
+}
+
+/*
+ * For every trace event defined, we will test each trace point separately,
+ * and then by groups, and finally all trace points.
+ */
+static __init void event_trace_self_tests(void)
+{
+ struct ftrace_event_call *call;
+ struct event_subsystem *system;
+ int ret;
+
+ pr_info("Running tests on trace events:\n");
+
+ list_for_each_entry(call, &ftrace_events, list) {
+
+ /* Only test those that have a regfunc */
+ if (!call->regfunc)
+ continue;
+
+ pr_info("Testing event %s: ", call->name);
+
+ /*
+ * If an event is already enabled, someone is using
+ * it and the self test should not be on.
+ */
+ if (call->enabled) {
+ pr_warning("Enabled event during self test!\n");
+ WARN_ON_ONCE(1);
+ continue;
+ }
+
+ ftrace_event_enable_disable(call, 1);
+ event_test_stuff();
+ ftrace_event_enable_disable(call, 0);
+
+ pr_cont("OK\n");
+ }
+
+ /* Now test at the sub system level */
+
+ pr_info("Running tests on trace event systems:\n");
+
+ list_for_each_entry(system, &event_subsystems, list) {
+
+ /* the ftrace system is special, skip it */
+ if (strcmp(system->name, "ftrace") == 0)
+ continue;
+
+ pr_info("Testing event system %s: ", system->name);
+
+ ret = __ftrace_set_clr_event(NULL, system->name, NULL, 1);
+ if (WARN_ON_ONCE(ret)) {
+ pr_warning("error enabling system %s\n",
+ system->name);
+ continue;
+ }
+
+ event_test_stuff();
+
+ ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0);
+ if (WARN_ON_ONCE(ret))
+ pr_warning("error disabling system %s\n",
+ system->name);
+
+ pr_cont("OK\n");
+ }
+
+ /* Test with all events enabled */
+
+ pr_info("Running tests on all trace events:\n");
+ pr_info("Testing all events: ");
+
+ ret = __ftrace_set_clr_event(NULL, NULL, NULL, 1);
+ if (WARN_ON_ONCE(ret)) {
+ pr_warning("error enabling all events\n");
+ return;
+ }
+
+ event_test_stuff();
+
+ /* reset sysname */
+ ret = __ftrace_set_clr_event(NULL, NULL, NULL, 0);
+ if (WARN_ON_ONCE(ret)) {
+ pr_warning("error disabling all events\n");
+ return;
+ }
+
+ pr_cont("OK\n");
+}
+
+#ifdef CONFIG_FUNCTION_TRACER
+
+static DEFINE_PER_CPU(atomic_t, test_event_disable);
+
+static void
+function_test_events_call(unsigned long ip, unsigned long parent_ip)
+{
+ struct ring_buffer_event *event;
+ struct ftrace_entry *entry;
+ unsigned long flags;
+ long disabled;
+ int resched;
+ int cpu;
+ int pc;
+
+ pc = preempt_count();
+ resched = ftrace_preempt_disable();
+ cpu = raw_smp_processor_id();
+ disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu));
+
+ if (disabled != 1)
+ goto out;
+
+ local_save_flags(flags);
+
+ event = trace_current_buffer_lock_reserve(TRACE_FN, sizeof(*entry),
+ flags, pc);
+ if (!event)
+ goto out;
+ entry = ring_buffer_event_data(event);
+ entry->ip = ip;
+ entry->parent_ip = parent_ip;
+
+ trace_nowake_buffer_unlock_commit(event, flags, pc);
+
+ out:
+ atomic_dec(&per_cpu(test_event_disable, cpu));
+ ftrace_preempt_enable(resched);
+}
+
+static struct ftrace_ops trace_ops __initdata =
+{
+ .func = function_test_events_call,
+};
+
+static __init void event_trace_self_test_with_function(void)
+{
+ register_ftrace_function(&trace_ops);
+ pr_info("Running tests again, along with the function tracer\n");
+ event_trace_self_tests();
+ unregister_ftrace_function(&trace_ops);
+}
+#else
+static __init void event_trace_self_test_with_function(void)
+{
+}
+#endif
+
+static __init int event_trace_self_tests_init(void)
+{
+
+ event_trace_self_tests();
+
+ event_trace_self_test_with_function();
+
+ return 0;
+}
+
+late_initcall(event_trace_self_tests_init);
+
+#endif
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index e03cbf1e38f..f32dc9d1ea7 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -22,119 +22,295 @@
#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/ctype.h>
+#include <linux/mutex.h>
#include "trace.h"
#include "trace_output.h"
-static int filter_pred_64(struct filter_pred *pred, void *event)
+enum filter_op_ids
{
- u64 *addr = (u64 *)(event + pred->offset);
- u64 val = (u64)pred->val;
- int match;
+ OP_OR,
+ OP_AND,
+ OP_NE,
+ OP_EQ,
+ OP_LT,
+ OP_LE,
+ OP_GT,
+ OP_GE,
+ OP_NONE,
+ OP_OPEN_PAREN,
+};
+
+struct filter_op {
+ int id;
+ char *string;
+ int precedence;
+};
+
+static struct filter_op filter_ops[] = {
+ { OP_OR, "||", 1 },
+ { OP_AND, "&&", 2 },
+ { OP_NE, "!=", 4 },
+ { OP_EQ, "==", 4 },
+ { OP_LT, "<", 5 },
+ { OP_LE, "<=", 5 },
+ { OP_GT, ">", 5 },
+ { OP_GE, ">=", 5 },
+ { OP_NONE, "OP_NONE", 0 },
+ { OP_OPEN_PAREN, "(", 0 },
+};
+
+enum {
+ FILT_ERR_NONE,
+ FILT_ERR_INVALID_OP,
+ FILT_ERR_UNBALANCED_PAREN,
+ FILT_ERR_TOO_MANY_OPERANDS,
+ FILT_ERR_OPERAND_TOO_LONG,
+ FILT_ERR_FIELD_NOT_FOUND,
+ FILT_ERR_ILLEGAL_FIELD_OP,
+ FILT_ERR_ILLEGAL_INTVAL,
+ FILT_ERR_BAD_SUBSYS_FILTER,
+ FILT_ERR_TOO_MANY_PREDS,
+ FILT_ERR_MISSING_FIELD,
+ FILT_ERR_INVALID_FILTER,
+};
+
+static char *err_text[] = {
+ "No error",
+ "Invalid operator",
+ "Unbalanced parens",
+ "Too many operands",
+ "Operand too long",
+ "Field not found",
+ "Illegal operation for field type",
+ "Illegal integer value",
+ "Couldn't find or set field in one of a subsystem's events",
+ "Too many terms in predicate expression",
+ "Missing field name and/or value",
+ "Meaningless filter expression",
+};
+
+struct opstack_op {
+ int op;
+ struct list_head list;
+};
+
+struct postfix_elt {
+ int op;
+ char *operand;
+ struct list_head list;
+};
+
+struct filter_parse_state {
+ struct filter_op *ops;
+ struct list_head opstack;
+ struct list_head postfix;
+ int lasterr;
+ int lasterr_pos;
+
+ struct {
+ char *string;
+ unsigned int cnt;
+ unsigned int tail;
+ } infix;
+
+ struct {
+ char string[MAX_FILTER_STR_VAL];
+ int pos;
+ unsigned int tail;
+ } operand;
+};
+
+DEFINE_COMPARISON_PRED(s64);
+DEFINE_COMPARISON_PRED(u64);
+DEFINE_COMPARISON_PRED(s32);
+DEFINE_COMPARISON_PRED(u32);
+DEFINE_COMPARISON_PRED(s16);
+DEFINE_COMPARISON_PRED(u16);
+DEFINE_COMPARISON_PRED(s8);
+DEFINE_COMPARISON_PRED(u8);
+
+DEFINE_EQUALITY_PRED(64);
+DEFINE_EQUALITY_PRED(32);
+DEFINE_EQUALITY_PRED(16);
+DEFINE_EQUALITY_PRED(8);
+
+static int filter_pred_and(struct filter_pred *pred __attribute((unused)),
+ void *event __attribute((unused)),
+ int val1, int val2)
+{
+ return val1 && val2;
+}
+
+static int filter_pred_or(struct filter_pred *pred __attribute((unused)),
+ void *event __attribute((unused)),
+ int val1, int val2)
+{
+ return val1 || val2;
+}
+
+/* Filter predicate for fixed sized arrays of characters */
+static int filter_pred_string(struct filter_pred *pred, void *event,
+ int val1, int val2)
+{
+ char *addr = (char *)(event + pred->offset);
+ int cmp, match;
- match = (val == *addr) ^ pred->not;
+ cmp = strncmp(addr, pred->str_val, pred->str_len);
+
+ match = (!cmp) ^ pred->not;
return match;
}
-static int filter_pred_32(struct filter_pred *pred, void *event)
+/*
+ * Filter predicate for dynamic sized arrays of characters.
+ * These are implemented through a list of strings at the end
+ * of the entry.
+ * Also each of these strings have a field in the entry which
+ * contains its offset from the beginning of the entry.
+ * We have then first to get this field, dereference it
+ * and add it to the address of the entry, and at last we have
+ * the address of the string.
+ */
+static int filter_pred_strloc(struct filter_pred *pred, void *event,
+ int val1, int val2)
{
- u32 *addr = (u32 *)(event + pred->offset);
- u32 val = (u32)pred->val;
- int match;
+ unsigned short str_loc = *(unsigned short *)(event + pred->offset);
+ char *addr = (char *)(event + str_loc);
+ int cmp, match;
- match = (val == *addr) ^ pred->not;
+ cmp = strncmp(addr, pred->str_val, pred->str_len);
+
+ match = (!cmp) ^ pred->not;
return match;
}
-static int filter_pred_16(struct filter_pred *pred, void *event)
+static int filter_pred_none(struct filter_pred *pred, void *event,
+ int val1, int val2)
+{
+ return 0;
+}
+
+/* return 1 if event matches, 0 otherwise (discard) */
+int filter_match_preds(struct ftrace_event_call *call, void *rec)
{
- u16 *addr = (u16 *)(event + pred->offset);
- u16 val = (u16)pred->val;
- int match;
+ struct event_filter *filter = call->filter;
+ int match, top = 0, val1 = 0, val2 = 0;
+ int stack[MAX_FILTER_PRED];
+ struct filter_pred *pred;
+ int i;
- match = (val == *addr) ^ pred->not;
+ for (i = 0; i < filter->n_preds; i++) {
+ pred = filter->preds[i];
+ if (!pred->pop_n) {
+ match = pred->fn(pred, rec, val1, val2);
+ stack[top++] = match;
+ continue;
+ }
+ if (pred->pop_n > top) {
+ WARN_ON_ONCE(1);
+ return 0;
+ }
+ val1 = stack[--top];
+ val2 = stack[--top];
+ match = pred->fn(pred, rec, val1, val2);
+ stack[top++] = match;
+ }
- return match;
+ return stack[--top];
}
+EXPORT_SYMBOL_GPL(filter_match_preds);
-static int filter_pred_8(struct filter_pred *pred, void *event)
+static void parse_error(struct filter_parse_state *ps, int err, int pos)
{
- u8 *addr = (u8 *)(event + pred->offset);
- u8 val = (u8)pred->val;
- int match;
+ ps->lasterr = err;
+ ps->lasterr_pos = pos;
+}
- match = (val == *addr) ^ pred->not;
+static void remove_filter_string(struct event_filter *filter)
+{
+ kfree(filter->filter_string);
+ filter->filter_string = NULL;
+}
- return match;
+static int replace_filter_string(struct event_filter *filter,
+ char *filter_string)
+{
+ kfree(filter->filter_string);
+ filter->filter_string = kstrdup(filter_string, GFP_KERNEL);
+ if (!filter->filter_string)
+ return -ENOMEM;
+
+ return 0;
}
-static int filter_pred_string(struct filter_pred *pred, void *event)
+static int append_filter_string(struct event_filter *filter,
+ char *string)
{
- char *addr = (char *)(event + pred->offset);
- int cmp, match;
+ int newlen;
+ char *new_filter_string;
- cmp = strncmp(addr, pred->str_val, pred->str_len);
+ BUG_ON(!filter->filter_string);
+ newlen = strlen(filter->filter_string) + strlen(string) + 1;
+ new_filter_string = kmalloc(newlen, GFP_KERNEL);
+ if (!new_filter_string)
+ return -ENOMEM;
- match = (!cmp) ^ pred->not;
+ strcpy(new_filter_string, filter->filter_string);
+ strcat(new_filter_string, string);
+ kfree(filter->filter_string);
+ filter->filter_string = new_filter_string;
- return match;
+ return 0;
}
-/* return 1 if event matches, 0 otherwise (discard) */
-int filter_match_preds(struct ftrace_event_call *call, void *rec)
+static void append_filter_err(struct filter_parse_state *ps,
+ struct event_filter *filter)
{
- int i, matched, and_failed = 0;
- struct filter_pred *pred;
+ int pos = ps->lasterr_pos;
+ char *buf, *pbuf;
- for (i = 0; i < MAX_FILTER_PRED; i++) {
- if (call->preds[i]) {
- pred = call->preds[i];
- if (and_failed && !pred->or)
- continue;
- matched = pred->fn(pred, rec);
- if (!matched && !pred->or) {
- and_failed = 1;
- continue;
- } else if (matched && pred->or)
- return 1;
- } else
- break;
- }
+ buf = (char *)__get_free_page(GFP_TEMPORARY);
+ if (!buf)
+ return;
- if (and_failed)
- return 0;
+ append_filter_string(filter, "\n");
+ memset(buf, ' ', PAGE_SIZE);
+ if (pos > PAGE_SIZE - 128)
+ pos = 0;
+ buf[pos] = '^';
+ pbuf = &buf[pos] + 1;
- return 1;
+ sprintf(pbuf, "\nparse_error: %s\n", err_text[ps->lasterr]);
+ append_filter_string(filter, buf);
+ free_page((unsigned long) buf);
}
-void filter_print_preds(struct filter_pred **preds, struct trace_seq *s)
+void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
{
- char *field_name;
- struct filter_pred *pred;
- int i;
+ struct event_filter *filter = call->filter;
- if (!preds) {
+ mutex_lock(&event_mutex);
+ if (filter->filter_string)
+ trace_seq_printf(s, "%s\n", filter->filter_string);
+ else
trace_seq_printf(s, "none\n");
- return;
- }
+ mutex_unlock(&event_mutex);
+}
- for (i = 0; i < MAX_FILTER_PRED; i++) {
- if (preds[i]) {
- pred = preds[i];
- field_name = pred->field_name;
- if (i)
- trace_seq_printf(s, pred->or ? "|| " : "&& ");
- trace_seq_printf(s, "%s ", field_name);
- trace_seq_printf(s, pred->not ? "!= " : "== ");
- if (pred->str_val)
- trace_seq_printf(s, "%s\n", pred->str_val);
- else
- trace_seq_printf(s, "%llu\n", pred->val);
- } else
- break;
- }
+void print_subsystem_event_filter(struct event_subsystem *system,
+ struct trace_seq *s)
+{
+ struct event_filter *filter = system->filter;
+
+ mutex_lock(&event_mutex);
+ if (filter->filter_string)
+ trace_seq_printf(s, "%s\n", filter->filter_string);
+ else
+ trace_seq_printf(s, "none\n");
+ mutex_unlock(&event_mutex);
}
static struct ftrace_event_field *
@@ -150,284 +326,839 @@ find_event_field(struct ftrace_event_call *call, char *name)
return NULL;
}
-void filter_free_pred(struct filter_pred *pred)
+static void filter_free_pred(struct filter_pred *pred)
{
if (!pred)
return;
kfree(pred->field_name);
- kfree(pred->str_val);
kfree(pred);
}
-void filter_free_preds(struct ftrace_event_call *call)
+static void filter_clear_pred(struct filter_pred *pred)
{
- int i;
+ kfree(pred->field_name);
+ pred->field_name = NULL;
+ pred->str_len = 0;
+}
- if (call->preds) {
- for (i = 0; i < MAX_FILTER_PRED; i++)
- filter_free_pred(call->preds[i]);
- kfree(call->preds);
- call->preds = NULL;
+static int filter_set_pred(struct filter_pred *dest,
+ struct filter_pred *src,
+ filter_pred_fn_t fn)
+{
+ *dest = *src;
+ if (src->field_name) {
+ dest->field_name = kstrdup(src->field_name, GFP_KERNEL);
+ if (!dest->field_name)
+ return -ENOMEM;
}
+ dest->fn = fn;
+
+ return 0;
}
-void filter_free_subsystem_preds(struct event_subsystem *system)
+static void filter_disable_preds(struct ftrace_event_call *call)
{
- struct ftrace_event_call *call = __start_ftrace_events;
+ struct event_filter *filter = call->filter;
int i;
- if (system->preds) {
- for (i = 0; i < MAX_FILTER_PRED; i++)
- filter_free_pred(system->preds[i]);
- kfree(system->preds);
- system->preds = NULL;
- }
+ call->filter_active = 0;
+ filter->n_preds = 0;
- events_for_each(call) {
- if (!call->name || !call->regfunc)
- continue;
+ for (i = 0; i < MAX_FILTER_PRED; i++)
+ filter->preds[i]->fn = filter_pred_none;
+}
+
+void destroy_preds(struct ftrace_event_call *call)
+{
+ struct event_filter *filter = call->filter;
+ int i;
- if (!strcmp(call->system, system->name))
- filter_free_preds(call);
+ for (i = 0; i < MAX_FILTER_PRED; i++) {
+ if (filter->preds[i])
+ filter_free_pred(filter->preds[i]);
}
+ kfree(filter->preds);
+ kfree(filter->filter_string);
+ kfree(filter);
+ call->filter = NULL;
}
-static int __filter_add_pred(struct ftrace_event_call *call,
- struct filter_pred *pred)
+int init_preds(struct ftrace_event_call *call)
{
+ struct event_filter *filter;
+ struct filter_pred *pred;
int i;
- if (call->preds && !pred->compound)
- filter_free_preds(call);
+ filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+ if (!call->filter)
+ return -ENOMEM;
- if (!call->preds) {
- call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
- GFP_KERNEL);
- if (!call->preds)
- return -ENOMEM;
- }
+ call->filter_active = 0;
+ filter->n_preds = 0;
+
+ filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL);
+ if (!filter->preds)
+ goto oom;
for (i = 0; i < MAX_FILTER_PRED; i++) {
- if (!call->preds[i]) {
- call->preds[i] = pred;
- return 0;
+ pred = kzalloc(sizeof(*pred), GFP_KERNEL);
+ if (!pred)
+ goto oom;
+ pred->fn = filter_pred_none;
+ filter->preds[i] = pred;
+ }
+
+ return 0;
+
+oom:
+ destroy_preds(call);
+
+ return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(init_preds);
+
+static void filter_free_subsystem_preds(struct event_subsystem *system)
+{
+ struct event_filter *filter = system->filter;
+ struct ftrace_event_call *call;
+ int i;
+
+ if (filter->n_preds) {
+ for (i = 0; i < filter->n_preds; i++)
+ filter_free_pred(filter->preds[i]);
+ kfree(filter->preds);
+ filter->preds = NULL;
+ filter->n_preds = 0;
+ }
+
+ list_for_each_entry(call, &ftrace_events, list) {
+ if (!call->define_fields)
+ continue;
+
+ if (!strcmp(call->system, system->name)) {
+ filter_disable_preds(call);
+ remove_filter_string(call->filter);
}
}
+}
+
+static int filter_add_pred_fn(struct filter_parse_state *ps,
+ struct ftrace_event_call *call,
+ struct filter_pred *pred,
+ filter_pred_fn_t fn)
+{
+ struct event_filter *filter = call->filter;
+ int idx, err;
+
+ if (filter->n_preds == MAX_FILTER_PRED) {
+ parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
+ return -ENOSPC;
+ }
- return -ENOSPC;
+ idx = filter->n_preds;
+ filter_clear_pred(filter->preds[idx]);
+ err = filter_set_pred(filter->preds[idx], pred, fn);
+ if (err)
+ return err;
+
+ filter->n_preds++;
+ call->filter_active = 1;
+
+ return 0;
}
+enum {
+ FILTER_STATIC_STRING = 1,
+ FILTER_DYN_STRING
+};
+
static int is_string_field(const char *type)
{
+ if (strstr(type, "__data_loc") && strstr(type, "char"))
+ return FILTER_DYN_STRING;
+
if (strchr(type, '[') && strstr(type, "char"))
- return 1;
+ return FILTER_STATIC_STRING;
return 0;
}
-int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred)
+static int is_legal_op(struct ftrace_event_field *field, int op)
+{
+ if (is_string_field(field->type) && (op != OP_EQ && op != OP_NE))
+ return 0;
+
+ return 1;
+}
+
+static filter_pred_fn_t select_comparison_fn(int op, int field_size,
+ int field_is_signed)
+{
+ filter_pred_fn_t fn = NULL;
+
+ switch (field_size) {
+ case 8:
+ if (op == OP_EQ || op == OP_NE)
+ fn = filter_pred_64;
+ else if (field_is_signed)
+ fn = filter_pred_s64;
+ else
+ fn = filter_pred_u64;
+ break;
+ case 4:
+ if (op == OP_EQ || op == OP_NE)
+ fn = filter_pred_32;
+ else if (field_is_signed)
+ fn = filter_pred_s32;
+ else
+ fn = filter_pred_u32;
+ break;
+ case 2:
+ if (op == OP_EQ || op == OP_NE)
+ fn = filter_pred_16;
+ else if (field_is_signed)
+ fn = filter_pred_s16;
+ else
+ fn = filter_pred_u16;
+ break;
+ case 1:
+ if (op == OP_EQ || op == OP_NE)
+ fn = filter_pred_8;
+ else if (field_is_signed)
+ fn = filter_pred_s8;
+ else
+ fn = filter_pred_u8;
+ break;
+ }
+
+ return fn;
+}
+
+static int filter_add_pred(struct filter_parse_state *ps,
+ struct ftrace_event_call *call,
+ struct filter_pred *pred)
{
struct ftrace_event_field *field;
+ filter_pred_fn_t fn;
+ unsigned long long val;
+ int string_type;
+ int ret;
+
+ pred->fn = filter_pred_none;
+
+ if (pred->op == OP_AND) {
+ pred->pop_n = 2;
+ return filter_add_pred_fn(ps, call, pred, filter_pred_and);
+ } else if (pred->op == OP_OR) {
+ pred->pop_n = 2;
+ return filter_add_pred_fn(ps, call, pred, filter_pred_or);
+ }
field = find_event_field(call, pred->field_name);
- if (!field)
+ if (!field) {
+ parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
return -EINVAL;
+ }
pred->offset = field->offset;
- if (is_string_field(field->type)) {
- if (!pred->str_val)
- return -EINVAL;
- pred->fn = filter_pred_string;
+ if (!is_legal_op(field, pred->op)) {
+ parse_error(ps, FILT_ERR_ILLEGAL_FIELD_OP, 0);
+ return -EINVAL;
+ }
+
+ string_type = is_string_field(field->type);
+ if (string_type) {
+ if (string_type == FILTER_STATIC_STRING)
+ fn = filter_pred_string;
+ else
+ fn = filter_pred_strloc;
pred->str_len = field->size;
- return __filter_add_pred(call, pred);
+ if (pred->op == OP_NE)
+ pred->not = 1;
+ return filter_add_pred_fn(ps, call, pred, fn);
} else {
- if (pred->str_val)
+ if (field->is_signed)
+ ret = strict_strtoll(pred->str_val, 0, &val);
+ else
+ ret = strict_strtoull(pred->str_val, 0, &val);
+ if (ret) {
+ parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0);
return -EINVAL;
+ }
+ pred->val = val;
}
- switch (field->size) {
- case 8:
- pred->fn = filter_pred_64;
- break;
- case 4:
- pred->fn = filter_pred_32;
- break;
- case 2:
- pred->fn = filter_pred_16;
- break;
- case 1:
- pred->fn = filter_pred_8;
- break;
- default:
+ fn = select_comparison_fn(pred->op, field->size, field->is_signed);
+ if (!fn) {
+ parse_error(ps, FILT_ERR_INVALID_OP, 0);
return -EINVAL;
}
- return __filter_add_pred(call, pred);
+ if (pred->op == OP_NE)
+ pred->not = 1;
+
+ return filter_add_pred_fn(ps, call, pred, fn);
}
-static struct filter_pred *copy_pred(struct filter_pred *pred)
+static int filter_add_subsystem_pred(struct filter_parse_state *ps,
+ struct event_subsystem *system,
+ struct filter_pred *pred,
+ char *filter_string)
{
- struct filter_pred *new_pred = kmalloc(sizeof(*pred), GFP_KERNEL);
- if (!new_pred)
- return NULL;
+ struct event_filter *filter = system->filter;
+ struct ftrace_event_call *call;
+ int err = 0;
- memcpy(new_pred, pred, sizeof(*pred));
+ if (!filter->preds) {
+ filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
+ GFP_KERNEL);
- if (pred->field_name) {
- new_pred->field_name = kstrdup(pred->field_name, GFP_KERNEL);
- if (!new_pred->field_name) {
- kfree(new_pred);
- return NULL;
- }
+ if (!filter->preds)
+ return -ENOMEM;
}
- if (pred->str_val) {
- new_pred->str_val = kstrdup(pred->str_val, GFP_KERNEL);
- if (!new_pred->str_val) {
- filter_free_pred(new_pred);
- return NULL;
+ if (filter->n_preds == MAX_FILTER_PRED) {
+ parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
+ return -ENOSPC;
+ }
+
+ list_for_each_entry(call, &ftrace_events, list) {
+
+ if (!call->define_fields)
+ continue;
+
+ if (strcmp(call->system, system->name))
+ continue;
+
+ err = filter_add_pred(ps, call, pred);
+ if (err) {
+ filter_free_subsystem_preds(system);
+ parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
+ goto out;
}
+ replace_filter_string(call->filter, filter_string);
}
- return new_pred;
+ filter->preds[filter->n_preds] = pred;
+ filter->n_preds++;
+out:
+ return err;
}
-int filter_add_subsystem_pred(struct event_subsystem *system,
- struct filter_pred *pred)
+static void parse_init(struct filter_parse_state *ps,
+ struct filter_op *ops,
+ char *infix_string)
{
- struct ftrace_event_call *call = __start_ftrace_events;
- struct filter_pred *event_pred;
- int i;
+ memset(ps, '\0', sizeof(*ps));
- if (system->preds && !pred->compound)
- filter_free_subsystem_preds(system);
+ ps->infix.string = infix_string;
+ ps->infix.cnt = strlen(infix_string);
+ ps->ops = ops;
- if (!system->preds) {
- system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
- GFP_KERNEL);
- if (!system->preds)
- return -ENOMEM;
+ INIT_LIST_HEAD(&ps->opstack);
+ INIT_LIST_HEAD(&ps->postfix);
+}
+
+static char infix_next(struct filter_parse_state *ps)
+{
+ ps->infix.cnt--;
+
+ return ps->infix.string[ps->infix.tail++];
+}
+
+static char infix_peek(struct filter_parse_state *ps)
+{
+ if (ps->infix.tail == strlen(ps->infix.string))
+ return 0;
+
+ return ps->infix.string[ps->infix.tail];
+}
+
+static void infix_advance(struct filter_parse_state *ps)
+{
+ ps->infix.cnt--;
+ ps->infix.tail++;
+}
+
+static inline int is_precedence_lower(struct filter_parse_state *ps,
+ int a, int b)
+{
+ return ps->ops[a].precedence < ps->ops[b].precedence;
+}
+
+static inline int is_op_char(struct filter_parse_state *ps, char c)
+{
+ int i;
+
+ for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
+ if (ps->ops[i].string[0] == c)
+ return 1;
}
- for (i = 0; i < MAX_FILTER_PRED; i++) {
- if (!system->preds[i]) {
- system->preds[i] = pred;
- break;
+ return 0;
+}
+
+static int infix_get_op(struct filter_parse_state *ps, char firstc)
+{
+ char nextc = infix_peek(ps);
+ char opstr[3];
+ int i;
+
+ opstr[0] = firstc;
+ opstr[1] = nextc;
+ opstr[2] = '\0';
+
+ for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
+ if (!strcmp(opstr, ps->ops[i].string)) {
+ infix_advance(ps);
+ return ps->ops[i].id;
}
}
- if (i == MAX_FILTER_PRED)
- return -ENOSPC;
+ opstr[1] = '\0';
+
+ for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
+ if (!strcmp(opstr, ps->ops[i].string))
+ return ps->ops[i].id;
+ }
+
+ return OP_NONE;
+}
+
+static inline void clear_operand_string(struct filter_parse_state *ps)
+{
+ memset(ps->operand.string, '\0', MAX_FILTER_STR_VAL);
+ ps->operand.tail = 0;
+}
+
+static inline int append_operand_char(struct filter_parse_state *ps, char c)
+{
+ if (ps->operand.tail == MAX_FILTER_STR_VAL - 1)
+ return -EINVAL;
+
+ ps->operand.string[ps->operand.tail++] = c;
+
+ return 0;
+}
+
+static int filter_opstack_push(struct filter_parse_state *ps, int op)
+{
+ struct opstack_op *opstack_op;
- events_for_each(call) {
- int err;
+ opstack_op = kmalloc(sizeof(*opstack_op), GFP_KERNEL);
+ if (!opstack_op)
+ return -ENOMEM;
+
+ opstack_op->op = op;
+ list_add(&opstack_op->list, &ps->opstack);
+
+ return 0;
+}
+
+static int filter_opstack_empty(struct filter_parse_state *ps)
+{
+ return list_empty(&ps->opstack);
+}
+
+static int filter_opstack_top(struct filter_parse_state *ps)
+{
+ struct opstack_op *opstack_op;
+
+ if (filter_opstack_empty(ps))
+ return OP_NONE;
+
+ opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list);
+
+ return opstack_op->op;
+}
+
+static int filter_opstack_pop(struct filter_parse_state *ps)
+{
+ struct opstack_op *opstack_op;
+ int op;
+
+ if (filter_opstack_empty(ps))
+ return OP_NONE;
+
+ opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list);
+ op = opstack_op->op;
+ list_del(&opstack_op->list);
+
+ kfree(opstack_op);
+
+ return op;
+}
+
+static void filter_opstack_clear(struct filter_parse_state *ps)
+{
+ while (!filter_opstack_empty(ps))
+ filter_opstack_pop(ps);
+}
- if (!call->name || !call->regfunc)
+static char *curr_operand(struct filter_parse_state *ps)
+{
+ return ps->operand.string;
+}
+
+static int postfix_append_operand(struct filter_parse_state *ps, char *operand)
+{
+ struct postfix_elt *elt;
+
+ elt = kmalloc(sizeof(*elt), GFP_KERNEL);
+ if (!elt)
+ return -ENOMEM;
+
+ elt->op = OP_NONE;
+ elt->operand = kstrdup(operand, GFP_KERNEL);
+ if (!elt->operand) {
+ kfree(elt);
+ return -ENOMEM;
+ }
+
+ list_add_tail(&elt->list, &ps->postfix);
+
+ return 0;
+}
+
+static int postfix_append_op(struct filter_parse_state *ps, int op)
+{
+ struct postfix_elt *elt;
+
+ elt = kmalloc(sizeof(*elt), GFP_KERNEL);
+ if (!elt)
+ return -ENOMEM;
+
+ elt->op = op;
+ elt->operand = NULL;
+
+ list_add_tail(&elt->list, &ps->postfix);
+
+ return 0;
+}
+
+static void postfix_clear(struct filter_parse_state *ps)
+{
+ struct postfix_elt *elt;
+
+ while (!list_empty(&ps->postfix)) {
+ elt = list_first_entry(&ps->postfix, struct postfix_elt, list);
+ kfree(elt->operand);
+ list_del(&elt->list);
+ }
+}
+
+static int filter_parse(struct filter_parse_state *ps)
+{
+ int in_string = 0;
+ int op, top_op;
+ char ch;
+
+ while ((ch = infix_next(ps))) {
+ if (ch == '"') {
+ in_string ^= 1;
continue;
+ }
- if (strcmp(call->system, system->name))
+ if (in_string)
+ goto parse_operand;
+
+ if (isspace(ch))
continue;
- if (!find_event_field(call, pred->field_name))
+ if (is_op_char(ps, ch)) {
+ op = infix_get_op(ps, ch);
+ if (op == OP_NONE) {
+ parse_error(ps, FILT_ERR_INVALID_OP, 0);
+ return -EINVAL;
+ }
+
+ if (strlen(curr_operand(ps))) {
+ postfix_append_operand(ps, curr_operand(ps));
+ clear_operand_string(ps);
+ }
+
+ while (!filter_opstack_empty(ps)) {
+ top_op = filter_opstack_top(ps);
+ if (!is_precedence_lower(ps, top_op, op)) {
+ top_op = filter_opstack_pop(ps);
+ postfix_append_op(ps, top_op);
+ continue;
+ }
+ break;
+ }
+
+ filter_opstack_push(ps, op);
continue;
+ }
- event_pred = copy_pred(pred);
- if (!event_pred)
- goto oom;
+ if (ch == '(') {
+ filter_opstack_push(ps, OP_OPEN_PAREN);
+ continue;
+ }
- err = filter_add_pred(call, event_pred);
- if (err)
- filter_free_pred(event_pred);
- if (err == -ENOMEM)
- goto oom;
+ if (ch == ')') {
+ if (strlen(curr_operand(ps))) {
+ postfix_append_operand(ps, curr_operand(ps));
+ clear_operand_string(ps);
+ }
+
+ top_op = filter_opstack_pop(ps);
+ while (top_op != OP_NONE) {
+ if (top_op == OP_OPEN_PAREN)
+ break;
+ postfix_append_op(ps, top_op);
+ top_op = filter_opstack_pop(ps);
+ }
+ if (top_op == OP_NONE) {
+ parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0);
+ return -EINVAL;
+ }
+ continue;
+ }
+parse_operand:
+ if (append_operand_char(ps, ch)) {
+ parse_error(ps, FILT_ERR_OPERAND_TOO_LONG, 0);
+ return -EINVAL;
+ }
+ }
+
+ if (strlen(curr_operand(ps)))
+ postfix_append_operand(ps, curr_operand(ps));
+
+ while (!filter_opstack_empty(ps)) {
+ top_op = filter_opstack_pop(ps);
+ if (top_op == OP_NONE)
+ break;
+ if (top_op == OP_OPEN_PAREN) {
+ parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0);
+ return -EINVAL;
+ }
+ postfix_append_op(ps, top_op);
}
return 0;
+}
-oom:
- system->preds[i] = NULL;
- return -ENOMEM;
+static struct filter_pred *create_pred(int op, char *operand1, char *operand2)
+{
+ struct filter_pred *pred;
+
+ pred = kzalloc(sizeof(*pred), GFP_KERNEL);
+ if (!pred)
+ return NULL;
+
+ pred->field_name = kstrdup(operand1, GFP_KERNEL);
+ if (!pred->field_name) {
+ kfree(pred);
+ return NULL;
+ }
+
+ strcpy(pred->str_val, operand2);
+ pred->str_len = strlen(operand2);
+
+ pred->op = op;
+
+ return pred;
}
-int filter_parse(char **pbuf, struct filter_pred *pred)
-{
- char *tmp, *tok, *val_str = NULL;
- int tok_n = 0;
-
- /* field ==/!= number, or/and field ==/!= number, number */
- while ((tok = strsep(pbuf, " \n"))) {
- if (tok_n == 0) {
- if (!strcmp(tok, "0")) {
- pred->clear = 1;
- return 0;
- } else if (!strcmp(tok, "&&")) {
- pred->or = 0;
- pred->compound = 1;
- } else if (!strcmp(tok, "||")) {
- pred->or = 1;
- pred->compound = 1;
- } else
- pred->field_name = tok;
- tok_n = 1;
+static struct filter_pred *create_logical_pred(int op)
+{
+ struct filter_pred *pred;
+
+ pred = kzalloc(sizeof(*pred), GFP_KERNEL);
+ if (!pred)
+ return NULL;
+
+ pred->op = op;
+
+ return pred;
+}
+
+static int check_preds(struct filter_parse_state *ps)
+{
+ int n_normal_preds = 0, n_logical_preds = 0;
+ struct postfix_elt *elt;
+
+ list_for_each_entry(elt, &ps->postfix, list) {
+ if (elt->op == OP_NONE)
+ continue;
+
+ if (elt->op == OP_AND || elt->op == OP_OR) {
+ n_logical_preds++;
continue;
}
- if (tok_n == 1) {
- if (!pred->field_name)
- pred->field_name = tok;
- else if (!strcmp(tok, "!="))
- pred->not = 1;
- else if (!strcmp(tok, "=="))
- pred->not = 0;
+ n_normal_preds++;
+ }
+
+ if (!n_normal_preds || n_logical_preds >= n_normal_preds) {
+ parse_error(ps, FILT_ERR_INVALID_FILTER, 0);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int replace_preds(struct event_subsystem *system,
+ struct ftrace_event_call *call,
+ struct filter_parse_state *ps,
+ char *filter_string)
+{
+ char *operand1 = NULL, *operand2 = NULL;
+ struct filter_pred *pred;
+ struct postfix_elt *elt;
+ int err;
+
+ err = check_preds(ps);
+ if (err)
+ return err;
+
+ list_for_each_entry(elt, &ps->postfix, list) {
+ if (elt->op == OP_NONE) {
+ if (!operand1)
+ operand1 = elt->operand;
+ else if (!operand2)
+ operand2 = elt->operand;
else {
- pred->field_name = NULL;
+ parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0);
return -EINVAL;
}
- tok_n = 2;
continue;
}
- if (tok_n == 2) {
- if (pred->compound) {
- if (!strcmp(tok, "!="))
- pred->not = 1;
- else if (!strcmp(tok, "=="))
- pred->not = 0;
- else {
- pred->field_name = NULL;
- return -EINVAL;
- }
+
+ if (elt->op == OP_AND || elt->op == OP_OR) {
+ pred = create_logical_pred(elt->op);
+ if (!pred)
+ return -ENOMEM;
+ if (call) {
+ err = filter_add_pred(ps, call, pred);
+ filter_free_pred(pred);
} else {
- val_str = tok;
- break; /* done */
+ err = filter_add_subsystem_pred(ps, system,
+ pred, filter_string);
+ if (err)
+ filter_free_pred(pred);
}
- tok_n = 3;
+ if (err)
+ return err;
+
+ operand1 = operand2 = NULL;
continue;
}
- if (tok_n == 3) {
- val_str = tok;
- break; /* done */
+
+ if (!operand1 || !operand2) {
+ parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
+ return -EINVAL;
+ }
+
+ pred = create_pred(elt->op, operand1, operand2);
+ if (!pred)
+ return -ENOMEM;
+ if (call) {
+ err = filter_add_pred(ps, call, pred);
+ filter_free_pred(pred);
+ } else {
+ err = filter_add_subsystem_pred(ps, system, pred,
+ filter_string);
+ if (err)
+ filter_free_pred(pred);
}
+ if (err)
+ return err;
+
+ operand1 = operand2 = NULL;
}
- if (!val_str) {
- pred->field_name = NULL;
- return -EINVAL;
+ return 0;
+}
+
+int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
+{
+ int err;
+
+ struct filter_parse_state *ps;
+
+ mutex_lock(&event_mutex);
+
+ if (!strcmp(strstrip(filter_string), "0")) {
+ filter_disable_preds(call);
+ remove_filter_string(call->filter);
+ mutex_unlock(&event_mutex);
+ return 0;
}
- pred->field_name = kstrdup(pred->field_name, GFP_KERNEL);
- if (!pred->field_name)
- return -ENOMEM;
+ err = -ENOMEM;
+ ps = kzalloc(sizeof(*ps), GFP_KERNEL);
+ if (!ps)
+ goto out_unlock;
- pred->val = simple_strtoull(val_str, &tmp, 0);
- if (tmp == val_str) {
- pred->str_val = kstrdup(val_str, GFP_KERNEL);
- if (!pred->str_val)
- return -ENOMEM;
- } else if (*tmp != '\0')
- return -EINVAL;
+ filter_disable_preds(call);
+ replace_filter_string(call->filter, filter_string);
- return 0;
+ parse_init(ps, filter_ops, filter_string);
+ err = filter_parse(ps);
+ if (err) {
+ append_filter_err(ps, call->filter);
+ goto out;
+ }
+
+ err = replace_preds(NULL, call, ps, filter_string);
+ if (err)
+ append_filter_err(ps, call->filter);
+
+out:
+ filter_opstack_clear(ps);
+ postfix_clear(ps);
+ kfree(ps);
+out_unlock:
+ mutex_unlock(&event_mutex);
+
+ return err;
}
+int apply_subsystem_event_filter(struct event_subsystem *system,
+ char *filter_string)
+{
+ int err;
+
+ struct filter_parse_state *ps;
+
+ mutex_lock(&event_mutex);
+
+ if (!strcmp(strstrip(filter_string), "0")) {
+ filter_free_subsystem_preds(system);
+ remove_filter_string(system->filter);
+ mutex_unlock(&event_mutex);
+ return 0;
+ }
+
+ err = -ENOMEM;
+ ps = kzalloc(sizeof(*ps), GFP_KERNEL);
+ if (!ps)
+ goto out_unlock;
+
+ filter_free_subsystem_preds(system);
+ replace_filter_string(system->filter, filter_string);
+
+ parse_init(ps, filter_ops, filter_string);
+ err = filter_parse(ps);
+ if (err) {
+ append_filter_err(ps, system->filter);
+ goto out;
+ }
+
+ err = replace_preds(system, NULL, ps, filter_string);
+ if (err)
+ append_filter_err(ps, system->filter);
+
+out:
+ filter_opstack_clear(ps);
+ postfix_clear(ps);
+ kfree(ps);
+out_unlock:
+ mutex_unlock(&event_mutex);
+
+ return err;
+}
diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h
deleted file mode 100644
index 38985f9b379..00000000000
--- a/kernel/trace/trace_events_stage_1.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Stage 1 of the trace events.
- *
- * Override the macros in <trace/trace_event_types.h> to include the following:
- *
- * struct ftrace_raw_<call> {
- * struct trace_entry ent;
- * <type> <item>;
- * <type2> <item2>[<len>];
- * [...]
- * };
- *
- * The <type> <item> is created by the __field(type, item) macro or
- * the __array(type2, item2, len) macro.
- * We simply do "type item;", and that will create the fields
- * in the structure.
- */
-
-#undef TRACE_FORMAT
-#define TRACE_FORMAT(call, proto, args, fmt)
-
-#undef __array
-#define __array(type, item, len) type item[len];
-
-#undef __field
-#define __field(type, item) type item;
-
-#undef TP_STRUCT__entry
-#define TP_STRUCT__entry(args...) args
-
-#undef TRACE_EVENT
-#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \
- struct ftrace_raw_##name { \
- struct trace_entry ent; \
- tstruct \
- }; \
- static struct ftrace_event_call event_##name
-
-#include <trace/trace_event_types.h>
diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
deleted file mode 100644
index d363c6672c6..00000000000
--- a/kernel/trace/trace_events_stage_2.h
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * Stage 2 of the trace events.
- *
- * Override the macros in <trace/trace_event_types.h> to include the following:
- *
- * enum print_line_t
- * ftrace_raw_output_<call>(struct trace_iterator *iter, int flags)
- * {
- * struct trace_seq *s = &iter->seq;
- * struct ftrace_raw_<call> *field; <-- defined in stage 1
- * struct trace_entry *entry;
- * int ret;
- *
- * entry = iter->ent;
- *
- * if (entry->type != event_<call>.id) {
- * WARN_ON_ONCE(1);
- * return TRACE_TYPE_UNHANDLED;
- * }
- *
- * field = (typeof(field))entry;
- *
- * ret = trace_seq_printf(s, <TP_printk> "\n");
- * if (!ret)
- * return TRACE_TYPE_PARTIAL_LINE;
- *
- * return TRACE_TYPE_HANDLED;
- * }
- *
- * This is the method used to print the raw event to the trace
- * output format. Note, this is not needed if the data is read
- * in binary.
- */
-
-#undef __entry
-#define __entry field
-
-#undef TP_printk
-#define TP_printk(fmt, args...) fmt "\n", args
-
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \
-enum print_line_t \
-ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \
-{ \
- struct trace_seq *s = &iter->seq; \
- struct ftrace_raw_##call *field; \
- struct trace_entry *entry; \
- int ret; \
- \
- entry = iter->ent; \
- \
- if (entry->type != event_##call.id) { \
- WARN_ON_ONCE(1); \
- return TRACE_TYPE_UNHANDLED; \
- } \
- \
- field = (typeof(field))entry; \
- \
- ret = trace_seq_printf(s, #call ": " print); \
- if (!ret) \
- return TRACE_TYPE_PARTIAL_LINE; \
- \
- return TRACE_TYPE_HANDLED; \
-}
-
-#include <trace/trace_event_types.h>
-
-/*
- * Setup the showing format of trace point.
- *
- * int
- * ftrace_format_##call(struct trace_seq *s)
- * {
- * struct ftrace_raw_##call field;
- * int ret;
- *
- * ret = trace_seq_printf(s, #type " " #item ";"
- * " offset:%u; size:%u;\n",
- * offsetof(struct ftrace_raw_##call, item),
- * sizeof(field.type));
- *
- * }
- */
-
-#undef TP_STRUCT__entry
-#define TP_STRUCT__entry(args...) args
-
-#undef __field
-#define __field(type, item) \
- ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
- "offset:%u;\tsize:%u;\n", \
- (unsigned int)offsetof(typeof(field), item), \
- (unsigned int)sizeof(field.item)); \
- if (!ret) \
- return 0;
-
-#undef __array
-#define __array(type, item, len) \
- ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
- "offset:%u;\tsize:%u;\n", \
- (unsigned int)offsetof(typeof(field), item), \
- (unsigned int)sizeof(field.item)); \
- if (!ret) \
- return 0;
-
-#undef __entry
-#define __entry REC
-
-#undef TP_printk
-#define TP_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
-
-#undef TP_fast_assign
-#define TP_fast_assign(args...) args
-
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, func, print) \
-static int \
-ftrace_format_##call(struct trace_seq *s) \
-{ \
- struct ftrace_raw_##call field; \
- int ret; \
- \
- tstruct; \
- \
- trace_seq_printf(s, "\nprint fmt: " print); \
- \
- return ret; \
-}
-
-#include <trace/trace_event_types.h>
-
-#undef __field
-#define __field(type, item) \
- ret = trace_define_field(event_call, #type, #item, \
- offsetof(typeof(field), item), \
- sizeof(field.item)); \
- if (ret) \
- return ret;
-
-#undef __array
-#define __array(type, item, len) \
- ret = trace_define_field(event_call, #type "[" #len "]", #item, \
- offsetof(typeof(field), item), \
- sizeof(field.item)); \
- if (ret) \
- return ret;
-
-#define __common_field(type, item) \
- ret = trace_define_field(event_call, #type, "common_" #item, \
- offsetof(typeof(field.ent), item), \
- sizeof(field.ent.item)); \
- if (ret) \
- return ret;
-
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, func, print) \
-int \
-ftrace_define_fields_##call(void) \
-{ \
- struct ftrace_raw_##call field; \
- struct ftrace_event_call *event_call = &event_##call; \
- int ret; \
- \
- __common_field(unsigned char, type); \
- __common_field(unsigned char, flags); \
- __common_field(unsigned char, preempt_count); \
- __common_field(int, pid); \
- __common_field(int, tgid); \
- \
- tstruct; \
- \
- return ret; \
-}
-
-#include <trace/trace_event_types.h>
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
deleted file mode 100644
index 9d2fa78cecc..00000000000
--- a/kernel/trace/trace_events_stage_3.h
+++ /dev/null
@@ -1,281 +0,0 @@
-/*
- * Stage 3 of the trace events.
- *
- * Override the macros in <trace/trace_event_types.h> to include the following:
- *
- * static void ftrace_event_<call>(proto)
- * {
- * event_trace_printk(_RET_IP_, "<call>: " <fmt>);
- * }
- *
- * static int ftrace_reg_event_<call>(void)
- * {
- * int ret;
- *
- * ret = register_trace_<call>(ftrace_event_<call>);
- * if (!ret)
- * pr_info("event trace: Could not activate trace point "
- * "probe to <call>");
- * return ret;
- * }
- *
- * static void ftrace_unreg_event_<call>(void)
- * {
- * unregister_trace_<call>(ftrace_event_<call>);
- * }
- *
- * For those macros defined with TRACE_FORMAT:
- *
- * static struct ftrace_event_call __used
- * __attribute__((__aligned__(4)))
- * __attribute__((section("_ftrace_events"))) event_<call> = {
- * .name = "<call>",
- * .regfunc = ftrace_reg_event_<call>,
- * .unregfunc = ftrace_unreg_event_<call>,
- * }
- *
- *
- * For those macros defined with TRACE_EVENT:
- *
- * static struct ftrace_event_call event_<call>;
- *
- * static void ftrace_raw_event_<call>(proto)
- * {
- * struct ring_buffer_event *event;
- * struct ftrace_raw_<call> *entry; <-- defined in stage 1
- * unsigned long irq_flags;
- * int pc;
- *
- * local_save_flags(irq_flags);
- * pc = preempt_count();
- *
- * event = trace_current_buffer_lock_reserve(event_<call>.id,
- * sizeof(struct ftrace_raw_<call>),
- * irq_flags, pc);
- * if (!event)
- * return;
- * entry = ring_buffer_event_data(event);
- *
- * <assign>; <-- Here we assign the entries by the __field and
- * __array macros.
- *
- * trace_current_buffer_unlock_commit(event, irq_flags, pc);
- * }
- *
- * static int ftrace_raw_reg_event_<call>(void)
- * {
- * int ret;
- *
- * ret = register_trace_<call>(ftrace_raw_event_<call>);
- * if (!ret)
- * pr_info("event trace: Could not activate trace point "
- * "probe to <call>");
- * return ret;
- * }
- *
- * static void ftrace_unreg_event_<call>(void)
- * {
- * unregister_trace_<call>(ftrace_raw_event_<call>);
- * }
- *
- * static struct trace_event ftrace_event_type_<call> = {
- * .trace = ftrace_raw_output_<call>, <-- stage 2
- * };
- *
- * static int ftrace_raw_init_event_<call>(void)
- * {
- * int id;
- *
- * id = register_ftrace_event(&ftrace_event_type_<call>);
- * if (!id)
- * return -ENODEV;
- * event_<call>.id = id;
- * return 0;
- * }
- *
- * static struct ftrace_event_call __used
- * __attribute__((__aligned__(4)))
- * __attribute__((section("_ftrace_events"))) event_<call> = {
- * .name = "<call>",
- * .system = "<system>",
- * .raw_init = ftrace_raw_init_event_<call>,
- * .regfunc = ftrace_reg_event_<call>,
- * .unregfunc = ftrace_unreg_event_<call>,
- * .show_format = ftrace_format_<call>,
- * }
- *
- */
-
-#undef TP_FMT
-#define TP_FMT(fmt, args...) fmt "\n", ##args
-
-#ifdef CONFIG_EVENT_PROFILE
-#define _TRACE_PROFILE(call, proto, args) \
-static void ftrace_profile_##call(proto) \
-{ \
- extern void perf_tpcounter_event(int); \
- perf_tpcounter_event(event_##call.id); \
-} \
- \
-static int ftrace_profile_enable_##call(struct ftrace_event_call *call) \
-{ \
- int ret = 0; \
- \
- if (!atomic_inc_return(&call->profile_count)) \
- ret = register_trace_##call(ftrace_profile_##call); \
- \
- return ret; \
-} \
- \
-static void ftrace_profile_disable_##call(struct ftrace_event_call *call) \
-{ \
- if (atomic_add_negative(-1, &call->profile_count)) \
- unregister_trace_##call(ftrace_profile_##call); \
-}
-
-#define _TRACE_PROFILE_INIT(call) \
- .profile_count = ATOMIC_INIT(-1), \
- .profile_enable = ftrace_profile_enable_##call, \
- .profile_disable = ftrace_profile_disable_##call,
-
-#else
-#define _TRACE_PROFILE(call, proto, args)
-#define _TRACE_PROFILE_INIT(call)
-#endif
-
-#define _TRACE_FORMAT(call, proto, args, fmt) \
-static void ftrace_event_##call(proto) \
-{ \
- event_trace_printk(_RET_IP_, #call ": " fmt); \
-} \
- \
-static int ftrace_reg_event_##call(void) \
-{ \
- int ret; \
- \
- ret = register_trace_##call(ftrace_event_##call); \
- if (ret) \
- pr_info("event trace: Could not activate trace point " \
- "probe to " #call "\n"); \
- return ret; \
-} \
- \
-static void ftrace_unreg_event_##call(void) \
-{ \
- unregister_trace_##call(ftrace_event_##call); \
-} \
- \
-static struct ftrace_event_call event_##call; \
- \
-static int ftrace_init_event_##call(void) \
-{ \
- int id; \
- \
- id = register_ftrace_event(NULL); \
- if (!id) \
- return -ENODEV; \
- event_##call.id = id; \
- return 0; \
-}
-
-#undef TRACE_FORMAT
-#define TRACE_FORMAT(call, proto, args, fmt) \
-_TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) \
-_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \
-static struct ftrace_event_call __used \
-__attribute__((__aligned__(4))) \
-__attribute__((section("_ftrace_events"))) event_##call = { \
- .name = #call, \
- .system = __stringify(TRACE_SYSTEM), \
- .raw_init = ftrace_init_event_##call, \
- .regfunc = ftrace_reg_event_##call, \
- .unregfunc = ftrace_unreg_event_##call, \
- _TRACE_PROFILE_INIT(call) \
-}
-
-#undef __entry
-#define __entry entry
-
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \
-_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \
- \
-static struct ftrace_event_call event_##call; \
- \
-static void ftrace_raw_event_##call(proto) \
-{ \
- struct ftrace_event_call *call = &event_##call; \
- struct ring_buffer_event *event; \
- struct ftrace_raw_##call *entry; \
- unsigned long irq_flags; \
- int pc; \
- \
- local_save_flags(irq_flags); \
- pc = preempt_count(); \
- \
- event = trace_current_buffer_lock_reserve(event_##call.id, \
- sizeof(struct ftrace_raw_##call), \
- irq_flags, pc); \
- if (!event) \
- return; \
- entry = ring_buffer_event_data(event); \
- \
- assign; \
- \
- if (call->preds && !filter_match_preds(call, entry)) \
- ring_buffer_event_discard(event); \
- \
- trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \
- \
-} \
- \
-static int ftrace_raw_reg_event_##call(void) \
-{ \
- int ret; \
- \
- ret = register_trace_##call(ftrace_raw_event_##call); \
- if (ret) \
- pr_info("event trace: Could not activate trace point " \
- "probe to " #call "\n"); \
- return ret; \
-} \
- \
-static void ftrace_raw_unreg_event_##call(void) \
-{ \
- unregister_trace_##call(ftrace_raw_event_##call); \
-} \
- \
-static struct trace_event ftrace_event_type_##call = { \
- .trace = ftrace_raw_output_##call, \
-}; \
- \
-static int ftrace_raw_init_event_##call(void) \
-{ \
- int id; \
- \
- id = register_ftrace_event(&ftrace_event_type_##call); \
- if (!id) \
- return -ENODEV; \
- event_##call.id = id; \
- INIT_LIST_HEAD(&event_##call.fields); \
- return 0; \
-} \
- \
-static struct ftrace_event_call __used \
-__attribute__((__aligned__(4))) \
-__attribute__((section("_ftrace_events"))) event_##call = { \
- .name = #call, \
- .system = __stringify(TRACE_SYSTEM), \
- .raw_init = ftrace_raw_init_event_##call, \
- .regfunc = ftrace_raw_reg_event_##call, \
- .unregfunc = ftrace_raw_unreg_event_##call, \
- .show_format = ftrace_format_##call, \
- .define_fields = ftrace_define_fields_##call, \
- _TRACE_PROFILE_INIT(call) \
-}
-
-#include <trace/trace_event_types.h>
-
-#undef _TRACE_PROFILE
-#undef _TRACE_PROFILE_INIT
-
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 07a22c33ebf..d06cf898dc8 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -19,8 +19,12 @@
#undef TRACE_STRUCT
#define TRACE_STRUCT(args...) args
+extern void __bad_type_size(void);
+
#undef TRACE_FIELD
#define TRACE_FIELD(type, item, assign) \
+ if (sizeof(type) != sizeof(field.item)) \
+ __bad_type_size(); \
ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
"offset:%u;\tsize:%u;\n", \
(unsigned int)offsetof(typeof(field), item), \
@@ -30,7 +34,7 @@
#undef TRACE_FIELD_SPECIAL
-#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \
+#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \
ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \
"offset:%u;\tsize:%u;\n", \
(unsigned int)offsetof(typeof(field), item), \
@@ -46,6 +50,9 @@
if (!ret) \
return 0;
+#undef TRACE_FIELD_SIGN
+#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \
+ TRACE_FIELD(type, item, assign)
#undef TP_RAW_FMT
#define TP_RAW_FMT(args...) args
@@ -65,6 +72,22 @@ ftrace_format_##call(struct trace_seq *s) \
return ret; \
}
+#undef TRACE_EVENT_FORMAT_NOFILTER
+#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \
+ tpfmt) \
+static int \
+ftrace_format_##call(struct trace_seq *s) \
+{ \
+ struct args field; \
+ int ret; \
+ \
+ tstruct; \
+ \
+ trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \
+ \
+ return ret; \
+}
+
#include "trace_event_types.h"
#undef TRACE_ZERO_CHAR
@@ -78,6 +101,10 @@ ftrace_format_##call(struct trace_seq *s) \
#define TRACE_FIELD(type, item, assign)\
entry->item = assign;
+#undef TRACE_FIELD_SIGN
+#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \
+ TRACE_FIELD(type, item, assign)
+
#undef TP_CMD
#define TP_CMD(cmd...) cmd
@@ -85,18 +112,95 @@ ftrace_format_##call(struct trace_seq *s) \
#define TRACE_ENTRY entry
#undef TRACE_FIELD_SPECIAL
-#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \
+#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \
cmd;
#undef TRACE_EVENT_FORMAT
#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
+int ftrace_define_fields_##call(void); \
+static int ftrace_raw_init_event_##call(void); \
+ \
+struct ftrace_event_call __used \
+__attribute__((__aligned__(4))) \
+__attribute__((section("_ftrace_events"))) event_##call = { \
+ .name = #call, \
+ .id = proto, \
+ .system = __stringify(TRACE_SYSTEM), \
+ .raw_init = ftrace_raw_init_event_##call, \
+ .show_format = ftrace_format_##call, \
+ .define_fields = ftrace_define_fields_##call, \
+}; \
+static int ftrace_raw_init_event_##call(void) \
+{ \
+ INIT_LIST_HEAD(&event_##call.fields); \
+ init_preds(&event_##call); \
+ return 0; \
+} \
+
+#undef TRACE_EVENT_FORMAT_NOFILTER
+#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \
+ tpfmt) \
\
-static struct ftrace_event_call __used \
+struct ftrace_event_call __used \
__attribute__((__aligned__(4))) \
__attribute__((section("_ftrace_events"))) event_##call = { \
.name = #call, \
.id = proto, \
.system = __stringify(TRACE_SYSTEM), \
.show_format = ftrace_format_##call, \
+};
+
+#include "trace_event_types.h"
+
+#undef TRACE_FIELD
+#define TRACE_FIELD(type, item, assign) \
+ ret = trace_define_field(event_call, #type, #item, \
+ offsetof(typeof(field), item), \
+ sizeof(field.item), is_signed_type(type)); \
+ if (ret) \
+ return ret;
+
+#undef TRACE_FIELD_SPECIAL
+#define TRACE_FIELD_SPECIAL(type, item, len, cmd) \
+ ret = trace_define_field(event_call, #type "[" #len "]", #item, \
+ offsetof(typeof(field), item), \
+ sizeof(field.item), 0); \
+ if (ret) \
+ return ret;
+
+#undef TRACE_FIELD_SIGN
+#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \
+ ret = trace_define_field(event_call, #type, #item, \
+ offsetof(typeof(field), item), \
+ sizeof(field.item), is_signed); \
+ if (ret) \
+ return ret;
+
+#undef TRACE_FIELD_ZERO_CHAR
+#define TRACE_FIELD_ZERO_CHAR(item)
+
+#undef TRACE_EVENT_FORMAT
+#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
+int \
+ftrace_define_fields_##call(void) \
+{ \
+ struct ftrace_event_call *event_call = &event_##call; \
+ struct args field; \
+ int ret; \
+ \
+ __common_field(unsigned char, type, 0); \
+ __common_field(unsigned char, flags, 0); \
+ __common_field(unsigned char, preempt_count, 0); \
+ __common_field(int, pid, 1); \
+ __common_field(int, tgid, 1); \
+ \
+ tstruct; \
+ \
+ return ret; \
}
+
+#undef TRACE_EVENT_FORMAT_NOFILTER
+#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \
+ tpfmt)
+
#include "trace_event_types.h"
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index c9a0b7df44f..75ef000613c 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -193,9 +193,11 @@ static void tracing_start_function_trace(void)
static void tracing_stop_function_trace(void)
{
ftrace_function_enabled = 0;
- /* OK if they are not registered */
- unregister_ftrace_function(&trace_stack_ops);
- unregister_ftrace_function(&trace_ops);
+
+ if (func_flags.val & TRACE_FUNC_OPT_STACK)
+ unregister_ftrace_function(&trace_stack_ops);
+ else
+ unregister_ftrace_function(&trace_ops);
}
static int func_set_flag(u32 old_flags, u32 bit, int set)
@@ -300,8 +302,7 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
if (count == -1)
seq_printf(m, ":unlimited\n");
else
- seq_printf(m, ":count=%ld", count);
- seq_putc(m, '\n');
+ seq_printf(m, ":count=%ld\n", count);
return 0;
}
@@ -362,7 +363,7 @@ ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable)
out_reg:
ret = register_ftrace_function_probe(glob, ops, count);
- return ret;
+ return ret < 0 ? ret : 0;
}
static struct ftrace_func_command ftrace_traceon_cmd = {
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index d28687e7b3a..420ec348757 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -57,7 +57,8 @@ static struct tracer_flags tracer_flags = {
/* Add a function return address to the trace stack on thread info.*/
int
-ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)
+ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
+ unsigned long frame_pointer)
{
unsigned long long calltime;
int index;
@@ -65,6 +66,12 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)
if (!current->ret_stack)
return -EBUSY;
+ /*
+ * We must make sure the ret_stack is tested before we read
+ * anything else.
+ */
+ smp_rmb();
+
/* The return trace stack is full */
if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
atomic_inc(&current->trace_overrun);
@@ -78,14 +85,17 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)
current->ret_stack[index].ret = ret;
current->ret_stack[index].func = func;
current->ret_stack[index].calltime = calltime;
+ current->ret_stack[index].subtime = 0;
+ current->ret_stack[index].fp = frame_pointer;
*depth = index;
return 0;
}
/* Retrieve a function return address to the trace stack on thread info.*/
-void
-ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
+static void
+ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
+ unsigned long frame_pointer)
{
int index;
@@ -99,28 +109,52 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
return;
}
+#ifdef CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST
+ /*
+ * The arch may choose to record the frame pointer used
+ * and check it here to make sure that it is what we expect it
+ * to be. If gcc does not set the place holder of the return
+ * address in the frame pointer, and does a copy instead, then
+ * the function graph trace will fail. This test detects this
+ * case.
+ *
+ * Currently, x86_32 with optimize for size (-Os) makes the latest
+ * gcc do the above.
+ */
+ if (unlikely(current->ret_stack[index].fp != frame_pointer)) {
+ ftrace_graph_stop();
+ WARN(1, "Bad frame pointer: expected %lx, received %lx\n"
+ " from func %pF return to %lx\n",
+ current->ret_stack[index].fp,
+ frame_pointer,
+ (void *)current->ret_stack[index].func,
+ current->ret_stack[index].ret);
+ *ret = (unsigned long)panic;
+ return;
+ }
+#endif
+
*ret = current->ret_stack[index].ret;
trace->func = current->ret_stack[index].func;
trace->calltime = current->ret_stack[index].calltime;
trace->overrun = atomic_read(&current->trace_overrun);
trace->depth = index;
- barrier();
- current->curr_ret_stack--;
-
}
/*
* Send the trace to the ring-buffer.
* @return the original return address.
*/
-unsigned long ftrace_return_to_handler(void)
+unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
{
struct ftrace_graph_ret trace;
unsigned long ret;
- ftrace_pop_return_trace(&trace, &ret);
+ ftrace_pop_return_trace(&trace, &ret, frame_pointer);
trace.rettime = trace_clock_local();
ftrace_graph_return(&trace);
+ barrier();
+ current->curr_ret_stack--;
if (unlikely(!ret)) {
ftrace_graph_stop();
@@ -426,8 +460,8 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
return TRACE_TYPE_HANDLED;
}
-static enum print_line_t
-print_graph_duration(unsigned long long duration, struct trace_seq *s)
+enum print_line_t
+trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
{
unsigned long nsecs_rem = do_div(duration, 1000);
/* log10(ULONG_MAX) + '\0' */
@@ -464,12 +498,23 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t
+print_graph_duration(unsigned long long duration, struct trace_seq *s)
+{
+ int ret;
+
+ ret = trace_print_graph_duration(duration, s);
+ if (ret != TRACE_TYPE_HANDLED)
+ return ret;
ret = trace_seq_printf(s, "| ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
- return TRACE_TYPE_HANDLED;
+ return TRACE_TYPE_HANDLED;
}
/* Case of a leaf function on its call entry */
@@ -798,9 +843,16 @@ print_graph_function(struct trace_iterator *iter)
switch (entry->type) {
case TRACE_GRAPH_ENT: {
- struct ftrace_graph_ent_entry *field;
+ /*
+ * print_graph_entry() may consume the current event,
+ * thus @field may become invalid, so we need to save it.
+ * sizeof(struct ftrace_graph_ent_entry) is very small,
+ * it can be safely saved at the stack.
+ */
+ struct ftrace_graph_ent_entry *field, saved;
trace_assign_type(field, entry);
- return print_graph_entry(field, s, iter);
+ saved = *field;
+ return print_graph_entry(&saved, s, iter);
}
case TRACE_GRAPH_RET: {
struct ftrace_graph_ret_entry *field;
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 7bfdf4c2347..ca7d7c4d0c2 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -1,10 +1,9 @@
/*
- * h/w branch tracer for x86 based on bts
+ * h/w branch tracer for x86 based on BTS
*
* Copyright (C) 2008-2009 Intel Corporation.
* Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009
*/
-#include <linux/spinlock.h>
#include <linux/kallsyms.h>
#include <linux/debugfs.h>
#include <linux/ftrace.h>
@@ -15,110 +14,119 @@
#include <asm/ds.h>
-#include "trace.h"
#include "trace_output.h"
+#include "trace.h"
-#define SIZEOF_BTS (1 << 13)
+#define BTS_BUFFER_SIZE (1 << 13)
-/*
- * The tracer lock protects the below per-cpu tracer array.
- * It needs to be held to:
- * - start tracing on all cpus
- * - stop tracing on all cpus
- * - start tracing on a single hotplug cpu
- * - stop tracing on a single hotplug cpu
- * - read the trace from all cpus
- * - read the trace from a single cpu
- */
-static DEFINE_SPINLOCK(bts_tracer_lock);
static DEFINE_PER_CPU(struct bts_tracer *, tracer);
-static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer);
+static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], buffer);
#define this_tracer per_cpu(tracer, smp_processor_id())
-#define this_buffer per_cpu(buffer, smp_processor_id())
-static int __read_mostly trace_hw_branches_enabled;
+static int trace_hw_branches_enabled __read_mostly;
+static int trace_hw_branches_suspended __read_mostly;
static struct trace_array *hw_branch_trace __read_mostly;
-/*
- * Start tracing on the current cpu.
- * The argument is ignored.
- *
- * pre: bts_tracer_lock must be locked.
- */
-static void bts_trace_start_cpu(void *arg)
+static void bts_trace_init_cpu(int cpu)
{
- if (this_tracer)
- ds_release_bts(this_tracer);
-
- this_tracer =
- ds_request_bts(/* task = */ NULL, this_buffer, SIZEOF_BTS,
- /* ovfl = */ NULL, /* th = */ (size_t)-1,
- BTS_KERNEL);
- if (IS_ERR(this_tracer)) {
- this_tracer = NULL;
- return;
- }
+ per_cpu(tracer, cpu) =
+ ds_request_bts_cpu(cpu, per_cpu(buffer, cpu), BTS_BUFFER_SIZE,
+ NULL, (size_t)-1, BTS_KERNEL);
+
+ if (IS_ERR(per_cpu(tracer, cpu)))
+ per_cpu(tracer, cpu) = NULL;
}
-static void bts_trace_start(struct trace_array *tr)
+static int bts_trace_init(struct trace_array *tr)
{
- spin_lock(&bts_tracer_lock);
+ int cpu;
+
+ hw_branch_trace = tr;
+ trace_hw_branches_enabled = 0;
- on_each_cpu(bts_trace_start_cpu, NULL, 1);
- trace_hw_branches_enabled = 1;
+ get_online_cpus();
+ for_each_online_cpu(cpu) {
+ bts_trace_init_cpu(cpu);
- spin_unlock(&bts_tracer_lock);
+ if (likely(per_cpu(tracer, cpu)))
+ trace_hw_branches_enabled = 1;
+ }
+ trace_hw_branches_suspended = 0;
+ put_online_cpus();
+
+ /* If we could not enable tracing on a single cpu, we fail. */
+ return trace_hw_branches_enabled ? 0 : -EOPNOTSUPP;
}
-/*
- * Stop tracing on the current cpu.
- * The argument is ignored.
- *
- * pre: bts_tracer_lock must be locked.
- */
-static void bts_trace_stop_cpu(void *arg)
+static void bts_trace_reset(struct trace_array *tr)
{
- if (this_tracer) {
- ds_release_bts(this_tracer);
- this_tracer = NULL;
+ int cpu;
+
+ get_online_cpus();
+ for_each_online_cpu(cpu) {
+ if (likely(per_cpu(tracer, cpu))) {
+ ds_release_bts(per_cpu(tracer, cpu));
+ per_cpu(tracer, cpu) = NULL;
+ }
}
+ trace_hw_branches_enabled = 0;
+ trace_hw_branches_suspended = 0;
+ put_online_cpus();
}
-static void bts_trace_stop(struct trace_array *tr)
+static void bts_trace_start(struct trace_array *tr)
{
- spin_lock(&bts_tracer_lock);
+ int cpu;
- trace_hw_branches_enabled = 0;
- on_each_cpu(bts_trace_stop_cpu, NULL, 1);
+ get_online_cpus();
+ for_each_online_cpu(cpu)
+ if (likely(per_cpu(tracer, cpu)))
+ ds_resume_bts(per_cpu(tracer, cpu));
+ trace_hw_branches_suspended = 0;
+ put_online_cpus();
+}
- spin_unlock(&bts_tracer_lock);
+static void bts_trace_stop(struct trace_array *tr)
+{
+ int cpu;
+
+ get_online_cpus();
+ for_each_online_cpu(cpu)
+ if (likely(per_cpu(tracer, cpu)))
+ ds_suspend_bts(per_cpu(tracer, cpu));
+ trace_hw_branches_suspended = 1;
+ put_online_cpus();
}
static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
- unsigned int cpu = (unsigned long)hcpu;
-
- spin_lock(&bts_tracer_lock);
-
- if (!trace_hw_branches_enabled)
- goto out;
+ int cpu = (long)hcpu;
switch (action) {
case CPU_ONLINE:
case CPU_DOWN_FAILED:
- smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1);
+ /* The notification is sent with interrupts enabled. */
+ if (trace_hw_branches_enabled) {
+ bts_trace_init_cpu(cpu);
+
+ if (trace_hw_branches_suspended &&
+ likely(per_cpu(tracer, cpu)))
+ ds_suspend_bts(per_cpu(tracer, cpu));
+ }
break;
+
case CPU_DOWN_PREPARE:
- smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1);
- break;
+ /* The notification is sent with interrupts enabled. */
+ if (likely(per_cpu(tracer, cpu))) {
+ ds_release_bts(per_cpu(tracer, cpu));
+ per_cpu(tracer, cpu) = NULL;
+ }
}
- out:
- spin_unlock(&bts_tracer_lock);
return NOTIFY_DONE;
}
@@ -126,20 +134,6 @@ static struct notifier_block bts_hotcpu_notifier __cpuinitdata = {
.notifier_call = bts_hotcpu_handler
};
-static int bts_trace_init(struct trace_array *tr)
-{
- hw_branch_trace = tr;
-
- bts_trace_start(tr);
-
- return 0;
-}
-
-static void bts_trace_reset(struct trace_array *tr)
-{
- bts_trace_stop(tr);
-}
-
static void bts_trace_print_header(struct seq_file *m)
{
seq_puts(m, "# CPU# TO <- FROM\n");
@@ -147,10 +141,10 @@ static void bts_trace_print_header(struct seq_file *m)
static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
{
+ unsigned long symflags = TRACE_ITER_SYM_OFFSET;
struct trace_entry *entry = iter->ent;
struct trace_seq *seq = &iter->seq;
struct hw_branch_entry *it;
- unsigned long symflags = TRACE_ITER_SYM_OFFSET;
trace_assign_type(it, entry);
@@ -168,6 +162,7 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
void trace_hw_branch(u64 from, u64 to)
{
+ struct ftrace_event_call *call = &event_hw_branch;
struct trace_array *tr = hw_branch_trace;
struct ring_buffer_event *event;
struct hw_branch_entry *entry;
@@ -194,7 +189,8 @@ void trace_hw_branch(u64 from, u64 to)
entry->ent.type = TRACE_HW_BRANCHES;
entry->from = from;
entry->to = to;
- trace_buffer_unlock_commit(tr, event, 0, 0);
+ if (!filter_check_discard(call, entry, tr->buffer, event))
+ trace_buffer_unlock_commit(tr, event, 0, 0);
out:
atomic_dec(&tr->data[cpu]->disabled);
@@ -224,11 +220,11 @@ static void trace_bts_at(const struct bts_trace *trace, void *at)
/*
* Collect the trace on the current cpu and write it into the ftrace buffer.
*
- * pre: bts_tracer_lock must be locked
+ * pre: tracing must be suspended on the current cpu
*/
static void trace_bts_cpu(void *arg)
{
- struct trace_array *tr = (struct trace_array *) arg;
+ struct trace_array *tr = (struct trace_array *)arg;
const struct bts_trace *trace;
unsigned char *at;
@@ -241,10 +237,9 @@ static void trace_bts_cpu(void *arg)
if (unlikely(!this_tracer))
return;
- ds_suspend_bts(this_tracer);
trace = ds_read_bts(this_tracer);
if (!trace)
- goto out;
+ return;
for (at = trace->ds.top; (void *)at < trace->ds.end;
at += trace->ds.size)
@@ -253,18 +248,27 @@ static void trace_bts_cpu(void *arg)
for (at = trace->ds.begin; (void *)at < trace->ds.top;
at += trace->ds.size)
trace_bts_at(trace, at);
-
-out:
- ds_resume_bts(this_tracer);
}
static void trace_bts_prepare(struct trace_iterator *iter)
{
- spin_lock(&bts_tracer_lock);
+ int cpu;
+ get_online_cpus();
+ for_each_online_cpu(cpu)
+ if (likely(per_cpu(tracer, cpu)))
+ ds_suspend_bts(per_cpu(tracer, cpu));
+ /*
+ * We need to collect the trace on the respective cpu since ftrace
+ * implicitly adds the record for the current cpu.
+ * Once that is more flexible, we could collect the data from any cpu.
+ */
on_each_cpu(trace_bts_cpu, iter->tr, 1);
- spin_unlock(&bts_tracer_lock);
+ for_each_online_cpu(cpu)
+ if (likely(per_cpu(tracer, cpu)))
+ ds_resume_bts(per_cpu(tracer, cpu));
+ put_online_cpus();
}
static void trace_bts_close(struct trace_iterator *iter)
@@ -274,11 +278,11 @@ static void trace_bts_close(struct trace_iterator *iter)
void trace_hw_branch_oops(void)
{
- spin_lock(&bts_tracer_lock);
-
- trace_bts_cpu(hw_branch_trace);
-
- spin_unlock(&bts_tracer_lock);
+ if (this_tracer) {
+ ds_suspend_bts_noirq(this_tracer);
+ trace_bts_cpu(hw_branch_trace);
+ ds_resume_bts_noirq(this_tracer);
+ }
}
struct tracer bts_tracer __read_mostly =
@@ -291,7 +295,10 @@ struct tracer bts_tracer __read_mostly =
.start = bts_trace_start,
.stop = bts_trace_stop,
.open = trace_bts_prepare,
- .close = trace_bts_close
+ .close = trace_bts_close,
+#ifdef CONFIG_FTRACE_SELFTEST
+ .selftest = trace_selftest_startup_hw_branches,
+#endif /* CONFIG_FTRACE_SELFTEST */
};
__init static int init_bts_trace(void)
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 8e37fcddd8b..d53b45ed080 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -9,6 +9,8 @@
#include <linux/kernel.h>
#include <linux/mmiotrace.h>
#include <linux/pci.h>
+#include <linux/time.h>
+
#include <asm/atomic.h>
#include "trace.h"
@@ -174,7 +176,7 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
struct mmiotrace_rw *rw;
struct trace_seq *s = &iter->seq;
unsigned long long t = ns2usecs(iter->ts);
- unsigned long usec_rem = do_div(t, 1000000ULL);
+ unsigned long usec_rem = do_div(t, USEC_PER_SEC);
unsigned secs = (unsigned long)t;
int ret = 1;
@@ -221,7 +223,7 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter)
struct mmiotrace_map *m;
struct trace_seq *s = &iter->seq;
unsigned long long t = ns2usecs(iter->ts);
- unsigned long usec_rem = do_div(t, 1000000ULL);
+ unsigned long usec_rem = do_div(t, USEC_PER_SEC);
unsigned secs = (unsigned long)t;
int ret;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 64b54a59c55..e0c2545622e 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -14,11 +14,24 @@
/* must be a power of 2 */
#define EVENT_HASHSIZE 128
-static DEFINE_MUTEX(trace_event_mutex);
+DECLARE_RWSEM(trace_event_mutex);
+
+DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq);
+EXPORT_PER_CPU_SYMBOL(ftrace_event_seq);
+
static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
static int next_event_type = __TRACE_LAST_TYPE + 1;
+void trace_print_seq(struct seq_file *m, struct trace_seq *s)
+{
+ int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
+
+ seq_write(m, s->buffer, len);
+
+ trace_seq_init(s);
+}
+
enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
{
struct trace_seq *s = &iter->seq;
@@ -84,6 +97,39 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
return len;
}
+EXPORT_SYMBOL_GPL(trace_seq_printf);
+
+/**
+ * trace_seq_vprintf - sequence printing of trace information
+ * @s: trace sequence descriptor
+ * @fmt: printf format string
+ *
+ * The tracer may use either sequence operations or its own
+ * copy to user routines. To simplify formating of a trace
+ * trace_seq_printf is used to store strings into a special
+ * buffer (@s). Then the output may be either used by
+ * the sequencer or pulled into another buffer.
+ */
+int
+trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
+{
+ int len = (PAGE_SIZE - 1) - s->len;
+ int ret;
+
+ if (!len)
+ return 0;
+
+ ret = vsnprintf(s->buffer + s->len, len, fmt, args);
+
+ /* If we can't write it all, don't bother writing anything */
+ if (ret >= len)
+ return 0;
+
+ s->len += ret;
+
+ return len;
+}
+EXPORT_SYMBOL_GPL(trace_seq_vprintf);
int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
{
@@ -201,6 +247,67 @@ int trace_seq_path(struct trace_seq *s, struct path *path)
return 0;
}
+const char *
+ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
+ unsigned long flags,
+ const struct trace_print_flags *flag_array)
+{
+ unsigned long mask;
+ const char *str;
+ const char *ret = p->buffer + p->len;
+ int i;
+
+ for (i = 0; flag_array[i].name && flags; i++) {
+
+ mask = flag_array[i].mask;
+ if ((flags & mask) != mask)
+ continue;
+
+ str = flag_array[i].name;
+ flags &= ~mask;
+ if (p->len && delim)
+ trace_seq_puts(p, delim);
+ trace_seq_puts(p, str);
+ }
+
+ /* check for left over flags */
+ if (flags) {
+ if (p->len && delim)
+ trace_seq_puts(p, delim);
+ trace_seq_printf(p, "0x%lx", flags);
+ }
+
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+EXPORT_SYMBOL(ftrace_print_flags_seq);
+
+const char *
+ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
+ const struct trace_print_flags *symbol_array)
+{
+ int i;
+ const char *ret = p->buffer + p->len;
+
+ for (i = 0; symbol_array[i].name; i++) {
+
+ if (val != symbol_array[i].mask)
+ continue;
+
+ trace_seq_puts(p, symbol_array[i].name);
+ break;
+ }
+
+ if (!p->len)
+ trace_seq_printf(p, "0x%lx", val);
+
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+EXPORT_SYMBOL(ftrace_print_symbols_seq);
+
#ifdef CONFIG_KRETPROBES
static inline const char *kretprobed(const char *name)
{
@@ -311,17 +418,20 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
if (ip == ULONG_MAX || !ret)
break;
- if (i && ret)
- ret = trace_seq_puts(s, " <- ");
+ if (ret)
+ ret = trace_seq_puts(s, " => ");
if (!ip) {
if (ret)
ret = trace_seq_puts(s, "??");
+ if (ret)
+ ret = trace_seq_puts(s, "\n");
continue;
}
if (!ret)
break;
if (ret)
ret = seq_print_user_ip(s, mm, ip, sym_flags);
+ ret = trace_seq_puts(s, "\n");
}
if (mm)
@@ -455,6 +565,7 @@ static int task_state_char(unsigned long state)
* @type: the type of event to look for
*
* Returns an event of type @type otherwise NULL
+ * Called with trace_event_read_lock() held.
*/
struct trace_event *ftrace_find_event(int type)
{
@@ -464,7 +575,7 @@ struct trace_event *ftrace_find_event(int type)
key = type & (EVENT_HASHSIZE - 1);
- hlist_for_each_entry_rcu(event, n, &event_hash[key], node) {
+ hlist_for_each_entry(event, n, &event_hash[key], node) {
if (event->type == type)
return event;
}
@@ -472,6 +583,46 @@ struct trace_event *ftrace_find_event(int type)
return NULL;
}
+static LIST_HEAD(ftrace_event_list);
+
+static int trace_search_list(struct list_head **list)
+{
+ struct trace_event *e;
+ int last = __TRACE_LAST_TYPE;
+
+ if (list_empty(&ftrace_event_list)) {
+ *list = &ftrace_event_list;
+ return last + 1;
+ }
+
+ /*
+ * We used up all possible max events,
+ * lets see if somebody freed one.
+ */
+ list_for_each_entry(e, &ftrace_event_list, list) {
+ if (e->type != last + 1)
+ break;
+ last++;
+ }
+
+ /* Did we used up all 65 thousand events??? */
+ if ((last + 1) > FTRACE_MAX_EVENT)
+ return 0;
+
+ *list = &e->list;
+ return last + 1;
+}
+
+void trace_event_read_lock(void)
+{
+ down_read(&trace_event_mutex);
+}
+
+void trace_event_read_unlock(void)
+{
+ up_read(&trace_event_mutex);
+}
+
/**
* register_ftrace_event - register output for an event type
* @event: the event type to register
@@ -492,22 +643,42 @@ int register_ftrace_event(struct trace_event *event)
unsigned key;
int ret = 0;
- mutex_lock(&trace_event_mutex);
+ down_write(&trace_event_mutex);
- if (!event) {
- ret = next_event_type++;
+ if (WARN_ON(!event))
goto out;
- }
- if (!event->type)
- event->type = next_event_type++;
- else if (event->type > __TRACE_LAST_TYPE) {
+ INIT_LIST_HEAD(&event->list);
+
+ if (!event->type) {
+ struct list_head *list = NULL;
+
+ if (next_event_type > FTRACE_MAX_EVENT) {
+
+ event->type = trace_search_list(&list);
+ if (!event->type)
+ goto out;
+
+ } else {
+
+ event->type = next_event_type++;
+ list = &ftrace_event_list;
+ }
+
+ if (WARN_ON(ftrace_find_event(event->type)))
+ goto out;
+
+ list_add_tail(&event->list, list);
+
+ } else if (event->type > __TRACE_LAST_TYPE) {
printk(KERN_WARNING "Need to add type to trace.h\n");
WARN_ON(1);
- }
-
- if (ftrace_find_event(event->type))
goto out;
+ } else {
+ /* Is this event already used */
+ if (ftrace_find_event(event->type))
+ goto out;
+ }
if (event->trace == NULL)
event->trace = trace_nop_print;
@@ -520,14 +691,25 @@ int register_ftrace_event(struct trace_event *event)
key = event->type & (EVENT_HASHSIZE - 1);
- hlist_add_head_rcu(&event->node, &event_hash[key]);
+ hlist_add_head(&event->node, &event_hash[key]);
ret = event->type;
out:
- mutex_unlock(&trace_event_mutex);
+ up_write(&trace_event_mutex);
return ret;
}
+EXPORT_SYMBOL_GPL(register_ftrace_event);
+
+/*
+ * Used by module code with the trace_event_mutex held for write.
+ */
+int __unregister_ftrace_event(struct trace_event *event)
+{
+ hlist_del(&event->node);
+ list_del(&event->list);
+ return 0;
+}
/**
* unregister_ftrace_event - remove a no longer used event
@@ -535,12 +717,13 @@ int register_ftrace_event(struct trace_event *event)
*/
int unregister_ftrace_event(struct trace_event *event)
{
- mutex_lock(&trace_event_mutex);
- hlist_del(&event->node);
- mutex_unlock(&trace_event_mutex);
+ down_write(&trace_event_mutex);
+ __unregister_ftrace_event(event);
+ up_write(&trace_event_mutex);
return 0;
}
+EXPORT_SYMBOL_GPL(unregister_ftrace_event);
/*
* Standard events
@@ -833,14 +1016,16 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
trace_assign_type(field, iter->ent);
+ if (!trace_seq_puts(s, "<stack trace>\n"))
+ goto partial;
for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
- if (i) {
- if (!trace_seq_puts(s, " <= "))
- goto partial;
+ if (!field->caller[i] || (field->caller[i] == ULONG_MAX))
+ break;
+ if (!trace_seq_puts(s, " => "))
+ goto partial;
- if (!seq_print_ip_sym(s, field->caller[i], flags))
- goto partial;
- }
+ if (!seq_print_ip_sym(s, field->caller[i], flags))
+ goto partial;
if (!trace_seq_puts(s, "\n"))
goto partial;
}
@@ -868,10 +1053,10 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
trace_assign_type(field, iter->ent);
- if (!seq_print_userip_objs(field, s, flags))
+ if (!trace_seq_puts(s, "<user stack trace>\n"))
goto partial;
- if (!trace_seq_putc(s, '\n'))
+ if (!seq_print_userip_objs(field, s, flags))
goto partial;
return TRACE_TYPE_HANDLED;
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index e0bde39c2dd..d38bec4a9c3 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -1,41 +1,17 @@
#ifndef __TRACE_EVENTS_H
#define __TRACE_EVENTS_H
+#include <linux/trace_seq.h>
#include "trace.h"
-typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter,
- int flags);
-
-struct trace_event {
- struct hlist_node node;
- int type;
- trace_print_func trace;
- trace_print_func raw;
- trace_print_func hex;
- trace_print_func binary;
-};
-
extern enum print_line_t
trace_print_bprintk_msg_only(struct trace_iterator *iter);
extern enum print_line_t
trace_print_printk_msg_only(struct trace_iterator *iter);
-extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
- __attribute__ ((format (printf, 2, 3)));
-extern int
-trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary);
extern int
seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
unsigned long sym_flags);
-extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
- size_t cnt);
-extern int trace_seq_puts(struct trace_seq *s, const char *str);
-extern int trace_seq_putc(struct trace_seq *s, unsigned char c);
-extern int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len);
-extern int trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
- size_t len);
-extern void *trace_seq_reserve(struct trace_seq *s, size_t len);
-extern int trace_seq_path(struct trace_seq *s, struct path *path);
extern int seq_print_userip_objs(const struct userstack_entry *entry,
struct trace_seq *s, unsigned long sym_flags);
extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
@@ -44,13 +20,17 @@ extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
extern int trace_print_context(struct trace_iterator *iter);
extern int trace_print_lat_context(struct trace_iterator *iter);
+extern void trace_event_read_lock(void);
+extern void trace_event_read_unlock(void);
extern struct trace_event *ftrace_find_event(int type);
-extern int register_ftrace_event(struct trace_event *event);
-extern int unregister_ftrace_event(struct trace_event *event);
extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
int flags);
+/* used by module unregistering */
+extern int __unregister_ftrace_event(struct trace_event *event);
+extern struct rw_semaphore trace_event_mutex;
+
#define MAX_MEMHEX_BYTES 8
#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
index 118439709fb..8a30d9874cd 100644
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -36,6 +36,7 @@ static void probe_power_start(struct power_trace *it, unsigned int type,
static void probe_power_end(struct power_trace *it)
{
+ struct ftrace_event_call *call = &event_power;
struct ring_buffer_event *event;
struct trace_power *entry;
struct trace_array_cpu *data;
@@ -54,7 +55,8 @@ static void probe_power_end(struct power_trace *it)
goto out;
entry = ring_buffer_event_data(event);
entry->state_data = *it;
- trace_buffer_unlock_commit(tr, event, 0, 0);
+ if (!filter_check_discard(call, entry, tr->buffer, event))
+ trace_buffer_unlock_commit(tr, event, 0, 0);
out:
preempt_enable();
}
@@ -62,6 +64,7 @@ static void probe_power_end(struct power_trace *it)
static void probe_power_mark(struct power_trace *it, unsigned int type,
unsigned int level)
{
+ struct ftrace_event_call *call = &event_power;
struct ring_buffer_event *event;
struct trace_power *entry;
struct trace_array_cpu *data;
@@ -84,7 +87,8 @@ static void probe_power_mark(struct power_trace *it, unsigned int type,
goto out;
entry = ring_buffer_event_data(event);
entry->state_data = *it;
- trace_buffer_unlock_commit(tr, event, 0, 0);
+ if (!filter_check_discard(call, entry, tr->buffer, event))
+ trace_buffer_unlock_commit(tr, event, 0, 0);
out:
preempt_enable();
}
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index eb81556107f..687699d365a 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -155,25 +155,19 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
EXPORT_SYMBOL_GPL(__ftrace_vprintk);
static void *
-t_next(struct seq_file *m, void *v, loff_t *pos)
+t_start(struct seq_file *m, loff_t *pos)
{
- const char **fmt = m->private;
- const char **next = fmt;
-
- (*pos)++;
+ const char **fmt = __start___trace_bprintk_fmt + *pos;
if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt)
return NULL;
-
- next = fmt;
- m->private = ++next;
-
return fmt;
}
-static void *t_start(struct seq_file *m, loff_t *pos)
+static void *t_next(struct seq_file *m, void * v, loff_t *pos)
{
- return t_next(m, NULL, pos);
+ (*pos)++;
+ return t_start(m, pos);
}
static int t_show(struct seq_file *m, void *v)
@@ -182,7 +176,7 @@ static int t_show(struct seq_file *m, void *v)
const char *str = *fmt;
int i;
- seq_printf(m, "0x%lx : \"", (unsigned long)fmt);
+ seq_printf(m, "0x%lx : \"", *(unsigned long *)fmt);
/*
* Tabs and new lines need to be converted.
@@ -224,15 +218,7 @@ static const struct seq_operations show_format_seq_ops = {
static int
ftrace_formats_open(struct inode *inode, struct file *file)
{
- int ret;
-
- ret = seq_open(file, &show_format_seq_ops);
- if (!ret) {
- struct seq_file *m = file->private_data;
-
- m->private = __start___trace_bprintk_fmt;
- }
- return ret;
+ return seq_open(file, &show_format_seq_ops);
}
static const struct file_operations ftrace_formats_fops = {
@@ -245,17 +231,13 @@ static const struct file_operations ftrace_formats_fops = {
static __init int init_trace_printk_function_export(void)
{
struct dentry *d_tracer;
- struct dentry *entry;
d_tracer = tracing_init_dentry();
if (!d_tracer)
return 0;
- entry = debugfs_create_file("printk_formats", 0444, d_tracer,
+ trace_create_file("printk_formats", 0444, d_tracer,
NULL, &ftrace_formats_fops);
- if (!entry)
- pr_warning("Could not create debugfs "
- "'printk_formats' entry\n");
return 0;
}
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 9117cea6f1a..a98106dd979 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -10,7 +10,7 @@
#include <linux/kallsyms.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>
-#include <trace/sched.h>
+#include <trace/events/sched.h>
#include "trace.h"
@@ -29,13 +29,13 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
int cpu;
int pc;
- if (!sched_ref || sched_stopped)
+ if (unlikely(!sched_ref))
return;
tracing_record_cmdline(prev);
tracing_record_cmdline(next);
- if (!tracer_enabled)
+ if (!tracer_enabled || sched_stopped)
return;
pc = preempt_count();
@@ -56,15 +56,15 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)
unsigned long flags;
int cpu, pc;
- if (!likely(tracer_enabled))
+ if (unlikely(!sched_ref))
return;
- pc = preempt_count();
tracing_record_cmdline(current);
- if (sched_stopped)
+ if (!tracer_enabled || sched_stopped)
return;
+ pc = preempt_count();
local_irq_save(flags);
cpu = raw_smp_processor_id();
data = ctx_trace->data[cpu];
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 5bc00e8f153..eacb2722517 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -15,7 +15,7 @@
#include <linux/kallsyms.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>
-#include <trace/sched.h>
+#include <trace/events/sched.h>
#include "trace.h"
@@ -138,9 +138,6 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
pc = preempt_count();
- /* The task we are waiting for is waking up */
- data = wakeup_trace->data[wakeup_cpu];
-
/* disable local data, not wakeup_cpu data */
cpu = raw_smp_processor_id();
disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled);
@@ -154,6 +151,9 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
if (unlikely(!tracer_enabled || next != wakeup_task))
goto out_unlock;
+ /* The task we are waiting for is waking up */
+ data = wakeup_trace->data[wakeup_cpu];
+
trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 08f4eb2763d..00dd6485bdd 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -16,6 +16,7 @@ static inline int trace_valid_entry(struct trace_entry *entry)
case TRACE_BRANCH:
case TRACE_GRAPH_ENT:
case TRACE_GRAPH_RET:
+ case TRACE_HW_BRANCHES:
return 1;
}
return 0;
@@ -188,6 +189,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
#else
# define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; })
#endif /* CONFIG_DYNAMIC_FTRACE */
+
/*
* Simple verification test of ftrace function tracer.
* Enable ftrace, sleep 1/10 second, and then read the trace
@@ -749,3 +751,59 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
return ret;
}
#endif /* CONFIG_BRANCH_TRACER */
+
+#ifdef CONFIG_HW_BRANCH_TRACER
+int
+trace_selftest_startup_hw_branches(struct tracer *trace,
+ struct trace_array *tr)
+{
+ struct trace_iterator *iter;
+ struct tracer tracer;
+ unsigned long count;
+ int ret;
+
+ if (!trace->open) {
+ printk(KERN_CONT "missing open function...");
+ return -1;
+ }
+
+ ret = tracer_init(trace, tr);
+ if (ret) {
+ warn_failed_init_tracer(trace, ret);
+ return ret;
+ }
+
+ /*
+ * The hw-branch tracer needs to collect the trace from the various
+ * cpu trace buffers - before tracing is stopped.
+ */
+ iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return -ENOMEM;
+
+ memcpy(&tracer, trace, sizeof(tracer));
+
+ iter->trace = &tracer;
+ iter->tr = tr;
+ iter->pos = -1;
+ mutex_init(&iter->mutex);
+
+ trace->open(iter);
+
+ mutex_destroy(&iter->mutex);
+ kfree(iter);
+
+ tracing_stop();
+
+ ret = trace_test_buffer(tr, &count);
+ trace->reset(tr);
+ tracing_start();
+
+ if (!ret && !count) {
+ printk(KERN_CONT "no entries found..");
+ ret = -1;
+ }
+
+ return ret;
+}
+#endif /* CONFIG_HW_BRANCH_TRACER */
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index c750f65f966..6a2a9d484cd 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -265,7 +265,7 @@ static int t_show(struct seq_file *m, void *v)
seq_printf(m, " Depth Size Location"
" (%d entries)\n"
" ----- ---- --------\n",
- max_stack_trace.nr_entries);
+ max_stack_trace.nr_entries - 1);
if (!stack_tracer_enabled && !max_stack_size)
print_disabled(m);
@@ -301,17 +301,14 @@ static const struct seq_operations stack_trace_seq_ops = {
static int stack_trace_open(struct inode *inode, struct file *file)
{
- int ret;
-
- ret = seq_open(file, &stack_trace_seq_ops);
-
- return ret;
+ return seq_open(file, &stack_trace_seq_ops);
}
static const struct file_operations stack_trace_fops = {
.open = stack_trace_open,
.read = seq_read,
.llseek = seq_lseek,
+ .release = seq_release,
};
int
@@ -326,10 +323,10 @@ stack_trace_sysctl(struct ctl_table *table, int write,
ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
if (ret || !write ||
- (last_stack_tracer_enabled == stack_tracer_enabled))
+ (last_stack_tracer_enabled == !!stack_tracer_enabled))
goto out;
- last_stack_tracer_enabled = stack_tracer_enabled;
+ last_stack_tracer_enabled = !!stack_tracer_enabled;
if (stack_tracer_enabled)
register_ftrace_function(&trace_ops);
@@ -352,19 +349,14 @@ __setup("stacktrace", enable_stacktrace);
static __init int stack_trace_init(void)
{
struct dentry *d_tracer;
- struct dentry *entry;
d_tracer = tracing_init_dentry();
- entry = debugfs_create_file("stack_max_size", 0644, d_tracer,
- &max_stack_size, &stack_max_size_fops);
- if (!entry)
- pr_warning("Could not create debugfs 'stack_max_size' entry\n");
+ trace_create_file("stack_max_size", 0644, d_tracer,
+ &max_stack_size, &stack_max_size_fops);
- entry = debugfs_create_file("stack_trace", 0444, d_tracer,
- NULL, &stack_trace_fops);
- if (!entry)
- pr_warning("Could not create debugfs 'stack_trace' entry\n");
+ trace_create_file("stack_trace", 0444, d_tracer,
+ NULL, &stack_trace_fops);
if (stack_tracer_enabled)
register_ftrace_function(&trace_ops);
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index acdebd771a9..aea321c82fa 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -1,7 +1,7 @@
/*
* Infrastructure for statistic tracing (histogram output).
*
- * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
+ * Copyright (C) 2008-2009 Frederic Weisbecker <fweisbec@gmail.com>
*
* Based on the code from trace_branch.c which is
* Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
@@ -10,22 +10,27 @@
#include <linux/list.h>
+#include <linux/rbtree.h>
#include <linux/debugfs.h>
#include "trace_stat.h"
#include "trace.h"
-/* List of stat entries from a tracer */
-struct trace_stat_list {
- struct list_head list;
+/*
+ * List of stat red-black nodes from a tracer
+ * We use a such tree to sort quickly the stat
+ * entries from the tracer.
+ */
+struct stat_node {
+ struct rb_node node;
void *stat;
};
/* A stat session is the stats output in one file */
-struct tracer_stat_session {
+struct stat_session {
struct list_head session_list;
struct tracer_stat *ts;
- struct list_head stat_list;
+ struct rb_root stat_root;
struct mutex stat_mutex;
struct dentry *file;
};
@@ -37,77 +42,136 @@ static DEFINE_MUTEX(all_stat_sessions_mutex);
/* The root directory for all stat files */
static struct dentry *stat_dir;
+/*
+ * Iterate through the rbtree using a post order traversal path
+ * to release the next node.
+ * It won't necessary release one at each iteration
+ * but it will at least advance closer to the next one
+ * to be released.
+ */
+static struct rb_node *release_next(struct rb_node *node)
+{
+ struct stat_node *snode;
+ struct rb_node *parent = rb_parent(node);
+
+ if (node->rb_left)
+ return node->rb_left;
+ else if (node->rb_right)
+ return node->rb_right;
+ else {
+ if (!parent)
+ ;
+ else if (parent->rb_left == node)
+ parent->rb_left = NULL;
+ else
+ parent->rb_right = NULL;
+
+ snode = container_of(node, struct stat_node, node);
+ kfree(snode);
+
+ return parent;
+ }
+}
-static void reset_stat_session(struct tracer_stat_session *session)
+static void __reset_stat_session(struct stat_session *session)
{
- struct trace_stat_list *node, *next;
+ struct rb_node *node = session->stat_root.rb_node;
- list_for_each_entry_safe(node, next, &session->stat_list, list)
- kfree(node);
+ while (node)
+ node = release_next(node);
- INIT_LIST_HEAD(&session->stat_list);
+ session->stat_root = RB_ROOT;
}
-static void destroy_session(struct tracer_stat_session *session)
+static void reset_stat_session(struct stat_session *session)
+{
+ mutex_lock(&session->stat_mutex);
+ __reset_stat_session(session);
+ mutex_unlock(&session->stat_mutex);
+}
+
+static void destroy_session(struct stat_session *session)
{
debugfs_remove(session->file);
- reset_stat_session(session);
+ __reset_stat_session(session);
mutex_destroy(&session->stat_mutex);
kfree(session);
}
+typedef int (*cmp_stat_t)(void *, void *);
+
+static int insert_stat(struct rb_root *root, void *stat, cmp_stat_t cmp)
+{
+ struct rb_node **new = &(root->rb_node), *parent = NULL;
+ struct stat_node *data;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+ data->stat = stat;
+
+ /*
+ * Figure out where to put new node
+ * This is a descendent sorting
+ */
+ while (*new) {
+ struct stat_node *this;
+ int result;
+
+ this = container_of(*new, struct stat_node, node);
+ result = cmp(data->stat, this->stat);
+
+ parent = *new;
+ if (result >= 0)
+ new = &((*new)->rb_left);
+ else
+ new = &((*new)->rb_right);
+ }
+
+ rb_link_node(&data->node, parent, new);
+ rb_insert_color(&data->node, root);
+ return 0;
+}
+
/*
* For tracers that don't provide a stat_cmp callback.
- * This one will force an immediate insertion on tail of
- * the list.
+ * This one will force an insertion as right-most node
+ * in the rbtree.
*/
static int dummy_cmp(void *p1, void *p2)
{
- return 1;
+ return -1;
}
/*
- * Initialize the stat list at each trace_stat file opening.
+ * Initialize the stat rbtree at each trace_stat file opening.
* All of these copies and sorting are required on all opening
* since the stats could have changed between two file sessions.
*/
-static int stat_seq_init(struct tracer_stat_session *session)
+static int stat_seq_init(struct stat_session *session)
{
- struct trace_stat_list *iter_entry, *new_entry;
struct tracer_stat *ts = session->ts;
+ struct rb_root *root = &session->stat_root;
void *stat;
int ret = 0;
int i;
mutex_lock(&session->stat_mutex);
- reset_stat_session(session);
+ __reset_stat_session(session);
if (!ts->stat_cmp)
ts->stat_cmp = dummy_cmp;
- stat = ts->stat_start();
+ stat = ts->stat_start(ts);
if (!stat)
goto exit;
- /*
- * The first entry. Actually this is the second, but the first
- * one (the stat_list head) is pointless.
- */
- new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL);
- if (!new_entry) {
- ret = -ENOMEM;
+ ret = insert_stat(root, stat, ts->stat_cmp);
+ if (ret)
goto exit;
- }
-
- INIT_LIST_HEAD(&new_entry->list);
-
- list_add(&new_entry->list, &session->stat_list);
-
- new_entry->stat = stat;
/*
- * Iterate over the tracer stat entries and store them in a sorted
- * list.
+ * Iterate over the tracer stat entries and store them in an rbtree.
*/
for (i = 1; ; i++) {
stat = ts->stat_next(stat, i);
@@ -116,37 +180,17 @@ static int stat_seq_init(struct tracer_stat_session *session)
if (!stat)
break;
- new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL);
- if (!new_entry) {
- ret = -ENOMEM;
- goto exit_free_list;
- }
-
- INIT_LIST_HEAD(&new_entry->list);
- new_entry->stat = stat;
-
- list_for_each_entry_reverse(iter_entry, &session->stat_list,
- list) {
-
- /* Insertion with a descendent sorting */
- if (ts->stat_cmp(iter_entry->stat,
- new_entry->stat) >= 0) {
-
- list_add(&new_entry->list, &iter_entry->list);
- break;
- }
- }
-
- /* The current larger value */
- if (list_empty(&new_entry->list))
- list_add(&new_entry->list, &session->stat_list);
+ ret = insert_stat(root, stat, ts->stat_cmp);
+ if (ret)
+ goto exit_free_rbtree;
}
+
exit:
mutex_unlock(&session->stat_mutex);
return ret;
-exit_free_list:
- reset_stat_session(session);
+exit_free_rbtree:
+ __reset_stat_session(session);
mutex_unlock(&session->stat_mutex);
return ret;
}
@@ -154,38 +198,47 @@ exit_free_list:
static void *stat_seq_start(struct seq_file *s, loff_t *pos)
{
- struct tracer_stat_session *session = s->private;
+ struct stat_session *session = s->private;
+ struct rb_node *node;
+ int i;
- /* Prevent from tracer switch or stat_list modification */
+ /* Prevent from tracer switch or rbtree modification */
mutex_lock(&session->stat_mutex);
/* If we are in the beginning of the file, print the headers */
if (!*pos && session->ts->stat_headers)
return SEQ_START_TOKEN;
- return seq_list_start(&session->stat_list, *pos);
+ node = rb_first(&session->stat_root);
+ for (i = 0; node && i < *pos; i++)
+ node = rb_next(node);
+
+ return node;
}
static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos)
{
- struct tracer_stat_session *session = s->private;
+ struct stat_session *session = s->private;
+ struct rb_node *node = p;
+
+ (*pos)++;
if (p == SEQ_START_TOKEN)
- return seq_list_start(&session->stat_list, *pos);
+ return rb_first(&session->stat_root);
- return seq_list_next(p, &session->stat_list, pos);
+ return rb_next(node);
}
static void stat_seq_stop(struct seq_file *s, void *p)
{
- struct tracer_stat_session *session = s->private;
+ struct stat_session *session = s->private;
mutex_unlock(&session->stat_mutex);
}
static int stat_seq_show(struct seq_file *s, void *v)
{
- struct tracer_stat_session *session = s->private;
- struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list);
+ struct stat_session *session = s->private;
+ struct stat_node *l = container_of(v, struct stat_node, node);
if (v == SEQ_START_TOKEN)
return session->ts->stat_headers(s);
@@ -204,31 +257,34 @@ static const struct seq_operations trace_stat_seq_ops = {
static int tracing_stat_open(struct inode *inode, struct file *file)
{
int ret;
+ struct seq_file *m;
+ struct stat_session *session = inode->i_private;
- struct tracer_stat_session *session = inode->i_private;
+ ret = stat_seq_init(session);
+ if (ret)
+ return ret;
ret = seq_open(file, &trace_stat_seq_ops);
- if (!ret) {
- struct seq_file *m = file->private_data;
- m->private = session;
- ret = stat_seq_init(session);
+ if (ret) {
+ reset_stat_session(session);
+ return ret;
}
+ m = file->private_data;
+ m->private = session;
return ret;
}
/*
- * Avoid consuming memory with our now useless list.
+ * Avoid consuming memory with our now useless rbtree.
*/
static int tracing_stat_release(struct inode *i, struct file *f)
{
- struct tracer_stat_session *session = i->i_private;
+ struct stat_session *session = i->i_private;
- mutex_lock(&session->stat_mutex);
reset_stat_session(session);
- mutex_unlock(&session->stat_mutex);
- return 0;
+ return seq_release(i, f);
}
static const struct file_operations tracing_stat_fops = {
@@ -251,7 +307,7 @@ static int tracing_stat_init(void)
return 0;
}
-static int init_stat_file(struct tracer_stat_session *session)
+static int init_stat_file(struct stat_session *session)
{
if (!stat_dir && tracing_stat_init())
return -ENODEV;
@@ -266,7 +322,7 @@ static int init_stat_file(struct tracer_stat_session *session)
int register_stat_tracer(struct tracer_stat *trace)
{
- struct tracer_stat_session *session, *node, *tmp;
+ struct stat_session *session, *node;
int ret;
if (!trace)
@@ -277,7 +333,7 @@ int register_stat_tracer(struct tracer_stat *trace)
/* Already registered? */
mutex_lock(&all_stat_sessions_mutex);
- list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) {
+ list_for_each_entry(node, &all_stat_sessions, session_list) {
if (node->ts == trace) {
mutex_unlock(&all_stat_sessions_mutex);
return -EINVAL;
@@ -286,15 +342,13 @@ int register_stat_tracer(struct tracer_stat *trace)
mutex_unlock(&all_stat_sessions_mutex);
/* Init the session */
- session = kmalloc(sizeof(struct tracer_stat_session), GFP_KERNEL);
+ session = kzalloc(sizeof(*session), GFP_KERNEL);
if (!session)
return -ENOMEM;
session->ts = trace;
INIT_LIST_HEAD(&session->session_list);
- INIT_LIST_HEAD(&session->stat_list);
mutex_init(&session->stat_mutex);
- session->file = NULL;
ret = init_stat_file(session);
if (ret) {
@@ -312,7 +366,7 @@ int register_stat_tracer(struct tracer_stat *trace)
void unregister_stat_tracer(struct tracer_stat *trace)
{
- struct tracer_stat_session *node, *tmp;
+ struct stat_session *node, *tmp;
mutex_lock(&all_stat_sessions_mutex);
list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) {
diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h
index 202274cf7f3..f3546a2cd82 100644
--- a/kernel/trace/trace_stat.h
+++ b/kernel/trace/trace_stat.h
@@ -12,7 +12,7 @@ struct tracer_stat {
/* The name of your stat file */
const char *name;
/* Iteration over statistic entries */
- void *(*stat_start)(void);
+ void *(*stat_start)(struct tracer_stat *trace);
void *(*stat_next)(void *prev, int idx);
/* Compare two entries for stats sorting */
int (*stat_cmp)(void *p1, void *p2);
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index 91fd19c2149..f6693969287 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -203,7 +203,8 @@ static void start_stack_timer(void *unused)
hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
hrtimer->function = stack_trace_timer_fn;
- hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL);
+ hrtimer_start(hrtimer, ns_to_ktime(sample_period),
+ HRTIMER_MODE_REL_PINNED);
}
static void start_stack_timers(void)
@@ -321,11 +322,7 @@ static const struct file_operations sysprof_sample_fops = {
void init_tracer_sysprof_debugfs(struct dentry *d_tracer)
{
- struct dentry *entry;
- entry = debugfs_create_file("sysprof_sample_period", 0644,
+ trace_create_file("sysprof_sample_period", 0644,
d_tracer, NULL, &sysprof_sample_fops);
- if (entry)
- return;
- pr_warning("Could not create debugfs 'sysprof_sample_period' entry\n");
}
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 797201e4a13..97fcea4acce 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -6,7 +6,7 @@
*/
-#include <trace/workqueue.h>
+#include <trace/events/workqueue.h>
#include <linux/list.h>
#include <linux/percpu.h>
#include "trace_stat.h"
@@ -16,8 +16,6 @@
/* A cpu workqueue thread */
struct cpu_workqueue_stats {
struct list_head list;
-/* Useful to know if we print the cpu headers */
- bool first_entry;
int cpu;
pid_t pid;
/* Can be inserted from interrupt or user context, need to be atomic */
@@ -47,12 +45,11 @@ probe_workqueue_insertion(struct task_struct *wq_thread,
struct work_struct *work)
{
int cpu = cpumask_first(&wq_thread->cpus_allowed);
- struct cpu_workqueue_stats *node, *next;
+ struct cpu_workqueue_stats *node;
unsigned long flags;
spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
- list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list,
- list) {
+ list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) {
if (node->pid == wq_thread->pid) {
atomic_inc(&node->inserted);
goto found;
@@ -69,12 +66,11 @@ probe_workqueue_execution(struct task_struct *wq_thread,
struct work_struct *work)
{
int cpu = cpumask_first(&wq_thread->cpus_allowed);
- struct cpu_workqueue_stats *node, *next;
+ struct cpu_workqueue_stats *node;
unsigned long flags;
spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
- list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list,
- list) {
+ list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) {
if (node->pid == wq_thread->pid) {
node->executed++;
goto found;
@@ -105,8 +101,6 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
cws->pid = wq_thread->pid;
spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
- if (list_empty(&workqueue_cpu_stat(cpu)->list))
- cws->first_entry = true;
list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list);
spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
}
@@ -152,7 +146,7 @@ static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu)
return ret;
}
-static void *workqueue_stat_start(void)
+static void *workqueue_stat_start(struct tracer_stat *trace)
{
int cpu;
void *ret = NULL;
@@ -191,16 +185,9 @@ static void *workqueue_stat_next(void *prev, int idx)
static int workqueue_stat_show(struct seq_file *s, void *p)
{
struct cpu_workqueue_stats *cws = p;
- unsigned long flags;
- int cpu = cws->cpu;
struct pid *pid;
struct task_struct *tsk;
- spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
- if (&cws->list == workqueue_cpu_stat(cpu)->list.next)
- seq_printf(s, "\n");
- spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
-
pid = find_get_pid(cws->pid);
if (pid) {
tsk = get_pid_task(pid, PIDTYPE_PID);
diff --git a/kernel/user.c b/kernel/user.c
index 850e0ba41c1..2c000e7132a 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -75,21 +75,6 @@ static void uid_hash_remove(struct user_struct *up)
put_user_ns(up->user_ns);
}
-static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
-{
- struct user_struct *user;
- struct hlist_node *h;
-
- hlist_for_each_entry(user, h, hashent, uidhash_node) {
- if (user->uid == uid) {
- atomic_inc(&user->__count);
- return user;
- }
- }
-
- return NULL;
-}
-
#ifdef CONFIG_USER_SCHED
static void sched_destroy_user(struct user_struct *up)
@@ -119,6 +104,23 @@ static int sched_create_user(struct user_struct *up) { return 0; }
#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS)
+static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
+{
+ struct user_struct *user;
+ struct hlist_node *h;
+
+ hlist_for_each_entry(user, h, hashent, uidhash_node) {
+ if (user->uid == uid) {
+ /* possibly resurrect an "almost deleted" object */
+ if (atomic_inc_return(&user->__count) == 1)
+ cancel_delayed_work(&user->work);
+ return user;
+ }
+ }
+
+ return NULL;
+}
+
static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */
static DEFINE_MUTEX(uids_mutex);
@@ -283,12 +285,12 @@ int __init uids_sysfs_init(void)
return uids_user_create(&root_user);
}
-/* work function to remove sysfs directory for a user and free up
+/* delayed work function to remove sysfs directory for a user and free up
* corresponding structures.
*/
static void cleanup_user_struct(struct work_struct *w)
{
- struct user_struct *up = container_of(w, struct user_struct, work);
+ struct user_struct *up = container_of(w, struct user_struct, work.work);
unsigned long flags;
int remove_user = 0;
@@ -297,15 +299,12 @@ static void cleanup_user_struct(struct work_struct *w)
*/
uids_mutex_lock();
- local_irq_save(flags);
-
- if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
+ spin_lock_irqsave(&uidhash_lock, flags);
+ if (atomic_read(&up->__count) == 0) {
uid_hash_remove(up);
remove_user = 1;
- spin_unlock_irqrestore(&uidhash_lock, flags);
- } else {
- local_irq_restore(flags);
}
+ spin_unlock_irqrestore(&uidhash_lock, flags);
if (!remove_user)
goto done;
@@ -331,16 +330,28 @@ done:
*/
static void free_user(struct user_struct *up, unsigned long flags)
{
- /* restore back the count */
- atomic_inc(&up->__count);
spin_unlock_irqrestore(&uidhash_lock, flags);
-
- INIT_WORK(&up->work, cleanup_user_struct);
- schedule_work(&up->work);
+ INIT_DELAYED_WORK(&up->work, cleanup_user_struct);
+ schedule_delayed_work(&up->work, msecs_to_jiffies(1000));
}
#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */
+static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
+{
+ struct user_struct *user;
+ struct hlist_node *h;
+
+ hlist_for_each_entry(user, h, hashent, uidhash_node) {
+ if (user->uid == uid) {
+ atomic_inc(&user->__count);
+ return user;
+ }
+ }
+
+ return NULL;
+}
+
int uids_sysfs_init(void) { return 0; }
static inline int uids_user_create(struct user_struct *up) { return 0; }
static inline void uids_mutex_lock(void) { }
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 815237a55af..8a82b4b8ea5 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -15,6 +15,16 @@
#include <linux/err.h>
#include <linux/slab.h>
+static struct uts_namespace *create_uts_ns(void)
+{
+ struct uts_namespace *uts_ns;
+
+ uts_ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL);
+ if (uts_ns)
+ kref_init(&uts_ns->kref);
+ return uts_ns;
+}
+
/*
* Clone a new ns copying an original utsname, setting refcount to 1
* @old_ns: namespace to clone
@@ -24,14 +34,13 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
{
struct uts_namespace *ns;
- ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL);
+ ns = create_uts_ns();
if (!ns)
return ERR_PTR(-ENOMEM);
down_read(&uts_sem);
memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
up_read(&uts_sem);
- kref_init(&ns->kref);
return ns;
}
diff --git a/kernel/wait.c b/kernel/wait.c
index 42a2dbc181c..c4bd3d825f3 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -10,13 +10,14 @@
#include <linux/wait.h>
#include <linux/hash.h>
-void init_waitqueue_head(wait_queue_head_t *q)
+void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key)
{
spin_lock_init(&q->lock);
+ lockdep_set_class(&q->lock, key);
INIT_LIST_HEAD(&q->task_list);
}
-EXPORT_SYMBOL(init_waitqueue_head);
+EXPORT_SYMBOL(__init_waitqueue_head);
void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
{
@@ -154,7 +155,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
if (!list_empty(&wait->task_list))
list_del_init(&wait->task_list);
else if (waitqueue_active(q))
- __wake_up_common(q, mode, 1, 0, key);
+ __wake_up_locked_key(q, mode, key);
spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(abort_exclusive_wait);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f71fb2a0895..0668795d881 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -33,7 +33,8 @@
#include <linux/kallsyms.h>
#include <linux/debug_locks.h>
#include <linux/lockdep.h>
-#include <trace/workqueue.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/workqueue.h>
/*
* The per-CPU workqueue (if single thread, we always use the first
@@ -124,8 +125,6 @@ struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK);
}
-DEFINE_TRACE(workqueue_insertion);
-
static void insert_work(struct cpu_workqueue_struct *cwq,
struct work_struct *work, struct list_head *head)
{
@@ -262,8 +261,6 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
}
EXPORT_SYMBOL_GPL(queue_delayed_work_on);
-DEFINE_TRACE(workqueue_execution);
-
static void run_workqueue(struct cpu_workqueue_struct *cwq)
{
spin_lock_irq(&cwq->lock);
@@ -753,8 +750,6 @@ init_cpu_workqueue(struct workqueue_struct *wq, int cpu)
return cwq;
}
-DEFINE_TRACE(workqueue_creation);
-
static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
{
struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
@@ -860,8 +855,6 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
}
EXPORT_SYMBOL_GPL(__create_workqueue_key);
-DEFINE_TRACE(workqueue_destruction);
-
static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
{
/*