aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/exit.c5
-rw-r--r--kernel/fork.c21
-rw-r--r--kernel/futex.c7
-rw-r--r--kernel/hrtimer.c55
-rw-r--r--kernel/hung_task.c217
-rw-r--r--kernel/irq/devres.c16
-rw-r--r--kernel/irq/handle.c50
-rw-r--r--kernel/irq/manage.c189
-rw-r--r--kernel/irq/numa_migrate.c1
-rw-r--r--kernel/kprobes.c281
-rw-r--r--kernel/kthread.c26
-rw-r--r--kernel/lockdep.c5
-rw-r--r--kernel/module.c6
-rw-r--r--kernel/mutex.c3
-rw-r--r--kernel/panic.c12
-rw-r--r--kernel/posix-cpu-timers.c9
-rw-r--r--kernel/power/disk.c8
-rw-r--r--kernel/power/user.c9
-rw-r--r--kernel/ptrace.c16
-rw-r--r--kernel/rcuclassic.c23
-rw-r--r--kernel/rcupreempt.c48
-rw-r--r--kernel/rcutree.c20
-rw-r--r--kernel/rcutree.h10
-rw-r--r--kernel/rcutree_trace.c2
-rw-r--r--kernel/sched.c174
-rw-r--r--kernel/sched_cpupri.c5
-rw-r--r--kernel/sched_rt.c15
-rw-r--r--kernel/softirq.c2
-rw-r--r--kernel/softlockup.c100
-rw-r--r--kernel/sysctl.c42
-rw-r--r--kernel/timer.c7
-rw-r--r--kernel/trace/Kconfig4
-rw-r--r--kernel/trace/blktrace.c17
-rw-r--r--kernel/trace/kmemtrace.c319
-rw-r--r--kernel/trace/trace.c57
-rw-r--r--kernel/trace/trace.h8
-rw-r--r--kernel/trace/trace_events.c12
-rw-r--r--kernel/trace/trace_events_filter.c14
-rw-r--r--kernel/trace/trace_events_stage_2.h4
-rw-r--r--kernel/trace/trace_export.c2
-rw-r--r--kernel/trace/trace_output.c2
-rw-r--r--kernel/trace/trace_sched_switch.c3
-rw-r--r--kernel/trace/trace_sched_wakeup.c8
-rw-r--r--kernel/trace/trace_syscalls.c2
-rw-r--r--kernel/workqueue.c36
46 files changed, 1405 insertions, 468 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index bab1dffe37e..42423665660 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -74,6 +74,7 @@ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_KGDB) += kgdb.o
obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
+obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
diff --git a/kernel/exit.c b/kernel/exit.c
index 6686ed1e4aa..abf9cf3b95c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -837,8 +837,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
*/
if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) &&
(tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
- tsk->self_exec_id != tsk->parent_exec_id) &&
- !capable(CAP_KILL))
+ tsk->self_exec_id != tsk->parent_exec_id))
tsk->exit_signal = SIGCHLD;
signal = tracehook_notify_death(tsk, &cookie, group_dead);
@@ -924,6 +923,8 @@ NORET_TYPE void do_exit(long code)
schedule();
}
+ exit_irq_thread();
+
exit_signals(tsk); /* sets PF_EXITING */
/*
* tsk->flags are checked in the futex code to protect against
diff --git a/kernel/fork.c b/kernel/fork.c
index 660c2b8765b..b9e2edd0072 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -645,6 +645,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
tsk->min_flt = tsk->maj_flt = 0;
tsk->nvcsw = tsk->nivcsw = 0;
+#ifdef CONFIG_DETECT_HUNG_TASK
+ tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
+#endif
tsk->mm = NULL;
tsk->active_mm = NULL;
@@ -797,6 +800,12 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
sig->cputime_expires.virt_exp = cputime_zero;
sig->cputime_expires.sched_exp = 0;
+ if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
+ sig->cputime_expires.prof_exp =
+ secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
+ sig->cputimer.running = 1;
+ }
+
/* The timer lists. */
INIT_LIST_HEAD(&sig->cpu_timers[0]);
INIT_LIST_HEAD(&sig->cpu_timers[1]);
@@ -812,11 +821,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
atomic_inc(&current->signal->live);
return 0;
}
- sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
-
- if (sig)
- posix_cpu_timers_init_group(sig);
+ sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
tsk->signal = sig;
if (!sig)
return -ENOMEM;
@@ -856,6 +862,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
task_unlock(current->group_leader);
+ posix_cpu_timers_init_group(sig);
+
acct_init_pacct(&sig->pacct);
tty_audit_fork(sig);
@@ -1032,11 +1040,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->default_timer_slack_ns = current->timer_slack_ns;
-#ifdef CONFIG_DETECT_SOFTLOCKUP
- p->last_switch_count = 0;
- p->last_switch_timestamp = 0;
-#endif
-
task_io_accounting_init(&p->ioac);
acct_clear_integrals(p);
diff --git a/kernel/futex.c b/kernel/futex.c
index 6b50a024bca..eef8cd26b5e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -883,7 +883,12 @@ retry_private:
out_unlock:
double_unlock_hb(hb1, hb2);
- /* drop_futex_key_refs() must be called outside the spinlocks. */
+ /*
+ * drop_futex_key_refs() must be called outside the spinlocks. During
+ * the requeue we moved futex_q's from the hash bucket at key1 to the
+ * one at key2 and updated their key pointer. We no longer need to
+ * hold the references to key1.
+ */
while (--drop_count >= 0)
drop_futex_key_refs(&key1);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index f394d2a42ca..cb8a15c1958 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -651,14 +651,20 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
* and expiry check is done in the hrtimer_interrupt or in the softirq.
*/
static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
- struct hrtimer_clock_base *base)
+ struct hrtimer_clock_base *base,
+ int wakeup)
{
if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
- spin_unlock(&base->cpu_base->lock);
- raise_softirq_irqoff(HRTIMER_SOFTIRQ);
- spin_lock(&base->cpu_base->lock);
+ if (wakeup) {
+ spin_unlock(&base->cpu_base->lock);
+ raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+ spin_lock(&base->cpu_base->lock);
+ } else
+ __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+
return 1;
}
+
return 0;
}
@@ -703,7 +709,8 @@ static inline int hrtimer_is_hres_enabled(void) { return 0; }
static inline int hrtimer_switch_to_hres(void) { return 0; }
static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { }
static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
- struct hrtimer_clock_base *base)
+ struct hrtimer_clock_base *base,
+ int wakeup)
{
return 0;
}
@@ -886,20 +893,9 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
return 0;
}
-/**
- * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
- * @timer: the timer to be added
- * @tim: expiry time
- * @delta_ns: "slack" range for the timer
- * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
- *
- * Returns:
- * 0 on success
- * 1 when the timer was active
- */
-int
-hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_ns,
- const enum hrtimer_mode mode)
+int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
+ unsigned long delta_ns, const enum hrtimer_mode mode,
+ int wakeup)
{
struct hrtimer_clock_base *base, *new_base;
unsigned long flags;
@@ -940,12 +936,29 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n
* XXX send_remote_softirq() ?
*/
if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases))
- hrtimer_enqueue_reprogram(timer, new_base);
+ hrtimer_enqueue_reprogram(timer, new_base, wakeup);
unlock_hrtimer_base(timer, &flags);
return ret;
}
+
+/**
+ * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
+ * @timer: the timer to be added
+ * @tim: expiry time
+ * @delta_ns: "slack" range for the timer
+ * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
+ *
+ * Forwards to __hrtimer_start_range_ns() with its @wakeup argument set to 1.
+ *
+ * Returns:
+ * 0 on success
+ * 1 when the timer was active
+ */
+int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
+ unsigned long delta_ns, const enum hrtimer_mode mode)
+{
+ return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, 1);
+}
EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
/**
@@ -961,7 +974,7 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
int
hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
{
- return hrtimer_start_range_ns(timer, tim, 0, mode);
+ return __hrtimer_start_range_ns(timer, tim, 0, mode, 1);
}
EXPORT_SYMBOL_GPL(hrtimer_start);
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
new file mode 100644
index 00000000000..022a4927b78
--- /dev/null
+++ b/kernel/hung_task.c
@@ -0,0 +1,217 @@
+/*
+ * Detect Hung Task
+ *
+ * kernel/hung_task.c - kernel thread for detecting tasks stuck in D state
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/cpu.h>
+#include <linux/nmi.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/lockdep.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
+
+/*
+ * The number of tasks checked:
+ */
+unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
+
+/*
+ * Limit the number of tasks checked in a batch.
+ *
+ * This value controls the preemptibility of khungtaskd, since preemption
+ * is disabled during the critical section. It also controls the length of
+ * the RCU grace period, so it needs an upper bound.
+ */
+#define HUNG_TASK_BATCHING 1024
+
+/*
+ * Zero means infinite timeout - no checking done:
+ */
+unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120;
+
+unsigned long __read_mostly sysctl_hung_task_warnings = 10;
+
+static int __read_mostly did_panic;
+
+static struct task_struct *watchdog_task;
+
+/*
+ * Should we panic (and reboot, if panic_timeout= is set) when a
+ * hung task is detected:
+ */
+unsigned int __read_mostly sysctl_hung_task_panic =
+ CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE;
+
+/*
+ * Boot parameter hook: parse the numeric argument (base auto-detected
+ * by simple_strtoul()) into sysctl_hung_task_panic.  Always returns 1.
+ */
+static int __init hung_task_panic_setup(char *str)
+{
+	unsigned long parsed = simple_strtoul(str, NULL, 0);
+
+	sysctl_hung_task_panic = parsed;
+	return 1;
+}
+__setup("hung_task_panic=", hung_task_panic_setup);
+
+/*
+ * Notifier callback: record that it has run by setting did_panic, which
+ * check_hung_uninterruptible_tasks() tests before scanning.  All three
+ * arguments are unused.
+ */
+static int
+hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr)
+{
+	did_panic = 1;
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block panic_block = {
+ .notifier_call = hung_task_panic,
+};
+
+/*
+ * Report @t as hung if its context-switch count has not changed since
+ * the previous scan.
+ *
+ * @t:       task to examine
+ * @timeout: scan interval in seconds; only used in the warning text
+ *
+ * Each report consumes one unit of sysctl_hung_task_warnings; once that
+ * reaches zero nothing more is printed.  Panics after reporting when
+ * sysctl_hung_task_panic is set.
+ */
+static void check_hung_task(struct task_struct *t, unsigned long timeout)
+{
+	unsigned long switch_count = t->nvcsw + t->nivcsw;
+
+	/*
+	 * Ensure the task is not frozen.
+	 * Also, when a freshly created task is scheduled once and changes
+	 * its state to TASK_UNINTERRUPTIBLE without ever having been
+	 * switched out, it mustn't be checked.
+	 */
+	if (unlikely((t->flags & PF_FROZEN) || !switch_count))
+		return;
+
+	/* The task was switched since the last scan: remember the new count. */
+	if (switch_count != t->last_switch_count) {
+		t->last_switch_count = switch_count;
+		return;
+	}
+	/* Warning budget exhausted: stay silent. */
+	if (!sysctl_hung_task_warnings)
+		return;
+	sysctl_hung_task_warnings--;
+
+	/*
+	 * Ok, the task did not get scheduled for more than @timeout
+	 * seconds, complain:
+	 */
+	printk(KERN_ERR "INFO: task %s:%d blocked for more than "
+			"%lu seconds.\n", t->comm, t->pid, timeout);
+	printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
+			" disables this message.\n");
+	sched_show_task(t);
+	__debug_show_held_locks(t);
+
+	touch_nmi_watchdog();
+
+	if (sysctl_hung_task_panic)
+		panic("hung_task: blocked tasks");
+}
+
+/*
+ * To avoid extending the RCU grace period for an unbounded amount of time,
+ * periodically exit the critical section and enter a new one.
+ *
+ * For preemptible RCU it is sufficient to call rcu_read_unlock in order
+ * to exit the grace period. For classic RCU, a reschedule is required.
+ *
+ * @g, @t: the two iteration cursors of the caller's thread walk. A
+ * reference is taken on each before the read-side section is left and
+ * dropped again once it has been re-entered.
+ */
+static void rcu_lock_break(struct task_struct *g, struct task_struct *t)
+{
+ get_task_struct(g);
+ get_task_struct(t);
+ rcu_read_unlock();
+ cond_resched();
+ rcu_read_lock();
+ put_task_struct(t);
+ put_task_struct(g);
+}
+
+/*
+ * Walk the thread list and call check_hung_task() on every task whose
+ * state is exactly TASK_UNINTERRUPTIBLE.
+ *
+ * @timeout: scan interval in seconds, passed through for the warning text
+ *
+ * At most sysctl_hung_task_check_count tasks are visited per scan; a
+ * limit of 0 visits none.  The RCU read-side section is broken every
+ * HUNG_TASK_BATCHING tasks.
+ */
+static void check_hung_uninterruptible_tasks(unsigned long timeout)
+{
+	/* Same type as the sysctl, so large limits are not truncated. */
+	unsigned long max_count = sysctl_hung_task_check_count;
+	int batch_count = HUNG_TASK_BATCHING;
+	struct task_struct *g, *t;
+
+	/*
+	 * If the system crashed already then all bets are off,
+	 * do not report extra hung tasks:
+	 */
+	if (test_taint(TAINT_DIE) || did_panic)
+		return;
+
+	rcu_read_lock();
+	do_each_thread(g, t) {
+		/*
+		 * Test before decrementing: exactly max_count tasks are
+		 * visited, and a limit of 0 stops immediately instead of
+		 * wrapping around.
+		 */
+		if (!max_count--)
+			goto unlock;
+		if (!--batch_count) {
+			batch_count = HUNG_TASK_BATCHING;
+			rcu_lock_break(g, t);
+			/* Exit if t or g was unhashed during refresh. */
+			if (t->state == TASK_DEAD || g->state == TASK_DEAD)
+				goto unlock;
+		}
+		/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
+		if (t->state == TASK_UNINTERRUPTIBLE)
+			check_hung_task(t, timeout);
+	} while_each_thread(g, t);
+ unlock:
+	rcu_read_unlock();
+}
+
+/*
+ * Convert a timeout in seconds to jiffies.  A timeout of 0 disables the
+ * watchdog and maps to MAX_SCHEDULE_TIMEOUT.
+ */
+static unsigned long timeout_jiffies(unsigned long timeout)
+{
+	if (!timeout)
+		return MAX_SCHEDULE_TIMEOUT;
+
+	return timeout * HZ;
+}
+
+/*
+ * sysctl handler for the hung-task timeout.
+ *
+ * Delegates parsing to proc_doulongvec_minmax() and, after a successful
+ * write, wakes the watchdog thread.  Returns the result of
+ * proc_doulongvec_minmax().
+ */
+int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
+				  struct file *filp, void __user *buffer,
+				  size_t *lenp, loff_t *ppos)
+{
+	int ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
+
+	/* Reads and failed writes leave the watchdog alone. */
+	if (!ret && write)
+		wake_up_process(watchdog_task);
+
+	return ret;
+}
+
+/*
+ * kthread which checks for tasks stuck in D state
+ *
+ * Loops forever: sleep for the configured timeout, then scan all tasks.
+ * @dummy is unused.
+ */
+static int watchdog(void *dummy)
+{
+ set_user_nice(current, 0);
+
+ for ( ; ; ) {
+ unsigned long timeout = sysctl_hung_task_timeout_secs;
+
+ /*
+ * A non-zero return means the sleep ended before the timeout
+ * elapsed (presumably the wakeup issued by the sysctl handler);
+ * re-read the timeout and sleep again.
+ */
+ while (schedule_timeout_interruptible(timeout_jiffies(timeout)))
+ timeout = sysctl_hung_task_timeout_secs;
+
+ check_hung_uninterruptible_tasks(timeout);
+ }
+
+ /* Not reached: the loop above has no exit. */
+ return 0;
+}
+
+/*
+ * Register the panic notifier and start the "khungtaskd" watchdog thread.
+ * Always returns 0.
+ */
+static int __init hung_task_init(void)
+{
+ atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
+ /*
+ * NOTE(review): the kthread_run() result is stored unchecked, and
+ * proc_dohung_task_timeout_secs() passes watchdog_task straight to
+ * wake_up_process() - confirm thread creation cannot fail here.
+ */
+ watchdog_task = kthread_run(watchdog, NULL, "khungtaskd");
+
+ return 0;
+}
+
+module_init(hung_task_init);
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index 38a25b8d8bf..d06df9c41cb 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -26,10 +26,12 @@ static int devm_irq_match(struct device *dev, void *res, void *data)
}
/**
- * devm_request_irq - allocate an interrupt line for a managed device
+ * devm_request_threaded_irq - allocate an interrupt line for a managed device
* @dev: device to request interrupt for
* @irq: Interrupt line to allocate
* @handler: Function to be called when the IRQ occurs
+ * @thread_fn: function to be called in a threaded interrupt context. NULL
+ * for devices which handle everything in @handler
* @irqflags: Interrupt type flags
* @devname: An ascii name for the claiming device
* @dev_id: A cookie passed back to the handler function
@@ -42,9 +44,10 @@ static int devm_irq_match(struct device *dev, void *res, void *data)
* If an IRQ allocated with this function needs to be freed
* separately, dev_free_irq() must be used.
*/
-int devm_request_irq(struct device *dev, unsigned int irq,
- irq_handler_t handler, unsigned long irqflags,
- const char *devname, void *dev_id)
+int devm_request_threaded_irq(struct device *dev, unsigned int irq,
+ irq_handler_t handler, irq_handler_t thread_fn,
+ unsigned long irqflags, const char *devname,
+ void *dev_id)
{
struct irq_devres *dr;
int rc;
@@ -54,7 +57,8 @@ int devm_request_irq(struct device *dev, unsigned int irq,
if (!dr)
return -ENOMEM;
- rc = request_irq(irq, handler, irqflags, devname, dev_id);
+ rc = request_threaded_irq(irq, handler, thread_fn, irqflags, devname,
+ dev_id);
if (rc) {
devres_free(dr);
return rc;
@@ -66,7 +70,7 @@ int devm_request_irq(struct device *dev, unsigned int irq,
return 0;
}
-EXPORT_SYMBOL(devm_request_irq);
+EXPORT_SYMBOL(devm_request_threaded_irq);
/**
* devm_free_irq - free an interrupt
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 343acecae62..d82142be8dd 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -339,6 +339,15 @@ irqreturn_t no_action(int cpl, void *dev_id)
return IRQ_NONE;
}
+/*
+ * Warn, once per action, that a handler returned IRQ_WAKE_THREAD although
+ * the action has no thread function.  IRQTF_WARNED in
+ * action->thread_flags suppresses repeats.
+ */
+static void warn_no_thread(unsigned int irq, struct irqaction *action)
+{
+	if (test_and_set_bit(IRQTF_WARNED, &action->thread_flags))
+		return;
+
+	/* Terminate the message so the next log line is not appended to it. */
+	printk(KERN_WARNING "IRQ %u device %s returned IRQ_WAKE_THREAD "
+	       "but no thread function available.\n", irq, action->name);
+}
+
DEFINE_TRACE(irq_handler_entry);
DEFINE_TRACE(irq_handler_exit);
@@ -363,8 +372,47 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
trace_irq_handler_entry(irq, action);
ret = action->handler(irq, action->dev_id);
trace_irq_handler_exit(irq, action, ret);
- if (ret == IRQ_HANDLED)
+
+ switch (ret) {
+ case IRQ_WAKE_THREAD:
+ /*
+ * Set result to handled so the spurious check
+ * does not trigger.
+ */
+ ret = IRQ_HANDLED;
+
+ /*
+ * Catch drivers which return WAKE_THREAD but
+ * did not set up a thread function
+ */
+ if (unlikely(!action->thread_fn)) {
+ warn_no_thread(irq, action);
+ break;
+ }
+
+ /*
+ * Wake up the handler thread for this
+ * action. In case the thread crashed and was
+ * killed we just pretend that we handled the
+ * interrupt. The hardirq handler above has
+ * disabled the device interrupt, so no irq
+ * storm is lurking.
+ */
+ if (likely(!test_bit(IRQTF_DIED,
+ &action->thread_flags))) {
+ set_bit(IRQTF_RUNTHREAD, &action->thread_flags);
+ wake_up_process(action->thread);
+ }
+
+ /* Fall through to add to randomness */
+ case IRQ_HANDLED:
status |= action->flags;
+ break;
+
+ default:
+ break;
+ }
+
retval |= ret;
action = action->next;
} while (action);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 1516ab77355..7e2e7dd4cd2 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -8,16 +8,15 @@
*/
#include <linux/irq.h>
+#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/interrupt.h>
#include <linux/slab.h>
+#include <linux/sched.h>
#include "internals.h"
-#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
-cpumask_var_t irq_default_affinity;
-
/**
* synchronize_irq - wait for pending IRQ handlers (on other CPUs)
* @irq: interrupt number to wait for
@@ -53,9 +52,18 @@ void synchronize_irq(unsigned int irq)
/* Oops, that failed? */
} while (status & IRQ_INPROGRESS);
+
+ /*
+ * We made sure that no hardirq handler is running. Now verify
+ * that no threaded handlers are active.
+ */
+ wait_event(desc->wait_for_threads, !atomic_read(&desc->threads_active));
}
EXPORT_SYMBOL(synchronize_irq);
+#ifdef CONFIG_SMP
+cpumask_var_t irq_default_affinity;
+
/**
* irq_can_set_affinity - Check if the affinity of a given irq can be set
* @irq: Interrupt to check
@@ -72,6 +80,18 @@ int irq_can_set_affinity(unsigned int irq)
return 1;
}
+/*
+ * Apply @cpumask to the handler thread of every action chained on @desc.
+ * Actions without a thread are skipped.
+ *
+ * NOTE(review): both visible callers invoke this with desc->lock held and
+ * interrupts disabled - confirm set_cpus_allowed_ptr() is safe there.
+ */
+static void
+irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask)
+{
+	struct irqaction *act;
+
+	for (act = desc->action; act; act = act->next) {
+		if (!act->thread)
+			continue;
+		set_cpus_allowed_ptr(act->thread, cpumask);
+	}
+}
+
/**
* irq_set_affinity - Set the irq affinity of a given irq
* @irq: Interrupt to set affinity
@@ -100,6 +120,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
cpumask_copy(desc->affinity, cpumask);
desc->chip->set_affinity(irq, cpumask);
#endif
+ irq_set_thread_affinity(desc, cpumask);
desc->status |= IRQ_AFFINITY_SET;
spin_unlock_irqrestore(&desc->lock, flags);
return 0;
@@ -150,6 +171,8 @@ int irq_select_affinity_usr(unsigned int irq)
spin_lock_irqsave(&desc->lock, flags);
ret = setup_affinity(irq, desc);
+ if (!ret)
+ irq_set_thread_affinity(desc, desc->affinity);
spin_unlock_irqrestore(&desc->lock, flags);
return ret;
@@ -401,6 +424,90 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
return ret;
}
+/*
+ * Sleep until the hard irq handler requests a run of the thread handler.
+ *
+ * Returns 0 once IRQTF_RUNTHREAD was found set (and has been cleared),
+ * or -1 when kthread_should_stop() reports that the thread must exit.
+ */
+static int irq_wait_for_interrupt(struct irqaction *action)
+{
+ while (!kthread_should_stop()) {
+ /*
+ * The task state is changed before the flag is tested;
+ * presumably so that a wakeup arriving between the test
+ * and schedule() is not lost.
+ */
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (test_and_clear_bit(IRQTF_RUNTHREAD,
+ &action->thread_flags)) {
+ __set_current_state(TASK_RUNNING);
+ return 0;
+ }
+ schedule();
+ }
+ return -1;
+}
+
+/*
+ * Interrupt handler thread
+ *
+ * @data is the struct irqaction this thread serves. The thread switches
+ * itself to SCHED_FIFO at priority MAX_USER_RT_PRIO/2 and then runs
+ * action->thread_fn once for every wakeup reported by
+ * irq_wait_for_interrupt(), until that function returns non-zero.
+ * Returns 0.
+ */
+static int irq_thread(void *data)
+{
+ struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, };
+ struct irqaction *action = data;
+ struct irq_desc *desc = irq_to_desc(action->irq);
+ int wake;
+
+ sched_setscheduler(current, SCHED_FIFO, &param);
+ /* Lets exit_irq_thread() recognise this task as an irq thread. */
+ current->irqaction = action;
+
+ while (!irq_wait_for_interrupt(action)) {
+
+ /* Counted so synchronize_irq() can wait for running handlers. */
+ atomic_inc(&desc->threads_active);
+
+ spin_lock_irq(&desc->lock);
+ if (unlikely(desc->status & IRQ_DISABLED)) {
+ /*
+ * CHECKME: We might need a dedicated
+ * IRQ_THREAD_PENDING flag here, which
+ * retriggers the thread in check_irq_resend()
+ * but AFAICT IRQ_PENDING should be fine as it
+ * retriggers the interrupt itself --- tglx
+ */
+ desc->status |= IRQ_PENDING;
+ spin_unlock_irq(&desc->lock);
+ } else {
+ spin_unlock_irq(&desc->lock);
+
+ /* The thread handler runs with desc->lock released. */
+ action->thread_fn(action->irq, action->dev_id);
+ }
+
+ wake = atomic_dec_and_test(&desc->threads_active);
+
+ /* Last active thread: release waiters on wait_for_threads. */
+ if (wake && waitqueue_active(&desc->wait_for_threads))
+ wake_up(&desc->wait_for_threads);
+ }
+
+ /*
+ * Clear irqaction. Otherwise exit_irq_thread() would make
+ * a fuss about an active irq thread going into nirvana.
+ */
+ current->irqaction = NULL;
+ return 0;
+}
+
+/*
+ * Called from do_exit()
+ *
+ * If the exiting task still has an irqaction attached (i.e. it is an irq
+ * handler thread that did not leave through the normal irq_thread()
+ * exit path), log the event and mark the action's thread as dead.
+ */
+void exit_irq_thread(void)
+{
+	struct task_struct *tsk = current;
+
+	if (!tsk->irqaction)
+		return;
+
+	/* tsk->comm is an array, so it needs no NULL check. */
+	printk(KERN_ERR
+	       "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
+	       tsk->comm, tsk->pid, tsk->irqaction->irq);
+
+	/*
+	 * Set the THREAD DIED flag to prevent further wakeups of the
+	 * soon to be gone threaded handler.  The bit lives in
+	 * thread_flags, which is what handle_IRQ_event(), __setup_irq()
+	 * and __free_irq() test; ->flags holds the IRQF_* request flags.
+	 */
+	set_bit(IRQTF_DIED, &tsk->irqaction->thread_flags);
+}
+
/*
* Internal function to register an irqaction - typically used to
* allocate special interrupts that are part of the architecture.
@@ -437,6 +544,26 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
}
/*
+ * Threaded handler ?
+ */
+ if (new->thread_fn) {
+ struct task_struct *t;
+
+ t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
+ new->name);
+ if (IS_ERR(t))
+ return PTR_ERR(t);
+ /*
+ * We keep the reference to the task struct even if
+ * the thread dies to avoid that the interrupt code
+ * references an already freed task_struct.
+ */
+ get_task_struct(t);
+ new->thread = t;
+ wake_up_process(t);
+ }
+
+ /*
* The following block of code has to be executed atomically
*/
spin_lock_irqsave(&desc->lock, flags);
@@ -473,15 +600,15 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
if (!shared) {
irq_chip_set_defaults(desc->chip);
+ init_waitqueue_head(&desc->wait_for_threads);
+
/* Setup the type (level, edge polarity) if configured: */
if (new->flags & IRQF_TRIGGER_MASK) {
ret = __irq_set_trigger(desc, irq,
new->flags & IRQF_TRIGGER_MASK);
- if (ret) {
- spin_unlock_irqrestore(&desc->lock, flags);
- return ret;
- }
+ if (ret)
+ goto out_thread;
} else
compat_irq_chip_set_default_handler(desc);
#if defined(CONFIG_IRQ_PER_CPU)
@@ -549,8 +676,19 @@ mismatch:
dump_stack();
}
#endif
+ ret = -EBUSY;
+
+out_thread:
spin_unlock_irqrestore(&desc->lock, flags);
- return -EBUSY;
+ if (new->thread) {
+ struct task_struct *t = new->thread;
+
+ new->thread = NULL;
+ if (likely(!test_bit(IRQTF_DIED, &new->thread_flags)))
+ kthread_stop(t);
+ put_task_struct(t);
+ }
+ return ret;
}
/**
@@ -576,6 +714,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
{
struct irq_desc *desc = irq_to_desc(irq);
struct irqaction *action, **action_ptr;
+ struct task_struct *irqthread;
unsigned long flags;
WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
@@ -622,6 +761,10 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
else
desc->chip->disable(irq);
}
+
+ irqthread = action->thread;
+ action->thread = NULL;
+
spin_unlock_irqrestore(&desc->lock, flags);
unregister_handler_proc(irq, action);
@@ -629,6 +772,12 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
/* Make sure it's not being used on another CPU: */
synchronize_irq(irq);
+ if (irqthread) {
+ if (!test_bit(IRQTF_DIED, &action->thread_flags))
+ kthread_stop(irqthread);
+ put_task_struct(irqthread);
+ }
+
#ifdef CONFIG_DEBUG_SHIRQ
/*
* It's a shared IRQ -- the driver ought to be prepared for an IRQ
@@ -681,9 +830,12 @@ void free_irq(unsigned int irq, void *dev_id)
EXPORT_SYMBOL(free_irq);
/**
- * request_irq - allocate an interrupt line
+ * request_threaded_irq - allocate an interrupt line
* @irq: Interrupt line to allocate
- * @handler: Function to be called when the IRQ occurs
+ * @handler: Function to be called when the IRQ occurs.
+ * Primary handler for threaded interrupts
+ * @thread_fn: Function called from the irq handler thread
+ * If NULL, no irq thread is created
* @irqflags: Interrupt type flags
* @devname: An ascii name for the claiming device
* @dev_id: A cookie passed back to the handler function
@@ -695,6 +847,15 @@ EXPORT_SYMBOL(free_irq);
* raises, you must take care both to initialise your hardware
* and to set up the interrupt handler in the right order.
*
+ * If you want to set up a threaded irq handler for your device
+ * then you need to supply @handler and @thread_fn. @handler is
+ * still called in hard interrupt context and has to check
+ * whether the interrupt originates from the device. If yes it
+ * needs to disable the interrupt on the device and return
+ * IRQ_WAKE_THREAD which will wake up the handler thread and run
+ * @thread_fn. This split handler design is necessary to support
+ * shared interrupts.
+ *
* Dev_id must be globally unique. Normally the address of the
* device data structure is used as the cookie. Since the handler
* receives this value it makes sense to use it.
@@ -710,8 +871,9 @@ EXPORT_SYMBOL(free_irq);
* IRQF_TRIGGER_* Specify active edge(s) or level
*
*/
-int request_irq(unsigned int irq, irq_handler_t handler,
- unsigned long irqflags, const char *devname, void *dev_id)
+int request_threaded_irq(unsigned int irq, irq_handler_t handler,
+ irq_handler_t thread_fn, unsigned long irqflags,
+ const char *devname, void *dev_id)
{
struct irqaction *action;
struct irq_desc *desc;
@@ -759,6 +921,7 @@ int request_irq(unsigned int irq, irq_handler_t handler,
return -ENOMEM;
action->handler = handler;
+ action->thread_fn = thread_fn;
action->flags = irqflags;
action->name = devname;
action->dev_id = dev_id;
@@ -788,4 +951,4 @@ int request_irq(unsigned int irq, irq_handler_t handler,
#endif
return retval;
}
-EXPORT_SYMBOL(request_irq);
+EXPORT_SYMBOL(request_threaded_irq);
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 243d6121e50..44bbdcbaf8d 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -54,6 +54,7 @@ static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
{
free_kstat_irqs(old_desc, desc);
+ free_desc_masks(old_desc, desc);
arch_free_chip_data(old_desc, desc);
}
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 5016bfb682b..a5e74ddee0e 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -68,7 +68,7 @@ static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
/* NOTE: change this value only with kprobe_mutex held */
-static bool kprobe_enabled;
+static bool kprobes_all_disarmed;
static DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
@@ -328,7 +328,7 @@ static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
struct kprobe *kp;
list_for_each_entry_rcu(kp, &p->list, list) {
- if (kp->pre_handler && !kprobe_gone(kp)) {
+ if (kp->pre_handler && likely(!kprobe_disabled(kp))) {
set_kprobe_instance(kp);
if (kp->pre_handler(kp, regs))
return 1;
@@ -344,7 +344,7 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
struct kprobe *kp;
list_for_each_entry_rcu(kp, &p->list, list) {
- if (kp->post_handler && !kprobe_gone(kp)) {
+ if (kp->post_handler && likely(!kprobe_disabled(kp))) {
set_kprobe_instance(kp);
kp->post_handler(kp, regs, flags);
reset_kprobe_instance();
@@ -518,20 +518,28 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
}
/*
-* Add the new probe to old_p->list. Fail if this is the
+* Add the new probe to ap->list. Fail if this is the
* second jprobe at the address - two jprobes can't coexist
*/
-static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
+static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
{
+ BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
if (p->break_handler) {
- if (old_p->break_handler)
+ if (ap->break_handler)
return -EEXIST;
- list_add_tail_rcu(&p->list, &old_p->list);
- old_p->break_handler = aggr_break_handler;
+ list_add_tail_rcu(&p->list, &ap->list);
+ ap->break_handler = aggr_break_handler;
} else
- list_add_rcu(&p->list, &old_p->list);
- if (p->post_handler && !old_p->post_handler)
- old_p->post_handler = aggr_post_handler;
+ list_add_rcu(&p->list, &ap->list);
+ if (p->post_handler && !ap->post_handler)
+ ap->post_handler = aggr_post_handler;
+
+ if (kprobe_disabled(ap) && !kprobe_disabled(p)) {
+ ap->flags &= ~KPROBE_FLAG_DISABLED;
+ if (!kprobes_all_disarmed)
+ /* Arm the breakpoint again. */
+ arch_arm_kprobe(ap);
+ }
return 0;
}
@@ -544,6 +552,7 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
copy_kprobe(p, ap);
flush_insn_slot(ap);
ap->addr = p->addr;
+ ap->flags = p->flags;
ap->pre_handler = aggr_pre_handler;
ap->fault_handler = aggr_fault_handler;
/* We don't care the kprobe which has gone. */
@@ -566,44 +575,59 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
struct kprobe *p)
{
int ret = 0;
- struct kprobe *ap;
+ struct kprobe *ap = old_p;
- if (kprobe_gone(old_p)) {
+ if (old_p->pre_handler != aggr_pre_handler) {
+ /* If old_p is not an aggr_probe, create new aggr_kprobe. */
+ ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL);
+ if (!ap)
+ return -ENOMEM;
+ add_aggr_kprobe(ap, old_p);
+ }
+
+ if (kprobe_gone(ap)) {
/*
* Attempting to insert new probe at the same location that
* had a probe in the module vaddr area which already
* freed. So, the instruction slot has already been
* released. We need a new slot for the new probe.
*/
- ret = arch_prepare_kprobe(old_p);
+ ret = arch_prepare_kprobe(ap);
if (ret)
+ /*
+ * Even if allocating a new slot fails, there is no need
+ * to free the aggr_probe. It will be used next time, or
+ * freed by unregister_kprobe.
+ */
return ret;
- }
- if (old_p->pre_handler == aggr_pre_handler) {
- copy_kprobe(old_p, p);
- ret = add_new_kprobe(old_p, p);
- ap = old_p;
- } else {
- ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL);
- if (!ap) {
- if (kprobe_gone(old_p))
- arch_remove_kprobe(old_p);
- return -ENOMEM;
- }
- add_aggr_kprobe(ap, old_p);
- copy_kprobe(ap, p);
- ret = add_new_kprobe(ap, p);
- }
- if (kprobe_gone(old_p)) {
+
/*
- * If the old_p has gone, its breakpoint has been disarmed.
- * We have to arm it again after preparing real kprobes.
+ * Clear gone flag to prevent allocating new slot again, and
+ * set disabled flag because it is not armed yet.
*/
- ap->flags &= ~KPROBE_FLAG_GONE;
- if (kprobe_enabled)
- arch_arm_kprobe(ap);
+ ap->flags = (ap->flags & ~KPROBE_FLAG_GONE)
+ | KPROBE_FLAG_DISABLED;
}
- return ret;
+
+ copy_kprobe(ap, p);
+ return add_new_kprobe(ap, p);
+}
+
+/*
+ * Mark aggr_kprobe @p as disabled unless one of the probes on its list
+ * is still enabled.  Returns 1 if @p was disabled, 0 otherwise.
+ */
+static int __kprobes try_to_disable_aggr_kprobe(struct kprobe *p)
+{
+	struct kprobe *child;
+
+	list_for_each_entry_rcu(child, &p->list, list) {
+		if (kprobe_disabled(child))
+			continue;
+		/* An active probe remains, so the aggr_kprobe must stay armed. */
+		return 0;
+	}
+
+	p->flags |= KPROBE_FLAG_DISABLED;
+	return 1;
+}
static int __kprobes in_kprobes_functions(unsigned long addr)
@@ -664,7 +688,9 @@ int __kprobes register_kprobe(struct kprobe *p)
return -EINVAL;
}
- p->flags = 0;
+ /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
+ p->flags &= KPROBE_FLAG_DISABLED;
+
/*
* Check if are we probing a module.
*/
@@ -709,7 +735,7 @@ int __kprobes register_kprobe(struct kprobe *p)
hlist_add_head_rcu(&p->hlist,
&kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
- if (kprobe_enabled)
+ if (!kprobes_all_disarmed && !kprobe_disabled(p))
arch_arm_kprobe(p);
out_unlock_text:
@@ -722,26 +748,39 @@ out:
return ret;
}
+EXPORT_SYMBOL_GPL(register_kprobe);
-/*
- * Unregister a kprobe without a scheduler synchronization.
- */
-static int __kprobes __unregister_kprobe_top(struct kprobe *p)
+/* Check that the passed kprobe is valid; return the kprobe in kprobe_table. */
+static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
{
struct kprobe *old_p, *list_p;
old_p = get_kprobe(p->addr);
if (unlikely(!old_p))
- return -EINVAL;
+ return NULL;
if (p != old_p) {
list_for_each_entry_rcu(list_p, &old_p->list, list)
if (list_p == p)
/* kprobe p is a valid probe */
- goto valid_p;
- return -EINVAL;
+ goto valid;
+ return NULL;
}
-valid_p:
+valid:
+ return old_p;
+}
+
+/*
+ * Unregister a kprobe without a scheduler synchronization.
+ */
+static int __kprobes __unregister_kprobe_top(struct kprobe *p)
+{
+ struct kprobe *old_p, *list_p;
+
+ old_p = __get_valid_kprobe(p);
+ if (old_p == NULL)
+ return -EINVAL;
+
if (old_p == p ||
(old_p->pre_handler == aggr_pre_handler &&
list_is_singular(&old_p->list))) {
@@ -750,7 +789,7 @@ valid_p:
* enabled and not gone - otherwise, the breakpoint would
* already have been removed. We save on flushing icache.
*/
- if (kprobe_enabled && !kprobe_gone(old_p)) {
+ if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) {
mutex_lock(&text_mutex);
arch_disarm_kprobe(p);
mutex_unlock(&text_mutex);
@@ -768,6 +807,11 @@ valid_p:
}
noclean:
list_del_rcu(&p->list);
+ if (!kprobe_disabled(old_p)) {
+ try_to_disable_aggr_kprobe(old_p);
+ if (!kprobes_all_disarmed && kprobe_disabled(old_p))
+ arch_disarm_kprobe(old_p);
+ }
}
return 0;
}
@@ -803,11 +847,13 @@ int __kprobes register_kprobes(struct kprobe **kps, int num)
}
return ret;
}
+EXPORT_SYMBOL_GPL(register_kprobes);
void __kprobes unregister_kprobe(struct kprobe *p)
{
unregister_kprobes(&p, 1);
}
+EXPORT_SYMBOL_GPL(unregister_kprobe);
void __kprobes unregister_kprobes(struct kprobe **kps, int num)
{
@@ -826,6 +872,7 @@ void __kprobes unregister_kprobes(struct kprobe **kps, int num)
if (kps[i]->addr)
__unregister_kprobe_bottom(kps[i]);
}
+EXPORT_SYMBOL_GPL(unregister_kprobes);
static struct notifier_block kprobe_exceptions_nb = {
.notifier_call = kprobe_exceptions_notify,
@@ -865,16 +912,19 @@ int __kprobes register_jprobes(struct jprobe **jps, int num)
}
return ret;
}
+EXPORT_SYMBOL_GPL(register_jprobes);
int __kprobes register_jprobe(struct jprobe *jp)
{
return register_jprobes(&jp, 1);
}
+EXPORT_SYMBOL_GPL(register_jprobe);
void __kprobes unregister_jprobe(struct jprobe *jp)
{
unregister_jprobes(&jp, 1);
}
+EXPORT_SYMBOL_GPL(unregister_jprobe);
void __kprobes unregister_jprobes(struct jprobe **jps, int num)
{
@@ -894,6 +944,7 @@ void __kprobes unregister_jprobes(struct jprobe **jps, int num)
__unregister_kprobe_bottom(&jps[i]->kp);
}
}
+EXPORT_SYMBOL_GPL(unregister_jprobes);
#ifdef CONFIG_KRETPROBES
/*
@@ -987,6 +1038,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
free_rp_inst(rp);
return ret;
}
+EXPORT_SYMBOL_GPL(register_kretprobe);
int __kprobes register_kretprobes(struct kretprobe **rps, int num)
{
@@ -1004,11 +1056,13 @@ int __kprobes register_kretprobes(struct kretprobe **rps, int num)
}
return ret;
}
+EXPORT_SYMBOL_GPL(register_kretprobes);
void __kprobes unregister_kretprobe(struct kretprobe *rp)
{
unregister_kretprobes(&rp, 1);
}
+EXPORT_SYMBOL_GPL(unregister_kretprobe);
void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
{
@@ -1030,24 +1084,30 @@ void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
}
}
}
+EXPORT_SYMBOL_GPL(unregister_kretprobes);
#else /* CONFIG_KRETPROBES */
int __kprobes register_kretprobe(struct kretprobe *rp)
{
return -ENOSYS;
}
+EXPORT_SYMBOL_GPL(register_kretprobe);
int __kprobes register_kretprobes(struct kretprobe **rps, int num)
{
return -ENOSYS;
}
+EXPORT_SYMBOL_GPL(register_kretprobes);
+
void __kprobes unregister_kretprobe(struct kretprobe *rp)
{
}
+EXPORT_SYMBOL_GPL(unregister_kretprobe);
void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
{
}
+EXPORT_SYMBOL_GPL(unregister_kretprobes);
static int __kprobes pre_handler_kretprobe(struct kprobe *p,
struct pt_regs *regs)
@@ -1061,6 +1121,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
static void __kprobes kill_kprobe(struct kprobe *p)
{
struct kprobe *kp;
+
p->flags |= KPROBE_FLAG_GONE;
if (p->pre_handler == aggr_pre_handler) {
/*
@@ -1173,8 +1234,8 @@ static int __init init_kprobes(void)
}
}
- /* By default, kprobes are enabled */
- kprobe_enabled = true;
+ /* By default, kprobes are armed */
+ kprobes_all_disarmed = false;
err = arch_init_kprobes();
if (!err)
@@ -1202,12 +1263,18 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
else
kprobe_type = "k";
if (sym)
- seq_printf(pi, "%p %s %s+0x%x %s %s\n", p->addr, kprobe_type,
- sym, offset, (modname ? modname : " "),
- (kprobe_gone(p) ? "[GONE]" : ""));
+ seq_printf(pi, "%p %s %s+0x%x %s %s%s\n",
+ p->addr, kprobe_type, sym, offset,
+ (modname ? modname : " "),
+ (kprobe_gone(p) ? "[GONE]" : ""),
+ ((kprobe_disabled(p) && !kprobe_gone(p)) ?
+ "[DISABLED]" : ""));
else
- seq_printf(pi, "%p %s %p %s\n", p->addr, kprobe_type, p->addr,
- (kprobe_gone(p) ? "[GONE]" : ""));
+ seq_printf(pi, "%p %s %p %s%s\n",
+ p->addr, kprobe_type, p->addr,
+ (kprobe_gone(p) ? "[GONE]" : ""),
+ ((kprobe_disabled(p) && !kprobe_gone(p)) ?
+ "[DISABLED]" : ""));
}
static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos)
@@ -1272,7 +1339,72 @@ static struct file_operations debugfs_kprobes_operations = {
.release = seq_release,
};
-static void __kprobes enable_all_kprobes(void)
+/* Disable one kprobe */
+int __kprobes disable_kprobe(struct kprobe *kp)
+{
+ int ret = 0;
+ struct kprobe *p;
+
+ mutex_lock(&kprobe_mutex);
+
+ /* Check whether specified probe is valid. */
+ p = __get_valid_kprobe(kp);
+ if (unlikely(p == NULL)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* If the probe is already disabled (or gone), just return */
+ if (kprobe_disabled(kp))
+ goto out;
+
+ kp->flags |= KPROBE_FLAG_DISABLED;
+ if (p != kp)
+ /* When kp != p, p is always enabled. */
+ try_to_disable_aggr_kprobe(p);
+
+ if (!kprobes_all_disarmed && kprobe_disabled(p))
+ arch_disarm_kprobe(p);
+out:
+ mutex_unlock(&kprobe_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(disable_kprobe);
+
+/* Enable one kprobe */
+int __kprobes enable_kprobe(struct kprobe *kp)
+{
+ int ret = 0;
+ struct kprobe *p;
+
+ mutex_lock(&kprobe_mutex);
+
+ /* Check whether specified probe is valid. */
+ p = __get_valid_kprobe(kp);
+ if (unlikely(p == NULL)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (kprobe_gone(kp)) {
+ /* This kprobe is gone; we can't enable it. */
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (!kprobes_all_disarmed && kprobe_disabled(p))
+ arch_arm_kprobe(p);
+
+ p->flags &= ~KPROBE_FLAG_DISABLED;
+ if (p != kp)
+ kp->flags &= ~KPROBE_FLAG_DISABLED;
+out:
+ mutex_unlock(&kprobe_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(enable_kprobe);
+
+static void __kprobes arm_all_kprobes(void)
{
struct hlist_head *head;
struct hlist_node *node;
@@ -1281,20 +1413,20 @@ static void __kprobes enable_all_kprobes(void)
mutex_lock(&kprobe_mutex);
- /* If kprobes are already enabled, just return */
- if (kprobe_enabled)
+ /* If kprobes are armed, just return */
+ if (!kprobes_all_disarmed)
goto already_enabled;
mutex_lock(&text_mutex);
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
hlist_for_each_entry_rcu(p, node, head, hlist)
- if (!kprobe_gone(p))
+ if (!kprobe_disabled(p))
arch_arm_kprobe(p);
}
mutex_unlock(&text_mutex);
- kprobe_enabled = true;
+ kprobes_all_disarmed = false;
printk(KERN_INFO "Kprobes globally enabled\n");
already_enabled:
@@ -1302,7 +1434,7 @@ already_enabled:
return;
}
-static void __kprobes disable_all_kprobes(void)
+static void __kprobes disarm_all_kprobes(void)
{
struct hlist_head *head;
struct hlist_node *node;
@@ -1311,17 +1443,17 @@ static void __kprobes disable_all_kprobes(void)
mutex_lock(&kprobe_mutex);
- /* If kprobes are already disabled, just return */
- if (!kprobe_enabled)
+ /* If kprobes are already disarmed, just return */
+ if (kprobes_all_disarmed)
goto already_disabled;
- kprobe_enabled = false;
+ kprobes_all_disarmed = true;
printk(KERN_INFO "Kprobes globally disabled\n");
mutex_lock(&text_mutex);
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
hlist_for_each_entry_rcu(p, node, head, hlist) {
- if (!arch_trampoline_kprobe(p) && !kprobe_gone(p))
+ if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
arch_disarm_kprobe(p);
}
}
@@ -1347,7 +1479,7 @@ static ssize_t read_enabled_file_bool(struct file *file,
{
char buf[3];
- if (kprobe_enabled)
+ if (!kprobes_all_disarmed)
buf[0] = '1';
else
buf[0] = '0';
@@ -1370,12 +1502,12 @@ static ssize_t write_enabled_file_bool(struct file *file,
case 'y':
case 'Y':
case '1':
- enable_all_kprobes();
+ arm_all_kprobes();
break;
case 'n':
case 'N':
case '0':
- disable_all_kprobes();
+ disarm_all_kprobes();
break;
}
@@ -1418,16 +1550,5 @@ late_initcall(debugfs_kprobe_init);
module_init(init_kprobes);
-EXPORT_SYMBOL_GPL(register_kprobe);
-EXPORT_SYMBOL_GPL(unregister_kprobe);
-EXPORT_SYMBOL_GPL(register_kprobes);
-EXPORT_SYMBOL_GPL(unregister_kprobes);
-EXPORT_SYMBOL_GPL(register_jprobe);
-EXPORT_SYMBOL_GPL(unregister_jprobe);
-EXPORT_SYMBOL_GPL(register_jprobes);
-EXPORT_SYMBOL_GPL(unregister_jprobes);
+/* defined in arch/.../kernel/kprobes.c */
EXPORT_SYMBOL_GPL(jprobe_return);
-EXPORT_SYMBOL_GPL(register_kretprobe);
-EXPORT_SYMBOL_GPL(unregister_kretprobe);
-EXPORT_SYMBOL_GPL(register_kretprobes);
-EXPORT_SYMBOL_GPL(unregister_kretprobes);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 84bbadd4d02..4ebaf8519ab 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -76,6 +76,7 @@ static int kthread(void *_create)
/* OK, tell user we're spawned, wait for stop or wakeup */
__set_current_state(TASK_UNINTERRUPTIBLE);
+ create->result = current;
complete(&create->started);
schedule();
@@ -96,22 +97,10 @@ static void create_kthread(struct kthread_create_info *create)
/* We want our own signal handler (we take no signals by default). */
pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
- if (pid < 0) {
+ if (pid < 0)
create->result = ERR_PTR(pid);
- } else {
- struct sched_param param = { .sched_priority = 0 };
+ else
wait_for_completion(&create->started);
- read_lock(&tasklist_lock);
- create->result = find_task_by_pid_ns(pid, &init_pid_ns);
- read_unlock(&tasklist_lock);
- /*
- * root may have changed our (kthreadd's) priority or CPU mask.
- * The kernel thread should not inherit these properties.
- */
- sched_setscheduler(create->result, SCHED_NORMAL, &param);
- set_user_nice(create->result, KTHREAD_NICE_LEVEL);
- set_cpus_allowed_ptr(create->result, cpu_all_mask);
- }
complete(&create->done);
}
@@ -154,11 +143,20 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
wait_for_completion(&create.done);
if (!IS_ERR(create.result)) {
+ struct sched_param param = { .sched_priority = 0 };
va_list args;
+
va_start(args, namefmt);
vsnprintf(create.result->comm, sizeof(create.result->comm),
namefmt, args);
va_end(args);
+ /*
+ * root may have changed our (kthreadd's) priority or CPU mask.
+ * The kernel thread should not inherit these properties.
+ */
+ sched_setscheduler_nocheck(create.result, SCHED_NORMAL, &param);
+ set_user_nice(create.result, KTHREAD_NICE_LEVEL);
+ set_cpus_allowed_ptr(create.result, cpu_all_mask);
}
return create.result;
}
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 81b5f33970b..b0f01186696 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -793,6 +793,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
printk("BUG: MAX_LOCKDEP_KEYS too low!\n");
printk("turning off the locking correctness validator.\n");
+ dump_stack();
return NULL;
}
class = lock_classes + nr_lock_classes++;
@@ -856,6 +857,7 @@ static struct lock_list *alloc_list_entry(void)
printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n");
printk("turning off the locking correctness validator.\n");
+ dump_stack();
return NULL;
}
return list_entries + nr_list_entries++;
@@ -1682,6 +1684,7 @@ cache_hit:
printk("BUG: MAX_LOCKDEP_CHAINS too low!\n");
printk("turning off the locking correctness validator.\n");
+ dump_stack();
return 0;
}
chain = lock_chains + nr_lock_chains++;
@@ -2541,6 +2544,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
debug_locks_off();
printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n");
printk("turning off the locking correctness validator.\n");
+ dump_stack();
return 0;
}
@@ -2637,6 +2641,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
debug_locks_off();
printk("BUG: MAX_LOCK_DEPTH too low!\n");
printk("turning off the locking correctness validator.\n");
+ dump_stack();
return 0;
}
diff --git a/kernel/module.c b/kernel/module.c
index c268a771595..e797812a4d9 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1952,9 +1952,6 @@ static noinline struct module *load_module(void __user *umod,
if (strstarts(secstrings+sechdrs[i].sh_name, ".exit"))
sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC;
#endif
- /* Don't keep __versions around; it's just for loading. */
- if (strcmp(secstrings + sechdrs[i].sh_name, "__versions") == 0)
- sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC;
}
modindex = find_sec(hdr, sechdrs, secstrings,
@@ -2391,6 +2388,9 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_LIVE, mod);
+ /* We need to finish all async code before the module init sequence is done */
+ async_synchronize_full();
+
mutex_lock(&module_mutex);
/* Drop initial reference. */
module_put(mod);
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 5d79781394a..507cf2b5e9f 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -148,7 +148,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
preempt_disable();
mutex_acquire(&lock->dep_map, subclass, 0, ip);
-#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES)
+#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES) && \
+ !defined(CONFIG_HAVE_DEFAULT_NO_SPIN_MUTEXES)
/*
* Optimistic spinning.
*
diff --git a/kernel/panic.c b/kernel/panic.c
index 3fd8c5bf8b3..934fb377f4b 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -213,8 +213,16 @@ unsigned long get_taint(void)
void add_taint(unsigned flag)
{
- /* can't trust the integrity of the kernel anymore: */
- debug_locks = 0;
+ /*
+ * Can't trust the integrity of the kernel anymore.
+ * We don't call debug_locks_off() directly because the issue
+ * is not necessarily serious enough to set oops_in_progress to 1.
+ * Also, we want to keep lockdep up for staging development and
+ * the post-warning case.
+ */
+ if (flag != TAINT_CRAP && flag != TAINT_WARN && __debug_locks_off())
+ printk(KERN_WARNING "Disabling lockdep due to kernel taint\n");
+
set_bit(flag, &tainted_mask);
}
EXPORT_SYMBOL(add_taint);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 8e5d9a68b02..c9dcf98b446 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -18,7 +18,7 @@ void update_rlimit_cpu(unsigned long rlim_new)
cputime = secs_to_cputime(rlim_new);
if (cputime_eq(current->signal->it_prof_expires, cputime_zero) ||
- cputime_lt(current->signal->it_prof_expires, cputime)) {
+ cputime_gt(current->signal->it_prof_expires, cputime)) {
spin_lock_irq(&current->sighand->siglock);
set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
spin_unlock_irq(&current->sighand->siglock);
@@ -224,7 +224,7 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
cpu->cpu = virt_ticks(p);
break;
case CPUCLOCK_SCHED:
- cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p);
+ cpu->sched = task_sched_runtime(p);
break;
}
return 0;
@@ -305,18 +305,19 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
{
struct task_cputime cputime;
- thread_group_cputime(p, &cputime);
switch (CPUCLOCK_WHICH(which_clock)) {
default:
return -EINVAL;
case CPUCLOCK_PROF:
+ thread_group_cputime(p, &cputime);
cpu->cpu = cputime_add(cputime.utime, cputime.stime);
break;
case CPUCLOCK_VIRT:
+ thread_group_cputime(p, &cputime);
cpu->cpu = cputime.utime;
break;
case CPUCLOCK_SCHED:
- cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
+ cpu->sched = thread_group_sched_runtime(p);
break;
}
return 0;
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 5f21ab2bbcd..0854770b63b 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -22,6 +22,7 @@
#include <linux/console.h>
#include <linux/cpu.h>
#include <linux/freezer.h>
+#include <scsi/scsi_scan.h>
#include <asm/suspend.h>
#include "power.h"
@@ -645,6 +646,13 @@ static int software_resume(void)
return 0;
/*
+ * We can't depend on SCSI devices being available after loading one of
+ * their modules if scsi_complete_async_scans() is not called and the
+ * resume device usually is a SCSI one.
+ */
+ scsi_complete_async_scans();
+
+ /*
* name_to_dev_t() below takes a sysfs buffer mutex when sysfs
* is configured into the kernel. Since the regular hibernate
* trigger path is via sysfs which takes a buffer mutex before
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 6c85359364f..ed97375daae 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -24,6 +24,7 @@
#include <linux/cpu.h>
#include <linux/freezer.h>
#include <linux/smp_lock.h>
+#include <scsi/scsi_scan.h>
#include <asm/uaccess.h>
@@ -92,6 +93,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
filp->private_data = data;
memset(&data->handle, 0, sizeof(struct snapshot_handle));
if ((filp->f_flags & O_ACCMODE) == O_RDONLY) {
+ /* Hibernating. The image device should be accessible. */
data->swap = swsusp_resume_device ?
swap_type_of(swsusp_resume_device, 0, NULL) : -1;
data->mode = O_RDONLY;
@@ -99,6 +101,13 @@ static int snapshot_open(struct inode *inode, struct file *filp)
if (error)
pm_notifier_call_chain(PM_POST_HIBERNATION);
} else {
+ /*
+ * Resuming. We may need to wait for the image device to
+ * appear.
+ */
+ wait_for_device_probe();
+ scsi_complete_async_scans();
+
data->swap = -1;
data->mode = O_WRONLY;
error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index aaad0ec3419..64191fa09b7 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -21,9 +21,7 @@
#include <linux/audit.h>
#include <linux/pid_namespace.h>
#include <linux/syscalls.h>
-
-#include <asm/pgtable.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
/*
@@ -48,7 +46,7 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
list_add(&child->ptrace_entry, &new_parent->ptraced);
child->parent = new_parent;
}
-
+
/*
* Turn a tracing stop into a normal stop now, since with no tracer there
* would be no way to wake it up with SIGCONT or SIGKILL. If there was a
@@ -173,7 +171,7 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
task_lock(task);
err = __ptrace_may_access(task, mode);
task_unlock(task);
- return (!err ? true : false);
+ return !err;
}
int ptrace_attach(struct task_struct *task)
@@ -358,7 +356,7 @@ int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst
copied += retval;
src += retval;
dst += retval;
- len -= retval;
+ len -= retval;
}
return copied;
}
@@ -383,7 +381,7 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
copied += retval;
src += retval;
dst += retval;
- len -= retval;
+ len -= retval;
}
return copied;
}
@@ -496,9 +494,9 @@ static int ptrace_resume(struct task_struct *child, long request, long data)
if (unlikely(!arch_has_single_step()))
return -EIO;
user_enable_single_step(child);
- }
- else
+ } else {
user_disable_single_step(child);
+ }
child->exit_code = data;
wake_up_process(child);
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 654c640a6b9..0f2b0b31130 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -65,6 +65,7 @@ static struct rcu_ctrlblk rcu_ctrlblk = {
.lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
.cpumask = CPU_BITS_NONE,
};
+
static struct rcu_ctrlblk rcu_bh_ctrlblk = {
.cur = -300,
.completed = -300,
@@ -73,8 +74,26 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = {
.cpumask = CPU_BITS_NONE,
};
-DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
-DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
+static DEFINE_PER_CPU(struct rcu_data, rcu_data);
+static DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
+
+/*
+ * Increment the quiescent state counter.
+ * The counter is somewhat degenerate: we do not need to know
+ * how many quiescent states passed, just whether there was at least
+ * one since the start of the grace period. Thus it is just a flag.
+ */
+void rcu_qsctr_inc(int cpu)
+{
+ struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
+ rdp->passed_quiesc = 1;
+}
+
+void rcu_bh_qsctr_inc(int cpu)
+{
+ struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
+ rdp->passed_quiesc = 1;
+}
static int blimit = 10;
static int qhimark = 10000;
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 5d59e850fb7..ce97a4df64d 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -147,7 +147,51 @@ struct rcu_ctrlblk {
wait_queue_head_t sched_wq; /* Place for rcu_sched to sleep. */
};
+struct rcu_dyntick_sched {
+ int dynticks;
+ int dynticks_snap;
+ int sched_qs;
+ int sched_qs_snap;
+ int sched_dynticks_snap;
+};
+
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = {
+ .dynticks = 1,
+};
+
+void rcu_qsctr_inc(int cpu)
+{
+ struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
+
+ rdssp->sched_qs++;
+}
+
+#ifdef CONFIG_NO_HZ
+
+void rcu_enter_nohz(void)
+{
+ static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
+
+ smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
+ __get_cpu_var(rcu_dyntick_sched).dynticks++;
+ WARN_ON_RATELIMIT(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1, &rs);
+}
+
+void rcu_exit_nohz(void)
+{
+ static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
+
+ __get_cpu_var(rcu_dyntick_sched).dynticks++;
+ smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
+ WARN_ON_RATELIMIT(!(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1),
+ &rs);
+}
+
+#endif /* CONFIG_NO_HZ */
+
+
static DEFINE_PER_CPU(struct rcu_data, rcu_data);
+
static struct rcu_ctrlblk rcu_ctrlblk = {
.fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
.completed = 0,
@@ -427,10 +471,6 @@ static void __rcu_advance_callbacks(struct rcu_data *rdp)
}
}
-DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = {
- .dynticks = 1,
-};
-
#ifdef CONFIG_NO_HZ
static DEFINE_PER_CPU(int, rcu_update_flag);
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 97ce31579ec..7f326692257 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -78,6 +78,26 @@ DEFINE_PER_CPU(struct rcu_data, rcu_data);
struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
+/*
+ * Increment the quiescent state counter.
+ * The counter is somewhat degenerate: we do not need to know
+ * how many quiescent states passed, just whether there was at least
+ * one since the start of the grace period. Thus it is just a flag.
+ */
+void rcu_qsctr_inc(int cpu)
+{
+ struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
+ rdp->passed_quiesc = 1;
+ rdp->passed_quiesc_completed = rdp->completed;
+}
+
+void rcu_bh_qsctr_inc(int cpu)
+{
+ struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
+ rdp->passed_quiesc = 1;
+ rdp->passed_quiesc_completed = rdp->completed;
+}
+
#ifdef CONFIG_NO_HZ
DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
.dynticks_nesting = 1,
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
new file mode 100644
index 00000000000..5e872bbf07f
--- /dev/null
+++ b/kernel/rcutree.h
@@ -0,0 +1,10 @@
+
+/*
+ * RCU implementation internal declarations:
+ */
+extern struct rcu_state rcu_state;
+DECLARE_PER_CPU(struct rcu_data, rcu_data);
+
+extern struct rcu_state rcu_bh_state;
+DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
+
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index d6db3e83782..4ee954f6a8d 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -43,6 +43,8 @@
#include <linux/debugfs.h>
#include <linux/seq_file.h>
+#include "rcutree.h"
+
static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
{
if (!rdp->beenonline)
diff --git a/kernel/sched.c b/kernel/sched.c
index bec249885e1..5724508c3b6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -231,13 +231,20 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
spin_lock(&rt_b->rt_runtime_lock);
for (;;) {
+ unsigned long delta;
+ ktime_t soft, hard;
+
if (hrtimer_active(&rt_b->rt_period_timer))
break;
now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
- hrtimer_start_expires(&rt_b->rt_period_timer,
- HRTIMER_MODE_ABS);
+
+ soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
+ hard = hrtimer_get_expires(&rt_b->rt_period_timer);
+ delta = ktime_to_ns(ktime_sub(hard, soft));
+ __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
+ HRTIMER_MODE_ABS, 0);
}
spin_unlock(&rt_b->rt_runtime_lock);
}
@@ -1146,7 +1153,8 @@ static __init void init_hrtick(void)
*/
static void hrtick_start(struct rq *rq, u64 delay)
{
- hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
+ __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
+ HRTIMER_MODE_REL, 0);
}
static inline void init_hrtick(void)
@@ -1410,10 +1418,22 @@ iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
struct rq_iterator *iterator);
#endif
+/* Time spent by the tasks of the cpu accounting group executing in ... */
+enum cpuacct_stat_index {
+ CPUACCT_STAT_USER, /* ... user mode */
+ CPUACCT_STAT_SYSTEM, /* ... kernel mode */
+
+ CPUACCT_STAT_NSTATS,
+};
+
#ifdef CONFIG_CGROUP_CPUACCT
static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
+static void cpuacct_update_stats(struct task_struct *tsk,
+ enum cpuacct_stat_index idx, cputime_t val);
#else
static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
+static inline void cpuacct_update_stats(struct task_struct *tsk,
+ enum cpuacct_stat_index idx, cputime_t val) {}
#endif
static inline void inc_cpu_load(struct rq *rq, unsigned long load)
@@ -4503,9 +4523,25 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
EXPORT_PER_CPU_SYMBOL(kstat);
/*
- * Return any ns on the sched_clock that have not yet been banked in
+ * Return any ns on the sched_clock that have not yet been accounted in
* @p in case that task is currently running.
+ *
+ * Called with task_rq_lock() held on @rq.
*/
+static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
+{
+ u64 ns = 0;
+
+ if (task_current(rq, p)) {
+ update_rq_clock(rq);
+ ns = rq->clock - p->se.exec_start;
+ if ((s64)ns < 0)
+ ns = 0;
+ }
+
+ return ns;
+}
+
unsigned long long task_delta_exec(struct task_struct *p)
{
unsigned long flags;
@@ -4513,16 +4549,49 @@ unsigned long long task_delta_exec(struct task_struct *p)
u64 ns = 0;
rq = task_rq_lock(p, &flags);
+ ns = do_task_delta_exec(p, rq);
+ task_rq_unlock(rq, &flags);
- if (task_current(rq, p)) {
- u64 delta_exec;
+ return ns;
+}
- update_rq_clock(rq);
- delta_exec = rq->clock - p->se.exec_start;
- if ((s64)delta_exec > 0)
- ns = delta_exec;
- }
+/*
+ * Return accounted runtime for the task.
+ * In case the task is currently running, return the runtime plus current's
+ * pending runtime that has not been accounted yet.
+ */
+unsigned long long task_sched_runtime(struct task_struct *p)
+{
+ unsigned long flags;
+ struct rq *rq;
+ u64 ns = 0;
+
+ rq = task_rq_lock(p, &flags);
+ ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
+ task_rq_unlock(rq, &flags);
+
+ return ns;
+}
+/*
+ * Return sum_exec_runtime for the thread group.
+ * In case the task is currently running, return the sum plus current's
+ * pending runtime that has not been accounted yet.
+ *
+ * Note that the thread group might have other running tasks as well,
+ * so the return value does not include the pending runtime that those
+ * other running tasks might have.
+ */
+unsigned long long thread_group_sched_runtime(struct task_struct *p)
+{
+ struct task_cputime totals;
+ unsigned long flags;
+ struct rq *rq;
+ u64 ns;
+
+ rq = task_rq_lock(p, &flags);
+ thread_group_cputime(p, &totals);
+ ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
task_rq_unlock(rq, &flags);
return ns;
@@ -4551,6 +4620,8 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
cpustat->nice = cputime64_add(cpustat->nice, tmp);
else
cpustat->user = cputime64_add(cpustat->user, tmp);
+
+ cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
/* Account for user time used */
acct_update_integrals(p);
}
@@ -4612,6 +4683,8 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
else
cpustat->system = cputime64_add(cpustat->system, tmp);
+ cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+
/* Account for system time used */
acct_update_integrals(p);
}
@@ -7294,7 +7367,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
cpumask_or(groupmask, groupmask, sched_group_cpus(group));
cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
- printk(KERN_CONT " %s", str);
+ printk(KERN_CONT " %s (__cpu_power = %d)", str,
+ group->__cpu_power);
group = group->next;
} while (group != sd->groups);
@@ -9917,6 +9991,7 @@ struct cpuacct {
struct cgroup_subsys_state css;
/* cpuusage holds pointer to a u64-type object on every cpu */
u64 *cpuusage;
+ struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
struct cpuacct *parent;
};
@@ -9941,20 +10016,32 @@ static struct cgroup_subsys_state *cpuacct_create(
struct cgroup_subsys *ss, struct cgroup *cgrp)
{
struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+ int i;
if (!ca)
- return ERR_PTR(-ENOMEM);
+ goto out;
ca->cpuusage = alloc_percpu(u64);
- if (!ca->cpuusage) {
- kfree(ca);
- return ERR_PTR(-ENOMEM);
- }
+ if (!ca->cpuusage)
+ goto out_free_ca;
+
+ for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
+ if (percpu_counter_init(&ca->cpustat[i], 0))
+ goto out_free_counters;
if (cgrp->parent)
ca->parent = cgroup_ca(cgrp->parent);
return &ca->css;
+
+out_free_counters:
+ while (--i >= 0)
+ percpu_counter_destroy(&ca->cpustat[i]);
+ free_percpu(ca->cpuusage);
+out_free_ca:
+ kfree(ca);
+out:
+ return ERR_PTR(-ENOMEM);
}
/* destroy an existing cpu accounting group */
@@ -9962,7 +10049,10 @@ static void
cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
struct cpuacct *ca = cgroup_ca(cgrp);
+ int i;
+ for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
+ percpu_counter_destroy(&ca->cpustat[i]);
free_percpu(ca->cpuusage);
kfree(ca);
}
@@ -10049,6 +10139,25 @@ static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
return 0;
}
+static const char *cpuacct_stat_desc[] = {
+ [CPUACCT_STAT_USER] = "user",
+ [CPUACCT_STAT_SYSTEM] = "system",
+};
+
+static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
+ struct cgroup_map_cb *cb)
+{
+ struct cpuacct *ca = cgroup_ca(cgrp);
+ int i;
+
+ for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
+ s64 val = percpu_counter_read(&ca->cpustat[i]);
+ val = cputime64_to_clock_t(val);
+ cb->fill(cb, cpuacct_stat_desc[i], val);
+ }
+ return 0;
+}
+
static struct cftype files[] = {
{
.name = "usage",
@@ -10059,7 +10168,10 @@ static struct cftype files[] = {
.name = "usage_percpu",
.read_seq_string = cpuacct_percpu_seq_read,
},
-
+ {
+ .name = "stat",
+ .read_map = cpuacct_stats_show,
+ },
};
static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
@@ -10081,12 +10193,38 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
return;
cpu = task_cpu(tsk);
+
+ rcu_read_lock();
+
ca = task_ca(tsk);
for (; ca; ca = ca->parent) {
u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
*cpuusage += cputime;
}
+
+ rcu_read_unlock();
+}
+
+/*
+ * Charge the system/user time to the task's accounting group.
+ */
+static void cpuacct_update_stats(struct task_struct *tsk,
+ enum cpuacct_stat_index idx, cputime_t val)
+{
+ struct cpuacct *ca;
+
+ if (unlikely(!cpuacct_subsys.active))
+ return;
+
+ rcu_read_lock();
+ ca = task_ca(tsk);
+
+ do {
+ percpu_counter_add(&ca->cpustat[idx], val);
+ ca = ca->parent;
+ } while (ca);
+ rcu_read_unlock();
}
struct cgroup_subsys cpuacct_subsys = {
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 1e00bfacf9b..cdd3c89574c 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -55,7 +55,7 @@ static int convert_prio(int prio)
* cpupri_find - find the best (lowest-pri) CPU in the system
* @cp: The cpupri context
* @p: The task
- * @lowest_mask: A mask to fill in with selected CPUs
+ * @lowest_mask: A mask to fill in with selected CPUs (or NULL)
*
* Note: This function returns the recommended CPUs as calculated during the
* current invokation. By the time the call returns, the CPUs may have in
@@ -81,7 +81,8 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
continue;
- cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
+ if (lowest_mask)
+ cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
return 1;
}
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 299d012b439..f2c66f8f971 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -948,20 +948,15 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
{
- cpumask_var_t mask;
-
if (rq->curr->rt.nr_cpus_allowed == 1)
return;
- if (!alloc_cpumask_var(&mask, GFP_ATOMIC))
- return;
-
if (p->rt.nr_cpus_allowed != 1
- && cpupri_find(&rq->rd->cpupri, p, mask))
- goto free;
+ && cpupri_find(&rq->rd->cpupri, p, NULL))
+ return;
- if (!cpupri_find(&rq->rd->cpupri, rq->curr, mask))
- goto free;
+ if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
+ return;
/*
* There appears to be other cpus that can accept
@@ -970,8 +965,6 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
*/
requeue_task_rt(rq, p, 1);
resched_task(rq->curr);
-free:
- free_cpumask_var(mask);
}
#endif /* CONFIG_SMP */
diff --git a/kernel/softirq.c b/kernel/softirq.c
index d105a82543d..2fecefacdc5 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -65,7 +65,7 @@ char *softirq_to_name[NR_SOFTIRQS] = {
* to the pending events, so lets the scheduler to balance
* the softirq load for us.
*/
-static inline void wakeup_softirqd(void)
+void wakeup_softirqd(void)
{
/* Interrupts are disabled: no need to stop preemption */
struct task_struct *tsk = __get_cpu_var(ksoftirqd);
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 85d5a245510..88796c33083 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -166,97 +166,11 @@ void softlockup_tick(void)
}
/*
- * Have a reasonable limit on the number of tasks checked:
- */
-unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
-
-/*
- * Zero means infinite timeout - no checking done:
- */
-unsigned long __read_mostly sysctl_hung_task_timeout_secs = 480;
-
-unsigned long __read_mostly sysctl_hung_task_warnings = 10;
-
-/*
- * Only do the hung-tasks check on one CPU:
- */
-static int check_cpu __read_mostly = -1;
-
-static void check_hung_task(struct task_struct *t, unsigned long now)
-{
- unsigned long switch_count = t->nvcsw + t->nivcsw;
-
- if (t->flags & PF_FROZEN)
- return;
-
- if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
- t->last_switch_count = switch_count;
- t->last_switch_timestamp = now;
- return;
- }
- if ((long)(now - t->last_switch_timestamp) <
- sysctl_hung_task_timeout_secs)
- return;
- if (!sysctl_hung_task_warnings)
- return;
- sysctl_hung_task_warnings--;
-
- /*
- * Ok, the task did not get scheduled for more than 2 minutes,
- * complain:
- */
- printk(KERN_ERR "INFO: task %s:%d blocked for more than "
- "%ld seconds.\n", t->comm, t->pid,
- sysctl_hung_task_timeout_secs);
- printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
- " disables this message.\n");
- sched_show_task(t);
- __debug_show_held_locks(t);
-
- t->last_switch_timestamp = now;
- touch_nmi_watchdog();
-
- if (softlockup_panic)
- panic("softlockup: blocked tasks");
-}
-
-/*
- * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
- * a really long time (120 seconds). If that happens, print out
- * a warning.
- */
-static void check_hung_uninterruptible_tasks(int this_cpu)
-{
- int max_count = sysctl_hung_task_check_count;
- unsigned long now = get_timestamp(this_cpu);
- struct task_struct *g, *t;
-
- /*
- * If the system crashed already then all bets are off,
- * do not report extra hung tasks:
- */
- if (test_taint(TAINT_DIE) || did_panic)
- return;
-
- read_lock(&tasklist_lock);
- do_each_thread(g, t) {
- if (!--max_count)
- goto unlock;
- /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
- if (t->state == TASK_UNINTERRUPTIBLE)
- check_hung_task(t, now);
- } while_each_thread(g, t);
- unlock:
- read_unlock(&tasklist_lock);
-}
-
-/*
* The watchdog thread - runs every second and touches the timestamp.
*/
static int watchdog(void *__bind_cpu)
{
struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
- int this_cpu = (long)__bind_cpu;
sched_setscheduler(current, SCHED_FIFO, &param);
@@ -276,11 +190,6 @@ static int watchdog(void *__bind_cpu)
if (kthread_should_stop())
break;
- if (this_cpu == check_cpu) {
- if (sysctl_hung_task_timeout_secs)
- check_hung_uninterruptible_tasks(this_cpu);
- }
-
set_current_state(TASK_INTERRUPTIBLE);
}
__set_current_state(TASK_RUNNING);
@@ -312,18 +221,9 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
break;
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
- check_cpu = cpumask_any(cpu_online_mask);
wake_up_process(per_cpu(watchdog_task, hotcpu));
break;
#ifdef CONFIG_HOTPLUG_CPU
- case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- if (hotcpu == check_cpu) {
- /* Pick any other online cpu. */
- check_cpu = cpumask_any_but(cpu_online_mask, hotcpu);
- }
- break;
-
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
if (!per_cpu(watchdog_task, hotcpu))
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 82350f8f04f..4286b62b34a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -97,10 +97,11 @@ static int neg_one = -1;
#endif
static int zero;
-static int one = 1;
-static int two = 2;
+static int __maybe_unused one = 1;
+static int __maybe_unused two = 2;
static unsigned long one_ul = 1;
static int one_hundred = 100;
+static int one_thousand = 1000;
/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
static int maxolduid = 65535;
@@ -813,6 +814,19 @@ static struct ctl_table kern_table[] = {
.extra1 = &neg_one,
.extra2 = &sixty,
},
+#endif
+#ifdef CONFIG_DETECT_HUNG_TASK
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "hung_task_panic",
+ .data = &sysctl_hung_task_panic,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
{
.ctl_name = CTL_UNNUMBERED,
.procname = "hung_task_check_count",
@@ -828,7 +842,7 @@ static struct ctl_table kern_table[] = {
.data = &sysctl_hung_task_timeout_secs,
.maxlen = sizeof(unsigned long),
.mode = 0644,
- .proc_handler = &proc_doulongvec_minmax,
+ .proc_handler = &proc_dohung_task_timeout_secs,
.strategy = &sysctl_intvec,
},
{
@@ -1027,6 +1041,28 @@ static struct ctl_table vm_table[] = {
.proc_handler = &proc_dointvec,
},
{
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "nr_pdflush_threads_min",
+ .data = &nr_pdflush_threads_min,
+ .maxlen = sizeof nr_pdflush_threads_min,
+ .mode = 0644 /* read-write */,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &one,
+ .extra2 = &nr_pdflush_threads_max,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "nr_pdflush_threads_max",
+ .data = &nr_pdflush_threads_max,
+ .maxlen = sizeof nr_pdflush_threads_max,
+ .mode = 0644 /* read-write */,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &nr_pdflush_threads_min,
+ .extra2 = &one_thousand,
+ },
+ {
.ctl_name = VM_SWAPPINESS,
.procname = "swappiness",
.data = &vm_swappiness,
diff --git a/kernel/timer.c b/kernel/timer.c
index b4555568b4e..cffffad01c3 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -531,10 +531,13 @@ static void __init_timer(struct timer_list *timer,
}
/**
- * init_timer - initialize a timer.
+ * init_timer_key - initialize a timer
* @timer: the timer to be initialized
+ * @name: name of the timer
+ * @key: lockdep class key of the fake lock used for tracking timer
+ * sync lock dependencies
*
- * init_timer() must be done to a timer prior calling *any* of the
+ * init_timer_key() must be done to a timer prior calling *any* of the
* other timer functions.
*/
void init_timer_key(struct timer_list *timer,
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 2246141bda4..417d1985e29 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -312,7 +312,7 @@ config KMEMTRACE
and profile kernel code.
This requires an userspace application to use. See
- Documentation/vm/kmemtrace.txt for more information.
+ Documentation/trace/kmemtrace.txt for more information.
Saying Y will make the kernel somewhat larger and slower. However,
if you disable kmemtrace at run-time or boot-time, the performance
@@ -403,7 +403,7 @@ config MMIOTRACE
implementation and works via page faults. Tracing is disabled by
default and can be enabled at run-time.
- See Documentation/tracers/mmiotrace.txt.
+ See Documentation/trace/mmiotrace.txt.
If you are not helping to develop drivers, say N.
config MMIOTRACE_TEST
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 947c5b3f90c..921ef5d1f0b 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -327,10 +327,10 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
char *msg;
struct blk_trace *bt;
- if (count > BLK_TN_MAX_MSG)
+ if (count >= BLK_TN_MAX_MSG)
return -EINVAL;
- msg = kmalloc(count, GFP_KERNEL);
+ msg = kmalloc(count + 1, GFP_KERNEL);
if (msg == NULL)
return -ENOMEM;
@@ -339,6 +339,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
return -EFAULT;
}
+ msg[count] = '\0';
bt = filp->private_data;
__trace_note_message(bt, "%s", msg);
kfree(msg);
@@ -642,7 +643,7 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
if (blk_pc_request(rq)) {
what |= BLK_TC_ACT(BLK_TC_PC);
__blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors,
- sizeof(rq->cmd), rq->cmd);
+ rq->cmd_len, rq->cmd);
} else {
what |= BLK_TC_ACT(BLK_TC_FS);
__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
@@ -1376,12 +1377,12 @@ static int blk_trace_str2mask(const char *str)
{
int i;
int mask = 0;
- char *s, *token;
+ char *buf, *s, *token;
- s = kstrdup(str, GFP_KERNEL);
- if (s == NULL)
+ buf = kstrdup(str, GFP_KERNEL);
+ if (buf == NULL)
return -ENOMEM;
- s = strstrip(s);
+ s = strstrip(buf);
while (1) {
token = strsep(&s, ",");
@@ -1402,7 +1403,7 @@ static int blk_trace_str2mask(const char *str)
break;
}
}
- kfree(s);
+ kfree(buf);
return mask;
}
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index ae201b3eda8..5011f4d91e3 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -6,14 +6,16 @@
* Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
*/
-#include <linux/dcache.h>
+#include <linux/tracepoint.h>
+#include <linux/seq_file.h>
#include <linux/debugfs.h>
+#include <linux/dcache.h>
#include <linux/fs.h>
-#include <linux/seq_file.h>
+
#include <trace/kmemtrace.h>
-#include "trace.h"
#include "trace_output.h"
+#include "trace.h"
/* Select an alternative, minimalistic output than the original one */
#define TRACE_KMEM_OPT_MINIMAL 0x1
@@ -25,14 +27,156 @@ static struct tracer_opt kmem_opts[] = {
};
static struct tracer_flags kmem_tracer_flags = {
- .val = 0,
- .opts = kmem_opts
+ .val = 0,
+ .opts = kmem_opts
};
-
-static bool kmem_tracing_enabled __read_mostly;
static struct trace_array *kmemtrace_array;
+/* Trace allocations */
+static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
+ unsigned long call_site,
+ const void *ptr,
+ size_t bytes_req,
+ size_t bytes_alloc,
+ gfp_t gfp_flags,
+ int node)
+{
+ struct trace_array *tr = kmemtrace_array;
+ struct kmemtrace_alloc_entry *entry;
+ struct ring_buffer_event *event;
+
+ event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
+ if (!event)
+ return;
+
+ entry = ring_buffer_event_data(event);
+ tracing_generic_entry_update(&entry->ent, 0, 0);
+
+ entry->ent.type = TRACE_KMEM_ALLOC;
+ entry->type_id = type_id;
+ entry->call_site = call_site;
+ entry->ptr = ptr;
+ entry->bytes_req = bytes_req;
+ entry->bytes_alloc = bytes_alloc;
+ entry->gfp_flags = gfp_flags;
+ entry->node = node;
+
+ ring_buffer_unlock_commit(tr->buffer, event);
+
+ trace_wake_up();
+}
+
+static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
+ unsigned long call_site,
+ const void *ptr)
+{
+ struct trace_array *tr = kmemtrace_array;
+ struct kmemtrace_free_entry *entry;
+ struct ring_buffer_event *event;
+
+ event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ tracing_generic_entry_update(&entry->ent, 0, 0);
+
+ entry->ent.type = TRACE_KMEM_FREE;
+ entry->type_id = type_id;
+ entry->call_site = call_site;
+ entry->ptr = ptr;
+
+ ring_buffer_unlock_commit(tr->buffer, event);
+
+ trace_wake_up();
+}
+
+static void kmemtrace_kmalloc(unsigned long call_site,
+ const void *ptr,
+ size_t bytes_req,
+ size_t bytes_alloc,
+ gfp_t gfp_flags)
+{
+ kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
+ bytes_req, bytes_alloc, gfp_flags, -1);
+}
+
+static void kmemtrace_kmem_cache_alloc(unsigned long call_site,
+ const void *ptr,
+ size_t bytes_req,
+ size_t bytes_alloc,
+ gfp_t gfp_flags)
+{
+ kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
+ bytes_req, bytes_alloc, gfp_flags, -1);
+}
+
+static void kmemtrace_kmalloc_node(unsigned long call_site,
+ const void *ptr,
+ size_t bytes_req,
+ size_t bytes_alloc,
+ gfp_t gfp_flags,
+ int node)
+{
+ kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
+ bytes_req, bytes_alloc, gfp_flags, node);
+}
+
+static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site,
+ const void *ptr,
+ size_t bytes_req,
+ size_t bytes_alloc,
+ gfp_t gfp_flags,
+ int node)
+{
+ kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
+ bytes_req, bytes_alloc, gfp_flags, node);
+}
+
+static void kmemtrace_kfree(unsigned long call_site, const void *ptr)
+{
+ kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr);
+}
+
+static void kmemtrace_kmem_cache_free(unsigned long call_site, const void *ptr)
+{
+ kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr);
+}
+
+static int kmemtrace_start_probes(void)
+{
+ int err;
+
+ err = register_trace_kmalloc(kmemtrace_kmalloc);
+ if (err)
+ return err;
+ err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc);
+ if (err)
+ return err;
+ err = register_trace_kmalloc_node(kmemtrace_kmalloc_node);
+ if (err)
+ return err;
+ err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node);
+ if (err)
+ return err;
+ err = register_trace_kfree(kmemtrace_kfree);
+ if (err)
+ return err;
+ err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free);
+
+ return err;
+}
+
+static void kmemtrace_stop_probes(void)
+{
+ unregister_trace_kmalloc(kmemtrace_kmalloc);
+ unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc);
+ unregister_trace_kmalloc_node(kmemtrace_kmalloc_node);
+ unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node);
+ unregister_trace_kfree(kmemtrace_kfree);
+ unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free);
+}
+
static int kmem_trace_init(struct trace_array *tr)
{
int cpu;
@@ -41,14 +185,14 @@ static int kmem_trace_init(struct trace_array *tr)
for_each_cpu_mask(cpu, cpu_possible_map)
tracing_reset(tr, cpu);
- kmem_tracing_enabled = true;
+ kmemtrace_start_probes();
return 0;
}
static void kmem_trace_reset(struct trace_array *tr)
{
- kmem_tracing_enabled = false;
+ kmemtrace_stop_probes();
}
static void kmemtrace_headers(struct seq_file *s)
@@ -66,47 +210,84 @@ static void kmemtrace_headers(struct seq_file *s)
}
/*
- * The two following functions give the original output from kmemtrace,
- * or something close to....perhaps they need some missing things
+ * The following functions give the original output from kmemtrace,
+ * plus the origin CPU, since reordering occurs in-kernel now.
*/
+
+#define KMEMTRACE_USER_ALLOC 0
+#define KMEMTRACE_USER_FREE 1
+
+struct kmemtrace_user_event {
+ u8 event_id;
+ u8 type_id;
+ u16 event_size;
+ u32 cpu;
+ u64 timestamp;
+ unsigned long call_site;
+ unsigned long ptr;
+};
+
+struct kmemtrace_user_event_alloc {
+ size_t bytes_req;
+ size_t bytes_alloc;
+ unsigned gfp_flags;
+ int node;
+};
+
static enum print_line_t
-kmemtrace_print_alloc_original(struct trace_iterator *iter,
- struct kmemtrace_alloc_entry *entry)
+kmemtrace_print_alloc_user(struct trace_iterator *iter,
+ struct kmemtrace_alloc_entry *entry)
{
+ struct kmemtrace_user_event_alloc *ev_alloc;
struct trace_seq *s = &iter->seq;
- int ret;
+ struct kmemtrace_user_event *ev;
+
+ ev = trace_seq_reserve(s, sizeof(*ev));
+ if (!ev)
+ return TRACE_TYPE_PARTIAL_LINE;
- /* Taken from the old linux/kmemtrace.h */
- ret = trace_seq_printf(s, "type_id %d call_site %lu ptr %lu "
- "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n",
- entry->type_id, entry->call_site, (unsigned long) entry->ptr,
- (unsigned long) entry->bytes_req, (unsigned long) entry->bytes_alloc,
- (unsigned long) entry->gfp_flags, entry->node);
+ ev->event_id = KMEMTRACE_USER_ALLOC;
+ ev->type_id = entry->type_id;
+ ev->event_size = sizeof(*ev) + sizeof(*ev_alloc);
+ ev->cpu = iter->cpu;
+ ev->timestamp = iter->ts;
+ ev->call_site = entry->call_site;
+ ev->ptr = (unsigned long)entry->ptr;
- if (!ret)
+ ev_alloc = trace_seq_reserve(s, sizeof(*ev_alloc));
+ if (!ev_alloc)
return TRACE_TYPE_PARTIAL_LINE;
+ ev_alloc->bytes_req = entry->bytes_req;
+ ev_alloc->bytes_alloc = entry->bytes_alloc;
+ ev_alloc->gfp_flags = entry->gfp_flags;
+ ev_alloc->node = entry->node;
+
return TRACE_TYPE_HANDLED;
}
static enum print_line_t
-kmemtrace_print_free_original(struct trace_iterator *iter,
- struct kmemtrace_free_entry *entry)
+kmemtrace_print_free_user(struct trace_iterator *iter,
+ struct kmemtrace_free_entry *entry)
{
struct trace_seq *s = &iter->seq;
- int ret;
+ struct kmemtrace_user_event *ev;
- /* Taken from the old linux/kmemtrace.h */
- ret = trace_seq_printf(s, "type_id %d call_site %lu ptr %lu\n",
- entry->type_id, entry->call_site, (unsigned long) entry->ptr);
-
- if (!ret)
+ ev = trace_seq_reserve(s, sizeof(*ev));
+ if (!ev)
return TRACE_TYPE_PARTIAL_LINE;
+ ev->event_id = KMEMTRACE_USER_FREE;
+ ev->type_id = entry->type_id;
+ ev->event_size = sizeof(*ev);
+ ev->cpu = iter->cpu;
+ ev->timestamp = iter->ts;
+ ev->call_site = entry->call_site;
+ ev->ptr = (unsigned long)entry->ptr;
+
return TRACE_TYPE_HANDLED;
}
-
/* The two other following provide a more minimalistic output */
static enum print_line_t
kmemtrace_print_alloc_compress(struct trace_iterator *iter,
@@ -178,7 +359,7 @@ kmemtrace_print_alloc_compress(struct trace_iterator *iter,
static enum print_line_t
kmemtrace_print_free_compress(struct trace_iterator *iter,
- struct kmemtrace_free_entry *entry)
+ struct kmemtrace_free_entry *entry)
{
struct trace_seq *s = &iter->seq;
int ret;
@@ -239,20 +420,22 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
switch (entry->type) {
case TRACE_KMEM_ALLOC: {
struct kmemtrace_alloc_entry *field;
+
trace_assign_type(field, entry);
if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
return kmemtrace_print_alloc_compress(iter, field);
else
- return kmemtrace_print_alloc_original(iter, field);
+ return kmemtrace_print_alloc_user(iter, field);
}
case TRACE_KMEM_FREE: {
struct kmemtrace_free_entry *field;
+
trace_assign_type(field, entry);
if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
return kmemtrace_print_free_compress(iter, field);
else
- return kmemtrace_print_free_original(iter, field);
+ return kmemtrace_print_free_user(iter, field);
}
default:
@@ -260,70 +443,13 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
}
}
-/* Trace allocations */
-void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id,
- unsigned long call_site,
- const void *ptr,
- size_t bytes_req,
- size_t bytes_alloc,
- gfp_t gfp_flags,
- int node)
-{
- struct ring_buffer_event *event;
- struct kmemtrace_alloc_entry *entry;
- struct trace_array *tr = kmemtrace_array;
-
- if (!kmem_tracing_enabled)
- return;
-
- event = trace_buffer_lock_reserve(tr, TRACE_KMEM_ALLOC,
- sizeof(*entry), 0, 0);
- if (!event)
- return;
- entry = ring_buffer_event_data(event);
-
- entry->call_site = call_site;
- entry->ptr = ptr;
- entry->bytes_req = bytes_req;
- entry->bytes_alloc = bytes_alloc;
- entry->gfp_flags = gfp_flags;
- entry->node = node;
-
- trace_buffer_unlock_commit(tr, event, 0, 0);
-}
-EXPORT_SYMBOL(kmemtrace_mark_alloc_node);
-
-void kmemtrace_mark_free(enum kmemtrace_type_id type_id,
- unsigned long call_site,
- const void *ptr)
-{
- struct ring_buffer_event *event;
- struct kmemtrace_free_entry *entry;
- struct trace_array *tr = kmemtrace_array;
-
- if (!kmem_tracing_enabled)
- return;
-
- event = trace_buffer_lock_reserve(tr, TRACE_KMEM_FREE,
- sizeof(*entry), 0, 0);
- if (!event)
- return;
- entry = ring_buffer_event_data(event);
- entry->type_id = type_id;
- entry->call_site = call_site;
- entry->ptr = ptr;
-
- trace_buffer_unlock_commit(tr, event, 0, 0);
-}
-EXPORT_SYMBOL(kmemtrace_mark_free);
-
static struct tracer kmem_tracer __read_mostly = {
- .name = "kmemtrace",
- .init = kmem_trace_init,
- .reset = kmem_trace_reset,
- .print_line = kmemtrace_print_line,
- .print_header = kmemtrace_headers,
- .flags = &kmem_tracer_flags
+ .name = "kmemtrace",
+ .init = kmem_trace_init,
+ .reset = kmem_trace_reset,
+ .print_line = kmemtrace_print_line,
+ .print_header = kmemtrace_headers,
+ .flags = &kmem_tracer_flags
};
void kmemtrace_init(void)
@@ -335,5 +461,4 @@ static int __init init_kmem_tracer(void)
{
return register_tracer(&kmem_tracer);
}
-
device_initcall(init_kmem_tracer);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a0174a40c56..1ce5dc6372b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -30,6 +30,7 @@
#include <linux/percpu.h>
#include <linux/splice.h>
#include <linux/kdebug.h>
+#include <linux/string.h>
#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/poll.h>
@@ -147,8 +148,7 @@ static int __init set_ftrace_dump_on_oops(char *str)
}
__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
-long
-ns2usecs(cycle_t nsec)
+unsigned long long ns2usecs(cycle_t nsec)
{
nsec += 500;
do_div(nsec, 1000);
@@ -1632,7 +1632,11 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
return;
cpumask_set_cpu(iter->cpu, iter->started);
- trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu);
+
+ /* Don't print started cpu buffer for the first entry of the trace */
+ if (iter->idx > 1)
+ trace_seq_printf(s, "##### CPU %u buffer started ####\n",
+ iter->cpu);
}
static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
@@ -1867,6 +1871,11 @@ __tracing_open(struct inode *inode, struct file *file)
if (current_trace)
*iter->trace = *current_trace;
+ if (!alloc_cpumask_var(&iter->started, GFP_KERNEL))
+ goto fail;
+
+ cpumask_clear(iter->started);
+
if (current_trace && current_trace->print_max)
iter->tr = &max_tr;
else
@@ -1917,6 +1926,7 @@ __tracing_open(struct inode *inode, struct file *file)
if (iter->buffer_iter[cpu])
ring_buffer_read_finish(iter->buffer_iter[cpu]);
}
+ free_cpumask_var(iter->started);
fail:
mutex_unlock(&trace_types_lock);
kfree(iter->trace);
@@ -1960,6 +1970,7 @@ static int tracing_release(struct inode *inode, struct file *file)
seq_release(inode, file);
mutex_destroy(&iter->mutex);
+ free_cpumask_var(iter->started);
kfree(iter->trace);
kfree(iter);
return 0;
@@ -2358,9 +2369,9 @@ static const char readme_msg[] =
"# mkdir /debug\n"
"# mount -t debugfs nodev /debug\n\n"
"# cat /debug/tracing/available_tracers\n"
- "wakeup preemptirqsoff preemptoff irqsoff ftrace sched_switch none\n\n"
+ "wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n"
"# cat /debug/tracing/current_tracer\n"
- "none\n"
+ "nop\n"
"# echo sched_switch > /debug/tracing/current_tracer\n"
"# cat /debug/tracing/current_tracer\n"
"sched_switch\n"
@@ -3266,19 +3277,13 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
info->tr = &global_trace;
info->cpu = cpu;
- info->spare = ring_buffer_alloc_read_page(info->tr->buffer);
+ info->spare = NULL;
/* Force reading ring buffer for first read */
info->read = (unsigned int)-1;
- if (!info->spare)
- goto out;
filp->private_data = info;
- return 0;
-
- out:
- kfree(info);
- return -ENOMEM;
+ return nonseekable_open(inode, filp);
}
static ssize_t
@@ -3293,6 +3298,11 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
if (!count)
return 0;
+ if (!info->spare)
+ info->spare = ring_buffer_alloc_read_page(info->tr->buffer);
+ if (!info->spare)
+ return -ENOMEM;
+
/* Do we have previous read data to read? */
if (info->read < PAGE_SIZE)
goto read;
@@ -3331,7 +3341,8 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
{
struct ftrace_buffer_info *info = file->private_data;
- ring_buffer_free_read_page(info->tr->buffer, info->spare);
+ if (info->spare)
+ ring_buffer_free_read_page(info->tr->buffer, info->spare);
kfree(info);
return 0;
@@ -3417,14 +3428,19 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
int size, i;
size_t ret;
- /*
- * We can't seek on a buffer input
- */
- if (unlikely(*ppos))
- return -ESPIPE;
+ if (*ppos & (PAGE_SIZE - 1)) {
+ WARN_ONCE(1, "Ftrace: previous read must page-align\n");
+ return -EINVAL;
+ }
+ if (len & (PAGE_SIZE - 1)) {
+ WARN_ONCE(1, "Ftrace: splice_read should page-align\n");
+ if (len < PAGE_SIZE)
+ return -EINVAL;
+ len &= PAGE_MASK;
+ }
- for (i = 0; i < PIPE_BUFFERS && len; i++, len -= size) {
+ for (i = 0; i < PIPE_BUFFERS && len; i++, len -= PAGE_SIZE) {
struct page *page;
int r;
@@ -3463,6 +3479,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
spd.partial[i].offset = 0;
spd.partial[i].private = (unsigned long)ref;
spd.nr_pages++;
+ *ppos += PAGE_SIZE;
}
spd.nr_pages = i;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index cb0ce3fc36d..e685ac2b2ba 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -182,6 +182,12 @@ struct trace_power {
struct power_trace state_data;
};
+enum kmemtrace_type_id {
+ KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */
+ KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */
+ KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */
+};
+
struct kmemtrace_alloc_entry {
struct trace_entry ent;
enum kmemtrace_type_id type_id;
@@ -596,7 +602,7 @@ extern int trace_selftest_startup_branch(struct tracer *trace,
#endif /* CONFIG_FTRACE_STARTUP_TEST */
extern void *head_page(struct trace_array_cpu *data);
-extern long ns2usecs(cycle_t nsec);
+extern unsigned long long ns2usecs(cycle_t nsec);
extern int
trace_vbprintk(unsigned long ip, const char *fmt, va_list args);
extern int
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 64ec4d278ff..576f4fa2af0 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -503,6 +503,7 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (copy_from_user(&buf, ubuf, cnt))
return -EFAULT;
+ buf[cnt] = '\0';
pred = kzalloc(sizeof(*pred), GFP_KERNEL);
if (!pred)
@@ -520,9 +521,10 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
return cnt;
}
- if (filter_add_pred(call, pred)) {
+ err = filter_add_pred(call, pred);
+ if (err < 0) {
filter_free_pred(pred);
- return -EINVAL;
+ return err;
}
*ppos += cnt;
@@ -569,6 +571,7 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (copy_from_user(&buf, ubuf, cnt))
return -EFAULT;
+ buf[cnt] = '\0';
pred = kzalloc(sizeof(*pred), GFP_KERNEL);
if (!pred)
@@ -586,10 +589,11 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
return cnt;
}
- if (filter_add_subsystem_pred(system, pred)) {
+ err = filter_add_subsystem_pred(system, pred);
+ if (err < 0) {
filter_free_subsystem_preds(system);
filter_free_pred(pred);
- return -EINVAL;
+ return err;
}
*ppos += cnt;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 026be412f35..e03cbf1e38f 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -215,7 +215,7 @@ static int __filter_add_pred(struct ftrace_event_call *call,
}
}
- return -ENOMEM;
+ return -ENOSPC;
}
static int is_string_field(const char *type)
@@ -319,7 +319,7 @@ int filter_add_subsystem_pred(struct event_subsystem *system,
}
if (i == MAX_FILTER_PRED)
- return -EINVAL;
+ return -ENOSPC;
events_for_each(call) {
int err;
@@ -410,16 +410,22 @@ int filter_parse(char **pbuf, struct filter_pred *pred)
}
}
+ if (!val_str) {
+ pred->field_name = NULL;
+ return -EINVAL;
+ }
+
pred->field_name = kstrdup(pred->field_name, GFP_KERNEL);
if (!pred->field_name)
return -ENOMEM;
- pred->val = simple_strtoull(val_str, &tmp, 10);
+ pred->val = simple_strtoull(val_str, &tmp, 0);
if (tmp == val_str) {
pred->str_val = kstrdup(val_str, GFP_KERNEL);
if (!pred->str_val)
return -ENOMEM;
- }
+ } else if (*tmp != '\0')
+ return -EINVAL;
return 0;
}
diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
index 30743f7d411..d363c6672c6 100644
--- a/kernel/trace/trace_events_stage_2.h
+++ b/kernel/trace/trace_events_stage_2.h
@@ -105,10 +105,10 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \
return 0;
#undef __entry
-#define __entry "REC"
+#define __entry REC
#undef TP_printk
-#define TP_printk(fmt, args...) "%s, %s\n", #fmt, #args
+#define TP_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
#undef TP_fast_assign
#define TP_fast_assign(args...) args
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 4d9952d3df5..07a22c33ebf 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -40,7 +40,7 @@
#undef TRACE_FIELD_ZERO_CHAR
#define TRACE_FIELD_ZERO_CHAR(item) \
- ret = trace_seq_printf(s, "\tfield: char " #item ";\t" \
+ ret = trace_seq_printf(s, "\tfield:char " #item ";\t" \
"offset:%u;\tsize:0;\n", \
(unsigned int)offsetof(typeof(field), item)); \
if (!ret) \
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index d72b9a63b24..64b54a59c55 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -423,7 +423,7 @@ int trace_print_lat_context(struct trace_iterator *iter)
trace_find_cmdline(entry->pid, comm);
- ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08lx]"
+ ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08llx]"
" %ld.%03ldms (+%ld.%03ldms): ", comm,
entry->pid, iter->cpu, entry->flags,
entry->preempt_count, iter->idx,
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index de35f200abd..9117cea6f1a 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -62,6 +62,9 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)
pc = preempt_count();
tracing_record_cmdline(current);
+ if (sched_stopped)
+ return;
+
local_irq_save(flags);
cpu = raw_smp_processor_id();
data = ctx_trace->data[cpu];
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 3c5ad6b2ec8..5bc00e8f153 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -154,7 +154,7 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
if (unlikely(!tracer_enabled || next != wakeup_task))
goto out_unlock;
- trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
+ trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
/*
@@ -257,6 +257,12 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
data = wakeup_trace->data[wakeup_cpu];
data->preempt_timestamp = ftrace_now(cpu);
tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc);
+
+ /*
+ * We must be careful in using CALLER_ADDR2. But since wake_up
+ * is not called by an assembly function (whereas schedule is)
+ * it should be safe to use it here.
+ */
trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
out_locked:
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index a2a3af29c94..5e579645ac8 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -1,5 +1,5 @@
+#include <trace/syscall.h>
#include <linux/kernel.h>
-#include <linux/ftrace.h>
#include <asm/syscall.h>
#include "trace_output.h"
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b6b966ce145..f71fb2a0895 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -966,20 +966,20 @@ undo:
}
#ifdef CONFIG_SMP
-static struct workqueue_struct *work_on_cpu_wq __read_mostly;
struct work_for_cpu {
- struct work_struct work;
+ struct completion completion;
long (*fn)(void *);
void *arg;
long ret;
};
-static void do_work_for_cpu(struct work_struct *w)
+static int do_work_for_cpu(void *_wfc)
{
- struct work_for_cpu *wfc = container_of(w, struct work_for_cpu, work);
-
+ struct work_for_cpu *wfc = _wfc;
wfc->ret = wfc->fn(wfc->arg);
+ complete(&wfc->completion);
+ return 0;
}
/**
@@ -990,17 +990,23 @@ static void do_work_for_cpu(struct work_struct *w)
*
* This will return the value @fn returns.
* It is up to the caller to ensure that the cpu doesn't go offline.
+ * The caller must not hold any locks which would prevent @fn from completing.
*/
long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
{
- struct work_for_cpu wfc;
-
- INIT_WORK(&wfc.work, do_work_for_cpu);
- wfc.fn = fn;
- wfc.arg = arg;
- queue_work_on(cpu, work_on_cpu_wq, &wfc.work);
- flush_work(&wfc.work);
-
+ struct task_struct *sub_thread;
+ struct work_for_cpu wfc = {
+ .completion = COMPLETION_INITIALIZER_ONSTACK(wfc.completion),
+ .fn = fn,
+ .arg = arg,
+ };
+
+ sub_thread = kthread_create(do_work_for_cpu, &wfc, "work_for_cpu");
+ if (IS_ERR(sub_thread))
+ return PTR_ERR(sub_thread);
+ kthread_bind(sub_thread, cpu);
+ wake_up_process(sub_thread);
+ wait_for_completion(&wfc.completion);
return wfc.ret;
}
EXPORT_SYMBOL_GPL(work_on_cpu);
@@ -1016,8 +1022,4 @@ void __init init_workqueues(void)
hotcpu_notifier(workqueue_cpu_callback, 0);
keventd_wq = create_workqueue("events");
BUG_ON(!keventd_wq);
-#ifdef CONFIG_SMP
- work_on_cpu_wq = create_workqueue("work_on_cpu");
- BUG_ON(!work_on_cpu_wq);
-#endif
}