aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/fork.c2
-rw-r--r--kernel/futex.c2
-rw-r--r--kernel/hrtimer.c824
-rw-r--r--kernel/irq/chip.c25
-rw-r--r--kernel/irq/manage.c46
-rw-r--r--kernel/irq/proc.c24
-rw-r--r--kernel/itimer.c18
-rw-r--r--kernel/kmod.c44
-rw-r--r--kernel/lockdep_proc.c1
-rw-r--r--kernel/mutex-debug.c1
-rw-r--r--kernel/posix-cpu-timers.c15
-rw-r--r--kernel/posix-timers.c15
-rw-r--r--kernel/resource.c1
-rw-r--r--kernel/rtmutex.c2
-rw-r--r--kernel/sched.c7
-rw-r--r--kernel/signal.c58
-rw-r--r--kernel/softirq.c19
-rw-r--r--kernel/sysctl.c591
-rw-r--r--kernel/time.c254
-rw-r--r--kernel/time/Kconfig25
-rw-r--r--kernel/time/Makefile9
-rw-r--r--kernel/time/clockevents.c345
-rw-r--r--kernel/time/clocksource.c246
-rw-r--r--kernel/time/jiffies.c1
-rw-r--r--kernel/time/ntp.c30
-rw-r--r--kernel/time/tick-broadcast.c480
-rw-r--r--kernel/time/tick-common.c346
-rw-r--r--kernel/time/tick-internal.h110
-rw-r--r--kernel/time/tick-oneshot.c84
-rw-r--r--kernel/time/tick-sched.c563
-rw-r--r--kernel/time/timer_list.c287
-rw-r--r--kernel/time/timer_stats.c411
-rw-r--r--kernel/timer.c290
-rw-r--r--kernel/tsacct.c2
-rw-r--r--kernel/utsname_sysctl.c146
-rw-r--r--kernel/workqueue.c7
37 files changed, 4401 insertions, 931 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 14f4d45e0ae..ac6b27abb1a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -47,6 +47,7 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_RELAY) += relay.o
+obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
obj-$(CONFIG_UTS_NS) += utsname.o
obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
diff --git a/kernel/fork.c b/kernel/fork.c
index 0b6293d94d9..d154cc78648 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -858,7 +858,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
init_sigpending(&sig->shared_pending);
INIT_LIST_HEAD(&sig->posix_timers);
- hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_REL);
+ hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
sig->it_real_incr.tv64 = 0;
sig->real_timer.function = it_real_fn;
sig->tsk = tsk;
diff --git a/kernel/futex.c b/kernel/futex.c
index 5a737de857d..e749e7df14b 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1134,7 +1134,7 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
if (sec != MAX_SCHEDULE_TIMEOUT) {
to = &timeout;
- hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
+ hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
hrtimer_init_sleeper(to, current);
to->timer.expires = ktime_set(sec, nsec);
}
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index f44e499e8fc..476cb0c0b4a 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1,8 +1,9 @@
/*
* linux/kernel/hrtimer.c
*
- * Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de>
- * Copyright(C) 2005, Red Hat, Inc., Ingo Molnar
+ * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
+ * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
+ * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner
*
* High-resolution kernel timers
*
@@ -31,12 +32,17 @@
*/
#include <linux/cpu.h>
+#include <linux/irq.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/hrtimer.h>
#include <linux/notifier.h>
#include <linux/syscalls.h>
+#include <linux/kallsyms.h>
#include <linux/interrupt.h>
+#include <linux/tick.h>
+#include <linux/seq_file.h>
+#include <linux/err.h>
#include <asm/uaccess.h>
@@ -45,7 +51,7 @@
*
* returns the time in ktime_t format
*/
-static ktime_t ktime_get(void)
+ktime_t ktime_get(void)
{
struct timespec now;
@@ -59,7 +65,7 @@ static ktime_t ktime_get(void)
*
* returns the time in ktime_t format
*/
-static ktime_t ktime_get_real(void)
+ktime_t ktime_get_real(void)
{
struct timespec now;
@@ -79,21 +85,22 @@ EXPORT_SYMBOL_GPL(ktime_get_real);
* This ensures that we capture erroneous accesses to these clock ids
* rather than moving them into the range of valid clock id's.
*/
-
-#define MAX_HRTIMER_BASES 2
-
-static DEFINE_PER_CPU(struct hrtimer_base, hrtimer_bases[MAX_HRTIMER_BASES]) =
+DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
{
+
+ .clock_base =
{
- .index = CLOCK_REALTIME,
- .get_time = &ktime_get_real,
- .resolution = KTIME_REALTIME_RES,
- },
- {
- .index = CLOCK_MONOTONIC,
- .get_time = &ktime_get,
- .resolution = KTIME_MONOTONIC_RES,
- },
+ {
+ .index = CLOCK_REALTIME,
+ .get_time = &ktime_get_real,
+ .resolution = KTIME_LOW_RES,
+ },
+ {
+ .index = CLOCK_MONOTONIC,
+ .get_time = &ktime_get,
+ .resolution = KTIME_LOW_RES,
+ },
+ }
};
/**
@@ -125,20 +132,35 @@ EXPORT_SYMBOL_GPL(ktime_get_ts);
* Get the coarse grained time at the softirq based on xtime and
* wall_to_monotonic.
*/
-static void hrtimer_get_softirq_time(struct hrtimer_base *base)
+static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
{
ktime_t xtim, tomono;
+ struct timespec xts;
unsigned long seq;
do {
seq = read_seqbegin(&xtime_lock);
- xtim = timespec_to_ktime(xtime);
- tomono = timespec_to_ktime(wall_to_monotonic);
-
+#ifdef CONFIG_NO_HZ
+ getnstimeofday(&xts);
+#else
+ xts = xtime;
+#endif
} while (read_seqretry(&xtime_lock, seq));
- base[CLOCK_REALTIME].softirq_time = xtim;
- base[CLOCK_MONOTONIC].softirq_time = ktime_add(xtim, tomono);
+ xtim = timespec_to_ktime(xts);
+ tomono = timespec_to_ktime(wall_to_monotonic);
+ base->clock_base[CLOCK_REALTIME].softirq_time = xtim;
+ base->clock_base[CLOCK_MONOTONIC].softirq_time =
+ ktime_add(xtim, tomono);
+}
+
+/*
+ * Helper function to check, whether the timer is running the callback
+ * function
+ */
+static inline int hrtimer_callback_running(struct hrtimer *timer)
+{
+ return timer->state & HRTIMER_STATE_CALLBACK;
}
/*
@@ -147,8 +169,6 @@ static void hrtimer_get_softirq_time(struct hrtimer_base *base)
*/
#ifdef CONFIG_SMP
-#define set_curr_timer(b, t) do { (b)->curr_timer = (t); } while (0)
-
/*
* We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
* means that all timers which are tied to this base via timer->base are
@@ -161,19 +181,20 @@ static void hrtimer_get_softirq_time(struct hrtimer_base *base)
* possible to set timer->base = NULL and drop the lock: the timer remains
* locked.
*/
-static struct hrtimer_base *lock_hrtimer_base(const struct hrtimer *timer,
- unsigned long *flags)
+static
+struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
+ unsigned long *flags)
{
- struct hrtimer_base *base;
+ struct hrtimer_clock_base *base;
for (;;) {
base = timer->base;
if (likely(base != NULL)) {
- spin_lock_irqsave(&base->lock, *flags);
+ spin_lock_irqsave(&base->cpu_base->lock, *flags);
if (likely(base == timer->base))
return base;
/* The timer has migrated to another CPU: */
- spin_unlock_irqrestore(&base->lock, *flags);
+ spin_unlock_irqrestore(&base->cpu_base->lock, *flags);
}
cpu_relax();
}
@@ -182,12 +203,14 @@ static struct hrtimer_base *lock_hrtimer_base(const struct hrtimer *timer,
/*
* Switch the timer base to the current CPU when possible.
*/
-static inline struct hrtimer_base *
-switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base)
+static inline struct hrtimer_clock_base *
+switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base)
{
- struct hrtimer_base *new_base;
+ struct hrtimer_clock_base *new_base;
+ struct hrtimer_cpu_base *new_cpu_base;
- new_base = &__get_cpu_var(hrtimer_bases)[base->index];
+ new_cpu_base = &__get_cpu_var(hrtimer_bases);
+ new_base = &new_cpu_base->clock_base[base->index];
if (base != new_base) {
/*
@@ -199,13 +222,13 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base)
* completed. There is no conflict as we hold the lock until
* the timer is enqueued.
*/
- if (unlikely(base->curr_timer == timer))
+ if (unlikely(hrtimer_callback_running(timer)))
return base;
/* See the comment in lock_timer_base() */
timer->base = NULL;
- spin_unlock(&base->lock);
- spin_lock(&new_base->lock);
+ spin_unlock(&base->cpu_base->lock);
+ spin_lock(&new_base->cpu_base->lock);
timer->base = new_base;
}
return new_base;
@@ -213,19 +236,17 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base)
#else /* CONFIG_SMP */
-#define set_curr_timer(b, t) do { } while (0)
-
-static inline struct hrtimer_base *
+static inline struct hrtimer_clock_base *
lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
{
- struct hrtimer_base *base = timer->base;
+ struct hrtimer_clock_base *base = timer->base;
- spin_lock_irqsave(&base->lock, *flags);
+ spin_lock_irqsave(&base->cpu_base->lock, *flags);
return base;
}
-#define switch_hrtimer_base(t, b) (b)
+# define switch_hrtimer_base(t, b) (b)
#endif /* !CONFIG_SMP */
@@ -256,15 +277,12 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec)
return ktime_add(kt, tmp);
}
-
-#else /* CONFIG_KTIME_SCALAR */
-
# endif /* !CONFIG_KTIME_SCALAR */
/*
* Divide a ktime value by a nanosecond value
*/
-static unsigned long ktime_divns(const ktime_t kt, s64 div)
+unsigned long ktime_divns(const ktime_t kt, s64 div)
{
u64 dclc, inc, dns;
int sft = 0;
@@ -281,18 +299,311 @@ static unsigned long ktime_divns(const ktime_t kt, s64 div)
return (unsigned long) dclc;
}
-
-#else /* BITS_PER_LONG < 64 */
-# define ktime_divns(kt, div) (unsigned long)((kt).tv64 / (div))
#endif /* BITS_PER_LONG >= 64 */
+/* High resolution timer related functions */
+#ifdef CONFIG_HIGH_RES_TIMERS
+
+/*
+ * High resolution timer enabled ?
+ */
+static int hrtimer_hres_enabled __read_mostly = 1;
+
+/*
+ * Enable / Disable high resolution mode
+ */
+static int __init setup_hrtimer_hres(char *str)
+{
+ if (!strcmp(str, "off"))
+ hrtimer_hres_enabled = 0;
+ else if (!strcmp(str, "on"))
+ hrtimer_hres_enabled = 1;
+ else
+ return 0;
+ return 1;
+}
+
+__setup("highres=", setup_hrtimer_hres);
+
+/*
+ * hrtimer_high_res_enabled - query, if the highres mode is enabled
+ */
+static inline int hrtimer_is_hres_enabled(void)
+{
+ return hrtimer_hres_enabled;
+}
+
+/*
+ * Is the high resolution mode active ?
+ */
+static inline int hrtimer_hres_active(void)
+{
+ return __get_cpu_var(hrtimer_bases).hres_active;
+}
+
+/*
+ * Reprogram the event source with checking both queues for the
+ * next event
+ * Called with interrupts disabled and base->lock held
+ */
+static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base)
+{
+ int i;
+ struct hrtimer_clock_base *base = cpu_base->clock_base;
+ ktime_t expires;
+
+ cpu_base->expires_next.tv64 = KTIME_MAX;
+
+ for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
+ struct hrtimer *timer;
+
+ if (!base->first)
+ continue;
+ timer = rb_entry(base->first, struct hrtimer, node);
+ expires = ktime_sub(timer->expires, base->offset);
+ if (expires.tv64 < cpu_base->expires_next.tv64)
+ cpu_base->expires_next = expires;
+ }
+
+ if (cpu_base->expires_next.tv64 != KTIME_MAX)
+ tick_program_event(cpu_base->expires_next, 1);
+}
+
+/*
+ * Shared reprogramming for clock_realtime and clock_monotonic
+ *
+ * When a timer is enqueued and expires earlier than the already enqueued
+ * timers, we have to check, whether it expires earlier than the timer for
+ * which the clock event device was armed.
+ *
+ * Called with interrupts disabled and base->cpu_base.lock held
+ */
+static int hrtimer_reprogram(struct hrtimer *timer,
+ struct hrtimer_clock_base *base)
+{
+ ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next;
+ ktime_t expires = ktime_sub(timer->expires, base->offset);
+ int res;
+
+ /*
+ * When the callback is running, we do not reprogram the clock event
+ * device. The timer callback is either running on a different CPU or
+ * the callback is executed in the hrtimer_interupt context. The
+ * reprogramming is handled either by the softirq, which called the
+ * callback or at the end of the hrtimer_interrupt.
+ */
+ if (hrtimer_callback_running(timer))
+ return 0;
+
+ if (expires.tv64 >= expires_next->tv64)
+ return 0;
+
+ /*
+ * Clockevents returns -ETIME, when the event was in the past.
+ */
+ res = tick_program_event(expires, 0);
+ if (!IS_ERR_VALUE(res))
+ *expires_next = expires;
+ return res;
+}
+
+
+/*
+ * Retrigger next event is called after clock was set
+ *
+ * Called with interrupts disabled via on_each_cpu()
+ */
+static void retrigger_next_event(void *arg)
+{
+ struct hrtimer_cpu_base *base;
+ struct timespec realtime_offset;
+ unsigned long seq;
+
+ if (!hrtimer_hres_active())
+ return;
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+ set_normalized_timespec(&realtime_offset,
+ -wall_to_monotonic.tv_sec,
+ -wall_to_monotonic.tv_nsec);
+ } while (read_seqretry(&xtime_lock, seq));
+
+ base = &__get_cpu_var(hrtimer_bases);
+
+ /* Adjust CLOCK_REALTIME offset */
+ spin_lock(&base->lock);
+ base->clock_base[CLOCK_REALTIME].offset =
+ timespec_to_ktime(realtime_offset);
+
+ hrtimer_force_reprogram(base);
+ spin_unlock(&base->lock);
+}
+
+/*
+ * Clock realtime was set
+ *
+ * Change the offset of the realtime clock vs. the monotonic
+ * clock.
+ *
+ * We might have to reprogram the high resolution timer interrupt. On
+ * SMP we call the architecture specific code to retrigger _all_ high
+ * resolution timer interrupts. On UP we just disable interrupts and
+ * call the high resolution interrupt code.
+ */
+void clock_was_set(void)
+{
+ /* Retrigger the CPU local events everywhere */
+ on_each_cpu(retrigger_next_event, NULL, 0, 1);
+}
+
+/*
+ * Check, whether the timer is on the callback pending list
+ */
+static inline int hrtimer_cb_pending(const struct hrtimer *timer)
+{
+ return timer->state & HRTIMER_STATE_PENDING;
+}
+
+/*
+ * Remove a timer from the callback pending list
+ */
+static inline void hrtimer_remove_cb_pending(struct hrtimer *timer)
+{
+ list_del_init(&timer->cb_entry);
+}
+
+/*
+ * Initialize the high resolution related parts of cpu_base
+ */
+static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
+{
+ base->expires_next.tv64 = KTIME_MAX;
+ base->hres_active = 0;
+ INIT_LIST_HEAD(&base->cb_pending);
+}
+
+/*
+ * Initialize the high resolution related parts of a hrtimer
+ */
+static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
+{
+ INIT_LIST_HEAD(&timer->cb_entry);
+}
+
+/*
+ * When High resolution timers are active, try to reprogram. Note, that in case
+ * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry
+ * check happens. The timer gets enqueued into the rbtree. The reprogramming
+ * and expiry check is done in the hrtimer_interrupt or in the softirq.
+ */
+static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
+ struct hrtimer_clock_base *base)
+{
+ if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
+
+ /* Timer is expired, act upon the callback mode */
+ switch(timer->cb_mode) {
+ case HRTIMER_CB_IRQSAFE_NO_RESTART:
+ /*
+ * We can call the callback from here. No restart
+ * happens, so no danger of recursion
+ */
+ BUG_ON(timer->function(timer) != HRTIMER_NORESTART);
+ return 1;
+ case HRTIMER_CB_IRQSAFE_NO_SOFTIRQ:
+ /*
+ * This is solely for the sched tick emulation with
+ * dynamic tick support to ensure that we do not
+ * restart the tick right on the edge and end up with
+ * the tick timer in the softirq ! The calling site
+ * takes care of this.
+ */
+ return 1;
+ case HRTIMER_CB_IRQSAFE:
+ case HRTIMER_CB_SOFTIRQ:
+ /*
+ * Move everything else into the softirq pending list !
+ */
+ list_add_tail(&timer->cb_entry,
+ &base->cpu_base->cb_pending);
+ timer->state = HRTIMER_STATE_PENDING;
+ raise_softirq(HRTIMER_SOFTIRQ);
+ return 1;
+ default:
+ BUG();
+ }
+ }
+ return 0;
+}
+
+/*
+ * Switch to high resolution mode
+ */
+static void hrtimer_switch_to_hres(void)
+{
+ struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
+ unsigned long flags;
+
+ if (base->hres_active)
+ return;
+
+ local_irq_save(flags);
+
+ if (tick_init_highres()) {
+ local_irq_restore(flags);
+ return;
+ }
+ base->hres_active = 1;
+ base->clock_base[CLOCK_REALTIME].resolution = KTIME_HIGH_RES;
+ base->clock_base[CLOCK_MONOTONIC].resolution = KTIME_HIGH_RES;
+
+ tick_setup_sched_timer();
+
+ /* "Retrigger" the interrupt to get things going */
+ retrigger_next_event(NULL);
+ local_irq_restore(flags);
+ printk(KERN_INFO "Switched to high resolution mode on CPU %d\n",
+ smp_processor_id());
+}
+
+#else
+
+static inline int hrtimer_hres_active(void) { return 0; }
+static inline int hrtimer_is_hres_enabled(void) { return 0; }
+static inline void hrtimer_switch_to_hres(void) { }
+static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { }
+static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
+ struct hrtimer_clock_base *base)
+{
+ return 0;
+}
+static inline int hrtimer_cb_pending(struct hrtimer *timer) { return 0; }
+static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) { }
+static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
+static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
+
+#endif /* CONFIG_HIGH_RES_TIMERS */
+
+#ifdef CONFIG_TIMER_STATS
+void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr)
+{
+ if (timer->start_site)
+ return;
+
+ timer->start_site = addr;
+ memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
+ timer->start_pid = current->pid;
+}
+#endif
+
/*
* Counterpart to lock_timer_base above:
*/
static inline
void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
{
- spin_unlock_irqrestore(&timer->base->lock, *flags);
+ spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags);
}
/**
@@ -342,7 +653,8 @@ hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
* The timer is inserted in expiry order. Insertion into the
* red black tree is O(log(n)). Must hold the base lock.
*/
-static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
+static void enqueue_hrtimer(struct hrtimer *timer,
+ struct hrtimer_clock_base *base, int reprogram)
{
struct rb_node **link = &base->active.rb_node;
struct rb_node *parent = NULL;
@@ -368,39 +680,85 @@ static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
* Insert the timer to the rbtree and check whether it
* replaces the first pending timer
*/
- rb_link_node(&timer->node, parent, link);
- rb_insert_color(&timer->node, &base->active);
-
if (!base->first || timer->expires.tv64 <
- rb_entry(base->first, struct hrtimer, node)->expires.tv64)
+ rb_entry(base->first, struct hrtimer, node)->expires.tv64) {
+ /*
+ * Reprogram the clock event device. When the timer is already
+ * expired hrtimer_enqueue_reprogram has either called the
+ * callback or added it to the pending list and raised the
+ * softirq.
+ *
+ * This is a NOP for !HIGHRES
+ */
+ if (reprogram && hrtimer_enqueue_reprogram(timer, base))
+ return;
+
base->first = &timer->node;
+ }
+
+ rb_link_node(&timer->node, parent, link);
+ rb_insert_color(&timer->node, &base->active);
+ /*
+ * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
+ * state of a possibly running callback.
+ */
+ timer->state |= HRTIMER_STATE_ENQUEUED;
}
/*
* __remove_hrtimer - internal function to remove a timer
*
* Caller must hold the base lock.
+ *
+ * High resolution timer mode reprograms the clock event device when the
+ * timer is the one which expires next. The caller can disable this by setting
+ * reprogram to zero. This is useful, when the context does a reprogramming
+ * anyway (e.g. timer interrupt)
*/
-static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
+static void __remove_hrtimer(struct hrtimer *timer,
+ struct hrtimer_clock_base *base,
+ unsigned long newstate, int reprogram)
{
- /*
- * Remove the timer from the rbtree and replace the
- * first entry pointer if necessary.
- */
- if (base->first == &timer->node)
- base->first = rb_next(&timer->node);
- rb_erase(&timer->node, &base->active);
- rb_set_parent(&timer->node, &timer->node);
+ /* High res. callback list. NOP for !HIGHRES */
+ if (hrtimer_cb_pending(timer))
+ hrtimer_remove_cb_pending(timer);
+ else {
+ /*
+ * Remove the timer from the rbtree and replace the
+ * first entry pointer if necessary.
+ */
+ if (base->first == &timer->node) {
+ base->first = rb_next(&timer->node);
+ /* Reprogram the clock event device. if enabled */
+ if (reprogram && hrtimer_hres_active())
+ hrtimer_force_reprogram(base->cpu_base);
+ }
+ rb_erase(&timer->node, &base->active);
+ }
+ timer->state = newstate;
}
/*
* remove hrtimer, called with base lock held
*/
static inline int
-remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
+remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
{
- if (hrtimer_active(timer)) {
- __remove_hrtimer(timer, base);
+ if (hrtimer_is_queued(timer)) {
+ int reprogram;
+
+ /*
+ * Remove the timer and force reprogramming when high
+ * resolution mode is active and the timer is on the current
+ * CPU. If we remove a timer on another CPU, reprogramming is
+ * skipped. The interrupt event on this CPU is fired and
+ * reprogramming happens in the interrupt handler. This is a
+ * rare case and less expensive than a smp call.
+ */
+ timer_stats_hrtimer_clear_start_info(timer);
+ reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
+ __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE,
+ reprogram);
return 1;
}
return 0;
@@ -419,7 +777,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
int
hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
{
- struct hrtimer_base *base, *new_base;
+ struct hrtimer_clock_base *base, *new_base;
unsigned long flags;
int ret;
@@ -431,7 +789,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
/* Switch the timer base, if necessary: */
new_base = switch_hrtimer_base(timer, base);
- if (mode == HRTIMER_REL) {
+ if (mode == HRTIMER_MODE_REL) {
tim = ktime_add(tim, new_base->get_time());
/*
* CONFIG_TIME_LOW_RES is a temporary way for architectures
@@ -446,7 +804,9 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
}
timer->expires = tim;
- enqueue_hrtimer(timer, new_base);
+ timer_stats_hrtimer_set_start_info(timer);
+
+ enqueue_hrtimer(timer, new_base, base == new_base);
unlock_hrtimer_base(timer, &flags);
@@ -466,13 +826,13 @@ EXPORT_SYMBOL_GPL(hrtimer_start);
*/
int hrtimer_try_to_cancel(struct hrtimer *timer)
{
- struct hrtimer_base *base;
+ struct hrtimer_clock_base *base;
unsigned long flags;
int ret = -1;
base = lock_hrtimer_base(timer, &flags);
- if (base->curr_timer != timer)
+ if (!hrtimer_callback_running(timer))
ret = remove_hrtimer(timer, base);
unlock_hrtimer_base(timer, &flags);
@@ -508,19 +868,19 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel);
*/
ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
{
- struct hrtimer_base *base;
+ struct hrtimer_clock_base *base;
unsigned long flags;
ktime_t rem;
base = lock_hrtimer_base(timer, &flags);
- rem = ktime_sub(timer->expires, timer->base->get_time());
+ rem = ktime_sub(timer->expires, base->get_time());
unlock_hrtimer_base(timer, &flags);
return rem;
}
EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
-#ifdef CONFIG_NO_IDLE_HZ
+#if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_NO_HZ)
/**
* hrtimer_get_next_event - get the time until next expiry event
*
@@ -529,26 +889,31 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
*/
ktime_t hrtimer_get_next_event(void)
{
- struct hrtimer_base *base = __get_cpu_var(hrtimer_bases);
+ struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
+ struct hrtimer_clock_base *base = cpu_base->clock_base;
ktime_t delta, mindelta = { .tv64 = KTIME_MAX };
unsigned long flags;
int i;
- for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) {
- struct hrtimer *timer;
+ spin_lock_irqsave(&cpu_base->lock, flags);
- spin_lock_irqsave(&base->lock, flags);
- if (!base->first) {
- spin_unlock_irqrestore(&base->lock, flags);
- continue;
+ if (!hrtimer_hres_active()) {
+ for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
+ struct hrtimer *timer;
+
+ if (!base->first)
+ continue;
+
+ timer = rb_entry(base->first, struct hrtimer, node);
+ delta.tv64 = timer->expires.tv64;
+ delta = ktime_sub(delta, base->get_time());
+ if (delta.tv64 < mindelta.tv64)
+ mindelta.tv64 = delta.tv64;
}
- timer = rb_entry(base->first, struct hrtimer, node);
- delta.tv64 = timer->expires.tv64;
- spin_unlock_irqrestore(&base->lock, flags);
- delta = ktime_sub(delta, base->get_time());
- if (delta.tv64 < mindelta.tv64)
- mindelta.tv64 = delta.tv64;
}
+
+ spin_unlock_irqrestore(&cpu_base->lock, flags);
+
if (mindelta.tv64 < 0)
mindelta.tv64 = 0;
return mindelta;
@@ -564,17 +929,23 @@ ktime_t hrtimer_get_next_event(void)
void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
enum hrtimer_mode mode)
{
- struct hrtimer_base *bases;
+ struct hrtimer_cpu_base *cpu_base;
memset(timer, 0, sizeof(struct hrtimer));
- bases = __raw_get_cpu_var(hrtimer_bases);
+ cpu_base = &__raw_get_cpu_var(hrtimer_bases);
- if (clock_id == CLOCK_REALTIME && mode != HRTIMER_ABS)
+ if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS)
clock_id = CLOCK_MONOTONIC;
- timer->base = &bases[clock_id];
- rb_set_parent(&timer->node, &timer->node);
+ timer->base = &cpu_base->clock_base[clock_id];
+ hrtimer_init_timer_hres(timer);
+
+#ifdef CONFIG_TIMER_STATS
+ timer->start_site = NULL;
+ timer->start_pid = -1;
+ memset(timer->start_comm, 0, TASK_COMM_LEN);
+#endif
}
EXPORT_SYMBOL_GPL(hrtimer_init);
@@ -588,21 +959,159 @@ EXPORT_SYMBOL_GPL(hrtimer_init);
*/
int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
{
- struct hrtimer_base *bases;
+ struct hrtimer_cpu_base *cpu_base;
- bases = __raw_get_cpu_var(hrtimer_bases);
- *tp = ktime_to_timespec(bases[which_clock].resolution);
+ cpu_base = &__raw_get_cpu_var(hrtimer_bases);
+ *tp = ktime_to_timespec(cpu_base->clock_base[which_clock].resolution);
return 0;
}
EXPORT_SYMBOL_GPL(hrtimer_get_res);
+#ifdef CONFIG_HIGH_RES_TIMERS
+
+/*
+ * High resolution timer interrupt
+ * Called with interrupts disabled
+ */
+void hrtimer_interrupt(struct clock_event_device *dev)
+{
+ struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
+ struct hrtimer_clock_base *base;
+ ktime_t expires_next, now;
+ int i, raise = 0;
+
+ BUG_ON(!cpu_base->hres_active);
+ cpu_base->nr_events++;
+ dev->next_event.tv64 = KTIME_MAX;
+
+ retry:
+ now = ktime_get();
+
+ expires_next.tv64 = KTIME_MAX;
+
+ base = cpu_base->clock_base;
+
+ for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
+ ktime_t basenow;
+ struct rb_node *node;
+
+ spin_lock(&cpu_base->lock);
+
+ basenow = ktime_add(now, base->offset);
+
+ while ((node = base->first)) {
+ struct hrtimer *timer;
+
+ timer = rb_entry(node, struct hrtimer, node);
+
+ if (basenow.tv64 < timer->expires.tv64) {
+ ktime_t expires;
+
+ expires = ktime_sub(timer->expires,
+ base->offset);
+ if (expires.tv64 < expires_next.tv64)
+ expires_next = expires;
+ break;
+ }
+
+ /* Move softirq callbacks to the pending list */
+ if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
+ __remove_hrtimer(timer, base,
+ HRTIMER_STATE_PENDING, 0);
+ list_add_tail(&timer->cb_entry,
+ &base->cpu_base->cb_pending);
+ raise = 1;
+ continue;
+ }
+
+ __remove_hrtimer(timer, base,
+ HRTIMER_STATE_CALLBACK, 0);
+ timer_stats_account_hrtimer(timer);
+
+ /*
+ * Note: We clear the CALLBACK bit after
+ * enqueue_hrtimer to avoid reprogramming of
+ * the event hardware. This happens at the end
+ * of this function anyway.
+ */
+ if (timer->function(timer) != HRTIMER_NORESTART) {
+ BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
+ enqueue_hrtimer(timer, base, 0);
+ }
+ timer->state &= ~HRTIMER_STATE_CALLBACK;
+ }
+ spin_unlock(&cpu_base->lock);
+ base++;
+ }
+
+ cpu_base->expires_next = expires_next;
+
+ /* Reprogramming necessary ? */
+ if (expires_next.tv64 != KTIME_MAX) {
+ if (tick_program_event(expires_next, 0))
+ goto retry;
+ }
+
+ /* Raise softirq ? */
+ if (raise)
+ raise_softirq(HRTIMER_SOFTIRQ);
+}
+
+static void run_hrtimer_softirq(struct softirq_action *h)
+{
+ struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
+
+ spin_lock_irq(&cpu_base->lock);
+
+ while (!list_empty(&cpu_base->cb_pending)) {
+ enum hrtimer_restart (*fn)(struct hrtimer *);
+ struct hrtimer *timer;
+ int restart;
+
+ timer = list_entry(cpu_base->cb_pending.next,
+ struct hrtimer, cb_entry);
+
+ timer_stats_account_hrtimer(timer);
+
+ fn = timer->function;
+ __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
+ spin_unlock_irq(&cpu_base->lock);
+
+ restart = fn(timer);
+
+ spin_lock_irq(&cpu_base->lock);
+
+ timer->state &= ~HRTIMER_STATE_CALLBACK;
+ if (restart == HRTIMER_RESTART) {
+ BUG_ON(hrtimer_active(timer));
+ /*
+ * Enqueue the timer, allow reprogramming of the event
+ * device
+ */
+ enqueue_hrtimer(timer, timer->base, 1);
+ } else if (hrtimer_active(timer)) {
+ /*
+ * If the timer was rearmed on another CPU, reprogram
+ * the event device.
+ */
+ if (timer->base->first == &timer->node)
+ hrtimer_reprogram(timer, timer->base);
+ }
+ }
+ spin_unlock_irq(&cpu_base->lock);
+}
+
+#endif /* CONFIG_HIGH_RES_TIMERS */
+
/*
* Expire the per base hrtimer-queue:
*/
-static inline void run_hrtimer_queue(struct hrtimer_base *base)
+static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base,
+ int index)
{
struct rb_node *node;
+ struct hrtimer_clock_base *base = &cpu_base->clock_base[index];
if (!base->first)
return;
@@ -610,53 +1119,72 @@ static inline void run_hrtimer_queue(struct hrtimer_base *base)
if (base->get_softirq_time)
base->softirq_time = base->get_softirq_time();
- spin_lock_irq(&base->lock);
+ spin_lock_irq(&cpu_base->lock);
while ((node = base->first)) {
struct hrtimer *timer;
- int (*fn)(struct hrtimer *);
+ enum hrtimer_restart (*fn)(struct hrtimer *);
int restart;
timer = rb_entry(node, struct hrtimer, node);
if (base->softirq_time.tv64 <= timer->expires.tv64)
break;
+ timer_stats_account_hrtimer(timer);
+
fn = timer->function;
- set_curr_timer(base, timer);
- __remove_hrtimer(timer, base);
- spin_unlock_irq(&base->lock);
+ __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
+ spin_unlock_irq(&cpu_base->lock);
restart = fn(timer);
- spin_lock_irq(&base->lock);
+ spin_lock_irq(&cpu_base->lock);
+ timer->state &= ~HRTIMER_STATE_CALLBACK;
if (restart != HRTIMER_NORESTART) {
BUG_ON(hrtimer_active(timer));
- enqueue_hrtimer(timer, base);
+ enqueue_hrtimer(timer, base, 0);
}
}
- set_curr_timer(base, NULL);
- spin_unlock_irq(&base->lock);
+ spin_unlock_irq(&cpu_base->lock);
}
/*
* Called from timer softirq every jiffy, expire hrtimers:
+ *
+ * For HRT its the fall back code to run the softirq in the timer
+ * softirq context in case the hrtimer initialization failed or has
+ * not been done yet.
*/
void hrtimer_run_queues(void)
{
- struct hrtimer_base *base = __get_cpu_var(hrtimer_bases);
+ struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
int i;
- hrtimer_get_softirq_time(base);
+ if (hrtimer_hres_active())
+ return;
+
+ /*
+ * This _is_ ugly: We have to check in the softirq context,
+ * whether we can switch to highres and / or nohz mode. The
+ * clocksource switch happens in the timer interrupt with
+ * xtime_lock held. Notification from there only sets the
+ * check bit in the tick_oneshot code, otherwise we might
+ * deadlock vs. xtime_lock.
+ */
+ if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
+ hrtimer_switch_to_hres();
- for (i = 0; i < MAX_HRTIMER_BASES; i++)
- run_hrtimer_queue(&base[i]);
+ hrtimer_get_softirq_time(cpu_base);
+
+ for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
+ run_hrtimer_queue(cpu_base, i);
}
/*
* Sleep related functions:
*/
-static int hrtimer_wakeup(struct hrtimer *timer)
+static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
{
struct hrtimer_sleeper *t =
container_of(timer, struct hrtimer_sleeper, timer);
@@ -673,6 +1201,9 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
{
sl->timer.function = hrtimer_wakeup;
sl->task = task;
+#ifdef CONFIG_HIGH_RES_TIMERS
+ sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_RESTART;
+#endif
}
static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
@@ -683,10 +1214,11 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
set_current_state(TASK_INTERRUPTIBLE);
hrtimer_start(&t->timer, t->timer.expires, mode);
- schedule();
+ if (likely(t->task))
+ schedule();
hrtimer_cancel(&t->timer);
- mode = HRTIMER_ABS;
+ mode = HRTIMER_MODE_ABS;
} while (t->task && !signal_pending(current));
@@ -702,10 +1234,10 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
restart->fn = do_no_restart_syscall;
- hrtimer_init(&t.timer, restart->arg0, HRTIMER_ABS);
+ hrtimer_init(&t.timer, restart->arg0, HRTIMER_MODE_ABS);
t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2;
- if (do_nanosleep(&t, HRTIMER_ABS))
+ if (do_nanosleep(&t, HRTIMER_MODE_ABS))
return 0;
rmtp = (struct timespec __user *) restart->arg1;
@@ -738,7 +1270,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
return 0;
/* Absolute timers do not update the rmtp value and restart: */
- if (mode == HRTIMER_ABS)
+ if (mode == HRTIMER_MODE_ABS)
return -ERESTARTNOHAND;
if (rmtp) {
@@ -771,7 +1303,7 @@ sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp)
if (!timespec_valid(&tu))
return -EINVAL;
- return hrtimer_nanosleep(&tu, rmtp, HRTIMER_REL, CLOCK_MONOTONIC);
+ return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
}
/*
@@ -779,56 +1311,60 @@ sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp)
*/
static void __devinit init_hrtimers_cpu(int cpu)
{
- struct hrtimer_base *base = per_cpu(hrtimer_bases, cpu);
+ struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
int i;
- for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) {
- spin_lock_init(&base->lock);
- lockdep_set_class(&base->lock, &base->lock_key);
- }
+ spin_lock_init(&cpu_base->lock);
+ lockdep_set_class(&cpu_base->lock, &cpu_base->lock_key);
+
+ for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
+ cpu_base->clock_base[i].cpu_base = cpu_base;
+
+ hrtimer_init_hres(cpu_base);
}
#ifdef CONFIG_HOTPLUG_CPU
-static void migrate_hrtimer_list(struct hrtimer_base *old_base,
- struct hrtimer_base *new_base)
+static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
+ struct hrtimer_clock_base *new_base)
{
struct hrtimer *timer;
struct rb_node *node;
while ((node = rb_first(&old_base->active))) {
timer = rb_entry(node, struct hrtimer, node);
- __remove_hrtimer(timer, old_base);
+ BUG_ON(hrtimer_callback_running(timer));
+ __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0);
timer->base = new_base;
- enqueue_hrtimer(timer, new_base);
+ /*
+ * Enqueue the timer. Allow reprogramming of the event device
+ */
+ enqueue_hrtimer(timer, new_base, 1);
}
}
static void migrate_hrtimers(int cpu)
{
- struct hrtimer_base *old_base, *new_base;
+ struct hrtimer_cpu_base *old_base, *new_base;
int i;
BUG_ON(cpu_online(cpu));
- old_base = per_cpu(hrtimer_bases, cpu);
- new_base = get_cpu_var(hrtimer_bases);
-
- local_irq_disable();
+ old_base = &per_cpu(hrtimer_bases, cpu);
+ new_base = &get_cpu_var(hrtimer_bases);
- for (i = 0; i < MAX_HRTIMER_BASES; i++) {
+ tick_cancel_sched_timer(cpu);
- spin_lock(&new_base->lock);
- spin_lock(&old_base->lock);
-
- BUG_ON(old_base->curr_timer);
+ local_irq_disable();
- migrate_hrtimer_list(old_base, new_base);
+ spin_lock(&new_base->lock);
+ spin_lock(&old_base->lock);
- spin_unlock(&old_base->lock);
- spin_unlock(&new_base->lock);
- old_base++;
- new_base++;
+ for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
+ migrate_hrtimer_list(&old_base->clock_base[i],
+ &new_base->clock_base[i]);
}
+ spin_unlock(&old_base->lock);
+ spin_unlock(&new_base->lock);
local_irq_enable();
put_cpu_var(hrtimer_bases);
@@ -848,6 +1384,7 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
#ifdef CONFIG_HOTPLUG_CPU
case CPU_DEAD:
+ clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &cpu);
migrate_hrtimers(cpu);
break;
#endif
@@ -868,5 +1405,8 @@ void __init hrtimers_init(void)
hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
(void *)(long)smp_processor_id());
register_cpu_notifier(&hrtimers_nb);
+#ifdef CONFIG_HIGH_RES_TIMERS
+ open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq, NULL);
+#endif
}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 475e8a71bcd..0133f4f9e9f 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -168,7 +168,7 @@ EXPORT_SYMBOL(set_irq_data);
/**
* set_irq_data - set irq type data for an irq
* @irq: Interrupt number
- * @data: Pointer to interrupt specific data
+ * @entry: Pointer to MSI descriptor data
*
* Set the hardware irq controller data for an irq
*/
@@ -230,10 +230,6 @@ static void default_enable(unsigned int irq)
*/
static void default_disable(unsigned int irq)
{
- struct irq_desc *desc = irq_desc + irq;
-
- if (!(desc->status & IRQ_DELAYED_DISABLE))
- desc->chip->mask(irq);
}
/*
@@ -298,13 +294,18 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
if (unlikely(desc->status & IRQ_INPROGRESS))
goto out_unlock;
- desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
kstat_cpu(cpu).irqs[irq]++;
action = desc->action;
- if (unlikely(!action || (desc->status & IRQ_DISABLED)))
+ if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
+ if (desc->chip->mask)
+ desc->chip->mask(irq);
+ desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
+ desc->status |= IRQ_PENDING;
goto out_unlock;
+ }
+ desc->status &= ~(IRQ_REPLAY | IRQ_WAITING | IRQ_PENDING);
desc->status |= IRQ_INPROGRESS;
spin_unlock(&desc->lock);
@@ -396,11 +397,13 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
/*
* If its disabled or no action available
- * keep it masked and get out of here
+ * then mask it and get out of here:
*/
action = desc->action;
if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
desc->status |= IRQ_PENDING;
+ if (desc->chip->mask)
+ desc->chip->mask(irq);
goto out;
}
@@ -562,10 +565,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
/* Uninstall? */
if (handle == handle_bad_irq) {
- if (desc->chip != &no_irq_chip) {
- desc->chip->mask(irq);
- desc->chip->ack(irq);
- }
+ if (desc->chip != &no_irq_chip)
+ mask_ack_irq(desc, irq);
desc->status |= IRQ_DISABLED;
desc->depth = 1;
}
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 7c85d69188e..5597c157442 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -38,6 +38,46 @@ void synchronize_irq(unsigned int irq)
}
EXPORT_SYMBOL(synchronize_irq);
+/**
+ * irq_can_set_affinity - Check if the affinity of a given irq can be set
+ * @irq: Interrupt to check
+ *
+ */
+int irq_can_set_affinity(unsigned int irq)
+{
+ struct irq_desc *desc = irq_desc + irq;
+
+ if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip ||
+ !desc->chip->set_affinity)
+ return 0;
+
+ return 1;
+}
+
+/**
+ * irq_set_affinity - Set the irq affinity of a given irq
+ * @irq: Interrupt to set affinity
+ * @cpumask: cpumask
+ *
+ */
+int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
+{
+ struct irq_desc *desc = irq_desc + irq;
+
+ if (!desc->chip->set_affinity)
+ return -EINVAL;
+
+ set_balance_irq_affinity(irq, cpumask);
+
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+ set_pending_irq(irq, cpumask);
+#else
+ desc->affinity = cpumask;
+ desc->chip->set_affinity(irq, cpumask);
+#endif
+ return 0;
+}
+
#endif
/**
@@ -281,6 +321,10 @@ int setup_irq(unsigned int irq, struct irqaction *new)
if (new->flags & IRQF_PERCPU)
desc->status |= IRQ_PER_CPU;
#endif
+ /* Exclude IRQ from balancing */
+ if (new->flags & IRQF_NOBALANCING)
+ desc->status |= IRQ_NO_BALANCING;
+
if (!shared) {
irq_chip_set_defaults(desc->chip);
@@ -461,7 +505,7 @@ int request_irq(unsigned int irq, irq_handler_t handler,
/*
* Lockdep wants atomic interrupt handlers:
*/
- irqflags |= SA_INTERRUPT;
+ irqflags |= IRQF_DISABLED;
#endif
/*
* Sanity-check: shared interrupts must pass in a real dev-ID,
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 6d3be06e8ce..2db91eb54ad 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -16,26 +16,6 @@ static struct proc_dir_entry *root_irq_dir;
#ifdef CONFIG_SMP
-#ifdef CONFIG_GENERIC_PENDING_IRQ
-void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
-{
- set_balance_irq_affinity(irq, mask_val);
-
- /*
- * Save these away for later use. Re-progam when the
- * interrupt is pending
- */
- set_pending_irq(irq, mask_val);
-}
-#else
-void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
-{
- set_balance_irq_affinity(irq, mask_val);
- irq_desc[irq].affinity = mask_val;
- irq_desc[irq].chip->set_affinity(irq, mask_val);
-}
-#endif
-
static int irq_affinity_read_proc(char *page, char **start, off_t off,
int count, int *eof, void *data)
{
@@ -55,7 +35,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
cpumask_t new_value, tmp;
if (!irq_desc[irq].chip->set_affinity || no_irq_affinity ||
- CHECK_IRQ_PER_CPU(irq_desc[irq].status))
+ irq_balancing_disabled(irq))
return -EIO;
err = cpumask_parse_user(buffer, count, new_value);
@@ -73,7 +53,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
code to set default SMP affinity. */
return select_smp_affinity(irq) ? -EINVAL : full_count;
- proc_set_irq_affinity(irq, new_value);
+ irq_set_affinity(irq, new_value);
return full_count;
}
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 204ed7939e7..307c6a632ef 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -128,18 +128,13 @@ asmlinkage long sys_getitimer(int which, struct itimerval __user *value)
/*
* The timer is automagically restarted, when interval != 0
*/
-int it_real_fn(struct hrtimer *timer)
+enum hrtimer_restart it_real_fn(struct hrtimer *timer)
{
struct signal_struct *sig =
container_of(timer, struct signal_struct, real_timer);
send_group_sig_info(SIGALRM, SEND_SIG_PRIV, sig->tsk);
- if (sig->it_real_incr.tv64 != 0) {
- hrtimer_forward(timer, timer->base->softirq_time,
- sig->it_real_incr);
- return HRTIMER_RESTART;
- }
return HRTIMER_NORESTART;
}
@@ -231,11 +226,14 @@ again:
spin_unlock_irq(&tsk->sighand->siglock);
goto again;
}
- tsk->signal->it_real_incr =
- timeval_to_ktime(value->it_interval);
expires = timeval_to_ktime(value->it_value);
- if (expires.tv64 != 0)
- hrtimer_start(timer, expires, HRTIMER_REL);
+ if (expires.tv64 != 0) {
+ tsk->signal->it_real_incr =
+ timeval_to_ktime(value->it_interval);
+ hrtimer_start(timer, expires, HRTIMER_MODE_REL);
+ } else
+ tsk->signal->it_real_incr.tv64 = 0;
+
spin_unlock_irq(&tsk->sighand->siglock);
break;
case ITIMER_VIRTUAL:
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 3a7379aa31c..796276141e5 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -217,7 +217,10 @@ static int wait_for_helper(void *data)
sub_info->retval = ret;
}
- complete(sub_info->complete);
+ if (sub_info->wait < 0)
+ kfree(sub_info);
+ else
+ complete(sub_info->complete);
return 0;
}
@@ -239,6 +242,9 @@ static void __call_usermodehelper(struct work_struct *work)
pid = kernel_thread(____call_usermodehelper, sub_info,
CLONE_VFORK | SIGCHLD);
+ if (wait < 0)
+ return;
+
if (pid < 0) {
sub_info->retval = pid;
complete(sub_info->complete);
@@ -253,6 +259,9 @@ static void __call_usermodehelper(struct work_struct *work)
* @envp: null-terminated environment list
* @session_keyring: session keyring for process (NULL for an empty keyring)
* @wait: wait for the application to finish and return status.
+ * when -1 don't wait at all, but you get no useful error back when
+ * the program couldn't be exec'ed. This makes it safe to call
+ * from interrupt context.
*
* Runs a user-space application. The application is started
* asynchronously if wait is not set, and runs as a child of keventd.
@@ -265,17 +274,8 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
struct key *session_keyring, int wait)
{
DECLARE_COMPLETION_ONSTACK(done);
- struct subprocess_info sub_info = {
- .work = __WORK_INITIALIZER(sub_info.work,
- __call_usermodehelper),
- .complete = &done,
- .path = path,
- .argv = argv,
- .envp = envp,
- .ring = session_keyring,
- .wait = wait,
- .retval = 0,
- };
+ struct subprocess_info *sub_info;
+ int retval;
if (!khelper_wq)
return -EBUSY;
@@ -283,9 +283,25 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
if (path[0] == '\0')
return 0;
- queue_work(khelper_wq, &sub_info.work);
+ sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC);
+ if (!sub_info)
+ return -ENOMEM;
+
+ INIT_WORK(&sub_info->work, __call_usermodehelper);
+ sub_info->complete = &done;
+ sub_info->path = path;
+ sub_info->argv = argv;
+ sub_info->envp = envp;
+ sub_info->ring = session_keyring;
+ sub_info->wait = wait;
+
+ queue_work(khelper_wq, &sub_info->work);
+ if (wait < 0) /* task has freed sub_info */
+ return 0;
wait_for_completion(&done);
- return sub_info.retval;
+ retval = sub_info->retval;
+ kfree(sub_info);
+ return retval;
}
EXPORT_SYMBOL(call_usermodehelper_keys);
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 88fc611b3ae..58f35e586ee 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -10,7 +10,6 @@
* Code for /proc/lockdep and /proc/lockdep_stats:
*
*/
-#include <linux/sched.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 841539d72c5..d17436cdea1 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -13,7 +13,6 @@
* Released under the General Public License (GPL).
*/
#include <linux/mutex.h>
-#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/poison.h>
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 7c3e1e6dfb5..657f7769741 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -304,7 +304,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
* should be able to see it.
*/
struct task_struct *p;
- read_lock(&tasklist_lock);
+ rcu_read_lock();
p = find_task_by_pid(pid);
if (p) {
if (CPUCLOCK_PERTHREAD(which_clock)) {
@@ -312,12 +312,17 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
error = cpu_clock_sample(which_clock,
p, &rtn);
}
- } else if (p->tgid == pid && p->signal) {
- error = cpu_clock_sample_group(which_clock,
- p, &rtn);
+ } else {
+ read_lock(&tasklist_lock);
+ if (p->tgid == pid && p->signal) {
+ error =
+ cpu_clock_sample_group(which_clock,
+ p, &rtn);
+ }
+ read_unlock(&tasklist_lock);
}
}
- read_unlock(&tasklist_lock);
+ rcu_read_unlock();
}
if (error)
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index a1bf6161783..44318ca7197 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -145,7 +145,7 @@ static int common_timer_set(struct k_itimer *, int,
struct itimerspec *, struct itimerspec *);
static int common_timer_del(struct k_itimer *timer);
-static int posix_timer_fn(struct hrtimer *data);
+static enum hrtimer_restart posix_timer_fn(struct hrtimer *data);
static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags);
@@ -334,12 +334,12 @@ EXPORT_SYMBOL_GPL(posix_timer_event);
* This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers.
*/
-static int posix_timer_fn(struct hrtimer *timer)
+static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
{
struct k_itimer *timr;
unsigned long flags;
int si_private = 0;
- int ret = HRTIMER_NORESTART;
+ enum hrtimer_restart ret = HRTIMER_NORESTART;
timr = container_of(timer, struct k_itimer, it.real.timer);
spin_lock_irqsave(&timr->it_lock, flags);
@@ -356,7 +356,7 @@ static int posix_timer_fn(struct hrtimer *timer)
if (timr->it.real.interval.tv64 != 0) {
timr->it_overrun +=
hrtimer_forward(timer,
- timer->base->softirq_time,
+ hrtimer_cb_get_time(timer),
timr->it.real.interval);
ret = HRTIMER_RESTART;
++timr->it_requeue_pending;
@@ -722,7 +722,7 @@ common_timer_set(struct k_itimer *timr, int flags,
if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec)
return 0;
- mode = flags & TIMER_ABSTIME ? HRTIMER_ABS : HRTIMER_REL;
+ mode = flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL;
hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
timr->it.real.timer.function = posix_timer_fn;
@@ -734,7 +734,7 @@ common_timer_set(struct k_itimer *timr, int flags,
/* SIGEV_NONE timers are not queued ! See common_timer_get */
if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) {
/* Setup correct expiry time for relative timers */
- if (mode == HRTIMER_REL)
+ if (mode == HRTIMER_MODE_REL)
timer->expires = ktime_add(timer->expires,
timer->base->get_time());
return 0;
@@ -950,7 +950,8 @@ static int common_nsleep(const clockid_t which_clock, int flags,
struct timespec *tsave, struct timespec __user *rmtp)
{
return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ?
- HRTIMER_ABS : HRTIMER_REL, which_clock);
+ HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
+ which_clock);
}
asmlinkage long
diff --git a/kernel/resource.c b/kernel/resource.c
index 2a3f8863658..bdb55a33f96 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -8,7 +8,6 @@
*/
#include <linux/module.h>
-#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/ioport.h>
#include <linux/init.h>
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 4ab17da46fd..180978cb2f7 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -625,7 +625,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
/* Setup the timer, when timeout != NULL */
if (unlikely(timeout))
hrtimer_start(&timeout->timer, timeout->timer.expires,
- HRTIMER_ABS);
+ HRTIMER_MODE_ABS);
for (;;) {
/* Try to acquire the lock: */
diff --git a/kernel/sched.c b/kernel/sched.c
index 08f86178aa3..0dc757246d8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1853,6 +1853,13 @@ context_switch(struct rq *rq, struct task_struct *prev,
struct mm_struct *mm = next->mm;
struct mm_struct *oldmm = prev->active_mm;
+ /*
+ * For paravirt, this is coupled with an exit in switch_to to
+ * combine the page table reload and the switch backend into
+ * one hypercall.
+ */
+ arch_enter_lazy_cpu_mode();
+
if (!mm) {
next->active_mm = oldmm;
atomic_inc(&oldmm->mm_count);
diff --git a/kernel/signal.c b/kernel/signal.c
index 8072e568bbe..e2a7d4bf7d5 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -456,26 +456,50 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
{
int signr = __dequeue_signal(&tsk->pending, mask, info);
- if (!signr)
+ if (!signr) {
signr = __dequeue_signal(&tsk->signal->shared_pending,
mask, info);
+ /*
+ * itimer signal ?
+ *
+ * itimers are process shared and we restart periodic
+ * itimers in the signal delivery path to prevent DoS
+ * attacks in the high resolution timer case. This is
+ * compliant with the old way of self restarting
+ * itimers, as the SIGALRM is a legacy signal and only
+ * queued once. Changing the restart behaviour to
+ * restart the timer in the signal dequeue path is
+ * reducing the timer noise on heavy loaded !highres
+ * systems too.
+ */
+ if (unlikely(signr == SIGALRM)) {
+ struct hrtimer *tmr = &tsk->signal->real_timer;
+
+ if (!hrtimer_is_queued(tmr) &&
+ tsk->signal->it_real_incr.tv64 != 0) {
+ hrtimer_forward(tmr, tmr->base->get_time(),
+ tsk->signal->it_real_incr);
+ hrtimer_restart(tmr);
+ }
+ }
+ }
recalc_sigpending_tsk(tsk);
- if (signr && unlikely(sig_kernel_stop(signr))) {
- /*
- * Set a marker that we have dequeued a stop signal. Our
- * caller might release the siglock and then the pending
- * stop signal it is about to process is no longer in the
- * pending bitmasks, but must still be cleared by a SIGCONT
- * (and overruled by a SIGKILL). So those cases clear this
- * shared flag after we've set it. Note that this flag may
- * remain set after the signal we return is ignored or
- * handled. That doesn't matter because its only purpose
- * is to alert stop-signal processing code when another
- * processor has come along and cleared the flag.
- */
- if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT))
- tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
- }
+ if (signr && unlikely(sig_kernel_stop(signr))) {
+ /*
+ * Set a marker that we have dequeued a stop signal. Our
+ * caller might release the siglock and then the pending
+ * stop signal it is about to process is no longer in the
+ * pending bitmasks, but must still be cleared by a SIGCONT
+ * (and overruled by a SIGKILL). So those cases clear this
+ * shared flag after we've set it. Note that this flag may
+ * remain set after the signal we return is ignored or
+ * handled. That doesn't matter because its only purpose
+ * is to alert stop-signal processing code when another
+ * processor has come along and cleared the flag.
+ */
+ if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT))
+ tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
+ }
if ( signr &&
((info->si_code & __SI_MASK) == __SI_TIMER) &&
info->si_sys_private){
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 918e52df090..8b75008e2bd 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -17,6 +17,7 @@
#include <linux/kthread.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>
+#include <linux/tick.h>
#include <asm/irq.h>
/*
@@ -273,6 +274,18 @@ EXPORT_SYMBOL(do_softirq);
#endif
+/*
+ * Enter an interrupt context.
+ */
+void irq_enter(void)
+{
+ __irq_enter();
+#ifdef CONFIG_NO_HZ
+ if (idle_cpu(smp_processor_id()))
+ tick_nohz_update_jiffies();
+#endif
+}
+
#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
# define invoke_softirq() __do_softirq()
#else
@@ -289,6 +302,12 @@ void irq_exit(void)
sub_preempt_count(IRQ_EXIT_OFFSET);
if (!in_interrupt() && local_softirq_pending())
invoke_softirq();
+
+#ifdef CONFIG_NO_HZ
+ /* Make sure that timer wheel updates are propagated */
+ if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched())
+ tick_nohz_stop_sched_tick();
+#endif
preempt_enable_no_resched();
}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e0ac6cd79fc..3ca1d5ff031 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -90,12 +90,6 @@ extern char modprobe_path[];
#ifdef CONFIG_CHR_DEV_SG
extern int sg_big_buff;
#endif
-#ifdef CONFIG_SYSVIPC
-static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp,
- void __user *buffer, size_t *lenp, loff_t *ppos);
-static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, struct file *filp,
- void __user *buffer, size_t *lenp, loff_t *ppos);
-#endif
#ifdef __sparc__
extern char reboot_command [];
@@ -135,18 +129,6 @@ static int parse_table(int __user *, int, void __user *, size_t __user *,
void __user *, size_t, ctl_table *);
#endif
-static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
- void __user *buffer, size_t *lenp, loff_t *ppos);
-
-static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
- void __user *oldval, size_t __user *oldlenp,
- void __user *newval, size_t newlen);
-
-#ifdef CONFIG_SYSVIPC
-static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen,
- void __user *oldval, size_t __user *oldlenp,
- void __user *newval, size_t newlen);
-#endif
#ifdef CONFIG_PROC_SYSCTL
static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp,
@@ -177,60 +159,6 @@ int sysctl_legacy_va_layout;
#endif
-static void *get_uts(ctl_table *table, int write)
-{
- char *which = table->data;
-#ifdef CONFIG_UTS_NS
- struct uts_namespace *uts_ns = current->nsproxy->uts_ns;
- which = (which - (char *)&init_uts_ns) + (char *)uts_ns;
-#endif
- if (!write)
- down_read(&uts_sem);
- else
- down_write(&uts_sem);
- return which;
-}
-
-static void put_uts(ctl_table *table, int write, void *which)
-{
- if (!write)
- up_read(&uts_sem);
- else
- up_write(&uts_sem);
-}
-
-#ifdef CONFIG_SYSVIPC
-static void *get_ipc(ctl_table *table, int write)
-{
- char *which = table->data;
- struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
- which = (which - (char *)&init_ipc_ns) + (char *)ipc_ns;
- return which;
-}
-#else
-#define get_ipc(T,W) ((T)->data)
-#endif
-
-/* /proc declarations: */
-
-#ifdef CONFIG_PROC_SYSCTL
-
-static ssize_t proc_readsys(struct file *, char __user *, size_t, loff_t *);
-static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *);
-static int proc_opensys(struct inode *, struct file *);
-
-const struct file_operations proc_sys_file_operations = {
- .open = proc_opensys,
- .read = proc_readsys,
- .write = proc_writesys,
-};
-
-extern struct proc_dir_entry *proc_sys_root;
-
-static void register_proc_table(ctl_table *, struct proc_dir_entry *, void *);
-static void unregister_proc_table(ctl_table *, struct proc_dir_entry *);
-#endif
-
/* The default sysctl tables: */
static ctl_table root_table[] = {
@@ -278,51 +206,6 @@ static ctl_table root_table[] = {
static ctl_table kern_table[] = {
{
- .ctl_name = KERN_OSTYPE,
- .procname = "ostype",
- .data = init_uts_ns.name.sysname,
- .maxlen = sizeof(init_uts_ns.name.sysname),
- .mode = 0444,
- .proc_handler = &proc_do_uts_string,
- .strategy = &sysctl_uts_string,
- },
- {
- .ctl_name = KERN_OSRELEASE,
- .procname = "osrelease",
- .data = init_uts_ns.name.release,
- .maxlen = sizeof(init_uts_ns.name.release),
- .mode = 0444,
- .proc_handler = &proc_do_uts_string,
- .strategy = &sysctl_uts_string,
- },
- {
- .ctl_name = KERN_VERSION,
- .procname = "version",
- .data = init_uts_ns.name.version,
- .maxlen = sizeof(init_uts_ns.name.version),
- .mode = 0444,
- .proc_handler = &proc_do_uts_string,
- .strategy = &sysctl_uts_string,
- },
- {
- .ctl_name = KERN_NODENAME,
- .procname = "hostname",
- .data = init_uts_ns.name.nodename,
- .maxlen = sizeof(init_uts_ns.name.nodename),
- .mode = 0644,
- .proc_handler = &proc_do_uts_string,
- .strategy = &sysctl_uts_string,
- },
- {
- .ctl_name = KERN_DOMAINNAME,
- .procname = "domainname",
- .data = init_uts_ns.name.domainname,
- .maxlen = sizeof(init_uts_ns.name.domainname),
- .mode = 0644,
- .proc_handler = &proc_do_uts_string,
- .strategy = &sysctl_uts_string,
- },
- {
.ctl_name = KERN_PANIC,
.procname = "panic",
.data = &panic_timeout,
@@ -478,71 +361,6 @@ static ctl_table kern_table[] = {
.proc_handler = &proc_dointvec,
},
#endif
-#ifdef CONFIG_SYSVIPC
- {
- .ctl_name = KERN_SHMMAX,
- .procname = "shmmax",
- .data = &init_ipc_ns.shm_ctlmax,
- .maxlen = sizeof (init_ipc_ns.shm_ctlmax),
- .mode = 0644,
- .proc_handler = &proc_ipc_doulongvec_minmax,
- .strategy = sysctl_ipc_data,
- },
- {
- .ctl_name = KERN_SHMALL,
- .procname = "shmall",
- .data = &init_ipc_ns.shm_ctlall,
- .maxlen = sizeof (init_ipc_ns.shm_ctlall),
- .mode = 0644,
- .proc_handler = &proc_ipc_doulongvec_minmax,
- .strategy = sysctl_ipc_data,
- },
- {
- .ctl_name = KERN_SHMMNI,
- .procname = "shmmni",
- .data = &init_ipc_ns.shm_ctlmni,
- .maxlen = sizeof (init_ipc_ns.shm_ctlmni),
- .mode = 0644,
- .proc_handler = &proc_ipc_dointvec,
- .strategy = sysctl_ipc_data,
- },
- {
- .ctl_name = KERN_MSGMAX,
- .procname = "msgmax",
- .data = &init_ipc_ns.msg_ctlmax,
- .maxlen = sizeof (init_ipc_ns.msg_ctlmax),
- .mode = 0644,
- .proc_handler = &proc_ipc_dointvec,
- .strategy = sysctl_ipc_data,
- },
- {
- .ctl_name = KERN_MSGMNI,
- .procname = "msgmni",
- .data = &init_ipc_ns.msg_ctlmni,
- .maxlen = sizeof (init_ipc_ns.msg_ctlmni),
- .mode = 0644,
- .proc_handler = &proc_ipc_dointvec,
- .strategy = sysctl_ipc_data,
- },
- {
- .ctl_name = KERN_MSGMNB,
- .procname = "msgmnb",
- .data = &init_ipc_ns.msg_ctlmnb,
- .maxlen = sizeof (init_ipc_ns.msg_ctlmnb),
- .mode = 0644,
- .proc_handler = &proc_ipc_dointvec,
- .strategy = sysctl_ipc_data,
- },
- {
- .ctl_name = KERN_SEM,
- .procname = "sem",
- .data = &init_ipc_ns.sem_ctls,
- .maxlen = 4*sizeof (int),
- .mode = 0644,
- .proc_handler = &proc_ipc_dointvec,
- .strategy = sysctl_ipc_data,
- },
-#endif
#ifdef CONFIG_MAGIC_SYSRQ
{
.ctl_name = KERN_SYSRQ,
@@ -1043,6 +861,12 @@ static ctl_table vm_table[] = {
{ .ctl_name = 0 }
};
+#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
+static ctl_table binfmt_misc_table[] = {
+ { .ctl_name = 0 }
+};
+#endif
+
static ctl_table fs_table[] = {
{
.ctl_name = FS_NRINODE,
@@ -1166,6 +990,14 @@ static ctl_table fs_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
+#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "binfmt_misc",
+ .mode = 0555,
+ .child = binfmt_misc_table,
+ },
+#endif
{ .ctl_name = 0 }
};
@@ -1177,8 +1009,6 @@ static ctl_table dev_table[] = {
{ .ctl_name = 0 }
};
-extern void init_irq_proc (void);
-
static DEFINE_SPINLOCK(sysctl_lock);
/* called under sysctl_lock */
@@ -1220,19 +1050,47 @@ static void start_unregistering(struct ctl_table_header *p)
list_del_init(&p->ctl_entry);
}
-void __init sysctl_init(void)
+void sysctl_head_finish(struct ctl_table_header *head)
{
-#ifdef CONFIG_PROC_SYSCTL
- register_proc_table(root_table, proc_sys_root, &root_table_header);
- init_irq_proc();
-#endif
+ if (!head)
+ return;
+ spin_lock(&sysctl_lock);
+ unuse_table(head);
+ spin_unlock(&sysctl_lock);
+}
+
+struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev)
+{
+ struct ctl_table_header *head;
+ struct list_head *tmp;
+ spin_lock(&sysctl_lock);
+ if (prev) {
+ tmp = &prev->ctl_entry;
+ unuse_table(prev);
+ goto next;
+ }
+ tmp = &root_table_header.ctl_entry;
+ for (;;) {
+ head = list_entry(tmp, struct ctl_table_header, ctl_entry);
+
+ if (!use_table(head))
+ goto next;
+ spin_unlock(&sysctl_lock);
+ return head;
+ next:
+ tmp = tmp->next;
+ if (tmp == &root_table_header.ctl_entry)
+ break;
+ }
+ spin_unlock(&sysctl_lock);
+ return NULL;
}
#ifdef CONFIG_SYSCTL_SYSCALL
int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp,
void __user *newval, size_t newlen)
{
- struct list_head *tmp;
+ struct ctl_table_header *head;
int error = -ENOTDIR;
if (nlen <= 0 || nlen >= CTL_MAXNAME)
@@ -1242,26 +1100,16 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
if (!oldlenp || get_user(old_len, oldlenp))
return -EFAULT;
}
- spin_lock(&sysctl_lock);
- tmp = &root_table_header.ctl_entry;
- do {
- struct ctl_table_header *head =
- list_entry(tmp, struct ctl_table_header, ctl_entry);
-
- if (!use_table(head))
- continue;
-
- spin_unlock(&sysctl_lock);
+ for (head = sysctl_head_next(NULL); head;
+ head = sysctl_head_next(head)) {
error = parse_table(name, nlen, oldval, oldlenp,
newval, newlen, head->ctl_table);
-
- spin_lock(&sysctl_lock);
- unuse_table(head);
- if (error != -ENOTDIR)
+ if (error != -ENOTDIR) {
+ sysctl_head_finish(head);
break;
- } while ((tmp = tmp->next) != &root_table_header.ctl_entry);
- spin_unlock(&sysctl_lock);
+ }
+ }
return error;
}
@@ -1282,7 +1130,7 @@ asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
#endif /* CONFIG_SYSCTL_SYSCALL */
/*
- * ctl_perm does NOT grant the superuser all rights automatically, because
+ * sysctl_perm does NOT grant the superuser all rights automatically, because
* some sysctl variables are readonly even to root.
*/
@@ -1297,7 +1145,7 @@ static int test_perm(int mode, int op)
return -EACCES;
}
-static inline int ctl_perm(ctl_table *table, int op)
+int sysctl_perm(ctl_table *table, int op)
{
int error;
error = security_sysctl(table, op);
@@ -1321,19 +1169,11 @@ repeat:
for ( ; table->ctl_name || table->procname; table++) {
if (!table->ctl_name)
continue;
- if (n == table->ctl_name || table->ctl_name == CTL_ANY) {
+ if (n == table->ctl_name) {
int error;
if (table->child) {
- if (ctl_perm(table, 001))
+ if (sysctl_perm(table, 001))
return -EPERM;
- if (table->strategy) {
- error = table->strategy(
- table, name, nlen,
- oldval, oldlenp,
- newval, newlen);
- if (error)
- return error;
- }
name++;
nlen--;
table = table->child;
@@ -1361,7 +1201,7 @@ int do_sysctl_strategy (ctl_table *table,
op |= 004;
if (newval)
op |= 002;
- if (ctl_perm(table, op))
+ if (sysctl_perm(table, op))
return -EPERM;
if (table->strategy) {
@@ -1400,10 +1240,26 @@ int do_sysctl_strategy (ctl_table *table,
}
#endif /* CONFIG_SYSCTL_SYSCALL */
+static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
+{
+ for (; table->ctl_name || table->procname; table++) {
+ table->parent = parent;
+ if (table->child)
+ sysctl_set_parent(table, table->child);
+ }
+}
+
+static __init int sysctl_init(void)
+{
+ sysctl_set_parent(NULL, root_table);
+ return 0;
+}
+
+core_initcall(sysctl_init);
+
/**
* register_sysctl_table - register a sysctl hierarchy
* @table: the top-level table structure
- * @insert_at_head: whether the entry should be inserted in front or at the end
*
* Register a sysctl table hierarchy. @table should be a filled in ctl_table
* array. An entry with a ctl_name of 0 terminates the table.
@@ -1469,8 +1325,7 @@ int do_sysctl_strategy (ctl_table *table,
* This routine returns %NULL on a failure to register, and a pointer
* to the table header on success.
*/
-struct ctl_table_header *register_sysctl_table(ctl_table * table,
- int insert_at_head)
+struct ctl_table_header *register_sysctl_table(ctl_table * table)
{
struct ctl_table_header *tmp;
tmp = kmalloc(sizeof(struct ctl_table_header), GFP_KERNEL);
@@ -1480,15 +1335,10 @@ struct ctl_table_header *register_sysctl_table(ctl_table * table,
INIT_LIST_HEAD(&tmp->ctl_entry);
tmp->used = 0;
tmp->unregistering = NULL;
+ sysctl_set_parent(NULL, table);
spin_lock(&sysctl_lock);
- if (insert_at_head)
- list_add(&tmp->ctl_entry, &root_table_header.ctl_entry);
- else
- list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry);
+ list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry);
spin_unlock(&sysctl_lock);
-#ifdef CONFIG_PROC_SYSCTL
- register_proc_table(table, proc_sys_root, tmp);
-#endif
return tmp;
}
@@ -1504,9 +1354,6 @@ void unregister_sysctl_table(struct ctl_table_header * header)
might_sleep();
spin_lock(&sysctl_lock);
start_unregistering(header);
-#ifdef CONFIG_PROC_SYSCTL
- unregister_proc_table(header->ctl_table, proc_sys_root);
-#endif
spin_unlock(&sysctl_lock);
kfree(header);
}
@@ -1530,155 +1377,6 @@ void unregister_sysctl_table(struct ctl_table_header * table)
#ifdef CONFIG_PROC_SYSCTL
-/* Scan the sysctl entries in table and add them all into /proc */
-static void register_proc_table(ctl_table * table, struct proc_dir_entry *root, void *set)
-{
- struct proc_dir_entry *de;
- int len;
- mode_t mode;
-
- for (; table->ctl_name || table->procname; table++) {
- /* Can't do anything without a proc name. */
- if (!table->procname)
- continue;
- /* Maybe we can't do anything with it... */
- if (!table->proc_handler && !table->child) {
- printk(KERN_WARNING "SYSCTL: Can't register %s\n",
- table->procname);
- continue;
- }
-
- len = strlen(table->procname);
- mode = table->mode;
-
- de = NULL;
- if (table->proc_handler)
- mode |= S_IFREG;
- else {
- mode |= S_IFDIR;
- for (de = root->subdir; de; de = de->next) {
- if (proc_match(len, table->procname, de))
- break;
- }
- /* If the subdir exists already, de is non-NULL */
- }
-
- if (!de) {
- de = create_proc_entry(table->procname, mode, root);
- if (!de)
- continue;
- de->set = set;
- de->data = (void *) table;
- if (table->proc_handler)
- de->proc_fops = &proc_sys_file_operations;
- }
- table->de = de;
- if (de->mode & S_IFDIR)
- register_proc_table(table->child, de, set);
- }
-}
-
-/*
- * Unregister a /proc sysctl table and any subdirectories.
- */
-static void unregister_proc_table(ctl_table * table, struct proc_dir_entry *root)
-{
- struct proc_dir_entry *de;
- for (; table->ctl_name || table->procname; table++) {
- if (!(de = table->de))
- continue;
- if (de->mode & S_IFDIR) {
- if (!table->child) {
- printk (KERN_ALERT "Help - malformed sysctl tree on free\n");
- continue;
- }
- unregister_proc_table(table->child, de);
-
- /* Don't unregister directories which still have entries.. */
- if (de->subdir)
- continue;
- }
-
- /*
- * In any case, mark the entry as goner; we'll keep it
- * around if it's busy, but we'll know to do nothing with
- * its fields. We are under sysctl_lock here.
- */
- de->data = NULL;
-
- /* Don't unregister proc entries that are still being used.. */
- if (atomic_read(&de->count))
- continue;
-
- table->de = NULL;
- remove_proc_entry(table->procname, root);
- }
-}
-
-static ssize_t do_rw_proc(int write, struct file * file, char __user * buf,
- size_t count, loff_t *ppos)
-{
- int op;
- struct proc_dir_entry *de = PDE(file->f_path.dentry->d_inode);
- struct ctl_table *table;
- size_t res;
- ssize_t error = -ENOTDIR;
-
- spin_lock(&sysctl_lock);
- if (de && de->data && use_table(de->set)) {
- /*
- * at that point we know that sysctl was not unregistered
- * and won't be until we finish
- */
- spin_unlock(&sysctl_lock);
- table = (struct ctl_table *) de->data;
- if (!table || !table->proc_handler)
- goto out;
- error = -EPERM;
- op = (write ? 002 : 004);
- if (ctl_perm(table, op))
- goto out;
-
- /* careful: calling conventions are nasty here */
- res = count;
- error = (*table->proc_handler)(table, write, file,
- buf, &res, ppos);
- if (!error)
- error = res;
- out:
- spin_lock(&sysctl_lock);
- unuse_table(de->set);
- }
- spin_unlock(&sysctl_lock);
- return error;
-}
-
-static int proc_opensys(struct inode *inode, struct file *file)
-{
- if (file->f_mode & FMODE_WRITE) {
- /*
- * sysctl entries that are not writable,
- * are _NOT_ writable, capabilities or not.
- */
- if (!(inode->i_mode & S_IWUSR))
- return -EPERM;
- }
-
- return 0;
-}
-
-static ssize_t proc_readsys(struct file * file, char __user * buf,
- size_t count, loff_t *ppos)
-{
- return do_rw_proc(0, file, buf, count, ppos);
-}
-
-static ssize_t proc_writesys(struct file * file, const char __user * buf,
- size_t count, loff_t *ppos)
-{
- return do_rw_proc(1, file, (char __user *) buf, count, ppos);
-}
-
static int _proc_do_string(void* data, int maxlen, int write,
struct file *filp, void __user *buffer,
size_t *lenp, loff_t *ppos)
@@ -1762,21 +1460,6 @@ int proc_dostring(ctl_table *table, int write, struct file *filp,
buffer, lenp, ppos);
}
-/*
- * Special case of dostring for the UTS structure. This has locks
- * to observe. Should this be in kernel/sys.c ????
- */
-
-static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- int r;
- void *which;
- which = get_uts(table, write);
- r = _proc_do_string(which, table->maxlen,write,filp,buffer,lenp, ppos);
- put_uts(table, write, which);
- return r;
-}
static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
int *valp,
@@ -2362,27 +2045,6 @@ int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp,
do_proc_dointvec_ms_jiffies_conv, NULL);
}
-#ifdef CONFIG_SYSVIPC
-static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- void *which;
- which = get_ipc(table, write);
- return __do_proc_dointvec(which, table, write, filp, buffer,
- lenp, ppos, NULL, NULL);
-}
-
-static int proc_ipc_doulongvec_minmax(ctl_table *table, int write,
- struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- void *which;
- which = get_ipc(table, write);
- return __do_proc_doulongvec_minmax(which, table, write, filp, buffer,
- lenp, ppos, 1l, 1l);
-}
-
-#endif
-
static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
@@ -2413,31 +2075,6 @@ int proc_dostring(ctl_table *table, int write, struct file *filp,
return -ENOSYS;
}
-static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-#ifdef CONFIG_SYSVIPC
-static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-static int proc_ipc_doulongvec_minmax(ctl_table *table, int write,
- struct file *filp, void __user *buffer,
- size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-#endif
-
int proc_dointvec(ctl_table *table, int write, struct file *filp,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
@@ -2648,62 +2285,6 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
}
-/* The generic string strategy routine: */
-static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
- void __user *oldval, size_t __user *oldlenp,
- void __user *newval, size_t newlen)
-{
- struct ctl_table uts_table;
- int r, write;
- write = newval && newlen;
- memcpy(&uts_table, table, sizeof(uts_table));
- uts_table.data = get_uts(table, write);
- r = sysctl_string(&uts_table, name, nlen,
- oldval, oldlenp, newval, newlen);
- put_uts(table, write, uts_table.data);
- return r;
-}
-
-#ifdef CONFIG_SYSVIPC
-/* The generic sysctl ipc data routine. */
-static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen,
- void __user *oldval, size_t __user *oldlenp,
- void __user *newval, size_t newlen)
-{
- size_t len;
- void *data;
-
- /* Get out of I don't have a variable */
- if (!table->data || !table->maxlen)
- return -ENOTDIR;
-
- data = get_ipc(table, 1);
- if (!data)
- return -ENOTDIR;
-
- if (oldval && oldlenp) {
- if (get_user(len, oldlenp))
- return -EFAULT;
- if (len) {
- if (len > table->maxlen)
- len = table->maxlen;
- if (copy_to_user(oldval, data, len))
- return -EFAULT;
- if (put_user(len, oldlenp))
- return -EFAULT;
- }
- }
-
- if (newval && newlen) {
- if (newlen > table->maxlen)
- newlen = table->maxlen;
-
- if (copy_from_user(data, newval, newlen))
- return -EFAULT;
- }
- return 1;
-}
-#endif
#else /* CONFIG_SYSCTL_SYSCALL */
@@ -2769,20 +2350,6 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
return -ENOSYS;
}
-static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
- void __user *oldval, size_t __user *oldlenp,
- void __user *newval, size_t newlen)
-{
- return -ENOSYS;
-}
-#ifdef CONFIG_SYSVIPC
-static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen,
- void __user *oldval, size_t __user *oldlenp,
- void __user *newval, size_t newlen)
-{
- return -ENOSYS;
-}
-#endif
#endif /* CONFIG_SYSCTL_SYSCALL */
/*
diff --git a/kernel/time.c b/kernel/time.c
index 0e017bff4c1..c6c80ea5d0e 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -470,6 +470,260 @@ struct timeval ns_to_timeval(const s64 nsec)
return tv;
}
+/*
+ * Convert jiffies to milliseconds and back.
+ *
+ * Avoid unnecessary multiplications/divisions in the
+ * two most common HZ cases:
+ */
+unsigned int jiffies_to_msecs(const unsigned long j)
+{
+#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
+ return (MSEC_PER_SEC / HZ) * j;
+#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
+ return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC);
+#else
+ return (j * MSEC_PER_SEC) / HZ;
+#endif
+}
+EXPORT_SYMBOL(jiffies_to_msecs);
+
+unsigned int jiffies_to_usecs(const unsigned long j)
+{
+#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
+ return (USEC_PER_SEC / HZ) * j;
+#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
+ return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC);
+#else
+ return (j * USEC_PER_SEC) / HZ;
+#endif
+}
+EXPORT_SYMBOL(jiffies_to_usecs);
+
+/*
+ * When we convert to jiffies then we interpret incoming values
+ * the following way:
+ *
+ * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET)
+ *
+ * - 'too large' values [that would result in larger than
+ * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too.
+ *
+ * - all other values are converted to jiffies by either multiplying
+ * the input value by a factor or dividing it with a factor
+ *
+ * We must also be careful about 32-bit overflows.
+ */
+unsigned long msecs_to_jiffies(const unsigned int m)
+{
+ /*
+ * Negative value, means infinite timeout:
+ */
+ if ((int)m < 0)
+ return MAX_JIFFY_OFFSET;
+
+#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
+ /*
+ * HZ is equal to or smaller than 1000, and 1000 is a nice
+ * round multiple of HZ, divide with the factor between them,
+ * but round upwards:
+ */
+ return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ);
+#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
+ /*
+ * HZ is larger than 1000, and HZ is a nice round multiple of
+ * 1000 - simply multiply with the factor between them.
+ *
+ * But first make sure the multiplication result cannot
+ * overflow:
+ */
+ if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
+ return MAX_JIFFY_OFFSET;
+
+ return m * (HZ / MSEC_PER_SEC);
+#else
+ /*
+ * Generic case - multiply, round and divide. But first
+ * check that if we are doing a net multiplication, that
+ * we wouldnt overflow:
+ */
+ if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
+ return MAX_JIFFY_OFFSET;
+
+ return (m * HZ + MSEC_PER_SEC - 1) / MSEC_PER_SEC;
+#endif
+}
+EXPORT_SYMBOL(msecs_to_jiffies);
+
+unsigned long usecs_to_jiffies(const unsigned int u)
+{
+ if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET))
+ return MAX_JIFFY_OFFSET;
+#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
+ return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ);
+#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
+ return u * (HZ / USEC_PER_SEC);
+#else
+ return (u * HZ + USEC_PER_SEC - 1) / USEC_PER_SEC;
+#endif
+}
+EXPORT_SYMBOL(usecs_to_jiffies);
+
+/*
+ * The TICK_NSEC - 1 rounds up the value to the next resolution. Note
+ * that a remainder subtract here would not do the right thing as the
+ * resolution values don't fall on second boundries. I.e. the line:
+ * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding.
+ *
+ * Rather, we just shift the bits off the right.
+ *
+ * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec
+ * value to a scaled second value.
+ */
+unsigned long
+timespec_to_jiffies(const struct timespec *value)
+{
+ unsigned long sec = value->tv_sec;
+ long nsec = value->tv_nsec + TICK_NSEC - 1;
+
+ if (sec >= MAX_SEC_IN_JIFFIES){
+ sec = MAX_SEC_IN_JIFFIES;
+ nsec = 0;
+ }
+ return (((u64)sec * SEC_CONVERSION) +
+ (((u64)nsec * NSEC_CONVERSION) >>
+ (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
+
+}
+EXPORT_SYMBOL(timespec_to_jiffies);
+
+void
+jiffies_to_timespec(const unsigned long jiffies, struct timespec *value)
+{
+ /*
+ * Convert jiffies to nanoseconds and separate with
+ * one divide.
+ */
+ u64 nsec = (u64)jiffies * TICK_NSEC;
+ value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &value->tv_nsec);
+}
+EXPORT_SYMBOL(jiffies_to_timespec);
+
+/* Same for "timeval"
+ *
+ * Well, almost. The problem here is that the real system resolution is
+ * in nanoseconds and the value being converted is in micro seconds.
+ * Also for some machines (those that use HZ = 1024, in-particular),
+ * there is a LARGE error in the tick size in microseconds.
+
+ * The solution we use is to do the rounding AFTER we convert the
+ * microsecond part. Thus the USEC_ROUND, the bits to be shifted off.
+ * Instruction wise, this should cost only an additional add with carry
+ * instruction above the way it was done above.
+ */
+unsigned long
+timeval_to_jiffies(const struct timeval *value)
+{
+ unsigned long sec = value->tv_sec;
+ long usec = value->tv_usec;
+
+ if (sec >= MAX_SEC_IN_JIFFIES){
+ sec = MAX_SEC_IN_JIFFIES;
+ usec = 0;
+ }
+ return (((u64)sec * SEC_CONVERSION) +
+ (((u64)usec * USEC_CONVERSION + USEC_ROUND) >>
+ (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
+}
+
+void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value)
+{
+ /*
+ * Convert jiffies to nanoseconds and separate with
+ * one divide.
+ */
+ u64 nsec = (u64)jiffies * TICK_NSEC;
+ long tv_usec;
+
+ value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &tv_usec);
+ tv_usec /= NSEC_PER_USEC;
+ value->tv_usec = tv_usec;
+}
+
+/*
+ * Convert jiffies/jiffies_64 to clock_t and back.
+ */
+clock_t jiffies_to_clock_t(long x)
+{
+#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
+ return x / (HZ / USER_HZ);
+#else
+ u64 tmp = (u64)x * TICK_NSEC;
+ do_div(tmp, (NSEC_PER_SEC / USER_HZ));
+ return (long)tmp;
+#endif
+}
+EXPORT_SYMBOL(jiffies_to_clock_t);
+
+unsigned long clock_t_to_jiffies(unsigned long x)
+{
+#if (HZ % USER_HZ)==0
+ if (x >= ~0UL / (HZ / USER_HZ))
+ return ~0UL;
+ return x * (HZ / USER_HZ);
+#else
+ u64 jif;
+
+ /* Don't worry about loss of precision here .. */
+ if (x >= ~0UL / HZ * USER_HZ)
+ return ~0UL;
+
+ /* .. but do try to contain it here */
+ jif = x * (u64) HZ;
+ do_div(jif, USER_HZ);
+ return jif;
+#endif
+}
+EXPORT_SYMBOL(clock_t_to_jiffies);
+
+u64 jiffies_64_to_clock_t(u64 x)
+{
+#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
+ do_div(x, HZ / USER_HZ);
+#else
+ /*
+ * There are better ways that don't overflow early,
+ * but even this doesn't overflow in hundreds of years
+ * in 64 bits, so..
+ */
+ x *= TICK_NSEC;
+ do_div(x, (NSEC_PER_SEC / USER_HZ));
+#endif
+ return x;
+}
+
+EXPORT_SYMBOL(jiffies_64_to_clock_t);
+
+u64 nsec_to_clock_t(u64 x)
+{
+#if (NSEC_PER_SEC % USER_HZ) == 0
+ do_div(x, (NSEC_PER_SEC / USER_HZ));
+#elif (USER_HZ % 512) == 0
+ x *= USER_HZ/512;
+ do_div(x, (NSEC_PER_SEC / 512));
+#else
+ /*
+ * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024,
+ * overflow after 64.99 years.
+ * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ...
+ */
+ x *= 9;
+ do_div(x, (unsigned long)((9ull * NSEC_PER_SEC + (USER_HZ/2)) /
+ USER_HZ));
+#endif
+ return x;
+}
+
#if (BITS_PER_LONG < 64)
u64 get_jiffies_64(void)
{
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
new file mode 100644
index 00000000000..f6635112654
--- /dev/null
+++ b/kernel/time/Kconfig
@@ -0,0 +1,25 @@
+#
+# Timer subsystem related configuration options
+#
+config TICK_ONESHOT
+ bool
+ default n
+
+config NO_HZ
+ bool "Tickless System (Dynamic Ticks)"
+ depends on GENERIC_TIME && GENERIC_CLOCKEVENTS
+ select TICK_ONESHOT
+ help
+ This option enables a tickless system: timer interrupts will
+ only trigger on an as-needed basis both when the system is
+ busy and when the system is idle.
+
+config HIGH_RES_TIMERS
+ bool "High Resolution Timer Support"
+ depends on GENERIC_TIME && GENERIC_CLOCKEVENTS
+ select TICK_ONESHOT
+ help
+ This option enables high resolution timer support. If your
+ hardware is not capable then this option only increases
+ the size of the kernel image.
+
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 61a3907d16f..93bccba1f26 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1 +1,8 @@
-obj-y += ntp.o clocksource.o jiffies.o
+obj-y += ntp.o clocksource.o jiffies.o timer_list.o
+
+obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o
+obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
+obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o
+obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o
+obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o
+obj-$(CONFIG_TIMER_STATS) += timer_stats.o
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
new file mode 100644
index 00000000000..67932ea78c1
--- /dev/null
+++ b/kernel/time/clockevents.c
@@ -0,0 +1,345 @@
+/*
+ * linux/kernel/time/clockevents.c
+ *
+ * This file contains functions which manage clock event devices.
+ *
+ * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
+ * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
+ * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
+ *
+ * This code is licenced under the GPL version 2. For details see
+ * kernel-base/COPYING.
+ */
+
+#include <linux/clockchips.h>
+#include <linux/hrtimer.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <linux/smp.h>
+#include <linux/sysdev.h>
+
+/* The registered clock event devices */
+static LIST_HEAD(clockevent_devices);
+static LIST_HEAD(clockevents_released);
+
+/* Notification for clock events */
+static RAW_NOTIFIER_HEAD(clockevents_chain);
+
+/* Protection for the above */
+static DEFINE_SPINLOCK(clockevents_lock);
+
+/**
+ * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds
+ * @latch: value to convert
+ * @evt: pointer to clock event device descriptor
+ *
+ * Math helper, returns latch value converted to nanoseconds (bound checked)
+ */
+unsigned long clockevent_delta2ns(unsigned long latch,
+ struct clock_event_device *evt)
+{
+ u64 clc = ((u64) latch << evt->shift);
+
+ do_div(clc, evt->mult);
+ if (clc < 1000)
+ clc = 1000;
+ if (clc > LONG_MAX)
+ clc = LONG_MAX;
+
+ return (unsigned long) clc;
+}
+
+/**
+ * clockevents_set_mode - set the operating mode of a clock event device
+ * @dev: device to modify
+ * @mode: new mode
+ *
+ * Must be called with interrupts disabled !
+ */
+void clockevents_set_mode(struct clock_event_device *dev,
+ enum clock_event_mode mode)
+{
+ if (dev->mode != mode) {
+ dev->set_mode(mode, dev);
+ dev->mode = mode;
+ }
+}
+
+/**
+ * clockevents_program_event - Reprogram the clock event device.
+ * @expires: absolute expiry time (monotonic clock)
+ *
+ * Returns 0 on success, -ETIME when the event is in the past.
+ */
+int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
+ ktime_t now)
+{
+ unsigned long long clc;
+ int64_t delta;
+
+ delta = ktime_to_ns(ktime_sub(expires, now));
+
+ if (delta <= 0)
+ return -ETIME;
+
+ dev->next_event = expires;
+
+ if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
+ return 0;
+
+ if (delta > dev->max_delta_ns)
+ delta = dev->max_delta_ns;
+ if (delta < dev->min_delta_ns)
+ delta = dev->min_delta_ns;
+
+ clc = delta * dev->mult;
+ clc >>= dev->shift;
+
+ return dev->set_next_event((unsigned long) clc, dev);
+}
+
+/**
+ * clockevents_register_notifier - register a clock events change listener
+ */
+int clockevents_register_notifier(struct notifier_block *nb)
+{
+ int ret;
+
+ spin_lock(&clockevents_lock);
+ ret = raw_notifier_chain_register(&clockevents_chain, nb);
+ spin_unlock(&clockevents_lock);
+
+ return ret;
+}
+
+/**
+ * clockevents_unregister_notifier - unregister a clock events change listener
+ */
+void clockevents_unregister_notifier(struct notifier_block *nb)
+{
+ spin_lock(&clockevents_lock);
+ raw_notifier_chain_unregister(&clockevents_chain, nb);
+ spin_unlock(&clockevents_lock);
+}
+
+/*
+ * Notify about a clock event change. Called with clockevents_lock
+ * held.
+ */
+static void clockevents_do_notify(unsigned long reason, void *dev)
+{
+ raw_notifier_call_chain(&clockevents_chain, reason, dev);
+}
+
+/*
+ * Called after a notify add to make devices availble which were
+ * released from the notifier call.
+ */
+static void clockevents_notify_released(void)
+{
+ struct clock_event_device *dev;
+
+ while (!list_empty(&clockevents_released)) {
+ dev = list_entry(clockevents_released.next,
+ struct clock_event_device, list);
+ list_del(&dev->list);
+ list_add(&dev->list, &clockevent_devices);
+ clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
+ }
+}
+
+/**
+ * clockevents_register_device - register a clock event device
+ * @dev: device to register
+ */
+void clockevents_register_device(struct clock_event_device *dev)
+{
+ BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
+
+ spin_lock(&clockevents_lock);
+
+ list_add(&dev->list, &clockevent_devices);
+ clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
+ clockevents_notify_released();
+
+ spin_unlock(&clockevents_lock);
+}
+
+/*
+ * Noop handler when we shut down an event device
+ */
+static void clockevents_handle_noop(struct clock_event_device *dev)
+{
+}
+
+/**
+ * clockevents_exchange_device - release and request clock devices
+ * @old: device to release (can be NULL)
+ * @new: device to request (can be NULL)
+ *
+ * Called from the notifier chain. clockevents_lock is held already
+ */
+void clockevents_exchange_device(struct clock_event_device *old,
+ struct clock_event_device *new)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ /*
+ * Caller releases a clock event device. We queue it into the
+ * released list and do a notify add later.
+ */
+ if (old) {
+ old->event_handler = clockevents_handle_noop;
+ clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED);
+ list_del(&old->list);
+ list_add(&old->list, &clockevents_released);
+ }
+
+ if (new) {
+ BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED);
+ clockevents_set_mode(new, CLOCK_EVT_MODE_SHUTDOWN);
+ }
+ local_irq_restore(flags);
+}
+
+/**
+ * clockevents_request_device
+ */
+struct clock_event_device *clockevents_request_device(unsigned int features,
+ cpumask_t cpumask)
+{
+ struct clock_event_device *cur, *dev = NULL;
+ struct list_head *tmp;
+
+ spin_lock(&clockevents_lock);
+
+ list_for_each(tmp, &clockevent_devices) {
+ cur = list_entry(tmp, struct clock_event_device, list);
+
+ if ((cur->features & features) == features &&
+ cpus_equal(cpumask, cur->cpumask)) {
+ if (!dev || dev->rating < cur->rating)
+ dev = cur;
+ }
+ }
+
+ clockevents_exchange_device(NULL, dev);
+
+ spin_unlock(&clockevents_lock);
+
+ return dev;
+}
+
+/**
+ * clockevents_release_device
+ */
+void clockevents_release_device(struct clock_event_device *dev)
+{
+ spin_lock(&clockevents_lock);
+
+ clockevents_exchange_device(dev, NULL);
+ clockevents_notify_released();
+
+ spin_unlock(&clockevents_lock);
+}
+
+/**
+ * clockevents_notify - notification about relevant events
+ */
+void clockevents_notify(unsigned long reason, void *arg)
+{
+ spin_lock(&clockevents_lock);
+ clockevents_do_notify(reason, arg);
+
+ switch (reason) {
+ case CLOCK_EVT_NOTIFY_CPU_DEAD:
+ /*
+ * Unregister the clock event devices which were
+ * released from the users in the notify chain.
+ */
+ while (!list_empty(&clockevents_released)) {
+ struct clock_event_device *dev;
+
+ dev = list_entry(clockevents_released.next,
+ struct clock_event_device, list);
+ list_del(&dev->list);
+ }
+ break;
+ default:
+ break;
+ }
+ spin_unlock(&clockevents_lock);
+}
+EXPORT_SYMBOL_GPL(clockevents_notify);
+
+#ifdef CONFIG_SYSFS
+
+/**
+ * clockevents_show_registered - sysfs interface for listing clockevents
+ * @dev: unused
+ * @buf: char buffer to be filled with clock events list
+ *
+ * Provides sysfs interface for listing registered clock event devices
+ */
+static ssize_t clockevents_show_registered(struct sys_device *dev, char *buf)
+{
+ struct list_head *tmp;
+ char *p = buf;
+ int cpu;
+
+ spin_lock(&clockevents_lock);
+
+ list_for_each(tmp, &clockevent_devices) {
+ struct clock_event_device *ce;
+
+ ce = list_entry(tmp, struct clock_event_device, list);
+ p += sprintf(p, "%-20s F:%04x M:%d", ce->name,
+ ce->features, ce->mode);
+ p += sprintf(p, " C:");
+ if (!cpus_equal(ce->cpumask, cpu_possible_map)) {
+ for_each_cpu_mask(cpu, ce->cpumask)
+ p += sprintf(p, " %d", cpu);
+ } else {
+ /*
+ * FIXME: Add the cpu which is handling this sucker
+ */
+ }
+ p += sprintf(p, "\n");
+ }
+
+ spin_unlock(&clockevents_lock);
+
+ return p - buf;
+}
+
+/*
+ * Sysfs setup bits:
+ */
+static SYSDEV_ATTR(registered, 0600,
+ clockevents_show_registered, NULL);
+
+static struct sysdev_class clockevents_sysclass = {
+ set_kset_name("clockevents"),
+};
+
+static struct sys_device clockevents_sys_device = {
+ .id = 0,
+ .cls = &clockevents_sysclass,
+};
+
+static int __init clockevents_sysfs_init(void)
+{
+ int error = sysdev_class_register(&clockevents_sysclass);
+
+ if (!error)
+ error = sysdev_register(&clockevents_sys_device);
+ if (!error)
+ error = sysdev_create_file(
+ &clockevents_sys_device,
+ &attr_registered);
+ return error;
+}
+device_initcall(clockevents_sysfs_init);
+#endif
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index d9ef176c4e0..193a0793af9 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -29,6 +29,7 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
+#include <linux/tick.h>
/* XXX - Would like a better way for initializing curr_clocksource */
extern struct clocksource clocksource_jiffies;
@@ -48,6 +49,7 @@ extern struct clocksource clocksource_jiffies;
*/
static struct clocksource *curr_clocksource = &clocksource_jiffies;
static struct clocksource *next_clocksource;
+static struct clocksource *clocksource_override;
static LIST_HEAD(clocksource_list);
static DEFINE_SPINLOCK(clocksource_lock);
static char override_name[32];
@@ -62,9 +64,123 @@ static int __init clocksource_done_booting(void)
finished_booting = 1;
return 0;
}
-
late_initcall(clocksource_done_booting);
+#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
+static LIST_HEAD(watchdog_list);
+static struct clocksource *watchdog;
+static struct timer_list watchdog_timer;
+static DEFINE_SPINLOCK(watchdog_lock);
+static cycle_t watchdog_last;
+/*
+ * Interval: 0.5sec Treshold: 0.0625s
+ */
+#define WATCHDOG_INTERVAL (HZ >> 1)
+#define WATCHDOG_TRESHOLD (NSEC_PER_SEC >> 4)
+
+static void clocksource_ratewd(struct clocksource *cs, int64_t delta)
+{
+ if (delta > -WATCHDOG_TRESHOLD && delta < WATCHDOG_TRESHOLD)
+ return;
+
+ printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n",
+ cs->name, delta);
+ cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
+ clocksource_change_rating(cs, 0);
+ cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
+ list_del(&cs->wd_list);
+}
+
+static void clocksource_watchdog(unsigned long data)
+{
+ struct clocksource *cs, *tmp;
+ cycle_t csnow, wdnow;
+ int64_t wd_nsec, cs_nsec;
+
+ spin_lock(&watchdog_lock);
+
+ wdnow = watchdog->read();
+ wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask);
+ watchdog_last = wdnow;
+
+ list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
+ csnow = cs->read();
+ /* Initialized ? */
+ if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
+ if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
+ (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
+ cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
+ /*
+ * We just marked the clocksource as
+ * highres-capable, notify the rest of the
+ * system as well so that we transition
+ * into high-res mode:
+ */
+ tick_clock_notify();
+ }
+ cs->flags |= CLOCK_SOURCE_WATCHDOG;
+ cs->wd_last = csnow;
+ } else {
+ cs_nsec = cyc2ns(cs, (csnow - cs->wd_last) & cs->mask);
+ cs->wd_last = csnow;
+ /* Check the delta. Might remove from the list ! */
+ clocksource_ratewd(cs, cs_nsec - wd_nsec);
+ }
+ }
+
+ if (!list_empty(&watchdog_list)) {
+ __mod_timer(&watchdog_timer,
+ watchdog_timer.expires + WATCHDOG_INTERVAL);
+ }
+ spin_unlock(&watchdog_lock);
+}
+static void clocksource_check_watchdog(struct clocksource *cs)
+{
+ struct clocksource *cse;
+ unsigned long flags;
+
+ spin_lock_irqsave(&watchdog_lock, flags);
+ if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
+ int started = !list_empty(&watchdog_list);
+
+ list_add(&cs->wd_list, &watchdog_list);
+ if (!started && watchdog) {
+ watchdog_last = watchdog->read();
+ watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
+ add_timer(&watchdog_timer);
+ }
+ } else if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) {
+ cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
+
+ if (!watchdog || cs->rating > watchdog->rating) {
+ if (watchdog)
+ del_timer(&watchdog_timer);
+ watchdog = cs;
+ init_timer(&watchdog_timer);
+ watchdog_timer.function = clocksource_watchdog;
+
+ /* Reset watchdog cycles */
+ list_for_each_entry(cse, &watchdog_list, wd_list)
+ cse->flags &= ~CLOCK_SOURCE_WATCHDOG;
+ /* Start if list is not empty */
+ if (!list_empty(&watchdog_list)) {
+ watchdog_last = watchdog->read();
+ watchdog_timer.expires =
+ jiffies + WATCHDOG_INTERVAL;
+ add_timer(&watchdog_timer);
+ }
+ }
+ }
+ spin_unlock_irqrestore(&watchdog_lock, flags);
+}
+#else
+static void clocksource_check_watchdog(struct clocksource *cs)
+{
+ if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
+ cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
+}
+#endif
+
/**
* clocksource_get_next - Returns the selected clocksource
*
@@ -84,60 +200,54 @@ struct clocksource *clocksource_get_next(void)
}
/**
- * select_clocksource - Finds the best registered clocksource.
+ * select_clocksource - Selects the best registered clocksource.
*
* Private function. Must hold clocksource_lock when called.
*
- * Looks through the list of registered clocksources, returning
- * the one with the highest rating value. If there is a clocksource
- * name that matches the override string, it returns that clocksource.
+ * Select the clocksource with the best rating, or the clocksource,
+ * which is selected by userspace override.
*/
static struct clocksource *select_clocksource(void)
{
- struct clocksource *best = NULL;
- struct list_head *tmp;
+ struct clocksource *next;
- list_for_each(tmp, &clocksource_list) {
- struct clocksource *src;
+ if (list_empty(&clocksource_list))
+ return NULL;
- src = list_entry(tmp, struct clocksource, list);
- if (!best)
- best = src;
-
- /* check for override: */
- if (strlen(src->name) == strlen(override_name) &&
- !strcmp(src->name, override_name)) {
- best = src;
- break;
- }
- /* pick the highest rating: */
- if (src->rating > best->rating)
- best = src;
- }
+ if (clocksource_override)
+ next = clocksource_override;
+ else
+ next = list_entry(clocksource_list.next, struct clocksource,
+ list);
+
+ if (next == curr_clocksource)
+ return NULL;
- return best;
+ return next;
}
-/**
- * is_registered_source - Checks if clocksource is registered
- * @c: pointer to a clocksource
- *
- * Private helper function. Must hold clocksource_lock when called.
- *
- * Returns one if the clocksource is already registered, zero otherwise.
+/*
+ * Enqueue the clocksource sorted by rating
*/
-static int is_registered_source(struct clocksource *c)
+static int clocksource_enqueue(struct clocksource *c)
{
- int len = strlen(c->name);
- struct list_head *tmp;
+ struct list_head *tmp, *entry = &clocksource_list;
list_for_each(tmp, &clocksource_list) {
- struct clocksource *src;
-
- src = list_entry(tmp, struct clocksource, list);
- if (strlen(src->name) == len && !strcmp(src->name, c->name))
- return 1;
+ struct clocksource *cs;
+
+ cs = list_entry(tmp, struct clocksource, list);
+ if (cs == c)
+ return -EBUSY;
+ /* Keep track of the place, where to insert */
+ if (cs->rating >= c->rating)
+ entry = tmp;
}
+ list_add(&c->list, entry);
+
+ if (strlen(c->name) == strlen(override_name) &&
+ !strcmp(c->name, override_name))
+ clocksource_override = c;
return 0;
}
@@ -150,42 +260,35 @@ static int is_registered_source(struct clocksource *c)
*/
int clocksource_register(struct clocksource *c)
{
- int ret = 0;
unsigned long flags;
+ int ret;
spin_lock_irqsave(&clocksource_lock, flags);
- /* check if clocksource is already registered */
- if (is_registered_source(c)) {
- printk("register_clocksource: Cannot register %s. "
- "Already registered!", c->name);
- ret = -EBUSY;
- } else {
- /* register it */
- list_add(&c->list, &clocksource_list);
- /* scan the registered clocksources, and pick the best one */
+ ret = clocksource_enqueue(c);
+ if (!ret)
next_clocksource = select_clocksource();
- }
spin_unlock_irqrestore(&clocksource_lock, flags);
+ if (!ret)
+ clocksource_check_watchdog(c);
return ret;
}
EXPORT_SYMBOL(clocksource_register);
/**
- * clocksource_reselect - Rescan list for next clocksource
+ * clocksource_change_rating - Change the rating of a registered clocksource
*
- * A quick helper function to be used if a clocksource changes its
- * rating. Forces the clocksource list to be re-scanned for the best
- * clocksource.
*/
-void clocksource_reselect(void)
+void clocksource_change_rating(struct clocksource *cs, int rating)
{
unsigned long flags;
spin_lock_irqsave(&clocksource_lock, flags);
+ list_del(&cs->list);
+ cs->rating = rating;
+ clocksource_enqueue(cs);
next_clocksource = select_clocksource();
spin_unlock_irqrestore(&clocksource_lock, flags);
}
-EXPORT_SYMBOL(clocksource_reselect);
#ifdef CONFIG_SYSFS
/**
@@ -221,7 +324,11 @@ sysfs_show_current_clocksources(struct sys_device *dev, char *buf)
static ssize_t sysfs_override_clocksource(struct sys_device *dev,
const char *buf, size_t count)
{
+ struct clocksource *ovr = NULL;
+ struct list_head *tmp;
size_t ret = count;
+ int len;
+
/* strings from sysfs write are not 0 terminated! */
if (count >= sizeof(override_name))
return -EINVAL;
@@ -229,17 +336,32 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
/* strip of \n: */
if (buf[count-1] == '\n')
count--;
- if (count < 1)
- return -EINVAL;
spin_lock_irq(&clocksource_lock);
- /* copy the name given: */
- memcpy(override_name, buf, count);
+ if (count > 0)
+ memcpy(override_name, buf, count);
override_name[count] = 0;
- /* try to select it: */
- next_clocksource = select_clocksource();
+ len = strlen(override_name);
+ if (len) {
+ ovr = clocksource_override;
+ /* try to select it: */
+ list_for_each(tmp, &clocksource_list) {
+ struct clocksource *cs;
+
+ cs = list_entry(tmp, struct clocksource, list);
+ if (strlen(cs->name) == len &&
+ !strcmp(cs->name, override_name))
+ ovr = cs;
+ }
+ }
+
+ /* Reselect, when the override name has changed */
+ if (ovr != clocksource_override) {
+ clocksource_override = ovr;
+ next_clocksource = select_clocksource();
+ }
spin_unlock_irq(&clocksource_lock);
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index a99b2a6e6a0..3be8da8fed7 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -62,7 +62,6 @@ struct clocksource clocksource_jiffies = {
.mask = 0xffffffff, /*32bits*/
.mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
.shift = JIFFIES_SHIFT,
- .is_continuous = 0, /* tick based, not free running */
};
static int __init init_jiffies_clocksource(void)
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 3afeaa3a73f..eb12509e00b 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -24,7 +24,7 @@ static u64 tick_length, tick_length_base;
#define MAX_TICKADJ 500 /* microsecs */
#define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \
- TICK_LENGTH_SHIFT) / HZ)
+ TICK_LENGTH_SHIFT) / NTP_INTERVAL_FREQ)
/*
* phase-lock loop variables
@@ -46,13 +46,17 @@ long time_adjust;
static void ntp_update_frequency(void)
{
- tick_length_base = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << TICK_LENGTH_SHIFT;
- tick_length_base += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT;
- tick_length_base += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC);
+ u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ)
+ << TICK_LENGTH_SHIFT;
+ second_length += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT;
+ second_length += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC);
- do_div(tick_length_base, HZ);
+ tick_length_base = second_length;
- tick_nsec = tick_length_base >> TICK_LENGTH_SHIFT;
+ do_div(second_length, HZ);
+ tick_nsec = second_length >> TICK_LENGTH_SHIFT;
+
+ do_div(tick_length_base, NTP_INTERVAL_FREQ);
}
/**
@@ -162,7 +166,7 @@ void second_overflow(void)
tick_length -= MAX_TICKADJ_SCALED;
} else {
tick_length += (s64)(time_adjust * NSEC_PER_USEC /
- HZ) << TICK_LENGTH_SHIFT;
+ NTP_INTERVAL_FREQ) << TICK_LENGTH_SHIFT;
time_adjust = 0;
}
}
@@ -239,7 +243,8 @@ int do_adjtimex(struct timex *txc)
result = -EINVAL;
goto leave;
}
- time_freq = ((s64)txc->freq * NSEC_PER_USEC) >> (SHIFT_USEC - SHIFT_NSEC);
+ time_freq = ((s64)txc->freq * NSEC_PER_USEC)
+ >> (SHIFT_USEC - SHIFT_NSEC);
}
if (txc->modes & ADJ_MAXERROR) {
@@ -309,7 +314,8 @@ int do_adjtimex(struct timex *txc)
freq_adj += time_freq;
freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC);
time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC);
- time_offset = (time_offset / HZ) << SHIFT_UPDATE;
+ time_offset = (time_offset / NTP_INTERVAL_FREQ)
+ << SHIFT_UPDATE;
} /* STA_PLL */
} /* txc->modes & ADJ_OFFSET */
if (txc->modes & ADJ_TICK)
@@ -324,8 +330,10 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
txc->offset = save_adjust;
else
- txc->offset = shift_right(time_offset, SHIFT_UPDATE) * HZ / 1000;
- txc->freq = (time_freq / NSEC_PER_USEC) << (SHIFT_USEC - SHIFT_NSEC);
+ txc->offset = shift_right(time_offset, SHIFT_UPDATE)
+ * NTP_INTERVAL_FREQ / 1000;
+ txc->freq = (time_freq / NSEC_PER_USEC)
+ << (SHIFT_USEC - SHIFT_NSEC);
txc->maxerror = time_maxerror;
txc->esterror = time_esterror;
txc->status = time_status;
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
new file mode 100644
index 00000000000..12b3efeb9f6
--- /dev/null
+++ b/kernel/time/tick-broadcast.c
@@ -0,0 +1,480 @@
+/*
+ * linux/kernel/time/tick-broadcast.c
+ *
+ * This file contains functions which emulate a local clock-event
+ * device via a broadcast event source.
+ *
+ * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
+ * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
+ * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
+ *
+ * This code is licenced under the GPL version 2. For details see
+ * kernel-base/COPYING.
+ */
+#include <linux/cpu.h>
+#include <linux/err.h>
+#include <linux/hrtimer.h>
+#include <linux/irq.h>
+#include <linux/percpu.h>
+#include <linux/profile.h>
+#include <linux/sched.h>
+#include <linux/tick.h>
+
+#include "tick-internal.h"
+
+/*
+ * Broadcast support for broken x86 hardware, where the local apic
+ * timer stops in C3 state.
+ */
+
+struct tick_device tick_broadcast_device;
+static cpumask_t tick_broadcast_mask;
+static DEFINE_SPINLOCK(tick_broadcast_lock);
+
+/*
+ * Debugging: see timer_list.c
+ */
+struct tick_device *tick_get_broadcast_device(void)
+{
+ return &tick_broadcast_device;
+}
+
+cpumask_t *tick_get_broadcast_mask(void)
+{
+ return &tick_broadcast_mask;
+}
+
+/*
+ * Start the device in periodic mode
+ */
+static void tick_broadcast_start_periodic(struct clock_event_device *bc)
+{
+ if (bc && bc->mode == CLOCK_EVT_MODE_SHUTDOWN)
+ tick_setup_periodic(bc, 1);
+}
+
+/*
+ * Check, if the device can be utilized as broadcast device:
+ */
+int tick_check_broadcast_device(struct clock_event_device *dev)
+{
+ if (tick_broadcast_device.evtdev ||
+ (dev->features & CLOCK_EVT_FEAT_C3STOP))
+ return 0;
+
+ clockevents_exchange_device(NULL, dev);
+ tick_broadcast_device.evtdev = dev;
+ if (!cpus_empty(tick_broadcast_mask))
+ tick_broadcast_start_periodic(dev);
+ return 1;
+}
+
+/*
+ * Check, if the device is the broadcast device
+ */
+int tick_is_broadcast_device(struct clock_event_device *dev)
+{
+ return (dev && tick_broadcast_device.evtdev == dev);
+}
+
+/*
+ * Check, if the device is disfunctional and a place holder, which
+ * needs to be handled by the broadcast device.
+ */
+int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ spin_lock_irqsave(&tick_broadcast_lock, flags);
+
+ /*
+ * Devices might be registered with both periodic and oneshot
+ * mode disabled. This signals, that the device needs to be
+ * operated from the broadcast device and is a placeholder for
+ * the cpu local device.
+ */
+ if (!tick_device_is_functional(dev)) {
+ dev->event_handler = tick_handle_periodic;
+ cpu_set(cpu, tick_broadcast_mask);
+ tick_broadcast_start_periodic(tick_broadcast_device.evtdev);
+ ret = 1;
+ }
+
+ spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+ return ret;
+}
+
+/*
+ * Broadcast the event to the cpus, which are set in the mask
+ */
+int tick_do_broadcast(cpumask_t mask)
+{
+ int ret = 0, cpu = smp_processor_id();
+ struct tick_device *td;
+
+ /*
+ * Check, if the current cpu is in the mask
+ */
+ if (cpu_isset(cpu, mask)) {
+ cpu_clear(cpu, mask);
+ td = &per_cpu(tick_cpu_device, cpu);
+ td->evtdev->event_handler(td->evtdev);
+ ret = 1;
+ }
+
+ if (!cpus_empty(mask)) {
+ /*
+ * It might be necessary to actually check whether the devices
+ * have different broadcast functions. For now, just use the
+ * one of the first device. This works as long as we have this
+ * misfeature only on x86 (lapic)
+ */
+ cpu = first_cpu(mask);
+ td = &per_cpu(tick_cpu_device, cpu);
+ td->evtdev->broadcast(mask);
+ ret = 1;
+ }
+ return ret;
+}
+
+/*
+ * Periodic broadcast:
+ * - invoke the broadcast handlers
+ */
+static void tick_do_periodic_broadcast(void)
+{
+ cpumask_t mask;
+
+ spin_lock(&tick_broadcast_lock);
+
+ cpus_and(mask, cpu_online_map, tick_broadcast_mask);
+ tick_do_broadcast(mask);
+
+ spin_unlock(&tick_broadcast_lock);
+}
+
+/*
+ * Event handler for periodic broadcast ticks
+ */
+static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
+{
+ dev->next_event.tv64 = KTIME_MAX;
+
+ tick_do_periodic_broadcast();
+
+ /*
+ * The device is in periodic mode. No reprogramming necessary:
+ */
+ if (dev->mode == CLOCK_EVT_MODE_PERIODIC)
+ return;
+
+ /*
+ * Setup the next period for devices, which do not have
+ * periodic mode:
+ */
+ for (;;) {
+ ktime_t next = ktime_add(dev->next_event, tick_period);
+
+ if (!clockevents_program_event(dev, next, ktime_get()))
+ return;
+ tick_do_periodic_broadcast();
+ }
+}
+
+/*
+ * Powerstate information: The system enters/leaves a state, where
+ * affected devices might stop
+ */
+static void tick_do_broadcast_on_off(void *why)
+{
+ struct clock_event_device *bc, *dev;
+ struct tick_device *td;
+ unsigned long flags, *reason = why;
+ int cpu;
+
+ spin_lock_irqsave(&tick_broadcast_lock, flags);
+
+ cpu = smp_processor_id();
+ td = &per_cpu(tick_cpu_device, cpu);
+ dev = td->evtdev;
+ bc = tick_broadcast_device.evtdev;
+
+ /*
+ * Is the device in broadcast mode forever or is it not
+ * affected by the powerstate ?
+ */
+ if (!dev || !tick_device_is_functional(dev) ||
+ !(dev->features & CLOCK_EVT_FEAT_C3STOP))
+ goto out;
+
+ if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_ON) {
+ if (!cpu_isset(cpu, tick_broadcast_mask)) {
+ cpu_set(cpu, tick_broadcast_mask);
+ if (td->mode == TICKDEV_MODE_PERIODIC)
+ clockevents_set_mode(dev,
+ CLOCK_EVT_MODE_SHUTDOWN);
+ }
+ } else {
+ if (cpu_isset(cpu, tick_broadcast_mask)) {
+ cpu_clear(cpu, tick_broadcast_mask);
+ if (td->mode == TICKDEV_MODE_PERIODIC)
+ tick_setup_periodic(dev, 0);
+ }
+ }
+
+ if (cpus_empty(tick_broadcast_mask))
+ clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
+ else {
+ if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
+ tick_broadcast_start_periodic(bc);
+ else
+ tick_broadcast_setup_oneshot(bc);
+ }
+out:
+ spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+}
+
+/*
+ * Powerstate information: The system enters/leaves a state, where
+ * affected devices might stop.
+ */
+void tick_broadcast_on_off(unsigned long reason, int *oncpu)
+{
+ int cpu = get_cpu();
+
+ if (cpu == *oncpu)
+ tick_do_broadcast_on_off(&reason);
+ else
+ smp_call_function_single(*oncpu, tick_do_broadcast_on_off,
+ &reason, 1, 1);
+ put_cpu();
+}
+
+/*
+ * Set the periodic handler depending on broadcast on/off
+ */
+void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)
+{
+ if (!broadcast)
+ dev->event_handler = tick_handle_periodic;
+ else
+ dev->event_handler = tick_handle_periodic_broadcast;
+}
+
+/*
+ * Remove a CPU from broadcasting
+ */
+void tick_shutdown_broadcast(unsigned int *cpup)
+{
+ struct clock_event_device *bc;
+ unsigned long flags;
+ unsigned int cpu = *cpup;
+
+ spin_lock_irqsave(&tick_broadcast_lock, flags);
+
+ bc = tick_broadcast_device.evtdev;
+ cpu_clear(cpu, tick_broadcast_mask);
+
+ if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
+ if (bc && cpus_empty(tick_broadcast_mask))
+ clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
+ }
+
+ spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+}
+
+#ifdef CONFIG_TICK_ONESHOT
+
+static cpumask_t tick_broadcast_oneshot_mask;
+
+/*
+ * Debugging: see timer_list.c
+ */
+cpumask_t *tick_get_broadcast_oneshot_mask(void)
+{
+ return &tick_broadcast_oneshot_mask;
+}
+
+static int tick_broadcast_set_event(ktime_t expires, int force)
+{
+ struct clock_event_device *bc = tick_broadcast_device.evtdev;
+ ktime_t now = ktime_get();
+ int res;
+
+ for(;;) {
+ res = clockevents_program_event(bc, expires, now);
+ if (!res || !force)
+ return res;
+ now = ktime_get();
+ expires = ktime_add(now, ktime_set(0, bc->min_delta_ns));
+ }
+}
+
+/*
+ * Reprogram the broadcast device:
+ *
+ * Called with tick_broadcast_lock held and interrupts disabled.
+ */
+static int tick_broadcast_reprogram(void)
+{
+ ktime_t expires = { .tv64 = KTIME_MAX };
+ struct tick_device *td;
+ int cpu;
+
+ /*
+ * Find the event which expires next:
+ */
+ for (cpu = first_cpu(tick_broadcast_oneshot_mask); cpu != NR_CPUS;
+ cpu = next_cpu(cpu, tick_broadcast_oneshot_mask)) {
+ td = &per_cpu(tick_cpu_device, cpu);
+ if (td->evtdev->next_event.tv64 < expires.tv64)
+ expires = td->evtdev->next_event;
+ }
+
+ if (expires.tv64 == KTIME_MAX)
+ return 0;
+
+ return tick_broadcast_set_event(expires, 0);
+}
+
+/*
+ * Handle oneshot mode broadcasting
+ */
+static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
+{
+ struct tick_device *td;
+ cpumask_t mask;
+ ktime_t now;
+ int cpu;
+
+ spin_lock(&tick_broadcast_lock);
+again:
+ dev->next_event.tv64 = KTIME_MAX;
+ mask = CPU_MASK_NONE;
+ now = ktime_get();
+ /* Find all expired events */
+ for (cpu = first_cpu(tick_broadcast_oneshot_mask); cpu != NR_CPUS;
+ cpu = next_cpu(cpu, tick_broadcast_oneshot_mask)) {
+ td = &per_cpu(tick_cpu_device, cpu);
+ if (td->evtdev->next_event.tv64 <= now.tv64)
+ cpu_set(cpu, mask);
+ }
+
+ /*
+ * Wakeup the cpus which have an expired event. The broadcast
+ * device is reprogrammed in the return from idle code.
+ */
+ if (!tick_do_broadcast(mask)) {
+ /*
+ * The global event did not expire any CPU local
+ * events. This happens in dyntick mode, as the
+ * maximum PIT delta is quite small.
+ */
+ if (tick_broadcast_reprogram())
+ goto again;
+ }
+ spin_unlock(&tick_broadcast_lock);
+}
+
+/*
+ * Powerstate information: The system enters/leaves a state, where
+ * affected devices might stop
+ */
+void tick_broadcast_oneshot_control(unsigned long reason)
+{
+ struct clock_event_device *bc, *dev;
+ struct tick_device *td;
+ unsigned long flags;
+ int cpu;
+
+ spin_lock_irqsave(&tick_broadcast_lock, flags);
+
+ /*
+ * Periodic mode does not care about the enter/exit of power
+ * states
+ */
+ if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
+ goto out;
+
+ bc = tick_broadcast_device.evtdev;
+ cpu = smp_processor_id();
+ td = &per_cpu(tick_cpu_device, cpu);
+ dev = td->evtdev;
+
+ if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
+ goto out;
+
+ if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
+ if (!cpu_isset(cpu, tick_broadcast_oneshot_mask)) {
+ cpu_set(cpu, tick_broadcast_oneshot_mask);
+ clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
+ if (dev->next_event.tv64 < bc->next_event.tv64)
+ tick_broadcast_set_event(dev->next_event, 1);
+ }
+ } else {
+ if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) {
+ cpu_clear(cpu, tick_broadcast_oneshot_mask);
+ clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
+ if (dev->next_event.tv64 != KTIME_MAX)
+ tick_program_event(dev->next_event, 1);
+ }
+ }
+
+out:
+ spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+}
+
+/**
+ * tick_broadcast_setup_highres - setup the broadcast device for highres
+ */
+void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
+{
+ if (bc->mode != CLOCK_EVT_MODE_ONESHOT) {
+ bc->event_handler = tick_handle_oneshot_broadcast;
+ clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
+ bc->next_event.tv64 = KTIME_MAX;
+ }
+}
+
+/*
+ * Select oneshot operating mode for the broadcast device
+ */
+void tick_broadcast_switch_to_oneshot(void)
+{
+ struct clock_event_device *bc;
+ unsigned long flags;
+
+ spin_lock_irqsave(&tick_broadcast_lock, flags);
+
+ tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT;
+ bc = tick_broadcast_device.evtdev;
+ if (bc)
+ tick_broadcast_setup_oneshot(bc);
+ spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+}
+
+
+/*
+ * Remove a dead CPU from broadcasting
+ */
+void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
+{
+ struct clock_event_device *bc;
+ unsigned long flags;
+ unsigned int cpu = *cpup;
+
+ spin_lock_irqsave(&tick_broadcast_lock, flags);
+
+ bc = tick_broadcast_device.evtdev;
+ cpu_clear(cpu, tick_broadcast_oneshot_mask);
+
+ if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT) {
+ if (bc && cpus_empty(tick_broadcast_oneshot_mask))
+ clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
+ }
+
+ spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+}
+
+#endif
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
new file mode 100644
index 00000000000..4500e347f1b
--- /dev/null
+++ b/kernel/time/tick-common.c
@@ -0,0 +1,346 @@
+/*
+ * linux/kernel/time/tick-common.c
+ *
+ * This file contains the base functions to manage periodic tick
+ * related events.
+ *
+ * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
+ * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
+ * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
+ *
+ * This code is licenced under the GPL version 2. For details see
+ * kernel-base/COPYING.
+ */
+#include <linux/cpu.h>
+#include <linux/err.h>
+#include <linux/hrtimer.h>
+#include <linux/irq.h>
+#include <linux/percpu.h>
+#include <linux/profile.h>
+#include <linux/sched.h>
+#include <linux/tick.h>
+
+#include "tick-internal.h"
+
+/*
+ * Tick devices
+ */
+DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
+/*
+ * Tick next event: keeps track of the tick time
+ */
+ktime_t tick_next_period;
+ktime_t tick_period;
+static int tick_do_timer_cpu = -1;
+DEFINE_SPINLOCK(tick_device_lock);
+
+/*
+ * Debugging: see timer_list.c
+ */
+struct tick_device *tick_get_device(int cpu)
+{
+ return &per_cpu(tick_cpu_device, cpu);
+}
+
+/**
+ * tick_is_oneshot_available - check for a oneshot capable event device
+ */
+int tick_is_oneshot_available(void)
+{
+ struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
+
+ return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT);
+}
+
+/*
+ * Periodic tick
+ */
+static void tick_periodic(int cpu)
+{
+ if (tick_do_timer_cpu == cpu) {
+ write_seqlock(&xtime_lock);
+
+ /* Keep track of the next tick event */
+ tick_next_period = ktime_add(tick_next_period, tick_period);
+
+ do_timer(1);
+ write_sequnlock(&xtime_lock);
+ }
+
+ update_process_times(user_mode(get_irq_regs()));
+ profile_tick(CPU_PROFILING);
+}
+
+/*
+ * Event handler for periodic ticks
+ */
+void tick_handle_periodic(struct clock_event_device *dev)
+{
+ int cpu = smp_processor_id();
+
+ tick_periodic(cpu);
+
+ if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
+ return;
+ /*
+ * Setup the next period for devices, which do not have
+ * periodic mode:
+ */
+ for (;;) {
+ ktime_t next = ktime_add(dev->next_event, tick_period);
+
+ if (!clockevents_program_event(dev, next, ktime_get()))
+ return;
+ tick_periodic(cpu);
+ }
+}
+
+/*
+ * Setup the device for a periodic tick
+ */
+void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
+{
+ tick_set_periodic_handler(dev, broadcast);
+
+ /* Broadcast setup ? */
+ if (!tick_device_is_functional(dev))
+ return;
+
+ if (dev->features & CLOCK_EVT_FEAT_PERIODIC) {
+ clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC);
+ } else {
+ unsigned long seq;
+ ktime_t next;
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+ next = tick_next_period;
+ } while (read_seqretry(&xtime_lock, seq));
+
+ clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
+
+ for (;;) {
+ if (!clockevents_program_event(dev, next, ktime_get()))
+ return;
+ next = ktime_add(next, tick_period);
+ }
+ }
+}
+
+/*
+ * Setup the tick device
+ */
+static void tick_setup_device(struct tick_device *td,
+ struct clock_event_device *newdev, int cpu,
+ cpumask_t cpumask)
+{
+ ktime_t next_event;
+ void (*handler)(struct clock_event_device *) = NULL;
+
+ /*
+ * First device setup ?
+ */
+ if (!td->evtdev) {
+ /*
+ * If no cpu took the do_timer update, assign it to
+ * this cpu:
+ */
+ if (tick_do_timer_cpu == -1) {
+ tick_do_timer_cpu = cpu;
+ tick_next_period = ktime_get();
+ tick_period = ktime_set(0, NSEC_PER_SEC / HZ);
+ }
+
+ /*
+ * Startup in periodic mode first.
+ */
+ td->mode = TICKDEV_MODE_PERIODIC;
+ } else {
+ handler = td->evtdev->event_handler;
+ next_event = td->evtdev->next_event;
+ }
+
+ td->evtdev = newdev;
+
+ /*
+ * When the device is not per cpu, pin the interrupt to the
+ * current cpu:
+ */
+ if (!cpus_equal(newdev->cpumask, cpumask))
+ irq_set_affinity(newdev->irq, cpumask);
+
+ /*
+ * When global broadcasting is active, check if the current
+ * device is registered as a placeholder for broadcast mode.
+ * This allows us to handle this x86 misfeature in a generic
+ * way.
+ */
+ if (tick_device_uses_broadcast(newdev, cpu))
+ return;
+
+ if (td->mode == TICKDEV_MODE_PERIODIC)
+ tick_setup_periodic(newdev, 0);
+ else
+ tick_setup_oneshot(newdev, handler, next_event);
+}
+
+/*
+ * Check, if the new registered device should be used.
+ */
+static int tick_check_new_device(struct clock_event_device *newdev)
+{
+ struct clock_event_device *curdev;
+ struct tick_device *td;
+ int cpu, ret = NOTIFY_OK;
+ unsigned long flags;
+ cpumask_t cpumask;
+
+ spin_lock_irqsave(&tick_device_lock, flags);
+
+ cpu = smp_processor_id();
+ if (!cpu_isset(cpu, newdev->cpumask))
+ goto out;
+
+ td = &per_cpu(tick_cpu_device, cpu);
+ curdev = td->evtdev;
+ cpumask = cpumask_of_cpu(cpu);
+
+ /* cpu local device ? */
+ if (!cpus_equal(newdev->cpumask, cpumask)) {
+
+ /*
+ * If the cpu affinity of the device interrupt can not
+ * be set, ignore it.
+ */
+ if (!irq_can_set_affinity(newdev->irq))
+ goto out_bc;
+
+ /*
+ * If we have a cpu local device already, do not replace it
+ * by a non cpu local device
+ */
+ if (curdev && cpus_equal(curdev->cpumask, cpumask))
+ goto out_bc;
+ }
+
+ /*
+ * If we have an active device, then check the rating and the oneshot
+ * feature.
+ */
+ if (curdev) {
+ /*
+ * Prefer one shot capable devices !
+ */
+ if ((curdev->features & CLOCK_EVT_FEAT_ONESHOT) &&
+ !(newdev->features & CLOCK_EVT_FEAT_ONESHOT))
+ goto out_bc;
+ /*
+ * Check the rating
+ */
+ if (curdev->rating >= newdev->rating)
+ goto out_bc;
+ }
+
+ /*
+ * Replace the eventually existing device by the new
+ * device. If the current device is the broadcast device, do
+ * not give it back to the clockevents layer !
+ */
+ if (tick_is_broadcast_device(curdev)) {
+ clockevents_set_mode(curdev, CLOCK_EVT_MODE_SHUTDOWN);
+ curdev = NULL;
+ }
+ clockevents_exchange_device(curdev, newdev);
+ tick_setup_device(td, newdev, cpu, cpumask);
+ if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
+ tick_oneshot_notify();
+
+ spin_unlock_irqrestore(&tick_device_lock, flags);
+ return NOTIFY_STOP;
+
+out_bc:
+ /*
+ * Can the new device be used as a broadcast device ?
+ */
+ if (tick_check_broadcast_device(newdev))
+ ret = NOTIFY_STOP;
+out:
+ spin_unlock_irqrestore(&tick_device_lock, flags);
+
+ return ret;
+}
+
+/*
+ * Shutdown an event device on a given cpu:
+ *
+ * This is called on a life CPU, when a CPU is dead. So we cannot
+ * access the hardware device itself.
+ * We just set the mode and remove it from the lists.
+ */
+static void tick_shutdown(unsigned int *cpup)
+{
+ struct tick_device *td = &per_cpu(tick_cpu_device, *cpup);
+ struct clock_event_device *dev = td->evtdev;
+ unsigned long flags;
+
+ spin_lock_irqsave(&tick_device_lock, flags);
+ td->mode = TICKDEV_MODE_PERIODIC;
+ if (dev) {
+ /*
+ * Prevent that the clock events layer tries to call
+ * the set mode function!
+ */
+ dev->mode = CLOCK_EVT_MODE_UNUSED;
+ clockevents_exchange_device(dev, NULL);
+ td->evtdev = NULL;
+ }
+ spin_unlock_irqrestore(&tick_device_lock, flags);
+}
+
+/*
+ * Notification about clock event devices
+ */
+static int tick_notify(struct notifier_block *nb, unsigned long reason,
+ void *dev)
+{
+ switch (reason) {
+
+ case CLOCK_EVT_NOTIFY_ADD:
+ return tick_check_new_device(dev);
+
+ case CLOCK_EVT_NOTIFY_BROADCAST_ON:
+ case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
+ tick_broadcast_on_off(reason, dev);
+ break;
+
+ case CLOCK_EVT_NOTIFY_BROADCAST_ENTER:
+ case CLOCK_EVT_NOTIFY_BROADCAST_EXIT:
+ tick_broadcast_oneshot_control(reason);
+ break;
+
+ case CLOCK_EVT_NOTIFY_CPU_DEAD:
+ tick_shutdown_broadcast_oneshot(dev);
+ tick_shutdown_broadcast(dev);
+ tick_shutdown(dev);
+ break;
+
+ default:
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block tick_notifier = {
+ .notifier_call = tick_notify,
+};
+
+/**
+ * tick_init - initialize the tick control
+ *
+ * Register the notifier with the clockevents framework
+ */
+void __init tick_init(void)
+{
+ clockevents_register_notifier(&tick_notifier);
+}
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
new file mode 100644
index 00000000000..54861a0f29f
--- /dev/null
+++ b/kernel/time/tick-internal.h
@@ -0,0 +1,110 @@
+/*
+ * tick internal variable and functions used by low/high res code
+ */
+DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
+extern spinlock_t tick_device_lock;
+extern ktime_t tick_next_period;
+extern ktime_t tick_period;
+
+extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
+extern void tick_handle_periodic(struct clock_event_device *dev);
+
+/*
+ * NO_HZ / high resolution timer shared code
+ */
+#ifdef CONFIG_TICK_ONESHOT
+extern void tick_setup_oneshot(struct clock_event_device *newdev,
+ void (*handler)(struct clock_event_device *),
+ ktime_t nextevt);
+extern int tick_program_event(ktime_t expires, int force);
+extern void tick_oneshot_notify(void);
+extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *));
+
+# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc);
+extern void tick_broadcast_oneshot_control(unsigned long reason);
+extern void tick_broadcast_switch_to_oneshot(void);
+extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
+# else /* BROADCAST */
+static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
+{
+ BUG();
+}
+static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
+static inline void tick_broadcast_switch_to_oneshot(void) { }
+static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
+# endif /* !BROADCAST */
+
+#else /* !ONESHOT */
+static inline
+void tick_setup_oneshot(struct clock_event_device *newdev,
+ void (*handler)(struct clock_event_device *),
+ ktime_t nextevt)
+{
+ BUG();
+}
+static inline int tick_program_event(ktime_t expires, int force)
+{
+ return 0;
+}
+static inline void tick_oneshot_notify(void) { }
+static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
+{
+ BUG();
+}
+static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
+static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
+#endif /* !TICK_ONESHOT */
+
+/*
+ * Broadcasting support
+ */
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+extern int tick_do_broadcast(cpumask_t mask);
+
+extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
+extern int tick_check_broadcast_device(struct clock_event_device *dev);
+extern int tick_is_broadcast_device(struct clock_event_device *dev);
+extern void tick_broadcast_on_off(unsigned long reason, int *oncpu);
+extern void tick_shutdown_broadcast(unsigned int *cpup);
+
+extern void
+tick_set_periodic_handler(struct clock_event_device *dev, int broadcast);
+
+#else /* !BROADCAST */
+
+static inline int tick_check_broadcast_device(struct clock_event_device *dev)
+{
+ return 0;
+}
+
+static inline int tick_is_broadcast_device(struct clock_event_device *dev)
+{
+ return 0;
+}
+static inline int tick_device_uses_broadcast(struct clock_event_device *dev,
+ int cpu)
+{
+ return 0;
+}
+static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { }
+static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { }
+static inline void tick_shutdown_broadcast(unsigned int *cpup) { }
+
+/*
+ * Set the periodic handler in non broadcast mode
+ */
+static inline void tick_set_periodic_handler(struct clock_event_device *dev,
+ int broadcast)
+{
+ dev->event_handler = tick_handle_periodic;
+}
+#endif /* !BROADCAST */
+
+/*
+ * Check, if the device is functional or a dummy for broadcast
+ */
+static inline int tick_device_is_functional(struct clock_event_device *dev)
+{
+ return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
+}
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
new file mode 100644
index 00000000000..2e8b7ff863c
--- /dev/null
+++ b/kernel/time/tick-oneshot.c
@@ -0,0 +1,84 @@
+/*
+ * linux/kernel/time/tick-oneshot.c
+ *
+ * This file contains functions which manage high resolution tick
+ * related events.
+ *
+ * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
+ * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
+ * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
+ *
+ * This code is licenced under the GPL version 2. For details see
+ * kernel-base/COPYING.
+ */
+#include <linux/cpu.h>
+#include <linux/err.h>
+#include <linux/hrtimer.h>
+#include <linux/irq.h>
+#include <linux/percpu.h>
+#include <linux/profile.h>
+#include <linux/sched.h>
+#include <linux/tick.h>
+
+#include "tick-internal.h"
+
+/**
+ * tick_program_event
+ */
+int tick_program_event(ktime_t expires, int force)
+{
+ struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
+ ktime_t now = ktime_get();
+
+ while (1) {
+ int ret = clockevents_program_event(dev, expires, now);
+
+ if (!ret || !force)
+ return ret;
+ now = ktime_get();
+ expires = ktime_add(now, ktime_set(0, dev->min_delta_ns));
+ }
+}
+
+/**
+ * tick_setup_oneshot - setup the event device for oneshot mode (hres or nohz)
+ */
+void tick_setup_oneshot(struct clock_event_device *newdev,
+ void (*handler)(struct clock_event_device *),
+ ktime_t next_event)
+{
+ newdev->event_handler = handler;
+ clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT);
+ clockevents_program_event(newdev, next_event, ktime_get());
+}
+
+/**
+ * tick_switch_to_oneshot - switch to oneshot mode
+ */
+int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
+{
+ struct tick_device *td = &__get_cpu_var(tick_cpu_device);
+ struct clock_event_device *dev = td->evtdev;
+
+ if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) ||
+ !tick_device_is_functional(dev))
+ return -EINVAL;
+
+ td->mode = TICKDEV_MODE_ONESHOT;
+ dev->event_handler = handler;
+ clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
+ tick_broadcast_switch_to_oneshot();
+ return 0;
+}
+
+#ifdef CONFIG_HIGH_RES_TIMERS
+/**
+ * tick_init_highres - switch to high resolution mode
+ *
+ * Called with interrupts disabled.
+ */
+int tick_init_highres(void)
+{
+ return tick_switch_to_oneshot(hrtimer_interrupt);
+}
+#endif
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
new file mode 100644
index 00000000000..95e41f7f850
--- /dev/null
+++ b/kernel/time/tick-sched.c
@@ -0,0 +1,563 @@
+/*
+ * linux/kernel/time/tick-sched.c
+ *
+ * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
+ * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
+ * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner
+ *
+ * No idle tick implementation for low and high resolution timers
+ *
+ * Started by: Thomas Gleixner and Ingo Molnar
+ *
+ * For licencing details see kernel-base/COPYING
+ */
+#include <linux/cpu.h>
+#include <linux/err.h>
+#include <linux/hrtimer.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+#include <linux/percpu.h>
+#include <linux/profile.h>
+#include <linux/sched.h>
+#include <linux/tick.h>
+
+#include "tick-internal.h"
+
+/*
+ * Per cpu nohz control structure
+ */
+static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
+
+/*
+ * The time, when the last jiffy update happened. Protected by xtime_lock.
+ */
+static ktime_t last_jiffies_update;
+
+struct tick_sched *tick_get_tick_sched(int cpu)
+{
+ return &per_cpu(tick_cpu_sched, cpu);
+}
+
+/*
+ * Must be called with interrupts disabled !
+ */
+static void tick_do_update_jiffies64(ktime_t now)
+{
+ unsigned long ticks = 0;
+ ktime_t delta;
+
+ /* Reevalute with xtime_lock held */
+ write_seqlock(&xtime_lock);
+
+ delta = ktime_sub(now, last_jiffies_update);
+ if (delta.tv64 >= tick_period.tv64) {
+
+ delta = ktime_sub(delta, tick_period);
+ last_jiffies_update = ktime_add(last_jiffies_update,
+ tick_period);
+
+ /* Slow path for long timeouts */
+ if (unlikely(delta.tv64 >= tick_period.tv64)) {
+ s64 incr = ktime_to_ns(tick_period);
+
+ ticks = ktime_divns(delta, incr);
+
+ last_jiffies_update = ktime_add_ns(last_jiffies_update,
+ incr * ticks);
+ }
+ do_timer(++ticks);
+ }
+ write_sequnlock(&xtime_lock);
+}
+
+/*
+ * Initialize and return retrieve the jiffies update.
+ */
+static ktime_t tick_init_jiffy_update(void)
+{
+ ktime_t period;
+
+ write_seqlock(&xtime_lock);
+ /* Did we start the jiffies update yet ? */
+ if (last_jiffies_update.tv64 == 0)
+ last_jiffies_update = tick_next_period;
+ period = last_jiffies_update;
+ write_sequnlock(&xtime_lock);
+ return period;
+}
+
+/*
+ * NOHZ - aka dynamic tick functionality
+ */
+#ifdef CONFIG_NO_HZ
+/*
+ * NO HZ enabled ?
+ */
+static int tick_nohz_enabled __read_mostly = 1;
+
+/*
+ * Enable / Disable tickless mode
+ */
+static int __init setup_tick_nohz(char *str)
+{
+ if (!strcmp(str, "off"))
+ tick_nohz_enabled = 0;
+ else if (!strcmp(str, "on"))
+ tick_nohz_enabled = 1;
+ else
+ return 0;
+ return 1;
+}
+
+__setup("nohz=", setup_tick_nohz);
+
+/**
+ * tick_nohz_update_jiffies - update jiffies when idle was interrupted
+ *
+ * Called from interrupt entry when the CPU was idle
+ *
+ * In case the sched_tick was stopped on this CPU, we have to check if jiffies
+ * must be updated. Otherwise an interrupt handler could use a stale jiffy
+ * value. We do this unconditionally on any cpu, as we don't know whether the
+ * cpu, which has the update task assigned is in a long sleep.
+ */
+void tick_nohz_update_jiffies(void)
+{
+ int cpu = smp_processor_id();
+ struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+ unsigned long flags;
+ ktime_t now;
+
+ if (!ts->tick_stopped)
+ return;
+
+ cpu_clear(cpu, nohz_cpu_mask);
+ now = ktime_get();
+
+ local_irq_save(flags);
+ tick_do_update_jiffies64(now);
+ local_irq_restore(flags);
+}
+
+/**
+ * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
+ *
+ * When the next event is more than a tick into the future, stop the idle tick
+ * Called either from the idle loop or from irq_exit() when an idle period was
+ * just interrupted by an interrupt which did not cause a reschedule.
+ */
+void tick_nohz_stop_sched_tick(void)
+{
+ unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
+ struct tick_sched *ts;
+ ktime_t last_update, expires, now, delta;
+ int cpu;
+
+ local_irq_save(flags);
+
+ cpu = smp_processor_id();
+ ts = &per_cpu(tick_cpu_sched, cpu);
+
+ if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
+ goto end;
+
+ if (need_resched())
+ goto end;
+
+ cpu = smp_processor_id();
+ BUG_ON(local_softirq_pending());
+
+ now = ktime_get();
+ /*
+ * When called from irq_exit we need to account the idle sleep time
+ * correctly.
+ */
+ if (ts->tick_stopped) {
+ delta = ktime_sub(now, ts->idle_entrytime);
+ ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
+ }
+
+ ts->idle_entrytime = now;
+ ts->idle_calls++;
+
+ /* Read jiffies and the time when jiffies were updated last */
+ do {
+ seq = read_seqbegin(&xtime_lock);
+ last_update = last_jiffies_update;
+ last_jiffies = jiffies;
+ } while (read_seqretry(&xtime_lock, seq));
+
+ /* Get the next timer wheel timer */
+ next_jiffies = get_next_timer_interrupt(last_jiffies);
+ delta_jiffies = next_jiffies - last_jiffies;
+
+ /*
+ * Do not stop the tick, if we are only one off
+ * or if the cpu is required for rcu
+ */
+ if (!ts->tick_stopped && (delta_jiffies == 1 || rcu_needs_cpu(cpu)))
+ goto out;
+
+ /* Schedule the tick, if we are at least one jiffie off */
+ if ((long)delta_jiffies >= 1) {
+
+ if (rcu_needs_cpu(cpu))
+ delta_jiffies = 1;
+ else
+ cpu_set(cpu, nohz_cpu_mask);
+ /*
+ * nohz_stop_sched_tick can be called several times before
+ * the nohz_restart_sched_tick is called. This happens when
+ * interrupts arrive which do not cause a reschedule. In the
+ * first call we save the current tick time, so we can restart
+ * the scheduler tick in nohz_restart_sched_tick.
+ */
+ if (!ts->tick_stopped) {
+ ts->idle_tick = ts->sched_timer.expires;
+ ts->tick_stopped = 1;
+ ts->idle_jiffies = last_jiffies;
+ }
+ /*
+ * calculate the expiry time for the next timer wheel
+ * timer
+ */
+ expires = ktime_add_ns(last_update, tick_period.tv64 *
+ delta_jiffies);
+ ts->idle_expires = expires;
+ ts->idle_sleeps++;
+
+ if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
+ hrtimer_start(&ts->sched_timer, expires,
+ HRTIMER_MODE_ABS);
+ /* Check, if the timer was already in the past */
+ if (hrtimer_active(&ts->sched_timer))
+ goto out;
+ } else if(!tick_program_event(expires, 0))
+ goto out;
+ /*
+ * We are past the event already. So we crossed a
+ * jiffie boundary. Update jiffies and raise the
+ * softirq.
+ */
+ tick_do_update_jiffies64(ktime_get());
+ cpu_clear(cpu, nohz_cpu_mask);
+ }
+ raise_softirq_irqoff(TIMER_SOFTIRQ);
+out:
+ ts->next_jiffies = next_jiffies;
+ ts->last_jiffies = last_jiffies;
+end:
+ local_irq_restore(flags);
+}
+
+/**
+ * nohz_restart_sched_tick - restart the idle tick from the idle task
+ *
+ * Restart the idle tick when the CPU is woken up from idle
+ */
+void tick_nohz_restart_sched_tick(void)
+{
+ int cpu = smp_processor_id();
+ struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+ unsigned long ticks;
+ ktime_t now, delta;
+
+ if (!ts->tick_stopped)
+ return;
+
+ /* Update jiffies first */
+ now = ktime_get();
+
+ local_irq_disable();
+ tick_do_update_jiffies64(now);
+ cpu_clear(cpu, nohz_cpu_mask);
+
+ /* Account the idle time */
+ delta = ktime_sub(now, ts->idle_entrytime);
+ ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
+
+ /*
+ * We stopped the tick in idle. Update process times would miss the
+ * time we slept as update_process_times does only a 1 tick
+ * accounting. Enforce that this is accounted to idle !
+ */
+ ticks = jiffies - ts->idle_jiffies;
+ /*
+ * We might be one off. Do not randomly account a huge number of ticks!
+ */
+ if (ticks && ticks < LONG_MAX) {
+ add_preempt_count(HARDIRQ_OFFSET);
+ account_system_time(current, HARDIRQ_OFFSET,
+ jiffies_to_cputime(ticks));
+ sub_preempt_count(HARDIRQ_OFFSET);
+ }
+
+ /*
+ * Cancel the scheduled timer and restore the tick
+ */
+ ts->tick_stopped = 0;
+ hrtimer_cancel(&ts->sched_timer);
+ ts->sched_timer.expires = ts->idle_tick;
+
+ while (1) {
+ /* Forward the time to expire in the future */
+ hrtimer_forward(&ts->sched_timer, now, tick_period);
+
+ if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
+ hrtimer_start(&ts->sched_timer,
+ ts->sched_timer.expires,
+ HRTIMER_MODE_ABS);
+ /* Check, if the timer was already in the past */
+ if (hrtimer_active(&ts->sched_timer))
+ break;
+ } else {
+ if (!tick_program_event(ts->sched_timer.expires, 0))
+ break;
+ }
+ /* Update jiffies and reread time */
+ tick_do_update_jiffies64(now);
+ now = ktime_get();
+ }
+ local_irq_enable();
+}
+
+static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
+{
+ hrtimer_forward(&ts->sched_timer, now, tick_period);
+ return tick_program_event(ts->sched_timer.expires, 0);
+}
+
+/*
+ * The nohz low res interrupt handler
+ */
+static void tick_nohz_handler(struct clock_event_device *dev)
+{
+ struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+ struct pt_regs *regs = get_irq_regs();
+ ktime_t now = ktime_get();
+
+ dev->next_event.tv64 = KTIME_MAX;
+
+ /* Check, if the jiffies need an update */
+ tick_do_update_jiffies64(now);
+
+ /*
+ * When we are idle and the tick is stopped, we have to touch
+ * the watchdog as we might not schedule for a really long
+ * time. This happens on complete idle SMP systems while
+ * waiting on the login prompt. We also increment the "start
+ * of idle" jiffy stamp so the idle accounting adjustment we
+ * do when we go busy again does not account too much ticks.
+ */
+ if (ts->tick_stopped) {
+ touch_softlockup_watchdog();
+ ts->idle_jiffies++;
+ }
+
+ update_process_times(user_mode(regs));
+ profile_tick(CPU_PROFILING);
+
+ /* Do not restart, when we are in the idle loop */
+ if (ts->tick_stopped)
+ return;
+
+ while (tick_nohz_reprogram(ts, now)) {
+ now = ktime_get();
+ tick_do_update_jiffies64(now);
+ }
+}
+
+/**
+ * tick_nohz_switch_to_nohz - switch to nohz mode
+ */
+static void tick_nohz_switch_to_nohz(void)
+{
+ struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+ ktime_t next;
+
+ if (!tick_nohz_enabled)
+ return;
+
+ local_irq_disable();
+ if (tick_switch_to_oneshot(tick_nohz_handler)) {
+ local_irq_enable();
+ return;
+ }
+
+ ts->nohz_mode = NOHZ_MODE_LOWRES;
+
+ /*
+ * Recycle the hrtimer in ts, so we can share the
+ * hrtimer_forward with the highres code.
+ */
+ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ /* Get the next period */
+ next = tick_init_jiffy_update();
+
+ for (;;) {
+ ts->sched_timer.expires = next;
+ if (!tick_program_event(next, 0))
+ break;
+ next = ktime_add(next, tick_period);
+ }
+ local_irq_enable();
+
+ printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n",
+ smp_processor_id());
+}
+
+#else
+
+static inline void tick_nohz_switch_to_nohz(void) { }
+
+#endif /* NO_HZ */
+
+/*
+ * High resolution timer specific code
+ */
+#ifdef CONFIG_HIGH_RES_TIMERS
+/*
+ * We rearm the timer until we get disabled by the idle code
+ * Called with interrupts disabled and timer->base->cpu_base->lock held.
+ */
+static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
+{
+ struct tick_sched *ts =
+ container_of(timer, struct tick_sched, sched_timer);
+ struct hrtimer_cpu_base *base = timer->base->cpu_base;
+ struct pt_regs *regs = get_irq_regs();
+ ktime_t now = ktime_get();
+
+ /* Check, if the jiffies need an update */
+ tick_do_update_jiffies64(now);
+
+ /*
+ * Do not call, when we are not in irq context and have
+ * no valid regs pointer
+ */
+ if (regs) {
+ /*
+ * When we are idle and the tick is stopped, we have to touch
+ * the watchdog as we might not schedule for a really long
+ * time. This happens on complete idle SMP systems while
+ * waiting on the login prompt. We also increment the "start of
+ * idle" jiffy stamp so the idle accounting adjustment we do
+ * when we go busy again does not account too much ticks.
+ */
+ if (ts->tick_stopped) {
+ touch_softlockup_watchdog();
+ ts->idle_jiffies++;
+ }
+ /*
+ * update_process_times() might take tasklist_lock, hence
+ * drop the base lock. sched-tick hrtimers are per-CPU and
+ * never accessible by userspace APIs, so this is safe to do.
+ */
+ spin_unlock(&base->lock);
+ update_process_times(user_mode(regs));
+ profile_tick(CPU_PROFILING);
+ spin_lock(&base->lock);
+ }
+
+ /* Do not restart, when we are in the idle loop */
+ if (ts->tick_stopped)
+ return HRTIMER_NORESTART;
+
+ hrtimer_forward(timer, now, tick_period);
+
+ return HRTIMER_RESTART;
+}
+
+/**
+ * tick_setup_sched_timer - setup the tick emulation timer
+ */
+void tick_setup_sched_timer(void)
+{
+ struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+ ktime_t now = ktime_get();
+
+ /*
+ * Emulate tick processing via per-CPU hrtimers:
+ */
+ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ ts->sched_timer.function = tick_sched_timer;
+ ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+
+ /* Get the next period */
+ ts->sched_timer.expires = tick_init_jiffy_update();
+
+ for (;;) {
+ hrtimer_forward(&ts->sched_timer, now, tick_period);
+ hrtimer_start(&ts->sched_timer, ts->sched_timer.expires,
+ HRTIMER_MODE_ABS);
+ /* Check, if the timer was already in the past */
+ if (hrtimer_active(&ts->sched_timer))
+ break;
+ now = ktime_get();
+ }
+
+#ifdef CONFIG_NO_HZ
+ if (tick_nohz_enabled)
+ ts->nohz_mode = NOHZ_MODE_HIGHRES;
+#endif
+}
+
+void tick_cancel_sched_timer(int cpu)
+{
+ struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+
+ if (ts->sched_timer.base)
+ hrtimer_cancel(&ts->sched_timer);
+ ts->tick_stopped = 0;
+ ts->nohz_mode = NOHZ_MODE_INACTIVE;
+}
+#endif /* HIGH_RES_TIMERS */
+
+/**
+ * Async notification about clocksource changes
+ */
+void tick_clock_notify(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks);
+}
+
+/*
+ * Async notification about clock event changes
+ */
+void tick_oneshot_notify(void)
+{
+ struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+
+ set_bit(0, &ts->check_clocks);
+}
+
+/**
+ * Check, if a change happened, which makes oneshot possible.
+ *
+ * Called cyclic from the hrtimer softirq (driven by the timer
+ * softirq) allow_nohz signals, that we can switch into low-res nohz
+ * mode, because high resolution timers are disabled (either compile
+ * or runtime).
+ */
+int tick_check_oneshot_change(int allow_nohz)
+{
+ struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+
+ if (!test_and_clear_bit(0, &ts->check_clocks))
+ return 0;
+
+ if (ts->nohz_mode != NOHZ_MODE_INACTIVE)
+ return 0;
+
+ if (!timekeeping_is_continuous() || !tick_is_oneshot_available())
+ return 0;
+
+ if (!allow_nohz)
+ return 1;
+
+ tick_nohz_switch_to_nohz();
+ return 0;
+}
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
new file mode 100644
index 00000000000..f82c635c3d5
--- /dev/null
+++ b/kernel/time/timer_list.c
@@ -0,0 +1,287 @@
+/*
+ * kernel/time/timer_list.c
+ *
+ * List pending timers
+ *
+ * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/proc_fs.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/kallsyms.h>
+#include <linux/tick.h>
+
+#include <asm/uaccess.h>
+
+typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes);
+
+DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
+
+/*
+ * This allows printing both to /proc/timer_list and
+ * to the console (on SysRq-Q):
+ */
+#define SEQ_printf(m, x...) \
+ do { \
+ if (m) \
+ seq_printf(m, x); \
+ else \
+ printk(x); \
+ } while (0)
+
+static void print_name_offset(struct seq_file *m, void *sym)
+{
+ unsigned long addr = (unsigned long)sym;
+ char namebuf[KSYM_NAME_LEN+1];
+ unsigned long size, offset;
+ const char *sym_name;
+ char *modname;
+
+ sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf);
+ if (sym_name)
+ SEQ_printf(m, "%s", sym_name);
+ else
+ SEQ_printf(m, "<%p>", sym);
+}
+
+static void
+print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now)
+{
+#ifdef CONFIG_TIMER_STATS
+ char tmp[TASK_COMM_LEN + 1];
+#endif
+ SEQ_printf(m, " #%d: ", idx);
+ print_name_offset(m, timer);
+ SEQ_printf(m, ", ");
+ print_name_offset(m, timer->function);
+ SEQ_printf(m, ", S:%02lx", timer->state);
+#ifdef CONFIG_TIMER_STATS
+ SEQ_printf(m, ", ");
+ print_name_offset(m, timer->start_site);
+ memcpy(tmp, timer->start_comm, TASK_COMM_LEN);
+ tmp[TASK_COMM_LEN] = 0;
+ SEQ_printf(m, ", %s/%d", tmp, timer->start_pid);
+#endif
+ SEQ_printf(m, "\n");
+ SEQ_printf(m, " # expires at %Ld nsecs [in %Ld nsecs]\n",
+ (unsigned long long)ktime_to_ns(timer->expires),
+ (unsigned long long)(ktime_to_ns(timer->expires) - now));
+}
+
+static void
+print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base,
+ u64 now)
+{
+ struct hrtimer *timer, tmp;
+ unsigned long next = 0, i;
+ struct rb_node *curr;
+ unsigned long flags;
+
+next_one:
+ i = 0;
+ spin_lock_irqsave(&base->cpu_base->lock, flags);
+
+ curr = base->first;
+ /*
+ * Crude but we have to do this O(N*N) thing, because
+ * we have to unlock the base when printing:
+ */
+ while (curr && i < next) {
+ curr = rb_next(curr);
+ i++;
+ }
+
+ if (curr) {
+
+ timer = rb_entry(curr, struct hrtimer, node);
+ tmp = *timer;
+ spin_unlock_irqrestore(&base->cpu_base->lock, flags);
+
+ print_timer(m, &tmp, i, now);
+ next++;
+ goto next_one;
+ }
+ spin_unlock_irqrestore(&base->cpu_base->lock, flags);
+}
+
+static void
+print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
+{
+ SEQ_printf(m, " .index: %d\n",
+ base->index);
+ SEQ_printf(m, " .resolution: %Ld nsecs\n",
+ (unsigned long long)ktime_to_ns(base->resolution));
+ SEQ_printf(m, " .get_time: ");
+ print_name_offset(m, base->get_time);
+ SEQ_printf(m, "\n");
+#ifdef CONFIG_HIGH_RES_TIMERS
+ SEQ_printf(m, " .offset: %Ld nsecs\n",
+ ktime_to_ns(base->offset));
+#endif
+ SEQ_printf(m, "active timers:\n");
+ print_active_timers(m, base, now);
+}
+
+static void print_cpu(struct seq_file *m, int cpu, u64 now)
+{
+ struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
+ int i;
+
+ SEQ_printf(m, "\ncpu: %d\n", cpu);
+ for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
+ SEQ_printf(m, " clock %d:\n", i);
+ print_base(m, cpu_base->clock_base + i, now);
+ }
+#define P(x) \
+ SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(cpu_base->x))
+#define P_ns(x) \
+ SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \
+ (u64)(ktime_to_ns(cpu_base->x)))
+
+#ifdef CONFIG_HIGH_RES_TIMERS
+ P_ns(expires_next);
+ P(hres_active);
+ P(nr_events);
+#endif
+#undef P
+#undef P_ns
+
+#ifdef CONFIG_TICK_ONESHOT
+# define P(x) \
+ SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(ts->x))
+# define P_ns(x) \
+ SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \
+ (u64)(ktime_to_ns(ts->x)))
+ {
+ struct tick_sched *ts = tick_get_tick_sched(cpu);
+ P(nohz_mode);
+ P_ns(idle_tick);
+ P(tick_stopped);
+ P(idle_jiffies);
+ P(idle_calls);
+ P(idle_sleeps);
+ P_ns(idle_entrytime);
+ P_ns(idle_sleeptime);
+ P(last_jiffies);
+ P(next_jiffies);
+ P_ns(idle_expires);
+ SEQ_printf(m, "jiffies: %Ld\n", (u64)jiffies);
+ }
+#endif
+
+#undef P
+#undef P_ns
+}
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+static void
+print_tickdevice(struct seq_file *m, struct tick_device *td)
+{
+ struct clock_event_device *dev = td->evtdev;
+
+ SEQ_printf(m, "\nTick Device: mode: %d\n", td->mode);
+
+ SEQ_printf(m, "Clock Event Device: ");
+ if (!dev) {
+ SEQ_printf(m, "<NULL>\n");
+ return;
+ }
+ SEQ_printf(m, "%s\n", dev->name);
+ SEQ_printf(m, " max_delta_ns: %ld\n", dev->max_delta_ns);
+ SEQ_printf(m, " min_delta_ns: %ld\n", dev->min_delta_ns);
+ SEQ_printf(m, " mult: %ld\n", dev->mult);
+ SEQ_printf(m, " shift: %d\n", dev->shift);
+ SEQ_printf(m, " mode: %d\n", dev->mode);
+ SEQ_printf(m, " next_event: %Ld nsecs\n",
+ (unsigned long long) ktime_to_ns(dev->next_event));
+
+ SEQ_printf(m, " set_next_event: ");
+ print_name_offset(m, dev->set_next_event);
+ SEQ_printf(m, "\n");
+
+ SEQ_printf(m, " set_mode: ");
+ print_name_offset(m, dev->set_mode);
+ SEQ_printf(m, "\n");
+
+ SEQ_printf(m, " event_handler: ");
+ print_name_offset(m, dev->event_handler);
+ SEQ_printf(m, "\n");
+}
+
+static void timer_list_show_tickdevices(struct seq_file *m)
+{
+ int cpu;
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+ print_tickdevice(m, tick_get_broadcast_device());
+ SEQ_printf(m, "tick_broadcast_mask: %08lx\n",
+ tick_get_broadcast_mask()->bits[0]);
+#ifdef CONFIG_TICK_ONESHOT
+ SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n",
+ tick_get_broadcast_oneshot_mask()->bits[0]);
+#endif
+ SEQ_printf(m, "\n");
+#endif
+ for_each_online_cpu(cpu)
+ print_tickdevice(m, tick_get_device(cpu));
+ SEQ_printf(m, "\n");
+}
+#else
+static void timer_list_show_tickdevices(struct seq_file *m) { }
+#endif
+
+static int timer_list_show(struct seq_file *m, void *v)
+{
+ u64 now = ktime_to_ns(ktime_get());
+ int cpu;
+
+ SEQ_printf(m, "Timer List Version: v0.3\n");
+ SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
+ SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
+
+ for_each_online_cpu(cpu)
+ print_cpu(m, cpu, now);
+
+ SEQ_printf(m, "\n");
+ timer_list_show_tickdevices(m);
+
+ return 0;
+}
+
+void sysrq_timer_list_show(void)
+{
+ timer_list_show(NULL, NULL);
+}
+
+static int timer_list_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, timer_list_show, NULL);
+}
+
+static struct file_operations timer_list_fops = {
+ .open = timer_list_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int __init init_timer_list_procfs(void)
+{
+ struct proc_dir_entry *pe;
+
+ pe = create_proc_entry("timer_list", 0644, NULL);
+ if (!pe)
+ return -ENOMEM;
+
+ pe->proc_fops = &timer_list_fops;
+
+ return 0;
+}
+__initcall(init_timer_list_procfs);
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
new file mode 100644
index 00000000000..1bc4882e28e
--- /dev/null
+++ b/kernel/time/timer_stats.c
@@ -0,0 +1,411 @@
+/*
+ * kernel/time/timer_stats.c
+ *
+ * Collect timer usage statistics.
+ *
+ * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar
+ * Copyright(C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * timer_stats is based on timer_top, a similar functionality which was part of
+ * Con Kolivas dyntick patch set. It was developed by Daniel Petrini at the
+ * Instituto Nokia de Tecnologia - INdT - Manaus. timer_top's design was based
+ * on dynamic allocation of the statistics entries and linear search based
+ * lookup combined with a global lock, rather than the static array, hash
+ * and per-CPU locking which is used by timer_stats. It was written for the
+ * pre hrtimer kernel code and therefore did not take hrtimers into account.
+ * Nevertheless it provided the base for the timer_stats implementation and
+ * was a helpful source of inspiration. Kudos to Daniel and the Nokia folks
+ * for this effort.
+ *
+ * timer_top.c is
+ * Copyright (C) 2005 Instituto Nokia de Tecnologia - INdT - Manaus
+ * Written by Daniel Petrini <d.pensator@gmail.com>
+ * timer_top.c was released under the GNU General Public License version 2
+ *
+ * We export the addresses and counting of timer functions being called,
+ * the pid and cmdline from the owner process if applicable.
+ *
+ * Start/stop data collection:
+ * # echo 1[0] >/proc/timer_stats
+ *
+ * Display the information collected so far:
+ * # cat /proc/timer_stats
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/proc_fs.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/kallsyms.h>
+
+#include <asm/uaccess.h>
+
+/*
+ * This is our basic unit of interest: a timer expiry event identified
+ * by the timer, its start/expire functions and the PID of the task that
+ * started the timer. We count the number of times an event happens:
+ */
+struct entry {
+ /*
+ * Hash list:
+ */
+ struct entry *next;
+
+ /*
+ * Hash keys:
+ */
+ void *timer;
+ void *start_func;
+ void *expire_func;
+ pid_t pid;
+
+ /*
+ * Number of timeout events:
+ */
+ unsigned long count;
+
+ /*
+ * We save the command-line string to preserve
+ * this information past task exit:
+ */
+ char comm[TASK_COMM_LEN + 1];
+
+} ____cacheline_aligned_in_smp;
+
+/*
+ * Spinlock protecting the tables - not taken during lookup:
+ */
+static DEFINE_SPINLOCK(table_lock);
+
+/*
+ * Per-CPU lookup locks for fast hash lookup:
+ */
+static DEFINE_PER_CPU(spinlock_t, lookup_lock);
+
+/*
+ * Mutex to serialize state changes with show-stats activities:
+ */
+static DEFINE_MUTEX(show_mutex);
+
+/*
+ * Collection status, active/inactive:
+ */
+static int __read_mostly active;
+
+/*
+ * Beginning/end timestamps of measurement:
+ */
+static ktime_t time_start, time_stop;
+
+/*
+ * tstat entry structs only get allocated while collection is
+ * active and never freed during that time - this simplifies
+ * things quite a bit.
+ *
+ * They get freed when a new collection period is started.
+ */
+#define MAX_ENTRIES_BITS 10
+#define MAX_ENTRIES (1UL << MAX_ENTRIES_BITS)
+
+static unsigned long nr_entries;
+static struct entry entries[MAX_ENTRIES];
+
+static atomic_t overflow_count;
+
+static void reset_entries(void)
+{
+ nr_entries = 0;
+ memset(entries, 0, sizeof(entries));
+ atomic_set(&overflow_count, 0);
+}
+
+static struct entry *alloc_entry(void)
+{
+ if (nr_entries >= MAX_ENTRIES)
+ return NULL;
+
+ return entries + nr_entries++;
+}
+
+/*
+ * The entries are in a hash-table, for fast lookup:
+ */
+#define TSTAT_HASH_BITS (MAX_ENTRIES_BITS - 1)
+#define TSTAT_HASH_SIZE (1UL << TSTAT_HASH_BITS)
+#define TSTAT_HASH_MASK (TSTAT_HASH_SIZE - 1)
+
+#define __tstat_hashfn(entry) \
+ (((unsigned long)(entry)->timer ^ \
+ (unsigned long)(entry)->start_func ^ \
+ (unsigned long)(entry)->expire_func ^ \
+ (unsigned long)(entry)->pid ) & TSTAT_HASH_MASK)
+
+#define tstat_hashentry(entry) (tstat_hash_table + __tstat_hashfn(entry))
+
+static struct entry *tstat_hash_table[TSTAT_HASH_SIZE] __read_mostly;
+
+static int match_entries(struct entry *entry1, struct entry *entry2)
+{
+ return entry1->timer == entry2->timer &&
+ entry1->start_func == entry2->start_func &&
+ entry1->expire_func == entry2->expire_func &&
+ entry1->pid == entry2->pid;
+}
+
+/*
+ * Look up whether an entry matching this item is present
+ * in the hash already. Must be called with irqs off and the
+ * lookup lock held:
+ */
+static struct entry *tstat_lookup(struct entry *entry, char *comm)
+{
+ struct entry **head, *curr, *prev;
+
+ head = tstat_hashentry(entry);
+ curr = *head;
+
+ /*
+ * The fastpath is when the entry is already hashed,
+ * we do this with the lookup lock held, but with the
+ * table lock not held:
+ */
+ while (curr) {
+ if (match_entries(curr, entry))
+ return curr;
+
+ curr = curr->next;
+ }
+ /*
+ * Slowpath: allocate, set up and link a new hash entry:
+ */
+ prev = NULL;
+ curr = *head;
+
+ spin_lock(&table_lock);
+ /*
+ * Make sure we have not raced with another CPU:
+ */
+ while (curr) {
+ if (match_entries(curr, entry))
+ goto out_unlock;
+
+ prev = curr;
+ curr = curr->next;
+ }
+
+ curr = alloc_entry();
+ if (curr) {
+ *curr = *entry;
+ curr->count = 0;
+ memcpy(curr->comm, comm, TASK_COMM_LEN);
+ if (prev)
+ prev->next = curr;
+ else
+ *head = curr;
+ curr->next = NULL;
+ }
+ out_unlock:
+ spin_unlock(&table_lock);
+
+ return curr;
+}
+
+/**
+ * timer_stats_update_stats - Update the statistics for a timer.
+ * @timer: pointer to either a timer_list or a hrtimer
+ * @pid: the pid of the task which set up the timer
+ * @startf: pointer to the function which did the timer setup
+ * @timerf: pointer to the timer callback function of the timer
+ * @comm: name of the process which set up the timer
+ *
+ * When the timer is already registered, then the event counter is
+ * incremented. Otherwise the timer is registered in a free slot.
+ */
+void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
+ void *timerf, char * comm)
+{
+ /*
+ * It doesnt matter which lock we take:
+ */
+ spinlock_t *lock = &per_cpu(lookup_lock, raw_smp_processor_id());
+ struct entry *entry, input;
+ unsigned long flags;
+
+ input.timer = timer;
+ input.start_func = startf;
+ input.expire_func = timerf;
+ input.pid = pid;
+
+ spin_lock_irqsave(lock, flags);
+ if (!active)
+ goto out_unlock;
+
+ entry = tstat_lookup(&input, comm);
+ if (likely(entry))
+ entry->count++;
+ else
+ atomic_inc(&overflow_count);
+
+ out_unlock:
+ spin_unlock_irqrestore(lock, flags);
+}
+
+static void print_name_offset(struct seq_file *m, unsigned long addr)
+{
+ char namebuf[KSYM_NAME_LEN+1];
+ unsigned long size, offset;
+ const char *sym_name;
+ char *modname;
+
+ sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf);
+ if (sym_name)
+ seq_printf(m, "%s", sym_name);
+ else
+ seq_printf(m, "<%p>", (void *)addr);
+}
+
+static int tstats_show(struct seq_file *m, void *v)
+{
+ struct timespec period;
+ struct entry *entry;
+ unsigned long ms;
+ long events = 0;
+ ktime_t time;
+ int i;
+
+ mutex_lock(&show_mutex);
+ /*
+ * If still active then calculate up to now:
+ */
+ if (active)
+ time_stop = ktime_get();
+
+ time = ktime_sub(time_stop, time_start);
+
+ period = ktime_to_timespec(time);
+ ms = period.tv_nsec / 1000000;
+
+ seq_puts(m, "Timer Stats Version: v0.1\n");
+ seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms);
+ if (atomic_read(&overflow_count))
+ seq_printf(m, "Overflow: %d entries\n",
+ atomic_read(&overflow_count));
+
+ for (i = 0; i < nr_entries; i++) {
+ entry = entries + i;
+ seq_printf(m, "%4lu, %5d %-16s ",
+ entry->count, entry->pid, entry->comm);
+
+ print_name_offset(m, (unsigned long)entry->start_func);
+ seq_puts(m, " (");
+ print_name_offset(m, (unsigned long)entry->expire_func);
+ seq_puts(m, ")\n");
+
+ events += entry->count;
+ }
+
+ ms += period.tv_sec * 1000;
+ if (!ms)
+ ms = 1;
+
+ if (events && period.tv_sec)
+ seq_printf(m, "%ld total events, %ld.%ld events/sec\n", events,
+ events / period.tv_sec, events * 1000 / ms);
+ else
+ seq_printf(m, "%ld total events\n", events);
+
+ mutex_unlock(&show_mutex);
+
+ return 0;
+}
+
+/*
+ * After a state change, make sure all concurrent lookup/update
+ * activities have stopped:
+ */
+static void sync_access(void)
+{
+ unsigned long flags;
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ spin_lock_irqsave(&per_cpu(lookup_lock, cpu), flags);
+ /* nothing */
+ spin_unlock_irqrestore(&per_cpu(lookup_lock, cpu), flags);
+ }
+}
+
+static ssize_t tstats_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *offs)
+{
+ char ctl[2];
+
+ if (count != 2 || *offs)
+ return -EINVAL;
+
+ if (copy_from_user(ctl, buf, count))
+ return -EFAULT;
+
+ mutex_lock(&show_mutex);
+ switch (ctl[0]) {
+ case '0':
+ if (active) {
+ active = 0;
+ time_stop = ktime_get();
+ sync_access();
+ }
+ break;
+ case '1':
+ if (!active) {
+ reset_entries();
+ time_start = ktime_get();
+ active = 1;
+ }
+ break;
+ default:
+ count = -EINVAL;
+ }
+ mutex_unlock(&show_mutex);
+
+ return count;
+}
+
+static int tstats_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, tstats_show, NULL);
+}
+
+static struct file_operations tstats_fops = {
+ .open = tstats_open,
+ .read = seq_read,
+ .write = tstats_write,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+void __init init_timer_stats(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ spin_lock_init(&per_cpu(lookup_lock, cpu));
+}
+
+static int __init init_tstats_procfs(void)
+{
+ struct proc_dir_entry *pe;
+
+ pe = create_proc_entry("timer_stats", 0644, NULL);
+ if (!pe)
+ return -ENOMEM;
+
+ pe->proc_fops = &tstats_fops;
+
+ return 0;
+}
+__initcall(init_tstats_procfs);
diff --git a/kernel/timer.c b/kernel/timer.c
index 8533c379608..cb1b86a9c52 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -34,6 +34,8 @@
#include <linux/cpu.h>
#include <linux/syscalls.h>
#include <linux/delay.h>
+#include <linux/tick.h>
+#include <linux/kallsyms.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -262,6 +264,18 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
list_add_tail(&timer->entry, vec);
}
+#ifdef CONFIG_TIMER_STATS
+void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr)
+{
+ if (timer->start_site)
+ return;
+
+ timer->start_site = addr;
+ memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
+ timer->start_pid = current->pid;
+}
+#endif
+
/**
* init_timer - initialize a timer.
* @timer: the timer to be initialized
@@ -273,11 +287,16 @@ void fastcall init_timer(struct timer_list *timer)
{
timer->entry.next = NULL;
timer->base = __raw_get_cpu_var(tvec_bases);
+#ifdef CONFIG_TIMER_STATS
+ timer->start_site = NULL;
+ timer->start_pid = -1;
+ memset(timer->start_comm, 0, TASK_COMM_LEN);
+#endif
}
EXPORT_SYMBOL(init_timer);
static inline void detach_timer(struct timer_list *timer,
- int clear_pending)
+ int clear_pending)
{
struct list_head *entry = &timer->entry;
@@ -324,6 +343,7 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
unsigned long flags;
int ret = 0;
+ timer_stats_timer_set_start_info(timer);
BUG_ON(!timer->function);
base = lock_timer_base(timer, &flags);
@@ -374,6 +394,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
tvec_base_t *base = per_cpu(tvec_bases, cpu);
unsigned long flags;
+ timer_stats_timer_set_start_info(timer);
BUG_ON(timer_pending(timer) || !timer->function);
spin_lock_irqsave(&base->lock, flags);
timer->base = base;
@@ -406,6 +427,7 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
{
BUG_ON(!timer->function);
+ timer_stats_timer_set_start_info(timer);
/*
* This is a common optimization triggered by the
* networking code - if the timer is re-modified
@@ -436,6 +458,7 @@ int del_timer(struct timer_list *timer)
unsigned long flags;
int ret = 0;
+ timer_stats_timer_clear_start_info(timer);
if (timer_pending(timer)) {
base = lock_timer_base(timer, &flags);
if (timer_pending(timer)) {
@@ -569,6 +592,8 @@ static inline void __run_timers(tvec_base_t *base)
fn = timer->function;
data = timer->data;
+ timer_stats_account_timer(timer);
+
set_running_timer(base, timer);
detach_timer(timer, 1);
spin_unlock_irq(&base->lock);
@@ -591,105 +616,124 @@ static inline void __run_timers(tvec_base_t *base)
spin_unlock_irq(&base->lock);
}
-#ifdef CONFIG_NO_IDLE_HZ
+#if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_NO_HZ)
/*
* Find out when the next timer event is due to happen. This
* is used on S/390 to stop all activity when a cpus is idle.
* This functions needs to be called disabled.
*/
-unsigned long next_timer_interrupt(void)
+static unsigned long __next_timer_interrupt(tvec_base_t *base)
{
- tvec_base_t *base;
- struct list_head *list;
+ unsigned long timer_jiffies = base->timer_jiffies;
+ unsigned long expires = timer_jiffies + (LONG_MAX >> 1);
+ int index, slot, array, found = 0;
struct timer_list *nte;
- unsigned long expires;
- unsigned long hr_expires = MAX_JIFFY_OFFSET;
- ktime_t hr_delta;
tvec_t *varray[4];
- int i, j;
-
- hr_delta = hrtimer_get_next_event();
- if (hr_delta.tv64 != KTIME_MAX) {
- struct timespec tsdelta;
- tsdelta = ktime_to_timespec(hr_delta);
- hr_expires = timespec_to_jiffies(&tsdelta);
- if (hr_expires < 3)
- return hr_expires + jiffies;
- }
- hr_expires += jiffies;
-
- base = __get_cpu_var(tvec_bases);
- spin_lock(&base->lock);
- expires = base->timer_jiffies + (LONG_MAX >> 1);
- list = NULL;
/* Look for timer events in tv1. */
- j = base->timer_jiffies & TVR_MASK;
+ index = slot = timer_jiffies & TVR_MASK;
do {
- list_for_each_entry(nte, base->tv1.vec + j, entry) {
+ list_for_each_entry(nte, base->tv1.vec + slot, entry) {
+ found = 1;
expires = nte->expires;
- if (j < (base->timer_jiffies & TVR_MASK))
- list = base->tv2.vec + (INDEX(0));
- goto found;
+ /* Look at the cascade bucket(s)? */
+ if (!index || slot < index)
+ goto cascade;
+ return expires;
}
- j = (j + 1) & TVR_MASK;
- } while (j != (base->timer_jiffies & TVR_MASK));
+ slot = (slot + 1) & TVR_MASK;
+ } while (slot != index);
+
+cascade:
+ /* Calculate the next cascade event */
+ if (index)
+ timer_jiffies += TVR_SIZE - index;
+ timer_jiffies >>= TVR_BITS;
/* Check tv2-tv5. */
varray[0] = &base->tv2;
varray[1] = &base->tv3;
varray[2] = &base->tv4;
varray[3] = &base->tv5;
- for (i = 0; i < 4; i++) {
- j = INDEX(i);
+
+ for (array = 0; array < 4; array++) {
+ tvec_t *varp = varray[array];
+
+ index = slot = timer_jiffies & TVN_MASK;
do {
- if (list_empty(varray[i]->vec + j)) {
- j = (j + 1) & TVN_MASK;
- continue;
- }
- list_for_each_entry(nte, varray[i]->vec + j, entry)
+ list_for_each_entry(nte, varp->vec + slot, entry) {
+ found = 1;
if (time_before(nte->expires, expires))
expires = nte->expires;
- if (j < (INDEX(i)) && i < 3)
- list = varray[i + 1]->vec + (INDEX(i + 1));
- goto found;
- } while (j != (INDEX(i)));
- }
-found:
- if (list) {
- /*
- * The search wrapped. We need to look at the next list
- * from next tv element that would cascade into tv element
- * where we found the timer element.
- */
- list_for_each_entry(nte, list, entry) {
- if (time_before(nte->expires, expires))
- expires = nte->expires;
- }
+ }
+ /*
+ * Do we still search for the first timer or are
+ * we looking up the cascade buckets ?
+ */
+ if (found) {
+ /* Look at the cascade bucket(s)? */
+ if (!index || slot < index)
+ break;
+ return expires;
+ }
+ slot = (slot + 1) & TVN_MASK;
+ } while (slot != index);
+
+ if (index)
+ timer_jiffies += TVN_SIZE - index;
+ timer_jiffies >>= TVN_BITS;
}
- spin_unlock(&base->lock);
+ return expires;
+}
- /*
- * It can happen that other CPUs service timer IRQs and increment
- * jiffies, but we have not yet got a local timer tick to process
- * the timer wheels. In that case, the expiry time can be before
- * jiffies, but since the high-resolution timer here is relative to
- * jiffies, the default expression when high-resolution timers are
- * not active,
- *
- * time_before(MAX_JIFFY_OFFSET + jiffies, expires)
- *
- * would falsely evaluate to true. If that is the case, just
- * return jiffies so that we can immediately fire the local timer
- */
- if (time_before(expires, jiffies))
- return jiffies;
+/*
+ * Check, if the next hrtimer event is before the next timer wheel
+ * event:
+ */
+static unsigned long cmp_next_hrtimer_event(unsigned long now,
+ unsigned long expires)
+{
+ ktime_t hr_delta = hrtimer_get_next_event();
+ struct timespec tsdelta;
+
+ if (hr_delta.tv64 == KTIME_MAX)
+ return expires;
- if (time_before(hr_expires, expires))
- return hr_expires;
+ if (hr_delta.tv64 <= TICK_NSEC)
+ return now;
+ tsdelta = ktime_to_timespec(hr_delta);
+ now += timespec_to_jiffies(&tsdelta);
+ if (time_before(now, expires))
+ return now;
return expires;
}
+
+/**
+ * next_timer_interrupt - return the jiffy of the next pending timer
+ */
+unsigned long get_next_timer_interrupt(unsigned long now)
+{
+ tvec_base_t *base = __get_cpu_var(tvec_bases);
+ unsigned long expires;
+
+ spin_lock(&base->lock);
+ expires = __next_timer_interrupt(base);
+ spin_unlock(&base->lock);
+
+ if (time_before_eq(expires, now))
+ return now;
+
+ return cmp_next_hrtimer_event(now, expires);
+}
+
+#ifdef CONFIG_NO_IDLE_HZ
+unsigned long next_timer_interrupt(void)
+{
+ return get_next_timer_interrupt(jiffies);
+}
+#endif
+
#endif
/******************************************************************/
@@ -832,32 +876,35 @@ EXPORT_SYMBOL(do_settimeofday);
*
* Accumulates current time interval and initializes new clocksource
*/
-static int change_clocksource(void)
+static void change_clocksource(void)
{
struct clocksource *new;
cycle_t now;
u64 nsec;
+
new = clocksource_get_next();
- if (clock != new) {
- now = clocksource_read(new);
- nsec = __get_nsec_offset();
- timespec_add_ns(&xtime, nsec);
-
- clock = new;
- clock->cycle_last = now;
- printk(KERN_INFO "Time: %s clocksource has been installed.\n",
- clock->name);
- return 1;
- } else if (clock->update_callback) {
- return clock->update_callback();
- }
- return 0;
+
+ if (clock == new)
+ return;
+
+ now = clocksource_read(new);
+ nsec = __get_nsec_offset();
+ timespec_add_ns(&xtime, nsec);
+
+ clock = new;
+ clock->cycle_last = now;
+
+ clock->error = 0;
+ clock->xtime_nsec = 0;
+ clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
+
+ tick_clock_notify();
+
+ printk(KERN_INFO "Time: %s clocksource has been installed.\n",
+ clock->name);
}
#else
-static inline int change_clocksource(void)
-{
- return 0;
-}
+static inline void change_clocksource(void) { }
#endif
/**
@@ -871,33 +918,56 @@ int timekeeping_is_continuous(void)
do {
seq = read_seqbegin(&xtime_lock);
- ret = clock->is_continuous;
+ ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
} while (read_seqretry(&xtime_lock, seq));
return ret;
}
+/**
+ * read_persistent_clock - Return time in seconds from the persistent clock.
+ *
+ * Weak dummy function for arches that do not yet support it.
+ * Returns seconds from epoch using the battery backed persistent clock.
+ * Returns zero if unsupported.
+ *
+ * XXX - Do be sure to remove it once all arches implement it.
+ */
+unsigned long __attribute__((weak)) read_persistent_clock(void)
+{
+ return 0;
+}
+
/*
* timekeeping_init - Initializes the clocksource and common timekeeping values
*/
void __init timekeeping_init(void)
{
unsigned long flags;
+ unsigned long sec = read_persistent_clock();
write_seqlock_irqsave(&xtime_lock, flags);
ntp_clear();
clock = clocksource_get_next();
- clocksource_calculate_interval(clock, tick_nsec);
+ clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
clock->cycle_last = clocksource_read(clock);
+ xtime.tv_sec = sec;
+ xtime.tv_nsec = 0;
+ set_normalized_timespec(&wall_to_monotonic,
+ -xtime.tv_sec, -xtime.tv_nsec);
+
write_sequnlock_irqrestore(&xtime_lock, flags);
}
-
+/* flag for if timekeeping is suspended */
static int timekeeping_suspended;
+/* time in seconds when suspend began */
+static unsigned long timekeeping_suspend_time;
+
/**
* timekeeping_resume - Resumes the generic timekeeping subsystem.
* @dev: unused
@@ -909,13 +979,26 @@ static int timekeeping_suspended;
static int timekeeping_resume(struct sys_device *dev)
{
unsigned long flags;
+ unsigned long now = read_persistent_clock();
write_seqlock_irqsave(&xtime_lock, flags);
- /* restart the last cycle value */
+
+ if (now && (now > timekeeping_suspend_time)) {
+ unsigned long sleep_length = now - timekeeping_suspend_time;
+
+ xtime.tv_sec += sleep_length;
+ wall_to_monotonic.tv_sec -= sleep_length;
+ }
+ /* re-base the last cycle value */
clock->cycle_last = clocksource_read(clock);
clock->error = 0;
timekeeping_suspended = 0;
write_sequnlock_irqrestore(&xtime_lock, flags);
+
+ touch_softlockup_watchdog();
+ /* Resume hrtimers */
+ clock_was_set();
+
return 0;
}
@@ -925,6 +1008,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
write_seqlock_irqsave(&xtime_lock, flags);
timekeeping_suspended = 1;
+ timekeeping_suspend_time = read_persistent_clock();
write_sequnlock_irqrestore(&xtime_lock, flags);
return 0;
}
@@ -1089,11 +1173,8 @@ static void update_wall_time(void)
clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
/* check to see if there is a new clocksource to use */
- if (change_clocksource()) {
- clock->error = 0;
- clock->xtime_nsec = 0;
- clocksource_calculate_interval(clock, tick_nsec);
- }
+ change_clocksource();
+ update_vsyscall(&xtime, clock);
}
/*
@@ -1162,11 +1243,9 @@ static inline void calc_load(unsigned long ticks)
* This read-write spinlock protects us from races in SMP while
* playing with xtime and avenrun.
*/
-#ifndef ARCH_HAVE_XTIME_LOCK
-__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
+__attribute__((weak)) __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
EXPORT_SYMBOL(xtime_lock);
-#endif
/*
* This function runs timers and the timer-tq in bottom half context.
@@ -1175,7 +1254,8 @@ static void run_timer_softirq(struct softirq_action *h)
{
tvec_base_t *base = __get_cpu_var(tvec_bases);
- hrtimer_run_queues();
+ hrtimer_run_queues();
+
if (time_after_eq(jiffies, base->timer_jiffies))
__run_timers(base);
}
@@ -1621,6 +1701,8 @@ void __init init_timers(void)
int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
(void *)(long)smp_processor_id());
+ init_timer_stats();
+
BUG_ON(err == NOTIFY_BAD);
register_cpu_notifier(&timers_nb);
open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL);
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index baacc369141..658f638c402 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -22,8 +22,6 @@
#include <linux/acct.h>
#include <linux/jiffies.h>
-
-#define USEC_PER_TICK (USEC_PER_SEC/HZ)
/*
* fill in basic accounting fields
*/
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
new file mode 100644
index 00000000000..f22b9dbd2a9
--- /dev/null
+++ b/kernel/utsname_sysctl.c
@@ -0,0 +1,146 @@
+/*
+ * Copyright (C) 2007
+ *
+ * Author: Eric Biederman <ebiederm@xmision.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+#include <linux/module.h>
+#include <linux/uts.h>
+#include <linux/utsname.h>
+#include <linux/version.h>
+#include <linux/sysctl.h>
+
+static void *get_uts(ctl_table *table, int write)
+{
+ char *which = table->data;
+#ifdef CONFIG_UTS_NS
+ struct uts_namespace *uts_ns = current->nsproxy->uts_ns;
+ which = (which - (char *)&init_uts_ns) + (char *)uts_ns;
+#endif
+ if (!write)
+ down_read(&uts_sem);
+ else
+ down_write(&uts_sem);
+ return which;
+}
+
+static void put_uts(ctl_table *table, int write, void *which)
+{
+ if (!write)
+ up_read(&uts_sem);
+ else
+ up_write(&uts_sem);
+}
+
+#ifdef CONFIG_PROC_FS
+/*
+ * Special case of dostring for the UTS structure. This has locks
+ * to observe. Should this be in kernel/sys.c ????
+ */
+static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table uts_table;
+ int r;
+ memcpy(&uts_table, table, sizeof(uts_table));
+ uts_table.data = get_uts(table, write);
+ r = proc_dostring(&uts_table,write,filp,buffer,lenp, ppos);
+ put_uts(table, write, uts_table.data);
+ return r;
+}
+#else
+#define proc_do_uts_string NULL
+#endif
+
+
+#ifdef CONFIG_SYSCTL_SYSCALL
+/* The generic string strategy routine: */
+static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
+ void __user *oldval, size_t __user *oldlenp,
+ void __user *newval, size_t newlen)
+{
+ struct ctl_table uts_table;
+ int r, write;
+ write = newval && newlen;
+ memcpy(&uts_table, table, sizeof(uts_table));
+ uts_table.data = get_uts(table, write);
+ r = sysctl_string(&uts_table, name, nlen,
+ oldval, oldlenp, newval, newlen);
+ put_uts(table, write, uts_table.data);
+ return r;
+}
+#else
+#define sysctl_uts_string NULL
+#endif
+
+static struct ctl_table uts_kern_table[] = {
+ {
+ .ctl_name = KERN_OSTYPE,
+ .procname = "ostype",
+ .data = init_uts_ns.name.sysname,
+ .maxlen = sizeof(init_uts_ns.name.sysname),
+ .mode = 0444,
+ .proc_handler = proc_do_uts_string,
+ .strategy = sysctl_uts_string,
+ },
+ {
+ .ctl_name = KERN_OSRELEASE,
+ .procname = "osrelease",
+ .data = init_uts_ns.name.release,
+ .maxlen = sizeof(init_uts_ns.name.release),
+ .mode = 0444,
+ .proc_handler = proc_do_uts_string,
+ .strategy = sysctl_uts_string,
+ },
+ {
+ .ctl_name = KERN_VERSION,
+ .procname = "version",
+ .data = init_uts_ns.name.version,
+ .maxlen = sizeof(init_uts_ns.name.version),
+ .mode = 0444,
+ .proc_handler = proc_do_uts_string,
+ .strategy = sysctl_uts_string,
+ },
+ {
+ .ctl_name = KERN_NODENAME,
+ .procname = "hostname",
+ .data = init_uts_ns.name.nodename,
+ .maxlen = sizeof(init_uts_ns.name.nodename),
+ .mode = 0644,
+ .proc_handler = proc_do_uts_string,
+ .strategy = sysctl_uts_string,
+ },
+ {
+ .ctl_name = KERN_DOMAINNAME,
+ .procname = "domainname",
+ .data = init_uts_ns.name.domainname,
+ .maxlen = sizeof(init_uts_ns.name.domainname),
+ .mode = 0644,
+ .proc_handler = proc_do_uts_string,
+ .strategy = sysctl_uts_string,
+ },
+ {}
+};
+
+static struct ctl_table uts_root_table[] = {
+ {
+ .ctl_name = CTL_KERN,
+ .procname = "kernel",
+ .mode = 0555,
+ .child = uts_kern_table,
+ },
+ {}
+};
+
+static int __init utsname_sysctl_init(void)
+{
+ register_sysctl_table(uts_root_table);
+ return 0;
+}
+
+__initcall(utsname_sysctl_init);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 020d1fff57d..b6fa5e63085 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -218,7 +218,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
}
EXPORT_SYMBOL_GPL(queue_work);
-static void delayed_work_timer_fn(unsigned long __data)
+void delayed_work_timer_fn(unsigned long __data)
{
struct delayed_work *dwork = (struct delayed_work *)__data;
struct workqueue_struct *wq = get_wq_data(&dwork->work);
@@ -245,6 +245,7 @@ int fastcall queue_delayed_work(struct workqueue_struct *wq,
struct timer_list *timer = &dwork->timer;
struct work_struct *work = &dwork->work;
+ timer_stats_timer_set_start_info(timer);
if (delay == 0)
return queue_work(wq, work);
@@ -593,8 +594,10 @@ EXPORT_SYMBOL(schedule_work);
* After waiting for a given time this puts a job in the kernel-global
* workqueue.
*/
-int fastcall schedule_delayed_work(struct delayed_work *dwork, unsigned long delay)
+int fastcall schedule_delayed_work(struct delayed_work *dwork,
+ unsigned long delay)
{
+ timer_stats_timer_set_start_info(&dwork->timer);
return queue_delayed_work(keventd_wq, dwork, delay);
}
EXPORT_SYMBOL(schedule_delayed_work);