From a04198887658e1d8ae25f5420035c057cb170e67 Mon Sep 17 00:00:00 2001 From: Jon Hunter Date: Fri, 1 May 2009 13:10:23 -0700 Subject: timers: allow deferrable timers for intervals tv2-tv5 to be deferred In the current kernel implementation only kernel timers for time interval tv1 are being deferred. This patch allows any timer that is configured as deferrable to be defer regardless of time interval. This patch was previously discussed in http://marc.info/?l=linux-kernel&m=123196343531966&w=2 and was acked by Venki Pallipadi, the author of the original deferrable timer patch. Signed-off-by: Jon Hunter Acked-by: Venkatesh Pallipadi Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/timer.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index cffffad01c3..5c1e84beaf4 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1015,6 +1015,9 @@ cascade: index = slot = timer_jiffies & TVN_MASK; do { list_for_each_entry(nte, varp->vec + slot, entry) { + if (tbase_get_deferrable(nte->base)) + continue; + found = 1; if (time_before(nte->expires, expires)) expires = nte->expires; -- cgit v1.2.3 From 597d0275736dad9c3bda6f0a00a1c477dc0f37b1 Mon Sep 17 00:00:00 2001 From: Arun R Bharadwaj Date: Thu, 16 Apr 2009 12:13:26 +0530 Subject: timers: Framework for identifying pinned timers * Arun R Bharadwaj [2009-04-16 12:11:36]: This patch creates a new framework for identifying cpu-pinned timers and hrtimers. This framework is needed because pinned timers are expected to fire on the same CPU on which they are queued. So it is essential to identify these and not migrate them, in case there are any. For regular timers, the currently existing add_timer_on() can be used queue pinned timers and subsequently mod_timer_pinned() can be used to modify the 'expires' field. For hrtimers, new modes HRTIMER_ABS_PINNED and HRTIMER_REL_PINNED are added to queue cpu-pinned hrtimer. [ tglx: use .._PINNED mode argument instead of creating tons of new functions ] Signed-off-by: Arun R Bharadwaj Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 7 ++++--- kernel/timer.c | 31 +++++++++++++++++++++++++++---- 2 files changed, 31 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index cb8a15c1958..c71bcd54924 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -193,7 +193,8 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, * Switch the timer base to the current CPU when possible. */ static inline struct hrtimer_clock_base * -switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base) +switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, + int pinned) { struct hrtimer_clock_base *new_base; struct hrtimer_cpu_base *new_cpu_base; @@ -907,9 +908,9 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, ret = remove_hrtimer(timer, base); /* Switch the timer base, if necessary: */ - new_base = switch_hrtimer_base(timer, base); + new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED); - if (mode == HRTIMER_MODE_REL) { + if (mode & HRTIMER_MODE_REL) { tim = ktime_add_safe(tim, new_base->get_time()); /* * CONFIG_TIME_LOW_RES is a temporary way for architectures diff --git a/kernel/timer.c b/kernel/timer.c index 5c1e84beaf4..3424dfd11d5 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -604,7 +604,8 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer, } static inline int -__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) +__mod_timer(struct timer_list *timer, unsigned long expires, + bool pending_only, int pinned) { struct tvec_base *base, *new_base; unsigned long flags; @@ -668,7 +669,7 @@ out_unlock: */ int mod_timer_pending(struct timer_list *timer, unsigned long expires) { - return __mod_timer(timer, expires, true); + return __mod_timer(timer, expires, true, TIMER_NOT_PINNED); } EXPORT_SYMBOL(mod_timer_pending); @@ -702,10 +703,32 @@ int mod_timer(struct timer_list *timer, unsigned long expires) if (timer->expires == expires && timer_pending(timer)) return 1; - return __mod_timer(timer, expires, false); + return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); } EXPORT_SYMBOL(mod_timer); +/** + * mod_timer_pinned - modify a timer's timeout + * @timer: the timer to be modified + * @expires: new timeout in jiffies + * + * mod_timer_pinned() is a way to update the expire field of an + * active timer (if the timer is inactive it will be activated) + * and not allow the timer to be migrated to a different CPU. + * + * mod_timer_pinned(timer, expires) is equivalent to: + * + * del_timer(timer); timer->expires = expires; add_timer(timer); + */ +int mod_timer_pinned(struct timer_list *timer, unsigned long expires) +{ + if (timer->expires == expires && timer_pending(timer)) + return 1; + + return __mod_timer(timer, expires, false, TIMER_PINNED); +} +EXPORT_SYMBOL(mod_timer_pinned); + /** * add_timer - start a timer * @timer: the timer to be added @@ -1356,7 +1379,7 @@ signed long __sched schedule_timeout(signed long timeout) expire = timeout + jiffies; setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); - __mod_timer(&timer, expire, false); + __mod_timer(&timer, expire, false, TIMER_NOT_PINNED); schedule(); del_singleshot_timer_sync(&timer); -- cgit v1.2.3 From 5c333864a6ba811052d52ef14fbed056b9ac3512 Mon Sep 17 00:00:00 2001 From: Arun R Bharadwaj Date: Thu, 16 Apr 2009 12:14:37 +0530 Subject: timers: Identifying the existing pinned timers * Arun R Bharadwaj [2009-04-16 12:11:36]: The following pinned hrtimers have been identified and marked: 1)sched_rt_period_timer 2)tick_sched_timer 3)stack_trace_timer_fn [ tglx: fixup the hrtimer pinned mode ] Signed-off-by: Arun R Bharadwaj Signed-off-by: Thomas Gleixner --- kernel/sched.c | 4 ++-- kernel/time/tick-sched.c | 7 ++++--- kernel/trace/trace_sysprof.c | 3 ++- 3 files changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index b902e587a3a..9c5b4d3f97a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -244,7 +244,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) hard = hrtimer_get_expires(&rt_b->rt_period_timer); delta = ktime_to_ns(ktime_sub(hard, soft)); __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, - HRTIMER_MODE_ABS, 0); + HRTIMER_MODE_ABS_PINNED, 0); } spin_unlock(&rt_b->rt_runtime_lock); } @@ -1154,7 +1154,7 @@ static __init void init_hrtick(void) static void hrtick_start(struct rq *rq, u64 delay) { __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, - HRTIMER_MODE_REL, 0); + HRTIMER_MODE_REL_PINNED, 0); } static inline void init_hrtick(void) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index d3f1ef4d5cb..2aff39c6f10 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -349,7 +349,7 @@ void tick_nohz_stop_sched_tick(int inidle) if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { hrtimer_start(&ts->sched_timer, expires, - HRTIMER_MODE_ABS); + HRTIMER_MODE_ABS_PINNED); /* Check, if the timer was already in the past */ if (hrtimer_active(&ts->sched_timer)) goto out; @@ -395,7 +395,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { hrtimer_start_expires(&ts->sched_timer, - HRTIMER_MODE_ABS); + HRTIMER_MODE_ABS_PINNED); /* Check, if the timer was already in the past */ if (hrtimer_active(&ts->sched_timer)) break; @@ -698,7 +698,8 @@ void tick_setup_sched_timer(void) for (;;) { hrtimer_forward(&ts->sched_timer, now, tick_period); - hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS); + hrtimer_start_expires(&ts->sched_timer, + HRTIMER_MODE_ABS_PINNED); /* Check, if the timer was already in the past */ if (hrtimer_active(&ts->sched_timer)) break; diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index 91fd19c2149..d180554bc93 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -203,7 +203,8 @@ static void start_stack_timer(void *unused) hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer->function = stack_trace_timer_fn; - hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL); + hrtimer_start(hrtimer, ns_to_ktime(sample_period), + HRTIMER_MODE_REL_PINNED); } static void start_stack_timers(void) -- cgit v1.2.3 From cd1bb94b4a0531e8211a3774f17de831f8285f76 Mon Sep 17 00:00:00 2001 From: Arun R Bharadwaj Date: Thu, 16 Apr 2009 12:15:34 +0530 Subject: timers: /proc/sys sysctl hook to enable timer migration * Arun R Bharadwaj [2009-04-16 12:11:36]: This patch creates the /proc/sys sysctl interface at /proc/sys/kernel/timer_migration Timer migration is enabled by default. To disable timer migration, when CONFIG_SCHED_DEBUG = y, echo 0 > /proc/sys/kernel/timer_migration Signed-off-by: Arun R Bharadwaj Signed-off-by: Thomas Gleixner --- kernel/sched.c | 2 ++ kernel/sysctl.c | 8 ++++++++ 2 files changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 9c5b4d3f97a..7f1dd56af86 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8731,6 +8731,8 @@ void __init sched_init_smp(void) } #endif /* CONFIG_SMP */ +const_debug unsigned int sysctl_timer_migration = 1; + int in_sched_functions(unsigned long addr) { return in_lock_functions(addr) || diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e3d2c7dd59b..b3ce5813730 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -324,6 +324,14 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "timer_migration", + .data = &sysctl_timer_migration, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #endif { .ctl_name = CTL_UNNUMBERED, -- cgit v1.2.3 From eea08f32adb3f97553d49a4f79a119833036000a Mon Sep 17 00:00:00 2001 From: Arun R Bharadwaj Date: Thu, 16 Apr 2009 12:16:41 +0530 Subject: timers: Logic to move non pinned timers * Arun R Bharadwaj [2009-04-16 12:11:36]: This patch migrates all non pinned timers and hrtimers to the current idle load balancer, from all the idle CPUs. Timers firing on busy CPUs are not migrated. While migrating hrtimers, care should be taken to check if migrating a hrtimer would result in a latency or not. So we compare the expiry of the hrtimer with the next timer interrupt on the target cpu and migrate the hrtimer only if it expires *after* the next interrupt on the target cpu. So, added a clockevents_get_next_event() helper function to return the next_event on the target cpu's clock_event_device. [ tglx: cleanups and simplifications ] Signed-off-by: Arun R Bharadwaj Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 51 +++++++++++++++++++++++++++++++++++++++++++++-- kernel/sched.c | 5 +++++ kernel/time/clockevents.c | 12 +++++++++++ kernel/timer.c | 17 +++++++++++++--- 4 files changed, 80 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index c71bcd54924..b675a67c9ac 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -43,6 +43,8 @@ #include #include #include +#include +#include #include @@ -198,8 +200,19 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, { struct hrtimer_clock_base *new_base; struct hrtimer_cpu_base *new_cpu_base; + int cpu, preferred_cpu = -1; + + cpu = smp_processor_id(); +#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) + if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) { + preferred_cpu = get_nohz_load_balancer(); + if (preferred_cpu >= 0) + cpu = preferred_cpu; + } +#endif - new_cpu_base = &__get_cpu_var(hrtimer_bases); +again: + new_cpu_base = &per_cpu(hrtimer_bases, cpu); new_base = &new_cpu_base->clock_base[base->index]; if (base != new_base) { @@ -219,6 +232,40 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, timer->base = NULL; spin_unlock(&base->cpu_base->lock); spin_lock(&new_base->cpu_base->lock); + + /* Optimized away for NOHZ=n SMP=n */ + if (cpu == preferred_cpu) { + /* Calculate clock monotonic expiry time */ +#ifdef CONFIG_HIGH_RES_TIMERS + ktime_t expires = ktime_sub(hrtimer_get_expires(timer), + new_base->offset); +#else + ktime_t expires = hrtimer_get_expires(timer); +#endif + + /* + * Get the next event on target cpu from the + * clock events layer. + * This covers the highres=off nohz=on case as well. + */ + ktime_t next = clockevents_get_next_event(cpu); + + ktime_t delta = ktime_sub(expires, next); + + /* + * We do not migrate the timer when it is expiring + * before the next event on the target cpu because + * we cannot reprogram the target cpu hardware and + * we would cause it to fire late. + */ + if (delta.tv64 < 0) { + cpu = smp_processor_id(); + spin_unlock(&new_base->cpu_base->lock); + spin_lock(&base->cpu_base->lock); + timer->base = base; + goto again; + } + } timer->base = new_base; } return new_base; @@ -236,7 +283,7 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) return base; } -# define switch_hrtimer_base(t, b) (b) +# define switch_hrtimer_base(t, b, p) (b) #endif /* !CONFIG_SMP */ diff --git a/kernel/sched.c b/kernel/sched.c index 7f1dd56af86..9fe3774a0fd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4244,6 +4244,11 @@ static struct { .load_balancer = ATOMIC_INIT(-1), }; +int get_nohz_load_balancer(void) +{ + return atomic_read(&nohz.load_balancer); +} + /* * This routine will try to nominate the ilb (idle load balancing) * owner among the cpus whose ticks are stopped. ilb owner will do the idle diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index d13be216a79..ab20ded013b 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -18,6 +18,7 @@ #include #include #include +#include /* The registered clock event devices */ static LIST_HEAD(clockevent_devices); @@ -251,4 +252,15 @@ void clockevents_notify(unsigned long reason, void *arg) spin_unlock(&clockevents_lock); } EXPORT_SYMBOL_GPL(clockevents_notify); + +ktime_t clockevents_get_next_event(int cpu) +{ + struct tick_device *td; + struct clock_event_device *dev; + + td = &per_cpu(tick_cpu_device, cpu); + dev = td->evtdev; + + return dev->next_event; +} #endif diff --git a/kernel/timer.c b/kernel/timer.c index 3424dfd11d5..3f841db5edf 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -609,9 +610,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, { struct tvec_base *base, *new_base; unsigned long flags; - int ret; - - ret = 0; + int ret = 0 , cpu; timer_stats_timer_set_start_info(timer); BUG_ON(!timer->function); @@ -630,6 +629,18 @@ __mod_timer(struct timer_list *timer, unsigned long expires, new_base = __get_cpu_var(tvec_bases); + cpu = smp_processor_id(); + +#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) + if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) { + int preferred_cpu = get_nohz_load_balancer(); + + if (preferred_cpu >= 0) + cpu = preferred_cpu; + } +#endif + new_base = per_cpu(tvec_bases, cpu); + if (base != new_base) { /* * We are trying to schedule the timer on the local CPU. -- cgit v1.2.3