From 7bb67439bf6bd3782f07f1d7be1e63406453d5de Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 31 Aug 2008 08:05:58 -0700 Subject: select: Introduce a hrtimeout function This patch adds a schedule_hrtimeout() function, to be used by select() and poll() in a later patch. This function works similar to schedule_timeout() in most ways, but takes a timespec rather than jiffies. With a lot of contributions/fixes from Thomas Signed-off-by: Arjan van de Ven Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index b8e4dce80a7..782137dc755 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1678,3 +1678,68 @@ void __init hrtimers_init(void) #endif } +/** + * schedule_hrtimeout - sleep until timeout + * @expires: timeout value (ktime_t) + * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL + * + * Make the current task sleep until the given expiry time has + * elapsed. The routine will return immediately unless + * the current task state has been set (see set_current_state()). + * + * You can set the task state as follows - + * + * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to + * pass before the routine returns. + * + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is + * delivered to the current task. + * + * The current task state is guaranteed to be TASK_RUNNING when this + * routine returns. + * + * Returns 0 when the timer has expired otherwise -EINTR + */ +int __sched schedule_hrtimeout(ktime_t *expires, + const enum hrtimer_mode mode) +{ + struct hrtimer_sleeper t; + + /* + * Optimize when a zero timeout value is given. It does not + * matter whether this is an absolute or a relative time. + */ + if (expires && !expires->tv64) { + __set_current_state(TASK_RUNNING); + return 0; + } + + /* + * A NULL parameter means "inifinte" + */ + if (!expires) { + schedule(); + __set_current_state(TASK_RUNNING); + return -EINTR; + } + + hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, mode); + t.timer.expires = *expires; + + hrtimer_init_sleeper(&t, current); + + hrtimer_start(&t.timer, t.timer.expires, mode); + if (!hrtimer_active(&t.timer)) + t.task = NULL; + + if (likely(t.task)) + schedule(); + + hrtimer_cancel(&t.timer); + destroy_hrtimer_on_stack(&t.timer); + + __set_current_state(TASK_RUNNING); + + return !t.task ? 0 : -EINTR; +} +EXPORT_SYMBOL_GPL(schedule_hrtimeout); -- cgit v1.2.3 From df0cc0539b4127bd02f64de2c335b4af1fdb3845 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 31 Aug 2008 08:09:53 -0700 Subject: select: add a timespec_add_safe() function For the select() rework, it's important to be able to add timespec structures in an overflow-safe manner. This patch adds a timespec_add_safe() function for this which is similar in operation to ktime_add_safe(), but works on a struct timespec. Signed-off-by: Thomas Gleixner Signed-off-by: Arjan van de Ven --- kernel/time.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/time.c b/kernel/time.c index 6a08660b4fa..d63a4336fad 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -669,3 +669,21 @@ EXPORT_SYMBOL(get_jiffies_64); #endif EXPORT_SYMBOL(jiffies); + +/* + * Add two timespec values and do a safety check for overflow. + * It's assumed that both values are valid (>= 0) + */ +struct timespec timespec_add_safe(const struct timespec lhs, + const struct timespec rhs) +{ + struct timespec res; + + set_normalized_timespec(&res, lhs.tv_sec + rhs.tv_sec, + lhs.tv_nsec + rhs.tv_nsec); + + if (res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec) + res.tv_sec = TIME_T_MAX; + + return res; +} -- cgit v1.2.3 From cc584b213f252bf698849cf4be2377cd3ec7501a Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 1 Sep 2008 15:02:30 -0700 Subject: hrtimer: convert kernel/* to the new hrtimer apis In order to be able to do range hrtimers we need to use accessor functions to the "expire" member of the hrtimer struct. This patch converts kernel/* to these accessors. Signed-off-by: Arjan van de Ven --- kernel/futex.c | 7 +++---- kernel/hrtimer.c | 44 +++++++++++++++++++++++--------------------- kernel/posix-timers.c | 10 ++++------ kernel/rtmutex.c | 3 +-- kernel/sched.c | 7 +++---- kernel/time/ntp.c | 3 +-- kernel/time/tick-sched.c | 21 ++++++++++----------- kernel/time/timer_list.c | 4 ++-- 8 files changed, 47 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 7d1136e97c1..4cd5b4319b0 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1299,10 +1299,9 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); hrtimer_init_sleeper(&t, current); - t.timer.expires = *abs_time; + hrtimer_set_expires(&t.timer, *abs_time); - hrtimer_start(&t.timer, t.timer.expires, - HRTIMER_MODE_ABS); + hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS); if (!hrtimer_active(&t.timer)) t.task = NULL; @@ -1404,7 +1403,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); hrtimer_init_sleeper(to, current); - to->timer.expires = *time; + hrtimer_set_expires(&to->timer, *time); } q.pi_state = NULL; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 782137dc755..ae307feec74 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -517,7 +517,7 @@ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base) if (!base->first) continue; timer = rb_entry(base->first, struct hrtimer, node); - expires = ktime_sub(timer->expires, base->offset); + expires = ktime_sub(hrtimer_get_expires(timer), base->offset); if (expires.tv64 < cpu_base->expires_next.tv64) cpu_base->expires_next = expires; } @@ -539,10 +539,10 @@ static int hrtimer_reprogram(struct hrtimer *timer, struct hrtimer_clock_base *base) { ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next; - ktime_t expires = ktime_sub(timer->expires, base->offset); + ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); int res; - WARN_ON_ONCE(timer->expires.tv64 < 0); + WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0); /* * When the callback is running, we do not reprogram the clock event @@ -794,7 +794,7 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) u64 orun = 1; ktime_t delta; - delta = ktime_sub(now, timer->expires); + delta = ktime_sub(now, hrtimer_get_expires(timer)); if (delta.tv64 < 0) return 0; @@ -806,8 +806,8 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) s64 incr = ktime_to_ns(interval); orun = ktime_divns(delta, incr); - timer->expires = ktime_add_ns(timer->expires, incr * orun); - if (timer->expires.tv64 > now.tv64) + hrtimer_add_expires_ns(timer, incr * orun); + if (hrtimer_get_expires_tv64(timer) > now.tv64) return orun; /* * This (and the ktime_add() below) is the @@ -815,7 +815,7 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) */ orun++; } - timer->expires = ktime_add_safe(timer->expires, interval); + hrtimer_add_expires(timer, interval); return orun; } @@ -847,7 +847,8 @@ static void enqueue_hrtimer(struct hrtimer *timer, * We dont care about collisions. Nodes with * the same expiry time stay together. */ - if (timer->expires.tv64 < entry->expires.tv64) { + if (hrtimer_get_expires_tv64(timer) < + hrtimer_get_expires_tv64(entry)) { link = &(*link)->rb_left; } else { link = &(*link)->rb_right; @@ -982,7 +983,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) #endif } - timer->expires = tim; + hrtimer_set_expires(timer, tim); timer_stats_hrtimer_set_start_info(timer); @@ -1076,7 +1077,7 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer) ktime_t rem; base = lock_hrtimer_base(timer, &flags); - rem = ktime_sub(timer->expires, base->get_time()); + rem = hrtimer_expires_remaining(timer); unlock_hrtimer_base(timer, &flags); return rem; @@ -1108,7 +1109,7 @@ ktime_t hrtimer_get_next_event(void) continue; timer = rb_entry(base->first, struct hrtimer, node); - delta.tv64 = timer->expires.tv64; + delta.tv64 = hrtimer_get_expires_tv64(timer); delta = ktime_sub(delta, base->get_time()); if (delta.tv64 < mindelta.tv64) mindelta.tv64 = delta.tv64; @@ -1308,10 +1309,10 @@ void hrtimer_interrupt(struct clock_event_device *dev) timer = rb_entry(node, struct hrtimer, node); - if (basenow.tv64 < timer->expires.tv64) { + if (basenow.tv64 < hrtimer_get_expires_tv64(timer)) { ktime_t expires; - expires = ktime_sub(timer->expires, + expires = ktime_sub(hrtimer_get_expires(timer), base->offset); if (expires.tv64 < expires_next.tv64) expires_next = expires; @@ -1414,7 +1415,8 @@ void hrtimer_run_queues(void) struct hrtimer *timer; timer = rb_entry(node, struct hrtimer, node); - if (base->softirq_time.tv64 <= timer->expires.tv64) + if (base->softirq_time.tv64 <= + hrtimer_get_expires_tv64(timer)) break; if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) { @@ -1462,7 +1464,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod do { set_current_state(TASK_INTERRUPTIBLE); - hrtimer_start(&t->timer, t->timer.expires, mode); + hrtimer_start_expires(&t->timer, mode); if (!hrtimer_active(&t->timer)) t->task = NULL; @@ -1484,7 +1486,7 @@ static int update_rmtp(struct hrtimer *timer, struct timespec __user *rmtp) struct timespec rmt; ktime_t rem; - rem = ktime_sub(timer->expires, timer->base->get_time()); + rem = hrtimer_expires_remaining(timer); if (rem.tv64 <= 0) return 0; rmt = ktime_to_timespec(rem); @@ -1503,7 +1505,7 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart) hrtimer_init_on_stack(&t.timer, restart->nanosleep.index, HRTIMER_MODE_ABS); - t.timer.expires.tv64 = restart->nanosleep.expires; + hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); if (do_nanosleep(&t, HRTIMER_MODE_ABS)) goto out; @@ -1530,7 +1532,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, int ret = 0; hrtimer_init_on_stack(&t.timer, clockid, mode); - t.timer.expires = timespec_to_ktime(*rqtp); + hrtimer_set_expires(&t.timer, timespec_to_ktime(*rqtp)); if (do_nanosleep(&t, mode)) goto out; @@ -1550,7 +1552,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, restart->fn = hrtimer_nanosleep_restart; restart->nanosleep.index = t.timer.base->index; restart->nanosleep.rmtp = rmtp; - restart->nanosleep.expires = t.timer.expires.tv64; + restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); ret = -ERESTART_RESTARTBLOCK; out: @@ -1724,11 +1726,11 @@ int __sched schedule_hrtimeout(ktime_t *expires, } hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, mode); - t.timer.expires = *expires; + hrtimer_set_expires(&t.timer, *expires); hrtimer_init_sleeper(&t, current); - hrtimer_start(&t.timer, t.timer.expires, mode); + hrtimer_start_expires(&t.timer, mode); if (!hrtimer_active(&t.timer)) t.task = NULL; diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index e36d5798cbf..f85efcdcab2 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -668,7 +668,7 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv); - remaining = ktime_sub(timer->expires, now); + remaining = ktime_sub(hrtimer_get_expires(timer), now); /* Return 0 only, when the timer is expired and not pending */ if (remaining.tv64 <= 0) { /* @@ -762,7 +762,7 @@ common_timer_set(struct k_itimer *timr, int flags, hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); timr->it.real.timer.function = posix_timer_fn; - timer->expires = timespec_to_ktime(new_setting->it_value); + hrtimer_set_expires(timer, timespec_to_ktime(new_setting->it_value)); /* Convert interval */ timr->it.real.interval = timespec_to_ktime(new_setting->it_interval); @@ -771,14 +771,12 @@ common_timer_set(struct k_itimer *timr, int flags, if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { /* Setup correct expiry time for relative timers */ if (mode == HRTIMER_MODE_REL) { - timer->expires = - ktime_add_safe(timer->expires, - timer->base->get_time()); + hrtimer_add_expires(timer, timer->base->get_time()); } return 0; } - hrtimer_start(timer, timer->expires, mode); + hrtimer_start_expires(timer, mode); return 0; } diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 6522ae5b14a..69d9cb921ff 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -631,8 +631,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, /* Setup the timer, when timeout != NULL */ if (unlikely(timeout)) { - hrtimer_start(&timeout->timer, timeout->timer.expires, - HRTIMER_MODE_ABS); + hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); if (!hrtimer_active(&timeout->timer)) timeout->task = NULL; } diff --git a/kernel/sched.c b/kernel/sched.c index 1a5f73c1fcd..e46b5afa200 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -221,9 +221,8 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) now = hrtimer_cb_get_time(&rt_b->rt_period_timer); hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); - hrtimer_start(&rt_b->rt_period_timer, - rt_b->rt_period_timer.expires, - HRTIMER_MODE_ABS); + hrtimer_start_expires(&rt_b->rt_period_timer, + HRTIMER_MODE_ABS); } spin_unlock(&rt_b->rt_runtime_lock); } @@ -1058,7 +1057,7 @@ static void hrtick_start(struct rq *rq, u64 delay) struct hrtimer *timer = &rq->hrtick_timer; ktime_t time = ktime_add_ns(timer->base->get_time(), delay); - timer->expires = time; + hrtimer_set_expires(timer, time); if (rq == this_rq()) { hrtimer_restart(timer); diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 5125ddd8196..4c8d85421d2 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -142,8 +142,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) time_state = TIME_OOP; printk(KERN_NOTICE "Clock: " "inserting leap second 23:59:60 UTC\n"); - leap_timer.expires = ktime_add_ns(leap_timer.expires, - NSEC_PER_SEC); + hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC); res = HRTIMER_RESTART; break; case TIME_DEL: diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a87b0468568..b33be61c0f6 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -288,7 +288,7 @@ void tick_nohz_stop_sched_tick(int inidle) goto out; } - ts->idle_tick = ts->sched_timer.expires; + ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; ts->idle_jiffies = last_jiffies; rcu_enter_nohz(); @@ -419,21 +419,21 @@ void tick_nohz_restart_sched_tick(void) ts->tick_stopped = 0; ts->idle_exittime = now; hrtimer_cancel(&ts->sched_timer); - ts->sched_timer.expires = ts->idle_tick; + hrtimer_set_expires(&ts->sched_timer, ts->idle_tick); while (1) { /* Forward the time to expire in the future */ hrtimer_forward(&ts->sched_timer, now, tick_period); if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { - hrtimer_start(&ts->sched_timer, - ts->sched_timer.expires, + hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS); /* Check, if the timer was already in the past */ if (hrtimer_active(&ts->sched_timer)) break; } else { - if (!tick_program_event(ts->sched_timer.expires, 0)) + if (!tick_program_event( + hrtimer_get_expires(&ts->sched_timer), 0)) break; } /* Update jiffies and reread time */ @@ -446,7 +446,7 @@ void tick_nohz_restart_sched_tick(void) static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) { hrtimer_forward(&ts->sched_timer, now, tick_period); - return tick_program_event(ts->sched_timer.expires, 0); + return tick_program_event(hrtimer_get_expires(&ts->sched_timer), 0); } /* @@ -529,7 +529,7 @@ static void tick_nohz_switch_to_nohz(void) next = tick_init_jiffy_update(); for (;;) { - ts->sched_timer.expires = next; + hrtimer_set_expires(&ts->sched_timer, next); if (!tick_program_event(next, 0)) break; next = ktime_add(next, tick_period); @@ -625,16 +625,15 @@ void tick_setup_sched_timer(void) ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; /* Get the next period (per cpu) */ - ts->sched_timer.expires = tick_init_jiffy_update(); + hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); offset = ktime_to_ns(tick_period) >> 1; do_div(offset, num_possible_cpus()); offset *= smp_processor_id(); - ts->sched_timer.expires = ktime_add_ns(ts->sched_timer.expires, offset); + hrtimer_add_expires_ns(&ts->sched_timer, offset); for (;;) { hrtimer_forward(&ts->sched_timer, now, tick_period); - hrtimer_start(&ts->sched_timer, ts->sched_timer.expires, - HRTIMER_MODE_ABS); + hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS); /* Check, if the timer was already in the past */ if (hrtimer_active(&ts->sched_timer)) break; diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index a40e20fd000..5224a3215fb 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -66,8 +66,8 @@ print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now) #endif SEQ_printf(m, "\n"); SEQ_printf(m, " # expires at %Lu nsecs [in %Ld nsecs]\n", - (unsigned long long)ktime_to_ns(timer->expires), - (long long)(ktime_to_ns(timer->expires) - now)); + (unsigned long long)ktime_to_ns(hrtimer_get_expires(timer)), + (long long)(ktime_to_ns(hrtimer_get_expires(timer)) - now)); } static void -- cgit v1.2.3 From 654c8e0b1c623b156c5b92f28d914ab38c9c2c90 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 1 Sep 2008 15:47:08 -0700 Subject: hrtimer: turn hrtimers into range timers this patch turns hrtimers into range timers; they have 2 expire points 1) the soft expire point 2) the hard expire point the kernel will do it's regular best effort attempt to get the timer run at the hard expire point. However, if some other time fires after the soft expire point, the kernel now has the freedom to fire this timer at this point, and thus grouping the events and preventing a power-expensive wakeup in the future. Signed-off-by: Arjan van de Ven --- kernel/hrtimer.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 52 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index ae307feec74..01483004183 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1309,7 +1309,20 @@ void hrtimer_interrupt(struct clock_event_device *dev) timer = rb_entry(node, struct hrtimer, node); - if (basenow.tv64 < hrtimer_get_expires_tv64(timer)) { + /* + * The immediate goal for using the softexpires is + * minimizing wakeups, not running timers at the + * earliest interrupt after their soft expiration. + * This allows us to avoid using a Priority Search + * Tree, which can answer a stabbing querry for + * overlapping intervals and instead use the simple + * BST we already have. + * We don't add extra wakeups by delaying timers that + * are right-of a not yet expired timer, because that + * timer will have to trigger a wakeup anyway. + */ + + if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) { ktime_t expires; expires = ktime_sub(hrtimer_get_expires(timer), @@ -1681,14 +1694,20 @@ void __init hrtimers_init(void) } /** - * schedule_hrtimeout - sleep until timeout + * schedule_hrtimeout_range - sleep until timeout * @expires: timeout value (ktime_t) + * @delta: slack in expires timeout (ktime_t) * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL * * Make the current task sleep until the given expiry time has * elapsed. The routine will return immediately unless * the current task state has been set (see set_current_state()). * + * The @delta argument gives the kernel the freedom to schedule the + * actual wakeup to a time that is both power and performance friendly. + * The kernel give the normal best effort behavior for "@expires+@delta", + * but may decide to fire the timer earlier, but no earlier than @expires. + * * You can set the task state as follows - * * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to @@ -1702,7 +1721,7 @@ void __init hrtimers_init(void) * * Returns 0 when the timer has expired otherwise -EINTR */ -int __sched schedule_hrtimeout(ktime_t *expires, +int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, const enum hrtimer_mode mode) { struct hrtimer_sleeper t; @@ -1726,7 +1745,7 @@ int __sched schedule_hrtimeout(ktime_t *expires, } hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, mode); - hrtimer_set_expires(&t.timer, *expires); + hrtimer_set_expires_range_ns(&t.timer, *expires, delta); hrtimer_init_sleeper(&t, current); @@ -1744,4 +1763,33 @@ int __sched schedule_hrtimeout(ktime_t *expires, return !t.task ? 0 : -EINTR; } +EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); + +/** + * schedule_hrtimeout - sleep until timeout + * @expires: timeout value (ktime_t) + * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL + * + * Make the current task sleep until the given expiry time has + * elapsed. The routine will return immediately unless + * the current task state has been set (see set_current_state()). + * + * You can set the task state as follows - + * + * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to + * pass before the routine returns. + * + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is + * delivered to the current task. + * + * The current task state is guaranteed to be TASK_RUNNING when this + * routine returns. + * + * Returns 0 when the timer has expired otherwise -EINTR + */ +int __sched schedule_hrtimeout(ktime_t *expires, + const enum hrtimer_mode mode) +{ + return schedule_hrtimeout_range(expires, 0, mode); +} EXPORT_SYMBOL_GPL(schedule_hrtimeout); -- cgit v1.2.3 From 6976675d94042fbd446231d1bd8b7de71a980ada Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 1 Sep 2008 15:52:40 -0700 Subject: hrtimer: create a "timer_slack" field in the task struct We want to be able to control the default "rounding" that is used by select() and poll() and friends. This is a per process property (so that we can have a "nice" like program to start certain programs with a looser or stricter rounding) that can be set/get via a prctl(). For this purpose, a field called "timer_slack_ns" is added to the task struct. In addition, a field called "default_timer_slack"ns" is added so that tasks easily can temporarily to a more/less accurate slack and then back to the default. The default value of the slack is set to 50 usec; this is significantly less than 2.6.27's average select() and poll() timing error but still allows the kernel to group timers somewhat to preserve power behavior. Applications and admins can override this via the prctl() Signed-off-by: Arjan van de Ven --- kernel/fork.c | 2 ++ kernel/sys.c | 10 ++++++++++ 2 files changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 7ce2ebe8479..4308d75f0fa 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -987,6 +987,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->prev_utime = cputime_zero; p->prev_stime = cputime_zero; + p->default_timer_slack_ns = current->timer_slack_ns; + #ifdef CONFIG_DETECT_SOFTLOCKUP p->last_switch_count = 0; p->last_switch_timestamp = 0; diff --git a/kernel/sys.c b/kernel/sys.c index 038a7bc0901..1b96401a057 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1727,6 +1727,16 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, case PR_SET_TSC: error = SET_TSC_CTL(arg2); break; + case PR_GET_TIMERSLACK: + error = current->timer_slack_ns; + break; + case PR_SET_TIMERSLACK: + if (arg2 <= 0) + current->timer_slack_ns = + current->default_timer_slack_ns; + else + current->timer_slack_ns = arg2; + break; default: error = -EINVAL; break; -- cgit v1.2.3 From da8f2e170ea94cc20f8ebbc8ee8d127edb8f12f1 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 7 Sep 2008 10:47:46 -0700 Subject: hrtimer: add a hrtimer_start_range() function this patch adds a _range version of hrtimer_start() so that range timers can be created; the hrtimer_start() function is just a wrapper around this. In addition, hrtimer_start_expires() will now preserve existing ranges. Signed-off-by: Arjan van de Ven --- kernel/hrtimer.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 01483004183..a0222097c57 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -945,9 +945,10 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) } /** - * hrtimer_start - (re)start an relative timer on the current CPU + * hrtimer_start_range_ns - (re)start an relative timer on the current CPU * @timer: the timer to be added * @tim: expiry time + * @delta_ns: "slack" range for the timer * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) * * Returns: @@ -955,7 +956,8 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) * 1 when the timer was active */ int -hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) +hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_ns, + const enum hrtimer_mode mode) { struct hrtimer_clock_base *base, *new_base; unsigned long flags; @@ -983,7 +985,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) #endif } - hrtimer_set_expires(timer, tim); + hrtimer_set_expires_range_ns(timer, tim, delta_ns); timer_stats_hrtimer_set_start_info(timer); @@ -1016,8 +1018,26 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) return ret; } +EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); + +/** + * hrtimer_start - (re)start an relative timer on the current CPU + * @timer: the timer to be added + * @tim: expiry time + * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) + * + * Returns: + * 0 on success + * 1 when the timer was active + */ +int +hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) +{ + return hrtimer_start_range_ns(timer, tim, 0, mode); +} EXPORT_SYMBOL_GPL(hrtimer_start); + /** * hrtimer_try_to_cancel - try to deactivate a timer * @timer: hrtimer to stop -- cgit v1.2.3 From 704af52bd13a5d9f3c60c496c68e752fafdfb434 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 7 Sep 2008 16:10:20 -0700 Subject: hrtimer: show the timer ranges in /proc/timer_list to help debugging and visibility of timer ranges, show them in the existing timer list in /proc/timer_list Signed-off-by: Arjan van de Ven --- kernel/time/timer_list.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 5224a3215fb..122ee751d2d 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -65,8 +65,10 @@ print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now) SEQ_printf(m, ", %s/%d", tmp, timer->start_pid); #endif SEQ_printf(m, "\n"); - SEQ_printf(m, " # expires at %Lu nsecs [in %Ld nsecs]\n", + SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n", + (unsigned long long)ktime_to_ns(hrtimer_get_softexpires(timer)), (unsigned long long)ktime_to_ns(hrtimer_get_expires(timer)), + (long long)(ktime_to_ns(hrtimer_get_softexpires(timer)) - now), (long long)(ktime_to_ns(hrtimer_get_expires(timer)) - now)); } -- cgit v1.2.3 From 3bd012060f962567aadb52b27b2fc8fdc91102c7 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 8 Sep 2008 08:58:59 -0700 Subject: hrtimer: make the nanosleep() syscall use the per process slack This patch makes the nanosleep() system call use the per process slack value; with this users are able to externally control existing applications to reduce the wakeup rate. Signed-off-by: Arjan van de Ven --- kernel/hrtimer.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index a0222097c57..9a4c9018556 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1563,9 +1563,14 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, struct restart_block *restart; struct hrtimer_sleeper t; int ret = 0; + unsigned long slack; + + slack = current->timer_slack_ns; + if (rt_task(current)) + slack = 0; hrtimer_init_on_stack(&t.timer, clockid, mode); - hrtimer_set_expires(&t.timer, timespec_to_ktime(*rqtp)); + hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack); if (do_nanosleep(&t, mode)) goto out; -- cgit v1.2.3 From ae4b748e81b7e366f04f55229d5e372e372c33af Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 8 Sep 2008 09:03:57 -0700 Subject: hrtimer: make the futex() system call use the per process slack value This patch makes the futex() system call use the per process slack value; with this users are able to externally control existing applications to reduce the wakeup rate. Signed-off-by: Arjan van de Ven --- kernel/futex.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 4cd5b4319b0..8af10027514 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1296,10 +1296,14 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, if (!abs_time) schedule(); else { + unsigned long slack; + slack = current->timer_slack_ns; + if (rt_task(current)) + slack = 0; hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); hrtimer_init_sleeper(&t, current); - hrtimer_set_expires(&t.timer, *abs_time); + hrtimer_set_expires_range_ns(&t.timer, *abs_time, slack); hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS); if (!hrtimer_active(&t.timer)) -- cgit v1.2.3 From 2e94d1f71f7e4404d997e6fb4f1618aa147d76f9 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Wed, 10 Sep 2008 16:06:00 -0700 Subject: hrtimer: peek at the timer queue just before going idle As part of going idle, we already look at the time of the next timer event to determine which C-state to select etc. This patch adds functionality that causes the timers that are past their soft expire time, to fire at this time, before we calculate the next wakeup time. This functionality will thus avoid wakeups by running timers before going idle rather than specially waking up for it. Signed-off-by: Arjan van de Ven --- kernel/hrtimer.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 9a4c9018556..eb2cf984959 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1381,6 +1381,36 @@ void hrtimer_interrupt(struct clock_event_device *dev) raise_softirq(HRTIMER_SOFTIRQ); } +/** + * hrtimer_peek_ahead_timers -- run soft-expired timers now + * + * hrtimer_peek_ahead_timers will peek at the timer queue of + * the current cpu and check if there are any timers for which + * the soft expires time has passed. If any such timers exist, + * they are run immediately and then removed from the timer queue. + * + */ +void hrtimer_peek_ahead_timers(void) +{ + unsigned long flags; + struct tick_device *td; + struct clock_event_device *dev; + + if (hrtimer_hres_active()) + return; + + local_irq_save(flags); + td = &__get_cpu_var(tick_cpu_device); + if (!td) + goto out; + dev = td->evtdev; + if (!dev) + goto out; + hrtimer_interrupt(dev); +out: + local_irq_restore(flags); +} + static void run_hrtimer_softirq(struct softirq_action *h) { run_hrtimer_pending(&__get_cpu_var(hrtimer_bases)); -- cgit v1.2.3 From 030aebd2e439a2ebcca2b0ce30a02ed84feb043e Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 11 Oct 2008 12:25:45 -0700 Subject: rangetimer: fix BUG_ON reported by Ingo There's a small race/chance that, while hrtimers are enabled globally, they're later not enabled when we're calling the hrtimer_interrupt() function, which then BUG_ON()'s for that. This patch closes that race/gap. Signed-off-by: Arjan van de Ven --- kernel/hrtimer.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index eb2cf984959..b17657d8d81 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1395,11 +1395,15 @@ void hrtimer_peek_ahead_timers(void) unsigned long flags; struct tick_device *td; struct clock_event_device *dev; - + struct hrtimer_cpu_base *cpu_base; if (hrtimer_hres_active()) return; local_irq_save(flags); + cpu_base = &__get_cpu_var(hrtimer_bases); + if (!cpu_base->hres_active) + goto out; + td = &__get_cpu_var(tick_cpu_device); if (!td) goto out; -- cgit v1.2.3 From dc4304f7deee29fcdf6a2b62f7146ea7f505fd42 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 13 Oct 2008 10:32:15 -0400 Subject: rangetimers: fix the bug reported by Ingo for real and please hand me a brown paper bag (thanks to Thomas for pointing out this very obvious bug) Signed-off-by: Arjan van de Ven --- kernel/hrtimer.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index b17657d8d81..2bd230be1cb 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1395,15 +1395,11 @@ void hrtimer_peek_ahead_timers(void) unsigned long flags; struct tick_device *td; struct clock_event_device *dev; - struct hrtimer_cpu_base *cpu_base; - if (hrtimer_hres_active()) + + if (!hrtimer_hres_active()) return; local_irq_save(flags); - cpu_base = &__get_cpu_var(hrtimer_bases); - if (!cpu_base->hres_active) - goto out; - td = &__get_cpu_var(tick_cpu_device); if (!td) goto out; -- cgit v1.2.3 From 8cd162ce230b154e564a1285bb5f89fcf73f0dce Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 15 Oct 2008 20:37:23 +0200 Subject: sched: only update rq->clock while holding rq->lock Vatsa noticed rq->clock going funny and tracked it down to an update_rq_clock() outside a rq->lock section. This is a problem because things like double_rq_lock() update the rq->clock value for both rqs. Therefore disabling interrupts isn't strong enough. Reported-by: Srivatsa Vaddagiri Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6f230596bd0..c530b84c7f8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4441,12 +4441,8 @@ need_resched_nonpreemptible: if (sched_feat(HRTICK)) hrtick_clear(rq); - /* - * Do the rq-clock update outside the rq lock: - */ - local_irq_disable(); + spin_lock_irq(&rq->lock); update_rq_clock(rq); - spin_lock(&rq->lock); clear_tsk_need_resched(prev); if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { -- cgit v1.2.3 From c851c8676bd7ae456e9b3af8e6bb2c434eddcc75 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Fri, 17 Oct 2008 18:17:46 +0800 Subject: sched: fix the wrong mask_len If NR_CPUS isn't a multiple of 32, we get a truncated string of sched domains by catting /proc/schedstat. This is caused by the wrong mask_len. This patch fixes it. Signed-off-by: Miao Xie Cc: Signed-off-by: Ingo Molnar --- kernel/sched_stats.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 8385d43987e..81365b3d89f 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -9,7 +9,7 @@ static int show_schedstat(struct seq_file *seq, void *v) { int cpu; - int mask_len = NR_CPUS/32 * 9; + int mask_len = (NR_CPUS/32 + 1) * 9; char *mask_str = kmalloc(mask_len, GFP_KERNEL); if (mask_str == NULL) -- cgit v1.2.3 From b968905292eaa52b25abb7b3e6c0841dac9f03ae Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 17 Oct 2008 12:58:19 +0200 Subject: sched: fix the wrong mask_len, cleanup Clean up the division in show_schedstat(). Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_stats.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 81365b3d89f..67579253b53 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -9,7 +9,7 @@ static int show_schedstat(struct seq_file *seq, void *v) { int cpu; - int mask_len = (NR_CPUS/32 + 1) * 9; + int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9; char *mask_str = kmalloc(mask_len, GFP_KERNEL); if (mask_str == NULL) -- cgit v1.2.3 From b0aa51b999c449e5e3f9faa1ee406e052d407fe7 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 17 Oct 2008 15:33:21 +0200 Subject: sched: minor fast-path overhead reduction Greetings, 103638d added a bit of avoidable overhead to the fast-path. Use sysctl_sched_min_granularity instead of sched_slice() to restrict buddy wakeups. Signed-off-by: Mike Galbraith Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 18fd17172eb..67084936b60 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -747,7 +747,7 @@ pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) struct rq *rq = rq_of(cfs_rq); u64 pair_slice = rq->clock - cfs_rq->pair_start; - if (!cfs_rq->next || pair_slice > sched_slice(cfs_rq, cfs_rq->next)) { + if (!cfs_rq->next || pair_slice > sysctl_sched_min_granularity) { cfs_rq->pair_start = rq->clock; return se; } -- cgit v1.2.3 From e1dd7bc58578ebfcaba989608017fe5156c29c86 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 20 Oct 2008 13:33:36 +0200 Subject: hrtimers: fix docbook comments hrtimer_start() and hrtimer_start_range_ns() handle relative and absolute timers. Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 51ee90bca2d..00e6f0a1e7a 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -946,7 +946,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) } /** - * hrtimer_start_range_ns - (re)start an relative timer on the current CPU + * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU * @timer: the timer to be added * @tim: expiry time * @delta_ns: "slack" range for the timer @@ -1022,7 +1022,7 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); /** - * hrtimer_start - (re)start an relative timer on the current CPU + * hrtimer_start - (re)start an hrtimer on the current CPU * @timer: the timer to be added * @tim: expiry time * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) -- cgit v1.2.3 From 643bdf68f92a8516574ed7ca3713f9334c331b8d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 20 Oct 2008 13:38:11 +0200 Subject: hrtimers: simplify hrtimer_peek_ahead_timers() Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 00e6f0a1e7a..4fc41414fc0 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1394,22 +1394,16 @@ void hrtimer_interrupt(struct clock_event_device *dev) */ void hrtimer_peek_ahead_timers(void) { - unsigned long flags; struct tick_device *td; - struct clock_event_device *dev; + unsigned long flags; if (!hrtimer_hres_active()) return; local_irq_save(flags); td = &__get_cpu_var(tick_cpu_device); - if (!td) - goto out; - dev = td->evtdev; - if (!dev) - goto out; - hrtimer_interrupt(dev); -out: + if (td && td->evtdev) + hrtimer_interrupt(td->evtdev); local_irq_restore(flags); } -- cgit v1.2.3 From ffda12a17a324103e9900fa1035309811eecbfe5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 17 Oct 2008 19:27:02 +0200 Subject: sched: optimize group load balancer I noticed that tg_shares_up() unconditionally takes rq-locks for all cpus in the sched_domain. This hurts. We need the rq-locks whenever we change the weight of the per-cpu group sched entities. To allevate this a little, only change the weight when the new weight is at least shares_thresh away from the old value. This avoids the rq-lock for the top level entries, since those will never be re-weighted, and fuzzes the lower level entries a little to gain performance in semi-stable situations. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 45 +++++++++++++++++++++++++-------------------- kernel/sysctl.c | 10 ++++++++++ 2 files changed, 35 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c530b84c7f8..11ca3901783 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -817,6 +817,13 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32; */ unsigned int sysctl_sched_shares_ratelimit = 250000; +/* + * Inject some fuzzyness into changing the per-cpu group shares + * this avoids remote rq-locks at the expense of fairness. + * default: 4 + */ +unsigned int sysctl_sched_shares_thresh = 4; + /* * period over which we measure -rt task cpu usage in us. * default: 1s @@ -1453,8 +1460,8 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares); * Calculate and set the cpu's group shares. */ static void -__update_group_shares_cpu(struct task_group *tg, int cpu, - unsigned long sd_shares, unsigned long sd_rq_weight) +update_group_shares_cpu(struct task_group *tg, int cpu, + unsigned long sd_shares, unsigned long sd_rq_weight) { int boost = 0; unsigned long shares; @@ -1485,19 +1492,23 @@ __update_group_shares_cpu(struct task_group *tg, int cpu, * */ shares = (sd_shares * rq_weight) / (sd_rq_weight + 1); + shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); - /* - * record the actual number of shares, not the boosted amount. - */ - tg->cfs_rq[cpu]->shares = boost ? 0 : shares; - tg->cfs_rq[cpu]->rq_weight = rq_weight; + if (abs(shares - tg->se[cpu]->load.weight) > + sysctl_sched_shares_thresh) { + struct rq *rq = cpu_rq(cpu); + unsigned long flags; - if (shares < MIN_SHARES) - shares = MIN_SHARES; - else if (shares > MAX_SHARES) - shares = MAX_SHARES; + spin_lock_irqsave(&rq->lock, flags); + /* + * record the actual number of shares, not the boosted amount. + */ + tg->cfs_rq[cpu]->shares = boost ? 0 : shares; + tg->cfs_rq[cpu]->rq_weight = rq_weight; - __set_se_shares(tg->se[cpu], shares); + __set_se_shares(tg->se[cpu], shares); + spin_unlock_irqrestore(&rq->lock, flags); + } } /* @@ -1526,14 +1537,8 @@ static int tg_shares_up(struct task_group *tg, void *data) if (!rq_weight) rq_weight = cpus_weight(sd->span) * NICE_0_LOAD; - for_each_cpu_mask(i, sd->span) { - struct rq *rq = cpu_rq(i); - unsigned long flags; - - spin_lock_irqsave(&rq->lock, flags); - __update_group_shares_cpu(tg, i, shares, rq_weight); - spin_unlock_irqrestore(&rq->lock, flags); - } + for_each_cpu_mask(i, sd->span) + update_group_shares_cpu(tg, i, shares, rq_weight); return 0; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 617d41e4d6a..3d804f41e64 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -274,6 +274,16 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_shares_thresh", + .data = &sysctl_sched_shares_thresh, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + }, { .ctl_name = CTL_UNNUMBERED, .procname = "sched_child_runs_first", -- cgit v1.2.3 From a4c2f00f5cb848af7a8c816426b413c8e41834df Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 17 Oct 2008 19:27:03 +0200 Subject: sched: fair scheduler should not resched rt tasks With use of ftrace Steven noticed that some RT tasks got rescheduled due to sched_fair interaction. What happens is that we reprogram the hrtick from enqueue/dequeue_fair_task() because that can change nr_running, and thus a current tasks ideal runtime. However, its possible the current task isn't a fair_sched_class task, and thus doesn't have a hrtick set to change. Fix this by wrapping those hrtick_start_fair() calls in a hrtick_update() function, which will check for the right conditions. Reported-by: Steven Rostedt Signed-off-by: Peter Zijlstra Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 67084936b60..0c4bcac5476 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -73,6 +73,8 @@ unsigned int sysctl_sched_wakeup_granularity = 5000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; +static const struct sched_class fair_sched_class; + /************************************************************** * CFS operations on generic schedulable entities: */ @@ -848,11 +850,31 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) hrtick_start(rq, delta); } } + +/* + * called from enqueue/dequeue and updates the hrtick when the + * current task is from our class and nr_running is low enough + * to matter. + */ +static void hrtick_update(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + + if (curr->sched_class != &fair_sched_class) + return; + + if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) + hrtick_start_fair(rq, curr); +} #else /* !CONFIG_SCHED_HRTICK */ static inline void hrtick_start_fair(struct rq *rq, struct task_struct *p) { } + +static inline void hrtick_update(struct rq *rq) +{ +} #endif /* @@ -873,7 +895,7 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) wakeup = 1; } - hrtick_start_fair(rq, rq->curr); + hrtick_update(rq); } /* @@ -895,7 +917,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) sleep = 1; } - hrtick_start_fair(rq, rq->curr); + hrtick_update(rq); } /* @@ -1001,8 +1023,6 @@ static inline int wake_idle(int cpu, struct task_struct *p) #ifdef CONFIG_SMP -static const struct sched_class fair_sched_class; - #ifdef CONFIG_FAIR_GROUP_SCHED /* * effective_load() calculates the load change as seen from the root_task_group -- cgit v1.2.3 From f9c0b0950d5fd8c8c5af39bc061f27ea8fddcac3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 17 Oct 2008 19:27:04 +0200 Subject: sched: revert back to per-rq vruntime Vatsa rightly points out that having the runqueue weight in the vruntime calculations can cause unfairness in the face of task joins/leaves. Suppose: dv = dt * rw / w Then take 10 tasks t_n, each of similar weight. If the first will run 1 then its vruntime will increase by 10. Now, if the next 8 tasks leave after having run their 1, then the last task will get a vruntime increase of 2 after having run 1. Which will leave us with 2 tasks of equal weight and equal runtime, of which one will not be scheduled for 8/2=4 units of time. Ergo, we cannot do that and must use: dv = dt / w. This means we cannot have a global vruntime based on effective priority, but must instead go back to the vruntime per rq model we started out with. This patch was lightly tested by doing starting while loops on each nice level and observing their execution time, and a simple group scenario of 1:2:3 pinned to a single cpu. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0c4bcac5476..a0aa38b10fd 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -336,7 +336,7 @@ int sched_nr_latency_handler(struct ctl_table *table, int write, #endif /* - * delta *= w / rw + * delta *= P[w / rw] */ static inline unsigned long calc_delta_weight(unsigned long delta, struct sched_entity *se) @@ -350,15 +350,13 @@ calc_delta_weight(unsigned long delta, struct sched_entity *se) } /* - * delta *= rw / w + * delta /= w */ static inline unsigned long calc_delta_fair(unsigned long delta, struct sched_entity *se) { - for_each_sched_entity(se) { - delta = calc_delta_mine(delta, - cfs_rq_of(se)->load.weight, &se->load); - } + if (unlikely(se->load.weight != NICE_0_LOAD)) + delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load); return delta; } @@ -388,26 +386,26 @@ static u64 __sched_period(unsigned long nr_running) * We calculate the wall-time slice from the period by taking a part * proportional to the weight. * - * s = p*w/rw + * s = p*P[w/rw] */ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { - return calc_delta_weight(__sched_period(cfs_rq->nr_running), se); + unsigned long nr_running = cfs_rq->nr_running; + + if (unlikely(!se->on_rq)) + nr_running++; + + return calc_delta_weight(__sched_period(nr_running), se); } /* * We calculate the vruntime slice of a to be inserted task * - * vs = s*rw/w = p + * vs = s/w */ -static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) +static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) { - unsigned long nr_running = cfs_rq->nr_running; - - if (!se->on_rq) - nr_running++; - - return __sched_period(nr_running); + return calc_delta_fair(sched_slice(cfs_rq, se), se); } /* @@ -629,7 +627,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) * stays open at the end. */ if (initial && sched_feat(START_DEBIT)) - vruntime += sched_vslice_add(cfs_rq, se); + vruntime += sched_vslice(cfs_rq, se); if (!initial) { /* sleeps upto a single latency don't count. */ -- cgit v1.2.3 From 0c4b83da58ec2e96ce9c44c211d6eac5f9dae478 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 20 Oct 2008 14:27:43 +0200 Subject: sched: disable the hrtick for now David Miller reported that hrtick update overhead has tripled the wakeup overhead on Sparc64. That is too much - disable the HRTICK feature for now by default, until a faster implementation is found. Reported-by: David Miller Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_features.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 7c9e8f4a049..fda01621829 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -5,7 +5,7 @@ SCHED_FEAT(START_DEBIT, 1) SCHED_FEAT(AFFINE_WAKEUPS, 1) SCHED_FEAT(CACHE_HOT_BUDDY, 1) SCHED_FEAT(SYNC_WAKEUPS, 1) -SCHED_FEAT(HRTICK, 1) +SCHED_FEAT(HRTICK, 0) SCHED_FEAT(DOUBLE_TICK, 0) SCHED_FEAT(ASYM_GRAN, 1) SCHED_FEAT(LB_BIAS, 1) -- cgit v1.2.3 From 0b3682ba33c59a362901b478bdab965da888b350 Mon Sep 17 00:00:00 2001 From: Chris Friesen Date: Mon, 20 Oct 2008 12:41:58 -0600 Subject: genirq: fix set_irq_type() when recording trigger type Impact: fix boot hang on a G5 In set_irq_type() we want to pass the type rather than the current interrupt state. Signed-off-by: Chris Friesen Acked-by: Benjamin Herrenschmidt Acked-by: David Brownell Signed-off-by: Ingo Molnar --- kernel/irq/chip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 4895fde4eb9..3de6ea3ee74 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -127,7 +127,7 @@ int set_irq_type(unsigned int irq, unsigned int type) return 0; spin_lock_irqsave(&desc->lock, flags); - ret = __irq_set_trigger(desc, irq, flags); + ret = __irq_set_trigger(desc, irq, type); spin_unlock_irqrestore(&desc->lock, flags); return ret; } -- cgit v1.2.3 From 9a1c3542768b5a58e45a9216921cd10a3bae1205 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 22 Feb 2008 20:40:24 -0500 Subject: [PATCH] pass fmode_t to blkdev_put() Signed-off-by: Al Viro --- kernel/power/swap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 80ccac849e4..7b9d611c110 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -178,7 +178,7 @@ static int swsusp_swap_check(void) /* This is called before saving image */ res = set_blocksize(resume_bdev, PAGE_SIZE); if (res < 0) - blkdev_put(resume_bdev); + blkdev_put(resume_bdev, FMODE_WRITE); return res; } @@ -574,7 +574,7 @@ int swsusp_read(unsigned int *flags_p) error = load_image(&handle, &snapshot, header->pages - 1); release_swap_reader(&handle); - blkdev_put(resume_bdev); + blkdev_put(resume_bdev, FMODE_READ); if (!error) pr_debug("PM: Image successfully loaded\n"); @@ -609,7 +609,7 @@ int swsusp_check(void) return -EINVAL; } if (error) - blkdev_put(resume_bdev); + blkdev_put(resume_bdev, FMODE_READ); else pr_debug("PM: Signature found, resuming\n"); } else { @@ -633,7 +633,7 @@ void swsusp_close(void) return; } - blkdev_put(resume_bdev); + blkdev_put(resume_bdev, 0); /* move up */ } static int swsusp_header_init(void) -- cgit v1.2.3 From c2dd0dae185423fb243b13d490c3fcfaa18ff333 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 8 Oct 2007 13:21:10 -0400 Subject: [PATCH] propagate mode through swsusp_close() Signed-off-by: Al Viro --- kernel/power/disk.c | 2 +- kernel/power/power.h | 2 +- kernel/power/swap.c | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 331f9836383..c9d74083746 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -651,7 +651,7 @@ static int software_resume(void) pr_debug("PM: Preparing processes for restore.\n"); error = prepare_processes(); if (error) { - swsusp_close(); + swsusp_close(FMODE_READ); goto Done; } diff --git a/kernel/power/power.h b/kernel/power/power.h index acc0c101dbd..46b5ec7a3af 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -153,7 +153,7 @@ extern int swsusp_shrink_memory(void); extern void swsusp_free(void); extern int swsusp_read(unsigned int *flags_p); extern int swsusp_write(unsigned int flags); -extern void swsusp_close(void); +extern void swsusp_close(fmode_t); struct timeval; /* kernel/power/swsusp.c */ diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 7b9d611c110..178b001a4f1 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -426,7 +426,7 @@ int swsusp_write(unsigned int flags) release_swap_writer(&handle); out: - swsusp_close(); + swsusp_close(FMODE_WRITE); return error; } @@ -626,14 +626,14 @@ int swsusp_check(void) * swsusp_close - close swap device. */ -void swsusp_close(void) +void swsusp_close(fmode_t mode) { if (IS_ERR(resume_bdev)) { pr_debug("PM: Image device not initialised\n"); return; } - blkdev_put(resume_bdev, 0); /* move up */ + blkdev_put(resume_bdev, mode); /* move up */ } static int swsusp_header_init(void) -- cgit v1.2.3 From 572c48921574dbe6dceb958cf965aa962baefde4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 8 Oct 2007 13:24:05 -0400 Subject: [PATCH] sanitize blkdev_get() and friends * get rid of fake struct file/struct dentry in __blkdev_get() * merge __blkdev_get() and do_open() * get rid of flags argument of blkdev_get() Signed-off-by: Al Viro --- kernel/power/swap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 178b001a4f1..b7713b53d07 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -172,7 +172,7 @@ static int swsusp_swap_check(void) /* This is called before saving image */ return res; root_swap = res; - res = blkdev_get(resume_bdev, FMODE_WRITE, O_RDWR); + res = blkdev_get(resume_bdev, FMODE_WRITE); if (res) return res; -- cgit v1.2.3 From b6f3b7803a9231eddc36d0a2a6d2d8105ef89344 Mon Sep 17 00:00:00 2001 From: Dean Nelson Date: Sat, 18 Oct 2008 16:06:56 -0700 Subject: genirq: NULL struct irq_desc's member 'name' in dynamic_irq_cleanup() If the member 'name' of the irq_desc structure happens to point to a character string that is resident within a kernel module, problems ensue if that module is rmmod'd (at which time dynamic_irq_cleanup() is called) and then later show_interrupts() is called by someone. It is also not a good thing if the character string resided in kmalloc'd space that has been kfree'd (after having called dynamic_irq_cleanup()). dynamic_irq_cleanup() fails to NULL the 'name' member and show_interrupts() references it on a few architectures (like h8300, sh and x86). Signed-off-by: Dean Nelson Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/irq/chip.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 3de6ea3ee74..10b5092e9bf 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -76,6 +76,7 @@ void dynamic_irq_cleanup(unsigned int irq) desc->chip_data = NULL; desc->handle_irq = handle_bad_irq; desc->chip = &no_irq_chip; + desc->name = NULL; spin_unlock_irqrestore(&desc->lock, flags); } -- cgit v1.2.3 From 5f86515158ca86182c1dbecd546f1848121ba135 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 17 Oct 2008 14:40:30 +0800 Subject: rcupdate: fix bug of rcu_barrier*() current rcu_barrier_bh() is like this: void rcu_barrier_bh(void) { BUG_ON(in_interrupt()); /* Take cpucontrol mutex to protect against CPU hotplug */ mutex_lock(&rcu_barrier_mutex); init_completion(&rcu_barrier_completion); atomic_set(&rcu_barrier_cpu_count, 0); /* * The queueing of callbacks in all CPUs must be atomic with * respect to RCU, otherwise one CPU may queue a callback, * wait for a grace period, decrement barrier count and call * complete(), while other CPUs have not yet queued anything. * So, we need to make sure that grace periods cannot complete * until all the callbacks are queued. */ rcu_read_lock(); on_each_cpu(rcu_barrier_func, (void *)RCU_BARRIER_BH, 1); rcu_read_unlock(); wait_for_completion(&rcu_barrier_completion); mutex_unlock(&rcu_barrier_mutex); } The inconsistency of the code and the comments show a bug here. rcu_read_lock() cannot make sure that "grace periods for RCU_BH cannot complete until all the callbacks are queued". it only make sure that race periods for RCU cannot complete until all the callbacks are queued. so we must use rcu_read_lock_bh() for rcu_barrier_bh(). like this: void rcu_barrier_bh(void) { ...... rcu_read_lock_bh(); on_each_cpu(rcu_barrier_func, (void *)RCU_BARRIER_BH, 1); rcu_read_unlock_bh(); ...... } and also rcu_barrier() rcu_barrier_sched() are implemented like this. it will bring a lot of duplicate code. My patch uses another way to fix this bug, please see the comment of my patch. Thank Paul E. McKenney for he rewrote the comment. Signed-off-by: Lai Jiangshan Reviewed-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/rcupdate.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 467d5940f62..ad63af8b252 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -119,18 +119,19 @@ static void _rcu_barrier(enum rcu_barrier type) /* Take cpucontrol mutex to protect against CPU hotplug */ mutex_lock(&rcu_barrier_mutex); init_completion(&rcu_barrier_completion); - atomic_set(&rcu_barrier_cpu_count, 0); /* - * The queueing of callbacks in all CPUs must be atomic with - * respect to RCU, otherwise one CPU may queue a callback, - * wait for a grace period, decrement barrier count and call - * complete(), while other CPUs have not yet queued anything. - * So, we need to make sure that grace periods cannot complete - * until all the callbacks are queued. + * Initialize rcu_barrier_cpu_count to 1, then invoke + * rcu_barrier_func() on each CPU, so that each CPU also has + * incremented rcu_barrier_cpu_count. Only then is it safe to + * decrement rcu_barrier_cpu_count -- otherwise the first CPU + * might complete its grace period before all of the other CPUs + * did their increment, causing this function to return too + * early. */ - rcu_read_lock(); + atomic_set(&rcu_barrier_cpu_count, 1); on_each_cpu(rcu_barrier_func, (void *)type, 1); - rcu_read_unlock(); + if (atomic_dec_and_test(&rcu_barrier_cpu_count)) + complete(&rcu_barrier_completion); wait_for_completion(&rcu_barrier_completion); mutex_unlock(&rcu_barrier_mutex); } -- cgit v1.2.3 From c4bd822e7b12a9008241d76db45b665f2fef180c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Oct 2008 20:17:35 +0200 Subject: NOHZ: fix thinko in the timer restart code path commit fb02fbc14d17837b4b7b02dbb36142c16a7bf208 (NOHZ: restart tick device from irq_enter()) solves the problem of stale jiffies when long running softirqs happen in a long idle sleep period, but it has a major thinko in it: When the interrupt which came in _is_ the timer interrupt which should expire ts->sched_timer then we cancel and rearm the timer _before_ it gets expired in hrtimer_interrupt() to the next period. That means the call back function is not called. This game can go on for ever :( Prevent this by making sure to only rearm the timer when the expiry time is more than one tick_period away. Otherwise keep it running as it is either already expired or will expiry at the right point to update jiffies. Signed-off-by: Thomas Gleixner Tested-by: Venkatesch Pallipadi --- kernel/time/tick-sched.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 0581c11fe6c..727c1ae0517 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -567,11 +567,21 @@ static void tick_nohz_switch_to_nohz(void) static void tick_nohz_kick_tick(int cpu) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + ktime_t delta, now; if (!ts->tick_stopped) return; - tick_nohz_restart(ts, ktime_get()); + /* + * Do not touch the tick device, when the next expiry is either + * already reached or less/equal than the tick period. + */ + now = ktime_get(); + delta = ktime_sub(ts->sched_timer.expires, now); + if (delta.tv64 <= tick_period.tv64) + return; + + tick_nohz_restart(ts, now); } #else -- cgit v1.2.3 From 5e458cc0f4770eea45d3c07110f01b3a94c72aa5 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 22 Oct 2008 10:00:13 -0500 Subject: module: simplify load_module. Linus' recent catch of stack overflow in load_module lead me to look at the code. A couple of helpers to get a section address and get objects from a section can help clean things up a little. (And in case you're wondering, the stack size also dropped from 328 to 284 bytes). Signed-off-by: Rusty Russell --- kernel/module.c | 235 ++++++++++++++++++++++++-------------------------------- 1 file changed, 99 insertions(+), 136 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 0d8d21ee792..3d256681ab6 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -132,6 +132,29 @@ static unsigned int find_sec(Elf_Ehdr *hdr, return 0; } +/* Find a module section, or NULL. */ +static void *section_addr(Elf_Ehdr *hdr, Elf_Shdr *shdrs, + const char *secstrings, const char *name) +{ + /* Section 0 has sh_addr 0. */ + return (void *)shdrs[find_sec(hdr, shdrs, secstrings, name)].sh_addr; +} + +/* Find a module section, or NULL. Fill in number of "objects" in section. */ +static void *section_objs(Elf_Ehdr *hdr, + Elf_Shdr *sechdrs, + const char *secstrings, + const char *name, + size_t object_size, + unsigned int *num) +{ + unsigned int sec = find_sec(hdr, sechdrs, secstrings, name); + + /* Section 0 has sh_addr 0 and sh_size 0. */ + *num = sechdrs[sec].sh_size / object_size; + return (void *)sechdrs[sec].sh_addr; +} + /* Provided by the linker */ extern const struct kernel_symbol __start___ksymtab[]; extern const struct kernel_symbol __stop___ksymtab[]; @@ -1789,32 +1812,20 @@ static inline void add_kallsyms(struct module *mod, } #endif /* CONFIG_KALLSYMS */ -#ifdef CONFIG_DYNAMIC_PRINTK_DEBUG -static void dynamic_printk_setup(Elf_Shdr *sechdrs, unsigned int verboseindex) +static void dynamic_printk_setup(struct mod_debug *debug, unsigned int num) { - struct mod_debug *debug_info; - unsigned long pos, end; - unsigned int num_verbose; - - pos = sechdrs[verboseindex].sh_addr; - num_verbose = sechdrs[verboseindex].sh_size / - sizeof(struct mod_debug); - end = pos + (num_verbose * sizeof(struct mod_debug)); +#ifdef CONFIG_DYNAMIC_PRINTK_DEBUG + unsigned int i; - for (; pos < end; pos += sizeof(struct mod_debug)) { - debug_info = (struct mod_debug *)pos; - register_dynamic_debug_module(debug_info->modname, - debug_info->type, debug_info->logical_modname, - debug_info->flag_names, debug_info->hash, - debug_info->hash2); + for (i = 0; i < num; i++) { + register_dynamic_debug_module(debug[i].modname, + debug[i].type, + debug[i].logical_modname, + debug[i].flag_names, + debug[i].hash, debug[i].hash2); } -} -#else -static inline void dynamic_printk_setup(Elf_Shdr *sechdrs, - unsigned int verboseindex) -{ -} #endif /* CONFIG_DYNAMIC_PRINTK_DEBUG */ +} static void *module_alloc_update_bounds(unsigned long size) { @@ -1843,37 +1854,14 @@ static noinline struct module *load_module(void __user *umod, unsigned int i; unsigned int symindex = 0; unsigned int strindex = 0; - unsigned int setupindex; - unsigned int exindex; - unsigned int exportindex; - unsigned int modindex; - unsigned int obsparmindex; - unsigned int infoindex; - unsigned int gplindex; - unsigned int crcindex; - unsigned int gplcrcindex; - unsigned int versindex; - unsigned int pcpuindex; - unsigned int gplfutureindex; - unsigned int gplfuturecrcindex; + unsigned int modindex, versindex, infoindex, pcpuindex; unsigned int unwindex = 0; -#ifdef CONFIG_UNUSED_SYMBOLS - unsigned int unusedindex; - unsigned int unusedcrcindex; - unsigned int unusedgplindex; - unsigned int unusedgplcrcindex; -#endif - unsigned int markersindex; - unsigned int markersstringsindex; - unsigned int verboseindex; - unsigned int tracepointsindex; - unsigned int tracepointsstringsindex; - unsigned int mcountindex; + unsigned int num_kp, num_mcount; + struct kernel_param *kp; struct module *mod; long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ - void *mseg; - struct exception_table_entry *extable; + unsigned long *mseg; mm_segment_t old_fs; DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", @@ -1937,6 +1925,7 @@ static noinline struct module *load_module(void __user *umod, err = -ENOEXEC; goto free_hdr; } + /* This is temporary: point mod into copy of data. */ mod = (void *)sechdrs[modindex].sh_addr; if (symindex == 0) { @@ -1946,22 +1935,6 @@ static noinline struct module *load_module(void __user *umod, goto free_hdr; } - /* Optional sections */ - exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab"); - gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl"); - gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future"); - crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab"); - gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl"); - gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future"); -#ifdef CONFIG_UNUSED_SYMBOLS - unusedindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused"); - unusedgplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused_gpl"); - unusedcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused"); - unusedgplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused_gpl"); -#endif - setupindex = find_sec(hdr, sechdrs, secstrings, "__param"); - exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table"); - obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm"); versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); @@ -2117,42 +2090,57 @@ static noinline struct module *load_module(void __user *umod, if (err < 0) goto cleanup; - /* Set up EXPORTed & EXPORT_GPLed symbols (section 0 is 0 length) */ - mod->num_syms = sechdrs[exportindex].sh_size / sizeof(*mod->syms); - mod->syms = (void *)sechdrs[exportindex].sh_addr; - if (crcindex) - mod->crcs = (void *)sechdrs[crcindex].sh_addr; - mod->num_gpl_syms = sechdrs[gplindex].sh_size / sizeof(*mod->gpl_syms); - mod->gpl_syms = (void *)sechdrs[gplindex].sh_addr; - if (gplcrcindex) - mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr; - mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size / - sizeof(*mod->gpl_future_syms); - mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr; - if (gplfuturecrcindex) - mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr; + /* Now we've got everything in the final locations, we can + * find optional sections. */ + kp = section_objs(hdr, sechdrs, secstrings, "__param", sizeof(*kp), + &num_kp); + mod->syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab", + sizeof(*mod->syms), &mod->num_syms); + mod->crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab"); + mod->gpl_syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab_gpl", + sizeof(*mod->gpl_syms), + &mod->num_gpl_syms); + mod->gpl_crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab_gpl"); + mod->gpl_future_syms = section_objs(hdr, sechdrs, secstrings, + "__ksymtab_gpl_future", + sizeof(*mod->gpl_future_syms), + &mod->num_gpl_future_syms); + mod->gpl_future_crcs = section_addr(hdr, sechdrs, secstrings, + "__kcrctab_gpl_future"); #ifdef CONFIG_UNUSED_SYMBOLS - mod->num_unused_syms = sechdrs[unusedindex].sh_size / - sizeof(*mod->unused_syms); - mod->num_unused_gpl_syms = sechdrs[unusedgplindex].sh_size / - sizeof(*mod->unused_gpl_syms); - mod->unused_syms = (void *)sechdrs[unusedindex].sh_addr; - if (unusedcrcindex) - mod->unused_crcs = (void *)sechdrs[unusedcrcindex].sh_addr; - mod->unused_gpl_syms = (void *)sechdrs[unusedgplindex].sh_addr; - if (unusedgplcrcindex) - mod->unused_gpl_crcs - = (void *)sechdrs[unusedgplcrcindex].sh_addr; + mod->unused_syms = section_objs(hdr, sechdrs, secstrings, + "__ksymtab_unused", + sizeof(*mod->unused_syms), + &mod->num_unused_syms); + mod->unused_crcs = section_addr(hdr, sechdrs, secstrings, + "__kcrctab_unused"); + mod->unused_gpl_syms = section_objs(hdr, sechdrs, secstrings, + "__ksymtab_unused_gpl", + sizeof(*mod->unused_gpl_syms), + &mod->num_unused_gpl_syms); + mod->unused_gpl_crcs = section_addr(hdr, sechdrs, secstrings, + "__kcrctab_unused_gpl"); +#endif + +#ifdef CONFIG_MARKERS + mod->markers = section_objs(hdr, sechdrs, secstrings, "__markers", + sizeof(*mod->markers), &mod->num_markers); +#endif +#ifdef CONFIG_TRACEPOINTS + mod->tracepoints = section_objs(hdr, sechdrs, secstrings, + "__tracepoints", + sizeof(*mod->tracepoints), + &mod->num_tracepoints); #endif #ifdef CONFIG_MODVERSIONS - if ((mod->num_syms && !crcindex) - || (mod->num_gpl_syms && !gplcrcindex) - || (mod->num_gpl_future_syms && !gplfuturecrcindex) + if ((mod->num_syms && !mod->crcs) + || (mod->num_gpl_syms && !mod->gpl_crcs) + || (mod->num_gpl_future_syms && !mod->gpl_future_crcs) #ifdef CONFIG_UNUSED_SYMBOLS - || (mod->num_unused_syms && !unusedcrcindex) - || (mod->num_unused_gpl_syms && !unusedgplcrcindex) + || (mod->num_unused_syms && !mod->unused_crcs) + || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs) #endif ) { printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name); @@ -2161,16 +2149,6 @@ static noinline struct module *load_module(void __user *umod, goto cleanup; } #endif - markersindex = find_sec(hdr, sechdrs, secstrings, "__markers"); - markersstringsindex = find_sec(hdr, sechdrs, secstrings, - "__markers_strings"); - verboseindex = find_sec(hdr, sechdrs, secstrings, "__verbose"); - tracepointsindex = find_sec(hdr, sechdrs, secstrings, "__tracepoints"); - tracepointsstringsindex = find_sec(hdr, sechdrs, secstrings, - "__tracepoints_strings"); - - mcountindex = find_sec(hdr, sechdrs, secstrings, - "__mcount_loc"); /* Now do relocations. */ for (i = 1; i < hdr->e_shnum; i++) { @@ -2193,28 +2171,16 @@ static noinline struct module *load_module(void __user *umod, if (err < 0) goto cleanup; } -#ifdef CONFIG_MARKERS - mod->markers = (void *)sechdrs[markersindex].sh_addr; - mod->num_markers = - sechdrs[markersindex].sh_size / sizeof(*mod->markers); -#endif -#ifdef CONFIG_TRACEPOINTS - mod->tracepoints = (void *)sechdrs[tracepointsindex].sh_addr; - mod->num_tracepoints = - sechdrs[tracepointsindex].sh_size / sizeof(*mod->tracepoints); -#endif - /* Find duplicate symbols */ err = verify_export_symbols(mod); - if (err < 0) goto cleanup; /* Set up and sort exception table */ - mod->num_exentries = sechdrs[exindex].sh_size / sizeof(*mod->extable); - mod->extable = extable = (void *)sechdrs[exindex].sh_addr; - sort_extable(extable, extable + mod->num_exentries); + mod->extable = section_objs(hdr, sechdrs, secstrings, "__ex_table", + sizeof(*mod->extable), &mod->num_exentries); + sort_extable(mod->extable, mod->extable + mod->num_exentries); /* Finally, copy percpu area over. */ percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr, @@ -2223,11 +2189,17 @@ static noinline struct module *load_module(void __user *umod, add_kallsyms(mod, sechdrs, symindex, strindex, secstrings); if (!mod->taints) { + struct mod_debug *debug; + unsigned int num_debug; + #ifdef CONFIG_MARKERS marker_update_probe_range(mod->markers, mod->markers + mod->num_markers); #endif - dynamic_printk_setup(sechdrs, verboseindex); + debug = section_objs(hdr, sechdrs, secstrings, "__verbose", + sizeof(*debug), &num_debug); + dynamic_printk_setup(debug, num_debug); + #ifdef CONFIG_TRACEPOINTS tracepoint_update_probe_range(mod->tracepoints, mod->tracepoints + mod->num_tracepoints); @@ -2235,8 +2207,9 @@ static noinline struct module *load_module(void __user *umod, } /* sechdrs[0].sh_size is always zero */ - mseg = (void *)sechdrs[mcountindex].sh_addr; - ftrace_init_module(mseg, mseg + sechdrs[mcountindex].sh_size); + mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc", + sizeof(*mseg), &num_mcount); + ftrace_init_module(mseg, mseg + num_mcount); err = module_finalize(hdr, sechdrs, mod); if (err < 0) @@ -2261,7 +2234,7 @@ static noinline struct module *load_module(void __user *umod, set_fs(old_fs); mod->args = args; - if (obsparmindex) + if (section_addr(hdr, sechdrs, secstrings, "__obsparm")) printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", mod->name); @@ -2270,21 +2243,11 @@ static noinline struct module *load_module(void __user *umod, * strong_try_module_get() will fail. */ stop_machine(__link_module, mod, NULL); - /* Size of section 0 is 0, so this works well if no params */ - err = parse_args(mod->name, mod->args, - (struct kernel_param *) - sechdrs[setupindex].sh_addr, - sechdrs[setupindex].sh_size - / sizeof(struct kernel_param), - NULL); + err = parse_args(mod->name, mod->args, kp, num_kp, NULL); if (err < 0) goto unlink; - err = mod_sysfs_setup(mod, - (struct kernel_param *) - sechdrs[setupindex].sh_addr, - sechdrs[setupindex].sh_size - / sizeof(struct kernel_param)); + err = mod_sysfs_setup(mod, kp, num_kp); if (err < 0) goto unlink; add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); -- cgit v1.2.3 From d72b37513cdfbd3f53f3d485a8c403cc96d2c95f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 30 Aug 2008 10:09:00 +0200 Subject: Remove stop_machine during module load v2 Remove stop_machine during module load v2 module loading currently does a stop_machine on each module load to insert the module into the global module lists. Especially on larger systems this can be quite expensive. It does that to handle concurrent lock lessmodule list readers like kallsyms. I don't think stop_machine() is actually needed to insert something into a list though. There are no concurrent writers because the module mutex is taken. And the RCU list functions know how to insert a node into a list with the right memory ordering so that concurrent readers don't go off into the wood. So remove the stop_machine for the module list insert and just do a list_add_rcu() instead. Module removal will still do a stop_machine of course, it needs that for other reasons. v2: Revised readers based on Paul's comments. All readers that only rely on disabled preemption need to be changed to list_for_each_rcu(). Done that. The others are ok because they have the modules mutex. Also added a possible missing preempt disable for print_modules(). [cc Paul McKenney for review. It's not RCU, but quite similar.] Acked-by: Paul E. McKenney Signed-off-by: Rusty Russell --- kernel/module.c | 47 ++++++++++++++++++++++------------------------- 1 file changed, 22 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 3d256681ab6..c0f1826e2d9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -63,7 +64,7 @@ #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) /* List of modules, protected by module_mutex or preempt_disable - * (add/delete uses stop_machine). */ + * (delete uses stop_machine/add uses RCU list operations). */ static DEFINE_MUTEX(module_mutex); static LIST_HEAD(modules); @@ -241,7 +242,7 @@ static bool each_symbol(bool (*fn)(const struct symsearch *arr, if (each_symbol_in_section(arr, ARRAY_SIZE(arr), NULL, fn, data)) return true; - list_for_each_entry(mod, &modules, list) { + list_for_each_entry_rcu(mod, &modules, list) { struct symsearch arr[] = { { mod->syms, mod->syms + mod->num_syms, mod->crcs, NOT_GPL_ONLY, false }, @@ -1416,17 +1417,6 @@ static void mod_kobject_remove(struct module *mod) mod_sysfs_fini(mod); } -/* - * link the module with the whole machine is stopped with interrupts off - * - this defends against kallsyms not taking locks - */ -static int __link_module(void *_mod) -{ - struct module *mod = _mod; - list_add(&mod->list, &modules); - return 0; -} - /* * unlink the module with the whole machine is stopped with interrupts off * - this defends against kallsyms not taking locks @@ -2239,9 +2229,13 @@ static noinline struct module *load_module(void __user *umod, mod->name); /* Now sew it into the lists so we can get lockdep and oops - * info during argument parsing. Noone should access us, since - * strong_try_module_get() will fail. */ - stop_machine(__link_module, mod, NULL); + * info during argument parsing. Noone should access us, since + * strong_try_module_get() will fail. + * lockdep/oops can run asynchronous, so use the RCU list insertion + * function to insert in a way safe to concurrent readers. + * The mutex protects against concurrent writers. + */ + list_add_rcu(&mod->list, &modules); err = parse_args(mod->name, mod->args, kp, num_kp, NULL); if (err < 0) @@ -2436,7 +2430,7 @@ const char *module_address_lookup(unsigned long addr, const char *ret = NULL; preempt_disable(); - list_for_each_entry(mod, &modules, list) { + list_for_each_entry_rcu(mod, &modules, list) { if (within(addr, mod->module_init, mod->init_size) || within(addr, mod->module_core, mod->core_size)) { if (modname) @@ -2459,7 +2453,7 @@ int lookup_module_symbol_name(unsigned long addr, char *symname) struct module *mod; preempt_disable(); - list_for_each_entry(mod, &modules, list) { + list_for_each_entry_rcu(mod, &modules, list) { if (within(addr, mod->module_init, mod->init_size) || within(addr, mod->module_core, mod->core_size)) { const char *sym; @@ -2483,7 +2477,7 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, struct module *mod; preempt_disable(); - list_for_each_entry(mod, &modules, list) { + list_for_each_entry_rcu(mod, &modules, list) { if (within(addr, mod->module_init, mod->init_size) || within(addr, mod->module_core, mod->core_size)) { const char *sym; @@ -2510,7 +2504,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, struct module *mod; preempt_disable(); - list_for_each_entry(mod, &modules, list) { + list_for_each_entry_rcu(mod, &modules, list) { if (symnum < mod->num_symtab) { *value = mod->symtab[symnum].st_value; *type = mod->symtab[symnum].st_info; @@ -2553,7 +2547,7 @@ unsigned long module_kallsyms_lookup_name(const char *name) ret = mod_find_symname(mod, colon+1); *colon = ':'; } else { - list_for_each_entry(mod, &modules, list) + list_for_each_entry_rcu(mod, &modules, list) if ((ret = mod_find_symname(mod, name)) != 0) break; } @@ -2656,7 +2650,7 @@ const struct exception_table_entry *search_module_extables(unsigned long addr) struct module *mod; preempt_disable(); - list_for_each_entry(mod, &modules, list) { + list_for_each_entry_rcu(mod, &modules, list) { if (mod->num_exentries == 0) continue; @@ -2682,7 +2676,7 @@ int is_module_address(unsigned long addr) preempt_disable(); - list_for_each_entry(mod, &modules, list) { + list_for_each_entry_rcu(mod, &modules, list) { if (within(addr, mod->module_core, mod->core_size)) { preempt_enable(); return 1; @@ -2703,7 +2697,7 @@ struct module *__module_text_address(unsigned long addr) if (addr < module_addr_min || addr > module_addr_max) return NULL; - list_for_each_entry(mod, &modules, list) + list_for_each_entry_rcu(mod, &modules, list) if (within(addr, mod->module_init, mod->init_text_size) || within(addr, mod->module_core, mod->core_text_size)) return mod; @@ -2728,8 +2722,11 @@ void print_modules(void) char buf[8]; printk("Modules linked in:"); - list_for_each_entry(mod, &modules, list) + /* Most callers should already have preempt disabled, but make sure */ + preempt_disable(); + list_for_each_entry_rcu(mod, &modules, list) printk(" %s%s", mod->name, module_flags(mod, buf)); + preempt_enable(); if (last_unloaded_module[0]) printk(" [last unloaded: %s]", last_unloaded_module); printk("\n"); -- cgit v1.2.3 From 730b69d225259565c705f5f5a11cb1aba69568f1 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 22 Oct 2008 10:00:22 -0500 Subject: module: check kernel param length at compile time, not runtime The kparam code tries to handle over-length parameter prefixes at runtime. Not only would I bet this has never been tested, it's not clear that truncating names is a good idea either. So let's check at compile time. We need to move the #define to moduleparam.h to do this, though. Signed-off-by: Rusty Russell --- kernel/params.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index afc46a23eb6..aca07e1a050 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -585,17 +585,14 @@ static void __init param_sysfs_builtin(void) { struct kernel_param *kp, *kp_begin = NULL; unsigned int i, name_len, count = 0; - char modname[MODULE_NAME_LEN + 1] = ""; + char modname[MODULE_NAME_LEN] = ""; for (i=0; i < __stop___param - __start___param; i++) { char *dot; - size_t max_name_len; kp = &__start___param[i]; - max_name_len = - min_t(size_t, MODULE_NAME_LEN, strlen(kp->name)); - dot = memchr(kp->name, '.', max_name_len); + dot = strchr(kp->name, '.'); if (!dot) { DEBUGP("couldn't find period in first %d characters " "of %s\n", MODULE_NAME_LEN, kp->name); -- cgit v1.2.3 From 9b473de87209fa86eb421b23386693b461612f30 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 22 Oct 2008 10:00:22 -0500 Subject: param: Fix duplicate module prefixes Instead of insisting each new module_param sysfs entry is unique, handle the case where it already exists (for builtin modules). The current code assumes that all identical prefixes are together in the section: true for normal uses, but not necessarily so if someone overrides MODULE_PARAM_PREFIX. More importantly, it's not true with the new "core_param()" code which uses "kernel" as a prefix. This simplifies the caller for the builtin case, at a slight loss of efficiency (we do the lookup every time to see if the directory exists). Signed-off-by: Rusty Russell Cc: Greg Kroah-Hartman --- kernel/params.c | 261 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 141 insertions(+), 120 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index aca07e1a050..f27c992a462 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -373,6 +373,8 @@ int param_get_string(char *buffer, struct kernel_param *kp) } /* sysfs output in /sys/modules/XYZ/parameters/ */ +#define to_module_attr(n) container_of(n, struct module_attribute, attr); +#define to_module_kobject(n) container_of(n, struct module_kobject, kobj); extern struct kernel_param __start___param[], __stop___param[]; @@ -384,6 +386,7 @@ struct param_attribute struct module_param_attrs { + unsigned int num; struct attribute_group grp; struct param_attribute attrs[0]; }; @@ -434,69 +437,84 @@ static ssize_t param_attr_store(struct module_attribute *mattr, #ifdef CONFIG_SYSFS /* - * param_sysfs_setup - setup sysfs support for one module or KBUILD_MODNAME - * @mk: struct module_kobject (contains parent kobject) - * @kparam: array of struct kernel_param, the actual parameter definitions - * @num_params: number of entries in array - * @name_skip: offset where the parameter name start in kparam[].name. Needed for built-in "modules" + * add_sysfs_param - add a parameter to sysfs + * @mk: struct module_kobject + * @kparam: the actual parameter definition to add to sysfs + * @name: name of parameter * - * Create a kobject for a (per-module) group of parameters, and create files - * in sysfs. A pointer to the param_kobject is returned on success, - * NULL if there's no parameter to export, or other ERR_PTR(err). + * Create a kobject if for a (per-module) parameter if mp NULL, and + * create file in sysfs. Returns an error on out of memory. Always cleans up + * if there's an error. */ -static __modinit struct module_param_attrs * -param_sysfs_setup(struct module_kobject *mk, - struct kernel_param *kparam, - unsigned int num_params, - unsigned int name_skip) +static __modinit int add_sysfs_param(struct module_kobject *mk, + struct kernel_param *kp, + const char *name) { - struct module_param_attrs *mp; - unsigned int valid_attrs = 0; - unsigned int i, size[2]; - struct param_attribute *pattr; - struct attribute **gattr; - int err; - - for (i=0; iperm); + + if (!mk->mp) { + num = 0; + attrs = NULL; + } else { + num = mk->mp->num; + attrs = mk->mp->grp.attrs; } - if (!valid_attrs) - return NULL; - - size[0] = ALIGN(sizeof(*mp) + - valid_attrs * sizeof(mp->attrs[0]), - sizeof(mp->grp.attrs[0])); - size[1] = (valid_attrs + 1) * sizeof(mp->grp.attrs[0]); - - mp = kzalloc(size[0] + size[1], GFP_KERNEL); - if (!mp) - return ERR_PTR(-ENOMEM); + /* Enlarge. */ + new = krealloc(mk->mp, + sizeof(*mk->mp) + sizeof(mk->mp->attrs[0]) * (num+1), + GFP_KERNEL); + if (!new) { + kfree(mk->mp); + err = -ENOMEM; + goto fail; + } + attrs = krealloc(attrs, sizeof(new->grp.attrs[0])*(num+2), GFP_KERNEL); + if (!attrs) { + err = -ENOMEM; + goto fail_free_new; + } - mp->grp.name = "parameters"; - mp->grp.attrs = (void *)mp + size[0]; + /* Sysfs wants everything zeroed. */ + memset(new, 0, sizeof(*new)); + memset(&new->attrs[num], 0, sizeof(new->attrs[num])); + memset(&attrs[num], 0, sizeof(attrs[num])); + new->grp.name = "parameters"; + new->grp.attrs = attrs; + + /* Tack new one on the end. */ + new->attrs[num].param = kp; + new->attrs[num].mattr.show = param_attr_show; + new->attrs[num].mattr.store = param_attr_store; + new->attrs[num].mattr.attr.name = (char *)name; + new->attrs[num].mattr.attr.mode = kp->perm; + new->num = num+1; + + /* Fix up all the pointers, since krealloc can move us */ + for (num = 0; num < new->num; num++) + new->grp.attrs[num] = &new->attrs[num].mattr.attr; + new->grp.attrs[num] = NULL; + + mk->mp = new; + return 0; - pattr = &mp->attrs[0]; - gattr = &mp->grp.attrs[0]; - for (i = 0; i < num_params; i++) { - struct kernel_param *kp = &kparam[i]; - if (kp->perm) { - pattr->param = kp; - pattr->mattr.show = param_attr_show; - pattr->mattr.store = param_attr_store; - pattr->mattr.attr.name = (char *)&kp->name[name_skip]; - pattr->mattr.attr.mode = kp->perm; - *(gattr++) = &(pattr++)->mattr.attr; - } - } - *gattr = NULL; +fail_free_new: + kfree(new); +fail: + mk->mp = NULL; + return err; +} - if ((err = sysfs_create_group(&mk->kobj, &mp->grp))) { - kfree(mp); - return ERR_PTR(err); - } - return mp; +static void free_module_param_attrs(struct module_kobject *mk) +{ + kfree(mk->mp->grp.attrs); + kfree(mk->mp); + mk->mp = NULL; } #ifdef CONFIG_MODULES @@ -506,21 +524,33 @@ param_sysfs_setup(struct module_kobject *mk, * @kparam: module parameters (array) * @num_params: number of module parameters * - * Adds sysfs entries for module parameters, and creates a link from - * /sys/module/[mod->name]/parameters to /sys/parameters/[mod->name]/ + * Adds sysfs entries for module parameters under + * /sys/module/[mod->name]/parameters/ */ int module_param_sysfs_setup(struct module *mod, struct kernel_param *kparam, unsigned int num_params) { - struct module_param_attrs *mp; + int i, err; + bool params = false; + + for (i = 0; i < num_params; i++) { + if (kparam[i].perm == 0) + continue; + err = add_sysfs_param(&mod->mkobj, &kparam[i], kparam[i].name); + if (err) + return err; + params = true; + } - mp = param_sysfs_setup(&mod->mkobj, kparam, num_params, 0); - if (IS_ERR(mp)) - return PTR_ERR(mp); + if (!params) + return 0; - mod->param_attrs = mp; - return 0; + /* Create the param group. */ + err = sysfs_create_group(&mod->mkobj.kobj, &mod->mkobj.mp->grp); + if (err) + free_module_param_attrs(&mod->mkobj); + return err; } /* @@ -532,43 +562,55 @@ int module_param_sysfs_setup(struct module *mod, */ void module_param_sysfs_remove(struct module *mod) { - if (mod->param_attrs) { - sysfs_remove_group(&mod->mkobj.kobj, - &mod->param_attrs->grp); + if (mod->mkobj.mp) { + sysfs_remove_group(&mod->mkobj.kobj, &mod->mkobj.mp->grp); /* We are positive that no one is using any param * attrs at this point. Deallocate immediately. */ - kfree(mod->param_attrs); - mod->param_attrs = NULL; + free_module_param_attrs(&mod->mkobj); } } #endif -/* - * kernel_param_sysfs_setup - wrapper for built-in params support - */ -static void __init kernel_param_sysfs_setup(const char *name, - struct kernel_param *kparam, - unsigned int num_params, - unsigned int name_skip) +static void __init kernel_add_sysfs_param(const char *name, + struct kernel_param *kparam, + unsigned int name_skip) { struct module_kobject *mk; - int ret; + struct kobject *kobj; + int err; - mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); - BUG_ON(!mk); - - mk->mod = THIS_MODULE; - mk->kobj.kset = module_kset; - ret = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, "%s", name); - if (ret) { - kobject_put(&mk->kobj); - printk(KERN_ERR "Module '%s' failed to be added to sysfs, " - "error number %d\n", name, ret); - printk(KERN_ERR "The system will be unstable now.\n"); - return; + kobj = kset_find_obj(module_kset, name); + if (kobj) { + /* We already have one. Remove params so we can add more. */ + mk = to_module_kobject(kobj); + /* We need to remove it before adding parameters. */ + sysfs_remove_group(&mk->kobj, &mk->mp->grp); + } else { + mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); + BUG_ON(!mk); + + mk->mod = THIS_MODULE; + mk->kobj.kset = module_kset; + err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, + "%s", name); + if (err) { + kobject_put(&mk->kobj); + printk(KERN_ERR "Module '%s' failed add to sysfs, " + "error number %d\n", name, err); + printk(KERN_ERR "The system will be unstable now.\n"); + return; + } + /* So that exit path is even. */ + kobject_get(&mk->kobj); } - param_sysfs_setup(mk, kparam, num_params, name_skip); + + /* These should not fail at boot. */ + err = add_sysfs_param(mk, kparam, kparam->name + name_skip); + BUG_ON(err); + err = sysfs_create_group(&mk->kobj, &mk->mp->grp); + BUG_ON(err); kobject_uevent(&mk->kobj, KOBJ_ADD); + kobject_put(&mk->kobj); } /* @@ -579,18 +621,19 @@ static void __init kernel_param_sysfs_setup(const char *name, * The "module" name (KBUILD_MODNAME) is stored before a dot, the * "parameter" name is stored behind a dot in kernel_param->name. So, * extract the "module" name for all built-in kernel_param-eters, - * and for all who have the same, call kernel_param_sysfs_setup. + * and for all who have the same, call kernel_add_sysfs_param. */ static void __init param_sysfs_builtin(void) { - struct kernel_param *kp, *kp_begin = NULL; - unsigned int i, name_len, count = 0; - char modname[MODULE_NAME_LEN] = ""; + struct kernel_param *kp; + unsigned int name_len; + char modname[MODULE_NAME_LEN]; - for (i=0; i < __stop___param - __start___param; i++) { + for (kp = __start___param; kp < __stop___param; kp++) { char *dot; - kp = &__start___param[i]; + if (kp->perm == 0) + continue; dot = strchr(kp->name, '.'); if (!dot) { @@ -599,37 +642,15 @@ static void __init param_sysfs_builtin(void) continue; } name_len = dot - kp->name; - - /* new kbuild_modname? */ - if (strlen(modname) != name_len - || strncmp(modname, kp->name, name_len) != 0) { - /* add a new kobject for previous kernel_params. */ - if (count) - kernel_param_sysfs_setup(modname, - kp_begin, - count, - strlen(modname)+1); - - strncpy(modname, kp->name, name_len); - modname[name_len] = '\0'; - count = 0; - kp_begin = kp; - } - count++; + strncpy(modname, kp->name, name_len); + modname[name_len] = '\0'; + kernel_add_sysfs_param(modname, kp, name_len+1); } - - /* last kernel_params need to be registered as well */ - if (count) - kernel_param_sysfs_setup(modname, kp_begin, count, - strlen(modname)+1); } /* module-related sysfs stuff */ -#define to_module_attr(n) container_of(n, struct module_attribute, attr); -#define to_module_kobject(n) container_of(n, struct module_kobject, kobj); - static ssize_t module_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) -- cgit v1.2.3 From 67e67ceaac5bf55dbdceb704ff2d763d438b5373 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 22 Oct 2008 10:00:23 -0500 Subject: core_param() for genuinely core kernel parameters There are a lot of one-liner uses of __setup() in the kernel: they're cumbersome and not queryable (definitely not settable) via /sys. Yet it's ugly to simplify them to module_param(), because by default that inserts a prefix of the module name (usually filename). So, introduce a "core_param". The parameter gets no prefix, but appears in /sys/module/kernel/parameters/ (if non-zero perms arg). I thought about using the name "core", but that's more common than "kernel". And if you create a module called "kernel", you will die a horrible death. Signed-off-by: Rusty Russell --- kernel/params.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index f27c992a462..b077f1b045d 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -637,14 +637,14 @@ static void __init param_sysfs_builtin(void) dot = strchr(kp->name, '.'); if (!dot) { - DEBUGP("couldn't find period in first %d characters " - "of %s\n", MODULE_NAME_LEN, kp->name); - continue; + /* This happens for core_param() */ + strcpy(modname, "kernel"); + name_len = 0; + } else { + name_len = dot - kp->name + 1; + strlcpy(modname, kp->name, name_len); } - name_len = dot - kp->name; - strncpy(modname, kp->name, name_len); - modname[name_len] = '\0'; - kernel_add_sysfs_param(modname, kp, name_len+1); + kernel_add_sysfs_param(modname, kp, name_len); } } -- cgit v1.2.3 From f44dd164f37336f7e17fabf5740698fd10635172 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 22 Oct 2008 10:00:24 -0500 Subject: Make panic= and panic_on_oops into core_params This allows them to be examined and set after boot, plus means they actually give errors if they are misused (eg. panic=yes). Signed-off-by: Rusty Russell --- kernel/panic.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index bda561ef3cd..6513aac8e99 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -34,13 +34,6 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list); EXPORT_SYMBOL(panic_notifier_list); -static int __init panic_setup(char *str) -{ - panic_timeout = simple_strtoul(str, NULL, 0); - return 1; -} -__setup("panic=", panic_setup); - static long no_blink(long time) { return 0; @@ -218,13 +211,6 @@ void add_taint(unsigned flag) } EXPORT_SYMBOL(add_taint); -static int __init pause_on_oops_setup(char *str) -{ - pause_on_oops = simple_strtoul(str, NULL, 0); - return 1; -} -__setup("pause_on_oops=", pause_on_oops_setup); - static void spin_msec(int msecs) { int i; @@ -384,3 +370,6 @@ void __stack_chk_fail(void) } EXPORT_SYMBOL(__stack_chk_fail); #endif + +core_param(panic, panic_timeout, int, 0644); +core_param(pause_on_oops, pause_on_oops, int, 0644); -- cgit v1.2.3 From 0d557dc97f4bb501f086a03d0f00b99a7855d794 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 13 Oct 2008 23:50:09 +0200 Subject: workqueue: introduce create_rt_workqueue create_rt_workqueue will create a real time prioritized workqueue. This is needed for the conversion of stop_machine to a workqueue based implementation. This patch adds yet another parameter to __create_workqueue_key to tell it that we want an rt workqueue. However it looks like we rather should have something like "int type" instead of singlethread, freezable and rt. Signed-off-by: Heiko Carstens Signed-off-by: Rusty Russell Cc: Ingo Molnar --- kernel/workqueue.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 714afad4653..f928f2a87b9 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -62,6 +62,7 @@ struct workqueue_struct { const char *name; int singlethread; int freezeable; /* Freeze threads during suspend */ + int rt; #ifdef CONFIG_LOCKDEP struct lockdep_map lockdep_map; #endif @@ -766,6 +767,7 @@ init_cpu_workqueue(struct workqueue_struct *wq, int cpu) static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) { + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; struct workqueue_struct *wq = cwq->wq; const char *fmt = is_single_threaded(wq) ? "%s" : "%s/%d"; struct task_struct *p; @@ -781,7 +783,8 @@ static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) */ if (IS_ERR(p)) return PTR_ERR(p); - + if (cwq->wq->rt) + sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); cwq->thread = p; return 0; @@ -801,6 +804,7 @@ static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) struct workqueue_struct *__create_workqueue_key(const char *name, int singlethread, int freezeable, + int rt, struct lock_class_key *key, const char *lock_name) { @@ -822,6 +826,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name, lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); wq->singlethread = singlethread; wq->freezeable = freezeable; + wq->rt = rt; INIT_LIST_HEAD(&wq->list); if (singlethread) { -- cgit v1.2.3 From c9583e55fa2b08a230c549bd1e3c0bde6c50d9cc Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 13 Oct 2008 23:50:10 +0200 Subject: stop_machine: use workqueues instead of kernel threads Convert stop_machine to a workqueue based approach. Instead of using kernel threads for stop_machine we now use a an rt workqueue to synchronize all cpus. This has the advantage that all needed per cpu threads are already created when stop_machine gets called. And therefore a call to stop_machine won't fail anymore. This is needed for s390 which needs a mechanism to synchronize all cpus without allocating any memory. As Rusty pointed out free_module() needs a non-failing stop_machine interface as well. As a side effect the stop_machine code gets simplified. Signed-off-by: Heiko Carstens Signed-off-by: Rusty Russell --- kernel/stop_machine.c | 111 +++++++++++++++++++------------------------------- 1 file changed, 41 insertions(+), 70 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index af3c7cea258..0e688c6a1a6 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -37,9 +37,13 @@ struct stop_machine_data { /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ static unsigned int num_threads; static atomic_t thread_ack; -static struct completion finished; static DEFINE_MUTEX(lock); +static struct workqueue_struct *stop_machine_wq; +static struct stop_machine_data active, idle; +static const cpumask_t *active_cpus; +static void *stop_machine_work; + static void set_state(enum stopmachine_state newstate) { /* Reset ack counter. */ @@ -51,21 +55,25 @@ static void set_state(enum stopmachine_state newstate) /* Last one to ack a state moves to the next state. */ static void ack_state(void) { - if (atomic_dec_and_test(&thread_ack)) { - /* If we're the last one to ack the EXIT, we're finished. */ - if (state == STOPMACHINE_EXIT) - complete(&finished); - else - set_state(state + 1); - } + if (atomic_dec_and_test(&thread_ack)) + set_state(state + 1); } -/* This is the actual thread which stops the CPU. It exits by itself rather - * than waiting for kthread_stop(), because it's easier for hotplug CPU. */ -static int stop_cpu(struct stop_machine_data *smdata) +/* This is the actual function which stops the CPU. It runs + * in the context of a dedicated stopmachine workqueue. */ +static void stop_cpu(struct work_struct *unused) { enum stopmachine_state curstate = STOPMACHINE_NONE; - + struct stop_machine_data *smdata = &idle; + int cpu = smp_processor_id(); + + if (!active_cpus) { + if (cpu == first_cpu(cpu_online_map)) + smdata = &active; + } else { + if (cpu_isset(cpu, *active_cpus)) + smdata = &active; + } /* Simple state machine */ do { /* Chill out and ensure we re-read stopmachine_state. */ @@ -90,7 +98,6 @@ static int stop_cpu(struct stop_machine_data *smdata) } while (curstate != STOPMACHINE_EXIT); local_irq_enable(); - do_exit(0); } /* Callback for CPUs which aren't supposed to do anything. */ @@ -101,78 +108,34 @@ static int chill(void *unused) int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) { - int i, err; - struct stop_machine_data active, idle; - struct task_struct **threads; + struct work_struct *sm_work; + int i; + /* Set up initial state. */ + mutex_lock(&lock); + num_threads = num_online_cpus(); + active_cpus = cpus; active.fn = fn; active.data = data; active.fnret = 0; idle.fn = chill; idle.data = NULL; - /* This could be too big for stack on large machines. */ - threads = kcalloc(NR_CPUS, sizeof(threads[0]), GFP_KERNEL); - if (!threads) - return -ENOMEM; - - /* Set up initial state. */ - mutex_lock(&lock); - init_completion(&finished); - num_threads = num_online_cpus(); set_state(STOPMACHINE_PREPARE); - for_each_online_cpu(i) { - struct stop_machine_data *smdata = &idle; - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; - - if (!cpus) { - if (i == first_cpu(cpu_online_map)) - smdata = &active; - } else { - if (cpu_isset(i, *cpus)) - smdata = &active; - } - - threads[i] = kthread_create((void *)stop_cpu, smdata, "kstop%u", - i); - if (IS_ERR(threads[i])) { - err = PTR_ERR(threads[i]); - threads[i] = NULL; - goto kill_threads; - } - - /* Place it onto correct cpu. */ - kthread_bind(threads[i], i); - - /* Make it highest prio. */ - if (sched_setscheduler_nocheck(threads[i], SCHED_FIFO, ¶m)) - BUG(); - } - - /* We've created all the threads. Wake them all: hold this CPU so one + /* Schedule the stop_cpu work on all cpus: hold this CPU so one * doesn't hit this CPU until we're ready. */ get_cpu(); - for_each_online_cpu(i) - wake_up_process(threads[i]); - + for_each_online_cpu(i) { + sm_work = percpu_ptr(stop_machine_work, i); + INIT_WORK(sm_work, stop_cpu); + queue_work_on(i, stop_machine_wq, sm_work); + } /* This will release the thread on our CPU. */ put_cpu(); - wait_for_completion(&finished); + flush_workqueue(stop_machine_wq); mutex_unlock(&lock); - - kfree(threads); - return active.fnret; - -kill_threads: - for_each_online_cpu(i) - if (threads[i]) - kthread_stop(threads[i]); - mutex_unlock(&lock); - - kfree(threads); - return err; } int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) @@ -187,3 +150,11 @@ int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) return ret; } EXPORT_SYMBOL_GPL(stop_machine); + +static int __init stop_machine_init(void) +{ + stop_machine_wq = create_rt_workqueue("kstop"); + stop_machine_work = alloc_percpu(struct work_struct); + return 0; +} +early_initcall(stop_machine_init); -- cgit v1.2.3 From 8163bcac779f62c6bf847caed9bce905db0693fb Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 22 Oct 2008 10:00:26 -0500 Subject: stop_machine: fix error code handling on multiple cpus Using |= for updating a value which might be updated on several cpus concurrently will not always work since we need to make sure that the update happens atomically. To fix this just use a write if the called function returns an error code on a cpu. We end up writing the error code of an arbitrary cpu if multiple ones fail but that should be sufficient. Signed-off-by: Heiko Carstens Signed-off-by: Rusty Russell --- kernel/stop_machine.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 0e688c6a1a6..8aff79d90dd 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -66,6 +66,7 @@ static void stop_cpu(struct work_struct *unused) enum stopmachine_state curstate = STOPMACHINE_NONE; struct stop_machine_data *smdata = &idle; int cpu = smp_processor_id(); + int err; if (!active_cpus) { if (cpu == first_cpu(cpu_online_map)) @@ -86,9 +87,11 @@ static void stop_cpu(struct work_struct *unused) hard_irq_disable(); break; case STOPMACHINE_RUN: - /* |= allows error detection if functions on - * multiple CPUs. */ - smdata->fnret |= smdata->fn(smdata->data); + /* On multiple CPUs only a single error code + * is needed to tell that something failed. */ + err = smdata->fn(smdata->data); + if (err) + smdata->fnret = err; break; default: break; -- cgit v1.2.3 From 98bc993f99e51467057ef699e47fec020f24d233 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 2 Aug 2008 01:06:21 -0400 Subject: [PATCH] get rid of nameidata in audit_tree Signed-off-by: Al Viro --- kernel/audit_tree.c | 48 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index f7921a2ecf1..8ba0e0d934f 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -532,7 +532,7 @@ void audit_trim_trees(void) list_add(&cursor, &tree_list); while (cursor.next != &tree_list) { struct audit_tree *tree; - struct nameidata nd; + struct path path; struct vfsmount *root_mnt; struct node *node; struct list_head list; @@ -544,12 +544,12 @@ void audit_trim_trees(void) list_add(&cursor, &tree->list); mutex_unlock(&audit_filter_mutex); - err = path_lookup(tree->pathname, 0, &nd); + err = kern_path(tree->pathname, 0, &path); if (err) goto skip_it; - root_mnt = collect_mounts(nd.path.mnt, nd.path.dentry); - path_put(&nd.path); + root_mnt = collect_mounts(path.mnt, path.dentry); + path_put(&path); if (!root_mnt) goto skip_it; @@ -580,19 +580,19 @@ skip_it: } static int is_under(struct vfsmount *mnt, struct dentry *dentry, - struct nameidata *nd) + struct path *path) { - if (mnt != nd->path.mnt) { + if (mnt != path->mnt) { for (;;) { if (mnt->mnt_parent == mnt) return 0; - if (mnt->mnt_parent == nd->path.mnt) + if (mnt->mnt_parent == path->mnt) break; mnt = mnt->mnt_parent; } dentry = mnt->mnt_mountpoint; } - return is_subdir(dentry, nd->path.dentry); + return is_subdir(dentry, path->dentry); } int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op) @@ -618,7 +618,7 @@ void audit_put_tree(struct audit_tree *tree) int audit_add_tree_rule(struct audit_krule *rule) { struct audit_tree *seed = rule->tree, *tree; - struct nameidata nd; + struct path path; struct vfsmount *mnt, *p; struct list_head list; int err; @@ -637,11 +637,11 @@ int audit_add_tree_rule(struct audit_krule *rule) /* do not set rule->tree yet */ mutex_unlock(&audit_filter_mutex); - err = path_lookup(tree->pathname, 0, &nd); + err = kern_path(tree->pathname, 0, &path); if (err) goto Err; - mnt = collect_mounts(nd.path.mnt, nd.path.dentry); - path_put(&nd.path); + mnt = collect_mounts(path.mnt, path.dentry); + path_put(&path); if (!mnt) { err = -ENOMEM; goto Err; @@ -690,29 +690,29 @@ int audit_tag_tree(char *old, char *new) { struct list_head cursor, barrier; int failed = 0; - struct nameidata nd; + struct path path; struct vfsmount *tagged; struct list_head list; struct vfsmount *mnt; struct dentry *dentry; int err; - err = path_lookup(new, 0, &nd); + err = kern_path(new, 0, &path); if (err) return err; - tagged = collect_mounts(nd.path.mnt, nd.path.dentry); - path_put(&nd.path); + tagged = collect_mounts(path.mnt, path.dentry); + path_put(&path); if (!tagged) return -ENOMEM; - err = path_lookup(old, 0, &nd); + err = kern_path(old, 0, &path); if (err) { drop_collected_mounts(tagged); return err; } - mnt = mntget(nd.path.mnt); - dentry = dget(nd.path.dentry); - path_put(&nd.path); + mnt = mntget(path.mnt); + dentry = dget(path.dentry); + path_put(&path); if (dentry == tagged->mnt_root && dentry == mnt->mnt_root) follow_up(&mnt, &dentry); @@ -733,7 +733,7 @@ int audit_tag_tree(char *old, char *new) list_add(&cursor, &tree->list); mutex_unlock(&audit_filter_mutex); - err = path_lookup(tree->pathname, 0, &nd); + err = kern_path(tree->pathname, 0, &path); if (err) { put_tree(tree); mutex_lock(&audit_filter_mutex); @@ -741,15 +741,15 @@ int audit_tag_tree(char *old, char *new) } spin_lock(&vfsmount_lock); - if (!is_under(mnt, dentry, &nd)) { + if (!is_under(mnt, dentry, &path)) { spin_unlock(&vfsmount_lock); - path_put(&nd.path); + path_put(&path); put_tree(tree); mutex_lock(&audit_filter_mutex); continue; } spin_unlock(&vfsmount_lock); - path_put(&nd.path); + path_put(&path); list_for_each_entry(p, &list, mnt_list) { failed = tag_chunk(p->mnt_root->d_inode, tree); -- cgit v1.2.3 From 6e62775ece1c83a84d86df44cfd8ea3e6c96ce23 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 4 Oct 2008 14:28:09 +0400 Subject: proc: move /proc/execdomains to kernel/exec_domain.c Signed-off-by: Alexey Dobriyan --- kernel/exec_domain.c | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index 0d407e88673..0511716e942 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -12,7 +12,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -173,20 +175,39 @@ __set_personality(u_long personality) return 0; } -int -get_exec_domain_list(char *page) +#ifdef CONFIG_PROC_FS +static int execdomains_proc_show(struct seq_file *m, void *v) { struct exec_domain *ep; - int len = 0; read_lock(&exec_domains_lock); - for (ep = exec_domains; ep && len < PAGE_SIZE - 80; ep = ep->next) - len += sprintf(page + len, "%d-%d\t%-16s\t[%s]\n", + for (ep = exec_domains; ep; ep = ep->next) + seq_printf(m, "%d-%d\t%-16s\t[%s]\n", ep->pers_low, ep->pers_high, ep->name, module_name(ep->module)); read_unlock(&exec_domains_lock); - return (len); + return 0; +} + +static int execdomains_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, execdomains_proc_show, NULL); +} + +static const struct file_operations execdomains_proc_fops = { + .open = execdomains_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_execdomains_init(void) +{ + proc_create("execdomains", 0, NULL, &execdomains_proc_fops); + return 0; } +module_init(proc_execdomains_init); +#endif asmlinkage long sys_personality(u_long personality) -- cgit v1.2.3 From 3b5d5c6b0ccba733a313f8752ebc3f8015628ba3 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 6 Oct 2008 13:19:27 +0400 Subject: proc: move /proc/modules boilerplate to kernel/module.c Signed-off-by: Alexey Dobriyan --- kernel/module.c | 59 +++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 0d8d21ee792..6fded84d702 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -20,11 +20,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -2599,23 +2601,6 @@ unsigned long module_kallsyms_lookup_name(const char *name) } #endif /* CONFIG_KALLSYMS */ -/* Called by the /proc file system to return a list of modules. */ -static void *m_start(struct seq_file *m, loff_t *pos) -{ - mutex_lock(&module_mutex); - return seq_list_start(&modules, *pos); -} - -static void *m_next(struct seq_file *m, void *p, loff_t *pos) -{ - return seq_list_next(p, &modules, pos); -} - -static void m_stop(struct seq_file *m, void *p) -{ - mutex_unlock(&module_mutex); -} - static char *module_flags(struct module *mod, char *buf) { int bx = 0; @@ -2649,6 +2634,24 @@ static char *module_flags(struct module *mod, char *buf) return buf; } +#ifdef CONFIG_PROC_FS +/* Called by the /proc file system to return a list of modules. */ +static void *m_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&module_mutex); + return seq_list_start(&modules, *pos); +} + +static void *m_next(struct seq_file *m, void *p, loff_t *pos) +{ + return seq_list_next(p, &modules, pos); +} + +static void m_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&module_mutex); +} + static int m_show(struct seq_file *m, void *p) { struct module *mod = list_entry(p, struct module, list); @@ -2679,13 +2682,33 @@ static int m_show(struct seq_file *m, void *p) Where refcount is a number or -, and deps is a comma-separated list of depends or -. */ -const struct seq_operations modules_op = { +static const struct seq_operations modules_op = { .start = m_start, .next = m_next, .stop = m_stop, .show = m_show }; +static int modules_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &modules_op); +} + +static const struct file_operations proc_modules_operations = { + .open = modules_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init proc_modules_init(void) +{ + proc_create("modules", 0, NULL, &proc_modules_operations); + return 0; +} +module_init(proc_modules_init); +#endif + /* Given an address, look for it in the module exception tables. */ const struct exception_table_entry *search_module_extables(unsigned long addr) { -- cgit v1.2.3 From b5aadf7f14c1acc94956aa257e018e9de3881f41 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 6 Oct 2008 13:23:43 +0400 Subject: proc: move /proc/schedstat boilerplate to kernel/sched_stats.h Signed-off-by: Alexey Dobriyan --- kernel/sched.c | 1 + kernel/sched_stats.h | 9 ++++++++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index d906f72b42d..5a70189d505 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index b8c156979cf..3d14ce27390 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -90,13 +90,20 @@ static int schedstat_open(struct inode *inode, struct file *file) return res; } -const struct file_operations proc_schedstat_operations = { +static const struct file_operations proc_schedstat_operations = { .open = schedstat_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; +static int __init proc_schedstat_init(void) +{ + proc_create("schedstat", 0, NULL, &proc_schedstat_operations); + return 0; +} +module_init(proc_schedstat_init); + /* * Expects runqueue lock to be held for atomicity of update */ -- cgit v1.2.3 From d2441183dc222d12961ff2201f5086c846505d93 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 23 Oct 2008 12:07:17 -0700 Subject: Fix compile warning in kernel/params.c Move free_module_param_attrs() into the CONFIG_MODULES section, since it's only used inside there. Thus avoiding the warning kernel/params.c:514: warning: 'free_module_param_attrs' defined but not used Signed-off-by: Linus Torvalds --- kernel/params.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index b077f1b045d..a1e3025b19a 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -510,6 +510,7 @@ fail: return err; } +#ifdef CONFIG_MODULES static void free_module_param_attrs(struct module_kobject *mk) { kfree(mk->mp->grp.attrs); @@ -517,7 +518,6 @@ static void free_module_param_attrs(struct module_kobject *mk) mk->mp = NULL; } -#ifdef CONFIG_MODULES /* * module_param_sysfs_setup - setup sysfs support for one module * @mod: module -- cgit v1.2.3 From 4403b406d4369a275d483ece6ddee0088cc0d592 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 25 Oct 2008 19:53:38 -0700 Subject: Revert "Call init_workqueues before pre smp initcalls." This reverts commit a802dd0eb5fc97a50cf1abb1f788a8f6cc5db635 by moving the call to init_workqueues() back where it belongs - after SMP has been initialized. It also moves stop_machine_init() - which needs workqueues - to a later phase using a core_initcall() instead of early_initcall(). That should satisfy all ordering requirements, and was apparently the reason why init_workqueues() was moved to be too early. Cc: Heiko Carstens Cc: Rusty Russell Signed-off-by: Linus Torvalds --- kernel/stop_machine.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 8aff79d90dd..9bc4c00872c 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -160,4 +160,4 @@ static int __init stop_machine_init(void) stop_machine_work = alloc_percpu(struct work_struct); return 0; } -early_initcall(stop_machine_init); +core_initcall(stop_machine_init); -- cgit v1.2.3 From 2077776641b6ffb0049f13018d2e162340ec51c7 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Tue, 21 Oct 2008 16:11:20 +1100 Subject: cgroup: remove unused variable /scratch/sfr/next/kernel/cgroup.c: In function 'cgroup_tasks_start': /scratch/sfr/next/kernel/cgroup.c:2107: warning: unused variable 'i' Introduced in commit cc31edceee04a7b87f2be48f9489ebb72d264844 "cgroups: convert tasks file to use a seq_file with shared pid array". Signed-off-by: Stephen Rothwell Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 046c1609606..35eebd5510c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2104,7 +2104,7 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos) down_read(&cgrp->pids_mutex); if (pid) { int end = cgrp->pids_length; - int i; + while (index < end) { int mid = (index + end) / 2; if (cgrp->tasks_pids[mid] == pid) { -- cgit v1.2.3