diff options
Diffstat (limited to 'kernel/sched.c')
-rw-r--r-- | kernel/sched.c | 217 |
1 files changed, 177 insertions, 40 deletions
diff --git a/kernel/sched.c b/kernel/sched.c index 8e2558c2ba6..7b389c74f8f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -331,6 +331,13 @@ static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; */ static DEFINE_SPINLOCK(task_group_lock); +#ifdef CONFIG_SMP +static int root_task_group_empty(void) +{ + return list_empty(&root_task_group.children); +} +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_USER_SCHED # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) @@ -391,6 +398,13 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) #else +#ifdef CONFIG_SMP +static int root_task_group_empty(void) +{ + return 1; +} +#endif + static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } static inline struct task_group *task_group(struct task_struct *p) { @@ -467,11 +481,17 @@ struct rt_rq { struct rt_prio_array active; unsigned long rt_nr_running; #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED - int highest_prio; /* highest queued rt task prio */ + struct { + int curr; /* highest queued rt task prio */ +#ifdef CONFIG_SMP + int next; /* next highest */ +#endif + } highest_prio; #endif #ifdef CONFIG_SMP unsigned long rt_nr_migratory; int overloaded; + struct plist_head pushable_tasks; #endif int rt_throttled; u64 rt_time; @@ -549,7 +569,6 @@ struct rq { unsigned long nr_running; #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; - unsigned char idle_at_tick; #ifdef CONFIG_NO_HZ unsigned long last_tick_seen; unsigned char in_nohz_recently; @@ -590,6 +609,7 @@ struct rq { struct root_domain *rd; struct sched_domain *sd; + unsigned char idle_at_tick; /* For active balancing */ int active_balance; int push_cpu; @@ -618,9 +638,6 @@ struct rq { /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ /* sys_sched_yield() stats */ - unsigned int yld_exp_empty; - unsigned int yld_act_empty; - unsigned int yld_both_empty; unsigned int yld_count; /* schedule() stats */ @@ -1183,10 +1200,10 @@ static void resched_task(struct task_struct *p) assert_spin_locked(&task_rq(p)->lock); - if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) + if (test_tsk_need_resched(p)) return; - set_tsk_thread_flag(p, TIF_NEED_RESCHED); + set_tsk_need_resched(p); cpu = task_cpu(p); if (cpu == smp_processor_id()) @@ -1242,7 +1259,7 @@ void wake_up_idle_cpu(int cpu) * lockless. The worst case is that the other CPU runs the * idle task through an additional NOOP schedule() */ - set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED); + set_tsk_need_resched(rq->idle); /* NEED_RESCHED must be visible before we test polling */ smp_mb(); @@ -1610,21 +1627,42 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) #endif +#ifdef CONFIG_PREEMPT + /* - * double_lock_balance - lock the busiest runqueue, this_rq is locked already. + * fair double_lock_balance: Safely acquires both rq->locks in a fair + * way at the expense of forcing extra atomic operations in all + * invocations. This assures that the double_lock is acquired using the + * same underlying policy as the spinlock_t on this architecture, which + * reduces latency compared to the unfair variant below. However, it + * also adds more overhead and therefore may reduce throughput. */ -static int double_lock_balance(struct rq *this_rq, struct rq *busiest) +static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) + __releases(this_rq->lock) + __acquires(busiest->lock) + __acquires(this_rq->lock) +{ + spin_unlock(&this_rq->lock); + double_rq_lock(this_rq, busiest); + + return 1; +} + +#else +/* + * Unfair double_lock_balance: Optimizes throughput at the expense of + * latency by eliminating extra atomic operations when the locks are + * already in proper order on entry. This favors lower cpu-ids and will + * grant the double lock to lower cpus over higher ids under contention, + * regardless of entry order into the function. + */ +static int _double_lock_balance(struct rq *this_rq, struct rq *busiest) __releases(this_rq->lock) __acquires(busiest->lock) __acquires(this_rq->lock) { int ret = 0; - if (unlikely(!irqs_disabled())) { - /* printk() doesn't work good under rq->lock */ - spin_unlock(&this_rq->lock); - BUG_ON(1); - } if (unlikely(!spin_trylock(&busiest->lock))) { if (busiest < this_rq) { spin_unlock(&this_rq->lock); @@ -1637,6 +1675,22 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest) return ret; } +#endif /* CONFIG_PREEMPT */ + +/* + * double_lock_balance - lock the busiest runqueue, this_rq is locked already. + */ +static int double_lock_balance(struct rq *this_rq, struct rq *busiest) +{ + if (unlikely(!irqs_disabled())) { + /* printk() doesn't work good under rq->lock */ + spin_unlock(&this_rq->lock); + BUG_ON(1); + } + + return _double_lock_balance(this_rq, busiest); +} + static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) __releases(busiest->lock) { @@ -1705,6 +1759,9 @@ static void update_avg(u64 *avg, u64 sample) static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) { + if (wakeup) + p->se.start_runtime = p->se.sum_exec_runtime; + sched_info_queued(p); p->sched_class->enqueue_task(rq, p, wakeup); p->se.on_rq = 1; @@ -1712,10 +1769,15 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) { - if (sleep && p->se.last_wakeup) { - update_avg(&p->se.avg_overlap, - p->se.sum_exec_runtime - p->se.last_wakeup); - p->se.last_wakeup = 0; + if (sleep) { + if (p->se.last_wakeup) { + update_avg(&p->se.avg_overlap, + p->se.sum_exec_runtime - p->se.last_wakeup); + p->se.last_wakeup = 0; + } else { + update_avg(&p->se.avg_wakeup, + sysctl_sched_wakeup_granularity); + } } sched_info_dequeued(p); @@ -2017,7 +2079,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * it must be off the runqueue _entirely_, and not * preempted! * - * So if it wa still runnable (but just not actively + * So if it was still runnable (but just not actively * running right now), it's preempted, and we should * yield - it could be a while. */ @@ -2267,7 +2329,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) sync = 0; #ifdef CONFIG_SMP - if (sched_feat(LB_WAKEUP_UPDATE)) { + if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) { struct sched_domain *sd; this_cpu = raw_smp_processor_id(); @@ -2345,6 +2407,22 @@ out_activate: activate_task(rq, p, 1); success = 1; + /* + * Only attribute actual wakeups done by this task. + */ + if (!in_interrupt()) { + struct sched_entity *se = ¤t->se; + u64 sample = se->sum_exec_runtime; + + if (se->last_wakeup) + sample -= se->last_wakeup; + else + sample -= se->start_runtime; + update_avg(&se->avg_wakeup, sample); + + se->last_wakeup = se->sum_exec_runtime; + } + out_running: trace_sched_wakeup(rq, p, success); check_preempt_curr(rq, p, sync); @@ -2355,8 +2433,6 @@ out_running: p->sched_class->task_wake_up(rq, p); #endif out: - current->se.last_wakeup = current->se.sum_exec_runtime; - task_rq_unlock(rq, &flags); return success; @@ -2386,6 +2462,8 @@ static void __sched_fork(struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.last_wakeup = 0; p->se.avg_overlap = 0; + p->se.start_runtime = 0; + p->se.avg_wakeup = sysctl_sched_wakeup_granularity; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; @@ -2448,6 +2526,8 @@ void sched_fork(struct task_struct *p, int clone_flags) /* Want to start with kernel preemption disabled. */ task_thread_info(p)->preempt_count = 1; #endif + plist_node_init(&p->pushable_tasks, MAX_PRIO); + put_cpu(); } @@ -2491,7 +2571,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) #ifdef CONFIG_PREEMPT_NOTIFIERS /** - * preempt_notifier_register - tell me when current is being being preempted & rescheduled + * preempt_notifier_register - tell me when current is being preempted & rescheduled * @notifier: notifier struct to register */ void preempt_notifier_register(struct preempt_notifier *notifier) @@ -2588,6 +2668,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) { struct mm_struct *mm = rq->prev_mm; long prev_state; +#ifdef CONFIG_SMP + int post_schedule = 0; + + if (current->sched_class->needs_post_schedule) + post_schedule = current->sched_class->needs_post_schedule(rq); +#endif rq->prev_mm = NULL; @@ -2606,7 +2692,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) finish_arch_switch(prev); finish_lock_switch(rq, prev); #ifdef CONFIG_SMP - if (current->sched_class->post_schedule) + if (post_schedule) current->sched_class->post_schedule(rq); #endif @@ -2913,6 +2999,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned) { + int tsk_cache_hot = 0; /* * We do not migrate tasks that are: * 1) running (obviously), or @@ -2936,10 +3023,11 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * 2) too many balance attempts have failed. */ - if (!task_hot(p, rq->clock, sd) || - sd->nr_balance_failed > sd->cache_nice_tries) { + tsk_cache_hot = task_hot(p, rq->clock, sd); + if (!tsk_cache_hot || + sd->nr_balance_failed > sd->cache_nice_tries) { #ifdef CONFIG_SCHEDSTATS - if (task_hot(p, rq->clock, sd)) { + if (tsk_cache_hot) { schedstat_inc(sd, lb_hot_gained[idle]); schedstat_inc(p, se.nr_forced_migrations); } @@ -2947,7 +3035,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, return 1; } - if (task_hot(p, rq->clock, sd)) { + if (tsk_cache_hot) { schedstat_inc(p, se.nr_failed_migrations_hot); return 0; } @@ -2987,6 +3075,16 @@ next: pulled++; rem_load_move -= p->se.load.weight; +#ifdef CONFIG_PREEMPT + /* + * NEWIDLE balancing is a source of latency, so preemptible kernels + * will stop after the first task is pulled to minimize the critical + * section. + */ + if (idle == CPU_NEWLY_IDLE) + goto out; +#endif + /* * We only want to steal up to the prescribed amount of weighted load. */ @@ -3033,9 +3131,15 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, sd, idle, all_pinned, &this_best_prio); class = class->next; +#ifdef CONFIG_PREEMPT + /* + * NEWIDLE balancing is a source of latency, so preemptible + * kernels will stop after the first task is pulled to minimize + * the critical section. + */ if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) break; - +#endif } while (class && max_load_move > total_load_moved); return total_load_moved > 0; @@ -4057,6 +4161,11 @@ static void run_rebalance_domains(struct softirq_action *h) #endif } +static inline int on_null_domain(int cpu) +{ + return !rcu_dereference(cpu_rq(cpu)->sd); +} + /* * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. * @@ -4114,7 +4223,9 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) cpumask_test_cpu(cpu, nohz.cpu_mask)) return; #endif - if (time_after_eq(jiffies, rq->next_balance)) + /* Don't need to rebalance while attached to NULL domain */ + if (time_after_eq(jiffies, rq->next_balance) && + likely(!on_null_domain(cpu))) raise_softirq(SCHED_SOFTIRQ); } @@ -4508,11 +4619,33 @@ static inline void schedule_debug(struct task_struct *prev) #endif } +static void put_prev_task(struct rq *rq, struct task_struct *prev) +{ + if (prev->state == TASK_RUNNING) { + u64 runtime = prev->se.sum_exec_runtime; + + runtime -= prev->se.prev_sum_exec_runtime; + runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); + + /* + * In order to avoid avg_overlap growing stale when we are + * indeed overlapping and hence not getting put to sleep, grow + * the avg_overlap on preemption. + * + * We use the average preemption runtime because that + * correlates to the amount of cache footprint a task can + * build up. + */ + update_avg(&prev->se.avg_overlap, runtime); + } + prev->sched_class->put_prev_task(rq, prev); +} + /* * Pick up the highest-prio task: */ static inline struct task_struct * -pick_next_task(struct rq *rq, struct task_struct *prev) +pick_next_task(struct rq *rq) { const struct sched_class *class; struct task_struct *p; @@ -4586,8 +4719,8 @@ need_resched_nonpreemptible: if (unlikely(!rq->nr_running)) idle_balance(cpu, rq); - prev->sched_class->put_prev_task(rq, prev); - next = pick_next_task(rq, prev); + put_prev_task(rq, prev); + next = pick_next_task(rq); if (likely(prev != next)) { sched_info_switch(prev, next); @@ -4642,7 +4775,7 @@ asmlinkage void __sched preempt_schedule(void) * between schedule and now. */ barrier(); - } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); + } while (need_resched()); } EXPORT_SYMBOL(preempt_schedule); @@ -4671,7 +4804,7 @@ asmlinkage void __sched preempt_schedule_irq(void) * between schedule and now. */ barrier(); - } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); + } while (need_resched()); } #endif /* CONFIG_PREEMPT */ @@ -5145,7 +5278,7 @@ SYSCALL_DEFINE1(nice, int, increment) if (increment > 40) increment = 40; - nice = PRIO_TO_NICE(current->static_prio) + increment; + nice = TASK_NICE(current) + increment; if (nice < -20) nice = -20; if (nice > 19) @@ -6423,7 +6556,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu) if (!rq->nr_running) break; update_rq_clock(rq); - next = pick_next_task(rq, rq->curr); + next = pick_next_task(rq); if (!next) break; next->sched_class->put_prev_task(rq, next); @@ -8218,11 +8351,15 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) __set_bit(MAX_RT_PRIO, array->bitmap); #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED - rt_rq->highest_prio = MAX_RT_PRIO; + rt_rq->highest_prio.curr = MAX_RT_PRIO; +#ifdef CONFIG_SMP + rt_rq->highest_prio.next = MAX_RT_PRIO; +#endif #endif #ifdef CONFIG_SMP rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; + plist_head_init(&rq->rt.pushable_tasks, &rq->lock); #endif rt_rq->rt_time = 0; @@ -9598,7 +9735,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime) struct cpuacct *ca; int cpu; - if (!cpuacct_subsys.active) + if (unlikely(!cpuacct_subsys.active)) return; cpu = task_cpu(tsk); |