aboutsummaryrefslogtreecommitdiff
path: root/kernel/sched.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--kernel/sched.c365
1 files changed, 297 insertions, 68 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 0227f1625a7..a3a04085e79 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -52,8 +52,9 @@
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
-#include <asm/tlb.h>
+#include <linux/reciprocal_div.h>
+#include <asm/tlb.h>
#include <asm/unistd.h>
/*
@@ -168,7 +169,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
(MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
#define TASK_PREEMPTS_CURR(p, rq) \
- ((p)->prio < (rq)->curr->prio)
+ (((p)->prio < (rq)->curr->prio) && ((p)->array == (rq)->active))
#define SCALE_PRIO(x, prio) \
max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
@@ -181,6 +182,27 @@ static unsigned int static_prio_timeslice(int static_prio)
return SCALE_PRIO(DEF_TIMESLICE, static_prio);
}
+#ifdef CONFIG_SMP
+/*
+ * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
+ * Since cpu_power is a 'constant', we can use a reciprocal divide.
+ */
+static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
+{
+ return reciprocal_divide(load, sg->reciprocal_cpu_power);
+}
+
+/*
+ * Each time a sched group cpu_power is changed,
+ * we must compute its reciprocal value
+ */
+static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
+{
+ sg->__cpu_power += val;
+ sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
+}
+#endif
+
/*
* task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
* to time slice values: [800ms ... 100ms ... 5ms]
@@ -223,6 +245,10 @@ struct rq {
unsigned long raw_weighted_load;
#ifdef CONFIG_SMP
unsigned long cpu_load[3];
+ unsigned char idle_at_tick;
+#ifdef CONFIG_NO_HZ
+ unsigned char in_nohz_recently;
+#endif
#endif
unsigned long long nr_switches;
@@ -278,7 +304,7 @@ struct rq {
struct lock_class_key rq_lock_key;
};
-static DEFINE_PER_CPU(struct rq, runqueues);
+static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
static inline int cpu_of(struct rq *rq)
{
@@ -1049,6 +1075,17 @@ static void resched_task(struct task_struct *p)
if (!tsk_is_polling(p))
smp_send_reschedule(cpu);
}
+
+static void resched_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ if (!spin_trylock_irqsave(&rq->lock, flags))
+ return;
+ resched_task(cpu_curr(cpu));
+ spin_unlock_irqrestore(&rq->lock, flags);
+}
#else
static inline void resched_task(struct task_struct *p)
{
@@ -1241,7 +1278,8 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
}
/* Adjust by relative CPU power of the group */
- avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+ avg_load = sg_div_cpu_power(group,
+ avg_load * SCHED_LOAD_SCALE);
if (local_group) {
this_load = avg_load;
@@ -1368,7 +1406,16 @@ static int wake_idle(int cpu, struct task_struct *p)
struct sched_domain *sd;
int i;
- if (idle_cpu(cpu))
+ /*
+ * If it is idle, then it is the best cpu to run this task.
+ *
+ * This cpu is also the best, if it has more than one task already.
+ * Siblings must also be busy (in most cases) as they didn't already
+ * pick up the extra load from this cpu and hence we need not check
+ * sibling runqueue info. This will avoid the checks and cache miss
+ * penalties associated with that.
+ */
+ if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
return cpu;
for_each_domain(cpu, sd) {
@@ -2352,12 +2399,13 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
}
total_load += avg_load;
- total_pwr += group->cpu_power;
+ total_pwr += group->__cpu_power;
/* Adjust by relative CPU power of the group */
- avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+ avg_load = sg_div_cpu_power(group,
+ avg_load * SCHED_LOAD_SCALE);
- group_capacity = group->cpu_power / SCHED_LOAD_SCALE;
+ group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
if (local_group) {
this_load = avg_load;
@@ -2468,8 +2516,8 @@ group_next:
max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
/* How much load to actually move to equalise the imbalance */
- *imbalance = min(max_pull * busiest->cpu_power,
- (avg_load - this_load) * this->cpu_power)
+ *imbalance = min(max_pull * busiest->__cpu_power,
+ (avg_load - this_load) * this->__cpu_power)
/ SCHED_LOAD_SCALE;
/*
@@ -2503,28 +2551,29 @@ small_imbalance:
* moving them.
*/
- pwr_now += busiest->cpu_power *
- min(busiest_load_per_task, max_load);
- pwr_now += this->cpu_power *
- min(this_load_per_task, this_load);
+ pwr_now += busiest->__cpu_power *
+ min(busiest_load_per_task, max_load);
+ pwr_now += this->__cpu_power *
+ min(this_load_per_task, this_load);
pwr_now /= SCHED_LOAD_SCALE;
/* Amount of load we'd subtract */
- tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
- busiest->cpu_power;
+ tmp = sg_div_cpu_power(busiest,
+ busiest_load_per_task * SCHED_LOAD_SCALE);
if (max_load > tmp)
- pwr_move += busiest->cpu_power *
+ pwr_move += busiest->__cpu_power *
min(busiest_load_per_task, max_load - tmp);
/* Amount of load we'd add */
- if (max_load * busiest->cpu_power <
+ if (max_load * busiest->__cpu_power <
busiest_load_per_task * SCHED_LOAD_SCALE)
- tmp = max_load * busiest->cpu_power / this->cpu_power;
+ tmp = sg_div_cpu_power(this,
+ max_load * busiest->__cpu_power);
else
- tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
- this->cpu_power;
- pwr_move += this->cpu_power *
- min(this_load_per_task, this_load + tmp);
+ tmp = sg_div_cpu_power(this,
+ busiest_load_per_task * SCHED_LOAD_SCALE);
+ pwr_move += this->__cpu_power *
+ min(this_load_per_task, this_load + tmp);
pwr_move /= SCHED_LOAD_SCALE;
/* Move if we gain throughput */
@@ -2657,6 +2706,12 @@ redo:
double_rq_unlock(this_rq, busiest);
local_irq_restore(flags);
+ /*
+ * some other cpu did the load balance for us.
+ */
+ if (nr_moved && this_cpu != smp_processor_id())
+ resched_cpu(this_cpu);
+
/* All tasks on this runqueue were pinned by CPU affinity */
if (unlikely(all_pinned)) {
cpu_clear(cpu_of(busiest), cpus);
@@ -2927,32 +2982,98 @@ static void update_load(struct rq *this_rq)
}
}
+#ifdef CONFIG_NO_HZ
+static struct {
+ atomic_t load_balancer;
+ cpumask_t cpu_mask;
+} nohz ____cacheline_aligned = {
+ .load_balancer = ATOMIC_INIT(-1),
+ .cpu_mask = CPU_MASK_NONE,
+};
+
/*
- * run_rebalance_domains is triggered when needed from the scheduler tick.
+ * This routine will try to nominate the ilb (idle load balancing)
+ * owner among the cpus whose ticks are stopped. ilb owner will do the idle
+ * load balancing on behalf of all those cpus. If all the cpus in the system
+ * go into this tickless mode, then there will be no ilb owner (as there is
+ * no need for one) and all the cpus will sleep till the next wakeup event
+ * arrives...
*
+ * For the ilb owner, tick is not stopped. And this tick will be used
+ * for idle load balancing. ilb owner will still be part of
+ * nohz.cpu_mask.
+ *
+ * While stopping the tick, this cpu will become the ilb owner if there
+ * is no other owner. It will remain the owner until it becomes busy
+ * or until all cpus in the system stop their ticks, at which point
+ * there is no need for an ilb owner.
+ *
+ * When the ilb owner becomes busy, it nominates another owner, during the
+ * next busy scheduler_tick()
+ */
+int select_nohz_load_balancer(int stop_tick)
+{
+ int cpu = smp_processor_id();
+
+ if (stop_tick) {
+ cpu_set(cpu, nohz.cpu_mask);
+ cpu_rq(cpu)->in_nohz_recently = 1;
+
+ /*
+ * If we are going offline and still the leader, give up!
+ */
+ if (cpu_is_offline(cpu) &&
+ atomic_read(&nohz.load_balancer) == cpu) {
+ if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+ BUG();
+ return 0;
+ }
+
+ /* time for ilb owner also to sleep */
+ if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
+ if (atomic_read(&nohz.load_balancer) == cpu)
+ atomic_set(&nohz.load_balancer, -1);
+ return 0;
+ }
+
+ if (atomic_read(&nohz.load_balancer) == -1) {
+ /* make me the ilb owner */
+ if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
+ return 1;
+ } else if (atomic_read(&nohz.load_balancer) == cpu)
+ return 1;
+ } else {
+ if (!cpu_isset(cpu, nohz.cpu_mask))
+ return 0;
+
+ cpu_clear(cpu, nohz.cpu_mask);
+
+ if (atomic_read(&nohz.load_balancer) == cpu)
+ if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+ BUG();
+ }
+ return 0;
+}
+#endif
+
+static DEFINE_SPINLOCK(balancing);
+
+/*
* It checks each scheduling domain to see if it is due to be balanced,
* and initiates a balancing operation if so.
*
* Balancing parameters are set up in arch_init_sched_domains.
*/
-static DEFINE_SPINLOCK(balancing);
-
-static void run_rebalance_domains(struct softirq_action *h)
+static inline void rebalance_domains(int cpu, enum idle_type idle)
{
- int this_cpu = smp_processor_id(), balance = 1;
- struct rq *this_rq = cpu_rq(this_cpu);
+ int balance = 1;
+ struct rq *rq = cpu_rq(cpu);
unsigned long interval;
struct sched_domain *sd;
- /*
- * We are idle if there are no processes running. This
- * is valid even if we are the idle process (SMT).
- */
- enum idle_type idle = !this_rq->nr_running ?
- SCHED_IDLE : NOT_IDLE;
- /* Earliest time when we have to call run_rebalance_domains again */
+ /* Earliest time when we have to do rebalance again */
unsigned long next_balance = jiffies + 60*HZ;
- for_each_domain(this_cpu, sd) {
+ for_each_domain(cpu, sd) {
if (!(sd->flags & SD_LOAD_BALANCE))
continue;
@@ -2971,7 +3092,7 @@ static void run_rebalance_domains(struct softirq_action *h)
}
if (time_after_eq(jiffies, sd->last_balance + interval)) {
- if (load_balance(this_cpu, this_rq, sd, idle, &balance)) {
+ if (load_balance(cpu, rq, sd, idle, &balance)) {
/*
* We've pulled tasks over so either we're no
* longer idle, or one of our SMT siblings is
@@ -2995,7 +3116,114 @@ out:
if (!balance)
break;
}
- this_rq->next_balance = next_balance;
+ rq->next_balance = next_balance;
+}
+
+/*
+ * run_rebalance_domains is triggered when needed from the scheduler tick.
+ * In CONFIG_NO_HZ case, the idle load balance owner will do the
+ * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ */
+static void run_rebalance_domains(struct softirq_action *h)
+{
+ int local_cpu = smp_processor_id();
+ struct rq *local_rq = cpu_rq(local_cpu);
+ enum idle_type idle = local_rq->idle_at_tick ? SCHED_IDLE : NOT_IDLE;
+
+ rebalance_domains(local_cpu, idle);
+
+#ifdef CONFIG_NO_HZ
+ /*
+ * If this cpu is the owner for idle load balancing, then do the
+ * balancing on behalf of the other idle cpus whose ticks are
+ * stopped.
+ */
+ if (local_rq->idle_at_tick &&
+ atomic_read(&nohz.load_balancer) == local_cpu) {
+ cpumask_t cpus = nohz.cpu_mask;
+ struct rq *rq;
+ int balance_cpu;
+
+ cpu_clear(local_cpu, cpus);
+ for_each_cpu_mask(balance_cpu, cpus) {
+ /*
+ * If this cpu gets work to do, stop the load balancing
+ * work being done for other cpus. Next load
+ * balancing owner will pick it up.
+ */
+ if (need_resched())
+ break;
+
+ rebalance_domains(balance_cpu, SCHED_IDLE);
+
+ rq = cpu_rq(balance_cpu);
+ if (time_after(local_rq->next_balance, rq->next_balance))
+ local_rq->next_balance = rq->next_balance;
+ }
+ }
+#endif
+}
+
+/*
+ * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
+ *
+ * In case of CONFIG_NO_HZ, this is the place where we nominate a new
+ * idle load balancing owner or decide to stop the periodic load balancing,
+ * if the whole system is idle.
+ */
+static inline void trigger_load_balance(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+#ifdef CONFIG_NO_HZ
+ /*
+ * If we were in the nohz mode recently and busy at the current
+ * scheduler tick, then check if we need to nominate a new idle
+ * load balancer.
+ */
+ if (rq->in_nohz_recently && !rq->idle_at_tick) {
+ rq->in_nohz_recently = 0;
+
+ if (atomic_read(&nohz.load_balancer) == cpu) {
+ cpu_clear(cpu, nohz.cpu_mask);
+ atomic_set(&nohz.load_balancer, -1);
+ }
+
+ if (atomic_read(&nohz.load_balancer) == -1) {
+ /*
+ * simple selection for now: Nominate the
+ * first cpu in the nohz list to be the next
+ * ilb owner.
+ *
+ * TBD: Traverse the sched domains and nominate
+ * the nearest cpu in the nohz.cpu_mask.
+ */
+ int ilb = first_cpu(nohz.cpu_mask);
+
+ if (ilb != NR_CPUS)
+ resched_cpu(ilb);
+ }
+ }
+
+ /*
+ * If this cpu is idle and doing idle load balancing for all the
+ * cpus with ticks stopped, is it time for that to stop?
+ */
+ if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
+ cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
+ resched_cpu(cpu);
+ return;
+ }
+
+ /*
+ * If this cpu is idle and the idle load balancing is done by
+ * someone else, then there is no need to raise the SCHED_SOFTIRQ
+ */
+ if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
+ cpu_isset(cpu, nohz.cpu_mask))
+ return;
+#endif
+ if (time_after_eq(jiffies, rq->next_balance))
+ raise_softirq(SCHED_SOFTIRQ);
}
#else
/*
@@ -3218,16 +3446,17 @@ void scheduler_tick(void)
unsigned long long now = sched_clock();
struct task_struct *p = current;
int cpu = smp_processor_id();
+ int idle_at_tick = idle_cpu(cpu);
struct rq *rq = cpu_rq(cpu);
update_cpu_clock(p, rq, now);
- if (p != rq->idle)
+ if (!idle_at_tick)
task_running_tick(rq, p);
#ifdef CONFIG_SMP
update_load(rq);
- if (time_after_eq(jiffies, rq->next_balance))
- raise_softirq(SCHED_SOFTIRQ);
+ rq->idle_at_tick = idle_at_tick;
+ trigger_load_balance(cpu);
#endif
}
@@ -3847,13 +4076,13 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
struct prio_array *array;
unsigned long flags;
struct rq *rq;
- int oldprio;
+ int delta;
BUG_ON(prio < 0 || prio > MAX_PRIO);
rq = task_rq_lock(p, &flags);
- oldprio = p->prio;
+ delta = prio - p->prio;
array = p->array;
if (array)
dequeue_task(p, array);
@@ -3869,13 +4098,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
enqueue_task(p, array);
/*
* Reschedule if we are currently running on this runqueue and
- * our priority decreased, or if we are not currently running on
- * this runqueue and our priority is higher than the current's
+ * our priority decreased, or if our priority became higher
+ * than the current's.
*/
- if (task_running(rq, p)) {
- if (p->prio > oldprio)
- resched_task(rq->curr);
- } else if (TASK_PREEMPTS_CURR(p, rq))
+ if (TASK_PREEMPTS_CURR(p, rq) ||
+ (delta > 0 && task_running(rq, p)))
resched_task(rq->curr);
}
task_rq_unlock(rq, &flags);
@@ -3923,10 +4150,12 @@ void set_user_nice(struct task_struct *p, long nice)
enqueue_task(p, array);
inc_raw_weighted_load(rq, p);
/*
- * If the task increased its priority or is running and
- * lowered its priority, then reschedule its CPU:
+ * Reschedule if we are currently running on this runqueue and
+ * our priority decreased, or if our priority became higher
+ * than the current's.
*/
- if (delta < 0 || (delta > 0 && task_running(rq, p)))
+ if (TASK_PREEMPTS_CURR(p, rq) ||
+ (delta > 0 && task_running(rq, p)))
resched_task(rq->curr);
}
out_unlock:
@@ -4153,13 +4382,11 @@ recheck:
__activate_task(p, rq);
/*
* Reschedule if we are currently running on this runqueue and
- * our priority decreased, or if we are not currently running on
- * this runqueue and our priority is higher than the current's
+ * our priority decreased, or if our priority became higher
+ * than the current's.
*/
- if (task_running(rq, p)) {
- if (p->prio > oldprio)
- resched_task(rq->curr);
- } else if (TASK_PREEMPTS_CURR(p, rq))
+ if (TASK_PREEMPTS_CURR(p, rq) ||
+ (task_running(rq, p) && p->prio > oldprio))
resched_task(rq->curr);
}
__task_rq_unlock(rq);
@@ -4750,6 +4977,8 @@ void show_state_filter(unsigned long state_filter)
show_task(p);
} while_each_thread(g, p);
+ touch_all_softlockup_watchdogs();
+
read_unlock(&tasklist_lock);
/*
* Only show locks if all tasks are dumped:
@@ -5304,7 +5533,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
break;
}
- if (!group->cpu_power) {
+ if (!group->__cpu_power) {
printk("\n");
printk(KERN_ERR "ERROR: domain->cpu_power not "
"set\n");
@@ -5481,7 +5710,7 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
continue;
sg->cpumask = CPU_MASK_NONE;
- sg->cpu_power = 0;
+ sg->__cpu_power = 0;
for_each_cpu_mask(j, span) {
if (group_fn(j, cpu_map, NULL) != group)
@@ -6170,7 +6399,7 @@ next_sg:
continue;
}
- sg->cpu_power += sd->groups->cpu_power;
+ sg_inc_cpu_power(sg, sd->groups->__cpu_power);
}
sg = sg->next;
if (sg != group_head)
@@ -6245,6 +6474,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
child = sd->child;
+ sd->groups->__cpu_power = 0;
+
/*
* For perf policy, if the groups in child domain share resources
* (for example cores sharing some portions of the cache hierarchy
@@ -6255,18 +6486,16 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
(child->flags &
(SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
- sd->groups->cpu_power = SCHED_LOAD_SCALE;
+ sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
return;
}
- sd->groups->cpu_power = 0;
-
/*
* add cpu_power of each child group to this groups cpu_power
*/
group = child->groups;
do {
- sd->groups->cpu_power += group->cpu_power;
+ sg_inc_cpu_power(sd->groups, group->__cpu_power);
group = group->next;
} while (group != child->groups);
}
@@ -6426,7 +6655,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
sd = &per_cpu(node_domains, j);
sd->groups = sg;
}
- sg->cpu_power = 0;
+ sg->__cpu_power = 0;
sg->cpumask = nodemask;
sg->next = sg;
cpus_or(covered, covered, nodemask);
@@ -6454,7 +6683,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
"Can not alloc domain group for node %d\n", j);
goto error;
}
- sg->cpu_power = 0;
+ sg->__cpu_power = 0;
sg->cpumask = tmp;
sg->next = prev->next;
cpus_or(covered, covered, tmp);