From f6cf891c4d7128f9f91243fc0b9ce99e10fa1586 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 28 Aug 2007 12:53:24 +0200 Subject: sched: make the scheduler converge to the ideal latency de-HZ-ification of the granularity defaults unearthed a pre-existing property of CFS: while it correctly converges to the granularity goal, it does not prevent run-time fluctuations in the range of [-gran ... 0 ... +gran]. With the increase of the granularity due to the removal of HZ dependencies, this becomes visible in chew-max output (with 5 tasks running): out: 28 . 27. 32 | flu: 0 . 0 | ran: 9 . 13 | per: 37 . 40 out: 27 . 27. 32 | flu: 0 . 0 | ran: 17 . 13 | per: 44 . 40 out: 27 . 27. 32 | flu: 0 . 0 | ran: 9 . 13 | per: 36 . 40 out: 29 . 27. 32 | flu: 2 . 0 | ran: 17 . 13 | per: 46 . 40 out: 28 . 27. 32 | flu: 0 . 0 | ran: 9 . 13 | per: 37 . 40 out: 29 . 27. 32 | flu: 0 . 0 | ran: 18 . 13 | per: 47 . 40 out: 28 . 27. 32 | flu: 0 . 0 | ran: 9 . 13 | per: 37 . 40 average slice is the ideal 13 msecs and the period is picture-perfect 40 msecs. But the 'ran' field fluctuates around 13.33 msecs and there's no mechanism in CFS to keep that from happening: it's a perfectly valid solution that CFS finds. to fix this we add a granularity/preemption rule that knows about the "target latency", which makes tasks that run longer than the ideal latency run a bit less. The simplest approach is to simply decrease the preemption granularity when a task overruns its ideal latency. For this we have to track how much the task executed since its last preemption. ( this adds a new field to task_struct, but we can eliminate that overhead in 2.6.24 by putting all the scheduler timestamps into an anonymous union. ) with this change in place, chew-max output is fluctuation-less all around: out: 28 . 27. 39 | flu: 0 . 2 | ran: 13 . 13 | per: 41 . 40 out: 28 . 27. 39 | flu: 0 . 2 | ran: 13 . 13 | per: 41 . 40 out: 28 . 27. 39 | flu: 0 . 2 | ran: 13 . 13 | per: 41 . 40 out: 28 . 27. 39 | flu: 0 . 2 | ran: 13 . 13 | per: 41 . 40 out: 28 . 27. 39 | flu: 0 . 1 | ran: 13 . 13 | per: 41 . 40 out: 28 . 27. 39 | flu: 0 . 1 | ran: 13 . 13 | per: 41 . 40 this patch has no impact on any fastpath or on any globally observable scheduling property. (unless you have sharp enough eyes to see millisecond-level ruckles in glxgears smoothness :-) Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith --- kernel/sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 9fe473a190d..b533d6db78a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1587,6 +1587,7 @@ static void __sched_fork(struct task_struct *p) p->se.wait_start_fair = 0; p->se.exec_start = 0; p->se.sum_exec_runtime = 0; + p->se.prev_sum_exec_runtime = 0; p->se.delta_exec = 0; p->se.delta_fair_run = 0; p->se.delta_fair_sleep = 0; -- cgit v1.2.3 From 7fd0d2dde929ead79901e389e70dbfb3c6c06986 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 5 Sep 2007 14:32:48 +0200 Subject: sched: fix MC/HT scheduler optimization, without breaking the FUZZ logic. First fix the check if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task) with this if (*imbalance < busiest_load_per_task) As the current check is always false for nice 0 tasks (as SCHED_LOAD_SCALE_FUZZ is same as busiest_load_per_task for nice 0 tasks). With the above change, imbalance was getting reset to 0 in the corner case condition, making the FUZZ logic fail. Fix it by not corrupting the imbalance and change the imbalance, only when it finds that the HT/MC optimization is needed. Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index b533d6db78a..c8759ec6d8a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2512,7 +2512,7 @@ group_next: * a think about bumping its value to force at least one task to be * moved */ - if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task) { + if (*imbalance < busiest_load_per_task) { unsigned long tmp, pwr_now, pwr_move; unsigned int imbn; @@ -2564,10 +2564,8 @@ small_imbalance: pwr_move /= SCHED_LOAD_SCALE; /* Move if we gain throughput */ - if (pwr_move <= pwr_now) - goto out_balanced; - - *imbalance = busiest_load_per_task; + if (pwr_move > pwr_now) + *imbalance = busiest_load_per_task; } return busiest; -- cgit v1.2.3 From a206c07213cf6372289f189c3774c4c3255a7ae1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 5 Sep 2007 14:32:49 +0200 Subject: sched: debug: fix cfs_rq->wait_runtime accounting the cfs_rq->wait_runtime debug/statistics counter was not maintained properly - fix this. this also removes some code: text data bss dec hex filename 13420 228 1204 14852 3a04 sched.o.before 13404 228 1204 14836 39f4 sched.o.after Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra --- kernel/sched.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index c8759ec6d8a..97986f1f0be 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -858,7 +858,6 @@ static void dec_nr_running(struct task_struct *p, struct rq *rq) static void set_load_weight(struct task_struct *p) { - task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime; p->se.wait_runtime = 0; if (task_has_rt_policy(p)) { -- cgit v1.2.3 From cf2ab4696ee42f895eed88c2b6e432fe03dda0db Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 5 Sep 2007 14:32:49 +0200 Subject: sched: fix xtensa build warning rename RSR to SRR - 'RSR' is already defined on xtensa. found by Adrian Bunk. Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 97986f1f0be..deeb1f8e0c3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -668,7 +668,7 @@ static u64 div64_likely32(u64 divident, unsigned long divisor) /* * Shift right and round: */ -#define RSR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) +#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) static unsigned long calc_delta_mine(unsigned long delta_exec, unsigned long weight, @@ -684,10 +684,10 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, * Check whether we'd overflow the 64-bit multiplication: */ if (unlikely(tmp > WMULT_CONST)) - tmp = RSR(RSR(tmp, WMULT_SHIFT/2) * lw->inv_weight, + tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, WMULT_SHIFT/2); else - tmp = RSR(tmp * lw->inv_weight, WMULT_SHIFT); + tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); } -- cgit v1.2.3