From 5c8e1ed1d204a6770ca2854cd3b3597070fe7e5a Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Thu, 29 May 2008 11:17:01 -0700 Subject: sched: CPU hotplug events must not destroy scheduler domains created by the cpusets First issue is not related to the cpusets. We're simply leaking doms_cur. It's allocated in arch_init_sched_domains() which is called for every hotplug event. So we just keep reallocation doms_cur without freeing it. I introduced free_sched_domains() function that cleans things up. Second issue is that sched domains created by the cpusets are completely destroyed by the CPU hotplug events. For all CPU hotplug events scheduler attaches all CPUs to the NULL domain and then puts them all into the single domain thereby destroying domains created by the cpusets (partition_sched_domains). The solution is simple, when cpusets are enabled scheduler should not create default domain and instead let cpusets do that. Which is exactly what the patch does. Signed-off-by: Max Krasnyansky Cc: pj@sgi.com Cc: menage@google.com Cc: rostedt@goodmis.org Cc: mingo@elte.hu Acked-by: Peter Zijlstra Signed-off-by: Thomas Gleixner --- kernel/cpuset.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel/cpuset.c') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 86ea9e34e32..6090d18b58a 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1886,6 +1886,12 @@ static void common_cpu_mem_hotplug_unplug(void) top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; scan_for_empty_cpusets(&top_cpuset); + /* + * Scheduler destroys domains on hotplug events. + * Rebuild them based on the current settings. + */ + rebuild_sched_domains(); + cgroup_unlock(); } -- cgit v1.2.3 From 9985b0bab332289f14837eff3c6e0bcc658b58f7 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 5 Jun 2008 12:57:11 -0700 Subject: sched: prevent bound kthreads from changing cpus_allowed Kthreads that have called kthread_bind() are bound to specific cpus, so other tasks should not be able to change their cpus_allowed from under them. Otherwise, it is possible to move kthreads, such as the migration or software watchdog threads, so they are not allowed access to the cpu they work on. Cc: Peter Zijlstra Cc: Paul Menage Cc: Paul Jackson Signed-off-by: David Rientjes Signed-off-by: Ingo Molnar --- kernel/cpuset.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel/cpuset.c') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 6090d18b58a..b84354f4de3 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1190,6 +1190,15 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) return -ENOSPC; + if (tsk->flags & PF_THREAD_BOUND) { + cpumask_t mask; + + mutex_lock(&callback_mutex); + mask = cs->cpus_allowed; + mutex_unlock(&callback_mutex); + if (!cpus_equal(tsk->cpus_allowed, mask)) + return -EINVAL; + } return security_task_setscheduler(tsk, 0, NULL); } @@ -1203,11 +1212,14 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct mm_struct *mm; struct cpuset *cs = cgroup_cs(cont); struct cpuset *oldcs = cgroup_cs(oldcont); + int err; mutex_lock(&callback_mutex); guarantee_online_cpus(cs, &cpus); - set_cpus_allowed_ptr(tsk, &cpus); + err = set_cpus_allowed_ptr(tsk, &cpus); mutex_unlock(&callback_mutex); + if (err) + return; from = oldcs->mems_allowed; to = cs->mems_allowed; -- cgit v1.2.3 From 3e84050c81ffb4961ef43d20e1fb1d7607167d83 Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Sun, 13 Jul 2008 02:10:29 +0200 Subject: cpusets, hotplug, scheduler: fix scheduler domain breakage Commit f18f982ab ("sched: CPU hotplug events must not destroy scheduler domains created by the cpusets") introduced a hotplug-related problem as described below: Upon CPU_DOWN_PREPARE, update_sched_domains() -> detach_destroy_domains(&cpu_online_map) does the following: /* * Force a reinitialization of the sched domains hierarchy. The domains * and groups cannot be updated in place without racing with the balancing * code, so we temporarily attach all running cpus to the NULL domain * which will prevent rebalancing while the sched domains are recalculated. */ The sched-domains should be rebuilt when a CPU_DOWN ops. has been completed, effectively either upon CPU_DEAD{_FROZEN} (upon success) or CPU_DOWN_FAILED{_FROZEN} (upon failure -- restore the things to their initial state). That's what update_sched_domains() also does but only for !CPUSETS case. With f18f982ab, sched-domains' reinitialization is delegated to CPUSETS code: cpuset_handle_cpuhp() -> common_cpu_mem_hotplug_unplug() -> rebuild_sched_domains() Being called for CPU_UP_PREPARE and if its callback is called after update_sched_domains()), it just negates all the work done by update_sched_domains() -- i.e. a soon-to-be-offline cpu is included in the sched-domains and that makes it visible for the load-balancer while the CPU_DOWN ops. is in progress. __migrate_live_tasks() moves the tasks off a 'dead' cpu (it's already "offline" when this function is called). try_to_wake_up() is called for one of these tasks from another CPU -> the load-balancer (wake_idle()) picks up a "dead" CPU and places the task on it. Then e.g. BUG_ON(rq->nr_running) detects this a bit later -> oops. Signed-off-by: Dmitry Adamushko Tested-by: Vegard Nossum Cc: Paul Menage Cc: Max Krasnyansky Cc: Paul Jackson Cc: Peter Zijlstra Cc: miaox@cn.fujitsu.com Cc: rostedt@goodmis.org Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- kernel/cpuset.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'kernel/cpuset.c') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 9fceb97e989..798b3ab054e 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1882,7 +1882,7 @@ static void scan_for_empty_cpusets(const struct cpuset *root) * in order to minimize text size. */ -static void common_cpu_mem_hotplug_unplug(void) +static void common_cpu_mem_hotplug_unplug(int rebuild_sd) { cgroup_lock(); @@ -1894,7 +1894,8 @@ static void common_cpu_mem_hotplug_unplug(void) * Scheduler destroys domains on hotplug events. * Rebuild them based on the current settings. */ - rebuild_sched_domains(); + if (rebuild_sd) + rebuild_sched_domains(); cgroup_unlock(); } @@ -1912,11 +1913,22 @@ static void common_cpu_mem_hotplug_unplug(void) static int cpuset_handle_cpuhp(struct notifier_block *unused_nb, unsigned long phase, void *unused_cpu) { - if (phase == CPU_DYING || phase == CPU_DYING_FROZEN) + switch (phase) { + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + case CPU_DEAD: + case CPU_DEAD_FROZEN: + common_cpu_mem_hotplug_unplug(1); + break; + default: return NOTIFY_DONE; + } - common_cpu_mem_hotplug_unplug(); - return 0; + return NOTIFY_OK; } #ifdef CONFIG_MEMORY_HOTPLUG @@ -1929,7 +1941,7 @@ static int cpuset_handle_cpuhp(struct notifier_block *unused_nb, void cpuset_track_online_nodes(void) { - common_cpu_mem_hotplug_unplug(); + common_cpu_mem_hotplug_unplug(0); } #endif -- cgit v1.2.3 From e761b7725234276a802322549cee5255305a0930 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Tue, 15 Jul 2008 04:43:49 -0700 Subject: cpu hotplug, sched: Introduce cpu_active_map and redo sched domain managment (take 2) This is based on Linus' idea of creating cpu_active_map that prevents scheduler load balancer from migrating tasks to the cpu that is going down. It allows us to simplify domain management code and avoid unecessary domain rebuilds during cpu hotplug event handling. Please ignore the cpusets part for now. It needs some more work in order to avoid crazy lock nesting. Although I did simplfy and unify domain reinitialization logic. We now simply call partition_sched_domains() in all the cases. This means that we're using exact same code paths as in cpusets case and hence the test below cover cpusets too. Cpuset changes to make rebuild_sched_domains() callable from various contexts are in the separate patch (right next after this one). This not only boots but also easily handles while true; do make clean; make -j 8; done and while true; do on-off-cpu 1; done at the same time. (on-off-cpu 1 simple does echo 0/1 > /sys/.../cpu1/online thing). Suprisingly the box (dual-core Core2) is quite usable. In fact I'm typing this on right now in gnome-terminal and things are moving just fine. Also this is running with most of the debug features enabled (lockdep, mutex, etc) no BUG_ONs or lockdep complaints so far. I believe I addressed all of the Dmitry's comments for original Linus' version. I changed both fair and rt balancer to mask out non-active cpus. And replaced cpu_is_offline() with !cpu_active() in the main scheduler code where it made sense (to me). Signed-off-by: Max Krasnyanskiy Acked-by: Linus Torvalds Acked-by: Peter Zijlstra Acked-by: Gregory Haskins Cc: dmitry.adamushko@gmail.com Cc: pj@sgi.com Signed-off-by: Ingo Molnar --- kernel/cpuset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/cpuset.c') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 459d601947a..3c3ef02f65f 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -564,7 +564,7 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) * partition_sched_domains(). */ -static void rebuild_sched_domains(void) +void rebuild_sched_domains(void) { struct kfifo *q; /* queue of cpusets to be scanned */ struct cpuset *cp; /* scans q */ -- cgit v1.2.3 From 91cd4d6ef0abb1f65e81f8fe37e7d3c10344e38c Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 21 Jul 2008 14:21:35 -0700 Subject: cpusets: fix wrong domain attr updates Fix wrong domain attr updates, or we will always update the first sched domain attr. Signed-off-by: Miao Xie Cc: Hidetoshi Seto Cc: Paul Jackson Cc: Nick Piggin Cc: Ingo Molnar Cc: [2.6.26.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel/cpuset.c') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 459d601947a..d2cc67dac8b 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -679,7 +679,9 @@ restart: if (apn == b->pn) { cpus_or(*dp, *dp, b->cpus_allowed); b->pn = -1; - update_domain_attr(dattr, b); + if (dattr) + update_domain_attr(dattr + + nslot, b); } } nslot++; -- cgit v1.2.3