From ea6f18ed5a1531caf678374f30a0990c9e6742f3 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Tue, 25 Nov 2008 02:35:02 +1030 Subject: sched: reduce stack size requirements in kernel/sched.c Impact: cleanup * use node_to_cpumask_ptr in place of node_to_cpumask to reduce stack requirements in sched.c Signed-off-by: Mike Travis Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index bb827651558..dd22cec499b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6110,8 +6110,9 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) do { /* On same node? */ - mask = node_to_cpumask(cpu_to_node(dead_cpu)); - cpus_and(mask, mask, p->cpus_allowed); + node_to_cpumask_ptr(pnodemask, cpu_to_node(dead_cpu)); + + cpus_and(mask, *pnodemask, p->cpus_allowed); dest_cpu = any_online_cpu(mask); /* On any allowed CPU? */ @@ -7098,9 +7099,9 @@ static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, cpumask_t *nodemask) { int group; + node_to_cpumask_ptr(pnodemask, cpu_to_node(cpu)); - *nodemask = node_to_cpumask(cpu_to_node(cpu)); - cpus_and(*nodemask, *nodemask, *cpu_map); + cpus_and(*nodemask, *pnodemask, *cpu_map); group = first_cpu(*nodemask); if (sg) @@ -7150,9 +7151,9 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) for (i = 0; i < nr_node_ids; i++) { struct sched_group *oldsg, *sg = sched_group_nodes[i]; + node_to_cpumask_ptr(pnodemask, i); - *nodemask = node_to_cpumask(i); - cpus_and(*nodemask, *nodemask, *cpu_map); + cpus_and(*nodemask, *pnodemask, *cpu_map); if (cpus_empty(*nodemask)) continue; -- cgit v1.2.3 From abcd083a1a658d2bc1f7fced02632bfe03918002 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:02 +1030 Subject: sched: convert sched.c from for_each_cpu_mask to for_each_cpu. Impact: trivial API conversion This is a simple conversion, but note that for_each_cpu() terminates with i >= nr_cpu_ids, not i == NR_CPUS like for_each_cpu_mask() did. I don't convert all of them: sd->span changes in a later patch, so change those iterators there rather than here. 
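For illustration only (not an extra hunk), the conversion pattern is roughly:

    /* old: takes the mask by value; the loop runs until i == NR_CPUS */
    for_each_cpu_mask_nr(i, *cpu_map) {
            /* per-cpu work */
    }

    /* new: takes a pointer; the loop runs until i >= nr_cpu_ids */
    for_each_cpu(i, cpu_map) {
            /* per-cpu work */
    }

Here i and cpu_map stand in for whatever iterator and mask pointer the caller already has.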
Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index dd22cec499b..e59978eead1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2061,7 +2061,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) /* Tally up the load of all CPUs in the group */ avg_load = 0; - for_each_cpu_mask_nr(i, group->cpumask) { + for_each_cpu(i, &group->cpumask) { /* Bias balancing toward cpus of our domain */ if (local_group) load = source_load(i, load_idx); @@ -2103,7 +2103,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu, /* Traverse only the allowed CPUs */ cpus_and(*tmp, group->cpumask, p->cpus_allowed); - for_each_cpu_mask_nr(i, *tmp) { + for_each_cpu(i, tmp) { load = weighted_cpuload(i); if (load < min_load || (load == min_load && i == this_cpu)) { @@ -3121,7 +3121,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, max_cpu_load = 0; min_cpu_load = ~0UL; - for_each_cpu_mask_nr(i, group->cpumask) { + for_each_cpu(i, &group->cpumask) { struct rq *rq; if (!cpu_isset(i, *cpus)) @@ -3400,7 +3400,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, unsigned long max_load = 0; int i; - for_each_cpu_mask_nr(i, group->cpumask) { + for_each_cpu(i, &group->cpumask) { unsigned long wl; if (!cpu_isset(i, *cpus)) @@ -3942,7 +3942,7 @@ static void run_rebalance_domains(struct softirq_action *h) int balance_cpu; cpu_clear(this_cpu, cpus); - for_each_cpu_mask_nr(balance_cpu, cpus) { + for_each_cpu(balance_cpu, &cpus) { /* * If this cpu gets work to do, stop the load balancing * work being done for other cpus. Next load @@ -6906,7 +6906,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, cpus_clear(*covered); - for_each_cpu_mask_nr(i, *span) { + for_each_cpu(i, span) { struct sched_group *sg; int group = group_fn(i, cpu_map, &sg, tmpmask); int j; @@ -6917,7 +6917,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, cpus_clear(sg->cpumask); sg->__cpu_power = 0; - for_each_cpu_mask_nr(j, *span) { + for_each_cpu(j, span) { if (group_fn(j, cpu_map, NULL, tmpmask) != group) continue; @@ -7117,7 +7117,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) if (!sg) return; do { - for_each_cpu_mask_nr(j, sg->cpumask) { + for_each_cpu(j, &sg->cpumask) { struct sched_domain *sd; sd = &per_cpu(phys_domains, j); @@ -7142,7 +7142,7 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) { int cpu, i; - for_each_cpu_mask_nr(cpu, *cpu_map) { + for_each_cpu(cpu, cpu_map) { struct sched_group **sched_group_nodes = sched_group_nodes_bycpu[cpu]; @@ -7396,7 +7396,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, /* * Set up domains for cpus specified by the cpu_map. 
*/ - for_each_cpu_mask_nr(i, *cpu_map) { + for_each_cpu(i, cpu_map) { struct sched_domain *sd = NULL, *p; SCHED_CPUMASK_VAR(nodemask, allmasks); @@ -7463,7 +7463,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #ifdef CONFIG_SCHED_SMT /* Set up CPU (sibling) groups */ - for_each_cpu_mask_nr(i, *cpu_map) { + for_each_cpu(i, cpu_map) { SCHED_CPUMASK_VAR(this_sibling_map, allmasks); SCHED_CPUMASK_VAR(send_covered, allmasks); @@ -7480,7 +7480,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #ifdef CONFIG_SCHED_MC /* Set up multi-core groups */ - for_each_cpu_mask_nr(i, *cpu_map) { + for_each_cpu(i, cpu_map) { SCHED_CPUMASK_VAR(this_core_map, allmasks); SCHED_CPUMASK_VAR(send_covered, allmasks); @@ -7547,7 +7547,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, goto error; } sched_group_nodes[i] = sg; - for_each_cpu_mask_nr(j, *nodemask) { + for_each_cpu(j, nodemask) { struct sched_domain *sd; sd = &per_cpu(node_domains, j); @@ -7593,21 +7593,21 @@ static int __build_sched_domains(const cpumask_t *cpu_map, /* Calculate CPU power for physical packages and nodes */ #ifdef CONFIG_SCHED_SMT - for_each_cpu_mask_nr(i, *cpu_map) { + for_each_cpu(i, cpu_map) { struct sched_domain *sd = &per_cpu(cpu_domains, i); init_sched_groups_power(i, sd); } #endif #ifdef CONFIG_SCHED_MC - for_each_cpu_mask_nr(i, *cpu_map) { + for_each_cpu(i, cpu_map) { struct sched_domain *sd = &per_cpu(core_domains, i); init_sched_groups_power(i, sd); } #endif - for_each_cpu_mask_nr(i, *cpu_map) { + for_each_cpu(i, cpu_map) { struct sched_domain *sd = &per_cpu(phys_domains, i); init_sched_groups_power(i, sd); @@ -7627,7 +7627,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #endif /* Attach the domains */ - for_each_cpu_mask_nr(i, *cpu_map) { + for_each_cpu(i, cpu_map) { struct sched_domain *sd; #ifdef CONFIG_SCHED_SMT sd = &per_cpu(cpu_domains, i); @@ -7709,7 +7709,7 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) cpumask_t tmpmask; int i; - for_each_cpu_mask_nr(i, *cpu_map) + for_each_cpu(i, cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); synchronize_sched(); arch_destroy_sched_domains(cpu_map, &tmpmask); -- cgit v1.2.3 From 3404c8d97c2d3eb87b1bf4aadad957bfb5235b14 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:03 +1030 Subject: sched: get rid of boutique sched.c allocations, use cpumask_var_t. Impact: use new general API Using lots of allocs rather than one big alloc is less efficient, but who cares for this setup function? Signed-off-by: Rusty Russell Signed-off-by: Mike Travis Acked-by: Ingo Molnar Signed-off-by: Ingo Molnar --- kernel/sched.c | 139 +++++++++++++++++++++++---------------------------------- 1 file changed, 55 insertions(+), 84 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index e59978eead1..0dc9d5752d6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7263,48 +7263,6 @@ SD_INIT_FUNC(CPU) SD_INIT_FUNC(MC) #endif -/* - * To minimize stack usage kmalloc room for cpumasks and share the - * space as the usage in build_sched_domains() dictates. Used only - * if the amount of space is significant. 
- */ -struct allmasks { - cpumask_t tmpmask; /* make this one first */ - union { - cpumask_t nodemask; - cpumask_t this_sibling_map; - cpumask_t this_core_map; - }; - cpumask_t send_covered; - -#ifdef CONFIG_NUMA - cpumask_t domainspan; - cpumask_t covered; - cpumask_t notcovered; -#endif -}; - -#if NR_CPUS > 128 -#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v -static inline void sched_cpumask_alloc(struct allmasks **masks) -{ - *masks = kmalloc(sizeof(**masks), GFP_KERNEL); -} -static inline void sched_cpumask_free(struct allmasks *masks) -{ - kfree(masks); -} -#else -#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v -static inline void sched_cpumask_alloc(struct allmasks **masks) -{ } -static inline void sched_cpumask_free(struct allmasks *masks) -{ } -#endif - -#define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \ - ((unsigned long)(a) + offsetof(struct allmasks, v)) - static int default_relax_domain_level = -1; static int __init setup_relax_domain_level(char *str) @@ -7347,14 +7305,35 @@ static void set_domain_attribute(struct sched_domain *sd, static int __build_sched_domains(const cpumask_t *cpu_map, struct sched_domain_attr *attr) { - int i; + int i, err = -ENOMEM; struct root_domain *rd; - SCHED_CPUMASK_DECLARE(allmasks); - cpumask_t *tmpmask; + cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered, + tmpmask; #ifdef CONFIG_NUMA + cpumask_var_t domainspan, covered, notcovered; struct sched_group **sched_group_nodes = NULL; int sd_allnodes = 0; + if (!alloc_cpumask_var(&domainspan, GFP_KERNEL)) + goto out; + if (!alloc_cpumask_var(&covered, GFP_KERNEL)) + goto free_domainspan; + if (!alloc_cpumask_var(¬covered, GFP_KERNEL)) + goto free_covered; +#endif + + if (!alloc_cpumask_var(&nodemask, GFP_KERNEL)) + goto free_notcovered; + if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL)) + goto free_nodemask; + if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL)) + goto free_this_sibling_map; + if (!alloc_cpumask_var(&send_covered, GFP_KERNEL)) + goto free_this_core_map; + if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) + goto free_send_covered; + +#ifdef CONFIG_NUMA /* * Allocate the per-node list of sched groups */ @@ -7362,33 +7341,16 @@ static int __build_sched_domains(const cpumask_t *cpu_map, GFP_KERNEL); if (!sched_group_nodes) { printk(KERN_WARNING "Can not alloc sched group node list\n"); - return -ENOMEM; + goto free_tmpmask; } #endif rd = alloc_rootdomain(); if (!rd) { printk(KERN_WARNING "Cannot alloc root domain\n"); -#ifdef CONFIG_NUMA - kfree(sched_group_nodes); -#endif - return -ENOMEM; + goto free_sched_groups; } - /* get space for all scratch cpumask variables */ - sched_cpumask_alloc(&allmasks); - if (!allmasks) { - printk(KERN_WARNING "Cannot alloc cpumask array\n"); - kfree(rd); -#ifdef CONFIG_NUMA - kfree(sched_group_nodes); -#endif - return -ENOMEM; - } - - tmpmask = (cpumask_t *)allmasks; - - #ifdef CONFIG_NUMA sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; #endif @@ -7398,7 +7360,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, */ for_each_cpu(i, cpu_map) { struct sched_domain *sd = NULL, *p; - SCHED_CPUMASK_VAR(nodemask, allmasks); *nodemask = node_to_cpumask(cpu_to_node(i)); cpus_and(*nodemask, *nodemask, *cpu_map); @@ -7464,9 +7425,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #ifdef CONFIG_SCHED_SMT /* Set up CPU (sibling) groups */ for_each_cpu(i, cpu_map) { - SCHED_CPUMASK_VAR(this_sibling_map, allmasks); - SCHED_CPUMASK_VAR(send_covered, allmasks); - *this_sibling_map 
= per_cpu(cpu_sibling_map, i); cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map); if (i != first_cpu(*this_sibling_map)) @@ -7481,9 +7439,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #ifdef CONFIG_SCHED_MC /* Set up multi-core groups */ for_each_cpu(i, cpu_map) { - SCHED_CPUMASK_VAR(this_core_map, allmasks); - SCHED_CPUMASK_VAR(send_covered, allmasks); - *this_core_map = cpu_coregroup_map(i); cpus_and(*this_core_map, *this_core_map, *cpu_map); if (i != first_cpu(*this_core_map)) @@ -7497,9 +7452,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, /* Set up physical groups */ for (i = 0; i < nr_node_ids; i++) { - SCHED_CPUMASK_VAR(nodemask, allmasks); - SCHED_CPUMASK_VAR(send_covered, allmasks); - *nodemask = node_to_cpumask(i); cpus_and(*nodemask, *nodemask, *cpu_map); if (cpus_empty(*nodemask)) @@ -7513,8 +7465,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #ifdef CONFIG_NUMA /* Set up node groups */ if (sd_allnodes) { - SCHED_CPUMASK_VAR(send_covered, allmasks); - init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, send_covered, tmpmask); @@ -7523,9 +7473,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, for (i = 0; i < nr_node_ids; i++) { /* Set up node groups */ struct sched_group *sg, *prev; - SCHED_CPUMASK_VAR(nodemask, allmasks); - SCHED_CPUMASK_VAR(domainspan, allmasks); - SCHED_CPUMASK_VAR(covered, allmasks); int j; *nodemask = node_to_cpumask(i); @@ -7560,7 +7507,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, prev = sg; for (j = 0; j < nr_node_ids; j++) { - SCHED_CPUMASK_VAR(notcovered, allmasks); int n = (i + j) % nr_node_ids; node_to_cpumask_ptr(pnodemask, n); @@ -7639,15 +7585,40 @@ static int __build_sched_domains(const cpumask_t *cpu_map, cpu_attach_domain(sd, rd, i); } - sched_cpumask_free(allmasks); - return 0; + err = 0; + +free_tmpmask: + free_cpumask_var(tmpmask); +free_send_covered: + free_cpumask_var(send_covered); +free_this_core_map: + free_cpumask_var(this_core_map); +free_this_sibling_map: + free_cpumask_var(this_sibling_map); +free_nodemask: + free_cpumask_var(nodemask); +free_notcovered: +#ifdef CONFIG_NUMA + free_cpumask_var(notcovered); +free_covered: + free_cpumask_var(covered); +free_domainspan: + free_cpumask_var(domainspan); +out: +#endif + return err; + +free_sched_groups: +#ifdef CONFIG_NUMA + kfree(sched_group_nodes); +#endif + goto free_tmpmask; #ifdef CONFIG_NUMA error: free_sched_groups(cpu_map, tmpmask); - sched_cpumask_free(allmasks); kfree(rd); - return -ENOMEM; + goto free_tmpmask; #endif } -- cgit v1.2.3 From 1e5ce4f4a755ee498bd9217dae26143afa0d8f31 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:03 +1030 Subject: sched: remove any_online_cpu() Impact: use new API any_online_cpu() is a good name, but it takes a cpumask_t, not a pointer. There are several places where any_online_cpu() doesn't really want a mask arg at all. Replace all callers with cpumask_any() and cpumask_any_and(). 
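A sketch of the replacement pattern, using the masks that appear in the hunks below (illustration only, not an additional hunk):

    /* old: any_online_cpu() took a cpumask_t by value */
    dest_cpu = any_online_cpu(p->cpus_allowed);

    /* new: cpumask_any_and() takes pointers and respects nr_cpu_ids */
    dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed);

    /* where no second mask is involved, cpumask_any() is enough */
    cpu = cpumask_any(cpu_online_mask);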
Signed-off-by: Rusty Russell Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- kernel/sched.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 0dc9d5752d6..a2de33d0534 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5964,7 +5964,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) if (cpu_isset(task_cpu(p), *new_mask)) goto out; - if (migrate_task(p, any_online_cpu(*new_mask), &req)) { + if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { /* Need help from migration thread: drop lock and wait. */ task_rq_unlock(rq, &flags); wake_up_process(rq->migration_thread); @@ -6113,11 +6113,12 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) node_to_cpumask_ptr(pnodemask, cpu_to_node(dead_cpu)); cpus_and(mask, *pnodemask, p->cpus_allowed); - dest_cpu = any_online_cpu(mask); + dest_cpu = cpumask_any_and(cpu_online_mask, &mask); /* On any allowed CPU? */ if (dest_cpu >= nr_cpu_ids) - dest_cpu = any_online_cpu(p->cpus_allowed); + dest_cpu = cpumask_any_and(cpu_online_mask, + &p->cpus_allowed); /* No more Mr. Nice Guy. */ if (dest_cpu >= nr_cpu_ids) { @@ -6133,7 +6134,8 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) */ rq = task_rq_lock(p, &flags); p->cpus_allowed = cpus_allowed; - dest_cpu = any_online_cpu(p->cpus_allowed); + dest_cpu = cpumask_any_and(cpu_online_mask, + &p->cpus_allowed); task_rq_unlock(rq, &flags); /* @@ -6159,7 +6161,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) */ static void migrate_nr_uninterruptible(struct rq *rq_src) { - struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR)); + struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask)); unsigned long flags; local_irq_save(flags); @@ -6524,7 +6526,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) break; /* Unbind it from offline cpu so it can run. Fall thru. */ kthread_bind(cpu_rq(cpu)->migration_thread, - any_online_cpu(cpu_online_map)); + cpumask_any(cpu_online_mask)); kthread_stop(cpu_rq(cpu)->migration_thread); cpu_rq(cpu)->migration_thread = NULL; break; -- cgit v1.2.3 From 758b2cdc6f6a22c702bd8f2344382fb1270b2161 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:04 +1030 Subject: sched: wrap sched_group and sched_domain cpumask accesses. Impact: trivial wrap of member accesses This eases the transition in the next patch. We also get rid of a temporary cpumask in find_idlest_cpu() thanks to for_each_cpu_and, and sched_balance_self() due to getting weight before setting sd to NULL. 
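As a rough before/after sketch of the accessor style (names as in the find_idlest_cpu() hunk below; illustration only):

    /* old: direct member access plus a stack temporary */
    cpus_and(*tmp, group->cpumask, p->cpus_allowed);
    for_each_cpu_mask_nr(i, *tmp)
            load = weighted_cpuload(i);

    /* new: wrapped access; for_each_cpu_and() removes the temporary */
    for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed)
            load = weighted_cpuload(i);

Likewise, reads of sd->span go through sched_domain_span(sd), e.g. cpumask_test_cpu(cpu, sched_domain_span(sd)) in place of cpu_isset(cpu, sd->span).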
Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 114 +++++++++++++++++++++++++-------------------------- kernel/sched_fair.c | 10 ++--- kernel/sched_rt.c | 3 +- kernel/sched_stats.h | 3 +- 4 files changed, 63 insertions(+), 67 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index a2de33d0534..575f38acf4d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1501,7 +1501,7 @@ static int tg_shares_up(struct task_group *tg, void *data) struct sched_domain *sd = data; int i; - for_each_cpu_mask(i, sd->span) { + for_each_cpu(i, sched_domain_span(sd)) { /* * If there are currently no tasks on the cpu pretend there * is one of average load so that when a new task gets to @@ -1522,7 +1522,7 @@ static int tg_shares_up(struct task_group *tg, void *data) if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) shares = tg->shares; - for_each_cpu_mask(i, sd->span) + for_each_cpu(i, sched_domain_span(sd)) update_group_shares_cpu(tg, i, shares, rq_weight); return 0; @@ -2053,15 +2053,17 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) int i; /* Skip over this group if it has no CPUs allowed */ - if (!cpus_intersects(group->cpumask, p->cpus_allowed)) + if (!cpumask_intersects(sched_group_cpus(group), + &p->cpus_allowed)) continue; - local_group = cpu_isset(this_cpu, group->cpumask); + local_group = cpumask_test_cpu(this_cpu, + sched_group_cpus(group)); /* Tally up the load of all CPUs in the group */ avg_load = 0; - for_each_cpu(i, &group->cpumask) { + for_each_cpu(i, sched_group_cpus(group)) { /* Bias balancing toward cpus of our domain */ if (local_group) load = source_load(i, load_idx); @@ -2093,17 +2095,14 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) * find_idlest_cpu - find the idlest cpu among the cpus in group. 
*/ static int -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu, - cpumask_t *tmp) +find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) { unsigned long load, min_load = ULONG_MAX; int idlest = -1; int i; /* Traverse only the allowed CPUs */ - cpus_and(*tmp, group->cpumask, p->cpus_allowed); - - for_each_cpu(i, tmp) { + for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { load = weighted_cpuload(i); if (load < min_load || (load == min_load && i == this_cpu)) { @@ -2145,7 +2144,6 @@ static int sched_balance_self(int cpu, int flag) update_shares(sd); while (sd) { - cpumask_t span, tmpmask; struct sched_group *group; int new_cpu, weight; @@ -2154,14 +2152,13 @@ static int sched_balance_self(int cpu, int flag) continue; } - span = sd->span; group = find_idlest_group(sd, t, cpu); if (!group) { sd = sd->child; continue; } - new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask); + new_cpu = find_idlest_cpu(group, t, cpu); if (new_cpu == -1 || new_cpu == cpu) { /* Now try balancing at a lower domain level of cpu */ sd = sd->child; @@ -2170,10 +2167,10 @@ static int sched_balance_self(int cpu, int flag) /* Now try balancing at a lower domain level of new_cpu */ cpu = new_cpu; + weight = cpumask_weight(sched_domain_span(sd)); sd = NULL; - weight = cpus_weight(span); for_each_domain(cpu, tmp) { - if (weight <= cpus_weight(tmp->span)) + if (weight <= cpumask_weight(sched_domain_span(tmp))) break; if (tmp->flags & flag) sd = tmp; @@ -2218,7 +2215,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) cpu = task_cpu(p); for_each_domain(this_cpu, sd) { - if (cpu_isset(cpu, sd->span)) { + if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { update_shares(sd); break; } @@ -2266,7 +2263,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) else { struct sched_domain *sd; for_each_domain(this_cpu, sd) { - if (cpu_isset(cpu, sd->span)) { + if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { schedstat_inc(sd, ttwu_wake_remote); break; } @@ -3109,10 +3106,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long sum_avg_load_per_task; unsigned long avg_load_per_task; - local_group = cpu_isset(this_cpu, group->cpumask); + local_group = cpumask_test_cpu(this_cpu, + sched_group_cpus(group)); if (local_group) - balance_cpu = first_cpu(group->cpumask); + balance_cpu = cpumask_first(sched_group_cpus(group)); /* Tally up the load of all CPUs in the group */ sum_weighted_load = sum_nr_running = avg_load = 0; @@ -3121,13 +3119,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, max_cpu_load = 0; min_cpu_load = ~0UL; - for_each_cpu(i, &group->cpumask) { - struct rq *rq; - - if (!cpu_isset(i, *cpus)) - continue; - - rq = cpu_rq(i); + for_each_cpu_and(i, sched_group_cpus(group), cpus) { + struct rq *rq = cpu_rq(i); if (*sd_idle && rq->nr_running) *sd_idle = 0; @@ -3238,8 +3231,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, */ if ((sum_nr_running < min_nr_running) || (sum_nr_running == min_nr_running && - first_cpu(group->cpumask) < - first_cpu(group_min->cpumask))) { + cpumask_first(sched_group_cpus(group)) < + cpumask_first(sched_group_cpus(group_min)))) { group_min = group; min_nr_running = sum_nr_running; min_load_per_task = sum_weighted_load / @@ -3254,8 +3247,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (sum_nr_running <= group_capacity - 1) { if (sum_nr_running > leader_nr_running || (sum_nr_running == leader_nr_running && - 
first_cpu(group->cpumask) > - first_cpu(group_leader->cpumask))) { + cpumask_first(sched_group_cpus(group)) > + cpumask_first(sched_group_cpus(group_leader)))) { group_leader = group; leader_nr_running = sum_nr_running; } @@ -3400,7 +3393,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, unsigned long max_load = 0; int i; - for_each_cpu(i, &group->cpumask) { + for_each_cpu(i, sched_group_cpus(group)) { unsigned long wl; if (!cpu_isset(i, *cpus)) @@ -3746,7 +3739,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) /* Search for an sd spanning us and the target CPU. */ for_each_domain(target_cpu, sd) { if ((sd->flags & SD_LOAD_BALANCE) && - cpu_isset(busiest_cpu, sd->span)) + cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) break; } @@ -6618,7 +6611,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, struct sched_group *group = sd->groups; char str[256]; - cpulist_scnprintf(str, sizeof(str), sd->span); + cpulist_scnprintf(str, sizeof(str), *sched_domain_span(sd)); cpus_clear(*groupmask); printk(KERN_DEBUG "%*s domain %d: ", level, "", level); @@ -6633,11 +6626,11 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, printk(KERN_CONT "span %s level %s\n", str, sd->name); - if (!cpu_isset(cpu, sd->span)) { + if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { printk(KERN_ERR "ERROR: domain->span does not contain " "CPU%d\n", cpu); } - if (!cpu_isset(cpu, group->cpumask)) { + if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { printk(KERN_ERR "ERROR: domain->groups does not contain" " CPU%d\n", cpu); } @@ -6657,31 +6650,32 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - if (!cpus_weight(group->cpumask)) { + if (!cpumask_weight(sched_group_cpus(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: empty group\n"); break; } - if (cpus_intersects(*groupmask, group->cpumask)) { + if (cpumask_intersects(groupmask, sched_group_cpus(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: repeated CPUs\n"); break; } - cpus_or(*groupmask, *groupmask, group->cpumask); + cpumask_or(groupmask, groupmask, sched_group_cpus(group)); - cpulist_scnprintf(str, sizeof(str), group->cpumask); + cpulist_scnprintf(str, sizeof(str), *sched_group_cpus(group)); printk(KERN_CONT " %s", str); group = group->next; } while (group != sd->groups); printk(KERN_CONT "\n"); - if (!cpus_equal(sd->span, *groupmask)) + if (!cpumask_equal(sched_domain_span(sd), groupmask)) printk(KERN_ERR "ERROR: groups don't span domain->span\n"); - if (sd->parent && !cpus_subset(*groupmask, sd->parent->span)) + if (sd->parent && + !cpumask_subset(groupmask, sched_domain_span(sd->parent))) printk(KERN_ERR "ERROR: parent span is not a superset " "of domain->span\n"); return 0; @@ -6721,7 +6715,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) static int sd_degenerate(struct sched_domain *sd) { - if (cpus_weight(sd->span) == 1) + if (cpumask_weight(sched_domain_span(sd)) == 1) return 1; /* Following flags need at least 2 groups */ @@ -6752,7 +6746,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) if (sd_degenerate(parent)) return 1; - if (!cpus_equal(sd->span, parent->span)) + if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) return 0; /* Does parent contain flags not in child? 
*/ @@ -6913,10 +6907,10 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, int group = group_fn(i, cpu_map, &sg, tmpmask); int j; - if (cpu_isset(i, *covered)) + if (cpumask_test_cpu(i, covered)) continue; - cpus_clear(sg->cpumask); + cpumask_clear(sched_group_cpus(sg)); sg->__cpu_power = 0; for_each_cpu(j, span) { @@ -6924,7 +6918,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, continue; cpu_set(j, *covered); - cpu_set(j, sg->cpumask); + cpumask_set_cpu(j, sched_group_cpus(sg)); } if (!first) first = sg; @@ -7119,11 +7113,11 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) if (!sg) return; do { - for_each_cpu(j, &sg->cpumask) { + for_each_cpu(j, sched_group_cpus(sg)) { struct sched_domain *sd; sd = &per_cpu(phys_domains, j); - if (j != first_cpu(sd->groups->cpumask)) { + if (j != cpumask_first(sched_group_cpus(sd->groups))) { /* * Only add "power" once for each * physical package. @@ -7200,7 +7194,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) WARN_ON(!sd || !sd->groups); - if (cpu != first_cpu(sd->groups->cpumask)) + if (cpu != cpumask_first(sched_group_cpus(sd->groups))) return; child = sd->child; @@ -7372,7 +7366,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, sd = &per_cpu(allnodes_domains, i); SD_INIT(sd, ALLNODES); set_domain_attribute(sd, attr); - sd->span = *cpu_map; + cpumask_copy(sched_domain_span(sd), cpu_map); cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); p = sd; sd_allnodes = 1; @@ -7382,18 +7376,19 @@ static int __build_sched_domains(const cpumask_t *cpu_map, sd = &per_cpu(node_domains, i); SD_INIT(sd, NODE); set_domain_attribute(sd, attr); - sched_domain_node_span(cpu_to_node(i), &sd->span); + sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); sd->parent = p; if (p) p->child = sd; - cpus_and(sd->span, sd->span, *cpu_map); + cpumask_and(sched_domain_span(sd), + sched_domain_span(sd), cpu_map); #endif p = sd; sd = &per_cpu(phys_domains, i); SD_INIT(sd, CPU); set_domain_attribute(sd, attr); - sd->span = *nodemask; + cpumask_copy(sched_domain_span(sd), nodemask); sd->parent = p; if (p) p->child = sd; @@ -7404,8 +7399,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map, sd = &per_cpu(core_domains, i); SD_INIT(sd, MC); set_domain_attribute(sd, attr); - sd->span = cpu_coregroup_map(i); - cpus_and(sd->span, sd->span, *cpu_map); + *sched_domain_span(sd) = cpu_coregroup_map(i); + cpumask_and(sched_domain_span(sd), + sched_domain_span(sd), cpu_map); sd->parent = p; p->child = sd; cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); @@ -7416,8 +7412,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map, sd = &per_cpu(cpu_domains, i); SD_INIT(sd, SIBLING); set_domain_attribute(sd, attr); - sd->span = per_cpu(cpu_sibling_map, i); - cpus_and(sd->span, sd->span, *cpu_map); + cpumask_and(sched_domain_span(sd), + &per_cpu(cpu_sibling_map, i), cpu_map); sd->parent = p; p->child = sd; cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); @@ -7503,7 +7499,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, sd->groups = sg; } sg->__cpu_power = 0; - sg->cpumask = *nodemask; + cpumask_copy(sched_group_cpus(sg), nodemask); sg->next = sg; cpus_or(*covered, *covered, *nodemask); prev = sg; @@ -7530,7 +7526,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, goto error; } sg->__cpu_power = 0; - sg->cpumask = *tmpmask; + cpumask_copy(sched_group_cpus(sg), tmpmask); sg->next = prev->next; cpus_or(*covered, *covered, 
*tmpmask); prev->next = sg; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 98345e45b05..bba00402ed9 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1024,7 +1024,6 @@ static void yield_task_fair(struct rq *rq) #if defined(ARCH_HAS_SCHED_WAKE_IDLE) static int wake_idle(int cpu, struct task_struct *p) { - cpumask_t tmp; struct sched_domain *sd; int i; @@ -1044,10 +1043,9 @@ static int wake_idle(int cpu, struct task_struct *p) if ((sd->flags & SD_WAKE_IDLE) || ((sd->flags & SD_WAKE_IDLE_FAR) && !task_hot(p, task_rq(p)->clock, sd))) { - cpus_and(tmp, sd->span, p->cpus_allowed); - cpus_and(tmp, tmp, cpu_active_map); - for_each_cpu_mask_nr(i, tmp) { - if (idle_cpu(i)) { + for_each_cpu_and(i, sched_domain_span(sd), + &p->cpus_allowed) { + if (cpu_active(i) && idle_cpu(i)) { if (i != task_cpu(p)) { schedstat_inc(p, se.nr_wakeups_idle); @@ -1240,7 +1238,7 @@ static int select_task_rq_fair(struct task_struct *p, int sync) * this_cpu and prev_cpu are present in: */ for_each_domain(this_cpu, sd) { - if (cpu_isset(prev_cpu, sd->span)) { + if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) { this_sd = sd; break; } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 2bdd4442359..4cd813abc23 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1017,7 +1017,8 @@ static int find_lowest_rq(struct task_struct *task) cpumask_t domain_mask; int best_cpu; - cpus_and(domain_mask, sd->span, *lowest_mask); + cpumask_and(&domain_mask, sched_domain_span(sd), + lowest_mask); best_cpu = pick_optimal_cpu(this_cpu, &domain_mask); diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 7dbf72a2b02..ce340835d05 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -42,7 +42,8 @@ static int show_schedstat(struct seq_file *seq, void *v) for_each_domain(cpu, sd) { enum cpu_idle_type itype; - cpumask_scnprintf(mask_str, mask_len, sd->span); + cpumask_scnprintf(mask_str, mask_len, + *sched_domain_span(sd)); seq_printf(seq, "domain%d %s", dcount++, mask_str); for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; itype++) { -- cgit v1.2.3 From 6c99e9ad47d9c082bd096f42fb49e397b05d58a8 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:04 +1030 Subject: sched: convert struct sched_group/sched_domain cpumask_ts to variable bitmaps Impact: (future) size reduction for large NR_CPUS. We move the 'cpumask' member of sched_group to the end, so when we kmalloc it we can do a minimal allocation: saves space for small nr_cpu_ids but big CONFIG_NR_CPUS. Similar trick for 'span' in sched_domain. This isn't quite as good as converting to a cpumask_var_t, as some sched_groups are actually static, but it's safer: we don't have to figure out where to call alloc_cpumask_var/free_cpumask_var. Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 65 ++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 41 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 575f38acf4d..6b9606a6cab 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7005,19 +7005,34 @@ static void sched_domain_node_span(int node, cpumask_t *span) int sched_smt_power_savings = 0, sched_mc_power_savings = 0; +/* + * The cpus mask in sched_group and sched_domain hangs off the end. + * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space + * for nr_cpu_ids < CONFIG_NR_CPUS. 
+ */ +struct static_sched_group { + struct sched_group sg; + DECLARE_BITMAP(cpus, CONFIG_NR_CPUS); +}; + +struct static_sched_domain { + struct sched_domain sd; + DECLARE_BITMAP(span, CONFIG_NR_CPUS); +}; + /* * SMT sched-domains: */ #ifdef CONFIG_SCHED_SMT -static DEFINE_PER_CPU(struct sched_domain, cpu_domains); -static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); +static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); +static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus); static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, cpumask_t *unused) { if (sg) - *sg = &per_cpu(sched_group_cpus, cpu); + *sg = &per_cpu(sched_group_cpus, cpu).sg; return cpu; } #endif /* CONFIG_SCHED_SMT */ @@ -7026,8 +7041,8 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, * multi-core sched-domains: */ #ifdef CONFIG_SCHED_MC -static DEFINE_PER_CPU(struct sched_domain, core_domains); -static DEFINE_PER_CPU(struct sched_group, sched_group_core); +static DEFINE_PER_CPU(struct static_sched_domain, core_domains); +static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); #endif /* CONFIG_SCHED_MC */ #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) @@ -7041,7 +7056,7 @@ cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, cpus_and(*mask, *mask, *cpu_map); group = first_cpu(*mask); if (sg) - *sg = &per_cpu(sched_group_core, group); + *sg = &per_cpu(sched_group_core, group).sg; return group; } #elif defined(CONFIG_SCHED_MC) @@ -7050,13 +7065,13 @@ cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, cpumask_t *unused) { if (sg) - *sg = &per_cpu(sched_group_core, cpu); + *sg = &per_cpu(sched_group_core, cpu).sg; return cpu; } #endif -static DEFINE_PER_CPU(struct sched_domain, phys_domains); -static DEFINE_PER_CPU(struct sched_group, sched_group_phys); +static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); +static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, @@ -7075,7 +7090,7 @@ cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, group = cpu; #endif if (sg) - *sg = &per_cpu(sched_group_phys, group); + *sg = &per_cpu(sched_group_phys, group).sg; return group; } @@ -7089,7 +7104,7 @@ static DEFINE_PER_CPU(struct sched_domain, node_domains); static struct sched_group ***sched_group_nodes_bycpu; static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); -static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); +static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, cpumask_t *nodemask) @@ -7101,7 +7116,7 @@ static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, group = first_cpu(*nodemask); if (sg) - *sg = &per_cpu(sched_group_allnodes, group); + *sg = &per_cpu(sched_group_allnodes, group).sg; return group; } @@ -7116,7 +7131,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) for_each_cpu(j, sched_group_cpus(sg)) { struct sched_domain *sd; - sd = &per_cpu(phys_domains, j); + sd = &per_cpu(phys_domains, j).sd; if (j != cpumask_first(sched_group_cpus(sd->groups))) { /* * Only add "power" once for each @@ -7385,7 +7400,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #endif p = sd; - sd = &per_cpu(phys_domains, i); + sd = &per_cpu(phys_domains, i).sd; 
SD_INIT(sd, CPU); set_domain_attribute(sd, attr); cpumask_copy(sched_domain_span(sd), nodemask); @@ -7396,7 +7411,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #ifdef CONFIG_SCHED_MC p = sd; - sd = &per_cpu(core_domains, i); + sd = &per_cpu(core_domains, i).sd; SD_INIT(sd, MC); set_domain_attribute(sd, attr); *sched_domain_span(sd) = cpu_coregroup_map(i); @@ -7409,7 +7424,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #ifdef CONFIG_SCHED_SMT p = sd; - sd = &per_cpu(cpu_domains, i); + sd = &per_cpu(cpu_domains, i).sd; SD_INIT(sd, SIBLING); set_domain_attribute(sd, attr); cpumask_and(sched_domain_span(sd), @@ -7485,7 +7500,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map, sched_domain_node_span(i, domainspan); cpus_and(*domainspan, *domainspan, *cpu_map); - sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); + sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, i); if (!sg) { printk(KERN_WARNING "Can not alloc domain group for " "node %d\n", i); @@ -7518,7 +7534,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map, if (cpus_empty(*tmpmask)) continue; - sg = kmalloc_node(sizeof(struct sched_group), + sg = kmalloc_node(sizeof(struct sched_group) + + cpumask_size(), GFP_KERNEL, i); if (!sg) { printk(KERN_WARNING @@ -7538,21 +7555,21 @@ static int __build_sched_domains(const cpumask_t *cpu_map, /* Calculate CPU power for physical packages and nodes */ #ifdef CONFIG_SCHED_SMT for_each_cpu(i, cpu_map) { - struct sched_domain *sd = &per_cpu(cpu_domains, i); + struct sched_domain *sd = &per_cpu(cpu_domains, i).sd; init_sched_groups_power(i, sd); } #endif #ifdef CONFIG_SCHED_MC for_each_cpu(i, cpu_map) { - struct sched_domain *sd = &per_cpu(core_domains, i); + struct sched_domain *sd = &per_cpu(core_domains, i).sd; init_sched_groups_power(i, sd); } #endif for_each_cpu(i, cpu_map) { - struct sched_domain *sd = &per_cpu(phys_domains, i); + struct sched_domain *sd = &per_cpu(phys_domains, i).sd; init_sched_groups_power(i, sd); } @@ -7574,11 +7591,11 @@ static int __build_sched_domains(const cpumask_t *cpu_map, for_each_cpu(i, cpu_map) { struct sched_domain *sd; #ifdef CONFIG_SCHED_SMT - sd = &per_cpu(cpu_domains, i); + sd = &per_cpu(cpu_domains, i).sd; #elif defined(CONFIG_SCHED_MC) - sd = &per_cpu(core_domains, i); + sd = &per_cpu(core_domains, i).sd; #else - sd = &per_cpu(phys_domains, i); + sd = &per_cpu(phys_domains, i).sd; #endif cpu_attach_domain(sd, rd, i); } -- cgit v1.2.3 From 6a7b3dc3440f7b5a9b67594af01ed562cdeb41e4 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:04 +1030 Subject: sched: convert nohz_cpu_mask to cpumask_var_t. Impact: (future) size reduction for large NR_CPUS. Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves space for small nr_cpu_ids but big CONFIG_NR_CPUS. cpumask_var_t is just a struct cpumask for !CONFIG_CPUMASK_OFFSTACK. Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 2 +- kernel/sched.c | 7 +++++-- kernel/time/tick-sched.c | 10 +++++----- 3 files changed, 11 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index e503a002f33..c03ca3e6191 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -393,7 +393,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp) * unnecessarily. 
*/ smp_mb(); - cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask); + cpumask_andnot(&rcp->cpumask, cpu_online_mask, nohz_cpu_mask); rcp->signaled = 0; } diff --git a/kernel/sched.c b/kernel/sched.c index 6b9606a6cab..2723d7a4a42 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5870,9 +5870,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) * indicates which cpus entered this state. This is used * in the rcu update to wait only for active cpus. For system * which do not switch off the HZ timer nohz_cpu_mask should - * always be CPU_MASK_NONE. + * always be CPU_BITS_NONE. */ -cpumask_t nohz_cpu_mask = CPU_MASK_NONE; +cpumask_var_t nohz_cpu_mask; /* * Increase the granularity value when there are more CPUs, @@ -8274,6 +8274,9 @@ void __init sched_init(void) */ current->sched_class = &fair_sched_class; + /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ + alloc_bootmem_cpumask_var(&nohz_cpu_mask); + scheduler_running = 1; } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 342fc9ccab4..70f872c71f4 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -144,7 +144,7 @@ void tick_nohz_update_jiffies(void) if (!ts->tick_stopped) return; - cpu_clear(cpu, nohz_cpu_mask); + cpumask_clear_cpu(cpu, nohz_cpu_mask); now = ktime_get(); ts->idle_waketime = now; @@ -283,7 +283,7 @@ void tick_nohz_stop_sched_tick(int inidle) if ((long)delta_jiffies >= 1) { if (delta_jiffies > 1) - cpu_set(cpu, nohz_cpu_mask); + cpumask_set_cpu(cpu, nohz_cpu_mask); /* * nohz_stop_sched_tick can be called several times before * the nohz_restart_sched_tick is called. This happens when @@ -296,7 +296,7 @@ void tick_nohz_stop_sched_tick(int inidle) /* * sched tick not stopped! */ - cpu_clear(cpu, nohz_cpu_mask); + cpumask_clear_cpu(cpu, nohz_cpu_mask); goto out; } @@ -354,7 +354,7 @@ void tick_nohz_stop_sched_tick(int inidle) * softirq. */ tick_do_update_jiffies64(ktime_get()); - cpu_clear(cpu, nohz_cpu_mask); + cpumask_clear_cpu(cpu, nohz_cpu_mask); } raise_softirq_irqoff(TIMER_SOFTIRQ); out: @@ -432,7 +432,7 @@ void tick_nohz_restart_sched_tick(void) select_nohz_load_balancer(0); now = ktime_get(); tick_do_update_jiffies64(now); - cpu_clear(cpu, nohz_cpu_mask); + cpumask_clear_cpu(cpu, nohz_cpu_mask); /* * We stopped the tick in idle. Update process times would miss the -- cgit v1.2.3 From c6c4927b22a3514c6660f0e72c78716226bd3cc8 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:05 +1030 Subject: sched: convert struct root_domain to cpumask_var_t. Impact: (future) size reduction for large NR_CPUS. Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves space for small nr_cpu_ids but big CONFIG_NR_CPUS. cpumask_var_t is just a struct cpumask for !CONFIG_CPUMASK_OFFSTACK. def_root_domain is static, and so its masks are initialized with alloc_bootmem_cpumask_var. After that, alloc_cpumask_var is used. Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 69 ++++++++++++++++++++++++++++++++++++++++--------------- kernel/sched_rt.c | 26 ++++++++++----------- 2 files changed, 64 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 2723d7a4a42..93309c3034d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -487,14 +487,14 @@ struct rt_rq { */ struct root_domain { atomic_t refcount; - cpumask_t span; - cpumask_t online; + cpumask_var_t span; + cpumask_var_t online; /* * The "RT overload" flag: it gets set if a CPU has more than * one runnable RT task. 
*/ - cpumask_t rto_mask; + cpumask_var_t rto_mask; atomic_t rto_count; #ifdef CONFIG_SMP struct cpupri cpupri; @@ -6444,7 +6444,7 @@ static void set_rq_online(struct rq *rq) if (!rq->online) { const struct sched_class *class; - cpu_set(rq->cpu, rq->rd->online); + cpumask_set_cpu(rq->cpu, rq->rd->online); rq->online = 1; for_each_class(class) { @@ -6464,7 +6464,7 @@ static void set_rq_offline(struct rq *rq) class->rq_offline(rq); } - cpu_clear(rq->cpu, rq->rd->online); + cpumask_clear_cpu(rq->cpu, rq->rd->online); rq->online = 0; } } @@ -6505,7 +6505,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) rq = cpu_rq(cpu); spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { - BUG_ON(!cpu_isset(cpu, rq->rd->span)); + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_online(rq); } @@ -6567,7 +6567,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) rq = cpu_rq(cpu); spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { - BUG_ON(!cpu_isset(cpu, rq->rd->span)); + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } spin_unlock_irqrestore(&rq->lock, flags); @@ -6768,6 +6768,14 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) return 1; } +static void free_rootdomain(struct root_domain *rd) +{ + free_cpumask_var(rd->rto_mask); + free_cpumask_var(rd->online); + free_cpumask_var(rd->span); + kfree(rd); +} + static void rq_attach_root(struct rq *rq, struct root_domain *rd) { unsigned long flags; @@ -6777,38 +6785,60 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) if (rq->rd) { struct root_domain *old_rd = rq->rd; - if (cpu_isset(rq->cpu, old_rd->online)) + if (cpumask_test_cpu(rq->cpu, old_rd->online)) set_rq_offline(rq); - cpu_clear(rq->cpu, old_rd->span); + cpumask_clear_cpu(rq->cpu, old_rd->span); if (atomic_dec_and_test(&old_rd->refcount)) - kfree(old_rd); + free_rootdomain(old_rd); } atomic_inc(&rd->refcount); rq->rd = rd; - cpu_set(rq->cpu, rd->span); - if (cpu_isset(rq->cpu, cpu_online_map)) + cpumask_set_cpu(rq->cpu, rd->span); + if (cpumask_test_cpu(rq->cpu, cpu_online_mask)) set_rq_online(rq); spin_unlock_irqrestore(&rq->lock, flags); } -static void init_rootdomain(struct root_domain *rd) +static int init_rootdomain(struct root_domain *rd, bool bootmem) { memset(rd, 0, sizeof(*rd)); - cpus_clear(rd->span); - cpus_clear(rd->online); + if (bootmem) { + alloc_bootmem_cpumask_var(&def_root_domain.span); + alloc_bootmem_cpumask_var(&def_root_domain.online); + alloc_bootmem_cpumask_var(&def_root_domain.rto_mask); + cpupri_init(&rd->cpupri); + return 0; + } + + if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) + goto free_rd; + if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) + goto free_span; + if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) + goto free_online; cpupri_init(&rd->cpupri); + return 0; + +free_online: + free_cpumask_var(rd->online); +free_span: + free_cpumask_var(rd->span); +free_rd: + kfree(rd); + return -ENOMEM; } static void init_defrootdomain(void) { - init_rootdomain(&def_root_domain); + init_rootdomain(&def_root_domain, true); + atomic_set(&def_root_domain.refcount, 1); } @@ -6820,7 +6850,10 @@ static struct root_domain *alloc_rootdomain(void) if (!rd) return NULL; - init_rootdomain(rd); + if (init_rootdomain(rd, false) != 0) { + kfree(rd); + return NULL; + } return rd; } @@ -7632,7 +7665,7 @@ free_sched_groups: #ifdef CONFIG_NUMA error: free_sched_groups(cpu_map, tmpmask); - kfree(rd); + free_rootdomain(rd); goto free_tmpmask; #endif } diff --git 
a/kernel/sched_rt.c b/kernel/sched_rt.c index 4cd813abc23..820fc422c6d 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -15,7 +15,7 @@ static inline void rt_set_overload(struct rq *rq) if (!rq->online) return; - cpu_set(rq->cpu, rq->rd->rto_mask); + cpumask_set_cpu(rq->cpu, rq->rd->rto_mask); /* * Make sure the mask is visible before we set * the overload count. That is checked to determine @@ -34,7 +34,7 @@ static inline void rt_clear_overload(struct rq *rq) /* the order here really doesn't matter */ atomic_dec(&rq->rd->rto_count); - cpu_clear(rq->cpu, rq->rd->rto_mask); + cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask); } static void update_rt_migration(struct rq *rq) @@ -139,14 +139,14 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se) } #ifdef CONFIG_SMP -static inline cpumask_t sched_rt_period_mask(void) +static inline const struct cpumask *sched_rt_period_mask(void) { return cpu_rq(smp_processor_id())->rd->span; } #else -static inline cpumask_t sched_rt_period_mask(void) +static inline const struct cpumask *sched_rt_period_mask(void) { - return cpu_online_map; + return cpu_online_mask; } #endif @@ -212,9 +212,9 @@ static inline int rt_rq_throttled(struct rt_rq *rt_rq) return rt_rq->rt_throttled; } -static inline cpumask_t sched_rt_period_mask(void) +static inline const struct cpumask *sched_rt_period_mask(void) { - return cpu_online_map; + return cpu_online_mask; } static inline @@ -241,11 +241,11 @@ static int do_balance_runtime(struct rt_rq *rt_rq) int i, weight, more = 0; u64 rt_period; - weight = cpus_weight(rd->span); + weight = cpumask_weight(rd->span); spin_lock(&rt_b->rt_runtime_lock); rt_period = ktime_to_ns(rt_b->rt_period); - for_each_cpu_mask_nr(i, rd->span) { + for_each_cpu(i, rd->span) { struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); s64 diff; @@ -324,7 +324,7 @@ static void __disable_runtime(struct rq *rq) /* * Greedy reclaim, take back as much as we can. */ - for_each_cpu_mask(i, rd->span) { + for_each_cpu(i, rd->span) { struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); s64 diff; @@ -429,13 +429,13 @@ static inline int balance_runtime(struct rt_rq *rt_rq) static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) { int i, idle = 1; - cpumask_t span; + const struct cpumask *span; if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) return 1; span = sched_rt_period_mask(); - for_each_cpu_mask(i, span) { + for_each_cpu(i, span) { int enqueue = 0; struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); struct rq *rq = rq_of_rt_rq(rt_rq); @@ -1181,7 +1181,7 @@ static int pull_rt_task(struct rq *this_rq) next = pick_next_task_rt(this_rq); - for_each_cpu_mask_nr(cpu, this_rq->rd->rto_mask) { + for_each_cpu(cpu, this_rq->rd->rto_mask) { if (this_cpu == cpu) continue; -- cgit v1.2.3 From 7d1e6a9b95e3edeac91888bc683ae62f18519432 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:09 +1030 Subject: sched: convert nohz struct to cpumask_var_t. Impact: (future) size reduction for large NR_CPUS. Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves space for small nr_cpu_ids but big CONFIG_NR_CPUS. cpumask_var_t is just a struct cpumask for !CONFIG_CPUMASK_OFFSTACK. 
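For reference, a minimal sketch of the cpumask_var_t life cycle these conversions rely on (the variable name is a placeholder):

    cpumask_var_t mask;

    /* static objects (e.g. def_root_domain) use the boot-time variant: */
    alloc_bootmem_cpumask_var(&mask);

    /* everything else allocates and frees at runtime; for
     * !CONFIG_CPUMASK_OFFSTACK the "allocation" is a no-op that always
     * succeeds, since cpumask_var_t is then just a struct cpumask. */
    if (!alloc_cpumask_var(&mask, GFP_KERNEL))
            return -ENOMEM;
    cpumask_set_cpu(cpu, mask);
    free_cpumask_var(mask);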
Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 93309c3034d..2f8ea99df16 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3758,10 +3758,9 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) #ifdef CONFIG_NO_HZ static struct { atomic_t load_balancer; - cpumask_t cpu_mask; + cpumask_var_t cpu_mask; } nohz ____cacheline_aligned = { .load_balancer = ATOMIC_INIT(-1), - .cpu_mask = CPU_MASK_NONE, }; /* @@ -3789,7 +3788,7 @@ int select_nohz_load_balancer(int stop_tick) int cpu = smp_processor_id(); if (stop_tick) { - cpu_set(cpu, nohz.cpu_mask); + cpumask_set_cpu(cpu, nohz.cpu_mask); cpu_rq(cpu)->in_nohz_recently = 1; /* @@ -3803,7 +3802,7 @@ int select_nohz_load_balancer(int stop_tick) } /* time for ilb owner also to sleep */ - if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) { + if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { if (atomic_read(&nohz.load_balancer) == cpu) atomic_set(&nohz.load_balancer, -1); return 0; @@ -3816,10 +3815,10 @@ int select_nohz_load_balancer(int stop_tick) } else if (atomic_read(&nohz.load_balancer) == cpu) return 1; } else { - if (!cpu_isset(cpu, nohz.cpu_mask)) + if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) return 0; - cpu_clear(cpu, nohz.cpu_mask); + cpumask_clear_cpu(cpu, nohz.cpu_mask); if (atomic_read(&nohz.load_balancer) == cpu) if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) @@ -3930,12 +3929,13 @@ static void run_rebalance_domains(struct softirq_action *h) */ if (this_rq->idle_at_tick && atomic_read(&nohz.load_balancer) == this_cpu) { - cpumask_t cpus = nohz.cpu_mask; struct rq *rq; int balance_cpu; - cpu_clear(this_cpu, cpus); - for_each_cpu(balance_cpu, &cpus) { + for_each_cpu(balance_cpu, nohz.cpu_mask) { + if (balance_cpu == this_cpu) + continue; + /* * If this cpu gets work to do, stop the load balancing * work being done for other cpus. Next load @@ -3973,7 +3973,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) rq->in_nohz_recently = 0; if (atomic_read(&nohz.load_balancer) == cpu) { - cpu_clear(cpu, nohz.cpu_mask); + cpumask_clear_cpu(cpu, nohz.cpu_mask); atomic_set(&nohz.load_balancer, -1); } @@ -3986,7 +3986,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) * TBD: Traverse the sched domains and nominate * the nearest cpu in the nohz.cpu_mask. */ - int ilb = first_cpu(nohz.cpu_mask); + int ilb = cpumask_first(nohz.cpu_mask); if (ilb < nr_cpu_ids) resched_cpu(ilb); @@ -3998,7 +3998,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) * cpus with ticks stopped, is it time for that to stop? 
*/ if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && - cpus_weight(nohz.cpu_mask) == num_online_cpus()) { + cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { resched_cpu(cpu); return; } @@ -4008,7 +4008,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) * someone else, then no need raise the SCHED_SOFTIRQ */ if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && - cpu_isset(cpu, nohz.cpu_mask)) + cpumask_test_cpu(cpu, nohz.cpu_mask)) return; #endif if (time_after_eq(jiffies, rq->next_balance)) @@ -8309,6 +8309,9 @@ void __init sched_init(void) /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ alloc_bootmem_cpumask_var(&nohz_cpu_mask); +#ifdef CONFIG_NO_HZ + alloc_bootmem_cpumask_var(&nohz.cpu_mask); +#endif scheduler_running = 1; } -- cgit v1.2.3 From 4d2732c63e0c05cfef2a74868d08eace922dfc3e Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:10 +1030 Subject: sched: convert idle_balance() to cpumask_var_t. Impact: stack usage reduction Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves space in the stack. cpumask_var_t is just a struct cpumask for !CONFIG_CPUMASK_OFFSTACK. Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 2f8ea99df16..154a95fcea7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3676,7 +3676,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq) struct sched_domain *sd; int pulled_task = -1; unsigned long next_balance = jiffies + HZ; - cpumask_t tmpmask; + cpumask_var_t tmpmask; + + if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC)) + return; for_each_domain(this_cpu, sd) { unsigned long interval; @@ -3687,7 +3690,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) if (sd->flags & SD_BALANCE_NEWIDLE) /* If we've pulled tasks over stop searching: */ pulled_task = load_balance_newidle(this_cpu, this_rq, - sd, &tmpmask); + sd, tmpmask); interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) @@ -3702,6 +3705,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) */ this_rq->next_balance = next_balance; } + free_cpumask_var(tmpmask); } /* -- cgit v1.2.3 From a0e902452da16b79d7c9230630ed8a595d14fa85 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:11 +1030 Subject: sched: convert rebalance_domains() to cpumask_var_t. Impact: stack usage reduction Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves space in the stack. cpumask_var_t is just a struct cpumask for !CONFIG_CPUMASK_OFFSTACK. Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 154a95fcea7..67383e7f1cc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3850,7 +3850,11 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; int need_serialize; - cpumask_t tmp; + cpumask_var_t tmp; + + /* Fails alloc? Rebalancing probably not a priority right now. 
*/ + if (!alloc_cpumask_var(&tmp, GFP_ATOMIC)) + return; for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) @@ -3875,7 +3879,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) } if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) { + if (load_balance(cpu, rq, sd, idle, &balance, tmp)) { /* * We've pulled tasks over so either we're no * longer idle, or one of our SMT siblings is @@ -3909,6 +3913,8 @@ out: */ if (likely(update_next_balance)) rq->next_balance = next_balance; + + free_cpumask_var(tmp); } /* -- cgit v1.2.3 From f17c860760927c2a8e41a021eab3317e4415e962 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:11 +1030 Subject: sched: convert sys_sched_getaffinity() to cpumask_var_t. Impact: stack usage reduction Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves space in the stack. cpumask_var_t is just a struct cpumask for !CONFIG_CPUMASK_OFFSTACK. Some jiggling here to make sure we always exit at the bottom (so we hit the free_cpumask_var there). Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 67383e7f1cc..6deff24349b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5499,19 +5499,24 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, unsigned long __user *user_mask_ptr) { int ret; - cpumask_t mask; + cpumask_var_t mask; - if (len < sizeof(cpumask_t)) + if (len < cpumask_size()) return -EINVAL; - ret = sched_getaffinity(pid, &mask); - if (ret < 0) - return ret; + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; - if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) - return -EFAULT; + ret = sched_getaffinity(pid, mask); + if (ret == 0) { + if (copy_to_user(user_mask_ptr, mask, cpumask_size())) + ret = -EFAULT; + else + ret = cpumask_size(); + } + free_cpumask_var(mask); - return sizeof(cpumask_t); + return ret; } /** -- cgit v1.2.3 From e76bd8d9850c2296a7e8e24c9dce9b5e6b55fe2f Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:11 +1030 Subject: sched: avoid stack var in move_task_off_dead_cpu Impact: stack usage reduction With some care, we can avoid needing a temporary cpumask (we can't really allocate here, since we can't fail). This version calls cpuset_cpus_allowed_locked() with the task_rq_lock held. I'm fairly sure this works, but there might be a deadlock hiding. And of course, we can't get rid of the last cpumask on stack until we can use cpumask_of_node instead of node_to_cpumask. Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 78 +++++++++++++++++++++++++++------------------------------- 1 file changed, 36 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6deff24349b..f7dee2029e4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6112,52 +6112,46 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) { unsigned long flags; - cpumask_t mask; struct rq *rq; int dest_cpu; + /* FIXME: Use cpumask_of_node here. */ + cpumask_t _nodemask = node_to_cpumask(cpu_to_node(dead_cpu)); + const struct cpumask *nodemask = &_nodemask; + +again: + /* Look for allowed, online CPU in same node. 
*/ + for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask) + if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) + goto move; + + /* Any allowed, online CPU? */ + dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask); + if (dest_cpu < nr_cpu_ids) + goto move; + + /* No more Mr. Nice Guy. */ + if (dest_cpu >= nr_cpu_ids) { + rq = task_rq_lock(p, &flags); + cpuset_cpus_allowed_locked(p, &p->cpus_allowed); + dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed); + task_rq_unlock(rq, &flags); - do { - /* On same node? */ - node_to_cpumask_ptr(pnodemask, cpu_to_node(dead_cpu)); - - cpus_and(mask, *pnodemask, p->cpus_allowed); - dest_cpu = cpumask_any_and(cpu_online_mask, &mask); - - /* On any allowed CPU? */ - if (dest_cpu >= nr_cpu_ids) - dest_cpu = cpumask_any_and(cpu_online_mask, - &p->cpus_allowed); - - /* No more Mr. Nice Guy. */ - if (dest_cpu >= nr_cpu_ids) { - cpumask_t cpus_allowed; - - cpuset_cpus_allowed_locked(p, &cpus_allowed); - /* - * Try to stay on the same cpuset, where the - * current cpuset may be a subset of all cpus. - * The cpuset_cpus_allowed_locked() variant of - * cpuset_cpus_allowed() will not block. It must be - * called within calls to cpuset_lock/cpuset_unlock. - */ - rq = task_rq_lock(p, &flags); - p->cpus_allowed = cpus_allowed; - dest_cpu = cpumask_any_and(cpu_online_mask, - &p->cpus_allowed); - task_rq_unlock(rq, &flags); - - /* - * Don't tell them about moving exiting tasks or - * kernel threads (both mm NULL), since they never - * leave kernel. - */ - if (p->mm && printk_ratelimit()) { - printk(KERN_INFO "process %d (%s) no " - "longer affine to cpu%d\n", - task_pid_nr(p), p->comm, dead_cpu); - } + /* + * Don't tell them about moving exiting tasks or + * kernel threads (both mm NULL), since they never + * leave kernel. + */ + if (p->mm && printk_ratelimit()) { + printk(KERN_INFO "process %d (%s) no " + "longer affine to cpu%d\n", + task_pid_nr(p), p->comm, dead_cpu); } - } while (!__migrate_task_irq(p, dead_cpu, dest_cpu)); + } + +move: + /* It can have affinity changed while we were choosing. */ + if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) + goto again; } /* -- cgit v1.2.3 From 5a16f3d30ca4e3f166d691220c003066a14e32b5 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:11 +1030 Subject: sched: convert struct (sys_)sched_setaffinity() to cpumask_var_t. Impact: stack usage reduction Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves space on the stack. cpumask_var_t is just a struct cpumask for !CONFIG_CPUMASK_OFFSTACK. Note the removal of the initializer of new_mask: since the first thing we did was "cpus_and(new_mask, new_mask, cpus_allowed)" I just changed that to "cpumask_and(new_mask, in_mask, cpus_allowed);". 
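A minimal sketch of the two-allocation, goto-unwind shape this patch gives sched_setaffinity(); the function and label names are illustrative and not from the patch:

	/* Illustrative only: allocate two off-stack cpumasks, unwind on failure. */
	static int example_two_masks(const struct cpumask *in_mask)
	{
		cpumask_var_t allowed, new_mask;
		int ret = -ENOMEM;

		if (!alloc_cpumask_var(&allowed, GFP_KERNEL))
			goto out;
		if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
			goto out_free_allowed;

		cpumask_copy(allowed, cpu_online_mask);	/* stand-in for cpuset_cpus_allowed() */
		cpumask_and(new_mask, in_mask, allowed);
		ret = cpumask_empty(new_mask) ? -EINVAL : 0;

		free_cpumask_var(new_mask);
	out_free_allowed:
		free_cpumask_var(allowed);
	out:
		return ret;
	}

With CONFIG_CPUMASK_OFFSTACK=n both allocations are no-ops that always succeed, so the common case adds essentially no overhead.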
Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 40 +++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f7dee2029e4..2d4ff91e0c9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5378,8 +5378,7 @@ out_unlock: long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) { - cpumask_t cpus_allowed; - cpumask_t new_mask = *in_mask; + cpumask_var_t cpus_allowed, new_mask; struct task_struct *p; int retval; @@ -5401,6 +5400,14 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) get_task_struct(p); read_unlock(&tasklist_lock); + if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { + retval = -ENOMEM; + goto out_put_task; + } + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { + retval = -ENOMEM; + goto out_free_cpus_allowed; + } retval = -EPERM; if ((current->euid != p->euid) && (current->euid != p->uid) && !capable(CAP_SYS_NICE)) @@ -5410,24 +5417,28 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) if (retval) goto out_unlock; - cpuset_cpus_allowed(p, &cpus_allowed); - cpus_and(new_mask, new_mask, cpus_allowed); + cpuset_cpus_allowed(p, cpus_allowed); + cpumask_and(new_mask, in_mask, cpus_allowed); again: - retval = set_cpus_allowed_ptr(p, &new_mask); + retval = set_cpus_allowed_ptr(p, new_mask); if (!retval) { - cpuset_cpus_allowed(p, &cpus_allowed); - if (!cpus_subset(new_mask, cpus_allowed)) { + cpuset_cpus_allowed(p, cpus_allowed); + if (!cpumask_subset(new_mask, cpus_allowed)) { /* * We must have raced with a concurrent cpuset * update. Just reset the cpus_allowed to the * cpuset's cpus_allowed */ - new_mask = cpus_allowed; + cpumask_copy(new_mask, cpus_allowed); goto again; } } out_unlock: + free_cpumask_var(new_mask); +out_free_cpus_allowed: + free_cpumask_var(cpus_allowed); +out_put_task: put_task_struct(p); put_online_cpus(); return retval; @@ -5453,14 +5464,17 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, unsigned long __user *user_mask_ptr) { - cpumask_t new_mask; + cpumask_var_t new_mask; int retval; - retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); - if (retval) - return retval; + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) + return -ENOMEM; - return sched_setaffinity(pid, &new_mask); + retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); + if (retval == 0) + retval = sched_setaffinity(pid, new_mask); + free_cpumask_var(new_mask); + return retval; } long sched_getaffinity(pid_t pid, cpumask_t *mask) -- cgit v1.2.3 From d5dd3db1dce73cdd5c45c5a3498c51bd21b8864b Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:12 +1030 Subject: sched: convert sched_domain_debug to cpumask_var_t. Impact: stack usage reduction Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves stack space. cpumask_var_t is just a struct cpumask for !CONFIG_CPUMASK_OFFSTACK. In this case, we always alloced, but we don't need to any more. 
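As an aside (not part of the patch), the point of replacing the unconditional kmalloc with alloc_cpumask_var() is that the latter compiles away when CONFIG_CPUMASK_OFFSTACK is not set; a rough sketch of the resulting shape, with an invented function name:

	/* Sketch only: scratch cpumask for a debug walk over the domain tree. */
	static void example_domain_walk(struct sched_domain *sd)
	{
		cpumask_var_t scratch;

		/* Can only fail when cpumasks really are allocated off-stack. */
		if (!alloc_cpumask_var(&scratch, GFP_KERNEL))
			return;

		for (; sd; sd = sd->parent) {
			/* ... use scratch to validate coverage of sd->groups ... */
		}

		free_cpumask_var(scratch);
	}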
Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 2d4ff91e0c9..24012c2a889 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6706,7 +6706,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, static void sched_domain_debug(struct sched_domain *sd, int cpu) { - cpumask_t *groupmask; + cpumask_var_t groupmask; int level = 0; if (!sd) { @@ -6716,8 +6716,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); - groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL); - if (!groupmask) { + if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) { printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); return; } @@ -6730,7 +6729,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) if (!sd) break; } - kfree(groupmask); + free_cpumask_var(groupmask); } #else /* !CONFIG_SCHED_DEBUG */ # define sched_domain_debug(sd, cpu) do { } while (0) -- cgit v1.2.3 From dcc30a35f71bcf51f1e9b336dc5e41923071509a Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:12 +1030 Subject: sched: convert cpu_isolated_map to cpumask_var_t. Impact: stack usage reduction, (future) size reduction, cleanup Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves space for small nr_cpu_ids but big CONFIG_NR_CPUS. cpumask_var_t is just a struct cpumask for !CONFIG_CPUMASK_OFFSTACK. We can also use cpulist_parse() instead of doing it manually in isolated_cpu_setup. Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 24012c2a889..526618fe4a7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6917,19 +6917,12 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) } /* cpus with isolated domains */ -static cpumask_t cpu_isolated_map = CPU_MASK_NONE; +static cpumask_var_t cpu_isolated_map; /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) { - static int __initdata ints[NR_CPUS]; - int i; - - str = get_options(str, ARRAY_SIZE(ints), ints); - cpus_clear(cpu_isolated_map); - for (i = 1; i <= ints[0]; i++) - if (ints[i] < NR_CPUS) - cpu_set(ints[i], cpu_isolated_map); + cpulist_parse(str, *cpu_isolated_map); return 1; } @@ -7727,7 +7720,7 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map) doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL); if (!doms_cur) doms_cur = &fallback_doms; - cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map); + cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map); dattr_cur = NULL; err = build_sched_domains(doms_cur); register_sched_domain_sysctl(); @@ -7826,7 +7819,7 @@ match1: if (doms_new == NULL) { ndoms_cur = 0; doms_new = &fallback_doms; - cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); + cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map); WARN_ON_ONCE(dattr_new); } @@ -7985,7 +7978,9 @@ static int update_runtime(struct notifier_block *nfb, void __init sched_init_smp(void) { - cpumask_t non_isolated_cpus; + cpumask_var_t non_isolated_cpus; + + alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); #if defined(CONFIG_NUMA) sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), @@ -7994,10 +7989,10 @@ void 
__init sched_init_smp(void) #endif get_online_cpus(); mutex_lock(&sched_domains_mutex); - arch_init_sched_domains(&cpu_online_map); - cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); - if (cpus_empty(non_isolated_cpus)) - cpu_set(smp_processor_id(), non_isolated_cpus); + arch_init_sched_domains(cpu_online_mask); + cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); + if (cpumask_empty(non_isolated_cpus)) + cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); mutex_unlock(&sched_domains_mutex); put_online_cpus(); @@ -8012,9 +8007,10 @@ void __init sched_init_smp(void) init_hrtick(); /* Move init over to a non-isolated CPU */ - if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0) + if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) BUG(); sched_init_granularity(); + free_cpumask_var(non_isolated_cpus); } #else void __init sched_init_smp(void) @@ -8334,6 +8330,7 @@ void __init sched_init(void) #ifdef CONFIG_NO_HZ alloc_bootmem_cpumask_var(&nohz.cpu_mask); #endif + alloc_bootmem_cpumask_var(&cpu_isolated_map); scheduler_running = 1; } -- cgit v1.2.3 From 4212823fb459eacc8098dd420bb68ebb9917989d Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:12 +1030 Subject: sched: convert falback_doms to cpumask_var_t. Impact: (future) size reduction for large NR_CPUS. Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves space for small nr_cpu_ids but big CONFIG_NR_CPUS. cpumask_var_t is just a struct cpumask for !CONFIG_CPUMASK_OFFSTACK. Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 526618fe4a7..42588ad93b2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7697,10 +7697,10 @@ static struct sched_domain_attr *dattr_cur; /* * Special case: If a kmalloc of a doms_cur partition (array of - * cpumask_t) fails, then fallback to a single sched domain, - * as determined by the single cpumask_t fallback_doms. + * cpumask) fails, then fallback to a single sched domain, + * as determined by the single cpumask fallback_doms. */ -static cpumask_t fallback_doms; +static cpumask_var_t fallback_doms; void __attribute__((weak)) arch_update_cpu_topology(void) { @@ -7719,7 +7719,7 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map) ndoms_cur = 1; doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL); if (!doms_cur) - doms_cur = &fallback_doms; + doms_cur = fallback_doms; cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map); dattr_cur = NULL; err = build_sched_domains(doms_cur); @@ -7818,7 +7818,7 @@ match1: if (doms_new == NULL) { ndoms_cur = 0; - doms_new = &fallback_doms; + doms_new = fallback_doms; cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map); WARN_ON_ONCE(dattr_new); } @@ -7838,7 +7838,7 @@ match2: } /* Remember the new sched domains */ - if (doms_cur != &fallback_doms) + if (doms_cur != fallback_doms) kfree(doms_cur); kfree(dattr_cur); /* kfree(NULL) is safe */ doms_cur = doms_new; @@ -8011,6 +8011,8 @@ void __init sched_init_smp(void) BUG(); sched_init_granularity(); free_cpumask_var(non_isolated_cpus); + + alloc_cpumask_var(&fallback_doms, GFP_KERNEL); } #else void __init sched_init_smp(void) -- cgit v1.2.3 From 68e74568fbe5854952355e942acca51f138096d9 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:13 +1030 Subject: sched: convert struct cpupri_vec cpumask_var_t. 
Impact: stack usage reduction, (future) size reduction for large NR_CPUS. Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves space for small nr_cpu_ids but big CONFIG_NR_CPUS. The fact cpupro_init is called both before and after the slab is available makes for an ugly parameter unfortunately. We also use cpumask_any_and to get rid of a temporary in cpupri_find. Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 9 +++++++-- kernel/sched_cpupri.c | 39 ++++++++++++++++++++++++++++----------- kernel/sched_cpupri.h | 5 +++-- 3 files changed, 38 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 42588ad93b2..94fa333c1e7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6792,6 +6792,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) static void free_rootdomain(struct root_domain *rd) { + cpupri_cleanup(&rd->cpupri); + free_cpumask_var(rd->rto_mask); free_cpumask_var(rd->online); free_cpumask_var(rd->span); @@ -6834,7 +6836,7 @@ static int init_rootdomain(struct root_domain *rd, bool bootmem) alloc_bootmem_cpumask_var(&def_root_domain.span); alloc_bootmem_cpumask_var(&def_root_domain.online); alloc_bootmem_cpumask_var(&def_root_domain.rto_mask); - cpupri_init(&rd->cpupri); + cpupri_init(&rd->cpupri, true); return 0; } @@ -6845,9 +6847,12 @@ static int init_rootdomain(struct root_domain *rd, bool bootmem) if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) goto free_online; - cpupri_init(&rd->cpupri); + if (cpupri_init(&rd->cpupri, false) != 0) + goto free_rto_mask; return 0; +free_rto_mask: + free_cpumask_var(rd->rto_mask); free_online: free_cpumask_var(rd->online); free_span: diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index 52154fefab7..018b7be1db2 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -67,24 +67,21 @@ static int convert_prio(int prio) * Returns: (int)bool - CPUs were found */ int cpupri_find(struct cpupri *cp, struct task_struct *p, - cpumask_t *lowest_mask) + struct cpumask *lowest_mask) { int idx = 0; int task_pri = convert_prio(p->prio); for_each_cpupri_active(cp->pri_active, idx) { struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; - cpumask_t mask; if (idx >= task_pri) break; - cpus_and(mask, p->cpus_allowed, vec->mask); - - if (cpus_empty(mask)) + if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) continue; - *lowest_mask = mask; + cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); return 1; } @@ -126,7 +123,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) vec->count--; if (!vec->count) clear_bit(oldpri, cp->pri_active); - cpu_clear(cpu, vec->mask); + cpumask_clear_cpu(cpu, vec->mask); spin_unlock_irqrestore(&vec->lock, flags); } @@ -136,7 +133,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) spin_lock_irqsave(&vec->lock, flags); - cpu_set(cpu, vec->mask); + cpumask_set_cpu(cpu, vec->mask); vec->count++; if (vec->count == 1) set_bit(newpri, cp->pri_active); @@ -150,10 +147,11 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) /** * cpupri_init - initialize the cpupri structure * @cp: The cpupri context + * @bootmem: true if allocations need to use bootmem * - * Returns: (void) + * Returns: -ENOMEM if memory fails. 
*/ -void cpupri_init(struct cpupri *cp) +int cpupri_init(struct cpupri *cp, bool bootmem) { int i; @@ -164,11 +162,30 @@ void cpupri_init(struct cpupri *cp) spin_lock_init(&vec->lock); vec->count = 0; - cpus_clear(vec->mask); + if (bootmem) + alloc_bootmem_cpumask_var(&vec->mask); + else if (!alloc_cpumask_var(&vec->mask, GFP_KERNEL)) + goto cleanup; } for_each_possible_cpu(i) cp->cpu_to_pri[i] = CPUPRI_INVALID; + return 0; + +cleanup: + for (i--; i >= 0; i--) + free_cpumask_var(cp->pri_to_cpu[i].mask); + return -ENOMEM; } +/** + * cpupri_cleanup - clean up the cpupri structure + * @cp: The cpupri context + */ +void cpupri_cleanup(struct cpupri *cp) +{ + int i; + for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) + free_cpumask_var(cp->pri_to_cpu[i].mask); +} diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h index f25811b0f93..642a94ef8a0 100644 --- a/kernel/sched_cpupri.h +++ b/kernel/sched_cpupri.h @@ -14,7 +14,7 @@ struct cpupri_vec { spinlock_t lock; int count; - cpumask_t mask; + cpumask_var_t mask; }; struct cpupri { @@ -27,7 +27,8 @@ struct cpupri { int cpupri_find(struct cpupri *cp, struct task_struct *p, cpumask_t *lowest_mask); void cpupri_set(struct cpupri *cp, int cpu, int pri); -void cpupri_init(struct cpupri *cp); +int cpupri_init(struct cpupri *cp, bool bootmem); +void cpupri_cleanup(struct cpupri *cp); #else #define cpupri_set(cp, cpu, pri) do { } while (0) #define cpupri_init() do { } while (0) -- cgit v1.2.3 From 24600ce89a819a8f2fb4fd69fd777218a82ade20 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:13 +1030 Subject: sched: convert check_preempt_equal_prio to cpumask_var_t. Impact: stack reduction for large NR_CPUS Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves stack space. We simply return if the allocation fails: since we don't use it we could just pass NULL to cpupri_find and have it handle that. Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 820fc422c6d..1fa13624293 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -805,17 +805,20 @@ static int select_task_rq_rt(struct task_struct *p, int sync) static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) { - cpumask_t mask; + cpumask_var_t mask; if (rq->curr->rt.nr_cpus_allowed == 1) return; - if (p->rt.nr_cpus_allowed != 1 - && cpupri_find(&rq->rd->cpupri, p, &mask)) + if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) return; - if (!cpupri_find(&rq->rd->cpupri, rq->curr, &mask)) - return; + if (p->rt.nr_cpus_allowed != 1 + && cpupri_find(&rq->rd->cpupri, p, mask)) + goto free; + + if (!cpupri_find(&rq->rd->cpupri, rq->curr, mask)) + goto free; /* * There appears to be other cpus that can accept @@ -824,6 +827,8 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) */ requeue_task_rt(rq, p, 1); resched_task(rq->curr); +free: + free_cpumask_var(mask); } #endif /* CONFIG_SMP */ -- cgit v1.2.3 From 0e3900e6d3b04c44737ebc505604dcd8ed30e354 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:13 +1030 Subject: sched: convert local_cpu_mask to cpumask_var_t. Impact: (future) size reduction for large NR_CPUS. Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves space for small nr_cpu_ids but big CONFIG_NR_CPUS. cpumask_var_t is just a struct cpumask for !CONFIG_CPUMASK_OFFSTACK. 
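The per-cpu variant of the same idea can be sketched as below, with made-up names; the patch does exactly this for local_cpu_mask via init_sched_rt_class():

	/* Sketch only: a per-cpu off-stack cpumask, allocated once at init. */
	static DEFINE_PER_CPU(cpumask_var_t, example_scratch);

	static void example_init_per_cpu_masks(void)
	{
		unsigned int cpu;

		for_each_possible_cpu(cpu)
			/* Return value ignored here, as in the patch itself. */
			alloc_cpumask_var(&per_cpu(example_scratch, cpu), GFP_KERNEL);
	}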
Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 + kernel/sched_rt.c | 13 +++++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 94fa333c1e7..f2be6187003 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8018,6 +8018,7 @@ void __init sched_init_smp(void) free_cpumask_var(non_isolated_cpus); alloc_cpumask_var(&fallback_doms, GFP_KERNEL); + init_sched_rt_class(); } #else void __init sched_init_smp(void) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 1fa13624293..1f0e99d1a8c 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -962,7 +962,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) return next; } -static DEFINE_PER_CPU(cpumask_t, local_cpu_mask); +static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) { @@ -982,7 +982,7 @@ static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) static int find_lowest_rq(struct task_struct *task) { struct sched_domain *sd; - cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask); + cpumask_t *lowest_mask = __get_cpu_var(local_cpu_mask); int this_cpu = smp_processor_id(); int cpu = task_cpu(task); @@ -1551,3 +1551,12 @@ static void print_rt_stats(struct seq_file *m, int cpu) rcu_read_unlock(); } #endif /* CONFIG_SCHED_DEBUG */ + +/* Note that this is never called for !SMP, but that's OK. */ +static inline void init_sched_rt_class(void) +{ + unsigned int i; + + for_each_possible_cpu(i) + alloc_cpumask_var(&per_cpu(local_cpu_mask, i), GFP_KERNEL); +} -- cgit v1.2.3 From 96f874e26428ab5d2db681c100210c254775e154 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:14 +1030 Subject: sched: convert remaining old-style cpumask operators Impact: Trivial API conversion NR_CPUS -> nr_cpu_ids cpumask_t -> struct cpumask sizeof(cpumask_t) -> cpumask_size() cpumask_a = cpumask_b -> cpumask_copy(&cpumask_a, &cpumask_b) cpu_set() -> cpumask_set_cpu() first_cpu() -> cpumask_first() cpumask_of_cpu() -> cpumask_of() cpus_* -> cpumask_* There are some FIXMEs where we all archs to complete infrastructure (patches have been sent): cpu_coregroup_map -> cpu_coregroup_mask node_to_cpumask* -> cpumask_of_node There is also one FIXME where we pass an array of cpumasks to partition_sched_domains(): this implies knowing the definition of 'struct cpumask' and the size of a cpumask. This will be fixed in a future patch. Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 212 ++++++++++++++++++++++++++++------------------------ kernel/sched_fair.c | 4 +- kernel/sched_rt.c | 18 ++--- 3 files changed, 124 insertions(+), 110 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f2be6187003..eba6a156d33 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2829,7 +2829,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu) struct rq *rq; rq = task_rq_lock(p, &flags); - if (!cpu_isset(dest_cpu, p->cpus_allowed) + if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) || unlikely(!cpu_active(dest_cpu))) goto out; @@ -2895,7 +2895,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * 2) cannot be migrated to this CPU due to cpus_allowed, or * 3) are cache-hot on their current CPU. 
*/ - if (!cpu_isset(this_cpu, p->cpus_allowed)) { + if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { schedstat_inc(p, se.nr_failed_migrations_affine); return 0; } @@ -3070,7 +3070,7 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long *imbalance, enum cpu_idle_type idle, - int *sd_idle, const cpumask_t *cpus, int *balance) + int *sd_idle, const struct cpumask *cpus, int *balance) { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; @@ -3387,7 +3387,7 @@ ret: */ static struct rq * find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, - unsigned long imbalance, const cpumask_t *cpus) + unsigned long imbalance, const struct cpumask *cpus) { struct rq *busiest = NULL, *rq; unsigned long max_load = 0; @@ -3396,7 +3396,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, for_each_cpu(i, sched_group_cpus(group)) { unsigned long wl; - if (!cpu_isset(i, *cpus)) + if (!cpumask_test_cpu(i, cpus)) continue; rq = cpu_rq(i); @@ -3426,7 +3426,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, */ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, - int *balance, cpumask_t *cpus) + int *balance, struct cpumask *cpus) { int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; struct sched_group *group; @@ -3434,7 +3434,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct rq *busiest; unsigned long flags; - cpus_setall(*cpus); + cpumask_setall(cpus); /* * When power savings policy is enabled for the parent domain, idle @@ -3494,8 +3494,8 @@ redo: /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(all_pinned)) { - cpu_clear(cpu_of(busiest), *cpus); - if (!cpus_empty(*cpus)) + cpumask_clear_cpu(cpu_of(busiest), cpus); + if (!cpumask_empty(cpus)) goto redo; goto out_balanced; } @@ -3512,7 +3512,8 @@ redo: /* don't kick the migration_thread, if the curr * task on busiest cpu can't be moved to this_cpu */ - if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { + if (!cpumask_test_cpu(this_cpu, + &busiest->curr->cpus_allowed)) { spin_unlock_irqrestore(&busiest->lock, flags); all_pinned = 1; goto out_one_pinned; @@ -3587,7 +3588,7 @@ out: */ static int load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, - cpumask_t *cpus) + struct cpumask *cpus) { struct sched_group *group; struct rq *busiest = NULL; @@ -3596,7 +3597,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, int sd_idle = 0; int all_pinned = 0; - cpus_setall(*cpus); + cpumask_setall(cpus); /* * When power savings policy is enabled for the parent domain, idle @@ -3640,8 +3641,8 @@ redo: double_unlock_balance(this_rq, busiest); if (unlikely(all_pinned)) { - cpu_clear(cpu_of(busiest), *cpus); - if (!cpus_empty(*cpus)) + cpumask_clear_cpu(cpu_of(busiest), cpus); + if (!cpumask_empty(cpus)) goto redo; } } @@ -5376,7 +5377,7 @@ out_unlock: return retval; } -long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) +long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) { cpumask_var_t cpus_allowed, new_mask; struct task_struct *p; @@ -5445,13 +5446,13 @@ out_put_task: } static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, - cpumask_t *new_mask) + struct cpumask *new_mask) { - if 
(len < sizeof(cpumask_t)) { - memset(new_mask, 0, sizeof(cpumask_t)); - } else if (len > sizeof(cpumask_t)) { - len = sizeof(cpumask_t); - } + if (len < cpumask_size()) + cpumask_clear(new_mask); + else if (len > cpumask_size()) + len = cpumask_size(); + return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; } @@ -5477,7 +5478,7 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, return retval; } -long sched_getaffinity(pid_t pid, cpumask_t *mask) +long sched_getaffinity(pid_t pid, struct cpumask *mask) { struct task_struct *p; int retval; @@ -5494,7 +5495,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) if (retval) goto out_unlock; - cpus_and(*mask, p->cpus_allowed, cpu_online_map); + cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); out_unlock: read_unlock(&tasklist_lock); @@ -5872,7 +5873,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) idle->se.exec_start = sched_clock(); idle->prio = idle->normal_prio = MAX_PRIO; - idle->cpus_allowed = cpumask_of_cpu(cpu); + cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); __set_task_cpu(idle, cpu); rq->curr = rq->idle = idle; @@ -5956,7 +5957,7 @@ static inline void sched_init_granularity(void) * task must not exit() & deallocate itself prematurely. The * call is not atomic; no spinlocks may be held. */ -int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) +int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) { struct migration_req req; unsigned long flags; @@ -5964,13 +5965,13 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) int ret = 0; rq = task_rq_lock(p, &flags); - if (!cpus_intersects(*new_mask, cpu_online_map)) { + if (!cpumask_intersects(new_mask, cpu_online_mask)) { ret = -EINVAL; goto out; } if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && - !cpus_equal(p->cpus_allowed, *new_mask))) { + !cpumask_equal(&p->cpus_allowed, new_mask))) { ret = -EINVAL; goto out; } @@ -5978,12 +5979,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) if (p->sched_class->set_cpus_allowed) p->sched_class->set_cpus_allowed(p, new_mask); else { - p->cpus_allowed = *new_mask; - p->rt.nr_cpus_allowed = cpus_weight(*new_mask); + cpumask_copy(&p->cpus_allowed, new_mask); + p->rt.nr_cpus_allowed = cpumask_weight(new_mask); } /* Can the task run on the task's current CPU? If so, we're done */ - if (cpu_isset(task_cpu(p), *new_mask)) + if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { @@ -6028,7 +6029,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) if (task_cpu(p) != src_cpu) goto done; /* Affinity changed (again). 
*/ - if (!cpu_isset(dest_cpu, p->cpus_allowed)) + if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) goto fail; on_rq = p->se.on_rq; @@ -6629,13 +6630,13 @@ early_initcall(migration_init); #ifdef CONFIG_SCHED_DEBUG static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, - cpumask_t *groupmask) + struct cpumask *groupmask) { struct sched_group *group = sd->groups; char str[256]; cpulist_scnprintf(str, sizeof(str), *sched_domain_span(sd)); - cpus_clear(*groupmask); + cpumask_clear(groupmask); printk(KERN_DEBUG "%*s domain %d: ", level, "", level); @@ -6936,24 +6937,25 @@ __setup("isolcpus=", isolated_cpu_setup); /* * init_sched_build_groups takes the cpumask we wish to span, and a pointer * to a function which identifies what group(along with sched group) a CPU - * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS - * (due to the fact that we keep track of groups covered with a cpumask_t). + * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids + * (due to the fact that we keep track of groups covered with a struct cpumask). * * init_sched_build_groups will build a circular linked list of the groups * covered by the given span, and will set each group's ->cpumask correctly, * and ->cpu_power to 0. */ static void -init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, - int (*group_fn)(int cpu, const cpumask_t *cpu_map, +init_sched_build_groups(const struct cpumask *span, + const struct cpumask *cpu_map, + int (*group_fn)(int cpu, const struct cpumask *cpu_map, struct sched_group **sg, - cpumask_t *tmpmask), - cpumask_t *covered, cpumask_t *tmpmask) + struct cpumask *tmpmask), + struct cpumask *covered, struct cpumask *tmpmask) { struct sched_group *first = NULL, *last = NULL; int i; - cpus_clear(*covered); + cpumask_clear(covered); for_each_cpu(i, span) { struct sched_group *sg; @@ -6970,7 +6972,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, if (group_fn(j, cpu_map, NULL, tmpmask) != group) continue; - cpu_set(j, *covered); + cpumask_set_cpu(j, covered); cpumask_set_cpu(j, sched_group_cpus(sg)); } if (!first) @@ -7035,9 +7037,10 @@ static int find_next_best_node(int node, nodemask_t *used_nodes) * should be one that prevents unnecessary balancing, but also spreads tasks * out optimally. 
*/ -static void sched_domain_node_span(int node, cpumask_t *span) +static void sched_domain_node_span(int node, struct cpumask *span) { nodemask_t used_nodes; + /* FIXME: use cpumask_of_node() */ node_to_cpumask_ptr(nodemask, node); int i; @@ -7081,8 +7084,8 @@ static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus); static int -cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, - cpumask_t *unused) +cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *unused) { if (sg) *sg = &per_cpu(sched_group_cpus, cpu).sg; @@ -7100,22 +7103,21 @@ static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) static int -cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, - cpumask_t *mask) +cpu_to_core_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *mask) { int group; - *mask = per_cpu(cpu_sibling_map, cpu); - cpus_and(*mask, *mask, *cpu_map); - group = first_cpu(*mask); + cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); + group = cpumask_first(mask); if (sg) *sg = &per_cpu(sched_group_core, group).sg; return group; } #elif defined(CONFIG_SCHED_MC) static int -cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, - cpumask_t *unused) +cpu_to_core_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *unused) { if (sg) *sg = &per_cpu(sched_group_core, cpu).sg; @@ -7127,18 +7129,18 @@ static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); static int -cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, - cpumask_t *mask) +cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *mask) { int group; #ifdef CONFIG_SCHED_MC + /* FIXME: Use cpu_coregroup_mask. 
*/ *mask = cpu_coregroup_map(cpu); cpus_and(*mask, *mask, *cpu_map); - group = first_cpu(*mask); + group = cpumask_first(mask); #elif defined(CONFIG_SCHED_SMT) - *mask = per_cpu(cpu_sibling_map, cpu); - cpus_and(*mask, *mask, *cpu_map); - group = first_cpu(*mask); + cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); + group = cpumask_first(mask); #else group = cpu; #endif @@ -7159,14 +7161,16 @@ static struct sched_group ***sched_group_nodes_bycpu; static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); -static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, - struct sched_group **sg, cpumask_t *nodemask) +static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, + struct cpumask *nodemask) { int group; + /* FIXME: use cpumask_of_node */ node_to_cpumask_ptr(pnodemask, cpu_to_node(cpu)); - cpus_and(*nodemask, *pnodemask, *cpu_map); - group = first_cpu(*nodemask); + cpumask_and(nodemask, pnodemask, cpu_map); + group = cpumask_first(nodemask); if (sg) *sg = &per_cpu(sched_group_allnodes, group).sg; @@ -7202,7 +7206,8 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) #ifdef CONFIG_NUMA /* Free memory allocated for various sched_group structures */ -static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) +static void free_sched_groups(const struct cpumask *cpu_map, + struct cpumask *nodemask) { int cpu, i; @@ -7215,10 +7220,11 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) for (i = 0; i < nr_node_ids; i++) { struct sched_group *oldsg, *sg = sched_group_nodes[i]; + /* FIXME: Use cpumask_of_node */ node_to_cpumask_ptr(pnodemask, i); cpus_and(*nodemask, *pnodemask, *cpu_map); - if (cpus_empty(*nodemask)) + if (cpumask_empty(nodemask)) continue; if (sg == NULL) @@ -7236,7 +7242,8 @@ next_sg: } } #else /* !CONFIG_NUMA */ -static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) +static void free_sched_groups(const struct cpumask *cpu_map, + struct cpumask *nodemask) { } #endif /* CONFIG_NUMA */ @@ -7366,7 +7373,7 @@ static void set_domain_attribute(struct sched_domain *sd, * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus */ -static int __build_sched_domains(const cpumask_t *cpu_map, +static int __build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) { int i, err = -ENOMEM; @@ -7416,7 +7423,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, } #ifdef CONFIG_NUMA - sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; + sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes; #endif /* @@ -7425,12 +7432,13 @@ static int __build_sched_domains(const cpumask_t *cpu_map, for_each_cpu(i, cpu_map) { struct sched_domain *sd = NULL, *p; + /* FIXME: use cpumask_of_node */ *nodemask = node_to_cpumask(cpu_to_node(i)); cpus_and(*nodemask, *nodemask, *cpu_map); #ifdef CONFIG_NUMA - if (cpus_weight(*cpu_map) > - SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) { + if (cpumask_weight(cpu_map) > + SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) { sd = &per_cpu(allnodes_domains, i); SD_INIT(sd, ALLNODES); set_domain_attribute(sd, attr); @@ -7491,9 +7499,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #ifdef CONFIG_SCHED_SMT /* Set up CPU (sibling) groups */ for_each_cpu(i, cpu_map) { - *this_sibling_map = per_cpu(cpu_sibling_map, i); - 
cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map); - if (i != first_cpu(*this_sibling_map)) + cpumask_and(this_sibling_map, + &per_cpu(cpu_sibling_map, i), cpu_map); + if (i != cpumask_first(this_sibling_map)) continue; init_sched_build_groups(this_sibling_map, cpu_map, @@ -7505,9 +7513,10 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #ifdef CONFIG_SCHED_MC /* Set up multi-core groups */ for_each_cpu(i, cpu_map) { + /* FIXME: Use cpu_coregroup_mask */ *this_core_map = cpu_coregroup_map(i); cpus_and(*this_core_map, *this_core_map, *cpu_map); - if (i != first_cpu(*this_core_map)) + if (i != cpumask_first(this_core_map)) continue; init_sched_build_groups(this_core_map, cpu_map, @@ -7518,9 +7527,10 @@ static int __build_sched_domains(const cpumask_t *cpu_map, /* Set up physical groups */ for (i = 0; i < nr_node_ids; i++) { + /* FIXME: Use cpumask_of_node */ *nodemask = node_to_cpumask(i); cpus_and(*nodemask, *nodemask, *cpu_map); - if (cpus_empty(*nodemask)) + if (cpumask_empty(nodemask)) continue; init_sched_build_groups(nodemask, cpu_map, @@ -7541,17 +7551,18 @@ static int __build_sched_domains(const cpumask_t *cpu_map, struct sched_group *sg, *prev; int j; + /* FIXME: Use cpumask_of_node */ *nodemask = node_to_cpumask(i); - cpus_clear(*covered); + cpumask_clear(covered); cpus_and(*nodemask, *nodemask, *cpu_map); - if (cpus_empty(*nodemask)) { + if (cpumask_empty(nodemask)) { sched_group_nodes[i] = NULL; continue; } sched_domain_node_span(i, domainspan); - cpus_and(*domainspan, *domainspan, *cpu_map); + cpumask_and(domainspan, domainspan, cpu_map); sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), GFP_KERNEL, i); @@ -7570,21 +7581,22 @@ static int __build_sched_domains(const cpumask_t *cpu_map, sg->__cpu_power = 0; cpumask_copy(sched_group_cpus(sg), nodemask); sg->next = sg; - cpus_or(*covered, *covered, *nodemask); + cpumask_or(covered, covered, nodemask); prev = sg; for (j = 0; j < nr_node_ids; j++) { int n = (i + j) % nr_node_ids; + /* FIXME: Use cpumask_of_node */ node_to_cpumask_ptr(pnodemask, n); - cpus_complement(*notcovered, *covered); - cpus_and(*tmpmask, *notcovered, *cpu_map); - cpus_and(*tmpmask, *tmpmask, *domainspan); - if (cpus_empty(*tmpmask)) + cpumask_complement(notcovered, covered); + cpumask_and(tmpmask, notcovered, cpu_map); + cpumask_and(tmpmask, tmpmask, domainspan); + if (cpumask_empty(tmpmask)) break; - cpus_and(*tmpmask, *tmpmask, *pnodemask); - if (cpus_empty(*tmpmask)) + cpumask_and(tmpmask, tmpmask, pnodemask); + if (cpumask_empty(tmpmask)) continue; sg = kmalloc_node(sizeof(struct sched_group) + @@ -7598,7 +7610,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, sg->__cpu_power = 0; cpumask_copy(sched_group_cpus(sg), tmpmask); sg->next = prev->next; - cpus_or(*covered, *covered, *tmpmask); + cpumask_or(covered, covered, tmpmask); prev->next = sg; prev = sg; } @@ -7634,7 +7646,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, if (sd_allnodes) { struct sched_group *sg; - cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg, + cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, tmpmask); init_numa_sched_groups_power(sg); } @@ -7690,12 +7702,12 @@ error: #endif } -static int build_sched_domains(const cpumask_t *cpu_map) +static int build_sched_domains(const struct cpumask *cpu_map) { return __build_sched_domains(cpu_map, NULL); } -static cpumask_t *doms_cur; /* current sched domains */ +static struct cpumask *doms_cur; /* current sched domains */ static int ndoms_cur; /* number of sched 
domains in 'doms_cur' */ static struct sched_domain_attr *dattr_cur; /* attribues of custom domains in 'doms_cur' */ @@ -7716,13 +7728,13 @@ void __attribute__((weak)) arch_update_cpu_topology(void) * For now this just excludes isolated cpus, but could be used to * exclude other special cases in the future. */ -static int arch_init_sched_domains(const cpumask_t *cpu_map) +static int arch_init_sched_domains(const struct cpumask *cpu_map) { int err; arch_update_cpu_topology(); ndoms_cur = 1; - doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL); + doms_cur = kmalloc(cpumask_size(), GFP_KERNEL); if (!doms_cur) doms_cur = fallback_doms; cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map); @@ -7733,8 +7745,8 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map) return err; } -static void arch_destroy_sched_domains(const cpumask_t *cpu_map, - cpumask_t *tmpmask) +static void arch_destroy_sched_domains(const struct cpumask *cpu_map, + struct cpumask *tmpmask) { free_sched_groups(cpu_map, tmpmask); } @@ -7743,15 +7755,16 @@ static void arch_destroy_sched_domains(const cpumask_t *cpu_map, * Detach sched domains from a group of cpus specified in cpu_map * These cpus will now be attached to the NULL domain */ -static void detach_destroy_domains(const cpumask_t *cpu_map) +static void detach_destroy_domains(const struct cpumask *cpu_map) { - cpumask_t tmpmask; + /* Save because hotplug lock held. */ + static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS); int i; for_each_cpu(i, cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); synchronize_sched(); - arch_destroy_sched_domains(cpu_map, &tmpmask); + arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask)); } /* handle null as "default" */ @@ -7776,7 +7789,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * doms_new[] to the current sched domain partitioning, doms_cur[]. * It destroys each deleted domain and builds each new domain. * - * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'. + * 'doms_new' is an array of cpumask's of length 'ndoms_new'. * The masks don't intersect (don't overlap.) We should setup one * sched domain for each mask. CPUs not in any of the cpumasks will * not be load balanced. If the same cpumask appears both in the @@ -7790,13 +7803,14 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * the single partition 'fallback_doms', it also forces the domains * to be rebuilt. * - * If doms_new == NULL it will be replaced with cpu_online_map. + * If doms_new == NULL it will be replaced with cpu_online_mask. * ndoms_new == 0 is a special case for destroying existing domains, * and it will not create the default domain. 
* * Call with hotplug lock held */ -void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, +/* FIXME: Change to struct cpumask *doms_new[] */ +void partition_sched_domains(int ndoms_new, struct cpumask *doms_new, struct sched_domain_attr *dattr_new) { int i, j, n; @@ -7811,7 +7825,7 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, /* Destroy deleted domains */ for (i = 0; i < ndoms_cur; i++) { for (j = 0; j < n; j++) { - if (cpus_equal(doms_cur[i], doms_new[j]) + if (cpumask_equal(&doms_cur[i], &doms_new[j]) && dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; } @@ -7831,7 +7845,7 @@ match1: /* Build new domains */ for (i = 0; i < ndoms_new; i++) { for (j = 0; j < ndoms_cur; j++) { - if (cpus_equal(doms_new[i], doms_cur[j]) + if (cpumask_equal(&doms_new[i], &doms_cur[j]) && dattrs_equal(dattr_new, i, dattr_cur, j)) goto match2; } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index bba00402ed9..08ffffd4a41 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1017,7 +1017,7 @@ static void yield_task_fair(struct rq *rq) * search starts with cpus closest then further out as needed, * so we always favor a closer, idle cpu. * Domains may include CPUs that are not usable for migration, - * hence we need to mask them out (cpu_active_map) + * hence we need to mask them out (cpu_active_mask) * * Returns the CPU we should wake onto. */ @@ -1244,7 +1244,7 @@ static int select_task_rq_fair(struct task_struct *p, int sync) } } - if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) + if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed))) goto out; /* diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 1f0e99d1a8c..fb3964579a8 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -923,7 +923,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) && + (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && (p->rt.nr_cpus_allowed > 1)) return 1; return 0; @@ -982,7 +982,7 @@ static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) static int find_lowest_rq(struct task_struct *task) { struct sched_domain *sd; - cpumask_t *lowest_mask = __get_cpu_var(local_cpu_mask); + struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); int this_cpu = smp_processor_id(); int cpu = task_cpu(task); @@ -997,7 +997,7 @@ static int find_lowest_rq(struct task_struct *task) * I guess we might want to change cpupri_find() to ignore those * in the first place. */ - cpus_and(*lowest_mask, *lowest_mask, cpu_active_map); + cpumask_and(lowest_mask, lowest_mask, cpu_active_mask); /* * At this point we have built a mask of cpus representing the @@ -1007,7 +1007,7 @@ static int find_lowest_rq(struct task_struct *task) * We prioritize the last cpu that the task executed on since * it is most likely cache-hot in that location. */ - if (cpu_isset(cpu, *lowest_mask)) + if (cpumask_test_cpu(cpu, lowest_mask)) return cpu; /* @@ -1064,8 +1064,8 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) * Also make sure that it wasn't scheduled on its rq. 
*/ if (unlikely(task_rq(task) != rq || - !cpu_isset(lowest_rq->cpu, - task->cpus_allowed) || + !cpumask_test_cpu(lowest_rq->cpu, + &task->cpus_allowed) || task_running(rq, task) || !task->se.on_rq)) { @@ -1315,9 +1315,9 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, } static void set_cpus_allowed_rt(struct task_struct *p, - const cpumask_t *new_mask) + const struct cpumask *new_mask) { - int weight = cpus_weight(*new_mask); + int weight = cpumask_weight(new_mask); BUG_ON(!rt_task(p)); @@ -1338,7 +1338,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, update_rt_migration(rq); } - p->cpus_allowed = *new_mask; + cpumask_copy(&p->cpus_allowed, new_mask); p->rt.nr_cpus_allowed = weight; } -- cgit v1.2.3 From bf4d83f66476086c6b50dc52aac00d71ad70494e Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 09:57:51 +1030 Subject: sched: convert nohz struct to cpumask_var_t, fix Impact: build fix Fix the !CONFIG_SMP case. Signed-off-by: Rusty Russell Acked-by: Mike Travis Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index eba6a156d33..1aa840a9f58 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8349,10 +8349,12 @@ void __init sched_init(void) /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ alloc_bootmem_cpumask_var(&nohz_cpu_mask); +#ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ alloc_bootmem_cpumask_var(&nohz.cpu_mask); #endif alloc_bootmem_cpumask_var(&cpu_isolated_map); +#endif /* SMP */ scheduler_running = 1; } -- cgit v1.2.3 From 3d8cbdf8650f44d95333ca645d950832a0653f35 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 09:58:41 +1030 Subject: sched: convert local_cpu_mask to cpumask_var_t, fix Impact: build fix for !CONFIG_SMP Signed-off-by: Rusty Russell Acked-by: Mike Travis Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index fb3964579a8..94aab72f6a0 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1381,6 +1381,14 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p, if (!rq->rt.rt_nr_running) pull_rt_task(rq); } + +static inline void init_sched_rt_class(void) +{ + unsigned int i; + + for_each_possible_cpu(i) + alloc_cpumask_var(&per_cpu(local_cpu_mask, i), GFP_KERNEL); +} #endif /* CONFIG_SMP */ /* @@ -1552,11 +1560,3 @@ static void print_rt_stats(struct seq_file *m, int cpu) } #endif /* CONFIG_SCHED_DEBUG */ -/* Note that this is never called for !SMP, but that's OK. */ -static inline void init_sched_rt_class(void) -{ - unsigned int i; - - for_each_possible_cpu(i) - alloc_cpumask_var(&per_cpu(local_cpu_mask, i), GFP_KERNEL); -} -- cgit v1.2.3 From 1224e376f2a7e3c7ab19ef37099a78597978a696 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 09:59:20 +1030 Subject: sched: avoid stack var in move_task_off_dead_cpu, fix Impact: locking fix We can't call cpuset_cpus_allowed_locked() with the rq lock held. However, the rq lock merely protects us from (1) cpu_online_mask changing and (2) someone else changing p->cpus_allowed. The first can't happen because we're being called from a cpu hotplug notifier. The second doesn't really matter: we are forcing the task off a CPU it was affine to, so we're not doing very well anyway. So we remove the rq lock from this path, and all is good. 
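For context, the lock-free destination choice that remains after this fix can be sketched as follows; the helper name is invented, the real code lives inline in move_task_off_dead_cpu():

	/* Sketch only: pick a destination without a temporary cpumask. */
	static int example_pick_dest(struct task_struct *p, const struct cpumask *nodemask)
	{
		int cpu;

		/* Prefer an allowed, online CPU on the same node. */
		for_each_cpu_and(cpu, nodemask, cpu_online_mask)
			if (cpumask_test_cpu(cpu, &p->cpus_allowed))
				return cpu;

		/* Otherwise any allowed, online CPU; >= nr_cpu_ids means none found. */
		return cpumask_any_and(&p->cpus_allowed, cpu_online_mask);
	}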
Signed-off-by: Rusty Russell Acked-by: Mike Travis Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 1aa840a9f58..3f5bfdc3d94 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6126,8 +6126,6 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) */ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) { - unsigned long flags; - struct rq *rq; int dest_cpu; /* FIXME: Use cpumask_of_node here. */ cpumask_t _nodemask = node_to_cpumask(cpu_to_node(dead_cpu)); @@ -6146,10 +6144,8 @@ again: /* No more Mr. Nice Guy. */ if (dest_cpu >= nr_cpu_ids) { - rq = task_rq_lock(p, &flags); cpuset_cpus_allowed_locked(p, &p->cpus_allowed); dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed); - task_rq_unlock(rq, &flags); /* * Don't tell them about moving exiting tasks or -- cgit v1.2.3 From 98a79d6a50181ca1ecf7400eda01d5dc1bc0dbf0 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 13 Dec 2008 21:19:41 +1030 Subject: cpumask: centralize cpu_online_map and cpu_possible_map Impact: cleanup Each SMP arch defines these themselves. Move them to a central location. Twists: 1) Some archs (m32, parisc, s390) set possible_map to all 1, so we add a CONFIG_INIT_ALL_POSSIBLE for this rather than break them. 2) mips and sparc32 '#define cpu_possible_map phys_cpu_present_map'. Those archs simply have phys_cpu_present_map replaced everywhere. 3) Alpha defined cpu_possible_map to cpu_present_map; this is tricky so I just manipulate them both in sync. 4) IA64, cris and m32r have gratuitous 'extern cpumask_t cpu_possible_map' declarations. Signed-off-by: Rusty Russell Reviewed-by: Grant Grundler Tested-by: Tony Luck Acked-by: Ingo Molnar Cc: Mike Travis Cc: ink@jurassic.park.msu.ru Cc: rmk@arm.linux.org.uk Cc: starvik@axis.com Cc: tony.luck@intel.com Cc: takata@linux-m32r.org Cc: ralf@linux-mips.org Cc: grundler@parisc-linux.org Cc: paulus@samba.org Cc: schwidefsky@de.ibm.com Cc: lethal@linux-sh.org Cc: wli@holomorphy.com Cc: davem@davemloft.net Cc: jdike@addtoit.com Cc: mingo@redhat.com --- kernel/cpu.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 8ea32e8d68b..bae131a1211 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -24,19 +24,20 @@ cpumask_t cpu_present_map __read_mostly; EXPORT_SYMBOL(cpu_present_map); -#ifndef CONFIG_SMP - /* * Represents all cpu's that are currently online. */ -cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; +cpumask_t cpu_online_map __read_mostly; EXPORT_SYMBOL(cpu_online_map); +#ifdef CONFIG_INIT_ALL_POSSIBLE cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; +#else +cpumask_t cpu_possible_map __read_mostly; +#endif EXPORT_SYMBOL(cpu_possible_map); -#else /* CONFIG_SMP */ - +#ifdef CONFIG_SMP /* Serializes the updates to cpu_online_map, cpu_present_map */ static DEFINE_MUTEX(cpu_add_remove_lock); -- cgit v1.2.3 From 29c0177e6a4ac094302bed54a1d4bbb6b740a9ef Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 13 Dec 2008 21:20:25 +1030 Subject: cpumask: change cpumask_scnprintf, cpumask_parse_user, cpulist_parse, and cpulist_scnprintf to take pointers. Impact: change calling convention of existing cpumask APIs Most cpumask functions started with cpus_: these have been replaced by cpumask_ ones which take struct cpumask pointers as expected. 
These four functions don't have good replacement names; fortunately they're rarely used, so we just change them over. Signed-off-by: Rusty Russell Signed-off-by: Mike Travis Acked-by: Ingo Molnar Cc: paulus@samba.org Cc: mingo@redhat.com Cc: tony.luck@intel.com Cc: ralf@linux-mips.org Cc: Greg Kroah-Hartman Cc: cl@linux-foundation.org Cc: srostedt@redhat.com --- kernel/cpuset.c | 4 ++-- kernel/irq/proc.c | 4 ++-- kernel/profile.c | 4 ++-- kernel/sched.c | 4 ++-- kernel/sched_stats.h | 2 +- kernel/taskstats.c | 2 +- kernel/trace/trace.c | 4 ++-- 7 files changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 96c0ba13b8c..39c1a4c1c5a 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -896,7 +896,7 @@ static int update_cpumask(struct cpuset *cs, const char *buf) if (!*buf) { cpus_clear(trialcs.cpus_allowed); } else { - retval = cpulist_parse(buf, trialcs.cpus_allowed); + retval = cpulist_parse(buf, &trialcs.cpus_allowed); if (retval < 0) return retval; @@ -1482,7 +1482,7 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) mask = cs->cpus_allowed; mutex_unlock(&callback_mutex); - return cpulist_scnprintf(page, PAGE_SIZE, mask); + return cpulist_scnprintf(page, PAGE_SIZE, &mask); } static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index d257e7d6a8a..f293349d49d 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -47,7 +47,7 @@ static ssize_t irq_affinity_proc_write(struct file *file, irq_balancing_disabled(irq)) return -EIO; - err = cpumask_parse_user(buffer, count, new_value); + err = cpumask_parse_user(buffer, count, &new_value); if (err) return err; @@ -95,7 +95,7 @@ static ssize_t default_affinity_write(struct file *file, cpumask_t new_value; int err; - err = cpumask_parse_user(buffer, count, new_value); + err = cpumask_parse_user(buffer, count, &new_value); if (err) return err; diff --git a/kernel/profile.c b/kernel/profile.c index dc41827fbfe..7d620dfdde5 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -442,7 +442,7 @@ void profile_tick(int type) static int prof_cpu_mask_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { - int len = cpumask_scnprintf(page, count, *(cpumask_t *)data); + int len = cpumask_scnprintf(page, count, (cpumask_t *)data); if (count - len < 2) return -EINVAL; len += sprintf(page + len, "\n"); @@ -456,7 +456,7 @@ static int prof_cpu_mask_write_proc(struct file *file, unsigned long full_count = count, err; cpumask_t new_value; - err = cpumask_parse_user(buffer, count, new_value); + err = cpumask_parse_user(buffer, count, &new_value); if (err) return err; diff --git a/kernel/sched.c b/kernel/sched.c index e4bb1dd7b30..d2d16d1273b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6666,7 +6666,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, struct sched_group *group = sd->groups; char str[256]; - cpulist_scnprintf(str, sizeof(str), sd->span); + cpulist_scnprintf(str, sizeof(str), &sd->span); cpus_clear(*groupmask); printk(KERN_DEBUG "%*s domain %d: ", level, "", level); @@ -6720,7 +6720,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpus_or(*groupmask, *groupmask, group->cpumask); - cpulist_scnprintf(str, sizeof(str), group->cpumask); + cpulist_scnprintf(str, sizeof(str), &group->cpumask); printk(KERN_CONT " %s", str); group = group->next; diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 
7dbf72a2b02..6beff1e4eea 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -42,7 +42,7 @@ static int show_schedstat(struct seq_file *seq, void *v) for_each_domain(cpu, sd) { enum cpu_idle_type itype; - cpumask_scnprintf(mask_str, mask_len, sd->span); + cpumask_scnprintf(mask_str, mask_len, &sd->span); seq_printf(seq, "domain%d %s", dcount++, mask_str); for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; itype++) { diff --git a/kernel/taskstats.c b/kernel/taskstats.c index bd6be76303c..6d7dc4ec4aa 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -352,7 +352,7 @@ static int parse(struct nlattr *na, cpumask_t *mask) if (!data) return -ENOMEM; nla_strlcpy(data, na, len); - ret = cpulist_parse(data, *mask); + ret = cpulist_parse(data, mask); kfree(data); return ret; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d86e3252f30..d2e75479dc5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2126,7 +2126,7 @@ tracing_cpumask_read(struct file *filp, char __user *ubuf, mutex_lock(&tracing_cpumask_update_lock); - len = cpumask_scnprintf(mask_str, count, tracing_cpumask); + len = cpumask_scnprintf(mask_str, count, &tracing_cpumask); if (count - len < 2) { count = -EINVAL; goto out_err; @@ -2147,7 +2147,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, int err, cpu; mutex_lock(&tracing_cpumask_update_lock); - err = cpumask_parse_user(ubuf, count, tracing_cpumask_new); + err = cpumask_parse_user(ubuf, count, &tracing_cpumask_new); if (err) goto err_unlock; -- cgit v1.2.3 From 0de26520c7cabf36e1de090ea8092f011a6106ce Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 13 Dec 2008 21:20:26 +1030 Subject: cpumask: make irq_set_affinity() take a const struct cpumask Impact: change existing irq_chip API Not much point with gentle transition here: the struct irq_chip's setaffinity method signature needs to change. Fortunately, not widely used code, but hits a few architectures. Note: In irq_select_affinity() I save a temporary in by mangling irq_desc[irq].affinity directly. Ingo, does this break anything? 
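For callers the conversion is mechanical: pass a pointer (usually cpumask_of(cpu), which points into the static cpu_bit_bitmap table rather than copying a mask onto the stack) instead of a cpumask_t by value. A minimal sketch, assuming a hypothetical driver that pins its interrupt to the CPU it probed on; nothing here is taken from the series itself:

/* Hypothetical caller, shown only to illustrate the new calling convention. */
#include <linux/interrupt.h>
#include <linux/cpumask.h>

static int example_bind_irq(unsigned int irq, unsigned int cpu)
{
        /* Old convention: a full cpumask_t was passed by value.
         *      return irq_set_affinity(irq, cpumask_of_cpu(cpu));
         */

        /* New convention: pass a const struct cpumask pointer. */
        return irq_set_affinity(irq, cpumask_of(cpu));
}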
(Folded in fix from KOSAKI Motohiro) Signed-off-by: Rusty Russell Signed-off-by: Mike Travis Reviewed-by: Grant Grundler Acked-by: Ingo Molnar Cc: ralf@linux-mips.org Cc: grundler@parisc-linux.org Cc: jeremy@xensource.com Cc: KOSAKI Motohiro --- kernel/irq/chip.c | 2 +- kernel/irq/manage.c | 22 ++++++++++------------ kernel/irq/migration.c | 14 +++++++------- kernel/irq/proc.c | 29 +++++++++++++++++++---------- kernel/time/tick-common.c | 6 +++--- 5 files changed, 40 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 10b5092e9bf..58d8e31daa4 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -45,7 +45,7 @@ void dynamic_irq_init(unsigned int irq) desc->irq_count = 0; desc->irqs_unhandled = 0; #ifdef CONFIG_SMP - cpus_setall(desc->affinity); + cpumask_setall(&desc->affinity); #endif spin_unlock_irqrestore(&desc->lock, flags); } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 801addda3c4..10ad2f87ed9 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -79,7 +79,7 @@ int irq_can_set_affinity(unsigned int irq) * @cpumask: cpumask * */ -int irq_set_affinity(unsigned int irq, cpumask_t cpumask) +int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; @@ -91,14 +91,14 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask) #ifdef CONFIG_GENERIC_PENDING_IRQ if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) { - desc->affinity = cpumask; + cpumask_copy(&desc->affinity, cpumask); desc->chip->set_affinity(irq, cpumask); } else { desc->status |= IRQ_MOVE_PENDING; - desc->pending_mask = cpumask; + cpumask_copy(&desc->pending_mask, cpumask); } #else - desc->affinity = cpumask; + cpumask_copy(&desc->affinity, cpumask); desc->chip->set_affinity(irq, cpumask); #endif desc->status |= IRQ_AFFINITY_SET; @@ -112,26 +112,24 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask) */ int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc) { - cpumask_t mask; - if (!irq_can_set_affinity(irq)) return 0; - cpus_and(mask, cpu_online_map, irq_default_affinity); - /* * Preserve an userspace affinity setup, but make sure that * one of the targets is online. */ if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { - if (cpus_intersects(desc->affinity, cpu_online_map)) - mask = desc->affinity; + if (cpumask_any_and(&desc->affinity, cpu_online_mask) + < nr_cpu_ids) + goto set_affinity; else desc->status &= ~IRQ_AFFINITY_SET; } - desc->affinity = mask; - desc->chip->set_affinity(irq, mask); + cpumask_and(&desc->affinity, cpu_online_mask, &irq_default_affinity); +set_affinity: + desc->chip->set_affinity(irq, &desc->affinity); return 0; } diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 9db681d9581..bd72329e630 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -4,7 +4,6 @@ void move_masked_irq(int irq) { struct irq_desc *desc = irq_to_desc(irq); - cpumask_t tmp; if (likely(!(desc->status & IRQ_MOVE_PENDING))) return; @@ -19,7 +18,7 @@ void move_masked_irq(int irq) desc->status &= ~IRQ_MOVE_PENDING; - if (unlikely(cpus_empty(desc->pending_mask))) + if (unlikely(cpumask_empty(&desc->pending_mask))) return; if (!desc->chip->set_affinity) @@ -27,8 +26,6 @@ void move_masked_irq(int irq) assert_spin_locked(&desc->lock); - cpus_and(tmp, desc->pending_mask, cpu_online_map); - /* * If there was a valid mask to work with, please * do the disable, re-program, enable sequence. 
@@ -41,10 +38,13 @@ void move_masked_irq(int irq) * For correct operation this depends on the caller * masking the irqs. */ - if (likely(!cpus_empty(tmp))) { - desc->chip->set_affinity(irq,tmp); + if (likely(cpumask_any_and(&desc->pending_mask, cpu_online_mask) + < nr_cpu_ids)) { + cpumask_and(&desc->affinity, + &desc->pending_mask, cpu_online_mask); + desc->chip->set_affinity(irq, &desc->affinity); } - cpus_clear(desc->pending_mask); + cpumask_clear(&desc->pending_mask); } void move_native_irq(int irq) diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index f293349d49d..8e91c976252 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -40,33 +40,42 @@ static ssize_t irq_affinity_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *pos) { unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data; - cpumask_t new_value; + cpumask_var_t new_value; int err; if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity || irq_balancing_disabled(irq)) return -EIO; - err = cpumask_parse_user(buffer, count, &new_value); + if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) + return -ENOMEM; + + err = cpumask_parse_user(buffer, count, new_value); if (err) - return err; + goto free_cpumask; - if (!is_affinity_mask_valid(new_value)) - return -EINVAL; + if (!is_affinity_mask_valid(*new_value)) { + err = -EINVAL; + goto free_cpumask; + } /* * Do not allow disabling IRQs completely - it's a too easy * way to make the system unusable accidentally :-) At least * one online CPU still has to be targeted. */ - if (!cpus_intersects(new_value, cpu_online_map)) + if (!cpumask_intersects(new_value, cpu_online_mask)) { /* Special case for empty set - allow the architecture code to set default SMP affinity. */ - return irq_select_affinity_usr(irq) ? -EINVAL : count; - - irq_set_affinity(irq, new_value); + err = irq_select_affinity_usr(irq) ? -EINVAL : count; + } else { + irq_set_affinity(irq, new_value); + err = count; + } - return count; +free_cpumask: + free_cpumask_var(new_value); + return err; } static int irq_affinity_proc_open(struct inode *inode, struct file *file) diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index df12434b43c..ab65d217583 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -136,7 +136,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) */ static void tick_setup_device(struct tick_device *td, struct clock_event_device *newdev, int cpu, - const cpumask_t *cpumask) + const struct cpumask *cpumask) { ktime_t next_event; void (*handler)(struct clock_event_device *) = NULL; @@ -171,8 +171,8 @@ static void tick_setup_device(struct tick_device *td, * When the device is not per cpu, pin the interrupt to the * current cpu: */ - if (!cpus_equal(newdev->cpumask, *cpumask)) - irq_set_affinity(newdev->irq, *cpumask); + if (!cpumask_equal(&newdev->cpumask, cpumask)) + irq_set_affinity(newdev->irq, cpumask); /* * When global broadcasting is active, check if the current -- cgit v1.2.3 From 320ab2b0b1e08e3805a3e1084a2f0eb1938d5d67 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 13 Dec 2008 21:20:26 +1030 Subject: cpumask: convert struct clock_event_device to cpumask pointers. Impact: change calling convention of existing clock_event APIs struct clock_event_timer's cpumask field gets changed to take pointer, as does the ->broadcast function. Another single-patch change. For safety, we BUG_ON() in clockevents_register_device() if it's not set. 
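For timer drivers the practical consequence is that ->cpumask must be a valid pointer before clockevents_register_device() is called, or the new BUG_ON() triggers; per-cpu tick devices can simply point it at cpumask_of(cpu). A hedged sketch of a per-cpu driver's setup after the conversion, where the driver name is hypothetical and the rate/callback setup is elided:

#include <linux/clockchips.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <linux/smp.h>

static DEFINE_PER_CPU(struct clock_event_device, example_clockevent);

static void example_clockevent_setup(void)
{
        unsigned int cpu = smp_processor_id();
        struct clock_event_device *evt = &per_cpu(example_clockevent, cpu);

        evt->name     = "example-timer";
        evt->features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT;
        evt->rating   = 300;
        /* ->set_mode, ->set_next_event, ->mult and ->shift setup elided. */

        /* The field is now 'const struct cpumask *': point it at the
         * static per-cpu mask instead of copying a cpumask_t into it. */
        evt->cpumask  = cpumask_of(cpu);

        clockevents_register_device(evt);       /* BUG_ON(!dev->cpumask) */
}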
Signed-off-by: Rusty Russell Cc: Ingo Molnar --- kernel/time/clockevents.c | 2 ++ kernel/time/tick-broadcast.c | 2 +- kernel/time/tick-common.c | 8 ++++---- 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index f8d968063ce..ea2f48af83c 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -166,6 +166,8 @@ static void clockevents_notify_released(void) void clockevents_register_device(struct clock_event_device *dev) { BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); + BUG_ON(!dev->cpumask); + /* * A nsec2cyc multiplicator of 0 is invalid and we'd crash * on it, so fix it up and emit a warning: diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index f98a1b7b16e..9590af2327b 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -150,7 +150,7 @@ static void tick_do_broadcast(cpumask_t mask) */ cpu = first_cpu(mask); td = &per_cpu(tick_cpu_device, cpu); - td->evtdev->broadcast(mask); + td->evtdev->broadcast(&mask); } } diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index ab65d217583..f8372be7412 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -171,7 +171,7 @@ static void tick_setup_device(struct tick_device *td, * When the device is not per cpu, pin the interrupt to the * current cpu: */ - if (!cpumask_equal(&newdev->cpumask, cpumask)) + if (!cpumask_equal(newdev->cpumask, cpumask)) irq_set_affinity(newdev->irq, cpumask); /* @@ -202,14 +202,14 @@ static int tick_check_new_device(struct clock_event_device *newdev) spin_lock_irqsave(&tick_device_lock, flags); cpu = smp_processor_id(); - if (!cpu_isset(cpu, newdev->cpumask)) + if (!cpumask_test_cpu(cpu, newdev->cpumask)) goto out_bc; td = &per_cpu(tick_cpu_device, cpu); curdev = td->evtdev; /* cpu local device ? */ - if (!cpus_equal(newdev->cpumask, cpumask_of_cpu(cpu))) { + if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu))) { /* * If the cpu affinity of the device interrupt can not @@ -222,7 +222,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) * If we have a cpu local device already, do not replace it * by a non cpu local device */ - if (curdev && cpus_equal(curdev->cpumask, cpumask_of_cpu(cpu))) + if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu))) goto out_bc; } -- cgit v1.2.3 From afb8a9b70b86866a60e08b2956ae4e1406390336 Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Thu, 18 Dec 2008 23:26:09 +0530 Subject: sched: framework for sched_mc/smt_power_savings=N Impact: extend range of /sys/devices/system/cpu/sched_mc_power_savings Currently the sched_mc/smt_power_savings variable is a boolean, which either enables or disables topology based power savings. This patch extends the behaviour of the variable from boolean to multivalued, such that based on the value, we decide how aggressively do we want to perform powersavings balance at appropriate sched domain based on topology. Variable levels of power saving tunable would benefit end user to match the required level of power savings vs performance trade-off depending on the system configuration and workloads. This version makes the sched_mc_power_savings global variable to take more values (0,1,2). Later versions can have a single tunable called sched_power_savings instead of sched_{mc,smt}_power_savings. 
Signed-off-by: Gautham R Shenoy Signed-off-by: Vaidyanathan Srinivasan Acked-by: Balbir Singh Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index b309027bf9e..56b285cd535 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7906,14 +7906,25 @@ int arch_reinit_sched_domains(void) static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) { int ret; + unsigned int level = 0; - if (buf[0] != '0' && buf[0] != '1') + if (sscanf(buf, "%u", &level) != 1) + return -EINVAL; + + /* + * level is always be positive so don't check for + * level < POWERSAVINGS_BALANCE_NONE which is 0 + * What happens on 0 or 1 byte write, + * need to check for count as well? + */ + + if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS) return -EINVAL; if (smt) - sched_smt_power_savings = (buf[0] == '1'); + sched_smt_power_savings = level; else - sched_mc_power_savings = (buf[0] == '1'); + sched_mc_power_savings = level; ret = arch_reinit_sched_domains(); -- cgit v1.2.3 From d5679bd11916eba5c8ee9033003e1a5ce56ece9a Mon Sep 17 00:00:00 2001 From: Vaidyanathan Srinivasan Date: Thu, 18 Dec 2008 23:26:16 +0530 Subject: sched: favour lower logical cpu number for sched_mc balance Impact: change load-balancing direction to match that of irqbalanced Just in case two groups have identical load, prefer to move load to lower logical cpu number rather than the present logic of moving to higher logical number. find_busiest_group() tries to look for a group_leader that has spare capacity to take more tasks and freeup an appropriate least loaded group. Just in case there is a tie and the load is equal, then the group with higher logical number is favoured. This conflicts with user space irqbalance daemon that will move interrupts to lower logical number if the system utilisation is very low. Signed-off-by: Vaidyanathan Srinivasan Acked-by: Balbir Singh Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 56b285cd535..94b9d11e331 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3241,7 +3241,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, */ if ((sum_nr_running < min_nr_running) || (sum_nr_running == min_nr_running && - cpumask_first(sched_group_cpus(group)) < + cpumask_first(sched_group_cpus(group)) > cpumask_first(sched_group_cpus(group_min)))) { group_min = group; min_nr_running = sum_nr_running; @@ -3257,7 +3257,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (sum_nr_running <= group_capacity - 1) { if (sum_nr_running > leader_nr_running || (sum_nr_running == leader_nr_running && - cpumask_first(sched_group_cpus(group)) > + cpumask_first(sched_group_cpus(group)) < cpumask_first(sched_group_cpus(group_leader)))) { group_leader = group; leader_nr_running = sum_nr_running; -- cgit v1.2.3 From 7a09b1a27b1e5a4957e4af9951420fea02c44fba Mon Sep 17 00:00:00 2001 From: Vaidyanathan Srinivasan Date: Thu, 18 Dec 2008 23:26:22 +0530 Subject: sched: nominate preferred wakeup cpu Impact: extend load-balancing code (no change in behavior yet) When the system utilisation is low and more cpus are idle, then the process waking up from sleep should prefer to wakeup an idle cpu from semi-idle cpu package (multi core package) rather than a completely idle cpu package which would waste power. 
Use the sched_mc balance logic in find_busiest_group() to nominate a preferred wakeup cpu. This info can be stored in appropriate sched_domain, but updating this info in all copies of sched_domain is not practical. Hence this information is stored in root_domain struct which is one copy per partitioned sched domain. The root_domain can be accessed from each cpu's runqueue and there is one copy per partitioned sched domain. Signed-off-by: Vaidyanathan Srinivasan Acked-by: Balbir Singh Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 94b9d11e331..c1b8b3031eb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -509,6 +509,14 @@ struct root_domain { #ifdef CONFIG_SMP struct cpupri cpupri; #endif +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) + /* + * Preferred wake up cpu nominated by sched_mc balance that will be + * used when most cpus are idle in the system indicating overall very + * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2) + */ + unsigned int sched_mc_preferred_wakeup_cpu; +#endif }; /* @@ -3384,6 +3392,10 @@ out_balanced: if (this == group_leader && group_leader != group_min) { *imbalance = min_load_per_task; + if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { + cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = + first_cpu(group_leader->cpumask); + } return group_min; } #endif -- cgit v1.2.3 From 7eb52dfa70dbf5232b5b83ec4357e6bebaa8fde8 Mon Sep 17 00:00:00 2001 From: Vaidyanathan Srinivasan Date: Thu, 18 Dec 2008 23:26:29 +0530 Subject: sched: bias task wakeups to preferred semi-idle packages Impact: tweak task wakeup to save power more agressively Preferred wakeup cpu (from a semi idle package) has been nominated in find_busiest_group() in the previous patch. Use this information in sched_mc_preferred_wakeup_cpu in function wake_idle() to bias task wakeups if the following conditions are satisfied: - The present cpu that is trying to wakeup the process is idle and waking the target process on this cpu will potentially wakeup a completely idle package - The previous cpu on which the target process ran is also idle and hence selecting the previous cpu may wakeup a semi idle cpu package - The task being woken up is allowed to run in the nominated cpu (cpu affinity and restrictions) Basically if both the current cpu and the previous cpu on which the task ran is idle, select the nominated cpu from semi idle cpu package for running the new task that is waking up. Cache hotness is considered since the actual biasing happens in wake_idle() only if the application is cache cold. This technique will effectively move short running bursty jobs in a mostly idle system. Wakeup biasing for power savings gets automatically disabled if system utilisation increases due to the fact that the probability of finding both this_cpu and prev_cpu idle decreases. 
Signed-off-by: Vaidyanathan Srinivasan Acked-by: Balbir Singh Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 08ffffd4a41..36b5e34fa99 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1026,6 +1026,24 @@ static int wake_idle(int cpu, struct task_struct *p) { struct sched_domain *sd; int i; + unsigned int chosen_wakeup_cpu; + int this_cpu; + + /* + * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu + * are idle and this is not a kernel thread and this task's affinity + * allows it to be moved to preferred cpu, then just move! + */ + + this_cpu = smp_processor_id(); + chosen_wakeup_cpu = + cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu; + + if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP && + idle_cpu(cpu) && idle_cpu(this_cpu) && + p->mm && !(p->flags & PF_KTHREAD) && + cpu_isset(chosen_wakeup_cpu, p->cpus_allowed)) + return chosen_wakeup_cpu; /* * If it is idle, then it is the best cpu to run this task. -- cgit v1.2.3 From ad273b32e482cdef306eac32b28d97f513a022f4 Mon Sep 17 00:00:00 2001 From: Vaidyanathan Srinivasan Date: Thu, 18 Dec 2008 23:26:36 +0530 Subject: sched: activate active load balancing in new idle cpus Impact: tweak task balancing to save power more agressively Active load balancing is a process by which migration thread is woken up on the target CPU in order to pull current running task on another package into this newly idle package. This method is already in use with normal load_balance(), this patch introduces this method to new idle cpus when sched_mc is set to POWERSAVINGS_BALANCE_WAKEUP. This logic provides effective consolidation of short running daemon jobs in a almost idle system The side effect of this patch may be ping-ponging of tasks if the system is moderately utilised. May need to adjust the iterations before triggering. Signed-off-by: Vaidyanathan Srinivasan Acked-by: Balbir Singh Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c1b8b3031eb..8fc0d5aa43b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3670,10 +3670,64 @@ redo: } if (!ld_moved) { + int active_balance; + schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return -1; + + if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) + return -1; + + if (sd->nr_balance_failed++ < 2) + return -1; + + /* + * The only task running in a non-idle cpu can be moved to this + * cpu in an attempt to completely freeup the other CPU + * package. The same method used to move task in load_balance() + * have been extended for load_balance_newidle() to speedup + * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2) + * + * The package power saving logic comes from + * find_busiest_group(). If there are no imbalance, then + * f_b_g() will return NULL. However when sched_mc={1,2} then + * f_b_g() will select a group from which a running task may be + * pulled to this cpu in order to make the other package idle. + * If there is no opportunity to make a package idle and if + * there are no imbalance, then f_b_g() will return NULL and no + * action will be taken in load_balance_newidle(). 
+ * + * Under normal task pull operation due to imbalance, there + * will be more than one task in the source run queue and + * move_tasks() will succeed. ld_moved will be true and this + * active balance code will not be triggered. + */ + + /* Lock busiest in correct order while this_rq is held */ + double_lock_balance(this_rq, busiest); + + /* + * don't kick the migration_thread, if the curr + * task on busiest cpu can't be moved to this_cpu + */ + if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { + double_unlock_balance(this_rq, busiest); + all_pinned = 1; + return ld_moved; + } + + if (!busiest->active_balance) { + busiest->active_balance = 1; + busiest->push_cpu = this_cpu; + active_balance = 1; + } + + double_unlock_balance(this_rq, busiest); + if (active_balance) + wake_up_process(busiest->migration_thread); + } else sd->nr_balance_failed = 0; -- cgit v1.2.3 From 9924da434a13668fceb208d56dbdf86d166862cc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 19 Dec 2008 00:53:40 +0100 Subject: sched: fix warning in kernel/sched.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: fix cpumask conversion bug this warning: kernel/sched.c: In function ‘find_busiest_group’: kernel/sched.c:3429: warning: passing argument 1 of ‘__first_cpu’ from incompatible pointer type shows that we forgot to convert a new patch to the new cpumask APIs. Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 8fc0d5aa43b..ae5ca3f9e77 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3394,7 +3394,7 @@ out_balanced: *imbalance = min_load_per_task; if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = - first_cpu(group_leader->cpumask); + cpumask_first(sched_group_cpus(group_leader)); } return group_min; } -- cgit v1.2.3 From 36dffab679c7eeb91c2507400cf4da6e9e01164e Mon Sep 17 00:00:00 2001 From: Vaidyanathan Srinivasan Date: Sat, 20 Dec 2008 10:06:38 +0530 Subject: sched: nominate preferred wakeup cpu, fix Andrew Morton reported: > kernel/sched.c: In function 'schedule': > kernel/sched.c:3679: warning: 'active_balance' may be used uninitialized in this function > > This warning is correct - the code is buggy. In sched.c load_balance_newidle, there's real potential use of uninitialised variable - fix it. Signed-off-by: Vaidyanathan Srinivasan Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index ae5ca3f9e77..756d981d91a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3670,7 +3670,7 @@ redo: } if (!ld_moved) { - int active_balance; + int active_balance = 0; schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && -- cgit v1.2.3 From be4d638c1597580ed2294d899d9f1a2cd10e462c Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 26 Dec 2008 22:23:43 +1030 Subject: cpumask: Replace cpu_coregroup_map with cpu_coregroup_mask cpu_coregroup_map returned a cpumask_t: it's going away. (Note, the sched part of this patch won't apply meaningfully to the sched tree, but I'm posting it to show the goal). 
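For users of the topology helper the change is again by-value to by-pointer. A minimal sketch, assuming a hypothetical helper that counts the CPUs sharing a core group with a given cpu:

#include <linux/cpumask.h>
#include <linux/topology.h>

static unsigned int example_coregroup_weight(int cpu)
{
        /* Old: cpu_coregroup_map() returned a cpumask_t by value,
         * which needed a stack (or percpu) copy:
         *      cpumask_t mask = cpu_coregroup_map(cpu);
         *      return cpus_weight(mask);
         */

        /* New: cpu_coregroup_mask() hands back a pointer to an
         * existing mask, so nothing is copied. */
        return cpumask_weight(cpu_coregroup_mask(cpu));
}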
Signed-off-by: Rusty Russell Signed-off-by: Mike Travis Cc: Jens Axboe Cc: Ingo Molnar --- kernel/sched.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index d2d16d1273b..42929239830 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7119,7 +7119,7 @@ cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, { int group; #ifdef CONFIG_SCHED_MC - *mask = cpu_coregroup_map(cpu); + *mask = *cpu_coregroup_mask(cpu); cpus_and(*mask, *mask, *cpu_map); group = first_cpu(*mask); #elif defined(CONFIG_SCHED_SMT) @@ -7485,7 +7485,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, sd = &per_cpu(core_domains, i); SD_INIT(sd, MC); set_domain_attribute(sd, attr); - sd->span = cpu_coregroup_map(i); + sd->span = *cpu_coregroup_mask(i); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; @@ -7528,7 +7528,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SCHED_CPUMASK_VAR(this_core_map, allmasks); SCHED_CPUMASK_VAR(send_covered, allmasks); - *this_core_map = cpu_coregroup_map(i); + *this_core_map = *cpu_coregroup_mask(i); cpus_and(*this_core_map, *this_core_map, *cpu_map); if (i != first_cpu(*this_core_map)) continue; -- cgit v1.2.3 From b3199c025d1646e25e7d1d640dd605db251dccf8 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 30 Dec 2008 09:05:14 +1030 Subject: cpumask: switch over to cpu_online/possible/active/present_mask: core Impact: cleanup This implements the obsolescent cpu_online_map in terms of cpu_online_mask, rather than the other way around. Same for the other maps. The documentation comments are also updated to refer to _mask rather than _map. Signed-off-by: Rusty Russell Signed-off-by: Mike Travis --- kernel/cpu.c | 49 +++++++++++++++++++++++-------------------------- 1 file changed, 23 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index bae131a1211..3ddc509b19c 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -15,30 +15,8 @@ #include #include -/* - * Represents all cpu's present in the system - * In systems capable of hotplug, this map could dynamically grow - * as new cpu's are detected in the system via any platform specific - * method, such as ACPI for e.g. - */ -cpumask_t cpu_present_map __read_mostly; -EXPORT_SYMBOL(cpu_present_map); - -/* - * Represents all cpu's that are currently online. - */ -cpumask_t cpu_online_map __read_mostly; -EXPORT_SYMBOL(cpu_online_map); - -#ifdef CONFIG_INIT_ALL_POSSIBLE -cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; -#else -cpumask_t cpu_possible_map __read_mostly; -#endif -EXPORT_SYMBOL(cpu_possible_map); - #ifdef CONFIG_SMP -/* Serializes the updates to cpu_online_map, cpu_present_map */ +/* Serializes the updates to cpu_online_mask, cpu_present_mask */ static DEFINE_MUTEX(cpu_add_remove_lock); static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); @@ -65,8 +43,6 @@ void __init cpu_hotplug_init(void) cpu_hotplug.refcount = 0; } -cpumask_t cpu_active_map; - #ifdef CONFIG_HOTPLUG_CPU void get_online_cpus(void) @@ -97,7 +73,7 @@ EXPORT_SYMBOL_GPL(put_online_cpus); /* * The following two API's must be used when attempting - * to serialize the updates to cpu_online_map, cpu_present_map. + * to serialize the updates to cpu_online_mask, cpu_present_mask. 
*/ void cpu_maps_update_begin(void) { @@ -503,3 +479,24 @@ EXPORT_SYMBOL_GPL(cpu_bit_bitmap); const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL; EXPORT_SYMBOL(cpu_all_bits); + +#ifdef CONFIG_INIT_ALL_POSSIBLE +static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly + = CPU_BITS_ALL; +#else +static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly; +#endif +const struct cpumask *const cpu_possible_mask = to_cpumask(cpu_possible_bits); +EXPORT_SYMBOL(cpu_possible_mask); + +static DECLARE_BITMAP(cpu_online_bits, CONFIG_NR_CPUS) __read_mostly; +const struct cpumask *const cpu_online_mask = to_cpumask(cpu_online_bits); +EXPORT_SYMBOL(cpu_online_mask); + +static DECLARE_BITMAP(cpu_present_bits, CONFIG_NR_CPUS) __read_mostly; +const struct cpumask *const cpu_present_mask = to_cpumask(cpu_present_bits); +EXPORT_SYMBOL(cpu_present_mask); + +static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly; +const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits); +EXPORT_SYMBOL(cpu_active_mask); -- cgit v1.2.3 From 3fa41520696fec2815e2d88fbcccdda77ba4d693 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 30 Dec 2008 09:05:16 +1030 Subject: cpumask: make set_cpu_*/init_cpu_* out-of-line They're only for use in boot/cpu hotplug code anyway, and this avoids the use of deprecated cpu_*_map. Stephen Rothwell points out that gcc 4.2.4 (on powerpc at least) didn't like the cast away of const anyway: include/linux/cpumask.h: In function 'set_cpu_possible': include/linux/cpumask.h:1052: warning: passing argument 2 of 'cpumask_set_cpu' discards qualifiers from pointer target type So this kills two birds with one stone. Signed-off-by: Rusty Russell --- kernel/cpu.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 3ddc509b19c..2c9f78f3a2f 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -500,3 +500,50 @@ EXPORT_SYMBOL(cpu_present_mask); static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly; const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits); EXPORT_SYMBOL(cpu_active_mask); + +void set_cpu_possible(unsigned int cpu, bool possible) +{ + if (possible) + cpumask_set_cpu(cpu, to_cpumask(cpu_possible_bits)); + else + cpumask_clear_cpu(cpu, to_cpumask(cpu_possible_bits)); +} + +void set_cpu_present(unsigned int cpu, bool present) +{ + if (present) + cpumask_set_cpu(cpu, to_cpumask(cpu_present_bits)); + else + cpumask_clear_cpu(cpu, to_cpumask(cpu_present_bits)); +} + +void set_cpu_online(unsigned int cpu, bool online) +{ + if (online) + cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits)); + else + cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits)); +} + +void set_cpu_active(unsigned int cpu, bool active) +{ + if (active) + cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits)); + else + cpumask_clear_cpu(cpu, to_cpumask(cpu_active_bits)); +} + +void init_cpu_present(const struct cpumask *src) +{ + cpumask_copy(to_cpumask(cpu_present_bits), src); +} + +void init_cpu_possible(const struct cpumask *src) +{ + cpumask_copy(to_cpumask(cpu_possible_bits), src); +} + +void init_cpu_online(const struct cpumask *src) +{ + cpumask_copy(to_cpumask(cpu_online_bits), src); +} -- cgit v1.2.3 From 54b11e6d57a10aa9d0009efd93873e17bffd5d30 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 30 Dec 2008 09:05:16 +1030 Subject: cpumask: smp_call_function_many() Impact: Implementation change to remove cpumask_t from stack. 
Actually change smp_call_function_mask() to smp_call_function_many(). We avoid cpumasks on the stack in this version. (S390 has its own version, but that's going away apparently). We have to do some dancing to figure out if 0 or 1 other cpus are in the mask supplied and the online mask without allocating a tmp cpumask. It's still fairly cheap. We allocate the cpumask at the end of the call_function_data structure: if allocation fails we fallback to smp_call_function_single rather than using the baroque quiescing code (which needs a cpumask on stack). (Thanks to Hiroshi Shimamoto for spotting several bugs in previous versions!) Signed-off-by: Rusty Russell Signed-off-by: Mike Travis Cc: Hiroshi Shimamoto Cc: npiggin@suse.de Cc: axboe@kernel.dk --- kernel/smp.c | 139 +++++++++++++++++++++-------------------------------------- 1 file changed, 49 insertions(+), 90 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 75c8dde58c5..9f0eafed139 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -24,8 +24,8 @@ struct call_function_data { struct call_single_data csd; spinlock_t lock; unsigned int refs; - cpumask_t cpumask; struct rcu_head rcu_head; + unsigned long cpumask_bits[]; }; struct call_single_queue { @@ -110,13 +110,13 @@ void generic_smp_call_function_interrupt(void) list_for_each_entry_rcu(data, &call_function_queue, csd.list) { int refs; - if (!cpu_isset(cpu, data->cpumask)) + if (!cpumask_test_cpu(cpu, to_cpumask(data->cpumask_bits))) continue; data->csd.func(data->csd.info); spin_lock(&data->lock); - cpu_clear(cpu, data->cpumask); + cpumask_clear_cpu(cpu, to_cpumask(data->cpumask_bits)); WARN_ON(data->refs == 0); data->refs--; refs = data->refs; @@ -266,51 +266,13 @@ void __smp_call_function_single(int cpu, struct call_single_data *data) generic_exec_single(cpu, data); } -/* Dummy function */ -static void quiesce_dummy(void *unused) -{ -} - -/* - * Ensure stack based data used in call function mask is safe to free. - * - * This is needed by smp_call_function_mask when using on-stack data, because - * a single call function queue is shared by all CPUs, and any CPU may pick up - * the data item on the queue at any time before it is deleted. So we need to - * ensure that all CPUs have transitioned through a quiescent state after - * this call. - * - * This is a very slow function, implemented by sending synchronous IPIs to - * all possible CPUs. For this reason, we have to alloc data rather than use - * stack based data even in the case of synchronous calls. The stack based - * data is then just used for deadlock/oom fallback which will be very rare. - * - * If a faster scheme can be made, we could go back to preferring stack based - * data -- the data allocation/free is non-zero cost. - */ -static void smp_call_function_mask_quiesce_stack(cpumask_t mask) -{ - struct call_single_data data; - int cpu; - - data.func = quiesce_dummy; - data.info = NULL; - - for_each_cpu_mask(cpu, mask) { - data.flags = CSD_FLAG_WAIT; - generic_exec_single(cpu, &data); - } -} - /** - * smp_call_function_mask(): Run a function on a set of other CPUs. - * @mask: The set of cpus to run on. + * smp_call_function_many(): Run a function on a set of other CPUs. + * @mask: The set of cpus to run on (only runs on online subset). * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait (atomically) until function has completed on other CPUs. * - * Returns 0 on success, else a negative status code. 
- * * If @wait is true, then returns once @func has returned. Note that @wait * will be implicitly turned on in case of allocation failures, since * we fall back to on-stack allocation. @@ -319,53 +281,57 @@ static void smp_call_function_mask_quiesce_stack(cpumask_t mask) * hardware interrupt handler or from a bottom half handler. Preemption * must be disabled when calling this function. */ -int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, - int wait) +void smp_call_function_many(const struct cpumask *mask, + void (*func)(void *), void *info, + bool wait) { - struct call_function_data d; - struct call_function_data *data = NULL; - cpumask_t allbutself; + struct call_function_data *data; unsigned long flags; - int cpu, num_cpus; - int slowpath = 0; + int cpu, next_cpu; /* Can deadlock when called with interrupts disabled */ WARN_ON(irqs_disabled()); - cpu = smp_processor_id(); - allbutself = cpu_online_map; - cpu_clear(cpu, allbutself); - cpus_and(mask, mask, allbutself); - num_cpus = cpus_weight(mask); - - /* - * If zero CPUs, return. If just a single CPU, turn this request - * into a targetted single call instead since it's faster. - */ - if (!num_cpus) - return 0; - else if (num_cpus == 1) { - cpu = first_cpu(mask); - return smp_call_function_single(cpu, func, info, wait); + /* So, what's a CPU they want? Ignoring this one. */ + cpu = cpumask_first_and(mask, cpu_online_mask); + if (cpu == smp_processor_id()) + cpu = cpumask_next_and(cpu, mask, cpu_online_mask); + /* No online cpus? We're done. */ + if (cpu >= nr_cpu_ids) + return; + + /* Do we have another CPU which isn't us? */ + next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask); + if (next_cpu == smp_processor_id()) + next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask); + + /* Fastpath: do that cpu by itself. */ + if (next_cpu >= nr_cpu_ids) { + smp_call_function_single(cpu, func, info, wait); + return; } - data = kmalloc(sizeof(*data), GFP_ATOMIC); - if (data) { - data->csd.flags = CSD_FLAG_ALLOC; - if (wait) - data->csd.flags |= CSD_FLAG_WAIT; - } else { - data = &d; - data->csd.flags = CSD_FLAG_WAIT; - wait = 1; - slowpath = 1; + data = kmalloc(sizeof(*data) + cpumask_size(), GFP_ATOMIC); + if (unlikely(!data)) { + /* Slow path. */ + for_each_online_cpu(cpu) { + if (cpu == smp_processor_id()) + continue; + if (cpumask_test_cpu(cpu, mask)) + smp_call_function_single(cpu, func, info, wait); + } + return; } spin_lock_init(&data->lock); + data->csd.flags = CSD_FLAG_ALLOC; + if (wait) + data->csd.flags |= CSD_FLAG_WAIT; data->csd.func = func; data->csd.info = info; - data->refs = num_cpus; - data->cpumask = mask; + cpumask_and(to_cpumask(data->cpumask_bits), mask, cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), to_cpumask(data->cpumask_bits)); + data->refs = cpumask_weight(to_cpumask(data->cpumask_bits)); spin_lock_irqsave(&call_function_lock, flags); list_add_tail_rcu(&data->csd.list, &call_function_queue); @@ -377,18 +343,13 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, smp_mb(); /* Send a message to all CPUs in the map */ - arch_send_call_function_ipi(mask); + arch_send_call_function_ipi(*to_cpumask(data->cpumask_bits)); /* optionally wait for the CPUs to complete */ - if (wait) { + if (wait) csd_flag_wait(&data->csd); - if (unlikely(slowpath)) - smp_call_function_mask_quiesce_stack(mask); - } - - return 0; } -EXPORT_SYMBOL(smp_call_function_mask); +EXPORT_SYMBOL(smp_call_function_many); /** * smp_call_function(): Run a function on all other CPUs. 
@@ -396,7 +357,7 @@ EXPORT_SYMBOL(smp_call_function_mask); * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait (atomically) until function has completed on other CPUs. * - * Returns 0 on success, else a negative status code. + * Returns 0. * * If @wait is true, then returns once @func has returned; otherwise * it returns just before the target cpu calls @func. In case of allocation @@ -407,12 +368,10 @@ EXPORT_SYMBOL(smp_call_function_mask); */ int smp_call_function(void (*func)(void *), void *info, int wait) { - int ret; - preempt_disable(); - ret = smp_call_function_mask(cpu_online_map, func, info, wait); + smp_call_function_many(cpu_online_mask, func, info, wait); preempt_enable(); - return ret; + return 0; } EXPORT_SYMBOL(smp_call_function); -- cgit v1.2.3 From ce47d974f71af26d00832e83a43ac79bec272d99 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 30 Dec 2008 09:05:17 +1030 Subject: cpumask: arch_send_call_function_ipi_mask: core Impact: new API to reduce stack usage We're weaning the core code off handing cpumask's around on-stack. This introduces arch_send_call_function_ipi_mask(). Signed-off-by: Rusty Russell --- kernel/smp.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 9f0eafed139..172b1826890 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -266,6 +266,12 @@ void __smp_call_function_single(int cpu, struct call_single_data *data) generic_exec_single(cpu, data); } +/* FIXME: Shim for archs using old arch_send_call_function_ipi API. */ +#ifndef arch_send_call_function_ipi_mask +#define arch_send_call_function_ipi_mask(maskp) \ + arch_send_call_function_ipi(*(maskp)) +#endif + /** * smp_call_function_many(): Run a function on a set of other CPUs. * @mask: The set of cpus to run on (only runs on online subset). @@ -343,7 +349,7 @@ void smp_call_function_many(const struct cpumask *mask, smp_mb(); /* Send a message to all CPUs in the map */ - arch_send_call_function_ipi(*to_cpumask(data->cpumask_bits)); + arch_send_call_function_ipi_mask(to_cpumask(data->cpumask_bits)); /* optionally wait for the CPUs to complete */ if (wait) -- cgit v1.2.3 From 457533a7d3402d1d91fbc125c8bd1bd16dcd3cd4 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Wed, 31 Dec 2008 15:11:37 +0100 Subject: [PATCH] fix scaled & unscaled cputime accounting The utimescaled / stimescaled fields in the task structure and the global cpustat should be set on all architectures. On s390 the calls to account_user_time_scaled and account_system_time_scaled never have been added. In addition system time that is accounted as guest time to the user time of a process is accounted to the scaled system time instead of the scaled user time. To fix the bugs and to prevent future forgetfulness this patch merges account_system_time_scaled into account_system_time and account_user_time_scaled into account_user_time. 
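The caller-side effect of the merge is that the scaled value now travels with the raw one, so an architecture cannot account one and forget the other. A hedged sketch of what a VIRT_CPU_ACCOUNTING architecture's user-time accounting call looks like after this patch; the function and variable names are illustrative:

#include <linux/kernel_stat.h>
#include <linux/sched.h>

static void example_vtime_account_user(struct task_struct *tsk,
                                       cputime_t delta,
                                       cputime_t delta_scaled)
{
        /* Before: two separate calls, easy to miss the second one
         * (which is exactly what happened on s390):
         *      account_user_time(tsk, delta);
         *      account_user_time_scaled(tsk, delta_scaled);
         */

        /* After: a single call carries both values. */
        account_user_time(tsk, delta, delta_scaled);
}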
Cc: Benjamin Herrenschmidt Cc: Hidetoshi Seto Cc: Tony Luck Cc: Jeremy Fitzhardinge Cc: Chris Wright Cc: Michael Neuling Acked-by: Paul Mackerras Signed-off-by: Martin Schwidefsky --- kernel/sched.c | 41 ++++++++++++++++------------------------- kernel/time/tick-sched.c | 5 +++-- kernel/timer.c | 12 +++++------- 3 files changed, 24 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index fff1c4a20b6..5b03679ff71 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4080,13 +4080,17 @@ unsigned long long task_delta_exec(struct task_struct *p) * Account user cpu time to a process. * @p: the process that the cpu time gets accounted to * @cputime: the cpu time spent in user space since the last update + * @cputime_scaled: cputime scaled by cpu frequency */ -void account_user_time(struct task_struct *p, cputime_t cputime) +void account_user_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; cputime64_t tmp; + /* Add user time to process. */ p->utime = cputime_add(p->utime, cputime); + p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); account_group_user_time(p, cputime); /* Add user time to cpustat. */ @@ -4103,51 +4107,49 @@ void account_user_time(struct task_struct *p, cputime_t cputime) * Account guest cpu time to a process. * @p: the process that the cpu time gets accounted to * @cputime: the cpu time spent in virtual machine since the last update + * @cputime_scaled: cputime scaled by cpu frequency */ -static void account_guest_time(struct task_struct *p, cputime_t cputime) +static void account_guest_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled) { cputime64_t tmp; struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; tmp = cputime_to_cputime64(cputime); + /* Add guest time to process. */ p->utime = cputime_add(p->utime, cputime); + p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); account_group_user_time(p, cputime); p->gtime = cputime_add(p->gtime, cputime); + /* Add guest time to cpustat. */ cpustat->user = cputime64_add(cpustat->user, tmp); cpustat->guest = cputime64_add(cpustat->guest, tmp); } -/* - * Account scaled user cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in user space since the last update - */ -void account_user_time_scaled(struct task_struct *p, cputime_t cputime) -{ - p->utimescaled = cputime_add(p->utimescaled, cputime); -} - /* * Account system cpu time to a process. * @p: the process that the cpu time gets accounted to * @hardirq_offset: the offset to subtract from hardirq_count() * @cputime: the cpu time spent in kernel space since the last update + * @cputime_scaled: cputime scaled by cpu frequency */ void account_system_time(struct task_struct *p, int hardirq_offset, - cputime_t cputime) + cputime_t cputime, cputime_t cputime_scaled) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; struct rq *rq = this_rq(); cputime64_t tmp; if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { - account_guest_time(p, cputime); + account_guest_time(p, cputime, cputime_scaled); return; } + /* Add system time to process. */ p->stime = cputime_add(p->stime, cputime); + p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); account_group_system_time(p, cputime); /* Add system time to cpustat. 
*/ @@ -4166,17 +4168,6 @@ void account_system_time(struct task_struct *p, int hardirq_offset, acct_update_integrals(p); } -/* - * Account scaled system cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @hardirq_offset: the offset to subtract from hardirq_count() - * @cputime: the cpu time spent in kernel space since the last update - */ -void account_system_time_scaled(struct task_struct *p, cputime_t cputime) -{ - p->stimescaled = cputime_add(p->stimescaled, cputime); -} - /* * Account for involuntary wait time. * @p: the process from which the cpu time has been stolen diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 8f3fc2582d3..1f2fce2479f 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -420,6 +420,7 @@ void tick_nohz_restart_sched_tick(void) int cpu = smp_processor_id(); struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); unsigned long ticks; + cputime_t cputime; ktime_t now; local_irq_disable(); @@ -452,8 +453,8 @@ void tick_nohz_restart_sched_tick(void) */ if (ticks && ticks < LONG_MAX) { add_preempt_count(HARDIRQ_OFFSET); - account_system_time(current, HARDIRQ_OFFSET, - jiffies_to_cputime(ticks)); + cputime = jiffies_to_cputime(ticks); + account_system_time(current, HARDIRQ_OFFSET, cputime, cputime); sub_preempt_count(HARDIRQ_OFFSET); } diff --git a/kernel/timer.c b/kernel/timer.c index 566257d1dc1..b5efb528aa1 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1023,13 +1023,11 @@ void account_process_tick(struct task_struct *p, int user_tick) { cputime_t one_jiffy = jiffies_to_cputime(1); - if (user_tick) { - account_user_time(p, one_jiffy); - account_user_time_scaled(p, cputime_to_scaled(one_jiffy)); - } else { - account_system_time(p, HARDIRQ_OFFSET, one_jiffy); - account_system_time_scaled(p, cputime_to_scaled(one_jiffy)); - } + if (user_tick) + account_user_time(p, one_jiffy, cputime_to_scaled(one_jiffy)); + else + account_system_time(p, HARDIRQ_OFFSET, one_jiffy, + cputime_to_scaled(one_jiffy)); } #endif -- cgit v1.2.3 From 79741dd35713ff4f6fd0eafd59fa94e8a4ba922d Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Wed, 31 Dec 2008 15:11:38 +0100 Subject: [PATCH] idle cputime accounting The cpu time spent by the idle process actually doing something is currently accounted as idle time. This is plain wrong, the architectures that support VIRT_CPU_ACCOUNTING=y can do better: distinguish between the time spent doing nothing and the time spent by idle doing work. The first is accounted with account_idle_time and the second with account_system_time. The architectures that use the account_xxx_time interface directly and not the account_xxx_ticks interface now need to do the check for the idle process in their arch code. In particular to improve the system vs true idle time accounting the arch code needs to measure the true idle time instead of just testing for the idle process. To improve the tick based accounting as well we would need an architecture primitive that can tell us if the pt_regs of the interrupted context points to the magic instruction that halts the cpu. In addition idle time is no more added to the stime of the idle process. This field now contains the system time of the idle process as it should be. On systems without VIRT_CPU_ACCOUNTING this will always be zero as every tick that occurs while idle is running will be accounted as idle time. This patch contains the necessary common code changes to be able to distinguish idle system time and true idle time. 
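For the architectures that call the account_xxx_time interface directly (the VIRT_CPU_ACCOUNTING case discussed next), the idle check moves into arch code: time idle spent doing real work is fed to account_system_time(), and only true idle time goes to account_idle_time(). A hedged sketch of such an exit-from-idle path; the function, the split into two deltas, and the names are illustrative, not taken from any real architecture:

#include <linux/kernel_stat.h>
#include <linux/sched.h>

static void example_vtime_account_idle_exit(struct task_struct *idle,
                                            cputime_t idle_time,
                                            cputime_t busy_time)
{
        /* Time the idle task truly spent halted: idle (or iowait). */
        account_idle_time(idle_time);

        /* Time the idle task spent doing work: now accounted as system
         * time of the idle task, so idle->stime holds real system time. */
        account_system_time(idle, 0, busy_time,
                            cputime_to_scaled(busy_time));
}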
The architectures with support for VIRT_CPU_ACCOUNTING need some changes to exploit this. Signed-off-by: Martin Schwidefsky --- kernel/sched.c | 80 ++++++++++++++++++++++++++++++++++++++---------- kernel/time/tick-sched.c | 13 ++++---- kernel/timer.c | 13 -------- 3 files changed, 69 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 5b03679ff71..635eaffe1e4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4139,7 +4139,6 @@ void account_system_time(struct task_struct *p, int hardirq_offset, cputime_t cputime, cputime_t cputime_scaled) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - struct rq *rq = this_rq(); cputime64_t tmp; if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { @@ -4158,37 +4157,84 @@ void account_system_time(struct task_struct *p, int hardirq_offset, cpustat->irq = cputime64_add(cpustat->irq, tmp); else if (softirq_count()) cpustat->softirq = cputime64_add(cpustat->softirq, tmp); - else if (p != rq->idle) - cpustat->system = cputime64_add(cpustat->system, tmp); - else if (atomic_read(&rq->nr_iowait) > 0) - cpustat->iowait = cputime64_add(cpustat->iowait, tmp); else - cpustat->idle = cputime64_add(cpustat->idle, tmp); + cpustat->system = cputime64_add(cpustat->system, tmp); + /* Account for system time used */ acct_update_integrals(p); } /* * Account for involuntary wait time. - * @p: the process from which the cpu time has been stolen * @steal: the cpu time spent in involuntary wait */ -void account_steal_time(struct task_struct *p, cputime_t steal) +void account_steal_time(cputime_t cputime) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t cputime64 = cputime_to_cputime64(cputime); + + cpustat->steal = cputime64_add(cpustat->steal, cputime64); +} + +/* + * Account for idle time. + * @cputime: the cpu time spent in idle wait + */ +void account_idle_time(cputime_t cputime) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - cputime64_t tmp = cputime_to_cputime64(steal); + cputime64_t cputime64 = cputime_to_cputime64(cputime); struct rq *rq = this_rq(); - if (p == rq->idle) { - p->stime = cputime_add(p->stime, steal); - if (atomic_read(&rq->nr_iowait) > 0) - cpustat->iowait = cputime64_add(cpustat->iowait, tmp); - else - cpustat->idle = cputime64_add(cpustat->idle, tmp); - } else - cpustat->steal = cputime64_add(cpustat->steal, tmp); + if (atomic_read(&rq->nr_iowait) > 0) + cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); + else + cpustat->idle = cputime64_add(cpustat->idle, cputime64); +} + +#ifndef CONFIG_VIRT_CPU_ACCOUNTING + +/* + * Account a single tick of cpu time. + * @p: the process that the cpu time gets accounted to + * @user_tick: indicates if the tick is a user or a system tick + */ +void account_process_tick(struct task_struct *p, int user_tick) +{ + cputime_t one_jiffy = jiffies_to_cputime(1); + cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy); + struct rq *rq = this_rq(); + + if (user_tick) + account_user_time(p, one_jiffy, one_jiffy_scaled); + else if (p != rq->idle) + account_system_time(p, HARDIRQ_OFFSET, one_jiffy, + one_jiffy_scaled); + else + account_idle_time(one_jiffy); +} + +/* + * Account multiple ticks of steal time. + * @p: the process from which the cpu time has been stolen + * @ticks: number of stolen ticks + */ +void account_steal_ticks(unsigned long ticks) +{ + account_steal_time(jiffies_to_cputime(ticks)); +} + +/* + * Account multiple ticks of idle time. 
+ * @ticks: number of stolen ticks + */ +void account_idle_ticks(unsigned long ticks) +{ + account_idle_time(jiffies_to_cputime(ticks)); } +#endif + /* * Use precise platform statistics if available: */ diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 1f2fce2479f..611fa4c0baa 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -419,8 +419,9 @@ void tick_nohz_restart_sched_tick(void) { int cpu = smp_processor_id(); struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); +#ifndef CONFIG_VIRT_CPU_ACCOUNTING unsigned long ticks; - cputime_t cputime; +#endif ktime_t now; local_irq_disable(); @@ -442,6 +443,7 @@ void tick_nohz_restart_sched_tick(void) tick_do_update_jiffies64(now); cpu_clear(cpu, nohz_cpu_mask); +#ifndef CONFIG_VIRT_CPU_ACCOUNTING /* * We stopped the tick in idle. Update process times would miss the * time we slept as update_process_times does only a 1 tick @@ -451,12 +453,9 @@ void tick_nohz_restart_sched_tick(void) /* * We might be one off. Do not randomly account a huge number of ticks! */ - if (ticks && ticks < LONG_MAX) { - add_preempt_count(HARDIRQ_OFFSET); - cputime = jiffies_to_cputime(ticks); - account_system_time(current, HARDIRQ_OFFSET, cputime, cputime); - sub_preempt_count(HARDIRQ_OFFSET); - } + if (ticks && ticks < LONG_MAX) + account_idle_ticks(ticks); +#endif touch_softlockup_watchdog(); /* diff --git a/kernel/timer.c b/kernel/timer.c index b5efb528aa1..dee3f641a7a 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1018,19 +1018,6 @@ unsigned long get_next_timer_interrupt(unsigned long now) } #endif -#ifndef CONFIG_VIRT_CPU_ACCOUNTING -void account_process_tick(struct task_struct *p, int user_tick) -{ - cputime_t one_jiffy = jiffies_to_cputime(1); - - if (user_tick) - account_user_time(p, one_jiffy, cputime_to_scaled(one_jiffy)); - else - account_system_time(p, HARDIRQ_OFFSET, one_jiffy, - cputime_to_scaled(one_jiffy)); -} -#endif - /* * Called from the timer interrupt handler to charge one tick to the current * process. user_tick is 1 if the tick is user time, 0 for system. -- cgit v1.2.3 From 4f4b6c1a94a8735bbdc030a2911cf395495645b6 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 1 Jan 2009 10:12:15 +1030 Subject: cpumask: prepare for iterators to only go to nr_cpu_ids/nr_cpumask_bits.: core Impact: cleanup In future, all cpumask ops will only be valid (in general) for bit numbers < nr_cpu_ids. So use that instead of NR_CPUS in iterators and other comparisons. This is always safe: no cpu number can be >= nr_cpu_ids, and nr_cpu_ids is initialized to NR_CPUS at boot. Signed-off-by: Rusty Russell Signed-off-by: Mike Travis Acked-by: Ingo Molnar Acked-by: James Morris Cc: Eric Biederman --- kernel/kexec.c | 2 +- kernel/smp.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index ac0fde7b54d..3fb855ad6aa 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1116,7 +1116,7 @@ void crash_save_cpu(struct pt_regs *regs, int cpu) struct elf_prstatus prstatus; u32 *buf; - if ((cpu < 0) || (cpu >= NR_CPUS)) + if ((cpu < 0) || (cpu >= nr_cpu_ids)) return; /* Using ELF notes here is opportunistic. 
diff --git a/kernel/smp.c b/kernel/smp.c index 172b1826890..5cfa0e5e3e8 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -223,7 +223,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, local_irq_save(flags); func(info); local_irq_restore(flags); - } else if ((unsigned)cpu < NR_CPUS && cpu_online(cpu)) { + } else if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { struct call_single_data *data = NULL; if (!wait) { -- cgit v1.2.3 From 9e01c1b74c9531e301c900edaa92a99fcb7738f2 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 1 Jan 2009 10:12:22 +1030 Subject: cpumask: convert kernel trace functions Impact: Reduce future memory usage, use new cpumask API. (Eventually, cpumask_var_t will be allocated based on nr_cpu_ids, not NR_CPUS). Convert kernel trace functions to use struct cpumask API: 1) Use cpumask_copy/cpumask_test_cpu/for_each_cpu. 2) Use cpumask_var_t and alloc_cpumask_var/free_cpumask_var everywhere. 3) Use on_each_cpu instead of playing with current->cpus_allowed. Signed-off-by: Rusty Russell Signed-off-by: Mike Travis Acked-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 42 ++++++++++++++++++------------- kernel/trace/trace.c | 60 ++++++++++++++++++++++++++------------------ kernel/trace/trace_sysprof.c | 13 +++------- 3 files changed, 64 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1d601a7c458..a9d9760dc7b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -195,7 +195,7 @@ void *ring_buffer_event_data(struct ring_buffer_event *event) EXPORT_SYMBOL_GPL(ring_buffer_event_data); #define for_each_buffer_cpu(buffer, cpu) \ - for_each_cpu_mask(cpu, buffer->cpumask) + for_each_cpu(cpu, buffer->cpumask) #define TS_SHIFT 27 #define TS_MASK ((1ULL << TS_SHIFT) - 1) @@ -267,7 +267,7 @@ struct ring_buffer { unsigned pages; unsigned flags; int cpus; - cpumask_t cpumask; + cpumask_var_t cpumask; atomic_t record_disabled; struct mutex mutex; @@ -458,6 +458,9 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) if (!buffer) return NULL; + if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) + goto fail_free_buffer; + buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); buffer->flags = flags; @@ -465,14 +468,14 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) if (buffer->pages == 1) buffer->pages++; - buffer->cpumask = cpu_possible_map; + cpumask_copy(buffer->cpumask, cpu_possible_mask); buffer->cpus = nr_cpu_ids; bsize = sizeof(void *) * nr_cpu_ids; buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()), GFP_KERNEL); if (!buffer->buffers) - goto fail_free_buffer; + goto fail_free_cpumask; for_each_buffer_cpu(buffer, cpu) { buffer->buffers[cpu] = @@ -492,6 +495,9 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) } kfree(buffer->buffers); + fail_free_cpumask: + free_cpumask_var(buffer->cpumask); + fail_free_buffer: kfree(buffer); return NULL; @@ -510,6 +516,8 @@ ring_buffer_free(struct ring_buffer *buffer) for_each_buffer_cpu(buffer, cpu) rb_free_cpu_buffer(buffer->buffers[cpu]); + free_cpumask_var(buffer->cpumask); + kfree(buffer); } EXPORT_SYMBOL_GPL(ring_buffer_free); @@ -1283,7 +1291,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, cpu = raw_smp_processor_id(); - if (!cpu_isset(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask)) goto out; cpu_buffer = buffer->buffers[cpu]; @@ -1396,7 +1404,7 @@ int ring_buffer_write(struct ring_buffer 
*buffer, cpu = raw_smp_processor_id(); - if (!cpu_isset(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask)) goto out; cpu_buffer = buffer->buffers[cpu]; @@ -1478,7 +1486,7 @@ void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - if (!cpu_isset(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; cpu_buffer = buffer->buffers[cpu]; @@ -1498,7 +1506,7 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - if (!cpu_isset(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; cpu_buffer = buffer->buffers[cpu]; @@ -1515,7 +1523,7 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - if (!cpu_isset(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; @@ -1532,7 +1540,7 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - if (!cpu_isset(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; @@ -1850,7 +1858,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) struct buffer_page *reader; int nr_loops = 0; - if (!cpu_isset(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; cpu_buffer = buffer->buffers[cpu]; @@ -2025,7 +2033,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) struct ring_buffer_event *event; unsigned long flags; - if (!cpu_isset(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; spin_lock_irqsave(&cpu_buffer->reader_lock, flags); @@ -2062,7 +2070,7 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu) struct ring_buffer_iter *iter; unsigned long flags; - if (!cpu_isset(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; iter = kmalloc(sizeof(*iter), GFP_KERNEL); @@ -2172,7 +2180,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; unsigned long flags; - if (!cpu_isset(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; spin_lock_irqsave(&cpu_buffer->reader_lock, flags); @@ -2228,7 +2236,7 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - if (!cpu_isset(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 1; cpu_buffer = buffer->buffers[cpu]; @@ -2252,8 +2260,8 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, struct ring_buffer_per_cpu *cpu_buffer_a; struct ring_buffer_per_cpu *cpu_buffer_b; - if (!cpu_isset(cpu, buffer_a->cpumask) || - !cpu_isset(cpu, buffer_b->cpumask)) + if (!cpumask_test_cpu(cpu, buffer_a->cpumask) || + !cpumask_test_cpu(cpu, buffer_b->cpumask)) return -EINVAL; /* At least make sure the two buffers are somewhat the same */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0e91f43b6ba..5d04e27f3b4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -89,10 +89,10 @@ static inline void ftrace_enable_cpu(void) preempt_enable(); } -static cpumask_t __read_mostly tracing_buffer_mask; +static cpumask_var_t __read_mostly tracing_buffer_mask; #define for_each_tracing_cpu(cpu) \ - for_each_cpu_mask(cpu, tracing_buffer_mask) + for_each_cpu(cpu, tracing_buffer_mask) /* * 
ftrace_dump_on_oops - variable to dump ftrace buffer on oops @@ -2646,13 +2646,7 @@ static struct file_operations show_traces_fops = { /* * Only trace on a CPU if the bitmask is set: */ -static cpumask_t tracing_cpumask = CPU_MASK_ALL; - -/* - * When tracing/tracing_cpu_mask is modified then this holds - * the new bitmask we are about to install: - */ -static cpumask_t tracing_cpumask_new; +static cpumask_var_t tracing_cpumask; /* * The tracer itself will not take this lock, but still we want @@ -2674,7 +2668,7 @@ tracing_cpumask_read(struct file *filp, char __user *ubuf, mutex_lock(&tracing_cpumask_update_lock); - len = cpumask_scnprintf(mask_str, count, &tracing_cpumask); + len = cpumask_scnprintf(mask_str, count, tracing_cpumask); if (count - len < 2) { count = -EINVAL; goto out_err; @@ -2693,9 +2687,13 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, size_t count, loff_t *ppos) { int err, cpu; + cpumask_var_t tracing_cpumask_new; + + if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) + return -ENOMEM; mutex_lock(&tracing_cpumask_update_lock); - err = cpumask_parse_user(ubuf, count, &tracing_cpumask_new); + err = cpumask_parse_user(ubuf, count, tracing_cpumask_new); if (err) goto err_unlock; @@ -2706,26 +2704,28 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, * Increase/decrease the disabled counter if we are * about to flip a bit in the cpumask: */ - if (cpu_isset(cpu, tracing_cpumask) && - !cpu_isset(cpu, tracing_cpumask_new)) { + if (cpumask_test_cpu(cpu, tracing_cpumask) && + !cpumask_test_cpu(cpu, tracing_cpumask_new)) { atomic_inc(&global_trace.data[cpu]->disabled); } - if (!cpu_isset(cpu, tracing_cpumask) && - cpu_isset(cpu, tracing_cpumask_new)) { + if (!cpumask_test_cpu(cpu, tracing_cpumask) && + cpumask_test_cpu(cpu, tracing_cpumask_new)) { atomic_dec(&global_trace.data[cpu]->disabled); } } __raw_spin_unlock(&ftrace_max_lock); local_irq_enable(); - tracing_cpumask = tracing_cpumask_new; + cpumask_copy(tracing_cpumask, tracing_cpumask_new); mutex_unlock(&tracing_cpumask_update_lock); + free_cpumask_var(tracing_cpumask_new); return count; err_unlock: mutex_unlock(&tracing_cpumask_update_lock); + free_cpumask_var(tracing_cpumask); return err; } @@ -3752,7 +3752,6 @@ void ftrace_dump(void) static DEFINE_SPINLOCK(ftrace_dump_lock); /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; - static cpumask_t mask; static int dump_ran; unsigned long flags; int cnt = 0, cpu; @@ -3786,8 +3785,6 @@ void ftrace_dump(void) * and then release the locks again. 
*/ - cpus_clear(mask); - while (!trace_empty(&iter)) { if (!cnt) @@ -3823,19 +3820,28 @@ __init static int tracer_alloc_buffers(void) { struct trace_array_cpu *data; int i; + int ret = -ENOMEM; - /* TODO: make the number of buffers hot pluggable with CPUS */ - tracing_buffer_mask = cpu_possible_map; + if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) + goto out; + + if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) + goto out_free_buffer_mask; + cpumask_copy(tracing_buffer_mask, cpu_possible_mask); + cpumask_copy(tracing_cpumask, cpu_all_mask); + + /* TODO: make the number of buffers hot pluggable with CPUS */ global_trace.buffer = ring_buffer_alloc(trace_buf_size, TRACE_BUFFER_FLAGS); if (!global_trace.buffer) { printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); WARN_ON(1); - return 0; + goto out_free_cpumask; } global_trace.entries = ring_buffer_size(global_trace.buffer); + #ifdef CONFIG_TRACER_MAX_TRACE max_tr.buffer = ring_buffer_alloc(trace_buf_size, TRACE_BUFFER_FLAGS); @@ -3843,7 +3849,7 @@ __init static int tracer_alloc_buffers(void) printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); WARN_ON(1); ring_buffer_free(global_trace.buffer); - return 0; + goto out_free_cpumask; } max_tr.entries = ring_buffer_size(max_tr.buffer); WARN_ON(max_tr.entries != global_trace.entries); @@ -3873,8 +3879,14 @@ __init static int tracer_alloc_buffers(void) &trace_panic_notifier); register_die_notifier(&trace_die_notifier); + ret = 0; - return 0; +out_free_cpumask: + free_cpumask_var(tracing_cpumask); +out_free_buffer_mask: + free_cpumask_var(tracing_buffer_mask); +out: + return ret; } early_initcall(tracer_alloc_buffers); fs_initcall(tracer_init_debugfs); diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index a5779bd975d..eaca5ad803f 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -196,9 +196,9 @@ static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer) return HRTIMER_RESTART; } -static void start_stack_timer(int cpu) +static void start_stack_timer(void *unused) { - struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu); + struct hrtimer *hrtimer = &__get_cpu_var(stack_trace_hrtimer); hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer->function = stack_trace_timer_fn; @@ -208,14 +208,7 @@ static void start_stack_timer(int cpu) static void start_stack_timers(void) { - cpumask_t saved_mask = current->cpus_allowed; - int cpu; - - for_each_online_cpu(cpu) { - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - start_stack_timer(cpu); - } - set_cpus_allowed_ptr(current, &saved_mask); + on_each_cpu(start_stack_timer, NULL, 1); } static void stop_stack_timer(int cpu) -- cgit v1.2.3 From 4462344ee9ea9224d026801b877887f2f39774a3 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 1 Jan 2009 10:12:23 +1030 Subject: cpumask: convert kernel trace functions further Impact: Reduce future memory usage, use new cpumask API. Since the last patch was created and acked, more old cpumask users slipped into kernel/trace. Mostly trivial conversions, except struct trace_iterator's "started" member becomes a cpumask_var_t. 
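(Illustration, not part of the patch: a minimal sketch of the cpumask_var_t conversion pattern described above, using made-up struct and function names rather than the actual trace code. The mask is allocated when the owning object is created and released when it is destroyed; on CONFIG_CPUMASK_OFFSTACK=y kernels cpumask_var_t is a pointer, otherwise it is an ordinary array, so the same code covers both cases.)

    #include <linux/cpumask.h>
    #include <linux/slab.h>

    struct my_iter {                        /* hypothetical stand-in for trace_iterator */
            cpumask_var_t started;          /* was: cpumask_t started; */
    };

    static struct my_iter *my_iter_open(void)
    {
            struct my_iter *iter = kzalloc(sizeof(*iter), GFP_KERNEL);

            if (!iter)
                    return NULL;
            if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {
                    kfree(iter);
                    return NULL;
            }
            cpumask_clear(iter->started);   /* alloc_cpumask_var() does not zero the mask */
            return iter;
    }

    static void my_iter_release(struct my_iter *iter)
    {
            free_cpumask_var(iter->started);
            kfree(iter);
    }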
Signed-off-by: Rusty Russell --- kernel/trace/trace.c | 12 +++++++++--- kernel/trace/trace.h | 2 +- kernel/trace/trace_boot.c | 2 +- kernel/trace/trace_functions_graph.c | 2 +- kernel/trace/trace_hw_branches.c | 6 +++--- kernel/trace/trace_power.c | 2 +- 6 files changed, 16 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5d04e27f3b4..c580233add9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1811,10 +1811,10 @@ static void test_cpu_buff_start(struct trace_iterator *iter) if (!(iter->iter_flags & TRACE_FILE_ANNOTATE)) return; - if (cpu_isset(iter->cpu, iter->started)) + if (cpumask_test_cpu(iter->cpu, iter->started)) return; - cpu_set(iter->cpu, iter->started); + cpumask_set_cpu(iter->cpu, iter->started); trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu); } @@ -3114,10 +3114,15 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) if (!iter) return -ENOMEM; + if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { + kfree(iter); + return -ENOMEM; + } + mutex_lock(&trace_types_lock); /* trace pipe does not show start of buffer */ - cpus_setall(iter->started); + cpumask_setall(iter->started); iter->tr = &global_trace; iter->trace = current_trace; @@ -3134,6 +3139,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) { struct trace_iterator *iter = file->private_data; + free_cpumask_var(iter->started); kfree(iter); atomic_dec(&tracing_reader); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index cc7a4f86403..4d3d381bfd9 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -368,7 +368,7 @@ struct trace_iterator { loff_t pos; long idx; - cpumask_t started; + cpumask_var_t started; }; int tracing_is_enabled(void); diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 3ccebde2848..366c8c333e1 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -42,7 +42,7 @@ static int boot_trace_init(struct trace_array *tr) int cpu; boot_trace = tr; - for_each_cpu_mask(cpu, cpu_possible_map) + for_each_cpu(cpu, cpu_possible_mask) tracing_reset(tr, cpu); tracing_sched_switch_assign_trace(tr); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 4bf39fcae97..930c08e5b38 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -79,7 +79,7 @@ print_graph_cpu(struct trace_seq *s, int cpu) int i; int ret; int log10_this = log10_cpu(cpu); - int log10_all = log10_cpu(cpus_weight_nr(cpu_online_map)); + int log10_all = log10_cpu(cpumask_weight(cpu_online_mask)); /* diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index b6a3e20a49a..649df22d435 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -46,7 +46,7 @@ static void bts_trace_start(struct trace_array *tr) tracing_reset_online_cpus(tr); - for_each_cpu_mask(cpu, cpu_possible_map) + for_each_cpu(cpu, cpu_possible_mask) smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1); } @@ -62,7 +62,7 @@ static void bts_trace_stop(struct trace_array *tr) { int cpu; - for_each_cpu_mask(cpu, cpu_possible_map) + for_each_cpu(cpu, cpu_possible_mask) smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1); } @@ -172,7 +172,7 @@ static void trace_bts_prepare(struct trace_iterator *iter) { int cpu; - for_each_cpu_mask(cpu, cpu_possible_map) + for_each_cpu(cpu, cpu_possible_mask) smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 
1); } diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c index a7172a352f6..7bda248daf5 100644 --- a/kernel/trace/trace_power.c +++ b/kernel/trace/trace_power.c @@ -39,7 +39,7 @@ static int power_trace_init(struct trace_array *tr) trace_power_enabled = 1; - for_each_cpu_mask(cpu, cpu_possible_map) + for_each_cpu(cpu, cpu_possible_mask) tracing_reset(tr, cpu); return 0; } -- cgit v1.2.3 From f1fc057c79cb2d27602fb3ad08a031f13459ef27 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 1 Jan 2009 10:12:23 +1030 Subject: cpumask: remove any_online_cpu() users: kernel/ Impact: Remove obsolete API usage any_online_cpu() is a good name, but it takes a cpumask_t, not a pointer. There are several places where any_online_cpu() doesn't really want a mask arg at all. Replace all callers with cpumask_any() and cpumask_any_and(). Signed-off-by: Rusty Russell Signed-off-by: Mike Travis --- kernel/softirq.c | 2 +- kernel/softlockup.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 466e75ce271..b7568d7def2 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -733,7 +733,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, break; /* Unbind so it can run. Fall thru. */ kthread_bind(per_cpu(ksoftirqd, hotcpu), - any_online_cpu(cpu_online_map)); + cpumask_any(cpu_online_mask)); case CPU_DEAD: case CPU_DEAD_FROZEN: { struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 1ab790c67b1..492f0c72fec 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -303,7 +303,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: - check_cpu = any_online_cpu(cpu_online_map); + check_cpu = cpumask_any(cpu_online_mask); wake_up_process(per_cpu(watchdog_task, hotcpu)); break; #ifdef CONFIG_HOTPLUG_CPU @@ -313,7 +313,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) cpumask_t temp_cpu_online_map = cpu_online_map; cpu_clear(hotcpu, temp_cpu_online_map); - check_cpu = any_online_cpu(temp_cpu_online_map); + check_cpu = cpumask_any(&temp_cpu_online_map); } break; @@ -323,7 +323,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) break; /* Unbind so it can run. Fall thru. */ kthread_bind(per_cpu(watchdog_task, hotcpu), - any_online_cpu(cpu_online_map)); + cpumask_any(cpu_online_mask)); case CPU_DEAD: case CPU_DEAD_FROZEN: p = per_cpu(watchdog_task, hotcpu); -- cgit v1.2.3 From a45185d2d7108b01b90b9e0293377be4d6346dde Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 1 Jan 2009 10:12:24 +1030 Subject: cpumask: convert kernel/compat.c Impact: Reduce stack usage, use new cpumask API. Straightforward conversion; cpumasks' size is given by cpumask_size() (now a variable rather than fixed) and on-stack cpu masks use cpumask_var_t. 
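(Illustration, not part of the patch: a simplified sketch of the on-stack conversion pattern, with a hypothetical helper name. The fixed-size cpumask_t on the stack becomes an allocated cpumask_var_t, and sizeof(cpumask_t) becomes cpumask_size(), which reflects the runtime mask size rather than NR_CPUS.)

    #include <linux/cpumask.h>
    #include <linux/slab.h>
    #include <linux/string.h>

    /* Hypothetical consumer of a user-supplied mask length. */
    static int copy_mask_bits(struct cpumask *dst, unsigned int len)
    {
            if (len < cpumask_size())
                    memset(dst, 0, cpumask_size());
            return 0;
    }

    static int example(unsigned int len)
    {
            cpumask_var_t mask;             /* replaces: cpumask_t mask; on the stack */
            int ret;

            if (!alloc_cpumask_var(&mask, GFP_KERNEL))
                    return -ENOMEM;

            ret = copy_mask_bits(mask, len);

            free_cpumask_var(mask);
            return ret;
    }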
Signed-off-by: Rusty Russell --- kernel/compat.c | 49 ++++++++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 8eafe3eb50d..d52e2ec1deb 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -454,16 +454,16 @@ asmlinkage long compat_sys_waitid(int which, compat_pid_t pid, } static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr, - unsigned len, cpumask_t *new_mask) + unsigned len, struct cpumask *new_mask) { unsigned long *k; - if (len < sizeof(cpumask_t)) - memset(new_mask, 0, sizeof(cpumask_t)); - else if (len > sizeof(cpumask_t)) - len = sizeof(cpumask_t); + if (len < cpumask_size()) + memset(new_mask, 0, cpumask_size()); + else if (len > cpumask_size()) + len = cpumask_size(); - k = cpus_addr(*new_mask); + k = cpumask_bits(new_mask); return compat_get_bitmap(k, user_mask_ptr, len * 8); } @@ -471,40 +471,51 @@ asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid, unsigned int len, compat_ulong_t __user *user_mask_ptr) { - cpumask_t new_mask; + cpumask_var_t new_mask; int retval; - retval = compat_get_user_cpu_mask(user_mask_ptr, len, &new_mask); + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) + return -ENOMEM; + + retval = compat_get_user_cpu_mask(user_mask_ptr, len, new_mask); if (retval) - return retval; + goto out; - return sched_setaffinity(pid, &new_mask); + retval = sched_setaffinity(pid, new_mask); +out: + free_cpumask_var(new_mask); + return retval; } asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, compat_ulong_t __user *user_mask_ptr) { int ret; - cpumask_t mask; + cpumask_var_t mask; unsigned long *k; - unsigned int min_length = sizeof(cpumask_t); + unsigned int min_length = cpumask_size(); - if (NR_CPUS <= BITS_PER_COMPAT_LONG) + if (nr_cpu_ids <= BITS_PER_COMPAT_LONG) min_length = sizeof(compat_ulong_t); if (len < min_length) return -EINVAL; - ret = sched_getaffinity(pid, &mask); + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + ret = sched_getaffinity(pid, mask); if (ret < 0) - return ret; + goto out; - k = cpus_addr(mask); + k = cpumask_bits(mask); ret = compat_put_bitmap(user_mask_ptr, k, min_length * 8); - if (ret) - return ret; + if (ret == 0) + ret = min_length; - return min_length; +out: + free_cpumask_var(mask); + return ret; } int get_compat_itimerspec(struct itimerspec *dst, -- cgit v1.2.3 From e7577c50f2fb2d1c167e2c04a4b4c2cc042acb82 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 1 Jan 2009 10:12:25 +1030 Subject: cpumask: convert kernel/workqueue.c Impact: Reduce memory usage, use new cpumask API. cpu_populated_map becomes a cpumask_var_t, and cpu_singlethread_map is simply a cpumask pointer: it's simply the cpumask containing the first possible CPU anyway. Signed-off-by: Rusty Russell --- kernel/workqueue.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4952322cba4..2f445833ae3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -73,7 +73,7 @@ static DEFINE_SPINLOCK(workqueue_lock); static LIST_HEAD(workqueues); static int singlethread_cpu __read_mostly; -static cpumask_t cpu_singlethread_map __read_mostly; +static const struct cpumask *cpu_singlethread_map __read_mostly; /* * _cpu_down() first removes CPU from cpu_online_map, then CPU_DEAD * flushes cwq->worklist. 
This means that flush_workqueue/wait_on_work @@ -81,7 +81,7 @@ static cpumask_t cpu_singlethread_map __read_mostly; * use cpu_possible_map, the cpumask below is more a documentation * than optimization. */ -static cpumask_t cpu_populated_map __read_mostly; +static cpumask_var_t cpu_populated_map __read_mostly; /* If it's single threaded, it isn't in the list of workqueues. */ static inline int is_wq_single_threaded(struct workqueue_struct *wq) @@ -89,10 +89,10 @@ static inline int is_wq_single_threaded(struct workqueue_struct *wq) return wq->singlethread; } -static const cpumask_t *wq_cpu_map(struct workqueue_struct *wq) +static const struct cpumask *wq_cpu_map(struct workqueue_struct *wq) { return is_wq_single_threaded(wq) - ? &cpu_singlethread_map : &cpu_populated_map; + ? cpu_singlethread_map : cpu_populated_map; } static @@ -410,7 +410,7 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) */ void flush_workqueue(struct workqueue_struct *wq) { - const cpumask_t *cpu_map = wq_cpu_map(wq); + const struct cpumask *cpu_map = wq_cpu_map(wq); int cpu; might_sleep(); @@ -532,7 +532,7 @@ static void wait_on_work(struct work_struct *work) { struct cpu_workqueue_struct *cwq; struct workqueue_struct *wq; - const cpumask_t *cpu_map; + const struct cpumask *cpu_map; int cpu; might_sleep(); @@ -903,7 +903,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) */ void destroy_workqueue(struct workqueue_struct *wq) { - const cpumask_t *cpu_map = wq_cpu_map(wq); + const struct cpumask *cpu_map = wq_cpu_map(wq); int cpu; cpu_maps_update_begin(); @@ -933,7 +933,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_UP_PREPARE: - cpu_set(cpu, cpu_populated_map); + cpumask_set_cpu(cpu, cpu_populated_map); } undo: list_for_each_entry(wq, &workqueues, list) { @@ -964,7 +964,7 @@ undo: switch (action) { case CPU_UP_CANCELED: case CPU_POST_DEAD: - cpu_clear(cpu, cpu_populated_map); + cpumask_clear_cpu(cpu, cpu_populated_map); } return ret; @@ -1017,9 +1017,11 @@ EXPORT_SYMBOL_GPL(work_on_cpu); void __init init_workqueues(void) { - cpu_populated_map = cpu_online_map; - singlethread_cpu = first_cpu(cpu_possible_map); - cpu_singlethread_map = cpumask_of_cpu(singlethread_cpu); + alloc_cpumask_var(&cpu_populated_map, GFP_KERNEL); + + cpumask_copy(cpu_populated_map, cpu_online_mask); + singlethread_cpu = cpumask_first(cpu_possible_mask); + cpu_singlethread_map = cpumask_of(singlethread_cpu); hotcpu_notifier(workqueue_cpu_callback, 0); keventd_wq = create_workqueue("events"); BUG_ON(!keventd_wq); -- cgit v1.2.3 From 6b954823c24f04ed026a8517f6bab5abda279db8 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 1 Jan 2009 10:12:25 +1030 Subject: cpumask: convert kernel time functions Impact: Use new APIs Convert kernel/time functions to use struct cpumask *. Note the ugly bitmap declarations in tick-broadcast.c. These should be cpumask_var_t, but there was no obvious initialization function to put the alloc_cpumask_var() calls in. This was safe. (Eventually 'struct cpumask' will be undefined for CONFIG_CPUMASK_OFFSTACK, so we use a bitmap here to show we really mean it). 
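(Illustration, not part of the patch: the DECLARE_BITMAP plus to_cpumask() idiom mentioned above, with invented names. The bitmap gives static NR_CPUS-sized storage that needs no allocation hook, and to_cpumask() presents it as a struct cpumask * wherever the new API expects one.)

    #include <linux/cpumask.h>

    /* Static storage, no runtime alloc_cpumask_var() call required. */
    static DECLARE_BITMAP(my_pending_bits, NR_CPUS);

    static void my_mark_cpu(int cpu)
    {
            cpumask_set_cpu(cpu, to_cpumask(my_pending_bits));
    }

    static bool my_any_pending(void)
    {
            return !cpumask_empty(to_cpumask(my_pending_bits));
    }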
Signed-off-by: Rusty Russell Signed-off-by: Mike Travis --- kernel/time/clocksource.c | 6 +-- kernel/time/tick-broadcast.c | 113 ++++++++++++++++++++++--------------------- kernel/time/tick-common.c | 6 +-- 3 files changed, 64 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 9ed2eec9752..32141b15d63 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -148,7 +148,7 @@ static void clocksource_watchdog(unsigned long data) int next_cpu = next_cpu_nr(raw_smp_processor_id(), cpu_online_map); if (next_cpu >= nr_cpu_ids) - next_cpu = first_cpu(cpu_online_map); + next_cpu = cpumask_first(cpu_online_mask); watchdog_timer.expires += WATCHDOG_INTERVAL; add_timer_on(&watchdog_timer, next_cpu); } @@ -173,7 +173,7 @@ static void clocksource_check_watchdog(struct clocksource *cs) watchdog_last = watchdog->read(); watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; add_timer_on(&watchdog_timer, - first_cpu(cpu_online_map)); + cpumask_first(cpu_online_mask)); } } else { if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) @@ -195,7 +195,7 @@ static void clocksource_check_watchdog(struct clocksource *cs) watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; add_timer_on(&watchdog_timer, - first_cpu(cpu_online_map)); + cpumask_first(cpu_online_mask)); } } } diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 9590af2327b..356fac57a18 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -28,7 +28,9 @@ */ struct tick_device tick_broadcast_device; -static cpumask_t tick_broadcast_mask; +/* FIXME: Use cpumask_var_t. */ +static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS); +static DECLARE_BITMAP(tmpmask, NR_CPUS); static DEFINE_SPINLOCK(tick_broadcast_lock); static int tick_broadcast_force; @@ -46,9 +48,9 @@ struct tick_device *tick_get_broadcast_device(void) return &tick_broadcast_device; } -cpumask_t *tick_get_broadcast_mask(void) +struct cpumask *tick_get_broadcast_mask(void) { - return &tick_broadcast_mask; + return to_cpumask(tick_broadcast_mask); } /* @@ -72,7 +74,7 @@ int tick_check_broadcast_device(struct clock_event_device *dev) clockevents_exchange_device(NULL, dev); tick_broadcast_device.evtdev = dev; - if (!cpus_empty(tick_broadcast_mask)) + if (!cpumask_empty(tick_get_broadcast_mask())) tick_broadcast_start_periodic(dev); return 1; } @@ -104,7 +106,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) */ if (!tick_device_is_functional(dev)) { dev->event_handler = tick_handle_periodic; - cpu_set(cpu, tick_broadcast_mask); + cpumask_set_cpu(cpu, tick_get_broadcast_mask()); tick_broadcast_start_periodic(tick_broadcast_device.evtdev); ret = 1; } else { @@ -116,7 +118,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { int cpu = smp_processor_id(); - cpu_clear(cpu, tick_broadcast_mask); + cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); tick_broadcast_clear_oneshot(cpu); } } @@ -125,9 +127,9 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) } /* - * Broadcast the event to the cpus, which are set in the mask + * Broadcast the event to the cpus, which are set in the mask (mangled). 
*/ -static void tick_do_broadcast(cpumask_t mask) +static void tick_do_broadcast(struct cpumask *mask) { int cpu = smp_processor_id(); struct tick_device *td; @@ -135,22 +137,21 @@ static void tick_do_broadcast(cpumask_t mask) /* * Check, if the current cpu is in the mask */ - if (cpu_isset(cpu, mask)) { - cpu_clear(cpu, mask); + if (cpumask_test_cpu(cpu, mask)) { + cpumask_clear_cpu(cpu, mask); td = &per_cpu(tick_cpu_device, cpu); td->evtdev->event_handler(td->evtdev); } - if (!cpus_empty(mask)) { + if (!cpumask_empty(mask)) { /* * It might be necessary to actually check whether the devices * have different broadcast functions. For now, just use the * one of the first device. This works as long as we have this * misfeature only on x86 (lapic) */ - cpu = first_cpu(mask); - td = &per_cpu(tick_cpu_device, cpu); - td->evtdev->broadcast(&mask); + td = &per_cpu(tick_cpu_device, cpumask_first(mask)); + td->evtdev->broadcast(mask); } } @@ -160,12 +161,11 @@ static void tick_do_broadcast(cpumask_t mask) */ static void tick_do_periodic_broadcast(void) { - cpumask_t mask; - spin_lock(&tick_broadcast_lock); - cpus_and(mask, cpu_online_map, tick_broadcast_mask); - tick_do_broadcast(mask); + cpumask_and(to_cpumask(tmpmask), + cpu_online_mask, tick_get_broadcast_mask()); + tick_do_broadcast(to_cpumask(tmpmask)); spin_unlock(&tick_broadcast_lock); } @@ -228,13 +228,13 @@ static void tick_do_broadcast_on_off(void *why) if (!tick_device_is_functional(dev)) goto out; - bc_stopped = cpus_empty(tick_broadcast_mask); + bc_stopped = cpumask_empty(tick_get_broadcast_mask()); switch (*reason) { case CLOCK_EVT_NOTIFY_BROADCAST_ON: case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: - if (!cpu_isset(cpu, tick_broadcast_mask)) { - cpu_set(cpu, tick_broadcast_mask); + if (!cpumask_test_cpu(cpu, tick_get_broadcast_mask())) { + cpumask_set_cpu(cpu, tick_get_broadcast_mask()); if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) clockevents_shutdown(dev); @@ -244,8 +244,8 @@ static void tick_do_broadcast_on_off(void *why) break; case CLOCK_EVT_NOTIFY_BROADCAST_OFF: if (!tick_broadcast_force && - cpu_isset(cpu, tick_broadcast_mask)) { - cpu_clear(cpu, tick_broadcast_mask); + cpumask_test_cpu(cpu, tick_get_broadcast_mask())) { + cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) tick_setup_periodic(dev, 0); @@ -253,7 +253,7 @@ static void tick_do_broadcast_on_off(void *why) break; } - if (cpus_empty(tick_broadcast_mask)) { + if (cpumask_empty(tick_get_broadcast_mask())) { if (!bc_stopped) clockevents_shutdown(bc); } else if (bc_stopped) { @@ -272,7 +272,7 @@ out: */ void tick_broadcast_on_off(unsigned long reason, int *oncpu) { - if (!cpu_isset(*oncpu, cpu_online_map)) + if (!cpumask_test_cpu(*oncpu, cpu_online_mask)) printk(KERN_ERR "tick-broadcast: ignoring broadcast for " "offline CPU #%d\n", *oncpu); else @@ -303,10 +303,10 @@ void tick_shutdown_broadcast(unsigned int *cpup) spin_lock_irqsave(&tick_broadcast_lock, flags); bc = tick_broadcast_device.evtdev; - cpu_clear(cpu, tick_broadcast_mask); + cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { - if (bc && cpus_empty(tick_broadcast_mask)) + if (bc && cpumask_empty(tick_get_broadcast_mask())) clockevents_shutdown(bc); } @@ -342,10 +342,10 @@ int tick_resume_broadcast(void) switch (tick_broadcast_device.mode) { case TICKDEV_MODE_PERIODIC: - if(!cpus_empty(tick_broadcast_mask)) + if (!cpumask_empty(tick_get_broadcast_mask())) tick_broadcast_start_periodic(bc); - 
broadcast = cpu_isset(smp_processor_id(), - tick_broadcast_mask); + broadcast = cpumask_test_cpu(smp_processor_id(), + tick_get_broadcast_mask()); break; case TICKDEV_MODE_ONESHOT: broadcast = tick_resume_broadcast_oneshot(bc); @@ -360,14 +360,15 @@ int tick_resume_broadcast(void) #ifdef CONFIG_TICK_ONESHOT -static cpumask_t tick_broadcast_oneshot_mask; +/* FIXME: use cpumask_var_t. */ +static DECLARE_BITMAP(tick_broadcast_oneshot_mask, NR_CPUS); /* - * Debugging: see timer_list.c + * Exposed for debugging: see timer_list.c */ -cpumask_t *tick_get_broadcast_oneshot_mask(void) +struct cpumask *tick_get_broadcast_oneshot_mask(void) { - return &tick_broadcast_oneshot_mask; + return to_cpumask(tick_broadcast_oneshot_mask); } static int tick_broadcast_set_event(ktime_t expires, int force) @@ -389,7 +390,7 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc) */ void tick_check_oneshot_broadcast(int cpu) { - if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) { + if (cpumask_test_cpu(cpu, to_cpumask(tick_broadcast_oneshot_mask))) { struct tick_device *td = &per_cpu(tick_cpu_device, cpu); clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT); @@ -402,7 +403,6 @@ void tick_check_oneshot_broadcast(int cpu) static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) { struct tick_device *td; - cpumask_t mask; ktime_t now, next_event; int cpu; @@ -410,13 +410,13 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) again: dev->next_event.tv64 = KTIME_MAX; next_event.tv64 = KTIME_MAX; - mask = CPU_MASK_NONE; + cpumask_clear(to_cpumask(tmpmask)); now = ktime_get(); /* Find all expired events */ - for_each_cpu_mask_nr(cpu, tick_broadcast_oneshot_mask) { + for_each_cpu(cpu, tick_get_broadcast_oneshot_mask()) { td = &per_cpu(tick_cpu_device, cpu); if (td->evtdev->next_event.tv64 <= now.tv64) - cpu_set(cpu, mask); + cpumask_set_cpu(cpu, to_cpumask(tmpmask)); else if (td->evtdev->next_event.tv64 < next_event.tv64) next_event.tv64 = td->evtdev->next_event.tv64; } @@ -424,7 +424,7 @@ again: /* * Wakeup the cpus which have an expired event. 
*/ - tick_do_broadcast(mask); + tick_do_broadcast(to_cpumask(tmpmask)); /* * Two reasons for reprogram: @@ -476,15 +476,16 @@ void tick_broadcast_oneshot_control(unsigned long reason) goto out; if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { - if (!cpu_isset(cpu, tick_broadcast_oneshot_mask)) { - cpu_set(cpu, tick_broadcast_oneshot_mask); + if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { + cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask()); clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); if (dev->next_event.tv64 < bc->next_event.tv64) tick_broadcast_set_event(dev->next_event, 1); } } else { - if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) { - cpu_clear(cpu, tick_broadcast_oneshot_mask); + if (cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { + cpumask_clear_cpu(cpu, + tick_get_broadcast_oneshot_mask()); clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); if (dev->next_event.tv64 != KTIME_MAX) tick_program_event(dev->next_event, 1); @@ -502,10 +503,11 @@ out: */ static void tick_broadcast_clear_oneshot(int cpu) { - cpu_clear(cpu, tick_broadcast_oneshot_mask); + cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); } -static void tick_broadcast_init_next_event(cpumask_t *mask, ktime_t expires) +static void tick_broadcast_init_next_event(struct cpumask *mask, + ktime_t expires) { struct tick_device *td; int cpu; @@ -526,7 +528,6 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) if (bc->event_handler != tick_handle_oneshot_broadcast) { int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; int cpu = smp_processor_id(); - cpumask_t mask; bc->event_handler = tick_handle_oneshot_broadcast; clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); @@ -540,13 +541,15 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) * oneshot_mask bits for those and program the * broadcast device to fire. */ - mask = tick_broadcast_mask; - cpu_clear(cpu, mask); - cpus_or(tick_broadcast_oneshot_mask, - tick_broadcast_oneshot_mask, mask); - - if (was_periodic && !cpus_empty(mask)) { - tick_broadcast_init_next_event(&mask, tick_next_period); + cpumask_copy(to_cpumask(tmpmask), tick_get_broadcast_mask()); + cpumask_clear_cpu(cpu, to_cpumask(tmpmask)); + cpumask_or(tick_get_broadcast_oneshot_mask(), + tick_get_broadcast_oneshot_mask(), + to_cpumask(tmpmask)); + + if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) { + tick_broadcast_init_next_event(to_cpumask(tmpmask), + tick_next_period); tick_broadcast_set_event(tick_next_period, 1); } else bc->next_event.tv64 = KTIME_MAX; @@ -585,7 +588,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) * Clear the broadcast mask flag for the dead cpu, but do not * stop the broadcast device! 
*/ - cpu_clear(cpu, tick_broadcast_oneshot_mask); + cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); spin_unlock_irqrestore(&tick_broadcast_lock, flags); } diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index f8372be7412..63e05d423a0 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -254,7 +254,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) curdev = NULL; } clockevents_exchange_device(curdev, newdev); - tick_setup_device(td, newdev, cpu, &cpumask_of_cpu(cpu)); + tick_setup_device(td, newdev, cpu, cpumask_of(cpu)); if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) tick_oneshot_notify(); @@ -299,9 +299,9 @@ static void tick_shutdown(unsigned int *cpup) } /* Transfer the do_timer job away from this cpu */ if (*cpup == tick_do_timer_cpu) { - int cpu = first_cpu(cpu_online_map); + int cpu = cpumask_first(cpu_online_mask); - tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : + tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu : TICK_DO_TIMER_NONE; } spin_unlock_irqrestore(&tick_device_lock, flags); -- cgit v1.2.3 From d036e67b40f52bdd95392390108defbac7e53837 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 1 Jan 2009 10:12:26 +1030 Subject: cpumask: convert kernel/irq Impact: Reduce stack usage, use new cpumask API. ALPHA mod! Main change is that irq_default_affinity becomes a cpumask_var_t, so treat it as a pointer (this affects alpha). Signed-off-by: Rusty Russell --- kernel/irq/manage.c | 11 +++++++++-- kernel/irq/proc.c | 32 +++++++++++++++++++++----------- 2 files changed, 30 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 61c4a9b6216..cd0cd8dcb34 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -16,8 +16,15 @@ #include "internals.h" #ifdef CONFIG_SMP +cpumask_var_t irq_default_affinity; -cpumask_t irq_default_affinity = CPU_MASK_ALL; +static int init_irq_default_affinity(void) +{ + alloc_cpumask_var(&irq_default_affinity, GFP_KERNEL); + cpumask_setall(irq_default_affinity); + return 0; +} +core_initcall(init_irq_default_affinity); /** * synchronize_irq - wait for pending IRQ handlers (on other CPUs) @@ -127,7 +134,7 @@ int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc) desc->status &= ~IRQ_AFFINITY_SET; } - cpumask_and(&desc->affinity, cpu_online_mask, &irq_default_affinity); + cpumask_and(&desc->affinity, cpu_online_mask, irq_default_affinity); set_affinity: desc->chip->set_affinity(irq, &desc->affinity); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index d2c0e5ee53c..2abd3a7716e 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -20,7 +20,7 @@ static struct proc_dir_entry *root_irq_dir; static int irq_affinity_proc_show(struct seq_file *m, void *v) { struct irq_desc *desc = irq_to_desc((long)m->private); - cpumask_t *mask = &desc->affinity; + const struct cpumask *mask = &desc->affinity; #ifdef CONFIG_GENERIC_PENDING_IRQ if (desc->status & IRQ_MOVE_PENDING) @@ -93,7 +93,7 @@ static const struct file_operations irq_affinity_proc_fops = { static int default_affinity_show(struct seq_file *m, void *v) { - seq_cpumask(m, &irq_default_affinity); + seq_cpumask(m, irq_default_affinity); seq_putc(m, '\n'); return 0; } @@ -101,27 +101,37 @@ static int default_affinity_show(struct seq_file *m, void *v) static ssize_t default_affinity_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { - cpumask_t new_value; + cpumask_var_t new_value; int err; - err = cpumask_parse_user(buffer, count, 
&new_value); + if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) + return -ENOMEM; + + err = cpumask_parse_user(buffer, count, new_value); if (err) - return err; + goto out; - if (!is_affinity_mask_valid(new_value)) - return -EINVAL; + if (!is_affinity_mask_valid(new_value)) { + err = -EINVAL; + goto out; + } /* * Do not allow disabling IRQs completely - it's a too easy * way to make the system unusable accidentally :-) At least * one online CPU still has to be targeted. */ - if (!cpus_intersects(new_value, cpu_online_map)) - return -EINVAL; + if (!cpumask_intersects(new_value, cpu_online_mask)) { + err = -EINVAL; + goto out; + } - irq_default_affinity = new_value; + cpumask_copy(irq_default_affinity, new_value); + err = count; - return count; +out: + free_cpumask_var(new_value); + return err; } static int default_affinity_open(struct inode *inode, struct file *file) -- cgit v1.2.3 From bd232f97b30f6bb630efa136a777647545db3039 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 1 Jan 2009 10:12:26 +1030 Subject: cpumask: convert RCU implementations Impact: use new cpumask API. rcu_ctrlblk contains a cpumask, and it's highly optimized so I don't want a cpumask_var_t (ie. a pointer) for the CONFIG_CPUMASK_OFFSTACK case. It could use a dangling bitmap, and be allocated in __rcu_init to save memory, but for the moment we use a bitmap. (Eventually 'struct cpumask' will be undefined for CONFIG_CPUMASK_OFFSTACK, so we use a bitmap here to show we really mean it). We remove on-stack cpumasks, using cpumask_var_t for rcu_torture_shuffle_tasks() and for_each_cpu_and in force_quiescent_state(). Signed-off-by: Rusty Russell --- kernel/rcuclassic.c | 32 +++++++++++++++++--------------- kernel/rcupreempt.c | 19 ++++++++++--------- kernel/rcutorture.c | 27 +++++++++++++++------------ 3 files changed, 42 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index e503a002f33..0ff9b05706a 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -63,14 +63,14 @@ static struct rcu_ctrlblk rcu_ctrlblk = { .completed = -300, .pending = -300, .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), - .cpumask = CPU_MASK_NONE, + .cpumask = CPU_BITS_NONE, }; static struct rcu_ctrlblk rcu_bh_ctrlblk = { .cur = -300, .completed = -300, .pending = -300, .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), - .cpumask = CPU_MASK_NONE, + .cpumask = CPU_BITS_NONE, }; DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; @@ -85,7 +85,6 @@ static void force_quiescent_state(struct rcu_data *rdp, struct rcu_ctrlblk *rcp) { int cpu; - cpumask_t cpumask; unsigned long flags; set_need_resched(); @@ -96,10 +95,10 @@ static void force_quiescent_state(struct rcu_data *rdp, * Don't send IPI to itself. With irqs disabled, * rdp->cpu is the current cpu. * - * cpu_online_map is updated by the _cpu_down() + * cpu_online_mask is updated by the _cpu_down() * using __stop_machine(). Since we're in irqs disabled * section, __stop_machine() is not exectuting, hence - * the cpu_online_map is stable. + * the cpu_online_mask is stable. * * However, a cpu might have been offlined _just_ before * we disabled irqs while entering here. @@ -107,13 +106,14 @@ static void force_quiescent_state(struct rcu_data *rdp, * notification, leading to the offlined cpu's bit * being set in the rcp->cpumask. * - * Hence cpumask = (rcp->cpumask & cpu_online_map) to prevent + * Hence cpumask = (rcp->cpumask & cpu_online_mask) to prevent * sending smp_reschedule() to an offlined CPU. 
*/ - cpus_and(cpumask, rcp->cpumask, cpu_online_map); - cpu_clear(rdp->cpu, cpumask); - for_each_cpu_mask_nr(cpu, cpumask) - smp_send_reschedule(cpu); + for_each_cpu_and(cpu, + to_cpumask(rcp->cpumask), cpu_online_mask) { + if (cpu != rdp->cpu) + smp_send_reschedule(cpu); + } } spin_unlock_irqrestore(&rcp->lock, flags); } @@ -193,7 +193,7 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) printk(KERN_ERR "INFO: RCU detected CPU stalls:"); for_each_possible_cpu(cpu) { - if (cpu_isset(cpu, rcp->cpumask)) + if (cpumask_test_cpu(cpu, to_cpumask(rcp->cpumask))) printk(" %d", cpu); } printk(" (detected by %d, t=%ld jiffies)\n", @@ -221,7 +221,8 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp) long delta; delta = jiffies - rcp->jiffies_stall; - if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) { + if (cpumask_test_cpu(smp_processor_id(), to_cpumask(rcp->cpumask)) && + delta >= 0) { /* We haven't checked in, so go dump stack. */ print_cpu_stall(rcp); @@ -393,7 +394,8 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp) * unnecessarily. */ smp_mb(); - cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask); + cpumask_andnot(to_cpumask(rcp->cpumask), + cpu_online_mask, &nohz_cpu_mask); rcp->signaled = 0; } @@ -406,8 +408,8 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp) */ static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) { - cpu_clear(cpu, rcp->cpumask); - if (cpus_empty(rcp->cpumask)) { + cpumask_clear_cpu(cpu, to_cpumask(rcp->cpumask)); + if (cpumask_empty(to_cpumask(rcp->cpumask))) { /* batch completed ! */ rcp->completed = rcp->cur; rcu_start_batch(rcp); diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index 04982659875..f9dc8f3720f 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -164,7 +164,8 @@ static char *rcu_try_flip_state_names[] = { "idle", "waitack", "waitzero", "waitmb" }; #endif /* #ifdef CONFIG_RCU_TRACE */ -static cpumask_t rcu_cpu_online_map __read_mostly = CPU_MASK_NONE; +static DECLARE_BITMAP(rcu_cpu_online_map, NR_CPUS) __read_mostly + = CPU_BITS_NONE; /* * Enum and per-CPU flag to determine when each CPU has seen @@ -758,7 +759,7 @@ rcu_try_flip_idle(void) /* Now ask each CPU for acknowledgement of the flip. */ - for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) { + for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) { per_cpu(rcu_flip_flag, cpu) = rcu_flipped; dyntick_save_progress_counter(cpu); } @@ -776,7 +777,7 @@ rcu_try_flip_waitack(void) int cpu; RCU_TRACE_ME(rcupreempt_trace_try_flip_a1); - for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) + for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) if (rcu_try_flip_waitack_needed(cpu) && per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) { RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1); @@ -808,7 +809,7 @@ rcu_try_flip_waitzero(void) /* Check to see if the sum of the "last" counters is zero. */ RCU_TRACE_ME(rcupreempt_trace_try_flip_z1); - for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) + for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx]; if (sum != 0) { RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1); @@ -823,7 +824,7 @@ rcu_try_flip_waitzero(void) smp_mb(); /* ^^^^^^^^^^^^ */ /* Call for a memory barrier from each CPU. 
*/ - for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) { + for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) { per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed; dyntick_save_progress_counter(cpu); } @@ -843,7 +844,7 @@ rcu_try_flip_waitmb(void) int cpu; RCU_TRACE_ME(rcupreempt_trace_try_flip_m1); - for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) + for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) if (rcu_try_flip_waitmb_needed(cpu) && per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) { RCU_TRACE_ME(rcupreempt_trace_try_flip_me1); @@ -1032,7 +1033,7 @@ void rcu_offline_cpu(int cpu) RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0; RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0; - cpu_clear(cpu, rcu_cpu_online_map); + cpumask_clear_cpu(cpu, to_cpumask(rcu_cpu_online_map)); spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); @@ -1072,7 +1073,7 @@ void __cpuinit rcu_online_cpu(int cpu) struct rcu_data *rdp; spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); - cpu_set(cpu, rcu_cpu_online_map); + cpumask_set_cpu(cpu, to_cpumask(rcu_cpu_online_map)); spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); /* @@ -1430,7 +1431,7 @@ void __init __rcu_init(void) * We don't need protection against CPU-Hotplug here * since * a) If a CPU comes online while we are iterating over the - * cpu_online_map below, we would only end up making a + * cpu_online_mask below, we would only end up making a * duplicate call to rcu_online_cpu() which sets the corresponding * CPU's mask in the rcu_cpu_online_map. * diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index b3106552210..3245b40952c 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -868,49 +868,52 @@ static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ */ static void rcu_torture_shuffle_tasks(void) { - cpumask_t tmp_mask; + cpumask_var_t tmp_mask; int i; - cpus_setall(tmp_mask); + if (!alloc_cpumask_var(&tmp_mask, GFP_KERNEL)) + BUG(); + + cpumask_setall(tmp_mask); get_online_cpus(); /* No point in shuffling if there is only one online CPU (ex: UP) */ - if (num_online_cpus() == 1) { - put_online_cpus(); - return; - } + if (num_online_cpus() == 1) + goto out; if (rcu_idle_cpu != -1) - cpu_clear(rcu_idle_cpu, tmp_mask); + cpumask_clear_cpu(rcu_idle_cpu, tmp_mask); - set_cpus_allowed_ptr(current, &tmp_mask); + set_cpus_allowed_ptr(current, tmp_mask); if (reader_tasks) { for (i = 0; i < nrealreaders; i++) if (reader_tasks[i]) set_cpus_allowed_ptr(reader_tasks[i], - &tmp_mask); + tmp_mask); } if (fakewriter_tasks) { for (i = 0; i < nfakewriters; i++) if (fakewriter_tasks[i]) set_cpus_allowed_ptr(fakewriter_tasks[i], - &tmp_mask); + tmp_mask); } if (writer_task) - set_cpus_allowed_ptr(writer_task, &tmp_mask); + set_cpus_allowed_ptr(writer_task, tmp_mask); if (stats_task) - set_cpus_allowed_ptr(stats_task, &tmp_mask); + set_cpus_allowed_ptr(stats_task, tmp_mask); if (rcu_idle_cpu == -1) rcu_idle_cpu = num_online_cpus() - 1; else rcu_idle_cpu--; +out: put_online_cpus(); + free_cpumask_var(tmp_mask); } /* Shuffle tasks across CPUs, with the intent of allowing each CPU in the -- cgit v1.2.3 From c309b917cab55799ea489d7b5f1b77025d9f8462 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 1 Jan 2009 10:12:27 +1030 Subject: cpumask: convert kernel/profile.c Impact: Reduce kernel memory usage, use new cpumask API. Avoid a static cpumask_t for prof_cpu_mask, and an on-stack cpumask_t in prof_cpu_mask_write_proc. Both become cpumask_var_t. prof_cpu_mask is only allocated when profiling is on, but the NULL checks are optimized out by gcc for the !CPUMASK_OFFSTACK case. 
Also removed some strange and unnecessary casts. Signed-off-by: Rusty Russell --- kernel/profile.c | 38 +++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index 4cb7d68fed8..d18e2d2654f 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -45,7 +45,7 @@ static unsigned long prof_len, prof_shift; int prof_on __read_mostly; EXPORT_SYMBOL_GPL(prof_on); -static cpumask_t prof_cpu_mask = CPU_MASK_ALL; +static cpumask_var_t prof_cpu_mask; #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); static DEFINE_PER_CPU(int, cpu_profile_flip); @@ -113,9 +113,13 @@ int __ref profile_init(void) buffer_bytes = prof_len*sizeof(atomic_t); if (!slab_is_available()) { prof_buffer = alloc_bootmem(buffer_bytes); + alloc_bootmem_cpumask_var(&prof_cpu_mask); return 0; } + if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL)) + return -ENOMEM; + prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL); if (prof_buffer) return 0; @@ -128,6 +132,7 @@ int __ref profile_init(void) if (prof_buffer) return 0; + free_cpumask_var(prof_cpu_mask); return -ENOMEM; } @@ -386,13 +391,15 @@ out_free: return NOTIFY_BAD; case CPU_ONLINE: case CPU_ONLINE_FROZEN: - cpu_set(cpu, prof_cpu_mask); + if (prof_cpu_mask != NULL) + cpumask_set_cpu(cpu, prof_cpu_mask); break; case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: - cpu_clear(cpu, prof_cpu_mask); + if (prof_cpu_mask != NULL) + cpumask_clear_cpu(cpu, prof_cpu_mask); if (per_cpu(cpu_profile_hits, cpu)[0]) { page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]); per_cpu(cpu_profile_hits, cpu)[0] = NULL; @@ -430,7 +437,8 @@ void profile_tick(int type) if (type == CPU_PROFILING && timer_hook) timer_hook(regs); - if (!user_mode(regs) && cpu_isset(smp_processor_id(), prof_cpu_mask)) + if (!user_mode(regs) && prof_cpu_mask != NULL && + cpumask_test_cpu(smp_processor_id(), prof_cpu_mask)) profile_hit(type, (void *)profile_pc(regs)); } @@ -442,7 +450,7 @@ void profile_tick(int type) static int prof_cpu_mask_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { - int len = cpumask_scnprintf(page, count, (cpumask_t *)data); + int len = cpumask_scnprintf(page, count, data); if (count - len < 2) return -EINVAL; len += sprintf(page + len, "\n"); @@ -452,16 +460,20 @@ static int prof_cpu_mask_read_proc(char *page, char **start, off_t off, static int prof_cpu_mask_write_proc(struct file *file, const char __user *buffer, unsigned long count, void *data) { - cpumask_t *mask = (cpumask_t *)data; + struct cpumask *mask = data; unsigned long full_count = count, err; - cpumask_t new_value; + cpumask_var_t new_value; - err = cpumask_parse_user(buffer, count, &new_value); - if (err) - return err; + if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) + return -ENOMEM; - *mask = new_value; - return full_count; + err = cpumask_parse_user(buffer, count, new_value); + if (!err) { + cpumask_copy(mask, new_value); + err = full_count; + } + free_cpumask_var(new_value); + return err; } void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) @@ -472,7 +484,7 @@ void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir); if (!entry) return; - entry->data = (void *)&prof_cpu_mask; + entry->data = prof_cpu_mask; entry->read_proc = prof_cpu_mask_read_proc; entry->write_proc = prof_cpu_mask_write_proc; } -- cgit v1.2.3 From 
e0b582ec56f1a1d8b30ebf340a7b91fb09f26c8c Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 1 Jan 2009 10:12:28 +1030 Subject: cpumask: convert kernel/cpu.c Impact: Reduce kernel stack and memory usage, use new cpumask API. Use cpumask_var_t for take_cpu_down() stack var, and frozen_cpus. Note that notify_cpu_starting() can be called before core_initcall allocates frozen_cpus, but the NULL check is optimized out by gcc for the CONFIG_CPUMASK_OFFSTACK=n case. Signed-off-by: Rusty Russell --- kernel/cpu.c | 48 +++++++++++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 2c9f78f3a2f..47fff3b63cb 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -194,7 +194,7 @@ static int __ref take_cpu_down(void *_param) static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) { int err, nr_calls = 0; - cpumask_t old_allowed, tmp; + cpumask_var_t old_allowed; void *hcpu = (void *)(long)cpu; unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; struct take_cpu_down_param tcd_param = { @@ -208,6 +208,9 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) if (!cpu_online(cpu)) return -EINVAL; + if (!alloc_cpumask_var(&old_allowed, GFP_KERNEL)) + return -ENOMEM; + cpu_hotplug_begin(); err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); @@ -222,13 +225,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) } /* Ensure that we are not runnable on dying cpu */ - old_allowed = current->cpus_allowed; - cpus_setall(tmp); - cpu_clear(cpu, tmp); - set_cpus_allowed_ptr(current, &tmp); - tmp = cpumask_of_cpu(cpu); + cpumask_copy(old_allowed, ¤t->cpus_allowed); + set_cpus_allowed_ptr(current, + cpumask_of(cpumask_any_but(cpu_online_mask, cpu))); - err = __stop_machine(take_cpu_down, &tcd_param, &tmp); + err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { /* CPU didn't die: tell everyone. Can't complain. */ if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, @@ -254,7 +255,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) check_for_tasks(cpu); out_allowed: - set_cpus_allowed_ptr(current, &old_allowed); + set_cpus_allowed_ptr(current, old_allowed); out_release: cpu_hotplug_done(); if (!err) { @@ -262,6 +263,7 @@ out_release: hcpu) == NOTIFY_BAD) BUG(); } + free_cpumask_var(old_allowed); return err; } @@ -280,7 +282,7 @@ int __ref cpu_down(unsigned int cpu) /* * Make sure the all cpus did the reschedule and are not - * using stale version of the cpu_active_map. + * using stale version of the cpu_active_mask. * This is not strictly necessary becuase stop_machine() * that we run down the line already provides the required * synchronization. 
But it's really a side effect and we do not @@ -344,7 +346,7 @@ out_notify: int __cpuinit cpu_up(unsigned int cpu) { int err = 0; - if (!cpu_isset(cpu, cpu_possible_map)) { + if (!cpu_possible(cpu)) { printk(KERN_ERR "can't online cpu %d because it is not " "configured as may-hotadd at boot time\n", cpu); #if defined(CONFIG_IA64) || defined(CONFIG_X86_64) @@ -369,25 +371,25 @@ out: } #ifdef CONFIG_PM_SLEEP_SMP -static cpumask_t frozen_cpus; +static cpumask_var_t frozen_cpus; int disable_nonboot_cpus(void) { int cpu, first_cpu, error = 0; cpu_maps_update_begin(); - first_cpu = first_cpu(cpu_online_map); + first_cpu = cpumask_first(cpu_online_mask); /* We take down all of the non-boot CPUs in one shot to avoid races * with the userspace trying to use the CPU hotplug at the same time */ - cpus_clear(frozen_cpus); + cpumask_clear(frozen_cpus); printk("Disabling non-boot CPUs ...\n"); for_each_online_cpu(cpu) { if (cpu == first_cpu) continue; error = _cpu_down(cpu, 1); if (!error) { - cpu_set(cpu, frozen_cpus); + cpumask_set_cpu(cpu, frozen_cpus); printk("CPU%d is down\n", cpu); } else { printk(KERN_ERR "Error taking CPU%d down: %d\n", @@ -413,11 +415,11 @@ void __ref enable_nonboot_cpus(void) /* Allow everyone to use the CPU hotplug again */ cpu_maps_update_begin(); cpu_hotplug_disabled = 0; - if (cpus_empty(frozen_cpus)) + if (cpumask_empty(frozen_cpus)) goto out; printk("Enabling non-boot CPUs ...\n"); - for_each_cpu_mask_nr(cpu, frozen_cpus) { + for_each_cpu(cpu, frozen_cpus) { error = _cpu_up(cpu, 1); if (!error) { printk("CPU%d is up\n", cpu); @@ -425,10 +427,18 @@ void __ref enable_nonboot_cpus(void) } printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); } - cpus_clear(frozen_cpus); + cpumask_clear(frozen_cpus); out: cpu_maps_update_done(); } + +static int alloc_frozen_cpus(void) +{ + if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO)) + return -ENOMEM; + return 0; +} +core_initcall(alloc_frozen_cpus); #endif /* CONFIG_PM_SLEEP_SMP */ /** @@ -444,7 +454,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu) unsigned long val = CPU_STARTING; #ifdef CONFIG_PM_SLEEP_SMP - if (cpu_isset(cpu, frozen_cpus)) + if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus)) val = CPU_STARTING_FROZEN; #endif /* CONFIG_PM_SLEEP_SMP */ raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu); @@ -456,7 +466,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu) * cpu_bit_bitmap[] is a special, "compressed" data structure that * represents all NR_CPUS bits binary values of 1< Date: Thu, 1 Jan 2009 10:12:28 +1030 Subject: cpumask: convert rest of files in kernel/ Impact: Reduce stack usage, use new cpumask API. Mainly changing cpumask_t to 'struct cpumask' and similar simple API conversion. Two conversions worth mentioning: 1) we use cpumask_any_but to avoid a temporary in kernel/softlockup.c, 2) Use cpumask_var_t in taskstats_user_cmd(). 
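(Illustration, not part of the patch: the cpumask_any_but() change from point 1, shown as a hypothetical helper rather than the actual softlockup code. The old pattern had to copy the whole online map onto the stack just to clear one bit; the new call expresses "any online CPU other than this one" directly.)

    #include <linux/cpumask.h>

    /* Pick a CPU to take over checking duties when hotcpu goes away. */
    static int pick_checker_cpu(int hotcpu)
    {
            /*
             * Old pattern:
             *   cpumask_t tmp = cpu_online_map;
             *   cpu_clear(hotcpu, tmp);
             *   return cpumask_any(&tmp);
             *
             * New call needs no temporary; it may return a value
             * >= nr_cpu_ids if no other CPU is online.
             */
            return cpumask_any_but(cpu_online_mask, hotcpu);
    }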
Signed-off-by: Rusty Russell Signed-off-by: Mike Travis Cc: Balbir Singh Cc: Ingo Molnar --- kernel/power/poweroff.c | 2 +- kernel/softlockup.c | 6 ++---- kernel/stop_machine.c | 8 ++++---- kernel/taskstats.c | 39 ++++++++++++++++++++++++--------------- 4 files changed, 31 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index 72016f05147..97890831e1b 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c @@ -27,7 +27,7 @@ static DECLARE_WORK(poweroff_work, do_poweroff); static void handle_poweroff(int key, struct tty_struct *tty) { /* run sysrq poweroff on boot cpu */ - schedule_work_on(first_cpu(cpu_online_map), &poweroff_work); + schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work); } static struct sysrq_key_op sysrq_poweroff_op = { diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 492f0c72fec..d9188c66278 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -310,10 +310,8 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: if (hotcpu == check_cpu) { - cpumask_t temp_cpu_online_map = cpu_online_map; - - cpu_clear(hotcpu, temp_cpu_online_map); - check_cpu = cpumask_any(&temp_cpu_online_map); + /* Pick any other online cpu. */ + check_cpu = cpumask_any_but(cpu_online_mask, hotcpu); } break; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 24e8ceacc38..286c41722e8 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -69,10 +69,10 @@ static void stop_cpu(struct work_struct *unused) int err; if (!active_cpus) { - if (cpu == first_cpu(cpu_online_map)) + if (cpu == cpumask_first(cpu_online_mask)) smdata = &active; } else { - if (cpu_isset(cpu, *active_cpus)) + if (cpumask_test_cpu(cpu, active_cpus)) smdata = &active; } /* Simple state machine */ @@ -109,7 +109,7 @@ static int chill(void *unused) return 0; } -int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) +int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) { struct work_struct *sm_work; int i, ret; @@ -142,7 +142,7 @@ int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) return ret; } -int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) +int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) { int ret; diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 6d7dc4ec4aa..888adbcca30 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -290,18 +290,17 @@ ret: return; } -static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd) +static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) { struct listener_list *listeners; struct listener *s, *tmp; unsigned int cpu; - cpumask_t mask = *maskp; - if (!cpus_subset(mask, cpu_possible_map)) + if (!cpumask_subset(mask, cpu_possible_mask)) return -EINVAL; if (isadd == REGISTER) { - for_each_cpu_mask_nr(cpu, mask) { + for_each_cpu(cpu, mask) { s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, cpu_to_node(cpu)); if (!s) @@ -320,7 +319,7 @@ static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd) /* Deregister or cleanup */ cleanup: - for_each_cpu_mask_nr(cpu, mask) { + for_each_cpu(cpu, mask) { listeners = &per_cpu(listener_array, cpu); down_write(&listeners->sem); list_for_each_entry_safe(s, tmp, &listeners->list, list) { @@ -335,7 +334,7 @@ cleanup: return 0; } -static int parse(struct nlattr *na, cpumask_t *mask) +static int 
parse(struct nlattr *na, struct cpumask *mask) { char *data; int len; @@ -428,23 +427,33 @@ err: static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) { - int rc = 0; + int rc; struct sk_buff *rep_skb; struct taskstats *stats; size_t size; - cpumask_t mask; + cpumask_var_t mask; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; - rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask); + rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); if (rc < 0) - return rc; - if (rc == 0) - return add_del_listener(info->snd_pid, &mask, REGISTER); + goto free_return_rc; + if (rc == 0) { + rc = add_del_listener(info->snd_pid, mask, REGISTER); + goto free_return_rc; + } - rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask); + rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); if (rc < 0) + goto free_return_rc; + if (rc == 0) { + rc = add_del_listener(info->snd_pid, mask, DEREGISTER); +free_return_rc: + free_cpumask_var(mask); return rc; - if (rc == 0) - return add_del_listener(info->snd_pid, &mask, DEREGISTER); + } + free_cpumask_var(mask); /* * Size includes space for nested attributes -- cgit v1.2.3 From 5db0e1e9e0f30f160b832a0b5cd1131954bf4f6e Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 1 Jan 2009 10:12:29 +1030 Subject: cpumask: replace for_each_cpu_mask_nr with for_each_cpu in kernel/time/ Impact: cleanup Simple replacement, now the _nr is redundant. Signed-off-by: Rusty Russell Signed-off-by: Mike Travis Cc: Ingo Molnar --- kernel/time/clocksource.c | 3 ++- kernel/time/tick-broadcast.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 32141b15d63..ca89e1593f0 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -145,7 +145,8 @@ static void clocksource_watchdog(unsigned long data) * Cycle through CPUs to check if the CPUs stay * synchronized to each other. */ - int next_cpu = next_cpu_nr(raw_smp_processor_id(), cpu_online_map); + int next_cpu = cpumask_next(raw_smp_processor_id(), + cpu_online_mask); if (next_cpu >= nr_cpu_ids) next_cpu = cpumask_first(cpu_online_mask); diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 356fac57a18..118a3b3b3f9 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -512,7 +512,7 @@ static void tick_broadcast_init_next_event(struct cpumask *mask, struct tick_device *td; int cpu; - for_each_cpu_mask_nr(cpu, *mask) { + for_each_cpu(cpu, mask) { td = &per_cpu(tick_cpu_device, cpu); if (td->evtdev) td->evtdev->next_event = expires; -- cgit v1.2.3 From 263ec6457bb23d57b575ede18ff6c3d11e0b4e96 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 3 Jan 2009 13:16:09 +0100 Subject: cpumask: convert RCU implementations, fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: cleanup This warning: kernel/rcuclassic.c: In function ‘rcu_start_batch’: kernel/rcuclassic.c:397: warning: passing argument 1 of ‘cpumask_andnot’ from incompatible pointer type triggers because one usage site of rcp->cpumask was not converted to to_cpumask(rcp->cpumask). There's no ill effects of this bug. 
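The reason the cast helper is needed, in miniature (illustrative sketch with an invented struct: the control block embeds a raw bitmap rather than a struct cpumask, which is why passing its address directly triggers the incompatible-pointer warning quoted above):

	#include <linux/cpumask.h>

	struct example_ctrlblk {
		DECLARE_BITMAP(cpumask, NR_CPUS);	/* raw bitmap, not struct cpumask */
	};

	static void example_start_batch(struct example_ctrlblk *blk,
					const struct cpumask *exclude)
	{
		/* to_cpumask() wraps the bitmap so the cpumask_* API accepts it;
		 * &blk->cpumask would have type unsigned long (*)[] instead. */
		cpumask_andnot(to_cpumask(blk->cpumask),
			       cpu_online_mask, exclude);
	}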
Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 6ec495f60ea..490934fc7ac 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -394,7 +394,8 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp) * unnecessarily. */ smp_mb(); - cpumask_andnot(&rcp->cpumask, cpu_online_mask, nohz_cpu_mask); + cpumask_andnot(to_cpumask(rcp->cpumask), + cpu_online_mask, nohz_cpu_mask); rcp->signaled = 0; } -- cgit v1.2.3 From 6bdf197b04b3ae7c85785bc5a9576f1bcb0ac7c0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 3 Jan 2009 12:50:46 +0100 Subject: ia64: cpumask fix for is_affinity_mask_valid() Impact: build fix on ia64 ia64's default_affinity_write() still had old cpumask_t usage: /home/mingo/tip/kernel/irq/proc.c: In function `default_affinity_write': /home/mingo/tip/kernel/irq/proc.c:114: error: incompatible type for argument 1 of `is_affinity_mask_valid' make[3]: *** [kernel/irq/proc.o] Error 1 make[3]: *** Waiting for unfinished jobs.... update it to cpumask_var_t. Signed-off-by: Ingo Molnar --- kernel/irq/proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 2abd3a7716e..aae3f742bce 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -54,7 +54,7 @@ static ssize_t irq_affinity_proc_write(struct file *file, if (err) goto free_cpumask; - if (!is_affinity_mask_valid(*new_value)) { + if (!is_affinity_mask_valid(new_value)) { err = -EINVAL; goto free_cpumask; } -- cgit v1.2.3 From 6ca09dfc9f180d038dcef93c167a833f43a8246f Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Wed, 31 Dec 2008 18:08:45 -0800 Subject: sched: put back some stack hog changes that were undone in kernel/sched.c Impact: prevents panic from stack overflow on numa-capable machines. Some of the "removal of stack hogs" changes in kernel/sched.c by using node_to_cpumask_ptr were undone by the early cpumask API updates, and causes a panic due to stack overflow. This patch undoes those changes by using cpumask_of_node() which returns a 'const struct cpumask *'. In addition, cpu_coregoup_map is replaced with cpu_coregroup_mask further reducing stack usage. (Both of these updates removed 9 FIXME's!) Also: Pick up some remaining changes from the old 'cpumask_t' functions to the new 'struct cpumask *' functions. Optimize memory traffic by allocating each percpu local_cpu_mask on the same node as the referring cpu. Signed-off-by: Mike Travis Acked-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 53 +++++++++++++++-------------------------------------- kernel/sched_rt.c | 3 ++- 2 files changed, 17 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 27ba1d642f0..dd862d70e71 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3715,7 +3715,7 @@ redo: * don't kick the migration_thread, if the curr * task on busiest cpu can't be moved to this_cpu */ - if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { + if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { double_unlock_balance(this_rq, busiest); all_pinned = 1; return ld_moved; @@ -6220,9 +6220,7 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) { int dest_cpu; - /* FIXME: Use cpumask_of_node here. 
*/ - cpumask_t _nodemask = node_to_cpumask(cpu_to_node(dead_cpu)); - const struct cpumask *nodemask = &_nodemask; + const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu)); again: /* Look for allowed, online CPU in same node. */ @@ -7133,21 +7131,18 @@ static int find_next_best_node(int node, nodemask_t *used_nodes) static void sched_domain_node_span(int node, struct cpumask *span) { nodemask_t used_nodes; - /* FIXME: use cpumask_of_node() */ - node_to_cpumask_ptr(nodemask, node); int i; - cpus_clear(*span); + cpumask_clear(span); nodes_clear(used_nodes); - cpus_or(*span, *span, *nodemask); + cpumask_or(span, span, cpumask_of_node(node)); node_set(node, used_nodes); for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { int next_node = find_next_best_node(node, &used_nodes); - node_to_cpumask_ptr_next(nodemask, next_node); - cpus_or(*span, *span, *nodemask); + cpumask_or(span, span, cpumask_of_node(next_node)); } } #endif /* CONFIG_NUMA */ @@ -7227,9 +7222,7 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, { int group; #ifdef CONFIG_SCHED_MC - /* FIXME: Use cpu_coregroup_mask. */ - *mask = cpu_coregroup_map(cpu); - cpus_and(*mask, *mask, *cpu_map); + cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); group = cpumask_first(mask); #elif defined(CONFIG_SCHED_SMT) cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); @@ -7259,10 +7252,8 @@ static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, struct cpumask *nodemask) { int group; - /* FIXME: use cpumask_of_node */ - node_to_cpumask_ptr(pnodemask, cpu_to_node(cpu)); - cpumask_and(nodemask, pnodemask, cpu_map); + cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map); group = cpumask_first(nodemask); if (sg) @@ -7313,10 +7304,8 @@ static void free_sched_groups(const struct cpumask *cpu_map, for (i = 0; i < nr_node_ids; i++) { struct sched_group *oldsg, *sg = sched_group_nodes[i]; - /* FIXME: Use cpumask_of_node */ - node_to_cpumask_ptr(pnodemask, i); - cpus_and(*nodemask, *pnodemask, *cpu_map); + cpumask_and(nodemask, cpumask_of_node(i), cpu_map); if (cpumask_empty(nodemask)) continue; @@ -7525,9 +7514,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, for_each_cpu(i, cpu_map) { struct sched_domain *sd = NULL, *p; - /* FIXME: use cpumask_of_node */ - *nodemask = node_to_cpumask(cpu_to_node(i)); - cpus_and(*nodemask, *nodemask, *cpu_map); + cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map); #ifdef CONFIG_NUMA if (cpumask_weight(cpu_map) > @@ -7568,9 +7555,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map, sd = &per_cpu(core_domains, i).sd; SD_INIT(sd, MC); set_domain_attribute(sd, attr); - *sched_domain_span(sd) = cpu_coregroup_map(i); - cpumask_and(sched_domain_span(sd), - sched_domain_span(sd), cpu_map); + cpumask_and(sched_domain_span(sd), cpu_map, + cpu_coregroup_mask(i)); sd->parent = p; p->child = sd; cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); @@ -7606,9 +7592,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, #ifdef CONFIG_SCHED_MC /* Set up multi-core groups */ for_each_cpu(i, cpu_map) { - /* FIXME: Use cpu_coregroup_mask */ - *this_core_map = cpu_coregroup_map(i); - cpus_and(*this_core_map, *this_core_map, *cpu_map); + cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map); if (i != cpumask_first(this_core_map)) continue; @@ -7620,9 +7604,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, /* Set up physical groups */ for (i = 0; i < nr_node_ids; i++) { - /* FIXME: Use cpumask_of_node 
*/ - *nodemask = node_to_cpumask(i); - cpus_and(*nodemask, *nodemask, *cpu_map); + cpumask_and(nodemask, cpumask_of_node(i), cpu_map); if (cpumask_empty(nodemask)) continue; @@ -7644,11 +7626,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map, struct sched_group *sg, *prev; int j; - /* FIXME: Use cpumask_of_node */ - *nodemask = node_to_cpumask(i); cpumask_clear(covered); - - cpus_and(*nodemask, *nodemask, *cpu_map); + cpumask_and(nodemask, cpumask_of_node(i), cpu_map); if (cpumask_empty(nodemask)) { sched_group_nodes[i] = NULL; continue; @@ -7679,8 +7658,6 @@ static int __build_sched_domains(const struct cpumask *cpu_map, for (j = 0; j < nr_node_ids; j++) { int n = (i + j) % nr_node_ids; - /* FIXME: Use cpumask_of_node */ - node_to_cpumask_ptr(pnodemask, n); cpumask_complement(notcovered, covered); cpumask_and(tmpmask, notcovered, cpu_map); @@ -7688,7 +7665,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, if (cpumask_empty(tmpmask)) break; - cpumask_and(tmpmask, tmpmask, pnodemask); + cpumask_and(tmpmask, tmpmask, cpumask_of_node(n)); if (cpumask_empty(tmpmask)) continue; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 833b6d44483..954e1a81b79 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1383,7 +1383,8 @@ static inline void init_sched_rt_class(void) unsigned int i; for_each_possible_cpu(i) - alloc_cpumask_var(&per_cpu(local_cpu_mask, i), GFP_KERNEL); + alloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), + GFP_KERNEL, cpu_to_node(i)); } #endif /* CONFIG_SMP */ -- cgit v1.2.3 From 4f6b434fee2402b3decdeae9d16eb648725ae426 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 9 Dec 2008 19:50:34 -0500 Subject: don't reallocate buffer in every audit_sockaddr() No need to do that more than once per process lifetime; allocating/freeing on each sendto/accept/etc. is bloody pointless. 
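Reduced to its essentials, the new scheme allocates one scratch buffer per audit context and reuses it (sketch only; the struct below stands in for the members this patch adds to audit_context):

	#include <linux/errno.h>
	#include <linux/slab.h>
	#include <linux/socket.h>
	#include <linux/string.h>

	struct example_ctx {
		struct sockaddr_storage *sockaddr;	/* allocated once, kept */
		size_t sockaddr_len;			/* 0 = nothing recorded */
	};

	static int example_record_sockaddr(struct example_ctx *ctx, int len,
					   const void *a)
	{
		if (!ctx->sockaddr) {
			/* First use in this context's lifetime: allocate the
			 * largest possible sockaddr and keep it around. */
			ctx->sockaddr = kmalloc(sizeof(*ctx->sockaddr), GFP_KERNEL);
			if (!ctx->sockaddr)
				return -ENOMEM;
		}
		ctx->sockaddr_len = len;	/* reset to 0 at syscall exit */
		memcpy(ctx->sockaddr, a, len);
		return 0;
	}
	/* The buffer itself is kfree()d only when the whole context is freed. */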
Signed-off-by: Al Viro --- kernel/auditsc.c | 46 ++++++++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 4819f371197..c2e43ebb1b6 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -174,12 +174,6 @@ struct audit_aux_data_socketcall { unsigned long args[0]; }; -struct audit_aux_data_sockaddr { - struct audit_aux_data d; - int len; - char a[0]; -}; - struct audit_aux_data_fd_pair { struct audit_aux_data d; int fd[2]; @@ -234,7 +228,8 @@ struct audit_context { struct audit_context *previous; /* For nested syscalls */ struct audit_aux_data *aux; struct audit_aux_data *aux_pids; - + struct sockaddr_storage *sockaddr; + size_t sockaddr_len; /* Save things to print about task_struct */ pid_t pid, ppid; uid_t uid, euid, suid, fsuid; @@ -921,6 +916,7 @@ static inline void audit_free_context(struct audit_context *context) free_tree_refs(context); audit_free_aux(context); kfree(context->filterkey); + kfree(context->sockaddr); kfree(context); context = previous; } while (context); @@ -1383,13 +1379,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_format(ab, " a%d=%lx", i, axs->args[i]); break; } - case AUDIT_SOCKADDR: { - struct audit_aux_data_sockaddr *axs = (void *)aux; - - audit_log_format(ab, "saddr="); - audit_log_n_hex(ab, axs->a, axs->len); - break; } - case AUDIT_FD_PAIR: { struct audit_aux_data_fd_pair *axs = (void *)aux; audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]); @@ -1421,6 +1410,16 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_end(ab); } + if (context->sockaddr_len) { + ab = audit_log_start(context, GFP_KERNEL, AUDIT_SOCKADDR); + if (ab) { + audit_log_format(ab, "saddr="); + audit_log_n_hex(ab, (void *)context->sockaddr, + context->sockaddr_len); + audit_log_end(ab); + } + } + for (aux = context->aux_pids; aux; aux = aux->next) { struct audit_aux_data_pids *axs = (void *)aux; @@ -1689,6 +1688,7 @@ void audit_syscall_exit(int valid, long return_code) context->aux_pids = NULL; context->target_pid = 0; context->target_sid = 0; + context->sockaddr_len = 0; kfree(context->filterkey); context->filterkey = NULL; tsk->audit_context = context; @@ -2468,22 +2468,20 @@ int __audit_fd_pair(int fd1, int fd2) */ int audit_sockaddr(int len, void *a) { - struct audit_aux_data_sockaddr *ax; struct audit_context *context = current->audit_context; if (likely(!context || context->dummy)) return 0; - ax = kmalloc(sizeof(*ax) + len, GFP_KERNEL); - if (!ax) - return -ENOMEM; - - ax->len = len; - memcpy(ax->a, a, len); + if (!context->sockaddr) { + void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL); + if (!p) + return -ENOMEM; + context->sockaddr = p; + } - ax->d.type = AUDIT_SOCKADDR; - ax->d.next = context->aux; - context->aux = (void *)ax; + context->sockaddr_len = len; + memcpy(context->sockaddr, a, len); return 0; } -- cgit v1.2.3 From f3298dc4f2277874d40cb4fc3a6e277317d6603b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 10 Dec 2008 03:16:51 -0500 Subject: sanitize audit_socketcall * don't bother with allocations * now that it can't fail, make it return void Signed-off-by: Al Viro --- kernel/auditsc.c | 66 ++++++++++++++++++++++++++++++++------------------------ 1 file changed, 38 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index c2e43ebb1b6..5cda66466e1 100644 --- a/kernel/auditsc.c +++ 
b/kernel/auditsc.c @@ -168,12 +168,6 @@ struct audit_aux_data_execve { struct mm_struct *mm; }; -struct audit_aux_data_socketcall { - struct audit_aux_data d; - int nargs; - unsigned long args[0]; -}; - struct audit_aux_data_fd_pair { struct audit_aux_data d; int fd[2]; @@ -247,6 +241,14 @@ struct audit_context { struct audit_tree_refs *trees, *first_trees; int tree_count; + int type; + union { + struct { + int nargs; + long args[6]; + } socketcall; + }; + #if AUDIT_DEBUG int put_count; int ino_count; @@ -1226,6 +1228,27 @@ static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) audit_log_format(ab, " cap_fe=%d cap_fver=%x", name->fcap.fE, name->fcap_ver); } +static void show_special(struct audit_context *context) +{ + struct audit_buffer *ab; + int i; + + ab = audit_log_start(context, GFP_KERNEL, context->type); + if (!ab) + return; + + switch (context->type) { + case AUDIT_SOCKETCALL: { + int nargs = context->socketcall.nargs; + audit_log_format(ab, "nargs=%d", nargs); + for (i = 0; i < nargs; i++) + audit_log_format(ab, " a%d=%lx", i, + context->socketcall.args[i]); + break; } + } + audit_log_end(ab); +} + static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) { const struct cred *cred; @@ -1372,13 +1395,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_execve_info(context, &ab, axi); break; } - case AUDIT_SOCKETCALL: { - struct audit_aux_data_socketcall *axs = (void *)aux; - audit_log_format(ab, "nargs=%d", axs->nargs); - for (i=0; inargs; i++) - audit_log_format(ab, " a%d=%lx", i, axs->args[i]); - break; } - case AUDIT_FD_PAIR: { struct audit_aux_data_fd_pair *axs = (void *)aux; audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]); @@ -1410,6 +1426,9 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_end(ab); } + if (context->type) + show_special(context); + if (context->sockaddr_len) { ab = audit_log_start(context, GFP_KERNEL, AUDIT_SOCKADDR); if (ab) { @@ -1689,6 +1708,7 @@ void audit_syscall_exit(int valid, long return_code) context->target_pid = 0; context->target_sid = 0; context->sockaddr_len = 0; + context->type = 0; kfree(context->filterkey); context->filterkey = NULL; tsk->audit_context = context; @@ -2406,27 +2426,17 @@ int audit_bprm(struct linux_binprm *bprm) * @nargs: number of args * @args: args array * - * Returns 0 for success or NULL context or < 0 on error. 
*/ -int audit_socketcall(int nargs, unsigned long *args) +void audit_socketcall(int nargs, unsigned long *args) { - struct audit_aux_data_socketcall *ax; struct audit_context *context = current->audit_context; if (likely(!context || context->dummy)) - return 0; - - ax = kmalloc(sizeof(*ax) + nargs * sizeof(unsigned long), GFP_KERNEL); - if (!ax) - return -ENOMEM; - - ax->nargs = nargs; - memcpy(ax->args, args, nargs * sizeof(unsigned long)); + return; - ax->d.type = AUDIT_SOCKETCALL; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; + context->type = AUDIT_SOCKETCALL; + context->socketcall.nargs = nargs; + memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long)); } /** -- cgit v1.2.3 From a33e6751003c5ade603737d828b1519d980ce392 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 10 Dec 2008 03:40:06 -0500 Subject: sanitize audit_ipc_obj() * get rid of allocations * make it return void * simplify callers Signed-off-by: Al Viro --- kernel/auditsc.c | 88 ++++++++++++++++++++++++-------------------------------- 1 file changed, 37 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 5cda66466e1..73504313264 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -247,6 +247,12 @@ struct audit_context { int nargs; long args[6]; } socketcall; + struct { + uid_t uid; + gid_t gid; + mode_t mode; + u32 osid; + } ipc; }; #if AUDIT_DEBUG @@ -605,19 +611,12 @@ static int audit_filter_rules(struct task_struct *tsk, } } /* Find ipc objects that match */ - if (ctx) { - struct audit_aux_data *aux; - for (aux = ctx->aux; aux; - aux = aux->next) { - if (aux->type == AUDIT_IPC) { - struct audit_aux_data_ipcctl *axi = (void *)aux; - if (security_audit_rule_match(axi->osid, f->type, f->op, f->lsm_rule, ctx)) { - ++result; - break; - } - } - } - } + if (!ctx || ctx->type != AUDIT_IPC) + break; + if (security_audit_rule_match(ctx->ipc.osid, + f->type, f->op, + f->lsm_rule, ctx)) + ++result; } break; case AUDIT_ARG0: @@ -1228,7 +1227,7 @@ static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) audit_log_format(ab, " cap_fe=%d cap_fver=%x", name->fcap.fE, name->fcap_ver); } -static void show_special(struct audit_context *context) +static void show_special(struct audit_context *context, int *call_panic) { struct audit_buffer *ab; int i; @@ -1245,6 +1244,23 @@ static void show_special(struct audit_context *context) audit_log_format(ab, " a%d=%lx", i, context->socketcall.args[i]); break; } + case AUDIT_IPC: { + u32 osid = context->ipc.osid; + + audit_log_format(ab, "ouid=%u ogid=%u mode=%#o", + context->ipc.uid, context->ipc.gid, context->ipc.mode); + if (osid) { + char *ctx = NULL; + u32 len; + if (security_secid_to_secctx(osid, &ctx, &len)) { + audit_log_format(ab, " osid=%u", osid); + *call_panic = 1; + } else { + audit_log_format(ab, " obj=%s", ctx); + security_release_secctx(ctx, len); + } + } + break; } } audit_log_end(ab); } @@ -1363,26 +1379,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts axi->mqstat.mq_msgsize, axi->mqstat.mq_curmsgs); break; } - case AUDIT_IPC: { - struct audit_aux_data_ipcctl *axi = (void *)aux; - audit_log_format(ab, - "ouid=%u ogid=%u mode=%#o", - axi->uid, axi->gid, axi->mode); - if (axi->osid != 0) { - char *ctx = NULL; - u32 len; - if (security_secid_to_secctx( - axi->osid, &ctx, &len)) { - audit_log_format(ab, " osid=%u", - axi->osid); - call_panic = 1; - } else { - audit_log_format(ab, " obj=%s", ctx); - security_release_secctx(ctx, len); 
- } - } - break; } - case AUDIT_IPC_SET_PERM: { struct audit_aux_data_ipcctl *axi = (void *)aux; audit_log_format(ab, @@ -1427,7 +1423,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts } if (context->type) - show_special(context); + show_special(context, &call_panic); if (context->sockaddr_len) { ab = audit_log_start(context, GFP_KERNEL, AUDIT_SOCKADDR); @@ -2349,25 +2345,15 @@ int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) * audit_ipc_obj - record audit data for ipc object * @ipcp: ipc permissions * - * Returns 0 for success or NULL context or < 0 on error. */ -int __audit_ipc_obj(struct kern_ipc_perm *ipcp) +void __audit_ipc_obj(struct kern_ipc_perm *ipcp) { - struct audit_aux_data_ipcctl *ax; struct audit_context *context = current->audit_context; - - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); - if (!ax) - return -ENOMEM; - - ax->uid = ipcp->uid; - ax->gid = ipcp->gid; - ax->mode = ipcp->mode; - security_ipc_getsecid(ipcp, &ax->osid); - ax->d.type = AUDIT_IPC; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; + context->ipc.uid = ipcp->uid; + context->ipc.gid = ipcp->gid; + context->ipc.mode = ipcp->mode; + security_ipc_getsecid(ipcp, &context->ipc.osid); + context->type = AUDIT_IPC; } /** -- cgit v1.2.3 From e816f370cbadd2afea9f1a42f232d0636137d563 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 10 Dec 2008 03:47:15 -0500 Subject: sanitize audit_ipc_set_perm() * get rid of allocations * make it return void * simplify callers Signed-off-by: Al Viro --- kernel/auditsc.c | 59 +++++++++++++++++++++++++------------------------------- 1 file changed, 26 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 73504313264..fbed62e05bc 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -151,16 +151,6 @@ struct audit_aux_data_mq_getsetattr { struct mq_attr mqstat; }; -struct audit_aux_data_ipcctl { - struct audit_aux_data d; - struct ipc_perm p; - unsigned long qbytes; - uid_t uid; - gid_t gid; - mode_t mode; - u32 osid; -}; - struct audit_aux_data_execve { struct audit_aux_data d; int argc; @@ -252,6 +242,11 @@ struct audit_context { gid_t gid; mode_t mode; u32 osid; + int has_perm; + uid_t perm_uid; + gid_t perm_gid; + mode_t perm_mode; + unsigned long qbytes; } ipc; }; @@ -1260,6 +1255,19 @@ static void show_special(struct audit_context *context, int *call_panic) security_release_secctx(ctx, len); } } + if (context->ipc.has_perm) { + audit_log_end(ab); + ab = audit_log_start(context, GFP_KERNEL, + AUDIT_IPC_SET_PERM); + audit_log_format(ab, + "qbytes=%lx ouid=%u ogid=%u mode=%#o", + context->ipc.qbytes, + context->ipc.perm_uid, + context->ipc.perm_gid, + context->ipc.perm_mode); + if (!ab) + return; + } break; } } audit_log_end(ab); @@ -1379,13 +1387,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts axi->mqstat.mq_msgsize, axi->mqstat.mq_curmsgs); break; } - case AUDIT_IPC_SET_PERM: { - struct audit_aux_data_ipcctl *axi = (void *)aux; - audit_log_format(ab, - "qbytes=%lx ouid=%u ogid=%u mode=%#o", - axi->qbytes, axi->uid, axi->gid, axi->mode); - break; } - case AUDIT_EXECVE: { struct audit_aux_data_execve *axi = (void *)aux; audit_log_execve_info(context, &ab, axi); @@ -2352,6 +2353,7 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp) context->ipc.uid = ipcp->uid; context->ipc.gid = ipcp->gid; context->ipc.mode = ipcp->mode; + context->ipc.has_perm = 0; security_ipc_getsecid(ipcp, &context->ipc.osid); context->type = AUDIT_IPC; } 
@@ -2363,26 +2365,17 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp) * @gid: msgq group id * @mode: msgq mode (permissions) * - * Returns 0 for success or NULL context or < 0 on error. + * Called only after audit_ipc_obj(). */ -int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) +void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) { - struct audit_aux_data_ipcctl *ax; struct audit_context *context = current->audit_context; - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); - if (!ax) - return -ENOMEM; - - ax->qbytes = qbytes; - ax->uid = uid; - ax->gid = gid; - ax->mode = mode; - - ax->d.type = AUDIT_IPC_SET_PERM; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; + context->ipc.qbytes = qbytes; + context->ipc.perm_uid = uid; + context->ipc.perm_gid = gid; + context->ipc.perm_mode = mode; + context->ipc.has_perm = 1; } int audit_bprm(struct linux_binprm *bprm) -- cgit v1.2.3 From 7392906ea915b9a2c14dea32b3604b4e178f82f7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 10 Dec 2008 06:58:59 -0500 Subject: sanitize audit_mq_getsetattr() * get rid of allocations * make it return void * don't duplicate parts of audit_dummy_context() Signed-off-by: Al Viro --- kernel/auditsc.c | 54 +++++++++++++++++------------------------------------- 1 file changed, 17 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index fbed62e05bc..c50178c7e24 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -145,12 +145,6 @@ struct audit_aux_data_mq_notify { struct sigevent notification; }; -struct audit_aux_data_mq_getsetattr { - struct audit_aux_data d; - mqd_t mqdes; - struct mq_attr mqstat; -}; - struct audit_aux_data_execve { struct audit_aux_data d; int argc; @@ -248,6 +242,10 @@ struct audit_context { mode_t perm_mode; unsigned long qbytes; } ipc; + struct { + mqd_t mqdes; + struct mq_attr mqstat; + } mq_getsetattr; }; #if AUDIT_DEBUG @@ -1269,6 +1267,15 @@ static void show_special(struct audit_context *context, int *call_panic) return; } break; } + case AUDIT_MQ_GETSETATTR: { + struct mq_attr *attr = &context->mq_getsetattr.mqstat; + audit_log_format(ab, + "mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld " + "mq_curmsgs=%ld ", + context->mq_getsetattr.mqdes, + attr->mq_flags, attr->mq_maxmsg, + attr->mq_msgsize, attr->mq_curmsgs); + break; } } audit_log_end(ab); } @@ -1377,16 +1384,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts axi->notification.sigev_signo); break; } - case AUDIT_MQ_GETSETATTR: { - struct audit_aux_data_mq_getsetattr *axi = (void *)aux; - audit_log_format(ab, - "mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld " - "mq_curmsgs=%ld ", - axi->mqdes, - axi->mqstat.mq_flags, axi->mqstat.mq_maxmsg, - axi->mqstat.mq_msgsize, axi->mqstat.mq_curmsgs); - break; } - case AUDIT_EXECVE: { struct audit_aux_data_execve *axi = (void *)aux; audit_log_execve_info(context, &ab, axi); @@ -2316,30 +2313,13 @@ int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification) * @mqdes: MQ descriptor * @mqstat: MQ flags * - * Returns 0 for success or NULL context or < 0 on error. 
*/ -int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) +void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) { - struct audit_aux_data_mq_getsetattr *ax; struct audit_context *context = current->audit_context; - - if (!audit_enabled) - return 0; - - if (likely(!context)) - return 0; - - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); - if (!ax) - return -ENOMEM; - - ax->mqdes = mqdes; - ax->mqstat = *mqstat; - - ax->d.type = AUDIT_MQ_GETSETATTR; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; + context->mq_getsetattr.mqdes = mqdes; + context->mq_getsetattr.mqstat = *mqstat; + context->type = AUDIT_MQ_GETSETATTR; } /** -- cgit v1.2.3 From 20114f71b27cafeb7c7e41d2b0f0b68c3fbb022b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 10 Dec 2008 07:16:12 -0500 Subject: sanitize audit_mq_notify() * don't copy_from_user() twice * don't bother with allocations * don't duplicate parts of audit_dummy_context() * make it return void Signed-off-by: Al Viro --- kernel/auditsc.c | 56 ++++++++++++++++---------------------------------------- 1 file changed, 16 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index c50178c7e24..3ece960de89 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -139,12 +139,6 @@ struct audit_aux_data_mq_sendrecv { struct timespec abs_timeout; }; -struct audit_aux_data_mq_notify { - struct audit_aux_data d; - mqd_t mqdes; - struct sigevent notification; -}; - struct audit_aux_data_execve { struct audit_aux_data d; int argc; @@ -246,6 +240,10 @@ struct audit_context { mqd_t mqdes; struct mq_attr mqstat; } mq_getsetattr; + struct { + mqd_t mqdes; + int sigev_signo; + } mq_notify; }; #if AUDIT_DEBUG @@ -1267,6 +1265,11 @@ static void show_special(struct audit_context *context, int *call_panic) return; } break; } + case AUDIT_MQ_NOTIFY: { + audit_log_format(ab, "mqdes=%d sigev_signo=%d", + context->mq_notify.mqdes, + context->mq_notify.sigev_signo); + break; } case AUDIT_MQ_GETSETATTR: { struct mq_attr *attr = &context->mq_getsetattr.mqstat; audit_log_format(ab, @@ -1376,14 +1379,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts axi->abs_timeout.tv_sec, axi->abs_timeout.tv_nsec); break; } - case AUDIT_MQ_NOTIFY: { - struct audit_aux_data_mq_notify *axi = (void *)aux; - audit_log_format(ab, - "mqdes=%d sigev_signo=%d", - axi->mqdes, - axi->notification.sigev_signo); - break; } - case AUDIT_EXECVE: { struct audit_aux_data_execve *axi = (void *)aux; audit_log_execve_info(context, &ab, axi); @@ -2274,38 +2269,19 @@ int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len, * @mqdes: MQ descriptor * @u_notification: Notification event * - * Returns 0 for success or NULL context or < 0 on error. 
*/ -int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification) +void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification) { - struct audit_aux_data_mq_notify *ax; struct audit_context *context = current->audit_context; - if (!audit_enabled) - return 0; - - if (likely(!context)) - return 0; - - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); - if (!ax) - return -ENOMEM; - - if (u_notification != NULL) { - if (copy_from_user(&ax->notification, u_notification, sizeof(ax->notification))) { - kfree(ax); - return -EFAULT; - } - } else - memset(&ax->notification, 0, sizeof(ax->notification)); - - ax->mqdes = mqdes; + if (notification) + context->mq_notify.sigev_signo = notification->sigev_signo; + else + context->mq_notify.sigev_signo = 0; - ax->d.type = AUDIT_MQ_NOTIFY; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; + context->mq_notify.mqdes = mqdes; + context->type = AUDIT_MQ_NOTIFY; } /** -- cgit v1.2.3 From c32c8af43b9adde8d6f938d8e6328c13b8de79ac Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 Dec 2008 03:46:48 -0500 Subject: sanitize AUDIT_MQ_SENDRECV * logging the original value of *msg_prio in mq_timedreceive(2) is insane - the argument is write-only (i.e. syscall always ignores the original value and only overwrites it). * merge __audit_mq_timed{send,receive} * don't do copy_from_user() twice * don't mess with allocations in auditsc part * ... and don't bother checking !audit_enabled and !context in there - we'd already checked for audit_dummy_context(). Signed-off-by: Al Viro --- kernel/auditsc.c | 127 +++++++++++++------------------------------------------ 1 file changed, 29 insertions(+), 98 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3ece960de89..140c4745347 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -131,14 +131,6 @@ struct audit_aux_data_mq_open { struct mq_attr attr; }; -struct audit_aux_data_mq_sendrecv { - struct audit_aux_data d; - mqd_t mqdes; - size_t msg_len; - unsigned int msg_prio; - struct timespec abs_timeout; -}; - struct audit_aux_data_execve { struct audit_aux_data d; int argc; @@ -244,6 +236,12 @@ struct audit_context { mqd_t mqdes; int sigev_signo; } mq_notify; + struct { + mqd_t mqdes; + size_t msg_len; + unsigned int msg_prio; + struct timespec abs_timeout; + } mq_sendrecv; }; #if AUDIT_DEBUG @@ -1265,6 +1263,16 @@ static void show_special(struct audit_context *context, int *call_panic) return; } break; } + case AUDIT_MQ_SENDRECV: { + audit_log_format(ab, + "mqdes=%d msg_len=%zd msg_prio=%u " + "abs_timeout_sec=%ld abs_timeout_nsec=%ld", + context->mq_sendrecv.mqdes, + context->mq_sendrecv.msg_len, + context->mq_sendrecv.msg_prio, + context->mq_sendrecv.abs_timeout.tv_sec, + context->mq_sendrecv.abs_timeout.tv_nsec); + break; } case AUDIT_MQ_NOTIFY: { audit_log_format(ab, "mqdes=%d sigev_signo=%d", context->mq_notify.mqdes, @@ -1370,15 +1378,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts axi->attr.mq_curmsgs); break; } - case AUDIT_MQ_SENDRECV: { - struct audit_aux_data_mq_sendrecv *axi = (void *)aux; - audit_log_format(ab, - "mqdes=%d msg_len=%zd msg_prio=%u " - "abs_timeout_sec=%ld abs_timeout_nsec=%ld", - axi->mqdes, axi->msg_len, axi->msg_prio, - axi->abs_timeout.tv_sec, axi->abs_timeout.tv_nsec); - break; } - case AUDIT_EXECVE: { struct audit_aux_data_execve *axi = (void *)aux; audit_log_execve_info(context, &ab, axi); @@ -2171,97 +2170,29 @@ int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user 
*u_attr) } /** - * __audit_mq_timedsend - record audit data for a POSIX MQ timed send + * __audit_mq_sendrecv - record audit data for a POSIX MQ timed send/receive * @mqdes: MQ descriptor * @msg_len: Message length * @msg_prio: Message priority - * @u_abs_timeout: Message timeout in absolute time - * - * Returns 0 for success or NULL context or < 0 on error. - */ -int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, - const struct timespec __user *u_abs_timeout) -{ - struct audit_aux_data_mq_sendrecv *ax; - struct audit_context *context = current->audit_context; - - if (!audit_enabled) - return 0; - - if (likely(!context)) - return 0; - - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); - if (!ax) - return -ENOMEM; - - if (u_abs_timeout != NULL) { - if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) { - kfree(ax); - return -EFAULT; - } - } else - memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout)); - - ax->mqdes = mqdes; - ax->msg_len = msg_len; - ax->msg_prio = msg_prio; - - ax->d.type = AUDIT_MQ_SENDRECV; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; -} - -/** - * __audit_mq_timedreceive - record audit data for a POSIX MQ timed receive - * @mqdes: MQ descriptor - * @msg_len: Message length - * @u_msg_prio: Message priority - * @u_abs_timeout: Message timeout in absolute time + * @abs_timeout: Message timeout in absolute time * - * Returns 0 for success or NULL context or < 0 on error. */ -int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len, - unsigned int __user *u_msg_prio, - const struct timespec __user *u_abs_timeout) +void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, + const struct timespec *abs_timeout) { - struct audit_aux_data_mq_sendrecv *ax; struct audit_context *context = current->audit_context; + struct timespec *p = &context->mq_sendrecv.abs_timeout; - if (!audit_enabled) - return 0; - - if (likely(!context)) - return 0; - - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); - if (!ax) - return -ENOMEM; - - if (u_msg_prio != NULL) { - if (get_user(ax->msg_prio, u_msg_prio)) { - kfree(ax); - return -EFAULT; - } - } else - ax->msg_prio = 0; - - if (u_abs_timeout != NULL) { - if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) { - kfree(ax); - return -EFAULT; - } - } else - memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout)); + if (abs_timeout) + memcpy(p, abs_timeout, sizeof(struct timespec)); + else + memset(p, 0, sizeof(struct timespec)); - ax->mqdes = mqdes; - ax->msg_len = msg_len; + context->mq_sendrecv.mqdes = mqdes; + context->mq_sendrecv.msg_len = msg_len; + context->mq_sendrecv.msg_prio = msg_prio; - ax->d.type = AUDIT_MQ_SENDRECV; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; + context->type = AUDIT_MQ_SENDRECV; } /** -- cgit v1.2.3 From 564f6993ffef656aebaf46cf2f1f6cb4f5c97207 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 Dec 2008 04:02:26 -0500 Subject: sanitize audit_mq_open() * don't bother with allocations * don't do double copy_from_user() * don't duplicate parts of check for audit_dummy_context() Signed-off-by: Al Viro --- kernel/auditsc.c | 65 ++++++++++++++++++++------------------------------------ 1 file changed, 23 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 140c4745347..83e946f1cdd 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -124,13 +124,6 @@ struct audit_aux_data { /* Number of target pids per aux struct. 
*/ #define AUDIT_AUX_PIDS 16 -struct audit_aux_data_mq_open { - struct audit_aux_data d; - int oflag; - mode_t mode; - struct mq_attr attr; -}; - struct audit_aux_data_execve { struct audit_aux_data d; int argc; @@ -242,6 +235,11 @@ struct audit_context { unsigned int msg_prio; struct timespec abs_timeout; } mq_sendrecv; + struct { + int oflag; + mode_t mode; + struct mq_attr attr; + } mq_open; }; #if AUDIT_DEBUG @@ -1263,6 +1261,16 @@ static void show_special(struct audit_context *context, int *call_panic) return; } break; } + case AUDIT_MQ_OPEN: { + audit_log_format(ab, + "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld " + "mq_msgsize=%ld mq_curmsgs=%ld", + context->mq_open.oflag, context->mq_open.mode, + context->mq_open.attr.mq_flags, + context->mq_open.attr.mq_maxmsg, + context->mq_open.attr.mq_msgsize, + context->mq_open.attr.mq_curmsgs); + break; } case AUDIT_MQ_SENDRECV: { audit_log_format(ab, "mqdes=%d msg_len=%zd msg_prio=%u " @@ -1368,15 +1376,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts continue; /* audit_panic has been called */ switch (aux->type) { - case AUDIT_MQ_OPEN: { - struct audit_aux_data_mq_open *axi = (void *)aux; - audit_log_format(ab, - "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld " - "mq_msgsize=%ld mq_curmsgs=%ld", - axi->oflag, axi->mode, axi->attr.mq_flags, - axi->attr.mq_maxmsg, axi->attr.mq_msgsize, - axi->attr.mq_curmsgs); - break; } case AUDIT_EXECVE: { struct audit_aux_data_execve *axi = (void *)aux; @@ -2135,38 +2134,20 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid) * @mode: mode bits * @u_attr: queue attributes * - * Returns 0 for success or NULL context or < 0 on error. */ -int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr) +void __audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr) { - struct audit_aux_data_mq_open *ax; struct audit_context *context = current->audit_context; - if (!audit_enabled) - return 0; - - if (likely(!context)) - return 0; - - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); - if (!ax) - return -ENOMEM; - - if (u_attr != NULL) { - if (copy_from_user(&ax->attr, u_attr, sizeof(ax->attr))) { - kfree(ax); - return -EFAULT; - } - } else - memset(&ax->attr, 0, sizeof(ax->attr)); + if (attr) + memcpy(&context->mq_open.attr, attr, sizeof(struct mq_attr)); + else + memset(&context->mq_open.attr, 0, sizeof(struct mq_attr)); - ax->oflag = oflag; - ax->mode = mode; + context->mq_open.oflag = oflag; + context->mq_open.mode = mode; - ax->d.type = AUDIT_MQ_OPEN; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; + context->type = AUDIT_MQ_OPEN; } /** -- cgit v1.2.3 From 157cf649a735a2f7e8dba0ed08e6e38b6c30d886 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 Dec 2008 04:57:47 -0500 Subject: sanitize audit_fd_pair() * no allocations * return void Signed-off-by: Al Viro --- kernel/auditsc.c | 44 ++++++++++++++------------------------------ 1 file changed, 14 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 83e946f1cdd..327e65d5067 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -131,11 +131,6 @@ struct audit_aux_data_execve { struct mm_struct *mm; }; -struct audit_aux_data_fd_pair { - struct audit_aux_data d; - int fd[2]; -}; - struct audit_aux_data_pids { struct audit_aux_data d; pid_t target_pid[AUDIT_AUX_PIDS]; @@ -241,6 +236,7 @@ struct audit_context { struct mq_attr attr; } mq_open; }; + int fds[2]; #if AUDIT_DEBUG int put_count; @@ -1382,11 +1378,6 @@ static void 
audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_execve_info(context, &ab, axi); break; } - case AUDIT_FD_PAIR: { - struct audit_aux_data_fd_pair *axs = (void *)aux; - audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]); - break; } - case AUDIT_BPRM_FCAPS: { struct audit_aux_data_bprm_fcaps *axs = (void *)aux; audit_log_format(ab, "fver=%x", axs->fcap_ver); @@ -1416,6 +1407,15 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts if (context->type) show_special(context, &call_panic); + if (context->fds[0] >= 0) { + ab = audit_log_start(context, GFP_KERNEL, AUDIT_FD_PAIR); + if (ab) { + audit_log_format(ab, "fd0=%d fd1=%d", + context->fds[0], context->fds[1]); + audit_log_end(ab); + } + } + if (context->sockaddr_len) { ab = audit_log_start(context, GFP_KERNEL, AUDIT_SOCKADDR); if (ab) { @@ -1696,6 +1696,7 @@ void audit_syscall_exit(int valid, long return_code) context->target_sid = 0; context->sockaddr_len = 0; context->type = 0; + context->fds[0] = -1; kfree(context->filterkey); context->filterkey = NULL; tsk->audit_context = context; @@ -2291,29 +2292,12 @@ void audit_socketcall(int nargs, unsigned long *args) * @fd1: the first file descriptor * @fd2: the second file descriptor * - * Returns 0 for success or NULL context or < 0 on error. */ -int __audit_fd_pair(int fd1, int fd2) +void __audit_fd_pair(int fd1, int fd2) { struct audit_context *context = current->audit_context; - struct audit_aux_data_fd_pair *ax; - - if (likely(!context)) { - return 0; - } - - ax = kmalloc(sizeof(*ax), GFP_KERNEL); - if (!ax) { - return -ENOMEM; - } - - ax->fd[0] = fd1; - ax->fd[1] = fd2; - - ax->d.type = AUDIT_FD_PAIR; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; + context->fds[0] = fd1; + context->fds[1] = fd2; } /** -- cgit v1.2.3 From 57f71a0af4244d9ba3c0bce74b1d2e66e8d520bd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 4 Jan 2009 14:52:57 -0500 Subject: sanitize audit_log_capset() * no allocations * return void * don't duplicate checked for dummy context Signed-off-by: Al Viro --- kernel/auditsc.c | 44 ++++++++++++++++---------------------------- kernel/capability.c | 4 +--- 2 files changed, 17 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 327e65d5067..c76a58215f5 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -235,6 +235,10 @@ struct audit_context { mode_t mode; struct mq_attr attr; } mq_open; + struct { + pid_t pid; + struct audit_cap_data cap; + } capset; }; int fds[2]; @@ -1291,6 +1295,12 @@ static void show_special(struct audit_context *context, int *call_panic) attr->mq_flags, attr->mq_maxmsg, attr->mq_msgsize, attr->mq_curmsgs); break; } + case AUDIT_CAPSET: { + audit_log_format(ab, "pid=%d", context->capset.pid); + audit_log_cap(ab, "cap_pi", &context->capset.cap.inheritable); + audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted); + audit_log_cap(ab, "cap_pe", &context->capset.cap.effective); + break; } } audit_log_end(ab); } @@ -1392,14 +1402,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_cap(ab, "new_pe", &axs->new_pcap.effective); break; } - case AUDIT_CAPSET: { - struct audit_aux_data_capset *axs = (void *)aux; - audit_log_format(ab, "pid=%d", axs->pid); - audit_log_cap(ab, "cap_pi", &axs->cap.inheritable); - audit_log_cap(ab, "cap_pp", &axs->cap.permitted); - audit_log_cap(ab, "cap_pe", &axs->cap.effective); - break; } - } audit_log_end(ab); } @@ -2456,29 +2458,15 @@ 
int __audit_log_bprm_fcaps(struct linux_binprm *bprm, * Record the aguments userspace sent to sys_capset for later printing by the * audit system if applicable */ -int __audit_log_capset(pid_t pid, +void __audit_log_capset(pid_t pid, const struct cred *new, const struct cred *old) { - struct audit_aux_data_capset *ax; struct audit_context *context = current->audit_context; - - if (likely(!audit_enabled || !context || context->dummy)) - return 0; - - ax = kmalloc(sizeof(*ax), GFP_KERNEL); - if (!ax) - return -ENOMEM; - - ax->d.type = AUDIT_CAPSET; - ax->d.next = context->aux; - context->aux = (void *)ax; - - ax->pid = pid; - ax->cap.effective = new->cap_effective; - ax->cap.inheritable = new->cap_effective; - ax->cap.permitted = new->cap_permitted; - - return 0; + context->capset.pid = pid; + context->capset.cap.effective = new->cap_effective; + context->capset.cap.inheritable = new->cap_effective; + context->capset.cap.permitted = new->cap_permitted; + context->type = AUDIT_CAPSET; } /** diff --git a/kernel/capability.c b/kernel/capability.c index 36b4b4daebe..c598d9d5be4 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -280,9 +280,7 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) if (ret < 0) goto error; - ret = audit_log_capset(pid, new, current_cred()); - if (ret < 0) - return ret; + audit_log_capset(pid, new, current_cred()); return commit_creds(new); -- cgit v1.2.3 From 1a9d0797b8977d413435277bf9661efbbd584693 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 Dec 2008 12:04:02 -0500 Subject: audit_update_lsm_rules() misses the audit_inode_hash[] ones Signed-off-by: Al Viro --- kernel/auditfilter.c | 77 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 47 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 9fd85a4640a..0febaa0f784 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1778,6 +1778,41 @@ unlock_and_return: return result; } +static int update_lsm_rule(struct audit_entry *entry) +{ + struct audit_entry *nentry; + struct audit_watch *watch; + struct audit_tree *tree; + int err = 0; + + if (!security_audit_rule_known(&entry->rule)) + return 0; + + watch = entry->rule.watch; + tree = entry->rule.tree; + nentry = audit_dupe_rule(&entry->rule, watch); + if (IS_ERR(nentry)) { + /* save the first error encountered for the + * return value */ + err = PTR_ERR(nentry); + audit_panic("error updating LSM filters"); + if (watch) + list_del(&entry->rule.rlist); + list_del_rcu(&entry->list); + } else { + if (watch) { + list_add(&nentry->rule.rlist, &watch->rules); + list_del(&entry->rule.rlist); + } else if (tree) + list_replace_init(&entry->rule.rlist, + &nentry->rule.rlist); + list_replace_rcu(&entry->list, &nentry->list); + } + call_rcu(&entry->rcu, audit_free_rule_rcu); + + return err; +} + /* This function will re-initialize the lsm_rule field of all applicable rules. * It will traverse the filter lists serarching for rules that contain LSM * specific filter fields. When such a rule is found, it is copied, the @@ -1785,42 +1820,24 @@ unlock_and_return: * updated rule. 
*/ int audit_update_lsm_rules(void) { - struct audit_entry *entry, *n, *nentry; - struct audit_watch *watch; - struct audit_tree *tree; + struct audit_entry *e, *n; int i, err = 0; /* audit_filter_mutex synchronizes the writers */ mutex_lock(&audit_filter_mutex); for (i = 0; i < AUDIT_NR_FILTERS; i++) { - list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) { - if (!security_audit_rule_known(&entry->rule)) - continue; - - watch = entry->rule.watch; - tree = entry->rule.tree; - nentry = audit_dupe_rule(&entry->rule, watch); - if (IS_ERR(nentry)) { - /* save the first error encountered for the - * return value */ - if (!err) - err = PTR_ERR(nentry); - audit_panic("error updating LSM filters"); - if (watch) - list_del(&entry->rule.rlist); - list_del_rcu(&entry->list); - } else { - if (watch) { - list_add(&nentry->rule.rlist, - &watch->rules); - list_del(&entry->rule.rlist); - } else if (tree) - list_replace_init(&entry->rule.rlist, - &nentry->rule.rlist); - list_replace_rcu(&entry->list, &nentry->list); - } - call_rcu(&entry->rcu, audit_free_rule_rcu); + list_for_each_entry_safe(e, n, &audit_filter_list[i], list) { + int res = update_lsm_rule(e); + if (!err) + err = res; + } + } + for (i=0; i< AUDIT_INODE_BUCKETS; i++) { + list_for_each_entry_safe(e, n, &audit_inode_hash[i], list) { + int res = update_lsm_rule(e); + if (!err) + err = res; } } -- cgit v1.2.3 From 0590b9335a1c72a3f0defcc6231287f7817e07c8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 Dec 2008 23:45:27 -0500 Subject: fixing audit rule ordering mess, part 1 Problem: ordering between the rules on exit chain is currently lost; all watch and inode rules are listed after everything else _and_ exit,never on one kind doesn't stop exit,always on another from being matched. Solution: assign priorities to rules, keep track of the current highest-priority matching rule and its result (always/never). Signed-off-by: Al Viro --- kernel/audit.h | 5 +--- kernel/auditfilter.c | 17 +++++++++-- kernel/auditsc.c | 79 ++++++++++++++++++++++++++++------------------------ 3 files changed, 58 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.h b/kernel/audit.h index 9d6717412fe..16f18cac661 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -159,11 +159,8 @@ static inline int audit_signal_info(int sig, struct task_struct *t) return __audit_signal_info(sig, t); return 0; } -extern enum audit_state audit_filter_inodes(struct task_struct *, - struct audit_context *); -extern void audit_set_auditable(struct audit_context *); +extern void audit_filter_inodes(struct task_struct *, struct audit_context *); #else #define audit_signal_info(s,t) AUDIT_DISABLED #define audit_filter_inodes(t,c) AUDIT_DISABLED -#define audit_set_auditable(c) #endif diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 0febaa0f784..995a2e86808 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -919,6 +919,7 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old, new->action = old->action; for (i = 0; i < AUDIT_BITMASK_SIZE; i++) new->mask[i] = old->mask[i]; + new->prio = old->prio; new->buflen = old->buflen; new->inode_f = old->inode_f; new->watch = NULL; @@ -987,9 +988,8 @@ static void audit_update_watch(struct audit_parent *parent, /* If the update involves invalidating rules, do the inode-based * filtering now, so we don't omit records. 
*/ - if (invalidating && current->audit_context && - audit_filter_inodes(current, current->audit_context) == AUDIT_RECORD_CONTEXT) - audit_set_auditable(current->audit_context); + if (invalidating && current->audit_context) + audit_filter_inodes(current, current->audit_context); nwatch = audit_dupe_watch(owatch); if (IS_ERR(nwatch)) { @@ -1258,6 +1258,9 @@ static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp, return ret; } +static u64 prio_low = ~0ULL/2; +static u64 prio_high = ~0ULL/2 - 1; + /* Add rule to given filterlist if not a duplicate. */ static inline int audit_add_rule(struct audit_entry *entry, struct list_head *list) @@ -1319,6 +1322,14 @@ static inline int audit_add_rule(struct audit_entry *entry, } } + entry->rule.prio = ~0ULL; + if (entry->rule.listnr == AUDIT_FILTER_EXIT) { + if (entry->rule.flags & AUDIT_FILTER_PREPEND) + entry->rule.prio = ++prio_high; + else + entry->rule.prio = --prio_low; + } + if (entry->rule.flags & AUDIT_FILTER_PREPEND) { list_add_rcu(&entry->list, list); entry->rule.flags &= ~AUDIT_FILTER_PREPEND; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index c76a58215f5..19d2c2747c8 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -165,14 +165,14 @@ struct audit_tree_refs { struct audit_context { int dummy; /* must be the first element */ int in_syscall; /* 1 if task is in a syscall */ - enum audit_state state; + enum audit_state state, current_state; unsigned int serial; /* serial number for record */ struct timespec ctime; /* time of syscall entry */ int major; /* syscall number */ unsigned long argv[4]; /* syscall arguments */ int return_valid; /* return code is valid */ long return_code;/* syscall return code */ - int auditable; /* 1 if record should be written */ + u64 prio; int name_count; struct audit_names names[AUDIT_NAMES]; char * filterkey; /* key for rule that triggered record */ @@ -630,8 +630,16 @@ static int audit_filter_rules(struct task_struct *tsk, return 0; } } - if (rule->filterkey && ctx) - ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC); + + if (ctx) { + if (rule->prio <= ctx->prio) + return 0; + if (rule->filterkey) { + kfree(ctx->filterkey); + ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC); + } + ctx->prio = rule->prio; + } switch (rule->action) { case AUDIT_NEVER: *state = AUDIT_DISABLED; break; case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; @@ -685,6 +693,7 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, audit_filter_rules(tsk, &e->rule, ctx, NULL, &state)) { rcu_read_unlock(); + ctx->current_state = state; return state; } } @@ -698,15 +707,14 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, * buckets applicable to the inode numbers in audit_names[]. * Regarding audit_state, same rules apply as for audit_filter_syscall(). 
*/ -enum audit_state audit_filter_inodes(struct task_struct *tsk, - struct audit_context *ctx) +void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx) { int i; struct audit_entry *e; enum audit_state state; if (audit_pid && tsk->tgid == audit_pid) - return AUDIT_DISABLED; + return; rcu_read_lock(); for (i = 0; i < ctx->name_count; i++) { @@ -723,17 +731,20 @@ enum audit_state audit_filter_inodes(struct task_struct *tsk, if ((e->rule.mask[word] & bit) == bit && audit_filter_rules(tsk, &e->rule, ctx, n, &state)) { rcu_read_unlock(); - return state; + ctx->current_state = state; + return; } } } rcu_read_unlock(); - return AUDIT_BUILD_CONTEXT; } -void audit_set_auditable(struct audit_context *ctx) +static void audit_set_auditable(struct audit_context *ctx) { - ctx->auditable = 1; + if (!ctx->prio) { + ctx->prio = 1; + ctx->current_state = AUDIT_RECORD_CONTEXT; + } } static inline struct audit_context *audit_get_context(struct task_struct *tsk, @@ -764,23 +775,11 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, else context->return_code = return_code; - if (context->in_syscall && !context->dummy && !context->auditable) { - enum audit_state state; - - state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]); - if (state == AUDIT_RECORD_CONTEXT) { - context->auditable = 1; - goto get_context; - } - - state = audit_filter_inodes(tsk, context); - if (state == AUDIT_RECORD_CONTEXT) - context->auditable = 1; - + if (context->in_syscall && !context->dummy) { + audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]); + audit_filter_inodes(tsk, context); } -get_context: - tsk->audit_context = NULL; return context; } @@ -790,8 +789,7 @@ static inline void audit_free_names(struct audit_context *context) int i; #if AUDIT_DEBUG == 2 - if (context->auditable - ||context->put_count + context->ino_count != context->name_count) { + if (context->put_count + context->ino_count != context->name_count) { printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d" " name_count=%d put_count=%d" " ino_count=%d [NOT freeing]\n", @@ -842,6 +840,7 @@ static inline void audit_zero_context(struct audit_context *context, { memset(context, 0, sizeof(*context)); context->state = state; + context->prio = state == AUDIT_RECORD_CONTEXT ? 
~0ULL : 0; } static inline struct audit_context *audit_alloc_context(enum audit_state state) @@ -1543,7 +1542,7 @@ void audit_free(struct task_struct *tsk) * We use GFP_ATOMIC here because we might be doing this * in the context of the idle thread */ /* that can happen only if we are called from do_exit() */ - if (context->in_syscall && context->auditable) + if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) audit_log_exit(context, tsk); audit_free_context(context); @@ -1627,15 +1626,17 @@ void audit_syscall_entry(int arch, int major, state = context->state; context->dummy = !audit_n_rules; - if (!context->dummy && (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT)) + if (!context->dummy && state == AUDIT_BUILD_CONTEXT) { + context->prio = 0; state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]); + } if (likely(state == AUDIT_DISABLED)) return; context->serial = 0; context->ctime = CURRENT_TIME; context->in_syscall = 1; - context->auditable = !!(state == AUDIT_RECORD_CONTEXT); + context->current_state = state; context->ppid = 0; } @@ -1643,17 +1644,20 @@ void audit_finish_fork(struct task_struct *child) { struct audit_context *ctx = current->audit_context; struct audit_context *p = child->audit_context; - if (!p || !ctx || !ctx->auditable) + if (!p || !ctx) + return; + if (!ctx->in_syscall || ctx->current_state != AUDIT_RECORD_CONTEXT) return; p->arch = ctx->arch; p->major = ctx->major; memcpy(p->argv, ctx->argv, sizeof(ctx->argv)); p->ctime = ctx->ctime; p->dummy = ctx->dummy; - p->auditable = ctx->auditable; p->in_syscall = ctx->in_syscall; p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL); p->ppid = current->pid; + p->prio = ctx->prio; + p->current_state = ctx->current_state; } /** @@ -1677,11 +1681,11 @@ void audit_syscall_exit(int valid, long return_code) if (likely(!context)) return; - if (context->in_syscall && context->auditable) + if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) audit_log_exit(context, tsk); context->in_syscall = 0; - context->auditable = 0; + context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; if (context->previous) { struct audit_context *new_context = context->previous; @@ -2091,7 +2095,10 @@ int auditsc_get_stamp(struct audit_context *ctx, t->tv_sec = ctx->ctime.tv_sec; t->tv_nsec = ctx->ctime.tv_nsec; *serial = ctx->serial; - ctx->auditable = 1; + if (!ctx->prio) { + ctx->prio = 1; + ctx->current_state = AUDIT_RECORD_CONTEXT; + } return 1; } -- cgit v1.2.3 From e45aa212ea81d39b38ba158df344dc3a500153e5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 15 Dec 2008 01:17:50 -0500 Subject: audit rules ordering, part 2 Fix the actual rule listing; add per-type lists _not_ used for matching, with all exit,... sitting on one such list. Simplifies "do something for all rules" logics, while we are at it... 
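For illustration only (not code from these patches): the part 1 scheme boils down to the stand-alone C sketch below. Prepended rules take priorities counting up from the middle of the u64 range, appended rules count down, and among the rules that matched, the highest priority decides always/never. Every name in the sketch is invented for the example.

/* Minimal userspace model of the rule-priority idea; not kernel code. */
#include <stdint.h>

struct rule {
	uint64_t prio;
	int action;		/* 1 = always (record), 0 = never */
};

static uint64_t prio_low  = ~0ULL / 2;
static uint64_t prio_high = ~0ULL / 2 - 1;

/* Prepended rules climb above the midpoint, appended rules sink below it. */
static void assign_prio(struct rule *r, int prepend)
{
	r->prio = prepend ? ++prio_high : --prio_low;
}

/* Whichever matching rule carries the highest prio wins, regardless of
 * which list or hash bucket it was found on. */
static int decide(const struct rule *matched, int n)
{
	uint64_t best = 0;
	int action = -1;	/* -1: nothing matched */
	int i;

	for (i = 0; i < n; i++) {
		if (matched[i].prio <= best)
			continue;
		best = matched[i].prio;
		action = matched[i].action;
	}
	return action;
}

Priorities mirror the intended list order (the front of a list carries the larger values), which is how ordering between ordinary exit rules and watch/inode rules is preserved even though they live on different chains.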
Signed-off-by: Al Viro --- kernel/audit_tree.c | 1 + kernel/auditfilter.c | 95 +++++++++++++++++++++------------------------------- 2 files changed, 40 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 8b509441f49..48bddad2a3d 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -450,6 +450,7 @@ static void kill_rules(struct audit_tree *tree) audit_log_end(ab); rule->tree = NULL; list_del_rcu(&entry->list); + list_del(&entry->rule.list); call_rcu(&entry->rcu, audit_free_rule_rcu); } } diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 995a2e86808..5d4edc6f7a3 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -86,6 +86,14 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { #error Fix audit_filter_list initialiser #endif }; +static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = { + LIST_HEAD_INIT(audit_rules_list[0]), + LIST_HEAD_INIT(audit_rules_list[1]), + LIST_HEAD_INIT(audit_rules_list[2]), + LIST_HEAD_INIT(audit_rules_list[3]), + LIST_HEAD_INIT(audit_rules_list[4]), + LIST_HEAD_INIT(audit_rules_list[5]), +}; DEFINE_MUTEX(audit_filter_mutex); @@ -1007,12 +1015,15 @@ static void audit_update_watch(struct audit_parent *parent, list_del_rcu(&oentry->list); nentry = audit_dupe_rule(&oentry->rule, nwatch); - if (IS_ERR(nentry)) + if (IS_ERR(nentry)) { + list_del(&oentry->rule.list); audit_panic("error updating watch, removing"); - else { + } else { int h = audit_hash_ino((u32)ino); list_add(&nentry->rule.rlist, &nwatch->rules); list_add_rcu(&nentry->list, &audit_inode_hash[h]); + list_replace(&oentry->rule.list, + &nentry->rule.list); } call_rcu(&oentry->rcu, audit_free_rule_rcu); @@ -1077,6 +1088,7 @@ static void audit_remove_parent_watches(struct audit_parent *parent) audit_log_end(ab); } list_del(&r->rlist); + list_del(&r->list); list_del_rcu(&e->list); call_rcu(&e->rcu, audit_free_rule_rcu); } @@ -1331,9 +1343,13 @@ static inline int audit_add_rule(struct audit_entry *entry, } if (entry->rule.flags & AUDIT_FILTER_PREPEND) { + list_add(&entry->rule.list, + &audit_rules_list[entry->rule.listnr]); list_add_rcu(&entry->list, list); entry->rule.flags &= ~AUDIT_FILTER_PREPEND; } else { + list_add_tail(&entry->rule.list, + &audit_rules_list[entry->rule.listnr]); list_add_tail_rcu(&entry->list, list); } #ifdef CONFIG_AUDITSYSCALL @@ -1415,6 +1431,7 @@ static inline int audit_del_rule(struct audit_entry *entry, audit_remove_tree_rule(&e->rule); list_del_rcu(&e->list); + list_del(&e->rule.list); call_rcu(&e->rcu, audit_free_rule_rcu); #ifdef CONFIG_AUDITSYSCALL @@ -1443,30 +1460,16 @@ out: static void audit_list(int pid, int seq, struct sk_buff_head *q) { struct sk_buff *skb; - struct audit_entry *entry; + struct audit_krule *r; int i; /* This is a blocking read, so use audit_filter_mutex instead of rcu * iterator to sync with list writers. 
*/ for (i=0; i<AUDIT_NR_FILTERS; i++) { - list_for_each_entry(entry, &audit_filter_list[i], list) { - struct audit_rule *rule; - rule = audit_krule_to_rule(&entry->rule); - if (unlikely(!rule)) - break; - skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1, - rule, sizeof(*rule)); - if (skb) - skb_queue_tail(q, skb); - kfree(rule); - } - } - for (i = 0; i < AUDIT_INODE_BUCKETS; i++) { - list_for_each_entry(entry, &audit_inode_hash[i], list) { + list_for_each_entry(r, &audit_rules_list[i], list) { struct audit_rule *rule; - rule = audit_krule_to_rule(&entry->rule); + rule = audit_krule_to_rule(r); if (unlikely(!rule)) break; skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1, @@ -1485,30 +1488,16 @@ static void audit_list(int pid, int seq, struct sk_buff_head *q) static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) { struct sk_buff *skb; - struct audit_entry *e; + struct audit_krule *r; int i; /* This is a blocking read, so use audit_filter_mutex instead of rcu * iterator to sync with list writers. */ for (i=0; i<AUDIT_NR_FILTERS; i++) { - list_for_each_entry(e, &audit_filter_list[i], list) { - struct audit_rule_data *data; - data = audit_krule_to_data(&e->rule); - if (unlikely(!data)) - break; - skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, - data, sizeof(*data) + data->buflen); - if (skb) - skb_queue_tail(q, skb); - kfree(data); - } - } - for (i=0; i< AUDIT_INODE_BUCKETS; i++) { - list_for_each_entry(e, &audit_inode_hash[i], list) { + list_for_each_entry(r, &audit_rules_list[i], list) { struct audit_rule_data *data; - data = audit_krule_to_data(&e->rule); + data = audit_krule_to_data(r); if (unlikely(!data)) break; skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, @@ -1789,35 +1778,37 @@ unlock_and_return: return result; } -static int update_lsm_rule(struct audit_entry *entry) +static int update_lsm_rule(struct audit_krule *r) { + struct audit_entry *entry = container_of(r, struct audit_entry, rule); struct audit_entry *nentry; struct audit_watch *watch; struct audit_tree *tree; int err = 0; - if (!security_audit_rule_known(&entry->rule)) + if (!security_audit_rule_known(r)) return 0; - watch = entry->rule.watch; - tree = entry->rule.tree; - nentry = audit_dupe_rule(&entry->rule, watch); + watch = r->watch; + tree = r->tree; + nentry = audit_dupe_rule(r, watch); if (IS_ERR(nentry)) { /* save the first error encountered for the * return value */ err = PTR_ERR(nentry); audit_panic("error updating LSM filters"); if (watch) - list_del(&entry->rule.rlist); + list_del(&r->rlist); list_del_rcu(&entry->list); + list_del(&r->list); } else { if (watch) { list_add(&nentry->rule.rlist, &watch->rules); - list_del(&entry->rule.rlist); + list_del(&r->rlist); } else if (tree) - list_replace_init(&entry->rule.rlist, - &nentry->rule.rlist); + list_replace_init(&r->rlist, &nentry->rule.rlist); list_replace_rcu(&entry->list, &nentry->list); + list_replace(&r->list, &nentry->rule.list); } call_rcu(&entry->rcu, audit_free_rule_rcu); @@ -1831,27 +1822,19 @@ static int update_lsm_rule(struct audit_entry *entry) * updated rule.
*/ int audit_update_lsm_rules(void) { - struct audit_entry *e, *n; + struct audit_krule *r, *n; int i, err = 0; /* audit_filter_mutex synchronizes the writers */ mutex_lock(&audit_filter_mutex); for (i = 0; i < AUDIT_NR_FILTERS; i++) { - list_for_each_entry_safe(e, n, &audit_filter_list[i], list) { - int res = update_lsm_rule(e); - if (!err) - err = res; - } - } - for (i=0; i< AUDIT_INODE_BUCKETS; i++) { - list_for_each_entry_safe(e, n, &audit_inode_hash[i], list) { - int res = update_lsm_rule(e); + list_for_each_entry_safe(r, n, &audit_rules_list[i], list) { + int res = update_lsm_rule(r); if (!err) err = res; } } - mutex_unlock(&audit_filter_mutex); return err; -- cgit v1.2.3 From e048e02c89db7bd49d1a5fac77a11c8fb3603087 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 16 Dec 2008 03:51:22 -0500 Subject: make sure that filterkey of task,always rules is reported Signed-off-by: Al Viro --- kernel/auditsc.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 19d2c2747c8..8cbddff6c28 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -652,7 +652,7 @@ static int audit_filter_rules(struct task_struct *tsk, * completely disabled for this task. Since we only have the task * structure at this point, we can only check uid and gid. */ -static enum audit_state audit_filter_task(struct task_struct *tsk) +static enum audit_state audit_filter_task(struct task_struct *tsk, char **key) { struct audit_entry *e; enum audit_state state; @@ -660,6 +660,8 @@ static enum audit_state audit_filter_task(struct task_struct *tsk) rcu_read_lock(); list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) { if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) { + if (state == AUDIT_RECORD_CONTEXT) + *key = kstrdup(e->rule.filterkey, GFP_ATOMIC); rcu_read_unlock(); return state; } @@ -866,18 +868,21 @@ int audit_alloc(struct task_struct *tsk) { struct audit_context *context; enum audit_state state; + char *key = NULL; if (likely(!audit_ever_enabled)) return 0; /* Return if not auditing. */ - state = audit_filter_task(tsk); + state = audit_filter_task(tsk, &key); if (likely(state == AUDIT_DISABLED)) return 0; if (!(context = audit_alloc_context(state))) { + kfree(key); audit_log_lost("out of memory in audit_alloc"); return -ENOMEM; } + context->filterkey = key; tsk->audit_context = context; set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT); @@ -1703,8 +1708,10 @@ void audit_syscall_exit(int valid, long return_code) context->sockaddr_len = 0; context->type = 0; context->fds[0] = -1; - kfree(context->filterkey); - context->filterkey = NULL; + if (context->state != AUDIT_RECORD_CONTEXT) { + kfree(context->filterkey); + context->filterkey = NULL; + } tsk->audit_context = context; } } -- cgit v1.2.3 From 36c4f1b18c8a7d0adb4085e7f531860b837bb6b0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 15 Dec 2008 01:50:28 -0500 Subject: clean up audit_rule_{add,del} a bit Signed-off-by: Al Viro --- kernel/auditfilter.c | 42 +++++++++++++++++------------------------- 1 file changed, 17 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 5d4edc6f7a3..e6e3829cadd 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1114,12 +1114,16 @@ static void audit_inotify_unregister(struct list_head *in_list) /* Find an existing audit rule. * Caller must hold audit_filter_mutex to prevent stale rule data. 
*/ static struct audit_entry *audit_find_rule(struct audit_entry *entry, - struct list_head *list) + struct list_head **p) { struct audit_entry *e, *found = NULL; + struct list_head *list; int h; - if (entry->rule.watch) { + if (entry->rule.inode_f) { + h = audit_hash_ino(entry->rule.inode_f->val); + *p = list = &audit_inode_hash[h]; + } else if (entry->rule.watch) { /* we don't know the inode number, so must walk entire hash */ for (h = 0; h < AUDIT_INODE_BUCKETS; h++) { list = &audit_inode_hash[h]; @@ -1130,6 +1134,8 @@ static struct audit_entry *audit_find_rule(struct audit_entry *entry, } } goto out; + } else { + *p = list = &audit_filter_list[entry->rule.listnr]; } list_for_each_entry(e, list, list) @@ -1274,14 +1280,13 @@ static u64 prio_low = ~0ULL/2; static u64 prio_high = ~0ULL/2 - 1; /* Add rule to given filterlist if not a duplicate. */ -static inline int audit_add_rule(struct audit_entry *entry, - struct list_head *list) +static inline int audit_add_rule(struct audit_entry *entry) { struct audit_entry *e; - struct audit_field *inode_f = entry->rule.inode_f; struct audit_watch *watch = entry->rule.watch; struct audit_tree *tree = entry->rule.tree; struct nameidata *ndp = NULL, *ndw = NULL; + struct list_head *list; int h, err; #ifdef CONFIG_AUDITSYSCALL int dont_count = 0; @@ -1292,13 +1297,8 @@ static inline int audit_add_rule(struct audit_entry *entry, dont_count = 1; #endif - if (inode_f) { - h = audit_hash_ino(inode_f->val); - list = &audit_inode_hash[h]; - } - mutex_lock(&audit_filter_mutex); - e = audit_find_rule(entry, list); + e = audit_find_rule(entry, &list); mutex_unlock(&audit_filter_mutex); if (e) { err = -EEXIST; @@ -1372,15 +1372,14 @@ error: } /* Remove an existing rule from filterlist. */ -static inline int audit_del_rule(struct audit_entry *entry, - struct list_head *list) +static inline int audit_del_rule(struct audit_entry *entry) { struct audit_entry *e; - struct audit_field *inode_f = entry->rule.inode_f; struct audit_watch *watch, *tmp_watch = entry->rule.watch; struct audit_tree *tree = entry->rule.tree; + struct list_head *list; LIST_HEAD(inotify_list); - int h, ret = 0; + int ret = 0; #ifdef CONFIG_AUDITSYSCALL int dont_count = 0; @@ -1390,13 +1389,8 @@ static inline int audit_del_rule(struct audit_entry *entry, dont_count = 1; #endif - if (inode_f) { - h = audit_hash_ino(inode_f->val); - list = &audit_inode_hash[h]; - } - mutex_lock(&audit_filter_mutex); - e = audit_find_rule(entry, list); + e = audit_find_rule(entry, &list); if (!e) { mutex_unlock(&audit_filter_mutex); ret = -ENOENT; @@ -1603,8 +1597,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, if (IS_ERR(entry)) return PTR_ERR(entry); - err = audit_add_rule(entry, - &audit_filter_list[entry->rule.listnr]); + err = audit_add_rule(entry); audit_log_rule_change(loginuid, sessionid, sid, "add", &entry->rule, !err); @@ -1620,8 +1613,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, if (IS_ERR(entry)) return PTR_ERR(entry); - err = audit_del_rule(entry, - &audit_filter_list[entry->rule.listnr]); + err = audit_del_rule(entry); audit_log_rule_change(loginuid, sessionid, sid, "remove", &entry->rule, !err); -- cgit v1.2.3 From 5af75d8d58d0f9f7b7c0515b35786b22892d5f12 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 16 Dec 2008 05:59:26 -0500 Subject: audit: validate comparison operations, store them in sane form Don't store the field->op in the messy (and very inconvenient for e.g. 
audit_comparator()) form; translate to dense set of values and do full validation of userland-submitted value while we are at it. ->audit_init_rule() and ->audit_match_rule() get new values now; in-tree instances updated. Signed-off-by: Al Viro --- kernel/audit_tree.c | 2 +- kernel/auditfilter.c | 132 +++++++++++++++++++++++++-------------------------- 2 files changed, 66 insertions(+), 68 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 48bddad2a3d..8ad9545b8db 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -618,7 +618,7 @@ int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op) if (pathname[0] != '/' || rule->listnr != AUDIT_FILTER_EXIT || - op & ~AUDIT_EQUAL || + op != Audit_equal || rule->inode_f || rule->watch || rule->tree) return -EINVAL; rule->tree = alloc_tree(pathname); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index e6e3829cadd..fbf24d121d9 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -252,7 +252,8 @@ static inline int audit_to_inode(struct audit_krule *krule, struct audit_field *f) { if (krule->listnr != AUDIT_FILTER_EXIT || - krule->watch || krule->inode_f || krule->tree) + krule->watch || krule->inode_f || krule->tree || + (f->op != Audit_equal && f->op != Audit_not_equal)) return -EINVAL; krule->inode_f = f; @@ -270,7 +271,7 @@ static int audit_to_watch(struct audit_krule *krule, char *path, int len, if (path[0] != '/' || path[len-1] == '/' || krule->listnr != AUDIT_FILTER_EXIT || - op & ~AUDIT_EQUAL || + op != Audit_equal || krule->inode_f || krule->watch || krule->tree) return -EINVAL; @@ -420,12 +421,32 @@ exit_err: return ERR_PTR(err); } +static u32 audit_ops[] = +{ + [Audit_equal] = AUDIT_EQUAL, + [Audit_not_equal] = AUDIT_NOT_EQUAL, + [Audit_bitmask] = AUDIT_BIT_MASK, + [Audit_bittest] = AUDIT_BIT_TEST, + [Audit_lt] = AUDIT_LESS_THAN, + [Audit_gt] = AUDIT_GREATER_THAN, + [Audit_le] = AUDIT_LESS_THAN_OR_EQUAL, + [Audit_ge] = AUDIT_GREATER_THAN_OR_EQUAL, +}; + +static u32 audit_to_op(u32 op) +{ + u32 n; + for (n = Audit_equal; n < Audit_bad && audit_ops[n] != op; n++) + ; + return n; +} + + /* Translate struct audit_rule to kernel's rule respresentation. * Exists for backward compatibility with userspace. */ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) { struct audit_entry *entry; - struct audit_field *ino_f; int err = 0; int i; @@ -435,12 +456,28 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) for (i = 0; i < rule->field_count; i++) { struct audit_field *f = &entry->rule.fields[i]; + u32 n; + + n = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS); + + /* Support for legacy operators where + * AUDIT_NEGATE bit signifies != and otherwise assumes == */ + if (n & AUDIT_NEGATE) + f->op = Audit_not_equal; + else if (!n) + f->op = Audit_equal; + else + f->op = audit_to_op(n); + + entry->rule.vers_ops = (n & AUDIT_OPERATORS) ? 
2 : 1; - f->op = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS); f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); f->val = rule->values[i]; err = -EINVAL; + if (f->op == Audit_bad) + goto exit_free; + switch(f->type) { default: goto exit_free; @@ -462,11 +499,8 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) case AUDIT_EXIT: case AUDIT_SUCCESS: /* bit ops are only useful on syscall args */ - if (f->op == AUDIT_BIT_MASK || - f->op == AUDIT_BIT_TEST) { - err = -EINVAL; + if (f->op == Audit_bitmask || f->op == Audit_bittest) goto exit_free; - } break; case AUDIT_ARG0: case AUDIT_ARG1: @@ -475,11 +509,8 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) break; /* arch is only allowed to be = or != */ case AUDIT_ARCH: - if ((f->op != AUDIT_NOT_EQUAL) && (f->op != AUDIT_EQUAL) - && (f->op != AUDIT_NEGATE) && (f->op)) { - err = -EINVAL; + if (f->op != Audit_not_equal && f->op != Audit_equal) goto exit_free; - } entry->rule.arch_f = f; break; case AUDIT_PERM: @@ -496,33 +527,10 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) goto exit_free; break; } - - entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1; - - /* Support for legacy operators where - * AUDIT_NEGATE bit signifies != and otherwise assumes == */ - if (f->op & AUDIT_NEGATE) - f->op = AUDIT_NOT_EQUAL; - else if (!f->op) - f->op = AUDIT_EQUAL; - else if (f->op == AUDIT_OPERATORS) { - err = -EINVAL; - goto exit_free; - } } - ino_f = entry->rule.inode_f; - if (ino_f) { - switch(ino_f->op) { - case AUDIT_NOT_EQUAL: - entry->rule.inode_f = NULL; - case AUDIT_EQUAL: - break; - default: - err = -EINVAL; - goto exit_free; - } - } + if (entry->rule.inode_f && entry->rule.inode_f->op == Audit_not_equal) + entry->rule.inode_f = NULL; exit_nofree: return entry; @@ -538,7 +546,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, { int err = 0; struct audit_entry *entry; - struct audit_field *ino_f; void *bufp; size_t remain = datasz - sizeof(struct audit_rule_data); int i; @@ -554,11 +561,11 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, struct audit_field *f = &entry->rule.fields[i]; err = -EINVAL; - if (!(data->fieldflags[i] & AUDIT_OPERATORS) || - data->fieldflags[i] & ~AUDIT_OPERATORS) + + f->op = audit_to_op(data->fieldflags[i]); + if (f->op == Audit_bad) goto exit_free; - f->op = data->fieldflags[i] & AUDIT_OPERATORS; f->type = data->fields[i]; f->val = data->values[i]; f->lsm_str = NULL; @@ -670,18 +677,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, } } - ino_f = entry->rule.inode_f; - if (ino_f) { - switch(ino_f->op) { - case AUDIT_NOT_EQUAL: - entry->rule.inode_f = NULL; - case AUDIT_EQUAL: - break; - default: - err = -EINVAL; - goto exit_free; - } - } + if (entry->rule.inode_f && entry->rule.inode_f->op == Audit_not_equal) + entry->rule.inode_f = NULL; exit_nofree: return entry; @@ -721,10 +718,10 @@ static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule) rule->fields[i] = krule->fields[i].type; if (krule->vers_ops == 1) { - if (krule->fields[i].op & AUDIT_NOT_EQUAL) + if (krule->fields[i].op == Audit_not_equal) rule->fields[i] |= AUDIT_NEGATE; } else { - rule->fields[i] |= krule->fields[i].op; + rule->fields[i] |= audit_ops[krule->fields[i].op]; } } for (i = 0; i < AUDIT_BITMASK_SIZE; i++) rule->mask[i] = krule->mask[i]; @@ -752,7 +749,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) struct 
audit_field *f = &krule->fields[i]; data->fields[i] = f->type; - data->fieldflags[i] = f->op; + data->fieldflags[i] = audit_ops[f->op]; switch(f->type) { case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: @@ -1626,28 +1623,29 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, return err; } -int audit_comparator(const u32 left, const u32 op, const u32 right) +int audit_comparator(u32 left, u32 op, u32 right) { switch (op) { - case AUDIT_EQUAL: + case Audit_equal: return (left == right); - case AUDIT_NOT_EQUAL: + case Audit_not_equal: return (left != right); - case AUDIT_LESS_THAN: + case Audit_lt: return (left < right); - case AUDIT_LESS_THAN_OR_EQUAL: + case Audit_le: return (left <= right); - case AUDIT_GREATER_THAN: + case Audit_gt: return (left > right); - case AUDIT_GREATER_THAN_OR_EQUAL: + case Audit_ge: return (left >= right); - case AUDIT_BIT_MASK: + case Audit_bitmask: return (left & right); - case AUDIT_BIT_TEST: + case Audit_bittest: return ((left & right) == right); + default: + BUG(); + return 0; } - BUG(); - return 0; } /* Compare given dentry name with last component in given path, -- cgit v1.2.3 From 7b574b7b0124ed344911f5d581e9bc2d83bbeb19 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sun, 4 Jan 2009 12:00:45 -0800 Subject: cgroups: fix a race between cgroup_clone and umount The race is calling cgroup_clone() while umounting the ns cgroup subsys, and thus cgroup_clone() might access invalid cgroup_fs, or kill_sb() is called after cgroup_clone() created a new dir in it. The BUG I triggered is BUG_ON(root->number_of_cgroups != 1); ------------[ cut here ]------------ kernel BUG at kernel/cgroup.c:1093! invalid opcode: 0000 [#1] SMP ... Process umount (pid: 5177, ti=e411e000 task=e40c4670 task.ti=e411e000) ... Call Trace: [] ? deactivate_super+0x3f/0x51 [] ? mntput_no_expire+0xb3/0xdd [] ? sys_umount+0x265/0x2ac [] ? sys_oldumount+0xd/0xf [] ? sysenter_do_call+0x12/0x31 ... EIP: [] cgroup_kill_sb+0x23/0xe0 SS:ESP 0068:e411ef2c ---[ end trace c766c1be3bf944ac ]--- Cc: Serge E. Hallyn Signed-off-by: Li Zefan Cc: Paul Menage Cc: "Serge E. Hallyn" Cc: Balbir Singh Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 48348dde6d8..891a84eb9d3 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2945,7 +2945,11 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, parent = task_cgroup(tsk, subsys->subsys_id); /* Pin the hierarchy */ - atomic_inc(&parent->root->sb->s_active); + if (!atomic_inc_not_zero(&parent->root->sb->s_active)) { + /* We race with the final deactivate_super() */ + mutex_unlock(&cgroup_mutex); + return 0; + } /* Keep the cgroup alive */ get_css_set(cg); -- cgit v1.2.3
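The cgroup fix above leans on a single idea: take a reference only while the object is still live, which is what atomic_inc_not_zero() does for sb->s_active, and back off if the count has already hit zero. A rough stand-alone C11 sketch of that pattern (plain userspace code with invented names, not the kernel implementation):

#include <stdatomic.h>
#include <stdbool.h>

struct obj {
	atomic_int refcount;	/* 0 means teardown has started */
};

/* Take a reference only if the count is still nonzero. */
static bool try_get(struct obj *o)
{
	int old = atomic_load(&o->refcount);

	while (old != 0) {
		/* On failure the CAS reloads 'old' with the current value. */
		if (atomic_compare_exchange_weak(&o->refcount, &old, old + 1))
			return true;	/* pinned; teardown cannot complete under us */
	}
	return false;		/* lost the race with the final put */
}

static void put(struct obj *o)
{
	if (atomic_fetch_sub(&o->refcount, 1) == 1) {
		/* last reference dropped: tear the object down here */
	}
}

When try_get() fails the caller simply unwinds, which is exactly what the patch does: drop cgroup_mutex and return 0 instead of touching a superblock that deactivate_super() is already tearing down.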