From ea6f18ed5a1531caf678374f30a0990c9e6742f3 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Tue, 25 Nov 2008 02:35:02 +1030 Subject: sched: reduce stack size requirements in kernel/sched.c Impact: cleanup * use node_to_cpumask_ptr in place of node_to_cpumask to reduce stack requirements in sched.c Signed-off-by: Mike Travis Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index bb827651558..dd22cec499b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6110,8 +6110,9 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) do { /* On same node? */ - mask = node_to_cpumask(cpu_to_node(dead_cpu)); - cpus_and(mask, mask, p->cpus_allowed); + node_to_cpumask_ptr(pnodemask, cpu_to_node(dead_cpu)); + + cpus_and(mask, *pnodemask, p->cpus_allowed); dest_cpu = any_online_cpu(mask); /* On any allowed CPU? */ @@ -7098,9 +7099,9 @@ static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, cpumask_t *nodemask) { int group; + node_to_cpumask_ptr(pnodemask, cpu_to_node(cpu)); - *nodemask = node_to_cpumask(cpu_to_node(cpu)); - cpus_and(*nodemask, *nodemask, *cpu_map); + cpus_and(*nodemask, *pnodemask, *cpu_map); group = first_cpu(*nodemask); if (sg) @@ -7150,9 +7151,9 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) for (i = 0; i < nr_node_ids; i++) { struct sched_group *oldsg, *sg = sched_group_nodes[i]; + node_to_cpumask_ptr(pnodemask, i); - *nodemask = node_to_cpumask(i); - cpus_and(*nodemask, *nodemask, *cpu_map); + cpus_and(*nodemask, *pnodemask, *cpu_map); if (cpus_empty(*nodemask)) continue; -- cgit v1.2.3 From abcd083a1a658d2bc1f7fced02632bfe03918002 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:02 +1030 Subject: sched: convert sched.c from for_each_cpu_mask to for_each_cpu. Impact: trivial API conversion This is a simple conversion, but note that for_each_cpu() terminates with i >= nr_cpu_ids, not i == NR_CPUS like for_each_cpu_mask() did. I don't convert all of them: sd->span changes in a later patch, so change those iterators there rather than here. 
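A minimal before/after sketch of the conversion (illustrative only, not a hunk from this patch; do_something() is a stand-in):

	int i;

	/* old: mask passed by value, loop terminates with i == NR_CPUS */
	for_each_cpu_mask_nr(i, *cpu_map)
		do_something(i);

	/* new: mask passed by pointer, loop terminates with i >= nr_cpu_ids */
	for_each_cpu(i, cpu_map)
		do_something(i);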
Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index dd22cec499b..e59978eead1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2061,7 +2061,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) /* Tally up the load of all CPUs in the group */ avg_load = 0; - for_each_cpu_mask_nr(i, group->cpumask) { + for_each_cpu(i, &group->cpumask) { /* Bias balancing toward cpus of our domain */ if (local_group) load = source_load(i, load_idx); @@ -2103,7 +2103,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu, /* Traverse only the allowed CPUs */ cpus_and(*tmp, group->cpumask, p->cpus_allowed); - for_each_cpu_mask_nr(i, *tmp) { + for_each_cpu(i, tmp) { load = weighted_cpuload(i); if (load < min_load || (load == min_load && i == this_cpu)) { @@ -3121,7 +3121,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, max_cpu_load = 0; min_cpu_load = ~0UL; - for_each_cpu_mask_nr(i, group->cpumask) { + for_each_cpu(i, &group->cpumask) { struct rq *rq; if (!cpu_isset(i, *cpus)) @@ -3400,7 +3400,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, unsigned long max_load = 0; int i; - for_each_cpu_mask_nr(i, group->cpumask) { + for_each_cpu(i, &group->cpumask) { unsigned long wl; if (!cpu_isset(i, *cpus)) @@ -3942,7 +3942,7 @@ static void run_rebalance_domains(struct softirq_action *h) int balance_cpu; cpu_clear(this_cpu, cpus); - for_each_cpu_mask_nr(balance_cpu, cpus) { + for_each_cpu(balance_cpu, &cpus) { /* * If this cpu gets work to do, stop the load balancing * work being done for other cpus. Next load @@ -6906,7 +6906,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, cpus_clear(*covered); - for_each_cpu_mask_nr(i, *span) { + for_each_cpu(i, span) { struct sched_group *sg; int group = group_fn(i, cpu_map, &sg, tmpmask); int j; @@ -6917,7 +6917,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, cpus_clear(sg->cpumask); sg->__cpu_power = 0; - for_each_cpu_mask_nr(j, *span) { + for_each_cpu(j, span) { if (group_fn(j, cpu_map, NULL, tmpmask) != group) continue; @@ -7117,7 +7117,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) if (!sg) return; do { - for_each_cpu_mask_nr(j, sg->cpumask) { + for_each_cpu(j, &sg->cpumask) { struct sched_domain *sd; sd = &per_cpu(phys_domains, j); @@ -7142,7 +7142,7 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) { int cpu, i; - for_each_cpu_mask_nr(cpu, *cpu_map) { + for_each_cpu(cpu, cpu_map) { struct sched_group **sched_group_nodes = sched_group_nodes_bycpu[cpu]; @@ -7396,7 +7396,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, /* * Set up domains for cpus specified by the cpu_map. 
*/ - for_each_cpu_mask_nr(i, *cpu_map) { + for_each_cpu(i, cpu_map) { struct sched_domain *sd = NULL, *p; SCHED_CPUMASK_VAR(nodemask, allmasks); @@ -7463,7 +7463,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #ifdef CONFIG_SCHED_SMT /* Set up CPU (sibling) groups */ - for_each_cpu_mask_nr(i, *cpu_map) { + for_each_cpu(i, cpu_map) { SCHED_CPUMASK_VAR(this_sibling_map, allmasks); SCHED_CPUMASK_VAR(send_covered, allmasks); @@ -7480,7 +7480,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #ifdef CONFIG_SCHED_MC /* Set up multi-core groups */ - for_each_cpu_mask_nr(i, *cpu_map) { + for_each_cpu(i, cpu_map) { SCHED_CPUMASK_VAR(this_core_map, allmasks); SCHED_CPUMASK_VAR(send_covered, allmasks); @@ -7547,7 +7547,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, goto error; } sched_group_nodes[i] = sg; - for_each_cpu_mask_nr(j, *nodemask) { + for_each_cpu(j, nodemask) { struct sched_domain *sd; sd = &per_cpu(node_domains, j); @@ -7593,21 +7593,21 @@ static int __build_sched_domains(const cpumask_t *cpu_map, /* Calculate CPU power for physical packages and nodes */ #ifdef CONFIG_SCHED_SMT - for_each_cpu_mask_nr(i, *cpu_map) { + for_each_cpu(i, cpu_map) { struct sched_domain *sd = &per_cpu(cpu_domains, i); init_sched_groups_power(i, sd); } #endif #ifdef CONFIG_SCHED_MC - for_each_cpu_mask_nr(i, *cpu_map) { + for_each_cpu(i, cpu_map) { struct sched_domain *sd = &per_cpu(core_domains, i); init_sched_groups_power(i, sd); } #endif - for_each_cpu_mask_nr(i, *cpu_map) { + for_each_cpu(i, cpu_map) { struct sched_domain *sd = &per_cpu(phys_domains, i); init_sched_groups_power(i, sd); @@ -7627,7 +7627,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #endif /* Attach the domains */ - for_each_cpu_mask_nr(i, *cpu_map) { + for_each_cpu(i, cpu_map) { struct sched_domain *sd; #ifdef CONFIG_SCHED_SMT sd = &per_cpu(cpu_domains, i); @@ -7709,7 +7709,7 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) cpumask_t tmpmask; int i; - for_each_cpu_mask_nr(i, *cpu_map) + for_each_cpu(i, cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); synchronize_sched(); arch_destroy_sched_domains(cpu_map, &tmpmask); -- cgit v1.2.3 From 3404c8d97c2d3eb87b1bf4aadad957bfb5235b14 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:03 +1030 Subject: sched: get rid of boutique sched.c allocations, use cpumask_var_t. Impact: use new general API Using lots of allocs rather than one big alloc is less efficient, but who cares for this setup function? Signed-off-by: Rusty Russell Signed-off-by: Mike Travis Acked-by: Ingo Molnar Signed-off-by: Ingo Molnar --- kernel/sched.c | 139 +++++++++++++++++++++++---------------------------------- 1 file changed, 55 insertions(+), 84 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index e59978eead1..0dc9d5752d6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7263,48 +7263,6 @@ SD_INIT_FUNC(CPU) SD_INIT_FUNC(MC) #endif -/* - * To minimize stack usage kmalloc room for cpumasks and share the - * space as the usage in build_sched_domains() dictates. Used only - * if the amount of space is significant. 
- */ -struct allmasks { - cpumask_t tmpmask; /* make this one first */ - union { - cpumask_t nodemask; - cpumask_t this_sibling_map; - cpumask_t this_core_map; - }; - cpumask_t send_covered; - -#ifdef CONFIG_NUMA - cpumask_t domainspan; - cpumask_t covered; - cpumask_t notcovered; -#endif -}; - -#if NR_CPUS > 128 -#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v -static inline void sched_cpumask_alloc(struct allmasks **masks) -{ - *masks = kmalloc(sizeof(**masks), GFP_KERNEL); -} -static inline void sched_cpumask_free(struct allmasks *masks) -{ - kfree(masks); -} -#else -#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v -static inline void sched_cpumask_alloc(struct allmasks **masks) -{ } -static inline void sched_cpumask_free(struct allmasks *masks) -{ } -#endif - -#define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \ - ((unsigned long)(a) + offsetof(struct allmasks, v)) - static int default_relax_domain_level = -1; static int __init setup_relax_domain_level(char *str) @@ -7347,14 +7305,35 @@ static void set_domain_attribute(struct sched_domain *sd, static int __build_sched_domains(const cpumask_t *cpu_map, struct sched_domain_attr *attr) { - int i; + int i, err = -ENOMEM; struct root_domain *rd; - SCHED_CPUMASK_DECLARE(allmasks); - cpumask_t *tmpmask; + cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered, + tmpmask; #ifdef CONFIG_NUMA + cpumask_var_t domainspan, covered, notcovered; struct sched_group **sched_group_nodes = NULL; int sd_allnodes = 0; + if (!alloc_cpumask_var(&domainspan, GFP_KERNEL)) + goto out; + if (!alloc_cpumask_var(&covered, GFP_KERNEL)) + goto free_domainspan; + if (!alloc_cpumask_var(¬covered, GFP_KERNEL)) + goto free_covered; +#endif + + if (!alloc_cpumask_var(&nodemask, GFP_KERNEL)) + goto free_notcovered; + if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL)) + goto free_nodemask; + if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL)) + goto free_this_sibling_map; + if (!alloc_cpumask_var(&send_covered, GFP_KERNEL)) + goto free_this_core_map; + if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) + goto free_send_covered; + +#ifdef CONFIG_NUMA /* * Allocate the per-node list of sched groups */ @@ -7362,33 +7341,16 @@ static int __build_sched_domains(const cpumask_t *cpu_map, GFP_KERNEL); if (!sched_group_nodes) { printk(KERN_WARNING "Can not alloc sched group node list\n"); - return -ENOMEM; + goto free_tmpmask; } #endif rd = alloc_rootdomain(); if (!rd) { printk(KERN_WARNING "Cannot alloc root domain\n"); -#ifdef CONFIG_NUMA - kfree(sched_group_nodes); -#endif - return -ENOMEM; + goto free_sched_groups; } - /* get space for all scratch cpumask variables */ - sched_cpumask_alloc(&allmasks); - if (!allmasks) { - printk(KERN_WARNING "Cannot alloc cpumask array\n"); - kfree(rd); -#ifdef CONFIG_NUMA - kfree(sched_group_nodes); -#endif - return -ENOMEM; - } - - tmpmask = (cpumask_t *)allmasks; - - #ifdef CONFIG_NUMA sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; #endif @@ -7398,7 +7360,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, */ for_each_cpu(i, cpu_map) { struct sched_domain *sd = NULL, *p; - SCHED_CPUMASK_VAR(nodemask, allmasks); *nodemask = node_to_cpumask(cpu_to_node(i)); cpus_and(*nodemask, *nodemask, *cpu_map); @@ -7464,9 +7425,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #ifdef CONFIG_SCHED_SMT /* Set up CPU (sibling) groups */ for_each_cpu(i, cpu_map) { - SCHED_CPUMASK_VAR(this_sibling_map, allmasks); - SCHED_CPUMASK_VAR(send_covered, allmasks); - *this_sibling_map 
= per_cpu(cpu_sibling_map, i); cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map); if (i != first_cpu(*this_sibling_map)) @@ -7481,9 +7439,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #ifdef CONFIG_SCHED_MC /* Set up multi-core groups */ for_each_cpu(i, cpu_map) { - SCHED_CPUMASK_VAR(this_core_map, allmasks); - SCHED_CPUMASK_VAR(send_covered, allmasks); - *this_core_map = cpu_coregroup_map(i); cpus_and(*this_core_map, *this_core_map, *cpu_map); if (i != first_cpu(*this_core_map)) @@ -7497,9 +7452,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, /* Set up physical groups */ for (i = 0; i < nr_node_ids; i++) { - SCHED_CPUMASK_VAR(nodemask, allmasks); - SCHED_CPUMASK_VAR(send_covered, allmasks); - *nodemask = node_to_cpumask(i); cpus_and(*nodemask, *nodemask, *cpu_map); if (cpus_empty(*nodemask)) @@ -7513,8 +7465,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #ifdef CONFIG_NUMA /* Set up node groups */ if (sd_allnodes) { - SCHED_CPUMASK_VAR(send_covered, allmasks); - init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, send_covered, tmpmask); @@ -7523,9 +7473,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, for (i = 0; i < nr_node_ids; i++) { /* Set up node groups */ struct sched_group *sg, *prev; - SCHED_CPUMASK_VAR(nodemask, allmasks); - SCHED_CPUMASK_VAR(domainspan, allmasks); - SCHED_CPUMASK_VAR(covered, allmasks); int j; *nodemask = node_to_cpumask(i); @@ -7560,7 +7507,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, prev = sg; for (j = 0; j < nr_node_ids; j++) { - SCHED_CPUMASK_VAR(notcovered, allmasks); int n = (i + j) % nr_node_ids; node_to_cpumask_ptr(pnodemask, n); @@ -7639,15 +7585,40 @@ static int __build_sched_domains(const cpumask_t *cpu_map, cpu_attach_domain(sd, rd, i); } - sched_cpumask_free(allmasks); - return 0; + err = 0; + +free_tmpmask: + free_cpumask_var(tmpmask); +free_send_covered: + free_cpumask_var(send_covered); +free_this_core_map: + free_cpumask_var(this_core_map); +free_this_sibling_map: + free_cpumask_var(this_sibling_map); +free_nodemask: + free_cpumask_var(nodemask); +free_notcovered: +#ifdef CONFIG_NUMA + free_cpumask_var(notcovered); +free_covered: + free_cpumask_var(covered); +free_domainspan: + free_cpumask_var(domainspan); +out: +#endif + return err; + +free_sched_groups: +#ifdef CONFIG_NUMA + kfree(sched_group_nodes); +#endif + goto free_tmpmask; #ifdef CONFIG_NUMA error: free_sched_groups(cpu_map, tmpmask); - sched_cpumask_free(allmasks); kfree(rd); - return -ENOMEM; + goto free_tmpmask; #endif } -- cgit v1.2.3 From 1e5ce4f4a755ee498bd9217dae26143afa0d8f31 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:03 +1030 Subject: sched: remove any_online_cpu() Impact: use new API any_online_cpu() is a good name, but it takes a cpumask_t, not a pointer. There are several places where any_online_cpu() doesn't really want a mask arg at all. Replace all callers with cpumask_any() and cpumask_any_and(). 
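A rough sketch of the replacement forms (illustrative; mask stands in for any cpumask_t local):

	/* old: intersection with the online map is implicit */
	cpu = any_online_cpu(mask);

	/* new: the intersection is spelled out ... */
	cpu = cpumask_any_and(cpu_online_mask, &mask);

	/* ... or dropped entirely where no second mask is wanted */
	cpu = cpumask_any(cpu_online_mask);

Either new form returns a value >= nr_cpu_ids when no suitable CPU exists, which is what the dest_cpu checks in the hunks below test for.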
Signed-off-by: Rusty Russell Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- kernel/sched.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 0dc9d5752d6..a2de33d0534 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5964,7 +5964,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) if (cpu_isset(task_cpu(p), *new_mask)) goto out; - if (migrate_task(p, any_online_cpu(*new_mask), &req)) { + if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { /* Need help from migration thread: drop lock and wait. */ task_rq_unlock(rq, &flags); wake_up_process(rq->migration_thread); @@ -6113,11 +6113,12 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) node_to_cpumask_ptr(pnodemask, cpu_to_node(dead_cpu)); cpus_and(mask, *pnodemask, p->cpus_allowed); - dest_cpu = any_online_cpu(mask); + dest_cpu = cpumask_any_and(cpu_online_mask, &mask); /* On any allowed CPU? */ if (dest_cpu >= nr_cpu_ids) - dest_cpu = any_online_cpu(p->cpus_allowed); + dest_cpu = cpumask_any_and(cpu_online_mask, + &p->cpus_allowed); /* No more Mr. Nice Guy. */ if (dest_cpu >= nr_cpu_ids) { @@ -6133,7 +6134,8 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) */ rq = task_rq_lock(p, &flags); p->cpus_allowed = cpus_allowed; - dest_cpu = any_online_cpu(p->cpus_allowed); + dest_cpu = cpumask_any_and(cpu_online_mask, + &p->cpus_allowed); task_rq_unlock(rq, &flags); /* @@ -6159,7 +6161,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) */ static void migrate_nr_uninterruptible(struct rq *rq_src) { - struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR)); + struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask)); unsigned long flags; local_irq_save(flags); @@ -6524,7 +6526,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) break; /* Unbind it from offline cpu so it can run. Fall thru. */ kthread_bind(cpu_rq(cpu)->migration_thread, - any_online_cpu(cpu_online_map)); + cpumask_any(cpu_online_mask)); kthread_stop(cpu_rq(cpu)->migration_thread); cpu_rq(cpu)->migration_thread = NULL; break; -- cgit v1.2.3 From 758b2cdc6f6a22c702bd8f2344382fb1270b2161 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:04 +1030 Subject: sched: wrap sched_group and sched_domain cpumask accesses. Impact: trivial wrap of member accesses This eases the transition in the next patch. We also get rid of a temporary cpumask in find_idlest_cpu() thanks to for_each_cpu_and, and sched_balance_self() due to getting weight before setting sd to NULL. 
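A sketch of the wrapper style (illustrative lines, not a hunk; sd and group are a struct sched_domain * and a struct sched_group *, as elsewhere in sched.c):

	/* old: direct member access to a cpumask_t */
	if (cpu_isset(cpu, sd->span)) ...
	for_each_cpu(i, &group->cpumask) ...

	/* new: accessors yield a struct cpumask *, ready for the cpumask_* API */
	if (cpumask_test_cpu(cpu, sched_domain_span(sd))) ...
	for_each_cpu(i, sched_group_cpus(group)) ...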
Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 114 +++++++++++++++++++++++++-------------------------- kernel/sched_fair.c | 10 ++--- kernel/sched_rt.c | 3 +- kernel/sched_stats.h | 3 +- 4 files changed, 63 insertions(+), 67 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index a2de33d0534..575f38acf4d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1501,7 +1501,7 @@ static int tg_shares_up(struct task_group *tg, void *data) struct sched_domain *sd = data; int i; - for_each_cpu_mask(i, sd->span) { + for_each_cpu(i, sched_domain_span(sd)) { /* * If there are currently no tasks on the cpu pretend there * is one of average load so that when a new task gets to @@ -1522,7 +1522,7 @@ static int tg_shares_up(struct task_group *tg, void *data) if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) shares = tg->shares; - for_each_cpu_mask(i, sd->span) + for_each_cpu(i, sched_domain_span(sd)) update_group_shares_cpu(tg, i, shares, rq_weight); return 0; @@ -2053,15 +2053,17 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) int i; /* Skip over this group if it has no CPUs allowed */ - if (!cpus_intersects(group->cpumask, p->cpus_allowed)) + if (!cpumask_intersects(sched_group_cpus(group), + &p->cpus_allowed)) continue; - local_group = cpu_isset(this_cpu, group->cpumask); + local_group = cpumask_test_cpu(this_cpu, + sched_group_cpus(group)); /* Tally up the load of all CPUs in the group */ avg_load = 0; - for_each_cpu(i, &group->cpumask) { + for_each_cpu(i, sched_group_cpus(group)) { /* Bias balancing toward cpus of our domain */ if (local_group) load = source_load(i, load_idx); @@ -2093,17 +2095,14 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) * find_idlest_cpu - find the idlest cpu among the cpus in group. 
*/ static int -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu, - cpumask_t *tmp) +find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) { unsigned long load, min_load = ULONG_MAX; int idlest = -1; int i; /* Traverse only the allowed CPUs */ - cpus_and(*tmp, group->cpumask, p->cpus_allowed); - - for_each_cpu(i, tmp) { + for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { load = weighted_cpuload(i); if (load < min_load || (load == min_load && i == this_cpu)) { @@ -2145,7 +2144,6 @@ static int sched_balance_self(int cpu, int flag) update_shares(sd); while (sd) { - cpumask_t span, tmpmask; struct sched_group *group; int new_cpu, weight; @@ -2154,14 +2152,13 @@ static int sched_balance_self(int cpu, int flag) continue; } - span = sd->span; group = find_idlest_group(sd, t, cpu); if (!group) { sd = sd->child; continue; } - new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask); + new_cpu = find_idlest_cpu(group, t, cpu); if (new_cpu == -1 || new_cpu == cpu) { /* Now try balancing at a lower domain level of cpu */ sd = sd->child; @@ -2170,10 +2167,10 @@ static int sched_balance_self(int cpu, int flag) /* Now try balancing at a lower domain level of new_cpu */ cpu = new_cpu; + weight = cpumask_weight(sched_domain_span(sd)); sd = NULL; - weight = cpus_weight(span); for_each_domain(cpu, tmp) { - if (weight <= cpus_weight(tmp->span)) + if (weight <= cpumask_weight(sched_domain_span(tmp))) break; if (tmp->flags & flag) sd = tmp; @@ -2218,7 +2215,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) cpu = task_cpu(p); for_each_domain(this_cpu, sd) { - if (cpu_isset(cpu, sd->span)) { + if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { update_shares(sd); break; } @@ -2266,7 +2263,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) else { struct sched_domain *sd; for_each_domain(this_cpu, sd) { - if (cpu_isset(cpu, sd->span)) { + if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { schedstat_inc(sd, ttwu_wake_remote); break; } @@ -3109,10 +3106,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long sum_avg_load_per_task; unsigned long avg_load_per_task; - local_group = cpu_isset(this_cpu, group->cpumask); + local_group = cpumask_test_cpu(this_cpu, + sched_group_cpus(group)); if (local_group) - balance_cpu = first_cpu(group->cpumask); + balance_cpu = cpumask_first(sched_group_cpus(group)); /* Tally up the load of all CPUs in the group */ sum_weighted_load = sum_nr_running = avg_load = 0; @@ -3121,13 +3119,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, max_cpu_load = 0; min_cpu_load = ~0UL; - for_each_cpu(i, &group->cpumask) { - struct rq *rq; - - if (!cpu_isset(i, *cpus)) - continue; - - rq = cpu_rq(i); + for_each_cpu_and(i, sched_group_cpus(group), cpus) { + struct rq *rq = cpu_rq(i); if (*sd_idle && rq->nr_running) *sd_idle = 0; @@ -3238,8 +3231,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, */ if ((sum_nr_running < min_nr_running) || (sum_nr_running == min_nr_running && - first_cpu(group->cpumask) < - first_cpu(group_min->cpumask))) { + cpumask_first(sched_group_cpus(group)) < + cpumask_first(sched_group_cpus(group_min)))) { group_min = group; min_nr_running = sum_nr_running; min_load_per_task = sum_weighted_load / @@ -3254,8 +3247,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (sum_nr_running <= group_capacity - 1) { if (sum_nr_running > leader_nr_running || (sum_nr_running == leader_nr_running && - 
first_cpu(group->cpumask) > - first_cpu(group_leader->cpumask))) { + cpumask_first(sched_group_cpus(group)) > + cpumask_first(sched_group_cpus(group_leader)))) { group_leader = group; leader_nr_running = sum_nr_running; } @@ -3400,7 +3393,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, unsigned long max_load = 0; int i; - for_each_cpu(i, &group->cpumask) { + for_each_cpu(i, sched_group_cpus(group)) { unsigned long wl; if (!cpu_isset(i, *cpus)) @@ -3746,7 +3739,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) /* Search for an sd spanning us and the target CPU. */ for_each_domain(target_cpu, sd) { if ((sd->flags & SD_LOAD_BALANCE) && - cpu_isset(busiest_cpu, sd->span)) + cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) break; } @@ -6618,7 +6611,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, struct sched_group *group = sd->groups; char str[256]; - cpulist_scnprintf(str, sizeof(str), sd->span); + cpulist_scnprintf(str, sizeof(str), *sched_domain_span(sd)); cpus_clear(*groupmask); printk(KERN_DEBUG "%*s domain %d: ", level, "", level); @@ -6633,11 +6626,11 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, printk(KERN_CONT "span %s level %s\n", str, sd->name); - if (!cpu_isset(cpu, sd->span)) { + if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { printk(KERN_ERR "ERROR: domain->span does not contain " "CPU%d\n", cpu); } - if (!cpu_isset(cpu, group->cpumask)) { + if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { printk(KERN_ERR "ERROR: domain->groups does not contain" " CPU%d\n", cpu); } @@ -6657,31 +6650,32 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - if (!cpus_weight(group->cpumask)) { + if (!cpumask_weight(sched_group_cpus(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: empty group\n"); break; } - if (cpus_intersects(*groupmask, group->cpumask)) { + if (cpumask_intersects(groupmask, sched_group_cpus(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: repeated CPUs\n"); break; } - cpus_or(*groupmask, *groupmask, group->cpumask); + cpumask_or(groupmask, groupmask, sched_group_cpus(group)); - cpulist_scnprintf(str, sizeof(str), group->cpumask); + cpulist_scnprintf(str, sizeof(str), *sched_group_cpus(group)); printk(KERN_CONT " %s", str); group = group->next; } while (group != sd->groups); printk(KERN_CONT "\n"); - if (!cpus_equal(sd->span, *groupmask)) + if (!cpumask_equal(sched_domain_span(sd), groupmask)) printk(KERN_ERR "ERROR: groups don't span domain->span\n"); - if (sd->parent && !cpus_subset(*groupmask, sd->parent->span)) + if (sd->parent && + !cpumask_subset(groupmask, sched_domain_span(sd->parent))) printk(KERN_ERR "ERROR: parent span is not a superset " "of domain->span\n"); return 0; @@ -6721,7 +6715,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) static int sd_degenerate(struct sched_domain *sd) { - if (cpus_weight(sd->span) == 1) + if (cpumask_weight(sched_domain_span(sd)) == 1) return 1; /* Following flags need at least 2 groups */ @@ -6752,7 +6746,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) if (sd_degenerate(parent)) return 1; - if (!cpus_equal(sd->span, parent->span)) + if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) return 0; /* Does parent contain flags not in child? 
*/ @@ -6913,10 +6907,10 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, int group = group_fn(i, cpu_map, &sg, tmpmask); int j; - if (cpu_isset(i, *covered)) + if (cpumask_test_cpu(i, covered)) continue; - cpus_clear(sg->cpumask); + cpumask_clear(sched_group_cpus(sg)); sg->__cpu_power = 0; for_each_cpu(j, span) { @@ -6924,7 +6918,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, continue; cpu_set(j, *covered); - cpu_set(j, sg->cpumask); + cpumask_set_cpu(j, sched_group_cpus(sg)); } if (!first) first = sg; @@ -7119,11 +7113,11 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) if (!sg) return; do { - for_each_cpu(j, &sg->cpumask) { + for_each_cpu(j, sched_group_cpus(sg)) { struct sched_domain *sd; sd = &per_cpu(phys_domains, j); - if (j != first_cpu(sd->groups->cpumask)) { + if (j != cpumask_first(sched_group_cpus(sd->groups))) { /* * Only add "power" once for each * physical package. @@ -7200,7 +7194,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) WARN_ON(!sd || !sd->groups); - if (cpu != first_cpu(sd->groups->cpumask)) + if (cpu != cpumask_first(sched_group_cpus(sd->groups))) return; child = sd->child; @@ -7372,7 +7366,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, sd = &per_cpu(allnodes_domains, i); SD_INIT(sd, ALLNODES); set_domain_attribute(sd, attr); - sd->span = *cpu_map; + cpumask_copy(sched_domain_span(sd), cpu_map); cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); p = sd; sd_allnodes = 1; @@ -7382,18 +7376,19 @@ static int __build_sched_domains(const cpumask_t *cpu_map, sd = &per_cpu(node_domains, i); SD_INIT(sd, NODE); set_domain_attribute(sd, attr); - sched_domain_node_span(cpu_to_node(i), &sd->span); + sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); sd->parent = p; if (p) p->child = sd; - cpus_and(sd->span, sd->span, *cpu_map); + cpumask_and(sched_domain_span(sd), + sched_domain_span(sd), cpu_map); #endif p = sd; sd = &per_cpu(phys_domains, i); SD_INIT(sd, CPU); set_domain_attribute(sd, attr); - sd->span = *nodemask; + cpumask_copy(sched_domain_span(sd), nodemask); sd->parent = p; if (p) p->child = sd; @@ -7404,8 +7399,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map, sd = &per_cpu(core_domains, i); SD_INIT(sd, MC); set_domain_attribute(sd, attr); - sd->span = cpu_coregroup_map(i); - cpus_and(sd->span, sd->span, *cpu_map); + *sched_domain_span(sd) = cpu_coregroup_map(i); + cpumask_and(sched_domain_span(sd), + sched_domain_span(sd), cpu_map); sd->parent = p; p->child = sd; cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); @@ -7416,8 +7412,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map, sd = &per_cpu(cpu_domains, i); SD_INIT(sd, SIBLING); set_domain_attribute(sd, attr); - sd->span = per_cpu(cpu_sibling_map, i); - cpus_and(sd->span, sd->span, *cpu_map); + cpumask_and(sched_domain_span(sd), + &per_cpu(cpu_sibling_map, i), cpu_map); sd->parent = p; p->child = sd; cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); @@ -7503,7 +7499,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, sd->groups = sg; } sg->__cpu_power = 0; - sg->cpumask = *nodemask; + cpumask_copy(sched_group_cpus(sg), nodemask); sg->next = sg; cpus_or(*covered, *covered, *nodemask); prev = sg; @@ -7530,7 +7526,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, goto error; } sg->__cpu_power = 0; - sg->cpumask = *tmpmask; + cpumask_copy(sched_group_cpus(sg), tmpmask); sg->next = prev->next; cpus_or(*covered, *covered, 
*tmpmask); prev->next = sg; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 98345e45b05..bba00402ed9 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1024,7 +1024,6 @@ static void yield_task_fair(struct rq *rq) #if defined(ARCH_HAS_SCHED_WAKE_IDLE) static int wake_idle(int cpu, struct task_struct *p) { - cpumask_t tmp; struct sched_domain *sd; int i; @@ -1044,10 +1043,9 @@ static int wake_idle(int cpu, struct task_struct *p) if ((sd->flags & SD_WAKE_IDLE) || ((sd->flags & SD_WAKE_IDLE_FAR) && !task_hot(p, task_rq(p)->clock, sd))) { - cpus_and(tmp, sd->span, p->cpus_allowed); - cpus_and(tmp, tmp, cpu_active_map); - for_each_cpu_mask_nr(i, tmp) { - if (idle_cpu(i)) { + for_each_cpu_and(i, sched_domain_span(sd), + &p->cpus_allowed) { + if (cpu_active(i) && idle_cpu(i)) { if (i != task_cpu(p)) { schedstat_inc(p, se.nr_wakeups_idle); @@ -1240,7 +1238,7 @@ static int select_task_rq_fair(struct task_struct *p, int sync) * this_cpu and prev_cpu are present in: */ for_each_domain(this_cpu, sd) { - if (cpu_isset(prev_cpu, sd->span)) { + if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) { this_sd = sd; break; } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 2bdd4442359..4cd813abc23 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1017,7 +1017,8 @@ static int find_lowest_rq(struct task_struct *task) cpumask_t domain_mask; int best_cpu; - cpus_and(domain_mask, sd->span, *lowest_mask); + cpumask_and(&domain_mask, sched_domain_span(sd), + lowest_mask); best_cpu = pick_optimal_cpu(this_cpu, &domain_mask); diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 7dbf72a2b02..ce340835d05 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -42,7 +42,8 @@ static int show_schedstat(struct seq_file *seq, void *v) for_each_domain(cpu, sd) { enum cpu_idle_type itype; - cpumask_scnprintf(mask_str, mask_len, sd->span); + cpumask_scnprintf(mask_str, mask_len, + *sched_domain_span(sd)); seq_printf(seq, "domain%d %s", dcount++, mask_str); for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; itype++) { -- cgit v1.2.3 From 6c99e9ad47d9c082bd096f42fb49e397b05d58a8 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:04 +1030 Subject: sched: convert struct sched_group/sched_domain cpumask_ts to variable bitmaps Impact: (future) size reduction for large NR_CPUS. We move the 'cpumask' member of sched_group to the end, so when we kmalloc it we can do a minimal allocation: saves space for small nr_cpu_ids but big CONFIG_NR_CPUS. Similar trick for 'span' in sched_domain. This isn't quite as good as converting to a cpumask_var_t, as some sched_groups are actually static, but it's safer: we don't have to figure out where to call alloc_cpumask_var/free_cpumask_var. Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 65 ++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 41 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 575f38acf4d..6b9606a6cab 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7005,19 +7005,34 @@ static void sched_domain_node_span(int node, cpumask_t *span) int sched_smt_power_savings = 0, sched_mc_power_savings = 0; +/* + * The cpus mask in sched_group and sched_domain hangs off the end. + * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space + * for nr_cpu_ids < CONFIG_NR_CPUS. 
+ */ +struct static_sched_group { + struct sched_group sg; + DECLARE_BITMAP(cpus, CONFIG_NR_CPUS); +}; + +struct static_sched_domain { + struct sched_domain sd; + DECLARE_BITMAP(span, CONFIG_NR_CPUS); +}; + /* * SMT sched-domains: */ #ifdef CONFIG_SCHED_SMT -static DEFINE_PER_CPU(struct sched_domain, cpu_domains); -static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); +static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); +static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus); static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, cpumask_t *unused) { if (sg) - *sg = &per_cpu(sched_group_cpus, cpu); + *sg = &per_cpu(sched_group_cpus, cpu).sg; return cpu; } #endif /* CONFIG_SCHED_SMT */ @@ -7026,8 +7041,8 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, * multi-core sched-domains: */ #ifdef CONFIG_SCHED_MC -static DEFINE_PER_CPU(struct sched_domain, core_domains); -static DEFINE_PER_CPU(struct sched_group, sched_group_core); +static DEFINE_PER_CPU(struct static_sched_domain, core_domains); +static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); #endif /* CONFIG_SCHED_MC */ #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) @@ -7041,7 +7056,7 @@ cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, cpus_and(*mask, *mask, *cpu_map); group = first_cpu(*mask); if (sg) - *sg = &per_cpu(sched_group_core, group); + *sg = &per_cpu(sched_group_core, group).sg; return group; } #elif defined(CONFIG_SCHED_MC) @@ -7050,13 +7065,13 @@ cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, cpumask_t *unused) { if (sg) - *sg = &per_cpu(sched_group_core, cpu); + *sg = &per_cpu(sched_group_core, cpu).sg; return cpu; } #endif -static DEFINE_PER_CPU(struct sched_domain, phys_domains); -static DEFINE_PER_CPU(struct sched_group, sched_group_phys); +static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); +static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, @@ -7075,7 +7090,7 @@ cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, group = cpu; #endif if (sg) - *sg = &per_cpu(sched_group_phys, group); + *sg = &per_cpu(sched_group_phys, group).sg; return group; } @@ -7089,7 +7104,7 @@ static DEFINE_PER_CPU(struct sched_domain, node_domains); static struct sched_group ***sched_group_nodes_bycpu; static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); -static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); +static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, cpumask_t *nodemask) @@ -7101,7 +7116,7 @@ static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, group = first_cpu(*nodemask); if (sg) - *sg = &per_cpu(sched_group_allnodes, group); + *sg = &per_cpu(sched_group_allnodes, group).sg; return group; } @@ -7116,7 +7131,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) for_each_cpu(j, sched_group_cpus(sg)) { struct sched_domain *sd; - sd = &per_cpu(phys_domains, j); + sd = &per_cpu(phys_domains, j).sd; if (j != cpumask_first(sched_group_cpus(sd->groups))) { /* * Only add "power" once for each @@ -7385,7 +7400,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #endif p = sd; - sd = &per_cpu(phys_domains, i); + sd = &per_cpu(phys_domains, i).sd; 
SD_INIT(sd, CPU); set_domain_attribute(sd, attr); cpumask_copy(sched_domain_span(sd), nodemask); @@ -7396,7 +7411,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #ifdef CONFIG_SCHED_MC p = sd; - sd = &per_cpu(core_domains, i); + sd = &per_cpu(core_domains, i).sd; SD_INIT(sd, MC); set_domain_attribute(sd, attr); *sched_domain_span(sd) = cpu_coregroup_map(i); @@ -7409,7 +7424,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #ifdef CONFIG_SCHED_SMT p = sd; - sd = &per_cpu(cpu_domains, i); + sd = &per_cpu(cpu_domains, i).sd; SD_INIT(sd, SIBLING); set_domain_attribute(sd, attr); cpumask_and(sched_domain_span(sd), @@ -7485,7 +7500,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map, sched_domain_node_span(i, domainspan); cpus_and(*domainspan, *domainspan, *cpu_map); - sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); + sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, i); if (!sg) { printk(KERN_WARNING "Can not alloc domain group for " "node %d\n", i); @@ -7518,7 +7534,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map, if (cpus_empty(*tmpmask)) continue; - sg = kmalloc_node(sizeof(struct sched_group), + sg = kmalloc_node(sizeof(struct sched_group) + + cpumask_size(), GFP_KERNEL, i); if (!sg) { printk(KERN_WARNING @@ -7538,21 +7555,21 @@ static int __build_sched_domains(const cpumask_t *cpu_map, /* Calculate CPU power for physical packages and nodes */ #ifdef CONFIG_SCHED_SMT for_each_cpu(i, cpu_map) { - struct sched_domain *sd = &per_cpu(cpu_domains, i); + struct sched_domain *sd = &per_cpu(cpu_domains, i).sd; init_sched_groups_power(i, sd); } #endif #ifdef CONFIG_SCHED_MC for_each_cpu(i, cpu_map) { - struct sched_domain *sd = &per_cpu(core_domains, i); + struct sched_domain *sd = &per_cpu(core_domains, i).sd; init_sched_groups_power(i, sd); } #endif for_each_cpu(i, cpu_map) { - struct sched_domain *sd = &per_cpu(phys_domains, i); + struct sched_domain *sd = &per_cpu(phys_domains, i).sd; init_sched_groups_power(i, sd); } @@ -7574,11 +7591,11 @@ static int __build_sched_domains(const cpumask_t *cpu_map, for_each_cpu(i, cpu_map) { struct sched_domain *sd; #ifdef CONFIG_SCHED_SMT - sd = &per_cpu(cpu_domains, i); + sd = &per_cpu(cpu_domains, i).sd; #elif defined(CONFIG_SCHED_MC) - sd = &per_cpu(core_domains, i); + sd = &per_cpu(core_domains, i).sd; #else - sd = &per_cpu(phys_domains, i); + sd = &per_cpu(phys_domains, i).sd; #endif cpu_attach_domain(sd, rd, i); } -- cgit v1.2.3 From 6a7b3dc3440f7b5a9b67594af01ed562cdeb41e4 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:04 +1030 Subject: sched: convert nohz_cpu_mask to cpumask_var_t. Impact: (future) size reduction for large NR_CPUS. Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves space for small nr_cpu_ids but big CONFIG_NR_CPUS. cpumask_var_t is just a struct cpumask for !CONFIG_CPUMASK_OFFSTACK. Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 2 +- kernel/sched.c | 7 +++++-- kernel/time/tick-sched.c | 10 +++++----- 3 files changed, 11 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index e503a002f33..c03ca3e6191 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -393,7 +393,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp) * unnecessarily. 
*/ smp_mb(); - cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask); + cpumask_andnot(&rcp->cpumask, cpu_online_mask, nohz_cpu_mask); rcp->signaled = 0; } diff --git a/kernel/sched.c b/kernel/sched.c index 6b9606a6cab..2723d7a4a42 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5870,9 +5870,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) * indicates which cpus entered this state. This is used * in the rcu update to wait only for active cpus. For system * which do not switch off the HZ timer nohz_cpu_mask should - * always be CPU_MASK_NONE. + * always be CPU_BITS_NONE. */ -cpumask_t nohz_cpu_mask = CPU_MASK_NONE; +cpumask_var_t nohz_cpu_mask; /* * Increase the granularity value when there are more CPUs, @@ -8274,6 +8274,9 @@ void __init sched_init(void) */ current->sched_class = &fair_sched_class; + /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ + alloc_bootmem_cpumask_var(&nohz_cpu_mask); + scheduler_running = 1; } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 342fc9ccab4..70f872c71f4 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -144,7 +144,7 @@ void tick_nohz_update_jiffies(void) if (!ts->tick_stopped) return; - cpu_clear(cpu, nohz_cpu_mask); + cpumask_clear_cpu(cpu, nohz_cpu_mask); now = ktime_get(); ts->idle_waketime = now; @@ -283,7 +283,7 @@ void tick_nohz_stop_sched_tick(int inidle) if ((long)delta_jiffies >= 1) { if (delta_jiffies > 1) - cpu_set(cpu, nohz_cpu_mask); + cpumask_set_cpu(cpu, nohz_cpu_mask); /* * nohz_stop_sched_tick can be called several times before * the nohz_restart_sched_tick is called. This happens when @@ -296,7 +296,7 @@ void tick_nohz_stop_sched_tick(int inidle) /* * sched tick not stopped! */ - cpu_clear(cpu, nohz_cpu_mask); + cpumask_clear_cpu(cpu, nohz_cpu_mask); goto out; } @@ -354,7 +354,7 @@ void tick_nohz_stop_sched_tick(int inidle) * softirq. */ tick_do_update_jiffies64(ktime_get()); - cpu_clear(cpu, nohz_cpu_mask); + cpumask_clear_cpu(cpu, nohz_cpu_mask); } raise_softirq_irqoff(TIMER_SOFTIRQ); out: @@ -432,7 +432,7 @@ void tick_nohz_restart_sched_tick(void) select_nohz_load_balancer(0); now = ktime_get(); tick_do_update_jiffies64(now); - cpu_clear(cpu, nohz_cpu_mask); + cpumask_clear_cpu(cpu, nohz_cpu_mask); /* * We stopped the tick in idle. Update process times would miss the -- cgit v1.2.3 From c6c4927b22a3514c6660f0e72c78716226bd3cc8 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:05 +1030 Subject: sched: convert struct root_domain to cpumask_var_t. Impact: (future) size reduction for large NR_CPUS. Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves space for small nr_cpu_ids but big CONFIG_NR_CPUS. cpumask_var_t is just a struct cpumask for !CONFIG_CPUMASK_OFFSTACK. def_root_domain is static, and so its masks are initialized with alloc_bootmem_cpumask_var. After that, alloc_cpumask_var is used. Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 69 ++++++++++++++++++++++++++++++++++++++++--------------- kernel/sched_rt.c | 26 ++++++++++----------- 2 files changed, 64 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 2723d7a4a42..93309c3034d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -487,14 +487,14 @@ struct rt_rq { */ struct root_domain { atomic_t refcount; - cpumask_t span; - cpumask_t online; + cpumask_var_t span; + cpumask_var_t online; /* * The "RT overload" flag: it gets set if a CPU has more than * one runnable RT task. 
*/ - cpumask_t rto_mask; + cpumask_var_t rto_mask; atomic_t rto_count; #ifdef CONFIG_SMP struct cpupri cpupri; @@ -6444,7 +6444,7 @@ static void set_rq_online(struct rq *rq) if (!rq->online) { const struct sched_class *class; - cpu_set(rq->cpu, rq->rd->online); + cpumask_set_cpu(rq->cpu, rq->rd->online); rq->online = 1; for_each_class(class) { @@ -6464,7 +6464,7 @@ static void set_rq_offline(struct rq *rq) class->rq_offline(rq); } - cpu_clear(rq->cpu, rq->rd->online); + cpumask_clear_cpu(rq->cpu, rq->rd->online); rq->online = 0; } } @@ -6505,7 +6505,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) rq = cpu_rq(cpu); spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { - BUG_ON(!cpu_isset(cpu, rq->rd->span)); + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_online(rq); } @@ -6567,7 +6567,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) rq = cpu_rq(cpu); spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { - BUG_ON(!cpu_isset(cpu, rq->rd->span)); + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } spin_unlock_irqrestore(&rq->lock, flags); @@ -6768,6 +6768,14 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) return 1; } +static void free_rootdomain(struct root_domain *rd) +{ + free_cpumask_var(rd->rto_mask); + free_cpumask_var(rd->online); + free_cpumask_var(rd->span); + kfree(rd); +} + static void rq_attach_root(struct rq *rq, struct root_domain *rd) { unsigned long flags; @@ -6777,38 +6785,60 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) if (rq->rd) { struct root_domain *old_rd = rq->rd; - if (cpu_isset(rq->cpu, old_rd->online)) + if (cpumask_test_cpu(rq->cpu, old_rd->online)) set_rq_offline(rq); - cpu_clear(rq->cpu, old_rd->span); + cpumask_clear_cpu(rq->cpu, old_rd->span); if (atomic_dec_and_test(&old_rd->refcount)) - kfree(old_rd); + free_rootdomain(old_rd); } atomic_inc(&rd->refcount); rq->rd = rd; - cpu_set(rq->cpu, rd->span); - if (cpu_isset(rq->cpu, cpu_online_map)) + cpumask_set_cpu(rq->cpu, rd->span); + if (cpumask_test_cpu(rq->cpu, cpu_online_mask)) set_rq_online(rq); spin_unlock_irqrestore(&rq->lock, flags); } -static void init_rootdomain(struct root_domain *rd) +static int init_rootdomain(struct root_domain *rd, bool bootmem) { memset(rd, 0, sizeof(*rd)); - cpus_clear(rd->span); - cpus_clear(rd->online); + if (bootmem) { + alloc_bootmem_cpumask_var(&def_root_domain.span); + alloc_bootmem_cpumask_var(&def_root_domain.online); + alloc_bootmem_cpumask_var(&def_root_domain.rto_mask); + cpupri_init(&rd->cpupri); + return 0; + } + + if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) + goto free_rd; + if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) + goto free_span; + if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) + goto free_online; cpupri_init(&rd->cpupri); + return 0; + +free_online: + free_cpumask_var(rd->online); +free_span: + free_cpumask_var(rd->span); +free_rd: + kfree(rd); + return -ENOMEM; } static void init_defrootdomain(void) { - init_rootdomain(&def_root_domain); + init_rootdomain(&def_root_domain, true); + atomic_set(&def_root_domain.refcount, 1); } @@ -6820,7 +6850,10 @@ static struct root_domain *alloc_rootdomain(void) if (!rd) return NULL; - init_rootdomain(rd); + if (init_rootdomain(rd, false) != 0) { + kfree(rd); + return NULL; + } return rd; } @@ -7632,7 +7665,7 @@ free_sched_groups: #ifdef CONFIG_NUMA error: free_sched_groups(cpu_map, tmpmask); - kfree(rd); + free_rootdomain(rd); goto free_tmpmask; #endif } diff --git 
a/kernel/sched_rt.c b/kernel/sched_rt.c index 4cd813abc23..820fc422c6d 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -15,7 +15,7 @@ static inline void rt_set_overload(struct rq *rq) if (!rq->online) return; - cpu_set(rq->cpu, rq->rd->rto_mask); + cpumask_set_cpu(rq->cpu, rq->rd->rto_mask); /* * Make sure the mask is visible before we set * the overload count. That is checked to determine @@ -34,7 +34,7 @@ static inline void rt_clear_overload(struct rq *rq) /* the order here really doesn't matter */ atomic_dec(&rq->rd->rto_count); - cpu_clear(rq->cpu, rq->rd->rto_mask); + cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask); } static void update_rt_migration(struct rq *rq) @@ -139,14 +139,14 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se) } #ifdef CONFIG_SMP -static inline cpumask_t sched_rt_period_mask(void) +static inline const struct cpumask *sched_rt_period_mask(void) { return cpu_rq(smp_processor_id())->rd->span; } #else -static inline cpumask_t sched_rt_period_mask(void) +static inline const struct cpumask *sched_rt_period_mask(void) { - return cpu_online_map; + return cpu_online_mask; } #endif @@ -212,9 +212,9 @@ static inline int rt_rq_throttled(struct rt_rq *rt_rq) return rt_rq->rt_throttled; } -static inline cpumask_t sched_rt_period_mask(void) +static inline const struct cpumask *sched_rt_period_mask(void) { - return cpu_online_map; + return cpu_online_mask; } static inline @@ -241,11 +241,11 @@ static int do_balance_runtime(struct rt_rq *rt_rq) int i, weight, more = 0; u64 rt_period; - weight = cpus_weight(rd->span); + weight = cpumask_weight(rd->span); spin_lock(&rt_b->rt_runtime_lock); rt_period = ktime_to_ns(rt_b->rt_period); - for_each_cpu_mask_nr(i, rd->span) { + for_each_cpu(i, rd->span) { struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); s64 diff; @@ -324,7 +324,7 @@ static void __disable_runtime(struct rq *rq) /* * Greedy reclaim, take back as much as we can. */ - for_each_cpu_mask(i, rd->span) { + for_each_cpu(i, rd->span) { struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); s64 diff; @@ -429,13 +429,13 @@ static inline int balance_runtime(struct rt_rq *rt_rq) static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) { int i, idle = 1; - cpumask_t span; + const struct cpumask *span; if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) return 1; span = sched_rt_period_mask(); - for_each_cpu_mask(i, span) { + for_each_cpu(i, span) { int enqueue = 0; struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); struct rq *rq = rq_of_rt_rq(rt_rq); @@ -1181,7 +1181,7 @@ static int pull_rt_task(struct rq *this_rq) next = pick_next_task_rt(this_rq); - for_each_cpu_mask_nr(cpu, this_rq->rd->rto_mask) { + for_each_cpu(cpu, this_rq->rd->rto_mask) { if (this_cpu == cpu) continue; -- cgit v1.2.3 From 7d1e6a9b95e3edeac91888bc683ae62f18519432 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:09 +1030 Subject: sched: convert nohz struct to cpumask_var_t. Impact: (future) size reduction for large NR_CPUS. Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves space for small nr_cpu_ids but big CONFIG_NR_CPUS. cpumask_var_t is just a struct cpumask for !CONFIG_CPUMASK_OFFSTACK. 
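Roughly, for a static object such as this one the pattern is (simplified sketch, not the exact hunks):

	static struct {
		atomic_t load_balancer;
		cpumask_var_t cpu_mask;	/* was: cpumask_t cpu_mask = CPU_MASK_NONE; */
	} nohz;

	/* from sched_init(), where failure is not an option: */
	alloc_bootmem_cpumask_var(&nohz.cpu_mask);

For !CONFIG_CPUMASK_OFFSTACK the allocation is a no-op and cpu_mask is an ordinary struct cpumask, zeroed by its static storage.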
Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 93309c3034d..2f8ea99df16 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3758,10 +3758,9 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) #ifdef CONFIG_NO_HZ static struct { atomic_t load_balancer; - cpumask_t cpu_mask; + cpumask_var_t cpu_mask; } nohz ____cacheline_aligned = { .load_balancer = ATOMIC_INIT(-1), - .cpu_mask = CPU_MASK_NONE, }; /* @@ -3789,7 +3788,7 @@ int select_nohz_load_balancer(int stop_tick) int cpu = smp_processor_id(); if (stop_tick) { - cpu_set(cpu, nohz.cpu_mask); + cpumask_set_cpu(cpu, nohz.cpu_mask); cpu_rq(cpu)->in_nohz_recently = 1; /* @@ -3803,7 +3802,7 @@ int select_nohz_load_balancer(int stop_tick) } /* time for ilb owner also to sleep */ - if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) { + if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { if (atomic_read(&nohz.load_balancer) == cpu) atomic_set(&nohz.load_balancer, -1); return 0; @@ -3816,10 +3815,10 @@ int select_nohz_load_balancer(int stop_tick) } else if (atomic_read(&nohz.load_balancer) == cpu) return 1; } else { - if (!cpu_isset(cpu, nohz.cpu_mask)) + if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) return 0; - cpu_clear(cpu, nohz.cpu_mask); + cpumask_clear_cpu(cpu, nohz.cpu_mask); if (atomic_read(&nohz.load_balancer) == cpu) if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) @@ -3930,12 +3929,13 @@ static void run_rebalance_domains(struct softirq_action *h) */ if (this_rq->idle_at_tick && atomic_read(&nohz.load_balancer) == this_cpu) { - cpumask_t cpus = nohz.cpu_mask; struct rq *rq; int balance_cpu; - cpu_clear(this_cpu, cpus); - for_each_cpu(balance_cpu, &cpus) { + for_each_cpu(balance_cpu, nohz.cpu_mask) { + if (balance_cpu == this_cpu) + continue; + /* * If this cpu gets work to do, stop the load balancing * work being done for other cpus. Next load @@ -3973,7 +3973,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) rq->in_nohz_recently = 0; if (atomic_read(&nohz.load_balancer) == cpu) { - cpu_clear(cpu, nohz.cpu_mask); + cpumask_clear_cpu(cpu, nohz.cpu_mask); atomic_set(&nohz.load_balancer, -1); } @@ -3986,7 +3986,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) * TBD: Traverse the sched domains and nominate * the nearest cpu in the nohz.cpu_mask. */ - int ilb = first_cpu(nohz.cpu_mask); + int ilb = cpumask_first(nohz.cpu_mask); if (ilb < nr_cpu_ids) resched_cpu(ilb); @@ -3998,7 +3998,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) * cpus with ticks stopped, is it time for that to stop? 
*/ if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && - cpus_weight(nohz.cpu_mask) == num_online_cpus()) { + cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { resched_cpu(cpu); return; } @@ -4008,7 +4008,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) * someone else, then no need raise the SCHED_SOFTIRQ */ if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && - cpu_isset(cpu, nohz.cpu_mask)) + cpumask_test_cpu(cpu, nohz.cpu_mask)) return; #endif if (time_after_eq(jiffies, rq->next_balance)) @@ -8309,6 +8309,9 @@ void __init sched_init(void) /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ alloc_bootmem_cpumask_var(&nohz_cpu_mask); +#ifdef CONFIG_NO_HZ + alloc_bootmem_cpumask_var(&nohz.cpu_mask); +#endif scheduler_running = 1; } -- cgit v1.2.3 From 4d2732c63e0c05cfef2a74868d08eace922dfc3e Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:10 +1030 Subject: sched: convert idle_balance() to cpumask_var_t. Impact: stack usage reduction Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves space in the stack. cpumask_var_t is just a struct cpumask for !CONFIG_CPUMASK_OFFSTACK. Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 2f8ea99df16..154a95fcea7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3676,7 +3676,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq) struct sched_domain *sd; int pulled_task = -1; unsigned long next_balance = jiffies + HZ; - cpumask_t tmpmask; + cpumask_var_t tmpmask; + + if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC)) + return; for_each_domain(this_cpu, sd) { unsigned long interval; @@ -3687,7 +3690,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) if (sd->flags & SD_BALANCE_NEWIDLE) /* If we've pulled tasks over stop searching: */ pulled_task = load_balance_newidle(this_cpu, this_rq, - sd, &tmpmask); + sd, tmpmask); interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) @@ -3702,6 +3705,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) */ this_rq->next_balance = next_balance; } + free_cpumask_var(tmpmask); } /* -- cgit v1.2.3 From a0e902452da16b79d7c9230630ed8a595d14fa85 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:11 +1030 Subject: sched: convert rebalance_domains() to cpumask_var_t. Impact: stack usage reduction Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves space in the stack. cpumask_var_t is just a struct cpumask for !CONFIG_CPUMASK_OFFSTACK. Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 154a95fcea7..67383e7f1cc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3850,7 +3850,11 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; int need_serialize; - cpumask_t tmp; + cpumask_var_t tmp; + + /* Fails alloc? Rebalancing probably not a priority right now. 
*/ + if (!alloc_cpumask_var(&tmp, GFP_ATOMIC)) + return; for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) @@ -3875,7 +3879,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) } if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) { + if (load_balance(cpu, rq, sd, idle, &balance, tmp)) { /* * We've pulled tasks over so either we're no * longer idle, or one of our SMT siblings is @@ -3909,6 +3913,8 @@ out: */ if (likely(update_next_balance)) rq->next_balance = next_balance; + + free_cpumask_var(tmp); } /* -- cgit v1.2.3 From f17c860760927c2a8e41a021eab3317e4415e962 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:11 +1030 Subject: sched: convert sys_sched_getaffinity() to cpumask_var_t. Impact: stack usage reduction Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves space in the stack. cpumask_var_t is just a struct cpumask for !CONFIG_CPUMASK_OFFSTACK. Some jiggling here to make sure we always exit at the bottom (so we hit the free_cpumask_var there). Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 67383e7f1cc..6deff24349b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5499,19 +5499,24 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, unsigned long __user *user_mask_ptr) { int ret; - cpumask_t mask; + cpumask_var_t mask; - if (len < sizeof(cpumask_t)) + if (len < cpumask_size()) return -EINVAL; - ret = sched_getaffinity(pid, &mask); - if (ret < 0) - return ret; + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; - if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) - return -EFAULT; + ret = sched_getaffinity(pid, mask); + if (ret == 0) { + if (copy_to_user(user_mask_ptr, mask, cpumask_size())) + ret = -EFAULT; + else + ret = cpumask_size(); + } + free_cpumask_var(mask); - return sizeof(cpumask_t); + return ret; } /** -- cgit v1.2.3 From e76bd8d9850c2296a7e8e24c9dce9b5e6b55fe2f Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:11 +1030 Subject: sched: avoid stack var in move_task_off_dead_cpu Impact: stack usage reduction With some care, we can avoid needing a temporary cpumask (we can't really allocate here, since we can't fail). This version calls cpuset_cpus_allowed_locked() with the task_rq_lock held. I'm fairly sure this works, but there might be a deadlock hiding. And of course, we can't get rid of the last cpumask on stack until we can use cpumask_of_node instead of node_to_cpumask. Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 78 +++++++++++++++++++++++++++------------------------------- 1 file changed, 36 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6deff24349b..f7dee2029e4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6112,52 +6112,46 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) { unsigned long flags; - cpumask_t mask; struct rq *rq; int dest_cpu; + /* FIXME: Use cpumask_of_node here. */ + cpumask_t _nodemask = node_to_cpumask(cpu_to_node(dead_cpu)); + const struct cpumask *nodemask = &_nodemask; + +again: + /* Look for allowed, online CPU in same node. 
*/ + for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask) + if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) + goto move; + + /* Any allowed, online CPU? */ + dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask); + if (dest_cpu < nr_cpu_ids) + goto move; + + /* No more Mr. Nice Guy. */ + if (dest_cpu >= nr_cpu_ids) { + rq = task_rq_lock(p, &flags); + cpuset_cpus_allowed_locked(p, &p->cpus_allowed); + dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed); + task_rq_unlock(rq, &flags); - do { - /* On same node? */ - node_to_cpumask_ptr(pnodemask, cpu_to_node(dead_cpu)); - - cpus_and(mask, *pnodemask, p->cpus_allowed); - dest_cpu = cpumask_any_and(cpu_online_mask, &mask); - - /* On any allowed CPU? */ - if (dest_cpu >= nr_cpu_ids) - dest_cpu = cpumask_any_and(cpu_online_mask, - &p->cpus_allowed); - - /* No more Mr. Nice Guy. */ - if (dest_cpu >= nr_cpu_ids) { - cpumask_t cpus_allowed; - - cpuset_cpus_allowed_locked(p, &cpus_allowed); - /* - * Try to stay on the same cpuset, where the - * current cpuset may be a subset of all cpus. - * The cpuset_cpus_allowed_locked() variant of - * cpuset_cpus_allowed() will not block. It must be - * called within calls to cpuset_lock/cpuset_unlock. - */ - rq = task_rq_lock(p, &flags); - p->cpus_allowed = cpus_allowed; - dest_cpu = cpumask_any_and(cpu_online_mask, - &p->cpus_allowed); - task_rq_unlock(rq, &flags); - - /* - * Don't tell them about moving exiting tasks or - * kernel threads (both mm NULL), since they never - * leave kernel. - */ - if (p->mm && printk_ratelimit()) { - printk(KERN_INFO "process %d (%s) no " - "longer affine to cpu%d\n", - task_pid_nr(p), p->comm, dead_cpu); - } + /* + * Don't tell them about moving exiting tasks or + * kernel threads (both mm NULL), since they never + * leave kernel. + */ + if (p->mm && printk_ratelimit()) { + printk(KERN_INFO "process %d (%s) no " + "longer affine to cpu%d\n", + task_pid_nr(p), p->comm, dead_cpu); } - } while (!__migrate_task_irq(p, dead_cpu, dest_cpu)); + } + +move: + /* It can have affinity changed while we were choosing. */ + if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) + goto again; } /* -- cgit v1.2.3 From 5a16f3d30ca4e3f166d691220c003066a14e32b5 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:11 +1030 Subject: sched: convert (sys_)sched_setaffinity() to cpumask_var_t. Impact: stack usage reduction Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves space on the stack. cpumask_var_t is just a struct cpumask for !CONFIG_CPUMASK_OFFSTACK. Note the removal of the initializer of new_mask: since the first thing we did was "cpus_and(new_mask, new_mask, cpus_allowed)", I just changed that to "cpumask_and(new_mask, in_mask, cpus_allowed);".
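For illustration only (not part of the patch; the function name below is invented), a minimal sketch of the allocate/use/free pattern this conversion introduces, including the goto-based unwind when a second allocation fails:

#include <linux/cpumask.h>
#include <linux/gfp.h>
#include <linux/errno.h>

/* Hypothetical helper showing the cpumask_var_t alloc/unwind idiom. */
static int example_and_online(const struct cpumask *in_mask)
{
	cpumask_var_t allowed, new_mask;
	int ret = 0;

	if (!alloc_cpumask_var(&allowed, GFP_KERNEL))
		return -ENOMEM;
	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
		ret = -ENOMEM;
		goto out_free_allowed;
	}

	/* A cpumask_var_t is used exactly like a struct cpumask pointer. */
	cpumask_copy(allowed, cpu_online_mask);
	cpumask_and(new_mask, in_mask, allowed);

	free_cpumask_var(new_mask);
out_free_allowed:
	free_cpumask_var(allowed);
	return ret;
}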
Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 40 +++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f7dee2029e4..2d4ff91e0c9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5378,8 +5378,7 @@ out_unlock: long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) { - cpumask_t cpus_allowed; - cpumask_t new_mask = *in_mask; + cpumask_var_t cpus_allowed, new_mask; struct task_struct *p; int retval; @@ -5401,6 +5400,14 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) get_task_struct(p); read_unlock(&tasklist_lock); + if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { + retval = -ENOMEM; + goto out_put_task; + } + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { + retval = -ENOMEM; + goto out_free_cpus_allowed; + } retval = -EPERM; if ((current->euid != p->euid) && (current->euid != p->uid) && !capable(CAP_SYS_NICE)) @@ -5410,24 +5417,28 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) if (retval) goto out_unlock; - cpuset_cpus_allowed(p, &cpus_allowed); - cpus_and(new_mask, new_mask, cpus_allowed); + cpuset_cpus_allowed(p, cpus_allowed); + cpumask_and(new_mask, in_mask, cpus_allowed); again: - retval = set_cpus_allowed_ptr(p, &new_mask); + retval = set_cpus_allowed_ptr(p, new_mask); if (!retval) { - cpuset_cpus_allowed(p, &cpus_allowed); - if (!cpus_subset(new_mask, cpus_allowed)) { + cpuset_cpus_allowed(p, cpus_allowed); + if (!cpumask_subset(new_mask, cpus_allowed)) { /* * We must have raced with a concurrent cpuset * update. Just reset the cpus_allowed to the * cpuset's cpus_allowed */ - new_mask = cpus_allowed; + cpumask_copy(new_mask, cpus_allowed); goto again; } } out_unlock: + free_cpumask_var(new_mask); +out_free_cpus_allowed: + free_cpumask_var(cpus_allowed); +out_put_task: put_task_struct(p); put_online_cpus(); return retval; @@ -5453,14 +5464,17 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, unsigned long __user *user_mask_ptr) { - cpumask_t new_mask; + cpumask_var_t new_mask; int retval; - retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); - if (retval) - return retval; + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) + return -ENOMEM; - return sched_setaffinity(pid, &new_mask); + retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); + if (retval == 0) + retval = sched_setaffinity(pid, new_mask); + free_cpumask_var(new_mask); + return retval; } long sched_getaffinity(pid_t pid, cpumask_t *mask) -- cgit v1.2.3 From d5dd3db1dce73cdd5c45c5a3498c51bd21b8864b Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:12 +1030 Subject: sched: convert sched_domain_debug to cpumask_var_t. Impact: stack usage reduction Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves stack space. cpumask_var_t is just a struct cpumask for !CONFIG_CPUMASK_OFFSTACK. In this case, we always alloced, but we don't need to any more. 
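The reason the explicit allocation becomes unnecessary in the common case is how cpumask_var_t is typed. The following is a paraphrased sketch of the cpumask API (see <linux/cpumask.h>), not verbatim kernel source:

#ifdef CONFIG_CPUMASK_OFFSTACK
/* The variable is a pointer; alloc_cpumask_var() kmallocs and can fail. */
typedef struct cpumask *cpumask_var_t;
#else
/* The variable itself is the storage, so "allocation" is a no-op. */
typedef struct cpumask cpumask_var_t[1];
static inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
{
	return true;
}
#endif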
Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 2d4ff91e0c9..24012c2a889 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6706,7 +6706,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, static void sched_domain_debug(struct sched_domain *sd, int cpu) { - cpumask_t *groupmask; + cpumask_var_t groupmask; int level = 0; if (!sd) { @@ -6716,8 +6716,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); - groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL); - if (!groupmask) { + if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) { printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); return; } @@ -6730,7 +6729,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) if (!sd) break; } - kfree(groupmask); + free_cpumask_var(groupmask); } #else /* !CONFIG_SCHED_DEBUG */ # define sched_domain_debug(sd, cpu) do { } while (0) -- cgit v1.2.3 From dcc30a35f71bcf51f1e9b336dc5e41923071509a Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:12 +1030 Subject: sched: convert cpu_isolated_map to cpumask_var_t. Impact: stack usage reduction, (future) size reduction, cleanup Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves space for small nr_cpu_ids but big CONFIG_NR_CPUS. cpumask_var_t is just a struct cpumask for !CONFIG_CPUMASK_OFFSTACK. We can also use cpulist_parse() instead of doing it manually in isolated_cpu_setup. Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 24012c2a889..526618fe4a7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6917,19 +6917,12 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) } /* cpus with isolated domains */ -static cpumask_t cpu_isolated_map = CPU_MASK_NONE; +static cpumask_var_t cpu_isolated_map; /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) { - static int __initdata ints[NR_CPUS]; - int i; - - str = get_options(str, ARRAY_SIZE(ints), ints); - cpus_clear(cpu_isolated_map); - for (i = 1; i <= ints[0]; i++) - if (ints[i] < NR_CPUS) - cpu_set(ints[i], cpu_isolated_map); + cpulist_parse(str, *cpu_isolated_map); return 1; } @@ -7727,7 +7720,7 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map) doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL); if (!doms_cur) doms_cur = &fallback_doms; - cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map); + cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map); dattr_cur = NULL; err = build_sched_domains(doms_cur); register_sched_domain_sysctl(); @@ -7826,7 +7819,7 @@ match1: if (doms_new == NULL) { ndoms_cur = 0; doms_new = &fallback_doms; - cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); + cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map); WARN_ON_ONCE(dattr_new); } @@ -7985,7 +7978,9 @@ static int update_runtime(struct notifier_block *nfb, void __init sched_init_smp(void) { - cpumask_t non_isolated_cpus; + cpumask_var_t non_isolated_cpus; + + alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); #if defined(CONFIG_NUMA) sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), @@ -7994,10 +7989,10 @@ void 
__init sched_init_smp(void) #endif get_online_cpus(); mutex_lock(&sched_domains_mutex); - arch_init_sched_domains(&cpu_online_map); - cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); - if (cpus_empty(non_isolated_cpus)) - cpu_set(smp_processor_id(), non_isolated_cpus); + arch_init_sched_domains(cpu_online_mask); + cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); + if (cpumask_empty(non_isolated_cpus)) + cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); mutex_unlock(&sched_domains_mutex); put_online_cpus(); @@ -8012,9 +8007,10 @@ void __init sched_init_smp(void) init_hrtick(); /* Move init over to a non-isolated CPU */ - if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0) + if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) BUG(); sched_init_granularity(); + free_cpumask_var(non_isolated_cpus); } #else void __init sched_init_smp(void) @@ -8334,6 +8330,7 @@ void __init sched_init(void) #ifdef CONFIG_NO_HZ alloc_bootmem_cpumask_var(&nohz.cpu_mask); #endif + alloc_bootmem_cpumask_var(&cpu_isolated_map); scheduler_running = 1; } -- cgit v1.2.3 From 4212823fb459eacc8098dd420bb68ebb9917989d Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:12 +1030 Subject: sched: convert fallback_doms to cpumask_var_t. Impact: (future) size reduction for large NR_CPUS. Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves space for small nr_cpu_ids but big CONFIG_NR_CPUS. cpumask_var_t is just a struct cpumask for !CONFIG_CPUMASK_OFFSTACK. Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 526618fe4a7..42588ad93b2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7697,10 +7697,10 @@ static struct sched_domain_attr *dattr_cur; /* * Special case: If a kmalloc of a doms_cur partition (array of - * cpumask_t) fails, then fallback to a single sched domain, - * as determined by the single cpumask_t fallback_doms. + * cpumask) fails, then fallback to a single sched domain, - * as determined by the single cpumask fallback_doms. */ -static cpumask_t fallback_doms; +static cpumask_var_t fallback_doms; void __attribute__((weak)) arch_update_cpu_topology(void) { @@ -7719,7 +7719,7 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map) ndoms_cur = 1; doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL); if (!doms_cur) - doms_cur = &fallback_doms; + doms_cur = fallback_doms; cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map); dattr_cur = NULL; err = build_sched_domains(doms_cur); @@ -7818,7 +7818,7 @@ match1: if (doms_new == NULL) { ndoms_cur = 0; - doms_new = &fallback_doms; + doms_new = fallback_doms; cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map); WARN_ON_ONCE(dattr_new); } @@ -7838,7 +7838,7 @@ match2: } /* Remember the new sched domains */ - if (doms_cur != &fallback_doms) + if (doms_cur != fallback_doms) kfree(doms_cur); kfree(dattr_cur); /* kfree(NULL) is safe */ doms_cur = doms_new; @@ -8011,6 +8011,8 @@ void __init sched_init_smp(void) BUG(); sched_init_granularity(); free_cpumask_var(non_isolated_cpus); + + alloc_cpumask_var(&fallback_doms, GFP_KERNEL); } #else void __init sched_init_smp(void) -- cgit v1.2.3 From 68e74568fbe5854952355e942acca51f138096d9 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:13 +1030 Subject: sched: convert struct cpupri_vec to cpumask_var_t.
Impact: stack usage reduction, (future) size reduction for large NR_CPUS. Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves space for small nr_cpu_ids but big CONFIG_NR_CPUS. The fact that cpupri_init() is called both before and after the slab is available makes for an ugly parameter, unfortunately. We also use cpumask_any_and() to get rid of a temporary in cpupri_find(). Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 9 +++++++-- kernel/sched_cpupri.c | 39 ++++++++++++++++++++++++++++----------- kernel/sched_cpupri.h | 5 +++-- 3 files changed, 38 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 42588ad93b2..94fa333c1e7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6792,6 +6792,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) static void free_rootdomain(struct root_domain *rd) { + cpupri_cleanup(&rd->cpupri); + free_cpumask_var(rd->rto_mask); free_cpumask_var(rd->online); free_cpumask_var(rd->span); @@ -6834,7 +6836,7 @@ static int init_rootdomain(struct root_domain *rd, bool bootmem) alloc_bootmem_cpumask_var(&def_root_domain.span); alloc_bootmem_cpumask_var(&def_root_domain.online); alloc_bootmem_cpumask_var(&def_root_domain.rto_mask); - cpupri_init(&rd->cpupri); + cpupri_init(&rd->cpupri, true); return 0; } @@ -6845,9 +6847,12 @@ static int init_rootdomain(struct root_domain *rd, bool bootmem) if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) goto free_online; - cpupri_init(&rd->cpupri); + if (cpupri_init(&rd->cpupri, false) != 0) + goto free_rto_mask; return 0; +free_rto_mask: + free_cpumask_var(rd->rto_mask); free_online: free_cpumask_var(rd->online); free_span: diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index 52154fefab7..018b7be1db2 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -67,24 +67,21 @@ static int convert_prio(int prio) * Returns: (int)bool - CPUs were found */ int cpupri_find(struct cpupri *cp, struct task_struct *p, - cpumask_t *lowest_mask) + struct cpumask *lowest_mask) { int idx = 0; int task_pri = convert_prio(p->prio); for_each_cpupri_active(cp->pri_active, idx) { struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; - cpumask_t mask; if (idx >= task_pri) break; - cpus_and(mask, p->cpus_allowed, vec->mask); - - if (cpus_empty(mask)) + if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) continue; - *lowest_mask = mask; + cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); return 1; } @@ -126,7 +123,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) vec->count--; if (!vec->count) clear_bit(oldpri, cp->pri_active); - cpu_clear(cpu, vec->mask); + cpumask_clear_cpu(cpu, vec->mask); spin_unlock_irqrestore(&vec->lock, flags); } @@ -136,7 +133,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) spin_lock_irqsave(&vec->lock, flags); - cpu_set(cpu, vec->mask); + cpumask_set_cpu(cpu, vec->mask); vec->count++; if (vec->count == 1) set_bit(newpri, cp->pri_active); @@ -150,10 +147,11 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) /** * cpupri_init - initialize the cpupri structure * @cp: The cpupri context + * @bootmem: true if allocations need to use bootmem * - * Returns: (void) + * Returns: -ENOMEM if memory fails.
*/ -void cpupri_init(struct cpupri *cp) +int cpupri_init(struct cpupri *cp, bool bootmem) { int i; @@ -164,11 +162,30 @@ void cpupri_init(struct cpupri *cp) spin_lock_init(&vec->lock); vec->count = 0; - cpus_clear(vec->mask); + if (bootmem) + alloc_bootmem_cpumask_var(&vec->mask); + else if (!alloc_cpumask_var(&vec->mask, GFP_KERNEL)) + goto cleanup; } for_each_possible_cpu(i) cp->cpu_to_pri[i] = CPUPRI_INVALID; + return 0; + +cleanup: + for (i--; i >= 0; i--) + free_cpumask_var(cp->pri_to_cpu[i].mask); + return -ENOMEM; } +/** + * cpupri_cleanup - clean up the cpupri structure + * @cp: The cpupri context + */ +void cpupri_cleanup(struct cpupri *cp) +{ + int i; + for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) + free_cpumask_var(cp->pri_to_cpu[i].mask); +} diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h index f25811b0f93..642a94ef8a0 100644 --- a/kernel/sched_cpupri.h +++ b/kernel/sched_cpupri.h @@ -14,7 +14,7 @@ struct cpupri_vec { spinlock_t lock; int count; - cpumask_t mask; + cpumask_var_t mask; }; struct cpupri { @@ -27,7 +27,8 @@ struct cpupri { int cpupri_find(struct cpupri *cp, struct task_struct *p, cpumask_t *lowest_mask); void cpupri_set(struct cpupri *cp, int cpu, int pri); -void cpupri_init(struct cpupri *cp); +int cpupri_init(struct cpupri *cp, bool bootmem); +void cpupri_cleanup(struct cpupri *cp); #else #define cpupri_set(cp, cpu, pri) do { } while (0) #define cpupri_init() do { } while (0) -- cgit v1.2.3 From 24600ce89a819a8f2fb4fd69fd777218a82ade20 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:13 +1030 Subject: sched: convert check_preempt_equal_prio to cpumask_var_t. Impact: stack reduction for large NR_CPUS Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves stack space. We simply return if the allocation fails: since we don't use it we could just pass NULL to cpupri_find and have it handle that. Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 820fc422c6d..1fa13624293 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -805,17 +805,20 @@ static int select_task_rq_rt(struct task_struct *p, int sync) static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) { - cpumask_t mask; + cpumask_var_t mask; if (rq->curr->rt.nr_cpus_allowed == 1) return; - if (p->rt.nr_cpus_allowed != 1 - && cpupri_find(&rq->rd->cpupri, p, &mask)) + if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) return; - if (!cpupri_find(&rq->rd->cpupri, rq->curr, &mask)) - return; + if (p->rt.nr_cpus_allowed != 1 + && cpupri_find(&rq->rd->cpupri, p, mask)) + goto free; + + if (!cpupri_find(&rq->rd->cpupri, rq->curr, mask)) + goto free; /* * There appears to be other cpus that can accept @@ -824,6 +827,8 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) */ requeue_task_rt(rq, p, 1); resched_task(rq->curr); +free: + free_cpumask_var(mask); } #endif /* CONFIG_SMP */ -- cgit v1.2.3 From 0e3900e6d3b04c44737ebc505604dcd8ed30e354 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:13 +1030 Subject: sched: convert local_cpu_mask to cpumask_var_t. Impact: (future) size reduction for large NR_CPUS. Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves space for small nr_cpu_ids but big CONFIG_NR_CPUS. cpumask_var_t is just a struct cpumask for !CONFIG_CPUMASK_OFFSTACK. 
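As an illustrative sketch (the identifiers example_mask and example_init are invented), the per-CPU variant of this pattern allocates each CPU's mask once at boot, mirroring the init_sched_rt_class() added below:

#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <linux/gfp.h>

static DEFINE_PER_CPU(cpumask_var_t, example_mask);

static void __init example_init(void)
{
	unsigned int i;

	/* One allocation per possible CPU; a no-op if !CPUMASK_OFFSTACK. */
	for_each_possible_cpu(i)
		alloc_cpumask_var(&per_cpu(example_mask, i), GFP_KERNEL);
}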
Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 + kernel/sched_rt.c | 13 +++++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 94fa333c1e7..f2be6187003 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8018,6 +8018,7 @@ void __init sched_init_smp(void) free_cpumask_var(non_isolated_cpus); alloc_cpumask_var(&fallback_doms, GFP_KERNEL); + init_sched_rt_class(); } #else void __init sched_init_smp(void) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 1fa13624293..1f0e99d1a8c 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -962,7 +962,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) return next; } -static DEFINE_PER_CPU(cpumask_t, local_cpu_mask); +static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) { @@ -982,7 +982,7 @@ static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) static int find_lowest_rq(struct task_struct *task) { struct sched_domain *sd; - cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask); + cpumask_t *lowest_mask = __get_cpu_var(local_cpu_mask); int this_cpu = smp_processor_id(); int cpu = task_cpu(task); @@ -1551,3 +1551,12 @@ static void print_rt_stats(struct seq_file *m, int cpu) rcu_read_unlock(); } #endif /* CONFIG_SCHED_DEBUG */ + +/* Note that this is never called for !SMP, but that's OK. */ +static inline void init_sched_rt_class(void) +{ + unsigned int i; + + for_each_possible_cpu(i) + alloc_cpumask_var(&per_cpu(local_cpu_mask, i), GFP_KERNEL); +} -- cgit v1.2.3 From 96f874e26428ab5d2db681c100210c254775e154 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 02:35:14 +1030 Subject: sched: convert remaining old-style cpumask operators Impact: Trivial API conversion NR_CPUS -> nr_cpu_ids cpumask_t -> struct cpumask sizeof(cpumask_t) -> cpumask_size() cpumask_a = cpumask_b -> cpumask_copy(&cpumask_a, &cpumask_b) cpu_set() -> cpumask_set_cpu() first_cpu() -> cpumask_first() cpumask_of_cpu() -> cpumask_of() cpus_* -> cpumask_* There are some FIXMEs where we wait for all archs to complete infrastructure (patches have been sent): cpu_coregroup_map -> cpu_coregroup_mask node_to_cpumask* -> cpumask_of_node There is also one FIXME where we pass an array of cpumasks to partition_sched_domains(): this implies knowing the definition of 'struct cpumask' and the size of a cpumask. This will be fixed in a future patch. Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 212 ++++++++++++++++++++++++++++------------------ kernel/sched_fair.c | 4 +- kernel/sched_rt.c | 18 ++--- 3 files changed, 124 insertions(+), 110 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f2be6187003..eba6a156d33 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2829,7 +2829,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu) struct rq *rq; rq = task_rq_lock(p, &flags); - if (!cpu_isset(dest_cpu, p->cpus_allowed) + if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) || unlikely(!cpu_active(dest_cpu))) goto out; @@ -2895,7 +2895,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * 2) cannot be migrated to this CPU due to cpus_allowed, or * 3) are cache-hot on their current CPU.
*/ - if (!cpu_isset(this_cpu, p->cpus_allowed)) { + if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { schedstat_inc(p, se.nr_failed_migrations_affine); return 0; } @@ -3070,7 +3070,7 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long *imbalance, enum cpu_idle_type idle, - int *sd_idle, const cpumask_t *cpus, int *balance) + int *sd_idle, const struct cpumask *cpus, int *balance) { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; @@ -3387,7 +3387,7 @@ ret: */ static struct rq * find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, - unsigned long imbalance, const cpumask_t *cpus) + unsigned long imbalance, const struct cpumask *cpus) { struct rq *busiest = NULL, *rq; unsigned long max_load = 0; @@ -3396,7 +3396,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, for_each_cpu(i, sched_group_cpus(group)) { unsigned long wl; - if (!cpu_isset(i, *cpus)) + if (!cpumask_test_cpu(i, cpus)) continue; rq = cpu_rq(i); @@ -3426,7 +3426,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, */ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, - int *balance, cpumask_t *cpus) + int *balance, struct cpumask *cpus) { int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; struct sched_group *group; @@ -3434,7 +3434,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct rq *busiest; unsigned long flags; - cpus_setall(*cpus); + cpumask_setall(cpus); /* * When power savings policy is enabled for the parent domain, idle @@ -3494,8 +3494,8 @@ redo: /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(all_pinned)) { - cpu_clear(cpu_of(busiest), *cpus); - if (!cpus_empty(*cpus)) + cpumask_clear_cpu(cpu_of(busiest), cpus); + if (!cpumask_empty(cpus)) goto redo; goto out_balanced; } @@ -3512,7 +3512,8 @@ redo: /* don't kick the migration_thread, if the curr * task on busiest cpu can't be moved to this_cpu */ - if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { + if (!cpumask_test_cpu(this_cpu, + &busiest->curr->cpus_allowed)) { spin_unlock_irqrestore(&busiest->lock, flags); all_pinned = 1; goto out_one_pinned; @@ -3587,7 +3588,7 @@ out: */ static int load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, - cpumask_t *cpus) + struct cpumask *cpus) { struct sched_group *group; struct rq *busiest = NULL; @@ -3596,7 +3597,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, int sd_idle = 0; int all_pinned = 0; - cpus_setall(*cpus); + cpumask_setall(cpus); /* * When power savings policy is enabled for the parent domain, idle @@ -3640,8 +3641,8 @@ redo: double_unlock_balance(this_rq, busiest); if (unlikely(all_pinned)) { - cpu_clear(cpu_of(busiest), *cpus); - if (!cpus_empty(*cpus)) + cpumask_clear_cpu(cpu_of(busiest), cpus); + if (!cpumask_empty(cpus)) goto redo; } } @@ -5376,7 +5377,7 @@ out_unlock: return retval; } -long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) +long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) { cpumask_var_t cpus_allowed, new_mask; struct task_struct *p; @@ -5445,13 +5446,13 @@ out_put_task: } static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, - cpumask_t *new_mask) + struct cpumask *new_mask) { - if 
(len < sizeof(cpumask_t)) { - memset(new_mask, 0, sizeof(cpumask_t)); - } else if (len > sizeof(cpumask_t)) { - len = sizeof(cpumask_t); - } + if (len < cpumask_size()) + cpumask_clear(new_mask); + else if (len > cpumask_size()) + len = cpumask_size(); + return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; } @@ -5477,7 +5478,7 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, return retval; } -long sched_getaffinity(pid_t pid, cpumask_t *mask) +long sched_getaffinity(pid_t pid, struct cpumask *mask) { struct task_struct *p; int retval; @@ -5494,7 +5495,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) if (retval) goto out_unlock; - cpus_and(*mask, p->cpus_allowed, cpu_online_map); + cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); out_unlock: read_unlock(&tasklist_lock); @@ -5872,7 +5873,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) idle->se.exec_start = sched_clock(); idle->prio = idle->normal_prio = MAX_PRIO; - idle->cpus_allowed = cpumask_of_cpu(cpu); + cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); __set_task_cpu(idle, cpu); rq->curr = rq->idle = idle; @@ -5956,7 +5957,7 @@ static inline void sched_init_granularity(void) * task must not exit() & deallocate itself prematurely. The * call is not atomic; no spinlocks may be held. */ -int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) +int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) { struct migration_req req; unsigned long flags; @@ -5964,13 +5965,13 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) int ret = 0; rq = task_rq_lock(p, &flags); - if (!cpus_intersects(*new_mask, cpu_online_map)) { + if (!cpumask_intersects(new_mask, cpu_online_mask)) { ret = -EINVAL; goto out; } if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && - !cpus_equal(p->cpus_allowed, *new_mask))) { + !cpumask_equal(&p->cpus_allowed, new_mask))) { ret = -EINVAL; goto out; } @@ -5978,12 +5979,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) if (p->sched_class->set_cpus_allowed) p->sched_class->set_cpus_allowed(p, new_mask); else { - p->cpus_allowed = *new_mask; - p->rt.nr_cpus_allowed = cpus_weight(*new_mask); + cpumask_copy(&p->cpus_allowed, new_mask); + p->rt.nr_cpus_allowed = cpumask_weight(new_mask); } /* Can the task run on the task's current CPU? If so, we're done */ - if (cpu_isset(task_cpu(p), *new_mask)) + if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { @@ -6028,7 +6029,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) if (task_cpu(p) != src_cpu) goto done; /* Affinity changed (again). 
*/ - if (!cpu_isset(dest_cpu, p->cpus_allowed)) + if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) goto fail; on_rq = p->se.on_rq; @@ -6629,13 +6630,13 @@ early_initcall(migration_init); #ifdef CONFIG_SCHED_DEBUG static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, - cpumask_t *groupmask) + struct cpumask *groupmask) { struct sched_group *group = sd->groups; char str[256]; cpulist_scnprintf(str, sizeof(str), *sched_domain_span(sd)); - cpus_clear(*groupmask); + cpumask_clear(groupmask); printk(KERN_DEBUG "%*s domain %d: ", level, "", level); @@ -6936,24 +6937,25 @@ __setup("isolcpus=", isolated_cpu_setup); /* * init_sched_build_groups takes the cpumask we wish to span, and a pointer * to a function which identifies what group(along with sched group) a CPU - * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS - * (due to the fact that we keep track of groups covered with a cpumask_t). + * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids + * (due to the fact that we keep track of groups covered with a struct cpumask). * * init_sched_build_groups will build a circular linked list of the groups * covered by the given span, and will set each group's ->cpumask correctly, * and ->cpu_power to 0. */ static void -init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, - int (*group_fn)(int cpu, const cpumask_t *cpu_map, +init_sched_build_groups(const struct cpumask *span, + const struct cpumask *cpu_map, + int (*group_fn)(int cpu, const struct cpumask *cpu_map, struct sched_group **sg, - cpumask_t *tmpmask), - cpumask_t *covered, cpumask_t *tmpmask) + struct cpumask *tmpmask), + struct cpumask *covered, struct cpumask *tmpmask) { struct sched_group *first = NULL, *last = NULL; int i; - cpus_clear(*covered); + cpumask_clear(covered); for_each_cpu(i, span) { struct sched_group *sg; @@ -6970,7 +6972,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, if (group_fn(j, cpu_map, NULL, tmpmask) != group) continue; - cpu_set(j, *covered); + cpumask_set_cpu(j, covered); cpumask_set_cpu(j, sched_group_cpus(sg)); } if (!first) @@ -7035,9 +7037,10 @@ static int find_next_best_node(int node, nodemask_t *used_nodes) * should be one that prevents unnecessary balancing, but also spreads tasks * out optimally. 
*/ -static void sched_domain_node_span(int node, cpumask_t *span) +static void sched_domain_node_span(int node, struct cpumask *span) { nodemask_t used_nodes; + /* FIXME: use cpumask_of_node() */ node_to_cpumask_ptr(nodemask, node); int i; @@ -7081,8 +7084,8 @@ static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus); static int -cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, - cpumask_t *unused) +cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *unused) { if (sg) *sg = &per_cpu(sched_group_cpus, cpu).sg; @@ -7100,22 +7103,21 @@ static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) static int -cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, - cpumask_t *mask) +cpu_to_core_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *mask) { int group; - *mask = per_cpu(cpu_sibling_map, cpu); - cpus_and(*mask, *mask, *cpu_map); - group = first_cpu(*mask); + cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); + group = cpumask_first(mask); if (sg) *sg = &per_cpu(sched_group_core, group).sg; return group; } #elif defined(CONFIG_SCHED_MC) static int -cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, - cpumask_t *unused) +cpu_to_core_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *unused) { if (sg) *sg = &per_cpu(sched_group_core, cpu).sg; @@ -7127,18 +7129,18 @@ static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); static int -cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, - cpumask_t *mask) +cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *mask) { int group; #ifdef CONFIG_SCHED_MC + /* FIXME: Use cpu_coregroup_mask. 
*/ *mask = cpu_coregroup_map(cpu); cpus_and(*mask, *mask, *cpu_map); - group = first_cpu(*mask); + group = cpumask_first(mask); #elif defined(CONFIG_SCHED_SMT) - *mask = per_cpu(cpu_sibling_map, cpu); - cpus_and(*mask, *mask, *cpu_map); - group = first_cpu(*mask); + cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); + group = cpumask_first(mask); #else group = cpu; #endif @@ -7159,14 +7161,16 @@ static struct sched_group ***sched_group_nodes_bycpu; static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); -static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, - struct sched_group **sg, cpumask_t *nodemask) +static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, + struct cpumask *nodemask) { int group; + /* FIXME: use cpumask_of_node */ node_to_cpumask_ptr(pnodemask, cpu_to_node(cpu)); - cpus_and(*nodemask, *pnodemask, *cpu_map); - group = first_cpu(*nodemask); + cpumask_and(nodemask, pnodemask, cpu_map); + group = cpumask_first(nodemask); if (sg) *sg = &per_cpu(sched_group_allnodes, group).sg; @@ -7202,7 +7206,8 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) #ifdef CONFIG_NUMA /* Free memory allocated for various sched_group structures */ -static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) +static void free_sched_groups(const struct cpumask *cpu_map, + struct cpumask *nodemask) { int cpu, i; @@ -7215,10 +7220,11 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) for (i = 0; i < nr_node_ids; i++) { struct sched_group *oldsg, *sg = sched_group_nodes[i]; + /* FIXME: Use cpumask_of_node */ node_to_cpumask_ptr(pnodemask, i); cpus_and(*nodemask, *pnodemask, *cpu_map); - if (cpus_empty(*nodemask)) + if (cpumask_empty(nodemask)) continue; if (sg == NULL) @@ -7236,7 +7242,8 @@ next_sg: } } #else /* !CONFIG_NUMA */ -static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) +static void free_sched_groups(const struct cpumask *cpu_map, + struct cpumask *nodemask) { } #endif /* CONFIG_NUMA */ @@ -7366,7 +7373,7 @@ static void set_domain_attribute(struct sched_domain *sd, * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus */ -static int __build_sched_domains(const cpumask_t *cpu_map, +static int __build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) { int i, err = -ENOMEM; @@ -7416,7 +7423,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, } #ifdef CONFIG_NUMA - sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; + sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes; #endif /* @@ -7425,12 +7432,13 @@ static int __build_sched_domains(const cpumask_t *cpu_map, for_each_cpu(i, cpu_map) { struct sched_domain *sd = NULL, *p; + /* FIXME: use cpumask_of_node */ *nodemask = node_to_cpumask(cpu_to_node(i)); cpus_and(*nodemask, *nodemask, *cpu_map); #ifdef CONFIG_NUMA - if (cpus_weight(*cpu_map) > - SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) { + if (cpumask_weight(cpu_map) > + SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) { sd = &per_cpu(allnodes_domains, i); SD_INIT(sd, ALLNODES); set_domain_attribute(sd, attr); @@ -7491,9 +7499,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #ifdef CONFIG_SCHED_SMT /* Set up CPU (sibling) groups */ for_each_cpu(i, cpu_map) { - *this_sibling_map = per_cpu(cpu_sibling_map, i); - 
cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map); - if (i != first_cpu(*this_sibling_map)) + cpumask_and(this_sibling_map, + &per_cpu(cpu_sibling_map, i), cpu_map); + if (i != cpumask_first(this_sibling_map)) continue; init_sched_build_groups(this_sibling_map, cpu_map, @@ -7505,9 +7513,10 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #ifdef CONFIG_SCHED_MC /* Set up multi-core groups */ for_each_cpu(i, cpu_map) { + /* FIXME: Use cpu_coregroup_mask */ *this_core_map = cpu_coregroup_map(i); cpus_and(*this_core_map, *this_core_map, *cpu_map); - if (i != first_cpu(*this_core_map)) + if (i != cpumask_first(this_core_map)) continue; init_sched_build_groups(this_core_map, cpu_map, @@ -7518,9 +7527,10 @@ static int __build_sched_domains(const cpumask_t *cpu_map, /* Set up physical groups */ for (i = 0; i < nr_node_ids; i++) { + /* FIXME: Use cpumask_of_node */ *nodemask = node_to_cpumask(i); cpus_and(*nodemask, *nodemask, *cpu_map); - if (cpus_empty(*nodemask)) + if (cpumask_empty(nodemask)) continue; init_sched_build_groups(nodemask, cpu_map, @@ -7541,17 +7551,18 @@ static int __build_sched_domains(const cpumask_t *cpu_map, struct sched_group *sg, *prev; int j; + /* FIXME: Use cpumask_of_node */ *nodemask = node_to_cpumask(i); - cpus_clear(*covered); + cpumask_clear(covered); cpus_and(*nodemask, *nodemask, *cpu_map); - if (cpus_empty(*nodemask)) { + if (cpumask_empty(nodemask)) { sched_group_nodes[i] = NULL; continue; } sched_domain_node_span(i, domainspan); - cpus_and(*domainspan, *domainspan, *cpu_map); + cpumask_and(domainspan, domainspan, cpu_map); sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), GFP_KERNEL, i); @@ -7570,21 +7581,22 @@ static int __build_sched_domains(const cpumask_t *cpu_map, sg->__cpu_power = 0; cpumask_copy(sched_group_cpus(sg), nodemask); sg->next = sg; - cpus_or(*covered, *covered, *nodemask); + cpumask_or(covered, covered, nodemask); prev = sg; for (j = 0; j < nr_node_ids; j++) { int n = (i + j) % nr_node_ids; + /* FIXME: Use cpumask_of_node */ node_to_cpumask_ptr(pnodemask, n); - cpus_complement(*notcovered, *covered); - cpus_and(*tmpmask, *notcovered, *cpu_map); - cpus_and(*tmpmask, *tmpmask, *domainspan); - if (cpus_empty(*tmpmask)) + cpumask_complement(notcovered, covered); + cpumask_and(tmpmask, notcovered, cpu_map); + cpumask_and(tmpmask, tmpmask, domainspan); + if (cpumask_empty(tmpmask)) break; - cpus_and(*tmpmask, *tmpmask, *pnodemask); - if (cpus_empty(*tmpmask)) + cpumask_and(tmpmask, tmpmask, pnodemask); + if (cpumask_empty(tmpmask)) continue; sg = kmalloc_node(sizeof(struct sched_group) + @@ -7598,7 +7610,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, sg->__cpu_power = 0; cpumask_copy(sched_group_cpus(sg), tmpmask); sg->next = prev->next; - cpus_or(*covered, *covered, *tmpmask); + cpumask_or(covered, covered, tmpmask); prev->next = sg; prev = sg; } @@ -7634,7 +7646,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, if (sd_allnodes) { struct sched_group *sg; - cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg, + cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, tmpmask); init_numa_sched_groups_power(sg); } @@ -7690,12 +7702,12 @@ error: #endif } -static int build_sched_domains(const cpumask_t *cpu_map) +static int build_sched_domains(const struct cpumask *cpu_map) { return __build_sched_domains(cpu_map, NULL); } -static cpumask_t *doms_cur; /* current sched domains */ +static struct cpumask *doms_cur; /* current sched domains */ static int ndoms_cur; /* number of sched 
domains in 'doms_cur' */ static struct sched_domain_attr *dattr_cur; /* attribues of custom domains in 'doms_cur' */ @@ -7716,13 +7728,13 @@ void __attribute__((weak)) arch_update_cpu_topology(void) * For now this just excludes isolated cpus, but could be used to * exclude other special cases in the future. */ -static int arch_init_sched_domains(const cpumask_t *cpu_map) +static int arch_init_sched_domains(const struct cpumask *cpu_map) { int err; arch_update_cpu_topology(); ndoms_cur = 1; - doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL); + doms_cur = kmalloc(cpumask_size(), GFP_KERNEL); if (!doms_cur) doms_cur = fallback_doms; cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map); @@ -7733,8 +7745,8 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map) return err; } -static void arch_destroy_sched_domains(const cpumask_t *cpu_map, - cpumask_t *tmpmask) +static void arch_destroy_sched_domains(const struct cpumask *cpu_map, + struct cpumask *tmpmask) { free_sched_groups(cpu_map, tmpmask); } @@ -7743,15 +7755,16 @@ static void arch_destroy_sched_domains(const cpumask_t *cpu_map, * Detach sched domains from a group of cpus specified in cpu_map * These cpus will now be attached to the NULL domain */ -static void detach_destroy_domains(const cpumask_t *cpu_map) +static void detach_destroy_domains(const struct cpumask *cpu_map) { - cpumask_t tmpmask; + /* Save because hotplug lock held. */ + static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS); int i; for_each_cpu(i, cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); synchronize_sched(); - arch_destroy_sched_domains(cpu_map, &tmpmask); + arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask)); } /* handle null as "default" */ @@ -7776,7 +7789,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * doms_new[] to the current sched domain partitioning, doms_cur[]. * It destroys each deleted domain and builds each new domain. * - * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'. + * 'doms_new' is an array of cpumask's of length 'ndoms_new'. * The masks don't intersect (don't overlap.) We should setup one * sched domain for each mask. CPUs not in any of the cpumasks will * not be load balanced. If the same cpumask appears both in the @@ -7790,13 +7803,14 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * the single partition 'fallback_doms', it also forces the domains * to be rebuilt. * - * If doms_new == NULL it will be replaced with cpu_online_map. + * If doms_new == NULL it will be replaced with cpu_online_mask. * ndoms_new == 0 is a special case for destroying existing domains, * and it will not create the default domain. 
* * Call with hotplug lock held */ -void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, +/* FIXME: Change to struct cpumask *doms_new[] */ +void partition_sched_domains(int ndoms_new, struct cpumask *doms_new, struct sched_domain_attr *dattr_new) { int i, j, n; @@ -7811,7 +7825,7 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, /* Destroy deleted domains */ for (i = 0; i < ndoms_cur; i++) { for (j = 0; j < n; j++) { - if (cpus_equal(doms_cur[i], doms_new[j]) + if (cpumask_equal(&doms_cur[i], &doms_new[j]) && dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; } @@ -7831,7 +7845,7 @@ match1: /* Build new domains */ for (i = 0; i < ndoms_new; i++) { for (j = 0; j < ndoms_cur; j++) { - if (cpus_equal(doms_new[i], doms_cur[j]) + if (cpumask_equal(&doms_new[i], &doms_cur[j]) && dattrs_equal(dattr_new, i, dattr_cur, j)) goto match2; } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index bba00402ed9..08ffffd4a41 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1017,7 +1017,7 @@ static void yield_task_fair(struct rq *rq) * search starts with cpus closest then further out as needed, * so we always favor a closer, idle cpu. * Domains may include CPUs that are not usable for migration, - * hence we need to mask them out (cpu_active_map) + * hence we need to mask them out (cpu_active_mask) * * Returns the CPU we should wake onto. */ @@ -1244,7 +1244,7 @@ static int select_task_rq_fair(struct task_struct *p, int sync) } } - if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) + if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed))) goto out; /* diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 1f0e99d1a8c..fb3964579a8 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -923,7 +923,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) && + (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && (p->rt.nr_cpus_allowed > 1)) return 1; return 0; @@ -982,7 +982,7 @@ static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) static int find_lowest_rq(struct task_struct *task) { struct sched_domain *sd; - cpumask_t *lowest_mask = __get_cpu_var(local_cpu_mask); + struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); int this_cpu = smp_processor_id(); int cpu = task_cpu(task); @@ -997,7 +997,7 @@ static int find_lowest_rq(struct task_struct *task) * I guess we might want to change cpupri_find() to ignore those * in the first place. */ - cpus_and(*lowest_mask, *lowest_mask, cpu_active_map); + cpumask_and(lowest_mask, lowest_mask, cpu_active_mask); /* * At this point we have built a mask of cpus representing the @@ -1007,7 +1007,7 @@ static int find_lowest_rq(struct task_struct *task) * We prioritize the last cpu that the task executed on since * it is most likely cache-hot in that location. */ - if (cpu_isset(cpu, *lowest_mask)) + if (cpumask_test_cpu(cpu, lowest_mask)) return cpu; /* @@ -1064,8 +1064,8 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) * Also make sure that it wasn't scheduled on its rq. 
*/ if (unlikely(task_rq(task) != rq || - !cpu_isset(lowest_rq->cpu, - task->cpus_allowed) || + !cpumask_test_cpu(lowest_rq->cpu, + &task->cpus_allowed) || task_running(rq, task) || !task->se.on_rq)) { @@ -1315,9 +1315,9 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, } static void set_cpus_allowed_rt(struct task_struct *p, - const cpumask_t *new_mask) + const struct cpumask *new_mask) { - int weight = cpus_weight(*new_mask); + int weight = cpumask_weight(new_mask); BUG_ON(!rt_task(p)); @@ -1338,7 +1338,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, update_rt_migration(rq); } - p->cpus_allowed = *new_mask; + cpumask_copy(&p->cpus_allowed, new_mask); p->rt.nr_cpus_allowed = weight; } -- cgit v1.2.3 From bf4d83f66476086c6b50dc52aac00d71ad70494e Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 09:57:51 +1030 Subject: sched: convert nohz struct to cpumask_var_t, fix Impact: build fix Fix the !CONFIG_SMP case. Signed-off-by: Rusty Russell Acked-by: Mike Travis Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index eba6a156d33..1aa840a9f58 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8349,10 +8349,12 @@ void __init sched_init(void) /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ alloc_bootmem_cpumask_var(&nohz_cpu_mask); +#ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ alloc_bootmem_cpumask_var(&nohz.cpu_mask); #endif alloc_bootmem_cpumask_var(&cpu_isolated_map); +#endif /* SMP */ scheduler_running = 1; } -- cgit v1.2.3 From 3d8cbdf8650f44d95333ca645d950832a0653f35 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 09:58:41 +1030 Subject: sched: convert local_cpu_mask to cpumask_var_t, fix Impact: build fix for !CONFIG_SMP Signed-off-by: Rusty Russell Acked-by: Mike Travis Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index fb3964579a8..94aab72f6a0 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1381,6 +1381,14 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p, if (!rq->rt.rt_nr_running) pull_rt_task(rq); } + +static inline void init_sched_rt_class(void) +{ + unsigned int i; + + for_each_possible_cpu(i) + alloc_cpumask_var(&per_cpu(local_cpu_mask, i), GFP_KERNEL); +} #endif /* CONFIG_SMP */ /* @@ -1552,11 +1560,3 @@ static void print_rt_stats(struct seq_file *m, int cpu) } #endif /* CONFIG_SCHED_DEBUG */ -/* Note that this is never called for !SMP, but that's OK. */ -static inline void init_sched_rt_class(void) -{ - unsigned int i; - - for_each_possible_cpu(i) - alloc_cpumask_var(&per_cpu(local_cpu_mask, i), GFP_KERNEL); -} -- cgit v1.2.3 From 1224e376f2a7e3c7ab19ef37099a78597978a696 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 25 Nov 2008 09:59:20 +1030 Subject: sched: avoid stack var in move_task_off_dead_cpu, fix Impact: locking fix We can't call cpuset_cpus_allowed_locked() with the rq lock held. However, the rq lock merely protects us from (1) cpu_online_mask changing and (2) someone else changing p->cpus_allowed. The first can't happen because we're being called from a cpu hotplug notifier. The second doesn't really matter: we are forcing the task off a CPU it was affine to, so we're not doing very well anyway. So we remove the rq lock from this path, and all is good. 
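For reference, a hedged sketch (the helper name is invented) of the "no CPU found" convention this path relies on: cpumask search helpers such as cpumask_any_and() return nr_cpu_ids or greater when the intersection is empty:

#include <linux/cpumask.h>
#include <linux/errno.h>

static int example_pick_online(const struct cpumask *allowed)
{
	int cpu = cpumask_any_and(allowed, cpu_online_mask);

	if (cpu >= nr_cpu_ids)	/* empty intersection: nothing suitable */
		return -ENODEV;
	return cpu;		/* some allowed, online CPU */
}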
Signed-off-by: Rusty Russell Acked-by: Mike Travis Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 1aa840a9f58..3f5bfdc3d94 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6126,8 +6126,6 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) */ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) { - unsigned long flags; - struct rq *rq; int dest_cpu; /* FIXME: Use cpumask_of_node here. */ cpumask_t _nodemask = node_to_cpumask(cpu_to_node(dead_cpu)); @@ -6146,10 +6144,8 @@ again: /* No more Mr. Nice Guy. */ if (dest_cpu >= nr_cpu_ids) { - rq = task_rq_lock(p, &flags); cpuset_cpus_allowed_locked(p, &p->cpus_allowed); dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed); - task_rq_unlock(rq, &flags); /* * Don't tell them about moving exiting tasks or -- cgit v1.2.3 From 98a79d6a50181ca1ecf7400eda01d5dc1bc0dbf0 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 13 Dec 2008 21:19:41 +1030 Subject: cpumask: centralize cpu_online_map and cpu_possible_map Impact: cleanup Each SMP arch defines these themselves. Move them to a central location. Twists: 1) Some archs (m32r, parisc, s390) set possible_map to all 1, so we add a CONFIG_INIT_ALL_POSSIBLE for this rather than break them. 2) mips and sparc32 use '#define cpu_possible_map phys_cpu_present_map'. Those archs simply have phys_cpu_present_map replaced everywhere. 3) Alpha defined cpu_possible_map to cpu_present_map; this is tricky so I just manipulate them both in sync. 4) IA64, cris and m32r have gratuitous 'extern cpumask_t cpu_possible_map' declarations. Signed-off-by: Rusty Russell Reviewed-by: Grant Grundler Tested-by: Tony Luck Acked-by: Ingo Molnar Cc: Mike Travis Cc: ink@jurassic.park.msu.ru Cc: rmk@arm.linux.org.uk Cc: starvik@axis.com Cc: tony.luck@intel.com Cc: takata@linux-m32r.org Cc: ralf@linux-mips.org Cc: grundler@parisc-linux.org Cc: paulus@samba.org Cc: schwidefsky@de.ibm.com Cc: lethal@linux-sh.org Cc: wli@holomorphy.com Cc: davem@davemloft.net Cc: jdike@addtoit.com Cc: mingo@redhat.com --- kernel/cpu.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 8ea32e8d68b..bae131a1211 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -24,19 +24,20 @@ cpumask_t cpu_present_map __read_mostly; EXPORT_SYMBOL(cpu_present_map); -#ifndef CONFIG_SMP - /* * Represents all cpu's that are currently online. */ -cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; +cpumask_t cpu_online_map __read_mostly; EXPORT_SYMBOL(cpu_online_map); +#ifdef CONFIG_INIT_ALL_POSSIBLE cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; +#else cpumask_t cpu_possible_map __read_mostly; +#endif EXPORT_SYMBOL(cpu_possible_map); -#else /* CONFIG_SMP */ - +#ifdef CONFIG_SMP /* Serializes the updates to cpu_online_map, cpu_present_map */ static DEFINE_MUTEX(cpu_add_remove_lock); -- cgit v1.2.3 From 29c0177e6a4ac094302bed54a1d4bbb6b740a9ef Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 13 Dec 2008 21:20:25 +1030 Subject: cpumask: change cpumask_scnprintf, cpumask_parse_user, cpulist_parse, and cpulist_scnprintf to take pointers. Impact: change calling convention of existing cpumask APIs Most cpumask functions started with cpus_: these have been replaced by cpumask_ ones which take struct cpumask pointers as expected.
These four functions don't have good replacement names; fortunately they're rarely used, so we just change them over. Signed-off-by: Rusty Russell Signed-off-by: Mike Travis Acked-by: Ingo Molnar Cc: paulus@samba.org Cc: mingo@redhat.com Cc: tony.luck@intel.com Cc: ralf@linux-mips.org Cc: Greg Kroah-Hartman Cc: cl@linux-foundation.org Cc: srostedt@redhat.com --- kernel/cpuset.c | 4 ++-- kernel/irq/proc.c | 4 ++-- kernel/profile.c | 4 ++-- kernel/sched.c | 4 ++-- kernel/sched_stats.h | 2 +- kernel/taskstats.c | 2 +- kernel/trace/trace.c | 4 ++-- 7 files changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 96c0ba13b8c..39c1a4c1c5a 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -896,7 +896,7 @@ static int update_cpumask(struct cpuset *cs, const char *buf) if (!*buf) { cpus_clear(trialcs.cpus_allowed); } else { - retval = cpulist_parse(buf, trialcs.cpus_allowed); + retval = cpulist_parse(buf, &trialcs.cpus_allowed); if (retval < 0) return retval; @@ -1482,7 +1482,7 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) mask = cs->cpus_allowed; mutex_unlock(&callback_mutex); - return cpulist_scnprintf(page, PAGE_SIZE, mask); + return cpulist_scnprintf(page, PAGE_SIZE, &mask); } static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index d257e7d6a8a..f293349d49d 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -47,7 +47,7 @@ static ssize_t irq_affinity_proc_write(struct file *file, irq_balancing_disabled(irq)) return -EIO; - err = cpumask_parse_user(buffer, count, new_value); + err = cpumask_parse_user(buffer, count, &new_value); if (err) return err; @@ -95,7 +95,7 @@ static ssize_t default_affinity_write(struct file *file, cpumask_t new_value; int err; - err = cpumask_parse_user(buffer, count, new_value); + err = cpumask_parse_user(buffer, count, &new_value); if (err) return err; diff --git a/kernel/profile.c b/kernel/profile.c index dc41827fbfe..7d620dfdde5 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -442,7 +442,7 @@ void profile_tick(int type) static int prof_cpu_mask_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { - int len = cpumask_scnprintf(page, count, *(cpumask_t *)data); + int len = cpumask_scnprintf(page, count, (cpumask_t *)data); if (count - len < 2) return -EINVAL; len += sprintf(page + len, "\n"); @@ -456,7 +456,7 @@ static int prof_cpu_mask_write_proc(struct file *file, unsigned long full_count = count, err; cpumask_t new_value; - err = cpumask_parse_user(buffer, count, new_value); + err = cpumask_parse_user(buffer, count, &new_value); if (err) return err; diff --git a/kernel/sched.c b/kernel/sched.c index e4bb1dd7b30..d2d16d1273b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6666,7 +6666,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, struct sched_group *group = sd->groups; char str[256]; - cpulist_scnprintf(str, sizeof(str), sd->span); + cpulist_scnprintf(str, sizeof(str), &sd->span); cpus_clear(*groupmask); printk(KERN_DEBUG "%*s domain %d: ", level, "", level); @@ -6720,7 +6720,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpus_or(*groupmask, *groupmask, group->cpumask); - cpulist_scnprintf(str, sizeof(str), group->cpumask); + cpulist_scnprintf(str, sizeof(str), &group->cpumask); printk(KERN_CONT " %s", str); group = group->next; diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 
7dbf72a2b02..6beff1e4eea 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -42,7 +42,7 @@ static int show_schedstat(struct seq_file *seq, void *v) for_each_domain(cpu, sd) { enum cpu_idle_type itype; - cpumask_scnprintf(mask_str, mask_len, sd->span); + cpumask_scnprintf(mask_str, mask_len, &sd->span); seq_printf(seq, "domain%d %s", dcount++, mask_str); for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; itype++) { diff --git a/kernel/taskstats.c b/kernel/taskstats.c index bd6be76303c..6d7dc4ec4aa 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -352,7 +352,7 @@ static int parse(struct nlattr *na, cpumask_t *mask) if (!data) return -ENOMEM; nla_strlcpy(data, na, len); - ret = cpulist_parse(data, *mask); + ret = cpulist_parse(data, mask); kfree(data); return ret; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d86e3252f30..d2e75479dc5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2126,7 +2126,7 @@ tracing_cpumask_read(struct file *filp, char __user *ubuf, mutex_lock(&tracing_cpumask_update_lock); - len = cpumask_scnprintf(mask_str, count, tracing_cpumask); + len = cpumask_scnprintf(mask_str, count, &tracing_cpumask); if (count - len < 2) { count = -EINVAL; goto out_err; @@ -2147,7 +2147,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, int err, cpu; mutex_lock(&tracing_cpumask_update_lock); - err = cpumask_parse_user(ubuf, count, tracing_cpumask_new); + err = cpumask_parse_user(ubuf, count, &tracing_cpumask_new); if (err) goto err_unlock; -- cgit v1.2.3 From 0de26520c7cabf36e1de090ea8092f011a6106ce Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 13 Dec 2008 21:20:26 +1030 Subject: cpumask: make irq_set_affinity() take a const struct cpumask Impact: change existing irq_chip API Not much point with gentle transition here: the struct irq_chip's setaffinity method signature needs to change. Fortunately, not widely used code, but hits a few architectures. Note: In irq_select_affinity() I save a temporary in by mangling irq_desc[irq].affinity directly. Ingo, does this break anything? 
(Folded in fix from KOSAKI Motohiro) Signed-off-by: Rusty Russell Signed-off-by: Mike Travis Reviewed-by: Grant Grundler Acked-by: Ingo Molnar Cc: ralf@linux-mips.org Cc: grundler@parisc-linux.org Cc: jeremy@xensource.com Cc: KOSAKI Motohiro --- kernel/irq/chip.c | 2 +- kernel/irq/manage.c | 22 ++++++++++------------ kernel/irq/migration.c | 14 +++++++------- kernel/irq/proc.c | 29 +++++++++++++++++++---------- kernel/time/tick-common.c | 6 +++--- 5 files changed, 40 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 10b5092e9bf..58d8e31daa4 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -45,7 +45,7 @@ void dynamic_irq_init(unsigned int irq) desc->irq_count = 0; desc->irqs_unhandled = 0; #ifdef CONFIG_SMP - cpus_setall(desc->affinity); + cpumask_setall(&desc->affinity); #endif spin_unlock_irqrestore(&desc->lock, flags); } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 801addda3c4..10ad2f87ed9 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -79,7 +79,7 @@ int irq_can_set_affinity(unsigned int irq) * @cpumask: cpumask * */ -int irq_set_affinity(unsigned int irq, cpumask_t cpumask) +int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; @@ -91,14 +91,14 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask) #ifdef CONFIG_GENERIC_PENDING_IRQ if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) { - desc->affinity = cpumask; + cpumask_copy(&desc->affinity, cpumask); desc->chip->set_affinity(irq, cpumask); } else { desc->status |= IRQ_MOVE_PENDING; - desc->pending_mask = cpumask; + cpumask_copy(&desc->pending_mask, cpumask); } #else - desc->affinity = cpumask; + cpumask_copy(&desc->affinity, cpumask); desc->chip->set_affinity(irq, cpumask); #endif desc->status |= IRQ_AFFINITY_SET; @@ -112,26 +112,24 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask) */ int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc) { - cpumask_t mask; - if (!irq_can_set_affinity(irq)) return 0; - cpus_and(mask, cpu_online_map, irq_default_affinity); - /* * Preserve an userspace affinity setup, but make sure that * one of the targets is online. */ if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { - if (cpus_intersects(desc->affinity, cpu_online_map)) - mask = desc->affinity; + if (cpumask_any_and(&desc->affinity, cpu_online_mask) + < nr_cpu_ids) + goto set_affinity; else desc->status &= ~IRQ_AFFINITY_SET; } - desc->affinity = mask; - desc->chip->set_affinity(irq, mask); + cpumask_and(&desc->affinity, cpu_online_mask, &irq_default_affinity); +set_affinity: + desc->chip->set_affinity(irq, &desc->affinity); return 0; } diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 9db681d9581..bd72329e630 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -4,7 +4,6 @@ void move_masked_irq(int irq) { struct irq_desc *desc = irq_to_desc(irq); - cpumask_t tmp; if (likely(!(desc->status & IRQ_MOVE_PENDING))) return; @@ -19,7 +18,7 @@ void move_masked_irq(int irq) desc->status &= ~IRQ_MOVE_PENDING; - if (unlikely(cpus_empty(desc->pending_mask))) + if (unlikely(cpumask_empty(&desc->pending_mask))) return; if (!desc->chip->set_affinity) @@ -27,8 +26,6 @@ void move_masked_irq(int irq) assert_spin_locked(&desc->lock); - cpus_and(tmp, desc->pending_mask, cpu_online_map); - /* * If there was a valid mask to work with, please * do the disable, re-program, enable sequence. 
@@ -41,10 +38,13 @@ void move_masked_irq(int irq) * For correct operation this depends on the caller * masking the irqs. */ - if (likely(!cpus_empty(tmp))) { - desc->chip->set_affinity(irq,tmp); + if (likely(cpumask_any_and(&desc->pending_mask, cpu_online_mask) + < nr_cpu_ids)) { + cpumask_and(&desc->affinity, + &desc->pending_mask, cpu_online_mask); + desc->chip->set_affinity(irq, &desc->affinity); } - cpus_clear(desc->pending_mask); + cpumask_clear(&desc->pending_mask); } void move_native_irq(int irq) diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index f293349d49d..8e91c976252 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -40,33 +40,42 @@ static ssize_t irq_affinity_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *pos) { unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data; - cpumask_t new_value; + cpumask_var_t new_value; int err; if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity || irq_balancing_disabled(irq)) return -EIO; - err = cpumask_parse_user(buffer, count, &new_value); + if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) + return -ENOMEM; + + err = cpumask_parse_user(buffer, count, new_value); if (err) - return err; + goto free_cpumask; - if (!is_affinity_mask_valid(new_value)) - return -EINVAL; + if (!is_affinity_mask_valid(*new_value)) { + err = -EINVAL; + goto free_cpumask; + } /* * Do not allow disabling IRQs completely - it's a too easy * way to make the system unusable accidentally :-) At least * one online CPU still has to be targeted. */ - if (!cpus_intersects(new_value, cpu_online_map)) + if (!cpumask_intersects(new_value, cpu_online_mask)) { /* Special case for empty set - allow the architecture code to set default SMP affinity. */ - return irq_select_affinity_usr(irq) ? -EINVAL : count; - - irq_set_affinity(irq, new_value); + err = irq_select_affinity_usr(irq) ? -EINVAL : count; + } else { + irq_set_affinity(irq, new_value); + err = count; + } - return count; +free_cpumask: + free_cpumask_var(new_value); + return err; } static int irq_affinity_proc_open(struct inode *inode, struct file *file) diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index df12434b43c..ab65d217583 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -136,7 +136,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) */ static void tick_setup_device(struct tick_device *td, struct clock_event_device *newdev, int cpu, - const cpumask_t *cpumask) + const struct cpumask *cpumask) { ktime_t next_event; void (*handler)(struct clock_event_device *) = NULL; @@ -171,8 +171,8 @@ static void tick_setup_device(struct tick_device *td, * When the device is not per cpu, pin the interrupt to the * current cpu: */ - if (!cpus_equal(newdev->cpumask, *cpumask)) - irq_set_affinity(newdev->irq, *cpumask); + if (!cpumask_equal(&newdev->cpumask, cpumask)) + irq_set_affinity(newdev->irq, cpumask); /* * When global broadcasting is active, check if the current -- cgit v1.2.3 From 320ab2b0b1e08e3805a3e1084a2f0eb1938d5d67 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 13 Dec 2008 21:20:26 +1030 Subject: cpumask: convert struct clock_event_device to cpumask pointers. Impact: change calling convention of existing clock_event APIs struct clock_event_device's cpumask field gets changed to take a pointer, as does the ->broadcast function. Another single-patch change. For safety, we BUG_ON() in clockevents_register_device() if it's not set.
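As an illustration of the new convention (a hedged sketch, not part of this patch: the driver and variable names here are hypothetical), a per-cpu tick driver now hands clockevents a pointer, typically from cpumask_of(), instead of filling in a cpumask_t by value:

	static DEFINE_PER_CPU(struct clock_event_device, example_tick_dev);

	static void example_tick_device_init(int cpu)
	{
		struct clock_event_device *dev = &per_cpu(example_tick_dev, cpu);

		/* pointer assignment replaces the old cpumask_t copy */
		dev->cpumask = cpumask_of(cpu);
		/* the new BUG_ON(!dev->cpumask) below catches drivers that forget this */
		clockevents_register_device(dev);
	}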
Signed-off-by: Rusty Russell Cc: Ingo Molnar --- kernel/time/clockevents.c | 2 ++ kernel/time/tick-broadcast.c | 2 +- kernel/time/tick-common.c | 8 ++++---- 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index f8d968063ce..ea2f48af83c 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -166,6 +166,8 @@ static void clockevents_notify_released(void) void clockevents_register_device(struct clock_event_device *dev) { BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); + BUG_ON(!dev->cpumask); + /* * A nsec2cyc multiplicator of 0 is invalid and we'd crash * on it, so fix it up and emit a warning: diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index f98a1b7b16e..9590af2327b 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -150,7 +150,7 @@ static void tick_do_broadcast(cpumask_t mask) */ cpu = first_cpu(mask); td = &per_cpu(tick_cpu_device, cpu); - td->evtdev->broadcast(mask); + td->evtdev->broadcast(&mask); } } diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index ab65d217583..f8372be7412 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -171,7 +171,7 @@ static void tick_setup_device(struct tick_device *td, * When the device is not per cpu, pin the interrupt to the * current cpu: */ - if (!cpumask_equal(&newdev->cpumask, cpumask)) + if (!cpumask_equal(newdev->cpumask, cpumask)) irq_set_affinity(newdev->irq, cpumask); /* @@ -202,14 +202,14 @@ static int tick_check_new_device(struct clock_event_device *newdev) spin_lock_irqsave(&tick_device_lock, flags); cpu = smp_processor_id(); - if (!cpu_isset(cpu, newdev->cpumask)) + if (!cpumask_test_cpu(cpu, newdev->cpumask)) goto out_bc; td = &per_cpu(tick_cpu_device, cpu); curdev = td->evtdev; /* cpu local device ? */ - if (!cpus_equal(newdev->cpumask, cpumask_of_cpu(cpu))) { + if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu))) { /* * If the cpu affinity of the device interrupt can not @@ -222,7 +222,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) * If we have a cpu local device already, do not replace it * by a non cpu local device */ - if (curdev && cpus_equal(curdev->cpumask, cpumask_of_cpu(cpu))) + if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu))) goto out_bc; } -- cgit v1.2.3 From afb8a9b70b86866a60e08b2956ae4e1406390336 Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Thu, 18 Dec 2008 23:26:09 +0530 Subject: sched: framework for sched_mc/smt_power_savings=N Impact: extend range of /sys/devices/system/cpu/sched_mc_power_savings Currently the sched_mc/smt_power_savings variable is a boolean, which either enables or disables topology based power savings. This patch extends the behaviour of the variable from boolean to multivalued, such that, based on the value, we decide how aggressively to perform power-savings balancing at the appropriate sched domain in the topology. Variable levels of the power-savings tunable let the end user match the required power-savings vs. performance trade-off to the system configuration and workload. This version makes the sched_mc_power_savings global variable take more values (0, 1, 2). Later versions can have a single tunable called sched_power_savings instead of sched_{mc,smt}_power_savings.
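For example, assuming the sysfs file named in the Impact line above, a user could now request the most aggressive mode with:

	echo 2 > /sys/devices/system/cpu/sched_mc_power_savings

where 0 and 1 keep roughly the old boolean off/on semantics and 2 asks for the most aggressive power-savings balancing (the precise meaning of each level is defined by the follow-up patches).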
Signed-off-by: Gautham R Shenoy Signed-off-by: Vaidyanathan Srinivasan Acked-by: Balbir Singh Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index b309027bf9e..56b285cd535 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7906,14 +7906,25 @@ int arch_reinit_sched_domains(void) static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) { int ret; + unsigned int level = 0; - if (buf[0] != '0' && buf[0] != '1') + if (sscanf(buf, "%u", &level) != 1) + return -EINVAL; + + /* + * level is always be positive so don't check for + * level < POWERSAVINGS_BALANCE_NONE which is 0 + * What happens on 0 or 1 byte write, + * need to check for count as well? + */ + + if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS) return -EINVAL; if (smt) - sched_smt_power_savings = (buf[0] == '1'); + sched_smt_power_savings = level; else - sched_mc_power_savings = (buf[0] == '1'); + sched_mc_power_savings = level; ret = arch_reinit_sched_domains(); -- cgit v1.2.3 From d5679bd11916eba5c8ee9033003e1a5ce56ece9a Mon Sep 17 00:00:00 2001 From: Vaidyanathan Srinivasan Date: Thu, 18 Dec 2008 23:26:16 +0530 Subject: sched: favour lower logical cpu number for sched_mc balance Impact: change load-balancing direction to match that of irqbalanced When two groups have identical load, prefer to move load to the lower logical cpu number rather than, as the present logic does, to the higher one. find_busiest_group() tries to look for a group_leader that has spare capacity to take more tasks and free up an appropriate least-loaded group. Currently, in case of a tie with equal load, the group with the higher logical number is favoured. This conflicts with the user-space irqbalance daemon, which will move interrupts to the lower logical number if system utilisation is very low. Signed-off-by: Vaidyanathan Srinivasan Acked-by: Balbir Singh Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 56b285cd535..94b9d11e331 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3241,7 +3241,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, */ if ((sum_nr_running < min_nr_running) || (sum_nr_running == min_nr_running && - cpumask_first(sched_group_cpus(group)) < + cpumask_first(sched_group_cpus(group)) > cpumask_first(sched_group_cpus(group_min)))) { group_min = group; min_nr_running = sum_nr_running; @@ -3257,7 +3257,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (sum_nr_running <= group_capacity - 1) { if (sum_nr_running > leader_nr_running || (sum_nr_running == leader_nr_running && - cpumask_first(sched_group_cpus(group)) > + cpumask_first(sched_group_cpus(group)) < cpumask_first(sched_group_cpus(group_leader)))) { group_leader = group; leader_nr_running = sum_nr_running; -- cgit v1.2.3 From 7a09b1a27b1e5a4957e4af9951420fea02c44fba Mon Sep 17 00:00:00 2001 From: Vaidyanathan Srinivasan Date: Thu, 18 Dec 2008 23:26:22 +0530 Subject: sched: nominate preferred wakeup cpu Impact: extend load-balancing code (no change in behavior yet) When system utilisation is low and more cpus are idle, a process waking up from sleep should prefer to wake an idle cpu in a semi-idle cpu package (multi-core package) rather than a completely idle cpu package, which would waste power.
Use the sched_mc balance logic in find_busiest_group() to nominate a preferred wakeup cpu. This info could be stored in an appropriate sched_domain, but updating it in all copies of sched_domain is not practical. Hence it is stored in the root_domain struct, of which there is one copy per partitioned sched domain, and which can be accessed from each cpu's runqueue. Signed-off-by: Vaidyanathan Srinivasan Acked-by: Balbir Singh Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 94b9d11e331..c1b8b3031eb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -509,6 +509,14 @@ struct root_domain { #ifdef CONFIG_SMP struct cpupri cpupri; #endif +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) + /* + * Preferred wake up cpu nominated by sched_mc balance that will be + * used when most cpus are idle in the system indicating overall very + * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2) + */ + unsigned int sched_mc_preferred_wakeup_cpu; +#endif }; /* @@ -3384,6 +3392,10 @@ out_balanced: if (this == group_leader && group_leader != group_min) { *imbalance = min_load_per_task; + if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { + cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = + first_cpu(group_leader->cpumask); + } return group_min; } #endif -- cgit v1.2.3 From 7eb52dfa70dbf5232b5b83ec4357e6bebaa8fde8 Mon Sep 17 00:00:00 2001 From: Vaidyanathan Srinivasan Date: Thu, 18 Dec 2008 23:26:29 +0530 Subject: sched: bias task wakeups to preferred semi-idle packages Impact: tweak task wakeup to save power more aggressively Preferred wakeup cpu (from a semi-idle package) has been nominated in find_busiest_group() in the previous patch. Use this information, via sched_mc_preferred_wakeup_cpu, in wake_idle() to bias task wakeups if the following conditions are satisfied: - The present cpu that is trying to wake up the process is idle, and waking the target process on this cpu would potentially wake up a completely idle package - The previous cpu on which the target process ran is also idle, and hence selecting the previous cpu might wake up a semi-idle cpu package - The task being woken up is allowed to run on the nominated cpu (cpu affinity and restrictions) Basically, if both the current cpu and the previous cpu on which the task ran are idle, select the nominated cpu from a semi-idle cpu package for running the new task that is waking up. Cache hotness is considered, since the actual biasing happens in wake_idle() only if the application is cache cold. This technique will effectively consolidate short-running, bursty jobs in a mostly idle system. Wakeup biasing for power savings gets automatically disabled if system utilisation increases, because the probability of finding both this_cpu and prev_cpu idle decreases.
Signed-off-by: Vaidyanathan Srinivasan Acked-by: Balbir Singh Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 08ffffd4a41..36b5e34fa99 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1026,6 +1026,24 @@ static int wake_idle(int cpu, struct task_struct *p) { struct sched_domain *sd; int i; + unsigned int chosen_wakeup_cpu; + int this_cpu; + + /* + * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu + * are idle and this is not a kernel thread and this task's affinity + * allows it to be moved to preferred cpu, then just move! + */ + + this_cpu = smp_processor_id(); + chosen_wakeup_cpu = + cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu; + + if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP && + idle_cpu(cpu) && idle_cpu(this_cpu) && + p->mm && !(p->flags & PF_KTHREAD) && + cpu_isset(chosen_wakeup_cpu, p->cpus_allowed)) + return chosen_wakeup_cpu; /* * If it is idle, then it is the best cpu to run this task. -- cgit v1.2.3 From ad273b32e482cdef306eac32b28d97f513a022f4 Mon Sep 17 00:00:00 2001 From: Vaidyanathan Srinivasan Date: Thu, 18 Dec 2008 23:26:36 +0530 Subject: sched: activate active load balancing in new idle cpus Impact: tweak task balancing to save power more aggressively Active load balancing is a process by which the migration thread is woken up on the target CPU in order to pull the currently running task on another package into this newly idle package. This method is already in use with normal load_balance(); this patch introduces it for newly idle cpus when sched_mc is set to POWERSAVINGS_BALANCE_WAKEUP. This logic provides effective consolidation of short-running daemon jobs in an almost idle system. The side effect of this patch may be ping-ponging of tasks if the system is moderately utilised. It may be necessary to adjust the number of iterations before triggering. Signed-off-by: Vaidyanathan Srinivasan Acked-by: Balbir Singh Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c1b8b3031eb..8fc0d5aa43b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3670,10 +3670,64 @@ redo: } if (!ld_moved) { + int active_balance; + schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return -1; + + if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) + return -1; + + if (sd->nr_balance_failed++ < 2) + return -1; + + /* + * The only task running in a non-idle cpu can be moved to this + * cpu in an attempt to completely freeup the other CPU + * package. The same method used to move task in load_balance() + * have been extended for load_balance_newidle() to speedup + * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2) + * + * The package power saving logic comes from + * find_busiest_group(). If there are no imbalance, then + * f_b_g() will return NULL. However when sched_mc={1,2} then + * f_b_g() will select a group from which a running task may be + * pulled to this cpu in order to make the other package idle. + * If there is no opportunity to make a package idle and if + * there are no imbalance, then f_b_g() will return NULL and no + * action will be taken in load_balance_newidle().
+ * + * Under normal task pull operation due to imbalance, there + * will be more than one task in the source run queue and + * move_tasks() will succeed. ld_moved will be true and this + * active balance code will not be triggered. + */ + + /* Lock busiest in correct order while this_rq is held */ + double_lock_balance(this_rq, busiest); + + /* + * don't kick the migration_thread, if the curr + * task on busiest cpu can't be moved to this_cpu + */ + if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { + double_unlock_balance(this_rq, busiest); + all_pinned = 1; + return ld_moved; + } + + if (!busiest->active_balance) { + busiest->active_balance = 1; + busiest->push_cpu = this_cpu; + active_balance = 1; + } + + double_unlock_balance(this_rq, busiest); + if (active_balance) + wake_up_process(busiest->migration_thread); + } else sd->nr_balance_failed = 0; -- cgit v1.2.3 From 9924da434a13668fceb208d56dbdf86d166862cc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 19 Dec 2008 00:53:40 +0100 Subject: sched: fix warning in kernel/sched.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: fix cpumask conversion bug this warning: kernel/sched.c: In function ‘find_busiest_group’: kernel/sched.c:3429: warning: passing argument 1 of ‘__first_cpu’ from incompatible pointer type shows that we forgot to convert a new patch to the new cpumask APIs. Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 8fc0d5aa43b..ae5ca3f9e77 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3394,7 +3394,7 @@ out_balanced: *imbalance = min_load_per_task; if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = - first_cpu(group_leader->cpumask); + cpumask_first(sched_group_cpus(group_leader)); } return group_min; } -- cgit v1.2.3 From 3fe0313e6ec572e6bb3f9d247316a834336db4be Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 26 Oct 2008 20:50:26 +0100 Subject: Hibernate: Call platform_begin before swsusp_shrink_memory Call platform_begin() before swsusp_shrink_memory() so that we can always allocate enough memory to save the ACPI NVS region from platform_begin(). Signed-off-by: Zhang Rui Acked-by: Nigel Cunningham Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki Signed-off-by: Len Brown --- kernel/power/disk.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index c9d74083746..096fe4899ea 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -259,12 +259,12 @@ int hibernation_snapshot(int platform_mode) { int error, ftrace_save; - /* Free memory before shutting down devices. */ - error = swsusp_shrink_memory(); + error = platform_begin(platform_mode); if (error) return error; - error = platform_begin(platform_mode); + /* Free memory before shutting down devices. */ + error = swsusp_shrink_memory(); if (error) goto Close; -- cgit v1.2.3 From 3f4b0ef7f2899c91b1d6958779f084b44dd59d32 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 26 Oct 2008 20:52:15 +0100 Subject: ACPI hibernate: Add a mechanism to save/restore ACPI NVS memory According to the ACPI Specification 3.0b, Section 15.3.2, "OSPM will call the _PTS control method some time before entering a sleeping state, to allow the platform's AML code to update this memory image before entering the sleeping state. 
After the system awakes from an S4 state, OSPM will restore this memory area and call the _WAK control method to enable the BIOS to reclaim its memory image." For this reason, implement a mechanism allowing us to save the NVS memory during hibernation and to restore it during the subsequent resume. Based on a patch by Zhang Rui. Signed-off-by: Rafael J. Wysocki Acked-by: Nigel Cunningham Cc: Zhang Rui Signed-off-by: Len Brown --- kernel/power/swsusp.c | 122 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 122 insertions(+) (limited to 'kernel') diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 023ff2a31d8..a92c9145155 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -262,3 +262,125 @@ int swsusp_shrink_memory(void) return 0; } + +/* + * Platforms, like ACPI, may want us to save some memory used by them during + * hibernation and to restore the contents of this memory during the subsequent + * resume. The code below implements a mechanism allowing us to do that. + */ + +struct nvs_page { + unsigned long phys_start; + unsigned int size; + void *kaddr; + void *data; + struct list_head node; +}; + +static LIST_HEAD(nvs_list); + +/** + * hibernate_nvs_register - register platform NVS memory region to save + * @start - physical address of the region + * @size - size of the region + * + * The NVS region need not be page-aligned (both ends) and we arrange + * things so that the data from page-aligned addresses in this region will + * be copied into separate RAM pages. + */ +int hibernate_nvs_register(unsigned long start, unsigned long size) +{ + struct nvs_page *entry, *next; + + while (size > 0) { + unsigned int nr_bytes; + + entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL); + if (!entry) + goto Error; + + list_add_tail(&entry->node, &nvs_list); + entry->phys_start = start; + nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK); + entry->size = (size < nr_bytes) ? size : nr_bytes; + + start += entry->size; + size -= entry->size; + } + return 0; + + Error: + list_for_each_entry_safe(entry, next, &nvs_list, node) { + list_del(&entry->node); + kfree(entry); + } + return -ENOMEM; +} + +/** + * hibernate_nvs_free - free data pages allocated for saving NVS regions + */ +void hibernate_nvs_free(void) +{ + struct nvs_page *entry; + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) { + free_page((unsigned long)entry->data); + entry->data = NULL; + if (entry->kaddr) { + iounmap(entry->kaddr); + entry->kaddr = NULL; + } + } +} + +/** + * hibernate_nvs_alloc - allocate memory necessary for saving NVS regions + */ +int hibernate_nvs_alloc(void) +{ + struct nvs_page *entry; + + list_for_each_entry(entry, &nvs_list, node) { + entry->data = (void *)__get_free_page(GFP_KERNEL); + if (!entry->data) { + hibernate_nvs_free(); + return -ENOMEM; + } + } + return 0; +} + +/** + * hibernate_nvs_save - save NVS memory regions + */ +void hibernate_nvs_save(void) +{ + struct nvs_page *entry; + + printk(KERN_INFO "PM: Saving platform NVS memory\n"); + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) { + entry->kaddr = ioremap(entry->phys_start, entry->size); + memcpy(entry->data, entry->kaddr, entry->size); + } +} + +/** + * hibernate_nvs_restore - restore NVS memory regions + * + * This function is going to be called with interrupts disabled, so it + * cannot iounmap the virtual addresses used to access the NVS region. 
+ */ +void hibernate_nvs_restore(void) +{ + struct nvs_page *entry; + + printk(KERN_INFO "PM: Restoring platform NVS memory\n"); + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) + memcpy(entry->kaddr, entry->data, entry->size); +} -- cgit v1.2.3 From 69643279a88dea000ac2f858091d0e365f778245 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 11 Nov 2008 21:32:44 +0100 Subject: Hibernate: Do not oops on resume if image data are incorrect During resume from hibernation using the userland interface, image data are passed from the user space process to the kernel. These data need not be valid, but currently the kernel sometimes oopses if it gets invalid image data, which is wrong. Make the kernel return error codes to user space in such cases. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Len Brown --- kernel/power/snapshot.c | 43 ++++++++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 5d2ab836e99..955c8cc9183 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -519,6 +519,14 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) return test_bit(bit, addr); } +static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn) +{ + void *addr; + unsigned int bit; + + return !memory_bm_find_bit(bm, pfn, &addr, &bit); +} + /** * memory_bm_next_pfn - find the pfn that corresponds to the next set bit * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is @@ -1459,9 +1467,7 @@ load_header(struct swsusp_info *info) * unpack_orig_pfns - for each element of @buf[] (1 page at a time) set * the corresponding bit in the memory bitmap @bm */ - -static inline void -unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) +static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) { int j; @@ -1469,8 +1475,13 @@ unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) if (unlikely(buf[j] == BM_END_OF_MAP)) break; - memory_bm_set_bit(bm, buf[j]); + if (memory_bm_pfn_present(bm, buf[j])) + memory_bm_set_bit(bm, buf[j]); + else + return -EFAULT; } + + return 0; } /* List of "safe" pages that may be used to store data loaded from the suspend @@ -1608,7 +1619,7 @@ get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) pbe = chain_alloc(ca, sizeof(struct highmem_pbe)); if (!pbe) { swsusp_free(); - return NULL; + return ERR_PTR(-ENOMEM); } pbe->orig_page = page; if (safe_highmem_pages > 0) { @@ -1677,7 +1688,7 @@ prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) static inline void * get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) { - return NULL; + return ERR_PTR(-EINVAL); } static inline void copy_last_highmem_page(void) {} @@ -1788,8 +1799,13 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) { struct pbe *pbe; - struct page *page = pfn_to_page(memory_bm_next_pfn(bm)); + struct page *page; + unsigned long pfn = memory_bm_next_pfn(bm); + if (pfn == BM_END_OF_MAP) + return ERR_PTR(-EFAULT); + + page = pfn_to_page(pfn); if (PageHighMem(page)) return get_highmem_page_buffer(page, ca); @@ -1805,7 +1821,7 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) pbe = chain_alloc(ca, sizeof(struct pbe)); if (!pbe) { swsusp_free(); - return NULL; + return
ERR_PTR(-ENOMEM); } pbe->orig_address = page_address(page); pbe->address = safe_pages_list; @@ -1868,7 +1884,10 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) return error; } else if (handle->prev <= nr_meta_pages) { - unpack_orig_pfns(buffer, &copy_bm); + error = unpack_orig_pfns(buffer, &copy_bm); + if (error) + return error; + if (handle->prev == nr_meta_pages) { error = prepare_image(&orig_bm, &copy_bm); if (error) @@ -1879,12 +1898,14 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) restore_pblist = NULL; handle->buffer = get_buffer(&orig_bm, &ca); handle->sync_read = 0; - if (!handle->buffer) - return -ENOMEM; + if (IS_ERR(handle->buffer)) + return PTR_ERR(handle->buffer); } } else { copy_last_highmem_page(); handle->buffer = get_buffer(&orig_bm, &ca); + if (IS_ERR(handle->buffer)) + return PTR_ERR(handle->buffer); if (handle->buffer != buffer) handle->sync_read = 0; } -- cgit v1.2.3 From 846705deb059c352cc0e5806d5964f815b8c6d98 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 26 Nov 2008 18:00:24 -0500 Subject: Hibernate: Take overlapping zones into account (rev. 2) It has been requested to make hibernation work with memory hotplugging enabled and for this purpose the hibernation code has to be reworked to take the possible overlapping of zones into account. Thus, rework the hibernation memory bitmaps code to prevent duplication of PFNs from occurring and add checks to make sure that one page frame will not be marked as saveable more than once. Additionally, use list.h lists instead of open-coded lists to implement the memory bitmaps. Signed-off-by: Rafael J. Wysocki Signed-off-by: Len Brown --- kernel/power/snapshot.c | 325 +++++++++++++++++++++++++----------------------- 1 file changed, 166 insertions(+), 159 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 955c8cc9183..ec9f153b2fc 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -25,6 +25,7 @@ #include #include #include +#include <linux/list.h> #include #include @@ -192,12 +193,6 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) return ret; } -static void chain_free(struct chain_allocator *ca, int clear_page_nosave) -{ - free_list_of_pages(ca->chain, clear_page_nosave); - memset(ca, 0, sizeof(struct chain_allocator)); -} - /** * Data types related to memory bitmaps.
* @@ -233,7 +228,7 @@ static void chain_free(struct chain_allocator *ca, int clear_page_nosave) #define BM_BITS_PER_BLOCK (PAGE_SIZE << 3) struct bm_block { - struct bm_block *next; /* next element of the list */ + struct list_head hook; /* hook into a list of bitmap blocks */ unsigned long start_pfn; /* pfn represented by the first bit */ unsigned long end_pfn; /* pfn represented by the last bit plus 1 */ unsigned long *data; /* bitmap representing pages */ @@ -244,24 +239,15 @@ static inline unsigned long bm_block_bits(struct bm_block *bb) return bb->end_pfn - bb->start_pfn; } -struct zone_bitmap { - struct zone_bitmap *next; /* next element of the list */ - unsigned long start_pfn; /* minimal pfn in this zone */ - unsigned long end_pfn; /* maximal pfn in this zone plus 1 */ - struct bm_block *bm_blocks; /* list of bitmap blocks */ - struct bm_block *cur_block; /* recently used bitmap block */ -}; - /* strcut bm_position is used for browsing memory bitmaps */ struct bm_position { - struct zone_bitmap *zone_bm; struct bm_block *block; int bit; }; struct memory_bitmap { - struct zone_bitmap *zone_bm_list; /* list of zone bitmaps */ + struct list_head blocks; /* list of bitmap blocks */ struct linked_page *p_list; /* list of pages used to store zone * bitmap objects and bitmap block * objects @@ -273,11 +259,7 @@ struct memory_bitmap { static void memory_bm_position_reset(struct memory_bitmap *bm) { - struct zone_bitmap *zone_bm; - - zone_bm = bm->zone_bm_list; - bm->cur.zone_bm = zone_bm; - bm->cur.block = zone_bm->bm_blocks; + bm->cur.block = list_entry(bm->blocks.next, struct bm_block, hook); bm->cur.bit = 0; } @@ -285,151 +267,184 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); /** * create_bm_block_list - create a list of block bitmap objects + * @nr_blocks - number of blocks to allocate + * @list - list to put the allocated blocks into + * @ca - chain allocator to be used for allocating memory */ - -static inline struct bm_block * -create_bm_block_list(unsigned int nr_blocks, struct chain_allocator *ca) +static int create_bm_block_list(unsigned long pages, + struct list_head *list, + struct chain_allocator *ca) { - struct bm_block *bblist = NULL; + unsigned int nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK); while (nr_blocks-- > 0) { struct bm_block *bb; bb = chain_alloc(ca, sizeof(struct bm_block)); if (!bb) - return NULL; - - bb->next = bblist; - bblist = bb; + return -ENOMEM; + list_add(&bb->hook, list); } - return bblist; + + return 0; } +struct mem_extent { + struct list_head hook; + unsigned long start; + unsigned long end; +}; + /** - * create_zone_bm_list - create a list of zone bitmap objects + * free_mem_extents - free a list of memory extents + * @list - list of extents to empty */ +static void free_mem_extents(struct list_head *list) +{ + struct mem_extent *ext, *aux; + + list_for_each_entry_safe(ext, aux, list, hook) { + list_del(&ext->hook); + kfree(ext); + } +} -static inline struct zone_bitmap * -create_zone_bm_list(unsigned int nr_zones, struct chain_allocator *ca) +/** + * create_mem_extents - create a list of memory extents representing + * contiguous ranges of PFNs + * @list - list to put the extents into + * @gfp_mask - mask to use for memory allocations + */ +static int create_mem_extents(struct list_head *list, gfp_t gfp_mask) { - struct zone_bitmap *zbmlist = NULL; + struct zone *zone; - while (nr_zones-- > 0) { - struct zone_bitmap *zbm; + INIT_LIST_HEAD(list); - zbm = chain_alloc(ca, sizeof(struct zone_bitmap)); - if (!zbm) 
- return NULL; + for_each_zone(zone) { + unsigned long zone_start, zone_end; + struct mem_extent *ext, *cur, *aux; + + if (!populated_zone(zone)) + continue; + + zone_start = zone->zone_start_pfn; + zone_end = zone->zone_start_pfn + zone->spanned_pages; - zbm->next = zbmlist; - zbmlist = zbm; + list_for_each_entry(ext, list, hook) + if (zone_start <= ext->end) + break; + + if (&ext->hook == list || zone_end < ext->start) { + /* New extent is necessary */ + struct mem_extent *new_ext; + + new_ext = kzalloc(sizeof(struct mem_extent), gfp_mask); + if (!new_ext) { + free_mem_extents(list); + return -ENOMEM; + } + new_ext->start = zone_start; + new_ext->end = zone_end; + list_add_tail(&new_ext->hook, &ext->hook); + continue; + } + + /* Merge this zone's range of PFNs with the existing one */ + if (zone_start < ext->start) + ext->start = zone_start; + if (zone_end > ext->end) + ext->end = zone_end; + + /* More merging may be possible */ + cur = ext; + list_for_each_entry_safe_continue(cur, aux, list, hook) { + if (zone_end < cur->start) + break; + if (zone_end < cur->end) + ext->end = cur->end; + list_del(&cur->hook); + kfree(cur); + } } - return zbmlist; + + return 0; } /** * memory_bm_create - allocate memory for a memory bitmap */ - static int memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) { struct chain_allocator ca; - struct zone *zone; - struct zone_bitmap *zone_bm; - struct bm_block *bb; - unsigned int nr; + struct list_head mem_extents; + struct mem_extent *ext; + int error; chain_init(&ca, gfp_mask, safe_needed); + INIT_LIST_HEAD(&bm->blocks); - /* Compute the number of zones */ - nr = 0; - for_each_zone(zone) - if (populated_zone(zone)) - nr++; - - /* Allocate the list of zones bitmap objects */ - zone_bm = create_zone_bm_list(nr, &ca); - bm->zone_bm_list = zone_bm; - if (!zone_bm) { - chain_free(&ca, PG_UNSAFE_CLEAR); - return -ENOMEM; - } - - /* Initialize the zone bitmap objects */ - for_each_zone(zone) { - unsigned long pfn; + error = create_mem_extents(&mem_extents, gfp_mask); + if (error) + return error; - if (!populated_zone(zone)) - continue; + list_for_each_entry(ext, &mem_extents, hook) { + struct bm_block *bb; + unsigned long pfn = ext->start; + unsigned long pages = ext->end - ext->start; - zone_bm->start_pfn = zone->zone_start_pfn; - zone_bm->end_pfn = zone->zone_start_pfn + zone->spanned_pages; - /* Allocate the list of bitmap block objects */ - nr = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); - bb = create_bm_block_list(nr, &ca); - zone_bm->bm_blocks = bb; - zone_bm->cur_block = bb; - if (!bb) - goto Free; + bb = list_entry(bm->blocks.prev, struct bm_block, hook); - nr = zone->spanned_pages; - pfn = zone->zone_start_pfn; - /* Initialize the bitmap block objects */ - while (bb) { - unsigned long *ptr; + error = create_bm_block_list(pages, bm->blocks.prev, &ca); + if (error) + goto Error; - ptr = get_image_page(gfp_mask, safe_needed); - bb->data = ptr; - if (!ptr) - goto Free; + list_for_each_entry_continue(bb, &bm->blocks, hook) { + bb->data = get_image_page(gfp_mask, safe_needed); + if (!bb->data) { + error = -ENOMEM; + goto Error; + } bb->start_pfn = pfn; - if (nr >= BM_BITS_PER_BLOCK) { + if (pages >= BM_BITS_PER_BLOCK) { pfn += BM_BITS_PER_BLOCK; - nr -= BM_BITS_PER_BLOCK; + pages -= BM_BITS_PER_BLOCK; } else { /* This is executed only once in the loop */ - pfn += nr; + pfn += pages; } bb->end_pfn = pfn; - bb = bb->next; } - zone_bm = zone_bm->next; } + bm->p_list = ca.chain; memory_bm_position_reset(bm); - return 0; + Exit: + 
free_mem_extents(&mem_extents); + return error; - Free: + Error: bm->p_list = ca.chain; memory_bm_free(bm, PG_UNSAFE_CLEAR); - return -ENOMEM; + goto Exit; } /** * memory_bm_free - free memory occupied by the memory bitmap @bm */ - static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) { - struct zone_bitmap *zone_bm; + struct bm_block *bb; - /* Free the list of bit blocks for each zone_bitmap object */ - zone_bm = bm->zone_bm_list; - while (zone_bm) { - struct bm_block *bb; + list_for_each_entry(bb, &bm->blocks, hook) + if (bb->data) + free_image_page(bb->data, clear_nosave_free); - bb = zone_bm->bm_blocks; - while (bb) { - if (bb->data) - free_image_page(bb->data, clear_nosave_free); - bb = bb->next; - } - zone_bm = zone_bm->next; - } free_list_of_pages(bm->p_list, clear_nosave_free); - bm->zone_bm_list = NULL; + + INIT_LIST_HEAD(&bm->blocks); } /** @@ -437,38 +452,33 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) * to given pfn. The cur_zone_bm member of @bm and the cur_block member * of @bm->cur_zone_bm are updated. */ - static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, void **addr, unsigned int *bit_nr) { - struct zone_bitmap *zone_bm; struct bm_block *bb; - /* Check if the pfn is from the current zone */ - zone_bm = bm->cur.zone_bm; - if (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) { - zone_bm = bm->zone_bm_list; - /* We don't assume that the zones are sorted by pfns */ - while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) { - zone_bm = zone_bm->next; - - if (!zone_bm) - return -EFAULT; - } - bm->cur.zone_bm = zone_bm; - } - /* Check if the pfn corresponds to the current bitmap block */ - bb = zone_bm->cur_block; + /* + * Check if the pfn corresponds to the current bitmap block and find + * the block where it fits if this is not the case. 
+ */ + bb = bm->cur.block; if (pfn < bb->start_pfn) - bb = zone_bm->bm_blocks; + list_for_each_entry_continue_reverse(bb, &bm->blocks, hook) + if (pfn >= bb->start_pfn) + break; - while (pfn >= bb->end_pfn) { - bb = bb->next; + if (pfn >= bb->end_pfn) + list_for_each_entry_continue(bb, &bm->blocks, hook) + if (pfn >= bb->start_pfn && pfn < bb->end_pfn) + break; - BUG_ON(!bb); - } - zone_bm->cur_block = bb; + if (&bb->hook == &bm->blocks) + return -EFAULT; + + /* The block has been found */ + bm->cur.block = bb; pfn -= bb->start_pfn; + bm->cur.bit = pfn + 1; *bit_nr = pfn; *addr = bb->data; return 0; @@ -538,29 +548,21 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn) static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) { - struct zone_bitmap *zone_bm; struct bm_block *bb; int bit; + bb = bm->cur.block; do { - bb = bm->cur.block; - do { - bit = bm->cur.bit; - bit = find_next_bit(bb->data, bm_block_bits(bb), bit); - if (bit < bm_block_bits(bb)) - goto Return_pfn; - - bb = bb->next; - bm->cur.block = bb; - bm->cur.bit = 0; - } while (bb); - zone_bm = bm->cur.zone_bm->next; - if (zone_bm) { - bm->cur.zone_bm = zone_bm; - bm->cur.block = zone_bm->bm_blocks; - bm->cur.bit = 0; - } - } while (zone_bm); + bit = bm->cur.bit; + bit = find_next_bit(bb->data, bm_block_bits(bb), bit); + if (bit < bm_block_bits(bb)) + goto Return_pfn; + + bb = list_entry(bb->hook.next, struct bm_block, hook); + bm->cur.block = bb; + bm->cur.bit = 0; + } while (&bb->hook != &bm->blocks); + memory_bm_position_reset(bm); return BM_END_OF_MAP; @@ -816,8 +818,7 @@ static unsigned int count_free_highmem_pages(void) * We should save the page if it isn't Nosave or NosaveFree, or Reserved, * and it isn't a part of a free chunk of pages. */ - -static struct page *saveable_highmem_page(unsigned long pfn) +static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) { struct page *page; @@ -825,6 +826,8 @@ static struct page *saveable_highmem_page(unsigned long pfn) return NULL; page = pfn_to_page(pfn); + if (page_zone(page) != zone) + return NULL; BUG_ON(!PageHighMem(page)); @@ -854,13 +857,16 @@ unsigned int count_highmem_pages(void) mark_free_pages(zone); max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if (saveable_highmem_page(pfn)) + if (saveable_highmem_page(zone, pfn)) n++; } return n; } #else -static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; } +static inline void *saveable_highmem_page(struct zone *z, unsigned long p) +{ + return NULL; +} #endif /* CONFIG_HIGHMEM */ /** @@ -871,8 +877,7 @@ static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; } * of pages statically defined as 'unsaveable', and it isn't a part of * a free chunk of pages. 
*/ - -static struct page *saveable_page(unsigned long pfn) +static struct page *saveable_page(struct zone *zone, unsigned long pfn) { struct page *page; @@ -880,6 +885,8 @@ static struct page *saveable_page(unsigned long pfn) return NULL; page = pfn_to_page(pfn); + if (page_zone(page) != zone) + return NULL; BUG_ON(PageHighMem(page)); @@ -911,7 +918,7 @@ unsigned int count_data_pages(void) mark_free_pages(zone); max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if(saveable_page(pfn)) + if (saveable_page(zone, pfn)) n++; } return n; @@ -952,7 +959,7 @@ static inline struct page * page_is_saveable(struct zone *zone, unsigned long pfn) { return is_highmem(zone) ? - saveable_highmem_page(pfn) : saveable_page(pfn); + saveable_highmem_page(zone, pfn) : saveable_page(zone, pfn); } static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) @@ -983,7 +990,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) } } #else -#define page_is_saveable(zone, pfn) saveable_page(pfn) +#define page_is_saveable(zone, pfn) saveable_page(zone, pfn) static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) { -- cgit v1.2.3 From baa5835df10254762aedb6cb23a9c1508f969736 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 8 Dec 2008 00:52:49 +0100 Subject: Hibernate: Replace unnecessary evaluation of pfn_to_page() Replace one evaluation of pfn_to_page() in copy_data_pages() with the value of a local variable containing the right number already. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Len Brown --- kernel/power/snapshot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index ec9f153b2fc..f5fc2d7680f 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -981,7 +981,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) * data modified by kmap_atomic() */ safe_copy_page(buffer, s_page); - dst = kmap_atomic(pfn_to_page(dst_pfn), KM_USER0); + dst = kmap_atomic(d_page, KM_USER0); memcpy(dst, buffer, PAGE_SIZE); kunmap_atomic(dst, KM_USER0); } else { -- cgit v1.2.3 From 36dffab679c7eeb91c2507400cf4da6e9e01164e Mon Sep 17 00:00:00 2001 From: Vaidyanathan Srinivasan Date: Sat, 20 Dec 2008 10:06:38 +0530 Subject: sched: nominate preferred wakeup cpu, fix Andrew Morton reported: > kernel/sched.c: In function 'schedule': > kernel/sched.c:3679: warning: 'active_balance' may be used uninitialized in this function > > This warning is correct - the code is buggy. In sched.c load_balance_newidle, there's a real potential use of an uninitialised variable - fix it. Signed-off-by: Vaidyanathan Srinivasan Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index ae5ca3f9e77..756d981d91a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3670,7 +3670,7 @@ redo: } if (!ld_moved) { - int active_balance; + int active_balance = 0; schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && -- cgit v1.2.3 From 51bc39f4ba35bae153b32145077fb1109bcae14c Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Fri, 26 Dec 2008 12:23:00 +0900 Subject: hrtimer: remove #include <linux/irq.h> Impact: cleanup <linux/irq.h> can be removed and should be, because: - hrtimer doesn't use any irq feature. - <linux/irq.h> shouldn't be included from generic code.
Signed-off-by: KOSAKI Motohiro Signed-off-by: Ingo Molnar --- kernel/hrtimer.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 47e63349d1b..0ad3f3d6d10 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -32,7 +32,6 @@ */ #include -#include <linux/irq.h> #include #include #include -- cgit v1.2.3 From f9af0e70911e9d6cc9a68f784dca86415486084d Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Fri, 26 Dec 2008 12:24:24 +0900 Subject: irq: for_each_irq_desc() move to irqnr.h Impact: cleanup Before the CONFIG_SPARSE_IRQ age, for_each_irq_desc() sat in irqnr.h and could be called from generic code. CONFIG_SPARSE_IRQ breaks this assumption, but the SPARSE_IRQ version of for_each_irq_desc() can also move into irqnr.h easily. Also, this patch unifies the CONFIG_SPARSE_IRQ and !CONFIG_SPARSE_IRQ versions of for_each_irq_desc(). Signed-off-by: KOSAKI Motohiro Signed-off-by: Ingo Molnar --- kernel/irq/handle.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 6492400cb50..4db7d2df86b 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -203,7 +203,7 @@ out_unlock: return desc; } -#else +#else /* !CONFIG_SPARSE_IRQ */ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { [0 ... NR_IRQS-1] = { @@ -218,7 +218,16 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { } }; -#endif +struct irq_desc *irq_to_desc(unsigned int irq) +{ + return (irq < NR_IRQS) ? irq_desc + irq : NULL; +} + +struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) +{ + return irq_to_desc(irq); +} +#endif /* !CONFIG_SPARSE_IRQ */ /* * What should we do if we get a hw irq event on an illegal vector? -- cgit v1.2.3 From 26ddd8d5cac8a563953d5febe8c6e40909f7bce1 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Fri, 26 Dec 2008 14:24:10 +0900 Subject: proc: remove ifdef CONFIG_SPARSE_IRQ from stat.c Impact: cleanup irq_desc can be NULL only when CONFIG_SPARSE_IRQ=y. Therefore, the NULL check can move into the SPARSE_IRQ version of kstat_irqs_cpu(). Signed-off-by: KOSAKI Motohiro Acked-by: "Yinghai Lu" Signed-off-by: Ingo Molnar --- kernel/irq/handle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 4db7d2df86b..03479dfdebb 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -448,7 +448,7 @@ void early_init_irq_lock_class(void) unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) { struct irq_desc *desc = irq_to_desc(irq); - return desc->kstat_irqs[cpu]; + return desc ? desc->kstat_irqs[cpu] : 0; } #endif EXPORT_SYMBOL(kstat_irqs_cpu); -- cgit v1.2.3 From 18eefedfe8ad33e8fc7614c13359e29a9fab4644 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Fri, 26 Dec 2008 12:29:48 +0900 Subject: irq: simplify for_each_irq_desc() usage Impact: cleanup All for_each_irq_desc() usage points have a !desc check, so the check can move into the for_each_irq_desc() macro itself. Signed-off-by: KOSAKI Motohiro Acked-by: Yinghai Lu Signed-off-by: Ingo Molnar --- kernel/irq/autoprobe.c | 15 --------------- kernel/irq/handle.c | 3 --- kernel/irq/spurious.c | 5 ----- 3 files changed, 23 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 650ce4102a6..cc0f7321b8c 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -40,9 +40,6 @@ unsigned long probe_irq_on(void) * flush such a longstanding irq before considering it as spurious.
*/ for_each_irq_desc_reverse(i, desc) { - if (!desc) - continue; - spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { /* @@ -71,9 +68,6 @@ unsigned long probe_irq_on(void) * happened in the previous stage, it may have masked itself) */ for_each_irq_desc_reverse(i, desc) { - if (!desc) - continue; - spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { desc->status |= IRQ_AUTODETECT | IRQ_WAITING; @@ -92,9 +86,6 @@ unsigned long probe_irq_on(void) * Now filter out any obviously spurious interrupts */ for_each_irq_desc(i, desc) { - if (!desc) - continue; - spin_lock_irq(&desc->lock); status = desc->status; @@ -133,9 +124,6 @@ unsigned int probe_irq_mask(unsigned long val) int i; for_each_irq_desc(i, desc) { - if (!desc) - continue; - spin_lock_irq(&desc->lock); status = desc->status; @@ -178,9 +166,6 @@ int probe_irq_off(unsigned long val) unsigned int status; for_each_irq_desc(i, desc) { - if (!desc) - continue; - spin_lock_irq(&desc->lock); status = desc->status; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 03479dfdebb..7dbdfe52469 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -437,9 +437,6 @@ void early_init_irq_lock_class(void) int i; for_each_irq_desc(i, desc) { - if (!desc) - continue; - lockdep_set_class(&desc->lock, &irq_desc_lock_class); } } diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 3738107531f..dd364c11e56 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -91,9 +91,6 @@ static int misrouted_irq(int irq) int i, ok = 0; for_each_irq_desc(i, desc) { - if (!desc) - continue; - if (!i) continue; @@ -115,8 +112,6 @@ static void poll_spurious_irqs(unsigned long dummy) for_each_irq_desc(i, desc) { unsigned int status; - if (!desc) - continue; if (!i) continue; -- cgit v1.2.3 From 00c23634879062d1c38d60128bf150c394a359e8 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 23 Dec 2008 17:29:00 -0800 Subject: sparseirq: remove duplicated arch_early_irq_init() Impact: clean up We already have a weak copy of this function in init/main.c Signed-off-by: Yinghai Signed-off-by: Ingo Molnar --- kernel/irq/handle.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 7dbdfe52469..06b05a4d300 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -56,10 +56,6 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc) int nr_irqs = NR_IRQS; EXPORT_SYMBOL_GPL(nr_irqs); -void __init __attribute__((weak)) arch_early_irq_init(void) -{ -} - #ifdef CONFIG_SPARSE_IRQ static struct irq_desc irq_desc_init = { .irq = -1, -- cgit v1.2.3 From be4d638c1597580ed2294d899d9f1a2cd10e462c Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 26 Dec 2008 22:23:43 +1030 Subject: cpumask: Replace cpu_coregroup_map with cpu_coregroup_mask cpu_coregroup_map returned a cpumask_t: it's going away. (Note, the sched part of this patch won't apply meaningfully to the sched tree, but I'm posting it to show the goal). 
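In a nutshell, the calling-convention change looks like this (a sketch with a hypothetical caller; the sched hunks below still copy through a temporary because sd->span is not yet a pointer):

	/* old: returns a full cpumask_t by value, NR_CPUS bits on the stack */
	cpumask_t mask = cpu_coregroup_map(cpu);

	/* new: returns a pointer to a shared mask, no copy needed */
	const struct cpumask *core_mask = cpu_coregroup_mask(cpu);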
Signed-off-by: Rusty Russell Signed-off-by: Mike Travis Cc: Jens Axboe Cc: Ingo Molnar --- kernel/sched.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index d2d16d1273b..42929239830 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7119,7 +7119,7 @@ cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, { int group; #ifdef CONFIG_SCHED_MC - *mask = cpu_coregroup_map(cpu); + *mask = *cpu_coregroup_mask(cpu); cpus_and(*mask, *mask, *cpu_map); group = first_cpu(*mask); #elif defined(CONFIG_SCHED_SMT) @@ -7485,7 +7485,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, sd = &per_cpu(core_domains, i); SD_INIT(sd, MC); set_domain_attribute(sd, attr); - sd->span = cpu_coregroup_map(i); + sd->span = *cpu_coregroup_mask(i); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; @@ -7528,7 +7528,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SCHED_CPUMASK_VAR(this_core_map, allmasks); SCHED_CPUMASK_VAR(send_covered, allmasks); - *this_core_map = cpu_coregroup_map(i); + *this_core_map = *cpu_coregroup_mask(i); cpus_and(*this_core_map, *this_core_map, *cpu_map); if (i != first_cpu(*this_core_map)) continue; -- cgit v1.2.3 From 8b07cd44511f3aa78dd912cca6493275a6787dc5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 26 Dec 2008 19:10:04 +0100 Subject: sparseirq: do not printk when migrating IRQ descriptors Impact: reduce printk noise There were a couple of leftover KERN_DEBUG debugging printks, remove them. Also clarify an error message. Signed-off-by: Ingo Molnar --- kernel/irq/numa_migrate.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index 089c3746358..a565ce3a4fb 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -74,10 +74,8 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, node = cpu_to_node(cpu); desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); - printk(KERN_DEBUG " move irq_desc for %d to cpu %d node %d\n", - irq, cpu, node); if (!desc) { - printk(KERN_ERR "can not get new irq_desc for moving\n"); + printk(KERN_ERR "irq %d: can not get new irq_desc for migration.\n", irq); /* still use old one */ desc = old_desc; goto out_unlock; @@ -106,8 +104,6 @@ struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu) return desc; old_cpu = desc->cpu; - printk(KERN_DEBUG - "try to move irq_desc from cpu %d to %d\n", old_cpu, cpu); if (old_cpu != cpu) { node = cpu_to_node(cpu); old_node = cpu_to_node(old_cpu); -- cgit v1.2.3 From 793f7b12a0c95e7bfec1badf9628043fb78fd440 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 26 Dec 2008 19:02:20 +0100 Subject: sparseirq: fix desc->lock init Impact: cleanup init_one_irq_desc() does not initialize the desc->lock properly - you cannot init a lock by memcpying some other lock on it. This happens to work right now (because irq_desc_init is never in use), but it's a dangerous construct nevertheless, so fix it. 
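A minimal sketch of the hazard being fixed: memcpy() duplicates the source lock's internal state (including, with lockdep enabled, its dependency-tracking fields) into the destination, which is not a valid way to initialize a lock, so the copy must be followed by an explicit re-initialization:

        static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
        {
                /* Bit-for-bit copy of the template, including its lock... */
                memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));

                /* ...so the embedded spinlock must be re-initialized. */
                spin_lock_init(&desc->lock);
                desc->irq = irq;
        }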
Signed-off-by: Ingo Molnar --- kernel/irq/handle.c | 2 ++ kernel/irq/numa_migrate.c | 1 + 2 files changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 06b05a4d300..893da67b778 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -93,6 +93,8 @@ void __attribute__((weak)) arch_init_chip_data(struct irq_desc *desc, int cpu) static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) { memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); + + spin_lock_init(&desc->lock); desc->irq = irq; #ifdef CONFIG_SMP desc->cpu = cpu; diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index a565ce3a4fb..ecf765c6a77 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -42,6 +42,7 @@ static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, struct irq_desc *desc, int cpu) { memcpy(desc, old_desc, sizeof(struct irq_desc)); + spin_lock_init(&desc->lock); desc->cpu = cpu; lockdep_set_class(&desc->lock, &irq_desc_lock_class); init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids); -- cgit v1.2.3 From 13a0c3c269b223f60abfac8a9811d77111a8b4ba Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 26 Dec 2008 02:05:47 -0800 Subject: sparseirq: work around compiler optimizing away __weak functions Impact: fix panic on null pointer with sparseirq Some GCC versions seem to inline the weak global function when that function is empty. Work around it by making the functions return a (dummy) integer. Signed-off-by: Yinghai Signed-off-by: Ingo Molnar --- kernel/irq/handle.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 893da67b778..0bef3ecb7a0 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -86,8 +86,9 @@ void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) desc->kstat_irqs = (unsigned int *)ptr; } -void __attribute__((weak)) arch_init_chip_data(struct irq_desc *desc, int cpu) +int __weak arch_init_chip_data(struct irq_desc *desc, int cpu) { + return 0; } static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) @@ -132,7 +133,7 @@ static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_sm /* FIXME: use bootmem alloc ...*/ static unsigned int kstat_irqs_legacy[NR_IRQS_LEGACY][NR_CPUS]; -void __init early_irq_init(void) +int __init early_irq_init(void) { struct irq_desc *desc; int legacy_count; @@ -151,7 +152,7 @@ void __init early_irq_init(void) for (i = legacy_count; i < NR_IRQS; i++) irq_desc_ptrs[i] = NULL; - arch_early_irq_init(); + return arch_early_irq_init(); } struct irq_desc *irq_to_desc(unsigned int irq) -- cgit v1.2.3 From fa6beb37b0d9bc00f90f11154eeed9502d8b0a37 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 22 Dec 2008 20:24:09 -0800 Subject: sparseirq: set lock_class for legacy irq when sparse_irq is selected Impact: add lockdep annotation to legacy IRQ descs Warnings resulting from this were not seen in practice, but it's prudent to initialize the legacy descriptors to the lock class as well, symmetric to how we do it with other descriptors.
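The lockdep idiom being applied here, roughly: all IRQ descriptor locks are mapped to one named class, so lockdep tracks and reports them as a single class instead of thousands of anonymous ones:

        static struct lock_class_key irq_desc_lock_class;

        /* Annotate each legacy descriptor's lock with the shared class. */
        lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);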
Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- kernel/irq/handle.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 0bef3ecb7a0..e1cf4e391ca 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -145,6 +145,7 @@ int __init early_irq_init(void) for (i = 0; i < legacy_count; i++) { desc[i].irq = i; desc[i].kstat_irqs = kstat_irqs_legacy[i]; + lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); irq_desc_ptrs[i] = desc + i; } -- cgit v1.2.3 From 12026ea16a618b289fcf457661aed24f57323a20 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 26 Dec 2008 22:38:15 -0800 Subject: sparseirq: fix hang with !SPARSE_IRQ Impact: fix hang Suresh reported that his two-socket system only works with SPARSE_IRQ enabled. It turns out we miss setting desc->irq, so provide early_irq_init() even for !SPARSE_IRQ to set desc->irq. Reported-by: "Siddha, Suresh B" Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- kernel/irq/handle.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index e1cf4e391ca..157c04c3b15 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -218,6 +218,21 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { } }; +int __init early_irq_init(void) +{ + struct irq_desc *desc; + int count; + int i; + + desc = irq_desc; + count = ARRAY_SIZE(irq_desc); + + for (i = 0; i < count; i++) + desc[i].irq = i; + + return arch_early_irq_init(); +} + struct irq_desc *irq_to_desc(unsigned int irq) { return (irq < NR_IRQS) ? irq_desc + irq : NULL; } -- cgit v1.2.3 From b2e2fe99628c4f944c3075258e536197b5a4f3f8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 29 Dec 2008 00:16:45 +0100 Subject: sparseirq: work around __weak alias bug Impact: fix boot crash if the kernel is built with certain GCC versions GCC has a bug with __weak alias functions: if the functions are in the same compilation unit as their call site, GCC can decide to inline them - and thus rob the linker of the opportunity to override the weak alias with the real thing. This can lead to the boot crash reported by Kamalesh Babulal: ACPI: Core revision 20080926 Setting APIC routing to flat BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 IP: [] add_pin_to_irq_cpu+0x14/0x74 PGD 0 Oops: 0000 [#1] SMP [...] So move the arch_init_chip_data() function from handle.c to manage.c.
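A sketch of the workaround: keep the __weak default in a different compilation unit from its callers, so the compiler never sees the body at the call site and the linker keeps the chance to substitute an arch override:

        /* kernel/irq/manage.c: the overridable default. */
        int __weak arch_init_chip_data(struct irq_desc *desc, int cpu)
        {
                return 0;
        }

        /* kernel/irq/handle.c: the call site only sees a declaration,
         * so GCC cannot inline the empty stub over the real function. */
        extern int arch_init_chip_data(struct irq_desc *desc, int cpu);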
Reported-by: Kamalesh Babulal Signed-off-by: Ingo Molnar --- kernel/irq/handle.c | 5 ----- kernel/irq/manage.c | 9 +++++++++ 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 157c04c3b15..c20db0be917 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -86,11 +86,6 @@ void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) desc->kstat_irqs = (unsigned int *)ptr; } -int __weak arch_init_chip_data(struct irq_desc *desc, int cpu) -{ - return 0; -} - static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) { memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 46953a06f4a..c2741b02ad3 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -261,6 +261,15 @@ void enable_irq(unsigned int irq) } EXPORT_SYMBOL(enable_irq); +/* + * [ Not in kernel/irq/handle.c, so that GCC does not + * inline the __weak alias: ] + */ +int __weak arch_init_chip_data(struct irq_desc *desc, int cpu) +{ + return 0; +} + static int set_irq_wake_real(unsigned int irq, unsigned int on) { struct irq_desc *desc = irq_to_desc(irq); -- cgit v1.2.3 From 43a256322ac1fc105c181b3cade3b9bfc0b63ca1 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 28 Dec 2008 16:01:13 -0800 Subject: sparseirq: move __weak symbols into separate compilation unit GCC has a bug with __weak alias functions: if the functions are in the same compilation unit as their call site, GCC can decide to inline them - and thus rob the linker of the opportunity to override the weak alias with the real thing. So move all the IRQ handling related __weak symbols to kernel/irq/chip.c. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- kernel/irq/manage.c | 9 --------- kernel/softirq.c | 20 ++++++++++++++++++++ 2 files changed, 20 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index c2741b02ad3..46953a06f4a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -261,15 +261,6 @@ void enable_irq(unsigned int irq) } EXPORT_SYMBOL(enable_irq); -/* - * [ Not in kernel/irq/handle.c, so that GCC does not - * inline the __weak alias: ] - */ -int __weak arch_init_chip_data(struct irq_desc *desc, int cpu) -{ - return 0; -} - static int set_irq_wake_real(unsigned int irq, unsigned int on) { struct irq_desc *desc = irq_to_desc(irq); diff --git a/kernel/softirq.c b/kernel/softirq.c index e7c69a720d6..daf46358d2d 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -797,3 +797,23 @@ int on_each_cpu(void (*func) (void *info), void *info, int wait) } EXPORT_SYMBOL(on_each_cpu); #endif + +/* + * [ These __weak aliases are kept in a separate compilation unit, so that + * GCC does not inline them incorrectly. ] + */ + +int __init __weak early_irq_init(void) +{ + return 0; +} + +int __init __weak arch_early_irq_init(void) +{ + return 0; +} + +int __weak arch_init_chip_data(struct irq_desc *desc, int cpu) +{ + return 0; +} -- cgit v1.2.3 From b3199c025d1646e25e7d1d640dd605db251dccf8 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 30 Dec 2008 09:05:14 +1030 Subject: cpumask: switch over to cpu_online/possible/active/present_mask: core Impact: cleanup This implements the obsolescent cpu_online_map in terms of cpu_online_mask, rather than the other way around. Same for the other maps. The documentation comments are also updated to refer to _mask rather than _map. 
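The shape of the new definitions, as picked up in the diff below: each map becomes a private static bitmap, and the exported symbol is a const pointer to it, so generic code can read the mask but can no longer write to it directly:

        static DECLARE_BITMAP(cpu_online_bits, CONFIG_NR_CPUS) __read_mostly;
        const struct cpumask *const cpu_online_mask = to_cpumask(cpu_online_bits);
        EXPORT_SYMBOL(cpu_online_mask);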
Signed-off-by: Rusty Russell Signed-off-by: Mike Travis --- kernel/cpu.c | 49 +++++++++++++++++++++++-------------------------- 1 file changed, 23 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index bae131a1211..3ddc509b19c 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -15,30 +15,8 @@ #include #include -/* - * Represents all cpu's present in the system - * In systems capable of hotplug, this map could dynamically grow - * as new cpu's are detected in the system via any platform specific - * method, such as ACPI for e.g. - */ -cpumask_t cpu_present_map __read_mostly; -EXPORT_SYMBOL(cpu_present_map); - -/* - * Represents all cpu's that are currently online. - */ -cpumask_t cpu_online_map __read_mostly; -EXPORT_SYMBOL(cpu_online_map); - -#ifdef CONFIG_INIT_ALL_POSSIBLE -cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; -#else -cpumask_t cpu_possible_map __read_mostly; -#endif -EXPORT_SYMBOL(cpu_possible_map); - #ifdef CONFIG_SMP -/* Serializes the updates to cpu_online_map, cpu_present_map */ +/* Serializes the updates to cpu_online_mask, cpu_present_mask */ static DEFINE_MUTEX(cpu_add_remove_lock); static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); @@ -65,8 +43,6 @@ void __init cpu_hotplug_init(void) cpu_hotplug.refcount = 0; } -cpumask_t cpu_active_map; - #ifdef CONFIG_HOTPLUG_CPU void get_online_cpus(void) @@ -97,7 +73,7 @@ EXPORT_SYMBOL_GPL(put_online_cpus); /* * The following two API's must be used when attempting - * to serialize the updates to cpu_online_map, cpu_present_map. + * to serialize the updates to cpu_online_mask, cpu_present_mask. */ void cpu_maps_update_begin(void) { @@ -503,3 +479,24 @@ EXPORT_SYMBOL_GPL(cpu_bit_bitmap); const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL; EXPORT_SYMBOL(cpu_all_bits); + +#ifdef CONFIG_INIT_ALL_POSSIBLE +static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly + = CPU_BITS_ALL; +#else +static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly; +#endif +const struct cpumask *const cpu_possible_mask = to_cpumask(cpu_possible_bits); +EXPORT_SYMBOL(cpu_possible_mask); + +static DECLARE_BITMAP(cpu_online_bits, CONFIG_NR_CPUS) __read_mostly; +const struct cpumask *const cpu_online_mask = to_cpumask(cpu_online_bits); +EXPORT_SYMBOL(cpu_online_mask); + +static DECLARE_BITMAP(cpu_present_bits, CONFIG_NR_CPUS) __read_mostly; +const struct cpumask *const cpu_present_mask = to_cpumask(cpu_present_bits); +EXPORT_SYMBOL(cpu_present_mask); + +static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly; +const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits); +EXPORT_SYMBOL(cpu_active_mask); -- cgit v1.2.3 From 3fa41520696fec2815e2d88fbcccdda77ba4d693 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 30 Dec 2008 09:05:16 +1030 Subject: cpumask: make set_cpu_*/init_cpu_* out-of-line They're only for use in boot/cpu hotplug code anyway, and this avoids the use of deprecated cpu_*_map. Stephen Rothwell points out that gcc 4.2.4 (on powerpc at least) didn't like the cast away of const anyway: include/linux/cpumask.h: In function 'set_cpu_possible': include/linux/cpumask.h:1052: warning: passing argument 2 of 'cpumask_set_cpu' discards qualifiers from pointer target type So this kills two birds with one stone. 
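One representative accessor from the diff that follows; because only kernel/cpu.c can see the writable bitmaps, the cast away of const lives in exactly one file instead of in an inline header visible everywhere:

        void set_cpu_online(unsigned int cpu, bool online)
        {
                if (online)
                        cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits));
                else
                        cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits));
        }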
Signed-off-by: Rusty Russell --- kernel/cpu.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 3ddc509b19c..2c9f78f3a2f 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -500,3 +500,50 @@ EXPORT_SYMBOL(cpu_present_mask); static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly; const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits); EXPORT_SYMBOL(cpu_active_mask); + +void set_cpu_possible(unsigned int cpu, bool possible) +{ + if (possible) + cpumask_set_cpu(cpu, to_cpumask(cpu_possible_bits)); + else + cpumask_clear_cpu(cpu, to_cpumask(cpu_possible_bits)); +} + +void set_cpu_present(unsigned int cpu, bool present) +{ + if (present) + cpumask_set_cpu(cpu, to_cpumask(cpu_present_bits)); + else + cpumask_clear_cpu(cpu, to_cpumask(cpu_present_bits)); +} + +void set_cpu_online(unsigned int cpu, bool online) +{ + if (online) + cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits)); + else + cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits)); +} + +void set_cpu_active(unsigned int cpu, bool active) +{ + if (active) + cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits)); + else + cpumask_clear_cpu(cpu, to_cpumask(cpu_active_bits)); +} + +void init_cpu_present(const struct cpumask *src) +{ + cpumask_copy(to_cpumask(cpu_present_bits), src); +} + +void init_cpu_possible(const struct cpumask *src) +{ + cpumask_copy(to_cpumask(cpu_possible_bits), src); +} + +void init_cpu_online(const struct cpumask *src) +{ + cpumask_copy(to_cpumask(cpu_online_bits), src); +} -- cgit v1.2.3 From 54b11e6d57a10aa9d0009efd93873e17bffd5d30 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 30 Dec 2008 09:05:16 +1030 Subject: cpumask: smp_call_function_many() Impact: Implementation change to remove cpumask_t from stack. Actually change smp_call_function_mask() to smp_call_function_many(). We avoid cpumasks on the stack in this version. (S390 has its own version, but that's going away apparently). We have to do some dancing to figure out if 0 or 1 other cpus are in the mask supplied and the online mask without allocating a tmp cpumask. It's still fairly cheap. We allocate the cpumask at the end of the call_function_data structure: if allocation fails we fallback to smp_call_function_single rather than using the baroque quiescing code (which needs a cpumask on stack). (Thanks to Hiroshi Shimamoto for spotting several bugs in previous versions!) 
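The allocation trick, distilled from the diff below: the mask is a flexible array member at the tail of call_function_data, sized at runtime by cpumask_size(), so neither the caller nor the helper ever places a cpumask on the stack:

        struct call_function_data {
                struct call_single_data csd;
                spinlock_t lock;
                unsigned int refs;
                struct rcu_head rcu_head;
                unsigned long cpumask_bits[];   /* runtime-sized */
        };

        data = kmalloc(sizeof(*data) + cpumask_size(), GFP_ATOMIC);
        if (likely(data))
                cpumask_and(to_cpumask(data->cpumask_bits),
                            mask, cpu_online_mask);
        /* else: fall back to one smp_call_function_single() per cpu. */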
Signed-off-by: Rusty Russell Signed-off-by: Mike Travis Cc: Hiroshi Shimamoto Cc: npiggin@suse.de Cc: axboe@kernel.dk --- kernel/smp.c | 139 +++++++++++++++++++++-------------------------------------- 1 file changed, 49 insertions(+), 90 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 75c8dde58c5..9f0eafed139 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -24,8 +24,8 @@ struct call_function_data { struct call_single_data csd; spinlock_t lock; unsigned int refs; - cpumask_t cpumask; struct rcu_head rcu_head; + unsigned long cpumask_bits[]; }; struct call_single_queue { @@ -110,13 +110,13 @@ void generic_smp_call_function_interrupt(void) list_for_each_entry_rcu(data, &call_function_queue, csd.list) { int refs; - if (!cpu_isset(cpu, data->cpumask)) + if (!cpumask_test_cpu(cpu, to_cpumask(data->cpumask_bits))) continue; data->csd.func(data->csd.info); spin_lock(&data->lock); - cpu_clear(cpu, data->cpumask); + cpumask_clear_cpu(cpu, to_cpumask(data->cpumask_bits)); WARN_ON(data->refs == 0); data->refs--; refs = data->refs; @@ -266,51 +266,13 @@ void __smp_call_function_single(int cpu, struct call_single_data *data) generic_exec_single(cpu, data); } -/* Dummy function */ -static void quiesce_dummy(void *unused) -{ -} - -/* - * Ensure stack based data used in call function mask is safe to free. - * - * This is needed by smp_call_function_mask when using on-stack data, because - * a single call function queue is shared by all CPUs, and any CPU may pick up - * the data item on the queue at any time before it is deleted. So we need to - * ensure that all CPUs have transitioned through a quiescent state after - * this call. - * - * This is a very slow function, implemented by sending synchronous IPIs to - * all possible CPUs. For this reason, we have to alloc data rather than use - * stack based data even in the case of synchronous calls. The stack based - * data is then just used for deadlock/oom fallback which will be very rare. - * - * If a faster scheme can be made, we could go back to preferring stack based - * data -- the data allocation/free is non-zero cost. - */ -static void smp_call_function_mask_quiesce_stack(cpumask_t mask) -{ - struct call_single_data data; - int cpu; - - data.func = quiesce_dummy; - data.info = NULL; - - for_each_cpu_mask(cpu, mask) { - data.flags = CSD_FLAG_WAIT; - generic_exec_single(cpu, &data); - } -} - /** - * smp_call_function_mask(): Run a function on a set of other CPUs. - * @mask: The set of cpus to run on. + * smp_call_function_many(): Run a function on a set of other CPUs. + * @mask: The set of cpus to run on (only runs on online subset). * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait (atomically) until function has completed on other CPUs. * - * Returns 0 on success, else a negative status code. - * * If @wait is true, then returns once @func has returned. Note that @wait * will be implicitly turned on in case of allocation failures, since * we fall back to on-stack allocation. @@ -319,53 +281,57 @@ static void smp_call_function_mask_quiesce_stack(cpumask_t mask) * hardware interrupt handler or from a bottom half handler. Preemption * must be disabled when calling this function. 
*/ -int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, - int wait) +void smp_call_function_many(const struct cpumask *mask, + void (*func)(void *), void *info, + bool wait) { - struct call_function_data d; - struct call_function_data *data = NULL; - cpumask_t allbutself; + struct call_function_data *data; unsigned long flags; - int cpu, num_cpus; - int slowpath = 0; + int cpu, next_cpu; /* Can deadlock when called with interrupts disabled */ WARN_ON(irqs_disabled()); - cpu = smp_processor_id(); - allbutself = cpu_online_map; - cpu_clear(cpu, allbutself); - cpus_and(mask, mask, allbutself); - num_cpus = cpus_weight(mask); - - /* - * If zero CPUs, return. If just a single CPU, turn this request - * into a targetted single call instead since it's faster. - */ - if (!num_cpus) - return 0; - else if (num_cpus == 1) { - cpu = first_cpu(mask); - return smp_call_function_single(cpu, func, info, wait); + /* So, what's a CPU they want? Ignoring this one. */ + cpu = cpumask_first_and(mask, cpu_online_mask); + if (cpu == smp_processor_id()) + cpu = cpumask_next_and(cpu, mask, cpu_online_mask); + /* No online cpus? We're done. */ + if (cpu >= nr_cpu_ids) + return; + + /* Do we have another CPU which isn't us? */ + next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask); + if (next_cpu == smp_processor_id()) + next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask); + + /* Fastpath: do that cpu by itself. */ + if (next_cpu >= nr_cpu_ids) { + smp_call_function_single(cpu, func, info, wait); + return; } - data = kmalloc(sizeof(*data), GFP_ATOMIC); - if (data) { - data->csd.flags = CSD_FLAG_ALLOC; - if (wait) - data->csd.flags |= CSD_FLAG_WAIT; - } else { - data = &d; - data->csd.flags = CSD_FLAG_WAIT; - wait = 1; - slowpath = 1; + data = kmalloc(sizeof(*data) + cpumask_size(), GFP_ATOMIC); + if (unlikely(!data)) { + /* Slow path. */ + for_each_online_cpu(cpu) { + if (cpu == smp_processor_id()) + continue; + if (cpumask_test_cpu(cpu, mask)) + smp_call_function_single(cpu, func, info, wait); + } + return; } spin_lock_init(&data->lock); + data->csd.flags = CSD_FLAG_ALLOC; + if (wait) + data->csd.flags |= CSD_FLAG_WAIT; data->csd.func = func; data->csd.info = info; - data->refs = num_cpus; - data->cpumask = mask; + cpumask_and(to_cpumask(data->cpumask_bits), mask, cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), to_cpumask(data->cpumask_bits)); + data->refs = cpumask_weight(to_cpumask(data->cpumask_bits)); spin_lock_irqsave(&call_function_lock, flags); list_add_tail_rcu(&data->csd.list, &call_function_queue); @@ -377,18 +343,13 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, smp_mb(); /* Send a message to all CPUs in the map */ - arch_send_call_function_ipi(mask); + arch_send_call_function_ipi(*to_cpumask(data->cpumask_bits)); /* optionally wait for the CPUs to complete */ - if (wait) { + if (wait) csd_flag_wait(&data->csd); - if (unlikely(slowpath)) - smp_call_function_mask_quiesce_stack(mask); - } - - return 0; } -EXPORT_SYMBOL(smp_call_function_mask); +EXPORT_SYMBOL(smp_call_function_many); /** * smp_call_function(): Run a function on all other CPUs. @@ -396,7 +357,7 @@ EXPORT_SYMBOL(smp_call_function_mask); * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait (atomically) until function has completed on other CPUs. * - * Returns 0 on success, else a negative status code. + * Returns 0. * * If @wait is true, then returns once @func has returned; otherwise * it returns just before the target cpu calls @func. 
In case of allocation @@ -407,12 +368,10 @@ EXPORT_SYMBOL(smp_call_function_mask); */ int smp_call_function(void (*func)(void *), void *info, int wait) { - int ret; - preempt_disable(); - ret = smp_call_function_mask(cpu_online_map, func, info, wait); + smp_call_function_many(cpu_online_mask, func, info, wait); preempt_enable(); - return ret; + return 0; } EXPORT_SYMBOL(smp_call_function); -- cgit v1.2.3 From ce47d974f71af26d00832e83a43ac79bec272d99 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 30 Dec 2008 09:05:17 +1030 Subject: cpumask: arch_send_call_function_ipi_mask: core Impact: new API to reduce stack usage We're weaning the core code off handing cpumask's around on-stack. This introduces arch_send_call_function_ipi_mask(). Signed-off-by: Rusty Russell --- kernel/smp.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 9f0eafed139..172b1826890 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -266,6 +266,12 @@ void __smp_call_function_single(int cpu, struct call_single_data *data) generic_exec_single(cpu, data); } +/* FIXME: Shim for archs using old arch_send_call_function_ipi API. */ +#ifndef arch_send_call_function_ipi_mask +#define arch_send_call_function_ipi_mask(maskp) \ + arch_send_call_function_ipi(*(maskp)) +#endif + /** * smp_call_function_many(): Run a function on a set of other CPUs. * @mask: The set of cpus to run on (only runs on online subset). @@ -343,7 +349,7 @@ void smp_call_function_many(const struct cpumask *mask, smp_mb(); /* Send a message to all CPUs in the map */ - arch_send_call_function_ipi(*to_cpumask(data->cpumask_bits)); + arch_send_call_function_ipi_mask(to_cpumask(data->cpumask_bits)); /* optionally wait for the CPUs to complete */ if (wait) -- cgit v1.2.3 From 42d35d48ce7cefb9429880af19d1c329d1554e7a Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Mon, 29 Dec 2008 15:49:53 -0800 Subject: futex: make futex_(get|put)_key() calls symmetric Impact: cleanup This patch makes the calls to futex_get_key_refs() and futex_drop_key_refs() explicitly symmetric by only "putting" keys we successfully "got". Also cleanup a couple return points that didn't "put" after a successful "get". Build and boot tested on an x86_64 system. 
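The resulting error-unwind shape, distilled from the diff below: each exit label releases exactly the keys acquired before the failure point, so every successful "get" is paired with exactly one "put":

        ret = get_futex_key(uaddr1, fshared, &key1);
        if (unlikely(ret != 0))
                goto out;
        ret = get_futex_key(uaddr2, fshared, &key2);
        if (unlikely(ret != 0))
                goto out_put_key1;

        /* ... work that may fail while both keys are held ... */

out_put_keys:
        put_futex_key(fshared, &key2);
out_put_key1:
        put_futex_key(fshared, &key1);
out:
        return ret;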
Signed-off-by: Darren Hart Cc: Signed-off-by: Ingo Molnar --- kernel/futex.c | 67 +++++++++++++++++++++++++++++++--------------------------- 1 file changed, 36 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index b4f87bac91c..c5ac55cc0c1 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -723,8 +723,8 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) } spin_unlock(&hb->lock); -out: put_futex_key(fshared, &key); +out: return ret; } @@ -748,7 +748,7 @@ retryfull: goto out; ret = get_futex_key(uaddr2, fshared, &key2); if (unlikely(ret != 0)) - goto out; + goto out_put_key1; hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); @@ -770,12 +770,12 @@ retry: * but we might get them from range checking */ ret = op_ret; - goto out; + goto out_put_keys; #endif if (unlikely(op_ret != -EFAULT)) { ret = op_ret; - goto out; + goto out_put_keys; } /* @@ -789,7 +789,7 @@ retry: ret = futex_handle_fault((unsigned long)uaddr2, attempt); if (ret) - goto out; + goto out_put_keys; goto retry; } @@ -827,10 +827,11 @@ retry: spin_unlock(&hb1->lock); if (hb1 != hb2) spin_unlock(&hb2->lock); -out: +out_put_keys: put_futex_key(fshared, &key2); +out_put_key1: put_futex_key(fshared, &key1); - +out: return ret; } @@ -847,13 +848,13 @@ static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, struct futex_q *this, *next; int ret, drop_count = 0; - retry: +retry: ret = get_futex_key(uaddr1, fshared, &key1); if (unlikely(ret != 0)) goto out; ret = get_futex_key(uaddr2, fshared, &key2); if (unlikely(ret != 0)) - goto out; + goto out_put_key1; hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); @@ -875,7 +876,7 @@ static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, if (!ret) goto retry; - return ret; + goto out_put_keys; } if (curval != *cmpval) { ret = -EAGAIN; @@ -920,9 +921,11 @@ out_unlock: while (--drop_count >= 0) drop_futex_key_refs(&key1); -out: +out_put_keys: put_futex_key(fshared, &key2); +out_put_key1: put_futex_key(fshared, &key1); +out: return ret; } @@ -983,7 +986,7 @@ static int unqueue_me(struct futex_q *q) int ret = 0; /* In the common case we don't take the spinlock, which is nice. */ - retry: +retry: lock_ptr = q->lock_ptr; barrier(); if (lock_ptr != NULL) { @@ -1165,11 +1168,11 @@ static int futex_wait(u32 __user *uaddr, int fshared, q.pi_state = NULL; q.bitset = bitset; - retry: +retry: q.key = FUTEX_KEY_INIT; ret = get_futex_key(uaddr, fshared, &q.key); if (unlikely(ret != 0)) - goto out_release_sem; + goto out; hb = queue_lock(&q); @@ -1197,6 +1200,7 @@ static int futex_wait(u32 __user *uaddr, int fshared, if (unlikely(ret)) { queue_unlock(&q, hb); + put_futex_key(fshared, &q.key); ret = get_user(uval, uaddr); @@ -1206,7 +1210,7 @@ static int futex_wait(u32 __user *uaddr, int fshared, } ret = -EWOULDBLOCK; if (uval != val) - goto out_unlock_release_sem; + goto out_unlock_put_key; /* Only actually queue if *uaddr contained val. 
*/ queue_me(&q, hb); @@ -1298,11 +1302,11 @@ static int futex_wait(u32 __user *uaddr, int fshared, return -ERESTART_RESTARTBLOCK; } - out_unlock_release_sem: +out_unlock_put_key: queue_unlock(&q, hb); - - out_release_sem: put_futex_key(fshared, &q.key); + +out: return ret; } @@ -1351,16 +1355,16 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, } q.pi_state = NULL; - retry: +retry: q.key = FUTEX_KEY_INIT; ret = get_futex_key(uaddr, fshared, &q.key); if (unlikely(ret != 0)) - goto out_release_sem; + goto out; - retry_unlocked: +retry_unlocked: hb = queue_lock(&q); - retry_locked: +retry_locked: ret = lock_taken = 0; /* @@ -1381,14 +1385,14 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, */ if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) { ret = -EDEADLK; - goto out_unlock_release_sem; + goto out_unlock_put_key; } /* * Surprise - we got the lock. Just return to userspace: */ if (unlikely(!curval)) - goto out_unlock_release_sem; + goto out_unlock_put_key; uval = curval; @@ -1424,7 +1428,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, * We took the lock due to owner died take over. */ if (unlikely(lock_taken)) - goto out_unlock_release_sem; + goto out_unlock_put_key; /* * We dont have the lock. Look up the PI state (or create it if @@ -1463,7 +1467,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, goto retry_locked; } default: - goto out_unlock_release_sem; + goto out_unlock_put_key; } } @@ -1554,16 +1558,17 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, destroy_hrtimer_on_stack(&to->timer); return ret != -EINTR ? ret : -ERESTARTNOINTR; - out_unlock_release_sem: +out_unlock_put_key: queue_unlock(&q, hb); - out_release_sem: +out_put_key: put_futex_key(fshared, &q.key); +out: if (to) destroy_hrtimer_on_stack(&to->timer); return ret; - uaddr_faulted: +uaddr_faulted: /* * We have to r/w *(int __user *)uaddr, and we have to modify it * atomically. Therefore, if we continue to fault after get_user() @@ -1576,7 +1581,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, if (attempt++) { ret = futex_handle_fault((unsigned long)uaddr, attempt); if (ret) - goto out_release_sem; + goto out_put_key; goto retry_unlocked; } @@ -1668,9 +1673,9 @@ retry_unlocked: out_unlock: spin_unlock(&hb->lock); -out: put_futex_key(fshared, &key); +out: return ret; pi_faulted: -- cgit v1.2.3 From 1c5745aa380efb6417b5681104b007c8612fb496 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Dec 2008 23:05:28 +0100 Subject: sched_clock: prevent scd->clock from moving backwards, take #2 Redo: 5b7dba4: sched_clock: prevent scd->clock from moving backwards which had to be reverted due to s2ram hangs: ca7e716: Revert "sched_clock: prevent scd->clock from moving backwards" ... this time with resume restoring GTOD later in the sequence taken into account as well. The "timekeeping_suspended" flag is not very nice but we cannot call into GTOD before it has been properly resumed and the scheduler will run very early in the resume sequence. 
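The core of the fix, in isolation: the upper clamp now also takes scd->clock into account, so when the GTOD-based estimate lags behind (for example across a long tickless period), the per-cpu clock is held in place rather than pulled backwards:

        /*
         * clock = clamp(scd->tick_gtod + delta,
         *               max(scd->tick_gtod, scd->clock),
         *               max(scd->clock, scd->tick_gtod + TICK_NSEC));
         */
        min_clock = wrap_max(scd->tick_gtod, scd->clock);
        max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC);

        clock = wrap_max(clock, min_clock);
        clock = wrap_min(clock, max_clock);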
Cc: Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 5 ++++- kernel/time/timekeeping.c | 7 +++++-- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index e8ab096ddfe..a0b0852414c 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -124,7 +124,7 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) clock = scd->tick_gtod + delta; min_clock = wrap_max(scd->tick_gtod, scd->clock); - max_clock = scd->tick_gtod + TICK_NSEC; + max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC); clock = wrap_max(clock, min_clock); clock = wrap_min(clock, max_clock); @@ -227,6 +227,9 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); */ void sched_clock_idle_wakeup_event(u64 delta_ns) { + if (timekeeping_suspended) + return; + sched_clock_tick(); touch_softlockup_watchdog(); } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index fa05e88aa76..900f1b6598d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -46,6 +46,9 @@ struct timespec xtime __attribute__ ((aligned (16))); struct timespec wall_to_monotonic __attribute__ ((aligned (16))); static unsigned long total_sleep_time; /* seconds */ +/* flag for if timekeeping is suspended */ +int __read_mostly timekeeping_suspended; + static struct timespec xtime_cache __attribute__ ((aligned (16))); void update_xtime_cache(u64 nsec) { @@ -92,6 +95,8 @@ void getnstimeofday(struct timespec *ts) unsigned long seq; s64 nsecs; + WARN_ON(timekeeping_suspended); + do { seq = read_seqbegin(&xtime_lock); @@ -299,8 +304,6 @@ void __init timekeeping_init(void) write_sequnlock_irqrestore(&xtime_lock, flags); } -/* flag for if timekeeping is suspended */ -static int timekeeping_suspended; /* time in seconds when suspend began */ static unsigned long timekeeping_suspend_time; -- cgit v1.2.3 From 457533a7d3402d1d91fbc125c8bd1bd16dcd3cd4 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Wed, 31 Dec 2008 15:11:37 +0100 Subject: [PATCH] fix scaled & unscaled cputime accounting The utimescaled / stimescaled fields in the task structure and the global cpustat should be set on all architectures. On s390 the calls to account_user_time_scaled and account_system_time_scaled never have been added. In addition system time that is accounted as guest time to the user time of a process is accounted to the scaled system time instead of the scaled user time. To fix the bugs and to prevent future forgetfulness this patch merges account_system_time_scaled into account_system_time and account_user_time_scaled into account_user_time. Cc: Benjamin Herrenschmidt Cc: Hidetoshi Seto Cc: Tony Luck Cc: Jeremy Fitzhardinge Cc: Chris Wright Cc: Michael Neuling Acked-by: Paul Mackerras Signed-off-by: Martin Schwidefsky --- kernel/sched.c | 41 ++++++++++++++++------------------------- kernel/time/tick-sched.c | 5 +++-- kernel/timer.c | 12 +++++------- 3 files changed, 24 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index fff1c4a20b6..5b03679ff71 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4080,13 +4080,17 @@ unsigned long long task_delta_exec(struct task_struct *p) * Account user cpu time to a process. 
* @p: the process that the cpu time gets accounted to * @cputime: the cpu time spent in user space since the last update + * @cputime_scaled: cputime scaled by cpu frequency */ -void account_user_time(struct task_struct *p, cputime_t cputime) +void account_user_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; cputime64_t tmp; + /* Add user time to process. */ p->utime = cputime_add(p->utime, cputime); + p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); account_group_user_time(p, cputime); /* Add user time to cpustat. */ @@ -4103,51 +4107,49 @@ void account_user_time(struct task_struct *p, cputime_t cputime) * Account guest cpu time to a process. * @p: the process that the cpu time gets accounted to * @cputime: the cpu time spent in virtual machine since the last update + * @cputime_scaled: cputime scaled by cpu frequency */ -static void account_guest_time(struct task_struct *p, cputime_t cputime) +static void account_guest_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled) { cputime64_t tmp; struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; tmp = cputime_to_cputime64(cputime); + /* Add guest time to process. */ p->utime = cputime_add(p->utime, cputime); + p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); account_group_user_time(p, cputime); p->gtime = cputime_add(p->gtime, cputime); + /* Add guest time to cpustat. */ cpustat->user = cputime64_add(cpustat->user, tmp); cpustat->guest = cputime64_add(cpustat->guest, tmp); } -/* - * Account scaled user cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in user space since the last update - */ -void account_user_time_scaled(struct task_struct *p, cputime_t cputime) -{ - p->utimescaled = cputime_add(p->utimescaled, cputime); -} - /* * Account system cpu time to a process. * @p: the process that the cpu time gets accounted to * @hardirq_offset: the offset to subtract from hardirq_count() * @cputime: the cpu time spent in kernel space since the last update + * @cputime_scaled: cputime scaled by cpu frequency */ void account_system_time(struct task_struct *p, int hardirq_offset, - cputime_t cputime) + cputime_t cputime, cputime_t cputime_scaled) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; struct rq *rq = this_rq(); cputime64_t tmp; if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { - account_guest_time(p, cputime); + account_guest_time(p, cputime, cputime_scaled); return; } + /* Add system time to process. */ p->stime = cputime_add(p->stime, cputime); + p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); account_group_system_time(p, cputime); /* Add system time to cpustat. */ @@ -4166,17 +4168,6 @@ void account_system_time(struct task_struct *p, int hardirq_offset, acct_update_integrals(p); } -/* - * Account scaled system cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @hardirq_offset: the offset to subtract from hardirq_count() - * @cputime: the cpu time spent in kernel space since the last update - */ -void account_system_time_scaled(struct task_struct *p, cputime_t cputime) -{ - p->stimescaled = cputime_add(p->stimescaled, cputime); -} - /* * Account for involuntary wait time. 
* @p: the process from which the cpu time has been stolen diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 8f3fc2582d3..1f2fce2479f 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -420,6 +420,7 @@ void tick_nohz_restart_sched_tick(void) int cpu = smp_processor_id(); struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); unsigned long ticks; + cputime_t cputime; ktime_t now; local_irq_disable(); @@ -452,8 +453,8 @@ void tick_nohz_restart_sched_tick(void) */ if (ticks && ticks < LONG_MAX) { add_preempt_count(HARDIRQ_OFFSET); - account_system_time(current, HARDIRQ_OFFSET, - jiffies_to_cputime(ticks)); + cputime = jiffies_to_cputime(ticks); + account_system_time(current, HARDIRQ_OFFSET, cputime, cputime); sub_preempt_count(HARDIRQ_OFFSET); } diff --git a/kernel/timer.c b/kernel/timer.c index 566257d1dc1..b5efb528aa1 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1023,13 +1023,11 @@ void account_process_tick(struct task_struct *p, int user_tick) { cputime_t one_jiffy = jiffies_to_cputime(1); - if (user_tick) { - account_user_time(p, one_jiffy); - account_user_time_scaled(p, cputime_to_scaled(one_jiffy)); - } else { - account_system_time(p, HARDIRQ_OFFSET, one_jiffy); - account_system_time_scaled(p, cputime_to_scaled(one_jiffy)); - } + if (user_tick) + account_user_time(p, one_jiffy, cputime_to_scaled(one_jiffy)); + else + account_system_time(p, HARDIRQ_OFFSET, one_jiffy, + cputime_to_scaled(one_jiffy)); } #endif -- cgit v1.2.3 From 79741dd35713ff4f6fd0eafd59fa94e8a4ba922d Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Wed, 31 Dec 2008 15:11:38 +0100 Subject: [PATCH] idle cputime accounting The cpu time spent by the idle process actually doing something is currently accounted as idle time. This is plain wrong; the architectures that support VIRT_CPU_ACCOUNTING=y can do better: distinguish between the time spent doing nothing and the time spent by idle doing work. The first is accounted with account_idle_time and the second with account_system_time. The architectures that use the account_xxx_time interface directly and not the account_xxx_ticks interface now need to do the check for the idle process in their arch code. In particular, to improve the system vs. true idle time accounting, the arch code needs to measure the true idle time instead of just testing for the idle process. To improve the tick based accounting as well, we would need an architecture primitive that can tell us if the pt_regs of the interrupted context points to the magic instruction that halts the cpu. In addition, idle time is no longer added to the stime of the idle process. This field now contains the system time of the idle process as it should be. On systems without VIRT_CPU_ACCOUNTING this will always be zero as every tick that occurs while idle is running will be accounted as idle time. This patch contains the necessary common code changes to be able to distinguish idle system time and true idle time. The architectures with support for VIRT_CPU_ACCOUNTING need some changes to exploit this.
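The resulting single-tick dispatch, as it appears in the diff below: a user tick is charged to utime, a system tick on a normal task to stime, and a tick that lands on the idle task goes to the idle/iowait buckets via account_idle_time():

        void account_process_tick(struct task_struct *p, int user_tick)
        {
                cputime_t one_jiffy = jiffies_to_cputime(1);
                cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy);
                struct rq *rq = this_rq();

                if (user_tick)
                        account_user_time(p, one_jiffy, one_jiffy_scaled);
                else if (p != rq->idle)
                        account_system_time(p, HARDIRQ_OFFSET, one_jiffy,
                                            one_jiffy_scaled);
                else
                        account_idle_time(one_jiffy);
        }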
Signed-off-by: Martin Schwidefsky --- kernel/sched.c | 80 ++++++++++++++++++++++++++++++++++++++---------- kernel/time/tick-sched.c | 13 ++++---- kernel/timer.c | 13 -------- 3 files changed, 69 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 5b03679ff71..635eaffe1e4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4139,7 +4139,6 @@ void account_system_time(struct task_struct *p, int hardirq_offset, cputime_t cputime, cputime_t cputime_scaled) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - struct rq *rq = this_rq(); cputime64_t tmp; if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { @@ -4158,37 +4157,84 @@ void account_system_time(struct task_struct *p, int hardirq_offset, cpustat->irq = cputime64_add(cpustat->irq, tmp); else if (softirq_count()) cpustat->softirq = cputime64_add(cpustat->softirq, tmp); - else if (p != rq->idle) - cpustat->system = cputime64_add(cpustat->system, tmp); - else if (atomic_read(&rq->nr_iowait) > 0) - cpustat->iowait = cputime64_add(cpustat->iowait, tmp); else - cpustat->idle = cputime64_add(cpustat->idle, tmp); + cpustat->system = cputime64_add(cpustat->system, tmp); + /* Account for system time used */ acct_update_integrals(p); } /* * Account for involuntary wait time. - * @p: the process from which the cpu time has been stolen * @steal: the cpu time spent in involuntary wait */ -void account_steal_time(struct task_struct *p, cputime_t steal) +void account_steal_time(cputime_t cputime) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t cputime64 = cputime_to_cputime64(cputime); + + cpustat->steal = cputime64_add(cpustat->steal, cputime64); +} + +/* + * Account for idle time. + * @cputime: the cpu time spent in idle wait + */ +void account_idle_time(cputime_t cputime) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - cputime64_t tmp = cputime_to_cputime64(steal); + cputime64_t cputime64 = cputime_to_cputime64(cputime); struct rq *rq = this_rq(); - if (p == rq->idle) { - p->stime = cputime_add(p->stime, steal); - if (atomic_read(&rq->nr_iowait) > 0) - cpustat->iowait = cputime64_add(cpustat->iowait, tmp); - else - cpustat->idle = cputime64_add(cpustat->idle, tmp); - } else - cpustat->steal = cputime64_add(cpustat->steal, tmp); + if (atomic_read(&rq->nr_iowait) > 0) + cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); + else + cpustat->idle = cputime64_add(cpustat->idle, cputime64); +} + +#ifndef CONFIG_VIRT_CPU_ACCOUNTING + +/* + * Account a single tick of cpu time. + * @p: the process that the cpu time gets accounted to + * @user_tick: indicates if the tick is a user or a system tick + */ +void account_process_tick(struct task_struct *p, int user_tick) +{ + cputime_t one_jiffy = jiffies_to_cputime(1); + cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy); + struct rq *rq = this_rq(); + + if (user_tick) + account_user_time(p, one_jiffy, one_jiffy_scaled); + else if (p != rq->idle) + account_system_time(p, HARDIRQ_OFFSET, one_jiffy, + one_jiffy_scaled); + else + account_idle_time(one_jiffy); +} + +/* + * Account multiple ticks of steal time. + * @p: the process from which the cpu time has been stolen + * @ticks: number of stolen ticks + */ +void account_steal_ticks(unsigned long ticks) +{ + account_steal_time(jiffies_to_cputime(ticks)); +} + +/* + * Account multiple ticks of idle time. 
+ * @ticks: number of stolen ticks + */ +void account_idle_ticks(unsigned long ticks) +{ + account_idle_time(jiffies_to_cputime(ticks)); } +#endif + /* * Use precise platform statistics if available: */ diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 1f2fce2479f..611fa4c0baa 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -419,8 +419,9 @@ void tick_nohz_restart_sched_tick(void) { int cpu = smp_processor_id(); struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); +#ifndef CONFIG_VIRT_CPU_ACCOUNTING unsigned long ticks; - cputime_t cputime; +#endif ktime_t now; local_irq_disable(); @@ -442,6 +443,7 @@ void tick_nohz_restart_sched_tick(void) tick_do_update_jiffies64(now); cpu_clear(cpu, nohz_cpu_mask); +#ifndef CONFIG_VIRT_CPU_ACCOUNTING /* * We stopped the tick in idle. Update process times would miss the * time we slept as update_process_times does only a 1 tick @@ -451,12 +453,9 @@ void tick_nohz_restart_sched_tick(void) /* * We might be one off. Do not randomly account a huge number of ticks! */ - if (ticks && ticks < LONG_MAX) { - add_preempt_count(HARDIRQ_OFFSET); - cputime = jiffies_to_cputime(ticks); - account_system_time(current, HARDIRQ_OFFSET, cputime, cputime); - sub_preempt_count(HARDIRQ_OFFSET); - } + if (ticks && ticks < LONG_MAX) + account_idle_ticks(ticks); +#endif touch_softlockup_watchdog(); /* diff --git a/kernel/timer.c b/kernel/timer.c index b5efb528aa1..dee3f641a7a 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1018,19 +1018,6 @@ unsigned long get_next_timer_interrupt(unsigned long now) } #endif -#ifndef CONFIG_VIRT_CPU_ACCOUNTING -void account_process_tick(struct task_struct *p, int user_tick) -{ - cputime_t one_jiffy = jiffies_to_cputime(1); - - if (user_tick) - account_user_time(p, one_jiffy, cputime_to_scaled(one_jiffy)); - else - account_system_time(p, HARDIRQ_OFFSET, one_jiffy, - cputime_to_scaled(one_jiffy)); -} -#endif - /* * Called from the timer interrupt handler to charge one tick to the current * process. user_tick is 1 if the tick is user time, 0 for system. -- cgit v1.2.3 From 4f4b6c1a94a8735bbdc030a2911cf395495645b6 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 1 Jan 2009 10:12:15 +1030 Subject: cpumask: prepare for iterators to only go to nr_cpu_ids/nr_cpumask_bits.: core Impact: cleanup In future, all cpumask ops will only be valid (in general) for bit numbers < nr_cpu_ids. So use that instead of NR_CPUS in iterators and other comparisons. This is always safe: no cpu number can be >= nr_cpu_ids, and nr_cpu_ids is initialized to NR_CPUS at boot. Signed-off-by: Rusty Russell Signed-off-by: Mike Travis Acked-by: Ingo Molnar Acked-by: James Morris Cc: Eric Biederman --- kernel/kexec.c | 2 +- kernel/smp.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index ac0fde7b54d..3fb855ad6aa 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1116,7 +1116,7 @@ void crash_save_cpu(struct pt_regs *regs, int cpu) struct elf_prstatus prstatus; u32 *buf; - if ((cpu < 0) || (cpu >= NR_CPUS)) + if ((cpu < 0) || (cpu >= nr_cpu_ids)) return; /* Using ELF notes here is opportunistic. 
diff --git a/kernel/smp.c b/kernel/smp.c index 172b1826890..5cfa0e5e3e8 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -223,7 +223,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, local_irq_save(flags); func(info); local_irq_restore(flags); - } else if ((unsigned)cpu < NR_CPUS && cpu_online(cpu)) { + } else if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { struct call_single_data *data = NULL; if (!wait) { -- cgit v1.2.3 From 9e01c1b74c9531e301c900edaa92a99fcb7738f2 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 1 Jan 2009 10:12:22 +1030 Subject: cpumask: convert kernel trace functions Impact: Reduce future memory usage, use new cpumask API. (Eventually, cpumask_var_t will be allocated based on nr_cpu_ids, not NR_CPUS). Convert kernel trace functions to use struct cpumask API: 1) Use cpumask_copy/cpumask_test_cpu/for_each_cpu. 2) Use cpumask_var_t and alloc_cpumask_var/free_cpumask_var everywhere. 3) Use on_each_cpu instead of playing with current->cpus_allowed. Signed-off-by: Rusty Russell Signed-off-by: Mike Travis Acked-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 42 ++++++++++++++++++------------- kernel/trace/trace.c | 60 ++++++++++++++++++++++++++------------------ kernel/trace/trace_sysprof.c | 13 +++------- 3 files changed, 64 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1d601a7c458..a9d9760dc7b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -195,7 +195,7 @@ void *ring_buffer_event_data(struct ring_buffer_event *event) EXPORT_SYMBOL_GPL(ring_buffer_event_data); #define for_each_buffer_cpu(buffer, cpu) \ - for_each_cpu_mask(cpu, buffer->cpumask) + for_each_cpu(cpu, buffer->cpumask) #define TS_SHIFT 27 #define TS_MASK ((1ULL << TS_SHIFT) - 1) @@ -267,7 +267,7 @@ struct ring_buffer { unsigned pages; unsigned flags; int cpus; - cpumask_t cpumask; + cpumask_var_t cpumask; atomic_t record_disabled; struct mutex mutex; @@ -458,6 +458,9 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) if (!buffer) return NULL; + if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) + goto fail_free_buffer; + buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); buffer->flags = flags; @@ -465,14 +468,14 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) if (buffer->pages == 1) buffer->pages++; - buffer->cpumask = cpu_possible_map; + cpumask_copy(buffer->cpumask, cpu_possible_mask); buffer->cpus = nr_cpu_ids; bsize = sizeof(void *) * nr_cpu_ids; buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()), GFP_KERNEL); if (!buffer->buffers) - goto fail_free_buffer; + goto fail_free_cpumask; for_each_buffer_cpu(buffer, cpu) { buffer->buffers[cpu] = @@ -492,6 +495,9 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) } kfree(buffer->buffers); + fail_free_cpumask: + free_cpumask_var(buffer->cpumask); + fail_free_buffer: kfree(buffer); return NULL; @@ -510,6 +516,8 @@ ring_buffer_free(struct ring_buffer *buffer) for_each_buffer_cpu(buffer, cpu) rb_free_cpu_buffer(buffer->buffers[cpu]); + free_cpumask_var(buffer->cpumask); + kfree(buffer); } EXPORT_SYMBOL_GPL(ring_buffer_free); @@ -1283,7 +1291,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, cpu = raw_smp_processor_id(); - if (!cpu_isset(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask)) goto out; cpu_buffer = buffer->buffers[cpu]; @@ -1396,7 +1404,7 @@ int ring_buffer_write(struct ring_buffer 
*buffer, cpu = raw_smp_processor_id(); - if (!cpu_isset(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask)) goto out; cpu_buffer = buffer->buffers[cpu]; @@ -1478,7 +1486,7 @@ void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - if (!cpu_isset(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; cpu_buffer = buffer->buffers[cpu]; @@ -1498,7 +1506,7 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - if (!cpu_isset(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; cpu_buffer = buffer->buffers[cpu]; @@ -1515,7 +1523,7 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - if (!cpu_isset(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; @@ -1532,7 +1540,7 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - if (!cpu_isset(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; @@ -1850,7 +1858,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) struct buffer_page *reader; int nr_loops = 0; - if (!cpu_isset(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; cpu_buffer = buffer->buffers[cpu]; @@ -2025,7 +2033,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) struct ring_buffer_event *event; unsigned long flags; - if (!cpu_isset(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; spin_lock_irqsave(&cpu_buffer->reader_lock, flags); @@ -2062,7 +2070,7 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu) struct ring_buffer_iter *iter; unsigned long flags; - if (!cpu_isset(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; iter = kmalloc(sizeof(*iter), GFP_KERNEL); @@ -2172,7 +2180,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; unsigned long flags; - if (!cpu_isset(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; spin_lock_irqsave(&cpu_buffer->reader_lock, flags); @@ -2228,7 +2236,7 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - if (!cpu_isset(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 1; cpu_buffer = buffer->buffers[cpu]; @@ -2252,8 +2260,8 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, struct ring_buffer_per_cpu *cpu_buffer_a; struct ring_buffer_per_cpu *cpu_buffer_b; - if (!cpu_isset(cpu, buffer_a->cpumask) || - !cpu_isset(cpu, buffer_b->cpumask)) + if (!cpumask_test_cpu(cpu, buffer_a->cpumask) || + !cpumask_test_cpu(cpu, buffer_b->cpumask)) return -EINVAL; /* At least make sure the two buffers are somewhat the same */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0e91f43b6ba..5d04e27f3b4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -89,10 +89,10 @@ static inline void ftrace_enable_cpu(void) preempt_enable(); } -static cpumask_t __read_mostly tracing_buffer_mask; +static cpumask_var_t __read_mostly tracing_buffer_mask; #define for_each_tracing_cpu(cpu) \ - for_each_cpu_mask(cpu, tracing_buffer_mask) + for_each_cpu(cpu, tracing_buffer_mask) /* * 
ftrace_dump_on_oops - variable to dump ftrace buffer on oops @@ -2646,13 +2646,7 @@ static struct file_operations show_traces_fops = { /* * Only trace on a CPU if the bitmask is set: */ -static cpumask_t tracing_cpumask = CPU_MASK_ALL; - -/* - * When tracing/tracing_cpu_mask is modified then this holds - * the new bitmask we are about to install: - */ -static cpumask_t tracing_cpumask_new; +static cpumask_var_t tracing_cpumask; /* * The tracer itself will not take this lock, but still we want @@ -2674,7 +2668,7 @@ tracing_cpumask_read(struct file *filp, char __user *ubuf, mutex_lock(&tracing_cpumask_update_lock); - len = cpumask_scnprintf(mask_str, count, &tracing_cpumask); + len = cpumask_scnprintf(mask_str, count, tracing_cpumask); if (count - len < 2) { count = -EINVAL; goto out_err; @@ -2693,9 +2687,13 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, size_t count, loff_t *ppos) { int err, cpu; + cpumask_var_t tracing_cpumask_new; + + if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) + return -ENOMEM; mutex_lock(&tracing_cpumask_update_lock); - err = cpumask_parse_user(ubuf, count, &tracing_cpumask_new); + err = cpumask_parse_user(ubuf, count, tracing_cpumask_new); if (err) goto err_unlock; @@ -2706,26 +2704,28 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, * Increase/decrease the disabled counter if we are * about to flip a bit in the cpumask: */ - if (cpu_isset(cpu, tracing_cpumask) && - !cpu_isset(cpu, tracing_cpumask_new)) { + if (cpumask_test_cpu(cpu, tracing_cpumask) && + !cpumask_test_cpu(cpu, tracing_cpumask_new)) { atomic_inc(&global_trace.data[cpu]->disabled); } - if (!cpu_isset(cpu, tracing_cpumask) && - cpu_isset(cpu, tracing_cpumask_new)) { + if (!cpumask_test_cpu(cpu, tracing_cpumask) && + cpumask_test_cpu(cpu, tracing_cpumask_new)) { atomic_dec(&global_trace.data[cpu]->disabled); } } __raw_spin_unlock(&ftrace_max_lock); local_irq_enable(); - tracing_cpumask = tracing_cpumask_new; + cpumask_copy(tracing_cpumask, tracing_cpumask_new); mutex_unlock(&tracing_cpumask_update_lock); + free_cpumask_var(tracing_cpumask_new); return count; err_unlock: mutex_unlock(&tracing_cpumask_update_lock); + free_cpumask_var(tracing_cpumask); return err; } @@ -3752,7 +3752,6 @@ void ftrace_dump(void) static DEFINE_SPINLOCK(ftrace_dump_lock); /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; - static cpumask_t mask; static int dump_ran; unsigned long flags; int cnt = 0, cpu; @@ -3786,8 +3785,6 @@ void ftrace_dump(void) * and then release the locks again. 
*/ - cpus_clear(mask); - while (!trace_empty(&iter)) { if (!cnt) @@ -3823,19 +3820,28 @@ __init static int tracer_alloc_buffers(void) { struct trace_array_cpu *data; int i; + int ret = -ENOMEM; - /* TODO: make the number of buffers hot pluggable with CPUS */ - tracing_buffer_mask = cpu_possible_map; + if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) + goto out; + + if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) + goto out_free_buffer_mask; + cpumask_copy(tracing_buffer_mask, cpu_possible_mask); + cpumask_copy(tracing_cpumask, cpu_all_mask); + + /* TODO: make the number of buffers hot pluggable with CPUS */ global_trace.buffer = ring_buffer_alloc(trace_buf_size, TRACE_BUFFER_FLAGS); if (!global_trace.buffer) { printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); WARN_ON(1); - return 0; + goto out_free_cpumask; } global_trace.entries = ring_buffer_size(global_trace.buffer); + #ifdef CONFIG_TRACER_MAX_TRACE max_tr.buffer = ring_buffer_alloc(trace_buf_size, TRACE_BUFFER_FLAGS); @@ -3843,7 +3849,7 @@ __init static int tracer_alloc_buffers(void) printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); WARN_ON(1); ring_buffer_free(global_trace.buffer); - return 0; + goto out_free_cpumask; } max_tr.entries = ring_buffer_size(max_tr.buffer); WARN_ON(max_tr.entries != global_trace.entries); @@ -3873,8 +3879,14 @@ __init static int tracer_alloc_buffers(void) &trace_panic_notifier); register_die_notifier(&trace_die_notifier); + ret = 0; - return 0; +out_free_cpumask: + free_cpumask_var(tracing_cpumask); +out_free_buffer_mask: + free_cpumask_var(tracing_buffer_mask); +out: + return ret; } early_initcall(tracer_alloc_buffers); fs_initcall(tracer_init_debugfs); diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index a5779bd975d..eaca5ad803f 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -196,9 +196,9 @@ static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer) return HRTIMER_RESTART; } -static void start_stack_timer(int cpu) +static void start_stack_timer(void *unused) { - struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu); + struct hrtimer *hrtimer = &__get_cpu_var(stack_trace_hrtimer); hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer->function = stack_trace_timer_fn; @@ -208,14 +208,7 @@ static void start_stack_timer(int cpu) static void start_stack_timers(void) { - cpumask_t saved_mask = current->cpus_allowed; - int cpu; - - for_each_online_cpu(cpu) { - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - start_stack_timer(cpu); - } - set_cpus_allowed_ptr(current, &saved_mask); + on_each_cpu(start_stack_timer, NULL, 1); } static void stop_stack_timer(int cpu) -- cgit v1.2.3 From 4462344ee9ea9224d026801b877887f2f39774a3 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 1 Jan 2009 10:12:23 +1030 Subject: cpumask: convert kernel trace functions further Impact: Reduce future memory usage, use new cpumask API. Since the last patch was created and acked, more old cpumask users slipped into kernel/trace. Mostly trivial conversions, except struct trace_iterator's "started" member becomes a cpumask_var_t. 
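The "started" member conversion follows the allocate-with-the-object, free-with-the-object lifetime that tracing_open_pipe()/tracing_release_pipe() adopt above. A minimal sketch of that pattern in isolation; the struct and function names below are illustrative stand-ins, not the actual trace code:

#include <linux/cpumask.h>
#include <linux/slab.h>

/* Illustrative container standing in for struct trace_iterator. */
struct demo_iter {
	cpumask_var_t started;	/* a pointer when CONFIG_CPUMASK_OFFSTACK=y */
};

static struct demo_iter *demo_iter_open(void)
{
	struct demo_iter *iter = kzalloc(sizeof(*iter), GFP_KERNEL);

	if (!iter)
		return NULL;
	/* Off-stack allocation can fail, so the result must be checked. */
	if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {
		kfree(iter);
		return NULL;
	}
	cpumask_setall(iter->started);
	return iter;
}

static void demo_iter_release(struct demo_iter *iter)
{
	free_cpumask_var(iter->started);	/* free the member first */
	kfree(iter);
}

The release order mirrors tracing_release_pipe(): the member mask is freed before the containing object.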
Signed-off-by: Rusty Russell --- kernel/trace/trace.c | 12 +++++++++--- kernel/trace/trace.h | 2 +- kernel/trace/trace_boot.c | 2 +- kernel/trace/trace_functions_graph.c | 2 +- kernel/trace/trace_hw_branches.c | 6 +++--- kernel/trace/trace_power.c | 2 +- 6 files changed, 16 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5d04e27f3b4..c580233add9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1811,10 +1811,10 @@ static void test_cpu_buff_start(struct trace_iterator *iter) if (!(iter->iter_flags & TRACE_FILE_ANNOTATE)) return; - if (cpu_isset(iter->cpu, iter->started)) + if (cpumask_test_cpu(iter->cpu, iter->started)) return; - cpu_set(iter->cpu, iter->started); + cpumask_set_cpu(iter->cpu, iter->started); trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu); } @@ -3114,10 +3114,15 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) if (!iter) return -ENOMEM; + if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { + kfree(iter); + return -ENOMEM; + } + mutex_lock(&trace_types_lock); /* trace pipe does not show start of buffer */ - cpus_setall(iter->started); + cpumask_setall(iter->started); iter->tr = &global_trace; iter->trace = current_trace; @@ -3134,6 +3139,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) { struct trace_iterator *iter = file->private_data; + free_cpumask_var(iter->started); kfree(iter); atomic_dec(&tracing_reader); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index cc7a4f86403..4d3d381bfd9 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -368,7 +368,7 @@ struct trace_iterator { loff_t pos; long idx; - cpumask_t started; + cpumask_var_t started; }; int tracing_is_enabled(void); diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 3ccebde2848..366c8c333e1 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -42,7 +42,7 @@ static int boot_trace_init(struct trace_array *tr) int cpu; boot_trace = tr; - for_each_cpu_mask(cpu, cpu_possible_map) + for_each_cpu(cpu, cpu_possible_mask) tracing_reset(tr, cpu); tracing_sched_switch_assign_trace(tr); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 4bf39fcae97..930c08e5b38 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -79,7 +79,7 @@ print_graph_cpu(struct trace_seq *s, int cpu) int i; int ret; int log10_this = log10_cpu(cpu); - int log10_all = log10_cpu(cpus_weight_nr(cpu_online_map)); + int log10_all = log10_cpu(cpumask_weight(cpu_online_mask)); /* diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index b6a3e20a49a..649df22d435 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -46,7 +46,7 @@ static void bts_trace_start(struct trace_array *tr) tracing_reset_online_cpus(tr); - for_each_cpu_mask(cpu, cpu_possible_map) + for_each_cpu(cpu, cpu_possible_mask) smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1); } @@ -62,7 +62,7 @@ static void bts_trace_stop(struct trace_array *tr) { int cpu; - for_each_cpu_mask(cpu, cpu_possible_map) + for_each_cpu(cpu, cpu_possible_mask) smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1); } @@ -172,7 +172,7 @@ static void trace_bts_prepare(struct trace_iterator *iter) { int cpu; - for_each_cpu_mask(cpu, cpu_possible_map) + for_each_cpu(cpu, cpu_possible_mask) smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 
1); } diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c index a7172a352f6..7bda248daf5 100644 --- a/kernel/trace/trace_power.c +++ b/kernel/trace/trace_power.c @@ -39,7 +39,7 @@ static int power_trace_init(struct trace_array *tr) trace_power_enabled = 1; - for_each_cpu_mask(cpu, cpu_possible_map) + for_each_cpu(cpu, cpu_possible_mask) tracing_reset(tr, cpu); return 0; } -- cgit v1.2.3 From f1fc057c79cb2d27602fb3ad08a031f13459ef27 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 1 Jan 2009 10:12:23 +1030 Subject: cpumask: remove any_online_cpu() users: kernel/ Impact: Remove obsolete API usage any_online_cpu() is a good name, but it takes a cpumask_t, not a pointer. There are several places where any_online_cpu() doesn't really want a mask arg at all. Replace all callers with cpumask_any() and cpumask_any_and(). Signed-off-by: Rusty Russell Signed-off-by: Mike Travis --- kernel/softirq.c | 2 +- kernel/softlockup.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 466e75ce271..b7568d7def2 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -733,7 +733,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, break; /* Unbind so it can run. Fall thru. */ kthread_bind(per_cpu(ksoftirqd, hotcpu), - any_online_cpu(cpu_online_map)); + cpumask_any(cpu_online_mask)); case CPU_DEAD: case CPU_DEAD_FROZEN: { struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 1ab790c67b1..492f0c72fec 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -303,7 +303,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: - check_cpu = any_online_cpu(cpu_online_map); + check_cpu = cpumask_any(cpu_online_mask); wake_up_process(per_cpu(watchdog_task, hotcpu)); break; #ifdef CONFIG_HOTPLUG_CPU @@ -313,7 +313,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) cpumask_t temp_cpu_online_map = cpu_online_map; cpu_clear(hotcpu, temp_cpu_online_map); - check_cpu = any_online_cpu(temp_cpu_online_map); + check_cpu = cpumask_any(&temp_cpu_online_map); } break; @@ -323,7 +323,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) break; /* Unbind so it can run. Fall thru. */ kthread_bind(per_cpu(watchdog_task, hotcpu), - any_online_cpu(cpu_online_map)); + cpumask_any(cpu_online_mask)); case CPU_DEAD: case CPU_DEAD_FROZEN: p = per_cpu(watchdog_task, hotcpu); -- cgit v1.2.3 From a45185d2d7108b01b90b9e0293377be4d6346dde Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 1 Jan 2009 10:12:24 +1030 Subject: cpumask: convert kernel/compat.c Impact: Reduce stack usage, use new cpumask API. Straightforward conversion; cpumasks' size is given by cpumask_size() (now a variable rather than fixed) and on-stack cpu masks use cpumask_var_t. 
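The stack saving in the compat.c conversion comes from cpumask_var_t being pointer-sized on CONFIG_CPUMASK_OFFSTACK=y kernels, at the price of an allocation that can fail. A hedged before/after sketch of the shape used above; the function names are invented for illustration:

/* Before: NR_CPUS bits live on the kernel stack. */
static int demo_old(void)
{
	cpumask_t mask = cpu_online_map;	/* stack hog for large NR_CPUS */

	return first_cpu(mask);
}

/* After: pointer-sized local, heap allocation, explicit error path. */
static int demo_new(void)
{
	cpumask_var_t mask;
	int cpu;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;
	cpumask_copy(mask, cpu_online_mask);
	cpu = cpumask_first(mask);
	free_cpumask_var(mask);
	return cpu;
}

Every early return between alloc_cpumask_var() and free_cpumask_var() now needs cleanup, which is why the converted syscalls above grow out/free labels.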
Signed-off-by: Rusty Russell --- kernel/compat.c | 49 ++++++++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 8eafe3eb50d..d52e2ec1deb 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -454,16 +454,16 @@ asmlinkage long compat_sys_waitid(int which, compat_pid_t pid, } static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr, - unsigned len, cpumask_t *new_mask) + unsigned len, struct cpumask *new_mask) { unsigned long *k; - if (len < sizeof(cpumask_t)) - memset(new_mask, 0, sizeof(cpumask_t)); - else if (len > sizeof(cpumask_t)) - len = sizeof(cpumask_t); + if (len < cpumask_size()) + memset(new_mask, 0, cpumask_size()); + else if (len > cpumask_size()) + len = cpumask_size(); - k = cpus_addr(*new_mask); + k = cpumask_bits(new_mask); return compat_get_bitmap(k, user_mask_ptr, len * 8); } @@ -471,40 +471,51 @@ asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid, unsigned int len, compat_ulong_t __user *user_mask_ptr) { - cpumask_t new_mask; + cpumask_var_t new_mask; int retval; - retval = compat_get_user_cpu_mask(user_mask_ptr, len, &new_mask); + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) + return -ENOMEM; + + retval = compat_get_user_cpu_mask(user_mask_ptr, len, new_mask); if (retval) - return retval; + goto out; - return sched_setaffinity(pid, &new_mask); + retval = sched_setaffinity(pid, new_mask); +out: + free_cpumask_var(new_mask); + return retval; } asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, compat_ulong_t __user *user_mask_ptr) { int ret; - cpumask_t mask; + cpumask_var_t mask; unsigned long *k; - unsigned int min_length = sizeof(cpumask_t); + unsigned int min_length = cpumask_size(); - if (NR_CPUS <= BITS_PER_COMPAT_LONG) + if (nr_cpu_ids <= BITS_PER_COMPAT_LONG) min_length = sizeof(compat_ulong_t); if (len < min_length) return -EINVAL; - ret = sched_getaffinity(pid, &mask); + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + ret = sched_getaffinity(pid, mask); if (ret < 0) - return ret; + goto out; - k = cpus_addr(mask); + k = cpumask_bits(mask); ret = compat_put_bitmap(user_mask_ptr, k, min_length * 8); - if (ret) - return ret; + if (ret == 0) + ret = min_length; - return min_length; +out: + free_cpumask_var(mask); + return ret; } int get_compat_itimerspec(struct itimerspec *dst, -- cgit v1.2.3 From e7577c50f2fb2d1c167e2c04a4b4c2cc042acb82 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 1 Jan 2009 10:12:25 +1030 Subject: cpumask: convert kernel/workqueue.c Impact: Reduce memory usage, use new cpumask API. cpu_populated_map becomes a cpumask_var_t, and cpu_singlethread_map is simply a cpumask pointer: it's simply the cpumask containing the first possible CPU anyway. Signed-off-by: Rusty Russell --- kernel/workqueue.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4952322cba4..2f445833ae3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -73,7 +73,7 @@ static DEFINE_SPINLOCK(workqueue_lock); static LIST_HEAD(workqueues); static int singlethread_cpu __read_mostly; -static cpumask_t cpu_singlethread_map __read_mostly; +static const struct cpumask *cpu_singlethread_map __read_mostly; /* * _cpu_down() first removes CPU from cpu_online_map, then CPU_DEAD * flushes cwq->worklist. 
This means that flush_workqueue/wait_on_work @@ -81,7 +81,7 @@ static cpumask_t cpu_singlethread_map __read_mostly; * use cpu_possible_map, the cpumask below is more a documentation * than optimization. */ -static cpumask_t cpu_populated_map __read_mostly; +static cpumask_var_t cpu_populated_map __read_mostly; /* If it's single threaded, it isn't in the list of workqueues. */ static inline int is_wq_single_threaded(struct workqueue_struct *wq) @@ -89,10 +89,10 @@ static inline int is_wq_single_threaded(struct workqueue_struct *wq) return wq->singlethread; } -static const cpumask_t *wq_cpu_map(struct workqueue_struct *wq) +static const struct cpumask *wq_cpu_map(struct workqueue_struct *wq) { return is_wq_single_threaded(wq) - ? &cpu_singlethread_map : &cpu_populated_map; + ? cpu_singlethread_map : cpu_populated_map; } static @@ -410,7 +410,7 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) */ void flush_workqueue(struct workqueue_struct *wq) { - const cpumask_t *cpu_map = wq_cpu_map(wq); + const struct cpumask *cpu_map = wq_cpu_map(wq); int cpu; might_sleep(); @@ -532,7 +532,7 @@ static void wait_on_work(struct work_struct *work) { struct cpu_workqueue_struct *cwq; struct workqueue_struct *wq; - const cpumask_t *cpu_map; + const struct cpumask *cpu_map; int cpu; might_sleep(); @@ -903,7 +903,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) */ void destroy_workqueue(struct workqueue_struct *wq) { - const cpumask_t *cpu_map = wq_cpu_map(wq); + const struct cpumask *cpu_map = wq_cpu_map(wq); int cpu; cpu_maps_update_begin(); @@ -933,7 +933,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_UP_PREPARE: - cpu_set(cpu, cpu_populated_map); + cpumask_set_cpu(cpu, cpu_populated_map); } undo: list_for_each_entry(wq, &workqueues, list) { @@ -964,7 +964,7 @@ undo: switch (action) { case CPU_UP_CANCELED: case CPU_POST_DEAD: - cpu_clear(cpu, cpu_populated_map); + cpumask_clear_cpu(cpu, cpu_populated_map); } return ret; @@ -1017,9 +1017,11 @@ EXPORT_SYMBOL_GPL(work_on_cpu); void __init init_workqueues(void) { - cpu_populated_map = cpu_online_map; - singlethread_cpu = first_cpu(cpu_possible_map); - cpu_singlethread_map = cpumask_of_cpu(singlethread_cpu); + alloc_cpumask_var(&cpu_populated_map, GFP_KERNEL); + + cpumask_copy(cpu_populated_map, cpu_online_mask); + singlethread_cpu = cpumask_first(cpu_possible_mask); + cpu_singlethread_map = cpumask_of(singlethread_cpu); hotcpu_notifier(workqueue_cpu_callback, 0); keventd_wq = create_workqueue("events"); BUG_ON(!keventd_wq); -- cgit v1.2.3 From 6b954823c24f04ed026a8517f6bab5abda279db8 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 1 Jan 2009 10:12:25 +1030 Subject: cpumask: convert kernel time functions Impact: Use new APIs Convert kernel/time functions to use struct cpumask *. Note the ugly bitmap declarations in tick-broadcast.c. These should be cpumask_var_t, but there was no obvious initialization function to put the alloc_cpumask_var() calls in. This was safe. (Eventually 'struct cpumask' will be undefined for CONFIG_CPUMASK_OFFSTACK, so we use a bitmap here to show we really mean it). 
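The bitmap fallback mentioned above works because to_cpumask() reinterprets a DECLARE_BITMAP() array as a struct cpumask * for the new API, so a statically sized mask needs no allocation hook at all. A minimal sketch of the idiom; the names here are illustrative, not the tick code itself:

#include <linux/cpumask.h>

/* Static NR_CPUS-bit storage, as in tick-broadcast.c. */
static DECLARE_BITMAP(demo_mask, NR_CPUS);

static void demo_mark(int cpu)
{
	/* to_cpumask() bridges raw bitmap storage to the cpumask API. */
	cpumask_set_cpu(cpu, to_cpumask(demo_mask));
}

static int demo_marked(int cpu)
{
	return cpumask_test_cpu(cpu, to_cpumask(demo_mask));
}

The cost is that the storage is always NR_CPUS bits, which is exactly what cpumask_var_t exists to avoid; hence the FIXME comments in the diff.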
Signed-off-by: Rusty Russell Signed-off-by: Mike Travis --- kernel/time/clocksource.c | 6 +-- kernel/time/tick-broadcast.c | 113 ++++++++++++++++++++++--------------------- kernel/time/tick-common.c | 6 +-- 3 files changed, 64 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 9ed2eec9752..32141b15d63 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -148,7 +148,7 @@ static void clocksource_watchdog(unsigned long data) int next_cpu = next_cpu_nr(raw_smp_processor_id(), cpu_online_map); if (next_cpu >= nr_cpu_ids) - next_cpu = first_cpu(cpu_online_map); + next_cpu = cpumask_first(cpu_online_mask); watchdog_timer.expires += WATCHDOG_INTERVAL; add_timer_on(&watchdog_timer, next_cpu); } @@ -173,7 +173,7 @@ static void clocksource_check_watchdog(struct clocksource *cs) watchdog_last = watchdog->read(); watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; add_timer_on(&watchdog_timer, - first_cpu(cpu_online_map)); + cpumask_first(cpu_online_mask)); } } else { if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) @@ -195,7 +195,7 @@ static void clocksource_check_watchdog(struct clocksource *cs) watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; add_timer_on(&watchdog_timer, - first_cpu(cpu_online_map)); + cpumask_first(cpu_online_mask)); } } } diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 9590af2327b..356fac57a18 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -28,7 +28,9 @@ */ struct tick_device tick_broadcast_device; -static cpumask_t tick_broadcast_mask; +/* FIXME: Use cpumask_var_t. */ +static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS); +static DECLARE_BITMAP(tmpmask, NR_CPUS); static DEFINE_SPINLOCK(tick_broadcast_lock); static int tick_broadcast_force; @@ -46,9 +48,9 @@ struct tick_device *tick_get_broadcast_device(void) return &tick_broadcast_device; } -cpumask_t *tick_get_broadcast_mask(void) +struct cpumask *tick_get_broadcast_mask(void) { - return &tick_broadcast_mask; + return to_cpumask(tick_broadcast_mask); } /* @@ -72,7 +74,7 @@ int tick_check_broadcast_device(struct clock_event_device *dev) clockevents_exchange_device(NULL, dev); tick_broadcast_device.evtdev = dev; - if (!cpus_empty(tick_broadcast_mask)) + if (!cpumask_empty(tick_get_broadcast_mask())) tick_broadcast_start_periodic(dev); return 1; } @@ -104,7 +106,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) */ if (!tick_device_is_functional(dev)) { dev->event_handler = tick_handle_periodic; - cpu_set(cpu, tick_broadcast_mask); + cpumask_set_cpu(cpu, tick_get_broadcast_mask()); tick_broadcast_start_periodic(tick_broadcast_device.evtdev); ret = 1; } else { @@ -116,7 +118,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { int cpu = smp_processor_id(); - cpu_clear(cpu, tick_broadcast_mask); + cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); tick_broadcast_clear_oneshot(cpu); } } @@ -125,9 +127,9 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) } /* - * Broadcast the event to the cpus, which are set in the mask + * Broadcast the event to the cpus, which are set in the mask (mangled). 
*/ -static void tick_do_broadcast(cpumask_t mask) +static void tick_do_broadcast(struct cpumask *mask) { int cpu = smp_processor_id(); struct tick_device *td; @@ -135,22 +137,21 @@ static void tick_do_broadcast(cpumask_t mask) /* * Check, if the current cpu is in the mask */ - if (cpu_isset(cpu, mask)) { - cpu_clear(cpu, mask); + if (cpumask_test_cpu(cpu, mask)) { + cpumask_clear_cpu(cpu, mask); td = &per_cpu(tick_cpu_device, cpu); td->evtdev->event_handler(td->evtdev); } - if (!cpus_empty(mask)) { + if (!cpumask_empty(mask)) { /* * It might be necessary to actually check whether the devices * have different broadcast functions. For now, just use the * one of the first device. This works as long as we have this * misfeature only on x86 (lapic) */ - cpu = first_cpu(mask); - td = &per_cpu(tick_cpu_device, cpu); - td->evtdev->broadcast(&mask); + td = &per_cpu(tick_cpu_device, cpumask_first(mask)); + td->evtdev->broadcast(mask); } } @@ -160,12 +161,11 @@ static void tick_do_broadcast(cpumask_t mask) */ static void tick_do_periodic_broadcast(void) { - cpumask_t mask; - spin_lock(&tick_broadcast_lock); - cpus_and(mask, cpu_online_map, tick_broadcast_mask); - tick_do_broadcast(mask); + cpumask_and(to_cpumask(tmpmask), + cpu_online_mask, tick_get_broadcast_mask()); + tick_do_broadcast(to_cpumask(tmpmask)); spin_unlock(&tick_broadcast_lock); } @@ -228,13 +228,13 @@ static void tick_do_broadcast_on_off(void *why) if (!tick_device_is_functional(dev)) goto out; - bc_stopped = cpus_empty(tick_broadcast_mask); + bc_stopped = cpumask_empty(tick_get_broadcast_mask()); switch (*reason) { case CLOCK_EVT_NOTIFY_BROADCAST_ON: case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: - if (!cpu_isset(cpu, tick_broadcast_mask)) { - cpu_set(cpu, tick_broadcast_mask); + if (!cpumask_test_cpu(cpu, tick_get_broadcast_mask())) { + cpumask_set_cpu(cpu, tick_get_broadcast_mask()); if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) clockevents_shutdown(dev); @@ -244,8 +244,8 @@ static void tick_do_broadcast_on_off(void *why) break; case CLOCK_EVT_NOTIFY_BROADCAST_OFF: if (!tick_broadcast_force && - cpu_isset(cpu, tick_broadcast_mask)) { - cpu_clear(cpu, tick_broadcast_mask); + cpumask_test_cpu(cpu, tick_get_broadcast_mask())) { + cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) tick_setup_periodic(dev, 0); @@ -253,7 +253,7 @@ static void tick_do_broadcast_on_off(void *why) break; } - if (cpus_empty(tick_broadcast_mask)) { + if (cpumask_empty(tick_get_broadcast_mask())) { if (!bc_stopped) clockevents_shutdown(bc); } else if (bc_stopped) { @@ -272,7 +272,7 @@ out: */ void tick_broadcast_on_off(unsigned long reason, int *oncpu) { - if (!cpu_isset(*oncpu, cpu_online_map)) + if (!cpumask_test_cpu(*oncpu, cpu_online_mask)) printk(KERN_ERR "tick-broadcast: ignoring broadcast for " "offline CPU #%d\n", *oncpu); else @@ -303,10 +303,10 @@ void tick_shutdown_broadcast(unsigned int *cpup) spin_lock_irqsave(&tick_broadcast_lock, flags); bc = tick_broadcast_device.evtdev; - cpu_clear(cpu, tick_broadcast_mask); + cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { - if (bc && cpus_empty(tick_broadcast_mask)) + if (bc && cpumask_empty(tick_get_broadcast_mask())) clockevents_shutdown(bc); } @@ -342,10 +342,10 @@ int tick_resume_broadcast(void) switch (tick_broadcast_device.mode) { case TICKDEV_MODE_PERIODIC: - if(!cpus_empty(tick_broadcast_mask)) + if (!cpumask_empty(tick_get_broadcast_mask())) tick_broadcast_start_periodic(bc); - 
broadcast = cpu_isset(smp_processor_id(), - tick_broadcast_mask); + broadcast = cpumask_test_cpu(smp_processor_id(), + tick_get_broadcast_mask()); break; case TICKDEV_MODE_ONESHOT: broadcast = tick_resume_broadcast_oneshot(bc); @@ -360,14 +360,15 @@ int tick_resume_broadcast(void) #ifdef CONFIG_TICK_ONESHOT -static cpumask_t tick_broadcast_oneshot_mask; +/* FIXME: use cpumask_var_t. */ +static DECLARE_BITMAP(tick_broadcast_oneshot_mask, NR_CPUS); /* - * Debugging: see timer_list.c + * Exposed for debugging: see timer_list.c */ -cpumask_t *tick_get_broadcast_oneshot_mask(void) +struct cpumask *tick_get_broadcast_oneshot_mask(void) { - return &tick_broadcast_oneshot_mask; + return to_cpumask(tick_broadcast_oneshot_mask); } static int tick_broadcast_set_event(ktime_t expires, int force) @@ -389,7 +390,7 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc) */ void tick_check_oneshot_broadcast(int cpu) { - if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) { + if (cpumask_test_cpu(cpu, to_cpumask(tick_broadcast_oneshot_mask))) { struct tick_device *td = &per_cpu(tick_cpu_device, cpu); clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT); @@ -402,7 +403,6 @@ void tick_check_oneshot_broadcast(int cpu) static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) { struct tick_device *td; - cpumask_t mask; ktime_t now, next_event; int cpu; @@ -410,13 +410,13 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) again: dev->next_event.tv64 = KTIME_MAX; next_event.tv64 = KTIME_MAX; - mask = CPU_MASK_NONE; + cpumask_clear(to_cpumask(tmpmask)); now = ktime_get(); /* Find all expired events */ - for_each_cpu_mask_nr(cpu, tick_broadcast_oneshot_mask) { + for_each_cpu(cpu, tick_get_broadcast_oneshot_mask()) { td = &per_cpu(tick_cpu_device, cpu); if (td->evtdev->next_event.tv64 <= now.tv64) - cpu_set(cpu, mask); + cpumask_set_cpu(cpu, to_cpumask(tmpmask)); else if (td->evtdev->next_event.tv64 < next_event.tv64) next_event.tv64 = td->evtdev->next_event.tv64; } @@ -424,7 +424,7 @@ again: /* * Wakeup the cpus which have an expired event. 
*/ - tick_do_broadcast(mask); + tick_do_broadcast(to_cpumask(tmpmask)); /* * Two reasons for reprogram: @@ -476,15 +476,16 @@ void tick_broadcast_oneshot_control(unsigned long reason) goto out; if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { - if (!cpu_isset(cpu, tick_broadcast_oneshot_mask)) { - cpu_set(cpu, tick_broadcast_oneshot_mask); + if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { + cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask()); clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); if (dev->next_event.tv64 < bc->next_event.tv64) tick_broadcast_set_event(dev->next_event, 1); } } else { - if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) { - cpu_clear(cpu, tick_broadcast_oneshot_mask); + if (cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { + cpumask_clear_cpu(cpu, + tick_get_broadcast_oneshot_mask()); clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); if (dev->next_event.tv64 != KTIME_MAX) tick_program_event(dev->next_event, 1); @@ -502,10 +503,11 @@ out: */ static void tick_broadcast_clear_oneshot(int cpu) { - cpu_clear(cpu, tick_broadcast_oneshot_mask); + cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); } -static void tick_broadcast_init_next_event(cpumask_t *mask, ktime_t expires) +static void tick_broadcast_init_next_event(struct cpumask *mask, + ktime_t expires) { struct tick_device *td; int cpu; @@ -526,7 +528,6 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) if (bc->event_handler != tick_handle_oneshot_broadcast) { int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; int cpu = smp_processor_id(); - cpumask_t mask; bc->event_handler = tick_handle_oneshot_broadcast; clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); @@ -540,13 +541,15 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) * oneshot_mask bits for those and program the * broadcast device to fire. */ - mask = tick_broadcast_mask; - cpu_clear(cpu, mask); - cpus_or(tick_broadcast_oneshot_mask, - tick_broadcast_oneshot_mask, mask); - - if (was_periodic && !cpus_empty(mask)) { - tick_broadcast_init_next_event(&mask, tick_next_period); + cpumask_copy(to_cpumask(tmpmask), tick_get_broadcast_mask()); + cpumask_clear_cpu(cpu, to_cpumask(tmpmask)); + cpumask_or(tick_get_broadcast_oneshot_mask(), + tick_get_broadcast_oneshot_mask(), + to_cpumask(tmpmask)); + + if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) { + tick_broadcast_init_next_event(to_cpumask(tmpmask), + tick_next_period); tick_broadcast_set_event(tick_next_period, 1); } else bc->next_event.tv64 = KTIME_MAX; @@ -585,7 +588,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) * Clear the broadcast mask flag for the dead cpu, but do not * stop the broadcast device! 
*/ - cpu_clear(cpu, tick_broadcast_oneshot_mask); + cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); spin_unlock_irqrestore(&tick_broadcast_lock, flags); } diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index f8372be7412..63e05d423a0 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -254,7 +254,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) curdev = NULL; } clockevents_exchange_device(curdev, newdev); - tick_setup_device(td, newdev, cpu, &cpumask_of_cpu(cpu)); + tick_setup_device(td, newdev, cpu, cpumask_of(cpu)); if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) tick_oneshot_notify(); @@ -299,9 +299,9 @@ static void tick_shutdown(unsigned int *cpup) } /* Transfer the do_timer job away from this cpu */ if (*cpup == tick_do_timer_cpu) { - int cpu = first_cpu(cpu_online_map); + int cpu = cpumask_first(cpu_online_mask); - tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : + tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu : TICK_DO_TIMER_NONE; } spin_unlock_irqrestore(&tick_device_lock, flags); -- cgit v1.2.3 From d036e67b40f52bdd95392390108defbac7e53837 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 1 Jan 2009 10:12:26 +1030 Subject: cpumask: convert kernel/irq Impact: Reduce stack usage, use new cpumask API. ALPHA mod! Main change is that irq_default_affinity becomes a cpumask_var_t, so treat it as a pointer (this affects alpha). Signed-off-by: Rusty Russell --- kernel/irq/manage.c | 11 +++++++++-- kernel/irq/proc.c | 32 +++++++++++++++++++++----------- 2 files changed, 30 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 61c4a9b6216..cd0cd8dcb34 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -16,8 +16,15 @@ #include "internals.h" #ifdef CONFIG_SMP +cpumask_var_t irq_default_affinity; -cpumask_t irq_default_affinity = CPU_MASK_ALL; +static int init_irq_default_affinity(void) +{ + alloc_cpumask_var(&irq_default_affinity, GFP_KERNEL); + cpumask_setall(irq_default_affinity); + return 0; +} +core_initcall(init_irq_default_affinity); /** * synchronize_irq - wait for pending IRQ handlers (on other CPUs) @@ -127,7 +134,7 @@ int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc) desc->status &= ~IRQ_AFFINITY_SET; } - cpumask_and(&desc->affinity, cpu_online_mask, &irq_default_affinity); + cpumask_and(&desc->affinity, cpu_online_mask, irq_default_affinity); set_affinity: desc->chip->set_affinity(irq, &desc->affinity); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index d2c0e5ee53c..2abd3a7716e 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -20,7 +20,7 @@ static struct proc_dir_entry *root_irq_dir; static int irq_affinity_proc_show(struct seq_file *m, void *v) { struct irq_desc *desc = irq_to_desc((long)m->private); - cpumask_t *mask = &desc->affinity; + const struct cpumask *mask = &desc->affinity; #ifdef CONFIG_GENERIC_PENDING_IRQ if (desc->status & IRQ_MOVE_PENDING) @@ -93,7 +93,7 @@ static const struct file_operations irq_affinity_proc_fops = { static int default_affinity_show(struct seq_file *m, void *v) { - seq_cpumask(m, &irq_default_affinity); + seq_cpumask(m, irq_default_affinity); seq_putc(m, '\n'); return 0; } @@ -101,27 +101,37 @@ static int default_affinity_show(struct seq_file *m, void *v) static ssize_t default_affinity_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { - cpumask_t new_value; + cpumask_var_t new_value; int err; - err = cpumask_parse_user(buffer, count,
&new_value); + if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) + return -ENOMEM; + + err = cpumask_parse_user(buffer, count, new_value); if (err) - return err; + goto out; - if (!is_affinity_mask_valid(new_value)) - return -EINVAL; + if (!is_affinity_mask_valid(new_value)) { + err = -EINVAL; + goto out; + } /* * Do not allow disabling IRQs completely - it's a too easy * way to make the system unusable accidentally :-) At least * one online CPU still has to be targeted. */ - if (!cpus_intersects(new_value, cpu_online_map)) - return -EINVAL; + if (!cpumask_intersects(new_value, cpu_online_mask)) { + err = -EINVAL; + goto out; + } - irq_default_affinity = new_value; + cpumask_copy(irq_default_affinity, new_value); + err = count; - return count; +out: + free_cpumask_var(new_value); + return err; } static int default_affinity_open(struct inode *inode, struct file *file) -- cgit v1.2.3 From bd232f97b30f6bb630efa136a777647545db3039 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 1 Jan 2009 10:12:26 +1030 Subject: cpumask: convert RCU implementations Impact: use new cpumask API. rcu_ctrlblk contains a cpumask, and it's highly optimized so I don't want a cpumask_var_t (ie. a pointer) for the CONFIG_CPUMASK_OFFSTACK case. It could use a dangling bitmap, and be allocated in __rcu_init to save memory, but for the moment we use a bitmap. (Eventually 'struct cpumask' will be undefined for CONFIG_CPUMASK_OFFSTACK, so we use a bitmap here to show we really mean it). We remove on-stack cpumasks, using cpumask_var_t for rcu_torture_shuffle_tasks() and for_each_cpu_and in force_quiescent_state(). Signed-off-by: Rusty Russell --- kernel/rcuclassic.c | 32 +++++++++++++++++--------------- kernel/rcupreempt.c | 19 ++++++++++--------- kernel/rcutorture.c | 27 +++++++++++++++------------ 3 files changed, 42 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index e503a002f33..0ff9b05706a 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -63,14 +63,14 @@ static struct rcu_ctrlblk rcu_ctrlblk = { .completed = -300, .pending = -300, .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), - .cpumask = CPU_MASK_NONE, + .cpumask = CPU_BITS_NONE, }; static struct rcu_ctrlblk rcu_bh_ctrlblk = { .cur = -300, .completed = -300, .pending = -300, .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), - .cpumask = CPU_MASK_NONE, + .cpumask = CPU_BITS_NONE, }; DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; @@ -85,7 +85,6 @@ static void force_quiescent_state(struct rcu_data *rdp, struct rcu_ctrlblk *rcp) { int cpu; - cpumask_t cpumask; unsigned long flags; set_need_resched(); @@ -96,10 +95,10 @@ static void force_quiescent_state(struct rcu_data *rdp, * Don't send IPI to itself. With irqs disabled, * rdp->cpu is the current cpu. * - * cpu_online_map is updated by the _cpu_down() + * cpu_online_mask is updated by the _cpu_down() * using __stop_machine(). Since we're in irqs disabled * section, __stop_machine() is not exectuting, hence - * the cpu_online_map is stable. + * the cpu_online_mask is stable. * * However, a cpu might have been offlined _just_ before * we disabled irqs while entering here. @@ -107,13 +106,14 @@ static void force_quiescent_state(struct rcu_data *rdp, * notification, leading to the offlined cpu's bit * being set in the rcp->cpumask. * - * Hence cpumask = (rcp->cpumask & cpu_online_map) to prevent + * Hence cpumask = (rcp->cpumask & cpu_online_mask) to prevent * sending smp_reschedule() to an offlined CPU. 
*/ - cpus_and(cpumask, rcp->cpumask, cpu_online_map); - cpu_clear(rdp->cpu, cpumask); - for_each_cpu_mask_nr(cpu, cpumask) - smp_send_reschedule(cpu); + for_each_cpu_and(cpu, + to_cpumask(rcp->cpumask), cpu_online_mask) { + if (cpu != rdp->cpu) + smp_send_reschedule(cpu); + } } spin_unlock_irqrestore(&rcp->lock, flags); } @@ -193,7 +193,7 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) printk(KERN_ERR "INFO: RCU detected CPU stalls:"); for_each_possible_cpu(cpu) { - if (cpu_isset(cpu, rcp->cpumask)) + if (cpumask_test_cpu(cpu, to_cpumask(rcp->cpumask))) printk(" %d", cpu); } printk(" (detected by %d, t=%ld jiffies)\n", @@ -221,7 +221,8 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp) long delta; delta = jiffies - rcp->jiffies_stall; - if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) { + if (cpumask_test_cpu(smp_processor_id(), to_cpumask(rcp->cpumask)) && + delta >= 0) { /* We haven't checked in, so go dump stack. */ print_cpu_stall(rcp); @@ -393,7 +394,8 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp) * unnecessarily. */ smp_mb(); - cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask); + cpumask_andnot(to_cpumask(rcp->cpumask), + cpu_online_mask, &nohz_cpu_mask); rcp->signaled = 0; } @@ -406,8 +408,8 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp) */ static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) { - cpu_clear(cpu, rcp->cpumask); - if (cpus_empty(rcp->cpumask)) { + cpumask_clear_cpu(cpu, to_cpumask(rcp->cpumask)); + if (cpumask_empty(to_cpumask(rcp->cpumask))) { /* batch completed ! */ rcp->completed = rcp->cur; rcu_start_batch(rcp); diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index 04982659875..f9dc8f3720f 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -164,7 +164,8 @@ static char *rcu_try_flip_state_names[] = { "idle", "waitack", "waitzero", "waitmb" }; #endif /* #ifdef CONFIG_RCU_TRACE */ -static cpumask_t rcu_cpu_online_map __read_mostly = CPU_MASK_NONE; +static DECLARE_BITMAP(rcu_cpu_online_map, NR_CPUS) __read_mostly + = CPU_BITS_NONE; /* * Enum and per-CPU flag to determine when each CPU has seen @@ -758,7 +759,7 @@ rcu_try_flip_idle(void) /* Now ask each CPU for acknowledgement of the flip. */ - for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) { + for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) { per_cpu(rcu_flip_flag, cpu) = rcu_flipped; dyntick_save_progress_counter(cpu); } @@ -776,7 +777,7 @@ rcu_try_flip_waitack(void) int cpu; RCU_TRACE_ME(rcupreempt_trace_try_flip_a1); - for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) + for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) if (rcu_try_flip_waitack_needed(cpu) && per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) { RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1); @@ -808,7 +809,7 @@ rcu_try_flip_waitzero(void) /* Check to see if the sum of the "last" counters is zero. */ RCU_TRACE_ME(rcupreempt_trace_try_flip_z1); - for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) + for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx]; if (sum != 0) { RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1); @@ -823,7 +824,7 @@ rcu_try_flip_waitzero(void) smp_mb(); /* ^^^^^^^^^^^^ */ /* Call for a memory barrier from each CPU. 
*/ - for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) { + for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) { per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed; dyntick_save_progress_counter(cpu); } @@ -843,7 +844,7 @@ rcu_try_flip_waitmb(void) int cpu; RCU_TRACE_ME(rcupreempt_trace_try_flip_m1); - for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) + for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) if (rcu_try_flip_waitmb_needed(cpu) && per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) { RCU_TRACE_ME(rcupreempt_trace_try_flip_me1); @@ -1032,7 +1033,7 @@ void rcu_offline_cpu(int cpu) RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0; RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0; - cpu_clear(cpu, rcu_cpu_online_map); + cpumask_clear_cpu(cpu, to_cpumask(rcu_cpu_online_map)); spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); @@ -1072,7 +1073,7 @@ void __cpuinit rcu_online_cpu(int cpu) struct rcu_data *rdp; spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); - cpu_set(cpu, rcu_cpu_online_map); + cpumask_set_cpu(cpu, to_cpumask(rcu_cpu_online_map)); spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); /* @@ -1430,7 +1431,7 @@ void __init __rcu_init(void) * We don't need protection against CPU-Hotplug here * since * a) If a CPU comes online while we are iterating over the - * cpu_online_map below, we would only end up making a + * cpu_online_mask below, we would only end up making a * duplicate call to rcu_online_cpu() which sets the corresponding * CPU's mask in the rcu_cpu_online_map. * diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index b3106552210..3245b40952c 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -868,49 +868,52 @@ static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ */ static void rcu_torture_shuffle_tasks(void) { - cpumask_t tmp_mask; + cpumask_var_t tmp_mask; int i; - cpus_setall(tmp_mask); + if (!alloc_cpumask_var(&tmp_mask, GFP_KERNEL)) + BUG(); + + cpumask_setall(tmp_mask); get_online_cpus(); /* No point in shuffling if there is only one online CPU (ex: UP) */ - if (num_online_cpus() == 1) { - put_online_cpus(); - return; - } + if (num_online_cpus() == 1) + goto out; if (rcu_idle_cpu != -1) - cpu_clear(rcu_idle_cpu, tmp_mask); + cpumask_clear_cpu(rcu_idle_cpu, tmp_mask); - set_cpus_allowed_ptr(current, &tmp_mask); + set_cpus_allowed_ptr(current, tmp_mask); if (reader_tasks) { for (i = 0; i < nrealreaders; i++) if (reader_tasks[i]) set_cpus_allowed_ptr(reader_tasks[i], - &tmp_mask); + tmp_mask); } if (fakewriter_tasks) { for (i = 0; i < nfakewriters; i++) if (fakewriter_tasks[i]) set_cpus_allowed_ptr(fakewriter_tasks[i], - &tmp_mask); + tmp_mask); } if (writer_task) - set_cpus_allowed_ptr(writer_task, &tmp_mask); + set_cpus_allowed_ptr(writer_task, tmp_mask); if (stats_task) - set_cpus_allowed_ptr(stats_task, &tmp_mask); + set_cpus_allowed_ptr(stats_task, tmp_mask); if (rcu_idle_cpu == -1) rcu_idle_cpu = num_online_cpus() - 1; else rcu_idle_cpu--; +out: put_online_cpus(); + free_cpumask_var(tmp_mask); } /* Shuffle tasks across CPUs, with the intent of allowing each CPU in the -- cgit v1.2.3 From c309b917cab55799ea489d7b5f1b77025d9f8462 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 1 Jan 2009 10:12:27 +1030 Subject: cpumask: convert kernel/profile.c Impact: Reduce kernel memory usage, use new cpumask API. Avoid a static cpumask_t for prof_cpu_mask, and an on-stack cpumask_t in prof_cpu_mask_write_proc. Both become cpumask_var_t. prof_cpu_mask is only allocated when profiling is on, but the NULL checks are optimized out by gcc for the !CPUMASK_OFFSTACK case. 
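The optimized-out NULL check falls out of the two definitions of cpumask_var_t: a real pointer when CONFIG_CPUMASK_OFFSTACK=y, a one-element array otherwise, so the address of a static mask can never be NULL and gcc folds the test away. A hedged sketch of the mechanism; the typedefs are simplified and the consumer names are hypothetical:

#ifdef CONFIG_CPUMASK_OFFSTACK
typedef struct cpumask *cpumask_var_t;	 /* may really be NULL */
#else
typedef struct cpumask cpumask_var_t[1]; /* decays to a non-NULL address */
#endif

static cpumask_var_t demo_prof_mask;

static void demo_hit(int cpu) { /* hypothetical profile_hit() stand-in */ }

static void demo_tick(int cpu)
{
	/* Constant-true, and compiled out, in the array case. */
	if (demo_prof_mask != NULL &&
	    cpumask_test_cpu(cpu, demo_prof_mask))
		demo_hit(cpu);
}
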
Also removed some strange and unnecessary casts. Signed-off-by: Rusty Russell --- kernel/profile.c | 38 +++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index 4cb7d68fed8..d18e2d2654f 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -45,7 +45,7 @@ static unsigned long prof_len, prof_shift; int prof_on __read_mostly; EXPORT_SYMBOL_GPL(prof_on); -static cpumask_t prof_cpu_mask = CPU_MASK_ALL; +static cpumask_var_t prof_cpu_mask; #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); static DEFINE_PER_CPU(int, cpu_profile_flip); @@ -113,9 +113,13 @@ int __ref profile_init(void) buffer_bytes = prof_len*sizeof(atomic_t); if (!slab_is_available()) { prof_buffer = alloc_bootmem(buffer_bytes); + alloc_bootmem_cpumask_var(&prof_cpu_mask); return 0; } + if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL)) + return -ENOMEM; + prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL); if (prof_buffer) return 0; @@ -128,6 +132,7 @@ int __ref profile_init(void) if (prof_buffer) return 0; + free_cpumask_var(prof_cpu_mask); return -ENOMEM; } @@ -386,13 +391,15 @@ out_free: return NOTIFY_BAD; case CPU_ONLINE: case CPU_ONLINE_FROZEN: - cpu_set(cpu, prof_cpu_mask); + if (prof_cpu_mask != NULL) + cpumask_set_cpu(cpu, prof_cpu_mask); break; case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: - cpu_clear(cpu, prof_cpu_mask); + if (prof_cpu_mask != NULL) + cpumask_clear_cpu(cpu, prof_cpu_mask); if (per_cpu(cpu_profile_hits, cpu)[0]) { page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]); per_cpu(cpu_profile_hits, cpu)[0] = NULL; @@ -430,7 +437,8 @@ void profile_tick(int type) if (type == CPU_PROFILING && timer_hook) timer_hook(regs); - if (!user_mode(regs) && cpu_isset(smp_processor_id(), prof_cpu_mask)) + if (!user_mode(regs) && prof_cpu_mask != NULL && + cpumask_test_cpu(smp_processor_id(), prof_cpu_mask)) profile_hit(type, (void *)profile_pc(regs)); } @@ -442,7 +450,7 @@ void profile_tick(int type) static int prof_cpu_mask_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { - int len = cpumask_scnprintf(page, count, (cpumask_t *)data); + int len = cpumask_scnprintf(page, count, data); if (count - len < 2) return -EINVAL; len += sprintf(page + len, "\n"); @@ -452,16 +460,20 @@ static int prof_cpu_mask_read_proc(char *page, char **start, off_t off, static int prof_cpu_mask_write_proc(struct file *file, const char __user *buffer, unsigned long count, void *data) { - cpumask_t *mask = (cpumask_t *)data; + struct cpumask *mask = data; unsigned long full_count = count, err; - cpumask_t new_value; + cpumask_var_t new_value; - err = cpumask_parse_user(buffer, count, &new_value); - if (err) - return err; + if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) + return -ENOMEM; - *mask = new_value; - return full_count; + err = cpumask_parse_user(buffer, count, new_value); + if (!err) { + cpumask_copy(mask, new_value); + err = full_count; + } + free_cpumask_var(new_value); + return err; } void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) @@ -472,7 +484,7 @@ void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir); if (!entry) return; - entry->data = (void *)&prof_cpu_mask; + entry->data = prof_cpu_mask; entry->read_proc = prof_cpu_mask_read_proc; entry->write_proc = prof_cpu_mask_write_proc; } -- cgit v1.2.3 From 
e0b582ec56f1a1d8b30ebf340a7b91fb09f26c8c Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 1 Jan 2009 10:12:28 +1030 Subject: cpumask: convert kernel/cpu.c Impact: Reduce kernel stack and memory usage, use new cpumask API. Use cpumask_var_t for take_cpu_down() stack var, and frozen_cpus. Note that notify_cpu_starting() can be called before core_initcall allocates frozen_cpus, but the NULL check is optimized out by gcc for the CONFIG_CPUMASK_OFFSTACK=n case. Signed-off-by: Rusty Russell --- kernel/cpu.c | 48 +++++++++++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 2c9f78f3a2f..47fff3b63cb 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -194,7 +194,7 @@ static int __ref take_cpu_down(void *_param) static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) { int err, nr_calls = 0; - cpumask_t old_allowed, tmp; + cpumask_var_t old_allowed; void *hcpu = (void *)(long)cpu; unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; struct take_cpu_down_param tcd_param = { @@ -208,6 +208,9 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) if (!cpu_online(cpu)) return -EINVAL; + if (!alloc_cpumask_var(&old_allowed, GFP_KERNEL)) + return -ENOMEM; + cpu_hotplug_begin(); err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); @@ -222,13 +225,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) } /* Ensure that we are not runnable on dying cpu */ - old_allowed = current->cpus_allowed; - cpus_setall(tmp); - cpu_clear(cpu, tmp); - set_cpus_allowed_ptr(current, &tmp); - tmp = cpumask_of_cpu(cpu); + cpumask_copy(old_allowed, ¤t->cpus_allowed); + set_cpus_allowed_ptr(current, + cpumask_of(cpumask_any_but(cpu_online_mask, cpu))); - err = __stop_machine(take_cpu_down, &tcd_param, &tmp); + err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { /* CPU didn't die: tell everyone. Can't complain. */ if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, @@ -254,7 +255,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) check_for_tasks(cpu); out_allowed: - set_cpus_allowed_ptr(current, &old_allowed); + set_cpus_allowed_ptr(current, old_allowed); out_release: cpu_hotplug_done(); if (!err) { @@ -262,6 +263,7 @@ out_release: hcpu) == NOTIFY_BAD) BUG(); } + free_cpumask_var(old_allowed); return err; } @@ -280,7 +282,7 @@ int __ref cpu_down(unsigned int cpu) /* * Make sure the all cpus did the reschedule and are not - * using stale version of the cpu_active_map. + * using stale version of the cpu_active_mask. * This is not strictly necessary becuase stop_machine() * that we run down the line already provides the required * synchronization. 
But it's really a side effect and we do not @@ -344,7 +346,7 @@ out_notify: int __cpuinit cpu_up(unsigned int cpu) { int err = 0; - if (!cpu_isset(cpu, cpu_possible_map)) { + if (!cpu_possible(cpu)) { printk(KERN_ERR "can't online cpu %d because it is not " "configured as may-hotadd at boot time\n", cpu); #if defined(CONFIG_IA64) || defined(CONFIG_X86_64) @@ -369,25 +371,25 @@ out: } #ifdef CONFIG_PM_SLEEP_SMP -static cpumask_t frozen_cpus; +static cpumask_var_t frozen_cpus; int disable_nonboot_cpus(void) { int cpu, first_cpu, error = 0; cpu_maps_update_begin(); - first_cpu = first_cpu(cpu_online_map); + first_cpu = cpumask_first(cpu_online_mask); /* We take down all of the non-boot CPUs in one shot to avoid races * with the userspace trying to use the CPU hotplug at the same time */ - cpus_clear(frozen_cpus); + cpumask_clear(frozen_cpus); printk("Disabling non-boot CPUs ...\n"); for_each_online_cpu(cpu) { if (cpu == first_cpu) continue; error = _cpu_down(cpu, 1); if (!error) { - cpu_set(cpu, frozen_cpus); + cpumask_set_cpu(cpu, frozen_cpus); printk("CPU%d is down\n", cpu); } else { printk(KERN_ERR "Error taking CPU%d down: %d\n", @@ -413,11 +415,11 @@ void __ref enable_nonboot_cpus(void) /* Allow everyone to use the CPU hotplug again */ cpu_maps_update_begin(); cpu_hotplug_disabled = 0; - if (cpus_empty(frozen_cpus)) + if (cpumask_empty(frozen_cpus)) goto out; printk("Enabling non-boot CPUs ...\n"); - for_each_cpu_mask_nr(cpu, frozen_cpus) { + for_each_cpu(cpu, frozen_cpus) { error = _cpu_up(cpu, 1); if (!error) { printk("CPU%d is up\n", cpu); @@ -425,10 +427,18 @@ void __ref enable_nonboot_cpus(void) } printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); } - cpus_clear(frozen_cpus); + cpumask_clear(frozen_cpus); out: cpu_maps_update_done(); } + +static int alloc_frozen_cpus(void) +{ + if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO)) + return -ENOMEM; + return 0; +} +core_initcall(alloc_frozen_cpus); #endif /* CONFIG_PM_SLEEP_SMP */ /** @@ -444,7 +454,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu) unsigned long val = CPU_STARTING; #ifdef CONFIG_PM_SLEEP_SMP - if (cpu_isset(cpu, frozen_cpus)) + if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus)) val = CPU_STARTING_FROZEN; #endif /* CONFIG_PM_SLEEP_SMP */ raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu); @@ -456,7 +466,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu) * cpu_bit_bitmap[] is a special, "compressed" data structure that * represents all NR_CPUS bits binary values of 1<<nr. -- cgit v1.2.3 From: Rusty Russell Date: Thu, 1 Jan 2009 10:12:28 +1030 Subject: cpumask: convert rest of files in kernel/ Impact: Reduce stack usage, use new cpumask API. Mainly changing cpumask_t to 'struct cpumask' and similar simple API conversion. Two conversions worth mentioning: 1) we use cpumask_any_but to avoid a temporary in kernel/softlockup.c, 2) Use cpumask_var_t in taskstats_user_cmd().
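The cpumask_any_but() change is the neater of the two: it removes the need for a scratch copy of the online map just to exclude one cpu. A small sketch of both shapes, with invented function names; the "before" form is the one deleted from kernel/softlockup.c below:

/* Before: a full NR_CPUS-bit temporary on the stack. */
static int demo_pick_other_old(int hotcpu)
{
	cpumask_t tmp = cpu_online_map;

	cpu_clear(hotcpu, tmp);
	return cpumask_any(&tmp);
}

/* After: no temporary at all. */
static int demo_pick_other_new(int hotcpu)
{
	return cpumask_any_but(cpu_online_mask, hotcpu);
}
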
Signed-off-by: Rusty Russell Signed-off-by: Mike Travis Cc: Balbir Singh Cc: Ingo Molnar --- kernel/power/poweroff.c | 2 +- kernel/softlockup.c | 6 ++---- kernel/stop_machine.c | 8 ++++---- kernel/taskstats.c | 39 ++++++++++++++++++++++++--------------- 4 files changed, 31 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index 72016f05147..97890831e1b 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c @@ -27,7 +27,7 @@ static DECLARE_WORK(poweroff_work, do_poweroff); static void handle_poweroff(int key, struct tty_struct *tty) { /* run sysrq poweroff on boot cpu */ - schedule_work_on(first_cpu(cpu_online_map), &poweroff_work); + schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work); } static struct sysrq_key_op sysrq_poweroff_op = { diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 492f0c72fec..d9188c66278 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -310,10 +310,8 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: if (hotcpu == check_cpu) { - cpumask_t temp_cpu_online_map = cpu_online_map; - - cpu_clear(hotcpu, temp_cpu_online_map); - check_cpu = cpumask_any(&temp_cpu_online_map); + /* Pick any other online cpu. */ + check_cpu = cpumask_any_but(cpu_online_mask, hotcpu); } break; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 24e8ceacc38..286c41722e8 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -69,10 +69,10 @@ static void stop_cpu(struct work_struct *unused) int err; if (!active_cpus) { - if (cpu == first_cpu(cpu_online_map)) + if (cpu == cpumask_first(cpu_online_mask)) smdata = &active; } else { - if (cpu_isset(cpu, *active_cpus)) + if (cpumask_test_cpu(cpu, active_cpus)) smdata = &active; } /* Simple state machine */ @@ -109,7 +109,7 @@ static int chill(void *unused) return 0; } -int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) +int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) { struct work_struct *sm_work; int i, ret; @@ -142,7 +142,7 @@ int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) return ret; } -int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) +int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) { int ret; diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 6d7dc4ec4aa..888adbcca30 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -290,18 +290,17 @@ ret: return; } -static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd) +static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) { struct listener_list *listeners; struct listener *s, *tmp; unsigned int cpu; - cpumask_t mask = *maskp; - if (!cpus_subset(mask, cpu_possible_map)) + if (!cpumask_subset(mask, cpu_possible_mask)) return -EINVAL; if (isadd == REGISTER) { - for_each_cpu_mask_nr(cpu, mask) { + for_each_cpu(cpu, mask) { s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, cpu_to_node(cpu)); if (!s) @@ -320,7 +319,7 @@ static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd) /* Deregister or cleanup */ cleanup: - for_each_cpu_mask_nr(cpu, mask) { + for_each_cpu(cpu, mask) { listeners = &per_cpu(listener_array, cpu); down_write(&listeners->sem); list_for_each_entry_safe(s, tmp, &listeners->list, list) { @@ -335,7 +334,7 @@ cleanup: return 0; } -static int parse(struct nlattr *na, cpumask_t *mask) +static int 
parse(struct nlattr *na, struct cpumask *mask) { char *data; int len; @@ -428,23 +427,33 @@ err: static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) { - int rc = 0; + int rc; struct sk_buff *rep_skb; struct taskstats *stats; size_t size; - cpumask_t mask; + cpumask_var_t mask; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; - rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask); + rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); if (rc < 0) - return rc; - if (rc == 0) - return add_del_listener(info->snd_pid, &mask, REGISTER); + goto free_return_rc; + if (rc == 0) { + rc = add_del_listener(info->snd_pid, mask, REGISTER); + goto free_return_rc; + } - rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask); + rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); if (rc < 0) + goto free_return_rc; + if (rc == 0) { + rc = add_del_listener(info->snd_pid, mask, DEREGISTER); +free_return_rc: + free_cpumask_var(mask); return rc; - if (rc == 0) - return add_del_listener(info->snd_pid, &mask, DEREGISTER); + } + free_cpumask_var(mask); /* * Size includes space for nested attributes -- cgit v1.2.3 From 5db0e1e9e0f30f160b832a0b5cd1131954bf4f6e Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 1 Jan 2009 10:12:29 +1030 Subject: cpumask: replace for_each_cpu_mask_nr with for_each_cpu in kernel/time/ Impact: cleanup Simple replacement, now the _nr is redundant. Signed-off-by: Rusty Russell Signed-off-by: Mike Travis Cc: Ingo Molnar --- kernel/time/clocksource.c | 3 ++- kernel/time/tick-broadcast.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 32141b15d63..ca89e1593f0 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -145,7 +145,8 @@ static void clocksource_watchdog(unsigned long data) * Cycle through CPUs to check if the CPUs stay * synchronized to each other. */ - int next_cpu = next_cpu_nr(raw_smp_processor_id(), cpu_online_map); + int next_cpu = cpumask_next(raw_smp_processor_id(), + cpu_online_mask); if (next_cpu >= nr_cpu_ids) next_cpu = cpumask_first(cpu_online_mask); diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 356fac57a18..118a3b3b3f9 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -512,7 +512,7 @@ static void tick_broadcast_init_next_event(struct cpumask *mask, struct tick_device *td; int cpu; - for_each_cpu_mask_nr(cpu, *mask) { + for_each_cpu(cpu, mask) { td = &per_cpu(tick_cpu_device, cpu); if (td->evtdev) td->evtdev->next_event = expires; -- cgit v1.2.3 From 0a582440ff546e2c6610d1acec325e91b4efd313 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 2 Jan 2009 12:16:42 +0100 Subject: sched: fix sched_slice() Impact: fix bad-interactivity buglet Fix sched_slice() to emit a sane result whether a task is currently enqueued or not. 
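The rewritten sched_slice() scales the period by the entity's share of queue weight at each hierarchy level, and when the entity is not yet enqueued it temporarily adds the entity's own weight to the queue load so the proportion stays sane. A hedged, userspace arithmetic sketch of one level of that scaling; the real code also widens the period itself via __sched_period() for the extra task, and calc_delta_mine() uses fixed-point inverses rather than a plain divide:

#include <stdio.h>

/* One level of slice = period * w / rw, mirroring calc_delta_mine(). */
static unsigned long long slice_one_level(unsigned long long period_ns,
					  unsigned long w,
					  unsigned long rw_enqueued,
					  int on_rq)
{
	/* A not-yet-enqueued entity contributes its own weight. */
	unsigned long rw = rw_enqueued + (on_rq ? 0 : w);

	return period_ns * w / rw;
}

int main(void)
{
	/* Two enqueued weight-1024 tasks split a 20ms period evenly... */
	printf("%llu\n", slice_one_level(20000000ULL, 1024, 2048, 1));
	/* ...while a third, not yet enqueued, sees roughly a third. */
	printf("%llu\n", slice_one_level(20000000ULL, 1024, 2048, 0));
	return 0;
}

Without the !se->on_rq adjustment, the second call would divide by the stale two-task load and hand the incoming task an oversized slice.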
Signed-off-by: Mike Galbraith Tested-by: Jayson King Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 5ad4440f0fc..b808563f4f1 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -385,20 +385,6 @@ int sched_nr_latency_handler(struct ctl_table *table, int write, } #endif -/* - * delta *= P[w / rw] - */ -static inline unsigned long -calc_delta_weight(unsigned long delta, struct sched_entity *se) -{ - for_each_sched_entity(se) { - delta = calc_delta_mine(delta, - se->load.weight, &cfs_rq_of(se)->load); - } - - return delta; -} - /* * delta /= w */ @@ -440,12 +426,20 @@ static u64 __sched_period(unsigned long nr_running) */ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { - unsigned long nr_running = cfs_rq->nr_running; + u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq); - if (unlikely(!se->on_rq)) - nr_running++; + for_each_sched_entity(se) { + struct load_weight *load = &cfs_rq->load; - return calc_delta_weight(__sched_period(nr_running), se); + if (unlikely(!se->on_rq)) { + struct load_weight lw = cfs_rq->load; + + update_load_add(&lw, se->load.weight); + load = &lw; + } + slice = calc_delta_mine(slice, se->load.weight, load); + } + return slice; } /* -- cgit v1.2.3 From 90621c40cc4ab7b0a414311ce37e7cc7173403b6 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Mon, 29 Dec 2008 19:43:21 -0800 Subject: futex: catch certain asymmetric (get|put)_futex_key calls Impact: add debug check Following up on my previous key reference accounting patches, this patch will catch puts on keys that haven't been "got". This won't catch nested get/put mismatches though. Build and boot tested, with minimal desktop activity and a run of the open_posix_testsuite in LTP for testing. No warnings logged. Signed-off-by: Darren Hart Cc: Signed-off-by: Ingo Molnar --- kernel/futex.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index c5ac55cc0c1..206d4c90688 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -170,8 +170,11 @@ static void get_futex_key_refs(union futex_key *key) */ static void drop_futex_key_refs(union futex_key *key) { - if (!key->both.ptr) + if (!key->both.ptr) { + /* If we're here then we tried to put a key we failed to get */ + WARN_ON_ONCE(1); return; + } switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: -- cgit v1.2.3 From 263ec6457bb23d57b575ede18ff6c3d11e0b4e96 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 3 Jan 2009 13:16:09 +0100 Subject: cpumask: convert RCU implementations, fix Impact: cleanup This warning: kernel/rcuclassic.c: In function ‘rcu_start_batch’: kernel/rcuclassic.c:397: warning: passing argument 1 of ‘cpumask_andnot’ from incompatible pointer type triggers because one usage site of rcp->cpumask was not converted to to_cpumask(rcp->cpumask). There are no ill effects from this bug.
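The idiom at issue, shown in isolation: the control block stores its mask as a raw bitmap, while the struct-cpumask API takes struct cpumask pointers, so each call site needs the to_cpumask() wrapper. A minimal kernel-context sketch; the struct and function here are made up, while DECLARE_BITMAP(), to_cpumask() and cpumask_andnot() are the real interfaces (nohz_cpu_mask is assumed to be the era's cpumask_var_t from kernel/sched.c):

#include <linux/cpumask.h>
#include <linux/sched.h>        /* nohz_cpu_mask */

struct ctrlblk {
        DECLARE_BITMAP(cpumask, NR_CPUS);       /* raw bitmap, not struct cpumask */
};

static void clear_nohz_cpus(struct ctrlblk *rcp)
{
        /*
         * Bare rcp->cpumask decays to unsigned long *, which is exactly
         * the incompatible-pointer warning above; to_cpumask() supplies
         * the struct cpumask * view the new API expects.
         */
        cpumask_andnot(to_cpumask(rcp->cpumask),
                       cpu_online_mask, nohz_cpu_mask);
}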
Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 6ec495f60ea..490934fc7ac 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -394,7 +394,8 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp) * unnecessarily. */ smp_mb(); - cpumask_andnot(&rcp->cpumask, cpu_online_mask, nohz_cpu_mask); + cpumask_andnot(to_cpumask(rcp->cpumask), + cpu_online_mask, nohz_cpu_mask); rcp->signaled = 0; } -- cgit v1.2.3 From 6bdf197b04b3ae7c85785bc5a9576f1bcb0ac7c0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 3 Jan 2009 12:50:46 +0100 Subject: ia64: cpumask fix for is_affinity_mask_valid() Impact: build fix on ia64 ia64's default_affinity_write() still had old cpumask_t usage: /home/mingo/tip/kernel/irq/proc.c: In function `default_affinity_write': /home/mingo/tip/kernel/irq/proc.c:114: error: incompatible type for argument 1 of `is_affinity_mask_valid' make[3]: *** [kernel/irq/proc.o] Error 1 make[3]: *** Waiting for unfinished jobs.... Update it to cpumask_var_t. Signed-off-by: Ingo Molnar --- kernel/irq/proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 2abd3a7716e..aae3f742bce 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -54,7 +54,7 @@ static ssize_t irq_affinity_proc_write(struct file *file, if (err) goto free_cpumask; - if (!is_affinity_mask_valid(*new_value)) { + if (!is_affinity_mask_valid(new_value)) { err = -EINVAL; goto free_cpumask; } -- cgit v1.2.3 From 6ca09dfc9f180d038dcef93c167a833f43a8246f Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Wed, 31 Dec 2008 18:08:45 -0800 Subject: sched: put back some stack hog changes that were undone in kernel/sched.c Impact: prevents panic from stack overflow on numa-capable machines. Some of the "removal of stack hogs" changes in kernel/sched.c by using node_to_cpumask_ptr were undone by the early cpumask API updates, causing a panic due to stack overflow. This patch undoes those changes by using cpumask_of_node() which returns a 'const struct cpumask *'. In addition, cpu_coregroup_map is replaced with cpu_coregroup_mask, further reducing stack usage. (Both of these updates removed 9 FIXME's!) Also: Pick up some remaining changes from the old 'cpumask_t' functions to the new 'struct cpumask *' functions. Optimize memory traffic by allocating each percpu local_cpu_mask on the same node as the referring cpu. Signed-off-by: Mike Travis Acked-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 53 +++++++++++++++-------------------------------------- kernel/sched_rt.c | 3 ++- 2 files changed, 17 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 27ba1d642f0..dd862d70e71 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3715,7 +3715,7 @@ redo: * don't kick the migration_thread, if the curr * task on busiest cpu can't be moved to this_cpu */ - if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { + if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { double_unlock_balance(this_rq, busiest); all_pinned = 1; return ld_moved; @@ -6220,9 +6220,7 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) { int dest_cpu; - /* FIXME: Use cpumask_of_node here. 
*/ - cpumask_t _nodemask = node_to_cpumask(cpu_to_node(dead_cpu)); - const struct cpumask *nodemask = &_nodemask; + const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu)); again: /* Look for allowed, online CPU in same node. */ @@ -7133,21 +7131,18 @@ static int find_next_best_node(int node, nodemask_t *used_nodes) static void sched_domain_node_span(int node, struct cpumask *span) { nodemask_t used_nodes; - /* FIXME: use cpumask_of_node() */ - node_to_cpumask_ptr(nodemask, node); int i; - cpus_clear(*span); + cpumask_clear(span); nodes_clear(used_nodes); - cpus_or(*span, *span, *nodemask); + cpumask_or(span, span, cpumask_of_node(node)); node_set(node, used_nodes); for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { int next_node = find_next_best_node(node, &used_nodes); - node_to_cpumask_ptr_next(nodemask, next_node); - cpus_or(*span, *span, *nodemask); + cpumask_or(span, span, cpumask_of_node(next_node)); } } #endif /* CONFIG_NUMA */ @@ -7227,9 +7222,7 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, { int group; #ifdef CONFIG_SCHED_MC - /* FIXME: Use cpu_coregroup_mask. */ - *mask = cpu_coregroup_map(cpu); - cpus_and(*mask, *mask, *cpu_map); + cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); group = cpumask_first(mask); #elif defined(CONFIG_SCHED_SMT) cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); @@ -7259,10 +7252,8 @@ static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, struct cpumask *nodemask) { int group; - /* FIXME: use cpumask_of_node */ - node_to_cpumask_ptr(pnodemask, cpu_to_node(cpu)); - cpumask_and(nodemask, pnodemask, cpu_map); + cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map); group = cpumask_first(nodemask); if (sg) @@ -7313,10 +7304,8 @@ static void free_sched_groups(const struct cpumask *cpu_map, for (i = 0; i < nr_node_ids; i++) { struct sched_group *oldsg, *sg = sched_group_nodes[i]; - /* FIXME: Use cpumask_of_node */ - node_to_cpumask_ptr(pnodemask, i); - cpus_and(*nodemask, *pnodemask, *cpu_map); + cpumask_and(nodemask, cpumask_of_node(i), cpu_map); if (cpumask_empty(nodemask)) continue; @@ -7525,9 +7514,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, for_each_cpu(i, cpu_map) { struct sched_domain *sd = NULL, *p; - /* FIXME: use cpumask_of_node */ - *nodemask = node_to_cpumask(cpu_to_node(i)); - cpus_and(*nodemask, *nodemask, *cpu_map); + cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map); #ifdef CONFIG_NUMA if (cpumask_weight(cpu_map) > @@ -7568,9 +7555,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map, sd = &per_cpu(core_domains, i).sd; SD_INIT(sd, MC); set_domain_attribute(sd, attr); - *sched_domain_span(sd) = cpu_coregroup_map(i); - cpumask_and(sched_domain_span(sd), - sched_domain_span(sd), cpu_map); + cpumask_and(sched_domain_span(sd), cpu_map, + cpu_coregroup_mask(i)); sd->parent = p; p->child = sd; cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); @@ -7606,9 +7592,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, #ifdef CONFIG_SCHED_MC /* Set up multi-core groups */ for_each_cpu(i, cpu_map) { - /* FIXME: Use cpu_coregroup_mask */ - *this_core_map = cpu_coregroup_map(i); - cpus_and(*this_core_map, *this_core_map, *cpu_map); + cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map); if (i != cpumask_first(this_core_map)) continue; @@ -7620,9 +7604,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, /* Set up physical groups */ for (i = 0; i < nr_node_ids; i++) { - /* FIXME: Use cpumask_of_node 
*/ - *nodemask = node_to_cpumask(i); - cpus_and(*nodemask, *nodemask, *cpu_map); + cpumask_and(nodemask, cpumask_of_node(i), cpu_map); if (cpumask_empty(nodemask)) continue; @@ -7644,11 +7626,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map, struct sched_group *sg, *prev; int j; - /* FIXME: Use cpumask_of_node */ - *nodemask = node_to_cpumask(i); cpumask_clear(covered); - - cpus_and(*nodemask, *nodemask, *cpu_map); + cpumask_and(nodemask, cpumask_of_node(i), cpu_map); if (cpumask_empty(nodemask)) { sched_group_nodes[i] = NULL; continue; } @@ -7679,8 +7658,6 @@ static int __build_sched_domains(const struct cpumask *cpu_map, for (j = 0; j < nr_node_ids; j++) { int n = (i + j) % nr_node_ids; - /* FIXME: Use cpumask_of_node */ - node_to_cpumask_ptr(pnodemask, n); cpumask_complement(notcovered, covered); cpumask_and(tmpmask, notcovered, cpu_map); @@ -7688,7 +7665,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, if (cpumask_empty(tmpmask)) break; - cpumask_and(tmpmask, tmpmask, pnodemask); + cpumask_and(tmpmask, tmpmask, cpumask_of_node(n)); if (cpumask_empty(tmpmask)) continue; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 833b6d44483..954e1a81b79 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1383,7 +1383,8 @@ static inline void init_sched_rt_class(void) unsigned int i; for_each_possible_cpu(i) - alloc_cpumask_var(&per_cpu(local_cpu_mask, i), GFP_KERNEL); + alloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), + GFP_KERNEL, cpu_to_node(i)); } #endif /* CONFIG_SMP */ -- cgit v1.2.3 From 8916edef5888c5d8fe283714416a9ca95b4c3431 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Sun, 4 Jan 2009 05:40:37 +0900 Subject: getrusage: RUSAGE_THREAD should return ru_utime and ru_stime Impact: task stats regression fix The original getrusage(RUSAGE_THREAD) implementation returned ru_utime and ru_stime, but commit "f06febc: timers: fix itimer/many thread hang" broke that. This patch restores it. Signed-off-by: KOSAKI Motohiro Acked-by: Roland McGrath Signed-off-by: Ingo Molnar --- kernel/sys.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index d356d79e84a..61dbfd4a54d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1627,6 +1627,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) utime = stime = cputime_zero; if (who == RUSAGE_THREAD) { + utime = task_utime(current); + stime = task_stime(current); accumulate_thread_rusage(p, r); goto out; } -- cgit v1.2.3 From 4f6b434fee2402b3decdeae9d16eb648725ae426 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 9 Dec 2008 19:50:34 -0500 Subject: don't reallocate buffer in every audit_sockaddr() No need to do that more than once per process lifetime; allocating/freeing on each sendto/accept/etc. is bloody pointless.
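The pattern the patch adopts, distilled: allocate the scratch buffer once, on first use, keep it in the owning context, and free it only when the context is destroyed, so the per-call kmalloc()/kfree() churn disappears. A schematic kernel-context sketch with abbreviated, made-up names (the real fields appear in the diff below):

#include <linux/slab.h>
#include <linux/socket.h>
#include <linux/string.h>

struct ctx {
        struct sockaddr_storage *sockaddr;      /* allocated on first use */
        size_t sockaddr_len;                    /* 0 means nothing recorded */
};

static int record_sockaddr(struct ctx *ctx, const void *a, size_t len)
{
        if (!ctx->sockaddr) {
                ctx->sockaddr = kmalloc(sizeof(*ctx->sockaddr), GFP_KERNEL);
                if (!ctx->sockaddr)
                        return -ENOMEM;         /* only the first use can fail */
        }
        ctx->sockaddr_len = len;
        memcpy(ctx->sockaddr, a, len);
        return 0;
}

/*
 * Teardown frees the buffer once: kfree(ctx->sockaddr).  Between
 * syscalls, resetting sockaddr_len to 0 is enough to mark the slot empty.
 */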
Signed-off-by: Al Viro --- kernel/auditsc.c | 46 ++++++++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 4819f371197..c2e43ebb1b6 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -174,12 +174,6 @@ struct audit_aux_data_socketcall { unsigned long args[0]; }; -struct audit_aux_data_sockaddr { - struct audit_aux_data d; - int len; - char a[0]; -}; - struct audit_aux_data_fd_pair { struct audit_aux_data d; int fd[2]; @@ -234,7 +228,8 @@ struct audit_context { struct audit_context *previous; /* For nested syscalls */ struct audit_aux_data *aux; struct audit_aux_data *aux_pids; - + struct sockaddr_storage *sockaddr; + size_t sockaddr_len; /* Save things to print about task_struct */ pid_t pid, ppid; uid_t uid, euid, suid, fsuid; @@ -921,6 +916,7 @@ static inline void audit_free_context(struct audit_context *context) free_tree_refs(context); audit_free_aux(context); kfree(context->filterkey); + kfree(context->sockaddr); kfree(context); context = previous; } while (context); @@ -1383,13 +1379,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_format(ab, " a%d=%lx", i, axs->args[i]); break; } - case AUDIT_SOCKADDR: { - struct audit_aux_data_sockaddr *axs = (void *)aux; - - audit_log_format(ab, "saddr="); - audit_log_n_hex(ab, axs->a, axs->len); - break; } - case AUDIT_FD_PAIR: { struct audit_aux_data_fd_pair *axs = (void *)aux; audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]); @@ -1421,6 +1410,16 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_end(ab); } + if (context->sockaddr_len) { + ab = audit_log_start(context, GFP_KERNEL, AUDIT_SOCKADDR); + if (ab) { + audit_log_format(ab, "saddr="); + audit_log_n_hex(ab, (void *)context->sockaddr, + context->sockaddr_len); + audit_log_end(ab); + } + } + for (aux = context->aux_pids; aux; aux = aux->next) { struct audit_aux_data_pids *axs = (void *)aux; @@ -1689,6 +1688,7 @@ void audit_syscall_exit(int valid, long return_code) context->aux_pids = NULL; context->target_pid = 0; context->target_sid = 0; + context->sockaddr_len = 0; kfree(context->filterkey); context->filterkey = NULL; tsk->audit_context = context; @@ -2468,22 +2468,20 @@ int __audit_fd_pair(int fd1, int fd2) */ int audit_sockaddr(int len, void *a) { - struct audit_aux_data_sockaddr *ax; struct audit_context *context = current->audit_context; if (likely(!context || context->dummy)) return 0; - ax = kmalloc(sizeof(*ax) + len, GFP_KERNEL); - if (!ax) - return -ENOMEM; - - ax->len = len; - memcpy(ax->a, a, len); + if (!context->sockaddr) { + void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL); + if (!p) + return -ENOMEM; + context->sockaddr = p; + } - ax->d.type = AUDIT_SOCKADDR; - ax->d.next = context->aux; - context->aux = (void *)ax; + context->sockaddr_len = len; + memcpy(context->sockaddr, a, len); return 0; } -- cgit v1.2.3 From f3298dc4f2277874d40cb4fc3a6e277317d6603b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 10 Dec 2008 03:16:51 -0500 Subject: sanitize audit_socketcall * don't bother with allocations * now that it can't fail, make it return void Signed-off-by: Al Viro --- kernel/auditsc.c | 66 ++++++++++++++++++++++++++++++++------------------------ 1 file changed, 38 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index c2e43ebb1b6..5cda66466e1 100644 --- a/kernel/auditsc.c +++ 
b/kernel/auditsc.c @@ -168,12 +168,6 @@ struct audit_aux_data_execve { struct mm_struct *mm; }; -struct audit_aux_data_socketcall { - struct audit_aux_data d; - int nargs; - unsigned long args[0]; -}; - struct audit_aux_data_fd_pair { struct audit_aux_data d; int fd[2]; @@ -247,6 +241,14 @@ struct audit_context { struct audit_tree_refs *trees, *first_trees; int tree_count; + int type; + union { + struct { + int nargs; + long args[6]; + } socketcall; + }; + #if AUDIT_DEBUG int put_count; int ino_count; @@ -1226,6 +1228,27 @@ static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) audit_log_format(ab, " cap_fe=%d cap_fver=%x", name->fcap.fE, name->fcap_ver); } +static void show_special(struct audit_context *context) +{ + struct audit_buffer *ab; + int i; + + ab = audit_log_start(context, GFP_KERNEL, context->type); + if (!ab) + return; + + switch (context->type) { + case AUDIT_SOCKETCALL: { + int nargs = context->socketcall.nargs; + audit_log_format(ab, "nargs=%d", nargs); + for (i = 0; i < nargs; i++) + audit_log_format(ab, " a%d=%lx", i, + context->socketcall.args[i]); + break; } + } + audit_log_end(ab); +} + static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) { const struct cred *cred; @@ -1372,13 +1395,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_execve_info(context, &ab, axi); break; } - case AUDIT_SOCKETCALL: { - struct audit_aux_data_socketcall *axs = (void *)aux; - audit_log_format(ab, "nargs=%d", axs->nargs); - for (i=0; inargs; i++) - audit_log_format(ab, " a%d=%lx", i, axs->args[i]); - break; } - case AUDIT_FD_PAIR: { struct audit_aux_data_fd_pair *axs = (void *)aux; audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]); @@ -1410,6 +1426,9 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_end(ab); } + if (context->type) + show_special(context); + if (context->sockaddr_len) { ab = audit_log_start(context, GFP_KERNEL, AUDIT_SOCKADDR); if (ab) { @@ -1689,6 +1708,7 @@ void audit_syscall_exit(int valid, long return_code) context->target_pid = 0; context->target_sid = 0; context->sockaddr_len = 0; + context->type = 0; kfree(context->filterkey); context->filterkey = NULL; tsk->audit_context = context; @@ -2406,27 +2426,17 @@ int audit_bprm(struct linux_binprm *bprm) * @nargs: number of args * @args: args array * - * Returns 0 for success or NULL context or < 0 on error. 
*/ -int audit_socketcall(int nargs, unsigned long *args) +void audit_socketcall(int nargs, unsigned long *args) { - struct audit_aux_data_socketcall *ax; struct audit_context *context = current->audit_context; if (likely(!context || context->dummy)) - return 0; - - ax = kmalloc(sizeof(*ax) + nargs * sizeof(unsigned long), GFP_KERNEL); - if (!ax) - return -ENOMEM; - - ax->nargs = nargs; - memcpy(ax->args, args, nargs * sizeof(unsigned long)); + return; - ax->d.type = AUDIT_SOCKETCALL; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; + context->type = AUDIT_SOCKETCALL; + context->socketcall.nargs = nargs; + memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long)); } /** -- cgit v1.2.3 From a33e6751003c5ade603737d828b1519d980ce392 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 10 Dec 2008 03:40:06 -0500 Subject: sanitize audit_ipc_obj() * get rid of allocations * make it return void * simplify callers Signed-off-by: Al Viro --- kernel/auditsc.c | 88 ++++++++++++++++++++++++-------------------------------- 1 file changed, 37 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 5cda66466e1..73504313264 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -247,6 +247,12 @@ struct audit_context { int nargs; long args[6]; } socketcall; + struct { + uid_t uid; + gid_t gid; + mode_t mode; + u32 osid; + } ipc; }; #if AUDIT_DEBUG @@ -605,19 +611,12 @@ static int audit_filter_rules(struct task_struct *tsk, } } /* Find ipc objects that match */ - if (ctx) { - struct audit_aux_data *aux; - for (aux = ctx->aux; aux; - aux = aux->next) { - if (aux->type == AUDIT_IPC) { - struct audit_aux_data_ipcctl *axi = (void *)aux; - if (security_audit_rule_match(axi->osid, f->type, f->op, f->lsm_rule, ctx)) { - ++result; - break; - } - } - } - } + if (!ctx || ctx->type != AUDIT_IPC) + break; + if (security_audit_rule_match(ctx->ipc.osid, + f->type, f->op, + f->lsm_rule, ctx)) + ++result; } break; case AUDIT_ARG0: @@ -1228,7 +1227,7 @@ static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) audit_log_format(ab, " cap_fe=%d cap_fver=%x", name->fcap.fE, name->fcap_ver); } -static void show_special(struct audit_context *context) +static void show_special(struct audit_context *context, int *call_panic) { struct audit_buffer *ab; int i; @@ -1245,6 +1244,23 @@ static void show_special(struct audit_context *context) audit_log_format(ab, " a%d=%lx", i, context->socketcall.args[i]); break; } + case AUDIT_IPC: { + u32 osid = context->ipc.osid; + + audit_log_format(ab, "ouid=%u ogid=%u mode=%#o", + context->ipc.uid, context->ipc.gid, context->ipc.mode); + if (osid) { + char *ctx = NULL; + u32 len; + if (security_secid_to_secctx(osid, &ctx, &len)) { + audit_log_format(ab, " osid=%u", osid); + *call_panic = 1; + } else { + audit_log_format(ab, " obj=%s", ctx); + security_release_secctx(ctx, len); + } + } + break; } } audit_log_end(ab); } @@ -1363,26 +1379,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts axi->mqstat.mq_msgsize, axi->mqstat.mq_curmsgs); break; } - case AUDIT_IPC: { - struct audit_aux_data_ipcctl *axi = (void *)aux; - audit_log_format(ab, - "ouid=%u ogid=%u mode=%#o", - axi->uid, axi->gid, axi->mode); - if (axi->osid != 0) { - char *ctx = NULL; - u32 len; - if (security_secid_to_secctx( - axi->osid, &ctx, &len)) { - audit_log_format(ab, " osid=%u", - axi->osid); - call_panic = 1; - } else { - audit_log_format(ab, " obj=%s", ctx); - security_release_secctx(ctx, len); 
- } - } - break; } - case AUDIT_IPC_SET_PERM: { struct audit_aux_data_ipcctl *axi = (void *)aux; audit_log_format(ab, @@ -1427,7 +1423,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts } if (context->type) - show_special(context); + show_special(context, &call_panic); if (context->sockaddr_len) { ab = audit_log_start(context, GFP_KERNEL, AUDIT_SOCKADDR); @@ -2349,25 +2345,15 @@ int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) * audit_ipc_obj - record audit data for ipc object * @ipcp: ipc permissions * - * Returns 0 for success or NULL context or < 0 on error. */ -int __audit_ipc_obj(struct kern_ipc_perm *ipcp) +void __audit_ipc_obj(struct kern_ipc_perm *ipcp) { - struct audit_aux_data_ipcctl *ax; struct audit_context *context = current->audit_context; - - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); - if (!ax) - return -ENOMEM; - - ax->uid = ipcp->uid; - ax->gid = ipcp->gid; - ax->mode = ipcp->mode; - security_ipc_getsecid(ipcp, &ax->osid); - ax->d.type = AUDIT_IPC; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; + context->ipc.uid = ipcp->uid; + context->ipc.gid = ipcp->gid; + context->ipc.mode = ipcp->mode; + security_ipc_getsecid(ipcp, &context->ipc.osid); + context->type = AUDIT_IPC; } /** -- cgit v1.2.3 From e816f370cbadd2afea9f1a42f232d0636137d563 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 10 Dec 2008 03:47:15 -0500 Subject: sanitize audit_ipc_set_perm() * get rid of allocations * make it return void * simplify callers Signed-off-by: Al Viro --- kernel/auditsc.c | 59 +++++++++++++++++++++++++------------------------------- 1 file changed, 26 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 73504313264..fbed62e05bc 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -151,16 +151,6 @@ struct audit_aux_data_mq_getsetattr { struct mq_attr mqstat; }; -struct audit_aux_data_ipcctl { - struct audit_aux_data d; - struct ipc_perm p; - unsigned long qbytes; - uid_t uid; - gid_t gid; - mode_t mode; - u32 osid; -}; - struct audit_aux_data_execve { struct audit_aux_data d; int argc; @@ -252,6 +242,11 @@ struct audit_context { gid_t gid; mode_t mode; u32 osid; + int has_perm; + uid_t perm_uid; + gid_t perm_gid; + mode_t perm_mode; + unsigned long qbytes; } ipc; }; @@ -1260,6 +1255,19 @@ static void show_special(struct audit_context *context, int *call_panic) security_release_secctx(ctx, len); } } + if (context->ipc.has_perm) { + audit_log_end(ab); + ab = audit_log_start(context, GFP_KERNEL, + AUDIT_IPC_SET_PERM); + audit_log_format(ab, + "qbytes=%lx ouid=%u ogid=%u mode=%#o", + context->ipc.qbytes, + context->ipc.perm_uid, + context->ipc.perm_gid, + context->ipc.perm_mode); + if (!ab) + return; + } break; } } audit_log_end(ab); @@ -1379,13 +1387,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts axi->mqstat.mq_msgsize, axi->mqstat.mq_curmsgs); break; } - case AUDIT_IPC_SET_PERM: { - struct audit_aux_data_ipcctl *axi = (void *)aux; - audit_log_format(ab, - "qbytes=%lx ouid=%u ogid=%u mode=%#o", - axi->qbytes, axi->uid, axi->gid, axi->mode); - break; } - case AUDIT_EXECVE: { struct audit_aux_data_execve *axi = (void *)aux; audit_log_execve_info(context, &ab, axi); @@ -2352,6 +2353,7 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp) context->ipc.uid = ipcp->uid; context->ipc.gid = ipcp->gid; context->ipc.mode = ipcp->mode; + context->ipc.has_perm = 0; security_ipc_getsecid(ipcp, &context->ipc.osid); context->type = AUDIT_IPC; } 
@@ -2363,26 +2365,17 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp) * @gid: msgq group id * @mode: msgq mode (permissions) * - * Returns 0 for success or NULL context or < 0 on error. + * Called only after audit_ipc_obj(). */ -int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) +void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) { - struct audit_aux_data_ipcctl *ax; struct audit_context *context = current->audit_context; - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); - if (!ax) - return -ENOMEM; - - ax->qbytes = qbytes; - ax->uid = uid; - ax->gid = gid; - ax->mode = mode; - - ax->d.type = AUDIT_IPC_SET_PERM; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; + context->ipc.qbytes = qbytes; + context->ipc.perm_uid = uid; + context->ipc.perm_gid = gid; + context->ipc.perm_mode = mode; + context->ipc.has_perm = 1; } int audit_bprm(struct linux_binprm *bprm) -- cgit v1.2.3 From 7392906ea915b9a2c14dea32b3604b4e178f82f7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 10 Dec 2008 06:58:59 -0500 Subject: sanitize audit_mq_getsetattr() * get rid of allocations * make it return void * don't duplicate parts of audit_dummy_context() Signed-off-by: Al Viro --- kernel/auditsc.c | 54 +++++++++++++++++------------------------------------- 1 file changed, 17 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index fbed62e05bc..c50178c7e24 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -145,12 +145,6 @@ struct audit_aux_data_mq_notify { struct sigevent notification; }; -struct audit_aux_data_mq_getsetattr { - struct audit_aux_data d; - mqd_t mqdes; - struct mq_attr mqstat; -}; - struct audit_aux_data_execve { struct audit_aux_data d; int argc; @@ -248,6 +242,10 @@ struct audit_context { mode_t perm_mode; unsigned long qbytes; } ipc; + struct { + mqd_t mqdes; + struct mq_attr mqstat; + } mq_getsetattr; }; #if AUDIT_DEBUG @@ -1269,6 +1267,15 @@ static void show_special(struct audit_context *context, int *call_panic) return; } break; } + case AUDIT_MQ_GETSETATTR: { + struct mq_attr *attr = &context->mq_getsetattr.mqstat; + audit_log_format(ab, + "mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld " + "mq_curmsgs=%ld ", + context->mq_getsetattr.mqdes, + attr->mq_flags, attr->mq_maxmsg, + attr->mq_msgsize, attr->mq_curmsgs); + break; } } audit_log_end(ab); } @@ -1377,16 +1384,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts axi->notification.sigev_signo); break; } - case AUDIT_MQ_GETSETATTR: { - struct audit_aux_data_mq_getsetattr *axi = (void *)aux; - audit_log_format(ab, - "mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld " - "mq_curmsgs=%ld ", - axi->mqdes, - axi->mqstat.mq_flags, axi->mqstat.mq_maxmsg, - axi->mqstat.mq_msgsize, axi->mqstat.mq_curmsgs); - break; } - case AUDIT_EXECVE: { struct audit_aux_data_execve *axi = (void *)aux; audit_log_execve_info(context, &ab, axi); @@ -2316,30 +2313,13 @@ int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification) * @mqdes: MQ descriptor * @mqstat: MQ flags * - * Returns 0 for success or NULL context or < 0 on error. 
*/ -int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) +void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) { - struct audit_aux_data_mq_getsetattr *ax; struct audit_context *context = current->audit_context; - - if (!audit_enabled) - return 0; - - if (likely(!context)) - return 0; - - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); - if (!ax) - return -ENOMEM; - - ax->mqdes = mqdes; - ax->mqstat = *mqstat; - - ax->d.type = AUDIT_MQ_GETSETATTR; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; + context->mq_getsetattr.mqdes = mqdes; + context->mq_getsetattr.mqstat = *mqstat; + context->type = AUDIT_MQ_GETSETATTR; } /** -- cgit v1.2.3 From 20114f71b27cafeb7c7e41d2b0f0b68c3fbb022b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 10 Dec 2008 07:16:12 -0500 Subject: sanitize audit_mq_notify() * don't copy_from_user() twice * don't bother with allocations * don't duplicate parts of audit_dummy_context() * make it return void Signed-off-by: Al Viro --- kernel/auditsc.c | 56 ++++++++++++++++---------------------------------------- 1 file changed, 16 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index c50178c7e24..3ece960de89 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -139,12 +139,6 @@ struct audit_aux_data_mq_sendrecv { struct timespec abs_timeout; }; -struct audit_aux_data_mq_notify { - struct audit_aux_data d; - mqd_t mqdes; - struct sigevent notification; -}; - struct audit_aux_data_execve { struct audit_aux_data d; int argc; @@ -246,6 +240,10 @@ struct audit_context { mqd_t mqdes; struct mq_attr mqstat; } mq_getsetattr; + struct { + mqd_t mqdes; + int sigev_signo; + } mq_notify; }; #if AUDIT_DEBUG @@ -1267,6 +1265,11 @@ static void show_special(struct audit_context *context, int *call_panic) return; } break; } + case AUDIT_MQ_NOTIFY: { + audit_log_format(ab, "mqdes=%d sigev_signo=%d", + context->mq_notify.mqdes, + context->mq_notify.sigev_signo); + break; } case AUDIT_MQ_GETSETATTR: { struct mq_attr *attr = &context->mq_getsetattr.mqstat; audit_log_format(ab, @@ -1376,14 +1379,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts axi->abs_timeout.tv_sec, axi->abs_timeout.tv_nsec); break; } - case AUDIT_MQ_NOTIFY: { - struct audit_aux_data_mq_notify *axi = (void *)aux; - audit_log_format(ab, - "mqdes=%d sigev_signo=%d", - axi->mqdes, - axi->notification.sigev_signo); - break; } - case AUDIT_EXECVE: { struct audit_aux_data_execve *axi = (void *)aux; audit_log_execve_info(context, &ab, axi); @@ -2274,38 +2269,19 @@ int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len, * @mqdes: MQ descriptor * @u_notification: Notification event * - * Returns 0 for success or NULL context or < 0 on error. 
*/ -int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification) +void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification) { - struct audit_aux_data_mq_notify *ax; struct audit_context *context = current->audit_context; - if (!audit_enabled) - return 0; - - if (likely(!context)) - return 0; - - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); - if (!ax) - return -ENOMEM; - - if (u_notification != NULL) { - if (copy_from_user(&ax->notification, u_notification, sizeof(ax->notification))) { - kfree(ax); - return -EFAULT; - } - } else - memset(&ax->notification, 0, sizeof(ax->notification)); - - ax->mqdes = mqdes; + if (notification) + context->mq_notify.sigev_signo = notification->sigev_signo; + else + context->mq_notify.sigev_signo = 0; - ax->d.type = AUDIT_MQ_NOTIFY; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; + context->mq_notify.mqdes = mqdes; + context->type = AUDIT_MQ_NOTIFY; } /** -- cgit v1.2.3 From c32c8af43b9adde8d6f938d8e6328c13b8de79ac Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 Dec 2008 03:46:48 -0500 Subject: sanitize AUDIT_MQ_SENDRECV * logging the original value of *msg_prio in mq_timedreceive(2) is insane - the argument is write-only (i.e. syscall always ignores the original value and only overwrites it). * merge __audit_mq_timed{send,receive} * don't do copy_from_user() twice * don't mess with allocations in auditsc part * ... and don't bother checking !audit_enabled and !context in there - we'd already checked for audit_dummy_context(). Signed-off-by: Al Viro --- kernel/auditsc.c | 127 +++++++++++++------------------------------------------ 1 file changed, 29 insertions(+), 98 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3ece960de89..140c4745347 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -131,14 +131,6 @@ struct audit_aux_data_mq_open { struct mq_attr attr; }; -struct audit_aux_data_mq_sendrecv { - struct audit_aux_data d; - mqd_t mqdes; - size_t msg_len; - unsigned int msg_prio; - struct timespec abs_timeout; -}; - struct audit_aux_data_execve { struct audit_aux_data d; int argc; @@ -244,6 +236,12 @@ struct audit_context { mqd_t mqdes; int sigev_signo; } mq_notify; + struct { + mqd_t mqdes; + size_t msg_len; + unsigned int msg_prio; + struct timespec abs_timeout; + } mq_sendrecv; }; #if AUDIT_DEBUG @@ -1265,6 +1263,16 @@ static void show_special(struct audit_context *context, int *call_panic) return; } break; } + case AUDIT_MQ_SENDRECV: { + audit_log_format(ab, + "mqdes=%d msg_len=%zd msg_prio=%u " + "abs_timeout_sec=%ld abs_timeout_nsec=%ld", + context->mq_sendrecv.mqdes, + context->mq_sendrecv.msg_len, + context->mq_sendrecv.msg_prio, + context->mq_sendrecv.abs_timeout.tv_sec, + context->mq_sendrecv.abs_timeout.tv_nsec); + break; } case AUDIT_MQ_NOTIFY: { audit_log_format(ab, "mqdes=%d sigev_signo=%d", context->mq_notify.mqdes, @@ -1370,15 +1378,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts axi->attr.mq_curmsgs); break; } - case AUDIT_MQ_SENDRECV: { - struct audit_aux_data_mq_sendrecv *axi = (void *)aux; - audit_log_format(ab, - "mqdes=%d msg_len=%zd msg_prio=%u " - "abs_timeout_sec=%ld abs_timeout_nsec=%ld", - axi->mqdes, axi->msg_len, axi->msg_prio, - axi->abs_timeout.tv_sec, axi->abs_timeout.tv_nsec); - break; } - case AUDIT_EXECVE: { struct audit_aux_data_execve *axi = (void *)aux; audit_log_execve_info(context, &ab, axi); @@ -2171,97 +2170,29 @@ int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user 
*u_attr) } /** - * __audit_mq_timedsend - record audit data for a POSIX MQ timed send + * __audit_mq_sendrecv - record audit data for a POSIX MQ timed send/receive * @mqdes: MQ descriptor * @msg_len: Message length * @msg_prio: Message priority - * @u_abs_timeout: Message timeout in absolute time - * - * Returns 0 for success or NULL context or < 0 on error. - */ -int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, - const struct timespec __user *u_abs_timeout) -{ - struct audit_aux_data_mq_sendrecv *ax; - struct audit_context *context = current->audit_context; - - if (!audit_enabled) - return 0; - - if (likely(!context)) - return 0; - - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); - if (!ax) - return -ENOMEM; - - if (u_abs_timeout != NULL) { - if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) { - kfree(ax); - return -EFAULT; - } - } else - memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout)); - - ax->mqdes = mqdes; - ax->msg_len = msg_len; - ax->msg_prio = msg_prio; - - ax->d.type = AUDIT_MQ_SENDRECV; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; -} - -/** - * __audit_mq_timedreceive - record audit data for a POSIX MQ timed receive - * @mqdes: MQ descriptor - * @msg_len: Message length - * @u_msg_prio: Message priority - * @u_abs_timeout: Message timeout in absolute time + * @abs_timeout: Message timeout in absolute time * - * Returns 0 for success or NULL context or < 0 on error. */ -int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len, - unsigned int __user *u_msg_prio, - const struct timespec __user *u_abs_timeout) +void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, + const struct timespec *abs_timeout) { - struct audit_aux_data_mq_sendrecv *ax; struct audit_context *context = current->audit_context; + struct timespec *p = &context->mq_sendrecv.abs_timeout; - if (!audit_enabled) - return 0; - - if (likely(!context)) - return 0; - - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); - if (!ax) - return -ENOMEM; - - if (u_msg_prio != NULL) { - if (get_user(ax->msg_prio, u_msg_prio)) { - kfree(ax); - return -EFAULT; - } - } else - ax->msg_prio = 0; - - if (u_abs_timeout != NULL) { - if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) { - kfree(ax); - return -EFAULT; - } - } else - memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout)); + if (abs_timeout) + memcpy(p, abs_timeout, sizeof(struct timespec)); + else + memset(p, 0, sizeof(struct timespec)); - ax->mqdes = mqdes; - ax->msg_len = msg_len; + context->mq_sendrecv.mqdes = mqdes; + context->mq_sendrecv.msg_len = msg_len; + context->mq_sendrecv.msg_prio = msg_prio; - ax->d.type = AUDIT_MQ_SENDRECV; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; + context->type = AUDIT_MQ_SENDRECV; } /** -- cgit v1.2.3 From 564f6993ffef656aebaf46cf2f1f6cb4f5c97207 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 Dec 2008 04:02:26 -0500 Subject: sanitize audit_mq_open() * don't bother with allocations * don't do double copy_from_user() * don't duplicate parts of check for audit_dummy_context() Signed-off-by: Al Viro --- kernel/auditsc.c | 65 ++++++++++++++++++++------------------------------------ 1 file changed, 23 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 140c4745347..83e946f1cdd 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -124,13 +124,6 @@ struct audit_aux_data { /* Number of target pids per aux struct. 
*/ #define AUDIT_AUX_PIDS 16 -struct audit_aux_data_mq_open { - struct audit_aux_data d; - int oflag; - mode_t mode; - struct mq_attr attr; -}; - struct audit_aux_data_execve { struct audit_aux_data d; int argc; @@ -242,6 +235,11 @@ struct audit_context { unsigned int msg_prio; struct timespec abs_timeout; } mq_sendrecv; + struct { + int oflag; + mode_t mode; + struct mq_attr attr; + } mq_open; }; #if AUDIT_DEBUG @@ -1263,6 +1261,16 @@ static void show_special(struct audit_context *context, int *call_panic) return; } break; } + case AUDIT_MQ_OPEN: { + audit_log_format(ab, + "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld " + "mq_msgsize=%ld mq_curmsgs=%ld", + context->mq_open.oflag, context->mq_open.mode, + context->mq_open.attr.mq_flags, + context->mq_open.attr.mq_maxmsg, + context->mq_open.attr.mq_msgsize, + context->mq_open.attr.mq_curmsgs); + break; } case AUDIT_MQ_SENDRECV: { audit_log_format(ab, "mqdes=%d msg_len=%zd msg_prio=%u " @@ -1368,15 +1376,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts continue; /* audit_panic has been called */ switch (aux->type) { - case AUDIT_MQ_OPEN: { - struct audit_aux_data_mq_open *axi = (void *)aux; - audit_log_format(ab, - "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld " - "mq_msgsize=%ld mq_curmsgs=%ld", - axi->oflag, axi->mode, axi->attr.mq_flags, - axi->attr.mq_maxmsg, axi->attr.mq_msgsize, - axi->attr.mq_curmsgs); - break; } case AUDIT_EXECVE: { struct audit_aux_data_execve *axi = (void *)aux; @@ -2135,38 +2134,20 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid) * @mode: mode bits * @u_attr: queue attributes * - * Returns 0 for success or NULL context or < 0 on error. */ -int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr) +void __audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr) { - struct audit_aux_data_mq_open *ax; struct audit_context *context = current->audit_context; - if (!audit_enabled) - return 0; - - if (likely(!context)) - return 0; - - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); - if (!ax) - return -ENOMEM; - - if (u_attr != NULL) { - if (copy_from_user(&ax->attr, u_attr, sizeof(ax->attr))) { - kfree(ax); - return -EFAULT; - } - } else - memset(&ax->attr, 0, sizeof(ax->attr)); + if (attr) + memcpy(&context->mq_open.attr, attr, sizeof(struct mq_attr)); + else + memset(&context->mq_open.attr, 0, sizeof(struct mq_attr)); - ax->oflag = oflag; - ax->mode = mode; + context->mq_open.oflag = oflag; + context->mq_open.mode = mode; - ax->d.type = AUDIT_MQ_OPEN; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; + context->type = AUDIT_MQ_OPEN; } /** -- cgit v1.2.3 From 157cf649a735a2f7e8dba0ed08e6e38b6c30d886 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 Dec 2008 04:57:47 -0500 Subject: sanitize audit_fd_pair() * no allocations * return void Signed-off-by: Al Viro --- kernel/auditsc.c | 44 ++++++++++++++------------------------------ 1 file changed, 14 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 83e946f1cdd..327e65d5067 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -131,11 +131,6 @@ struct audit_aux_data_execve { struct mm_struct *mm; }; -struct audit_aux_data_fd_pair { - struct audit_aux_data d; - int fd[2]; -}; - struct audit_aux_data_pids { struct audit_aux_data d; pid_t target_pid[AUDIT_AUX_PIDS]; @@ -241,6 +236,7 @@ struct audit_context { struct mq_attr attr; } mq_open; }; + int fds[2]; #if AUDIT_DEBUG int put_count; @@ -1382,11 +1378,6 @@ static void 
audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_execve_info(context, &ab, axi); break; } - case AUDIT_FD_PAIR: { - struct audit_aux_data_fd_pair *axs = (void *)aux; - audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]); - break; } - @@ -1416,6 +1407,15 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts if (context->type) show_special(context, &call_panic); + if (context->fds[0] >= 0) { + ab = audit_log_start(context, GFP_KERNEL, AUDIT_FD_PAIR); + if (ab) { + audit_log_format(ab, "fd0=%d fd1=%d", + context->fds[0], context->fds[1]); + audit_log_end(ab); + } + } + if (context->sockaddr_len) { ab = audit_log_start(context, GFP_KERNEL, AUDIT_SOCKADDR); if (ab) { @@ -1696,6 +1696,7 @@ void audit_syscall_exit(int valid, long return_code) context->target_sid = 0; context->sockaddr_len = 0; context->type = 0; + context->fds[0] = -1; kfree(context->filterkey); context->filterkey = NULL; tsk->audit_context = context; @@ -2291,29 +2292,12 @@ void audit_socketcall(int nargs, unsigned long *args) * @fd1: the first file descriptor * @fd2: the second file descriptor * - * Returns 0 for success or NULL context or < 0 on error. */ -int __audit_fd_pair(int fd1, int fd2) +void __audit_fd_pair(int fd1, int fd2) { struct audit_context *context = current->audit_context; - struct audit_aux_data_fd_pair *ax; - - if (likely(!context)) { - return 0; - } - - ax = kmalloc(sizeof(*ax), GFP_KERNEL); - if (!ax) { - return -ENOMEM; - } - - ax->fd[0] = fd1; - ax->fd[1] = fd2; - - ax->d.type = AUDIT_FD_PAIR; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; + context->fds[0] = fd1; + context->fds[1] = fd2; } /** -- cgit v1.2.3 From 57f71a0af4244d9ba3c0bce74b1d2e66e8d520bd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 4 Jan 2009 14:52:57 -0500 Subject: sanitize audit_log_capset() * no allocations * return void * don't duplicate checks for dummy context Signed-off-by: Al Viro --- kernel/auditsc.c | 44 ++++++++++++++++---------------------------- kernel/capability.c | 4 +--- 2 files changed, 17 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 327e65d5067..c76a58215f5 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -235,6 +235,10 @@ struct audit_context { mode_t mode; struct mq_attr attr; } mq_open; + struct { + pid_t pid; + struct audit_cap_data cap; + } capset; }; int fds[2]; @@ -1291,6 +1295,12 @@ static void show_special(struct audit_context *context, int *call_panic) attr->mq_flags, attr->mq_maxmsg, attr->mq_msgsize, attr->mq_curmsgs); break; } + case AUDIT_CAPSET: { + audit_log_format(ab, "pid=%d", context->capset.pid); + audit_log_cap(ab, "cap_pi", &context->capset.cap.inheritable); + audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted); + audit_log_cap(ab, "cap_pe", &context->capset.cap.effective); + break; } } audit_log_end(ab); } @@ -1392,14 +1402,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_cap(ab, "new_pe", &axs->new_pcap.effective); break; } - case AUDIT_CAPSET: { - struct audit_aux_data_capset *axs = (void *)aux; - audit_log_format(ab, "pid=%d", axs->pid); - audit_log_cap(ab, "cap_pi", &axs->cap.inheritable); - audit_log_cap(ab, "cap_pp", &axs->cap.permitted); - audit_log_cap(ab, "cap_pe", &axs->cap.effective); - break; } - } audit_log_end(ab); } @@ -2456,29 +2458,15 @@ 
int __audit_log_bprm_fcaps(struct linux_binprm *bprm, * Record the aguments userspace sent to sys_capset for later printing by the * audit system if applicable */ -int __audit_log_capset(pid_t pid, +void __audit_log_capset(pid_t pid, const struct cred *new, const struct cred *old) { - struct audit_aux_data_capset *ax; struct audit_context *context = current->audit_context; - - if (likely(!audit_enabled || !context || context->dummy)) - return 0; - - ax = kmalloc(sizeof(*ax), GFP_KERNEL); - if (!ax) - return -ENOMEM; - - ax->d.type = AUDIT_CAPSET; - ax->d.next = context->aux; - context->aux = (void *)ax; - - ax->pid = pid; - ax->cap.effective = new->cap_effective; - ax->cap.inheritable = new->cap_effective; - ax->cap.permitted = new->cap_permitted; - - return 0; + context->capset.pid = pid; + context->capset.cap.effective = new->cap_effective; + context->capset.cap.inheritable = new->cap_effective; + context->capset.cap.permitted = new->cap_permitted; + context->type = AUDIT_CAPSET; } /** diff --git a/kernel/capability.c b/kernel/capability.c index 36b4b4daebe..c598d9d5be4 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -280,9 +280,7 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) if (ret < 0) goto error; - ret = audit_log_capset(pid, new, current_cred()); - if (ret < 0) - return ret; + audit_log_capset(pid, new, current_cred()); return commit_creds(new); -- cgit v1.2.3 From 1a9d0797b8977d413435277bf9661efbbd584693 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 Dec 2008 12:04:02 -0500 Subject: audit_update_lsm_rules() misses the audit_inode_hash[] ones Signed-off-by: Al Viro --- kernel/auditfilter.c | 77 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 47 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 9fd85a4640a..0febaa0f784 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1778,6 +1778,41 @@ unlock_and_return: return result; } +static int update_lsm_rule(struct audit_entry *entry) +{ + struct audit_entry *nentry; + struct audit_watch *watch; + struct audit_tree *tree; + int err = 0; + + if (!security_audit_rule_known(&entry->rule)) + return 0; + + watch = entry->rule.watch; + tree = entry->rule.tree; + nentry = audit_dupe_rule(&entry->rule, watch); + if (IS_ERR(nentry)) { + /* save the first error encountered for the + * return value */ + err = PTR_ERR(nentry); + audit_panic("error updating LSM filters"); + if (watch) + list_del(&entry->rule.rlist); + list_del_rcu(&entry->list); + } else { + if (watch) { + list_add(&nentry->rule.rlist, &watch->rules); + list_del(&entry->rule.rlist); + } else if (tree) + list_replace_init(&entry->rule.rlist, + &nentry->rule.rlist); + list_replace_rcu(&entry->list, &nentry->list); + } + call_rcu(&entry->rcu, audit_free_rule_rcu); + + return err; +} + /* This function will re-initialize the lsm_rule field of all applicable rules. * It will traverse the filter lists serarching for rules that contain LSM * specific filter fields. When such a rule is found, it is copied, the @@ -1785,42 +1820,24 @@ unlock_and_return: * updated rule. 
*/ int audit_update_lsm_rules(void) { - struct audit_entry *entry, *n, *nentry; - struct audit_watch *watch; - struct audit_tree *tree; + struct audit_entry *e, *n; int i, err = 0; /* audit_filter_mutex synchronizes the writers */ mutex_lock(&audit_filter_mutex); for (i = 0; i < AUDIT_NR_FILTERS; i++) { - list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) { - if (!security_audit_rule_known(&entry->rule)) - continue; - - watch = entry->rule.watch; - tree = entry->rule.tree; - nentry = audit_dupe_rule(&entry->rule, watch); - if (IS_ERR(nentry)) { - /* save the first error encountered for the - * return value */ - if (!err) - err = PTR_ERR(nentry); - audit_panic("error updating LSM filters"); - if (watch) - list_del(&entry->rule.rlist); - list_del_rcu(&entry->list); - } else { - if (watch) { - list_add(&nentry->rule.rlist, - &watch->rules); - list_del(&entry->rule.rlist); - } else if (tree) - list_replace_init(&entry->rule.rlist, - &nentry->rule.rlist); - list_replace_rcu(&entry->list, &nentry->list); - } - call_rcu(&entry->rcu, audit_free_rule_rcu); + list_for_each_entry_safe(e, n, &audit_filter_list[i], list) { + int res = update_lsm_rule(e); + if (!err) + err = res; + } + } + for (i=0; i< AUDIT_INODE_BUCKETS; i++) { + list_for_each_entry_safe(e, n, &audit_inode_hash[i], list) { + int res = update_lsm_rule(e); + if (!err) + err = res; } } -- cgit v1.2.3 From 0590b9335a1c72a3f0defcc6231287f7817e07c8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 Dec 2008 23:45:27 -0500 Subject: fixing audit rule ordering mess, part 1 Problem: ordering between the rules on exit chain is currently lost; all watch and inode rules are listed after everything else _and_ exit,never on one kind doesn't stop exit,always on another from being matched. Solution: assign priorities to rules, keep track of the current highest-priority matching rule and its result (always/never). Signed-off-by: Al Viro --- kernel/audit.h | 5 +--- kernel/auditfilter.c | 17 +++++++++-- kernel/auditsc.c | 79 ++++++++++++++++++++++++++++------------------------ 3 files changed, 58 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.h b/kernel/audit.h index 9d6717412fe..16f18cac661 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -159,11 +159,8 @@ static inline int audit_signal_info(int sig, struct task_struct *t) return __audit_signal_info(sig, t); return 0; } -extern enum audit_state audit_filter_inodes(struct task_struct *, - struct audit_context *); -extern void audit_set_auditable(struct audit_context *); +extern void audit_filter_inodes(struct task_struct *, struct audit_context *); #else #define audit_signal_info(s,t) AUDIT_DISABLED #define audit_filter_inodes(t,c) AUDIT_DISABLED -#define audit_set_auditable(c) #endif diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 0febaa0f784..995a2e86808 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -919,6 +919,7 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old, new->action = old->action; for (i = 0; i < AUDIT_BITMASK_SIZE; i++) new->mask[i] = old->mask[i]; + new->prio = old->prio; new->buflen = old->buflen; new->inode_f = old->inode_f; new->watch = NULL; @@ -987,9 +988,8 @@ static void audit_update_watch(struct audit_parent *parent, /* If the update involves invalidating rules, do the inode-based * filtering now, so we don't omit records. 
*/ - if (invalidating && current->audit_context && - audit_filter_inodes(current, current->audit_context) == AUDIT_RECORD_CONTEXT) - audit_set_auditable(current->audit_context); + if (invalidating && current->audit_context) + audit_filter_inodes(current, current->audit_context); nwatch = audit_dupe_watch(owatch); if (IS_ERR(nwatch)) { @@ -1258,6 +1258,9 @@ static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp, return ret; } +static u64 prio_low = ~0ULL/2; +static u64 prio_high = ~0ULL/2 - 1; + /* Add rule to given filterlist if not a duplicate. */ static inline int audit_add_rule(struct audit_entry *entry, struct list_head *list) @@ -1319,6 +1322,14 @@ static inline int audit_add_rule(struct audit_entry *entry, } } + entry->rule.prio = ~0ULL; + if (entry->rule.listnr == AUDIT_FILTER_EXIT) { + if (entry->rule.flags & AUDIT_FILTER_PREPEND) + entry->rule.prio = ++prio_high; + else + entry->rule.prio = --prio_low; + } + if (entry->rule.flags & AUDIT_FILTER_PREPEND) { list_add_rcu(&entry->list, list); entry->rule.flags &= ~AUDIT_FILTER_PREPEND; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index c76a58215f5..19d2c2747c8 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -165,14 +165,14 @@ struct audit_tree_refs { struct audit_context { int dummy; /* must be the first element */ int in_syscall; /* 1 if task is in a syscall */ - enum audit_state state; + enum audit_state state, current_state; unsigned int serial; /* serial number for record */ struct timespec ctime; /* time of syscall entry */ int major; /* syscall number */ unsigned long argv[4]; /* syscall arguments */ int return_valid; /* return code is valid */ long return_code;/* syscall return code */ - int auditable; /* 1 if record should be written */ + u64 prio; int name_count; struct audit_names names[AUDIT_NAMES]; char * filterkey; /* key for rule that triggered record */ @@ -630,8 +630,16 @@ static int audit_filter_rules(struct task_struct *tsk, return 0; } } - if (rule->filterkey && ctx) - ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC); + + if (ctx) { + if (rule->prio <= ctx->prio) + return 0; + if (rule->filterkey) { + kfree(ctx->filterkey); + ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC); + } + ctx->prio = rule->prio; + } switch (rule->action) { case AUDIT_NEVER: *state = AUDIT_DISABLED; break; case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; @@ -685,6 +693,7 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, audit_filter_rules(tsk, &e->rule, ctx, NULL, &state)) { rcu_read_unlock(); + ctx->current_state = state; return state; } } @@ -698,15 +707,14 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, * buckets applicable to the inode numbers in audit_names[]. * Regarding audit_state, same rules apply as for audit_filter_syscall(). 
*/ -enum audit_state audit_filter_inodes(struct task_struct *tsk, - struct audit_context *ctx) +void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx) { int i; struct audit_entry *e; enum audit_state state; if (audit_pid && tsk->tgid == audit_pid) - return AUDIT_DISABLED; + return; rcu_read_lock(); for (i = 0; i < ctx->name_count; i++) { @@ -723,17 +731,20 @@ enum audit_state audit_filter_inodes(struct task_struct *tsk, if ((e->rule.mask[word] & bit) == bit && audit_filter_rules(tsk, &e->rule, ctx, n, &state)) { rcu_read_unlock(); - return state; + ctx->current_state = state; + return; } } } rcu_read_unlock(); - return AUDIT_BUILD_CONTEXT; } -void audit_set_auditable(struct audit_context *ctx) +static void audit_set_auditable(struct audit_context *ctx) { - ctx->auditable = 1; + if (!ctx->prio) { + ctx->prio = 1; + ctx->current_state = AUDIT_RECORD_CONTEXT; + } } static inline struct audit_context *audit_get_context(struct task_struct *tsk, @@ -764,23 +775,11 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, else context->return_code = return_code; - if (context->in_syscall && !context->dummy && !context->auditable) { - enum audit_state state; - - state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]); - if (state == AUDIT_RECORD_CONTEXT) { - context->auditable = 1; - goto get_context; - } - - state = audit_filter_inodes(tsk, context); - if (state == AUDIT_RECORD_CONTEXT) - context->auditable = 1; - + if (context->in_syscall && !context->dummy) { + audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]); + audit_filter_inodes(tsk, context); } -get_context: - tsk->audit_context = NULL; return context; } @@ -790,8 +789,7 @@ static inline void audit_free_names(struct audit_context *context) int i; #if AUDIT_DEBUG == 2 - if (context->auditable - ||context->put_count + context->ino_count != context->name_count) { + if (context->put_count + context->ino_count != context->name_count) { printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d" " name_count=%d put_count=%d" " ino_count=%d [NOT freeing]\n", @@ -842,6 +840,7 @@ static inline void audit_zero_context(struct audit_context *context, { memset(context, 0, sizeof(*context)); context->state = state; + context->prio = state == AUDIT_RECORD_CONTEXT ? 
~0ULL : 0; } static inline struct audit_context *audit_alloc_context(enum audit_state state) @@ -1543,7 +1542,7 @@ void audit_free(struct task_struct *tsk) * We use GFP_ATOMIC here because we might be doing this * in the context of the idle thread */ /* that can happen only if we are called from do_exit() */ - if (context->in_syscall && context->auditable) + if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) audit_log_exit(context, tsk); audit_free_context(context); @@ -1627,15 +1626,17 @@ void audit_syscall_entry(int arch, int major, state = context->state; context->dummy = !audit_n_rules; - if (!context->dummy && (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT)) + if (!context->dummy && state == AUDIT_BUILD_CONTEXT) { + context->prio = 0; state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]); + } if (likely(state == AUDIT_DISABLED)) return; context->serial = 0; context->ctime = CURRENT_TIME; context->in_syscall = 1; - context->auditable = !!(state == AUDIT_RECORD_CONTEXT); + context->current_state = state; context->ppid = 0; } @@ -1643,17 +1644,20 @@ void audit_finish_fork(struct task_struct *child) { struct audit_context *ctx = current->audit_context; struct audit_context *p = child->audit_context; - if (!p || !ctx || !ctx->auditable) + if (!p || !ctx) + return; + if (!ctx->in_syscall || ctx->current_state != AUDIT_RECORD_CONTEXT) return; p->arch = ctx->arch; p->major = ctx->major; memcpy(p->argv, ctx->argv, sizeof(ctx->argv)); p->ctime = ctx->ctime; p->dummy = ctx->dummy; - p->auditable = ctx->auditable; p->in_syscall = ctx->in_syscall; p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL); p->ppid = current->pid; + p->prio = ctx->prio; + p->current_state = ctx->current_state; } /** @@ -1677,11 +1681,11 @@ void audit_syscall_exit(int valid, long return_code) if (likely(!context)) return; - if (context->in_syscall && context->auditable) + if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) audit_log_exit(context, tsk); context->in_syscall = 0; - context->auditable = 0; + context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; if (context->previous) { struct audit_context *new_context = context->previous; @@ -2091,7 +2095,10 @@ int auditsc_get_stamp(struct audit_context *ctx, t->tv_sec = ctx->ctime.tv_sec; t->tv_nsec = ctx->ctime.tv_nsec; *serial = ctx->serial; - ctx->auditable = 1; + if (!ctx->prio) { + ctx->prio = 1; + ctx->current_state = AUDIT_RECORD_CONTEXT; + } return 1; } -- cgit v1.2.3 From e45aa212ea81d39b38ba158df344dc3a500153e5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 15 Dec 2008 01:17:50 -0500 Subject: audit rules ordering, part 2 Fix the actual rule listing; add per-type lists _not_ used for matching, with all exit,... sitting on one such list. Simplifies "do something for all rules" logic while we are at it...
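To make the two-list arrangement concrete, here is a self-contained userspace sketch; the hand-rolled list and the names (rule, match, bytype, exit_rules) are stand-ins for illustration, not the kernel's list API. Each rule is linked into a bucket list used for matching and into one flat per-type list used for enumeration, so "do something for all rules" becomes a single walk:

#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };

static void list_init(struct list_head *h) { h->next = h->prev = h; }

static void list_add_tail(struct list_head *n, struct list_head *h)
{
	n->prev = h->prev;
	n->next = h;
	h->prev->next = n;
	h->prev = n;
}

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct rule {
	struct list_head match;		/* hash bucket / filter list, used for matching */
	struct list_head bytype;	/* flat per-type list, used only for listing */
	const char *name;
};

int main(void)
{
	struct list_head bucket0, bucket1, exit_rules;
	struct rule a = { .name = "watch on /etc" };
	struct rule b = { .name = "plain exit,always rule" };

	list_init(&bucket0);
	list_init(&bucket1);
	list_init(&exit_rules);

	/* matching scatters rules across hash buckets... */
	list_add_tail(&a.match, &bucket0);
	list_add_tail(&b.match, &bucket1);
	/* ...but every exit rule also sits on one flat list */
	list_add_tail(&a.bytype, &exit_rules);
	list_add_tail(&b.bytype, &exit_rules);

	for (struct list_head *p = exit_rules.next; p != &exit_rules; p = p->next)
		printf("%s\n", container_of(p, struct rule, bytype)->name);
	return 0;
}

The price is that removal has to unlink a rule from both lists, which is what the added list_del(&entry->rule.list) calls in the diff below take care of.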
Signed-off-by: Al Viro --- kernel/audit_tree.c | 1 + kernel/auditfilter.c | 95 +++++++++++++++++++++------------------------------- 2 files changed, 40 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 8b509441f49..48bddad2a3d 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -450,6 +450,7 @@ static void kill_rules(struct audit_tree *tree) audit_log_end(ab); rule->tree = NULL; list_del_rcu(&entry->list); + list_del(&entry->rule.list); call_rcu(&entry->rcu, audit_free_rule_rcu); } } diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 995a2e86808..5d4edc6f7a3 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -86,6 +86,14 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { #error Fix audit_filter_list initialiser #endif }; +static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = { + LIST_HEAD_INIT(audit_rules_list[0]), + LIST_HEAD_INIT(audit_rules_list[1]), + LIST_HEAD_INIT(audit_rules_list[2]), + LIST_HEAD_INIT(audit_rules_list[3]), + LIST_HEAD_INIT(audit_rules_list[4]), + LIST_HEAD_INIT(audit_rules_list[5]), +}; DEFINE_MUTEX(audit_filter_mutex); @@ -1007,12 +1015,15 @@ static void audit_update_watch(struct audit_parent *parent, list_del_rcu(&oentry->list); nentry = audit_dupe_rule(&oentry->rule, nwatch); - if (IS_ERR(nentry)) + if (IS_ERR(nentry)) { + list_del(&oentry->rule.list); audit_panic("error updating watch, removing"); - else { + } else { int h = audit_hash_ino((u32)ino); list_add(&nentry->rule.rlist, &nwatch->rules); list_add_rcu(&nentry->list, &audit_inode_hash[h]); + list_replace(&oentry->rule.list, + &nentry->rule.list); } call_rcu(&oentry->rcu, audit_free_rule_rcu); @@ -1077,6 +1088,7 @@ static void audit_remove_parent_watches(struct audit_parent *parent) audit_log_end(ab); } list_del(&r->rlist); + list_del(&r->list); list_del_rcu(&e->list); call_rcu(&e->rcu, audit_free_rule_rcu); } @@ -1331,9 +1343,13 @@ static inline int audit_add_rule(struct audit_entry *entry, } if (entry->rule.flags & AUDIT_FILTER_PREPEND) { + list_add(&entry->rule.list, + &audit_rules_list[entry->rule.listnr]); list_add_rcu(&entry->list, list); entry->rule.flags &= ~AUDIT_FILTER_PREPEND; } else { + list_add_tail(&entry->rule.list, + &audit_rules_list[entry->rule.listnr]); list_add_tail_rcu(&entry->list, list); } #ifdef CONFIG_AUDITSYSCALL @@ -1415,6 +1431,7 @@ static inline int audit_del_rule(struct audit_entry *entry, audit_remove_tree_rule(&e->rule); list_del_rcu(&e->list); + list_del(&e->rule.list); call_rcu(&e->rcu, audit_free_rule_rcu); #ifdef CONFIG_AUDITSYSCALL @@ -1443,30 +1460,16 @@ out: static void audit_list(int pid, int seq, struct sk_buff_head *q) { struct sk_buff *skb; - struct audit_entry *entry; + struct audit_krule *r; int i; /* This is a blocking read, so use audit_filter_mutex instead of rcu * iterator to sync with list writers. 
*/ for (i=0; i<AUDIT_NR_FILTERS; i++) { - list_for_each_entry(entry, &audit_filter_list[i], list) { - struct audit_rule *rule; - rule = audit_krule_to_rule(&entry->rule); - if (unlikely(!rule)) - break; - skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1, - rule, sizeof(*rule)); - if (skb) - skb_queue_tail(q, skb); - kfree(rule); - } - } - for (i = 0; i < AUDIT_INODE_BUCKETS; i++) { - list_for_each_entry(entry, &audit_inode_hash[i], list) { + list_for_each_entry(r, &audit_rules_list[i], list) { struct audit_rule *rule; - rule = audit_krule_to_rule(&entry->rule); + rule = audit_krule_to_rule(r); if (unlikely(!rule)) break; skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1, @@ -1485,30 +1488,16 @@ static void audit_list(int pid, int seq, struct sk_buff_head *q) static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) { struct sk_buff *skb; - struct audit_entry *e; + struct audit_krule *r; int i; /* This is a blocking read, so use audit_filter_mutex instead of rcu * iterator to sync with list writers. */ for (i=0; i<AUDIT_NR_FILTERS; i++) { - list_for_each_entry(e, &audit_filter_list[i], list) { - struct audit_rule_data *data; - data = audit_krule_to_data(&e->rule); - if (unlikely(!data)) - break; - skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, - data, sizeof(*data) + data->buflen); - if (skb) - skb_queue_tail(q, skb); - kfree(data); - } - } - for (i=0; i< AUDIT_INODE_BUCKETS; i++) { - list_for_each_entry(e, &audit_inode_hash[i], list) { + list_for_each_entry(r, &audit_rules_list[i], list) { struct audit_rule_data *data; - data = audit_krule_to_data(&e->rule); + data = audit_krule_to_data(r); if (unlikely(!data)) break; skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, @@ -1789,35 +1778,37 @@ unlock_and_return: return result; } -static int update_lsm_rule(struct audit_entry *entry) +static int update_lsm_rule(struct audit_krule *r) { + struct audit_entry *entry = container_of(r, struct audit_entry, rule); struct audit_entry *nentry; struct audit_watch *watch; struct audit_tree *tree; int err = 0; - if (!security_audit_rule_known(&entry->rule)) + if (!security_audit_rule_known(r)) return 0; - watch = entry->rule.watch; - tree = entry->rule.tree; - nentry = audit_dupe_rule(&entry->rule, watch); + watch = r->watch; + tree = r->tree; + nentry = audit_dupe_rule(r, watch); if (IS_ERR(nentry)) { /* save the first error encountered for the * return value */ err = PTR_ERR(nentry); audit_panic("error updating LSM filters"); if (watch) - list_del(&entry->rule.rlist); + list_del(&r->rlist); list_del_rcu(&entry->list); + list_del(&r->list); } else { if (watch) { list_add(&nentry->rule.rlist, &watch->rules); - list_del(&entry->rule.rlist); + list_del(&r->rlist); } else if (tree) - list_replace_init(&entry->rule.rlist, - &nentry->rule.rlist); + list_replace_init(&r->rlist, &nentry->rule.rlist); list_replace_rcu(&entry->list, &nentry->list); + list_replace(&r->list, &nentry->rule.list); } call_rcu(&entry->rcu, audit_free_rule_rcu); @@ -1831,27 +1822,19 @@ static int update_lsm_rule(struct audit_entry *entry) * updated rule.
*/ int audit_update_lsm_rules(void) { - struct audit_entry *e, *n; + struct audit_krule *r, *n; int i, err = 0; /* audit_filter_mutex synchronizes the writers */ mutex_lock(&audit_filter_mutex); for (i = 0; i < AUDIT_NR_FILTERS; i++) { - list_for_each_entry_safe(e, n, &audit_filter_list[i], list) { - int res = update_lsm_rule(e); - if (!err) - err = res; - } - } - for (i=0; i< AUDIT_INODE_BUCKETS; i++) { - list_for_each_entry_safe(e, n, &audit_inode_hash[i], list) { - int res = update_lsm_rule(e); + list_for_each_entry_safe(r, n, &audit_rules_list[i], list) { + int res = update_lsm_rule(r); if (!err) err = res; } } - mutex_unlock(&audit_filter_mutex); return err; -- cgit v1.2.3 From e048e02c89db7bd49d1a5fac77a11c8fb3603087 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 16 Dec 2008 03:51:22 -0500 Subject: make sure that filterkey of task,always rules is reported Signed-off-by: Al Viro --- kernel/auditsc.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 19d2c2747c8..8cbddff6c28 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -652,7 +652,7 @@ static int audit_filter_rules(struct task_struct *tsk, * completely disabled for this task. Since we only have the task * structure at this point, we can only check uid and gid. */ -static enum audit_state audit_filter_task(struct task_struct *tsk) +static enum audit_state audit_filter_task(struct task_struct *tsk, char **key) { struct audit_entry *e; enum audit_state state; @@ -660,6 +660,8 @@ static enum audit_state audit_filter_task(struct task_struct *tsk) rcu_read_lock(); list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) { if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) { + if (state == AUDIT_RECORD_CONTEXT) + *key = kstrdup(e->rule.filterkey, GFP_ATOMIC); rcu_read_unlock(); return state; } @@ -866,18 +868,21 @@ int audit_alloc(struct task_struct *tsk) { struct audit_context *context; enum audit_state state; + char *key = NULL; if (likely(!audit_ever_enabled)) return 0; /* Return if not auditing. */ - state = audit_filter_task(tsk); + state = audit_filter_task(tsk, &key); if (likely(state == AUDIT_DISABLED)) return 0; if (!(context = audit_alloc_context(state))) { + kfree(key); audit_log_lost("out of memory in audit_alloc"); return -ENOMEM; } + context->filterkey = key; tsk->audit_context = context; set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT); @@ -1703,8 +1708,10 @@ void audit_syscall_exit(int valid, long return_code) context->sockaddr_len = 0; context->type = 0; context->fds[0] = -1; - kfree(context->filterkey); - context->filterkey = NULL; + if (context->state != AUDIT_RECORD_CONTEXT) { + kfree(context->filterkey); + context->filterkey = NULL; + } tsk->audit_context = context; } } -- cgit v1.2.3 From 36c4f1b18c8a7d0adb4085e7f531860b837bb6b0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 15 Dec 2008 01:50:28 -0500 Subject: clean up audit_rule_{add,del} a bit Signed-off-by: Al Viro --- kernel/auditfilter.c | 42 +++++++++++++++++------------------------- 1 file changed, 17 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 5d4edc6f7a3..e6e3829cadd 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1114,12 +1114,16 @@ static void audit_inotify_unregister(struct list_head *in_list) /* Find an existing audit rule. * Caller must hold audit_filter_mutex to prevent stale rule data. 
*/ static struct audit_entry *audit_find_rule(struct audit_entry *entry, - struct list_head *list) + struct list_head **p) { struct audit_entry *e, *found = NULL; + struct list_head *list; int h; - if (entry->rule.watch) { + if (entry->rule.inode_f) { + h = audit_hash_ino(entry->rule.inode_f->val); + *p = list = &audit_inode_hash[h]; + } else if (entry->rule.watch) { /* we don't know the inode number, so must walk entire hash */ for (h = 0; h < AUDIT_INODE_BUCKETS; h++) { list = &audit_inode_hash[h]; @@ -1130,6 +1134,8 @@ static struct audit_entry *audit_find_rule(struct audit_entry *entry, } } goto out; + } else { + *p = list = &audit_filter_list[entry->rule.listnr]; } list_for_each_entry(e, list, list) @@ -1274,14 +1280,13 @@ static u64 prio_low = ~0ULL/2; static u64 prio_high = ~0ULL/2 - 1; /* Add rule to given filterlist if not a duplicate. */ -static inline int audit_add_rule(struct audit_entry *entry, - struct list_head *list) +static inline int audit_add_rule(struct audit_entry *entry) { struct audit_entry *e; - struct audit_field *inode_f = entry->rule.inode_f; struct audit_watch *watch = entry->rule.watch; struct audit_tree *tree = entry->rule.tree; struct nameidata *ndp = NULL, *ndw = NULL; + struct list_head *list; int h, err; #ifdef CONFIG_AUDITSYSCALL int dont_count = 0; @@ -1292,13 +1297,8 @@ static inline int audit_add_rule(struct audit_entry *entry, dont_count = 1; #endif - if (inode_f) { - h = audit_hash_ino(inode_f->val); - list = &audit_inode_hash[h]; - } - mutex_lock(&audit_filter_mutex); - e = audit_find_rule(entry, list); + e = audit_find_rule(entry, &list); mutex_unlock(&audit_filter_mutex); if (e) { err = -EEXIST; @@ -1372,15 +1372,14 @@ error: } /* Remove an existing rule from filterlist. */ -static inline int audit_del_rule(struct audit_entry *entry, - struct list_head *list) +static inline int audit_del_rule(struct audit_entry *entry) { struct audit_entry *e; - struct audit_field *inode_f = entry->rule.inode_f; struct audit_watch *watch, *tmp_watch = entry->rule.watch; struct audit_tree *tree = entry->rule.tree; + struct list_head *list; LIST_HEAD(inotify_list); - int h, ret = 0; + int ret = 0; #ifdef CONFIG_AUDITSYSCALL int dont_count = 0; @@ -1390,13 +1389,8 @@ static inline int audit_del_rule(struct audit_entry *entry, dont_count = 1; #endif - if (inode_f) { - h = audit_hash_ino(inode_f->val); - list = &audit_inode_hash[h]; - } - mutex_lock(&audit_filter_mutex); - e = audit_find_rule(entry, list); + e = audit_find_rule(entry, &list); if (!e) { mutex_unlock(&audit_filter_mutex); ret = -ENOENT; @@ -1603,8 +1597,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, if (IS_ERR(entry)) return PTR_ERR(entry); - err = audit_add_rule(entry, - &audit_filter_list[entry->rule.listnr]); + err = audit_add_rule(entry); audit_log_rule_change(loginuid, sessionid, sid, "add", &entry->rule, !err); @@ -1620,8 +1613,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, if (IS_ERR(entry)) return PTR_ERR(entry); - err = audit_del_rule(entry, - &audit_filter_list[entry->rule.listnr]); + err = audit_del_rule(entry); audit_log_rule_change(loginuid, sessionid, sid, "remove", &entry->rule, !err); -- cgit v1.2.3 From 5af75d8d58d0f9f7b7c0515b35786b22892d5f12 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 16 Dec 2008 05:59:26 -0500 Subject: audit: validate comparison operations, store them in sane form Don't store the field->op in the messy (and very inconvenient for e.g. 
audit_comparator()) form; translate to dense set of values and do full validation of userland-submitted value while we are at it. ->audit_init_rule() and ->audit_match_rule() get new values now; in-tree instances updated. Signed-off-by: Al Viro --- kernel/audit_tree.c | 2 +- kernel/auditfilter.c | 132 +++++++++++++++++++++++++-------------------------- 2 files changed, 66 insertions(+), 68 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 48bddad2a3d..8ad9545b8db 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -618,7 +618,7 @@ int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op) if (pathname[0] != '/' || rule->listnr != AUDIT_FILTER_EXIT || - op & ~AUDIT_EQUAL || + op != Audit_equal || rule->inode_f || rule->watch || rule->tree) return -EINVAL; rule->tree = alloc_tree(pathname); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index e6e3829cadd..fbf24d121d9 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -252,7 +252,8 @@ static inline int audit_to_inode(struct audit_krule *krule, struct audit_field *f) { if (krule->listnr != AUDIT_FILTER_EXIT || - krule->watch || krule->inode_f || krule->tree) + krule->watch || krule->inode_f || krule->tree || + (f->op != Audit_equal && f->op != Audit_not_equal)) return -EINVAL; krule->inode_f = f; @@ -270,7 +271,7 @@ static int audit_to_watch(struct audit_krule *krule, char *path, int len, if (path[0] != '/' || path[len-1] == '/' || krule->listnr != AUDIT_FILTER_EXIT || - op & ~AUDIT_EQUAL || + op != Audit_equal || krule->inode_f || krule->watch || krule->tree) return -EINVAL; @@ -420,12 +421,32 @@ exit_err: return ERR_PTR(err); } +static u32 audit_ops[] = +{ + [Audit_equal] = AUDIT_EQUAL, + [Audit_not_equal] = AUDIT_NOT_EQUAL, + [Audit_bitmask] = AUDIT_BIT_MASK, + [Audit_bittest] = AUDIT_BIT_TEST, + [Audit_lt] = AUDIT_LESS_THAN, + [Audit_gt] = AUDIT_GREATER_THAN, + [Audit_le] = AUDIT_LESS_THAN_OR_EQUAL, + [Audit_ge] = AUDIT_GREATER_THAN_OR_EQUAL, +}; + +static u32 audit_to_op(u32 op) +{ + u32 n; + for (n = Audit_equal; n < Audit_bad && audit_ops[n] != op; n++) + ; + return n; +} + + /* Translate struct audit_rule to kernel's rule respresentation. * Exists for backward compatibility with userspace. */ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) { struct audit_entry *entry; - struct audit_field *ino_f; int err = 0; int i; @@ -435,12 +456,28 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) for (i = 0; i < rule->field_count; i++) { struct audit_field *f = &entry->rule.fields[i]; + u32 n; + + n = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS); + + /* Support for legacy operators where + * AUDIT_NEGATE bit signifies != and otherwise assumes == */ + if (n & AUDIT_NEGATE) + f->op = Audit_not_equal; + else if (!n) + f->op = Audit_equal; + else + f->op = audit_to_op(n); + + entry->rule.vers_ops = (n & AUDIT_OPERATORS) ? 
2 : 1; - f->op = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS); f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); f->val = rule->values[i]; err = -EINVAL; + if (f->op == Audit_bad) + goto exit_free; + switch(f->type) { default: goto exit_free; @@ -462,11 +499,8 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) case AUDIT_EXIT: case AUDIT_SUCCESS: /* bit ops are only useful on syscall args */ - if (f->op == AUDIT_BIT_MASK || - f->op == AUDIT_BIT_TEST) { - err = -EINVAL; + if (f->op == Audit_bitmask || f->op == Audit_bittest) goto exit_free; - } break; case AUDIT_ARG0: case AUDIT_ARG1: @@ -475,11 +509,8 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) break; /* arch is only allowed to be = or != */ case AUDIT_ARCH: - if ((f->op != AUDIT_NOT_EQUAL) && (f->op != AUDIT_EQUAL) - && (f->op != AUDIT_NEGATE) && (f->op)) { - err = -EINVAL; + if (f->op != Audit_not_equal && f->op != Audit_equal) goto exit_free; - } entry->rule.arch_f = f; break; case AUDIT_PERM: @@ -496,33 +527,10 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) goto exit_free; break; } - - entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1; - - /* Support for legacy operators where - * AUDIT_NEGATE bit signifies != and otherwise assumes == */ - if (f->op & AUDIT_NEGATE) - f->op = AUDIT_NOT_EQUAL; - else if (!f->op) - f->op = AUDIT_EQUAL; - else if (f->op == AUDIT_OPERATORS) { - err = -EINVAL; - goto exit_free; - } } - ino_f = entry->rule.inode_f; - if (ino_f) { - switch(ino_f->op) { - case AUDIT_NOT_EQUAL: - entry->rule.inode_f = NULL; - case AUDIT_EQUAL: - break; - default: - err = -EINVAL; - goto exit_free; - } - } + if (entry->rule.inode_f && entry->rule.inode_f->op == Audit_not_equal) + entry->rule.inode_f = NULL; exit_nofree: return entry; @@ -538,7 +546,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, { int err = 0; struct audit_entry *entry; - struct audit_field *ino_f; void *bufp; size_t remain = datasz - sizeof(struct audit_rule_data); int i; @@ -554,11 +561,11 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, struct audit_field *f = &entry->rule.fields[i]; err = -EINVAL; - if (!(data->fieldflags[i] & AUDIT_OPERATORS) || - data->fieldflags[i] & ~AUDIT_OPERATORS) + + f->op = audit_to_op(data->fieldflags[i]); + if (f->op == Audit_bad) goto exit_free; - f->op = data->fieldflags[i] & AUDIT_OPERATORS; f->type = data->fields[i]; f->val = data->values[i]; f->lsm_str = NULL; @@ -670,18 +677,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, } } - ino_f = entry->rule.inode_f; - if (ino_f) { - switch(ino_f->op) { - case AUDIT_NOT_EQUAL: - entry->rule.inode_f = NULL; - case AUDIT_EQUAL: - break; - default: - err = -EINVAL; - goto exit_free; - } - } + if (entry->rule.inode_f && entry->rule.inode_f->op == Audit_not_equal) + entry->rule.inode_f = NULL; exit_nofree: return entry; @@ -721,10 +718,10 @@ static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule) rule->fields[i] = krule->fields[i].type; if (krule->vers_ops == 1) { - if (krule->fields[i].op & AUDIT_NOT_EQUAL) + if (krule->fields[i].op == Audit_not_equal) rule->fields[i] |= AUDIT_NEGATE; } else { - rule->fields[i] |= krule->fields[i].op; + rule->fields[i] |= audit_ops[krule->fields[i].op]; } } for (i = 0; i < AUDIT_BITMASK_SIZE; i++) rule->mask[i] = krule->mask[i]; @@ -752,7 +749,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) struct 
audit_field *f = &krule->fields[i]; data->fields[i] = f->type; - data->fieldflags[i] = f->op; + data->fieldflags[i] = audit_ops[f->op]; switch(f->type) { case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: @@ -1626,28 +1623,29 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, return err; } -int audit_comparator(const u32 left, const u32 op, const u32 right) +int audit_comparator(u32 left, u32 op, u32 right) { switch (op) { - case AUDIT_EQUAL: + case Audit_equal: return (left == right); - case AUDIT_NOT_EQUAL: + case Audit_not_equal: return (left != right); - case AUDIT_LESS_THAN: + case Audit_lt: return (left < right); - case AUDIT_LESS_THAN_OR_EQUAL: + case Audit_le: return (left <= right); - case AUDIT_GREATER_THAN: + case Audit_gt: return (left > right); - case AUDIT_GREATER_THAN_OR_EQUAL: + case Audit_ge: return (left >= right); - case AUDIT_BIT_MASK: + case Audit_bitmask: return (left & right); - case AUDIT_BIT_TEST: + case Audit_bittest: return ((left & right) == right); + default: + BUG(); + return 0; } - BUG(); - return 0; } /* Compare given dentry name with last component in given path, -- cgit v1.2.3 From 7b574b7b0124ed344911f5d581e9bc2d83bbeb19 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sun, 4 Jan 2009 12:00:45 -0800 Subject: cgroups: fix a race between cgroup_clone and umount The race is calling cgroup_clone() while umounting the ns cgroup subsys, and thus cgroup_clone() might access invalid cgroup_fs, or kill_sb() is called after cgroup_clone() created a new dir in it. The BUG I triggered is BUG_ON(root->number_of_cgroups != 1); ------------[ cut here ]------------ kernel BUG at kernel/cgroup.c:1093! invalid opcode: 0000 [#1] SMP ... Process umount (pid: 5177, ti=e411e000 task=e40c4670 task.ti=e411e000) ... Call Trace: [] ? deactivate_super+0x3f/0x51 [] ? mntput_no_expire+0xb3/0xdd [] ? sys_umount+0x265/0x2ac [] ? sys_oldumount+0xd/0xf [] ? sysenter_do_call+0x12/0x31 ... EIP: [] cgroup_kill_sb+0x23/0xe0 SS:ESP 0068:e411ef2c ---[ end trace c766c1be3bf944ac ]--- Cc: Serge E. Hallyn Signed-off-by: Li Zefan Cc: Paul Menage Cc: "Serge E. Hallyn" Cc: Balbir Singh Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 48348dde6d8..891a84eb9d3 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2945,7 +2945,11 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, parent = task_cgroup(tsk, subsys->subsys_id); /* Pin the hierarchy */ - atomic_inc(&parent->root->sb->s_active); + if (!atomic_inc_not_zero(&parent->root->sb->s_active)) { + /* We race with the final deactivate_super() */ + mutex_unlock(&cgroup_mutex); + return 0; + } /* Keep the cgroup alive */ get_css_set(cg); -- cgit v1.2.3 From ca4787b779dd698a2a33a328aa5fa90a3e954077 Mon Sep 17 00:00:00 2001 From: Tim Abbott Date: Mon, 5 Jan 2009 08:40:10 -0600 Subject: kernel/module.c: compare symbol values when marking symbols as exported in /proc/kallsyms. When there are two symbols in a module with the same name, one of which is exported, both will be marked as exported in /proc/kallsyms. There aren't any instances of this in the current kernel, but it is easy to construct a simple module with two compilation units that exhibits the problem. 
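A minimal sketch of such a module, assuming two compilation units linked into testmod.ko (file names and function bodies are invented here; only the symbol shapes matter):

/* a.c - foo is file-local and becomes a local symbol */
static int foo(void)
{
	return 42;
}

int bar(void)
{
	return foo();	/* reference keeps the static from being discarded */
}

/* b.c - an unrelated global foo, exported to other modules */
#include <linux/module.h>

int foo(void)
{
	return 0;
}
EXPORT_SYMBOL(foo);

MODULE_LICENSE("GPL");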
$ objdump -j .text -t testmod.ko | grep foo 00000000 l F .text 00000032 foo 00000080 g F .text 00000001 foo $ sudo insmod testmod.ko $ grep "T foo" /proc/kallsyms c28e8000 T foo [testmod] c28e8080 T foo [testmod] Fix this by comparing the symbol values once we've found the exported symbol table entry matching the symbol name. Tested using Ksplice: $ ksplice-create --patch=this_commit.patch --id=bar . $ sudo ksplice-apply ksplice-bar.tar.gz Done! $ grep "T foo" /proc/kallsyms c28e8080 T foo [testmod] Signed-off-by: Tim Abbott Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Rusty Russell --- kernel/module.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index dd2a54155b5..895c5675edb 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1725,15 +1725,15 @@ static const struct kernel_symbol *lookup_symbol(const char *name, return NULL; } -static int is_exported(const char *name, const struct module *mod) +static int is_exported(const char *name, unsigned long value, + const struct module *mod) { - if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) - return 1; + const struct kernel_symbol *ks; + if (!mod) + ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab); else - if (mod && lookup_symbol(name, mod->syms, mod->syms + mod->num_syms)) - return 1; - else - return 0; + ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms); + return ks != NULL && ks->value == value; } /* As per nm */ @@ -2504,7 +2504,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, KSYM_NAME_LEN); strlcpy(module_name, mod->name, MODULE_NAME_LEN); - *exported = is_exported(name, mod); + *exported = is_exported(name, *value, mod); preempt_enable(); return 0; } -- cgit v1.2.3 From d1e99d7ae4e6bbd1ebb5e81ecd3af2b8793efee0 Mon Sep 17 00:00:00 2001 From: Jianjun Kong Date: Mon, 8 Dec 2008 14:26:29 +0800 Subject: module: fix warning of unused function when !CONFIG_PROC_FS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix this warning: kernel/module.c:824: warning: ‘print_unload_info’ defined but not used print_unload_info() was only used when CONFIG_PROC_FS was defined. This patch marks print_unload_info() inline to solve the problem. Signed-off-by: Jianjun Kong Signed-off-by: Rusty Russell CC: Ingo Molnar CC: Américo Wang --- kernel/module.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 895c5675edb..d3d254571bd 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -820,7 +820,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags) return ret; } -static void print_unload_info(struct seq_file *m, struct module *mod) +static inline void print_unload_info(struct seq_file *m, struct module *mod) { struct module_use *use; int printed_something = 0; @@ -893,7 +893,7 @@ void module_put(struct module *module) EXPORT_SYMBOL(module_put); #else /* !CONFIG_MODULE_UNLOAD */ -static void print_unload_info(struct seq_file *m, struct module *mod) +static inline void print_unload_info(struct seq_file *m, struct module *mod) { /* We don't know the usage count, or what modules are using.
*/ seq_printf(m, " - -"); -- cgit v1.2.3 From 088af9a6e05d51e7c3dc85d45d8b7a52c3ee08d7 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Wed, 31 Dec 2008 12:31:18 +0100 Subject: module: fix module loading failure of large kernel modules for parisc When creating the final layout of a kernel module in memory, allow the module loader to reserve some additional memory in front of a given section. This is currently only needed for the parisc port which needs to put the stub entries there to fulfill the 17/22bit PCREL relocations with large kernel modules like xfs. Signed-off-by: Helge Deller Signed-off-by: Rusty Russell (renamed fn) --- kernel/module.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index d3d254571bd..4299aefc20b 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1578,11 +1578,21 @@ static int simplify_symbols(Elf_Shdr *sechdrs, return ret; } +/* Additional bytes needed by arch in front of individual sections */ +unsigned int __weak arch_mod_section_prepend(struct module *mod, + unsigned int section) +{ + /* default implementation just returns zero */ + return 0; +} + /* Update size with this section: return offset. */ -static long get_offset(unsigned int *size, Elf_Shdr *sechdr) +static long get_offset(struct module *mod, unsigned int *size, + Elf_Shdr *sechdr, unsigned int section) { long ret; + *size += arch_mod_section_prepend(mod, section); ret = ALIGN(*size, sechdr->sh_addralign ?: 1); *size = ret + sechdr->sh_size; return ret; @@ -1622,7 +1632,7 @@ static void layout_sections(struct module *mod, || strncmp(secstrings + s->sh_name, ".init", 5) == 0) continue; - s->sh_entsize = get_offset(&mod->core_size, s); + s->sh_entsize = get_offset(mod, &mod->core_size, s, i); DEBUGP("\t%s\n", secstrings + s->sh_name); } if (m == 0) @@ -1640,7 +1650,7 @@ static void layout_sections(struct module *mod, || strncmp(secstrings + s->sh_name, ".init", 5) != 0) continue; - s->sh_entsize = (get_offset(&mod->init_size, s) + s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) | INIT_OFFSET_MASK); DEBUGP("\t%s\n", secstrings + s->sh_name); } -- cgit v1.2.3 From 9ea09af3bd3090e8349ca2899ca2011bd94cda85 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 22 Dec 2008 12:36:30 +0100 Subject: stop_machine: introduce stop_machine_create/destroy. Introduce stop_machine_create/destroy. With this interface subsystems that need a non-failing stop_machine environment can create the stop_machine threads before actually calling stop_machine. When the threads aren't needed anymore they can be killed with stop_machine_destroy again. When stop_machine gets called and the threads aren't present they will be created and destroyed automatically. This restores the old behaviour of stop_machine. This patch also converts cpu hotplug to the new interface since it is special: cpu_down calls __stop_machine instead of stop_machine. However the kstop threads will only be created when stop_machine gets called. Changing the code so that the threads would be created automatically on __stop_machine is currently not possible: when __stop_machine gets called we hold cpu_add_remove_lock, which is the same lock that create_rt_workqueue would take. So the workqueue needs to be created before the cpu hotplug code locks cpu_add_remove_lock.
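The interface boils down to the usual first-user-allocates, last-user-frees idiom. A standalone sketch of that idiom (resource_get/resource_put and the malloc stand in for stop_machine_create/destroy and create_rt_workqueue; not the kernel code itself):

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t setup_lock = PTHREAD_MUTEX_INITIALIZER;
static int refcount;	/* current users of the shared resource */
static void *resource;

int resource_get(void)
{
	pthread_mutex_lock(&setup_lock);
	if (!refcount) {
		resource = malloc(1024);
		if (!resource) {
			pthread_mutex_unlock(&setup_lock);
			return -1;	/* only the first user can fail */
		}
	}
	refcount++;
	pthread_mutex_unlock(&setup_lock);
	return 0;
}

void resource_put(void)
{
	pthread_mutex_lock(&setup_lock);
	if (!--refcount) {
		free(resource);
		resource = NULL;
	}
	pthread_mutex_unlock(&setup_lock);
}

Once resource_get() has succeeded, every later resource_get() is a pure refcount increment and cannot fail, which is exactly the guarantee the kstop threads need.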
Signed-off-by: Heiko Carstens Signed-off-by: Rusty Russell --- kernel/cpu.c | 6 +++++- kernel/stop_machine.c | 55 +++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 50 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 47fff3b63cb..30e74dd6d01 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -269,8 +269,11 @@ out_release: int __ref cpu_down(unsigned int cpu) { - int err = 0; + int err; + err = stop_machine_create(); + if (err) + return err; cpu_maps_update_begin(); if (cpu_hotplug_disabled) { @@ -297,6 +300,7 @@ int __ref cpu_down(unsigned int cpu) out: cpu_maps_update_done(); + stop_machine_destroy(); return err; } EXPORT_SYMBOL(cpu_down); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 286c41722e8..0cd415ee62a 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -38,7 +38,10 @@ struct stop_machine_data { static unsigned int num_threads; static atomic_t thread_ack; static DEFINE_MUTEX(lock); - +/* setup_lock protects refcount, stop_machine_wq and stop_machine_work. */ +static DEFINE_MUTEX(setup_lock); +/* Users of stop_machine. */ +static int refcount; static struct workqueue_struct *stop_machine_wq; static struct stop_machine_data active, idle; static const cpumask_t *active_cpus; @@ -109,6 +112,43 @@ static int chill(void *unused) return 0; } +int stop_machine_create(void) +{ + mutex_lock(&setup_lock); + if (refcount) + goto done; + stop_machine_wq = create_rt_workqueue("kstop"); + if (!stop_machine_wq) + goto err_out; + stop_machine_work = alloc_percpu(struct work_struct); + if (!stop_machine_work) + goto err_out; +done: + refcount++; + mutex_unlock(&setup_lock); + return 0; + +err_out: + if (stop_machine_wq) + destroy_workqueue(stop_machine_wq); + mutex_unlock(&setup_lock); + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(stop_machine_create); + +void stop_machine_destroy(void) +{ + mutex_lock(&setup_lock); + refcount--; + if (refcount) + goto done; + destroy_workqueue(stop_machine_wq); + free_percpu(stop_machine_work); +done: + mutex_unlock(&setup_lock); +} +EXPORT_SYMBOL_GPL(stop_machine_destroy); + int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) { struct work_struct *sm_work; @@ -146,19 +186,14 @@ int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) { int ret; + ret = stop_machine_create(); + if (ret) + return ret; /* No CPUs can come up or down during this. */ get_online_cpus(); ret = __stop_machine(fn, data, cpus); put_online_cpus(); - + stop_machine_destroy(); return ret; } EXPORT_SYMBOL_GPL(stop_machine); - -static int __init stop_machine_init(void) -{ - stop_machine_wq = create_rt_workqueue("kstop"); - stop_machine_work = alloc_percpu(struct work_struct); - return 0; -} -core_initcall(stop_machine_init); -- cgit v1.2.3 From 9e01892c4234070bbcf3a9f582514c8b91464375 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 22 Dec 2008 12:36:31 +0100 Subject: module: convert to stop_machine_create/destroy. The module code relies on a non-failing stop_machine call. So we create the kstop threads in advance and with that make sure the call won't fail. 
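Distilled to the calling convention, a converted user looks roughly like this (my_disable_feature and my_update are hypothetical names; the real conversion is in the diff below):

int my_disable_feature(void)
{
	int err;

	/* may sleep and may fail, so do it before taking any locks */
	err = stop_machine_create();
	if (err)
		return err;

	/* the threads exist now; the nested create inside
	 * stop_machine() just bumps the refcount and cannot fail */
	err = stop_machine(my_update, NULL, NULL);

	stop_machine_destroy();
	return err;
}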
Signed-off-by: Heiko Carstens Signed-off-by: Rusty Russell --- kernel/module.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 4299aefc20b..f47cce910f2 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -757,8 +757,16 @@ sys_delete_module(const char __user *name_user, unsigned int flags) return -EFAULT; name[MODULE_NAME_LEN-1] = '\0'; - if (mutex_lock_interruptible(&module_mutex) != 0) - return -EINTR; + /* Create stop_machine threads since free_module relies on + * a non-failing stop_machine call. */ + ret = stop_machine_create(); + if (ret) + return ret; + + if (mutex_lock_interruptible(&module_mutex) != 0) { + ret = -EINTR; + goto out_stop; + } mod = find_module(name); if (!mod) { @@ -817,6 +825,8 @@ sys_delete_module(const char __user *name_user, unsigned int flags) out: mutex_unlock(&module_mutex); +out_stop: + stop_machine_destroy(); return ret; } @@ -1875,6 +1885,13 @@ static noinline struct module *load_module(void __user *umod, /* vmalloc barfs on "unusual" numbers. Check here */ if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL) return ERR_PTR(-ENOMEM); + + /* Create stop_machine threads since the error path relies on + * a non-failing stop_machine call. */ + err = stop_machine_create(); + if (err) + goto free_hdr; + if (copy_from_user(hdr, umod, len) != 0) { err = -EFAULT; goto free_hdr; @@ -2258,6 +2275,7 @@ static noinline struct module *load_module(void __user *umod, /* Get rid of temporary copy */ vfree(hdr); + stop_machine_destroy(); /* Done! */ return mod; @@ -2280,6 +2298,7 @@ static noinline struct module *load_module(void __user *umod, kfree(args); free_hdr: vfree(hdr); + stop_machine_destroy(); return ERR_PTR(err); truncated: -- cgit v1.2.3 From c12172c0251761c54260376eb29a5f6547495580 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 4 Jan 2009 20:30:06 -0800 Subject: rcu: fix rcutree grace-period-latency bug on small systems Impact: fix delays during bootup Kudos to Andi Kleen for finding a grace-period-latency problem! The problem was that the special-case code for small machines never updated the ->signaled field to indicate that grace-period initialization had completed, which prevented force_quiescent_state() from ever expediting grace periods. This problem resulted in grace periods extending for more than 20 seconds. Not subtle. I introduced this bug during my inspection process when I fixed a race between grace-period initialization and force_quiescent_state() execution. The following patch properly updates the ->signaled field for the "small"-system case (no more than 32 CPUs for 32-bit kernels and no more than 64 CPUs for 64-bit kernels). Reported-by: Andi Kleen Tested-by: Andi Kleen Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/rcutree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index a342b032112..88d921c5c44 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -572,6 +572,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) /* Special-case the common single-level case. */ if (NUM_RCU_NODES == 1) { rnp->qsmask = rnp->qsmaskinit; + rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ spin_unlock_irqrestore(&rnp->lock, flags); return; } -- cgit v1.2.3 From 90a4d2c0106bb690f0b6af3d506febc35c658aa7 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Sun, 4 Jan 2009 11:41:11 -0800 Subject: rcu: make treercu safe for suspend and resume Impact: fix kernel warnings [and potential crash] during suspend+resume Kudos to both Dhaval Giani and Jens Axboe for finding a bug in treercu that causes warnings after suspend-resume cycles in Dhaval's case and during stress tests in Jens's case. It would also probably cause failures if heavily stressed. The solution, ironically enough, is to revert to rcupreempt's code for initializing the dynticks state. And the patch even results in smaller code -- so what was I thinking??? This is 2.6.29 material, given that people really do suspend and resume Linux these days. ;-) Reported-by: Dhaval Giani Reported-by: Jens Axboe Tested-by: Dhaval Giani Tested-by: Jens Axboe Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/rcutree.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 88d921c5c44..f2d8638e6c6 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -79,7 +79,10 @@ struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); #ifdef CONFIG_NO_HZ -DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks); +DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { + .dynticks_nesting = 1, + .dynticks = 1, +}; #endif /* #ifdef CONFIG_NO_HZ */ static int blimit = 10; /* Maximum callbacks per softirq. */ @@ -1380,13 +1383,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) static void __cpuinit rcu_online_cpu(int cpu) { -#ifdef CONFIG_NO_HZ - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - - rdtp->dynticks_nesting = 1; - rdtp->dynticks |= 1; /* need consecutive #s even for hotplug. */ - rdtp->dynticks_nmi = (rdtp->dynticks_nmi + 1) & ~0x1; -#endif /* #ifdef CONFIG_NO_HZ */ rcu_init_percpu_data(cpu, &rcu_state); rcu_init_percpu_data(cpu, &rcu_bh_state); open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); -- cgit v1.2.3 From ea7d3fef4222cd98556a0b386598268d4dbf6670 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 4 Jan 2009 13:03:02 -0800 Subject: rcu: eliminate synchronize_rcu_xxx macro Impact: cleanup Expand macro into two files. The synchronize_rcu_xxx macro is quite ugly and it's only used by two callers, so expand it instead. This makes this code easier to change. Signed-off-by: Andi Kleen Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/rcupdate.c | 11 +++++++++-- kernel/rcupreempt.c | 11 ++++++++++- 2 files changed, 19 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index ad63af8b252..d92a76a881a 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -77,8 +77,15 @@ void wakeme_after_rcu(struct rcu_head *head) * sections are delimited by rcu_read_lock() and rcu_read_unlock(), * and may be nested. */ -void synchronize_rcu(void); /* Makes kernel-doc tools happy */ -synchronize_rcu_xxx(synchronize_rcu, call_rcu) +void synchronize_rcu(void) +{ + struct rcu_synchronize rcu; + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu(&rcu.head, wakeme_after_rcu); + /* Wait for it. 
*/ + wait_for_completion(&rcu.completion); +} EXPORT_SYMBOL_GPL(synchronize_rcu); static void rcu_barrier_callback(struct rcu_head *notused) diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index f9dc8f3720f..33cfc50781f 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -1177,7 +1177,16 @@ EXPORT_SYMBOL_GPL(call_rcu_sched); * in -rt this does -not- necessarily result in all currently executing * interrupt -handlers- having completed. */ -synchronize_rcu_xxx(__synchronize_sched, call_rcu_sched) +void __synchronize_sched(void) +{ + struct rcu_synchronize rcu; + + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu_sched(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); +} EXPORT_SYMBOL_GPL(__synchronize_sched); /* -- cgit v1.2.3 From c59ab97e9ecdee9084d2da09e5a8ceea9a396508 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 4 Jan 2009 18:28:27 -0800 Subject: rcu: fix rcutorture bug Fix an rcutorture bug that prevents the shutdown notifier from ever actually having any effect, due to the fact that kthreads ignore all signals. Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/rcutorture.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 3245b40952c..1cff28db56b 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -136,7 +136,7 @@ static int stutter_pause_test = 0; #endif int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; -#define FULLSTOP_SIGNALED 1 /* Bail due to signal. */ +#define FULLSTOP_SHUTDOWN 1 /* Bail due to system shutdown/panic. */ #define FULLSTOP_CLEANUP 2 /* Orderly shutdown. */ static int fullstop; /* stop generating callbacks at test end. 
*/ DEFINE_MUTEX(fullstop_mutex); /* protect fullstop transitions and */ @@ -151,12 +151,10 @@ rcutorture_shutdown_notify(struct notifier_block *unused1, { if (fullstop) return NOTIFY_DONE; - if (signal_pending(current)) { - mutex_lock(&fullstop_mutex); - if (!ACCESS_ONCE(fullstop)) - fullstop = FULLSTOP_SIGNALED; - mutex_unlock(&fullstop_mutex); - } + mutex_lock(&fullstop_mutex); + if (!fullstop) + fullstop = FULLSTOP_SHUTDOWN; + mutex_unlock(&fullstop_mutex); return NOTIFY_DONE; } @@ -624,7 +622,7 @@ rcu_torture_writer(void *arg) rcu_stutter_wait(); } while (!kthread_should_stop() && !fullstop); VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); - while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED) + while (!kthread_should_stop() && fullstop != FULLSTOP_SHUTDOWN) schedule_timeout_uninterruptible(1); return 0; } @@ -649,7 +647,7 @@ rcu_torture_fakewriter(void *arg) } while (!kthread_should_stop() && !fullstop); VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); - while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED) + while (!kthread_should_stop() && fullstop != FULLSTOP_SHUTDOWN) schedule_timeout_uninterruptible(1); return 0; } @@ -759,7 +757,7 @@ rcu_torture_reader(void *arg) VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); if (irqreader && cur_ops->irqcapable) del_timer_sync(&t); - while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED) + while (!kthread_should_stop() && fullstop != FULLSTOP_SHUTDOWN) schedule_timeout_uninterruptible(1); return 0; } -- cgit v1.2.3 From 8bdec955b0da2ffbd10eb9b200651dd1f9e366f2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 5 Jan 2009 11:28:19 +0100 Subject: hrtimer: splitout peek ahead functionality Impact: cleanup Provide a peek ahead function that assumes irqs disabled, allows for micro optimizations. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/hrtimer.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index eb2bfefa6dc..8f7001c97e0 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1243,6 +1243,22 @@ void hrtimer_interrupt(struct clock_event_device *dev) } } +/* + * local version of hrtimer_peek_ahead_timers() called with interrupts + * disabled. + */ +static void __hrtimer_peek_ahead_timers(void) +{ + struct tick_device *td; + + if (!hrtimer_hres_active()) + return; + + td = &__get_cpu_var(tick_cpu_device); + if (td && td->evtdev) + hrtimer_interrupt(td->evtdev); +} + /** * hrtimer_peek_ahead_timers -- run soft-expired timers now * @@ -1254,16 +1270,10 @@ void hrtimer_interrupt(struct clock_event_device *dev) */ void hrtimer_peek_ahead_timers(void) { - struct tick_device *td; unsigned long flags; - if (!hrtimer_hres_active()) - return; - local_irq_save(flags); - td = &__get_cpu_var(tick_cpu_device); - if (td && td->evtdev) - hrtimer_interrupt(td->evtdev); + __hrtimer_peek_ahead_timers(); local_irq_restore(flags); } -- cgit v1.2.3 From d5fd43c4ae04523e1dcd7794f9c511b289851350 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 5 Jan 2009 11:28:20 +0100 Subject: hrtimer: fix HOTPLUG_CPU=n compile warning Impact: cleanup kernel/hrtimer.c: In function 'hrtimer_cpu_notify': kernel/hrtimer.c:1574: warning: unused variable 'dcpu' Introduced by commit 37810659ea7d9572c5ac284ade272f806ef8f788 ("hrtimer: removing all ur callback modes, fix hotplug") from the timers. dcpu is only used if CONFIG_HOTPLUG_CPU is set. 
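The warning reduces to a familiar pattern, sketched here with invented names (FEATURE, pick_target, kick; none of this is kernel code): a local that is only touched under an #ifdef warns whenever the option is off, and the cure is to confine the local to a function that only exists in the configured case.

#ifdef FEATURE
static int pick_target(int event) { return event; }
static void kick(int cpu) { (void)cpu; }

static void migrate(int event)
{
	int dcpu = pick_target(event);	/* exists only where it is used */

	kick(dcpu);
}
#endif

int notify(int event)
{
#ifdef FEATURE
	migrate(event);		/* no config-dependent locals left here */
#else
	(void)event;
#endif
	return 0;
}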
Reported-by: Stephen Rothwell Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/hrtimer.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 8f7001c97e0..9c2bfa84128 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1504,6 +1504,11 @@ static void __cpuinit init_hrtimers_cpu(int cpu) #ifdef CONFIG_HOTPLUG_CPU +static void tickle_timers(void *arg) +{ + hrtimer_peek_ahead_timers(); +} + static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, struct hrtimer_clock_base *new_base) { @@ -1539,7 +1544,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, } } -static int migrate_hrtimers(int scpu) +static void migrate_hrtimers(int scpu) { struct hrtimer_cpu_base *old_base, *new_base; int dcpu, i; @@ -1567,12 +1572,7 @@ static int migrate_hrtimers(int scpu) spin_unlock_irq(&new_base->lock); put_cpu_var(hrtimer_bases); - return dcpu; -} - -static void tickle_timers(void *arg) -{ - hrtimer_peek_ahead_timers(); + smp_call_function_single(dcpu, tickle_timers, NULL, 0); } #endif /* CONFIG_HOTPLUG_CPU */ @@ -1593,11 +1593,8 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, case CPU_DEAD: case CPU_DEAD_FROZEN: { - int dcpu; - clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu); - dcpu = migrate_hrtimers(scpu); - smp_call_function_single(dcpu, tickle_timers, NULL, 0); + migrate_hrtimers(scpu); break; } #endif -- cgit v1.2.3 From 731a55ba0f17064f85903b7bf8e24849ec6cfa20 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 5 Jan 2009 11:28:21 +0100 Subject: hrtimer: simplify hotplug migration Impact: cleanup No need for a smp function call, which is likely to run on the same CPU anyway. We can just call hrtimers_peek_ahead() in the interrupts disabled section of migrate_hrtimers(). Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/hrtimer.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 9c2bfa84128..8010a67cead 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1504,11 +1504,6 @@ static void __cpuinit init_hrtimers_cpu(int cpu) #ifdef CONFIG_HOTPLUG_CPU -static void tickle_timers(void *arg) -{ - hrtimer_peek_ahead_timers(); -} - static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, struct hrtimer_clock_base *new_base) { @@ -1547,20 +1542,19 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, static void migrate_hrtimers(int scpu) { struct hrtimer_cpu_base *old_base, *new_base; - int dcpu, i; + int i; BUG_ON(cpu_online(scpu)); - old_base = &per_cpu(hrtimer_bases, scpu); - new_base = &get_cpu_var(hrtimer_bases); - - dcpu = smp_processor_id(); - tick_cancel_sched_timer(scpu); + + local_irq_disable(); + old_base = &per_cpu(hrtimer_bases, scpu); + new_base = &__get_cpu_var(hrtimer_bases); /* * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. 
*/ - spin_lock_irq(&new_base->lock); + spin_lock(&new_base->lock); spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { @@ -1569,10 +1563,11 @@ static void migrate_hrtimers(int scpu) } spin_unlock(&old_base->lock); - spin_unlock_irq(&new_base->lock); - put_cpu_var(hrtimer_bases); + spin_unlock(&new_base->lock); - smp_call_function_single(dcpu, tickle_timers, NULL, 0); + /* Check, if we got expired work to do */ + __hrtimer_peek_ahead_timers(); + local_irq_enable(); } #endif /* CONFIG_HOTPLUG_CPU */ -- cgit v1.2.3 From a6037b61c2f5fc99c57c15b26d7cfa58bbb34008 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 5 Jan 2009 11:28:22 +0100 Subject: hrtimer: fix recursion deadlock by re-introducing the softirq Impact: fix rare runtime deadlock There are a few sites that do: spin_lock_irq(&foo) hrtimer_start(&bar) __run_hrtimer(&bar) func() spin_lock(&foo) which obviously deadlocks. In order to avoid this, never call __run_hrtimer() from hrtimer_start*() context, but instead defer this to softirq context. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/hrtimer.c | 60 +++++++++++++++++++++++++------------------------------- 1 file changed, 27 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 8010a67cead..b68e98f4e4c 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -634,7 +634,6 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } -static void __run_hrtimer(struct hrtimer *timer); /* * When High resolution timers are active, try to reprogram. Note, that in case @@ -646,13 +645,9 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, struct hrtimer_clock_base *base) { if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { - /* - * XXX: recursion check? - * hrtimer_forward() should round up with timer granularity - * so that we never get into inf recursion here, - * it doesn't do that though - */ - __run_hrtimer(timer); + spin_unlock(&base->cpu_base->lock); + raise_softirq_irqoff(HRTIMER_SOFTIRQ); + spin_lock(&base->cpu_base->lock); return 1; } return 0; @@ -705,11 +700,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, } static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } -static inline int hrtimer_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) -{ - return 0; -} #endif /* CONFIG_HIGH_RES_TIMERS */ @@ -780,9 +770,11 @@ EXPORT_SYMBOL_GPL(hrtimer_forward); * * The timer is inserted in expiry order. Insertion into the * red black tree is O(log(n)). Must hold the base lock. + * + * Returns 1 when the new timer is the leftmost timer in the tree. */ -static void enqueue_hrtimer(struct hrtimer *timer, - struct hrtimer_clock_base *base, int reprogram) +static int enqueue_hrtimer(struct hrtimer *timer, + struct hrtimer_clock_base *base) { struct rb_node **link = &base->active.rb_node; struct rb_node *parent = NULL; @@ -814,20 +806,8 @@ static void enqueue_hrtimer(struct hrtimer *timer, * Insert the timer to the rbtree and check whether it * replaces the first pending timer */ - if (leftmost) { - /* - * Reprogram the clock event device. When the timer is already - * expired hrtimer_enqueue_reprogram has either called the - * callback or added it to the pending list and raised the - * softirq. 
- * - * This is a NOP for !HIGHRES - */ - if (reprogram && hrtimer_enqueue_reprogram(timer, base)) - return; - + if (leftmost) base->first = &timer->node; - } rb_link_node(&timer->node, parent, link); rb_insert_color(&timer->node, &base->active); @@ -836,6 +816,8 @@ static void enqueue_hrtimer(struct hrtimer *timer, * state of a possibly running callback. */ timer->state |= HRTIMER_STATE_ENQUEUED; + + return leftmost; } /* @@ -912,7 +894,7 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n { struct hrtimer_clock_base *base, *new_base; unsigned long flags; - int ret; + int ret, leftmost; base = lock_hrtimer_base(timer, &flags); @@ -940,12 +922,16 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n timer_stats_hrtimer_set_start_info(timer); + leftmost = enqueue_hrtimer(timer, new_base); + /* * Only allow reprogramming if the new base is on this CPU. * (it might still be on another CPU if the timer was pending) + * + * XXX send_remote_softirq() ? */ - enqueue_hrtimer(timer, new_base, - new_base->cpu_base == &__get_cpu_var(hrtimer_bases)); + if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)) + hrtimer_enqueue_reprogram(timer, new_base); unlock_hrtimer_base(timer, &flags); @@ -1163,7 +1149,7 @@ static void __run_hrtimer(struct hrtimer *timer) */ if (restart != HRTIMER_NORESTART) { BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); - enqueue_hrtimer(timer, base, 0); + enqueue_hrtimer(timer, base); } timer->state &= ~HRTIMER_STATE_CALLBACK; } @@ -1277,6 +1263,11 @@ void hrtimer_peek_ahead_timers(void) local_irq_restore(flags); } +static void run_hrtimer_softirq(struct softirq_action *h) +{ + hrtimer_peek_ahead_timers(); +} + #endif /* CONFIG_HIGH_RES_TIMERS */ /* @@ -1532,7 +1523,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, * is done, which will run all expired timers and re-programm * the timer device. */ - enqueue_hrtimer(timer, new_base, 0); + enqueue_hrtimer(timer, new_base); /* Clear the migration state bit */ timer->state &= ~HRTIMER_STATE_MIGRATE; @@ -1610,6 +1601,9 @@ void __init hrtimers_init(void) hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); register_cpu_notifier(&hrtimers_nb); +#ifdef CONFIG_HIGH_RES_TIMERS + open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq); +#endif } /** -- cgit v1.2.3 From e3f1d883740b09e5116d4d4e30a6a6987264a83c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 5 Jan 2009 11:28:23 +0100 Subject: hrtimer: fixup comments Clean up the comments Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/hrtimer.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index b68e98f4e4c..aa024f2af78 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1143,9 +1143,9 @@ static void __run_hrtimer(struct hrtimer *timer) spin_lock(&cpu_base->lock); /* - * Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid - * reprogramming of the event hardware. This happens at the end of this - * function anyway. + * Note: We clear the CALLBACK bit after enqueue_hrtimer and + * we do not reprogramm the event hardware. 
Happens either in + * hrtimer_start_range_ns() or in hrtimer_interrupt() */ if (restart != HRTIMER_NORESTART) { BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); @@ -1514,14 +1514,12 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0); timer->base = new_base; /* - * Enqueue the timers on the new cpu, but do not reprogram - * the timer as that would enable a deadlock between - * hrtimer_enqueue_reprogramm() running the timer and us still - * holding a nested base lock. - * - * Instead we tickle the hrtimer interrupt after the migration - * is done, which will run all expired timers and re-programm - * the timer device. + * Enqueue the timers on the new cpu. This does not + * reprogram the event device in case the timer + * expires before the earliest on this CPU, but we run + * hrtimer_interrupt after we migrated everything to + * sort out already expired timers and reprogram the + * event device. */ enqueue_hrtimer(timer, new_base); -- cgit v1.2.3 From 39aac64812da70f0af262f4700e67637338cbb3b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 5 Jan 2009 19:18:02 +0800 Subject: sched: mark sched_create_sysfs_power_savings_entries() as __init Impact: cleanup The only caller is cpu_dev_init() which is marked as __init. Signed-off-by: Li Zefan Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 545c6fccd1d..9a8e296959c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8060,7 +8060,7 @@ static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_store); #endif -int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) +int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) { int err = 0; -- cgit v1.2.3 From c70f22d203fc02c805b6ed4a3483b740dc36786b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 5 Jan 2009 19:07:50 +0800 Subject: sched: clean up arch_reinit_sched_domains() - Make arch_reinit_sched_domains() static. It was exported to be used in s390, but now rebuild_sched_domains() is used instead. - Make it return void. Signed-off-by: Li Zefan Signed-off-by: Ingo Molnar --- kernel/sched.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 9a8e296959c..c5019a5dcaa 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7987,7 +7987,7 @@ match2: } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -int arch_reinit_sched_domains(void) +static void arch_reinit_sched_domains(void) { get_online_cpus(); @@ -7996,13 +7996,10 @@ int arch_reinit_sched_domains(void) rebuild_sched_domains(); put_online_cpus(); - - return 0; } static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) { - int ret; unsigned int level = 0; if (sscanf(buf, "%u", &level) != 1) @@ -8023,9 +8020,9 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) else sched_mc_power_savings = level; - ret = arch_reinit_sched_domains(); + arch_reinit_sched_domains(); - return ret ? 
ret : count; + return count; } #ifdef CONFIG_SCHED_MC -- cgit v1.2.3 From 82c5b7b527ccc4b5d3cf832437e842f9d2920a79 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 5 Jan 2009 14:11:10 +0100 Subject: hrtimer: splitout peek ahead functionality, fix Impact: build fix on !CONFIG_HIGH_RES_TIMERS Fix: kernel/hrtimer.c:1586: error: implicit declaration of function '__hrtimer_peek_ahead_timers' Signed-off-by: Ingo Molnar --- kernel/hrtimer.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index aa024f2af78..1455b7651b6 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1268,7 +1268,11 @@ static void run_hrtimer_softirq(struct softirq_action *h) hrtimer_peek_ahead_timers(); } -#endif /* CONFIG_HIGH_RES_TIMERS */ +#else /* CONFIG_HIGH_RES_TIMERS */ + +static inline void __hrtimer_peek_ahead_timers(void) { } + +#endif /* !CONFIG_HIGH_RES_TIMERS */ /* * Called from timer softirq every jiffy, expire hrtimers: -- cgit v1.2.3 From 56ff5efad96182f4d3cb3dc6b07396762c658f16 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 9 Dec 2008 09:34:39 -0500 Subject: zero i_uid/i_gid on inode allocation ... and don't bother in callers. Don't bother with zeroing i_blocks, while we are at it - it's already been zeroed. i_mode is not worth the effort; it has no common default value. Signed-off-by: Al Viro --- kernel/cgroup.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 48348dde6d8..f7c5099a057 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -573,7 +573,6 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); - inode->i_blocks = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info; } -- cgit v1.2.3 From 0c910d289567163dbe40ccc174b36afd1c7723bd Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 6 Jan 2009 17:39:06 +0800 Subject: sched: fix double kfree in failure path It's not the responsibility of init_rootdomain() to free the root_domain allocated by alloc_rootdomain(). Signed-off-by: Li Zefan Reviewed-by: Pekka Enberg Signed-off-by: Ingo Molnar --- kernel/sched.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c5019a5dcaa..973f97362ce 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6970,7 +6970,7 @@ static int init_rootdomain(struct root_domain *rd, bool bootmem) } if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) - goto free_rd; + goto out; if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) goto free_span; if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) @@ -6986,8 +6986,7 @@ free_online: free_cpumask_var(rd->online); free_span: free_cpumask_var(rd->span); -free_rd: - kfree(rd); +out: return -ENOMEM; } -- cgit v1.2.3 From db2f59c8c9b315f2b88b1dac159b988c6009034d Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 6 Jan 2009 17:40:36 +0800 Subject: sched: fix section mismatch init_rootdomain() calls alloc_bootmem_cpumask_var() at system boot, and so does cpupri_init().
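For context, a minimal sketch of the warning class fixed by the __init_refok annotations in the diff below (function and cpumask names here are invented, not from the patch): a plain function that references an __init-section symbol such as alloc_bootmem_cpumask_var() trips modpost's section-mismatch check, and __init_refok documents that the reference is intentionally safe because the bootmem branch only runs at boot.

static cpumask_var_t example_mask;

/* Sketch: referencing the __init-only bootmem allocator from a non-__init
 * function would normally trigger a modpost section-mismatch warning.
 * __init_refok marks the reference as deliberate, since the bootmem path
 * is only taken while the kernel is still booting. */
static int __init_refok example_alloc(bool bootmem)
{
        if (bootmem)
                alloc_bootmem_cpumask_var(&example_mask);  /* __init only */
        else if (!alloc_cpumask_var(&example_mask, GFP_KERNEL))
                return -ENOMEM;
        return 0;
}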
Signed-off-by: Li Zefan Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- kernel/sched_cpupri.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 973f97362ce..2e3545f57e7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6957,7 +6957,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) spin_unlock_irqrestore(&rq->lock, flags); } -static int init_rootdomain(struct root_domain *rd, bool bootmem) +static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem) { memset(rd, 0, sizeof(*rd)); diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index 018b7be1db2..1e00bfacf9b 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -151,7 +151,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) * * Returns: -ENOMEM if memory fails. */ -int cpupri_init(struct cpupri *cp, bool bootmem) +int __init_refok cpupri_init(struct cpupri *cp, bool bootmem) { int i; -- cgit v1.2.3 From edb123e16c6092bd08b67d1130ff03efeada0c89 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Thu, 4 Dec 2008 12:39:49 +0100 Subject: trivial: printk: fix indentation of new_text_line declaration Remove bogus indentation of new_text_line declaration introduced in commit ac60ad741. Acked-by: Nick Andrew Signed-off-by: Jiri Kosina --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index e651ab05655..7015733793e 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -619,7 +619,7 @@ static int acquire_console_semaphore_for_printk(unsigned int cpu) static const char recursion_bug_msg [] = KERN_CRIT "BUG: recent printk recursion!\n"; static int recursion_bug; - static int new_text_line = 1; +static int new_text_line = 1; static char printk_buf[1024]; asmlinkage int vprintk(const char *fmt, va_list args) -- cgit v1.2.3 From 025dfdafe77f20b3890981a394774baab7b9c827 Mon Sep 17 00:00:00 2001 From: Frederik Schwarzer Date: Thu, 16 Oct 2008 19:02:37 +0200 Subject: trivial: fix then -> than typos in comments and documentation - (better, more, bigger ...) then -> (...) than Signed-off-by: Frederik Schwarzer Signed-off-by: Jiri Kosina --- kernel/pid.c | 2 +- kernel/time/jiffies.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 064e76afa50..af9224cdd6c 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -475,7 +475,7 @@ pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) EXPORT_SYMBOL(task_session_nr_ns); /* - * Used by proc to find the first pid that is greater then or equal to nr. + * Used by proc to find the first pid that is greater than or equal to nr. * * If there is a pid at nr this function is exactly the same as find_pid_ns. */ diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 1ca99557e92..06f197560f3 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -45,7 +45,7 @@ * * The value 8 is somewhat carefully chosen, as anything * larger can result in overflows. NSEC_PER_JIFFY grows as - * HZ shrinks, so values greater then 8 overflow 32bits when + * HZ shrinks, so values greater than 8 overflow 32bits when * HZ=100. 
*/ #define JIFFIES_SHIFT 8 -- cgit v1.2.3 From cd3772e6898c6386f21d2958346d6dd57d4204f5 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 16 Nov 2008 18:22:09 +0800 Subject: kernel/ksysfs.c:fix dependence on CONFIG_NET Access to uevent_seqnum and uevent_helper does not need to depend on CONFIG_NET, so remove it. Signed-off-by: Ming Lei Cc: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/ksysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 08dd8ed86c7..528dd78e7e7 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -24,7 +24,7 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) static struct kobj_attribute _name##_attr = \ __ATTR(_name, 0644, _name##_show, _name##_store) -#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) +#if defined(CONFIG_HOTPLUG) /* current uevent sequence number */ static ssize_t uevent_seqnum_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -137,7 +137,7 @@ struct kobject *kernel_kobj; EXPORT_SYMBOL_GPL(kernel_kobj); static struct attribute * kernel_attrs[] = { -#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) +#if defined(CONFIG_HOTPLUG) &uevent_seqnum_attr.attr, &uevent_helper_attr.attr, #endif -- cgit v1.2.3 From 81ff86a11f54c9e266c6a6bc3ecd2c9a0f1e11cc Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Tue, 6 Jan 2009 10:44:39 -0800 Subject: pm: struct device - replace bus_id with dev_name(), dev_set_name() Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/power/main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 613f16941b8..23998887397 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -615,7 +615,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) /* this may fail if the RTC hasn't been initialized */ status = rtc_read_time(rtc, &alm.time); if (status < 0) { - printk(err_readtime, rtc->dev.bus_id, status); + printk(err_readtime, dev_name(&rtc->dev), status); return; } rtc_tm_to_time(&alm.time, &now); @@ -626,7 +626,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) status = rtc_set_alarm(rtc, &alm); if (status < 0) { - printk(err_wakealarm, rtc->dev.bus_id, status); + printk(err_wakealarm, dev_name(&rtc->dev), status); return; } @@ -660,7 +660,7 @@ static int __init has_wakealarm(struct device *dev, void *name_ptr) if (!device_may_wakeup(candidate->dev.parent)) return 0; - *(char **)name_ptr = dev->bus_id; + *(const char **)name_ptr = dev_name(dev); return 1; } -- cgit v1.2.3 From 75aa199410359dc5fbcf9025ff7af98a9d20f0d5 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 6 Jan 2009 14:39:01 -0800 Subject: oom: print triggering task's cpuset and mems allowed When cpusets are enabled, it's necessary to print the triggering task's set of allowable nodes so the subsequently printed meminfo can be interpreted correctly. We also print the task's cpuset name for informational purposes. 
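For reference, the message added by the printk in the diff below comes out in this form (task name, cpuset name and nodemask are assumed values, purely for illustration):

    bash cpuset=cs_a mems_allowed=0-1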
[rientjes@google.com: task lock current before dereferencing cpuset] Cc: Paul Menage Cc: Li Zefan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 39c1a4c1c5a..345ace5117d 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -239,6 +239,17 @@ static struct cpuset top_cpuset = { static DEFINE_MUTEX(callback_mutex); +/* + * cpuset_buffer_lock protects both the cpuset_name and cpuset_nodelist + * buffers. They are statically allocated to prevent using excess stack + * when calling cpuset_print_task_mems_allowed(). + */ +#define CPUSET_NAME_LEN (128) +#define CPUSET_NODELIST_LEN (256) +static char cpuset_name[CPUSET_NAME_LEN]; +static char cpuset_nodelist[CPUSET_NODELIST_LEN]; +static DEFINE_SPINLOCK(cpuset_buffer_lock); + /* * This is ugly, but preserves the userspace API for existing cpuset * users. If someone tries to mount the "cpuset" filesystem, we @@ -2356,6 +2367,29 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed); } +/** + * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed + * @task: pointer to task_struct of some task. + * + * Description: Prints @task's name, cpuset name, and cached copy of its + * mems_allowed to the kernel log. Must hold task_lock(task) to allow + * dereferencing task_cs(task). + */ +void cpuset_print_task_mems_allowed(struct task_struct *tsk) +{ + struct dentry *dentry; + + dentry = task_cs(tsk)->css.cgroup->dentry; + spin_lock(&cpuset_buffer_lock); + snprintf(cpuset_name, CPUSET_NAME_LEN, + dentry ? (const char *)dentry->d_name.name : "/"); + nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, + tsk->mems_allowed); + printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", + tsk->comm, cpuset_name, cpuset_nodelist); + spin_unlock(&cpuset_buffer_lock); +} + /* * Collection of memory_pressure is suppressed unless * this flag is enabled by writing "1" to the special -- cgit v1.2.3 From e5991371ee0d1c0ce19e133c6f9075b49c5b4ae8 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 6 Jan 2009 14:39:22 -0800 Subject: mm: remove cgroup_mm_owner_callbacks cgroup_mm_owner_callbacks() was brought in to support the memrlimit controller, but sneaked into mainline ahead of it. That controller has now been shelved, and the mm_owner_changed() args were inadequate for it anyway (they needed an mm pointer instead of a task pointer). Remove the dead code, and restore mm_update_next_owner() locking to how it was before: taking mmap_sem there does nothing for memcontrol.c, now the only user of mm->owner. Signed-off-by: Hugh Dickins Cc: Paul Menage Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 33 --------------------------------- kernel/exit.c | 16 ++++++---------- 2 files changed, 6 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 87bb0258fd2..f221446aa02 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -116,7 +116,6 @@ static int root_count; * be called. 
*/ static int need_forkexit_callback __read_mostly; -static int need_mm_owner_callback __read_mostly; /* convenient tests for these bits */ inline int cgroup_is_removed(const struct cgroup *cgrp) @@ -2539,7 +2538,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; need_forkexit_callback |= ss->fork || ss->exit; - need_mm_owner_callback |= !!ss->mm_owner_changed; /* At system boot, before all subsystems have been * registered, no tasks have been forked, so we don't @@ -2789,37 +2787,6 @@ void cgroup_fork_callbacks(struct task_struct *child) } } -#ifdef CONFIG_MM_OWNER -/** - * cgroup_mm_owner_callbacks - run callbacks when the mm->owner changes - * @p: the new owner - * - * Called on every change to mm->owner. mm_init_owner() does not - * invoke this routine, since it assigns the mm->owner the first time - * and does not change it. - * - * The callbacks are invoked with mmap_sem held in read mode. - */ -void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new) -{ - struct cgroup *oldcgrp, *newcgrp = NULL; - - if (need_mm_owner_callback) { - int i; - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - oldcgrp = task_cgroup(old, ss->subsys_id); - if (new) - newcgrp = task_cgroup(new, ss->subsys_id); - if (oldcgrp == newcgrp) - continue; - if (ss->mm_owner_changed) - ss->mm_owner_changed(ss, oldcgrp, newcgrp, new); - } - } -} -#endif /* CONFIG_MM_OWNER */ - /** * cgroup_post_fork - called on a new task after adding it to the task list * @child: the task in question diff --git a/kernel/exit.c b/kernel/exit.c index c9e5a1c14e0..f923724ab3c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -642,35 +642,31 @@ retry: /* * We found no owner yet mm_users > 1: this implies that we are * most likely racing with swapoff (try_to_unuse()) or /proc or - * ptrace or page migration (get_task_mm()). Mark owner as NULL, - * so that subsystems can understand the callback and take action. + * ptrace or page migration (get_task_mm()). Mark owner as NULL. */ - down_write(&mm->mmap_sem); - cgroup_mm_owner_callbacks(mm->owner, NULL); mm->owner = NULL; - up_write(&mm->mmap_sem); return; assign_new_owner: BUG_ON(c == p); get_task_struct(c); - read_unlock(&tasklist_lock); - down_write(&mm->mmap_sem); /* * The task_lock protects c->mm from changing. * We always want mm->owner->mm == mm */ task_lock(c); + /* + * Delay read_unlock() till we have the task_lock() + * to ensure that c does not slip away underneath us + */ + read_unlock(&tasklist_lock); if (c->mm != mm) { task_unlock(c); - up_write(&mm->mmap_sem); put_task_struct(c); goto retry; } - cgroup_mm_owner_callbacks(mm->owner, c); mm->owner = c; task_unlock(c); - up_write(&mm->mmap_sem); put_task_struct(c); } #endif /* CONFIG_MM_OWNER */ -- cgit v1.2.3 From 2da02997e08d3efe8174c7a47696e6f7cbe69ba9 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 6 Jan 2009 14:39:31 -0800 Subject: mm: add dirty_background_bytes and dirty_bytes sysctls This change introduces two new sysctls to /proc/sys/vm: dirty_background_bytes and dirty_bytes. dirty_background_bytes is the counterpart to dirty_background_ratio and dirty_bytes is the counterpart to dirty_ratio. With growing memory capacities of individual machines, it's no longer sufficient to specify dirty thresholds as a percentage of the amount of dirtyable memory over the entire system. 
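A rough worked example of the granularity problem (machine size assumed for illustration):

    dirtyable memory             = 64 GiB
    smallest nonzero ratio (1%)  = ~655 MiB of dirty memory
    dirty_background_bytes       = 134217728  (128 MiB threshold)

No percentage setting can express a 128 MiB background threshold on such a machine, while the byte-based knobs introduced below can.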
dirty_background_bytes and dirty_bytes specify quantities of memory, in bytes, that represent the dirty limits for the entire system. If either of these values is set, its value represents the amount of dirty memory that is needed to commence either background or direct writeback. When a `bytes' or `ratio' file is written, its counterpart becomes a function of the written value. For example, if dirty_bytes is written to be 8096, 8K of memory is required to commence direct writeback. dirty_ratio is then functionally equivalent to 8K / the amount of dirtyable memory: dirtyable_memory = free pages + mapped pages + file cache dirty_background_bytes = dirty_background_ratio * dirtyable_memory -or- dirty_background_ratio = dirty_background_bytes / dirtyable_memory AND dirty_bytes = dirty_ratio * dirtyable_memory -or- dirty_ratio = dirty_bytes / dirtyable_memory Only one of dirty_background_bytes and dirty_background_ratio may be specified at a time, and only one of dirty_bytes and dirty_ratio may be specified. When one sysctl is written, the other appears as 0 when read. The `bytes' files operate on a page size granularity since dirty limits are compared with ZVC values, which are in page units. Prior to this change, the minimum dirty_ratio was 5 as implemented by get_dirty_limits() although /proc/sys/vm/dirty_ratio would show any user written value between 0 and 100. This restriction is maintained, but dirty_bytes has a lower limit of only one page. Also prior to this change, the dirty_background_ratio could not equal or exceed dirty_ratio. This restriction is maintained in addition to restricting dirty_background_bytes. If either background threshold equals or exceeds that of the dirty threshold, it is implicitly set to half the dirty threshold. Acked-by: Peter Zijlstra Cc: Dave Chinner Cc: Christoph Lameter Signed-off-by: David Rientjes Cc: Andrea Righi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ff6d45c7626..92f6e5bc3c2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -87,10 +87,6 @@ extern int rcutorture_runnable; #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ /* Constants used for minimum and maximum */ -#if defined(CONFIG_HIGHMEM) || defined(CONFIG_DETECT_SOFTLOCKUP) -static int one = 1; -#endif - #ifdef CONFIG_DETECT_SOFTLOCKUP static int sixty = 60; static int neg_one = -1; @@ -101,6 +97,7 @@ static int two = 2; #endif static int zero; +static int one = 1; static int one_hundred = 100; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ @@ -952,11 +949,21 @@ static struct ctl_table vm_table[] = { .data = &dirty_background_ratio, .maxlen = sizeof(dirty_background_ratio), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, + .proc_handler = &dirty_background_ratio_handler, .strategy = &sysctl_intvec, .extra1 = &zero, .extra2 = &one_hundred, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "dirty_background_bytes", + .data = &dirty_background_bytes, + .maxlen = sizeof(dirty_background_bytes), + .mode = 0644, + .proc_handler = &dirty_background_bytes_handler, + .strategy = &sysctl_intvec, + .extra1 = &one, + }, { .ctl_name = VM_DIRTY_RATIO, .procname = "dirty_ratio", @@ -968,6 +975,16 @@ static struct ctl_table vm_table[] = { .extra1 = &zero, .extra2 = &one_hundred, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "dirty_bytes", + .data = &vm_dirty_bytes, + .maxlen 
= sizeof(vm_dirty_bytes), + .mode = 0644, + .proc_handler = &dirty_bytes_handler, + .strategy = &sysctl_intvec, + .extra1 = &one, + }, { .procname = "dirty_writeback_centisecs", .data = &dirty_writeback_interval, -- cgit v1.2.3 From 901608d9045146aec6f14a7777ea4b1501c379f0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 6 Jan 2009 14:40:29 -0800 Subject: mm: introduce get_mm_hiwater_xxx(), fix taskstats->hiwater_xxx accounting xacct_add_tsk() relies on do_exit()->update_hiwater_xxx() and uses mm->hiwater_xxx directly; this leads to two problems: - taskstats_user_cmd() can call fill_pid()->xacct_add_tsk() at any moment before the task exits, so we should check the current values of rss/vm anyway. - do_exit()->update_hiwater_xxx() calls are racy. An exiting thread can be preempted right before mm->hiwater_xxx = new_val, and another thread can use A_LOT of memory and exit in between. When the first thread resumes, it can be the last thread in the thread group; in that case we report the wrong hiwater_xxx values, which do not take A_LOT into account. Introduce get_mm_hiwater_rss() and get_mm_hiwater_vm() helpers and change xacct_add_tsk() to use them. The first helper will also be used by rusage->ru_maxrss accounting. Kill the do_exit()->update_hiwater_xxx() calls. Unless we are going to decrease rss/vm, there is no point in updating mm->hiwater_xxx, and nobody can look at this mm_struct when exit_mmap() actually unmaps the memory. Signed-off-by: Oleg Nesterov Acked-by: Hugh Dickins Reviewed-by: KOSAKI Motohiro Acked-by: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 5 +---- kernel/tsacct.c | 4 ++-- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index f923724ab3c..c7740fa3252 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1051,10 +1051,7 @@ NORET_TYPE void do_exit(long code) preempt_count()); acct_update_integrals(tsk); - if (tsk->mm) { - update_hiwater_rss(tsk->mm); - update_hiwater_vm(tsk->mm); - } + group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 2dc06ab3571..43f891b05a4 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -92,8 +92,8 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) mm = get_task_mm(p); if (mm) { /* adjust to KB unit */ - stats->hiwater_rss = mm->hiwater_rss * PAGE_SIZE / KB; - stats->hiwater_vm = mm->hiwater_vm * PAGE_SIZE / KB; + stats->hiwater_rss = get_mm_hiwater_rss(mm) * PAGE_SIZE / KB; + stats->hiwater_vm = get_mm_hiwater_vm(mm) * PAGE_SIZE / KB; mmput(mm); } stats->read_char = p->ioac.rchar; -- cgit v1.2.3 From f1883f86dea84fe47a71a39fc1afccc005915ed8 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 6 Jan 2009 14:40:45 -0800 Subject: Remove remaining unwinder code Signed-off-by: Alexey Dobriyan Cc: Gabor Gombas Cc: Jan Beulich Cc: Andi Kleen Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index f47cce910f2..34b56cf0661 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -43,7 +43,6 @@ #include #include #include -#include #include #include #include @@ -1449,8 +1448,6 @@ static void free_module(struct module *mod) remove_sect_attrs(mod); mod_kobject_remove(mod); - unwind_remove_table(mod->unwind_info, 0); - /* Arch-specific cleanup.
*/ module_arch_cleanup(mod); @@ -1867,7 +1864,6 @@ static noinline struct module *load_module(void __user *umod, unsigned int symindex = 0; unsigned int strindex = 0; unsigned int modindex, versindex, infoindex, pcpuindex; - unsigned int unwindex = 0; unsigned int num_kp, num_mcount; struct kernel_param *kp; struct module *mod; @@ -1957,9 +1953,6 @@ static noinline struct module *load_module(void __user *umod, versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); -#ifdef ARCH_UNWIND_SECTION_NAME - unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME); -#endif /* Don't keep modinfo and version sections. */ sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; @@ -1969,8 +1962,6 @@ static noinline struct module *load_module(void __user *umod, sechdrs[symindex].sh_flags |= SHF_ALLOC; sechdrs[strindex].sh_flags |= SHF_ALLOC; #endif - if (unwindex) - sechdrs[unwindex].sh_flags |= SHF_ALLOC; /* Check module struct version now, before we try to use module. */ if (!check_modstruct_version(sechdrs, versindex, mod)) { @@ -2267,11 +2258,6 @@ static noinline struct module *load_module(void __user *umod, add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); - /* Size of section 0 is 0, so this works well if no unwind info. */ - mod->unwind_info = unwind_add_table(mod, - (void *)sechdrs[unwindex].sh_addr, - sechdrs[unwindex].sh_size); - /* Get rid of temporary copy */ vfree(hdr); @@ -2370,7 +2356,6 @@ sys_init_module(void __user *umod, mutex_lock(&module_mutex); /* Drop initial reference. */ module_put(mod); - unwind_remove_table(mod->unwind_info, 1); module_free(mod, mod->module_init); mod->module_init = NULL; mod->init_size = 0; -- cgit v1.2.3 From 60348802e9cb137ee86590c3e4c57c1ec2e8fc69 Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Tue, 6 Jan 2009 14:40:46 -0800 Subject: fork.c: cleanup for copy_sighand() Checking CLONE_SIGHAND alone is enough, because the combination of CLONE_THREAD and CLONE_SIGHAND is already handled in copy_process(). Impact: cleanup, no functionality changed Signed-off-by: Zhao Lei Cc: Roland McGrath Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 43cbf30669e..23b91211667 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -758,7 +758,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) { struct sighand_struct *sig; - if (clone_flags & (CLONE_SIGHAND | CLONE_THREAD)) { + if (clone_flags & CLONE_SIGHAND) { atomic_inc(&current->sighand->count); return 0; } -- cgit v1.2.3 From d6624f996ae539344e8d748cce1117ae7af06fbf Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Tue, 6 Jan 2009 14:40:54 -0800 Subject: oops: increment the oops UUID every time we oops ...
because we do want repeated same-oops to be seen by automated tools like kerneloops.org Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 13f06349a78..2a2ff36ff44 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -299,6 +299,8 @@ static int init_oops_id(void) { if (!oops_id) get_random_bytes(&oops_id, sizeof(oops_id)); + else + oops_id++; return 0; } -- cgit v1.2.3 From e3d5a27d5862b6425d0879272e24abecf7245105 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 6 Jan 2009 14:41:02 -0800 Subject: Allow times and time system calls to return small negative values At the moment, the times() system call will appear to fail for a period shortly after boot, while the value it wants to return is between -4095 and -1. The same thing will also happen for the time() system call on 32-bit platforms some time in 2106 or so. On some platforms, such as x86, this is unavoidable because of the system call ABI, but other platforms such as powerpc have a separate error indication from the return value, so system calls can in fact return small negative values without indicating an error. On those platforms, force_successful_syscall_return() provides a way to indicate that the system call return value should not be treated as an error even if it is in the range which would normally be taken as a negative error number. This adds a force_successful_syscall_return() call to the time() and times() system calls plus their 32-bit compat versions, so that they don't erroneously indicate an error on those platforms whose system call ABI has a separate error indication. This will not affect anything on other platforms. Joakim Tjernlund added the fix for time() and the compat versions of time() and times(), after I did the fix for times(). Signed-off-by: Joakim Tjernlund Signed-off-by: Paul Mackerras Acked-by: David S.
Miller Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/compat.c | 5 ++++- kernel/sys.c | 2 ++ kernel/time.c | 4 +++- 3 files changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index d52e2ec1deb..42d56544460 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -24,6 +24,7 @@ #include #include #include +#include #include @@ -229,6 +230,7 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf) if (copy_to_user(tbuf, &tmp, sizeof(tmp))) return -EFAULT; } + force_successful_syscall_return(); return compat_jiffies_to_clock_t(jiffies); } @@ -894,8 +896,9 @@ asmlinkage long compat_sys_time(compat_time_t __user * tloc) if (tloc) { if (put_user(i,tloc)) - i = -EFAULT; + return -EFAULT; } + force_successful_syscall_return(); return i; } diff --git a/kernel/sys.c b/kernel/sys.c index d356d79e84a..4a43617cd56 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -927,6 +928,7 @@ asmlinkage long sys_times(struct tms __user * tbuf) if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) return -EFAULT; } + force_successful_syscall_return(); return (long) jiffies_64_to_clock_t(get_jiffies_64()); } diff --git a/kernel/time.c b/kernel/time.c index d63a4336fad..4886e3ce83a 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -65,8 +66,9 @@ asmlinkage long sys_time(time_t __user * tloc) if (tloc) { if (put_user(i,tloc)) - i = -EFAULT; + return -EFAULT; } + force_successful_syscall_return(); return i; } -- cgit v1.2.3 From 26e5438e4b77f04a51870f9415ffed68004fac1d Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 6 Jan 2009 14:41:10 -0800 Subject: profile: don't include twice. Currently, kernel/profile.c includes twice. It can be removed. Signed-off-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/profile.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index d18e2d2654f..784933acf5b 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -445,7 +445,6 @@ void profile_tick(int type) #ifdef CONFIG_PROC_FS #include #include -#include static int prof_cpu_mask_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) -- cgit v1.2.3 From bc2f70151fe7a117dbe8347edc5a877e749572a3 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 6 Jan 2009 14:41:46 -0800 Subject: kprobes: bugfix: try_module_get even if calling_mod is NULL When someone calls register_*probe() from kernel-core code (not from a module) and the probe targets a kernel module, users can remove the probed module, because kprobes does not increment the module's reference counter. (On the other hand, if a kernel module calls register_*probe(), kprobes does increment the refcount of the probed module.) Currently there are no register_*probe() calls from kernel-core code (except the smoke test, which doesn't probe a module), so there are no real bugs. But the logic is wrong (or at least inconsistent), and it could cause a problem if someone wanted to probe a module from the kernel. After this patch is applied, even a register_*probe() call placed in kernel-core code increments the reference counter of the probed module, preventing users from removing the module until probing stops.
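As a sketch of the case this fixes (handler and symbol names below are hypothetical, not from the patch), consider a probe registered from built-in code against a function that lives in a module; with this change, register_kprobe() takes a reference on that module even though there is no calling module:

/* Hypothetical built-in (non-module) caller probing a module symbol. */
static int example_pre(struct kprobe *p, struct pt_regs *regs)
{
        return 0;   /* observe only; let the probed instruction execute */
}

static struct kprobe example_kp = {
        .symbol_name = "example_module_func",   /* assumed module symbol */
        .pre_handler = example_pre,
};

/* After this fix, register_kprobe(&example_kp) from core kernel code
 * pins the probed module until unregister_kprobe(&example_kp). */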
Signed-off-by: Masami Hiramatsu Cc: Lai Jiangshan Cc: Ananth N Mavinakayanahalli Cc: Hiroshi Shimamoto Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9f8a3f25259..3afd354c46f 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -634,7 +634,7 @@ static int __kprobes __register_kprobe(struct kprobe *p, * avoid incrementing the module refcount, so as to allow * unloading of self probing modules. */ - if (calling_mod && calling_mod != probed_mod) { + if (calling_mod != probed_mod) { if (unlikely(!try_module_get(probed_mod))) { preempt_enable(); return -EINVAL; } -- cgit v1.2.3 From 8e1144050e49dd4ef19c117dc5626f212cfe73cf Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 6 Jan 2009 14:41:47 -0800 Subject: kprobes: indirectly call kprobe_target Call kprobe_target indirectly. This prevents gcc from unrolling a noinline function into its caller. I ported patches which had been discussed on http://sources.redhat.com/bugzilla/show_bug.cgi?id=3542 Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: David Miller Cc: Anil S Keshavamurthy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/test_kprobes.c | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index 06b6395b45b..9c0127ead6a 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c @@ -22,21 +22,10 @@ static u32 rand1, preh_val, posth_val, jph_val; static int errors, handler_errors, num_tests; +static u32 (*target)(u32 value); static noinline u32 kprobe_target(u32 value) { - /* - * gcc ignores noinline on some architectures unless we stuff - * sufficient lard into the function. The get_kprobe() here is - * just for that. - * - * NOTE: We aren't concerned about the correctness of get_kprobe() - * here; hence, this call is neither under !preempt nor with the - * kprobe_mutex held. This is fine(tm) - */ - if (get_kprobe((void *)0xdeadbeef)) - printk(KERN_INFO "Kprobe smoke test: probe on 0xdeadbeef!\n"); - return (value / div_factor); } @@ -74,7 +63,7 @@ static int test_kprobe(void) return ret; } - ret = kprobe_target(rand1); + ret = target(rand1); unregister_kprobe(&kp); if (preh_val == 0) { @@ -121,7 +110,7 @@ static int test_jprobe(void) return ret; } - ret = kprobe_target(rand1); + ret = target(rand1); unregister_jprobe(&jp); if (jph_val == 0) { printk(KERN_ERR "Kprobe smoke test failed: " @@ -177,7 +166,7 @@ static int test_kretprobe(void) return ret; } - ret = kprobe_target(rand1); + ret = target(rand1); unregister_kretprobe(&rp); if (krph_val != rand1) { printk(KERN_ERR "Kprobe smoke test failed: " @@ -193,6 +182,8 @@ int init_test_probes(void) { int ret; + target = kprobe_target; + do { rand1 = random32(); } while (rand1 <= div_factor); -- cgit v1.2.3 From 12da3b888b2035bb0f106122f1cc1b6d357fad53 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 6 Jan 2009 14:41:48 -0800 Subject: kprobes: add tests for register_kprobes Add testcases for *probe batch registration (register_kprobes) to kprobes sanity tests.
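In use, the batch API these tests exercise looks roughly like this (symbols and handler are illustrative, mirroring the calls in the diff below):

static int example_pre(struct kprobe *p, struct pt_regs *regs)
{
        return 0;   /* observe only */
}

static struct kprobe kp_a = {
        .symbol_name = "example_func_a",        /* assumed symbols */
        .pre_handler = example_pre,
};

static struct kprobe kp_b = {
        .symbol_name = "example_func_b",
        .pre_handler = example_pre,
};

static int __init batch_example(void)
{
        struct kprobe *kps[2] = { &kp_a, &kp_b };
        int ret;

        ret = register_kprobes(kps, 2);         /* register both in one call */
        if (ret < 0)
                return ret;
        /* ... probes are live here ... */
        unregister_kprobes(kps, 2);             /* batch removal */
        return 0;
}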
Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: David Miller Cc: Anil S Keshavamurthy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/test_kprobes.c | 189 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 189 insertions(+) (limited to 'kernel') diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index 9c0127ead6a..4f104515a19 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c @@ -23,6 +23,7 @@ static u32 rand1, preh_val, posth_val, jph_val; static int errors, handler_errors, num_tests; static u32 (*target)(u32 value); +static u32 (*target2)(u32 value); static noinline u32 kprobe_target(u32 value) { @@ -81,6 +82,84 @@ static int test_kprobe(void) return 0; } +static noinline u32 kprobe_target2(u32 value) +{ + return (value / div_factor) + 1; +} + +static int kp_pre_handler2(struct kprobe *p, struct pt_regs *regs) +{ + preh_val = (rand1 / div_factor) + 1; + return 0; +} + +static void kp_post_handler2(struct kprobe *p, struct pt_regs *regs, + unsigned long flags) +{ + if (preh_val != (rand1 / div_factor) + 1) { + handler_errors++; + printk(KERN_ERR "Kprobe smoke test failed: " + "incorrect value in post_handler2\n"); + } + posth_val = preh_val + div_factor; +} + +static struct kprobe kp2 = { + .symbol_name = "kprobe_target2", + .pre_handler = kp_pre_handler2, + .post_handler = kp_post_handler2 +}; + +static int test_kprobes(void) +{ + int ret; + struct kprobe *kps[2] = {&kp, &kp2}; + + kp.addr = 0; /* addr should be cleard for reusing kprobe. */ + ret = register_kprobes(kps, 2); + if (ret < 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "register_kprobes returned %d\n", ret); + return ret; + } + + preh_val = 0; + posth_val = 0; + ret = target(rand1); + + if (preh_val == 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "kprobe pre_handler not called\n"); + handler_errors++; + } + + if (posth_val == 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "kprobe post_handler not called\n"); + handler_errors++; + } + + preh_val = 0; + posth_val = 0; + ret = target2(rand1); + + if (preh_val == 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "kprobe pre_handler2 not called\n"); + handler_errors++; + } + + if (posth_val == 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "kprobe post_handler2 not called\n"); + handler_errors++; + } + + unregister_kprobes(kps, 2); + return 0; + +} + static u32 j_kprobe_target(u32 value) { if (value != rand1) { @@ -121,6 +200,43 @@ static int test_jprobe(void) return 0; } +static struct jprobe jp2 = { + .entry = j_kprobe_target, + .kp.symbol_name = "kprobe_target2" +}; + +static int test_jprobes(void) +{ + int ret; + struct jprobe *jps[2] = {&jp, &jp2}; + + jp.kp.addr = 0; /* addr should be cleard for reusing kprobe. 
*/ + ret = register_jprobes(jps, 2); + if (ret < 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "register_jprobes returned %d\n", ret); + return ret; + } + + jph_val = 0; + ret = target(rand1); + if (jph_val == 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "jprobe handler not called\n"); + handler_errors++; + } + + jph_val = 0; + ret = target2(rand1); + if (jph_val == 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "jprobe handler2 not called\n"); + handler_errors++; + } + unregister_jprobes(jps, 2); + + return 0; +} #ifdef CONFIG_KRETPROBES static u32 krph_val; @@ -176,6 +292,63 @@ static int test_kretprobe(void) return 0; } + +static int return_handler2(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + unsigned long ret = regs_return_value(regs); + + if (ret != (rand1 / div_factor) + 1) { + handler_errors++; + printk(KERN_ERR "Kprobe smoke test failed: " + "incorrect value in kretprobe handler2\n"); + } + if (krph_val == 0) { + handler_errors++; + printk(KERN_ERR "Kprobe smoke test failed: " + "call to kretprobe entry handler failed\n"); + } + + krph_val = rand1; + return 0; +} + +static struct kretprobe rp2 = { + .handler = return_handler2, + .entry_handler = entry_handler, + .kp.symbol_name = "kprobe_target2" +}; + +static int test_kretprobes(void) +{ + int ret; + struct kretprobe *rps[2] = {&rp, &rp2}; + + rp.kp.addr = 0; /* addr should be cleard for reusing kprobe. */ + ret = register_kretprobes(rps, 2); + if (ret < 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "register_kretprobe returned %d\n", ret); + return ret; + } + + krph_val = 0; + ret = target(rand1); + if (krph_val != rand1) { + printk(KERN_ERR "Kprobe smoke test failed: " + "kretprobe handler not called\n"); + handler_errors++; + } + + krph_val = 0; + ret = target2(rand1); + if (krph_val != rand1) { + printk(KERN_ERR "Kprobe smoke test failed: " + "kretprobe handler2 not called\n"); + handler_errors++; + } + unregister_kretprobes(rps, 2); + return 0; +} #endif /* CONFIG_KRETPROBES */ int init_test_probes(void) @@ -183,6 +356,7 @@ int ret; target = kprobe_target; + target2 = kprobe_target2; do { rand1 = random32(); @@ -194,16 +368,31 @@ if (ret < 0) errors++; + num_tests++; + ret = test_kprobes(); + if (ret < 0) + errors++; + num_tests++; ret = test_jprobe(); if (ret < 0) errors++; + num_tests++; + ret = test_jprobes(); + if (ret < 0) + errors++; + #ifdef CONFIG_KRETPROBES num_tests++; ret = test_kretprobe(); if (ret < 0) errors++; + + num_tests++; + ret = test_kretprobes(); + if (ret < 0) + errors++; #endif /* CONFIG_KRETPROBES */ if (errors) -- cgit v1.2.3 From a06f6211ef9b1785922f9d0e8766d63ac4e66de1 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 6 Jan 2009 14:41:49 -0800 Subject: module: add within_module_core() and within_module_init() This series of patches allows kprobes to probe a module's __init and __exit functions. This means you can probe driver initialization and termination. Currently, kprobes can't probe __init functions because these functions are freed after module initialization. It also can't probe module __exit functions, because kprobes increments the reference count of the target module so the user can't unload it; this means __exit functions are never called unless the probes are removed from the module. To solve both cases, this series of patches introduces a GONE flag and sets it when the target code is freed (for this purpose, kprobes hooks MODULE_STATE_* events).
It also removes the refcount incrementing, allowing users to unload the target module. Users can check which probes are GONE via the debugfs interface. To catch the moment a module's .init text is freed, the series also includes a patch which adds a module notifier for the MODULE_STATE_LIVE event. This patch: Add within_module_core() and within_module_init() for checking whether an address is in the module's .init.text section or .text section, and replace the within() local inline functions in kernel/module.c with them. kprobes uses these functions to check where a kprobe is inserted. Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Acked-by: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 34b56cf0661..cc79c942c57 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2390,7 +2390,7 @@ static const char *get_ksymbol(struct module *mod, unsigned long nextval; /* At worse, next value is at end of module */ - if (within(addr, mod->module_init, mod->init_size)) + if (within_module_init(addr, mod)) nextval = (unsigned long)mod->module_init+mod->init_text_size; else nextval = (unsigned long)mod->module_core+mod->core_text_size; @@ -2438,8 +2438,8 @@ const char *module_address_lookup(unsigned long addr, preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { - if (within(addr, mod->module_init, mod->init_size) - || within(addr, mod->module_core, mod->core_size)) { + if (within_module_init(addr, mod) || + within_module_core(addr, mod)) { if (modname) *modname = mod->name; ret = get_ksymbol(mod, addr, size, offset); @@ -2461,8 +2461,8 @@ int lookup_module_symbol_name(unsigned long addr, char *symname) preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { - if (within(addr, mod->module_init, mod->init_size) || - within(addr, mod->module_core, mod->core_size)) { + if (within_module_init(addr, mod) || + within_module_core(addr, mod)) { const char *sym; sym = get_ksymbol(mod, addr, NULL, NULL); @@ -2485,8 +2485,8 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { - if (within(addr, mod->module_init, mod->init_size) || - within(addr, mod->module_core, mod->core_size)) { + if (within_module_init(addr, mod) || + within_module_core(addr, mod)) { const char *sym; sym = get_ksymbol(mod, addr, size, offset); @@ -2705,7 +2705,7 @@ int is_module_address(unsigned long addr) preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { - if (within(addr, mod->module_core, mod->core_size)) { + if (within_module_core(addr, mod)) { preempt_enable(); return 1; } -- cgit v1.2.3 From 129415607845d4daea11ddcba706005c69dcb942 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 6 Jan 2009 14:41:50 -0800 Subject: kprobes: add kprobe_insn_mutex and cleanup arch_remove_kprobe() Add kprobe_insn_mutex for protecting the kprobe_insn_pages hlist, and remove kprobe_mutex from architecture dependent code. This allows us to call arch_remove_kprobe() (and free_insn_slot) while holding kprobe_mutex.
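The resulting lock relationship, sketched from the hunks below (not literal kernel code):

/*
 * mutex_lock(&kprobe_mutex);                    unregister path
 *     arch_remove_kprobe(p);                    now legal under kprobe_mutex
 *         free_insn_slot(slot, dirty);
 *             mutex_lock(&kprobe_insn_mutex);   slot list has its own lock
 *
 * Note how collect_garbage_slots() drops kprobe_insn_mutex around
 * check_safety(), as the diff shows, presumably so other slot users are
 * not held up while it waits for preempted tasks.
 */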
Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: Russell King Cc: "Luck, Tony" Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 3afd354c46f..29e87921437 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -69,7 +69,7 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; /* NOTE: change this value only with kprobe_mutex held */ static bool kprobe_enabled; -DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ +static DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; static struct { spinlock_t lock ____cacheline_aligned_in_smp; @@ -115,6 +115,7 @@ enum kprobe_slot_state { SLOT_USED = 2, }; +static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */ static struct hlist_head kprobe_insn_pages; static int kprobe_garbage_slots; static int collect_garbage_slots(void); @@ -144,10 +145,10 @@ loop_end: } /** - * get_insn_slot() - Find a slot on an executable page for an instruction. + * __get_insn_slot() - Find a slot on an executable page for an instruction. * We allocate an executable page if there's no room on existing ones. */ -kprobe_opcode_t __kprobes *get_insn_slot(void) +static kprobe_opcode_t __kprobes *__get_insn_slot(void) { struct kprobe_insn_page *kip; struct hlist_node *pos; @@ -196,6 +197,15 @@ kprobe_opcode_t __kprobes *get_insn_slot(void) return kip->insns; } +kprobe_opcode_t __kprobes *get_insn_slot(void) +{ + kprobe_opcode_t *ret; + mutex_lock(&kprobe_insn_mutex); + ret = __get_insn_slot(); + mutex_unlock(&kprobe_insn_mutex); + return ret; +} + /* Return 1 if all garbages are collected, otherwise 0. */ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) { @@ -226,9 +236,13 @@ static int __kprobes collect_garbage_slots(void) { struct kprobe_insn_page *kip; struct hlist_node *pos, *next; + int safety; /* Ensure no-one is preepmted on the garbages */ - if (check_safety() != 0) + mutex_unlock(&kprobe_insn_mutex); + safety = check_safety(); + mutex_lock(&kprobe_insn_mutex); + if (safety != 0) return -EAGAIN; hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) { @@ -251,6 +265,7 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) struct kprobe_insn_page *kip; struct hlist_node *pos; + mutex_lock(&kprobe_insn_mutex); hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { if (kip->insns <= slot && slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { @@ -267,6 +282,8 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE) collect_garbage_slots(); + + mutex_unlock(&kprobe_insn_mutex); } #endif -- cgit v1.2.3 From 017c39bdb1b3ac1da6db339474a77b528043c05a Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 6 Jan 2009 14:41:51 -0800 Subject: kprobes: add __kprobes to kprobe internal functions Add __kprobes to kprobes internal functions to protect them from being probed by kprobes itself.
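A sketch of what the annotation buys (illustrative function; the mechanism is the .kprobes.text section):

/* __kprobes places the function in the .kprobes.text section, and the
 * registration path refuses addresses inside that section, so annotated
 * internal helpers cannot themselves be probed and recurse into kprobes. */
static int __kprobes example_internal_helper(void)
{
        return 0;
}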
Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 29e87921437..a1e233a1958 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -410,7 +410,7 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, hlist_add_head(&ri->hlist, head); } -void kretprobe_hash_lock(struct task_struct *tsk, +void __kprobes kretprobe_hash_lock(struct task_struct *tsk, struct hlist_head **head, unsigned long *flags) { unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); @@ -421,13 +421,15 @@ void kretprobe_hash_lock(struct task_struct *tsk, spin_lock_irqsave(hlist_lock, *flags); } -static void kretprobe_table_lock(unsigned long hash, unsigned long *flags) +static void __kprobes kretprobe_table_lock(unsigned long hash, + unsigned long *flags) { spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); spin_lock_irqsave(hlist_lock, *flags); } -void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags) +void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, + unsigned long *flags) { unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); spinlock_t *hlist_lock; @@ -436,7 +438,7 @@ void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags) spin_unlock_irqrestore(hlist_lock, *flags); } -void kretprobe_table_unlock(unsigned long hash, unsigned long *flags) +void __kprobes kretprobe_table_unlock(unsigned long hash, unsigned long *flags) { spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); spin_unlock_irqrestore(hlist_lock, *flags); @@ -762,7 +764,7 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) } } -static int __register_kprobes(struct kprobe **kps, int num, +static int __kprobes __register_kprobes(struct kprobe **kps, int num, unsigned long called_from) { int i, ret = 0; @@ -828,7 +830,7 @@ unsigned long __weak arch_deref_entry_point(void *entry) return (unsigned long)entry; } -static int __register_jprobes(struct jprobe **jps, int num, +static int __kprobes __register_jprobes(struct jprobe **jps, int num, unsigned long called_from) { struct jprobe *jp; @@ -990,7 +992,7 @@ static int __kprobes __register_kretprobe(struct kretprobe *rp, return ret; } -static int __register_kretprobes(struct kretprobe **rps, int num, +static int __kprobes __register_kretprobes(struct kretprobe **rps, int num, unsigned long called_from) { int ret = 0, i; -- cgit v1.2.3 From e8386a0cb22f4a2d439384212c494ad0bda848fe Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 6 Jan 2009 14:41:52 -0800 Subject: kprobes: support probing module __exit function Allows kprobes to probe an __exit routine. This adds a flags member to struct kprobe. When a module is freed (kprobes hooks the module notifier to get this event), kprobes which probe functions in that module get the "Gone" flag set in their flags member. These "Gone" probes are never enabled. Users can check the GONE flag through debugfs. This also removes mod_refcounted, because we couldn't free a module if kprobes incremented the refcount of that module.
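With the debugfs reporting change further below, a probe whose module has been freed shows up in /sys/kernel/debug/kprobes/list marked [GONE]; an illustrative line (address, symbol and module names assumed):

    c04f1234 k example_fn+0x0 example_mod [GONE]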
[akpm@linux-foundation.org: document some locking] [mhiramat@redhat.com: bugfix: pass aggr_kprobe to arch_remove_kprobe] [mhiramat@redhat.com: bugfix: release old_p's insn_slot before error return] Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Signed-off-by: Masami Hiramatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 159 +++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 119 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index a1e233a1958..cb732a9aa55 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -327,7 +327,7 @@ static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) struct kprobe *kp; list_for_each_entry_rcu(kp, &p->list, list) { - if (kp->pre_handler) { + if (kp->pre_handler && !kprobe_gone(kp)) { set_kprobe_instance(kp); if (kp->pre_handler(kp, regs)) return 1; @@ -343,7 +343,7 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, struct kprobe *kp; list_for_each_entry_rcu(kp, &p->list, list) { - if (kp->post_handler) { + if (kp->post_handler && !kprobe_gone(kp)) { set_kprobe_instance(kp); kp->post_handler(kp, regs, flags); reset_kprobe_instance(); @@ -545,9 +545,10 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) ap->addr = p->addr; ap->pre_handler = aggr_pre_handler; ap->fault_handler = aggr_fault_handler; - if (p->post_handler) + /* We don't care the kprobe which has gone. */ + if (p->post_handler && !kprobe_gone(p)) ap->post_handler = aggr_post_handler; - if (p->break_handler) + if (p->break_handler && !kprobe_gone(p)) ap->break_handler = aggr_break_handler; INIT_LIST_HEAD(&ap->list); @@ -566,17 +567,41 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, int ret = 0; struct kprobe *ap; + if (kprobe_gone(old_p)) { + /* + * Attempting to insert new probe at the same location that + * had a probe in the module vaddr area which already + * freed. So, the instruction slot has already been + * released. We need a new slot for the new probe. + */ + ret = arch_prepare_kprobe(old_p); + if (ret) + return ret; + } if (old_p->pre_handler == aggr_pre_handler) { copy_kprobe(old_p, p); ret = add_new_kprobe(old_p, p); + ap = old_p; } else { ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL); - if (!ap) + if (!ap) { + if (kprobe_gone(old_p)) + arch_remove_kprobe(old_p); return -ENOMEM; + } add_aggr_kprobe(ap, old_p); copy_kprobe(ap, p); ret = add_new_kprobe(ap, p); } + if (kprobe_gone(old_p)) { + /* + * If the old_p has gone, its breakpoint has been disarmed. + * We have to arm it again after preparing real kprobes. + */ + ap->flags &= ~KPROBE_FLAG_GONE; + if (kprobe_enabled) + arch_arm_kprobe(ap); + } return ret; } @@ -639,8 +664,7 @@ static int __kprobes __register_kprobe(struct kprobe *p, return -EINVAL; } - p->mod_refcounted = 0; - + p->flags = 0; /* * Check if are we probing a module. */ @@ -649,16 +673,14 @@ static int __kprobes __register_kprobe(struct kprobe *p, struct module *calling_mod; calling_mod = __module_text_address(called_from); /* - * We must allow modules to probe themself and in this case - * avoid incrementing the module refcount, so as to allow - * unloading of self probing modules. + * We must hold a refcount of the probed module while updating + * its code to prohibit unexpected unloading. 
*/ if (calling_mod != probed_mod) { if (unlikely(!try_module_get(probed_mod))) { preempt_enable(); return -EINVAL; } - p->mod_refcounted = 1; } else probed_mod = NULL; } @@ -687,8 +709,9 @@ static int __kprobes __register_kprobe(struct kprobe *p, out: mutex_unlock(&kprobe_mutex); - if (ret && probed_mod) + if (probed_mod) module_put(probed_mod); + return ret; } @@ -716,16 +739,16 @@ valid_p: list_is_singular(&old_p->list))) { /* * Only probe on the hash list. Disarm only if kprobes are - * enabled - otherwise, the breakpoint would already have - * been removed. We save on flushing icache. + * enabled and not gone - otherwise, the breakpoint would + * already have been removed. We save on flushing icache. */ - if (kprobe_enabled) + if (kprobe_enabled && !kprobe_gone(old_p)) arch_disarm_kprobe(p); hlist_del_rcu(&old_p->hlist); } else { - if (p->break_handler) + if (p->break_handler && !kprobe_gone(p)) old_p->break_handler = NULL; - if (p->post_handler) { + if (p->post_handler && !kprobe_gone(p)) { list_for_each_entry_rcu(list_p, &old_p->list, list) { if ((list_p != p) && (list_p->post_handler)) goto noclean; @@ -740,27 +763,16 @@ noclean: static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) { - struct module *mod; struct kprobe *old_p; - if (p->mod_refcounted) { - /* - * Since we've already incremented refcount, - * we don't need to disable preemption. - */ - mod = module_text_address((unsigned long)p->addr); - if (mod) - module_put(mod); - } - - if (list_empty(&p->list) || list_is_singular(&p->list)) { - if (!list_empty(&p->list)) { - /* "p" is the last child of an aggr_kprobe */ - old_p = list_entry(p->list.next, struct kprobe, list); - list_del(&p->list); - kfree(old_p); - } + if (list_empty(&p->list)) arch_remove_kprobe(p); + else if (list_is_singular(&p->list)) { + /* "p" is the last child of an aggr_kprobe */ + old_p = list_entry(p->list.next, struct kprobe, list); + list_del(&p->list); + arch_remove_kprobe(old_p); + kfree(old_p); } } @@ -1074,6 +1086,67 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, #endif /* CONFIG_KRETPROBES */ +/* Set the kprobe gone and remove its instruction buffer. */ +static void __kprobes kill_kprobe(struct kprobe *p) +{ + struct kprobe *kp; + p->flags |= KPROBE_FLAG_GONE; + if (p->pre_handler == aggr_pre_handler) { + /* + * If this is an aggr_kprobe, we have to list all the + * chained probes and mark them GONE. + */ + list_for_each_entry_rcu(kp, &p->list, list) + kp->flags |= KPROBE_FLAG_GONE; + p->post_handler = NULL; + p->break_handler = NULL; + } + /* + * Here, we can remove insn_slot safely, because no thread calls + * the original probed function (which will be freed soon) any more. + */ + arch_remove_kprobe(p); +} + +/* Module notifier call back, checking kprobes on the module */ +static int __kprobes kprobes_module_callback(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct module *mod = data; + struct hlist_head *head; + struct hlist_node *node; + struct kprobe *p; + unsigned int i; + + if (val != MODULE_STATE_GOING) + return NOTIFY_DONE; + + /* + * module .text section will be freed. We need to + * disable kprobes which have been inserted in the section. + */ + mutex_lock(&kprobe_mutex); + for (i = 0; i < KPROBE_TABLE_SIZE; i++) { + head = &kprobe_table[i]; + hlist_for_each_entry_rcu(p, node, head, hlist) + if (within_module_core((unsigned long)p->addr, mod)) { + /* + * The vaddr this probe is installed will soon + * be vfreed buy not synced to disk. 
Hence, + disarming the breakpoint isn't needed. */ + kill_kprobe(p); + } + } + mutex_unlock(&kprobe_mutex); + return NOTIFY_DONE; +} + +static struct notifier_block kprobe_module_nb = { + .notifier_call = kprobes_module_callback, + .priority = 0 +}; + static int __init init_kprobes(void) { int i, err = 0; @@ -1130,6 +1203,9 @@ static int __init init_kprobes(void) err = arch_init_kprobes(); if (!err) err = register_die_notifier(&kprobe_exceptions_nb); + if (!err) + err = register_module_notifier(&kprobe_module_nb); + kprobes_initialized = (err == 0); if (!err) @@ -1150,10 +1226,12 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, else kprobe_type = "k"; if (sym) - seq_printf(pi, "%p %s %s+0x%x %s\n", p->addr, kprobe_type, - sym, offset, (modname ? modname : " ")); + seq_printf(pi, "%p %s %s+0x%x %s %s\n", p->addr, kprobe_type, + sym, offset, (modname ? modname : " "), + (kprobe_gone(p) ? "[GONE]" : "")); else - seq_printf(pi, "%p %s %p\n", p->addr, kprobe_type, p->addr); + seq_printf(pi, "%p %s %p %s\n", p->addr, kprobe_type, p->addr, + (kprobe_gone(p) ? "[GONE]" : "")); } static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) @@ -1234,7 +1312,8 @@ static void __kprobes enable_all_kprobes(void) for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) - arch_arm_kprobe(p); + if (!kprobe_gone(p)) + arch_arm_kprobe(p); } kprobe_enabled = true; @@ -1263,7 +1342,7 @@ static void __kprobes disable_all_kprobes(void) for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) { - if (!arch_trampoline_kprobe(p)) + if (!arch_trampoline_kprobe(p) && !kprobe_gone(p)) arch_disarm_kprobe(p); } } -- cgit v1.2.3 From 49ad2fd76c97133fb396edc24ded7fe26093a578 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 6 Jan 2009 14:41:53 -0800 Subject: kprobes: remove called_from argument Remove the called_from argument from kprobes; it had been used to detect a module probing itself, so that its own refcount would not be taken. However, since we no longer keep the module's refcount after registering a kprobe, there is no reason to check that. This patch also simplifies the registering/unregistering functions, because we no longer need the __builtin_return_address(0) value that was passed in as called_from. [ananth@in.ibm.com: build fix] Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Signed-off-by: Ananth N Mavinakayanahalli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 72 +++++++++++--------------------------------------- 1 file changed, 14 insertions(+), 58 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index cb732a9aa55..ddefb9fae0c 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -644,8 +644,7 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) return (kprobe_opcode_t *)(((char *)addr) + p->offset); } -static int __kprobes __register_kprobe(struct kprobe *p, - unsigned long called_from) +int __kprobes register_kprobe(struct kprobe *p) { int ret = 0; struct kprobe *old_p; @@ -670,19 +669,14 @@ static int __kprobes __register_kprobe(struct kprobe *p, */ probed_mod = __module_text_address((unsigned long) p->addr); if (probed_mod) { - struct module *calling_mod; - calling_mod = __module_text_address(called_from); /* * We must hold a refcount of the probed module while updating * its code to prohibit unexpected unloading.
*/ - if (calling_mod != probed_mod) { - if (unlikely(!try_module_get(probed_mod))) { - preempt_enable(); - return -EINVAL; - } - } else - probed_mod = NULL; + if (unlikely(!try_module_get(probed_mod))) { + preempt_enable(); + return -EINVAL; + } } preempt_enable(); @@ -776,15 +770,14 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) } } -static int __kprobes __register_kprobes(struct kprobe **kps, int num, - unsigned long called_from) +int __kprobes register_kprobes(struct kprobe **kps, int num) { int i, ret = 0; if (num <= 0) return -EINVAL; for (i = 0; i < num; i++) { - ret = __register_kprobe(kps[i], called_from); + ret = register_kprobe(kps[i]); if (ret < 0) { if (i > 0) unregister_kprobes(kps, i); @@ -794,26 +787,11 @@ static int __kprobes __register_kprobes(struct kprobe **kps, int num, return ret; } -/* - * Registration and unregistration functions for kprobe. - */ -int __kprobes register_kprobe(struct kprobe *p) -{ - return __register_kprobes(&p, 1, - (unsigned long)__builtin_return_address(0)); -} - void __kprobes unregister_kprobe(struct kprobe *p) { unregister_kprobes(&p, 1); } -int __kprobes register_kprobes(struct kprobe **kps, int num) -{ - return __register_kprobes(kps, num, - (unsigned long)__builtin_return_address(0)); -} - void __kprobes unregister_kprobes(struct kprobe **kps, int num) { int i; @@ -842,8 +820,7 @@ unsigned long __weak arch_deref_entry_point(void *entry) return (unsigned long)entry; } -static int __kprobes __register_jprobes(struct jprobe **jps, int num, - unsigned long called_from) +int __kprobes register_jprobes(struct jprobe **jps, int num) { struct jprobe *jp; int ret = 0, i; @@ -861,7 +838,7 @@ static int __kprobes __register_jprobes(struct jprobe **jps, int num, /* Todo: Verify probepoint is a function entry point */ jp->kp.pre_handler = setjmp_pre_handler; jp->kp.break_handler = longjmp_break_handler; - ret = __register_kprobe(&jp->kp, called_from); + ret = register_kprobe(&jp->kp); } if (ret < 0) { if (i > 0) @@ -874,8 +851,7 @@ static int __kprobes __register_jprobes(struct jprobe **jps, int num, int __kprobes register_jprobe(struct jprobe *jp) { - return __register_jprobes(&jp, 1, - (unsigned long)__builtin_return_address(0)); + return register_jprobes(&jp, 1); } void __kprobes unregister_jprobe(struct jprobe *jp) @@ -883,12 +859,6 @@ void __kprobes unregister_jprobe(struct jprobe *jp) unregister_jprobes(&jp, 1); } -int __kprobes register_jprobes(struct jprobe **jps, int num) -{ - return __register_jprobes(jps, num, - (unsigned long)__builtin_return_address(0)); -} - void __kprobes unregister_jprobes(struct jprobe **jps, int num) { int i; @@ -951,8 +921,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, return 0; } -static int __kprobes __register_kretprobe(struct kretprobe *rp, - unsigned long called_from) +int __kprobes register_kretprobe(struct kretprobe *rp) { int ret = 0; struct kretprobe_instance *inst; @@ -998,21 +967,20 @@ static int __kprobes __register_kretprobe(struct kretprobe *rp, rp->nmissed = 0; /* Establish function entry probe point */ - ret = __register_kprobe(&rp->kp, called_from); + ret = register_kprobe(&rp->kp); if (ret != 0) free_rp_inst(rp); return ret; } -static int __kprobes __register_kretprobes(struct kretprobe **rps, int num, - unsigned long called_from) +int __kprobes register_kretprobes(struct kretprobe **rps, int num) { int ret = 0, i; if (num <= 0) return -EINVAL; for (i = 0; i < num; i++) { - ret = __register_kretprobe(rps[i], called_from); + ret = register_kretprobe(rps[i]); if 
(ret < 0) { if (i > 0) unregister_kretprobes(rps, i); @@ -1022,23 +990,11 @@ static int __kprobes __register_kretprobes(struct kretprobe **rps, int num, return ret; } -int __kprobes register_kretprobe(struct kretprobe *rp) -{ - return __register_kretprobes(&rp, 1, - (unsigned long)__builtin_return_address(0)); -} - void __kprobes unregister_kretprobe(struct kretprobe *rp) { unregister_kretprobes(&rp, 1); } -int __kprobes register_kretprobes(struct kretprobe **rps, int num) -{ - return __register_kretprobes(rps, num, - (unsigned long)__builtin_return_address(0)); -} - void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) { int i; -- cgit v1.2.3 From 0deddf436a37c18ceb26c6e3b632fb9b5f58a0c1 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 6 Jan 2009 14:41:54 -0800 Subject: module: add MODULE_STATE_LIVE notify Add a module notifier call which fires when the state of a module changes from MODULE_STATE_COMING to MODULE_STATE_LIVE. Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Acked-by: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index cc79c942c57..496dcb57b60 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2352,6 +2352,8 @@ sys_init_module(void __user *umod, /* Now it's a first class citizen! Wake up anyone waiting for it. */ mod->state = MODULE_STATE_LIVE; wake_up(&module_wq); + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_LIVE, mod); mutex_lock(&module_mutex); /* Drop initial reference. */ -- cgit v1.2.3 From f24659d96f4e056125f14498285203d1427cb18e Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 6 Jan 2009 14:41:55 -0800 Subject: kprobes: support probing module __init function Allow kprobes to probe module __init routines. When __init functions are freed, kprobes that probe those functions get the "Gone" flag set. These "Gone" probes are disarmed from the code and never re-enabled. Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ddefb9fae0c..1b9cbdc0127 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -677,6 +677,16 @@ int __kprobes register_kprobe(struct kprobe *p) preempt_enable(); return -EINVAL; } + /* + * If the module freed .init.text, we couldn't insert + * kprobes in there. + */ + if (within_module_init((unsigned long)p->addr, probed_mod) && + probed_mod->state != MODULE_STATE_COMING) { + module_put(probed_mod); + preempt_enable(); + return -EINVAL; + } } preempt_enable(); @@ -1073,19 +1083,24 @@ static int __kprobes kprobes_module_callback(struct notifier_block *nb, struct hlist_node *node; struct kprobe *p; unsigned int i; + int checkcore = (val == MODULE_STATE_GOING); - if (val != MODULE_STATE_GOING) + if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE) return NOTIFY_DONE; /* - * module .text section will be freed. We need to - * disable kprobes which have been inserted in the section. + * When MODULE_STATE_GOING was notified, both of module .text and + * .init.text sections would be freed. When MODULE_STATE_LIVE was + * notified, only .init.text section would be freed.
We need to + disable kprobes which have been inserted in the sections. */ mutex_lock(&kprobe_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) - if (within_module_core((unsigned long)p->addr, mod)) { + if (within_module_init((unsigned long)p->addr, mod) || + (checkcore && + within_module_core((unsigned long)p->addr, mod))) { /* * The vaddr this probe is installed will soon * be vfreed buy not synced to disk. Hence, -- cgit v1.2.3 From bd4207c9016749f0a212faf7f7f49e5317d96d9b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 6 Jan 2009 14:42:39 -0800 Subject: kmod: fix varargs kernel-doc Fix varargs kernel-doc format in kmod.c: Use @... instead of @varargs. Warning(kernel/kmod.c:67): Excess function parameter or struct member 'varargs' description in 'request_module' Signed-off-by: Randy Dunlap Acked-by: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index b46dbb90866..a27a5f64443 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -51,8 +51,8 @@ char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; /** * request_module - try to load a kernel module - * @fmt: printf style format string for the name of the module - * @varargs: arguements as specified in the format string + * @fmt: printf style format string for the name of the module + * @...: arguments as specified in the format string * * Load a module using the user mode module loader. The function returns * zero on success or a negative errno code on failure. Note that a -- cgit v1.2.3 From 09bca05c90c639f57aae057e0c28f287e61f5a07 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 6 Jan 2009 14:42:45 -0800 Subject: SEND_SIG_NOINFO: masquerade si_pid when crossing pid-ns boundary For SEND_SIG_NOINFO, si_pid is currently set to the pid of the sender in the sender's active pid namespace. But if the receiver is in a different pid namespace, that pid is meaningless there. E.g.: when a parent sends the 'pdeath_signal' to a child that is in a descendant pid namespace, we should set si_pid to 0. Signed-off-by: Sukadev Bhattiprolu Acked-By: Roland McGrath Cc: "Eric W. Biederman" Cc: Oleg Nesterov Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 8e95855ff3c..31db63b3f88 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -858,7 +858,8 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, q->info.si_signo = sig; q->info.si_errno = 0; q->info.si_code = SI_USER; - q->info.si_pid = task_pid_vnr(current); + q->info.si_pid = task_pid_nr_ns(current, + task_active_pid_ns(t)); q->info.si_uid = current_uid(); break; case (unsigned long) SEND_SIG_PRIV: -- cgit v1.2.3 From 9cd4fd10437dda6b520cb1410b28f36967a34de8 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 6 Jan 2009 14:42:46 -0800 Subject: SEND_SIG_NOINFO: set si_pid to tgid instead of pid POSIX requires the si_pid to be the process id of the sender, so ->si_pid should really be set to 'tgid'. This change does have the following changes in behavior: - When sending pdeath_signal on re-parent to a sub-thread, ->si_pid cannot be used to identify the thread that did the re-parent, since it will now show the tgid instead of the thread id.
- A multi-threaded application that expects to find the specific thread that encountered a SIGPIPE using the ->si_pid will now break. Signed-off-by: Sukadev Bhattiprolu Acked-By: Roland McGrath Cc: "Eric W. Biederman" Cc: Oleg Nesterov Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 31db63b3f88..3152ac3b62e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -858,7 +858,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, q->info.si_signo = sig; q->info.si_errno = 0; q->info.si_code = SI_USER; - q->info.si_pid = task_pid_nr_ns(current, + q->info.si_pid = task_tgid_nr_ns(current, task_active_pid_ns(t)); q->info.si_uid = current_uid(); break; -- cgit v1.2.3 From 4cb0e11b15d2badad455fcd538af0cccf05dc012 Mon Sep 17 00:00:00 2001 From: Hidehiro Kawai Date: Tue, 6 Jan 2009 14:42:47 -0800 Subject: coredump_filter: permit changing of the default filter Introduce a new kernel parameter `coredump_filter'. Setting a value for this parameter changes the default coredump_filter bitmask. It is useful for users who want to change the coredump_filter settings for the whole system at boot time. Without this parameter, users have to change the coredump_filter setting for each /proc/<pid>/ in an initializing script. Signed-off-by: Hidehiro Kawai Cc: Roland McGrath Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 23b91211667..7b8f2a78be3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -400,6 +400,18 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); #define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) #define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) +static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT; + +static int __init coredump_filter_setup(char *s) +{ + default_dump_filter = + (simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) & + MMF_DUMP_FILTER_MASK; + return 1; +} + +__setup("coredump_filter=", coredump_filter_setup); + #include static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) @@ -408,8 +420,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) atomic_set(&mm->mm_count, 1); init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); - mm->flags = (current->mm) ? current->mm->flags - : MMF_DUMP_FILTER_DEFAULT; + mm->flags = (current->mm) ? current->mm->flags : default_dump_filter; mm->core_state = NULL; mm->nr_ptes = 0; set_mm_counter(mm, file_rss, 0); -- cgit v1.2.3 From 0bef3c2dc7d0c8238330785c8f4504761b0e370b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 6 Jan 2009 14:43:08 -0800 Subject: dma_alloc_from_coherent(): fix fallback to generic memory If bitmap_find_free_region() fails and DMA_MEMORY_EXCLUSIVE is not set, the function will fail to write anything to *ret and will return 1. This will cause dma_alloc_coherent() to return an uninitialised value, crashing the kernel, perhaps via DMA to a random address. Fix that by changing it to return zero in this case, so the caller will proceed to allocate the memory from the generic memory allocator.
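To make the contract concrete, here is a hedged sketch of how a dma_alloc_coherent()-style caller consumes the return value after this fix. The wrapper name is made up, but the 0/1 convention matches the patch (1 = the per-device arena answered, possibly with *ret == NULL; 0 = fall back to the generic allocator):

/* Hypothetical caller sketch: a return of 0 from
 * dma_alloc_from_coherent() now reliably means "use the
 * generic allocator instead". */
void *alloc_coherent_sketch(struct device *dev, size_t size,
			    dma_addr_t *handle, gfp_t gfp)
{
	void *ret;

	if (dma_alloc_from_coherent(dev, size, handle, &ret))
		return ret;	/* arena answered; NULL = hard failure */

	/* arena absent, or non-exclusive and exhausted: fall back */
	return (void *)__get_free_pages(gfp, get_order(size));
}

(A real implementation would also fill in *handle on the fallback path; this sketch only illustrates the return-value convention.)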
Cc: Tetsuo Handa Cc: Dmitry Baryshkov Cc: Ingo Molnar Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/dma-coherent.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c index f013a0c2e11..4bdcea822b4 100644 --- a/kernel/dma-coherent.c +++ b/kernel/dma-coherent.c @@ -116,11 +116,25 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size, int page = bitmap_find_free_region(mem->bitmap, mem->size, order); if (page >= 0) { + /* + * Memory was found in the per-device arena. + */ *dma_handle = mem->device_base + (page << PAGE_SHIFT); *ret = mem->virt_base + (page << PAGE_SHIFT); memset(*ret, 0, size); - } else if (mem->flags & DMA_MEMORY_EXCLUSIVE) + } else if (mem->flags & DMA_MEMORY_EXCLUSIVE) { + /* + * The per-device arena is exhausted and we are not + * permitted to fall back to generic memory. + */ *ret = NULL; + } else { + /* + * The per-device arena is exhausted and we are + * permitted to fall back to generic memory. + */ + return 0; + } } return (mem != NULL); } -- cgit v1.2.3 From eccd83e116e7f414a1da3aae3745384b7b171883 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 6 Jan 2009 14:43:09 -0800 Subject: dma_alloc_coherent: clean it up This thing was rather stupidly coded. Rework it all prior to making changes. Also, rename local variable `page': kernel readers expect something called `page' to have type `struct page *'. Cc: Guennadi Liakhovetski Cc: Johannes Weiner Cc: Pekka Enberg Cc: Dmitry Baryshkov Cc: Jesse Barnes Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/dma-coherent.c | 54 +++++++++++++++++++++++++++------------------------ 1 file changed, 29 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c index 4bdcea822b4..8056d081609 100644 --- a/kernel/dma-coherent.c +++ b/kernel/dma-coherent.c @@ -109,34 +109,38 @@ EXPORT_SYMBOL(dma_mark_declared_memory_occupied); int dma_alloc_from_coherent(struct device *dev, ssize_t size, dma_addr_t *dma_handle, void **ret) { - struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; + struct dma_coherent_mem *mem; int order = get_order(size); + int pageno; - if (mem) { - int page = bitmap_find_free_region(mem->bitmap, mem->size, - order); - if (page >= 0) { - /* - * Memory was found in the per-device arena. - */ - *dma_handle = mem->device_base + (page << PAGE_SHIFT); - *ret = mem->virt_base + (page << PAGE_SHIFT); - memset(*ret, 0, size); - } else if (mem->flags & DMA_MEMORY_EXCLUSIVE) { - /* - * The per-device arena is exhausted and we are not - * permitted to fall back to generic memory. - */ - *ret = NULL; - } else { - /* - * The per-device arena is exhausted and we are - * permitted to fall back to generic memory. - */ - return 0; - } + if (!dev) + return 0; + mem = dev->dma_mem; + if (!mem) + return 0; + + pageno = bitmap_find_free_region(mem->bitmap, mem->size, order); + if (pageno >= 0) { + /* + * Memory was found in the per-device arena. + */ + *dma_handle = mem->device_base + (pageno << PAGE_SHIFT); + *ret = mem->virt_base + (pageno << PAGE_SHIFT); + memset(*ret, 0, size); + } else if (mem->flags & DMA_MEMORY_EXCLUSIVE) { + /* + * The per-device arena is exhausted and we are not + * permitted to fall back to generic memory. + */ + *ret = NULL; + } else { + /* + * The per-device arena is exhausted and we are + * permitted to fall back to generic memory. 
+ */ + return 0; } - return (mem != NULL); + return 1; } EXPORT_SYMBOL(dma_alloc_from_coherent); -- cgit v1.2.3 From 58c6d3dfe436eb8cfb451981d8fdc9044eaf42da Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 6 Jan 2009 14:43:10 -0800 Subject: dma-coherent: catch oversized requests to dma_alloc_from_coherent() Prevent passing an order to bitmap_find_free_region() that is larger than the actual bitmap can represent. These requests can come from device drivers that have no idea how big the dma region is and need to rely on dma_alloc_from_coherent() to sort it out for them. Reported-by: Guennadi Liakhovetski Signed-off-by: Johannes Weiner Cc: Pekka Enberg Cc: Dmitry Baryshkov Cc: Jesse Barnes Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/dma-coherent.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c index 8056d081609..038707404b7 100644 --- a/kernel/dma-coherent.c +++ b/kernel/dma-coherent.c @@ -118,6 +118,8 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size, mem = dev->dma_mem; if (!mem) return 0; + if (unlikely(size > mem->size)) + return 0; pageno = bitmap_find_free_region(mem->bitmap, mem->size, order); if (pageno >= 0) { -- cgit v1.2.3 From da8d5089da6dfd54e5fd05d0c291a63c2bcf6885 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 7 Jan 2009 15:28:57 +0100 Subject: sched: fix possible recursive rq->lock Vaidyanathan Srinivasan reported: > ============================================= > [ INFO: possible recursive locking detected ] > 2.6.28-autotest-tip-sv #1 > --------------------------------------------- > klogd/5062 is trying to acquire lock: > (&rq->lock){++..}, at: [] task_rq_lock+0x45/0x7e > > but task is already holding lock: > (&rq->lock){++..}, at: [] schedule+0x158/0xa31 With sched_mc at 2. (it is default-off) Strictly speaking we'll not deadlock, because ttwu will not be able to place the migration task on our rq, but since the code can deal with both rqs getting unlocked, this seems the easiest way out. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 2e3545f57e7..deb5ac8c12f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3728,8 +3728,13 @@ redo: } double_unlock_balance(this_rq, busiest); + /* + * Should not call ttwu while holding a rq->lock + */ + spin_unlock(&this_rq->lock); if (active_balance) wake_up_process(busiest->migration_thread); + spin_lock(&this_rq->lock); } else sd->nr_balance_failed = 0; -- cgit v1.2.3 From 22a9d645677feefd402befd02edd59b122289ef1 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Wed, 7 Jan 2009 08:45:46 -0800 Subject: async: Asynchronous function calls to speed up kernel boot Right now, most of the kernel boot is strictly synchronous, such that various hardware delays are done sequentially. In order to make the kernel boot faster, this patch introduces infrastructure to allow doing some of the initialization steps asynchronously, which will hide significant portions of the hardware delays in practice. In order to not change device order and other similar observables, this patch does NOT do full parallel initialization. Rather, it operates more in the way an out of order CPU does; the work may be done out of order and asynchronous, but the observable effects (instruction retiring for the CPU) are still done in the original sequence. 
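The API that implements this is added below (async_schedule(), async_synchronize_cookie() and friends). As a hedged sketch of the intended usage pattern, a driver's probe path might look like the following; the device type and the helper functions are hypothetical, only the async_* calls come from this patch:

/* Hypothetical usage sketch of the async API added by this patch. */
static void probe_slow_hardware(void *data, async_cookie_t cookie)
{
	struct my_dev *dev = data;	/* made-up device type */

	spin_up_disks(dev);		/* slow, independent work (made up) */

	/* Before anything globally visible (e.g. registering device
	 * numbers), wait for all earlier async work to retire. */
	async_synchronize_cookie(cookie);
	register_my_device(dev);	/* made-up helper */
}

static int my_init(struct my_dev *dev)
{
	async_schedule(probe_slow_hardware, dev);
	/* if we share global state with non-async code: */
	async_synchronize_full();
	return 0;
}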
Signed-off-by: Arjan van de Ven --- kernel/Makefile | 3 +- kernel/async.c | 321 +++++++++++++++++++++++++++++++++++++++++++++++++ kernel/irq/autoprobe.c | 5 + kernel/module.c | 2 + 4 files changed, 330 insertions(+), 1 deletion(-) create mode 100644 kernel/async.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index e1c5bf3365c..2921d90ce32 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -9,7 +9,8 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ - notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o + notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ + async.o ifdef CONFIG_FUNCTION_TRACER # Do not trace debug files and internal ftrace files diff --git a/kernel/async.c b/kernel/async.c new file mode 100644 index 00000000000..afaa8a653d5 --- /dev/null +++ b/kernel/async.c @@ -0,0 +1,321 @@ +/* + * async.c: Asynchronous function calls for boot performance + * + * (C) Copyright 2009 Intel Corporation + * Author: Arjan van de Ven + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + + +/* + +Goals and Theory of Operation + +The primary goal of this feature is to reduce the kernel boot time, +by doing various independent hardware delays and discovery operations +decoupled and not strictly serialized. + +More specifically, the asynchronous function call concept allows +certain operations (primarily during system boot) to happen +asynchronously, out of order, while these operations still +have their externally visible parts happen sequentially and in-order. +(not unlike how out-of-order CPUs retire their instructions in order) + +Key to the asynchronous function call implementation is the concept of +a "sequence cookie" (which, although it has an abstracted type, can be +thought of as a monotonically incrementing number). + +The async core will assign each scheduled event such a sequence cookie and +pass this to the called functions. + +The asynchronously called function should before doing a globally visible +operation, such as registering device numbers, call the +async_synchronize_cookie() function and pass in its own cookie. The +async_synchronize_cookie() function will make sure that all asynchronous +operations that were scheduled prior to the operation corresponding with the +cookie have completed. + +Subsystem/driver initialization code that scheduled asynchronous probe +functions, but which shares global resources with other drivers/subsystems +that do not use the asynchronous call feature, need to do a full +synchronization with the async_synchronize_full() function, before returning +from their init function. This is to maintain strict ordering between the +asynchronous and synchronous parts of the kernel. 
+ +*/ + +#include +#include +#include +#include +#include +#include +#include + +static async_cookie_t next_cookie = 1; + +#define MAX_THREADS 256 +#define MAX_WORK 32768 + +static LIST_HEAD(async_pending); +static LIST_HEAD(async_running); +static DEFINE_SPINLOCK(async_lock); + +struct async_entry { + struct list_head list; + async_cookie_t cookie; + async_func_ptr *func; + void *data; + struct list_head *running; +}; + +static DECLARE_WAIT_QUEUE_HEAD(async_done); +static DECLARE_WAIT_QUEUE_HEAD(async_new); + +static atomic_t entry_count; +static atomic_t thread_count; + +extern int initcall_debug; + + +/* + * MUST be called with the lock held! + */ +static async_cookie_t __lowest_in_progress(struct list_head *running) +{ + struct async_entry *entry; + if (!list_empty(&async_pending)) { + entry = list_first_entry(&async_pending, + struct async_entry, list); + return entry->cookie; + } else if (!list_empty(running)) { + entry = list_first_entry(running, + struct async_entry, list); + return entry->cookie; + } else { + /* nothing in progress... next_cookie is "infinity" */ + return next_cookie; + } + +} +/* + * pick the first pending entry and run it + */ +static void run_one_entry(void) +{ + unsigned long flags; + struct async_entry *entry; + ktime_t calltime, delta, rettime; + + /* 1) pick one task from the pending queue */ + + spin_lock_irqsave(&async_lock, flags); + if (list_empty(&async_pending)) + goto out; + entry = list_first_entry(&async_pending, struct async_entry, list); + + /* 2) move it to the running queue */ + list_del(&entry->list); + list_add_tail(&entry->list, &async_running); + spin_unlock_irqrestore(&async_lock, flags); + + /* 3) run it (and print duration)*/ + if (initcall_debug) { + printk("calling %lli_%pF @ %i\n", entry->cookie, entry->func, task_pid_nr(current)); + calltime = ktime_get(); + } + entry->func(entry->data, entry->cookie); + if (initcall_debug) { + rettime = ktime_get(); + delta = ktime_sub(rettime, calltime); + printk("initcall %lli_%pF returned 0 after %lld usecs\n", entry->cookie, + entry->func, ktime_to_ns(delta) >> 10); + } + + /* 4) remove it from the running queue */ + spin_lock_irqsave(&async_lock, flags); + list_del(&entry->list); + + /* 5) free the entry */ + kfree(entry); + atomic_dec(&entry_count); + + spin_unlock_irqrestore(&async_lock, flags); + + /* 6) wake up any waiters. */ + wake_up(&async_done); + return; + +out: + spin_unlock_irqrestore(&async_lock, flags); +} + + +static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running) +{ + struct async_entry *entry; + unsigned long flags; + async_cookie_t newcookie; + + + /* allow irq-off callers */ + entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC); + + /* + * If we're out of memory or if there's too much work + * pending already, we execute synchronously. + */ + if (!entry || atomic_read(&entry_count) > MAX_WORK) { + kfree(entry); + spin_lock_irqsave(&async_lock, flags); + newcookie = next_cookie++; + spin_unlock_irqrestore(&async_lock, flags); + + /* low on memory.. 
run synchronously */ + ptr(data, newcookie); + return newcookie; + } + entry->func = ptr; + entry->data = data; + entry->running = running; + + spin_lock_irqsave(&async_lock, flags); + newcookie = entry->cookie = next_cookie++; + list_add_tail(&entry->list, &async_pending); + atomic_inc(&entry_count); + spin_unlock_irqrestore(&async_lock, flags); + wake_up(&async_new); + return newcookie; +} + +async_cookie_t async_schedule(async_func_ptr *ptr, void *data) +{ + return __async_schedule(ptr, data, &async_pending); +} +EXPORT_SYMBOL_GPL(async_schedule); + +async_cookie_t async_schedule_special(async_func_ptr *ptr, void *data, struct list_head *running) +{ + return __async_schedule(ptr, data, running); +} +EXPORT_SYMBOL_GPL(async_schedule_special); + +void async_synchronize_full(void) +{ + async_synchronize_cookie(next_cookie); +} +EXPORT_SYMBOL_GPL(async_synchronize_full); + +void async_synchronize_full_special(struct list_head *list) +{ + async_synchronize_cookie_special(next_cookie, list); +} +EXPORT_SYMBOL_GPL(async_synchronize_full_special); + +void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *running) +{ + ktime_t starttime, delta, endtime; + + if (initcall_debug) { + printk("async_waiting @ %i\n", task_pid_nr(current)); + starttime = ktime_get(); + } + + wait_event(async_done, __lowest_in_progress(running) >= cookie); + + if (initcall_debug) { + endtime = ktime_get(); + delta = ktime_sub(endtime, starttime); + + printk("async_continuing @ %i after %lli usec\n", + task_pid_nr(current), ktime_to_ns(delta) >> 10); + } +} +EXPORT_SYMBOL_GPL(async_synchronize_cookie_special); + +void async_synchronize_cookie(async_cookie_t cookie) +{ + async_synchronize_cookie_special(cookie, &async_running); +} +EXPORT_SYMBOL_GPL(async_synchronize_cookie); + + +static int async_thread(void *unused) +{ + DECLARE_WAITQUEUE(wq, current); + add_wait_queue(&async_new, &wq); + + while (!kthread_should_stop()) { + int ret = HZ; + set_current_state(TASK_INTERRUPTIBLE); + /* + * check the list head without lock.. false positives + * are dealt with inside run_one_entry() while holding + * the lock. + */ + rmb(); + if (!list_empty(&async_pending)) + run_one_entry(); + else + ret = schedule_timeout(HZ); + + if (ret == 0) { + /* + * we timed out, this means we as thread are redundant. + * we sign off and die, but we to avoid any races there + * is a last-straw check to see if work snuck in. + */ + atomic_dec(&thread_count); + wmb(); /* manager must see our departure first */ + if (list_empty(&async_pending)) + break; + /* + * woops work came in between us timing out and us + * signing off; we need to stay alive and keep working. 
+ */ + atomic_inc(&thread_count); + } + } + remove_wait_queue(&async_new, &wq); + + return 0; +} + +static int async_manager_thread(void *unused) +{ + DECLARE_WAITQUEUE(wq, current); + add_wait_queue(&async_new, &wq); + + while (!kthread_should_stop()) { + int tc, ec; + + set_current_state(TASK_INTERRUPTIBLE); + + tc = atomic_read(&thread_count); + rmb(); + ec = atomic_read(&entry_count); + + while (tc < ec && tc < MAX_THREADS) { + kthread_run(async_thread, NULL, "async/%i", tc); + atomic_inc(&thread_count); + tc++; + } + + schedule(); + } + remove_wait_queue(&async_new, &wq); + + return 0; +} + +static int __init async_init(void) +{ + kthread_run(async_manager_thread, NULL, "async/mgr"); + return 0; +} + +core_initcall(async_init); diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index cc0f7321b8c..1de9700f416 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "internals.h" @@ -34,6 +35,10 @@ unsigned long probe_irq_on(void) unsigned int status; int i; + /* + * quiesce the kernel, or at least the asynchronous portion + */ + async_synchronize_full(); mutex_lock(&probing_active); /* * something may have generated an irq long ago and we want to diff --git a/kernel/module.c b/kernel/module.c index 496dcb57b60..c9332c90d5a 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -50,6 +50,7 @@ #include #include #include +#include #if 0 #define DEBUGP printk @@ -816,6 +817,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags) mod->exit(); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); + async_synchronize_full(); mutex_lock(&module_mutex); /* Store the name of the last unloaded module for diagnostic purposes */ strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); -- cgit v1.2.3 From ad160d23198193135cb2bcc75222e0816b5838c0 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Wed, 7 Jan 2009 09:28:53 -0800 Subject: async: don't do the initcall stuff post boot while tracking the asynchronous calls during boot using the initcall_debug convention is useful, doing it once the kernel is done is actually bad now that we use asynchronous operations post boot as well... 
Signed-off-by: Arjan van de Ven --- kernel/async.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/async.c b/kernel/async.c index afaa8a653d5..97373380c9e 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -124,12 +124,12 @@ static void run_one_entry(void) spin_unlock_irqrestore(&async_lock, flags); /* 3) run it (and print duration)*/ - if (initcall_debug) { + if (initcall_debug && system_state == SYSTEM_BOOTING) { printk("calling %lli_%pF @ %i\n", entry->cookie, entry->func, task_pid_nr(current)); calltime = ktime_get(); } entry->func(entry->data, entry->cookie); - if (initcall_debug) { + if (initcall_debug && system_state == SYSTEM_BOOTING) { rettime = ktime_get(); delta = ktime_sub(rettime, calltime); printk("initcall %lli_%pF returned 0 after %lld usecs\n", entry->cookie, @@ -220,14 +220,14 @@ void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *r { ktime_t starttime, delta, endtime; - if (initcall_debug) { + if (initcall_debug && system_state == SYSTEM_BOOTING) { printk("async_waiting @ %i\n", task_pid_nr(current)); starttime = ktime_get(); } wait_event(async_done, __lowest_in_progress(running) >= cookie); - if (initcall_debug) { + if (initcall_debug && system_state == SYSTEM_BOOTING) { endtime = ktime_get(); delta = ktime_sub(endtime, starttime); -- cgit v1.2.3 From 490dea45d00f01847ebebd007685d564aaf2cd98 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 24 Nov 2008 17:06:57 +0100 Subject: itimers: remove the per-cpu-ish-ness Either we bounce one cacheline per cpu per tick, yielding n^2 bounces, or we just bounce a single one. Also, using per-cpu allocations for the thread-groups complicates the per-cpu allocator in that it's currently meant to be a fixed-size allocator, and the only possible extension to that would be vmap based, which is seriously constrained on 32 bit archs. So making the per-cpu memory requirement depend on the number of processes is an issue. Lastly, it didn't deal with cpu-hotplug, although admittedly that might be fixable.
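For clarity, a hedged sketch of the accounting pattern this patch converges on; the struct layout is assumed from the sched_stats.h hunks below (a single shared task_cputime plus a lock, instead of per-cpu totals):

/* Sketch: every tick now updates one shared structure under one
 * lock, bouncing a single cacheline instead of touching a per-cpu
 * copy on every CPU. */
static inline void account_utime_sketch(struct task_cputime *times,
					cputime_t delta)
{
	spin_lock(&times->lock);	/* lock assumed added to task_cputime */
	times->utime = cputime_add(times->utime, delta);
	spin_unlock(&times->lock);
}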
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/fork.c | 15 +++++----- kernel/posix-cpu-timers.c | 70 ----------------------------------------------- kernel/sched_stats.h | 33 ++++++++++------------ 3 files changed, 22 insertions(+), 96 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 7b8f2a78be3..7087d8c0e5e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -820,14 +820,15 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) int ret; if (clone_flags & CLONE_THREAD) { - ret = thread_group_cputime_clone_thread(current); - if (likely(!ret)) { - atomic_inc(¤t->signal->count); - atomic_inc(¤t->signal->live); - } - return ret; + atomic_inc(¤t->signal->count); + atomic_inc(¤t->signal->live); + return 0; } sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); + + if (sig) + posix_cpu_timers_init_group(sig); + tsk->signal = sig; if (!sig) return -ENOMEM; @@ -864,8 +865,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); task_unlock(current->group_leader); - posix_cpu_timers_init_group(sig); - acct_init_pacct(&sig->pacct); tty_audit_fork(sig); diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 157de3a4783..fa07da94d7b 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -9,76 +9,6 @@ #include #include -/* - * Allocate the thread_group_cputime structure appropriately and fill in the - * current values of the fields. Called from copy_signal() via - * thread_group_cputime_clone_thread() when adding a second or subsequent - * thread to a thread group. Assumes interrupts are enabled when called. - */ -int thread_group_cputime_alloc(struct task_struct *tsk) -{ - struct signal_struct *sig = tsk->signal; - struct task_cputime *cputime; - - /* - * If we have multiple threads and we don't already have a - * per-CPU task_cputime struct (checked in the caller), allocate - * one and fill it in with the times accumulated so far. We may - * race with another thread so recheck after we pick up the sighand - * lock. - */ - cputime = alloc_percpu(struct task_cputime); - if (cputime == NULL) - return -ENOMEM; - spin_lock_irq(&tsk->sighand->siglock); - if (sig->cputime.totals) { - spin_unlock_irq(&tsk->sighand->siglock); - free_percpu(cputime); - return 0; - } - sig->cputime.totals = cputime; - cputime = per_cpu_ptr(sig->cputime.totals, smp_processor_id()); - cputime->utime = tsk->utime; - cputime->stime = tsk->stime; - cputime->sum_exec_runtime = tsk->se.sum_exec_runtime; - spin_unlock_irq(&tsk->sighand->siglock); - return 0; -} - -/** - * thread_group_cputime - Sum the thread group time fields across all CPUs. - * - * @tsk: The task we use to identify the thread group. - * @times: task_cputime structure in which we return the summed fields. - * - * Walk the list of CPUs to sum the per-CPU time fields in the thread group - * time structure. 
- */ -void thread_group_cputime( - struct task_struct *tsk, - struct task_cputime *times) -{ - struct task_cputime *totals, *tot; - int i; - - totals = tsk->signal->cputime.totals; - if (!totals) { - times->utime = tsk->utime; - times->stime = tsk->stime; - times->sum_exec_runtime = tsk->se.sum_exec_runtime; - return; - } - - times->stime = times->utime = cputime_zero; - times->sum_exec_runtime = 0; - for_each_possible_cpu(i) { - tot = per_cpu_ptr(totals, i); - times->utime = cputime_add(times->utime, tot->utime); - times->stime = cputime_add(times->stime, tot->stime); - times->sum_exec_runtime += tot->sum_exec_runtime; - } -} - /* * Called after updating RLIMIT_CPU to set timer expiration if necessary. */ diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index f2773b5d122..8ab0cef8eca 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -296,6 +296,7 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) static inline void account_group_user_time(struct task_struct *tsk, cputime_t cputime) { + struct task_cputime *times; struct signal_struct *sig; /* tsk == current, ensure it is safe to use ->signal */ @@ -303,13 +304,11 @@ static inline void account_group_user_time(struct task_struct *tsk, return; sig = tsk->signal; - if (sig->cputime.totals) { - struct task_cputime *times; + times = &sig->cputime.totals; - times = per_cpu_ptr(sig->cputime.totals, get_cpu()); - times->utime = cputime_add(times->utime, cputime); - put_cpu_no_resched(); - } + spin_lock(×->lock); + times->utime = cputime_add(times->utime, cputime); + spin_unlock(×->lock); } /** @@ -325,6 +324,7 @@ static inline void account_group_user_time(struct task_struct *tsk, static inline void account_group_system_time(struct task_struct *tsk, cputime_t cputime) { + struct task_cputime *times; struct signal_struct *sig; /* tsk == current, ensure it is safe to use ->signal */ @@ -332,13 +332,11 @@ static inline void account_group_system_time(struct task_struct *tsk, return; sig = tsk->signal; - if (sig->cputime.totals) { - struct task_cputime *times; + times = &sig->cputime.totals; - times = per_cpu_ptr(sig->cputime.totals, get_cpu()); - times->stime = cputime_add(times->stime, cputime); - put_cpu_no_resched(); - } + spin_lock(×->lock); + times->stime = cputime_add(times->stime, cputime); + spin_unlock(×->lock); } /** @@ -354,6 +352,7 @@ static inline void account_group_system_time(struct task_struct *tsk, static inline void account_group_exec_runtime(struct task_struct *tsk, unsigned long long ns) { + struct task_cputime *times; struct signal_struct *sig; sig = tsk->signal; @@ -362,11 +361,9 @@ static inline void account_group_exec_runtime(struct task_struct *tsk, if (unlikely(!sig)) return; - if (sig->cputime.totals) { - struct task_cputime *times; + times = &sig->cputime.totals; - times = per_cpu_ptr(sig->cputime.totals, get_cpu()); - times->sum_exec_runtime += ns; - put_cpu_no_resched(); - } + spin_lock(×->lock); + times->sum_exec_runtime += ns; + spin_unlock(×->lock); } -- cgit v1.2.3 From e8de1481fd7126ee9e93d6889da6f00c05e1e019 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Wed, 22 Oct 2008 19:55:31 -0700 Subject: resource: allow MMIO exclusivity for device drivers Device drivers that use pci_request_regions() (and similar APIs) have a reasonable expectation that they are the only ones accessing their device. 
As part of the e1000e hunt, we were afraid that some userland (X or some bootsplash stuff) was mapping the MMIO region that the driver thought it had exclusively via /dev/mem or via various sysfs resource mappings. This patch adds an option for device drivers to have their reserved regions added to the "banned from /dev/mem use" list, so now both kernel memory and device-exclusive MMIO regions are banned. NOTE: This is only active when CONFIG_STRICT_DEVMEM is set. In addition to the config option, a kernel parameter iomem=relaxed is provided for the cases where developers want to diagnose, in the field, driver issues from userspace. Reviewed-by: Matthew Wilcox Signed-off-by: Arjan van de Ven Signed-off-by: Jesse Barnes --- kernel/resource.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 58 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index e633106b12f..ca6a1536b20 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -623,7 +623,7 @@ resource_size_t resource_alignment(struct resource *res) */ struct resource * __request_region(struct resource *parent, resource_size_t start, resource_size_t n, - const char *name) + const char *name, int flags) { struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); @@ -634,6 +634,7 @@ struct resource * __request_region(struct resource *parent, res->start = start; res->end = start + n - 1; res->flags = IORESOURCE_BUSY; + res->flags |= flags; write_lock(&resource_lock); @@ -679,7 +680,7 @@ int __check_region(struct resource *parent, resource_size_t start, { struct resource * res; - res = __request_region(parent, start, n, "check-region"); + res = __request_region(parent, start, n, "check-region", 0); if (!res) return -EBUSY; @@ -776,7 +777,7 @@ struct resource * __devm_request_region(struct device *dev, dr->start = start; dr->n = n; - res = __request_region(parent, start, n, name); + res = __request_region(parent, start, n, name, 0); if (res) devres_add(dev, dr); else @@ -876,3 +877,57 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size) return err; } + +#ifdef CONFIG_STRICT_DEVMEM +static int strict_iomem_checks = 1; +#else +static int strict_iomem_checks; +#endif + +/* + * check if an address is reserved in the iomem resource tree + * returns 1 if reserved, 0 if not reserved. + */ +int iomem_is_exclusive(u64 addr) +{ + struct resource *p = &iomem_resource; + int err = 0; + loff_t l; + int size = PAGE_SIZE; + + if (!strict_iomem_checks) + return 0; + + addr = addr & PAGE_MASK; + + read_lock(&resource_lock); + for (p = p->child; p ; p = r_next(NULL, p, &l)) { + /* + * We can probably skip the resources without + * IORESOURCE_IO attribute? + */ + if (p->start >= addr + size) + break; + if (p->end < addr) + continue; + if (p->flags & IORESOURCE_BUSY && + p->flags & IORESOURCE_EXCLUSIVE) { + err = 1; + break; + } + } + read_unlock(&resource_lock); + + return err; +} + +static int __init strict_iomem(char *str) +{ + if (strstr(str, "relaxed")) + strict_iomem_checks = 0; + if (strstr(str, "strict")) + strict_iomem_checks = 1; + return 1; +} + +__setup("iomem=", strict_iomem); -- cgit v1.2.3 From a0e280e0f33f6c859a235fb69a875ed8f3420388 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 7 Jan 2009 16:19:46 +0100 Subject: stop_machine/cpu hotplug: fix disable_nonboot_cpus disable_nonboot_cpus calls _cpu_down. But _cpu_down requires that the caller already created the stop_machine workqueue (like cpu_down does).
Otherwise a call to stop_machine will lead to accesses to random memory regions. When introducing this new interface (9ea09af3bd3090e8349ca2899ca2011bd94cda85 "stop_machine: introduce stop_machine_create/destroy") I missed the second call site of _cpu_down. So add the missing stop_machine_create/destroy calls to disable_nonboot_cpus as well. Fixes suspend-to-ram/disk and also this bug: [ 286.547348] BUG: unable to handle kernel paging request at 6b6b6b6b [ 286.548940] IP: [] __stop_machine+0x88/0xe3 [ 286.550598] Oops: 0002 [#1] SMP [ 286.560580] Pid: 3273, comm: halt Not tainted (2.6.28-06127-g238c6d5 [ 286.560580] EIP: is at __stop_machine+0x88/0xe3 [ 286.560580] Process halt (pid: 3273, ti=f1a28000 task=f4530f30 [ 286.560580] Call Trace: [ 286.560580] [] ? _cpu_down+0x10f/0x234 [ 286.560580] [] ? disable_nonboot_cpus+0x58/0xdc [ 286.560580] [] ? kernel_poweroff+0x22/0x39 [ 286.560580] [] ? sys_reboot+0xde/0x14c [ 286.560580] [] ? complete_signal+0x179/0x191 [ 286.560580] [] ? send_signal+0x1cc/0x1e1 [ 286.560580] [] ? _spin_unlock_irqrestore+0x2d/0x3c [ 286.560580] [] ? group_send_signal_info+0x58/0x61 [ 286.560580] [] ? kill_pid_info+0x30/0x3a [ 286.560580] [] ? sys_kill+0x75/0x13a [ 286.560580] [] ? mntput_no_expire+0x1f/0x101 [ 286.560580] [] ? dput+0x1e/0x105 [ 286.560580] [] ? __fput+0x150/0x158 [ 286.560580] [] ? audit_syscall_entry+0x137/0x159 [ 286.560580] [] ? sysenter_do_call+0x12/0x34 Reported-and-tested-by: "Justin P. Mattock" Reviewed-by: Pekka Enberg Signed-off-by: Heiko Carstens Tested-by: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/cpu.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 30e74dd6d01..79e40f00dcb 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -379,8 +379,11 @@ static cpumask_var_t frozen_cpus; int disable_nonboot_cpus(void) { - int cpu, first_cpu, error = 0; + int cpu, first_cpu, error; + error = stop_machine_create(); + if (error) + return error; cpu_maps_update_begin(); first_cpu = cpumask_first(cpu_online_mask); /* We take down all of the non-boot CPUs in one shot to avoid races @@ -409,6 +412,7 @@ int disable_nonboot_cpus(void) printk(KERN_ERR "Non-boot CPUs are not disabled\n"); } cpu_maps_update_done(); + stop_machine_destroy(); return error; } -- cgit v1.2.3 From 465634adc1d09b490c8ee31885575be39d375d53 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 7 Jan 2009 15:32:11 +0100 Subject: ring_buffer: fix ring_buffer_event_length() Function ring_buffer_event_length() provides an interface to detect the length of data stored in an entry. However, the length contains offsets depending on the internal usage. This makes it unusable. This patch fixes this, and now ring_buffer_event_length() returns the aligned length that has been used in ring_buffer_lock_reserve().
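As a hedged illustration (assuming the reserve/commit API of this tree, which still takes a flags argument), a producer can now size-check its payload against the reported length; the payload struct here is made up:

/* Sketch: ring_buffer_event_length() now reports the usable payload
 * length requested at reserve time, not the raw internal length
 * including header bookkeeping. */
struct my_entry { unsigned long value; };	/* hypothetical payload */

static void write_one(struct ring_buffer *buffer)
{
	struct ring_buffer_event *event;
	unsigned long flags;

	event = ring_buffer_lock_reserve(buffer, sizeof(struct my_entry),
					 &flags);
	if (!event)
		return;
	((struct my_entry *)ring_buffer_event_data(event))->value = 42;
	/* after this fix, the reported length covers the payload */
	WARN_ON(ring_buffer_event_length(event) < sizeof(struct my_entry));
	ring_buffer_unlock_commit(buffer, event, flags);
}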
Cc: Steven Rostedt Signed-off-by: Robert Richter --- kernel/trace/ring_buffer.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 30d57dd01a8..d42b882dfe4 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -117,7 +117,13 @@ rb_event_length(struct ring_buffer_event *event) */ unsigned ring_buffer_event_length(struct ring_buffer_event *event) { - return rb_event_length(event); + unsigned length = rb_event_length(event); + if (event->type != RINGBUF_TYPE_DATA) + return length; + length -= RB_EVNT_HDR_SIZE; + if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0])) + length -= sizeof(event->array[0]); + return length; } EXPORT_SYMBOL_GPL(ring_buffer_event_length); -- cgit v1.2.3 From c9d557c19f94df42db78d4a5de4d25feee694bad Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 7 Jan 2009 14:33:30 -0800 Subject: rcu: fix bug in rcutorture system-shutdown code This patch fixes an rcutorture bug found by Eric Sesterhenn that resulted in oopses in response to "rmmod rcutorture". The problem was in some new code that attempted to handle the case where a system is shut down while rcutorture is still running, for example, when rcutorture is built into the kernel so that it cannot be removed. The fix causes the rcutorture threads to "park" in a schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT) rather than trying to get them to terminate cleanly. Concurrent shutdown and rmmod is illegal. I believe that this is 2.6.29 material, as it is used in some testing setups. For reference, here are the rcutorture operating modes: CONFIG_RCU_TORTURE_TEST=m This is the normal rcutorture build. Use "modprobe rcutorture" (with optional arguments) to start, and "rmmod rcutorture" to stop. If you shut the system down without doing the rmmod, you should see console output like: rcutorture thread rcu_torture_writer parking due to system shutdown One for each rcutorture kthread. CONFIG_RCU_TORTURE_TEST=y CONFIG_RCU_TORTURE_TEST_RUNNABLE=n Use this if you want rcutorture built in, but don't want the test to start running during early boot. To start the torturing: echo 1 > /proc/sys/kernel/rcutorture_runnable To stop the torturing, s/1/0/ You will get "parking" console messages as noted above when you shut the system down. CONFIG_RCU_TORTURE_TEST=y CONFIG_RCU_TORTURE_TEST_RUNNABLE=y Same as above, except that the torturing starts during early boot. Only for the stout of heart and strong of stomach. The same /proc entry noted above may be used to control the test. Located-by: Eric Sesterhenn Tested-by: Eric Sesterhenn Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/rcutorture.c | 113 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 68 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 1cff28db56b..7c4142a79f0 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -136,28 +136,46 @@ static int stutter_pause_test = 0; #endif int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; -#define FULLSTOP_SHUTDOWN 1 /* Bail due to system shutdown/panic. */ -#define FULLSTOP_CLEANUP 2 /* Orderly shutdown. */ -static int fullstop; /* stop generating callbacks at test end. */ -DEFINE_MUTEX(fullstop_mutex); /* protect fullstop transitions and */ - /* spawning of kthreads. */ +/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal!
*/ + +#define FULLSTOP_DONTSTOP 0 /* Normal operation. */ +#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ +#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ +static int fullstop = FULLSTOP_RMMOD; +DEFINE_MUTEX(fullstop_mutex); /* Protect fullstop transitions and spawning */ + /* of kthreads. */ /* - * Detect and respond to a signal-based shutdown. + * Detect and respond to a system shutdown. */ static int rcutorture_shutdown_notify(struct notifier_block *unused1, unsigned long unused2, void *unused3) { - if (fullstop) - return NOTIFY_DONE; mutex_lock(&fullstop_mutex); - if (!fullstop) + if (fullstop == FULLSTOP_DONTSTOP) fullstop = FULLSTOP_SHUTDOWN; + else + printk(KERN_WARNING /* but going down anyway, so... */ + "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); mutex_unlock(&fullstop_mutex); return NOTIFY_DONE; } +/* + * Absorb kthreads into a kernel function that won't return, so that + * they won't ever access module text or data again. + */ +static void rcutorture_shutdown_absorb(char *title) +{ + if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { + printk(KERN_NOTICE + "rcutorture thread %s parking due to system shutdown\n", + title); + schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); + } +} + /* * Allocate an element from the rcu_tortures pool. */ @@ -219,13 +237,14 @@ rcu_random(struct rcu_random_state *rrsp) } static void -rcu_stutter_wait(void) +rcu_stutter_wait(char *title) { - while ((stutter_pause_test || !rcutorture_runnable) && !fullstop) { + while (stutter_pause_test || !rcutorture_runnable) { if (rcutorture_runnable) schedule_timeout_interruptible(1); else schedule_timeout_interruptible(round_jiffies_relative(HZ)); + rcutorture_shutdown_absorb(title); } } @@ -287,7 +306,7 @@ rcu_torture_cb(struct rcu_head *p) int i; struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); - if (fullstop) { + if (fullstop != FULLSTOP_DONTSTOP) { /* Test is ending, just drop callbacks on the floor. */ /* The next initialization will pick up the pieces. 
*/ return; @@ -619,10 +638,11 @@ rcu_torture_writer(void *arg) } rcu_torture_current_version++; oldbatch = cur_ops->completed(); - rcu_stutter_wait(); - } while (!kthread_should_stop() && !fullstop); + rcu_stutter_wait("rcu_torture_writer"); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); - while (!kthread_should_stop() && fullstop != FULLSTOP_SHUTDOWN) + rcutorture_shutdown_absorb("rcu_torture_writer"); + while (!kthread_should_stop()) schedule_timeout_uninterruptible(1); return 0; } @@ -643,11 +663,12 @@ rcu_torture_fakewriter(void *arg) schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); udelay(rcu_random(&rand) & 0x3ff); cur_ops->sync(); - rcu_stutter_wait(); - } while (!kthread_should_stop() && !fullstop); + rcu_stutter_wait("rcu_torture_fakewriter"); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); - while (!kthread_should_stop() && fullstop != FULLSTOP_SHUTDOWN) + rcutorture_shutdown_absorb("rcu_torture_fakewriter"); + while (!kthread_should_stop()) schedule_timeout_uninterruptible(1); return 0; } @@ -752,12 +773,13 @@ rcu_torture_reader(void *arg) preempt_enable(); cur_ops->readunlock(idx); schedule(); - rcu_stutter_wait(); - } while (!kthread_should_stop() && !fullstop); + rcu_stutter_wait("rcu_torture_reader"); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); + rcutorture_shutdown_absorb("rcu_torture_reader"); if (irqreader && cur_ops->irqcapable) del_timer_sync(&t); - while (!kthread_should_stop() && fullstop != FULLSTOP_SHUTDOWN) + while (!kthread_should_stop()) schedule_timeout_uninterruptible(1); return 0; } @@ -854,7 +876,8 @@ rcu_torture_stats(void *arg) do { schedule_timeout_interruptible(stat_interval * HZ); rcu_torture_stats_print(); - } while (!kthread_should_stop() && !fullstop); + rcutorture_shutdown_absorb("rcu_torture_stats"); + } while (!kthread_should_stop()); VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); return 0; } @@ -866,52 +889,49 @@ static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ */ static void rcu_torture_shuffle_tasks(void) { - cpumask_var_t tmp_mask; + cpumask_t tmp_mask; int i; - if (!alloc_cpumask_var(&tmp_mask, GFP_KERNEL)) - BUG(); - - cpumask_setall(tmp_mask); + cpus_setall(tmp_mask); get_online_cpus(); /* No point in shuffling if there is only one online CPU (ex: UP) */ - if (num_online_cpus() == 1) - goto out; + if (num_online_cpus() == 1) { + put_online_cpus(); + return; + } if (rcu_idle_cpu != -1) - cpumask_clear_cpu(rcu_idle_cpu, tmp_mask); + cpu_clear(rcu_idle_cpu, tmp_mask); - set_cpus_allowed_ptr(current, tmp_mask); + set_cpus_allowed_ptr(current, &tmp_mask); if (reader_tasks) { for (i = 0; i < nrealreaders; i++) if (reader_tasks[i]) set_cpus_allowed_ptr(reader_tasks[i], - tmp_mask); + &tmp_mask); } if (fakewriter_tasks) { for (i = 0; i < nfakewriters; i++) if (fakewriter_tasks[i]) set_cpus_allowed_ptr(fakewriter_tasks[i], - tmp_mask); + &tmp_mask); } if (writer_task) - set_cpus_allowed_ptr(writer_task, tmp_mask); + set_cpus_allowed_ptr(writer_task, &tmp_mask); if (stats_task) - set_cpus_allowed_ptr(stats_task, tmp_mask); + set_cpus_allowed_ptr(stats_task, &tmp_mask); if (rcu_idle_cpu == -1) rcu_idle_cpu = num_online_cpus() - 1; else rcu_idle_cpu--; -out: put_online_cpus(); - free_cpumask_var(tmp_mask); } /* Shuffle tasks across CPUs, with the intent of allowing 
each CPU in the @@ -925,7 +945,8 @@ rcu_torture_shuffle(void *arg) do { schedule_timeout_interruptible(shuffle_interval * HZ); rcu_torture_shuffle_tasks(); - } while (!kthread_should_stop() && !fullstop); + rcutorture_shutdown_absorb("rcu_torture_shuffle"); + } while (!kthread_should_stop()); VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping"); return 0; } @@ -940,10 +961,11 @@ rcu_torture_stutter(void *arg) do { schedule_timeout_interruptible(stutter * HZ); stutter_pause_test = 1; - if (!kthread_should_stop() && !fullstop) + if (!kthread_should_stop()) schedule_timeout_interruptible(stutter * HZ); stutter_pause_test = 0; - } while (!kthread_should_stop() && !fullstop); + rcutorture_shutdown_absorb("rcu_torture_stutter"); + } while (!kthread_should_stop()); VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping"); return 0; } @@ -970,15 +992,16 @@ rcu_torture_cleanup(void) int i; mutex_lock(&fullstop_mutex); - if (!fullstop) { - /* If being signaled, let it happen, then exit. */ + if (fullstop == FULLSTOP_SHUTDOWN) { + printk(KERN_WARNING /* but going down anyway, so... */ + "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); mutex_unlock(&fullstop_mutex); - schedule_timeout_interruptible(10 * HZ); + schedule_timeout_uninterruptible(10); if (cur_ops->cb_barrier != NULL) cur_ops->cb_barrier(); return; } - fullstop = FULLSTOP_CLEANUP; + fullstop = FULLSTOP_RMMOD; mutex_unlock(&fullstop_mutex); unregister_reboot_notifier(&rcutorture_nb); if (stutter_task) { @@ -1078,7 +1101,7 @@ rcu_torture_init(void) else nrealreaders = 2 * num_online_cpus(); rcu_torture_print_module_parms("Start of test"); - fullstop = 0; + fullstop = FULLSTOP_DONTSTOP; /* Set up the freelist. */ -- cgit v1.2.3 From 8feae13110d60cc6287afabc2887366b0eb226c2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 8 Jan 2009 12:04:47 +0000 Subject: NOMMU: Make VMAs per MM as for MMU-mode linux Make VMAs per mm_struct as for MMU-mode linux. This solves two problems: (1) In SYSV SHM where nattch for a segment does not reflect the number of shmat's (and forks) done. (2) In mmap() where the VMA's vm_mm is set to point to the parent mm by an exec'ing process when VM_EXECUTABLE is specified, regardless of the fact that a VMA might be shared and already have its vm_mm assigned to another process or a dead process. A new struct (vm_region) is introduced to track a mapped region and to remember the circumstances under which it may be shared and the vm_list_struct structure is discarded as it's no longer required. This patch makes the following additional changes: (1) Regions are now allocated with alloc_pages() rather than kmalloc() and with no recourse to __GFP_COMP, so the pages are not composite. Instead, each page has a reference on it held by the region. Anything else that is interested in such a page will have to get a reference on it to retain it. When the pages are released due to unmapping, each page is passed to put_page() and will be freed when the page usage count reaches zero. (2) Excess pages are trimmed after an allocation as the allocation must be made as a power-of-2 quantity of pages. (3) VMAs are added to the parent MM's R/B tree and mmap lists. As an MM may end up with overlapping VMAs within the tree, the VMA struct address is appended to the sort key. (4) Non-anonymous VMAs are now added to the backing inode's prio list. (5) Holes may be punched in anonymous VMAs with munmap(), releasing parts of the backing region. The VMA and region structs will be split if necessary. 
(6) sys_shmdt() only releases one attachment to a SYSV IPC shared memory segment instead of all the attachments at that address. Multiple shmat()'s return the same address under NOMMU-mode instead of different virtual addresses as under MMU-mode. (7) Core dumping for ELF-FDPIC requires fewer exceptions for NOMMU-mode. (8) /proc/maps is now the global list of mapped regions, and may list bits that aren't actually mapped anywhere. (9) /proc/meminfo gains a line (tagged "MmapCopy") that indicates the amount of RAM currently allocated by mmap to hold mappable regions that can't be mapped directly. These are copies of the backing device or file if not anonymous. These changes make NOMMU mode more similar to MMU mode. The downside is that NOMMU mode now requires some extra memory for tracking, compared with NOMMU without this patch (VMAs are no longer shared, and there are now region structs). Signed-off-by: David Howells Tested-by: Mike Frysinger Acked-by: Paul Mundt --- kernel/fork.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 7b8f2a78be3..0bce4a43bb3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1481,12 +1481,10 @@ void __init proc_caches_init(void) fs_cachep = kmem_cache_create("fs_cache", sizeof(struct fs_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); - vm_area_cachep = kmem_cache_create("vm_area_struct", - sizeof(struct vm_area_struct), 0, - SLAB_PANIC, NULL); mm_cachep = kmem_cache_create("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + mmap_init(); } /* -- cgit v1.2.3 From dd8632a12e500a684478fea0951f380478d56fed Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Thu, 8 Jan 2009 12:04:47 +0000 Subject: NOMMU: Make mmap allocation page trimming behaviour configurable. NOMMU mmap allocates a piece of memory for an mmap that's rounded up in size to the nearest power-of-2 number of pages. Currently it then discards the excess pages back to the page allocator, making that memory available for use by other things. This can, however, cause a greater amount of fragmentation. To counter this, a sysctl is added in order to fine-tune the trimming behaviour. The default behaviour remains to trim pages aggressively, while this can either be disabled completely or set to a higher page-granular watermark in order to have finer-grained control. vm region vm_top bits taken from an earlier patch by David Howells.
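As a usage sketch, assuming the sysctl is exposed as /proc/sys/vm/nr_trim_pages (per the vm_table entry below), a small program or boot script can tune the watermark; 0 should disable trimming entirely, 1 keeps the old aggressive default, and larger values leave more slack pages attached to each region:

    #include <stdio.h>

    int main(void)
    {
        FILE *f = fopen("/proc/sys/vm/nr_trim_pages", "w");

        if (!f) {
            perror("nr_trim_pages");
            return 1;
        }
        fprintf(f, "%d\n", 0);  /* disable trimming entirely */
        return fclose(f) ? 1 : 0;
    }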
Signed-off-by: Paul Mundt Signed-off-by: David Howells Tested-by: Mike Frysinger --- kernel/sysctl.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 92f6e5bc3c2..89d74436318 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -82,6 +82,9 @@ extern int percpu_pagelist_fraction; extern int compat_log; extern int latencytop_enabled; extern int sysctl_nr_open_min, sysctl_nr_open_max; +#ifndef CONFIG_MMU +extern int sysctl_nr_trim_pages; +#endif #ifdef CONFIG_RCU_TORTURE_TEST extern int rcutorture_runnable; #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ @@ -1102,6 +1105,17 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = &proc_dointvec }, +#else + { + .ctl_name = CTL_UNNUMBERED, + .procname = "nr_trim_pages", + .data = &sysctl_nr_trim_pages, + .maxlen = sizeof(sysctl_nr_trim_pages), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + }, #endif { .ctl_name = VM_LAPTOP_MODE, -- cgit v1.2.3 From b9456371a73871d001e67b5f4eac118c2c278e1c Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 8 Jan 2009 11:18:31 +0000 Subject: CRED: Fix commit_creds() on a process that has no mm Fix commit_creds()'s handling of a process that has no mm (such as one that is calling or has called daemonize()). commit_creds() should check to see if task->mm is not NULL before calling set_dumpable() on it. Reported-by: Jiri Slaby Signed-off-by: David Howells Signed-off-by: James Morris --- kernel/cred.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index ff7bc071991..480a61aec80 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -372,7 +372,8 @@ int commit_creds(struct cred *new) old->fsuid != new->fsuid || old->fsgid != new->fsgid || !cap_issubset(new->cap_permitted, old->cap_permitted)) { - set_dumpable(task->mm, suid_dumpable); + if (task->mm) + set_dumpable(task->mm, suid_dumpable); task->pdeath_signal = 0; smp_wmb(); } -- cgit v1.2.3 From 75139b8274c3e30354daea623f14b43a482a0bb5 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 7 Jan 2009 18:07:33 -0800 Subject: cgroups: remove some redundant NULL checks - In cgroup_clone(), if vfs_mkdir() returns successfully, dentry->d_fsdata will be the pointer to the newly created cgroup and won't be NULL. - a cgroup file's dentry->d_fsdata won't be NULL, guaranteed by cgroup_add_file(). - When walking through the subsystems of a cgroup_fs (using for_each_subsys), cgrp->subsys[ss->subsys_id] won't be NULL, guaranteed by cgroup_create(). (Also remove 2 unused variables in cgroup_rmdir().) Signed-off-by: Li Zefan Cc: Paul Menage Cc: KAMEZAWA Hiroyuki Cc: Balbir Singh Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f221446aa02..220e0fd659f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -586,7 +586,7 @@ static void cgroup_call_pre_destroy(struct cgroup *cgrp) { struct cgroup_subsys *ss; for_each_subsys(cgrp->root, ss) - if (ss->pre_destroy && cgrp->subsys[ss->subsys_id]) + if (ss->pre_destroy) ss->pre_destroy(ss, cgrp); return; } @@ -610,10 +610,8 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) /* * Release the subsystem state objects.
*/ - for_each_subsys(cgrp->root, ss) { - if (cgrp->subsys[ss->subsys_id]) - ss->destroy(ss, cgrp); - } + for_each_subsys(cgrp->root, ss) + ss->destroy(ss, cgrp); cgrp->root->number_of_cgroups--; mutex_unlock(&cgroup_mutex); @@ -1445,7 +1443,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf, struct cftype *cft = __d_cft(file->f_dentry); struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - if (!cft || cgroup_is_removed(cgrp)) + if (cgroup_is_removed(cgrp)) return -ENODEV; if (cft->write) return cft->write(cgrp, cft, file, buf, nbytes, ppos); @@ -1490,7 +1488,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf, struct cftype *cft = __d_cft(file->f_dentry); struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - if (!cft || cgroup_is_removed(cgrp)) + if (cgroup_is_removed(cgrp)) return -ENODEV; if (cft->read) @@ -1554,10 +1552,8 @@ static int cgroup_file_open(struct inode *inode, struct file *file) err = generic_file_open(inode, file); if (err) return err; - cft = __d_cft(file->f_dentry); - if (!cft) - return -ENODEV; + if (cft->read_map || cft->read_seq_string) { struct cgroup_seqfile_state *state = kzalloc(sizeof(*state), GFP_USER); @@ -2463,8 +2459,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) struct cgroup *cgrp = dentry->d_fsdata; struct dentry *d; struct cgroup *parent; - struct super_block *sb; - struct cgroupfs_root *root; /* the vfs holds both inode->i_mutex already */ @@ -2487,8 +2481,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) mutex_lock(&cgroup_mutex); parent = cgrp->parent; - root = cgrp->root; - sb = root->sb; if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children) @@ -2937,7 +2929,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, } /* Create the cgroup directory, which also creates the cgroup */ - ret = vfs_mkdir(inode, dentry, S_IFDIR | 0755); + ret = vfs_mkdir(inode, dentry, 0755); child = __d_cgrp(dentry); dput(dentry); if (ret) { @@ -2947,13 +2939,6 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, goto out_release; } - if (!child) { - printk(KERN_INFO - "Couldn't find new cgroup %s\n", nodename); - ret = -ENOMEM; - goto out_release; - } - /* The cgroup now exists. Retake cgroup_mutex and check * that we're still in the same state that we thought we * were. */ -- cgit v1.2.3 From cae7a366f77ea5c9f54ae98c5fc65056877a89ed Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 7 Jan 2009 18:07:34 -0800 Subject: ns_cgroup: remove unused spinlock I happened to find the spinlock in struct ns_cgroup is never used. 
Signed-off-by: Li Zefan Cc: Paul Menage Cc: KAMEZAWA Hiroyuki Cc: Balbir Singh Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ns_cgroup.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c index 43c2111cd54..78bc3fdac0d 100644 --- a/kernel/ns_cgroup.c +++ b/kernel/ns_cgroup.c @@ -13,7 +13,6 @@ struct ns_cgroup { struct cgroup_subsys_state css; - spinlock_t lock; }; struct cgroup_subsys ns_subsys; @@ -84,7 +83,6 @@ static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss, ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); if (!ns_cgroup) return ERR_PTR(-ENOMEM); - spin_lock_init(&ns_cgroup->lock); return &ns_cgroup->css; } -- cgit v1.2.3 From b12b533fa523e94e0cc9dc23274ae4f9439f1313 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 7 Jan 2009 18:07:36 -0800 Subject: cgroups: add lock for child->cgroups in cgroup_post_fork() When cgroup_post_fork() is called, the child is already visible to find_task_by_vpid(), so child->cgroups may be changed under us, which would be incorrect: the old css_set's refcnt is decreased and the new css_set's refcnt is increased, but child->cg_list is still added to the old css_set's task list. Take task_lock(child) around the list insertion to prevent this. Signed-off-by: Lai Jiangshan Reviewed-by: Paul Menage Cc: KAMEZAWA Hiroyuki Cc: Pavel Emelyanov Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 220e0fd659f..d7ab4ffd8fd 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2792,8 +2792,10 @@ void cgroup_post_fork(struct task_struct *child) { if (use_task_css_set_links) { write_lock(&css_set_lock); + task_lock(child); if (list_empty(&child->cg_list)) list_add(&child->cg_list, &child->cgroups->tasks); + task_unlock(child); write_unlock(&css_set_lock); } } -- cgit v1.2.3 From 2019f634ce5904c19eba4e86f51b1a119a53a9f1 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 7 Jan 2009 18:07:36 -0800 Subject: cgroups: fix cgroup_iter_next() bug We access res->cgroups without the task_lock(), so res->cgroups may change under us; the test "if (l == &res->cgroups->tasks)" is unreliable and may stay false forever. No extra locking is needed to fix this: we simply reach the struct css_set through the struct cg_cgroup_link rather than through the struct task_struct. Since we hold css_set_lock, the struct cg_cgroup_link is reliable.
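The underlying idiom, sketched below as a self-contained userspace program with hypothetical stand-in types (struct set for struct css_set, struct task for a task on its list): terminate an intrusive-list walk against the head embedded in a container you hold a stable pointer to, never against a head reached through a field the walked element can change:

    #include <stdio.h>
    #include <stddef.h>

    struct node { struct node *next; };

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct set {                /* stands in for struct css_set */
        struct node tasks;      /* list head the container owns */
    };

    struct task {               /* stands in for a task on the list */
        int pid;
        struct node link;
    };

    int main(void)
    {
        struct set s;
        struct task a = { .pid = 1 }, b = { .pid = 2 };

        s.tasks.next = &a.link; /* s -> a -> b -> back to s */
        a.link.next = &b.link;
        b.link.next = &s.tasks;

        /* &s.tasks is the stable terminator, just as &link->cg->tasks
         * is in the patched cgroup_iter_next() below. */
        for (struct node *n = s.tasks.next; n != &s.tasks; n = n->next)
            printf("pid %d\n", container_of(n, struct task, link)->pid);
        return 0;
    }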
Signed-off-by: Lai Jiangshan Reviewed-by: Paul Menage Cc: KAMEZAWA Hiroyuki Cc: Pavel Emelyanov Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d7ab4ffd8fd..a391ab3bdfc 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1808,6 +1808,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp, { struct task_struct *res; struct list_head *l = it->task; + struct cg_cgroup_link *link; /* If the iterator cg is NULL, we have no tasks */ if (!it->cg_link) @@ -1815,7 +1816,8 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp, res = list_entry(l, struct task_struct, cg_list); /* Advance iterator to find next entry */ l = l->next; - if (l == &res->cgroups->tasks) { + link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list); + if (l == &link->cg->tasks) { /* We reached the end of this task list - move on to * the next cg_cgroup_link */ cgroup_advance_iter(cgrp, it); -- cgit v1.2.3 From b2aa30f7bb381e04c93eed106089ba55553955f1 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 7 Jan 2009 18:07:37 -0800 Subject: cgroups: don't put struct cgroupfs_root protected by RCU We don't access struct cgroupfs_root on any fast path, so we should not keep struct cgroupfs_root protected by RCU; but the comment on struct cgroup_subsys.root suggests otherwise. struct cgroup_subsys.root is used in these places: 1 find_css_set(): if (ss->root->subsys_list.next == &ss->sibling) 2 rebind_subsystems(): if (ss->root != &rootnode) rcu_assign_pointer(ss->root, root); rcu_assign_pointer(subsys[i]->root, &rootnode); 3 cgroup_has_css_refs(): if (ss->root != cgrp->root) 4 cgroup_init_subsys(): ss->root = &rootnode; 5 proc_cgroupstats_show(): ss->name, ss->root->subsys_bits, ss->root->number_of_cgroups, !ss->disabled); 6 cgroup_clone(): root = subsys->root; if ((root != subsys->root) || In all these places we either hold cgroup_lock() or never dereference the struct cgroupfs_root. This means we don't need RCU when using struct cgroup_subsys.root, and struct cgroupfs_root should not be put under RCU protection. Signed-off-by: Lai Jiangshan Reviewed-by: Paul Menage Cc: KAMEZAWA Hiroyuki Cc: Pavel Emelyanov Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a391ab3bdfc..a288da176e4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -713,7 +713,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, cgrp->subsys[i] = dummytop->subsys[i]; cgrp->subsys[i]->cgroup = cgrp; list_add(&ss->sibling, &root->subsys_list); - rcu_assign_pointer(ss->root, root); + ss->root = root; if (ss->bind) ss->bind(ss, cgrp); @@ -725,7 +725,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, ss->bind(ss, dummytop); dummytop->subsys[i]->cgroup = dummytop; cgrp->subsys[i] = NULL; - rcu_assign_pointer(subsys[i]->root, &rootnode); + subsys[i]->root = &rootnode; list_del(&ss->sibling); } else if (bit & final_bits) { /* Subsystem state should already exist */ -- cgit v1.2.3 From 104cbd55377029e70fc2cee01089e84b9c36e5dc Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 7 Jan 2009 18:07:38 -0800 Subject: cgroups: use task_lock() to access tsk->cgroups safely in cgroup_clone() Use task_lock() to protect tsk->cgroups and get_css_set(tsk->cgroups).
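A condensed sketch of the locking idiom this patch and the next one apply (kernel-context pseudocode; get_css_set()/put_css_set() are helpers internal to kernel/cgroup.c):

    struct css_set *cg;

    task_lock(tsk);         /* stabilizes tsk->cgroups */
    cg = tsk->cgroups;
    get_css_set(cg);        /* pin the css_set before dropping the lock */
    task_unlock(tsk);
    /* ... cg can now be used safely ... */
    put_css_set(cg);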
Signed-off-by: Lai Jiangshan Acked-by: Paul Menage Cc: KAMEZAWA Hiroyuki Cc: Pavel Emelyanov Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a288da176e4..00d5136d38c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2903,6 +2903,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, mutex_unlock(&cgroup_mutex); return 0; } + task_lock(tsk); cg = tsk->cgroups; parent = task_cgroup(tsk, subsys->subsys_id); @@ -2915,6 +2916,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, /* Keep the cgroup alive */ get_css_set(cg); + task_unlock(tsk); mutex_unlock(&cgroup_mutex); /* Now do the VFS work to create a cgroup */ -- cgit v1.2.3 From 77efecd9e0526327548152df715ab8644ecb5ba0 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 7 Jan 2009 18:07:39 -0800 Subject: cgroups: call find_css_set() safely in cgroup_attach_task() In cgroup_attach_task(), tsk may exit while we call find_css_set(), and find_css_set() would then access an invalid css_set. This patch takes a reference with get_css_set() before calling find_css_set(), and drops it afterwards. NOTE: a css_set's refcount also serves as its task count; with this patch applied, the task count may be off by one WHEN cgroup_lock() is not held. I reviewed the other code that uses the task count and it is still correct; no regression was found by review and simple testing. So I do not use two counters in css_set (one counter for the task count, the other for the refcount, like struct mm_struct). If this fix causes a regression, we will use two counters in css_set. Signed-off-by: Lai Jiangshan Cc: Paul Menage Cc: KAMEZAWA Hiroyuki Cc: Pavel Emelyanov Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 00d5136d38c..61e92c5867e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1214,7 +1214,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) int retval = 0; struct cgroup_subsys *ss; struct cgroup *oldcgrp; - struct css_set *cg = tsk->cgroups; + struct css_set *cg; struct css_set *newcg; struct cgroupfs_root *root = cgrp->root; int subsys_id; @@ -1234,11 +1234,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) } } + task_lock(tsk); + cg = tsk->cgroups; + get_css_set(cg); + task_unlock(tsk); /* * Locate or allocate a new css_set for this task, * based on its final set of cgroups */ newcg = find_css_set(cg, cgrp); + put_css_set(cg); if (!newcg) return -ENOMEM; -- cgit v1.2.3 From 7534432dcc3c654a8671b6b0cdffd1dbdbc73074 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 7 Jan 2009 18:07:40 -0800 Subject: cgroups: remove rcu_read_lock() in cgroupstats_build() cgroup_iter_* do not need rcu_read_lock(). In cgroup_enable_task_cg_lists(), do_each_thread() and while_each_thread() are protected by RCU, and that is OK, for write_lock(&css_set_lock) implies rcu_read_lock() in a non-RT kernel. If we need an explicit rcu_read_lock(), we should add it in cgroup_enable_task_cg_lists(), not in cgroup_iter_*.
Signed-off-by: Lai Jiangshan Acked-by: Paul Menage Cc: KAMEZAWA Hiroyuki Cc: Pavel Emelyanov Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 61e92c5867e..f55af3daffc 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2055,7 +2055,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) ret = 0; cgrp = dentry->d_fsdata; - rcu_read_lock(); cgroup_iter_start(cgrp, &it); while ((tsk = cgroup_iter_next(cgrp, &it))) { @@ -2080,7 +2079,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) } cgroup_iter_end(cgrp, &it); - rcu_read_unlock(); err: return ret; } -- cgit v1.2.3 From e5f6a8609bab0c2d7543ab1505105e011832afd7 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 7 Jan 2009 18:07:41 -0800 Subject: cgroups: make root_list contains active hierarchies only Don't link rootnode to the root list, so root_list contains active hierarchies only as the comment indicates. And rename for_each_root() to for_each_active_root(). Also remove redundant check in cgroup_kill_sb(). Signed-off-by: Li Zefan Cc: Paul Menage Cc: KAMEZAWA Hiroyuki Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f55af3daffc..fd572d05769 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -84,7 +84,7 @@ struct cgroupfs_root { /* Tracks how many cgroups are currently defined in hierarchy.*/ int number_of_cgroups; - /* A list running through the mounted hierarchies */ + /* A list running through the active hierarchies */ struct list_head root_list; /* Hierarchy-specific flags */ @@ -148,8 +148,8 @@ static int notify_on_release(const struct cgroup *cgrp) #define for_each_subsys(_root, _ss) \ list_for_each_entry(_ss, &_root->subsys_list, sibling) -/* for_each_root() allows you to iterate across the active hierarchies */ -#define for_each_root(_root) \ +/* for_each_active_root() allows you to iterate across the active hierarchies */ +#define for_each_active_root(_root) \ list_for_each_entry(_root, &roots, root_list) /* the list of cgroups eligible for automatic release. Protected by @@ -1111,10 +1111,9 @@ static void cgroup_kill_sb(struct super_block *sb) { } write_unlock(&css_set_lock); - if (!list_empty(&root->root_list)) { - list_del(&root->root_list); - root_count--; - } + list_del(&root->root_list); + root_count--; + mutex_unlock(&cgroup_mutex); kfree(root); @@ -2559,7 +2558,6 @@ int __init cgroup_init_early(void) INIT_HLIST_NODE(&init_css_set.hlist); css_set_count = 1; init_cgroup_root(&rootnode); - list_add(&rootnode.root_list, &roots); root_count = 1; init_task.cgroups = &init_css_set; @@ -2666,15 +2664,12 @@ static int proc_cgroup_show(struct seq_file *m, void *v) mutex_lock(&cgroup_mutex); - for_each_root(root) { + for_each_active_root(root) { struct cgroup_subsys *ss; struct cgroup *cgrp; int subsys_id; int count = 0; - /* Skip this hierarchy if it has no active subsystems */ - if (!root->actual_subsys_bits) - continue; seq_printf(m, "%lu:", root->subsys_bits); for_each_subsys(root, ss) seq_printf(m, "%s%s", count++ ? 
"," : "", ss->name); -- cgit v1.2.3 From 33a68ac1c1b695216e873ee12e819adbe73e4d9f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 7 Jan 2009 18:07:42 -0800 Subject: cgroups: add inactive subsystems to rootnode.subsys_list Though for an inactive hierarchy, we have subsys->root == &rootnode, but rootnode's subsys_list is always empty. This conflicts with the code in find_css_set(): for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { ... if (ss->root->subsys_list.next == &ss->sibling) { ... } } if (list_empty(&rootnode.subsys_list)) { ... } The above code assumes rootnode.subsys_list links all inactive hierarchies. Signed-off-by: Li Zefan Cc: Paul Menage Cc: KAMEZAWA Hiroyuki Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index fd572d05769..abf7248f501 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -712,7 +712,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, BUG_ON(dummytop->subsys[i]->cgroup != dummytop); cgrp->subsys[i] = dummytop->subsys[i]; cgrp->subsys[i]->cgroup = cgrp; - list_add(&ss->sibling, &root->subsys_list); + list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) ss->bind(ss, cgrp); @@ -726,7 +726,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, dummytop->subsys[i]->cgroup = dummytop; cgrp->subsys[i] = NULL; subsys[i]->root = &rootnode; - list_del(&ss->sibling); + list_move(&ss->sibling, &rootnode.subsys_list); } else if (bit & final_bits) { /* Subsystem state should already exist */ BUG_ON(!cgrp->subsys[i]); @@ -2521,6 +2521,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); /* Create the top cgroup state for this subsystem */ + list_add(&ss->sibling, &rootnode.subsys_list); ss->root = &rootnode; css = ss->create(ss, dummytop); /* We don't handle early failures gracefully */ -- cgit v1.2.3 From c12f65d4396e05c51ce3af7f159ead98574a587c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 7 Jan 2009 18:07:42 -0800 Subject: cgroups: introduce link_css_set() to remove duplicate code Add a common function link_css_set() to link a css_set to a cgroup. 
Signed-off-by: Li Zefan Cc: Paul Menage Cc: KAMEZAWA Hiroyuki Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 68 +++++++++++++++++++++++++-------------------------------- 1 file changed, 30 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index abf7248f501..4c475ce4e22 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -384,6 +384,25 @@ static int allocate_cg_links(int count, struct list_head *tmp) return 0; } +/** + * link_css_set - a helper function to link a css_set to a cgroup + * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links() + * @cg: the css_set to be linked + * @cgrp: the destination cgroup + */ +static void link_css_set(struct list_head *tmp_cg_links, + struct css_set *cg, struct cgroup *cgrp) +{ + struct cg_cgroup_link *link; + + BUG_ON(list_empty(tmp_cg_links)); + link = list_first_entry(tmp_cg_links, struct cg_cgroup_link, + cgrp_link_list); + link->cg = cg; + list_move(&link->cgrp_link_list, &cgrp->css_sets); + list_add(&link->cg_link_list, &cg->cg_links); +} + /* * find_css_set() takes an existing cgroup group and a * cgroup object, and returns a css_set object that's @@ -399,7 +418,6 @@ static struct css_set *find_css_set( int i; struct list_head tmp_cg_links; - struct cg_cgroup_link *link; struct hlist_head *hhead; @@ -444,26 +462,11 @@ static struct css_set *find_css_set( * only do it for the first subsystem in each * hierarchy */ - if (ss->root->subsys_list.next == &ss->sibling) { - BUG_ON(list_empty(&tmp_cg_links)); - link = list_entry(tmp_cg_links.next, - struct cg_cgroup_link, - cgrp_link_list); - list_del(&link->cgrp_link_list); - list_add(&link->cgrp_link_list, &cgrp->css_sets); - link->cg = res; - list_add(&link->cg_link_list, &res->cg_links); - } - } - if (list_empty(&rootnode.subsys_list)) { - link = list_entry(tmp_cg_links.next, - struct cg_cgroup_link, - cgrp_link_list); - list_del(&link->cgrp_link_list); - list_add(&link->cgrp_link_list, &dummytop->css_sets); - link->cg = res; - list_add(&link->cg_link_list, &res->cg_links); + if (ss->root->subsys_list.next == &ss->sibling) + link_css_set(&tmp_cg_links, res, cgrp); } + if (list_empty(&rootnode.subsys_list)) + link_css_set(&tmp_cg_links, res, dummytop); BUG_ON(!list_empty(&tmp_cg_links)); @@ -988,7 +991,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, root = NULL; } else { /* New superblock */ - struct cgroup *cgrp = &root->top_cgroup; + struct cgroup *root_cgrp = &root->top_cgroup; struct inode *inode; int i; @@ -1029,7 +1032,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, list_add(&root->root_list, &roots); root_count++; - sb->s_root->d_fsdata = &root->top_cgroup; + sb->s_root->d_fsdata = root_cgrp; root->top_cgroup.dentry = sb->s_root; /* Link the top cgroup in this hierarchy into all @@ -1040,29 +1043,18 @@ static int cgroup_get_sb(struct file_system_type *fs_type, struct hlist_node *node; struct css_set *cg; - hlist_for_each_entry(cg, node, hhead, hlist) { - struct cg_cgroup_link *link; - - BUG_ON(list_empty(&tmp_cg_links)); - link = list_entry(tmp_cg_links.next, - struct cg_cgroup_link, - cgrp_link_list); - list_del(&link->cgrp_link_list); - link->cg = cg; - list_add(&link->cgrp_link_list, - &root->top_cgroup.css_sets); - list_add(&link->cg_link_list, &cg->cg_links); - } + hlist_for_each_entry(cg, node, hhead, hlist) + link_css_set(&tmp_cg_links, cg, root_cgrp); } write_unlock(&css_set_lock); free_cg_links(&tmp_cg_links); - 
BUG_ON(!list_empty(&root_cgrp->sibling)); + BUG_ON(!list_empty(&root_cgrp->children)); BUG_ON(root->number_of_cgroups != 1); - cgroup_populate_dir(cgrp); + cgroup_populate_dir(root_cgrp); mutex_unlock(&inode->i_mutex); mutex_unlock(&cgroup_mutex); } -- cgit v1.2.3 From e7b80bb695a5b64c92e314838e083b2f3bdf29b2 Mon Sep 17 00:00:00 2001 From: Gowrishankar M Date: Wed, 7 Jan 2009 18:07:43 -0800 Subject: cgroups: skip processes from other namespaces when listing a cgroup Once a cgroup is populated with tasks from the system namespace, a container listing that cgroup's tasks sees tasks from other namespaces reported as 0. Though this is expected behaviour from the container's end, there is no use in showing the unwanted 0s. In this patch, we check that a process is in the same namespace before loading it into the pid array. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Gowrishankar M Acked-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4c475ce4e22..cb7c72b91f4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2007,14 +2007,16 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) */ static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) { - int n = 0; + int n = 0, pid; struct cgroup_iter it; struct task_struct *tsk; cgroup_iter_start(cgrp, &it); while ((tsk = cgroup_iter_next(cgrp, &it))) { if (unlikely(n == npids)) break; - pidarray[n++] = task_pid_vnr(tsk); + pid = task_pid_vnr(tsk); + if (pid > 0) + pidarray[n++] = pid; } cgroup_iter_end(cgrp, &it); return n; -- cgit v1.2.3 From a47295e6bc42ad35f9c15ac66f598aa24debd4e2 Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Wed, 7 Jan 2009 18:07:44 -0800 Subject: cgroups: make cgroup_path() RCU-safe Fix races with /proc/sched_debug by freeing cgroup objects via an RCU callback. Thus any cgroup reference obtained from an RCU-safe source will remain valid during the RCU section. Since dentries are also RCU-safe, this allows us to traverse up the tree safely. Additionally, make cgroup_path() check for a NULL cgrp->dentry to avoid trying to report a path for a partially-created cgroup. [lizf@cn.fujitsu.com: call deactive_super() in cgroup_diput()] Signed-off-by: Paul Menage Reviewed-by: Li Zefan Tested-by: Li Zefan Cc: Peter Zijlstra Signed-off-by: Li Zefan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index cb7c72b91f4..83ea4f524be 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -271,7 +271,7 @@ static void __put_css_set(struct css_set *cg, int taskexit) rcu_read_lock(); for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup *cgrp = cg->subsys[i]->cgroup; + struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup); if (atomic_dec_and_test(&cgrp->count) && notify_on_release(cgrp)) { if (taskexit) @@ -594,6 +594,13 @@ static void cgroup_call_pre_destroy(struct cgroup *cgrp) return; } +static void free_cgroup_rcu(struct rcu_head *obj) +{ + struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head); + + kfree(cgrp); +} + static void cgroup_diput(struct dentry *dentry, struct inode *inode) { /* is dentry a directory ?
if so, kfree() associated cgroup */ @@ -619,11 +626,13 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) cgrp->root->number_of_cgroups--; mutex_unlock(&cgroup_mutex); - /* Drop the active superblock reference that we took when we - * created the cgroup */ + /* + * Drop the active superblock reference that we took when we + * created the cgroup + */ deactivate_super(cgrp->root->sb); - kfree(cgrp); + call_rcu(&cgrp->rcu_head, free_cgroup_rcu); } iput(inode); } @@ -1134,14 +1143,16 @@ static inline struct cftype *__d_cft(struct dentry *dentry) * @buf: the buffer to write the path into * @buflen: the length of the buffer * - * Called with cgroup_mutex held. Writes path of cgroup into buf. - * Returns 0 on success, -errno on error. + * Called with cgroup_mutex held or else with an RCU-protected cgroup + * reference. Writes path of cgroup into buf. Returns 0 on success, + * -errno on error. */ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) { char *start; + struct dentry *dentry = rcu_dereference(cgrp->dentry); - if (cgrp == dummytop) { + if (!dentry || cgrp == dummytop) { /* * Inactive subsystems have no dentry for their root * cgroup @@ -1154,13 +1165,14 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) *--start = '\0'; for (;;) { - int len = cgrp->dentry->d_name.len; + int len = dentry->d_name.len; if ((start -= len) < buf) return -ENAMETOOLONG; memcpy(start, cgrp->dentry->d_name.name, len); cgrp = cgrp->parent; if (!cgrp) break; + dentry = rcu_dereference(cgrp->dentry); if (!cgrp->parent) continue; if (--start < buf) -- cgit v1.2.3 From 28dbc4b6a01fb579a9441c7b81e3d3413dc452df Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 7 Jan 2009 18:08:05 -0800 Subject: memcg: memory cgroup resource counters for hierarchy Add support for building hierarchies in resource counters. Cgroups allow us to build a deep hierarchy, but we currently don't link the resource counters belonging to the memory controller control groups, in the same fashion as the corresponding cgroup entries in the cgroup hierarchy. This patch provides the infrastructure for resource counters that have the same hierarchy as their cgroup counterparts. This set of patches is based on the resource counter hierarchy patches posted by Pavel Emelianov. NOTE: building hierarchies is expensive; deeper hierarchies imply charging all the way up to the root. It is known that hierarchies are expensive, so the user needs to be careful and aware of the trade-offs before creating very deep ones.
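A sketch of how a controller might use the hierarchical API this patch introduces (hypothetical caller; the memory controller is the real user elsewhere in this series):

    struct res_counter parent_cnt, child_cnt;
    struct res_counter *fail_at;

    res_counter_init(&parent_cnt, NULL);        /* root of the hierarchy */
    res_counter_init(&child_cnt, &parent_cnt);  /* child linked to parent */

    /* Charging walks child -> parent -> ... -> root; on failure the
     * partial charges are rolled back and fail_at names the level
     * whose limit was hit. */
    if (res_counter_charge(&child_cnt, PAGE_SIZE, &fail_at))
        printk(KERN_DEBUG "limit hit at counter %p\n", fail_at);
    else
        res_counter_uncharge(&child_cnt, PAGE_SIZE);    /* uncharges the whole chain */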
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Balbir Singh Cc: YAMAMOTO Takashi Cc: Paul Menage Cc: Li Zefan Cc: David Rientjes Cc: Pavel Emelianov Cc: Dhaval Giani Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/res_counter.c | 44 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/res_counter.c b/kernel/res_counter.c index f275c8eca77..bf8e7534c80 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -15,10 +15,11 @@ #include #include -void res_counter_init(struct res_counter *counter) +void res_counter_init(struct res_counter *counter, struct res_counter *parent) { spin_lock_init(&counter->lock); counter->limit = (unsigned long long)LLONG_MAX; + counter->parent = parent; } int res_counter_charge_locked(struct res_counter *counter, unsigned long val) @@ -34,14 +35,34 @@ int res_counter_charge_locked(struct res_counter *counter, unsigned long val) return 0; } -int res_counter_charge(struct res_counter *counter, unsigned long val) +int res_counter_charge(struct res_counter *counter, unsigned long val, + struct res_counter **limit_fail_at) { int ret; unsigned long flags; - - spin_lock_irqsave(&counter->lock, flags); - ret = res_counter_charge_locked(counter, val); - spin_unlock_irqrestore(&counter->lock, flags); + struct res_counter *c, *u; + + *limit_fail_at = NULL; + local_irq_save(flags); + for (c = counter; c != NULL; c = c->parent) { + spin_lock(&c->lock); + ret = res_counter_charge_locked(c, val); + spin_unlock(&c->lock); + if (ret < 0) { + *limit_fail_at = c; + goto undo; + } + } + ret = 0; + goto done; +undo: + for (u = counter; u != c; u = u->parent) { + spin_lock(&u->lock); + res_counter_uncharge_locked(u, val); + spin_unlock(&u->lock); + } +done: + local_irq_restore(flags); return ret; } @@ -56,10 +77,15 @@ void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) void res_counter_uncharge(struct res_counter *counter, unsigned long val) { unsigned long flags; + struct res_counter *c; - spin_lock_irqsave(&counter->lock, flags); - res_counter_uncharge_locked(counter, val); - spin_unlock_irqrestore(&counter->lock, flags); + local_irq_save(flags); + for (c = counter; c != NULL; c = c->parent) { + spin_lock(&c->lock); + res_counter_uncharge_locked(c, val); + spin_unlock(&c->lock); + } + local_irq_restore(flags); } -- cgit v1.2.3 From 999cd8a450f8f93701669a61cac4d3b19eca07e8 Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Wed, 7 Jan 2009 18:08:36 -0800 Subject: cgroups: add a per-subsystem hierarchy_mutex These patches introduce new locking/refcount support for cgroups to reduce the need for subsystems to call cgroup_lock(). This will ultimately allow the atomicity of cgroup_rmdir() (which was removed recently) to be restored. 
These three patches give: 1/3 - introduce a per-subsystem hierarchy_mutex which a subsystem can use to prevent changes to its own cgroup tree 2/3 - use hierarchy_mutex in place of calling cgroup_lock() in the memory controller 3/3 - introduce a css_tryget() function similar to the one recently proposed by Kamezawa, but avoiding spurious refcount failures in the event of a race between a css_tryget() and an unsuccessful cgroup_rmdir() Future patches will likely involve: - using hierarchy mutex in place of cgroup_lock() in more subsystems where appropriate - restoring the atomicity of cgroup_rmdir() with respect to cgroup_create() This patch: Add a hierarchy_mutex to the cgroup_subsys object that protects changes to the hierarchy observed by that subsystem. It is taken by the cgroup subsystem (in addition to cgroup_mutex) for the following operations: - linking a cgroup into that subsystem's cgroup tree - unlinking a cgroup from that subsystem's cgroup tree - moving the subsystem to/from a hierarchy (including across the bind() callback) Thus if the subsystem holds its own hierarchy_mutex, it can safely traverse its own hierarchy. Signed-off-by: Paul Menage Tested-by: KAMEZAWA Hiroyuki Cc: Li Zefan Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 83ea4f524be..8b6379cdf63 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -722,23 +722,26 @@ static int rebind_subsystems(struct cgroupfs_root *root, BUG_ON(cgrp->subsys[i]); BUG_ON(!dummytop->subsys[i]); BUG_ON(dummytop->subsys[i]->cgroup != dummytop); + mutex_lock(&ss->hierarchy_mutex); cgrp->subsys[i] = dummytop->subsys[i]; cgrp->subsys[i]->cgroup = cgrp; list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) ss->bind(ss, cgrp); - + mutex_unlock(&ss->hierarchy_mutex); } else if (bit & removed_bits) { /* We're removing this subsystem */ BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); BUG_ON(cgrp->subsys[i]->cgroup != cgrp); + mutex_lock(&ss->hierarchy_mutex); if (ss->bind) ss->bind(ss, dummytop); dummytop->subsys[i]->cgroup = dummytop; cgrp->subsys[i] = NULL; subsys[i]->root = &rootnode; list_move(&ss->sibling, &rootnode.subsys_list); + mutex_unlock(&ss->hierarchy_mutex); } else if (bit & final_bits) { /* Subsystem state should already exist */ BUG_ON(!cgrp->subsys[i]); @@ -2338,6 +2341,29 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, cgrp->subsys[ss->subsys_id] = css; } +static void cgroup_lock_hierarchy(struct cgroupfs_root *root) +{ + /* We need to take each hierarchy_mutex in a consistent order */ + int i; + + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + if (ss->root == root) + mutex_lock_nested(&ss->hierarchy_mutex, i); + } +} + +static void cgroup_unlock_hierarchy(struct cgroupfs_root *root) +{ + int i; + + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + if (ss->root == root) + mutex_unlock(&ss->hierarchy_mutex); + } +} + /* * cgroup_create - create a cgroup * @parent: cgroup that will be parent of the new cgroup @@ -2386,7 +2412,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, init_cgroup_css(css, ss, cgrp); } + cgroup_lock_hierarchy(root); list_add(&cgrp->sibling, &cgrp->parent->children); + cgroup_unlock_hierarchy(root); root->number_of_cgroups++; err = cgroup_create_dir(cgrp, 
dentry, mode); @@ -2504,8 +2532,12 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) if (!list_empty(&cgrp->release_list)) list_del(&cgrp->release_list); spin_unlock(&release_list_lock); - /* delete my sibling from parent->children */ + + cgroup_lock_hierarchy(cgrp->root); + /* delete this cgroup from parent->children */ list_del(&cgrp->sibling); + cgroup_unlock_hierarchy(cgrp->root); + spin_lock(&cgrp->dentry->d_lock); d = dget(cgrp->dentry); spin_unlock(&d->d_lock); @@ -2547,6 +2579,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) * need to invoke fork callbacks here. */ BUG_ON(!list_empty(&init_task.tasks)); + mutex_init(&ss->hierarchy_mutex); ss->active = 1; } -- cgit v1.2.3 From e7c5ec9193d32b9559a3bb8893ceedbda85201ff Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Wed, 7 Jan 2009 18:08:38 -0800 Subject: cgroups: add css_tryget() Add css_tryget(), that obtains a counted reference on a CSS. It is used in situations where the caller has a "weak" reference to the CSS, i.e. one that does not protect the cgroup from removal via a reference count, but would instead be cleaned up by a destroy() callback. css_tryget() will return true on success, or false if the cgroup is being removed. This is similar to Kamezawa Hiroyuki's patch from a week or two ago, but with the difference that in the event of css_tryget() racing with a cgroup_rmdir(), css_tryget() will only return false if the cgroup really does get removed. This implementation is done by biasing css->refcnt, so that a refcnt of 1 means "releasable" and 0 means "released or releasing". In the event of a race, css_tryget() distinguishes between "released" and "releasing" by checking for the CSS_REMOVED flag in css->flags. Signed-off-by: Paul Menage Tested-by: KAMEZAWA Hiroyuki Cc: Li Zefan Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 56 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8b6379cdf63..c29831076e7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2333,7 +2333,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, struct cgroup *cgrp) { css->cgroup = cgrp; - atomic_set(&css->refcnt, 0); + atomic_set(&css->refcnt, 1); css->flags = 0; if (cgrp == dummytop) set_bit(CSS_ROOT, &css->flags); @@ -2465,7 +2465,7 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) { /* Check the reference count on each subsystem. Since we * already established that there are no tasks in the - * cgroup, if the css refcount is also 0, then there should + * cgroup, if the css refcount is also 1, then there should * be no outstanding references, so the subsystem is safe to * destroy. We scan across all subsystems rather than using * the per-hierarchy linked list of mounted subsystems since @@ -2486,12 +2486,62 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) * matter, since it can only happen if the cgroup * has been deleted and hence no longer needs the * release agent to be called anyway. */ - if (css && atomic_read(&css->refcnt)) + if (css && (atomic_read(&css->refcnt) > 1)) return 1; } return 0; } +/* + * Atomically mark all (or else none) of the cgroup's CSS objects as + * CSS_REMOVED. Return true on success, or false if the cgroup has + * busy subsystems. 
Call with cgroup_mutex held + */ + +static int cgroup_clear_css_refs(struct cgroup *cgrp) +{ + struct cgroup_subsys *ss; + unsigned long flags; + bool failed = false; + local_irq_save(flags); + for_each_subsys(cgrp->root, ss) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + int refcnt; + do { + /* We can only remove a CSS with a refcnt==1 */ + refcnt = atomic_read(&css->refcnt); + if (refcnt > 1) { + failed = true; + goto done; + } + BUG_ON(!refcnt); + /* + * Drop the refcnt to 0 while we check other + * subsystems. This will cause any racing + * css_tryget() to spin until we set the + * CSS_REMOVED bits or abort + */ + } while (atomic_cmpxchg(&css->refcnt, refcnt, 0) != refcnt); + } + done: + for_each_subsys(cgrp->root, ss) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + if (failed) { + /* + * Restore old refcnt if we previously managed + * to clear it from 1 to 0 + */ + if (!atomic_read(&css->refcnt)) + atomic_set(&css->refcnt, 1); + } else { + /* Commit the fact that the CSS is removed */ + set_bit(CSS_REMOVED, &css->flags); + } + } + local_irq_restore(flags); + return !failed; +} + static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) { struct cgroup *cgrp = dentry->d_fsdata; @@ -2522,7 +2572,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children) - || cgroup_has_css_refs(cgrp)) { + || !cgroup_clear_css_refs(cgrp)) { mutex_unlock(&cgroup_mutex); return -EBUSY; } @@ -3078,7 +3128,8 @@ void __css_put(struct cgroup_subsys_state *css) { struct cgroup *cgrp = css->cgroup; rcu_read_lock(); - if (atomic_dec_and_test(&css->refcnt) && notify_on_release(cgrp)) { + if ((atomic_dec_return(&css->refcnt) == 1) && + notify_on_release(cgrp)) { set_bit(CGRP_RELEASABLE, &cgrp->flags); check_for_release(cgrp); } -- cgit v1.2.3 From 13337714f3b0307dc7f75ef5d83ecf0db2abbd65 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 7 Jan 2009 18:08:39 -0800 Subject: cpuset: rcu_read_lock() to protect task_cs() task_cs() calls task_subsys_state(). We must use rcu_read_lock() to protect cgroup_subsys_state(). It's correct that top_cpuset is never freed, but cgroup_subsys_state() accesses a css_set, and this css_set may be freed when task_cs() is called. We use rcu_read_lock() to protect it. Signed-off-by: Lai Jiangshan Acked-by: Paul Menage Cc: KAMEZAWA Hiroyuki Cc: Pavel Emelyanov Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 345ace5117d..a841b5c01ef 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -375,14 +375,9 @@ void cpuset_update_task_memory_state(void) struct task_struct *tsk = current; struct cpuset *cs; - if (task_cs(tsk) == &top_cpuset) { - /* Don't need rcu for top_cpuset. It's never freed.
*/ - my_cpusets_mem_gen = top_cpuset.mems_generation; - } else { - rcu_read_lock(); - my_cpusets_mem_gen = task_cs(tsk)->mems_generation; - rcu_read_unlock(); - } + rcu_read_lock(); + my_cpusets_mem_gen = task_cs(tsk)->mems_generation; + rcu_read_unlock(); if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { mutex_lock(&callback_mutex); -- cgit v1.2.3 From f5813d94279a18ff5936d675e24b44b44a571197 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 7 Jan 2009 18:08:40 -0800 Subject: cpusets: set task's cpu_allowed to cpu_possible_map when attaching it into top cpuset I found a bug on my dual-cpu box. I created a sub cpuset in the top cpuset and assigned 1 to its cpus, and then attached some tasks to this sub cpuset. After this, we offlined CPU1; the tasks in this new cpuset were moved into the top cpuset automatically because there was no cpu left in the sub cpuset. Then we onlined CPU1 and found that all the tasks which didn't originally belong to the top cpuset ran only on CPU0. We fix this bug by setting a task's cpus_allowed to cpu_possible_map when attaching it into the top cpuset. This method doesn't modify the current behavior of cpusets on CPU hotplug, and all tasks in the top cpuset use cpu_possible_map to initialize their cpus_allowed. Signed-off-by: Miao Xie Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index a841b5c01ef..6012e326e85 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1338,10 +1338,14 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cpuset *oldcs = cgroup_cs(oldcont); int err; - mutex_lock(&callback_mutex); - guarantee_online_cpus(cs, &cpus); + if (cs == &top_cpuset) { + cpus = cpu_possible_map; + } else { + mutex_lock(&callback_mutex); + guarantee_online_cpus(cs, &cpus); + mutex_unlock(&callback_mutex); + } err = set_cpus_allowed_ptr(tsk, &cpus); - mutex_unlock(&callback_mutex); if (err) return; -- cgit v1.2.3 From 5a7625df725a486ad600b0036b6111dbd6321c41 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 7 Jan 2009 18:08:41 -0800 Subject: cpuset: remove on stack cpumask_t in cpuset_sprintf_cpulist() This patchset converts cpuset to use the new cpumask API, and thus removes on-stack cpumask_t variables to reduce stack usage. Before: # cat kernel/cpuset.c include/linux/cpuset.h | grep -c cpumask_t 21 After: # cat kernel/cpuset.c include/linux/cpuset.h | grep -c cpumask_t 0 This patch: Impact: reduce stack usage It's safe to call cpulist_scnprintf inside callback_mutex, and thus we can just remove the cpumask_t; there is no need to allocate a cpumask_var_t.
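For reference, cpulist_scnprintf() emits the human-readable list format that cpuset.cpus uses: a mask with CPUs 0, 1, 2 and 5 set prints as "0-2,5". A minimal (hypothetical) kernel-context use:

    char buf[64];
    int len = cpulist_scnprintf(buf, sizeof(buf), cpu_online_mask);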
Signed-off-by: Li Zefan Cc: Ingo Molnar Cc: Rusty Russell Acked-by: Mike Travis Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 6012e326e85..41c2343df97 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1486,13 +1486,13 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) { - cpumask_t mask; + int ret; mutex_lock(&callback_mutex); - mask = cs->cpus_allowed; + ret = cpulist_scnprintf(page, PAGE_SIZE, &cs->cpus_allowed); mutex_unlock(&callback_mutex); - return cpulist_scnprintf(page, PAGE_SIZE, &mask); + return ret; } static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) -- cgit v1.2.3 From 5771f0a2236df69683e9abea87f35f522c97ff94 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 7 Jan 2009 18:08:41 -0800 Subject: cpuset: remove on stack cpumask_t in cpuset_can_attach() Impact: reduce stack usage Just use cs->cpus_allowed, and no need to allocate a cpumask_var_t. Signed-off-by: Li Zefan Cc: Ingo Molnar Cc: Rusty Russell Acked-by: Mike Travis Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 41c2343df97..afa29cfc5bb 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1311,20 +1311,19 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, struct task_struct *tsk) { struct cpuset *cs = cgroup_cs(cont); + int ret = 0; if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) return -ENOSPC; - if (tsk->flags & PF_THREAD_BOUND) { - cpumask_t mask; + if (tsk->flags & PF_THREAD_BOUND) { mutex_lock(&callback_mutex); - mask = cs->cpus_allowed; + if (!cpus_equal(tsk->cpus_allowed, cs->cpus_allowed)) + ret = -EINVAL; mutex_unlock(&callback_mutex); - if (!cpus_equal(tsk->cpus_allowed, mask)) - return -EINVAL; } - return security_task_setscheduler(tsk, 0, NULL); + return ret < 0 ? ret : security_task_setscheduler(tsk, 0, NULL); } static void cpuset_attach(struct cgroup_subsys *ss, -- cgit v1.2.3 From 2341d1b6598c7146d64a5050b53a72a5a819617f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 7 Jan 2009 18:08:42 -0800 Subject: cpuset: convert cpuset_attach() to use cpumask_var_t Impact: reduce stack usage Allocate a global cpumask_var_t at boot, and use it in cpuset_attach(), so we won't fail cpuset_attach(). 
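The allocation trick above is a small pattern worth spelling out: pre-allocate one scratch mask at init time and reuse it under the subsystem's existing big lock, so the attach path needs neither a large stack object nor an allocation that could fail mid-operation. A rough userspace model of the idea (pthread mutex standing in for cgroup_lock; every name here is invented for illustration):

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

#define MASK_BYTES 512  /* assumes NR_CPUS=4096 on a 64-bit box */

static unsigned long *scratch_mask;     /* allocated once, at init */
static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER;

/* Analogue of cpuset_init(): the only place allocation can fail. */
static int subsys_init(void)
{
        scratch_mask = calloc(1, MASK_BYTES);
        return scratch_mask ? 0 : -1;   /* the kernel patch BUG()s here */
}

/* Analogue of cpuset_attach(): must not fail, must stay off the stack. */
static void attach_task(const unsigned long *allowed)
{
        pthread_mutex_lock(&big_lock);  /* serializes scratch_mask users */
        memcpy(scratch_mask, allowed, MASK_BYTES);
        /* ... bind the task according to scratch_mask ... */
        pthread_mutex_unlock(&big_lock);
}

int main(void)
{
        unsigned long allowed[MASK_BYTES / sizeof(unsigned long)];

        memset(allowed, 0xff, sizeof(allowed));
        if (subsys_init())
                return 1;
        attach_task(allowed);
        free(scratch_mask);
        return 0;
}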
Signed-off-by: Li Zefan Cc: Ingo Molnar Cc: Rusty Russell Acked-by: Mike Travis Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index afa29cfc5bb..1e32e6b380a 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1306,6 +1306,9 @@ static int fmeter_getrate(struct fmeter *fmp) return val; } +/* Protected by cgroup_lock */ +static cpumask_var_t cpus_attach; + /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, struct task_struct *tsk) @@ -1330,7 +1333,6 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, struct cgroup *oldcont, struct task_struct *tsk) { - cpumask_t cpus; nodemask_t from, to; struct mm_struct *mm; struct cpuset *cs = cgroup_cs(cont); @@ -1338,13 +1340,13 @@ static void cpuset_attach(struct cgroup_subsys *ss, int err; if (cs == &top_cpuset) { - cpus = cpu_possible_map; + cpumask_copy(cpus_attach, cpu_possible_mask); } else { mutex_lock(&callback_mutex); - guarantee_online_cpus(cs, &cpus); + guarantee_online_cpus(cs, cpus_attach); mutex_unlock(&callback_mutex); } - err = set_cpus_allowed_ptr(tsk, &cpus); + err = set_cpus_allowed_ptr(tsk, cpus_attach); if (err) return; @@ -1357,7 +1359,6 @@ static void cpuset_attach(struct cgroup_subsys *ss, cpuset_migrate_mm(mm, &from, &to); mmput(mm); } - } /* The various types of files and directories in a cpuset file system */ @@ -1838,6 +1839,9 @@ int __init cpuset_init(void) if (err < 0) return err; + if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)) + BUG(); + number_of_cpusets = 1; return 0; } -- cgit v1.2.3 From 645fcc9d2f6946f97a41c8d00edee38f8a6f0060 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 7 Jan 2009 18:08:43 -0800 Subject: cpuset: don't allocate trial cpuset on stack Impact: cleanups, reduce stack usage This patch prepares for the next patch. When we convert cpuset.cpus_allowed to cpumask_var_t, (trialcs = *cs) no longer works. Another result of this patch is reducing stack usage of trialcs. sizeof(*cs) can be as large as 148 bytes on x86_64, so it's really not good to have it on stack. Signed-off-by: Li Zefan Cc: Ingo Molnar Cc: Rusty Russell Acked-by: Mike Travis Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 93 +++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 60 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 1e32e6b380a..f66527bfd21 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -415,6 +415,24 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) is_mem_exclusive(p) <= is_mem_exclusive(q); } +/** + * alloc_trial_cpuset - allocate a trial cpuset + * @cs: the cpuset that the trial cpuset duplicates + */ +static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs) +{ + return kmemdup(cs, sizeof(*cs), GFP_KERNEL); +} + +/** + * free_trial_cpuset - free the trial cpuset + * @trial: the trial cpuset to be freed + */ +static void free_trial_cpuset(struct cpuset *trial) +{ + kfree(trial); +} + /* * validate_change() - Used to validate that any proposed cpuset change * follows the structural rules for cpusets. 
@@ -880,10 +898,10 @@ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) * @cs: the cpuset to consider * @buf: buffer of cpu numbers written to this cpuset */ -static int update_cpumask(struct cpuset *cs, const char *buf) +static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, + const char *buf) { struct ptr_heap heap; - struct cpuset trialcs; int retval; int is_load_balanced; @@ -891,8 +909,6 @@ static int update_cpumask(struct cpuset *cs, const char *buf) if (cs == &top_cpuset) return -EACCES; - trialcs = *cs; - /* * An empty cpus_allowed is ok only if the cpuset has no tasks. * Since cpulist_parse() fails on an empty mask, we special case @@ -900,31 +916,31 @@ static int update_cpumask(struct cpuset *cs, const char *buf) * with tasks have cpus. */ if (!*buf) { - cpus_clear(trialcs.cpus_allowed); + cpus_clear(trialcs->cpus_allowed); } else { - retval = cpulist_parse(buf, &trialcs.cpus_allowed); + retval = cpulist_parse(buf, &trialcs->cpus_allowed); if (retval < 0) return retval; - if (!cpus_subset(trialcs.cpus_allowed, cpu_online_map)) + if (!cpus_subset(trialcs->cpus_allowed, cpu_online_map)) return -EINVAL; } - retval = validate_change(cs, &trialcs); + retval = validate_change(cs, trialcs); if (retval < 0) return retval; /* Nothing to do if the cpus didn't change */ - if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) + if (cpus_equal(cs->cpus_allowed, trialcs->cpus_allowed)) return 0; retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); if (retval) return retval; - is_load_balanced = is_sched_load_balance(&trialcs); + is_load_balanced = is_sched_load_balance(trialcs); mutex_lock(&callback_mutex); - cs->cpus_allowed = trialcs.cpus_allowed; + cs->cpus_allowed = trialcs->cpus_allowed; mutex_unlock(&callback_mutex); /* @@ -1099,9 +1115,9 @@ done: * lock each such tasks mm->mmap_sem, scan its vma's and rebind * their mempolicies to the cpusets new mems_allowed. */ -static int update_nodemask(struct cpuset *cs, const char *buf) +static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, + const char *buf) { - struct cpuset trialcs; nodemask_t oldmem; int retval; @@ -1112,8 +1128,6 @@ static int update_nodemask(struct cpuset *cs, const char *buf) if (cs == &top_cpuset) return -EACCES; - trialcs = *cs; - /* * An empty mems_allowed is ok iff there are no tasks in the cpuset. * Since nodelist_parse() fails on an empty mask, we special case @@ -1121,27 +1135,27 @@ static int update_nodemask(struct cpuset *cs, const char *buf) * with tasks have memory. 
*/ if (!*buf) { - nodes_clear(trialcs.mems_allowed); + nodes_clear(trialcs->mems_allowed); } else { - retval = nodelist_parse(buf, trialcs.mems_allowed); + retval = nodelist_parse(buf, trialcs->mems_allowed); if (retval < 0) goto done; - if (!nodes_subset(trialcs.mems_allowed, + if (!nodes_subset(trialcs->mems_allowed, node_states[N_HIGH_MEMORY])) return -EINVAL; } oldmem = cs->mems_allowed; - if (nodes_equal(oldmem, trialcs.mems_allowed)) { + if (nodes_equal(oldmem, trialcs->mems_allowed)) { retval = 0; /* Too easy - nothing to do */ goto done; } - retval = validate_change(cs, &trialcs); + retval = validate_change(cs, trialcs); if (retval < 0) goto done; mutex_lock(&callback_mutex); - cs->mems_allowed = trialcs.mems_allowed; + cs->mems_allowed = trialcs->mems_allowed; cs->mems_generation = cpuset_mems_generation++; mutex_unlock(&callback_mutex); @@ -1181,31 +1195,36 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on) { - struct cpuset trialcs; + struct cpuset *trialcs; int err; int balance_flag_changed; - trialcs = *cs; + trialcs = alloc_trial_cpuset(cs); + if (!trialcs) + return -ENOMEM; + if (turning_on) - set_bit(bit, &trialcs.flags); + set_bit(bit, &trialcs->flags); else - clear_bit(bit, &trialcs.flags); + clear_bit(bit, &trialcs->flags); - err = validate_change(cs, &trialcs); + err = validate_change(cs, trialcs); if (err < 0) - return err; + goto out; balance_flag_changed = (is_sched_load_balance(cs) != - is_sched_load_balance(&trialcs)); + is_sched_load_balance(trialcs)); mutex_lock(&callback_mutex); - cs->flags = trialcs.flags; + cs->flags = trialcs->flags; mutex_unlock(&callback_mutex); - if (!cpus_empty(trialcs.cpus_allowed) && balance_flag_changed) + if (!cpus_empty(trialcs->cpus_allowed) && balance_flag_changed) async_rebuild_sched_domains(); - return 0; +out: + free_trial_cpuset(trialcs); + return err; } /* @@ -1453,21 +1472,29 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, const char *buf) { int retval = 0; + struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *trialcs; if (!cgroup_lock_live_group(cgrp)) return -ENODEV; + trialcs = alloc_trial_cpuset(cs); + if (!trialcs) + return -ENOMEM; + switch (cft->private) { case FILE_CPULIST: - retval = update_cpumask(cgroup_cs(cgrp), buf); + retval = update_cpumask(cs, trialcs, buf); break; case FILE_MEMLIST: - retval = update_nodemask(cgroup_cs(cgrp), buf); + retval = update_nodemask(cs, trialcs, buf); break; default: retval = -EINVAL; break; } + + free_trial_cpuset(trialcs); cgroup_unlock(); return retval; } -- cgit v1.2.3 From 300ed6cbb70718872cb4936d1d22ef295f9ba44d Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 7 Jan 2009 18:08:44 -0800 Subject: cpuset: convert cpuset->cpus_allowed to cpumask_var_t Impact: use new cpumask API This patch mainly does the following things: - change cs->cpus_allowed from cpumask_t to cpumask_var_t - call alloc_bootmem_cpumask_var() for top_cpuset in cpuset_init_early() - call alloc_cpumask_var() for other cpusets - replace cpus_xxx() to cpumask_xxx() Signed-off-by: Li Zefan Cc: Ingo Molnar Cc: Rusty Russell Acked-by: Mike Travis Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 100 +++++++++++++++++++++++++++++++++----------------------- 1 file changed, 60 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index f66527bfd21..fc294aa9a97 100644 --- a/kernel/cpuset.c +++ 
b/kernel/cpuset.c @@ -84,7 +84,7 @@ struct cpuset { struct cgroup_subsys_state css; unsigned long flags; /* "unsigned long" so bitops work */ - cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ + cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ struct cpuset *parent; /* my parent */ @@ -195,8 +195,6 @@ static int cpuset_mems_generation; static struct cpuset top_cpuset = { .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), - .cpus_allowed = CPU_MASK_ALL, - .mems_allowed = NODE_MASK_ALL, }; /* @@ -278,7 +276,7 @@ static struct file_system_type cpuset_fs_type = { }; /* - * Return in *pmask the portion of a cpusets's cpus_allowed that + * Return in pmask the portion of a cpusets's cpus_allowed that * are online. If none are online, walk up the cpuset hierarchy * until we find one that does have some online cpus. If we get * all the way to the top and still haven't found any online cpus, @@ -293,13 +291,13 @@ static struct file_system_type cpuset_fs_type = { static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) { - while (cs && !cpus_intersects(cs->cpus_allowed, cpu_online_map)) + while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) cs = cs->parent; if (cs) - cpus_and(*pmask, cs->cpus_allowed, cpu_online_map); + cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); else - *pmask = cpu_online_map; - BUG_ON(!cpus_intersects(*pmask, cpu_online_map)); + cpumask_copy(pmask, cpu_online_mask); + BUG_ON(!cpumask_intersects(pmask, cpu_online_mask)); } /* @@ -409,7 +407,7 @@ void cpuset_update_task_memory_state(void) static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) { - return cpus_subset(p->cpus_allowed, q->cpus_allowed) && + return cpumask_subset(p->cpus_allowed, q->cpus_allowed) && nodes_subset(p->mems_allowed, q->mems_allowed) && is_cpu_exclusive(p) <= is_cpu_exclusive(q) && is_mem_exclusive(p) <= is_mem_exclusive(q); @@ -421,7 +419,19 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) */ static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs) { - return kmemdup(cs, sizeof(*cs), GFP_KERNEL); + struct cpuset *trial; + + trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL); + if (!trial) + return NULL; + + if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) { + kfree(trial); + return NULL; + } + cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); + + return trial; } /** @@ -430,6 +440,7 @@ static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs) */ static void free_trial_cpuset(struct cpuset *trial) { + free_cpumask_var(trial->cpus_allowed); kfree(trial); } @@ -482,7 +493,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) c = cgroup_cs(cont); if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && c != cur && - cpus_intersects(trial->cpus_allowed, c->cpus_allowed)) + cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) return -EINVAL; if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && c != cur && @@ -492,7 +503,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */ if (cgroup_task_count(cur->css.cgroup)) { - if (cpus_empty(trial->cpus_allowed) || + if (cpumask_empty(trial->cpus_allowed) || nodes_empty(trial->mems_allowed)) { return -ENOSPC; } @@ -507,7 +518,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) */ 
static int cpusets_overlap(struct cpuset *a, struct cpuset *b) { - return cpus_intersects(a->cpus_allowed, b->cpus_allowed); + return cpumask_intersects(a->cpus_allowed, b->cpus_allowed); } static void @@ -532,7 +543,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) cp = list_first_entry(&q, struct cpuset, stack_list); list_del(q.next); - if (cpus_empty(cp->cpus_allowed)) + if (cpumask_empty(cp->cpus_allowed)) continue; if (is_sched_load_balance(cp)) @@ -627,7 +638,7 @@ static int generate_sched_domains(cpumask_t **domains, *dattr = SD_ATTR_INIT; update_domain_attr_tree(dattr, &top_cpuset); } - *doms = top_cpuset.cpus_allowed; + cpumask_copy(doms, top_cpuset.cpus_allowed); ndoms = 1; goto done; @@ -646,7 +657,7 @@ static int generate_sched_domains(cpumask_t **domains, cp = list_first_entry(&q, struct cpuset, stack_list); list_del(q.next); - if (cpus_empty(cp->cpus_allowed)) + if (cpumask_empty(cp->cpus_allowed)) continue; /* @@ -739,7 +750,7 @@ restart: struct cpuset *b = csa[j]; if (apn == b->pn) { - cpus_or(*dp, *dp, b->cpus_allowed); + cpumask_or(dp, dp, b->cpus_allowed); if (dattr) update_domain_attr_tree(dattr + nslot, b); @@ -848,7 +859,7 @@ void rebuild_sched_domains(void) static int cpuset_test_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) { - return !cpus_equal(tsk->cpus_allowed, + return !cpumask_equal(&tsk->cpus_allowed, (cgroup_cs(scan->cg))->cpus_allowed); } @@ -866,7 +877,7 @@ static int cpuset_test_cpumask(struct task_struct *tsk, static void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) { - set_cpus_allowed_ptr(tsk, &((cgroup_cs(scan->cg))->cpus_allowed)); + set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed)); } /** @@ -916,13 +927,13 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * with tasks have cpus. 
*/ if (!*buf) { - cpus_clear(trialcs->cpus_allowed); + cpumask_clear(trialcs->cpus_allowed); } else { - retval = cpulist_parse(buf, &trialcs->cpus_allowed); + retval = cpulist_parse(buf, trialcs->cpus_allowed); if (retval < 0) return retval; - if (!cpus_subset(trialcs->cpus_allowed, cpu_online_map)) + if (!cpumask_subset(trialcs->cpus_allowed, cpu_online_mask)) return -EINVAL; } retval = validate_change(cs, trialcs); @@ -930,7 +941,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, return retval; /* Nothing to do if the cpus didn't change */ - if (cpus_equal(cs->cpus_allowed, trialcs->cpus_allowed)) + if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) return 0; retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); @@ -940,7 +951,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, is_load_balanced = is_sched_load_balance(trialcs); mutex_lock(&callback_mutex); - cs->cpus_allowed = trialcs->cpus_allowed; + cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); mutex_unlock(&callback_mutex); /* @@ -1028,7 +1039,7 @@ static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem) cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ fudge = 10; /* spare mmarray[] slots */ - fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */ + fudge += cpumask_weight(cs->cpus_allowed);/* imagine 1 fork-bomb/cpu */ retval = -ENOMEM; /* @@ -1176,7 +1187,8 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) if (val != cs->relax_domain_level) { cs->relax_domain_level = val; - if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) + if (!cpumask_empty(cs->cpus_allowed) && + is_sched_load_balance(cs)) async_rebuild_sched_domains(); } @@ -1219,7 +1231,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, cs->flags = trialcs->flags; mutex_unlock(&callback_mutex); - if (!cpus_empty(trialcs->cpus_allowed) && balance_flag_changed) + if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) async_rebuild_sched_domains(); out: @@ -1335,12 +1347,12 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cpuset *cs = cgroup_cs(cont); int ret = 0; - if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) + if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) return -ENOSPC; if (tsk->flags & PF_THREAD_BOUND) { mutex_lock(&callback_mutex); - if (!cpus_equal(tsk->cpus_allowed, cs->cpus_allowed)) + if (!cpumask_equal(&tsk->cpus_allowed, cs->cpus_allowed)) ret = -EINVAL; mutex_unlock(&callback_mutex); } @@ -1516,7 +1528,7 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) int ret; mutex_lock(&callback_mutex); - ret = cpulist_scnprintf(page, PAGE_SIZE, &cs->cpus_allowed); + ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed); mutex_unlock(&callback_mutex); return ret; @@ -1755,7 +1767,7 @@ static void cpuset_post_clone(struct cgroup_subsys *ss, parent_cs = cgroup_cs(parent); cs->mems_allowed = parent_cs->mems_allowed; - cs->cpus_allowed = parent_cs->cpus_allowed; + cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed); return; } @@ -1781,6 +1793,10 @@ static struct cgroup_subsys_state *cpuset_create( cs = kmalloc(sizeof(*cs), GFP_KERNEL); if (!cs) return ERR_PTR(-ENOMEM); + if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) { + kfree(cs); + return ERR_PTR(-ENOMEM); + } cpuset_update_task_memory_state(); cs->flags = 0; @@ -1789,7 +1805,7 @@ static struct cgroup_subsys_state *cpuset_create( if (is_spread_slab(parent)) 
set_bit(CS_SPREAD_SLAB, &cs->flags); set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); - cpus_clear(cs->cpus_allowed); + cpumask_clear(cs->cpus_allowed); nodes_clear(cs->mems_allowed); cs->mems_generation = cpuset_mems_generation++; fmeter_init(&cs->fmeter); @@ -1816,6 +1832,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); number_of_cpusets--; + free_cpumask_var(cs->cpus_allowed); kfree(cs); } @@ -1839,6 +1856,8 @@ struct cgroup_subsys cpuset_subsys = { int __init cpuset_init_early(void) { + alloc_bootmem_cpumask_var(&top_cpuset.cpus_allowed); + top_cpuset.mems_generation = cpuset_mems_generation++; return 0; } @@ -1854,7 +1873,7 @@ int __init cpuset_init(void) { int err = 0; - cpus_setall(top_cpuset.cpus_allowed); + cpumask_setall(top_cpuset.cpus_allowed); nodes_setall(top_cpuset.mems_allowed); fmeter_init(&top_cpuset.fmeter); @@ -1943,7 +1962,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) * has online cpus, so can't be empty). */ parent = cs->parent; - while (cpus_empty(parent->cpus_allowed) || + while (cpumask_empty(parent->cpus_allowed) || nodes_empty(parent->mems_allowed)) parent = parent->parent; @@ -1984,7 +2003,7 @@ static void scan_for_empty_cpusets(struct cpuset *root) } /* Continue past cpusets with all cpus, mems online */ - if (cpus_subset(cp->cpus_allowed, cpu_online_map) && + if (cpumask_subset(cp->cpus_allowed, cpu_online_mask) && nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) continue; @@ -1992,13 +2011,14 @@ static void scan_for_empty_cpusets(struct cpuset *root) /* Remove offline cpus and mems from this cpuset. */ mutex_lock(&callback_mutex); - cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map); + cpumask_and(cp->cpus_allowed, cp->cpus_allowed, + cpu_online_mask); nodes_and(cp->mems_allowed, cp->mems_allowed, node_states[N_HIGH_MEMORY]); mutex_unlock(&callback_mutex); /* Move tasks from the empty cpuset to a parent */ - if (cpus_empty(cp->cpus_allowed) || + if (cpumask_empty(cp->cpus_allowed) || nodes_empty(cp->mems_allowed)) remove_tasks_in_empty_cpuset(cp); else { @@ -2039,7 +2059,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, } cgroup_lock(); - top_cpuset.cpus_allowed = cpu_online_map; + cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); scan_for_empty_cpusets(&top_cpuset); ndoms = generate_sched_domains(&doms, &attr); cgroup_unlock(); @@ -2084,7 +2104,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self, void __init cpuset_init_smp(void) { - top_cpuset.cpus_allowed = cpu_online_map; + cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; hotcpu_notifier(cpuset_track_online_cpus, 0); @@ -2096,7 +2116,7 @@ void __init cpuset_init_smp(void) * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. * @pmask: pointer to cpumask_t variable to receive cpus_allowed set. * - * Description: Returns the cpumask_t cpus_allowed of the cpuset + * Description: Returns the cpumask_var_t cpus_allowed of the cpuset * attached to the specified @tsk. Guaranteed to return some non-empty * subset of cpu_online_map, even if this means going outside the * tasks cpuset. 
-- cgit v1.2.3 From 6af866af34a96fed24a55979a78b6f73bd4e8e87 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 7 Jan 2009 18:08:45 -0800 Subject: cpuset: remove remaining pointers to cpumask_t Impact: cleanups, use new cpumask API Final trivial cleanups: mainly s/cpumask_t/struct cpumask Note there is a FIXME in generate_sched_domains(). A future patch will change struct cpumask *doms to struct cpumask *doms[]. (I suppose Rusty will do this.) Signed-off-by: Li Zefan Cc: Ingo Molnar Cc: Rusty Russell Acked-by: Mike Travis Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index fc294aa9a97..647c77a88fc 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -289,7 +289,8 @@ static struct file_system_type cpuset_fs_type = { * Call with callback_mutex held. */ -static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) +static void guarantee_online_cpus(const struct cpuset *cs, + struct cpumask *pmask) { while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) cs = cs->parent; @@ -610,7 +611,8 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) * element of the partition (one sched domain) to be passed to * partition_sched_domains(). */ -static int generate_sched_domains(cpumask_t **domains, +/* FIXME: see the FIXME in partition_sched_domains() */ +static int generate_sched_domains(struct cpumask **domains, struct sched_domain_attr **attributes) { LIST_HEAD(q); /* queue of cpusets to be scanned */ @@ -618,10 +620,10 @@ static int generate_sched_domains(cpumask_t **domains, struct cpuset **csa; /* array of all cpuset ptrs */ int csn; /* how many cpuset ptrs in csa so far */ int i, j, k; /* indices for partition finding loops */ - cpumask_t *doms; /* resulting partition; i.e. sched domains */ + struct cpumask *doms; /* resulting partition; i.e. sched domains */ struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ - int nslot; /* next empty doms[] cpumask_t slot */ + int nslot; /* next empty doms[] struct cpumask slot */ doms = NULL; dattr = NULL; @@ -629,7 +631,7 @@ static int generate_sched_domains(cpumask_t **domains, /* Special case for the 99% of systems with one, full, sched domain */ if (is_sched_load_balance(&top_cpuset)) { - doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); + doms = kmalloc(cpumask_size(), GFP_KERNEL); if (!doms) goto done; @@ -708,7 +710,7 @@ restart: * Now we know how many domains to create. * Convert to and populate cpu masks. 
*/ - doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); + doms = kmalloc(ndoms * cpumask_size(), GFP_KERNEL); if (!doms) goto done; @@ -720,7 +722,7 @@ restart: for (nslot = 0, i = 0; i < csn; i++) { struct cpuset *a = csa[i]; - cpumask_t *dp; + struct cpumask *dp; int apn = a->pn; if (apn < 0) { @@ -743,7 +745,7 @@ restart: continue; } - cpus_clear(*dp); + cpumask_clear(dp); if (dattr) *(dattr + nslot) = SD_ATTR_INIT; for (j = i; j < csn; j++) { @@ -790,7 +792,7 @@ done: static void do_rebuild_sched_domains(struct work_struct *unused) { struct sched_domain_attr *attr; - cpumask_t *doms; + struct cpumask *doms; int ndoms; get_online_cpus(); @@ -2044,7 +2046,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, unsigned long phase, void *unused_cpu) { struct sched_domain_attr *attr; - cpumask_t *doms; + struct cpumask *doms; int ndoms; switch (phase) { @@ -2114,7 +2116,7 @@ void __init cpuset_init_smp(void) /** * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. - * @pmask: pointer to cpumask_t variable to receive cpus_allowed set. + * @pmask: pointer to struct cpumask variable to receive cpus_allowed set. * * Description: Returns the cpumask_var_t cpus_allowed of the cpuset * attached to the specified @tsk. Guaranteed to return some non-empty @@ -2122,7 +2124,7 @@ void __init cpuset_init_smp(void) * tasks cpuset. **/ -void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask) +void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) { mutex_lock(&callback_mutex); cpuset_cpus_allowed_locked(tsk, pmask); @@ -2133,7 +2135,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask) * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. * Must be called with callback_mutex held. **/ -void cpuset_cpus_allowed_locked(struct task_struct *tsk, cpumask_t *pmask) +void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask) { task_lock(tsk); guarantee_online_cpus(task_cs(tsk), pmask); -- cgit v1.2.3 From 61bce0f1371cfff497fe85594fd39d1a0b15ebe1 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 7 Jan 2009 18:08:49 -0800 Subject: pid: generalize task_active_pid_ns Currently task_active_pid_ns is not safe to call after a task becomes a zombie and exit_task_namespaces is called, as nsproxy becomes NULL. By reading the pid namespace from the pid of the task we can trivially solve this problem at the cost of one extra memory read in what should be the same cacheline as we read the namespace from. When moving things around I have made task_active_pid_ns out of line because keeping it in pid_namespace.h would require adding includes of pid.h and sched.h that I don't think we want. This change does make task_active_pid_ns unsafe to call during copy_process until we attach a pid on the task_struct which seems to be a reasonable trade off. Signed-off-by: Eric W. 
Biederman Signed-off-by: Sukadev Bhattiprolu Cc: Oleg Nesterov Cc: Roland McGrath Cc: Bastian Blank Cc: Pavel Emelyanov Cc: Nadia Derbey Acked-by: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 4 ++-- kernel/pid.c | 6 ++++++ 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 7b8f2a78be3..4018308048c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1126,12 +1126,12 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (pid != &init_struct_pid) { retval = -ENOMEM; - pid = alloc_pid(task_active_pid_ns(p)); + pid = alloc_pid(p->nsproxy->pid_ns); if (!pid) goto bad_fork_cleanup_io; if (clone_flags & CLONE_NEWPID) { - retval = pid_ns_prepare_proc(task_active_pid_ns(p)); + retval = pid_ns_prepare_proc(p->nsproxy->pid_ns); if (retval < 0) goto bad_fork_free_pid; } diff --git a/kernel/pid.c b/kernel/pid.c index af9224cdd6c..1b3586fe753 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -474,6 +474,12 @@ pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) } EXPORT_SYMBOL(task_session_nr_ns); +struct pid_namespace *task_active_pid_ns(struct task_struct *tsk) +{ + return ns_of_pid(task_pid(tsk)); +} +EXPORT_SYMBOL_GPL(task_active_pid_ns); + /* * Used by proc to find the first pid that is greater than or equal to nr. * -- cgit v1.2.3 From df4927bf6ccf6278a97a44bd107080c71b269cb5 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 7 Jan 2009 18:09:14 -0800 Subject: generic swap(): sched: remove local swap() macro Use the new generic implementation. Signed-off-by: Wu Fengguang Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched_fair.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index e0c0b4bc3f0..8e1352c7555 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1617,8 +1617,6 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) } } -#define swap(a, b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0) - /* * Share the fairness runtime between parent and child, thus the * total amount of pressure for CPU stays equal - new tasks -- cgit v1.2.3 From 33b04b9308959af7febc1c111c766fa3fd8b1934 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Thu, 8 Jan 2009 12:35:11 -0800 Subject: async: make async_synchronize_full() more serializing It turns out that there are real problems with allowing async tasks that are scheduled from async tasks to run after async_synchronize_full() returns. This patch makes the _full variant stricter, a complete synchronization. Later I might need to add back a lighter form of synchronization for other uses... but not right now.
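The shape of that stricter synchronization - keep waiting until both the pending and the running queues are empty, because a running job may have queued more work - can be modeled in a few lines. A hedged pthread-based sketch (invented names; this is not the kernel implementation, which loops on async_synchronize_cookie() instead, as the diff below shows):

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  idle = PTHREAD_COND_INITIALIZER;
static int pending, running;    /* stand-ins for the two kernel lists */

/*
 * Wait until *both* counts are zero. A single wait is not enough:
 * a job that drops `running' may already have bumped `pending',
 * so the whole condition is re-checked on every wakeup.
 */
static void synchronize_full(void)
{
        pthread_mutex_lock(&lock);
        while (pending || running)
                pthread_cond_wait(&idle, &lock);
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        synchronize_full();     /* returns immediately: nothing queued */
        return 0;
}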
Signed-off-by: Arjan van de Ven Signed-off-by: Linus Torvalds --- kernel/async.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/async.c b/kernel/async.c index 97373380c9e..64cc916299a 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -206,7 +206,9 @@ EXPORT_SYMBOL_GPL(async_schedule_special); void async_synchronize_full(void) { - async_synchronize_cookie(next_cookie); + do { + async_synchronize_cookie(next_cookie); + } while (!list_empty(&async_running) || !list_empty(&async_pending)); } EXPORT_SYMBOL_GPL(async_synchronize_full); -- cgit v1.2.3 From 0de336814107358bc8c4173bf9ce2d42445173fe Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 9 Jan 2009 16:13:41 +0000 Subject: CRED: Missing put_cred() in prepare_kernel_cred() Missing put_cred() in the error handling path of prepare_kernel_cred(). Signed-off-by: David Howells Acked-by: Steve Dickson Signed-off-by: Linus Torvalds --- kernel/cred.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index ff7bc071991..fc222e4acfb 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -529,6 +529,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) error: put_cred(new); + put_cred(old); return NULL; } EXPORT_SYMBOL(prepare_kernel_cred); -- cgit v1.2.3 From 43529c97122f2c851126447963eedcb8cba74fbe Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 9 Jan 2009 16:13:46 +0000 Subject: CRED: Must initialise the new creds in prepare_kernel_cred() The newly allocated creds in prepare_kernel_cred() must be initialised before get_uid() and get_group_info() can access them. They should be copied from the old credentials. Reported-by: Steve Dickson Signed-off-by: David Howells Acked-by: Steve Dickson Signed-off-by: Linus Torvalds --- kernel/cred.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index fc222e4acfb..043f78c133c 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -506,6 +506,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) else old = get_cred(&init_cred); + *new = *old; get_uid(new->user); get_group_info(new->group_info); -- cgit v1.2.3 From cdb80f630be5cbc23d82331f24dc4704f75b64f4 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 9 Jan 2009 13:23:45 -0800 Subject: async: make async a command line option for now ... and have it default off. This does allow people to work with it for testing. Signed-off-by: Arjan van de Ven --- kernel/async.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/async.c b/kernel/async.c index 64cc916299a..f286e9f2b73 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -65,6 +65,8 @@ static LIST_HEAD(async_pending); static LIST_HEAD(async_running); static DEFINE_SPINLOCK(async_lock); +static int async_enabled = 0; + struct async_entry { struct list_head list; async_cookie_t cookie; @@ -169,7 +171,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l * If we're out of memory or if there's too much work * pending already, we execute synchronously. 
*/ - if (!entry || atomic_read(&entry_count) > MAX_WORK) { + if (!async_enabled || !entry || atomic_read(&entry_count) > MAX_WORK) { kfree(entry); spin_lock_irqsave(&async_lock, flags); newcookie = next_cookie++; @@ -316,8 +318,18 @@ static int async_manager_thread(void *unused) static int __init async_init(void) { - kthread_run(async_manager_thread, NULL, "async/mgr"); + if (async_enabled) + kthread_run(async_manager_thread, NULL, "async/mgr"); return 0; } +static int __init setup_async(char *str) +{ + async_enabled = 1; + return 1; +} + +__setup("fastboot", setup_async); + + core_initcall(async_init); -- cgit v1.2.3 From 62ea9ceb17a74bc7544211bfeecf4170c554ac4f Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sun, 11 Jan 2009 01:04:16 +0100 Subject: cpumask: fix CONFIG_NUMA=y sched.c Impact: fix panic on ia64 with NR_CPUS=1024 struct sched_domain is now a dangling structure; where we really want static ones, we need to use static_sched_domain. (As the FIXME in this file says, cpumask_var_t would be better, but this code is hairy enough without trying to add initialization code to the right places). Reported-by: Mike Travis Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index deb5ac8c12f..f0c0a81d763 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7282,10 +7282,10 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, * groups, so roll our own. Now each node has its own list of groups which * gets dynamically allocated. */ -static DEFINE_PER_CPU(struct sched_domain, node_domains); +static DEFINE_PER_CPU(struct static_sched_domain, node_domains); static struct sched_group ***sched_group_nodes_bycpu; -static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); +static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains); static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, @@ -7560,7 +7560,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, #ifdef CONFIG_NUMA if (cpumask_weight(cpu_map) > SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) { - sd = &per_cpu(allnodes_domains, i); + sd = &per_cpu(allnodes_domains, i).sd; SD_INIT(sd, ALLNODES); set_domain_attribute(sd, attr); cpumask_copy(sched_domain_span(sd), cpu_map); @@ -7570,7 +7570,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, } else p = NULL; - sd = &per_cpu(node_domains, i); + sd = &per_cpu(node_domains, i).sd; SD_INIT(sd, NODE); set_domain_attribute(sd, attr); sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); @@ -7688,7 +7688,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, for_each_cpu(j, nodemask) { struct sched_domain *sd; - sd = &per_cpu(node_domains, j); + sd = &per_cpu(node_domains, j).sd; sd->groups = sg; } sg->__cpu_power = 0; -- cgit v1.2.3 From 805194c35b91999b139e4d6b6145f4f84fd4c814 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sat, 10 Jan 2009 15:43:15 +0800 Subject: sched: partly revert "sched debug: remove NULL checking in print_cfs_rt_rq()" Impact: avoid accessing NULL tg.css->cgroup In commit 0a0db8f5c9d4bbb9bbfcc2b6cb6bce2d0ef4d73d, I removed checking NULL tg.css->cgroup, but I realized I was wrong when I found reading /proc/sched_debug can race with cgroup_create(). 
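The race being guarded against here boils down to a reader following a pointer that the creating side publishes last. A simplified sketch of the reader-side guard (illustrative types only; the real fix, in kernel/sched_debug.c, follows in the diff below):

#include <stdio.h>

struct cgroup { char name[64]; };

struct task_group {
        struct cgroup *cgroup;  /* attached late, during cgroup_create() */
};

/* Reader: tolerate a task_group whose cgroup is not hooked up yet. */
static void task_group_path(struct task_group *tg, char *buf, size_t buflen)
{
        if (!tg->cgroup) {      /* mid-creation: report an empty path */
                buf[0] = '\0';
                return;
        }
        snprintf(buf, buflen, "/%s", tg->cgroup->name);
}

int main(void)
{
        struct task_group tg = { 0 };   /* simulates a half-created group */
        char path[128];

        task_group_path(&tg, path, sizeof(path));
        printf("path='%s'\n", path);    /* empty path, no NULL dereference */
        return 0;
}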
Signed-off-by: Li Zefan Signed-off-by: Ingo Molnar --- kernel/sched_debug.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 4293cfa9681..16eeba4e416 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -145,6 +145,19 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) read_unlock_irqrestore(&tasklist_lock, flags); } +#if defined(CONFIG_CGROUP_SCHED) && \ + (defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)) +static void task_group_path(struct task_group *tg, char *buf, int buflen) +{ + /* may be NULL if the underlying cgroup isn't fully-created yet */ + if (!tg->css.cgroup) { + buf[0] = '\0'; + return; + } + cgroup_path(tg->css.cgroup, buf, buflen); +} +#endif + void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, @@ -154,10 +167,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) unsigned long flags; #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) - char path[128] = ""; + char path[128]; struct task_group *tg = cfs_rq->tg; - cgroup_path(tg->css.cgroup, path, sizeof(path)); + task_group_path(tg, path, sizeof(path)); SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); #elif defined(CONFIG_USER_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) @@ -208,10 +221,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) { #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED) - char path[128] = ""; + char path[128]; struct task_group *tg = rt_rq->tg; - cgroup_path(tg->css.cgroup, path, sizeof(path)); + task_group_path(tg, path, sizeof(path)); SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path); #else -- cgit v1.2.3 From 53ce3d9564908794ae7dd32969089b57df5fc098 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 9 Jan 2009 12:27:08 -0800 Subject: smp_call_function_single(): be slightly less stupid If you do smp_call_function_single(expression-with-side-effects, ...) then expression-with-side-effects never gets evaluated on UP builds. As always, implementing it in C is the correct thing to do. While we're there, uninline it for size and possible header dependency reasons. And create a new kernel/up.c, as a place in which to put uniprocessor-specific code and storage. It should mirror kernel/smp.c. Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/Makefile | 6 +++++- kernel/up.c | 18 ++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) create mode 100644 kernel/up.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 2921d90ce32..2aebc4cd787 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -40,7 +40,11 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o -obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o +ifeq ($(CONFIG_USE_GENERIC_SMP_HELPERS),y) +obj-y += smp.o +else +obj-y += up.o +endif obj-$(CONFIG_SMP) += spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o diff --git a/kernel/up.c b/kernel/up.c new file mode 100644 index 00000000000..ce62cc9e9f7 --- /dev/null +++ b/kernel/up.c @@ -0,0 +1,18 @@ +/* + * Uniprocessor-only support functions. 
The counterpart to kernel/smp.c + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/smp.h> + +int smp_call_function_single(int cpu, void (*func) (void *info), void *info, + int wait) +{ + WARN_ON(cpuid != 0); + local_irq_disable(); + (func)(info); + local_irq_enable(); + return 0; +} +EXPORT_SYMBOL(smp_call_function_single); -- cgit v1.2.3 From 93423b8665f43a0c7a006a1d5be048b99db56d32 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 11 Jan 2009 05:15:21 +0100 Subject: smp_call_function_single(): be slightly less stupid, fix Impact: build fix on Alpha kernel/up.c: In function 'smp_call_function_single': kernel/up.c:12: error: 'cpuid' undeclared (first use in this function) kernel/up.c:12: error: (Each undeclared identifier is reported only once kernel/up.c:12: error: for each function it appears in.) The typo didn't show up on x86 because 'cpuid' happens to be a function address as well ... Signed-off-by: Ingo Molnar --- kernel/up.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/up.c b/kernel/up.c index ce62cc9e9f7..c04b9dcfceb 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -9,10 +9,12 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, int wait) { - WARN_ON(cpuid != 0); + WARN_ON(cpu != 0); + local_irq_disable(); (func)(info); local_irq_enable(); + return 0; } EXPORT_SYMBOL(smp_call_function_single); -- cgit v1.2.3 From fd2ab30b65e961b974ae0bc71e0d47d6b35e0968 Mon Sep 17 00:00:00 2001 From: Steven Noonan Date: Sun, 11 Jan 2009 01:04:22 -0800 Subject: kernel/sched.c: add missing forward declaration for 'double_rq_lock' Impact: build fix on certain configs Added 'double_rq_lock' forward declaration, allowing double_rq_lock to be used in _double_lock_balance(). Signed-off-by: Steven Noonan Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f0c0a81d763..8be2c13b50d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -125,6 +125,9 @@ DEFINE_TRACE(sched_switch); DEFINE_TRACE(sched_migrate_task); #ifdef CONFIG_SMP + +static void double_rq_lock(struct rq *rq1, struct rq *rq2); + /* * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) * Since cpu_power is a 'constant', we can use a reciprocal divide. -- cgit v1.2.3 From 783adf42cf039083dd3c734c07c3bdc707e2bb15 Mon Sep 17 00:00:00 2001 From: Steven Noonan Date: Sun, 11 Jan 2009 01:04:21 -0800 Subject: kernel/fork.c: unused variable 'ret' Removed the unused variable. Signed-off-by: Steven Noonan Signed-off-by: Ingo Molnar --- kernel/fork.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index e995899ea83..81da4aae85c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -817,7 +817,6 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) { struct signal_struct *sig; - int ret; if (clone_flags & CLONE_THREAD) { atomic_inc(&current->signal->count); -- cgit v1.2.3 From 01e3eb82278bf45221fc38b391bc5ee0f6a314d6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 Jan 2009 13:00:50 +0100 Subject: Revert "sched: improve preempt debugging" This reverts commit 7317d7b87edb41a9135e30be1ec3f7ef817c53dd. This has been reported (and bisected) by Alexey Zaytsev and Kamalesh Babulal to produce annoying warnings during bootup on both x86 and powerpc.
kernel_locked() is not a valid test in IRQ context (we update the BKL's ->lock_depth and the preempt count separately and non-atomically), so we cannot put it into the generic preempt debugging checks, which can run in IRQ contexts too. Reported-and-bisected-by: Alexey Zaytsev Reported-and-bisected-by: Kamalesh Babulal Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 8be2c13b50d..3b630d88266 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4440,7 +4440,7 @@ void __kprobes sub_preempt_count(int val) /* * Underflow? */ - if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked()))) + if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) return; /* * Is the spinlock portion underflowing? -- cgit v1.2.3 From 6e96281412f2f757abe623e08a9577e2bbd3402f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 Jan 2009 16:04:37 +0100 Subject: smp_call_function_single(): be slightly less stupid, fix #2 Fix m68k build failure: tip/kernel/up.c: In function 'smp_call_function_single': tip/kernel/up.c:16: error: dereferencing pointer to incomplete type make[2]: *** [kernel/up.o] Error 1 Signed-off-by: Ingo Molnar --- kernel/up.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/up.c b/kernel/up.c index c04b9dcfceb..1ff27a28bb7 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -2,6 +2,7 @@ * Uniprocessor-only support functions. The counterpart to kernel/smp.c */ +#include <linux/interrupt.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/smp.h> -- cgit v1.2.3 From 37a76bd4f1b716949fc38a6842e89f0ccb8384d0 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 11 Jan 2009 15:35:01 +0000 Subject: async: fix __lowest_in_progress() At 37000 feet somewhere near Greenland I woke up from a half-sleep with the realisation that __lowest_in_progress() is buggy. After landing I checked and there were indeed 2 problems with it; this patch fixes both (see the sketch below): * The order of the list checks was wrong * The locking was not correct.
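Why the order of the two checks matters: cookies are handed out in increasing order, and entries migrate from the pending list to the running list as they start, so the running list always holds the oldest (lowest) outstanding cookies. A minimal model of the corrected logic (not the kernel code; the empty-case return value is simplified here):

#include <limits.h>
#include <stdio.h>

struct entry { unsigned long long cookie; struct entry *next; };

/*
 * Running entries are older (lower cookies) than pending ones,
 * so the running list must be consulted first.
 */
static unsigned long long lowest_in_progress(const struct entry *running,
                                             const struct entry *pending)
{
        if (running)
                return running->cookie;
        if (pending)
                return pending->cookie;
        return ULLONG_MAX;      /* the kernel returns the next cookie */
}

int main(void)
{
        struct entry run = { 3, NULL }, pend = { 7, NULL };

        /* A waiter for cookie 5 must keep waiting while 3 still runs: */
        printf("%llu\n", lowest_in_progress(&run, &pend));      /* 3 */
        /* Checking pending first would wrongly have reported 7. */
        return 0;
}

The second bug is addressed in the patch by the new lowest_in_progress() wrapper, which takes async_lock around the list peeks; the model above simply assumes the caller holds the equivalent lock.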
Signed-off-by: Arjan van de Ven Signed-off-by: Linus Torvalds --- kernel/async.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/async.c b/kernel/async.c index f286e9f2b73..608b32b4281 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -90,12 +90,12 @@ extern int initcall_debug; static async_cookie_t __lowest_in_progress(struct list_head *running) { struct async_entry *entry; - if (!list_empty(&async_pending)) { - entry = list_first_entry(&async_pending, + if (!list_empty(running)) { + entry = list_first_entry(running, struct async_entry, list); return entry->cookie; - } else if (!list_empty(running)) { - entry = list_first_entry(running, + } else if (!list_empty(&async_pending)) { + entry = list_first_entry(&async_pending, struct async_entry, list); return entry->cookie; } else { @@ -104,6 +104,17 @@ static async_cookie_t __lowest_in_progress(struct list_head *running) } } + +static async_cookie_t lowest_in_progress(struct list_head *running) +{ + unsigned long flags; + async_cookie_t ret; + + spin_lock_irqsave(&async_lock, flags); + ret = __lowest_in_progress(running); + spin_unlock_irqrestore(&async_lock, flags); + return ret; +} /* * pick the first pending entry and run it */ @@ -229,7 +240,7 @@ void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *r starttime = ktime_get(); } - wait_event(async_done, __lowest_in_progress(running) >= cookie); + wait_event(async_done, lowest_in_progress(running) >= cookie); if (initcall_debug && system_state == SYSTEM_BOOTING) { endtime = ktime_get(); -- cgit v1.2.3 From e4fa4c97016037620f9dc8bafe03e1086b665b4c Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 14 Jan 2009 14:58:15 +0800 Subject: rcu: add __cpuinit to rcu_init_percpu_data() Impact: reduce memory footprint Add __cpuinit to rcu_init_percpu_data(); this function's text will then be discarded after boot when !CONFIG_HOTPLUG_CPU. Signed-off-by: Lai Jiangshan Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 2 +- kernel/rcutree.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 490934fc7ac..bd5a9003497 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -716,7 +716,7 @@ void rcu_check_callbacks(int cpu, int user) raise_rcu_softirq(); } -static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, +static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { unsigned long flags; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index f2d8638e6c6..b2fd602a6f6 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1314,7 +1314,7 @@ int rcu_needs_cpu(int cpu) * access due to the fact that this CPU cannot possibly have any RCU * callbacks in flight yet. */ -static void +static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_state *rsp) { unsigned long flags; -- cgit v1.2.3 From baf48f6577e581a9adb8fe849dc80e24b21d171d Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Mon, 12 Jan 2009 21:15:17 -0800 Subject: softlockup: fix false panic which can occur if softlockup_thresh is reduced At run-time, if softlockup_thresh is changed to a much lower value, touch_timestamp is likely to be much older than the new softlockup_thresh. This will cause a false softlockup to be detected. If softlockup_panic is enabled, the system will panic. The fix is to touch all watchdogs before changing softlockup_thresh.
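The false positive being fixed is plain arithmetic on a stale timestamp: the watchdog fires when now - touch_timestamp exceeds the threshold, so shrinking the threshold can make a perfectly healthy CPU look hung. A toy model with made-up numbers:

#include <stdio.h>
#include <stdbool.h>

static unsigned long now = 100, touch_timestamp = 40, thresh = 120;

static bool looks_hung(void)
{
        return now - touch_timestamp > thresh;
}

int main(void)
{
        printf("thresh=%3lu hung=%d\n", thresh, looks_hung()); /* 0: healthy */

        thresh = 30;    /* lowered while the timestamp is stale */
        printf("thresh=%3lu hung=%d\n", thresh, looks_hung()); /* 1: false positive */

        touch_timestamp = now;  /* the fix: refresh the stamp first */
        printf("thresh=%3lu hung=%d\n", thresh, looks_hung()); /* 0: healthy again */
        return 0;
}

That is exactly what the new proc handler in the diff below does: touch_all_softlockup_watchdogs() runs before proc_dointvec_minmax() stores the new threshold.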
Signed-off-by: Mandeep Singh Baines Signed-off-by: Ingo Molnar --- kernel/softlockup.c | 9 +++++++++ kernel/sysctl.c | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/softlockup.c b/kernel/softlockup.c index d9188c66278..85d5a245510 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -16,6 +16,7 @@ #include <linux/lockdep.h> #include <linux/notifier.h> #include <linux/module.h> +#include <linux/sysctl.h> #include <asm/irq_regs.h> @@ -88,6 +89,14 @@ void touch_all_softlockup_watchdogs(void) } EXPORT_SYMBOL(touch_all_softlockup_watchdogs); +int proc_dosoftlockup_thresh(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + touch_all_softlockup_watchdogs(); + return proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); +} + /* * This callback runs from the timer interrupt, and checks * whether the watchdog thread has hung or not: diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 89d74436318..596dc31a711 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -800,7 +800,7 @@ static struct ctl_table kern_table[] = { .data = &softlockup_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, + .proc_handler = &proc_dosoftlockup_thresh, .strategy = &sysctl_intvec, .extra1 = &neg_one, .extra2 = &sixty, -- cgit v1.2.3 From 14819ea1e0bcbdc9b084cd60a6a24d5d786324ef Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 14 Jan 2009 12:34:21 +0100 Subject: irq: export __set_irq_handler() and handle_level_irq() Impact: build fix ARM updates broke x86 allmodconfig builds: ERROR: "__set_irq_handler" [drivers/mfd/pcf50633-core.ko] undefined! ERROR: "handle_level_irq" [drivers/mfd/pcf50633-core.ko] undefined! Signed-off-by: Ingo Molnar --- kernel/irq/chip.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index f63c706d25e..7de11bd64df 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -383,6 +383,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) out_unlock: spin_unlock(&desc->lock); } +EXPORT_SYMBOL_GPL(handle_level_irq); /** * handle_fasteoi_irq - irq handler for transparent controllers @@ -593,6 +594,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, } spin_unlock_irqrestore(&desc->lock, flags); } +EXPORT_SYMBOL_GPL(__set_irq_handler); void set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip, -- cgit v1.2.3 From 2ed7c03ec17779afb4fcfa3b8c61df61bd4879ba Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:13:54 +0100 Subject: [CVE-2009-0029] Convert all system calls to return a long Convert all system calls to return a long. This should be a NOP since all converted types should have the same size anyway. The exception is sys_exit_group, which returned void - but that doesn't matter, since that system call doesn't return. Signed-off-by: Heiko Carstens --- kernel/exit.c | 4 +++- kernel/signal.c | 2 +- kernel/timer.c | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index c7740fa3252..fac9b040af2 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1182,9 +1182,11 @@ do_group_exit(int exit_code) * wait4()-ing process will get the correct exit code - even if this * thread is not the thread group leader.
*/ -asmlinkage void sys_exit_group(int error_code) +asmlinkage long sys_exit_group(int error_code) { do_group_exit((error_code & 0xff) << 8); + /* NOTREACHED */ + return 0; } static struct pid *task_pid_type(struct task_struct *task, enum pid_type type) diff --git a/kernel/signal.c b/kernel/signal.c index 3152ac3b62e..856a5479d49 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2559,7 +2559,7 @@ sys_ssetmask(int newmask) /* * For backwards compatibility. Functionality superseded by sigaction. */ -asmlinkage unsigned long +asmlinkage long sys_signal(int sig, __sighandler_t handler) { struct k_sigaction new_sa, old_sa; diff --git a/kernel/timer.c b/kernel/timer.c index dee3f641a7a..7b8697d7f04 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1129,7 +1129,7 @@ void do_timer(unsigned long ticks) * For backwards compatibility? This can be done in libc so Alpha * and all newer ports shouldn't need it. */ -asmlinkage unsigned long sys_alarm(unsigned int seconds) +asmlinkage long sys_alarm(unsigned int seconds) { return alarm_setitimer(seconds); } -- cgit v1.2.3 From f627a741d24f12955fa2d9f8831c3b12860635bd Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:13:58 +0100 Subject: [CVE-2009-0029] Make sys_syslog a conditional system call Remove the -ENOSYS implementation for !CONFIG_PRINTK and use the cond_syscall infrastructure instead. Acked-by: Kyle McMartin Signed-off-by: Heiko Carstens --- kernel/printk.c | 5 ----- kernel/sys_ni.c | 1 + 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 7015733793e..e48cf33783f 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -742,11 +742,6 @@ EXPORT_SYMBOL(vprintk); #else -asmlinkage long sys_syslog(int type, char __user *buf, int len) -{ - return -ENOSYS; -} - static void call_console_drivers(unsigned start, unsigned end) { } diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index e14a2328170..27dad296738 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -131,6 +131,7 @@ cond_syscall(sys_io_destroy); cond_syscall(sys_io_submit); cond_syscall(sys_io_cancel); cond_syscall(sys_io_getevents); +cond_syscall(sys_syslog); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); -- cgit v1.2.3 From 58fd3aa288939d3097fa04505b25c2f5e6e144d1 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:03 +0100 Subject: [CVE-2009-0029] System call wrappers part 01 Signed-off-by: Heiko Carstens --- kernel/hrtimer.c | 4 ++-- kernel/sys.c | 2 +- kernel/time.c | 14 +++++++------- kernel/timer.c | 6 +++--- 4 files changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 1455b7651b6..2dc30c59c5f 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1467,8 +1467,8 @@ out: return ret; } -asmlinkage long -sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp) +SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, + struct timespec __user *, rmtp) { struct timespec tu; diff --git a/kernel/sys.c b/kernel/sys.c index 763c3c17ded..37165e55233 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -919,7 +919,7 @@ void do_sys_times(struct tms *tms) tms->tms_cstime = cputime_to_clock_t(cstime); } -asmlinkage long sys_times(struct tms __user * tbuf) +SYSCALL_DEFINE1(times, struct tms __user *, tbuf) { if (tbuf) { struct tms tmp; diff --git a/kernel/time.c b/kernel/time.c index 4886e3ce83a..29511943871 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -60,7 +60,7 @@ 
EXPORT_SYMBOL(sys_tz); * why not move it into the appropriate arch directory (for those * architectures that need it). */ -asmlinkage long sys_time(time_t __user * tloc) +SYSCALL_DEFINE1(time, time_t __user *, tloc) { time_t i = get_seconds(); @@ -79,7 +79,7 @@ asmlinkage long sys_time(time_t __user * tloc) * architectures that need it). */ -asmlinkage long sys_stime(time_t __user *tptr) +SYSCALL_DEFINE1(stime, time_t __user *, tptr) { struct timespec tv; int err; @@ -99,8 +99,8 @@ asmlinkage long sys_stime(time_t __user *tptr) #endif /* __ARCH_WANT_SYS_TIME */ -asmlinkage long sys_gettimeofday(struct timeval __user *tv, - struct timezone __user *tz) +SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, + struct timezone __user *, tz) { if (likely(tv != NULL)) { struct timeval ktv; @@ -184,8 +184,8 @@ int do_sys_settimeofday(struct timespec *tv, struct timezone *tz) return 0; } -asmlinkage long sys_settimeofday(struct timeval __user *tv, - struct timezone __user *tz) +SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, + struct timezone __user *, tz) { struct timeval user_tv; struct timespec new_ts; @@ -205,7 +205,7 @@ asmlinkage long sys_settimeofday(struct timeval __user *tv, return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL); } -asmlinkage long sys_adjtimex(struct timex __user *txc_p) +SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p) { struct timex txc; /* Local copy of parameter */ int ret; diff --git a/kernel/timer.c b/kernel/timer.c index 7b8697d7f04..76041df06c5 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1129,7 +1129,7 @@ void do_timer(unsigned long ticks) * For backwards compatibility? This can be done in libc so Alpha * and all newer ports shouldn't need it. */ -asmlinkage long sys_alarm(unsigned int seconds) +SYSCALL_DEFINE1(alarm, unsigned int, seconds) { return alarm_setitimer(seconds); } @@ -1152,7 +1152,7 @@ asmlinkage long sys_alarm(unsigned int seconds) * * This is SMP safe as current->tgid does not change. 
*/ -asmlinkage long sys_getpid(void) +SYSCALL_DEFINE0(getpid) { return task_tgid_vnr(current); } @@ -1308,7 +1308,7 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout) EXPORT_SYMBOL(schedule_timeout_uninterruptible); /* Thread ID - the internal kernel "pid" */ -asmlinkage long sys_gettid(void) +SYSCALL_DEFINE0(gettid) { return task_pid_vnr(current); } -- cgit v1.2.3 From dbf040d9d1cbf1ef6250bdb095c5c118950bcde8 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:04 +0100 Subject: [CVE-2009-0029] System call wrappers part 02 Signed-off-by: Heiko Carstens --- kernel/sys.c | 10 +++++----- kernel/timer.c | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 37165e55233..4c33555f8d9 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -756,7 +756,7 @@ error: return retval; } -asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid) +SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __user *, suid) { const struct cred *cred = current_cred(); int retval; @@ -814,7 +814,7 @@ error: return retval; } -asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid) +SYSCALL_DEFINE3(getresgid, gid_t __user *, rgid, gid_t __user *, egid, gid_t __user *, sgid) { const struct cred *cred = current_cred(); int retval; @@ -1015,7 +1015,7 @@ out: return err; } -asmlinkage long sys_getpgid(pid_t pid) +SYSCALL_DEFINE1(getpgid, pid_t, pid) { struct task_struct *p; struct pid *grp; @@ -1045,14 +1045,14 @@ out: #ifdef __ARCH_WANT_SYS_GETPGRP -asmlinkage long sys_getpgrp(void) +SYSCALL_DEFINE0(getpgrp) { return sys_getpgid(0); } #endif -asmlinkage long sys_getsid(pid_t pid) +SYSCALL_DEFINE1(getsid, pid_t, pid) { struct task_struct *p; struct pid *sid; diff --git a/kernel/timer.c b/kernel/timer.c index 76041df06c5..14a51530a4c 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1163,7 +1163,7 @@ SYSCALL_DEFINE0(getpid) * value of ->real_parent under rcu_read_lock(), see * release_task()->call_rcu(delayed_put_task_struct). */ -asmlinkage long sys_getppid(void) +SYSCALL_DEFINE0(getppid) { int pid; @@ -1174,25 +1174,25 @@ asmlinkage long sys_getppid(void) return pid; } -asmlinkage long sys_getuid(void) +SYSCALL_DEFINE0(getuid) { /* Only we change this so SMP safe */ return current_uid(); } -asmlinkage long sys_geteuid(void) +SYSCALL_DEFINE0(geteuid) { /* Only we change this so SMP safe */ return current_euid(); } -asmlinkage long sys_getgid(void) +SYSCALL_DEFINE0(getgid) { /* Only we change this so SMP safe */ return current_gid(); } -asmlinkage long sys_getegid(void) +SYSCALL_DEFINE0(getegid) { /* Only we change this so SMP safe */ return current_egid(); -- cgit v1.2.3 From ae1251ab785f6da87219df8352ffdac68bba23e4 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:05 +0100 Subject: [CVE-2009-0029] System call wrappers part 03 Signed-off-by: Heiko Carstens --- kernel/sys.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 4c33555f8d9..ace9ced598b 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -478,7 +478,7 @@ void ctrl_alt_del(void) * SMP: There are not races, the GIDs are checked only by filesystem * operations (as far as semantic preservation is concerned). 
*/ -asmlinkage long sys_setregid(gid_t rgid, gid_t egid) +SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) { const struct cred *old; struct cred *new; @@ -529,7 +529,7 @@ error: * * SMP: Same implicit races as above. */ -asmlinkage long sys_setgid(gid_t gid) +SYSCALL_DEFINE1(setgid, gid_t, gid) { const struct cred *old; struct cred *new; @@ -597,7 +597,7 @@ static int set_user(struct cred *new) * 100% compatible with BSD. A program which uses just setuid() will be * 100% compatible with POSIX with saved IDs. */ -asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) +SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) { const struct cred *old; struct cred *new; @@ -661,7 +661,7 @@ error: * will allow a root program to temporarily drop privileges and be able to * regain them by swapping the real and effective uid. */ -asmlinkage long sys_setuid(uid_t uid) +SYSCALL_DEFINE1(setuid, uid_t, uid) { const struct cred *old; struct cred *new; @@ -705,7 +705,7 @@ error: * This function implements a generic ability to update ruid, euid, * and suid. This allows you to implement the 4.4 compatible seteuid(). */ -asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) +SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) { const struct cred *old; struct cred *new; @@ -771,7 +771,7 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __u /* * Same as above, but for rgid, egid, sgid. */ -asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) +SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) { const struct cred *old; struct cred *new; @@ -833,7 +833,7 @@ SYSCALL_DEFINE3(getresgid, gid_t __user *, rgid, gid_t __user *, egid, gid_t __u * whatever uid it wants to). It normally shadows "euid", except when * explicitly set by setfsuid() or for access.. */ -asmlinkage long sys_setfsuid(uid_t uid) +SYSCALL_DEFINE1(setfsuid, uid_t, uid) { const struct cred *old; struct cred *new; @@ -870,7 +870,7 @@ change_okay: /* * Samma på svenska.. */ -asmlinkage long sys_setfsgid(gid_t gid) +SYSCALL_DEFINE1(setfsgid, gid_t, gid) { const struct cred *old; struct cred *new; @@ -1311,7 +1311,7 @@ int set_current_groups(struct group_info *group_info) EXPORT_SYMBOL(set_current_groups); -asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist) +SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist) { const struct cred *cred = current_cred(); int i; -- cgit v1.2.3 From b290ebe2c46d01b742b948ce03f09e8a3efb9a92 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:06 +0100 Subject: [CVE-2009-0029] System call wrappers part 04 Signed-off-by: Heiko Carstens --- kernel/acct.c | 2 +- kernel/capability.c | 4 ++-- kernel/exec_domain.c | 3 +-- kernel/itimer.c | 2 +- kernel/signal.c | 7 +++---- kernel/sys.c | 6 +++--- 6 files changed, 11 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index d57b7cbb98b..7afa3156416 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -277,7 +277,7 @@ static int acct_on(char *name) * should be written. If the filename is NULL, accounting will be * shutdown.
*/ -asmlinkage long sys_acct(const char __user *name) +SYSCALL_DEFINE1(acct, const char __user *, name) { int error; diff --git a/kernel/capability.c b/kernel/capability.c index 688926e496b..4e17041963f 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -161,7 +161,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, * * Returns 0 on success and < 0 on error. */ -asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr) +SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr) { int ret = 0; pid_t pid; @@ -235,7 +235,7 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr) * * Returns 0 on success and < 0 on error. */ -asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) +SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data) { struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; unsigned i, tocopy; diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index 0511716e942..667c841c295 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -209,8 +209,7 @@ static int __init proc_execdomains_init(void) module_init(proc_execdomains_init); #endif -asmlinkage long -sys_personality(u_long personality) +SYSCALL_DEFINE1(personality, u_long, personality) { u_long old = current->personality; diff --git a/kernel/itimer.c b/kernel/itimer.c index db7c358b9a0..7e0663ea94f 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -100,7 +100,7 @@ int do_getitimer(int which, struct itimerval *value) return 0; } -asmlinkage long sys_getitimer(int which, struct itimerval __user *value) +SYSCALL_DEFINE2(getitimer, int, which, struct itimerval __user *, value) { int error = -EFAULT; struct itimerval get_buffer; diff --git a/kernel/signal.c b/kernel/signal.c index 856a5479d49..3fe08eaa5de 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2434,8 +2434,7 @@ out: #ifdef __ARCH_WANT_SYS_SIGPENDING -asmlinkage long -sys_sigpending(old_sigset_t __user *set) +SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) { return do_sigpending(set, sizeof(*set)); } @@ -2446,8 +2445,8 @@ sys_sigpending(old_sigset_t __user *set) /* Some platforms have their own version with special arguments others support only sys_rt_sigprocmask. */ -asmlinkage long -sys_sigprocmask(int how, old_sigset_t __user *set, old_sigset_t __user *oset) +SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, set, + old_sigset_t __user *, oset) { int error; old_sigset_t old_set, new_set; diff --git a/kernel/sys.c b/kernel/sys.c index ace9ced598b..cbe4502c28a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -944,7 +944,7 @@ SYSCALL_DEFINE1(times, struct tms __user *, tbuf) * Auch. Had to add the 'did_exec' flag to conform completely to POSIX. * LBT 04.03.94 */ -asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) +SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) { struct task_struct *p; struct task_struct *group_leader = current->group_leader; @@ -1080,7 +1080,7 @@ out: return retval; } -asmlinkage long sys_setsid(void) +SYSCALL_DEFINE0(setsid) { struct task_struct *group_leader = current->group_leader; struct pid *sid = task_pid(group_leader); @@ -1340,7 +1340,7 @@ out: * without another task interfering. 
*/ -asmlinkage long sys_setgroups(int gidsetsize, gid_t __user *grouplist) +SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) { struct group_info *group_info; int retval; -- cgit v1.2.3 From 362e9c07c7220c0a78c88826fc0d2bf7e4a4bb68 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:07 +0100 Subject: [CVE-2009-0029] System call wrappers part 05 Signed-off-by: Heiko Carstens --- kernel/itimer.c | 5 ++--- kernel/posix-timers.c | 43 +++++++++++++++++++------------------------ 2 files changed, 21 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/itimer.c b/kernel/itimer.c index 7e0663ea94f..6a5fe93dd8b 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -260,9 +260,8 @@ unsigned int alarm_setitimer(unsigned int seconds) return it_old.it_value.tv_sec; } -asmlinkage long sys_setitimer(int which, - struct itimerval __user *value, - struct itimerval __user *ovalue) +SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value, + struct itimerval __user *, ovalue) { struct itimerval set_buffer, get_buffer; int error; diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 887c63787de..052ec4d195c 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -477,10 +477,9 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) /* Create a POSIX.1b interval timer. */ -asmlinkage long -sys_timer_create(const clockid_t which_clock, - struct sigevent __user *timer_event_spec, - timer_t __user * created_timer_id) +SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, + struct sigevent __user *, timer_event_spec, + timer_t __user *, created_timer_id) { struct k_itimer *new_timer; int error, new_timer_id; @@ -661,8 +660,8 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) } /* Get the time remaining on a POSIX.1b interval timer. */ -asmlinkage long -sys_timer_gettime(timer_t timer_id, struct itimerspec __user *setting) +SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, + struct itimerspec __user *, setting) { struct k_itimer *timr; struct itimerspec cur_setting; @@ -691,8 +690,7 @@ sys_timer_gettime(timer_t timer_id, struct itimerspec __user *setting) * the call back to do_schedule_next_timer(). So all we need to do is * to pick up the frozen overrun. */ -asmlinkage long -sys_timer_getoverrun(timer_t timer_id) +SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) { struct k_itimer *timr; int overrun; @@ -760,10 +758,9 @@ common_timer_set(struct k_itimer *timr, int flags, } /* Set a POSIX.1b interval timer */ -asmlinkage long -sys_timer_settime(timer_t timer_id, int flags, - const struct itimerspec __user *new_setting, - struct itimerspec __user *old_setting) +SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, + const struct itimerspec __user *, new_setting, + struct itimerspec __user *, old_setting) { struct k_itimer *timr; struct itimerspec new_spec, old_spec; @@ -816,8 +813,7 @@ static inline int timer_delete_hook(struct k_itimer *timer) } /* Delete a POSIX.1b interval timer. 
*/ -asmlinkage long -sys_timer_delete(timer_t timer_id) +SYSCALL_DEFINE1(timer_delete, timer_t, timer_id) { struct k_itimer *timer; unsigned long flags; @@ -903,8 +899,8 @@ int do_posix_clock_nonanosleep(const clockid_t clock, int flags, } EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep); -asmlinkage long sys_clock_settime(const clockid_t which_clock, - const struct timespec __user *tp) +SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, + const struct timespec __user *, tp) { struct timespec new_tp; @@ -916,8 +912,8 @@ asmlinkage long sys_clock_settime(const clockid_t which_clock, return CLOCK_DISPATCH(which_clock, clock_set, (which_clock, &new_tp)); } -asmlinkage long -sys_clock_gettime(const clockid_t which_clock, struct timespec __user *tp) +SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, + struct timespec __user *,tp) { struct timespec kernel_tp; int error; @@ -933,8 +929,8 @@ sys_clock_gettime(const clockid_t which_clock, struct timespec __user *tp) } -asmlinkage long -sys_clock_getres(const clockid_t which_clock, struct timespec __user *tp) +SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, + struct timespec __user *, tp) { struct timespec rtn_tp; int error; @@ -963,10 +959,9 @@ static int common_nsleep(const clockid_t which_clock, int flags, which_clock); } -asmlinkage long -sys_clock_nanosleep(const clockid_t which_clock, int flags, - const struct timespec __user *rqtp, - struct timespec __user *rmtp) +SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, + const struct timespec __user *, rqtp, + struct timespec __user *, rmtp) { struct timespec t; -- cgit v1.2.3 From 5add95d4f7cf08f6f62510f19576992912387501 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:08 +0100 Subject: [CVE-2009-0029] System call wrappers part 06 Signed-off-by: Heiko Carstens --- kernel/sched.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 8be2c13b50d..1a0fdfa5ddf 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5126,7 +5126,7 @@ int can_nice(const struct task_struct *p, const int nice) * sys_setpriority is a more generic, but much slower function that * does similar things. */ -asmlinkage long sys_nice(int increment) +SYSCALL_DEFINE1(nice, int, increment) { long nice, retval; @@ -5433,8 +5433,8 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) * @policy: new policy. * @param: structure containing the new RT priority. */ -asmlinkage long -sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) +SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, + struct sched_param __user *, param) { /* negative values for policy are not valid */ if (policy < 0) @@ -5448,7 +5448,7 @@ sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) * @pid: the pid in question. * @param: structure containing the new RT priority. */ -asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) +SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) { return do_sched_setscheduler(pid, -1, param); } @@ -5457,7 +5457,7 @@ asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) * sys_sched_getscheduler - get the policy (scheduling class) of a thread * @pid: the pid in question. 
*/ -asmlinkage long sys_sched_getscheduler(pid_t pid) +SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) { struct task_struct *p; int retval; @@ -5482,7 +5482,7 @@ asmlinkage long sys_sched_getscheduler(pid_t pid) * @pid: the pid in question. * @param: structure containing the RT priority. */ -asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) +SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) { struct sched_param lp; struct task_struct *p; @@ -5600,8 +5600,8 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, * @len: length in bytes of the bitmask pointed to by user_mask_ptr * @user_mask_ptr: user-space pointer to the new cpu mask */ -asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, - unsigned long __user *user_mask_ptr) +SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, + unsigned long __user *, user_mask_ptr) { cpumask_var_t new_mask; int retval; @@ -5648,8 +5648,8 @@ out_unlock: * @len: length in bytes of the bitmask pointed to by user_mask_ptr * @user_mask_ptr: user-space pointer to hold the current cpu mask */ -asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, - unsigned long __user *user_mask_ptr) +SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, + unsigned long __user *, user_mask_ptr) { int ret; cpumask_var_t mask; @@ -5678,7 +5678,7 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, * This function yields the current CPU to other tasks. If there are no * other threads running on this CPU then this function will return. */ -asmlinkage long sys_sched_yield(void) +SYSCALL_DEFINE0(sched_yield) { struct rq *rq = this_rq_lock(); @@ -5819,7 +5819,7 @@ long __sched io_schedule_timeout(long timeout) * this syscall returns the maximum rt_priority that can be used * by a given scheduling class. */ -asmlinkage long sys_sched_get_priority_max(int policy) +SYSCALL_DEFINE1(sched_get_priority_max, int, policy) { int ret = -EINVAL; @@ -5844,7 +5844,7 @@ asmlinkage long sys_sched_get_priority_max(int policy) * this syscall returns the minimum rt_priority that can be used * by a given scheduling class. */ -asmlinkage long sys_sched_get_priority_min(int policy) +SYSCALL_DEFINE1(sched_get_priority_min, int, policy) { int ret = -EINVAL; -- cgit v1.2.3 From 754fe8d297bfae7b77f7ce866e2fb0c5fb186506 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:09 +0100 Subject: [CVE-2009-0029] System call wrappers part 07 Signed-off-by: Heiko Carstens --- kernel/exit.c | 8 ++++---- kernel/kexec.c | 5 ++--- kernel/sched.c | 4 ++-- kernel/signal.c | 2 +- kernel/sys.c | 7 ++++--- 5 files changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index fac9b040af2..08895df0eab 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1141,7 +1141,7 @@ NORET_TYPE void complete_and_exit(struct completion *comp, long code) EXPORT_SYMBOL(complete_and_exit); -asmlinkage long sys_exit(int error_code) +SYSCALL_DEFINE1(exit, int, error_code) { do_exit((error_code&0xff)<<8); } @@ -1182,7 +1182,7 @@ do_group_exit(int exit_code) * wait4()-ing process will get the correct exit code - even if this * thread is not the thread group leader. 
*/ -asmlinkage long sys_exit_group(int error_code) +SYSCALL_DEFINE1(exit_group, int, error_code) { do_group_exit((error_code & 0xff) << 8); /* NOTREACHED */ @@ -1795,8 +1795,8 @@ asmlinkage long sys_waitid(int which, pid_t upid, return ret; } -asmlinkage long sys_wait4(pid_t upid, int __user *stat_addr, - int options, struct rusage __user *ru) +SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, + int, options, struct rusage __user *, ru) { struct pid *pid = NULL; enum pid_type type; diff --git a/kernel/kexec.c b/kernel/kexec.c index 3fb855ad6aa..8a6d7b08864 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -934,9 +934,8 @@ struct kimage *kexec_crash_image; static DEFINE_MUTEX(kexec_mutex); -asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, - struct kexec_segment __user *segments, - unsigned long flags) +SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, + struct kexec_segment __user *, segments, unsigned long, flags) { struct kimage **dest_image, *image; int result; diff --git a/kernel/sched.c b/kernel/sched.c index 1a0fdfa5ddf..65c02037b05 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5869,8 +5869,8 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) * this syscall writes the default timeslice value of a given process * into the user-space timespec buffer. A value of '0' means infinity. */ -asmlinkage -long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) +SYSCALL_DEFINE4(sched_rr_get_interval, pid_t, pid, + struct timespec __user *, interval) { struct task_struct *p; unsigned int time_slice; diff --git a/kernel/signal.c b/kernel/signal.c index 3fe08eaa5de..41f32e08615 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1961,7 +1961,7 @@ EXPORT_SYMBOL(unblock_all_signals); * System call entry points. */ -asmlinkage long sys_restart_syscall(void) +SYSCALL_DEFINE0(restart_syscall) { struct restart_block *restart = &current_thread_info()->restart_block; return restart->fn(restart); } diff --git a/kernel/sys.c b/kernel/sys.c index cbe4502c28a..39b192b4003 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -143,7 +143,7 @@ out: return error; } -asmlinkage long sys_setpriority(int which, int who, int niceval) +SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) { struct task_struct *g, *p; struct user_struct *user; @@ -208,7 +208,7 @@ out: * has been offset by 20 (ie it returns 40..1 instead of -20..19) * to stay compatible. */ -asmlinkage long sys_getpriority(int which, int who) +SYSCALL_DEFINE2(getpriority, int, which, int, who) { struct task_struct *g, *p; struct user_struct *user; @@ -355,7 +355,8 @@ EXPORT_SYMBOL_GPL(kernel_power_off); * * reboot doesn't sync: do that yourself before calling this.
*/ -asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user * arg) +SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, + void __user *, arg) { char buffer[256]; -- cgit v1.2.3 From 17da2bd90abf428523de0fb98f7075e00e3ed42e Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:10 +0100 Subject: [CVE-2009-0029] System call wrappers part 08 Signed-off-by: Heiko Carstens --- kernel/exit.c | 7 +++---- kernel/fork.c | 2 +- kernel/futex.c | 6 +++--- kernel/module.c | 10 ++++------ kernel/sched.c | 2 +- kernel/signal.c | 18 +++++++----------- 6 files changed, 19 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 08895df0eab..f80dec3f187 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1754,9 +1754,8 @@ end: return retval; } -asmlinkage long sys_waitid(int which, pid_t upid, - struct siginfo __user *infop, int options, - struct rusage __user *ru) +SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, + infop, int, options, struct rusage __user *, ru) { struct pid *pid = NULL; enum pid_type type; @@ -1833,7 +1832,7 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, * sys_waitpid() remains for compatibility. waitpid() should be * implemented by calling sys_wait4() from libc.a. */ -asmlinkage long sys_waitpid(pid_t pid, int __user *stat_addr, int options) +SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options) { return sys_wait4(pid, stat_addr, options, NULL); } diff --git a/kernel/fork.c b/kernel/fork.c index 1d68f1255dd..8eb37d38c6a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -901,7 +901,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p) clear_freeze_flag(p); } -asmlinkage long sys_set_tid_address(int __user *tidptr) +SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) { current->clear_child_tid = tidptr; diff --git a/kernel/futex.c b/kernel/futex.c index 002aa189eb0..e86931d8d4e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1978,9 +1978,9 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, } -asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, - struct timespec __user *utime, u32 __user *uaddr2, - u32 val3) +SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, + struct timespec __user *, utime, u32 __user *, uaddr2, + u32, val3) { struct timespec ts; ktime_t t, *tp = NULL; diff --git a/kernel/module.c b/kernel/module.c index c9332c90d5a..e8b51d41dd7 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -743,8 +743,8 @@ static void wait_for_zero_refcount(struct module *mod) mutex_lock(&module_mutex); } -asmlinkage long -sys_delete_module(const char __user *name_user, unsigned int flags) +SYSCALL_DEFINE2(delete_module, const char __user *, name_user, + unsigned int, flags) { struct module *mod; char name[MODULE_NAME_LEN]; @@ -2296,10 +2296,8 @@ static noinline struct module *load_module(void __user *umod, } /* This is where the real work happens */ -asmlinkage long -sys_init_module(void __user *umod, - unsigned long len, - const char __user *uargs) +SYSCALL_DEFINE3(init_module, void __user *, umod, + unsigned long, len, const char __user *, uargs) { struct module *mod; int ret = 0; diff --git a/kernel/sched.c b/kernel/sched.c index 65c02037b05..eb1931eef58 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5869,7 +5869,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) * this syscall writes the default timeslice value of a given process * into 
the user-space timespec buffer. A value of '0' means infinity. */ -SYSCALL_DEFINE4(sched_rr_get_interval, pid_t, pid, +SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, struct timespec __user *, interval) { struct task_struct *p; diff --git a/kernel/signal.c b/kernel/signal.c index 41f32e08615..278cc8737f1 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2014,8 +2014,8 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset) return error; } -asmlinkage long -sys_rt_sigprocmask(int how, sigset_t __user *set, sigset_t __user *oset, size_t sigsetsize) +SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, set, + sigset_t __user *, oset, size_t, sigsetsize) { int error = -EINVAL; sigset_t old_set, new_set; @@ -2074,8 +2074,7 @@ out: return error; } -asmlinkage long -sys_rt_sigpending(sigset_t __user *set, size_t sigsetsize) +SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize) { return do_sigpending(set, sigsetsize); } @@ -2146,11 +2145,9 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) #endif -asmlinkage long -sys_rt_sigtimedwait(const sigset_t __user *uthese, - siginfo_t __user *uinfo, - const struct timespec __user *uts, - size_t sigsetsize) +SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, + siginfo_t __user *, uinfo, const struct timespec __user *, uts, + size_t, sigsetsize) { int ret, sig; sigset_t these; @@ -2223,8 +2220,7 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese, return ret; } -asmlinkage long -sys_kill(pid_t pid, int sig) +SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) { struct siginfo info; -- cgit v1.2.3 From a5f8fa9e9ba5ef3305e147f41ad6e1e84ac1f0bd Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:11 +0100 Subject: [CVE-2009-0029] System call wrappers part 09 Signed-off-by: Heiko Carstens --- kernel/signal.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 278cc8737f1..e2333929611 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2279,7 +2279,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig) * exists but it's not belonging to the target process anymore. This * method solves the problem of threads exiting and PIDs getting reused. */ -asmlinkage long sys_tgkill(pid_t tgid, pid_t pid, int sig) +SYSCALL_DEFINE3(tgkill, pid_t, tgid, pid_t, pid, int, sig) { /* This is only valid for single tasks */ if (pid <= 0 || tgid <= 0) @@ -2291,8 +2291,7 @@ asmlinkage long sys_tgkill(pid_t tgid, pid_t pid, int sig) /* * Send a signal to only one task, even if it's a CLONE_THREAD task. */ -asmlinkage long -sys_tkill(pid_t pid, int sig) +SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig) { /* This is only valid for single tasks */ if (pid <= 0) @@ -2301,8 +2300,8 @@ sys_tkill(pid_t pid, int sig) return do_tkill(0, pid, sig); } -asmlinkage long -sys_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t __user *uinfo) +SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, + siginfo_t __user *, uinfo) { siginfo_t info; @@ -2526,15 +2525,13 @@ out: /* * For backwards compatibility. Functionality superseded by sigprocmask. */ -asmlinkage long -sys_sgetmask(void) +SYSCALL_DEFINE0(sgetmask) { /* SMP safe */ return current->blocked.sig[0]; } -asmlinkage long -sys_ssetmask(int newmask) +SYSCALL_DEFINE1(ssetmask, int, newmask) { int old; @@ -2554,8 +2551,7 @@ sys_ssetmask(int newmask) /* * For backwards compatibility. Functionality superseded by sigaction. 
*/ -asmlinkage long -sys_signal(int sig, __sighandler_t handler) +SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler) { struct k_sigaction new_sa, old_sa; int ret; @@ -2572,8 +2568,7 @@ sys_signal(int sig, __sighandler_t handler) #ifdef __ARCH_WANT_SYS_PAUSE -asmlinkage long -sys_pause(void) +SYSCALL_DEFINE0(pause) { current->state = TASK_INTERRUPTIBLE; schedule(); -- cgit v1.2.3 From ca013e945b1ba5828b151ee646946f1297b67a4c Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:19 +0100 Subject: [CVE-2009-0029] System call wrappers part 17 Signed-off-by: Heiko Carstens --- kernel/uid16.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/uid16.c b/kernel/uid16.c index 2460c3199b5..37f48c049a2 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -17,7 +17,7 @@ #include -asmlinkage long sys_chown16(const char __user * filename, old_uid_t user, old_gid_t group) +SYSCALL_DEFINE3(chown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) { long ret = sys_chown(filename, low2highuid(user), low2highgid(group)); /* avoid REGPARM breakage on x86: */ @@ -25,7 +25,7 @@ asmlinkage long sys_chown16(const char __user * filename, old_uid_t user, old_gi return ret; } -asmlinkage long sys_lchown16(const char __user * filename, old_uid_t user, old_gid_t group) +SYSCALL_DEFINE3(lchown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) { long ret = sys_lchown(filename, low2highuid(user), low2highgid(group)); /* avoid REGPARM breakage on x86: */ @@ -33,7 +33,7 @@ asmlinkage long sys_lchown16(const char __user * filename, old_uid_t user, old_g return ret; } -asmlinkage long sys_fchown16(unsigned int fd, old_uid_t user, old_gid_t group) +SYSCALL_DEFINE3(fchown16, unsigned int, fd, old_uid_t, user, old_gid_t, group) { long ret = sys_fchown(fd, low2highuid(user), low2highgid(group)); /* avoid REGPARM breakage on x86: */ -- cgit v1.2.3 From a6b42e83f249aad723589b2bdf6d1dfb2b0997c8 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:20 +0100 Subject: [CVE-2009-0029] System call wrappers part 18 Signed-off-by: Heiko Carstens --- kernel/uid16.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/uid16.c b/kernel/uid16.c index 37f48c049a2..221894e6e98 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -41,7 +41,7 @@ SYSCALL_DEFINE3(fchown16, unsigned int, fd, old_uid_t, user, old_gid_t, group) return ret; } -asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid) +SYSCALL_DEFINE2(setregid16, old_gid_t, rgid, old_gid_t, egid) { long ret = sys_setregid(low2highgid(rgid), low2highgid(egid)); /* avoid REGPARM breakage on x86: */ @@ -49,7 +49,7 @@ asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid) return ret; } -asmlinkage long sys_setgid16(old_gid_t gid) +SYSCALL_DEFINE1(setgid16, old_gid_t, gid) { long ret = sys_setgid(low2highgid(gid)); /* avoid REGPARM breakage on x86: */ @@ -57,7 +57,7 @@ asmlinkage long sys_setgid16(old_gid_t gid) return ret; } -asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid) +SYSCALL_DEFINE2(setreuid16, old_uid_t, ruid, old_uid_t, euid) { long ret = sys_setreuid(low2highuid(ruid), low2highuid(euid)); /* avoid REGPARM breakage on x86: */ @@ -65,7 +65,7 @@ asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid) return ret; } -asmlinkage long sys_setuid16(old_uid_t uid) +SYSCALL_DEFINE1(setuid16, old_uid_t, uid) { long ret = sys_setuid(low2highuid(uid)); /* avoid 
REGPARM breakage on x86: */ @@ -73,7 +73,7 @@ asmlinkage long sys_setuid16(old_uid_t uid) return ret; } -asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid) +SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid) { long ret = sys_setresuid(low2highuid(ruid), low2highuid(euid), low2highuid(suid)); @@ -82,7 +82,7 @@ asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid) return ret; } -asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid, old_uid_t __user *suid) +SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruid, old_uid_t __user *, euid, old_uid_t __user *, suid) { const struct cred *cred = current_cred(); int retval; @@ -94,7 +94,7 @@ asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid, return retval; } -asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid) +SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid) { long ret = sys_setresgid(low2highgid(rgid), low2highgid(egid), low2highgid(sgid)); @@ -103,7 +103,8 @@ asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid) return ret; } -asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid, old_gid_t __user *sgid) + +SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgid, old_gid_t __user *, egid, old_gid_t __user *, sgid) { const struct cred *cred = current_cred(); int retval; @@ -115,7 +116,7 @@ asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid, return retval; } -asmlinkage long sys_setfsuid16(old_uid_t uid) +SYSCALL_DEFINE1(setfsuid16, old_uid_t, uid) { long ret = sys_setfsuid(low2highuid(uid)); /* avoid REGPARM breakage on x86: */ @@ -123,7 +124,7 @@ asmlinkage long sys_setfsuid16(old_uid_t uid) return ret; } -asmlinkage long sys_setfsgid16(old_gid_t gid) +SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid) { long ret = sys_setfsgid(low2highgid(gid)); /* avoid REGPARM breakage on x86: */ -- cgit v1.2.3 From 003d7ab479168132a2b2c6700fe682b08f08ab0c Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:21 +0100 Subject: [CVE-2009-0029] System call wrappers part 19 Signed-off-by: Heiko Carstens --- kernel/uid16.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/uid16.c b/kernel/uid16.c index 221894e6e98..0314501688b 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -162,7 +162,7 @@ static int groups16_from_user(struct group_info *group_info, return 0; } -asmlinkage long sys_getgroups16(int gidsetsize, old_gid_t __user *grouplist) +SYSCALL_DEFINE2(getgroups16, int, gidsetsize, old_gid_t __user *, grouplist) { const struct cred *cred = current_cred(); int i; @@ -185,7 +185,7 @@ out: return i; } -asmlinkage long sys_setgroups16(int gidsetsize, old_gid_t __user *grouplist) +SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) { struct group_info *group_info; int retval; @@ -210,22 +210,22 @@ asmlinkage long sys_setgroups16(int gidsetsize, old_gid_t __user *grouplist) return retval; } -asmlinkage long sys_getuid16(void) +SYSCALL_DEFINE0(getuid16) { return high2lowuid(current_uid()); } -asmlinkage long sys_geteuid16(void) +SYSCALL_DEFINE0(geteuid16) { return high2lowuid(current_euid()); } -asmlinkage long sys_getgid16(void) +SYSCALL_DEFINE0(getgid16) { return high2lowgid(current_gid()); } -asmlinkage long sys_getegid16(void) +SYSCALL_DEFINE0(getegid16) { return high2lowgid(current_egid()); } 
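[Editor's note on the SYSCALL_DEFINEn() conversions above] Every hunk in this wrapper series swaps an open-coded "asmlinkage long sys_foo(...)" definition for a SYSCALL_DEFINEn() macro from include/linux/syscalls.h. A minimal sketch of what the macro does, simplified from memory of the 2.6.29-era header (the exact expansion is architecture-dependent, and names such as SyS_setuid16/SYSC_setuid16 are shown only to illustrate the pattern):

    /* On most architectures the macro is a plain passthrough: */
    SYSCALL_DEFINE1(setuid16, old_uid_t, uid)
    /* ...expands to... */
    asmlinkage long sys_setuid16(old_uid_t uid)

    /*
     * On architectures selecting CONFIG_HAVE_SYSCALL_WRAPPERS (s390 and
     * friends), an extra stub takes every argument as a long and casts
     * it down to the declared type before calling the real body:
     */
    asmlinkage long sys_setuid16(old_uid_t uid);     /* unchanged ABI entry */
    static inline long SYSC_setuid16(old_uid_t uid); /* forward declaration */
    asmlinkage long SyS_setuid16(long uid)
    {
            return (long) SYSC_setuid16((old_uid_t) uid);
    }
    SYSCALL_ALIAS(sys_setuid16, SyS_setuid16);
    static inline long SYSC_setuid16(old_uid_t uid)
    { /* original function body follows */ }

That forced long-to-narrow-type cast is the substance of the CVE-2009-0029 fix: on 64-bit architectures a 32-bit syscall argument could otherwise be used with whatever garbage sat in the upper half of the register, instead of being properly sign- or zero-extended first.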
-- cgit v1.2.3 From 5a8a82b1d306a325d899b67715618413657efda4 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:25 +0100 Subject: [CVE-2009-0029] System call wrappers part 23 Signed-off-by: Heiko Carstens --- kernel/sys.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 39b192b4003..5292f2119da 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1406,7 +1406,7 @@ asmlinkage long sys_newuname(struct new_utsname __user * name) return errno; } -asmlinkage long sys_sethostname(char __user *name, int len) +SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) { int errno; char tmp[__NEW_UTS_LEN]; @@ -1430,7 +1430,7 @@ asmlinkage long sys_sethostname(char __user *name, int len) #ifdef __ARCH_WANT_SYS_GETHOSTNAME -asmlinkage long sys_gethostname(char __user *name, int len) +SYSCALL_DEFINE2(gethostname, char __user *, name, int, len) { int i, errno; struct new_utsname *u; @@ -1455,7 +1455,7 @@ asmlinkage long sys_gethostname(char __user *name, int len) * Only setdomainname; getdomainname can be implemented by calling * uname() */ -asmlinkage long sys_setdomainname(char __user *name, int len) +SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) { int errno; char tmp[__NEW_UTS_LEN]; -- cgit v1.2.3 From e48fbb699f82ef1e80bd7126046394d2dc9ca7e6 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:26 +0100 Subject: [CVE-2009-0029] System call wrappers part 24 Signed-off-by: Heiko Carstens --- kernel/sys.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 5292f2119da..70ffa8408cd 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1395,7 +1395,7 @@ EXPORT_SYMBOL(in_egroup_p); DECLARE_RWSEM(uts_sem); -asmlinkage long sys_newuname(struct new_utsname __user * name) +SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) { int errno = 0; @@ -1478,7 +1478,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) return errno; } -asmlinkage long sys_getrlimit(unsigned int resource, struct rlimit __user *rlim) +SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) { if (resource >= RLIM_NLIMITS) return -EINVAL; @@ -1497,7 +1497,8 @@ asmlinkage long sys_getrlimit(unsigned int resource, struct rlimit __user *rlim) * Back compatibility for getrlimit. Needed for some apps. */ -asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *rlim) +SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, + struct rlimit __user *, rlim) { struct rlimit x; if (resource >= RLIM_NLIMITS) @@ -1515,7 +1516,7 @@ asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *r #endif -asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) +SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) { struct rlimit new_rlim, *old_rlim; int retval; @@ -1688,7 +1689,7 @@ int getrusage(struct task_struct *p, int who, struct rusage __user *ru) return copy_to_user(ru, &r, sizeof(r)) ? 
-EFAULT : 0; } -asmlinkage long sys_getrusage(int who, struct rusage __user *ru) +SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru) { if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN && who != RUSAGE_THREAD) @@ -1696,7 +1697,7 @@ asmlinkage long sys_getrusage(int who, struct rusage __user *ru) return getrusage(current, who, ru); } -asmlinkage long sys_umask(int mask) +SYSCALL_DEFINE1(umask, int, mask) { mask = xchg(&current->fs->umask, mask & S_IRWXUGO); return mask; -- cgit v1.2.3 From c4ea37c26a691ad0b7e86aa5884aab27830e95c9 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:28 +0100 Subject: [CVE-2009-0029] System call wrappers part 26 Signed-off-by: Heiko Carstens --- kernel/sys.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 70ffa8408cd..59aadcdad6c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1703,8 +1703,8 @@ SYSCALL_DEFINE1(umask, int, mask) return mask; } -asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, - unsigned long arg4, unsigned long arg5) +SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, + unsigned long, arg4, unsigned long, arg5) { struct task_struct *me = current; unsigned char comm[sizeof(me->comm)]; -- cgit v1.2.3 From 1e7bfb2134dfec37ce04fb3a4ca89299e892d10c Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:29 +0100 Subject: [CVE-2009-0029] System call wrappers part 27 Signed-off-by: Heiko Carstens --- kernel/printk.c | 2 +- kernel/ptrace.c | 2 +- kernel/sysctl.c | 4 ++-- kernel/timer.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index e48cf33783f..69188f226a9 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -382,7 +382,7 @@ out: return error; } -asmlinkage long sys_syslog(int type, char __user *buf, int len) +SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) { return do_syslog(type, buf, len); } diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 29dc700e198..c9cf48b21f0 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -574,7 +574,7 @@ struct task_struct *ptrace_get_task_struct(pid_t pid) #define arch_ptrace_attach(child) do { } while (0) #endif -asmlinkage long sys_ptrace(long request, long pid, long addr, long data) +SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data) { struct task_struct *child; long ret; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 89d74436318..3e38b74b612 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1688,7 +1688,7 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol return error; } -asmlinkage long sys_sysctl(struct __sysctl_args __user *args) +SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args) { struct __sysctl_args tmp; int error; @@ -2989,7 +2989,7 @@ int sysctl_ms_jiffies(struct ctl_table *table, #else /* CONFIG_SYSCTL_SYSCALL */ -asmlinkage long sys_sysctl(struct __sysctl_args __user *args) +SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args) { struct __sysctl_args tmp; int error; diff --git a/kernel/timer.c b/kernel/timer.c index 14a51530a4c..13dd64fe143 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1400,7 +1400,7 @@ out: return 0; } -asmlinkage long sys_sysinfo(struct sysinfo __user *info) +SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info) { struct sysinfo val; -- cgit v1.2.3 From 6559eed8ca7db0531a207cd80be5e28cd6f213c5 Mon Sep 17
00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:32 +0100 Subject: [CVE-2009-0029] System call wrappers part 30 Signed-off-by: Heiko Carstens --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 8eb37d38c6a..bf0cef8bbdf 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1603,7 +1603,7 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp * constructed. Here we are modifying the current, active, * task_struct. */ -asmlinkage long sys_unshare(unsigned long unshare_flags) +SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) { int err = 0; struct fs_struct *fs, *new_fs = NULL; -- cgit v1.2.3 From 836f92adf121f806e9beb5b6b88bd5c9c4ea3f24 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:33 +0100 Subject: [CVE-2009-0029] System call wrappers part 31 Signed-off-by: Heiko Carstens --- kernel/futex.c | 11 +++++------ kernel/sys.c | 4 ++-- 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index e86931d8d4e..f89d373a9c6 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1733,9 +1733,8 @@ pi_faulted: * @head: pointer to the list-head * @len: length of the list-head, as userspace expects */ -asmlinkage long -sys_set_robust_list(struct robust_list_head __user *head, - size_t len) +SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, + size_t, len) { if (!futex_cmpxchg_enabled) return -ENOSYS; @@ -1756,9 +1755,9 @@ sys_set_robust_list(struct robust_list_head __user *head, * @head_ptr: pointer to a list-head pointer, the kernel fills it in * @len_ptr: pointer to a length field, the kernel fills in the header size */ -asmlinkage long -sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr, - size_t __user *len_ptr) +SYSCALL_DEFINE3(get_robust_list, int, pid, + struct robust_list_head __user * __user *, head_ptr, + size_t __user *, len_ptr) { struct robust_list_head __user *head; unsigned long ret; diff --git a/kernel/sys.c b/kernel/sys.c index 59aadcdad6c..e7dc0e10a48 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1817,8 +1817,8 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, return error; } -asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep, - struct getcpu_cache __user *unused) +SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, + struct getcpu_cache __user *, unused) { int err = 0; int cpu = raw_smp_processor_id(); -- cgit v1.2.3 From d4e82042c4cfa87a7d51710b71f568fe80132551 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:34 +0100 Subject: [CVE-2009-0029] System call wrappers part 32 Signed-off-by: Heiko Carstens --- kernel/signal.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index e2333929611..e73759783dc 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2491,11 +2491,10 @@ out: #endif /* __ARCH_WANT_SYS_SIGPROCMASK */ #ifdef __ARCH_WANT_SYS_RT_SIGACTION -asmlinkage long -sys_rt_sigaction(int sig, - const struct sigaction __user *act, - struct sigaction __user *oact, - size_t sigsetsize) +SYSCALL_DEFINE4(rt_sigaction, int, sig, + const struct sigaction __user *, act, + struct sigaction __user *, oact, + size_t, sigsetsize) { struct k_sigaction new_sa, old_sa; int ret = -EINVAL; @@ -2578,7 +2577,7 @@ SYSCALL_DEFINE0(pause) #endif 
#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND -asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize) +SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) { sigset_t newset; -- cgit v1.2.3 From 9316fcacb89c59fe556c48587ac02cd7f5d38045 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 14 Jan 2009 09:35:44 -0800 Subject: kernel/up.c: omit it if SMP=y, USE_GENERIC_SMP_HELPERS=n Fix the sparc build - we were including `up.o' on SMP builds, when CONFIG_USE_GENERIC_SMP_HELPERS=n. Tested-by: Robert Reif Fixed-by: Robert Reif Cc: David Miller Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 2aebc4cd787..170a9213c1b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -40,9 +40,8 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o -ifeq ($(CONFIG_USE_GENERIC_SMP_HELPERS),y) -obj-y += smp.o -else +obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o +ifneq ($(CONFIG_SMP),y) obj-y += up.o endif obj-$(CONFIG_SMP) += spinlock.o -- cgit v1.2.3 From 2ea038917bbdd51a7ae4a898c6a04641324dd033 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 14 Jan 2009 21:38:20 +0100 Subject: Revert "kbuild: strip generated symbols from *.ko" This reverts commit ad7a953c522ceb496611d127e51e278bfe0ff483. And commit: ("allow stripping of generated symbols under CONFIG_KALLSYMS_ALL") 9bb482476c6c9d1ae033306440c51ceac93ea80c These stripping patches has caused a set of issues: 1) People have reported compatibility issues with binutils due to lack of support for `--strip-unneeded-symbols' with objcopy 2.15.92.0.2 Reported by: Wenji 2) ccache and distcc no longer works as expeced Reported by: Ted, Roland, + others 3) The installed modules increased a lot in size Reported by: Ted, Davej + others Reported-by: Wenji Huang Reported-by: "Theodore Ts'o" Reported-by: Dave Jones Reported-by: Roland McGrath Signed-off-by: Sam Ravnborg --- kernel/kallsyms.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index e694afa0eb8..7b8b0f21a5b 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -30,19 +30,20 @@ #define all_var 0 #endif -extern const unsigned long kallsyms_addresses[]; -extern const u8 kallsyms_names[]; +/* These will be re-linked against their real values during the second link stage */ +extern const unsigned long kallsyms_addresses[] __attribute__((weak)); +extern const u8 kallsyms_names[] __attribute__((weak)); /* tell the compiler that the count isn't in the small data section if the arch * has one (eg: FRV) */ extern const unsigned long kallsyms_num_syms - __attribute__((__section__(".rodata"))); +__attribute__((weak, section(".rodata"))); -extern const u8 kallsyms_token_table[]; -extern const u16 kallsyms_token_index[]; +extern const u8 kallsyms_token_table[] __attribute__((weak)); +extern const u16 kallsyms_token_index[] __attribute__((weak)); -extern const unsigned long kallsyms_markers[]; +extern const unsigned long kallsyms_markers[] __attribute__((weak)); static inline int is_kernel_inittext(unsigned long addr) { @@ -167,6 +168,9 @@ static unsigned long get_symbol_pos(unsigned long addr, unsigned long symbol_start = 0, symbol_end = 0; unsigned long i, low, high, mid; + /* 
This kernel should never had been booted. */ + BUG_ON(!kallsyms_addresses); + /* do a binary search on the sorted kallsyms_addresses array */ low = 0; high = kallsyms_num_syms; -- cgit v1.2.3 From 934d96eafadcf3eb3ccd094af9919f020907fc41 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Wed, 14 Jan 2009 20:38:17 +0530 Subject: time-sched.c: tick_nohz_update_jiffies should be static Impact: cleanup, reduce kernel size a bit, avoid sparse warning Fixes sparse warning: kernel/time/tick-sched.c:137:6: warning: symbol 'tick_nohz_update_jiffies' was not declared. Should it be static? Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 1b6c05bd0d0..d3f1ef4d5cb 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -134,7 +134,7 @@ __setup("nohz=", setup_tick_nohz); * value. We do this unconditionally on any cpu, as we don't know whether the * cpu, which has the update task assigned is in a long sleep. */ -void tick_nohz_update_jiffies(void) +static void tick_nohz_update_jiffies(void) { int cpu = smp_processor_id(); struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); -- cgit v1.2.3 From 98a4826b99bc4bcc34c604b2fc4fcf4d771600ec Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 14 Jan 2009 10:56:32 +0100 Subject: sched: fix bandwidth validation for UID grouping Impact: make rt-limit tunables work again Mark Glines reported: > I've got an issue on x86-64 where I can't configure the system to allow > RT tasks for a non-root user. > > In 2.6.26.5, I was able to do the following to set things up nicely: > echo 450000 >/sys/kernel/uids/0/cpu_rt_runtime > echo 450000 >/sys/kernel/uids/1000/cpu_rt_runtime > > Seems like every value I try to echo into the /sys files returns EINVAL. For UID grouping we initialize the root group with infinite bandwidth which by default is actually more than the global limit, therefore the bandwidth check always fails. Because the root group is a phantom group (for UID grouping) we cannot runtime adjust it, therefore we let it reflect the global bandwidth settings. Reported-by: Mark Glines Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 3b630d88266..ed62d1cee05 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9050,6 +9050,13 @@ static int tg_schedulable(struct task_group *tg, void *data) runtime = d->rt_runtime; } +#ifdef CONFIG_USER_SCHED + if (tg == &root_task_group) { + period = global_rt_period(); + runtime = global_rt_runtime(); + } +#endif + /* * Cannot have more runtime than the period. */ -- cgit v1.2.3 From cce7ade803699463ecc62a065ca522004f7ccb3d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Jan 2009 14:53:37 +0100 Subject: sched: SCHED_IDLE weight change Increase the SCHED_IDLE weight from 2 to 3, this gives much more stable vruntime numbers. 
time advanced in 100ms: weight=2 64765.988352 67012.881408 88501.412352 weight=3 35496.181411 34130.971298 35497.411573 Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index ed62d1cee05..6acfb3c2398 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1323,8 +1323,8 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec) * slice expiry etc. */ -#define WEIGHT_IDLEPRIO 2 -#define WMULT_IDLEPRIO (1 << 31) +#define WEIGHT_IDLEPRIO 3 +#define WMULT_IDLEPRIO 1431655765 /* * Nice levels are multiplicative, with a gentle 10% change for every -- cgit v1.2.3 From 6bc912b71b6f33b041cfde93ca3f019cbaa852bc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Jan 2009 14:53:38 +0100 Subject: sched: SCHED_OTHER vs SCHED_IDLE isolation Stronger SCHED_IDLE isolation: - no SCHED_IDLE buddies - never let SCHED_IDLE preempt on wakeup - always preempt SCHED_IDLE on wakeup - limit SLEEPER fairness for SCHED_IDLE. Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 8e1352c7555..cdebd8089cb 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -677,9 +677,13 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) unsigned long thresh = sysctl_sched_latency; /* - * convert the sleeper threshold into virtual time + * Convert the sleeper threshold into virtual time. + * SCHED_IDLE is a special sub-class. We care about + * fairness only relative to other SCHED_IDLE tasks, + * all of which have the same weight. */ - if (sched_feat(NORMALIZED_SLEEPER)) + if (sched_feat(NORMALIZED_SLEEPER) && + task_of(se)->policy != SCHED_IDLE) thresh = calc_delta_fair(thresh, se); vruntime -= thresh; @@ -1340,14 +1344,18 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) static void set_last_buddy(struct sched_entity *se) { - for_each_sched_entity(se) - cfs_rq_of(se)->last = se; + if (likely(task_of(se)->policy != SCHED_IDLE)) { + for_each_sched_entity(se) + cfs_rq_of(se)->last = se; + } } static void set_next_buddy(struct sched_entity *se) { - for_each_sched_entity(se) - cfs_rq_of(se)->next = se; + if (likely(task_of(se)->policy != SCHED_IDLE)) { + for_each_sched_entity(se) + cfs_rq_of(se)->next = se; + } } /* @@ -1393,12 +1401,18 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) return; /* - * Batch tasks do not preempt (their preemption is driven by + * Batch and idle tasks do not preempt (their preemption is driven by * the tick): */ - if (unlikely(p->policy == SCHED_BATCH)) + if (unlikely(p->policy != SCHED_NORMAL)) return; + /* Idle tasks are by definition preempted by everybody. */ + if (unlikely(curr->policy == SCHED_IDLE)) { + resched_task(curr); + return; + } + if (!sched_feat(WAKEUP_PREEMPT)) return; -- cgit v1.2.3 From e17036dac189dd034c092a91df56aa740db7146d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Jan 2009 14:53:39 +0100 Subject: sched: fix update_min_vruntime Impact: fix SCHED_IDLE latency problems OK, so we have 1 running task A (which is obviously curr and the tree is equally obviously empty). 'A' nicely chugs along, doing its thing, carrying min_vruntime along as it goes. 
Then some whacko speed freak SCHED_IDLE task gets inserted due to SMP balancing, which is very likely far right, in that case update_curr update_min_vruntime cfs_rq->rb_leftmost := true (the crazy task sitting in a tree) vruntime = se->vruntime and voila, min_vruntime is waaay right of where it ought to be. OK, so why did I write it like that to begin with... Aah, yes. Say we've just dequeued current schedule deactivate_task(prev) dequeue_entity update_min_vruntime Then we'll set vruntime = cfs_rq->min_vruntime; we find !cfs_rq->curr, but do find someone in the tree. Then we _must_ do vruntime = se->vruntime, because vruntime = min_vruntime(vruntime := cfs_rq->min_vruntime, se->vruntime) will not advance vruntime, and causes lag the other way around (which we fixed with that initial patch: 1af5f730fc1bf7c62ec9fb2d307206e18bf40a69 (sched: more accurate min_vruntime accounting)). Signed-off-by: Peter Zijlstra Tested-by: Mike Galbraith Acked-by: Mike Galbraith Cc: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index cdebd8089cb..16b419bb8b0 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -283,7 +283,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) struct sched_entity, run_node); - if (vruntime == cfs_rq->min_vruntime) + if (!cfs_rq->curr) vruntime = se->vruntime; else vruntime = min_vruntime(vruntime, se->vruntime); -- cgit v1.2.3 From 88fc241f54459ac3d86c5e13b449730199f66061 Mon Sep 17 00:00:00 2001 From: Doug Chapman Date: Thu, 15 Jan 2009 10:38:56 -0800 Subject: [IA64] dump stack on kernel unaligned warnings Often the cause of kernel unaligned access warnings is not obvious from just the ip displayed in the warning. This adds an option, via proc, to dump the stack in addition to the warning. The default is off (just display the 1-line warning). To enable the stack to be shown: echo 1 > /proc/sys/kernel/unaligned-dump-stack Signed-off-by: Doug Chapman Signed-off-by: Tony Luck --- kernel/sysctl.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3e38b74b612..368d1638ee7 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -144,6 +144,7 @@ extern int acct_parm[]; #ifdef CONFIG_IA64 extern int no_unaligned_warning; +extern int unaligned_dump_stack; #endif #ifdef CONFIG_RT_MUTEXES @@ -781,6 +782,14 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "unaligned-dump-stack", + .data = &unaligned_dump_stack, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #endif #ifdef CONFIG_DETECT_SOFTLOCKUP { -- cgit v1.2.3 From 6272d68cc6a5f90c6b1a2228cf0f67b895305d17 Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Thu, 15 Jan 2009 17:17:15 +0100 Subject: sched: sched_slice() fixlet Mike's change 0a582440f ("sched: fix sched_slice()") broke group scheduling by forgetting to reload cfs_rq on each loop iteration, as sketched below. This patch fixes the aim7 regression, and the specjbb2005 regression drops below 1.5% on an 8-core Stoakley box.
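For illustration: for_each_sched_entity() walks from the entity up through its group parents, so the cfs_rq holding each level's load changes at every step. A minimal sketch of the corrected loop body (assuming the usual cfs_rq_of() helper and the calc_delta_mine() weighting used by sched_slice(); the actual diff follows):

	for_each_sched_entity(se) {		/* se, se->parent, ... */
		struct load_weight *load;

		cfs_rq = cfs_rq_of(se);		/* must be reloaded per level */
		load = &cfs_rq->load;		/* stale without the reload */
		slice = calc_delta_mine(slice, se->load.weight, load);
	}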
Signed-off-by: Lin Ming Signed-off-by: Peter Zijlstra Tested-by: Jayson King Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 16b419bb8b0..5cc1c162044 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -429,7 +429,10 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq); for_each_sched_entity(se) { - struct load_weight *load = &cfs_rq->load; + struct load_weight *load; + + cfs_rq = cfs_rq_of(se); + load = &cfs_rq->load; if (unlikely(!se->on_rq)) { struct load_weight lw = cfs_rq->load; -- cgit v1.2.3 From 45ce80fb6b6f9594d1396d44dd7e7c02d596fef8 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 15 Jan 2009 13:50:59 -0800 Subject: cgroups: consolidate cgroup documents Move Documentation/cpusets.txt and Documentation/controllers/* to Documentation/cgroups/ Signed-off-by: Li Zefan Acked-by: KAMEZAWA Hiroyuki Acked-by: Balbir Singh Acked-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 647c77a88fc..a85678865c5 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -568,7 +568,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) * load balancing domains (sched domains) as specified by that partial * partition. * - * See "What is sched_load_balance" in Documentation/cpusets.txt + * See "What is sched_load_balance" in Documentation/cgroups/cpusets.txt * for a background explanation of this. * * Does not return errors, on the theory that the callers of this -- cgit v1.2.3 From 6ae301e85c9c58d2f430a8a7057ce488b7ff76df Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 15 Jan 2009 13:51:01 -0800 Subject: resources: fix parameter name and kernel-doc Fix __request_region() parameter kernel-doc notation and parameter name: Warning(linux-2.6.28-git10//kernel/resource.c:627): No description found for parameter 'flags' Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index ca6a1536b20..fd5d7d574bb 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -620,6 +620,7 @@ resource_size_t resource_alignment(struct resource *res) * @start: resource start address * @n: resource region size * @name: reserving caller's ID string + * @flags: IO resource flags */ struct resource * __request_region(struct resource *parent, resource_size_t start, resource_size_t n, -- cgit v1.2.3 From 33f1d7ecc6cffff3c618a02295de969ebbacd95d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 6 Jan 2009 21:14:04 +0100 Subject: PM: Fix freezer compilation if PM_SLEEP is unset The freezer fails to compile with the following configuration settings: CONFIG_CGROUPS=y CONFIG_CGROUP_FREEZER=y CONFIG_MODULES=y CONFIG_FREEZER=y CONFIG_PM=y CONFIG_PM_SLEEP=n Fix this by making process.o compilation depend on CONFIG_FREEZER. Reported-by: Cheng Renquan Signed-off-by: Rafael J.
Wysocki Acked-by: Pavel Machek Signed-off-by: Len Brown --- kernel/power/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 597823b5b70..d7a10167a25 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -4,7 +4,8 @@ EXTRA_CFLAGS += -DDEBUG endif obj-y := main.o -obj-$(CONFIG_PM_SLEEP) += process.o console.o +obj-$(CONFIG_PM_SLEEP) += console.o +obj-$(CONFIG_FREEZER) += process.o obj-$(CONFIG_HIBERNATION) += swsusp.o disk.o snapshot.o swap.o user.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o -- cgit v1.2.3 From 5a4ccaf37ffece09ef33f1cfec67efa8ee56f967 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 6 Jan 2009 21:15:32 +0100 Subject: kprobes: check CONFIG_FREEZER instead of CONFIG_PM Check CONFIG_FREEZER instead of CONFIG_PM because kprobe booster depends on freeze_processes() and thaw_processes() when CONFIG_PREEMPT=y. This fixes a linkage error which occurs when CONFIG_PREEMPT=y, CONFIG_PM=y and CONFIG_FREEZER=n. Reported-by: Cheng Renquan Signed-off-by: Masami Hiramatsu Signed-off-by: Rafael J. Wysocki Acked-by: Ingo Molnar Signed-off-by: Len Brown --- kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 1b9cbdc0127..7ba8cd9845c 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -123,7 +123,7 @@ static int collect_garbage_slots(void); static int __kprobes check_safety(void) { int ret = 0; -#if defined(CONFIG_PREEMPT) && defined(CONFIG_PM) +#if defined(CONFIG_PREEMPT) && defined(CONFIG_FREEZER) ret = freeze_processes(); if (ret == 0) { struct task_struct *p, *q; -- cgit v1.2.3 From b786c6a98ef6fa81114ba7b9fbfc0d67060775e3 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Sat, 17 Jan 2009 12:04:36 +0100 Subject: relay: fix lock imbalance in relay_late_setup_files One fail path in relay_late_setup_files() omits mutex_unlock(&relay_channels_mutex); Add it. Signed-off-by: Jiri Slaby Signed-off-by: Ingo Molnar --- kernel/relay.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 09ac2008f77..9d79b7854fa 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -663,8 +663,10 @@ int relay_late_setup_files(struct rchan *chan, mutex_lock(&relay_channels_mutex); /* Is chan already set up? */ - if (unlikely(chan->has_base_filename)) + if (unlikely(chan->has_base_filename)) { + mutex_unlock(&relay_channels_mutex); return -EEXIST; + } chan->has_base_filename = 1; chan->parent = parent; curr_cpu = get_cpu(); -- cgit v1.2.3 From 1d4a7f1c4faf53eb9e822743ec8a70b3019a26d2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 18 Jan 2009 16:39:29 +0100 Subject: hrtimers: fix inconsistent lock state on resume in hres_timers_resume Andrey Borzenkov reported this lockdep assert: > [17854.688347] ================================= > [17854.688347] [ INFO: inconsistent lock state ] > [17854.688347] 2.6.29-rc2-1avb #1 > [17854.688347] --------------------------------- > [17854.688347] inconsistent {in-hardirq-W} -> {hardirq-on-W} usage. 
> [17854.688347] pm-suspend/18240 [HC0[0]:SC0[0]:HE1:SE1] takes: > [17854.688347] (&cpu_base->lock){++..}, at: [] retrigger_next_event+0x5c/0xa0 > [17854.688347] {in-hardirq-W} state was registered at: > [17854.688347] [] __lock_acquire+0x79d/0x1930 > [17854.688347] [] lock_acquire+0x5c/0x80 > [17854.688347] [] _spin_lock+0x35/0x70 > [17854.688347] [] hrtimer_run_queues+0x31/0x140 > [17854.688347] [] run_local_timers+0x8/0x20 > [17854.688347] [] update_process_times+0x23/0x60 > [17854.688347] [] tick_periodic+0x24/0x80 > [17854.688347] [] tick_handle_periodic+0x12/0x70 > [17854.688347] [] timer_interrupt+0x14/0x20 > [17854.688347] [] handle_IRQ_event+0x29/0x60 > [17854.688347] [] handle_level_irq+0x69/0xe0 > [17854.688347] [] 0xffffffff > [17854.688347] irq event stamp: 55771 > [17854.688347] hardirqs last enabled at (55771): [] _spin_unlock_irqrestore+0x35/0x60 > [17854.688347] hardirqs last disabled at (55770): [] _spin_lock_irqsave+0x19/0x80 > [17854.688347] softirqs last enabled at (54836): [] __do_softirq+0xc4/0x110 > [17854.688347] softirqs last disabled at (54831): [] do_softirq+0x8e/0xe0 > [17854.688347] > [17854.688347] other info that might help us debug this: > [17854.688347] 3 locks held by pm-suspend/18240: > [17854.688347] #0: (&buffer->mutex){--..}, at: [] sysfs_write_file+0x25/0x100 > [17854.688347] #1: (pm_mutex){--..}, at: [] enter_state+0x4f/0x140 > [17854.688347] #2: (dpm_list_mtx){--..}, at: [] device_pm_lock+0xf/0x20 > [17854.688347] > [17854.688347] stack backtrace: > [17854.688347] Pid: 18240, comm: pm-suspend Not tainted 2.6.29-rc2-1avb #1 > [17854.688347] Call Trace: > [17854.688347] [] ? printk+0x18/0x20 > [17854.688347] [] print_usage_bug+0x16c/0x1d0 > [17854.688347] [] mark_lock+0x8bf/0xc90 > [17854.688347] [] ? pit_next_event+0x2f/0x40 > [17854.688347] [] __lock_acquire+0x580/0x1930 > [17854.688347] [] ? _spin_unlock+0x1d/0x20 > [17854.688347] [] ? pit_next_event+0x2f/0x40 > [17854.688347] [] ? clockevents_program_event+0x98/0x160 > [17854.688347] [] ? mark_held_locks+0x48/0x90 > [17854.688347] [] ? _spin_unlock_irqrestore+0x35/0x60 > [17854.688347] [] ? trace_hardirqs_on_caller+0x139/0x190 > [17854.688347] [] ? trace_hardirqs_on+0xb/0x10 > [17854.688347] [] lock_acquire+0x5c/0x80 > [17854.688347] [] ? retrigger_next_event+0x5c/0xa0 > [17854.688347] [] _spin_lock+0x35/0x70 > [17854.688347] [] ? retrigger_next_event+0x5c/0xa0 > [17854.688347] [] retrigger_next_event+0x5c/0xa0 > [17854.688347] [] hres_timers_resume+0xa/0x10 > [17854.688347] [] timekeeping_resume+0xee/0x150 > [17854.688347] [] __sysdev_resume+0x14/0x50 > [17854.688347] [] sysdev_resume+0x47/0x80 > [17854.688347] [] device_power_up+0xb/0x20 > [17854.688347] [] suspend_devices_and_enter+0xcf/0x150 > [17854.688347] [] ? freeze_processes+0x3f/0x90 > [17854.688347] [] enter_state+0xf4/0x140 > [17854.688347] [] state_store+0x7d/0xc0 > [17854.688347] [] ? state_store+0x0/0xc0 > [17854.688347] [] kobj_attr_store+0x24/0x30 > [17854.688347] [] sysfs_write_file+0x9c/0x100 > [17854.688347] [] vfs_write+0x9c/0x160 > [17854.688347] [] ? restore_nocheck_notrace+0x0/0xe > [17854.688347] [] ? sysfs_write_file+0x0/0x100 > [17854.688347] [] sys_write+0x3d/0x70 > [17854.688347] [] sysenter_do_call+0x12/0x31 Andrey's analysis: > timekeeping_resume() is called via class ->resume > method; and according to comments in sysdev_resume() and > device_power_up(), they are called with interrupts disabled. > > Looking at suspend_enter, irqs *are* disabled at this point. 
> > So it actually looks like something (may be some driver) > unconditionally enabled irqs in resume path. Add a debug check to test this theory. If it triggers then it triggers because the resume code calls it with irqs enabled, which is a no-no not just for timekeeping_resume(), but also bad for a number of other resume handlers. Reported-by: Andrey Borzenkov Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/hrtimer.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 1455b7651b6..cb83c6d4c07 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -614,7 +614,9 @@ void clock_was_set(void) */ void hres_timers_resume(void) { - /* Retrigger the CPU local events: */ + WARN_ONCE(!irqs_disabled(), + KERN_INFO "hres_timers_resume() called with IRQs enabled!"); + retrigger_next_event(NULL); } -- cgit v1.2.3 From f90d4118bacef87894621a3e8aba853fa0c89abc Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Fri, 16 Jan 2009 10:24:10 +0800 Subject: cpuset: fix possible deadlock in async_rebuild_sched_domains Lockdep reported some possible circular locking info when we tested cpuset on NUMA/fake NUMA box. ======================================================= [ INFO: possible circular locking dependency detected ] 2.6.29-rc1-00224-ga652504 #111 ------------------------------------------------------- bash/2968 is trying to acquire lock: (events){--..}, at: [] flush_work+0x24/0xd8 but task is already holding lock: (cgroup_mutex){--..}, at: [] cgroup_lock_live_group+0x12/0x29 which lock already depends on the new lock. ...... ------------------------------------------------------- Steps to reproduce: # mkdir /dev/cpuset # mount -t cpuset xxx /dev/cpuset # mkdir /dev/cpuset/0 # echo 0 > /dev/cpuset/0/cpus # echo 0 > /dev/cpuset/0/mems # echo 1 > /dev/cpuset/0/memory_migrate # cat /dev/zero > /dev/null & # echo $! > /dev/cpuset/0/tasks This is because async_rebuild_sched_domains has the following lock sequence: run_workqueue(async_rebuild_sched_domains) -> do_rebuild_sched_domains -> cgroup_lock But, attaching tasks when memory_migrate is set has following: cgroup_lock_live_group(cgroup_tasks_write) -> do_migrate_pages -> flush_work This patch fixes it by using a separate workqueue thread. Signed-off-by: Miao Xie Signed-off-by: Lai Jiangshan Signed-off-by: Ingo Molnar --- kernel/cpuset.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index a85678865c5..f76db9dcaa0 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -60,6 +60,14 @@ #include #include +/* + * Workqueue for cpuset related tasks. + * + * Using kevent workqueue may cause deadlock when memory_migrate + * is set. So we create a separate workqueue thread for cpuset. + */ +static struct workqueue_struct *cpuset_wq; + /* * Tracks how many cpusets are currently defined in system. 
* When there is only one cpuset (the root cpuset) we can @@ -831,7 +839,7 @@ static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains); */ static void async_rebuild_sched_domains(void) { - schedule_work(&rebuild_sched_domains_work); + queue_work(cpuset_wq, &rebuild_sched_domains_work); } /* @@ -2111,6 +2119,9 @@ void __init cpuset_init_smp(void) hotcpu_notifier(cpuset_track_online_cpus, 0); hotplug_memory_notifier(cpuset_track_online_nodes, 10); + + cpuset_wq = create_singlethread_workqueue("cpuset"); + BUG_ON(!cpuset_wq); } /** -- cgit v1.2.3 From 31ad9081200c06ccc350625d41d1f8b2d1cef29f Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 16 Jan 2009 15:31:15 -0800 Subject: work_on_cpu: don't try to get_online_cpus() in work_on_cpu. Impact: remove potential circular lock dependency with cpu hotplug lock This has caused more problems than it solved, with a pile of cpu hotplug locking issues. Followup patches will get_online_cpus() in callers that need it, but if they don't do it they're no worse than before when they were using set_cpus_allowed without locking. Signed-off-by: Rusty Russell Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- kernel/workqueue.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2f445833ae3..a35afdbc016 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -991,8 +991,8 @@ static void do_work_for_cpu(struct work_struct *w) * @fn: the function to run * @arg: the function arg * - * This will return -EINVAL in the cpu is not online, or the return value - * of @fn otherwise. + * This will return the value @fn returns. + * It is up to the caller to ensure that the cpu doesn't go offline. */ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) { @@ -1001,14 +1001,8 @@ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) INIT_WORK(&wfc.work, do_work_for_cpu); wfc.fn = fn; wfc.arg = arg; - get_online_cpus(); - if (unlikely(!cpu_online(cpu))) - wfc.ret = -EINVAL; - else { - schedule_work_on(cpu, &wfc.work); - flush_work(&wfc.work); - } - put_online_cpus(); + schedule_work_on(cpu, &wfc.work); + flush_work(&wfc.work); return wfc.ret; } -- cgit v1.2.3 From 8ccad40df8d314f786fdb06bdbedd4f43f3257cd Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 16 Jan 2009 15:31:15 -0800 Subject: work_on_cpu: Use our own workqueue. Impact: remove potential clashes with generic kevent workqueue Annoyingly, some places we want to use work_on_cpu are already in workqueues. As per Ingo's suggestion, we create a different workqueue for work_on_cpu. 
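As a usage sketch (the callee and its helper here are hypothetical, not part of the patch), a caller now goes through the dedicated workqueue implicitly:

	/* runs on the target cpu, queued on the work_on_cpu workqueue */
	static long read_counter_on_cpu(void *arg)
	{
		return read_counter(arg);	/* hypothetical helper */
	}

	/* per the previous patch, the caller must keep 'cpu' online */
	ret = work_on_cpu(cpu, read_counter_on_cpu, &data);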
Signed-off-by: Rusty Russell Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- kernel/workqueue.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a35afdbc016..1f0c509b40d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -971,6 +971,8 @@ undo: } #ifdef CONFIG_SMP +static struct workqueue_struct *work_on_cpu_wq __read_mostly; + struct work_for_cpu { struct work_struct work; long (*fn)(void *); @@ -1001,7 +1003,7 @@ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) INIT_WORK(&wfc.work, do_work_for_cpu); wfc.fn = fn; wfc.arg = arg; - schedule_work_on(cpu, &wfc.work); + queue_work_on(cpu, work_on_cpu_wq, &wfc.work); flush_work(&wfc.work); return wfc.ret; @@ -1019,4 +1021,8 @@ void __init init_workqueues(void) hotcpu_notifier(workqueue_cpu_callback, 0); keventd_wq = create_workqueue("events"); BUG_ON(!keventd_wq); +#ifdef CONFIG_SMP + work_on_cpu_wq = create_workqueue("work_on_cpu"); + BUG_ON(!work_on_cpu_wq); +#endif } -- cgit v1.2.3 From 082605de5f82eb692cc90f7fda071cc01bb5ac34 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 19 Jan 2009 14:32:51 -0500 Subject: ring-buffer: fix alignment problem Impact: fix to allow some archs to use the ring buffer Commits in the ring buffer are checked by pointer arithmetic. If the calculation is incorrect, then the commits will never take place and the buffer will simply fill up and report an error. Each page in the ring buffer has a small header: struct buffer_data_page { u64 time_stamp; local_t commit; unsigned char data[]; }; Unfortunately, some of the calculations used sizeof(struct buffer_data_page) to know the size of the header. But this is incorrect on some archs, where sizeof(struct buffer_data_page) does not equal offsetof(struct buffer_data_page, data), and on those archs the commits are never processed. This patch replaces the sizeof with offsetof. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 8b0daf0662e..1d6526361d0 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -246,7 +246,7 @@ static inline int test_time_stamp(u64 delta) return 0; } -#define BUF_PAGE_SIZE (PAGE_SIZE - sizeof(struct buffer_data_page)) +#define BUF_PAGE_SIZE (PAGE_SIZE - offsetof(struct buffer_data_page, data)) /* * head_page == tail_page && head == tail then buffer is empty. -- cgit v1.2.3 From cdf57cab27aef72f13a19c86858c6cac9951dc24 Mon Sep 17 00:00:00 2001 From: Adrian McMenamin Date: Wed, 21 Jan 2009 18:47:38 +0900 Subject: dma-coherent: per-device coherent area is in pages, not bytes. Commit 58c6d3dfe436eb8cfb451981d8fdc9044eaf42da ("dma-coherent: catch oversized requests to dma_alloc_from_coherent()") attempted to add a sanity check to bail out on allocations larger than the coherent area. Unfortunately, when this was implemented, the fact that the coherent area is tracked in pages rather than bytes was overlooked, which subsequently broke every single dma_alloc_from_coherent() user, forcing the allocation silently through generic memory instead.
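The unit mismatch, sketched: mem->size counts pages while the requested size is in bytes, so the bound has to be scaled before comparing:

	/* 'size' is in bytes, mem->size is in pages */
	if (size > (mem->size << PAGE_SHIFT))	/* correct: bytes vs bytes */
		goto fail;
	if (size > mem->size)			/* broken: bytes vs pages */
		goto fail;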
Signed-off-by: Adrian McMenamin Signed-off-by: Paul Mundt --- kernel/dma-coherent.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c index 038707404b7..38fa292c6aa 100644 --- a/kernel/dma-coherent.c +++ b/kernel/dma-coherent.c @@ -118,8 +118,8 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size, mem = dev->dma_mem; if (!mem) return 0; - if (unlikely(size > mem->size)) - return 0; + if (unlikely(size > (mem->size << PAGE_SHIFT))) + return 0; pageno = bitmap_find_free_region(mem->bitmap, mem->size, order); if (pageno >= 0) { -- cgit v1.2.3 From 0609697eab9775564845d4c94f9e3780fb791ffd Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Wed, 21 Jan 2009 18:51:53 +0900 Subject: dma-coherent: Restore dma_alloc_from_coherent() large alloc fall back policy. When doing large allocations (larger than the per-device coherent area) the generic memory allocators are silently fallen back on regardless of consideration for the per-device constraints. In the DMA_MEMORY_EXCLUSIVE case falling back on generic memory is not an option, as it tends not to be addressable by the DMA hardware in question. This issue showed up with the 8139too breakage on the Dreamcast, where non-addressable buffers were silently allocated due to the size mismatch calculation -- while it should have simply errored out upon being unable to satisfy the allocation with the given device constraints. This restores fall back behaviour to what it was before the oversized request change caused multiple regressions. Signed-off-by: Paul Mundt --- kernel/dma-coherent.c | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c index 38fa292c6aa..962a3b574f2 100644 --- a/kernel/dma-coherent.c +++ b/kernel/dma-coherent.c @@ -98,7 +98,7 @@ EXPORT_SYMBOL(dma_mark_declared_memory_occupied); * @size: size of requested memory area * @dma_handle: This will be filled with the correct dma handle * @ret: This pointer will be filled with the virtual address - * to allocated area. + * to allocated area. * * This function should be only called from per-arch dma_alloc_coherent() * to support allocation from per-device coherent memory pools. @@ -118,31 +118,32 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size, mem = dev->dma_mem; if (!mem) return 0; + + *ret = NULL; + if (unlikely(size > (mem->size << PAGE_SHIFT))) - return 0; + goto err; pageno = bitmap_find_free_region(mem->bitmap, mem->size, order); - if (pageno >= 0) { - /* - * Memory was found in the per-device arena. - */ - *dma_handle = mem->device_base + (pageno << PAGE_SHIFT); - *ret = mem->virt_base + (pageno << PAGE_SHIFT); - memset(*ret, 0, size); - } else if (mem->flags & DMA_MEMORY_EXCLUSIVE) { - /* - * The per-device arena is exhausted and we are not - * permitted to fall back to generic memory. - */ - *ret = NULL; - } else { - /* - * The per-device arena is exhausted and we are - * permitted to fall back to generic memory. - */ - return 0; - } + if (unlikely(pageno < 0)) + goto err; + + /* + * Memory was found in the per-device area. + */ + *dma_handle = mem->device_base + (pageno << PAGE_SHIFT); + *ret = mem->virt_base + (pageno << PAGE_SHIFT); + memset(*ret, 0, size); + return 1; + +err: + /* + * In the case where the allocation can not be satisfied from the + * per-device area, try to fall back to generic memory if the + * constraints allow it. 
+ */ + return mem->flags & DMA_MEMORY_EXCLUSIVE; } EXPORT_SYMBOL(dma_alloc_from_coherent); -- cgit v1.2.3 From 00f57f545afa422db3003b0d0b30a30f8de7ecb2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 14 Jan 2009 13:33:27 -0800 Subject: tracing/function-graph-tracer: fix a regression while suspending to disk Impact: fix a crash during kernel image restore When the function graph tracer is running while suspending to disk, some racy and dangerous things happen with this tracer. The current task will save its registers, including the stack pointer, which contains the return address hooked by the tracer. But the current task will continue to enter other functions after that to save the memory, and then it will store other return addresses, and finally lose the old depth which matches the return address saved in the old stack (during the registers saving). So on image restore, the code will return to wrong addresses. And there are other things: on restore, the task will have its "current" pointer overwritten during register restoring... switching from one task to another... It would be insane to try to trace function graphs at these stages. This patch makes the function graph tracer listen for power events, disabling its tracing for the current task (the one that performs the hibernation work) while suspending/resuming to disk, making the tracing safe during hibernation. Signed-off-by: Frederic Weisbecker Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2f32969c09d..7dcf6e9f2b0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -1965,6 +1966,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, #ifdef CONFIG_FUNCTION_GRAPH_TRACER static atomic_t ftrace_graph_active; +static struct notifier_block ftrace_suspend_notifier; int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) { @@ -2043,6 +2045,27 @@ static int start_graph_tracing(void) return ret; } +/* + * Hibernation protection. + * The state of the current task is too unstable during + * suspend/restore to disk. We want to protect against that.
+ */ +static int +ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state, + void *unused) +{ + switch (state) { + case PM_HIBERNATION_PREPARE: + pause_graph_tracing(); + break; + + case PM_POST_HIBERNATION: + unpause_graph_tracing(); + break; + } + return NOTIFY_DONE; +} + int register_ftrace_graph(trace_func_graph_ret_t retfunc, trace_func_graph_ent_t entryfunc) { @@ -2050,6 +2073,9 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, mutex_lock(&ftrace_sysctl_lock); + ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call; + register_pm_notifier(&ftrace_suspend_notifier); + atomic_inc(&ftrace_graph_active); ret = start_graph_tracing(); if (ret) { @@ -2075,6 +2101,7 @@ void unregister_ftrace_graph(void) ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = ftrace_graph_entry_stub; ftrace_shutdown(FTRACE_STOP_FUNC_RET); + unregister_pm_notifier(&ftrace_suspend_notifier); mutex_unlock(&ftrace_sysctl_lock); } -- cgit v1.2.3 From 551b4048b3d4acf15aff9fe4aed89b892c135b02 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 12 Jan 2009 11:06:18 +0800 Subject: ring_buffer: reset write when reserve buffer fails Impact: reset struct buffer_page.write when interrupt storm If struct buffer_page.write is not reset, any subsequent commit will corrupt the ring_buffer: static inline void rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) { ...... cpu_buffer->commit_page->commit = cpu_buffer->commit_page->write; ...... } When we hit "if (RB_WARN_ON(cpu_buffer, next_page == reader_page))", the ring_buffer is disabled, but some reserved buffers may not have been committed yet; we need to reset struct buffer_page.write. When we hit "if (unlikely(next_page == cpu_buffer->commit_page))", the ring_buffer is still available, so we should not corrupt it. Signed-off-by: Lai Jiangshan Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1d6526361d0..9c1e73da4e3 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1025,12 +1025,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, } if (next_page == head_page) { - if (!(buffer->flags & RB_FL_OVERWRITE)) { - /* reset write */ - if (tail <= BUF_PAGE_SIZE) - local_set(&tail_page->write, tail); + if (!(buffer->flags & RB_FL_OVERWRITE)) goto out_unlock; - } /* tail_page has not moved yet? */ if (tail_page == cpu_buffer->tail_page) { @@ -1105,6 +1101,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, return event; out_unlock: + /* reset write */ + if (tail <= BUF_PAGE_SIZE) + local_set(&tail_page->write, tail); + __raw_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); return NULL; -- cgit v1.2.3 From faf6861ebd776871e77b761c43ec045cd20b5716 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 14 Jan 2009 12:24:42 -0500 Subject: trace: print ftrace_dump at KERN_EMERG log level Impact: fix to print out ftrace_dump when expected I was debugging a hard race condition, only to find out that after I hit the race, my log level was not set at a level that shows KERN_INFO. The time it took to trigger the race was wasted because I did not capture the trace.
Since ftrace_dump is only called from kernel oops (and only when it is set in the kernel command line to do so), or when a developer adds it to their own local tree, the log level of the print should be at KERN_EMERG to make sure the print appears. ftrace_dump is not called by a normal user setup, and will not add extra unwanted print out to the console. There is no reason it should be at KERN_INFO. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c580233add9..1a1c5a6ab24 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3736,7 +3736,7 @@ static struct notifier_block trace_die_notifier = { * it if we decide to change what log level the ftrace dump * should be at. */ -#define KERN_TRACE KERN_INFO +#define KERN_TRACE KERN_EMERG static void trace_printk_seq(struct trace_seq *s) -- cgit v1.2.3 From a442e5e0a2011af5b2d1f118fee0a8f9079f1d88 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 14 Jan 2009 14:50:19 -0500 Subject: trace: stop all recording to ring buffer on ftrace_dump Impact: limit ftrace dump output Currently ftrace_dump only calls ftrace_kill that is a fast way to prevent the function tracer functions from being called (just sets a flag and clears the function to call, nothing else). It is better to also turn off any recording to the ring buffers as well. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1a1c5a6ab24..4d89e84f0f4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3770,6 +3770,7 @@ void ftrace_dump(void) dump_ran = 1; /* No turning back! */ + tracing_off(); ftrace_kill(); for_each_tracing_cpu(cpu) { -- cgit v1.2.3 From 1092307d582a7566d23779c304cf86f3075ac5f0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 15 Jan 2009 23:40:11 -0500 Subject: trace: set max latency variable to zero on default Impact: trace max latencies on start of latency tracing This patch sets the max latency to zero whenever one of the irq variant tracers or the wakeup tracer is set to current tracer. Most developers expect to see output when starting up a latency tracer. But since the max_latency is already set to max, and it takes a latency greater than max_latency to be recorded, there is no trace. This is not the expected behavior and has even confused myself. 
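The recording condition in question is, roughly (a sketch of the latency-tracer pattern, not the exact code):

	delta = T1 - T0;		/* measured irqs-off/wakeup latency */
	if (delta <= tracing_max_latency)
		return;			/* with the max preset to ULONG_MAX,
					   this never records anything */
	tracing_max_latency = delta;	/* new max: snapshot the trace */
	update_max_tr(tr, current, cpu);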
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 2 +- kernel/trace/trace_irqsoff.c | 1 + kernel/trace/trace_sched_wakeup.c | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4d89e84f0f4..17bb88d86ac 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -40,7 +40,7 @@ #define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE) -unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; +unsigned long __read_mostly tracing_max_latency; unsigned long __read_mostly tracing_thresh; /* diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 7c2e326bbc8..62a78d94353 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -380,6 +380,7 @@ static void stop_irqsoff_tracer(struct trace_array *tr) static void __irqsoff_tracer_init(struct trace_array *tr) { + tracing_max_latency = 0; irqsoff_trace = tr; /* make sure that the tracer is visible */ smp_wmb(); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 43586b689e3..42ae1e77b6b 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -333,6 +333,7 @@ static void stop_wakeup_tracer(struct trace_array *tr) static int wakeup_tracer_init(struct trace_array *tr) { + tracing_max_latency = 0; wakeup_trace = tr; start_wakeup_tracer(tr); return 0; -- cgit v1.2.3 From 91a8d07d82cac3aae3ef2ea1aaba5c9c4a934e91 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 21 Jan 2009 18:45:57 -0500 Subject: ring-buffer: reset timestamps when ring buffer is reset Impact: fix bad times of recent resets The ring buffer needs to reset its timestamps when reseting of the buffer, otherwise the timestamps are stale and might be used to calculate times in the buffer causing funny timestamps to appear. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9c1e73da4e3..bd38c5cfd8a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2174,6 +2174,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->overrun = 0; cpu_buffer->entries = 0; + + cpu_buffer->write_stamp = 0; + cpu_buffer->read_stamp = 0; } /** -- cgit v1.2.3 From 3a9f84d354ce1e19956083c8e691727dea33bd5a Mon Sep 17 00:00:00 2001 From: Ed Swierk Date: Mon, 26 Jan 2009 15:33:31 -0800 Subject: signals, debug: fix BUG: using smp_processor_id() in preemptible code in print_fatal_signal() With print-fatal-signals=1 on a kernel with CONFIG_PREEMPT=y, sending an unexpected signal to a process causes a BUG: using smp_processor_id() in preemptible code. get_signal_to_deliver() releases the siglock before calling print_fatal_signal(), which calls show_regs(), which calls smp_processor_id(), which is not supposed to be called from a preemptible thread. Make sure show_regs() runs with preemption disabled. 
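The underlying rule: smp_processor_id() is only meaningful while preemption is off, so a debug path that may call it needs a disable/enable pair around it, e.g.:

	preempt_disable();	/* pin this task to the current cpu */
	show_regs(regs);	/* may use smp_processor_id() internally */
	preempt_enable();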
Signed-off-by: Ed Swierk Signed-off-by: Ingo Molnar --- kernel/signal.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index e73759783dc..b6b36768b75 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -909,7 +909,9 @@ static void print_fatal_signal(struct pt_regs *regs, int signr) } #endif printk("\n"); + preempt_disable(); show_regs(regs); + preempt_enable(); } static int __init setup_print_fatal_signals(char *str) -- cgit v1.2.3 From abfe2d7b915c872f3a1fd203267cedebf90daa45 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 19 Jan 2009 20:54:54 +0100 Subject: Hibernation: Introduce system_entering_hibernation Introduce boolean function system_entering_hibernation() returning 'true' during the last phase of hibernation, in which devices are being put into low power states and the sleep state (for example, ACPI S4) is finally entered. Some device drivers need such a function to check if the system is in the final phase of hibernation. In particular, some SATA drivers are going to use it for blacklisting systems in which the disks should not be spun down during the last phase of hibernation (the BIOS will do that anyway). Signed-off-by: Rafael J. Wysocki Signed-off-by: Jeff Garzik --- kernel/power/disk.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 45e8541ab7e..432ee575c9e 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -71,6 +71,14 @@ void hibernation_set_ops(struct platform_hibernation_ops *ops) mutex_unlock(&pm_mutex); } +static bool entering_platform_hibernation; + +bool system_entering_hibernation(void) +{ + return entering_platform_hibernation; +} +EXPORT_SYMBOL(system_entering_hibernation); + #ifdef CONFIG_PM_DEBUG static void hibernation_debug_sleep(void) { @@ -411,6 +419,7 @@ int hibernation_platform_enter(void) if (error) goto Close; + entering_platform_hibernation = true; suspend_console(); error = device_suspend(PMSG_HIBERNATE); if (error) { @@ -445,6 +454,7 @@ int hibernation_platform_enter(void) Finish: hibernation_ops->finish(); Resume_devices: + entering_platform_hibernation = false; device_resume(PMSG_RESTORE); resume_console(); Close: -- cgit v1.2.3 From 1267a8df209c7453d65acbdd56e3588954bf890b Mon Sep 17 00:00:00 2001 From: David Daney Date: Tue, 27 Jan 2009 09:53:21 -0800 Subject: Make irq_*_affinity depend on CONFIG_GENERIC_HARDIRQS too. In interrupt.h these functions are declared only if CONFIG_GENERIC_HARDIRQS is set. We should define them under identical conditions. Signed-off-by: David Daney Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index cd0cd8dcb34..618a64f1915 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -15,7 +15,7 @@ #include "internals.h" -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) cpumask_var_t irq_default_affinity; static int init_irq_default_affinity(void) -- cgit v1.2.3 From 97179fd46da7ddedd18e95388130ed3e06c5a0c7 Mon Sep 17 00:00:00 2001 From: David Daney Date: Tue, 27 Jan 2009 09:53:22 -0800 Subject: cpumask fallout: Initialize irq_default_affinity earlier Move the initialization of irq_default_affinity to early_irq_init as core_initcall is too late. irq_default_affinity can be used in init_IRQ and potentially timer and SMP init as well. All of these happen before core_initcall. 
Moving the initialization to early_irq_init ensures that it is initialized before it is used. Signed-off-by: David Daney Acked-by: Mike Travis Signed-off-by: Linus Torvalds --- kernel/irq/handle.c | 16 ++++++++++++++++ kernel/irq/manage.c | 8 -------- 2 files changed, 16 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index c20db0be917..3aba8d12f32 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -39,6 +39,18 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc) ack_bad_irq(irq); } +#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) +static void __init init_irq_default_affinity(void) +{ + alloc_bootmem_cpumask_var(&irq_default_affinity); + cpumask_setall(irq_default_affinity); +} +#else +static void __init init_irq_default_affinity(void) +{ +} +#endif + /* * Linux has a controller-independent interrupt architecture. * Every controller has a 'controller-template', that is used @@ -134,6 +146,8 @@ int __init early_irq_init(void) int legacy_count; int i; + init_irq_default_affinity(); + desc = irq_desc_legacy; legacy_count = ARRAY_SIZE(irq_desc_legacy); @@ -219,6 +233,8 @@ int __init early_irq_init(void) int count; int i; + init_irq_default_affinity(); + desc = irq_desc; count = ARRAY_SIZE(irq_desc); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 618a64f1915..291f0366455 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -18,14 +18,6 @@ #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) cpumask_var_t irq_default_affinity; -static int init_irq_default_affinity(void) -{ - alloc_cpumask_var(&irq_default_affinity, GFP_KERNEL); - cpumask_setall(irq_default_affinity); - return 0; -} -core_initcall(init_irq_default_affinity); - /** * synchronize_irq - wait for pending IRQ handlers (on other CPUs) * @irq: interrupt number to wait for -- cgit v1.2.3 From baef99a08a2e23d9386b47e53fa5f0d44fc98f66 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 29 Jan 2009 14:25:10 -0800 Subject: cgroups: use hierarchy mutex in creation failure path Now, cgrp->sibling is handled under hierarchy mutex. error route should do so, too. Signed-off-by: KAMEZAWA Hiroyuki Cc: Li Zefan Acked-by Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c29831076e7..2ae7cb47dbf 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2434,7 +2434,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, err_remove: + cgroup_lock_hierarchy(root); list_del(&cgrp->sibling); + cgroup_unlock_hierarchy(root); root->number_of_cgroups--; err_destroy: -- cgit v1.2.3 From 1404f06565ee89e0ce04d4a5859c00b0e3a0dc8d Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 29 Jan 2009 14:25:21 -0800 Subject: cgroups: fix lock inconsistency in cgroup_clone() I fixed a bug in cgroup_clone() in Linus' tree in commit 7b574b7 ("cgroups: fix a race between cgroup_clone and umount") without noticing there was a cleanup patch in -mm tree that should be rebased (now commit 104cbd5, "cgroups: use task_lock() for access tsk->cgroups safe in cgroup_clone()"), thus resulted in lock inconsistency. 
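The resulting ordering, sketched (error label simplified): pin the hierarchy first, then take task_lock() to snapshot the task's css_set, so the two lock classes always nest the same way:

	mutex_lock(&cgroup_mutex);
	if (!atomic_inc_not_zero(&root->sb->s_active))
		goto out_unlock;	/* racing with the final umount */
	task_lock(tsk);			/* only now snapshot the task */
	parent = task_cgroup(tsk, subsys->subsys_id);
	cg = tsk->cgroups;
	get_css_set(cg);
	task_unlock(tsk);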
Signed-off-by: Li Zefan Acked-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2ae7cb47dbf..0066092de19 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2993,20 +2993,21 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, mutex_unlock(&cgroup_mutex); return 0; } - task_lock(tsk); - cg = tsk->cgroups; - parent = task_cgroup(tsk, subsys->subsys_id); /* Pin the hierarchy */ - if (!atomic_inc_not_zero(&parent->root->sb->s_active)) { + if (!atomic_inc_not_zero(&root->sb->s_active)) { /* We race with the final deactivate_super() */ mutex_unlock(&cgroup_mutex); return 0; } /* Keep the cgroup alive */ + task_lock(tsk); + parent = task_cgroup(tsk, subsys->subsys_id); + cg = tsk->cgroups; get_css_set(cg); task_unlock(tsk); + mutex_unlock(&cgroup_mutex); /* Now do the VFS work to create a cgroup */ @@ -3045,7 +3046,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, mutex_unlock(&inode->i_mutex); put_css_set(cg); - deactivate_super(parent->root->sb); + deactivate_super(root->sb); /* The cgroup is still accessible in the VFS, but * we're not going to try to rmdir() it at this * point. */ @@ -3071,7 +3072,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, mutex_lock(&cgroup_mutex); put_css_set(cg); mutex_unlock(&cgroup_mutex); - deactivate_super(parent->root->sb); + deactivate_super(root->sb); return ret; } -- cgit v1.2.3 From 804b3c28a4e4fa1c224571bf76edb534b9c4b1ed Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Thu, 29 Jan 2009 14:25:21 -0800 Subject: cgroups: add cpu_relax() calls in css_tryget() and cgroup_clear_css_refs() css_tryget() and cgroup_clear_css_refs() contain polling loops; these loops should have cpu_relax calls in them to reduce cross-cache traffic. Signed-off-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0066092de19..492215d67fa 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2509,7 +2509,7 @@ static int cgroup_clear_css_refs(struct cgroup *cgrp) for_each_subsys(cgrp->root, ss) { struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; int refcnt; - do { + while (1) { /* We can only remove a CSS with a refcnt==1 */ refcnt = atomic_read(&css->refcnt); if (refcnt > 1) { @@ -2523,7 +2523,10 @@ static int cgroup_clear_css_refs(struct cgroup *cgrp) * css_tryget() to spin until we set the * CSS_REMOVED bits or abort */ - } while (atomic_cmpxchg(&css->refcnt, refcnt, 0) != refcnt); + if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt) + break; + cpu_relax(); + } } done: for_each_subsys(cgrp->root, ss) { -- cgit v1.2.3 From 839ec5452ebfd5905b9c69b20ceb640903a8ea1a Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Thu, 29 Jan 2009 14:25:22 -0800 Subject: cgroup: fix root_count when mount fails due to busy subsystem root_count was being incremented in cgroup_get_sb() after all error checking was complete, but decremented in cgroup_kill_sb(), which can be called on a superblock that we gave up on due to an error. This patch changes cgroup_kill_sb() to only decrement root_count if the root was previously linked into the list of roots. 
Signed-off-by: Paul Menage Tested-by: Serge Hallyn Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 492215d67fa..5a54ff42874 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1115,8 +1115,10 @@ static void cgroup_kill_sb(struct super_block *sb) { } write_unlock(&css_set_lock); - list_del(&root->root_list); - root_count--; + if (!list_empty(&root->root_list)) { + list_del(&root->root_list); + root_count--; + } mutex_unlock(&cgroup_mutex); -- cgit v1.2.3 From d7240b988017521ebf89edfadd42c0942f166850 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 29 Jan 2009 10:08:01 -0500 Subject: generic-ipi: use per cpu data for single cpu ipi calls smp_call_function() can be passed a wait parameter telling it to wait for all the functions running on other CPUs to complete before returning, or to return without waiting. Unfortunately, this is currently just a suggestion and not mandatory. That is, smp_call_function() can decide not to return and wait instead. The reason for this is that it uses kmalloc to allocate storage to send to the called CPU, and that CPU will free it when it is done. But if we fail to allocate the storage, the stack is used instead. This means we must wait for the called CPU to finish before continuing. Unfortunately, some callers do not abide by this hint and act as if the non-wait option is mandatory. The MTRR code for instance will deadlock if smp_call_function() is set to wait. This is because smp_call_function() will wait for the other CPUs to finish their called functions, but those functions are waiting on the caller to continue. This patch changes the generic smp_call_function code to use per cpu variables if the allocation of the data fails for a single CPU call. smp_call_function_many() will fall back to smp_call_function_single() if it fails its alloc. smp_call_function_single() is modified to not force the wait state. Since we are now using a single data structure per cpu, we must synchronize the callers to prevent a second caller from modifying the data before the first called IPI function completes. To do so, I added a flag to the call_single_data called CSD_FLAG_LOCK. When the single CPU is called (which can happen when a many call fails an alloc), we set the LOCK bit on this per cpu data. When the called CPU finishes with the data, it clears the LOCK bit. A caller must wait till the LOCK bit is cleared before setting it. When it is cleared, there is no IPI function using it.
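The protocol, distilled (a sketch; the diff below has the real code):

	/* caller, when kmalloc fails: */
	data = &per_cpu(csd_data, me);
	while (data->flags & CSD_FLAG_LOCK)	/* previous user done? */
		cpu_relax();
	data->flags = CSD_FLAG_LOCK;		/* claim the per-cpu slot */
	/* ... fill in data and send the IPI ... */

	/* IPI handler on the target cpu, once finished with the data: */
	data->flags &= ~CSD_FLAG_LOCK;		/* release the slot */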
Signed-off-by: Steven Rostedt Signed-off-by: Peter Zijlstra Acked-by: Jens Axboe Acked-by: Linus Torvalds Signed-off-by: Ingo Molnar --- kernel/smp.c | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 5cfa0e5e3e8..bbedbb7efe3 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -18,6 +18,7 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(call_function_lock); enum { CSD_FLAG_WAIT = 0x01, CSD_FLAG_ALLOC = 0x02, + CSD_FLAG_LOCK = 0x04, }; struct call_function_data { @@ -186,6 +187,9 @@ void generic_smp_call_function_single_interrupt(void) if (data_flags & CSD_FLAG_WAIT) { smp_wmb(); data->flags &= ~CSD_FLAG_WAIT; + } else if (data_flags & CSD_FLAG_LOCK) { + smp_wmb(); + data->flags &= ~CSD_FLAG_LOCK; } else if (data_flags & CSD_FLAG_ALLOC) kfree(data); } @@ -196,6 +200,8 @@ void generic_smp_call_function_single_interrupt(void) } } +static DEFINE_PER_CPU(struct call_single_data, csd_data); + /* * smp_call_function_single - Run a function on a specific CPU * @func: The function to run. This must be fast and non-blocking. @@ -224,14 +230,38 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, func(info); local_irq_restore(flags); } else if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { - struct call_single_data *data = NULL; + struct call_single_data *data; if (!wait) { + /* + * We are calling a function on a single CPU + * and we are not going to wait for it to finish. + * We first try to allocate the data, but if we + * fail, we fall back to use a per cpu data to pass + * the information to that CPU. Since all callers + * of this code will use the same data, we must + * synchronize the callers to prevent a new caller + * from corrupting the data before the callee + * can access it. + * + * The CSD_FLAG_LOCK is used to let us know when + * the IPI handler is done with the data. + * The first caller will set it, and the callee + * will clear it. The next caller must wait for + * it to clear before we set it again. This + * will make sure the callee is done with the + * data before a new caller will use it. + */ data = kmalloc(sizeof(*data), GFP_ATOMIC); if (data) data->flags = CSD_FLAG_ALLOC; - } - if (!data) { + else { + data = &per_cpu(csd_data, me); + while (data->flags & CSD_FLAG_LOCK) + cpu_relax(); + data->flags = CSD_FLAG_LOCK; + } + } else { data = &d; data->flags = CSD_FLAG_WAIT; } -- cgit v1.2.3 From 7f22391cbe82a80a9f891d8bd10fc28ff248d1e2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 22 Dec 2008 02:24:48 +0100 Subject: hrtimers: increase clock min delta threshold while interrupt hanging Impact: avoid timer IRQ hanging slow systems While using the function graph tracer on a virtualized system, the hrtimer_interrupt can hang the system in an infinite loop. This can be caused in several situations: - the hardware is very slow and HZ is set too high - something intrusive is slowing the system down (tracing under emulation) ... and the next clock events to program are always before the current time. This patch implements a reasonable compromise: if such a situation is detected, we devote 1/4 of the CPU's time to processing the hrtimer interrupts. This is enough to keep the system running without serious starvation. It has been successfully tested under VirtualBox with 1000 HZ and 100 HZ with the function graph tracer running. In both cases, the clock event intervals were increased to about 25 ms periodic ticks, which means 40 HZ.
So we change a hard-to-debug hang into a warning message and a system that still manages to limp along. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/hrtimer.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index f33afb0407b..8fea312ca36 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1158,6 +1158,29 @@ static void __run_hrtimer(struct hrtimer *timer) #ifdef CONFIG_HIGH_RES_TIMERS +static int force_clock_reprogram; + +/* + * After 5 retry attempts, we consider that hrtimer_interrupt() + * is hanging, which can happen with something that slows the interrupt + * down, such as tracing. Then we force the clock reprogramming for each + * future hrtimer interrupt to avoid infinite loops, and use the min_delta_ns + * threshold that we will overwrite. + * The next tick event will be scheduled at 3 times the time we currently + * spend in hrtimer_interrupt(). This gives a good compromise: the cpus will + * spend 1/4 of their time processing the hrtimer interrupts. This is enough + * to keep the system running without serious starvation. + */ + +static inline void +hrtimer_interrupt_hanging(struct clock_event_device *dev, + ktime_t try_time) +{ + force_clock_reprogram = 1; + dev->min_delta_ns = (unsigned long)try_time.tv64 * 3; + printk(KERN_WARNING "hrtimer: interrupt too slow, " + "forcing clock min delta to %lu ns\n", dev->min_delta_ns); +} /* * High resolution timer interrupt * Called with interrupts disabled */ @@ -1167,6 +1190,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); struct hrtimer_clock_base *base; ktime_t expires_next, now; + int nr_retries = 0; int i; BUG_ON(!cpu_base->hres_active); @@ -1174,6 +1198,10 @@ void hrtimer_interrupt(struct clock_event_device *dev) dev->next_event.tv64 = KTIME_MAX; retry: + /* 5 retries is enough to notice a hang */ + if (!(++nr_retries % 5)) + hrtimer_interrupt_hanging(dev, ktime_sub(ktime_get(), now)); + now = ktime_get(); expires_next.tv64 = KTIME_MAX; @@ -1226,7 +1254,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) /* Reprogramming necessary ? */ if (expires_next.tv64 != KTIME_MAX) { - if (tick_program_event(expires_next, 0)) + if (tick_program_event(expires_next, force_clock_reprogram)) goto retry; } } -- cgit v1.2.3 From 94df7de0289bc2df3d6e85cd2ece52bf42682f45 Mon Sep 17 00:00:00 2001 From: Sebastien Dugue Date: Mon, 1 Dec 2008 14:09:07 +0100 Subject: hrtimers: allow the hot-unplugging of all cpus Impact: fix CPU hotplug hang on Power6 testbox On architectures that support offlining all cpus (at least powerpc/pseries), hot-unplugging the tick_do_timer_cpu can result in a system hang. This comes from the fact that if the cpu going down happens to be the cpu doing the tick, then as the tick_do_timer_cpu handover happens after the cpu is dead (via the CPU_DEAD notification), we're left without ticks, jiffies are frozen and any task relying on timers (msleep, ...) is stuck. That's particularly the case for the cpu looping in __cpu_die() waiting for the dying cpu to be dead. This patch addresses this by having the tick_do_timer_cpu handover happen earlier, during the CPU_DYING notification. For this, a new clockevent notification type is introduced (CLOCK_EVT_NOTIFY_CPU_DYING) which is triggered in hrtimer_cpu_notify().
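The notification ordering after the change, sketched: the do_timer duty is handed over while the dying cpu is still visible, before CPU_DEAD tears its tick device down:

	case CPU_DYING:		/* dying cpu still online: hand over */
		clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DYING, &scpu);
		break;
	case CPU_DEAD:		/* previously the (too late) handover point */
		clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu);
		break;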
Signed-off-by: Sebastien Dugue Cc: Signed-off-by: Ingo Molnar --- kernel/hrtimer.c | 4 ++++ kernel/time/tick-common.c | 26 +++++++++++++++++++------- 2 files changed, 23 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 8fea312ca36..647a40e2fea 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1608,6 +1608,10 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, break; #ifdef CONFIG_HOTPLUG_CPU + case CPU_DYING: + case CPU_DYING_FROZEN: + clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DYING, &scpu); + break; case CPU_DEAD: case CPU_DEAD_FROZEN: { diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 63e05d423a0..21a5ca84951 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -273,6 +273,21 @@ out_bc: return ret; } +/* + * Transfer the do_timer job away from a dying cpu. + * + * Called with interrupts disabled. + */ +static void tick_handover_do_timer(int *cpup) +{ + if (*cpup == tick_do_timer_cpu) { + int cpu = cpumask_first(cpu_online_mask); + + tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu : + TICK_DO_TIMER_NONE; + } +} + /* * Shutdown an event device on a given cpu: * @@ -297,13 +312,6 @@ static void tick_shutdown(unsigned int *cpup) clockevents_exchange_device(dev, NULL); td->evtdev = NULL; } - /* Transfer the do_timer job away from this cpu */ - if (*cpup == tick_do_timer_cpu) { - int cpu = cpumask_first(cpu_online_mask); - - tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu : - TICK_DO_TIMER_NONE; - } spin_unlock_irqrestore(&tick_device_lock, flags); } @@ -357,6 +365,10 @@ static int tick_notify(struct notifier_block *nb, unsigned long reason, tick_broadcast_oneshot_control(reason); break; + case CLOCK_EVT_NOTIFY_CPU_DYING: + tick_handover_do_timer(dev); + break; + case CLOCK_EVT_NOTIFY_CPU_DEAD: tick_shutdown_broadcast_oneshot(dev); tick_shutdown_broadcast(dev); -- cgit v1.2.3 From b0a9b5111abf60ef07eade834f480e89004c7920 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 25 Jan 2009 11:31:36 +0100 Subject: hrtimer: prevent negative expiry value after clock_was_set() Impact: prevent false positive WARN_ON() in clockevents_program_event() clock_was_set() changes the base->offset of CLOCK_REALTIME and enforces the reprogramming of the clockevent device to expire timers which are based on CLOCK_REALTIME. If the clock change is large enough then the subtraction of the timer expiry value and base->offset can become negative which triggers the warning in clockevents_program_event(). Check the subtraction result and set a negative value to 0. Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 647a40e2fea..f394d2a42ca 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -501,6 +501,13 @@ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base) continue; timer = rb_entry(base->first, struct hrtimer, node); expires = ktime_sub(hrtimer_get_expires(timer), base->offset); + /* + * clock_was_set() has changed base->offset so the + * result might be negative. 
Fix it up to prevent a + * false positive in clockevents_program_event() + */ + if (expires.tv64 < 0) + expires.tv64 = 0; if (expires.tv64 < cpu_base->expires_next.tv64) cpu_base->expires_next = expires; } -- cgit v1.2.3 From d942fb6c7d391baba3dddb566eb735fbf3df8528 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 26 Jan 2009 17:56:17 +0100 Subject: sched: fix sync wakeups Pawel Dziekonski reported that the openssl benchmark and his quantum chemistry application both show slowdowns due to the scheduler under-parallelizing execution. The reason is that pipe wakeups still do 'sync' wakeups, which override the normal buddy wakeup logic - even if waker and wakee are only loosely coupled. Fix an inversion of logic in the buddy wakeup code. Reported-by: Pawel Dziekonski Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++++ kernel/sched_fair.c | 11 ++--------- 2 files changed, 6 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 52bbf1c842a..770b1f9ebe1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2266,6 +2266,10 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) if (!sched_feat(SYNC_WAKEUPS)) sync = 0; + if (!sync && (current->se.avg_overlap < sysctl_sched_migration_cost && + p->se.avg_overlap < sysctl_sched_migration_cost)) + sync = 1; + #ifdef CONFIG_SMP if (sched_feat(LB_WAKEUP_UPDATE)) { struct sched_domain *sd; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 5cc1c162044..fdc41750468 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1179,20 +1179,15 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, int idx, unsigned long load, unsigned long this_load, unsigned int imbalance) { - struct task_struct *curr = this_rq->curr; - struct task_group *tg; unsigned long tl = this_load; unsigned long tl_per_task; + struct task_group *tg; unsigned long weight; int balanced; if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) return 0; - if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost || - p->se.avg_overlap > sysctl_sched_migration_cost)) - sync = 0; - /* * If sync wakeup then subtract the (maximum possible) * effect of the currently running task from the load @@ -1419,9 +1414,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) if (!sched_feat(WAKEUP_PREEMPT)) return; - if (sched_feat(WAKEUP_OVERLAP) && (sync || - (se->avg_overlap < sysctl_sched_migration_cost && - pse->avg_overlap < sysctl_sched_migration_cost))) { + if (sched_feat(WAKEUP_OVERLAP) && sync) { resched_task(curr); return; } -- cgit v1.2.3 From 1596e29773eadd96b0a5fc6e736afa52394cafda Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 28 Jan 2009 14:51:38 +0100 Subject: sched: symmetric sync vs avg_overlap Reinstate the weakening of the sync hint if set. This yields a more symmetric usage of avg_overlap.
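The intended policy can be distilled into a single predicate (an illustrative sketch, not code from the patch; the helper name is made up, with cost standing in for sysctl_sched_migration_cost):

	/* Sketch: symmetric use of avg_overlap for the sync hint. */
	static int effective_sync(u64 waker_overlap, u64 wakee_overlap, u64 cost)
	{
		/*
		 * A !sync hint is promoted when both overlaps are short,
		 * a sync hint is demoted when either overlap is long -
		 * so as far as these branches go, the caller's hint
		 * collapses to this one test:
		 */
		return waker_overlap < cost && wakee_overlap < cost;
	}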
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 770b1f9ebe1..242d0d47a70 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2266,9 +2266,15 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) if (!sched_feat(SYNC_WAKEUPS)) sync = 0; - if (!sync && (current->se.avg_overlap < sysctl_sched_migration_cost && - p->se.avg_overlap < sysctl_sched_migration_cost)) - sync = 1; + if (!sync) { + if (current->se.avg_overlap < sysctl_sched_migration_cost && + p->se.avg_overlap < sysctl_sched_migration_cost) + sync = 1; + } else { + if (current->se.avg_overlap >= sysctl_sched_migration_cost || + p->se.avg_overlap >= sysctl_sched_migration_cost) + sync = 0; + } #ifdef CONFIG_SMP if (sched_feat(LB_WAKEUP_UPDATE)) { -- cgit v1.2.3 From a9f3e2b549f83a9cdab873abf4140be27c05a3f2 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 28 Jan 2009 14:51:39 +0100 Subject: sched: clear buddies more aggressively It was noticed that a task could get re-elected past its run quota due to buddy affinities. This could increase latency a little. Cure it by more aggressively clearing buddy state. We do so in two situations: - when we force preempt - when we select a buddy to run Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index fdc41750468..75248b9ff4c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -768,8 +768,14 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) ideal_runtime = sched_slice(cfs_rq, curr); delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; - if (delta_exec > ideal_runtime) + if (delta_exec > ideal_runtime) { resched_task(rq_of(cfs_rq)->curr); + /* + * The current task ran long enough, ensure it doesn't get + * re-elected due to buddy favours. + */ + clear_buddies(cfs_rq, curr); + } } static void @@ -1445,6 +1451,11 @@ static struct task_struct *pick_next_task_fair(struct rq *rq) do { se = pick_next_entity(cfs_rq); + /* + * If se was a buddy, clear it so that it will have to earn + * the favour again. + */ + clear_buddies(cfs_rq, se); set_next_entity(cfs_rq, se); cfs_rq = group_cfs_rq(se); } while (cfs_rq); -- cgit v1.2.3 From a571bbeafbcc501d9989fbce1cddcd810bd51d71 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 28 Jan 2009 14:51:40 +0100 Subject: sched: fix buddy group latency Similar to the previous patch, by not clearing buddies we can select entities past their run quota, which can increase latency. This means we have to clear group buddies as well. Do not use the group clear for pick_next_task(), otherwise that would become O(n^2).
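To see where the quadratic cost would come from, consider this condensed sketch of the pick loop (illustrative only): pick_next_task_fair() already descends one group level per iteration, and the hierarchical clear_buddies() introduced below itself walks the entity levels via for_each_sched_entity():

	/* Sketch: hierarchical clearing nested inside the pick loop. */
	do {					/* one iteration per hierarchy level */
		se = pick_next_entity(cfs_rq);
		clear_buddies(cfs_rq, se);	/* would itself walk up the levels */
		set_next_entity(cfs_rq, se);
		cfs_rq = group_cfs_rq(se);
	} while (cfs_rq);

Nesting one walk inside the other is what would be quadratic in the hierarchy depth, hence the patch keeps the single-level __clear_buddies() at this call site and reserves the full walk for the preemption paths.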
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 75248b9ff4c..a7e50ba185a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -719,7 +719,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) __enqueue_entity(cfs_rq, se); } -static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) +static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) { if (cfs_rq->last == se) cfs_rq->last = NULL; @@ -728,6 +728,12 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) cfs_rq->next = NULL; } +static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + for_each_sched_entity(se) + __clear_buddies(cfs_rq_of(se), se); +} + static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) { @@ -1455,7 +1461,7 @@ static struct task_struct *pick_next_task_fair(struct rq *rq) * If se was a buddy, clear it so that it will have to earn * the favour again. */ - clear_buddies(cfs_rq, se); + __clear_buddies(cfs_rq, se); set_next_entity(cfs_rq, se); cfs_rq = group_cfs_rq(se); } while (cfs_rq); -- cgit v1.2.3 From 3d398703ef06fd97b4c28c86b580546d5b57e7b7 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 31 Jan 2009 23:21:24 +1030 Subject: sched_rt: don't use first_cpu on cpumask created with cpumask_and cpumask_and() only initializes nr_cpu_ids bits, so the (deprecated) first_cpu() might find one of those uninitialized bits if nr_cpu_ids is less than NR_CPUS (as it can be for CONFIG_CPUMASK_OFFSTACK). Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 954e1a81b79..bac1061cea2 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -968,8 +968,8 @@ static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) if ((this_cpu != -1) && cpu_isset(this_cpu, *mask)) return this_cpu; - first = first_cpu(*mask); - if (first != NR_CPUS) + first = cpumask_first(mask); + if (first < nr_cpu_ids) return first; return -1; -- cgit v1.2.3 From 10b888d6cec2688e65e9e128b14bf98ecd199da2 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 31 Jan 2009 14:50:07 -0800 Subject: irq, x86: fix lock status with numa_migrate_irq_desc Eric Paris reported: > I have an hp dl785g5 which is unable to successfully run > 2.6.29-0.66.rc3.fc11.x86_64 or 2.6.29-rc2-next-20090126. During bootup > (early in userspace daemons starting) I get the below BUG, which quickly > renders the machine dead. I assume it is because sparse_irq_lock never > gets released when the BUG kills that task. Adjust lock sequence when migrating a descriptor with CONFIG_NUMA_MIGRATE_IRQ_DESC enabled. 
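The locking contract being repaired can be summarized as follows (a sketch inferred from the hunk below, not an authoritative statement of the API): __real_move_irq_desc() is entered with old_desc->lock held, takes sparse_irq_lock internally, and must return with sparse_irq_lock dropped on every path and with the lock of the descriptor it hands back held:

	/* Sketch: lock handoff around the descriptor swap. */
	irq_desc_ptrs[irq] = desc;			 /* publish the new desc */
	spin_unlock_irqrestore(&sparse_irq_lock, flags); /* drop before freeing */
	free_one_irq_desc(old_desc, desc);
	spin_unlock(&old_desc->lock);			 /* old desc unreachable now */
	kfree(old_desc);
	spin_lock(&desc->lock);				 /* caller expects this one held */
	return desc;

This ordering drops sparse_irq_lock before the free and swaps the per-descriptor locks, so the caller returns holding the lock of the descriptor it actually receives.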
Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- kernel/irq/numa_migrate.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index ecf765c6a77..acd88356ac7 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -71,7 +71,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, desc = irq_desc_ptrs[irq]; if (desc && old_desc != desc) - goto out_unlock; + goto out_unlock; node = cpu_to_node(cpu); desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); @@ -84,10 +84,15 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, init_copy_one_irq_desc(irq, old_desc, desc, cpu); irq_desc_ptrs[irq] = desc; + spin_unlock_irqrestore(&sparse_irq_lock, flags); /* free the old one */ free_one_irq_desc(old_desc, desc); + spin_unlock(&old_desc->lock); kfree(old_desc); + spin_lock(&desc->lock); + + return desc; out_unlock: spin_unlock_irqrestore(&sparse_irq_lock, flags); -- cgit v1.2.3 From 720eba31f47aeade8ec130ca7f4353223c49170f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 3 Feb 2009 13:31:36 +1030 Subject: modules: Use a better scheme for refcounting Current refcounting for modules (done if CONFIG_MODULE_UNLOAD=y) uses a lot of memory. Each 'struct module' contains an [NR_CPUS] array of full cache lines. This patch uses existing infrastructure (percpu_modalloc() & percpu_modfree()) to allocate percpu space for the refcount storage. Instead of wasting NR_CPUS*128 bytes (on i386), we now use nr_cpu_ids*sizeof(local_t) bytes. On a typical distro, where NR_CPUS=8, shipping 2000 modules, we reduce the size of module files by about 2 Mbytes (1Kb per module). Instead of having all refcounters in the same memory node - with TLB misses because of vmalloc() - this new implementation permits better NUMA properties, since each CPU will use storage on its preferred node, thanks to percpu storage. Signed-off-by: Eric Dumazet Signed-off-by: Rusty Russell Signed-off-by: Linus Torvalds --- kernel/module.c | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index e8b51d41dd7..ba22484a987 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -573,13 +573,13 @@ static char last_unloaded_module[MODULE_NAME_LEN+1]; /* Init the unload section of the module. */ static void module_unload_init(struct module *mod) { - unsigned int i; + int cpu; INIT_LIST_HEAD(&mod->modules_which_use_me); - for (i = 0; i < NR_CPUS; i++) - local_set(&mod->ref[i].count, 0); + for_each_possible_cpu(cpu) + local_set(__module_ref_addr(mod, cpu), 0); /* Hold reference count during initialization. */ - local_set(&mod->ref[raw_smp_processor_id()].count, 1); + local_set(__module_ref_addr(mod, raw_smp_processor_id()), 1); /* Backwards compatibility macros put refcount during init.
*/ mod->waiter = current; } @@ -717,10 +717,11 @@ static int try_stop_module(struct module *mod, int flags, int *forced) unsigned int module_refcount(struct module *mod) { - unsigned int i, total = 0; + unsigned int total = 0; + int cpu; - for (i = 0; i < NR_CPUS; i++) - total += local_read(&mod->ref[i].count); + for_each_possible_cpu(cpu) + total += local_read(__module_ref_addr(mod, cpu)); return total; } EXPORT_SYMBOL(module_refcount); @@ -894,7 +895,7 @@ void module_put(struct module *module) { if (module) { unsigned int cpu = get_cpu(); - local_dec(&module->ref[cpu].count); + local_dec(__module_ref_addr(module, cpu)); /* Maybe they're waiting for us to drop reference? */ if (unlikely(!module_is_live(module))) wake_up_process(module->waiter); @@ -1464,7 +1465,10 @@ static void free_module(struct module *mod) kfree(mod->args); if (mod->percpu) percpu_modfree(mod->percpu); - +#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) + if (mod->refptr) + percpu_modfree(mod->refptr); +#endif /* Free lock-classes: */ lockdep_free_key_range(mod->module_core, mod->core_size); @@ -2011,6 +2015,14 @@ static noinline struct module *load_module(void __user *umod, if (err < 0) goto free_mod; +#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) + mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t), + mod->name); + if (!mod->refptr) { + err = -ENOMEM; + goto free_mod; + } +#endif if (pcpuindex) { /* We have a special allocation for this section. */ percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size, @@ -2018,7 +2030,7 @@ static noinline struct module *load_module(void __user *umod, mod->name); if (!percpu) { err = -ENOMEM; - goto free_mod; + goto free_percpu; } sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; mod->percpu = percpu; @@ -2282,6 +2294,9 @@ static noinline struct module *load_module(void __user *umod, free_percpu: if (percpu) percpu_modfree(percpu); +#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) + percpu_modfree(mod->refptr); +#endif free_mod: kfree(args); free_hdr: -- cgit v1.2.3 From 229c4ef8ae56d69f8dec64533bf1c7f8070c1a4a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 3 Feb 2009 20:39:04 +0100 Subject: ftrace: do_each_pid_task() needs rcu lock "ftrace: use struct pid" commit 978f3a45d9499c7a447ca7615455cefb63d44165 converted ftrace_pid_trace to "struct pid*". But we can't use do_each_pid_task() without rcu_read_lock() even if we know the pid itself can't go away (it was pinned in ftrace_pid_write). The exiting task can detach itself from this pid at any moment. 
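The resulting usage pattern is worth spelling out (a minimal sketch of the rule the patch enforces; it assumes only that the caller holds a reference on pid):

	/* Sketch: walking the tasks of a struct pid requires RCU, even with a ref. */
	rcu_read_lock();
	do_each_pid_task(pid, PIDTYPE_PID, p) {
		/* per-task work, e.g. setting or clearing a trace flag */
	} while_each_pid_task(pid, PIDTYPE_PID, p);
	rcu_read_unlock();

The pid reference pins struct pid itself, but the task list hanging off it is only stable inside the RCU read-side critical section.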
Signed-off-by: Oleg Nesterov Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7dcf6e9f2b0..9a236ffe2aa 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1737,9 +1737,12 @@ static void clear_ftrace_pid(struct pid *pid) { struct task_struct *p; + rcu_read_lock(); do_each_pid_task(pid, PIDTYPE_PID, p) { clear_tsk_trace_trace(p); } while_each_pid_task(pid, PIDTYPE_PID, p); + rcu_read_unlock(); + put_pid(pid); } @@ -1747,9 +1750,11 @@ static void set_ftrace_pid(struct pid *pid) { struct task_struct *p; + rcu_read_lock(); do_each_pid_task(pid, PIDTYPE_PID, p) { set_tsk_trace_trace(p); } while_each_pid_task(pid, PIDTYPE_PID, p); + rcu_read_unlock(); } static void clear_ftrace_pid_task(struct pid **pid) -- cgit v1.2.3 From 58763a297405024d23d8f1d0bba3e6603660c4b6 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 4 Feb 2009 15:11:58 -0800 Subject: kernel/async.c: fix printk warnings alpha: kernel/async.c: In function 'run_one_entry': kernel/async.c:141: warning: format '%lli' expects type 'long long int', but argument 2 has type 'async_cookie_t' kernel/async.c:149: warning: format '%lli' expects type 'long long int', but argument 2 has type 'async_cookie_t' kernel/async.c:149: warning: format '%lld' expects type 'long long int', but argument 4 has type 's64' kernel/async.c: In function 'async_synchronize_cookie_special': kernel/async.c:250: warning: format '%lli' expects type 'long long int', but argument 3 has type 's64' Cc: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/async.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/async.c b/kernel/async.c index 608b32b4281..67a2be71f51 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -138,15 +138,18 @@ static void run_one_entry(void) /* 3) run it (and print duration)*/ if (initcall_debug && system_state == SYSTEM_BOOTING) { - printk("calling %lli_%pF @ %i\n", entry->cookie, entry->func, task_pid_nr(current)); + printk("calling %lli_%pF @ %i\n", (long long)entry->cookie, + entry->func, task_pid_nr(current)); calltime = ktime_get(); } entry->func(entry->data, entry->cookie); if (initcall_debug && system_state == SYSTEM_BOOTING) { rettime = ktime_get(); delta = ktime_sub(rettime, calltime); - printk("initcall %lli_%pF returned 0 after %lld usecs\n", entry->cookie, - entry->func, ktime_to_ns(delta) >> 10); + printk("initcall %lli_%pF returned 0 after %lld usecs\n", + (long long)entry->cookie, + entry->func, + (long long)ktime_to_ns(delta) >> 10); } /* 4) remove it from the running queue */ @@ -247,7 +250,8 @@ void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *r delta = ktime_sub(endtime, starttime); printk("async_continuing @ %i after %lli usec\n", - task_pid_nr(current), ktime_to_ns(delta) >> 10); + task_pid_nr(current), + (long long)ktime_to_ns(delta) >> 10); } } EXPORT_SYMBOL_GPL(async_synchronize_cookie_special); -- cgit v1.2.3 From 60fd760fb9ff7034360bab7137c917c0330628c2 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 4 Feb 2009 15:12:06 -0800 Subject: revert "rlimit: permit setting RLIMIT_NOFILE to RLIM_INFINITY" Revert commit 0c2d64fb6cae9aae480f6a46cfe79f8d7d48b59f because it causes (arguably poorly designed) existing userspace to spend interminable periods closing billions of not-open file descriptors. 
We could bring this back, with some sort of opt-in tunable in /proc, which defaults to "off". Peter's analysis follows: : I spent several hours trying to get to the bottom of a serious : performance issue that appeared on one of our servers after upgrading to : 2.6.28. In the end it's what could be considered a userspace bug that : was triggered by a change in 2.6.28. Since this might also affect other : people I figured I'd at least document what I found here, and maybe we : can even do something about it: : : : So, I upgraded some of debian.org's machines to 2.6.28.1 and immediately : the team maintaining our ftp archive complained that one of their : scripts that previously ran in a few minutes still hadn't even come : close to being done after an hour or so. Downgrading to 2.6.27 fixed : that. : : Turns out that script is forking a lot and something in it or python or : whereever closes all the file descriptors it doesn't want to pass on. : That is, it starts at zero and goes up to ulimit -n/RLIMIT_NOFILE and : closes them all with a few exceptions. : : Turns out that takes a long time when your limit -n is now 2^20 (1048576). : : With 2.6.27.* the ulimit -n was the standard 1024, but with 2.6.28 it is : now a thousand times that. : : 2.6.28 included a patch titled "rlimit: permit setting RLIMIT_NOFILE to : RLIM_INFINITY" (0c2d64fb6cae9aae480f6a46cfe79f8d7d48b59f)[1] that : allows, as the title implies, to set the limit for number of files to : infinity. : : Closer investigation showed that the broken default ulimit did not apply : to "system" processes (like stuff started from init). In the end I : could establish that all processes that passed through pam_limit at one : point had the bad resource limit. : : Apparently the pam library in Debian etch (4.0) initializes the limits : to some default values when it doesn't have any settings in limit.conf : to override them. Turns out that for nofiles this is RLIM_INFINITY. : Commenting out "case RLIMIT_NOFILE" in pam_limit.c:267 of our pam : package version 0.79-5 fixes that - tho I'm not sure what side effects : that has. : : Debian lenny (the upcoming 5.0 version) doesn't have this issue as it : uses a different pam (version).
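For reference, the userspace idiom that regressed looks roughly like this (an assumption about what the affected scripts do, not their actual code); its cost is linear in the RLIMIT_NOFILE ceiling, so raising the default from 1024 to 2^20 makes every fork/exec cycle about a thousand times more expensive:

	/* Sketch: close-all-fds-before-exec, linear in rlim_cur. */
	#include <sys/resource.h>
	#include <unistd.h>

	static void close_all_fds(void)
	{
		struct rlimit rl;
		long fd;

		if (getrlimit(RLIMIT_NOFILE, &rl) != 0)
			return;
		for (fd = 3; fd < (long)rl.rlim_cur; fd++)
			close(fd);	/* almost every call just returns EBADF */
	}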
Reported-by: Peter Palfrader Cc: Adam Tkac Cc: Michael Kerrisk Cc: [2.6.28.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index e7dc0e10a48..f145c415bc1 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1525,22 +1525,14 @@ SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) return -EINVAL; if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) return -EFAULT; + if (new_rlim.rlim_cur > new_rlim.rlim_max) + return -EINVAL; old_rlim = current->signal->rlim + resource; if ((new_rlim.rlim_max > old_rlim->rlim_max) && !capable(CAP_SYS_RESOURCE)) return -EPERM; - - if (resource == RLIMIT_NOFILE) { - if (new_rlim.rlim_max == RLIM_INFINITY) - new_rlim.rlim_max = sysctl_nr_open; - if (new_rlim.rlim_cur == RLIM_INFINITY) - new_rlim.rlim_cur = sysctl_nr_open; - if (new_rlim.rlim_max > sysctl_nr_open) - return -EPERM; - } - - if (new_rlim.rlim_cur > new_rlim.rlim_max) - return -EINVAL; + if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open) + return -EPERM; retval = security_task_setrlimit(resource, &new_rlim); if (retval) -- cgit v1.2.3 From 777c6c5f1f6e757ae49ecca2ed72d6b1f523c007 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 4 Feb 2009 15:12:14 -0800 Subject: wait: prevent exclusive waiter starvation With exclusive waiters, every process woken up through the wait queue must ensure that the next waiter down the line is woken when it has finished. Interruptible waiters don't do that when aborting due to a signal. And if an aborting waiter is concurrently woken up through the waitqueue, no one will ever wake up the next waiter. This has been observed with __wait_on_bit_lock() used by lock_page_killable(): the first contender on the queue was aborting when the actual lock holder woke it up concurrently. The aborted contender didn't acquire the lock and therefore never did an unlock followed by waking up the next waiter. Add abort_exclusive_wait() which removes the process' wait descriptor from the waitqueue, iff still queued, or wakes up the next waiter otherwise. It does so under the waitqueue lock. Racing with a wake up means the aborting process is either already woken (removed from the queue) and will wake up the next waiter, or it will remove itself from the queue and the concurrent wake up will apply to the next waiter after it. Use abort_exclusive_wait() in __wait_event_interruptible_exclusive() and __wait_on_bit_lock() when they were interrupted by other means than a wake up through the queue. [akpm@linux-foundation.org: coding-style fixes] Reported-by: Chris Mason Signed-off-by: Johannes Weiner Mentored-by: Oleg Nesterov Cc: Peter Zijlstra Cc: Matthew Wilcox Cc: Chuck Lever Cc: Nick Piggin Cc: Ingo Molnar Cc: ["after some testing"] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 4 ++-- kernel/wait.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 54 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 242d0d47a70..8ee437a5ec1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4697,8 +4697,8 @@ EXPORT_SYMBOL(default_wake_function); * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns * zero in this (rare) case, and we handle it by continuing to scan the queue.
*/ -static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, int sync, void *key) +void __wake_up_common(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, int sync, void *key) { wait_queue_t *curr, *next; diff --git a/kernel/wait.c b/kernel/wait.c index cd87131f2fc..42a2dbc181c 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -91,6 +91,15 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) } EXPORT_SYMBOL(prepare_to_wait_exclusive); +/* + * finish_wait - clean up after waiting in a queue + * @q: waitqueue waited on + * @wait: wait descriptor + * + * Sets current thread back to running state and removes + * the wait descriptor from the given waitqueue if still + * queued. + */ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) { unsigned long flags; @@ -117,6 +126,39 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) } EXPORT_SYMBOL(finish_wait); +/* + * abort_exclusive_wait - abort exclusive waiting in a queue + * @q: waitqueue waited on + * @wait: wait descriptor + * @mode: runstate of the waiter to be woken + * @key: key to identify a wait bit queue or %NULL + * + * Sets current thread back to running state and removes + * the wait descriptor from the given waitqueue if still + * queued. + * + * Wakes up the next waiter if the caller is concurrently + * woken up through the queue. + * + * This prevents waiter starvation where an exclusive waiter + * aborts and is woken up concurrently and no one wakes up + * the next waiter. + */ +void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, + unsigned int mode, void *key) +{ + unsigned long flags; + + __set_current_state(TASK_RUNNING); + spin_lock_irqsave(&q->lock, flags); + if (!list_empty(&wait->task_list)) + list_del_init(&wait->task_list); + else if (waitqueue_active(q)) + __wake_up_common(q, mode, 1, 0, key); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(abort_exclusive_wait); + int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) { int ret = default_wake_function(wait, mode, sync, key); @@ -177,17 +219,20 @@ int __sched __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, int (*action)(void *), unsigned mode) { - int ret = 0; - do { + int ret; + prepare_to_wait_exclusive(wq, &q->wait, mode); - if (test_bit(q->key.bit_nr, q->key.flags)) { - if ((ret = (*action)(q->key.flags))) - break; - } + if (!test_bit(q->key.bit_nr, q->key.flags)) + continue; + ret = action(q->key.flags); + if (!ret) + continue; + abort_exclusive_wait(wq, &q->wait, mode, &q->key); + return ret; } while (test_and_set_bit(q->key.bit_nr, q->key.flags)); finish_wait(wq, &q->wait); - return ret; + return 0; } EXPORT_SYMBOL(__wait_on_bit_lock); -- cgit v1.2.3
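As a closing illustration, the starvation scenario that abort_exclusive_wait() closes can be sketched like this (an illustrative timeline and caller pattern, not kernel code; the -EINTR path is a generic stand-in for the interruptible callers named above):

	/*
	 * Pre-patch lost wakeup with an aborting exclusive waiter:
	 *
	 *   waker                          waiter A (exclusive)
	 *   -----                          --------------------
	 *   __wake_up_common() wakes A,    signal arrives,
	 *   stops after one exclusive      finish_wait() and return -EINTR
	 *
	 * The single wakeup was spent on A, so exclusive waiter B sleeps
	 * forever. Post-patch, an interruptible exclusive waiter aborts via:
	 */
	if (signal_pending(current)) {
		abort_exclusive_wait(wq, &wait, TASK_INTERRUPTIBLE, NULL);
		return -EINTR;	/* the spent wakeup is passed on to B if needed */
	}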