From c98aa86df3169e5d6275305376043134caa69831 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 12 Feb 2008 13:52:37 -0800 Subject: timeconst.pl: correct reversal of USEC_TO_HZ and HZ_TO_USEC The USEC_TO_HZ and HZ_TO_USEC constant sets were mislabelled, with seriously incorrect results. This among other things manifested itself as cpufreq not working when a tickless kernel was configured. Signed-off-by: H. Peter Anvin Tested-by: Carlos R. Mafra Signed-off-by: Linus Torvalds --- kernel/timeconst.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/timeconst.pl b/kernel/timeconst.pl index 62b1287932e..41468035473 100644 --- a/kernel/timeconst.pl +++ b/kernel/timeconst.pl @@ -339,7 +339,7 @@ sub output($@) print "\n"; foreach $pfx ('HZ_TO_MSEC','MSEC_TO_HZ', - 'USEC_TO_HZ','HZ_TO_USEC') { + 'HZ_TO_USEC','USEC_TO_HZ') { foreach $bit (32, 64) { foreach $suf ('MUL', 'ADJ', 'SHR') { printf "#define %-23s %s\n", -- cgit v1.2.3 From 720a2592cf1b9af86f30c44e8d89348826c03372 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 Feb 2008 15:45:36 +0100 Subject: hrtimer: more hrtimer_init_sleeper() fallout. Missed an instance... futex_lock_pi() hrtimer_init_sleeper() rt_mutex_timed_lock() rt_mutex_timed_fastlock() rt_mutex_slowlock() hrtimer_start() Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/rtmutex.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 0deef71ff8d..6522ae5b14a 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -630,9 +630,12 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, set_current_state(state); /* Setup the timer, when timeout != NULL */ - if (unlikely(timeout)) + if (unlikely(timeout)) { hrtimer_start(&timeout->timer, timeout->timer.expires, HRTIMER_MODE_ABS); + if (!hrtimer_active(&timeout->timer)) + timeout->task = NULL; + } for (;;) { /* Try to acquire the lock: */ -- cgit v1.2.3 From 8ed3699682be75fd68281239c202ad3830f9c72d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 Feb 2008 15:45:39 +0100 Subject: sched: fair-group: separate tg->shares from task_group_lock On Mon, 2008-02-11 at 15:09 +0300, Denis V. Lunev wrote: > BUG: sleeping function called from invalid context > at /home/den/src/linux-netns26/kernel/mutex.c:209 > in_atomic():1, irqs_disabled():0 > no locks held by swapper/0. > Pid: 0, comm: swapper Not tainted 2.6.24 #304 > > Call Trace: > [] ? __debug_show_held_locks+0x15/0x27 > [] __might_sleep+0xc0/0xdf > [] mutex_lock_nested+0x28/0x2a9 > [] sched_destroy_group+0x18/0xea > [] sched_destroy_user+0xd/0xf > [] free_uid+0x8a/0xab > [] __put_task_struct+0x3f/0xd3 > [] delayed_put_task_struct+0x23/0x25 > [] __rcu_process_callbacks+0x8d/0x215 > [] rcu_process_callbacks+0x23/0x44 > [] __do_softirq+0x79/0xf8 > [] ? profile_pc+0x2a/0x67 > [] call_softirq+0x1c/0x30 > [] do_softirq+0x61/0x9c > [] irq_exit+0x51/0x53 > [] smp_apic_timer_interrupt+0x77/0xad > [] apic_timer_interrupt+0x6b/0x70 > [] ? default_idle+0x43/0x76 > [] ? default_idle+0x41/0x76 > [] ? default_idle+0x0/0x76 > [] ? cpu_idle+0x76/0x98 separate the tg->shares protection from the task_group lock. Reported-by: Denis V. Lunev Tested-by: Denis V. 
Lunev Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 3eedd526090..6b02276baaa 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -232,10 +232,10 @@ static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS]; static struct rt_rq *init_rt_rq_p[NR_CPUS]; -/* task_group_mutex serializes add/remove of task groups and also changes to +/* task_group_lock serializes add/remove of task groups and also changes to * a task group's cpu shares. */ -static DEFINE_MUTEX(task_group_mutex); +static DEFINE_SPINLOCK(task_group_lock); /* doms_cur_mutex serializes access to doms_cur[] array */ static DEFINE_MUTEX(doms_cur_mutex); @@ -295,16 +295,6 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) p->rt.parent = task_group(p)->rt_se[cpu]; } -static inline void lock_task_group_list(void) -{ - mutex_lock(&task_group_mutex); -} - -static inline void unlock_task_group_list(void) -{ - mutex_unlock(&task_group_mutex); -} - static inline void lock_doms_cur(void) { mutex_lock(&doms_cur_mutex); @@ -318,8 +308,6 @@ static inline void unlock_doms_cur(void) #else static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } -static inline void lock_task_group_list(void) { } -static inline void unlock_task_group_list(void) { } static inline void lock_doms_cur(void) { } static inline void unlock_doms_cur(void) { } @@ -7571,6 +7559,7 @@ struct task_group *sched_create_group(void) struct rt_rq *rt_rq; struct sched_rt_entity *rt_se; struct rq *rq; + unsigned long flags; int i; tg = kzalloc(sizeof(*tg), GFP_KERNEL); @@ -7620,7 +7609,7 @@ struct task_group *sched_create_group(void) init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0); } - lock_task_group_list(); + spin_lock_irqsave(&task_group_lock, flags); for_each_possible_cpu(i) { rq = cpu_rq(i); cfs_rq = tg->cfs_rq[i]; @@ -7629,7 +7618,7 @@ struct task_group *sched_create_group(void) list_add_rcu(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); } list_add_rcu(&tg->list, &task_groups); - unlock_task_group_list(); + spin_unlock_irqrestore(&task_group_lock, flags); return tg; @@ -7650,9 +7639,10 @@ void sched_destroy_group(struct task_group *tg) { struct cfs_rq *cfs_rq = NULL; struct rt_rq *rt_rq = NULL; + unsigned long flags; int i; - lock_task_group_list(); + spin_lock_irqsave(&task_group_lock, flags); for_each_possible_cpu(i) { cfs_rq = tg->cfs_rq[i]; list_del_rcu(&cfs_rq->leaf_cfs_rq_list); @@ -7660,7 +7650,7 @@ void sched_destroy_group(struct task_group *tg) list_del_rcu(&rt_rq->leaf_rt_rq_list); } list_del_rcu(&tg->list); - unlock_task_group_list(); + spin_unlock_irqrestore(&task_group_lock, flags); BUG_ON(!cfs_rq); @@ -7728,13 +7718,16 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares) } } +static DEFINE_MUTEX(shares_mutex); + int sched_group_set_shares(struct task_group *tg, unsigned long shares) { int i; struct cfs_rq *cfs_rq; struct rq *rq; + unsigned long flags; - lock_task_group_list(); + mutex_lock(&shares_mutex); if (tg->shares == shares) goto done; @@ -7746,10 +7739,12 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) * load_balance_fair) from referring to this group first, * by taking it off the rq->leaf_cfs_rq_list on each cpu. 
*/ + spin_lock_irqsave(&task_group_lock, flags); for_each_possible_cpu(i) { cfs_rq = tg->cfs_rq[i]; list_del_rcu(&cfs_rq->leaf_cfs_rq_list); } + spin_unlock_irqrestore(&task_group_lock, flags); /* wait for any ongoing reference to this group to finish */ synchronize_sched(); @@ -7769,13 +7764,15 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) * Enable load balance activity on this group, by inserting it back on * each cpu's rq->leaf_cfs_rq_list. */ + spin_lock_irqsave(&task_group_lock, flags); for_each_possible_cpu(i) { rq = cpu_rq(i); cfs_rq = tg->cfs_rq[i]; list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); } + spin_unlock_irqrestore(&task_group_lock, flags); done: - unlock_task_group_list(); + mutex_unlock(&shares_mutex); return 0; } -- cgit v1.2.3 From 4cf5d77a6eefaa7a464bc34e8cb767356f10fd74 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 Feb 2008 15:45:39 +0100 Subject: sched: fix incorrect irq lock usage in normalize_rt_tasks() lockdep spotted this bogus irq locking. normalize_rt_tasks() can be called from hardirq context through sysrq-n Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6b02276baaa..88a17c7128c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7291,7 +7291,7 @@ void normalize_rt_tasks(void) unsigned long flags; struct rq *rq; - read_lock_irq(&tasklist_lock); + read_lock_irqsave(&tasklist_lock, flags); do_each_thread(g, p) { /* * Only normalize user tasks: @@ -7317,16 +7317,16 @@ void normalize_rt_tasks(void) continue; } - spin_lock_irqsave(&p->pi_lock, flags); + spin_lock(&p->pi_lock); rq = __task_rq_lock(p); normalize_task(rq, p); __task_rq_unlock(rq); - spin_unlock_irqrestore(&p->pi_lock, flags); + spin_unlock(&p->pi_lock); } while_each_thread(g, p); - read_unlock_irq(&tasklist_lock); + read_unlock_irqrestore(&tasklist_lock, flags); } #endif /* CONFIG_MAGIC_SYSRQ */ -- cgit v1.2.3 From 23b0fdfc9299b137bd126e9dc22f62a59dae546d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 Feb 2008 15:45:39 +0100 Subject: sched: rt-group: deal with PI Steven mentioned the fun case where a lock holding task will be throttled. Simple fix: allow groups that have boosted tasks to run anyway. If a runnable task in a throttled group gets boosted the dequeue/enqueue done by rt_mutex_setprio() is enough to unthrottle the group. This is ofcourse not quite correct. 
Two possible ways forward are: - second prio array for boosted tasks - boost to a prio ceiling (this would also work for deadline scheduling) Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 +++ kernel/sched_rt.c | 43 ++++++++++++++++++++++++++++++++++++++----- 2 files changed, 41 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 88a17c7128c..cecaea67ae9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -362,6 +362,8 @@ struct rt_rq { u64 rt_time; #ifdef CONFIG_FAIR_GROUP_SCHED + unsigned long rt_nr_boosted; + struct rq *rq; struct list_head leaf_rt_rq_list; struct task_group *tg; @@ -7112,6 +7114,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) rt_rq->rt_throttled = 0; #ifdef CONFIG_FAIR_GROUP_SCHED + rt_rq->rt_nr_boosted = 0; rt_rq->rq = rq; #endif } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 274b40d7bef..8d426938123 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -110,6 +110,23 @@ static void sched_rt_ratio_dequeue(struct rt_rq *rt_rq) dequeue_rt_entity(rt_se); } +static inline int rt_rq_throttled(struct rt_rq *rt_rq) +{ + return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; +} + +static int rt_se_boosted(struct sched_rt_entity *rt_se) +{ + struct rt_rq *rt_rq = group_rt_rq(rt_se); + struct task_struct *p; + + if (rt_rq) + return !!rt_rq->rt_nr_boosted; + + p = rt_task_of(rt_se); + return p->prio != p->normal_prio; +} + #else static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq) @@ -149,6 +166,10 @@ static inline void sched_rt_ratio_dequeue(struct rt_rq *rt_rq) { } +static inline int rt_rq_throttled(struct rt_rq *rt_rq) +{ + return rt_rq->rt_throttled; +} #endif static inline int rt_se_prio(struct sched_rt_entity *rt_se) @@ -172,7 +193,7 @@ static int sched_rt_ratio_exceeded(struct rt_rq *rt_rq) return 0; if (rt_rq->rt_throttled) - return 1; + return rt_rq_throttled(rt_rq); period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC; ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT; @@ -183,8 +204,10 @@ static int sched_rt_ratio_exceeded(struct rt_rq *rt_rq) rq->rt_throttled = 1; rt_rq->rt_throttled = 1; - sched_rt_ratio_dequeue(rt_rq); - return 1; + if (rt_rq_throttled(rt_rq)) { + sched_rt_ratio_dequeue(rt_rq); + return 1; + } } return 0; @@ -265,6 +288,10 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) update_rt_migration(rq_of_rt_rq(rt_rq)); #endif +#ifdef CONFIG_FAIR_GROUP_SCHED + if (rt_se_boosted(rt_se)) + rt_rq->rt_nr_boosted++; +#endif } static inline @@ -295,6 +322,12 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) update_rt_migration(rq_of_rt_rq(rt_rq)); #endif /* CONFIG_SMP */ +#ifdef CONFIG_FAIR_GROUP_SCHED + if (rt_se_boosted(rt_se)) + rt_rq->rt_nr_boosted--; + + WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted); +#endif } static void enqueue_rt_entity(struct sched_rt_entity *rt_se) @@ -303,7 +336,7 @@ static void enqueue_rt_entity(struct sched_rt_entity *rt_se) struct rt_prio_array *array = &rt_rq->active; struct rt_rq *group_rq = group_rt_rq(rt_se); - if (group_rq && group_rq->rt_throttled) + if (group_rq && rt_rq_throttled(group_rq)) return; list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); @@ -496,7 +529,7 @@ static struct task_struct *pick_next_task_rt(struct rq *rq) if (unlikely(!rt_rq->rt_nr_running)) return NULL; - if (sched_rt_ratio_exceeded(rt_rq)) + if (rt_rq_throttled(rt_rq)) return NULL; do { -- cgit v1.2.3 From 9f0c1e560c43327b70998e6c702b2f01321130d9 Mon Sep 17 
00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 Feb 2008 15:45:39 +0100 Subject: sched: rt-group: interface Change the rt_ratio interface to rt_runtime_us, to match rt_period_us. This avoids picking a granularity for the ratio. Extend the /sys/kernel/uids// interface to allow setting the group's rt_runtime. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 141 ++++++++++++++++++++++++++++++++++++++++++------------ kernel/sched_rt.c | 53 +++++++++----------- kernel/sysctl.c | 32 ++++++------- kernel/user.c | 28 +++++++++++ 4 files changed, 178 insertions(+), 76 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index cecaea67ae9..85a5fbff2b0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -176,7 +176,7 @@ struct task_group { struct sched_rt_entity **rt_se; struct rt_rq **rt_rq; - unsigned int rt_ratio; + u64 rt_runtime; /* * shares assigned to a task group governs how much of cpu bandwidth @@ -642,19 +642,21 @@ const_debug unsigned int sysctl_sched_features = const_debug unsigned int sysctl_sched_nr_migrate = 32; /* - * period over which we measure -rt task cpu usage in ms. + * period over which we measure -rt task cpu usage in us. * default: 1s */ -const_debug unsigned int sysctl_sched_rt_period = 1000; +unsigned int sysctl_sched_rt_period = 1000000; -#define SCHED_RT_FRAC_SHIFT 16 -#define SCHED_RT_FRAC (1UL << SCHED_RT_FRAC_SHIFT) +/* + * part of the period that we allow rt tasks to run in us. + * default: 0.95s + */ +int sysctl_sched_rt_runtime = 950000; /* - * ratio of time -rt tasks may consume. - * default: 95% + * single value that denotes runtime == period, ie unlimited time. */ -const_debug unsigned int sysctl_sched_rt_ratio = 62259; +#define RUNTIME_INF ((u64)~0ULL) /* * For kernel-internal use: high-speed (but slightly incorrect) per-cpu @@ -7187,7 +7189,8 @@ void __init sched_init(void) &per_cpu(init_cfs_rq, i), &per_cpu(init_sched_entity, i), i, 1); - init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */ + init_task_group.rt_runtime = + sysctl_sched_rt_runtime * NSEC_PER_USEC; INIT_LIST_HEAD(&rq->leaf_rt_rq_list); init_tg_rt_entry(rq, &init_task_group, &per_cpu(init_rt_rq, i), @@ -7583,7 +7586,7 @@ struct task_group *sched_create_group(void) goto err; tg->shares = NICE_0_LOAD; - tg->rt_ratio = 0; /* XXX */ + tg->rt_runtime = 0; for_each_possible_cpu(i) { rq = cpu_rq(i); @@ -7785,30 +7788,76 @@ unsigned long sched_group_shares(struct task_group *tg) } /* - * Ensure the total rt_ratio <= sysctl_sched_rt_ratio + * Ensure that the real time constraints are schedulable. */ -int sched_group_set_rt_ratio(struct task_group *tg, unsigned long rt_ratio) +static DEFINE_MUTEX(rt_constraints_mutex); + +static unsigned long to_ratio(u64 period, u64 runtime) +{ + if (runtime == RUNTIME_INF) + return 1ULL << 16; + + runtime *= (1ULL << 16); + div64_64(runtime, period); + return runtime; +} + +static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) { struct task_group *tgi; unsigned long total = 0; + unsigned long global_ratio = + to_ratio(sysctl_sched_rt_period, + sysctl_sched_rt_runtime < 0 ? 
+ RUNTIME_INF : sysctl_sched_rt_runtime); rcu_read_lock(); - list_for_each_entry_rcu(tgi, &task_groups, list) - total += tgi->rt_ratio; - rcu_read_unlock(); + list_for_each_entry_rcu(tgi, &task_groups, list) { + if (tgi == tg) + continue; - if (total + rt_ratio - tg->rt_ratio > sysctl_sched_rt_ratio) - return -EINVAL; + total += to_ratio(period, tgi->rt_runtime); + } + rcu_read_unlock(); - tg->rt_ratio = rt_ratio; - return 0; + return total + to_ratio(period, runtime) < global_ratio; } -unsigned long sched_group_rt_ratio(struct task_group *tg) +int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) { - return tg->rt_ratio; + u64 rt_runtime, rt_period; + int err = 0; + + rt_period = sysctl_sched_rt_period * NSEC_PER_USEC; + rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; + if (rt_runtime_us == -1) + rt_runtime = rt_period; + + mutex_lock(&rt_constraints_mutex); + if (!__rt_schedulable(tg, rt_period, rt_runtime)) { + err = -EINVAL; + goto unlock; + } + if (rt_runtime_us == -1) + rt_runtime = RUNTIME_INF; + tg->rt_runtime = rt_runtime; + unlock: + mutex_unlock(&rt_constraints_mutex); + + return err; } +long sched_group_rt_runtime(struct task_group *tg) +{ + u64 rt_runtime_us; + + if (tg->rt_runtime == RUNTIME_INF) + return -1; + + rt_runtime_us = tg->rt_runtime; + do_div(rt_runtime_us, NSEC_PER_USEC); + return rt_runtime_us; +} #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_FAIR_CGROUP_SCHED @@ -7884,17 +7933,49 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft) return (u64) tg->shares; } -static int cpu_rt_ratio_write_uint(struct cgroup *cgrp, struct cftype *cftype, - u64 rt_ratio_val) +static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, + struct file *file, + const char __user *userbuf, + size_t nbytes, loff_t *unused_ppos) { - return sched_group_set_rt_ratio(cgroup_tg(cgrp), rt_ratio_val); + char buffer[64]; + int retval = 0; + s64 val; + char *end; + + if (!nbytes) + return -EINVAL; + if (nbytes >= sizeof(buffer)) + return -E2BIG; + if (copy_from_user(buffer, userbuf, nbytes)) + return -EFAULT; + + buffer[nbytes] = 0; /* nul-terminate */ + + /* strip newline if necessary */ + if (nbytes && (buffer[nbytes-1] == '\n')) + buffer[nbytes-1] = 0; + val = simple_strtoll(buffer, &end, 0); + if (*end) + return -EINVAL; + + /* Pass to subsystem */ + retval = sched_group_set_rt_runtime(cgroup_tg(cgrp), val); + if (!retval) + retval = nbytes; + return retval; } -static u64 cpu_rt_ratio_read_uint(struct cgroup *cgrp, struct cftype *cft) +static ssize_t cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft, + struct file *file, + char __user *buf, size_t nbytes, + loff_t *ppos) { - struct task_group *tg = cgroup_tg(cgrp); + char tmp[64]; + long val = sched_group_rt_runtime(cgroup_tg(cgrp)); + int len = sprintf(tmp, "%ld\n", val); - return (u64) tg->rt_ratio; + return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); } static struct cftype cpu_files[] = { @@ -7904,9 +7985,9 @@ static struct cftype cpu_files[] = { .write_uint = cpu_shares_write_uint, }, { - .name = "rt_ratio", - .read_uint = cpu_rt_ratio_read_uint, - .write_uint = cpu_rt_ratio_write_uint, + .name = "rt_runtime_us", + .read = cpu_rt_runtime_read, + .write = cpu_rt_runtime_write, }, }; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 8d426938123..35825b28e42 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -57,12 +57,12 @@ static inline int on_rt_rq(struct sched_rt_entity *rt_se) #ifdef CONFIG_FAIR_GROUP_SCHED -static inline unsigned int 
sched_rt_ratio(struct rt_rq *rt_rq) +static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) { if (!rt_rq->tg) - return SCHED_RT_FRAC; + return RUNTIME_INF; - return rt_rq->tg->rt_ratio; + return rt_rq->tg->rt_runtime; } #define for_each_leaf_rt_rq(rt_rq, rq) \ @@ -89,7 +89,7 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) static void enqueue_rt_entity(struct sched_rt_entity *rt_se); static void dequeue_rt_entity(struct sched_rt_entity *rt_se); -static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq) +static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { struct sched_rt_entity *rt_se = rt_rq->rt_se; @@ -102,7 +102,7 @@ static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq) } } -static void sched_rt_ratio_dequeue(struct rt_rq *rt_rq) +static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) { struct sched_rt_entity *rt_se = rt_rq->rt_se; @@ -129,9 +129,12 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se) #else -static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq) +static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) { - return sysctl_sched_rt_ratio; + if (sysctl_sched_rt_runtime == -1) + return RUNTIME_INF; + + return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; } #define for_each_leaf_rt_rq(rt_rq, rq) \ @@ -158,11 +161,11 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) return NULL; } -static inline void sched_rt_ratio_enqueue(struct rt_rq *rt_rq) +static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { } -static inline void sched_rt_ratio_dequeue(struct rt_rq *rt_rq) +static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) { } @@ -184,28 +187,24 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se) return rt_task_of(rt_se)->prio; } -static int sched_rt_ratio_exceeded(struct rt_rq *rt_rq) +static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) { - unsigned int rt_ratio = sched_rt_ratio(rt_rq); - u64 period, ratio; + u64 runtime = sched_rt_runtime(rt_rq); - if (rt_ratio == SCHED_RT_FRAC) + if (runtime == RUNTIME_INF) return 0; if (rt_rq->rt_throttled) return rt_rq_throttled(rt_rq); - period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC; - ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT; - - if (rt_rq->rt_time > ratio) { + if (rt_rq->rt_time > runtime) { struct rq *rq = rq_of_rt_rq(rt_rq); rq->rt_throttled = 1; rt_rq->rt_throttled = 1; if (rt_rq_throttled(rt_rq)) { - sched_rt_ratio_dequeue(rt_rq); + sched_rt_rq_dequeue(rt_rq); return 1; } } @@ -219,17 +218,16 @@ static void update_sched_rt_period(struct rq *rq) u64 period; while (rq->clock > rq->rt_period_expire) { - period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC; + period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC; rq->rt_period_expire += period; for_each_leaf_rt_rq(rt_rq, rq) { - unsigned long rt_ratio = sched_rt_ratio(rt_rq); - u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT; + u64 runtime = sched_rt_runtime(rt_rq); - rt_rq->rt_time -= min(rt_rq->rt_time, ratio); - if (rt_rq->rt_throttled) { + rt_rq->rt_time -= min(rt_rq->rt_time, runtime); + if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { rt_rq->rt_throttled = 0; - sched_rt_ratio_enqueue(rt_rq); + sched_rt_rq_enqueue(rt_rq); } } @@ -262,12 +260,7 @@ static void update_curr_rt(struct rq *rq) cpuacct_charge(curr, delta_exec); rt_rq->rt_time += delta_exec; - /* - * might make it a tad more accurate: - * - * update_sched_rt_period(rq); - */ - if (sched_rt_ratio_exceeded(rt_rq)) + if (sched_rt_runtime_exceeded(rt_rq)) resched_task(curr); } diff --git a/kernel/sysctl.c 
b/kernel/sysctl.c index d41ef6b4cf7..924c674b76e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -311,22 +311,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_rt_period_ms", - .data = &sysctl_sched_rt_period, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_rt_ratio", - .data = &sysctl_sched_rt_ratio, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) { .ctl_name = CTL_UNNUMBERED, @@ -346,6 +330,22 @@ static struct ctl_table kern_table[] = { }, #endif #endif + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_rt_period_us", + .data = &sysctl_sched_rt_period, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_rt_runtime_us", + .data = &sysctl_sched_rt_runtime, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = CTL_UNNUMBERED, .procname = "sched_compat_yield", diff --git a/kernel/user.c b/kernel/user.c index 7d7900c5a1f..9f6d471bfd0 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -164,9 +164,37 @@ static ssize_t cpu_shares_store(struct kobject *kobj, static struct kobj_attribute cpu_share_attr = __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store); +static ssize_t cpu_rt_runtime_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct user_struct *up = container_of(kobj, struct user_struct, kobj); + + return sprintf(buf, "%lu\n", sched_group_rt_runtime(up->tg)); +} + +static ssize_t cpu_rt_runtime_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t size) +{ + struct user_struct *up = container_of(kobj, struct user_struct, kobj); + unsigned long rt_runtime; + int rc; + + sscanf(buf, "%lu", &rt_runtime); + + rc = sched_group_set_rt_runtime(up->tg, rt_runtime); + + return (rc ? rc : size); +} + +static struct kobj_attribute cpu_rt_runtime_attr = + __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store); + /* default attributes per uid directory */ static struct attribute *uids_attributes[] = { &cpu_share_attr.attr, + &cpu_rt_runtime_attr.attr, NULL }; -- cgit v1.2.3 From 052f1dc7eb02300b05170ae341ccd03b76207778 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 Feb 2008 15:45:40 +0100 Subject: sched: rt-group: make rt groups scheduling configurable Make the rt group scheduler compile time configurable. Keep it experimental for now. 
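
[Aside, not part of the patch above: a minimal standalone sketch, in plain userspace C, of the 16.16 fixed-point admission test that to_ratio() and __rt_schedulable() in the rt_runtime interface patch are intended to perform. The period/runtime defaults come from that patch; the 40% example group is assumed for illustration, and the sketch does the division directly rather than via div64_64().]

	#include <stdint.h>
	#include <stdio.h>

	/* Sketch of to_ratio(): express runtime/period as a 16.16 fixed-point fraction. */
	static uint64_t to_ratio(uint64_t period, uint64_t runtime)
	{
		return (runtime << 16) / period;
	}

	int main(void)
	{
		/* patch defaults: sched_rt_period_us = 1000000, sched_rt_runtime_us = 950000 */
		uint64_t global_ratio = to_ratio(1000000, 950000);	/* 62259 - matches the removed sysctl_sched_rt_ratio default */
		uint64_t group_ratio  = to_ratio(1000000, 400000);	/* hypothetical group asking for 40% of the period */

		/* __rt_schedulable() admits a group while the summed group ratios stay below the global ratio */
		printf("global=%llu group=%llu admitted=%s\n",
		       (unsigned long long)global_ratio,
		       (unsigned long long)group_ratio,
		       group_ratio < global_ratio ? "yes" : "no");
		return 0;
	}
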
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 148 ++++++++++++++++++++++++++++++++++++++---------------- kernel/sched_rt.c | 12 ++--- kernel/user.c | 22 +++++--- 3 files changed, 126 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 85a5fbff2b0..5edc549edae 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -155,7 +155,7 @@ struct rt_prio_array { struct list_head queue[MAX_RT_PRIO]; }; -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_GROUP_SCHED #include @@ -165,19 +165,16 @@ static LIST_HEAD(task_groups); /* task group related information */ struct task_group { -#ifdef CONFIG_FAIR_CGROUP_SCHED +#ifdef CONFIG_CGROUP_SCHED struct cgroup_subsys_state css; #endif + +#ifdef CONFIG_FAIR_GROUP_SCHED /* schedulable entities of this group on each cpu */ struct sched_entity **se; /* runqueue "owned" by this group on each cpu */ struct cfs_rq **cfs_rq; - struct sched_rt_entity **rt_se; - struct rt_rq **rt_rq; - - u64 rt_runtime; - /* * shares assigned to a task group governs how much of cpu bandwidth * is allocated to the group. The more shares a group has, the more is @@ -213,24 +210,36 @@ struct task_group { * */ unsigned long shares; +#endif + +#ifdef CONFIG_RT_GROUP_SCHED + struct sched_rt_entity **rt_se; + struct rt_rq **rt_rq; + + u64 rt_runtime; +#endif struct rcu_head rcu; struct list_head list; }; +#ifdef CONFIG_FAIR_GROUP_SCHED /* Default task group's sched entity on each cpu */ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); /* Default task group's cfs_rq on each cpu */ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; -static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); -static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; - static struct sched_entity *init_sched_entity_p[NR_CPUS]; static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; +#endif + +#ifdef CONFIG_RT_GROUP_SCHED +static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); +static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS]; static struct rt_rq *init_rt_rq_p[NR_CPUS]; +#endif /* task_group_lock serializes add/remove of task groups and also changes to * a task group's cpu shares. @@ -240,6 +249,7 @@ static DEFINE_SPINLOCK(task_group_lock); /* doms_cur_mutex serializes access to doms_cur[] array */ static DEFINE_MUTEX(doms_cur_mutex); +#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_SMP /* kernel thread that runs rebalance_shares() periodically */ static struct task_struct *lb_monitor_task; @@ -248,35 +258,40 @@ static int load_balance_monitor(void *unused); static void set_se_shares(struct sched_entity *se, unsigned long shares); +#ifdef CONFIG_USER_SCHED +# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) +#else +# define INIT_TASK_GROUP_LOAD NICE_0_LOAD +#endif + +#define MIN_GROUP_SHARES 2 + +static int init_task_group_load = INIT_TASK_GROUP_LOAD; +#endif + /* Default task group. * Every task in system belong to this group at bootup. 
*/ struct task_group init_task_group = { +#ifdef CONFIG_FAIR_GROUP_SCHED .se = init_sched_entity_p, .cfs_rq = init_cfs_rq_p, +#endif +#ifdef CONFIG_RT_GROUP_SCHED .rt_se = init_sched_rt_entity_p, .rt_rq = init_rt_rq_p, -}; - -#ifdef CONFIG_FAIR_USER_SCHED -# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) -#else -# define INIT_TASK_GROUP_LOAD NICE_0_LOAD #endif - -#define MIN_GROUP_SHARES 2 - -static int init_task_group_load = INIT_TASK_GROUP_LOAD; +}; /* return group to which a task belongs */ static inline struct task_group *task_group(struct task_struct *p) { struct task_group *tg; -#ifdef CONFIG_FAIR_USER_SCHED +#ifdef CONFIG_USER_SCHED tg = p->user->tg; -#elif defined(CONFIG_FAIR_CGROUP_SCHED) +#elif defined(CONFIG_CGROUP_SCHED) tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), struct task_group, css); #else @@ -288,11 +303,15 @@ static inline struct task_group *task_group(struct task_struct *p) /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { +#ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; p->se.parent = task_group(p)->se[cpu]; +#endif +#ifdef CONFIG_RT_GROUP_SCHED p->rt.rt_rq = task_group(p)->rt_rq[cpu]; p->rt.parent = task_group(p)->rt_se[cpu]; +#endif } static inline void lock_doms_cur(void) @@ -311,7 +330,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } static inline void lock_doms_cur(void) { } static inline void unlock_doms_cur(void) { } -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* CONFIG_GROUP_SCHED */ /* CFS-related fields in a runqueue */ struct cfs_rq { @@ -351,7 +370,7 @@ struct cfs_rq { struct rt_rq { struct rt_prio_array active; unsigned long rt_nr_running; -#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED int highest_prio; /* highest queued rt task prio */ #endif #ifdef CONFIG_SMP @@ -361,7 +380,7 @@ struct rt_rq { int rt_throttled; u64 rt_time; -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_RT_GROUP_SCHED unsigned long rt_nr_boosted; struct rq *rq; @@ -437,6 +456,8 @@ struct rq { #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ struct list_head leaf_cfs_rq_list; +#endif +#ifdef CONFIG_RT_GROUP_SCHED struct list_head leaf_rt_rq_list; #endif @@ -7104,7 +7125,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) /* delimiter for bitsearch: */ __set_bit(MAX_RT_PRIO, array->bitmap); -#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED rt_rq->highest_prio = MAX_RT_PRIO; #endif #ifdef CONFIG_SMP @@ -7115,7 +7136,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) rt_rq->rt_time = 0; rt_rq->rt_throttled = 0; -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_RT_GROUP_SCHED rt_rq->rt_nr_boosted = 0; rt_rq->rq = rq; #endif @@ -7139,7 +7160,9 @@ static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, se->load.inv_weight = div64_64(1ULL<<32, se->load.weight); se->parent = NULL; } +#endif +#ifdef CONFIG_RT_GROUP_SCHED static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int cpu, int add) @@ -7168,7 +7191,7 @@ void __init sched_init(void) init_defrootdomain(); #endif -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_GROUP_SCHED list_add(&init_task_group.list, &task_groups); #endif @@ -7189,6 +7212,8 @@ void __init sched_init(void) &per_cpu(init_cfs_rq, i), 
&per_cpu(init_sched_entity, i), i, 1); +#endif +#ifdef CONFIG_RT_GROUP_SCHED init_task_group.rt_runtime = sysctl_sched_rt_runtime * NSEC_PER_USEC; INIT_LIST_HEAD(&rq->leaf_rt_rq_list); @@ -7381,9 +7406,9 @@ void set_curr_task(int cpu, struct task_struct *p) #endif -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_GROUP_SCHED -#ifdef CONFIG_SMP +#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP /* * distribute shares of all task groups among their schedulable entities, * to reflect load distribution across cpus. @@ -7539,20 +7564,28 @@ static void free_sched_group(struct task_group *tg) int i; for_each_possible_cpu(i) { +#ifdef CONFIG_FAIR_GROUP_SCHED if (tg->cfs_rq) kfree(tg->cfs_rq[i]); if (tg->se) kfree(tg->se[i]); +#endif +#ifdef CONFIG_RT_GROUP_SCHED if (tg->rt_rq) kfree(tg->rt_rq[i]); if (tg->rt_se) kfree(tg->rt_se[i]); +#endif } +#ifdef CONFIG_FAIR_GROUP_SCHED kfree(tg->cfs_rq); kfree(tg->se); +#endif +#ifdef CONFIG_RT_GROUP_SCHED kfree(tg->rt_rq); kfree(tg->rt_se); +#endif kfree(tg); } @@ -7560,10 +7593,14 @@ static void free_sched_group(struct task_group *tg) struct task_group *sched_create_group(void) { struct task_group *tg; +#ifdef CONFIG_FAIR_GROUP_SCHED struct cfs_rq *cfs_rq; struct sched_entity *se; +#endif +#ifdef CONFIG_RT_GROUP_SCHED struct rt_rq *rt_rq; struct sched_rt_entity *rt_se; +#endif struct rq *rq; unsigned long flags; int i; @@ -7572,12 +7609,18 @@ struct task_group *sched_create_group(void) if (!tg) return ERR_PTR(-ENOMEM); +#ifdef CONFIG_FAIR_GROUP_SCHED tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL); if (!tg->cfs_rq) goto err; tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); if (!tg->se) goto err; + + tg->shares = NICE_0_LOAD; +#endif + +#ifdef CONFIG_RT_GROUP_SCHED tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL); if (!tg->rt_rq) goto err; @@ -7585,12 +7628,13 @@ struct task_group *sched_create_group(void) if (!tg->rt_se) goto err; - tg->shares = NICE_0_LOAD; tg->rt_runtime = 0; +#endif for_each_possible_cpu(i) { rq = cpu_rq(i); +#ifdef CONFIG_FAIR_GROUP_SCHED cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); if (!cfs_rq) @@ -7601,6 +7645,10 @@ struct task_group *sched_create_group(void) if (!se) goto err; + init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0); +#endif + +#ifdef CONFIG_RT_GROUP_SCHED rt_rq = kmalloc_node(sizeof(struct rt_rq), GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); if (!rt_rq) @@ -7611,17 +7659,21 @@ struct task_group *sched_create_group(void) if (!rt_se) goto err; - init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0); init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0); +#endif } spin_lock_irqsave(&task_group_lock, flags); for_each_possible_cpu(i) { rq = cpu_rq(i); +#ifdef CONFIG_FAIR_GROUP_SCHED cfs_rq = tg->cfs_rq[i]; list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); +#endif +#ifdef CONFIG_RT_GROUP_SCHED rt_rq = tg->rt_rq[i]; list_add_rcu(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); +#endif } list_add_rcu(&tg->list, &task_groups); spin_unlock_irqrestore(&task_group_lock, flags); @@ -7643,23 +7695,21 @@ static void free_sched_group_rcu(struct rcu_head *rhp) /* Destroy runqueue etc associated with a task group */ void sched_destroy_group(struct task_group *tg) { - struct cfs_rq *cfs_rq = NULL; - struct rt_rq *rt_rq = NULL; unsigned long flags; int i; spin_lock_irqsave(&task_group_lock, flags); for_each_possible_cpu(i) { - cfs_rq = tg->cfs_rq[i]; - list_del_rcu(&cfs_rq->leaf_cfs_rq_list); - rt_rq = tg->rt_rq[i]; - list_del_rcu(&rt_rq->leaf_rt_rq_list); +#ifdef CONFIG_FAIR_GROUP_SCHED + 
list_del_rcu(&tg->cfs_rq[i]->leaf_cfs_rq_list); +#endif +#ifdef CONFIG_RT_GROUP_SCHED + list_del_rcu(&tg->rt_rq[i]->leaf_rt_rq_list); +#endif } list_del_rcu(&tg->list); spin_unlock_irqrestore(&task_group_lock, flags); - BUG_ON(!cfs_rq); - /* wait for possible concurrent references to cfs_rqs complete */ call_rcu(&tg->rcu, free_sched_group_rcu); } @@ -7699,6 +7749,7 @@ void sched_move_task(struct task_struct *tsk) task_rq_unlock(rq, &flags); } +#ifdef CONFIG_FAIR_GROUP_SCHED /* rq->lock to be locked by caller */ static void set_se_shares(struct sched_entity *se, unsigned long shares) { @@ -7786,7 +7837,9 @@ unsigned long sched_group_shares(struct task_group *tg) { return tg->shares; } +#endif +#ifdef CONFIG_RT_GROUP_SCHED /* * Ensure that the real time constraints are schedulable. */ @@ -7858,9 +7911,10 @@ long sched_group_rt_runtime(struct task_group *tg) do_div(rt_runtime_us, NSEC_PER_USEC); return rt_runtime_us; } -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif +#endif /* CONFIG_GROUP_SCHED */ -#ifdef CONFIG_FAIR_CGROUP_SCHED +#ifdef CONFIG_CGROUP_SCHED /* return corresponding task_group object of a cgroup */ static inline struct task_group *cgroup_tg(struct cgroup *cgrp) @@ -7920,6 +7974,7 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, sched_move_task(tsk); } +#ifdef CONFIG_FAIR_GROUP_SCHED static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype, u64 shareval) { @@ -7932,7 +7987,9 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft) return (u64) tg->shares; } +#endif +#ifdef CONFIG_RT_GROUP_SCHED static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, struct file *file, const char __user *userbuf, @@ -7977,18 +8034,23 @@ static ssize_t cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft, return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); } +#endif static struct cftype cpu_files[] = { +#ifdef CONFIG_FAIR_GROUP_SCHED { .name = "shares", .read_uint = cpu_shares_read_uint, .write_uint = cpu_shares_write_uint, }, +#endif +#ifdef CONFIG_RT_GROUP_SCHED { .name = "rt_runtime_us", .read = cpu_rt_runtime_read, .write = cpu_rt_runtime_write, }, +#endif }; static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) @@ -8007,7 +8069,7 @@ struct cgroup_subsys cpu_cgroup_subsys = { .early_init = 1, }; -#endif /* CONFIG_FAIR_CGROUP_SCHED */ +#endif /* CONFIG_CGROUP_SCHED */ #ifdef CONFIG_CGROUP_CPUACCT diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 35825b28e42..f54792b175b 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -55,7 +55,7 @@ static inline int on_rt_rq(struct sched_rt_entity *rt_se) return !list_empty(&rt_se->run_list); } -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_RT_GROUP_SCHED static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) { @@ -177,7 +177,7 @@ static inline int rt_rq_throttled(struct rt_rq *rt_rq) static inline int rt_se_prio(struct sched_rt_entity *rt_se) { -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_RT_GROUP_SCHED struct rt_rq *rt_rq = group_rt_rq(rt_se); if (rt_rq) @@ -269,7 +269,7 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { WARN_ON(!rt_prio(rt_se_prio(rt_se))); rt_rq->rt_nr_running++; -#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED if (rt_se_prio(rt_se) < rt_rq->highest_prio) rt_rq->highest_prio = rt_se_prio(rt_se); #endif @@ -281,7 +281,7 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 
update_rt_migration(rq_of_rt_rq(rt_rq)); #endif -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_RT_GROUP_SCHED if (rt_se_boosted(rt_se)) rt_rq->rt_nr_boosted++; #endif @@ -293,7 +293,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) WARN_ON(!rt_prio(rt_se_prio(rt_se))); WARN_ON(!rt_rq->rt_nr_running); rt_rq->rt_nr_running--; -#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED if (rt_rq->rt_nr_running) { struct rt_prio_array *array; @@ -315,7 +315,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) update_rt_migration(rq_of_rt_rq(rt_rq)); #endif /* CONFIG_SMP */ -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_RT_GROUP_SCHED if (rt_se_boosted(rt_se)) rt_rq->rt_nr_boosted--; diff --git a/kernel/user.c b/kernel/user.c index 9f6d471bfd0..7132022a040 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -57,7 +57,7 @@ struct user_struct root_user = { .uid_keyring = &root_user_keyring, .session_keyring = &root_session_keyring, #endif -#ifdef CONFIG_FAIR_USER_SCHED +#ifdef CONFIG_USER_SCHED .tg = &init_task_group, #endif }; @@ -90,7 +90,7 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) return NULL; } -#ifdef CONFIG_FAIR_USER_SCHED +#ifdef CONFIG_USER_SCHED static void sched_destroy_user(struct user_struct *up) { @@ -113,15 +113,15 @@ static void sched_switch_user(struct task_struct *p) sched_move_task(p); } -#else /* CONFIG_FAIR_USER_SCHED */ +#else /* CONFIG_USER_SCHED */ static void sched_destroy_user(struct user_struct *up) { } static int sched_create_user(struct user_struct *up) { return 0; } static void sched_switch_user(struct task_struct *p) { } -#endif /* CONFIG_FAIR_USER_SCHED */ +#endif /* CONFIG_USER_SCHED */ -#if defined(CONFIG_FAIR_USER_SCHED) && defined(CONFIG_SYSFS) +#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS) static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */ static DEFINE_MUTEX(uids_mutex); @@ -137,6 +137,7 @@ static inline void uids_mutex_unlock(void) } /* uid directory attributes */ +#ifdef CONFIG_FAIR_GROUP_SCHED static ssize_t cpu_shares_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -163,7 +164,9 @@ static ssize_t cpu_shares_store(struct kobject *kobj, static struct kobj_attribute cpu_share_attr = __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store); +#endif +#ifdef CONFIG_RT_GROUP_SCHED static ssize_t cpu_rt_runtime_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -190,11 +193,16 @@ static ssize_t cpu_rt_runtime_store(struct kobject *kobj, static struct kobj_attribute cpu_rt_runtime_attr = __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store); +#endif /* default attributes per uid directory */ static struct attribute *uids_attributes[] = { +#ifdef CONFIG_FAIR_GROUP_SCHED &cpu_share_attr.attr, +#endif +#ifdef CONFIG_RT_GROUP_SCHED &cpu_rt_runtime_attr.attr, +#endif NULL }; @@ -297,7 +305,7 @@ static inline void free_user(struct user_struct *up, unsigned long flags) schedule_work(&up->work); } -#else /* CONFIG_FAIR_USER_SCHED && CONFIG_SYSFS */ +#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */ int uids_sysfs_init(void) { return 0; } static inline int uids_user_create(struct user_struct *up) { return 0; } @@ -401,7 +409,7 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { - /* This case is not possible when CONFIG_FAIR_USER_SCHED + /* This 
case is not possible when CONFIG_USER_SCHED * is defined, since we serialize alloc_uid() using * uids_mutex. Hence no need to call * sched_destroy_user() or remove_user_sysfs_dir(). -- cgit v1.2.3 From bccbe08a60973c873e6af6fdb9ec11ffb1a6e4de Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 Feb 2008 15:45:40 +0100 Subject: sched: rt-group: clean up the ifdeffery Clean up some of the excessive ifdeffery introduces in the last patch. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 210 ++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 139 insertions(+), 71 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 5edc549edae..d2f4398c5e6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7559,57 +7559,29 @@ static int load_balance_monitor(void *unused) } #endif /* CONFIG_SMP */ -static void free_sched_group(struct task_group *tg) +#ifdef CONFIG_FAIR_GROUP_SCHED +static void free_fair_sched_group(struct task_group *tg) { int i; for_each_possible_cpu(i) { -#ifdef CONFIG_FAIR_GROUP_SCHED if (tg->cfs_rq) kfree(tg->cfs_rq[i]); if (tg->se) kfree(tg->se[i]); -#endif -#ifdef CONFIG_RT_GROUP_SCHED - if (tg->rt_rq) - kfree(tg->rt_rq[i]); - if (tg->rt_se) - kfree(tg->rt_se[i]); -#endif } -#ifdef CONFIG_FAIR_GROUP_SCHED kfree(tg->cfs_rq); kfree(tg->se); -#endif -#ifdef CONFIG_RT_GROUP_SCHED - kfree(tg->rt_rq); - kfree(tg->rt_se); -#endif - kfree(tg); } -/* allocate runqueue etc for a new task group */ -struct task_group *sched_create_group(void) +static int alloc_fair_sched_group(struct task_group *tg) { - struct task_group *tg; -#ifdef CONFIG_FAIR_GROUP_SCHED struct cfs_rq *cfs_rq; struct sched_entity *se; -#endif -#ifdef CONFIG_RT_GROUP_SCHED - struct rt_rq *rt_rq; - struct sched_rt_entity *rt_se; -#endif struct rq *rq; - unsigned long flags; int i; - tg = kzalloc(sizeof(*tg), GFP_KERNEL); - if (!tg) - return ERR_PTR(-ENOMEM); - -#ifdef CONFIG_FAIR_GROUP_SCHED tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL); if (!tg->cfs_rq) goto err; @@ -7618,23 +7590,10 @@ struct task_group *sched_create_group(void) goto err; tg->shares = NICE_0_LOAD; -#endif - -#ifdef CONFIG_RT_GROUP_SCHED - tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL); - if (!tg->rt_rq) - goto err; - tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL); - if (!tg->rt_se) - goto err; - - tg->rt_runtime = 0; -#endif for_each_possible_cpu(i) { rq = cpu_rq(i); -#ifdef CONFIG_FAIR_GROUP_SCHED cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); if (!cfs_rq) @@ -7646,9 +7605,78 @@ struct task_group *sched_create_group(void) goto err; init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0); + } + + return 1; + + err: + return 0; +} + +static inline void register_fair_sched_group(struct task_group *tg, int cpu) +{ + list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list, + &cpu_rq(cpu)->leaf_cfs_rq_list); +} + +static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) +{ + list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); +} +#else +static inline void free_fair_sched_group(struct task_group *tg) +{ +} + +static inline int alloc_fair_sched_group(struct task_group *tg) +{ + return 1; +} + +static inline void register_fair_sched_group(struct task_group *tg, int cpu) +{ +} + +static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) +{ +} #endif #ifdef CONFIG_RT_GROUP_SCHED +static void free_rt_sched_group(struct task_group *tg) +{ + int i; + + for_each_possible_cpu(i) { + if 
(tg->rt_rq) + kfree(tg->rt_rq[i]); + if (tg->rt_se) + kfree(tg->rt_se[i]); + } + + kfree(tg->rt_rq); + kfree(tg->rt_se); +} + +static int alloc_rt_sched_group(struct task_group *tg) +{ + struct rt_rq *rt_rq; + struct sched_rt_entity *rt_se; + struct rq *rq; + int i; + + tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL); + if (!tg->rt_rq) + goto err; + tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL); + if (!tg->rt_se) + goto err; + + tg->rt_runtime = 0; + + for_each_possible_cpu(i) { + rq = cpu_rq(i); + rt_rq = kmalloc_node(sizeof(struct rt_rq), GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); if (!rt_rq) @@ -7660,20 +7688,71 @@ struct task_group *sched_create_group(void) goto err; init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0); -#endif } + return 1; + + err: + return 0; +} + +static inline void register_rt_sched_group(struct task_group *tg, int cpu) +{ + list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list, + &cpu_rq(cpu)->leaf_rt_rq_list); +} + +static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) +{ + list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); +} +#else +static inline void free_rt_sched_group(struct task_group *tg) +{ +} + +static inline int alloc_rt_sched_group(struct task_group *tg) +{ + return 1; +} + +static inline void register_rt_sched_group(struct task_group *tg, int cpu) +{ +} + +static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) +{ +} +#endif + +static void free_sched_group(struct task_group *tg) +{ + free_fair_sched_group(tg); + free_rt_sched_group(tg); + kfree(tg); +} + +/* allocate runqueue etc for a new task group */ +struct task_group *sched_create_group(void) +{ + struct task_group *tg; + unsigned long flags; + int i; + + tg = kzalloc(sizeof(*tg), GFP_KERNEL); + if (!tg) + return ERR_PTR(-ENOMEM); + + if (!alloc_fair_sched_group(tg)) + goto err; + + if (!alloc_rt_sched_group(tg)) + goto err; + spin_lock_irqsave(&task_group_lock, flags); for_each_possible_cpu(i) { - rq = cpu_rq(i); -#ifdef CONFIG_FAIR_GROUP_SCHED - cfs_rq = tg->cfs_rq[i]; - list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); -#endif -#ifdef CONFIG_RT_GROUP_SCHED - rt_rq = tg->rt_rq[i]; - list_add_rcu(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); -#endif + register_fair_sched_group(tg, i); + register_rt_sched_group(tg, i); } list_add_rcu(&tg->list, &task_groups); spin_unlock_irqrestore(&task_group_lock, flags); @@ -7700,12 +7779,8 @@ void sched_destroy_group(struct task_group *tg) spin_lock_irqsave(&task_group_lock, flags); for_each_possible_cpu(i) { -#ifdef CONFIG_FAIR_GROUP_SCHED - list_del_rcu(&tg->cfs_rq[i]->leaf_cfs_rq_list); -#endif -#ifdef CONFIG_RT_GROUP_SCHED - list_del_rcu(&tg->rt_rq[i]->leaf_rt_rq_list); -#endif + unregister_fair_sched_group(tg, i); + unregister_rt_sched_group(tg, i); } list_del_rcu(&tg->list); spin_unlock_irqrestore(&task_group_lock, flags); @@ -7780,8 +7855,6 @@ static DEFINE_MUTEX(shares_mutex); int sched_group_set_shares(struct task_group *tg, unsigned long shares) { int i; - struct cfs_rq *cfs_rq; - struct rq *rq; unsigned long flags; mutex_lock(&shares_mutex); @@ -7797,10 +7870,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) * by taking it off the rq->leaf_cfs_rq_list on each cpu. 
*/ spin_lock_irqsave(&task_group_lock, flags); - for_each_possible_cpu(i) { - cfs_rq = tg->cfs_rq[i]; - list_del_rcu(&cfs_rq->leaf_cfs_rq_list); - } + for_each_possible_cpu(i) + unregister_fair_sched_group(tg, i); spin_unlock_irqrestore(&task_group_lock, flags); /* wait for any ongoing reference to this group to finish */ @@ -7822,11 +7893,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) * each cpu's rq->leaf_cfs_rq_list. */ spin_lock_irqsave(&task_group_lock, flags); - for_each_possible_cpu(i) { - rq = cpu_rq(i); - cfs_rq = tg->cfs_rq[i]; - list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); - } + for_each_possible_cpu(i) + register_fair_sched_group(tg, i); spin_unlock_irqrestore(&task_group_lock, flags); done: mutex_unlock(&shares_mutex); -- cgit v1.2.3 From b68aa2300cabeb96801369a4bb37a4f19f59ed84 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 Feb 2008 15:45:40 +0100 Subject: sched: rt-group: refure unrunnable tasks Refuse to accept or create RT tasks in groups that can't run them. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index d2f4398c5e6..f28f19e65b5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4584,6 +4584,15 @@ recheck: return -EPERM; } +#ifdef CONFIG_RT_GROUP_SCHED + /* + * Do not allow realtime tasks into groups that have no runtime + * assigned. + */ + if (rt_policy(policy) && task_group(p)->rt_runtime == 0) + return -EPERM; +#endif + retval = security_task_setscheduler(p, policy, param); if (retval) return retval; @@ -8028,9 +8037,15 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, struct task_struct *tsk) { +#ifdef CONFIG_RT_GROUP_SCHED + /* Don't accept realtime tasks when there is no way for them to run */ + if (rt_task(tsk) && cgroup_tg(cgrp)->rt_runtime == 0) + return -EINVAL; +#else /* We don't support RT-tasks being in separate groups */ if (tsk->sched_class != &fair_sched_class) return -EINVAL; +#endif return 0; } -- cgit v1.2.3 From fbf6bfca76d50abef478ba902b8597ecbadfd390 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 13 Feb 2008 15:03:15 -0800 Subject: rcupdate: fix comment This comment caused some consternation during fastcall removal. Make it truthful. Signed-off-by: Paul E. McKenney Cc: Harvey Harrison Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcupdate.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 760dfc233a0..c09605f8d16 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -56,7 +56,10 @@ static atomic_t rcu_barrier_cpu_count; static DEFINE_MUTEX(rcu_barrier_mutex); static struct completion rcu_barrier_completion; -/* Because of FASTCALL declaration of complete, we use this wrapper */ +/* + * Awaken the corresponding synchronize_rcu() instance now that a + * grace period has elapsed. + */ static void wakeme_after_rcu(struct rcu_head *head) { struct rcu_synchronize *rcu; -- cgit v1.2.3 From b5606c2d4447e80b1d72406af4e78af1eda611d4 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Wed, 13 Feb 2008 15:03:16 -0800 Subject: remove final fastcall users fastcall always expands to empty, remove it. 
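
[Aside, not part of the patch above: a small hypothetical sketch of why this removal is a no-op. As the commit message says, the annotation expands to empty on every remaining architecture (historically it selected register argument passing on i386), so the two declarations below compile identically.]

	/* hypothetical illustration: the annotation being removed is an empty macro */
	#define fastcall			/* expands to nothing */

	struct task_struct;
	int fastcall __fatal_signal_pending(struct task_struct *tsk);	/* before this patch */
	int __fatal_signal_pending(struct task_struct *tsk);		/* after this patch  */
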
Signed-off-by: Harvey Harrison Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 2c1f08defac..84917fe507f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -972,7 +972,7 @@ void zap_other_threads(struct task_struct *p) } } -int fastcall __fatal_signal_pending(struct task_struct *tsk) +int __fatal_signal_pending(struct task_struct *tsk) { return sigismember(&tsk->pending.signal, SIGKILL); } -- cgit v1.2.3 From 064d9efe947542097be669581f82d6b097e81d1a Mon Sep 17 00:00:00 2001 From: Nishanth Aravamudan Date: Wed, 13 Feb 2008 15:03:19 -0800 Subject: hugetlb: fix overcommit locking proc_doulongvec_minmax() calls copy_to_user()/copy_from_user(), so we can't hold hugetlb_lock over the call. Use a dummy variable to store the sysctl result, like in hugetlb_sysctl_handler(), then grab the lock to update nr_overcommit_huge_pages. Signed-off-by: Nishanth Aravamudan Reported-by: Miles Lane Cc: Adam Litke Cc: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 924c674b76e..8b7e9541179 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -978,8 +978,8 @@ static struct ctl_table vm_table[] = { { .ctl_name = CTL_UNNUMBERED, .procname = "nr_overcommit_hugepages", - .data = &nr_overcommit_huge_pages, - .maxlen = sizeof(nr_overcommit_huge_pages), + .data = &sysctl_overcommit_huge_pages, + .maxlen = sizeof(sysctl_overcommit_huge_pages), .mode = 0644, .proc_handler = &hugetlb_overcommit_handler, }, -- cgit v1.2.3 From fb40bd78b0f91b274879cf5db8facd1e04b6052e Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Wed, 13 Feb 2008 15:03:37 -0800 Subject: Linux Kernel Markers: support multiple probes RCU style multiple probes support for the Linux Kernel Markers. Common case (one probe) is still fast and does not require dynamic allocation or a supplementary pointer dereference on the fast path. - Move preempt disable from the marker site to the callback. Since we now have an internal callback, move the preempt disable/enable to the callback instead of the marker site. Since the callback change is done asynchronously (passing from a handler that supports arguments to a handler that does not setup the arguments is no arguments are passed), we can safely update it even if it is outside the preempt disable section. - Move probe arm to probe connection. Now, a connected probe is automatically armed. Remove MARK_MAX_FORMAT_LEN, unused. This patch modifies the Linux Kernel Markers API : it removes the probe "arm/disarm" and changes the probe function prototype : it now expects a va_list * instead of a "...". If we want to have more than one probe connected to a marker at a given time (LTTng, or blktrace, ssytemtap) then we need this patch. Without it, connecting a second probe handler to a marker will fail. It allow us, for instance, to do interesting combinations : Do standard tracing with LTTng and, eventually, to compute statistics with SystemTAP, or to have a special trigger on an event that would call a systemtap script which would stop flight recorder tracing. Signed-off-by: Mathieu Desnoyers Cc: Christoph Hellwig Cc: Mike Mason Cc: Dipankar Sarma Cc: David Smith Cc: "Paul E. McKenney" Cc: "Frank Ch. 
Eigler" Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/marker.c | 677 ++++++++++++++++++++++++++++++++++++++++++-------------- kernel/module.c | 7 +- 2 files changed, 508 insertions(+), 176 deletions(-) (limited to 'kernel') diff --git a/kernel/marker.c b/kernel/marker.c index 5323cfaedbc..c4c2cd8b61f 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -27,22 +27,15 @@ extern struct marker __start___markers[]; extern struct marker __stop___markers[]; +/* Set to 1 to enable marker debug output */ +const int marker_debug; + /* * markers_mutex nests inside module_mutex. Markers mutex protects the builtin - * and module markers, the hash table and deferred_sync. + * and module markers and the hash table. */ static DEFINE_MUTEX(markers_mutex); -/* - * Marker deferred synchronization. - * Upon marker probe_unregister, we delay call to synchronize_sched() to - * accelerate mass unregistration (only when there is no more reference to a - * given module do we call synchronize_sched()). However, we need to make sure - * every critical region has ended before we re-arm a marker that has been - * unregistered and then registered back with a different probe data. - */ -static int deferred_sync; - /* * Marker hash table, containing the active markers. * Protected by module_mutex. @@ -50,12 +43,26 @@ static int deferred_sync; #define MARKER_HASH_BITS 6 #define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS) +/* + * Note about RCU : + * It is used to make sure every handler has finished using its private data + * between two consecutive operation (add or remove) on a given marker. It is + * also used to delay the free of multiple probes array until a quiescent state + * is reached. + * marker entries modifications are protected by the markers_mutex. + */ struct marker_entry { struct hlist_node hlist; char *format; - marker_probe_func *probe; - void *private; + void (*call)(const struct marker *mdata, /* Probe wrapper */ + void *call_private, const char *fmt, ...); + struct marker_probe_closure single; + struct marker_probe_closure *multi; int refcount; /* Number of times armed. 0 if disarmed. */ + struct rcu_head rcu; + void *oldptr; + char rcu_pending:1; + char ptype:1; char name[0]; /* Contains name'\0'format'\0' */ }; @@ -63,7 +70,8 @@ static struct hlist_head marker_table[MARKER_TABLE_SIZE]; /** * __mark_empty_function - Empty probe callback - * @mdata: pointer of type const struct marker + * @probe_private: probe private data + * @call_private: call site private data * @fmt: format string * @...: variable argument list * @@ -72,12 +80,266 @@ static struct hlist_head marker_table[MARKER_TABLE_SIZE]; * though the function pointer change and the marker enabling are two distinct * operations that modifies the execution flow of preemptible code. */ -void __mark_empty_function(const struct marker *mdata, void *private, - const char *fmt, ...) +void __mark_empty_function(void *probe_private, void *call_private, + const char *fmt, va_list *args) { } EXPORT_SYMBOL_GPL(__mark_empty_function); +/* + * marker_probe_cb Callback that prepares the variable argument list for probes. + * @mdata: pointer of type struct marker + * @call_private: caller site private data + * @fmt: format string + * @...: Variable argument list. + * + * Since we do not use "typical" pointer based RCU in the 1 argument case, we + * need to put a full smp_rmb() in this branch. This is why we do not use + * rcu_dereference() for the pointer read. 
+ */ +void marker_probe_cb(const struct marker *mdata, void *call_private, + const char *fmt, ...) +{ + va_list args; + char ptype; + + /* + * disabling preemption to make sure the teardown of the callbacks can + * be done correctly when they are in modules and they insure RCU read + * coherency. + */ + preempt_disable(); + ptype = ACCESS_ONCE(mdata->ptype); + if (likely(!ptype)) { + marker_probe_func *func; + /* Must read the ptype before ptr. They are not data dependant, + * so we put an explicit smp_rmb() here. */ + smp_rmb(); + func = ACCESS_ONCE(mdata->single.func); + /* Must read the ptr before private data. They are not data + * dependant, so we put an explicit smp_rmb() here. */ + smp_rmb(); + va_start(args, fmt); + func(mdata->single.probe_private, call_private, fmt, &args); + va_end(args); + } else { + struct marker_probe_closure *multi; + int i; + /* + * multi points to an array, therefore accessing the array + * depends on reading multi. However, even in this case, + * we must insure that the pointer is read _before_ the array + * data. Same as rcu_dereference, but we need a full smp_rmb() + * in the fast path, so put the explicit barrier here. + */ + smp_read_barrier_depends(); + multi = ACCESS_ONCE(mdata->multi); + for (i = 0; multi[i].func; i++) { + va_start(args, fmt); + multi[i].func(multi[i].probe_private, call_private, fmt, + &args); + va_end(args); + } + } + preempt_enable(); +} +EXPORT_SYMBOL_GPL(marker_probe_cb); + +/* + * marker_probe_cb Callback that does not prepare the variable argument list. + * @mdata: pointer of type struct marker + * @call_private: caller site private data + * @fmt: format string + * @...: Variable argument list. + * + * Should be connected to markers "MARK_NOARGS". + */ +void marker_probe_cb_noarg(const struct marker *mdata, + void *call_private, const char *fmt, ...) +{ + va_list args; /* not initialized */ + char ptype; + + preempt_disable(); + ptype = ACCESS_ONCE(mdata->ptype); + if (likely(!ptype)) { + marker_probe_func *func; + /* Must read the ptype before ptr. They are not data dependant, + * so we put an explicit smp_rmb() here. */ + smp_rmb(); + func = ACCESS_ONCE(mdata->single.func); + /* Must read the ptr before private data. They are not data + * dependant, so we put an explicit smp_rmb() here. */ + smp_rmb(); + func(mdata->single.probe_private, call_private, fmt, &args); + } else { + struct marker_probe_closure *multi; + int i; + /* + * multi points to an array, therefore accessing the array + * depends on reading multi. However, even in this case, + * we must insure that the pointer is read _before_ the array + * data. Same as rcu_dereference, but we need a full smp_rmb() + * in the fast path, so put the explicit barrier here. 
+ */ + smp_read_barrier_depends(); + multi = ACCESS_ONCE(mdata->multi); + for (i = 0; multi[i].func; i++) + multi[i].func(multi[i].probe_private, call_private, fmt, + &args); + } + preempt_enable(); +} +EXPORT_SYMBOL_GPL(marker_probe_cb_noarg); + +static void free_old_closure(struct rcu_head *head) +{ + struct marker_entry *entry = container_of(head, + struct marker_entry, rcu); + kfree(entry->oldptr); + /* Make sure we free the data before setting the pending flag to 0 */ + smp_wmb(); + entry->rcu_pending = 0; +} + +static void debug_print_probes(struct marker_entry *entry) +{ + int i; + + if (!marker_debug) + return; + + if (!entry->ptype) { + printk(KERN_DEBUG "Single probe : %p %p\n", + entry->single.func, + entry->single.probe_private); + } else { + for (i = 0; entry->multi[i].func; i++) + printk(KERN_DEBUG "Multi probe %d : %p %p\n", i, + entry->multi[i].func, + entry->multi[i].probe_private); + } +} + +static struct marker_probe_closure * +marker_entry_add_probe(struct marker_entry *entry, + marker_probe_func *probe, void *probe_private) +{ + int nr_probes = 0; + struct marker_probe_closure *old, *new; + + WARN_ON(!probe); + + debug_print_probes(entry); + old = entry->multi; + if (!entry->ptype) { + if (entry->single.func == probe && + entry->single.probe_private == probe_private) + return ERR_PTR(-EBUSY); + if (entry->single.func == __mark_empty_function) { + /* 0 -> 1 probes */ + entry->single.func = probe; + entry->single.probe_private = probe_private; + entry->refcount = 1; + entry->ptype = 0; + debug_print_probes(entry); + return NULL; + } else { + /* 1 -> 2 probes */ + nr_probes = 1; + old = NULL; + } + } else { + /* (N -> N+1), (N != 0, 1) probes */ + for (nr_probes = 0; old[nr_probes].func; nr_probes++) + if (old[nr_probes].func == probe + && old[nr_probes].probe_private + == probe_private) + return ERR_PTR(-EBUSY); + } + /* + 2 : one for new probe, one for NULL func */ + new = kzalloc((nr_probes + 2) * sizeof(struct marker_probe_closure), + GFP_KERNEL); + if (new == NULL) + return ERR_PTR(-ENOMEM); + if (!old) + new[0] = entry->single; + else + memcpy(new, old, + nr_probes * sizeof(struct marker_probe_closure)); + new[nr_probes].func = probe; + new[nr_probes].probe_private = probe_private; + entry->refcount = nr_probes + 1; + entry->multi = new; + entry->ptype = 1; + debug_print_probes(entry); + return old; +} + +static struct marker_probe_closure * +marker_entry_remove_probe(struct marker_entry *entry, + marker_probe_func *probe, void *probe_private) +{ + int nr_probes = 0, nr_del = 0, i; + struct marker_probe_closure *old, *new; + + old = entry->multi; + + debug_print_probes(entry); + if (!entry->ptype) { + /* 0 -> N is an error */ + WARN_ON(entry->single.func == __mark_empty_function); + /* 1 -> 0 probes */ + WARN_ON(probe && entry->single.func != probe); + WARN_ON(entry->single.probe_private != probe_private); + entry->single.func = __mark_empty_function; + entry->refcount = 0; + entry->ptype = 0; + debug_print_probes(entry); + return NULL; + } else { + /* (N -> M), (N > 1, M >= 0) probes */ + for (nr_probes = 0; old[nr_probes].func; nr_probes++) { + if ((!probe || old[nr_probes].func == probe) + && old[nr_probes].probe_private + == probe_private) + nr_del++; + } + } + + if (nr_probes - nr_del == 0) { + /* N -> 0, (N > 1) */ + entry->single.func = __mark_empty_function; + entry->refcount = 0; + entry->ptype = 0; + } else if (nr_probes - nr_del == 1) { + /* N -> 1, (N > 1) */ + for (i = 0; old[i].func; i++) + if ((probe && old[i].func != probe) || + old[i].probe_private 
!= probe_private) + entry->single = old[i]; + entry->refcount = 1; + entry->ptype = 0; + } else { + int j = 0; + /* N -> M, (N > 1, M > 1) */ + /* + 1 for NULL */ + new = kzalloc((nr_probes - nr_del + 1) + * sizeof(struct marker_probe_closure), GFP_KERNEL); + if (new == NULL) + return ERR_PTR(-ENOMEM); + for (i = 0; old[i].func; i++) + if ((probe && old[i].func != probe) || + old[i].probe_private != probe_private) + new[j++] = old[i]; + entry->refcount = nr_probes - nr_del; + entry->ptype = 1; + entry->multi = new; + } + debug_print_probes(entry); + return old; +} + /* * Get marker if the marker is present in the marker hash table. * Must be called with markers_mutex held. @@ -102,8 +364,7 @@ static struct marker_entry *get_marker(const char *name) * Add the marker to the marker hash table. Must be called with markers_mutex * held. */ -static int add_marker(const char *name, const char *format, - marker_probe_func *probe, void *private) +static struct marker_entry *add_marker(const char *name, const char *format) { struct hlist_head *head; struct hlist_node *node; @@ -118,9 +379,8 @@ static int add_marker(const char *name, const char *format, hlist_for_each_entry(e, node, head, hlist) { if (!strcmp(name, e->name)) { printk(KERN_NOTICE - "Marker %s busy, probe %p already installed\n", - name, e->probe); - return -EBUSY; /* Already there */ + "Marker %s busy\n", name); + return ERR_PTR(-EBUSY); /* Already there */ } } /* @@ -130,34 +390,42 @@ static int add_marker(const char *name, const char *format, e = kmalloc(sizeof(struct marker_entry) + name_len + format_len, GFP_KERNEL); if (!e) - return -ENOMEM; + return ERR_PTR(-ENOMEM); memcpy(&e->name[0], name, name_len); if (format) { e->format = &e->name[name_len]; memcpy(e->format, format, format_len); + if (strcmp(e->format, MARK_NOARGS) == 0) + e->call = marker_probe_cb_noarg; + else + e->call = marker_probe_cb; trace_mark(core_marker_format, "name %s format %s", e->name, e->format); - } else + } else { e->format = NULL; - e->probe = probe; - e->private = private; + e->call = marker_probe_cb; + } + e->single.func = __mark_empty_function; + e->single.probe_private = NULL; + e->multi = NULL; + e->ptype = 0; e->refcount = 0; + e->rcu_pending = 0; hlist_add_head(&e->hlist, head); - return 0; + return e; } /* * Remove the marker from the marker hash table. Must be called with mutex_lock * held. 
*/ -static void *remove_marker(const char *name) +static int remove_marker(const char *name) { struct hlist_head *head; struct hlist_node *node; struct marker_entry *e; int found = 0; size_t len = strlen(name) + 1; - void *private = NULL; u32 hash = jhash(name, len-1, 0); head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; @@ -167,12 +435,16 @@ static void *remove_marker(const char *name) break; } } - if (found) { - private = e->private; - hlist_del(&e->hlist); - kfree(e); - } - return private; + if (!found) + return -ENOENT; + if (e->single.func != __mark_empty_function) + return -EBUSY; + hlist_del(&e->hlist); + /* Make sure the call_rcu has been executed */ + if (e->rcu_pending) + rcu_barrier(); + kfree(e); + return 0; } /* @@ -184,6 +456,7 @@ static int marker_set_format(struct marker_entry **entry, const char *format) size_t name_len = strlen((*entry)->name) + 1; size_t format_len = strlen(format) + 1; + e = kmalloc(sizeof(struct marker_entry) + name_len + format_len, GFP_KERNEL); if (!e) @@ -191,11 +464,20 @@ static int marker_set_format(struct marker_entry **entry, const char *format) memcpy(&e->name[0], (*entry)->name, name_len); e->format = &e->name[name_len]; memcpy(e->format, format, format_len); - e->probe = (*entry)->probe; - e->private = (*entry)->private; + if (strcmp(e->format, MARK_NOARGS) == 0) + e->call = marker_probe_cb_noarg; + else + e->call = marker_probe_cb; + e->single = (*entry)->single; + e->multi = (*entry)->multi; + e->ptype = (*entry)->ptype; e->refcount = (*entry)->refcount; + e->rcu_pending = 0; hlist_add_before(&e->hlist, &(*entry)->hlist); hlist_del(&(*entry)->hlist); + /* Make sure the call_rcu has been executed */ + if ((*entry)->rcu_pending) + rcu_barrier(); kfree(*entry); *entry = e; trace_mark(core_marker_format, "name %s format %s", @@ -206,7 +488,8 @@ static int marker_set_format(struct marker_entry **entry, const char *format) /* * Sets the probe callback corresponding to one marker. */ -static int set_marker(struct marker_entry **entry, struct marker *elem) +static int set_marker(struct marker_entry **entry, struct marker *elem, + int active) { int ret; WARN_ON(strcmp((*entry)->name, elem->name) != 0); @@ -226,9 +509,43 @@ static int set_marker(struct marker_entry **entry, struct marker *elem) if (ret) return ret; } - elem->call = (*entry)->probe; - elem->private = (*entry)->private; - elem->state = 1; + + /* + * probe_cb setup (statically known) is done here. It is + * asynchronous with the rest of execution, therefore we only + * pass from a "safe" callback (with argument) to an "unsafe" + * callback (does not set arguments). + */ + elem->call = (*entry)->call; + /* + * Sanity check : + * We only update the single probe private data when the ptr is + * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1) + */ + WARN_ON(elem->single.func != __mark_empty_function + && elem->single.probe_private + != (*entry)->single.probe_private && + !elem->ptype); + elem->single.probe_private = (*entry)->single.probe_private; + /* + * Make sure the private data is valid when we update the + * single probe ptr. + */ + smp_wmb(); + elem->single.func = (*entry)->single.func; + /* + * We also make sure that the new probe callbacks array is consistent + * before setting a pointer to it. + */ + rcu_assign_pointer(elem->multi, (*entry)->multi); + /* + * Update the function or multi probe array pointer before setting the + * ptype. 
+ */ + smp_wmb(); + elem->ptype = (*entry)->ptype; + elem->state = active; + return 0; } @@ -240,8 +557,12 @@ static int set_marker(struct marker_entry **entry, struct marker *elem) */ static void disable_marker(struct marker *elem) { + /* leave "call" as is. It is known statically. */ elem->state = 0; - elem->call = __mark_empty_function; + elem->single.func = __mark_empty_function; + /* Update the function before setting the ptype */ + smp_wmb(); + elem->ptype = 0; /* single probe */ /* * Leave the private data and id there, because removal is racy and * should be done only after a synchronize_sched(). These are never used @@ -253,14 +574,11 @@ static void disable_marker(struct marker *elem) * marker_update_probe_range - Update a probe range * @begin: beginning of the range * @end: end of the range - * @probe_module: module address of the probe being updated - * @refcount: number of references left to the given probe_module (out) * * Updates the probe callback corresponding to a range of markers. */ void marker_update_probe_range(struct marker *begin, - struct marker *end, struct module *probe_module, - int *refcount) + struct marker *end) { struct marker *iter; struct marker_entry *mark_entry; @@ -268,15 +586,12 @@ void marker_update_probe_range(struct marker *begin, mutex_lock(&markers_mutex); for (iter = begin; iter < end; iter++) { mark_entry = get_marker(iter->name); - if (mark_entry && mark_entry->refcount) { - set_marker(&mark_entry, iter); + if (mark_entry) { + set_marker(&mark_entry, iter, + !!mark_entry->refcount); /* * ignore error, continue */ - if (probe_module) - if (probe_module == - __module_text_address((unsigned long)mark_entry->probe)) - (*refcount)++; } else { disable_marker(iter); } @@ -289,20 +604,27 @@ void marker_update_probe_range(struct marker *begin, * Issues a synchronize_sched() when no reference to the module passed * as parameter is found in the probes so the probe module can be * safely unloaded from now on. + * + * Internal callback only changed before the first probe is connected to it. + * Single probe private data can only be changed on 0 -> 1 and 2 -> 1 + * transitions. All other transitions will leave the old private data valid. + * This makes the non-atomicity of the callback/private data updates valid. + * + * "special case" updates : + * 0 -> 1 callback + * 1 -> 0 callback + * 1 -> 2 callbacks + * 2 -> 1 callbacks + * Other updates all behave the same, just like the 2 -> 3 or 3 -> 2 updates. + * Site effect : marker_set_format may delete the marker entry (creating a + * replacement). */ -static void marker_update_probes(struct module *probe_module) +static void marker_update_probes(void) { - int refcount = 0; - /* Core kernel markers */ - marker_update_probe_range(__start___markers, - __stop___markers, probe_module, &refcount); + marker_update_probe_range(__start___markers, __stop___markers); /* Markers in modules. */ - module_update_markers(probe_module, &refcount); - if (probe_module && refcount == 0) { - synchronize_sched(); - deferred_sync = 0; - } + module_update_markers(); } /** @@ -310,33 +632,49 @@ static void marker_update_probes(struct module *probe_module) * @name: marker name * @format: format string * @probe: probe handler - * @private: probe private data + * @probe_private: probe private data * * private data must be a valid allocated memory address, or NULL. * Returns 0 if ok, error value on error. + * The probe address must at least be aligned on the architecture pointer size. 
*/ int marker_probe_register(const char *name, const char *format, - marker_probe_func *probe, void *private) + marker_probe_func *probe, void *probe_private) { struct marker_entry *entry; int ret = 0; + struct marker_probe_closure *old; mutex_lock(&markers_mutex); entry = get_marker(name); - if (entry && entry->refcount) { - ret = -EBUSY; - goto end; - } - if (deferred_sync) { - synchronize_sched(); - deferred_sync = 0; + if (!entry) { + entry = add_marker(name, format); + if (IS_ERR(entry)) { + ret = PTR_ERR(entry); + goto end; + } } - ret = add_marker(name, format, probe, private); - if (ret) + /* + * If we detect that a call_rcu is pending for this marker, + * make sure it's executed now. + */ + if (entry->rcu_pending) + rcu_barrier(); + old = marker_entry_add_probe(entry, probe, probe_private); + if (IS_ERR(old)) { + ret = PTR_ERR(old); goto end; + } mutex_unlock(&markers_mutex); - marker_update_probes(NULL); - return ret; + marker_update_probes(); /* may update entry */ + mutex_lock(&markers_mutex); + entry = get_marker(name); + WARN_ON(!entry); + entry->oldptr = old; + entry->rcu_pending = 1; + /* write rcu_pending before calling the RCU callback */ + smp_wmb(); + call_rcu(&entry->rcu, free_old_closure); end: mutex_unlock(&markers_mutex); return ret; @@ -346,171 +684,166 @@ EXPORT_SYMBOL_GPL(marker_probe_register); /** * marker_probe_unregister - Disconnect a probe from a marker * @name: marker name + * @probe: probe function pointer + * @probe_private: probe private data * * Returns the private data given to marker_probe_register, or an ERR_PTR(). + * We do not need to call a synchronize_sched to make sure the probes have + * finished running before doing a module unload, because the module unload + * itself uses stop_machine(), which insures that every preempt disabled section + * have finished. */ -void *marker_probe_unregister(const char *name) +int marker_probe_unregister(const char *name, + marker_probe_func *probe, void *probe_private) { - struct module *probe_module; struct marker_entry *entry; - void *private; + struct marker_probe_closure *old; + int ret = 0; mutex_lock(&markers_mutex); entry = get_marker(name); if (!entry) { - private = ERR_PTR(-ENOENT); + ret = -ENOENT; goto end; } - entry->refcount = 0; - /* In what module is the probe handler ? */ - probe_module = __module_text_address((unsigned long)entry->probe); - private = remove_marker(name); - deferred_sync = 1; + if (entry->rcu_pending) + rcu_barrier(); + old = marker_entry_remove_probe(entry, probe, probe_private); mutex_unlock(&markers_mutex); - marker_update_probes(probe_module); - return private; + marker_update_probes(); /* may update entry */ + mutex_lock(&markers_mutex); + entry = get_marker(name); + entry->oldptr = old; + entry->rcu_pending = 1; + /* write rcu_pending before calling the RCU callback */ + smp_wmb(); + call_rcu(&entry->rcu, free_old_closure); + remove_marker(name); /* Ignore busy error message */ end: mutex_unlock(&markers_mutex); - return private; + return ret; } EXPORT_SYMBOL_GPL(marker_probe_unregister); -/** - * marker_probe_unregister_private_data - Disconnect a probe from a marker - * @private: probe private data - * - * Unregister a marker by providing the registered private data. - * Returns the private data given to marker_probe_register, or an ERR_PTR(). 
- */ -void *marker_probe_unregister_private_data(void *private) +static struct marker_entry * +get_marker_from_private_data(marker_probe_func *probe, void *probe_private) { - struct module *probe_module; - struct hlist_head *head; - struct hlist_node *node; struct marker_entry *entry; - int found = 0; unsigned int i; + struct hlist_head *head; + struct hlist_node *node; - mutex_lock(&markers_mutex); for (i = 0; i < MARKER_TABLE_SIZE; i++) { head = &marker_table[i]; hlist_for_each_entry(entry, node, head, hlist) { - if (entry->private == private) { - found = 1; - goto iter_end; + if (!entry->ptype) { + if (entry->single.func == probe + && entry->single.probe_private + == probe_private) + return entry; + } else { + struct marker_probe_closure *closure; + closure = entry->multi; + for (i = 0; closure[i].func; i++) { + if (closure[i].func == probe && + closure[i].probe_private + == probe_private) + return entry; + } } } } -iter_end: - if (!found) { - private = ERR_PTR(-ENOENT); - goto end; - } - entry->refcount = 0; - /* In what module is the probe handler ? */ - probe_module = __module_text_address((unsigned long)entry->probe); - private = remove_marker(entry->name); - deferred_sync = 1; - mutex_unlock(&markers_mutex); - marker_update_probes(probe_module); - return private; -end: - mutex_unlock(&markers_mutex); - return private; + return NULL; } -EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data); /** - * marker_arm - Arm a marker - * @name: marker name + * marker_probe_unregister_private_data - Disconnect a probe from a marker + * @probe: probe function + * @probe_private: probe private data * - * Activate a marker. It keeps a reference count of the number of - * arming/disarming done. - * Returns 0 if ok, error value on error. + * Unregister a probe by providing the registered private data. + * Only removes the first marker found in hash table. + * Return 0 on success or error value. + * We do not need to call a synchronize_sched to make sure the probes have + * finished running before doing a module unload, because the module unload + * itself uses stop_machine(), which insures that every preempt disabled section + * have finished. */ -int marker_arm(const char *name) +int marker_probe_unregister_private_data(marker_probe_func *probe, + void *probe_private) { struct marker_entry *entry; int ret = 0; + struct marker_probe_closure *old; mutex_lock(&markers_mutex); - entry = get_marker(name); + entry = get_marker_from_private_data(probe, probe_private); if (!entry) { ret = -ENOENT; goto end; } - /* - * Only need to update probes when refcount passes from 0 to 1. - */ - if (entry->refcount++) - goto end; -end: + if (entry->rcu_pending) + rcu_barrier(); + old = marker_entry_remove_probe(entry, NULL, probe_private); mutex_unlock(&markers_mutex); - marker_update_probes(NULL); - return ret; -} -EXPORT_SYMBOL_GPL(marker_arm); - -/** - * marker_disarm - Disarm a marker - * @name: marker name - * - * Disarm a marker. It keeps a reference count of the number of arming/disarming - * done. - * Returns 0 if ok, error value on error. - */ -int marker_disarm(const char *name) -{ - struct marker_entry *entry; - int ret = 0; - + marker_update_probes(); /* may update entry */ mutex_lock(&markers_mutex); - entry = get_marker(name); - if (!entry) { - ret = -ENOENT; - goto end; - } - /* - * Only permit decrement refcount if higher than 0. - * Do probe update only on 1 -> 0 transition. 
- */ - if (entry->refcount) { - if (--entry->refcount) - goto end; - } else { - ret = -EPERM; - goto end; - } + entry = get_marker_from_private_data(probe, probe_private); + WARN_ON(!entry); + entry->oldptr = old; + entry->rcu_pending = 1; + /* write rcu_pending before calling the RCU callback */ + smp_wmb(); + call_rcu(&entry->rcu, free_old_closure); + remove_marker(entry->name); /* Ignore busy error message */ end: mutex_unlock(&markers_mutex); - marker_update_probes(NULL); return ret; } -EXPORT_SYMBOL_GPL(marker_disarm); +EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data); /** * marker_get_private_data - Get a marker's probe private data * @name: marker name + * @probe: probe to match + * @num: get the nth matching probe's private data * + * Returns the nth private data pointer (starting from 0) matching, or an + * ERR_PTR. * Returns the private data pointer, or an ERR_PTR. * The private data pointer should _only_ be dereferenced if the caller is the * owner of the data, or its content could vanish. This is mostly used to * confirm that a caller is the owner of a registered probe. */ -void *marker_get_private_data(const char *name) +void *marker_get_private_data(const char *name, marker_probe_func *probe, + int num) { struct hlist_head *head; struct hlist_node *node; struct marker_entry *e; size_t name_len = strlen(name) + 1; u32 hash = jhash(name, name_len-1, 0); - int found = 0; + int i; head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; hlist_for_each_entry(e, node, head, hlist) { if (!strcmp(name, e->name)) { - found = 1; - return e->private; + if (!e->ptype) { + if (num == 0 && e->single.func == probe) + return e->single.probe_private; + else + break; + } else { + struct marker_probe_closure *closure; + int match = 0; + closure = e->multi; + for (i = 0; closure[i].func; i++) { + if (closure[i].func != probe) + continue; + if (match++ == num) + return closure[i].probe_private; + } + } } } return ERR_PTR(-ENOENT); diff --git a/kernel/module.c b/kernel/module.c index 4202da97a1d..92595bad381 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2038,7 +2038,7 @@ static struct module *load_module(void __user *umod, #ifdef CONFIG_MARKERS if (!mod->taints) marker_update_probe_range(mod->markers, - mod->markers + mod->num_markers, NULL, NULL); + mod->markers + mod->num_markers); #endif err = module_finalize(hdr, sechdrs, mod); if (err < 0) @@ -2564,7 +2564,7 @@ EXPORT_SYMBOL(struct_module); #endif #ifdef CONFIG_MARKERS -void module_update_markers(struct module *probe_module, int *refcount) +void module_update_markers(void) { struct module *mod; @@ -2572,8 +2572,7 @@ void module_update_markers(struct module *probe_module, int *refcount) list_for_each_entry(mod, &modules, list) if (!mod->taints) marker_update_probe_range(mod->markers, - mod->markers + mod->num_markers, - probe_module, refcount); + mod->markers + mod->num_markers); mutex_unlock(&module_mutex); } #endif -- cgit v1.2.3 From 5a7780e725d1bb4c3094fcc12f1c5c5faea1e988 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Feb 2008 09:20:43 +0100 Subject: hrtimer: check relative timeouts for overflow Various user space callers ask for relative timeouts. While we fixed that overflow issue in hrtimer_start(), the sites which convert relative user space values to absolute timeouts themself were uncovered. Instead of putting overflow checks into each place add a function which does the sanity checking and convert all affected callers to use it. Thanks to Frans Pop, who reported the problem and tested the fixes. 
Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Tested-by: Frans Pop --- kernel/futex.c | 2 +- kernel/futex_compat.c | 2 +- kernel/hrtimer.c | 37 ++++++++++++++++++++----------------- kernel/posix-timers.c | 8 +++++--- 4 files changed, 27 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index a6baaec44b8..221f2128a43 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2116,7 +2116,7 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, t = timespec_to_ktime(ts); if (cmd == FUTEX_WAIT) - t = ktime_add(ktime_get(), t); + t = ktime_add_safe(ktime_get(), t); tp = &t; } /* diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 133d558db45..7d5e4b016f3 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -176,7 +176,7 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, t = timespec_to_ktime(ts); if (cmd == FUTEX_WAIT) - t = ktime_add(ktime_get(), t); + t = ktime_add_safe(ktime_get(), t); tp = &t; } if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE) diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 3f4a57c7895..c2893af9479 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -325,6 +325,23 @@ u64 ktime_divns(const ktime_t kt, s64 div) } #endif /* BITS_PER_LONG >= 64 */ +/* + * Add two ktime values and do a safety check for overflow: + */ +ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs) +{ + ktime_t res = ktime_add(lhs, rhs); + + /* + * We use KTIME_SEC_MAX here, the maximum timeout which we can + * return to user space in a timespec: + */ + if (res.tv64 < 0 || res.tv64 < lhs.tv64 || res.tv64 < rhs.tv64) + res = ktime_set(KTIME_SEC_MAX, 0); + + return res; +} + /* * Check, whether the timer is on the callback pending list */ @@ -682,13 +699,7 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) */ orun++; } - timer->expires = ktime_add(timer->expires, interval); - /* - * Make sure, that the result did not wrap with a very large - * interval. - */ - if (timer->expires.tv64 < 0) - timer->expires = ktime_set(KTIME_SEC_MAX, 0); + timer->expires = ktime_add_safe(timer->expires, interval); return orun; } @@ -839,7 +850,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) new_base = switch_hrtimer_base(timer, base); if (mode == HRTIMER_MODE_REL) { - tim = ktime_add(tim, new_base->get_time()); + tim = ktime_add_safe(tim, new_base->get_time()); /* * CONFIG_TIME_LOW_RES is a temporary way for architectures * to signal that they simply return xtime in @@ -848,16 +859,8 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) * timeouts. This will go away with the GTOD framework. */ #ifdef CONFIG_TIME_LOW_RES - tim = ktime_add(tim, base->resolution); + tim = ktime_add_safe(tim, base->resolution); #endif - /* - * Careful here: User space might have asked for a - * very long sleep, so the add above might result in a - * negative number, which enqueues the timer in front - * of the queue. - */ - if (tim.tv64 < 0) - tim.tv64 = KTIME_MAX; } timer->expires = tim; diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 022c9c3cee6..a9b04203a66 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -767,9 +767,11 @@ common_timer_set(struct k_itimer *timr, int flags, /* SIGEV_NONE timers are not queued ! 
See common_timer_get */ if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { /* Setup correct expiry time for relative timers */ - if (mode == HRTIMER_MODE_REL) - timer->expires = ktime_add(timer->expires, - timer->base->get_time()); + if (mode == HRTIMER_MODE_REL) { + timer->expires = + ktime_add_safe(timer->expires, + timer->base->get_time()); + } return 0; } -- cgit v1.2.3 From 63070a79ba482c274bad10ac8c4b587a3e011f2c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 14 Feb 2008 00:58:36 +0100 Subject: hrtimer: catch expired CLOCK_REALTIME timers early A CLOCK_REALTIME timer, which has an absolute expiry time less than the clock realtime offset calls with a negative delta into the clock events code and triggers the WARN_ON() there. This is a false positive and needs to be prevented. Check the result of timer->expires - timer->base->offset right away and return -ETIME right away. Thanks to Frans Pop, who reported the problem and tested the fixes. Signed-off-by: Thomas Gleixner Tested-by: Frans Pop --- kernel/hrtimer.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index c2893af9479..98bee013f71 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -442,6 +442,8 @@ static int hrtimer_reprogram(struct hrtimer *timer, ktime_t expires = ktime_sub(timer->expires, base->offset); int res; + WARN_ON_ONCE(timer->expires.tv64 < 0); + /* * When the callback is running, we do not reprogram the clock event * device. The timer callback is either running on a different CPU or @@ -452,6 +454,15 @@ static int hrtimer_reprogram(struct hrtimer *timer, if (hrtimer_callback_running(timer)) return 0; + /* + * CLOCK_REALTIME timer might be requested with an absolute + * expiry time which is less than base->offset. Nothing wrong + * about that, just avoid to call into the tick code, which + * has now objections against negative expiry values. + */ + if (expires.tv64 < 0) + return -ETIME; + if (expires.tv64 >= expires_next->tv64) return 0; -- cgit v1.2.3 From db74ece990ea59a9ec9f00f8881026059ef5caf5 Mon Sep 17 00:00:00 2001 From: Jan Blunck Date: Thu, 14 Feb 2008 19:34:29 -0800 Subject: Dont touch fs_struct in usermodehelper This test seems to be unnecessary since we always have rootfs mounted before calling a usermodehelper. Signed-off-by: Andreas Gruenbacher Signed-off-by: Jan Blunck Acked-by: Christoph Hellwig Acked-by: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index bb7df2a28bd..22be3ff3f36 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -173,10 +173,7 @@ static int ____call_usermodehelper(void *data) */ set_user_nice(current, 0); - retval = -EPERM; - if (current->fs->root) - retval = kernel_execve(sub_info->path, - sub_info->argv, sub_info->envp); + retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp); /* Exec failed? */ sub_info->retval = retval; -- cgit v1.2.3 From 4ac9137858e08a19f29feac4e1f4df7c268b0ba5 Mon Sep 17 00:00:00 2001 From: Jan Blunck Date: Thu, 14 Feb 2008 19:34:32 -0800 Subject: Embed a struct path into struct nameidata instead of nd->{dentry,mnt} This is the central patch of a cleanup series. In most cases there is no good reason why someone would want to use a dentry for itself. This series reflects that fact and embeds a struct path into nameidata. 
Together with the other patches of this series - it enforced the correct order of getting/releasing the reference count on pairs - it prepares the VFS for stacking support since it is essential to have a struct path in every place where the stack can be traversed - it reduces the overall code size: without patch series: text data bss dec hex filename 5321639 858418 715768 6895825 6938d1 vmlinux with patch series: text data bss dec hex filename 5320026 858418 715768 6894212 693284 vmlinux This patch: Switch from nd->{dentry,mnt} to nd->path.{dentry,mnt} everywhere. [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: fix cifs] [akpm@linux-foundation.org: fix smack] Signed-off-by: Jan Blunck Signed-off-by: Andreas Gruenbacher Acked-by: Christoph Hellwig Cc: Al Viro Cc: Casey Schaufler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit_tree.c | 16 ++++++++-------- kernel/auditfilter.c | 11 ++++++----- 2 files changed, 14 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index f4fcf58f20f..b898814fe4a 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -549,7 +549,7 @@ void audit_trim_trees(void) if (err) goto skip_it; - root_mnt = collect_mounts(nd.mnt, nd.dentry); + root_mnt = collect_mounts(nd.path.mnt, nd.path.dentry); path_release(&nd); if (!root_mnt) goto skip_it; @@ -583,17 +583,17 @@ skip_it: static int is_under(struct vfsmount *mnt, struct dentry *dentry, struct nameidata *nd) { - if (mnt != nd->mnt) { + if (mnt != nd->path.mnt) { for (;;) { if (mnt->mnt_parent == mnt) return 0; - if (mnt->mnt_parent == nd->mnt) + if (mnt->mnt_parent == nd->path.mnt) break; mnt = mnt->mnt_parent; } dentry = mnt->mnt_mountpoint; } - return is_subdir(dentry, nd->dentry); + return is_subdir(dentry, nd->path.dentry); } int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op) @@ -641,7 +641,7 @@ int audit_add_tree_rule(struct audit_krule *rule) err = path_lookup(tree->pathname, 0, &nd); if (err) goto Err; - mnt = collect_mounts(nd.mnt, nd.dentry); + mnt = collect_mounts(nd.path.mnt, nd.path.dentry); path_release(&nd); if (!mnt) { err = -ENOMEM; @@ -701,7 +701,7 @@ int audit_tag_tree(char *old, char *new) err = path_lookup(new, 0, &nd); if (err) return err; - tagged = collect_mounts(nd.mnt, nd.dentry); + tagged = collect_mounts(nd.path.mnt, nd.path.dentry); path_release(&nd); if (!tagged) return -ENOMEM; @@ -711,8 +711,8 @@ int audit_tag_tree(char *old, char *new) drop_collected_mounts(tagged); return err; } - mnt = mntget(nd.mnt); - dentry = dget(nd.dentry); + mnt = mntget(nd.path.mnt); + dentry = dget(nd.path.dentry); path_release(&nd); if (dentry == tagged->mnt_root && dentry == mnt->mnt_root) diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 6f19fd477aa..a36e66797c3 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -169,8 +169,8 @@ static struct audit_parent *audit_init_parent(struct nameidata *ndp) inotify_init_watch(&parent->wdata); /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */ get_inotify_watch(&parent->wdata); - wd = inotify_add_watch(audit_ih, &parent->wdata, ndp->dentry->d_inode, - AUDIT_IN_WATCH); + wd = inotify_add_watch(audit_ih, &parent->wdata, + ndp->path.dentry->d_inode, AUDIT_IN_WATCH); if (wd < 0) { audit_free_parent(&parent->wdata); return ERR_PTR(wd); @@ -1214,8 +1214,8 @@ static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp, /* update watch filter fields */ if (ndw) { - 
watch->dev = ndw->dentry->d_inode->i_sb->s_dev; - watch->ino = ndw->dentry->d_inode->i_ino; + watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev; + watch->ino = ndw->path.dentry->d_inode->i_ino; } /* The audit_filter_mutex must not be held during inotify calls because @@ -1225,7 +1225,8 @@ static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp, */ mutex_unlock(&audit_filter_mutex); - if (inotify_find_watch(audit_ih, ndp->dentry->d_inode, &i_watch) < 0) { + if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode, + &i_watch) < 0) { parent = audit_init_parent(ndp); if (IS_ERR(parent)) { /* caller expects mutex locked */ -- cgit v1.2.3 From 1d957f9bf87da74f420424d16ece005202bbebd3 Mon Sep 17 00:00:00 2001 From: Jan Blunck Date: Thu, 14 Feb 2008 19:34:35 -0800 Subject: Introduce path_put() * Add path_put() functions for releasing a reference to the dentry and vfsmount of a struct path in the right order * Switch from path_release(nd) to path_put(&nd->path) * Rename dput_path() to path_put_conditional() [akpm@linux-foundation.org: fix cifs] Signed-off-by: Jan Blunck Signed-off-by: Andreas Gruenbacher Acked-by: Christoph Hellwig Cc: Cc: Al Viro Cc: Steven French Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit_tree.c | 12 ++++++------ kernel/auditfilter.c | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index b898814fe4a..9ef5e0aacc3 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -550,7 +550,7 @@ void audit_trim_trees(void) goto skip_it; root_mnt = collect_mounts(nd.path.mnt, nd.path.dentry); - path_release(&nd); + path_put(&nd.path); if (!root_mnt) goto skip_it; @@ -642,7 +642,7 @@ int audit_add_tree_rule(struct audit_krule *rule) if (err) goto Err; mnt = collect_mounts(nd.path.mnt, nd.path.dentry); - path_release(&nd); + path_put(&nd.path); if (!mnt) { err = -ENOMEM; goto Err; @@ -702,7 +702,7 @@ int audit_tag_tree(char *old, char *new) if (err) return err; tagged = collect_mounts(nd.path.mnt, nd.path.dentry); - path_release(&nd); + path_put(&nd.path); if (!tagged) return -ENOMEM; @@ -713,7 +713,7 @@ int audit_tag_tree(char *old, char *new) } mnt = mntget(nd.path.mnt); dentry = dget(nd.path.dentry); - path_release(&nd); + path_put(&nd.path); if (dentry == tagged->mnt_root && dentry == mnt->mnt_root) follow_up(&mnt, &dentry); @@ -744,13 +744,13 @@ int audit_tag_tree(char *old, char *new) spin_lock(&vfsmount_lock); if (!is_under(mnt, dentry, &nd)) { spin_unlock(&vfsmount_lock); - path_release(&nd); + path_put(&nd.path); put_tree(tree); mutex_lock(&audit_filter_mutex); continue; } spin_unlock(&vfsmount_lock); - path_release(&nd); + path_put(&nd.path); list_for_each_entry(p, &list, mnt_list) { failed = tag_chunk(p->mnt_root->d_inode, tree); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index a36e66797c3..2f2914b7cc3 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1161,11 +1161,11 @@ static int audit_get_nd(char *path, struct nameidata **ndp, static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw) { if (ndp) { - path_release(ndp); + path_put(&ndp->path); kfree(ndp); } if (ndw) { - path_release(ndw); + path_put(&ndw->path); kfree(ndw); } } -- cgit v1.2.3 From 6ac08c39a16f72c2d3e845cb6849a1392fa03e80 Mon Sep 17 00:00:00 2001 From: Jan Blunck Date: Thu, 14 Feb 2008 19:34:38 -0800 Subject: Use struct path in fs_struct * Use struct path in fs_struct. 
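A minimal sketch of what bundling the two pointers buys (toy reference counters here, not real dentry/vfsmount objects; struct obj, obj_get() and obj_put() are invented for the example): one path_get()/path_put() pair replaces the matched dget()/mntget() and dput()/mntput() calls, so the pairing and the release order live in exactly one place.

#include <stdio.h>

/* Toy refcounted stand-ins for struct dentry and struct vfsmount. */
struct obj {
        const char *name;
        int count;
};

static void obj_get(struct obj *o)
{
        o->count++;
}

static void obj_put(struct obj *o)
{
        o->count--;
        printf("put %s -> %d\n", o->name, o->count);
}

/* The pair that used to travel through fs_struct as two separate fields. */
struct path {
        struct obj *mnt;
        struct obj *dentry;
};

/* One helper pins both halves... */
static void path_get(struct path *p)
{
        obj_get(p->mnt);
        obj_get(p->dentry);
}

/* ...and one helper drops them in the conventional order (dentry, then mount),
 * so individual call sites cannot get the pairing or the ordering wrong. */
static void path_put(struct path *p)
{
        obj_put(p->dentry);
        obj_put(p->mnt);
}

int main(void)
{
        struct obj mnt = { "mnt", 1 }, dentry = { "dentry", 1 };
        struct path root = { &mnt, &dentry };

        path_get(&root);        /* e.g. __copy_fs_struct() taking a reference */
        path_put(&root);        /* e.g. __put_fs_struct() releasing it */
        path_put(&root);        /* final release of the original reference */
        return 0;
}

This is the same shape as the fork.c and exit.c hunks below, where the separate dget/mntget and dput/mntput calls collapse into path_get()/path_put().
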
Signed-off-by: Andreas Gruenbacher Signed-off-by: Jan Blunck Acked-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/auditsc.c | 4 ++-- kernel/exit.c | 12 ++++-------- kernel/fork.c | 18 +++++++++--------- 3 files changed, 15 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 1c06ecf38d7..741291a1de0 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1697,8 +1697,8 @@ void __audit_getname(const char *name) ++context->name_count; if (!context->pwd) { read_lock(¤t->fs->lock); - context->pwd = dget(current->fs->pwd); - context->pwdmnt = mntget(current->fs->pwdmnt); + context->pwd = dget(current->fs->pwd.dentry); + context->pwdmnt = mntget(current->fs->pwd.mnt); read_unlock(¤t->fs->lock); } diff --git a/kernel/exit.c b/kernel/exit.c index 3b893e78ce6..506a957b665 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -512,14 +512,10 @@ static void __put_fs_struct(struct fs_struct *fs) { /* No need to hold fs->lock if we are killing it */ if (atomic_dec_and_test(&fs->count)) { - dput(fs->root); - mntput(fs->rootmnt); - dput(fs->pwd); - mntput(fs->pwdmnt); - if (fs->altroot) { - dput(fs->altroot); - mntput(fs->altrootmnt); - } + path_put(&fs->root); + path_put(&fs->pwd); + if (fs->altroot.dentry) + path_put(&fs->altroot); kmem_cache_free(fs_cachep, fs); } } diff --git a/kernel/fork.c b/kernel/fork.c index 4363a4eb84e..dd249c37b3a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -600,16 +600,16 @@ static struct fs_struct *__copy_fs_struct(struct fs_struct *old) rwlock_init(&fs->lock); fs->umask = old->umask; read_lock(&old->lock); - fs->rootmnt = mntget(old->rootmnt); - fs->root = dget(old->root); - fs->pwdmnt = mntget(old->pwdmnt); - fs->pwd = dget(old->pwd); - if (old->altroot) { - fs->altrootmnt = mntget(old->altrootmnt); - fs->altroot = dget(old->altroot); + fs->root = old->root; + path_get(&old->root); + fs->pwd = old->pwd; + path_get(&old->pwd); + if (old->altroot.dentry) { + fs->altroot = old->altroot; + path_get(&old->altroot); } else { - fs->altrootmnt = NULL; - fs->altroot = NULL; + fs->altroot.mnt = NULL; + fs->altroot.dentry = NULL; } read_unlock(&old->lock); } -- cgit v1.2.3 From 44707fdf5938ad269ea5d6c5744d82f6a7328746 Mon Sep 17 00:00:00 2001 From: Jan Blunck Date: Thu, 14 Feb 2008 19:38:33 -0800 Subject: d_path: Use struct path in struct avc_audit_data audit_log_d_path() is a d_path() wrapper that is used by the audit code. To use a struct path in audit_log_d_path() I need to embed it into struct avc_audit_data. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Jan Blunck Acked-by: Christoph Hellwig Cc: Al Viro Cc: "J. 
Bruce Fields" Cc: Neil Brown Cc: Stephen Smalley Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit.c | 12 ++++++------ kernel/auditsc.c | 28 +++++++++++----------------- 2 files changed, 17 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index c8555b18021..783e6570124 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1312,26 +1312,26 @@ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) /* This is a helper-function to print the escaped d_path */ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, - struct dentry *dentry, struct vfsmount *vfsmnt) + struct path *path) { - char *p, *path; + char *p, *pathname; if (prefix) audit_log_format(ab, " %s", prefix); /* We will allow 11 spaces for ' (deleted)' to be appended */ - path = kmalloc(PATH_MAX+11, ab->gfp_mask); - if (!path) { + pathname = kmalloc(PATH_MAX+11, ab->gfp_mask); + if (!pathname) { audit_log_format(ab, ""); return; } - p = d_path(dentry, vfsmnt, path, PATH_MAX+11); + p = d_path(path->dentry, path->mnt, pathname, PATH_MAX+11); if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */ /* FIXME: can we save some information here? */ audit_log_format(ab, ""); } else audit_log_untrustedstring(ab, p); - kfree(path); + kfree(pathname); } /** diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 741291a1de0..ac6d9b23b01 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -208,8 +208,7 @@ struct audit_context { int name_count; struct audit_names names[AUDIT_NAMES]; char * filterkey; /* key for rule that triggered record */ - struct dentry * pwd; - struct vfsmount * pwdmnt; + struct path pwd; struct audit_context *previous; /* For nested syscalls */ struct audit_aux_data *aux; struct audit_aux_data *aux_pids; @@ -786,12 +785,9 @@ static inline void audit_free_names(struct audit_context *context) __putname(context->names[i].name); } context->name_count = 0; - if (context->pwd) - dput(context->pwd); - if (context->pwdmnt) - mntput(context->pwdmnt); - context->pwd = NULL; - context->pwdmnt = NULL; + path_put(&context->pwd); + context->pwd.dentry = NULL; + context->pwd.mnt = NULL; } static inline void audit_free_aux(struct audit_context *context) @@ -930,8 +926,7 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file) { audit_log_d_path(ab, "exe=", - vma->vm_file->f_path.dentry, - vma->vm_file->f_path.mnt); + &vma->vm_file->f_path); break; } vma = vma->vm_next; @@ -1341,10 +1336,10 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts context->target_sid, context->target_comm)) call_panic = 1; - if (context->pwd && context->pwdmnt) { + if (context->pwd.dentry && context->pwd.mnt) { ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); if (ab) { - audit_log_d_path(ab, "cwd=", context->pwd, context->pwdmnt); + audit_log_d_path(ab, "cwd=", &context->pwd); audit_log_end(ab); } } @@ -1367,8 +1362,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts case 0: /* name was specified as a relative path and the * directory component is the cwd */ - audit_log_d_path(ab, " name=", context->pwd, - context->pwdmnt); + audit_log_d_path(ab, " name=", &context->pwd); break; default: /* log the name's directory component */ @@ -1695,10 +1689,10 @@ void __audit_getname(const char *name) context->names[context->name_count].ino = (unsigned long)-1; 
context->names[context->name_count].osid = 0; ++context->name_count; - if (!context->pwd) { + if (!context->pwd.dentry) { read_lock(¤t->fs->lock); - context->pwd = dget(current->fs->pwd.dentry); - context->pwdmnt = mntget(current->fs->pwd.mnt); + context->pwd = current->fs->pwd; + path_get(¤t->fs->pwd); read_unlock(¤t->fs->lock); } -- cgit v1.2.3 From cf28b4863f9ee8f122e8ff3ac0d403e07ba9c6d9 Mon Sep 17 00:00:00 2001 From: Jan Blunck Date: Thu, 14 Feb 2008 19:38:44 -0800 Subject: d_path: Make d_path() use a struct path d_path() is used on a pair. Lets use a struct path to reflect this. [akpm@linux-foundation.org: fix build in mm/memory.c] Signed-off-by: Jan Blunck Acked-by: Bryan Wu Acked-by: Christoph Hellwig Cc: Al Viro Cc: "J. Bruce Fields" Cc: Neil Brown Cc: Michael Halcrow Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 783e6570124..2eeea9a1424 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1325,7 +1325,7 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, audit_log_format(ab, ""); return; } - p = d_path(path->dentry, path->mnt, pathname, PATH_MAX+11); + p = d_path(path, pathname, PATH_MAX+11); if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */ /* FIXME: can we save some information here? */ audit_log_format(ab, ""); -- cgit v1.2.3 From db4315d6f53edc2cc0b0b06fce1beffebb119c71 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Tue, 5 Feb 2008 00:48:13 +0100 Subject: timer_list: print relative expiry time signed Relative expiry time can get negative, so it should be signed. Signed-off-by: Pavel Machek Signed-off-by: Thomas Gleixner --- kernel/time/timer_list.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index d3d94c1a0fd..67fe8fc21fb 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -65,9 +65,9 @@ print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now) SEQ_printf(m, ", %s/%d", tmp, timer->start_pid); #endif SEQ_printf(m, "\n"); - SEQ_printf(m, " # expires at %Lu nsecs [in %Lu nsecs]\n", + SEQ_printf(m, " # expires at %Lu nsecs [in %Ld nsecs]\n", (unsigned long long)ktime_to_ns(timer->expires), - (unsigned long long)(ktime_to_ns(timer->expires) - now)); + (long long)(ktime_to_ns(timer->expires) - now)); } static void -- cgit v1.2.3 From b0abcfc14605b2a8c686bd8e193ab05b01a7980b Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Mon, 18 Feb 2008 18:23:16 -0500 Subject: Audit: use == not = in if statements Clearly this was supposed to be an == not an = in the if statement. This patch also causes us to stop processing execve args once we have failed rather than continuing to loop on failure over and over and over. Signed-off-by: Eric Paris Acked-by: Al Viro Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- kernel/auditsc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index ac6d9b23b01..2087d6de67e 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1000,9 +1000,10 @@ static int audit_log_single_execve_arg(struct audit_context *context, * for strings that are too long, we should not have created * any. 
*/ - if (unlikely((len = -1) || len > MAX_ARG_STRLEN - 1)) { + if (unlikely((len == -1) || len > MAX_ARG_STRLEN - 1)) { WARN_ON(1); send_sig(SIGKILL, current, 0); + return -1; } /* walk the whole argument looking for non-ascii chars */ @@ -1020,6 +1021,7 @@ static int audit_log_single_execve_arg(struct audit_context *context, if (ret) { WARN_ON(1); send_sig(SIGKILL, current, 0); + return -1; } buf[to_send] = '\0'; has_cntl = audit_string_contains_control(buf, to_send); @@ -1083,6 +1085,7 @@ static int audit_log_single_execve_arg(struct audit_context *context, if (ret) { WARN_ON(1); send_sig(SIGKILL, current, 0); + return -1; } buf[to_send] = '\0'; -- cgit v1.2.3 From 188fd89d539d899bfca2bc83534e5508e0161139 Mon Sep 17 00:00:00 2001 From: "S.Caglar Onur" Date: Thu, 14 Feb 2008 17:36:51 +0200 Subject: genirq: spurious.c: use time_* macros The functions time_before, time_before_eq, time_after, and time_after_eq are more robust for comparing jiffies against other values. So following patch implements usage of the time_after() macro, defined at linux/jiffies.h, which deals with wrapping correctly Signed-off-by: S.Caglar Onur Acked-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/irq/spurious.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index a6b2bc831dd..088dabbf2d6 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -6,6 +6,7 @@ * This file contains spurious interrupt handling. */ +#include #include #include #include @@ -179,7 +180,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, * otherwise the couter becomes a doomsday timer for otherwise * working systems */ - if (jiffies - desc->last_unhandled > HZ/10) + if (time_after(jiffies, desc->last_unhandled + HZ/10)) desc->irqs_unhandled = 1; else desc->irqs_unhandled++; -- cgit v1.2.3 From 89d694b9dbe769ca1004e01db0ca43964806a611 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 18 Feb 2008 18:25:17 +0100 Subject: genirq: do not leave interupts enabled on free_irq The default_disable() function was changed in commit: 76d2160147f43f982dfe881404cfde9fd0a9da21 genirq: do not mask interrupts by default It removed the mask function in favour of the default delayed interrupt disabling. Unfortunately this also broke the shutdown in free_irq() when the last handler is removed from the interrupt for those architectures which rely on the default implementations. Now we can end up with a enabled interrupt line after the last handler was removed, which can result in spurious interrupts. Fix this by adding a default_shutdown function, which is only installed, when the irqchip implementation does provide neither a shutdown nor a disable function. 
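The heart of the fix is how that fallback is chosen: reuse chip->disable as the shutdown handler only when the chip supplied a real disable, otherwise install a shutdown that masks the line. A compact, compilable sketch of the selection logic follows; the types are heavily simplified (a single global pointer stands in for the irq_desc lookup) and are not the kernel's structures:

#include <stdio.h>

struct irq_chip {
        const char *name;
        void (*disable)(unsigned int irq);
        void (*shutdown)(unsigned int irq);
        void (*mask)(unsigned int irq);
};

/* Default "disable": intentionally a no-op (the delayed-disable model). */
static void default_disable(unsigned int irq)
{
        (void)irq;
}

/* Stand-in for the irq_desc[] lookup the kernel does in its default shutdown. */
static struct irq_chip *toy_desc_chip;

/* Default shutdown: really mask the line so free_irq() leaves it quiet. */
static void default_shutdown(unsigned int irq)
{
        toy_desc_chip->mask(irq);
}

static void irq_chip_set_defaults(struct irq_chip *chip)
{
        if (!chip->disable)
                chip->disable = default_disable;
        /*
         * Reuse chip->disable for shutdown only if it is a real
         * implementation; falling back to the no-op default would
         * leave the interrupt line enabled after free_irq().
         */
        if (!chip->shutdown)
                chip->shutdown = chip->disable != default_disable ?
                                 chip->disable : default_shutdown;
}

static void toy_mask(unsigned int irq)
{
        printf("irq %u masked\n", irq);
}

int main(void)
{
        struct irq_chip chip = { .name = "toy", .mask = toy_mask };

        toy_desc_chip = &chip;
        irq_chip_set_defaults(&chip);
        chip.shutdown(3);       /* masks the line instead of doing nothing */
        return 0;
}

With that choice in place, a chip that only provides mask() still ends up with a shutdown handler that actually quiets the line when the last handler is freed.
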
[@stable: affected versions: .21 - .24 ] Pointed-out-by: Michael Hennerich Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Cc: stable@kernel.org Tested-by: Michael Hennerich --- kernel/irq/chip.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index cc54c627635..fdb3fbe2b0c 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -245,6 +245,17 @@ static unsigned int default_startup(unsigned int irq) return 0; } +/* + * default shutdown function + */ +static void default_shutdown(unsigned int irq) +{ + struct irq_desc *desc = irq_desc + irq; + + desc->chip->mask(irq); + desc->status |= IRQ_MASKED; +} + /* * Fixup enable/disable function pointers */ @@ -256,8 +267,15 @@ void irq_chip_set_defaults(struct irq_chip *chip) chip->disable = default_disable; if (!chip->startup) chip->startup = default_startup; + /* + * We use chip->disable, when the user provided its own. When + * we have default_disable set for chip->disable, then we need + * to use default_shutdown, otherwise the irq line is not + * disabled on free_irq(): + */ if (!chip->shutdown) - chip->shutdown = chip->disable; + chip->shutdown = chip->disable != default_disable ? + chip->disable : default_shutdown; if (!chip->name) chip->name = chip->typename; if (!chip->end) -- cgit v1.2.3 From 8a235efad548abd2ab5ebea45a9ffa750c814375 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 20 Feb 2008 01:47:44 +0100 Subject: Hibernation: Handle DEBUG_PAGEALLOC on x86 Make hibernation work with CONFIG_DEBUG_PAGEALLOC set on x86, by checking if the pages to be copied are marked as present in the kernel mapping and temporarily marking them as present if that's not the case. No functional modifications are introduced if CONFIG_DEBUG_PAGEALLOC is unset. Signed-off-by: Rafael J. Wysocki Signed-off-by: Len Brown --- kernel/power/snapshot.c | 42 +++++++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 95250d7c8d9..72a020cabb4 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -875,8 +875,8 @@ static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; } #endif /* CONFIG_HIGHMEM */ /** - * saveable - Determine whether a non-highmem page should be included in - * the suspend image. + * saveable_page - Determine whether a non-highmem page should be included + * in the suspend image. * * We should save the page if it isn't Nosave, and is not in the range * of pages statically defined as 'unsaveable', and it isn't a part of @@ -897,7 +897,8 @@ static struct page *saveable_page(unsigned long pfn) if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page)) return NULL; - if (PageReserved(page) && pfn_is_nosave(pfn)) + if (PageReserved(page) + && (!kernel_page_present(page) || pfn_is_nosave(pfn))) return NULL; return page; @@ -938,6 +939,25 @@ static inline void do_copy_page(long *dst, long *src) *dst++ = *src++; } + +/** + * safe_copy_page - check if the page we are going to copy is marked as + * present in the kernel page tables (this always is the case if + * CONFIG_DEBUG_PAGEALLOC is not set and in that case + * kernel_page_present() always returns 'true'). 
+ */ +static void safe_copy_page(void *dst, struct page *s_page) +{ + if (kernel_page_present(s_page)) { + do_copy_page(dst, page_address(s_page)); + } else { + kernel_map_pages(s_page, 1, 1); + do_copy_page(dst, page_address(s_page)); + kernel_map_pages(s_page, 1, 0); + } +} + + #ifdef CONFIG_HIGHMEM static inline struct page * page_is_saveable(struct zone *zone, unsigned long pfn) @@ -946,8 +966,7 @@ page_is_saveable(struct zone *zone, unsigned long pfn) saveable_highmem_page(pfn) : saveable_page(pfn); } -static inline void -copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) +static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) { struct page *s_page, *d_page; void *src, *dst; @@ -961,29 +980,26 @@ copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) kunmap_atomic(src, KM_USER0); kunmap_atomic(dst, KM_USER1); } else { - src = page_address(s_page); if (PageHighMem(d_page)) { /* Page pointed to by src may contain some kernel * data modified by kmap_atomic() */ - do_copy_page(buffer, src); + safe_copy_page(buffer, s_page); dst = kmap_atomic(pfn_to_page(dst_pfn), KM_USER0); memcpy(dst, buffer, PAGE_SIZE); kunmap_atomic(dst, KM_USER0); } else { - dst = page_address(d_page); - do_copy_page(dst, src); + safe_copy_page(page_address(d_page), s_page); } } } #else #define page_is_saveable(zone, pfn) saveable_page(pfn) -static inline void -copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) +static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) { - do_copy_page(page_address(pfn_to_page(dst_pfn)), - page_address(pfn_to_page(src_pfn))); + safe_copy_page(page_address(pfn_to_page(dst_pfn)), + pfn_to_page(src_pfn)); } #endif /* CONFIG_HIGHMEM */ -- cgit v1.2.3 From 120fc3d77acfd91f3521737a440d42839c475982 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Thu, 21 Feb 2008 00:33:20 +0100 Subject: modules: do not try to add sysfs attributes if !CONFIG_SYSFS Thanks to Alexey for the testing and the fix of the fix. Cc: Alexey Dobriyan Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/module.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 92595bad381..901cd6ac2f1 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -987,12 +987,11 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs, return ret; } - /* * /sys/module/foo/sections stuff * J. Corbet */ -#ifdef CONFIG_KALLSYMS +#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS) static ssize_t module_sect_show(struct module_attribute *mattr, struct module *mod, char *buf) { @@ -1188,7 +1187,7 @@ static inline void add_notes_attrs(struct module *mod, unsigned int nsect, static inline void remove_notes_attrs(struct module *mod) { } -#endif /* CONFIG_KALLSYMS */ +#endif #ifdef CONFIG_SYSFS int module_add_modinfo_attrs(struct module *mod) @@ -1231,9 +1230,7 @@ void module_remove_modinfo_attrs(struct module *mod) } kfree(mod->modinfo_attrs); } -#endif -#ifdef CONFIG_SYSFS int mod_sysfs_init(struct module *mod) { int err; -- cgit v1.2.3 From 3a2d5b700132f35401f1d9e22fe3c2cab02c2549 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 23 Feb 2008 19:13:25 +0100 Subject: PM: Introduce PM_EVENT_HIBERNATE callback state During the last step of hibernation in the "platform" mode (with the help of ACPI) we use the suspend code, including the devices' ->suspend() methods, to prepare the system for entering the ACPI S4 system sleep state. 
But at least for some devices the operations performed by the ->suspend() callback in that case must be different from its operations during regular suspend. For this reason, introduce the new PM event type PM_EVENT_HIBERNATE and pass it to the device drivers' ->suspend() methods during the last phase of hibernation, so that they can distinguish this case and handle it as appropriate. Modify the drivers that handle PM_EVENT_SUSPEND in a special way and need to handle PM_EVENT_HIBERNATE in the same way. These changes are necessary to fix a hibernation regression related to the i915 driver (ref. http://lkml.org/lkml/2008/2/22/488). Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Tested-by: Jeff Chua Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 859a8e59773..14a656cdc65 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -391,7 +391,7 @@ int hibernation_platform_enter(void) goto Close; suspend_console(); - error = device_suspend(PMSG_SUSPEND); + error = device_suspend(PMSG_HIBERNATE); if (error) goto Resume_console; @@ -404,7 +404,7 @@ int hibernation_platform_enter(void) goto Finish; local_irq_disable(); - error = device_power_down(PMSG_SUSPEND); + error = device_power_down(PMSG_HIBERNATE); if (!error) { hibernation_ops->enter(); /* We should never get here */ -- cgit v1.2.3 From de4fc64f0f2a4efbaad3e7c1e1e05a28f69b45e5 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Sat, 23 Feb 2008 15:23:33 -0800 Subject: markers: fix sparse warnings in markers.c char can be unsigned kernel/marker.c:64:20: error: dubious one-bit signed bitfield kernel/marker.c:65:14: error: dubious one-bit signed bitfield Signed-off-by: Harvey Harrison Acked-by: Mathieu Desnoyers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/marker.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/marker.c b/kernel/marker.c index c4c2cd8b61f..50effc01d9a 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -61,8 +61,8 @@ struct marker_entry { int refcount; /* Number of times armed. 0 if disarmed. */ struct rcu_head rcu; void *oldptr; - char rcu_pending:1; - char ptype:1; + unsigned char rcu_pending:1; + unsigned char ptype:1; char name[0]; /* Contains name'\0'format'\0' */ }; -- cgit v1.2.3 From 3e4ab747efa8e78562ec6782b08bbf21a00aba1b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 23 Feb 2008 15:23:55 -0800 Subject: futex: fix init order When the futex init code fails to initialize the futex pseudo file system it returns early without initializing the hash queues. Should the boot succeed, a futex syscall which tries to enqueue a waiter on the hash queue will crash due to the uninitialized plist heads. Initialize the hash queues before the filesystem.
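For reference, a minimal sketch of the ordering rule the message states: per-bucket state must be set up before any init step that can fail and return early, because the futex syscall path never re-checks it. The variable and helper names mirror the futex code in the diff below; the stripped-down function itself is an illustration, not the patch.

/*
 * Sketch only (not the actual patch): initialize the hash buckets
 * unconditionally, then attempt the step that may fail and return
 * early. A failure here no longer leaves futex_queues unusable.
 */
static int __init futex_init_sketch(void)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
		plist_head_init(&futex_queues[i].chain,
				&futex_queues[i].lock);
		spin_lock_init(&futex_queues[i].lock);
	}

	return register_filesystem(&futex_fs_type);
}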
Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Cc: Lennert Buytenhek Cc: Riku Voipio Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 221f2128a43..c21f667c63f 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2145,8 +2145,14 @@ static struct file_system_type futex_fs_type = { static int __init init(void) { - int i = register_filesystem(&futex_fs_type); + int i; + for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { + plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock); + spin_lock_init(&futex_queues[i].lock); + } + + i = register_filesystem(&futex_fs_type); if (i) return i; @@ -2156,10 +2162,6 @@ static int __init init(void) return PTR_ERR(futex_mnt); } - for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { - plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock); - spin_lock_init(&futex_queues[i].lock); - } return 0; } __initcall(init); -- cgit v1.2.3 From a0c1e9073ef7428a14309cba010633a6cd6719ea Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 23 Feb 2008 15:23:57 -0800 Subject: futex: runtime enable pi and robust functionality Not all architectures implement futex_atomic_cmpxchg_inatomic(). The default implementation returns -ENOSYS, which is currently not handled inside of the futex guts. Futex PI calls and robust list exits with a held futex result in an endless loop in the futex code on architectures which have no support. Fixing up every place where futex_atomic_cmpxchg_inatomic() is called would add a fair amount of extra if/else constructs to the already complex code. It is also not possible to disable the robust feature before user space tries to register robust lists. Compile time disabling is not a good idea either, as there are already architectures with runtime detection of futex_atomic_cmpxchg_inatomic support. Detect the functionality at runtime instead by calling cmpxchg_futex_value_locked() with a NULL pointer from the futex initialization code. This is guaranteed to fail, but the call of futex_atomic_cmpxchg_inatomic() happens with pagefaults disabled. On architectures, which use the asm-generic implementation or have a runtime CPU feature detection, a -ENOSYS return value disables the PI/robust features. On architectures with a working implementation the call returns -EFAULT and the PI/robust features are enabled. The relevant syscalls return -ENOSYS and the robust list exit code is blocked, when the detection fails. Fixes http://lkml.org/lkml/2008/2/11/149 Originally reported by: Lennart Buytenhek Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Cc: Lennert Buytenhek Cc: Riku Voipio Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 38 ++++++++++++++++++++++++++++++++++---- kernel/futex_compat.c | 9 +++++++++ 2 files changed, 43 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index c21f667c63f..06968cd7920 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -60,6 +60,8 @@ #include "rtmutex_common.h" +int __read_mostly futex_cmpxchg_enabled; + #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 
4 : 8) /* @@ -469,6 +471,8 @@ void exit_pi_state_list(struct task_struct *curr) struct futex_hash_bucket *hb; union futex_key key; + if (!futex_cmpxchg_enabled) + return; /* * We are a ZOMBIE and nobody can enqueue itself on * pi_state_list anymore, but we have to be careful @@ -1870,6 +1874,8 @@ asmlinkage long sys_set_robust_list(struct robust_list_head __user *head, size_t len) { + if (!futex_cmpxchg_enabled) + return -ENOSYS; /* * The kernel knows only one size for now: */ @@ -1894,6 +1900,9 @@ sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr, struct robust_list_head __user *head; unsigned long ret; + if (!futex_cmpxchg_enabled) + return -ENOSYS; + if (!pid) head = current->robust_list; else { @@ -1997,6 +2006,9 @@ void exit_robust_list(struct task_struct *curr) unsigned long futex_offset; int rc; + if (!futex_cmpxchg_enabled) + return; + /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): @@ -2051,7 +2063,7 @@ void exit_robust_list(struct task_struct *curr) long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, u32 __user *uaddr2, u32 val2, u32 val3) { - int ret; + int ret = -ENOSYS; int cmd = op & FUTEX_CMD_MASK; struct rw_semaphore *fshared = NULL; @@ -2083,13 +2095,16 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); break; case FUTEX_LOCK_PI: - ret = futex_lock_pi(uaddr, fshared, val, timeout, 0); + if (futex_cmpxchg_enabled) + ret = futex_lock_pi(uaddr, fshared, val, timeout, 0); break; case FUTEX_UNLOCK_PI: - ret = futex_unlock_pi(uaddr, fshared); + if (futex_cmpxchg_enabled) + ret = futex_unlock_pi(uaddr, fshared); break; case FUTEX_TRYLOCK_PI: - ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); + if (futex_cmpxchg_enabled) + ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); break; default: ret = -ENOSYS; @@ -2145,8 +2160,23 @@ static struct file_system_type futex_fs_type = { static int __init init(void) { + u32 curval; int i; + /* + * This will fail and we want it. Some arch implementations do + * runtime detection of the futex_atomic_cmpxchg_inatomic() + * functionality. We want to know that before we call in any + * of the complex code paths. Also we want to prevent + * registration of robust lists in that case. NULL is + * guaranteed to fault and we get -EFAULT on functional + * implementation, the non functional ones will return + * -ENOSYS. 
+ */ + curval = cmpxchg_futex_value_locked(NULL, 0, 0); + if (curval == -EFAULT) + futex_cmpxchg_enabled = 1; + for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock); spin_lock_init(&futex_queues[i].lock); diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 7d5e4b016f3..ff90f049f8f 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -54,6 +54,9 @@ void compat_exit_robust_list(struct task_struct *curr) compat_long_t futex_offset; int rc; + if (!futex_cmpxchg_enabled) + return; + /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): @@ -115,6 +118,9 @@ asmlinkage long compat_sys_set_robust_list(struct compat_robust_list_head __user *head, compat_size_t len) { + if (!futex_cmpxchg_enabled) + return -ENOSYS; + if (unlikely(len != sizeof(*head))) return -EINVAL; @@ -130,6 +136,9 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, struct compat_robust_list_head __user *head; unsigned long ret; + if (!futex_cmpxchg_enabled) + return -ENOSYS; + if (!pid) head = current->compat_robust_list; else { -- cgit v1.2.3 From 43627582799db317e966ecb0002c2c3c9805ec0f Mon Sep 17 00:00:00 2001 From: Srinivasa Ds Date: Sat, 23 Feb 2008 15:24:04 -0800 Subject: kprobes: refuse kprobe insertion on add/sub_preempt_counter() Kprobes makes use of preempt_disable(), preempt_enable_noresched() and these functions in turn call add/sub_preempt_count(). So we need to prevent the user from inserting probes into these functions. This patch disallows users from probing add/sub_preempt_count(). Signed-off-by: Srinivasa DS Acked-by: Ananth N Mavinakayanahalli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f28f19e65b5..c4bc8c21095 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3766,7 +3766,7 @@ void scheduler_tick(void) #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) -void add_preempt_count(int val) +void __kprobes add_preempt_count(int val) { /* * Underflow? @@ -3782,7 +3782,7 @@ void add_preempt_count(int val) } EXPORT_SYMBOL(add_preempt_count); -void sub_preempt_count(int val) +void __kprobes sub_preempt_count(int val) { /* * Underflow? -- cgit v1.2.3 From a043e3b2c63445512c5592cbe3c8694f3c655e81 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sat, 23 Feb 2008 15:24:09 -0800 Subject: cgroup: fix comments fix: - comments about need_forkexit_callback - comments about release agent - typo and comment style, etc. Signed-off-by: Li Zefan Acked-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 142 +++++++++++++++++++++++++++++++------------------------- 1 file changed, 79 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4766bb65e4d..36066d8a491 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -113,9 +113,9 @@ static int root_count; #define dummytop (&rootnode.top_cgroup) /* This flag indicates whether tasks in the fork and exit paths should - * take callback_mutex and check for fork/exit handlers to call. This - * avoids us having to do extra work in the fork/exit path if none of the - * subsystems need to be called. + * check for fork/exit handlers to call. This avoids us having to do + * extra work in the fork/exit path if none of the subsystems need to + * be called.
*/ static int need_forkexit_callback; @@ -307,7 +307,6 @@ static inline void put_css_set_taskexit(struct css_set *cg) * template: location in which to build the desired set of subsystem * state objects for the new cgroup group */ - static struct css_set *find_existing_css_set( struct css_set *oldcg, struct cgroup *cgrp, @@ -354,7 +353,6 @@ static struct css_set *find_existing_css_set( * and chains them on tmp through their cgrp_link_list fields. Returns 0 on * success or a negative error */ - static int allocate_cg_links(int count, struct list_head *tmp) { struct cg_cgroup_link *link; @@ -396,7 +394,6 @@ static void free_cg_links(struct list_head *tmp) * substituted into the appropriate hierarchy. Must be called with * cgroup_mutex held */ - static struct css_set *find_css_set( struct css_set *oldcg, struct cgroup *cgrp) { @@ -507,8 +504,8 @@ static struct css_set *find_css_set( * critical pieces of code here. The exception occurs on cgroup_exit(), * when a task in a notify_on_release cgroup exits. Then cgroup_mutex * is taken, and if the cgroup count is zero, a usermode call made - * to /sbin/cgroup_release_agent with the name of the cgroup (path - * relative to the root of cgroup file system) as the argument. + * to the release agent with the name of the cgroup (path relative to + * the root of cgroup file system) as the argument. * * A cgroup can only be deleted if both its 'count' of using tasks * is zero, and its list of 'children' cgroups is empty. Since all @@ -521,7 +518,7 @@ static struct css_set *find_css_set( * * The need for this exception arises from the action of * cgroup_attach_task(), which overwrites one tasks cgroup pointer with - * another. It does so using cgroup_mutexe, however there are + * another. It does so using cgroup_mutex, however there are * several performance critical places that need to reference * task->cgroup without the expense of grabbing a system global * mutex. Therefore except as noted below, when dereferencing or, as @@ -537,7 +534,6 @@ static struct css_set *find_css_set( * cgroup_lock - lock out any changes to cgroup structures * */ - void cgroup_lock(void) { mutex_lock(&cgroup_mutex); @@ -548,7 +544,6 @@ void cgroup_lock(void) * * Undo the lock taken in a previous cgroup_lock() call. */ - void cgroup_unlock(void) { mutex_unlock(&cgroup_mutex); @@ -590,7 +585,6 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) * Call subsys's pre_destroy handler. * This is called before css refcnt check. */ - static void cgroup_call_pre_destroy(struct cgroup *cgrp) { struct cgroup_subsys *ss; @@ -600,7 +594,6 @@ static void cgroup_call_pre_destroy(struct cgroup *cgrp) return; } - static void cgroup_diput(struct dentry *dentry, struct inode *inode) { /* is dentry a directory ? if so, kfree() associated cgroup */ @@ -1129,8 +1122,13 @@ static inline struct cftype *__d_cft(struct dentry *dentry) return dentry->d_fsdata; } -/* - * Called with cgroup_mutex held. Writes path of cgroup into buf. +/** + * cgroup_path - generate the path of a cgroup + * @cgrp: the cgroup in question + * @buf: the buffer to write the path into + * @buflen: the length of the buffer + * + * Called with cgroup_mutex held. Writes path of cgroup into buf. * Returns 0 on success, -errno on error. 
*/ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) @@ -1188,11 +1186,13 @@ static void get_first_subsys(const struct cgroup *cgrp, *subsys_id = test_ss->subsys_id; } -/* - * Attach task 'tsk' to cgroup 'cgrp' +/** + * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' + * @cgrp: the cgroup the task is attaching to + * @tsk: the task to be attached * - * Call holding cgroup_mutex. May take task_lock of - * the task 'pid' during call. + * Call holding cgroup_mutex. May take task_lock of + * the task 'tsk' during call. */ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { @@ -1293,7 +1293,6 @@ static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf) } /* The various types of files and directories in a cgroup file system */ - enum cgroup_filetype { FILE_ROOT, FILE_DIR, @@ -1584,12 +1583,11 @@ static int cgroup_create_file(struct dentry *dentry, int mode, } /* - * cgroup_create_dir - create a directory for an object. - * cgrp: the cgroup we create the directory for. - * It must have a valid ->parent field - * And we are going to fill its ->dentry field. - * dentry: dentry of the new cgroup - * mode: mode to set on new directory. + * cgroup_create_dir - create a directory for an object. + * @cgrp: the cgroup we create the directory for. It must have a valid + * ->parent field. And we are going to fill its ->dentry field. + * @dentry: dentry of the new cgroup + * @mode: mode to set on new directory. */ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, int mode) @@ -1651,8 +1649,12 @@ int cgroup_add_files(struct cgroup *cgrp, return 0; } -/* Count the number of tasks in a cgroup. */ - +/** + * cgroup_task_count - count the number of tasks in a cgroup. + * @cgrp: the cgroup in question + * + * Return the number of tasks in the cgroup. + */ int cgroup_task_count(const struct cgroup *cgrp) { int count = 0; @@ -1962,12 +1964,13 @@ static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) } /** - * Build and fill cgroupstats so that taskstats can export it to user - * space. - * + * cgroupstats_build - build and fill cgroupstats * @stats: cgroupstats to fill information into * @dentry: A dentry entry belonging to the cgroup for which stats have * been requested. + * + * Build and fill cgroupstats so that taskstats can export it to user + * space. */ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { @@ -2199,14 +2202,13 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, } /* - * cgroup_create - create a cgroup - * parent: cgroup that will be parent of the new cgroup. - * name: name of the new cgroup. Will be strcpy'ed. - * mode: mode to set on new inode + * cgroup_create - create a cgroup + * @parent: cgroup that will be parent of the new cgroup + * @dentry: dentry of the new cgroup + * @mode: mode to set on new inode * - * Must be called with the mutex on the parent inode held + * Must be called with the mutex on the parent inode held */ - static long cgroup_create(struct cgroup *parent, struct dentry *dentry, int mode) { @@ -2349,13 +2351,12 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) parent = cgrp->parent; root = cgrp->root; sb = root->sb; + /* - * Call pre_destroy handlers of subsys + * Call pre_destroy handlers of subsys. Notify subsystems + * that rmdir() request comes. */ cgroup_call_pre_destroy(cgrp); - /* - * Notify subsyses that rmdir() request comes. 
- */ if (cgroup_has_css_refs(cgrp)) { mutex_unlock(&cgroup_mutex); @@ -2431,8 +2432,10 @@ static void cgroup_init_subsys(struct cgroup_subsys *ss) } /** - * cgroup_init_early - initialize cgroups at system boot, and - * initialize any subsystems that request early init. + * cgroup_init_early - cgroup initialization at system boot + * + * Initialize cgroups at system boot, and initialize any + * subsystems that request early init. */ int __init cgroup_init_early(void) { @@ -2474,8 +2477,10 @@ int __init cgroup_init_early(void) } /** - * cgroup_init - register cgroup filesystem and /proc file, and - * initialize any subsystems that didn't request early init. + * cgroup_init - cgroup initialization + * + * Register cgroup filesystem and /proc file, and initialize + * any subsystems that didn't request early init. */ int __init cgroup_init(void) { @@ -2618,7 +2623,7 @@ static struct file_operations proc_cgroupstats_operations = { /** * cgroup_fork - attach newly forked task to its parents cgroup. - * @tsk: pointer to task_struct of forking parent process. + * @child: pointer to task_struct of forking parent process. * * Description: A task inherits its parent's cgroup at fork(). * @@ -2642,9 +2647,12 @@ void cgroup_fork(struct task_struct *child) } /** - * cgroup_fork_callbacks - called on a new task very soon before - * adding it to the tasklist. No need to take any locks since no-one - * can be operating on this task + * cgroup_fork_callbacks - run fork callbacks + * @child: the new task + * + * Called on a new task very soon before adding it to the + * tasklist. No need to take any locks since no-one can + * be operating on this task. */ void cgroup_fork_callbacks(struct task_struct *child) { @@ -2659,11 +2667,14 @@ void cgroup_fork_callbacks(struct task_struct *child) } /** - * cgroup_post_fork - called on a new task after adding it to the - * task list. Adds the task to the list running through its css_set - * if necessary. Has to be after the task is visible on the task list - * in case we race with the first call to cgroup_iter_start() - to - * guarantee that the new task ends up on its list. */ + * cgroup_post_fork - called on a new task after adding it to the task list + * @child: the task in question + * + * Adds the task to the list running through its css_set if necessary. + * Has to be after the task is visible on the task list in case we race + * with the first call to cgroup_iter_start() - to guarantee that the + * new task ends up on its list. + */ void cgroup_post_fork(struct task_struct *child) { if (use_task_css_set_links) { @@ -2676,6 +2687,7 @@ void cgroup_post_fork(struct task_struct *child) /** * cgroup_exit - detach cgroup from exiting task * @tsk: pointer to task_struct of exiting process + * @run_callback: run exit callbacks? * * Description: Detach cgroup from @tsk and release it. * @@ -2706,7 +2718,6 @@ void cgroup_post_fork(struct task_struct *child) * top_cgroup isn't going away, and either task has PF_EXITING set, * which wards off any cgroup_attach_task() attempts, or task is a failed * fork, never visible to cgroup_attach_task. 
- * */ void cgroup_exit(struct task_struct *tsk, int run_callbacks) { @@ -2743,9 +2754,13 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) } /** - * cgroup_clone - duplicate the current cgroup in the hierarchy - * that the given subsystem is attached to, and move this task into - * the new child + * cgroup_clone - clone the cgroup the given subsystem is attached to + * @tsk: the task to be moved + * @subsys: the given subsystem + * + * Duplicate the current cgroup in the hierarchy that the given + * subsystem is attached to, and move this task into the new + * child. */ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys) { @@ -2858,9 +2873,12 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys) return ret; } -/* - * See if "cgrp" is a descendant of the current task's cgroup in - * the appropriate hierarchy +/** + * cgroup_is_descendant - see if @cgrp is a descendant of current task's cgrp + * @cgrp: the cgroup in question + * + * See if @cgrp is a descendant of the current task's cgroup in + * the appropriate hierarchy. * * If we are sending in dummytop, then presumably we are creating * the top cgroup in the subsystem. @@ -2939,9 +2957,7 @@ void __css_put(struct cgroup_subsys_state *css) * release agent task. We don't bother to wait because the caller of * this routine has no use for the exit status of the release agent * task, so no sense holding our caller up for that. - * */ - static void cgroup_release_agent(struct work_struct *work) { BUG_ON(work != &release_agent_work); -- cgit v1.2.3 From f777073848ba3708d68d87e43f104f83316187d7 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sat, 23 Feb 2008 15:24:10 -0800 Subject: cgroup: fix memory leak in cgroup_get_sb() opts.release_agent is not kfree()ed in all necessary places. Signed-off-by: Li Zefan Acked-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 36066d8a491..947fe3b2218 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -954,8 +954,11 @@ static int cgroup_get_sb(struct file_system_type *fs_type, } root = kzalloc(sizeof(*root), GFP_KERNEL); - if (!root) + if (!root) { + if (opts.release_agent) + kfree(opts.release_agent); return -ENOMEM; + } init_cgroup_root(root); root->subsys_bits = opts.subsys_bits; -- cgit v1.2.3 From 8d53d55d27754508e58e9ac18a4a445b110434bf Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sat, 23 Feb 2008 15:24:11 -0800 Subject: cgroup: fix subsys bitops Cgroup uses unsigned long for subsys bitops, not unsigned long long. Signed-off-by: Li Zefan Acked-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 947fe3b2218..84125936172 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -319,7 +319,7 @@ static struct css_set *find_existing_css_set( /* Built the set of subsystem state objects that we want to * see in the new css_set */ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - if (root->subsys_bits & (1ull << i)) { + if (root->subsys_bits & (1UL << i)) { /* Subsystem is in this hierarchy. 
So we want * the subsystem state from the new * cgroup */ @@ -689,7 +689,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, added_bits = final_bits & ~root->actual_subsys_bits; /* Check that any added subsystems are currently free */ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - unsigned long long bit = 1ull << i; + unsigned long bit = 1UL << i; struct cgroup_subsys *ss = subsys[i]; if (!(bit & added_bits)) continue; -- cgit v1.2.3 From 68db38f1537a44097e264f28bda751d6b919cd53 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sat, 23 Feb 2008 15:24:11 -0800 Subject: cgroup: remove duplicate code in find_css_set() The list head res->tasks gets initialized twice in find_css_set(). Signed-off-by: Li Zefan Acked-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 84125936172..2aa408201aa 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -470,7 +470,6 @@ static struct css_set *find_css_set( /* Link this cgroup group into the list */ list_add(&res->list, &init_css_set.list); css_set_count++; - INIT_LIST_HEAD(&res->tasks); write_unlock(&css_set_lock); return res; -- cgit v1.2.3 From bc231d2a048010d5e0b49ac7fddbfa822fc41109 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sat, 23 Feb 2008 15:24:12 -0800 Subject: cgroup: remove dead code in cgroup_get_rootdir() Signed-off-by: Li Zefan Acked-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2aa408201aa..d8abe996e00 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -919,7 +919,6 @@ static int cgroup_get_rootdir(struct super_block *sb) if (!inode) return -ENOMEM; - inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; inode->i_op = &cgroup_dir_inode_operations; /* directories start off with i_nlink == 2 (for "." entry) */ -- cgit v1.2.3 From 04e2f1741d235ba599037734878d72e57cb302b5 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 23 Feb 2008 18:05:03 -0800 Subject: Add memory barrier semantics to wake_up() & co Oleg Nesterov and others have pointed out that on some architectures, the traditional sequence of set_current_state(TASK_INTERRUPTIBLE); if (CONDITION) return; schedule(); is racy wrt another CPU doing CONDITION = 1; wake_up_process(p); because while set_current_state() has a memory barrier separating setting of the TASK_INTERRUPTIBLE state from reading of the CONDITION variable, there is no such memory barrier on the wakeup side. Now, wake_up_process() does actually take a spinlock before it reads and sets the task state on the waking side, and on x86 (and many other architectures) that spinlock is in fact equivalent to a memory barrier, but that is not generally guaranteed. The write that sets CONDITION could move into the critical region protected by the runqueue spinlock. However, adding a smp_wmb() to before the spinlock should now order the writing of CONDITION wrt the lock itself, which in turn is ordered wrt the accesses within the spinlock (which includes the reading of the old state). This should thus close the race (which probably has never been seen in practice, but since smp_wmb() is a no-op on x86, it's not like this will make anything worse either on the most common architecture where the spinlock already gave the required protection). 
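To make the race concrete, here is a hedged sketch of the pattern described above. CONDITION, waiter() and waker() are illustrative names, not code from this patch, while set_current_state(), schedule() and wake_up_process() are the real kernel APIs involved. The sleeper's ordering comes from the barrier implied by set_current_state(); with this change the matching smp_wmb() sits at the top of try_to_wake_up(), so ordinary wake_up_process() callers need nothing extra.

#include <linux/sched.h>

static int CONDITION;			/* illustrative flag */
static struct task_struct *waiter_task;

static void waiter(void)
{
	set_current_state(TASK_INTERRUPTIBLE);	/* barrier: state store vs. CONDITION load */
	if (CONDITION) {
		__set_current_state(TASK_RUNNING);
		return;
	}
	schedule();
}

static void waker(void)
{
	CONDITION = 1;
	/*
	 * The store above must be ordered before try_to_wake_up() reads the
	 * waiter's state; the smp_wmb() this patch adds at the top of
	 * try_to_wake_up() provides that ordering, so no explicit barrier
	 * is needed here.
	 */
	wake_up_process(waiter_task);
}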
Acked-by: Oleg Nesterov Acked-by: Dmitry Adamushko Cc: Andrew Morton Cc: Nick Piggin Signed-off-by: Linus Torvalds --- kernel/sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c4bc8c21095..b387a8de26a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1831,6 +1831,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) long old_state; struct rq *rq; + smp_wmb(); rq = task_rq_lock(p, &flags); old_state = p->state; if (!(old_state & state)) -- cgit v1.2.3