From c017b4be3e84176cab10eca5e6c4faeb8cfc6f3e Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 28 Oct 2009 13:33:09 +0000 Subject: kmemleak: Simplify the kmemleak_scan_area() function prototype This function was taking non-necessary arguments which can be determined by kmemleak. The patch also modifies the calling sites. Signed-off-by: Catalin Marinas Cc: Pekka Enberg Cc: Christoph Lameter Cc: Rusty Russell --- kernel/module.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 8b7d8805819..1eb95209707 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2043,9 +2043,7 @@ static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr, unsigned int i; /* only scan the sections containing data */ - kmemleak_scan_area(mod->module_core, (unsigned long)mod - - (unsigned long)mod->module_core, - sizeof(struct module), GFP_KERNEL); + kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL); for (i = 1; i < hdr->e_shnum; i++) { if (!(sechdrs[i].sh_flags & SHF_ALLOC)) @@ -2054,8 +2052,7 @@ static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr, && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0) continue; - kmemleak_scan_area(mod->module_core, sechdrs[i].sh_addr - - (unsigned long)mod->module_core, + kmemleak_scan_area((void *)sechdrs[i].sh_addr, sechdrs[i].sh_size, GFP_KERNEL); } } -- cgit v1.2.3 From a6f5aa1ea05686ad6e84593a00a04161e6dfb3a3 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 28 Oct 2009 13:33:10 +0000 Subject: kmemleak: Scan the _ftrace_events section in modules This section contains pointers to allocated objects and not scanning it leads to false positives. 
Reported-by: Zdenek Kabelac Acked-by: Rusty Russell Signed-off-by: Catalin Marinas --- kernel/module.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 1eb95209707..dd29ba43c34 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2380,6 +2380,12 @@ static noinline struct module *load_module(void __user *umod, "_ftrace_events", sizeof(*mod->trace_events), &mod->num_trace_events); + /* + * This section contains pointers to allocated objects in the trace + * code and not scanning it leads to false positives. + */ + kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * + mod->num_trace_events, GFP_KERNEL); #endif #ifdef CONFIG_FTRACE_MCOUNT_RECORD /* sechdrs[0].sh_size is always zero */ -- cgit v1.2.3 From 14d8c9f3c09e7fd7b9af80904289fe204f5b93c6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2009 00:53:17 +0000 Subject: signal: Fix racy access to __task_cred in kill_pid_info_as_uid() kill_pid_info_as_uid() accesses __task_cred() without being in a RCU read side critical section. tasklist_lock is not protecting that when CONFIG_TREE_PREEMPT_RCU=y. Convert the whole tasklist_lock section to rcu and use lock_task_sighand to prevent the exit race. 
Signed-off-by: Thomas Gleixner LKML-Reference: <20091210004703.232302055@linutronix.de> Acked-by: Oleg Nesterov --- kernel/signal.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 6b982f2cf52..73316568a69 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1175,11 +1175,12 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid, int ret = -EINVAL; struct task_struct *p; const struct cred *pcred; + unsigned long flags; if (!valid_signal(sig)) return ret; - read_lock(&tasklist_lock); + rcu_read_lock(); p = pid_task(pid, PIDTYPE_PID); if (!p) { ret = -ESRCH; @@ -1196,14 +1197,16 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid, ret = security_task_kill(p, info, sig, secid); if (ret) goto out_unlock; - if (sig && p->sighand) { - unsigned long flags; - spin_lock_irqsave(&p->sighand->siglock, flags); - ret = __send_signal(sig, info, p, 1, 0); - spin_unlock_irqrestore(&p->sighand->siglock, flags); + + if (sig) { + if (lock_task_sighand(p, &flags)) { + ret = __send_signal(sig, info, p, 1, 0); + unlock_task_sighand(p, &flags); + } else + ret = -ESRCH; } out_unlock: - read_unlock(&tasklist_lock); + rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(kill_pid_info_as_uid); -- cgit v1.2.3 From 7cf7db8df0b78076eafa4ead47559344ca7b7a43 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2009 00:53:21 +0000 Subject: signals: Fix more rcu assumptions 1) Remove the misleading comment in __sigqueue_alloc() which claims that holding a spinlock is equivalent to rcu_read_lock(). 2) Add a rcu_read_lock/unlock around the __task_cred() access in __sigqueue_alloc() This needs to be revisited to remove the remaining users of read_lock(&tasklist_lock) but that's outside the scope of this patch. 
Signed-off-by: Thomas Gleixner LKML-Reference: <20091210004703.269843657@linutronix.de> --- kernel/signal.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 73316568a69..f67545f9394 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -218,13 +218,13 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi struct user_struct *user; /* - * We won't get problems with the target's UID changing under us - * because changing it requires RCU be used, and if t != current, the - * caller must be holding the RCU readlock (by way of a spinlock) and - * we use RCU protection here + * Protect access to @t credentials. This can go away when all + * callers hold rcu read lock. */ + rcu_read_lock(); user = get_uid(__task_cred(t)->user); atomic_inc(&user->sigpending); + rcu_read_unlock(); if (override_rlimit || atomic_read(&user->sigpending) <= -- cgit v1.2.3 From d4581a239a40319205762b76c01eb6363f277efa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2009 00:52:51 +0000 Subject: sys: Fix missing rcu protection for __task_cred() access commit c69e8d9 (CRED: Use RCU to access another task's creds and to release a task's own creds) added non rcu_read_lock() protected access to task creds of the target task in set_prio_one(). The comment above the function says: * - the caller must hold the RCU read lock The calling code in sys_setpriority does read_lock(&tasklist_lock) but not rcu_read_lock(). This works only when CONFIG_TREE_PREEMPT_RCU=n. With CONFIG_TREE_PREEMPT_RCU=y the rcu_callbacks can run in the tick interrupt when they see no read side critical section. There is another instance of __task_cred() in sys_setpriority() itself which is equally unprotected. Wrap the whole code section into a rcu read side critical section to fix this quick and dirty. Will be revisited in course of the read_lock(&tasklist_lock) -> rcu crusade. 
Oleg noted further: This also fixes another bug here. find_task_by_vpid() is not safe without rcu_read_lock(). I do not mean it is not safe to use the result, just find_pid_ns() by itself is not safe. Usually tasklist gives enough protection, but if copy_process() fails it calls free_pid() lockless and does call_rcu(delayed_put_pid). This means, without rcu lock find_pid_ns() can't scan the hash table safely. Signed-off-by: Thomas Gleixner LKML-Reference: <20091210004703.029784964@linutronix.de> Acked-by: Paul E. McKenney --- kernel/sys.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 9968c5fb55b..bc1dc61c31e 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -163,6 +163,7 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) if (niceval > 19) niceval = 19; + rcu_read_lock(); read_lock(&tasklist_lock); switch (which) { case PRIO_PROCESS: @@ -200,6 +201,7 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) } out_unlock: read_unlock(&tasklist_lock); + rcu_read_unlock(); out: return error; } -- cgit v1.2.3 From bb6eddf7676e1c1f3e637aa93c5224488d99036f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2009 15:35:10 +0100 Subject: clockevents: Prevent clockevent_devices list corruption on cpu hotplug Xiaotian Feng triggered a list corruption in the clock events list on CPU hotplug and debugged the root cause. If a CPU registers more than one per cpu clock event device, then only the active clock event device is removed on CPU_DEAD. The unused devices are kept in the clock events device list. On CPU up the clock event devices are registered again, which means that we list_add an already enqueued list_head. That results in list corruption. Resolve this by removing all devices which are associated to the dead CPU on CPU_DEAD. 
Reported-by: Xiaotian Feng Signed-off-by: Thomas Gleixner Tested-by: Xiaotian Feng Cc: stable@kernel.org --- kernel/time/clockevents.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 20a8920029e..91db2e33d86 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -238,8 +238,9 @@ void clockevents_exchange_device(struct clock_event_device *old, */ void clockevents_notify(unsigned long reason, void *arg) { - struct list_head *node, *tmp; + struct clock_event_device *dev, *tmp; unsigned long flags; + int cpu; spin_lock_irqsave(&clockevents_lock, flags); clockevents_do_notify(reason, arg); @@ -250,8 +251,19 @@ void clockevents_notify(unsigned long reason, void *arg) * Unregister the clock event devices which were * released from the users in the notify chain. */ - list_for_each_safe(node, tmp, &clockevents_released) - list_del(node); + list_for_each_entry_safe(dev, tmp, &clockevents_released, list) + list_del(&dev->list); + /* + * Now check whether the CPU has left unused per cpu devices + */ + cpu = *((int *)arg); + list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) { + if (cpumask_test_cpu(cpu, dev->cpumask) && + cpumask_weight(dev->cpumask) == 1) { + BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); + list_del(&dev->list); + } + } break; default: break; -- cgit v1.2.3 From 01fc0ac198eabcbf460e1ed058860a935b6c2c9a Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Sun, 19 Apr 2009 21:57:19 +0200 Subject: kbuild: move bounds.h to include/generated Signed-off-by: Sam Ravnborg Cc: Al Viro Signed-off-by: Michal Marek --- kernel/bounds.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bounds.c b/kernel/bounds.c index 3c530138183..98a51f26c13 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -12,7 +12,7 @@ void foo(void) { - /* The enum constants to put into include/linux/bounds.h */ 
+ /* The enum constants to put into include/generated/bounds.h */ DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); /* End of constants */ -- cgit v1.2.3 From 273b281fa22c293963ee3e6eec418f5dda2dbc83 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Sun, 18 Oct 2009 00:52:28 +0200 Subject: kbuild: move utsrelease.h to include/generated Fix up all users of utsrelease.h Signed-off-by: Sam Ravnborg Signed-off-by: Michal Marek --- kernel/kexec.c | 2 +- kernel/trace/trace.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index f336e2107f9..83f54e2a6ee 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -21,7 +21,7 @@ #include #include #include -#include <linux/utsrelease.h> +#include <generated/utsrelease.h> #include #include #include diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 88bd9ae2a9e..bfb1b64bfa9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -12,7 +12,7 @@ * Copyright (C) 2004 William Lee Irwin III */ #include -#include <linux/utsrelease.h> +#include <generated/utsrelease.h> #include #include #include -- cgit v1.2.3 From 7539a3b3d1f892dd97eaf094134d7de55c13befe Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 13 Dec 2009 00:07:30 +0100 Subject: sched: Make wakeup side and atomic variants of completion API irq safe Alan Stern noticed that all the wakeup side (and atomic) variants of the completion APIs should be irq safe, but the newly introduced completion_done() and try_wait_for_completion() aren't. The use of the irq unsafe variants in IRQ contexts can cause crashes/hangs. Fix the problem by making them use spin_lock_irqsave() and spin_unlock_irqrestore(). Reported-by: Alan Stern Signed-off-by: Rafael J. 
Wysocki Cc: Linus Torvalds Cc: Zhang Rui Cc: pm list Cc: Peter Zijlstra Cc: David Chinner Cc: Lachlan McIlroy LKML-Reference: <200912130007.30541.rjw@sisk.pl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index ff39cadf621..8b3532f262d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5908,14 +5908,15 @@ EXPORT_SYMBOL(wait_for_completion_killable); */ bool try_wait_for_completion(struct completion *x) { + unsigned long flags; int ret = 1; - spin_lock_irq(&x->wait.lock); + spin_lock_irqsave(&x->wait.lock, flags); if (!x->done) ret = 0; else x->done--; - spin_unlock_irq(&x->wait.lock); + spin_unlock_irqrestore(&x->wait.lock, flags); return ret; } EXPORT_SYMBOL(try_wait_for_completion); @@ -5930,12 +5931,13 @@ EXPORT_SYMBOL(try_wait_for_completion); */ bool completion_done(struct completion *x) { + unsigned long flags; int ret = 1; - spin_lock_irq(&x->wait.lock); + spin_lock_irqsave(&x->wait.lock, flags); if (!x->done) ret = 0; - spin_unlock_irq(&x->wait.lock); + spin_unlock_irqrestore(&x->wait.lock, flags); return ret; } EXPORT_SYMBOL(completion_done); -- cgit v1.2.3 From 663997d417330a59a566452f52cfa04c8ffd190b Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sat, 12 Dec 2009 13:57:27 -0800 Subject: sched: Use pr_fmt() and pr_<level>() - Convert printk(KERN_<LEVEL> to pr_<level> (not KERN_DEBUG) - Add #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - Coalesce long format strings - Add missing \n to "ERROR: !SD_LOAD_BALANCE domain has parent" Signed-off-by: Joe Perches Cc: Peter Zijlstra LKML-Reference: <1260655047.2637.7.camel@Joe-Laptop.home> Signed-off-by: Ingo Molnar --- kernel/sched.c | 94 ++++++++++++++++++++++--------------------------- kernel/sched_idletask.c | 2 +- 2 files changed, 43 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 8b3532f262d..258c73c6a2f 100644 --- a/kernel/sched.c +++ 
b/kernel/sched.c @@ -26,6 +26,8 @@ * Thomas Gleixner, Mike Kravetz */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -5337,8 +5339,8 @@ static noinline void __schedule_bug(struct task_struct *prev) { struct pt_regs *regs = get_irq_regs(); - printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", - prev->comm, prev->pid, preempt_count()); + pr_err("BUG: scheduling while atomic: %s/%d/0x%08x\n", + prev->comm, prev->pid, preempt_count()); debug_show_held_locks(prev); print_modules(); @@ -6906,23 +6908,23 @@ void sched_show_task(struct task_struct *p) unsigned state; state = p->state ? __ffs(p->state) + 1 : 0; - printk(KERN_INFO "%-13.13s %c", p->comm, + pr_info("%-13.13s %c", p->comm, state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); #if BITS_PER_LONG == 32 if (state == TASK_RUNNING) - printk(KERN_CONT " running "); + pr_cont(" running "); else - printk(KERN_CONT " %08lx ", thread_saved_pc(p)); + pr_cont(" %08lx ", thread_saved_pc(p)); #else if (state == TASK_RUNNING) - printk(KERN_CONT " running task "); + pr_cont(" running task "); else - printk(KERN_CONT " %016lx ", thread_saved_pc(p)); + pr_cont(" %016lx ", thread_saved_pc(p)); #endif #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); #endif - printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, + pr_cont("%5lu %5d %6d 0x%08lx\n", free, task_pid_nr(p), task_pid_nr(p->real_parent), (unsigned long)task_thread_info(p)->flags); @@ -6934,11 +6936,9 @@ void show_state_filter(unsigned long state_filter) struct task_struct *g, *p; #if BITS_PER_LONG == 32 - printk(KERN_INFO - " task PC stack pid father\n"); + pr_info(" task PC stack pid father\n"); #else - printk(KERN_INFO - " task PC stack pid father\n"); + pr_info(" task PC stack pid father\n"); #endif read_lock(&tasklist_lock); do_each_thread(g, p) { @@ -7296,9 +7296,8 @@ again: * leave kernel. 
*/ if (p->mm && printk_ratelimit()) { - printk(KERN_INFO "process %d (%s) no " - "longer affine to cpu%d\n", - task_pid_nr(p), p->comm, dead_cpu); + pr_info("process %d (%s) no longer affine to cpu%d\n", + task_pid_nr(p), p->comm, dead_cpu); } } @@ -7805,48 +7804,44 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, printk(KERN_DEBUG "%*s domain %d: ", level, "", level); if (!(sd->flags & SD_LOAD_BALANCE)) { - printk("does not load-balance\n"); + pr_cont("does not load-balance\n"); if (sd->parent) - printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" - " has parent"); + pr_err("ERROR: !SD_LOAD_BALANCE domain has parent\n"); return -1; } - printk(KERN_CONT "span %s level %s\n", str, sd->name); + pr_cont("span %s level %s\n", str, sd->name); if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { - printk(KERN_ERR "ERROR: domain->span does not contain " - "CPU%d\n", cpu); + pr_err("ERROR: domain->span does not contain CPU%d\n", cpu); } if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { - printk(KERN_ERR "ERROR: domain->groups does not contain" - " CPU%d\n", cpu); + pr_err("ERROR: domain->groups does not contain CPU%d\n", cpu); } printk(KERN_DEBUG "%*s groups:", level + 1, ""); do { if (!group) { - printk("\n"); - printk(KERN_ERR "ERROR: group is NULL\n"); + pr_cont("\n"); + pr_err("ERROR: group is NULL\n"); break; } if (!group->cpu_power) { - printk(KERN_CONT "\n"); - printk(KERN_ERR "ERROR: domain->cpu_power not " - "set\n"); + pr_cont("\n"); + pr_err("ERROR: domain->cpu_power not set\n"); break; } if (!cpumask_weight(sched_group_cpus(group))) { - printk(KERN_CONT "\n"); - printk(KERN_ERR "ERROR: empty group\n"); + pr_cont("\n"); + pr_err("ERROR: empty group\n"); break; } if (cpumask_intersects(groupmask, sched_group_cpus(group))) { - printk(KERN_CONT "\n"); - printk(KERN_ERR "ERROR: repeated CPUs\n"); + pr_cont("\n"); + pr_err("ERROR: repeated CPUs\n"); break; } @@ -7854,23 +7849,21 @@ static int sched_domain_debug_one(struct 
sched_domain *sd, int cpu, int level, cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); - printk(KERN_CONT " %s", str); + pr_cont(" %s", str); if (group->cpu_power != SCHED_LOAD_SCALE) { - printk(KERN_CONT " (cpu_power = %d)", - group->cpu_power); + pr_cont(" (cpu_power = %d)", group->cpu_power); } group = group->next; } while (group != sd->groups); - printk(KERN_CONT "\n"); + pr_cont("\n"); if (!cpumask_equal(sched_domain_span(sd), groupmask)) - printk(KERN_ERR "ERROR: groups don't span domain->span\n"); + pr_err("ERROR: groups don't span domain->span\n"); if (sd->parent && !cpumask_subset(groupmask, sched_domain_span(sd->parent))) - printk(KERN_ERR "ERROR: parent span is not a superset " - "of domain->span\n"); + pr_err("ERROR: parent span is not a superset of domain->span\n"); return 0; } @@ -8426,8 +8419,7 @@ static int build_numa_sched_groups(struct s_data *d, sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), GFP_KERNEL, num); if (!sg) { - printk(KERN_WARNING "Can not alloc domain group for node %d\n", - num); + pr_warning("Can not alloc domain group for node %d\n", num); return -ENOMEM; } d->sched_group_nodes[num] = sg; @@ -8456,8 +8448,8 @@ static int build_numa_sched_groups(struct s_data *d, sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), GFP_KERNEL, num); if (!sg) { - printk(KERN_WARNING - "Can not alloc domain group for node %d\n", j); + pr_warning("Can not alloc domain group for node %d\n", + j); return -ENOMEM; } sg->cpu_power = 0; @@ -8685,7 +8677,7 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, d->sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), GFP_KERNEL); if (!d->sched_group_nodes) { - printk(KERN_WARNING "Can not alloc sched group node list\n"); + pr_warning("Can not alloc sched group node list\n"); return sa_notcovered; } sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; @@ -8702,7 +8694,7 @@ static enum s_alloc 
__visit_domain_allocation_hell(struct s_data *d, return sa_send_covered; d->rd = alloc_rootdomain(); if (!d->rd) { - printk(KERN_WARNING "Cannot alloc root domain\n"); + pr_warning("Cannot alloc root domain\n"); return sa_tmpmask; } return sa_rootdomain; @@ -9684,13 +9676,11 @@ void __might_sleep(char *file, int line, int preempt_offset) return; prev_jiffy = jiffies; - printk(KERN_ERR - "BUG: sleeping function called from invalid context at %s:%d\n", - file, line); - printk(KERN_ERR - "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", - in_atomic(), irqs_disabled(), - current->pid, current->comm); + pr_err("BUG: sleeping function called from invalid context at %s:%d\n", + file, line); + pr_err("in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), + current->pid, current->comm); debug_show_held_locks(current); if (irqs_disabled()) diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 33d5384a73a..b810e22772d 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -35,7 +35,7 @@ static void dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep) { spin_unlock_irq(&rq->lock); - printk(KERN_ERR "bad: scheduling from the idle thread!\n"); + pr_err("bad: scheduling from the idle thread!\n"); dump_stack(); spin_lock_irq(&rq->lock); } -- cgit v1.2.3 From 5fe85be081edf0ac92d83f9c39e0ab5c1371eb82 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Dec 2009 10:14:58 +0000 Subject: sched: Use rcu in sys_sched_getscheduler/sys_sched_getparam() read_lock(&tasklist_lock) does not protect sys_sched_getscheduler and sys_sched_getparam() against a concurrent update of the policy or scheduler parameters as do_sched_setscheduler() does not take the tasklist_lock. The accessed integers can be retrieved w/o locking and are snapshots anyway. Using rcu_read_lock() to protect find_task_by_vpid() and prevent the task struct from going away is not changing the above situation. 
Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20091209100706.753790977@linutronix.de> Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 258c73c6a2f..1782beed2fa 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6458,7 +6458,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) return -EINVAL; retval = -ESRCH; - read_lock(&tasklist_lock); + rcu_read_lock(); p = find_process_by_pid(pid); if (p) { retval = security_task_getscheduler(p); @@ -6466,7 +6466,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) retval = p->policy | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); } - read_unlock(&tasklist_lock); + rcu_read_unlock(); return retval; } @@ -6484,7 +6484,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) if (!param || pid < 0) return -EINVAL; - read_lock(&tasklist_lock); + rcu_read_lock(); p = find_process_by_pid(pid); retval = -ESRCH; if (!p) @@ -6495,7 +6495,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) goto out_unlock; lp.sched_priority = p->rt_priority; - read_unlock(&tasklist_lock); + rcu_read_unlock(); /* * This one might sleep, we cannot do it with a spinlock held ... @@ -6505,7 +6505,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) return retval; out_unlock: - read_unlock(&tasklist_lock); + rcu_read_unlock(); return retval; } -- cgit v1.2.3 From 23f5d142519621b16cf2b378cf8adf4dcf01a616 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Dec 2009 10:15:01 +0000 Subject: sched: Use rcu in sched_get/set_affinity() tasklist_lock is held read locked to protect the find_task_by_vpid() call and to prevent the task going away. sched_setaffinity acquires a task struct ref and drops tasklist lock right away. The access to the cpus_allowed mask is protected by rq->lock. 
rcu_read_lock() provides the same protection here. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20091209100706.789059966@linutronix.de> Signed-off-by: Ingo Molnar --- kernel/sched.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 1782beed2fa..79893123325 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6516,22 +6516,18 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) int retval; get_online_cpus(); - read_lock(&tasklist_lock); + rcu_read_lock(); p = find_process_by_pid(pid); if (!p) { - read_unlock(&tasklist_lock); + rcu_read_unlock(); put_online_cpus(); return -ESRCH; } - /* - * It is not safe to call set_cpus_allowed with the - * tasklist_lock held. We will bump the task_struct's - * usage count and then drop tasklist_lock. - */ + /* Prevent p going away */ get_task_struct(p); - read_unlock(&tasklist_lock); + rcu_read_unlock(); if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { retval = -ENOMEM; @@ -6617,7 +6613,7 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) int retval; get_online_cpus(); - read_lock(&tasklist_lock); + rcu_read_lock(); retval = -ESRCH; p = find_process_by_pid(pid); @@ -6633,7 +6629,7 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) task_rq_unlock(rq, &flags); out_unlock: - read_unlock(&tasklist_lock); + rcu_read_unlock(); put_online_cpus(); return retval; -- cgit v1.2.3 From 1a551ae715825bb2a2107a2dd68de024a1fa4e32 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Dec 2009 10:15:11 +0000 Subject: sched: Use rcu in sched_get_rr_param() read_lock(&tasklist_lock) does not protect sys_sched_get_rr_param() against a concurrent update of the policy or scheduler parameters as do_sched_setscheduler() does not take the tasklist_lock. The access to task->sched_class->get_rr_interval is protected by task_rq_lock(task). 
Use rcu_read_lock() to protect find_task_by_vpid() and prevent the task struct from going away. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20091209100706.862897167@linutronix.de> Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 79893123325..db5c26692dd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6873,7 +6873,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, return -EINVAL; retval = -ESRCH; - read_lock(&tasklist_lock); + rcu_read_lock(); p = find_process_by_pid(pid); if (!p) goto out_unlock; @@ -6886,13 +6886,13 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, time_slice = p->sched_class->get_rr_interval(rq, p); task_rq_unlock(rq, &flags); - read_unlock(&tasklist_lock); + rcu_read_unlock(); jiffies_to_timespec(time_slice, &t); retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; return retval; out_unlock: - read_unlock(&tasklist_lock); + rcu_read_unlock(); return retval; } -- cgit v1.2.3 From b9f8fcd55bbdb037e5332dbdb7b494f0b70861ac Mon Sep 17 00:00:00 2001 From: David Miller Date: Sun, 13 Dec 2009 18:25:02 -0800 Subject: sched: Fix cpu_clock() in NMIs, on !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK Relax stable-sched-clock architectures to not save/disable/restore hardirqs in cpu_clock(). The background is that I was trying to resolve a sparc64 perf issue when I discovered this problem. On sparc64 I implement pseudo NMIs by simply running the kernel at IRQ level 14 when local_irq_disable() is called, this allows performance counter events to still come in at IRQ level 15. This doesn't work if any code in an NMI handler does local_irq_save() or local_irq_disable() since the "disable" will kick us back to cpu IRQ level 14 thus letting NMIs back in and we recurse. The only path that does that in the perf event IRQ handling path is the code supporting frequency based events. 
It uses cpu_clock(). cpu_clock() simply invokes sched_clock() with IRQs disabled. And that's a fundamental bug all on its own, particularly for the HAVE_UNSTABLE_SCHED_CLOCK case. NMIs can thus get into the sched_clock() code interrupting the local IRQ disable code sections of it. Furthermore, for the not-HAVE_UNSTABLE_SCHED_CLOCK case, the IRQ disabling done by cpu_clock() is just pure overhead and completely unnecessary. So the core problem is that sched_clock() is not NMI safe, but we are invoking it from NMI contexts in the perf events code (via cpu_clock()). A less important issue is the overhead of IRQ disabling when it isn't necessary in cpu_clock(). CONFIG_HAVE_UNSTABLE_SCHED_CLOCK architectures are not affected by this patch. Signed-off-by: David S. Miller Acked-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091213.182502.215092085.davem@davemloft.net> Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 479ce5682d7..5b496132c28 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -236,6 +236,18 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); +unsigned long long cpu_clock(int cpu) +{ + unsigned long long clock; + unsigned long flags; + + local_irq_save(flags); + clock = sched_clock_cpu(cpu); + local_irq_restore(flags); + + return clock; +} + #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ void sched_clock_init(void) @@ -251,17 +263,12 @@ u64 sched_clock_cpu(int cpu) return sched_clock(); } -#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ unsigned long long cpu_clock(int cpu) { - unsigned long long clock; - unsigned long flags; + return sched_clock_cpu(cpu); +} - local_irq_save(flags); - clock = sched_clock_cpu(cpu); - local_irq_restore(flags); +#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ - return clock; -} 
EXPORT_SYMBOL_GPL(cpu_clock); -- cgit v1.2.3 From 9ee349ad6d326df3633d43f54202427295999c47 Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Wed, 16 Dec 2009 18:04:32 +0100 Subject: sched: Fix set_cpu_active() in cpu_down() Sachin found cpu hotplug test failures on powerpc, which made the kernel hang on his POWER box. The problem is that we fail to re-activate a cpu when a hot-unplug fails. Fix this by moving the de-activation into _cpu_down after doing the initial checks. Remove the synchronize_sched() calls and rely on those implied by rebuilding the sched domains using the new mask. Reported-by: Sachin Sant Signed-off-by: Xiaotian Feng Tested-by: Sachin Sant Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170517.500272612@chello.nl> Signed-off-by: Ingo Molnar --- kernel/cpu.c | 24 +++--------------------- 1 file changed, 3 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 291ac586f37..1c8ddd6ee94 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -209,6 +209,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) return -ENOMEM; cpu_hotplug_begin(); + set_cpu_active(cpu, false); err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); if (err == NOTIFY_BAD) { @@ -280,18 +281,6 @@ int __ref cpu_down(unsigned int cpu) goto out; } - set_cpu_active(cpu, false); - - /* - * Make sure the all cpus did the reschedule and are not - * using stale version of the cpu_active_mask. - * This is not strictly necessary becuase stop_machine() - * that we run down the line already provides the required - * synchronization. But it's really a side effect and we do not - * want to depend on the innards of the stop_machine here. 
- */ - synchronize_sched(); - err = _cpu_down(cpu, 0); out: @@ -382,19 +371,12 @@ int disable_nonboot_cpus(void) return error; cpu_maps_update_begin(); first_cpu = cpumask_first(cpu_online_mask); - /* We take down all of the non-boot CPUs in one shot to avoid races + /* + * We take down all of the non-boot CPUs in one shot to avoid races * with the userspace trying to use the CPU hotplug at the same time */ cpumask_clear(frozen_cpus); - for_each_online_cpu(cpu) { - if (cpu == first_cpu) - continue; - set_cpu_active(cpu, false); - } - - synchronize_sched(); - printk("Disabling non-boot CPUs ...\n"); for_each_online_cpu(cpu) { if (cpu == first_cpu) -- cgit v1.2.3 From e6c8fba7771563b2f3dfb96a78f36ec17e15bdf0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 18:04:33 +0100 Subject: sched: Fix task_hot() test order Make sure not to access sched_fair fields before verifying it is indeed a sched_fair task. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith CC: stable@kernel.org LKML-Reference: <20091216170517.577998058@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 9c30858b646..1d8ca25dd6f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2046,6 +2046,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) { s64 delta; + if (p->sched_class != &fair_sched_class) + return 0; + /* * Buddy candidates are cache hot: */ @@ -2054,9 +2057,6 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) &p->se == cfs_rq_of(&p->se)->last)) return 1; - if (p->sched_class != &fair_sched_class) - return 0; - if (sysctl_sched_migration_cost == -1) return 1; if (sysctl_sched_migration_cost == 0) -- cgit v1.2.3 From e4f4288842ee12747e10c354d72be7d424c0b627 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 18:04:34 +0100 Subject: sched: Select_task_rq_fair() must honour SD_LOAD_BALANCE We 
should skip !SD_LOAD_BALANCE domains. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170517.653578430@chello.nl> CC: stable@kernel.org Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 5bedf6e3ebf..ec1d2715620 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1429,6 +1429,9 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag } for_each_domain(cpu, tmp) { + if (!(tmp->flags & SD_LOAD_BALANCE)) + continue; + /* * If power savings logic is enabled for a domain, see if we * are not overloaded, if so, don't balance wider. -- cgit v1.2.3 From 06b83b5fbea273672822b6ee93e16781046553ec Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 18:04:35 +0100 Subject: sched: Use TASK_WAKING for fork wakups For later convenience use TASK_WAKING for fresh tasks. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170517.732561278@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 1d8ca25dd6f..1672823aabf 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2540,14 +2540,6 @@ static void __sched_fork(struct task_struct *p) #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); #endif - - /* - * We mark the process as running here, but have not actually - * inserted it onto the runqueue yet. This guarantees that - * nobody will actually run it, and a signal or other external - * event cannot wake it up and insert it on the runqueue either. - */ - p->state = TASK_RUNNING; } /* @@ -2558,6 +2550,12 @@ void sched_fork(struct task_struct *p, int clone_flags) int cpu = get_cpu(); __sched_fork(p); + /* + * We mark the process as waking here. 
This guarantees that + * nobody will actually run it, and a signal or other external + * event cannot wake it up and insert it on the runqueue either. + */ + p->state = TASK_WAKING; /* * Revert to default priority/policy on fork if requested. @@ -2626,7 +2624,8 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) struct rq *rq; rq = task_rq_lock(p, &flags); - BUG_ON(p->state != TASK_RUNNING); + BUG_ON(p->state != TASK_WAKING); + p->state = TASK_RUNNING; update_rq_clock(rq); activate_task(rq, p, 0); trace_sched_wakeup_new(rq, p, 1); @@ -6984,6 +6983,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) raw_spin_lock_irqsave(&rq->lock, flags); __sched_fork(idle); + idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); -- cgit v1.2.3 From e2912009fb7b715728311b0d8fe327a1432b3f79 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 18:04:36 +0100 Subject: sched: Ensure set_task_cpu() is never called on blocked tasks In order to clean up the set_task_cpu() rq dependencies we need to ensure it is never called on blocked tasks because such usage does not pair with consistent rq->lock usage. This puts the migration burden on ttwu(). Furthermore we need to close a race against changing ->cpus_allowed, since select_task_rq() runs with only preemption disabled. For sched_fork() this is safe because the child isn't in the tasklist yet, for wakeup we fix this by synchronizing set_cpus_allowed_ptr() against TASK_WAKING, which leaves sched_exec to be a problem This also closes a hole in (6ad4c1888 sched: Fix balance vs hotplug race) where ->select_task_rq() doesn't validate the result against the sched_domain/root_domain. 
Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170517.807938893@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 85 +++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 66 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 1672823aabf..33d7965f63f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2018,22 +2018,15 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, */ void kthread_bind(struct task_struct *p, unsigned int cpu) { - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - /* Must have done schedule() in kthread() before we set_task_cpu */ if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) { WARN_ON(1); return; } - raw_spin_lock_irqsave(&rq->lock, flags); - update_rq_clock(rq); - set_task_cpu(p, cpu); p->cpus_allowed = cpumask_of_cpu(cpu); p->rt.nr_cpus_allowed = 1; p->flags |= PF_THREAD_BOUND; - raw_spin_unlock_irqrestore(&rq->lock, flags); } EXPORT_SYMBOL(kthread_bind); @@ -2074,6 +2067,14 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) struct cfs_rq *old_cfsrq = task_cfs_rq(p), *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); +#ifdef CONFIG_SCHED_DEBUG + /* + * We should never call set_task_cpu() on a blocked task, + * ttwu() will sort out the placement. + */ + WARN_ON(p->state != TASK_RUNNING && p->state != TASK_WAKING); +#endif + trace_sched_migrate_task(p, new_cpu); if (old_cpu != new_cpu) { @@ -2107,13 +2108,10 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) /* * If the task is not on a runqueue (and not running), then - * it is sufficient to simply update the task's cpu field. + * the next wake-up will properly place the task. 
*/ - if (!p->se.on_rq && !task_running(rq, p)) { - update_rq_clock(rq); - set_task_cpu(p, dest_cpu); + if (!p->se.on_rq && !task_running(rq, p)) return 0; - } init_completion(&req->done); req->task = p; @@ -2319,10 +2317,42 @@ void task_oncpu_function_call(struct task_struct *p, } #ifdef CONFIG_SMP +/* + * Called from: + * + * - fork, @p is stable because it isn't on the tasklist yet + * + * - exec, @p is unstable XXX + * + * - wake-up, we serialize ->cpus_allowed against TASK_WAKING so + * we should be good. + */ static inline int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) { - return p->sched_class->select_task_rq(p, sd_flags, wake_flags); + int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); + + /* + * In order not to call set_task_cpu() on a blocking task we need + * to rely on ttwu() to place the task on a valid ->cpus_allowed + * cpu. + * + * Since this is common to all placement strategies, this lives here. + * + * [ this allows ->select_task() to simply return task_cpu(p) and + * not worry about this generic constraint ] + */ + if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || + !cpu_active(cpu))) { + + cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); + /* + * XXX: race against hot-plug modifying cpu_active_mask + */ + BUG_ON(cpu >= nr_cpu_ids); + } + + return cpu; } #endif @@ -7098,7 +7128,23 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) struct rq *rq; int ret = 0; + /* + * Since we rely on wake-ups to migrate sleeping tasks, don't change + * the ->cpus_allowed mask from under waking tasks, which would be + * possible when we change rq->lock in ttwu(), so synchronize against + * TASK_WAKING to avoid that. 
+ */ +again: + while (p->state == TASK_WAKING) + cpu_relax(); + rq = task_rq_lock(p, &flags); + + if (p->state == TASK_WAKING) { + task_rq_unlock(rq, &flags); + goto again; + } + if (!cpumask_intersects(new_mask, cpu_active_mask)) { ret = -EINVAL; goto out; @@ -7154,7 +7200,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) { struct rq *rq_dest, *rq_src; - int ret = 0, on_rq; + int ret = 0; if (unlikely(!cpu_active(dest_cpu))) return ret; @@ -7170,12 +7216,13 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) goto fail; - on_rq = p->se.on_rq; - if (on_rq) + /* + * If we're not on a rq, the next wake-up will ensure we're + * placed properly. + */ + if (p->se.on_rq) { deactivate_task(rq_src, p, 0); - - set_task_cpu(p, dest_cpu); - if (on_rq) { + set_task_cpu(p, dest_cpu); activate_task(rq_dest, p, 0); check_preempt_curr(rq_dest, p, 0); } -- cgit v1.2.3 From 3802290628348674985d14914f9bfee7b9084548 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 18:04:37 +0100 Subject: sched: Fix sched_exec() balancing Since we access ->cpus_allowed without holding rq->lock we need a retry loop to validate the result, this comes for near free when we merge sched_migrate_task() into sched_exec() since that already does the needed check. 
Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170517.884743662@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 33d7965f63f..63e55ac242d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2322,7 +2322,7 @@ void task_oncpu_function_call(struct task_struct *p, * * - fork, @p is stable because it isn't on the tasklist yet * - * - exec, @p is unstable XXX + * - exec, @p is unstable, retry loop * * - wake-up, we serialize ->cpus_allowed against TASK_WAKING so * we should be good. @@ -3132,21 +3132,36 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) } /* - * If dest_cpu is allowed for this process, migrate the task to it. - * This is accomplished by forcing the cpu_allowed mask to only - * allow dest_cpu, which will force the cpu onto dest_cpu. Then - * the cpu_allowed mask is restored. + * sched_exec - execve() is a valuable balancing opportunity, because at + * this point the task has the smallest effective memory and cache footprint. 
*/ -static void sched_migrate_task(struct task_struct *p, int dest_cpu) +void sched_exec(void) { + struct task_struct *p = current; struct migration_req req; + int dest_cpu, this_cpu; unsigned long flags; struct rq *rq; +again: + this_cpu = get_cpu(); + dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0); + if (dest_cpu == this_cpu) { + put_cpu(); + return; + } + rq = task_rq_lock(p, &flags); + put_cpu(); + + /* + * select_task_rq() can race against ->cpus_allowed + */ if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) - || unlikely(!cpu_active(dest_cpu))) - goto out; + || unlikely(!cpu_active(dest_cpu))) { + task_rq_unlock(rq, &flags); + goto again; + } /* force the process onto the specified CPU */ if (migrate_task(p, dest_cpu, &req)) { @@ -3161,23 +3176,9 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu) return; } -out: task_rq_unlock(rq, &flags); } -/* - * sched_exec - execve() is a valuable balancing opportunity, because at - * this point the task has the smallest effective memory and cache footprint. - */ -void sched_exec(void) -{ - int new_cpu, this_cpu = get_cpu(); - new_cpu = select_task_rq(current, SD_BALANCE_EXEC, 0); - put_cpu(); - if (new_cpu != this_cpu) - sched_migrate_task(current, new_cpu); -} - /* * pull_task - move a task from a remote runqueue to the local runqueue. * Both runqueues must be locked. -- cgit v1.2.3 From 5da9a0fb673a0ea0a093862f95f6b89b3390c31e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 18:04:38 +0100 Subject: sched: Fix select_task_rq() vs hotplug issues Since select_task_rq() is now responsible for guaranteeing ->cpus_allowed and cpu_active_mask, we need to verify this. select_task_rq_rt() can blindly return smp_processor_id()/task_cpu() without checking the valid masks, select_task_rq_fair() can do the same in the rare case that all SD_flags are disabled. 
Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170517.961475466@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 75 +++++++++++++++++++++++++++++++--------------------------- 1 file changed, 40 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 63e55ac242d..cc40bdadee7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2317,6 +2317,43 @@ void task_oncpu_function_call(struct task_struct *p, } #ifdef CONFIG_SMP +static int select_fallback_rq(int cpu, struct task_struct *p) +{ + int dest_cpu; + const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); + + /* Look for allowed, online CPU in same node. */ + for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) + if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) + return dest_cpu; + + /* Any allowed, online CPU? */ + dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); + if (dest_cpu < nr_cpu_ids) + return dest_cpu; + + /* No more Mr. Nice Guy. */ + if (dest_cpu >= nr_cpu_ids) { + rcu_read_lock(); + cpuset_cpus_allowed_locked(p, &p->cpus_allowed); + rcu_read_unlock(); + dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); + + /* + * Don't tell them about moving exiting tasks or + * kernel threads (both mm NULL), since they never + * leave kernel. 
+ */ + if (p->mm && printk_ratelimit()) { + printk(KERN_INFO "process %d (%s) no " + "longer affine to cpu%d\n", + task_pid_nr(p), p->comm, cpu); + } + } + + return dest_cpu; +} + /* * Called from: * @@ -2343,14 +2380,8 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) * not worry about this generic constraint ] */ if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || - !cpu_active(cpu))) { - - cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); - /* - * XXX: race against hot-plug modifying cpu_active_mask - */ - BUG_ON(cpu >= nr_cpu_ids); - } + !cpu_active(cpu))) + cpu = select_fallback_rq(task_cpu(p), p); return cpu; } @@ -7319,36 +7350,10 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) { int dest_cpu; - const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu)); again: - /* Look for allowed, online CPU in same node. */ - for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) - if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) - goto move; - - /* Any allowed, online CPU? */ - dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); - if (dest_cpu < nr_cpu_ids) - goto move; - - /* No more Mr. Nice Guy. */ - if (dest_cpu >= nr_cpu_ids) { - cpuset_cpus_allowed_locked(p, &p->cpus_allowed); - dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); - - /* - * Don't tell them about moving exiting tasks or - * kernel threads (both mm NULL), since they never - * leave kernel. - */ - if (p->mm && printk_ratelimit()) { - pr_info("process %d (%s) no longer affine to cpu%d\n", - task_pid_nr(p), p->comm, dead_cpu); - } - } + dest_cpu = select_fallback_rq(dead_cpu, p); -move: /* It can have affinity changed while we were choosing. 
*/ if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) goto again; -- cgit v1.2.3 From 881232b70b195768a71cd74ff4b4e8ab9502997b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 18:04:39 +0100 Subject: sched: Move kthread_bind() back to kthread.c Since kthread_bind() lost its dependencies on sched.c, move it back where it came from. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170518.039524041@chello.nl> Signed-off-by: Ingo Molnar --- kernel/kthread.c | 23 +++++++++++++++++++++++ kernel/sched.c | 26 -------------------------- 2 files changed, 23 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index ab7ae57773e..fbb6222fe7e 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -149,6 +149,29 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), } EXPORT_SYMBOL(kthread_create); +/** + * kthread_bind - bind a just-created kthread to a cpu. + * @p: thread created by kthread_create(). + * @cpu: cpu (might not be online, must be possible) for @k to run on. + * + * Description: This function is equivalent to set_cpus_allowed(), + * except that @cpu doesn't need to be online, and the thread must be + * stopped (i.e., just returned from kthread_create()). + */ +void kthread_bind(struct task_struct *p, unsigned int cpu) +{ + /* Must have done schedule() in kthread() before we set_task_cpu */ + if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) { + WARN_ON(1); + return; + } + + p->cpus_allowed = cpumask_of_cpu(cpu); + p->rt.nr_cpus_allowed = 1; + p->flags |= PF_THREAD_BOUND; +} +EXPORT_SYMBOL(kthread_bind); + /** * kthread_stop - stop a thread created by kthread_create(). * @k: thread created by kthread_create(). 
diff --git a/kernel/sched.c b/kernel/sched.c index cc40bdadee7..297dc441ff9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2004,32 +2004,6 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, p->sched_class->prio_changed(rq, p, oldprio, running); } -/** - * kthread_bind - bind a just-created kthread to a cpu. - * @p: thread created by kthread_create(). - * @cpu: cpu (might not be online, must be possible) for @k to run on. - * - * Description: This function is equivalent to set_cpus_allowed(), - * except that @cpu doesn't need to be online, and the thread must be - * stopped (i.e., just returned from kthread_create()). - * - * Function lives here instead of kthread.c because it messes with - * scheduler internals which require locking. - */ -void kthread_bind(struct task_struct *p, unsigned int cpu) -{ - /* Must have done schedule() in kthread() before we set_task_cpu */ - if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) { - WARN_ON(1); - return; - } - - p->cpus_allowed = cpumask_of_cpu(cpu); - p->rt.nr_cpus_allowed = 1; - p->flags |= PF_THREAD_BOUND; -} -EXPORT_SYMBOL(kthread_bind); - #ifdef CONFIG_SMP /* * Is this task likely cache-hot: -- cgit v1.2.3 From efbbd05a595343a413964ad85a2ad359b7b7efbd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 18:04:40 +0100 Subject: sched: Add pre and post wakeup hooks As will be apparent in the next patch, we need a pre wakeup hook for sched_fair task migration, hence rename the post wakeup hook and add a pre wakeup one.
Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170518.114746117@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 12 ++++++++---- kernel/sched_rt.c | 4 ++-- 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 297dc441ff9..6c571bdd565 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2412,6 +2412,10 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, if (task_contributes_to_load(p)) rq->nr_uninterruptible--; p->state = TASK_WAKING; + + if (p->sched_class->task_waking) + p->sched_class->task_waking(rq, p); + __task_rq_unlock(rq); cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); @@ -2475,8 +2479,8 @@ out_running: p->state = TASK_RUNNING; #ifdef CONFIG_SMP - if (p->sched_class->task_wake_up) - p->sched_class->task_wake_up(rq, p); + if (p->sched_class->task_woken) + p->sched_class->task_woken(rq, p); if (unlikely(rq->idle_stamp)) { u64 delta = rq->clock - rq->idle_stamp; @@ -2666,8 +2670,8 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) trace_sched_wakeup_new(rq, p, 1); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP - if (p->sched_class->task_wake_up) - p->sched_class->task_wake_up(rq, p); + if (p->sched_class->task_woken) + p->sched_class->task_woken(rq, p); #endif task_rq_unlock(rq, &flags); } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index d2ea2828164..f48328ac216 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1472,7 +1472,7 @@ static void post_schedule_rt(struct rq *rq) * If we are not running and we are not going to reschedule soon, we should * try to push tasks away now */ -static void task_wake_up_rt(struct rq *rq, struct task_struct *p) +static void task_woken_rt(struct rq *rq, struct task_struct *p) { if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && @@ -1753,7 +1753,7 @@ static const struct sched_class rt_sched_class = { .rq_offline = rq_offline_rt, 
.pre_schedule = pre_schedule_rt, .post_schedule = post_schedule_rt, - .task_wake_up = task_wake_up_rt, + .task_woken = task_woken_rt, .switched_from = switched_from_rt, #endif -- cgit v1.2.3 From 88ec22d3edb72b261f8628226cd543589a6d5e1b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 18:04:41 +0100 Subject: sched: Remove the cfs_rq dependency from set_task_cpu() In order to remove the cfs_rq dependency from set_task_cpu() we need to ensure the task is cfs_rq invariant for all callsites. The simple approach is to subtract cfs_rq->min_vruntime from se->vruntime on dequeue, and add cfs_rq->min_vruntime on enqueue. However, this has the downside of breaking FAIR_SLEEPERS since we lose the old vruntime as we only maintain the relative position. To solve this, we observe that we only migrate runnable tasks, we do this using deactivate_task(.sleep=0) and activate_task(.wakeup=0), therefore we can restrain the min_vruntime invariance to that state. The only other case is wakeup balancing, since we want to maintain the old vruntime we cannot make it relative on dequeue, but since we don't migrate inactive tasks, we can do so right before we activate it again. This is where we need the new pre-wakeup hook, we need to call this while still holding the old rq->lock. We could fold it into ->select_task_rq(), but since that has multiple callsites and would obfuscate the locking requirements, that seems like a fudge. This leaves the fork() case, simply make sure that ->task_fork() leaves the ->vruntime in a relative state. This covers all cases where set_task_cpu() gets called, and ensures it sees a relative vruntime.
Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170518.191697025@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 +----- kernel/sched_fair.c | 50 ++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 45 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6c571bdd565..f92ce63edff 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2038,8 +2038,6 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { int old_cpu = task_cpu(p); - struct cfs_rq *old_cfsrq = task_cfs_rq(p), - *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); #ifdef CONFIG_SCHED_DEBUG /* @@ -2056,8 +2054,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); } - p->se.vruntime -= old_cfsrq->min_vruntime - - new_cfsrq->min_vruntime; __set_task_cpu(p, new_cpu); } @@ -10102,7 +10098,7 @@ void sched_move_task(struct task_struct *tsk) #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->moved_group) - tsk->sched_class->moved_group(tsk); + tsk->sched_class->moved_group(tsk, on_rq); #endif if (unlikely(running)) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ec1d2715620..42ac3c9f66f 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -510,6 +510,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, curr->sum_exec_runtime += delta_exec; schedstat_add(cfs_rq, exec_clock, delta_exec); delta_exec_weighted = calc_delta_fair(delta_exec, curr); + curr->vruntime += delta_exec_weighted; update_min_vruntime(cfs_rq); } @@ -765,16 +766,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) se->vruntime = vruntime; } +#define ENQUEUE_WAKEUP 1 +#define ENQUEUE_MIGRATE 2 + static void -enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) +enqueue_entity(struct cfs_rq *cfs_rq, struct 
sched_entity *se, int flags) { + /* + * Update the normalized vruntime before updating min_vruntime + * through callig update_curr(). + */ + if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE)) + se->vruntime += cfs_rq->min_vruntime; + /* * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); account_entity_enqueue(cfs_rq, se); - if (wakeup) { + if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); enqueue_sleeper(cfs_rq, se); } @@ -828,6 +839,14 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) __dequeue_entity(cfs_rq, se); account_entity_dequeue(cfs_rq, se); update_min_vruntime(cfs_rq); + + /* + * Normalize the entity after updating the min_vruntime because the + * update can refer to the ->curr item and we need to reflect this + * movement in our normalized position. + */ + if (!sleep) + se->vruntime -= cfs_rq->min_vruntime; } /* @@ -1038,13 +1057,19 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; + int flags = 0; + + if (wakeup) + flags |= ENQUEUE_WAKEUP; + if (p->state == TASK_WAKING) + flags |= ENQUEUE_MIGRATE; for_each_sched_entity(se) { if (se->on_rq) break; cfs_rq = cfs_rq_of(se); - enqueue_entity(cfs_rq, se, wakeup); - wakeup = 1; + enqueue_entity(cfs_rq, se, flags); + flags = ENQUEUE_WAKEUP; } hrtick_update(rq); @@ -1120,6 +1145,14 @@ static void yield_task_fair(struct rq *rq) #ifdef CONFIG_SMP +static void task_waking_fair(struct rq *rq, struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + se->vruntime -= cfs_rq->min_vruntime; +} + #ifdef CONFIG_FAIR_GROUP_SCHED /* * effective_load() calculates the load change as seen from the root_task_group @@ -1978,6 +2011,8 @@ static void task_fork_fair(struct task_struct *p) resched_task(rq->curr); } + se->vruntime -= cfs_rq->min_vruntime; + raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -2031,12 +2066,13 @@ 
static void set_curr_task_fair(struct rq *rq) } #ifdef CONFIG_FAIR_GROUP_SCHED -static void moved_group_fair(struct task_struct *p) +static void moved_group_fair(struct task_struct *p, int on_rq) { struct cfs_rq *cfs_rq = task_cfs_rq(p); update_curr(cfs_rq); - place_entity(cfs_rq, &p->se, 1); + if (!on_rq) + place_entity(cfs_rq, &p->se, 1); } #endif @@ -2076,6 +2112,8 @@ static const struct sched_class fair_sched_class = { .move_one_task = move_one_task_fair, .rq_online = rq_online_fair, .rq_offline = rq_offline_fair, + + .task_waking = task_waking_fair, #endif .set_curr_task = set_curr_task_fair, -- cgit v1.2.3 From 738d2be4301007f054541c5c4bf7fb6a361c9b3a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 18:04:42 +0100 Subject: sched: Simplify set_task_cpu() Rearrange code a bit now that it's a simpler function. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170518.269101883@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f92ce63edff..8a2bfd37ab4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2034,11 +2034,8 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) return delta < (s64)sysctl_sched_migration_cost; } - void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { - int old_cpu = task_cpu(p); - #ifdef CONFIG_SCHED_DEBUG /* * We should never call set_task_cpu() on a blocked task, @@ -2049,11 +2046,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) trace_sched_migrate_task(p, new_cpu); - if (old_cpu != new_cpu) { - p->se.nr_migrations++; - perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, - 1, 1, NULL, 0); - } + if (task_cpu(p) == new_cpu) + return; + + p->se.nr_migrations++; + perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); __set_task_cpu(p, new_cpu); } -- cgit v1.2.3 From 6e1415467614e854fee660ff6648bd10fa976e95 Mon
Sep 17 00:00:00 2001 From: David Howells Date: Tue, 15 Dec 2009 19:27:45 +0000 Subject: NOMMU: Optimise away the {dac_,}mmap_min_addr tests In NOMMU mode clamp dac_mmap_min_addr to zero to cause the tests on it to be skipped by the compiler. We do this as the minimum mmap address doesn't make any sense in NOMMU mode. mmap_min_addr and round_hint_to_min() can be discarded entirely in NOMMU mode. Signed-off-by: David Howells Acked-by: Eric Paris Signed-off-by: James Morris --- kernel/sysctl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 45e4bef0012..856a24eadf7 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1214,6 +1214,7 @@ static struct ctl_table vm_table[] = { .proc_handler = proc_dointvec_jiffies, }, #endif +#ifdef CONFIG_MMU { .procname = "mmap_min_addr", .data = &dac_mmap_min_addr, @@ -1221,6 +1222,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = mmap_min_addr_handler, }, +#endif #ifdef CONFIG_NUMA { .procname = "numa_zonelist_order", -- cgit v1.2.3 From cf1e367ee84e02ac349ad0858eb65e8a6a511c8b Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Thu, 17 Dec 2009 11:15:42 +1100 Subject: timers: Remove duplicate setting of new_base in __mod_timer() new_base is set using per_cpu(tvec_bases, cpu) after selecting the desired value of cpu immediately below so this line is unnecessary.
Signed-off-by: Simon Horman LKML-Reference: <20091217001542.GD25317@verge.net.au> Signed-off-by: Thomas Gleixner --- kernel/timer.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 5db5a8d2681..15533b79239 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -656,8 +656,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires, debug_activate(timer, expires); - new_base = __get_cpu_var(tvec_bases); - cpu = smp_processor_id(); #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) -- cgit v1.2.3 From f6325e30ebd6fc870315b017a5d4a6ab15bf790b Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 17 Dec 2009 11:43:08 -0600 Subject: cpumask: use cpu_online in kernel/perf_event.c Also, we want to check against nr_cpu_ids, not num_possible_cpus(). The latter works, but the correct bounds check is < nr_cpu_ids. Signed-off-by: Rusty Russell To: Thomas Gleixner --- kernel/perf_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 8ab86988bd2..97d1a3dd7a5 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1614,7 +1614,7 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu) * offline CPU and activate it when the CPU comes up, but * that's for later. */ - if (!cpu_isset(cpu, cpu_online_map)) + if (!cpu_online(cpu)) return ERR_PTR(-ENODEV); cpuctx = &per_cpu(perf_cpu_context, cpu); -- cgit v1.2.3 From 62ac12795095dc959649c66ace78708e7ac52477 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 17 Dec 2009 11:43:26 -0600 Subject: cpumask: avoid dereferencing struct cpumask struct cpumask will be undefined soon with CONFIG_CPUMASK_OFFSTACK=y, to avoid them being declared on the stack. cpumask_bits() does what we want here (of course, this code is crap). 
Signed-off-by: Rusty Russell To: Thomas Gleixner --- kernel/time/timer_list.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 28265636b6c..bdfb8dd1050 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -237,10 +237,10 @@ static void timer_list_show_tickdevices(struct seq_file *m) #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST print_tickdevice(m, tick_get_broadcast_device(), -1); SEQ_printf(m, "tick_broadcast_mask: %08lx\n", - tick_get_broadcast_mask()->bits[0]); + cpumask_bits(tick_get_broadcast_mask())[0]); #ifdef CONFIG_TICK_ONESHOT SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n", - tick_get_broadcast_oneshot_mask()->bits[0]); + cpumask_bits(tick_get_broadcast_oneshot_mask())[0]); #endif SEQ_printf(m, "\n"); #endif -- cgit v1.2.3 From 416eb39556a03d1c7e52b0791e9052ccd71db241 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 17 Dec 2009 06:05:49 +0100 Subject: sched: Make warning less noisy Cc: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170517.807938893@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 8a2bfd37ab4..af7dfa74e6b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2041,7 +2041,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) * We should never call set_task_cpu() on a blocked task, * ttwu() will sort out the placement. 
*/ - WARN_ON(p->state != TASK_RUNNING && p->state != TASK_WAKING); + WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING); #endif trace_sched_migrate_task(p, new_cpu); -- cgit v1.2.3 From 234da7bcdc7aaa935846534c3b726dbc79a9cdd5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 16 Dec 2009 20:21:05 +0100 Subject: sched: Teach might_sleep() about preemptible RCU In practice, it is harmless to voluntarily sleep in a rcu_read_lock() section if we are running under preempt rcu, but it is illegal if we build a kernel running non-preemptable rcu. Currently, might_sleep() doesn't notice sleepable operations under rcu_read_lock() sections if we are running under preemptable rcu because preempt_count() is left untouched after rcu_read_lock() in this case. But we want developers who test their changes under such config to notice the "sleeping while atomic" issues. So we add rcu_read_lock_nesting to preempt_count() in might_sleep() checks. [ v2: Handle rcu-tiny ] Signed-off-by: Frederic Weisbecker Reviewed-by: Paul E. 
McKenney Cc: Peter Zijlstra LKML-Reference: <1260991265-8451-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index af7dfa74e6b..7be88a7be04 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9682,7 +9682,7 @@ void __init sched_init(void) #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP static inline int preempt_count_equals(int preempt_offset) { - int nested = preempt_count() & ~PREEMPT_ACTIVE; + int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); } -- cgit v1.2.3 From 077614ee1e93245a3b9a4e1213659405dbeb0ba6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 17 Dec 2009 13:16:31 +0100 Subject: sched: Fix broken assertion There's a preemption race in the set_task_cpu() debug check in that when we get preempted after setting task->state we'd still be on the rq proper, but fail the test. Check for preempted tasks, since those are always on the RQ. Signed-off-by: Peter Zijlstra LKML-Reference: <20091217121830.137155561@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 7be88a7be04..720df108a2d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2041,7 +2041,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) * We should never call set_task_cpu() on a blocked task, * ttwu() will sort out the placement. 
*/ - WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING); + WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && + !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); #endif trace_sched_migrate_task(p, new_cpu); -- cgit v1.2.3 From 3e26120cc7c819c97bc07281ca1fb9017cfe9a39 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 17 Dec 2009 15:27:05 -0800 Subject: kernel/sysctl.c: fix the incomplete part of sysctl_max_map_count-should-be-non-negative.patch It is a mistake that we used 'proc_dointvec', it should be 'proc_dointvec_minmax', as in the original patch. Signed-off-by: WANG Cong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 45e4bef0012..6665761c006 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1131,7 +1131,7 @@ static struct ctl_table vm_table[] = { .data = &sysctl_max_map_count, .maxlen = sizeof(sysctl_max_map_count), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &zero, }, #else -- cgit v1.2.3 From 9cd80bbb07fcd6d4d037fad4297496d3b132ac6b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 17 Dec 2009 15:27:15 -0800 Subject: do_wait() optimization: do not place sub-threads on task_struct->children list Thanks to Roland who pointed out de_thread() issues. Currently we add sub-threads to ->real_parent->children list. This buys nothing but slows down do_wait(). With this patch ->children contains only main threads (group leaders). The only complication is that forget_original_parent() should iterate over sub-threads by hand, and de_thread() needs another list_replace() when it changes ->group_leader. Henceforth do_wait_thread() can never see task_detached() && !EXIT_DEAD tasks, we can remove this check (and we can unify do_wait_thread() and ptrace_do_wait()). 
This change can confuse the optimistic search in mm_update_next_owner(), but this is fixable and minor. Perhaps badness() and oom_kill_process() should be updated, but they should be fixed in any case. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Cc: Ingo Molnar Cc: Ratan Nalumasu Cc: Vitaly Mayatskikh Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 36 +++++++++++++++++------------------- kernel/fork.c | 2 +- 2 files changed, 18 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 5962d7ccf24..546774a31a6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -68,10 +68,10 @@ static void __unhash_process(struct task_struct *p) detach_pid(p, PIDTYPE_SID); list_del_rcu(&p->tasks); + list_del_init(&p->sibling); __get_cpu_var(process_counts)--; } list_del_rcu(&p->thread_group); - list_del_init(&p->sibling); } /* @@ -736,12 +736,9 @@ static struct task_struct *find_new_reaper(struct task_struct *father) /* * Any that need to be release_task'd are put on the @dead list. 
*/ -static void reparent_thread(struct task_struct *father, struct task_struct *p, +static void reparent_leader(struct task_struct *father, struct task_struct *p, struct list_head *dead) { - if (p->pdeath_signal) - group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p); - list_move_tail(&p->sibling, &p->real_parent->children); if (task_detached(p)) @@ -780,12 +777,18 @@ static void forget_original_parent(struct task_struct *father) reaper = find_new_reaper(father); list_for_each_entry_safe(p, n, &father->children, sibling) { - p->real_parent = reaper; - if (p->parent == father) { - BUG_ON(task_ptrace(p)); - p->parent = p->real_parent; - } - reparent_thread(father, p, &dead_children); + struct task_struct *t = p; + do { + t->real_parent = reaper; + if (t->parent == father) { + BUG_ON(task_ptrace(t)); + t->parent = t->real_parent; + } + if (t->pdeath_signal) + group_send_sig_info(t->pdeath_signal, + SEND_SIG_NOINFO, t); + } while_each_thread(p, t); + reparent_leader(father, p, &dead_children); } write_unlock_irq(&tasklist_lock); @@ -1551,14 +1554,9 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) struct task_struct *p; list_for_each_entry(p, &tsk->children, sibling) { - /* - * Do not consider detached threads. 
- */ - if (!task_detached(p)) { - int ret = wait_consider_task(wo, 0, p); - if (ret) - return ret; - } + int ret = wait_consider_task(wo, 0, p); + if (ret) + return ret; } return 0; diff --git a/kernel/fork.c b/kernel/fork.c index 202a0ba63d3..5b2959b3ffc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1291,7 +1291,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, } if (likely(p->pid)) { - list_add_tail(&p->sibling, &p->real_parent->children); tracehook_finish_clone(p, clone_flags, trace); if (thread_group_leader(p)) { @@ -1303,6 +1302,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->signal->tty = tty_kref_get(current->signal->tty); attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); attach_pid(p, PIDTYPE_SID, task_session(current)); + list_add_tail(&p->sibling, &p->real_parent->children); list_add_tail_rcu(&p->tasks, &init_task.tasks); __get_cpu_var(process_counts)++; } -- cgit v1.2.3 From 6485536bcf499839a54dcda8a8d47ea0bd29b375 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 17 Dec 2009 15:27:27 -0800 Subject: printk: fix new kernel-doc warnings Fix kernel-doc warnings in printk.c: Warning(kernel/printk.c:1422): No description found for parameter 'dumper' Warning(kernel/printk.c:1422): Excess function parameter 'dump' description in 'kmsg_dump_register' Warning(kernel/printk.c:1451): No description found for parameter 'dumper' Warning(kernel/printk.c:1451): Excess function parameter 'dump' description in 'kmsg_dump_unregister' Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 1ded8e7dd19..17463ca2e22 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1412,7 +1412,7 @@ static LIST_HEAD(dump_list); /** * kmsg_dump_register - register a kernel log dumper. 
- * @dump: pointer to the kmsg_dumper structure + * @dumper: pointer to the kmsg_dumper structure * * Adds a kernel log dumper to the system. The dump callback in the * structure will be called when the kernel oopses or panics and must be @@ -1442,7 +1442,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_register); /** * kmsg_dump_unregister - unregister a kmsg dumper. - * @dump: pointer to the kmsg_dumper structure + * @dumper: pointer to the kmsg_dumper structure * * Removes a dump device from the system. Returns zero on success and * %-EINVAL otherwise. -- cgit v1.2.3 From 6f5d51148921c242680a7a1d9913384a30ab3cbe Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 19 Dec 2009 15:59:45 +0000 Subject: fix braindamage in audit_tree.c untag_chunk() ... aka "Al had badly fscked up when writing that thing and nobody noticed until Eric had fixed leaks that used to mask the breakage". The function essentially creates a copy of old array sans one element and replaces the references to elements of original (they are on cyclic lists) with those to corresponding elements of new one. After that the old one is fair game for freeing. First of all, there's a dumb braino: when we get to list_replace_init we use indices for wrong arrays - position in new one with the old array and vice versa. Another bug is more subtle - termination condition is wrong if the element to be excluded happens to be the last one. We shouldn't go until we fill the new array, we should go until we'd finished the old one. Otherwise the element we are trying to kill will remain on the cyclic lists... That crap used to be masked by several leaks, so it was not quite trivial to hit. Eric had fixed some of those leaks a while ago and the shit had hit the fan... 
Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- kernel/audit_tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 2451dc6f328..b36aa9651ba 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -277,7 +277,7 @@ static void untag_chunk(struct node *p) owner->root = NULL; } - for (i = j = 0; i < size; i++, j++) { + for (i = j = 0; j <= size; i++, j++) { struct audit_tree *s; if (&chunk->owners[j] == p) { list_del_init(&p->list); @@ -290,7 +290,7 @@ static void untag_chunk(struct node *p) if (!s) /* result of earlier fallback */ continue; get_tree(s); - list_replace_init(&chunk->owners[i].list, &new->owners[j].list); + list_replace_init(&chunk->owners[j].list, &new->owners[i].list); } list_replace_rcu(&chunk->hash, &new->hash); -- cgit v1.2.3 From b4c30aad39805902cf5b855aa8a8b22d728ad057 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 19 Dec 2009 16:03:30 +0000 Subject: fix more leaks in audit_tree.c tag_chunk() Several leaks in audit_tree didn't get caught by commit 318b6d3d7ddbcad3d6867e630711b8a705d873d7, including the leak on normal exit in case of multiple rules referring to the same chunk. 
Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- kernel/audit_tree.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index b36aa9651ba..4b05bd9479d 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -373,15 +373,17 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) for (n = 0; n < old->count; n++) { if (old->owners[n].owner == tree) { spin_unlock(&hash_lock); - put_inotify_watch(watch); + put_inotify_watch(&old->watch); return 0; } } spin_unlock(&hash_lock); chunk = alloc_chunk(old->count + 1); - if (!chunk) + if (!chunk) { + put_inotify_watch(&old->watch); return -ENOMEM; + } mutex_lock(&inode->inotify_mutex); if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) { @@ -425,7 +427,8 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) spin_unlock(&hash_lock); inotify_evict_watch(&old->watch); mutex_unlock(&inode->inotify_mutex); - put_inotify_watch(&old->watch); + put_inotify_watch(&old->watch); /* pair to inotify_find_watch */ + put_inotify_watch(&old->watch); /* and kill it */ return 0; } -- cgit v1.2.3 From 3df0fc5b2e9d8092dcaeb5ae0b6753d85c851d66 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 20 Dec 2009 14:23:57 +0100 Subject: sched: Restore printk sanity Revert the braindead pr_* crap. (Commit 663997d "sched: Use pr_fmt() and pr_()") It's dumb and causes stupid "sched: " strings all over the place. Signed-off-by: Peter Zijlstra Acked-by: Mike Galbraith Cc: Joe Perches Cc: Linus Torvalds Cc: Andrew Morton LKML-Reference: <1261315437.4314.6.camel@laptop> [ i dont mind the pr_*() patterns that much - but Peter dislikes them with a vengence. 
] [ - v2: remove spurious diffstat from changelog :-/ ] Signed-off-by: Ingo Molnar --- kernel/sched.c | 89 +++++++++++++++++++++++++++---------------------- kernel/sched_idletask.c | 2 +- 2 files changed, 50 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 720df108a2d..7ffde2ae786 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -26,8 +26,6 @@ * Thomas Gleixner, Mike Kravetz */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - #include #include #include @@ -5375,8 +5373,8 @@ static noinline void __schedule_bug(struct task_struct *prev) { struct pt_regs *regs = get_irq_regs(); - pr_err("BUG: scheduling while atomic: %s/%d/0x%08x\n", - prev->comm, prev->pid, preempt_count()); + printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", + prev->comm, prev->pid, preempt_count()); debug_show_held_locks(prev); print_modules(); @@ -6940,23 +6938,23 @@ void sched_show_task(struct task_struct *p) unsigned state; state = p->state ? __ffs(p->state) + 1 : 0; - pr_info("%-13.13s %c", p->comm, + printk(KERN_INFO "%-13.13s %c", p->comm, state < sizeof(stat_nam) - 1 ? 
stat_nam[state] : '?'); #if BITS_PER_LONG == 32 if (state == TASK_RUNNING) - pr_cont(" running "); + printk(KERN_CONT " running "); else - pr_cont(" %08lx ", thread_saved_pc(p)); + printk(KERN_CONT " %08lx ", thread_saved_pc(p)); #else if (state == TASK_RUNNING) - pr_cont(" running task "); + printk(KERN_CONT " running task "); else - pr_cont(" %016lx ", thread_saved_pc(p)); + printk(KERN_CONT " %016lx ", thread_saved_pc(p)); #endif #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); #endif - pr_cont("%5lu %5d %6d 0x%08lx\n", free, + printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, task_pid_nr(p), task_pid_nr(p->real_parent), (unsigned long)task_thread_info(p)->flags); @@ -6968,9 +6966,11 @@ void show_state_filter(unsigned long state_filter) struct task_struct *g, *p; #if BITS_PER_LONG == 32 - pr_info(" task PC stack pid father\n"); + printk(KERN_INFO + " task PC stack pid father\n"); #else - pr_info(" task PC stack pid father\n"); + printk(KERN_INFO + " task PC stack pid father\n"); #endif read_lock(&tasklist_lock); do_each_thread(g, p) { @@ -7828,44 +7828,48 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, printk(KERN_DEBUG "%*s domain %d: ", level, "", level); if (!(sd->flags & SD_LOAD_BALANCE)) { - pr_cont("does not load-balance\n"); + printk("does not load-balance\n"); if (sd->parent) - pr_err("ERROR: !SD_LOAD_BALANCE domain has parent\n"); + printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" + " has parent"); return -1; } - pr_cont("span %s level %s\n", str, sd->name); + printk(KERN_CONT "span %s level %s\n", str, sd->name); if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { - pr_err("ERROR: domain->span does not contain CPU%d\n", cpu); + printk(KERN_ERR "ERROR: domain->span does not contain " + "CPU%d\n", cpu); } if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { - pr_err("ERROR: domain->groups does not contain CPU%d\n", cpu); + printk(KERN_ERR "ERROR: domain->groups does not contain" + " CPU%d\n", cpu); } 
printk(KERN_DEBUG "%*s groups:", level + 1, ""); do { if (!group) { - pr_cont("\n"); - pr_err("ERROR: group is NULL\n"); + printk("\n"); + printk(KERN_ERR "ERROR: group is NULL\n"); break; } if (!group->cpu_power) { - pr_cont("\n"); - pr_err("ERROR: domain->cpu_power not set\n"); + printk(KERN_CONT "\n"); + printk(KERN_ERR "ERROR: domain->cpu_power not " + "set\n"); break; } if (!cpumask_weight(sched_group_cpus(group))) { - pr_cont("\n"); - pr_err("ERROR: empty group\n"); + printk(KERN_CONT "\n"); + printk(KERN_ERR "ERROR: empty group\n"); break; } if (cpumask_intersects(groupmask, sched_group_cpus(group))) { - pr_cont("\n"); - pr_err("ERROR: repeated CPUs\n"); + printk(KERN_CONT "\n"); + printk(KERN_ERR "ERROR: repeated CPUs\n"); break; } @@ -7873,21 +7877,23 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); - pr_cont(" %s", str); + printk(KERN_CONT " %s", str); if (group->cpu_power != SCHED_LOAD_SCALE) { - pr_cont(" (cpu_power = %d)", group->cpu_power); + printk(KERN_CONT " (cpu_power = %d)", + group->cpu_power); } group = group->next; } while (group != sd->groups); - pr_cont("\n"); + printk(KERN_CONT "\n"); if (!cpumask_equal(sched_domain_span(sd), groupmask)) - pr_err("ERROR: groups don't span domain->span\n"); + printk(KERN_ERR "ERROR: groups don't span domain->span\n"); if (sd->parent && !cpumask_subset(groupmask, sched_domain_span(sd->parent))) - pr_err("ERROR: parent span is not a superset of domain->span\n"); + printk(KERN_ERR "ERROR: parent span is not a superset " + "of domain->span\n"); return 0; } @@ -8443,7 +8449,8 @@ static int build_numa_sched_groups(struct s_data *d, sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), GFP_KERNEL, num); if (!sg) { - pr_warning("Can not alloc domain group for node %d\n", num); + printk(KERN_WARNING "Can not alloc domain group for node %d\n", + num); return -ENOMEM; } d->sched_group_nodes[num] = sg; @@ -8472,8 
+8479,8 @@ static int build_numa_sched_groups(struct s_data *d, sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), GFP_KERNEL, num); if (!sg) { - pr_warning("Can not alloc domain group for node %d\n", - j); + printk(KERN_WARNING + "Can not alloc domain group for node %d\n", j); return -ENOMEM; } sg->cpu_power = 0; @@ -8701,7 +8708,7 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, d->sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), GFP_KERNEL); if (!d->sched_group_nodes) { - pr_warning("Can not alloc sched group node list\n"); + printk(KERN_WARNING "Can not alloc sched group node list\n"); return sa_notcovered; } sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; @@ -8718,7 +8725,7 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, return sa_send_covered; d->rd = alloc_rootdomain(); if (!d->rd) { - pr_warning("Cannot alloc root domain\n"); + printk(KERN_WARNING "Cannot alloc root domain\n"); return sa_tmpmask; } return sa_rootdomain; @@ -9700,11 +9707,13 @@ void __might_sleep(char *file, int line, int preempt_offset) return; prev_jiffy = jiffies; - pr_err("BUG: sleeping function called from invalid context at %s:%d\n", - file, line); - pr_err("in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", - in_atomic(), irqs_disabled(), - current->pid, current->comm); + printk(KERN_ERR + "BUG: sleeping function called from invalid context at %s:%d\n", + file, line); + printk(KERN_ERR + "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), + current->pid, current->comm); debug_show_held_locks(current); if (irqs_disabled()) diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 21b969a2872..5f93b570d38 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -35,7 +35,7 @@ static void dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep) { raw_spin_unlock_irq(&rq->lock); - pr_err("bad: 
scheduling from the idle thread!\n"); + printk(KERN_ERR "bad: scheduling from the idle thread!\n"); dump_stack(); raw_spin_lock_irq(&rq->lock); } -- cgit v1.2.3 From 70f1120527797adb31c68bdc6f1b45e182c342c7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 20 Dec 2009 17:36:27 +0100 Subject: sched: Fix hotplug hang The hot-unplug kstopmachine usage does a wakeup after deactivating the cpu, hence we cannot use cpu_active() here but must rely on the good olde online. Reported-by: Sachin Sant Reported-by: Jens Axboe Signed-off-by: Peter Zijlstra Tested-by: Jens Axboe Cc: Heiko Carstens Cc: Benjamin Herrenschmidt LKML-Reference: <1261326987.4314.24.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 7ffde2ae786..87f1f47beff 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2346,7 +2346,7 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) * not worry about this generic constraint ] */ if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || - !cpu_active(cpu))) + !cpu_online(cpu))) cpu = select_fallback_rq(task_cpu(p), p); return cpu; -- cgit v1.2.3 From 0e2c8b8f55072a98b99e7bdad55c912084d6a526 Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Sun, 20 Dec 2009 10:50:02 +0100 Subject: resources: fix call to alignf() in allocate_resource() The second parameter to alignf() in allocate_resource() must reflect what new resource is attempted to be allocated, else functions like pcibios_align_resource() (at least on x86) or pcmcia_align() can't work correctly. Commit 1e5ad9679016275d422e36b12a98b0927d76f556 broke this by setting the "new" resource until we're about to return success. To keep the resource untouched when allocate_resource() fails, a "tmp" resource is introduced. 
Signed-off-by: Dominik Brodowski Acked-by: Bjorn Helgaas Cc: Yinghai Lu Cc: Jesse Barnes Signed-off-by: Linus Torvalds --- kernel/resource.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index dc15686b7a7..af96c1e4b54 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -308,37 +308,37 @@ static int find_resource(struct resource *root, struct resource *new, void *alignf_data) { struct resource *this = root->child; - resource_size_t start, end; + struct resource tmp = *new; - start = root->start; + tmp.start = root->start; /* * Skip past an allocated resource that starts at 0, since the assignment - * of this->start - 1 to new->end below would cause an underflow. + * of this->start - 1 to tmp->end below would cause an underflow. */ if (this && this->start == 0) { - start = this->end + 1; + tmp.start = this->end + 1; this = this->sibling; } for(;;) { if (this) - end = this->start - 1; + tmp.end = this->start - 1; else - end = root->end; - if (start < min) - start = min; - if (end > max) - end = max; - start = ALIGN(start, align); + tmp.end = root->end; + if (tmp.start < min) + tmp.start = min; + if (tmp.end > max) + tmp.end = max; + tmp.start = ALIGN(tmp.start, align); if (alignf) - alignf(alignf_data, new, size, align); - if (start < end && end - start >= size - 1) { - new->start = start; - new->end = start + size - 1; + alignf(alignf_data, &tmp, size, align); + if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) { + new->start = tmp.start; + new->end = tmp.start + size - 1; return 0; } if (!this) break; - start = this->end + 1; + tmp.start = this->end + 1; this = this->sibling; } return -EBUSY; -- cgit v1.2.3 From c757bea93bea4b77ebd181cc6dca60c15e3b1a2c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 21 Dec 2009 22:35:16 -0500 Subject: tracing: Fix setting tracer specific options The function __set_tracer_option() takes as its 
last parameter a "neg" value. If set it should negate the value of the option. The trace_options_write() passed the value written to the file which is what the new value needs to be set as. But since this is not the negative, it never sets the value. Reported-by: Peter Zijlstra Cc: Li Zefan Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ee61915935d..d0a4c12d1f1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3949,7 +3949,7 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, if (!!(topt->flags->val & topt->opt->bit) != val) { mutex_lock(&trace_types_lock); ret = __set_tracer_option(current_trace, topt->flags, - topt->opt, val); + topt->opt, !val); mutex_unlock(&trace_types_lock); if (ret) return ret; -- cgit v1.2.3 From 628ff7c1d8d8466a5ad8078bd0206a130f8b8a51 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Fri, 18 Dec 2009 09:41:24 -0800 Subject: anonfd: Allow making anon files read-only It seems a couple places such as arch/ia64/kernel/perfmon.c and drivers/infiniband/core/uverbs_main.c could use anon_inode_getfile() instead of a private pseudo-fs + alloc_file(), if only there were a way to get a read-only file. So provide this by having anon_inode_getfile() create a read-only file if we pass O_RDONLY in flags. 
Signed-off-by: Roland Dreier Signed-off-by: Al Viro --- kernel/perf_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index e0eb4a2fe18..1f38270f08c 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4724,7 +4724,7 @@ SYSCALL_DEFINE5(perf_event_open, if (IS_ERR(event)) goto err_put_context; - err = anon_inode_getfd("[perf_event]", &perf_fops, event, 0); + err = anon_inode_getfd("[perf_event]", &perf_fops, event, O_RDWR); if (err < 0) goto err_free_put_context; -- cgit v1.2.3 From 5300990c0370e804e49d9a59d928c5d53fb73487 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 19 Dec 2009 10:15:07 -0500 Subject: Sanitize f_flags helpers * pull ACC_MODE to fs.h; we have several copies all over the place * nightmarish expression calculating f_mode by f_flags deserves a helper too (OPEN_FMODE(flags)) Signed-off-by: Al Viro --- kernel/auditsc.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 267e484f019..fc0f928167e 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -250,7 +250,6 @@ struct audit_context { #endif }; -#define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE]) static inline int open_arg(int flags, int mask) { int n = ACC_MODE(flags); -- cgit v1.2.3 From 83f57a11d84460dfe2afdb5a8bc759953428e38b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 22 Dec 2009 14:10:37 -0800 Subject: Revert "time: Remove xtime_cache" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 7bc7d637452383d56ba4368d4336b0dde1bb476d, as requested by John Stultz. Quoting John: "Petr Titěra reported an issue where he saw odd atime regressions with 2.6.33 where there were a full second worth of nanoseconds in the nanoseconds field. 
He also reviewed the time code and narrowed down the problem: unhandled overflow of the nanosecond field caused by rounding up the sub-nanosecond accumulated time. Details: * At the end of update_wall_time(), we currently round up the sub-nanosecond portion of accumulated time when storing it into xtime. This was added to avoid time inconsistencies caused when the sub-nanosecond portion was truncated when storing into xtime. Unfortunately we don't handle the possible second overflow caused by that rounding. * Previously the xtime_cache code hid this overflow by normalizing the xtime value when storing into the xtime_cache. * We could try to handle the second overflow after the rounding up, but since this affects the timekeeping's internal state, this would further complicate the next accumulation cycle, causing small errors in ntp steering. As much as I'd like to get rid of it, the xtime_cache code is known to work. * The correct fix is really to include the sub-nanosecond portion in the timekeeping accessor function, so we don't need to round up during accumulation. This would greatly simplify the accumulation code. Unfortunately, we can't do this safely until the last three non-GENERIC_TIME arches (sparc32, arm, cris) are converted (those patches are in -mm) and we kill off the spots where arches set xtime directly. This is all 2.6.34 material, so I think reverting the xtime_cache change is the best approach for now. Many thanks to Petr for both reporting and finding the issue!" 
Reported-by: Petr Titěra Requested-by: john stultz Cc: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/time.c | 1 + kernel/time/timekeeping.c | 27 +++++++++++++++++++++++---- 2 files changed, 24 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time.c b/kernel/time.c index c6324d96009..804798005d1 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -136,6 +136,7 @@ static inline void warp_clock(void) write_seqlock_irq(&xtime_lock); wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; xtime.tv_sec += sys_tz.tz_minuteswest * 60; + update_xtime_cache(0); write_sequnlock_irq(&xtime_lock); clock_was_set(); } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index af4135f0582..7faaa32fbf4 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -165,6 +165,13 @@ struct timespec raw_time; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; +static struct timespec xtime_cache __attribute__ ((aligned (16))); +void update_xtime_cache(u64 nsec) +{ + xtime_cache = xtime; + timespec_add_ns(&xtime_cache, nsec); +} + /* must hold xtime_lock */ void timekeeping_leap_insert(int leapsecond) { @@ -325,6 +332,8 @@ int do_settimeofday(struct timespec *tv) xtime = *tv; + update_xtime_cache(0); + timekeeper.ntp_error = 0; ntp_clear(); @@ -550,6 +559,7 @@ void __init timekeeping_init(void) } set_normalized_timespec(&wall_to_monotonic, -boot.tv_sec, -boot.tv_nsec); + update_xtime_cache(0); total_sleep_time.tv_sec = 0; total_sleep_time.tv_nsec = 0; write_sequnlock_irqrestore(&xtime_lock, flags); @@ -583,6 +593,7 @@ static int timekeeping_resume(struct sys_device *dev) wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); total_sleep_time = timespec_add_safe(total_sleep_time, ts); } + update_xtime_cache(0); /* re-base the last cycle value */ timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); timekeeper.ntp_error = 0; @@ -722,6 +733,7 @@ static void timekeeping_adjust(s64 
offset) timekeeper.ntp_error_shift; } + /** * logarithmic_accumulation - shifted accumulation of cycles * @@ -765,6 +777,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) return offset; } + /** * update_wall_time - Uses the current clocksource to increment the wall time * @@ -774,6 +787,7 @@ void update_wall_time(void) { struct clocksource *clock; cycle_t offset; + u64 nsecs; int shift = 0, maxshift; /* Make sure we're fully resumed: */ @@ -839,6 +853,9 @@ void update_wall_time(void) timekeeper.ntp_error += timekeeper.xtime_nsec << timekeeper.ntp_error_shift; + nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift); + update_xtime_cache(nsecs); + /* check to see if there is a new clocksource to use */ update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); } @@ -875,13 +892,13 @@ void monotonic_to_bootbased(struct timespec *ts) unsigned long get_seconds(void) { - return xtime.tv_sec; + return xtime_cache.tv_sec; } EXPORT_SYMBOL(get_seconds); struct timespec __current_kernel_time(void) { - return xtime; + return xtime_cache; } struct timespec current_kernel_time(void) @@ -891,7 +908,8 @@ struct timespec current_kernel_time(void) do { seq = read_seqbegin(&xtime_lock); - now = xtime; + + now = xtime_cache; } while (read_seqretry(&xtime_lock, seq)); return now; @@ -905,7 +923,8 @@ struct timespec get_monotonic_coarse(void) do { seq = read_seqbegin(&xtime_lock); - now = xtime; + + now = xtime_cache; mono = wall_to_monotonic; } while (read_seqretry(&xtime_lock, seq)); -- cgit v1.2.3 From 45465487897a1c6d508b14b904dc5777f7ec7e04 Mon Sep 17 00:00:00 2001 From: Stefani Seibold Date: Mon, 21 Dec 2009 14:37:26 -0800 Subject: kfifo: move struct kfifo in place This is a new generic kernel FIFO implementation. The current kernel fifo API is not very widely used, because it has too many constraints. Only 17 files in the current 2.6.31-rc5 used it. 
FIFOs, like lists, are a very basic thing, and a kfifo API which handles the most common use cases would save a lot of development time and memory resources. I think these are the reasons why kfifo is not in use: - The API is too simple, important functions are missing - A fifo can only be allocated dynamically - There is a requirement of a spinlock whether you need it or not - There is no support for data records inside a fifo So I decided to extend the kfifo in a more generic way without blowing up the API too much. The new API has the following benefits: - Generic usage: For kernel internal use and/or device drivers. - Provides an API for the most common use cases. - Slim API: The whole API provides 25 functions. - Linux style habit. - DECLARE_KFIFO, DEFINE_KFIFO and INIT_KFIFO macros - Direct copy_to_user from the fifo and copy_from_user into the fifo. - The kfifo itself is an in-place member of the using data structure; this saves an indirection access and does not waste the kernel allocator. - Lockless access: if only one reader and one writer are active on the fifo, which is the common use case, no additional locking is necessary. - Remove spinlock - give the user the freedom of choice what kind of locking to use if one is required. - Ability to handle records. Three types of records are supported: - Variable length records between 0-255 bytes, with a record size field of 1 byte. - Variable length records between 0-65535 bytes, with a record size field of 2 bytes. - Fixed size records, with no record size field. - Preserve memory resources. - Performance! - Easy to use! This patch: Since most users want to have the kfifo as part of another object, reorganize the code to allow including struct kfifo in another data structure. This requires changing the kfifo_alloc and kfifo_init prototypes so that we pass an existing kfifo pointer into them. This patch changes the implementation and all existing users.
[akpm@linux-foundation.org: fix warning] Signed-off-by: Stefani Seibold Acked-by: Greg Kroah-Hartman Acked-by: Mauro Carvalho Chehab Acked-by: Andi Kleen Acked-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kfifo.c | 65 +++++++++++++++++++++++++++++----------------------------- 1 file changed, 33 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 3765ff3c1bb..8da6bb9782b 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -1,6 +1,7 @@ /* - * A simple kernel FIFO implementation. + * A generic kernel FIFO implementation. * + * Copyright (C) 2009 Stefani Seibold * Copyright (C) 2004 Stelian Pop * * This program is free software; you can redistribute it and/or modify @@ -26,49 +27,51 @@ #include #include +static void _kfifo_init(struct kfifo *fifo, unsigned char *buffer, + unsigned int size, spinlock_t *lock) +{ + fifo->buffer = buffer; + fifo->size = size; + fifo->lock = lock; + + kfifo_reset(fifo); +} + /** - * kfifo_init - allocates a new FIFO using a preallocated buffer + * kfifo_init - initialize a FIFO using a preallocated buffer + * @fifo: the fifo to assign the buffer * @buffer: the preallocated buffer to be used. * @size: the size of the internal buffer, this have to be a power of 2. - * @gfp_mask: get_free_pages mask, passed to kmalloc() * @lock: the lock to be used to protect the fifo buffer * - * Do NOT pass the kfifo to kfifo_free() after use! Simply free the - * &struct kfifo with kfree(). 
*/ -struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size, - gfp_t gfp_mask, spinlock_t *lock) +void kfifo_init(struct kfifo *fifo, unsigned char *buffer, unsigned int size, + spinlock_t *lock) { - struct kfifo *fifo; - /* size must be a power of 2 */ BUG_ON(!is_power_of_2(size)); - fifo = kmalloc(sizeof(struct kfifo), gfp_mask); - if (!fifo) - return ERR_PTR(-ENOMEM); - - fifo->buffer = buffer; - fifo->size = size; - fifo->in = fifo->out = 0; - fifo->lock = lock; - - return fifo; + _kfifo_init(fifo, buffer, size, lock); } EXPORT_SYMBOL(kfifo_init); /** - * kfifo_alloc - allocates a new FIFO and its internal buffer - * @size: the size of the internal buffer to be allocated. + * kfifo_alloc - allocates a new FIFO internal buffer + * @fifo: the fifo to assign then new buffer + * @size: the size of the buffer to be allocated, this have to be a power of 2. * @gfp_mask: get_free_pages mask, passed to kmalloc() * @lock: the lock to be used to protect the fifo buffer * + * This function dynamically allocates a new fifo internal buffer + * * The size will be rounded-up to a power of 2. + * The buffer will be release with kfifo_free(). 
+ * Return 0 if no error, otherwise the an error code */ -struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock) +int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask, + spinlock_t *lock) { unsigned char *buffer; - struct kfifo *ret; /* * round up to the next power of 2, since our 'let the indices @@ -80,26 +83,24 @@ struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock) } buffer = kmalloc(size, gfp_mask); - if (!buffer) - return ERR_PTR(-ENOMEM); - - ret = kfifo_init(buffer, size, gfp_mask, lock); + if (!buffer) { + _kfifo_init(fifo, 0, 0, NULL); + return -ENOMEM; + } - if (IS_ERR(ret)) - kfree(buffer); + _kfifo_init(fifo, buffer, size, lock); - return ret; + return 0; } EXPORT_SYMBOL(kfifo_alloc); /** - * kfifo_free - frees the FIFO + * kfifo_free - frees the FIFO internal buffer * @fifo: the fifo to be freed. */ void kfifo_free(struct kfifo *fifo) { kfree(fifo->buffer); - kfree(fifo); } EXPORT_SYMBOL(kfifo_free); -- cgit v1.2.3 From c1e13f25674ed564948ecb7dfe5f83e578892896 Mon Sep 17 00:00:00 2001 From: Stefani Seibold Date: Mon, 21 Dec 2009 14:37:27 -0800 Subject: kfifo: move out spinlock Move the pointer to the spinlock out of struct kfifo. Most users in tree do not actually use a spinlock, so the few exceptions now have to call kfifo_{get,put}_locked, which takes an extra argument to a spinlock. 
Signed-off-by: Stefani Seibold Acked-by: Greg Kroah-Hartman Acked-by: Mauro Carvalho Chehab Acked-by: Andi Kleen Acked-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kfifo.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 8da6bb9782b..4950bdbe347 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -28,11 +28,10 @@ #include static void _kfifo_init(struct kfifo *fifo, unsigned char *buffer, - unsigned int size, spinlock_t *lock) + unsigned int size) { fifo->buffer = buffer; fifo->size = size; - fifo->lock = lock; kfifo_reset(fifo); } @@ -42,16 +41,14 @@ static void _kfifo_init(struct kfifo *fifo, unsigned char *buffer, * @fifo: the fifo to assign the buffer * @buffer: the preallocated buffer to be used. * @size: the size of the internal buffer, this have to be a power of 2. - * @lock: the lock to be used to protect the fifo buffer * */ -void kfifo_init(struct kfifo *fifo, unsigned char *buffer, unsigned int size, - spinlock_t *lock) +void kfifo_init(struct kfifo *fifo, unsigned char *buffer, unsigned int size) { /* size must be a power of 2 */ BUG_ON(!is_power_of_2(size)); - _kfifo_init(fifo, buffer, size, lock); + _kfifo_init(fifo, buffer, size); } EXPORT_SYMBOL(kfifo_init); @@ -60,7 +57,6 @@ EXPORT_SYMBOL(kfifo_init); * @fifo: the fifo to assign then new buffer * @size: the size of the buffer to be allocated, this have to be a power of 2. * @gfp_mask: get_free_pages mask, passed to kmalloc() - * @lock: the lock to be used to protect the fifo buffer * * This function dynamically allocates a new fifo internal buffer * @@ -68,8 +64,7 @@ EXPORT_SYMBOL(kfifo_init); * The buffer will be release with kfifo_free(). 
* Return 0 if no error, otherwise the an error code */ -int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask, - spinlock_t *lock) +int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask) { unsigned char *buffer; @@ -84,11 +79,11 @@ int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask, buffer = kmalloc(size, gfp_mask); if (!buffer) { - _kfifo_init(fifo, 0, 0, NULL); + _kfifo_init(fifo, 0, 0); return -ENOMEM; } - _kfifo_init(fifo, buffer, size, lock); + _kfifo_init(fifo, buffer, size); return 0; } -- cgit v1.2.3 From e64c026dd09b73faf20707711402fc5ed55a8e70 Mon Sep 17 00:00:00 2001 From: Stefani Seibold Date: Mon, 21 Dec 2009 14:37:28 -0800 Subject: kfifo: cleanup namespace change name of __kfifo_* functions to kfifo_*, because the prefix __kfifo should be reserved for internal functions only. Signed-off-by: Stefani Seibold Acked-by: Greg Kroah-Hartman Acked-by: Mauro Carvalho Chehab Acked-by: Andi Kleen Acked-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kfifo.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 4950bdbe347..963ffde4af1 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -100,7 +100,7 @@ void kfifo_free(struct kfifo *fifo) EXPORT_SYMBOL(kfifo_free); /** - * __kfifo_put - puts some data into the FIFO, no locking version + * kfifo_put - puts some data into the FIFO, no locking version * @fifo: the fifo to be used. * @buffer: the data to be added. * @len: the length of the data to be added. @@ -112,7 +112,7 @@ EXPORT_SYMBOL(kfifo_free); * Note that with only one concurrent reader and one concurrent * writer, you don't need extra locking to use these functions. 
*/ -unsigned int __kfifo_put(struct kfifo *fifo, +unsigned int kfifo_put(struct kfifo *fifo, const unsigned char *buffer, unsigned int len) { unsigned int l; @@ -144,10 +144,10 @@ unsigned int __kfifo_put(struct kfifo *fifo, return len; } -EXPORT_SYMBOL(__kfifo_put); +EXPORT_SYMBOL(kfifo_put); /** - * __kfifo_get - gets some data from the FIFO, no locking version + * kfifo_get - gets some data from the FIFO, no locking version * @fifo: the fifo to be used. * @buffer: where the data must be copied. * @len: the size of the destination buffer. @@ -158,7 +158,7 @@ EXPORT_SYMBOL(__kfifo_put); * Note that with only one concurrent reader and one concurrent * writer, you don't need extra locking to use these functions. */ -unsigned int __kfifo_get(struct kfifo *fifo, +unsigned int kfifo_get(struct kfifo *fifo, unsigned char *buffer, unsigned int len) { unsigned int l; @@ -190,4 +190,4 @@ unsigned int __kfifo_get(struct kfifo *fifo, return len; } -EXPORT_SYMBOL(__kfifo_get); +EXPORT_SYMBOL(kfifo_get); -- cgit v1.2.3 From 7acd72eb85f1c7a15e8b5eb554994949241737f1 Mon Sep 17 00:00:00 2001 From: Stefani Seibold Date: Mon, 21 Dec 2009 14:37:28 -0800 Subject: kfifo: rename kfifo_put... into kfifo_in... and kfifo_get... into kfifo_out... rename kfifo_put... into kfifo_in... to prevent miss use of old non in kernel-tree drivers ditto for kfifo_get... -> kfifo_out... Improve the prototypes of kfifo_in and kfifo_out to make the kerneldoc annotations more readable. 
Add mini "howto porting to the new API" in kfifo.h Signed-off-by: Stefani Seibold Acked-by: Greg Kroah-Hartman Acked-by: Mauro Carvalho Chehab Acked-by: Andi Kleen Acked-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kfifo.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 963ffde4af1..d659442e73f 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -100,20 +100,20 @@ void kfifo_free(struct kfifo *fifo) EXPORT_SYMBOL(kfifo_free); /** - * kfifo_put - puts some data into the FIFO, no locking version + * kfifo_in - puts some data into the FIFO * @fifo: the fifo to be used. - * @buffer: the data to be added. + * @from: the data to be added. * @len: the length of the data to be added. * - * This function copies at most @len bytes from the @buffer into + * This function copies at most @len bytes from the @from buffer into * the FIFO depending on the free space, and returns the number of * bytes copied. * * Note that with only one concurrent reader and one concurrent * writer, you don't need extra locking to use these functions. 
*/ -unsigned int kfifo_put(struct kfifo *fifo, - const unsigned char *buffer, unsigned int len) +unsigned int kfifo_in(struct kfifo *fifo, + const unsigned char *from, unsigned int len) { unsigned int l; @@ -128,10 +128,10 @@ unsigned int kfifo_put(struct kfifo *fifo, /* first put the data starting from fifo->in to buffer end */ l = min(len, fifo->size - (fifo->in & (fifo->size - 1))); - memcpy(fifo->buffer + (fifo->in & (fifo->size - 1)), buffer, l); + memcpy(fifo->buffer + (fifo->in & (fifo->size - 1)), from, l); /* then put the rest (if any) at the beginning of the buffer */ - memcpy(fifo->buffer, buffer + l, len - l); + memcpy(fifo->buffer, from + l, len - l); /* * Ensure that we add the bytes to the kfifo -before- @@ -144,22 +144,22 @@ unsigned int kfifo_put(struct kfifo *fifo, return len; } -EXPORT_SYMBOL(kfifo_put); +EXPORT_SYMBOL(kfifo_in); /** - * kfifo_get - gets some data from the FIFO, no locking version + * kfifo_out - gets some data from the FIFO * @fifo: the fifo to be used. - * @buffer: where the data must be copied. + * @to: where the data must be copied. * @len: the size of the destination buffer. * * This function copies at most @len bytes from the FIFO into the - * @buffer and returns the number of copied bytes. + * @to buffer and returns the number of copied bytes. * * Note that with only one concurrent reader and one concurrent * writer, you don't need extra locking to use these functions. 
*/ -unsigned int kfifo_get(struct kfifo *fifo, - unsigned char *buffer, unsigned int len) +unsigned int kfifo_out(struct kfifo *fifo, + unsigned char *to, unsigned int len) { unsigned int l; @@ -174,10 +174,10 @@ unsigned int kfifo_get(struct kfifo *fifo, /* first get the data from fifo->out until the end of the buffer */ l = min(len, fifo->size - (fifo->out & (fifo->size - 1))); - memcpy(buffer, fifo->buffer + (fifo->out & (fifo->size - 1)), l); + memcpy(to, fifo->buffer + (fifo->out & (fifo->size - 1)), l); /* then get the rest (if any) from the beginning of the buffer */ - memcpy(buffer + l, fifo->buffer, len - l); + memcpy(to + l, fifo->buffer, len - l); /* * Ensure that we remove the bytes from the kfifo -before- @@ -190,4 +190,4 @@ unsigned int kfifo_get(struct kfifo *fifo, return len; } -EXPORT_SYMBOL(kfifo_get); +EXPORT_SYMBOL(kfifo_out); -- cgit v1.2.3 From a121f24accac1600bf5b6fb1e12eeabdfed7cb1a Mon Sep 17 00:00:00 2001 From: Stefani Seibold Date: Mon, 21 Dec 2009 14:37:31 -0800 Subject: kfifo: add kfifo_skip, kfifo_from_user and kfifo_to_user Add kfifo_reset_out() for save lockless discard the fifo output Add kfifo_skip() to skip a number of output bytes Add kfifo_from_user() to copy user space data into the fifo Add kfifo_to_user() to copy fifo data to user space Signed-off-by: Stefani Seibold Acked-by: Greg Kroah-Hartman Acked-by: Mauro Carvalho Chehab Acked-by: Andi Kleen Acked-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kfifo.c | 139 ++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 123 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/kfifo.c b/kernel/kfifo.c index d659442e73f..2a78425ef67 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -26,6 +26,7 @@ #include #include #include +#include static void _kfifo_init(struct kfifo *fifo, unsigned char *buffer, unsigned int size) @@ -99,6 +100,21 @@ void kfifo_free(struct kfifo *fifo) } EXPORT_SYMBOL(kfifo_free); 
+/** + * kfifo_skip - skip output data + * @fifo: the fifo to be used. + * @len: number of bytes to skip + */ +void kfifo_skip(struct kfifo *fifo, unsigned int len) +{ + if (len < kfifo_len(fifo)) { + __kfifo_add_out(fifo, len); + return; + } + kfifo_reset_out(fifo); +} +EXPORT_SYMBOL(kfifo_skip); + /** * kfifo_in - puts some data into the FIFO * @fifo: the fifo to be used. @@ -115,6 +131,7 @@ EXPORT_SYMBOL(kfifo_free); unsigned int kfifo_in(struct kfifo *fifo, const unsigned char *from, unsigned int len) { + unsigned int off; unsigned int l; len = min(len, fifo->size - fifo->in + fifo->out); @@ -126,21 +143,16 @@ unsigned int kfifo_in(struct kfifo *fifo, smp_mb(); + off = __kfifo_off(fifo, fifo->in); + /* first put the data starting from fifo->in to buffer end */ - l = min(len, fifo->size - (fifo->in & (fifo->size - 1))); - memcpy(fifo->buffer + (fifo->in & (fifo->size - 1)), from, l); + l = min(len, fifo->size - off); + memcpy(fifo->buffer + off, from, l); /* then put the rest (if any) at the beginning of the buffer */ memcpy(fifo->buffer, from + l, len - l); - /* - * Ensure that we add the bytes to the kfifo -before- - * we update the fifo->in index. 
- */ - - smp_wmb(); - - fifo->in += len; + __kfifo_add_in(fifo, len); return len; } @@ -161,6 +173,7 @@ EXPORT_SYMBOL(kfifo_in); unsigned int kfifo_out(struct kfifo *fifo, unsigned char *to, unsigned int len) { + unsigned int off; unsigned int l; len = min(len, fifo->in - fifo->out); @@ -172,22 +185,116 @@ unsigned int kfifo_out(struct kfifo *fifo, smp_rmb(); + off = __kfifo_off(fifo, fifo->out); + /* first get the data from fifo->out until the end of the buffer */ - l = min(len, fifo->size - (fifo->out & (fifo->size - 1))); - memcpy(to, fifo->buffer + (fifo->out & (fifo->size - 1)), l); + l = min(len, fifo->size - off); + memcpy(to, fifo->buffer + off, l); /* then get the rest (if any) from the beginning of the buffer */ memcpy(to + l, fifo->buffer, len - l); + __kfifo_add_out(fifo, len); + + return len; +} +EXPORT_SYMBOL(kfifo_out); + +/** + * kfifo_from_user - puts some data from user space into the FIFO + * @fifo: the fifo to be used. + * @from: pointer to the data to be added. + * @len: the length of the data to be added. + * + * This function copies at most @len bytes from the @from into the + * FIFO depending and returns the number of copied bytes. + * + * Note that with only one concurrent reader and one concurrent + * writer, you don't need extra locking to use these functions. + */ +unsigned int kfifo_from_user(struct kfifo *fifo, + const void __user *from, unsigned int len) +{ + unsigned int off; + unsigned int l; + int ret; + + len = min(len, fifo->size - fifo->in + fifo->out); + /* - * Ensure that we remove the bytes from the kfifo -before- - * we update the fifo->out index. + * Ensure that we sample the fifo->out index -before- we + * start putting bytes into the kfifo. 
*/ smp_mb(); - fifo->out += len; + off = __kfifo_off(fifo, fifo->in); + + /* first put the data starting from fifo->in to buffer end */ + l = min(len, fifo->size - off); + ret = copy_from_user(fifo->buffer + off, from, l); + + if (unlikely(ret)) + return l - ret; + + /* then put the rest (if any) at the beginning of the buffer */ + ret = copy_from_user(fifo->buffer, from + l, len - l); + + if (unlikely(ret)) + return len - ret; + + __kfifo_add_in(fifo, len); return len; } -EXPORT_SYMBOL(kfifo_out); +EXPORT_SYMBOL(kfifo_from_user); + +/** + * kfifo_to_user - gets data from the FIFO and write it to user space + * @fifo: the fifo to be used. + * @to: where the data must be copied. + * @len: the size of the destination buffer. + * + * This function copies at most @len bytes from the FIFO into the + * @to buffer and returns the number of copied bytes. + * + * Note that with only one concurrent reader and one concurrent + * writer, you don't need extra locking to use these functions. + */ +unsigned int kfifo_to_user(struct kfifo *fifo, + void __user *to, unsigned int len) +{ + unsigned int off; + unsigned int l; + int ret; + + len = min(len, fifo->in - fifo->out); + + /* + * Ensure that we sample the fifo->in index -before- we + * start removing bytes from the kfifo. 
+ */ + + smp_rmb(); + + off = __kfifo_off(fifo, fifo->out); + + /* first get the data from fifo->out until the end of the buffer */ + l = min(len, fifo->size - off); + ret = copy_to_user(to, fifo->buffer + off, l); + + if (unlikely(ret)) + return l - ret; + + /* then get the rest (if any) from the beginning of the buffer */ + ret = copy_to_user(to + l, fifo->buffer, len - l); + + if (unlikely(ret)) + return len - ret; + + __kfifo_add_out(fifo, len); + + return len; +} +EXPORT_SYMBOL(kfifo_to_user); + -- cgit v1.2.3 From 86d4880313603810901f639ccb5c88ff13d4ad3c Mon Sep 17 00:00:00 2001 From: Stefani Seibold Date: Mon, 21 Dec 2009 14:37:32 -0800 Subject: kfifo: add record handling functions Add kfifo_in_rec() - puts some record data into the FIFO Add kfifo_out_rec() - gets some record data from the FIFO Add kfifo_from_user_rec() - puts some data from user space into the FIFO Add kfifo_to_user_rec() - gets data from the FIFO and write it to user space Add kfifo_peek_rec() - gets the size of the next FIFO record field Add kfifo_skip_rec() - skip the next fifo out record Add kfifo_avail_rec() - determinate the number of bytes available in a record FIFO Signed-off-by: Stefani Seibold Acked-by: Greg Kroah-Hartman Acked-by: Mauro Carvalho Chehab Acked-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kfifo.c | 286 ++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 193 insertions(+), 93 deletions(-) (limited to 'kernel') diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 2a78425ef67..e92d519f93b 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -115,27 +115,11 @@ void kfifo_skip(struct kfifo *fifo, unsigned int len) } EXPORT_SYMBOL(kfifo_skip); -/** - * kfifo_in - puts some data into the FIFO - * @fifo: the fifo to be used. - * @from: the data to be added. - * @len: the length of the data to be added. 
- * - * This function copies at most @len bytes from the @from buffer into - * the FIFO depending on the free space, and returns the number of - * bytes copied. - * - * Note that with only one concurrent reader and one concurrent - * writer, you don't need extra locking to use these functions. - */ -unsigned int kfifo_in(struct kfifo *fifo, - const unsigned char *from, unsigned int len) +static inline void __kfifo_in_data(struct kfifo *fifo, + const void *from, unsigned int len, unsigned int off) { - unsigned int off; unsigned int l; - len = min(len, fifo->size - fifo->in + fifo->out); - /* * Ensure that we sample the fifo->out index -before- we * start putting bytes into the kfifo. @@ -143,7 +127,7 @@ unsigned int kfifo_in(struct kfifo *fifo, smp_mb(); - off = __kfifo_off(fifo, fifo->in); + off = __kfifo_off(fifo, fifo->in + off); /* first put the data starting from fifo->in to buffer end */ l = min(len, fifo->size - off); @@ -151,33 +135,13 @@ unsigned int kfifo_in(struct kfifo *fifo, /* then put the rest (if any) at the beginning of the buffer */ memcpy(fifo->buffer, from + l, len - l); - - __kfifo_add_in(fifo, len); - - return len; } -EXPORT_SYMBOL(kfifo_in); -/** - * kfifo_out - gets some data from the FIFO - * @fifo: the fifo to be used. - * @to: where the data must be copied. - * @len: the size of the destination buffer. - * - * This function copies at most @len bytes from the FIFO into the - * @to buffer and returns the number of copied bytes. - * - * Note that with only one concurrent reader and one concurrent - * writer, you don't need extra locking to use these functions. - */ -unsigned int kfifo_out(struct kfifo *fifo, - unsigned char *to, unsigned int len) +static inline void __kfifo_out_data(struct kfifo *fifo, + void *to, unsigned int len, unsigned int off) { - unsigned int off; unsigned int l; - len = min(len, fifo->in - fifo->out); - /* * Ensure that we sample the fifo->in index -before- we * start removing bytes from the kfifo. 
@@ -185,7 +149,7 @@ unsigned int kfifo_out(struct kfifo *fifo, smp_rmb(); - off = __kfifo_off(fifo, fifo->out); + off = __kfifo_off(fifo, fifo->out + off); /* first get the data from fifo->out until the end of the buffer */ l = min(len, fifo->size - off); @@ -193,34 +157,14 @@ unsigned int kfifo_out(struct kfifo *fifo, /* then get the rest (if any) from the beginning of the buffer */ memcpy(to + l, fifo->buffer, len - l); - - __kfifo_add_out(fifo, len); - - return len; } -EXPORT_SYMBOL(kfifo_out); -/** - * kfifo_from_user - puts some data from user space into the FIFO - * @fifo: the fifo to be used. - * @from: pointer to the data to be added. - * @len: the length of the data to be added. - * - * This function copies at most @len bytes from the @from into the - * FIFO depending and returns the number of copied bytes. - * - * Note that with only one concurrent reader and one concurrent - * writer, you don't need extra locking to use these functions. - */ -unsigned int kfifo_from_user(struct kfifo *fifo, - const void __user *from, unsigned int len) +static inline unsigned int __kfifo_from_user_data(struct kfifo *fifo, + const void __user *from, unsigned int len, unsigned int off) { - unsigned int off; unsigned int l; int ret; - len = min(len, fifo->size - fifo->in + fifo->out); - /* * Ensure that we sample the fifo->out index -before- we * start putting bytes into the kfifo. 
@@ -228,29 +172,101 @@ unsigned int kfifo_from_user(struct kfifo *fifo, smp_mb(); - off = __kfifo_off(fifo, fifo->in); + off = __kfifo_off(fifo, fifo->in + off); /* first put the data starting from fifo->in to buffer end */ l = min(len, fifo->size - off); ret = copy_from_user(fifo->buffer + off, from, l); if (unlikely(ret)) - return l - ret; + return ret + len - l; /* then put the rest (if any) at the beginning of the buffer */ - ret = copy_from_user(fifo->buffer, from + l, len - l); + return copy_from_user(fifo->buffer, from + l, len - l); +} + +static inline unsigned int __kfifo_to_user_data(struct kfifo *fifo, + void __user *to, unsigned int len, unsigned int off) +{ + unsigned int l; + int ret; + + /* + * Ensure that we sample the fifo->in index -before- we + * start removing bytes from the kfifo. + */ + + smp_rmb(); + + off = __kfifo_off(fifo, fifo->out + off); + + /* first get the data from fifo->out until the end of the buffer */ + l = min(len, fifo->size - off); + ret = copy_to_user(to, fifo->buffer + off, l); if (unlikely(ret)) - return len - ret; + return ret + len - l; - __kfifo_add_in(fifo, len); + /* then get the rest (if any) from the beginning of the buffer */ + return copy_to_user(to + l, fifo->buffer, len - l); +} +unsigned int __kfifo_in_n(struct kfifo *fifo, + const void *from, unsigned int len, unsigned int recsize) +{ + if (kfifo_avail(fifo) < len + recsize) + return len + 1; + + __kfifo_in_data(fifo, from, len, recsize); + return 0; +} +EXPORT_SYMBOL(__kfifo_in_n); + +/** + * kfifo_in - puts some data into the FIFO + * @fifo: the fifo to be used. + * @from: the data to be added. + * @len: the length of the data to be added. + * + * This function copies at most @len bytes from the @from buffer into + * the FIFO depending on the free space, and returns the number of + * bytes copied. + * + * Note that with only one concurrent reader and one concurrent + * writer, you don't need extra locking to use these functions. 
+ */ +unsigned int kfifo_in(struct kfifo *fifo, const unsigned char *from, + unsigned int len) +{ + len = min(kfifo_avail(fifo), len); + + __kfifo_in_data(fifo, from, len, 0); + __kfifo_add_in(fifo, len); return len; } -EXPORT_SYMBOL(kfifo_from_user); +EXPORT_SYMBOL(kfifo_in); + +unsigned int __kfifo_in_generic(struct kfifo *fifo, + const void *from, unsigned int len, unsigned int recsize) +{ + return __kfifo_in_rec(fifo, from, len, recsize); +} +EXPORT_SYMBOL(__kfifo_in_generic); + +unsigned int __kfifo_out_n(struct kfifo *fifo, + void *to, unsigned int len, unsigned int recsize) +{ + if (kfifo_len(fifo) < len + recsize) + return len; + + __kfifo_out_data(fifo, to, len, recsize); + __kfifo_add_out(fifo, len + recsize); + return 0; +} +EXPORT_SYMBOL(__kfifo_out_n); /** - * kfifo_to_user - gets data from the FIFO and write it to user space + * kfifo_out - gets some data from the FIFO * @fifo: the fifo to be used. * @to: where the data must be copied. * @len: the size of the destination buffer. @@ -261,40 +277,124 @@ EXPORT_SYMBOL(kfifo_from_user); * Note that with only one concurrent reader and one concurrent * writer, you don't need extra locking to use these functions. */ -unsigned int kfifo_to_user(struct kfifo *fifo, - void __user *to, unsigned int len) +unsigned int kfifo_out(struct kfifo *fifo, unsigned char *to, unsigned int len) { - unsigned int off; - unsigned int l; - int ret; + len = min(kfifo_len(fifo), len); - len = min(len, fifo->in - fifo->out); + __kfifo_out_data(fifo, to, len, 0); + __kfifo_add_out(fifo, len); - /* - * Ensure that we sample the fifo->in index -before- we - * start removing bytes from the kfifo. 
- */ + return len; +} +EXPORT_SYMBOL(kfifo_out); - smp_rmb(); +unsigned int __kfifo_out_generic(struct kfifo *fifo, + void *to, unsigned int len, unsigned int recsize, + unsigned int *total) +{ + return __kfifo_out_rec(fifo, to, len, recsize, total); +} +EXPORT_SYMBOL(__kfifo_out_generic); - off = __kfifo_off(fifo, fifo->out); +unsigned int __kfifo_from_user_n(struct kfifo *fifo, + const void __user *from, unsigned int len, unsigned int recsize) +{ + if (kfifo_avail(fifo) < len + recsize) + return len + 1; - /* first get the data from fifo->out until the end of the buffer */ - l = min(len, fifo->size - off); - ret = copy_to_user(to, fifo->buffer + off, l); + return __kfifo_from_user_data(fifo, from, len, recsize); +} +EXPORT_SYMBOL(__kfifo_from_user_n); - if (unlikely(ret)) - return l - ret; +/** + * kfifo_from_user - puts some data from user space into the FIFO + * @fifo: the fifo to be used. + * @from: pointer to the data to be added. + * @len: the length of the data to be added. + * + * This function copies at most @len bytes from the @from into the + * FIFO depending and returns the number of copied bytes. + * + * Note that with only one concurrent reader and one concurrent + * writer, you don't need extra locking to use these functions. 
+ */ +unsigned int kfifo_from_user(struct kfifo *fifo, + const void __user *from, unsigned int len) +{ + len = min(kfifo_avail(fifo), len); + len -= __kfifo_from_user_data(fifo, from, len, 0); + __kfifo_add_in(fifo, len); + return len; +} +EXPORT_SYMBOL(kfifo_from_user); - /* then get the rest (if any) from the beginning of the buffer */ - ret = copy_to_user(to + l, fifo->buffer, len - l); +unsigned int __kfifo_from_user_generic(struct kfifo *fifo, + const void __user *from, unsigned int len, unsigned int recsize) +{ + return __kfifo_from_user_rec(fifo, from, len, recsize); +} +EXPORT_SYMBOL(__kfifo_from_user_generic); - if (unlikely(ret)) - return len - ret; +unsigned int __kfifo_to_user_n(struct kfifo *fifo, + void __user *to, unsigned int len, unsigned int reclen, + unsigned int recsize) +{ + unsigned int ret; - __kfifo_add_out(fifo, len); + if (kfifo_len(fifo) < reclen + recsize) + return len; + ret = __kfifo_to_user_data(fifo, to, reclen, recsize); + + if (likely(ret == 0)) + __kfifo_add_out(fifo, reclen + recsize); + + return ret; +} +EXPORT_SYMBOL(__kfifo_to_user_n); + +/** + * kfifo_to_user - gets data from the FIFO and write it to user space + * @fifo: the fifo to be used. + * @to: where the data must be copied. + * @len: the size of the destination buffer. + * + * This function copies at most @len bytes from the FIFO into the + * @to buffer and returns the number of copied bytes. + * + * Note that with only one concurrent reader and one concurrent + * writer, you don't need extra locking to use these functions. 
+ */ +unsigned int kfifo_to_user(struct kfifo *fifo, + void __user *to, unsigned int len) +{ + len = min(kfifo_len(fifo), len); + len -= __kfifo_to_user_data(fifo, to, len, 0); + __kfifo_add_out(fifo, len); return len; } EXPORT_SYMBOL(kfifo_to_user); +unsigned int __kfifo_to_user_generic(struct kfifo *fifo, + void __user *to, unsigned int len, unsigned int recsize, + unsigned int *total) +{ + return __kfifo_to_user_rec(fifo, to, len, recsize, total); +} +EXPORT_SYMBOL(__kfifo_to_user_generic); + +unsigned int __kfifo_peek_generic(struct kfifo *fifo, unsigned int recsize) +{ + if (recsize == 0) + return kfifo_avail(fifo); + + return __kfifo_peek_n(fifo, recsize); +} +EXPORT_SYMBOL(__kfifo_peek_generic); + +void __kfifo_skip_generic(struct kfifo *fifo, unsigned int recsize) +{ + __kfifo_skip_rec(fifo, recsize); +} +EXPORT_SYMBOL(__kfifo_skip_generic); + -- cgit v1.2.3 From 0c69774e6ce94364cfaa8bdeb18061edc414bc5a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 22 Dec 2009 15:43:19 +0100 Subject: sched: Revert 738d2be, simplify set_task_cpu() Effectively reverts 738d2be4301007f054541c5c4bf7fb6a361c9b3a. As demonstrated by Eric, we really need to call __set_task_cpu() early in the fork() path to properly initialize the various task state -- specifically the cgroup state through set_task_rq(). [ we could probably fix this by explicitly calling __set_task_cpu() from sched_fork(), but lets try that for the next cycle and simply revert to the old behaviour for now. 
] Reported-by: Eric Paris Tested-by: Eric Paris , Signed-off-by: Peter Zijlstra Cc: efault@gmx.de LKML-Reference: <1261492999.4937.36.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/sched.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 87f1f47beff..c535cc4f642 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2045,11 +2045,10 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) trace_sched_migrate_task(p, new_cpu); - if (task_cpu(p) == new_cpu) - return; - - p->se.nr_migrations++; - perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); + if (task_cpu(p) != new_cpu) { + p->se.nr_migrations++; + perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); + } __set_task_cpu(p, new_cpu); } -- cgit v1.2.3 From 4440095c8268c1a5e11577097d2be429cec036ca Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 23 Dec 2009 21:00:20 +0100 Subject: SYSCTL: Print binary sysctl warnings (nearly) only once When printing legacy sysctls print the warning message for each of them only once. This way there is a guarantee the syslog won't be flooded for any sane program. The original attempt at this made the tables non const and stored the flag inline. Linus suggested using a separate hash table for this, this is based on a code snippet from him. The hash implies this is not exact and can sometimes not print a new sysctl due to a hash collision, but in practice this should not be a problem I used a FNV32 hash over the binary string with a 32byte bitmap. This gives relatively little collisions when all the predefined binary sysctls are hashed: size 256 bucket length number 0: [25] 1: [67] 2: [88] 3: [47] 4: [22] 5: [6] 6: [1] The worst case is a single collision of 6 hash values. 
Signed-off-by: Andi Kleen --- kernel/sysctl_binary.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 112533d5fc0..8f5d16e0707 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1417,6 +1417,35 @@ static void deprecated_sysctl_warning(const int *name, int nlen) return; } +#define WARN_ONCE_HASH_BITS 8 +#define WARN_ONCE_HASH_SIZE (1< Date: Mon, 21 Dec 2009 13:02:24 +0100 Subject: kprobes: Fix distinct type warning Every time I see this: kernel/kprobes.c: In function 'register_kretprobe': kernel/kprobes.c:1038: warning: comparison of distinct pointer types lacks a cast I'm wondering if something changed in common code and we need to do something for s390. Apparently that's not the case. Let's get rid of this annoying warning. Signed-off-by: Heiko Carstens Acked-by: Ananth N Mavinakayanahalli Cc: Masami Hiramatsu LKML-Reference: <20091221120224.GA4471@osiris.boeblingen.de.ibm.com> Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index e5342a344c4..b7df302a020 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1035,7 +1035,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp) /* Pre-allocate memory for max kretprobe instances */ if (rp->maxactive <= 0) { #ifdef CONFIG_PREEMPT - rp->maxactive = max(10, 2 * num_possible_cpus()); + rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus()); #else rp->maxactive = num_possible_cpus(); #endif -- cgit v1.2.3 From 40892367bc893f3abf6f5ca8ac2ed1c98ba26a77 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 21 Dec 2009 12:01:17 -0800 Subject: tracing: Kconfig spelling fixes and cleanups Fix filename reference (ftrace-implementation.txt -> ftrace-design.txt). Fix spelling, punctuation, grammar. 
Fix help text indentation and line lengths to reduce need for horizontal scrolling or larger window sizes. Signed-off-by: Randy Dunlap Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <20091221120117.3fb49cdc.randy.dunlap@oracle.com> Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 112 +++++++++++++++++++++++++-------------------------- 1 file changed, 56 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d006554888d..6c22d8a2f28 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -12,17 +12,17 @@ config NOP_TRACER config HAVE_FTRACE_NMI_ENTER bool help - See Documentation/trace/ftrace-implementation.txt + See Documentation/trace/ftrace-design.txt config HAVE_FUNCTION_TRACER bool help - See Documentation/trace/ftrace-implementation.txt + See Documentation/trace/ftrace-design.txt config HAVE_FUNCTION_GRAPH_TRACER bool help - See Documentation/trace/ftrace-implementation.txt + See Documentation/trace/ftrace-design.txt config HAVE_FUNCTION_GRAPH_FP_TEST bool @@ -34,17 +34,17 @@ config HAVE_FUNCTION_GRAPH_FP_TEST config HAVE_FUNCTION_TRACE_MCOUNT_TEST bool help - See Documentation/trace/ftrace-implementation.txt + See Documentation/trace/ftrace-design.txt config HAVE_DYNAMIC_FTRACE bool help - See Documentation/trace/ftrace-implementation.txt + See Documentation/trace/ftrace-design.txt config HAVE_FTRACE_MCOUNT_RECORD bool help - See Documentation/trace/ftrace-implementation.txt + See Documentation/trace/ftrace-design.txt config HAVE_HW_BRANCH_TRACER bool @@ -52,7 +52,7 @@ config HAVE_HW_BRANCH_TRACER config HAVE_SYSCALL_TRACEPOINTS bool help - See Documentation/trace/ftrace-implementation.txt + See Documentation/trace/ftrace-design.txt config TRACER_MAX_TRACE bool @@ -83,7 +83,7 @@ config RING_BUFFER_ALLOW_SWAP # This allows those options to appear when no other tracer is selected. But the # options do not appear when something else selects it. 
We need the two options # GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the -# hidding of the automatic options. +# hiding of the automatic options. config TRACING bool @@ -119,7 +119,7 @@ menuconfig FTRACE bool "Tracers" default y if DEBUG_KERNEL help - Enable the kernel tracing infrastructure. + Enable the kernel tracing infrastructure. if FTRACE @@ -133,7 +133,7 @@ config FUNCTION_TRACER help Enable the kernel to trace every kernel function. This is done by using a compiler feature to insert a small, 5-byte No-Operation - instruction to the beginning of every kernel function, which NOP + instruction at the beginning of every kernel function, which NOP sequence is then dynamically patched into a tracer call when tracing is enabled by the administrator. If it's runtime disabled (the bootup default), then the overhead of the instructions is very @@ -150,7 +150,7 @@ config FUNCTION_GRAPH_TRACER and its entry. Its first purpose is to trace the duration of functions and draw a call graph for each thread with some information like - the return value. This is done by setting the current return + the return value. This is done by setting the current return address on the current task structure into a stack of calls. @@ -173,7 +173,7 @@ config IRQSOFF_TRACER echo 0 > /sys/kernel/debug/tracing/tracing_max_latency - (Note that kernel size and overhead increases with this option + (Note that kernel size and overhead increase with this option enabled. This option and the preempt-off timing option can be used together or separately.) @@ -186,7 +186,7 @@ config PREEMPT_TRACER select TRACER_MAX_TRACE select RING_BUFFER_ALLOW_SWAP help - This option measures the time spent in preemption off critical + This option measures the time spent in preemption-off critical sections, with microsecond accuracy. 
The default measurement method is a maximum search, which is @@ -195,7 +195,7 @@ config PREEMPT_TRACER echo 0 > /sys/kernel/debug/tracing/tracing_max_latency - (Note that kernel size and overhead increases with this option + (Note that kernel size and overhead increase with this option enabled. This option and the irqs-off timing option can be used together or separately.) @@ -222,7 +222,7 @@ config ENABLE_DEFAULT_TRACERS depends on !GENERIC_TRACER select TRACING help - This tracer hooks to various trace points in the kernel + This tracer hooks to various trace points in the kernel, allowing the user to pick and choose which trace point they want to trace. It also includes the sched_switch tracer plugin. @@ -265,19 +265,19 @@ choice The likely/unlikely profiler only looks at the conditions that are annotated with a likely or unlikely macro. - The "all branch" profiler will profile every if statement in the + The "all branch" profiler will profile every if-statement in the kernel. This profiler will also enable the likely/unlikely - profiler as well. + profiler. - Either of the above profilers add a bit of overhead to the system. - If unsure choose "No branch profiling". + Either of the above profilers adds a bit of overhead to the system. + If unsure, choose "No branch profiling". config BRANCH_PROFILE_NONE bool "No branch profiling" help - No branch profiling. Branch profiling adds a bit of overhead. - Only enable it if you want to analyse the branching behavior. - Otherwise keep it disabled. + No branch profiling. Branch profiling adds a bit of overhead. + Only enable it if you want to analyse the branching behavior. + Otherwise keep it disabled. 
config PROFILE_ANNOTATED_BRANCHES bool "Trace likely/unlikely profiler" @@ -288,7 +288,7 @@ config PROFILE_ANNOTATED_BRANCHES /sys/kernel/debug/tracing/profile_annotated_branch - Note: this will add a significant overhead, only turn this + Note: this will add a significant overhead; only turn this on if you need to profile the system's use of these macros. config PROFILE_ALL_BRANCHES @@ -305,7 +305,7 @@ config PROFILE_ALL_BRANCHES This configuration, when enabled, will impose a great overhead on the system. This should only be enabled when the system - is to be analyzed + is to be analyzed in much detail. endchoice config TRACING_BRANCHES @@ -335,7 +335,7 @@ config POWER_TRACER depends on X86 select GENERIC_TRACER help - This tracer helps developers to analyze and optimize the kernels + This tracer helps developers to analyze and optimize the kernel's power management decisions, specifically the C-state and P-state behavior. @@ -391,14 +391,14 @@ config HW_BRANCH_TRACER select GENERIC_TRACER help This tracer records all branches on the system in a circular - buffer giving access to the last N branches for each cpu. + buffer, giving access to the last N branches for each cpu. config KMEMTRACE bool "Trace SLAB allocations" select GENERIC_TRACER help kmemtrace provides tracing for slab allocator functions, such as - kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected + kmalloc, kfree, kmem_cache_alloc, kmem_cache_free, etc. Collected data is then fed to the userspace application in order to analyse allocation hotspots, internal fragmentation and so on, making it possible to see how well an allocator performs, as well as debug @@ -417,15 +417,15 @@ config WORKQUEUE_TRACER bool "Trace workqueues" select GENERIC_TRACER help - The workqueue tracer provides some statistical informations + The workqueue tracer provides some statistical information about each cpu workqueue thread such as the number of the works inserted and executed since their creation. 
It can help - to evaluate the amount of work each of them have to perform. + to evaluate the amount of work each of them has to perform. For example it can help a developer to decide whether he should - choose a per cpu workqueue instead of a singlethreaded one. + choose a per-cpu workqueue instead of a singlethreaded one. config BLK_DEV_IO_TRACE - bool "Support for tracing block io actions" + bool "Support for tracing block IO actions" depends on SYSFS depends on BLOCK select RELAY @@ -456,15 +456,15 @@ config KPROBE_EVENT select TRACING default y help - This allows the user to add tracing events (similar to tracepoints) on the fly - via the ftrace interface. See Documentation/trace/kprobetrace.txt - for more details. + This allows the user to add tracing events (similar to tracepoints) + on the fly via the ftrace interface. See + Documentation/trace/kprobetrace.txt for more details. Those events can be inserted wherever kprobes can probe, and record various register and memory values. - This option is also required by perf-probe subcommand of perf tools. If - you want to use perf tools, this option is strongly recommended. + This option is also required by perf-probe subcommand of perf tools. + If you want to use perf tools, this option is strongly recommended. config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" @@ -472,32 +472,32 @@ config DYNAMIC_FTRACE depends on HAVE_DYNAMIC_FTRACE default y help - This option will modify all the calls to ftrace dynamically - (will patch them out of the binary image and replaces them - with a No-Op instruction) as they are called. A table is - created to dynamically enable them again. + This option will modify all the calls to ftrace dynamically + (will patch them out of the binary image and replace them + with a No-Op instruction) as they are called. A table is + created to dynamically enable them again. 
- This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but otherwise - has native performance as long as no tracing is active. + This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but + otherwise has native performance as long as no tracing is active. - The changes to the code are done by a kernel thread that - wakes up once a second and checks to see if any ftrace calls - were made. If so, it runs stop_machine (stops all CPUS) - and modifies the code to jump over the call to ftrace. + The changes to the code are done by a kernel thread that + wakes up once a second and checks to see if any ftrace calls + were made. If so, it runs stop_machine (stops all CPUS) + and modifies the code to jump over the call to ftrace. config FUNCTION_PROFILER bool "Kernel function profiler" depends on FUNCTION_TRACER default n help - This option enables the kernel function profiler. A file is created - in debugfs called function_profile_enabled which defaults to zero. - When a 1 is echoed into this file profiling begins, and when a - zero is entered, profiling stops. A file in the trace_stats - directory called functions, that show the list of functions that - have been hit and their counters. + This option enables the kernel function profiler. A file is created + in debugfs called function_profile_enabled which defaults to zero. + When a 1 is echoed into this file profiling begins, and when a + zero is entered, profiling stops. A "functions" file is created in + the trace_stats directory; this file shows the list of functions that + have been hit and their counters. - If in doubt, say N + If in doubt, say N. config FTRACE_MCOUNT_RECORD def_bool y @@ -556,8 +556,8 @@ config RING_BUFFER_BENCHMARK tristate "Ring buffer benchmark stress tester" depends on RING_BUFFER help - This option creates a test to stress the ring buffer and bench mark it. 
- It creates its own ring buffer such that it will not interfer with + This option creates a test to stress the ring buffer and benchmark it. + It creates its own ring buffer such that it will not interfere with any other users of the ring buffer (such as ftrace). It then creates a producer and consumer that will run for 10 seconds and sleep for 10 seconds. Each interval it will print out the number of events @@ -566,7 +566,7 @@ config RING_BUFFER_BENCHMARK It does not disable interrupts or raise its priority, so it may be affected by processes that are running. - If unsure, say N + If unsure, say N. endif # FTRACE -- cgit v1.2.3 From 88f7a890d74137ab0d126a5d65679cd620f1a289 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 30 Dec 2009 14:22:22 +0800 Subject: ksym_tracer: Fix to make the tracer work ksym tracer doesn't work: # echo tasklist_lock:rw- > ksym_trace_filter -bash: echo: write error: No such device It's because we pass to perf_event_create_kernel_counter() a cpu number which is not present. 
Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: K.Prasad Cc: Frederic Weisbecker LKML-Reference: <4B3AF19E.1010201@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/hw_breakpoint.c | 10 +++++++--- kernel/trace/trace_ksym.c | 1 - 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 366eedf949c..48fb0bb6992 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -388,7 +389,8 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr, if (!cpu_events) return ERR_PTR(-ENOMEM); - for_each_possible_cpu(cpu) { + get_online_cpus(); + for_each_online_cpu(cpu) { pevent = per_cpu_ptr(cpu_events, cpu); bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered); @@ -399,18 +401,20 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr, goto fail; } } + put_online_cpus(); return cpu_events; fail: - for_each_possible_cpu(cpu) { + for_each_online_cpu(cpu) { pevent = per_cpu_ptr(cpu_events, cpu); if (IS_ERR(*pevent)) break; unregister_hw_breakpoint(*pevent); } + put_online_cpus(); + free_percpu(cpu_events); - /* return the error if any */ return ERR_PTR(err); } EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c index faf37fa4408..340b6ff193e 100644 --- a/kernel/trace/trace_ksym.c +++ b/kernel/trace/trace_ksym.c @@ -197,7 +197,6 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr) entry->attr.bp_addr = addr; entry->attr.bp_len = HW_BREAKPOINT_LEN_4; - ret = -EAGAIN; entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr, ksym_hbp_handler); -- cgit v1.2.3 From 3d13ec2efdb5843ad91e57b60d50b44d922cf063 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 30 Dec 2009 14:23:19 +0800 Subject: ksym_tracer: Fix to allow writing newline to ksym_trace_filter It used to work, but now doesn't: # echo > ksym_filter bash: echo: 
write error: Invalid argument It's caused by d954fbf0ff6b5fdfb32350e85a2f15d3db976506 ("tracing: Fix wrong usage of strstrip in trace_ksyms"). Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: K.Prasad Cc: Frederic Weisbecker LKML-Reference: <4B3AF1D7.5040400@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_ksym.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c index 340b6ff193e..160a8d8b37a 100644 --- a/kernel/trace/trace_ksym.c +++ b/kernel/trace/trace_ksym.c @@ -299,8 +299,8 @@ static ssize_t ksym_trace_filter_write(struct file *file, * 2: echo 0 > ksym_trace_filter * 3: echo "*:---" > ksym_trace_filter */ - if (!buf[0] || !strcmp(buf, "0") || - !strcmp(buf, "*:---")) { + if (!input_string[0] || !strcmp(input_string, "0") || + !strcmp(input_string, "*:---")) { __ksym_trace_reset(); ret = 0; goto out; -- cgit v1.2.3 From e6d9491bf8ba6728cc86aeabbc688d20ec0563b5 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 30 Dec 2009 14:23:40 +0800 Subject: ksym_tracer: Fix race when incrementing count We are under rcu read section but not holding the write lock, so count++ is not atomic. Use atomic64_t instead. Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: K.Prasad Cc: Frederic Weisbecker LKML-Reference: <4B3AF1EC.9010608@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_ksym.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c index 160a8d8b37a..67d79f709fc 100644 --- a/kernel/trace/trace_ksym.c +++ b/kernel/trace/trace_ksym.c @@ -32,6 +32,8 @@ #include #include +#include + /* * For now, let us restrict the no. of symbols traced simultaneously to number * of available hardware breakpoint registers. 
@@ -44,7 +46,7 @@ struct trace_ksym { struct perf_event **ksym_hbp; struct perf_event_attr attr; #ifdef CONFIG_PROFILE_KSYM_TRACER - unsigned long counter; + atomic64_t counter; #endif struct hlist_node ksym_hlist; }; @@ -69,9 +71,8 @@ void ksym_collect_stats(unsigned long hbp_hit_addr) rcu_read_lock(); hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) { - if ((entry->attr.bp_addr == hbp_hit_addr) && - (entry->counter <= MAX_UL_INT)) { - entry->counter++; + if (entry->attr.bp_addr == hbp_hit_addr) { + atomic64_inc(&entry->counter); break; } } @@ -501,7 +502,8 @@ static int ksym_tracer_stat_show(struct seq_file *m, void *v) seq_printf(m, " %-36s", fn_name); else seq_printf(m, " %-36s", ""); - seq_printf(m, " %15lu\n", entry->counter); + seq_printf(m, " %15llu\n", + (unsigned long long)atomic64_read(&entry->counter)); return 0; } -- cgit v1.2.3 From 53ab668064edaeef99c0ee22799483d45f4c81f6 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 30 Dec 2009 14:24:03 +0800 Subject: ksym_tracer: Remove trace_stat trace_stat is problematic. Don't use it, use seqfile instead. This fixes a race that reading the stat file is not protected by any lock, which can lead to use after free. 
Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: K.Prasad Cc: Frederic Weisbecker LKML-Reference: <4B3AF203.40200@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_ksym.c | 127 ++++++++++++++++++---------------------------- 1 file changed, 50 insertions(+), 77 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c index 67d79f709fc..94103cdcf9d 100644 --- a/kernel/trace/trace_ksym.c +++ b/kernel/trace/trace_ksym.c @@ -26,7 +26,6 @@ #include #include "trace_output.h" -#include "trace_stat.h" #include "trace.h" #include @@ -444,103 +443,77 @@ struct tracer ksym_tracer __read_mostly = .print_line = ksym_trace_output }; -__init static int init_ksym_trace(void) -{ - struct dentry *d_tracer; - struct dentry *entry; - - d_tracer = tracing_init_dentry(); - ksym_filter_entry_count = 0; - - entry = debugfs_create_file("ksym_trace_filter", 0644, d_tracer, - NULL, &ksym_tracing_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'ksym_trace_filter' file\n"); - - return register_tracer(&ksym_tracer); -} -device_initcall(init_ksym_trace); - - #ifdef CONFIG_PROFILE_KSYM_TRACER -static int ksym_tracer_stat_headers(struct seq_file *m) +static int ksym_profile_show(struct seq_file *m, void *v) { + struct hlist_node *node; + struct trace_ksym *entry; + int access_type = 0; + char fn_name[KSYM_NAME_LEN]; + seq_puts(m, " Access Type "); seq_puts(m, " Symbol Counter\n"); seq_puts(m, " ----------- "); seq_puts(m, " ------ -------\n"); - return 0; -} -static int ksym_tracer_stat_show(struct seq_file *m, void *v) -{ - struct hlist_node *stat = v; - struct trace_ksym *entry; - int access_type = 0; - char fn_name[KSYM_NAME_LEN]; + rcu_read_lock(); + hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) { - entry = hlist_entry(stat, struct trace_ksym, ksym_hlist); + access_type = entry->attr.bp_type; - access_type = entry->attr.bp_type; + switch (access_type) { + case HW_BREAKPOINT_R: + seq_puts(m, " R 
"); + break; + case HW_BREAKPOINT_W: + seq_puts(m, " W "); + break; + case HW_BREAKPOINT_R | HW_BREAKPOINT_W: + seq_puts(m, " RW "); + break; + default: + seq_puts(m, " NA "); + } - switch (access_type) { - case HW_BREAKPOINT_R: - seq_puts(m, " R "); - break; - case HW_BREAKPOINT_W: - seq_puts(m, " W "); - break; - case HW_BREAKPOINT_R | HW_BREAKPOINT_W: - seq_puts(m, " RW "); - break; - default: - seq_puts(m, " NA "); + if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0) + seq_printf(m, " %-36s", fn_name); + else + seq_printf(m, " %-36s", ""); + seq_printf(m, " %15llu\n", + (unsigned long long)atomic64_read(&entry->counter)); } - - if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0) - seq_printf(m, " %-36s", fn_name); - else - seq_printf(m, " %-36s", ""); - seq_printf(m, " %15llu\n", - (unsigned long long)atomic64_read(&entry->counter)); + rcu_read_unlock(); return 0; } -static void *ksym_tracer_stat_start(struct tracer_stat *trace) +static int ksym_profile_open(struct inode *node, struct file *file) { - return ksym_filter_head.first; -} - -static void * -ksym_tracer_stat_next(void *v, int idx) -{ - struct hlist_node *stat = v; - - return stat->next; + return single_open(file, ksym_profile_show, NULL); } -static struct tracer_stat ksym_tracer_stats = { - .name = "ksym_tracer", - .stat_start = ksym_tracer_stat_start, - .stat_next = ksym_tracer_stat_next, - .stat_headers = ksym_tracer_stat_headers, - .stat_show = ksym_tracer_stat_show +static const struct file_operations ksym_profile_fops = { + .open = ksym_profile_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, }; +#endif /* CONFIG_PROFILE_KSYM_TRACER */ -__init static int ksym_tracer_stat_init(void) +__init static int init_ksym_trace(void) { - int ret; + struct dentry *d_tracer; - ret = register_stat_tracer(&ksym_tracer_stats); - if (ret) { - printk(KERN_WARNING "Warning: could not register " - "ksym tracer stats\n"); - return 1; - } + d_tracer = tracing_init_dentry(); - 
return 0; + trace_create_file("ksym_trace_filter", 0644, d_tracer, + NULL, &ksym_tracing_fops); + +#ifdef CONFIG_PROFILE_KSYM_TRACER + trace_create_file("ksym_profile", 0444, d_tracer, + NULL, &ksym_profile_fops); +#endif + + return register_tracer(&ksym_tracer); } -fs_initcall(ksym_tracer_stat_init); -#endif /* CONFIG_PROFILE_KSYM_TRACER */ +device_initcall(init_ksym_trace); -- cgit v1.2.3 From 79b408210885b9f7f0b067b07a09d68f4da3a700 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Dec 2009 15:39:19 +0800 Subject: tracing/kprobe: Show sign of fields in trace_kprobe format files The format files of trace_kprobe do not show the sign of the fields. The other format files show the field signed type of the fields and this patch makes the trace_kprobe formats consistent with the others. Signed-off-by: Lai Jiangshan LKML-Reference: <4B273D27.5040009@cn.fujitsu.com> Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 7ecab06547a..83f1e6ef706 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1182,10 +1182,11 @@ static int __probe_event_show_format(struct trace_seq *s, #undef SHOW_FIELD #define SHOW_FIELD(type, item, name) \ do { \ - ret = trace_seq_printf(s, "\tfield: " #type " %s;\t" \ - "offset:%u;\tsize:%u;\n", name, \ + ret = trace_seq_printf(s, "\tfield:" #type " %s;\t" \ + "offset:%u;\tsize:%u;\tsigned:%d;\n", name,\ (unsigned int)offsetof(typeof(field), item),\ - (unsigned int)sizeof(type)); \ + (unsigned int)sizeof(type), \ + is_signed_type(type)); \ if (!ret) \ return 0; \ } while (0) -- cgit v1.2.3 From fb7ae981cb9fe8665b9da97e8734745e030c151d Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Dec 2009 15:39:38 +0800 Subject: tracing: Fix sign fields in ftrace_define_fields_##call() Add is_signed_type() call to 
trace_define_field() in ftrace macros. The code previously just passed in 0 (false), disregarding whether or not the field was actually a signed type. Signed-off-by: Lai Jiangshan LKML-Reference: <4B273D3A.6020007@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_export.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 458e5bfe26d..d4fa5dc1ee4 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -158,7 +158,8 @@ ftrace_format_##name(struct ftrace_event_call *unused, \ BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ ret = trace_define_field(event_call, #type "[" #len "]", #item, \ offsetof(typeof(field), item), \ - sizeof(field.item), 0, FILTER_OTHER); \ + sizeof(field.item), \ + is_signed_type(type), FILTER_OTHER); \ if (ret) \ return ret; @@ -168,8 +169,8 @@ ftrace_format_##name(struct ftrace_event_call *unused, \ ret = trace_define_field(event_call, #type "[" #len "]", #item, \ offsetof(typeof(field), \ container.item), \ - sizeof(field.container.item), 0, \ - FILTER_OTHER); \ + sizeof(field.container.item), \ + is_signed_type(type), FILTER_OTHER); \ if (ret) \ return ret; -- cgit v1.2.3 From 05cbaa2853cdfc255fdd04e65a82bfe9208c4e52 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 30 Dec 2009 16:00:35 +0100 Subject: perf: Fix NULL deref in inheritance code Liming found a NULL deref when a task has a perf context but no counters when it forks. This can occur in two cases, a race during construction where the fork hits after installing the context but before the first counter gets inserted, or more reproducibly, a fork after the last counter is closed (which leaves the context around).
Reported-by: Wang Liming Signed-off-by: Peter Zijlstra Cc: Frederic Weisbecker Cc: Paul Mackerras CC: LKML-Reference: <1262185684.7135.222.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 03cc061398d..58ed1dae587 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -5148,7 +5148,7 @@ int perf_event_init_task(struct task_struct *child) GFP_KERNEL); if (!child_ctx) { ret = -ENOMEM; - goto exit; + break; } __perf_event_init_context(child_ctx, child); @@ -5164,7 +5164,7 @@ int perf_event_init_task(struct task_struct *child) } } - if (inherited_all) { + if (child_ctx && inherited_all) { /* * Mark the child context as a clone of the parent * context, or of whatever the parent is a clone of. @@ -5184,7 +5184,6 @@ int perf_event_init_task(struct task_struct *child) get_ctx(child_ctx->parent_ctx); } -exit: mutex_unlock(&parent_ctx->mutex); perf_unpin_context(parent_ctx); -- cgit v1.2.3 From 10b465aaf9536ee5a16652fa0700740183d48ec9 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Sat, 19 Dec 2009 14:43:01 +0000 Subject: modules: Skip empty sections when exporting section notes Commit 35dead4 "modules: don't export section names of empty sections via sysfs" changed the set of sections that have attributes, but did not change the iteration over these attributes in add_notes_attrs(). This can lead to add_notes_attrs() creating attributes with the wrong names or with null name pointers. Introduce a sect_empty() function and use it in both add_sect_attrs() and add_notes_attrs(). 
Reported-by: Martin Michlmayr Signed-off-by: Ben Hutchings Tested-by: Martin Michlmayr Cc: stable@kernel.org Signed-off-by: Rusty Russell Signed-off-by: Linus Torvalds --- kernel/module.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index e96b8ed1cb6..f82386bd9ee 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1010,6 +1010,12 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, * J. Corbet */ #if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS) + +static inline bool sect_empty(const Elf_Shdr *sect) +{ + return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0; +} + struct module_sect_attr { struct module_attribute mattr; @@ -1051,8 +1057,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, /* Count loaded sections and allocate structures */ for (i = 0; i < nsect; i++) - if (sechdrs[i].sh_flags & SHF_ALLOC - && sechdrs[i].sh_size) + if (!sect_empty(&sechdrs[i])) nloaded++; size[0] = ALIGN(sizeof(*sect_attrs) + nloaded * sizeof(sect_attrs->attrs[0]), @@ -1070,9 +1075,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, sattr = &sect_attrs->attrs[0]; gattr = &sect_attrs->grp.attrs[0]; for (i = 0; i < nsect; i++) { - if (! (sechdrs[i].sh_flags & SHF_ALLOC)) - continue; - if (!sechdrs[i].sh_size) + if (sect_empty(&sechdrs[i])) continue; sattr->address = sechdrs[i].sh_addr; sattr->name = kstrdup(secstrings + sechdrs[i].sh_name, @@ -1156,7 +1159,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect, /* Count notes sections and allocate structures.
*/ notes = 0; for (i = 0; i < nsect; i++) - if ((sechdrs[i].sh_flags & SHF_ALLOC) && + if (!sect_empty(&sechdrs[i]) && (sechdrs[i].sh_type == SHT_NOTE)) ++notes; @@ -1172,7 +1175,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect, notes_attrs->notes = notes; nattr = &notes_attrs->attrs[0]; for (loaded = i = 0; i < nsect; ++i) { - if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + if (sect_empty(&sechdrs[i])) continue; if (sechdrs[i].sh_type == SHT_NOTE) { nattr->attr.name = mod->sect_attrs->attrs[loaded].name; -- cgit v1.2.3 From 8767ba2796a1c894e6d9524584a26a8224f0543d Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 8 Jan 2010 14:42:38 -0800 Subject: kmod: fix resource leak in call_usermodehelper_pipe() Fix resource (write-pipe file) leak in call_usermodehelper_pipe(). When call_usermodehelper_exec() fails, write-pipe file is opened and call_usermodehelper_pipe() just returns an error. Since it is hard for caller to determine whether the error occurred when opening the pipe or executing the helper, the caller cannot close the pipe by themselves. I've found this resource leak when testing coredump. You can check how the resource leaks as below; $ echo "|nocommand" > /proc/sys/kernel/core_pattern $ ulimit -c unlimited $ while [ 1 ]; do ./segv; done &> /dev/null & $ cat /proc/meminfo (<- repeat it) where segv.c is; //----- int main () { char *p = 0; *p = 1; } //----- This patch closes write-pipe file if call_usermodehelper_exec() failed. 
Signed-off-by: Masami Hiramatsu Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 25b10319036..bf0e231d970 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -520,13 +520,15 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp, return -ENOMEM; ret = call_usermodehelper_stdinpipe(sub_info, filp); - if (ret < 0) - goto out; + if (ret < 0) { + call_usermodehelper_freeinfo(sub_info); + return ret; + } - return call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); + ret = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); + if (ret < 0) /* Failed to execute helper, close pipe */ + filp_close(*filp, NULL); - out: - call_usermodehelper_freeinfo(sub_info); return ret; } EXPORT_SYMBOL(call_usermodehelper_pipe); -- cgit v1.2.3 From bd4f490a079730aadfaf9a728303ea0135c01945 Mon Sep 17 00:00:00 2001 From: Dave Anderson Date: Fri, 8 Jan 2010 14:42:50 -0800 Subject: cgroups: fix 2.6.32 regression causing BUG_ON() in cgroup_diput() The LTP cgroup test suite generates a "kernel BUG at kernel/cgroup.c:790!" here in cgroup_diput(): /* * if we're getting rid of the cgroup, refcount should ensure * that there are no pidlists left. */ BUG_ON(!list_empty(&cgrp->pidlists)); The cgroup pidlist rework in 2.6.32 generates the BUG_ON, which is caused when pidlist_array_load() calls cgroup_pidlist_find(): (1) if a matching cgroup_pidlist is found, it down_write's the mutex of the pre-existing cgroup_pidlist, and increments its use_count. (2) if no matching cgroup_pidlist is found, then a new one is allocated, it down_write's its mutex, and the use_count is set to 0. (3) the matching, or new, cgroup_pidlist gets returned back to pidlist_array_load(), which increments its use_count -- regardless whether new or pre-existing -- and up_write's the mutex. 
So if a matching list is ever encountered by cgroup_pidlist_find() during the life of a cgroup directory, it results in an inflated use_count value, preventing it from ever getting released by cgroup_release_pid_array(). Then if the directory is subsequently removed, cgroup_diput() hits the BUG_ON() when it finds that the directory's cgroup is still populated with a pidlist. The patch simply removes the use_count increment when a matching pidlist is found by cgroup_pidlist_find(), because it gets bumped by the calling pidlist_array_load() function while still protected by the list's mutex. Signed-off-by: Dave Anderson Reviewed-by: Li Zefan Acked-by: Ben Blum Cc: Paul Menage Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0249f4be9b5..1fbcc748044 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2468,7 +2468,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, /* make sure l doesn't vanish out from under us */ down_write(&l->mutex); mutex_unlock(&cgrp->pidlist_mutex); - l->use_count++; return l; } } -- cgit v1.2.3 From b45c6e76bc2c72f6426c14bed64fdcbc9bf37cb0 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 8 Jan 2010 14:42:52 -0800 Subject: kernel/signal.c: fix kernel information leak with print-fatal-signals=1 When print-fatal-signals is enabled it's possible to dump any memory reachable by the kernel to the log by simply jumping to that address from user space. Or crash the system if there's some hardware with read side effects. The fatal signals handler will dump 16 bytes at the execution address, which is fully controlled by ring 3. In addition when something jumps to an unmapped address there will be up to 16 additional useless page faults, which might be potentially slow (and at least is not very efficient). Fortunately this option is off by default and only there on i386. 
But fix it by checking for kernel addresses and also stopping when there's a page fault. Signed-off-by: Andi Kleen Cc: Ingo Molnar Cc: Oleg Nesterov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index d09692b4037..934ae5e687b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -979,7 +979,8 @@ static void print_fatal_signal(struct pt_regs *regs, int signr) for (i = 0; i < 16; i++) { unsigned char insn; - __get_user(insn, (unsigned char *)(regs->ip + i)); + if (get_user(insn, (unsigned char *)(regs->ip + i))) + break; printk("%02x ", insn); } } -- cgit v1.2.3