From e6fe07a014c7a3466dcd1a387a9ac04d84c2703c Mon Sep 17 00:00:00 2001
From: Jonathan Corbet
Date: Thu, 6 Aug 2009 13:22:40 -0600
Subject: pm_qos: remove BKL

pm_qos_power_open got its lock_kernel() calls from the open() pushdown.
A look at the code shows that the only global resources accessed are
pm_qos_array and "name". pm_qos_array doesn't change (things pointed to
therein do change, but they are atomics and/or are protected by
pm_qos_lock). Accesses to "name" are totally unprotected with or without
the BKL; that will be fixed shortly. The BKL is not helpful here; take
it out.

Signed-off-by: Jonathan Corbet
LKML-Reference: <20091010153349.071381158@linutronix.de>
Acked-by: Mark Gross
Reviewed-by: Frederic Weisbecker
Signed-off-by: Thomas Gleixner
---
 kernel/pm_qos_params.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index dfdec524d1b..d96b83ed21c 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -29,7 +29,6 @@
 #include
 #include
-#include
 #include
 #include
 #include
@@ -352,20 +351,15 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp)
 	int ret;
 	long pm_qos_class;

-	lock_kernel();
 	pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
 	if (pm_qos_class >= 0) {
 		filp->private_data = (void *)pm_qos_class;
 		sprintf(name, "process_%d", current->pid);
 		ret = pm_qos_add_requirement(pm_qos_class, name,
 					     PM_QOS_DEFAULT_VALUE);
-		if (ret >= 0) {
-			unlock_kernel();
+		if (ret >= 0)
 			return 0;
-		}
 	}
-	unlock_kernel();
-
 	return -EPERM;
 }
--
cgit v1.2.3

From 1a6deaea3584fd7af1cad492b1fe0867060b45db Mon Sep 17 00:00:00 2001
From: Jonathan Corbet
Date: Thu, 6 Aug 2009 13:35:44 -0600
Subject: pm_qos: clean up racy global "name" variable

"name" is a poor name for a file-global variable. It was used in three
different functions, with no mutual exclusion. But it's just a tiny,
temporary string; let's just move it onto the stack in the functions
that need it. Also use snprintf() just in case.
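[Editorial aside, not part of the patch: the pattern applied here is a per-call stack buffer filled with snprintf() instead of a shared, unlocked file-scope buffer. A minimal, compilable userspace sketch of that pattern follows; the buffer size and helper name are illustrative, not taken from the kernel source.]

#include <stdio.h>

#define PID_NAME_LEN 32			/* roomy upper bound, as in the patch */

/* Build the per-process requirement name into a caller-supplied buffer. */
static void build_req_name(char *buf, size_t len, long pid)
{
	/* snprintf() truncates instead of overflowing and always NUL-terminates */
	snprintf(buf, len, "process_%ld", pid);
}

int main(void)
{
	char name[PID_NAME_LEN];	/* private to this stack frame, so no locking needed */

	build_req_name(name, sizeof(name), 1234567890L);
	printf("%s\n", name);
	return 0;
}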
Signed-off-by: Jonathan Corbet LKML-Reference: <20091010153349.113570550@linutronix.de> Acked-by: Mark Gross Reviewed-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner --- kernel/pm_qos_params.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index d96b83ed21c..3db49b9ca37 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -343,18 +343,18 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier) } EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); -#define PID_NAME_LEN sizeof("process_1234567890") -static char name[PID_NAME_LEN]; +#define PID_NAME_LEN 32 static int pm_qos_power_open(struct inode *inode, struct file *filp) { int ret; long pm_qos_class; + char name[PID_NAME_LEN]; pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); if (pm_qos_class >= 0) { filp->private_data = (void *)pm_qos_class; - sprintf(name, "process_%d", current->pid); + snprintf(name, PID_NAME_LEN, "process_%d", current->pid); ret = pm_qos_add_requirement(pm_qos_class, name, PM_QOS_DEFAULT_VALUE); if (ret >= 0) @@ -366,9 +366,10 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp) static int pm_qos_power_release(struct inode *inode, struct file *filp) { int pm_qos_class; + char name[PID_NAME_LEN]; pm_qos_class = (long)filp->private_data; - sprintf(name, "process_%d", current->pid); + snprintf(name, PID_NAME_LEN, "process_%d", current->pid); pm_qos_remove_requirement(pm_qos_class, name); return 0; @@ -379,13 +380,14 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, { s32 value; int pm_qos_class; + char name[PID_NAME_LEN]; pm_qos_class = (long)filp->private_data; if (count != sizeof(s32)) return -EINVAL; if (copy_from_user(&value, buf, sizeof(s32))) return -EFAULT; - sprintf(name, "process_%d", current->pid); + snprintf(name, PID_NAME_LEN, "process_%d", current->pid); pm_qos_update_requirement(pm_qos_class, name, value); return sizeof(s32); -- cgit v1.2.3 From 6f15fa50087c8317e353145319466afbeb27a75d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 9 Oct 2009 20:31:33 +0200 Subject: sys: Remove BKL from sys_reboot Serialization of sys_reboot can be done local. The BKL is not protecting anything else. 
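[Editorial aside, not part of the patch: the shape of the change, condensed into a kernel-style sketch rather than the actual kernel/sys.c code — a file-local mutex serializes the one syscall that needs it, so the BKL can go. The helper name is illustrative.]

#include <linux/mutex.h>
#include <linux/errno.h>

static DEFINE_MUTEX(reboot_mutex);	/* replaces lock_kernel()/unlock_kernel() */

static int do_reboot_cmd(unsigned int cmd)
{
	int ret = 0;

	mutex_lock(&reboot_mutex);
	switch (cmd) {
	/* ... handle the individual LINUX_REBOOT_CMD_* cases ... */
	default:
		ret = -EINVAL;
		break;
	}
	mutex_unlock(&reboot_mutex);

	return ret;
}

Funnelling every case through a single unlock at the end is also what lets the patch drop the scattered unlock_kernel() calls on the halt, power-off and error paths.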
LKML-Reference: <20091010153349.405590702@linutronix.de> Signed-off-by: Thomas Gleixner --- kernel/sys.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 255475d163e..22ea9553c3b 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include #include @@ -349,6 +348,9 @@ void kernel_power_off(void) machine_power_off(); } EXPORT_SYMBOL_GPL(kernel_power_off); + +static DEFINE_MUTEX(reboot_mutex); + /* * Reboot system call: for obvious reasons only root may call it, * and even root needs to set up some magic numbers in the registers @@ -381,7 +383,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) cmd = LINUX_REBOOT_CMD_HALT; - lock_kernel(); + mutex_lock(&reboot_mutex); switch (cmd) { case LINUX_REBOOT_CMD_RESTART: kernel_restart(NULL); @@ -397,20 +399,18 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, case LINUX_REBOOT_CMD_HALT: kernel_halt(); - unlock_kernel(); do_exit(0); panic("cannot halt"); case LINUX_REBOOT_CMD_POWER_OFF: kernel_power_off(); - unlock_kernel(); do_exit(0); break; case LINUX_REBOOT_CMD_RESTART2: if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) { - unlock_kernel(); - return -EFAULT; + ret = -EFAULT; + break; } buffer[sizeof(buffer) - 1] = '\0'; @@ -433,7 +433,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, ret = -EINVAL; break; } - unlock_kernel(); + mutex_unlock(&reboot_mutex); return ret; } -- cgit v1.2.3 From 1e5ad9679016275d422e36b12a98b0927d76f556 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 2 Nov 2009 10:45:36 -0700 Subject: resources: when allocate_resource() fails, leave resource untouched When "allocate_resource(root, new, size, ...)" fails, we currently clobber "new". This is inconvenient for the caller, who might care about the original contents of the resource. For example, when pci_bus_alloc_resource() fails, the "can't allocate mem resource %pR" message from pci_assign_resources() currently contains junk for the resource start/end. This patch delays the "new" update until we're about to return success. Acked-by: Linus Torvalds Acked-by: Yinghai Lu Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- kernel/resource.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index fb11a58b959..dc15686b7a7 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -308,35 +308,37 @@ static int find_resource(struct resource *root, struct resource *new, void *alignf_data) { struct resource *this = root->child; + resource_size_t start, end; - new->start = root->start; + start = root->start; /* * Skip past an allocated resource that starts at 0, since the assignment * of this->start - 1 to new->end below would cause an underflow. 
*/ if (this && this->start == 0) { - new->start = this->end + 1; + start = this->end + 1; this = this->sibling; } for(;;) { if (this) - new->end = this->start - 1; + end = this->start - 1; else - new->end = root->end; - if (new->start < min) - new->start = min; - if (new->end > max) - new->end = max; - new->start = ALIGN(new->start, align); + end = root->end; + if (start < min) + start = min; + if (end > max) + end = max; + start = ALIGN(start, align); if (alignf) alignf(alignf_data, new, size, align); - if (new->start < new->end && new->end - new->start >= size - 1) { - new->end = new->start + size - 1; + if (start < end && end - start >= size - 1) { + new->start = start; + new->end = start + size - 1; return 0; } if (!this) break; - new->start = this->end + 1; + start = this->end + 1; this = this->sibling; } return -EBUSY; -- cgit v1.2.3 From b71a8eb0fa64ec6d00175f479e3ef851703568af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 6 Oct 2009 12:42:51 +0200 Subject: tree-wide: fix typos "selct" + "slect" -> "select" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch was generated by git grep -E -i -l 's(le|el)ct' | xargs -r perl -p -i -e 's/([Ss])(le|el)ct/$1elect/ with only skipping net/netfilter/xt_SECMARK.c and include/linux/netfilter/xt_SECMARK.h which have a struct member called selctx. Signed-off-by: Uwe Kleine-König Signed-off-by: Jiri Kosina --- kernel/time/clocksource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 5e18c6ab2c6..c403567f78c 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -580,7 +580,7 @@ sysfs_show_current_clocksources(struct sys_device *dev, * @count: length of buffer * * Takes input from sysfs interface for manually overriding the default - * clocksource selction. + * clocksource selection. */ static ssize_t sysfs_override_clocksource(struct sys_device *dev, struct sysdev_attribute *attr, -- cgit v1.2.3 From dc186ad741c12ae9ecac8b89e317ef706fdaf8f6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 16 Nov 2009 01:09:48 +0900 Subject: workqueue: Add debugobjects support Add debugobject support to track the life time of work_structs. While at it, remove duplicate definition of INIT_DELAYED_WORK_ON_STACK(). 
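[Editorial aside, not part of the patch: a hedged, kernel-style sketch of the on-stack pattern this patch instruments. The structure and function names around the work item are illustrative; the point is to initialize with the _ON_STACK variant so debugobjects knows the object is neither static nor heap-allocated, and to destroy it before the stack frame disappears.]

#include <linux/workqueue.h>
#include <linux/completion.h>

struct stack_barrier {
	struct work_struct	work;
	struct completion	done;
};

static void barrier_fn(struct work_struct *work)
{
	struct stack_barrier *b = container_of(work, struct stack_barrier, work);

	complete(&b->done);
}

static void wait_for_my_work(struct workqueue_struct *wq)
{
	struct stack_barrier b;

	INIT_WORK_ON_STACK(&b.work, barrier_fn);	/* tracked as an on-stack object */
	init_completion(&b.done);
	queue_work(wq, &b.work);
	wait_for_completion(&b.done);			/* wait until the item has run */
	destroy_work_on_stack(&b.work);			/* tell debugobjects it is going away */
}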
Signed-off-by: Thomas Gleixner Signed-off-by: Tejun Heo --- kernel/workqueue.c | 131 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 128 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 12328147132..ddad63fbb61 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -68,6 +68,116 @@ struct workqueue_struct { #endif }; +#ifdef CONFIG_DEBUG_OBJECTS_WORK + +static struct debug_obj_descr work_debug_descr; + +/* + * fixup_init is called when: + * - an active object is initialized + */ +static int work_fixup_init(void *addr, enum debug_obj_state state) +{ + struct work_struct *work = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + cancel_work_sync(work); + debug_object_init(work, &work_debug_descr); + return 1; + default: + return 0; + } +} + +/* + * fixup_activate is called when: + * - an active object is activated + * - an unknown object is activated (might be a statically initialized object) + */ +static int work_fixup_activate(void *addr, enum debug_obj_state state) +{ + struct work_struct *work = addr; + + switch (state) { + + case ODEBUG_STATE_NOTAVAILABLE: + /* + * This is not really a fixup. The work struct was + * statically initialized. We just make sure that it + * is tracked in the object tracker. + */ + if (test_bit(WORK_STRUCT_STATIC, work_data_bits(work))) { + debug_object_init(work, &work_debug_descr); + debug_object_activate(work, &work_debug_descr); + return 0; + } + WARN_ON_ONCE(1); + return 0; + + case ODEBUG_STATE_ACTIVE: + WARN_ON(1); + + default: + return 0; + } +} + +/* + * fixup_free is called when: + * - an active object is freed + */ +static int work_fixup_free(void *addr, enum debug_obj_state state) +{ + struct work_struct *work = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + cancel_work_sync(work); + debug_object_free(work, &work_debug_descr); + return 1; + default: + return 0; + } +} + +static struct debug_obj_descr work_debug_descr = { + .name = "work_struct", + .fixup_init = work_fixup_init, + .fixup_activate = work_fixup_activate, + .fixup_free = work_fixup_free, +}; + +static inline void debug_work_activate(struct work_struct *work) +{ + debug_object_activate(work, &work_debug_descr); +} + +static inline void debug_work_deactivate(struct work_struct *work) +{ + debug_object_deactivate(work, &work_debug_descr); +} + +void __init_work(struct work_struct *work, int onstack) +{ + if (onstack) + debug_object_init_on_stack(work, &work_debug_descr); + else + debug_object_init(work, &work_debug_descr); +} +EXPORT_SYMBOL_GPL(__init_work); + +void destroy_work_on_stack(struct work_struct *work) +{ + debug_object_free(work, &work_debug_descr); +} +EXPORT_SYMBOL_GPL(destroy_work_on_stack); + +#else +static inline void debug_work_activate(struct work_struct *work) { } +static inline void debug_work_deactivate(struct work_struct *work) { } +#endif + /* Serializes the accesses to the list of workqueues. 
*/ static DEFINE_SPINLOCK(workqueue_lock); static LIST_HEAD(workqueues); @@ -145,6 +255,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq, { unsigned long flags; + debug_work_activate(work); spin_lock_irqsave(&cwq->lock, flags); insert_work(cwq, work, &cwq->worklist); spin_unlock_irqrestore(&cwq->lock, flags); @@ -280,6 +391,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq) struct lockdep_map lockdep_map = work->lockdep_map; #endif trace_workqueue_execution(cwq->thread, work); + debug_work_deactivate(work); cwq->current_work = work; list_del_init(cwq->worklist.next); spin_unlock_irq(&cwq->lock); @@ -350,11 +462,18 @@ static void wq_barrier_func(struct work_struct *work) static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, struct wq_barrier *barr, struct list_head *head) { - INIT_WORK(&barr->work, wq_barrier_func); + /* + * debugobject calls are safe here even with cwq->lock locked + * as we know for sure that this will not trigger any of the + * checks and call back into the fixup functions where we + * might deadlock. + */ + INIT_WORK_ON_STACK(&barr->work, wq_barrier_func); __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); init_completion(&barr->done); + debug_work_activate(&barr->work); insert_work(cwq, &barr->work, head); } @@ -372,8 +491,10 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) } spin_unlock_irq(&cwq->lock); - if (active) + if (active) { wait_for_completion(&barr.done); + destroy_work_on_stack(&barr.work); + } return active; } @@ -451,6 +572,7 @@ out: return 0; wait_for_completion(&barr.done); + destroy_work_on_stack(&barr.work); return 1; } EXPORT_SYMBOL_GPL(flush_work); @@ -485,6 +607,7 @@ static int try_to_grab_pending(struct work_struct *work) */ smp_rmb(); if (cwq == get_wq_data(work)) { + debug_work_deactivate(work); list_del_init(&work->entry); ret = 1; } @@ -507,8 +630,10 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq, } spin_unlock_irq(&cwq->lock); - if (unlikely(running)) + if (unlikely(running)) { wait_for_completion(&barr.done); + destroy_work_on_stack(&barr.work); + } } static void wait_on_work(struct work_struct *work) -- cgit v1.2.3 From fb3d38b9904888aa8e36d88b2388dc70934b2169 Mon Sep 17 00:00:00 2001 From: Liuweni Date: Mon, 2 Nov 2009 15:55:03 +0100 Subject: fix kerneldoc for set_irq_msi() Signed-off-by: Liuweni Signed-off-by: Jiri Kosina --- kernel/irq/chip.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index c1660194d11..ae70e93b29b 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -166,11 +166,11 @@ int set_irq_data(unsigned int irq, void *data) EXPORT_SYMBOL(set_irq_data); /** - * set_irq_data - set irq type data for an irq + * set_irq_msi - set irq type data for an irq * @irq: Interrupt number * @entry: Pointer to MSI descriptor data * - * Set the hardware irq controller data for an irq + * Set the MSI descriptor entry for an irq */ int set_irq_msi(unsigned int irq, struct msi_desc *entry) { -- cgit v1.2.3 From fbfecd3712f917ca210a55c157233d88b785896b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 28 Oct 2009 20:11:04 +0100 Subject: tree-wide: fix typos "couter" -> "counter" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch was generated by git grep -E -i -l 'couter' | xargs -r perl -p -i -e 's/couter/counter/' Signed-off-by: Uwe Kleine-König Signed-off-by: Jiri Kosina --- kernel/irq/spurious.c 
| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 114e704760f..abd4fb77f8c 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -230,7 +230,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, /* * If we are seeing only the odd spurious IRQ caused by * bus asynchronicity then don't eventually trigger an error, - * otherwise the couter becomes a doomsday timer for otherwise + * otherwise the counter becomes a doomsday timer for otherwise * working systems */ if (time_after(jiffies, desc->last_unhandled + HZ/10)) -- cgit v1.2.3 From af901ca181d92aac3a7dc265144a9081a86d8f39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Goddard=20Rosa?= Date: Sat, 14 Nov 2009 13:09:05 -0200 Subject: tree-wide: fix assorted typos all over the place MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit That is "success", "unknown", "through", "performance", "[re|un]mapping" , "access", "default", "reasonable", "[con]currently", "temperature" , "channel", "[un]used", "application", "example","hierarchy", "therefore" , "[over|under]flow", "contiguous", "threshold", "enough" and others. Signed-off-by: André Goddard Rosa Signed-off-by: Jiri Kosina --- kernel/perf_event.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 7f29643c898..759a629cc4b 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -419,7 +419,7 @@ static void perf_event_remove_from_context(struct perf_event *event) if (!task) { /* * Per cpu events are removed via an smp call and - * the removal is always sucessful. + * the removal is always successful. */ smp_call_function_single(event->cpu, __perf_event_remove_from_context, @@ -827,7 +827,7 @@ perf_install_in_context(struct perf_event_context *ctx, if (!task) { /* * Per cpu events are installed via an smp call and - * the install is always sucessful. + * the install is always successful. */ smp_call_function_single(cpu, __perf_install_in_context, event, 1); -- cgit v1.2.3 From 2f0993e0fb663c49e4d1e02654f6203246be4817 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 5 Dec 2009 07:06:10 +0100 Subject: hw-breakpoints: Drop callback and task parameters from modify helper Drop the callback and task parameters from modify_user_hw_breakpoint(). For now we have no user that need to modify a breakpoint to the point of changing its handler or its task context. Signed-off-by: Frederic Weisbecker Cc: "K. 
Prasad" --- kernel/hw_breakpoint.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index cf5ee162841..2d10b012828 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -312,9 +312,7 @@ EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); * @tsk: pointer to 'task_struct' of the process to which the address belongs */ struct perf_event * -modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr, - perf_callback_t triggered, - struct task_struct *tsk) +modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) { /* * FIXME: do it without unregistering @@ -323,7 +321,8 @@ modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr, */ unregister_hw_breakpoint(bp); - return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered); + return perf_event_create_kernel_counter(attr, -1, bp->ctx->task->pid, + bp->callback); } EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); -- cgit v1.2.3 From b326e9560a28fc3e950637ef51847ed8f05c1335 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 5 Dec 2009 09:44:31 +0100 Subject: hw-breakpoints: Use overflow handler instead of the event callback struct perf_event::event callback was called when a breakpoint triggers. But this is a rather opaque callback, pretty tied-only to the breakpoint API and not really integrated into perf as it triggers even when we don't overflow. We prefer to use overflow_handler() as it fits into the perf events rules, being called only when we overflow. Reported-by: Peter Zijlstra Signed-off-by: Frederic Weisbecker Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: "K. Prasad" --- kernel/hw_breakpoint.c | 17 +++++------------ kernel/perf_event.c | 24 +++++++++--------------- kernel/trace/trace_ksym.c | 5 +++-- 3 files changed, 17 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 2d10b012828..b600fc27f16 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -259,7 +259,7 @@ void release_bp_slot(struct perf_event *bp) } -int __register_perf_hw_breakpoint(struct perf_event *bp) +int register_perf_hw_breakpoint(struct perf_event *bp) { int ret; @@ -276,19 +276,12 @@ int __register_perf_hw_breakpoint(struct perf_event *bp) * This is a quick hack that will be removed soon, once we remove * the tmp breakpoints from ptrace */ - if (!bp->attr.disabled || bp->callback == perf_bp_event) + if (!bp->attr.disabled || !bp->overflow_handler) ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task); return ret; } -int register_perf_hw_breakpoint(struct perf_event *bp) -{ - bp->callback = perf_bp_event; - - return __register_perf_hw_breakpoint(bp); -} - /** * register_user_hw_breakpoint - register a hardware breakpoint for user space * @attr: breakpoint attributes @@ -297,7 +290,7 @@ int register_perf_hw_breakpoint(struct perf_event *bp) */ struct perf_event * register_user_hw_breakpoint(struct perf_event_attr *attr, - perf_callback_t triggered, + perf_overflow_handler_t triggered, struct task_struct *tsk) { return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered); @@ -322,7 +315,7 @@ modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) unregister_hw_breakpoint(bp); return perf_event_create_kernel_counter(attr, -1, bp->ctx->task->pid, - bp->callback); + bp->overflow_handler); } EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); @@ -347,7 +340,7 @@ 
EXPORT_SYMBOL_GPL(unregister_hw_breakpoint); */ struct perf_event ** register_wide_hw_breakpoint(struct perf_event_attr *attr, - perf_callback_t triggered) + perf_overflow_handler_t triggered) { struct perf_event **cpu_events, **pevent, *bp; long err; diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 6b7ddba1dd6..fd43ff4ac86 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4286,15 +4286,8 @@ static void bp_perf_event_destroy(struct perf_event *event) static const struct pmu *bp_perf_event_init(struct perf_event *bp) { int err; - /* - * The breakpoint is already filled if we haven't created the counter - * through perf syscall - * FIXME: manage to get trigerred to NULL if it comes from syscalls - */ - if (!bp->callback) - err = register_perf_hw_breakpoint(bp); - else - err = __register_perf_hw_breakpoint(bp); + + err = register_perf_hw_breakpoint(bp); if (err) return ERR_PTR(err); @@ -4390,7 +4383,7 @@ perf_event_alloc(struct perf_event_attr *attr, struct perf_event_context *ctx, struct perf_event *group_leader, struct perf_event *parent_event, - perf_callback_t callback, + perf_overflow_handler_t overflow_handler, gfp_t gfpflags) { const struct pmu *pmu; @@ -4433,10 +4426,10 @@ perf_event_alloc(struct perf_event_attr *attr, event->state = PERF_EVENT_STATE_INACTIVE; - if (!callback && parent_event) - callback = parent_event->callback; + if (!overflow_handler && parent_event) + overflow_handler = parent_event->overflow_handler; - event->callback = callback; + event->overflow_handler = overflow_handler; if (attr->disabled) event->state = PERF_EVENT_STATE_OFF; @@ -4776,7 +4769,8 @@ err_put_context: */ struct perf_event * perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, - pid_t pid, perf_callback_t callback) + pid_t pid, + perf_overflow_handler_t overflow_handler) { struct perf_event *event; struct perf_event_context *ctx; @@ -4793,7 +4787,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, } event = perf_event_alloc(attr, cpu, ctx, NULL, - NULL, callback, GFP_KERNEL); + NULL, overflow_handler, GFP_KERNEL); if (IS_ERR(event)) { err = PTR_ERR(event); goto err_put_context; diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c index ddfa0fd43bc..acb87d4a4ac 100644 --- a/kernel/trace/trace_ksym.c +++ b/kernel/trace/trace_ksym.c @@ -79,11 +79,12 @@ void ksym_collect_stats(unsigned long hbp_hit_addr) } #endif /* CONFIG_PROFILE_KSYM_TRACER */ -void ksym_hbp_handler(struct perf_event *hbp, void *data) +void ksym_hbp_handler(struct perf_event *hbp, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) { struct ring_buffer_event *event; struct ksym_trace_entry *entry; - struct pt_regs *regs = data; struct ring_buffer *buffer; int pc; -- cgit v1.2.3 From 109d71c6dd52ec08878c2c67eb4c0bd67fcbc80b Mon Sep 17 00:00:00 2001 From: Frank Rowand Date: Thu, 19 Nov 2009 13:42:06 -0800 Subject: lockstat: Fix min, max times in /proc/lock_stats Fix min, max times in /proc/lock_stats (1) When collecting lock hold and wait times, if the current minimum time is zero, it will be replaced by the next time. (2) When aggregating minimum and maximum lock hold and wait times accross cpus, the values are added, instead of selecting the minimum and maximum. 
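[Editorial aside, not part of the patch: the corrected aggregation logic restated as a small, self-contained userspace program (structure and function names shortened for illustration). The two rules are: skip sources that never recorded a sample, and merge min/max by comparison rather than addition.]

#include <stdio.h>

struct lock_time {
	unsigned long long	min, max, total;
	unsigned long		nr;
};

static void lock_time_merge(const struct lock_time *src, struct lock_time *dst)
{
	if (!src->nr)				/* this CPU never recorded a sample */
		return;

	if (src->max > dst->max)		/* max of maxes ... */
		dst->max = src->max;
	if (src->min < dst->min || !dst->nr)	/* ... min of mins, seeding an empty dst */
		dst->min = src->min;

	dst->total += src->total;		/* totals and counts still add up */
	dst->nr += src->nr;
}

int main(void)
{
	struct lock_time a = { .min = 5, .max = 90, .total = 200, .nr = 4 };
	struct lock_time b = { .min = 2, .max = 40, .total = 100, .nr = 3 };
	struct lock_time sum = { 0 };

	lock_time_merge(&a, &sum);
	lock_time_merge(&b, &sum);
	printf("min=%llu max=%llu nr=%lu\n", sum.min, sum.max, sum.nr);
	return 0;
}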
Signed-off-by: Frank Rowand Signed-off-by: Peter Zijlstra LKML-Reference: <4B05BBAE.2050005@am.sony.com> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index f5dcd36d315..7a3ae56b3a7 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -168,7 +168,7 @@ static void lock_time_inc(struct lock_time *lt, u64 time) if (time > lt->max) lt->max = time; - if (time < lt->min || !lt->min) + if (time < lt->min || !lt->nr) lt->min = time; lt->total += time; @@ -177,8 +177,15 @@ static void lock_time_inc(struct lock_time *lt, u64 time) static inline void lock_time_add(struct lock_time *src, struct lock_time *dst) { - dst->min += src->min; - dst->max += src->max; + if (!src->nr) + return; + + if (src->max > dst->max) + dst->max = src->max; + + if (src->min < dst->min || !dst->nr) + dst->min = src->min; + dst->total += src->total; dst->nr += src->nr; } -- cgit v1.2.3 From e1b8090bdf125f8b2e192149547fead7f302a89c Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Sun, 6 Dec 2009 20:41:16 +0100 Subject: cpumask: Fix generate_sched_domains() for UP Commit acc3f5d7cabbfd6cec71f0c1f9900621fa2d6ae7 ("cpumask: Partition_sched_domains takes array of cpumask_var_t") changed the function signature of generate_sched_domains() for the CONFIG_SMP=y case, but forgot to update the corresponding function for the CONFIG_SMP=n case, causing: kernel/cpuset.c:2073: warning: passing argument 1 of 'generate_sched_domains' from incompatible pointer type Signed-off-by: Geert Uytterhoeven Cc: Rusty Russell Cc: Linus Torvalds LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/cpuset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3cf2183b472..43fb7e80002 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -737,7 +737,7 @@ static void do_rebuild_sched_domains(struct work_struct *unused) { } -static int generate_sched_domains(struct cpumask **domains, +static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr **attributes) { *domains = NULL; -- cgit v1.2.3 From 6ad4c18884e864cf4c77f9074d3d1816063f99cd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 25 Nov 2009 13:31:39 +0100 Subject: sched: Fix balance vs hotplug race Since (e761b77: cpu hotplug, sched: Introduce cpu_active_map and redo sched domain managment) we have cpu_active_mask which is suppose to rule scheduler migration and load-balancing, except it never (fully) did. The particular problem being solved here is a crash in try_to_wake_up() where select_task_rq() ends up selecting an offline cpu because select_task_rq_fair() trusts the sched_domain tree to reflect the current state of affairs, similarly select_task_rq_rt() trusts the root_domain. However, the sched_domains are updated from CPU_DEAD, which is after the cpu is taken offline and after stop_machine is done. Therefore it can race perfectly well with code assuming the domains are right. Cure this by building the domains from cpu_active_mask on CPU_DOWN_PREPARE. 
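[Editorial aside, not part of the patch: a condensed, kernel-style sketch of the rule the patch establishes — placement decisions should intersect with cpu_active_mask, which is cleared early in CPU_DOWN_PREPARE, rather than with cpu_online_mask, which only changes once the CPU is already being torn down. The helper name is illustrative.]

#include <linux/cpumask.h>
#include <linux/sched.h>
#include <linux/errno.h>

/* Pick somewhere the task is both allowed to run and that still accepts work. */
static int pick_active_cpu(struct task_struct *p)
{
	int cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);

	if (cpu >= nr_cpu_ids)		/* no allowed CPU is active any more */
		return -EAGAIN;
	return cpu;
}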
Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/cpu.c | 18 +++++++++++++----- kernel/cpuset.c | 16 +++++++++------- kernel/sched.c | 32 +++++++++++++++++--------------- 3 files changed, 39 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 6ba0f1ecb21..b2168864037 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -212,6 +212,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); if (err == NOTIFY_BAD) { + set_cpu_active(cpu, true); + nr_calls--; __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); @@ -223,11 +225,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) /* Ensure that we are not runnable on dying cpu */ cpumask_copy(old_allowed, ¤t->cpus_allowed); - set_cpus_allowed_ptr(current, - cpumask_of(cpumask_any_but(cpu_online_mask, cpu))); + set_cpus_allowed_ptr(current, cpu_active_mask); err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { + set_cpu_active(cpu, true); /* CPU didn't die: tell everyone. Can't complain. */ if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, hcpu) == NOTIFY_BAD) @@ -292,9 +294,6 @@ int __ref cpu_down(unsigned int cpu) err = _cpu_down(cpu, 0); - if (cpu_online(cpu)) - set_cpu_active(cpu, true); - out: cpu_maps_update_done(); stop_machine_destroy(); @@ -387,6 +386,15 @@ int disable_nonboot_cpus(void) * with the userspace trying to use the CPU hotplug at the same time */ cpumask_clear(frozen_cpus); + + for_each_online_cpu(cpu) { + if (cpu == first_cpu) + continue; + set_cpu_active(cpu, false); + } + + synchronize_sched(); + printk("Disabling non-boot CPUs ...\n"); for_each_online_cpu(cpu) { if (cpu == first_cpu) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 43fb7e80002..ba401fab459 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -872,7 +872,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (retval < 0) return retval; - if (!cpumask_subset(trialcs->cpus_allowed, cpu_online_mask)) + if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask)) return -EINVAL; } retval = validate_change(cs, trialcs); @@ -2010,7 +2010,7 @@ static void scan_for_empty_cpusets(struct cpuset *root) } /* Continue past cpusets with all cpus, mems online */ - if (cpumask_subset(cp->cpus_allowed, cpu_online_mask) && + if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) && nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) continue; @@ -2019,7 +2019,7 @@ static void scan_for_empty_cpusets(struct cpuset *root) /* Remove offline cpus and mems from this cpuset. 
*/ mutex_lock(&callback_mutex); cpumask_and(cp->cpus_allowed, cp->cpus_allowed, - cpu_online_mask); + cpu_active_mask); nodes_and(cp->mems_allowed, cp->mems_allowed, node_states[N_HIGH_MEMORY]); mutex_unlock(&callback_mutex); @@ -2057,8 +2057,10 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, switch (phase) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: break; default: @@ -2067,7 +2069,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, cgroup_lock(); mutex_lock(&callback_mutex); - cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); + cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); mutex_unlock(&callback_mutex); scan_for_empty_cpusets(&top_cpuset); ndoms = generate_sched_domains(&doms, &attr); @@ -2114,7 +2116,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self, void __init cpuset_init_smp(void) { - cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); + cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; hotcpu_notifier(cpuset_track_online_cpus, 0); diff --git a/kernel/sched.c b/kernel/sched.c index aa31244caa9..281da29d080 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4134,7 +4134,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, unsigned long flags; struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); - cpumask_copy(cpus, cpu_online_mask); + cpumask_copy(cpus, cpu_active_mask); /* * When power savings policy is enabled for the parent domain, idle @@ -4297,7 +4297,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) int all_pinned = 0; struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); - cpumask_copy(cpus, cpu_online_mask); + cpumask_copy(cpus, cpu_active_mask); /* * When power savings policy is enabled for the parent domain, idle @@ -4694,7 +4694,7 @@ int select_nohz_load_balancer(int stop_tick) cpumask_set_cpu(cpu, nohz.cpu_mask); /* time for ilb owner also to sleep */ - if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { + if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) { if (atomic_read(&nohz.load_balancer) == cpu) atomic_set(&nohz.load_balancer, -1); return 0; @@ -7093,7 +7093,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) int ret = 0; rq = task_rq_lock(p, &flags); - if (!cpumask_intersects(new_mask, cpu_online_mask)) { + if (!cpumask_intersects(new_mask, cpu_active_mask)) { ret = -EINVAL; goto out; } @@ -7115,7 +7115,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; - if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { + if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) { /* Need help from migration thread: drop lock and wait. */ struct task_struct *mt = rq->migration_thread; @@ -7269,19 +7269,19 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) again: /* Look for allowed, online CPU in same node. */ - for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask) + for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) goto move; /* Any allowed, online CPU? 
*/ - dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask); + dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); if (dest_cpu < nr_cpu_ids) goto move; /* No more Mr. Nice Guy. */ if (dest_cpu >= nr_cpu_ids) { cpuset_cpus_allowed_locked(p, &p->cpus_allowed); - dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed); + dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); /* * Don't tell them about moving exiting tasks or @@ -7310,7 +7310,7 @@ move: */ static void migrate_nr_uninterruptible(struct rq *rq_src) { - struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask)); + struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); unsigned long flags; local_irq_save(flags); @@ -7564,7 +7564,7 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu) static struct ctl_table_header *sd_sysctl_header; static void register_sched_domain_sysctl(void) { - int i, cpu_num = num_online_cpus(); + int i, cpu_num = num_possible_cpus(); struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); char buf[32]; @@ -7574,7 +7574,7 @@ static void register_sched_domain_sysctl(void) if (entry == NULL) return; - for_each_online_cpu(i) { + for_each_possible_cpu(i) { snprintf(buf, 32, "cpu%d", i); entry->procname = kstrdup(buf, GFP_KERNEL); entry->mode = 0555; @@ -9100,7 +9100,7 @@ match1: if (doms_new == NULL) { ndoms_cur = 0; doms_new = &fallback_doms; - cpumask_andnot(doms_new[0], cpu_online_mask, cpu_isolated_map); + cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); WARN_ON_ONCE(dattr_new); } @@ -9231,8 +9231,10 @@ static int update_sched_domains(struct notifier_block *nfb, switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: partition_sched_domains(1, NULL, NULL); return NOTIFY_OK; @@ -9279,7 +9281,7 @@ void __init sched_init_smp(void) #endif get_online_cpus(); mutex_lock(&sched_domains_mutex); - arch_init_sched_domains(cpu_online_mask); + arch_init_sched_domains(cpu_active_mask); cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); if (cpumask_empty(non_isolated_cpus)) cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); -- cgit v1.2.3 From 56053170ea2a2c0dc17420e9b94aa3ca51d80408 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 7 Dec 2009 06:46:48 +0100 Subject: hw-breakpoints: Fix task-bound breakpoint slot allocation Whatever the context nature of a breakpoint, we always perform the following constraint checks before allocating it a slot: - Check the number of pinned breakpoint bound the concerned cpus - Check the max number of task-bound breakpoints that are belonging to a task. - Add both and see if we have a reamining slot for the new breakpoint This is the right thing to do when we are about to register a cpu-only bound breakpoint. But not if we are dealing with a task bound breakpoint. What we want in this case is: - Check the number of pinned breakpoint bound the concerned cpus - Check the number of breakpoints that already belong to the task in which the breakpoint to register is bound to. - Add both This fixes a regression that makes the "firefox -g" command fail to register breakpoints once we deal with a secondary thread. 
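[Editorial aside, not part of the patch: the arithmetic behind the fix, shown in isolation. This toy program is purely illustrative (HBP_NUM = 4 mirrors x86); it contrasts the old check, which always added the busiest task's pinned count, with the new one, which only counts the task the breakpoint is bound to.]

#include <stdio.h>

#define HBP_NUM 4	/* debug registers per CPU on x86 */

/* Slots already spoken for, from the point of view of one CPU. */
static int pinned_slots(int cpu_pinned, int max_task_pinned,
			int this_task_pinned, int task_bound)
{
	/* Task-bound request: only breakpoints of the same task compete with it. */
	return cpu_pinned + (task_bound ? this_task_pinned : max_task_pinned);
}

int main(void)
{
	/* One cpu-pinned breakpoint, an unrelated task pinning three,
	 * and our task pinning one while asking for a second. */
	int old_check = pinned_slots(1, 3, 1, 0);	/* pre-fix: busiest task counted */
	int new_check = pinned_slots(1, 3, 1, 1);	/* fixed: only our own task counted */

	printf("old: %s\n", old_check < HBP_NUM ? "slot available" : "rejected");
	printf("new: %s\n", new_check < HBP_NUM ? "slot available" : "rejected");
	return 0;
}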
Reported-by: Walt Signed-off-by: Frederic Weisbecker Cc: Prasad --- kernel/hw_breakpoint.c | 74 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 45 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index b600fc27f16..02b492504a5 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -83,15 +83,51 @@ static unsigned int max_task_bp_pinned(int cpu) return 0; } +static int task_bp_pinned(struct task_struct *tsk) +{ + struct perf_event_context *ctx = tsk->perf_event_ctxp; + struct list_head *list; + struct perf_event *bp; + unsigned long flags; + int count = 0; + + if (WARN_ONCE(!ctx, "No perf context for this task")) + return 0; + + list = &ctx->event_list; + + spin_lock_irqsave(&ctx->lock, flags); + + /* + * The current breakpoint counter is not included in the list + * at the open() callback time + */ + list_for_each_entry(bp, list, event_entry) { + if (bp->attr.type == PERF_TYPE_BREAKPOINT) + count++; + } + + spin_unlock_irqrestore(&ctx->lock, flags); + + return count; +} + /* * Report the number of pinned/un-pinned breakpoints we have in * a given cpu (cpu > -1) or in all of them (cpu = -1). */ -static void fetch_bp_busy_slots(struct bp_busy_slots *slots, int cpu) +static void +fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp) { + int cpu = bp->cpu; + struct task_struct *tsk = bp->ctx->task; + if (cpu >= 0) { slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu); - slots->pinned += max_task_bp_pinned(cpu); + if (!tsk) + slots->pinned += max_task_bp_pinned(cpu); + else + slots->pinned += task_bp_pinned(tsk); slots->flexible = per_cpu(nr_bp_flexible, cpu); return; @@ -101,7 +137,10 @@ static void fetch_bp_busy_slots(struct bp_busy_slots *slots, int cpu) unsigned int nr; nr = per_cpu(nr_cpu_bp_pinned, cpu); - nr += max_task_bp_pinned(cpu); + if (!tsk) + nr += max_task_bp_pinned(cpu); + else + nr += task_bp_pinned(tsk); if (nr > slots->pinned) slots->pinned = nr; @@ -118,33 +157,10 @@ static void fetch_bp_busy_slots(struct bp_busy_slots *slots, int cpu) */ static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable) { - int count = 0; - struct perf_event *bp; - struct perf_event_context *ctx = tsk->perf_event_ctxp; unsigned int *tsk_pinned; - struct list_head *list; - unsigned long flags; - - if (WARN_ONCE(!ctx, "No perf context for this task")) - return; - - list = &ctx->event_list; - - spin_lock_irqsave(&ctx->lock, flags); - - /* - * The current breakpoint counter is not included in the list - * at the open() callback time - */ - list_for_each_entry(bp, list, event_entry) { - if (bp->attr.type == PERF_TYPE_BREAKPOINT) - count++; - } - - spin_unlock_irqrestore(&ctx->lock, flags); + int count = 0; - if (WARN_ONCE(count < 0, "No breakpoint counter found in the counter list")) - return; + count = task_bp_pinned(tsk); tsk_pinned = per_cpu(task_bp_pinned, cpu); if (enable) { @@ -233,7 +249,7 @@ int reserve_bp_slot(struct perf_event *bp) mutex_lock(&nr_bp_mutex); - fetch_bp_busy_slots(&slots, bp->cpu); + fetch_bp_busy_slots(&slots, bp); /* Flexible counters need to keep at least one slot */ if (slots.pinned + (!!slots.flexible) == HBP_NUM) { -- cgit v1.2.3 From 6ab8886326a1b9a3a8d164d8174e3c20703a03a2 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Tue, 8 Dec 2009 18:25:15 +1100 Subject: perf: hw_breakpoints: Fix percpu namespace clash Today's linux-next build failed with: kernel/hw_breakpoint.c:86: error: 'task_bp_pinned' redeclared as different kind of symbol ... 
Caused by commit dd17c8f72993f9461e9c19250e3f155d6d99df22 ("percpu: remove per_cpu__ prefix") from the percpu tree interacting with commit 56053170ea2a2c0dc17420e9b94aa3ca51d80408 ("hw-breakpoints: Fix task-bound breakpoint slot allocation") from the tip tree. Signed-off-by: Stephen Rothwell Acked-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Tejun Heo Cc: Rusty Russell Cc: Christoph Lameter Cc: Andrew Morton Cc: Linus Torvalds LKML-Reference: <20091208182515.bb6dda4a.sfr@canb.auug.org.au> Signed-off-by: Ingo Molnar --- kernel/hw_breakpoint.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 02b492504a5..03a0773ac2b 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -52,7 +52,7 @@ static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned); /* Number of pinned task breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int, task_bp_pinned[HBP_NUM]); +static DEFINE_PER_CPU(unsigned int, nr_task_bp_pinned[HBP_NUM]); /* Number of non-pinned cpu/task breakpoints in a cpu */ static DEFINE_PER_CPU(unsigned int, nr_bp_flexible); @@ -73,7 +73,7 @@ static DEFINE_MUTEX(nr_bp_mutex); static unsigned int max_task_bp_pinned(int cpu) { int i; - unsigned int *tsk_pinned = per_cpu(task_bp_pinned, cpu); + unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned, cpu); for (i = HBP_NUM -1; i >= 0; i--) { if (tsk_pinned[i] > 0) @@ -162,7 +162,7 @@ static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable) count = task_bp_pinned(tsk); - tsk_pinned = per_cpu(task_bp_pinned, cpu); + tsk_pinned = per_cpu(nr_task_bp_pinned, cpu); if (enable) { tsk_pinned[count]++; if (count > 0) @@ -209,7 +209,7 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable) * - If attached to a single cpu, check: * * (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu) - * + max(per_cpu(task_bp_pinned, cpu)))) < HBP_NUM + * + max(per_cpu(nr_task_bp_pinned, cpu)))) < HBP_NUM * * -> If there are already non-pinned counters in this cpu, it means * there is already a free slot for them. @@ -220,7 +220,7 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable) * - If attached to every cpus, check: * * (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *)) - * + max(per_cpu(task_bp_pinned, *)))) < HBP_NUM + * + max(per_cpu(nr_task_bp_pinned, *)))) < HBP_NUM * * -> This is roughly the same, except we check the number of per cpu * bp for every cpu and we keep the max one. Same for the per tasks @@ -232,7 +232,7 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable) * - If attached to a single cpu, check: * * ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu) - * + max(per_cpu(task_bp_pinned, cpu))) < HBP_NUM + * + max(per_cpu(nr_task_bp_pinned, cpu))) < HBP_NUM * * -> Same checks as before. But now the nr_bp_flexible, if any, must keep * one register at least (or they will never be fed). 
@@ -240,7 +240,7 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable) * - If attached to every cpus, check: * * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) - * + max(per_cpu(task_bp_pinned, *))) < HBP_NUM + * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM */ int reserve_bp_slot(struct perf_event *bp) { -- cgit v1.2.3 From 722d0172377a5697919b9f7e5beb95165b1dec4e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 8 Dec 2009 13:19:42 +0100 Subject: futex: Take mmap_sem for get_user_pages in fault_in_user_writeable get_user_pages() must be called with mmap_sem held. Signed-off-by: Andi Kleen Cc: stable@kernel.org Cc: Andrew Morton Cc: Nick Piggin Cc: Darren Hart Cc: Peter Zijlstra LKML-Reference: <20091208121942.GA21298@basil.fritz.box> Signed-off-by: Thomas Gleixner --- kernel/futex.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index fb65e822fc4..d73ef1f3e55 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -304,8 +304,14 @@ void put_futex_key(int fshared, union futex_key *key) */ static int fault_in_user_writeable(u32 __user *uaddr) { - int ret = get_user_pages(current, current->mm, (unsigned long)uaddr, - 1, 1, 0, NULL, NULL); + struct mm_struct *mm = current->mm; + int ret; + + down_read(&mm->mmap_sem); + ret = get_user_pages(current, mm, (unsigned long)uaddr, + 1, 1, 0, NULL, NULL); + up_read(&mm->mmap_sem); + return ret < 0 ? ret : 0; } -- cgit v1.2.3 From a7c312bed772c11138409c3a98531e85d690302e Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 8 Dec 2009 17:03:16 -0500 Subject: trace-kprobe: Support delete probe syntax Support delete probe syntax. The syntax is "-:[group/]event". Signed-off-by: Masami Hiramatsu Cc: Steven Rostedt Cc: Jim Keniston Cc: Ananth N Mavinakayanahalli Cc: Christoph Hellwig Cc: Frank Ch. Eigler Cc: Frederic Weisbecker Cc: Jason Baron Cc: K.Prasad Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Arnaldo Carvalho de Melo Cc: systemtap Cc: DLE LKML-Reference: <20091208220316.10142.39192.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Ingo Molnar Cc: Steven Rostedt Cc: Jim Keniston Cc: Ananth N Mavinakayanahalli Cc: Christoph Hellwig Cc: Frank Ch. 
Eigler Cc: Frederic Weisbecker Cc: Jason Baron Cc: K.Prasad Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Arnaldo Carvalho de Melo --- kernel/trace/trace_kprobe.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index aff5f80b59b..bf05fb49a6f 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -606,23 +606,22 @@ static int create_trace_probe(int argc, char **argv) */ struct trace_probe *tp; int i, ret = 0; - int is_return = 0; + int is_return = 0, is_delete = 0; char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL; unsigned long offset = 0; void *addr = NULL; char buf[MAX_EVENT_NAME_LEN]; - if (argc < 2) { - pr_info("Probe point is not specified.\n"); - return -EINVAL; - } - + /* argc must be >= 1 */ if (argv[0][0] == 'p') is_return = 0; else if (argv[0][0] == 'r') is_return = 1; + else if (argv[0][0] == '-') + is_delete = 1; else { - pr_info("Probe definition must be started with 'p' or 'r'.\n"); + pr_info("Probe definition must be started with 'p', 'r' or" + " '-'.\n"); return -EINVAL; } @@ -642,7 +641,29 @@ static int create_trace_probe(int argc, char **argv) return -EINVAL; } } + if (!group) + group = KPROBE_EVENT_SYSTEM; + if (is_delete) { + if (!event) { + pr_info("Delete command needs an event name.\n"); + return -EINVAL; + } + tp = find_probe_event(event, group); + if (!tp) { + pr_info("Event %s/%s doesn't exist.\n", group, event); + return -ENOENT; + } + /* delete an event */ + unregister_trace_probe(tp); + free_trace_probe(tp); + return 0; + } + + if (argc < 2) { + pr_info("Probe point is not specified.\n"); + return -EINVAL; + } if (isdigit(argv[1][0])) { if (is_return) { pr_info("Return probe point must be a symbol.\n"); @@ -671,8 +692,6 @@ static int create_trace_probe(int argc, char **argv) argc -= 2; argv += 2; /* setup a probe */ - if (!group) - group = KPROBE_EVENT_SYSTEM; if (!event) { /* Make a new event name */ if (symbol) -- cgit v1.2.3 From 44234adcdce38f83c56e05f808ce656175b4beeb Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 9 Dec 2009 09:25:48 +0100 Subject: hw-breakpoints: Modify breakpoints without unregistering them Currently, when ptrace needs to modify a breakpoint, like disabling it, changing its address, type or len, it calls modify_user_hw_breakpoint(). This latter will perform the heavy and racy task of unregistering the old breakpoint and registering a new one. This is racy as someone else might steal the reserved breakpoint slot under us, which is undesired as the breakpoint is only supposed to be modified, sometimes in the middle of a debugging workflow. We don't want our slot to be stolen in the middle. So instead of unregistering/registering the breakpoint, just disable it while we modify its breakpoint fields and re-enable it after if necessary. 
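[Editorial aside, not part of the patch: the flow of the new modify path, condensed into a readable, simplified sketch of the code in the diff below (kernel context assumed, not standalone): disable the event so its slot stays reserved, swap the fields, validate, then either re-enable or roll back.]

static int modify_bp_sketch(struct perf_event *bp, struct perf_event_attr *attr)
{
	u64 old_addr = bp->attr.bp_addr;
	int old_type = bp->attr.bp_type;
	int old_len  = bp->attr.bp_len;
	int err;

	perf_event_disable(bp);			/* slot stays reserved, event stops firing */

	bp->attr.bp_addr = attr->bp_addr;
	bp->attr.bp_type = attr->bp_type;
	bp->attr.bp_len  = attr->bp_len;

	err = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
	if (err) {				/* roll back to the known-good settings */
		bp->attr.bp_addr = old_addr;
		bp->attr.bp_type = old_type;
		bp->attr.bp_len  = old_len;
		if (!bp->attr.disabled)
			perf_event_enable(bp);
		return err;
	}

	if (!attr->disabled)
		perf_event_enable(bp);
	bp->attr.disabled = attr->disabled;

	return 0;
}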
Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Prasad LKML-Reference: <1260347148-5519-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/hw_breakpoint.c | 42 ++++++++++++++++++++++++++++++++---------- kernel/perf_event.c | 4 ++-- 2 files changed, 34 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 03a0773ac2b..366eedf949c 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -320,18 +320,40 @@ EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); * @triggered: callback to trigger when we hit the breakpoint * @tsk: pointer to 'task_struct' of the process to which the address belongs */ -struct perf_event * -modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) +int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) { - /* - * FIXME: do it without unregistering - * - We don't want to lose our slot - * - If the new bp is incorrect, don't lose the older one - */ - unregister_hw_breakpoint(bp); + u64 old_addr = bp->attr.bp_addr; + int old_type = bp->attr.bp_type; + int old_len = bp->attr.bp_len; + int err = 0; + + perf_event_disable(bp); + + bp->attr.bp_addr = attr->bp_addr; + bp->attr.bp_type = attr->bp_type; + bp->attr.bp_len = attr->bp_len; + + if (attr->disabled) + goto end; - return perf_event_create_kernel_counter(attr, -1, bp->ctx->task->pid, - bp->overflow_handler); + err = arch_validate_hwbkpt_settings(bp, bp->ctx->task); + if (!err) + perf_event_enable(bp); + + if (err) { + bp->attr.bp_addr = old_addr; + bp->attr.bp_type = old_type; + bp->attr.bp_len = old_len; + if (!bp->attr.disabled) + perf_event_enable(bp); + + return err; + } + +end: + bp->attr.disabled = attr->disabled; + + return 0; } EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); diff --git a/kernel/perf_event.c b/kernel/perf_event.c index fd43ff4ac86..3b0cf86eee8 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -567,7 +567,7 @@ static void __perf_event_disable(void *info) * is the current context on this CPU and preemption is disabled, * hence we can't get into perf_event_task_sched_out for this context. */ -static void perf_event_disable(struct perf_event *event) +void perf_event_disable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; @@ -971,7 +971,7 @@ static void __perf_event_enable(void *info) * perf_event_for_each_child or perf_event_for_each as described * for perf_event_disable. 
*/ -static void perf_event_enable(struct perf_event *event) +void perf_event_enable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; -- cgit v1.2.3 From aa5452d70c0d559310598b243b8b1033c10056e7 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Dec 2009 11:28:13 +0800 Subject: perf_event: Clean up __perf_event_init_context() Clean up the code a bit: - define 'perf_cpu_context' variable with 'static' - use kzalloc() instead of kmalloc() and memset() Signed-off-by: Xiao Guangrong Reviewed-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <4B1F194D.7080306@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 3b0cf86eee8..2b06c45bfba 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -36,7 +36,7 @@ /* * Each CPU has a list of per CPU events: */ -DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); +static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); int perf_max_events __read_mostly = 1; static int perf_reserved_percpu __read_mostly; @@ -1579,7 +1579,6 @@ static void __perf_event_init_context(struct perf_event_context *ctx, struct task_struct *task) { - memset(ctx, 0, sizeof(*ctx)); spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); INIT_LIST_HEAD(&ctx->group_list); @@ -1654,7 +1653,7 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu) } if (!ctx) { - ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL); + ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); err = -ENOMEM; if (!ctx) goto errout; @@ -5105,7 +5104,7 @@ int perf_event_init_task(struct task_struct *child) * First allocate and initialize a context for the child. */ - child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL); + child_ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); if (!child_ctx) return -ENOMEM; -- cgit v1.2.3 From b93f7978ad6b46133e9453b90ccc057dc2429e75 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Dec 2009 11:29:44 +0800 Subject: perf_event: Allocate children's perf_event_ctxp at the right time In current code, children task will allocate memory for 'child->perf_event_ctxp' if the parent is counted, we can do it only if the parent allowed children inherit it. It can save memory and reduce overhead. Signed-off-by: Xiao Guangrong Reviewed-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <4B1F19A8.5040805@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 2b06c45bfba..77641ae6b23 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -5083,7 +5083,7 @@ again: */ int perf_event_init_task(struct task_struct *child) { - struct perf_event_context *child_ctx, *parent_ctx; + struct perf_event_context *child_ctx = NULL, *parent_ctx; struct perf_event_context *cloned_ctx; struct perf_event *event; struct task_struct *parent = current; @@ -5098,20 +5098,6 @@ int perf_event_init_task(struct task_struct *child) if (likely(!parent->perf_event_ctxp)) return 0; - /* - * This is executed from the parent task context, so inherit - * events that have been marked for cloning. - * First allocate and initialize a context for the child. 
- */ - - child_ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); - if (!child_ctx) - return -ENOMEM; - - __perf_event_init_context(child_ctx, child); - child->perf_event_ctxp = child_ctx; - get_task_struct(child); - /* * If the parent's context is a clone, pin it so it won't get * swapped under us. @@ -5142,6 +5128,26 @@ int perf_event_init_task(struct task_struct *child) continue; } + if (!child->perf_event_ctxp) { + /* + * This is executed from the parent task context, so + * inherit events that have been marked for cloning. + * First allocate and initialize a context for the + * child. + */ + + child_ctx = kzalloc(sizeof(struct perf_event_context), + GFP_KERNEL); + if (!child_ctx) { + ret = -ENOMEM; + goto exit; + } + + __perf_event_init_context(child_ctx, child); + child->perf_event_ctxp = child_ctx; + get_task_struct(child); + } + ret = inherit_group(event, parent, parent_ctx, child, child_ctx); if (ret) { @@ -5170,6 +5176,7 @@ int perf_event_init_task(struct task_struct *child) get_ctx(child_ctx->parent_ctx); } +exit: mutex_unlock(&parent_ctx->mutex); perf_unpin_context(parent_ctx); -- cgit v1.2.3 From ec89a06fd4e12301f11ab039ee07d2353a18addc Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Dec 2009 11:30:36 +0800 Subject: perf_event: Cleanup for cpu_clock_perf_event_update() Using atomic64_xchg() instead of atomic64_read() and atomic64_set(). Signed-off-by: Xiao Guangrong Reviewed-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <4B1F19DC.90204@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 77641ae6b23..94e1b28333a 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4079,8 +4079,7 @@ static void cpu_clock_perf_event_update(struct perf_event *event) u64 now; now = cpu_clock(cpu); - prev = atomic64_read(&event->hw.prev_count); - atomic64_set(&event->hw.prev_count, now); + prev = atomic64_xchg(&event->hw.prev_count, now); atomic64_add(now - prev, &event->count); } -- cgit v1.2.3 From 3160568371da441b7f2fb57f2f1225404207e8f2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Dec 2009 20:24:16 +0000 Subject: sched: Protect task->cpus_allowed access in sched_getaffinity() sched_getaffinity() is not protected against a concurrent modification of the tasks affinity. Serialize the access with task_rq_lock(task). 
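[Editorial aside, not part of the patch: the underlying rule generalizes beyond the scheduler — a reader of a multi-word field must take the same lock its writer takes, or it can observe a half-updated value. A small userspace analogue, purely illustrative (the struct stands in for the task, the mutex for task_rq_lock(), and a two-word mask makes tearing actually possible):]

#include <pthread.h>
#include <stdio.h>
#include <string.h>

#define MASK_WORDS 2

struct fake_task {
	pthread_mutex_t lock;			/* stands in for the runqueue lock */
	unsigned long cpus_allowed[MASK_WORDS];
};

/* Reader: takes the same lock as the writer, so the copy is consistent. */
static void get_affinity(struct fake_task *t, unsigned long *out)
{
	pthread_mutex_lock(&t->lock);
	memcpy(out, t->cpus_allowed, sizeof(t->cpus_allowed));
	pthread_mutex_unlock(&t->lock);
}

/* Writer: the userspace stand-in for sched_setaffinity(). */
static void set_affinity(struct fake_task *t, const unsigned long *mask)
{
	pthread_mutex_lock(&t->lock);
	memcpy(t->cpus_allowed, mask, sizeof(t->cpus_allowed));
	pthread_mutex_unlock(&t->lock);
}

int main(void)
{
	struct fake_task t = { PTHREAD_MUTEX_INITIALIZER, { 0xff, 0x0 } };
	unsigned long mask[MASK_WORDS] = { 0xf, 0x0 };
	unsigned long seen[MASK_WORDS];

	set_affinity(&t, mask);
	get_affinity(&t, seen);
	printf("affinity: %#lx %#lx\n", seen[0], seen[1]);
	return 0;
}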
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra LKML-Reference: <20091208202026.769251187@linutronix.de> Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 281da29d080..c4635f74540 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6631,6 +6631,8 @@ SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, long sched_getaffinity(pid_t pid, struct cpumask *mask) { struct task_struct *p; + unsigned long flags; + struct rq *rq; int retval; get_online_cpus(); @@ -6645,7 +6647,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) if (retval) goto out_unlock; + rq = task_rq_lock(p, &flags); cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); + task_rq_unlock(rq, &flags); out_unlock: read_unlock(&tasklist_lock); -- cgit v1.2.3 From dba091b9e3522b9d32fc9975e48d3b69633b45f0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Dec 2009 09:32:03 +0100 Subject: sched: Protect sched_rr_get_param() access to task->sched_class sched_rr_get_param calls task->sched_class->get_rr_interval(task) without protection against a concurrent sched_setscheduler() call which modifies task->sched_class. Serialize the access with task_rq_lock(task) and hand the rq pointer into get_rr_interval() as it's needed at least in the sched_fair implementation. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 +++++- kernel/sched_fair.c | 6 +----- kernel/sched_idletask.c | 2 +- kernel/sched_rt.c | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c4635f74540..68db5a2e654 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6887,6 +6887,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, { struct task_struct *p; unsigned int time_slice; + unsigned long flags; + struct rq *rq; int retval; struct timespec t; @@ -6903,7 +6905,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, if (retval) goto out_unlock; - time_slice = p->sched_class->get_rr_interval(p); + rq = task_rq_lock(p, &flags); + time_slice = p->sched_class->get_rr_interval(rq, p); + task_rq_unlock(rq, &flags); read_unlock(&tasklist_lock); jiffies_to_timespec(time_slice, &t); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f61837ad336..613c1c74967 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2014,21 +2014,17 @@ static void moved_group_fair(struct task_struct *p) } #endif -unsigned int get_rr_interval_fair(struct task_struct *task) +unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) { struct sched_entity *se = &task->se; - unsigned long flags; - struct rq *rq; unsigned int rr_interval = 0; /* * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise * idle runqueue: */ - rq = task_rq_lock(task, &flags); if (rq->cfs.load.weight) rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); - task_rq_unlock(rq, &flags); return rr_interval; } diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index b133a28fcde..33d5384a73a 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -97,7 +97,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p, check_preempt_curr(rq, p, 0); } -unsigned int get_rr_interval_idle(struct task_struct *task) +unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) { return 0; } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 
5c5fef37841..aecbd9c6b20 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1721,7 +1721,7 @@ static void set_curr_task_rt(struct rq *rq) dequeue_pushable_task(rq, p); } -unsigned int get_rr_interval_rt(struct task_struct *task) +unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) { /* * Time slice is 0 for SCHED_FIFO tasks -- cgit v1.2.3 From 6b314d0e11924c803bf8cd944e87fd58cdb5088c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Dec 2009 18:58:05 +0100 Subject: sched: Remove sysctl.sched_features Since we've had a much saner debugfs interface to this, remove the sysctl one. Signed-off-by: Peter Zijlstra LKML-Reference: [ v2: build fix ] Signed-off-by: Ingo Molnar --- kernel/sysctl.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4dbf93a52ee..e5cc53514ca 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -314,14 +314,6 @@ static struct ctl_table kern_table[] = { .strategy = &sysctl_intvec, .extra1 = &zero, }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_features", - .data = &sysctl_sched_features, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, { .ctl_name = CTL_UNNUMBERED, .procname = "sched_migration_cost", -- cgit v1.2.3 From 970b13bacba14a8cef6f642861947df1d175b0b3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 25 Nov 2009 13:31:39 +0100 Subject: sched: Consolidate select_task_rq() callers Small cleanup. Signed-off-by: Peter Zijlstra LKML-Reference: [ v2: build fix ] Signed-off-by: Ingo Molnar --- kernel/sched.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 68db5a2e654..01fd131b47a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2323,6 +2323,14 @@ void task_oncpu_function_call(struct task_struct *p, preempt_enable(); } +#ifdef CONFIG_SMP +static inline +int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) +{ + return p->sched_class->select_task_rq(p, sd_flags, wake_flags); +} +#endif + /*** * try_to_wake_up - wake up a thread * @p: the to-be-woken-up thread @@ -2376,7 +2384,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, p->state = TASK_WAKING; task_rq_unlock(rq, &flags); - cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags); + cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); if (cpu != orig_cpu) { local_irq_save(flags); rq = cpu_rq(cpu); @@ -2593,7 +2601,7 @@ void sched_fork(struct task_struct *p, int clone_flags) p->sched_class = &fair_sched_class; #ifdef CONFIG_SMP - cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0); + cpu = select_task_rq(p, SD_BALANCE_FORK, 0); #endif local_irq_save(flags); update_rq_clock(cpu_rq(cpu)); @@ -3156,7 +3164,7 @@ out: void sched_exec(void) { int new_cpu, this_cpu = get_cpu(); - new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0); + new_cpu = select_task_rq(current, SD_BALANCE_EXEC, 0); put_cpu(); if (new_cpu != this_cpu) sched_migrate_task(current, new_cpu); -- cgit v1.2.3 From 5afcdab706d6002cb02b567ba46e650215e694e8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Nov 2009 14:12:25 +0100 Subject: sched: Remove rq->clock coupling from set_task_cpu() set_task_cpu() should be rq invariant and only touch task state, it currently fails to do so, which opens up a few races, since not all callers hold both rq->locks. 
Remove the relyance on rq->clock, as any site calling set_task_cpu() should also do a remote clock update, which should ensure the observed time between these two cpus is monotonic, as per kernel/sched_clock.c:sched_clock_remote(). Therefore we can simply remove the clock_offset bits and be happy. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 01fd131b47a..1f9c6d99f15 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2060,23 +2060,12 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { int old_cpu = task_cpu(p); - struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); + struct rq *old_rq = cpu_rq(old_cpu); struct cfs_rq *old_cfsrq = task_cfs_rq(p), *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); - u64 clock_offset; - - clock_offset = old_rq->clock - new_rq->clock; trace_sched_migrate_task(p, new_cpu); -#ifdef CONFIG_SCHEDSTATS - if (p->se.wait_start) - p->se.wait_start -= clock_offset; - if (p->se.sleep_start) - p->se.sleep_start -= clock_offset; - if (p->se.block_start) - p->se.block_start -= clock_offset; -#endif if (old_cpu != new_cpu) { p->se.nr_migrations++; #ifdef CONFIG_SCHEDSTATS -- cgit v1.2.3 From ab19cb23313733c55e0517607844b86720b35f5f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Nov 2009 15:44:43 +0100 Subject: sched: Clean up ttwu() rq locking Since set_task_clock() doesn't rely on rq->clock anymore we can simplyfy the mess in ttwu(). Optimize things a bit by not fiddling with the IRQ state there. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 1f9c6d99f15..c92670f8e09 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2371,17 +2371,14 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, if (task_contributes_to_load(p)) rq->nr_uninterruptible--; p->state = TASK_WAKING; - task_rq_unlock(rq, &flags); + __task_rq_unlock(rq); cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); - if (cpu != orig_cpu) { - local_irq_save(flags); - rq = cpu_rq(cpu); - update_rq_clock(rq); + if (cpu != orig_cpu) set_task_cpu(p, cpu); - local_irq_restore(flags); - } - rq = task_rq_lock(p, &flags); + + rq = __task_rq_lock(p); + update_rq_clock(rq); WARN_ON(p->state != TASK_WAKING); cpu = task_cpu(p); -- cgit v1.2.3 From cd29fe6f2637cc2ccbda5ac65f5332d6bf5fa3c6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Nov 2009 17:32:46 +0100 Subject: sched: Sanitize fork() handling Currently we try to do task placement in wake_up_new_task() after we do the load-balance pass in sched_fork(). This yields complicated semantics in that we have to deal with tasks on different RQs and the set_task_cpu() calls in copy_process() and sched_fork() Rename ->task_new() to ->task_fork() and call it from sched_fork() before the balancing, this gives the policy a clear point to place the task. 
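Condensed, the resulting ordering in sched_fork() looks roughly like this (a simplified sketch of the hunk below, config and error details omitted):

        if (p->sched_class->task_fork)
                p->sched_class->task_fork(p);

#ifdef CONFIG_SMP
        cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
#endif
        set_task_cpu(p, cpu);

The class thus sets up the child's state before the fork balancing places it, and wake_up_new_task() can then simply activate_task() the child without a special ->task_new() path.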
Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 47 ++++++++++++++++++----------------------------- kernel/sched_fair.c | 28 +++++++++++++++------------- 2 files changed, 33 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c92670f8e09..33c90357313 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1811,6 +1811,20 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) static void calc_load_account_active(struct rq *this_rq); +static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) +{ + set_task_rq(p, cpu); +#ifdef CONFIG_SMP + /* + * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be + * successfuly executed on another CPU. We must ensure that updates of + * per-task data have been completed by this moment. + */ + smp_wmb(); + task_thread_info(p)->cpu = cpu; +#endif +} + #include "sched_stats.h" #include "sched_idletask.c" #include "sched_fair.c" @@ -1967,20 +1981,6 @@ inline int task_curr(const struct task_struct *p) return cpu_curr(task_cpu(p)) == p; } -static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) -{ - set_task_rq(p, cpu); -#ifdef CONFIG_SMP - /* - * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be - * successfuly executed on another CPU. We must ensure that updates of - * per-task data have been completed by this moment. - */ - smp_wmb(); - task_thread_info(p)->cpu = cpu; -#endif -} - static inline void check_class_changed(struct rq *rq, struct task_struct *p, const struct sched_class *prev_class, int oldprio, int running) @@ -2552,7 +2552,6 @@ static void __sched_fork(struct task_struct *p) void sched_fork(struct task_struct *p, int clone_flags) { int cpu = get_cpu(); - unsigned long flags; __sched_fork(p); @@ -2586,13 +2585,13 @@ void sched_fork(struct task_struct *p, int clone_flags) if (!rt_prio(p->prio)) p->sched_class = &fair_sched_class; + if (p->sched_class->task_fork) + p->sched_class->task_fork(p); + #ifdef CONFIG_SMP cpu = select_task_rq(p, SD_BALANCE_FORK, 0); #endif - local_irq_save(flags); - update_rq_clock(cpu_rq(cpu)); set_task_cpu(p, cpu); - local_irq_restore(flags); #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) @@ -2625,17 +2624,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) rq = task_rq_lock(p, &flags); BUG_ON(p->state != TASK_RUNNING); update_rq_clock(rq); - - if (!p->sched_class->task_new || !current->se.on_rq) { - activate_task(rq, p, 0); - } else { - /* - * Let the scheduling class do new task startup - * management (if any): - */ - p->sched_class->task_new(rq, p); - inc_nr_running(rq); - } + activate_task(rq, p, 0); trace_sched_wakeup_new(rq, p, 1); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 613c1c74967..44ec80ccfa8 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1922,28 +1922,30 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) } /* - * Share the fairness runtime between parent and child, thus the - * total amount of pressure for CPU stays equal - new tasks - * get a chance to run but frequent forkers are not allowed to - * monopolize the CPU. Note: the parent runqueue is locked, - * the child is not running yet. 
+ * called on fork with the child task as argument from the parent's context + * - child not yet on the tasklist + * - preemption disabled */ -static void task_new_fair(struct rq *rq, struct task_struct *p) +static void task_fork_fair(struct task_struct *p) { - struct cfs_rq *cfs_rq = task_cfs_rq(p); + struct cfs_rq *cfs_rq = task_cfs_rq(current); struct sched_entity *se = &p->se, *curr = cfs_rq->curr; int this_cpu = smp_processor_id(); + struct rq *rq = this_rq(); + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); - sched_info_queued(p); + if (unlikely(task_cpu(p) != this_cpu)) + __set_task_cpu(p, this_cpu); update_curr(cfs_rq); + if (curr) se->vruntime = curr->vruntime; place_entity(cfs_rq, se, 1); - /* 'curr' will be NULL if the child belongs to a different group */ - if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) && - curr && entity_before(curr, se)) { + if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { /* * Upon rescheduling, sched_class::put_prev_task() will place * 'current' within the tree based on its new key value. @@ -1952,7 +1954,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) resched_task(rq->curr); } - enqueue_task_fair(rq, p, 0); + spin_unlock_irqrestore(&rq->lock, flags); } /* @@ -2052,7 +2054,7 @@ static const struct sched_class fair_sched_class = { .set_curr_task = set_curr_task_fair, .task_tick = task_tick_fair, - .task_new = task_new_fair, + .task_fork = task_fork_fair, .prio_changed = prio_changed_fair, .switched_to = switched_to_fair, -- cgit v1.2.3 From a65ac745e47e91f9d98dbf07f22ed0492e34d998 Mon Sep 17 00:00:00 2001 From: Jupyung Lee Date: Tue, 17 Nov 2009 18:51:40 +0900 Subject: sched: Move update_curr() in check_preempt_wakeup() to avoid redundant call If a RT task is woken up while a non-RT task is running, check_preempt_wakeup() is called to check whether the new task can preempt the old task. The function returns quickly without going deeper because it is apparent that a RT task can always preempt a non-RT task. In this situation, check_preempt_wakeup() always calls update_curr() to update vruntime value of the currently running task. However, the function call is unnecessary and redundant at that moment because (1) a non-RT task can always be preempted by a RT task regardless of its vruntime value, and (2) update_curr() will be called shortly when the context switch between two occurs. By moving update_curr() in check_preempt_wakeup(), we can avoid redundant call to update_curr(), slightly reducing the time taken to wake up RT tasks. 
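Sketched, the reordering looks like this (simplified from the hunk below; the intermediate early-return checks are left out):

        if (unlikely(rt_prio(p->prio))) {
                resched_task(curr);
                return;                 /* no vruntime update needed here */
        }

        /* other early returns elided */

        update_curr(cfs_rq);            /* only the comparison below needs it */
        if (wakeup_preempt_entity(se, pse) == 1)
                resched_task(curr);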
Signed-off-by: Jupyung Lee [ Place update_curr() right before the wake_preempt_entity() call, which is the only thing that relies on the updated vruntime ] Signed-off-by: Peter Zijlstra LKML-Reference: <1258451500-6714-1-git-send-email-jupyung@gmail.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 44ec80ccfa8..4dec18579c9 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1651,8 +1651,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ int sync = wake_flags & WF_SYNC; int scale = cfs_rq->nr_running >= sched_nr_latency; - update_curr(cfs_rq); - if (unlikely(rt_prio(p->prio))) { resched_task(curr); return; @@ -1710,6 +1708,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ BUG_ON(!pse); + update_curr(cfs_rq); + if (wakeup_preempt_entity(se, pse) == 1) { resched_task(curr); /* -- cgit v1.2.3 From 3a7e73a2e26fffdbc46ba95fc0425418984f5140 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 28 Nov 2009 18:51:02 +0100 Subject: sched: Clean up check_preempt_wakeup() Streamline the wakeup preemption code a bit, unifying the preempt path so that they all do the same. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 73 ++++++++++++++++++++++++----------------------------- 1 file changed, 33 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 4dec18579c9..76b5792c419 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1651,10 +1651,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ int sync = wake_flags & WF_SYNC; int scale = cfs_rq->nr_running >= sched_nr_latency; - if (unlikely(rt_prio(p->prio))) { - resched_task(curr); - return; - } + if (unlikely(rt_prio(p->prio))) + goto preempt; if (unlikely(p->sched_class != &fair_sched_class)) return; @@ -1680,52 +1678,47 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; /* Idle tasks are by definition preempted by everybody. */ - if (unlikely(curr->policy == SCHED_IDLE)) { - resched_task(curr); - return; - } + if (unlikely(curr->policy == SCHED_IDLE)) + goto preempt; - if ((sched_feat(WAKEUP_SYNC) && sync) || - (sched_feat(WAKEUP_OVERLAP) && - (se->avg_overlap < sysctl_sched_migration_cost && - pse->avg_overlap < sysctl_sched_migration_cost))) { - resched_task(curr); - return; - } + if (sched_feat(WAKEUP_SYNC) && sync) + goto preempt; - if (sched_feat(WAKEUP_RUNNING)) { - if (pse->avg_running < se->avg_running) { - set_next_buddy(pse); - resched_task(curr); - return; - } - } + if (sched_feat(WAKEUP_OVERLAP) && + se->avg_overlap < sysctl_sched_migration_cost && + pse->avg_overlap < sysctl_sched_migration_cost) + goto preempt; + + if (sched_feat(WAKEUP_RUNNING) && pse->avg_running < se->avg_running) + goto preempt; if (!sched_feat(WAKEUP_PREEMPT)) return; + update_curr(cfs_rq); find_matching_se(&se, &pse); - BUG_ON(!pse); + if (wakeup_preempt_entity(se, pse) == 1) + goto preempt; - update_curr(cfs_rq); + return; - if (wakeup_preempt_entity(se, pse) == 1) { - resched_task(curr); - /* - * Only set the backward buddy when the current task is still - * on the rq. This can happen when a wakeup gets interleaved - * with schedule on the ->pre_schedule() or idle_balance() - * point, either of which can * drop the rq lock. 
- * - * Also, during early boot the idle thread is in the fair class, - * for obvious reasons its a bad idea to schedule back to it. - */ - if (unlikely(!se->on_rq || curr == rq->idle)) - return; - if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) - set_last_buddy(se); - } +preempt: + resched_task(curr); + /* + * Only set the backward buddy when the current task is still + * on the rq. This can happen when a wakeup gets interleaved + * with schedule on the ->pre_schedule() or idle_balance() + * point, either of which can * drop the rq lock. + * + * Also, during early boot the idle thread is in the fair class, + * for obvious reasons its a bad idea to schedule back to it. + */ + if (unlikely(!se->on_rq || curr == rq->idle)) + return; + + if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) + set_last_buddy(se); } static struct task_struct *pick_next_task_fair(struct rq *rq) -- cgit v1.2.3 From 6cecd084d0fd27bb1e498e2829fd45846d806856 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Nov 2009 13:00:37 +0100 Subject: sched: Discard some old bits WAKEUP_RUNNING was an experiment, not sure why that ever ended up being merged... Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 17 +++++++---------- kernel/sched_debug.c | 1 - kernel/sched_fair.c | 3 --- kernel/sched_features.h | 5 ----- 4 files changed, 7 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 33c90357313..0170735bdaf 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2493,7 +2493,6 @@ static void __sched_fork(struct task_struct *p) p->se.avg_overlap = 0; p->se.start_runtime = 0; p->se.avg_wakeup = sysctl_sched_wakeup_granularity; - p->se.avg_running = 0; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; @@ -5379,13 +5378,14 @@ static inline void schedule_debug(struct task_struct *prev) #endif } -static void put_prev_task(struct rq *rq, struct task_struct *p) +static void put_prev_task(struct rq *rq, struct task_struct *prev) { - u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime; + if (prev->state == TASK_RUNNING) { + u64 runtime = prev->se.sum_exec_runtime; - update_avg(&p->se.avg_running, runtime); + runtime -= prev->se.prev_sum_exec_runtime; + runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); - if (p->state == TASK_RUNNING) { /* * In order to avoid avg_overlap growing stale when we are * indeed overlapping and hence not getting put to sleep, grow @@ -5395,12 +5395,9 @@ static void put_prev_task(struct rq *rq, struct task_struct *p) * correlates to the amount of cache footprint a task can * build up. 
*/ - runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); - update_avg(&p->se.avg_overlap, runtime); - } else { - update_avg(&p->se.avg_running, 0); + update_avg(&prev->se.avg_overlap, runtime); } - p->sched_class->put_prev_task(rq, p); + prev->sched_class->put_prev_task(rq, prev); } /* diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 6988cf08f70..5fda66615fe 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -399,7 +399,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) PN(se.sum_exec_runtime); PN(se.avg_overlap); PN(se.avg_wakeup); - PN(se.avg_running); nr_switches = p->nvcsw + p->nivcsw; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 76b5792c419..e9f5daee12c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1689,9 +1689,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ pse->avg_overlap < sysctl_sched_migration_cost) goto preempt; - if (sched_feat(WAKEUP_RUNNING) && pse->avg_running < se->avg_running) - goto preempt; - if (!sched_feat(WAKEUP_PREEMPT)) return; diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 0d94083582c..d5059fd761d 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -53,11 +53,6 @@ SCHED_FEAT(WAKEUP_SYNC, 0) */ SCHED_FEAT(WAKEUP_OVERLAP, 0) -/* - * Wakeup preemption towards tasks that run short - */ -SCHED_FEAT(WAKEUP_RUNNING, 0) - /* * Use the SYNC wakeup hint, pipes and the likes use this to indicate * the remote end is likely to consume the data we just wrote, and -- cgit v1.2.3 From fb58bac5c75bfff8bbf7d02071a10a62f32fe28b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 1 Dec 2009 12:21:47 +0100 Subject: sched: Remove unnecessary RCU exclusion As Nick pointed out, and realized by myself when doing: sched: Fix balance vs hotplug race the patch: sched: for_each_domain() vs RCU is wrong, sched_domains are freed after synchronize_sched(), which means disabling preemption is enough. 
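Put as code, the argument is that disabled preemption is already a sufficient read-side section for the domain walk, since a domain can only be freed after synchronize_sched(); an illustrative sketch of the principle (not the exact call site, which already runs non-preemptible):

        preempt_disable();
        for_each_domain(cpu, sd)        /* freed only after synchronize_sched() */
                update_shares(sd);
        preempt_enable();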
Reported-by: Nick Piggin Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index e9f5daee12c..c163a285bf0 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1403,7 +1403,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag new_cpu = prev_cpu; } - rcu_read_lock(); for_each_domain(cpu, tmp) { /* * If power savings logic is enabled for a domain, see if we @@ -1484,10 +1483,8 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag update_shares(tmp); } - if (affine_sd && wake_affine(affine_sd, p, sync)) { - new_cpu = cpu; - goto out; - } + if (affine_sd && wake_affine(affine_sd, p, sync)) + return cpu; while (sd) { int load_idx = sd->forkexec_idx; @@ -1528,8 +1525,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag /* while loop will break here if sd == NULL */ } -out: - rcu_read_unlock(); return new_cpu; } #endif /* CONFIG_SMP */ -- cgit v1.2.3 From cd8ad40de36c2fe75f3b731bd70198b385895246 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 3 Dec 2009 18:00:07 +0100 Subject: sched: cgroup: Implement different treatment for idle shares When setting the weight for a per-cpu task-group, we have to put in a phantom weight when there is no work on that cpu, otherwise we'll not service that cpu when new work gets placed there until we again update the per-cpu weights. We used to add these phantom weights to the total, so that the idle per-cpu shares don't get inflated, this however causes the non-idle parts to get deflated, causing unexpected weight distibutions. Reverse this, so that the non-idle shares are correct but the idle shares are inflated. Reported-by: Yasunori Goto Tested-by: Yasunori Goto Signed-off-by: Peter Zijlstra LKML-Reference: <1257934048.23203.76.camel@twins> Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 0170735bdaf..71eb0622f54 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1614,7 +1614,7 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu, */ static int tg_shares_up(struct task_group *tg, void *data) { - unsigned long weight, rq_weight = 0, shares = 0; + unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0; unsigned long *usd_rq_weight; struct sched_domain *sd = data; unsigned long flags; @@ -1630,6 +1630,7 @@ static int tg_shares_up(struct task_group *tg, void *data) weight = tg->cfs_rq[i]->load.weight; usd_rq_weight[i] = weight; + rq_weight += weight; /* * If there are currently no tasks on the cpu pretend there * is one of average load so that when a new task gets to @@ -1638,10 +1639,13 @@ static int tg_shares_up(struct task_group *tg, void *data) if (!weight) weight = NICE_0_LOAD; - rq_weight += weight; + sum_weight += weight; shares += tg->cfs_rq[i]->shares; } + if (!rq_weight) + rq_weight = sum_weight; + if ((!shares && rq_weight) || shares > tg->shares) shares = tg->shares; -- cgit v1.2.3 From 57785df5ac53c70da9fb53696130f3c551bfe1f9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 4 Dec 2009 09:59:02 +0100 Subject: sched: Fix task priority bug 83f9ac removed a call to effective_prio() in wake_up_new_task(), which leads to tasks running at MAX_PRIO. 
This is caused by the idle thread being set to MAX_PRIO before forking off init. O(1) used that to make sure idle was always preempted, CFS uses check_preempt_curr_idle() for that so we can savely remove this bit of legacy code. Reported-by: Mike Galbraith Tested-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1259754383.4003.610.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 71eb0622f54..3878f501800 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3158,10 +3158,6 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, deactivate_task(src_rq, p, 0); set_task_cpu(p, this_cpu); activate_task(this_rq, p, 0); - /* - * Note that idle threads have a prio of MAX_PRIO, for this test - * to be always true for them. - */ check_preempt_curr(this_rq, p, 0); } @@ -6992,7 +6988,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) __sched_fork(idle); idle->se.exec_start = sched_clock(); - idle->prio = idle->normal_prio = MAX_PRIO; cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); __set_task_cpu(idle, cpu); @@ -7696,7 +7691,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) spin_lock_irq(&rq->lock); update_rq_clock(rq); deactivate_task(rq, rq->idle, 0); - rq->idle->static_prio = MAX_PRIO; __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); rq->idle->sched_class = &idle_sched_class; migrate_dead_tasks(cpu); -- cgit v1.2.3 From 0bcdcf28c979869f44e05121b96ff2cfb05bd8e6 Mon Sep 17 00:00:00 2001 From: Christian Ehrhardt Date: Mon, 30 Nov 2009 12:16:46 +0100 Subject: sched: Fix missing sched tunable recalculation on cpu add/remove Based on Peter Zijlstras patch suggestion this enables recalculation of the scheduler tunables in response of a change in the number of cpus. It also adds a max of eight cpus that are considered in that scaling. 
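Condensed, the recalculation scales per-cpu-normalized baselines by a factor derived from the capped number of online CPUs; a simplified sketch of update_sysctl() from the hunk below, with SET_SYSCTL() expanded for two of the tunables:

        unsigned int cpus = min(num_online_cpus(), 8U);
        unsigned int factor = 1 + ilog2(cpus);

        sysctl_sched_min_granularity = factor * normalized_sysctl_sched_min_granularity;
        sysctl_sched_latency = factor * normalized_sysctl_sched_latency;

rq_online_fair() and rq_offline_fair() simply call update_sysctl(), so the values now track CPU hotplug.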
Signed-off-by: Christian Ehrhardt Signed-off-by: Peter Zijlstra LKML-Reference: <1259579808-11357-2-git-send-email-ehrhardt@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 29 ++++++++++++++++------------- kernel/sched_fair.c | 16 ++++++++++++++++ 2 files changed, 32 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 3878f501800..b54ecf84b6b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -814,6 +814,7 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32; * default: 0.25ms */ unsigned int sysctl_sched_shares_ratelimit = 250000; +unsigned int normalized_sysctl_sched_shares_ratelimit = 250000; /* * Inject some fuzzyness into changing the per-cpu group shares @@ -1814,6 +1815,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) #endif static void calc_load_account_active(struct rq *this_rq); +static void update_sysctl(void); static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) { @@ -7028,22 +7030,23 @@ cpumask_var_t nohz_cpu_mask; * * This idea comes from the SD scheduler of Con Kolivas: */ -static inline void sched_init_granularity(void) +static void update_sysctl(void) { - unsigned int factor = 1 + ilog2(num_online_cpus()); - const unsigned long limit = 200000000; - - sysctl_sched_min_granularity *= factor; - if (sysctl_sched_min_granularity > limit) - sysctl_sched_min_granularity = limit; - - sysctl_sched_latency *= factor; - if (sysctl_sched_latency > limit) - sysctl_sched_latency = limit; + unsigned int cpus = min(num_online_cpus(), 8U); + unsigned int factor = 1 + ilog2(cpus); - sysctl_sched_wakeup_granularity *= factor; +#define SET_SYSCTL(name) \ + (sysctl_##name = (factor) * normalized_sysctl_##name) + SET_SYSCTL(sched_min_granularity); + SET_SYSCTL(sched_latency); + SET_SYSCTL(sched_wakeup_granularity); + SET_SYSCTL(sched_shares_ratelimit); +#undef SET_SYSCTL +} - sysctl_sched_shares_ratelimit *= factor; +static inline void sched_init_granularity(void) +{ + update_sysctl(); } #ifdef CONFIG_SMP diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c163a285bf0..71b3458245e 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -35,12 +35,14 @@ * run vmstat and monitor the context-switches (cs) field) */ unsigned int sysctl_sched_latency = 5000000ULL; +unsigned int normalized_sysctl_sched_latency = 5000000ULL; /* * Minimal preemption granularity for CPU-bound tasks: * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) */ unsigned int sysctl_sched_min_granularity = 1000000ULL; +unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL; /* * is kept at sysctl_sched_latency / sysctl_sched_min_granularity @@ -70,6 +72,7 @@ unsigned int __read_mostly sysctl_sched_compat_yield; * have immediate wakeup/sleep latencies. 
*/ unsigned int sysctl_sched_wakeup_granularity = 1000000UL; +unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; @@ -1890,6 +1893,17 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, return 0; } + +static void rq_online_fair(struct rq *rq) +{ + update_sysctl(); +} + +static void rq_offline_fair(struct rq *rq) +{ + update_sysctl(); +} + #endif /* CONFIG_SMP */ /* @@ -2035,6 +2049,8 @@ static const struct sched_class fair_sched_class = { .load_balance = load_balance_fair, .move_one_task = move_one_task_fair, + .rq_online = rq_online_fair, + .rq_offline = rq_offline_fair, #endif .set_curr_task = set_curr_task_fair, -- cgit v1.2.3 From 1983a922a1bc843806b9a36cf3a370b242783140 Mon Sep 17 00:00:00 2001 From: Christian Ehrhardt Date: Mon, 30 Nov 2009 12:16:47 +0100 Subject: sched: Make tunable scaling style configurable As scaling now takes place on all kind of cpu add/remove events a user that configures values via proc should be able to configure if his set values are still rescaled or kept whatever happens. As the comments state that log2 was just a second guess that worked the interface is not just designed for on/off, but to choose a scaling type. Currently this allows none, log and linear, but more important it allwos us to keep the interface even if someone has an even better idea how to scale the values. Signed-off-by: Christian Ehrhardt Signed-off-by: Peter Zijlstra LKML-Reference: <1259579808-11357-3-git-send-email-ehrhardt@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 15 ++++++++++++++- kernel/sched_debug.c | 10 ++++++++++ kernel/sched_fair.c | 13 +++++++++++++ kernel/sysctl.c | 14 ++++++++++++++ 4 files changed, 51 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index b54ecf84b6b..116efed962c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7033,7 +7033,20 @@ cpumask_var_t nohz_cpu_mask; static void update_sysctl(void) { unsigned int cpus = min(num_online_cpus(), 8U); - unsigned int factor = 1 + ilog2(cpus); + unsigned int factor; + + switch (sysctl_sched_tunable_scaling) { + case SCHED_TUNABLESCALING_NONE: + factor = 1; + break; + case SCHED_TUNABLESCALING_LINEAR: + factor = cpus; + break; + case SCHED_TUNABLESCALING_LOG: + default: + factor = 1 + ilog2(cpus); + break; + } #define SET_SYSCTL(name) \ (sysctl_##name = (factor) * normalized_sysctl_##name) diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 5fda66615fe..0fc5287fe80 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -309,6 +309,12 @@ static void print_cpu(struct seq_file *m, int cpu) print_rq(m, rq, cpu); } +static const char *sched_tunable_scaling_names[] = { + "none", + "logaritmic", + "linear" +}; + static int sched_debug_show(struct seq_file *m, void *v) { u64 now = ktime_to_ns(ktime_get()); @@ -334,6 +340,10 @@ static int sched_debug_show(struct seq_file *m, void *v) #undef PN #undef P + SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling", + sysctl_sched_tunable_scaling, + sched_tunable_scaling_names[sysctl_sched_tunable_scaling]); + for_each_online_cpu(cpu) print_cpu(m, cpu); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 71b3458245e..455106d318a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -21,6 +21,7 @@ */ #include +#include /* * Targeted preemption latency for CPU-bound tasks: @@ -37,6 +38,18 @@ unsigned int sysctl_sched_latency = 5000000ULL; unsigned int 
normalized_sysctl_sched_latency = 5000000ULL; +/* + * The initial- and re-scaling of tunables is configurable + * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) + * + * Options are: + * SCHED_TUNABLESCALING_NONE - unscaled, always *1 + * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus) + * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus + */ +enum sched_tunable_scaling sysctl_sched_tunable_scaling + = SCHED_TUNABLESCALING_LOG; + /* * Minimal preemption granularity for CPU-bound tasks: * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e5cc53514ca..d10406e5fdf 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -251,6 +251,8 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */ static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ static int min_wakeup_granularity_ns; /* 0 usecs */ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ +static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; +static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; #endif static struct ctl_table kern_table[] = { @@ -304,6 +306,18 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_tunable_scaling", + .data = &sysctl_sched_tunable_scaling, + .maxlen = sizeof(enum sched_tunable_scaling), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_sched_tunable_scaling, + .extra2 = &max_sched_tunable_scaling, + }, + { .ctl_name = CTL_UNNUMBERED, .procname = "sched_shares_thresh", -- cgit v1.2.3 From acb4a848da821a095ae9e4d8b22ae2d9633ba5cd Mon Sep 17 00:00:00 2001 From: Christian Ehrhardt Date: Mon, 30 Nov 2009 12:16:48 +0100 Subject: sched: Update normalized values on user updates via proc The normalized values are also recalculated in case the scaling factor changes. This patch updates the internally used scheduler tuning values that are normalized to one cpu in case a user sets new values via sysfs. Together with patch 2 of this series this allows to let user configured values scale (or not) to cpu add/remove events taking place later. 
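This is the inverse of the hotplug scaling: when a tunable is written via proc, its per-cpu-normalized copy is refreshed by dividing out the current factor, so later rescaling starts from the user's value. Simplified sketch of the handler logic in the hunk below, with WRT_SYSCTL() expanded for two tunables:

        int factor = get_update_sysctl_factor();

        normalized_sysctl_sched_min_granularity = sysctl_sched_min_granularity / factor;
        normalized_sysctl_sched_latency = sysctl_sched_latency / factor;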
Signed-off-by: Christian Ehrhardt Signed-off-by: Peter Zijlstra LKML-Reference: <1259579808-11357-4-git-send-email-ehrhardt@linux.vnet.ibm.com> [ v2: fix warning ] Signed-off-by: Ingo Molnar --- kernel/sched.c | 12 ++++++++++-- kernel/sched_fair.c | 11 ++++++++++- kernel/sysctl.c | 14 +++++++++----- 3 files changed, 29 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 116efed962c..0a60e8e9b09 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1816,6 +1816,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) static void calc_load_account_active(struct rq *this_rq); static void update_sysctl(void); +static int get_update_sysctl_factor(void); static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) { @@ -7030,9 +7031,9 @@ cpumask_var_t nohz_cpu_mask; * * This idea comes from the SD scheduler of Con Kolivas: */ -static void update_sysctl(void) +static int get_update_sysctl_factor(void) { - unsigned int cpus = min(num_online_cpus(), 8U); + unsigned int cpus = min(num_online_cpus(), 8); unsigned int factor; switch (sysctl_sched_tunable_scaling) { @@ -7048,6 +7049,13 @@ static void update_sysctl(void) break; } + return factor; +} + +static void update_sysctl(void) +{ + unsigned int factor = get_update_sysctl_factor(); + #define SET_SYSCTL(name) \ (sysctl_##name = (factor) * normalized_sysctl_##name) SET_SYSCTL(sched_min_granularity); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 455106d318a..804a411838f 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -399,11 +399,12 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) */ #ifdef CONFIG_SCHED_DEBUG -int sched_nr_latency_handler(struct ctl_table *table, int write, +int sched_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + int factor = get_update_sysctl_factor(); if (ret || !write) return ret; @@ -411,6 +412,14 @@ int sched_nr_latency_handler(struct ctl_table *table, int write, sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency, sysctl_sched_min_granularity); +#define WRT_SYSCTL(name) \ + (normalized_sysctl_##name = sysctl_##name / (factor)) + WRT_SYSCTL(sched_min_granularity); + WRT_SYSCTL(sched_latency); + WRT_SYSCTL(sched_wakeup_granularity); + WRT_SYSCTL(sched_shares_ratelimit); +#undef WRT_SYSCTL + return 0; } #endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d10406e5fdf..b9e5a45f1e2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -253,6 +253,8 @@ static int min_wakeup_granularity_ns; /* 0 usecs */ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; +static int min_sched_shares_ratelimit = 100000; /* 100 usec */ +static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */ #endif static struct ctl_table kern_table[] = { @@ -271,7 +273,7 @@ static struct ctl_table kern_table[] = { .data = &sysctl_sched_min_granularity, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &sched_nr_latency_handler, + .proc_handler = &sched_proc_update_handler, .strategy = &sysctl_intvec, .extra1 = &min_sched_granularity_ns, .extra2 = &max_sched_granularity_ns, @@ -282,7 +284,7 @@ static struct ctl_table kern_table[] = { .data = &sysctl_sched_latency, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler 
= &sched_nr_latency_handler, + .proc_handler = &sched_proc_update_handler, .strategy = &sysctl_intvec, .extra1 = &min_sched_granularity_ns, .extra2 = &max_sched_granularity_ns, @@ -293,7 +295,7 @@ static struct ctl_table kern_table[] = { .data = &sysctl_sched_wakeup_granularity, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, + .proc_handler = &sched_proc_update_handler, .strategy = &sysctl_intvec, .extra1 = &min_wakeup_granularity_ns, .extra2 = &max_wakeup_granularity_ns, @@ -304,7 +306,9 @@ static struct ctl_table kern_table[] = { .data = &sysctl_sched_shares_ratelimit, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &sched_proc_update_handler, + .extra1 = &min_sched_shares_ratelimit, + .extra2 = &max_sched_shares_ratelimit, }, { .ctl_name = CTL_UNNUMBERED, @@ -312,7 +316,7 @@ static struct ctl_table kern_table[] = { .data = &sysctl_sched_tunable_scaling, .maxlen = sizeof(enum sched_tunable_scaling), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, + .proc_handler = &sched_proc_update_handler, .strategy = &sysctl_intvec, .extra1 = &min_sched_tunable_scaling, .extra2 = &max_sched_tunable_scaling, -- cgit v1.2.3 From 822a6961112f0c9101d3359d8524604c3309ee6c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 8 Dec 2009 10:00:04 +0100 Subject: tracing/kprobes: Fix field creation's bad error handling When we define the common event fields in kprobe, we invert the error handling and return immediately in case of success. Then we omit to define specific kprobes fields (ip and nargs), and specific kretprobes fields (func, ret_ip, nargs). And we only define them when we fail to create common fields. The most visible consequence is that we can't create filter for k(ret)probes specific fields. This patch re-invert the success/error handling to fix it. Reported-by: Lai Jiangshan Signed-off-by: Frederic Weisbecker Acked-by: Masami Hiramatsu Cc: Steven Rostedt Cc: Li Zefan LKML-Reference: <1260263815-5167-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_kprobe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index bf05fb49a6f..b52d397e57e 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1133,7 +1133,7 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call) struct trace_probe *tp = (struct trace_probe *)event_call->data; ret = trace_define_common_fields(event_call); - if (!ret) + if (ret) return ret; DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); @@ -1151,7 +1151,7 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) struct trace_probe *tp = (struct trace_probe *)event_call->data; ret = trace_define_common_fields(event_call); - if (!ret) + if (ret) return ret; DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); -- cgit v1.2.3 From 21140f4d3387aa2213f1deea0128df1dbf924379 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 10 Dec 2009 14:00:51 +0800 Subject: perf_event: Fix perf_swevent_hrtimer() variable initialization fix: [] ? printk+0x1d/0x24 [] ? perf_prepare_sample+0x269/0x280 [] warn_slowpath_common+0x71/0xd0 [] ? perf_prepare_sample+0x269/0x280 [] warn_slowpath_null+0x1a/0x20 [] perf_prepare_sample+0x269/0x280 [] ? 
cpu_clock+0x53/0x90 [] __perf_event_overflow+0x2a8/0x300 [] perf_event_overflow+0x1b/0x30 [] perf_swevent_hrtimer+0x7f/0x120 This is because 'data.raw' variable not initialize. Signed-off-by: Xiao Guangrong Acked-by: Peter Zijlstra Cc: Frederic Weisbecker Cc: Paul Mackerras LKML-Reference: <4B208E93.1010801@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 94e1b28333a..3a5d6c4786b 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4010,6 +4010,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) event->pmu->read(event); data.addr = 0; + data.raw = NULL; data.period = event->hw.last_period; regs = get_irq_regs(); /* -- cgit v1.2.3 From ea5b41f9d595be354f7a50e56b28c2d72e6e88a5 Mon Sep 17 00:00:00 2001 From: "Luck, Tony" Date: Wed, 9 Dec 2009 14:29:36 -0800 Subject: lockdep: Avoid out of bounds array reference in save_trace() ia64 found this the hard way (because we currently have a stub for save_stack_trace() that does nothing). But it would be a good idea to be cautious in case a real save_stack_trace() bailed out with an error before it set trace->nr_entries. Signed-off-by: Tony Luck Acked-by: Peter Zijlstra Cc: luming.yu@intel.com LKML-Reference: <4b2024d085302c2a2@agluck-desktop.sc.intel.com> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 7a3ae56b3a7..4f8df01dbe5 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -386,7 +386,8 @@ static int save_trace(struct stack_trace *trace) * complete trace that maxes out the entries provided will be reported * as incomplete, friggin useless */ - if (trace->entries[trace->nr_entries-1] == ULONG_MAX) + if (trace->nr_entries != 0 && + trace->entries[trace->nr_entries-1] == ULONG_MAX) trace->nr_entries--; trace->max_entries = trace->nr_entries; -- cgit v1.2.3 From 4ca3ef71f54655af98b66e8ff308a47a2a580a53 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 10 Dec 2009 09:25:53 +0100 Subject: sched: Fix build warning in get_update_sysctl_factor() Signed-off-by: Mike Galbraith Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar LKML-Reference: --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 0a60e8e9b09..3de3deab809 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7033,7 +7033,7 @@ cpumask_var_t nohz_cpu_mask; */ static int get_update_sysctl_factor(void) { - unsigned int cpus = min(num_online_cpus(), 8); + unsigned int cpus = min_t(int, num_online_cpus(), 8); unsigned int factor; switch (sysctl_sched_tunable_scaling) { -- cgit v1.2.3 From 41d2e494937715d3150e5c75d01f0e75ae899337 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 13 Nov 2009 17:05:44 +0100 Subject: hrtimer: Tune hrtimer_interrupt hang logic The hrtimer_interrupt hang logic adjusts min_delta_ns based on the execution time of the hrtimer callbacks. This is error-prone for virtual machines, where a guest vcpu can be scheduled out during the execution of the callbacks (and the callbacks themselves can do operations that translate to blocking operations in the hypervisor), which in can lead to large min_delta_ns rendering the system unusable. Replace the current heuristics with something more reliable. Allow the interrupt code to try 3 times to catch up with the lost time. 
If that fails use the total time spent in the interrupt handler to defer the next timer interrupt so the system can catch up with other things which got delayed. Limit that deferment to 100ms. The retry events and the maximum time spent in the interrupt handler are recorded and exposed via /proc/timer_list Inspired by a patch from Marcelo. Reported-by: Michael Tokarev Signed-off-by: Thomas Gleixner Tested-by: Marcelo Tosatti Cc: kvm@vger.kernel.org --- kernel/hrtimer.c | 97 ++++++++++++++++++++++++++++-------------------- kernel/time/timer_list.c | 5 ++- 2 files changed, 61 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index ede52770812..931a4d99bc5 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -557,7 +557,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) static int hrtimer_reprogram(struct hrtimer *timer, struct hrtimer_clock_base *base) { - ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next; + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); int res; @@ -582,7 +582,16 @@ static int hrtimer_reprogram(struct hrtimer *timer, if (expires.tv64 < 0) return -ETIME; - if (expires.tv64 >= expires_next->tv64) + if (expires.tv64 >= cpu_base->expires_next.tv64) + return 0; + + /* + * If a hang was detected in the last timer interrupt then we + * do not schedule a timer which is earlier than the expiry + * which we enforced in the hang detection. We want the system + * to make progress. + */ + if (cpu_base->hang_detected) return 0; /* @@ -590,7 +599,7 @@ static int hrtimer_reprogram(struct hrtimer *timer, */ res = tick_program_event(expires, 0); if (!IS_ERR_VALUE(res)) - *expires_next = expires; + cpu_base->expires_next = expires; return res; } @@ -1217,30 +1226,6 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) #ifdef CONFIG_HIGH_RES_TIMERS -static int force_clock_reprogram; - -/* - * After 5 iteration's attempts, we consider that hrtimer_interrupt() - * is hanging, which could happen with something that slows the interrupt - * such as the tracing. Then we force the clock reprogramming for each future - * hrtimer interrupts to avoid infinite loops and use the min_delta_ns - * threshold that we will overwrite. - * The next tick event will be scheduled to 3 times we currently spend on - * hrtimer_interrupt(). This gives a good compromise, the cpus will spend - * 1/4 of their time to process the hrtimer interrupts. This is enough to - * let it running without serious starvation. 
- */ - -static inline void -hrtimer_interrupt_hanging(struct clock_event_device *dev, - ktime_t try_time) -{ - force_clock_reprogram = 1; - dev->min_delta_ns = (unsigned long)try_time.tv64 * 3; - printk(KERN_WARNING "hrtimer: interrupt too slow, " - "forcing clock min delta to %llu ns\n", - (unsigned long long) dev->min_delta_ns); -} /* * High resolution timer interrupt * Called with interrupts disabled @@ -1249,21 +1234,15 @@ void hrtimer_interrupt(struct clock_event_device *dev) { struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); struct hrtimer_clock_base *base; - ktime_t expires_next, now; - int nr_retries = 0; - int i; + ktime_t expires_next, now, entry_time, delta; + int i, retries = 0; BUG_ON(!cpu_base->hres_active); cpu_base->nr_events++; dev->next_event.tv64 = KTIME_MAX; - retry: - /* 5 retries is enough to notice a hang */ - if (!(++nr_retries % 5)) - hrtimer_interrupt_hanging(dev, ktime_sub(ktime_get(), now)); - - now = ktime_get(); - + entry_time = now = ktime_get(); +retry: expires_next.tv64 = KTIME_MAX; spin_lock(&cpu_base->lock); @@ -1325,10 +1304,48 @@ void hrtimer_interrupt(struct clock_event_device *dev) spin_unlock(&cpu_base->lock); /* Reprogramming necessary ? */ - if (expires_next.tv64 != KTIME_MAX) { - if (tick_program_event(expires_next, force_clock_reprogram)) - goto retry; + if (expires_next.tv64 == KTIME_MAX || + !tick_program_event(expires_next, 0)) { + cpu_base->hang_detected = 0; + return; } + + /* + * The next timer was already expired due to: + * - tracing + * - long lasting callbacks + * - being scheduled away when running in a VM + * + * We need to prevent that we loop forever in the hrtimer + * interrupt routine. We give it 3 attempts to avoid + * overreacting on some spurious event. + */ + now = ktime_get(); + cpu_base->nr_retries++; + if (++retries < 3) + goto retry; + /* + * Give the system a chance to do something else than looping + * here. We stored the entry time, so we know exactly how long + * we spent here. We schedule the next event this amount of + * time away. + */ + cpu_base->nr_hangs++; + cpu_base->hang_detected = 1; + delta = ktime_sub(now, entry_time); + if (delta.tv64 > cpu_base->max_hang_time.tv64) + cpu_base->max_hang_time = delta; + /* + * Limit it to a sensible value as we enforce a longer + * delay. Give the CPU at least 100ms to catch up. 
+ */ + if (delta.tv64 > 100 * NSEC_PER_MSEC) + expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC); + else + expires_next = ktime_add(now, delta); + tick_program_event(expires_next, 1); + printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n", + ktime_to_ns(delta)); } /* diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 665c76edbf1..9d80db4747d 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -150,6 +150,9 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) P_ns(expires_next); P(hres_active); P(nr_events); + P(nr_retries); + P(nr_hangs); + P_ns(max_hang_time); #endif #undef P #undef P_ns @@ -254,7 +257,7 @@ static int timer_list_show(struct seq_file *m, void *v) u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Timer List Version: v0.4\n"); + SEQ_printf(m, "Timer List Version: v0.5\n"); SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); -- cgit v1.2.3 From 5f201907dfe4ad42c44006ddfcec00ed12e59497 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 10 Dec 2009 10:56:29 +0100 Subject: hrtimer: move timer stats helper functions to hrtimer.c There is no reason to make timer_stats_hrtimer_set_start_info and friends visible to the rest of the kernel. So move all of them to hrtimer.c. Also make timer_stats_hrtimer_set_start_info a static inline function so it gets inlined and we avoid another function call. Based on a patch by Thomas Gleixner. Signed-off-by: Heiko Carstens LKML-Reference: <20091210095629.GC4144@osiris.boeblingen.de.ibm.com> Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 931a4d99bc5..d2f9239dc6b 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -756,17 +756,33 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } #endif /* CONFIG_HIGH_RES_TIMERS */ -#ifdef CONFIG_TIMER_STATS -void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr) +static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) { +#ifdef CONFIG_TIMER_STATS if (timer->start_site) return; - - timer->start_site = addr; + timer->start_site = __builtin_return_address(0); memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); timer->start_pid = current->pid; +#endif } + +static inline void timer_stats_hrtimer_clear_start_info(struct hrtimer *timer) +{ +#ifdef CONFIG_TIMER_STATS + timer->start_site = NULL; #endif +} + +static inline void timer_stats_account_hrtimer(struct hrtimer *timer) +{ +#ifdef CONFIG_TIMER_STATS + if (likely(!timer_stats_active)) + return; + timer_stats_update_stats(timer, timer->start_pid, timer->start_site, + timer->function, timer->start_comm, 0); +#endif +} /* * Counterpart to lock_hrtimer_base above: -- cgit v1.2.3 From dfc12eb26a285df316be68a808af86964f3bff86 Mon Sep 17 00:00:00 2001 From: Phil Carmody Date: Thu, 10 Dec 2009 14:29:37 +0200 Subject: sched: Fix memory leak in two error corner cases If the second in each of these pairs of allocations fails, then the first one will not be freed in the error route out. Found by a static code analysis tool. 
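The fix is the usual unwind pattern: a later allocation failure frees what was allocated before it. A condensed sketch of one of the two sites (the per-cpu loop is dropped and the first allocation is paraphrased from context; see the hunk below for the real code):

        cfs_rq = kzalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, cpu_to_node(i));
        if (!cfs_rq)
                goto err;

        se = kzalloc_node(sizeof(struct sched_entity), GFP_KERNEL, cpu_to_node(i));
        if (!se)
                goto err_free_rq;       /* do not leak cfs_rq */

        init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
        return 1;

 err_free_rq:
        kfree(cfs_rq);
 err:
        return 0;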
Signed-off-by: Phil Carmody Acked-by: Peter Zijlstra LKML-Reference: <1260448177-28448-1-git-send-email-ext-phil.2.carmody@nokia.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 3de3deab809..36cc05a7694 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9855,13 +9855,15 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) se = kzalloc_node(sizeof(struct sched_entity), GFP_KERNEL, cpu_to_node(i)); if (!se) - goto err; + goto err_free_rq; init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); } return 1; + err_free_rq: + kfree(cfs_rq); err: return 0; } @@ -9943,13 +9945,15 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) rt_se = kzalloc_node(sizeof(struct sched_rt_entity), GFP_KERNEL, cpu_to_node(i)); if (!rt_se) - goto err; + goto err_free_rq; init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); } return 1; + err_free_rq: + kfree(rt_rq); err: return 0; } -- cgit v1.2.3 From 5e855db5d8fec44e6604eb245aa9077bbd3f0d05 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 10 Dec 2009 17:08:54 +0800 Subject: perf_event: Fix variable initialization in other codepaths Signed-off-by: Xiao Guangrong Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Paul Mackerras LKML-Reference: <4B20BAA6.7010609@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 3a5d6c4786b..d891ec4a810 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4300,6 +4300,7 @@ void perf_bp_event(struct perf_event *bp, void *data) struct perf_sample_data sample; struct pt_regs *regs = data; + sample.raw = NULL; sample.addr = bp->attr.bp_addr; if (!perf_exclude_event(bp, regs)) -- cgit v1.2.3 From b9889ed1ddeca5a3f3569c8de7354e9e97d803ae Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 10 Dec 2009 20:32:39 +0100 Subject: sched: Remove forced2_migrations stats This build warning: kernel/sched.c: In function 'set_task_cpu': kernel/sched.c:2070: warning: unused variable 'old_rq' Made me realize that the forced2_migrations stat looks pretty pointless (and a misnomer) - remove it. 
Cc: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ------ kernel/sched_debug.c | 2 -- 2 files changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 36cc05a7694..bc68037f319 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2067,7 +2067,6 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { int old_cpu = task_cpu(p); - struct rq *old_rq = cpu_rq(old_cpu); struct cfs_rq *old_cfsrq = task_cfs_rq(p), *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); @@ -2075,10 +2074,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (old_cpu != new_cpu) { p->se.nr_migrations++; -#ifdef CONFIG_SCHEDSTATS - if (task_hot(p, old_rq->clock, NULL)) - schedstat_inc(p, se.nr_forced2_migrations); -#endif perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); } @@ -2521,7 +2516,6 @@ static void __sched_fork(struct task_struct *p) p->se.nr_failed_migrations_running = 0; p->se.nr_failed_migrations_hot = 0; p->se.nr_forced_migrations = 0; - p->se.nr_forced2_migrations = 0; p->se.nr_wakeups = 0; p->se.nr_wakeups_sync = 0; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 0fc5287fe80..5ae24fc65d7 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -432,7 +432,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.nr_failed_migrations_running); P(se.nr_failed_migrations_hot); P(se.nr_forced_migrations); - P(se.nr_forced2_migrations); P(se.nr_wakeups); P(se.nr_wakeups_sync); P(se.nr_wakeups_migrate); @@ -508,7 +507,6 @@ void proc_sched_set_task(struct task_struct *p) p->se.nr_failed_migrations_running = 0; p->se.nr_failed_migrations_hot = 0; p->se.nr_forced_migrations = 0; - p->se.nr_forced2_migrations = 0; p->se.nr_wakeups = 0; p->se.nr_wakeups_sync = 0; p->se.nr_wakeups_migrate = 0; -- cgit v1.2.3 From 84667d4849b0e0a939a76f9f62d45fa3b4d59692 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Fri, 11 Dec 2009 08:43:13 -0600 Subject: kgdb: Read buffer overflow Roel Kluin reported an error found with Parfait. Where we want to ensure that that kgdb_info[-1] never gets accessed. Also check to ensure any negative tid does not exceed the size of the shadow CPU array, else report critical debug context because it is an internal kgdb failure. Reported-by: Roel Kluin Signed-off-by: Jason Wessel --- kernel/kgdb.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 7d701463402..29357a9ccfb 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -541,12 +541,17 @@ static struct task_struct *getthread(struct pt_regs *regs, int tid) */ if (tid == 0 || tid == -1) tid = -atomic_read(&kgdb_active) - 2; - if (tid < 0) { + if (tid < -1 && tid > -NR_CPUS - 2) { if (kgdb_info[-tid - 2].task) return kgdb_info[-tid - 2].task; else return idle_task(-tid - 2); } + if (tid <= 0) { + printk(KERN_ERR "KGDB: Internal thread select error\n"); + dump_stack(); + return NULL; + } /* * find_task_by_pid_ns() does not take the tasklist lock anymore -- cgit v1.2.3 From 028e7b175970be8fca58bfd7d61cc375babe40b7 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Fri, 11 Dec 2009 08:43:17 -0600 Subject: kgdb: allow for cpu switch when single stepping The kgdb core should not assume that a single step operation of a kernel thread will complete on the same CPU. 
The single step flag is set at the "thread" level and it is possible in a multi cpu system that a kernel thread can get scheduled on another cpu the next time it is run. As a further safety net in case a slave cpu is hung, the debug master cpu will try 100 times before giving up and assuming control of the slave cpus is no longer possible. It is more useful to be able to get some information out of kgdb instead of spinning forever. Signed-off-by: Jason Wessel --- kernel/kgdb.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 29357a9ccfb..ca21fe98e8d 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -129,6 +129,7 @@ struct task_struct *kgdb_usethread; struct task_struct *kgdb_contthread; int kgdb_single_step; +pid_t kgdb_sstep_pid; /* Our I/O buffers. */ static char remcom_in_buffer[BUFMAX]; @@ -1400,6 +1401,7 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) struct kgdb_state kgdb_var; struct kgdb_state *ks = &kgdb_var; unsigned long flags; + int sstep_tries = 100; int error = 0; int i, cpu; @@ -1430,13 +1432,14 @@ acquirelock: cpu_relax(); /* - * Do not start the debugger connection on this CPU if the last - * instance of the exception handler wanted to come into the - * debugger on a different CPU via a single step + * For single stepping, try to only enter on the processor + * that was single stepping. To gaurd against a deadlock, the + * kernel will only try for the value of sstep_tries before + * giving up and continuing on. */ if (atomic_read(&kgdb_cpu_doing_single_step) != -1 && - atomic_read(&kgdb_cpu_doing_single_step) != cpu) { - + (kgdb_info[cpu].task && + kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { atomic_set(&kgdb_active, -1); touch_softlockup_watchdog(); clocksource_touch_watchdog(); @@ -1529,6 +1532,13 @@ acquirelock: } kgdb_restore: + if (atomic_read(&kgdb_cpu_doing_single_step) != -1) { + int sstep_cpu = atomic_read(&kgdb_cpu_doing_single_step); + if (kgdb_info[sstep_cpu].task) + kgdb_sstep_pid = kgdb_info[sstep_cpu].task->pid; + else + kgdb_sstep_pid = 0; + } /* Free kgdb_active */ atomic_set(&kgdb_active, -1); touch_softlockup_watchdog(); -- cgit v1.2.3 From d625e9c0d706eb43afbf52634d5cecacae1d57cc Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Mon, 27 Apr 2009 13:20:21 -0500 Subject: kgdb: continue and warn on signal passing from gdb On some architectures for the segv trap, gdb wants to pass the signal back on continue. For kgdb this is not the default behavior, because it can cause the kernel to crash if you arbitrarily pass back a exception outside of kgdb. Instead of causing instability, pass a message back to gdb about the supported kgdb signal passing and execute a standard kgdb continue operation. 
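A rough, self-contained sketch of that fallback (hypothetical names and buffer handling, not the kgdb packet code itself): an unsupported pass-signal request is reported back to the debugger and rewritten in place into a plain continue command instead of being rejected with an error.

#include <stdio.h>
#include <string.h>

static char in_buffer[64];

static void handle_exception_pass(int signo)
{
        if (signo == 9 || signo == 15)
                return;   /* supported: handled by the real pass logic */

        /* Unsupported signal: warn the debugger, then downgrade to 'c'ontinue. */
        fprintf(stderr, "only signals 9 and 15 can be passed; "
                        "continuing without signal passing\n");
        in_buffer[0] = 'c';
        in_buffer[1] = '\0';
}

int main(void)
{
        strcpy(in_buffer, "C0b");   /* debugger asked to pass signal 11 (illustrative packet) */
        handle_exception_pass(11);
        printf("command is now: %s\n", in_buffer);
        return 0;
}

Rewriting the command buffer in place lets the existing continue path run unchanged, which is why the hunk below only replaces the error_packet() branch.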
Signed-off-by: Jason Wessel --- kernel/kgdb.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kgdb.c b/kernel/kgdb.c index ca21fe98e8d..8584eac55e3 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -1210,8 +1210,10 @@ static int gdb_cmd_exception_pass(struct kgdb_state *ks) return 1; } else { - error_packet(remcom_out_buffer, -EINVAL); - return 0; + kgdb_msg_write("KGDB only knows signal 9 (pass)" + " and 15 (pass and disconnect)\n" + "Executing a continue without signal passing\n", 0); + remcom_in_buffer[0] = 'c'; } /* Indicate fall through */ -- cgit v1.2.3 From 7f8b7ed6f825c729332b8190aca55c6bf95b158e Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Fri, 11 Dec 2009 08:43:20 -0600 Subject: kgdb: Always process the whole breakpoint list on activate or deactivate This patch fixes 2 edge cases in using kgdb in conjunction with gdb. 1) kgdb_deactivate_sw_breakpoints() should process the entire array of breakpoints. The failure to do so results in breakpoints that you cannot remove, because a break point can only be removed if its state flag is set to BP_SET. The easy way to duplicate this problem is to plant a break point in a kernel module and then unload the kernel module. 2) kgdb_activate_sw_breakpoints() should process the entire array of breakpoints. The failure to do so results in missed breakpoints when a breakpoint cannot be activated. Signed-off-by: Jason Wessel --- kernel/kgdb.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 8584eac55e3..2eb517e2351 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -625,7 +625,8 @@ static void kgdb_flush_swbreak_addr(unsigned long addr) static int kgdb_activate_sw_breakpoints(void) { unsigned long addr; - int error = 0; + int error; + int ret = 0; int i; for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { @@ -635,13 +636,16 @@ static int kgdb_activate_sw_breakpoints(void) addr = kgdb_break[i].bpt_addr; error = kgdb_arch_set_breakpoint(addr, kgdb_break[i].saved_instr); - if (error) - return error; + if (error) { + ret = error; + printk(KERN_INFO "KGDB: BP install failed: %lx", addr); + continue; + } kgdb_flush_swbreak_addr(addr); kgdb_break[i].state = BP_ACTIVE; } - return 0; + return ret; } static int kgdb_set_sw_break(unsigned long addr) @@ -688,7 +692,8 @@ static int kgdb_set_sw_break(unsigned long addr) static int kgdb_deactivate_sw_breakpoints(void) { unsigned long addr; - int error = 0; + int error; + int ret = 0; int i; for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { @@ -697,13 +702,15 @@ static int kgdb_deactivate_sw_breakpoints(void) addr = kgdb_break[i].bpt_addr; error = kgdb_arch_remove_breakpoint(addr, kgdb_break[i].saved_instr); - if (error) - return error; + if (error) { + printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr); + ret = error; + } kgdb_flush_swbreak_addr(addr); kgdb_break[i].state = BP_SET; } - return 0; + return ret; } static int kgdb_remove_sw_break(unsigned long addr) -- cgit v1.2.3 From 5ec93d1154fd1e269162398f8e70efc7e004485d Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Mon, 30 Nov 2009 13:18:45 +0000 Subject: tty: Move the leader test in disassociate There are two call points, both want to check that tty->signal->leader is set. 
Move the test into disassociate_ctty() as that will make locking changes easier in a bit Signed-off-by: Alan Cox Signed-off-by: Greg Kroah-Hartman --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 1143012951e..6f50ef55a6f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -971,7 +971,7 @@ NORET_TYPE void do_exit(long code) exit_thread(); cgroup_exit(tsk, 1); - if (group_dead && tsk->signal->leader) + if (group_dead) disassociate_ctty(1); module_put(task_thread_info(tsk)->exec_domain->module); -- cgit v1.2.3
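The tty refactoring above can be sketched in isolation (stand-in types and names, not the real tty code): the leader test moves out of both call sites and into the callee, so the upcoming locking changes only have to touch one function.

#include <stdbool.h>
#include <stdio.h>

struct task_stub {
        bool leader;
};

/* After the change: the callee performs the guard itself. */
static void disassociate_ctty_stub(struct task_stub *tsk, int on_exit)
{
        if (!tsk->leader)
                return;   /* the leader test now lives here, not at the call sites */
        printf("disassociating controlling tty (on_exit=%d)\n", on_exit);
}

int main(void)
{
        struct task_stub tsk = { .leader = true };

        /* The call site no longer repeats the leader check. */
        disassociate_ctty_stub(&tsk, 1);
        return 0;
}

With the guard inside the callee, do_exit() can call disassociate_ctty(1) unconditionally on group death, which is exactly what the hunk above changes.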