From 627371d73cdd04ed23fe098755b4f855138ad9e0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 29 Jul 2006 05:16:20 +0200 Subject: [PATCH] pi-futex: robust-futex exit crash fix Fix pi_state->list handling bugs: list handling mishap, locking error. Plus add more debug checks and fix a few style issues i noticed while debugging this. (reported by Ulrich Drepper and Jakub Jelinek.) Signed-off-by: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/futex.c | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index cf0c8e21d1a..f59003b1d8f 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -415,15 +415,15 @@ out_unlock: */ void exit_pi_state_list(struct task_struct *curr) { - struct futex_hash_bucket *hb; struct list_head *next, *head = &curr->pi_state_list; struct futex_pi_state *pi_state; + struct futex_hash_bucket *hb; union futex_key key; /* * We are a ZOMBIE and nobody can enqueue itself on * pi_state_list anymore, but we have to be careful - * versus waiters unqueueing themselfs + * versus waiters unqueueing themselves: */ spin_lock_irq(&curr->pi_lock); while (!list_empty(head)) { @@ -431,21 +431,24 @@ void exit_pi_state_list(struct task_struct *curr) next = head->next; pi_state = list_entry(next, struct futex_pi_state, list); key = pi_state->key; + hb = hash_futex(&key); spin_unlock_irq(&curr->pi_lock); - hb = hash_futex(&key); spin_lock(&hb->lock); spin_lock_irq(&curr->pi_lock); + /* + * We dropped the pi-lock, so re-check whether this + * task still owns the PI-state: + */ if (head->next != next) { spin_unlock(&hb->lock); continue; } - list_del_init(&pi_state->list); - WARN_ON(pi_state->owner != curr); - + WARN_ON(list_empty(&pi_state->list)); + list_del_init(&pi_state->list); pi_state->owner = NULL; spin_unlock_irq(&curr->pi_lock); @@ -470,7 +473,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) head = &hb->chain; list_for_each_entry_safe(this, next, head, list) { - if (match_futex (&this->key, &me->key)) { + if (match_futex(&this->key, &me->key)) { /* * Another waiter already exists - bump up * the refcount and return its pi_state: @@ -482,6 +485,8 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) if (unlikely(!pi_state)) return -EINVAL; + WARN_ON(!atomic_read(&pi_state->refcount)); + atomic_inc(&pi_state->refcount); me->pi_state = pi_state; @@ -510,6 +515,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) pi_state->key = me->key; spin_lock_irq(&p->pi_lock); + WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &p->pi_state_list); pi_state->owner = p; spin_unlock_irq(&p->pi_lock); @@ -584,9 +590,17 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) if (curval != uval) return -EINVAL; - list_del_init(&pi_state->owner->pi_state_list); + spin_lock_irq(&pi_state->owner->pi_lock); + WARN_ON(list_empty(&pi_state->list)); + list_del_init(&pi_state->list); + spin_unlock_irq(&pi_state->owner->pi_lock); + + spin_lock_irq(&new_owner->pi_lock); + WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &new_owner->pi_state_list); pi_state->owner = new_owner; + spin_unlock_irq(&new_owner->pi_lock); + rt_mutex_unlock(&pi_state->pi_mutex); return 0; @@ -1236,6 +1250,7 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock, /* Owner died? */ if (q.pi_state->owner != NULL) { spin_lock_irq(&q.pi_state->owner->pi_lock); + WARN_ON(list_empty(&q.pi_state->list)); list_del_init(&q.pi_state->list); spin_unlock_irq(&q.pi_state->owner->pi_lock); } else @@ -1244,6 +1259,7 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock, q.pi_state->owner = current; spin_lock_irq(¤t->pi_lock); + WARN_ON(!list_empty(&q.pi_state->list)); list_add(&q.pi_state->list, ¤t->pi_state_list); spin_unlock_irq(¤t->pi_lock); -- cgit v1.2.3 From e3f2ddeac718c768fdac4b7fe69d465172f788a8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 29 Jul 2006 05:17:57 +0200 Subject: [PATCH] pi-futex: robust-futex exit Fix robust PI-futexes to be properly unlocked on unexpected exit. For this to work the kernel has to know whether a futex is a PI or a non-PI one, because the semantics are different. Since the space in relevant glibc data structures is extremely scarce, the best solution is to encode the 'PI' information in bit 0 of the robust list pointer. Existing (non-PI) glibc robust futexes have this bit always zero, so the ABI is kept. New glibc with PI-robust-futexes will set this bit. Further fixes from Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Ulrich Drepper Signed-off-by: Thomas Gleixner Signed-off-by: Linus Torvalds --- kernel/futex.c | 91 +++++++++++++++++++++++++++++++++++---------------- kernel/futex_compat.c | 34 ++++++++++++++----- 2 files changed, 87 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index f59003b1d8f..dda2049692a 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -495,10 +495,13 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) } /* - * We are the first waiter - try to look up the real owner and - * attach the new pi_state to it: + * We are the first waiter - try to look up the real owner and attach + * the new pi_state to it, but bail out when the owner died bit is set + * and TID = 0: */ pid = uval & FUTEX_TID_MASK; + if (!pid && (uval & FUTEX_OWNER_DIED)) + return -ESRCH; p = futex_find_get_task(pid); if (!p) return -ESRCH; @@ -579,16 +582,17 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) * kept enabled while there is PI state around. We must also * preserve the owner died bit.) */ - newval = (uval & FUTEX_OWNER_DIED) | FUTEX_WAITERS | new_owner->pid; - - inc_preempt_count(); - curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); - dec_preempt_count(); + if (!(uval & FUTEX_OWNER_DIED)) { + newval = FUTEX_WAITERS | new_owner->pid; - if (curval == -EFAULT) - return -EFAULT; - if (curval != uval) - return -EINVAL; + inc_preempt_count(); + curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); + dec_preempt_count(); + if (curval == -EFAULT) + return -EFAULT; + if (curval != uval) + return -EINVAL; + } spin_lock_irq(&pi_state->owner->pi_lock); WARN_ON(list_empty(&pi_state->list)); @@ -1443,9 +1447,11 @@ retry_locked: * again. If it succeeds then we can return without waking * anyone else up: */ - inc_preempt_count(); - uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); - dec_preempt_count(); + if (!(uval & FUTEX_OWNER_DIED)) { + inc_preempt_count(); + uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); + dec_preempt_count(); + } if (unlikely(uval == -EFAULT)) goto pi_faulted; @@ -1478,9 +1484,11 @@ retry_locked: /* * No waiters - kernel unlocks the futex: */ - ret = unlock_futex_pi(uaddr, uval); - if (ret == -EFAULT) - goto pi_faulted; + if (!(uval & FUTEX_OWNER_DIED)) { + ret = unlock_futex_pi(uaddr, uval); + if (ret == -EFAULT) + goto pi_faulted; + } out_unlock: spin_unlock(&hb->lock); @@ -1699,9 +1707,9 @@ err_unlock: * Process a futex-list entry, check whether it's owned by the * dying task, and do notification if so: */ -int handle_futex_death(u32 __user *uaddr, struct task_struct *curr) +int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) { - u32 uval, nval; + u32 uval, nval, mval; retry: if (get_user(uval, uaddr)) @@ -1718,20 +1726,44 @@ retry: * thread-death.) The rest of the cleanup is done in * userspace. */ - nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, - uval | FUTEX_OWNER_DIED); + mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; + nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval); + if (nval == -EFAULT) return -1; if (nval != uval) goto retry; - if (uval & FUTEX_WAITERS) - futex_wake(uaddr, 1); + /* + * Wake robust non-PI futexes here. The wakeup of + * PI futexes happens in exit_pi_state(): + */ + if (!pi) { + if (uval & FUTEX_WAITERS) + futex_wake(uaddr, 1); + } } return 0; } +/* + * Fetch a robust-list pointer. Bit 0 signals PI futexes: + */ +static inline int fetch_robust_entry(struct robust_list __user **entry, + struct robust_list __user **head, int *pi) +{ + unsigned long uentry; + + if (get_user(uentry, (unsigned long *)head)) + return -EFAULT; + + *entry = (void *)(uentry & ~1UL); + *pi = uentry & 1; + + return 0; +} + /* * Walk curr->robust_list (very carefully, it's a userspace list!) * and mark any locks found there dead, and notify any waiters. @@ -1742,14 +1774,14 @@ void exit_robust_list(struct task_struct *curr) { struct robust_list_head __user *head = curr->robust_list; struct robust_list __user *entry, *pending; - unsigned int limit = ROBUST_LIST_LIMIT; + unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; unsigned long futex_offset; /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): */ - if (get_user(entry, &head->list.next)) + if (fetch_robust_entry(&entry, &head->list.next, &pi)) return; /* * Fetch the relative futex offset: @@ -1760,10 +1792,11 @@ void exit_robust_list(struct task_struct *curr) * Fetch any possibly pending lock-add first, and handle it * if it exists: */ - if (get_user(pending, &head->list_op_pending)) + if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) return; + if (pending) - handle_futex_death((void *)pending + futex_offset, curr); + handle_futex_death((void *)pending + futex_offset, curr, pip); while (entry != &head->list) { /* @@ -1772,12 +1805,12 @@ void exit_robust_list(struct task_struct *curr) */ if (entry != pending) if (handle_futex_death((void *)entry + futex_offset, - curr)) + curr, pi)) return; /* * Fetch the next entry in the list: */ - if (get_user(entry, &entry->next)) + if (fetch_robust_entry(&entry, &entry->next, &pi)) return; /* * Avoid excessively long or circular lists: diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index d1d92b441fb..d1aab1a452c 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -12,6 +12,23 @@ #include + +/* + * Fetch a robust-list pointer. Bit 0 signals PI futexes: + */ +static inline int +fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, + compat_uptr_t *head, int *pi) +{ + if (get_user(*uentry, head)) + return -EFAULT; + + *entry = compat_ptr((*uentry) & ~1); + *pi = (unsigned int)(*uentry) & 1; + + return 0; +} + /* * Walk curr->robust_list (very carefully, it's a userspace list!) * and mark any locks found there dead, and notify any waiters. @@ -22,17 +39,16 @@ void compat_exit_robust_list(struct task_struct *curr) { struct compat_robust_list_head __user *head = curr->compat_robust_list; struct robust_list __user *entry, *pending; + unsigned int limit = ROBUST_LIST_LIMIT, pi; compat_uptr_t uentry, upending; - unsigned int limit = ROBUST_LIST_LIMIT; compat_long_t futex_offset; /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): */ - if (get_user(uentry, &head->list.next)) + if (fetch_robust_entry(&uentry, &entry, &head->list.next, &pi)) return; - entry = compat_ptr(uentry); /* * Fetch the relative futex offset: */ @@ -42,11 +58,11 @@ void compat_exit_robust_list(struct task_struct *curr) * Fetch any possibly pending lock-add first, and handle it * if it exists: */ - if (get_user(upending, &head->list_op_pending)) + if (fetch_robust_entry(&upending, &pending, + &head->list_op_pending, &pi)) return; - pending = compat_ptr(upending); if (upending) - handle_futex_death((void *)pending + futex_offset, curr); + handle_futex_death((void *)pending + futex_offset, curr, pi); while (compat_ptr(uentry) != &head->list) { /* @@ -55,15 +71,15 @@ void compat_exit_robust_list(struct task_struct *curr) */ if (entry != pending) if (handle_futex_death((void *)entry + futex_offset, - curr)) + curr, pi)) return; /* * Fetch the next entry in the list: */ - if (get_user(uentry, (compat_uptr_t *)&entry->next)) + if (fetch_robust_entry(&uentry, &entry, + (compat_uptr_t *)&entry->next, &pi)) return; - entry = compat_ptr(uentry); /* * Avoid excessively long or circular lists: */ -- cgit v1.2.3 From f712c0c7e1796f92e45e4de144e247816d974b8f Mon Sep 17 00:00:00 2001 From: "Siddha, Suresh B" Date: Sun, 30 Jul 2006 03:02:59 -0700 Subject: [PATCH] sched: build_sched_domains() fix Use the correct groups while initializing sched groups power for allnodes_domain. This fixes the crash observed while creating exclusive cpusets. Signed-off-by: Suresh Siddha Reported-and-tested-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index b44b9a43b0f..41a571806ce 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6494,7 +6494,12 @@ static int build_sched_domains(const cpumask_t *cpu_map) for (i = 0; i < MAX_NUMNODES; i++) init_numa_sched_groups_power(sched_group_nodes[i]); - init_numa_sched_groups_power(sched_group_allnodes); + if (sched_group_allnodes) { + int group = cpu_to_allnodes_group(first_cpu(*cpu_map)); + struct sched_group *sg = &sched_group_allnodes[group]; + + init_numa_sched_groups_power(sg); + } #endif /* Attach the domains */ -- cgit v1.2.3 From 15a647eba94c3da27ccc666bea72e7cca06b2d19 Mon Sep 17 00:00:00 2001 From: David Brownell Date: Sun, 30 Jul 2006 03:03:08 -0700 Subject: [PATCH] genirq: {en,dis}able_irq_wake() need refcounting too IRQs need refcounting and a state flag to track whether the the IRQ should be enabled or disabled as a "normal IRQ" source after a series of calls to {en,dis}able_irq(). For shared IRQs, the IRQ must be enabled so long as at least one driver needs it active. Likewise, IRQs need the same support to track whether the IRQ should be enabled or disabled as a "wakeup event" source after a series of calls to {en,dis}able_irq_wake(). For shared IRQs, the IRQ must be enabled as a wakeup source during sleep so long as at least one driver needs it. But right now they _don't have_ that refcounting ... which means sharing a wakeup-capable IRQ can't work correctly in some configurations. This patch adds the refcount and flag mechanisms to set_irq_wake() -- which is what {en,dis}able_irq_wake() call -- and minimal documentation of what the irq wake mechanism does. Drivers relying on the older (broken) "toggle" semantics will trigger a warning; that'll be a handful of drivers on ARM systems. Signed-off-by: David Brownell Acked-by: Ingo Molnar Acked-by: Thomas Gleixner Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 4e461438e48..92be519eff2 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -137,16 +137,40 @@ EXPORT_SYMBOL(enable_irq); * @irq: interrupt to control * @on: enable/disable power management wakeup * - * Enable/disable power management wakeup mode + * Enable/disable power management wakeup mode, which is + * disabled by default. Enables and disables must match, + * just as they match for non-wakeup mode support. + * + * Wakeup mode lets this IRQ wake the system from sleep + * states like "suspend to RAM". */ int set_irq_wake(unsigned int irq, unsigned int on) { struct irq_desc *desc = irq_desc + irq; unsigned long flags; int ret = -ENXIO; + int (*set_wake)(unsigned, unsigned) = desc->chip->set_wake; + /* wakeup-capable irqs can be shared between drivers that + * don't need to have the same sleep mode behaviors. + */ spin_lock_irqsave(&desc->lock, flags); - if (desc->chip->set_wake) + if (on) { + if (desc->wake_depth++ == 0) + desc->status |= IRQ_WAKEUP; + else + set_wake = NULL; + } else { + if (desc->wake_depth == 0) { + printk(KERN_WARNING "Unbalanced IRQ %d " + "wake disable\n", irq); + WARN_ON(1); + } else if (--desc->wake_depth == 0) + desc->status &= ~IRQ_WAKEUP; + else + set_wake = NULL; + } + if (set_wake) ret = desc->chip->set_wake(irq, on); spin_unlock_irqrestore(&desc->lock, flags); return ret; -- cgit v1.2.3 From 7d94dddd438bcba97db44f120da39bb001b5249f Mon Sep 17 00:00:00 2001 From: Shailabh Nagar Date: Sun, 30 Jul 2006 03:03:10 -0700 Subject: [PATCH] make taskstats sending completely independent of delay accounting on/off status Complete the separation of delay accounting and taskstats by ignoring the return value of delay accounting functions that fill in parts of taskstats before it is sent out (either in response to a command or as part of a task exit). Also make delayacct_add_tsk return silently when delay accounting is turned off rather than treat it as an error. Signed-off-by: Shailabh Nagar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index f45179ce028..b4c737a1140 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -177,7 +177,7 @@ static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) static int fill_pid(pid_t pid, struct task_struct *pidtsk, struct taskstats *stats) { - int rc; + int rc = 0; struct task_struct *tsk = pidtsk; if (!pidtsk) { @@ -196,12 +196,10 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk, * Each accounting subsystem adds calls to its functions to * fill in relevant parts of struct taskstsats as follows * - * rc = per-task-foo(stats, tsk); - * if (rc) - * goto err; + * per-task-foo(stats, tsk); */ - rc = delayacct_add_tsk(stats, tsk); + delayacct_add_tsk(stats, tsk); stats->version = TASKSTATS_VERSION; /* Define err: label here if needed */ -- cgit v1.2.3 From d94a041519f3ab1ac023bf917619cd8c4a7d3c01 Mon Sep 17 00:00:00 2001 From: Shailabh Nagar Date: Sun, 30 Jul 2006 03:03:11 -0700 Subject: [PATCH] taskstats: free skb, avoid returns in send_cpu_listeners Add a missing freeing of skb in the case there are no listeners at all. Also remove the returning of error values by the function as it is unused by the sole caller. Signed-off-by: Shailabh Nagar Signed-off-by: Chandra Seetharaman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index b4c737a1140..e7818765733 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -121,46 +121,45 @@ static int send_reply(struct sk_buff *skb, pid_t pid) /* * Send taskstats data in @skb to listeners registered for @cpu's exit data */ -static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) +static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) { struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); struct listener_list *listeners; struct listener *s, *tmp; struct sk_buff *skb_next, *skb_cur = skb; void *reply = genlmsg_data(genlhdr); - int rc, ret, delcount = 0; + int rc, delcount = 0; rc = genlmsg_end(skb, reply); if (rc < 0) { nlmsg_free(skb); - return rc; + return; } rc = 0; listeners = &per_cpu(listener_array, cpu); down_read(&listeners->sem); - list_for_each_entry_safe(s, tmp, &listeners->list, list) { + list_for_each_entry(s, &listeners->list, list) { skb_next = NULL; if (!list_is_last(&s->list, &listeners->list)) { skb_next = skb_clone(skb_cur, GFP_KERNEL); - if (!skb_next) { - nlmsg_free(skb_cur); - rc = -ENOMEM; + if (!skb_next) break; - } } - ret = genlmsg_unicast(skb_cur, s->pid); - if (ret == -ECONNREFUSED) { + rc = genlmsg_unicast(skb_cur, s->pid); + if (rc == -ECONNREFUSED) { s->valid = 0; delcount++; - rc = ret; } skb_cur = skb_next; } up_read(&listeners->sem); + if (skb_cur) + nlmsg_free(skb_cur); + if (!delcount) - return rc; + return; /* Delete invalidated entries */ down_write(&listeners->sem); @@ -171,7 +170,6 @@ static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) } } up_write(&listeners->sem); - return rc; } static int fill_pid(pid_t pid, struct task_struct *pidtsk, -- cgit v1.2.3 From 163ecdff060f2fa9e8f5238882fd0137493556a6 Mon Sep 17 00:00:00 2001 From: Shailabh Nagar Date: Sun, 30 Jul 2006 03:03:11 -0700 Subject: [PATCH] delay accounting: temporarily enable by default Enable delay accounting by default so that feature gets coverage testing without requiring special measures. Earlier, it was off by default and had to be enabled via a boot time param. This patch reverses the default behaviour to improve coverage testing. It can be removed late in the kernel development cycle if its believed users shouldn't have to incur any cost if they don't want delay accounting. Or it can be retained forever if the utility of the stats is deemed common enough to warrant keeping the feature on. Signed-off-by: Shailabh Nagar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/delayacct.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/delayacct.c b/kernel/delayacct.c index f05392d6426..57ca3730205 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -19,15 +19,15 @@ #include #include -int delayacct_on __read_mostly; /* Delay accounting turned on/off */ +int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */ kmem_cache_t *delayacct_cache; -static int __init delayacct_setup_enable(char *str) +static int __init delayacct_setup_disable(char *str) { - delayacct_on = 1; + delayacct_on = 0; return 1; } -__setup("delayacct", delayacct_setup_enable); +__setup("nodelayacct", delayacct_setup_disable); void delayacct_init(void) { -- cgit v1.2.3 From a9ad965ea9a6d719daf333847a2ceb0e363994bd Mon Sep 17 00:00:00 2001 From: "bibo, mao" Date: Sun, 30 Jul 2006 03:03:26 -0700 Subject: [PATCH] IA64: kprobe invalidate icache of jump buffer Kprobe inserts breakpoint instruction in probepoint and then jumps to instruction slot when breakpoint is hit, the instruction slot icache must be consistent with dcache. Here is the patch which invalidates instruction slot icache area. Without this patch, in some machines there will be fault when executing instruction slot where icache content is inconsistent with dcache. Signed-off-by: bibo,mao Acked-by: "Luck, Tony" Acked-by: Keshavamurthy Anil S Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 64aab081153..3f57dfdc8f9 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -393,6 +393,7 @@ static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p) static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) { copy_kprobe(p, ap); + flush_insn_slot(ap); ap->addr = p->addr; ap->pre_handler = aggr_pre_handler; ap->fault_handler = aggr_fault_handler; -- cgit v1.2.3 From 8c78f3075dab4be279e283f901f00e33ce44890a Mon Sep 17 00:00:00 2001 From: Chandra Seetharaman Date: Sun, 30 Jul 2006 03:03:35 -0700 Subject: [PATCH] cpu hotplug: replace __devinit* with __cpuinit* for cpu notifications Few of the callback functions and notifier blocks that are associated with cpu notifications incorrectly have __devinit and __devinitdata. They should be __cpuinit and __cpuinitdata instead. It makes no functional difference but wastes text area when CONFIG_HOTPLUG is enabled and CONFIG_HOTPLUG_CPU is not. This patch fixes all those instances. Signed-off-by: Chandra Seetharaman Cc: Ashok Raj Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/hrtimer.c | 4 ++-- kernel/rcupdate.c | 4 ++-- kernel/softirq.c | 4 ++-- kernel/softlockup.c | 4 ++-- kernel/timer.c | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index d17766d40da..be989efc785 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -835,7 +835,7 @@ static void migrate_hrtimers(int cpu) } #endif /* CONFIG_HOTPLUG_CPU */ -static int __devinit hrtimer_cpu_notify(struct notifier_block *self, +static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -859,7 +859,7 @@ static int __devinit hrtimer_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __devinitdata hrtimers_nb = { +static struct notifier_block __cpuinitdata hrtimers_nb = { .notifier_call = hrtimer_cpu_notify, }; diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 759805c9859..436ab35f6fa 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -548,7 +548,7 @@ static void __devinit rcu_online_cpu(int cpu) tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); } -static int __devinit rcu_cpu_notify(struct notifier_block *self, +static int __cpuinit rcu_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -565,7 +565,7 @@ static int __devinit rcu_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __devinitdata rcu_nb = { +static struct notifier_block __cpuinitdata rcu_nb = { .notifier_call = rcu_cpu_notify, }; diff --git a/kernel/softirq.c b/kernel/softirq.c index 0f08a84ae30..aab880677ce 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -547,7 +547,7 @@ static void takeover_tasklets(unsigned int cpu) } #endif /* CONFIG_HOTPLUG_CPU */ -static int __devinit cpu_callback(struct notifier_block *nfb, +static int __cpuinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -587,7 +587,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -static struct notifier_block __devinitdata cpu_nfb = { +static struct notifier_block __cpuinitdata cpu_nfb = { .notifier_call = cpu_callback }; diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 6b76caa2298..03e6a2b0b78 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -104,7 +104,7 @@ static int watchdog(void * __bind_cpu) /* * Create/destroy watchdog threads as CPUs come and go: */ -static int __devinit +static int __cpuinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { int hotcpu = (unsigned long)hcpu; @@ -142,7 +142,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) return NOTIFY_OK; } -static struct notifier_block __devinitdata cpu_nfb = { +static struct notifier_block __cpuinitdata cpu_nfb = { .notifier_call = cpu_callback }; diff --git a/kernel/timer.c b/kernel/timer.c index 05809c2e2fd..ee319fc69f4 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1688,7 +1688,7 @@ static void __devinit migrate_timers(int cpu) } #endif /* CONFIG_HOTPLUG_CPU */ -static int __devinit timer_cpu_notify(struct notifier_block *self, +static int __cpuinit timer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -1708,7 +1708,7 @@ static int __devinit timer_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __devinitdata timers_nb = { +static struct notifier_block __cpuinitdata timers_nb = { .notifier_call = timer_cpu_notify, }; -- cgit v1.2.3 From 6ea24f9ad18c65cc179593b5cc2a88cdadf8cc0c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sun, 30 Jul 2006 03:03:38 -0700 Subject: [PATCH] fix bad macro param in timer.c We have #define INDEX(N) (base->timer_jiffies >> (TVR_BITS + N * TVN_BITS)) & TVN_MASK and it's used via list = varray[i + 1]->vec + (INDEX(i + 1)); So, due to underparenthesisation, this INDEX(i+1) is now a ... (TVR_BITS + i + 1 * TVN_BITS)) ... So this bugfix changes behaviour. It worked before by sheer luck: "If i was anything but 0, it was broken. But this was only used by s390 and arm. Since it was for the next interrupt, could that next interrupt be a problem (going into the second cascade)? But it was probably seldom wrong. That is, this would fail if the next interrupt was in the second cascade, and was wrapped. Which may never of happened. Also if it did happen, it would have just missed the interrupt. If an interrupt was missed, and no one was there to miss it, was it really missed :-)" Signed-off-by: Steven Rostedt Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index ee319fc69f4..726caf9a0a6 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -408,7 +408,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index) * This function cascades all vectors and executes all expired timer * vectors. */ -#define INDEX(N) (base->timer_jiffies >> (TVR_BITS + N * TVN_BITS)) & TVN_MASK +#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) static inline void __run_timers(tvec_base_t *base) { -- cgit v1.2.3 From 2d7d253548cffdce80f4e03664686e9ccb1b0ed7 Mon Sep 17 00:00:00 2001 From: Jim Houston Date: Sun, 30 Jul 2006 03:03:39 -0700 Subject: [PATCH] fix cond_resched() fix In cond_resched_lock() it calls __resched_legal() before dropping the spin lock. __resched_legal() will always finds the preempt_count non-zero and will prevent the call to __cond_resched(). The attached patch adds a parameter to __resched_legal() with the expected preempt_count value. Cc: Ingo Molnar Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 41a571806ce..de440b220b4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4456,9 +4456,9 @@ asmlinkage long sys_sched_yield(void) return 0; } -static inline int __resched_legal(void) +static inline int __resched_legal(int expected_preempt_count) { - if (unlikely(preempt_count())) + if (unlikely(preempt_count() != expected_preempt_count)) return 0; if (unlikely(system_state != SYSTEM_RUNNING)) return 0; @@ -4484,7 +4484,7 @@ static void __cond_resched(void) int __sched cond_resched(void) { - if (need_resched() && __resched_legal()) { + if (need_resched() && __resched_legal(0)) { __cond_resched(); return 1; } @@ -4510,7 +4510,7 @@ int cond_resched_lock(spinlock_t *lock) ret = 1; spin_lock(lock); } - if (need_resched() && __resched_legal()) { + if (need_resched() && __resched_legal(1)) { spin_release(&lock->dep_map, 1, _THIS_IP_); _raw_spin_unlock(lock); preempt_enable_no_resched(); @@ -4526,7 +4526,7 @@ int __sched cond_resched_softirq(void) { BUG_ON(!in_softirq()); - if (need_resched() && __resched_legal()) { + if (need_resched() && __resched_legal(0)) { raw_local_irq_disable(); _local_bh_enable(); raw_local_irq_enable(); -- cgit v1.2.3 From 0fcb78c22f06340cdba884d7381adb3a0148bbb6 Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Sun, 30 Jul 2006 03:03:42 -0700 Subject: [PATCH] Add DocBook documentation for workqueue functions kernel/workqueue.c was omitted from generating kernel documentation. This adds a new section "Workqueues and Kevents" and adds documentation for some of the functions. Some functions in this file already had DocBook-style comments, now they finally become visible. Signed-off-by: Rolf Eike Beer Cc: "Randy.Dunlap" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 54 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index eebb1d83923..448e8f7b342 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -93,9 +93,12 @@ static void __queue_work(struct cpu_workqueue_struct *cwq, spin_unlock_irqrestore(&cwq->lock, flags); } -/* - * Queue work on a workqueue. Return non-zero if it was successfully - * added. +/** + * queue_work - queue work on a workqueue + * @wq: workqueue to use + * @work: work to queue + * + * Returns non-zero if it was successfully added. * * We queue the work to the CPU it was submitted, but there is no * guarantee that it will be processed by that CPU. @@ -128,6 +131,14 @@ static void delayed_work_timer_fn(unsigned long __data) __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); } +/** + * queue_delayed_work - queue work on a workqueue after delay + * @wq: workqueue to use + * @work: work to queue + * @delay: number of jiffies to wait before queueing + * + * Returns non-zero if it was successfully added. + */ int fastcall queue_delayed_work(struct workqueue_struct *wq, struct work_struct *work, unsigned long delay) { @@ -150,6 +161,15 @@ int fastcall queue_delayed_work(struct workqueue_struct *wq, } EXPORT_SYMBOL_GPL(queue_delayed_work); +/** + * queue_delayed_work_on - queue work on specific CPU after delay + * @cpu: CPU number to execute work on + * @wq: workqueue to use + * @work: work to queue + * @delay: number of jiffies to wait before queueing + * + * Returns non-zero if it was successfully added. + */ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work, unsigned long delay) { @@ -275,8 +295,9 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) } } -/* +/** * flush_workqueue - ensure that any scheduled work has run to completion. + * @wq: workqueue to flush * * Forces execution of the workqueue and blocks until its completion. * This is typically used in driver shutdown handlers. @@ -400,6 +421,12 @@ static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu) kthread_stop(p); } +/** + * destroy_workqueue - safely terminate a workqueue + * @wq: target workqueue + * + * Safely destroy a workqueue. All work currently pending will be done first. + */ void destroy_workqueue(struct workqueue_struct *wq) { int cpu; @@ -425,18 +452,41 @@ EXPORT_SYMBOL_GPL(destroy_workqueue); static struct workqueue_struct *keventd_wq; +/** + * schedule_work - put work task in global workqueue + * @work: job to be done + * + * This puts a job in the kernel-global workqueue. + */ int fastcall schedule_work(struct work_struct *work) { return queue_work(keventd_wq, work); } EXPORT_SYMBOL(schedule_work); +/** + * schedule_delayed_work - put work task in global workqueue after delay + * @work: job to be done + * @delay: number of jiffies to wait + * + * After waiting for a given time this puts a job in the kernel-global + * workqueue. + */ int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay) { return queue_delayed_work(keventd_wq, work, delay); } EXPORT_SYMBOL(schedule_delayed_work); +/** + * schedule_delayed_work_on - queue work in global workqueue on CPU after delay + * @cpu: cpu to use + * @work: job to be done + * @delay: number of jiffies to wait + * + * After waiting for a given time this puts a job in the kernel-global + * workqueue on the specified CPU. + */ int schedule_delayed_work_on(int cpu, struct work_struct *work, unsigned long delay) { -- cgit v1.2.3 From b50f60ceeef2e38e529737c0260d9543939915ad Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Sun, 30 Jul 2006 03:03:52 -0700 Subject: [PATCH] pi-futex: missing pi_waiters plist initialization Initialize init task's pi_waiters plist. Otherwise cpu hotplug of cpu 0 might crash, since rt_mutex_getprio() accesses an uninitialized list head. call chain which led to crash: take_cpu_down sched_idle_next __setscheduler rt_mutex_getprio Using PLIST_HEAD_INIT in the INIT_TASK macro doesn't work unfortunately, since the pi_waiters member is only conditionally present. Cc: Arjan van de Ven Cc: Thomas Gleixner Acked-by: Ingo Molnar Signed-off-by: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index de440b220b4..a2be2d05529 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6766,6 +6766,11 @@ void __init sched_init(void) } set_load_weight(&init_task); + +#ifdef CONFIG_RT_MUTEXES + plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); +#endif + /* * The boot idle thread does lazy MMU switching as well: */ -- cgit v1.2.3 From 3c829c367a1a52550378584a657768217971e587 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Sun, 30 Jul 2006 03:04:02 -0700 Subject: [PATCH] Reducing local_bh_enable/disable overhead in irqtrace The recent changes from irqtrace feature has added overheads to local_bh_disable and local_bh_enable that reduces UDP performance across x86_64 and IA64, even though IA64 does not support the irqtrace feature. Patch in question is [PATCH]lockdep: irqtrace subsystem, core http://www.kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=c ommit;h=de30a2b355ea85350ca2f58f3b9bf4e5bc007986 Prior to this patch, local_bh_disable was a short macro. Now it is a function which calls __local_bh_disable with added irq flags save and restore. The irq flags save and restore were also added to local_bh_enable, probably for injecting the trace irqs code. This overhead is on the generic code path across all architectures. On a IA_64 test machine (Itanium-2 1.6 GHz) running a benchmark like netperf's UDP streaming test, the added overhead results in a drop of 3% in throughput, as udp_sendmsg calls the local_bh_enable/disable several times. Other workloads that have heavy usages of local_bh_enable/disable could also be affected. The patch ideally should not have affected IA-64 performance as it does not have IRQ tracing support. A significant portion of the overhead is in the added irq flags save and restore, which I think is not needed if IRQ tracing is unused. A suggested patch is attached below that recovers the lost performance. However, the "ifdef"s in the patch are a bit ugly. Signed-off-by: Tim Chen Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/softirq.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index aab880677ce..3789ca98197 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -65,6 +65,7 @@ static inline void wakeup_softirqd(void) * This one is for softirq.c-internal use, * where hardirqs are disabled legitimately: */ +#ifdef CONFIG_TRACE_IRQFLAGS static void __local_bh_disable(unsigned long ip) { unsigned long flags; @@ -80,6 +81,13 @@ static void __local_bh_disable(unsigned long ip) trace_softirqs_off(ip); raw_local_irq_restore(flags); } +#else /* !CONFIG_TRACE_IRQFLAGS */ +static inline void __local_bh_disable(unsigned long ip) +{ + add_preempt_count(SOFTIRQ_OFFSET); + barrier(); +} +#endif /* CONFIG_TRACE_IRQFLAGS */ void local_bh_disable(void) { @@ -121,12 +129,16 @@ EXPORT_SYMBOL(_local_bh_enable); void local_bh_enable(void) { +#ifdef CONFIG_TRACE_IRQFLAGS unsigned long flags; WARN_ON_ONCE(in_irq()); +#endif WARN_ON_ONCE(irqs_disabled()); +#ifdef CONFIG_TRACE_IRQFLAGS local_irq_save(flags); +#endif /* * Are softirqs going to be turned on now: */ @@ -142,18 +154,22 @@ void local_bh_enable(void) do_softirq(); dec_preempt_count(); +#ifdef CONFIG_TRACE_IRQFLAGS local_irq_restore(flags); +#endif preempt_check_resched(); } EXPORT_SYMBOL(local_bh_enable); void local_bh_enable_ip(unsigned long ip) { +#ifdef CONFIG_TRACE_IRQFLAGS unsigned long flags; WARN_ON_ONCE(in_irq()); local_irq_save(flags); +#endif /* * Are softirqs going to be turned on now: */ @@ -169,7 +185,9 @@ void local_bh_enable_ip(unsigned long ip) do_softirq(); dec_preempt_count(); +#ifdef CONFIG_TRACE_IRQFLAGS local_irq_restore(flags); +#endif preempt_check_resched(); } EXPORT_SYMBOL(local_bh_enable_ip); -- cgit v1.2.3 From d07fe82c24daab2360e2790f488bcffa7db74825 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sun, 30 Jul 2006 03:04:03 -0700 Subject: [PATCH] reference rt-mutex-design in rtmutex.c In order to prevent Doc Rot, this patch adds a reference to the design document for rtmutex.c in rtmutex.c. So when someone needs to update or change the design of that file they will know that a document actually exists that explains the design (helping them change it), and hopefully that they will update the document if they too change the design. Signed-off-by: Steven Rostedt Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rtmutex.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index d2ef13b485e..3e13a1e5856 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -7,6 +7,8 @@ * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt * Copyright (C) 2006 Esben Nielsen + * + * See Documentation/rt-mutex-design.txt for details. */ #include #include -- cgit v1.2.3 From 51d8c5edd3b166fcc51aba84d78761d578400a7c Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Sun, 30 Jul 2006 03:04:14 -0700 Subject: [PATCH] timer: Fix tvec_bases initializer kernel/timer.c defines a (per-cpu) pointer to tvec_base_t, but initializes it using { &a_tvec_base_t }, which sparse warns about; change this to just &a_tvec_base_t. Signed-off-by: Josh Triplett Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 726caf9a0a6..b650f04888e 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -84,7 +84,7 @@ typedef struct tvec_t_base_s tvec_base_t; tvec_base_t boot_tvec_bases; EXPORT_SYMBOL(boot_tvec_bases); -static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = { &boot_tvec_bases }; +static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; static inline void set_running_timer(tvec_base_t *base, struct timer_list *timer) -- cgit v1.2.3