From 627371d73cdd04ed23fe098755b4f855138ad9e0 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 29 Jul 2006 05:16:20 +0200
Subject: [PATCH] pi-futex: robust-futex exit crash fix

Fix pi_state->list handling bugs: list handling mishap, locking error.
Plus add more debug checks and fix a few style issues i noticed while
debugging this.

(reported by Ulrich Drepper and Jakub Jelinek.)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/futex.c | 32 ++++++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index cf0c8e21d1a..f59003b1d8f 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -415,15 +415,15 @@ out_unlock:
  */
 void exit_pi_state_list(struct task_struct *curr)
 {
-	struct futex_hash_bucket *hb;
 	struct list_head *next, *head = &curr->pi_state_list;
 	struct futex_pi_state *pi_state;
+	struct futex_hash_bucket *hb;
 	union futex_key key;
 
 	/*
 	 * We are a ZOMBIE and nobody can enqueue itself on
 	 * pi_state_list anymore, but we have to be careful
-	 * versus waiters unqueueing themselfs
+	 * versus waiters unqueueing themselves:
 	 */
 	spin_lock_irq(&curr->pi_lock);
 	while (!list_empty(head)) {
@@ -431,21 +431,24 @@ void exit_pi_state_list(struct task_struct *curr)
 		next = head->next;
 		pi_state = list_entry(next, struct futex_pi_state, list);
 		key = pi_state->key;
+		hb = hash_futex(&key);
 		spin_unlock_irq(&curr->pi_lock);
 
-		hb = hash_futex(&key);
 		spin_lock(&hb->lock);
 
 		spin_lock_irq(&curr->pi_lock);
+		/*
+		 * We dropped the pi-lock, so re-check whether this
+		 * task still owns the PI-state:
+		 */
 		if (head->next != next) {
 			spin_unlock(&hb->lock);
 			continue;
 		}
 
-		list_del_init(&pi_state->list);
-
 		WARN_ON(pi_state->owner != curr);
-
+		WARN_ON(list_empty(&pi_state->list));
+		list_del_init(&pi_state->list);
 		pi_state->owner = NULL;
 		spin_unlock_irq(&curr->pi_lock);
 
@@ -470,7 +473,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
 	head = &hb->chain;
 
 	list_for_each_entry_safe(this, next, head, list) {
-		if (match_futex (&this->key, &me->key)) {
+		if (match_futex(&this->key, &me->key)) {
 			/*
 			 * Another waiter already exists - bump up
 			 * the refcount and return its pi_state:
@@ -482,6 +485,8 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
 			if (unlikely(!pi_state))
 				return -EINVAL;
 
+			WARN_ON(!atomic_read(&pi_state->refcount));
+
 			atomic_inc(&pi_state->refcount);
 			me->pi_state = pi_state;
 
@@ -510,6 +515,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
 	pi_state->key = me->key;
 
 	spin_lock_irq(&p->pi_lock);
+	WARN_ON(!list_empty(&pi_state->list));
 	list_add(&pi_state->list, &p->pi_state_list);
 	pi_state->owner = p;
 	spin_unlock_irq(&p->pi_lock);
@@ -584,9 +590,17 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
 	if (curval != uval)
 		return -EINVAL;
 
-	list_del_init(&pi_state->owner->pi_state_list);
+	spin_lock_irq(&pi_state->owner->pi_lock);
+	WARN_ON(list_empty(&pi_state->list));
+	list_del_init(&pi_state->list);
+	spin_unlock_irq(&pi_state->owner->pi_lock);
+
+	spin_lock_irq(&new_owner->pi_lock);
+	WARN_ON(!list_empty(&pi_state->list));
 	list_add(&pi_state->list, &new_owner->pi_state_list);
 	pi_state->owner = new_owner;
+	spin_unlock_irq(&new_owner->pi_lock);
+
 	rt_mutex_unlock(&pi_state->pi_mutex);
 
 	return 0;
@@ -1236,6 +1250,7 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
 		/* Owner died? */
 		if (q.pi_state->owner != NULL) {
 			spin_lock_irq(&q.pi_state->owner->pi_lock);
+			WARN_ON(list_empty(&q.pi_state->list));
 			list_del_init(&q.pi_state->list);
 			spin_unlock_irq(&q.pi_state->owner->pi_lock);
 		} else
@@ -1244,6 +1259,7 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
 		q.pi_state->owner = current;
 
 		spin_lock_irq(&current->pi_lock);
+		WARN_ON(!list_empty(&q.pi_state->list));
 		list_add(&q.pi_state->list, &current->pi_state_list);
 		spin_unlock_irq(&current->pi_lock);
 
-- 
cgit v1.2.3


From e3f2ddeac718c768fdac4b7fe69d465172f788a8 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 29 Jul 2006 05:17:57 +0200
Subject: [PATCH] pi-futex: robust-futex exit

Fix robust PI-futexes to be properly unlocked on unexpected exit.

For this to work the kernel has to know whether a futex is a PI or a
non-PI one, because the semantics are different.  Since the space in
relevant glibc data structures is extremely scarce, the best solution is
to encode the 'PI' information in bit 0 of the robust list pointer.
Existing (non-PI) glibc robust futexes have this bit always zero, so the
ABI is kept.  New glibc with PI-robust-futexes will set this bit.

Further fixes from Thomas Gleixner <tglx@linutronix.de>

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Ulrich Drepper <drepper@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/futex.c        | 91 +++++++++++++++++++++++++++++++++++----------------
 kernel/futex_compat.c | 34 ++++++++++++++-----
 2 files changed, 87 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index f59003b1d8f..dda2049692a 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -495,10 +495,13 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
 	}
 
 	/*
-	 * We are the first waiter - try to look up the real owner and
-	 * attach the new pi_state to it:
+	 * We are the first waiter - try to look up the real owner and attach
+	 * the new pi_state to it, but bail out when the owner died bit is set
+	 * and TID = 0:
 	 */
 	pid = uval & FUTEX_TID_MASK;
+	if (!pid && (uval & FUTEX_OWNER_DIED))
+		return -ESRCH;
 	p = futex_find_get_task(pid);
 	if (!p)
 		return -ESRCH;
@@ -579,16 +582,17 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
 	 * kept enabled while there is PI state around. We must also
 	 * preserve the owner died bit.)
 	 */
-	newval = (uval & FUTEX_OWNER_DIED) | FUTEX_WAITERS | new_owner->pid;
-
-	inc_preempt_count();
-	curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
-	dec_preempt_count();
+	if (!(uval & FUTEX_OWNER_DIED)) {
+		newval = FUTEX_WAITERS | new_owner->pid;
 
-	if (curval == -EFAULT)
-		return -EFAULT;
-	if (curval != uval)
-		return -EINVAL;
+		inc_preempt_count();
+		curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
+		dec_preempt_count();
+		if (curval == -EFAULT)
+			return -EFAULT;
+		if (curval != uval)
+			return -EINVAL;
+	}
 
 	spin_lock_irq(&pi_state->owner->pi_lock);
 	WARN_ON(list_empty(&pi_state->list));
@@ -1443,9 +1447,11 @@ retry_locked:
 	 * again. If it succeeds then we can return without waking
 	 * anyone else up:
 	 */
-	inc_preempt_count();
-	uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
-	dec_preempt_count();
+	if (!(uval & FUTEX_OWNER_DIED)) {
+		inc_preempt_count();
+		uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
+		dec_preempt_count();
+	}
 
 	if (unlikely(uval == -EFAULT))
 		goto pi_faulted;
@@ -1478,9 +1484,11 @@ retry_locked:
 	/*
 	 * No waiters - kernel unlocks the futex:
 	 */
-	ret = unlock_futex_pi(uaddr, uval);
-	if (ret == -EFAULT)
-		goto pi_faulted;
+	if (!(uval & FUTEX_OWNER_DIED)) {
+		ret = unlock_futex_pi(uaddr, uval);
+		if (ret == -EFAULT)
+			goto pi_faulted;
+	}
 
 out_unlock:
 	spin_unlock(&hb->lock);
@@ -1699,9 +1707,9 @@ err_unlock:
  * Process a futex-list entry, check whether it's owned by the
  * dying task, and do notification if so:
  */
-int handle_futex_death(u32 __user *uaddr, struct task_struct *curr)
+int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
 {
-	u32 uval, nval;
+	u32 uval, nval, mval;
 
 retry:
 	if (get_user(uval, uaddr))
@@ -1718,20 +1726,44 @@ retry:
 		 * thread-death.) The rest of the cleanup is done in
 		 * userspace.
 		 */
-		nval = futex_atomic_cmpxchg_inatomic(uaddr, uval,
-						     uval | FUTEX_OWNER_DIED);
+		mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
+		nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval);
+
 		if (nval == -EFAULT)
 			return -1;
 
 		if (nval != uval)
 			goto retry;
 
-		if (uval & FUTEX_WAITERS)
-			futex_wake(uaddr, 1);
+		/*
+		 * Wake robust non-PI futexes here. The wakeup of
+		 * PI futexes happens in exit_pi_state():
+		 */
+		if (!pi) {
+			if (uval & FUTEX_WAITERS)
+				futex_wake(uaddr, 1);
+		}
 	}
 	return 0;
 }
 
+/*
+ * Fetch a robust-list pointer. Bit 0 signals PI futexes:
+ */
+static inline int fetch_robust_entry(struct robust_list __user **entry,
+				     struct robust_list __user **head, int *pi)
+{
+	unsigned long uentry;
+
+	if (get_user(uentry, (unsigned long *)head))
+		return -EFAULT;
+
+	*entry = (void *)(uentry & ~1UL);
+	*pi = uentry & 1;
+
+	return 0;
+}
+
 /*
  * Walk curr->robust_list (very carefully, it's a userspace list!)
  * and mark any locks found there dead, and notify any waiters.
@@ -1742,14 +1774,14 @@ void exit_robust_list(struct task_struct *curr)
 {
 	struct robust_list_head __user *head = curr->robust_list;
 	struct robust_list __user *entry, *pending;
-	unsigned int limit = ROBUST_LIST_LIMIT;
+	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
 	unsigned long futex_offset;
 
 	/*
 	 * Fetch the list head (which was registered earlier, via
 	 * sys_set_robust_list()):
 	 */
-	if (get_user(entry, &head->list.next))
+	if (fetch_robust_entry(&entry, &head->list.next, &pi))
 		return;
 	/*
 	 * Fetch the relative futex offset:
@@ -1760,10 +1792,11 @@ void exit_robust_list(struct task_struct *curr)
 	 * Fetch any possibly pending lock-add first, and handle it
 	 * if it exists:
 	 */
-	if (get_user(pending, &head->list_op_pending))
+	if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
 		return;
+
 	if (pending)
-		handle_futex_death((void *)pending + futex_offset, curr);
+		handle_futex_death((void *)pending + futex_offset, curr, pip);
 
 	while (entry != &head->list) {
 		/*
@@ -1772,12 +1805,12 @@ void exit_robust_list(struct task_struct *curr)
 		 */
 		if (entry != pending)
 			if (handle_futex_death((void *)entry + futex_offset,
-						curr))
+						curr, pi))
 				return;
 		/*
 		 * Fetch the next entry in the list:
 		 */
-		if (get_user(entry, &entry->next))
+		if (fetch_robust_entry(&entry, &entry->next, &pi))
 			return;
 		/*
 		 * Avoid excessively long or circular lists:
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index d1d92b441fb..d1aab1a452c 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -12,6 +12,23 @@
 
 #include <asm/uaccess.h>
 
+
+/*
+ * Fetch a robust-list pointer. Bit 0 signals PI futexes:
+ */
+static inline int
+fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
+		   compat_uptr_t *head, int *pi)
+{
+	if (get_user(*uentry, head))
+		return -EFAULT;
+
+	*entry = compat_ptr((*uentry) & ~1);
+	*pi = (unsigned int)(*uentry) & 1;
+
+	return 0;
+}
+
 /*
  * Walk curr->robust_list (very carefully, it's a userspace list!)
  * and mark any locks found there dead, and notify any waiters.
@@ -22,17 +39,16 @@ void compat_exit_robust_list(struct task_struct *curr)
 {
 	struct compat_robust_list_head __user *head = curr->compat_robust_list;
 	struct robust_list __user *entry, *pending;
+	unsigned int limit = ROBUST_LIST_LIMIT, pi;
 	compat_uptr_t uentry, upending;
-	unsigned int limit = ROBUST_LIST_LIMIT;
 	compat_long_t futex_offset;
 
 	/*
 	 * Fetch the list head (which was registered earlier, via
 	 * sys_set_robust_list()):
 	 */
-	if (get_user(uentry, &head->list.next))
+	if (fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
 		return;
-	entry = compat_ptr(uentry);
 	/*
 	 * Fetch the relative futex offset:
 	 */
@@ -42,11 +58,11 @@ void compat_exit_robust_list(struct task_struct *curr)
 	 * Fetch any possibly pending lock-add first, and handle it
 	 * if it exists:
 	 */
-	if (get_user(upending, &head->list_op_pending))
+	if (fetch_robust_entry(&upending, &pending,
+			       &head->list_op_pending, &pi))
 		return;
-	pending = compat_ptr(upending);
 	if (upending)
-		handle_futex_death((void *)pending + futex_offset, curr);
+		handle_futex_death((void *)pending + futex_offset, curr, pi);
 
 	while (compat_ptr(uentry) != &head->list) {
 		/*
@@ -55,15 +71,15 @@ void compat_exit_robust_list(struct task_struct *curr)
 		 */
 		if (entry != pending)
 			if (handle_futex_death((void *)entry + futex_offset,
-						curr))
+						curr, pi))
 				return;
 
 		/*
 		 * Fetch the next entry in the list:
 		 */
-		if (get_user(uentry, (compat_uptr_t *)&entry->next))
+		if (fetch_robust_entry(&uentry, &entry,
+				       (compat_uptr_t *)&entry->next, &pi))
 			return;
-		entry = compat_ptr(uentry);
 		/*
 		 * Avoid excessively long or circular lists:
 		 */
-- 
cgit v1.2.3


From f712c0c7e1796f92e45e4de144e247816d974b8f Mon Sep 17 00:00:00 2001
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Date: Sun, 30 Jul 2006 03:02:59 -0700
Subject: [PATCH] sched: build_sched_domains() fix

Use the correct groups while initializing sched groups power for
allnodes_domain.  This fixes the crash observed while creating exclusive
cpusets.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Reported-and-tested-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index b44b9a43b0f..41a571806ce 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6494,7 +6494,12 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 	for (i = 0; i < MAX_NUMNODES; i++)
 		init_numa_sched_groups_power(sched_group_nodes[i]);
 
-	init_numa_sched_groups_power(sched_group_allnodes);
+	if (sched_group_allnodes) {
+		int group = cpu_to_allnodes_group(first_cpu(*cpu_map));
+		struct sched_group *sg = &sched_group_allnodes[group];
+
+		init_numa_sched_groups_power(sg);
+	}
 #endif
 
 	/* Attach the domains */
-- 
cgit v1.2.3


From 15a647eba94c3da27ccc666bea72e7cca06b2d19 Mon Sep 17 00:00:00 2001
From: David Brownell <david-b@pacbell.net>
Date: Sun, 30 Jul 2006 03:03:08 -0700
Subject: [PATCH] genirq: {en,dis}able_irq_wake() need refcounting too

IRQs need refcounting and a state flag to track whether the the IRQ should
be enabled or disabled as a "normal IRQ" source after a series of calls to
{en,dis}able_irq().  For shared IRQs, the IRQ must be enabled so long as at
least one driver needs it active.

Likewise, IRQs need the same support to track whether the IRQ should be
enabled or disabled as a "wakeup event" source after a series of calls to
{en,dis}able_irq_wake().  For shared IRQs, the IRQ must be enabled as a
wakeup source during sleep so long as at least one driver needs it.  But
right now they _don't have_ that refcounting ...  which means sharing a
wakeup-capable IRQ can't work correctly in some configurations.

This patch adds the refcount and flag mechanisms to set_irq_wake() -- which
is what {en,dis}able_irq_wake() call -- and minimal documentation of what
the irq wake mechanism does.

Drivers relying on the older (broken) "toggle" semantics will trigger a
warning; that'll be a handful of drivers on ARM systems.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Acked-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Russell King <rmk@arm.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/irq/manage.c | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 4e461438e48..92be519eff2 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -137,16 +137,40 @@ EXPORT_SYMBOL(enable_irq);
  *	@irq:	interrupt to control
  *	@on:	enable/disable power management wakeup
  *
- *	Enable/disable power management wakeup mode
+ *	Enable/disable power management wakeup mode, which is
+ *	disabled by default.  Enables and disables must match,
+ *	just as they match for non-wakeup mode support.
+ *
+ *	Wakeup mode lets this IRQ wake the system from sleep
+ *	states like "suspend to RAM".
  */
 int set_irq_wake(unsigned int irq, unsigned int on)
 {
 	struct irq_desc *desc = irq_desc + irq;
 	unsigned long flags;
 	int ret = -ENXIO;
+	int (*set_wake)(unsigned, unsigned) = desc->chip->set_wake;
 
+	/* wakeup-capable irqs can be shared between drivers that
+	 * don't need to have the same sleep mode behaviors.
+	 */
 	spin_lock_irqsave(&desc->lock, flags);
-	if (desc->chip->set_wake)
+	if (on) {
+		if (desc->wake_depth++ == 0)
+			desc->status |= IRQ_WAKEUP;
+		else
+			set_wake = NULL;
+	} else {
+		if (desc->wake_depth == 0) {
+			printk(KERN_WARNING "Unbalanced IRQ %d "
+					"wake disable\n", irq);
+			WARN_ON(1);
+		} else if (--desc->wake_depth == 0)
+			desc->status &= ~IRQ_WAKEUP;
+		else
+			set_wake = NULL;
+	}
+	if (set_wake)
 		ret = desc->chip->set_wake(irq, on);
 	spin_unlock_irqrestore(&desc->lock, flags);
 	return ret;
-- 
cgit v1.2.3


From 7d94dddd438bcba97db44f120da39bb001b5249f Mon Sep 17 00:00:00 2001
From: Shailabh Nagar <nagar@watson.ibm.com>
Date: Sun, 30 Jul 2006 03:03:10 -0700
Subject: [PATCH] make taskstats sending completely independent of delay
 accounting on/off status

Complete the separation of delay accounting and taskstats by ignoring the
return value of delay accounting functions that fill in parts of taskstats
before it is sent out (either in response to a command or as part of a task
exit).

Also make delayacct_add_tsk return silently when delay accounting is turned
off rather than treat it as an error.

Signed-off-by: Shailabh Nagar <nagar@watson.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/taskstats.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index f45179ce028..b4c737a1140 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -177,7 +177,7 @@ static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
 static int fill_pid(pid_t pid, struct task_struct *pidtsk,
 		struct taskstats *stats)
 {
-	int rc;
+	int rc = 0;
 	struct task_struct *tsk = pidtsk;
 
 	if (!pidtsk) {
@@ -196,12 +196,10 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk,
 	 * Each accounting subsystem adds calls to its functions to
 	 * fill in relevant parts of struct taskstsats as follows
 	 *
-	 *	rc = per-task-foo(stats, tsk);
-	 *	if (rc)
-	 *		goto err;
+	 *	per-task-foo(stats, tsk);
 	 */
 
-	rc = delayacct_add_tsk(stats, tsk);
+	delayacct_add_tsk(stats, tsk);
 	stats->version = TASKSTATS_VERSION;
 
 	/* Define err: label here if needed */
-- 
cgit v1.2.3


From d94a041519f3ab1ac023bf917619cd8c4a7d3c01 Mon Sep 17 00:00:00 2001
From: Shailabh Nagar <nagar@watson.ibm.com>
Date: Sun, 30 Jul 2006 03:03:11 -0700
Subject: [PATCH] taskstats: free skb, avoid returns in send_cpu_listeners

Add a missing freeing of skb in the case there are no listeners at all.
Also remove the returning of error values by the function as it is unused
by the sole caller.

Signed-off-by: Shailabh Nagar <nagar@watson.ibm.com>
Signed-off-by: Chandra Seetharaman <sekharan@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/taskstats.c | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index b4c737a1140..e7818765733 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -121,46 +121,45 @@ static int send_reply(struct sk_buff *skb, pid_t pid)
 /*
  * Send taskstats data in @skb to listeners registered for @cpu's exit data
  */
-static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
+static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
 {
 	struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
 	struct listener_list *listeners;
 	struct listener *s, *tmp;
 	struct sk_buff *skb_next, *skb_cur = skb;
 	void *reply = genlmsg_data(genlhdr);
-	int rc, ret, delcount = 0;
+	int rc, delcount = 0;
 
 	rc = genlmsg_end(skb, reply);
 	if (rc < 0) {
 		nlmsg_free(skb);
-		return rc;
+		return;
 	}
 
 	rc = 0;
 	listeners = &per_cpu(listener_array, cpu);
 	down_read(&listeners->sem);
-	list_for_each_entry_safe(s, tmp, &listeners->list, list) {
+	list_for_each_entry(s, &listeners->list, list) {
 		skb_next = NULL;
 		if (!list_is_last(&s->list, &listeners->list)) {
 			skb_next = skb_clone(skb_cur, GFP_KERNEL);
-			if (!skb_next) {
-				nlmsg_free(skb_cur);
-				rc = -ENOMEM;
+			if (!skb_next)
 				break;
-			}
 		}
-		ret = genlmsg_unicast(skb_cur, s->pid);
-		if (ret == -ECONNREFUSED) {
+		rc = genlmsg_unicast(skb_cur, s->pid);
+		if (rc == -ECONNREFUSED) {
 			s->valid = 0;
 			delcount++;
-			rc = ret;
 		}
 		skb_cur = skb_next;
 	}
 	up_read(&listeners->sem);
 
+	if (skb_cur)
+		nlmsg_free(skb_cur);
+
 	if (!delcount)
-		return rc;
+		return;
 
 	/* Delete invalidated entries */
 	down_write(&listeners->sem);
@@ -171,7 +170,6 @@ static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
 		}
 	}
 	up_write(&listeners->sem);
-	return rc;
 }
 
 static int fill_pid(pid_t pid, struct task_struct *pidtsk,
-- 
cgit v1.2.3


From 163ecdff060f2fa9e8f5238882fd0137493556a6 Mon Sep 17 00:00:00 2001
From: Shailabh Nagar <nagar@watson.ibm.com>
Date: Sun, 30 Jul 2006 03:03:11 -0700
Subject: [PATCH] delay accounting: temporarily enable by default

Enable delay accounting by default so that feature gets coverage testing
without requiring special measures.

Earlier, it was off by default and had to be enabled via a boot time param.
 This patch reverses the default behaviour to improve coverage testing.  It
can be removed late in the kernel development cycle if its believed users
shouldn't have to incur any cost if they don't want delay accounting.  Or
it can be retained forever if the utility of the stats is deemed common
enough to warrant keeping the feature on.

Signed-off-by: Shailabh Nagar <nagar@watson.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/delayacct.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index f05392d6426..57ca3730205 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -19,15 +19,15 @@
 #include <linux/sysctl.h>
 #include <linux/delayacct.h>
 
-int delayacct_on __read_mostly;	/* Delay accounting turned on/off */
+int delayacct_on __read_mostly = 1;	/* Delay accounting turned on/off */
 kmem_cache_t *delayacct_cache;
 
-static int __init delayacct_setup_enable(char *str)
+static int __init delayacct_setup_disable(char *str)
 {
-	delayacct_on = 1;
+	delayacct_on = 0;
 	return 1;
 }
-__setup("delayacct", delayacct_setup_enable);
+__setup("nodelayacct", delayacct_setup_disable);
 
 void delayacct_init(void)
 {
-- 
cgit v1.2.3


From a9ad965ea9a6d719daf333847a2ceb0e363994bd Mon Sep 17 00:00:00 2001
From: "bibo, mao" <bibo.mao@intel.com>
Date: Sun, 30 Jul 2006 03:03:26 -0700
Subject: [PATCH] IA64: kprobe invalidate icache of jump buffer

Kprobe inserts breakpoint instruction in probepoint and then jumps to
instruction slot when breakpoint is hit, the instruction slot icache must
be consistent with dcache.  Here is the patch which invalidates instruction
slot icache area.

Without this patch, in some machines there will be fault when executing
instruction slot where icache content is inconsistent with dcache.

Signed-off-by: bibo,mao <bibo.mao@intel.com>
Acked-by: "Luck, Tony" <tony.luck@intel.com>
Acked-by: Keshavamurthy Anil S <anil.s.keshavamurthy@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/kprobes.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 64aab081153..3f57dfdc8f9 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -393,6 +393,7 @@ static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
 static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
 {
 	copy_kprobe(p, ap);
+	flush_insn_slot(ap);
 	ap->addr = p->addr;
 	ap->pre_handler = aggr_pre_handler;
 	ap->fault_handler = aggr_fault_handler;
-- 
cgit v1.2.3


From 8c78f3075dab4be279e283f901f00e33ce44890a Mon Sep 17 00:00:00 2001
From: Chandra Seetharaman <sekharan@us.ibm.com>
Date: Sun, 30 Jul 2006 03:03:35 -0700
Subject: [PATCH] cpu hotplug: replace __devinit* with __cpuinit* for cpu
 notifications

Few of the callback functions and notifier blocks that are associated with cpu
notifications incorrectly have __devinit and __devinitdata.  They should be
__cpuinit and __cpuinitdata instead.

It makes no functional difference but wastes text area when CONFIG_HOTPLUG is
enabled and CONFIG_HOTPLUG_CPU is not.

This patch fixes all those instances.

Signed-off-by: Chandra Seetharaman <sekharan@us.ibm.com>
Cc: Ashok Raj <ashok.raj@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/hrtimer.c    | 4 ++--
 kernel/rcupdate.c   | 4 ++--
 kernel/softirq.c    | 4 ++--
 kernel/softlockup.c | 4 ++--
 kernel/timer.c      | 4 ++--
 5 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index d17766d40da..be989efc785 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -835,7 +835,7 @@ static void migrate_hrtimers(int cpu)
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
-static int __devinit hrtimer_cpu_notify(struct notifier_block *self,
+static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
 					unsigned long action, void *hcpu)
 {
 	long cpu = (long)hcpu;
@@ -859,7 +859,7 @@ static int __devinit hrtimer_cpu_notify(struct notifier_block *self,
 	return NOTIFY_OK;
 }
 
-static struct notifier_block __devinitdata hrtimers_nb = {
+static struct notifier_block __cpuinitdata hrtimers_nb = {
 	.notifier_call = hrtimer_cpu_notify,
 };
 
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 759805c9859..436ab35f6fa 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -548,7 +548,7 @@ static void __devinit rcu_online_cpu(int cpu)
 	tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL);
 }
 
-static int __devinit rcu_cpu_notify(struct notifier_block *self,
+static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
 				unsigned long action, void *hcpu)
 {
 	long cpu = (long)hcpu;
@@ -565,7 +565,7 @@ static int __devinit rcu_cpu_notify(struct notifier_block *self,
 	return NOTIFY_OK;
 }
 
-static struct notifier_block __devinitdata rcu_nb = {
+static struct notifier_block __cpuinitdata rcu_nb = {
 	.notifier_call	= rcu_cpu_notify,
 };
 
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 0f08a84ae30..aab880677ce 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -547,7 +547,7 @@ static void takeover_tasklets(unsigned int cpu)
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
-static int __devinit cpu_callback(struct notifier_block *nfb,
+static int __cpuinit cpu_callback(struct notifier_block *nfb,
 				  unsigned long action,
 				  void *hcpu)
 {
@@ -587,7 +587,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
 	return NOTIFY_OK;
 }
 
-static struct notifier_block __devinitdata cpu_nfb = {
+static struct notifier_block __cpuinitdata cpu_nfb = {
 	.notifier_call = cpu_callback
 };
 
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 6b76caa2298..03e6a2b0b78 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -104,7 +104,7 @@ static int watchdog(void * __bind_cpu)
 /*
  * Create/destroy watchdog threads as CPUs come and go:
  */
-static int __devinit
+static int __cpuinit
 cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 {
 	int hotcpu = (unsigned long)hcpu;
@@ -142,7 +142,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	return NOTIFY_OK;
 }
 
-static struct notifier_block __devinitdata cpu_nfb = {
+static struct notifier_block __cpuinitdata cpu_nfb = {
 	.notifier_call = cpu_callback
 };
 
diff --git a/kernel/timer.c b/kernel/timer.c
index 05809c2e2fd..ee319fc69f4 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1688,7 +1688,7 @@ static void __devinit migrate_timers(int cpu)
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
-static int __devinit timer_cpu_notify(struct notifier_block *self,
+static int __cpuinit timer_cpu_notify(struct notifier_block *self,
 				unsigned long action, void *hcpu)
 {
 	long cpu = (long)hcpu;
@@ -1708,7 +1708,7 @@ static int __devinit timer_cpu_notify(struct notifier_block *self,
 	return NOTIFY_OK;
 }
 
-static struct notifier_block __devinitdata timers_nb = {
+static struct notifier_block __cpuinitdata timers_nb = {
 	.notifier_call	= timer_cpu_notify,
 };
 
-- 
cgit v1.2.3


From 6ea24f9ad18c65cc179593b5cc2a88cdadf8cc0c Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Sun, 30 Jul 2006 03:03:38 -0700
Subject: [PATCH] fix bad macro param in timer.c

We have

#define INDEX(N) (base->timer_jiffies >> (TVR_BITS + N * TVN_BITS)) & TVN_MASK

and it's used via

	list = varray[i + 1]->vec + (INDEX(i + 1));

So, due to underparenthesisation, this INDEX(i+1) is now a ...  (TVR_BITS + i
+ 1 * TVN_BITS)) ...

So this bugfix changes behaviour.  It worked before by sheer luck:

  "If i was anything but 0, it was broken.  But this was only used by
   s390 and arm.  Since it was for the next interrupt, could that next
   interrupt be a problem (going into the second cascade)? But it was
   probably seldom wrong.  That is, this would fail if the next
   interrupt was in the second cascade, and was wrapped.  Which may
   never of happened.  Also if it did happen, it would have just missed
   the interrupt.

   If an interrupt was missed, and no one was there to miss it, was it
   really missed :-)"

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/timer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/timer.c b/kernel/timer.c
index ee319fc69f4..726caf9a0a6 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -408,7 +408,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index)
  * This function cascades all vectors and executes all expired timer
  * vectors.
  */
-#define INDEX(N) (base->timer_jiffies >> (TVR_BITS + N * TVN_BITS)) & TVN_MASK
+#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
 
 static inline void __run_timers(tvec_base_t *base)
 {
-- 
cgit v1.2.3


From 2d7d253548cffdce80f4e03664686e9ccb1b0ed7 Mon Sep 17 00:00:00 2001
From: Jim Houston <jim.houston@ccur.com>
Date: Sun, 30 Jul 2006 03:03:39 -0700
Subject: [PATCH] fix cond_resched() fix

In cond_resched_lock() it calls __resched_legal() before dropping the spin
lock.  __resched_legal() will always finds the preempt_count non-zero and
will prevent the call to __cond_resched().

The attached patch adds a parameter to __resched_legal() with the expected
preempt_count value.

Cc: Ingo Molnar <mingo@elte.hu>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 41a571806ce..de440b220b4 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4456,9 +4456,9 @@ asmlinkage long sys_sched_yield(void)
 	return 0;
 }
 
-static inline int __resched_legal(void)
+static inline int __resched_legal(int expected_preempt_count)
 {
-	if (unlikely(preempt_count()))
+	if (unlikely(preempt_count() != expected_preempt_count))
 		return 0;
 	if (unlikely(system_state != SYSTEM_RUNNING))
 		return 0;
@@ -4484,7 +4484,7 @@ static void __cond_resched(void)
 
 int __sched cond_resched(void)
 {
-	if (need_resched() && __resched_legal()) {
+	if (need_resched() && __resched_legal(0)) {
 		__cond_resched();
 		return 1;
 	}
@@ -4510,7 +4510,7 @@ int cond_resched_lock(spinlock_t *lock)
 		ret = 1;
 		spin_lock(lock);
 	}
-	if (need_resched() && __resched_legal()) {
+	if (need_resched() && __resched_legal(1)) {
 		spin_release(&lock->dep_map, 1, _THIS_IP_);
 		_raw_spin_unlock(lock);
 		preempt_enable_no_resched();
@@ -4526,7 +4526,7 @@ int __sched cond_resched_softirq(void)
 {
 	BUG_ON(!in_softirq());
 
-	if (need_resched() && __resched_legal()) {
+	if (need_resched() && __resched_legal(0)) {
 		raw_local_irq_disable();
 		_local_bh_enable();
 		raw_local_irq_enable();
-- 
cgit v1.2.3


From 0fcb78c22f06340cdba884d7381adb3a0148bbb6 Mon Sep 17 00:00:00 2001
From: Rolf Eike Beer <eike-kernel@sf-tec.de>
Date: Sun, 30 Jul 2006 03:03:42 -0700
Subject: [PATCH] Add DocBook documentation for workqueue functions

kernel/workqueue.c was omitted from generating kernel documentation.  This
adds a new section "Workqueues and Kevents" and adds documentation for some
of the functions.

Some functions in this file already had DocBook-style comments, now they
finally become visible.

Signed-off-by: Rolf Eike Beer <eike-kernel@sf-tec.de>
Cc: "Randy.Dunlap" <rdunlap@xenotime.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/workqueue.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 54 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index eebb1d83923..448e8f7b342 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -93,9 +93,12 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
 	spin_unlock_irqrestore(&cwq->lock, flags);
 }
 
-/*
- * Queue work on a workqueue. Return non-zero if it was successfully
- * added.
+/**
+ * queue_work - queue work on a workqueue
+ * @wq: workqueue to use
+ * @work: work to queue
+ *
+ * Returns non-zero if it was successfully added.
  *
  * We queue the work to the CPU it was submitted, but there is no
  * guarantee that it will be processed by that CPU.
@@ -128,6 +131,14 @@ static void delayed_work_timer_fn(unsigned long __data)
 	__queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
 }
 
+/**
+ * queue_delayed_work - queue work on a workqueue after delay
+ * @wq: workqueue to use
+ * @work: work to queue
+ * @delay: number of jiffies to wait before queueing
+ *
+ * Returns non-zero if it was successfully added.
+ */
 int fastcall queue_delayed_work(struct workqueue_struct *wq,
 			struct work_struct *work, unsigned long delay)
 {
@@ -150,6 +161,15 @@ int fastcall queue_delayed_work(struct workqueue_struct *wq,
 }
 EXPORT_SYMBOL_GPL(queue_delayed_work);
 
+/**
+ * queue_delayed_work_on - queue work on specific CPU after delay
+ * @cpu: CPU number to execute work on
+ * @wq: workqueue to use
+ * @work: work to queue
+ * @delay: number of jiffies to wait before queueing
+ *
+ * Returns non-zero if it was successfully added.
+ */
 int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 			struct work_struct *work, unsigned long delay)
 {
@@ -275,8 +295,9 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
 	}
 }
 
-/*
+/**
  * flush_workqueue - ensure that any scheduled work has run to completion.
+ * @wq: workqueue to flush
  *
  * Forces execution of the workqueue and blocks until its completion.
  * This is typically used in driver shutdown handlers.
@@ -400,6 +421,12 @@ static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu)
 		kthread_stop(p);
 }
 
+/**
+ * destroy_workqueue - safely terminate a workqueue
+ * @wq: target workqueue
+ *
+ * Safely destroy a workqueue. All work currently pending will be done first.
+ */
 void destroy_workqueue(struct workqueue_struct *wq)
 {
 	int cpu;
@@ -425,18 +452,41 @@ EXPORT_SYMBOL_GPL(destroy_workqueue);
 
 static struct workqueue_struct *keventd_wq;
 
+/**
+ * schedule_work - put work task in global workqueue
+ * @work: job to be done
+ *
+ * This puts a job in the kernel-global workqueue.
+ */
 int fastcall schedule_work(struct work_struct *work)
 {
 	return queue_work(keventd_wq, work);
 }
 EXPORT_SYMBOL(schedule_work);
 
+/**
+ * schedule_delayed_work - put work task in global workqueue after delay
+ * @work: job to be done
+ * @delay: number of jiffies to wait
+ *
+ * After waiting for a given time this puts a job in the kernel-global
+ * workqueue.
+ */
 int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay)
 {
 	return queue_delayed_work(keventd_wq, work, delay);
 }
 EXPORT_SYMBOL(schedule_delayed_work);
 
+/**
+ * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
+ * @cpu: cpu to use
+ * @work: job to be done
+ * @delay: number of jiffies to wait
+ *
+ * After waiting for a given time this puts a job in the kernel-global
+ * workqueue on the specified CPU.
+ */
 int schedule_delayed_work_on(int cpu,
 			struct work_struct *work, unsigned long delay)
 {
-- 
cgit v1.2.3


From b50f60ceeef2e38e529737c0260d9543939915ad Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Sun, 30 Jul 2006 03:03:52 -0700
Subject: [PATCH] pi-futex: missing pi_waiters plist initialization

Initialize init task's pi_waiters plist.  Otherwise cpu hotplug of cpu 0
might crash, since rt_mutex_getprio() accesses an uninitialized list head.

call chain which led to crash:

take_cpu_down
sched_idle_next
__setscheduler
rt_mutex_getprio

Using PLIST_HEAD_INIT in the INIT_TASK macro doesn't work unfortunately,
since the pi_waiters member is only conditionally present.

Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index de440b220b4..a2be2d05529 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6766,6 +6766,11 @@ void __init sched_init(void)
 	}
 
 	set_load_weight(&init_task);
+
+#ifdef CONFIG_RT_MUTEXES
+	plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
+#endif
+
 	/*
 	 * The boot idle thread does lazy MMU switching as well:
 	 */
-- 
cgit v1.2.3


From 3c829c367a1a52550378584a657768217971e587 Mon Sep 17 00:00:00 2001
From: Tim Chen <tim.c.chen@linux.intel.com>
Date: Sun, 30 Jul 2006 03:04:02 -0700
Subject: [PATCH] Reducing local_bh_enable/disable overhead in irqtrace

The recent changes from irqtrace feature has added overheads to
local_bh_disable and local_bh_enable that reduces UDP performance across
x86_64 and IA64, even though IA64 does not support the irqtrace feature.
Patch in question is

[PATCH]lockdep: irqtrace subsystem, core
http://www.kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=c
ommit;h=de30a2b355ea85350ca2f58f3b9bf4e5bc007986

Prior to this patch, local_bh_disable was a short macro.  Now it is a
function which calls __local_bh_disable with added irq flags save and
restore.  The irq flags save and restore were also added to
local_bh_enable, probably for injecting the trace irqs code.

This overhead is on the generic code path across all architectures.  On a
IA_64 test machine (Itanium-2 1.6 GHz) running a benchmark like netperf's
UDP streaming test, the added overhead results in a drop of 3% in
throughput, as udp_sendmsg calls the local_bh_enable/disable several times.

Other workloads that have heavy usages of local_bh_enable/disable could
also be affected.  The patch ideally should not have affected IA-64
performance as it does not have IRQ tracing support.  A significant portion
of the overhead is in the added irq flags save and restore, which I think
is not needed if IRQ tracing is unused.  A suggested patch is attached
below that recovers the lost performance.  However, the "ifdef"s in the
patch are a bit ugly.

Signed-off-by: Tim Chen <tim.c.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/softirq.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'kernel')

diff --git a/kernel/softirq.c b/kernel/softirq.c
index aab880677ce..3789ca98197 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -65,6 +65,7 @@ static inline void wakeup_softirqd(void)
  * This one is for softirq.c-internal use,
  * where hardirqs are disabled legitimately:
  */
+#ifdef CONFIG_TRACE_IRQFLAGS
 static void __local_bh_disable(unsigned long ip)
 {
 	unsigned long flags;
@@ -80,6 +81,13 @@ static void __local_bh_disable(unsigned long ip)
 		trace_softirqs_off(ip);
 	raw_local_irq_restore(flags);
 }
+#else /* !CONFIG_TRACE_IRQFLAGS */
+static inline void __local_bh_disable(unsigned long ip)
+{
+	add_preempt_count(SOFTIRQ_OFFSET);
+	barrier();
+}
+#endif /* CONFIG_TRACE_IRQFLAGS */
 
 void local_bh_disable(void)
 {
@@ -121,12 +129,16 @@ EXPORT_SYMBOL(_local_bh_enable);
 
 void local_bh_enable(void)
 {
+#ifdef CONFIG_TRACE_IRQFLAGS
 	unsigned long flags;
 
 	WARN_ON_ONCE(in_irq());
+#endif
 	WARN_ON_ONCE(irqs_disabled());
 
+#ifdef CONFIG_TRACE_IRQFLAGS
 	local_irq_save(flags);
+#endif
 	/*
 	 * Are softirqs going to be turned on now:
 	 */
@@ -142,18 +154,22 @@ void local_bh_enable(void)
 		do_softirq();
 
 	dec_preempt_count();
+#ifdef CONFIG_TRACE_IRQFLAGS
 	local_irq_restore(flags);
+#endif
 	preempt_check_resched();
 }
 EXPORT_SYMBOL(local_bh_enable);
 
 void local_bh_enable_ip(unsigned long ip)
 {
+#ifdef CONFIG_TRACE_IRQFLAGS
 	unsigned long flags;
 
 	WARN_ON_ONCE(in_irq());
 
 	local_irq_save(flags);
+#endif
 	/*
 	 * Are softirqs going to be turned on now:
 	 */
@@ -169,7 +185,9 @@ void local_bh_enable_ip(unsigned long ip)
 		do_softirq();
 
 	dec_preempt_count();
+#ifdef CONFIG_TRACE_IRQFLAGS
 	local_irq_restore(flags);
+#endif
 	preempt_check_resched();
 }
 EXPORT_SYMBOL(local_bh_enable_ip);
-- 
cgit v1.2.3


From d07fe82c24daab2360e2790f488bcffa7db74825 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Sun, 30 Jul 2006 03:04:03 -0700
Subject: [PATCH] reference rt-mutex-design in rtmutex.c

In order to prevent Doc Rot, this patch adds a reference to the design
document for rtmutex.c in rtmutex.c.  So when someone needs to update or
change the design of that file they will know that a document actually
exists that explains the design (helping them change it), and hopefully
that they will update the document if they too change the design.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/rtmutex.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index d2ef13b485e..3e13a1e5856 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -7,6 +7,8 @@
  *  Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
  *  Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
  *  Copyright (C) 2006 Esben Nielsen
+ *
+ *  See Documentation/rt-mutex-design.txt for details.
  */
 #include <linux/spinlock.h>
 #include <linux/module.h>
-- 
cgit v1.2.3


From 51d8c5edd3b166fcc51aba84d78761d578400a7c Mon Sep 17 00:00:00 2001
From: Josh Triplett <josht@us.ibm.com>
Date: Sun, 30 Jul 2006 03:04:14 -0700
Subject: [PATCH] timer: Fix tvec_bases initializer

kernel/timer.c defines a (per-cpu) pointer to tvec_base_t, but initializes
it using { &a_tvec_base_t }, which sparse warns about; change this to just
&a_tvec_base_t.

Signed-off-by: Josh Triplett <josh@freedesktop.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/timer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/timer.c b/kernel/timer.c
index 726caf9a0a6..b650f04888e 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -84,7 +84,7 @@ typedef struct tvec_t_base_s tvec_base_t;
 
 tvec_base_t boot_tvec_bases;
 EXPORT_SYMBOL(boot_tvec_bases);
-static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = { &boot_tvec_bases };
+static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases;
 
 static inline void set_running_timer(tvec_base_t *base,
 					struct timer_list *timer)
-- 
cgit v1.2.3