From ca94c442535a44d508c99a77e54f21a59f4fc462 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Mon, 15 Jun 2009 17:17:47 +0200
Subject: sched: Introduce SCHED_RESET_ON_FORK scheduling policy flag

This patch introduces a new flag SCHED_RESET_ON_FORK which can be passed
to the kernel via sched_setscheduler(), ORed in the policy parameter. If
set this will make sure that when the process forks a) the scheduling
priority is reset to DEFAULT_PRIO if it was higher and b) the scheduling
policy is reset to SCHED_NORMAL if it was either SCHED_FIFO or SCHED_RR.

Why have this?

Currently, if a process is real-time scheduled this will 'leak' to all
its child processes. For security reasons it is often (always?) a good
idea to make sure that if a process acquires RT scheduling this is
confined to this process and only this process. More specifically this
makes the per-process resource limit RLIMIT_RTTIME useful for security
purposes, because it makes it impossible to use a fork bomb to
circumvent the per-process RLIMIT_RTTIME accounting.

This feature is also useful for tools like 'renice' which can then
change the nice level of a process without having this spill to all its
child processes.

Why expose this via sched_setscheduler() and not other syscalls such as
prctl() or sched_setparam()?

prctl() does not take a pid parameter. Due to that it would be
impossible to modify this flag for other processes than the current one.

The struct passed to sched_setparam() can unfortunately not be extended
without breaking compatibility, since sched_setparam() lacks a size
parameter.

How to use this from userspace? In your RT program simply replace this:

  sched_setscheduler(pid, SCHED_FIFO, &param);

by this:

  sched_setscheduler(pid, SCHED_FIFO|SCHED_RESET_ON_FORK, &param);

Signed-off-by: Lennart Poettering <lennart@poettering.net>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090615152714.GA29092@tango.0pointer.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 49 ++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 40 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 8ec9d13140b..32e6ede8525 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2613,12 +2613,28 @@ void sched_fork(struct task_struct *p, int clone_flags)
 	set_task_cpu(p, cpu);
 
 	/*
-	 * Make sure we do not leak PI boosting priority to the child:
+	 * Revert to default priority/policy on fork if requested. Make sure we
+	 * do not leak PI boosting priority to the child.
 	 */
-	p->prio = current->normal_prio;
+	if (current->sched_reset_on_fork &&
+			(p->policy == SCHED_FIFO || p->policy == SCHED_RR))
+		p->policy = SCHED_NORMAL;
+
+	if (current->sched_reset_on_fork &&
+			(current->normal_prio < DEFAULT_PRIO))
+		p->prio = DEFAULT_PRIO;
+	else
+		p->prio = current->normal_prio;
+
 	if (!rt_prio(p->prio))
 		p->sched_class = &fair_sched_class;
 
+	/*
+	 * We don't need the reset flag anymore after the fork. It has
+	 * fulfilled its duty:
+	 */
+	p->sched_reset_on_fork = 0;
+
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 	if (likely(sched_info_on()))
 		memset(&p->sched_info, 0, sizeof(p->sched_info));
@@ -6094,17 +6110,25 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
 	unsigned long flags;
 	const struct sched_class *prev_class = p->sched_class;
 	struct rq *rq;
+	int reset_on_fork;
 
 	/* may grab non-irq protected spin_locks */
 	BUG_ON(in_interrupt());
 recheck:
 	/* double check policy once rq lock held */
-	if (policy < 0)
+	if (policy < 0) {
+		reset_on_fork = p->sched_reset_on_fork;
 		policy = oldpolicy = p->policy;
-	else if (policy != SCHED_FIFO && policy != SCHED_RR &&
-			policy != SCHED_NORMAL && policy != SCHED_BATCH &&
-			policy != SCHED_IDLE)
-		return -EINVAL;
+	} else {
+		reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
+		policy &= ~SCHED_RESET_ON_FORK;
+
+		if (policy != SCHED_FIFO && policy != SCHED_RR &&
+				policy != SCHED_NORMAL && policy != SCHED_BATCH &&
+				policy != SCHED_IDLE)
+			return -EINVAL;
+	}
+
 	/*
 	 * Valid priorities for SCHED_FIFO and SCHED_RR are
 	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
@@ -6148,6 +6172,10 @@ recheck:
 		/* can't change other user's priorities */
 		if (!check_same_owner(p))
 			return -EPERM;
+
+		/* Normal users shall not reset the sched_reset_on_fork flag */
+		if (p->sched_reset_on_fork && !reset_on_fork)
+			return -EPERM;
 	}
 
 	if (user) {
@@ -6191,6 +6219,8 @@ recheck:
 	if (running)
 		p->sched_class->put_prev_task(rq, p);
 
+	p->sched_reset_on_fork = reset_on_fork;
+
 	oldprio = p->prio;
 	__setscheduler(rq, p, policy, param->sched_priority);
 
@@ -6307,14 +6337,15 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
 	if (p) {
 		retval = security_task_getscheduler(p);
 		if (!retval)
-			retval = p->policy;
+			retval = p->policy
+				| (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
 	}
 	read_unlock(&tasklist_lock);
 	return retval;
 }
 
 /**
- * sys_sched_getscheduler - get the RT priority of a thread
+ * sys_sched_getparam - get the RT priority of a thread
  * @pid: the pid in question.
  * @param: structure containing the RT priority.
  */
-- 
cgit v1.2.3


From b9dc29e72fd3dc2a739ce8eafd958220d0745734 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Wed, 17 Jun 2009 10:46:01 +0200
Subject: sched: Clean up SCHED_RESET_ON_FORK

Make SCHED_RESET_ON_FORK sched_fork() bits a self-contained unlikely code path.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Acked-by: Lennart Poettering <mzxreary@0pointer.de>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <1245228361.18329.6.camel@marge.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 34 ++++++++++++++++++----------------
 1 file changed, 18 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 32e6ede8525..50e4e3d15e8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2613,28 +2613,30 @@ void sched_fork(struct task_struct *p, int clone_flags)
 	set_task_cpu(p, cpu);
 
 	/*
-	 * Revert to default priority/policy on fork if requested. Make sure we
-	 * do not leak PI boosting priority to the child.
+	 * Make sure we do not leak PI boosting priority to the child.
 	 */
-	if (current->sched_reset_on_fork &&
-			(p->policy == SCHED_FIFO || p->policy == SCHED_RR))
-		p->policy = SCHED_NORMAL;
+	p->prio = current->normal_prio;
 
-	if (current->sched_reset_on_fork &&
-			(current->normal_prio < DEFAULT_PRIO))
-		p->prio = DEFAULT_PRIO;
-	else
-		p->prio = current->normal_prio;
+	/*
+	 * Revert to default priority/policy on fork if requested.
+	 */
+	if (unlikely(p->sched_reset_on_fork)) {
+		if (p->policy == SCHED_FIFO || p->policy == SCHED_RR)
+			p->policy = SCHED_NORMAL;
+
+		if (p->normal_prio < DEFAULT_PRIO)
+			p->prio = DEFAULT_PRIO;
+
+		/*
+		 * We don't need the reset flag anymore after the fork. It has
+		 * fulfilled its duty:
+		 */
+		p->sched_reset_on_fork = 0;
+	}
 
 	if (!rt_prio(p->prio))
 		p->sched_class = &fair_sched_class;
 
-	/*
-	 * We don't need the reset flag anymore after the fork. It has
-	 * fulfilled its duty:
-	 */
-	p->sched_reset_on_fork = 0;
-
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 	if (likely(sched_info_on()))
 		memset(&p->sched_info, 0, sizeof(p->sched_info));
-- 
cgit v1.2.3


From 6c697bdf08a09ce461e305a22362973036e95db3 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Wed, 17 Jun 2009 10:48:02 +0200
Subject: sched: Add SCHED_RESET_ON_FORK functionality for nice < 0 tasks

Signed-off-by: Mike Galbraith <efault@gmx.de>
Acked-by: Lennart Poettering <mzxreary@0pointer.de>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <1245228482.27326.1.camel@marge.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 50e4e3d15e8..34f94240642 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2627,6 +2627,11 @@ void sched_fork(struct task_struct *p, int clone_flags)
 		if (p->normal_prio < DEFAULT_PRIO)
 			p->prio = DEFAULT_PRIO;
 
+		if (PRIO_TO_NICE(p->static_prio) < 0) {
+			p->static_prio = NICE_TO_PRIO(0);
+			set_load_weight(p);
+		}
+
 		/*
 		 * We don't need the reset flag anymore after the fork. It has
 		 * fulfilled its duty:
-- 
cgit v1.2.3


From c17ef45342cc033fdf7bdd5b28615e0090f8d2e7 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 23 Jun 2009 17:12:47 -0700
Subject: rcu: Remove Classic RCU

Remove Classic RCU, given that the combination of Tree RCU and
the proposed Bloatwatch RCU do everything that Classic RCU can
with fewer bugs.

Tree RCU has been default in x86 builds for almost six months,
and seems to be quite reliable, so there does not seem to be
much justification for keeping the Classic RCU code and config
complexity around anymore.

Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
Cc: akpm@linux-foundation.org
Cc: niv@us.ibm.com
Cc: dvhltc@us.ibm.com
Cc: dipankar@in.ibm.com
Cc: dhowells@redhat.com
Cc: lethal@linux-sh.org
Cc: kernel@wantstofly.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/Makefile     |   1 -
 kernel/rcuclassic.c | 807 ----------------------------------------------------
 2 files changed, 808 deletions(-)
 delete mode 100644 kernel/rcuclassic.c

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index 0a32cb21ec9..d6042bcc9e9 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -80,7 +80,6 @@ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
-obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
 obj-$(CONFIG_TREE_RCU) += rcutree.o
 obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o
 obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
deleted file mode 100644
index 0f2b0b31130..00000000000
--- a/kernel/rcuclassic.c
+++ /dev/null
@@ -1,807 +0,0 @@
-/*
- * Read-Copy Update mechanism for mutual exclusion
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
- * Copyright IBM Corporation, 2001
- *
- * Authors: Dipankar Sarma <dipankar@in.ibm.com>
- *	    Manfred Spraul <manfred@colorfullife.com>
- *
- * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
- * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
- * Papers:
- * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
- * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
- *
- * For detailed explanation of Read-Copy Update mechanism see -
- * 		Documentation/RCU
- *
- */
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/spinlock.h>
-#include <linux/smp.h>
-#include <linux/rcupdate.h>
-#include <linux/interrupt.h>
-#include <linux/sched.h>
-#include <asm/atomic.h>
-#include <linux/bitops.h>
-#include <linux/module.h>
-#include <linux/completion.h>
-#include <linux/moduleparam.h>
-#include <linux/percpu.h>
-#include <linux/notifier.h>
-#include <linux/cpu.h>
-#include <linux/mutex.h>
-#include <linux/time.h>
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-static struct lock_class_key rcu_lock_key;
-struct lockdep_map rcu_lock_map =
-	STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
-EXPORT_SYMBOL_GPL(rcu_lock_map);
-#endif
-
-
-/* Definition for rcupdate control block. */
-static struct rcu_ctrlblk rcu_ctrlblk = {
-	.cur = -300,
-	.completed = -300,
-	.pending = -300,
-	.lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
-	.cpumask = CPU_BITS_NONE,
-};
-
-static struct rcu_ctrlblk rcu_bh_ctrlblk = {
-	.cur = -300,
-	.completed = -300,
-	.pending = -300,
-	.lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
-	.cpumask = CPU_BITS_NONE,
-};
-
-static DEFINE_PER_CPU(struct rcu_data, rcu_data);
-static DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
-
-/*
- * Increment the quiescent state counter.
- * The counter is a bit degenerated: We do not need to know
- * how many quiescent states passed, just if there was at least
- * one since the start of the grace period. Thus just a flag.
- */
-void rcu_qsctr_inc(int cpu)
-{
-	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
-	rdp->passed_quiesc = 1;
-}
-
-void rcu_bh_qsctr_inc(int cpu)
-{
-	struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
-	rdp->passed_quiesc = 1;
-}
-
-static int blimit = 10;
-static int qhimark = 10000;
-static int qlowmark = 100;
-
-#ifdef CONFIG_SMP
-static void force_quiescent_state(struct rcu_data *rdp,
-			struct rcu_ctrlblk *rcp)
-{
-	int cpu;
-	unsigned long flags;
-
-	set_need_resched();
-	spin_lock_irqsave(&rcp->lock, flags);
-	if (unlikely(!rcp->signaled)) {
-		rcp->signaled = 1;
-		/*
-		 * Don't send IPI to itself. With irqs disabled,
-		 * rdp->cpu is the current cpu.
-		 *
-		 * cpu_online_mask is updated by the _cpu_down()
-		 * using __stop_machine(). Since we're in irqs disabled
-		 * section, __stop_machine() is not exectuting, hence
-		 * the cpu_online_mask is stable.
-		 *
-		 * However,  a cpu might have been offlined _just_ before
-		 * we disabled irqs while entering here.
-		 * And rcu subsystem might not yet have handled the CPU_DEAD
-		 * notification, leading to the offlined cpu's bit
-		 * being set in the rcp->cpumask.
-		 *
-		 * Hence cpumask = (rcp->cpumask & cpu_online_mask) to prevent
-		 * sending smp_reschedule() to an offlined CPU.
-		 */
-		for_each_cpu_and(cpu,
-				  to_cpumask(rcp->cpumask), cpu_online_mask) {
-			if (cpu != rdp->cpu)
-				smp_send_reschedule(cpu);
-		}
-	}
-	spin_unlock_irqrestore(&rcp->lock, flags);
-}
-#else
-static inline void force_quiescent_state(struct rcu_data *rdp,
-			struct rcu_ctrlblk *rcp)
-{
-	set_need_resched();
-}
-#endif
-
-static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp,
-		struct rcu_data *rdp)
-{
-	long batch;
-
-	head->next = NULL;
-	smp_mb(); /* Read of rcu->cur must happen after any change by caller. */
-
-	/*
-	 * Determine the batch number of this callback.
-	 *
-	 * Using ACCESS_ONCE to avoid the following error when gcc eliminates
-	 * local variable "batch" and emits codes like this:
-	 *	1) rdp->batch = rcp->cur + 1 # gets old value
-	 *	......
-	 *	2)rcu_batch_after(rcp->cur + 1, rdp->batch) # gets new value
-	 * then [*nxttail[0], *nxttail[1]) may contain callbacks
-	 * that batch# = rdp->batch, see the comment of struct rcu_data.
-	 */
-	batch = ACCESS_ONCE(rcp->cur) + 1;
-
-	if (rdp->nxtlist && rcu_batch_after(batch, rdp->batch)) {
-		/* process callbacks */
-		rdp->nxttail[0] = rdp->nxttail[1];
-		rdp->nxttail[1] = rdp->nxttail[2];
-		if (rcu_batch_after(batch - 1, rdp->batch))
-			rdp->nxttail[0] = rdp->nxttail[2];
-	}
-
-	rdp->batch = batch;
-	*rdp->nxttail[2] = head;
-	rdp->nxttail[2] = &head->next;
-
-	if (unlikely(++rdp->qlen > qhimark)) {
-		rdp->blimit = INT_MAX;
-		force_quiescent_state(rdp, &rcu_ctrlblk);
-	}
-}
-
-#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
-
-static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
-{
-	rcp->gp_start = jiffies;
-	rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
-}
-
-static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
-{
-	int cpu;
-	long delta;
-	unsigned long flags;
-
-	/* Only let one CPU complain about others per time interval. */
-
-	spin_lock_irqsave(&rcp->lock, flags);
-	delta = jiffies - rcp->jiffies_stall;
-	if (delta < 2 || rcp->cur != rcp->completed) {
-		spin_unlock_irqrestore(&rcp->lock, flags);
-		return;
-	}
-	rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
-	spin_unlock_irqrestore(&rcp->lock, flags);
-
-	/* OK, time to rat on our buddy... */
-
-	printk(KERN_ERR "INFO: RCU detected CPU stalls:");
-	for_each_possible_cpu(cpu) {
-		if (cpumask_test_cpu(cpu, to_cpumask(rcp->cpumask)))
-			printk(" %d", cpu);
-	}
-	printk(" (detected by %d, t=%ld jiffies)\n",
-	       smp_processor_id(), (long)(jiffies - rcp->gp_start));
-}
-
-static void print_cpu_stall(struct rcu_ctrlblk *rcp)
-{
-	unsigned long flags;
-
-	printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
-			smp_processor_id(), jiffies,
-			jiffies - rcp->gp_start);
-	dump_stack();
-	spin_lock_irqsave(&rcp->lock, flags);
-	if ((long)(jiffies - rcp->jiffies_stall) >= 0)
-		rcp->jiffies_stall =
-			jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
-	spin_unlock_irqrestore(&rcp->lock, flags);
-	set_need_resched();  /* kick ourselves to get things going. */
-}
-
-static void check_cpu_stall(struct rcu_ctrlblk *rcp)
-{
-	long delta;
-
-	delta = jiffies - rcp->jiffies_stall;
-	if (cpumask_test_cpu(smp_processor_id(), to_cpumask(rcp->cpumask)) &&
-		delta >= 0) {
-
-		/* We haven't checked in, so go dump stack. */
-		print_cpu_stall(rcp);
-
-	} else if (rcp->cur != rcp->completed && delta >= 2) {
-
-		/* They had two seconds to dump stack, so complain. */
-		print_other_cpu_stall(rcp);
-	}
-}
-
-#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
-
-static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
-{
-}
-
-static inline void check_cpu_stall(struct rcu_ctrlblk *rcp)
-{
-}
-
-#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
-
-/**
- * call_rcu - Queue an RCU callback for invocation after a grace period.
- * @head: structure to be used for queueing the RCU updates.
- * @func: actual update function to be invoked after the grace period
- *
- * The update function will be invoked some time after a full grace
- * period elapses, in other words after all currently executing RCU
- * read-side critical sections have completed.  RCU read-side critical
- * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
- * and may be nested.
- */
-void call_rcu(struct rcu_head *head,
-				void (*func)(struct rcu_head *rcu))
-{
-	unsigned long flags;
-
-	head->func = func;
-	local_irq_save(flags);
-	__call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data));
-	local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(call_rcu);
-
-/**
- * call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
- * @head: structure to be used for queueing the RCU updates.
- * @func: actual update function to be invoked after the grace period
- *
- * The update function will be invoked some time after a full grace
- * period elapses, in other words after all currently executing RCU
- * read-side critical sections have completed. call_rcu_bh() assumes
- * that the read-side critical sections end on completion of a softirq
- * handler. This means that read-side critical sections in process
- * context must not be interrupted by softirqs. This interface is to be
- * used when most of the read-side critical sections are in softirq context.
- * RCU read-side critical sections are delimited by rcu_read_lock() and
- * rcu_read_unlock(), * if in interrupt context or rcu_read_lock_bh()
- * and rcu_read_unlock_bh(), if in process context. These may be nested.
- */
-void call_rcu_bh(struct rcu_head *head,
-				void (*func)(struct rcu_head *rcu))
-{
-	unsigned long flags;
-
-	head->func = func;
-	local_irq_save(flags);
-	__call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
-	local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(call_rcu_bh);
-
-/*
- * Return the number of RCU batches processed thus far.  Useful
- * for debug and statistics.
- */
-long rcu_batches_completed(void)
-{
-	return rcu_ctrlblk.completed;
-}
-EXPORT_SYMBOL_GPL(rcu_batches_completed);
-
-/*
- * Return the number of RCU batches processed thus far.  Useful
- * for debug and statistics.
- */
-long rcu_batches_completed_bh(void)
-{
-	return rcu_bh_ctrlblk.completed;
-}
-EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
-
-/* Raises the softirq for processing rcu_callbacks. */
-static inline void raise_rcu_softirq(void)
-{
-	raise_softirq(RCU_SOFTIRQ);
-}
-
-/*
- * Invoke the completed RCU callbacks. They are expected to be in
- * a per-cpu list.
- */
-static void rcu_do_batch(struct rcu_data *rdp)
-{
-	unsigned long flags;
-	struct rcu_head *next, *list;
-	int count = 0;
-
-	list = rdp->donelist;
-	while (list) {
-		next = list->next;
-		prefetch(next);
-		list->func(list);
-		list = next;
-		if (++count >= rdp->blimit)
-			break;
-	}
-	rdp->donelist = list;
-
-	local_irq_save(flags);
-	rdp->qlen -= count;
-	local_irq_restore(flags);
-	if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
-		rdp->blimit = blimit;
-
-	if (!rdp->donelist)
-		rdp->donetail = &rdp->donelist;
-	else
-		raise_rcu_softirq();
-}
-
-/*
- * Grace period handling:
- * The grace period handling consists out of two steps:
- * - A new grace period is started.
- *   This is done by rcu_start_batch. The start is not broadcasted to
- *   all cpus, they must pick this up by comparing rcp->cur with
- *   rdp->quiescbatch. All cpus are recorded  in the
- *   rcu_ctrlblk.cpumask bitmap.
- * - All cpus must go through a quiescent state.
- *   Since the start of the grace period is not broadcasted, at least two
- *   calls to rcu_check_quiescent_state are required:
- *   The first call just notices that a new grace period is running. The
- *   following calls check if there was a quiescent state since the beginning
- *   of the grace period. If so, it updates rcu_ctrlblk.cpumask. If
- *   the bitmap is empty, then the grace period is completed.
- *   rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
- *   period (if necessary).
- */
-
-/*
- * Register a new batch of callbacks, and start it up if there is currently no
- * active batch and the batch to be registered has not already occurred.
- * Caller must hold rcu_ctrlblk.lock.
- */
-static void rcu_start_batch(struct rcu_ctrlblk *rcp)
-{
-	if (rcp->cur != rcp->pending &&
-			rcp->completed == rcp->cur) {
-		rcp->cur++;
-		record_gp_stall_check_time(rcp);
-
-		/*
-		 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
-		 * Barrier  Otherwise it can cause tickless idle CPUs to be
-		 * included in rcp->cpumask, which will extend graceperiods
-		 * unnecessarily.
-		 */
-		smp_mb();
-		cpumask_andnot(to_cpumask(rcp->cpumask),
-			       cpu_online_mask, nohz_cpu_mask);
-
-		rcp->signaled = 0;
-	}
-}
-
-/*
- * cpu went through a quiescent state since the beginning of the grace period.
- * Clear it from the cpu mask and complete the grace period if it was the last
- * cpu. Start another grace period if someone has further entries pending
- */
-static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
-{
-	cpumask_clear_cpu(cpu, to_cpumask(rcp->cpumask));
-	if (cpumask_empty(to_cpumask(rcp->cpumask))) {
-		/* batch completed ! */
-		rcp->completed = rcp->cur;
-		rcu_start_batch(rcp);
-	}
-}
-
-/*
- * Check if the cpu has gone through a quiescent state (say context
- * switch). If so and if it already hasn't done so in this RCU
- * quiescent cycle, then indicate that it has done so.
- */
-static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
-					struct rcu_data *rdp)
-{
-	unsigned long flags;
-
-	if (rdp->quiescbatch != rcp->cur) {
-		/* start new grace period: */
-		rdp->qs_pending = 1;
-		rdp->passed_quiesc = 0;
-		rdp->quiescbatch = rcp->cur;
-		return;
-	}
-
-	/* Grace period already completed for this cpu?
-	 * qs_pending is checked instead of the actual bitmap to avoid
-	 * cacheline trashing.
-	 */
-	if (!rdp->qs_pending)
-		return;
-
-	/*
-	 * Was there a quiescent state since the beginning of the grace
-	 * period? If no, then exit and wait for the next call.
-	 */
-	if (!rdp->passed_quiesc)
-		return;
-	rdp->qs_pending = 0;
-
-	spin_lock_irqsave(&rcp->lock, flags);
-	/*
-	 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
-	 * during cpu startup. Ignore the quiescent state.
-	 */
-	if (likely(rdp->quiescbatch == rcp->cur))
-		cpu_quiet(rdp->cpu, rcp);
-
-	spin_unlock_irqrestore(&rcp->lock, flags);
-}
-
-
-#ifdef CONFIG_HOTPLUG_CPU
-
-/* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing
- * locking requirements, the list it's pulling from has to belong to a cpu
- * which is dead and hence not processing interrupts.
- */
-static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
-				struct rcu_head **tail, long batch)
-{
-	unsigned long flags;
-
-	if (list) {
-		local_irq_save(flags);
-		this_rdp->batch = batch;
-		*this_rdp->nxttail[2] = list;
-		this_rdp->nxttail[2] = tail;
-		local_irq_restore(flags);
-	}
-}
-
-static void __rcu_offline_cpu(struct rcu_data *this_rdp,
-				struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
-{
-	unsigned long flags;
-
-	/*
-	 * if the cpu going offline owns the grace period
-	 * we can block indefinitely waiting for it, so flush
-	 * it here
-	 */
-	spin_lock_irqsave(&rcp->lock, flags);
-	if (rcp->cur != rcp->completed)
-		cpu_quiet(rdp->cpu, rcp);
-	rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1);
-	rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1);
-	spin_unlock(&rcp->lock);
-
-	this_rdp->qlen += rdp->qlen;
-	local_irq_restore(flags);
-}
-
-static void rcu_offline_cpu(int cpu)
-{
-	struct rcu_data *this_rdp = &get_cpu_var(rcu_data);
-	struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data);
-
-	__rcu_offline_cpu(this_rdp, &rcu_ctrlblk,
-					&per_cpu(rcu_data, cpu));
-	__rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk,
-					&per_cpu(rcu_bh_data, cpu));
-	put_cpu_var(rcu_data);
-	put_cpu_var(rcu_bh_data);
-}
-
-#else
-
-static void rcu_offline_cpu(int cpu)
-{
-}
-
-#endif
-
-/*
- * This does the RCU processing work from softirq context.
- */
-static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
-					struct rcu_data *rdp)
-{
-	unsigned long flags;
-	long completed_snap;
-
-	if (rdp->nxtlist) {
-		local_irq_save(flags);
-		completed_snap = ACCESS_ONCE(rcp->completed);
-
-		/*
-		 * move the other grace-period-completed entries to
-		 * [rdp->nxtlist, *rdp->nxttail[0]) temporarily
-		 */
-		if (!rcu_batch_before(completed_snap, rdp->batch))
-			rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2];
-		else if (!rcu_batch_before(completed_snap, rdp->batch - 1))
-			rdp->nxttail[0] = rdp->nxttail[1];
-
-		/*
-		 * the grace period for entries in
-		 * [rdp->nxtlist, *rdp->nxttail[0]) has completed and
-		 * move these entries to donelist
-		 */
-		if (rdp->nxttail[0] != &rdp->nxtlist) {
-			*rdp->donetail = rdp->nxtlist;
-			rdp->donetail = rdp->nxttail[0];
-			rdp->nxtlist = *rdp->nxttail[0];
-			*rdp->donetail = NULL;
-
-			if (rdp->nxttail[1] == rdp->nxttail[0])
-				rdp->nxttail[1] = &rdp->nxtlist;
-			if (rdp->nxttail[2] == rdp->nxttail[0])
-				rdp->nxttail[2] = &rdp->nxtlist;
-			rdp->nxttail[0] = &rdp->nxtlist;
-		}
-
-		local_irq_restore(flags);
-
-		if (rcu_batch_after(rdp->batch, rcp->pending)) {
-			unsigned long flags2;
-
-			/* and start it/schedule start if it's a new batch */
-			spin_lock_irqsave(&rcp->lock, flags2);
-			if (rcu_batch_after(rdp->batch, rcp->pending)) {
-				rcp->pending = rdp->batch;
-				rcu_start_batch(rcp);
-			}
-			spin_unlock_irqrestore(&rcp->lock, flags2);
-		}
-	}
-
-	rcu_check_quiescent_state(rcp, rdp);
-	if (rdp->donelist)
-		rcu_do_batch(rdp);
-}
-
-static void rcu_process_callbacks(struct softirq_action *unused)
-{
-	/*
-	 * Memory references from any prior RCU read-side critical sections
-	 * executed by the interrupted code must be see before any RCU
-	 * grace-period manupulations below.
-	 */
-
-	smp_mb(); /* See above block comment. */
-
-	__rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
-	__rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
-
-	/*
-	 * Memory references from any later RCU read-side critical sections
-	 * executed by the interrupted code must be see after any RCU
-	 * grace-period manupulations above.
-	 */
-
-	smp_mb(); /* See above block comment. */
-}
-
-static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
-{
-	/* Check for CPU stalls, if enabled. */
-	check_cpu_stall(rcp);
-
-	if (rdp->nxtlist) {
-		long completed_snap = ACCESS_ONCE(rcp->completed);
-
-		/*
-		 * This cpu has pending rcu entries and the grace period
-		 * for them has completed.
-		 */
-		if (!rcu_batch_before(completed_snap, rdp->batch))
-			return 1;
-		if (!rcu_batch_before(completed_snap, rdp->batch - 1) &&
-				rdp->nxttail[0] != rdp->nxttail[1])
-			return 1;
-		if (rdp->nxttail[0] != &rdp->nxtlist)
-			return 1;
-
-		/*
-		 * This cpu has pending rcu entries and the new batch
-		 * for then hasn't been started nor scheduled start
-		 */
-		if (rcu_batch_after(rdp->batch, rcp->pending))
-			return 1;
-	}
-
-	/* This cpu has finished callbacks to invoke */
-	if (rdp->donelist)
-		return 1;
-
-	/* The rcu core waits for a quiescent state from the cpu */
-	if (rdp->quiescbatch != rcp->cur || rdp->qs_pending)
-		return 1;
-
-	/* nothing to do */
-	return 0;
-}
-
-/*
- * Check to see if there is any immediate RCU-related work to be done
- * by the current CPU, returning 1 if so.  This function is part of the
- * RCU implementation; it is -not- an exported member of the RCU API.
- */
-int rcu_pending(int cpu)
-{
-	return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
-		__rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
-}
-
-/*
- * Check to see if any future RCU-related work will need to be done
- * by the current CPU, even if none need be done immediately, returning
- * 1 if so.  This function is part of the RCU implementation; it is -not-
- * an exported member of the RCU API.
- */
-int rcu_needs_cpu(int cpu)
-{
-	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
-	struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
-
-	return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu);
-}
-
-/*
- * Top-level function driving RCU grace-period detection, normally
- * invoked from the scheduler-clock interrupt.  This function simply
- * increments counters that are read only from softirq by this same
- * CPU, so there are no memory barriers required.
- */
-void rcu_check_callbacks(int cpu, int user)
-{
-	if (user ||
-	    (idle_cpu(cpu) && rcu_scheduler_active &&
-	     !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
-
-		/*
-		 * Get here if this CPU took its interrupt from user
-		 * mode or from the idle loop, and if this is not a
-		 * nested interrupt.  In this case, the CPU is in
-		 * a quiescent state, so count it.
-		 *
-		 * Also do a memory barrier.  This is needed to handle
-		 * the case where writes from a preempt-disable section
-		 * of code get reordered into schedule() by this CPU's
-		 * write buffer.  The memory barrier makes sure that
-		 * the rcu_qsctr_inc() and rcu_bh_qsctr_inc() are see
-		 * by other CPUs to happen after any such write.
-		 */
-
-		smp_mb();  /* See above block comment. */
-		rcu_qsctr_inc(cpu);
-		rcu_bh_qsctr_inc(cpu);
-
-	} else if (!in_softirq()) {
-
-		/*
-		 * Get here if this CPU did not take its interrupt from
-		 * softirq, in other words, if it is not interrupting
-		 * a rcu_bh read-side critical section.  This is an _bh
-		 * critical section, so count it.  The memory barrier
-		 * is needed for the same reason as is the above one.
-		 */
-
-		smp_mb();  /* See above block comment. */
-		rcu_bh_qsctr_inc(cpu);
-	}
-	raise_rcu_softirq();
-}
-
-static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
-						struct rcu_data *rdp)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&rcp->lock, flags);
-	memset(rdp, 0, sizeof(*rdp));
-	rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist;
-	rdp->donetail = &rdp->donelist;
-	rdp->quiescbatch = rcp->completed;
-	rdp->qs_pending = 0;
-	rdp->cpu = cpu;
-	rdp->blimit = blimit;
-	spin_unlock_irqrestore(&rcp->lock, flags);
-}
-
-static void __cpuinit rcu_online_cpu(int cpu)
-{
-	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
-	struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu);
-
-	rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
-	rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
-	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
-}
-
-static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
-				unsigned long action, void *hcpu)
-{
-	long cpu = (long)hcpu;
-
-	switch (action) {
-	case CPU_UP_PREPARE:
-	case CPU_UP_PREPARE_FROZEN:
-		rcu_online_cpu(cpu);
-		break;
-	case CPU_DEAD:
-	case CPU_DEAD_FROZEN:
-		rcu_offline_cpu(cpu);
-		break;
-	default:
-		break;
-	}
-	return NOTIFY_OK;
-}
-
-static struct notifier_block __cpuinitdata rcu_nb = {
-	.notifier_call	= rcu_cpu_notify,
-};
-
-/*
- * Initializes rcu mechanism.  Assumed to be called early.
- * That is before local timer(SMP) or jiffie timer (uniproc) is setup.
- * Note that rcu_qsctr and friends are implicitly
- * initialized due to the choice of ``0'' for RCU_CTR_INVALID.
- */
-void __init __rcu_init(void)
-{
-#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
-	printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
-#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
-	rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
-			(void *)(long)smp_processor_id());
-	/* Register notifier for non-boot CPUs */
-	register_cpu_notifier(&rcu_nb);
-}
-
-module_param(blimit, int, 0);
-module_param(qhimark, int, 0);
-module_param(qlowmark, int, 0);
-- 
cgit v1.2.3


From 9e48858f7d36a6a3849f1d1b40c3bf5624b4ee7c Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 7 May 2009 19:26:19 +1000
Subject: security: rename ptrace_may_access => ptrace_access_check

The ->ptrace_may_access() methods are named confusingly - the real
ptrace_may_access() returns a bool, while these security checks have
a retval convention.

Rename it to ptrace_access_check, to reduce the confusion factor.

[ Impact: cleanup, no code changed ]

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/ptrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 61c78b2c07b..9a4184e04f2 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -152,7 +152,7 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
 	if (!dumpable && !capable(CAP_SYS_PTRACE))
 		return -EPERM;
 
-	return security_ptrace_may_access(task, mode);
+	return security_ptrace_access_check(task, mode);
 }
 
 bool ptrace_may_access(struct task_struct *task, unsigned int mode)
-- 
cgit v1.2.3


From 54d35f29f49224d86b994acb6e5969b9ba09022d Mon Sep 17 00:00:00 2001
From: Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
Date: Mon, 29 Jun 2009 14:44:57 +0900
Subject: sched: Hide runqueues from direct reference at source code level for
 __raw_get_cpu_var()

Hide __raw_get_cpu_var() as well - thus all the direct
references to runqueues will abstracted out.

Signed-off-by: Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
LKML-Reference: <20090629.144457.886429910353660979.mitake@dcl.info.waseda.ac.jp>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 168b2680ae2..ebc5151f88c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -692,6 +692,7 @@ static inline int cpu_of(struct rq *rq)
 #define this_rq()		(&__get_cpu_var(runqueues))
 #define task_rq(p)		cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
+#define raw_rq()		(&__raw_get_cpu_var(runqueues))
 
 inline void update_rq_clock(struct rq *rq)
 {
@@ -6669,7 +6670,7 @@ EXPORT_SYMBOL(yield);
  */
 void __sched io_schedule(void)
 {
-	struct rq *rq = &__raw_get_cpu_var(runqueues);
+	struct rq *rq = raw_rq();
 
 	delayacct_blkio_start();
 	atomic_inc(&rq->nr_iowait);
@@ -6681,7 +6682,7 @@ EXPORT_SYMBOL(io_schedule);
 
 long __sched io_schedule_timeout(long timeout)
 {
-	struct rq *rq = &__raw_get_cpu_var(runqueues);
+	struct rq *rq = raw_rq();
 	long ret;
 
 	delayacct_blkio_start();
-- 
cgit v1.2.3


From 03b042bf1dc14a268a3d65d38b4ec2a4261e8477 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 25 Jun 2009 09:08:16 -0700
Subject: rcu: Add synchronize_sched_expedited() primitive

This adds the synchronize_sched_expedited() primitive that
implements the "big hammer" expedited RCU grace periods.

This primitive is placed in kernel/sched.c rather than
kernel/rcupdate.c due to its need to interact closely with the
migration_thread() kthread.

The idea is to wake up this kthread with req->task set to NULL,
in response to which the kthread reports the quiescent state
resulting from the kthread having been scheduled.

Because this patch needs to fallback to the slow versions of
the primitives in response to some races with CPU onlining and
offlining, a new synchronize_rcu_bh() primitive is added as
well.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: akpm@linux-foundation.org
Cc: torvalds@linux-foundation.org
Cc: davem@davemloft.net
Cc: dada1@cosmosbay.com
Cc: zbr@ioremap.net
Cc: jeff.chua.linux@gmail.com
Cc: paulus@samba.org
Cc: laijs@cn.fujitsu.com
Cc: jengelh@medozas.de
Cc: r000n@r000n.net
Cc: benh@kernel.crashing.org
Cc: mathieu.desnoyers@polymtl.ca
LKML-Reference: <12459460982947-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcupdate.c |  25 +++++++++++
 kernel/sched.c    | 129 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 152 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a967c9feb90..eae29c25fb1 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -98,6 +98,30 @@ void synchronize_rcu(void)
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu);
 
+/**
+ * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
+ *
+ * Control will return to the caller some time after a full rcu_bh grace
+ * period has elapsed, in other words after all currently executing rcu_bh
+ * read-side critical sections have completed.  RCU read-side critical
+ * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
+ * and may be nested.
+ */
+void synchronize_rcu_bh(void)
+{
+	struct rcu_synchronize rcu;
+
+	if (rcu_blocking_is_gp())
+		return;
+
+	init_completion(&rcu.completion);
+	/* Will wake me after RCU finished. */
+	call_rcu_bh(&rcu.head, wakeme_after_rcu);
+	/* Wait for it. */
+	wait_for_completion(&rcu.completion);
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
+
 static void rcu_barrier_callback(struct rcu_head *notused)
 {
 	if (atomic_dec_and_test(&rcu_barrier_cpu_count))
@@ -129,6 +153,7 @@ static void rcu_barrier_func(void *type)
 static inline void wait_migrated_callbacks(void)
 {
 	wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count));
+	smp_mb(); /* In case we didn't sleep. */
 }
 
 /*
diff --git a/kernel/sched.c b/kernel/sched.c
index 7c9098d186e..9ae80bec1c1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7024,6 +7024,11 @@ fail:
 	return ret;
 }
 
+#define RCU_MIGRATION_IDLE	0
+#define RCU_MIGRATION_NEED_QS	1
+#define RCU_MIGRATION_GOT_QS	2
+#define RCU_MIGRATION_MUST_SYNC	3
+
 /*
  * migration_thread - this is a highprio system thread that performs
  * thread migration by bumping thread off CPU then 'pushing' onto
@@ -7031,6 +7036,7 @@ fail:
  */
 static int migration_thread(void *data)
 {
+	int badcpu;
 	int cpu = (long)data;
 	struct rq *rq;
 
@@ -7065,8 +7071,17 @@ static int migration_thread(void *data)
 		req = list_entry(head->next, struct migration_req, list);
 		list_del_init(head->next);
 
-		spin_unlock(&rq->lock);
-		__migrate_task(req->task, cpu, req->dest_cpu);
+		if (req->task != NULL) {
+			spin_unlock(&rq->lock);
+			__migrate_task(req->task, cpu, req->dest_cpu);
+		} else if (likely(cpu == (badcpu = smp_processor_id()))) {
+			req->dest_cpu = RCU_MIGRATION_GOT_QS;
+			spin_unlock(&rq->lock);
+		} else {
+			req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
+			spin_unlock(&rq->lock);
+			WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
+		}
 		local_irq_enable();
 
 		complete(&req->done);
@@ -10554,3 +10569,113 @@ struct cgroup_subsys cpuacct_subsys = {
 	.subsys_id = cpuacct_subsys_id,
 };
 #endif	/* CONFIG_CGROUP_CPUACCT */
+
+#ifndef CONFIG_SMP
+
+int rcu_expedited_torture_stats(char *page)
+{
+	return 0;
+}
+EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
+
+void synchronize_sched_expedited(void)
+{
+}
+EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
+
+#else /* #ifndef CONFIG_SMP */
+
+static DEFINE_PER_CPU(struct migration_req, rcu_migration_req);
+static DEFINE_MUTEX(rcu_sched_expedited_mutex);
+
+#define RCU_EXPEDITED_STATE_POST -2
+#define RCU_EXPEDITED_STATE_IDLE -1
+
+static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
+
+int rcu_expedited_torture_stats(char *page)
+{
+	int cnt = 0;
+	int cpu;
+
+	cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state);
+	for_each_online_cpu(cpu) {
+		 cnt += sprintf(&page[cnt], " %d:%d",
+				cpu, per_cpu(rcu_migration_req, cpu).dest_cpu);
+	}
+	cnt += sprintf(&page[cnt], "\n");
+	return cnt;
+}
+EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
+
+static long synchronize_sched_expedited_count;
+
+/*
+ * Wait for an rcu-sched grace period to elapse, but use "big hammer"
+ * approach to force grace period to end quickly.  This consumes
+ * significant time on all CPUs, and is thus not recommended for
+ * any sort of common-case code.
+ *
+ * Note that it is illegal to call this function while holding any
+ * lock that is acquired by a CPU-hotplug notifier.  Failing to
+ * observe this restriction will result in deadlock.
+ */
+void synchronize_sched_expedited(void)
+{
+	int cpu;
+	unsigned long flags;
+	bool need_full_sync = 0;
+	struct rq *rq;
+	struct migration_req *req;
+	long snap;
+	int trycount = 0;
+
+	smp_mb();  /* ensure prior mod happens before capturing snap. */
+	snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1;
+	get_online_cpus();
+	while (!mutex_trylock(&rcu_sched_expedited_mutex)) {
+		put_online_cpus();
+		if (trycount++ < 10)
+			udelay(trycount * num_online_cpus());
+		else {
+			synchronize_sched();
+			return;
+		}
+		if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) {
+			smp_mb(); /* ensure test happens before caller kfree */
+			return;
+		}
+		get_online_cpus();
+	}
+	rcu_expedited_state = RCU_EXPEDITED_STATE_POST;
+	for_each_online_cpu(cpu) {
+		rq = cpu_rq(cpu);
+		req = &per_cpu(rcu_migration_req, cpu);
+		init_completion(&req->done);
+		req->task = NULL;
+		req->dest_cpu = RCU_MIGRATION_NEED_QS;
+		spin_lock_irqsave(&rq->lock, flags);
+		list_add(&req->list, &rq->migration_queue);
+		spin_unlock_irqrestore(&rq->lock, flags);
+		wake_up_process(rq->migration_thread);
+	}
+	for_each_online_cpu(cpu) {
+		rcu_expedited_state = cpu;
+		req = &per_cpu(rcu_migration_req, cpu);
+		rq = cpu_rq(cpu);
+		wait_for_completion(&req->done);
+		spin_lock_irqsave(&rq->lock, flags);
+		if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
+			need_full_sync = 1;
+		req->dest_cpu = RCU_MIGRATION_IDLE;
+		spin_unlock_irqrestore(&rq->lock, flags);
+	}
+	rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
+	mutex_unlock(&rcu_sched_expedited_mutex);
+	put_online_cpus();
+	if (need_full_sync)
+		synchronize_sched();
+}
+EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
+
+#endif /* #else #ifndef CONFIG_SMP */
-- 
cgit v1.2.3


From 0acc512cb1a29636df5e982c7d845edafe77c2d0 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 25 Jun 2009 09:08:17 -0700
Subject: rcu: Add synchronize_sched_expedited() torture tests

This patch adds rcutorture tests for the new
synchronize_sched_expedited() primitive, and also does some
whitespace cleanups in kernel/rcutorture.c as well.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: akpm@linux-foundation.org
Cc: torvalds@linux-foundation.org
Cc: davem@davemloft.net
Cc: dada1@cosmosbay.com
Cc: zbr@ioremap.net
Cc: jeff.chua.linux@gmail.com
Cc: paulus@samba.org
Cc: laijs@cn.fujitsu.com
Cc: jengelh@medozas.de
Cc: r000n@r000n.net
Cc: benh@kernel.crashing.org
Cc: mathieu.desnoyers@polymtl.ca
LKML-Reference: <12459460981342-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutorture.c | 202 ++++++++++++++++++++++++++++------------------------
 1 file changed, 110 insertions(+), 92 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 9b4a975a4b4..b33db539a8a 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -257,14 +257,14 @@ struct rcu_torture_ops {
 	void (*init)(void);
 	void (*cleanup)(void);
 	int (*readlock)(void);
-	void (*readdelay)(struct rcu_random_state *rrsp);
+	void (*read_delay)(struct rcu_random_state *rrsp);
 	void (*readunlock)(int idx);
 	int (*completed)(void);
-	void (*deferredfree)(struct rcu_torture *p);
+	void (*deferred_free)(struct rcu_torture *p);
 	void (*sync)(void);
 	void (*cb_barrier)(void);
 	int (*stats)(char *page);
-	int irqcapable;
+	int irq_capable;
 	char *name;
 };
 static struct rcu_torture_ops *cur_ops = NULL;
@@ -320,7 +320,7 @@ rcu_torture_cb(struct rcu_head *p)
 		rp->rtort_mbtest = 0;
 		rcu_torture_free(rp);
 	} else
-		cur_ops->deferredfree(rp);
+		cur_ops->deferred_free(rp);
 }
 
 static void rcu_torture_deferred_free(struct rcu_torture *p)
@@ -329,18 +329,18 @@ static void rcu_torture_deferred_free(struct rcu_torture *p)
 }
 
 static struct rcu_torture_ops rcu_ops = {
-	.init = NULL,
-	.cleanup = NULL,
-	.readlock = rcu_torture_read_lock,
-	.readdelay = rcu_read_delay,
-	.readunlock = rcu_torture_read_unlock,
-	.completed = rcu_torture_completed,
-	.deferredfree = rcu_torture_deferred_free,
-	.sync = synchronize_rcu,
-	.cb_barrier = rcu_barrier,
-	.stats = NULL,
-	.irqcapable = 1,
-	.name = "rcu"
+	.init		= NULL,
+	.cleanup	= NULL,
+	.readlock	= rcu_torture_read_lock,
+	.read_delay	= rcu_read_delay,
+	.readunlock	= rcu_torture_read_unlock,
+	.completed	= rcu_torture_completed,
+	.deferred_free	= rcu_torture_deferred_free,
+	.sync		= synchronize_rcu,
+	.cb_barrier	= rcu_barrier,
+	.stats		= NULL,
+	.irq_capable 	= 1,
+	.name 		= "rcu"
 };
 
 static void rcu_sync_torture_deferred_free(struct rcu_torture *p)
@@ -370,18 +370,18 @@ static void rcu_sync_torture_init(void)
 }
 
 static struct rcu_torture_ops rcu_sync_ops = {
-	.init = rcu_sync_torture_init,
-	.cleanup = NULL,
-	.readlock = rcu_torture_read_lock,
-	.readdelay = rcu_read_delay,
-	.readunlock = rcu_torture_read_unlock,
-	.completed = rcu_torture_completed,
-	.deferredfree = rcu_sync_torture_deferred_free,
-	.sync = synchronize_rcu,
-	.cb_barrier = NULL,
-	.stats = NULL,
-	.irqcapable = 1,
-	.name = "rcu_sync"
+	.init		= rcu_sync_torture_init,
+	.cleanup	= NULL,
+	.readlock	= rcu_torture_read_lock,
+	.read_delay	= rcu_read_delay,
+	.readunlock	= rcu_torture_read_unlock,
+	.completed	= rcu_torture_completed,
+	.deferred_free	= rcu_sync_torture_deferred_free,
+	.sync		= synchronize_rcu,
+	.cb_barrier	= NULL,
+	.stats		= NULL,
+	.irq_capable	= 1,
+	.name		= "rcu_sync"
 };
 
 /*
@@ -432,33 +432,33 @@ static void rcu_bh_torture_synchronize(void)
 }
 
 static struct rcu_torture_ops rcu_bh_ops = {
-	.init = NULL,
-	.cleanup = NULL,
-	.readlock = rcu_bh_torture_read_lock,
-	.readdelay = rcu_read_delay,  /* just reuse rcu's version. */
-	.readunlock = rcu_bh_torture_read_unlock,
-	.completed = rcu_bh_torture_completed,
-	.deferredfree = rcu_bh_torture_deferred_free,
-	.sync = rcu_bh_torture_synchronize,
-	.cb_barrier = rcu_barrier_bh,
-	.stats = NULL,
-	.irqcapable = 1,
-	.name = "rcu_bh"
+	.init		= NULL,
+	.cleanup	= NULL,
+	.readlock	= rcu_bh_torture_read_lock,
+	.read_delay	= rcu_read_delay,  /* just reuse rcu's version. */
+	.readunlock	= rcu_bh_torture_read_unlock,
+	.completed	= rcu_bh_torture_completed,
+	.deferred_free	= rcu_bh_torture_deferred_free,
+	.sync		= rcu_bh_torture_synchronize,
+	.cb_barrier	= rcu_barrier_bh,
+	.stats		= NULL,
+	.irq_capable	= 1,
+	.name		= "rcu_bh"
 };
 
 static struct rcu_torture_ops rcu_bh_sync_ops = {
-	.init = rcu_sync_torture_init,
-	.cleanup = NULL,
-	.readlock = rcu_bh_torture_read_lock,
-	.readdelay = rcu_read_delay,  /* just reuse rcu's version. */
-	.readunlock = rcu_bh_torture_read_unlock,
-	.completed = rcu_bh_torture_completed,
-	.deferredfree = rcu_sync_torture_deferred_free,
-	.sync = rcu_bh_torture_synchronize,
-	.cb_barrier = NULL,
-	.stats = NULL,
-	.irqcapable = 1,
-	.name = "rcu_bh_sync"
+	.init		= rcu_sync_torture_init,
+	.cleanup	= NULL,
+	.readlock	= rcu_bh_torture_read_lock,
+	.read_delay	= rcu_read_delay,  /* just reuse rcu's version. */
+	.readunlock	= rcu_bh_torture_read_unlock,
+	.completed	= rcu_bh_torture_completed,
+	.deferred_free	= rcu_sync_torture_deferred_free,
+	.sync		= rcu_bh_torture_synchronize,
+	.cb_barrier	= NULL,
+	.stats		= NULL,
+	.irq_capable	= 1,
+	.name		= "rcu_bh_sync"
 };
 
 /*
@@ -530,17 +530,17 @@ static int srcu_torture_stats(char *page)
 }
 
 static struct rcu_torture_ops srcu_ops = {
-	.init = srcu_torture_init,
-	.cleanup = srcu_torture_cleanup,
-	.readlock = srcu_torture_read_lock,
-	.readdelay = srcu_read_delay,
-	.readunlock = srcu_torture_read_unlock,
-	.completed = srcu_torture_completed,
-	.deferredfree = rcu_sync_torture_deferred_free,
-	.sync = srcu_torture_synchronize,
-	.cb_barrier = NULL,
-	.stats = srcu_torture_stats,
-	.name = "srcu"
+	.init		= srcu_torture_init,
+	.cleanup	= srcu_torture_cleanup,
+	.readlock	= srcu_torture_read_lock,
+	.read_delay	= srcu_read_delay,
+	.readunlock	= srcu_torture_read_unlock,
+	.completed	= srcu_torture_completed,
+	.deferred_free	= rcu_sync_torture_deferred_free,
+	.sync		= srcu_torture_synchronize,
+	.cb_barrier	= NULL,
+	.stats		= srcu_torture_stats,
+	.name		= "srcu"
 };
 
 /*
@@ -574,32 +574,49 @@ static void sched_torture_synchronize(void)
 }
 
 static struct rcu_torture_ops sched_ops = {
-	.init = rcu_sync_torture_init,
-	.cleanup = NULL,
-	.readlock = sched_torture_read_lock,
-	.readdelay = rcu_read_delay,  /* just reuse rcu's version. */
-	.readunlock = sched_torture_read_unlock,
-	.completed = sched_torture_completed,
-	.deferredfree = rcu_sched_torture_deferred_free,
-	.sync = sched_torture_synchronize,
-	.cb_barrier = rcu_barrier_sched,
-	.stats = NULL,
-	.irqcapable = 1,
-	.name = "sched"
+	.init		= rcu_sync_torture_init,
+	.cleanup	= NULL,
+	.readlock	= sched_torture_read_lock,
+	.read_delay	= rcu_read_delay,  /* just reuse rcu's version. */
+	.readunlock	= sched_torture_read_unlock,
+	.completed	= sched_torture_completed,
+	.deferred_free	= rcu_sched_torture_deferred_free,
+	.sync		= sched_torture_synchronize,
+	.cb_barrier	= rcu_barrier_sched,
+	.stats		= NULL,
+	.irq_capable	= 1,
+	.name		= "sched"
 };
 
 static struct rcu_torture_ops sched_ops_sync = {
-	.init = rcu_sync_torture_init,
-	.cleanup = NULL,
-	.readlock = sched_torture_read_lock,
-	.readdelay = rcu_read_delay,  /* just reuse rcu's version. */
-	.readunlock = sched_torture_read_unlock,
-	.completed = sched_torture_completed,
-	.deferredfree = rcu_sync_torture_deferred_free,
-	.sync = sched_torture_synchronize,
-	.cb_barrier = NULL,
-	.stats = NULL,
-	.name = "sched_sync"
+	.init		= rcu_sync_torture_init,
+	.cleanup	= NULL,
+	.readlock	= sched_torture_read_lock,
+	.read_delay	= rcu_read_delay,  /* just reuse rcu's version. */
+	.readunlock	= sched_torture_read_unlock,
+	.completed	= sched_torture_completed,
+	.deferred_free	= rcu_sync_torture_deferred_free,
+	.sync		= sched_torture_synchronize,
+	.cb_barrier	= NULL,
+	.stats		= NULL,
+	.name		= "sched_sync"
+};
+
+extern int rcu_expedited_torture_stats(char *page);
+
+static struct rcu_torture_ops sched_expedited_ops = {
+	.init		= rcu_sync_torture_init,
+	.cleanup	= NULL,
+	.readlock	= sched_torture_read_lock,
+	.read_delay	= rcu_read_delay,  /* just reuse rcu's version. */
+	.readunlock	= sched_torture_read_unlock,
+	.completed	= sched_torture_completed,
+	.deferred_free	= rcu_sync_torture_deferred_free,
+	.sync		= synchronize_sched_expedited,
+	.cb_barrier	= NULL,
+	.stats		= rcu_expedited_torture_stats,
+	.irq_capable	= 1,
+	.name		= "sched_expedited"
 };
 
 /*
@@ -635,7 +652,7 @@ rcu_torture_writer(void *arg)
 				i = RCU_TORTURE_PIPE_LEN;
 			atomic_inc(&rcu_torture_wcount[i]);
 			old_rp->rtort_pipe_count++;
-			cur_ops->deferredfree(old_rp);
+			cur_ops->deferred_free(old_rp);
 		}
 		rcu_torture_current_version++;
 		oldbatch = cur_ops->completed();
@@ -700,7 +717,7 @@ static void rcu_torture_timer(unsigned long unused)
 	if (p->rtort_mbtest == 0)
 		atomic_inc(&n_rcu_torture_mberror);
 	spin_lock(&rand_lock);
-	cur_ops->readdelay(&rand);
+	cur_ops->read_delay(&rand);
 	n_rcu_torture_timers++;
 	spin_unlock(&rand_lock);
 	preempt_disable();
@@ -738,11 +755,11 @@ rcu_torture_reader(void *arg)
 
 	VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
 	set_user_nice(current, 19);
-	if (irqreader && cur_ops->irqcapable)
+	if (irqreader && cur_ops->irq_capable)
 		setup_timer_on_stack(&t, rcu_torture_timer, 0);
 
 	do {
-		if (irqreader && cur_ops->irqcapable) {
+		if (irqreader && cur_ops->irq_capable) {
 			if (!timer_pending(&t))
 				mod_timer(&t, 1);
 		}
@@ -757,7 +774,7 @@ rcu_torture_reader(void *arg)
 		}
 		if (p->rtort_mbtest == 0)
 			atomic_inc(&n_rcu_torture_mberror);
-		cur_ops->readdelay(&rand);
+		cur_ops->read_delay(&rand);
 		preempt_disable();
 		pipe_count = p->rtort_pipe_count;
 		if (pipe_count > RCU_TORTURE_PIPE_LEN) {
@@ -778,7 +795,7 @@ rcu_torture_reader(void *arg)
 	} while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
 	VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
 	rcutorture_shutdown_absorb("rcu_torture_reader");
-	if (irqreader && cur_ops->irqcapable)
+	if (irqreader && cur_ops->irq_capable)
 		del_timer_sync(&t);
 	while (!kthread_should_stop())
 		schedule_timeout_uninterruptible(1);
@@ -1078,6 +1095,7 @@ rcu_torture_init(void)
 	int firsterr = 0;
 	static struct rcu_torture_ops *torture_ops[] =
 		{ &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops,
+		  &sched_expedited_ops,
 		  &srcu_ops, &sched_ops, &sched_ops_sync, };
 
 	mutex_lock(&fullstop_mutex);
-- 
cgit v1.2.3


From 4d09161196c9a836eacea4b36e2f217bc34894cf Mon Sep 17 00:00:00 2001
From: Robin Getz <rgetz@blackfin.uclinux.org>
Date: Wed, 1 Jul 2009 21:08:37 -0400
Subject: printk: Enable the use of more than one CON_BOOT (early console)

Today, register_console() assumes the following usage:

  - The first console to register with a flag set to CON_BOOT
    is the one and only bootconsole.

  - If another register_console() is called with an additional
    CON_BOOT, it is silently rejected.

  - As soon as a console without the CON_BOOT set calls
    registers the bootconsole is automatically unregistered.

  - Once there is a "real" console - register_console() will
    silently reject any consoles with it's CON_BOOT flag set.

In many systems (alpha, blackfin, microblaze, mips, powerpc,
sh, & x86), there are early_printk implementations, which use
the CON_BOOT which come out serial ports, vga, usb, & memory
buffers.

In many embedded systems, it would be nice to have two
bootconsoles - in case the primary fails, you always have
access to a backup memory buffer - but this requires at least
two CON_BOOT consoles...

This patch enables that functionality.

With the change applied, on boot you get (if you try to
re-enable a boot console after the "real" console has been
registered):

  root:/> dmesg | grep console
  bootconsole [early_shadow0] enabled
  bootconsole [early_BFuart0] enabled
  Kernel command line: root=/dev/mtdblock0 rw earlyprintk=serial,uart0,57600 console=ttyBF0,57600 nmi_debug=regs
  console handover:boot [early_BFuart0] boot [early_shadow0]  -> real [ttyBF0]
  Too late to register bootconsole early_shadow0

or:

  root:/> dmesg | grep console
  Kernel command line: root=/dev/mtdblock0 rw console=ttyBF0,57600
  console [ttyBF0] enabled

Signed-off-by: Robin Getz <rgetz@blackfin.uclinux.org>
Cc: "Linus Torvalds" <torvalds@linux-foundation.org>
Cc: "Andrew Morton" <akpm@linux-foundation.org>
Cc: "Mike Frysinger" <vapier.adi@gmail.com>
Cc: "Paul Mundt" <lethal@linux-sh.org>
LKML-Reference: <200907012108.38030.rgetz@blackfin.uclinux.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/printk.c | 146 +++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 92 insertions(+), 54 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index b4d97b54c1e..41fe6099553 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -36,6 +36,12 @@
 
 #include <asm/uaccess.h>
 
+/*
+ * for_each_console() allows you to iterate on each console
+ */
+#define for_each_console(con) \
+	for (con = console_drivers; con != NULL; con = con->next)
+
 /*
  * Architectures can override it:
  */
@@ -412,7 +418,7 @@ static void __call_console_drivers(unsigned start, unsigned end)
 {
 	struct console *con;
 
-	for (con = console_drivers; con; con = con->next) {
+	for_each_console(con) {
 		if ((con->flags & CON_ENABLED) && con->write &&
 				(cpu_online(smp_processor_id()) ||
 				(con->flags & CON_ANYTIME)))
@@ -544,7 +550,7 @@ static int have_callable_console(void)
 {
 	struct console *con;
 
-	for (con = console_drivers; con; con = con->next)
+	for_each_console(con)
 		if (con->flags & CON_ANYTIME)
 			return 1;
 
@@ -1082,7 +1088,7 @@ void console_unblank(void)
 
 	console_locked = 1;
 	console_may_schedule = 0;
-	for (c = console_drivers; c != NULL; c = c->next)
+	for_each_console(c)
 		if ((c->flags & CON_ENABLED) && c->unblank)
 			c->unblank();
 	release_console_sem();
@@ -1097,7 +1103,7 @@ struct tty_driver *console_device(int *index)
 	struct tty_driver *driver = NULL;
 
 	acquire_console_sem();
-	for (c = console_drivers; c != NULL; c = c->next) {
+	for_each_console(c) {
 		if (!c->device)
 			continue;
 		driver = c->device(c, index);
@@ -1134,25 +1140,49 @@ EXPORT_SYMBOL(console_start);
  * to register the console printing procedure with printk() and to
  * print any messages that were printed by the kernel before the
  * console driver was initialized.
+ *
+ * This can happen pretty early during the boot process (because of
+ * early_printk) - sometimes before setup_arch() completes - be careful
+ * of what kernel features are used - they may not be initialised yet.
+ *
+ * There are two types of consoles - bootconsoles (early_printk) and
+ * "real" consoles (everything which is not a bootconsole) which are
+ * handled differently.
+ *  - Any number of bootconsoles can be registered at any time.
+ *  - As soon as a "real" console is registered, all bootconsoles
+ *    will be unregistered automatically.
+ *  - Once a "real" console is registered, any attempt to register a
+ *    bootconsoles will be rejected
  */
-void register_console(struct console *console)
+void register_console(struct console *newcon)
 {
 	int i;
 	unsigned long flags;
-	struct console *bootconsole = NULL;
+	struct console *bcon = NULL;
 
-	if (console_drivers) {
-		if (console->flags & CON_BOOT)
-			return;
-		if (console_drivers->flags & CON_BOOT)
-			bootconsole = console_drivers;
+	/*
+	 * before we register a new CON_BOOT console, make sure we don't
+	 * already have a valid console
+	 */
+	if (console_drivers && newcon->flags & CON_BOOT) {
+		/* find the last or real console */
+		for_each_console(bcon) {
+			if (!(bcon->flags & CON_BOOT)) {
+				printk(KERN_INFO "Too late to register bootconsole %s%d\n",
+					newcon->name, newcon->index);
+				return;
+			}
+		}
 	}
 
-	if (preferred_console < 0 || bootconsole || !console_drivers)
+	if (console_drivers && console_drivers->flags & CON_BOOT)
+		bcon = console_drivers;
+
+	if (preferred_console < 0 || bcon || !console_drivers)
 		preferred_console = selected_console;
 
-	if (console->early_setup)
-		console->early_setup();
+	if (newcon->early_setup)
+		newcon->early_setup();
 
 	/*
 	 *	See if we want to use this console driver. If we
@@ -1160,13 +1190,13 @@ void register_console(struct console *console)
 	 *	that registers here.
 	 */
 	if (preferred_console < 0) {
-		if (console->index < 0)
-			console->index = 0;
-		if (console->setup == NULL ||
-		    console->setup(console, NULL) == 0) {
-			console->flags |= CON_ENABLED;
-			if (console->device) {
-				console->flags |= CON_CONSDEV;
+		if (newcon->index < 0)
+			newcon->index = 0;
+		if (newcon->setup == NULL ||
+		    newcon->setup(newcon, NULL) == 0) {
+			newcon->flags |= CON_ENABLED;
+			if (newcon->device) {
+				newcon->flags |= CON_CONSDEV;
 				preferred_console = 0;
 			}
 		}
@@ -1178,47 +1208,53 @@ void register_console(struct console *console)
 	 */
 	for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0];
 			i++) {
-		if (strcmp(console_cmdline[i].name, console->name) != 0)
+		if (strcmp(console_cmdline[i].name, newcon->name) != 0)
 			continue;
-		if (console->index >= 0 &&
-		    console->index != console_cmdline[i].index)
+		if (newcon->index >= 0 &&
+		    newcon->index != console_cmdline[i].index)
 			continue;
-		if (console->index < 0)
-			console->index = console_cmdline[i].index;
+		if (newcon->index < 0)
+			newcon->index = console_cmdline[i].index;
 #ifdef CONFIG_A11Y_BRAILLE_CONSOLE
 		if (console_cmdline[i].brl_options) {
-			console->flags |= CON_BRL;
-			braille_register_console(console,
+			newcon->flags |= CON_BRL;
+			braille_register_console(newcon,
 					console_cmdline[i].index,
 					console_cmdline[i].options,
 					console_cmdline[i].brl_options);
 			return;
 		}
 #endif
-		if (console->setup &&
-		    console->setup(console, console_cmdline[i].options) != 0)
+		if (newcon->setup &&
+		    newcon->setup(newcon, console_cmdline[i].options) != 0)
 			break;
-		console->flags |= CON_ENABLED;
-		console->index = console_cmdline[i].index;
+		newcon->flags |= CON_ENABLED;
+		newcon->index = console_cmdline[i].index;
 		if (i == selected_console) {
-			console->flags |= CON_CONSDEV;
+			newcon->flags |= CON_CONSDEV;
 			preferred_console = selected_console;
 		}
 		break;
 	}
 
-	if (!(console->flags & CON_ENABLED))
+	if (!(newcon->flags & CON_ENABLED))
 		return;
 
-	if (bootconsole && (console->flags & CON_CONSDEV)) {
-		printk(KERN_INFO "console handover: boot [%s%d] -> real [%s%d]\n",
-		       bootconsole->name, bootconsole->index,
-		       console->name, console->index);
-		unregister_console(bootconsole);
-		console->flags &= ~CON_PRINTBUFFER;
+	if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) {
+		/* we need to iterate through twice, to make sure we print
+		 * everything out, before we unregister the console(s)
+		 */
+		printk(KERN_INFO "console handover:");
+		for_each_console(bcon)
+			printk("boot [%s%d] ", bcon->name, bcon->index);
+		printk(" -> real [%s%d]\n", newcon->name, newcon->index);
+		for_each_console(bcon)
+			unregister_console(bcon);
+		newcon->flags &= ~CON_PRINTBUFFER;
 	} else {
-		printk(KERN_INFO "console [%s%d] enabled\n",
-		       console->name, console->index);
+		printk(KERN_INFO "%sconsole [%s%d] enabled\n",
+			(newcon->flags & CON_BOOT) ? "boot" : "" ,
+			newcon->name, newcon->index);
 	}
 
 	/*
@@ -1226,16 +1262,16 @@ void register_console(struct console *console)
 	 *	preferred driver at the head of the list.
 	 */
 	acquire_console_sem();
-	if ((console->flags & CON_CONSDEV) || console_drivers == NULL) {
-		console->next = console_drivers;
-		console_drivers = console;
-		if (console->next)
-			console->next->flags &= ~CON_CONSDEV;
+	if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) {
+		newcon->next = console_drivers;
+		console_drivers = newcon;
+		if (newcon->next)
+			newcon->next->flags &= ~CON_CONSDEV;
 	} else {
-		console->next = console_drivers->next;
-		console_drivers->next = console;
+		newcon->next = console_drivers->next;
+		console_drivers->next = newcon;
 	}
-	if (console->flags & CON_PRINTBUFFER) {
+	if (newcon->flags & CON_PRINTBUFFER) {
 		/*
 		 * release_console_sem() will print out the buffered messages
 		 * for us.
@@ -1287,11 +1323,13 @@ EXPORT_SYMBOL(unregister_console);
 
 static int __init disable_boot_consoles(void)
 {
-	if (console_drivers != NULL) {
-		if (console_drivers->flags & CON_BOOT) {
+	struct console *con;
+
+	for_each_console(con) {
+		if (con->flags & CON_BOOT) {
 			printk(KERN_INFO "turn off boot console %s%d\n",
-				console_drivers->name, console_drivers->index);
-			return unregister_console(console_drivers);
+				con->name, con->index);
+			return unregister_console(con);
 		}
 	}
 	return 0;
-- 
cgit v1.2.3


From c71320d0c445e01555a86fa6523609db47eeaef6 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Sun, 5 Jul 2009 00:22:34 +0200
Subject: genirq: Fix comment describing suspend_device_irqs()

The kerneldoc comment describing suspend_device_irqs() is currently
misleading, because generally the function doesn't really disable
interrupt lines at the chip level.  Replace it with a more accurate
one.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
LKML-Reference: <200907050022.35117.rjw@sisk.pl>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/pm.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 638d8bedec1..a0bb09e7986 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -15,10 +15,10 @@
 /**
  * suspend_device_irqs - disable all currently enabled interrupt lines
  *
- * During system-wide suspend or hibernation device interrupts need to be
- * disabled at the chip level and this function is provided for this purpose.
- * It disables all interrupt lines that are enabled at the moment and sets the
- * IRQ_SUSPENDED flag for them.
+ * During system-wide suspend or hibernation device drivers need to be prevented
+ * from receiving interrupts and this function is provided for this purpose.
+ * It marks all interrupt lines in use, except for the timer ones, as disabled
+ * and sets the IRQ_SUSPENDED flag for each of them.
  */
 void suspend_device_irqs(void)
 {
-- 
cgit v1.2.3


From 8259cf4342029aad37660e524178c8858f48b0ab Mon Sep 17 00:00:00 2001
From: Robin Getz <rgetz@blackfin.uclinux.org>
Date: Thu, 9 Jul 2009 13:08:37 -0400
Subject: printk: Ensure that "console enabled" messages are printed on the
 console

Today, when a console is registered without CON_PRINTBUFFER,
end users never see the announcement of it being added, and
never know if they missed something, if the console is really
at the start or not, and just leads to general confusion.

This re-orders existing code, to make sure the console is
added, before the "console [%s%d] enabled" is printed out -
ensuring that this message is _always_ seen.

This has the desired/intended side effect of making sure that
"console enabled:" messages are printed on the bootconsole, and
the real console. This does cause the same line is printed
twice if the bootconsole and real console are the same device,
but if they are on different devices, the message is printed to
both consoles.

Signed-off-by : Robin Getz <rgetz@blackfin.uclinux.org>
Cc: "Andrew Morton" <akpm@linux-foundation.org>
Cc: "Linus Torvalds" <torvalds@linux-foundation.org>
LKML-Reference: <200907091308.37370.rgetz@blackfin.uclinux.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/printk.c | 44 +++++++++++++++++++++++++++++---------------
 1 file changed, 29 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index 41fe6099553..668df351e6a 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1240,22 +1240,14 @@ void register_console(struct console *newcon)
 	if (!(newcon->flags & CON_ENABLED))
 		return;
 
-	if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) {
-		/* we need to iterate through twice, to make sure we print
-		 * everything out, before we unregister the console(s)
-		 */
-		printk(KERN_INFO "console handover:");
-		for_each_console(bcon)
-			printk("boot [%s%d] ", bcon->name, bcon->index);
-		printk(" -> real [%s%d]\n", newcon->name, newcon->index);
-		for_each_console(bcon)
-			unregister_console(bcon);
+	/*
+	 * If we have a bootconsole, and are switching to a real console,
+	 * don't print everything out again, since when the boot console, and
+	 * the real console are the same physical device, it's annoying to
+	 * see the beginning boot messages twice
+	 */
+	if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV))
 		newcon->flags &= ~CON_PRINTBUFFER;
-	} else {
-		printk(KERN_INFO "%sconsole [%s%d] enabled\n",
-			(newcon->flags & CON_BOOT) ? "boot" : "" ,
-			newcon->name, newcon->index);
-	}
 
 	/*
 	 *	Put this console in the list - keep the
@@ -1281,6 +1273,28 @@ void register_console(struct console *newcon)
 		spin_unlock_irqrestore(&logbuf_lock, flags);
 	}
 	release_console_sem();
+
+	/*
+	 * By unregistering the bootconsoles after we enable the real console
+	 * we get the "console xxx enabled" message on all the consoles -
+	 * boot consoles, real consoles, etc - this is to ensure that end
+	 * users know there might be something in the kernel's log buffer that
+	 * went to the bootconsole (that they do not see on the real console)
+	 */
+	if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) {
+		/* we need to iterate through twice, to make sure we print
+		 * everything out, before we unregister the console(s)
+		 */
+		printk(KERN_INFO "console [%s%d] enabled, bootconsole disabled\n",
+			newcon->name, newcon->index);
+		for_each_console(bcon)
+			if (bcon->flags & CON_BOOT)
+				unregister_console(bcon);
+	} else {
+		printk(KERN_INFO "%sconsole [%s%d] enabled\n",
+			(newcon->flags & CON_BOOT) ? "boot" : "" ,
+			newcon->name, newcon->index);
+	}
 }
 EXPORT_SYMBOL(register_console);
 
-- 
cgit v1.2.3


From 1aaad49e856ce41adc07d8ae0c8ef35fc4483245 Mon Sep 17 00:00:00 2001
From: Frans Pop <elendil@planet.nl>
Date: Mon, 6 Jul 2009 13:31:48 +0200
Subject: printk: Restore previous console_loglevel when re-enabling logging

When logging to console is disabled from userspace using klogctl()
and later re-enabled, console_loglevel gets set to the default
log level instead to the previous value.

This means that if the kernel was booted with 'quiet', the boot is
suddenly no longer quiet after logging to console gets re-enabled.

Save the current console_loglevel when logging is disabled and
restore to that value. If the log level is set to a specific value
while disabled, this is interpreted as an implicit re-enabling of
the logging.

The problem that prompted this patch is described in:

    http://lkml.org/lkml/2009/6/28/234

There are two variations possible on the patch below:

 1) If klogctl(7) is called while logging is not disabled, then set level
    to default (partially preserving current functionality):
  	case 7:		/* Enable logging to console */
 -		console_loglevel = default_console_loglevel;
 +		if (saved_console_loglevel == -1)
 +			console_loglevel = default_console_loglevel;
 +		else {
 +			console_loglevel = saved_console_loglevel;
 +			saved_console_loglevel = -1;
 +		}

 2) If klogctl(8) is called while logging is disabled, then don't enable
    logging, but remember the requested value for when logging does get
    enabled again:
  	case 8:		/* Set level of messages printed to console */
 [...]
 - 		console_loglevel = len;
 +		if (saved_console_loglevel == -1)
 +			console_loglevel = len;
 +		else
 +			saved_console_loglevel = len;

Yet another option would be to ignore the request.

Signed-off-by: Frans Pop <elendil@planet.nl>
Cc: cryptsetup@packages.debian.org
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <200907061331.49930.elendil@planet.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/printk.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index 668df351e6a..e0daaf57985 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -67,6 +67,8 @@ int console_printk[4] = {
 	DEFAULT_CONSOLE_LOGLEVEL,	/* default_console_loglevel */
 };
 
+static int saved_console_loglevel = -1;
+
 /*
  * Low level drivers may need that to know if they can schedule in
  * their unblank() callback or not. So let's export it.
@@ -378,10 +380,15 @@ int do_syslog(int type, char __user *buf, int len)
 		logged_chars = 0;
 		break;
 	case 6:		/* Disable logging to console */
+		if (saved_console_loglevel == -1)
+			saved_console_loglevel = console_loglevel;
 		console_loglevel = minimum_console_loglevel;
 		break;
 	case 7:		/* Enable logging to console */
-		console_loglevel = default_console_loglevel;
+		if (saved_console_loglevel != -1) {
+			console_loglevel = saved_console_loglevel;
+			saved_console_loglevel = -1;
+		}
 		break;
 	case 8:		/* Set level of messages printed to console */
 		error = -EINVAL;
@@ -390,6 +397,8 @@ int do_syslog(int type, char __user *buf, int len)
 		if (len < minimum_console_loglevel)
 			len = minimum_console_loglevel;
 		console_loglevel = len;
+		/* Implicitly re-enable logging to console */
+		saved_console_loglevel = -1;
 		error = 0;
 		break;
 	case 9:		/* Number of chars in the log buffer */
-- 
cgit v1.2.3


From 134e63756d5f3d0f7604dfcca847b09d1b14fd66 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Fri, 10 Jul 2009 09:51:34 +0000
Subject: genetlink: make netns aware

This makes generic netlink network namespace aware. No
generic netlink families except for the controller family
are made namespace aware, they need to be checked one by
one and then set the family->netnsok member to true.

A new function genlmsg_multicast_netns() is introduced to
allow sending a multicast message in a given namespace,
for example when it applies to an object that lives in
that namespace, a new function genlmsg_multicast_allns()
to send a message to all network namespaces (for objects
that do not have an associated netns).

The function genlmsg_multicast() is changed to multicast
the message in just init_net, which is currently correct
for all generic netlink families since they only work in
init_net right now. Some will later want to work in all
net namespaces because they do not care about the netns
at all -- those will have to be converted to use one of
the new functions genlmsg_multicast_allns() or
genlmsg_multicast_netns() whenever they are made netns
aware in some way.

After this patch families can easily decide whether or
not they should be available in all net namespaces. Many
genl families us it for objects not related to networking
and should therefore be available in all namespaces, but
that will have to be done on a per family basis.

Note that this doesn't touch on the checkpoint/restart
problem where network namespaces could be used, genl
families and multicast groups are numbered globally and
I see no easy way of changing that, especially since it
must be possible to multicast to all network namespaces
for those families that do not care about netns.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/taskstats.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 888adbcca30..ea8384d3caa 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -108,7 +108,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
 /*
  * Send taskstats data in @skb to listener with nl_pid @pid
  */
-static int send_reply(struct sk_buff *skb, pid_t pid)
+static int send_reply(struct sk_buff *skb, struct genl_info *info)
 {
 	struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb));
 	void *reply = genlmsg_data(genlhdr);
@@ -120,7 +120,7 @@ static int send_reply(struct sk_buff *skb, pid_t pid)
 		return rc;
 	}
 
-	return genlmsg_unicast(skb, pid);
+	return genlmsg_reply(skb, info);
 }
 
 /*
@@ -150,7 +150,7 @@ static void send_cpu_listeners(struct sk_buff *skb,
 			if (!skb_next)
 				break;
 		}
-		rc = genlmsg_unicast(skb_cur, s->pid);
+		rc = genlmsg_unicast(&init_net, skb_cur, s->pid);
 		if (rc == -ECONNREFUSED) {
 			s->valid = 0;
 			delcount++;
@@ -418,7 +418,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
 		goto err;
 	}
 
-	rc = send_reply(rep_skb, info->snd_pid);
+	rc = send_reply(rep_skb, info);
 
 err:
 	fput_light(file, fput_needed);
@@ -487,7 +487,7 @@ free_return_rc:
 	} else
 		goto err;
 
-	return send_reply(rep_skb, info->snd_pid);
+	return send_reply(rep_skb, info);
 err:
 	nlmsg_free(rep_skb);
 	return rc;
-- 
cgit v1.2.3


From e7aaaa6934636d7a6cadd9e2a05250fbb6a34f65 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 16 Jul 2009 15:44:29 +0200
Subject: sched: Drop the need_resched() loop from cond_resched()

The schedule() function is a loop that reschedules the current
task while the TIF_NEED_RESCHED flag is set:

void schedule(void)
{
need_resched:
	/* schedule code */
	if (need_resched())
		goto need_resched;
}

And cond_resched() repeat this loop:

do {
	add_preempt_count(PREEMPT_ACTIVE);
	schedule();
	sub_preempt_count(PREEMPT_ACTIVE);
} while(need_resched());

This loop is needless because schedule() already did the check
and nothing can set TIF_NEED_RESCHED between schedule() exit
and the loop check in need_resched().

Then remove this needless loop.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1247725694-6082-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 03f7e3fd80b..4c5ee843d57 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6618,11 +6618,9 @@ static void __cond_resched(void)
 	 * PREEMPT_ACTIVE, which could trigger a second
 	 * cond_resched() call.
 	 */
-	do {
-		add_preempt_count(PREEMPT_ACTIVE);
-		schedule();
-		sub_preempt_count(PREEMPT_ACTIVE);
-	} while (need_resched());
+	add_preempt_count(PREEMPT_ACTIVE);
+	schedule();
+	sub_preempt_count(PREEMPT_ACTIVE);
 }
 
 int __sched _cond_resched(void)
-- 
cgit v1.2.3


From 4b2155678d7cc7b5f45d6b36049091376c3408a2 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 16 Jul 2009 15:44:29 +0200
Subject: sched: Remove obsolete comment in __cond_resched()

Remove the outdated comment from __cond_resched() related to
the now removed Big Kernel Semaphore.

Reported-by: Arnd Bergmann <arnd@arndb.de>
Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1247725694-6082-2-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 4c5ee843d57..4d39e96044b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6613,11 +6613,6 @@ static void __cond_resched(void)
 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
 	__might_sleep(__FILE__, __LINE__);
 #endif
-	/*
-	 * The BKS might be reacquired before we have dropped
-	 * PREEMPT_ACTIVE, which could trigger a second
-	 * cond_resched() call.
-	 */
 	add_preempt_count(PREEMPT_ACTIVE);
 	schedule();
 	sub_preempt_count(PREEMPT_ACTIVE);
-- 
cgit v1.2.3


From e09758fae8ccde97e026c704319eaa18d488dc86 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 16 Jul 2009 15:44:29 +0200
Subject: sched: Cover the CONFIG_DEBUG_SPINLOCK_SLEEP off-case for
 __might_sleep()

Cover the off case for __might_sleep(), so that we avoid
#ifdefs in files that make use of it. Especially, this prepares
for the __might_sleep() pull up on cond_resched().

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1247725694-6082-3-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 4d39e96044b..370a6c31c5e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6610,9 +6610,8 @@ static inline int should_resched(void)
 
 static void __cond_resched(void)
 {
-#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
 	__might_sleep(__FILE__, __LINE__);
-#endif
+
 	add_preempt_count(PREEMPT_ACTIVE);
 	schedule();
 	sub_preempt_count(PREEMPT_ACTIVE);
-- 
cgit v1.2.3


From e4aafea2d4bde8b44d6500c4ee7195bbfc51269e Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 16 Jul 2009 15:44:29 +0200
Subject: sched: Add a preempt count base offset to __might_sleep()

Add a preempt count base offset to compare against the current
preempt level count. It prepares to pull up the might_sleep
check from cond_resched() to cond_resched_lock() and
cond_resched_bh().

For these two helpers, we need to respectively ensure that once
we'll unlock the given spinlock / reenable local softirqs, we
will reach a sleepable state.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
[ Move and rename preempt_count_equals() ]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1247725694-6082-4-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 370a6c31c5e..3ff4d004bd9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6610,7 +6610,7 @@ static inline int should_resched(void)
 
 static void __cond_resched(void)
 {
-	__might_sleep(__FILE__, __LINE__);
+	__might_sleep(__FILE__, __LINE__, 0);
 
 	add_preempt_count(PREEMPT_ACTIVE);
 	schedule();
@@ -9429,13 +9429,20 @@ void __init sched_init(void)
 }
 
 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
-void __might_sleep(char *file, int line)
+static inline int preempt_count_equals(int preempt_offset)
+{
+	int nested = preempt_count() & ~PREEMPT_ACTIVE;
+
+	return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
+}
+
+void __might_sleep(char *file, int line, int preempt_offset)
 {
 #ifdef in_atomic
 	static unsigned long prev_jiffy;	/* ratelimiting */
 
-	if ((!in_atomic() && !irqs_disabled()) ||
-		    system_state != SYSTEM_RUNNING || oops_in_progress)
+	if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
+	    system_state != SYSTEM_RUNNING || oops_in_progress)
 		return;
 	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
 		return;
-- 
cgit v1.2.3


From 613afbf83298efaead05ebcac23d2285609d7160 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 16 Jul 2009 15:44:29 +0200
Subject: sched: Pull up the might_sleep() check into cond_resched()

might_sleep() is called late-ish in cond_resched(), after the
need_resched()/preempt enabled/system running tests are
checked.

It's better to check the sleeps while atomic earlier and not
depend on some environment datas that reduce the chances to
detect a problem.

Also define cond_resched_*() helpers as macros, so that the
FILE/LINE reported in the sleeping while atomic warning
displays the real origin and not sched.h

Changes in v2:

 - Call __might_sleep() directly instead of might_sleep() which
   may call cond_resched()

 - Turn cond_resched() into a macro so that the file:line
   couple reported refers to the caller of cond_resched() and
   not __cond_resched() itself.

Changes in v3:

 - Also propagate this __might_sleep() pull up to
   cond_resched_lock() and cond_resched_softirq()

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1247725694-6082-6-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 3ff4d004bd9..1f7919add8a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6610,8 +6610,6 @@ static inline int should_resched(void)
 
 static void __cond_resched(void)
 {
-	__might_sleep(__FILE__, __LINE__, 0);
-
 	add_preempt_count(PREEMPT_ACTIVE);
 	schedule();
 	sub_preempt_count(PREEMPT_ACTIVE);
@@ -6628,14 +6626,14 @@ int __sched _cond_resched(void)
 EXPORT_SYMBOL(_cond_resched);
 
 /*
- * cond_resched_lock() - if a reschedule is pending, drop the given lock,
+ * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
  * call schedule, and on return reacquire the lock.
  *
  * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
  * operations here to prevent schedule() from being called twice (once via
  * spin_unlock(), once by hand).
  */
-int cond_resched_lock(spinlock_t *lock)
+int __cond_resched_lock(spinlock_t *lock)
 {
 	int resched = should_resched();
 	int ret = 0;
@@ -6651,9 +6649,9 @@ int cond_resched_lock(spinlock_t *lock)
 	}
 	return ret;
 }
-EXPORT_SYMBOL(cond_resched_lock);
+EXPORT_SYMBOL(__cond_resched_lock);
 
-int __sched cond_resched_softirq(void)
+int __sched __cond_resched_softirq(void)
 {
 	BUG_ON(!in_softirq());
 
@@ -6665,7 +6663,7 @@ int __sched cond_resched_softirq(void)
 	}
 	return 0;
 }
-EXPORT_SYMBOL(cond_resched_softirq);
+EXPORT_SYMBOL(__cond_resched_softirq);
 
 /**
  * yield - yield the current processor to other threads.
-- 
cgit v1.2.3


From a004cd42181409eda70804ded240a791f4564d61 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 21 Jul 2009 09:54:05 +0200
Subject: sched: Fix return value of migration_init()

migration_init() returns the return value of the hotplug notifier. In
the success case this is NOTIFY_OK which is 1. initcall_debug
evaluates that as an error code because init calls are expected to
return 0 on success.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 1f7919add8a..953f037dc05 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7654,7 +7654,7 @@ static int __init migration_init(void)
 	migration_call(&migration_notifier, CPU_ONLINE, cpu);
 	register_cpu_notifier(&migration_notifier);
 
-	return err;
+	return 0;
 }
 early_initcall(migration_init);
 #endif
-- 
cgit v1.2.3


From c94aa5ca3088018d2a7a9bd3258aefffe29df265 Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Thu, 16 Jul 2009 15:44:29 +0200
Subject: lockdep: Print the shortest dependency chain if finding a circle

Currently lockdep will print the 1st circle detected if it
exists when acquiring a new (next) lock.

This patch prints the shortest path from the next lock to be
acquired to the previous held lock if a circle is found.

The patch still uses the current method to check circle, and
once the circle is found, breadth-first search algorithem is
used to compute the shortest path from the next lock to the
previous lock in the forward lock dependency graph.

Printing the shortest path will shorten the dependency chain,
and make troubleshooting for possible circular locking easier.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1246201486-7308-2-git-send-email-tom.leiming@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c           | 115 +++++++++++++++++++++++++++++++++++++++++----
 kernel/lockdep_internals.h |  83 ++++++++++++++++++++++++++++++++
 2 files changed, 189 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 8bbeef996c7..93dc70d18cd 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -897,6 +897,79 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
 	return 1;
 }
 
+static struct circular_queue  lock_cq;
+static int __search_shortest_path(struct lock_list *source_entry,
+				struct lock_class *target,
+				struct lock_list **target_entry,
+				int forward)
+{
+	struct lock_list *entry;
+	struct circular_queue *cq = &lock_cq;
+	int ret = 1;
+
+	__cq_init(cq);
+
+	mark_lock_accessed(source_entry, NULL);
+	if (source_entry->class == target) {
+		*target_entry = source_entry;
+		ret = 0;
+		goto exit;
+	}
+
+	__cq_enqueue(cq, (unsigned long)source_entry);
+
+	while (!__cq_empty(cq)) {
+		struct lock_list *lock;
+		struct list_head *head;
+
+		__cq_dequeue(cq, (unsigned long *)&lock);
+
+		if (!lock->class) {
+			ret = -2;
+			goto exit;
+		}
+
+		if (forward)
+			head = &lock->class->locks_after;
+		else
+			head = &lock->class->locks_before;
+
+		list_for_each_entry(entry, head, entry) {
+			if (!lock_accessed(entry)) {
+				mark_lock_accessed(entry, lock);
+				if (entry->class == target) {
+					*target_entry = entry;
+					ret = 0;
+					goto exit;
+				}
+
+				if (__cq_enqueue(cq, (unsigned long)entry)) {
+					ret = -1;
+					goto exit;
+				}
+			}
+		}
+	}
+exit:
+	return ret;
+}
+
+static inline int __search_forward_shortest_path(struct lock_list *src_entry,
+				struct lock_class *target,
+				struct lock_list **target_entry)
+{
+	return __search_shortest_path(src_entry, target, target_entry, 1);
+
+}
+
+static inline int __search_backward_shortest_path(struct lock_list *src_entry,
+				struct lock_class *target,
+				struct lock_list **target_entry)
+{
+	return __search_shortest_path(src_entry, target, target_entry, 0);
+
+}
+
 /*
  * Recursive, forwards-direction lock-dependency checking, used for
  * both noncyclic checking and for hardirq-unsafe/softirq-unsafe
@@ -934,7 +1007,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth)
 {
 	struct task_struct *curr = current;
 
-	if (!debug_locks_off_graph_unlock() || debug_locks_silent)
+	if (debug_locks_silent)
 		return 0;
 
 	printk("\n=======================================================\n");
@@ -954,19 +1027,41 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth)
 	return 0;
 }
 
-static noinline int print_circular_bug_tail(void)
+static noinline int print_circular_bug(void)
 {
 	struct task_struct *curr = current;
 	struct lock_list this;
+	struct lock_list *target;
+	struct lock_list *parent;
+	int result;
+	unsigned long depth;
 
-	if (debug_locks_silent)
+	if (!debug_locks_off_graph_unlock() || debug_locks_silent)
 		return 0;
 
 	this.class = hlock_class(check_source);
 	if (!save_trace(&this.trace))
 		return 0;
 
-	print_circular_bug_entry(&this, 0);
+	result = __search_forward_shortest_path(&this,
+						hlock_class(check_target),
+						&target);
+	if (result) {
+		printk("\n%s:search shortest path failed:%d\n", __func__,
+			result);
+		return 0;
+	}
+
+	depth = get_lock_depth(target);
+
+	print_circular_bug_header(target, depth);
+
+	parent = get_lock_parent(target);
+
+	while (parent) {
+		print_circular_bug_entry(parent, --depth);
+		parent = get_lock_parent(parent);
+	}
 
 	printk("\nother info that might help us debug this:\n\n");
 	lockdep_print_held_locks(curr);
@@ -1072,14 +1167,15 @@ check_noncircular(struct lock_class *source, unsigned int depth)
 	 */
 	list_for_each_entry(entry, &source->locks_after, entry) {
 		if (entry->class == hlock_class(check_target))
-			return print_circular_bug_header(entry, depth+1);
+			return 2;
 		debug_atomic_inc(&nr_cyclic_checks);
-		if (!check_noncircular(entry->class, depth+1))
-			return print_circular_bug_entry(entry, depth+1);
+		if (check_noncircular(entry->class, depth+1) == 2)
+			return 2;
 	}
 	return 1;
 }
 
+
 #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
 /*
  * Forwards and backwards subgraph searching, for the purposes of
@@ -1484,8 +1580,9 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 	 */
 	check_source = next;
 	check_target = prev;
-	if (!(check_noncircular(hlock_class(next), 0)))
-		return print_circular_bug_tail();
+	if (check_noncircular(hlock_class(next), 0) == 2)
+		return print_circular_bug();
+
 
 	if (!check_prev_add_irq(curr, prev, next))
 		return 0;
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index 699a2ac3a0d..6f48d37d5be 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -136,3 +136,86 @@ extern atomic_t nr_find_usage_backwards_recursions;
 # define debug_atomic_dec(ptr)		do { } while (0)
 # define debug_atomic_read(ptr)		0
 #endif
+
+/* The circular_queue and helpers is used to implement the
+ * breadth-first search(BFS)algorithem, by which we can build
+ * the shortest path from the next lock to be acquired to the
+ * previous held lock if there is a circular between them.
+ * */
+#define  MAX_CIRCULAR_QUE_SIZE	    4096UL
+struct circular_queue{
+	unsigned long element[MAX_CIRCULAR_QUE_SIZE];
+	unsigned int  front, rear;
+};
+
+#define LOCK_ACCESSED 		1UL
+#define LOCK_ACCESSED_MASK	(~LOCK_ACCESSED)
+
+static inline void __cq_init(struct circular_queue *cq)
+{
+	cq->front = cq->rear = 0;
+}
+
+static inline int __cq_empty(struct circular_queue *cq)
+{
+	return (cq->front == cq->rear);
+}
+
+static inline int __cq_full(struct circular_queue *cq)
+{
+	return ((cq->rear + 1)%MAX_CIRCULAR_QUE_SIZE)  == cq->front;
+}
+
+static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem)
+{
+	if (__cq_full(cq))
+		return -1;
+
+	cq->element[cq->rear] = elem;
+	cq->rear = (cq->rear + 1)%MAX_CIRCULAR_QUE_SIZE;
+	return 0;
+}
+
+static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem)
+{
+	if (__cq_empty(cq))
+		return -1;
+
+	*elem = cq->element[cq->front];
+	cq->front = (cq->front + 1)%MAX_CIRCULAR_QUE_SIZE;
+	return 0;
+}
+
+static inline int __cq_get_elem_count(struct circular_queue *cq)
+{
+	return (cq->rear - cq->front)%MAX_CIRCULAR_QUE_SIZE;
+}
+
+static inline void mark_lock_accessed(struct lock_list *lock,
+					struct lock_list *parent)
+{
+	lock->parent = (void *) parent + LOCK_ACCESSED;
+}
+
+static inline unsigned long lock_accessed(struct lock_list *lock)
+{
+	return (unsigned long)lock->parent & LOCK_ACCESSED;
+}
+
+static inline struct lock_list *get_lock_parent(struct lock_list *child)
+{
+	return (struct lock_list *)
+		((unsigned long)child->parent & LOCK_ACCESSED_MASK);
+}
+
+static inline unsigned long get_lock_depth(struct lock_list *child)
+{
+	unsigned long depth = 0;
+	struct lock_list *parent;
+
+	while ((parent = get_lock_parent(child))) {
+		child = parent;
+		depth++;
+	}
+	return depth;
+}
-- 
cgit v1.2.3


From d588e46155e9c51217b9840db1e94a0f594c1af2 Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Thu, 16 Jul 2009 15:44:29 +0200
Subject: lockdep: Improve implementation of BFS

1,replace %MAX_CIRCULAR_QUE_SIZE with &(MAX_CIRCULAR_QUE_SIZE-1)
since we define MAX_CIRCULAR_QUE_SIZE as power of 2;

2,use bitmap to mark if a lock is accessed in BFS in order to
clear it quickly, because we may search a graph many times.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1246201486-7308-3-git-send-email-tom.leiming@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c           | 23 ++++++++++++++++-------
 kernel/lockdep_internals.h | 35 +++++++++++++++++++++++------------
 2 files changed, 39 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 93dc70d18cd..5dcca26a826 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -42,7 +42,7 @@
 #include <linux/hash.h>
 #include <linux/ftrace.h>
 #include <linux/stringify.h>
-
+#include <linux/bitops.h>
 #include <asm/sections.h>
 
 #include "lockdep_internals.h"
@@ -118,7 +118,7 @@ static inline int debug_locks_off_graph_unlock(void)
 static int lockdep_initialized;
 
 unsigned long nr_list_entries;
-static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES];
+struct lock_list list_entries[MAX_LOCKDEP_ENTRIES];
 
 /*
  * All data structures here are protected by the global debug_lock.
@@ -897,30 +897,38 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
 	return 1;
 }
 
+unsigned long bfs_accessed[BITS_TO_LONGS(MAX_LOCKDEP_ENTRIES)];
 static struct circular_queue  lock_cq;
+
 static int __search_shortest_path(struct lock_list *source_entry,
 				struct lock_class *target,
 				struct lock_list **target_entry,
 				int forward)
 {
 	struct lock_list *entry;
+	struct list_head *head;
 	struct circular_queue *cq = &lock_cq;
 	int ret = 1;
 
-	__cq_init(cq);
-
-	mark_lock_accessed(source_entry, NULL);
 	if (source_entry->class == target) {
 		*target_entry = source_entry;
 		ret = 0;
 		goto exit;
 	}
 
+	if (forward)
+		head = &source_entry->class->locks_after;
+	else
+		head = &source_entry->class->locks_before;
+
+	if (list_empty(head))
+		goto exit;
+
+	__cq_init(cq);
 	__cq_enqueue(cq, (unsigned long)source_entry);
 
 	while (!__cq_empty(cq)) {
 		struct lock_list *lock;
-		struct list_head *head;
 
 		__cq_dequeue(cq, (unsigned long *)&lock);
 
@@ -1040,6 +1048,7 @@ static noinline int print_circular_bug(void)
 		return 0;
 
 	this.class = hlock_class(check_source);
+	this.parent = NULL;
 	if (!save_trace(&this.trace))
 		return 0;
 
@@ -1580,10 +1589,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 	 */
 	check_source = next;
 	check_target = prev;
+
 	if (check_noncircular(hlock_class(next), 0) == 2)
 		return print_circular_bug();
 
-
 	if (!check_prev_add_irq(curr, prev, next))
 		return 0;
 
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index 6f48d37d5be..c2f6594966f 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -137,23 +137,28 @@ extern atomic_t nr_find_usage_backwards_recursions;
 # define debug_atomic_read(ptr)		0
 #endif
 
+
+extern unsigned long nr_list_entries;
+extern struct lock_list list_entries[MAX_LOCKDEP_ENTRIES];
+extern unsigned long bfs_accessed[];
+
+/*For good efficiency of modular, we use power of 2*/
+#define  MAX_CIRCULAR_QUE_SIZE	    4096UL
+
 /* The circular_queue and helpers is used to implement the
  * breadth-first search(BFS)algorithem, by which we can build
  * the shortest path from the next lock to be acquired to the
  * previous held lock if there is a circular between them.
  * */
-#define  MAX_CIRCULAR_QUE_SIZE	    4096UL
 struct circular_queue{
 	unsigned long element[MAX_CIRCULAR_QUE_SIZE];
 	unsigned int  front, rear;
 };
 
-#define LOCK_ACCESSED 		1UL
-#define LOCK_ACCESSED_MASK	(~LOCK_ACCESSED)
-
 static inline void __cq_init(struct circular_queue *cq)
 {
 	cq->front = cq->rear = 0;
+	bitmap_zero(bfs_accessed, MAX_LOCKDEP_ENTRIES);
 }
 
 static inline int __cq_empty(struct circular_queue *cq)
@@ -163,7 +168,7 @@ static inline int __cq_empty(struct circular_queue *cq)
 
 static inline int __cq_full(struct circular_queue *cq)
 {
-	return ((cq->rear + 1)%MAX_CIRCULAR_QUE_SIZE)  == cq->front;
+	return ((cq->rear + 1)&(MAX_CIRCULAR_QUE_SIZE-1))  == cq->front;
 }
 
 static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem)
@@ -172,7 +177,7 @@ static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem)
 		return -1;
 
 	cq->element[cq->rear] = elem;
-	cq->rear = (cq->rear + 1)%MAX_CIRCULAR_QUE_SIZE;
+	cq->rear = (cq->rear + 1)&(MAX_CIRCULAR_QUE_SIZE-1);
 	return 0;
 }
 
@@ -182,30 +187,36 @@ static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem)
 		return -1;
 
 	*elem = cq->element[cq->front];
-	cq->front = (cq->front + 1)%MAX_CIRCULAR_QUE_SIZE;
+	cq->front = (cq->front + 1)&(MAX_CIRCULAR_QUE_SIZE-1);
 	return 0;
 }
 
 static inline int __cq_get_elem_count(struct circular_queue *cq)
 {
-	return (cq->rear - cq->front)%MAX_CIRCULAR_QUE_SIZE;
+	return (cq->rear - cq->front)&(MAX_CIRCULAR_QUE_SIZE-1);
 }
 
 static inline void mark_lock_accessed(struct lock_list *lock,
 					struct lock_list *parent)
 {
-	lock->parent = (void *) parent + LOCK_ACCESSED;
+	unsigned long nr;
+	nr = lock - list_entries;
+	WARN_ON(nr >= nr_list_entries);
+	lock->parent = parent;
+	set_bit(nr, bfs_accessed);
 }
 
 static inline unsigned long lock_accessed(struct lock_list *lock)
 {
-	return (unsigned long)lock->parent & LOCK_ACCESSED;
+	unsigned long nr;
+	nr = lock - list_entries;
+	WARN_ON(nr >= nr_list_entries);
+	return test_bit(nr, bfs_accessed);
 }
 
 static inline struct lock_list *get_lock_parent(struct lock_list *child)
 {
-	return (struct lock_list *)
-		((unsigned long)child->parent & LOCK_ACCESSED_MASK);
+	return child->parent;
 }
 
 static inline unsigned long get_lock_depth(struct lock_list *child)
-- 
cgit v1.2.3


From 9e2d551ea0d767c0d624965f0c273e942f4be536 Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Thu, 16 Jul 2009 15:44:29 +0200
Subject: lockdep: Introduce match function to BFS

1,introduce match() to BFS in order to make it usable to
match different pattern;

2,also rename some functions to make them more suitable.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1246201486-7308-4-git-send-email-tom.leiming@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 43 ++++++++++++++++++++++++++-----------------
 1 file changed, 26 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 5dcca26a826..ce6d09e65ad 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -900,17 +900,18 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
 unsigned long bfs_accessed[BITS_TO_LONGS(MAX_LOCKDEP_ENTRIES)];
 static struct circular_queue  lock_cq;
 
-static int __search_shortest_path(struct lock_list *source_entry,
-				struct lock_class *target,
-				struct lock_list **target_entry,
-				int forward)
+static int __bfs(struct lock_list *source_entry,
+			void *data,
+			int (*match)(struct lock_list *entry, void *data),
+			struct lock_list **target_entry,
+			int forward)
 {
 	struct lock_list *entry;
 	struct list_head *head;
 	struct circular_queue *cq = &lock_cq;
 	int ret = 1;
 
-	if (source_entry->class == target) {
+	if (match(source_entry, data)) {
 		*target_entry = source_entry;
 		ret = 0;
 		goto exit;
@@ -945,7 +946,7 @@ static int __search_shortest_path(struct lock_list *source_entry,
 		list_for_each_entry(entry, head, entry) {
 			if (!lock_accessed(entry)) {
 				mark_lock_accessed(entry, lock);
-				if (entry->class == target) {
+				if (match(entry, data)) {
 					*target_entry = entry;
 					ret = 0;
 					goto exit;
@@ -962,19 +963,21 @@ exit:
 	return ret;
 }
 
-static inline int __search_forward_shortest_path(struct lock_list *src_entry,
-				struct lock_class *target,
-				struct lock_list **target_entry)
+static inline int __bfs_forward(struct lock_list *src_entry,
+			void *data,
+			int (*match)(struct lock_list *entry, void *data),
+			struct lock_list **target_entry)
 {
-	return __search_shortest_path(src_entry, target, target_entry, 1);
+	return __bfs(src_entry, data, match, target_entry, 1);
 
 }
 
-static inline int __search_backward_shortest_path(struct lock_list *src_entry,
-				struct lock_class *target,
-				struct lock_list **target_entry)
+static inline int __bfs_backward(struct lock_list *src_entry,
+			void *data,
+			int (*match)(struct lock_list *entry, void *data),
+			struct lock_list **target_entry)
 {
-	return __search_shortest_path(src_entry, target, target_entry, 0);
+	return __bfs(src_entry, data, match, target_entry, 0);
 
 }
 
@@ -1035,6 +1038,11 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth)
 	return 0;
 }
 
+static inline int class_equal(struct lock_list *entry, void *data)
+{
+	return entry->class == data;
+}
+
 static noinline int print_circular_bug(void)
 {
 	struct task_struct *curr = current;
@@ -1052,9 +1060,10 @@ static noinline int print_circular_bug(void)
 	if (!save_trace(&this.trace))
 		return 0;
 
-	result = __search_forward_shortest_path(&this,
-						hlock_class(check_target),
-						&target);
+	result = __bfs_forward(&this,
+			hlock_class(check_target),
+			class_equal,
+			&target);
 	if (result) {
 		printk("\n%s:search shortest path failed:%d\n", __func__,
 			result);
-- 
cgit v1.2.3


From db0002a32f31060ca900b533d93a074ddf7d5b61 Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Thu, 16 Jul 2009 15:44:29 +0200
Subject: lockdep: Implement check_noncircular() by BFS

This patch uses BFS to implement check_noncircular() and
prints the generated shortest circle if exists.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1246201486-7308-5-git-send-email-tom.leiming@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 89 +++++++++++++++++++++++---------------------------------
 1 file changed, 37 insertions(+), 52 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index ce6d09e65ad..5609d309d56 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -985,12 +985,7 @@ static inline int __bfs_backward(struct lock_list *src_entry,
  * Recursive, forwards-direction lock-dependency checking, used for
  * both noncyclic checking and for hardirq-unsafe/softirq-unsafe
  * checking.
- *
- * (to keep the stackframe of the recursive functions small we
- *  use these global variables, and we also mark various helper
- *  functions as noinline.)
  */
-static struct held_lock *check_source, *check_target;
 
 /*
  * Print a dependency chain entry (this is only done when a deadlock
@@ -1014,7 +1009,9 @@ print_circular_bug_entry(struct lock_list *target, unsigned int depth)
  * header first:
  */
 static noinline int
-print_circular_bug_header(struct lock_list *entry, unsigned int depth)
+print_circular_bug_header(struct lock_list *entry, unsigned int depth,
+			struct held_lock *check_src,
+			struct held_lock *check_tgt)
 {
 	struct task_struct *curr = current;
 
@@ -1027,9 +1024,9 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth)
 	printk(  "-------------------------------------------------------\n");
 	printk("%s/%d is trying to acquire lock:\n",
 		curr->comm, task_pid_nr(curr));
-	print_lock(check_source);
+	print_lock(check_src);
 	printk("\nbut task is already holding lock:\n");
-	print_lock(check_target);
+	print_lock(check_tgt);
 	printk("\nwhich lock already depends on the new lock.\n\n");
 	printk("\nthe existing dependency chain (in reverse order) is:\n");
 
@@ -1043,36 +1040,24 @@ static inline int class_equal(struct lock_list *entry, void *data)
 	return entry->class == data;
 }
 
-static noinline int print_circular_bug(void)
+static noinline int print_circular_bug(struct lock_list *this,
+				struct lock_list *target,
+				struct held_lock *check_src,
+				struct held_lock *check_tgt)
 {
 	struct task_struct *curr = current;
-	struct lock_list this;
-	struct lock_list *target;
 	struct lock_list *parent;
-	int result;
 	unsigned long depth;
 
 	if (!debug_locks_off_graph_unlock() || debug_locks_silent)
 		return 0;
 
-	this.class = hlock_class(check_source);
-	this.parent = NULL;
-	if (!save_trace(&this.trace))
+	if (!save_trace(&this->trace))
 		return 0;
 
-	result = __bfs_forward(&this,
-			hlock_class(check_target),
-			class_equal,
-			&target);
-	if (result) {
-		printk("\n%s:search shortest path failed:%d\n", __func__,
-			result);
-		return 0;
-	}
-
 	depth = get_lock_depth(target);
 
-	print_circular_bug_header(target, depth);
+	print_circular_bug_header(target, depth, check_src, check_tgt);
 
 	parent = get_lock_parent(target);
 
@@ -1090,6 +1075,16 @@ static noinline int print_circular_bug(void)
 	return 0;
 }
 
+static noinline int print_bfs_bug(int ret)
+{
+	if (!debug_locks_off_graph_unlock())
+		return 0;
+
+	WARN(1, "lockdep bfs error:%d\n", ret);
+
+	return 0;
+}
+
 #define RECURSION_LIMIT 40
 
 static int noinline print_infinite_recursion_bug(void)
@@ -1168,31 +1163,17 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class)
  * lead to <target>. Print an error and return 0 if it does.
  */
 static noinline int
-check_noncircular(struct lock_class *source, unsigned int depth)
+check_noncircular(struct lock_list *root, struct lock_class *target,
+		struct lock_list **target_entry)
 {
-	struct lock_list *entry;
+	int result;
 
-	if (lockdep_dependency_visit(source, depth))
-		return 1;
+	debug_atomic_inc(&nr_cyclic_checks);
 
-	debug_atomic_inc(&nr_cyclic_check_recursions);
-	if (depth > max_recursion_depth)
-		max_recursion_depth = depth;
-	if (depth >= RECURSION_LIMIT)
-		return print_infinite_recursion_bug();
-	/*
-	 * Check this lock's dependency list:
-	 */
-	list_for_each_entry(entry, &source->locks_after, entry) {
-		if (entry->class == hlock_class(check_target))
-			return 2;
-		debug_atomic_inc(&nr_cyclic_checks);
-		if (check_noncircular(entry->class, depth+1) == 2)
-			return 2;
-	}
-	return 1;
-}
+	result = __bfs_forward(root, target, class_equal, target_entry);
 
+	return result;
+}
 
 #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
 /*
@@ -1586,6 +1567,8 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 {
 	struct lock_list *entry;
 	int ret;
+	struct lock_list this;
+	struct lock_list *uninitialized_var(target_entry);
 
 	/*
 	 * Prove that the new <prev> -> <next> dependency would not
@@ -1596,11 +1579,13 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 	 * We are using global variables to control the recursion, to
 	 * keep the stackframe size of the recursive functions low:
 	 */
-	check_source = next;
-	check_target = prev;
-
-	if (check_noncircular(hlock_class(next), 0) == 2)
-		return print_circular_bug();
+	this.class = hlock_class(next);
+	this.parent = NULL;
+	ret = check_noncircular(&this, hlock_class(prev), &target_entry);
+	if (unlikely(!ret))
+		return print_circular_bug(&this, target_entry, next, prev);
+	else if (unlikely(ret < 0))
+		return print_bfs_bug(ret);
 
 	if (!check_prev_add_irq(curr, prev, next))
 		return 0;
-- 
cgit v1.2.3


From d7aaba140a09b7a2295aec061629c880f088de3d Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Thu, 16 Jul 2009 15:44:29 +0200
Subject: lockdep: Implement find_usage_*wards by BFS

This patch uses BFS to implement find_usage_*wards(),which
was originally writen by DFS.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1246201486-7308-6-git-send-email-tom.leiming@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 180 ++++++++++++++++++++++---------------------------------
 1 file changed, 72 insertions(+), 108 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 5609d309d56..b3ade507c6c 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -963,7 +963,7 @@ exit:
 	return ret;
 }
 
-static inline int __bfs_forward(struct lock_list *src_entry,
+static inline int __bfs_forwards(struct lock_list *src_entry,
 			void *data,
 			int (*match)(struct lock_list *entry, void *data),
 			struct lock_list **target_entry)
@@ -972,7 +972,7 @@ static inline int __bfs_forward(struct lock_list *src_entry,
 
 }
 
-static inline int __bfs_backward(struct lock_list *src_entry,
+static inline int __bfs_backwards(struct lock_list *src_entry,
 			void *data,
 			int (*match)(struct lock_list *entry, void *data),
 			struct lock_list **target_entry)
@@ -1085,18 +1085,6 @@ static noinline int print_bfs_bug(int ret)
 	return 0;
 }
 
-#define RECURSION_LIMIT 40
-
-static int noinline print_infinite_recursion_bug(void)
-{
-	if (!debug_locks_off_graph_unlock())
-		return 0;
-
-	WARN_ON(1);
-
-	return 0;
-}
-
 unsigned long __lockdep_count_forward_deps(struct lock_class *class,
 					   unsigned int depth)
 {
@@ -1170,7 +1158,7 @@ check_noncircular(struct lock_list *root, struct lock_class *target,
 
 	debug_atomic_inc(&nr_cyclic_checks);
 
-	result = __bfs_forward(root, target, class_equal, target_entry);
+	result = __bfs_forwards(root, target, class_equal, target_entry);
 
 	return result;
 }
@@ -1181,101 +1169,70 @@ check_noncircular(struct lock_list *root, struct lock_class *target,
  * proving that two subgraphs can be connected by a new dependency
  * without creating any illegal irq-safe -> irq-unsafe lock dependency.
  */
-static enum lock_usage_bit find_usage_bit;
 static struct lock_class *forwards_match, *backwards_match;
 
+
+#define   BFS_PROCESS_RET(ret)	do { \
+					if (ret < 0) \
+						return print_bfs_bug(ret); \
+					if (ret == 1) \
+						return 1; \
+				} while (0)
+
+static inline int usage_match(struct lock_list *entry, void *bit)
+{
+	return entry->class->usage_mask & (1 << (enum lock_usage_bit)bit);
+}
+
+
+
 /*
  * Find a node in the forwards-direction dependency sub-graph starting
- * at <source> that matches <find_usage_bit>.
+ * at @root->class that matches @bit.
  *
- * Return 2 if such a node exists in the subgraph, and put that node
- * into <forwards_match>.
+ * Return 0 if such a node exists in the subgraph, and put that node
+ * into *@target_entry.
  *
- * Return 1 otherwise and keep <forwards_match> unchanged.
- * Return 0 on error.
+ * Return 1 otherwise and keep *@target_entry unchanged.
+ * Return <0 on error.
  */
-static noinline int
-find_usage_forwards(struct lock_class *source, unsigned int depth)
+static int
+find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit,
+			struct lock_list **target_entry)
 {
-	struct lock_list *entry;
-	int ret;
-
-	if (lockdep_dependency_visit(source, depth))
-		return 1;
-
-	if (depth > max_recursion_depth)
-		max_recursion_depth = depth;
-	if (depth >= RECURSION_LIMIT)
-		return print_infinite_recursion_bug();
+	int result;
 
 	debug_atomic_inc(&nr_find_usage_forwards_checks);
-	if (source->usage_mask & (1 << find_usage_bit)) {
-		forwards_match = source;
-		return 2;
-	}
 
-	/*
-	 * Check this lock's dependency list:
-	 */
-	list_for_each_entry(entry, &source->locks_after, entry) {
-		debug_atomic_inc(&nr_find_usage_forwards_recursions);
-		ret = find_usage_forwards(entry->class, depth+1);
-		if (ret == 2 || ret == 0)
-			return ret;
-	}
-	return 1;
+	result = __bfs_forwards(root, (void *)bit, usage_match, target_entry);
+
+	return result;
 }
 
 /*
  * Find a node in the backwards-direction dependency sub-graph starting
- * at <source> that matches <find_usage_bit>.
+ * at @root->class that matches @bit.
  *
- * Return 2 if such a node exists in the subgraph, and put that node
- * into <backwards_match>.
+ * Return 0 if such a node exists in the subgraph, and put that node
+ * into *@target_entry.
  *
- * Return 1 otherwise and keep <backwards_match> unchanged.
- * Return 0 on error.
+ * Return 1 otherwise and keep *@target_entry unchanged.
+ * Return <0 on error.
  */
-static noinline int
-find_usage_backwards(struct lock_class *source, unsigned int depth)
+static int
+find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit,
+			struct lock_list **target_entry)
 {
-	struct lock_list *entry;
-	int ret;
-
-	if (lockdep_dependency_visit(source, depth))
-		return 1;
-
-	if (!__raw_spin_is_locked(&lockdep_lock))
-		return DEBUG_LOCKS_WARN_ON(1);
-
-	if (depth > max_recursion_depth)
-		max_recursion_depth = depth;
-	if (depth >= RECURSION_LIMIT)
-		return print_infinite_recursion_bug();
+	int result;
 
 	debug_atomic_inc(&nr_find_usage_backwards_checks);
-	if (source->usage_mask & (1 << find_usage_bit)) {
-		backwards_match = source;
-		return 2;
-	}
 
-	if (!source && debug_locks_off_graph_unlock()) {
-		WARN_ON(1);
-		return 0;
-	}
+	result = __bfs_backwards(root, (void *)bit, usage_match, target_entry);
 
-	/*
-	 * Check this lock's dependency list:
-	 */
-	list_for_each_entry(entry, &source->locks_before, entry) {
-		debug_atomic_inc(&nr_find_usage_backwards_recursions);
-		ret = find_usage_backwards(entry->class, depth+1);
-		if (ret == 2 || ret == 0)
-			return ret;
-	}
-	return 1;
+	return result;
 }
 
+
 static int
 print_bad_irq_dependency(struct task_struct *curr,
 			 struct held_lock *prev,
@@ -1343,18 +1300,21 @@ check_usage(struct task_struct *curr, struct held_lock *prev,
 	    enum lock_usage_bit bit_forwards, const char *irqclass)
 {
 	int ret;
+	struct lock_list this;
+	struct lock_list *uninitialized_var(target_entry);
+
+	this.parent = NULL;
+
+	this.class = hlock_class(prev);
+	ret = find_usage_backwards(&this, bit_backwards, &target_entry);
+	BFS_PROCESS_RET(ret);
+	backwards_match = target_entry->class;
+
+	this.class = hlock_class(next);
+	ret = find_usage_forwards(&this, bit_forwards, &target_entry);
+	BFS_PROCESS_RET(ret);
+	forwards_match = target_entry->class;
 
-	find_usage_bit = bit_backwards;
-	/* fills in <backwards_match> */
-	ret = find_usage_backwards(hlock_class(prev), 0);
-	if (!ret || ret == 1)
-		return ret;
-
-	find_usage_bit = bit_forwards;
-	ret = find_usage_forwards(hlock_class(next), 0);
-	if (!ret || ret == 1)
-		return ret;
-	/* ret == 2 */
 	return print_bad_irq_dependency(curr, prev, next,
 			bit_backwards, bit_forwards, irqclass);
 }
@@ -2029,14 +1989,16 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this,
 		     enum lock_usage_bit bit, const char *irqclass)
 {
 	int ret;
+	struct lock_list root;
+	struct lock_list *uninitialized_var(target_entry);
 
-	find_usage_bit = bit;
-	/* fills in <forwards_match> */
-	ret = find_usage_forwards(hlock_class(this), 0);
-	if (!ret || ret == 1)
-		return ret;
+	root.parent = NULL;
+	root.class = hlock_class(this);
+	ret = find_usage_forwards(&root, bit, &target_entry);
+	BFS_PROCESS_RET(ret);
 
-	return print_irq_inversion_bug(curr, forwards_match, this, 1, irqclass);
+	return print_irq_inversion_bug(curr, target_entry->class,
+					this, 1, irqclass);
 }
 
 /*
@@ -2048,14 +2010,16 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
 		      enum lock_usage_bit bit, const char *irqclass)
 {
 	int ret;
+	struct lock_list root;
+	struct lock_list *uninitialized_var(target_entry);
 
-	find_usage_bit = bit;
-	/* fills in <backwards_match> */
-	ret = find_usage_backwards(hlock_class(this), 0);
-	if (!ret || ret == 1)
-		return ret;
+	root.parent = NULL;
+	root.class = hlock_class(this);
+	ret = find_usage_backwards(&root, bit, &target_entry);
+	BFS_PROCESS_RET(ret);
 
-	return print_irq_inversion_bug(curr, backwards_match, this, 0, irqclass);
+	return print_irq_inversion_bug(curr, target_entry->class,
+					this, 1, irqclass);
 }
 
 void print_irqtrace_events(struct task_struct *curr)
-- 
cgit v1.2.3


From 24208ca76707581a097c01a73fd63781e73d3404 Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Thu, 16 Jul 2009 15:44:29 +0200
Subject: lockdep: Introduce print_shortest_lock_dependencies

Since the shortest lock dependencies' path may be obtained by BFS,
we print the shortest one by print_shortest_lock_dependencies().

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1246201486-7308-7-git-send-email-tom.leiming@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c           | 93 +++++++++++++++++++++++++++++++++-------------
 kernel/lockdep_internals.h |  4 +-
 2 files changed, 69 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index b3ade507c6c..938dc500f1a 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -575,6 +575,36 @@ static void print_lock_class_header(struct lock_class *class, int depth)
 	print_ip_sym((unsigned long)class->key);
 }
 
+/*
+ * printk the shortest lock dependencies from @start to @end in reverse order:
+ */
+static void __used
+print_shortest_lock_dependencies(struct lock_list *leaf,
+				struct lock_list *root)
+{
+	struct lock_list *entry = leaf;
+	int depth;
+
+	/*compute depth from generated tree by BFS*/
+	depth = get_lock_depth(leaf);
+
+	do {
+		print_lock_class_header(entry->class, depth);
+		printk("%*s ... acquired at:\n", depth, "");
+		print_stack_trace(&entry->trace, 2);
+		printk("\n");
+
+		if (depth == 0 && (entry != root)) {
+			printk("lockdep:%s bad BFS generated tree\n", __func__);
+			break;
+		}
+
+		entry = get_lock_parent(entry);
+		depth--;
+	} while (entry && (depth >= 0));
+
+	return;
+}
 /*
  * printk all lock dependencies starting at <entry>:
  */
@@ -992,7 +1022,7 @@ static inline int __bfs_backwards(struct lock_list *src_entry,
  * has been detected):
  */
 static noinline int
-print_circular_bug_entry(struct lock_list *target, unsigned int depth)
+print_circular_bug_entry(struct lock_list *target, int depth)
 {
 	if (debug_locks_silent)
 		return 0;
@@ -1047,7 +1077,7 @@ static noinline int print_circular_bug(struct lock_list *this,
 {
 	struct task_struct *curr = current;
 	struct lock_list *parent;
-	unsigned long depth;
+	int depth;
 
 	if (!debug_locks_off_graph_unlock() || debug_locks_silent)
 		return 0;
@@ -1169,7 +1199,6 @@ check_noncircular(struct lock_list *root, struct lock_class *target,
  * proving that two subgraphs can be connected by a new dependency
  * without creating any illegal irq-safe -> irq-unsafe lock dependency.
  */
-static struct lock_class *forwards_match, *backwards_match;
 
 
 #define   BFS_PROCESS_RET(ret)	do { \
@@ -1235,6 +1264,10 @@ find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit,
 
 static int
 print_bad_irq_dependency(struct task_struct *curr,
+			 struct lock_list *prev_root,
+			 struct lock_list *next_root,
+			 struct lock_list *backwards_entry,
+			 struct lock_list *forwards_entry,
 			 struct held_lock *prev,
 			 struct held_lock *next,
 			 enum lock_usage_bit bit1,
@@ -1267,26 +1300,32 @@ print_bad_irq_dependency(struct task_struct *curr,
 
 	printk("\nbut this new dependency connects a %s-irq-safe lock:\n",
 		irqclass);
-	print_lock_name(backwards_match);
+	print_lock_name(backwards_entry->class);
 	printk("\n... which became %s-irq-safe at:\n", irqclass);
 
-	print_stack_trace(backwards_match->usage_traces + bit1, 1);
+	print_stack_trace(backwards_entry->class->usage_traces + bit1, 1);
 
 	printk("\nto a %s-irq-unsafe lock:\n", irqclass);
-	print_lock_name(forwards_match);
+	print_lock_name(forwards_entry->class);
 	printk("\n... which became %s-irq-unsafe at:\n", irqclass);
 	printk("...");
 
-	print_stack_trace(forwards_match->usage_traces + bit2, 1);
+	print_stack_trace(forwards_entry->class->usage_traces + bit2, 1);
 
 	printk("\nother info that might help us debug this:\n\n");
 	lockdep_print_held_locks(curr);
 
-	printk("\nthe %s-irq-safe lock's dependencies:\n", irqclass);
-	print_lock_dependencies(backwards_match, 0);
+	printk("\nthe dependencies between %s-irq-safe lock", irqclass);
+	printk(" and the holding lock:\n");
+	if (!save_trace(&prev_root->trace))
+		return 0;
+	print_shortest_lock_dependencies(backwards_entry, prev_root);
 
-	printk("\nthe %s-irq-unsafe lock's dependencies:\n", irqclass);
-	print_lock_dependencies(forwards_match, 0);
+	printk("\nthe dependencies between the lock to be acquired");
+	printk(" and %s-irq-unsafe lock:\n", irqclass);
+	if (!save_trace(&next_root->trace))
+		return 0;
+	print_shortest_lock_dependencies(forwards_entry, next_root);
 
 	printk("\nstack backtrace:\n");
 	dump_stack();
@@ -1300,22 +1339,24 @@ check_usage(struct task_struct *curr, struct held_lock *prev,
 	    enum lock_usage_bit bit_forwards, const char *irqclass)
 {
 	int ret;
-	struct lock_list this;
+	struct lock_list this, that;
 	struct lock_list *uninitialized_var(target_entry);
+	struct lock_list *uninitialized_var(target_entry1);
 
 	this.parent = NULL;
 
 	this.class = hlock_class(prev);
 	ret = find_usage_backwards(&this, bit_backwards, &target_entry);
 	BFS_PROCESS_RET(ret);
-	backwards_match = target_entry->class;
 
-	this.class = hlock_class(next);
-	ret = find_usage_forwards(&this, bit_forwards, &target_entry);
+	that.parent = NULL;
+	that.class = hlock_class(next);
+	ret = find_usage_forwards(&that, bit_forwards, &target_entry1);
 	BFS_PROCESS_RET(ret);
-	forwards_match = target_entry->class;
 
-	return print_bad_irq_dependency(curr, prev, next,
+	return print_bad_irq_dependency(curr, &this, &that,
+			target_entry, target_entry1,
+			prev, next,
 			bit_backwards, bit_forwards, irqclass);
 }
 
@@ -1944,7 +1985,8 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
  * print irq inversion bug:
  */
 static int
-print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
+print_irq_inversion_bug(struct task_struct *curr,
+			struct lock_list *root, struct lock_list *other,
 			struct held_lock *this, int forwards,
 			const char *irqclass)
 {
@@ -1962,17 +2004,16 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
 		printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass);
 	else
 		printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass);
-	print_lock_name(other);
+	print_lock_name(other->class);
 	printk("\n\nand interrupts could create inverse lock ordering between them.\n\n");
 
 	printk("\nother info that might help us debug this:\n");
 	lockdep_print_held_locks(curr);
 
-	printk("\nthe first lock's dependencies:\n");
-	print_lock_dependencies(hlock_class(this), 0);
-
-	printk("\nthe second lock's dependencies:\n");
-	print_lock_dependencies(other, 0);
+	printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n");
+	if (!save_trace(&root->trace))
+		return 0;
+	print_shortest_lock_dependencies(other, root);
 
 	printk("\nstack backtrace:\n");
 	dump_stack();
@@ -1997,7 +2038,7 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this,
 	ret = find_usage_forwards(&root, bit, &target_entry);
 	BFS_PROCESS_RET(ret);
 
-	return print_irq_inversion_bug(curr, target_entry->class,
+	return print_irq_inversion_bug(curr, &root, target_entry,
 					this, 1, irqclass);
 }
 
@@ -2018,7 +2059,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
 	ret = find_usage_backwards(&root, bit, &target_entry);
 	BFS_PROCESS_RET(ret);
 
-	return print_irq_inversion_bug(curr, target_entry->class,
+	return print_irq_inversion_bug(curr, &root, target_entry,
 					this, 1, irqclass);
 }
 
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index c2f6594966f..b115aaa0bf3 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -219,9 +219,9 @@ static inline struct lock_list *get_lock_parent(struct lock_list *child)
 	return child->parent;
 }
 
-static inline unsigned long get_lock_depth(struct lock_list *child)
+static inline int get_lock_depth(struct lock_list *child)
 {
-	unsigned long depth = 0;
+	int depth = 0;
 	struct lock_list *parent;
 
 	while ((parent = get_lock_parent(child))) {
-- 
cgit v1.2.3


From ef681026ff85c49b7399ceec3eb62bbbcce605e8 Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Thu, 16 Jul 2009 15:44:29 +0200
Subject: lockdep: Implement lockdep_count_*ward_deps by BFS

Implement lockdep_count_{for,back}ward using BFS.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1246201486-7308-8-git-send-email-tom.leiming@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 52 +++++++++++++++++++++++++---------------------------
 1 file changed, 25 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 938dc500f1a..b896f23e00c 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -1115,61 +1115,59 @@ static noinline int print_bfs_bug(int ret)
 	return 0;
 }
 
-unsigned long __lockdep_count_forward_deps(struct lock_class *class,
-					   unsigned int depth)
+static int noop_count(struct lock_list *entry, void *data)
 {
-	struct lock_list *entry;
-	unsigned long ret = 1;
+	(*(unsigned long *)data)++;
+	return 0;
+}
 
-	if (lockdep_dependency_visit(class, depth))
-		return 0;
+unsigned long __lockdep_count_forward_deps(struct lock_list *this)
+{
+	unsigned long  count = 0;
+	struct lock_list *uninitialized_var(target_entry);
 
-	/*
-	 * Recurse this class's dependency list:
-	 */
-	list_for_each_entry(entry, &class->locks_after, entry)
-		ret += __lockdep_count_forward_deps(entry->class, depth + 1);
+	__bfs_forwards(this, (void *)&count, noop_count, &target_entry);
 
-	return ret;
+	return count;
 }
-
 unsigned long lockdep_count_forward_deps(struct lock_class *class)
 {
 	unsigned long ret, flags;
+	struct lock_list this;
+
+	this.parent = NULL;
+	this.class = class;
 
 	local_irq_save(flags);
 	__raw_spin_lock(&lockdep_lock);
-	ret = __lockdep_count_forward_deps(class, 0);
+	ret = __lockdep_count_forward_deps(&this);
 	__raw_spin_unlock(&lockdep_lock);
 	local_irq_restore(flags);
 
 	return ret;
 }
 
-unsigned long __lockdep_count_backward_deps(struct lock_class *class,
-					    unsigned int depth)
+unsigned long __lockdep_count_backward_deps(struct lock_list *this)
 {
-	struct lock_list *entry;
-	unsigned long ret = 1;
+	unsigned long  count = 0;
+	struct lock_list *uninitialized_var(target_entry);
 
-	if (lockdep_dependency_visit(class, depth))
-		return 0;
-	/*
-	 * Recurse this class's dependency list:
-	 */
-	list_for_each_entry(entry, &class->locks_before, entry)
-		ret += __lockdep_count_backward_deps(entry->class, depth + 1);
+	__bfs_backwards(this, (void *)&count, noop_count, &target_entry);
 
-	return ret;
+	return count;
 }
 
 unsigned long lockdep_count_backward_deps(struct lock_class *class)
 {
 	unsigned long ret, flags;
+	struct lock_list this;
+
+	this.parent = NULL;
+	this.class = class;
 
 	local_irq_save(flags);
 	__raw_spin_lock(&lockdep_lock);
-	ret = __lockdep_count_backward_deps(class, 0);
+	ret = __lockdep_count_backward_deps(&this);
 	__raw_spin_unlock(&lockdep_lock);
 	local_irq_restore(flags);
 
-- 
cgit v1.2.3


From 4dd861d6467007681991d8ec079d928db2018cbb Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Thu, 16 Jul 2009 15:44:29 +0200
Subject: lockdep: Update memory usage introduced by BFS

Also account the BFS memory usage.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
[ fix build for !PROVE_LOCKING ]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1246201486-7308-9-git-send-email-tom.leiming@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index b896f23e00c..6358cf7e84b 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -3429,7 +3429,11 @@ void __init lockdep_info(void)
 		sizeof(struct list_head) * CLASSHASH_SIZE +
 		sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES +
 		sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS +
-		sizeof(struct list_head) * CHAINHASH_SIZE) / 1024);
+		sizeof(struct list_head) * CHAINHASH_SIZE) / 1024
+#ifdef CONFIG_PROVE_LOCKING
+		+ sizeof(struct circular_queue) + sizeof(bfs_accessed)
+#endif
+		);
 
 	printk(" per task-struct memory footprint: %lu bytes\n",
 		sizeof(struct held_lock) * MAX_LOCK_DEPTH);
-- 
cgit v1.2.3


From 12f3dfd022d7e616757a94f0538d3d525d806a16 Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Thu, 16 Jul 2009 15:44:29 +0200
Subject: lockdep: Add statistics info for max bfs queue depth

Add BFS statistics to the existing lockdep stats.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1246201486-7308-10-git-send-email-tom.leiming@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c           | 6 +++++-
 kernel/lockdep_internals.h | 3 ++-
 kernel/lockdep_proc.c      | 2 ++
 3 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 6358cf7e84b..744da6265d9 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -929,7 +929,7 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
 
 unsigned long bfs_accessed[BITS_TO_LONGS(MAX_LOCKDEP_ENTRIES)];
 static struct circular_queue  lock_cq;
-
+unsigned int max_bfs_queue_depth;
 static int __bfs(struct lock_list *source_entry,
 			void *data,
 			int (*match)(struct lock_list *entry, void *data),
@@ -975,6 +975,7 @@ static int __bfs(struct lock_list *source_entry,
 
 		list_for_each_entry(entry, head, entry) {
 			if (!lock_accessed(entry)) {
+				unsigned int cq_depth;
 				mark_lock_accessed(entry, lock);
 				if (match(entry, data)) {
 					*target_entry = entry;
@@ -986,6 +987,9 @@ static int __bfs(struct lock_list *source_entry,
 					ret = -1;
 					goto exit;
 				}
+				cq_depth = __cq_get_elem_count(cq);
+				if (max_bfs_queue_depth < cq_depth)
+					max_bfs_queue_depth = cq_depth;
 			}
 		}
 	}
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index b115aaa0bf3..6baa8807efd 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -138,6 +138,7 @@ extern atomic_t nr_find_usage_backwards_recursions;
 #endif
 
 
+extern unsigned int max_bfs_queue_depth;
 extern unsigned long nr_list_entries;
 extern struct lock_list list_entries[MAX_LOCKDEP_ENTRIES];
 extern unsigned long bfs_accessed[];
@@ -191,7 +192,7 @@ static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem)
 	return 0;
 }
 
-static inline int __cq_get_elem_count(struct circular_queue *cq)
+static inline unsigned int  __cq_get_elem_count(struct circular_queue *cq)
 {
 	return (cq->rear - cq->front)&(MAX_CIRCULAR_QUE_SIZE-1);
 }
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index d7135aa2d2c..9a1bf34d2ff 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -411,6 +411,8 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
 			max_lockdep_depth);
 	seq_printf(m, " max recursion depth:           %11u\n",
 			max_recursion_depth);
+	seq_printf(m, " max bfs queue depth:           %11u\n",
+			max_bfs_queue_depth);
 	lockdep_stats_debug_show(m);
 	seq_printf(m, " debug_locks:                   %11u\n",
 			debug_locks);
-- 
cgit v1.2.3


From af012961450949ea297b209e091bd1a3805b8a0a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 16 Jul 2009 15:44:29 +0200
Subject: lockdep: BFS cleanup

Some cleanups of the lockdep code after the BFS series:

 - Remove the last traces of the generation id
 - Fixup comment style
 - Move the bfs routines into lockdep.c
 - Cleanup the bfs routines

[ tom.leiming@gmail.com: Fix crash ]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1246201486-7308-11-git-send-email-tom.leiming@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c           | 284 +++++++++++++++++++++++++++------------------
 kernel/lockdep_internals.h |  97 +---------------
 2 files changed, 172 insertions(+), 209 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 744da6265d9..1cedb00e3e7 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -43,6 +43,7 @@
 #include <linux/ftrace.h>
 #include <linux/stringify.h>
 #include <linux/bitops.h>
+
 #include <asm/sections.h>
 
 #include "lockdep_internals.h"
@@ -118,7 +119,7 @@ static inline int debug_locks_off_graph_unlock(void)
 static int lockdep_initialized;
 
 unsigned long nr_list_entries;
-struct lock_list list_entries[MAX_LOCKDEP_ENTRIES];
+static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES];
 
 /*
  * All data structures here are protected by the global debug_lock.
@@ -390,19 +391,6 @@ unsigned int nr_process_chains;
 unsigned int max_lockdep_depth;
 unsigned int max_recursion_depth;
 
-static unsigned int lockdep_dependency_gen_id;
-
-static bool lockdep_dependency_visit(struct lock_class *source,
-				     unsigned int depth)
-{
-	if (!depth)
-		lockdep_dependency_gen_id++;
-	if (source->dep_gen_id == lockdep_dependency_gen_id)
-		return true;
-	source->dep_gen_id = lockdep_dependency_gen_id;
-	return false;
-}
-
 #ifdef CONFIG_DEBUG_LOCKDEP
 /*
  * We cannot printk in early bootup code. Not even early_printk()
@@ -551,88 +539,6 @@ static void lockdep_print_held_locks(struct task_struct *curr)
 	}
 }
 
-static void print_lock_class_header(struct lock_class *class, int depth)
-{
-	int bit;
-
-	printk("%*s->", depth, "");
-	print_lock_name(class);
-	printk(" ops: %lu", class->ops);
-	printk(" {\n");
-
-	for (bit = 0; bit < LOCK_USAGE_STATES; bit++) {
-		if (class->usage_mask & (1 << bit)) {
-			int len = depth;
-
-			len += printk("%*s   %s", depth, "", usage_str[bit]);
-			len += printk(" at:\n");
-			print_stack_trace(class->usage_traces + bit, len);
-		}
-	}
-	printk("%*s }\n", depth, "");
-
-	printk("%*s ... key      at: ",depth,"");
-	print_ip_sym((unsigned long)class->key);
-}
-
-/*
- * printk the shortest lock dependencies from @start to @end in reverse order:
- */
-static void __used
-print_shortest_lock_dependencies(struct lock_list *leaf,
-				struct lock_list *root)
-{
-	struct lock_list *entry = leaf;
-	int depth;
-
-	/*compute depth from generated tree by BFS*/
-	depth = get_lock_depth(leaf);
-
-	do {
-		print_lock_class_header(entry->class, depth);
-		printk("%*s ... acquired at:\n", depth, "");
-		print_stack_trace(&entry->trace, 2);
-		printk("\n");
-
-		if (depth == 0 && (entry != root)) {
-			printk("lockdep:%s bad BFS generated tree\n", __func__);
-			break;
-		}
-
-		entry = get_lock_parent(entry);
-		depth--;
-	} while (entry && (depth >= 0));
-
-	return;
-}
-/*
- * printk all lock dependencies starting at <entry>:
- */
-static void __used
-print_lock_dependencies(struct lock_class *class, int depth)
-{
-	struct lock_list *entry;
-
-	if (lockdep_dependency_visit(class, depth))
-		return;
-
-	if (DEBUG_LOCKS_WARN_ON(depth >= 20))
-		return;
-
-	print_lock_class_header(class, depth);
-
-	list_for_each_entry(entry, &class->locks_after, entry) {
-		if (DEBUG_LOCKS_WARN_ON(!entry->class))
-			return;
-
-		print_lock_dependencies(entry->class, depth + 1);
-
-		printk("%*s ... acquired at:\n",depth,"");
-		print_stack_trace(&entry->trace, 2);
-		printk("\n");
-	}
-}
-
 static void print_kernel_version(void)
 {
 	printk("%s %.*s\n", init_utsname()->release,
@@ -927,14 +833,106 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
 	return 1;
 }
 
-unsigned long bfs_accessed[BITS_TO_LONGS(MAX_LOCKDEP_ENTRIES)];
-static struct circular_queue  lock_cq;
+/*For good efficiency of modular, we use power of 2*/
+#define MAX_CIRCULAR_QUEUE_SIZE		4096UL
+#define CQ_MASK				(MAX_CIRCULAR_QUEUE_SIZE-1)
+
+/* The circular_queue and helpers is used to implement the
+ * breadth-first search(BFS)algorithem, by which we can build
+ * the shortest path from the next lock to be acquired to the
+ * previous held lock if there is a circular between them.
+ * */
+struct circular_queue {
+	unsigned long element[MAX_CIRCULAR_QUEUE_SIZE];
+	unsigned int  front, rear;
+};
+
+static struct circular_queue lock_cq;
+static unsigned long bfs_accessed[BITS_TO_LONGS(MAX_LOCKDEP_ENTRIES)];
+
 unsigned int max_bfs_queue_depth;
+
+static inline void __cq_init(struct circular_queue *cq)
+{
+	cq->front = cq->rear = 0;
+	bitmap_zero(bfs_accessed, MAX_LOCKDEP_ENTRIES);
+}
+
+static inline int __cq_empty(struct circular_queue *cq)
+{
+	return (cq->front == cq->rear);
+}
+
+static inline int __cq_full(struct circular_queue *cq)
+{
+	return ((cq->rear + 1) & CQ_MASK) == cq->front;
+}
+
+static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem)
+{
+	if (__cq_full(cq))
+		return -1;
+
+	cq->element[cq->rear] = elem;
+	cq->rear = (cq->rear + 1) & CQ_MASK;
+	return 0;
+}
+
+static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem)
+{
+	if (__cq_empty(cq))
+		return -1;
+
+	*elem = cq->element[cq->front];
+	cq->front = (cq->front + 1) & CQ_MASK;
+	return 0;
+}
+
+static inline unsigned int  __cq_get_elem_count(struct circular_queue *cq)
+{
+	return (cq->rear - cq->front) & CQ_MASK;
+}
+
+static inline void mark_lock_accessed(struct lock_list *lock,
+					struct lock_list *parent)
+{
+	unsigned long nr;
+	nr = lock - list_entries;
+	WARN_ON(nr >= nr_list_entries);
+	lock->parent = parent;
+	set_bit(nr, bfs_accessed);
+}
+
+static inline unsigned long lock_accessed(struct lock_list *lock)
+{
+	unsigned long nr;
+	nr = lock - list_entries;
+	WARN_ON(nr >= nr_list_entries);
+	return test_bit(nr, bfs_accessed);
+}
+
+static inline struct lock_list *get_lock_parent(struct lock_list *child)
+{
+	return child->parent;
+}
+
+static inline int get_lock_depth(struct lock_list *child)
+{
+	int depth = 0;
+	struct lock_list *parent;
+
+	while ((parent = get_lock_parent(child))) {
+		child = parent;
+		depth++;
+	}
+	return depth;
+}
+
 static int __bfs(struct lock_list *source_entry,
-			void *data,
-			int (*match)(struct lock_list *entry, void *data),
-			struct lock_list **target_entry,
-			int forward)
+		 void *data,
+		 int (*match)(struct lock_list *entry, void *data),
+		 struct lock_list **target_entry,
+		 int forward)
 {
 	struct lock_list *entry;
 	struct list_head *head;
@@ -1202,14 +1200,6 @@ check_noncircular(struct lock_list *root, struct lock_class *target,
  * without creating any illegal irq-safe -> irq-unsafe lock dependency.
  */
 
-
-#define   BFS_PROCESS_RET(ret)	do { \
-					if (ret < 0) \
-						return print_bfs_bug(ret); \
-					if (ret == 1) \
-						return 1; \
-				} while (0)
-
 static inline int usage_match(struct lock_list *entry, void *bit)
 {
 	return entry->class->usage_mask & (1 << (enum lock_usage_bit)bit);
@@ -1263,6 +1253,60 @@ find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit,
 	return result;
 }
 
+static void print_lock_class_header(struct lock_class *class, int depth)
+{
+	int bit;
+
+	printk("%*s->", depth, "");
+	print_lock_name(class);
+	printk(" ops: %lu", class->ops);
+	printk(" {\n");
+
+	for (bit = 0; bit < LOCK_USAGE_STATES; bit++) {
+		if (class->usage_mask & (1 << bit)) {
+			int len = depth;
+
+			len += printk("%*s   %s", depth, "", usage_str[bit]);
+			len += printk(" at:\n");
+			print_stack_trace(class->usage_traces + bit, len);
+		}
+	}
+	printk("%*s }\n", depth, "");
+
+	printk("%*s ... key      at: ",depth,"");
+	print_ip_sym((unsigned long)class->key);
+}
+
+/*
+ * printk the shortest lock dependencies from @start to @end in reverse order:
+ */
+static void __used
+print_shortest_lock_dependencies(struct lock_list *leaf,
+				struct lock_list *root)
+{
+	struct lock_list *entry = leaf;
+	int depth;
+
+	/*compute depth from generated tree by BFS*/
+	depth = get_lock_depth(leaf);
+
+	do {
+		print_lock_class_header(entry->class, depth);
+		printk("%*s ... acquired at:\n", depth, "");
+		print_stack_trace(&entry->trace, 2);
+		printk("\n");
+
+		if (depth == 0 && (entry != root)) {
+			printk("lockdep:%s bad BFS generated tree\n", __func__);
+			break;
+		}
+
+		entry = get_lock_parent(entry);
+		depth--;
+	} while (entry && (depth >= 0));
+
+	return;
+}
 
 static int
 print_bad_irq_dependency(struct task_struct *curr,
@@ -1349,12 +1393,18 @@ check_usage(struct task_struct *curr, struct held_lock *prev,
 
 	this.class = hlock_class(prev);
 	ret = find_usage_backwards(&this, bit_backwards, &target_entry);
-	BFS_PROCESS_RET(ret);
+	if (ret < 0)
+		return print_bfs_bug(ret);
+	if (ret == 1)
+		return ret;
 
 	that.parent = NULL;
 	that.class = hlock_class(next);
 	ret = find_usage_forwards(&that, bit_forwards, &target_entry1);
-	BFS_PROCESS_RET(ret);
+	if (ret < 0)
+		return print_bfs_bug(ret);
+	if (ret == 1)
+		return ret;
 
 	return print_bad_irq_dependency(curr, &this, &that,
 			target_entry, target_entry1,
@@ -2038,7 +2088,10 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this,
 	root.parent = NULL;
 	root.class = hlock_class(this);
 	ret = find_usage_forwards(&root, bit, &target_entry);
-	BFS_PROCESS_RET(ret);
+	if (ret < 0)
+		return print_bfs_bug(ret);
+	if (ret == 1)
+		return ret;
 
 	return print_irq_inversion_bug(curr, &root, target_entry,
 					this, 1, irqclass);
@@ -2059,7 +2112,10 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
 	root.parent = NULL;
 	root.class = hlock_class(this);
 	ret = find_usage_backwards(&root, bit, &target_entry);
-	BFS_PROCESS_RET(ret);
+	if (ret < 0)
+		return print_bfs_bug(ret);
+	if (ret == 1)
+		return ret;
 
 	return print_irq_inversion_bug(curr, &root, target_entry,
 					this, 1, irqclass);
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index 6baa8807efd..a2ee95ad131 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -91,6 +91,8 @@ extern unsigned int nr_process_chains;
 extern unsigned int max_lockdep_depth;
 extern unsigned int max_recursion_depth;
 
+extern unsigned int max_bfs_queue_depth;
+
 #ifdef CONFIG_PROVE_LOCKING
 extern unsigned long lockdep_count_forward_deps(struct lock_class *);
 extern unsigned long lockdep_count_backward_deps(struct lock_class *);
@@ -136,98 +138,3 @@ extern atomic_t nr_find_usage_backwards_recursions;
 # define debug_atomic_dec(ptr)		do { } while (0)
 # define debug_atomic_read(ptr)		0
 #endif
-
-
-extern unsigned int max_bfs_queue_depth;
-extern unsigned long nr_list_entries;
-extern struct lock_list list_entries[MAX_LOCKDEP_ENTRIES];
-extern unsigned long bfs_accessed[];
-
-/*For good efficiency of modular, we use power of 2*/
-#define  MAX_CIRCULAR_QUE_SIZE	    4096UL
-
-/* The circular_queue and helpers is used to implement the
- * breadth-first search(BFS)algorithem, by which we can build
- * the shortest path from the next lock to be acquired to the
- * previous held lock if there is a circular between them.
- * */
-struct circular_queue{
-	unsigned long element[MAX_CIRCULAR_QUE_SIZE];
-	unsigned int  front, rear;
-};
-
-static inline void __cq_init(struct circular_queue *cq)
-{
-	cq->front = cq->rear = 0;
-	bitmap_zero(bfs_accessed, MAX_LOCKDEP_ENTRIES);
-}
-
-static inline int __cq_empty(struct circular_queue *cq)
-{
-	return (cq->front == cq->rear);
-}
-
-static inline int __cq_full(struct circular_queue *cq)
-{
-	return ((cq->rear + 1)&(MAX_CIRCULAR_QUE_SIZE-1))  == cq->front;
-}
-
-static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem)
-{
-	if (__cq_full(cq))
-		return -1;
-
-	cq->element[cq->rear] = elem;
-	cq->rear = (cq->rear + 1)&(MAX_CIRCULAR_QUE_SIZE-1);
-	return 0;
-}
-
-static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem)
-{
-	if (__cq_empty(cq))
-		return -1;
-
-	*elem = cq->element[cq->front];
-	cq->front = (cq->front + 1)&(MAX_CIRCULAR_QUE_SIZE-1);
-	return 0;
-}
-
-static inline unsigned int  __cq_get_elem_count(struct circular_queue *cq)
-{
-	return (cq->rear - cq->front)&(MAX_CIRCULAR_QUE_SIZE-1);
-}
-
-static inline void mark_lock_accessed(struct lock_list *lock,
-					struct lock_list *parent)
-{
-	unsigned long nr;
-	nr = lock - list_entries;
-	WARN_ON(nr >= nr_list_entries);
-	lock->parent = parent;
-	set_bit(nr, bfs_accessed);
-}
-
-static inline unsigned long lock_accessed(struct lock_list *lock)
-{
-	unsigned long nr;
-	nr = lock - list_entries;
-	WARN_ON(nr >= nr_list_entries);
-	return test_bit(nr, bfs_accessed);
-}
-
-static inline struct lock_list *get_lock_parent(struct lock_list *child)
-{
-	return child->parent;
-}
-
-static inline int get_lock_depth(struct lock_list *child)
-{
-	int depth = 0;
-	struct lock_list *parent;
-
-	while ((parent = get_lock_parent(child))) {
-		child = parent;
-		depth++;
-	}
-	return depth;
-}
-- 
cgit v1.2.3


From c1dc0b9c0c8979ce4d411caadff5c0d79dee58bc Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 2 Aug 2009 11:28:21 +0200
Subject: debug lockups: Improve lockup detection

When debugging a recent lockup bug i found various deficiencies
in how our current lockup detection helpers work:

 - SysRq-L is not very efficient as it uses a workqueue, hence
   it cannot punch through hard lockups and cannot see through
   most soft lockups either.

 - The SysRq-L code depends on the NMI watchdog - which is off
   by default.

 - We dont print backtraces from the RCU code's built-in
   'RCU state machine is stuck' debug code. This debug
   code tends to be one of the first (and only) mechanisms
   that show that a lockup has occured.

This patch changes the code so taht we:

 - Trigger the NMI backtrace code from SysRq-L instead of using
   a workqueue (which cannot punch through hard lockups)

 - Trigger print-all-CPU-backtraces from the RCU lockup detection
   code

Also decouple the backtrace printing code from the NMI watchdog:

 - Dont use variable size cpumasks (it might not be initialized
   and they are a bit more fragile anyway)

 - Trigger an NMI immediately via an IPI, instead of waiting
   for the NMI tick to occur. This is a lot faster and can
   produce more relevant backtraces. It will also work if the
   NMI watchdog is disabled.

 - Dont print the 'dazed and confused' message when we print
   a backtrace from the NMI

 - Do a show_regs() plus a dump_stack() to get maximum info
   out of the dump. Worst-case we get two stacktraces - which
   is not a big deal. Sometimes, if register content is
   corrupted, the precise stack walker in show_regs() wont
   give us a full backtrace - in this case dump_stack() will
   do it.

Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutree.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 7717b95c202..9c5fa9fc57e 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -35,6 +35,7 @@
 #include <linux/rcupdate.h>
 #include <linux/interrupt.h>
 #include <linux/sched.h>
+#include <linux/nmi.h>
 #include <asm/atomic.h>
 #include <linux/bitops.h>
 #include <linux/module.h>
@@ -469,6 +470,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
 	}
 	printk(" (detected by %d, t=%ld jiffies)\n",
 	       smp_processor_id(), (long)(jiffies - rsp->gp_start));
+	trigger_all_cpu_backtrace();
+
 	force_quiescent_state(rsp, 0);  /* Kick them all. */
 }
 
@@ -479,12 +482,14 @@ static void print_cpu_stall(struct rcu_state *rsp)
 
 	printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n",
 			smp_processor_id(), jiffies - rsp->gp_start);
-	dump_stack();
+	trigger_all_cpu_backtrace();
+
 	spin_lock_irqsave(&rnp->lock, flags);
 	if ((long)(jiffies - rsp->jiffies_stall) >= 0)
 		rsp->jiffies_stall =
 			jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
 	spin_unlock_irqrestore(&rnp->lock, flags);
+
 	set_need_resched();  /* kick ourselves to get things going. */
 }
 
-- 
cgit v1.2.3


From a5004278f0525dcb9aa43703ef77bf371ea837cd Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 27 Jul 2009 14:04:49 +0200
Subject: sched: Fix cgroup smp fairness

Commit ec4e0e2fe018992d980910db901637c814575914 ("fix
inconsistency when redistribute per-cpu tg->cfs_rq shares")
broke cgroup smp fairness.

In order to avoid starvation of newly placed tasks, we never
quite set the share of an empty cpu group-task to 0, but
instead we set it as if there's a single NICE-0 task present.

If however we actually set this in cfs_rq[cpu]->shares, that
means the total shares for that group will be slightly inflated
every time we balance, causing the observed unfairness.

Fix this by setting cfs_rq[cpu]->shares to 0 but actually
setting the effective weight of the related se to the inflated
number.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1248696557.6987.1615.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index ce1056e9b02..26976cd8be0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1523,13 +1523,18 @@ static void
 update_group_shares_cpu(struct task_group *tg, int cpu,
 			unsigned long sd_shares, unsigned long sd_rq_weight)
 {
-	unsigned long shares;
 	unsigned long rq_weight;
+	unsigned long shares;
+	int boost = 0;
 
 	if (!tg->se[cpu])
 		return;
 
 	rq_weight = tg->cfs_rq[cpu]->rq_weight;
+	if (!rq_weight) {
+		boost = 1;
+		rq_weight = NICE_0_LOAD;
+	}
 
 	/*
 	 *           \Sum shares * rq_weight
@@ -1546,8 +1551,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
 		unsigned long flags;
 
 		spin_lock_irqsave(&rq->lock, flags);
-		tg->cfs_rq[cpu]->shares = shares;
-
+		tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
 		__set_se_shares(tg->se[cpu], shares);
 		spin_unlock_irqrestore(&rq->lock, flags);
 	}
@@ -1560,7 +1564,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
  */
 static int tg_shares_up(struct task_group *tg, void *data)
 {
-	unsigned long weight, rq_weight = 0;
+	unsigned long weight, rq_weight = 0, eff_weight = 0;
 	unsigned long shares = 0;
 	struct sched_domain *sd = data;
 	int i;
@@ -1572,11 +1576,13 @@ static int tg_shares_up(struct task_group *tg, void *data)
 		 * run here it will not get delayed by group starvation.
 		 */
 		weight = tg->cfs_rq[i]->load.weight;
+		tg->cfs_rq[i]->rq_weight = weight;
+		rq_weight += weight;
+
 		if (!weight)
 			weight = NICE_0_LOAD;
 
-		tg->cfs_rq[i]->rq_weight = weight;
-		rq_weight += weight;
+		eff_weight += weight;
 		shares += tg->cfs_rq[i]->shares;
 	}
 
@@ -1586,8 +1592,14 @@ static int tg_shares_up(struct task_group *tg, void *data)
 	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
 		shares = tg->shares;
 
-	for_each_cpu(i, sched_domain_span(sd))
-		update_group_shares_cpu(tg, i, shares, rq_weight);
+	for_each_cpu(i, sched_domain_span(sd)) {
+		unsigned long sd_rq_weight = rq_weight;
+
+		if (!tg->cfs_rq[i]->rq_weight)
+			sd_rq_weight = eff_weight;
+
+		update_group_shares_cpu(tg, i, shares, sd_rq_weight);
+	}
 
 	return 0;
 }
-- 
cgit v1.2.3


From e709715915d69b6a929d77e7652c9c3fea61c317 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 3 Jun 2009 15:41:20 +0200
Subject: sched: Optimize unused cgroup configuration

When cgroup group scheduling is built in, skip some code paths
if we don't have any (but the root) cgroups configured.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 26976cd8be0..ca1f76ba777 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1629,8 +1629,14 @@ static int tg_load_down(struct task_group *tg, void *data)
 
 static void update_shares(struct sched_domain *sd)
 {
-	u64 now = cpu_clock(raw_smp_processor_id());
-	s64 elapsed = now - sd->last_update;
+	s64 elapsed;
+	u64 now;
+
+	if (root_task_group_empty())
+		return;
+
+	now = cpu_clock(raw_smp_processor_id());
+	elapsed = now - sd->last_update;
 
 	if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
 		sd->last_update = now;
@@ -1640,6 +1646,9 @@ static void update_shares(struct sched_domain *sd)
 
 static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 {
+	if (root_task_group_empty())
+		return;
+
 	spin_unlock(&rq->lock);
 	update_shares(sd);
 	spin_lock(&rq->lock);
@@ -1647,6 +1656,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 
 static void update_h_load(long cpu)
 {
+	if (root_task_group_empty())
+		return;
+
 	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
 }
 
-- 
cgit v1.2.3


From da19ab510343c6496fe8b8f890091296032025c9 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 29 Jul 2009 00:21:22 -0400
Subject: sched: Check for pushing rt tasks after all scheduling

The current method for pushing RT tasks after scheduling only
happens after a context switch. But we found cases where a task
is set up on a run queue to be pushed but the push never
happens because the schedule chooses the same task.

This bug was found with the help of Gregory Haskins and the use
of ftrace (trace_printk). It tooks several days for both of us
analyzing the code and the trace output to find this.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090729042526.205923666@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 38 +++++++++++++++++++++++++++-----------
 1 file changed, 27 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index ca1f76ba777..a030d4514cd 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2839,14 +2839,14 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
  * with the lock held can cause deadlocks; see schedule() for
  * details.)
  */
-static void finish_task_switch(struct rq *rq, struct task_struct *prev)
+static int finish_task_switch(struct rq *rq, struct task_struct *prev)
 	__releases(rq->lock)
 {
 	struct mm_struct *mm = rq->prev_mm;
 	long prev_state;
-#ifdef CONFIG_SMP
 	int post_schedule = 0;
 
+#ifdef CONFIG_SMP
 	if (current->sched_class->needs_post_schedule)
 		post_schedule = current->sched_class->needs_post_schedule(rq);
 #endif
@@ -2868,10 +2868,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	finish_arch_switch(prev);
 	perf_counter_task_sched_in(current, cpu_of(rq));
 	finish_lock_switch(rq, prev);
-#ifdef CONFIG_SMP
-	if (post_schedule)
-		current->sched_class->post_schedule(rq);
-#endif
 
 	fire_sched_in_preempt_notifiers(current);
 	if (mm)
@@ -2884,6 +2880,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 		kprobe_flush_task(prev);
 		put_task_struct(prev);
 	}
+
+	return post_schedule;
 }
 
 /**
@@ -2894,8 +2892,15 @@ asmlinkage void schedule_tail(struct task_struct *prev)
 	__releases(rq->lock)
 {
 	struct rq *rq = this_rq();
+	int post_schedule;
+
+	post_schedule = finish_task_switch(rq, prev);
+
+#ifdef CONFIG_SMP
+	if (post_schedule)
+		current->sched_class->post_schedule(rq);
+#endif
 
-	finish_task_switch(rq, prev);
 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
 	/* In this case, finish_task_switch does not reenable preemption */
 	preempt_enable();
@@ -2908,7 +2913,7 @@ asmlinkage void schedule_tail(struct task_struct *prev)
  * context_switch - switch to the new MM and the new
  * thread's register state.
  */
-static inline void
+static inline int
 context_switch(struct rq *rq, struct task_struct *prev,
 	       struct task_struct *next)
 {
@@ -2955,7 +2960,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	 * CPUs since it called schedule(), thus the 'rq' on its stack
 	 * frame will be invalid.
 	 */
-	finish_task_switch(this_rq(), prev);
+	return finish_task_switch(this_rq(), prev);
 }
 
 /*
@@ -5366,6 +5371,7 @@ asmlinkage void __sched schedule(void)
 {
 	struct task_struct *prev, *next;
 	unsigned long *switch_count;
+	int post_schedule = 0;
 	struct rq *rq;
 	int cpu;
 
@@ -5416,15 +5422,25 @@ need_resched_nonpreemptible:
 		rq->curr = next;
 		++*switch_count;
 
-		context_switch(rq, prev, next); /* unlocks the rq */
+		post_schedule = context_switch(rq, prev, next); /* unlocks the rq */
 		/*
 		 * the context switch might have flipped the stack from under
 		 * us, hence refresh the local variables.
 		 */
 		cpu = smp_processor_id();
 		rq = cpu_rq(cpu);
-	} else
+	} else {
+#ifdef CONFIG_SMP
+		if (current->sched_class->needs_post_schedule)
+			post_schedule = current->sched_class->needs_post_schedule(rq);
+#endif
 		spin_unlock_irq(&rq->lock);
+	}
+
+#ifdef CONFIG_SMP
+	if (post_schedule)
+		current->sched_class->post_schedule(rq);
+#endif
 
 	if (unlikely(reacquire_kernel_lock(current) < 0))
 		goto need_resched_nonpreemptible;
-- 
cgit v1.2.3


From c3a2ae3d93c0f10d29c071f599764d00b8de00cb Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 29 Jul 2009 00:21:23 -0400
Subject: sched: Add new prio to cpupri before removing old prio

We need to add the new prio to the cpupri accounting before
removing the old prio. This is because removing the old prio
first will open a race window where the cpu will be removed
from pri_active. In this case the cpu will not be visible for
RT push and pulls. This could cause a RT task to not migrate
appropriately, and create a very large latency.

This bug was found with the use of ftrace sched events and
trace_printk.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090729042526.438281019@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_cpupri.c | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index d014efbf947..0f052fc674d 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -127,21 +127,11 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
 
 	/*
 	 * If the cpu was currently mapped to a different value, we
-	 * first need to unmap the old value
+	 * need to map it to the new value then remove the old value.
+	 * Note, we must add the new value first, otherwise we risk the
+	 * cpu being cleared from pri_active, and this cpu could be
+	 * missed for a push or pull.
 	 */
-	if (likely(oldpri != CPUPRI_INVALID)) {
-		struct cpupri_vec *vec  = &cp->pri_to_cpu[oldpri];
-
-		spin_lock_irqsave(&vec->lock, flags);
-
-		vec->count--;
-		if (!vec->count)
-			clear_bit(oldpri, cp->pri_active);
-		cpumask_clear_cpu(cpu, vec->mask);
-
-		spin_unlock_irqrestore(&vec->lock, flags);
-	}
-
 	if (likely(newpri != CPUPRI_INVALID)) {
 		struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
 
@@ -154,6 +144,18 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
 
 		spin_unlock_irqrestore(&vec->lock, flags);
 	}
+	if (likely(oldpri != CPUPRI_INVALID)) {
+		struct cpupri_vec *vec  = &cp->pri_to_cpu[oldpri];
+
+		spin_lock_irqsave(&vec->lock, flags);
+
+		vec->count--;
+		if (!vec->count)
+			clear_bit(oldpri, cp->pri_active);
+		cpumask_clear_cpu(cpu, vec->mask);
+
+		spin_unlock_irqrestore(&vec->lock, flags);
+	}
 
 	*currpri = newpri;
 }
-- 
cgit v1.2.3


From 3f029d3c6d62068d59301d90c18dbde8ee402107 Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Wed, 29 Jul 2009 11:08:47 -0400
Subject: sched: Enhance the pre/post scheduling logic

We currently have an explicit "needs_post" vtable method which
returns a stack variable for whether we should later run
post-schedule.  This leads to an awkward exchange of the
variable as it bubbles back up out of the context switch. Peter
Zijlstra observed that this information could be stored in the
run-queue itself instead of handled on the stack.

Therefore, we revert to the method of having context_switch
return void, and update an internal rq->post_schedule variable
when we require further processing.

In addition, we fix a race condition where we try to access
current->sched_class without holding the rq->lock.  This is
technically racy, as the sched-class could change out from
under us.  Instead, we reference the per-rq post_schedule
variable with the runqueue unlocked, but with preemption
disabled to see if we need to reacquire the rq->lock.

Finally, we clean the code up slightly by removing the #ifdef
CONFIG_SMP conditionals from the schedule() call, and implement
some inline helper functions instead.

This patch passes checkpatch, and rt-migrate.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090729150422.17691.55590.stgit@dev.haskins.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c    | 82 +++++++++++++++++++++++++++++++++----------------------
 kernel/sched_rt.c | 31 ++++++++-------------
 2 files changed, 61 insertions(+), 52 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index a030d4514cd..613fee54fc8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -616,6 +616,7 @@ struct rq {
 
 	unsigned char idle_at_tick;
 	/* For active balancing */
+	int post_schedule;
 	int active_balance;
 	int push_cpu;
 	/* cpu of this runqueue: */
@@ -2839,17 +2840,11 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
  * with the lock held can cause deadlocks; see schedule() for
  * details.)
  */
-static int finish_task_switch(struct rq *rq, struct task_struct *prev)
+static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	__releases(rq->lock)
 {
 	struct mm_struct *mm = rq->prev_mm;
 	long prev_state;
-	int post_schedule = 0;
-
-#ifdef CONFIG_SMP
-	if (current->sched_class->needs_post_schedule)
-		post_schedule = current->sched_class->needs_post_schedule(rq);
-#endif
 
 	rq->prev_mm = NULL;
 
@@ -2880,10 +2875,44 @@ static int finish_task_switch(struct rq *rq, struct task_struct *prev)
 		kprobe_flush_task(prev);
 		put_task_struct(prev);
 	}
+}
+
+#ifdef CONFIG_SMP
+
+/* assumes rq->lock is held */
+static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
+{
+	if (prev->sched_class->pre_schedule)
+		prev->sched_class->pre_schedule(rq, prev);
+}
+
+/* rq->lock is NOT held, but preemption is disabled */
+static inline void post_schedule(struct rq *rq)
+{
+	if (rq->post_schedule) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&rq->lock, flags);
+		if (rq->curr->sched_class->post_schedule)
+			rq->curr->sched_class->post_schedule(rq);
+		spin_unlock_irqrestore(&rq->lock, flags);
+
+		rq->post_schedule = 0;
+	}
+}
+
+#else
 
-	return post_schedule;
+static inline void pre_schedule(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline void post_schedule(struct rq *rq)
+{
 }
 
+#endif
+
 /**
  * schedule_tail - first thing a freshly forked thread must call.
  * @prev: the thread we just switched away from.
@@ -2892,14 +2921,14 @@ asmlinkage void schedule_tail(struct task_struct *prev)
 	__releases(rq->lock)
 {
 	struct rq *rq = this_rq();
-	int post_schedule;
 
-	post_schedule = finish_task_switch(rq, prev);
+	finish_task_switch(rq, prev);
 
-#ifdef CONFIG_SMP
-	if (post_schedule)
-		current->sched_class->post_schedule(rq);
-#endif
+	/*
+	 * FIXME: do we need to worry about rq being invalidated by the
+	 * task_switch?
+	 */
+	post_schedule(rq);
 
 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
 	/* In this case, finish_task_switch does not reenable preemption */
@@ -2913,7 +2942,7 @@ asmlinkage void schedule_tail(struct task_struct *prev)
  * context_switch - switch to the new MM and the new
  * thread's register state.
  */
-static inline int
+static inline void
 context_switch(struct rq *rq, struct task_struct *prev,
 	       struct task_struct *next)
 {
@@ -2960,7 +2989,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	 * CPUs since it called schedule(), thus the 'rq' on its stack
 	 * frame will be invalid.
 	 */
-	return finish_task_switch(this_rq(), prev);
+	finish_task_switch(this_rq(), prev);
 }
 
 /*
@@ -5371,7 +5400,6 @@ asmlinkage void __sched schedule(void)
 {
 	struct task_struct *prev, *next;
 	unsigned long *switch_count;
-	int post_schedule = 0;
 	struct rq *rq;
 	int cpu;
 
@@ -5403,10 +5431,7 @@ need_resched_nonpreemptible:
 		switch_count = &prev->nvcsw;
 	}
 
-#ifdef CONFIG_SMP
-	if (prev->sched_class->pre_schedule)
-		prev->sched_class->pre_schedule(rq, prev);
-#endif
+	pre_schedule(rq, prev);
 
 	if (unlikely(!rq->nr_running))
 		idle_balance(cpu, rq);
@@ -5422,25 +5447,17 @@ need_resched_nonpreemptible:
 		rq->curr = next;
 		++*switch_count;
 
-		post_schedule = context_switch(rq, prev, next); /* unlocks the rq */
+		context_switch(rq, prev, next); /* unlocks the rq */
 		/*
 		 * the context switch might have flipped the stack from under
 		 * us, hence refresh the local variables.
 		 */
 		cpu = smp_processor_id();
 		rq = cpu_rq(cpu);
-	} else {
-#ifdef CONFIG_SMP
-		if (current->sched_class->needs_post_schedule)
-			post_schedule = current->sched_class->needs_post_schedule(rq);
-#endif
+	} else
 		spin_unlock_irq(&rq->lock);
-	}
 
-#ifdef CONFIG_SMP
-	if (post_schedule)
-		current->sched_class->post_schedule(rq);
-#endif
+	post_schedule(rq);
 
 	if (unlikely(reacquire_kernel_lock(current) < 0))
 		goto need_resched_nonpreemptible;
@@ -9403,6 +9420,7 @@ void __init sched_init(void)
 #ifdef CONFIG_SMP
 		rq->sd = NULL;
 		rq->rd = NULL;
+		rq->post_schedule = 0;
 		rq->active_balance = 0;
 		rq->next_balance = jiffies;
 		rq->push_cpu = 0;
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 3918e01994e..a8f89bc3e5e 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1056,6 +1056,11 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
 	return p;
 }
 
+static inline int has_pushable_tasks(struct rq *rq)
+{
+	return !plist_head_empty(&rq->rt.pushable_tasks);
+}
+
 static struct task_struct *pick_next_task_rt(struct rq *rq)
 {
 	struct task_struct *p = _pick_next_task_rt(rq);
@@ -1064,6 +1069,12 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
 	if (p)
 		dequeue_pushable_task(rq, p);
 
+	/*
+	 * We detect this state here so that we can avoid taking the RQ
+	 * lock again later if there is no need to push
+	 */
+	rq->post_schedule = has_pushable_tasks(rq);
+
 	return p;
 }
 
@@ -1262,11 +1273,6 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
 	return lowest_rq;
 }
 
-static inline int has_pushable_tasks(struct rq *rq)
-{
-	return !plist_head_empty(&rq->rt.pushable_tasks);
-}
-
 static struct task_struct *pick_next_pushable_task(struct rq *rq)
 {
 	struct task_struct *p;
@@ -1466,23 +1472,9 @@ static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
 		pull_rt_task(rq);
 }
 
-/*
- * assumes rq->lock is held
- */
-static int needs_post_schedule_rt(struct rq *rq)
-{
-	return has_pushable_tasks(rq);
-}
-
 static void post_schedule_rt(struct rq *rq)
 {
-	/*
-	 * This is only called if needs_post_schedule_rt() indicates that
-	 * we need to push tasks away
-	 */
-	spin_lock_irq(&rq->lock);
 	push_rt_tasks(rq);
-	spin_unlock_irq(&rq->lock);
 }
 
 /*
@@ -1758,7 +1750,6 @@ static const struct sched_class rt_sched_class = {
 	.rq_online              = rq_online_rt,
 	.rq_offline             = rq_offline_rt,
 	.pre_schedule		= pre_schedule_rt,
-	.needs_post_schedule	= needs_post_schedule_rt,
 	.post_schedule		= post_schedule_rt,
 	.task_wake_up		= task_wake_up_rt,
 	.switched_from		= switched_from_rt,
-- 
cgit v1.2.3


From 00aec93d10a051ea64f83eff75d4065a19508ea6 Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Thu, 30 Jul 2009 10:57:23 -0400
Subject: sched: Fully integrate cpus_active_map and root-domain code

Reflect "active" cpus in the rq->rd->online field, instead of
the online_map.

The motivation is that things that use the root-domain code
(such as cpupri) only care about cpus classified as "active"
anyway. By synchronizing the root-domain state with the active
map, we allow several optimizations.

For instance, we can remove an extra cpumask_and from the
scheduler hotpath by utilizing rq->rd->online (since it is now
a cached version of cpu_active_map & rq->rd->span).

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Max Krasnyansky <maxk@qualcomm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090730145723.25226.24493.stgit@dev.haskins.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      |  2 +-
 kernel/sched_fair.c | 10 +++++++---
 kernel/sched_rt.c   |  7 -------
 3 files changed, 8 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 613fee54fc8..475138c4254 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7927,7 +7927,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
 	rq->rd = rd;
 
 	cpumask_set_cpu(rq->cpu, rd->span);
-	if (cpumask_test_cpu(rq->cpu, cpu_online_mask))
+	if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
 		set_rq_online(rq);
 
 	spin_unlock_irqrestore(&rq->lock, flags);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 652e8bdef9a..49347298487 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1046,17 +1046,21 @@ static void yield_task_fair(struct rq *rq)
  * search starts with cpus closest then further out as needed,
  * so we always favor a closer, idle cpu.
  * Domains may include CPUs that are not usable for migration,
- * hence we need to mask them out (cpu_active_mask)
+ * hence we need to mask them out (rq->rd->online)
  *
  * Returns the CPU we should wake onto.
  */
 #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
+
+#define cpu_rd_active(cpu, rq) cpumask_test_cpu(cpu, rq->rd->online)
+
 static int wake_idle(int cpu, struct task_struct *p)
 {
 	struct sched_domain *sd;
 	int i;
 	unsigned int chosen_wakeup_cpu;
 	int this_cpu;
+	struct rq *task_rq = task_rq(p);
 
 	/*
 	 * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
@@ -1089,10 +1093,10 @@ static int wake_idle(int cpu, struct task_struct *p)
 	for_each_domain(cpu, sd) {
 		if ((sd->flags & SD_WAKE_IDLE)
 		    || ((sd->flags & SD_WAKE_IDLE_FAR)
-			&& !task_hot(p, task_rq(p)->clock, sd))) {
+			&& !task_hot(p, task_rq->clock, sd))) {
 			for_each_cpu_and(i, sched_domain_span(sd),
 					 &p->cpus_allowed) {
-				if (cpu_active(i) && idle_cpu(i)) {
+				if (cpu_rd_active(i, task_rq) && idle_cpu(i)) {
 					if (i != task_cpu(p)) {
 						schedstat_inc(p,
 						       se.nr_wakeups_idle);
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index a8f89bc3e5e..13f728ef5b3 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1172,13 +1172,6 @@ static int find_lowest_rq(struct task_struct *task)
 	if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
 		return -1; /* No targets found */
 
-	/*
-	 * Only consider CPUs that are usable for migration.
-	 * I guess we might want to change cpupri_find() to ignore those
-	 * in the first place.
-	 */
-	cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);
-
 	/*
 	 * At this point we have built a mask of cpus representing the
 	 * lowest priority tasks in the system.  Now we want to elect
-- 
cgit v1.2.3


From 8f48894fcc89ddec62e1762f73a0825793e59e91 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 24 Jul 2009 12:25:30 +0200
Subject: sched: Add debug check to task_of()

A frequent mistake appears to be to call task_of() on a
scheduler entity that is not actually a task, which can result
in a wild pointer.

Add a check to catch these mistakes.

Suggested-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 20 ++++++++++++++------
 kernel/sched_rt.c   | 16 ++++++++++++----
 2 files changed, 26 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 49347298487..342000b31ad 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -79,11 +79,6 @@ static const struct sched_class fair_sched_class;
  * CFS operations on generic schedulable entities:
  */
 
-static inline struct task_struct *task_of(struct sched_entity *se)
-{
-	return container_of(se, struct task_struct, se);
-}
-
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
 /* cpu runqueue to which this cfs_rq is attached */
@@ -95,6 +90,14 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
 /* An entity is a task if it doesn't "own" a runqueue */
 #define entity_is_task(se)	(!se->my_q)
 
+static inline struct task_struct *task_of(struct sched_entity *se)
+{
+#ifdef CONFIG_SCHED_DEBUG
+	WARN_ON_ONCE(!entity_is_task(se));
+#endif
+	return container_of(se, struct task_struct, se);
+}
+
 /* Walk up scheduling entities hierarchy */
 #define for_each_sched_entity(se) \
 		for (; se; se = se->parent)
@@ -186,7 +189,12 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 	}
 }
 
-#else	/* CONFIG_FAIR_GROUP_SCHED */
+#else	/* !CONFIG_FAIR_GROUP_SCHED */
+
+static inline struct task_struct *task_of(struct sched_entity *se)
+{
+	return container_of(se, struct task_struct, se);
+}
 
 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
 {
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 13f728ef5b3..f365e66b3d4 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -3,15 +3,18 @@
  * policies)
  */
 
+#ifdef CONFIG_RT_GROUP_SCHED
+
+#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
+
 static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
 {
+#ifdef CONFIG_SCHED_DEBUG
+	WARN_ON_ONCE(!rt_entity_is_task(rt_se));
+#endif
 	return container_of(rt_se, struct task_struct, rt);
 }
 
-#ifdef CONFIG_RT_GROUP_SCHED
-
-#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
-
 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
 {
 	return rt_rq->rq;
@@ -26,6 +29,11 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
 
 #define rt_entity_is_task(rt_se) (1)
 
+static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
+{
+	return container_of(rt_se, struct task_struct, rt);
+}
+
 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
 {
 	return container_of(rt_rq, struct rq, rt);
-- 
cgit v1.2.3


From 693525e3bea25cf2ee6cf2b862ba7c148e891df2 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 21 Jul 2009 13:56:38 +0200
Subject: sched: Ensure the migration task doesn't go away during use

Like sched_migrate_task(), set_cpus_allowed_ptr() should hold
onto the migration thread too.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 475138c4254..7f83be35d65 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7078,8 +7078,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 
 	if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
 		/* Need help from migration thread: drop lock and wait. */
+		struct task_struct *mt = rq->migration_thread;
+
+		get_task_struct(mt);
 		task_rq_unlock(rq, &flags);
 		wake_up_process(rq->migration_thread);
+		put_task_struct(mt);
 		wait_for_completion(&req.done);
 		tlb_migrate_finish(p->mm);
 		return 0;
-- 
cgit v1.2.3


From bbfa26229a8143889e95e0df4a9d69067ee836cd Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 2 Aug 2009 14:44:24 +0200
Subject: lockdep: Fix BFS build

Fix:

  kernel/built-in.o: In function `lockdep_stats_show':
  lockdep_proc.c:(.text+0x48202): undefined reference to `max_bfs_queue_depth'

As max_bfs_queue_depth is only available under
CONFIG_PROVE_LOCKING=y.

Cc: Ming Lei <tom.leiming@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep_proc.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 9a1bf34d2ff..fba81f16e34 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -411,8 +411,10 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
 			max_lockdep_depth);
 	seq_printf(m, " max recursion depth:           %11u\n",
 			max_recursion_depth);
+#ifdef CONFIG_PROVE_LOCKING
 	seq_printf(m, " max bfs queue depth:           %11u\n",
 			max_bfs_queue_depth);
+#endif
 	lockdep_stats_debug_show(m);
 	seq_printf(m, " debug_locks:                   %11u\n",
 			debug_locks);
-- 
cgit v1.2.3


From bcf08df3b23b3d13bf8c4ad6bd744a6ad30015fb Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 19 Apr 2008 12:11:10 +0200
Subject: sched: Fix cpupri build on !CONFIG_SMP
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This build bug:

 In file included from kernel/sched.c:1765:
 kernel/sched_rt.c: In function ‘has_pushable_tasks’:
 kernel/sched_rt.c:1069: error: ‘struct rt_rq’ has no member named ‘pushable_tasks’
 kernel/sched_rt.c: In function ‘pick_next_task_rt’:
 kernel/sched_rt.c:1084: error: ‘struct rq’ has no member named ‘post_schedule’

Triggers because both pushable_tasks and post_schedule are
SMP-only fields.

Move pushable_tasks() to the SMP section and #ifdef the post_schedule use.

Cc: Gregory Haskins <ghaskins@novell.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090729150422.17691.55590.stgit@dev.haskins.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index f365e66b3d4..3d4020a9ba1 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -136,6 +136,11 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
 	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
 }
 
+static inline int has_pushable_tasks(struct rq *rq)
+{
+	return !plist_head_empty(&rq->rt.pushable_tasks);
+}
+
 #else
 
 static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
@@ -1064,11 +1069,6 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
 	return p;
 }
 
-static inline int has_pushable_tasks(struct rq *rq)
-{
-	return !plist_head_empty(&rq->rt.pushable_tasks);
-}
-
 static struct task_struct *pick_next_task_rt(struct rq *rq)
 {
 	struct task_struct *p = _pick_next_task_rt(rq);
@@ -1077,11 +1077,13 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
 	if (p)
 		dequeue_pushable_task(rq, p);
 
+#ifdef CONFIG_SMP
 	/*
 	 * We detect this state here so that we can avoid taking the RQ
 	 * lock again later if there is no need to push
 	 */
 	rq->post_schedule = has_pushable_tasks(rq);
+#endif
 
 	return p;
 }
-- 
cgit v1.2.3


From 4f84f4330a11b9eb828bf5af557f4c79c64614a3 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 20 Jul 2009 15:27:04 +0200
Subject: lockdep: Fix backtraces

Truncate stupid -1 entries in backtraces.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1248096665.15751.8816.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 1cedb00e3e7..2f0970297e3 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -367,11 +367,21 @@ static int save_trace(struct stack_trace *trace)
 
 	save_stack_trace(trace);
 
+	/*
+	 * Some daft arches put -1 at the end to indicate its a full trace.
+	 *
+	 * <rant> this is buggy anyway, since it takes a whole extra entry so a
+	 * complete trace that maxes out the entries provided will be reported
+	 * as incomplete, friggin useless </rant>
+	 */
+	if (trace->entries[trace->nr_entries-1] == ULONG_MAX)
+		trace->nr_entries--;
+
 	trace->max_entries = trace->nr_entries;
 
 	nr_stack_trace_entries += trace->nr_entries;
 
-	if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) {
+	if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) {
 		if (!debug_locks_off_graph_unlock())
 			return 0;
 
-- 
cgit v1.2.3


From 98c33eddaf41d225d99b40f9eedbd0fac4c08c05 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 21 Jul 2009 13:19:07 +0200
Subject: lockdep: Fix style nits

fixes a few comments and whitespaces that annoyed me.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 2f0970297e3..4b6cebe8ab3 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -843,15 +843,18 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
 	return 1;
 }
 
-/*For good efficiency of modular, we use power of 2*/
+/*
+ * For good efficiency of modular, we use power of 2
+ */
 #define MAX_CIRCULAR_QUEUE_SIZE		4096UL
 #define CQ_MASK				(MAX_CIRCULAR_QUEUE_SIZE-1)
 
-/* The circular_queue and helpers is used to implement the
+/*
+ * The circular_queue and helpers is used to implement the
  * breadth-first search(BFS)algorithem, by which we can build
  * the shortest path from the next lock to be acquired to the
  * previous held lock if there is a circular between them.
- * */
+ */
 struct circular_queue {
 	unsigned long element[MAX_CIRCULAR_QUEUE_SIZE];
 	unsigned int  front, rear;
@@ -907,6 +910,7 @@ static inline void mark_lock_accessed(struct lock_list *lock,
 					struct lock_list *parent)
 {
 	unsigned long nr;
+
 	nr = lock - list_entries;
 	WARN_ON(nr >= nr_list_entries);
 	lock->parent = parent;
@@ -916,6 +920,7 @@ static inline void mark_lock_accessed(struct lock_list *lock,
 static inline unsigned long lock_accessed(struct lock_list *lock)
 {
 	unsigned long nr;
+
 	nr = lock - list_entries;
 	WARN_ON(nr >= nr_list_entries);
 	return test_bit(nr, bfs_accessed);
-- 
cgit v1.2.3


From f607c6685774811b8112e124f10a053d77015485 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 20 Jul 2009 19:16:29 +0200
Subject: lockdep: Introduce lockdep_assert_held()

Add a lockdep helper to validate that we indeed are the owner
of a lock.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 33 +++++++++++++++++++++++++++++++++
 kernel/sched.c   |  2 ++
 2 files changed, 35 insertions(+)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 4b6cebe8ab3..28914a50991 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -3059,6 +3059,19 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
 	check_chain_key(curr);
 }
 
+static int __lock_is_held(struct lockdep_map *lock)
+{
+	struct task_struct *curr = current;
+	int i;
+
+	for (i = 0; i < curr->lockdep_depth; i++) {
+		if (curr->held_locks[i].instance == lock)
+			return 1;
+	}
+
+	return 0;
+}
+
 /*
  * Check whether we follow the irq-flags state precisely:
  */
@@ -3160,6 +3173,26 @@ void lock_release(struct lockdep_map *lock, int nested,
 }
 EXPORT_SYMBOL_GPL(lock_release);
 
+int lock_is_held(struct lockdep_map *lock)
+{
+	unsigned long flags;
+	int ret = 0;
+
+	if (unlikely(current->lockdep_recursion))
+		return ret;
+
+	raw_local_irq_save(flags);
+	check_flags(flags);
+
+	current->lockdep_recursion = 1;
+	ret = __lock_is_held(lock);
+	current->lockdep_recursion = 0;
+	raw_local_irq_restore(flags);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(lock_is_held);
+
 void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
 {
 	current->lockdep_reclaim_gfp = gfp_mask;
diff --git a/kernel/sched.c b/kernel/sched.c
index 1b59e265273..2c75f7daa43 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6609,6 +6609,8 @@ int cond_resched_lock(spinlock_t *lock)
 	int resched = should_resched();
 	int ret = 0;
 
+	lockdep_assert_held(lock);
+
 	if (spin_needbreak(lock) || resched) {
 		spin_unlock(lock);
 		if (resched)
-- 
cgit v1.2.3


From bb97a91e2549a7f2df9c21d32542582f549ab3ec Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 20 Jul 2009 19:15:35 +0200
Subject: lockdep: Deal with many similar locks

spin_lock_nest_lock() allows to take many instances of the same
class, this can easily lead to overflow of MAX_LOCK_DEPTH.

To avoid this overflow, we'll stop accounting instances but
start reference counting the class in the held_lock structure.

[ We could maintain a list of instances, if we'd move the hlock
  stuff into __lock_acquired(), but that would require
  significant modifications to the current code. ]

We restrict this mode to spin_lock_nest_lock() only, because it
degrades the lockdep quality due to lost of instance.

For lockstat this means we don't track lock statistics for any
but the first lock in the series.

Currently nesting is limited to 11 bits because that was the
spare space available in held_lock. This yields a 2048
instances maximium.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 77 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 28914a50991..0bb246e21cd 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2708,13 +2708,15 @@ EXPORT_SYMBOL_GPL(lockdep_init_map);
  */
 static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 			  int trylock, int read, int check, int hardirqs_off,
-			  struct lockdep_map *nest_lock, unsigned long ip)
+			  struct lockdep_map *nest_lock, unsigned long ip,
+			  int references)
 {
 	struct task_struct *curr = current;
 	struct lock_class *class = NULL;
 	struct held_lock *hlock;
 	unsigned int depth, id;
 	int chain_head = 0;
+	int class_idx;
 	u64 chain_key;
 
 	if (!prove_locking)
@@ -2762,10 +2764,24 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 	if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH))
 		return 0;
 
+	class_idx = class - lock_classes + 1;
+
+	if (depth) {
+		hlock = curr->held_locks + depth - 1;
+		if (hlock->class_idx == class_idx && nest_lock) {
+			if (hlock->references)
+				hlock->references++;
+			else
+				hlock->references = 2;
+
+			return 1;
+		}
+	}
+
 	hlock = curr->held_locks + depth;
 	if (DEBUG_LOCKS_WARN_ON(!class))
 		return 0;
-	hlock->class_idx = class - lock_classes + 1;
+	hlock->class_idx = class_idx;
 	hlock->acquire_ip = ip;
 	hlock->instance = lock;
 	hlock->nest_lock = nest_lock;
@@ -2773,6 +2789,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 	hlock->read = read;
 	hlock->check = check;
 	hlock->hardirqs_off = !!hardirqs_off;
+	hlock->references = references;
 #ifdef CONFIG_LOCK_STAT
 	hlock->waittime_stamp = 0;
 	hlock->holdtime_stamp = sched_clock();
@@ -2881,6 +2898,30 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
 	return 1;
 }
 
+static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
+{
+	if (hlock->instance == lock)
+		return 1;
+
+	if (hlock->references) {
+		struct lock_class *class = lock->class_cache;
+
+		if (!class)
+			class = look_up_lock_class(lock, 0);
+
+		if (DEBUG_LOCKS_WARN_ON(!class))
+			return 0;
+
+		if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock))
+			return 0;
+
+		if (hlock->class_idx == class - lock_classes + 1)
+			return 1;
+	}
+
+	return 0;
+}
+
 static int
 __lock_set_class(struct lockdep_map *lock, const char *name,
 		 struct lock_class_key *key, unsigned int subclass,
@@ -2904,7 +2945,7 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
 		 */
 		if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
 			break;
-		if (hlock->instance == lock)
+		if (match_held_lock(hlock, lock))
 			goto found_it;
 		prev_hlock = hlock;
 	}
@@ -2923,7 +2964,8 @@ found_it:
 		if (!__lock_acquire(hlock->instance,
 			hlock_class(hlock)->subclass, hlock->trylock,
 				hlock->read, hlock->check, hlock->hardirqs_off,
-				hlock->nest_lock, hlock->acquire_ip))
+				hlock->nest_lock, hlock->acquire_ip,
+				hlock->references))
 			return 0;
 	}
 
@@ -2962,20 +3004,34 @@ lock_release_non_nested(struct task_struct *curr,
 		 */
 		if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
 			break;
-		if (hlock->instance == lock)
+		if (match_held_lock(hlock, lock))
 			goto found_it;
 		prev_hlock = hlock;
 	}
 	return print_unlock_inbalance_bug(curr, lock, ip);
 
 found_it:
-	lock_release_holdtime(hlock);
+	if (hlock->instance == lock)
+		lock_release_holdtime(hlock);
+
+	if (hlock->references) {
+		hlock->references--;
+		if (hlock->references) {
+			/*
+			 * We had, and after removing one, still have
+			 * references, the current lock stack is still
+			 * valid. We're done!
+			 */
+			return 1;
+		}
+	}
 
 	/*
 	 * We have the right lock to unlock, 'hlock' points to it.
 	 * Now we remove it from the stack, and add back the other
 	 * entries (if any), recalculating the hash along the way:
 	 */
+
 	curr->lockdep_depth = i;
 	curr->curr_chain_key = hlock->prev_chain_key;
 
@@ -2984,7 +3040,8 @@ found_it:
 		if (!__lock_acquire(hlock->instance,
 			hlock_class(hlock)->subclass, hlock->trylock,
 				hlock->read, hlock->check, hlock->hardirqs_off,
-				hlock->nest_lock, hlock->acquire_ip))
+				hlock->nest_lock, hlock->acquire_ip,
+				hlock->references))
 			return 0;
 	}
 
@@ -3014,7 +3071,7 @@ static int lock_release_nested(struct task_struct *curr,
 	/*
 	 * Is the unlock non-nested:
 	 */
-	if (hlock->instance != lock)
+	if (hlock->instance != lock || hlock->references)
 		return lock_release_non_nested(curr, lock, ip);
 	curr->lockdep_depth--;
 
@@ -3065,7 +3122,9 @@ static int __lock_is_held(struct lockdep_map *lock)
 	int i;
 
 	for (i = 0; i < curr->lockdep_depth; i++) {
-		if (curr->held_locks[i].instance == lock)
+		struct held_lock *hlock = curr->held_locks + i;
+
+		if (match_held_lock(hlock, lock))
 			return 1;
 	}
 
@@ -3148,7 +3207,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 
 	current->lockdep_recursion = 1;
 	__lock_acquire(lock, subclass, trylock, read, check,
-		       irqs_disabled_flags(flags), nest_lock, ip);
+		       irqs_disabled_flags(flags), nest_lock, ip, 0);
 	current->lockdep_recursion = 0;
 	raw_local_irq_restore(flags);
 }
@@ -3252,7 +3311,7 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
 		 */
 		if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
 			break;
-		if (hlock->instance == lock)
+		if (match_held_lock(hlock, lock))
 			goto found_it;
 		prev_hlock = hlock;
 	}
@@ -3260,6 +3319,9 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
 	return;
 
 found_it:
+	if (hlock->instance != lock)
+		return;
+
 	hlock->waittime_stamp = sched_clock();
 
 	contention_point = lock_point(hlock_class(hlock)->contention_point, ip);
@@ -3299,7 +3361,7 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
 		 */
 		if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
 			break;
-		if (hlock->instance == lock)
+		if (match_held_lock(hlock, lock))
 			goto found_it;
 		prev_hlock = hlock;
 	}
@@ -3307,6 +3369,9 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
 	return;
 
 found_it:
+	if (hlock->instance != lock)
+		return;
+
 	cpu = smp_processor_id();
 	if (hlock->waittime_stamp) {
 		now = sched_clock();
-- 
cgit v1.2.3


From e351b660fddd4df76cc4635f896d311ed0ff3752 Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Wed, 22 Jul 2009 22:48:09 +0800
Subject: lockdep: Reintroduce generation count to make BFS faster

We still can apply DaveM's generation count optimization to
BFS, based on the following idea:

 - before doing each BFS, increase the global generation id
   by 1

 - if one node in the graph has been visited, mark it as
   visited by storing the current global generation id into
   the node's dep_gen_id field

 - so we can decide if one node has been visited already, by
   comparing the node's dep_gen_id with the global generation id.

By applying DaveM's generation count optimization to current
implementation of BFS, we gain the following advantages:

 - we save MAX_LOCKDEP_ENTRIES/8 bytes memory;

 - we remove the bitmap_zero(bfs_accessed, MAX_LOCKDEP_ENTRIES);
   in each BFS, which is very time-consuming since
   MAX_LOCKDEP_ENTRIES may be very large.(16384UL)

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: "David S. Miller" <davem@davemloft.net>
LKML-Reference: <1248274089-6358-1-git-send-email-tom.leiming@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 0bb246e21cd..2b443b5090a 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -861,14 +861,15 @@ struct circular_queue {
 };
 
 static struct circular_queue lock_cq;
-static unsigned long bfs_accessed[BITS_TO_LONGS(MAX_LOCKDEP_ENTRIES)];
 
 unsigned int max_bfs_queue_depth;
 
+static unsigned int lockdep_dependency_gen_id;
+
 static inline void __cq_init(struct circular_queue *cq)
 {
 	cq->front = cq->rear = 0;
-	bitmap_zero(bfs_accessed, MAX_LOCKDEP_ENTRIES);
+	lockdep_dependency_gen_id++;
 }
 
 static inline int __cq_empty(struct circular_queue *cq)
@@ -914,7 +915,7 @@ static inline void mark_lock_accessed(struct lock_list *lock,
 	nr = lock - list_entries;
 	WARN_ON(nr >= nr_list_entries);
 	lock->parent = parent;
-	set_bit(nr, bfs_accessed);
+	lock->class->dep_gen_id = lockdep_dependency_gen_id;
 }
 
 static inline unsigned long lock_accessed(struct lock_list *lock)
@@ -923,7 +924,7 @@ static inline unsigned long lock_accessed(struct lock_list *lock)
 
 	nr = lock - list_entries;
 	WARN_ON(nr >= nr_list_entries);
-	return test_bit(nr, bfs_accessed);
+	return lock->class->dep_gen_id == lockdep_dependency_gen_id;
 }
 
 static inline struct lock_list *get_lock_parent(struct lock_list *child)
@@ -3604,7 +3605,7 @@ void __init lockdep_info(void)
 		sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS +
 		sizeof(struct list_head) * CHAINHASH_SIZE) / 1024
 #ifdef CONFIG_PROVE_LOCKING
-		+ sizeof(struct circular_queue) + sizeof(bfs_accessed)
+		+ sizeof(struct circular_queue)
 #endif
 		);
 
-- 
cgit v1.2.3


From 90629209a020859b67423a6326f3765f220c7f5c Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Sun, 2 Aug 2009 21:43:36 +0800
Subject: lockdep: Fix memory usage info of BFS

The unit is KB, so sizeof(struct circular_queue) should be
divided by 1024.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Cc: akpm@linux-foundation.org
Cc: torvalds@linux-foundation.org
Cc: davem@davemloft.net
Cc: Ming Lei <tom.leiming@gmail.com>
Cc: a.p.zijlstra@chello.nl
LKML-Reference: <1249220616-7190-1-git-send-email-tom.leiming@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 2b443b5090a..0843584aed5 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -3603,10 +3603,11 @@ void __init lockdep_info(void)
 		sizeof(struct list_head) * CLASSHASH_SIZE +
 		sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES +
 		sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS +
-		sizeof(struct list_head) * CHAINHASH_SIZE) / 1024
+		sizeof(struct list_head) * CHAINHASH_SIZE
 #ifdef CONFIG_PROVE_LOCKING
 		+ sizeof(struct circular_queue)
 #endif
+		) / 1024
 		);
 
 	printk(" per task-struct memory footprint: %lu bytes\n",
-- 
cgit v1.2.3


From 5b0f437df0a3e374d26ad533eb78fe64744f55a8 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@gmail.com>
Date: Thu, 30 Jul 2009 19:00:53 +0200
Subject: workqueues: Improve schedule_work() documentation

Two important aspects of the schedule_work() function are not
yet documented:

 - that it is allowed to pass a struct work_struct * to this
   function that is already on the kernel-global workqueue;

 - the meaning of its return value.

The patch below documents both aspects.

Signed-off-by: Bart Van Assche <bart.vanassche@gmail.com>
Cc: "Greg Kroah-Hartman" <gregkh@suse.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <200907301900.54202.bart.vanassche@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/workqueue.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0668795d881..3c44b56b0da 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -600,7 +600,12 @@ static struct workqueue_struct *keventd_wq __read_mostly;
  * schedule_work - put work task in global workqueue
  * @work: job to be done
  *
- * This puts a job in the kernel-global workqueue.
+ * Returns zero if @work was already on the kernel-global workqueue and
+ * non-zero otherwise.
+ *
+ * This puts a job in the kernel-global workqueue if it was not already
+ * queued and leaves it in the same position on the kernel-global
+ * workqueue otherwise.
  */
 int schedule_work(struct work_struct *work)
 {
-- 
cgit v1.2.3


From cc6db4e60116c1f76577b6850a35ae7de69a95b6 Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Fri, 31 Jul 2009 16:20:10 -0700
Subject: futex: Correct futex_wait_requeue_pi() commentary

The state machine described in the comments wasn't updated with
a follow-on fix.  Address that and cleanup the corresponding
commentary in the function.

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <4A737C2A.9090001@us.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/futex.c | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index 0672ff88f15..d077201b393 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2102,11 +2102,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
  * We call schedule in futex_wait_queue_me() when we enqueue and return there
  * via the following:
  * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
- * 2) wakeup on uaddr2 after a requeue and subsequent unlock
- * 3) signal (before or after requeue)
- * 4) timeout (before or after requeue)
+ * 2) wakeup on uaddr2 after a requeue
+ * 3) signal
+ * 4) timeout
  *
- * If 3, we setup a restart_block with futex_wait_requeue_pi() as the function.
+ * If 3, cleanup and return -ERESTARTNOINTR.
  *
  * If 2, we may then block on trying to take the rt_mutex and return via:
  * 5) successful lock
@@ -2114,7 +2114,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
  * 7) timeout
  * 8) other lock acquisition failure
  *
- * If 6, we setup a restart_block with futex_lock_pi() as the function.
+ * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
  *
  * If 4 or 7, we cleanup and return with -ETIMEDOUT.
  *
@@ -2232,14 +2232,11 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
 			rt_mutex_unlock(pi_mutex);
 	} else if (ret == -EINTR) {
 		/*
-		 * We've already been requeued, but we have no way to
-		 * restart by calling futex_lock_pi() directly. We
-		 * could restart the syscall, but that will look at
-		 * the user space value and return right away. So we
-		 * drop back with EWOULDBLOCK to tell user space that
-		 * "val" has been changed. That's the same what the
-		 * restart of the syscall would do in
-		 * futex_wait_setup().
+		 * We've already been requeued, but cannot restart by calling
+		 * futex_lock_pi() directly. We could restart this syscall, but
+		 * it would detect that the user space "val" changed and return
+		 * -EWOULDBLOCK.  Save the overhead of the restart and return
+		 * -EWOULDBLOCK directly.
 		 */
 		ret = -EWOULDBLOCK;
 	}
-- 
cgit v1.2.3


From 990a55eb25d9698d61352264cc43f3a9c04cce90 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Fri, 31 Jul 2009 00:02:06 +0200
Subject: irq: Clean up by removing irqfixup MODULE_PARM_DESC()

It's wrong (the parm takes no arguments) and compile-time wiped
away anyway (due to !MODULE).

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
LKML-Reference: <1248991326-26267-1-git-send-email-jirislaby@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/irq/spurious.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 4d568294de3..114e704760f 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -297,7 +297,6 @@ static int __init irqfixup_setup(char *str)
 
 __setup("irqfixup", irqfixup_setup);
 module_param(irqfixup, int, 0644);
-MODULE_PARM_DESC("irqfixup", "0: No fixup, 1: irqfixup mode, 2: irqpoll mode");
 
 static int __init irqpoll_setup(char *str)
 {
-- 
cgit v1.2.3


From a2551df7ec568d87793d2eea4ca744e86318f205 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Fri, 31 Jul 2009 12:54:11 -0400
Subject: Security/SELinux: seperate lsm specific mmap_min_addr

Currently SELinux enforcement of controls on the ability to map low memory
is determined by the mmap_min_addr tunable.  This patch causes SELinux to
ignore the tunable and instead use a seperate Kconfig option specific to how
much space the LSM should protect.

The tunable will now only control the need for CAP_SYS_RAWIO and SELinux
permissions will always protect the amount of low memory designated by
CONFIG_LSM_MMAP_MIN_ADDR.

This allows users who need to disable the mmap_min_addr controls (usual reason
being they run WINE as a non-root user) to do so and still have SELinux
controls preventing confined domains (like a web server) from being able to
map some area of low memory.

Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/sysctl.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 98e02328c67..58be76017fd 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -49,6 +49,7 @@
 #include <linux/acpi.h>
 #include <linux/reboot.h>
 #include <linux/ftrace.h>
+#include <linux/security.h>
 #include <linux/slow-work.h>
 #include <linux/perf_counter.h>
 
@@ -1306,10 +1307,10 @@ static struct ctl_table vm_table[] = {
 	{
 		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "mmap_min_addr",
-		.data		= &mmap_min_addr,
-		.maxlen         = sizeof(unsigned long),
+		.data		= &dac_mmap_min_addr,
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= &proc_doulongvec_minmax,
+		.proc_handler	= &mmap_min_addr_handler,
 	},
 #ifdef CONFIG_NUMA
 	{
-- 
cgit v1.2.3


From 42c2c8c854a716b05882a122632ddcd6dbe108f1 Mon Sep 17 00:00:00 2001
From: Sonic Zhang <Sonic.Zhang@analog.com>
Date: Thu, 6 Aug 2009 15:58:11 -0700
Subject: printk: Fix "printk: Enable the use of more than one CON_BOOT (early
 console)"

Don't return when we find the first bootconsole - it can leave
other bootconsoles still installed, and they can be used and
cause problems later (if they are in the init section, and
eventually released), and cause problems.  Make sure we remove
all of them.

Signed-off-by: Sonic Zhang <Sonic.Zhang@analog.com>
Signed-off-by: Robin Getz <rgetz@analog.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/printk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index e0daaf57985..e10d193a833 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1352,7 +1352,7 @@ static int __init disable_boot_consoles(void)
 		if (con->flags & CON_BOOT) {
 			printk(KERN_INFO "turn off boot console %s%d\n",
 				con->name, con->index);
-			return unregister_console(con);
+			unregister_console(con);
 		}
 	}
 	return 0;
-- 
cgit v1.2.3


From c36ba80ea01d0aecb652c26799a912e760ce8981 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Thu, 6 Aug 2009 21:46:03 +0200
Subject: irq: Remove superfluous NULL pointer check in check_irq_resend()

This takes care of the following entry from Dan's list:

kernel/irq/resend.c +73 check_irq_resend(17) warning: variable derefenced before check 'desc->chip'

Reported-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Eugene Teo <eteo@redhat.com>
Cc: Julia Lawall <julia@diku.dk>
LKML-Reference: <200908062146.03638.bzolnier@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/irq/resend.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 89c7117acf2..090c3763f3a 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -70,8 +70,7 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
 	if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
 		desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY;
 
-		if (!desc->chip || !desc->chip->retrigger ||
-					!desc->chip->retrigger(irq)) {
+		if (!desc->chip->retrigger || !desc->chip->retrigger(irq)) {
 #ifdef CONFIG_HARDIRQS_SW_RESEND
 			/* Set it pending and activate the softirq: */
 			set_bit(irq, irqs_resend);
-- 
cgit v1.2.3


From 30dd568c912602b7dbd609a45d053e01b13422bb Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Tue, 21 Jul 2009 15:56:48 +0200
Subject: x86, perf_counter, bts: Add BTS support to perfcounters

Implement a performance counter with:

    attr.type           = PERF_TYPE_HARDWARE
    attr.config         = PERF_COUNT_HW_BRANCH_INSTRUCTIONS
    attr.sample_period  = 1

Using branch trace store (BTS) on x86 hardware, if available.

The from and to address for each branch can be sampled using:

    PERF_SAMPLE_IP      for the from address
    PERF_SAMPLE_ADDR    for the to address

[ v2: address review feedback, fix bugs ]

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 546e62d6294..bf8110b35c5 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -88,6 +88,7 @@ void __weak hw_perf_disable(void)		{ barrier(); }
 void __weak hw_perf_enable(void)		{ barrier(); }
 
 void __weak hw_perf_counter_setup(int cpu)	{ barrier(); }
+void __weak hw_perf_counter_setup_online(int cpu)	{ barrier(); }
 
 int __weak
 hw_perf_group_sched_in(struct perf_counter *group_leader,
@@ -2630,7 +2631,7 @@ static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p)
 	return task_pid_nr_ns(p, counter->ns);
 }
 
-static void perf_counter_output(struct perf_counter *counter, int nmi,
+void perf_counter_output(struct perf_counter *counter, int nmi,
 				struct perf_sample_data *data)
 {
 	int ret;
@@ -4566,6 +4567,11 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
 		perf_counter_init_cpu(cpu);
 		break;
 
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+		hw_perf_counter_setup_online(cpu);
+		break;
+
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
 		perf_counter_exit_cpu(cpu);
@@ -4590,6 +4596,8 @@ void __init perf_counter_init(void)
 {
 	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
 			(void *)(long)smp_processor_id());
+	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
+			(void *)(long)smp_processor_id());
 	register_cpu_notifier(&perf_cpu_nb);
 }
 
-- 
cgit v1.2.3


From 5a165657bef7c47e5ff4cd138f7758ef6278e87b Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@tuxdriver.com>
Date: Thu, 13 Aug 2009 05:20:45 +0000
Subject: net: skb ftracer - Add config option to enable new ftracer (v3)

skb allocation / consumption corelator - Add config option

This patch adds a Kconfig option to enable the addtition of the skb source
tracer.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>

 Kconfig |   10 ++++++++++
 1 file changed, 10 insertions(+)
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/trace/Kconfig | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 019f380fd76..f09d7635c0e 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -234,6 +234,16 @@ config BOOT_TRACER
 	  You must pass in initcall_debug and ftrace=initcall to the kernel
 	  command line to enable this on bootup.
 
+config SKB_SOURCES_TRACER
+	bool "Trace skb source information"
+	select GENERIC_TRACER
+	help
+	   This tracer helps developers/sysadmins correlate skb allocation and
+	   consumption.  The idea being that some processes will primarily consume data
+	   that was allocated on certain numa nodes.  By being able to visualize which
+	   nodes the data was allocated on, a sysadmin or developer can optimize the
+	   scheduling of those processes to cut back on cross node chatter.
+
 config TRACE_BRANCH_PROFILING
 	bool
 	select GENERIC_TRACER
-- 
cgit v1.2.3


From 9ec04da7489d2c9ae01ea6e9b5fa313ccf3d35fb Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@tuxdriver.com>
Date: Thu, 13 Aug 2009 05:23:56 +0000
Subject: net: skb ftracer - Add actual ftrace code to kernel (v3)

skb allocation / consumption correlator

Add ftracer module to kernel to print out a list that correlates a process id,
an skb it read, and the numa nodes on wich the process was running when it was
read along with the numa node the skbuff was allocated on.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>

 Makefile            |    1
 trace.h             |   19 ++++++
 trace_skb_sources.c |  154 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 174 insertions(+)
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/trace/Makefile            |   1 +
 kernel/trace/trace.h             |  19 +++++
 kernel/trace/trace_skb_sources.c | 154 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 174 insertions(+)
 create mode 100644 kernel/trace/trace_skb_sources.c

(limited to 'kernel')

diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 844164dca90..ee5e5b1b928 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -49,6 +49,7 @@ obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
 ifeq ($(CONFIG_BLOCK),y)
 obj-$(CONFIG_EVENT_TRACING) += blktrace.o
 endif
+obj-$(CONFIG_SKB_SOURCES_TRACER) += trace_skb_sources.o
 obj-$(CONFIG_EVENT_TRACING) += trace_events.o
 obj-$(CONFIG_EVENT_TRACING) += trace_export.o
 obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 8b9f4f6e955..8a6281b2330 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -11,6 +11,7 @@
 #include <trace/boot.h>
 #include <linux/kmemtrace.h>
 #include <trace/power.h>
+#include <trace/events/skb.h>
 
 #include <linux/trace_seq.h>
 #include <linux/ftrace_event.h>
@@ -40,6 +41,7 @@ enum trace_type {
 	TRACE_KMEM_FREE,
 	TRACE_POWER,
 	TRACE_BLK,
+	TRACE_SKB_SOURCE,
 
 	__TRACE_LAST_TYPE,
 };
@@ -171,6 +173,21 @@ struct trace_power {
 	struct power_trace	state_data;
 };
 
+struct skb_record {
+	pid_t pid;		/* pid of the copying process */
+	int anid;		/* node where skb was allocated */
+	int cnid;		/* node to which skb was copied in userspace */
+	char ifname[IFNAMSIZ];	/* Name of the receiving interface */
+	int rx_queue;		/* The rx queue the skb was received on */
+	int ccpu;		/* Cpu the application got this frame from */
+	int len;		/* length of the data copied */
+};
+
+struct trace_skb_event {
+	struct trace_entry	ent;
+	struct skb_record	event_data;
+};
+
 enum kmemtrace_type_id {
 	KMEMTRACE_TYPE_KMALLOC = 0,	/* kmalloc() or kfree(). */
 	KMEMTRACE_TYPE_CACHE,		/* kmem_cache_*(). */
@@ -323,6 +340,8 @@ extern void __ftrace_bad_type(void);
 			  TRACE_SYSCALL_ENTER);				\
 		IF_ASSIGN(var, ent, struct syscall_trace_exit,		\
 			  TRACE_SYSCALL_EXIT);				\
+		IF_ASSIGN(var, ent, struct trace_skb_event,		\
+			  TRACE_SKB_SOURCE);				\
 		__ftrace_bad_type();					\
 	} while (0)
 
diff --git a/kernel/trace/trace_skb_sources.c b/kernel/trace/trace_skb_sources.c
new file mode 100644
index 00000000000..40eb071afdb
--- /dev/null
+++ b/kernel/trace/trace_skb_sources.c
@@ -0,0 +1,154 @@
+/*
+ * ring buffer based tracer for analyzing per-socket skb sources
+ *
+ * Neil Horman <nhorman@tuxdriver.com>
+ * Copyright (C) 2009
+ *
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/debugfs.h>
+#include <trace/events/skb.h>
+#include <linux/kallsyms.h>
+#include <linux/module.h>
+#include <linux/hardirq.h>
+#include <linux/netdevice.h>
+#include <net/sock.h>
+
+#include "trace.h"
+#include "trace_output.h"
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(skb_copy_datagram_iovec);
+
+static struct trace_array *skb_trace;
+static int __read_mostly trace_skb_source_enabled;
+
+static void probe_skb_dequeue(const struct sk_buff *skb, int len)
+{
+	struct ring_buffer_event *event;
+	struct trace_skb_event *entry;
+	struct trace_array *tr = skb_trace;
+	struct net_device *dev;
+
+	if (!trace_skb_source_enabled)
+		return;
+
+	if (in_interrupt())
+		return;
+
+	event = trace_buffer_lock_reserve(tr, TRACE_SKB_SOURCE,
+					  sizeof(*entry), 0, 0);
+	if (!event)
+		return;
+	entry = ring_buffer_event_data(event);
+
+	entry->event_data.pid = current->pid;
+	entry->event_data.anid = page_to_nid(virt_to_page(skb->data));
+	entry->event_data.cnid = cpu_to_node(smp_processor_id());
+	entry->event_data.len = len;
+	entry->event_data.rx_queue = skb->queue_mapping;
+	entry->event_data.ccpu = smp_processor_id();
+
+	dev = dev_get_by_index(sock_net(skb->sk), skb->iif);
+	if (dev) {
+		memcpy(entry->event_data.ifname, dev->name, IFNAMSIZ);
+		dev_put(dev);
+	} else {
+		strcpy(entry->event_data.ifname, "Unknown");
+	}
+
+	trace_buffer_unlock_commit(tr, event, 0, 0);
+}
+
+static int tracing_skb_source_register(void)
+{
+	int ret;
+
+	ret = register_trace_skb_copy_datagram_iovec(probe_skb_dequeue);
+	if (ret)
+		pr_info("skb source trace: Couldn't activate dequeue tracepoint");
+
+	return ret;
+}
+
+static void start_skb_source_trace(struct trace_array *tr)
+{
+	trace_skb_source_enabled = 1;
+}
+
+static void stop_skb_source_trace(struct trace_array *tr)
+{
+	trace_skb_source_enabled = 0;
+}
+
+static void skb_source_trace_reset(struct trace_array *tr)
+{
+	trace_skb_source_enabled = 0;
+	unregister_trace_skb_copy_datagram_iovec(probe_skb_dequeue);
+}
+
+
+static int skb_source_trace_init(struct trace_array *tr)
+{
+	int cpu;
+	skb_trace = tr;
+
+	trace_skb_source_enabled = 1;
+	tracing_skb_source_register();
+
+	for_each_cpu(cpu, cpu_possible_mask)
+		tracing_reset(tr, cpu);
+	return 0;
+}
+
+static enum print_line_t skb_source_print_line(struct trace_iterator *iter)
+{
+	int ret = 0;
+	struct trace_entry *entry = iter->ent;
+	struct trace_skb_event *event;
+	struct skb_record *record;
+	struct trace_seq *s = &iter->seq;
+
+	trace_assign_type(event, entry);
+	record = &event->event_data;
+	if (entry->type != TRACE_SKB_SOURCE)
+		return TRACE_TYPE_UNHANDLED;
+
+	ret = trace_seq_printf(s, "	%d	%d	%d	%s	%d	%d	%d\n",
+			record->pid,
+			record->anid,
+			record->cnid,
+			record->ifname,
+			record->rx_queue,
+			record->ccpu,
+			record->len);
+
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return TRACE_TYPE_HANDLED;
+}
+
+static void skb_source_print_header(struct seq_file *s)
+{
+	seq_puts(s, "#	PID	ANID	CNID	IFC	RXQ	CCPU	LEN\n");
+	seq_puts(s, "#	 |	 |	 |	 |	 |	 |	 |\n");
+}
+
+static struct tracer skb_source_tracer __read_mostly =
+{
+	.name		= "skb_sources",
+	.init		= skb_source_trace_init,
+	.start		= start_skb_source_trace,
+	.stop		= stop_skb_source_trace,
+	.reset		= skb_source_trace_reset,
+	.print_line	= skb_source_print_line,
+	.print_header	= skb_source_print_header,
+};
+
+static int init_skb_source_trace(void)
+{
+	return register_tracer(&skb_source_tracer);
+}
+device_initcall(init_skb_source_trace);
-- 
cgit v1.2.3


From 9188499cdb117d86a1ea6b04374095b098d56936 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 13 Aug 2009 09:44:57 -0400
Subject: security: introducing security_request_module

Calling request_module() will trigger a userspace upcall which will load a
new module into the kernel.  This can be a dangerous event if the process
able to trigger request_module() is able to control either the modprobe
binary or the module binary.  This patch adds a new security hook to
request_module() which can be used by an LSM to control a processes ability
to call request_module().

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/kmod.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/kmod.c b/kernel/kmod.c
index 385c31a1bdb..5a7ae57f983 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -78,6 +78,10 @@ int __request_module(bool wait, const char *fmt, ...)
 #define MAX_KMOD_CONCURRENT 50	/* Completely arbitrary value - KAO */
 	static int kmod_loop_msg;
 
+	ret = security_kernel_module_request();
+	if (ret)
+		return ret;
+
 	va_start(args, fmt);
 	ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
 	va_end(args);
-- 
cgit v1.2.3


From 27569620c748ec13f801b4683b448a2ac2adaae4 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sat, 15 Aug 2009 09:53:46 -0700
Subject: rcu: Split hierarchical RCU initialization into boot-time and
 CPU-online pieces

This patch divides the rcutree initialization into boot-time
and hotplug-time components, so that the tree data structures
are guaranteed to be fully linked at boot time regardless of
what might happen in CPU hotplug operations.

This makes RCU more resilient against CPU hotplug misbehavior
(and vice versa), but more importantly, does a better job of
compartmentalizing the code.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: josht@linux.vnet.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: hugh.dickins@tiscali.co.uk
Cc: benh@kernel.crashing.org
LKML-Reference: <1250355231152-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutree.c | 53 +++++++++++++++++++++++++++++++++--------------------
 1 file changed, 33 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 7717b95c202..f3e43274ed5 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1325,22 +1325,40 @@ int rcu_needs_cpu(int cpu)
 }
 
 /*
- * Initialize a CPU's per-CPU RCU data.  We take this "scorched earth"
- * approach so that we don't have to worry about how long the CPU has
- * been gone, or whether it ever was online previously.  We do trust the
- * ->mynode field, as it is constant for a given struct rcu_data and
- * initialized during early boot.
- *
- * Note that only one online or offline event can be happening at a given
- * time.  Note also that we can accept some slop in the rsp->completed
- * access due to the fact that this CPU cannot possibly have any RCU
- * callbacks in flight yet.
+ * Do boot-time initialization of a CPU's per-CPU RCU data.
+ */
+static void __init
+rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
+{
+	unsigned long flags;
+	int i;
+	struct rcu_data *rdp = rsp->rda[cpu];
+	struct rcu_node *rnp = rcu_get_root(rsp);
+
+	/* Set up local state, ensuring consistent view of global state. */
+	spin_lock_irqsave(&rnp->lock, flags);
+	rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
+	rdp->nxtlist = NULL;
+	for (i = 0; i < RCU_NEXT_SIZE; i++)
+		rdp->nxttail[i] = &rdp->nxtlist;
+	rdp->qlen = 0;
+#ifdef CONFIG_NO_HZ
+	rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
+#endif /* #ifdef CONFIG_NO_HZ */
+	rdp->cpu = cpu;
+	spin_unlock_irqrestore(&rnp->lock, flags);
+}
+
+/*
+ * Initialize a CPU's per-CPU RCU data.  Note that only one online or
+ * offline event can be happening at a given time.  Note also that we
+ * can accept some slop in the rsp->completed access due to the fact
+ * that this CPU cannot possibly have any RCU callbacks in flight yet.
  */
 static void __cpuinit
 rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
 {
 	unsigned long flags;
-	int i;
 	long lastcomp;
 	unsigned long mask;
 	struct rcu_data *rdp = rsp->rda[cpu];
@@ -1355,16 +1373,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
 	rdp->qs_pending = 1;	 /*  so set up to respond to current GP. */
 	rdp->beenonline = 1;	 /* We have now been online. */
 	rdp->passed_quiesc_completed = lastcomp - 1;
-	rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
-	rdp->nxtlist = NULL;
-	for (i = 0; i < RCU_NEXT_SIZE; i++)
-		rdp->nxttail[i] = &rdp->nxtlist;
-	rdp->qlen = 0;
 	rdp->blimit = blimit;
-#ifdef CONFIG_NO_HZ
-	rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
-#endif /* #ifdef CONFIG_NO_HZ */
-	rdp->cpu = cpu;
 	spin_unlock(&rnp->lock);		/* irqs remain disabled. */
 
 	/*
@@ -1539,8 +1548,12 @@ void __init __rcu_init(void)
 #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
 	rcu_init_one(&rcu_state);
 	RCU_DATA_PTR_INIT(&rcu_state, rcu_data);
+	for_each_possible_cpu(i)
+		rcu_boot_init_percpu_data(i, &rcu_state);
 	rcu_init_one(&rcu_bh_state);
 	RCU_DATA_PTR_INIT(&rcu_bh_state, rcu_bh_data);
+	for_each_possible_cpu(i)
+		rcu_boot_init_percpu_data(i, &rcu_bh_state);
 
 	for_each_online_cpu(i)
 		rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i);
-- 
cgit v1.2.3


From 2e597558086dec36d5c33521a36e0f6b1bc3f3a7 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sat, 15 Aug 2009 09:53:48 -0700
Subject: rcu: Simplify RCU CPU-hotplug notification

Use the new cpu_notifier() API to simplify RCU's CPU-hotplug
notifiers, collapsing down to a single such notifier.

This makes it trivial to provide the notifier-ordering
guarantee that rcu_barrier() depends on.

Also remove redundant open_softirq() calls from Hierarchical
RCU notifier.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: josht@linux.vnet.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: hugh.dickins@tiscali.co.uk
Cc: benh@kernel.crashing.org
LKML-Reference: <12503552312510-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcupdate.c   | 16 +++++++++++++++-
 kernel/rcupreempt.c | 25 ++-----------------------
 kernel/rcutree.c    | 17 +++++------------
 3 files changed, 22 insertions(+), 36 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index eae29c25fb1..8df115600c2 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -217,9 +217,13 @@ static void rcu_migrate_callback(struct rcu_head *notused)
 		wake_up(&rcu_migrate_wq);
 }
 
+extern int rcu_cpu_notify(struct notifier_block *self,
+			  unsigned long action, void *hcpu);
+
 static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
 		unsigned long action, void *hcpu)
 {
+	rcu_cpu_notify(self, action, hcpu);
 	if (action == CPU_DYING) {
 		/*
 		 * preempt_disable() in on_each_cpu() prevents stop_machine(),
@@ -244,8 +248,18 @@ static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
 
 void __init rcu_init(void)
 {
+	int i;
+
 	__rcu_init();
-	hotcpu_notifier(rcu_barrier_cpu_hotplug, 0);
+	cpu_notifier(rcu_barrier_cpu_hotplug, 0);
+
+	/*
+	 * We don't need protection against CPU-hotplug here because
+	 * this is called early in boot, before either interrupts
+	 * or the scheduler are operational.
+	 */
+	for_each_online_cpu(i)
+		rcu_barrier_cpu_hotplug(NULL, CPU_UP_PREPARE, (void *)(long)i);
 }
 
 void rcu_scheduler_starting(void)
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index beb0e659adc..9b87f5134ed 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -1417,8 +1417,8 @@ int rcu_pending(int cpu)
 	return 0;
 }
 
-static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
-				unsigned long action, void *hcpu)
+int __cpuinit rcu_cpu_notify(struct notifier_block *self,
+			     unsigned long action, void *hcpu)
 {
 	long cpu = (long)hcpu;
 
@@ -1439,10 +1439,6 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
 	return NOTIFY_OK;
 }
 
-static struct notifier_block __cpuinitdata rcu_nb = {
-	.notifier_call = rcu_cpu_notify,
-};
-
 void __init __rcu_init(void)
 {
 	int cpu;
@@ -1471,23 +1467,6 @@ void __init __rcu_init(void)
 		rdp->waitschedtail = &rdp->waitschedlist;
 		rdp->rcu_sched_sleeping = 0;
 	}
-	register_cpu_notifier(&rcu_nb);
-
-	/*
-	 * We don't need protection against CPU-Hotplug here
-	 * since
-	 * a) If a CPU comes online while we are iterating over the
-	 *    cpu_online_mask below, we would only end up making a
-	 *    duplicate call to rcu_online_cpu() which sets the corresponding
-	 *    CPU's mask in the rcu_cpu_online_map.
-	 *
-	 * b) A CPU cannot go offline at this point in time since the user
-	 *    does not have access to the sysfs interface, nor do we
-	 *    suspend the system.
-	 */
-	for_each_online_cpu(cpu)
-		rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,	(void *)(long) cpu);
-
 	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
 }
 
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index f3e43274ed5..75762cddbe0 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1132,6 +1132,8 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
 {
 	unsigned long flags;
 
+	WARN_ON_ONCE(rdp->beenonline == 0);
+
 	/*
 	 * If an RCU GP has gone long enough, go check for dyntick
 	 * idle CPUs and, if needed, send resched IPIs.
@@ -1416,14 +1418,13 @@ static void __cpuinit rcu_online_cpu(int cpu)
 {
 	rcu_init_percpu_data(cpu, &rcu_state);
 	rcu_init_percpu_data(cpu, &rcu_bh_state);
-	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
 }
 
 /*
  * Handle CPU online/offline notifcation events.
  */
-static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
-				unsigned long action, void *hcpu)
+int __cpuinit rcu_cpu_notify(struct notifier_block *self,
+			     unsigned long action, void *hcpu)
 {
 	long cpu = (long)hcpu;
 
@@ -1532,10 +1533,6 @@ do { \
 	} \
 } while (0)
 
-static struct notifier_block __cpuinitdata rcu_nb = {
-	.notifier_call	= rcu_cpu_notify,
-};
-
 void __init __rcu_init(void)
 {
 	int i;			/* All used by RCU_DATA_PTR_INIT(). */
@@ -1554,11 +1551,7 @@ void __init __rcu_init(void)
 	RCU_DATA_PTR_INIT(&rcu_bh_state, rcu_bh_data);
 	for_each_possible_cpu(i)
 		rcu_boot_init_percpu_data(i, &rcu_bh_state);
-
-	for_each_online_cpu(i)
-		rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i);
-	/* Register notifier for non-boot CPUs */
-	register_cpu_notifier(&rcu_nb);
+	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
 }
 
 module_param(blimit, int, 0);
-- 
cgit v1.2.3


From 8064d54929f23613e649dc7e14f7a94454487d58 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sat, 15 Aug 2009 09:53:49 -0700
Subject: rcu: Make preemptable RCU scan all CPUs when summing RCU counters

This patch eliminates the counter-moving during CPU-offline
notifiers, eliminating potential confusion if counters are
scanned during counter-movement process.

This confusion could result in premature ending of an RCU grace
period. For example, if there are two tasks in RCU read-side
critical sections (so that the sum of the counters is two), and
the counter for the CPU going offline is -2, then moving the
count to another CPU can result in the sum momentarily
appearing to be zero.  Since there are no memory barriers in
either case, many more such scenarios are possible.

So just don't move the counts!!!

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: josht@linux.vnet.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: hugh.dickins@tiscali.co.uk
Cc: benh@kernel.crashing.org
LKML-Reference: <12503552312863-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcupreempt.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 9b87f5134ed..2748b89910b 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -849,7 +849,7 @@ rcu_try_flip_waitzero(void)
 	/* Check to see if the sum of the "last" counters is zero. */
 
 	RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
-	for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
+	for_each_possible_cpu(cpu)
 		sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
 	if (sum != 0) {
 		RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
@@ -1067,12 +1067,6 @@ void rcu_offline_cpu(int cpu)
 			   /*  seen -after- acknowledgement. */
 	}
 
-	RCU_DATA_ME()->rcu_flipctr[0] += RCU_DATA_CPU(cpu)->rcu_flipctr[0];
-	RCU_DATA_ME()->rcu_flipctr[1] += RCU_DATA_CPU(cpu)->rcu_flipctr[1];
-
-	RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0;
-	RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0;
-
 	cpumask_clear_cpu(cpu, to_cpumask(rcu_cpu_online_map));
 
 	spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
-- 
cgit v1.2.3


From b612ba804b8a656333013ad2ee96fb2377df5dbb Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sat, 15 Aug 2009 09:53:50 -0700
Subject: rcu: Make rcupreempt_trace.c look at offline CPUs

Given that offline CPUs can now have non-zero counters, we need
to dump counters for offline CPUs as well as for online CPUs.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: josht@linux.vnet.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: hugh.dickins@tiscali.co.uk
Cc: benh@kernel.crashing.org
LKML-Reference: <12503552313921-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcupreempt_trace.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
index 7c2665cac17..11640346a50 100644
--- a/kernel/rcupreempt_trace.c
+++ b/kernel/rcupreempt_trace.c
@@ -236,12 +236,13 @@ static ssize_t rcuctrs_read(struct file *filp, char __user *buffer,
 
 	cnt += snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE,
 				"CPU last cur F M\n");
-	for_each_online_cpu(cpu) {
+	for_each_possible_cpu(cpu) {
 		long *flipctr = rcupreempt_flipctr(cpu);
 		cnt += snprintf(&rcupreempt_trace_buf[cnt],
 				RCUPREEMPT_TRACE_BUF_SIZE - cnt,
-					"%3d %4ld %3ld %d %d\n",
+					"%3d%c %4ld %3ld %d %d\n",
 			       cpu,
+			       cpu_is_offline(cpu) ? '!' : ' ',
 			       flipctr[!f],
 			       flipctr[f],
 			       rcupreempt_flip_flag(cpu),
-- 
cgit v1.2.3


From 684ca5cc9a772532bc893cdc994bd89bf0773719 Mon Sep 17 00:00:00 2001
From: Josh Triplett <josh@joshtriplett.org>
Date: Sat, 15 Aug 2009 09:53:51 -0700
Subject: rcu: Fix typo in rcu_irq_exit() comment header

Signed-off-by: Josh Triplett <josh@joshtriplett.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: josht@linux.vnet.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: hugh.dickins@tiscali.co.uk
Cc: benh@kernel.crashing.org
LKML-Reference: <1250355231169-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcupreempt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 2748b89910b..510898a7bd6 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -548,7 +548,7 @@ void rcu_irq_enter(void)
  * rcu_irq_exit - Called from exiting Hard irq context.
  *
  * If the CPU was idle with dynamic ticks active, update the
- * rcu_dyntick_sched.dynticks to put let the RCU handling be
+ * rcu_dyntick_sched.dynticks to let the RCU handling be
  * aware that the CPU is going back to idle with no ticks.
  */
 void rcu_irq_exit(void)
-- 
cgit v1.2.3


From 84bc4af59081ee974dd80210e694ab59ebe51ce8 Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Thu, 13 Aug 2009 17:36:53 -0700
Subject: futex: Detect mismatched requeue targets

There is currently no check to ensure that userspace uses the same
futex requeue target (uaddr2) in futex_requeue() that the waiter used
in futex_wait_requeue_pi().  A mismatch here could very unexpected
results as the waiter assumes it either wakes on uaddr1 or uaddr2. We
could detect this on wakeup in the waiter, but the cleanup is more
intense after the improper requeue has occured.

This patch stores the waiter's expected requeue target in a new
requeue_pi_key pointer in the futex_q which futex_requeue() checks
prior to attempting to do a proxy lock acquistion or a requeue when
requeue_pi=1. If they don't match, return -EINVAL from futex_requeue,
aborting the requeue of any remaining waiters.

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Dinakar Guniguntala <dino@in.ibm.com>
Cc: John Stultz <johnstul@us.ibm.com>
LKML-Reference: <20090814003650.14634.63916.stgit@Aeon>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/futex.c | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index d077201b393..f0dea283e77 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -115,6 +115,9 @@ struct futex_q {
 	/* rt_waiter storage for requeue_pi: */
 	struct rt_mutex_waiter *rt_waiter;
 
+	/* The expected requeue pi target futex key: */
+	union futex_key *requeue_pi_key;
+
 	/* Bitset for the optional bitmasked wakeup */
 	u32 bitset;
 };
@@ -1080,6 +1083,10 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
 	if (!top_waiter)
 		return 0;
 
+	/* Ensure we requeue to the expected futex. */
+	if (!match_futex(top_waiter->requeue_pi_key, key2))
+		return -EINVAL;
+
 	/*
 	 * Try to take the lock for top_waiter.  Set the FUTEX_WAITERS bit in
 	 * the contended case or if set_waiters is 1.  The pi_state is returned
@@ -1260,6 +1267,12 @@ retry_private:
 			continue;
 		}
 
+		/* Ensure we requeue to the expected futex for requeue_pi. */
+		if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) {
+			ret = -EINVAL;
+			break;
+		}
+
 		/*
 		 * Requeue nr_requeue waiters and possibly one more in the case
 		 * of requeue_pi if we couldn't acquire the lock atomically.
@@ -1735,6 +1748,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
 	q.pi_state = NULL;
 	q.bitset = bitset;
 	q.rt_waiter = NULL;
+	q.requeue_pi_key = NULL;
 
 	if (abs_time) {
 		to = &timeout;
@@ -1842,6 +1856,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
 
 	q.pi_state = NULL;
 	q.rt_waiter = NULL;
+	q.requeue_pi_key = NULL;
 retry:
 	q.key = FUTEX_KEY_INIT;
 	ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE);
@@ -2153,15 +2168,16 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
 	debug_rt_mutex_init_waiter(&rt_waiter);
 	rt_waiter.task = NULL;
 
-	q.pi_state = NULL;
-	q.bitset = bitset;
-	q.rt_waiter = &rt_waiter;
-
 	key2 = FUTEX_KEY_INIT;
 	ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
 	if (unlikely(ret != 0))
 		goto out;
 
+	q.pi_state = NULL;
+	q.bitset = bitset;
+	q.rt_waiter = &rt_waiter;
+	q.requeue_pi_key = &key2;
+
 	/* Prepare to wait on uaddr. */
 	ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
 	if (ret)
-- 
cgit v1.2.3


From 212274347fc4d2a7c56bf6c953b02c809e7e0be1 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 17 Aug 2009 13:39:33 +0800
Subject: lockdep: Fix missing entry in /proc/lock_stat

One entry is missing in the output of /proc/lock_stat.

The cause is, when ls_start() is called the 2nd time, we should
start from stats[@pos-1] but not stats[@pos], because pos == 0
is the header.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <4A88ED15.20800@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep_proc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index fba81f16e34..5dbe30b4e59 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -634,7 +634,7 @@ static void *ls_start(struct seq_file *m, loff_t *pos)
 	if (*pos == 0)
 		return SEQ_START_TOKEN;
 
-	data->iter = data->stats + *pos;
+	data->iter = data->stats + (*pos - 1);
 	if (data->iter >= data->iter_end)
 		data->iter = NULL;
 
-- 
cgit v1.2.3


From e9d65725bdf5954283625ca4d770bfc34f2ae56a Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 17 Aug 2009 13:39:49 +0800
Subject: lockdep: Fix missing entries in /proc/lock_chains

Two entries are missing in the output of /proc/lock_chains.

One is chains[1]. When lc_next() is called the 1st time,
chains[0] is returned. And when it's called the 2nd time,
chains[2] is returned.

The other missing ons is, when lc_start() is called the 2nd
time, we should start from chains[@pos-1] but not chains[@pos],
because pos == 0 is the header.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <4A88ED25.2040306@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep_proc.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 5dbe30b4e59..9a7996e371f 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -160,8 +160,8 @@ static void *lc_next(struct seq_file *m, void *v, loff_t *pos)
 	else {
 		chain = v;
 
-		if (*pos < nr_lock_chains)
-			chain = lock_chains + *pos;
+		if (*pos - 1 < nr_lock_chains)
+			chain = lock_chains + (*pos - 1);
 		else
 			chain = NULL;
 	}
@@ -174,8 +174,8 @@ static void *lc_start(struct seq_file *m, loff_t *pos)
 	if (*pos == 0)
 		return SEQ_START_TOKEN;
 
-	if (*pos < nr_lock_chains)
-		return lock_chains + *pos;
+	if (*pos - 1 < nr_lock_chains)
+		return lock_chains + (*pos - 1);
 
 	return NULL;
 }
-- 
cgit v1.2.3


From 8109e1de8502421f9efff1359f2779b1adcc0724 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 17 Aug 2009 13:40:17 +0800
Subject: lockdep: Simplify lockdep seqfile code

Use seq_list_start_head() and seq_list_next().

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <4A88ED41.5000000@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep_proc.c | 46 +++++-----------------------------------------
 1 file changed, 5 insertions(+), 41 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 9a7996e371f..05fb5fde9b1 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -25,38 +25,12 @@
 
 static void *l_next(struct seq_file *m, void *v, loff_t *pos)
 {
-	struct lock_class *class;
-
-	(*pos)++;
-
-	if (v == SEQ_START_TOKEN)
-		class = m->private;
-	else {
-		class = v;
-
-		if (class->lock_entry.next != &all_lock_classes)
-			class = list_entry(class->lock_entry.next,
-					   struct lock_class, lock_entry);
-		else
-			class = NULL;
-	}
-
-	return class;
+	return seq_list_next(v, &all_lock_classes, pos);
 }
 
 static void *l_start(struct seq_file *m, loff_t *pos)
 {
-	struct lock_class *class;
-	loff_t i = 0;
-
-	if (*pos == 0)
-		return SEQ_START_TOKEN;
-
-	list_for_each_entry(class, &all_lock_classes, lock_entry) {
-		if (++i == *pos)
-		return class;
-	}
-	return NULL;
+	return seq_list_start_head(&all_lock_classes, *pos);
 }
 
 static void l_stop(struct seq_file *m, void *v)
@@ -82,11 +56,11 @@ static void print_name(struct seq_file *m, struct lock_class *class)
 
 static int l_show(struct seq_file *m, void *v)
 {
-	struct lock_class *class = v;
+	struct lock_class *class = list_entry(v, struct lock_class, lock_entry);
 	struct lock_list *entry;
 	char usage[LOCK_USAGE_CHARS];
 
-	if (v == SEQ_START_TOKEN) {
+	if (v == &all_lock_classes) {
 		seq_printf(m, "all lock classes:\n");
 		return 0;
 	}
@@ -128,17 +102,7 @@ static const struct seq_operations lockdep_ops = {
 
 static int lockdep_open(struct inode *inode, struct file *file)
 {
-	int res = seq_open(file, &lockdep_ops);
-	if (!res) {
-		struct seq_file *m = file->private_data;
-
-		if (!list_empty(&all_lock_classes))
-			m->private = list_entry(all_lock_classes.next,
-					struct lock_class, lock_entry);
-		else
-			m->private = NULL;
-	}
-	return res;
+	return seq_open(file, &lockdep_ops);
 }
 
 static const struct file_operations proc_lockdep_operations = {
-- 
cgit v1.2.3


From 12aac19d4ba41019a1748f49d3c5d259b1bfb26d Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 17 Aug 2009 13:40:39 +0800
Subject: lockdep: Simplify lockdep_chains seqfile code

- make lc_next() call lc_start()
- use lock_chains directly instead of storing it in m->private

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <4A88ED57.5060609@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep_proc.c | 37 +++++++------------------------------
 1 file changed, 7 insertions(+), 30 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 05fb5fde9b1..2bbe25f55d0 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -113,26 +113,6 @@ static const struct file_operations proc_lockdep_operations = {
 };
 
 #ifdef CONFIG_PROVE_LOCKING
-static void *lc_next(struct seq_file *m, void *v, loff_t *pos)
-{
-	struct lock_chain *chain;
-
-	(*pos)++;
-
-	if (v == SEQ_START_TOKEN)
-		chain = m->private;
-	else {
-		chain = v;
-
-		if (*pos - 1 < nr_lock_chains)
-			chain = lock_chains + (*pos - 1);
-		else
-			chain = NULL;
-	}
-
-	return chain;
-}
-
 static void *lc_start(struct seq_file *m, loff_t *pos)
 {
 	if (*pos == 0)
@@ -144,6 +124,12 @@ static void *lc_start(struct seq_file *m, loff_t *pos)
 	return NULL;
 }
 
+static void *lc_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	(*pos)++;
+	return lc_start(m, pos);
+}
+
 static void lc_stop(struct seq_file *m, void *v)
 {
 }
@@ -184,16 +170,7 @@ static const struct seq_operations lockdep_chains_ops = {
 
 static int lockdep_chains_open(struct inode *inode, struct file *file)
 {
-	int res = seq_open(file, &lockdep_chains_ops);
-	if (!res) {
-		struct seq_file *m = file->private_data;
-
-		if (nr_lock_chains)
-			m->private = lock_chains;
-		else
-			m->private = NULL;
-	}
-	return res;
+	return seq_open(file, &lockdep_chains_ops);
 }
 
 static const struct file_operations proc_lockdep_chains_operations = {
-- 
cgit v1.2.3


From 96004bb2a1e4ccad2b1eeb92e51031d1e8e609e3 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 17 Aug 2009 13:40:59 +0800
Subject: lockdep: Simplify lock_stat seqfile code

- make ls_next() call ls_start()
- remove redundant code in lock_stat_release()

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <4A88ED6B.6030602@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep_proc.c | 29 +++++++----------------------
 1 file changed, 7 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 2bbe25f55d0..8dac8402e07 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -383,7 +383,6 @@ struct lock_stat_data {
 };
 
 struct lock_stat_seq {
-	struct lock_stat_data *iter;
 	struct lock_stat_data *iter_end;
 	struct lock_stat_data stats[MAX_LOCKDEP_KEYS];
 };
@@ -571,34 +570,22 @@ static void seq_header(struct seq_file *m)
 static void *ls_start(struct seq_file *m, loff_t *pos)
 {
 	struct lock_stat_seq *data = m->private;
+	struct lock_stat_data *iter;
 
 	if (*pos == 0)
 		return SEQ_START_TOKEN;
 
-	data->iter = data->stats + (*pos - 1);
-	if (data->iter >= data->iter_end)
-		data->iter = NULL;
+	iter = data->stats + (*pos - 1);
+	if (iter >= data->iter_end)
+		iter = NULL;
 
-	return data->iter;
+	return iter;
 }
 
 static void *ls_next(struct seq_file *m, void *v, loff_t *pos)
 {
-	struct lock_stat_seq *data = m->private;
-
 	(*pos)++;
-
-	if (v == SEQ_START_TOKEN)
-		data->iter = data->stats;
-	else {
-		data->iter = v;
-		data->iter++;
-	}
-
-	if (data->iter == data->iter_end)
-		data->iter = NULL;
-
-	return data->iter;
+	return ls_start(m, pos);
 }
 
 static void ls_stop(struct seq_file *m, void *v)
@@ -636,7 +623,6 @@ static int lock_stat_open(struct inode *inode, struct file *file)
 		struct lock_stat_data *iter = data->stats;
 		struct seq_file *m = file->private_data;
 
-		data->iter = iter;
 		list_for_each_entry(class, &all_lock_classes, lock_entry) {
 			iter->class = class;
 			iter->stats = lock_stats(class);
@@ -644,7 +630,7 @@ static int lock_stat_open(struct inode *inode, struct file *file)
 		}
 		data->iter_end = iter;
 
-		sort(data->stats, data->iter_end - data->iter,
+		sort(data->stats, data->iter_end - data->stats,
 				sizeof(struct lock_stat_data),
 				lock_stat_cmp, NULL);
 
@@ -679,7 +665,6 @@ static int lock_stat_release(struct inode *inode, struct file *file)
 	struct seq_file *seq = file->private_data;
 
 	vfree(seq->private);
-	seq->private = NULL;
 	return seq_release(inode, file);
 }
 
-- 
cgit v1.2.3


From b25c340c195447afb1860da580fe2a85a6b652c5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 13 Aug 2009 12:17:22 +0200
Subject: genirq: Add oneshot support

For threaded interrupt handlers we expect the hard interrupt handler
part to mask the interrupt on the originating device. The interrupt
line itself is reenabled after the hard interrupt handler has
executed.

This requires access to the originating device from hard interrupt
context which is not always possible. There are devices which can only
be accessed via a bus (i2c, spi, ...). The bus access requires thread
context. For such devices we need to keep the interrupt line masked
until the threaded handler has executed.

Add a new flag IRQF_ONESHOT which allows drivers to request that the
interrupt is not unmasked after the hard interrupt context handler has
been executed and the thread has been woken. The interrupt line is
unmasked after the thread handler function has been executed.

Note that for now IRQF_ONESHOT cannot be used with IRQF_SHARED to
avoid complex accounting mechanisms.

For oneshot interrupts the primary handler simply returns
IRQ_WAKE_THREAD and does nothing else. A generic implementation
irq_default_primary_handler() is provided to avoid useless copies all
over the place. It is automatically installed when
request_threaded_irq() is called with handler=NULL and
thread_fn!=NULL.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Mark Brown <broonie@opensource.wolfsonmicro.com>
Cc: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Cc: Trilok Soni <soni.trilok@gmail.com>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Brian Swetland <swetland@google.com>
Cc: Joonyoung Shim <jy0922.shim@samsung.com>
Cc: m.szyprowski@samsung.com
Cc: t.fujak@samsung.com
Cc: kyungmin.park@samsung.com,
Cc: David Brownell <david-b@pacbell.net>
Cc: Daniel Ribeiro <drwyrm@gmail.com>
Cc: arve@android.com
Cc: Barry Song <21cnbao@gmail.com>
---
 kernel/irq/chip.c   | 14 +++++++++++---
 kernel/irq/manage.c | 49 +++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 56 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 13c68e71b72..b08c0d24f20 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -382,7 +382,10 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
 
 	spin_lock(&desc->lock);
 	desc->status &= ~IRQ_INPROGRESS;
-	if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
+
+	if (unlikely(desc->status & IRQ_ONESHOT))
+		desc->status |= IRQ_MASKED;
+	else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
 		desc->chip->unmask(irq);
 out_unlock:
 	spin_unlock(&desc->lock);
@@ -478,8 +481,13 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
 	kstat_incr_irqs_this_cpu(irq, desc);
 
 	/* Start handling the irq */
-	if (desc->chip->ack)
-		desc->chip->ack(irq);
+	if (unlikely(desc->status & IRQ_ONESHOT)) {
+		desc->status |= IRQ_MASKED;
+		mask_ack_irq(desc, irq);
+	} else {
+		if (desc->chip->ack)
+			desc->chip->ack(irq);
+	}
 
 	/* Mark the IRQ currently in progress.*/
 	desc->status |= IRQ_INPROGRESS;
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index d222515a5a0..d7f7b5fd247 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -436,6 +436,16 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
 	return ret;
 }
 
+/*
+ * Default primary interrupt handler for threaded interrupts. Is
+ * assigned as primary handler when request_threaded_irq is called
+ * with handler == NULL. Useful for oneshot interrupts.
+ */
+static irqreturn_t irq_default_primary_handler(int irq, void *dev_id)
+{
+	return IRQ_WAKE_THREAD;
+}
+
 static int irq_wait_for_interrupt(struct irqaction *action)
 {
 	while (!kthread_should_stop()) {
@@ -451,6 +461,21 @@ static int irq_wait_for_interrupt(struct irqaction *action)
 	return -1;
 }
 
+/*
+ * Oneshot interrupts keep the irq line masked until the threaded
+ * handler finished. unmask if the interrupt has not been disabled and
+ * is marked MASKED.
+ */
+static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc)
+{
+	spin_lock_irq(&desc->lock);
+	if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) {
+		desc->status &= ~IRQ_MASKED;
+		desc->chip->unmask(irq);
+	}
+	spin_unlock_irq(&desc->lock);
+}
+
 #ifdef CONFIG_SMP
 /*
  * Check whether we need to change the affinity of the interrupt thread.
@@ -492,7 +517,7 @@ static int irq_thread(void *data)
 	struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, };
 	struct irqaction *action = data;
 	struct irq_desc *desc = irq_to_desc(action->irq);
-	int wake;
+	int wake, oneshot = desc->status & IRQ_ONESHOT;
 
 	sched_setscheduler(current, SCHED_FIFO, &param);
 	current->irqaction = action;
@@ -518,6 +543,9 @@ static int irq_thread(void *data)
 			spin_unlock_irq(&desc->lock);
 
 			action->thread_fn(action->irq, action->dev_id);
+
+			if (oneshot)
+				irq_finalize_oneshot(action->irq, desc);
 		}
 
 		wake = atomic_dec_and_test(&desc->threads_active);
@@ -590,6 +618,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 		rand_initialize_irq(irq);
 	}
 
+	/* Oneshot interrupts are not allowed with shared */
+	if ((new->flags & IRQF_ONESHOT) && (new->flags & IRQF_SHARED))
+		return -EINVAL;
+
 	/*
 	 * Threaded handler ?
 	 */
@@ -663,9 +695,12 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 			desc->status |= IRQ_PER_CPU;
 #endif
 
-		desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING |
+		desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | IRQ_ONESHOT |
 				  IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED);
 
+		if (new->flags & IRQF_ONESHOT)
+			desc->status |= IRQ_ONESHOT;
+
 		if (!(desc->status & IRQ_NOAUTOEN)) {
 			desc->depth = 0;
 			desc->status &= ~IRQ_DISABLED;
@@ -878,6 +913,8 @@ EXPORT_SYMBOL(free_irq);
  *	@irq: Interrupt line to allocate
  *	@handler: Function to be called when the IRQ occurs.
  *		  Primary handler for threaded interrupts
+ *		  If NULL and thread_fn != NULL the default
+ *		  primary handler is installed
  *	@thread_fn: Function called from the irq handler thread
  *		    If NULL, no irq thread is created
  *	@irqflags: Interrupt type flags
@@ -957,8 +994,12 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
 
 	if (desc->status & IRQ_NOREQUEST)
 		return -EINVAL;
-	if (!handler)
-		return -EINVAL;
+
+	if (!handler) {
+		if (!thread_fn)
+			return -EINVAL;
+		handler = irq_default_primary_handler;
+	}
 
 	action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
 	if (!action)
-- 
cgit v1.2.3


From 70aedd24d20e75198f5a0b11750faabbb56924e2 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 13 Aug 2009 12:17:48 +0200
Subject: genirq: Add buslock support

Some interrupt chips are connected to a "slow" bus (i2c, spi ...). The
bus access needs to sleep and therefor cannot be called in atomic
contexts.

Some of the generic interrupt management functions like disable_irq(),
enable_irq() ... call interrupt chip functions with the irq_desc->lock
held and interrupts disabled. This does not work for such devices.

Provide a separate synchronization mechanism for such interrupt
chips. The irq_chip structure is extended by two optional functions
(bus_lock and bus_sync_and_unlock).

The idea is to serialize the bus access for those operations in the
core code so that drivers which are behind that bus operated interrupt
controller do not have to worry about it and just can use the normal
interfaces. To achieve this we add two function pointers to the
irq_chip: bus_lock and bus_sync_unlock.

bus_lock() is called to serialize access to the interrupt controller
bus.

Now the core code can issue chip->mask/unmask ... commands without
changing the fast path code at all. The chip implementation merily
stores that information in a chip private data structure and
returns. No bus interaction as these functions are called from atomic
context.

After that bus_sync_unlock() is called outside the atomic context. Now
the chip implementation issues the bus commands, waits for completion
and unlocks the interrupt controller bus.

The irq_chip implementation as pseudo code:

struct irq_chip_data {
       struct mutex   mutex;
       unsigned int   irq_offset;
       unsigned long  mask;
       unsigned long  mask_status;
}

static void bus_lock(unsigned int irq)
{
        struct irq_chip_data *data = get_irq_desc_chip_data(irq);

        mutex_lock(&data->mutex);
}

static void mask(unsigned int irq)
{
        struct irq_chip_data *data = get_irq_desc_chip_data(irq);

        irq -= data->irq_offset;
        data->mask |= (1 << irq);
}

static void unmask(unsigned int irq)
{
        struct irq_chip_data *data = get_irq_desc_chip_data(irq);

        irq -= data->irq_offset;
        data->mask &= ~(1 << irq);
}

static void bus_sync_unlock(unsigned int irq)
{
        struct irq_chip_data *data = get_irq_desc_chip_data(irq);

        if (data->mask != data->mask_status) {
                do_bus_magic_to_set_mask(data->mask);
                data->mask_status = data->mask;
        }
        mutex_unlock(&data->mutex);
}

The device drivers can use request_threaded_irq, free_irq, disable_irq
and enable_irq as usual with the only restriction that the calls need
to come from non atomic context.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Mark Brown <broonie@opensource.wolfsonmicro.com>
Cc: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Cc: Trilok Soni <soni.trilok@gmail.com>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Brian Swetland <swetland@google.com>
Cc: Joonyoung Shim <jy0922.shim@samsung.com>
Cc: m.szyprowski@samsung.com
Cc: t.fujak@samsung.com
Cc: kyungmin.park@samsung.com,
Cc: David Brownell <david-b@pacbell.net>
Cc: Daniel Ribeiro <drwyrm@gmail.com>
Cc: arve@android.com
Cc: Barry Song <21cnbao@gmail.com>
---
 kernel/irq/chip.c      |  2 ++
 kernel/irq/internals.h | 13 +++++++++++++
 kernel/irq/manage.c    | 19 ++++++++++++++++++-
 3 files changed, 33 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index b08c0d24f20..f856330e684 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -580,6 +580,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
 		desc->chip = &dummy_irq_chip;
 	}
 
+	chip_bus_lock(irq, desc);
 	spin_lock_irqsave(&desc->lock, flags);
 
 	/* Uninstall? */
@@ -599,6 +600,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
 		desc->chip->startup(irq);
 	}
 	spin_unlock_irqrestore(&desc->lock, flags);
+	chip_bus_sync_unlock(irq, desc);
 }
 EXPORT_SYMBOL_GPL(__set_irq_handler);
 
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index e70ed5592eb..1b5d742c6a7 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -44,6 +44,19 @@ extern int irq_select_affinity_usr(unsigned int irq);
 
 extern void irq_set_thread_affinity(struct irq_desc *desc);
 
+/* Inline functions for support of irq chips on slow busses */
+static inline void chip_bus_lock(unsigned int irq, struct irq_desc *desc)
+{
+	if (unlikely(desc->chip->bus_lock))
+		desc->chip->bus_lock(irq);
+}
+
+static inline void chip_bus_sync_unlock(unsigned int irq, struct irq_desc *desc)
+{
+	if (unlikely(desc->chip->bus_sync_unlock))
+		desc->chip->bus_sync_unlock(irq);
+}
+
 /*
  * Debugging printout:
  */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index d7f7b5fd247..0a3fd5b524c 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -230,9 +230,11 @@ void disable_irq_nosync(unsigned int irq)
 	if (!desc)
 		return;
 
+	chip_bus_lock(irq, desc);
 	spin_lock_irqsave(&desc->lock, flags);
 	__disable_irq(desc, irq, false);
 	spin_unlock_irqrestore(&desc->lock, flags);
+	chip_bus_sync_unlock(irq, desc);
 }
 EXPORT_SYMBOL(disable_irq_nosync);
 
@@ -294,7 +296,8 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
  *	matches the last disable, processing of interrupts on this
  *	IRQ line is re-enabled.
  *
- *	This function may be called from IRQ context.
+ *	This function may be called from IRQ context only when
+ *	desc->chip->bus_lock and desc->chip->bus_sync_unlock are NULL !
  */
 void enable_irq(unsigned int irq)
 {
@@ -304,9 +307,11 @@ void enable_irq(unsigned int irq)
 	if (!desc)
 		return;
 
+	chip_bus_lock(irq, desc);
 	spin_lock_irqsave(&desc->lock, flags);
 	__enable_irq(desc, irq, false);
 	spin_unlock_irqrestore(&desc->lock, flags);
+	chip_bus_sync_unlock(irq, desc);
 }
 EXPORT_SYMBOL(enable_irq);
 
@@ -468,12 +473,14 @@ static int irq_wait_for_interrupt(struct irqaction *action)
  */
 static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc)
 {
+	chip_bus_lock(irq, desc);
 	spin_lock_irq(&desc->lock);
 	if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) {
 		desc->status &= ~IRQ_MASKED;
 		desc->chip->unmask(irq);
 	}
 	spin_unlock_irq(&desc->lock);
+	chip_bus_sync_unlock(irq, desc);
 }
 
 #ifdef CONFIG_SMP
@@ -904,7 +911,14 @@ EXPORT_SYMBOL_GPL(remove_irq);
  */
 void free_irq(unsigned int irq, void *dev_id)
 {
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	if (!desc)
+		return;
+
+	chip_bus_lock(irq, desc);
 	kfree(__free_irq(irq, dev_id));
+	chip_bus_sync_unlock(irq, desc);
 }
 EXPORT_SYMBOL(free_irq);
 
@@ -1011,7 +1025,10 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
 	action->name = devname;
 	action->dev_id = dev_id;
 
+	chip_bus_lock(irq, desc);
 	retval = __setup_irq(irq, desc, action);
+	chip_bus_sync_unlock(irq, desc);
+
 	if (retval)
 		kfree(action);
 
-- 
cgit v1.2.3


From 399b5da29b9f851eb7b96e2882097127f003e87c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 13 Aug 2009 13:21:38 +0200
Subject: genirq: Support nested threaded irq handling

Interrupt chips which are behind a slow bus (i2c, spi ...) and
demultiplex other interrupt sources need to run their interrupt
handler in a thread.

The demultiplexed interrupt handlers need to run in thread context as
well and need to finish before the demux handler thread can reenable
the interrupt line. So the easiest way is to run the sub device
handlers in the context of the demultiplexing handler thread.

To avoid that a separate thread is created for the subdevices the
function set_nested_irq_thread() is provided which sets the
IRQ_NESTED_THREAD flag in the interrupt descriptor.

A driver which calls request_threaded_irq() must not be aware of the
fact that the threaded handler is called in the context of the
demultiplexing handler thread. The setup code checks the
IRQ_NESTED_THREAD flag which was set from the irq chip setup code and
does not setup a separate thread for the interrupt. The primary
function which is provided by the device driver is replaced by an
internal dummy function which warns when it is called.

For the demultiplexing handler a helper function handle_nested_irq()
is provided which calls the demux interrupt thread function in the
context of the caller and does the proper interrupt accounting and
takes the interrupt disabled status of the demultiplexed subdevice
into account.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Mark Brown <broonie@opensource.wolfsonmicro.com>
Cc: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Cc: Trilok Soni <soni.trilok@gmail.com>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Brian Swetland <swetland@google.com>
Cc: Joonyoung Shim <jy0922.shim@samsung.com>
Cc: m.szyprowski@samsung.com
Cc: t.fujak@samsung.com
Cc: kyungmin.park@samsung.com,
Cc: David Brownell <david-b@pacbell.net>
Cc: Daniel Ribeiro <drwyrm@gmail.com>
Cc: arve@android.com
Cc: Barry Song <21cnbao@gmail.com>
---
 kernel/irq/chip.c   | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/irq/manage.c | 34 ++++++++++++++++++++++++---
 2 files changed, 98 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index f856330e684..5765aad9499 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -222,6 +222,34 @@ int set_irq_chip_data(unsigned int irq, void *data)
 }
 EXPORT_SYMBOL(set_irq_chip_data);
 
+/**
+ *	set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq
+ *
+ *	@irq:	Interrupt number
+ *	@nest:	0 to clear / 1 to set the IRQ_NESTED_THREAD flag
+ *
+ *	The IRQ_NESTED_THREAD flag indicates that on
+ *	request_threaded_irq() no separate interrupt thread should be
+ *	created for the irq as the handler are called nested in the
+ *	context of a demultiplexing interrupt handler thread.
+ */
+void set_irq_nested_thread(unsigned int irq, int nest)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+	unsigned long flags;
+
+	if (!desc)
+		return;
+
+	spin_lock_irqsave(&desc->lock, flags);
+	if (nest)
+		desc->status |= IRQ_NESTED_THREAD;
+	else
+		desc->status &= ~IRQ_NESTED_THREAD;
+	spin_unlock_irqrestore(&desc->lock, flags);
+}
+EXPORT_SYMBOL_GPL(set_irq_nested_thread);
+
 /*
  * default enable function
  */
@@ -299,6 +327,45 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq)
 	}
 }
 
+/*
+ *	handle_nested_irq - Handle a nested irq from a irq thread
+ *	@irq:	the interrupt number
+ *
+ *	Handle interrupts which are nested into a threaded interrupt
+ *	handler. The handler function is called inside the calling
+ *	threads context.
+ */
+void handle_nested_irq(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+	struct irqaction *action;
+	irqreturn_t action_ret;
+
+	might_sleep();
+
+	spin_lock_irq(&desc->lock);
+
+	kstat_incr_irqs_this_cpu(irq, desc);
+
+	action = desc->action;
+	if (unlikely(!action || (desc->status & IRQ_DISABLED)))
+		goto out_unlock;
+
+	desc->status |= IRQ_INPROGRESS;
+	spin_unlock_irq(&desc->lock);
+
+	action_ret = action->thread_fn(action->irq, action->dev_id);
+	if (!noirqdebug)
+		note_interrupt(irq, desc, action_ret);
+
+	spin_lock_irq(&desc->lock);
+	desc->status &= ~IRQ_INPROGRESS;
+
+out_unlock:
+	spin_unlock_irq(&desc->lock);
+}
+EXPORT_SYMBOL_GPL(handle_nested_irq);
+
 /**
  *	handle_simple_irq - Simple and software-decoded IRQs.
  *	@irq:	the interrupt number
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0a3fd5b524c..e485cea04bf 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -451,6 +451,16 @@ static irqreturn_t irq_default_primary_handler(int irq, void *dev_id)
 	return IRQ_WAKE_THREAD;
 }
 
+/*
+ * Primary handler for nested threaded interrupts. Should never be
+ * called.
+ */
+static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id)
+{
+	WARN(1, "Primary handler called for nested irq %d\n", irq);
+	return IRQ_NONE;
+}
+
 static int irq_wait_for_interrupt(struct irqaction *action)
 {
 	while (!kthread_should_stop()) {
@@ -600,7 +610,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 	struct irqaction *old, **old_ptr;
 	const char *old_name = NULL;
 	unsigned long flags;
-	int shared = 0;
+	int nested, shared = 0;
 	int ret;
 
 	if (!desc)
@@ -630,9 +640,27 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 		return -EINVAL;
 
 	/*
-	 * Threaded handler ?
+	 * Check whether the interrupt nests into another interrupt
+	 * thread.
+	 */
+	nested = desc->status & IRQ_NESTED_THREAD;
+	if (nested) {
+		if (!new->thread_fn)
+			return -EINVAL;
+		/*
+		 * Replace the primary handler which was provided from
+		 * the driver for non nested interrupt handling by the
+		 * dummy function which warns when called.
+		 */
+		new->handler = irq_nested_primary_handler;
+	}
+
+	/*
+	 * Create a handler thread when a thread function is supplied
+	 * and the interrupt does not nest into another interrupt
+	 * thread.
 	 */
-	if (new->thread_fn) {
+	if (new->thread_fn && !nested) {
 		struct task_struct *t;
 
 		t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
-- 
cgit v1.2.3


From 3a5209e3b26386fd174820ee6e8e27479b24869a Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Mon, 17 Aug 2009 16:00:30 -0700
Subject: trace_skb: fix build when CONFIG_NET is not enabled

Fix trace_skb_sources build when CONFIG_NET is not enabled:

kernel/built-in.o: In function `probe_skb_dequeue':
trace_skb_sources.c:(.text+0xd9152): undefined reference to `init_net'
trace_skb_sources.c:(.text+0xd9188): undefined reference to `dev_get_by_index'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/trace/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index f09d7635c0e..9a2f19bf051 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -236,6 +236,7 @@ config BOOT_TRACER
 
 config SKB_SOURCES_TRACER
 	bool "Trace skb source information"
+	depends on NET
 	select GENERIC_TRACER
 	help
 	   This tracer helps developers/sysadmins correlate skb allocation and
-- 
cgit v1.2.3


From 49a02c514d967921a908ac64e9c0ec0f0fc17fd8 Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Tue, 18 Aug 2009 12:51:52 +0200
Subject: sched: Use structure to store local data in __build_sched_domains

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <20090818105152.GB29515@alberich.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 165 +++++++++++++++++++++++++++++++--------------------------
 1 file changed, 89 insertions(+), 76 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 1b59e265273..565ff775fcd 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8091,6 +8091,22 @@ struct static_sched_domain {
 	DECLARE_BITMAP(span, CONFIG_NR_CPUS);
 };
 
+struct s_data {
+#ifdef CONFIG_NUMA
+	int			sd_allnodes;
+	cpumask_var_t		domainspan;
+	cpumask_var_t		covered;
+	cpumask_var_t		notcovered;
+#endif
+	cpumask_var_t		nodemask;
+	cpumask_var_t		this_sibling_map;
+	cpumask_var_t		this_core_map;
+	cpumask_var_t		send_covered;
+	cpumask_var_t		tmpmask;
+	struct sched_group	**sched_group_nodes;
+	struct root_domain	*rd;
+};
+
 /*
  * SMT sched-domains:
  */
@@ -8385,54 +8401,49 @@ static void set_domain_attribute(struct sched_domain *sd,
 static int __build_sched_domains(const struct cpumask *cpu_map,
 				 struct sched_domain_attr *attr)
 {
+	struct s_data d;
 	int i, err = -ENOMEM;
-	struct root_domain *rd;
-	cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered,
-		tmpmask;
 #ifdef CONFIG_NUMA
-	cpumask_var_t domainspan, covered, notcovered;
-	struct sched_group **sched_group_nodes = NULL;
-	int sd_allnodes = 0;
-
-	if (!alloc_cpumask_var(&domainspan, GFP_KERNEL))
+	d.sd_allnodes = 0;
+	if (!alloc_cpumask_var(&d.domainspan, GFP_KERNEL))
 		goto out;
-	if (!alloc_cpumask_var(&covered, GFP_KERNEL))
+	if (!alloc_cpumask_var(&d.covered, GFP_KERNEL))
 		goto free_domainspan;
-	if (!alloc_cpumask_var(&notcovered, GFP_KERNEL))
+	if (!alloc_cpumask_var(&d.notcovered, GFP_KERNEL))
 		goto free_covered;
 #endif
 
-	if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))
+	if (!alloc_cpumask_var(&d.nodemask, GFP_KERNEL))
 		goto free_notcovered;
-	if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL))
+	if (!alloc_cpumask_var(&d.this_sibling_map, GFP_KERNEL))
 		goto free_nodemask;
-	if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL))
+	if (!alloc_cpumask_var(&d.this_core_map, GFP_KERNEL))
 		goto free_this_sibling_map;
-	if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))
+	if (!alloc_cpumask_var(&d.send_covered, GFP_KERNEL))
 		goto free_this_core_map;
-	if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
+	if (!alloc_cpumask_var(&d.tmpmask, GFP_KERNEL))
 		goto free_send_covered;
 
 #ifdef CONFIG_NUMA
 	/*
 	 * Allocate the per-node list of sched groups
 	 */
-	sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *),
-				    GFP_KERNEL);
-	if (!sched_group_nodes) {
+	d.sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *),
+				      GFP_KERNEL);
+	if (!d.sched_group_nodes) {
 		printk(KERN_WARNING "Can not alloc sched group node list\n");
 		goto free_tmpmask;
 	}
 #endif
 
-	rd = alloc_rootdomain();
-	if (!rd) {
+	d.rd = alloc_rootdomain();
+	if (!d.rd) {
 		printk(KERN_WARNING "Cannot alloc root domain\n");
 		goto free_sched_groups;
 	}
 
 #ifdef CONFIG_NUMA
-	sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes;
+	sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d.sched_group_nodes;
 #endif
 
 	/*
@@ -8441,18 +8452,20 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 	for_each_cpu(i, cpu_map) {
 		struct sched_domain *sd = NULL, *p;
 
-		cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map);
+		cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
+			    cpu_map);
 
 #ifdef CONFIG_NUMA
 		if (cpumask_weight(cpu_map) >
-				SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
+				SD_NODES_PER_DOMAIN*cpumask_weight(d.nodemask)) {
 			sd = &per_cpu(allnodes_domains, i).sd;
 			SD_INIT(sd, ALLNODES);
 			set_domain_attribute(sd, attr);
 			cpumask_copy(sched_domain_span(sd), cpu_map);
-			cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
+			cpu_to_allnodes_group(i, cpu_map, &sd->groups,
+					      d.tmpmask);
 			p = sd;
-			sd_allnodes = 1;
+			d.sd_allnodes = 1;
 		} else
 			p = NULL;
 
@@ -8471,11 +8484,11 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 		sd = &per_cpu(phys_domains, i).sd;
 		SD_INIT(sd, CPU);
 		set_domain_attribute(sd, attr);
-		cpumask_copy(sched_domain_span(sd), nodemask);
+		cpumask_copy(sched_domain_span(sd), d.nodemask);
 		sd->parent = p;
 		if (p)
 			p->child = sd;
-		cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask);
+		cpu_to_phys_group(i, cpu_map, &sd->groups, d.tmpmask);
 
 #ifdef CONFIG_SCHED_MC
 		p = sd;
@@ -8486,7 +8499,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 						   cpu_coregroup_mask(i));
 		sd->parent = p;
 		p->child = sd;
-		cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
+		cpu_to_core_group(i, cpu_map, &sd->groups, d.tmpmask);
 #endif
 
 #ifdef CONFIG_SCHED_SMT
@@ -8498,54 +8511,54 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 			    topology_thread_cpumask(i), cpu_map);
 		sd->parent = p;
 		p->child = sd;
-		cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
+		cpu_to_cpu_group(i, cpu_map, &sd->groups, d.tmpmask);
 #endif
 	}
 
 #ifdef CONFIG_SCHED_SMT
 	/* Set up CPU (sibling) groups */
 	for_each_cpu(i, cpu_map) {
-		cpumask_and(this_sibling_map,
+		cpumask_and(d.this_sibling_map,
 			    topology_thread_cpumask(i), cpu_map);
-		if (i != cpumask_first(this_sibling_map))
+		if (i != cpumask_first(d.this_sibling_map))
 			continue;
 
-		init_sched_build_groups(this_sibling_map, cpu_map,
+		init_sched_build_groups(d.this_sibling_map, cpu_map,
 					&cpu_to_cpu_group,
-					send_covered, tmpmask);
+					d.send_covered, d.tmpmask);
 	}
 #endif
 
 #ifdef CONFIG_SCHED_MC
 	/* Set up multi-core groups */
 	for_each_cpu(i, cpu_map) {
-		cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map);
-		if (i != cpumask_first(this_core_map))
+		cpumask_and(d.this_core_map, cpu_coregroup_mask(i), cpu_map);
+		if (i != cpumask_first(d.this_core_map))
 			continue;
 
-		init_sched_build_groups(this_core_map, cpu_map,
+		init_sched_build_groups(d.this_core_map, cpu_map,
 					&cpu_to_core_group,
-					send_covered, tmpmask);
+					d.send_covered, d.tmpmask);
 	}
 #endif
 
 	/* Set up physical groups */
 	for (i = 0; i < nr_node_ids; i++) {
-		cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
-		if (cpumask_empty(nodemask))
+		cpumask_and(d.nodemask, cpumask_of_node(i), cpu_map);
+		if (cpumask_empty(d.nodemask))
 			continue;
 
-		init_sched_build_groups(nodemask, cpu_map,
+		init_sched_build_groups(d.nodemask, cpu_map,
 					&cpu_to_phys_group,
-					send_covered, tmpmask);
+					d.send_covered, d.tmpmask);
 	}
 
 #ifdef CONFIG_NUMA
 	/* Set up node groups */
-	if (sd_allnodes) {
+	if (d.sd_allnodes) {
 		init_sched_build_groups(cpu_map, cpu_map,
 					&cpu_to_allnodes_group,
-					send_covered, tmpmask);
+					d.send_covered, d.tmpmask);
 	}
 
 	for (i = 0; i < nr_node_ids; i++) {
@@ -8553,15 +8566,15 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 		struct sched_group *sg, *prev;
 		int j;
 
-		cpumask_clear(covered);
-		cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
-		if (cpumask_empty(nodemask)) {
-			sched_group_nodes[i] = NULL;
+		cpumask_clear(d.covered);
+		cpumask_and(d.nodemask, cpumask_of_node(i), cpu_map);
+		if (cpumask_empty(d.nodemask)) {
+			d.sched_group_nodes[i] = NULL;
 			continue;
 		}
 
-		sched_domain_node_span(i, domainspan);
-		cpumask_and(domainspan, domainspan, cpu_map);
+		sched_domain_node_span(i, d.domainspan);
+		cpumask_and(d.domainspan, d.domainspan, cpu_map);
 
 		sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
 				  GFP_KERNEL, i);
@@ -8570,30 +8583,30 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 				"node %d\n", i);
 			goto error;
 		}
-		sched_group_nodes[i] = sg;
-		for_each_cpu(j, nodemask) {
+		d.sched_group_nodes[i] = sg;
+		for_each_cpu(j, d.nodemask) {
 			struct sched_domain *sd;
 
 			sd = &per_cpu(node_domains, j).sd;
 			sd->groups = sg;
 		}
 		sg->__cpu_power = 0;
-		cpumask_copy(sched_group_cpus(sg), nodemask);
+		cpumask_copy(sched_group_cpus(sg), d.nodemask);
 		sg->next = sg;
-		cpumask_or(covered, covered, nodemask);
+		cpumask_or(d.covered, d.covered, d.nodemask);
 		prev = sg;
 
 		for (j = 0; j < nr_node_ids; j++) {
 			int n = (i + j) % nr_node_ids;
 
-			cpumask_complement(notcovered, covered);
-			cpumask_and(tmpmask, notcovered, cpu_map);
-			cpumask_and(tmpmask, tmpmask, domainspan);
-			if (cpumask_empty(tmpmask))
+			cpumask_complement(d.notcovered, d.covered);
+			cpumask_and(d.tmpmask, d.notcovered, cpu_map);
+			cpumask_and(d.tmpmask, d.tmpmask, d.domainspan);
+			if (cpumask_empty(d.tmpmask))
 				break;
 
-			cpumask_and(tmpmask, tmpmask, cpumask_of_node(n));
-			if (cpumask_empty(tmpmask))
+			cpumask_and(d.tmpmask, d.tmpmask, cpumask_of_node(n));
+			if (cpumask_empty(d.tmpmask))
 				continue;
 
 			sg = kmalloc_node(sizeof(struct sched_group) +
@@ -8605,9 +8618,9 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 				goto error;
 			}
 			sg->__cpu_power = 0;
-			cpumask_copy(sched_group_cpus(sg), tmpmask);
+			cpumask_copy(sched_group_cpus(sg), d.tmpmask);
 			sg->next = prev->next;
-			cpumask_or(covered, covered, tmpmask);
+			cpumask_or(d.covered, d.covered, d.tmpmask);
 			prev->next = sg;
 			prev = sg;
 		}
@@ -8638,13 +8651,13 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 
 #ifdef CONFIG_NUMA
 	for (i = 0; i < nr_node_ids; i++)
-		init_numa_sched_groups_power(sched_group_nodes[i]);
+		init_numa_sched_groups_power(d.sched_group_nodes[i]);
 
-	if (sd_allnodes) {
+	if (d.sd_allnodes) {
 		struct sched_group *sg;
 
 		cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
-								tmpmask);
+								d.tmpmask);
 		init_numa_sched_groups_power(sg);
 	}
 #endif
@@ -8659,42 +8672,42 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 #else
 		sd = &per_cpu(phys_domains, i).sd;
 #endif
-		cpu_attach_domain(sd, rd, i);
+		cpu_attach_domain(sd, d.rd, i);
 	}
 
 	err = 0;
 
 free_tmpmask:
-	free_cpumask_var(tmpmask);
+	free_cpumask_var(d.tmpmask);
 free_send_covered:
-	free_cpumask_var(send_covered);
+	free_cpumask_var(d.send_covered);
 free_this_core_map:
-	free_cpumask_var(this_core_map);
+	free_cpumask_var(d.this_core_map);
 free_this_sibling_map:
-	free_cpumask_var(this_sibling_map);
+	free_cpumask_var(d.this_sibling_map);
 free_nodemask:
-	free_cpumask_var(nodemask);
+	free_cpumask_var(d.nodemask);
 free_notcovered:
 #ifdef CONFIG_NUMA
-	free_cpumask_var(notcovered);
+	free_cpumask_var(d.notcovered);
 free_covered:
-	free_cpumask_var(covered);
+	free_cpumask_var(d.covered);
 free_domainspan:
-	free_cpumask_var(domainspan);
+	free_cpumask_var(d.domainspan);
 out:
 #endif
 	return err;
 
 free_sched_groups:
 #ifdef CONFIG_NUMA
-	kfree(sched_group_nodes);
+	kfree(d.sched_group_nodes);
 #endif
 	goto free_tmpmask;
 
 #ifdef CONFIG_NUMA
 error:
-	free_sched_groups(cpu_map, tmpmask);
-	free_rootdomain(rd);
+	free_sched_groups(cpu_map, d.tmpmask);
+	free_rootdomain(d.rd);
 	goto free_tmpmask;
 #endif
 }
-- 
cgit v1.2.3


From 2109b99ee192764b407dc7f52babb74740eea6f9 Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Tue, 18 Aug 2009 12:53:00 +0200
Subject: sched: Separate out allocation/free/goto-hell from
 __build_sched_domains

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <20090818105300.GC29515@alberich.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 171 +++++++++++++++++++++++++++++++++------------------------
 1 file changed, 99 insertions(+), 72 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 565ff775fcd..c5d1fee4236 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8107,6 +8107,23 @@ struct s_data {
 	struct root_domain	*rd;
 };
 
+enum s_alloc {
+	sa_sched_groups = 0,
+	sa_rootdomain,
+	sa_tmpmask,
+	sa_send_covered,
+	sa_this_core_map,
+	sa_this_sibling_map,
+	sa_nodemask,
+	sa_sched_group_nodes,
+#ifdef CONFIG_NUMA
+	sa_notcovered,
+	sa_covered,
+	sa_domainspan,
+#endif
+	sa_none,
+};
+
 /*
  * SMT sched-domains:
  */
@@ -8394,6 +8411,77 @@ static void set_domain_attribute(struct sched_domain *sd,
 	}
 }
 
+static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
+				 const struct cpumask *cpu_map)
+{
+	switch (what) {
+	case sa_sched_groups:
+		free_sched_groups(cpu_map, d->tmpmask); /* fall through */
+		d->sched_group_nodes = NULL;
+	case sa_rootdomain:
+		free_rootdomain(d->rd); /* fall through */
+	case sa_tmpmask:
+		free_cpumask_var(d->tmpmask); /* fall through */
+	case sa_send_covered:
+		free_cpumask_var(d->send_covered); /* fall through */
+	case sa_this_core_map:
+		free_cpumask_var(d->this_core_map); /* fall through */
+	case sa_this_sibling_map:
+		free_cpumask_var(d->this_sibling_map); /* fall through */
+	case sa_nodemask:
+		free_cpumask_var(d->nodemask); /* fall through */
+	case sa_sched_group_nodes:
+#ifdef CONFIG_NUMA
+		kfree(d->sched_group_nodes); /* fall through */
+	case sa_notcovered:
+		free_cpumask_var(d->notcovered); /* fall through */
+	case sa_covered:
+		free_cpumask_var(d->covered); /* fall through */
+	case sa_domainspan:
+		free_cpumask_var(d->domainspan); /* fall through */
+#endif
+	case sa_none:
+		break;
+	}
+}
+
+static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
+						   const struct cpumask *cpu_map)
+{
+#ifdef CONFIG_NUMA
+	if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL))
+		return sa_none;
+	if (!alloc_cpumask_var(&d->covered, GFP_KERNEL))
+		return sa_domainspan;
+	if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL))
+		return sa_covered;
+	/* Allocate the per-node list of sched groups */
+	d->sched_group_nodes = kcalloc(nr_node_ids,
+				      sizeof(struct sched_group *), GFP_KERNEL);
+	if (!d->sched_group_nodes) {
+		printk(KERN_WARNING "Can not alloc sched group node list\n");
+		return sa_notcovered;
+	}
+	sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
+#endif
+	if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
+		return sa_sched_group_nodes;
+	if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
+		return sa_nodemask;
+	if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
+		return sa_this_sibling_map;
+	if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
+		return sa_this_core_map;
+	if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
+		return sa_send_covered;
+	d->rd = alloc_rootdomain();
+	if (!d->rd) {
+		printk(KERN_WARNING "Cannot alloc root domain\n");
+		return sa_tmpmask;
+	}
+	return sa_rootdomain;
+}
+
 /*
  * Build sched domains for a given set of cpus and attach the sched domains
  * to the individual cpus
@@ -8401,50 +8489,17 @@ static void set_domain_attribute(struct sched_domain *sd,
 static int __build_sched_domains(const struct cpumask *cpu_map,
 				 struct sched_domain_attr *attr)
 {
+	enum s_alloc alloc_state = sa_none;
 	struct s_data d;
-	int i, err = -ENOMEM;
+	int i;
 #ifdef CONFIG_NUMA
 	d.sd_allnodes = 0;
-	if (!alloc_cpumask_var(&d.domainspan, GFP_KERNEL))
-		goto out;
-	if (!alloc_cpumask_var(&d.covered, GFP_KERNEL))
-		goto free_domainspan;
-	if (!alloc_cpumask_var(&d.notcovered, GFP_KERNEL))
-		goto free_covered;
-#endif
-
-	if (!alloc_cpumask_var(&d.nodemask, GFP_KERNEL))
-		goto free_notcovered;
-	if (!alloc_cpumask_var(&d.this_sibling_map, GFP_KERNEL))
-		goto free_nodemask;
-	if (!alloc_cpumask_var(&d.this_core_map, GFP_KERNEL))
-		goto free_this_sibling_map;
-	if (!alloc_cpumask_var(&d.send_covered, GFP_KERNEL))
-		goto free_this_core_map;
-	if (!alloc_cpumask_var(&d.tmpmask, GFP_KERNEL))
-		goto free_send_covered;
-
-#ifdef CONFIG_NUMA
-	/*
-	 * Allocate the per-node list of sched groups
-	 */
-	d.sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *),
-				      GFP_KERNEL);
-	if (!d.sched_group_nodes) {
-		printk(KERN_WARNING "Can not alloc sched group node list\n");
-		goto free_tmpmask;
-	}
 #endif
 
-	d.rd = alloc_rootdomain();
-	if (!d.rd) {
-		printk(KERN_WARNING "Cannot alloc root domain\n");
-		goto free_sched_groups;
-	}
-
-#ifdef CONFIG_NUMA
-	sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d.sched_group_nodes;
-#endif
+	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
+	if (alloc_state != sa_rootdomain)
+		goto error;
+	alloc_state = sa_sched_groups;
 
 	/*
 	 * Set up domains for cpus specified by the cpu_map.
@@ -8675,41 +8730,13 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 		cpu_attach_domain(sd, d.rd, i);
 	}
 
-	err = 0;
-
-free_tmpmask:
-	free_cpumask_var(d.tmpmask);
-free_send_covered:
-	free_cpumask_var(d.send_covered);
-free_this_core_map:
-	free_cpumask_var(d.this_core_map);
-free_this_sibling_map:
-	free_cpumask_var(d.this_sibling_map);
-free_nodemask:
-	free_cpumask_var(d.nodemask);
-free_notcovered:
-#ifdef CONFIG_NUMA
-	free_cpumask_var(d.notcovered);
-free_covered:
-	free_cpumask_var(d.covered);
-free_domainspan:
-	free_cpumask_var(d.domainspan);
-out:
-#endif
-	return err;
-
-free_sched_groups:
-#ifdef CONFIG_NUMA
-	kfree(d.sched_group_nodes);
-#endif
-	goto free_tmpmask;
+	d.sched_group_nodes = NULL; /* don't free this we still need it */
+	__free_domain_allocs(&d, sa_tmpmask, cpu_map);
+	return 0;
 
-#ifdef CONFIG_NUMA
 error:
-	free_sched_groups(cpu_map, d.tmpmask);
-	free_rootdomain(d.rd);
-	goto free_tmpmask;
-#endif
+	__free_domain_allocs(&d, alloc_state, cpu_map);
+	return -ENOMEM;
 }
 
 static int build_sched_domains(const struct cpumask *cpu_map)
-- 
cgit v1.2.3


From 7f4588f3aa395632fec9ba2e15a1920f0682fda0 Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Tue, 18 Aug 2009 12:54:06 +0200
Subject: sched: Separate out build of NUMA sched domain from
 __build_sched_domains

... to further strip down __build_sched_domains().

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <20090818105406.GD29515@alberich.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 57 ++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 32 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index c5d1fee4236..dd95a470837 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8482,6 +8482,37 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
 	return sa_rootdomain;
 }
 
+static struct sched_domain *__build_numa_sched_domains(struct s_data *d,
+	const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i)
+{
+	struct sched_domain *sd = NULL;
+#ifdef CONFIG_NUMA
+	struct sched_domain *parent;
+
+	d->sd_allnodes = 0;
+	if (cpumask_weight(cpu_map) >
+	    SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
+		sd = &per_cpu(allnodes_domains, i).sd;
+		SD_INIT(sd, ALLNODES);
+		set_domain_attribute(sd, attr);
+		cpumask_copy(sched_domain_span(sd), cpu_map);
+		cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
+		d->sd_allnodes = 1;
+	}
+	parent = sd;
+
+	sd = &per_cpu(node_domains, i).sd;
+	SD_INIT(sd, NODE);
+	set_domain_attribute(sd, attr);
+	sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
+	sd->parent = parent;
+	if (parent)
+		parent->child = sd;
+	cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
+#endif
+	return sd;
+}
+
 /*
  * Build sched domains for a given set of cpus and attach the sched domains
  * to the individual cpus
@@ -8510,31 +8541,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 		cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
 			    cpu_map);
 
-#ifdef CONFIG_NUMA
-		if (cpumask_weight(cpu_map) >
-				SD_NODES_PER_DOMAIN*cpumask_weight(d.nodemask)) {
-			sd = &per_cpu(allnodes_domains, i).sd;
-			SD_INIT(sd, ALLNODES);
-			set_domain_attribute(sd, attr);
-			cpumask_copy(sched_domain_span(sd), cpu_map);
-			cpu_to_allnodes_group(i, cpu_map, &sd->groups,
-					      d.tmpmask);
-			p = sd;
-			d.sd_allnodes = 1;
-		} else
-			p = NULL;
-
-		sd = &per_cpu(node_domains, i).sd;
-		SD_INIT(sd, NODE);
-		set_domain_attribute(sd, attr);
-		sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
-		sd->parent = p;
-		if (p)
-			p->child = sd;
-		cpumask_and(sched_domain_span(sd),
-			    sched_domain_span(sd), cpu_map);
-#endif
-
+		sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
 		p = sd;
 		sd = &per_cpu(phys_domains, i).sd;
 		SD_INIT(sd, CPU);
-- 
cgit v1.2.3


From 87cce6622c2ab2f0e96ecc2a37133378a7db3177 Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Tue, 18 Aug 2009 12:54:55 +0200
Subject: sched: Separate out build of CPU sched domain from
 __build_sched_domains

... to further strip down __build_sched_domains().

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <20090818105455.GE29515@alberich.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index dd95a470837..3d0666c5e02 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8513,6 +8513,22 @@ static struct sched_domain *__build_numa_sched_domains(struct s_data *d,
 	return sd;
 }
 
+static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
+	const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+	struct sched_domain *parent, int i)
+{
+	struct sched_domain *sd;
+	sd = &per_cpu(phys_domains, i).sd;
+	SD_INIT(sd, CPU);
+	set_domain_attribute(sd, attr);
+	cpumask_copy(sched_domain_span(sd), d->nodemask);
+	sd->parent = parent;
+	if (parent)
+		parent->child = sd;
+	cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
+	return sd;
+}
+
 /*
  * Build sched domains for a given set of cpus and attach the sched domains
  * to the individual cpus
@@ -8542,15 +8558,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 			    cpu_map);
 
 		sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
-		p = sd;
-		sd = &per_cpu(phys_domains, i).sd;
-		SD_INIT(sd, CPU);
-		set_domain_attribute(sd, attr);
-		cpumask_copy(sched_domain_span(sd), d.nodemask);
-		sd->parent = p;
-		if (p)
-			p->child = sd;
-		cpu_to_phys_group(i, cpu_map, &sd->groups, d.tmpmask);
+		sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
 
 #ifdef CONFIG_SCHED_MC
 		p = sd;
-- 
cgit v1.2.3


From 410c408108bb85f32fe132aaf448388af0b6da64 Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Tue, 18 Aug 2009 12:56:14 +0200
Subject: sched: Separate out build of MC sched domain from
 __build_sched_domains

... to further strip down __build_sched_domains().

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <20090818105614.GF29515@alberich.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 3d0666c5e02..5c829d4ba8f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8529,6 +8529,23 @@ static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
 	return sd;
 }
 
+static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
+	const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+	struct sched_domain *parent, int i)
+{
+	struct sched_domain *sd = parent;
+#ifdef CONFIG_SCHED_MC
+	sd = &per_cpu(core_domains, i).sd;
+	SD_INIT(sd, MC);
+	set_domain_attribute(sd, attr);
+	cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
+	sd->parent = parent;
+	parent->child = sd;
+	cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
+#endif
+	return sd;
+}
+
 /*
  * Build sched domains for a given set of cpus and attach the sched domains
  * to the individual cpus
@@ -8559,18 +8576,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 
 		sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
 		sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
-
-#ifdef CONFIG_SCHED_MC
-		p = sd;
-		sd = &per_cpu(core_domains, i).sd;
-		SD_INIT(sd, MC);
-		set_domain_attribute(sd, attr);
-		cpumask_and(sched_domain_span(sd), cpu_map,
-						   cpu_coregroup_mask(i));
-		sd->parent = p;
-		p->child = sd;
-		cpu_to_core_group(i, cpu_map, &sd->groups, d.tmpmask);
-#endif
+		sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
 
 #ifdef CONFIG_SCHED_SMT
 		p = sd;
-- 
cgit v1.2.3


From d81735355533cd4b2bce9508d86fcad24a38cf47 Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Tue, 18 Aug 2009 12:57:03 +0200
Subject: sched: Separate out build of SMT sched domain from
 __build_sched_domains

... to further strip down __build_sched_domains().

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <20090818105703.GG29515@alberich.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 32 +++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 5c829d4ba8f..2ecec06e3f0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8546,6 +8546,23 @@ static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
 	return sd;
 }
 
+static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
+	const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+	struct sched_domain *parent, int i)
+{
+	struct sched_domain *sd = parent;
+#ifdef CONFIG_SCHED_SMT
+	sd = &per_cpu(cpu_domains, i).sd;
+	SD_INIT(sd, SIBLING);
+	set_domain_attribute(sd, attr);
+	cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
+	sd->parent = parent;
+	parent->child = sd;
+	cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
+#endif
+	return sd;
+}
+
 /*
  * Build sched domains for a given set of cpus and attach the sched domains
  * to the individual cpus
@@ -8569,7 +8586,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 	 * Set up domains for cpus specified by the cpu_map.
 	 */
 	for_each_cpu(i, cpu_map) {
-		struct sched_domain *sd = NULL, *p;
+		struct sched_domain *sd;
 
 		cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
 			    cpu_map);
@@ -8577,18 +8594,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 		sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
 		sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
 		sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
-
-#ifdef CONFIG_SCHED_SMT
-		p = sd;
-		sd = &per_cpu(cpu_domains, i).sd;
-		SD_INIT(sd, SIBLING);
-		set_domain_attribute(sd, attr);
-		cpumask_and(sched_domain_span(sd),
-			    topology_thread_cpumask(i), cpu_map);
-		sd->parent = p;
-		p->child = sd;
-		cpu_to_cpu_group(i, cpu_map, &sd->groups, d.tmpmask);
-#endif
+		sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
 	}
 
 #ifdef CONFIG_SCHED_SMT
-- 
cgit v1.2.3


From 0e8e85c941d8f1b43bcc2e3b8b7026cdae476c53 Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Tue, 18 Aug 2009 12:57:51 +0200
Subject: sched: Separate out build of SMT sched groups from
 __build_sched_domains

... to further strip down __build_sched_domains().

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <20090818105751.GH29515@alberich.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 31 ++++++++++++++++++++-----------
 1 file changed, 20 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 2ecec06e3f0..43cfc6e54e9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8563,6 +8563,25 @@ static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
 	return sd;
 }
 
+static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
+			       const struct cpumask *cpu_map, int cpu)
+{
+	switch (l) {
+#ifdef CONFIG_SCHED_SMT
+	case SD_LV_SIBLING: /* set up CPU (sibling) groups */
+		cpumask_and(d->this_sibling_map, cpu_map,
+			    topology_thread_cpumask(cpu));
+		if (cpu == cpumask_first(d->this_sibling_map))
+			init_sched_build_groups(d->this_sibling_map, cpu_map,
+						&cpu_to_cpu_group,
+						d->send_covered, d->tmpmask);
+		break;
+#endif
+	default:
+		break;
+	}
+}
+
 /*
  * Build sched domains for a given set of cpus and attach the sched domains
  * to the individual cpus
@@ -8597,19 +8616,9 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 		sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
 	}
 
-#ifdef CONFIG_SCHED_SMT
-	/* Set up CPU (sibling) groups */
 	for_each_cpu(i, cpu_map) {
-		cpumask_and(d.this_sibling_map,
-			    topology_thread_cpumask(i), cpu_map);
-		if (i != cpumask_first(d.this_sibling_map))
-			continue;
-
-		init_sched_build_groups(d.this_sibling_map, cpu_map,
-					&cpu_to_cpu_group,
-					d.send_covered, d.tmpmask);
+		build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
 	}
-#endif
 
 #ifdef CONFIG_SCHED_MC
 	/* Set up multi-core groups */
-- 
cgit v1.2.3


From a2af04cdbb748158043e31799b28c48272081600 Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Tue, 18 Aug 2009 12:58:38 +0200
Subject: sched: Separate out build of MC sched groups from
 __build_sched_domains

... to further strip down __build_sched_domains().

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <20090818105838.GI29515@alberich.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 43cfc6e54e9..f2c202f6629 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8576,6 +8576,15 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
 						&cpu_to_cpu_group,
 						d->send_covered, d->tmpmask);
 		break;
+#endif
+#ifdef CONFIG_SCHED_MC
+	case SD_LV_MC: /* set up multi-core groups */
+		cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
+		if (cpu == cpumask_first(d->this_core_map))
+			init_sched_build_groups(d->this_core_map, cpu_map,
+						&cpu_to_core_group,
+						d->send_covered, d->tmpmask);
+		break;
 #endif
 	default:
 		break;
@@ -8618,21 +8627,9 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 
 	for_each_cpu(i, cpu_map) {
 		build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
+		build_sched_groups(&d, SD_LV_MC, cpu_map, i);
 	}
 
-#ifdef CONFIG_SCHED_MC
-	/* Set up multi-core groups */
-	for_each_cpu(i, cpu_map) {
-		cpumask_and(d.this_core_map, cpu_coregroup_mask(i), cpu_map);
-		if (i != cpumask_first(d.this_core_map))
-			continue;
-
-		init_sched_build_groups(d.this_core_map, cpu_map,
-					&cpu_to_core_group,
-					d.send_covered, d.tmpmask);
-	}
-#endif
-
 	/* Set up physical groups */
 	for (i = 0; i < nr_node_ids; i++) {
 		cpumask_and(d.nodemask, cpumask_of_node(i), cpu_map);
-- 
cgit v1.2.3


From 86548096f252bfe2065f1ea2d301e7319a16375d Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Tue, 18 Aug 2009 12:59:28 +0200
Subject: sched: Separate out build of CPU sched groups from
 __build_sched_domains

... to further strip down __build_sched_domains().

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <20090818105928.GJ29515@alberich.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index f2c202f6629..b09a41c93ae 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8586,6 +8586,13 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
 						d->send_covered, d->tmpmask);
 		break;
 #endif
+	case SD_LV_CPU: /* set up physical groups */
+		cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
+		if (!cpumask_empty(d->nodemask))
+			init_sched_build_groups(d->nodemask, cpu_map,
+						&cpu_to_phys_group,
+						d->send_covered, d->tmpmask);
+		break;
 	default:
 		break;
 	}
@@ -8631,15 +8638,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 	}
 
 	/* Set up physical groups */
-	for (i = 0; i < nr_node_ids; i++) {
-		cpumask_and(d.nodemask, cpumask_of_node(i), cpu_map);
-		if (cpumask_empty(d.nodemask))
-			continue;
-
-		init_sched_build_groups(d.nodemask, cpu_map,
-					&cpu_to_phys_group,
-					d.send_covered, d.tmpmask);
-	}
+	for (i = 0; i < nr_node_ids; i++)
+		build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
 
 #ifdef CONFIG_NUMA
 	/* Set up node groups */
-- 
cgit v1.2.3


From de616e36c700dc312d9021dd75f769c463f85122 Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Tue, 18 Aug 2009 13:00:13 +0200
Subject: sched: Separate out build of ALLNODES sched groups from
 __build_sched_domains

For the sake of completeness.
Now all calls to init_sched_build_groups() are contained in
build_sched_groups().

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <20090818110013.GK29515@alberich.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index b09a41c93ae..52c1953bc41 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8593,6 +8593,12 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
 						&cpu_to_phys_group,
 						d->send_covered, d->tmpmask);
 		break;
+#ifdef CONFIG_NUMA
+	case SD_LV_ALLNODES:
+		init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group,
+					d->send_covered, d->tmpmask);
+		break;
+#endif
 	default:
 		break;
 	}
@@ -8643,11 +8649,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 
 #ifdef CONFIG_NUMA
 	/* Set up node groups */
-	if (d.sd_allnodes) {
-		init_sched_build_groups(cpu_map, cpu_map,
-					&cpu_to_allnodes_group,
-					d.send_covered, d.tmpmask);
-	}
+	if (d.sd_allnodes)
+		build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
 
 	for (i = 0; i < nr_node_ids; i++) {
 		/* Set up node groups */
-- 
cgit v1.2.3


From 0601a88d8fa4508eaa49a6d96c6685e1dece38e3 Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Tue, 18 Aug 2009 13:01:11 +0200
Subject: sched: Separate out build of NUMA sched groups from
 __build_sched_domains

... to further strip down __build_sched_domains().

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <20090818110111.GL29515@alberich.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 130 +++++++++++++++++++++++++++++----------------------------
 1 file changed, 67 insertions(+), 63 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 52c1953bc41..c1ce884a016 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8246,6 +8246,71 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
 		sg = sg->next;
 	} while (sg != group_head);
 }
+
+static int build_numa_sched_groups(struct s_data *d,
+				   const struct cpumask *cpu_map, int num)
+{
+	struct sched_domain *sd;
+	struct sched_group *sg, *prev;
+	int n, j;
+
+	cpumask_clear(d->covered);
+	cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
+	if (cpumask_empty(d->nodemask)) {
+		d->sched_group_nodes[num] = NULL;
+		goto out;
+	}
+
+	sched_domain_node_span(num, d->domainspan);
+	cpumask_and(d->domainspan, d->domainspan, cpu_map);
+
+	sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
+			  GFP_KERNEL, num);
+	if (!sg) {
+		printk(KERN_WARNING "Can not alloc domain group for node %d\n",
+		       num);
+		return -ENOMEM;
+	}
+	d->sched_group_nodes[num] = sg;
+
+	for_each_cpu(j, d->nodemask) {
+		sd = &per_cpu(node_domains, j).sd;
+		sd->groups = sg;
+	}
+
+	sg->__cpu_power = 0;
+	cpumask_copy(sched_group_cpus(sg), d->nodemask);
+	sg->next = sg;
+	cpumask_or(d->covered, d->covered, d->nodemask);
+
+	prev = sg;
+	for (j = 0; j < nr_node_ids; j++) {
+		n = (num + j) % nr_node_ids;
+		cpumask_complement(d->notcovered, d->covered);
+		cpumask_and(d->tmpmask, d->notcovered, cpu_map);
+		cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
+		if (cpumask_empty(d->tmpmask))
+			break;
+		cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
+		if (cpumask_empty(d->tmpmask))
+			continue;
+		sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
+				  GFP_KERNEL, num);
+		if (!sg) {
+			printk(KERN_WARNING
+			       "Can not alloc domain group for node %d\n", j);
+			return -ENOMEM;
+		}
+		sg->__cpu_power = 0;
+		cpumask_copy(sched_group_cpus(sg), d->tmpmask);
+		sg->next = prev->next;
+		cpumask_or(d->covered, d->covered, d->tmpmask);
+		prev->next = sg;
+		prev = sg;
+	}
+out:
+	return 0;
+}
 #endif /* CONFIG_NUMA */
 
 #ifdef CONFIG_NUMA
@@ -8652,70 +8717,9 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 	if (d.sd_allnodes)
 		build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
 
-	for (i = 0; i < nr_node_ids; i++) {
-		/* Set up node groups */
-		struct sched_group *sg, *prev;
-		int j;
-
-		cpumask_clear(d.covered);
-		cpumask_and(d.nodemask, cpumask_of_node(i), cpu_map);
-		if (cpumask_empty(d.nodemask)) {
-			d.sched_group_nodes[i] = NULL;
-			continue;
-		}
-
-		sched_domain_node_span(i, d.domainspan);
-		cpumask_and(d.domainspan, d.domainspan, cpu_map);
-
-		sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
-				  GFP_KERNEL, i);
-		if (!sg) {
-			printk(KERN_WARNING "Can not alloc domain group for "
-				"node %d\n", i);
+	for (i = 0; i < nr_node_ids; i++)
+		if (build_numa_sched_groups(&d, cpu_map, i))
 			goto error;
-		}
-		d.sched_group_nodes[i] = sg;
-		for_each_cpu(j, d.nodemask) {
-			struct sched_domain *sd;
-
-			sd = &per_cpu(node_domains, j).sd;
-			sd->groups = sg;
-		}
-		sg->__cpu_power = 0;
-		cpumask_copy(sched_group_cpus(sg), d.nodemask);
-		sg->next = sg;
-		cpumask_or(d.covered, d.covered, d.nodemask);
-		prev = sg;
-
-		for (j = 0; j < nr_node_ids; j++) {
-			int n = (i + j) % nr_node_ids;
-
-			cpumask_complement(d.notcovered, d.covered);
-			cpumask_and(d.tmpmask, d.notcovered, cpu_map);
-			cpumask_and(d.tmpmask, d.tmpmask, d.domainspan);
-			if (cpumask_empty(d.tmpmask))
-				break;
-
-			cpumask_and(d.tmpmask, d.tmpmask, cpumask_of_node(n));
-			if (cpumask_empty(d.tmpmask))
-				continue;
-
-			sg = kmalloc_node(sizeof(struct sched_group) +
-					  cpumask_size(),
-					  GFP_KERNEL, i);
-			if (!sg) {
-				printk(KERN_WARNING
-				"Can not alloc domain group for node %d\n", j);
-				goto error;
-			}
-			sg->__cpu_power = 0;
-			cpumask_copy(sched_group_cpus(sg), d.tmpmask);
-			sg->next = prev->next;
-			cpumask_or(d.covered, d.covered, d.tmpmask);
-			prev->next = sg;
-			prev = sg;
-		}
-	}
 #endif
 
 	/* Calculate CPU power for physical packages and nodes */
-- 
cgit v1.2.3


From 294b0c9619a0469a3b385b6fc47e79f64222a692 Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Tue, 18 Aug 2009 13:02:29 +0200
Subject: sched: Consolidate definition of variable sd in __build_sched_domains

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <20090818110229.GM29515@alberich.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index c1ce884a016..cf4c953d648 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8678,6 +8678,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 {
 	enum s_alloc alloc_state = sa_none;
 	struct s_data d;
+	struct sched_domain *sd;
 	int i;
 #ifdef CONFIG_NUMA
 	d.sd_allnodes = 0;
@@ -8692,8 +8693,6 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 	 * Set up domains for cpus specified by the cpu_map.
 	 */
 	for_each_cpu(i, cpu_map) {
-		struct sched_domain *sd;
-
 		cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
 			    cpu_map);
 
@@ -8725,22 +8724,19 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 	/* Calculate CPU power for physical packages and nodes */
 #ifdef CONFIG_SCHED_SMT
 	for_each_cpu(i, cpu_map) {
-		struct sched_domain *sd = &per_cpu(cpu_domains, i).sd;
-
+		sd = &per_cpu(cpu_domains, i).sd;
 		init_sched_groups_power(i, sd);
 	}
 #endif
 #ifdef CONFIG_SCHED_MC
 	for_each_cpu(i, cpu_map) {
-		struct sched_domain *sd = &per_cpu(core_domains, i).sd;
-
+		sd = &per_cpu(core_domains, i).sd;
 		init_sched_groups_power(i, sd);
 	}
 #endif
 
 	for_each_cpu(i, cpu_map) {
-		struct sched_domain *sd = &per_cpu(phys_domains, i).sd;
-
+		sd = &per_cpu(phys_domains, i).sd;
 		init_sched_groups_power(i, sd);
 	}
 
@@ -8759,7 +8755,6 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 
 	/* Attach the domains */
 	for_each_cpu(i, cpu_map) {
-		struct sched_domain *sd;
 #ifdef CONFIG_SCHED_SMT
 		sd = &per_cpu(cpu_domains, i).sd;
 #elif defined(CONFIG_SCHED_MC)
-- 
cgit v1.2.3


From b08dc3eba0c34027010caeda258f495074ae3a54 Mon Sep 17 00:00:00 2001
From: Huang Weiyi <weiyi.huang@gmail.com>
Date: Tue, 18 Aug 2009 21:57:13 +0800
Subject: Security/SELinux: remove duplicated #include

Remove duplicated #include('s) in
  kernel/sysctl.c

Signed-off-by: Huang Weiyi <weiyi.huang@gmail.com>
Acked-by: Eric Paris <eparis@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/sysctl.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 58be76017fd..71d8dc7f992 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -49,7 +49,6 @@
 #include <linux/acpi.h>
 #include <linux/reboot.h>
 #include <linux/ftrace.h>
-#include <linux/security.h>
 #include <linux/slow-work.h>
 #include <linux/perf_counter.h>
 
-- 
cgit v1.2.3


From de481560eb0bd9d940b90311eba85711e4b1150b Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 30 Apr 2009 12:32:04 -0400
Subject: kconfig: keep config.gz around even if CONFIG_IKCONFIG_PROC is not
 set

If CONFIG_IKCONFIG is set but CONFIG_IKCONFIG_PROC is not, then
gcc will optimize the config.gz out, because nobody uses it.

This patch adds "__used" to the config.gz data to keep it around so that
code like extract-ikconfig can still find it.

[ Impact: allow extract-ikconfig to find config.gz ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index 2093a691f1c..d0c84e6bf50 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -119,7 +119,7 @@ $(obj)/config_data.gz: .config FORCE
 	$(call if_changed,gzip)
 
 quiet_cmd_ikconfiggz = IKCFG   $@
-      cmd_ikconfiggz = (echo "static const char kernel_config_data[] = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@
+      cmd_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@
 targets += config_data.h
 $(obj)/config_data.h: $(obj)/config_data.gz FORCE
 	$(call if_changed,ikconfiggz)
-- 
cgit v1.2.3


From 1423cc033df017c762a9155eec470da77a460141 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 18 Aug 2009 23:06:14 -0700
Subject: rcu: Delay rcu_barrier() wait until beginning of next CPU-hotunplug
 operation.

Ingo Molnar reported this lockup:

 [  200.380003] Hangcheck: hangcheck value past margin!
 [  248.192003] INFO: task S99local:2974 blocked for more than 120 seconds.
 [  248.194532] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
 [  248.202330] S99local      D 0000000c  6256  2974   2687 0x00000000
 [  248.208929]  9c7ebe90 00000086 6b67ef8b 0000000c 9f25a610 81a69869 00000001 820b6990
 [  248.216123]  820b6990 820b6990 9c6e4c20 9c6e4eb4 82c78990 00000000 6b993559 0000000c
 [  248.220616]  9c7ebe90 8105f22a 9c6e4eb4 9c6e4c20 00000001 9c7ebe98 9c7ebeb4 81a65cb3
 [  248.229990] Call Trace:
 [  248.234049]  [<81a69869>] ? _spin_unlock_irqrestore+0x22/0x37
 [  248.239769]  [<8105f22a>] ? prepare_to_wait+0x48/0x4e
 [  248.244796]  [<81a65cb3>] rcu_barrier_cpu_hotplug+0xaa/0xc9
 [  248.250343]  [<8105f029>] ? autoremove_wake_function+0x0/0x38
 [  248.256063]  [<81062cf2>] notifier_call_chain+0x49/0x71
 [  248.261263]  [<81062da0>] raw_notifier_call_chain+0x11/0x13
 [  248.266809]  [<81a0b475>] _cpu_down+0x272/0x288
 [  248.271316]  [<81a0b4d5>] cpu_down+0x4a/0xa2
 [  248.275563]  [<81a0c48a>] store_online+0x2a/0x5e
 [  248.280156]  [<81a0c460>] ? store_online+0x0/0x5e
 [  248.284836]  [<814ddc35>] sysdev_store+0x20/0x28
 [  248.289429]  [<8112e403>] sysfs_write_file+0xb8/0xe3
 [  248.294369]  [<8112e34b>] ? sysfs_write_file+0x0/0xe3
 [  248.299396]  [<810e4c8f>] vfs_write+0x91/0x120
 [  248.303817]  [<810e4dc1>] sys_write+0x40/0x65
 [  248.308150]  [<81002d73>] sysenter_do_call+0x12/0x28

This change moves an RCU grace period delay off of the
critical path for CPU-hotunplug operations.

Since RCU callback migration is only performed on
CPU-hotunplug operations, and since the rcu_barrier() race is
provoked only by consecutive CPU-hotunplug operations, it is
not necessary to delay the end of a given CPU-hotunplug
operation.

We can instead choose to delay the beginning of the next
CPU-hotunplug operation.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Josh Triplett <josht@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: hugh.dickins@tiscali.co.uk
Cc: benh@kernel.crashing.org
LKML-Reference: <20090819060614.GA14383@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcupdate.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 8df115600c2..bd5d5c8e514 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -238,7 +238,8 @@ static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
 		call_rcu_bh(rcu_migrate_head, rcu_migrate_callback);
 		call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback);
 		call_rcu(rcu_migrate_head + 2, rcu_migrate_callback);
-	} else if (action == CPU_POST_DEAD) {
+	} else if (action == CPU_DOWN_PREPARE) {
+		/* Don't need to wait until next removal operation. */
 		/* rcu_migrate_head is protected by cpu_add_remove_lock */
 		wait_migrated_callbacks();
 	}
-- 
cgit v1.2.3


From cde7e5ca4e329a157108769d1f752d191cbb71c6 Mon Sep 17 00:00:00 2001
From: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Date: Tue, 18 Aug 2009 13:01:01 +0900
Subject: sched: Use for_each_class macro in move_one_task()

Replace for loop with the macro for_each_class to cleanup.

Signed-off-by: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
LKML-Reference: <4A8A277D.4090304@ct.jp.nec.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 7f83be35d65..1b529efe887 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3461,9 +3461,10 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
 {
 	const struct sched_class *class;
 
-	for (class = sched_class_highest; class; class = class->next)
+	for_each_class(class) {
 		if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
 			return 1;
+	}
 
 	return 0;
 }
-- 
cgit v1.2.3


From a8af7246c114bfd939e539f9566b872c06f6225c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 21 Aug 2009 13:58:54 +0200
Subject: sched: Avoid division by zero

Patch a5004278f0525dcb9aa43703ef77bf371ea837cd (sched: Fix
cgroup smp fairness) introduced the possibility of a
divide-by-zero because load-balancing is not synchronized
between sched_domains.

This can cause the state of cpus to change between the first
and second loop over the sched domain in tg_shares_up().

Reported-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Jes Sorensen <jes@sgi.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <1250855934.7538.30.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 1b529efe887..8f8a98eab9d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1522,7 +1522,8 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
  */
 static void
 update_group_shares_cpu(struct task_group *tg, int cpu,
-			unsigned long sd_shares, unsigned long sd_rq_weight)
+			unsigned long sd_shares, unsigned long sd_rq_weight,
+			unsigned long sd_eff_weight)
 {
 	unsigned long rq_weight;
 	unsigned long shares;
@@ -1535,13 +1536,15 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
 	if (!rq_weight) {
 		boost = 1;
 		rq_weight = NICE_0_LOAD;
+		if (sd_rq_weight == sd_eff_weight)
+			sd_eff_weight += NICE_0_LOAD;
+		sd_rq_weight = sd_eff_weight;
 	}
 
 	/*
-	 *           \Sum shares * rq_weight
-	 * shares =  -----------------------
-	 *               \Sum rq_weight
-	 *
+	 *             \Sum_j shares_j * rq_weight_i
+	 * shares_i =  -----------------------------
+	 *                  \Sum_j rq_weight_j
 	 */
 	shares = (sd_shares * rq_weight) / sd_rq_weight;
 	shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
@@ -1593,14 +1596,8 @@ static int tg_shares_up(struct task_group *tg, void *data)
 	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
 		shares = tg->shares;
 
-	for_each_cpu(i, sched_domain_span(sd)) {
-		unsigned long sd_rq_weight = rq_weight;
-
-		if (!tg->cfs_rq[i]->rq_weight)
-			sd_rq_weight = eff_weight;
-
-		update_group_shares_cpu(tg, i, shares, sd_rq_weight);
-	}
+	for_each_cpu(i, sched_domain_span(sd))
+		update_group_shares_cpu(tg, i, shares, rq_weight, eff_weight);
 
 	return 0;
 }
-- 
cgit v1.2.3


From 5e928f77a09a07f9dd595bb8a489965d69a83458 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Tue, 18 Aug 2009 23:38:32 +0200
Subject: PM: Introduce core framework for run-time PM of I/O devices (rev. 17)

Introduce a core framework for run-time power management of I/O
devices.  Add device run-time PM fields to 'struct dev_pm_info'
and device run-time PM callbacks to 'struct dev_pm_ops'.  Introduce
a run-time PM workqueue and define some device run-time PM helper
functions at the core level.  Document all these things.

Special thanks to Alan Stern for his help with the design and
multiple detailed reviews of the pereceding versions of this patch
and to Magnus Damm for testing feedback.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Magnus Damm <damm@igel.co.jp>
---
 kernel/power/Kconfig | 14 ++++++++++++++
 kernel/power/main.c  | 17 +++++++++++++++++
 2 files changed, 31 insertions(+)

(limited to 'kernel')

diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 72067cbdb37..91e09d3b2eb 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -208,3 +208,17 @@ config APM_EMULATION
 	  random kernel OOPSes or reboots that don't seem to be related to
 	  anything, try disabling/enabling this option (or disabling/enabling
 	  APM in your BIOS).
+
+config PM_RUNTIME
+	bool "Run-time PM core functionality"
+	depends on PM
+	---help---
+	  Enable functionality allowing I/O devices to be put into energy-saving
+	  (low power) states at run time (or autosuspended) after a specified
+	  period of inactivity and woken up in response to a hardware-generated
+	  wake-up event or a driver's request.
+
+	  Hardware support is generally required for this functionality to work
+	  and the bus type drivers of the buses the devices are on are
+	  responsible for the actual handling of the autosuspend requests and
+	  wake-up events.
diff --git a/kernel/power/main.c b/kernel/power/main.c
index f710e36930c..347d2cc88cd 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -11,6 +11,7 @@
 #include <linux/kobject.h>
 #include <linux/string.h>
 #include <linux/resume-trace.h>
+#include <linux/workqueue.h>
 
 #include "power.h"
 
@@ -217,8 +218,24 @@ static struct attribute_group attr_group = {
 	.attrs = g,
 };
 
+#ifdef CONFIG_PM_RUNTIME
+struct workqueue_struct *pm_wq;
+
+static int __init pm_start_workqueue(void)
+{
+	pm_wq = create_freezeable_workqueue("pm");
+
+	return pm_wq ? 0 : -ENOMEM;
+}
+#else
+static inline int pm_start_workqueue(void) { return 0; }
+#endif
+
 static int __init pm_init(void)
 {
+	int error = pm_start_workqueue();
+	if (error)
+		return error;
 	power_kobj = kobject_create_and_add("power", NULL);
 	if (!power_kobj)
 		return -ENOMEM;
-- 
cgit v1.2.3


From 9f77da9f40045253e91f55c12d4481254b513d2d Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sat, 22 Aug 2009 13:56:45 -0700
Subject: rcu: Move private definitions from include/linux/rcutree.h to
 kernel/rcutree.h

Some information hiding that makes it easier to merge
preemptability into rcutree without descending into #include
hell.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: josht@linux.vnet.ibm.com
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
LKML-Reference: <1250974613373-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutree.c       |   2 +
 kernel/rcutree.h       | 238 +++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/rcutree_trace.c |   1 +
 3 files changed, 241 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 75762cddbe0..a162f859dd3 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -46,6 +46,8 @@
 #include <linux/mutex.h>
 #include <linux/time.h>
 
+#include "rcutree.h"
+
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 static struct lock_class_key rcu_lock_key;
 struct lockdep_map rcu_lock_map =
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 5e872bbf07f..7cc830a1c44 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -1,3 +1,239 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion (tree-based version)
+ * Internal non-public definitions.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2008
+ *
+ * Author: Ingo Molnar <mingo@elte.hu>
+ *	   Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ */
+
+#include <linux/cache.h>
+#include <linux/spinlock.h>
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/seqlock.h>
+
+/*
+ * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
+ * In theory, it should be possible to add more levels straightforwardly.
+ * In practice, this has not been tested, so there is probably some
+ * bug somewhere.
+ */
+#define MAX_RCU_LVLS 3
+#define RCU_FANOUT	      (CONFIG_RCU_FANOUT)
+#define RCU_FANOUT_SQ	      (RCU_FANOUT * RCU_FANOUT)
+#define RCU_FANOUT_CUBE	      (RCU_FANOUT_SQ * RCU_FANOUT)
+
+#if NR_CPUS <= RCU_FANOUT
+#  define NUM_RCU_LVLS	      1
+#  define NUM_RCU_LVL_0	      1
+#  define NUM_RCU_LVL_1	      (NR_CPUS)
+#  define NUM_RCU_LVL_2	      0
+#  define NUM_RCU_LVL_3	      0
+#elif NR_CPUS <= RCU_FANOUT_SQ
+#  define NUM_RCU_LVLS	      2
+#  define NUM_RCU_LVL_0	      1
+#  define NUM_RCU_LVL_1	      (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT)
+#  define NUM_RCU_LVL_2	      (NR_CPUS)
+#  define NUM_RCU_LVL_3	      0
+#elif NR_CPUS <= RCU_FANOUT_CUBE
+#  define NUM_RCU_LVLS	      3
+#  define NUM_RCU_LVL_0	      1
+#  define NUM_RCU_LVL_1	      (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ)
+#  define NUM_RCU_LVL_2	      (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT))
+#  define NUM_RCU_LVL_3	      NR_CPUS
+#else
+# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
+#endif /* #if (NR_CPUS) <= RCU_FANOUT */
+
+#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
+#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
+
+/*
+ * Dynticks per-CPU state.
+ */
+struct rcu_dynticks {
+	int dynticks_nesting;	/* Track nesting level, sort of. */
+	int dynticks;		/* Even value for dynticks-idle, else odd. */
+	int dynticks_nmi;	/* Even value for either dynticks-idle or */
+				/*  not in nmi handler, else odd.  So this */
+				/*  remains even for nmi from irq handler. */
+};
+
+/*
+ * Definition for node within the RCU grace-period-detection hierarchy.
+ */
+struct rcu_node {
+	spinlock_t lock;
+	unsigned long qsmask;	/* CPUs or groups that need to switch in */
+				/*  order for current grace period to proceed.*/
+	unsigned long qsmaskinit;
+				/* Per-GP initialization for qsmask. */
+	unsigned long grpmask;	/* Mask to apply to parent qsmask. */
+	int	grplo;		/* lowest-numbered CPU or group here. */
+	int	grphi;		/* highest-numbered CPU or group here. */
+	u8	grpnum;		/* CPU/group number for next level up. */
+	u8	level;		/* root is at level 0. */
+	struct rcu_node *parent;
+} ____cacheline_internodealigned_in_smp;
+
+/* Index values for nxttail array in struct rcu_data. */
+#define RCU_DONE_TAIL		0	/* Also RCU_WAIT head. */
+#define RCU_WAIT_TAIL		1	/* Also RCU_NEXT_READY head. */
+#define RCU_NEXT_READY_TAIL	2	/* Also RCU_NEXT head. */
+#define RCU_NEXT_TAIL		3
+#define RCU_NEXT_SIZE		4
+
+/* Per-CPU data for read-copy update. */
+struct rcu_data {
+	/* 1) quiescent-state and grace-period handling : */
+	long		completed;	/* Track rsp->completed gp number */
+					/*  in order to detect GP end. */
+	long		gpnum;		/* Highest gp number that this CPU */
+					/*  is aware of having started. */
+	long		passed_quiesc_completed;
+					/* Value of completed at time of qs. */
+	bool		passed_quiesc;	/* User-mode/idle loop etc. */
+	bool		qs_pending;	/* Core waits for quiesc state. */
+	bool		beenonline;	/* CPU online at least once. */
+	struct rcu_node *mynode;	/* This CPU's leaf of hierarchy */
+	unsigned long grpmask;		/* Mask to apply to leaf qsmask. */
+
+	/* 2) batch handling */
+	/*
+	 * If nxtlist is not NULL, it is partitioned as follows.
+	 * Any of the partitions might be empty, in which case the
+	 * pointer to that partition will be equal to the pointer for
+	 * the following partition.  When the list is empty, all of
+	 * the nxttail elements point to nxtlist, which is NULL.
+	 *
+	 * [*nxttail[RCU_NEXT_READY_TAIL], NULL = *nxttail[RCU_NEXT_TAIL]):
+	 *	Entries that might have arrived after current GP ended
+	 * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
+	 *	Entries known to have arrived before current GP ended
+	 * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
+	 *	Entries that batch # <= ->completed - 1: waiting for current GP
+	 * [nxtlist, *nxttail[RCU_DONE_TAIL]):
+	 *	Entries that batch # <= ->completed
+	 *	The grace period for these entries has completed, and
+	 *	the other grace-period-completed entries may be moved
+	 *	here temporarily in rcu_process_callbacks().
+	 */
+	struct rcu_head *nxtlist;
+	struct rcu_head **nxttail[RCU_NEXT_SIZE];
+	long		qlen; 	 	/* # of queued callbacks */
+	long		blimit;		/* Upper limit on a processed batch */
+
+#ifdef CONFIG_NO_HZ
+	/* 3) dynticks interface. */
+	struct rcu_dynticks *dynticks;	/* Shared per-CPU dynticks state. */
+	int dynticks_snap;		/* Per-GP tracking for dynticks. */
+	int dynticks_nmi_snap;		/* Per-GP tracking for dynticks_nmi. */
+#endif /* #ifdef CONFIG_NO_HZ */
+
+	/* 4) reasons this CPU needed to be kicked by force_quiescent_state */
+#ifdef CONFIG_NO_HZ
+	unsigned long dynticks_fqs;	/* Kicked due to dynticks idle. */
+#endif /* #ifdef CONFIG_NO_HZ */
+	unsigned long offline_fqs;	/* Kicked due to being offline. */
+	unsigned long resched_ipi;	/* Sent a resched IPI. */
+
+	/* 5) __rcu_pending() statistics. */
+	long n_rcu_pending;		/* rcu_pending() calls since boot. */
+	long n_rp_qs_pending;
+	long n_rp_cb_ready;
+	long n_rp_cpu_needs_gp;
+	long n_rp_gp_completed;
+	long n_rp_gp_started;
+	long n_rp_need_fqs;
+	long n_rp_need_nothing;
+
+	int cpu;
+};
+
+/* Values for signaled field in struct rcu_state. */
+#define RCU_GP_INIT		0	/* Grace period being initialized. */
+#define RCU_SAVE_DYNTICK	1	/* Need to scan dyntick state. */
+#define RCU_FORCE_QS		2	/* Need to force quiescent state. */
+#ifdef CONFIG_NO_HZ
+#define RCU_SIGNAL_INIT		RCU_SAVE_DYNTICK
+#else /* #ifdef CONFIG_NO_HZ */
+#define RCU_SIGNAL_INIT		RCU_FORCE_QS
+#endif /* #else #ifdef CONFIG_NO_HZ */
+
+#define RCU_JIFFIES_TILL_FORCE_QS	 3	/* for rsp->jiffies_force_qs */
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+#define RCU_SECONDS_TILL_STALL_CHECK   (10 * HZ)  /* for rsp->jiffies_stall */
+#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ)  /* for rsp->jiffies_stall */
+#define RCU_STALL_RAT_DELAY		2	  /* Allow other CPUs time */
+						  /*  to take at least one */
+						  /*  scheduling clock irq */
+						  /*  before ratting on them. */
+
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+
+/*
+ * RCU global state, including node hierarchy.  This hierarchy is
+ * represented in "heap" form in a dense array.  The root (first level)
+ * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second
+ * level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]),
+ * and the third level in ->node[m+1] and following (->node[m+1] referenced
+ * by ->level[2]).  The number of levels is determined by the number of
+ * CPUs and by CONFIG_RCU_FANOUT.  Small systems will have a "hierarchy"
+ * consisting of a single rcu_node.
+ */
+struct rcu_state {
+	struct rcu_node node[NUM_RCU_NODES];	/* Hierarchy. */
+	struct rcu_node *level[NUM_RCU_LVLS];	/* Hierarchy levels. */
+	u32 levelcnt[MAX_RCU_LVLS + 1];		/* # nodes in each level. */
+	u8 levelspread[NUM_RCU_LVLS];		/* kids/node in each level. */
+	struct rcu_data *rda[NR_CPUS];		/* array of rdp pointers. */
+
+	/* The following fields are guarded by the root rcu_node's lock. */
+
+	u8	signaled ____cacheline_internodealigned_in_smp;
+						/* Force QS state. */
+	long	gpnum;				/* Current gp number. */
+	long	completed;			/* # of last completed gp. */
+	spinlock_t onofflock;			/* exclude on/offline and */
+						/*  starting new GP. */
+	spinlock_t fqslock;			/* Only one task forcing */
+						/*  quiescent states. */
+	unsigned long jiffies_force_qs;		/* Time at which to invoke */
+						/*  force_quiescent_state(). */
+	unsigned long n_force_qs;		/* Number of calls to */
+						/*  force_quiescent_state(). */
+	unsigned long n_force_qs_lh;		/* ~Number of calls leaving */
+						/*  due to lock unavailable. */
+	unsigned long n_force_qs_ngp;		/* Number of calls leaving */
+						/*  due to no GP active. */
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+	unsigned long gp_start;			/* Time at which GP started, */
+						/*  but in jiffies. */
+	unsigned long jiffies_stall;		/* Time at which to check */
+						/*  for CPU stalls. */
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+#ifdef CONFIG_NO_HZ
+	long dynticks_completed;		/* Value of completed @ snap. */
+#endif /* #ifdef CONFIG_NO_HZ */
+};
+
+#ifdef RCU_TREE_NONCORE
 
 /*
  * RCU implementation internal declarations:
@@ -8,3 +244,5 @@ DECLARE_PER_CPU(struct rcu_data, rcu_data);
 extern struct rcu_state rcu_bh_state;
 DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
 
+#endif /* #ifdef RCU_TREE_NONCORE */
+
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index fe1dcdbf1ca..0cb52b88775 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -43,6 +43,7 @@
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
 
+#define RCU_TREE_NONCORE
 #include "rcutree.h"
 
 static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
-- 
cgit v1.2.3


From d6714c22b43fbcbead7e7b706ff270e15f04a791 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sat, 22 Aug 2009 13:56:46 -0700
Subject: rcu: Renamings to increase RCU clarity

Make RCU-sched, RCU-bh, and RCU-preempt be underlying
implementations, with "RCU" defined in terms of one of the
three.  Update the outdated rcu_qsctr_inc() names, as these
functions no longer increment anything.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: josht@linux.vnet.ibm.com
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
LKML-Reference: <12509746132696-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcupreempt.c    |  8 ++---
 kernel/rcutree.c       | 80 +++++++++++++++++++++++++++++++-------------------
 kernel/rcutree.h       |  4 +--
 kernel/rcutree_trace.c | 20 ++++++-------
 kernel/sched.c         |  2 +-
 kernel/softirq.c       |  4 +--
 6 files changed, 69 insertions(+), 49 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 510898a7bd6..7d777c9f394 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -159,7 +159,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched
 	.dynticks = 1,
 };
 
-void rcu_qsctr_inc(int cpu)
+void rcu_sched_qs(int cpu)
 {
 	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
 
@@ -967,12 +967,12 @@ void rcu_check_callbacks(int cpu, int user)
 	 * If this CPU took its interrupt from user mode or from the
 	 * idle loop, and this is not a nested interrupt, then
 	 * this CPU has to have exited all prior preept-disable
-	 * sections of code.  So increment the counter to note this.
+	 * sections of code.  So invoke rcu_sched_qs() to note this.
 	 *
 	 * The memory barrier is needed to handle the case where
 	 * writes from a preempt-disable section of code get reordered
 	 * into schedule() by this CPU's write buffer.  So the memory
-	 * barrier makes sure that the rcu_qsctr_inc() is seen by other
+	 * barrier makes sure that the rcu_sched_qs() is seen by other
 	 * CPUs to happen after any such write.
 	 */
 
@@ -980,7 +980,7 @@ void rcu_check_callbacks(int cpu, int user)
 	    (idle_cpu(cpu) && !in_softirq() &&
 	     hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
 		smp_mb();	/* Guard against aggressive schedule(). */
-	     	rcu_qsctr_inc(cpu);
+		rcu_sched_qs(cpu);
 	}
 
 	rcu_check_mb(cpu);
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index a162f859dd3..4d71d4e8b5a 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -74,26 +74,25 @@ EXPORT_SYMBOL_GPL(rcu_lock_map);
 	.n_force_qs_ngp = 0, \
 }
 
-struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state);
-DEFINE_PER_CPU(struct rcu_data, rcu_data);
+struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state);
+DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
 
 struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
 DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
 
 /*
- * Increment the quiescent state counter.
- * The counter is a bit degenerated: We do not need to know
+ * Note a quiescent state.  Because we do not need to know
  * how many quiescent states passed, just if there was at least
- * one since the start of the grace period. Thus just a flag.
+ * one since the start of the grace period, this just sets a flag.
  */
-void rcu_qsctr_inc(int cpu)
+void rcu_sched_qs(int cpu)
 {
-	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
+	struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);
 	rdp->passed_quiesc = 1;
 	rdp->passed_quiesc_completed = rdp->completed;
 }
 
-void rcu_bh_qsctr_inc(int cpu)
+void rcu_bh_qs(int cpu)
 {
 	struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
 	rdp->passed_quiesc = 1;
@@ -113,12 +112,22 @@ static int qlowmark = 100;	/* Once only this many pending, use blimit. */
 
 static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
 
+/*
+ * Return the number of RCU-sched batches processed thus far for debug & stats.
+ */
+long rcu_batches_completed_sched(void)
+{
+	return rcu_sched_state.completed;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
+
 /*
  * Return the number of RCU batches processed thus far for debug & stats.
+ * @@@ placeholder, maps to rcu_batches_completed_sched().
  */
 long rcu_batches_completed(void)
 {
-	return rcu_state.completed;
+	return rcu_batches_completed_sched();
 }
 EXPORT_SYMBOL_GPL(rcu_batches_completed);
 
@@ -310,7 +319,7 @@ void rcu_irq_exit(void)
 	WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs);
 
 	/* If the interrupt queued a callback, get out of dyntick mode. */
-	if (__get_cpu_var(rcu_data).nxtlist ||
+	if (__get_cpu_var(rcu_sched_data).nxtlist ||
 	    __get_cpu_var(rcu_bh_data).nxtlist)
 		set_need_resched();
 }
@@ -847,7 +856,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
 	/*
 	 * Move callbacks from the outgoing CPU to the running CPU.
 	 * Note that the outgoing CPU is now quiscent, so it is now
-	 * (uncharacteristically) safe to access it rcu_data structure.
+	 * (uncharacteristically) safe to access its rcu_data structure.
 	 * Note also that we must carefully retain the order of the
 	 * outgoing CPU's callbacks in order for rcu_barrier() to work
 	 * correctly.  Finally, note that we start all the callbacks
@@ -878,7 +887,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
  */
 static void rcu_offline_cpu(int cpu)
 {
-	__rcu_offline_cpu(cpu, &rcu_state);
+	__rcu_offline_cpu(cpu, &rcu_sched_state);
 	__rcu_offline_cpu(cpu, &rcu_bh_state);
 }
 
@@ -973,17 +982,16 @@ void rcu_check_callbacks(int cpu, int user)
 		 * Get here if this CPU took its interrupt from user
 		 * mode or from the idle loop, and if this is not a
 		 * nested interrupt.  In this case, the CPU is in
-		 * a quiescent state, so count it.
+		 * a quiescent state, so note it.
 		 *
 		 * No memory barrier is required here because both
-		 * rcu_qsctr_inc() and rcu_bh_qsctr_inc() reference
-		 * only CPU-local variables that other CPUs neither
-		 * access nor modify, at least not while the corresponding
-		 * CPU is online.
+		 * rcu_sched_qs() and rcu_bh_qs() reference only CPU-local
+		 * variables that other CPUs neither access nor modify,
+		 * at least not while the corresponding CPU is online.
 		 */
 
-		rcu_qsctr_inc(cpu);
-		rcu_bh_qsctr_inc(cpu);
+		rcu_sched_qs(cpu);
+		rcu_bh_qs(cpu);
 
 	} else if (!in_softirq()) {
 
@@ -991,10 +999,10 @@ void rcu_check_callbacks(int cpu, int user)
 		 * Get here if this CPU did not take its interrupt from
 		 * softirq, in other words, if it is not interrupting
 		 * a rcu_bh read-side critical section.  This is an _bh
-		 * critical section, so count it.
+		 * critical section, so note it.
 		 */
 
-		rcu_bh_qsctr_inc(cpu);
+		rcu_bh_qs(cpu);
 	}
 	raise_softirq(RCU_SOFTIRQ);
 }
@@ -1174,7 +1182,8 @@ static void rcu_process_callbacks(struct softirq_action *unused)
 	 */
 	smp_mb(); /* See above block comment. */
 
-	__rcu_process_callbacks(&rcu_state, &__get_cpu_var(rcu_data));
+	__rcu_process_callbacks(&rcu_sched_state,
+				&__get_cpu_var(rcu_sched_data));
 	__rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
 
 	/*
@@ -1231,14 +1240,25 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
 }
 
 /*
- * Queue an RCU callback for invocation after a grace period.
+ * Queue an RCU-sched callback for invocation after a grace period.
+ */
+void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+	__call_rcu(head, func, &rcu_sched_state);
+}
+EXPORT_SYMBOL_GPL(call_rcu_sched);
+
+/*
+ * @@@ Queue an RCU callback for invocation after a grace period.
+ * @@@ Placeholder pending rcutree_plugin.h.
  */
 void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 {
-	__call_rcu(head, func, &rcu_state);
+	call_rcu_sched(head, func);
 }
 EXPORT_SYMBOL_GPL(call_rcu);
 
+
 /*
  * Queue an RCU for invocation after a quicker grace period.
  */
@@ -1311,7 +1331,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
  */
 int rcu_pending(int cpu)
 {
-	return __rcu_pending(&rcu_state, &per_cpu(rcu_data, cpu)) ||
+	return __rcu_pending(&rcu_sched_state, &per_cpu(rcu_sched_data, cpu)) ||
 	       __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu));
 }
 
@@ -1324,7 +1344,7 @@ int rcu_pending(int cpu)
 int rcu_needs_cpu(int cpu)
 {
 	/* RCU callbacks either ready or pending? */
-	return per_cpu(rcu_data, cpu).nxtlist ||
+	return per_cpu(rcu_sched_data, cpu).nxtlist ||
 	       per_cpu(rcu_bh_data, cpu).nxtlist;
 }
 
@@ -1418,7 +1438,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
 
 static void __cpuinit rcu_online_cpu(int cpu)
 {
-	rcu_init_percpu_data(cpu, &rcu_state);
+	rcu_init_percpu_data(cpu, &rcu_sched_state);
 	rcu_init_percpu_data(cpu, &rcu_bh_state);
 }
 
@@ -1545,10 +1565,10 @@ void __init __rcu_init(void)
 #ifdef CONFIG_RCU_CPU_STALL_DETECTOR
 	printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
 #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
-	rcu_init_one(&rcu_state);
-	RCU_DATA_PTR_INIT(&rcu_state, rcu_data);
+	rcu_init_one(&rcu_sched_state);
+	RCU_DATA_PTR_INIT(&rcu_sched_state, rcu_sched_data);
 	for_each_possible_cpu(i)
-		rcu_boot_init_percpu_data(i, &rcu_state);
+		rcu_boot_init_percpu_data(i, &rcu_sched_state);
 	rcu_init_one(&rcu_bh_state);
 	RCU_DATA_PTR_INIT(&rcu_bh_state, rcu_bh_data);
 	for_each_possible_cpu(i)
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 7cc830a1c44..0024e5ddcc6 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -238,8 +238,8 @@ struct rcu_state {
 /*
  * RCU implementation internal declarations:
  */
-extern struct rcu_state rcu_state;
-DECLARE_PER_CPU(struct rcu_data, rcu_data);
+extern struct rcu_state rcu_sched_state;
+DECLARE_PER_CPU(struct rcu_data, rcu_sched_data);
 
 extern struct rcu_state rcu_bh_state;
 DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 0cb52b88775..236c0504fee 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -77,8 +77,8 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
 
 static int show_rcudata(struct seq_file *m, void *unused)
 {
-	seq_puts(m, "rcu:\n");
-	PRINT_RCU_DATA(rcu_data, print_one_rcu_data, m);
+	seq_puts(m, "rcu_sched:\n");
+	PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data, m);
 	seq_puts(m, "rcu_bh:\n");
 	PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m);
 	return 0;
@@ -125,8 +125,8 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
 	seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\",");
 #endif /* #ifdef CONFIG_NO_HZ */
 	seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n");
-	seq_puts(m, "\"rcu:\"\n");
-	PRINT_RCU_DATA(rcu_data, print_one_rcu_data_csv, m);
+	seq_puts(m, "\"rcu_sched:\"\n");
+	PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data_csv, m);
 	seq_puts(m, "\"rcu_bh:\"\n");
 	PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m);
 	return 0;
@@ -172,8 +172,8 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
 
 static int show_rcuhier(struct seq_file *m, void *unused)
 {
-	seq_puts(m, "rcu:\n");
-	print_one_rcu_state(m, &rcu_state);
+	seq_puts(m, "rcu_sched:\n");
+	print_one_rcu_state(m, &rcu_sched_state);
 	seq_puts(m, "rcu_bh:\n");
 	print_one_rcu_state(m, &rcu_bh_state);
 	return 0;
@@ -194,8 +194,8 @@ static struct file_operations rcuhier_fops = {
 
 static int show_rcugp(struct seq_file *m, void *unused)
 {
-	seq_printf(m, "rcu: completed=%ld  gpnum=%ld\n",
-		   rcu_state.completed, rcu_state.gpnum);
+	seq_printf(m, "rcu_sched: completed=%ld  gpnum=%ld\n",
+		   rcu_sched_state.completed, rcu_sched_state.gpnum);
 	seq_printf(m, "rcu_bh: completed=%ld  gpnum=%ld\n",
 		   rcu_bh_state.completed, rcu_bh_state.gpnum);
 	return 0;
@@ -244,8 +244,8 @@ static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp)
 
 static int show_rcu_pending(struct seq_file *m, void *unused)
 {
-	seq_puts(m, "rcu:\n");
-	print_rcu_pendings(m, &rcu_state);
+	seq_puts(m, "rcu_sched:\n");
+	print_rcu_pendings(m, &rcu_sched_state);
 	seq_puts(m, "rcu_bh:\n");
 	print_rcu_pendings(m, &rcu_bh_state);
 	return 0;
diff --git a/kernel/sched.c b/kernel/sched.c
index cda8b81f880..c9beca67a53 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5325,7 +5325,7 @@ need_resched:
 	preempt_disable();
 	cpu = smp_processor_id();
 	rq = cpu_rq(cpu);
-	rcu_qsctr_inc(cpu);
+	rcu_sched_qs(cpu);
 	prev = rq->curr;
 	switch_count = &prev->nivcsw;
 
diff --git a/kernel/softirq.c b/kernel/softirq.c
index eb5e131a048..7db25067cd2 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -227,7 +227,7 @@ restart:
 				preempt_count() = prev_count;
 			}
 
-			rcu_bh_qsctr_inc(cpu);
+			rcu_bh_qs(cpu);
 		}
 		h++;
 		pending >>= 1;
@@ -721,7 +721,7 @@ static int ksoftirqd(void * __bind_cpu)
 			preempt_enable_no_resched();
 			cond_resched();
 			preempt_disable();
-			rcu_qsctr_inc((long)__bind_cpu);
+			rcu_sched_qs((long)__bind_cpu);
 		}
 		preempt_enable();
 		set_current_state(TASK_INTERRUPTIBLE);
-- 
cgit v1.2.3


From 5699ed8fcb0c32ca699e2a27ad716eb70b367dbf Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sat, 22 Aug 2009 13:56:48 -0700
Subject: rcu: Fix online/offline indication for rcudata.csv trace file

The heading said "Online?", but the column had "Y" for offline
CPUs and "N" for online CPUs.  Swap the "Y" and "N" to match
the heading.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: josht@linux.vnet.ibm.com
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
LKML-Reference: <12509746132841-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutree_trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 236c0504fee..ea4a4747037 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -103,7 +103,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
 		return;
 	seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d",
 		   rdp->cpu,
-		   cpu_is_offline(rdp->cpu) ? "\"Y\"" : "\"N\"",
+		   cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"",
 		   rdp->completed, rdp->gpnum,
 		   rdp->passed_quiesc, rdp->passed_quiesc_completed,
 		   rdp->qs_pending);
-- 
cgit v1.2.3


From 65cf8f866fc0fb40fa9daaded7e938a886d6f7c5 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sat, 22 Aug 2009 13:56:49 -0700
Subject: rcu: Merge per-RCU-flavor initialization into pre-existing macro

Rename the RCU_DATA_PTR_INIT() macro to RCU_INIT_FLAVOR() and
make it do the rcu_init_one() and rcu_boot_init_percpu_data()
calls.  Merge the loop that was in the original macro with the
loops that were in __rcu_init().

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: josht@linux.vnet.ibm.com
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
LKML-Reference: <12509746133916-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutree.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 4d71d4e8b5a..7c515082ae8 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1543,8 +1543,9 @@ static void __init rcu_init_one(struct rcu_state *rsp)
  * Helper macro for __rcu_init().  To be used nowhere else!
  * Assigns leaf node pointers into each CPU's rcu_data structure.
  */
-#define RCU_DATA_PTR_INIT(rsp, rcu_data) \
+#define RCU_INIT_FLAVOR(rsp, rcu_data) \
 do { \
+	rcu_init_one(rsp); \
 	rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
 	j = 0; \
 	for_each_possible_cpu(i) { \
@@ -1552,6 +1553,7 @@ do { \
 			j++; \
 		per_cpu(rcu_data, i).mynode = &rnp[j]; \
 		(rsp)->rda[i] = &per_cpu(rcu_data, i); \
+		rcu_boot_init_percpu_data(i, rsp); \
 	} \
 } while (0)
 
@@ -1565,14 +1567,8 @@ void __init __rcu_init(void)
 #ifdef CONFIG_RCU_CPU_STALL_DETECTOR
 	printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
 #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
-	rcu_init_one(&rcu_sched_state);
-	RCU_DATA_PTR_INIT(&rcu_sched_state, rcu_sched_data);
-	for_each_possible_cpu(i)
-		rcu_boot_init_percpu_data(i, &rcu_sched_state);
-	rcu_init_one(&rcu_bh_state);
-	RCU_DATA_PTR_INIT(&rcu_bh_state, rcu_bh_data);
-	for_each_possible_cpu(i)
-		rcu_boot_init_percpu_data(i, &rcu_bh_state);
+	RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data);
+	RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data);
 	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
 }
 
-- 
cgit v1.2.3


From 22f00b69f6a7e1e18e821979a23e8307c2de9888 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sat, 22 Aug 2009 13:56:50 -0700
Subject: rcu: Use debugfs_remove_recursive() simplify code.

Suggested by Josh Triplett.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: josht@linux.vnet.ibm.com
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
LKML-Reference: <12509746132173-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutree_trace.c | 45 +++++++++++++++------------------------------
 1 file changed, 15 insertions(+), 30 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index ea4a4747037..31af3a0fb6d 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -265,62 +265,47 @@ static struct file_operations rcu_pending_fops = {
 };
 
 static struct dentry *rcudir;
-static struct dentry *datadir;
-static struct dentry *datadir_csv;
-static struct dentry *gpdir;
-static struct dentry *hierdir;
-static struct dentry *rcu_pendingdir;
 
 static int __init rcuclassic_trace_init(void)
 {
+	struct dentry *retval;
+
 	rcudir = debugfs_create_dir("rcu", NULL);
 	if (!rcudir)
-		goto out;
+		goto free_out;
 
-	datadir = debugfs_create_file("rcudata", 0444, rcudir,
+	retval = debugfs_create_file("rcudata", 0444, rcudir,
 						NULL, &rcudata_fops);
-	if (!datadir)
+	if (!retval)
 		goto free_out;
 
-	datadir_csv = debugfs_create_file("rcudata.csv", 0444, rcudir,
+	retval = debugfs_create_file("rcudata.csv", 0444, rcudir,
 						NULL, &rcudata_csv_fops);
-	if (!datadir_csv)
+	if (!retval)
 		goto free_out;
 
-	gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
-	if (!gpdir)
+	retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
+	if (!retval)
 		goto free_out;
 
-	hierdir = debugfs_create_file("rcuhier", 0444, rcudir,
+	retval = debugfs_create_file("rcuhier", 0444, rcudir,
 						NULL, &rcuhier_fops);
-	if (!hierdir)
+	if (!retval)
 		goto free_out;
 
-	rcu_pendingdir = debugfs_create_file("rcu_pending", 0444, rcudir,
+	retval = debugfs_create_file("rcu_pending", 0444, rcudir,
 						NULL, &rcu_pending_fops);
-	if (!rcu_pendingdir)
+	if (!retval)
 		goto free_out;
 	return 0;
 free_out:
-	if (datadir)
-		debugfs_remove(datadir);
-	if (datadir_csv)
-		debugfs_remove(datadir_csv);
-	if (gpdir)
-		debugfs_remove(gpdir);
-	debugfs_remove(rcudir);
-out:
+	debugfs_remove_recursive(rcudir);
 	return 1;
 }
 
 static void __exit rcuclassic_trace_cleanup(void)
 {
-	debugfs_remove(datadir);
-	debugfs_remove(datadir_csv);
-	debugfs_remove(gpdir);
-	debugfs_remove(hierdir);
-	debugfs_remove(rcu_pendingdir);
-	debugfs_remove(rcudir);
+	debugfs_remove_recursive(rcudir);
 }
 
 
-- 
cgit v1.2.3


From a157229cabd6dd8cfa82525fc9bf730c94cc9ac2 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sat, 22 Aug 2009 13:56:51 -0700
Subject: rcu: Simplify rcu_pending()/rcu_check_callbacks() API

All calls from outside RCU are of the form:

	if (rcu_pending(cpu))
		rcu_check_callbacks(cpu, user);

This is silly, instead we put a call to rcu_pending() in
rcu_check_callbacks(), and then make the outside calls be to
rcu_check_callbacks().  This cuts down on the code a bit and
also gives the compiler a better chance of optimizing.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: josht@linux.vnet.ibm.com
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
LKML-Reference: <125097461311-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcupreempt.c | 10 ++++++++--
 kernel/rcutree.c    |  5 ++++-
 kernel/timer.c      |  3 +--
 3 files changed, 13 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 7d777c9f394..0053ce56e32 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -159,6 +159,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched
 	.dynticks = 1,
 };
 
+static int rcu_pending(int cpu);
+
 void rcu_sched_qs(int cpu)
 {
 	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
@@ -961,7 +963,10 @@ static void rcu_check_mb(int cpu)
 void rcu_check_callbacks(int cpu, int user)
 {
 	unsigned long flags;
-	struct rcu_data *rdp = RCU_DATA_CPU(cpu);
+	struct rcu_data *rdp;
+
+	if (!rcu_pending(cpu))
+		return; /* if nothing for RCU to do. */
 
 	/*
 	 * If this CPU took its interrupt from user mode or from the
@@ -976,6 +981,7 @@ void rcu_check_callbacks(int cpu, int user)
 	 * CPUs to happen after any such write.
 	 */
 
+	rdp = RCU_DATA_CPU(cpu);
 	if (user ||
 	    (idle_cpu(cpu) && !in_softirq() &&
 	     hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
@@ -1382,7 +1388,7 @@ int rcu_needs_cpu(int cpu)
 		rdp->waitschedlist != NULL);
 }
 
-int rcu_pending(int cpu)
+static int rcu_pending(int cpu)
 {
 	struct rcu_data *rdp = RCU_DATA_CPU(cpu);
 
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 7c515082ae8..4ce3adcfa94 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -111,6 +111,7 @@ static int qhimark = 10000;	/* If this many pending, ignore blimit. */
 static int qlowmark = 100;	/* Once only this many pending, use blimit. */
 
 static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
+static int rcu_pending(int cpu);
 
 /*
  * Return the number of RCU-sched batches processed thus far for debug & stats.
@@ -974,6 +975,8 @@ static void rcu_do_batch(struct rcu_data *rdp)
  */
 void rcu_check_callbacks(int cpu, int user)
 {
+	if (!rcu_pending(cpu))
+		return; /* if nothing for RCU to do. */
 	if (user ||
 	    (idle_cpu(cpu) && rcu_scheduler_active &&
 	     !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
@@ -1329,7 +1332,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
  * by the current CPU, returning 1 if so.  This function is part of the
  * RCU implementation; it is -not- an exported member of the RCU API.
  */
-int rcu_pending(int cpu)
+static int rcu_pending(int cpu)
 {
 	return __rcu_pending(&rcu_sched_state, &per_cpu(rcu_sched_data, cpu)) ||
 	       __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu));
diff --git a/kernel/timer.c b/kernel/timer.c
index a7f07d5a624..a3d25f41501 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1156,8 +1156,7 @@ void update_process_times(int user_tick)
 	/* Note: this timer irq context must be accounted for as well. */
 	account_process_tick(p, user_tick);
 	run_local_timers();
-	if (rcu_pending(cpu))
-		rcu_check_callbacks(cpu, user_tick);
+	rcu_check_callbacks(cpu, user_tick);
 	printk_tick();
 	scheduler_tick();
 	run_posix_cpu_timers(p);
-- 
cgit v1.2.3


From f41d911f8c49a5d65c86504c19e8204bb605c4fd Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sat, 22 Aug 2009 13:56:52 -0700
Subject: rcu: Merge preemptable-RCU functionality into hierarchical RCU

Create a kernel/rcutree_plugin.h file that contains definitions
for preemptable RCU (or, under the #else branch of the #ifdef,
empty definitions for the classic non-preemptable semantics).
These definitions fit into plugins defined in kernel/rcutree.c
for this purpose.

This variant of preemptable RCU uses a new algorithm whose
read-side expense is roughly that of classic hierarchical RCU
under CONFIG_PREEMPT. This new algorithm's update-side expense
is similar to that of classic hierarchical RCU, and, in absence
of read-side preemption or blocking, is exactly that of classic
hierarchical RCU.  Perhaps more important, this new algorithm
has a much simpler implementation, saving well over 1,000 lines
of code compared to mainline's implementation of preemptable
RCU, which will hopefully be retired in favor of this new
algorithm.

The simplifications are obtained by maintaining per-task
nesting state for running tasks, and using a simple
lock-protected algorithm to handle accounting when tasks block
within RCU read-side critical sections, making use of lessons
learned while creating numerous user-level RCU implementations
over the past 18 months.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: josht@linux.vnet.ibm.com
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
LKML-Reference: <12509746134003-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/Makefile         |   1 +
 kernel/exit.c           |   1 +
 kernel/fork.c           |   5 +-
 kernel/rcutree.c        | 135 ++++++++++-----
 kernel/rcutree.h        |   9 +
 kernel/rcutree_plugin.h | 447 ++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/rcutree_trace.c  |  20 +++
 7 files changed, 571 insertions(+), 47 deletions(-)
 create mode 100644 kernel/rcutree_plugin.h

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index 2419c9d4391..1a38b4789dd 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -81,6 +81,7 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_TREE_RCU) += rcutree.o
+obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
 obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o
 obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
 obj-$(CONFIG_PREEMPT_RCU_TRACE) += rcupreempt_trace.o
diff --git a/kernel/exit.c b/kernel/exit.c
index 869dc221733..263f95ed720 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1010,6 +1010,7 @@ NORET_TYPE void do_exit(long code)
 		__free_pipe_info(tsk->splice_pipe);
 
 	preempt_disable();
+	exit_rcu();
 	/* causes final put_task_struct in finish_task_switch(). */
 	tsk->state = TASK_DEAD;
 	schedule();
diff --git a/kernel/fork.c b/kernel/fork.c
index 021e1138556..642e8b5edf0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1022,10 +1022,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	copy_flags(clone_flags, p);
 	INIT_LIST_HEAD(&p->children);
 	INIT_LIST_HEAD(&p->sibling);
-#ifdef CONFIG_PREEMPT_RCU
-	p->rcu_read_lock_nesting = 0;
-	p->rcu_flipctr_idx = 0;
-#endif /* #ifdef CONFIG_PREEMPT_RCU */
+	rcu_copy_process(p);
 	p->vfork_done = NULL;
 	spin_lock_init(&p->alloc_lock);
 
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 4ce3adcfa94..cc025571407 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -80,6 +80,21 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
 struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
 DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
 
+extern long rcu_batches_completed_sched(void);
+static void cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp,
+			  struct rcu_node *rnp, unsigned long flags);
+static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags);
+static void __rcu_process_callbacks(struct rcu_state *rsp,
+				    struct rcu_data *rdp);
+static void __call_rcu(struct rcu_head *head,
+		       void (*func)(struct rcu_head *rcu),
+		       struct rcu_state *rsp);
+static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp);
+static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_state *rsp,
+					   int preemptable);
+
+#include "rcutree_plugin.h"
+
 /*
  * Note a quiescent state.  Because we do not need to know
  * how many quiescent states passed, just if there was at least
@@ -87,16 +102,27 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
  */
 void rcu_sched_qs(int cpu)
 {
-	struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);
+	unsigned long flags;
+	struct rcu_data *rdp;
+
+	local_irq_save(flags);
+	rdp = &per_cpu(rcu_sched_data, cpu);
 	rdp->passed_quiesc = 1;
 	rdp->passed_quiesc_completed = rdp->completed;
+	rcu_preempt_qs(cpu);
+	local_irq_restore(flags);
 }
 
 void rcu_bh_qs(int cpu)
 {
-	struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
+	unsigned long flags;
+	struct rcu_data *rdp;
+
+	local_irq_save(flags);
+	rdp = &per_cpu(rcu_bh_data, cpu);
 	rdp->passed_quiesc = 1;
 	rdp->passed_quiesc_completed = rdp->completed;
+	local_irq_restore(flags);
 }
 
 #ifdef CONFIG_NO_HZ
@@ -122,16 +148,6 @@ long rcu_batches_completed_sched(void)
 }
 EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
 
-/*
- * Return the number of RCU batches processed thus far for debug & stats.
- * @@@ placeholder, maps to rcu_batches_completed_sched().
- */
-long rcu_batches_completed(void)
-{
-	return rcu_batches_completed_sched();
-}
-EXPORT_SYMBOL_GPL(rcu_batches_completed);
-
 /*
  * Return the number of RCU BH batches processed thus far for debug & stats.
  */
@@ -193,6 +209,10 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
 		return 1;
 	}
 
+	/* If preemptable RCU, no point in sending reschedule IPI. */
+	if (rdp->preemptable)
+		return 0;
+
 	/* The CPU is online, so send it a reschedule IPI. */
 	if (rdp->cpu != smp_processor_id())
 		smp_send_reschedule(rdp->cpu);
@@ -473,6 +493,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
 
 	printk(KERN_ERR "INFO: RCU detected CPU stalls:");
 	for (; rnp_cur < rnp_end; rnp_cur++) {
+		rcu_print_task_stall(rnp);
 		if (rnp_cur->qsmask == 0)
 			continue;
 		for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++)
@@ -685,6 +706,19 @@ rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
 	local_irq_restore(flags);
 }
 
+/*
+ * Clean up after the prior grace period and let rcu_start_gp() start up
+ * the next grace period if one is needed.  Note that the caller must
+ * hold rnp->lock, as required by rcu_start_gp(), which will release it.
+ */
+static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags)
+	__releases(rnp->lock)
+{
+	rsp->completed = rsp->gpnum;
+	rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
+	rcu_start_gp(rsp, flags);  /* releases root node's rnp->lock. */
+}
+
 /*
  * Similar to cpu_quiet(), for which it is a helper function.  Allows
  * a group of CPUs to be quieted at one go, though all the CPUs in the
@@ -706,7 +740,7 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
 			return;
 		}
 		rnp->qsmask &= ~mask;
-		if (rnp->qsmask != 0) {
+		if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
 
 			/* Other bits still set at this level, so done. */
 			spin_unlock_irqrestore(&rnp->lock, flags);
@@ -726,14 +760,10 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
 
 	/*
 	 * Get here if we are the last CPU to pass through a quiescent
-	 * state for this grace period.  Clean up and let rcu_start_gp()
-	 * start up the next grace period if one is needed.  Note that
-	 * we still hold rnp->lock, as required by rcu_start_gp(), which
-	 * will release it.
+	 * state for this grace period.  Invoke cpu_quiet_msk_finish()
+	 * to clean up and start the next grace period if one is needed.
 	 */
-	rsp->completed = rsp->gpnum;
-	rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
-	rcu_start_gp(rsp, flags);  /* releases rnp->lock. */
+	cpu_quiet_msk_finish(rsp, flags); /* releases rnp->lock. */
 }
 
 /*
@@ -840,11 +870,11 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
 		spin_lock(&rnp->lock);		/* irqs already disabled. */
 		rnp->qsmaskinit &= ~mask;
 		if (rnp->qsmaskinit != 0) {
-			spin_unlock(&rnp->lock); /* irqs already disabled. */
+			spin_unlock(&rnp->lock); /* irqs remain disabled. */
 			break;
 		}
 		mask = rnp->grpmask;
-		spin_unlock(&rnp->lock);	/* irqs already disabled. */
+		spin_unlock(&rnp->lock);	/* irqs remain disabled. */
 		rnp = rnp->parent;
 	} while (rnp != NULL);
 	lastcomp = rsp->completed;
@@ -1007,6 +1037,7 @@ void rcu_check_callbacks(int cpu, int user)
 
 		rcu_bh_qs(cpu);
 	}
+	rcu_preempt_check_callbacks(cpu);
 	raise_softirq(RCU_SOFTIRQ);
 }
 
@@ -1188,6 +1219,7 @@ static void rcu_process_callbacks(struct softirq_action *unused)
 	__rcu_process_callbacks(&rcu_sched_state,
 				&__get_cpu_var(rcu_sched_data));
 	__rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
+	rcu_preempt_process_callbacks();
 
 	/*
 	 * Memory references from any later RCU read-side critical sections
@@ -1251,17 +1283,6 @@ void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 }
 EXPORT_SYMBOL_GPL(call_rcu_sched);
 
-/*
- * @@@ Queue an RCU callback for invocation after a grace period.
- * @@@ Placeholder pending rcutree_plugin.h.
- */
-void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
-{
-	call_rcu_sched(head, func);
-}
-EXPORT_SYMBOL_GPL(call_rcu);
-
-
 /*
  * Queue an RCU for invocation after a quicker grace period.
  */
@@ -1335,7 +1356,8 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
 static int rcu_pending(int cpu)
 {
 	return __rcu_pending(&rcu_sched_state, &per_cpu(rcu_sched_data, cpu)) ||
-	       __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu));
+	       __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)) ||
+	       rcu_preempt_pending(cpu);
 }
 
 /*
@@ -1348,7 +1370,8 @@ int rcu_needs_cpu(int cpu)
 {
 	/* RCU callbacks either ready or pending? */
 	return per_cpu(rcu_sched_data, cpu).nxtlist ||
-	       per_cpu(rcu_bh_data, cpu).nxtlist;
+	       per_cpu(rcu_bh_data, cpu).nxtlist ||
+	       rcu_preempt_needs_cpu(cpu);
 }
 
 /*
@@ -1383,7 +1406,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
  * that this CPU cannot possibly have any RCU callbacks in flight yet.
  */
 static void __cpuinit
-rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
+rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
 {
 	unsigned long flags;
 	long lastcomp;
@@ -1399,6 +1422,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
 	rdp->passed_quiesc = 0;  /* We could be racing with new GP, */
 	rdp->qs_pending = 1;	 /*  so set up to respond to current GP. */
 	rdp->beenonline = 1;	 /* We have now been online. */
+	rdp->preemptable = preemptable;
 	rdp->passed_quiesc_completed = lastcomp - 1;
 	rdp->blimit = blimit;
 	spin_unlock(&rnp->lock);		/* irqs remain disabled. */
@@ -1441,12 +1465,13 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
 
 static void __cpuinit rcu_online_cpu(int cpu)
 {
-	rcu_init_percpu_data(cpu, &rcu_sched_state);
-	rcu_init_percpu_data(cpu, &rcu_bh_state);
+	rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
+	rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
+	rcu_preempt_init_percpu_data(cpu);
 }
 
 /*
- * Handle CPU online/offline notifcation events.
+ * Handle CPU online/offline notification events.
  */
 int __cpuinit rcu_cpu_notify(struct notifier_block *self,
 			     unsigned long action, void *hcpu)
@@ -1521,6 +1546,7 @@ static void __init rcu_init_one(struct rcu_state *rsp)
 		rnp = rsp->level[i];
 		for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
 			spin_lock_init(&rnp->lock);
+			rnp->gpnum = 0;
 			rnp->qsmask = 0;
 			rnp->qsmaskinit = 0;
 			rnp->grplo = j * cpustride;
@@ -1538,13 +1564,16 @@ static void __init rcu_init_one(struct rcu_state *rsp)
 					      j / rsp->levelspread[i - 1];
 			}
 			rnp->level = i;
+			INIT_LIST_HEAD(&rnp->blocked_tasks[0]);
+			INIT_LIST_HEAD(&rnp->blocked_tasks[1]);
 		}
 	}
 }
 
 /*
- * Helper macro for __rcu_init().  To be used nowhere else!
- * Assigns leaf node pointers into each CPU's rcu_data structure.
+ * Helper macro for __rcu_init() and __rcu_init_preempt().  To be used
+ * nowhere else!  Assigns leaf node pointers into each CPU's rcu_data
+ * structure.
  */
 #define RCU_INIT_FLAVOR(rsp, rcu_data) \
 do { \
@@ -1560,18 +1589,38 @@ do { \
 	} \
 } while (0)
 
+#ifdef CONFIG_TREE_PREEMPT_RCU
+
+void __init __rcu_init_preempt(void)
+{
+	int i;			/* All used by RCU_INIT_FLAVOR(). */
+	int j;
+	struct rcu_node *rnp;
+
+	RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data);
+}
+
+#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+
+void __init __rcu_init_preempt(void)
+{
+}
+
+#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
+
 void __init __rcu_init(void)
 {
-	int i;			/* All used by RCU_DATA_PTR_INIT(). */
+	int i;			/* All used by RCU_INIT_FLAVOR(). */
 	int j;
 	struct rcu_node *rnp;
 
-	printk(KERN_INFO "Hierarchical RCU implementation.\n");
+	rcu_bootup_announce();
 #ifdef CONFIG_RCU_CPU_STALL_DETECTOR
 	printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
 #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
 	RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data);
 	RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data);
+	__rcu_init_preempt();
 	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
 }
 
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 0024e5ddcc6..ca560364d8c 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -80,6 +80,7 @@ struct rcu_dynticks {
  */
 struct rcu_node {
 	spinlock_t lock;
+	long	gpnum;		/* Current grace period for this node. */
 	unsigned long qsmask;	/* CPUs or groups that need to switch in */
 				/*  order for current grace period to proceed.*/
 	unsigned long qsmaskinit;
@@ -90,6 +91,8 @@ struct rcu_node {
 	u8	grpnum;		/* CPU/group number for next level up. */
 	u8	level;		/* root is at level 0. */
 	struct rcu_node *parent;
+	struct list_head blocked_tasks[2];
+				/* Tasks blocked in RCU read-side critsect. */
 } ____cacheline_internodealigned_in_smp;
 
 /* Index values for nxttail array in struct rcu_data. */
@@ -111,6 +114,7 @@ struct rcu_data {
 	bool		passed_quiesc;	/* User-mode/idle loop etc. */
 	bool		qs_pending;	/* Core waits for quiesc state. */
 	bool		beenonline;	/* CPU online at least once. */
+	bool		preemptable;	/* Preemptable RCU? */
 	struct rcu_node *mynode;	/* This CPU's leaf of hierarchy */
 	unsigned long grpmask;		/* Mask to apply to leaf qsmask. */
 
@@ -244,5 +248,10 @@ DECLARE_PER_CPU(struct rcu_data, rcu_sched_data);
 extern struct rcu_state rcu_bh_state;
 DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
 
+#ifdef CONFIG_TREE_PREEMPT_RCU
+extern struct rcu_state rcu_preempt_state;
+DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
+#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+
 #endif /* #ifdef RCU_TREE_NONCORE */
 
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
new file mode 100644
index 00000000000..cd2ab67400c
--- /dev/null
+++ b/kernel/rcutree_plugin.h
@@ -0,0 +1,447 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion (tree-based version)
+ * Internal non-public definitions that provide either classic
+ * or preemptable semantics.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright Red Hat, 2009
+ * Copyright IBM Corporation, 2009
+ *
+ * Author: Ingo Molnar <mingo@elte.hu>
+ *	   Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ */
+
+
+#ifdef CONFIG_TREE_PREEMPT_RCU
+
+struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
+DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
+
+/*
+ * Tell them what RCU they are running.
+ */
+static inline void rcu_bootup_announce(void)
+{
+	printk(KERN_INFO
+	       "Experimental preemptable hierarchical RCU implementation.\n");
+}
+
+/*
+ * Return the number of RCU-preempt batches processed thus far
+ * for debug and statistics.
+ */
+long rcu_batches_completed_preempt(void)
+{
+	return rcu_preempt_state.completed;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt);
+
+/*
+ * Return the number of RCU batches processed thus far for debug & stats.
+ */
+long rcu_batches_completed(void)
+{
+	return rcu_batches_completed_preempt();
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed);
+
+/*
+ * Record a preemptable-RCU quiescent state for the specified CPU.  Note
+ * that this just means that the task currently running on the CPU is
+ * not in a quiescent state.  There might be any number of tasks blocked
+ * while in an RCU read-side critical section.
+ */
+static void rcu_preempt_qs_record(int cpu)
+{
+	struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
+	rdp->passed_quiesc = 1;
+	rdp->passed_quiesc_completed = rdp->completed;
+}
+
+/*
+ * We have entered the scheduler or are between softirqs in ksoftirqd.
+ * If we are in an RCU read-side critical section, we need to reflect
+ * that in the state of the rcu_node structure corresponding to this CPU.
+ * Caller must disable hardirqs.
+ */
+static void rcu_preempt_qs(int cpu)
+{
+	struct task_struct *t = current;
+	int phase;
+	struct rcu_data *rdp;
+	struct rcu_node *rnp;
+
+	if (t->rcu_read_lock_nesting &&
+	    (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
+
+		/* Possibly blocking in an RCU read-side critical section. */
+		rdp = rcu_preempt_state.rda[cpu];
+		rnp = rdp->mynode;
+		spin_lock(&rnp->lock);
+		t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
+		t->rcu_blocked_cpu = cpu;
+
+		/*
+		 * If this CPU has already checked in, then this task
+		 * will hold up the next grace period rather than the
+		 * current grace period.  Queue the task accordingly.
+		 * If the task is queued for the current grace period
+		 * (i.e., this CPU has not yet passed through a quiescent
+		 * state for the current grace period), then as long
+		 * as that task remains queued, the current grace period
+		 * cannot end.
+		 */
+		phase = !(rnp->qsmask & rdp->grpmask) ^ (rnp->gpnum & 0x1);
+		list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]);
+		smp_mb();  /* Ensure later ctxt swtch seen after above. */
+		spin_unlock(&rnp->lock);
+	}
+
+	/*
+	 * Either we were not in an RCU read-side critical section to
+	 * begin with, or we have now recorded that critical section
+	 * globally.  Either way, we can now note a quiescent state
+	 * for this CPU.  Again, if we were in an RCU read-side critical
+	 * section, and if that critical section was blocking the current
+	 * grace period, then the fact that the task has been enqueued
+	 * means that we continue to block the current grace period.
+	 */
+	rcu_preempt_qs_record(cpu);
+	t->rcu_read_unlock_special &= ~(RCU_READ_UNLOCK_NEED_QS |
+					RCU_READ_UNLOCK_GOT_QS);
+}
+
+/*
+ * Tree-preemptable RCU implementation for rcu_read_lock().
+ * Just increment ->rcu_read_lock_nesting, shared state will be updated
+ * if we block.
+ */
+void __rcu_read_lock(void)
+{
+	ACCESS_ONCE(current->rcu_read_lock_nesting)++;
+	barrier();  /* needed if we ever invoke rcu_read_lock in rcutree.c */
+}
+EXPORT_SYMBOL_GPL(__rcu_read_lock);
+
+static void rcu_read_unlock_special(struct task_struct *t)
+{
+	int empty;
+	unsigned long flags;
+	unsigned long mask;
+	struct rcu_node *rnp;
+	int special;
+
+	/* NMI handlers cannot block and cannot safely manipulate state. */
+	if (in_nmi())
+		return;
+
+	local_irq_save(flags);
+
+	/*
+	 * If RCU core is waiting for this CPU to exit critical section,
+	 * let it know that we have done so.
+	 */
+	special = t->rcu_read_unlock_special;
+	if (special & RCU_READ_UNLOCK_NEED_QS) {
+		t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
+		t->rcu_read_unlock_special |= RCU_READ_UNLOCK_GOT_QS;
+	}
+
+	/* Hardware IRQ handlers cannot block. */
+	if (in_irq()) {
+		local_irq_restore(flags);
+		return;
+	}
+
+	/* Clean up if blocked during RCU read-side critical section. */
+	if (special & RCU_READ_UNLOCK_BLOCKED) {
+		t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;
+
+		/* Remove this task from the list it blocked on. */
+		rnp = rcu_preempt_state.rda[t->rcu_blocked_cpu]->mynode;
+		spin_lock(&rnp->lock);
+		empty = list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]);
+		list_del_init(&t->rcu_node_entry);
+		t->rcu_blocked_cpu = -1;
+
+		/*
+		 * If this was the last task on the current list, and if
+		 * we aren't waiting on any CPUs, report the quiescent state.
+		 * Note that both cpu_quiet_msk_finish() and cpu_quiet_msk()
+		 * drop rnp->lock and restore irq.
+		 */
+		if (!empty && rnp->qsmask == 0 &&
+		    list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1])) {
+			t->rcu_read_unlock_special &=
+				~(RCU_READ_UNLOCK_NEED_QS |
+				  RCU_READ_UNLOCK_GOT_QS);
+			if (rnp->parent == NULL) {
+				/* Only one rcu_node in the tree. */
+				cpu_quiet_msk_finish(&rcu_preempt_state, flags);
+				return;
+			}
+			/* Report up the rest of the hierarchy. */
+			mask = rnp->grpmask;
+			spin_unlock_irqrestore(&rnp->lock, flags);
+			rnp = rnp->parent;
+			spin_lock_irqsave(&rnp->lock, flags);
+			cpu_quiet_msk(mask, &rcu_preempt_state, rnp, flags);
+			return;
+		}
+		spin_unlock(&rnp->lock);
+	}
+	local_irq_restore(flags);
+}
+
+/*
+ * Tree-preemptable RCU implementation for rcu_read_unlock().
+ * Decrement ->rcu_read_lock_nesting.  If the result is zero (outermost
+ * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
+ * invoke rcu_read_unlock_special() to clean up after a context switch
+ * in an RCU read-side critical section and other special cases.
+ */
+void __rcu_read_unlock(void)
+{
+	struct task_struct *t = current;
+
+	barrier();  /* needed if we ever invoke rcu_read_unlock in rcutree.c */
+	if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 &&
+	    unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
+		rcu_read_unlock_special(t);
+}
+EXPORT_SYMBOL_GPL(__rcu_read_unlock);
+
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+
+/*
+ * Scan the current list of tasks blocked within RCU read-side critical
+ * sections, printing out the tid of each.
+ */
+static void rcu_print_task_stall(struct rcu_node *rnp)
+{
+	unsigned long flags;
+	struct list_head *lp;
+	int phase = rnp->gpnum & 0x1;
+	struct task_struct *t;
+
+	if (!list_empty(&rnp->blocked_tasks[phase])) {
+		spin_lock_irqsave(&rnp->lock, flags);
+		phase = rnp->gpnum & 0x1; /* re-read under lock. */
+		lp = &rnp->blocked_tasks[phase];
+		list_for_each_entry(t, lp, rcu_node_entry)
+			printk(" P%d", t->pid);
+		spin_unlock_irqrestore(&rnp->lock, flags);
+	}
+}
+
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+
+/*
+ * Check for preempted RCU readers for the specified rcu_node structure.
+ * If the caller needs a reliable answer, it must hold the rcu_node's
+ * >lock.
+ */
+static int rcu_preempted_readers(struct rcu_node *rnp)
+{
+	return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]);
+}
+
+/*
+ * Check for a quiescent state from the current CPU.  When a task blocks,
+ * the task is recorded in the corresponding CPU's rcu_node structure,
+ * which is checked elsewhere.
+ *
+ * Caller must disable hard irqs.
+ */
+static void rcu_preempt_check_callbacks(int cpu)
+{
+	struct task_struct *t = current;
+
+	if (t->rcu_read_lock_nesting == 0) {
+		t->rcu_read_unlock_special &=
+			~(RCU_READ_UNLOCK_NEED_QS | RCU_READ_UNLOCK_GOT_QS);
+		rcu_preempt_qs_record(cpu);
+		return;
+	}
+	if (per_cpu(rcu_preempt_data, cpu).qs_pending) {
+		if (t->rcu_read_unlock_special & RCU_READ_UNLOCK_GOT_QS) {
+			rcu_preempt_qs_record(cpu);
+			t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_GOT_QS;
+		} else if (!(t->rcu_read_unlock_special &
+			     RCU_READ_UNLOCK_NEED_QS)) {
+			t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
+		}
+	}
+}
+
+/*
+ * Process callbacks for preemptable RCU.
+ */
+static void rcu_preempt_process_callbacks(void)
+{
+	__rcu_process_callbacks(&rcu_preempt_state,
+				&__get_cpu_var(rcu_preempt_data));
+}
+
+/*
+ * Queue a preemptable-RCU callback for invocation after a grace period.
+ */
+void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+	__call_rcu(head, func, &rcu_preempt_state);
+}
+EXPORT_SYMBOL_GPL(call_rcu);
+
+/*
+ * Check to see if there is any immediate preemptable-RCU-related work
+ * to be done.
+ */
+static int rcu_preempt_pending(int cpu)
+{
+	return __rcu_pending(&rcu_preempt_state,
+			     &per_cpu(rcu_preempt_data, cpu));
+}
+
+/*
+ * Does preemptable RCU need the CPU to stay out of dynticks mode?
+ */
+static int rcu_preempt_needs_cpu(int cpu)
+{
+	return !!per_cpu(rcu_preempt_data, cpu).nxtlist;
+}
+
+/*
+ * Initialize preemptable RCU's per-CPU data.
+ */
+static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
+{
+	rcu_init_percpu_data(cpu, &rcu_preempt_state, 1);
+}
+
+/*
+ * Check for a task exiting while in a preemptable-RCU read-side
+ * critical section, clean up if so.  No need to issue warnings,
+ * as debug_check_no_locks_held() already does this if lockdep
+ * is enabled.
+ */
+void exit_rcu(void)
+{
+	struct task_struct *t = current;
+
+	if (t->rcu_read_lock_nesting == 0)
+		return;
+	t->rcu_read_lock_nesting = 1;
+	rcu_read_unlock();
+}
+
+#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+
+/*
+ * Tell them what RCU they are running.
+ */
+static inline void rcu_bootup_announce(void)
+{
+	printk(KERN_INFO "Hierarchical RCU implementation.\n");
+}
+
+/*
+ * Return the number of RCU batches processed thus far for debug & stats.
+ */
+long rcu_batches_completed(void)
+{
+	return rcu_batches_completed_sched();
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed);
+
+/*
+ * Because preemptable RCU does not exist, we never have to check for
+ * CPUs being in quiescent states.
+ */
+static void rcu_preempt_qs(int cpu)
+{
+}
+
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+
+/*
+ * Because preemptable RCU does not exist, we never have to check for
+ * tasks blocked within RCU read-side critical sections.
+ */
+static void rcu_print_task_stall(struct rcu_node *rnp)
+{
+}
+
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+
+/*
+ * Because preemptable RCU does not exist, there are never any preempted
+ * RCU readers.
+ */
+static int rcu_preempted_readers(struct rcu_node *rnp)
+{
+	return 0;
+}
+
+/*
+ * Because preemptable RCU does not exist, it never has any callbacks
+ * to check.
+ */
+void rcu_preempt_check_callbacks(int cpu)
+{
+}
+
+/*
+ * Because preemptable RCU does not exist, it never has any callbacks
+ * to process.
+ */
+void rcu_preempt_process_callbacks(void)
+{
+}
+
+/*
+ * In classic RCU, call_rcu() is just call_rcu_sched().
+ */
+void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+	call_rcu_sched(head, func);
+}
+EXPORT_SYMBOL_GPL(call_rcu);
+
+/*
+ * Because preemptable RCU does not exist, it never has any work to do.
+ */
+static int rcu_preempt_pending(int cpu)
+{
+	return 0;
+}
+
+/*
+ * Because preemptable RCU does not exist, it never needs any CPU.
+ */
+static int rcu_preempt_needs_cpu(int cpu)
+{
+	return 0;
+}
+
+/*
+ * Because preemptable RCU does not exist, there is no per-CPU
+ * data to initialize.
+ */
+static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
+{
+}
+
+#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 31af3a0fb6d..0ea1bff6972 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -77,6 +77,10 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
 
 static int show_rcudata(struct seq_file *m, void *unused)
 {
+#ifdef CONFIG_TREE_PREEMPT_RCU
+	seq_puts(m, "rcu_preempt:\n");
+	PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data, m);
+#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
 	seq_puts(m, "rcu_sched:\n");
 	PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data, m);
 	seq_puts(m, "rcu_bh:\n");
@@ -125,6 +129,10 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
 	seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\",");
 #endif /* #ifdef CONFIG_NO_HZ */
 	seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n");
+#ifdef CONFIG_TREE_PREEMPT_RCU
+	seq_puts(m, "\"rcu_preempt:\"\n");
+	PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m);
+#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
 	seq_puts(m, "\"rcu_sched:\"\n");
 	PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data_csv, m);
 	seq_puts(m, "\"rcu_bh:\"\n");
@@ -172,6 +180,10 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
 
 static int show_rcuhier(struct seq_file *m, void *unused)
 {
+#ifdef CONFIG_TREE_PREEMPT_RCU
+	seq_puts(m, "rcu_preempt:\n");
+	print_one_rcu_state(m, &rcu_preempt_state);
+#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
 	seq_puts(m, "rcu_sched:\n");
 	print_one_rcu_state(m, &rcu_sched_state);
 	seq_puts(m, "rcu_bh:\n");
@@ -194,6 +206,10 @@ static struct file_operations rcuhier_fops = {
 
 static int show_rcugp(struct seq_file *m, void *unused)
 {
+#ifdef CONFIG_TREE_PREEMPT_RCU
+	seq_printf(m, "rcu_preempt: completed=%ld  gpnum=%ld\n",
+		   rcu_preempt_state.completed, rcu_preempt_state.gpnum);
+#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
 	seq_printf(m, "rcu_sched: completed=%ld  gpnum=%ld\n",
 		   rcu_sched_state.completed, rcu_sched_state.gpnum);
 	seq_printf(m, "rcu_bh: completed=%ld  gpnum=%ld\n",
@@ -244,6 +260,10 @@ static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp)
 
 static int show_rcu_pending(struct seq_file *m, void *unused)
 {
+#ifdef CONFIG_TREE_PREEMPT_RCU
+	seq_puts(m, "rcu_preempt:\n");
+	print_rcu_pendings(m, &rcu_preempt_state);
+#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
 	seq_puts(m, "rcu_sched:\n");
 	print_rcu_pendings(m, &rcu_sched_state);
 	seq_puts(m, "rcu_bh:\n");
-- 
cgit v1.2.3


From 6b3ef48adf847f7adf11c870e3ffacac150f1564 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sat, 22 Aug 2009 13:56:53 -0700
Subject: rcu: Remove CONFIG_PREEMPT_RCU

Now that CONFIG_TREE_PREEMPT_RCU is in place, there is no
further need for CONFIG_PREEMPT_RCU.  Remove it, along with
whatever subtle bugs it may (or may not) contain.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: josht@linux.vnet.ibm.com
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
LKML-Reference: <125097461396-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/Makefile           |    2 -
 kernel/rcupreempt.c       | 1518 ---------------------------------------------
 kernel/rcupreempt_trace.c |  335 ----------
 3 files changed, 1855 deletions(-)
 delete mode 100644 kernel/rcupreempt.c
 delete mode 100644 kernel/rcupreempt_trace.c

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index 1a38b4789dd..b833bd5cc12 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -82,9 +82,7 @@ obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_TREE_RCU) += rcutree.o
 obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
-obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o
 obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
-obj-$(CONFIG_PREEMPT_RCU_TRACE) += rcupreempt_trace.o
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
deleted file mode 100644
index 0053ce56e32..00000000000
--- a/kernel/rcupreempt.c
+++ /dev/null
@@ -1,1518 +0,0 @@
-/*
- * Read-Copy Update mechanism for mutual exclusion, realtime implementation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
- * Copyright IBM Corporation, 2006
- *
- * Authors: Paul E. McKenney <paulmck@us.ibm.com>
- *		With thanks to Esben Nielsen, Bill Huey, and Ingo Molnar
- *		for pushing me away from locks and towards counters, and
- *		to Suparna Bhattacharya for pushing me completely away
- *		from atomic instructions on the read side.
- *
- *  - Added handling of Dynamic Ticks
- *      Copyright 2007 - Paul E. Mckenney <paulmck@us.ibm.com>
- *                     - Steven Rostedt <srostedt@redhat.com>
- *
- * Papers:  http://www.rdrop.com/users/paulmck/RCU
- *
- * Design Document: http://lwn.net/Articles/253651/
- *
- * For detailed explanation of Read-Copy Update mechanism see -
- * 		Documentation/RCU/ *.txt
- *
- */
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/spinlock.h>
-#include <linux/smp.h>
-#include <linux/rcupdate.h>
-#include <linux/interrupt.h>
-#include <linux/sched.h>
-#include <asm/atomic.h>
-#include <linux/bitops.h>
-#include <linux/module.h>
-#include <linux/kthread.h>
-#include <linux/completion.h>
-#include <linux/moduleparam.h>
-#include <linux/percpu.h>
-#include <linux/notifier.h>
-#include <linux/cpu.h>
-#include <linux/random.h>
-#include <linux/delay.h>
-#include <linux/cpumask.h>
-#include <linux/rcupreempt_trace.h>
-#include <asm/byteorder.h>
-
-/*
- * PREEMPT_RCU data structures.
- */
-
-/*
- * GP_STAGES specifies the number of times the state machine has
- * to go through the all the rcu_try_flip_states (see below)
- * in a single Grace Period.
- *
- * GP in GP_STAGES stands for Grace Period ;)
- */
-#define GP_STAGES    2
-struct rcu_data {
-	spinlock_t	lock;		/* Protect rcu_data fields. */
-	long		completed;	/* Number of last completed batch. */
-	int		waitlistcount;
-	struct rcu_head *nextlist;
-	struct rcu_head **nexttail;
-	struct rcu_head *waitlist[GP_STAGES];
-	struct rcu_head **waittail[GP_STAGES];
-	struct rcu_head *donelist;	/* from waitlist & waitschedlist */
-	struct rcu_head **donetail;
-	long rcu_flipctr[2];
-	struct rcu_head *nextschedlist;
-	struct rcu_head **nextschedtail;
-	struct rcu_head *waitschedlist;
-	struct rcu_head **waitschedtail;
-	int rcu_sched_sleeping;
-#ifdef CONFIG_RCU_TRACE
-	struct rcupreempt_trace trace;
-#endif /* #ifdef CONFIG_RCU_TRACE */
-};
-
-/*
- * States for rcu_try_flip() and friends.
- */
-
-enum rcu_try_flip_states {
-
-	/*
-	 * Stay here if nothing is happening. Flip the counter if somthing
-	 * starts happening. Denoted by "I"
-	 */
-	rcu_try_flip_idle_state,
-
-	/*
-	 * Wait here for all CPUs to notice that the counter has flipped. This
-	 * prevents the old set of counters from ever being incremented once
-	 * we leave this state, which in turn is necessary because we cannot
-	 * test any individual counter for zero -- we can only check the sum.
-	 * Denoted by "A".
-	 */
-	rcu_try_flip_waitack_state,
-
-	/*
-	 * Wait here for the sum of the old per-CPU counters to reach zero.
-	 * Denoted by "Z".
-	 */
-	rcu_try_flip_waitzero_state,
-
-	/*
-	 * Wait here for each of the other CPUs to execute a memory barrier.
-	 * This is necessary to ensure that these other CPUs really have
-	 * completed executing their RCU read-side critical sections, despite
-	 * their CPUs wildly reordering memory. Denoted by "M".
-	 */
-	rcu_try_flip_waitmb_state,
-};
-
-/*
- * States for rcu_ctrlblk.rcu_sched_sleep.
- */
-
-enum rcu_sched_sleep_states {
-	rcu_sched_not_sleeping,	/* Not sleeping, callbacks need GP.  */
-	rcu_sched_sleep_prep,	/* Thinking of sleeping, rechecking. */
-	rcu_sched_sleeping,	/* Sleeping, awaken if GP needed. */
-};
-
-struct rcu_ctrlblk {
-	spinlock_t	fliplock;	/* Protect state-machine transitions. */
-	long		completed;	/* Number of last completed batch. */
-	enum rcu_try_flip_states rcu_try_flip_state; /* The current state of
-							the rcu state machine */
-	spinlock_t	schedlock;	/* Protect rcu_sched sleep state. */
-	enum rcu_sched_sleep_states sched_sleep; /* rcu_sched state. */
-	wait_queue_head_t sched_wq;	/* Place for rcu_sched to sleep. */
-};
-
-struct rcu_dyntick_sched {
-	int dynticks;
-	int dynticks_snap;
-	int sched_qs;
-	int sched_qs_snap;
-	int sched_dynticks_snap;
-};
-
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = {
-	.dynticks = 1,
-};
-
-static int rcu_pending(int cpu);
-
-void rcu_sched_qs(int cpu)
-{
-	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
-
-	rdssp->sched_qs++;
-}
-
-#ifdef CONFIG_NO_HZ
-
-void rcu_enter_nohz(void)
-{
-	static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
-
-	smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
-	__get_cpu_var(rcu_dyntick_sched).dynticks++;
-	WARN_ON_RATELIMIT(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1, &rs);
-}
-
-void rcu_exit_nohz(void)
-{
-	static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
-
-	__get_cpu_var(rcu_dyntick_sched).dynticks++;
-	smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
-	WARN_ON_RATELIMIT(!(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1),
-				&rs);
-}
-
-#endif /* CONFIG_NO_HZ */
-
-
-static DEFINE_PER_CPU(struct rcu_data, rcu_data);
-
-static struct rcu_ctrlblk rcu_ctrlblk = {
-	.fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
-	.completed = 0,
-	.rcu_try_flip_state = rcu_try_flip_idle_state,
-	.schedlock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.schedlock),
-	.sched_sleep = rcu_sched_not_sleeping,
-	.sched_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rcu_ctrlblk.sched_wq),
-};
-
-static struct task_struct *rcu_sched_grace_period_task;
-
-#ifdef CONFIG_RCU_TRACE
-static char *rcu_try_flip_state_names[] =
-	{ "idle", "waitack", "waitzero", "waitmb" };
-#endif /* #ifdef CONFIG_RCU_TRACE */
-
-static DECLARE_BITMAP(rcu_cpu_online_map, NR_CPUS) __read_mostly
-	= CPU_BITS_NONE;
-
-/*
- * Enum and per-CPU flag to determine when each CPU has seen
- * the most recent counter flip.
- */
-
-enum rcu_flip_flag_values {
-	rcu_flip_seen,		/* Steady/initial state, last flip seen. */
-				/* Only GP detector can update. */
-	rcu_flipped		/* Flip just completed, need confirmation. */
-				/* Only corresponding CPU can update. */
-};
-static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_flip_flag_values, rcu_flip_flag)
-								= rcu_flip_seen;
-
-/*
- * Enum and per-CPU flag to determine when each CPU has executed the
- * needed memory barrier to fence in memory references from its last RCU
- * read-side critical section in the just-completed grace period.
- */
-
-enum rcu_mb_flag_values {
-	rcu_mb_done,		/* Steady/initial state, no mb()s required. */
-				/* Only GP detector can update. */
-	rcu_mb_needed		/* Flip just completed, need an mb(). */
-				/* Only corresponding CPU can update. */
-};
-static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag)
-								= rcu_mb_done;
-
-/*
- * RCU_DATA_ME: find the current CPU's rcu_data structure.
- * RCU_DATA_CPU: find the specified CPU's rcu_data structure.
- */
-#define RCU_DATA_ME()		(&__get_cpu_var(rcu_data))
-#define RCU_DATA_CPU(cpu)	(&per_cpu(rcu_data, cpu))
-
-/*
- * Helper macro for tracing when the appropriate rcu_data is not
- * cached in a local variable, but where the CPU number is so cached.
- */
-#define RCU_TRACE_CPU(f, cpu) RCU_TRACE(f, &(RCU_DATA_CPU(cpu)->trace));
-
-/*
- * Helper macro for tracing when the appropriate rcu_data is not
- * cached in a local variable.
- */
-#define RCU_TRACE_ME(f) RCU_TRACE(f, &(RCU_DATA_ME()->trace));
-
-/*
- * Helper macro for tracing when the appropriate rcu_data is pointed
- * to by a local variable.
- */
-#define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace));
-
-#define RCU_SCHED_BATCH_TIME (HZ / 50)
-
-/*
- * Return the number of RCU batches processed thus far.  Useful
- * for debug and statistics.
- */
-long rcu_batches_completed(void)
-{
-	return rcu_ctrlblk.completed;
-}
-EXPORT_SYMBOL_GPL(rcu_batches_completed);
-
-void __rcu_read_lock(void)
-{
-	int idx;
-	struct task_struct *t = current;
-	int nesting;
-
-	nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
-	if (nesting != 0) {
-
-		/* An earlier rcu_read_lock() covers us, just count it. */
-
-		t->rcu_read_lock_nesting = nesting + 1;
-
-	} else {
-		unsigned long flags;
-
-		/*
-		 * We disable interrupts for the following reasons:
-		 * - If we get scheduling clock interrupt here, and we
-		 *   end up acking the counter flip, it's like a promise
-		 *   that we will never increment the old counter again.
-		 *   Thus we will break that promise if that
-		 *   scheduling clock interrupt happens between the time
-		 *   we pick the .completed field and the time that we
-		 *   increment our counter.
-		 *
-		 * - We don't want to be preempted out here.
-		 *
-		 * NMIs can still occur, of course, and might themselves
-		 * contain rcu_read_lock().
-		 */
-
-		local_irq_save(flags);
-
-		/*
-		 * Outermost nesting of rcu_read_lock(), so increment
-		 * the current counter for the current CPU.  Use volatile
-		 * casts to prevent the compiler from reordering.
-		 */
-
-		idx = ACCESS_ONCE(rcu_ctrlblk.completed) & 0x1;
-		ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])++;
-
-		/*
-		 * Now that the per-CPU counter has been incremented, we
-		 * are protected from races with rcu_read_lock() invoked
-		 * from NMI handlers on this CPU.  We can therefore safely
-		 * increment the nesting counter, relieving further NMIs
-		 * of the need to increment the per-CPU counter.
-		 */
-
-		ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting + 1;
-
-		/*
-		 * Now that we have preventing any NMIs from storing
-		 * to the ->rcu_flipctr_idx, we can safely use it to
-		 * remember which counter to decrement in the matching
-		 * rcu_read_unlock().
-		 */
-
-		ACCESS_ONCE(t->rcu_flipctr_idx) = idx;
-		local_irq_restore(flags);
-	}
-}
-EXPORT_SYMBOL_GPL(__rcu_read_lock);
-
-void __rcu_read_unlock(void)
-{
-	int idx;
-	struct task_struct *t = current;
-	int nesting;
-
-	nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
-	if (nesting > 1) {
-
-		/*
-		 * We are still protected by the enclosing rcu_read_lock(),
-		 * so simply decrement the counter.
-		 */
-
-		t->rcu_read_lock_nesting = nesting - 1;
-
-	} else {
-		unsigned long flags;
-
-		/*
-		 * Disable local interrupts to prevent the grace-period
-		 * detection state machine from seeing us half-done.
-		 * NMIs can still occur, of course, and might themselves
-		 * contain rcu_read_lock() and rcu_read_unlock().
-		 */
-
-		local_irq_save(flags);
-
-		/*
-		 * Outermost nesting of rcu_read_unlock(), so we must
-		 * decrement the current counter for the current CPU.
-		 * This must be done carefully, because NMIs can
-		 * occur at any point in this code, and any rcu_read_lock()
-		 * and rcu_read_unlock() pairs in the NMI handlers
-		 * must interact non-destructively with this code.
-		 * Lots of volatile casts, and -very- careful ordering.
-		 *
-		 * Changes to this code, including this one, must be
-		 * inspected, validated, and tested extremely carefully!!!
-		 */
-
-		/*
-		 * First, pick up the index.
-		 */
-
-		idx = ACCESS_ONCE(t->rcu_flipctr_idx);
-
-		/*
-		 * Now that we have fetched the counter index, it is
-		 * safe to decrement the per-task RCU nesting counter.
-		 * After this, any interrupts or NMIs will increment and
-		 * decrement the per-CPU counters.
-		 */
-		ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting - 1;
-
-		/*
-		 * It is now safe to decrement this task's nesting count.
-		 * NMIs that occur after this statement will route their
-		 * rcu_read_lock() calls through this "else" clause, and
-		 * will thus start incrementing the per-CPU counter on
-		 * their own.  They will also clobber ->rcu_flipctr_idx,
-		 * but that is OK, since we have already fetched it.
-		 */
-
-		ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])--;
-		local_irq_restore(flags);
-	}
-}
-EXPORT_SYMBOL_GPL(__rcu_read_unlock);
-
-/*
- * If a global counter flip has occurred since the last time that we
- * advanced callbacks, advance them.  Hardware interrupts must be
- * disabled when calling this function.
- */
-static void __rcu_advance_callbacks(struct rcu_data *rdp)
-{
-	int cpu;
-	int i;
-	int wlc = 0;
-
-	if (rdp->completed != rcu_ctrlblk.completed) {
-		if (rdp->waitlist[GP_STAGES - 1] != NULL) {
-			*rdp->donetail = rdp->waitlist[GP_STAGES - 1];
-			rdp->donetail = rdp->waittail[GP_STAGES - 1];
-			RCU_TRACE_RDP(rcupreempt_trace_move2done, rdp);
-		}
-		for (i = GP_STAGES - 2; i >= 0; i--) {
-			if (rdp->waitlist[i] != NULL) {
-				rdp->waitlist[i + 1] = rdp->waitlist[i];
-				rdp->waittail[i + 1] = rdp->waittail[i];
-				wlc++;
-			} else {
-				rdp->waitlist[i + 1] = NULL;
-				rdp->waittail[i + 1] =
-					&rdp->waitlist[i + 1];
-			}
-		}
-		if (rdp->nextlist != NULL) {
-			rdp->waitlist[0] = rdp->nextlist;
-			rdp->waittail[0] = rdp->nexttail;
-			wlc++;
-			rdp->nextlist = NULL;
-			rdp->nexttail = &rdp->nextlist;
-			RCU_TRACE_RDP(rcupreempt_trace_move2wait, rdp);
-		} else {
-			rdp->waitlist[0] = NULL;
-			rdp->waittail[0] = &rdp->waitlist[0];
-		}
-		rdp->waitlistcount = wlc;
-		rdp->completed = rcu_ctrlblk.completed;
-	}
-
-	/*
-	 * Check to see if this CPU needs to report that it has seen
-	 * the most recent counter flip, thereby declaring that all
-	 * subsequent rcu_read_lock() invocations will respect this flip.
-	 */
-
-	cpu = raw_smp_processor_id();
-	if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
-		smp_mb();  /* Subsequent counter accesses must see new value */
-		per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
-		smp_mb();  /* Subsequent RCU read-side critical sections */
-			   /*  seen -after- acknowledgement. */
-	}
-}
-
-#ifdef CONFIG_NO_HZ
-static DEFINE_PER_CPU(int, rcu_update_flag);
-
-/**
- * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI.
- *
- * If the CPU was idle with dynamic ticks active, this updates the
- * rcu_dyntick_sched.dynticks to let the RCU handling know that the
- * CPU is active.
- */
-void rcu_irq_enter(void)
-{
-	int cpu = smp_processor_id();
-	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
-
-	if (per_cpu(rcu_update_flag, cpu))
-		per_cpu(rcu_update_flag, cpu)++;
-
-	/*
-	 * Only update if we are coming from a stopped ticks mode
-	 * (rcu_dyntick_sched.dynticks is even).
-	 */
-	if (!in_interrupt() &&
-	    (rdssp->dynticks & 0x1) == 0) {
-		/*
-		 * The following might seem like we could have a race
-		 * with NMI/SMIs. But this really isn't a problem.
-		 * Here we do a read/modify/write, and the race happens
-		 * when an NMI/SMI comes in after the read and before
-		 * the write. But NMI/SMIs will increment this counter
-		 * twice before returning, so the zero bit will not
-		 * be corrupted by the NMI/SMI which is the most important
-		 * part.
-		 *
-		 * The only thing is that we would bring back the counter
-		 * to a postion that it was in during the NMI/SMI.
-		 * But the zero bit would be set, so the rest of the
-		 * counter would again be ignored.
-		 *
-		 * On return from the IRQ, the counter may have the zero
-		 * bit be 0 and the counter the same as the return from
-		 * the NMI/SMI. If the state machine was so unlucky to
-		 * see that, it still doesn't matter, since all
-		 * RCU read-side critical sections on this CPU would
-		 * have already completed.
-		 */
-		rdssp->dynticks++;
-		/*
-		 * The following memory barrier ensures that any
-		 * rcu_read_lock() primitives in the irq handler
-		 * are seen by other CPUs to follow the above
-		 * increment to rcu_dyntick_sched.dynticks. This is
-		 * required in order for other CPUs to correctly
-		 * determine when it is safe to advance the RCU
-		 * grace-period state machine.
-		 */
-		smp_mb(); /* see above block comment. */
-		/*
-		 * Since we can't determine the dynamic tick mode from
-		 * the rcu_dyntick_sched.dynticks after this routine,
-		 * we use a second flag to acknowledge that we came
-		 * from an idle state with ticks stopped.
-		 */
-		per_cpu(rcu_update_flag, cpu)++;
-		/*
-		 * If we take an NMI/SMI now, they will also increment
-		 * the rcu_update_flag, and will not update the
-		 * rcu_dyntick_sched.dynticks on exit. That is for
-		 * this IRQ to do.
-		 */
-	}
-}
-
-/**
- * rcu_irq_exit - Called from exiting Hard irq context.
- *
- * If the CPU was idle with dynamic ticks active, update the
- * rcu_dyntick_sched.dynticks to let the RCU handling be
- * aware that the CPU is going back to idle with no ticks.
- */
-void rcu_irq_exit(void)
-{
-	int cpu = smp_processor_id();
-	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
-
-	/*
-	 * rcu_update_flag is set if we interrupted the CPU
-	 * when it was idle with ticks stopped.
-	 * Once this occurs, we keep track of interrupt nesting
-	 * because a NMI/SMI could also come in, and we still
-	 * only want the IRQ that started the increment of the
-	 * rcu_dyntick_sched.dynticks to be the one that modifies
-	 * it on exit.
-	 */
-	if (per_cpu(rcu_update_flag, cpu)) {
-		if (--per_cpu(rcu_update_flag, cpu))
-			return;
-
-		/* This must match the interrupt nesting */
-		WARN_ON(in_interrupt());
-
-		/*
-		 * If an NMI/SMI happens now we are still
-		 * protected by the rcu_dyntick_sched.dynticks being odd.
-		 */
-
-		/*
-		 * The following memory barrier ensures that any
-		 * rcu_read_unlock() primitives in the irq handler
-		 * are seen by other CPUs to preceed the following
-		 * increment to rcu_dyntick_sched.dynticks. This
-		 * is required in order for other CPUs to determine
-		 * when it is safe to advance the RCU grace-period
-		 * state machine.
-		 */
-		smp_mb(); /* see above block comment. */
-		rdssp->dynticks++;
-		WARN_ON(rdssp->dynticks & 0x1);
-	}
-}
-
-void rcu_nmi_enter(void)
-{
-	rcu_irq_enter();
-}
-
-void rcu_nmi_exit(void)
-{
-	rcu_irq_exit();
-}
-
-static void dyntick_save_progress_counter(int cpu)
-{
-	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
-
-	rdssp->dynticks_snap = rdssp->dynticks;
-}
-
-static inline int
-rcu_try_flip_waitack_needed(int cpu)
-{
-	long curr;
-	long snap;
-	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
-
-	curr = rdssp->dynticks;
-	snap = rdssp->dynticks_snap;
-	smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
-
-	/*
-	 * If the CPU remained in dynticks mode for the entire time
-	 * and didn't take any interrupts, NMIs, SMIs, or whatever,
-	 * then it cannot be in the middle of an rcu_read_lock(), so
-	 * the next rcu_read_lock() it executes must use the new value
-	 * of the counter.  So we can safely pretend that this CPU
-	 * already acknowledged the counter.
-	 */
-
-	if ((curr == snap) && ((curr & 0x1) == 0))
-		return 0;
-
-	/*
-	 * If the CPU passed through or entered a dynticks idle phase with
-	 * no active irq handlers, then, as above, we can safely pretend
-	 * that this CPU already acknowledged the counter.
-	 */
-
-	if ((curr - snap) > 2 || (curr & 0x1) == 0)
-		return 0;
-
-	/* We need this CPU to explicitly acknowledge the counter flip. */
-
-	return 1;
-}
-
-static inline int
-rcu_try_flip_waitmb_needed(int cpu)
-{
-	long curr;
-	long snap;
-	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
-
-	curr = rdssp->dynticks;
-	snap = rdssp->dynticks_snap;
-	smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
-
-	/*
-	 * If the CPU remained in dynticks mode for the entire time
-	 * and didn't take any interrupts, NMIs, SMIs, or whatever,
-	 * then it cannot have executed an RCU read-side critical section
-	 * during that time, so there is no need for it to execute a
-	 * memory barrier.
-	 */
-
-	if ((curr == snap) && ((curr & 0x1) == 0))
-		return 0;
-
-	/*
-	 * If the CPU either entered or exited an outermost interrupt,
-	 * SMI, NMI, or whatever handler, then we know that it executed
-	 * a memory barrier when doing so.  So we don't need another one.
-	 */
-	if (curr != snap)
-		return 0;
-
-	/* We need the CPU to execute a memory barrier. */
-
-	return 1;
-}
-
-static void dyntick_save_progress_counter_sched(int cpu)
-{
-	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
-
-	rdssp->sched_dynticks_snap = rdssp->dynticks;
-}
-
-static int rcu_qsctr_inc_needed_dyntick(int cpu)
-{
-	long curr;
-	long snap;
-	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
-
-	curr = rdssp->dynticks;
-	snap = rdssp->sched_dynticks_snap;
-	smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
-
-	/*
-	 * If the CPU remained in dynticks mode for the entire time
-	 * and didn't take any interrupts, NMIs, SMIs, or whatever,
-	 * then it cannot be in the middle of an rcu_read_lock(), so
-	 * the next rcu_read_lock() it executes must use the new value
-	 * of the counter.  Therefore, this CPU has been in a quiescent
-	 * state the entire time, and we don't need to wait for it.
-	 */
-
-	if ((curr == snap) && ((curr & 0x1) == 0))
-		return 0;
-
-	/*
-	 * If the CPU passed through or entered a dynticks idle phase with
-	 * no active irq handlers, then, as above, this CPU has already
-	 * passed through a quiescent state.
-	 */
-
-	if ((curr - snap) > 2 || (snap & 0x1) == 0)
-		return 0;
-
-	/* We need this CPU to go through a quiescent state. */
-
-	return 1;
-}
-
-#else /* !CONFIG_NO_HZ */
-
-# define dyntick_save_progress_counter(cpu)		do { } while (0)
-# define rcu_try_flip_waitack_needed(cpu)		(1)
-# define rcu_try_flip_waitmb_needed(cpu)		(1)
-
-# define dyntick_save_progress_counter_sched(cpu)	do { } while (0)
-# define rcu_qsctr_inc_needed_dyntick(cpu)		(1)
-
-#endif /* CONFIG_NO_HZ */
-
-static void save_qsctr_sched(int cpu)
-{
-	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
-
-	rdssp->sched_qs_snap = rdssp->sched_qs;
-}
-
-static inline int rcu_qsctr_inc_needed(int cpu)
-{
-	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
-
-	/*
-	 * If there has been a quiescent state, no more need to wait
-	 * on this CPU.
-	 */
-
-	if (rdssp->sched_qs != rdssp->sched_qs_snap) {
-		smp_mb(); /* force ordering with cpu entering schedule(). */
-		return 0;
-	}
-
-	/* We need this CPU to go through a quiescent state. */
-
-	return 1;
-}
-
-/*
- * Get here when RCU is idle.  Decide whether we need to
- * move out of idle state, and return non-zero if so.
- * "Straightforward" approach for the moment, might later
- * use callback-list lengths, grace-period duration, or
- * some such to determine when to exit idle state.
- * Might also need a pre-idle test that does not acquire
- * the lock, but let's get the simple case working first...
- */
-
-static int
-rcu_try_flip_idle(void)
-{
-	int cpu;
-
-	RCU_TRACE_ME(rcupreempt_trace_try_flip_i1);
-	if (!rcu_pending(smp_processor_id())) {
-		RCU_TRACE_ME(rcupreempt_trace_try_flip_ie1);
-		return 0;
-	}
-
-	/*
-	 * Do the flip.
-	 */
-
-	RCU_TRACE_ME(rcupreempt_trace_try_flip_g1);
-	rcu_ctrlblk.completed++;  /* stands in for rcu_try_flip_g2 */
-
-	/*
-	 * Need a memory barrier so that other CPUs see the new
-	 * counter value before they see the subsequent change of all
-	 * the rcu_flip_flag instances to rcu_flipped.
-	 */
-
-	smp_mb();	/* see above block comment. */
-
-	/* Now ask each CPU for acknowledgement of the flip. */
-
-	for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) {
-		per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
-		dyntick_save_progress_counter(cpu);
-	}
-
-	return 1;
-}
-
-/*
- * Wait for CPUs to acknowledge the flip.
- */
-
-static int
-rcu_try_flip_waitack(void)
-{
-	int cpu;
-
-	RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
-	for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
-		if (rcu_try_flip_waitack_needed(cpu) &&
-		    per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
-			RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
-			return 0;
-		}
-
-	/*
-	 * Make sure our checks above don't bleed into subsequent
-	 * waiting for the sum of the counters to reach zero.
-	 */
-
-	smp_mb();	/* see above block comment. */
-	RCU_TRACE_ME(rcupreempt_trace_try_flip_a2);
-	return 1;
-}
-
-/*
- * Wait for collective ``last'' counter to reach zero,
- * then tell all CPUs to do an end-of-grace-period memory barrier.
- */
-
-static int
-rcu_try_flip_waitzero(void)
-{
-	int cpu;
-	int lastidx = !(rcu_ctrlblk.completed & 0x1);
-	int sum = 0;
-
-	/* Check to see if the sum of the "last" counters is zero. */
-
-	RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
-	for_each_possible_cpu(cpu)
-		sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
-	if (sum != 0) {
-		RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
-		return 0;
-	}
-
-	/*
-	 * This ensures that the other CPUs see the call for
-	 * memory barriers -after- the sum to zero has been
-	 * detected here
-	 */
-	smp_mb();  /*  ^^^^^^^^^^^^ */
-
-	/* Call for a memory barrier from each CPU. */
-	for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) {
-		per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
-		dyntick_save_progress_counter(cpu);
-	}
-
-	RCU_TRACE_ME(rcupreempt_trace_try_flip_z2);
-	return 1;
-}
-
-/*
- * Wait for all CPUs to do their end-of-grace-period memory barrier.
- * Return 0 once all CPUs have done so.
- */
-
-static int
-rcu_try_flip_waitmb(void)
-{
-	int cpu;
-
-	RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
-	for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
-		if (rcu_try_flip_waitmb_needed(cpu) &&
-		    per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
-			RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
-			return 0;
-		}
-
-	smp_mb(); /* Ensure that the above checks precede any following flip. */
-	RCU_TRACE_ME(rcupreempt_trace_try_flip_m2);
-	return 1;
-}
-
-/*
- * Attempt a single flip of the counters.  Remember, a single flip does
- * -not- constitute a grace period.  Instead, the interval between
- * at least GP_STAGES consecutive flips is a grace period.
- *
- * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation
- * on a large SMP, they might want to use a hierarchical organization of
- * the per-CPU-counter pairs.
- */
-static void rcu_try_flip(void)
-{
-	unsigned long flags;
-
-	RCU_TRACE_ME(rcupreempt_trace_try_flip_1);
-	if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, flags))) {
-		RCU_TRACE_ME(rcupreempt_trace_try_flip_e1);
-		return;
-	}
-
-	/*
-	 * Take the next transition(s) through the RCU grace-period
-	 * flip-counter state machine.
-	 */
-
-	switch (rcu_ctrlblk.rcu_try_flip_state) {
-	case rcu_try_flip_idle_state:
-		if (rcu_try_flip_idle())
-			rcu_ctrlblk.rcu_try_flip_state =
-				rcu_try_flip_waitack_state;
-		break;
-	case rcu_try_flip_waitack_state:
-		if (rcu_try_flip_waitack())
-			rcu_ctrlblk.rcu_try_flip_state =
-				rcu_try_flip_waitzero_state;
-		break;
-	case rcu_try_flip_waitzero_state:
-		if (rcu_try_flip_waitzero())
-			rcu_ctrlblk.rcu_try_flip_state =
-				rcu_try_flip_waitmb_state;
-		break;
-	case rcu_try_flip_waitmb_state:
-		if (rcu_try_flip_waitmb())
-			rcu_ctrlblk.rcu_try_flip_state =
-				rcu_try_flip_idle_state;
-	}
-	spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
-}
-
-/*
- * Check to see if this CPU needs to do a memory barrier in order to
- * ensure that any prior RCU read-side critical sections have committed
- * their counter manipulations and critical-section memory references
- * before declaring the grace period to be completed.
- */
-static void rcu_check_mb(int cpu)
-{
-	if (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed) {
-		smp_mb();  /* Ensure RCU read-side accesses are visible. */
-		per_cpu(rcu_mb_flag, cpu) = rcu_mb_done;
-	}
-}
-
-void rcu_check_callbacks(int cpu, int user)
-{
-	unsigned long flags;
-	struct rcu_data *rdp;
-
-	if (!rcu_pending(cpu))
-		return; /* if nothing for RCU to do. */
-
-	/*
-	 * If this CPU took its interrupt from user mode or from the
-	 * idle loop, and this is not a nested interrupt, then
-	 * this CPU has to have exited all prior preept-disable
-	 * sections of code.  So invoke rcu_sched_qs() to note this.
-	 *
-	 * The memory barrier is needed to handle the case where
-	 * writes from a preempt-disable section of code get reordered
-	 * into schedule() by this CPU's write buffer.  So the memory
-	 * barrier makes sure that the rcu_sched_qs() is seen by other
-	 * CPUs to happen after any such write.
-	 */
-
-	rdp = RCU_DATA_CPU(cpu);
-	if (user ||
-	    (idle_cpu(cpu) && !in_softirq() &&
-	     hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
-		smp_mb();	/* Guard against aggressive schedule(). */
-		rcu_sched_qs(cpu);
-	}
-
-	rcu_check_mb(cpu);
-	if (rcu_ctrlblk.completed == rdp->completed)
-		rcu_try_flip();
-	spin_lock_irqsave(&rdp->lock, flags);
-	RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
-	__rcu_advance_callbacks(rdp);
-	if (rdp->donelist == NULL) {
-		spin_unlock_irqrestore(&rdp->lock, flags);
-	} else {
-		spin_unlock_irqrestore(&rdp->lock, flags);
-		raise_softirq(RCU_SOFTIRQ);
-	}
-}
-
-/*
- * Needed by dynticks, to make sure all RCU processing has finished
- * when we go idle:
- */
-void rcu_advance_callbacks(int cpu, int user)
-{
-	unsigned long flags;
-	struct rcu_data *rdp = RCU_DATA_CPU(cpu);
-
-	if (rcu_ctrlblk.completed == rdp->completed) {
-		rcu_try_flip();
-		if (rcu_ctrlblk.completed == rdp->completed)
-			return;
-	}
-	spin_lock_irqsave(&rdp->lock, flags);
-	RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
-	__rcu_advance_callbacks(rdp);
-	spin_unlock_irqrestore(&rdp->lock, flags);
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-#define rcu_offline_cpu_enqueue(srclist, srctail, dstlist, dsttail) do { \
-		*dsttail = srclist; \
-		if (srclist != NULL) { \
-			dsttail = srctail; \
-			srclist = NULL; \
-			srctail = &srclist;\
-		} \
-	} while (0)
-
-void rcu_offline_cpu(int cpu)
-{
-	int i;
-	struct rcu_head *list = NULL;
-	unsigned long flags;
-	struct rcu_data *rdp = RCU_DATA_CPU(cpu);
-	struct rcu_head *schedlist = NULL;
-	struct rcu_head **schedtail = &schedlist;
-	struct rcu_head **tail = &list;
-
-	/*
-	 * Remove all callbacks from the newly dead CPU, retaining order.
-	 * Otherwise rcu_barrier() will fail
-	 */
-
-	spin_lock_irqsave(&rdp->lock, flags);
-	rcu_offline_cpu_enqueue(rdp->donelist, rdp->donetail, list, tail);
-	for (i = GP_STAGES - 1; i >= 0; i--)
-		rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i],
-						list, tail);
-	rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail);
-	rcu_offline_cpu_enqueue(rdp->waitschedlist, rdp->waitschedtail,
-				schedlist, schedtail);
-	rcu_offline_cpu_enqueue(rdp->nextschedlist, rdp->nextschedtail,
-				schedlist, schedtail);
-	rdp->rcu_sched_sleeping = 0;
-	spin_unlock_irqrestore(&rdp->lock, flags);
-	rdp->waitlistcount = 0;
-
-	/* Disengage the newly dead CPU from the grace-period computation. */
-
-	spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
-	rcu_check_mb(cpu);
-	if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
-		smp_mb();  /* Subsequent counter accesses must see new value */
-		per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
-		smp_mb();  /* Subsequent RCU read-side critical sections */
-			   /*  seen -after- acknowledgement. */
-	}
-
-	cpumask_clear_cpu(cpu, to_cpumask(rcu_cpu_online_map));
-
-	spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
-
-	/*
-	 * Place the removed callbacks on the current CPU's queue.
-	 * Make them all start a new grace period: simple approach,
-	 * in theory could starve a given set of callbacks, but
-	 * you would need to be doing some serious CPU hotplugging
-	 * to make this happen.  If this becomes a problem, adding
-	 * a synchronize_rcu() to the hotplug path would be a simple
-	 * fix.
-	 */
-
-	local_irq_save(flags);  /* disable preempt till we know what lock. */
-	rdp = RCU_DATA_ME();
-	spin_lock(&rdp->lock);
-	*rdp->nexttail = list;
-	if (list)
-		rdp->nexttail = tail;
-	*rdp->nextschedtail = schedlist;
-	if (schedlist)
-		rdp->nextschedtail = schedtail;
-	spin_unlock_irqrestore(&rdp->lock, flags);
-}
-
-#else /* #ifdef CONFIG_HOTPLUG_CPU */
-
-void rcu_offline_cpu(int cpu)
-{
-}
-
-#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
-
-void __cpuinit rcu_online_cpu(int cpu)
-{
-	unsigned long flags;
-	struct rcu_data *rdp;
-
-	spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
-	cpumask_set_cpu(cpu, to_cpumask(rcu_cpu_online_map));
-	spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
-
-	/*
-	 * The rcu_sched grace-period processing might have bypassed
-	 * this CPU, given that it was not in the rcu_cpu_online_map
-	 * when the grace-period scan started.  This means that the
-	 * grace-period task might sleep.  So make sure that if this
-	 * should happen, the first callback posted to this CPU will
-	 * wake up the grace-period task if need be.
-	 */
-
-	rdp = RCU_DATA_CPU(cpu);
-	spin_lock_irqsave(&rdp->lock, flags);
-	rdp->rcu_sched_sleeping = 1;
-	spin_unlock_irqrestore(&rdp->lock, flags);
-}
-
-static void rcu_process_callbacks(struct softirq_action *unused)
-{
-	unsigned long flags;
-	struct rcu_head *next, *list;
-	struct rcu_data *rdp;
-
-	local_irq_save(flags);
-	rdp = RCU_DATA_ME();
-	spin_lock(&rdp->lock);
-	list = rdp->donelist;
-	if (list == NULL) {
-		spin_unlock_irqrestore(&rdp->lock, flags);
-		return;
-	}
-	rdp->donelist = NULL;
-	rdp->donetail = &rdp->donelist;
-	RCU_TRACE_RDP(rcupreempt_trace_done_remove, rdp);
-	spin_unlock_irqrestore(&rdp->lock, flags);
-	while (list) {
-		next = list->next;
-		list->func(list);
-		list = next;
-		RCU_TRACE_ME(rcupreempt_trace_invoke);
-	}
-}
-
-void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
-{
-	unsigned long flags;
-	struct rcu_data *rdp;
-
-	head->func = func;
-	head->next = NULL;
-	local_irq_save(flags);
-	rdp = RCU_DATA_ME();
-	spin_lock(&rdp->lock);
-	__rcu_advance_callbacks(rdp);
-	*rdp->nexttail = head;
-	rdp->nexttail = &head->next;
-	RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp);
-	spin_unlock_irqrestore(&rdp->lock, flags);
-}
-EXPORT_SYMBOL_GPL(call_rcu);
-
-void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
-{
-	unsigned long flags;
-	struct rcu_data *rdp;
-	int wake_gp = 0;
-
-	head->func = func;
-	head->next = NULL;
-	local_irq_save(flags);
-	rdp = RCU_DATA_ME();
-	spin_lock(&rdp->lock);
-	*rdp->nextschedtail = head;
-	rdp->nextschedtail = &head->next;
-	if (rdp->rcu_sched_sleeping) {
-
-		/* Grace-period processing might be sleeping... */
-
-		rdp->rcu_sched_sleeping = 0;
-		wake_gp = 1;
-	}
-	spin_unlock_irqrestore(&rdp->lock, flags);
-	if (wake_gp) {
-
-		/* Wake up grace-period processing, unless someone beat us. */
-
-		spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
-		if (rcu_ctrlblk.sched_sleep != rcu_sched_sleeping)
-			wake_gp = 0;
-		rcu_ctrlblk.sched_sleep = rcu_sched_not_sleeping;
-		spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
-		if (wake_gp)
-			wake_up_interruptible(&rcu_ctrlblk.sched_wq);
-	}
-}
-EXPORT_SYMBOL_GPL(call_rcu_sched);
-
-/*
- * Wait until all currently running preempt_disable() code segments
- * (including hardware-irq-disable segments) complete.  Note that
- * in -rt this does -not- necessarily result in all currently executing
- * interrupt -handlers- having completed.
- */
-void __synchronize_sched(void)
-{
-	struct rcu_synchronize rcu;
-
-	if (num_online_cpus() == 1)
-		return;  /* blocking is gp if only one CPU! */
-
-	init_completion(&rcu.completion);
-	/* Will wake me after RCU finished. */
-	call_rcu_sched(&rcu.head, wakeme_after_rcu);
-	/* Wait for it. */
-	wait_for_completion(&rcu.completion);
-}
-EXPORT_SYMBOL_GPL(__synchronize_sched);
-
-/*
- * kthread function that manages call_rcu_sched grace periods.
- */
-static int rcu_sched_grace_period(void *arg)
-{
-	int couldsleep;		/* might sleep after current pass. */
-	int couldsleepnext = 0; /* might sleep after next pass. */
-	int cpu;
-	unsigned long flags;
-	struct rcu_data *rdp;
-	int ret;
-
-	/*
-	 * Each pass through the following loop handles one
-	 * rcu_sched grace period cycle.
-	 */
-	do {
-		/* Save each CPU's current state. */
-
-		for_each_online_cpu(cpu) {
-			dyntick_save_progress_counter_sched(cpu);
-			save_qsctr_sched(cpu);
-		}
-
-		/*
-		 * Sleep for about an RCU grace-period's worth to
-		 * allow better batching and to consume less CPU.
-		 */
-		schedule_timeout_interruptible(RCU_SCHED_BATCH_TIME);
-
-		/*
-		 * If there was nothing to do last time, prepare to
-		 * sleep at the end of the current grace period cycle.
-		 */
-		couldsleep = couldsleepnext;
-		couldsleepnext = 1;
-		if (couldsleep) {
-			spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
-			rcu_ctrlblk.sched_sleep = rcu_sched_sleep_prep;
-			spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
-		}
-
-		/*
-		 * Wait on each CPU in turn to have either visited
-		 * a quiescent state or been in dynticks-idle mode.
-		 */
-		for_each_online_cpu(cpu) {
-			while (rcu_qsctr_inc_needed(cpu) &&
-			       rcu_qsctr_inc_needed_dyntick(cpu)) {
-				/* resched_cpu(cpu); @@@ */
-				schedule_timeout_interruptible(1);
-			}
-		}
-
-		/* Advance callbacks for each CPU.  */
-
-		for_each_online_cpu(cpu) {
-
-			rdp = RCU_DATA_CPU(cpu);
-			spin_lock_irqsave(&rdp->lock, flags);
-
-			/*
-			 * We are running on this CPU irq-disabled, so no
-			 * CPU can go offline until we re-enable irqs.
-			 * The current CPU might have already gone
-			 * offline (between the for_each_offline_cpu and
-			 * the spin_lock_irqsave), but in that case all its
-			 * callback lists will be empty, so no harm done.
-			 *
-			 * Advance the callbacks!  We share normal RCU's
-			 * donelist, since callbacks are invoked the
-			 * same way in either case.
-			 */
-			if (rdp->waitschedlist != NULL) {
-				*rdp->donetail = rdp->waitschedlist;
-				rdp->donetail = rdp->waitschedtail;
-
-				/*
-				 * Next rcu_check_callbacks() will
-				 * do the required raise_softirq().
-				 */
-			}
-			if (rdp->nextschedlist != NULL) {
-				rdp->waitschedlist = rdp->nextschedlist;
-				rdp->waitschedtail = rdp->nextschedtail;
-				couldsleep = 0;
-				couldsleepnext = 0;
-			} else {
-				rdp->waitschedlist = NULL;
-				rdp->waitschedtail = &rdp->waitschedlist;
-			}
-			rdp->nextschedlist = NULL;
-			rdp->nextschedtail = &rdp->nextschedlist;
-
-			/* Mark sleep intention. */
-
-			rdp->rcu_sched_sleeping = couldsleep;
-
-			spin_unlock_irqrestore(&rdp->lock, flags);
-		}
-
-		/* If we saw callbacks on the last scan, go deal with them. */
-
-		if (!couldsleep)
-			continue;
-
-		/* Attempt to block... */
-
-		spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
-		if (rcu_ctrlblk.sched_sleep != rcu_sched_sleep_prep) {
-
-			/*
-			 * Someone posted a callback after we scanned.
-			 * Go take care of it.
-			 */
-			spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
-			couldsleepnext = 0;
-			continue;
-		}
-
-		/* Block until the next person posts a callback. */
-
-		rcu_ctrlblk.sched_sleep = rcu_sched_sleeping;
-		spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
-		ret = 0; /* unused */
-		__wait_event_interruptible(rcu_ctrlblk.sched_wq,
-			rcu_ctrlblk.sched_sleep != rcu_sched_sleeping,
-			ret);
-
-		couldsleepnext = 0;
-
-	} while (!kthread_should_stop());
-
-	return (0);
-}
-
-/*
- * Check to see if any future RCU-related work will need to be done
- * by the current CPU, even if none need be done immediately, returning
- * 1 if so.  Assumes that notifiers would take care of handling any
- * outstanding requests from the RCU core.
- *
- * This function is part of the RCU implementation; it is -not-
- * an exported member of the RCU API.
- */
-int rcu_needs_cpu(int cpu)
-{
-	struct rcu_data *rdp = RCU_DATA_CPU(cpu);
-
-	return (rdp->donelist != NULL ||
-		!!rdp->waitlistcount ||
-		rdp->nextlist != NULL ||
-		rdp->nextschedlist != NULL ||
-		rdp->waitschedlist != NULL);
-}
-
-static int rcu_pending(int cpu)
-{
-	struct rcu_data *rdp = RCU_DATA_CPU(cpu);
-
-	/* The CPU has at least one callback queued somewhere. */
-
-	if (rdp->donelist != NULL ||
-	    !!rdp->waitlistcount ||
-	    rdp->nextlist != NULL ||
-	    rdp->nextschedlist != NULL ||
-	    rdp->waitschedlist != NULL)
-		return 1;
-
-	/* The RCU core needs an acknowledgement from this CPU. */
-
-	if ((per_cpu(rcu_flip_flag, cpu) == rcu_flipped) ||
-	    (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed))
-		return 1;
-
-	/* This CPU has fallen behind the global grace-period number. */
-
-	if (rdp->completed != rcu_ctrlblk.completed)
-		return 1;
-
-	/* Nothing needed from this CPU. */
-
-	return 0;
-}
-
-int __cpuinit rcu_cpu_notify(struct notifier_block *self,
-			     unsigned long action, void *hcpu)
-{
-	long cpu = (long)hcpu;
-
-	switch (action) {
-	case CPU_UP_PREPARE:
-	case CPU_UP_PREPARE_FROZEN:
-		rcu_online_cpu(cpu);
-		break;
-	case CPU_UP_CANCELED:
-	case CPU_UP_CANCELED_FROZEN:
-	case CPU_DEAD:
-	case CPU_DEAD_FROZEN:
-		rcu_offline_cpu(cpu);
-		break;
-	default:
-		break;
-	}
-	return NOTIFY_OK;
-}
-
-void __init __rcu_init(void)
-{
-	int cpu;
-	int i;
-	struct rcu_data *rdp;
-
-	printk(KERN_NOTICE "Preemptible RCU implementation.\n");
-	for_each_possible_cpu(cpu) {
-		rdp = RCU_DATA_CPU(cpu);
-		spin_lock_init(&rdp->lock);
-		rdp->completed = 0;
-		rdp->waitlistcount = 0;
-		rdp->nextlist = NULL;
-		rdp->nexttail = &rdp->nextlist;
-		for (i = 0; i < GP_STAGES; i++) {
-			rdp->waitlist[i] = NULL;
-			rdp->waittail[i] = &rdp->waitlist[i];
-		}
-		rdp->donelist = NULL;
-		rdp->donetail = &rdp->donelist;
-		rdp->rcu_flipctr[0] = 0;
-		rdp->rcu_flipctr[1] = 0;
-		rdp->nextschedlist = NULL;
-		rdp->nextschedtail = &rdp->nextschedlist;
-		rdp->waitschedlist = NULL;
-		rdp->waitschedtail = &rdp->waitschedlist;
-		rdp->rcu_sched_sleeping = 0;
-	}
-	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
-}
-
-/*
- * Late-boot-time RCU initialization that must wait until after scheduler
- * has been initialized.
- */
-void __init rcu_init_sched(void)
-{
-	rcu_sched_grace_period_task = kthread_run(rcu_sched_grace_period,
-						  NULL,
-						  "rcu_sched_grace_period");
-	WARN_ON(IS_ERR(rcu_sched_grace_period_task));
-}
-
-#ifdef CONFIG_RCU_TRACE
-long *rcupreempt_flipctr(int cpu)
-{
-	return &RCU_DATA_CPU(cpu)->rcu_flipctr[0];
-}
-EXPORT_SYMBOL_GPL(rcupreempt_flipctr);
-
-int rcupreempt_flip_flag(int cpu)
-{
-	return per_cpu(rcu_flip_flag, cpu);
-}
-EXPORT_SYMBOL_GPL(rcupreempt_flip_flag);
-
-int rcupreempt_mb_flag(int cpu)
-{
-	return per_cpu(rcu_mb_flag, cpu);
-}
-EXPORT_SYMBOL_GPL(rcupreempt_mb_flag);
-
-char *rcupreempt_try_flip_state_name(void)
-{
-	return rcu_try_flip_state_names[rcu_ctrlblk.rcu_try_flip_state];
-}
-EXPORT_SYMBOL_GPL(rcupreempt_try_flip_state_name);
-
-struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu)
-{
-	struct rcu_data *rdp = RCU_DATA_CPU(cpu);
-
-	return &rdp->trace;
-}
-EXPORT_SYMBOL_GPL(rcupreempt_trace_cpu);
-
-#endif /* #ifdef RCU_TRACE */
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
deleted file mode 100644
index 11640346a50..00000000000
--- a/kernel/rcupreempt_trace.c
+++ /dev/null
@@ -1,335 +0,0 @@
-/*
- * Read-Copy Update tracing for realtime implementation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
- * Copyright IBM Corporation, 2006
- *
- * Papers:  http://www.rdrop.com/users/paulmck/RCU
- *
- * For detailed explanation of Read-Copy Update mechanism see -
- * 		Documentation/RCU/ *.txt
- *
- */
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/spinlock.h>
-#include <linux/smp.h>
-#include <linux/rcupdate.h>
-#include <linux/interrupt.h>
-#include <linux/sched.h>
-#include <asm/atomic.h>
-#include <linux/bitops.h>
-#include <linux/module.h>
-#include <linux/completion.h>
-#include <linux/moduleparam.h>
-#include <linux/percpu.h>
-#include <linux/notifier.h>
-#include <linux/cpu.h>
-#include <linux/mutex.h>
-#include <linux/rcupreempt_trace.h>
-#include <linux/debugfs.h>
-
-static struct mutex rcupreempt_trace_mutex;
-static char *rcupreempt_trace_buf;
-#define RCUPREEMPT_TRACE_BUF_SIZE 4096
-
-void rcupreempt_trace_move2done(struct rcupreempt_trace *trace)
-{
-	trace->done_length += trace->wait_length;
-	trace->done_add += trace->wait_length;
-	trace->wait_length = 0;
-}
-void rcupreempt_trace_move2wait(struct rcupreempt_trace *trace)
-{
-	trace->wait_length += trace->next_length;
-	trace->wait_add += trace->next_length;
-	trace->next_length = 0;
-}
-void rcupreempt_trace_try_flip_1(struct rcupreempt_trace *trace)
-{
-	atomic_inc(&trace->rcu_try_flip_1);
-}
-void rcupreempt_trace_try_flip_e1(struct rcupreempt_trace *trace)
-{
-	atomic_inc(&trace->rcu_try_flip_e1);
-}
-void rcupreempt_trace_try_flip_i1(struct rcupreempt_trace *trace)
-{
-	trace->rcu_try_flip_i1++;
-}
-void rcupreempt_trace_try_flip_ie1(struct rcupreempt_trace *trace)
-{
-	trace->rcu_try_flip_ie1++;
-}
-void rcupreempt_trace_try_flip_g1(struct rcupreempt_trace *trace)
-{
-	trace->rcu_try_flip_g1++;
-}
-void rcupreempt_trace_try_flip_a1(struct rcupreempt_trace *trace)
-{
-	trace->rcu_try_flip_a1++;
-}
-void rcupreempt_trace_try_flip_ae1(struct rcupreempt_trace *trace)
-{
-	trace->rcu_try_flip_ae1++;
-}
-void rcupreempt_trace_try_flip_a2(struct rcupreempt_trace *trace)
-{
-	trace->rcu_try_flip_a2++;
-}
-void rcupreempt_trace_try_flip_z1(struct rcupreempt_trace *trace)
-{
-	trace->rcu_try_flip_z1++;
-}
-void rcupreempt_trace_try_flip_ze1(struct rcupreempt_trace *trace)
-{
-	trace->rcu_try_flip_ze1++;
-}
-void rcupreempt_trace_try_flip_z2(struct rcupreempt_trace *trace)
-{
-	trace->rcu_try_flip_z2++;
-}
-void rcupreempt_trace_try_flip_m1(struct rcupreempt_trace *trace)
-{
-	trace->rcu_try_flip_m1++;
-}
-void rcupreempt_trace_try_flip_me1(struct rcupreempt_trace *trace)
-{
-	trace->rcu_try_flip_me1++;
-}
-void rcupreempt_trace_try_flip_m2(struct rcupreempt_trace *trace)
-{
-	trace->rcu_try_flip_m2++;
-}
-void rcupreempt_trace_check_callbacks(struct rcupreempt_trace *trace)
-{
-	trace->rcu_check_callbacks++;
-}
-void rcupreempt_trace_done_remove(struct rcupreempt_trace *trace)
-{
-	trace->done_remove += trace->done_length;
-	trace->done_length = 0;
-}
-void rcupreempt_trace_invoke(struct rcupreempt_trace *trace)
-{
-	atomic_inc(&trace->done_invoked);
-}
-void rcupreempt_trace_next_add(struct rcupreempt_trace *trace)
-{
-	trace->next_add++;
-	trace->next_length++;
-}
-
-static void rcupreempt_trace_sum(struct rcupreempt_trace *sp)
-{
-	struct rcupreempt_trace *cp;
-	int cpu;
-
-	memset(sp, 0, sizeof(*sp));
-	for_each_possible_cpu(cpu) {
-		cp = rcupreempt_trace_cpu(cpu);
-		sp->next_length += cp->next_length;
-		sp->next_add += cp->next_add;
-		sp->wait_length += cp->wait_length;
-		sp->wait_add += cp->wait_add;
-		sp->done_length += cp->done_length;
-		sp->done_add += cp->done_add;
-		sp->done_remove += cp->done_remove;
-		atomic_add(atomic_read(&cp->done_invoked), &sp->done_invoked);
-		sp->rcu_check_callbacks += cp->rcu_check_callbacks;
-		atomic_add(atomic_read(&cp->rcu_try_flip_1),
-			   &sp->rcu_try_flip_1);
-		atomic_add(atomic_read(&cp->rcu_try_flip_e1),
-			   &sp->rcu_try_flip_e1);
-		sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1;
-		sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1;
-		sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1;
-		sp->rcu_try_flip_a1 += cp->rcu_try_flip_a1;
-		sp->rcu_try_flip_ae1 += cp->rcu_try_flip_ae1;
-		sp->rcu_try_flip_a2 += cp->rcu_try_flip_a2;
-		sp->rcu_try_flip_z1 += cp->rcu_try_flip_z1;
-		sp->rcu_try_flip_ze1 += cp->rcu_try_flip_ze1;
-		sp->rcu_try_flip_z2 += cp->rcu_try_flip_z2;
-		sp->rcu_try_flip_m1 += cp->rcu_try_flip_m1;
-		sp->rcu_try_flip_me1 += cp->rcu_try_flip_me1;
-		sp->rcu_try_flip_m2 += cp->rcu_try_flip_m2;
-	}
-}
-
-static ssize_t rcustats_read(struct file *filp, char __user *buffer,
-				size_t count, loff_t *ppos)
-{
-	struct rcupreempt_trace trace;
-	ssize_t bcount;
-	int cnt = 0;
-
-	rcupreempt_trace_sum(&trace);
-	mutex_lock(&rcupreempt_trace_mutex);
-	snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt,
-		 "ggp=%ld rcc=%ld\n",
-		 rcu_batches_completed(),
-		 trace.rcu_check_callbacks);
-	snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt,
-		 "na=%ld nl=%ld wa=%ld wl=%ld da=%ld dl=%ld dr=%ld di=%d\n"
-		 "1=%d e1=%d i1=%ld ie1=%ld g1=%ld a1=%ld ae1=%ld a2=%ld\n"
-		 "z1=%ld ze1=%ld z2=%ld m1=%ld me1=%ld m2=%ld\n",
-
-		 trace.next_add, trace.next_length,
-		 trace.wait_add, trace.wait_length,
-		 trace.done_add, trace.done_length,
-		 trace.done_remove, atomic_read(&trace.done_invoked),
-		 atomic_read(&trace.rcu_try_flip_1),
-		 atomic_read(&trace.rcu_try_flip_e1),
-		 trace.rcu_try_flip_i1, trace.rcu_try_flip_ie1,
-		 trace.rcu_try_flip_g1,
-		 trace.rcu_try_flip_a1, trace.rcu_try_flip_ae1,
-			 trace.rcu_try_flip_a2,
-		 trace.rcu_try_flip_z1, trace.rcu_try_flip_ze1,
-			 trace.rcu_try_flip_z2,
-		 trace.rcu_try_flip_m1, trace.rcu_try_flip_me1,
-			trace.rcu_try_flip_m2);
-	bcount = simple_read_from_buffer(buffer, count, ppos,
-			rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
-	mutex_unlock(&rcupreempt_trace_mutex);
-	return bcount;
-}
-
-static ssize_t rcugp_read(struct file *filp, char __user *buffer,
-				size_t count, loff_t *ppos)
-{
-	long oldgp = rcu_batches_completed();
-	ssize_t bcount;
-
-	mutex_lock(&rcupreempt_trace_mutex);
-	synchronize_rcu();
-	snprintf(rcupreempt_trace_buf, RCUPREEMPT_TRACE_BUF_SIZE,
-		"oldggp=%ld  newggp=%ld\n", oldgp, rcu_batches_completed());
-	bcount = simple_read_from_buffer(buffer, count, ppos,
-			rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
-	mutex_unlock(&rcupreempt_trace_mutex);
-	return bcount;
-}
-
-static ssize_t rcuctrs_read(struct file *filp, char __user *buffer,
-				size_t count, loff_t *ppos)
-{
-	int cnt = 0;
-	int cpu;
-	int f = rcu_batches_completed() & 0x1;
-	ssize_t bcount;
-
-	mutex_lock(&rcupreempt_trace_mutex);
-
-	cnt += snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE,
-				"CPU last cur F M\n");
-	for_each_possible_cpu(cpu) {
-		long *flipctr = rcupreempt_flipctr(cpu);
-		cnt += snprintf(&rcupreempt_trace_buf[cnt],
-				RCUPREEMPT_TRACE_BUF_SIZE - cnt,
-					"%3d%c %4ld %3ld %d %d\n",
-			       cpu,
-			       cpu_is_offline(cpu) ? '!' : ' ',
-			       flipctr[!f],
-			       flipctr[f],
-			       rcupreempt_flip_flag(cpu),
-			       rcupreempt_mb_flag(cpu));
-	}
-	cnt += snprintf(&rcupreempt_trace_buf[cnt],
-			RCUPREEMPT_TRACE_BUF_SIZE - cnt,
-			"ggp = %ld, state = %s\n",
-			rcu_batches_completed(),
-			rcupreempt_try_flip_state_name());
-	cnt += snprintf(&rcupreempt_trace_buf[cnt],
-			RCUPREEMPT_TRACE_BUF_SIZE - cnt,
-			"\n");
-	bcount = simple_read_from_buffer(buffer, count, ppos,
-			rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
-	mutex_unlock(&rcupreempt_trace_mutex);
-	return bcount;
-}
-
-static struct file_operations rcustats_fops = {
-	.owner = THIS_MODULE,
-	.read = rcustats_read,
-};
-
-static struct file_operations rcugp_fops = {
-	.owner = THIS_MODULE,
-	.read = rcugp_read,
-};
-
-static struct file_operations rcuctrs_fops = {
-	.owner = THIS_MODULE,
-	.read = rcuctrs_read,
-};
-
-static struct dentry *rcudir, *statdir, *ctrsdir, *gpdir;
-static int rcupreempt_debugfs_init(void)
-{
-	rcudir = debugfs_create_dir("rcu", NULL);
-	if (!rcudir)
-		goto out;
-	statdir = debugfs_create_file("rcustats", 0444, rcudir,
-						NULL, &rcustats_fops);
-	if (!statdir)
-		goto free_out;
-
-	gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
-	if (!gpdir)
-		goto free_out;
-
-	ctrsdir = debugfs_create_file("rcuctrs", 0444, rcudir,
-						NULL, &rcuctrs_fops);
-	if (!ctrsdir)
-		goto free_out;
-	return 0;
-free_out:
-	if (statdir)
-		debugfs_remove(statdir);
-	if (gpdir)
-		debugfs_remove(gpdir);
-	debugfs_remove(rcudir);
-out:
-	return 1;
-}
-
-static int __init rcupreempt_trace_init(void)
-{
-	int ret;
-
-	mutex_init(&rcupreempt_trace_mutex);
-	rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL);
-	if (!rcupreempt_trace_buf)
-		return 1;
-	ret = rcupreempt_debugfs_init();
-	if (ret)
-		kfree(rcupreempt_trace_buf);
-	return ret;
-}
-
-static void __exit rcupreempt_trace_cleanup(void)
-{
-	debugfs_remove(statdir);
-	debugfs_remove(gpdir);
-	debugfs_remove(ctrsdir);
-	debugfs_remove(rcudir);
-	kfree(rcupreempt_trace_buf);
-}
-
-
-module_init(rcupreempt_trace_init);
-module_exit(rcupreempt_trace_cleanup);
-- 
cgit v1.2.3


From d8e180dcd5bbbab9cd3ff2e779efcf70692ef541 Mon Sep 17 00:00:00 2001
From: Michal Schmidt <mschmidt@redhat.com>
Date: Thu, 20 Aug 2009 14:39:52 -0700
Subject: bsdacct: switch credentials for writing to the accounting file

When process accounting is enabled, every exiting process writes a log to
the account file.  In addition, every once in a while one of the exiting
processes checks whether there's enough free space for the log.

SELinux policy may or may not allow the exiting process to stat the fs.
So unsuspecting processes start generating AVC denials just because
someone enabled process accounting.

For these filesystem operations, the exiting process's credentials should
be temporarily switched to that of the process which enabled accounting,
because it's really that process which wanted to have the accounting
information logged.

Signed-off-by: Michal Schmidt <mschmidt@redhat.com>
Acked-by: David Howells <dhowells@redhat.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/acct.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/acct.c b/kernel/acct.c
index 9f3391090b3..9a4715a2f6b 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -491,13 +491,17 @@ static void do_acct_process(struct bsd_acct_struct *acct,
 	u64 run_time;
 	struct timespec uptime;
 	struct tty_struct *tty;
+	const struct cred *orig_cred;
+
+	/* Perform file operations on behalf of whoever enabled accounting */
+	orig_cred = override_creds(file->f_cred);
 
 	/*
 	 * First check to see if there is enough free_space to continue
 	 * the process accounting system.
 	 */
 	if (!check_free_space(acct, file))
-		return;
+		goto out;
 
 	/*
 	 * Fill the accounting struct with the needed info as recorded
@@ -578,6 +582,8 @@ static void do_acct_process(struct bsd_acct_struct *acct,
 			       sizeof(acct_t), &file->f_pos);
 	current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
 	set_fs(fs);
+out:
+	revert_creds(orig_cred);
 }
 
 /**
-- 
cgit v1.2.3


From 33f76148ced0e0618062e302d2a9614efdbd4a06 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 24 Aug 2009 09:42:01 -0700
Subject: rcu: Add CPU-offline processing for single-node configurations

Add preemptable-RCU plugin to handle the CPU-offline
processing.

An additional plugin is forthcoming to handle multinode RCU
trees, but this current plugin works for configurations up to
32 CPUs (64 CPUs for 64-bit kernels).

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: josht@linux.vnet.ibm.com
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
LKML-Reference: <12511321213336-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutree.c        |  2 ++
 kernel/rcutree_plugin.h | 24 ++++++++++++++++++++++++
 2 files changed, 26 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index cc025571407..000b0760987 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -84,6 +84,7 @@ extern long rcu_batches_completed_sched(void);
 static void cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp,
 			  struct rcu_node *rnp, unsigned long flags);
 static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags);
+static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp);
 static void __rcu_process_callbacks(struct rcu_state *rsp,
 				    struct rcu_data *rdp);
 static void __call_rcu(struct rcu_head *head,
@@ -920,6 +921,7 @@ static void rcu_offline_cpu(int cpu)
 {
 	__rcu_offline_cpu(cpu, &rcu_sched_state);
 	__rcu_offline_cpu(cpu, &rcu_bh_state);
+	rcu_preempt_offline_cpu(cpu);
 }
 
 #else /* #ifdef CONFIG_HOTPLUG_CPU */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index cd2ab67400c..201334cdc20 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -259,6 +259,18 @@ static int rcu_preempted_readers(struct rcu_node *rnp)
 	return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]);
 }
 
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Do CPU-offline processing for preemptable RCU.
+ */
+static void rcu_preempt_offline_cpu(int cpu)
+{
+	__rcu_offline_cpu(cpu, &rcu_preempt_state);
+}
+
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+
 /*
  * Check for a quiescent state from the current CPU.  When a task blocks,
  * the task is recorded in the corresponding CPU's rcu_node structure,
@@ -395,6 +407,18 @@ static int rcu_preempted_readers(struct rcu_node *rnp)
 	return 0;
 }
 
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Because preemptable RCU does not exist, it never needs CPU-offline
+ * processing.
+ */
+static void rcu_preempt_offline_cpu(int cpu)
+{
+}
+
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+
 /*
  * Because preemptable RCU does not exist, it never has any callbacks
  * to check.
-- 
cgit v1.2.3


From fa289beca9de9119c7760bd984f3640da21bc94c Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Tue, 25 Aug 2009 15:17:20 +1000
Subject: perf_counter: Start counting time enabled when group leader gets
 enabled

Currently, if a group is created where the group leader is
initially disabled but a non-leader member is initially
enabled, and then the leader is subsequently enabled some time
later, the time_enabled for the non-leader member will reflect
the whole time since it was created, not just the time since
the leader was enabled.

This is incorrect, because all of the members are effectively
disabled while the leader is disabled, since none of the
members can go on the PMU if the leader can't.

Thus we have to update the ->tstamp_enabled for all the enabled
group members when a group leader is enabled, so that the
time_enabled computation only counts the time since the leader
was enabled.

Similarly, when disabling a group leader we have to update the
time_enabled and time_running for all of the group members.

Also, in update_counter_times, we have to treat a counter whose
group leader is disabled as being disabled.

Reported-by: Stephane Eranian <eranian@googlemail.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: <stable@kernel.org>
LKML-Reference: <19091.29664.342227.445006@drongo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 43 ++++++++++++++++++++++++++++++-------------
 1 file changed, 30 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index f274e195988..06bf6a4f260 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -469,7 +469,8 @@ static void update_counter_times(struct perf_counter *counter)
 	struct perf_counter_context *ctx = counter->ctx;
 	u64 run_end;
 
-	if (counter->state < PERF_COUNTER_STATE_INACTIVE)
+	if (counter->state < PERF_COUNTER_STATE_INACTIVE ||
+	    counter->group_leader->state < PERF_COUNTER_STATE_INACTIVE)
 		return;
 
 	counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
@@ -518,7 +519,7 @@ static void __perf_counter_disable(void *info)
 	 */
 	if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
 		update_context_time(ctx);
-		update_counter_times(counter);
+		update_group_times(counter);
 		if (counter == counter->group_leader)
 			group_sched_out(counter, cpuctx, ctx);
 		else
@@ -573,7 +574,7 @@ static void perf_counter_disable(struct perf_counter *counter)
 	 * in, so we can change the state safely.
 	 */
 	if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
-		update_counter_times(counter);
+		update_group_times(counter);
 		counter->state = PERF_COUNTER_STATE_OFF;
 	}
 
@@ -850,6 +851,27 @@ retry:
 	spin_unlock_irq(&ctx->lock);
 }
 
+/*
+ * Put a counter into inactive state and update time fields.
+ * Enabling the leader of a group effectively enables all
+ * the group members that aren't explicitly disabled, so we
+ * have to update their ->tstamp_enabled also.
+ * Note: this works for group members as well as group leaders
+ * since the non-leader members' sibling_lists will be empty.
+ */
+static void __perf_counter_mark_enabled(struct perf_counter *counter,
+					struct perf_counter_context *ctx)
+{
+	struct perf_counter *sub;
+
+	counter->state = PERF_COUNTER_STATE_INACTIVE;
+	counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
+	list_for_each_entry(sub, &counter->sibling_list, list_entry)
+		if (sub->state >= PERF_COUNTER_STATE_INACTIVE)
+			sub->tstamp_enabled =
+				ctx->time - sub->total_time_enabled;
+}
+
 /*
  * Cross CPU call to enable a performance counter
  */
@@ -877,8 +899,7 @@ static void __perf_counter_enable(void *info)
 
 	if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
 		goto unlock;
-	counter->state = PERF_COUNTER_STATE_INACTIVE;
-	counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
+	__perf_counter_mark_enabled(counter, ctx);
 
 	/*
 	 * If the counter is in a group and isn't the group leader,
@@ -971,11 +992,9 @@ static void perf_counter_enable(struct perf_counter *counter)
 	 * Since we have the lock this context can't be scheduled
 	 * in, so we can change the state safely.
 	 */
-	if (counter->state == PERF_COUNTER_STATE_OFF) {
-		counter->state = PERF_COUNTER_STATE_INACTIVE;
-		counter->tstamp_enabled =
-			ctx->time - counter->total_time_enabled;
-	}
+	if (counter->state == PERF_COUNTER_STATE_OFF)
+		__perf_counter_mark_enabled(counter, ctx);
+
  out:
 	spin_unlock_irq(&ctx->lock);
 }
@@ -1479,9 +1498,7 @@ static void perf_counter_enable_on_exec(struct task_struct *task)
 		counter->attr.enable_on_exec = 0;
 		if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
 			continue;
-		counter->state = PERF_COUNTER_STATE_INACTIVE;
-		counter->tstamp_enabled =
-			ctx->time - counter->total_time_enabled;
+		__perf_counter_mark_enabled(counter, ctx);
 		enabled = 1;
 	}
 
-- 
cgit v1.2.3


From a4be7c2778d1fd8e0a8a5607bec91fa221ab2c91 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 19 Aug 2009 11:18:27 +0200
Subject: perf_counter: Allow sharing of output channels

Provide the ability to configure a counter to send its output
to another (already existing) counter's output stream.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: stephane eranian <eranian@googlemail.com>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20090819092023.980284148@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 83 +++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 80 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 06bf6a4f260..53abcbefa0b 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1692,6 +1692,11 @@ static void free_counter(struct perf_counter *counter)
 			atomic_dec(&nr_task_counters);
 	}
 
+	if (counter->output) {
+		fput(counter->output->filp);
+		counter->output = NULL;
+	}
+
 	if (counter->destroy)
 		counter->destroy(counter);
 
@@ -1977,6 +1982,8 @@ unlock:
 	return ret;
 }
 
+int perf_counter_set_output(struct perf_counter *counter, int output_fd);
+
 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	struct perf_counter *counter = file->private_data;
@@ -2000,6 +2007,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case PERF_COUNTER_IOC_PERIOD:
 		return perf_counter_period(counter, (u64 __user *)arg);
 
+	case PERF_COUNTER_IOC_SET_OUTPUT:
+		return perf_counter_set_output(counter, arg);
+
 	default:
 		return -ENOTTY;
 	}
@@ -2270,6 +2280,11 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 
 	WARN_ON_ONCE(counter->ctx->parent_ctx);
 	mutex_lock(&counter->mmap_mutex);
+	if (counter->output) {
+		ret = -EINVAL;
+		goto unlock;
+	}
+
 	if (atomic_inc_not_zero(&counter->mmap_count)) {
 		if (nr_pages != counter->data->nr_pages)
 			ret = -EINVAL;
@@ -2655,6 +2670,7 @@ static int perf_output_begin(struct perf_output_handle *handle,
 			     struct perf_counter *counter, unsigned int size,
 			     int nmi, int sample)
 {
+	struct perf_counter *output_counter;
 	struct perf_mmap_data *data;
 	unsigned int offset, head;
 	int have_lost;
@@ -2664,13 +2680,17 @@ static int perf_output_begin(struct perf_output_handle *handle,
 		u64			 lost;
 	} lost_event;
 
+	rcu_read_lock();
 	/*
 	 * For inherited counters we send all the output towards the parent.
 	 */
 	if (counter->parent)
 		counter = counter->parent;
 
-	rcu_read_lock();
+	output_counter = rcu_dereference(counter->output);
+	if (output_counter)
+		counter = output_counter;
+
 	data = rcu_dereference(counter->data);
 	if (!data)
 		goto out;
@@ -4218,6 +4238,57 @@ err_size:
 	goto out;
 }
 
+int perf_counter_set_output(struct perf_counter *counter, int output_fd)
+{
+	struct perf_counter *output_counter = NULL;
+	struct file *output_file = NULL;
+	struct perf_counter *old_output;
+	int fput_needed = 0;
+	int ret = -EINVAL;
+
+	if (!output_fd)
+		goto set;
+
+	output_file = fget_light(output_fd, &fput_needed);
+	if (!output_file)
+		return -EBADF;
+
+	if (output_file->f_op != &perf_fops)
+		goto out;
+
+	output_counter = output_file->private_data;
+
+	/* Don't chain output fds */
+	if (output_counter->output)
+		goto out;
+
+	/* Don't set an output fd when we already have an output channel */
+	if (counter->data)
+		goto out;
+
+	atomic_long_inc(&output_file->f_count);
+
+set:
+	mutex_lock(&counter->mmap_mutex);
+	old_output = counter->output;
+	rcu_assign_pointer(counter->output, output_counter);
+	mutex_unlock(&counter->mmap_mutex);
+
+	if (old_output) {
+		/*
+		 * we need to make sure no existing perf_output_*()
+		 * is still referencing this counter.
+		 */
+		synchronize_rcu();
+		fput(old_output->filp);
+	}
+
+	ret = 0;
+out:
+	fput_light(output_file, fput_needed);
+	return ret;
+}
+
 /**
  * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
  *
@@ -4240,7 +4311,7 @@ SYSCALL_DEFINE5(perf_counter_open,
 	int ret;
 
 	/* for future expandability... */
-	if (flags)
+	if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
 		return -EINVAL;
 
 	ret = perf_copy_attr(attr_uptr, &attr);
@@ -4268,7 +4339,7 @@ SYSCALL_DEFINE5(perf_counter_open,
 	 * Look up the group leader (we will attach this counter to it):
 	 */
 	group_leader = NULL;
-	if (group_fd != -1) {
+	if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
 		ret = -EINVAL;
 		group_file = fget_light(group_fd, &fput_needed);
 		if (!group_file)
@@ -4310,6 +4381,12 @@ SYSCALL_DEFINE5(perf_counter_open,
 	if (!counter_file)
 		goto err_free_put_context;
 
+	if (flags & PERF_FLAG_FD_OUTPUT) {
+		ret = perf_counter_set_output(counter, group_fd);
+		if (ret)
+			goto err_free_put_context;
+	}
+
 	counter->filp = counter_file;
 	WARN_ON_ONCE(ctx->parent_ctx);
 	mutex_lock(&ctx->mutex);
-- 
cgit v1.2.3


From c935a331c8f569c7903ed26a3994a70cbea1802e Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 25 Aug 2009 08:40:25 -0700
Subject: rcu: Add #ifdef to suppress __rcu_offline_cpu() warning in
 !HOTPLUG_CPU builds

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: josht@linux.vnet.ibm.com
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Paul Mundt <lethal@linux-sh.org>
LKML-Reference: <20090825154025.GD6616@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutree.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 000b0760987..fee6316a867 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -84,7 +84,9 @@ extern long rcu_batches_completed_sched(void);
 static void cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp,
 			  struct rcu_node *rnp, unsigned long flags);
 static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags);
+#ifdef CONFIG_HOTPLUG_CPU
 static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp);
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
 static void __rcu_process_callbacks(struct rcu_state *rsp,
 				    struct rcu_data *rdp);
 static void __call_rcu(struct rcu_head *head,
-- 
cgit v1.2.3


From 31ffe249e5426d2648d68568fa00a7b66666a5db Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 26 Aug 2009 16:32:37 -0700
Subject: net: Temporarily backout SKB sources tracer.

Steven Rostedt has suggested that Neil work with the tracing
folks, trying to use TRACE_EVENT as the mechanism for
implementation.  And if that doesn't workout we can investigate
other solutions such as that one which was tried here.

This reverts the following 2 commits:

5a165657bef7c47e5ff4cd138f7758ef6278e87b
("net: skb ftracer - Add config option to enable new ftracer (v3)")

9ec04da7489d2c9ae01ea6e9b5fa313ccf3d35fb
("net: skb ftracer - Add actual ftrace code to kernel (v3)")

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/trace/Kconfig             |  11 ---
 kernel/trace/Makefile            |   1 -
 kernel/trace/trace.h             |  19 -----
 kernel/trace/trace_skb_sources.c | 154 ---------------------------------------
 4 files changed, 185 deletions(-)
 delete mode 100644 kernel/trace/trace_skb_sources.c

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 9a2f19bf051..019f380fd76 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -234,17 +234,6 @@ config BOOT_TRACER
 	  You must pass in initcall_debug and ftrace=initcall to the kernel
 	  command line to enable this on bootup.
 
-config SKB_SOURCES_TRACER
-	bool "Trace skb source information"
-	depends on NET
-	select GENERIC_TRACER
-	help
-	   This tracer helps developers/sysadmins correlate skb allocation and
-	   consumption.  The idea being that some processes will primarily consume data
-	   that was allocated on certain numa nodes.  By being able to visualize which
-	   nodes the data was allocated on, a sysadmin or developer can optimize the
-	   scheduling of those processes to cut back on cross node chatter.
-
 config TRACE_BRANCH_PROFILING
 	bool
 	select GENERIC_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index ee5e5b1b928..844164dca90 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -49,7 +49,6 @@ obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
 ifeq ($(CONFIG_BLOCK),y)
 obj-$(CONFIG_EVENT_TRACING) += blktrace.o
 endif
-obj-$(CONFIG_SKB_SOURCES_TRACER) += trace_skb_sources.o
 obj-$(CONFIG_EVENT_TRACING) += trace_events.o
 obj-$(CONFIG_EVENT_TRACING) += trace_export.o
 obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 8a6281b2330..8b9f4f6e955 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -11,7 +11,6 @@
 #include <trace/boot.h>
 #include <linux/kmemtrace.h>
 #include <trace/power.h>
-#include <trace/events/skb.h>
 
 #include <linux/trace_seq.h>
 #include <linux/ftrace_event.h>
@@ -41,7 +40,6 @@ enum trace_type {
 	TRACE_KMEM_FREE,
 	TRACE_POWER,
 	TRACE_BLK,
-	TRACE_SKB_SOURCE,
 
 	__TRACE_LAST_TYPE,
 };
@@ -173,21 +171,6 @@ struct trace_power {
 	struct power_trace	state_data;
 };
 
-struct skb_record {
-	pid_t pid;		/* pid of the copying process */
-	int anid;		/* node where skb was allocated */
-	int cnid;		/* node to which skb was copied in userspace */
-	char ifname[IFNAMSIZ];	/* Name of the receiving interface */
-	int rx_queue;		/* The rx queue the skb was received on */
-	int ccpu;		/* Cpu the application got this frame from */
-	int len;		/* length of the data copied */
-};
-
-struct trace_skb_event {
-	struct trace_entry	ent;
-	struct skb_record	event_data;
-};
-
 enum kmemtrace_type_id {
 	KMEMTRACE_TYPE_KMALLOC = 0,	/* kmalloc() or kfree(). */
 	KMEMTRACE_TYPE_CACHE,		/* kmem_cache_*(). */
@@ -340,8 +323,6 @@ extern void __ftrace_bad_type(void);
 			  TRACE_SYSCALL_ENTER);				\
 		IF_ASSIGN(var, ent, struct syscall_trace_exit,		\
 			  TRACE_SYSCALL_EXIT);				\
-		IF_ASSIGN(var, ent, struct trace_skb_event,		\
-			  TRACE_SKB_SOURCE);				\
 		__ftrace_bad_type();					\
 	} while (0)
 
diff --git a/kernel/trace/trace_skb_sources.c b/kernel/trace/trace_skb_sources.c
deleted file mode 100644
index 40eb071afdb..00000000000
--- a/kernel/trace/trace_skb_sources.c
+++ /dev/null
@@ -1,154 +0,0 @@
-/*
- * ring buffer based tracer for analyzing per-socket skb sources
- *
- * Neil Horman <nhorman@tuxdriver.com>
- * Copyright (C) 2009
- *
- *
- */
-
-#include <linux/init.h>
-#include <linux/debugfs.h>
-#include <trace/events/skb.h>
-#include <linux/kallsyms.h>
-#include <linux/module.h>
-#include <linux/hardirq.h>
-#include <linux/netdevice.h>
-#include <net/sock.h>
-
-#include "trace.h"
-#include "trace_output.h"
-
-EXPORT_TRACEPOINT_SYMBOL_GPL(skb_copy_datagram_iovec);
-
-static struct trace_array *skb_trace;
-static int __read_mostly trace_skb_source_enabled;
-
-static void probe_skb_dequeue(const struct sk_buff *skb, int len)
-{
-	struct ring_buffer_event *event;
-	struct trace_skb_event *entry;
-	struct trace_array *tr = skb_trace;
-	struct net_device *dev;
-
-	if (!trace_skb_source_enabled)
-		return;
-
-	if (in_interrupt())
-		return;
-
-	event = trace_buffer_lock_reserve(tr, TRACE_SKB_SOURCE,
-					  sizeof(*entry), 0, 0);
-	if (!event)
-		return;
-	entry = ring_buffer_event_data(event);
-
-	entry->event_data.pid = current->pid;
-	entry->event_data.anid = page_to_nid(virt_to_page(skb->data));
-	entry->event_data.cnid = cpu_to_node(smp_processor_id());
-	entry->event_data.len = len;
-	entry->event_data.rx_queue = skb->queue_mapping;
-	entry->event_data.ccpu = smp_processor_id();
-
-	dev = dev_get_by_index(sock_net(skb->sk), skb->iif);
-	if (dev) {
-		memcpy(entry->event_data.ifname, dev->name, IFNAMSIZ);
-		dev_put(dev);
-	} else {
-		strcpy(entry->event_data.ifname, "Unknown");
-	}
-
-	trace_buffer_unlock_commit(tr, event, 0, 0);
-}
-
-static int tracing_skb_source_register(void)
-{
-	int ret;
-
-	ret = register_trace_skb_copy_datagram_iovec(probe_skb_dequeue);
-	if (ret)
-		pr_info("skb source trace: Couldn't activate dequeue tracepoint");
-
-	return ret;
-}
-
-static void start_skb_source_trace(struct trace_array *tr)
-{
-	trace_skb_source_enabled = 1;
-}
-
-static void stop_skb_source_trace(struct trace_array *tr)
-{
-	trace_skb_source_enabled = 0;
-}
-
-static void skb_source_trace_reset(struct trace_array *tr)
-{
-	trace_skb_source_enabled = 0;
-	unregister_trace_skb_copy_datagram_iovec(probe_skb_dequeue);
-}
-
-
-static int skb_source_trace_init(struct trace_array *tr)
-{
-	int cpu;
-	skb_trace = tr;
-
-	trace_skb_source_enabled = 1;
-	tracing_skb_source_register();
-
-	for_each_cpu(cpu, cpu_possible_mask)
-		tracing_reset(tr, cpu);
-	return 0;
-}
-
-static enum print_line_t skb_source_print_line(struct trace_iterator *iter)
-{
-	int ret = 0;
-	struct trace_entry *entry = iter->ent;
-	struct trace_skb_event *event;
-	struct skb_record *record;
-	struct trace_seq *s = &iter->seq;
-
-	trace_assign_type(event, entry);
-	record = &event->event_data;
-	if (entry->type != TRACE_SKB_SOURCE)
-		return TRACE_TYPE_UNHANDLED;
-
-	ret = trace_seq_printf(s, "	%d	%d	%d	%s	%d	%d	%d\n",
-			record->pid,
-			record->anid,
-			record->cnid,
-			record->ifname,
-			record->rx_queue,
-			record->ccpu,
-			record->len);
-
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	return TRACE_TYPE_HANDLED;
-}
-
-static void skb_source_print_header(struct seq_file *s)
-{
-	seq_puts(s, "#	PID	ANID	CNID	IFC	RXQ	CCPU	LEN\n");
-	seq_puts(s, "#	 |	 |	 |	 |	 |	 |	 |\n");
-}
-
-static struct tracer skb_source_tracer __read_mostly =
-{
-	.name		= "skb_sources",
-	.init		= skb_source_trace_init,
-	.start		= start_skb_source_trace,
-	.stop		= stop_skb_source_trace,
-	.reset		= skb_source_trace_reset,
-	.print_line	= skb_source_print_line,
-	.print_header	= skb_source_print_header,
-};
-
-static int init_skb_source_trace(void)
-{
-	return register_tracer(&skb_source_tracer);
-}
-device_initcall(init_skb_source_trace);
-- 
cgit v1.2.3


From 4dbc9ca219b0f294332e734528f7b82211700170 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 27 Aug 2009 09:38:49 +0200
Subject: genirq: Do not mask oneshot edge type interrupts

Masking oneshot edge type interrupts is wrong as we might lose an
interrupt which is issued when the threaded handler is handling the
device. We can keep the irq unmasked safely as with edge type
interrupts there is no danger of interrupt floods. If the threaded
handler has not yet finished then IRQTF_RUNTHREAD is set which will
keep the handler thread active.

Debugged and verified in preempt-rt.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/chip.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 5765aad9499..c1660194d11 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -548,13 +548,8 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
 	kstat_incr_irqs_this_cpu(irq, desc);
 
 	/* Start handling the irq */
-	if (unlikely(desc->status & IRQ_ONESHOT)) {
-		desc->status |= IRQ_MASKED;
-		mask_ack_irq(desc, irq);
-	} else {
-		if (desc->chip->ack)
-			desc->chip->ack(irq);
-	}
+	if (desc->chip->ack)
+		desc->chip->ack(irq);
 
 	/* Mark the IRQ currently in progress.*/
 	desc->status |= IRQ_INPROGRESS;
-- 
cgit v1.2.3


From 34d76c41554a05425613d16efebb3069c4c545f0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 27 Aug 2009 13:08:56 +0200
Subject: sched: Fix division by zero - really

When re-computing the shares for each task group's cpu
representation we need the ratio of weight on each cpu vs the
total weight of the sched domain.

Since load-balancing is loosely (read not) synchronized, the
weight of individual cpus can change between doing the sum and
calculating the ratio.

The previous patch dealt with only one of the race scenarios,
this patch side steps them all by saving a snapshot of all the
individual cpu weights, thereby always working on a consistent
set.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: torvalds@linux-foundation.org
Cc: jes@sgi.com
Cc: jens.axboe@oracle.com
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <1251371336.18584.77.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 50 +++++++++++++++++++++++++++++---------------------
 1 file changed, 29 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 8f8a98eab9d..523e20a6269 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1515,30 +1515,29 @@ static unsigned long cpu_avg_load_per_task(int cpu)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
+struct update_shares_data {
+	unsigned long rq_weight[NR_CPUS];
+};
+
+static DEFINE_PER_CPU(struct update_shares_data, update_shares_data);
+
 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
 
 /*
  * Calculate and set the cpu's group shares.
  */
-static void
-update_group_shares_cpu(struct task_group *tg, int cpu,
-			unsigned long sd_shares, unsigned long sd_rq_weight,
-			unsigned long sd_eff_weight)
+static void update_group_shares_cpu(struct task_group *tg, int cpu,
+				    unsigned long sd_shares,
+				    unsigned long sd_rq_weight,
+				    struct update_shares_data *usd)
 {
-	unsigned long rq_weight;
-	unsigned long shares;
+	unsigned long shares, rq_weight;
 	int boost = 0;
 
-	if (!tg->se[cpu])
-		return;
-
-	rq_weight = tg->cfs_rq[cpu]->rq_weight;
+	rq_weight = usd->rq_weight[cpu];
 	if (!rq_weight) {
 		boost = 1;
 		rq_weight = NICE_0_LOAD;
-		if (sd_rq_weight == sd_eff_weight)
-			sd_eff_weight += NICE_0_LOAD;
-		sd_rq_weight = sd_eff_weight;
 	}
 
 	/*
@@ -1555,6 +1554,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
 		unsigned long flags;
 
 		spin_lock_irqsave(&rq->lock, flags);
+		tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
 		tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
 		__set_se_shares(tg->se[cpu], shares);
 		spin_unlock_irqrestore(&rq->lock, flags);
@@ -1568,25 +1568,31 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
  */
 static int tg_shares_up(struct task_group *tg, void *data)
 {
-	unsigned long weight, rq_weight = 0, eff_weight = 0;
-	unsigned long shares = 0;
+	unsigned long weight, rq_weight = 0, shares = 0;
+	struct update_shares_data *usd;
 	struct sched_domain *sd = data;
+	unsigned long flags;
 	int i;
 
+	if (!tg->se[0])
+		return 0;
+
+	local_irq_save(flags);
+	usd = &__get_cpu_var(update_shares_data);
+
 	for_each_cpu(i, sched_domain_span(sd)) {
+		weight = tg->cfs_rq[i]->load.weight;
+		usd->rq_weight[i] = weight;
+
 		/*
 		 * If there are currently no tasks on the cpu pretend there
 		 * is one of average load so that when a new task gets to
 		 * run here it will not get delayed by group starvation.
 		 */
-		weight = tg->cfs_rq[i]->load.weight;
-		tg->cfs_rq[i]->rq_weight = weight;
-		rq_weight += weight;
-
 		if (!weight)
 			weight = NICE_0_LOAD;
 
-		eff_weight += weight;
+		rq_weight += weight;
 		shares += tg->cfs_rq[i]->shares;
 	}
 
@@ -1597,7 +1603,9 @@ static int tg_shares_up(struct task_group *tg, void *data)
 		shares = tg->shares;
 
 	for_each_cpu(i, sched_domain_span(sd))
-		update_group_shares_cpu(tg, i, shares, rq_weight, eff_weight);
+		update_group_shares_cpu(tg, i, shares, rq_weight, usd);
+
+	local_irq_restore(flags);
 
 	return 0;
 }
-- 
cgit v1.2.3


From 2bc481cf433879f0e6cdd4d899fc21ee05dcea23 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@vyatta.com>
Date: Fri, 28 Aug 2009 23:41:29 -0700
Subject: pktgen: spin using hrtimer

This changes how the pktgen thread spins/waits between
packets if delay is configured. It uses a high res timer to
wait for time to arrive.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/hrtimer.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 49da79ab848..05071bf6a37 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -485,6 +485,7 @@ void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id,
 	debug_object_init_on_stack(timer, &hrtimer_debug_descr);
 	__hrtimer_init(timer, clock_id, mode);
 }
+EXPORT_SYMBOL_GPL(hrtimer_init_on_stack);
 
 void destroy_hrtimer_on_stack(struct hrtimer *timer)
 {
@@ -1477,6 +1478,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
 	sl->timer.function = hrtimer_wakeup;
 	sl->task = task;
 }
+EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
 
 static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
 {
-- 
cgit v1.2.3


From a417887637e862b434b293404f2a31ad1f282a58 Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Sat, 29 Aug 2009 18:47:59 +0800
Subject: lockdep: Remove recursion stattistics

Since lockdep has introduced BFS to avoid recursion, statistics
for recursion does not make any sense now. So remove them.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Cc: a.p.zijlstra@chello.nl
LKML-Reference: <1251542879-5211-1-git-send-email-tom.leiming@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c      | 4 ----
 kernel/lockdep_proc.c | 8 --------
 2 files changed, 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 0843584aed5..f74d2d7aa60 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -399,7 +399,6 @@ unsigned int nr_hardirq_chains;
 unsigned int nr_softirq_chains;
 unsigned int nr_process_chains;
 unsigned int max_lockdep_depth;
-unsigned int max_recursion_depth;
 
 #ifdef CONFIG_DEBUG_LOCKDEP
 /*
@@ -429,11 +428,8 @@ atomic_t redundant_softirqs_on;
 atomic_t redundant_softirqs_off;
 atomic_t nr_unused_locks;
 atomic_t nr_cyclic_checks;
-atomic_t nr_cyclic_check_recursions;
 atomic_t nr_find_usage_forwards_checks;
-atomic_t nr_find_usage_forwards_recursions;
 atomic_t nr_find_usage_backwards_checks;
-atomic_t nr_find_usage_backwards_recursions;
 #endif
 
 /*
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 8dac8402e07..ec33c6ad58d 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -199,16 +199,10 @@ static void lockdep_stats_debug_show(struct seq_file *m)
 		debug_atomic_read(&chain_lookup_hits));
 	seq_printf(m, " cyclic checks:                 %11u\n",
 		debug_atomic_read(&nr_cyclic_checks));
-	seq_printf(m, " cyclic-check recursions:       %11u\n",
-		debug_atomic_read(&nr_cyclic_check_recursions));
 	seq_printf(m, " find-mask forwards checks:     %11u\n",
 		debug_atomic_read(&nr_find_usage_forwards_checks));
-	seq_printf(m, " find-mask forwards recursions: %11u\n",
-		debug_atomic_read(&nr_find_usage_forwards_recursions));
 	seq_printf(m, " find-mask backwards checks:    %11u\n",
 		debug_atomic_read(&nr_find_usage_backwards_checks));
-	seq_printf(m, " find-mask backwards recursions:%11u\n",
-		debug_atomic_read(&nr_find_usage_backwards_recursions));
 
 	seq_printf(m, " hardirq on events:             %11u\n", hi1);
 	seq_printf(m, " hardirq off events:            %11u\n", hi2);
@@ -350,8 +344,6 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
 			nr_unused);
 	seq_printf(m, " max locking depth:             %11u\n",
 			max_lockdep_depth);
-	seq_printf(m, " max recursion depth:           %11u\n",
-			max_recursion_depth);
 #ifdef CONFIG_PROVE_LOCKING
 	seq_printf(m, " max bfs queue depth:           %11u\n",
 			max_bfs_queue_depth);
-- 
cgit v1.2.3


From dd5d19bafd90d33043a4a14b2e2d98612caa293c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 27 Aug 2009 14:58:16 -0700
Subject: rcu: Create rcutree plugins to handle hotplug CPU for multi-level
 trees

When offlining CPUs from a multi-level tree, there is the
possibility of offlining the last CPU from a given node when
there are preempted RCU read-side critical sections that
started life on one of the CPUs on that node.

In this case, the corresponding tasks will be enqueued via the
task_struct's rcu_node_entry list_head onto one of the
rcu_node's blocked_tasks[] lists.  These tasks need to be moved
somewhere else so that they will prevent the current grace
period from ending. That somewhere is the root rcu_node.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: josht@linux.vnet.ibm.com
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
LKML-Reference: <20090827215816.GA30472@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutree.c        |  2 ++
 kernel/rcutree_plugin.h | 69 +++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 66 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index fee6316a867..d903e2f2b84 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -81,6 +81,7 @@ struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
 DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
 
 extern long rcu_batches_completed_sched(void);
+static struct rcu_node *rcu_get_root(struct rcu_state *rsp);
 static void cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp,
 			  struct rcu_node *rnp, unsigned long flags);
 static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags);
@@ -876,6 +877,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
 			spin_unlock(&rnp->lock); /* irqs remain disabled. */
 			break;
 		}
+		rcu_preempt_offline_tasks(rsp, rnp);
 		mask = rnp->grpmask;
 		spin_unlock(&rnp->lock);	/* irqs remain disabled. */
 		rnp = rnp->parent;
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 201334cdc20..04343bee646 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -92,7 +92,7 @@ static void rcu_preempt_qs(int cpu)
 		rnp = rdp->mynode;
 		spin_lock(&rnp->lock);
 		t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
-		t->rcu_blocked_cpu = cpu;
+		t->rcu_blocked_node = (void *)rnp;
 
 		/*
 		 * If this CPU has already checked in, then this task
@@ -170,12 +170,21 @@ static void rcu_read_unlock_special(struct task_struct *t)
 	if (special & RCU_READ_UNLOCK_BLOCKED) {
 		t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;
 
-		/* Remove this task from the list it blocked on. */
-		rnp = rcu_preempt_state.rda[t->rcu_blocked_cpu]->mynode;
-		spin_lock(&rnp->lock);
+		/*
+		 * Remove this task from the list it blocked on.  The
+		 * task can migrate while we acquire the lock, but at
+		 * most one time.  So at most two passes through loop.
+		 */
+		for (;;) {
+			rnp = (struct rcu_node *)t->rcu_blocked_node;
+			spin_lock(&rnp->lock);
+			if (rnp == (struct rcu_node *)t->rcu_blocked_node)
+				break;
+			spin_unlock(&rnp->lock);
+		}
 		empty = list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]);
 		list_del_init(&t->rcu_node_entry);
-		t->rcu_blocked_cpu = -1;
+		t->rcu_blocked_node = NULL;
 
 		/*
 		 * If this was the last task on the current list, and if
@@ -261,6 +270,47 @@ static int rcu_preempted_readers(struct rcu_node *rnp)
 
 #ifdef CONFIG_HOTPLUG_CPU
 
+/*
+ * Handle tasklist migration for case in which all CPUs covered by the
+ * specified rcu_node have gone offline.  Move them up to the root
+ * rcu_node.  The reason for not just moving them to the immediate
+ * parent is to remove the need for rcu_read_unlock_special() to
+ * make more than two attempts to acquire the target rcu_node's lock.
+ *
+ * The caller must hold rnp->lock with irqs disabled.
+ */
+static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
+				      struct rcu_node *rnp)
+{
+	int i;
+	struct list_head *lp;
+	struct list_head *lp_root;
+	struct rcu_node *rnp_root = rcu_get_root(rsp);
+	struct task_struct *tp;
+
+	if (rnp == rnp_root)
+		return;  /* Shouldn't happen: at least one CPU online. */
+
+	/*
+	 * Move tasks up to root rcu_node.  Rely on the fact that the
+	 * root rcu_node can be at most one ahead of the rest of the
+	 * rcu_nodes in terms of gp_num value.  This fact allows us to
+	 * move the blocked_tasks[] array directly, element by element.
+	 */
+	for (i = 0; i < 2; i++) {
+		lp = &rnp->blocked_tasks[i];
+		lp_root = &rnp_root->blocked_tasks[i];
+		while (!list_empty(lp)) {
+			tp = list_entry(lp->next, typeof(*tp), rcu_node_entry);
+			spin_lock(&rnp_root->lock); /* irqs already disabled */
+			list_del(&tp->rcu_node_entry);
+			tp->rcu_blocked_node = rnp_root;
+			list_add(&tp->rcu_node_entry, lp_root);
+			spin_unlock(&rnp_root->lock); /* irqs remain disabled */
+		}
+	}
+}
+
 /*
  * Do CPU-offline processing for preemptable RCU.
  */
@@ -409,6 +459,15 @@ static int rcu_preempted_readers(struct rcu_node *rnp)
 
 #ifdef CONFIG_HOTPLUG_CPU
 
+/*
+ * Because preemptable RCU does not exist, it never needs to migrate
+ * tasks that were blocked within RCU read-side critical sections.
+ */
+static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
+				      struct rcu_node *rnp)
+{
+}
+
 /*
  * Because preemptable RCU does not exist, it never needs CPU-offline
  * processing.
-- 
cgit v1.2.3


From 868489660dabc0c28087cca3dbc1adbbc398c6fe Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 27 Aug 2009 15:00:12 -0700
Subject: rcu: Changes from reviews: avoid casts, fix/add warnings, improve
 comments

Changes suggested by review comments from Josh Triplett and
Mathieu Desnoyers.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Josh Triplett <josh@joshtriplett.org>
Acked-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
LKML-Reference: <20090827220012.GA30525@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutree.c        | 13 ++++++-------
 kernel/rcutree.h        |  2 ++
 kernel/rcutree_plugin.h | 10 ++++++----
 3 files changed, 14 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d903e2f2b84..71bc79791cd 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -229,7 +229,6 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
 #endif /* #ifdef CONFIG_SMP */
 
 #ifdef CONFIG_NO_HZ
-static DEFINE_RATELIMIT_STATE(rcu_rs, 10 * HZ, 5);
 
 /**
  * rcu_enter_nohz - inform RCU that current CPU is entering nohz
@@ -249,7 +248,7 @@ void rcu_enter_nohz(void)
 	rdtp = &__get_cpu_var(rcu_dynticks);
 	rdtp->dynticks++;
 	rdtp->dynticks_nesting--;
-	WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs);
+	WARN_ON_ONCE(rdtp->dynticks & 0x1);
 	local_irq_restore(flags);
 }
 
@@ -268,7 +267,7 @@ void rcu_exit_nohz(void)
 	rdtp = &__get_cpu_var(rcu_dynticks);
 	rdtp->dynticks++;
 	rdtp->dynticks_nesting++;
-	WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs);
+	WARN_ON_ONCE(!(rdtp->dynticks & 0x1));
 	local_irq_restore(flags);
 	smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
 }
@@ -287,7 +286,7 @@ void rcu_nmi_enter(void)
 	if (rdtp->dynticks & 0x1)
 		return;
 	rdtp->dynticks_nmi++;
-	WARN_ON_RATELIMIT(!(rdtp->dynticks_nmi & 0x1), &rcu_rs);
+	WARN_ON_ONCE(!(rdtp->dynticks_nmi & 0x1));
 	smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
 }
 
@@ -306,7 +305,7 @@ void rcu_nmi_exit(void)
 		return;
 	smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
 	rdtp->dynticks_nmi++;
-	WARN_ON_RATELIMIT(rdtp->dynticks_nmi & 0x1, &rcu_rs);
+	WARN_ON_ONCE(rdtp->dynticks_nmi & 0x1);
 }
 
 /**
@@ -322,7 +321,7 @@ void rcu_irq_enter(void)
 	if (rdtp->dynticks_nesting++)
 		return;
 	rdtp->dynticks++;
-	WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs);
+	WARN_ON_ONCE(!(rdtp->dynticks & 0x1));
 	smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
 }
 
@@ -341,7 +340,7 @@ void rcu_irq_exit(void)
 		return;
 	smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
 	rdtp->dynticks++;
-	WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs);
+	WARN_ON_ONCE(rdtp->dynticks & 0x1);
 
 	/* If the interrupt queued a callback, get out of dyntick mode. */
 	if (__get_cpu_var(rcu_sched_data).nxtlist ||
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index ca560364d8c..bf8a6f9f134 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -81,6 +81,8 @@ struct rcu_dynticks {
 struct rcu_node {
 	spinlock_t lock;
 	long	gpnum;		/* Current grace period for this node. */
+				/*  This will either be equal to or one */
+				/*  behind the root rcu_node's gpnum. */
 	unsigned long qsmask;	/* CPUs or groups that need to switch in */
 				/*  order for current grace period to proceed.*/
 	unsigned long qsmaskinit;
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 04343bee646..47789369ea5 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -92,7 +92,7 @@ static void rcu_preempt_qs(int cpu)
 		rnp = rdp->mynode;
 		spin_lock(&rnp->lock);
 		t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
-		t->rcu_blocked_node = (void *)rnp;
+		t->rcu_blocked_node = rnp;
 
 		/*
 		 * If this CPU has already checked in, then this task
@@ -176,9 +176,9 @@ static void rcu_read_unlock_special(struct task_struct *t)
 		 * most one time.  So at most two passes through loop.
 		 */
 		for (;;) {
-			rnp = (struct rcu_node *)t->rcu_blocked_node;
+			rnp = t->rcu_blocked_node;
 			spin_lock(&rnp->lock);
-			if (rnp == (struct rcu_node *)t->rcu_blocked_node)
+			if (rnp == t->rcu_blocked_node)
 				break;
 			spin_unlock(&rnp->lock);
 		}
@@ -288,8 +288,10 @@ static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
 	struct rcu_node *rnp_root = rcu_get_root(rsp);
 	struct task_struct *tp;
 
-	if (rnp == rnp_root)
+	if (rnp == rnp_root) {
+		WARN_ONCE(1, "Last CPU thought to be offlined?");
 		return;  /* Shouldn't happen: at least one CPU online. */
+	}
 
 	/*
 	 * Move tasks up to root rcu_node.  Rely on the fact that the
-- 
cgit v1.2.3


From 84e9dabf6e6a78928c6a1a8ba235c9fb0908d0f8 Mon Sep 17 00:00:00 2001
From: Anirban Sinha <ASinha@zeugmasystems.com>
Date: Fri, 28 Aug 2009 22:40:43 -0700
Subject: sched: Rename init_cfs_rq => init_tg_cfs_rq

... so that it does not share a common name with a function
within the same scope.

Signed-off-by: Anirban Sinha <asinha@zeugmasystems.com>
LKML-Reference: <DDFD17CC94A9BD49A82147DDF7D545C501EA98A6@exchange.ZeugmaSystems.local>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 523e20a6269..6244d24cafc 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -309,8 +309,8 @@ void set_tg_uid(struct user_struct *user)
 
 /*
  * Root task group.
- * 	Every UID task group (including init_task_group aka UID-0) will
- * 	be a child to this group.
+ *	Every UID task group (including init_task_group aka UID-0) will
+ *	be a child to this group.
  */
 struct task_group root_task_group;
 
@@ -318,7 +318,7 @@ struct task_group root_task_group;
 /* Default task group's sched entity on each cpu */
 static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
 /* Default task group's cfs_rq on each cpu */
-static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
+static DEFINE_PER_CPU(struct cfs_rq, init_tg_cfs_rq) ____cacheline_aligned_in_smp;
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -9400,11 +9400,11 @@ void __init sched_init(void)
 		 * system cpu resource, based on the weight assigned to root
 		 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
 		 * by letting tasks of init_task_group sit in a separate cfs_rq
-		 * (init_cfs_rq) and having one entity represent this group of
+		 * (init_tg_cfs_rq) and having one entity represent this group of
 		 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
 		 */
 		init_tg_cfs_entry(&init_task_group,
-				&per_cpu(init_cfs_rq, i),
+				&per_cpu(init_tg_cfs_rq, i),
 				&per_cpu(init_sched_entity, i), i, 1,
 				root_task_group.se[i]);
 
-- 
cgit v1.2.3


From 372e24b0cb764ec55b4cf3408a95ae40a29e5b96 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 26 Aug 2009 16:20:13 -0700
Subject: irq: Make sure irq_desc for legacy irq get correct node setting

when there is no ram on node 0.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jesse Barnes <jbarnes@virtuousgeek.org>
LKML-Reference: <4A95C32D.5040605@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/irq/handle.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 065205bdd92..a81cf80554d 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -161,7 +161,7 @@ int __init early_irq_init(void)
 
 	desc = irq_desc_legacy;
 	legacy_count = ARRAY_SIZE(irq_desc_legacy);
- 	node = first_online_node;
+	node = first_online_node;
 
 	/* allocate irq_desc_ptrs array based on nr_irqs */
 	irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT);
@@ -172,6 +172,9 @@ int __init early_irq_init(void)
 
 	for (i = 0; i < legacy_count; i++) {
 		desc[i].irq = i;
+#ifdef CONFIG_SMP
+		desc[i].node = node;
+#endif
 		desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
 		lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
 		alloc_desc_masks(&desc[i], node, true);
-- 
cgit v1.2.3


From 69d0ee7377eef808e34ba5542b554ec97244b871 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Mon, 31 Aug 2009 14:43:36 +0200
Subject: locking: Move spinlock function bodies to header file

Move spinlock function bodies to header file by creating a
static inline version of each variant. Use the inline version
on the out-of-line code.

This shouldn't make any difference besides that the spinlock
code can now be used to generate inlined spinlock code.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Horst Hartmann <horsth@linux.vnet.ibm.com>
Cc: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: David Miller <davem@davemloft.net>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Roman Zippel <zippel@linux-m68k.org>
Cc: <linux-arch@vger.kernel.org>
LKML-Reference: <20090831124417.859022429@de.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/spinlock.c | 174 +++++++++---------------------------------------------
 1 file changed, 28 insertions(+), 146 deletions(-)

(limited to 'kernel')

diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 7932653c4eb..2c000f5c070 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -23,40 +23,19 @@
 
 int __lockfunc _spin_trylock(spinlock_t *lock)
 {
-	preempt_disable();
-	if (_raw_spin_trylock(lock)) {
-		spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
-		return 1;
-	}
-	
-	preempt_enable();
-	return 0;
+	return __spin_trylock(lock);
 }
 EXPORT_SYMBOL(_spin_trylock);
 
 int __lockfunc _read_trylock(rwlock_t *lock)
 {
-	preempt_disable();
-	if (_raw_read_trylock(lock)) {
-		rwlock_acquire_read(&lock->dep_map, 0, 1, _RET_IP_);
-		return 1;
-	}
-
-	preempt_enable();
-	return 0;
+	return __read_trylock(lock);
 }
 EXPORT_SYMBOL(_read_trylock);
 
 int __lockfunc _write_trylock(rwlock_t *lock)
 {
-	preempt_disable();
-	if (_raw_write_trylock(lock)) {
-		rwlock_acquire(&lock->dep_map, 0, 1, _RET_IP_);
-		return 1;
-	}
-
-	preempt_enable();
-	return 0;
+	return __write_trylock(lock);
 }
 EXPORT_SYMBOL(_write_trylock);
 
@@ -69,129 +48,74 @@ EXPORT_SYMBOL(_write_trylock);
 
 void __lockfunc _read_lock(rwlock_t *lock)
 {
-	preempt_disable();
-	rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
-	LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
+	__read_lock(lock);
 }
 EXPORT_SYMBOL(_read_lock);
 
 unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
 {
-	unsigned long flags;
-
-	local_irq_save(flags);
-	preempt_disable();
-	spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
-	/*
-	 * On lockdep we dont want the hand-coded irq-enable of
-	 * _raw_spin_lock_flags() code, because lockdep assumes
-	 * that interrupts are not re-enabled during lock-acquire:
-	 */
-#ifdef CONFIG_LOCKDEP
-	LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
-#else
-	_raw_spin_lock_flags(lock, &flags);
-#endif
-	return flags;
+	return __spin_lock_irqsave(lock);
 }
 EXPORT_SYMBOL(_spin_lock_irqsave);
 
 void __lockfunc _spin_lock_irq(spinlock_t *lock)
 {
-	local_irq_disable();
-	preempt_disable();
-	spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
-	LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
+	__spin_lock_irq(lock);
 }
 EXPORT_SYMBOL(_spin_lock_irq);
 
 void __lockfunc _spin_lock_bh(spinlock_t *lock)
 {
-	local_bh_disable();
-	preempt_disable();
-	spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
-	LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
+	__spin_lock_bh(lock);
 }
 EXPORT_SYMBOL(_spin_lock_bh);
 
 unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
 {
-	unsigned long flags;
-
-	local_irq_save(flags);
-	preempt_disable();
-	rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
-	LOCK_CONTENDED_FLAGS(lock, _raw_read_trylock, _raw_read_lock,
-			     _raw_read_lock_flags, &flags);
-	return flags;
+	return __read_lock_irqsave(lock);
 }
 EXPORT_SYMBOL(_read_lock_irqsave);
 
 void __lockfunc _read_lock_irq(rwlock_t *lock)
 {
-	local_irq_disable();
-	preempt_disable();
-	rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
-	LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
+	__read_lock_irq(lock);
 }
 EXPORT_SYMBOL(_read_lock_irq);
 
 void __lockfunc _read_lock_bh(rwlock_t *lock)
 {
-	local_bh_disable();
-	preempt_disable();
-	rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
-	LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
+	__read_lock_bh(lock);
 }
 EXPORT_SYMBOL(_read_lock_bh);
 
 unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
 {
-	unsigned long flags;
-
-	local_irq_save(flags);
-	preempt_disable();
-	rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
-	LOCK_CONTENDED_FLAGS(lock, _raw_write_trylock, _raw_write_lock,
-			     _raw_write_lock_flags, &flags);
-	return flags;
+	return __write_lock_irqsave(lock);
 }
 EXPORT_SYMBOL(_write_lock_irqsave);
 
 void __lockfunc _write_lock_irq(rwlock_t *lock)
 {
-	local_irq_disable();
-	preempt_disable();
-	rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
-	LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
+	__write_lock_irq(lock);
 }
 EXPORT_SYMBOL(_write_lock_irq);
 
 void __lockfunc _write_lock_bh(rwlock_t *lock)
 {
-	local_bh_disable();
-	preempt_disable();
-	rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
-	LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
+	__write_lock_bh(lock);
 }
 EXPORT_SYMBOL(_write_lock_bh);
 
 void __lockfunc _spin_lock(spinlock_t *lock)
 {
-	preempt_disable();
-	spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
-	LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
+	__spin_lock(lock);
 }
-
 EXPORT_SYMBOL(_spin_lock);
 
 void __lockfunc _write_lock(rwlock_t *lock)
 {
-	preempt_disable();
-	rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
-	LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
+	__write_lock(lock);
 }
-
 EXPORT_SYMBOL(_write_lock);
 
 #else /* CONFIG_PREEMPT: */
@@ -320,121 +244,79 @@ EXPORT_SYMBOL(_spin_lock_nest_lock);
 
 void __lockfunc _spin_unlock(spinlock_t *lock)
 {
-	spin_release(&lock->dep_map, 1, _RET_IP_);
-	_raw_spin_unlock(lock);
-	preempt_enable();
+	__spin_unlock(lock);
 }
 EXPORT_SYMBOL(_spin_unlock);
 
 void __lockfunc _write_unlock(rwlock_t *lock)
 {
-	rwlock_release(&lock->dep_map, 1, _RET_IP_);
-	_raw_write_unlock(lock);
-	preempt_enable();
+	__write_unlock(lock);
 }
 EXPORT_SYMBOL(_write_unlock);
 
 void __lockfunc _read_unlock(rwlock_t *lock)
 {
-	rwlock_release(&lock->dep_map, 1, _RET_IP_);
-	_raw_read_unlock(lock);
-	preempt_enable();
+	__read_unlock(lock);
 }
 EXPORT_SYMBOL(_read_unlock);
 
 void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
 {
-	spin_release(&lock->dep_map, 1, _RET_IP_);
-	_raw_spin_unlock(lock);
-	local_irq_restore(flags);
-	preempt_enable();
+	__spin_unlock_irqrestore(lock, flags);
 }
 EXPORT_SYMBOL(_spin_unlock_irqrestore);
 
 void __lockfunc _spin_unlock_irq(spinlock_t *lock)
 {
-	spin_release(&lock->dep_map, 1, _RET_IP_);
-	_raw_spin_unlock(lock);
-	local_irq_enable();
-	preempt_enable();
+	__spin_unlock_irq(lock);
 }
 EXPORT_SYMBOL(_spin_unlock_irq);
 
 void __lockfunc _spin_unlock_bh(spinlock_t *lock)
 {
-	spin_release(&lock->dep_map, 1, _RET_IP_);
-	_raw_spin_unlock(lock);
-	preempt_enable_no_resched();
-	local_bh_enable_ip((unsigned long)__builtin_return_address(0));
+	__spin_unlock_bh(lock);
 }
 EXPORT_SYMBOL(_spin_unlock_bh);
 
 void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
 {
-	rwlock_release(&lock->dep_map, 1, _RET_IP_);
-	_raw_read_unlock(lock);
-	local_irq_restore(flags);
-	preempt_enable();
+	__read_unlock_irqrestore(lock, flags);
 }
 EXPORT_SYMBOL(_read_unlock_irqrestore);
 
 void __lockfunc _read_unlock_irq(rwlock_t *lock)
 {
-	rwlock_release(&lock->dep_map, 1, _RET_IP_);
-	_raw_read_unlock(lock);
-	local_irq_enable();
-	preempt_enable();
+	__read_unlock_irq(lock);
 }
 EXPORT_SYMBOL(_read_unlock_irq);
 
 void __lockfunc _read_unlock_bh(rwlock_t *lock)
 {
-	rwlock_release(&lock->dep_map, 1, _RET_IP_);
-	_raw_read_unlock(lock);
-	preempt_enable_no_resched();
-	local_bh_enable_ip((unsigned long)__builtin_return_address(0));
+	__read_unlock_bh(lock);
 }
 EXPORT_SYMBOL(_read_unlock_bh);
 
 void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
 {
-	rwlock_release(&lock->dep_map, 1, _RET_IP_);
-	_raw_write_unlock(lock);
-	local_irq_restore(flags);
-	preempt_enable();
+	__write_unlock_irqrestore(lock, flags);
 }
 EXPORT_SYMBOL(_write_unlock_irqrestore);
 
 void __lockfunc _write_unlock_irq(rwlock_t *lock)
 {
-	rwlock_release(&lock->dep_map, 1, _RET_IP_);
-	_raw_write_unlock(lock);
-	local_irq_enable();
-	preempt_enable();
+	__write_unlock_irq(lock);
 }
 EXPORT_SYMBOL(_write_unlock_irq);
 
 void __lockfunc _write_unlock_bh(rwlock_t *lock)
 {
-	rwlock_release(&lock->dep_map, 1, _RET_IP_);
-	_raw_write_unlock(lock);
-	preempt_enable_no_resched();
-	local_bh_enable_ip((unsigned long)__builtin_return_address(0));
+	__write_unlock_bh(lock);
 }
 EXPORT_SYMBOL(_write_unlock_bh);
 
 int __lockfunc _spin_trylock_bh(spinlock_t *lock)
 {
-	local_bh_disable();
-	preempt_disable();
-	if (_raw_spin_trylock(lock)) {
-		spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
-		return 1;
-	}
-
-	preempt_enable_no_resched();
-	local_bh_enable_ip((unsigned long)__builtin_return_address(0));
-	return 0;
+	return __spin_trylock_bh(lock);
 }
 EXPORT_SYMBOL(_spin_trylock_bh);
 
-- 
cgit v1.2.3


From 892a7c67c12da63fa4b51728bbe5b982356a090a Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Mon, 31 Aug 2009 14:43:37 +0200
Subject: locking: Allow arch-inlined spinlocks

This allows an architecture to specify per lock variant if the
locking code should be kept out-of-line or inlined.

If an architecure wants out-of-line locking code no change is
needed. To force inlining of e.g. spin_lock() the line:

  #define __always_inline__spin_lock

needs to be added to arch/<...>/include/asm/spinlock.h

If CONFIG_DEBUG_SPINLOCK or CONFIG_GENERIC_LOCKBREAK are
defined the per architecture defines are (partly) ignored and
still out-of-line spinlock code will be generated.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Horst Hartmann <horsth@linux.vnet.ibm.com>
Cc: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: David Miller <davem@davemloft.net>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Roman Zippel <zippel@linux-m68k.org>
Cc: <linux-arch@vger.kernel.org>
LKML-Reference: <20090831124418.375299024@de.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/spinlock.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

(limited to 'kernel')

diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 2c000f5c070..5ddab730cb2 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -21,23 +21,29 @@
 #include <linux/debug_locks.h>
 #include <linux/module.h>
 
+#ifndef _spin_trylock
 int __lockfunc _spin_trylock(spinlock_t *lock)
 {
 	return __spin_trylock(lock);
 }
 EXPORT_SYMBOL(_spin_trylock);
+#endif
 
+#ifndef _read_trylock
 int __lockfunc _read_trylock(rwlock_t *lock)
 {
 	return __read_trylock(lock);
 }
 EXPORT_SYMBOL(_read_trylock);
+#endif
 
+#ifndef _write_trylock
 int __lockfunc _write_trylock(rwlock_t *lock)
 {
 	return __write_trylock(lock);
 }
 EXPORT_SYMBOL(_write_trylock);
+#endif
 
 /*
  * If lockdep is enabled then we use the non-preemption spin-ops
@@ -46,77 +52,101 @@ EXPORT_SYMBOL(_write_trylock);
  */
 #if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
 
+#ifndef _read_lock
 void __lockfunc _read_lock(rwlock_t *lock)
 {
 	__read_lock(lock);
 }
 EXPORT_SYMBOL(_read_lock);
+#endif
 
+#ifndef _spin_lock_irqsave
 unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
 {
 	return __spin_lock_irqsave(lock);
 }
 EXPORT_SYMBOL(_spin_lock_irqsave);
+#endif
 
+#ifndef _spin_lock_irq
 void __lockfunc _spin_lock_irq(spinlock_t *lock)
 {
 	__spin_lock_irq(lock);
 }
 EXPORT_SYMBOL(_spin_lock_irq);
+#endif
 
+#ifndef _spin_lock_bh
 void __lockfunc _spin_lock_bh(spinlock_t *lock)
 {
 	__spin_lock_bh(lock);
 }
 EXPORT_SYMBOL(_spin_lock_bh);
+#endif
 
+#ifndef _read_lock_irqsave
 unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
 {
 	return __read_lock_irqsave(lock);
 }
 EXPORT_SYMBOL(_read_lock_irqsave);
+#endif
 
+#ifndef _read_lock_irq
 void __lockfunc _read_lock_irq(rwlock_t *lock)
 {
 	__read_lock_irq(lock);
 }
 EXPORT_SYMBOL(_read_lock_irq);
+#endif
 
+#ifndef _read_lock_bh
 void __lockfunc _read_lock_bh(rwlock_t *lock)
 {
 	__read_lock_bh(lock);
 }
 EXPORT_SYMBOL(_read_lock_bh);
+#endif
 
+#ifndef _write_lock_irqsave
 unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
 {
 	return __write_lock_irqsave(lock);
 }
 EXPORT_SYMBOL(_write_lock_irqsave);
+#endif
 
+#ifndef _write_lock_irq
 void __lockfunc _write_lock_irq(rwlock_t *lock)
 {
 	__write_lock_irq(lock);
 }
 EXPORT_SYMBOL(_write_lock_irq);
+#endif
 
+#ifndef _write_lock_bh
 void __lockfunc _write_lock_bh(rwlock_t *lock)
 {
 	__write_lock_bh(lock);
 }
 EXPORT_SYMBOL(_write_lock_bh);
+#endif
 
+#ifndef _spin_lock
 void __lockfunc _spin_lock(spinlock_t *lock)
 {
 	__spin_lock(lock);
 }
 EXPORT_SYMBOL(_spin_lock);
+#endif
 
+#ifndef _write_lock
 void __lockfunc _write_lock(rwlock_t *lock)
 {
 	__write_lock(lock);
 }
 EXPORT_SYMBOL(_write_lock);
+#endif
 
 #else /* CONFIG_PREEMPT: */
 
@@ -242,83 +272,109 @@ EXPORT_SYMBOL(_spin_lock_nest_lock);
 
 #endif
 
+#ifndef _spin_unlock
 void __lockfunc _spin_unlock(spinlock_t *lock)
 {
 	__spin_unlock(lock);
 }
 EXPORT_SYMBOL(_spin_unlock);
+#endif
 
+#ifndef _write_unlock
 void __lockfunc _write_unlock(rwlock_t *lock)
 {
 	__write_unlock(lock);
 }
 EXPORT_SYMBOL(_write_unlock);
+#endif
 
+#ifndef _read_unlock
 void __lockfunc _read_unlock(rwlock_t *lock)
 {
 	__read_unlock(lock);
 }
 EXPORT_SYMBOL(_read_unlock);
+#endif
 
+#ifndef _spin_unlock_irqrestore
 void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
 {
 	__spin_unlock_irqrestore(lock, flags);
 }
 EXPORT_SYMBOL(_spin_unlock_irqrestore);
+#endif
 
+#ifndef _spin_unlock_irq
 void __lockfunc _spin_unlock_irq(spinlock_t *lock)
 {
 	__spin_unlock_irq(lock);
 }
 EXPORT_SYMBOL(_spin_unlock_irq);
+#endif
 
+#ifndef _spin_unlock_bh
 void __lockfunc _spin_unlock_bh(spinlock_t *lock)
 {
 	__spin_unlock_bh(lock);
 }
 EXPORT_SYMBOL(_spin_unlock_bh);
+#endif
 
+#ifndef _read_unlock_irqrestore
 void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
 {
 	__read_unlock_irqrestore(lock, flags);
 }
 EXPORT_SYMBOL(_read_unlock_irqrestore);
+#endif
 
+#ifndef _read_unlock_irq
 void __lockfunc _read_unlock_irq(rwlock_t *lock)
 {
 	__read_unlock_irq(lock);
 }
 EXPORT_SYMBOL(_read_unlock_irq);
+#endif
 
+#ifndef _read_unlock_bh
 void __lockfunc _read_unlock_bh(rwlock_t *lock)
 {
 	__read_unlock_bh(lock);
 }
 EXPORT_SYMBOL(_read_unlock_bh);
+#endif
 
+#ifndef _write_unlock_irqrestore
 void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
 {
 	__write_unlock_irqrestore(lock, flags);
 }
 EXPORT_SYMBOL(_write_unlock_irqrestore);
+#endif
 
+#ifndef _write_unlock_irq
 void __lockfunc _write_unlock_irq(rwlock_t *lock)
 {
 	__write_unlock_irq(lock);
 }
 EXPORT_SYMBOL(_write_unlock_irq);
+#endif
 
+#ifndef _write_unlock_bh
 void __lockfunc _write_unlock_bh(rwlock_t *lock)
 {
 	__write_unlock_bh(lock);
 }
 EXPORT_SYMBOL(_write_unlock_bh);
+#endif
 
+#ifndef _spin_trylock_bh
 int __lockfunc _spin_trylock_bh(spinlock_t *lock)
 {
 	return __spin_trylock_bh(lock);
 }
 EXPORT_SYMBOL(_spin_trylock_bh);
+#endif
 
 notrace int in_lock_functions(unsigned long addr)
 {
-- 
cgit v1.2.3


From 8f0dfc34e9b323a028c2ec41abb7e9de477b7a94 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Mon, 20 Jul 2009 11:26:58 -0700
Subject: sched: Provide iowait counters

For counting how long an application has been waiting for
(disk) IO, there currently is only the HZ sample driven
information available, while for all other counters in this
class, a high resolution version is available via
CONFIG_SCHEDSTATS.

In order to make an improved bootchart tool possible, we also
need a higher resolution version of the iowait time.

This patch below adds this scheduler statistic to the kernel.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4A64B813.1080506@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c       | 4 ++++
 kernel/sched_debug.c | 4 ++++
 kernel/sched_fair.c  | 5 +++++
 3 files changed, 13 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 6244d24cafc..38d05a89e0f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6754,7 +6754,9 @@ void __sched io_schedule(void)
 
 	delayacct_blkio_start();
 	atomic_inc(&rq->nr_iowait);
+	current->in_iowait = 1;
 	schedule();
+	current->in_iowait = 0;
 	atomic_dec(&rq->nr_iowait);
 	delayacct_blkio_end();
 }
@@ -6767,7 +6769,9 @@ long __sched io_schedule_timeout(long timeout)
 
 	delayacct_blkio_start();
 	atomic_inc(&rq->nr_iowait);
+	current->in_iowait = 1;
 	ret = schedule_timeout(timeout);
+	current->in_iowait = 0;
 	atomic_dec(&rq->nr_iowait);
 	delayacct_blkio_end();
 	return ret;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 70c7e0b7994..5ddbd089126 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -409,6 +409,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 	PN(se.wait_max);
 	PN(se.wait_sum);
 	P(se.wait_count);
+	PN(se.iowait_sum);
+	P(se.iowait_count);
 	P(sched_info.bkl_count);
 	P(se.nr_migrations);
 	P(se.nr_migrations_cold);
@@ -479,6 +481,8 @@ void proc_sched_set_task(struct task_struct *p)
 	p->se.wait_max				= 0;
 	p->se.wait_sum				= 0;
 	p->se.wait_count			= 0;
+	p->se.iowait_sum			= 0;
+	p->se.iowait_count			= 0;
 	p->se.sleep_max				= 0;
 	p->se.sum_sleep_runtime			= 0;
 	p->se.block_max				= 0;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 342000b31ad..471fa281f5e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -652,6 +652,11 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		se->sum_sleep_runtime += delta;
 
 		if (tsk) {
+			if (tsk->in_iowait) {
+				se->iowait_sum += delta;
+				se->iowait_count++;
+			}
+
 			/*
 			 * Blocking time is in units of nanosecs, so shift by
 			 * 20 to get a milliseconds-range estimation of the
-- 
cgit v1.2.3


From 768d0c27226e6587cad2fcf543f9711da3f3774e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 23 Jul 2009 20:13:26 +0200
Subject: sched: Add wait, sleep and iowait accounting tracepoints

Add 3 schedstat tracepoints to help account for wait-time,
sleep-time and iowait-time.

They can also be used as a perf-counter source to profile tasks
on these clocks.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
LKML-Reference: <new-submission>
[ build fix for the !CONFIG_SCHEDSTATS case ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 471fa281f5e..2ff850f90d1 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -546,6 +546,13 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	schedstat_set(se->wait_sum, se->wait_sum +
 			rq_of(cfs_rq)->clock - se->wait_start);
 	schedstat_set(se->wait_start, 0);
+
+#ifdef CONFIG_SCHEDSTATS
+	if (entity_is_task(se)) {
+		trace_sched_stat_wait(task_of(se),
+			rq_of(cfs_rq)->clock - se->wait_start);
+	}
+#endif
 }
 
 static inline void
@@ -636,8 +643,10 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		se->sleep_start = 0;
 		se->sum_sleep_runtime += delta;
 
-		if (tsk)
+		if (tsk) {
 			account_scheduler_latency(tsk, delta >> 10, 1);
+			trace_sched_stat_sleep(tsk, delta);
+		}
 	}
 	if (se->block_start) {
 		u64 delta = rq_of(cfs_rq)->clock - se->block_start;
@@ -655,6 +664,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 			if (tsk->in_iowait) {
 				se->iowait_sum += delta;
 				se->iowait_count++;
+				trace_sched_stat_iowait(tsk, delta);
 			}
 
 			/*
-- 
cgit v1.2.3


From e0e817392b9acf2c98d3be80c233dddb1b52003d Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 2 Sep 2009 09:13:40 +0100
Subject: CRED: Add some configurable debugging [try #6]

Add a config option (CONFIG_DEBUG_CREDENTIALS) to turn on some debug checking
for credential management.  The additional code keeps track of the number of
pointers from task_structs to any given cred struct, and checks to see that
this number never exceeds the usage count of the cred struct (which includes
all references, not just those from task_structs).

Furthermore, if SELinux is enabled, the code also checks that the security
pointer in the cred struct is never seen to be invalid.

This attempts to catch the bug whereby inode_has_perm() faults in an nfsd
kernel thread on seeing cred->security be a NULL pointer (it appears that the
credential struct has been previously released):

	http://www.kerneloops.org/oops.php?number=252883

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/cred.c | 250 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 kernel/exit.c |   4 +
 kernel/fork.c |   6 +-
 kernel/kmod.c |   1 +
 4 files changed, 251 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cred.c b/kernel/cred.c
index 1bb4d7e5d61..24dd2f5104b 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -18,6 +18,18 @@
 #include <linux/cn_proc.h>
 #include "cred-internals.h"
 
+#if 0
+#define kdebug(FMT, ...) \
+	printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
+#else
+static inline __attribute__((format(printf, 1, 2)))
+void no_printk(const char *fmt, ...)
+{
+}
+#define kdebug(FMT, ...) \
+	no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
+#endif
+
 static struct kmem_cache *cred_jar;
 
 /*
@@ -36,6 +48,10 @@ static struct thread_group_cred init_tgcred = {
  */
 struct cred init_cred = {
 	.usage			= ATOMIC_INIT(4),
+#ifdef CONFIG_DEBUG_CREDENTIALS
+	.subscribers		= ATOMIC_INIT(2),
+	.magic			= CRED_MAGIC,
+#endif
 	.securebits		= SECUREBITS_DEFAULT,
 	.cap_inheritable	= CAP_INIT_INH_SET,
 	.cap_permitted		= CAP_FULL_SET,
@@ -48,6 +64,31 @@ struct cred init_cred = {
 #endif
 };
 
+static inline void set_cred_subscribers(struct cred *cred, int n)
+{
+#ifdef CONFIG_DEBUG_CREDENTIALS
+	atomic_set(&cred->subscribers, n);
+#endif
+}
+
+static inline int read_cred_subscribers(const struct cred *cred)
+{
+#ifdef CONFIG_DEBUG_CREDENTIALS
+	return atomic_read(&cred->subscribers);
+#else
+	return 0;
+#endif
+}
+
+static inline void alter_cred_subscribers(const struct cred *_cred, int n)
+{
+#ifdef CONFIG_DEBUG_CREDENTIALS
+	struct cred *cred = (struct cred *) _cred;
+
+	atomic_add(n, &cred->subscribers);
+#endif
+}
+
 /*
  * Dispose of the shared task group credentials
  */
@@ -85,9 +126,22 @@ static void put_cred_rcu(struct rcu_head *rcu)
 {
 	struct cred *cred = container_of(rcu, struct cred, rcu);
 
+	kdebug("put_cred_rcu(%p)", cred);
+
+#ifdef CONFIG_DEBUG_CREDENTIALS
+	if (cred->magic != CRED_MAGIC_DEAD ||
+	    atomic_read(&cred->usage) != 0 ||
+	    read_cred_subscribers(cred) != 0)
+		panic("CRED: put_cred_rcu() sees %p with"
+		      " mag %x, put %p, usage %d, subscr %d\n",
+		      cred, cred->magic, cred->put_addr,
+		      atomic_read(&cred->usage),
+		      read_cred_subscribers(cred));
+#else
 	if (atomic_read(&cred->usage) != 0)
 		panic("CRED: put_cred_rcu() sees %p with usage %d\n",
 		      cred, atomic_read(&cred->usage));
+#endif
 
 	security_cred_free(cred);
 	key_put(cred->thread_keyring);
@@ -106,12 +160,47 @@ static void put_cred_rcu(struct rcu_head *rcu)
  */
 void __put_cred(struct cred *cred)
 {
+	kdebug("__put_cred(%p{%d,%d})", cred,
+	       atomic_read(&cred->usage),
+	       read_cred_subscribers(cred));
+
 	BUG_ON(atomic_read(&cred->usage) != 0);
+#ifdef CONFIG_DEBUG_CREDENTIALS
+	BUG_ON(read_cred_subscribers(cred) != 0);
+	cred->magic = CRED_MAGIC_DEAD;
+	cred->put_addr = __builtin_return_address(0);
+#endif
+	BUG_ON(cred == current->cred);
+	BUG_ON(cred == current->real_cred);
 
 	call_rcu(&cred->rcu, put_cred_rcu);
 }
 EXPORT_SYMBOL(__put_cred);
 
+/*
+ * Clean up a task's credentials when it exits
+ */
+void exit_creds(struct task_struct *tsk)
+{
+	struct cred *cred;
+
+	kdebug("exit_creds(%u,%p,%p,{%d,%d})", tsk->pid, tsk->real_cred, tsk->cred,
+	       atomic_read(&tsk->cred->usage),
+	       read_cred_subscribers(tsk->cred));
+
+	cred = (struct cred *) tsk->real_cred;
+	tsk->real_cred = NULL;
+	validate_creds(cred);
+	alter_cred_subscribers(cred, -1);
+	put_cred(cred);
+
+	cred = (struct cred *) tsk->cred;
+	tsk->cred = NULL;
+	validate_creds(cred);
+	alter_cred_subscribers(cred, -1);
+	put_cred(cred);
+}
+
 /**
  * prepare_creds - Prepare a new set of credentials for modification
  *
@@ -132,16 +221,19 @@ struct cred *prepare_creds(void)
 	const struct cred *old;
 	struct cred *new;
 
-	BUG_ON(atomic_read(&task->real_cred->usage) < 1);
+	validate_process_creds();
 
 	new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
 	if (!new)
 		return NULL;
 
+	kdebug("prepare_creds() alloc %p", new);
+
 	old = task->cred;
 	memcpy(new, old, sizeof(struct cred));
 
 	atomic_set(&new->usage, 1);
+	set_cred_subscribers(new, 0);
 	get_group_info(new->group_info);
 	get_uid(new->user);
 
@@ -157,6 +249,7 @@ struct cred *prepare_creds(void)
 
 	if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
 		goto error;
+	validate_creds(new);
 	return new;
 
 error:
@@ -229,9 +322,12 @@ struct cred *prepare_usermodehelper_creds(void)
 	if (!new)
 		return NULL;
 
+	kdebug("prepare_usermodehelper_creds() alloc %p", new);
+
 	memcpy(new, &init_cred, sizeof(struct cred));
 
 	atomic_set(&new->usage, 1);
+	set_cred_subscribers(new, 0);
 	get_group_info(new->group_info);
 	get_uid(new->user);
 
@@ -250,6 +346,7 @@ struct cred *prepare_usermodehelper_creds(void)
 #endif
 	if (security_prepare_creds(new, &init_cred, GFP_ATOMIC) < 0)
 		goto error;
+	validate_creds(new);
 
 	BUG_ON(atomic_read(&new->usage) != 1);
 	return new;
@@ -286,6 +383,10 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
 	    ) {
 		p->real_cred = get_cred(p->cred);
 		get_cred(p->cred);
+		alter_cred_subscribers(p->cred, 2);
+		kdebug("share_creds(%p{%d,%d})",
+		       p->cred, atomic_read(&p->cred->usage),
+		       read_cred_subscribers(p->cred));
 		atomic_inc(&p->cred->user->processes);
 		return 0;
 	}
@@ -331,6 +432,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
 
 	atomic_inc(&new->user->processes);
 	p->cred = p->real_cred = get_cred(new);
+	alter_cred_subscribers(new, 2);
+	validate_creds(new);
 	return 0;
 
 error_put:
@@ -355,13 +458,20 @@ error_put:
 int commit_creds(struct cred *new)
 {
 	struct task_struct *task = current;
-	const struct cred *old;
+	const struct cred *old = task->real_cred;
 
-	BUG_ON(task->cred != task->real_cred);
-	BUG_ON(atomic_read(&task->real_cred->usage) < 2);
+	kdebug("commit_creds(%p{%d,%d})", new,
+	       atomic_read(&new->usage),
+	       read_cred_subscribers(new));
+
+	BUG_ON(task->cred != old);
+#ifdef CONFIG_DEBUG_CREDENTIALS
+	BUG_ON(read_cred_subscribers(old) < 2);
+	validate_creds(old);
+	validate_creds(new);
+#endif
 	BUG_ON(atomic_read(&new->usage) < 1);
 
-	old = task->real_cred;
 	security_commit_creds(new, old);
 
 	get_cred(new); /* we will require a ref for the subj creds too */
@@ -390,12 +500,14 @@ int commit_creds(struct cred *new)
 	 *   cheaply with the new uid cache, so if it matters
 	 *   we should be checking for it.  -DaveM
 	 */
+	alter_cred_subscribers(new, 2);
 	if (new->user != old->user)
 		atomic_inc(&new->user->processes);
 	rcu_assign_pointer(task->real_cred, new);
 	rcu_assign_pointer(task->cred, new);
 	if (new->user != old->user)
 		atomic_dec(&old->user->processes);
+	alter_cred_subscribers(old, -2);
 
 	sched_switch_user(task);
 
@@ -428,6 +540,13 @@ EXPORT_SYMBOL(commit_creds);
  */
 void abort_creds(struct cred *new)
 {
+	kdebug("abort_creds(%p{%d,%d})", new,
+	       atomic_read(&new->usage),
+	       read_cred_subscribers(new));
+
+#ifdef CONFIG_DEBUG_CREDENTIALS
+	BUG_ON(read_cred_subscribers(new) != 0);
+#endif
 	BUG_ON(atomic_read(&new->usage) < 1);
 	put_cred(new);
 }
@@ -444,7 +563,20 @@ const struct cred *override_creds(const struct cred *new)
 {
 	const struct cred *old = current->cred;
 
-	rcu_assign_pointer(current->cred, get_cred(new));
+	kdebug("override_creds(%p{%d,%d})", new,
+	       atomic_read(&new->usage),
+	       read_cred_subscribers(new));
+
+	validate_creds(old);
+	validate_creds(new);
+	get_cred(new);
+	alter_cred_subscribers(new, 1);
+	rcu_assign_pointer(current->cred, new);
+	alter_cred_subscribers(old, -1);
+
+	kdebug("override_creds() = %p{%d,%d}", old,
+	       atomic_read(&old->usage),
+	       read_cred_subscribers(old));
 	return old;
 }
 EXPORT_SYMBOL(override_creds);
@@ -460,7 +592,15 @@ void revert_creds(const struct cred *old)
 {
 	const struct cred *override = current->cred;
 
+	kdebug("revert_creds(%p{%d,%d})", old,
+	       atomic_read(&old->usage),
+	       read_cred_subscribers(old));
+
+	validate_creds(old);
+	validate_creds(override);
+	alter_cred_subscribers(old, 1);
 	rcu_assign_pointer(current->cred, old);
+	alter_cred_subscribers(override, -1);
 	put_cred(override);
 }
 EXPORT_SYMBOL(revert_creds);
@@ -502,11 +642,15 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
 	if (!new)
 		return NULL;
 
+	kdebug("prepare_kernel_cred() alloc %p", new);
+
 	if (daemon)
 		old = get_task_cred(daemon);
 	else
 		old = get_cred(&init_cred);
 
+	validate_creds(old);
+
 	*new = *old;
 	get_uid(new->user);
 	get_group_info(new->group_info);
@@ -526,7 +670,9 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
 		goto error;
 
 	atomic_set(&new->usage, 1);
+	set_cred_subscribers(new, 0);
 	put_cred(old);
+	validate_creds(new);
 	return new;
 
 error:
@@ -589,3 +735,95 @@ int set_create_files_as(struct cred *new, struct inode *inode)
 	return security_kernel_create_files_as(new, inode);
 }
 EXPORT_SYMBOL(set_create_files_as);
+
+#ifdef CONFIG_DEBUG_CREDENTIALS
+
+/*
+ * dump invalid credentials
+ */
+static void dump_invalid_creds(const struct cred *cred, const char *label,
+			       const struct task_struct *tsk)
+{
+	printk(KERN_ERR "CRED: %s credentials: %p %s%s%s\n",
+	       label, cred,
+	       cred == &init_cred ? "[init]" : "",
+	       cred == tsk->real_cred ? "[real]" : "",
+	       cred == tsk->cred ? "[eff]" : "");
+	printk(KERN_ERR "CRED: ->magic=%x, put_addr=%p\n",
+	       cred->magic, cred->put_addr);
+	printk(KERN_ERR "CRED: ->usage=%d, subscr=%d\n",
+	       atomic_read(&cred->usage),
+	       read_cred_subscribers(cred));
+	printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n",
+	       cred->uid, cred->euid, cred->suid, cred->fsuid);
+	printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n",
+	       cred->gid, cred->egid, cred->sgid, cred->fsgid);
+#ifdef CONFIG_SECURITY
+	printk(KERN_ERR "CRED: ->security is %p\n", cred->security);
+	if ((unsigned long) cred->security >= PAGE_SIZE &&
+	    (((unsigned long) cred->security & 0xffffff00) !=
+	     (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8)))
+		printk(KERN_ERR "CRED: ->security {%x, %x}\n",
+		       ((u32*)cred->security)[0],
+		       ((u32*)cred->security)[1]);
+#endif
+}
+
+/*
+ * report use of invalid credentials
+ */
+void __invalid_creds(const struct cred *cred, const char *file, unsigned line)
+{
+	printk(KERN_ERR "CRED: Invalid credentials\n");
+	printk(KERN_ERR "CRED: At %s:%u\n", file, line);
+	dump_invalid_creds(cred, "Specified", current);
+	BUG();
+}
+EXPORT_SYMBOL(__invalid_creds);
+
+/*
+ * check the credentials on a process
+ */
+void __validate_process_creds(struct task_struct *tsk,
+			      const char *file, unsigned line)
+{
+	if (tsk->cred == tsk->real_cred) {
+		if (unlikely(read_cred_subscribers(tsk->cred) < 2 ||
+			     creds_are_invalid(tsk->cred)))
+			goto invalid_creds;
+	} else {
+		if (unlikely(read_cred_subscribers(tsk->real_cred) < 1 ||
+			     read_cred_subscribers(tsk->cred) < 1 ||
+			     creds_are_invalid(tsk->real_cred) ||
+			     creds_are_invalid(tsk->cred)))
+			goto invalid_creds;
+	}
+	return;
+
+invalid_creds:
+	printk(KERN_ERR "CRED: Invalid process credentials\n");
+	printk(KERN_ERR "CRED: At %s:%u\n", file, line);
+
+	dump_invalid_creds(tsk->real_cred, "Real", tsk);
+	if (tsk->cred != tsk->real_cred)
+		dump_invalid_creds(tsk->cred, "Effective", tsk);
+	else
+		printk(KERN_ERR "CRED: Effective creds == Real creds\n");
+	BUG();
+}
+EXPORT_SYMBOL(__validate_process_creds);
+
+/*
+ * check creds for do_exit()
+ */
+void validate_creds_for_do_exit(struct task_struct *tsk)
+{
+	kdebug("validate_creds_for_do_exit(%p,%p{%d,%d})",
+	       tsk->real_cred, tsk->cred,
+	       atomic_read(&tsk->cred->usage),
+	       read_cred_subscribers(tsk->cred));
+
+	__validate_process_creds(tsk, __FILE__, __LINE__);
+}
+
+#endif /* CONFIG_DEBUG_CREDENTIALS */
diff --git a/kernel/exit.c b/kernel/exit.c
index 869dc221733..c98ff7a8025 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -901,6 +901,8 @@ NORET_TYPE void do_exit(long code)
 
 	tracehook_report_exit(&code);
 
+	validate_creds_for_do_exit(tsk);
+
 	/*
 	 * We're taking recursive faults here in do_exit. Safest is to just
 	 * leave this task alone and wait for reboot.
@@ -1009,6 +1011,8 @@ NORET_TYPE void do_exit(long code)
 	if (tsk->splice_pipe)
 		__free_pipe_info(tsk->splice_pipe);
 
+	validate_creds_for_do_exit(tsk);
+
 	preempt_disable();
 	/* causes final put_task_struct in finish_task_switch(). */
 	tsk->state = TASK_DEAD;
diff --git a/kernel/fork.c b/kernel/fork.c
index 144326b7af5..043b5d88049 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -152,8 +152,7 @@ void __put_task_struct(struct task_struct *tsk)
 	WARN_ON(atomic_read(&tsk->usage));
 	WARN_ON(tsk == current);
 
-	put_cred(tsk->real_cred);
-	put_cred(tsk->cred);
+	exit_creds(tsk);
 	delayacct_tsk_free(tsk);
 
 	if (!profile_handoff_task(tsk))
@@ -1307,8 +1306,7 @@ bad_fork_cleanup_put_domain:
 	module_put(task_thread_info(p)->exec_domain->module);
 bad_fork_cleanup_count:
 	atomic_dec(&p->cred->user->processes);
-	put_cred(p->real_cred);
-	put_cred(p->cred);
+	exit_creds(p);
 bad_fork_free:
 	free_task(p);
 fork_out:
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 5a7ae57f983..4e8cae2e914 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -466,6 +466,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info,
 	int retval = 0;
 
 	BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
+	validate_creds(sub_info->cred);
 
 	helper_lock();
 	if (sub_info->path[0] == '\0')
-- 
cgit v1.2.3


From ee18d64c1f632043a02e6f5ba5e045bb26a5465f Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 2 Sep 2009 09:14:21 +0100
Subject: KEYS: Add a keyctl to install a process's session keyring on its
 parent [try #6]

Add a keyctl to install a process's session keyring onto its parent.  This
replaces the parent's session keyring.  Because the COW credential code does
not permit one process to change another process's credentials directly, the
change is deferred until userspace next starts executing again.  Normally this
will be after a wait*() syscall.

To support this, three new security hooks have been provided:
cred_alloc_blank() to allocate unset security creds, cred_transfer() to fill in
the blank security creds and key_session_to_parent() - which asks the LSM if
the process may replace its parent's session keyring.

The replacement may only happen if the process has the same ownership details
as its parent, and the process has LINK permission on the session keyring, and
the session keyring is owned by the process, and the LSM permits it.

Note that this requires alteration to each architecture's notify_resume path.
This has been done for all arches barring blackfin, m68k* and xtensa, all of
which need assembly alteration to support TIF_NOTIFY_RESUME.  This allows the
replacement to be performed at the point the parent process resumes userspace
execution.

This allows the userspace AFS pioctl emulation to fully emulate newpag() and
the VIOCSETTOK and VIOCSETTOK2 pioctls, all of which require the ability to
alter the parent process's PAG membership.  However, since kAFS doesn't use
PAGs per se, but rather dumps the keys into the session keyring, the session
keyring of the parent must be replaced if, for example, VIOCSETTOK is passed
the newpag flag.

This can be tested with the following program:

	#include <stdio.h>
	#include <stdlib.h>
	#include <keyutils.h>

	#define KEYCTL_SESSION_TO_PARENT	18

	#define OSERROR(X, S) do { if ((long)(X) == -1) { perror(S); exit(1); } } while(0)

	int main(int argc, char **argv)
	{
		key_serial_t keyring, key;
		long ret;

		keyring = keyctl_join_session_keyring(argv[1]);
		OSERROR(keyring, "keyctl_join_session_keyring");

		key = add_key("user", "a", "b", 1, keyring);
		OSERROR(key, "add_key");

		ret = keyctl(KEYCTL_SESSION_TO_PARENT);
		OSERROR(ret, "KEYCTL_SESSION_TO_PARENT");

		return 0;
	}

Compiled and linked with -lkeyutils, you should see something like:

	[dhowells@andromeda ~]$ keyctl show
	Session Keyring
	       -3 --alswrv   4043  4043  keyring: _ses
	355907932 --alswrv   4043    -1   \_ keyring: _uid.4043
	[dhowells@andromeda ~]$ /tmp/newpag
	[dhowells@andromeda ~]$ keyctl show
	Session Keyring
	       -3 --alswrv   4043  4043  keyring: _ses
	1055658746 --alswrv   4043  4043   \_ user: a
	[dhowells@andromeda ~]$ /tmp/newpag hello
	[dhowells@andromeda ~]$ keyctl show
	Session Keyring
	       -3 --alswrv   4043  4043  keyring: hello
	340417692 --alswrv   4043  4043   \_ user: a

Where the test program creates a new session keyring, sticks a user key named
'a' into it and then installs it on its parent.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/cred.c | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cred.c b/kernel/cred.c
index 24dd2f5104b..006fcab009d 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -199,6 +199,49 @@ void exit_creds(struct task_struct *tsk)
 	validate_creds(cred);
 	alter_cred_subscribers(cred, -1);
 	put_cred(cred);
+
+	cred = (struct cred *) tsk->replacement_session_keyring;
+	if (cred) {
+		tsk->replacement_session_keyring = NULL;
+		validate_creds(cred);
+		put_cred(cred);
+	}
+}
+
+/*
+ * Allocate blank credentials, such that the credentials can be filled in at a
+ * later date without risk of ENOMEM.
+ */
+struct cred *cred_alloc_blank(void)
+{
+	struct cred *new;
+
+	new = kmem_cache_zalloc(cred_jar, GFP_KERNEL);
+	if (!new)
+		return NULL;
+
+#ifdef CONFIG_KEYS
+	new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL);
+	if (!new->tgcred) {
+		kfree(new);
+		return NULL;
+	}
+	atomic_set(&new->tgcred->usage, 1);
+#endif
+
+	atomic_set(&new->usage, 1);
+
+	if (security_cred_alloc_blank(new, GFP_KERNEL) < 0)
+		goto error;
+
+#ifdef CONFIG_DEBUG_CREDENTIALS
+	new->magic = CRED_MAGIC;
+#endif
+	return new;
+
+error:
+	abort_creds(new);
+	return NULL;
 }
 
 /**
-- 
cgit v1.2.3


From 0fbdea19e9394a5cb5f2f5081b028c50b558910a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 2 Sep 2009 21:46:00 +0200
Subject: perf_counter: Introduce new (non-)paranoia level to allow raw
 tracepoint access

I want to sample inherited tracepoint workloads as a normal
user and the CAP_SYS_ADMIN check prevents me from doing that
right now.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index d988dfb4bba..0aa609f6910 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -46,12 +46,18 @@ static atomic_t nr_task_counters __read_mostly;
 
 /*
  * perf counter paranoia level:
- *  0 - not paranoid
- *  1 - disallow cpu counters to unpriv
- *  2 - disallow kernel profiling to unpriv
+ *  -1 - not paranoid at all
+ *   0 - disallow raw tracepoint access for unpriv
+ *   1 - disallow cpu counters for unpriv
+ *   2 - disallow kernel profiling for unpriv
  */
 int sysctl_perf_counter_paranoid __read_mostly = 1;
 
+static inline bool perf_paranoid_tracepoint_raw(void)
+{
+	return sysctl_perf_counter_paranoid > -1;
+}
+
 static inline bool perf_paranoid_cpu(void)
 {
 	return sysctl_perf_counter_paranoid > 0;
@@ -3971,6 +3977,7 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
 	 * have these.
 	 */
 	if ((counter->attr.sample_type & PERF_SAMPLE_RAW) &&
+			perf_paranoid_tracepoint_raw() &&
 			!capable(CAP_SYS_ADMIN))
 		return ERR_PTR(-EPERM);
 
-- 
cgit v1.2.3


From dc86cabe4b242446ea9aa8492c727e1220817898 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 3 Sep 2009 18:03:00 +0200
Subject: perf_counter: Fix output-sharing error path

We forget to release the fd in the PERF_FLAG_FD_OUTPUT
error path.

Reorganize the error flow here to be a clean fall-through
logic.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 0aa609f6910..e0d91fdf0c3 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -4316,15 +4316,15 @@ SYSCALL_DEFINE5(perf_counter_open,
 	struct file *group_file = NULL;
 	int fput_needed = 0;
 	int fput_needed2 = 0;
-	int ret;
+	int err;
 
 	/* for future expandability... */
 	if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
 		return -EINVAL;
 
-	ret = perf_copy_attr(attr_uptr, &attr);
-	if (ret)
-		return ret;
+	err = perf_copy_attr(attr_uptr, &attr);
+	if (err)
+		return err;
 
 	if (!attr.exclude_kernel) {
 		if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
@@ -4348,7 +4348,7 @@ SYSCALL_DEFINE5(perf_counter_open,
 	 */
 	group_leader = NULL;
 	if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
-		ret = -EINVAL;
+		err = -EINVAL;
 		group_file = fget_light(group_fd, &fput_needed);
 		if (!group_file)
 			goto err_put_context;
@@ -4377,22 +4377,22 @@ SYSCALL_DEFINE5(perf_counter_open,
 
 	counter = perf_counter_alloc(&attr, cpu, ctx, group_leader,
 				     NULL, GFP_KERNEL);
-	ret = PTR_ERR(counter);
+	err = PTR_ERR(counter);
 	if (IS_ERR(counter))
 		goto err_put_context;
 
-	ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
-	if (ret < 0)
+	err = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
+	if (err < 0)
 		goto err_free_put_context;
 
-	counter_file = fget_light(ret, &fput_needed2);
+	counter_file = fget_light(err, &fput_needed2);
 	if (!counter_file)
 		goto err_free_put_context;
 
 	if (flags & PERF_FLAG_FD_OUTPUT) {
-		ret = perf_counter_set_output(counter, group_fd);
-		if (ret)
-			goto err_free_put_context;
+		err = perf_counter_set_output(counter, group_fd);
+		if (err)
+			goto err_fput_free_put_context;
 	}
 
 	counter->filp = counter_file;
@@ -4408,20 +4408,20 @@ SYSCALL_DEFINE5(perf_counter_open,
 	list_add_tail(&counter->owner_entry, &current->perf_counter_list);
 	mutex_unlock(&current->perf_counter_mutex);
 
+err_fput_free_put_context:
 	fput_light(counter_file, fput_needed2);
 
-out_fput:
-	fput_light(group_file, fput_needed);
-
-	return ret;
-
 err_free_put_context:
-	kfree(counter);
+	if (err < 0)
+		kfree(counter);
 
 err_put_context:
-	put_ctx(ctx);
+	if (err < 0)
+		put_ctx(ctx);
+
+	fput_light(group_file, fput_needed);
 
-	goto out_fput;
+	return err;
 }
 
 /*
-- 
cgit v1.2.3


From f93e65c186ab3c05ce2068733ca10e34fd00125e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 1 Sep 2009 10:34:32 +0200
Subject: sched: Restore __cpu_power to a straight sum of power

cpu_power is supposed to be a representation of the process
capacity of the cpu, not a value to randomly tweak in order to
affect placement.

Remove the placement hacks.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Tested-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Acked-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Acked-by: Gautham R Shenoy <ego@in.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
LKML-Reference: <20090901083825.810860576@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 28 ++++++++++++----------------
 1 file changed, 12 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index da1edc8277d..584a122b553 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8464,15 +8464,13 @@ static void free_sched_groups(const struct cpumask *cpu_map,
  * there are asymmetries in the topology. If there are asymmetries, group
  * having more cpu_power will pickup more load compared to the group having
  * less cpu_power.
- *
- * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
- * the maximum number of tasks a group can handle in the presence of other idle
- * or lightly loaded groups in the same sched domain.
  */
 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 {
 	struct sched_domain *child;
 	struct sched_group *group;
+	long power;
+	int weight;
 
 	WARN_ON(!sd || !sd->groups);
 
@@ -8483,22 +8481,20 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 
 	sd->groups->__cpu_power = 0;
 
-	/*
-	 * For perf policy, if the groups in child domain share resources
-	 * (for example cores sharing some portions of the cache hierarchy
-	 * or SMT), then set this domain groups cpu_power such that each group
-	 * can handle only one task, when there are other idle groups in the
-	 * same sched domain.
-	 */
-	if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
-		       (child->flags &
-			(SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
-		sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
+	if (!child) {
+		power = SCHED_LOAD_SCALE;
+		weight = cpumask_weight(sched_domain_span(sd));
+		/*
+		 * SMT siblings share the power of a single core.
+		 */
+		if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1)
+			power /= weight;
+		sg_inc_cpu_power(sd->groups, power);
 		return;
 	}
 
 	/*
-	 * add cpu_power of each child group to this groups cpu_power
+	 * Add cpu_power of each child group to this groups cpu_power.
 	 */
 	group = child->groups;
 	do {
-- 
cgit v1.2.3


From b5d978e0c7e79a7ff842e895c85a86b38c71f1cd Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 1 Sep 2009 10:34:33 +0200
Subject: sched: Add SD_PREFER_SIBLING

Do the placement thing using SD flags.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Tested-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Acked-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Acked-by: Gautham R Shenoy <ego@in.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
LKML-Reference: <20090901083825.897028974@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 584a122b553..9d64cec9ae1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3811,9 +3811,13 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 			const struct cpumask *cpus, int *balance,
 			struct sd_lb_stats *sds)
 {
+	struct sched_domain *child = sd->child;
 	struct sched_group *group = sd->groups;
 	struct sg_lb_stats sgs;
-	int load_idx;
+	int load_idx, prefer_sibling = 0;
+
+	if (child && child->flags & SD_PREFER_SIBLING)
+		prefer_sibling = 1;
 
 	init_sd_power_savings_stats(sd, sds, idle);
 	load_idx = get_sd_load_idx(sd, idle);
@@ -3833,6 +3837,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 		sds->total_load += sgs.group_load;
 		sds->total_pwr += group->__cpu_power;
 
+		/*
+		 * In case the child domain prefers tasks go to siblings
+		 * first, lower the group capacity to one so that we'll try
+		 * and move all the excess tasks away.
+		 */
+		if (prefer_sibling)
+			sgs.group_capacity = 1;
+
 		if (local_group) {
 			sds->this_load = sgs.avg_load;
 			sds->this = group;
-- 
cgit v1.2.3


From cc9fba7d7672fa3ed58d9d9ecb6c45b1351c29a6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 1 Sep 2009 10:34:34 +0200
Subject: sched: Update the cpu_power sum during load-balance

In order to prepare for a more dynamic cpu_power, update the
group sum while walking the sched domains during load-balance.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Tested-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Acked-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Acked-by: Gautham R Shenoy <ego@in.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
LKML-Reference: <20090901083825.985050292@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 33 +++++++++++++++++++++++++++++----
 1 file changed, 29 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 9d64cec9ae1..ecb4a47d421 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3699,6 +3699,28 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
 }
 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
 
+static void update_sched_power(struct sched_domain *sd)
+{
+	struct sched_domain *child = sd->child;
+	struct sched_group *group, *sdg = sd->groups;
+	unsigned long power = sdg->__cpu_power;
+
+	if (!child) {
+		/* compute cpu power for this cpu */
+		return;
+	}
+
+	sdg->__cpu_power = 0;
+
+	group = child->groups;
+	do {
+		sdg->__cpu_power += group->__cpu_power;
+		group = group->next;
+	} while (group != child->groups);
+
+	if (power != sdg->__cpu_power)
+		sdg->reciprocal_cpu_power = reciprocal_value(sdg->__cpu_power);
+}
 
 /**
  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
@@ -3712,7 +3734,8 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
  * @balance: Should we balance.
  * @sgs: variable to hold the statistics for this group.
  */
-static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
+static inline void update_sg_lb_stats(struct sched_domain *sd,
+			struct sched_group *group, int this_cpu,
 			enum cpu_idle_type idle, int load_idx, int *sd_idle,
 			int local_group, const struct cpumask *cpus,
 			int *balance, struct sg_lb_stats *sgs)
@@ -3723,8 +3746,11 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
 	unsigned long sum_avg_load_per_task;
 	unsigned long avg_load_per_task;
 
-	if (local_group)
+	if (local_group) {
 		balance_cpu = group_first_cpu(group);
+		if (balance_cpu == this_cpu)
+			update_sched_power(sd);
+	}
 
 	/* Tally up the load of all CPUs in the group */
 	sum_avg_load_per_task = avg_load_per_task = 0;
@@ -3828,7 +3854,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 		local_group = cpumask_test_cpu(this_cpu,
 					       sched_group_cpus(group));
 		memset(&sgs, 0, sizeof(sgs));
-		update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle,
+		update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
 				local_group, cpus, balance, &sgs);
 
 		if (local_group && balance && !(*balance))
@@ -3863,7 +3889,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 		update_sd_power_savings_stats(group, sds, local_group, &sgs);
 		group = group->next;
 	} while (group != sd->groups);
-
 }
 
 /**
-- 
cgit v1.2.3


From a52bfd73589eaf88d9c95ad2c1de0b38a6b27972 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 1 Sep 2009 10:34:35 +0200
Subject: sched: Add smt_gain

The idea is that multi-threading a core yields more work
capacity than a single thread, provide a way to express a
static gain for threads.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Tested-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Acked-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Acked-by: Gautham R Shenoy <ego@in.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
LKML-Reference: <20090901083826.073345955@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index ecb4a47d421..55112261027 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8523,9 +8523,15 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 		weight = cpumask_weight(sched_domain_span(sd));
 		/*
 		 * SMT siblings share the power of a single core.
+		 * Usually multiple threads get a better yield out of
+		 * that one core than a single thread would have,
+		 * reflect that in sd->smt_gain.
 		 */
-		if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1)
+		if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
+			power *= sd->smt_gain;
 			power /= weight;
+			power >>= SCHED_LOAD_SHIFT;
+		}
 		sg_inc_cpu_power(sd->groups, power);
 		return;
 	}
-- 
cgit v1.2.3


From ab29230e673c646292c90c8b9d378b9562145af0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 1 Sep 2009 10:34:36 +0200
Subject: sched: Implement dynamic cpu_power

Recompute the cpu_power for each cpu during load-balance.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Tested-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Acked-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Acked-by: Gautham R Shenoy <ego@in.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
LKML-Reference: <20090901083826.162033479@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 38 +++++++++++++++++++++++++++++++++++---
 1 file changed, 35 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 55112261027..036600fd70b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3699,14 +3699,46 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
 }
 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
 
-static void update_sched_power(struct sched_domain *sd)
+unsigned long __weak arch_smt_gain(struct sched_domain *sd, int cpu)
+{
+	unsigned long weight = cpumask_weight(sched_domain_span(sd));
+	unsigned long smt_gain = sd->smt_gain;
+
+	smt_gain /= weight;
+
+	return smt_gain;
+}
+
+static void update_cpu_power(struct sched_domain *sd, int cpu)
+{
+	unsigned long weight = cpumask_weight(sched_domain_span(sd));
+	unsigned long power = SCHED_LOAD_SCALE;
+	struct sched_group *sdg = sd->groups;
+	unsigned long old = sdg->__cpu_power;
+
+	/* here we could scale based on cpufreq */
+
+	if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
+		power *= arch_smt_gain(sd, cpu);
+		power >>= SCHED_LOAD_SHIFT;
+	}
+
+	/* here we could scale based on RT time */
+
+	if (power != old) {
+		sdg->__cpu_power = power;
+		sdg->reciprocal_cpu_power = reciprocal_value(power);
+	}
+}
+
+static void update_group_power(struct sched_domain *sd, int cpu)
 {
 	struct sched_domain *child = sd->child;
 	struct sched_group *group, *sdg = sd->groups;
 	unsigned long power = sdg->__cpu_power;
 
 	if (!child) {
-		/* compute cpu power for this cpu */
+		update_cpu_power(sd, cpu);
 		return;
 	}
 
@@ -3749,7 +3781,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	if (local_group) {
 		balance_cpu = group_first_cpu(group);
 		if (balance_cpu == this_cpu)
-			update_sched_power(sd);
+			update_group_power(sd, this_cpu);
 	}
 
 	/* Tally up the load of all CPUs in the group */
-- 
cgit v1.2.3


From e9e9250bc78e7f6342517214c0178a529807964b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 1 Sep 2009 10:34:37 +0200
Subject: sched: Scale down cpu_power due to RT tasks

Keep an average on the amount of time spend on RT tasks and use
that fraction to scale down the cpu_power for regular tasks.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Tested-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Acked-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Acked-by: Gautham R Shenoy <ego@in.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
LKML-Reference: <20090901083826.287778431@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c    | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
 kernel/sched_rt.c |  6 ++----
 kernel/sysctl.c   |  8 +++++++
 3 files changed, 71 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 036600fd70b..ab532b5de40 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -627,6 +627,9 @@ struct rq {
 
 	struct task_struct *migration_thread;
 	struct list_head migration_queue;
+
+	u64 rt_avg;
+	u64 age_stamp;
 #endif
 
 	/* calc_load related fields */
@@ -862,6 +865,14 @@ unsigned int sysctl_sched_shares_ratelimit = 250000;
  */
 unsigned int sysctl_sched_shares_thresh = 4;
 
+/*
+ * period over which we average the RT time consumption, measured
+ * in ms.
+ *
+ * default: 1s
+ */
+const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
+
 /*
  * period over which we measure -rt task cpu usage in us.
  * default: 1s
@@ -1280,12 +1291,37 @@ void wake_up_idle_cpu(int cpu)
 }
 #endif /* CONFIG_NO_HZ */
 
+static u64 sched_avg_period(void)
+{
+	return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
+}
+
+static void sched_avg_update(struct rq *rq)
+{
+	s64 period = sched_avg_period();
+
+	while ((s64)(rq->clock - rq->age_stamp) > period) {
+		rq->age_stamp += period;
+		rq->rt_avg /= 2;
+	}
+}
+
+static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
+{
+	rq->rt_avg += rt_delta;
+	sched_avg_update(rq);
+}
+
 #else /* !CONFIG_SMP */
 static void resched_task(struct task_struct *p)
 {
 	assert_spin_locked(&task_rq(p)->lock);
 	set_tsk_need_resched(p);
 }
+
+static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
+{
+}
 #endif /* CONFIG_SMP */
 
 #if BITS_PER_LONG == 32
@@ -3699,7 +3735,7 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
 }
 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
 
-unsigned long __weak arch_smt_gain(struct sched_domain *sd, int cpu)
+unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
 {
 	unsigned long weight = cpumask_weight(sched_domain_span(sd));
 	unsigned long smt_gain = sd->smt_gain;
@@ -3709,6 +3745,24 @@ unsigned long __weak arch_smt_gain(struct sched_domain *sd, int cpu)
 	return smt_gain;
 }
 
+unsigned long scale_rt_power(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	u64 total, available;
+
+	sched_avg_update(rq);
+
+	total = sched_avg_period() + (rq->clock - rq->age_stamp);
+	available = total - rq->rt_avg;
+
+	if (unlikely((s64)total < SCHED_LOAD_SCALE))
+		total = SCHED_LOAD_SCALE;
+
+	total >>= SCHED_LOAD_SHIFT;
+
+	return div_u64(available, total);
+}
+
 static void update_cpu_power(struct sched_domain *sd, int cpu)
 {
 	unsigned long weight = cpumask_weight(sched_domain_span(sd));
@@ -3719,11 +3773,15 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
 	/* here we could scale based on cpufreq */
 
 	if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
-		power *= arch_smt_gain(sd, cpu);
+		power *= arch_scale_smt_power(sd, cpu);
 		power >>= SCHED_LOAD_SHIFT;
 	}
 
-	/* here we could scale based on RT time */
+	power *= scale_rt_power(cpu);
+	power >>= SCHED_LOAD_SHIFT;
+
+	if (!power)
+		power = 1;
 
 	if (power != old) {
 		sdg->__cpu_power = power;
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 3d4020a9ba1..2eb4bd6a526 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -615,6 +615,8 @@ static void update_curr_rt(struct rq *rq)
 	curr->se.exec_start = rq->clock;
 	cpuacct_charge(curr, delta_exec);
 
+	sched_rt_avg_update(rq, delta_exec);
+
 	if (!rt_bandwidth_enabled())
 		return;
 
@@ -887,8 +889,6 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
 
 	if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
 		enqueue_pushable_task(rq, p);
-
-	inc_cpu_load(rq, p->se.load.weight);
 }
 
 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
@@ -899,8 +899,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
 	dequeue_rt_entity(rt_se);
 
 	dequeue_pushable_task(rq, p);
-
-	dec_cpu_load(rq, p->se.load.weight);
 }
 
 /*
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 58be76017fd..6c9836ef4b4 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -330,6 +330,14 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_time_avg",
+		.data		= &sysctl_sched_time_avg,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 	{
 		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "timer_migration",
-- 
cgit v1.2.3


From bdb94aa5dbd8b55e75f5a50b61312fe589e2c2d1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 1 Sep 2009 10:34:38 +0200
Subject: sched: Try to deal with low capacity

When the capacity drops low, we want to migrate load away.
Allow the load-balancer to remove all tasks when we hit rock
bottom.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Tested-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Acked-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Acked-by: Gautham R Shenoy <ego@in.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
LKML-Reference: <20090901083826.342231003@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 33 ++++++++++++++++++++++++++++-----
 1 file changed, 28 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index ab532b5de40..5f5b359b01b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3908,8 +3908,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
 		sgs->group_imb = 1;
 
-	sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
-
+	sgs->group_capacity =
+		DIV_ROUND_CLOSEST(group->__cpu_power, SCHED_LOAD_SCALE);
 }
 
 /**
@@ -3959,7 +3959,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 		 * and move all the excess tasks away.
 		 */
 		if (prefer_sibling)
-			sgs.group_capacity = 1;
+			sgs.group_capacity = min(sgs.group_capacity, 1UL);
 
 		if (local_group) {
 			sds->this_load = sgs.avg_load;
@@ -4191,6 +4191,26 @@ ret:
 	return NULL;
 }
 
+static struct sched_group *group_of(int cpu)
+{
+	struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
+
+	if (!sd)
+		return NULL;
+
+	return sd->groups;
+}
+
+static unsigned long power_of(int cpu)
+{
+	struct sched_group *group = group_of(cpu);
+
+	if (!group)
+		return SCHED_LOAD_SCALE;
+
+	return group->__cpu_power;
+}
+
 /*
  * find_busiest_queue - find the busiest runqueue among the cpus in group.
  */
@@ -4203,15 +4223,18 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
 	int i;
 
 	for_each_cpu(i, sched_group_cpus(group)) {
+		unsigned long power = power_of(i);
+		unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
 		unsigned long wl;
 
 		if (!cpumask_test_cpu(i, cpus))
 			continue;
 
 		rq = cpu_rq(i);
-		wl = weighted_cpuload(i);
+		wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
+		wl /= power;
 
-		if (rq->nr_running == 1 && wl > imbalance)
+		if (capacity && rq->nr_running == 1 && wl > imbalance)
 			continue;
 
 		if (wl > max_load) {
-- 
cgit v1.2.3


From d899a789c28ded9c72b57ddb61868d6b8fc23e80 Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Wed, 2 Sep 2009 16:59:10 +0530
Subject: sched: Try to deal with low capacity, fix
 update_sd_power_savings_stats()

sgs.group_capacity can now be 0, if for some reason
group->__cpu_power happens to be less than SCHED_LOAD_SCALE/2.

In that case, we need the following fix to make it work for
update_sd_power_savings_stats(). That's because both
sum_nr_running and group_capacity are unsigned longs.

Cc: Gautham R Shenoy <ego@in.ibm.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 5f5b359b01b..e1ebf9b00f5 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3668,7 +3668,7 @@ static inline void update_sd_power_savings_stats(struct sched_group *group,
 	 * capacity but still has some space to pick up some load
 	 * from other group and save more power
 	 */
-	if (sgs->sum_nr_running > sgs->group_capacity - 1)
+	if (sgs->sum_nr_running + 1 > sgs->group_capacity)
 		return;
 
 	if (sgs->sum_nr_running > sds->leader_nr_running ||
-- 
cgit v1.2.3


From 18a3885fc1ffa92c2212ff0afdf033403d5b0fa0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 1 Sep 2009 10:34:39 +0200
Subject: sched: Remove reciprocal for cpu_power

Its a source of fail, also, now that cpu_power is dynamical,
its a waste of time.

before:
<idle>-0   [000]   132.877936: find_busiest_group: avg_load: 0 group_load: 8241 power: 1

after:
bash-1689  [001]   137.862151: find_busiest_group: avg_load: 10636288 group_load: 10387 power: 1

[ v2: build fix from From: Andreas Herrmann ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Tested-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Acked-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Acked-by: Gautham R Shenoy <ego@in.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
LKML-Reference: <20090901083826.425896304@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 101 +++++++++++++++++++--------------------------------------
 1 file changed, 34 insertions(+), 67 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index e1ebf9b00f5..b5378534685 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -64,7 +64,6 @@
 #include <linux/tsacct_kern.h>
 #include <linux/kprobes.h>
 #include <linux/delayacct.h>
-#include <linux/reciprocal_div.h>
 #include <linux/unistd.h>
 #include <linux/pagemap.h>
 #include <linux/hrtimer.h>
@@ -120,30 +119,8 @@
  */
 #define RUNTIME_INF	((u64)~0ULL)
 
-#ifdef CONFIG_SMP
-
 static void double_rq_lock(struct rq *rq1, struct rq *rq2);
 
-/*
- * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
- * Since cpu_power is a 'constant', we can use a reciprocal divide.
- */
-static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
-{
-	return reciprocal_divide(load, sg->reciprocal_cpu_power);
-}
-
-/*
- * Each time a sched group cpu_power is changed,
- * we must compute its reciprocal value
- */
-static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
-{
-	sg->__cpu_power += val;
-	sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
-}
-#endif
-
 static inline int rt_policy(int policy)
 {
 	if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
@@ -2335,8 +2312,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 		}
 
 		/* Adjust by relative CPU power of the group */
-		avg_load = sg_div_cpu_power(group,
-				avg_load * SCHED_LOAD_SCALE);
+		avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
 
 		if (local_group) {
 			this_load = avg_load;
@@ -3768,7 +3744,6 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
 	unsigned long weight = cpumask_weight(sched_domain_span(sd));
 	unsigned long power = SCHED_LOAD_SCALE;
 	struct sched_group *sdg = sd->groups;
-	unsigned long old = sdg->__cpu_power;
 
 	/* here we could scale based on cpufreq */
 
@@ -3783,33 +3758,26 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
 	if (!power)
 		power = 1;
 
-	if (power != old) {
-		sdg->__cpu_power = power;
-		sdg->reciprocal_cpu_power = reciprocal_value(power);
-	}
+	sdg->cpu_power = power;
 }
 
 static void update_group_power(struct sched_domain *sd, int cpu)
 {
 	struct sched_domain *child = sd->child;
 	struct sched_group *group, *sdg = sd->groups;
-	unsigned long power = sdg->__cpu_power;
 
 	if (!child) {
 		update_cpu_power(sd, cpu);
 		return;
 	}
 
-	sdg->__cpu_power = 0;
+	sdg->cpu_power = 0;
 
 	group = child->groups;
 	do {
-		sdg->__cpu_power += group->__cpu_power;
+		sdg->cpu_power += group->cpu_power;
 		group = group->next;
 	} while (group != child->groups);
-
-	if (power != sdg->__cpu_power)
-		sdg->reciprocal_cpu_power = reciprocal_value(sdg->__cpu_power);
 }
 
 /**
@@ -3889,8 +3857,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	}
 
 	/* Adjust by relative CPU power of the group */
-	sgs->avg_load = sg_div_cpu_power(group,
-			sgs->group_load * SCHED_LOAD_SCALE);
+	sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
 
 
 	/*
@@ -3902,14 +3869,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	 *      normalized nr_running number somewhere that negates
 	 *      the hierarchy?
 	 */
-	avg_load_per_task = sg_div_cpu_power(group,
-			sum_avg_load_per_task * SCHED_LOAD_SCALE);
+	avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
+		group->cpu_power;
 
 	if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
 		sgs->group_imb = 1;
 
 	sgs->group_capacity =
-		DIV_ROUND_CLOSEST(group->__cpu_power, SCHED_LOAD_SCALE);
+		DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
 }
 
 /**
@@ -3951,7 +3918,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 			return;
 
 		sds->total_load += sgs.group_load;
-		sds->total_pwr += group->__cpu_power;
+		sds->total_pwr += group->cpu_power;
 
 		/*
 		 * In case the child domain prefers tasks go to siblings
@@ -4016,28 +3983,28 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
 	 * moving them.
 	 */
 
-	pwr_now += sds->busiest->__cpu_power *
+	pwr_now += sds->busiest->cpu_power *
 			min(sds->busiest_load_per_task, sds->max_load);
-	pwr_now += sds->this->__cpu_power *
+	pwr_now += sds->this->cpu_power *
 			min(sds->this_load_per_task, sds->this_load);
 	pwr_now /= SCHED_LOAD_SCALE;
 
 	/* Amount of load we'd subtract */
-	tmp = sg_div_cpu_power(sds->busiest,
-			sds->busiest_load_per_task * SCHED_LOAD_SCALE);
+	tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
+		sds->busiest->cpu_power;
 	if (sds->max_load > tmp)
-		pwr_move += sds->busiest->__cpu_power *
+		pwr_move += sds->busiest->cpu_power *
 			min(sds->busiest_load_per_task, sds->max_load - tmp);
 
 	/* Amount of load we'd add */
-	if (sds->max_load * sds->busiest->__cpu_power <
+	if (sds->max_load * sds->busiest->cpu_power <
 		sds->busiest_load_per_task * SCHED_LOAD_SCALE)
-		tmp = sg_div_cpu_power(sds->this,
-			sds->max_load * sds->busiest->__cpu_power);
+		tmp = (sds->max_load * sds->busiest->cpu_power) /
+			sds->this->cpu_power;
 	else
-		tmp = sg_div_cpu_power(sds->this,
-			sds->busiest_load_per_task * SCHED_LOAD_SCALE);
-	pwr_move += sds->this->__cpu_power *
+		tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
+			sds->this->cpu_power;
+	pwr_move += sds->this->cpu_power *
 			min(sds->this_load_per_task, sds->this_load + tmp);
 	pwr_move /= SCHED_LOAD_SCALE;
 
@@ -4072,8 +4039,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
 			sds->max_load - sds->busiest_load_per_task);
 
 	/* How much load to actually move to equalise the imbalance */
-	*imbalance = min(max_pull * sds->busiest->__cpu_power,
-		(sds->avg_load - sds->this_load) * sds->this->__cpu_power)
+	*imbalance = min(max_pull * sds->busiest->cpu_power,
+		(sds->avg_load - sds->this_load) * sds->this->cpu_power)
 			/ SCHED_LOAD_SCALE;
 
 	/*
@@ -4208,7 +4175,7 @@ static unsigned long power_of(int cpu)
 	if (!group)
 		return SCHED_LOAD_SCALE;
 
-	return group->__cpu_power;
+	return group->cpu_power;
 }
 
 /*
@@ -7922,7 +7889,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 			break;
 		}
 
-		if (!group->__cpu_power) {
+		if (!group->cpu_power) {
 			printk(KERN_CONT "\n");
 			printk(KERN_ERR "ERROR: domain->cpu_power not "
 					"set\n");
@@ -7946,9 +7913,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 		cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
 
 		printk(KERN_CONT " %s", str);
-		if (group->__cpu_power != SCHED_LOAD_SCALE) {
-			printk(KERN_CONT " (__cpu_power = %d)",
-				group->__cpu_power);
+		if (group->cpu_power != SCHED_LOAD_SCALE) {
+			printk(KERN_CONT " (cpu_power = %d)",
+				group->cpu_power);
 		}
 
 		group = group->next;
@@ -8233,7 +8200,7 @@ init_sched_build_groups(const struct cpumask *span,
 			continue;
 
 		cpumask_clear(sched_group_cpus(sg));
-		sg->__cpu_power = 0;
+		sg->cpu_power = 0;
 
 		for_each_cpu(j, span) {
 			if (group_fn(j, cpu_map, NULL, tmpmask) != group)
@@ -8491,7 +8458,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
 				continue;
 			}
 
-			sg_inc_cpu_power(sg, sd->groups->__cpu_power);
+			sg->cpu_power += sd->groups->cpu_power;
 		}
 		sg = sg->next;
 	} while (sg != group_head);
@@ -8528,7 +8495,7 @@ static int build_numa_sched_groups(struct s_data *d,
 		sd->groups = sg;
 	}
 
-	sg->__cpu_power = 0;
+	sg->cpu_power = 0;
 	cpumask_copy(sched_group_cpus(sg), d->nodemask);
 	sg->next = sg;
 	cpumask_or(d->covered, d->covered, d->nodemask);
@@ -8551,7 +8518,7 @@ static int build_numa_sched_groups(struct s_data *d,
 			       "Can not alloc domain group for node %d\n", j);
 			return -ENOMEM;
 		}
-		sg->__cpu_power = 0;
+		sg->cpu_power = 0;
 		cpumask_copy(sched_group_cpus(sg), d->tmpmask);
 		sg->next = prev->next;
 		cpumask_or(d->covered, d->covered, d->tmpmask);
@@ -8629,7 +8596,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 
 	child = sd->child;
 
-	sd->groups->__cpu_power = 0;
+	sd->groups->cpu_power = 0;
 
 	if (!child) {
 		power = SCHED_LOAD_SCALE;
@@ -8645,7 +8612,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 			power /= weight;
 			power >>= SCHED_LOAD_SHIFT;
 		}
-		sg_inc_cpu_power(sd->groups, power);
+		sd->groups->cpu_power += power;
 		return;
 	}
 
@@ -8654,7 +8621,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 	 */
 	group = child->groups;
 	do {
-		sg_inc_cpu_power(sd->groups, group->__cpu_power);
+		sd->groups->cpu_power += group->cpu_power;
 		group = group->next;
 	} while (group != child->groups);
 }
-- 
cgit v1.2.3


From d7ea17a76916e456fcc78e45142c66f7fb875e3d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 4 Sep 2009 11:49:25 +0200
Subject: sched: Fix dynamic power-balancing crash

This crash:

[ 1774.088275] divide error: 0000 [#1] SMP
[ 1774.100355] CPU 13
[ 1774.102498] Modules linked in:
[ 1774.105631] Pid: 30881, comm: hackbench Not tainted 2.6.31-rc8-tip-01308-g484d664-dirty #1629 X8DTN
[ 1774.114807] RIP: 0010:[<ffffffff81041c38>]  [<ffffffff81041c38>]
sched_balance_self+0x19b/0x2d4

Triggers because update_group_power() modifies the sd tree and does
temporary calculations there - not considering that other CPUs
could observe intermediate values, such as the zero initial value.

Calculate it in a temporary variable instead. (we need no memory
barrier as these are all statistical values anyway)

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20090904092742.GA11014@elte.hu>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index b5378534685..796baf73197 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3765,19 +3765,22 @@ static void update_group_power(struct sched_domain *sd, int cpu)
 {
 	struct sched_domain *child = sd->child;
 	struct sched_group *group, *sdg = sd->groups;
+	unsigned long power;
 
 	if (!child) {
 		update_cpu_power(sd, cpu);
 		return;
 	}
 
-	sdg->cpu_power = 0;
+	power = 0;
 
 	group = child->groups;
 	do {
-		sdg->cpu_power += group->cpu_power;
+		power += group->cpu_power;
 		group = group->next;
 	} while (group != child->groups);
+
+	sdg->cpu_power = power;
 }
 
 /**
-- 
cgit v1.2.3


From b6d9c25631e7c38dc7be220b04553068fb542683 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Fri, 4 Sep 2009 23:13:21 +0530
Subject: Security/SELinux: includecheck fix kernel/sysctl.c

fix the following 'make includecheck' warning:

  kernel/sysctl.c: linux/security.h is included more than once.

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/sysctl.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 58be76017fd..71d8dc7f992 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -49,7 +49,6 @@
 #include <linux/acpi.h>
 #include <linux/reboot.h>
 #include <linux/ftrace.h>
-#include <linux/security.h>
 #include <linux/slow-work.h>
 #include <linux/perf_counter.h>
 
-- 
cgit v1.2.3


From cdd2ab3de4301728b20efd6225681d3ff591a938 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 7 Sep 2009 18:12:06 +0200
Subject: sched: Remove short cut from select_task_rq_fair()

select_task_rq_fair() incorrectly skips the wake_affine()
logic, remove this.

When prev_cpu == this_cpu, the code jumps straight to the
wake_idle() logic, this doesn't give the wake_affine() logic
the chance to pin the task to this cpu.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 2ff850f90d1..d7fda41ddaf 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1305,8 +1305,6 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
 	this_rq		= cpu_rq(this_cpu);
 	new_cpu		= prev_cpu;
 
-	if (prev_cpu == this_cpu)
-		goto out;
 	/*
 	 * 'this_sd' is the first domain that both
 	 * this_cpu and prev_cpu are present in:
-- 
cgit v1.2.3


From 71a29aa7b600595d0ef373ea605ac656876d1f2f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 7 Sep 2009 18:28:05 +0200
Subject: sched: Deal with low-load in wake_affine()

wake_affine() would always fail under low-load situations where
both prev and this were idle, because adding a single task will
always be a significant imbalance, even if there's nothing
around that could balance it.

Deal with this by allowing imbalance when there's nothing you
can do about it.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index d7fda41ddaf..cc97ea498f2 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1262,7 +1262,17 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
 	tg = task_group(p);
 	weight = p->se.load.weight;
 
-	balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
+	/*
+	 * In low-load situations, where prev_cpu is idle and this_cpu is idle
+	 * due to the sync cause above having dropped tl to 0, we'll always have
+	 * an imbalance, but there's really nothing you can do about that, so
+	 * that's good too.
+	 *
+	 * Otherwise check if either cpus are near enough in load to allow this
+	 * task to be woken on this_cpu.
+	 */
+	balanced = !tl ||
+		100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
 		imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
 
 	/*
-- 
cgit v1.2.3


From b5d9d734a53e0204aab0089079cbde2a1285a38f Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Tue, 8 Sep 2009 11:12:28 +0200
Subject: sched: Ensure that a child can't gain time over it's parent after
 fork()

A fork/exec load is usually "pass the baton", so the child
should never be placed behind the parent.  With START_DEBIT we
make room for the new task, but with child_runs_first, that
room comes out of the _parent's_ hide. There's nothing to say
that the parent wasn't ahead of min_vruntime at fork() time,
which means that the "baton carrier", who is essentially the
parent in drag, can gain time and increase scheduling latencies
for waiters.

With NEW_FAIR_SLEEPERS + START_DEBIT + child_runs_first
enabled, we essentially pass the sleeper fairness off to the
child, which is fine, but if we don't base placement on the
parent's updated vruntime, we can end up compounding latency
woes if the child itself then does fork/exec.  The debit
incurred at fork doesn't hurt the parent who is then going to
sleep and maybe exit, but the child who acquires the error
harms all comers.

This improves latencies of make -j<n> kernel build workloads.

Reported-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Mike Galbraith <efault@gmx.de>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index cc97ea498f2..e386e5dfcda 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -728,11 +728,11 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 
 			vruntime -= thresh;
 		}
-
-		/* ensure we never gain time by being placed backwards. */
-		vruntime = max_vruntime(se->vruntime, vruntime);
 	}
 
+	/* ensure we never gain time by being placed backwards. */
+	vruntime = max_vruntime(se->vruntime, vruntime);
+
 	se->vruntime = vruntime;
 }
 
@@ -1756,6 +1756,8 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
 	sched_info_queued(p);
 
 	update_curr(cfs_rq);
+	if (curr)
+		se->vruntime = curr->vruntime;
 	place_entity(cfs_rq, se, 1);
 
 	/* 'curr' will be NULL if the child belongs to a different group */
-- 
cgit v1.2.3


From 2bba22c50b06abe9fd0d23933b1e64d35b419262 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Wed, 9 Sep 2009 15:41:37 +0200
Subject: sched: Turn off child_runs_first

Set child_runs_first default to off.

It hurts 'optimal' make -j<NR_CPUS> workloads as make jobs
get preempted by child tasks, reducing parallelism.

Note, this patch might make existing races in user
applications more prominent than before - so breakages
might be bisected to this commit.

Child-runs-first is broken on SMP to begin with, and we
already had it off briefly in v2.6.23 so most of the
offenders ought to be fixed. Would be nice not to revert
this commit but fix those apps finally ...

Signed-off-by: Mike Galbraith <efault@gmx.de>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1252486344.28645.18.camel@marge.simson.net>
[ made the sysctl independent of CONFIG_SCHED_DEBUG, in case
  people want to work around broken apps. ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c |  4 ++--
 kernel/sysctl.c     | 16 ++++++++--------
 2 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e386e5dfcda..af325a382b1 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -48,10 +48,10 @@ unsigned int sysctl_sched_min_granularity = 4000000ULL;
 static unsigned int sched_nr_latency = 5;
 
 /*
- * After fork, child runs first. (default) If set to 0 then
+ * After fork, child runs first. If set to 0 (default) then
  * parent will (try to) run first.
  */
-const_debug unsigned int sysctl_sched_child_runs_first = 1;
+unsigned int sysctl_sched_child_runs_first __read_mostly;
 
 /*
  * sys_sched_yield() compat mode
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6c9836ef4b4..25d6bf3383b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -246,6 +246,14 @@ static int max_wakeup_granularity_ns = NSEC_PER_SEC;	/* 1 second */
 #endif
 
 static struct ctl_table kern_table[] = {
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_child_runs_first",
+		.data		= &sysctl_sched_child_runs_first,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 #ifdef CONFIG_SCHED_DEBUG
 	{
 		.ctl_name	= CTL_UNNUMBERED,
@@ -298,14 +306,6 @@ static struct ctl_table kern_table[] = {
 		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 	},
-	{
-		.ctl_name	= CTL_UNNUMBERED,
-		.procname	= "sched_child_runs_first",
-		.data		= &sysctl_sched_child_runs_first,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
-	},
 	{
 		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "sched_features",
-- 
cgit v1.2.3


From 172e082a9111ea504ee34cbba26284a5ebdc53a7 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Wed, 9 Sep 2009 15:41:37 +0200
Subject: sched: Re-tune the scheduler latency defaults to decrease worst-case
 latencies

Reduce the latency target from 20 msecs to 5 msecs.

Why? Larger latencies increase spread, which is good for scaling,
but bad for worst case latency.

We still have the ilog(nr_cpus) rule to scale up on bigger
server boxes.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1252486344.28645.18.camel@marge.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index af325a382b1..26fadb44250 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -24,7 +24,7 @@
 
 /*
  * Targeted preemption latency for CPU-bound tasks:
- * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds)
  *
  * NOTE: this latency value is not the same as the concept of
  * 'timeslice length' - timeslices in CFS are of variable length
@@ -34,13 +34,13 @@
  * (to see the precise effective timeslice length of your workload,
  *  run vmstat and monitor the context-switches (cs) field)
  */
-unsigned int sysctl_sched_latency = 20000000ULL;
+unsigned int sysctl_sched_latency = 5000000ULL;
 
 /*
  * Minimal preemption granularity for CPU-bound tasks:
- * (default: 4 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
  */
-unsigned int sysctl_sched_min_granularity = 4000000ULL;
+unsigned int sysctl_sched_min_granularity = 1000000ULL;
 
 /*
  * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
@@ -63,13 +63,13 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
 
 /*
  * SCHED_OTHER wake-up granularity.
- * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
  *
  * This option delays the preemption effects of decoupled workloads
  * and reduces their over-scheduling. Synchronous workloads will still
  * have immediate wakeup/sleep latencies.
  */
-unsigned int sysctl_sched_wakeup_granularity = 5000000UL;
+unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
 
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
 
-- 
cgit v1.2.3


From 61cbe54d9479ad98283b2dda686deae4c34b2d59 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Wed, 9 Sep 2009 15:41:37 +0200
Subject: sched: Keep kthreads at default priority

Removes kthread/workqueue priority boost, they increase worst-case
desktop latencies.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1252486344.28645.18.camel@marge.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/kthread.c   | 4 ----
 kernel/workqueue.c | 2 --
 2 files changed, 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kthread.c b/kernel/kthread.c
index eb8751aa041..5fe709982ca 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -16,8 +16,6 @@
 #include <linux/mutex.h>
 #include <trace/events/sched.h>
 
-#define KTHREAD_NICE_LEVEL (-5)
-
 static DEFINE_SPINLOCK(kthread_create_lock);
 static LIST_HEAD(kthread_create_list);
 struct task_struct *kthreadd_task;
@@ -145,7 +143,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
 		 * The kernel thread should not inherit these properties.
 		 */
 		sched_setscheduler_nocheck(create.result, SCHED_NORMAL, &param);
-		set_user_nice(create.result, KTHREAD_NICE_LEVEL);
 		set_cpus_allowed_ptr(create.result, cpu_all_mask);
 	}
 	return create.result;
@@ -221,7 +218,6 @@ int kthreadd(void *unused)
 	/* Setup a clean context for our children to inherit. */
 	set_task_comm(tsk, "kthreadd");
 	ignore_signals(tsk);
-	set_user_nice(tsk, KTHREAD_NICE_LEVEL);
 	set_cpus_allowed_ptr(tsk, cpu_all_mask);
 	set_mems_allowed(node_possible_map);
 
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0668795d881..ea1b4e7674d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -317,8 +317,6 @@ static int worker_thread(void *__cwq)
 	if (cwq->wq->freezeable)
 		set_freezable();
 
-	set_user_nice(current, -5);
-
 	for (;;) {
 		prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE);
 		if (!freezing(current) &&
-- 
cgit v1.2.3


From 3f2aa307c4d26b4ed6509d0a79e8254c9e07e921 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 10 Sep 2009 20:34:48 +0200
Subject: sched: Disable NEW_FAIR_SLEEPERS for now

Nikos Chantziaras and Jens Axboe reported that turning off
NEW_FAIR_SLEEPERS improves desktop interactivity visibly.

Nikos described his experiences the following way:

  " With this setting, I can do "nice -n 19 make -j20" and
    still have a very smooth desktop and watch a movie at
    the same time.  Various other annoyances (like the
    "logout/shutdown/restart" dialog of KDE not appearing
    at all until the background fade-out effect has finished)
    are also gone.  So this seems to be the single most
    important setting that vastly improves desktop behavior,
    at least here. "

Jens described it the following way, referring to a 10-seconds
xmodmap scheduling delay he was trying to debug:

  " Then I tried switching NO_NEW_FAIR_SLEEPERS on, and then
    I get:

    Performance counter stats for 'xmodmap .xmodmap-carl':

         9.009137  task-clock-msecs         #      0.447 CPUs
               18  context-switches         #      0.002 M/sec
                1  CPU-migrations           #      0.000 M/sec
              315  page-faults              #      0.035 M/sec

    0.020167093  seconds time elapsed

    Woot! "

So disable it for now. In perf trace output i can see weird
delta timestamps:

  cc1-9943  [001]  2802.059479616: sched_stat_wait: task: as:9944 wait: 2801938766276 [ns]

That nsec field is not supposed to be that large. More digging
is needed - but lets turn it off while the real bug is found.

Reported-by: Nikos Chantziaras <realnc@arcor.de>
Tested-by: Nikos Chantziaras <realnc@arcor.de>
Reported-by: Jens Axboe <jens.axboe@oracle.com>
Tested-by: Jens Axboe <jens.axboe@oracle.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <4AA93D34.8040500@arcor.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_features.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 4569bfa7df9..e2dc63a5815 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,4 +1,4 @@
-SCHED_FEAT(NEW_FAIR_SLEEPERS, 1)
+SCHED_FEAT(NEW_FAIR_SLEEPERS, 0)
 SCHED_FEAT(NORMALIZED_SLEEPER, 0)
 SCHED_FEAT(ADAPTIVE_GRAN, 1)
 SCHED_FEAT(WAKEUP_PREEMPT, 1)
-- 
cgit v1.2.3


From e1f8450854d69f0291882804406ea1bab3ca44b4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 10 Sep 2009 20:52:09 +0200
Subject: sched: Fix sched::sched_stat_wait tracepoint field

This weird perf trace output:

  cc1-9943  [001]  2802.059479616: sched_stat_wait: task: as:9944 wait: 2801938766276 [ns]

Is caused by setting one component field of the delta to zero
a bit too early. Move it to later.

( Note, this does not affect the NEW_FAIR_SLEEPERS interactivity bug,
  it's just a reporting bug in essence. )

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Nikos Chantziaras <realnc@arcor.de>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <4AA93D34.8040500@arcor.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 26fadb44250..aa7f8412101 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -545,14 +545,13 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	schedstat_set(se->wait_count, se->wait_count + 1);
 	schedstat_set(se->wait_sum, se->wait_sum +
 			rq_of(cfs_rq)->clock - se->wait_start);
-	schedstat_set(se->wait_start, 0);
-
 #ifdef CONFIG_SCHEDSTATS
 	if (entity_is_task(se)) {
 		trace_sched_stat_wait(task_of(se),
 			rq_of(cfs_rq)->clock - se->wait_start);
 	}
 #endif
+	schedstat_set(se->wait_start, 0);
 }
 
 static inline void
-- 
cgit v1.2.3


From d993831fa7ffeb89e994f046f93eeb09ec91df08 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 12 Jun 2009 14:45:52 +0200
Subject: writeback: add name to backing_dev_info

This enables us to track who does what and print info. Its main use
is catching dirty inodes on the default_backing_dev_info, so we can
fix that up.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 kernel/cgroup.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b6eadfe30e7..c7ece8f027f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -600,6 +600,7 @@ static struct inode_operations cgroup_dir_inode_operations;
 static struct file_operations proc_cgroupstats_operations;
 
 static struct backing_dev_info cgroup_backing_dev_info = {
+	.name		= "cgroup",
 	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
 };
 
-- 
cgit v1.2.3


From 5e605b64a183a6c0e84cdb99a6f8acb1f8200437 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 5 Aug 2009 09:07:21 +0200
Subject: block: add blk-iopoll, a NAPI like approach for block devices

This borrows some code from NAPI and implements a polled completion
mode for block devices. The idea is the same as NAPI - instead of
doing the command completion when the irq occurs, schedule a dedicated
softirq in the hopes that we will complete more IO when the iopoll
handler is invoked. Devices have a budget of commands assigned, and will
stay in polled mode as long as they continue to consume their budget
from the iopoll softirq handler. If they do not, the device is set back
to interrupt completion mode.

This patch holds the core bits for blk-iopoll, device driver support
sold separately.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 kernel/sysctl.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 58be76017fd..0ed9fa6f322 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -92,6 +92,7 @@ extern int sysctl_nr_trim_pages;
 #ifdef CONFIG_RCU_TORTURE_TEST
 extern int rcutorture_runnable;
 #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
+extern int blk_iopoll_enabled;
 
 /* Constants used for minimum and  maximum */
 #ifdef CONFIG_DETECT_SOFTLOCKUP
@@ -990,7 +991,14 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 #endif
-
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "blk_iopoll",
+		.data		= &blk_iopoll_enabled,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 /*
  * NOTE: do not add new entries to this table unless you have read
  * Documentation/sysctl/ctl_unnumbered.txt
-- 
cgit v1.2.3


From e681c9dd62fe8fcc5bba28a3ca3f7dc8be940206 Mon Sep 17 00:00:00 2001
From: Thadeu Lima de Souza Cascardo <cascardo@holoscopio.com>
Date: Wed, 8 Jul 2009 13:23:32 +0200
Subject: PM: Fix typo in label name s/Platofrm_finish/Platform_finish/

Although the same label name is used somewhere else in the file, this
particular label was consistently typoed in all of its uses.

Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@holoscopio.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/hibernate.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 81d2e746489..ec8202512b0 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -460,11 +460,11 @@ int hibernation_platform_enter(void)
 
 	error = hibernation_ops->prepare();
 	if (error)
-		goto Platofrm_finish;
+		goto Platform_finish;
 
 	error = disable_nonboot_cpus();
 	if (error)
-		goto Platofrm_finish;
+		goto Platform_finish;
 
 	local_irq_disable();
 	sysdev_suspend(PMSG_HIBERNATE);
@@ -476,7 +476,7 @@ int hibernation_platform_enter(void)
 	 * We don't need to reenable the nonboot CPUs or resume consoles, since
 	 * the system is going to be halted anyway.
 	 */
- Platofrm_finish:
+ Platform_finish:
 	hibernation_ops->finish();
 
 	dpm_suspend_noirq(PMSG_RESTORE);
-- 
cgit v1.2.3


From 4bb334353ebd821bc8eeabeb019eaac33c7307df Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 8 Jul 2009 13:23:51 +0200
Subject: PM/Hibernate: Rework shrinking of memory

Rework swsusp_shrink_memory() so that it calls shrink_all_memory()
just once to make some room for the image and then allocates memory
to apply more pressure to the memory management subsystem, if
necessary.

Unfortunately, we don't seem to be able to drop shrink_all_memory()
entirely just yet, because that would lead to huge performance
regressions in some test cases.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
---
 kernel/power/snapshot.c | 205 +++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 158 insertions(+), 47 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 523a451b45d..a3a175fa0a4 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1066,69 +1066,180 @@ void swsusp_free(void)
 	buffer = NULL;
 }
 
+/* Helper functions used for the shrinking of memory. */
+
+#define GFP_IMAGE	(GFP_KERNEL | __GFP_NOWARN)
+
 /**
- *	swsusp_shrink_memory -  Try to free as much memory as needed
- *
- *	... but do not OOM-kill anyone
+ * preallocate_image_pages - Allocate a number of pages for hibernation image
+ * @nr_pages: Number of page frames to allocate.
+ * @mask: GFP flags to use for the allocation.
  *
- *	Notice: all userland should be stopped before it is called, or
- *	livelock is possible.
+ * Return value: Number of page frames actually allocated
  */
+static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask)
+{
+	unsigned long nr_alloc = 0;
+
+	while (nr_pages > 0) {
+		if (!alloc_image_page(mask))
+			break;
+		nr_pages--;
+		nr_alloc++;
+	}
 
-#define SHRINK_BITE	10000
-static inline unsigned long __shrink_memory(long tmp)
+	return nr_alloc;
+}
+
+static unsigned long preallocate_image_memory(unsigned long nr_pages)
+{
+	return preallocate_image_pages(nr_pages, GFP_IMAGE);
+}
+
+#ifdef CONFIG_HIGHMEM
+static unsigned long preallocate_image_highmem(unsigned long nr_pages)
 {
-	if (tmp > SHRINK_BITE)
-		tmp = SHRINK_BITE;
-	return shrink_all_memory(tmp);
+	return preallocate_image_pages(nr_pages, GFP_IMAGE | __GFP_HIGHMEM);
 }
 
+/**
+ *  __fraction - Compute (an approximation of) x * (multiplier / base)
+ */
+static unsigned long __fraction(u64 x, u64 multiplier, u64 base)
+{
+	x *= multiplier;
+	do_div(x, base);
+	return (unsigned long)x;
+}
+
+static unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
+						unsigned long highmem,
+						unsigned long total)
+{
+	unsigned long alloc = __fraction(nr_pages, highmem, total);
+
+	return preallocate_image_pages(alloc, GFP_IMAGE | __GFP_HIGHMEM);
+}
+#else /* CONFIG_HIGHMEM */
+static inline unsigned long preallocate_image_highmem(unsigned long nr_pages)
+{
+	return 0;
+}
+
+static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
+						unsigned long highmem,
+						unsigned long total)
+{
+	return 0;
+}
+#endif /* CONFIG_HIGHMEM */
+
+/**
+ * swsusp_shrink_memory -  Make the kernel release as much memory as needed
+ *
+ * To create a hibernation image it is necessary to make a copy of every page
+ * frame in use.  We also need a number of page frames to be free during
+ * hibernation for allocations made while saving the image and for device
+ * drivers, in case they need to allocate memory from their hibernation
+ * callbacks (these two numbers are given by PAGES_FOR_IO and SPARE_PAGES,
+ * respectively, both of which are rough estimates).  To make this happen, we
+ * compute the total number of available page frames and allocate at least
+ *
+ * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2 + 2 * SPARE_PAGES
+ *
+ * of them, which corresponds to the maximum size of a hibernation image.
+ *
+ * If image_size is set below the number following from the above formula,
+ * the preallocation of memory is continued until the total number of saveable
+ * pages in the system is below the requested image size or it is impossible to
+ * allocate more memory, whichever happens first.
+ */
 int swsusp_shrink_memory(void)
 {
-	long tmp;
 	struct zone *zone;
-	unsigned long pages = 0;
-	unsigned int i = 0;
-	char *p = "-\\|/";
+	unsigned long saveable, size, max_size, count, highmem, pages = 0;
+	unsigned long alloc, pages_highmem;
 	struct timeval start, stop;
+	int error = 0;
 
-	printk(KERN_INFO "PM: Shrinking memory...  ");
+	printk(KERN_INFO "PM: Shrinking memory... ");
 	do_gettimeofday(&start);
-	do {
-		long size, highmem_size;
-
-		highmem_size = count_highmem_pages();
-		size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES;
-		tmp = size;
-		size += highmem_size;
-		for_each_populated_zone(zone) {
-			tmp += snapshot_additional_pages(zone);
-			if (is_highmem(zone)) {
-				highmem_size -=
-					zone_page_state(zone, NR_FREE_PAGES);
-			} else {
-				tmp -= zone_page_state(zone, NR_FREE_PAGES);
-				tmp += zone->lowmem_reserve[ZONE_NORMAL];
-			}
-		}
 
-		if (highmem_size < 0)
-			highmem_size = 0;
+	/* Count the number of saveable data pages. */
+	highmem = count_highmem_pages();
+	saveable = count_data_pages();
 
-		tmp += highmem_size;
-		if (tmp > 0) {
-			tmp = __shrink_memory(tmp);
-			if (!tmp)
-				return -ENOMEM;
-			pages += tmp;
-		} else if (size > image_size / PAGE_SIZE) {
-			tmp = __shrink_memory(size - (image_size / PAGE_SIZE));
-			pages += tmp;
-		}
-		printk("\b%c", p[i++%4]);
-	} while (tmp > 0);
+	/*
+	 * Compute the total number of page frames we can use (count) and the
+	 * number of pages needed for image metadata (size).
+	 */
+	count = saveable;
+	saveable += highmem;
+	size = 0;
+	for_each_populated_zone(zone) {
+		size += snapshot_additional_pages(zone);
+		if (is_highmem(zone))
+			highmem += zone_page_state(zone, NR_FREE_PAGES);
+		else
+			count += zone_page_state(zone, NR_FREE_PAGES);
+	}
+	count += highmem;
+	count -= totalreserve_pages;
+
+	/* Compute the maximum number of saveable pages to leave in memory. */
+	max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES;
+	size = DIV_ROUND_UP(image_size, PAGE_SIZE);
+	if (size > max_size)
+		size = max_size;
+	/*
+	 * If the maximum is not less than the current number of saveable pages
+	 * in memory, we don't need to do anything more.
+	 */
+	if (size >= saveable)
+		goto out;
+
+	/*
+	 * Let the memory management subsystem know that we're going to need a
+	 * large number of page frames to allocate and make it free some memory.
+	 * NOTE: If this is not done, performance will be hurt badly in some
+	 * test cases.
+	 */
+	shrink_all_memory(saveable - size);
+
+	/*
+	 * The number of saveable pages in memory was too high, so apply some
+	 * pressure to decrease it.  First, make room for the largest possible
+	 * image and fail if that doesn't work.  Next, try to decrease the size
+	 * of the image as much as indicated by image_size using allocations
+	 * from highmem and non-highmem zones separately.
+	 */
+	pages_highmem = preallocate_image_highmem(highmem / 2);
+	alloc = (count - max_size) - pages_highmem;
+	pages = preallocate_image_memory(alloc);
+	if (pages < alloc) {
+		error = -ENOMEM;
+		goto free_out;
+	}
+	size = max_size - size;
+	alloc = size;
+	size = preallocate_highmem_fraction(size, highmem, count);
+	pages_highmem += size;
+	alloc -= size;
+	pages += preallocate_image_memory(alloc);
+	pages += pages_highmem;
+
+ free_out:
+	/* Release all of the preallocated page frames. */
+	swsusp_free();
+
+	if (error) {
+		printk(KERN_CONT "\n");
+		return error;
+	}
+
+ out:
 	do_gettimeofday(&stop);
-	printk("\bdone (%lu pages freed)\n", pages);
+	printk(KERN_CONT "done (preallocated %lu free pages)\n", pages);
 	swsusp_show_speed(&start, &stop, pages, "Freed");
 
 	return 0;
-- 
cgit v1.2.3


From 64a473cb74a88cb4991edf985d55a266e65292e1 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 8 Jul 2009 13:24:05 +0200
Subject: PM/Hibernate: Do not release preallocated memory unnecessarily (rev.
 2)

Since the hibernation code is now going to use allocations of memory
to make enough room for the image, it can also use the page frames
allocated at this stage as image page frames.  The low-level
hibernation code needs to be rearranged for this purpose, but it
allows us to avoid freeing a great number of pages and allocating
these same pages once again later, so it generally is worth doing.

[rev. 2: Take highmem into account correctly.]

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/hibernate.c |  15 +++-
 kernel/power/power.h     |   2 +-
 kernel/power/snapshot.c  | 202 +++++++++++++++++++++++++++++++----------------
 3 files changed, 147 insertions(+), 72 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index ec8202512b0..04b3a83d686 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -298,8 +298,8 @@ int hibernation_snapshot(int platform_mode)
 	if (error)
 		return error;
 
-	/* Free memory before shutting down devices. */
-	error = swsusp_shrink_memory();
+	/* Preallocate image memory before shutting down devices. */
+	error = hibernate_preallocate_memory();
 	if (error)
 		goto Close;
 
@@ -315,6 +315,10 @@ int hibernation_snapshot(int platform_mode)
 	/* Control returns here after successful restore */
 
  Resume_devices:
+	/* We may need to release the preallocated image pages here. */
+	if (error || !in_suspend)
+		swsusp_free();
+
 	dpm_resume_end(in_suspend ?
 		(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
 	resume_console();
@@ -578,7 +582,10 @@ int hibernate(void)
 		goto Thaw;
 
 	error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM);
-	if (in_suspend && !error) {
+	if (error)
+		goto Thaw;
+
+	if (in_suspend) {
 		unsigned int flags = 0;
 
 		if (hibernation_mode == HIBERNATION_PLATFORM)
@@ -590,8 +597,8 @@ int hibernate(void)
 			power_down();
 	} else {
 		pr_debug("PM: Image restored successfully.\n");
-		swsusp_free();
 	}
+
  Thaw:
 	thaw_processes();
  Finish:
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 26d5a26f82e..46c5a26630a 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -74,7 +74,7 @@ extern asmlinkage int swsusp_arch_resume(void);
 
 extern int create_basic_memory_bitmaps(void);
 extern void free_basic_memory_bitmaps(void);
-extern int swsusp_shrink_memory(void);
+extern int hibernate_preallocate_memory(void);
 
 /**
  *	Auxiliary structure used for reading the snapshot image data and
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index a3a175fa0a4..2b1a7bc24c9 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1033,6 +1033,25 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
 static unsigned int nr_copy_pages;
 /* Number of pages needed for saving the original pfns of the image pages */
 static unsigned int nr_meta_pages;
+/*
+ * Numbers of normal and highmem page frames allocated for hibernation image
+ * before suspending devices.
+ */
+unsigned int alloc_normal, alloc_highmem;
+/*
+ * Memory bitmap used for marking saveable pages (during hibernation) or
+ * hibernation image pages (during restore)
+ */
+static struct memory_bitmap orig_bm;
+/*
+ * Memory bitmap used during hibernation for marking allocated page frames that
+ * will contain copies of saveable pages.  During restore it is initially used
+ * for marking hibernation image pages, but then the set bits from it are
+ * duplicated in @orig_bm and it is released.  On highmem systems it is next
+ * used for marking "safe" highmem pages, but it has to be reinitialized for
+ * this purpose.
+ */
+static struct memory_bitmap copy_bm;
 
 /**
  *	swsusp_free - free pages allocated for the suspend.
@@ -1064,6 +1083,8 @@ void swsusp_free(void)
 	nr_meta_pages = 0;
 	restore_pblist = NULL;
 	buffer = NULL;
+	alloc_normal = 0;
+	alloc_highmem = 0;
 }
 
 /* Helper functions used for the shrinking of memory. */
@@ -1082,8 +1103,16 @@ static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask)
 	unsigned long nr_alloc = 0;
 
 	while (nr_pages > 0) {
-		if (!alloc_image_page(mask))
+		struct page *page;
+
+		page = alloc_image_page(mask);
+		if (!page)
 			break;
+		memory_bm_set_bit(&copy_bm, page_to_pfn(page));
+		if (PageHighMem(page))
+			alloc_highmem++;
+		else
+			alloc_normal++;
 		nr_pages--;
 		nr_alloc++;
 	}
@@ -1135,7 +1164,47 @@ static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
 #endif /* CONFIG_HIGHMEM */
 
 /**
- * swsusp_shrink_memory -  Make the kernel release as much memory as needed
+ * free_unnecessary_pages - Release preallocated pages not needed for the image
+ */
+static void free_unnecessary_pages(void)
+{
+	unsigned long save_highmem, to_free_normal, to_free_highmem;
+
+	to_free_normal = alloc_normal - count_data_pages();
+	save_highmem = count_highmem_pages();
+	if (alloc_highmem > save_highmem) {
+		to_free_highmem = alloc_highmem - save_highmem;
+	} else {
+		to_free_highmem = 0;
+		to_free_normal -= save_highmem - alloc_highmem;
+	}
+
+	memory_bm_position_reset(&copy_bm);
+
+	while (to_free_normal > 0 && to_free_highmem > 0) {
+		unsigned long pfn = memory_bm_next_pfn(&copy_bm);
+		struct page *page = pfn_to_page(pfn);
+
+		if (PageHighMem(page)) {
+			if (!to_free_highmem)
+				continue;
+			to_free_highmem--;
+			alloc_highmem--;
+		} else {
+			if (!to_free_normal)
+				continue;
+			to_free_normal--;
+			alloc_normal--;
+		}
+		memory_bm_clear_bit(&copy_bm, pfn);
+		swsusp_unset_page_forbidden(page);
+		swsusp_unset_page_free(page);
+		__free_page(page);
+	}
+}
+
+/**
+ * hibernate_preallocate_memory - Preallocate memory for hibernation image
  *
  * To create a hibernation image it is necessary to make a copy of every page
  * frame in use.  We also need a number of page frames to be free during
@@ -1154,19 +1223,30 @@ static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
  * pages in the system is below the requested image size or it is impossible to
  * allocate more memory, whichever happens first.
  */
-int swsusp_shrink_memory(void)
+int hibernate_preallocate_memory(void)
 {
 	struct zone *zone;
 	unsigned long saveable, size, max_size, count, highmem, pages = 0;
-	unsigned long alloc, pages_highmem;
+	unsigned long alloc, save_highmem, pages_highmem;
 	struct timeval start, stop;
-	int error = 0;
+	int error;
 
-	printk(KERN_INFO "PM: Shrinking memory... ");
+	printk(KERN_INFO "PM: Preallocating image memory... ");
 	do_gettimeofday(&start);
 
+	error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY);
+	if (error)
+		goto err_out;
+
+	error = memory_bm_create(&copy_bm, GFP_IMAGE, PG_ANY);
+	if (error)
+		goto err_out;
+
+	alloc_normal = 0;
+	alloc_highmem = 0;
+
 	/* Count the number of saveable data pages. */
-	highmem = count_highmem_pages();
+	save_highmem = count_highmem_pages();
 	saveable = count_data_pages();
 
 	/*
@@ -1174,7 +1254,8 @@ int swsusp_shrink_memory(void)
 	 * number of pages needed for image metadata (size).
 	 */
 	count = saveable;
-	saveable += highmem;
+	saveable += save_highmem;
+	highmem = save_highmem;
 	size = 0;
 	for_each_populated_zone(zone) {
 		size += snapshot_additional_pages(zone);
@@ -1193,10 +1274,13 @@ int swsusp_shrink_memory(void)
 		size = max_size;
 	/*
 	 * If the maximum is not less than the current number of saveable pages
-	 * in memory, we don't need to do anything more.
+	 * in memory, allocate page frames for the image and we're done.
 	 */
-	if (size >= saveable)
+	if (size >= saveable) {
+		pages = preallocate_image_highmem(save_highmem);
+		pages += preallocate_image_memory(saveable - pages);
 		goto out;
+	}
 
 	/*
 	 * Let the memory management subsystem know that we're going to need a
@@ -1216,10 +1300,8 @@ int swsusp_shrink_memory(void)
 	pages_highmem = preallocate_image_highmem(highmem / 2);
 	alloc = (count - max_size) - pages_highmem;
 	pages = preallocate_image_memory(alloc);
-	if (pages < alloc) {
-		error = -ENOMEM;
-		goto free_out;
-	}
+	if (pages < alloc)
+		goto err_out;
 	size = max_size - size;
 	alloc = size;
 	size = preallocate_highmem_fraction(size, highmem, count);
@@ -1228,21 +1310,24 @@ int swsusp_shrink_memory(void)
 	pages += preallocate_image_memory(alloc);
 	pages += pages_highmem;
 
- free_out:
-	/* Release all of the preallocated page frames. */
-	swsusp_free();
-
-	if (error) {
-		printk(KERN_CONT "\n");
-		return error;
-	}
+	/*
+	 * We only need as many page frames for the image as there are saveable
+	 * pages in memory, but we have allocated more.  Release the excessive
+	 * ones now.
+	 */
+	free_unnecessary_pages();
 
  out:
 	do_gettimeofday(&stop);
-	printk(KERN_CONT "done (preallocated %lu free pages)\n", pages);
-	swsusp_show_speed(&start, &stop, pages, "Freed");
+	printk(KERN_CONT "done (allocated %lu pages)\n", pages);
+	swsusp_show_speed(&start, &stop, pages, "Allocated");
 
 	return 0;
+
+ err_out:
+	printk(KERN_CONT "\n");
+	swsusp_free();
+	return -ENOMEM;
 }
 
 #ifdef CONFIG_HIGHMEM
@@ -1253,7 +1338,7 @@ int swsusp_shrink_memory(void)
 
 static unsigned int count_pages_for_highmem(unsigned int nr_highmem)
 {
-	unsigned int free_highmem = count_free_highmem_pages();
+	unsigned int free_highmem = count_free_highmem_pages() + alloc_highmem;
 
 	if (free_highmem >= nr_highmem)
 		nr_highmem = 0;
@@ -1275,19 +1360,17 @@ count_pages_for_highmem(unsigned int nr_highmem) { return 0; }
 static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
 {
 	struct zone *zone;
-	unsigned int free = 0, meta = 0;
+	unsigned int free = alloc_normal;
 
-	for_each_zone(zone) {
-		meta += snapshot_additional_pages(zone);
+	for_each_zone(zone)
 		if (!is_highmem(zone))
 			free += zone_page_state(zone, NR_FREE_PAGES);
-	}
 
 	nr_pages += count_pages_for_highmem(nr_highmem);
-	pr_debug("PM: Normal pages needed: %u + %u + %u, available pages: %u\n",
-		nr_pages, PAGES_FOR_IO, meta, free);
+	pr_debug("PM: Normal pages needed: %u + %u, available pages: %u\n",
+		nr_pages, PAGES_FOR_IO, free);
 
-	return free > nr_pages + PAGES_FOR_IO + meta;
+	return free > nr_pages + PAGES_FOR_IO;
 }
 
 #ifdef CONFIG_HIGHMEM
@@ -1309,7 +1392,7 @@ static inline int get_highmem_buffer(int safe_needed)
  */
 
 static inline unsigned int
-alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
+alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
 {
 	unsigned int to_alloc = count_free_highmem_pages();
 
@@ -1329,7 +1412,7 @@ alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
 static inline int get_highmem_buffer(int safe_needed) { return 0; }
 
 static inline unsigned int
-alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int n) { return 0; }
+alloc_highmem_pages(struct memory_bitmap *bm, unsigned int n) { return 0; }
 #endif /* CONFIG_HIGHMEM */
 
 /**
@@ -1348,51 +1431,36 @@ static int
 swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
 		unsigned int nr_pages, unsigned int nr_highmem)
 {
-	int error;
-
-	error = memory_bm_create(orig_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY);
-	if (error)
-		goto Free;
-
-	error = memory_bm_create(copy_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY);
-	if (error)
-		goto Free;
+	int error = 0;
 
 	if (nr_highmem > 0) {
 		error = get_highmem_buffer(PG_ANY);
 		if (error)
-			goto Free;
-
-		nr_pages += alloc_highmem_image_pages(copy_bm, nr_highmem);
+			goto err_out;
+		if (nr_highmem > alloc_highmem) {
+			nr_highmem -= alloc_highmem;
+			nr_pages += alloc_highmem_pages(copy_bm, nr_highmem);
+		}
 	}
-	while (nr_pages-- > 0) {
-		struct page *page = alloc_image_page(GFP_ATOMIC | __GFP_COLD);
-
-		if (!page)
-			goto Free;
+	if (nr_pages > alloc_normal) {
+		nr_pages -= alloc_normal;
+		while (nr_pages-- > 0) {
+			struct page *page;
 
-		memory_bm_set_bit(copy_bm, page_to_pfn(page));
+			page = alloc_image_page(GFP_ATOMIC | __GFP_COLD);
+			if (!page)
+				goto err_out;
+			memory_bm_set_bit(copy_bm, page_to_pfn(page));
+		}
 	}
+
 	return 0;
 
- Free:
+ err_out:
 	swsusp_free();
-	return -ENOMEM;
+	return error;
 }
 
-/* Memory bitmap used for marking saveable pages (during suspend) or the
- * suspend image pages (during resume)
- */
-static struct memory_bitmap orig_bm;
-/* Memory bitmap used on suspend for marking allocated pages that will contain
- * the copies of saveable pages.  During resume it is initially used for
- * marking the suspend image pages, but then its set bits are duplicated in
- * @orig_bm and it is released.  Next, on systems with high memory, it may be
- * used for marking "safe" highmem pages, but it has to be reinitialized for
- * this purpose.
- */
-static struct memory_bitmap copy_bm;
-
 asmlinkage int swsusp_save(void)
 {
 	unsigned int nr_pages, nr_highmem;
-- 
cgit v1.2.3


From ef4aede3f10d82adef1fb044b565ba5f08f851e0 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 8 Jul 2009 13:24:12 +0200
Subject: PM/Hibernate: Do not try to allocate too much memory too hard (rev.
 2)

We want to avoid attempting to free too much memory too hard during
hibernation, so estimate the minimum size of the image to use as the
lower limit for preallocating memory.

The approach here is based on the (experimental) observation that we
can't free more page frames than the sum of:

* global_page_state(NR_SLAB_RECLAIMABLE)
* global_page_state(NR_ACTIVE_ANON)
* global_page_state(NR_INACTIVE_ANON)
* global_page_state(NR_ACTIVE_FILE)
* global_page_state(NR_INACTIVE_FILE)

minus

* global_page_state(NR_FILE_MAPPED)

Namely, if this number is subtracted from the number of saveable
pages in the system, we get a good estimate of the minimum reasonable
size of a hibernation image.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Wu Fengguang <fengguang.wu@intel.com>
---
 kernel/power/snapshot.c | 43 +++++++++++++++++++++++++++++++++++++++----
 1 file changed, 39 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 2b1a7bc24c9..0a06b114dd3 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1203,6 +1203,36 @@ static void free_unnecessary_pages(void)
 	}
 }
 
+/**
+ * minimum_image_size - Estimate the minimum acceptable size of an image
+ * @saveable: Number of saveable pages in the system.
+ *
+ * We want to avoid attempting to free too much memory too hard, so estimate the
+ * minimum acceptable size of a hibernation image to use as the lower limit for
+ * preallocating memory.
+ *
+ * We assume that the minimum image size should be proportional to
+ *
+ * [number of saveable pages] - [number of pages that can be freed in theory]
+ *
+ * where the second term is the sum of (1) reclaimable slab pages, (2) active
+ * and (3) inactive anonymouns pages, (4) active and (5) inactive file pages,
+ * minus mapped file pages.
+ */
+static unsigned long minimum_image_size(unsigned long saveable)
+{
+	unsigned long size;
+
+	size = global_page_state(NR_SLAB_RECLAIMABLE)
+		+ global_page_state(NR_ACTIVE_ANON)
+		+ global_page_state(NR_INACTIVE_ANON)
+		+ global_page_state(NR_ACTIVE_FILE)
+		+ global_page_state(NR_INACTIVE_FILE)
+		- global_page_state(NR_FILE_MAPPED);
+
+	return saveable <= size ? 0 : saveable - size;
+}
+
 /**
  * hibernate_preallocate_memory - Preallocate memory for hibernation image
  *
@@ -1220,8 +1250,8 @@ static void free_unnecessary_pages(void)
  *
  * If image_size is set below the number following from the above formula,
  * the preallocation of memory is continued until the total number of saveable
- * pages in the system is below the requested image size or it is impossible to
- * allocate more memory, whichever happens first.
+ * pages in the system is below the requested image size or the minimum
+ * acceptable image size returned by minimum_image_size(), whichever is greater.
  */
 int hibernate_preallocate_memory(void)
 {
@@ -1282,6 +1312,11 @@ int hibernate_preallocate_memory(void)
 		goto out;
 	}
 
+	/* Estimate the minimum size of the image. */
+	pages = minimum_image_size(saveable);
+	if (size < pages)
+		size = min_t(unsigned long, pages, max_size);
+
 	/*
 	 * Let the memory management subsystem know that we're going to need a
 	 * large number of page frames to allocate and make it free some memory.
@@ -1294,8 +1329,8 @@ int hibernate_preallocate_memory(void)
 	 * The number of saveable pages in memory was too high, so apply some
 	 * pressure to decrease it.  First, make room for the largest possible
 	 * image and fail if that doesn't work.  Next, try to decrease the size
-	 * of the image as much as indicated by image_size using allocations
-	 * from highmem and non-highmem zones separately.
+	 * of the image as much as indicated by 'size' using allocations from
+	 * highmem and non-highmem zones separately.
 	 */
 	pages_highmem = preallocate_image_highmem(highmem / 2);
 	alloc = (count - max_size) - pages_highmem;
-- 
cgit v1.2.3


From 98e73dc5d2dadfcb95305ad71ac9239f4e361870 Mon Sep 17 00:00:00 2001
From: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Date: Wed, 22 Jul 2009 00:36:56 +0200
Subject: PM / Hibernate / Memory hotplug: Always use for_each_populated_zone()

Use for_each_populated_zone() instead of for_each_zone() in hibernation
code. This fixes a bug on s390, where we allow both config options
HIBERNATION and MEMORY_HOTPLUG, so that we also have a ZONE_MOVABLE
here. We only allow hibernation if no memory hotplug operation was
performed, so in fact both features can only be used exclusively, but
this way we don't need 2 differently configured (distribution) kernels.

If we have an unpopulated ZONE_MOVABLE, we allow hibernation but run
into a BUG_ON() in memory_bm_test/set/clear_bit() because hibernation
code iterates through all zones, not only the populated zones, in
several places. For example, swsusp_free() does for_each_zone() and
then checks for pfn_valid(), which is true even if the zone is not
populated, resulting in a BUG_ON() later because the pfn cannot be
found in the memory bitmap.

Replacing all occurences of for_each_zone() in hibernation code with
for_each_populated_zone() would fix this issue.

[rjw: Rebased on top of linux-next hibernation patches.]

Signed-off-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/snapshot.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 0a06b114dd3..bf06658f205 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -853,7 +853,7 @@ static unsigned int count_highmem_pages(void)
 	struct zone *zone;
 	unsigned int n = 0;
 
-	for_each_zone(zone) {
+	for_each_populated_zone(zone) {
 		unsigned long pfn, max_zone_pfn;
 
 		if (!is_highmem(zone))
@@ -916,7 +916,7 @@ static unsigned int count_data_pages(void)
 	unsigned long pfn, max_zone_pfn;
 	unsigned int n = 0;
 
-	for_each_zone(zone) {
+	for_each_populated_zone(zone) {
 		if (is_highmem(zone))
 			continue;
 
@@ -1010,7 +1010,7 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
 	struct zone *zone;
 	unsigned long pfn;
 
-	for_each_zone(zone) {
+	for_each_populated_zone(zone) {
 		unsigned long max_zone_pfn;
 
 		mark_free_pages(zone);
@@ -1065,7 +1065,7 @@ void swsusp_free(void)
 	struct zone *zone;
 	unsigned long pfn, max_zone_pfn;
 
-	for_each_zone(zone) {
+	for_each_populated_zone(zone) {
 		max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
 		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
 			if (pfn_valid(pfn)) {
@@ -1397,7 +1397,7 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
 	struct zone *zone;
 	unsigned int free = alloc_normal;
 
-	for_each_zone(zone)
+	for_each_populated_zone(zone)
 		if (!is_highmem(zone))
 			free += zone_page_state(zone, NR_FREE_PAGES);
 
@@ -1688,7 +1688,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
 	unsigned long pfn, max_zone_pfn;
 
 	/* Clear page flags */
-	for_each_zone(zone) {
+	for_each_populated_zone(zone) {
 		max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
 		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
 			if (pfn_valid(pfn))
-- 
cgit v1.2.3


From 8de0307326be94148436082a9abf365da8e3c66d Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Wed, 22 Jul 2009 19:56:10 +0200
Subject: PM: Trivial fixes

Fix the definition of BM_BITS_PER_BLOCK and kerneldoc
description of create_bm_block_list().

[rjw: Added changelog.]

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/snapshot.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index bf06658f205..97955b0e44f 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -233,7 +233,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
 
 #define BM_END_OF_MAP	(~0UL)
 
-#define BM_BITS_PER_BLOCK	(PAGE_SIZE << 3)
+#define BM_BITS_PER_BLOCK	(PAGE_SIZE * BITS_PER_BYTE)
 
 struct bm_block {
 	struct list_head hook;	/* hook into a list of bitmap blocks */
@@ -275,7 +275,7 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
 
 /**
  *	create_bm_block_list - create a list of block bitmap objects
- *	@nr_blocks - number of blocks to allocate
+ *	@pages - number of pages to track
  *	@list - list to put the allocated blocks into
  *	@ca - chain allocator to be used for allocating memory
  */
-- 
cgit v1.2.3


From 4a5d6ba1914d1bf1fcfb5e15834c29d84a879219 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 14 Sep 2009 12:45:39 +0100
Subject: CRED: Allow put_cred() to cope with a NULL groups list

put_cred() will oops if given a NULL groups list, but that is now possible with
the existence of cred_alloc_blank(), as used in keyctl_session_to_parent().

Added in commit:

	commit ee18d64c1f632043a02e6f5ba5e045bb26a5465f
	Author: David Howells <dhowells@redhat.com>
	Date:   Wed Sep 2 09:14:21 2009 +0100
	KEYS: Add a keyctl to install a process's session keyring on its parent [try #6]

Reported-by: Marc Dionne <marc.c.dionne@gmail.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/cred.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cred.c b/kernel/cred.c
index 006fcab009d..d7f7a01082e 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -147,7 +147,8 @@ static void put_cred_rcu(struct rcu_head *rcu)
 	key_put(cred->thread_keyring);
 	key_put(cred->request_key_auth);
 	release_tgcred(cred);
-	put_group_info(cred->group_info);
+	if (cred->group_info)
+		put_group_info(cred->group_info);
 	free_uid(cred->user);
 	kmem_cache_free(cred_jar, cred);
 }
-- 
cgit v1.2.3


From 353f6dd2dec992ddd34620a94b051b0f76227379 Mon Sep 17 00:00:00 2001
From: Anirban Sinha <asinha@zeugmasystems.com>
Date: Mon, 14 Sep 2009 11:13:37 -0700
Subject: cleanup console_print()

console_print() is an old legacy interface mostly unused in the entire
kernel tree. It's best to clean up its existing use and let developers
use their own implementation of it as they feel fit.

Signed-off-by: Anirban Sinha <asinha@zeugmasystems.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/printk.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index e10d193a833..602033acd6c 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1075,12 +1075,6 @@ void __sched console_conditional_schedule(void)
 }
 EXPORT_SYMBOL(console_conditional_schedule);
 
-void console_print(const char *s)
-{
-	printk(KERN_EMERG "%s", s);
-}
-EXPORT_SYMBOL(console_print);
-
 void console_unblank(void)
 {
 	struct console *c;
-- 
cgit v1.2.3