From 0b8f1efad30bd58f89961b82dfe68b9edf8fd2ac Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 5 Dec 2008 18:58:31 -0800
Subject: sparse irq_desc[] array: core kernel and x86 changes

Impact: new feature

Problem on distro kernels: irq_desc[NR_IRQS] takes megabytes of RAM with
NR_CPUS set to large values. The goal is to be able to scale up to much
larger NR_IRQS value without impacting the (important) common case.

To solve this, we generalize irq_desc[NR_IRQS] to an (optional) array of
irq_desc pointers.

When CONFIG_SPARSE_IRQ=y is used, we use kzalloc_node to get irq_desc,
this also makes the IRQ descriptors NUMA-local (to the site that calls
request_irq()).

This gets rid of the irq_cfg[] static array on x86 as well: irq_cfg now
uses desc->chip_data for x86 to store irq_cfg.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 init/main.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'init')

diff --git a/init/main.c b/init/main.c
index 7e117a231af..c1f999a3cf3 100644
--- a/init/main.c
+++ b/init/main.c
@@ -539,6 +539,15 @@ void __init __weak thread_info_cache_init(void)
 {
 }
 
+void __init __weak arch_early_irq_init(void)
+{
+}
+
+void __init __weak early_irq_init(void)
+{
+	arch_early_irq_init();
+}
+
 asmlinkage void __init start_kernel(void)
 {
 	char * command_line;
@@ -603,6 +612,8 @@ asmlinkage void __init start_kernel(void)
 	sort_main_extable();
 	trap_init();
 	rcu_init();
+	/* init some links before init_ISA_irqs() */
+	early_irq_init();
 	init_IRQ();
 	pidhash_init();
 	init_timers();
-- 
cgit v1.2.3


From 98a79d6a50181ca1ecf7400eda01d5dc1bc0dbf0 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Sat, 13 Dec 2008 21:19:41 +1030
Subject: cpumask: centralize cpu_online_map and cpu_possible_map

Impact: cleanup

Each SMP arch defines these themselves.  Move them to a central
location.

Twists:
1) Some archs (m32, parisc, s390) set possible_map to all 1, so we add a
   CONFIG_INIT_ALL_POSSIBLE for this rather than break them.

2) mips and sparc32 '#define cpu_possible_map phys_cpu_present_map'.
   Those archs simply have phys_cpu_present_map replaced everywhere.

3) Alpha defined cpu_possible_map to cpu_present_map; this is tricky
   so I just manipulate them both in sync.

4) IA64, cris and m32r have gratuitous 'extern cpumask_t cpu_possible_map'
   declarations.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Reviewed-by: Grant Grundler <grundler@parisc-linux.org>
Tested-by: Tony Luck <tony.luck@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Mike Travis <travis@sgi.com>
Cc: ink@jurassic.park.msu.ru
Cc: rmk@arm.linux.org.uk
Cc: starvik@axis.com
Cc: tony.luck@intel.com
Cc: takata@linux-m32r.org
Cc: ralf@linux-mips.org
Cc: grundler@parisc-linux.org
Cc: paulus@samba.org
Cc: schwidefsky@de.ibm.com
Cc: lethal@linux-sh.org
Cc: wli@holomorphy.com
Cc: davem@davemloft.net
Cc: jdike@addtoit.com
Cc: mingo@redhat.com
---
 init/Kconfig | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'init')

diff --git a/init/Kconfig b/init/Kconfig
index f763762d544..7656623f500 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -916,6 +916,15 @@ config KMOD
 
 endif # MODULES
 
+config INIT_ALL_POSSIBLE
+	bool
+	help
+	  Back when each arch used to define their own cpu_online_map and
+	  cpu_possible_map, some of them chose to initialize cpu_possible_map
+	  with all 1s, and others with all 0s.  When they were centralised,
+	  it was better to provide this option than to break all the archs
+	  and have several arch maintainers persuing me down dark alleys.
+
 config STOP_MACHINE
 	bool
 	default y
-- 
cgit v1.2.3


From 64db4cfff99c04cd5f550357edcc8780f96b54a2 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 18 Dec 2008 21:55:32 +0100
Subject: "Tree RCU": scalable classic RCU implementation

This patch fixes a long-standing performance bug in classic RCU that
results in massive internal-to-RCU lock contention on systems with
more than a few hundred CPUs.  Although this patch creates a separate
flavor of RCU for ease of review and patch maintenance, it is intended
to replace classic RCU.

This patch still handles stress better than does mainline, so I am still
calling it ready for inclusion.  This patch is against the -tip tree.
Nevertheless, experience on an actual 1000+ CPU machine would still be
most welcome.

Most of the changes noted below were found while creating an rcutiny
(which should permit ejecting the current rcuclassic) and while doing
detailed line-by-line documentation.

Updates from v9 (http://lkml.org/lkml/2008/12/2/334):

o	Fixes from remainder of line-by-line code walkthrough,
	including comment spelling, initialization, undesirable
	narrowing due to type conversion, removing redundant memory
	barriers, removing redundant local-variable initialization,
	and removing redundant local variables.

	I do not believe that any of these fixes address the CPU-hotplug
	issues that Andi Kleen was seeing, but please do give it a whirl
	in case the machine is smarter than I am.

	A writeup from the walkthrough may be found at the following
	URL, in case you are suffering from terminal insomnia or
	masochism:

	http://www.kernel.org/pub/linux/kernel/people/paulmck/tmp/rcutree-walkthrough.2008.12.16a.pdf

o	Made rcutree tracing use seq_file, as suggested some time
	ago by Lai Jiangshan.

o	Added a .csv variant of the rcudata debugfs trace file, to allow
	people having thousands of CPUs to drop the data into
	a spreadsheet.	Tested with oocalc and gnumeric.  Updated
	documentation to suit.

Updates from v8 (http://lkml.org/lkml/2008/11/15/139):

o	Fix a theoretical race between grace-period initialization and
	force_quiescent_state() that could occur if more than three
	jiffies were required to carry out the grace-period
	initialization.  Which it might, if you had enough CPUs.

o	Apply Ingo's printk-standardization patch.

o	Substitute local variables for repeated accesses to global
	variables.

o	Fix comment misspellings and redundant (but harmless) increments
	of ->n_rcu_pending (this latter after having explicitly added it).

o	Apply checkpatch fixes.

Updates from v7 (http://lkml.org/lkml/2008/10/10/291):

o	Fixed a number of problems noted by Gautham Shenoy, including
	the cpu-stall-detection bug that he was having difficulty
	convincing me was real.  ;-)

o	Changed cpu-stall detection to wait for ten seconds rather than
	three in order to reduce false positive, as suggested by Ingo
	Molnar.

o	Produced a design document (http://lwn.net/Articles/305782/).
	The act of writing this document uncovered a number of both
	theoretical and "here and now" bugs as noted below.

o	Fix dynticks_nesting accounting confusion, simplify WARN_ON()
	condition, fix kerneldoc comments, and add memory barriers
	in dynticks interface functions.

o	Add more data to tracing.

o	Remove unused "rcu_barrier" field from rcu_data structure.

o	Count calls to rcu_pending() from scheduling-clock interrupt
	to use as a surrogate timebase should jiffies stop counting.

o	Fix a theoretical race between force_quiescent_state() and
	grace-period initialization.  Yes, initialization does have to
	go on for some jiffies for this race to occur, but given enough
	CPUs...

Updates from v6 (http://lkml.org/lkml/2008/9/23/448):

o	Fix a number of checkpatch.pl complaints.

o	Apply review comments from Ingo Molnar and Lai Jiangshan
	on the stall-detection code.

o	Fix several bugs in !CONFIG_SMP builds.

o	Fix a misspelled config-parameter name so that RCU now announces
	at boot time if stall detection is configured.

o	Run tests on numerous combinations of configurations parameters,
	which after the fixes above, now build and run correctly.

Updates from v5 (http://lkml.org/lkml/2008/9/15/92, bad subject line):

o	Fix a compiler error in the !CONFIG_FANOUT_EXACT case (blew a
	changeset some time ago, and finally got around to retesting
	this option).

o	Fix some tracing bugs in rcupreempt that caused incorrect
	totals to be printed.

o	I now test with a more brutal random-selection online/offline
	script (attached).  Probably more brutal than it needs to be
	on the people reading it as well, but so it goes.

o	A number of optimizations and usability improvements:

	o	Make rcu_pending() ignore the grace-period timeout when
		there is no grace period in progress.

	o	Make force_quiescent_state() avoid going for a global
		lock in the case where there is no grace period in
		progress.

	o	Rearrange struct fields to improve struct layout.

	o	Make call_rcu() initiate a grace period if RCU was
		idle, rather than waiting for the next scheduling
		clock interrupt.

	o	Invoke rcu_irq_enter() and rcu_irq_exit() only when
		idle, as suggested by Andi Kleen.  I still don't
		completely trust this change, and might back it out.

	o	Make CONFIG_RCU_TRACE be the single config variable
		manipulated for all forms of RCU, instead of the prior
		confusion.

	o	Document tracing files and formats for both rcupreempt
		and rcutree.

Updates from v4 for those missing v5 given its bad subject line:

o	Separated dynticks interface so that NMIs and irqs call separate
	functions, greatly simplifying it.  In particular, this code
	no longer requires a proof of correctness.  ;-)

o	Separated dynticks state out into its own per-CPU structure,
	avoiding the duplicated accounting.

o	The case where a dynticks-idle CPU runs an irq handler that
	invokes call_rcu() is now correctly handled, forcing that CPU
	out of dynticks-idle mode.

o	Review comments have been applied (thank you all!!!).
	For but one example, fixed the dynticks-ordering issue that
	Manfred pointed out, saving me much debugging.  ;-)

o	Adjusted rcuclassic and rcupreempt to handle dynticks changes.

Attached is an updated patch to Classic RCU that applies a hierarchy,
greatly reducing the contention on the top-level lock for large machines.
This passes 10-hour concurrent rcutorture and online-offline testing on
128-CPU ppc64 without dynticks enabled, and exposes some timekeeping
bugs in presence of dynticks (exciting working on a system where
"sleep 1" hangs until interrupted...), which were fixed in the
2.6.27 kernel.  It is getting more reliable than mainline by some
measures, so the next version will be against -tip for inclusion.
See also Manfred Spraul's recent patches (or his earlier work from
2004 at http://marc.info/?l=linux-kernel&m=108546384711797&w=2).
We will converge onto a common patch in the fullness of time, but are
currently exploring different regions of the design space.  That said,
I have already gratefully stolen quite a few of Manfred's ideas.

This patch provides CONFIG_RCU_FANOUT, which controls the bushiness
of the RCU hierarchy.  Defaults to 32 on 32-bit machines and 64 on
64-bit machines.  If CONFIG_NR_CPUS is less than CONFIG_RCU_FANOUT,
there is no hierarchy.  By default, the RCU initialization code will
adjust CONFIG_RCU_FANOUT to balance the hierarchy, so strongly NUMA
architectures may choose to set CONFIG_RCU_FANOUT_EXACT to disable
this balancing, allowing the hierarchy to be exactly aligned to the
underlying hardware.  Up to two levels of hierarchy are permitted
(in addition to the root node), allowing up to 16,384 CPUs on 32-bit
systems and up to 262,144 CPUs on 64-bit systems.  I just know that I
am going to regret saying this, but this seems more than sufficient
for the foreseeable future.  (Some architectures might wish to set
CONFIG_RCU_FANOUT=4, which would limit such architectures to 64 CPUs.
If this becomes a real problem, additional levels can be added, but I
doubt that it will make a significant difference on real hardware.)

In the common case, a given CPU will manipulate its private rcu_data
structure and the rcu_node structure that it shares with its immediate
neighbors.  This can reduce both lock and memory contention by multiple
orders of magnitude, which should eliminate the need for the strange
manipulations that are reported to be required when running Linux on
very large systems.

Some shortcomings:

o	More bugs will probably surface as a result of an ongoing
	line-by-line code inspection.

	Patches will be provided as required.

o	There are probably hangs, rcutorture failures, &c.  Seems
	quite stable on a 128-CPU machine, but that is kind of small
	compared to 4096 CPUs.  However, seems to do better than
	mainline.

	Patches will be provided as required.

o	The memory footprint of this version is several KB larger
	than rcuclassic.

	A separate UP-only rcutiny patch will be provided, which will
	reduce the memory footprint significantly, even compared
	to the old rcuclassic.  One such patch passes light testing,
	and has a memory footprint smaller even than rcuclassic.
	Initial reaction from various embedded guys was "it is not
	worth it", so am putting it aside.

Credits:

o	Manfred Spraul for ideas, review comments, and bugs spotted,
	as well as some good friendly competition.  ;-)

o	Josh Triplett, Ingo Molnar, Peter Zijlstra, Mathieu Desnoyers,
	Lai Jiangshan, Andi Kleen, Andy Whitcroft, and Andrew Morton
	for reviews and comments.

o	Thomas Gleixner for much-needed help with some timer issues
	(see patches below).

o	Jon M. Tollefson, Tim Pepper, Andrew Theurer, Jose R. Santos,
	Andy Whitcroft, Darrick Wong, Nishanth Aravamudan, Anton
	Blanchard, Dave Kleikamp, and Nathan Lynch for keeping machines
	alive despite my heavy abuse^Wtesting.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 init/Kconfig | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'init')

diff --git a/init/Kconfig b/init/Kconfig
index f763762d544..9dd7958a71f 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -928,10 +928,16 @@ source "block/Kconfig"
 config PREEMPT_NOTIFIERS
 	bool
 
-config CLASSIC_RCU
-	def_bool !PREEMPT_RCU
+config TREE_RCU_TRACE
+	def_bool RCU_TRACE && TREE_RCU
+	select DEBUG_FS
 	help
-	  This option selects the classic RCU implementation that is
-	  designed for best read-side performance on non-realtime
-	  systems.  Classic RCU is the default.  Note that the
-	  PREEMPT_RCU symbol is used to select/deselect this option.
+	  This option provides tracing for the TREE_RCU implementation,
+	  permitting Makefile to trivially select kernel/rcutree_trace.c.
+
+config PREEMPT_RCU_TRACE
+	def_bool RCU_TRACE && PREEMPT_RCU
+	select DEBUG_FS
+	help
+	  This option provides tracing for the PREEMPT_RCU implementation,
+	  permitting Makefile to trivially select kernel/rcupreempt_trace.c.
-- 
cgit v1.2.3


From 12d79bafb75639f406a9f71aab94808c414c836e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 25 Dec 2008 09:31:28 +0100
Subject: rcu: provide RCU options on non-preempt architectures too

Impact: build fix

Some old architectures still do not use kernel/Kconfig.preempt, so the
moving of the RCU options there broke their build:

 In file included from /home/mingo/tip/include/linux/sem.h:81,
                 from /home/mingo/tip/include/linux/sched.h:69,
                 from /home/mingo/tip/arch/alpha/kernel/asm-offsets.c:9:
 /home/mingo/tip/include/linux/rcupdate.h:62:2: error: #error "Unknown RCU implementation specified to kernel configuration"

Move these options back to init/Kconfig, which every architecture
includes.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 init/Kconfig | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)

(limited to 'init')

diff --git a/init/Kconfig b/init/Kconfig
index 9dd7958a71f..6b0fdedf359 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -928,6 +928,80 @@ source "block/Kconfig"
 config PREEMPT_NOTIFIERS
 	bool
 
+choice
+	prompt "RCU Implementation"
+	default CLASSIC_RCU
+
+config CLASSIC_RCU
+	bool "Classic RCU"
+	help
+	  This option selects the classic RCU implementation that is
+	  designed for best read-side performance on non-realtime
+	  systems.
+
+	  Select this option if you are unsure.
+
+config TREE_RCU
+	bool "Tree-based hierarchical RCU"
+	help
+	  This option selects the RCU implementation that is
+	  designed for very large SMP system with hundreds or
+	  thousands of CPUs.
+
+config PREEMPT_RCU
+	bool "Preemptible RCU"
+	depends on PREEMPT
+	help
+	  This option reduces the latency of the kernel by making certain
+	  RCU sections preemptible. Normally RCU code is non-preemptible, if
+	  this option is selected then read-only RCU sections become
+	  preemptible. This helps latency, but may expose bugs due to
+	  now-naive assumptions about each RCU read-side critical section
+	  remaining on a given CPU through its execution.
+
+endchoice
+
+config RCU_TRACE
+	bool "Enable tracing for RCU"
+	depends on TREE_RCU || PREEMPT_RCU
+	help
+	  This option provides tracing in RCU which presents stats
+	  in debugfs for debugging RCU implementation.
+
+	  Say Y here if you want to enable RCU tracing
+	  Say N if you are unsure.
+
+config RCU_FANOUT
+	int "Tree-based hierarchical RCU fanout value"
+	range 2 64 if 64BIT
+	range 2 32 if !64BIT
+	depends on TREE_RCU
+	default 64 if 64BIT
+	default 32 if !64BIT
+	help
+	  This option controls the fanout of hierarchical implementations
+	  of RCU, allowing RCU to work efficiently on machines with
+	  large numbers of CPUs.  This value must be at least the cube
+	  root of NR_CPUS, which allows NR_CPUS up to 32,768 for 32-bit
+	  systems and up to 262,144 for 64-bit systems.
+
+	  Select a specific number if testing RCU itself.
+	  Take the default if unsure.
+
+config RCU_FANOUT_EXACT
+	bool "Disable tree-based hierarchical RCU auto-balancing"
+	depends on TREE_RCU
+	default n
+	help
+	  This option forces use of the exact RCU_FANOUT value specified,
+	  regardless of imbalances in the hierarchy.  This is useful for
+	  testing RCU itself, and might one day be useful on systems with
+	  strong NUMA behavior.
+
+	  Without RCU_FANOUT_EXACT, the code will balance the hierarchy.
+
+	  Say N if unsure.
+
 config TREE_RCU_TRACE
 	def_bool RCU_TRACE && TREE_RCU
 	select DEBUG_FS
-- 
cgit v1.2.3


From 13a0c3c269b223f60abfac8a9811d77111a8b4ba Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 26 Dec 2008 02:05:47 -0800
Subject: sparseirq: work around compiler optimizing away __weak functions

Impact: fix panic on null pointer with sparseirq

Some GCC versions seem to inline the weak global function,
when that function is empty.

Work it around, by making the functions return a (dummy) integer.

Signed-off-by: Yinghai <yinghai@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 init/main.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'init')

diff --git a/init/main.c b/init/main.c
index c1f999a3cf3..c314aa15370 100644
--- a/init/main.c
+++ b/init/main.c
@@ -539,13 +539,14 @@ void __init __weak thread_info_cache_init(void)
 {
 }
 
-void __init __weak arch_early_irq_init(void)
+int __init __weak arch_early_irq_init(void)
 {
+	return 0;
 }
 
-void __init __weak early_irq_init(void)
+int __init __weak early_irq_init(void)
 {
-	arch_early_irq_init();
+	return arch_early_irq_init();
 }
 
 asmlinkage void __init start_kernel(void)
-- 
cgit v1.2.3


From 43a256322ac1fc105c181b3cade3b9bfc0b63ca1 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sun, 28 Dec 2008 16:01:13 -0800
Subject: sparseirq: move __weak symbols into separate compilation unit

GCC has a bug with __weak alias functions: if the functions are in
the same compilation unit as their call site, GCC can decide to
inline them - and thus rob the linker of the opportunity to override
the weak alias with the real thing.

So move all the IRQ handling related __weak symbols to kernel/irq/chip.c.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 init/main.c | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'init')

diff --git a/init/main.c b/init/main.c
index c314aa15370..2c183abbf61 100644
--- a/init/main.c
+++ b/init/main.c
@@ -539,16 +539,6 @@ void __init __weak thread_info_cache_init(void)
 {
 }
 
-int __init __weak arch_early_irq_init(void)
-{
-	return 0;
-}
-
-int __init __weak early_irq_init(void)
-{
-	return arch_early_irq_init();
-}
-
 asmlinkage void __init start_kernel(void)
 {
 	char * command_line;
-- 
cgit v1.2.3


From 915441b601e6662e79f6c958e7be307967a96977 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 1 Jan 2009 10:12:15 +1030
Subject: cpumask: Use accessors code in core

Impact: use new API

cpu_*_map are going away in favour of cpu_*_mask, but const pointers.
So we have accessors where we really do want to frob them.  Archs
will also need the (trivial) conversion before we can finally remove
cpu_*_map.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Mike Travis <travis@sgi.com>
---
 init/main.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'init')

diff --git a/init/main.c b/init/main.c
index 2a7ce0f8e45..84d3732c0ce 100644
--- a/init/main.c
+++ b/init/main.c
@@ -527,9 +527,9 @@ static void __init boot_cpu_init(void)
 {
 	int cpu = smp_processor_id();
 	/* Mark the boot cpu "present", "online" etc for SMP and UP case */
-	cpu_set(cpu, cpu_online_map);
-	cpu_set(cpu, cpu_present_map);
-	cpu_set(cpu, cpu_possible_map);
+	set_cpu_online(cpu, true);
+	set_cpu_present(cpu, true);
+	set_cpu_possible(cpu, true);
 }
 
 void __init __weak smp_setup_processor_id(void)
-- 
cgit v1.2.3


From e0c0ba736547e81c4f986ce192307c549d214167 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 1 Jan 2009 10:12:19 +1030
Subject: cpumask: Use find_last_bit()

Impact: cleanup

There's one obvious place to use it: to find the highest possible cpu.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 init/main.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'init')

diff --git a/init/main.c b/init/main.c
index 84d3732c0ce..546ebd2f44b 100644
--- a/init/main.c
+++ b/init/main.c
@@ -380,12 +380,7 @@ EXPORT_SYMBOL(nr_cpu_ids);
 /* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */
 static void __init setup_nr_cpu_ids(void)
 {
-	int cpu, highest_cpu = 0;
-
-	for_each_possible_cpu(cpu)
-		highest_cpu = cpu;
-
-	nr_cpu_ids = highest_cpu + 1;
+	nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
 }
 
 #ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
-- 
cgit v1.2.3


From 609e5b71d0eca163df017ecfcf917b149875e744 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 2 Jan 2009 16:16:16 +0100
Subject: kbuild: Remove gcc 4.1.0 quirk from init/main.c

Impact: cleanup

We now have a cleaner check for gcc 4.1.0/4.1.1 trouble in
include/linux/compiler-gcc4.h, so remove the 4.1.0 quirk from
init/main.c.

Reported-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Sam Ravnborg <sam@ravnborg.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 init/main.c | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'init')

diff --git a/init/main.c b/init/main.c
index f5e64f20d2b..ad8f9f53f8d 100644
--- a/init/main.c
+++ b/init/main.c
@@ -75,15 +75,6 @@
 #include <asm/smp.h>
 #endif
 
-/*
- * This is one of the first .c files built. Error out early if we have compiler
- * trouble.
- */
-
-#if __GNUC__ == 4 && __GNUC_MINOR__ == 1 && __GNUC_PATCHLEVEL__ == 0
-#warning gcc-4.1.0 is known to miscompile the kernel.  A different compiler version is recommended.
-#endif
-
 static int kernel_init(void *);
 
 extern void init_IRQ(void);
-- 
cgit v1.2.3


From a327ca2c2674c5a9a0073421df19bfc362698136 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Tue, 8 Jul 2008 19:00:26 +0200
Subject: remove CONFIG_KMOD

Now that nothing depends on it any more, remove CONFIG_KMOD.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 init/Kconfig | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'init')

diff --git a/init/Kconfig b/init/Kconfig
index f6281711166..52847eec739 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -916,12 +916,6 @@ config MODULE_SRCVERSION_ALL
 	  the version).  With this option, such a "srcversion" field
 	  will be created for all modules.  If unsure, say N.
 
-config KMOD
-	def_bool y
-	help
-	  This is being removed soon.  These days, CONFIG_MODULES
-	  implies CONFIG_KMOD, so use that instead.
-
 endif # MODULES
 
 config INIT_ALL_POSSIBLE
-- 
cgit v1.2.3


From b8fed87d3265d21d6f90e34934d93f70cb1e5cc1 Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@lougher.demon.co.uk>
Date: Mon, 5 Jan 2009 08:46:28 +0000
Subject: Squashfs: initrd support

Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk>
---
 init/do_mounts_rd.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'init')

diff --git a/init/do_mounts_rd.c b/init/do_mounts_rd.c
index a7c748fa977..0f0f0cf3ba9 100644
--- a/init/do_mounts_rd.c
+++ b/init/do_mounts_rd.c
@@ -9,6 +9,7 @@
 #include <linux/string.h>
 
 #include "do_mounts.h"
+#include "../fs/squashfs/squashfs_fs.h"
 
 int __initdata rd_prompt = 1;/* 1 = prompt for RAM disk, 0 = don't prompt */
 
@@ -41,6 +42,7 @@ static int __init crd_load(int in_fd, int out_fd);
  * 	ext2
  *	romfs
  *	cramfs
+ *	squashfs
  * 	gzip
  */
 static int __init 
@@ -51,6 +53,7 @@ identify_ramdisk_image(int fd, int start_block)
 	struct ext2_super_block *ext2sb;
 	struct romfs_super_block *romfsb;
 	struct cramfs_super *cramfsb;
+	struct squashfs_super_block *squashfsb;
 	int nblocks = -1;
 	unsigned char *buf;
 
@@ -62,6 +65,7 @@ identify_ramdisk_image(int fd, int start_block)
 	ext2sb = (struct ext2_super_block *) buf;
 	romfsb = (struct romfs_super_block *) buf;
 	cramfsb = (struct cramfs_super *) buf;
+	squashfsb = (struct squashfs_super_block *) buf;
 	memset(buf, 0xe5, size);
 
 	/*
@@ -99,6 +103,16 @@ identify_ramdisk_image(int fd, int start_block)
 		goto done;
 	}
 
+	/* squashfs is at block zero too */
+	if (le32_to_cpu(squashfsb->s_magic) == SQUASHFS_MAGIC) {
+		printk(KERN_NOTICE
+		       "RAMDISK: squashfs filesystem found at block %d\n",
+		       start_block);
+		nblocks = (le64_to_cpu(squashfsb->bytes_used) + BLOCK_SIZE - 1)
+			 >> BLOCK_SIZE_BITS;
+		goto done;
+	}
+
 	/*
 	 * Read block 1 to test for minix and ext2 superblock
 	 */
-- 
cgit v1.2.3


From 24d431d06aeeda2b12cc925c6e1693c45ae5088b Mon Sep 17 00:00:00 2001
From: Ron Lee <ron@debian.org>
Date: Thu, 27 Nov 2008 02:31:57 +1030
Subject: trivial: add missing printk loglevel in start_kernel

Add missing printk loglevel in start_kernel

Signed-off-by: Ron Lee <ron@debian.org>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 init/main.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'init')

diff --git a/init/main.c b/init/main.c
index cd168ebc592..798fe450e70 100644
--- a/init/main.c
+++ b/init/main.c
@@ -602,7 +602,8 @@ asmlinkage void __init start_kernel(void)
 	sched_clock_init();
 	profile_init();
 	if (!irqs_disabled())
-		printk("start_kernel(): bug: interrupts were enabled early\n");
+		printk(KERN_CRIT "start_kernel(): bug: interrupts were "
+				 "enabled early\n");
 	early_boot_irqs_on();
 	local_irq_enable();
 
-- 
cgit v1.2.3


From fce3e804cfad49208fd2ff9db4bcb57481409e1d Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@vrfy.org>
Date: Sat, 1 Nov 2008 14:03:00 +0100
Subject: sysfs: clarify SYSFS_DEPRECATED help text

This should make the help text of SYSFS_DEPRECATED more clear, that this
is _not_ about (what some people think it is) suppressing a few symlinks
and variables, but a different sysfs _layout_ with new features.

Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 init/Kconfig | 44 +++++++++++++++++++++++++++-----------------
 1 file changed, 27 insertions(+), 17 deletions(-)

(limited to 'init')

diff --git a/init/Kconfig b/init/Kconfig
index 52847eec739..d9d3dbabdb1 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -423,27 +423,37 @@ config SYSFS_DEPRECATED
 	bool
 
 config SYSFS_DEPRECATED_V2
-	bool "Create deprecated sysfs files"
+	bool "Create deprecated sysfs layout for older userspace tools"
 	depends on SYSFS
 	default y
 	select SYSFS_DEPRECATED
 	help
-	  This option creates deprecated symlinks such as the
-	  "device"-link, the <subsystem>:<name>-link, and the
-	  "bus"-link. It may also add deprecated key in the
-	  uevent environment.
-	  None of these features or values should be used today, as
-	  they export driver core implementation details to userspace
-	  or export properties which can't be kept stable across kernel
-	  releases.
-
-	  If enabled, this option will also move any device structures
-	  that belong to a class, back into the /sys/class hierarchy, in
-	  order to support older versions of udev and some userspace
-	  programs.
-
-	  If you are using a distro with the most recent userspace
-	  packages, it should be safe to say N here.
+	  This option switches the layout of sysfs to the deprecated
+	  version.
+
+	  The current sysfs layout features a unified device tree at
+	  /sys/devices/, which is able to express a hierarchy between
+	  class devices. If the deprecated option is set to Y, the
+	  unified device tree is split into a bus device tree at
+	  /sys/devices/ and several individual class device trees at
+	  /sys/class/. The class and bus devices will be connected by
+	  "<subsystem>:<name>" and the "device" links. The "block"
+	  class devices, will not show up in /sys/class/block/. Some
+	  subsystems will suppress the creation of some devices which
+	  depend on the unified device tree.
+
+	  This option is not a pure compatibility option that can
+	  be safely enabled on newer distributions. It will change the
+	  layout of sysfs to the non-extensible deprecated version,
+	  and disable some features, which can not be exported without
+	  confusing older userspace tools. Since 2007/2008 all major
+	  distributions do not enable this option, and ship no tools which
+	  depend on the deprecated layout or this option.
+
+	  If you are using a new kernel on an older distribution, or use
+	  older userspace tools, you might need to say Y here. Do not say Y,
+	  if the original kernel, that came with your distribution, has
+	  this option set to N.
 
 config PROC_PID_CPUSET
 	bool "Include legacy /proc/<pid>/cpuset file"
-- 
cgit v1.2.3


From 853ac43ab194f5051b27a55060215d696dc9480d Mon Sep 17 00:00:00 2001
From: Matt Mackall <mpm@selenic.com>
Date: Tue, 6 Jan 2009 14:40:20 -0800
Subject: shmem: unify regular and tiny shmem

tiny-shmem shares most of its 130 lines of code with shmem and tends to
break when particular bits of shmem get modified.  Unifying saves code and
makes keeping these two in sync much easier.

before:
  14367	    392	     24	  14783	   39bf	mm/shmem.o
    396      72       8     476	    1dc	mm/tiny-shmem.o

after:
  14367	    392	     24	  14783	   39bf	mm/shmem.o
    412	     72       8     492	    1ec	mm/shmem.o tiny

Signed-off-by: Matt Mackall <mpm@selenic.com>
Acked-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 init/Kconfig | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'init')

diff --git a/init/Kconfig b/init/Kconfig
index 52847eec739..315a6114bf8 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -838,10 +838,6 @@ config RT_MUTEXES
 	boolean
 	select PLIST
 
-config TINY_SHMEM
-	default !SHMEM
-	bool
-
 config BASE_SMALL
 	int
 	default 0 if BASE_FULL
-- 
cgit v1.2.3


From f99ebf0a86de13f77bc4ee349de96db9f2f67f2e Mon Sep 17 00:00:00 2001
From: Rakib Mullick <rakib.mullick@gmail.com>
Date: Tue, 6 Jan 2009 14:40:38 -0800
Subject: init: properly placing noinline keyword

checkpatch warns about 'static void noinline'.  It wants `static noinline
void'.

Both are permissible, but the kernel consistently uses `static inline' and
`static noinline', and consistency is good.  Hence let's keep the
checkpatch warning and fix up this code site.

[akpm@linux-foundation.org: rewrote changelog]
Signed-off-by: Md.Rakib H. Mullick <rakib.mullick@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 init/main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'init')

diff --git a/init/main.c b/init/main.c
index cd168ebc592..90926dadc20 100644
--- a/init/main.c
+++ b/init/main.c
@@ -447,7 +447,7 @@ static void __init setup_command_line(char *command_line)
  * gcc-3.4 accidentally inlines this function, so use noinline.
  */
 
-static void noinline __init_refok rest_init(void)
+static noinline void __init_refok rest_init(void)
 	__releases(kernel_lock)
 {
 	int pid;
@@ -786,7 +786,7 @@ static void run_init_process(char *init_filename)
 /* This is a non __init function. Force it to be noinline otherwise gcc
  * makes it inline to init() and it becomes part of init.text section
  */
-static int noinline init_post(void)
+static noinline int init_post(void)
 {
 	free_initmem();
 	unlock_kernel();
-- 
cgit v1.2.3


From bca1033b092a6fab2ed00036e8a7f6e2df5d99a2 Mon Sep 17 00:00:00 2001
From: Marton Balint <cus@fazekas.hu>
Date: Tue, 6 Jan 2009 14:40:43 -0800
Subject: do_mounts: add device info to mount message

In the past, I used the root=...  command line parameter to specify the
root filesystem to the kernel.  Now it seems that specifying it is not
necessary.  The kernel detects the root filesystem even if the kernel
command line is empty.  My root fs is on a raid1 device by the way, and I
am not using initrd for the boot process.

If the kernel detects the root filesystem somehow, I think it should print
out the result of this detection, otherwise I will not know which device
has the root filesystem.  Or is there an easy way to get this information
on a running system?  I had a quick look at the /proc and /sys
filesystems, but haven't found anything useful there.

Signed-off-by: Marton Balint <cus@fazekas.hu>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 init/do_mounts.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'init')

diff --git a/init/do_mounts.c b/init/do_mounts.c
index d055b1914c3..5efca73b39f 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -220,10 +220,10 @@ static int __init do_mount_root(char *name, char *fs, int flags, void *data)
 
 	sys_chdir("/root");
 	ROOT_DEV = current->fs->pwd.mnt->mnt_sb->s_dev;
-	printk("VFS: Mounted root (%s filesystem)%s.\n",
+	printk("VFS: Mounted root (%s filesystem)%s on device %u:%u.\n",
 	       current->fs->pwd.mnt->mnt_sb->s_type->name,
 	       current->fs->pwd.mnt->mnt_sb->s_flags & MS_RDONLY ?
-	       " readonly" : "");
+	       " readonly" : "", MAJOR(ROOT_DEV), MINOR(ROOT_DEV));
 	return 0;
 }
 
-- 
cgit v1.2.3


From f1883f86dea84fe47a71a39fc1afccc005915ed8 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 6 Jan 2009 14:40:45 -0800
Subject: Remove remaining unwinder code

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Gabor Gombas <gombasg@sztaki.hu>
Cc: Jan Beulich <jbeulich@novell.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Ingo Molnar <mingo@elte.hu>,
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 init/main.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'init')

diff --git a/init/main.c b/init/main.c
index 90926dadc20..e119dd28dd7 100644
--- a/init/main.c
+++ b/init/main.c
@@ -50,7 +50,6 @@
 #include <linux/rmap.h>
 #include <linux/mempolicy.h>
 #include <linux/key.h>
-#include <linux/unwind.h>
 #include <linux/buffer_head.h>
 #include <linux/page_cgroup.h>
 #include <linux/debug_locks.h>
@@ -537,7 +536,6 @@ asmlinkage void __init start_kernel(void)
 	 * Need to run as early as possible, to initialize the
 	 * lockdep hash:
 	 */
-	unwind_init();
 	lockdep_init();
 	debug_objects_early_init();
 	cgroup_init_early();
@@ -559,7 +557,6 @@ asmlinkage void __init start_kernel(void)
 	setup_arch(&command_line);
 	mm_init_owner(&init_mm, &init_task);
 	setup_command_line(command_line);
-	unwind_setup();
 	setup_per_cpu_areas();
 	setup_nr_cpu_ids();
 	smp_prepare_boot_cpu();	/* arch-specific boot-cpu hooks */
-- 
cgit v1.2.3


From ff083c8372f6312bb3a8c7f7b748920aeeb210c9 Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier@gentoo.org>
Date: Tue, 6 Jan 2009 14:40:53 -0800
Subject: autodetect_raid: add missing __init marking

The function autodetect_raid is only used by __init functions, and it refers
to __initdata, so it needs __init markings.  Fixes this error:
The function autodetect_raid() references
the variable __initdata raid_noautodetect.
This is often because autodetect_raid lacks a __initdata
annotation or the annotation of raid_noautodetect is wrong.

Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 init/do_mounts_md.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'init')

diff --git a/init/do_mounts_md.c b/init/do_mounts_md.c
index d6da5cdd3c3..ff95e319288 100644
--- a/init/do_mounts_md.c
+++ b/init/do_mounts_md.c
@@ -271,7 +271,7 @@ static int __init raid_setup(char *str)
 __setup("raid=", raid_setup);
 __setup("md=", md_setup);
 
-static void autodetect_raid(void)
+static void __init autodetect_raid(void)
 {
 	int fd;
 
-- 
cgit v1.2.3


From d2e3192b6e372a441c18bc8cb32f89ef38f105b7 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Tue, 6 Jan 2009 14:41:10 -0800
Subject: init/main.c: mark late_time_init as __initdata

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 init/main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'init')

diff --git a/init/main.c b/init/main.c
index e119dd28dd7..b5a892c6837 100644
--- a/init/main.c
+++ b/init/main.c
@@ -107,7 +107,7 @@ EXPORT_SYMBOL(system_state);
 
 extern void time_init(void);
 /* Default late time init is NULL. archs can override this later. */
-void (*late_time_init)(void);
+void (*__initdata late_time_init)(void);
 extern void softirq_init(void);
 
 /* Untouched command line saved by arch-specific code. */
-- 
cgit v1.2.3


From 22a9d645677feefd402befd02edd59b122289ef1 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Wed, 7 Jan 2009 08:45:46 -0800
Subject: async: Asynchronous function calls to speed up kernel boot

Right now, most of the kernel boot is strictly synchronous, such that
various hardware delays are done sequentially.

In order to make the kernel boot faster, this patch introduces
infrastructure to allow doing some of the initialization steps
asynchronously, which will hide significant portions of the hardware delays
in practice.

In order to not change device order and other similar observables, this
patch does NOT do full parallel initialization.

Rather, it operates more in the way an out of order CPU does; the work may
be done out of order and asynchronous, but the observable effects
(instruction retiring for the CPU) are still done in the original sequence.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 init/do_mounts.c | 2 ++
 init/main.c      | 5 ++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'init')

diff --git a/init/do_mounts.c b/init/do_mounts.c
index 5efca73b39f..708105e163d 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -13,6 +13,7 @@
 #include <linux/init.h>
 #include <linux/fs.h>
 #include <linux/initrd.h>
+#include <linux/async.h>
 
 #include <linux/nfs_fs.h>
 #include <linux/nfs_fs_sb.h>
@@ -372,6 +373,7 @@ void __init prepare_namespace(void)
 	/* wait for the known devices to complete their probing */
 	while (driver_probe_done() != 0)
 		msleep(100);
+	async_synchronize_full();
 
 	md_run_setup();
 
diff --git a/init/main.c b/init/main.c
index b5a892c6837..f66715d8a85 100644
--- a/init/main.c
+++ b/init/main.c
@@ -62,6 +62,7 @@
 #include <linux/signal.h>
 #include <linux/idr.h>
 #include <linux/ftrace.h>
+#include <linux/async.h>
 #include <trace/boot.h>
 
 #include <asm/io.h>
@@ -684,7 +685,7 @@ asmlinkage void __init start_kernel(void)
 	rest_init();
 }
 
-static int initcall_debug;
+int initcall_debug;
 core_param(initcall_debug, initcall_debug, bool, 0644);
 
 int do_one_initcall(initcall_t fn)
@@ -785,6 +786,8 @@ static void run_init_process(char *init_filename)
  */
 static noinline int init_post(void)
 {
+	/* need to finish all async __init code before freeing the memory */
+	async_synchronize_full();
 	free_initmem();
 	unlock_kernel();
 	mark_rodata_ro();
-- 
cgit v1.2.3


From cb6ff208076b5f434db1b8c983429269d719cef5 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 8 Jan 2009 12:04:48 +0000
Subject: NOMMU: Support XIP on initramfs

Support XIP on files unpacked from the initramfs image on NOMMU systems.  This
simply requires the length of the file to be preset so that the ramfs fs can
attempt to garner sufficient contiguous storage to store the file (NOMMU mmap
can only map contiguous RAM).

All the other bits to do XIP on initramfs files are present:

 (1) ramfs's truncate attempts to allocate a contiguous run of pages when a
     file is truncated upwards from nothing.

 (2) ramfs sets BDI on its files to indicate direct mapping is possible, and
     that its files can be mapped for read, write and exec.

 (3) NOMMU mmap() will use the above bits to determine that it can do XIP.
     Possibly this needs better controls, because it will _always_ try and do
     XIP.

One disadvantage of this very simplistic approach is that sufficient space
will be allocated to store the whole file, and not just the bit that would be
XIP'd.  To deal with this, though, the initramfs unpacker would have to be
able to parse the file contents.

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Paul Mundt <lethal@linux-sh.org>
---
 init/initramfs.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'init')

diff --git a/init/initramfs.c b/init/initramfs.c
index 4f5ba75aaa7..d9c941c0c3c 100644
--- a/init/initramfs.c
+++ b/init/initramfs.c
@@ -317,6 +317,7 @@ static int __init do_name(void)
 			if (wfd >= 0) {
 				sys_fchown(wfd, uid, gid);
 				sys_fchmod(wfd, mode);
+				sys_ftruncate(wfd, body_len);
 				vcollected = kstrdup(collected, GFP_KERNEL);
 				state = CopyFile;
 			}
-- 
cgit v1.2.3


From 5cdc38f98596662620b822a4e13f797c3f2f65e0 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Wed, 7 Jan 2009 18:07:30 -0800
Subject: cgroups: make cgroup config a submenu

Making CGROUP related configs be a sub-menu.

This patch make CGROUP related configs be a sub-menu and makes 1st level
configs of "General Setup" shorter.

 including following additional changes
  - add help comment about CGROUPS and GROUP_SCHED.
  - moved MM_OWNER config to the bottom.
    (for good indent in menuconfig)

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 init/Kconfig | 123 ++++++++++++++++++++++++++++++++---------------------------
 1 file changed, 67 insertions(+), 56 deletions(-)

(limited to 'init')

diff --git a/init/Kconfig b/init/Kconfig
index e7893b1d3e4..6fcd192aa60 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -271,59 +271,6 @@ config LOG_BUF_SHIFT
 		     13 =>  8 KB
 		     12 =>  4 KB
 
-config CGROUPS
-	bool "Control Group support"
-	help
-	  This option will let you use process cgroup subsystems
-	  such as Cpusets
-
-	  Say N if unsure.
-
-config CGROUP_DEBUG
-	bool "Example debug cgroup subsystem"
-	depends on CGROUPS
-	default n
-	help
-	  This option enables a simple cgroup subsystem that
-	  exports useful debugging information about the cgroups
-	  framework
-
-	  Say N if unsure
-
-config CGROUP_NS
-        bool "Namespace cgroup subsystem"
-        depends on CGROUPS
-        help
-          Provides a simple namespace cgroup subsystem to
-          provide hierarchical naming of sets of namespaces,
-          for instance virtual servers and checkpoint/restart
-          jobs.
-
-config CGROUP_FREEZER
-        bool "control group freezer subsystem"
-        depends on CGROUPS
-        help
-          Provides a way to freeze and unfreeze all tasks in a
-	  cgroup.
-
-config CGROUP_DEVICE
-	bool "Device controller for cgroups"
-	depends on CGROUPS && EXPERIMENTAL
-	help
-	  Provides a cgroup implementing whitelists for devices which
-	  a process in the cgroup can mknod or open.
-
-config CPUSETS
-	bool "Cpuset support"
-	depends on SMP && CGROUPS
-	help
-	  This option will let you create and manage CPUSETs which
-	  allow dynamically partitioning a system into sets of CPUs and
-	  Memory Nodes and assigning tasks to run only within those sets.
-	  This is primarily useful on large SMP or NUMA systems.
-
-	  Say N if unsure.
-
 #
 # Architectures with an unreliable sched_clock() should select this:
 #
@@ -337,6 +284,8 @@ config GROUP_SCHED
 	help
 	  This feature lets CPU scheduler recognize task groups and control CPU
 	  bandwidth allocation to such task groups.
+	  In order to create a group from arbitrary set of processes, use
+	  CONFIG_CGROUPS. (See Control Group support.)
 
 config FAIR_GROUP_SCHED
 	bool "Group scheduling for SCHED_OTHER"
@@ -379,6 +328,66 @@ config CGROUP_SCHED
 
 endchoice
 
+menu "Control Group support"
+config CGROUPS
+	bool "Control Group support"
+	help
+	  This option add support for grouping sets of processes together, for
+	  use with process control subsystems such as Cpusets, CFS, memory
+	  controls or device isolation.
+	  See
+		- Documentation/cpusets.txt	(Cpusets)
+		- Documentation/scheduler/sched-design-CFS.txt	(CFS)
+		- Documentation/cgroups/ (features for grouping, isolation)
+		- Documentation/controllers/ (features for resource control)
+
+	  Say N if unsure.
+
+config CGROUP_DEBUG
+	bool "Example debug cgroup subsystem"
+	depends on CGROUPS
+	default n
+	help
+	  This option enables a simple cgroup subsystem that
+	  exports useful debugging information about the cgroups
+	  framework
+
+	  Say N if unsure
+
+config CGROUP_NS
+        bool "Namespace cgroup subsystem"
+        depends on CGROUPS
+        help
+          Provides a simple namespace cgroup subsystem to
+          provide hierarchical naming of sets of namespaces,
+          for instance virtual servers and checkpoint/restart
+          jobs.
+
+config CGROUP_FREEZER
+        bool "control group freezer subsystem"
+        depends on CGROUPS
+        help
+          Provides a way to freeze and unfreeze all tasks in a
+	  cgroup.
+
+config CGROUP_DEVICE
+	bool "Device controller for cgroups"
+	depends on CGROUPS && EXPERIMENTAL
+	help
+	  Provides a cgroup implementing whitelists for devices which
+	  a process in the cgroup can mknod or open.
+
+config CPUSETS
+	bool "Cpuset support"
+	depends on SMP && CGROUPS
+	help
+	  This option will let you create and manage CPUSETs which
+	  allow dynamically partitioning a system into sets of CPUs and
+	  Memory Nodes and assigning tasks to run only within those sets.
+	  This is primarily useful on large SMP or NUMA systems.
+
+	  Say N if unsure.
+
 config CGROUP_CPUACCT
 	bool "Simple CPU accounting cgroup subsystem"
 	depends on CGROUPS
@@ -393,9 +402,6 @@ config RESOURCE_COUNTERS
           infrastructure that works with cgroups
 	depends on CGROUPS
 
-config MM_OWNER
-	bool
-
 config CGROUP_MEM_RES_CTLR
 	bool "Memory Resource Controller for Control Groups"
 	depends on CGROUPS && RESOURCE_COUNTERS
@@ -419,6 +425,11 @@ config CGROUP_MEM_RES_CTLR
 	  This config option also selects MM_OWNER config option, which
 	  could in turn add some fork/exit overhead.
 
+config MM_OWNER
+	bool
+
+endmenu
+
 config SYSFS_DEPRECATED
 	bool
 
-- 
cgit v1.2.3


From c9d5409f8d46fd0d18b4a4481d9caa04076d87fc Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 7 Jan 2009 18:07:35 -0800
Subject: memcg: fix a typo in Kconfig

s/contoller/controller/

Signed-of-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 init/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'init')

diff --git a/init/Kconfig b/init/Kconfig
index 6fcd192aa60..7cbe1f43ca2 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -420,7 +420,7 @@ config CGROUP_MEM_RES_CTLR
 	  sure you need the memory resource controller. Even when you enable
 	  this, you can set "cgroup_disable=memory" at your boot option to
 	  disable memory resource controller and you can avoid overheads.
-	  (and lose benefits of memory resource contoller)
+	  (and lose benefits of memory resource controller)
 
 	  This config option also selects MM_OWNER config option, which
 	  could in turn add some fork/exit overhead.
-- 
cgit v1.2.3


From c077719be8e9e6b55702117513d1b5f41d80404a Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Wed, 7 Jan 2009 18:07:57 -0800
Subject: memcg: mem+swap controller Kconfig

Config and control variable for mem+swap controller.

This patch adds CONFIG_CGROUP_MEM_RES_CTLR_SWAP
(memory resource controller swap extension.)

For accounting swap, it's obvious that we have to use additional memory to
remember "who uses swap".  This adds more overhead.  So, it's better to
offer "choice" to users.  This patch adds 2 choices.

This patch adds 2 parameters to enable swap extension or not.
  - CONFIG
  - boot option

Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 init/Kconfig | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'init')

diff --git a/init/Kconfig b/init/Kconfig
index 7cbe1f43ca2..a724a149bf3 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -428,6 +428,23 @@ config CGROUP_MEM_RES_CTLR
 config MM_OWNER
 	bool
 
+config CGROUP_MEM_RES_CTLR_SWAP
+	bool "Memory Resource Controller Swap Extension(EXPERIMENTAL)"
+	depends on CGROUP_MEM_RES_CTLR && SWAP && EXPERIMENTAL
+	help
+	  Add swap management feature to memory resource controller. When you
+	  enable this, you can limit mem+swap usage per cgroup. In other words,
+	  when you disable this, memory resource controller has no cares to
+	  usage of swap...a process can exhaust all of the swap. This extension
+	  is useful when you want to avoid exhaustion swap but this itself
+	  adds more overheads and consumes memory for remembering information.
+	  Especially if you use 32bit system or small memory system, please
+	  be careful about enabling this. When memory resource controller
+	  is disabled by boot option, this will be automatically disabled and
+	  there will be no overhead from this. Even when you set this config=y,
+	  if boot option "noswapaccount" is set, swap will not be accounted.
+
+
 endmenu
 
 config SYSFS_DEPRECATED
-- 
cgit v1.2.3