From e74e396204bfcb67570ba4517b08f5918e69afea Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 30 Mar 2009 19:07:44 +0900 Subject: percpu: use dynamic percpu allocator as the default percpu allocator This patch makes most !CONFIG_HAVE_SETUP_PER_CPU_AREA archs use the dynamic percpu allocator. The first chunk is allocated using the embedding helper and 8k is reserved for modules. This ensures that the new allocator behaves almost identically to the original allocator as far as static percpu variables are concerned, so it shouldn't introduce much breakage. s390 and alpha use custom SHIFT_PERCPU_PTR() to work around the addressing range limit the addressing model imposes. Unfortunately, this breaks if the address is specified using a variable, so for now, the two archs aren't converted. The following architectures are affected by this change. * sh * arm * cris * mips * sparc(32) * blackfin * avr32 * parisc (broken, under investigation) * m32r * powerpc(32) As this change makes the dynamic allocator the default one, CONFIG_HAVE_DYNAMIC_PER_CPU_AREA is replaced with its inverse - CONFIG_HAVE_LEGACY_PER_CPU_AREA, which is added to yet-to-be converted archs. These archs implement their own setup_per_cpu_areas() and the conversion is not trivial. * powerpc(64) * sparc(64) * ia64 * alpha * s390 Boot and batch alloc/free tested on x86_32 with debug code (x86_32 doesn't use default first chunk initialization). Compile tested on sparc(32), powerpc(32), arm and alpha. Kyle McMartin reported that this change breaks parisc. The problem is still under investigation and he is okay with pushing this patch forward and fixing parisc later. [ Impact: use dynamic allocator for most archs w/o custom percpu setup ] Signed-off-by: Tejun Heo Acked-by: Rusty Russell Acked-by: David S. 
Miller Acked-by: Benjamin Herrenschmidt Acked-by: Martin Schwidefsky Reviewed-by: Christoph Lameter Cc: Paul Mundt Cc: Russell King Cc: Mikael Starvik Cc: Ralf Baechle Cc: Bryan Wu Cc: Kyle McMartin Cc: Matthew Wilcox Cc: Grant Grundler Cc: Hirokazu Takata Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Heiko Carstens Cc: Ingo Molnar --- init/main.c | 24 ------------------------ 1 file changed, 24 deletions(-) (limited to 'init') diff --git a/init/main.c b/init/main.c index 09131ec090c..602d724afa5 100644 --- a/init/main.c +++ b/init/main.c @@ -357,7 +357,6 @@ static void __init smp_init(void) #define smp_init() do { } while (0) #endif -static inline void setup_per_cpu_areas(void) { } static inline void setup_nr_cpu_ids(void) { } static inline void smp_prepare_cpus(unsigned int maxcpus) { } @@ -378,29 +377,6 @@ static void __init setup_nr_cpu_ids(void) nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; } -#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA -unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; - -EXPORT_SYMBOL(__per_cpu_offset); - -static void __init setup_per_cpu_areas(void) -{ - unsigned long size, i; - char *ptr; - unsigned long nr_possible_cpus = num_possible_cpus(); - - /* Copy section for each CPU (we discard the original) */ - size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE); - ptr = alloc_bootmem_pages(size * nr_possible_cpus); - - for_each_possible_cpu(i) { - __per_cpu_offset[i] = ptr - __per_cpu_start; - memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); - ptr += size; - } -} -#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ - /* Called by boot processor to activate the rest. */ static void __init smp_init(void) { -- cgit v1.2.3 From c17ef45342cc033fdf7bdd5b28615e0090f8d2e7 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 23 Jun 2009 17:12:47 -0700 Subject: rcu: Remove Classic RCU Remove Classic RCU, given that the combination of Tree RCU and the proposed Bloatwatch RCU do everything that Classic RCU can with fewer bugs. Tree RCU has been default in x86 builds for almost six months, and seems to be quite reliable, so there does not seem to be much justification for keeping the Classic RCU code and config complexity around anymore. Signed-off-by: Paul E. McKenney Cc: akpm@linux-foundation.org Cc: niv@us.ibm.com Cc: dvhltc@us.ibm.com Cc: dipankar@in.ibm.com Cc: dhowells@redhat.com Cc: lethal@linux-sh.org Cc: kernel@wantstofly.org Signed-off-by: Ingo Molnar --- init/Kconfig | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'init') diff --git a/init/Kconfig b/init/Kconfig index 1ce05a4cb5f..d10f31dfa0b 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -316,21 +316,13 @@ choice prompt "RCU Implementation" default TREE_RCU -config CLASSIC_RCU - bool "Classic RCU" - help - This option selects the classic RCU implementation that is - designed for best read-side performance on non-realtime - systems. - - Select this option if you are unsure. - config TREE_RCU bool "Tree-based hierarchical RCU" help This option selects the RCU implementation that is designed for very large SMP system with hundreds or - thousands of CPUs. + thousands of CPUs. It also scales down nicely to + smaller systems. config PREEMPT_RCU bool "Preemptible RCU" -- cgit v1.2.3 From f41d911f8c49a5d65c86504c19e8204bb605c4fd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 22 Aug 2009 13:56:52 -0700 Subject: rcu: Merge preemptable-RCU functionality into hierarchical RCU Create a kernel/rcutree_plugin.h file that contains definitions for preemptable RCU (or, under the #else branch of the #ifdef, empty definitions for the classic non-preemptable semantics). These definitions fit into plugins defined in kernel/rcutree.c for this purpose. 
This variant of preemptable RCU uses a new algorithm whose read-side expense is roughly that of classic hierarchical RCU under CONFIG_PREEMPT. This new algorithm's update-side expense is similar to that of classic hierarchical RCU, and, in absence of read-side preemption or blocking, is exactly that of classic hierarchical RCU. Perhaps more important, this new algorithm has a much simpler implementation, saving well over 1,000 lines of code compared to mainline's implementation of preemptable RCU, which will hopefully be retired in favor of this new algorithm. The simplifications are obtained by maintaining per-task nesting state for running tasks, and using a simple lock-protected algorithm to handle accounting when tasks block within RCU read-side critical sections, making use of lessons learned while creating numerous user-level RCU implementations over the past 18 months. Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: akpm@linux-foundation.org Cc: mathieu.desnoyers@polymtl.ca Cc: josht@linux.vnet.ibm.com Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org LKML-Reference: <12509746134003-git-send-email-> Signed-off-by: Ingo Molnar --- init/Kconfig | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'init') diff --git a/init/Kconfig b/init/Kconfig index 25373cf3267..f88da2d1c1f 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -335,11 +335,20 @@ config PREEMPT_RCU now-naive assumptions about each RCU read-side critical section remaining on a given CPU through its execution. +config TREE_PREEMPT_RCU + bool "Preemptable tree-based hierarchical RCU" + depends on PREEMPT + help + This option selects the RCU implementation that is + designed for very large SMP systems with hundreds or + thousands of CPUs, but for which real-time response + is also required. 
+ endchoice config RCU_TRACE bool "Enable tracing for RCU" - depends on TREE_RCU || PREEMPT_RCU + depends on TREE_RCU || PREEMPT_RCU || TREE_PREEMPT_RCU help This option provides tracing in RCU which presents stats in debugfs for debugging RCU implementation. @@ -351,7 +360,7 @@ config RCU_FANOUT int "Tree-based hierarchical RCU fanout value" range 2 64 if 64BIT range 2 32 if !64BIT - depends on TREE_RCU + depends on TREE_RCU || TREE_PREEMPT_RCU default 64 if 64BIT default 32 if !64BIT help @@ -366,7 +375,7 @@ config RCU_FANOUT config RCU_FANOUT_EXACT bool "Disable tree-based hierarchical RCU auto-balancing" - depends on TREE_RCU + depends on TREE_RCU || TREE_PREEMPT_RCU default n help This option forces use of the exact RCU_FANOUT value specified, @@ -379,11 +388,12 @@ config RCU_FANOUT_EXACT Say N if unsure. config TREE_RCU_TRACE - def_bool RCU_TRACE && TREE_RCU + def_bool RCU_TRACE && ( TREE_RCU || TREE_PREEMPT_RCU ) select DEBUG_FS help - This option provides tracing for the TREE_RCU implementation, - permitting Makefile to trivially select kernel/rcutree_trace.c. + This option provides tracing for the TREE_RCU and + TREE_PREEMPT_RCU implementations, permitting Makefile to + trivially select kernel/rcutree_trace.c. config PREEMPT_RCU_TRACE def_bool RCU_TRACE && PREEMPT_RCU -- cgit v1.2.3 From 6b3ef48adf847f7adf11c870e3ffacac150f1564 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 22 Aug 2009 13:56:53 -0700 Subject: rcu: Remove CONFIG_PREEMPT_RCU Now that CONFIG_TREE_PREEMPT_RCU is in place, there is no further need for CONFIG_PREEMPT_RCU. Remove it, along with whatever subtle bugs it may (or may not) contain. Signed-off-by: Paul E. 
McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: akpm@linux-foundation.org Cc: mathieu.desnoyers@polymtl.ca Cc: josht@linux.vnet.ibm.com Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org LKML-Reference: <125097461396-git-send-email-> Signed-off-by: Ingo Molnar --- init/Kconfig | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) (limited to 'init') diff --git a/init/Kconfig b/init/Kconfig index f88da2d1c1f..8e8b76d8a27 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -324,17 +324,6 @@ config TREE_RCU thousands of CPUs. It also scales down nicely to smaller systems. -config PREEMPT_RCU - bool "Preemptible RCU" - depends on PREEMPT - help - This option reduces the latency of the kernel by making certain - RCU sections preemptible. Normally RCU code is non-preemptible, if - this option is selected then read-only RCU sections become - preemptible. This helps latency, but may expose bugs due to - now-naive assumptions about each RCU read-side critical section - remaining on a given CPU through its execution. - config TREE_PREEMPT_RCU bool "Preemptable tree-based hierarchical RCU" depends on PREEMPT @@ -348,7 +337,7 @@ endchoice config RCU_TRACE bool "Enable tracing for RCU" - depends on TREE_RCU || PREEMPT_RCU || TREE_PREEMPT_RCU + depends on TREE_RCU || TREE_PREEMPT_RCU help This option provides tracing in RCU which presents stats in debugfs for debugging RCU implementation. @@ -395,13 +384,6 @@ config TREE_RCU_TRACE TREE_PREEMPT_RCU implementations, permitting Makefile to trivially select kernel/rcutree_trace.c. -config PREEMPT_RCU_TRACE - def_bool RCU_TRACE && PREEMPT_RCU - select DEBUG_FS - help - This option provides tracing for the PREEMPT_RCU implementation, - permitting Makefile to trivially select kernel/rcupreempt_trace.c. 
- endmenu # "RCU Subsystem" config IKCONFIG -- cgit v1.2.3 From fa84e9eecfff478df2d00e94deb3fc40fe4634ad Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 21 Aug 2009 22:01:12 +0200 Subject: init: Move sched_clock_init after late_time_init Some architectures initialize clocks and timers in late_time_init and x86 wants to do the same to avoid FIXMAP hackery for calibrating the TSC. That would result in an undefined sched_clock readout and wrecked printk timestamps again. We probably have those already on archs which do all their time/clock setup in late_time_init. There is no harm in moving that after late_time_init except that a few more boot timestamps are stale. The scheduler is not active at that point so no real wreckage is expected. Signed-off-by: Thomas Gleixner LKML-Reference: Cc: linux-arch@vger.kernel.org --- init/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'init') diff --git a/init/main.c b/init/main.c index 11f4f145be3..0ec75ce771a 100644 --- a/init/main.c +++ b/init/main.c @@ -631,7 +631,6 @@ asmlinkage void __init start_kernel(void) softirq_init(); timekeeping_init(); time_init(); - sched_clock_init(); profile_init(); if (!irqs_disabled()) printk(KERN_CRIT "start_kernel(): bug: interrupts were " @@ -682,6 +681,7 @@ asmlinkage void __init start_kernel(void) numa_policy_init(); if (late_time_init) late_time_init(); + sched_clock_init(); calibrate_delay(); pidmap_init(); anon_vma_init(); -- cgit v1.2.3 From 7db905e636f08ea5bc9825c1f73d77802e8ccad5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 Sep 2009 14:01:24 -0700 Subject: rcu: Move end of special early-boot RCU operation earlier Ingo was getting warnings from rcu_scheduler_starting() indicating that context switches had occurred before RCU ended its special early-boot handling of grace periods. This is a dangerous condition, as it indicates that RCU might have prematurely ended grace periods. 
This exploratory fix moves rcu_scheduler_starting() earlier in boot. Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- init/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'init') diff --git a/init/main.c b/init/main.c index 11f4f145be3..525f6fb2bd2 100644 --- a/init/main.c +++ b/init/main.c @@ -451,6 +451,7 @@ static noinline void __init_refok rest_init(void) { int pid; + rcu_scheduler_starting(); kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND); numa_default_policy(); pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES); @@ -462,7 +463,6 @@ static noinline void __init_refok rest_init(void) * at least once to get things moving: */ init_idle_bootup_task(current); - rcu_scheduler_starting(); preempt_enable_no_resched(); schedule(); preempt_disable(); -- cgit v1.2.3 From 2b2af54a5bb6f7e80ccf78f20084b93c398c3a8b Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Thu, 30 Apr 2009 15:23:42 +0200 Subject: Driver Core: devtmpfs - kernel-maintained tmpfs-based /dev Devtmpfs lets the kernel create a tmpfs instance called devtmpfs very early at kernel initialization, before any driver-core device is registered. Every device with a major/minor will provide a device node in devtmpfs. Devtmpfs can be changed and altered by userspace at any time, and in any way needed - just like today's udev-mounted tmpfs. Unmodified udev versions will run just fine on top of it, and will recognize an already existing kernel-created device node and use it. The default node permissions are root:root 0600. Proper permissions and user/group ownership, meaningful symlinks, all other policy still needs to be applied by userspace. If a node is created by devtmpfs, devtmpfs will remove the device node when the device goes away. If the device node was created by userspace, or the devtmpfs-created node was replaced by userspace, it will no longer be removed by devtmpfs. 
If it is requested to auto-mount it, it makes init=/bin/sh work without any further userspace support. /dev will be fully populated and dynamic, and always reflect the current device state of the kernel. With the commonly used dynamic device numbers, it solves the problem where static device nodes may point to the wrong devices. It is intended to make the initial bootup logic simpler and more robust, by de-coupling the creation of the initial environment, to reliably run userspace processes, from complex userspace bootstrap logic to provide a working /dev. Signed-off-by: Kay Sievers Signed-off-by: Jan Blunck Tested-By: Harald Hoyer Tested-By: Scott James Remnant Signed-off-by: Greg Kroah-Hartman --- init/do_mounts.c | 2 +- init/main.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'init') diff --git a/init/do_mounts.c b/init/do_mounts.c index 093f6591550..bb008d064c1 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -415,7 +415,7 @@ void __init prepare_namespace(void) mount_root(); out: + devtmpfs_mount("dev"); sys_mount(".", "/", NULL, MS_MOVE, NULL); sys_chroot("."); } - diff --git a/init/main.c b/init/main.c index b34fd8e5ede..8e6a7846bd0 100644 --- a/init/main.c +++ b/init/main.c @@ -68,6 +68,7 @@ #include #include #include +#include #include #include @@ -809,6 +810,7 @@ static void __init do_basic_setup(void) init_workqueues(); cpuset_init_smp(); usermodehelper_init(); + init_tmpfs(); driver_init(); init_irq_proc(); do_ctors(); -- cgit v1.2.3