From dbffa471611d3fc4b401ebabf7bb63ac0e0272b1 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Mon, 27 Mar 2006 01:14:26 -0800
Subject: [PATCH] PM-Timer: don't use workaround if chipset is not buggy

Current timer_pm.c reads I/O port triple times, in order to avoid the bug
of chipset.  But I/O port is slow.

2.6.16 (pmtmr)
Simple gettimeofday: 3.6532 microseconds

2.6.16+patch (pmtmr)
Simple gettimeofday: 1.4582 microseconds

[if chip is buggy, probably it will be 7us or more in 4.2% of probability.]

This patch adds blacklist of buggy chip, and if chip is not buggy, this
uses fast normal version instead of slow workaround version.

If chip is buggy, warnings "pmtmr is slow".  But sounds like there is gray
zone.  I found the PIIX4 errata, but I couldn't find the ICH4 errata.  But
some motherboard seems to have problem.

So, if we found a ICH4, generate warnings, and use a workaround version.
If user's ICH4 is good, the user can specify the "pmtmr_good" boot
parameter to use fast version.

Acked-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/timers/timer_pm.c | 104 +++++++++++++++++++++++++++++++------
 1 file changed, 89 insertions(+), 15 deletions(-)

(limited to 'arch/i386')

diff --git a/arch/i386/kernel/timers/timer_pm.c b/arch/i386/kernel/timers/timer_pm.c
index 264edaaac31..144e94a0493 100644
--- a/arch/i386/kernel/timers/timer_pm.c
+++ b/arch/i386/kernel/timers/timer_pm.c
@@ -15,6 +15,7 @@
 #include <linux/module.h>
 #include <linux/device.h>
 #include <linux/init.h>
+#include <linux/pci.h>
 #include <asm/types.h>
 #include <asm/timer.h>
 #include <asm/smp.h>
@@ -45,24 +46,31 @@ static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED;
 
 #define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */
 
+static int pmtmr_need_workaround __read_mostly = 1;
+
 /*helper function to safely read acpi pm timesource*/
 static inline u32 read_pmtmr(void)
 {
-	u32 v1=0,v2=0,v3=0;
-	/* It has been reported that because of various broken
-	 * chipsets (ICH4, PIIX4 and PIIX4E) where the ACPI PM time
-	 * source is not latched, so you must read it multiple
-	 * times to insure a safe value is read.
-	 */
-	do {
-		v1 = inl(pmtmr_ioport);
-		v2 = inl(pmtmr_ioport);
-		v3 = inl(pmtmr_ioport);
-	} while ((v1 > v2 && v1 < v3) || (v2 > v3 && v2 < v1)
-			|| (v3 > v1 && v3 < v2));
-
-	/* mask the output to 24 bits */
-	return v2 & ACPI_PM_MASK;
+	if (pmtmr_need_workaround) {
+		u32 v1, v2, v3;
+
+		/* It has been reported that because of various broken
+		 * chipsets (ICH4, PIIX4 and PIIX4E) where the ACPI PM time
+		 * source is not latched, so you must read it multiple
+		 * times to insure a safe value is read.
+		 */
+		do {
+			v1 = inl(pmtmr_ioport);
+			v2 = inl(pmtmr_ioport);
+			v3 = inl(pmtmr_ioport);
+		} while ((v1 > v2 && v1 < v3) || (v2 > v3 && v2 < v1)
+			 || (v3 > v1 && v3 < v2));
+
+		/* mask the output to 24 bits */
+		return v2 & ACPI_PM_MASK;
+	}
+
+	return inl(pmtmr_ioport) & ACPI_PM_MASK;
 }
 
 
@@ -263,6 +271,72 @@ struct init_timer_opts __initdata timer_pmtmr_init = {
 	.opts = &timer_pmtmr,
 };
 
+#ifdef CONFIG_PCI
+/*
+ * PIIX4 Errata:
+ *
+ * The power management timer may return improper results when read.
+ * Although the timer value settles properly after incrementing,
+ * while incrementing there is a 3 ns window every 69.8 ns where the
+ * timer value is indeterminate (a 4.2% chance that the data will be
+ * incorrect when read). As a result, the ACPI free running count up
+ * timer specification is violated due to erroneous reads.
+ */
+static int __init pmtmr_bug_check(void)
+{
+	static struct pci_device_id gray_list[] __initdata = {
+		/* these chipsets may have bug. */
+		{ PCI_DEVICE(PCI_VENDOR_ID_INTEL,
+				PCI_DEVICE_ID_INTEL_82801DB_0) },
+		{ },
+	};
+	struct pci_dev *dev;
+	int pmtmr_has_bug = 0;
+	u8 rev;
+
+	if (cur_timer != &timer_pmtmr || !pmtmr_need_workaround)
+		return 0;
+
+	dev = pci_get_device(PCI_VENDOR_ID_INTEL,
+			     PCI_DEVICE_ID_INTEL_82371AB_3, NULL);
+	if (dev) {
+		pci_read_config_byte(dev, PCI_REVISION_ID, &rev);
+		/* the bug has been fixed in PIIX4M */
+		if (rev < 3) {
+			printk(KERN_WARNING "* Found PM-Timer Bug on this "
+				"chipset. Due to workarounds for a bug,\n"
+				"* this time source is slow.  Consider trying "
+				"other time sources (clock=)\n");
+			pmtmr_has_bug = 1;
+		}
+		pci_dev_put(dev);
+	}
+
+	if (pci_dev_present(gray_list)) {
+		printk(KERN_WARNING "* This chipset may have PM-Timer Bug.  Due"
+			" to workarounds for a bug,\n"
+			"* this time source is slow. If you are sure your timer"
+			" does not have\n"
+			"* this bug, please use \"pmtmr_good\" to disable the "
+			"workaround\n");
+		pmtmr_has_bug = 1;
+	}
+
+	if (!pmtmr_has_bug)
+		pmtmr_need_workaround = 0;
+
+	return 0;
+}
+device_initcall(pmtmr_bug_check);
+#endif
+
+static int __init pmtr_good_setup(char *__str)
+{
+	pmtmr_need_workaround = 0;
+	return 1;
+}
+__setup("pmtmr_good", pmtr_good_setup);
+
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>");
 MODULE_DESCRIPTION("Power Management Timer (PMTMR) as primary timing source for x86");
-- 
cgit v1.2.3


From 1e9f28fa1eb9773bf65bae08288c6a0a38eef4a7 Mon Sep 17 00:00:00 2001
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Date: Mon, 27 Mar 2006 01:15:22 -0800
Subject: [PATCH] sched: new sched domain for representing multi-core

Add a new sched domain for representing multi-core with shared caches
between cores.  Consider a dual package system, each package containing two
cores and with last level cache shared between cores with in a package.  If
there are two runnable processes, with this appended patch those two
processes will be scheduled on different packages.

On such systems, with this patch we have observed 8% perf improvement with
specJBB(2 warehouse) benchmark and 35% improvement with CFP2000 rate(with 2
users).

This new domain will come into play only on multi-core systems with shared
caches.  On other systems, this sched domain will be removed by domain
degeneration code.  This new domain can be also used for implementing power
savings policy (see OLS 2005 CMP kernel scheduler paper for more details..
I will post another patch for power savings policy soon)

Most of the arch/* file changes are for cpu_coregroup_map() implementation.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/Kconfig                      |  9 +++++++++
 arch/i386/kernel/cpu/common.c          | 10 +++++++---
 arch/i386/kernel/cpu/intel_cacheinfo.c | 22 ++++++++++++++++++++--
 arch/i386/kernel/smpboot.c             | 24 ++++++++++++++++++++++++
 4 files changed, 60 insertions(+), 5 deletions(-)

(limited to 'arch/i386')

diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index f7db71d0b91..f17bd1d2707 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -231,6 +231,15 @@ config SCHED_SMT
 	  cost of slightly increased overhead in some places. If unsure say
 	  N here.
 
+config SCHED_MC
+	bool "Multi-core scheduler support"
+	depends on SMP
+	default y
+	help
+	  Multi-core scheduler support improves the CPU scheduler's decision
+	  making when dealing with multi-core CPU chips at a cost of slightly
+	  increased overhead in some places. If unsure say N here.
+
 source "kernel/Kconfig.preempt"
 
 config X86_UP_APIC
diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index 7e3d6b6a4e9..a06a49075f1 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -266,7 +266,7 @@ static void __init early_cpu_detect(void)
 void __cpuinit generic_identify(struct cpuinfo_x86 * c)
 {
 	u32 tfms, xlvl;
-	int junk;
+	int ebx;
 
 	if (have_cpuid_p()) {
 		/* Get vendor name */
@@ -282,7 +282,7 @@ void __cpuinit generic_identify(struct cpuinfo_x86 * c)
 		/* Intel-defined flags: level 0x00000001 */
 		if ( c->cpuid_level >= 0x00000001 ) {
 			u32 capability, excap;
-			cpuid(0x00000001, &tfms, &junk, &excap, &capability);
+			cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
 			c->x86_capability[0] = capability;
 			c->x86_capability[4] = excap;
 			c->x86 = (tfms >> 8) & 15;
@@ -292,6 +292,11 @@ void __cpuinit generic_identify(struct cpuinfo_x86 * c)
 			if (c->x86 >= 0x6)
 				c->x86_model += ((tfms >> 16) & 0xF) << 4;
 			c->x86_mask = tfms & 15;
+#ifdef CONFIG_SMP
+			c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0);
+#else
+			c->apicid = (ebx >> 24) & 0xFF;
+#endif
 		} else {
 			/* Have CPUID level 0 only - unheard of */
 			c->x86 = 4;
@@ -474,7 +479,6 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
 
 	cpuid(1, &eax, &ebx, &ecx, &edx);
 
-	c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0);
 
 	if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
 		return;
diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c
index ce61921369e..7e7fd4e67dd 100644
--- a/arch/i386/kernel/cpu/intel_cacheinfo.c
+++ b/arch/i386/kernel/cpu/intel_cacheinfo.c
@@ -173,6 +173,10 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
 	unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; /* Cache sizes */
 	unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */
 	unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */
+	unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb;
+#ifdef CONFIG_SMP
+	unsigned int cpu = (c == &boot_cpu_data) ? 0 : (c - cpu_data);
+#endif
 
 	if (c->cpuid_level > 3) {
 		static int is_initialized;
@@ -205,9 +209,15 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
 					break;
 				    case 2:
 					new_l2 = this_leaf.size/1024;
+					num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
+					index_msb = get_count_order(num_threads_sharing);
+					l2_id = c->apicid >> index_msb;
 					break;
 				    case 3:
 					new_l3 = this_leaf.size/1024;
+					num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
+					index_msb = get_count_order(num_threads_sharing);
+					l3_id = c->apicid >> index_msb;
 					break;
 				    default:
 					break;
@@ -273,11 +283,19 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
 		if (new_l1i)
 			l1i = new_l1i;
 
-		if (new_l2)
+		if (new_l2) {
 			l2 = new_l2;
+#ifdef CONFIG_SMP
+			cpu_llc_id[cpu] = l2_id;
+#endif
+		}
 
-		if (new_l3)
+		if (new_l3) {
 			l3 = new_l3;
+#ifdef CONFIG_SMP
+			cpu_llc_id[cpu] = l3_id;
+#endif
+		}
 
 		if ( trace )
 			printk (KERN_INFO "CPU: Trace cache: %dK uops", trace);
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 82371d83bfa..a6969903f2d 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -72,6 +72,9 @@ int phys_proc_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID};
 /* Core ID of each logical CPU */
 int cpu_core_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID};
 
+/* Last level cache ID of each logical CPU */
+int cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID};
+
 /* representing HT siblings of each logical CPU */
 cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly;
 EXPORT_SYMBOL(cpu_sibling_map);
@@ -440,6 +443,18 @@ static void __devinit smp_callin(void)
 
 static int cpucount;
 
+/* maps the cpu to the sched domain representing multi-core */
+cpumask_t cpu_coregroup_map(int cpu)
+{
+	struct cpuinfo_x86 *c = cpu_data + cpu;
+	/*
+	 * For perf, we return last level cache shared map.
+	 * TBD: when power saving sched policy is added, we will return
+	 *      cpu_core_map when power saving policy is enabled
+	 */
+	return c->llc_shared_map;
+}
+
 /* representing cpus for which sibling maps can be computed */
 static cpumask_t cpu_sibling_setup_map;
 
@@ -459,12 +474,16 @@ set_cpu_sibling_map(int cpu)
 				cpu_set(cpu, cpu_sibling_map[i]);
 				cpu_set(i, cpu_core_map[cpu]);
 				cpu_set(cpu, cpu_core_map[i]);
+				cpu_set(i, c[cpu].llc_shared_map);
+				cpu_set(cpu, c[i].llc_shared_map);
 			}
 		}
 	} else {
 		cpu_set(cpu, cpu_sibling_map[cpu]);
 	}
 
+	cpu_set(cpu, c[cpu].llc_shared_map);
+
 	if (current_cpu_data.x86_max_cores == 1) {
 		cpu_core_map[cpu] = cpu_sibling_map[cpu];
 		c[cpu].booted_cores = 1;
@@ -472,6 +491,11 @@ set_cpu_sibling_map(int cpu)
 	}
 
 	for_each_cpu_mask(i, cpu_sibling_setup_map) {
+		if (cpu_llc_id[cpu] != BAD_APICID &&
+		    cpu_llc_id[cpu] == cpu_llc_id[i]) {
+			cpu_set(i, c[cpu].llc_shared_map);
+			cpu_set(cpu, c[i].llc_shared_map);
+		}
 		if (phys_proc_id[cpu] == phys_proc_id[i]) {
 			cpu_set(i, cpu_core_map[cpu]);
 			cpu_set(cpu, cpu_core_map[i]);
-- 
cgit v1.2.3


From b06be912a3ad68c69dba0ed6e92723140020e392 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Mon, 27 Mar 2006 01:15:24 -0800
Subject: [PATCH] x86: don't use cpuid.2 to determine cache info if cpuid.4 is
 supported

Don't use cpuid.2 to determine cache info if cpuid.4 is supported.  The
exception is P4 trace cache.  We always use cpuid.2 to get trace cache
under P4.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/cpu/intel_cacheinfo.c | 69 ++++++++++++++++++++--------------
 1 file changed, 41 insertions(+), 28 deletions(-)

(limited to 'arch/i386')

diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c
index 7e7fd4e67dd..9df87b03612 100644
--- a/arch/i386/kernel/cpu/intel_cacheinfo.c
+++ b/arch/i386/kernel/cpu/intel_cacheinfo.c
@@ -225,11 +225,19 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
 			}
 		}
 	}
-	if (c->cpuid_level > 1) {
+	/*
+	 * Don't use cpuid2 if cpuid4 is supported. For P4, we use cpuid2 for
+	 * trace cache
+	 */
+	if ((num_cache_leaves == 0 || c->x86 == 15) && c->cpuid_level > 1) {
 		/* supports eax=2  call */
 		int i, j, n;
 		int regs[4];
 		unsigned char *dp = (unsigned char *)regs;
+		int only_trace = 0;
+
+		if (num_cache_leaves != 0 && c->x86 == 15)
+			only_trace = 1;
 
 		/* Number of times to iterate */
 		n = cpuid_eax(2) & 0xFF;
@@ -251,6 +259,8 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
 				while (cache_table[k].descriptor != 0)
 				{
 					if (cache_table[k].descriptor == des) {
+						if (only_trace && cache_table[k].cache_type != LVL_TRACE)
+							break;
 						switch (cache_table[k].cache_type) {
 						case LVL_1_INST:
 							l1i += cache_table[k].size;
@@ -276,43 +286,46 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
 				}
 			}
 		}
+	}
 
-		if (new_l1d)
-			l1d = new_l1d;
+	if (new_l1d)
+		l1d = new_l1d;
 
-		if (new_l1i)
-			l1i = new_l1i;
+	if (new_l1i)
+		l1i = new_l1i;
 
-		if (new_l2) {
-			l2 = new_l2;
+	if (new_l2) {
+		l2 = new_l2;
 #ifdef CONFIG_SMP
-			cpu_llc_id[cpu] = l2_id;
+		cpu_llc_id[cpu] = l2_id;
 #endif
-		}
+	}
 
-		if (new_l3) {
-			l3 = new_l3;
+	if (new_l3) {
+		l3 = new_l3;
 #ifdef CONFIG_SMP
-			cpu_llc_id[cpu] = l3_id;
+		cpu_llc_id[cpu] = l3_id;
 #endif
-		}
-
-		if ( trace )
-			printk (KERN_INFO "CPU: Trace cache: %dK uops", trace);
-		else if ( l1i )
-			printk (KERN_INFO "CPU: L1 I cache: %dK", l1i);
-		if ( l1d )
-			printk(", L1 D cache: %dK\n", l1d);
-		else
-			printk("\n");
-		if ( l2 )
-			printk(KERN_INFO "CPU: L2 cache: %dK\n", l2);
-		if ( l3 )
-			printk(KERN_INFO "CPU: L3 cache: %dK\n", l3);
-
-		c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d));
 	}
 
+	if (trace)
+		printk (KERN_INFO "CPU: Trace cache: %dK uops", trace);
+	else if ( l1i )
+		printk (KERN_INFO "CPU: L1 I cache: %dK", l1i);
+
+	if (l1d)
+		printk(", L1 D cache: %dK\n", l1d);
+	else
+		printk("\n");
+
+	if (l2)
+		printk(KERN_INFO "CPU: L2 cache: %dK\n", l2);
+
+	if (l3)
+		printk(KERN_INFO "CPU: L3 cache: %dK\n", l3);
+
+	c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d));
+
 	return l2;
 }
 
-- 
cgit v1.2.3


From ec936fc563715a9e2b2e363eb060655b49529325 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Mon, 27 Mar 2006 01:15:59 -0800
Subject: [PATCH] for_each_online_pgdat: renaming for_each_pgdat

Replace for_each_pgdat() with for_each_online_pgdat().

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/mm/pgtable.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/i386')

diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c
index 9db3242103b..2889567e21a 100644
--- a/arch/i386/mm/pgtable.c
+++ b/arch/i386/mm/pgtable.c
@@ -36,7 +36,7 @@ void show_mem(void)
 	printk(KERN_INFO "Mem-info:\n");
 	show_free_areas();
 	printk(KERN_INFO "Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
-	for_each_pgdat(pgdat) {
+	for_each_online_pgdat(pgdat) {
 		pgdat_resize_lock(pgdat, &flags);
 		for (i = 0; i < pgdat->node_spanned_pages; ++i) {
 			page = pgdat_page_nr(pgdat, i);
-- 
cgit v1.2.3


From 3571761fe49d960bb720c2308ffb9401f0a5e161 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Mon, 27 Mar 2006 01:16:00 -0800
Subject: [PATCH] for_each_online_pgdat: remove sorting pgdat

Because pgdat_list was linked to pgdat_list in *reverse* order, (By default)
some of arch has to sort it by themselves.

for_each_pgdat has gone..for_each_online_pgdat() uses node_online_map, which
doesn't need to be sorted.

This patch removes codes for sorting pgdat.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/mm/discontig.c | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'arch/i386')

diff --git a/arch/i386/mm/discontig.c b/arch/i386/mm/discontig.c
index c4af9638dbf..c3f3ae95e22 100644
--- a/arch/i386/mm/discontig.c
+++ b/arch/i386/mm/discontig.c
@@ -352,17 +352,6 @@ void __init zone_sizes_init(void)
 {
 	int nid;
 
-	/*
-	 * Insert nodes into pgdat_list backward so they appear in order.
-	 * Clobber node 0's links and NULL out pgdat_list before starting.
-	 */
-	pgdat_list = NULL;
-	for (nid = MAX_NUMNODES - 1; nid >= 0; nid--) {
-		if (!node_online(nid))
-			continue;
-		NODE_DATA(nid)->pgdat_next = pgdat_list;
-		pgdat_list = NODE_DATA(nid);
-	}
 
 	for_each_online_node(nid) {
 		unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
-- 
cgit v1.2.3


From 22a9835c350782a5c3257343713932af3ac92ee0 Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Mon, 27 Mar 2006 01:16:04 -0800
Subject: [PATCH] unify PFN_* macros

Just about every architecture defines some macros to do operations on pfns.
 They're all virtually identical.  This patch consolidates all of them.

One minor glitch is that at least i386 uses them in a very skeletal header
file.  To keep away from #include dependency hell, I stuck the new
definitions in a new, isolated header.

Of all of the implementations, sh64 is the only one that varied by a bit.
It used some masks to ensure that any sign-extension got ripped away before
the arithmetic is done.  This has been posted to that sh64 maintainers and
the development list.

Compiles on x86, x86_64, ia64 and ppc64.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/setup.c | 1 +
 arch/i386/mm/discontig.c | 1 +
 2 files changed, 2 insertions(+)

(limited to 'arch/i386')

diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index 6917daa159a..8c08660b4e5 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -46,6 +46,7 @@
 #include <linux/kexec.h>
 #include <linux/crash_dump.h>
 #include <linux/dmi.h>
+#include <linux/pfn.h>
 
 #include <video/edid.h>
 
diff --git a/arch/i386/mm/discontig.c b/arch/i386/mm/discontig.c
index c3f3ae95e22..fe6eb901326 100644
--- a/arch/i386/mm/discontig.c
+++ b/arch/i386/mm/discontig.c
@@ -31,6 +31,7 @@
 #include <linux/nodemask.h>
 #include <linux/module.h>
 #include <linux/kexec.h>
+#include <linux/pfn.h>
 
 #include <asm/e820.h>
 #include <asm/setup.h>
-- 
cgit v1.2.3


From dfd4e3ec246355274c9cf62c6b04a1ee6fa3caba Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 27 Mar 2006 01:16:25 -0800
Subject: [PATCH] lightweight robust futexes: i386

i386: add the futex_atomic_cmpxchg_inuser() assembly implementation, and wire
up the new syscalls.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Acked-by: Ulrich Drepper <drepper@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/syscall_table.S | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/i386')

diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index ac687d00a1c..326595f3fa4 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -310,3 +310,5 @@ ENTRY(sys_call_table)
 	.long sys_pselect6
 	.long sys_ppoll
 	.long sys_unshare		/* 310 */
+	.long sys_set_robust_list
+	.long sys_get_robust_list
-- 
cgit v1.2.3


From e041c683412d5bf44dc2b109053e3b837b71742d Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Mon, 27 Mar 2006 01:16:30 -0800
Subject: [PATCH] Notifier chain update: API changes

The kernel's implementation of notifier chains is unsafe.  There is no
protection against entries being added to or removed from a chain while the
chain is in use.  The issues were discussed in this thread:

    http://marc.theaimsgroup.com/?l=linux-kernel&m=113018709002036&w=2

We noticed that notifier chains in the kernel fall into two basic usage
classes:

	"Blocking" chains are always called from a process context
	and the callout routines are allowed to sleep;

	"Atomic" chains can be called from an atomic context and
	the callout routines are not allowed to sleep.

We decided to codify this distinction and make it part of the API.  Therefore
this set of patches introduces three new, parallel APIs: one for blocking
notifiers, one for atomic notifiers, and one for "raw" notifiers (which is
really just the old API under a new name).  New kinds of data structures are
used for the heads of the chains, and new routines are defined for
registration, unregistration, and calling a chain.  The three APIs are
explained in include/linux/notifier.h and their implementation is in
kernel/sys.c.

With atomic and blocking chains, the implementation guarantees that the chain
links will not be corrupted and that chain callers will not get messed up by
entries being added or removed.  For raw chains the implementation provides no
guarantees at all; users of this API must provide their own protections.  (The
idea was that situations may come up where the assumptions of the atomic and
blocking APIs are not appropriate, so it should be possible for users to
handle these things in their own way.)

There are some limitations, which should not be too hard to live with.  For
atomic/blocking chains, registration and unregistration must always be done in
a process context since the chain is protected by a mutex/rwsem.  Also, a
callout routine for a non-raw chain must not try to register or unregister
entries on its own chain.  (This did happen in a couple of places and the code
had to be changed to avoid it.)

Since atomic chains may be called from within an NMI handler, they cannot use
spinlocks for synchronization.  Instead we use RCU.  The overhead falls almost
entirely in the unregister routine, which is okay since unregistration is much
less frequent that calling a chain.

Here is the list of chains that we adjusted and their classifications.  None
of them use the raw API, so for the moment it is only a placeholder.

  ATOMIC CHAINS
  -------------
arch/i386/kernel/traps.c:		i386die_chain
arch/ia64/kernel/traps.c:		ia64die_chain
arch/powerpc/kernel/traps.c:		powerpc_die_chain
arch/sparc64/kernel/traps.c:		sparc64die_chain
arch/x86_64/kernel/traps.c:		die_chain
drivers/char/ipmi/ipmi_si_intf.c:	xaction_notifier_list
kernel/panic.c:				panic_notifier_list
kernel/profile.c:			task_free_notifier
net/bluetooth/hci_core.c:		hci_notifier
net/ipv4/netfilter/ip_conntrack_core.c:	ip_conntrack_chain
net/ipv4/netfilter/ip_conntrack_core.c:	ip_conntrack_expect_chain
net/ipv6/addrconf.c:			inet6addr_chain
net/netfilter/nf_conntrack_core.c:	nf_conntrack_chain
net/netfilter/nf_conntrack_core.c:	nf_conntrack_expect_chain
net/netlink/af_netlink.c:		netlink_chain

  BLOCKING CHAINS
  ---------------
arch/powerpc/platforms/pseries/reconfig.c:	pSeries_reconfig_chain
arch/s390/kernel/process.c:		idle_chain
arch/x86_64/kernel/process.c		idle_notifier
drivers/base/memory.c:			memory_chain
drivers/cpufreq/cpufreq.c		cpufreq_policy_notifier_list
drivers/cpufreq/cpufreq.c		cpufreq_transition_notifier_list
drivers/macintosh/adb.c:		adb_client_list
drivers/macintosh/via-pmu.c		sleep_notifier_list
drivers/macintosh/via-pmu68k.c		sleep_notifier_list
drivers/macintosh/windfarm_core.c	wf_client_list
drivers/usb/core/notify.c		usb_notifier_list
drivers/video/fbmem.c			fb_notifier_list
kernel/cpu.c				cpu_chain
kernel/module.c				module_notify_list
kernel/profile.c			munmap_notifier
kernel/profile.c			task_exit_notifier
kernel/sys.c				reboot_notifier_list
net/core/dev.c				netdev_chain
net/decnet/dn_dev.c:			dnaddr_chain
net/ipv4/devinet.c:			inetaddr_chain

It's possible that some of these classifications are wrong.  If they are,
please let us know or submit a patch to fix them.  Note that any chain that
gets called very frequently should be atomic, because the rwsem read-locking
used for blocking chains is very likely to incur cache misses on SMP systems.
(However, if the chain's callout routines may sleep then the chain cannot be
atomic.)

The patch set was written by Alan Stern and Chandra Seetharaman, incorporating
material written by Keith Owens and suggestions from Paul McKenney and Andrew
Morton.

[jes@sgi.com: restructure the notifier chain initialization macros]
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Chandra Seetharaman <sekharan@us.ibm.com>
Signed-off-by: Jes Sorensen <jes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/traps.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

(limited to 'arch/i386')

diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 4624f8ca245..6b63a5aa1e4 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -92,22 +92,21 @@ asmlinkage void spurious_interrupt_bug(void);
 asmlinkage void machine_check(void);
 
 static int kstack_depth_to_print = 24;
-struct notifier_block *i386die_chain;
-static DEFINE_SPINLOCK(die_notifier_lock);
+ATOMIC_NOTIFIER_HEAD(i386die_chain);
 
 int register_die_notifier(struct notifier_block *nb)
 {
-	int err = 0;
-	unsigned long flags;
-
 	vmalloc_sync_all();
-	spin_lock_irqsave(&die_notifier_lock, flags);
-	err = notifier_chain_register(&i386die_chain, nb);
-	spin_unlock_irqrestore(&die_notifier_lock, flags);
-	return err;
+	return atomic_notifier_chain_register(&i386die_chain, nb);
 }
 EXPORT_SYMBOL(register_die_notifier);
 
+int unregister_die_notifier(struct notifier_block *nb)
+{
+	return atomic_notifier_chain_unregister(&i386die_chain, nb);
+}
+EXPORT_SYMBOL(unregister_die_notifier);
+
 static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
 {
 	return	p > (void *)tinfo &&
-- 
cgit v1.2.3


From 59153f7d7effdb5b3c81eb6d03914a866157b319 Mon Sep 17 00:00:00 2001
From: "Antonino A. Daplas" <adaplas@gmail.com>
Date: Mon, 27 Mar 2006 01:17:29 -0800
Subject: [PATCH] fbdev: Make BIOS EDID reading configurable

DDC reading via the Video BIOS may take several tens of seconds with some
combination of display cards and monitors.

Make this option configurable.  It defaults to `y' to minimise disruption.

Signed-off-by: Antonino Daplas <adaplas@pol.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/boot/video.S | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/i386')

diff --git a/arch/i386/boot/video.S b/arch/i386/boot/video.S
index 2ac40c8244c..0000a267453 100644
--- a/arch/i386/boot/video.S
+++ b/arch/i386/boot/video.S
@@ -1924,6 +1924,7 @@ skip10:	movb	%ah, %al
 	ret
 
 store_edid:
+#ifdef CONFIG_FB_FIRMWARE_EDID
 	pushw	%es				# just save all registers
 	pushw	%ax
 	pushw	%bx
@@ -1954,6 +1955,7 @@ store_edid:
 	popw	%bx
 	popw	%ax
 	popw	%es
+#endif
 	ret
 
 # VIDEO_SELECT-only variables
-- 
cgit v1.2.3


From eef5167e5045fa8265b3e72cac9dbc4bc7dd82a6 Mon Sep 17 00:00:00 2001
From: "shin, jacob" <jacob.shin@amd.com>
Date: Mon, 27 Mar 2006 09:57:20 -0600
Subject: [CPUFREQ] hotplug cpu fix for powernow-k8

Andi's previous fix to initialise powernow_data on all siblings
will not work properly with CPU Hotplug.

Signed-off-by: Jacob Shin <jacob.shin@amd.com>
Signed-off-by: Dave Jones <davej@redhat.com>
---
 arch/i386/kernel/cpu/cpufreq/powernow-k8.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'arch/i386')

diff --git a/arch/i386/kernel/cpu/cpufreq/powernow-k8.c b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c
index 1e70823e1cb..712a26bd445 100644
--- a/arch/i386/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c
@@ -1095,10 +1095,15 @@ static int __devexit powernowk8_cpu_exit (struct cpufreq_policy *pol)
 
 static unsigned int powernowk8_get (unsigned int cpu)
 {
-	struct powernow_k8_data *data = powernow_data[cpu];
+	struct powernow_k8_data *data;
 	cpumask_t oldmask = current->cpus_allowed;
 	unsigned int khz = 0;
 
+	data = powernow_data[first_cpu(cpu_core_map[cpu])];
+
+	if (!data)
+		return -EINVAL;
+
 	set_cpus_allowed(current, cpumask_of_cpu(cpu));
 	if (smp_processor_id() != cpu) {
 		printk(KERN_ERR PFX "limiting to CPU %d failed in powernowk8_get\n", cpu);
-- 
cgit v1.2.3


From 64840e2722aeb789574e336d231bbc6436d51b34 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sat, 25 Mar 2006 01:51:23 -0800
Subject: [CPUFREQ] powernow: remove private for_each_cpu_mask()

It is unneeded and wrong.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Dave Jones <davej@redhat.com>
---
 arch/i386/kernel/cpu/cpufreq/powernow-k8.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'arch/i386')

diff --git a/arch/i386/kernel/cpu/cpufreq/powernow-k8.h b/arch/i386/kernel/cpu/cpufreq/powernow-k8.h
index 00ea899c17e..79a7c5c87ed 100644
--- a/arch/i386/kernel/cpu/cpufreq/powernow-k8.h
+++ b/arch/i386/kernel/cpu/cpufreq/powernow-k8.h
@@ -182,10 +182,6 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid);
 
 static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index);
 
-#ifndef for_each_cpu_mask
-#define for_each_cpu_mask(i,mask) for (i=0;i<1;i++)
-#endif
-
 #ifdef CONFIG_SMP
 static inline void define_siblings(int cpu, cpumask_t cpu_sharedcore_mask[])
 {
-- 
cgit v1.2.3


From c8912599c677f58fd3a5e5420c9cb6df62b2d8fa Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Tue, 28 Mar 2006 01:56:39 -0800
Subject: [PATCH] for_each_possible_cpu: i386

This patch replaces for_each_cpu with for_each_possible_cpu.

under arch/i386.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/io_apic.c           | 4 ++--
 arch/i386/kernel/nmi.c               | 6 +++---
 arch/i386/mach-voyager/voyager_smp.c | 2 +-
 arch/i386/oprofile/nmi_int.c         | 2 +-
 4 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'arch/i386')

diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 311b4e7266f..3b329af4afc 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -381,7 +381,7 @@ static void do_irq_balance(void)
 	unsigned long imbalance = 0;
 	cpumask_t allowed_mask, target_cpu_mask, tmp;
 
-	for_each_cpu(i) {
+	for_each_possible_cpu(i) {
 		int package_index;
 		CPU_IRQ(i) = 0;
 		if (!cpu_online(i))
@@ -632,7 +632,7 @@ static int __init balanced_irq_init(void)
 	else 
 		printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
 failed:
-	for_each_cpu(i) {
+	for_each_possible_cpu(i) {
 		kfree(irq_cpu_data[i].irq_delta);
 		irq_cpu_data[i].irq_delta = NULL;
 		kfree(irq_cpu_data[i].last_irq);
diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index 9074818b947..5ad6a2c92d2 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -138,12 +138,12 @@ static int __init check_nmi_watchdog(void)
 	if (nmi_watchdog == NMI_LOCAL_APIC)
 		smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
 
-	for_each_cpu(cpu)
+	for_each_possible_cpu(cpu)
 		prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count;
 	local_irq_enable();
 	mdelay((10*1000)/nmi_hz); // wait 10 ticks
 
-	for_each_cpu(cpu) {
+	for_each_possible_cpu(cpu) {
 #ifdef CONFIG_SMP
 		/* Check cpu_callin_map here because that is set
 		   after the timer is started. */
@@ -510,7 +510,7 @@ void touch_nmi_watchdog (void)
 	 * Just reset the alert counters, (other CPUs might be
 	 * spinning on locks we hold):
 	 */
-	for_each_cpu(i)
+	for_each_possible_cpu(i)
 		alert_counter[i] = 0;
 
 	/*
diff --git a/arch/i386/mach-voyager/voyager_smp.c b/arch/i386/mach-voyager/voyager_smp.c
index 8165626a5c3..70e560a1b79 100644
--- a/arch/i386/mach-voyager/voyager_smp.c
+++ b/arch/i386/mach-voyager/voyager_smp.c
@@ -1700,7 +1700,7 @@ after_handle_vic_irq(unsigned int irq)
 
 			printk("VOYAGER SMP: CPU%d lost interrupt %d\n",
 			       cpu, irq);
-			for_each_cpu(real_cpu, mask) {
+			for_each_possible_cpu(real_cpu, mask) {
 
 				outb(VIC_CPU_MASQUERADE_ENABLE | real_cpu,
 				     VIC_PROCESSOR_ID);
diff --git a/arch/i386/oprofile/nmi_int.c b/arch/i386/oprofile/nmi_int.c
index 1accce50c2c..1a2076ce6f6 100644
--- a/arch/i386/oprofile/nmi_int.c
+++ b/arch/i386/oprofile/nmi_int.c
@@ -122,7 +122,7 @@ static void nmi_save_registers(void * dummy)
 static void free_msrs(void)
 {
 	int i;
-	for_each_cpu(i) {
+	for_each_possible_cpu(i) {
 		kfree(cpu_msrs[i].counters);
 		cpu_msrs[i].counters = NULL;
 		kfree(cpu_msrs[i].controls);
-- 
cgit v1.2.3


From f45e4656ac0609437267b242953c07d523649f8d Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Tue, 28 Mar 2006 01:56:43 -0800
Subject: [PATCH] arch/i386/kernel/microcode.c: remove the obsolete
 microcode_ioctl

Nowadays, even Debian stable ships a microcode_ctl utility recent enough to no
longer use this ioctl.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Acked-by: Tigran Aivazian <tigran_aivazian@symantec.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/microcode.c | 17 -----------------
 1 file changed, 17 deletions(-)

(limited to 'arch/i386')

diff --git a/arch/i386/kernel/microcode.c b/arch/i386/kernel/microcode.c
index dd780a00553..e7c138f66c5 100644
--- a/arch/i386/kernel/microcode.c
+++ b/arch/i386/kernel/microcode.c
@@ -459,26 +459,9 @@ static ssize_t microcode_write (struct file *file, const char __user *buf, size_
 	return ret;
 }
 
-static int microcode_ioctl (struct inode *inode, struct file *file, 
-		unsigned int cmd, unsigned long arg)
-{
-	switch (cmd) {
-		/* 
-		 *  XXX: will be removed after microcode_ctl 
-		 *  is updated to ignore failure of this ioctl()
-		 */
-		case MICROCODE_IOCFREE:
-			return 0;
-		default:
-			return -EINVAL;
-	}
-	return -EINVAL;
-}
-
 static struct file_operations microcode_fops = {
 	.owner		= THIS_MODULE,
 	.write		= microcode_write,
-	.ioctl		= microcode_ioctl,
 	.open		= microcode_open,
 };
 
-- 
cgit v1.2.3


From b791ccef21129f9c7e4ab8274d5d7a0ec0a2fb7c Mon Sep 17 00:00:00 2001
From: Jesper Juhl <jesper.juhl@gmail.com>
Date: Tue, 28 Mar 2006 01:56:52 -0800
Subject: [PATCH] fix signed vs unsigned in nmi watchdog

Fix "signed vs unsigned" in nmi_watchdog_tick.

Signed-off-by: Jesper Juhl <jesper.juhl@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/nmi.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/i386')

diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index 5ad6a2c92d2..d43b498ec74 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -529,7 +529,8 @@ void nmi_watchdog_tick (struct pt_regs * regs)
 	 * always switch the stack NMI-atomically, it's safe to use
 	 * smp_processor_id().
 	 */
-	int sum, cpu = smp_processor_id();
+	unsigned int sum;
+	int cpu = smp_processor_id();
 
 	sum = per_cpu(irq_stat, cpu).apic_timer_irqs;
 
-- 
cgit v1.2.3