From a2f1b424900715ed9d1699c3bb88a434a2b42bc0 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 5 Nov 2005 17:25:53 +0100
Subject: [PATCH] x86_64: Add 4GB DMA32 zone

Add a new 4GB GFP_DMA32 zone between the GFP_DMA and GFP_NORMAL zones.

As a bit of historical background: when the x86-64 port
was originally designed we had some discussion if we should
use a 16MB DMA zone like i386 or a 4GB DMA zone like IA64 or
both. Both was ruled out at this point because it was in early
2.4 when VM is still quite shakey and had bad troubles even
dealing with one DMA zone.  We settled on the 16MB DMA zone mainly
because we worried about older soundcards and the floppy.

But this has always caused problems since then because
device drivers had trouble getting enough DMA able memory. These days
the VM works much better and the wide use of NUMA has proven
it can deal with many zones successfully.

So this patch adds both zones.

This helps drivers who need a lot of memory below 4GB because
their hardware is not accessing more (graphic drivers - proprietary
and free ones, video frame buffer drivers, sound drivers etc.).
Previously they could only use IOMMU+16MB GFP_DMA, which
was not enough memory.

Another common problem is that hardware who has full memory
addressing for >4GB misses it for some control structures in memory
(like transmit rings or other metadata).  They tended to allocate memory
in the 16MB GFP_DMA or the IOMMU/swiotlb then using pci_alloc_consistent,
but that can tie up a lot of precious 16MB GFPDMA/IOMMU/swiotlb memory
(even on AMD systems the IOMMU tends to be quite small) especially if you have
many devices.  With the new zone pci_alloc_consistent can just put
this stuff into memory below 4GB which works better.

One argument was still if the zone should be 4GB or 2GB. The main
motivation for 2GB would be an unnamed not so unpopular hardware
raid controller (mostly found in older machines from a particular four letter
company) who has a strange 2GB restriction in firmware. But
that one works ok with swiotlb/IOMMU anyways, so it doesn't really
need GFP_DMA32. I chose 4GB to be compatible with IA64 and because
it seems to be the most common restriction.

The new zone is so far added only for x86-64.

For other architectures who don't set up this
new zone nothing changes. Architectures can set a compatibility
define in Kconfig CONFIG_DMA_IS_DMA32 that will define GFP_DMA32
as GFP_DMA. Otherwise it's a nop because on 32bit architectures
it's normally not needed because GFP_NORMAL (=0) is DMA able
enough.

One problem is still that GFP_DMA means different things on different
architectures. e.g. some drivers used to have #ifdef ia64  use GFP_DMA
(trusting it to be 4GB) #elif __x86_64__ (use other hacks like
the swiotlb because 16MB is not enough) ... . This was quite
ugly and is now obsolete.

These should be now converted to use GFP_DMA32 unconditionally. I haven't done
this yet. Or best only use pci_alloc_consistent/dma_alloc_coherent
which will use GFP_DMA32 transparently.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-x86_64/dma.h   | 11 +++++++++--
 include/asm-x86_64/proto.h |  2 ++
 include/linux/gfp.h        | 11 +++++++++++
 include/linux/mmzone.h     | 16 +++++++++-------
 4 files changed, 31 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/asm-x86_64/dma.h b/include/asm-x86_64/dma.h
index 16fa3a064d0..6f2a817b6a7 100644
--- a/include/asm-x86_64/dma.h
+++ b/include/asm-x86_64/dma.h
@@ -72,8 +72,15 @@
 
 #define MAX_DMA_CHANNELS	8
 
-/* The maximum address that we can perform a DMA transfer to on this platform */
-#define MAX_DMA_ADDRESS      (PAGE_OFFSET+0x1000000)
+
+/* 16MB ISA DMA zone */
+#define MAX_DMA_PFN   ((16*1024*1024) >> PAGE_SHIFT)
+
+/* 4GB broken PCI/AGP hardware bus master zone */
+#define MAX_DMA32_PFN ((4UL*1024*1024*1024) >> PAGE_SHIFT)
+
+/* Compat define for old dma zone */
+#define MAX_DMA_ADDRESS ((unsigned long)__va(MAX_DMA_PFN << PAGE_SHIFT))
 
 /* 8237 DMA controllers */
 #define IO_DMA1_BASE	0x00	/* 8 bit slave DMA, channels 0..3 */
diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h
index dbb37b0adb4..c251152a065 100644
--- a/include/asm-x86_64/proto.h
+++ b/include/asm-x86_64/proto.h
@@ -22,6 +22,8 @@ extern void mtrr_bp_init(void);
 #define mtrr_bp_init() do {} while (0)
 #endif
 extern void init_memory_mapping(unsigned long start, unsigned long end);
+extern void size_zones(unsigned long *z, unsigned long *h,
+			unsigned long start_pfn, unsigned long end_pfn);
 
 extern void system_call(void); 
 extern int kernel_syscall(void);
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index c3779432a72..4351e6bb5a7 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -14,6 +14,13 @@ struct vm_area_struct;
 /* Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low two bits) */
 #define __GFP_DMA	((__force gfp_t)0x01u)
 #define __GFP_HIGHMEM	((__force gfp_t)0x02u)
+#ifdef CONFIG_DMA_IS_DMA32
+#define __GFP_DMA32	((__force gfp_t)0x01)	/* ZONE_DMA is ZONE_DMA32 */
+#elif BITS_PER_LONG < 64
+#define __GFP_DMA32	((__force gfp_t)0x00)	/* ZONE_NORMAL is ZONE_DMA32 */
+#else
+#define __GFP_DMA32	((__force gfp_t)0x04)	/* Has own ZONE_DMA32 */
+#endif
 
 /*
  * Action modifiers - doesn't change the zoning
@@ -64,6 +71,10 @@ struct vm_area_struct;
 
 #define GFP_DMA		__GFP_DMA
 
+/* 4GB DMA on some platforms */
+#define GFP_DMA32	__GFP_DMA32
+
+
 #define gfp_zone(mask) ((__force int)((mask) & (__force gfp_t)GFP_ZONEMASK))
 
 /*
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index f5fa3082fd6..da7a829f856 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -71,10 +71,11 @@ struct per_cpu_pageset {
 #endif
 
 #define ZONE_DMA		0
-#define ZONE_NORMAL		1
-#define ZONE_HIGHMEM		2
+#define ZONE_DMA32		1
+#define ZONE_NORMAL		2
+#define ZONE_HIGHMEM		3
 
-#define MAX_NR_ZONES		3	/* Sync this with ZONES_SHIFT */
+#define MAX_NR_ZONES		4	/* Sync this with ZONES_SHIFT */
 #define ZONES_SHIFT		2	/* ceil(log2(MAX_NR_ZONES)) */
 
 
@@ -108,9 +109,10 @@ struct per_cpu_pageset {
 
 /*
  * On machines where it is needed (eg PCs) we divide physical memory
- * into multiple physical zones. On a PC we have 3 zones:
+ * into multiple physical zones. On a PC we have 4 zones:
  *
  * ZONE_DMA	  < 16 MB	ISA DMA capable memory
+ * ZONE_DMA32	     0 MB 	Empty
  * ZONE_NORMAL	16-896 MB	direct mapped by the kernel
  * ZONE_HIGHMEM	 > 896 MB	only page cache and user processes
  */
@@ -455,10 +457,10 @@ extern struct pglist_data contig_page_data;
 
 #if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED)
 /*
- * with 32 bit page->flags field, we reserve 8 bits for node/zone info.
- * there are 3 zones (2 bits) and this leaves 8-2=6 bits for nodes.
+ * with 32 bit page->flags field, we reserve 9 bits for node/zone info.
+ * there are 4 zones (3 bits) and this leaves 9-3=6 bits for nodes.
  */
-#define FLAGS_RESERVED		8
+#define FLAGS_RESERVED		9
 
 #elif BITS_PER_LONG == 64
 /*
-- 
cgit v1.2.3


From 979edfadbae2286eec5b46143c00e81bca96498e Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Sat, 5 Nov 2005 17:25:53 +0100
Subject: [PATCH] x86_64: Adjust, correct, and complete the HPET definitions
 for x86-64.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-x86_64/hpet.h | 35 +++++++++++++++++++++--------------
 1 file changed, 21 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/asm-x86_64/hpet.h b/include/asm-x86_64/hpet.h
index a3877f57099..c20c28f5c7a 100644
--- a/include/asm-x86_64/hpet.h
+++ b/include/asm-x86_64/hpet.h
@@ -14,18 +14,18 @@
 #define HPET_CFG	0x010
 #define HPET_STATUS	0x020
 #define HPET_COUNTER	0x0f0
-#define HPET_T0_CFG	0x100
-#define HPET_T0_CMP	0x108
-#define HPET_T0_ROUTE	0x110
-#define HPET_T1_CFG	0x120
-#define HPET_T1_CMP	0x128
-#define HPET_T1_ROUTE	0x130
-#define HPET_T2_CFG	0x140
-#define HPET_T2_CMP	0x148
-#define HPET_T2_ROUTE	0x150
+#define HPET_Tn_OFFSET	0x20
+#define HPET_Tn_CFG(n)	 (0x100 + (n) * HPET_Tn_OFFSET)
+#define HPET_Tn_ROUTE(n) (0x104 + (n) * HPET_Tn_OFFSET)
+#define HPET_Tn_CMP(n)	 (0x108 + (n) * HPET_Tn_OFFSET)
+#define HPET_T0_CFG	HPET_Tn_CFG(0)
+#define HPET_T0_CMP	HPET_Tn_CMP(0)
+#define HPET_T1_CFG	HPET_Tn_CFG(1)
+#define HPET_T1_CMP	HPET_Tn_CMP(1)
 
 #define HPET_ID_VENDOR	0xffff0000
 #define HPET_ID_LEGSUP	0x00008000
+#define HPET_ID_64BIT	0x00002000
 #define HPET_ID_NUMBER	0x00001f00
 #define HPET_ID_REV	0x000000ff
 #define	HPET_ID_NUMBER_SHIFT	8
@@ -38,11 +38,18 @@
 #define	HPET_LEGACY_8254	2
 #define	HPET_LEGACY_RTC		8
 
-#define HPET_TN_ENABLE		0x004
-#define HPET_TN_PERIODIC	0x008
-#define HPET_TN_PERIODIC_CAP	0x010
-#define HPET_TN_SETVAL		0x040
-#define HPET_TN_32BIT		0x100
+#define HPET_TN_LEVEL		0x0002
+#define HPET_TN_ENABLE		0x0004
+#define HPET_TN_PERIODIC	0x0008
+#define HPET_TN_PERIODIC_CAP	0x0010
+#define HPET_TN_64BIT_CAP	0x0020
+#define HPET_TN_SETVAL		0x0040
+#define HPET_TN_32BIT		0x0100
+#define HPET_TN_ROUTE		0x3e00
+#define HPET_TN_FSB		0x4000
+#define HPET_TN_FSB_CAP		0x8000
+
+#define HPET_TN_ROUTE_SHIFT	9
 
 extern int is_hpet_enabled(void);
 extern int hpet_rtc_timer_init(void);
-- 
cgit v1.2.3


From 89b831ef8bf5cfbb357dbc0a2e07700d7f20eec5 Mon Sep 17 00:00:00 2001
From: Jacob Shin <jacob.shin@amd.com>
Date: Sat, 5 Nov 2005 17:25:53 +0100
Subject: [PATCH] x86_64: Support for AMD specific MCE Threshold.

MC4_MISC - DRAM Errors Threshold Register realized under AMD K8 Rev F.
This register is used to count correctable and uncorrectable ECC errors that occur during DRAM read operations.
The user may interface through sysfs files in order to change the threshold configuration.

bank%d/error_count - reads current error count, write to clear.
bank%d/interrupt_enable - set/clear interrupt enable.
bank%d/threshold_limit - read/write the threshold limit.

APIC vector 0xF9 in hw_irq.h.
5 software defined bank ids in mce.h.
new apic.c function to setup threshold apic lvt.
defaults to interrupt off, count enabled, and threshold limit max.
sysfs interface created on /sys/devices/system/threshold.

AK: added some ifdefs to make it compile on UP

Signed-off-by: Jacob Shin <jacob.shin@amd.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-x86_64/apic.h   |  2 ++
 include/asm-x86_64/hw_irq.h |  2 +-
 include/asm-x86_64/mce.h    | 10 ++++++++++
 3 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/asm-x86_64/apic.h b/include/asm-x86_64/apic.h
index 6c5d5ca8383..5647b7de174 100644
--- a/include/asm-x86_64/apic.h
+++ b/include/asm-x86_64/apic.h
@@ -111,6 +111,8 @@ extern unsigned int nmi_watchdog;
 
 extern int disable_timer_pin_1;
 
+extern void setup_threshold_lvt(unsigned long lvt_off);
+
 #endif /* CONFIG_X86_LOCAL_APIC */
 
 extern unsigned boot_cpu_id;
diff --git a/include/asm-x86_64/hw_irq.h b/include/asm-x86_64/hw_irq.h
index dc97668ea0f..c14a8c7267a 100644
--- a/include/asm-x86_64/hw_irq.h
+++ b/include/asm-x86_64/hw_irq.h
@@ -55,7 +55,7 @@ struct hw_interrupt_type;
 #define CALL_FUNCTION_VECTOR	0xfc
 #define KDB_VECTOR		0xfb	/* reserved for KDB */
 #define THERMAL_APIC_VECTOR	0xfa
-/* 0xf9 free */
+#define THRESHOLD_APIC_VECTOR   0xf9
 #define INVALIDATE_TLB_VECTOR_END	0xf8
 #define INVALIDATE_TLB_VECTOR_START	0xf0	/* f0-f8 used for TLB flush */
 
diff --git a/include/asm-x86_64/mce.h b/include/asm-x86_64/mce.h
index 869249db679..5d298b799a9 100644
--- a/include/asm-x86_64/mce.h
+++ b/include/asm-x86_64/mce.h
@@ -67,6 +67,8 @@ struct mce_log {
 /* Software defined banks */
 #define MCE_EXTENDED_BANK	128
 #define MCE_THERMAL_BANK	MCE_EXTENDED_BANK + 0
+#define MCE_THRESHOLD_BASE      MCE_EXTENDED_BANK + 1 /* MCE_AMD */
+#define MCE_THRESHOLD_DRAM_ECC  MCE_THRESHOLD_BASE + 4
 
 void mce_log(struct mce *m);
 #ifdef CONFIG_X86_MCE_INTEL
@@ -77,4 +79,12 @@ static inline void mce_intel_feature_init(struct cpuinfo_x86 *c)
 }
 #endif
 
+#ifdef CONFIG_X86_MCE_AMD
+void mce_amd_feature_init(struct cpuinfo_x86 *c);
+#else
+static inline void mce_amd_feature_init(struct cpuinfo_x86 *c)
+{
+}
+#endif
+
 #endif
-- 
cgit v1.2.3


From 6004e1b7effcbb385a6b7c790e4b8008682cf679 Mon Sep 17 00:00:00 2001
From: James Cleverdon <jamesclv@us.ibm.com>
Date: Sat, 5 Nov 2005 17:25:53 +0100
Subject: [PATCH] i386/x86-64: Share interrupt vectors when there is a large
 number of interrupt sources

Here's a patch that builds on Natalie Protasevich's IRQ compression
patch and tries to work for MPS boots as well as ACPI.  It is meant for
a 4-node IBM x460 NUMA box, which was dying because it had interrupt
pins with GSI numbers > NR_IRQS and thus overflowed irq_desc.

The problem is that this system has 270 GSIs (which are 1:1 mapped with
I/O APIC RTEs) and an 8-node box would have 540.  This is much bigger
than NR_IRQS (224 for both i386 and x86_64).  Also, there aren't enough
vectors to go around.  There are about 190 usable vectors, not counting
the reserved ones and the unused vectors at 0x20 to 0x2F.  So, my patch
attempts to compress the GSI range and share vectors by sharing IRQs.

Cc: "Protasevich, Natalie" <Natalie.Protasevich@unisys.com>

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-x86_64/desc.h   | 3 +++
 include/asm-x86_64/mpspec.h | 3 ++-
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/asm-x86_64/desc.h b/include/asm-x86_64/desc.h
index 68ac3c62fe3..1a3d380f9d5 100644
--- a/include/asm-x86_64/desc.h
+++ b/include/asm-x86_64/desc.h
@@ -98,16 +98,19 @@ static inline void _set_gate(void *adr, unsigned type, unsigned long func, unsig
 
 static inline void set_intr_gate(int nr, void *func) 
 { 
+	BUG_ON((unsigned)nr > 0xFF);
 	_set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, 0); 
 } 
 
 static inline void set_intr_gate_ist(int nr, void *func, unsigned ist) 
 { 
+	BUG_ON((unsigned)nr > 0xFF);
 	_set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, ist); 
 } 
 
 static inline void set_system_gate(int nr, void *func) 
 { 
+	BUG_ON((unsigned)nr > 0xFF);
 	_set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, 0); 
 } 
 
diff --git a/include/asm-x86_64/mpspec.h b/include/asm-x86_64/mpspec.h
index f267e10c023..6b375384f5c 100644
--- a/include/asm-x86_64/mpspec.h
+++ b/include/asm-x86_64/mpspec.h
@@ -157,7 +157,8 @@ struct mpc_config_lintsrc
  */
 
 #define MAX_MP_BUSSES 256
-#define MAX_IRQ_SOURCES 256
+/* Each PCI slot may be a combo card with its own bus.  4 IRQ pins per slot. */
+#define MAX_IRQ_SOURCES (MAX_MP_BUSSES * 4)
 enum mp_bustype {
 	MP_BUS_ISA = 1,
 	MP_BUS_EISA,
-- 
cgit v1.2.3


From 1dff7f3db5f045ccbfeca5bb00b0958a78501557 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 5 Nov 2005 17:25:53 +0100
Subject: [PATCH] x86_64: Fix up outdated pfn_to_page comment

pfn_to_page really requires pfn_valid to be true now, no question.
Some people stumbled over it, but it was misleading and wrong.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-x86_64/mmzone.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/asm-x86_64/mmzone.h b/include/asm-x86_64/mmzone.h
index b40c661f111..8d48dc7e43a 100644
--- a/include/asm-x86_64/mmzone.h
+++ b/include/asm-x86_64/mmzone.h
@@ -41,9 +41,7 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
 #define pfn_to_nid(pfn) phys_to_nid((unsigned long)(pfn) << PAGE_SHIFT)
 #define kvaddr_to_nid(kaddr)	phys_to_nid(__pa(kaddr))
 
-/* AK: this currently doesn't deal with invalid addresses. We'll see 
-   if the 2.5 kernel doesn't pass them
-   (2.4 used to). */
+/* Requires pfn_valid(pfn) to be true */
 #define pfn_to_page(pfn) ({ \
 	int nid = phys_to_nid(((unsigned long)(pfn)) << PAGE_SHIFT); 	\
 	((pfn) - node_start_pfn(nid)) + NODE_DATA(nid)->node_mem_map;	\
-- 
cgit v1.2.3


From 07808b74e7dab1aa385e698795875337d72daf7d Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 5 Nov 2005 17:25:53 +0100
Subject: [PATCH] x86_64: Remove obsolete ARCH_HAS_ATOMIC_UNSIGNED and
 page_flags_t

Has been introduced for x86-64 at some point to save memory
in struct page, but has been obsolete for some time. Just
remove it.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h     | 10 ++--------
 include/linux/mmzone.h |  2 +-
 2 files changed, 3 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5c1fb0a2e80..23fad4dae23 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -206,12 +206,6 @@ struct vm_operations_struct {
 struct mmu_gather;
 struct inode;
 
-#ifdef ARCH_HAS_ATOMIC_UNSIGNED
-typedef unsigned page_flags_t;
-#else
-typedef unsigned long page_flags_t;
-#endif
-
 /*
  * Each physical page in the system has a struct page associated with
  * it to keep track of whatever it is we are using the page for at the
@@ -219,7 +213,7 @@ typedef unsigned long page_flags_t;
  * a page.
  */
 struct page {
-	page_flags_t flags;		/* Atomic flags, some possibly
+	unsigned long flags;		/* Atomic flags, some possibly
 					 * updated asynchronously */
 	atomic_t _count;		/* Usage count, see below. */
 	atomic_t _mapcount;		/* Count of ptes mapped in mms,
@@ -435,7 +429,7 @@ static inline void put_page(struct page *page)
 #endif
 
 /* Page flags: | [SECTION] | [NODE] | ZONE | ... | FLAGS | */
-#define SECTIONS_PGOFF		((sizeof(page_flags_t)*8) - SECTIONS_WIDTH)
+#define SECTIONS_PGOFF		((sizeof(unsigned long)*8) - SECTIONS_WIDTH)
 #define NODES_PGOFF		(SECTIONS_PGOFF - NODES_WIDTH)
 #define ZONES_PGOFF		(NODES_PGOFF - ZONES_WIDTH)
 
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index da7a829f856..57fc99c67c3 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -455,7 +455,7 @@ extern struct pglist_data contig_page_data;
 #include <asm/sparsemem.h>
 #endif
 
-#if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED)
+#if BITS_PER_LONG == 32
 /*
  * with 32 bit page->flags field, we reserve 9 bits for node/zone info.
  * there are 4 zones (3 bits) and this leaves 9-3=6 bits for nodes.
-- 
cgit v1.2.3


From 69d81fcde7797342417591ba7affb372b9c86eae Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 5 Nov 2005 17:25:53 +0100
Subject: [PATCH] x86_64: Speed up numa_node_id by putting it directly into the
 PDA

Not go from the CPU number to an mapping array.
Mode number is often used now in fast paths.

This also adds a generic numa_node_id to all the topology includes

Suggested by Eric Dumazet

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-x86_64/numa.h     | 2 ++
 include/asm-x86_64/pda.h      | 1 +
 include/asm-x86_64/topology.h | 2 ++
 include/linux/mmzone.h        | 2 ++
 4 files changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/asm-x86_64/numa.h b/include/asm-x86_64/numa.h
index bcf55c3f7f7..d51e56fdc3d 100644
--- a/include/asm-x86_64/numa.h
+++ b/include/asm-x86_64/numa.h
@@ -17,6 +17,8 @@ extern void numa_add_cpu(int cpu);
 extern void numa_init_array(void);
 extern int numa_off;
 
+extern void numa_set_node(int cpu, int node);
+
 extern unsigned char apicid_to_node[256];
 
 #define NUMA_NO_NODE 0xff
diff --git a/include/asm-x86_64/pda.h b/include/asm-x86_64/pda.h
index bbf89aa8a1a..8733ccfa442 100644
--- a/include/asm-x86_64/pda.h
+++ b/include/asm-x86_64/pda.h
@@ -15,6 +15,7 @@ struct x8664_pda {
         int irqcount;		    /* Irq nesting counter. Starts with -1 */  	
 	int cpunumber;		    /* Logical CPU number */
 	char *irqstackptr;	/* top of irqstack */
+	int nodenumber;		    /* number of current node */
 	unsigned int __softirq_pending;
 	unsigned int __nmi_count;	/* number of NMI on this CPUs */
 	struct mm_struct *active_mm;
diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h
index 1c603cd7e4d..d39ebd5263e 100644
--- a/include/asm-x86_64/topology.h
+++ b/include/asm-x86_64/topology.h
@@ -28,6 +28,8 @@ extern int __node_distance(int, int);
 #define pcibus_to_node(bus)		((long)(bus->sysdata))	
 #define pcibus_to_cpumask(bus)		node_to_cpumask(pcibus_to_node(bus));
 
+#define numa_node_id()			read_pda(nodenumber)
+
 /* sched_domains SD_NODE_INIT for x86_64 machines */
 #define SD_NODE_INIT (struct sched_domain) {		\
 	.span			= CPU_MASK_NONE,	\
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 57fc99c67c3..f3cffc354de 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -435,7 +435,9 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *,
 
 #include <linux/topology.h>
 /* Returns the number of the current Node. */
+#ifndef numa_node_id
 #define numa_node_id()		(cpu_to_node(raw_smp_processor_id()))
+#endif
 
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 
-- 
cgit v1.2.3


From f6c2e3330d3fdd5474bc3756da46fca889a30e33 Mon Sep 17 00:00:00 2001
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Date: Sat, 5 Nov 2005 17:25:53 +0100
Subject: [PATCH] x86_64: Unmap NULL during early bootup

We should zap the low mappings, as soon as possible, so that we can catch
kernel bugs more effectively. Previously early boot had NULL mapped
and didn't trap on NULL references.

This patch introduces boot_level4_pgt, which will always have low identity
addresses mapped.  Druing boot, all the processors will use this as their
level4 pgt.  On BP, we will switch to init_level4_pgt as soon as we enter C
code and zap the low mappings as soon as we are done with the usage of
identity low mapped addresses.  On AP's we will zap the low mappings as
soon as we jump to C code.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Ashok Raj <ashok.raj@intel.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-x86_64/pgtable.h | 1 +
 include/asm-x86_64/proto.h   | 2 ++
 include/asm-x86_64/smp.h     | 1 -
 3 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h
index 7a07196a720..a204efb553d 100644
--- a/include/asm-x86_64/pgtable.h
+++ b/include/asm-x86_64/pgtable.h
@@ -16,6 +16,7 @@ extern pud_t level3_physmem_pgt[512];
 extern pud_t level3_ident_pgt[512];
 extern pmd_t level2_kernel_pgt[512];
 extern pgd_t init_level4_pgt[];
+extern pgd_t boot_level4_pgt[];
 extern unsigned long __supported_pte_mask;
 
 #define swapper_pg_dir init_level4_pgt
diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h
index c251152a065..34501086afe 100644
--- a/include/asm-x86_64/proto.h
+++ b/include/asm-x86_64/proto.h
@@ -11,6 +11,8 @@ struct pt_regs;
 extern void start_kernel(void);
 extern void pda_init(int); 
 
+extern void zap_low_mappings(int cpu);
+
 extern void early_idt_handler(void);
 
 extern void mcheck_init(struct cpuinfo_x86 *c);
diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h
index c57ce407134..592161e979e 100644
--- a/include/asm-x86_64/smp.h
+++ b/include/asm-x86_64/smp.h
@@ -47,7 +47,6 @@ extern void lock_ipi_call_lock(void);
 extern void unlock_ipi_call_lock(void);
 extern int smp_num_siblings;
 extern void smp_send_reschedule(int cpu);
-extern void zap_low_mappings(void);
 void smp_stop_cpu(void);
 extern int smp_call_function_single(int cpuid, void (*func) (void *info),
 				void *info, int retry, int wait);
-- 
cgit v1.2.3


From 6b75aeedde1e8a8513393d3c1367bf81bc5b0c67 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 5 Nov 2005 17:25:53 +0100
Subject: [PATCH] x86_64: Don't apply __PHYSICAL_MASK to page frame numbers

It is for physical addresses, not for PFNs.

Pointed out by Tejun Heo.

Cc: htejun@gmail.com

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-x86_64/page.h    | 2 +-
 include/asm-x86_64/pgtable.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/asm-x86_64/page.h b/include/asm-x86_64/page.h
index e5ab4d231f2..06e489f3247 100644
--- a/include/asm-x86_64/page.h
+++ b/include/asm-x86_64/page.h
@@ -11,7 +11,7 @@
 #define PAGE_SIZE	(1UL << PAGE_SHIFT)
 #endif
 #define PAGE_MASK	(~(PAGE_SIZE-1))
-#define PHYSICAL_PAGE_MASK	(~(PAGE_SIZE-1) & (__PHYSICAL_MASK << PAGE_SHIFT))
+#define PHYSICAL_PAGE_MASK	(~(PAGE_SIZE-1) & __PHYSICAL_MASK)
 
 #define THREAD_ORDER 1 
 #ifdef __ASSEMBLY__
diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h
index a204efb553d..f8e87a57f3a 100644
--- a/include/asm-x86_64/pgtable.h
+++ b/include/asm-x86_64/pgtable.h
@@ -246,7 +246,7 @@ static inline unsigned long pud_bad(pud_t pud)
 #define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))	/* FIXME: is this
 						   right? */
 #define pte_page(x)	pfn_to_page(pte_pfn(x))
-#define pte_pfn(x)  ((pte_val(x) >> PAGE_SHIFT) & __PHYSICAL_MASK)
+#define pte_pfn(x)  ((pte_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)
 
 static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
 {
@@ -353,7 +353,7 @@ static inline pud_t *__pud_offset_k(pud_t *pud, unsigned long address)
 #define pmd_clear(xp)	do { set_pmd(xp, __pmd(0)); } while (0)
 #define	pmd_bad(x)	((pmd_val(x) & (~PTE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE )
 #define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot)))
-#define pmd_pfn(x)  ((pmd_val(x) >> PAGE_SHIFT) & __PHYSICAL_MASK)
+#define pmd_pfn(x)  ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)
 
 #define pte_to_pgoff(pte) ((pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
 #define pgoff_to_pte(off) ((pte_t) { ((off) << PAGE_SHIFT) | _PAGE_FILE })
-- 
cgit v1.2.3


From 2bc0414ee04fd8bb798760801f5d7476dff44241 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 5 Nov 2005 17:25:53 +0100
Subject: [PATCH] x86_64: Only use asm/sections.h to declare section symbols

Adding __initdata_* to asm-generic/sections.h
Replaces a lot of open coded externs in arch/x86_64/*
I had to change __bss_end to __bss_stop to match the other architectures.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-generic/sections.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
index 886dbd11689..0b49f9e070f 100644
--- a/include/asm-generic/sections.h
+++ b/include/asm-generic/sections.h
@@ -13,5 +13,6 @@ extern char _eextratext[] __attribute__((weak));
 extern char _end[];
 extern char __per_cpu_start[], __per_cpu_end[];
 extern char __kprobes_text_start[], __kprobes_text_end[];
+extern char __initdata_begin[], __initdata_end[];
 
 #endif /* _ASM_GENERIC_SECTIONS_H_ */
-- 
cgit v1.2.3


From 485832a5d928facd82f1525270d9f048da2063a1 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 5 Nov 2005 17:25:54 +0100
Subject: [PATCH] x86_64: Use int operations in spinlocks to support more than
 128 CPUs spinning.

Pointed out by Eric Dumazet

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-x86_64/spinlock.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/asm-x86_64/spinlock.h b/include/asm-x86_64/spinlock.h
index 69636831ad2..fe484a699cc 100644
--- a/include/asm-x86_64/spinlock.h
+++ b/include/asm-x86_64/spinlock.h
@@ -18,22 +18,22 @@
  */
 
 #define __raw_spin_is_locked(x) \
-		(*(volatile signed char *)(&(x)->slock) <= 0)
+		(*(volatile signed int *)(&(x)->slock) <= 0)
 
 #define __raw_spin_lock_string \
 	"\n1:\t" \
-	"lock ; decb %0\n\t" \
+	"lock ; decl %0\n\t" \
 	"js 2f\n" \
 	LOCK_SECTION_START("") \
 	"2:\t" \
 	"rep;nop\n\t" \
-	"cmpb $0,%0\n\t" \
+	"cmpl $0,%0\n\t" \
 	"jle 2b\n\t" \
 	"jmp 1b\n" \
 	LOCK_SECTION_END
 
 #define __raw_spin_unlock_string \
-	"movb $1,%0" \
+	"movl $1,%0" \
 		:"=m" (lock->slock) : : "memory"
 
 static inline void __raw_spin_lock(raw_spinlock_t *lock)
@@ -47,10 +47,10 @@ static inline void __raw_spin_lock(raw_spinlock_t *lock)
 
 static inline int __raw_spin_trylock(raw_spinlock_t *lock)
 {
-	char oldval;
+	int oldval;
 
 	__asm__ __volatile__(
-		"xchgb %b0,%1"
+		"xchgl %0,%1"
 		:"=q" (oldval), "=m" (lock->slock)
 		:"0" (0) : "memory");
 
-- 
cgit v1.2.3


From 420f8f68c9c5148dddf946bebdbc7eacde2172cb Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 5 Nov 2005 17:25:54 +0100
Subject: [PATCH] x86_64: New heuristics to find out hotpluggable CPUs.

With a NR_CPUS==128 kernel with CPU hotplug enabled we would waste 4MB
on per CPU data of all possible CPUs.  The reason was that HOTPLUG
always set up possible map to NR_CPUS cpus and then we need to allocate
that much (each per CPU data is roughly ~32k now)

The underlying problem is that ACPI didn't tell us how many hotplug CPUs
the platform supports.  So the old code just assumed all, which would
lead to this memory wastage.

This implements some new heuristics:

 - If the BIOS specified disabled CPUs in the ACPI/mptables assume they
   can be enabled later (this is bending the ACPI specification a bit,
   but seems like a obvious extension)
 - The user can overwrite it with a new additionals_cpus=NUM option
 - Otherwise use half of the available CPUs or 2, whatever is more.

Cc: ashok.raj@intel.com
Cc: len.brown@intel.com

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-x86_64/smp.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h
index 592161e979e..cf8f969f902 100644
--- a/include/asm-x86_64/smp.h
+++ b/include/asm-x86_64/smp.h
@@ -81,6 +81,8 @@ extern int safe_smp_processor_id(void);
 extern int __cpu_disable(void);
 extern void __cpu_die(unsigned int cpu);
 extern void prefill_possible_map(void);
+extern unsigned num_processors;
+extern unsigned disabled_cpus;
 
 #endif /* !ASSEMBLY */
 
-- 
cgit v1.2.3


From ea0be473a1f0ee89024a24d8ea4b05fbf6efcee3 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 5 Nov 2005 17:25:54 +0100
Subject: [PATCH] x86_64: Allow modular build of ia32 aout loader

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-x86_64/ia32.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/asm-x86_64/ia32.h b/include/asm-x86_64/ia32.h
index 6efa00fe4e7..c7bc9c0525b 100644
--- a/include/asm-x86_64/ia32.h
+++ b/include/asm-x86_64/ia32.h
@@ -165,6 +165,11 @@ struct siginfo_t;
 int do_get_thread_area(struct thread_struct *t, struct user_desc __user *info);
 int do_set_thread_area(struct thread_struct *t, struct user_desc __user *info);
 int ia32_child_tls(struct task_struct *p, struct pt_regs *childregs);
+
+struct linux_binprm;
+extern int ia32_setup_arg_pages(struct linux_binprm *bprm,
+				unsigned long stack_top, int exec_stack);
+
 #endif
 
 #endif /* !CONFIG_IA32_SUPPORT */
-- 
cgit v1.2.3


From a88cde13bae3fffd6ecc812bdd02c91eafb6073e Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 5 Nov 2005 17:25:54 +0100
Subject: [PATCH] x86_64: Formatting fixes for arch/x86_64/kernel/process.c

No functional changes.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-x86_64/msr.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/asm-x86_64/msr.h b/include/asm-x86_64/msr.h
index 5a7fe3c6c3d..24dc39651bc 100644
--- a/include/asm-x86_64/msr.h
+++ b/include/asm-x86_64/msr.h
@@ -19,7 +19,7 @@
 			    : "=a" (a__), "=d" (b__) \
 			    : "c" (msr)); \
        val = a__ | (b__<<32); \
-} while(0); 
+} while(0)
 
 #define wrmsr(msr,val1,val2) \
      __asm__ __volatile__("wrmsr" \
-- 
cgit v1.2.3


From e90f22edf432512219cc2952f5811961abbd164f Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 5 Nov 2005 17:25:54 +0100
Subject: [PATCH] x86_64: Fix NUMA node lookup debug code which had bitrotted

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-x86_64/mmzone.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/asm-x86_64/mmzone.h b/include/asm-x86_64/mmzone.h
index 8d48dc7e43a..69baaa8a3ce 100644
--- a/include/asm-x86_64/mmzone.h
+++ b/include/asm-x86_64/mmzone.h
@@ -17,16 +17,15 @@
 /* Simple perfect hash to map physical addresses to node numbers */
 extern int memnode_shift; 
 extern u8  memnodemap[NODEMAPSIZE]; 
-extern int maxnode;
 
 extern struct pglist_data *node_data[];
 
 static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) 
 { 
-	int nid; 
+	unsigned nid; 
 	VIRTUAL_BUG_ON((addr >> memnode_shift) >= NODEMAPSIZE);
 	nid = memnodemap[addr >> memnode_shift]; 
-	VIRTUAL_BUG_ON(nid > maxnode); 
+	VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]); 
 	return nid; 
 } 
 
-- 
cgit v1.2.3


From 94605eff572b727aaad9b4b29bc358b919096503 Mon Sep 17 00:00:00 2001
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Date: Sat, 5 Nov 2005 17:25:54 +0100
Subject: [PATCH] x86-64/i386: Intel HT, Multi core detection fixes

Fields obtained through cpuid vector 0x1(ebx[16:23]) and
vector 0x4(eax[14:25], eax[26:31]) indicate the maximum values and might not
always be the same as what is available and what OS sees.  So make sure
"siblings" and "cpu cores" values in /proc/cpuinfo reflect the values as seen
by OS instead of what cpuid instruction says. This will also fix the buggy BIOS
cases (for example where cpuid on a single core cpu says there are "2" siblings,
even when HT is disabled in the BIOS.
http://bugzilla.kernel.org/show_bug.cgi?id=4359)

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-i386/processor.h   |  4 +++-
 include/asm-x86_64/processor.h |  4 +++-
 include/linux/bitops.h         | 10 ++++++++++
 3 files changed, 16 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h
index 0a4ec764377..9cd4a05234a 100644
--- a/include/asm-i386/processor.h
+++ b/include/asm-i386/processor.h
@@ -65,7 +65,9 @@ struct cpuinfo_x86 {
 	int	f00f_bug;
 	int	coma_bug;
 	unsigned long loops_per_jiffy;
-	unsigned char x86_num_cores;
+	unsigned char x86_max_cores;	/* cpuid returned max cores value */
+	unsigned char booted_cores;	/* number of cores as seen by OS */
+	unsigned char apicid;
 } __attribute__((__aligned__(SMP_CACHE_BYTES)));
 
 #define X86_VENDOR_INTEL 0
diff --git a/include/asm-x86_64/processor.h b/include/asm-x86_64/processor.h
index 03837d34fba..4861246548f 100644
--- a/include/asm-x86_64/processor.h
+++ b/include/asm-x86_64/processor.h
@@ -61,10 +61,12 @@ struct cpuinfo_x86 {
 	int	x86_cache_alignment;
 	int	x86_tlbsize;	/* number of 4K pages in DTLB/ITLB combined(in pages)*/
         __u8    x86_virt_bits, x86_phys_bits;
-	__u8	x86_num_cores;
+	__u8	x86_max_cores;	/* cpuid returned max cores value */
         __u32   x86_power; 	
 	__u32   extended_cpuid_level;	/* Max extended CPUID function supported */
 	unsigned long loops_per_jiffy;
+	__u8	apicid;
+	__u8	booted_cores;	/* number of cores as seen by OS */
 } ____cacheline_aligned;
 
 #define X86_VENDOR_INTEL 0
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index cb3c3ef50f5..38c2fb7ebe0 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -84,6 +84,16 @@ static __inline__ int get_bitmask_order(unsigned int count)
 	return order;	/* We could be slightly more clever with -1 here... */
 }
 
+static __inline__ int get_count_order(unsigned int count)
+{
+	int order;
+	
+	order = fls(count) - 1;
+	if (count & (count - 1))
+		order++;
+	return order;
+}
+
 /*
  * hweightN: returns the hamming weight (i.e. the number
  * of bits set) of a N-bit word
-- 
cgit v1.2.3


From a6f5deb2be4c82f24fefadcbf7e448f540c05ae6 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 5 Nov 2005 17:25:54 +0100
Subject: [PATCH] x86_64: Reduce number of retries for reset through keyboard
 controller

Old code could retry for 10 seconds worst time. Only try it
for one second now.

Suggested by Yinghai Lu

Cc: Yinghai.Lu@amd.com

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-i386/mach-default/mach_reboot.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/asm-i386/mach-default/mach_reboot.h b/include/asm-i386/mach-default/mach_reboot.h
index 06ae4d81ba6..a955e57ad01 100644
--- a/include/asm-i386/mach-default/mach_reboot.h
+++ b/include/asm-i386/mach-default/mach_reboot.h
@@ -19,7 +19,7 @@ static inline void kb_wait(void)
 static inline void mach_reboot(void)
 {
 	int i;
-	for (i = 0; i < 100; i++) {
+	for (i = 0; i < 10; i++) {
 		kb_wait();
 		udelay(50);
 		outb(0x60, 0x64);	/* write Controller Command Byte */
-- 
cgit v1.2.3


From 8e0d4f4e9132ae6e353f9cf27261627bcc7c65cc Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 5 Nov 2005 17:25:54 +0100
Subject: [PATCH] x86_64: Remove asm-x86_64/rwsem.h

Not needed since x86-64 always uses the spinlock based rwsems.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-x86_64/rwsem.h | 283 ---------------------------------------------
 1 file changed, 283 deletions(-)
 delete mode 100644 include/asm-x86_64/rwsem.h

(limited to 'include')

diff --git a/include/asm-x86_64/rwsem.h b/include/asm-x86_64/rwsem.h
deleted file mode 100644
index 46077e9c191..00000000000
--- a/include/asm-x86_64/rwsem.h
+++ /dev/null
@@ -1,283 +0,0 @@
-/* rwsem.h: R/W semaphores implemented using XADD/CMPXCHG for x86_64+
- *
- * Written by David Howells (dhowells@redhat.com).
- * Ported by Andi Kleen <ak@suse.de> to x86-64.
- *
- * Derived from asm-i386/semaphore.h and asm-i386/rwsem.h
- *
- *
- * The MSW of the count is the negated number of active writers and waiting
- * lockers, and the LSW is the total number of active locks
- *
- * The lock count is initialized to 0 (no active and no waiting lockers).
- *
- * When a writer subtracts WRITE_BIAS, it'll get 0xffff0001 for the case of an
- * uncontended lock. This can be determined because XADD returns the old value.
- * Readers increment by 1 and see a positive value when uncontended, negative
- * if there are writers (and maybe) readers waiting (in which case it goes to
- * sleep).
- *
- * The value of WAITING_BIAS supports up to 32766 waiting processes. This can
- * be extended to 65534 by manually checking the whole MSW rather than relying
- * on the S flag.
- *
- * The value of ACTIVE_BIAS supports up to 65535 active processes.
- *
- * This should be totally fair - if anything is waiting, a process that wants a
- * lock will go to the back of the queue. When the currently active lock is
- * released, if there's a writer at the front of the queue, then that and only
- * that will be woken up; if there's a bunch of consecutive readers at the
- * front, then they'll all be woken up, but no other readers will be.
- */
-
-#ifndef _X8664_RWSEM_H
-#define _X8664_RWSEM_H
-
-#ifndef _LINUX_RWSEM_H
-#error "please don't include asm/rwsem.h directly, use linux/rwsem.h instead"
-#endif
-
-#ifdef __KERNEL__
-
-#include <linux/list.h>
-#include <linux/spinlock.h>
-
-struct rwsem_waiter;
-
-extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *);
-extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
-
-/*
- * the semaphore definition
- */
-struct rw_semaphore {
-	signed int		count;
-#define RWSEM_UNLOCKED_VALUE		0x00000000
-#define RWSEM_ACTIVE_BIAS		0x00000001
-#define RWSEM_ACTIVE_MASK		0x0000ffff
-#define RWSEM_WAITING_BIAS		(-0x00010000)
-#define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
-#define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
-	spinlock_t		wait_lock;
-	struct list_head	wait_list;
-#if RWSEM_DEBUG
-	int			debug;
-#endif
-};
-
-/*
- * initialisation
- */
-#if RWSEM_DEBUG
-#define __RWSEM_DEBUG_INIT      , 0
-#else
-#define __RWSEM_DEBUG_INIT	/* */
-#endif
-
-#define __RWSEM_INITIALIZER(name) \
-{ RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, LIST_HEAD_INIT((name).wait_list) \
-	__RWSEM_DEBUG_INIT }
-
-#define DECLARE_RWSEM(name) \
-	struct rw_semaphore name = __RWSEM_INITIALIZER(name)
-
-static inline void init_rwsem(struct rw_semaphore *sem)
-{
-	sem->count = RWSEM_UNLOCKED_VALUE;
-	spin_lock_init(&sem->wait_lock);
-	INIT_LIST_HEAD(&sem->wait_list);
-#if RWSEM_DEBUG
-	sem->debug = 0;
-#endif
-}
-
-/*
- * lock for reading
- */
-static inline void __down_read(struct rw_semaphore *sem)
-{
-	__asm__ __volatile__(
-		"# beginning down_read\n\t"
-LOCK_PREFIX	"  incl      (%%rdi)\n\t" /* adds 0x00000001, returns the old value */
-		"  js        2f\n\t" /* jump if we weren't granted the lock */
-		"1:\n\t"
-		LOCK_SECTION_START("") \
-		"2:\n\t"
-		"  call      rwsem_down_read_failed_thunk\n\t"
-		"  jmp       1b\n"
-		LOCK_SECTION_END \
-		"# ending down_read\n\t"
-		: "+m"(sem->count)
-		: "D"(sem)
-		: "memory", "cc");
-}
-
-
-/*
- * trylock for reading -- returns 1 if successful, 0 if contention
- */
-static inline int __down_read_trylock(struct rw_semaphore *sem)
-{
-	__s32 result, tmp;
-	__asm__ __volatile__(
-		"# beginning __down_read_trylock\n\t"
-		"  movl      %0,%1\n\t"
-		"1:\n\t"
-		"  movl	     %1,%2\n\t"
-		"  addl      %3,%2\n\t"
-		"  jle	     2f\n\t"
-LOCK_PREFIX	"  cmpxchgl  %2,%0\n\t"
-		"  jnz	     1b\n\t"
-		"2:\n\t"
-		"# ending __down_read_trylock\n\t"
-		: "+m"(sem->count), "=&a"(result), "=&r"(tmp)
-		: "i"(RWSEM_ACTIVE_READ_BIAS)
-		: "memory", "cc");
-	return result>=0 ? 1 : 0;
-}
-
-
-/*
- * lock for writing
- */
-static inline void __down_write(struct rw_semaphore *sem)
-{
-	int tmp;
-
-	tmp = RWSEM_ACTIVE_WRITE_BIAS;
-	__asm__ __volatile__(
-		"# beginning down_write\n\t"
-LOCK_PREFIX	"  xaddl      %0,(%%rdi)\n\t" /* subtract 0x0000ffff, returns the old value */
-		"  testl     %0,%0\n\t" /* was the count 0 before? */
-		"  jnz       2f\n\t" /* jump if we weren't granted the lock */
-		"1:\n\t"
-		LOCK_SECTION_START("")
-		"2:\n\t"
-		"  call      rwsem_down_write_failed_thunk\n\t"
-		"  jmp       1b\n"
-		LOCK_SECTION_END
-		"# ending down_write"
-		: "=&r" (tmp) 
-		: "0"(tmp), "D"(sem)
-		: "memory", "cc");
-}
-
-/*
- * trylock for writing -- returns 1 if successful, 0 if contention
- */
-static inline int __down_write_trylock(struct rw_semaphore *sem)
-{
-	signed long ret = cmpxchg(&sem->count,
-				  RWSEM_UNLOCKED_VALUE, 
-				  RWSEM_ACTIVE_WRITE_BIAS);
-	if (ret == RWSEM_UNLOCKED_VALUE)
-		return 1;
-	return 0;
-}
-
-/*
- * unlock after reading
- */
-static inline void __up_read(struct rw_semaphore *sem)
-{
-	__s32 tmp = -RWSEM_ACTIVE_READ_BIAS;
-	__asm__ __volatile__(
-		"# beginning __up_read\n\t"
-LOCK_PREFIX	"  xaddl      %[tmp],(%%rdi)\n\t" /* subtracts 1, returns the old value */
-		"  js        2f\n\t" /* jump if the lock is being waited upon */
-		"1:\n\t"
-		LOCK_SECTION_START("")
-		"2:\n\t"
-		"  decw      %w[tmp]\n\t" /* do nothing if still outstanding active readers */
-		"  jnz       1b\n\t"
-		"  call      rwsem_wake_thunk\n\t"
-		"  jmp       1b\n"
-		LOCK_SECTION_END
-		"# ending __up_read\n"
-		: "+m"(sem->count), [tmp] "+r" (tmp)
-		: "D"(sem)
-		: "memory", "cc");
-}
-
-/*
- * unlock after writing
- */
-static inline void __up_write(struct rw_semaphore *sem)
-{
-	unsigned tmp; 
-	__asm__ __volatile__(
-		"# beginning __up_write\n\t"
-		"  movl     %[bias],%[tmp]\n\t"
-LOCK_PREFIX	"  xaddl     %[tmp],(%%rdi)\n\t" /* tries to transition 0xffff0001 -> 0x00000000 */
-		"  jnz       2f\n\t" /* jump if the lock is being waited upon */
-		"1:\n\t"
-		LOCK_SECTION_START("")
-		"2:\n\t"
-		"  decw      %w[tmp]\n\t" /* did the active count reduce to 0? */
-		"  jnz       1b\n\t" /* jump back if not */
-		"  call      rwsem_wake_thunk\n\t"
-		"  jmp       1b\n"
-		LOCK_SECTION_END
-		"# ending __up_write\n"
-		: "+m"(sem->count), [tmp] "=r" (tmp)
-		: "D"(sem), [bias] "i"(-RWSEM_ACTIVE_WRITE_BIAS)
-		: "memory", "cc");
-}
-
-/*
- * downgrade write lock to read lock
- */
-static inline void __downgrade_write(struct rw_semaphore *sem)
-{
-	__asm__ __volatile__(
-		"# beginning __downgrade_write\n\t"
-LOCK_PREFIX	"  addl      %[bias],(%%rdi)\n\t" /* transitions 0xZZZZ0001 -> 0xYYYY0001 */
-		"  js        2f\n\t" /* jump if the lock is being waited upon */
-		"1:\n\t"
-		LOCK_SECTION_START("")
-		"2:\n\t"
-		"  call	     rwsem_downgrade_thunk\n"
-		"  jmp       1b\n"
-		LOCK_SECTION_END
-		"# ending __downgrade_write\n"
-		: "=m"(sem->count)
-		: "D"(sem), [bias] "i"(-RWSEM_WAITING_BIAS), "m"(sem->count)
-		: "memory", "cc");
-}
-
-/*
- * implement atomic add functionality
- */
-static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem)
-{
-	__asm__ __volatile__(
-LOCK_PREFIX	"addl %1,%0"
-		:"=m"(sem->count)
-		:"ir"(delta), "m"(sem->count));
-}
-
-/*
- * implement exchange and add functionality
- */
-static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem)
-{
-	int tmp = delta;
-
-	__asm__ __volatile__(
-LOCK_PREFIX	"xaddl %0,(%2)"
-		: "=r"(tmp), "=m"(sem->count)
-		: "r"(sem), "m"(sem->count), "0" (tmp)
-		: "memory");
-
-	return tmp+delta;
-}
-
-static inline int rwsem_is_locked(struct rw_semaphore *sem)
-{
-	return (sem->count != 0);
-}
-
-#endif /* __KERNEL__ */
-#endif /* _X8664_RWSEM_H */
-- 
cgit v1.2.3


From bf0f2e23834e2bf7d64b467ef07095b1c7e2c04b Mon Sep 17 00:00:00 2001
From: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Date: Sat, 5 Nov 2005 17:25:54 +0100
Subject: [PATCH] x86_64: Set ____cacheline_maxaligned_in_smp alignment to 128
 bytes

The current value was correct before the introduction of Intel EM64T support -
but now L1_CACHE_SHIFT_MAX can be less than L1_CACHE_SHIFT, which _is_ funny!

Between the few users of ____cacheline_maxaligned_in_smp, we also have (for
example) rcu_ctrlblk, and struct zone, with zone->{lru_,}lock.  I.e.  we have
a lot of excess cacheline bouncing on them.

No correctness issues, obviously.  So this could even be merged for 2.6.14
(I'm not a fan of this idea, though).

CC: Andi Kleen <ak@suse.de>
Signed-off-by: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-x86_64/cache.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/asm-x86_64/cache.h b/include/asm-x86_64/cache.h
index eda62bae124..33e53424128 100644
--- a/include/asm-x86_64/cache.h
+++ b/include/asm-x86_64/cache.h
@@ -9,6 +9,6 @@
 /* L1 cache line size */
 #define L1_CACHE_SHIFT	(CONFIG_X86_L1_CACHE_SHIFT)
 #define L1_CACHE_BYTES	(1 << L1_CACHE_SHIFT)
-#define L1_CACHE_SHIFT_MAX 6	/* largest L1 which this arch supports */
+#define L1_CACHE_SHIFT_MAX 7	/* largest L1 which this arch supports */
 
 #endif
-- 
cgit v1.2.3


From efbbdce94f6ea54cf06d9a06e4c95f6874ad64a8 Mon Sep 17 00:00:00 2001
From: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Date: Sat, 5 Nov 2005 17:25:54 +0100
Subject: [PATCH] x86_64: Use common sys_time64

Keeping this function does not makes sense because it's a copied (and
buggy) copy of sys_time.  The only difference is that now.tv_sec (which is
a time_t, i.e.  a 64-bit long) is copied (and truncated) into a int
(32-bit).

The prototype is the same (they both take a long __user *), so let's drop
this and redirect it to sys_time (and make sure it exists by defining
__ARCH_WANT_SYS_TIME).

Only disadvantage is that the sys_stime definition is also compiled (may be
fixed if needed by adding a separate __ARCH_WANT_SYS_STIME macro, and
defining it for all arch's defining __ARCH_WANT_SYS_TIME except x86_64).

Acked-by: Andi Kleen <ak@suse.de>
Signed-off-by: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-x86_64/unistd.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index 3c494b65d33..2c42150bce0 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -462,7 +462,7 @@ __SYSCALL(__NR_fremovexattr, sys_fremovexattr)
 #define __NR_tkill	200
 __SYSCALL(__NR_tkill, sys_tkill) 
 #define __NR_time      201
-__SYSCALL(__NR_time, sys_time64)
+__SYSCALL(__NR_time, sys_time)
 #define __NR_futex     202
 __SYSCALL(__NR_futex, sys_futex)
 #define __NR_sched_setaffinity    203
@@ -608,6 +608,7 @@ do { \
 #define __ARCH_WANT_SYS_SIGPENDING
 #define __ARCH_WANT_SYS_SIGPROCMASK
 #define __ARCH_WANT_SYS_RT_SIGACTION
+#define __ARCH_WANT_SYS_TIME
 #define __ARCH_WANT_COMPAT_SYS_TIME
 #endif
 
-- 
cgit v1.2.3


From 8893166ff8694f36655009aa9bf8e7f2e1c9339f Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 5 Nov 2005 17:25:54 +0100
Subject: [PATCH] x86_64: Increase the maximum number of local APICs to the
 maximum

This is needed for large multinode IBM systems which have a sparse
APIC space in clustered mode, fully covering the available 8 bits.

The previous kernels would limit the local APIC number to 127,
which caused it to reject some of the CPUs at boot.

I increased the maximum and shrunk the apic_version array a bit
to make up for that (the version is only 8 bit, so don't need
an full int to store)

Cc:  Chris McDermott <lcm@us.ibm.com>

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-x86_64/mpspec.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/asm-x86_64/mpspec.h b/include/asm-x86_64/mpspec.h
index 6b375384f5c..6f8a17d105a 100644
--- a/include/asm-x86_64/mpspec.h
+++ b/include/asm-x86_64/mpspec.h
@@ -16,7 +16,7 @@
 /*
  * A maximum of 255 APICs with the current APIC ID architecture.
  */
-#define MAX_APICS 128
+#define MAX_APICS 255
 
 struct intel_mp_floating
 {
@@ -173,7 +173,7 @@ extern int smp_found_config;
 extern void find_smp_config (void);
 extern void get_smp_config (void);
 extern int nr_ioapics;
-extern int apic_version [MAX_APICS];
+extern unsigned char apic_version [MAX_APICS];
 extern int mp_irq_entries;
 extern struct mpc_config_intsrc mp_irqs [MAX_IRQ_SOURCES];
 extern int mpc_default_type;
-- 
cgit v1.2.3