aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/x86/earlyprintk.txt101
-rw-r--r--Makefile2
-rw-r--r--arch/arm/kernel/setup.c13
-rw-r--r--arch/arm/mach-at91/pm.c1
-rw-r--r--arch/arm/mm/abort-ev6.S3
-rw-r--r--arch/arm/plat-s3c64xx/irq-eint.c2
-rw-r--r--arch/powerpc/platforms/86xx/gef_sbc610.c4
-rw-r--r--arch/s390/crypto/aes_s390.c2
-rw-r--r--arch/x86/Kconfig5
-rw-r--r--arch/x86/include/asm/apicdef.h1
-rw-r--r--arch/x86/include/asm/efi.h2
-rw-r--r--arch/x86/include/asm/fixmap.h10
-rw-r--r--arch/x86/include/asm/i387.h8
-rw-r--r--arch/x86/include/asm/init.h18
-rw-r--r--arch/x86/include/asm/mce.h35
-rw-r--r--arch/x86/include/asm/msr-index.h5
-rw-r--r--arch/x86/include/asm/page_types.h6
-rw-r--r--arch/x86/include/asm/pat.h5
-rw-r--r--arch/x86/include/asm/pgtable_32_types.h5
-rw-r--r--arch/x86/include/asm/pgtable_types.h1
-rw-r--r--arch/x86/kernel/alternative.c17
-rw-r--r--arch/x86/kernel/apic/apic.c15
-rw-r--r--arch/x86/kernel/cpu/mcheck/Makefile1
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_32.c14
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_64.c530
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd_64.c22
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel_64.c207
-rw-r--r--arch/x86/kernel/cpu/mcheck/threshold.c29
-rw-r--r--arch/x86/kernel/efi.c7
-rw-r--r--arch/x86/kernel/efi_64.c21
-rw-r--r--arch/x86/kernel/i387.c2
-rw-r--r--arch/x86/kernel/mpparse.c25
-rw-r--r--arch/x86/kernel/reboot.c8
-rw-r--r--arch/x86/kernel/setup.c9
-rw-r--r--arch/x86/math-emu/fpu_aux.c31
-rw-r--r--arch/x86/mm/highmem_32.c9
-rw-r--r--arch/x86/mm/init.c344
-rw-r--r--arch/x86/mm/init_32.c255
-rw-r--r--arch/x86/mm/init_64.c272
-rw-r--r--arch/x86/mm/ioremap.c14
-rw-r--r--arch/x86/mm/kmmio.c149
-rw-r--r--arch/x86/mm/numa_32.c5
-rw-r--r--arch/x86/mm/testmmiotrace.c70
-rw-r--r--crypto/api.c15
-rw-r--r--drivers/crypto/ixp4xx_crypto.c6
-rw-r--r--drivers/crypto/padlock-aes.c2
-rw-r--r--drivers/crypto/padlock-sha.c4
-rw-r--r--drivers/dma/iop-adma.c2
-rw-r--r--drivers/dma/mv_xor.c2
-rw-r--r--drivers/gpu/drm/drm_stub.c2
-rw-r--r--drivers/i2c/busses/i2c-mv64xxx.c4
-rw-r--r--drivers/mtd/nand/orion_nand.c2
-rw-r--r--drivers/net/arm/Makefile2
-rw-r--r--drivers/net/arm/etherh.c10
-rw-r--r--drivers/video/pxafb.c2
-rw-r--r--include/linux/rcuclassic.h6
-rw-r--r--include/linux/rcupdate.h4
-rw-r--r--include/linux/rcupreempt.h15
-rw-r--r--include/linux/rcutree.h6
-rw-r--r--include/linux/sched.h4
-rw-r--r--init/main.c3
-rw-r--r--kernel/rcuclassic.c4
-rw-r--r--kernel/rcupdate.c12
-rw-r--r--kernel/rcupreempt.c3
-rw-r--r--kernel/rcutree.c4
-rw-r--r--kernel/sched.c15
-rw-r--r--kernel/sys.c31
-rw-r--r--kernel/user.c18
68 files changed, 1638 insertions, 825 deletions
diff --git a/Documentation/x86/earlyprintk.txt b/Documentation/x86/earlyprintk.txt
new file mode 100644
index 00000000000..607b1a01606
--- /dev/null
+++ b/Documentation/x86/earlyprintk.txt
@@ -0,0 +1,101 @@
+
+Mini-HOWTO for using the earlyprintk=dbgp boot option with a
+USB2 Debug port key and a debug cable, on x86 systems.
+
+You need two computers, the 'USB debug key' special gadget and
+and two USB cables, connected like this:
+
+ [host/target] <-------> [USB debug key] <-------> [client/console]
+
+1. There are three specific hardware requirements:
+
+ a.) Host/target system needs to have USB debug port capability.
+
+ You can check this capability by looking at a 'Debug port' bit in
+ the lspci -vvv output:
+
+ # lspci -vvv
+ ...
+ 00:1d.7 USB Controller: Intel Corporation 82801H (ICH8 Family) USB2 EHCI Controller #1 (rev 03) (prog-if 20 [EHCI])
+ Subsystem: Lenovo ThinkPad T61
+ Control: I/O- Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR+ FastB2B- DisINTx-
+ Status: Cap+ 66MHz- UDF- FastB2B+ ParErr- DEVSEL=medium >TAbort- <TAbort- <MAbort- >SERR- <PERR- INTx-
+ Latency: 0
+ Interrupt: pin D routed to IRQ 19
+ Region 0: Memory at fe227000 (32-bit, non-prefetchable) [size=1K]
+ Capabilities: [50] Power Management version 2
+ Flags: PMEClk- DSI- D1- D2- AuxCurrent=375mA PME(D0+,D1-,D2-,D3hot+,D3cold+)
+ Status: D0 PME-Enable- DSel=0 DScale=0 PME+
+ Capabilities: [58] Debug port: BAR=1 offset=00a0
+ ^^^^^^^^^^^ <==================== [ HERE ]
+ Kernel driver in use: ehci_hcd
+ Kernel modules: ehci-hcd
+ ...
+
+( If your system does not list a debug port capability then you probably
+ wont be able to use the USB debug key. )
+
+ b.) You also need a Netchip USB debug cable/key:
+
+ http://www.plxtech.com/products/NET2000/NET20DC/default.asp
+
+ This is a small blue plastic connector with two USB connections,
+ it draws power from its USB connections.
+
+ c.) Thirdly, you need a second client/console system with a regular USB port.
+
+2. Software requirements:
+
+ a.) On the host/target system:
+
+ You need to enable the following kernel config option:
+
+ CONFIG_EARLY_PRINTK_DBGP=y
+
+ And you need to add the boot command line: "earlyprintk=dbgp".
+ (If you are using Grub, append it to the 'kernel' line in
+ /etc/grub.conf)
+
+ NOTE: normally earlyprintk console gets turned off once the
+ regular console is alive - use "earlyprintk=dbgp,keep" to keep
+ this channel open beyond early bootup. This can be useful for
+ debugging crashes under Xorg, etc.
+
+ b.) On the client/console system:
+
+ You should enable the following kernel config option:
+
+ CONFIG_USB_SERIAL_DEBUG=y
+
+ On the next bootup with the modified kernel you should
+ get a /dev/ttyUSBx device(s).
+
+ Now this channel of kernel messages is ready to be used: start
+ your favorite terminal emulator (minicom, etc.) and set
+ it up to use /dev/ttyUSB0 - or use a raw 'cat /dev/ttyUSBx' to
+ see the raw output.
+
+ c.) On Nvidia Southbridge based systems: the kernel will try to probe
+ and find out which port has debug device connected.
+
+3. Testing that it works fine:
+
+ You can test the output by using earlyprintk=dbgp,keep and provoking
+ kernel messages on the host/target system. You can provoke a harmless
+ kernel message by for example doing:
+
+ echo h > /proc/sysrq-trigger
+
+ On the host/target system you should see this help line in "dmesg" output:
+
+ SysRq : HELP : loglevel(0-9) reBoot Crashdump terminate-all-tasks(E) memory-full-oom-kill(F) kill-all-tasks(I) saK show-backtrace-all-active-cpus(L) show-memory-usage(M) nice-all-RT-tasks(N) powerOff show-registers(P) show-all-timers(Q) unRaw Sync show-task-states(T) Unmount show-blocked-tasks(W) dump-ftrace-buffer(Z)
+
+ On the client/console system do:
+
+ cat /dev/ttyUSB0
+
+ And you should see the help line above displayed shortly after you've
+ provoked it on the host system.
+
+If it does not work then please ask about it on the linux-kernel@vger.kernel.org
+mailing list or contact the x86 maintainers.
diff --git a/Makefile b/Makefile
index 27fb890a2bf..c40d83aedeb 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
VERSION = 2
PATCHLEVEL = 6
SUBLEVEL = 29
-EXTRAVERSION = -rc6
+EXTRAVERSION = -rc7
NAME = Erotic Pickled Herring
# *DOCUMENTATION*
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index 7049815d66d..68d6494c038 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -233,12 +233,13 @@ static void __init cacheid_init(void)
unsigned int cachetype = read_cpuid_cachetype();
unsigned int arch = cpu_architecture();
- if (arch >= CPU_ARCH_ARMv7) {
- cacheid = CACHEID_VIPT_NONALIASING;
- if ((cachetype & (3 << 14)) == 1 << 14)
- cacheid |= CACHEID_ASID_TAGGED;
- } else if (arch >= CPU_ARCH_ARMv6) {
- if (cachetype & (1 << 23))
+ if (arch >= CPU_ARCH_ARMv6) {
+ if ((cachetype & (7 << 29)) == 4 << 29) {
+ /* ARMv7 register format */
+ cacheid = CACHEID_VIPT_NONALIASING;
+ if ((cachetype & (3 << 14)) == 1 << 14)
+ cacheid |= CACHEID_ASID_TAGGED;
+ } else if (cachetype & (1 << 23))
cacheid = CACHEID_VIPT_ALIASING;
else
cacheid = CACHEID_VIPT_NONALIASING;
diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c
index 9bb4f043aa2..7ac812dc055 100644
--- a/arch/arm/mach-at91/pm.c
+++ b/arch/arm/mach-at91/pm.c
@@ -332,7 +332,6 @@ static int at91_pm_enter(suspend_state_t state)
at91_sys_read(AT91_AIC_IPR) & at91_sys_read(AT91_AIC_IMR));
error:
- sdram_selfrefresh_disable();
target_state = PM_SUSPEND_ON;
at91_irq_resume();
at91_gpio_resume();
diff --git a/arch/arm/mm/abort-ev6.S b/arch/arm/mm/abort-ev6.S
index 8a7f65ba14b..94077fbd96b 100644
--- a/arch/arm/mm/abort-ev6.S
+++ b/arch/arm/mm/abort-ev6.S
@@ -23,7 +23,8 @@ ENTRY(v6_early_abort)
#ifdef CONFIG_CPU_32v6K
clrex
#else
- strex r0, r1, [sp] @ Clear the exclusive monitor
+ sub r1, sp, #4 @ Get unused stack location
+ strex r0, r1, [r1] @ Clear the exclusive monitor
#endif
mrc p15, 0, r1, c5, c0, 0 @ get FSR
mrc p15, 0, r0, c6, c0, 0 @ get FAR
diff --git a/arch/arm/plat-s3c64xx/irq-eint.c b/arch/arm/plat-s3c64xx/irq-eint.c
index 1f7cc0067f5..ebb305ce768 100644
--- a/arch/arm/plat-s3c64xx/irq-eint.c
+++ b/arch/arm/plat-s3c64xx/irq-eint.c
@@ -55,7 +55,7 @@ static void s3c_irq_eint_unmask(unsigned int irq)
u32 mask;
mask = __raw_readl(S3C64XX_EINT0MASK);
- mask |= eint_irq_to_bit(irq);
+ mask &= ~eint_irq_to_bit(irq);
__raw_writel(mask, S3C64XX_EINT0MASK);
}
diff --git a/arch/powerpc/platforms/86xx/gef_sbc610.c b/arch/powerpc/platforms/86xx/gef_sbc610.c
index fb371f5ce13..d6b772ba3b8 100644
--- a/arch/powerpc/platforms/86xx/gef_sbc610.c
+++ b/arch/powerpc/platforms/86xx/gef_sbc610.c
@@ -142,6 +142,10 @@ static void __init gef_sbc610_nec_fixup(struct pci_dev *pdev)
{
unsigned int val;
+ /* Do not do the fixup on other platforms! */
+ if (!machine_is(gef_sbc610))
+ return;
+
printk(KERN_INFO "Running NEC uPD720101 Fixup\n");
/* Ensure ports 1, 2, 3, 4 & 5 are enabled */
diff --git a/arch/s390/crypto/aes_s390.c b/arch/s390/crypto/aes_s390.c
index c42cd898f68..6118890c946 100644
--- a/arch/s390/crypto/aes_s390.c
+++ b/arch/s390/crypto/aes_s390.c
@@ -556,7 +556,7 @@ static void __exit aes_s390_fini(void)
module_init(aes_s390_init);
module_exit(aes_s390_fini);
-MODULE_ALIAS("aes");
+MODULE_ALIAS("aes-all");
MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm");
MODULE_LICENSE("GPL");
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f5cef3fbf9a..31758378bcd 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -783,6 +783,11 @@ config X86_MCE_AMD
Additional support for AMD specific MCE features such as
the DRAM Error Threshold.
+config X86_MCE_THRESHOLD
+ depends on X86_MCE_AMD || X86_MCE_INTEL
+ bool
+ default y
+
config X86_MCE_NONFATAL
tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4"
depends on X86_32 && X86_MCE
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 63134e31e8b..bc9514fb3b1 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -53,6 +53,7 @@
#define APIC_ESR_SENDILL 0x00020
#define APIC_ESR_RECVILL 0x00040
#define APIC_ESR_ILLREGA 0x00080
+#define APIC_LVTCMCI 0x2f0
#define APIC_ICR 0x300
#define APIC_DEST_SELF 0x40000
#define APIC_DEST_ALLINC 0x80000
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index ca5ffb2856b..edc90f23e70 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -37,8 +37,6 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...);
#else /* !CONFIG_X86_32 */
-#define MAX_EFI_IO_PAGES 100
-
extern u64 efi_call0(void *fp);
extern u64 efi_call1(void *fp, u64 arg1);
extern u64 efi_call2(void *fp, u64 arg1, u64 arg2);
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index dca8f03da5b..63a79c77d22 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -24,9 +24,6 @@
#include <asm/kmap_types.h>
#else
#include <asm/vsyscall.h>
-#ifdef CONFIG_EFI
-#include <asm/efi.h>
-#endif
#endif
/*
@@ -92,13 +89,6 @@ enum fixed_addresses {
FIX_IO_APIC_BASE_0,
FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
#endif
-#ifdef CONFIG_X86_64
-#ifdef CONFIG_EFI
- FIX_EFI_IO_MAP_LAST_PAGE,
- FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE
- + MAX_EFI_IO_PAGES - 1,
-#endif
-#endif
#ifdef CONFIG_X86_VISWS_APIC
FIX_CO_CPU, /* Cobalt timer */
FIX_CO_APIC, /* Cobalt APIC Redirection Table */
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 48f0004db8c..71c9e518398 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -172,7 +172,13 @@ static inline void __save_init_fpu(struct task_struct *tsk)
#else /* CONFIG_X86_32 */
-extern void finit(void);
+#ifdef CONFIG_MATH_EMULATION
+extern void finit_task(struct task_struct *tsk);
+#else
+static inline void finit_task(struct task_struct *tsk)
+{
+}
+#endif
static inline void tolerant_fwait(void)
{
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
new file mode 100644
index 00000000000..36fb1a6a510
--- /dev/null
+++ b/arch/x86/include/asm/init.h
@@ -0,0 +1,18 @@
+#ifndef _ASM_X86_INIT_32_H
+#define _ASM_X86_INIT_32_H
+
+#ifdef CONFIG_X86_32
+extern void __init early_ioremap_page_table_range_init(void);
+#endif
+
+extern unsigned long __init
+kernel_physical_mapping_init(unsigned long start,
+ unsigned long end,
+ unsigned long page_size_mask);
+
+
+extern unsigned long __initdata e820_table_start;
+extern unsigned long __meminitdata e820_table_end;
+extern unsigned long __meminitdata e820_table_top;
+
+#endif /* _ASM_X86_INIT_32_H */
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 32c6e17b960..563933e06a3 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -11,6 +11,8 @@
*/
#define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */
+#define MCG_EXT_P (1ULL<<9) /* Extended registers available */
+#define MCG_CMCI_P (1ULL<<10) /* CMCI supported */
#define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */
#define MCG_STATUS_EIPV (1UL<<1) /* ip points to correct instruction */
@@ -90,14 +92,29 @@ extern int mce_disabled;
#include <asm/atomic.h>
+void mce_setup(struct mce *m);
void mce_log(struct mce *m);
DECLARE_PER_CPU(struct sys_device, device_mce);
extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
+/*
+ * To support more than 128 would need to escape the predefined
+ * Linux defined extended banks first.
+ */
+#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1)
+
#ifdef CONFIG_X86_MCE_INTEL
void mce_intel_feature_init(struct cpuinfo_x86 *c);
+void cmci_clear(void);
+void cmci_reenable(void);
+void cmci_rediscover(int dying);
+void cmci_recheck(void);
#else
static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { }
+static inline void cmci_clear(void) {}
+static inline void cmci_reenable(void) {}
+static inline void cmci_rediscover(int dying) {}
+static inline void cmci_recheck(void) {}
#endif
#ifdef CONFIG_X86_MCE_AMD
@@ -106,11 +123,23 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c);
static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
#endif
-void mce_log_therm_throt_event(unsigned int cpu, __u64 status);
+extern int mce_available(struct cpuinfo_x86 *c);
+
+void mce_log_therm_throt_event(__u64 status);
extern atomic_t mce_entry;
extern void do_machine_check(struct pt_regs *, long);
+
+typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS);
+DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);
+
+enum mcp_flags {
+ MCP_TIMESTAMP = (1 << 0), /* log time stamp */
+ MCP_UC = (1 << 1), /* log uncorrected errors */
+};
+extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
+
extern int mce_notify_user(void);
#endif /* !CONFIG_X86_32 */
@@ -120,8 +149,8 @@ extern void mcheck_init(struct cpuinfo_x86 *c);
#else
#define mcheck_init(c) do { } while (0)
#endif
-extern void stop_mce(void);
-extern void restart_mce(void);
+
+extern void (*mce_threshold_vector)(void);
#endif /* __KERNEL__ */
#endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 358acc59ae0..2dbd2314139 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -77,6 +77,11 @@
#define MSR_IA32_MC0_ADDR 0x00000402
#define MSR_IA32_MC0_MISC 0x00000403
+/* These are consecutive and not in the normal 4er MCE bank block */
+#define MSR_IA32_MC0_CTL2 0x00000280
+#define CMCI_EN (1ULL << 30)
+#define CMCI_THRESHOLD_MASK 0xffffULL
+
#define MSR_P6_PERFCTR0 0x000000c1
#define MSR_P6_PERFCTR1 0x000000c2
#define MSR_P6_EVNTSEL0 0x00000186
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 2d625da6603..826ad37006a 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -40,14 +40,8 @@
#ifndef __ASSEMBLY__
-struct pgprot;
-
extern int page_is_ram(unsigned long pagenr);
extern int devmem_is_allowed(unsigned long pagenr);
-extern void map_devmem(unsigned long pfn, unsigned long size,
- struct pgprot vma_prot);
-extern void unmap_devmem(unsigned long pfn, unsigned long size,
- struct pgprot vma_prot);
extern unsigned long max_low_pfn_mapped;
extern unsigned long max_pfn_mapped;
diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h
index b0e70056838..2cd07b9422f 100644
--- a/arch/x86/include/asm/pat.h
+++ b/arch/x86/include/asm/pat.h
@@ -2,6 +2,7 @@
#define _ASM_X86_PAT_H
#include <linux/types.h>
+#include <asm/pgtable_types.h>
#ifdef CONFIG_X86_PAT
extern int pat_enabled;
@@ -17,5 +18,9 @@ extern int free_memtype(u64 start, u64 end);
extern int kernel_map_sync_memtype(u64 base, unsigned long size,
unsigned long flag);
+extern void map_devmem(unsigned long pfn, unsigned long size,
+ struct pgprot vma_prot);
+extern void unmap_devmem(unsigned long pfn, unsigned long size,
+ struct pgprot vma_prot);
#endif /* _ASM_X86_PAT_H */
diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h
index bd8df3b2fe0..2733fad45f9 100644
--- a/arch/x86/include/asm/pgtable_32_types.h
+++ b/arch/x86/include/asm/pgtable_32_types.h
@@ -25,6 +25,11 @@
* area for the same reason. ;)
*/
#define VMALLOC_OFFSET (8 * 1024 * 1024)
+
+#ifndef __ASSEMBLER__
+extern bool __vmalloc_start_set; /* set once high_memory is set */
+#endif
+
#define VMALLOC_START ((unsigned long)high_memory + VMALLOC_OFFSET)
#ifdef CONFIG_X86_PAE
#define LAST_PKMAP 512
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 4d258ad76a0..b8238dc8786 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -273,6 +273,7 @@ typedef struct page *pgtable_t;
extern pteval_t __supported_pte_mask;
extern int nx_enabled;
+extern void set_nx(void);
#define pgprot_writecombine pgprot_writecombine
extern pgprot_t pgprot_writecombine(pgprot_t prot);
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 6907b8e85d5..4c80f155743 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -414,9 +414,17 @@ void __init alternative_instructions(void)
that might execute the to be patched code.
Other CPUs are not running. */
stop_nmi();
-#ifdef CONFIG_X86_MCE
- stop_mce();
-#endif
+
+ /*
+ * Don't stop machine check exceptions while patching.
+ * MCEs only happen when something got corrupted and in this
+ * case we must do something about the corruption.
+ * Ignoring it is worse than a unlikely patching race.
+ * Also machine checks tend to be broadcast and if one CPU
+ * goes into machine check the others follow quickly, so we don't
+ * expect a machine check to cause undue problems during to code
+ * patching.
+ */
apply_alternatives(__alt_instructions, __alt_instructions_end);
@@ -456,9 +464,6 @@ void __init alternative_instructions(void)
(unsigned long)__smp_locks_end);
restart_nmi();
-#ifdef CONFIG_X86_MCE
- restart_mce();
-#endif
}
/**
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index f9cecdfd05c..30909a258d0 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -46,6 +46,7 @@
#include <asm/idle.h>
#include <asm/mtrr.h>
#include <asm/smp.h>
+#include <asm/mce.h>
unsigned int num_processors;
@@ -842,6 +843,14 @@ void clear_local_APIC(void)
apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
}
#endif
+#ifdef CONFIG_X86_MCE_INTEL
+ if (maxlvt >= 6) {
+ v = apic_read(APIC_LVTCMCI);
+ if (!(v & APIC_LVT_MASKED))
+ apic_write(APIC_LVTCMCI, v | APIC_LVT_MASKED);
+ }
+#endif
+
/*
* Clean APIC state for other OSs:
*/
@@ -1241,6 +1250,12 @@ void __cpuinit setup_local_APIC(void)
apic_write(APIC_LVT1, value);
preempt_enable();
+
+#ifdef CONFIG_X86_MCE_INTEL
+ /* Recheck CMCI information after local APIC is up on CPU #0 */
+ if (smp_processor_id() == 0)
+ cmci_recheck();
+#endif
}
void __cpuinit end_local_APIC_setup(void)
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index d7d2323bbb6..b2f89829bbe 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -4,3 +4,4 @@ obj-$(CONFIG_X86_32) += k7.o p4.o p5.o p6.o winchip.o
obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o
obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o
obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o
+obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
index dfaebce3633..3552119b091 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_32.c
@@ -60,20 +60,6 @@ void mcheck_init(struct cpuinfo_x86 *c)
}
}
-static unsigned long old_cr4 __initdata;
-
-void __init stop_mce(void)
-{
- old_cr4 = read_cr4();
- clear_in_cr4(X86_CR4_MCE);
-}
-
-void __init restart_mce(void)
-{
- if (old_cr4 & X86_CR4_MCE)
- set_in_cr4(X86_CR4_MCE);
-}
-
static int __init mcheck_disable(char *str)
{
mce_disabled = 1;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index fe79985ce0f..bfbd5323a63 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -3,6 +3,8 @@
* K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
* Rest from unknown author(s).
* 2004 Andi Kleen. Rewrote most of it.
+ * Copyright 2008 Intel Corporation
+ * Author: Andi Kleen
*/
#include <linux/init.h>
@@ -24,6 +26,9 @@
#include <linux/ctype.h>
#include <linux/kmod.h>
#include <linux/kdebug.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/ratelimit.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
@@ -32,7 +37,6 @@
#include <asm/idle.h>
#define MISC_MCELOG_MINOR 227
-#define NR_SYSFS_BANKS 6
atomic_t mce_entry;
@@ -47,7 +51,7 @@ static int mce_dont_init;
*/
static int tolerant = 1;
static int banks;
-static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL };
+static u64 *bank;
static unsigned long notify_user;
static int rip_msr;
static int mce_bootlog = -1;
@@ -58,6 +62,19 @@ static char *trigger_argv[2] = { trigger, NULL };
static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
+/* MCA banks polled by the period polling timer for corrected events */
+DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
+ [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
+};
+
+/* Do initial initialization of a struct mce */
+void mce_setup(struct mce *m)
+{
+ memset(m, 0, sizeof(struct mce));
+ m->cpu = smp_processor_id();
+ rdtscll(m->tsc);
+}
+
/*
* Lockless MCE logging infrastructure.
* This avoids deadlocks on printk locks without having to break locks. Also
@@ -119,11 +136,11 @@ static void print_mce(struct mce *m)
print_symbol("{%s}", m->ip);
printk("\n");
}
- printk(KERN_EMERG "TSC %Lx ", m->tsc);
+ printk(KERN_EMERG "TSC %llx ", m->tsc);
if (m->addr)
- printk("ADDR %Lx ", m->addr);
+ printk("ADDR %llx ", m->addr);
if (m->misc)
- printk("MISC %Lx ", m->misc);
+ printk("MISC %llx ", m->misc);
printk("\n");
printk(KERN_EMERG "This is not a software problem!\n");
printk(KERN_EMERG "Run through mcelog --ascii to decode "
@@ -149,8 +166,10 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start)
panic(msg);
}
-static int mce_available(struct cpuinfo_x86 *c)
+int mce_available(struct cpuinfo_x86 *c)
{
+ if (mce_dont_init)
+ return 0;
return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}
@@ -172,7 +191,77 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
}
/*
- * The actual machine check handler
+ * Poll for corrected events or events that happened before reset.
+ * Those are just logged through /dev/mcelog.
+ *
+ * This is executed in standard interrupt context.
+ */
+void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
+{
+ struct mce m;
+ int i;
+
+ mce_setup(&m);
+
+ rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
+ for (i = 0; i < banks; i++) {
+ if (!bank[i] || !test_bit(i, *b))
+ continue;
+
+ m.misc = 0;
+ m.addr = 0;
+ m.bank = i;
+ m.tsc = 0;
+
+ barrier();
+ rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
+ if (!(m.status & MCI_STATUS_VAL))
+ continue;
+
+ /*
+ * Uncorrected events are handled by the exception handler
+ * when it is enabled. But when the exception is disabled log
+ * everything.
+ *
+ * TBD do the same check for MCI_STATUS_EN here?
+ */
+ if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
+ continue;
+
+ if (m.status & MCI_STATUS_MISCV)
+ rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
+ if (m.status & MCI_STATUS_ADDRV)
+ rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
+
+ if (!(flags & MCP_TIMESTAMP))
+ m.tsc = 0;
+ /*
+ * Don't get the IP here because it's unlikely to
+ * have anything to do with the actual error location.
+ */
+
+ mce_log(&m);
+ add_taint(TAINT_MACHINE_CHECK);
+
+ /*
+ * Clear state for this bank.
+ */
+ wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+ }
+
+ /*
+ * Don't clear MCG_STATUS here because it's only defined for
+ * exceptions.
+ */
+}
+
+/*
+ * The actual machine check handler. This only handles real
+ * exceptions when something got corrupted coming in through int 18.
+ *
+ * This is executed in NMI context not subject to normal locking rules. This
+ * implies that most kernel services cannot be safely used. Don't even
+ * think about putting a printk in there!
*/
void do_machine_check(struct pt_regs * regs, long error_code)
{
@@ -190,17 +279,18 @@ void do_machine_check(struct pt_regs * regs, long error_code)
* error.
*/
int kill_it = 0;
+ DECLARE_BITMAP(toclear, MAX_NR_BANKS);
atomic_inc(&mce_entry);
- if ((regs
- && notify_die(DIE_NMI, "machine check", regs, error_code,
+ if (notify_die(DIE_NMI, "machine check", regs, error_code,
18, SIGKILL) == NOTIFY_STOP)
- || !banks)
+ goto out2;
+ if (!banks)
goto out2;
- memset(&m, 0, sizeof(struct mce));
- m.cpu = smp_processor_id();
+ mce_setup(&m);
+
rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
/* if the restart IP is not valid, we're done for */
if (!(m.mcgstatus & MCG_STATUS_RIPV))
@@ -210,18 +300,32 @@ void do_machine_check(struct pt_regs * regs, long error_code)
barrier();
for (i = 0; i < banks; i++) {
- if (i < NR_SYSFS_BANKS && !bank[i])
+ __clear_bit(i, toclear);
+ if (!bank[i])
continue;
m.misc = 0;
m.addr = 0;
m.bank = i;
- m.tsc = 0;
rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
if ((m.status & MCI_STATUS_VAL) == 0)
continue;
+ /*
+ * Non uncorrected errors are handled by machine_check_poll
+ * Leave them alone.
+ */
+ if ((m.status & MCI_STATUS_UC) == 0)
+ continue;
+
+ /*
+ * Set taint even when machine check was not enabled.
+ */
+ add_taint(TAINT_MACHINE_CHECK);
+
+ __set_bit(i, toclear);
+
if (m.status & MCI_STATUS_EN) {
/* if PCC was set, there's no way out */
no_way_out |= !!(m.status & MCI_STATUS_PCC);
@@ -235,6 +339,12 @@ void do_machine_check(struct pt_regs * regs, long error_code)
no_way_out = 1;
kill_it = 1;
}
+ } else {
+ /*
+ * Machine check event was not enabled. Clear, but
+ * ignore.
+ */
+ continue;
}
if (m.status & MCI_STATUS_MISCV)
@@ -243,10 +353,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
mce_get_rip(&m, regs);
- if (error_code >= 0)
- rdtscll(m.tsc);
- if (error_code != -2)
- mce_log(&m);
+ mce_log(&m);
/* Did this bank cause the exception? */
/* Assume that the bank with uncorrectable errors did it,
@@ -255,14 +362,8 @@ void do_machine_check(struct pt_regs * regs, long error_code)
panicm = m;
panicm_found = 1;
}
-
- add_taint(TAINT_MACHINE_CHECK);
}
- /* Never do anything final in the polling timer */
- if (!regs)
- goto out;
-
/* If we didn't find an uncorrectable error, pick
the last one (shouldn't happen, just being safe). */
if (!panicm_found)
@@ -309,10 +410,11 @@ void do_machine_check(struct pt_regs * regs, long error_code)
/* notify userspace ASAP */
set_thread_flag(TIF_MCE_NOTIFY);
- out:
/* the last thing we do is clear state */
- for (i = 0; i < banks; i++)
- wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+ for (i = 0; i < banks; i++) {
+ if (test_bit(i, toclear))
+ wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+ }
wrmsrl(MSR_IA32_MCG_STATUS, 0);
out2:
atomic_dec(&mce_entry);
@@ -332,15 +434,13 @@ void do_machine_check(struct pt_regs * regs, long error_code)
* and historically has been the register value of the
* MSR_IA32_THERMAL_STATUS (Intel) msr.
*/
-void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
+void mce_log_therm_throt_event(__u64 status)
{
struct mce m;
- memset(&m, 0, sizeof(m));
- m.cpu = cpu;
+ mce_setup(&m);
m.bank = MCE_THERMAL_BANK;
m.status = status;
- rdtscll(m.tsc);
mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */
@@ -353,18 +453,18 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
static int check_interval = 5 * 60; /* 5 minutes */
static int next_interval; /* in jiffies */
-static void mcheck_timer(struct work_struct *work);
-static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer);
+static void mcheck_timer(unsigned long);
+static DEFINE_PER_CPU(struct timer_list, mce_timer);
-static void mcheck_check_cpu(void *info)
+static void mcheck_timer(unsigned long data)
{
- if (mce_available(&current_cpu_data))
- do_machine_check(NULL, 0);
-}
+ struct timer_list *t = &per_cpu(mce_timer, data);
-static void mcheck_timer(struct work_struct *work)
-{
- on_each_cpu(mcheck_check_cpu, NULL, 1);
+ WARN_ON(smp_processor_id() != data);
+
+ if (mce_available(&current_cpu_data))
+ machine_check_poll(MCP_TIMESTAMP,
+ &__get_cpu_var(mce_poll_banks));
/*
* Alert userspace if needed. If we logged an MCE, reduce the
@@ -377,31 +477,41 @@ static void mcheck_timer(struct work_struct *work)
(int)round_jiffies_relative(check_interval*HZ));
}
- schedule_delayed_work(&mcheck_work, next_interval);
+ t->expires = jiffies + next_interval;
+ add_timer(t);
+}
+
+static void mce_do_trigger(struct work_struct *work)
+{
+ call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
}
+static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
+
/*
- * This is only called from process context. This is where we do
- * anything we need to alert userspace about new MCEs. This is called
- * directly from the poller and also from entry.S and idle, thanks to
- * TIF_MCE_NOTIFY.
+ * Notify the user(s) about new machine check events.
+ * Can be called from interrupt context, but not from machine check/NMI
+ * context.
*/
int mce_notify_user(void)
{
+ /* Not more than two messages every minute */
+ static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
+
clear_thread_flag(TIF_MCE_NOTIFY);
if (test_and_clear_bit(0, &notify_user)) {
- static unsigned long last_print;
- unsigned long now = jiffies;
-
wake_up_interruptible(&mce_wait);
- if (trigger[0])
- call_usermodehelper(trigger, trigger_argv, NULL,
- UMH_NO_WAIT);
- if (time_after_eq(now, last_print + (check_interval*HZ))) {
- last_print = now;
+ /*
+ * There is no risk of missing notifications because
+ * work_pending is always cleared before the function is
+ * executed.
+ */
+ if (trigger[0] && !work_pending(&mce_trigger_work))
+ schedule_work(&mce_trigger_work);
+
+ if (__ratelimit(&ratelimit))
printk(KERN_INFO "Machine check events logged\n");
- }
return 1;
}
@@ -425,63 +535,78 @@ static struct notifier_block mce_idle_notifier = {
static __init int periodic_mcheck_init(void)
{
- next_interval = check_interval * HZ;
- if (next_interval)
- schedule_delayed_work(&mcheck_work,
- round_jiffies_relative(next_interval));
- idle_notifier_register(&mce_idle_notifier);
- return 0;
+ idle_notifier_register(&mce_idle_notifier);
+ return 0;
}
__initcall(periodic_mcheck_init);
-
/*
* Initialize Machine Checks for a CPU.
*/
-static void mce_init(void *dummy)
+static int mce_cap_init(void)
{
u64 cap;
- int i;
+ unsigned b;
rdmsrl(MSR_IA32_MCG_CAP, cap);
- banks = cap & 0xff;
- if (banks > MCE_EXTENDED_BANK) {
- banks = MCE_EXTENDED_BANK;
- printk(KERN_INFO "MCE: warning: using only %d banks\n",
- MCE_EXTENDED_BANK);
+ b = cap & 0xff;
+ if (b > MAX_NR_BANKS) {
+ printk(KERN_WARNING
+ "MCE: Using only %u machine check banks out of %u\n",
+ MAX_NR_BANKS, b);
+ b = MAX_NR_BANKS;
}
+
+ /* Don't support asymmetric configurations today */
+ WARN_ON(banks != 0 && b != banks);
+ banks = b;
+ if (!bank) {
+ bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
+ if (!bank)
+ return -ENOMEM;
+ memset(bank, 0xff, banks * sizeof(u64));
+ }
+
/* Use accurate RIP reporting if available. */
if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
rip_msr = MSR_IA32_MCG_EIP;
- /* Log the machine checks left over from the previous reset.
- This also clears all registers */
- do_machine_check(NULL, mce_bootlog ? -1 : -2);
+ return 0;
+}
+
+static void mce_init(void *dummy)
+{
+ u64 cap;
+ int i;
+ mce_banks_t all_banks;
+
+ /*
+ * Log the machine checks left over from the previous reset.
+ */
+ bitmap_fill(all_banks, MAX_NR_BANKS);
+ machine_check_poll(MCP_UC, &all_banks);
set_in_cr4(X86_CR4_MCE);
+ rdmsrl(MSR_IA32_MCG_CAP, cap);
if (cap & MCG_CTL_P)
wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
for (i = 0; i < banks; i++) {
- if (i < NR_SYSFS_BANKS)
- wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
- else
- wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL);
-
+ wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
}
}
/* Add per CPU specific workarounds here */
-static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
+static void mce_cpu_quirks(struct cpuinfo_x86 *c)
{
/* This should be disabled by the BIOS, but isn't always */
if (c->x86_vendor == X86_VENDOR_AMD) {
- if(c->x86 == 15)
+ if (c->x86 == 15 && banks > 4)
/* disable GART TBL walk error reporting, which trips off
incorrectly with the IOMMU & 3ware & Cerberus. */
- clear_bit(10, &bank[4]);
+ clear_bit(10, (unsigned long *)&bank[4]);
if(c->x86 <= 17 && mce_bootlog < 0)
/* Lots of broken BIOS around that don't clear them
by default and leave crap in there. Don't log. */
@@ -504,20 +629,38 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
}
}
+static void mce_init_timer(void)
+{
+ struct timer_list *t = &__get_cpu_var(mce_timer);
+
+ /* data race harmless because everyone sets to the same value */
+ if (!next_interval)
+ next_interval = check_interval * HZ;
+ if (!next_interval)
+ return;
+ setup_timer(t, mcheck_timer, smp_processor_id());
+ t->expires = round_jiffies_relative(jiffies + next_interval);
+ add_timer(t);
+}
+
/*
* Called for each booted CPU to set up machine checks.
* Must be called with preempt off.
*/
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
- mce_cpu_quirks(c);
+ if (!mce_available(c))
+ return;
- if (mce_dont_init ||
- !mce_available(c))
+ if (mce_cap_init() < 0) {
+ mce_dont_init = 1;
return;
+ }
+ mce_cpu_quirks(c);
mce_init(NULL);
mce_cpu_features(c);
+ mce_init_timer();
}
/*
@@ -573,7 +716,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
{
unsigned long *cpu_tsc;
static DEFINE_MUTEX(mce_read_mutex);
- unsigned next;
+ unsigned prev, next;
char __user *buf = ubuf;
int i, err;
@@ -592,25 +735,32 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
}
err = 0;
- for (i = 0; i < next; i++) {
- unsigned long start = jiffies;
-
- while (!mcelog.entry[i].finished) {
- if (time_after_eq(jiffies, start + 2)) {
- memset(mcelog.entry + i,0, sizeof(struct mce));
- goto timeout;
+ prev = 0;
+ do {
+ for (i = prev; i < next; i++) {
+ unsigned long start = jiffies;
+
+ while (!mcelog.entry[i].finished) {
+ if (time_after_eq(jiffies, start + 2)) {
+ memset(mcelog.entry + i, 0,
+ sizeof(struct mce));
+ goto timeout;
+ }
+ cpu_relax();
}
- cpu_relax();
+ smp_rmb();
+ err |= copy_to_user(buf, mcelog.entry + i,
+ sizeof(struct mce));
+ buf += sizeof(struct mce);
+timeout:
+ ;
}
- smp_rmb();
- err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
- buf += sizeof(struct mce);
- timeout:
- ;
- }
- memset(mcelog.entry, 0, next * sizeof(struct mce));
- mcelog.next = 0;
+ memset(mcelog.entry + prev, 0,
+ (next - prev) * sizeof(struct mce));
+ prev = next;
+ next = cmpxchg(&mcelog.next, prev, 0);
+ } while (next != prev);
synchronize_sched();
@@ -680,20 +830,6 @@ static struct miscdevice mce_log_device = {
&mce_chrdev_ops,
};
-static unsigned long old_cr4 __initdata;
-
-void __init stop_mce(void)
-{
- old_cr4 = read_cr4();
- clear_in_cr4(X86_CR4_MCE);
-}
-
-void __init restart_mce(void)
-{
- if (old_cr4 & X86_CR4_MCE)
- set_in_cr4(X86_CR4_MCE);
-}
-
/*
* Old style boot options parsing. Only for compatibility.
*/
@@ -703,8 +839,7 @@ static int __init mcheck_disable(char *str)
return 1;
}
-/* mce=off disables machine check. Note you can re-enable it later
- using sysfs.
+/* mce=off disables machine check.
mce=TOLERANCELEVEL (number, see above)
mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
mce=nobootlog Don't log MCEs from before booting. */
@@ -728,6 +863,29 @@ __setup("mce=", mcheck_enable);
* Sysfs support
*/
+/*
+ * Disable machine checks on suspend and shutdown. We can't really handle
+ * them later.
+ */
+static int mce_disable(void)
+{
+ int i;
+
+ for (i = 0; i < banks; i++)
+ wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
+ return 0;
+}
+
+static int mce_suspend(struct sys_device *dev, pm_message_t state)
+{
+ return mce_disable();
+}
+
+static int mce_shutdown(struct sys_device *dev)
+{
+ return mce_disable();
+}
+
/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
Only one CPU is active at this time, the others get readded later using
CPU hotplug. */
@@ -738,20 +896,24 @@ static int mce_resume(struct sys_device *dev)
return 0;
}
+static void mce_cpu_restart(void *data)
+{
+ del_timer_sync(&__get_cpu_var(mce_timer));
+ if (mce_available(&current_cpu_data))
+ mce_init(NULL);
+ mce_init_timer();
+}
+
/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
- if (next_interval)
- cancel_delayed_work(&mcheck_work);
- /* Timer race is harmless here */
- on_each_cpu(mce_init, NULL, 1);
next_interval = check_interval * HZ;
- if (next_interval)
- schedule_delayed_work(&mcheck_work,
- round_jiffies_relative(next_interval));
+ on_each_cpu(mce_cpu_restart, NULL, 1);
}
static struct sysdev_class mce_sysclass = {
+ .suspend = mce_suspend,
+ .shutdown = mce_shutdown,
.resume = mce_resume,
.name = "machinecheck",
};
@@ -778,16 +940,26 @@ void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinit
} \
static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
-/*
- * TBD should generate these dynamically based on number of available banks.
- * Have only 6 contol banks in /sysfs until then.
- */
-ACCESSOR(bank0ctl,bank[0],mce_restart())
-ACCESSOR(bank1ctl,bank[1],mce_restart())
-ACCESSOR(bank2ctl,bank[2],mce_restart())
-ACCESSOR(bank3ctl,bank[3],mce_restart())
-ACCESSOR(bank4ctl,bank[4],mce_restart())
-ACCESSOR(bank5ctl,bank[5],mce_restart())
+static struct sysdev_attribute *bank_attrs;
+
+static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
+ char *buf)
+{
+ u64 b = bank[attr - bank_attrs];
+ return sprintf(buf, "%llx\n", b);
+}
+
+static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
+ const char *buf, size_t siz)
+{
+ char *end;
+ u64 new = simple_strtoull(buf, &end, 0);
+ if (end == buf)
+ return -EINVAL;
+ bank[attr - bank_attrs] = new;
+ mce_restart();
+ return end-buf;
+}
static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
char *buf)
@@ -814,8 +986,6 @@ static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
ACCESSOR(check_interval,check_interval,mce_restart())
static struct sysdev_attribute *mce_attributes[] = {
- &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
- &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
&attr_tolerant.attr, &attr_check_interval, &attr_trigger,
NULL
};
@@ -845,11 +1015,22 @@ static __cpuinit int mce_create_device(unsigned int cpu)
if (err)
goto error;
}
+ for (i = 0; i < banks; i++) {
+ err = sysdev_create_file(&per_cpu(device_mce, cpu),
+ &bank_attrs[i]);
+ if (err)
+ goto error2;
+ }
cpu_set(cpu, mce_device_initialized);
return 0;
+error2:
+ while (--i >= 0) {
+ sysdev_remove_file(&per_cpu(device_mce, cpu),
+ &bank_attrs[i]);
+ }
error:
- while (i--) {
+ while (--i >= 0) {
sysdev_remove_file(&per_cpu(device_mce,cpu),
mce_attributes[i]);
}
@@ -868,15 +1049,46 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
for (i = 0; mce_attributes[i]; i++)
sysdev_remove_file(&per_cpu(device_mce,cpu),
mce_attributes[i]);
+ for (i = 0; i < banks; i++)
+ sysdev_remove_file(&per_cpu(device_mce, cpu),
+ &bank_attrs[i]);
sysdev_unregister(&per_cpu(device_mce,cpu));
cpu_clear(cpu, mce_device_initialized);
}
+/* Make sure there are no machine checks on offlined CPUs. */
+static void mce_disable_cpu(void *h)
+{
+ int i;
+ unsigned long action = *(unsigned long *)h;
+
+ if (!mce_available(&current_cpu_data))
+ return;
+ if (!(action & CPU_TASKS_FROZEN))
+ cmci_clear();
+ for (i = 0; i < banks; i++)
+ wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
+}
+
+static void mce_reenable_cpu(void *h)
+{
+ int i;
+ unsigned long action = *(unsigned long *)h;
+
+ if (!mce_available(&current_cpu_data))
+ return;
+ if (!(action & CPU_TASKS_FROZEN))
+ cmci_reenable();
+ for (i = 0; i < banks; i++)
+ wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
+}
+
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
+ struct timer_list *t = &per_cpu(mce_timer, cpu);
switch (action) {
case CPU_ONLINE:
@@ -891,6 +1103,21 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
threshold_cpu_callback(action, cpu);
mce_remove_device(cpu);
break;
+ case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
+ del_timer_sync(t);
+ smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
+ break;
+ case CPU_DOWN_FAILED:
+ case CPU_DOWN_FAILED_FROZEN:
+ t->expires = round_jiffies_relative(jiffies + next_interval);
+ add_timer_on(t, cpu);
+ smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
+ break;
+ case CPU_POST_DEAD:
+ /* intentionally ignoring frozen here */
+ cmci_rediscover(cpu);
+ break;
}
return NOTIFY_OK;
}
@@ -899,6 +1126,34 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = {
.notifier_call = mce_cpu_callback,
};
+static __init int mce_init_banks(void)
+{
+ int i;
+
+ bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
+ GFP_KERNEL);
+ if (!bank_attrs)
+ return -ENOMEM;
+
+ for (i = 0; i < banks; i++) {
+ struct sysdev_attribute *a = &bank_attrs[i];
+ a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
+ if (!a->attr.name)
+ goto nomem;
+ a->attr.mode = 0644;
+ a->show = show_bank;
+ a->store = set_bank;
+ }
+ return 0;
+
+nomem:
+ while (--i >= 0)
+ kfree(bank_attrs[i].attr.name);
+ kfree(bank_attrs);
+ bank_attrs = NULL;
+ return -ENOMEM;
+}
+
static __init int mce_init_device(void)
{
int err;
@@ -906,6 +1161,11 @@ static __init int mce_init_device(void)
if (!mce_available(&boot_cpu_data))
return -EIO;
+
+ err = mce_init_banks();
+ if (err)
+ return err;
+
err = sysdev_class_register(&mce_sysclass);
if (err)
return err;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 9817506dd46..c5a32f92d07 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -79,6 +79,8 @@ static unsigned char shared_bank[NR_BANKS] = {
static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
+static void amd_threshold_interrupt(void);
+
/*
* CPU Initialization
*/
@@ -174,6 +176,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
tr.reset = 0;
tr.old_limit = 0;
threshold_restart_bank(&tr);
+
+ mce_threshold_vector = amd_threshold_interrupt;
}
}
}
@@ -187,19 +191,13 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
* the interrupt goes off when error_count reaches threshold_limit.
* the handler will simply log mcelog w/ software defined bank number.
*/
-asmlinkage void mce_threshold_interrupt(void)
+static void amd_threshold_interrupt(void)
{
unsigned int bank, block;
struct mce m;
u32 low = 0, high = 0, address = 0;
- ack_APIC_irq();
- exit_idle();
- irq_enter();
-
- memset(&m, 0, sizeof(m));
- rdtscll(m.tsc);
- m.cpu = smp_processor_id();
+ mce_setup(&m);
/* assume first bank caused it */
for (bank = 0; bank < NR_BANKS; ++bank) {
@@ -233,7 +231,8 @@ asmlinkage void mce_threshold_interrupt(void)
/* Log the machine check that caused the threshold
event. */
- do_machine_check(NULL, 0);
+ machine_check_poll(MCP_TIMESTAMP,
+ &__get_cpu_var(mce_poll_banks));
if (high & MASK_OVERFLOW_HI) {
rdmsrl(address, m.misc);
@@ -243,13 +242,10 @@ asmlinkage void mce_threshold_interrupt(void)
+ bank * NR_BLOCKS
+ block;
mce_log(&m);
- goto out;
+ return;
}
}
}
-out:
- inc_irq_stat(irq_threshold_count);
- irq_exit();
}
/*
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index aa5e287c98e..aaa7d973093 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -1,6 +1,8 @@
/*
* Intel specific MCE features.
* Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
+ * Copyright (C) 2008, 2009 Intel Corporation
+ * Author: Andi Kleen
*/
#include <linux/init.h>
@@ -13,6 +15,7 @@
#include <asm/hw_irq.h>
#include <asm/idle.h>
#include <asm/therm_throt.h>
+#include <asm/apic.h>
asmlinkage void smp_thermal_interrupt(void)
{
@@ -25,7 +28,7 @@ asmlinkage void smp_thermal_interrupt(void)
rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
if (therm_throt_process(msr_val & 1))
- mce_log_therm_throt_event(smp_processor_id(), msr_val);
+ mce_log_therm_throt_event(msr_val);
inc_irq_stat(irq_thermal_count);
irq_exit();
@@ -85,7 +88,209 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
return;
}
+/*
+ * Support for Intel Correct Machine Check Interrupts. This allows
+ * the CPU to raise an interrupt when a corrected machine check happened.
+ * Normally we pick those up using a regular polling timer.
+ * Also supports reliable discovery of shared banks.
+ */
+
+static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
+
+/*
+ * cmci_discover_lock protects against parallel discovery attempts
+ * which could race against each other.
+ */
+static DEFINE_SPINLOCK(cmci_discover_lock);
+
+#define CMCI_THRESHOLD 1
+
+static int cmci_supported(int *banks)
+{
+ u64 cap;
+
+ /*
+ * Vendor check is not strictly needed, but the initial
+ * initialization is vendor keyed and this
+ * makes sure none of the backdoors are entered otherwise.
+ */
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return 0;
+ if (!cpu_has_apic || lapic_get_maxlvt() < 6)
+ return 0;
+ rdmsrl(MSR_IA32_MCG_CAP, cap);
+ *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff);
+ return !!(cap & MCG_CMCI_P);
+}
+
+/*
+ * The interrupt handler. This is called on every event.
+ * Just call the poller directly to log any events.
+ * This could in theory increase the threshold under high load,
+ * but doesn't for now.
+ */
+static void intel_threshold_interrupt(void)
+{
+ machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
+ mce_notify_user();
+}
+
+static void print_update(char *type, int *hdr, int num)
+{
+ if (*hdr == 0)
+ printk(KERN_INFO "CPU %d MCA banks", smp_processor_id());
+ *hdr = 1;
+ printk(KERN_CONT " %s:%d", type, num);
+}
+
+/*
+ * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
+ * on this CPU. Use the algorithm recommended in the SDM to discover shared
+ * banks.
+ */
+static void cmci_discover(int banks, int boot)
+{
+ unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);
+ int hdr = 0;
+ int i;
+
+ spin_lock(&cmci_discover_lock);
+ for (i = 0; i < banks; i++) {
+ u64 val;
+
+ if (test_bit(i, owned))
+ continue;
+
+ rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
+
+ /* Already owned by someone else? */
+ if (val & CMCI_EN) {
+ if (test_and_clear_bit(i, owned) || boot)
+ print_update("SHD", &hdr, i);
+ __clear_bit(i, __get_cpu_var(mce_poll_banks));
+ continue;
+ }
+
+ val |= CMCI_EN | CMCI_THRESHOLD;
+ wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
+ rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
+
+ /* Did the enable bit stick? -- the bank supports CMCI */
+ if (val & CMCI_EN) {
+ if (!test_and_set_bit(i, owned) || boot)
+ print_update("CMCI", &hdr, i);
+ __clear_bit(i, __get_cpu_var(mce_poll_banks));
+ } else {
+ WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
+ }
+ }
+ spin_unlock(&cmci_discover_lock);
+ if (hdr)
+ printk(KERN_CONT "\n");
+}
+
+/*
+ * Just in case we missed an event during initialization check
+ * all the CMCI owned banks.
+ */
+void cmci_recheck(void)
+{
+ unsigned long flags;
+ int banks;
+
+ if (!mce_available(&current_cpu_data) || !cmci_supported(&banks))
+ return;
+ local_irq_save(flags);
+ machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
+ local_irq_restore(flags);
+}
+
+/*
+ * Disable CMCI on this CPU for all banks it owns when it goes down.
+ * This allows other CPUs to claim the banks on rediscovery.
+ */
+void cmci_clear(void)
+{
+ int i;
+ int banks;
+ u64 val;
+
+ if (!cmci_supported(&banks))
+ return;
+ spin_lock(&cmci_discover_lock);
+ for (i = 0; i < banks; i++) {
+ if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
+ continue;
+ /* Disable CMCI */
+ rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
+ val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK);
+ wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
+ __clear_bit(i, __get_cpu_var(mce_banks_owned));
+ }
+ spin_unlock(&cmci_discover_lock);
+}
+
+/*
+ * After a CPU went down cycle through all the others and rediscover
+ * Must run in process context.
+ */
+void cmci_rediscover(int dying)
+{
+ int banks;
+ int cpu;
+ cpumask_var_t old;
+
+ if (!cmci_supported(&banks))
+ return;
+ if (!alloc_cpumask_var(&old, GFP_KERNEL))
+ return;
+ cpumask_copy(old, &current->cpus_allowed);
+
+ for_each_online_cpu (cpu) {
+ if (cpu == dying)
+ continue;
+ if (set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)))
+ continue;
+ /* Recheck banks in case CPUs don't all have the same */
+ if (cmci_supported(&banks))
+ cmci_discover(banks, 0);
+ }
+
+ set_cpus_allowed_ptr(current, old);
+ free_cpumask_var(old);
+}
+
+/*
+ * Reenable CMCI on this CPU in case a CPU down failed.
+ */
+void cmci_reenable(void)
+{
+ int banks;
+ if (cmci_supported(&banks))
+ cmci_discover(banks, 0);
+}
+
+static __cpuinit void intel_init_cmci(void)
+{
+ int banks;
+
+ if (!cmci_supported(&banks))
+ return;
+
+ mce_threshold_vector = intel_threshold_interrupt;
+ cmci_discover(banks, 1);
+ /*
+ * For CPU #0 this runs with still disabled APIC, but that's
+ * ok because only the vector is set up. We still do another
+ * check for the banks later for CPU #0 just to make sure
+ * to not miss any events.
+ */
+ apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED);
+ cmci_recheck();
+}
+
void mce_intel_feature_init(struct cpuinfo_x86 *c)
{
intel_init_thermal(c);
+ intel_init_cmci();
}
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
new file mode 100644
index 00000000000..23ee9e730f7
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -0,0 +1,29 @@
+/*
+ * Common corrected MCE threshold handler code:
+ */
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+
+#include <asm/irq_vectors.h>
+#include <asm/apic.h>
+#include <asm/idle.h>
+#include <asm/mce.h>
+
+static void default_threshold_interrupt(void)
+{
+ printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n",
+ THRESHOLD_APIC_VECTOR);
+}
+
+void (*mce_threshold_vector)(void) = default_threshold_interrupt;
+
+asmlinkage void mce_threshold_interrupt(void)
+{
+ exit_idle();
+ irq_enter();
+ inc_irq_stat(irq_threshold_count);
+ mce_threshold_vector();
+ irq_exit();
+ /* Ack only at the end to avoid potential reentry */
+ ack_APIC_irq();
+}
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index b205272ad39..1736acc4d7a 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -469,7 +469,7 @@ void __init efi_enter_virtual_mode(void)
efi_memory_desc_t *md;
efi_status_t status;
unsigned long size;
- u64 end, systab, addr, npages;
+ u64 end, systab, addr, npages, end_pfn;
void *p, *va;
efi.systab = NULL;
@@ -481,7 +481,10 @@ void __init efi_enter_virtual_mode(void)
size = md->num_pages << EFI_PAGE_SHIFT;
end = md->phys_addr + size;
- if (PFN_UP(end) <= max_low_pfn_mapped)
+ end_pfn = PFN_UP(end);
+ if (end_pfn <= max_low_pfn_mapped
+ || (end_pfn > (1UL << (32 - PAGE_SHIFT))
+ && end_pfn <= max_pfn_mapped))
va = __va(md->phys_addr);
else
va = efi_ioremap(md->phys_addr, size);
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
index a4ee29127fd..22c3b7828c5 100644
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/kernel/efi_64.c
@@ -100,24 +100,11 @@ void __init efi_call_phys_epilog(void)
void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size)
{
- static unsigned pages_mapped __initdata;
- unsigned i, pages;
- unsigned long offset;
+ unsigned long last_map_pfn;
- pages = PFN_UP(phys_addr + size) - PFN_DOWN(phys_addr);
- offset = phys_addr & ~PAGE_MASK;
- phys_addr &= PAGE_MASK;
-
- if (pages_mapped + pages > MAX_EFI_IO_PAGES)
+ last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size);
+ if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size)
return NULL;
- for (i = 0; i < pages; i++) {
- __set_fixmap(FIX_EFI_IO_MAP_FIRST_PAGE - pages_mapped,
- phys_addr, PAGE_KERNEL);
- phys_addr += PAGE_SIZE;
- pages_mapped++;
- }
-
- return (void __iomem *)__fix_to_virt(FIX_EFI_IO_MAP_FIRST_PAGE - \
- (pages_mapped - pages)) + offset;
+ return (void __iomem *)__va(phys_addr);
}
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index b0f61f0dcd0..f2f8540a7f3 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -136,7 +136,7 @@ int init_fpu(struct task_struct *tsk)
#ifdef CONFIG_X86_32
if (!HAVE_HWFP) {
memset(tsk->thread.xstate, 0, xstate_size);
- finit();
+ finit_task(tsk);
set_stopped_child_used_math(tsk);
return 0;
}
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 37cb1bda1ba..e8192401da4 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -558,6 +558,19 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
static struct mpf_intel *mpf_found;
+static unsigned long __init get_mpc_size(unsigned long physptr)
+{
+ struct mpc_table *mpc;
+ unsigned long size;
+
+ mpc = early_ioremap(physptr, PAGE_SIZE);
+ size = mpc->length;
+ early_iounmap(mpc, PAGE_SIZE);
+ apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size);
+
+ return size;
+}
+
/*
* Scan the memory blocks for an SMP configuration block.
*/
@@ -611,12 +624,16 @@ static void __init __get_smp_config(unsigned int early)
construct_default_ISA_mptable(mpf->feature1);
} else if (mpf->physptr) {
+ struct mpc_table *mpc;
+ unsigned long size;
+ size = get_mpc_size(mpf->physptr);
+ mpc = early_ioremap(mpf->physptr, size);
/*
* Read the physical hardware table. Anything here will
* override the defaults.
*/
- if (!smp_read_mpc(phys_to_virt(mpf->physptr), early)) {
+ if (!smp_read_mpc(mpc, early)) {
#ifdef CONFIG_X86_LOCAL_APIC
smp_found_config = 0;
#endif
@@ -624,8 +641,10 @@ static void __init __get_smp_config(unsigned int early)
"BIOS bug, MP table errors detected!...\n");
printk(KERN_ERR "... disabling SMP support. "
"(tell your hw vendor)\n");
+ early_iounmap(mpc, size);
return;
}
+ early_iounmap(mpc, size);
if (early)
return;
@@ -697,10 +716,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
if (!reserve)
return 1;
- reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
+ reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf),
BOOTMEM_DEFAULT);
if (mpf->physptr) {
- unsigned long size = PAGE_SIZE;
+ unsigned long size = get_mpc_size(mpf->physptr);
#ifdef CONFIG_X86_32
/*
* We cannot access to MPC table to compute
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 1cc18d439bb..2aef36d8aca 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -216,6 +216,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"),
},
},
+ { /* Handle problems with rebooting on Dell XPS710 */
+ .callback = set_bios_reboot,
+ .ident = "Dell XPS710",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"),
+ },
+ },
{ }
};
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 4c54bc0d8ff..f28c56e6bf9 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -202,7 +202,9 @@ struct ist_info ist_info;
#endif
#else
-struct cpuinfo_x86 boot_cpu_data __read_mostly;
+struct cpuinfo_x86 boot_cpu_data __read_mostly = {
+ .x86_phys_bits = MAX_PHYSMEM_BITS,
+};
EXPORT_SYMBOL(boot_cpu_data);
#endif
@@ -770,6 +772,9 @@ void __init setup_arch(char **cmdline_p)
finish_e820_parsing();
+ if (efi_enabled)
+ efi_init();
+
dmi_scan_machine();
dmi_check_system(bad_bios_dmi_table);
@@ -789,8 +794,6 @@ void __init setup_arch(char **cmdline_p)
insert_resource(&iomem_resource, &data_resource);
insert_resource(&iomem_resource, &bss_resource);
- if (efi_enabled)
- efi_init();
#ifdef CONFIG_X86_32
if (ppro_with_ram_bug()) {
diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c
index 491e737ce54..aa098708877 100644
--- a/arch/x86/math-emu/fpu_aux.c
+++ b/arch/x86/math-emu/fpu_aux.c
@@ -30,20 +30,29 @@ static void fclex(void)
}
/* Needs to be externally visible */
-void finit(void)
+void finit_task(struct task_struct *tsk)
{
- control_word = 0x037f;
- partial_status = 0;
- top = 0; /* We don't keep top in the status word internally. */
- fpu_tag_word = 0xffff;
+ struct i387_soft_struct *soft = &tsk->thread.xstate->soft;
+ struct address *oaddr, *iaddr;
+ soft->cwd = 0x037f;
+ soft->swd = 0;
+ soft->ftop = 0; /* We don't keep top in the status word internally. */
+ soft->twd = 0xffff;
/* The behaviour is different from that detailed in
Section 15.1.6 of the Intel manual */
- operand_address.offset = 0;
- operand_address.selector = 0;
- instruction_address.offset = 0;
- instruction_address.selector = 0;
- instruction_address.opcode = 0;
- no_ip_update = 1;
+ oaddr = (struct address *)&soft->foo;
+ oaddr->offset = 0;
+ oaddr->selector = 0;
+ iaddr = (struct address *)&soft->fip;
+ iaddr->offset = 0;
+ iaddr->selector = 0;
+ iaddr->opcode = 0;
+ soft->no_update = 1;
+}
+
+void finit(void)
+{
+ finit_task(current);
}
/*
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 00f127c80b0..d11745334a6 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -158,7 +158,6 @@ EXPORT_SYMBOL(kunmap);
EXPORT_SYMBOL(kmap_atomic);
EXPORT_SYMBOL(kunmap_atomic);
-#ifdef CONFIG_NUMA
void __init set_highmem_pages_init(void)
{
struct zone *zone;
@@ -182,11 +181,3 @@ void __init set_highmem_pages_init(void)
}
totalram_pages += totalhigh_pages;
}
-#else
-void __init set_highmem_pages_init(void)
-{
- add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);
-
- totalram_pages += totalhigh_pages;
-}
-#endif /* CONFIG_NUMA */
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index ce6a722587d..6d63e3d1253 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -1,8 +1,345 @@
+#include <linux/ioport.h>
#include <linux/swap.h>
+
#include <asm/cacheflush.h>
+#include <asm/e820.h>
+#include <asm/init.h>
#include <asm/page.h>
+#include <asm/page_types.h>
#include <asm/sections.h>
#include <asm/system.h>
+#include <asm/tlbflush.h>
+
+unsigned long __initdata e820_table_start;
+unsigned long __meminitdata e820_table_end;
+unsigned long __meminitdata e820_table_top;
+
+int after_bootmem;
+
+int direct_gbpages
+#ifdef CONFIG_DIRECT_GBPAGES
+ = 1
+#endif
+;
+
+static void __init find_early_table_space(unsigned long end, int use_pse,
+ int use_gbpages)
+{
+ unsigned long puds, pmds, ptes, tables, start;
+
+ puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
+ tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
+
+ if (use_gbpages) {
+ unsigned long extra;
+
+ extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
+ pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
+ } else
+ pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
+
+ tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
+
+ if (use_pse) {
+ unsigned long extra;
+
+ extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
+#ifdef CONFIG_X86_32
+ extra += PMD_SIZE;
+#endif
+ ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ } else
+ ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+ tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
+
+#ifdef CONFIG_X86_32
+ /* for fixmap */
+ tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
+#endif
+
+ /*
+ * RED-PEN putting page tables only on node 0 could
+ * cause a hotspot and fill up ZONE_DMA. The page tables
+ * need roughly 0.5KB per GB.
+ */
+#ifdef CONFIG_X86_32
+ start = 0x7000;
+ e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
+ tables, PAGE_SIZE);
+#else /* CONFIG_X86_64 */
+ start = 0x8000;
+ e820_table_start = find_e820_area(start, end, tables, PAGE_SIZE);
+#endif
+ if (e820_table_start == -1UL)
+ panic("Cannot find space for the kernel page tables");
+
+ e820_table_start >>= PAGE_SHIFT;
+ e820_table_end = e820_table_start;
+ e820_table_top = e820_table_start + (tables >> PAGE_SHIFT);
+
+ printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
+ end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT);
+}
+
+struct map_range {
+ unsigned long start;
+ unsigned long end;
+ unsigned page_size_mask;
+};
+
+#ifdef CONFIG_X86_32
+#define NR_RANGE_MR 3
+#else /* CONFIG_X86_64 */
+#define NR_RANGE_MR 5
+#endif
+
+static int save_mr(struct map_range *mr, int nr_range,
+ unsigned long start_pfn, unsigned long end_pfn,
+ unsigned long page_size_mask)
+{
+ if (start_pfn < end_pfn) {
+ if (nr_range >= NR_RANGE_MR)
+ panic("run out of range for init_memory_mapping\n");
+ mr[nr_range].start = start_pfn<<PAGE_SHIFT;
+ mr[nr_range].end = end_pfn<<PAGE_SHIFT;
+ mr[nr_range].page_size_mask = page_size_mask;
+ nr_range++;
+ }
+
+ return nr_range;
+}
+
+#ifdef CONFIG_X86_64
+static void __init init_gbpages(void)
+{
+ if (direct_gbpages && cpu_has_gbpages)
+ printk(KERN_INFO "Using GB pages for direct mapping\n");
+ else
+ direct_gbpages = 0;
+}
+#else
+static inline void init_gbpages(void)
+{
+}
+#endif
+
+/*
+ * Setup the direct mapping of the physical memory at PAGE_OFFSET.
+ * This runs before bootmem is initialized and gets pages directly from
+ * the physical memory. To access them they are temporarily mapped.
+ */
+unsigned long __init_refok init_memory_mapping(unsigned long start,
+ unsigned long end)
+{
+ unsigned long page_size_mask = 0;
+ unsigned long start_pfn, end_pfn;
+ unsigned long pos;
+ unsigned long ret;
+
+ struct map_range mr[NR_RANGE_MR];
+ int nr_range, i;
+ int use_pse, use_gbpages;
+
+ printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
+
+ if (!after_bootmem)
+ init_gbpages();
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ /*
+ * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
+ * This will simplify cpa(), which otherwise needs to support splitting
+ * large pages into small in interrupt context, etc.
+ */
+ use_pse = use_gbpages = 0;
+#else
+ use_pse = cpu_has_pse;
+ use_gbpages = direct_gbpages;
+#endif
+
+#ifdef CONFIG_X86_32
+#ifdef CONFIG_X86_PAE
+ set_nx();
+ if (nx_enabled)
+ printk(KERN_INFO "NX (Execute Disable) protection: active\n");
+#endif
+
+ /* Enable PSE if available */
+ if (cpu_has_pse)
+ set_in_cr4(X86_CR4_PSE);
+
+ /* Enable PGE if available */
+ if (cpu_has_pge) {
+ set_in_cr4(X86_CR4_PGE);
+ __supported_pte_mask |= _PAGE_GLOBAL;
+ }
+#endif
+
+ if (use_gbpages)
+ page_size_mask |= 1 << PG_LEVEL_1G;
+ if (use_pse)
+ page_size_mask |= 1 << PG_LEVEL_2M;
+
+ memset(mr, 0, sizeof(mr));
+ nr_range = 0;
+
+ /* head if not big page alignment ? */
+ start_pfn = start >> PAGE_SHIFT;
+ pos = start_pfn << PAGE_SHIFT;
+#ifdef CONFIG_X86_32
+ /*
+ * Don't use a large page for the first 2/4MB of memory
+ * because there are often fixed size MTRRs in there
+ * and overlapping MTRRs into large pages can cause
+ * slowdowns.
+ */
+ if (pos == 0)
+ end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT);
+ else
+ end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
+ << (PMD_SHIFT - PAGE_SHIFT);
+#else /* CONFIG_X86_64 */
+ end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
+ << (PMD_SHIFT - PAGE_SHIFT);
+#endif
+ if (end_pfn > (end >> PAGE_SHIFT))
+ end_pfn = end >> PAGE_SHIFT;
+ if (start_pfn < end_pfn) {
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+ pos = end_pfn << PAGE_SHIFT;
+ }
+
+ /* big page (2M) range */
+ start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
+ << (PMD_SHIFT - PAGE_SHIFT);
+#ifdef CONFIG_X86_32
+ end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+#else /* CONFIG_X86_64 */
+ end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
+ << (PUD_SHIFT - PAGE_SHIFT);
+ if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
+ end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
+#endif
+
+ if (start_pfn < end_pfn) {
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+ page_size_mask & (1<<PG_LEVEL_2M));
+ pos = end_pfn << PAGE_SHIFT;
+ }
+
+#ifdef CONFIG_X86_64
+ /* big page (1G) range */
+ start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
+ << (PUD_SHIFT - PAGE_SHIFT);
+ end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
+ if (start_pfn < end_pfn) {
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+ page_size_mask &
+ ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
+ pos = end_pfn << PAGE_SHIFT;
+ }
+
+ /* tail is not big page (1G) alignment */
+ start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
+ << (PMD_SHIFT - PAGE_SHIFT);
+ end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+ if (start_pfn < end_pfn) {
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+ page_size_mask & (1<<PG_LEVEL_2M));
+ pos = end_pfn << PAGE_SHIFT;
+ }
+#endif
+
+ /* tail is not big page (2M) alignment */
+ start_pfn = pos>>PAGE_SHIFT;
+ end_pfn = end>>PAGE_SHIFT;
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+
+ /* try to merge same page size and continuous */
+ for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
+ unsigned long old_start;
+ if (mr[i].end != mr[i+1].start ||
+ mr[i].page_size_mask != mr[i+1].page_size_mask)
+ continue;
+ /* move it */
+ old_start = mr[i].start;
+ memmove(&mr[i], &mr[i+1],
+ (nr_range - 1 - i) * sizeof(struct map_range));
+ mr[i--].start = old_start;
+ nr_range--;
+ }
+
+ for (i = 0; i < nr_range; i++)
+ printk(KERN_DEBUG " %010lx - %010lx page %s\n",
+ mr[i].start, mr[i].end,
+ (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
+ (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
+
+ /*
+ * Find space for the kernel direct mapping tables.
+ *
+ * Later we should allocate these tables in the local node of the
+ * memory mapped. Unfortunately this is done currently before the
+ * nodes are discovered.
+ */
+ if (!after_bootmem)
+ find_early_table_space(end, use_pse, use_gbpages);
+
+#ifdef CONFIG_X86_32
+ for (i = 0; i < nr_range; i++)
+ kernel_physical_mapping_init(mr[i].start, mr[i].end,
+ mr[i].page_size_mask);
+ ret = end;
+#else /* CONFIG_X86_64 */
+ for (i = 0; i < nr_range; i++)
+ ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
+ mr[i].page_size_mask);
+#endif
+
+#ifdef CONFIG_X86_32
+ early_ioremap_page_table_range_init();
+
+ load_cr3(swapper_pg_dir);
+#endif
+
+#ifdef CONFIG_X86_64
+ if (!after_bootmem)
+ mmu_cr4_features = read_cr4();
+#endif
+ __flush_tlb_all();
+
+ if (!after_bootmem && e820_table_end > e820_table_start)
+ reserve_early(e820_table_start << PAGE_SHIFT,
+ e820_table_end << PAGE_SHIFT, "PGTABLE");
+
+ if (!after_bootmem)
+ early_memtest(start, end);
+
+ return ret >> PAGE_SHIFT;
+}
+
+
+/*
+ * devmem_is_allowed() checks to see if /dev/mem access to a certain address
+ * is valid. The argument is a physical page number.
+ *
+ *
+ * On x86, access has to be given to the first megabyte of ram because that area
+ * contains bios code and data regions used by X and dosemu and similar apps.
+ * Access has to be given to non-kernel-ram areas as well, these contain the PCI
+ * mmio resources as well as potential bios/acpi data regions.
+ */
+int devmem_is_allowed(unsigned long pagenr)
+{
+ if (pagenr <= 256)
+ return 1;
+ if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
+ return 0;
+ if (!page_is_ram(pagenr))
+ return 1;
+ return 0;
+}
void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
@@ -47,3 +384,10 @@ void free_initmem(void)
(unsigned long)(&__init_begin),
(unsigned long)(&__init_end));
}
+
+#ifdef CONFIG_BLK_DEV_INITRD
+void free_initrd_mem(unsigned long start, unsigned long end)
+{
+ free_init_pages("initrd memory", start, end);
+}
+#endif
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 47df0e1bbeb..2966c6b8d30 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -49,6 +49,7 @@
#include <asm/paravirt.h>
#include <asm/setup.h>
#include <asm/cacheflush.h>
+#include <asm/init.h>
unsigned long max_low_pfn_mapped;
unsigned long max_pfn_mapped;
@@ -58,19 +59,14 @@ unsigned long highstart_pfn, highend_pfn;
static noinline int do_test_wp_bit(void);
-
-static unsigned long __initdata table_start;
-static unsigned long __meminitdata table_end;
-static unsigned long __meminitdata table_top;
-
-static int __initdata after_init_bootmem;
+bool __read_mostly __vmalloc_start_set = false;
static __init void *alloc_low_page(void)
{
- unsigned long pfn = table_end++;
+ unsigned long pfn = e820_table_end++;
void *adr;
- if (pfn >= table_top)
+ if (pfn >= e820_table_top)
panic("alloc_low_page: ran out of memory");
adr = __va(pfn * PAGE_SIZE);
@@ -90,7 +86,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
#ifdef CONFIG_X86_PAE
if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
- if (after_init_bootmem)
+ if (after_bootmem)
pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
else
pmd_table = (pmd_t *)alloc_low_page();
@@ -117,7 +113,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
pte_t *page_table = NULL;
- if (after_init_bootmem) {
+ if (after_bootmem) {
#ifdef CONFIG_DEBUG_PAGEALLOC
page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
#endif
@@ -168,12 +164,12 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
if (pmd_idx_kmap_begin != pmd_idx_kmap_end
&& (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
&& (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
- && ((__pa(pte) >> PAGE_SHIFT) < table_start
- || (__pa(pte) >> PAGE_SHIFT) >= table_end)) {
+ && ((__pa(pte) >> PAGE_SHIFT) < e820_table_start
+ || (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) {
pte_t *newpte;
int i;
- BUG_ON(after_init_bootmem);
+ BUG_ON(after_bootmem);
newpte = alloc_low_page();
for (i = 0; i < PTRS_PER_PTE; i++)
set_pte(newpte + i, pte[i]);
@@ -242,11 +238,14 @@ static inline int is_kernel_text(unsigned long addr)
* of max_low_pfn pages, by creating page tables starting from address
* PAGE_OFFSET:
*/
-static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
- unsigned long start_pfn,
- unsigned long end_pfn,
- int use_pse)
+unsigned long __init
+kernel_physical_mapping_init(unsigned long start,
+ unsigned long end,
+ unsigned long page_size_mask)
{
+ int use_pse = page_size_mask == (1<<PG_LEVEL_2M);
+ unsigned long start_pfn, end_pfn;
+ pgd_t *pgd_base = swapper_pg_dir;
int pgd_idx, pmd_idx, pte_ofs;
unsigned long pfn;
pgd_t *pgd;
@@ -255,6 +254,9 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
unsigned pages_2m, pages_4k;
int mapping_iter;
+ start_pfn = start >> PAGE_SHIFT;
+ end_pfn = end >> PAGE_SHIFT;
+
/*
* First iteration will setup identity mapping using large/small pages
* based on use_pse, with other attributes same as set by
@@ -369,26 +371,6 @@ repeat:
mapping_iter = 2;
goto repeat;
}
-}
-
-/*
- * devmem_is_allowed() checks to see if /dev/mem access to a certain address
- * is valid. The argument is a physical page number.
- *
- *
- * On x86, access has to be given to the first megabyte of ram because that area
- * contains bios code and data regions used by X and dosemu and similar apps.
- * Access has to be given to non-kernel-ram areas as well, these contain the PCI
- * mmio resources as well as potential bios/acpi data regions.
- */
-int devmem_is_allowed(unsigned long pagenr)
-{
- if (pagenr <= 256)
- return 1;
- if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
- return 0;
- if (!page_is_ram(pagenr))
- return 1;
return 0;
}
@@ -545,8 +527,9 @@ void __init native_pagetable_setup_done(pgd_t *base)
* be partially populated, and so it avoids stomping on any existing
* mappings.
*/
-static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base)
+void __init early_ioremap_page_table_range_init(void)
{
+ pgd_t *pgd_base = swapper_pg_dir;
unsigned long vaddr, end;
/*
@@ -641,7 +624,7 @@ static int __init noexec_setup(char *str)
}
early_param("noexec", noexec_setup);
-static void __init set_nx(void)
+void __init set_nx(void)
{
unsigned int v[4], l, h;
@@ -793,6 +776,8 @@ void __init initmem_init(unsigned long start_pfn,
#ifdef CONFIG_FLATMEM
max_mapnr = num_physpages;
#endif
+ __vmalloc_start_set = true;
+
printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
pages_to_mb(max_low_pfn));
@@ -814,176 +799,61 @@ static void __init zone_sizes_init(void)
free_area_init_nodes(max_zone_pfns);
}
+static unsigned long __init setup_node_bootmem(int nodeid,
+ unsigned long start_pfn,
+ unsigned long end_pfn,
+ unsigned long bootmap)
+{
+ unsigned long bootmap_size;
+
+ if (start_pfn > max_low_pfn)
+ return bootmap;
+ if (end_pfn > max_low_pfn)
+ end_pfn = max_low_pfn;
+
+ /* don't touch min_low_pfn */
+ bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
+ bootmap >> PAGE_SHIFT,
+ start_pfn, end_pfn);
+ printk(KERN_INFO " node %d low ram: %08lx - %08lx\n",
+ nodeid, start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
+ printk(KERN_INFO " node %d bootmap %08lx - %08lx\n",
+ nodeid, bootmap, bootmap + bootmap_size);
+ free_bootmem_with_active_regions(nodeid, end_pfn);
+ early_res_to_bootmem(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
+
+ return bootmap + bootmap_size;
+}
+
void __init setup_bootmem_allocator(void)
{
- int i;
+ int nodeid;
unsigned long bootmap_size, bootmap;
/*
* Initialize the boot-time allocator (with low memory only):
*/
bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;
- bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT,
- max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
+ bootmap = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
PAGE_SIZE);
if (bootmap == -1L)
panic("Cannot find bootmem map of size %ld\n", bootmap_size);
reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
- /* don't touch min_low_pfn */
- bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
- min_low_pfn, max_low_pfn);
printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
max_pfn_mapped<<PAGE_SHIFT);
- printk(KERN_INFO " low ram: %08lx - %08lx\n",
- min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
- printk(KERN_INFO " bootmap %08lx - %08lx\n",
- bootmap, bootmap + bootmap_size);
- for_each_online_node(i)
- free_bootmem_with_active_regions(i, max_low_pfn);
- early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
-
- after_init_bootmem = 1;
-}
-
-static void __init find_early_table_space(unsigned long end, int use_pse)
-{
- unsigned long puds, pmds, ptes, tables, start;
-
- puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
- tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
-
- pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
- tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
-
- if (use_pse) {
- unsigned long extra;
-
- extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
- extra += PMD_SIZE;
- ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
- } else
- ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
- tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
-
- /* for fixmap */
- tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
-
- /*
- * RED-PEN putting page tables only on node 0 could
- * cause a hotspot and fill up ZONE_DMA. The page tables
- * need roughly 0.5KB per GB.
- */
- start = 0x7000;
- table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
- tables, PAGE_SIZE);
- if (table_start == -1UL)
- panic("Cannot find space for the kernel page tables");
-
- table_start >>= PAGE_SHIFT;
- table_end = table_start;
- table_top = table_start + (tables>>PAGE_SHIFT);
-
- printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
- end, table_start << PAGE_SHIFT,
- (table_start << PAGE_SHIFT) + tables);
-}
-
-unsigned long __init_refok init_memory_mapping(unsigned long start,
- unsigned long end)
-{
- pgd_t *pgd_base = swapper_pg_dir;
- unsigned long start_pfn, end_pfn;
- unsigned long big_page_start;
-#ifdef CONFIG_DEBUG_PAGEALLOC
- /*
- * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
- * This will simplify cpa(), which otherwise needs to support splitting
- * large pages into small in interrupt context, etc.
- */
- int use_pse = 0;
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+ for_each_online_node(nodeid)
+ bootmap = setup_node_bootmem(nodeid, node_start_pfn[nodeid],
+ node_end_pfn[nodeid], bootmap);
#else
- int use_pse = cpu_has_pse;
-#endif
-
- /*
- * Find space for the kernel direct mapping tables.
- */
- if (!after_init_bootmem)
- find_early_table_space(end, use_pse);
-
-#ifdef CONFIG_X86_PAE
- set_nx();
- if (nx_enabled)
- printk(KERN_INFO "NX (Execute Disable) protection: active\n");
+ bootmap = setup_node_bootmem(0, 0, max_low_pfn, bootmap);
#endif
- /* Enable PSE if available */
- if (cpu_has_pse)
- set_in_cr4(X86_CR4_PSE);
-
- /* Enable PGE if available */
- if (cpu_has_pge) {
- set_in_cr4(X86_CR4_PGE);
- __supported_pte_mask |= _PAGE_GLOBAL;
- }
-
- /*
- * Don't use a large page for the first 2/4MB of memory
- * because there are often fixed size MTRRs in there
- * and overlapping MTRRs into large pages can cause
- * slowdowns.
- */
- big_page_start = PMD_SIZE;
-
- if (start < big_page_start) {
- start_pfn = start >> PAGE_SHIFT;
- end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT);
- } else {
- /* head is not big page alignment ? */
- start_pfn = start >> PAGE_SHIFT;
- end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
- }
- if (start_pfn < end_pfn)
- kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0);
-
- /* big page range */
- start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
- if (start_pfn < (big_page_start >> PAGE_SHIFT))
- start_pfn = big_page_start >> PAGE_SHIFT;
- end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
- if (start_pfn < end_pfn)
- kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
- use_pse);
-
- /* tail is not big page alignment ? */
- start_pfn = end_pfn;
- if (start_pfn > (big_page_start>>PAGE_SHIFT)) {
- end_pfn = end >> PAGE_SHIFT;
- if (start_pfn < end_pfn)
- kernel_physical_mapping_init(pgd_base, start_pfn,
- end_pfn, 0);
- }
-
- early_ioremap_page_table_range_init(pgd_base);
-
- load_cr3(swapper_pg_dir);
-
- __flush_tlb_all();
-
- if (!after_init_bootmem)
- reserve_early(table_start << PAGE_SHIFT,
- table_end << PAGE_SHIFT, "PGTABLE");
-
- if (!after_init_bootmem)
- early_memtest(start, end);
-
- return end >> PAGE_SHIFT;
+ after_bootmem = 1;
}
-
/*
* paging_init() sets up the page tables - note that the first 8MB are
* already mapped by head.S.
@@ -1217,13 +1087,6 @@ void mark_rodata_ro(void)
}
#endif
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_init_pages("initrd memory", start, end);
-}
-#endif
-
int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
int flags)
{
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 07f44d491df..8a853bc3b28 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -48,6 +48,7 @@
#include <asm/kdebug.h>
#include <asm/numa.h>
#include <asm/cacheflush.h>
+#include <asm/init.h>
/*
* end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -61,12 +62,6 @@ static unsigned long dma_reserve __initdata;
DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-int direct_gbpages
-#ifdef CONFIG_DIRECT_GBPAGES
- = 1
-#endif
-;
-
static int __init parse_direct_gbpages_off(char *arg)
{
direct_gbpages = 0;
@@ -87,8 +82,6 @@ early_param("gbpages", parse_direct_gbpages_on);
* around without checking the pgd every time.
*/
-int after_bootmem;
-
pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
EXPORT_SYMBOL_GPL(__supported_pte_mask);
@@ -325,13 +318,9 @@ void __init cleanup_highmap(void)
}
}
-static unsigned long __initdata table_start;
-static unsigned long __meminitdata table_end;
-static unsigned long __meminitdata table_top;
-
static __ref void *alloc_low_page(unsigned long *phys)
{
- unsigned long pfn = table_end++;
+ unsigned long pfn = e820_table_end++;
void *adr;
if (after_bootmem) {
@@ -341,7 +330,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
return adr;
}
- if (pfn >= table_top)
+ if (pfn >= e820_table_top)
panic("alloc_low_page: ran out of memory");
adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
@@ -581,58 +570,10 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
return phys_pud_init(pud, addr, end, page_size_mask);
}
-static void __init find_early_table_space(unsigned long end, int use_pse,
- int use_gbpages)
-{
- unsigned long puds, pmds, ptes, tables, start;
-
- puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
- tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
- if (use_gbpages) {
- unsigned long extra;
- extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
- pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
- } else
- pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
- tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
-
- if (use_pse) {
- unsigned long extra;
- extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
- ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
- } else
- ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
- tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
-
- /*
- * RED-PEN putting page tables only on node 0 could
- * cause a hotspot and fill up ZONE_DMA. The page tables
- * need roughly 0.5KB per GB.
- */
- start = 0x8000;
- table_start = find_e820_area(start, end, tables, PAGE_SIZE);
- if (table_start == -1UL)
- panic("Cannot find space for the kernel page tables");
-
- table_start >>= PAGE_SHIFT;
- table_end = table_start;
- table_top = table_start + (tables >> PAGE_SHIFT);
-
- printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
- end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT);
-}
-
-static void __init init_gbpages(void)
-{
- if (direct_gbpages && cpu_has_gbpages)
- printk(KERN_INFO "Using GB pages for direct mapping\n");
- else
- direct_gbpages = 0;
-}
-
-static unsigned long __meminit kernel_physical_mapping_init(unsigned long start,
- unsigned long end,
- unsigned long page_size_mask)
+unsigned long __init
+kernel_physical_mapping_init(unsigned long start,
+ unsigned long end,
+ unsigned long page_size_mask)
{
unsigned long next, last_map_addr = end;
@@ -669,176 +610,6 @@ static unsigned long __meminit kernel_physical_mapping_init(unsigned long start,
return last_map_addr;
}
-struct map_range {
- unsigned long start;
- unsigned long end;
- unsigned page_size_mask;
-};
-
-#define NR_RANGE_MR 5
-
-static int save_mr(struct map_range *mr, int nr_range,
- unsigned long start_pfn, unsigned long end_pfn,
- unsigned long page_size_mask)
-{
-
- if (start_pfn < end_pfn) {
- if (nr_range >= NR_RANGE_MR)
- panic("run out of range for init_memory_mapping\n");
- mr[nr_range].start = start_pfn<<PAGE_SHIFT;
- mr[nr_range].end = end_pfn<<PAGE_SHIFT;
- mr[nr_range].page_size_mask = page_size_mask;
- nr_range++;
- }
-
- return nr_range;
-}
-
-/*
- * Setup the direct mapping of the physical memory at PAGE_OFFSET.
- * This runs before bootmem is initialized and gets pages directly from
- * the physical memory. To access them they are temporarily mapped.
- */
-unsigned long __init_refok init_memory_mapping(unsigned long start,
- unsigned long end)
-{
- unsigned long last_map_addr = 0;
- unsigned long page_size_mask = 0;
- unsigned long start_pfn, end_pfn;
- unsigned long pos;
-
- struct map_range mr[NR_RANGE_MR];
- int nr_range, i;
- int use_pse, use_gbpages;
-
- printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
-
- /*
- * Find space for the kernel direct mapping tables.
- *
- * Later we should allocate these tables in the local node of the
- * memory mapped. Unfortunately this is done currently before the
- * nodes are discovered.
- */
- if (!after_bootmem)
- init_gbpages();
-
-#ifdef CONFIG_DEBUG_PAGEALLOC
- /*
- * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
- * This will simplify cpa(), which otherwise needs to support splitting
- * large pages into small in interrupt context, etc.
- */
- use_pse = use_gbpages = 0;
-#else
- use_pse = cpu_has_pse;
- use_gbpages = direct_gbpages;
-#endif
-
- if (use_gbpages)
- page_size_mask |= 1 << PG_LEVEL_1G;
- if (use_pse)
- page_size_mask |= 1 << PG_LEVEL_2M;
-
- memset(mr, 0, sizeof(mr));
- nr_range = 0;
-
- /* head if not big page alignment ?*/
- start_pfn = start >> PAGE_SHIFT;
- pos = start_pfn << PAGE_SHIFT;
- end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
- if (end_pfn > (end >> PAGE_SHIFT))
- end_pfn = end >> PAGE_SHIFT;
- if (start_pfn < end_pfn) {
- nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
- pos = end_pfn << PAGE_SHIFT;
- }
-
- /* big page (2M) range*/
- start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
- end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
- << (PUD_SHIFT - PAGE_SHIFT);
- if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
- end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
- if (start_pfn < end_pfn) {
- nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
- page_size_mask & (1<<PG_LEVEL_2M));
- pos = end_pfn << PAGE_SHIFT;
- }
-
- /* big page (1G) range */
- start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
- << (PUD_SHIFT - PAGE_SHIFT);
- end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
- if (start_pfn < end_pfn) {
- nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
- page_size_mask &
- ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
- pos = end_pfn << PAGE_SHIFT;
- }
-
- /* tail is not big page (1G) alignment */
- start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
- end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
- if (start_pfn < end_pfn) {
- nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
- page_size_mask & (1<<PG_LEVEL_2M));
- pos = end_pfn << PAGE_SHIFT;
- }
-
- /* tail is not big page (2M) alignment */
- start_pfn = pos>>PAGE_SHIFT;
- end_pfn = end>>PAGE_SHIFT;
- nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
-
- /* try to merge same page size and continuous */
- for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
- unsigned long old_start;
- if (mr[i].end != mr[i+1].start ||
- mr[i].page_size_mask != mr[i+1].page_size_mask)
- continue;
- /* move it */
- old_start = mr[i].start;
- memmove(&mr[i], &mr[i+1],
- (nr_range - 1 - i) * sizeof (struct map_range));
- mr[i--].start = old_start;
- nr_range--;
- }
-
- for (i = 0; i < nr_range; i++)
- printk(KERN_DEBUG " %010lx - %010lx page %s\n",
- mr[i].start, mr[i].end,
- (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
- (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
-
- if (!after_bootmem)
- find_early_table_space(end, use_pse, use_gbpages);
-
- for (i = 0; i < nr_range; i++)
- last_map_addr = kernel_physical_mapping_init(
- mr[i].start, mr[i].end,
- mr[i].page_size_mask);
-
- if (!after_bootmem)
- mmu_cr4_features = read_cr4();
- __flush_tlb_all();
-
- if (!after_bootmem && table_end > table_start)
- reserve_early(table_start << PAGE_SHIFT,
- table_end << PAGE_SHIFT, "PGTABLE");
-
- printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
- last_map_addr, end);
-
- if (!after_bootmem)
- early_memtest(start, end);
-
- return last_map_addr >> PAGE_SHIFT;
-}
-
#ifndef CONFIG_NUMA
void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
@@ -910,28 +681,6 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif /* CONFIG_MEMORY_HOTPLUG */
-/*
- * devmem_is_allowed() checks to see if /dev/mem access to a certain address
- * is valid. The argument is a physical page number.
- *
- *
- * On x86, access has to be given to the first megabyte of ram because that area
- * contains bios code and data regions used by X and dosemu and similar apps.
- * Access has to be given to non-kernel-ram areas as well, these contain the PCI
- * mmio resources as well as potential bios/acpi data regions.
- */
-int devmem_is_allowed(unsigned long pagenr)
-{
- if (pagenr <= 256)
- return 1;
- if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
- return 0;
- if (!page_is_ram(pagenr))
- return 1;
- return 0;
-}
-
-
static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
kcore_modules, kcore_vsyscall;
@@ -1019,13 +768,6 @@ void mark_rodata_ro(void)
#endif
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_init_pages("initrd memory", start, end);
-}
-#endif
-
int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
int flags)
{
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 433f7bd4648..62773abdf08 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -38,8 +38,7 @@ unsigned long __phys_addr(unsigned long x)
} else {
VIRTUAL_BUG_ON(x < PAGE_OFFSET);
x -= PAGE_OFFSET;
- VIRTUAL_BUG_ON(system_state == SYSTEM_BOOTING ? x > MAXMEM :
- !phys_addr_valid(x));
+ VIRTUAL_BUG_ON(!phys_addr_valid(x));
}
return x;
}
@@ -56,10 +55,8 @@ bool __virt_addr_valid(unsigned long x)
if (x < PAGE_OFFSET)
return false;
x -= PAGE_OFFSET;
- if (system_state == SYSTEM_BOOTING ?
- x > MAXMEM : !phys_addr_valid(x)) {
+ if (!phys_addr_valid(x))
return false;
- }
}
return pfn_valid(x >> PAGE_SHIFT);
@@ -76,10 +73,9 @@ static inline int phys_addr_valid(unsigned long addr)
#ifdef CONFIG_DEBUG_VIRTUAL
unsigned long __phys_addr(unsigned long x)
{
- /* VMALLOC_* aren't constants; not available at the boot time */
+ /* VMALLOC_* aren't constants */
VIRTUAL_BUG_ON(x < PAGE_OFFSET);
- VIRTUAL_BUG_ON(system_state != SYSTEM_BOOTING &&
- is_vmalloc_addr((void *) x));
+ VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x));
return x - PAGE_OFFSET;
}
EXPORT_SYMBOL(__phys_addr);
@@ -89,7 +85,7 @@ bool __virt_addr_valid(unsigned long x)
{
if (x < PAGE_OFFSET)
return false;
- if (system_state != SYSTEM_BOOTING && is_vmalloc_addr((void *) x))
+ if (__vmalloc_start_set && is_vmalloc_addr((void *) x))
return false;
return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT);
}
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 93d82038af4..9f205030d9a 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -32,11 +32,14 @@ struct kmmio_fault_page {
struct list_head list;
struct kmmio_fault_page *release_next;
unsigned long page; /* location of the fault page */
+ bool old_presence; /* page presence prior to arming */
+ bool armed;
/*
* Number of times this page has been registered as a part
* of a probe. If zero, page is disarmed and this may be freed.
- * Used only by writers (RCU).
+ * Used only by writers (RCU) and post_kmmio_handler().
+ * Protected by kmmio_lock, when linked into kmmio_page_table.
*/
int count;
};
@@ -105,57 +108,85 @@ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
return NULL;
}
-static void set_page_present(unsigned long addr, bool present,
- unsigned int *pglevel)
+static void set_pmd_presence(pmd_t *pmd, bool present, bool *old)
+{
+ pmdval_t v = pmd_val(*pmd);
+ *old = !!(v & _PAGE_PRESENT);
+ v &= ~_PAGE_PRESENT;
+ if (present)
+ v |= _PAGE_PRESENT;
+ set_pmd(pmd, __pmd(v));
+}
+
+static void set_pte_presence(pte_t *pte, bool present, bool *old)
+{
+ pteval_t v = pte_val(*pte);
+ *old = !!(v & _PAGE_PRESENT);
+ v &= ~_PAGE_PRESENT;
+ if (present)
+ v |= _PAGE_PRESENT;
+ set_pte_atomic(pte, __pte(v));
+}
+
+static int set_page_presence(unsigned long addr, bool present, bool *old)
{
- pteval_t pteval;
- pmdval_t pmdval;
unsigned int level;
- pmd_t *pmd;
pte_t *pte = lookup_address(addr, &level);
if (!pte) {
pr_err("kmmio: no pte for page 0x%08lx\n", addr);
- return;
+ return -1;
}
- if (pglevel)
- *pglevel = level;
-
switch (level) {
case PG_LEVEL_2M:
- pmd = (pmd_t *)pte;
- pmdval = pmd_val(*pmd) & ~_PAGE_PRESENT;
- if (present)
- pmdval |= _PAGE_PRESENT;
- set_pmd(pmd, __pmd(pmdval));
+ set_pmd_presence((pmd_t *)pte, present, old);
break;
-
case PG_LEVEL_4K:
- pteval = pte_val(*pte) & ~_PAGE_PRESENT;
- if (present)
- pteval |= _PAGE_PRESENT;
- set_pte_atomic(pte, __pte(pteval));
+ set_pte_presence(pte, present, old);
break;
-
default:
pr_err("kmmio: unexpected page level 0x%x.\n", level);
- return;
+ return -1;
}
__flush_tlb_one(addr);
+ return 0;
}
-/** Mark the given page as not present. Access to it will trigger a fault. */
-static void arm_kmmio_fault_page(unsigned long page, unsigned int *pglevel)
+/*
+ * Mark the given page as not present. Access to it will trigger a fault.
+ *
+ * Struct kmmio_fault_page is protected by RCU and kmmio_lock, but the
+ * protection is ignored here. RCU read lock is assumed held, so the struct
+ * will not disappear unexpectedly. Furthermore, the caller must guarantee,
+ * that double arming the same virtual address (page) cannot occur.
+ *
+ * Double disarming on the other hand is allowed, and may occur when a fault
+ * and mmiotrace shutdown happen simultaneously.
+ */
+static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
{
- set_page_present(page & PAGE_MASK, false, pglevel);
+ int ret;
+ WARN_ONCE(f->armed, KERN_ERR "kmmio page already armed.\n");
+ if (f->armed) {
+ pr_warning("kmmio double-arm: page 0x%08lx, ref %d, old %d\n",
+ f->page, f->count, f->old_presence);
+ }
+ ret = set_page_presence(f->page, false, &f->old_presence);
+ WARN_ONCE(ret < 0, KERN_ERR "kmmio arming 0x%08lx failed.\n", f->page);
+ f->armed = true;
+ return ret;
}
-/** Mark the given page as present. */
-static void disarm_kmmio_fault_page(unsigned long page, unsigned int *pglevel)
+/** Restore the given page to saved presence state. */
+static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
{
- set_page_present(page & PAGE_MASK, true, pglevel);
+ bool tmp;
+ int ret = set_page_presence(f->page, f->old_presence, &tmp);
+ WARN_ONCE(ret < 0,
+ KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page);
+ f->armed = false;
}
/*
@@ -202,28 +233,32 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
ctx = &get_cpu_var(kmmio_ctx);
if (ctx->active) {
- disarm_kmmio_fault_page(faultpage->page, NULL);
if (addr == ctx->addr) {
/*
- * On SMP we sometimes get recursive probe hits on the
- * same address. Context is already saved, fall out.
+ * A second fault on the same page means some other
+ * condition needs handling by do_page_fault(), the
+ * page really not being present is the most common.
*/
- pr_debug("kmmio: duplicate probe hit on CPU %d, for "
- "address 0x%08lx.\n",
- smp_processor_id(), addr);
- ret = 1;
- goto no_kmmio_ctx;
- }
- /*
- * Prevent overwriting already in-flight context.
- * This should not happen, let's hope disarming at least
- * prevents a panic.
- */
- pr_emerg("kmmio: recursive probe hit on CPU %d, "
+ pr_debug("kmmio: secondary hit for 0x%08lx CPU %d.\n",
+ addr, smp_processor_id());
+
+ if (!faultpage->old_presence)
+ pr_info("kmmio: unexpected secondary hit for "
+ "address 0x%08lx on CPU %d.\n", addr,
+ smp_processor_id());
+ } else {
+ /*
+ * Prevent overwriting already in-flight context.
+ * This should not happen, let's hope disarming at
+ * least prevents a panic.
+ */
+ pr_emerg("kmmio: recursive probe hit on CPU %d, "
"for address 0x%08lx. Ignoring.\n",
smp_processor_id(), addr);
- pr_emerg("kmmio: previous hit was at 0x%08lx.\n",
- ctx->addr);
+ pr_emerg("kmmio: previous hit was at 0x%08lx.\n",
+ ctx->addr);
+ disarm_kmmio_fault_page(faultpage);
+ }
goto no_kmmio_ctx;
}
ctx->active++;
@@ -244,7 +279,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
regs->flags &= ~X86_EFLAGS_IF;
/* Now we set present bit in PTE and single step. */
- disarm_kmmio_fault_page(ctx->fpage->page, NULL);
+ disarm_kmmio_fault_page(ctx->fpage);
/*
* If another cpu accesses the same page while we are stepping,
@@ -275,7 +310,7 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);
if (!ctx->active) {
- pr_debug("kmmio: spurious debug trap on CPU %d.\n",
+ pr_warning("kmmio: spurious debug trap on CPU %d.\n",
smp_processor_id());
goto out;
}
@@ -283,7 +318,11 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
if (ctx->probe && ctx->probe->post_handler)
ctx->probe->post_handler(ctx->probe, condition, regs);
- arm_kmmio_fault_page(ctx->fpage->page, NULL);
+ /* Prevent racing against release_kmmio_fault_page(). */
+ spin_lock(&kmmio_lock);
+ if (ctx->fpage->count)
+ arm_kmmio_fault_page(ctx->fpage);
+ spin_unlock(&kmmio_lock);
regs->flags &= ~X86_EFLAGS_TF;
regs->flags |= ctx->saved_flags;
@@ -315,20 +354,24 @@ static int add_kmmio_fault_page(unsigned long page)
f = get_kmmio_fault_page(page);
if (f) {
if (!f->count)
- arm_kmmio_fault_page(f->page, NULL);
+ arm_kmmio_fault_page(f);
f->count++;
return 0;
}
- f = kmalloc(sizeof(*f), GFP_ATOMIC);
+ f = kzalloc(sizeof(*f), GFP_ATOMIC);
if (!f)
return -1;
f->count = 1;
f->page = page;
- list_add_rcu(&f->list, kmmio_page_list(f->page));
- arm_kmmio_fault_page(f->page, NULL);
+ if (arm_kmmio_fault_page(f)) {
+ kfree(f);
+ return -1;
+ }
+
+ list_add_rcu(&f->list, kmmio_page_list(f->page));
return 0;
}
@@ -347,7 +390,7 @@ static void release_kmmio_fault_page(unsigned long page,
f->count--;
BUG_ON(f->count < 0);
if (!f->count) {
- disarm_kmmio_fault_page(f->page, NULL);
+ disarm_kmmio_fault_page(f);
f->release_next = *release_list;
*release_list = f;
}
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 451fe95a035..3daefa04ace 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -416,10 +416,11 @@ void __init initmem_init(unsigned long start_pfn,
for_each_online_node(nid)
propagate_e820_map_node(nid);
- for_each_online_node(nid)
+ for_each_online_node(nid) {
memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
+ NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
+ }
- NODE_DATA(0)->bdata = &bootmem_node_data[0];
setup_bootmem_allocator();
}
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
index ab50a8d7402..427fd1b56df 100644
--- a/arch/x86/mm/testmmiotrace.c
+++ b/arch/x86/mm/testmmiotrace.c
@@ -1,5 +1,5 @@
/*
- * Written by Pekka Paalanen, 2008 <pq@iki.fi>
+ * Written by Pekka Paalanen, 2008-2009 <pq@iki.fi>
*/
#include <linux/module.h>
#include <linux/io.h>
@@ -9,35 +9,74 @@
static unsigned long mmio_address;
module_param(mmio_address, ulong, 0);
-MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB.");
+MODULE_PARM_DESC(mmio_address, " Start address of the mapping of 16 kB "
+ "(or 8 MB if read_far is non-zero).");
+
+static unsigned long read_far = 0x400100;
+module_param(read_far, ulong, 0);
+MODULE_PARM_DESC(read_far, " Offset of a 32-bit read within 8 MB "
+ "(default: 0x400100).");
+
+static unsigned v16(unsigned i)
+{
+ return i * 12 + 7;
+}
+
+static unsigned v32(unsigned i)
+{
+ return i * 212371 + 13;
+}
static void do_write_test(void __iomem *p)
{
unsigned int i;
+ pr_info(MODULE_NAME ": write test.\n");
mmiotrace_printk("Write test.\n");
+
for (i = 0; i < 256; i++)
iowrite8(i, p + i);
+
for (i = 1024; i < (5 * 1024); i += 2)
- iowrite16(i * 12 + 7, p + i);
+ iowrite16(v16(i), p + i);
+
for (i = (5 * 1024); i < (16 * 1024); i += 4)
- iowrite32(i * 212371 + 13, p + i);
+ iowrite32(v32(i), p + i);
}
static void do_read_test(void __iomem *p)
{
unsigned int i;
+ unsigned errs[3] = { 0 };
+ pr_info(MODULE_NAME ": read test.\n");
mmiotrace_printk("Read test.\n");
+
for (i = 0; i < 256; i++)
- ioread8(p + i);
+ if (ioread8(p + i) != i)
+ ++errs[0];
+
for (i = 1024; i < (5 * 1024); i += 2)
- ioread16(p + i);
+ if (ioread16(p + i) != v16(i))
+ ++errs[1];
+
for (i = (5 * 1024); i < (16 * 1024); i += 4)
- ioread32(p + i);
+ if (ioread32(p + i) != v32(i))
+ ++errs[2];
+
+ mmiotrace_printk("Read errors: 8-bit %d, 16-bit %d, 32-bit %d.\n",
+ errs[0], errs[1], errs[2]);
}
-static void do_test(void)
+static void do_read_far_test(void __iomem *p)
{
- void __iomem *p = ioremap_nocache(mmio_address, 0x4000);
+ pr_info(MODULE_NAME ": read far test.\n");
+ mmiotrace_printk("Read far test.\n");
+
+ ioread32(p + read_far);
+}
+
+static void do_test(unsigned long size)
+{
+ void __iomem *p = ioremap_nocache(mmio_address, size);
if (!p) {
pr_err(MODULE_NAME ": could not ioremap, aborting.\n");
return;
@@ -45,11 +84,15 @@ static void do_test(void)
mmiotrace_printk("ioremap returned %p.\n", p);
do_write_test(p);
do_read_test(p);
+ if (read_far && read_far < size - 4)
+ do_read_far_test(p);
iounmap(p);
}
static int __init init(void)
{
+ unsigned long size = (read_far) ? (8 << 20) : (16 << 10);
+
if (mmio_address == 0) {
pr_err(MODULE_NAME ": you have to use the module argument "
"mmio_address.\n");
@@ -58,10 +101,11 @@ static int __init init(void)
return -ENXIO;
}
- pr_warning(MODULE_NAME ": WARNING: mapping 16 kB @ 0x%08lx "
- "in PCI address space, and writing "
- "rubbish in there.\n", mmio_address);
- do_test();
+ pr_warning(MODULE_NAME ": WARNING: mapping %lu kB @ 0x%08lx in PCI "
+ "address space, and writing 16 kB of rubbish in there.\n",
+ size >> 10, mmio_address);
+ do_test(size);
+ pr_info(MODULE_NAME ": All done.\n");
return 0;
}
diff --git a/crypto/api.c b/crypto/api.c
index efe77df6863..38a2bc02a98 100644
--- a/crypto/api.c
+++ b/crypto/api.c
@@ -215,8 +215,19 @@ struct crypto_alg *crypto_larval_lookup(const char *name, u32 type, u32 mask)
mask &= ~(CRYPTO_ALG_LARVAL | CRYPTO_ALG_DEAD);
type &= mask;
- alg = try_then_request_module(crypto_alg_lookup(name, type, mask),
- name);
+ alg = crypto_alg_lookup(name, type, mask);
+ if (!alg) {
+ char tmp[CRYPTO_MAX_ALG_NAME];
+
+ request_module(name);
+
+ if (!((type ^ CRYPTO_ALG_NEED_FALLBACK) & mask) &&
+ snprintf(tmp, sizeof(tmp), "%s-all", name) < sizeof(tmp))
+ request_module(tmp);
+
+ alg = crypto_alg_lookup(name, type, mask);
+ }
+
if (alg)
return crypto_is_larval(alg) ? crypto_larval_wait(alg) : alg;
diff --git a/drivers/crypto/ixp4xx_crypto.c b/drivers/crypto/ixp4xx_crypto.c
index 2d637e0fbc0..d9e751be8c5 100644
--- a/drivers/crypto/ixp4xx_crypto.c
+++ b/drivers/crypto/ixp4xx_crypto.c
@@ -457,10 +457,12 @@ static int init_ixp_crypto(void)
if (!ctx_pool) {
goto err;
}
- ret = qmgr_request_queue(SEND_QID, NPE_QLEN_TOTAL, 0, 0);
+ ret = qmgr_request_queue(SEND_QID, NPE_QLEN_TOTAL, 0, 0,
+ "ixp_crypto:out", NULL);
if (ret)
goto err;
- ret = qmgr_request_queue(RECV_QID, NPE_QLEN, 0, 0);
+ ret = qmgr_request_queue(RECV_QID, NPE_QLEN, 0, 0,
+ "ixp_crypto:in", NULL);
if (ret) {
qmgr_release_queue(SEND_QID);
goto err;
diff --git a/drivers/crypto/padlock-aes.c b/drivers/crypto/padlock-aes.c
index 856b3cc2558..3f0fdd18255 100644
--- a/drivers/crypto/padlock-aes.c
+++ b/drivers/crypto/padlock-aes.c
@@ -489,4 +489,4 @@ MODULE_DESCRIPTION("VIA PadLock AES algorithm support");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Michal Ludvig");
-MODULE_ALIAS("aes");
+MODULE_ALIAS("aes-all");
diff --git a/drivers/crypto/padlock-sha.c b/drivers/crypto/padlock-sha.c
index a7fbadebf62..a2c8e8514b6 100644
--- a/drivers/crypto/padlock-sha.c
+++ b/drivers/crypto/padlock-sha.c
@@ -304,7 +304,7 @@ MODULE_DESCRIPTION("VIA PadLock SHA1/SHA256 algorithms support.");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Michal Ludvig");
-MODULE_ALIAS("sha1");
-MODULE_ALIAS("sha256");
+MODULE_ALIAS("sha1-all");
+MODULE_ALIAS("sha256-all");
MODULE_ALIAS("sha1-padlock");
MODULE_ALIAS("sha256-padlock");
diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c
index ea5440dd10d..647374acba9 100644
--- a/drivers/dma/iop-adma.c
+++ b/drivers/dma/iop-adma.c
@@ -1401,7 +1401,7 @@ MODULE_ALIAS("platform:iop-adma");
static struct platform_driver iop_adma_driver = {
.probe = iop_adma_probe,
- .remove = iop_adma_remove,
+ .remove = __devexit_p(iop_adma_remove),
.driver = {
.owner = THIS_MODULE,
.name = "iop-adma",
diff --git a/drivers/dma/mv_xor.c b/drivers/dma/mv_xor.c
index d35cbd1ff0b..5d5d5b31867 100644
--- a/drivers/dma/mv_xor.c
+++ b/drivers/dma/mv_xor.c
@@ -1287,7 +1287,7 @@ mv_xor_conf_mbus_windows(struct mv_xor_shared_private *msp,
static struct platform_driver mv_xor_driver = {
.probe = mv_xor_probe,
- .remove = mv_xor_remove,
+ .remove = __devexit_p(mv_xor_remove),
.driver = {
.owner = THIS_MODULE,
.name = MV_XOR_NAME,
diff --git a/drivers/gpu/drm/drm_stub.c b/drivers/gpu/drm/drm_stub.c
index 096e2a37446..7c8b15b22bf 100644
--- a/drivers/gpu/drm/drm_stub.c
+++ b/drivers/gpu/drm/drm_stub.c
@@ -168,7 +168,7 @@ int drm_setmaster_ioctl(struct drm_device *dev, void *data,
file_priv->minor->master != file_priv->master) {
mutex_lock(&dev->struct_mutex);
file_priv->minor->master = drm_master_get(file_priv->master);
- mutex_lock(&dev->struct_mutex);
+ mutex_unlock(&dev->struct_mutex);
}
return 0;
diff --git a/drivers/i2c/busses/i2c-mv64xxx.c b/drivers/i2c/busses/i2c-mv64xxx.c
index eeda276f8f1..7f186bbcb99 100644
--- a/drivers/i2c/busses/i2c-mv64xxx.c
+++ b/drivers/i2c/busses/i2c-mv64xxx.c
@@ -482,7 +482,7 @@ mv64xxx_i2c_map_regs(struct platform_device *pd,
return 0;
}
-static void __devexit
+static void
mv64xxx_i2c_unmap_regs(struct mv64xxx_i2c_data *drv_data)
{
if (drv_data->reg_base) {
@@ -577,7 +577,7 @@ mv64xxx_i2c_remove(struct platform_device *dev)
static struct platform_driver mv64xxx_i2c_driver = {
.probe = mv64xxx_i2c_probe,
- .remove = mv64xxx_i2c_remove,
+ .remove = __devexit_p(mv64xxx_i2c_remove),
.driver = {
.owner = THIS_MODULE,
.name = MV64XXX_I2C_CTLR_NAME,
diff --git a/drivers/mtd/nand/orion_nand.c b/drivers/mtd/nand/orion_nand.c
index 917cf8d3ae9..c2dfd3ea353 100644
--- a/drivers/mtd/nand/orion_nand.c
+++ b/drivers/mtd/nand/orion_nand.c
@@ -149,7 +149,7 @@ static int __devexit orion_nand_remove(struct platform_device *pdev)
static struct platform_driver orion_nand_driver = {
.probe = orion_nand_probe,
- .remove = orion_nand_remove,
+ .remove = __devexit_p(orion_nand_remove),
.driver = {
.name = "orion_nand",
.owner = THIS_MODULE,
diff --git a/drivers/net/arm/Makefile b/drivers/net/arm/Makefile
index c69c0cdba4a..811a3ccd14c 100644
--- a/drivers/net/arm/Makefile
+++ b/drivers/net/arm/Makefile
@@ -4,7 +4,7 @@
#
obj-$(CONFIG_ARM_AM79C961A) += am79c961a.o
-obj-$(CONFIG_ARM_ETHERH) += etherh.o ../8390.o
+obj-$(CONFIG_ARM_ETHERH) += etherh.o
obj-$(CONFIG_ARM_ETHER3) += ether3.o
obj-$(CONFIG_ARM_ETHER1) += ether1.o
obj-$(CONFIG_ARM_AT91_ETHER) += at91_ether.o
diff --git a/drivers/net/arm/etherh.c b/drivers/net/arm/etherh.c
index 54b52e5b182..f52f668c49b 100644
--- a/drivers/net/arm/etherh.c
+++ b/drivers/net/arm/etherh.c
@@ -641,15 +641,15 @@ static const struct net_device_ops etherh_netdev_ops = {
.ndo_open = etherh_open,
.ndo_stop = etherh_close,
.ndo_set_config = etherh_set_config,
- .ndo_start_xmit = ei_start_xmit,
- .ndo_tx_timeout = ei_tx_timeout,
- .ndo_get_stats = ei_get_stats,
- .ndo_set_multicast_list = ei_set_multicast_list,
+ .ndo_start_xmit = __ei_start_xmit,
+ .ndo_tx_timeout = __ei_tx_timeout,
+ .ndo_get_stats = __ei_get_stats,
+ .ndo_set_multicast_list = __ei_set_multicast_list,
.ndo_validate_addr = eth_validate_addr,
.ndo_set_mac_address = eth_mac_addr,
.ndo_change_mtu = eth_change_mtu,
#ifdef CONFIG_NET_POLL_CONTROLLER
- .ndo_poll_controller = ei_poll,
+ .ndo_poll_controller = __ei_poll,
#endif
};
diff --git a/drivers/video/pxafb.c b/drivers/video/pxafb.c
index 48ff701d3a7..2552b9f325e 100644
--- a/drivers/video/pxafb.c
+++ b/drivers/video/pxafb.c
@@ -2230,7 +2230,7 @@ static int __devexit pxafb_remove(struct platform_device *dev)
static struct platform_driver pxafb_driver = {
.probe = pxafb_probe,
- .remove = pxafb_remove,
+ .remove = __devexit_p(pxafb_remove),
.suspend = pxafb_suspend,
.resume = pxafb_resume,
.driver = {
diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h
index f3f697df1d7..80044a4f3ab 100644
--- a/include/linux/rcuclassic.h
+++ b/include/linux/rcuclassic.h
@@ -181,4 +181,10 @@ extern long rcu_batches_completed_bh(void);
#define rcu_enter_nohz() do { } while (0)
#define rcu_exit_nohz() do { } while (0)
+/* A context switch is a grace period for rcuclassic. */
+static inline int rcu_blocking_is_gp(void)
+{
+ return num_online_cpus() == 1;
+}
+
#endif /* __LINUX_RCUCLASSIC_H */
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 921340a7b71..528343e6da5 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -52,6 +52,9 @@ struct rcu_head {
void (*func)(struct rcu_head *head);
};
+/* Internal to kernel, but needed by rcupreempt.h. */
+extern int rcu_scheduler_active;
+
#if defined(CONFIG_CLASSIC_RCU)
#include <linux/rcuclassic.h>
#elif defined(CONFIG_TREE_RCU)
@@ -265,6 +268,7 @@ extern void rcu_barrier_sched(void);
/* Internal to kernel */
extern void rcu_init(void);
+extern void rcu_scheduler_starting(void);
extern int rcu_needs_cpu(int cpu);
#endif /* __LINUX_RCUPDATE_H */
diff --git a/include/linux/rcupreempt.h b/include/linux/rcupreempt.h
index 3e05c09b54a..74304b4538d 100644
--- a/include/linux/rcupreempt.h
+++ b/include/linux/rcupreempt.h
@@ -142,4 +142,19 @@ static inline void rcu_exit_nohz(void)
#define rcu_exit_nohz() do { } while (0)
#endif /* CONFIG_NO_HZ */
+/*
+ * A context switch is a grace period for rcupreempt synchronize_rcu()
+ * only during early boot, before the scheduler has been initialized.
+ * So, how the heck do we get a context switch? Well, if the caller
+ * invokes synchronize_rcu(), they are willing to accept a context
+ * switch, so we simply pretend that one happened.
+ *
+ * After boot, there might be a blocked or preempted task in an RCU
+ * read-side critical section, so we cannot then take the fastpath.
+ */
+static inline int rcu_blocking_is_gp(void)
+{
+ return num_online_cpus() == 1 && !rcu_scheduler_active;
+}
+
#endif /* __LINUX_RCUPREEMPT_H */
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index d4368b7975c..a722fb67bb2 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -326,4 +326,10 @@ static inline void rcu_exit_nohz(void)
}
#endif /* CONFIG_NO_HZ */
+/* A context switch is a grace period for rcutree. */
+static inline int rcu_blocking_is_gp(void)
+{
+ return num_online_cpus() == 1;
+}
+
#endif /* __LINUX_RCUTREE_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f0a50b20e8a..a7c7698583b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2303,9 +2303,13 @@ extern long sched_group_rt_runtime(struct task_group *tg);
extern int sched_group_set_rt_period(struct task_group *tg,
long rt_period_us);
extern long sched_group_rt_period(struct task_group *tg);
+extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk);
#endif
#endif
+extern int task_can_switch_user(struct user_struct *up,
+ struct task_struct *tsk);
+
#ifdef CONFIG_TASK_XACCT
static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
{
diff --git a/init/main.c b/init/main.c
index 6441083f827..6bf83afd654 100644
--- a/init/main.c
+++ b/init/main.c
@@ -98,7 +98,7 @@ static inline void mark_rodata_ro(void) { }
extern void tc_init(void);
#endif
-enum system_states system_state;
+enum system_states system_state __read_mostly;
EXPORT_SYMBOL(system_state);
/*
@@ -464,6 +464,7 @@ static noinline void __init_refok rest_init(void)
* at least once to get things moving:
*/
init_idle_bootup_task(current);
+ rcu_scheduler_starting();
preempt_enable_no_resched();
schedule();
preempt_disable();
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index bd5a9003497..654c640a6b9 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -679,8 +679,8 @@ int rcu_needs_cpu(int cpu)
void rcu_check_callbacks(int cpu, int user)
{
if (user ||
- (idle_cpu(cpu) && !in_softirq() &&
- hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
+ (idle_cpu(cpu) && rcu_scheduler_active &&
+ !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
/*
* Get here if this CPU took its interrupt from user
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index d92a76a881a..cae8a059cf4 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -44,6 +44,7 @@
#include <linux/cpu.h>
#include <linux/mutex.h>
#include <linux/module.h>
+#include <linux/kernel_stat.h>
enum rcu_barrier {
RCU_BARRIER_STD,
@@ -55,6 +56,7 @@ static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
static atomic_t rcu_barrier_cpu_count;
static DEFINE_MUTEX(rcu_barrier_mutex);
static struct completion rcu_barrier_completion;
+int rcu_scheduler_active __read_mostly;
/*
* Awaken the corresponding synchronize_rcu() instance now that a
@@ -80,6 +82,10 @@ void wakeme_after_rcu(struct rcu_head *head)
void synchronize_rcu(void)
{
struct rcu_synchronize rcu;
+
+ if (rcu_blocking_is_gp())
+ return;
+
init_completion(&rcu.completion);
/* Will wake me after RCU finished. */
call_rcu(&rcu.head, wakeme_after_rcu);
@@ -175,3 +181,9 @@ void __init rcu_init(void)
__rcu_init();
}
+void rcu_scheduler_starting(void)
+{
+ WARN_ON(num_online_cpus() != 1);
+ WARN_ON(nr_context_switches() > 0);
+ rcu_scheduler_active = 1;
+}
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 33cfc50781f..5d59e850fb7 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -1181,6 +1181,9 @@ void __synchronize_sched(void)
{
struct rcu_synchronize rcu;
+ if (num_online_cpus() == 1)
+ return; /* blocking is gp if only one CPU! */
+
init_completion(&rcu.completion);
/* Will wake me after RCU finished. */
call_rcu_sched(&rcu.head, wakeme_after_rcu);
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index b2fd602a6f6..97ce31579ec 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -948,8 +948,8 @@ static void rcu_do_batch(struct rcu_data *rdp)
void rcu_check_callbacks(int cpu, int user)
{
if (user ||
- (idle_cpu(cpu) && !in_softirq() &&
- hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
+ (idle_cpu(cpu) && rcu_scheduler_active &&
+ !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
/*
* Get here if this CPU took its interrupt from user
diff --git a/kernel/sched.c b/kernel/sched.c
index 0e5c38e1c8b..0a76d0b6f21 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -223,7 +223,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
ktime_t now;
- if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
+ if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
return;
if (hrtimer_active(&rt_b->rt_period_timer))
@@ -9219,6 +9219,16 @@ static int sched_rt_global_constraints(void)
return ret;
}
+
+int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
+{
+ /* Don't accept realtime tasks when there is no way for them to run */
+ if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
+ return 0;
+
+ return 1;
+}
+
#else /* !CONFIG_RT_GROUP_SCHED */
static int sched_rt_global_constraints(void)
{
@@ -9312,8 +9322,7 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
struct task_struct *tsk)
{
#ifdef CONFIG_RT_GROUP_SCHED
- /* Don't accept realtime tasks when there is no way for them to run */
- if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0)
+ if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
return -EINVAL;
#else
/* We don't support RT-tasks being in separate groups */
diff --git a/kernel/sys.c b/kernel/sys.c
index f145c415bc1..37f458e6882 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -559,7 +559,7 @@ error:
abort_creds(new);
return retval;
}
-
+
/*
* change the user struct in a credentials set to match the new UID
*/
@@ -571,6 +571,11 @@ static int set_user(struct cred *new)
if (!new_user)
return -EAGAIN;
+ if (!task_can_switch_user(new_user, current)) {
+ free_uid(new_user);
+ return -EINVAL;
+ }
+
if (atomic_read(&new_user->processes) >=
current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
new_user != INIT_USER) {
@@ -631,10 +636,11 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
goto error;
}
- retval = -EAGAIN;
- if (new->uid != old->uid && set_user(new) < 0)
- goto error;
-
+ if (new->uid != old->uid) {
+ retval = set_user(new);
+ if (retval < 0)
+ goto error;
+ }
if (ruid != (uid_t) -1 ||
(euid != (uid_t) -1 && euid != old->uid))
new->suid = new->euid;
@@ -680,9 +686,10 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)
retval = -EPERM;
if (capable(CAP_SETUID)) {
new->suid = new->uid = uid;
- if (uid != old->uid && set_user(new) < 0) {
- retval = -EAGAIN;
- goto error;
+ if (uid != old->uid) {
+ retval = set_user(new);
+ if (retval < 0)
+ goto error;
}
} else if (uid != old->uid && uid != new->suid) {
goto error;
@@ -734,11 +741,13 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
goto error;
}
- retval = -EAGAIN;
if (ruid != (uid_t) -1) {
new->uid = ruid;
- if (ruid != old->uid && set_user(new) < 0)
- goto error;
+ if (ruid != old->uid) {
+ retval = set_user(new);
+ if (retval < 0)
+ goto error;
+ }
}
if (euid != (uid_t) -1)
new->euid = euid;
diff --git a/kernel/user.c b/kernel/user.c
index 3551ac74239..6a9b696128c 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -362,6 +362,24 @@ static void free_user(struct user_struct *up, unsigned long flags)
#endif
+#if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_USER_SCHED)
+/*
+ * We need to check if a setuid can take place. This function should be called
+ * before successfully completing the setuid.
+ */
+int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
+{
+
+ return sched_rt_can_attach(up->tg, tsk);
+
+}
+#else
+int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
+{
+ return 1;
+}
+#endif
+
/*
* Locate the user_struct for the passed UID. If found, take a ref on it. The
* caller must undo that ref with free_uid().