aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2009-03-30 11:38:31 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2009-03-30 11:38:31 -0700
commit019abbc87025a030fd25008612afd4eff8a375f7 (patch)
tree6d745dedcf90ceff8f5b7b996a17f666b7c574e3
parent2d25ee36c84d5b2d6be8bfaf80256ecad69a06ca (diff)
parent5a3c8fe7353f78b73b9636353c6f7b881f19ebea (diff)
Merge branch 'x86-stage-3-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'x86-stage-3-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (190 commits) Revert "cpuacct: reduce one NULL check in fast-path" Revert "x86: don't compile vsmp_64 for 32bit" x86: Correct behaviour of irq affinity x86: early_ioremap_init(), use __fix_to_virt(), because we are sure it's safe x86: use default_cpu_mask_to_apicid for 64bit x86: fix set_extra_move_desc calling x86, PAT, PCI: Change vma prot in pci_mmap to reflect inherited prot x86/dmi: fix dmi_alloc() section mismatches x86: e820 fix various signedness issues in setup.c and e820.c x86: apic/io_apic.c define msi_ir_chip and ir_ioapic_chip all the time x86: irq.c keep CONFIG_X86_LOCAL_APIC interrupts together x86: irq.c use same path for show_interrupts x86: cpu/cpu.h cleanup x86: Fix a couple of sparse warnings in arch/x86/kernel/apic/io_apic.c Revert "x86: create a non-zero sized bm_pte only when needed" x86: pci-nommu.c cleanup x86: io_delay.c cleanup x86: rtc.c cleanup x86: i8253 cleanup x86: kdebugfs.c cleanup ...
-rw-r--r--Documentation/x86/earlyprintk.txt101
-rw-r--r--arch/x86/Kconfig15
-rw-r--r--arch/x86/Kconfig.cpu17
-rw-r--r--arch/x86/Makefile29
-rw-r--r--arch/x86/boot/Makefile53
-rw-r--r--arch/x86/boot/header.S29
-rw-r--r--arch/x86/boot/pm.c44
-rw-r--r--arch/x86/boot/pmjump.S1
-rw-r--r--arch/x86/boot/setup.ld3
-rw-r--r--arch/x86/boot/tools/build.c9
-rw-r--r--arch/x86/boot/video-vga.c22
-rw-r--r--arch/x86/include/asm/apic.h37
-rw-r--r--arch/x86/include/asm/apicdef.h1
-rw-r--r--arch/x86/include/asm/boot.h4
-rw-r--r--arch/x86/include/asm/cacheflush.h3
-rwxr-xr-xarch/x86/include/asm/cpu_debug.h226
-rw-r--r--arch/x86/include/asm/desc.h3
-rw-r--r--arch/x86/include/asm/dmi.h19
-rw-r--r--arch/x86/include/asm/e820.h2
-rw-r--r--arch/x86/include/asm/entry_arch.h2
-rw-r--r--arch/x86/include/asm/hardirq.h1
-rw-r--r--arch/x86/include/asm/highmem.h1
-rw-r--r--arch/x86/include/asm/hw_irq.h1
-rw-r--r--arch/x86/include/asm/init.h18
-rw-r--r--arch/x86/include/asm/io_apic.h5
-rw-r--r--arch/x86/include/asm/irq.h1
-rw-r--r--arch/x86/include/asm/irq_remapping.h2
-rw-r--r--arch/x86/include/asm/irq_vectors.h5
-rw-r--r--arch/x86/include/asm/kexec.h13
-rw-r--r--arch/x86/include/asm/linkage.h19
-rw-r--r--arch/x86/include/asm/mce.h35
-rw-r--r--arch/x86/include/asm/msidef.h1
-rw-r--r--arch/x86/include/asm/msr-index.h5
-rw-r--r--arch/x86/include/asm/page_32_types.h5
-rw-r--r--arch/x86/include/asm/page_types.h6
-rw-r--r--arch/x86/include/asm/paravirt.h19
-rw-r--r--arch/x86/include/asm/pat.h5
-rw-r--r--arch/x86/include/asm/pgtable-2level.h7
-rw-r--r--arch/x86/include/asm/pgtable-3level.h17
-rw-r--r--arch/x86/include/asm/pgtable.h2
-rw-r--r--arch/x86/include/asm/pgtable_32.h3
-rw-r--r--arch/x86/include/asm/pgtable_32_types.h5
-rw-r--r--arch/x86/include/asm/pgtable_types.h1
-rw-r--r--arch/x86/include/asm/processor.h5
-rw-r--r--arch/x86/include/asm/sections.h7
-rw-r--r--arch/x86/include/asm/setup.h39
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h4
-rw-r--r--arch/x86/include/asm/xen/hypercall.h2
-rw-r--r--arch/x86/kernel/Makefile4
-rw-r--r--arch/x86/kernel/alternative.c17
-rw-r--r--arch/x86/kernel/apic/apic.c35
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c18
-rw-r--r--arch/x86/kernel/apic/io_apic.c292
-rw-r--r--arch/x86/kernel/apic/probe_64.c7
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c6
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c6
-rw-r--r--arch/x86/kernel/check.c8
-rw-r--r--arch/x86/kernel/cpu/Makefile5
-rw-r--r--arch/x86/kernel/cpu/addon_cpuid_features.c2
-rw-r--r--arch/x86/kernel/cpu/amd.c54
-rw-r--r--arch/x86/kernel/cpu/centaur.c36
-rw-r--r--arch/x86/kernel/cpu/centaur_64.c37
-rw-r--r--arch/x86/kernel/cpu/common.c396
-rw-r--r--arch/x86/kernel/cpu/cpu.h25
-rwxr-xr-xarch/x86/kernel/cpu/cpu_debug.c901
-rw-r--r--arch/x86/kernel/cpu/cyrix.c16
-rw-r--r--arch/x86/kernel/cpu/intel.c32
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c8
-rw-r--r--arch/x86/kernel/cpu/mcheck/Makefile1
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_32.c14
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_64.c530
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd_64.c62
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel_64.c207
-rw-r--r--arch/x86/kernel/cpu/mcheck/threshold.c29
-rw-r--r--arch/x86/kernel/cpu/mtrr/Makefile2
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c1101
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c202
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c1069
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.h4
-rw-r--r--arch/x86/kernel/cpu/transmeta.c2
-rw-r--r--arch/x86/kernel/cpu/umc.c2
-rw-r--r--arch/x86/kernel/e820.c142
-rw-r--r--arch/x86/kernel/early_printk.c20
-rw-r--r--arch/x86/kernel/entry_32.S18
-rw-r--r--arch/x86/kernel/entry_64.S6
-rw-r--r--arch/x86/kernel/head32.c5
-rw-r--r--arch/x86/kernel/head64.c2
-rw-r--r--arch/x86/kernel/head_32.S76
-rw-r--r--arch/x86/kernel/i8253.c68
-rw-r--r--arch/x86/kernel/io_delay.c27
-rw-r--r--arch/x86/kernel/irq.c88
-rw-r--r--arch/x86/kernel/irqinit_32.c3
-rw-r--r--arch/x86/kernel/irqinit_64.c3
-rw-r--r--arch/x86/kernel/kdebugfs.c82
-rw-r--r--arch/x86/kernel/kprobes.c2
-rw-r--r--arch/x86/kernel/kvm.c7
-rw-r--r--arch/x86/kernel/machine_kexec_32.c17
-rw-r--r--arch/x86/kernel/machine_kexec_64.c99
-rw-r--r--arch/x86/kernel/mmconf-fam10h_64.c2
-rw-r--r--arch/x86/kernel/mpparse.c384
-rw-r--r--arch/x86/kernel/paravirt.c1
-rw-r--r--arch/x86/kernel/pci-nommu.c20
-rw-r--r--arch/x86/kernel/process.c5
-rw-r--r--arch/x86/kernel/ptrace.c3
-rw-r--r--arch/x86/kernel/quirks.c3
-rw-r--r--arch/x86/kernel/relocate_kernel_32.S24
-rw-r--r--arch/x86/kernel/relocate_kernel_64.S189
-rw-r--r--arch/x86/kernel/rtc.c20
-rw-r--r--arch/x86/kernel/setup.c58
-rw-r--r--arch/x86/kernel/signal.c48
-rw-r--r--arch/x86/kernel/smpboot.c78
-rw-r--r--arch/x86/kernel/tlb_uv.c3
-rw-r--r--arch/x86/kernel/topology.c14
-rw-r--r--arch/x86/kernel/uv_time.c393
-rw-r--r--arch/x86/kernel/visws_quirks.c2
-rw-r--r--arch/x86/kernel/vmi_32.c6
-rw-r--r--arch/x86/kernel/vmlinux_32.lds.S21
-rw-r--r--arch/x86/kernel/vmlinux_64.lds.S101
-rw-r--r--arch/x86/kernel/vsmp_64.c12
-rw-r--r--arch/x86/lguest/boot.c8
-rw-r--r--arch/x86/lib/memcpy_64.S143
-rw-r--r--arch/x86/mm/highmem_32.c24
-rw-r--r--arch/x86/mm/init.c344
-rw-r--r--arch/x86/mm/init_32.c256
-rw-r--r--arch/x86/mm/init_64.c280
-rw-r--r--arch/x86/mm/iomap_32.c26
-rw-r--r--arch/x86/mm/ioremap.c52
-rw-r--r--arch/x86/mm/kmmio.c2
-rw-r--r--arch/x86/mm/memtest.c3
-rw-r--r--arch/x86/mm/numa_32.c5
-rw-r--r--arch/x86/mm/pageattr.c147
-rw-r--r--arch/x86/mm/pat.c5
-rw-r--r--arch/x86/mm/pgtable_32.c2
-rw-r--r--arch/x86/mm/tlb.c5
-rw-r--r--arch/x86/pci/common.c4
-rw-r--r--arch/x86/pci/fixup.c4
-rw-r--r--arch/x86/pci/i386.c3
-rw-r--r--arch/x86/xen/mmu.c7
-rw-r--r--drivers/pci/dmar.c296
-rw-r--r--drivers/pci/intel-iommu.c220
-rw-r--r--drivers/pci/intr_remapping.c110
-rw-r--r--include/linux/dmar.h52
-rw-r--r--include/linux/intel-iommu.h5
-rw-r--r--include/linux/mm.h3
-rw-r--r--mm/memory.c6
145 files changed, 6465 insertions, 3583 deletions
diff --git a/Documentation/x86/earlyprintk.txt b/Documentation/x86/earlyprintk.txt
new file mode 100644
index 00000000000..607b1a01606
--- /dev/null
+++ b/Documentation/x86/earlyprintk.txt
@@ -0,0 +1,101 @@
+
+Mini-HOWTO for using the earlyprintk=dbgp boot option with a
+USB2 Debug port key and a debug cable, on x86 systems.
+
+You need two computers, the 'USB debug key' special gadget and
+and two USB cables, connected like this:
+
+ [host/target] <-------> [USB debug key] <-------> [client/console]
+
+1. There are three specific hardware requirements:
+
+ a.) Host/target system needs to have USB debug port capability.
+
+ You can check this capability by looking at a 'Debug port' bit in
+ the lspci -vvv output:
+
+ # lspci -vvv
+ ...
+ 00:1d.7 USB Controller: Intel Corporation 82801H (ICH8 Family) USB2 EHCI Controller #1 (rev 03) (prog-if 20 [EHCI])
+ Subsystem: Lenovo ThinkPad T61
+ Control: I/O- Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR+ FastB2B- DisINTx-
+ Status: Cap+ 66MHz- UDF- FastB2B+ ParErr- DEVSEL=medium >TAbort- <TAbort- <MAbort- >SERR- <PERR- INTx-
+ Latency: 0
+ Interrupt: pin D routed to IRQ 19
+ Region 0: Memory at fe227000 (32-bit, non-prefetchable) [size=1K]
+ Capabilities: [50] Power Management version 2
+ Flags: PMEClk- DSI- D1- D2- AuxCurrent=375mA PME(D0+,D1-,D2-,D3hot+,D3cold+)
+ Status: D0 PME-Enable- DSel=0 DScale=0 PME+
+ Capabilities: [58] Debug port: BAR=1 offset=00a0
+ ^^^^^^^^^^^ <==================== [ HERE ]
+ Kernel driver in use: ehci_hcd
+ Kernel modules: ehci-hcd
+ ...
+
+( If your system does not list a debug port capability then you probably
+ wont be able to use the USB debug key. )
+
+ b.) You also need a Netchip USB debug cable/key:
+
+ http://www.plxtech.com/products/NET2000/NET20DC/default.asp
+
+ This is a small blue plastic connector with two USB connections,
+ it draws power from its USB connections.
+
+ c.) Thirdly, you need a second client/console system with a regular USB port.
+
+2. Software requirements:
+
+ a.) On the host/target system:
+
+ You need to enable the following kernel config option:
+
+ CONFIG_EARLY_PRINTK_DBGP=y
+
+ And you need to add the boot command line: "earlyprintk=dbgp".
+ (If you are using Grub, append it to the 'kernel' line in
+ /etc/grub.conf)
+
+ NOTE: normally earlyprintk console gets turned off once the
+ regular console is alive - use "earlyprintk=dbgp,keep" to keep
+ this channel open beyond early bootup. This can be useful for
+ debugging crashes under Xorg, etc.
+
+ b.) On the client/console system:
+
+ You should enable the following kernel config option:
+
+ CONFIG_USB_SERIAL_DEBUG=y
+
+ On the next bootup with the modified kernel you should
+ get a /dev/ttyUSBx device(s).
+
+ Now this channel of kernel messages is ready to be used: start
+ your favorite terminal emulator (minicom, etc.) and set
+ it up to use /dev/ttyUSB0 - or use a raw 'cat /dev/ttyUSBx' to
+ see the raw output.
+
+ c.) On Nvidia Southbridge based systems: the kernel will try to probe
+ and find out which port has debug device connected.
+
+3. Testing that it works fine:
+
+ You can test the output by using earlyprintk=dbgp,keep and provoking
+ kernel messages on the host/target system. You can provoke a harmless
+ kernel message by for example doing:
+
+ echo h > /proc/sysrq-trigger
+
+ On the host/target system you should see this help line in "dmesg" output:
+
+ SysRq : HELP : loglevel(0-9) reBoot Crashdump terminate-all-tasks(E) memory-full-oom-kill(F) kill-all-tasks(I) saK show-backtrace-all-active-cpus(L) show-memory-usage(M) nice-all-RT-tasks(N) powerOff show-registers(P) show-all-timers(Q) unRaw Sync show-task-states(T) Unmount show-blocked-tasks(W) dump-ftrace-buffer(Z)
+
+ On the client/console system do:
+
+ cat /dev/ttyUSB0
+
+ And you should see the help line above displayed shortly after you've
+ provoked it on the host system.
+
+If it does not work then please ask about it on the linux-kernel@vger.kernel.org
+mailing list or contact the x86 maintainers.
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 06c02c00d7d..34bc3a89228 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -786,6 +786,11 @@ config X86_MCE_AMD
Additional support for AMD specific MCE features such as
the DRAM Error Threshold.
+config X86_MCE_THRESHOLD
+ depends on X86_MCE_AMD || X86_MCE_INTEL
+ bool
+ default y
+
config X86_MCE_NONFATAL
tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4"
depends on X86_32 && X86_MCE
@@ -929,6 +934,12 @@ config X86_CPUID
with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to
/dev/cpu/31/cpuid.
+config X86_CPU_DEBUG
+ tristate "/sys/kernel/debug/x86/cpu/* - CPU Debug support"
+ ---help---
+ If you select this option, this will provide various x86 CPUs
+ information through debugfs.
+
choice
prompt "High Memory Support"
default HIGHMEM4G if !X86_NUMAQ
@@ -1121,7 +1132,7 @@ config NUMA_EMU
config NODES_SHIFT
int "Maximum NUMA Nodes (as a power of 2)" if !MAXSMP
- range 1 9 if X86_64
+ range 1 9
default "9" if MAXSMP
default "6" if X86_64
default "4" if X86_NUMAQ
@@ -1429,7 +1440,7 @@ config CRASH_DUMP
config KEXEC_JUMP
bool "kexec jump (EXPERIMENTAL)"
depends on EXPERIMENTAL
- depends on KEXEC && HIBERNATION && X86_32
+ depends on KEXEC && HIBERNATION
---help---
Jump between original kernel and kexeced kernel and invoke
code in physical address mode via KEXEC
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index a95eaf0e582..924e156a85a 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -456,24 +456,9 @@ config CPU_SUP_AMD
If unsure, say N.
-config CPU_SUP_CENTAUR_32
+config CPU_SUP_CENTAUR
default y
bool "Support Centaur processors" if PROCESSOR_SELECT
- depends on !64BIT
- ---help---
- This enables detection, tunings and quirks for Centaur processors
-
- You need this enabled if you want your kernel to run on a
- Centaur CPU. Disabling this option on other types of CPUs
- makes the kernel a tiny bit smaller. Disabling it on a Centaur
- CPU might render the kernel unbootable.
-
- If unsure, say N.
-
-config CPU_SUP_CENTAUR_64
- default y
- bool "Support Centaur processors" if PROCESSOR_SELECT
- depends on 64BIT
---help---
This enables detection, tunings and quirks for Centaur processors
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 1836191839e..f05d8c91d9e 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -153,34 +153,23 @@ endif
boot := arch/x86/boot
-PHONY += zImage bzImage compressed zlilo bzlilo \
- zdisk bzdisk fdimage fdimage144 fdimage288 isoimage install
+BOOT_TARGETS = bzlilo bzdisk fdimage fdimage144 fdimage288 isoimage install
+
+PHONY += bzImage $(BOOT_TARGETS)
# Default kernel to build
all: bzImage
# KBUILD_IMAGE specify target image being built
- KBUILD_IMAGE := $(boot)/bzImage
-zImage zlilo zdisk: KBUILD_IMAGE := $(boot)/zImage
+KBUILD_IMAGE := $(boot)/bzImage
-zImage bzImage: vmlinux
+bzImage: vmlinux
$(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)
$(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot
$(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@
-compressed: zImage
-
-zlilo bzlilo: vmlinux
- $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zlilo
-
-zdisk bzdisk: vmlinux
- $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zdisk
-
-fdimage fdimage144 fdimage288 isoimage: vmlinux
- $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) $@
-
-install:
- $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) install
+$(BOOT_TARGETS): vmlinux
+ $(Q)$(MAKE) $(build)=$(boot) $@
PHONY += vdso_install
vdso_install:
@@ -205,7 +194,3 @@ define archhelp
echo ' FDARGS="..." arguments for the booted kernel'
echo ' FDINITRD=file initrd for the booted kernel'
endef
-
-CLEAN_FILES += arch/x86/boot/fdimage \
- arch/x86/boot/image.iso \
- arch/x86/boot/mtools.conf
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index c70eff69a1f..fb737ce5888 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -6,26 +6,24 @@
# for more details.
#
# Copyright (C) 1994 by Linus Torvalds
+# Changed by many, many contributors over the years.
#
# ROOT_DEV specifies the default root-device when making the image.
# This can be either FLOPPY, CURRENT, /dev/xxxx or empty, in which case
# the default of FLOPPY is used by 'build'.
-ROOT_DEV := CURRENT
+ROOT_DEV := CURRENT
# If you want to preset the SVGA mode, uncomment the next line and
# set SVGA_MODE to whatever number you want.
# Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode.
# The number is the same as you would ordinarily press at bootup.
-SVGA_MODE := -DSVGA_MODE=NORMAL_VGA
+SVGA_MODE := -DSVGA_MODE=NORMAL_VGA
-# If you want the RAM disk device, define this to be the size in blocks.
-
-#RAMDISK := -DRAMDISK=512
-
-targets := vmlinux.bin setup.bin setup.elf zImage bzImage
+targets := vmlinux.bin setup.bin setup.elf bzImage
+targets += fdimage fdimage144 fdimage288 image.iso mtools.conf
subdir- := compressed
setup-y += a20.o cmdline.o copy.o cpu.o cpucheck.o edd.o
@@ -71,17 +69,13 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \
KBUILD_CFLAGS += $(call cc-option,-m32)
KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
-$(obj)/zImage: asflags-y := $(SVGA_MODE) $(RAMDISK)
-$(obj)/bzImage: ccflags-y := -D__BIG_KERNEL__
-$(obj)/bzImage: asflags-y := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__
-$(obj)/bzImage: BUILDFLAGS := -b
+$(obj)/bzImage: asflags-y := $(SVGA_MODE)
quiet_cmd_image = BUILD $@
-cmd_image = $(obj)/tools/build $(BUILDFLAGS) $(obj)/setup.bin \
- $(obj)/vmlinux.bin $(ROOT_DEV) > $@
+cmd_image = $(obj)/tools/build $(obj)/setup.bin $(obj)/vmlinux.bin \
+ $(ROOT_DEV) > $@
-$(obj)/zImage $(obj)/bzImage: $(obj)/setup.bin \
- $(obj)/vmlinux.bin $(obj)/tools/build FORCE
+$(obj)/bzImage: $(obj)/setup.bin $(obj)/vmlinux.bin $(obj)/tools/build FORCE
$(call if_changed,image)
@echo 'Kernel: $@ is ready' ' (#'`cat .version`')'
@@ -116,9 +110,11 @@ $(obj)/setup.bin: $(obj)/setup.elf FORCE
$(obj)/compressed/vmlinux: FORCE
$(Q)$(MAKE) $(build)=$(obj)/compressed $@
-# Set this if you want to pass append arguments to the zdisk/fdimage/isoimage kernel
+# Set this if you want to pass append arguments to the
+# bzdisk/fdimage/isoimage kernel
FDARGS =
-# Set this if you want an initrd included with the zdisk/fdimage/isoimage kernel
+# Set this if you want an initrd included with the
+# bzdisk/fdimage/isoimage kernel
FDINITRD =
image_cmdline = default linux $(FDARGS) $(if $(FDINITRD),initrd=initrd.img,)
@@ -127,7 +123,7 @@ $(obj)/mtools.conf: $(src)/mtools.conf.in
sed -e 's|@OBJ@|$(obj)|g' < $< > $@
# This requires write access to /dev/fd0
-zdisk: $(BOOTIMAGE) $(obj)/mtools.conf
+bzdisk: $(obj)/bzImage $(obj)/mtools.conf
MTOOLSRC=$(obj)/mtools.conf mformat a: ; sync
syslinux /dev/fd0 ; sync
echo '$(image_cmdline)' | \
@@ -135,10 +131,10 @@ zdisk: $(BOOTIMAGE) $(obj)/mtools.conf
if [ -f '$(FDINITRD)' ] ; then \
MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' a:initrd.img ; \
fi
- MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) a:linux ; sync
+ MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage a:linux ; sync
# These require being root or having syslinux 2.02 or higher installed
-fdimage fdimage144: $(BOOTIMAGE) $(obj)/mtools.conf
+fdimage fdimage144: $(obj)/bzImage $(obj)/mtools.conf
dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=1440
MTOOLSRC=$(obj)/mtools.conf mformat v: ; sync
syslinux $(obj)/fdimage ; sync
@@ -147,9 +143,9 @@ fdimage fdimage144: $(BOOTIMAGE) $(obj)/mtools.conf
if [ -f '$(FDINITRD)' ] ; then \
MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' v:initrd.img ; \
fi
- MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) v:linux ; sync
+ MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage v:linux ; sync
-fdimage288: $(BOOTIMAGE) $(obj)/mtools.conf
+fdimage288: $(obj)/bzImage $(obj)/mtools.conf
dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=2880
MTOOLSRC=$(obj)/mtools.conf mformat w: ; sync
syslinux $(obj)/fdimage ; sync
@@ -158,9 +154,9 @@ fdimage288: $(BOOTIMAGE) $(obj)/mtools.conf
if [ -f '$(FDINITRD)' ] ; then \
MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' w:initrd.img ; \
fi
- MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) w:linux ; sync
+ MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage w:linux ; sync
-isoimage: $(BOOTIMAGE)
+isoimage: $(obj)/bzImage
-rm -rf $(obj)/isoimage
mkdir $(obj)/isoimage
for i in lib lib64 share end ; do \
@@ -170,7 +166,7 @@ isoimage: $(BOOTIMAGE)
fi ; \
if [ $$i = end ] ; then exit 1 ; fi ; \
done
- cp $(BOOTIMAGE) $(obj)/isoimage/linux
+ cp $(obj)/bzImage $(obj)/isoimage/linux
echo '$(image_cmdline)' > $(obj)/isoimage/isolinux.cfg
if [ -f '$(FDINITRD)' ] ; then \
cp '$(FDINITRD)' $(obj)/isoimage/initrd.img ; \
@@ -181,12 +177,13 @@ isoimage: $(BOOTIMAGE)
isohybrid $(obj)/image.iso 2>/dev/null || true
rm -rf $(obj)/isoimage
-zlilo: $(BOOTIMAGE)
+bzlilo: $(obj)/bzImage
if [ -f $(INSTALL_PATH)/vmlinuz ]; then mv $(INSTALL_PATH)/vmlinuz $(INSTALL_PATH)/vmlinuz.old; fi
if [ -f $(INSTALL_PATH)/System.map ]; then mv $(INSTALL_PATH)/System.map $(INSTALL_PATH)/System.old; fi
- cat $(BOOTIMAGE) > $(INSTALL_PATH)/vmlinuz
+ cat $(obj)/bzImage > $(INSTALL_PATH)/vmlinuz
cp System.map $(INSTALL_PATH)/
if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi
install:
- sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(BOOTIMAGE) System.map "$(INSTALL_PATH)"
+ sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(obj)/bzImage \
+ System.map "$(INSTALL_PATH)"
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 7ccff4884a2..5d84d1c74e4 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -24,12 +24,8 @@
#include "boot.h"
#include "offsets.h"
-SETUPSECTS = 4 /* default nr of setup-sectors */
BOOTSEG = 0x07C0 /* original address of boot-sector */
-SYSSEG = DEF_SYSSEG /* system loaded at 0x10000 (65536) */
-SYSSIZE = DEF_SYSSIZE /* system size: # of 16-byte clicks */
- /* to be loaded */
-ROOT_DEV = 0 /* ROOT_DEV is now written by "build" */
+SYSSEG = 0x1000 /* historical load address >> 4 */
#ifndef SVGA_MODE
#define SVGA_MODE ASK_VGA
@@ -97,12 +93,12 @@ bugger_off_msg:
.section ".header", "a"
.globl hdr
hdr:
-setup_sects: .byte SETUPSECTS
+setup_sects: .byte 0 /* Filled in by build.c */
root_flags: .word ROOT_RDONLY
-syssize: .long SYSSIZE
-ram_size: .word RAMDISK
+syssize: .long 0 /* Filled in by build.c */
+ram_size: .word 0 /* Obsolete */
vid_mode: .word SVGA_MODE
-root_dev: .word ROOT_DEV
+root_dev: .word 0 /* Filled in by build.c */
boot_flag: .word 0xAA55
# offset 512, entry point
@@ -123,14 +119,15 @@ _start:
# or else old loadlin-1.5 will fail)
.globl realmode_swtch
realmode_swtch: .word 0, 0 # default_switch, SETUPSEG
-start_sys_seg: .word SYSSEG
+start_sys_seg: .word SYSSEG # obsolete and meaningless, but just
+ # in case something decided to "use" it
.word kernel_version-512 # pointing to kernel version string
# above section of header is compatible
# with loadlin-1.5 (header v1.5). Don't
# change it.
-type_of_loader: .byte 0 # = 0, old one (LILO, Loadlin,
- # Bootlin, SYSLX, bootsect...)
+type_of_loader: .byte 0 # 0 means ancient bootloader, newer
+ # bootloaders know to change this.
# See Documentation/i386/boot.txt for
# assigned ids
@@ -142,11 +139,7 @@ CAN_USE_HEAP = 0x80 # If set, the loader also has set
# space behind setup.S can be used for
# heap purposes.
# Only the loader knows what is free
-#ifndef __BIG_KERNEL__
- .byte 0
-#else
.byte LOADED_HIGH
-#endif
setup_move_size: .word 0x8000 # size to move, when setup is not
# loaded at 0x90000. We will move setup
@@ -157,11 +150,7 @@ setup_move_size: .word 0x8000 # size to move, when setup is not
code32_start: # here loaders can put a different
# start address for 32-bit code.
-#ifndef __BIG_KERNEL__
- .long 0x1000 # 0x1000 = default for zImage
-#else
.long 0x100000 # 0x100000 = default for big kernel
-#endif
ramdisk_image: .long 0 # address of loaded ramdisk image
# Here the loader puts the 32-bit
diff --git a/arch/x86/boot/pm.c b/arch/x86/boot/pm.c
index 85a1cd8a8ff..8062f891525 100644
--- a/arch/x86/boot/pm.c
+++ b/arch/x86/boot/pm.c
@@ -33,47 +33,6 @@ static void realmode_switch_hook(void)
}
/*
- * A zImage kernel is loaded at 0x10000 but wants to run at 0x1000.
- * A bzImage kernel is loaded and runs at 0x100000.
- */
-static void move_kernel_around(void)
-{
- /* Note: rely on the compile-time option here rather than
- the LOADED_HIGH flag. The Qemu kernel loader unconditionally
- sets the loadflags to zero. */
-#ifndef __BIG_KERNEL__
- u16 dst_seg, src_seg;
- u32 syssize;
-
- dst_seg = 0x1000 >> 4;
- src_seg = 0x10000 >> 4;
- syssize = boot_params.hdr.syssize; /* Size in 16-byte paragraphs */
-
- while (syssize) {
- int paras = (syssize >= 0x1000) ? 0x1000 : syssize;
- int dwords = paras << 2;
-
- asm volatile("pushw %%es ; "
- "pushw %%ds ; "
- "movw %1,%%es ; "
- "movw %2,%%ds ; "
- "xorw %%di,%%di ; "
- "xorw %%si,%%si ; "
- "rep;movsl ; "
- "popw %%ds ; "
- "popw %%es"
- : "+c" (dwords)
- : "r" (dst_seg), "r" (src_seg)
- : "esi", "edi");
-
- syssize -= paras;
- dst_seg += paras;
- src_seg += paras;
- }
-#endif
-}
-
-/*
* Disable all interrupts at the legacy PIC.
*/
static void mask_all_interrupts(void)
@@ -147,9 +106,6 @@ void go_to_protected_mode(void)
/* Hook before leaving real mode, also disables interrupts */
realmode_switch_hook();
- /* Move the kernel/setup to their final resting places */
- move_kernel_around();
-
/* Enable the A20 gate */
if (enable_a20()) {
puts("A20 gate not responding, unable to boot...\n");
diff --git a/arch/x86/boot/pmjump.S b/arch/x86/boot/pmjump.S
index 019c17a7585..3e0edc6d2a2 100644
--- a/arch/x86/boot/pmjump.S
+++ b/arch/x86/boot/pmjump.S
@@ -47,6 +47,7 @@ GLOBAL(protected_mode_jump)
ENDPROC(protected_mode_jump)
.code32
+ .section ".text32","ax"
GLOBAL(in_pm32)
# Set up data segments for flat 32-bit mode
movl %ecx, %ds
diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld
index df9234b3a5e..bb8dc2de796 100644
--- a/arch/x86/boot/setup.ld
+++ b/arch/x86/boot/setup.ld
@@ -17,7 +17,8 @@ SECTIONS
.header : { *(.header) }
.inittext : { *(.inittext) }
.initdata : { *(.initdata) }
- .text : { *(.text*) }
+ .text : { *(.text) }
+ .text32 : { *(.text32) }
. = ALIGN(16);
.rodata : { *(.rodata*) }
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
index 44dc1923c0e..ee3a4ea923a 100644
--- a/arch/x86/boot/tools/build.c
+++ b/arch/x86/boot/tools/build.c
@@ -130,7 +130,7 @@ static void die(const char * str, ...)
static void usage(void)
{
- die("Usage: build [-b] setup system [rootdev] [> image]");
+ die("Usage: build setup system [rootdev] [> image]");
}
int main(int argc, char ** argv)
@@ -145,11 +145,6 @@ int main(int argc, char ** argv)
void *kernel;
u32 crc = 0xffffffffUL;
- if (argc > 2 && !strcmp(argv[1], "-b"))
- {
- is_big_kernel = 1;
- argc--, argv++;
- }
if ((argc < 3) || (argc > 4))
usage();
if (argc > 3) {
@@ -216,8 +211,6 @@ int main(int argc, char ** argv)
die("Unable to mmap '%s': %m", argv[2]);
/* Number of 16-byte paragraphs, including space for a 4-byte CRC */
sys_size = (sz + 15 + 4) / 16;
- if (!is_big_kernel && sys_size > DEF_SYSSIZE)
- die("System is too big. Try using bzImage or modules.");
/* Patch the setup code with the appropriate size parameters */
buf[0x1f1] = setup_sectors-1;
diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c
index 5d4742ed4aa..95d86ce0421 100644
--- a/arch/x86/boot/video-vga.c
+++ b/arch/x86/boot/video-vga.c
@@ -129,41 +129,45 @@ u16 vga_crtc(void)
return (inb(0x3cc) & 1) ? 0x3d4 : 0x3b4;
}
-static void vga_set_480_scanlines(int end)
+static void vga_set_480_scanlines(int lines)
{
- u16 crtc;
- u8 csel;
+ u16 crtc; /* CRTC base address */
+ u8 csel; /* CRTC miscellaneous output register */
+ u8 ovfw; /* CRTC overflow register */
+ int end = lines-1;
crtc = vga_crtc();
+ ovfw = 0x3c | ((end >> (8-1)) & 0x02) | ((end >> (9-6)) & 0x40);
+
out_idx(0x0c, crtc, 0x11); /* Vertical sync end, unlock CR0-7 */
out_idx(0x0b, crtc, 0x06); /* Vertical total */
- out_idx(0x3e, crtc, 0x07); /* Vertical overflow */
+ out_idx(ovfw, crtc, 0x07); /* Vertical overflow */
out_idx(0xea, crtc, 0x10); /* Vertical sync start */
- out_idx(end, crtc, 0x12); /* Vertical display end */
+ out_idx(end, crtc, 0x12); /* Vertical display end */
out_idx(0xe7, crtc, 0x15); /* Vertical blank start */
out_idx(0x04, crtc, 0x16); /* Vertical blank end */
csel = inb(0x3cc);
csel &= 0x0d;
csel |= 0xe2;
- outb(csel, 0x3cc);
+ outb(csel, 0x3c2);
}
static void vga_set_80x30(void)
{
- vga_set_480_scanlines(0xdf);
+ vga_set_480_scanlines(30*16);
}
static void vga_set_80x34(void)
{
vga_set_14font();
- vga_set_480_scanlines(0xdb);
+ vga_set_480_scanlines(34*14);
}
static void vga_set_80x60(void)
{
vga_set_8font();
- vga_set_480_scanlines(0xdf);
+ vga_set_480_scanlines(60*8);
}
static int vga_set_mode(struct mode_info *mode)
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 4ef949c1972..df8a300dfe6 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -75,7 +75,7 @@ static inline void default_inquire_remote_apic(int apicid)
#define setup_secondary_clock setup_secondary_APIC_clock
#endif
-#ifdef CONFIG_X86_VSMP
+#ifdef CONFIG_X86_64
extern int is_vsmp_box(void);
#else
static inline int is_vsmp_box(void)
@@ -108,6 +108,16 @@ extern void native_apic_icr_write(u32 low, u32 id);
extern u64 native_apic_icr_read(void);
#ifdef CONFIG_X86_X2APIC
+/*
+ * Make previous memory operations globally visible before
+ * sending the IPI through x2apic wrmsr. We need a serializing instruction or
+ * mfence for this.
+ */
+static inline void x2apic_wrmsr_fence(void)
+{
+ asm volatile("mfence" : : : "memory");
+}
+
static inline void native_apic_msr_write(u32 reg, u32 v)
{
if (reg == APIC_DFR || reg == APIC_ID || reg == APIC_LDR ||
@@ -184,6 +194,9 @@ static inline int x2apic_enabled(void)
{
return 0;
}
+
+#define x2apic 0
+
#endif
extern int get_physical_broadcast(void);
@@ -379,6 +392,7 @@ static inline u32 safe_apic_wait_icr_idle(void)
static inline void ack_APIC_irq(void)
{
+#ifdef CONFIG_X86_LOCAL_APIC
/*
* ack_APIC_irq() actually gets compiled as a single instruction
* ... yummie.
@@ -386,6 +400,7 @@ static inline void ack_APIC_irq(void)
/* Docs say use 0 for future compatibility */
apic_write(APIC_EOI, 0);
+#endif
}
static inline unsigned default_get_apic_id(unsigned long x)
@@ -474,10 +489,19 @@ static inline int default_apic_id_registered(void)
return physid_isset(read_apic_id(), phys_cpu_present_map);
}
+static inline int default_phys_pkg_id(int cpuid_apic, int index_msb)
+{
+ return cpuid_apic >> index_msb;
+}
+
+extern int default_apicid_to_node(int logical_apicid);
+
+#endif
+
static inline unsigned int
default_cpu_mask_to_apicid(const struct cpumask *cpumask)
{
- return cpumask_bits(cpumask)[0];
+ return cpumask_bits(cpumask)[0] & APIC_ALL_CPUS;
}
static inline unsigned int
@@ -491,15 +515,6 @@ default_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
return (unsigned int)(mask1 & mask2 & mask3);
}
-static inline int default_phys_pkg_id(int cpuid_apic, int index_msb)
-{
- return cpuid_apic >> index_msb;
-}
-
-extern int default_apicid_to_node(int logical_apicid);
-
-#endif
-
static inline unsigned long default_check_apicid_used(physid_mask_t bitmap, int apicid)
{
return physid_isset(apicid, bitmap);
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 63134e31e8b..bc9514fb3b1 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -53,6 +53,7 @@
#define APIC_ESR_SENDILL 0x00020
#define APIC_ESR_RECVILL 0x00040
#define APIC_ESR_ILLREGA 0x00080
+#define APIC_LVTCMCI 0x2f0
#define APIC_ICR 0x300
#define APIC_DEST_SELF 0x40000
#define APIC_DEST_ALLINC 0x80000
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 6526cf08b0e..6ba23dd9fc9 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -1,10 +1,6 @@
#ifndef _ASM_X86_BOOT_H
#define _ASM_X86_BOOT_H
-/* Don't touch these, unless you really know what you're doing. */
-#define DEF_SYSSEG 0x1000
-#define DEF_SYSSIZE 0x7F00
-
/* Internal svga startup constants */
#define NORMAL_VGA 0xffff /* 80x25 mode */
#define EXTENDED_VGA 0xfffe /* 80x50 mode */
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 5b301b7ff5f..b3894bf52fc 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -90,6 +90,9 @@ int set_memory_4k(unsigned long addr, int numpages);
int set_memory_array_uc(unsigned long *addr, int addrinarray);
int set_memory_array_wb(unsigned long *addr, int addrinarray);
+int set_pages_array_uc(struct page **pages, int addrinarray);
+int set_pages_array_wb(struct page **pages, int addrinarray);
+
/*
* For legacy compatibility with the old APIs, a few functions
* are provided that work on a "struct page".
diff --git a/arch/x86/include/asm/cpu_debug.h b/arch/x86/include/asm/cpu_debug.h
new file mode 100755
index 00000000000..222802029fa
--- /dev/null
+++ b/arch/x86/include/asm/cpu_debug.h
@@ -0,0 +1,226 @@
+#ifndef _ASM_X86_CPU_DEBUG_H
+#define _ASM_X86_CPU_DEBUG_H
+
+/*
+ * CPU x86 architecture debug
+ *
+ * Copyright(C) 2009 Jaswinder Singh Rajput
+ */
+
+/* Register flags */
+enum cpu_debug_bit {
+/* Model Specific Registers (MSRs) */
+ CPU_MC_BIT, /* Machine Check */
+ CPU_MONITOR_BIT, /* Monitor */
+ CPU_TIME_BIT, /* Time */
+ CPU_PMC_BIT, /* Performance Monitor */
+ CPU_PLATFORM_BIT, /* Platform */
+ CPU_APIC_BIT, /* APIC */
+ CPU_POWERON_BIT, /* Power-on */
+ CPU_CONTROL_BIT, /* Control */
+ CPU_FEATURES_BIT, /* Features control */
+ CPU_LBRANCH_BIT, /* Last Branch */
+ CPU_BIOS_BIT, /* BIOS */
+ CPU_FREQ_BIT, /* Frequency */
+ CPU_MTTR_BIT, /* MTRR */
+ CPU_PERF_BIT, /* Performance */
+ CPU_CACHE_BIT, /* Cache */
+ CPU_SYSENTER_BIT, /* Sysenter */
+ CPU_THERM_BIT, /* Thermal */
+ CPU_MISC_BIT, /* Miscellaneous */
+ CPU_DEBUG_BIT, /* Debug */
+ CPU_PAT_BIT, /* PAT */
+ CPU_VMX_BIT, /* VMX */
+ CPU_CALL_BIT, /* System Call */
+ CPU_BASE_BIT, /* BASE Address */
+ CPU_VER_BIT, /* Version ID */
+ CPU_CONF_BIT, /* Configuration */
+ CPU_SMM_BIT, /* System mgmt mode */
+ CPU_SVM_BIT, /*Secure Virtual Machine*/
+ CPU_OSVM_BIT, /* OS-Visible Workaround*/
+/* Standard Registers */
+ CPU_TSS_BIT, /* Task Stack Segment */
+ CPU_CR_BIT, /* Control Registers */
+ CPU_DT_BIT, /* Descriptor Table */
+/* End of Registers flags */
+ CPU_REG_ALL_BIT, /* Select all Registers */
+};
+
+#define CPU_REG_ALL (~0) /* Select all Registers */
+
+#define CPU_MC (1 << CPU_MC_BIT)
+#define CPU_MONITOR (1 << CPU_MONITOR_BIT)
+#define CPU_TIME (1 << CPU_TIME_BIT)
+#define CPU_PMC (1 << CPU_PMC_BIT)
+#define CPU_PLATFORM (1 << CPU_PLATFORM_BIT)
+#define CPU_APIC (1 << CPU_APIC_BIT)
+#define CPU_POWERON (1 << CPU_POWERON_BIT)
+#define CPU_CONTROL (1 << CPU_CONTROL_BIT)
+#define CPU_FEATURES (1 << CPU_FEATURES_BIT)
+#define CPU_LBRANCH (1 << CPU_LBRANCH_BIT)
+#define CPU_BIOS (1 << CPU_BIOS_BIT)
+#define CPU_FREQ (1 << CPU_FREQ_BIT)
+#define CPU_MTRR (1 << CPU_MTTR_BIT)
+#define CPU_PERF (1 << CPU_PERF_BIT)
+#define CPU_CACHE (1 << CPU_CACHE_BIT)
+#define CPU_SYSENTER (1 << CPU_SYSENTER_BIT)
+#define CPU_THERM (1 << CPU_THERM_BIT)
+#define CPU_MISC (1 << CPU_MISC_BIT)
+#define CPU_DEBUG (1 << CPU_DEBUG_BIT)
+#define CPU_PAT (1 << CPU_PAT_BIT)
+#define CPU_VMX (1 << CPU_VMX_BIT)
+#define CPU_CALL (1 << CPU_CALL_BIT)
+#define CPU_BASE (1 << CPU_BASE_BIT)
+#define CPU_VER (1 << CPU_VER_BIT)
+#define CPU_CONF (1 << CPU_CONF_BIT)
+#define CPU_SMM (1 << CPU_SMM_BIT)
+#define CPU_SVM (1 << CPU_SVM_BIT)
+#define CPU_OSVM (1 << CPU_OSVM_BIT)
+#define CPU_TSS (1 << CPU_TSS_BIT)
+#define CPU_CR (1 << CPU_CR_BIT)
+#define CPU_DT (1 << CPU_DT_BIT)
+
+/* Register file flags */
+enum cpu_file_bit {
+ CPU_INDEX_BIT, /* index */
+ CPU_VALUE_BIT, /* value */
+};
+
+#define CPU_FILE_VALUE (1 << CPU_VALUE_BIT)
+
+/*
+ * DisplayFamily_DisplayModel Processor Families/Processor Number Series
+ * -------------------------- ------------------------------------------
+ * 05_01, 05_02, 05_04 Pentium, Pentium with MMX
+ *
+ * 06_01 Pentium Pro
+ * 06_03, 06_05 Pentium II Xeon, Pentium II
+ * 06_07, 06_08, 06_0A, 06_0B Pentium III Xeon, Pentum III
+ *
+ * 06_09, 060D Pentium M
+ *
+ * 06_0E Core Duo, Core Solo
+ *
+ * 06_0F Xeon 3000, 3200, 5100, 5300, 7300 series,
+ * Core 2 Quad, Core 2 Extreme, Core 2 Duo,
+ * Pentium dual-core
+ * 06_17 Xeon 5200, 5400 series, Core 2 Quad Q9650
+ *
+ * 06_1C Atom
+ *
+ * 0F_00, 0F_01, 0F_02 Xeon, Xeon MP, Pentium 4
+ * 0F_03, 0F_04 Xeon, Xeon MP, Pentium 4, Pentium D
+ *
+ * 0F_06 Xeon 7100, 5000 Series, Xeon MP,
+ * Pentium 4, Pentium D
+ */
+
+/* Register processors bits */
+enum cpu_processor_bit {
+ CPU_NONE,
+/* Intel */
+ CPU_INTEL_PENTIUM_BIT,
+ CPU_INTEL_P6_BIT,
+ CPU_INTEL_PENTIUM_M_BIT,
+ CPU_INTEL_CORE_BIT,
+ CPU_INTEL_CORE2_BIT,
+ CPU_INTEL_ATOM_BIT,
+ CPU_INTEL_XEON_P4_BIT,
+ CPU_INTEL_XEON_MP_BIT,
+/* AMD */
+ CPU_AMD_K6_BIT,
+ CPU_AMD_K7_BIT,
+ CPU_AMD_K8_BIT,
+ CPU_AMD_0F_BIT,
+ CPU_AMD_10_BIT,
+ CPU_AMD_11_BIT,
+};
+
+#define CPU_INTEL_PENTIUM (1 << CPU_INTEL_PENTIUM_BIT)
+#define CPU_INTEL_P6 (1 << CPU_INTEL_P6_BIT)
+#define CPU_INTEL_PENTIUM_M (1 << CPU_INTEL_PENTIUM_M_BIT)
+#define CPU_INTEL_CORE (1 << CPU_INTEL_CORE_BIT)
+#define CPU_INTEL_CORE2 (1 << CPU_INTEL_CORE2_BIT)
+#define CPU_INTEL_ATOM (1 << CPU_INTEL_ATOM_BIT)
+#define CPU_INTEL_XEON_P4 (1 << CPU_INTEL_XEON_P4_BIT)
+#define CPU_INTEL_XEON_MP (1 << CPU_INTEL_XEON_MP_BIT)
+
+#define CPU_INTEL_PX (CPU_INTEL_P6 | CPU_INTEL_PENTIUM_M)
+#define CPU_INTEL_COREX (CPU_INTEL_CORE | CPU_INTEL_CORE2)
+#define CPU_INTEL_XEON (CPU_INTEL_XEON_P4 | CPU_INTEL_XEON_MP)
+#define CPU_CO_AT (CPU_INTEL_CORE | CPU_INTEL_ATOM)
+#define CPU_C2_AT (CPU_INTEL_CORE2 | CPU_INTEL_ATOM)
+#define CPU_CX_AT (CPU_INTEL_COREX | CPU_INTEL_ATOM)
+#define CPU_CX_XE (CPU_INTEL_COREX | CPU_INTEL_XEON)
+#define CPU_P6_XE (CPU_INTEL_P6 | CPU_INTEL_XEON)
+#define CPU_PM_CO_AT (CPU_INTEL_PENTIUM_M | CPU_CO_AT)
+#define CPU_C2_AT_XE (CPU_C2_AT | CPU_INTEL_XEON)
+#define CPU_CX_AT_XE (CPU_CX_AT | CPU_INTEL_XEON)
+#define CPU_P6_CX_AT (CPU_INTEL_P6 | CPU_CX_AT)
+#define CPU_P6_CX_XE (CPU_P6_XE | CPU_INTEL_COREX)
+#define CPU_P6_CX_AT_XE (CPU_INTEL_P6 | CPU_CX_AT_XE)
+#define CPU_PM_CX_AT_XE (CPU_INTEL_PENTIUM_M | CPU_CX_AT_XE)
+#define CPU_PM_CX_AT (CPU_INTEL_PENTIUM_M | CPU_CX_AT)
+#define CPU_PM_CX_XE (CPU_INTEL_PENTIUM_M | CPU_CX_XE)
+#define CPU_PX_CX_AT (CPU_INTEL_PX | CPU_CX_AT)
+#define CPU_PX_CX_AT_XE (CPU_INTEL_PX | CPU_CX_AT_XE)
+
+/* Select all supported Intel CPUs */
+#define CPU_INTEL_ALL (CPU_INTEL_PENTIUM | CPU_PX_CX_AT_XE)
+
+#define CPU_AMD_K6 (1 << CPU_AMD_K6_BIT)
+#define CPU_AMD_K7 (1 << CPU_AMD_K7_BIT)
+#define CPU_AMD_K8 (1 << CPU_AMD_K8_BIT)
+#define CPU_AMD_0F (1 << CPU_AMD_0F_BIT)
+#define CPU_AMD_10 (1 << CPU_AMD_10_BIT)
+#define CPU_AMD_11 (1 << CPU_AMD_11_BIT)
+
+#define CPU_K10_PLUS (CPU_AMD_10 | CPU_AMD_11)
+#define CPU_K0F_PLUS (CPU_AMD_0F | CPU_K10_PLUS)
+#define CPU_K8_PLUS (CPU_AMD_K8 | CPU_K0F_PLUS)
+#define CPU_K7_PLUS (CPU_AMD_K7 | CPU_K8_PLUS)
+
+/* Select all supported AMD CPUs */
+#define CPU_AMD_ALL (CPU_AMD_K6 | CPU_K7_PLUS)
+
+/* Select all supported CPUs */
+#define CPU_ALL (CPU_INTEL_ALL | CPU_AMD_ALL)
+
+#define MAX_CPU_FILES 512
+
+struct cpu_private {
+ unsigned cpu;
+ unsigned type;
+ unsigned reg;
+ unsigned file;
+};
+
+struct cpu_debug_base {
+ char *name; /* Register name */
+ unsigned flag; /* Register flag */
+ unsigned write; /* Register write flag */
+};
+
+/*
+ * Currently it looks similar to cpu_debug_base but once we add more files
+ * cpu_file_base will go in different direction
+ */
+struct cpu_file_base {
+ char *name; /* Register file name */
+ unsigned flag; /* Register file flag */
+ unsigned write; /* Register write flag */
+};
+
+struct cpu_cpuX_base {
+ struct dentry *dentry; /* Register dentry */
+ int init; /* Register index file */
+};
+
+struct cpu_debug_range {
+ unsigned min; /* Register range min */
+ unsigned max; /* Register range max */
+ unsigned flag; /* Supported flags */
+ unsigned model; /* Supported models */
+};
+
+#endif /* _ASM_X86_CPU_DEBUG_H */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index dc27705f544..5623c50d67b 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -91,7 +91,6 @@ static inline int desc_empty(const void *ptr)
#define store_gdt(dtr) native_store_gdt(dtr)
#define store_idt(dtr) native_store_idt(dtr)
#define store_tr(tr) (tr = native_store_tr())
-#define store_ldt(ldt) asm("sldt %0":"=m" (ldt))
#define load_TLS(t, cpu) native_load_tls(t, cpu)
#define set_ldt native_set_ldt
@@ -112,6 +111,8 @@ static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries)
}
#endif /* CONFIG_PARAVIRT */
+#define store_ldt(ldt) asm("sldt %0" : "=m"(ldt))
+
static inline void native_write_idt_entry(gate_desc *idt, int entry,
const gate_desc *gate)
{
diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h
index bc68212c6bc..fd8f9e2ca35 100644
--- a/arch/x86/include/asm/dmi.h
+++ b/arch/x86/include/asm/dmi.h
@@ -1,22 +1,15 @@
#ifndef _ASM_X86_DMI_H
#define _ASM_X86_DMI_H
-#include <asm/io.h>
-
-#define DMI_MAX_DATA 2048
+#include <linux/compiler.h>
+#include <linux/init.h>
-extern int dmi_alloc_index;
-extern char dmi_alloc_data[DMI_MAX_DATA];
+#include <asm/io.h>
+#include <asm/setup.h>
-/* This is so early that there is no good way to allocate dynamic memory.
- Allocate data in an BSS array. */
-static inline void *dmi_alloc(unsigned len)
+static __always_inline __init void *dmi_alloc(unsigned len)
{
- int idx = dmi_alloc_index;
- if ((dmi_alloc_index + len) > DMI_MAX_DATA)
- return NULL;
- dmi_alloc_index += len;
- return dmi_alloc_data + idx;
+ return extend_brk(len, sizeof(int));
}
/* Use early IO mappings for DMI because it's initialized early */
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 00d41ce4c84..7ecba4d8508 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -72,7 +72,7 @@ extern int e820_all_mapped(u64 start, u64 end, unsigned type);
extern void e820_add_region(u64 start, u64 size, int type);
extern void e820_print_map(char *who);
extern int
-sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, int *pnr_map);
+sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, u32 *pnr_map);
extern u64 e820_update_range(u64 start, u64 size, unsigned old_type,
unsigned new_type);
extern u64 e820_remove_range(u64 start, u64 size, unsigned old_type,
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 854d538ae85..c2e6bedaf25 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -33,6 +33,8 @@ BUILD_INTERRUPT3(invalidate_interrupt7,INVALIDATE_TLB_VECTOR_START+7,
smp_invalidate_interrupt)
#endif
+BUILD_INTERRUPT(generic_interrupt, GENERIC_INTERRUPT_VECTOR)
+
/*
* every pentium local APIC has two 'local interrupts', with a
* soft-definable vector attached to both interrupts, one of
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 176f058e715..039db6aa8e0 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -12,6 +12,7 @@ typedef struct {
unsigned int apic_timer_irqs; /* arch dependent */
unsigned int irq_spurious_count;
#endif
+ unsigned int generic_irqs; /* arch dependent */
#ifdef CONFIG_SMP
unsigned int irq_resched_count;
unsigned int irq_call_count;
diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h
index bf9276bea66..014c2b85ae4 100644
--- a/arch/x86/include/asm/highmem.h
+++ b/arch/x86/include/asm/highmem.h
@@ -63,6 +63,7 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot);
void *kmap_atomic(struct page *page, enum km_type type);
void kunmap_atomic(void *kvaddr, enum km_type type);
void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
+void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
struct page *kmap_atomic_to_page(void *ptr);
#ifndef CONFIG_PARAVIRT
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 370e1c83bb4..b762ea49bd7 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -27,6 +27,7 @@
/* Interrupt handlers registered during init_IRQ */
extern void apic_timer_interrupt(void);
+extern void generic_interrupt(void);
extern void error_interrupt(void);
extern void spurious_interrupt(void);
extern void thermal_interrupt(void);
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
new file mode 100644
index 00000000000..36fb1a6a510
--- /dev/null
+++ b/arch/x86/include/asm/init.h
@@ -0,0 +1,18 @@
+#ifndef _ASM_X86_INIT_32_H
+#define _ASM_X86_INIT_32_H
+
+#ifdef CONFIG_X86_32
+extern void __init early_ioremap_page_table_range_init(void);
+#endif
+
+extern unsigned long __init
+kernel_physical_mapping_init(unsigned long start,
+ unsigned long end,
+ unsigned long page_size_mask);
+
+
+extern unsigned long __initdata e820_table_start;
+extern unsigned long __meminitdata e820_table_end;
+extern unsigned long __meminitdata e820_table_top;
+
+#endif /* _ASM_X86_INIT_32_H */
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 59cb4a1317b..373cc2bbcad 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -162,7 +162,8 @@ extern int (*ioapic_renumber_irq)(int ioapic, int irq);
extern void ioapic_init_mappings(void);
#ifdef CONFIG_X86_64
-extern int save_mask_IO_APIC_setup(void);
+extern int save_IO_APIC_setup(void);
+extern void mask_IO_APIC_setup(void);
extern void restore_IO_APIC_setup(void);
extern void reinit_intr_remapped_IO_APIC(int);
#endif
@@ -172,7 +173,7 @@ extern void probe_nr_irqs_gsi(void);
extern int setup_ioapic_entry(int apic, int irq,
struct IO_APIC_route_entry *entry,
unsigned int destination, int trigger,
- int polarity, int vector);
+ int polarity, int vector, int pin);
extern void ioapic_write_entry(int apic, int pin,
struct IO_APIC_route_entry e);
#else /* !CONFIG_X86_IO_APIC */
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 107eb219669..f38481bcd45 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -36,6 +36,7 @@ static inline int irq_canonicalize(int irq)
extern void fixup_irqs(void);
#endif
+extern void (*generic_interrupt_extension)(void);
extern void init_IRQ(void);
extern void native_init_IRQ(void);
extern bool handle_irq(unsigned irq, struct pt_regs *regs);
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index 20e1fd588db..0396760fccb 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -1,8 +1,6 @@
#ifndef _ASM_X86_IRQ_REMAPPING_H
#define _ASM_X86_IRQ_REMAPPING_H
-extern int x2apic;
-
#define IRTE_DEST(dest) ((x2apic) ? dest : dest << 8)
#endif /* _ASM_X86_IRQ_REMAPPING_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 8a285f356f8..3cbd79bbb47 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -112,6 +112,11 @@
#define LOCAL_PERF_VECTOR 0xee
/*
+ * Generic system vector for platform specific use
+ */
+#define GENERIC_INTERRUPT_VECTOR 0xed
+
+/*
* First APIC vector available to drivers: (vectors 0x30-0xee) we
* start at 0x31(0x41) to spread out vectors evenly between priority
* levels. (0x80 is the syscall vector)
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 0ceb6d19ed3..317ff1703d0 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -9,13 +9,13 @@
# define PAGES_NR 4
#else
# define PA_CONTROL_PAGE 0
-# define PA_TABLE_PAGE 1
-# define PAGES_NR 2
+# define VA_CONTROL_PAGE 1
+# define PA_TABLE_PAGE 2
+# define PA_SWAP_PAGE 3
+# define PAGES_NR 4
#endif
-#ifdef CONFIG_X86_32
# define KEXEC_CONTROL_CODE_MAX_SIZE 2048
-#endif
#ifndef __ASSEMBLY__
@@ -136,10 +136,11 @@ relocate_kernel(unsigned long indirection_page,
unsigned int has_pae,
unsigned int preserve_context);
#else
-NORET_TYPE void
+unsigned long
relocate_kernel(unsigned long indirection_page,
unsigned long page_list,
- unsigned long start_address) ATTRIB_NORET;
+ unsigned long start_address,
+ unsigned int preserve_context);
#endif
#define ARCH_HAS_KIMAGE_ARCH
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
index 9320e2a8a26..12d55e773eb 100644
--- a/arch/x86/include/asm/linkage.h
+++ b/arch/x86/include/asm/linkage.h
@@ -1,14 +1,11 @@
#ifndef _ASM_X86_LINKAGE_H
#define _ASM_X86_LINKAGE_H
+#include <linux/stringify.h>
+
#undef notrace
#define notrace __attribute__((no_instrument_function))
-#ifdef CONFIG_X86_64
-#define __ALIGN .p2align 4,,15
-#define __ALIGN_STR ".p2align 4,,15"
-#endif
-
#ifdef CONFIG_X86_32
#define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0)))
/*
@@ -50,16 +47,20 @@
__asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \
"g" (arg4), "g" (arg5), "g" (arg6))
-#endif
+#endif /* CONFIG_X86_32 */
+
+#ifdef __ASSEMBLY__
#define GLOBAL(name) \
.globl name; \
name:
-#ifdef CONFIG_X86_ALIGNMENT_16
-#define __ALIGN .align 16,0x90
-#define __ALIGN_STR ".align 16,0x90"
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_ALIGNMENT_16)
+#define __ALIGN .p2align 4, 0x90
+#define __ALIGN_STR __stringify(__ALIGN)
#endif
+#endif /* __ASSEMBLY__ */
+
#endif /* _ASM_X86_LINKAGE_H */
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 32c6e17b960..563933e06a3 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -11,6 +11,8 @@
*/
#define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */
+#define MCG_EXT_P (1ULL<<9) /* Extended registers available */
+#define MCG_CMCI_P (1ULL<<10) /* CMCI supported */
#define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */
#define MCG_STATUS_EIPV (1UL<<1) /* ip points to correct instruction */
@@ -90,14 +92,29 @@ extern int mce_disabled;
#include <asm/atomic.h>
+void mce_setup(struct mce *m);
void mce_log(struct mce *m);
DECLARE_PER_CPU(struct sys_device, device_mce);
extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
+/*
+ * To support more than 128 would need to escape the predefined
+ * Linux defined extended banks first.
+ */
+#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1)
+
#ifdef CONFIG_X86_MCE_INTEL
void mce_intel_feature_init(struct cpuinfo_x86 *c);
+void cmci_clear(void);
+void cmci_reenable(void);
+void cmci_rediscover(int dying);
+void cmci_recheck(void);
#else
static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { }
+static inline void cmci_clear(void) {}
+static inline void cmci_reenable(void) {}
+static inline void cmci_rediscover(int dying) {}
+static inline void cmci_recheck(void) {}
#endif
#ifdef CONFIG_X86_MCE_AMD
@@ -106,11 +123,23 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c);
static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
#endif
-void mce_log_therm_throt_event(unsigned int cpu, __u64 status);
+extern int mce_available(struct cpuinfo_x86 *c);
+
+void mce_log_therm_throt_event(__u64 status);
extern atomic_t mce_entry;
extern void do_machine_check(struct pt_regs *, long);
+
+typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS);
+DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);
+
+enum mcp_flags {
+ MCP_TIMESTAMP = (1 << 0), /* log time stamp */
+ MCP_UC = (1 << 1), /* log uncorrected errors */
+};
+extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
+
extern int mce_notify_user(void);
#endif /* !CONFIG_X86_32 */
@@ -120,8 +149,8 @@ extern void mcheck_init(struct cpuinfo_x86 *c);
#else
#define mcheck_init(c) do { } while (0)
#endif
-extern void stop_mce(void);
-extern void restart_mce(void);
+
+extern void (*mce_threshold_vector)(void);
#endif /* __KERNEL__ */
#endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/include/asm/msidef.h b/arch/x86/include/asm/msidef.h
index 6706b3006f1..4cc48af23fe 100644
--- a/arch/x86/include/asm/msidef.h
+++ b/arch/x86/include/asm/msidef.h
@@ -47,6 +47,7 @@
#define MSI_ADDR_DEST_ID_MASK 0x00ffff0
#define MSI_ADDR_DEST_ID(dest) (((dest) << MSI_ADDR_DEST_ID_SHIFT) & \
MSI_ADDR_DEST_ID_MASK)
+#define MSI_ADDR_EXT_DEST_ID(dest) ((dest) & 0xffffff00)
#define MSI_ADDR_IR_EXT_INT (1 << 4)
#define MSI_ADDR_IR_SHV (1 << 3)
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index f4e505f286b..ec41fc16c16 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -81,6 +81,11 @@
#define MSR_IA32_MC0_ADDR 0x00000402
#define MSR_IA32_MC0_MISC 0x00000403
+/* These are consecutive and not in the normal 4er MCE bank block */
+#define MSR_IA32_MC0_CTL2 0x00000280
+#define CMCI_EN (1ULL << 30)
+#define CMCI_THRESHOLD_MASK 0xffffULL
+
#define MSR_P6_PERFCTR0 0x000000c1
#define MSR_P6_PERFCTR1 0x000000c2
#define MSR_P6_EVNTSEL0 0x00000186
diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h
index f1e4a79a6e4..0f915ae649a 100644
--- a/arch/x86/include/asm/page_32_types.h
+++ b/arch/x86/include/asm/page_32_types.h
@@ -39,6 +39,11 @@
#define __VIRTUAL_MASK_SHIFT 32
#endif /* CONFIG_X86_PAE */
+/*
+ * Kernel image size is limited to 512 MB (see in arch/x86/kernel/head_32.S)
+ */
+#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
+
#ifndef __ASSEMBLY__
/*
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 2d625da6603..826ad37006a 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -40,14 +40,8 @@
#ifndef __ASSEMBLY__
-struct pgprot;
-
extern int page_is_ram(unsigned long pagenr);
extern int devmem_is_allowed(unsigned long pagenr);
-extern void map_devmem(unsigned long pfn, unsigned long size,
- struct pgprot vma_prot);
-extern void unmap_devmem(unsigned long pfn, unsigned long size,
- struct pgprot vma_prot);
extern unsigned long max_low_pfn_mapped;
extern unsigned long max_pfn_mapped;
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 0617d5cc971..7727aa8b7dd 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -317,8 +317,6 @@ struct pv_mmu_ops {
#if PAGETABLE_LEVELS >= 3
#ifdef CONFIG_X86_PAE
void (*set_pte_atomic)(pte_t *ptep, pte_t pteval);
- void (*set_pte_present)(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pte);
void (*pte_clear)(struct mm_struct *mm, unsigned long addr,
pte_t *ptep);
void (*pmd_clear)(pmd_t *pmdp);
@@ -389,7 +387,7 @@ extern struct pv_lock_ops pv_lock_ops;
#define paravirt_type(op) \
[paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \
- [paravirt_opptr] "m" (op)
+ [paravirt_opptr] "i" (&(op))
#define paravirt_clobber(clobber) \
[paravirt_clobber] "i" (clobber)
@@ -443,7 +441,7 @@ int paravirt_disable_iospace(void);
* offset into the paravirt_patch_template structure, and can therefore be
* freely converted back into a structure offset.
*/
-#define PARAVIRT_CALL "call *%[paravirt_opptr];"
+#define PARAVIRT_CALL "call *%c[paravirt_opptr];"
/*
* These macros are intended to wrap calls through one of the paravirt
@@ -1365,13 +1363,6 @@ static inline void set_pte_atomic(pte_t *ptep, pte_t pte)
pte.pte, pte.pte >> 32);
}
-static inline void set_pte_present(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pte)
-{
- /* 5 arg words */
- pv_mmu_ops.set_pte_present(mm, addr, ptep, pte);
-}
-
static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
@@ -1388,12 +1379,6 @@ static inline void set_pte_atomic(pte_t *ptep, pte_t pte)
set_pte(ptep, pte);
}
-static inline void set_pte_present(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pte)
-{
- set_pte(ptep, pte);
-}
-
static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h
index b0e70056838..2cd07b9422f 100644
--- a/arch/x86/include/asm/pat.h
+++ b/arch/x86/include/asm/pat.h
@@ -2,6 +2,7 @@
#define _ASM_X86_PAT_H
#include <linux/types.h>
+#include <asm/pgtable_types.h>
#ifdef CONFIG_X86_PAT
extern int pat_enabled;
@@ -17,5 +18,9 @@ extern int free_memtype(u64 start, u64 end);
extern int kernel_map_sync_memtype(u64 base, unsigned long size,
unsigned long flag);
+extern void map_devmem(unsigned long pfn, unsigned long size,
+ struct pgprot vma_prot);
+extern void unmap_devmem(unsigned long pfn, unsigned long size,
+ struct pgprot vma_prot);
#endif /* _ASM_X86_PAT_H */
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index c1774ac9da7..2334982b339 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -26,13 +26,6 @@ static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
native_set_pte(ptep, pte);
}
-static inline void native_set_pte_present(struct mm_struct *mm,
- unsigned long addr,
- pte_t *ptep, pte_t pte)
-{
- native_set_pte(ptep, pte);
-}
-
static inline void native_pmd_clear(pmd_t *pmdp)
{
native_set_pmd(pmdp, __pmd(0));
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 3f13cdf6115..177b0165ea0 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -31,23 +31,6 @@ static inline void native_set_pte(pte_t *ptep, pte_t pte)
ptep->pte_low = pte.pte_low;
}
-/*
- * Since this is only called on user PTEs, and the page fault handler
- * must handle the already racy situation of simultaneous page faults,
- * we are justified in merely clearing the PTE present bit, followed
- * by a set. The ordering here is important.
- */
-static inline void native_set_pte_present(struct mm_struct *mm,
- unsigned long addr,
- pte_t *ptep, pte_t pte)
-{
- ptep->pte_low = 0;
- smp_wmb();
- ptep->pte_high = pte.pte_high;
- smp_wmb();
- ptep->pte_low = pte.pte_low;
-}
-
static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
{
set_64bit((unsigned long long *)(ptep), native_pte_val(pte));
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index d0812e155f1..29d96d168bc 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -31,8 +31,6 @@ extern struct list_head pgd_list;
#define set_pte(ptep, pte) native_set_pte(ptep, pte)
#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte)
-#define set_pte_present(mm, addr, ptep, pte) \
- native_set_pte_present(mm, addr, ptep, pte)
#define set_pte_atomic(ptep, pte) \
native_set_pte_atomic(ptep, pte)
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index 97612fc7632..31bd120cf2a 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -42,9 +42,6 @@ extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
*/
#undef TEST_ACCESS_OK
-/* The boot page tables (all created as a single array) */
-extern unsigned long pg0[];
-
#ifdef CONFIG_X86_PAE
# include <asm/pgtable-3level.h>
#else
diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h
index bd8df3b2fe0..2733fad45f9 100644
--- a/arch/x86/include/asm/pgtable_32_types.h
+++ b/arch/x86/include/asm/pgtable_32_types.h
@@ -25,6 +25,11 @@
* area for the same reason. ;)
*/
#define VMALLOC_OFFSET (8 * 1024 * 1024)
+
+#ifndef __ASSEMBLER__
+extern bool __vmalloc_start_set; /* set once high_memory is set */
+#endif
+
#define VMALLOC_START ((unsigned long)high_memory + VMALLOC_OFFSET)
#ifdef CONFIG_X86_PAE
#define LAST_PKMAP 512
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 4d258ad76a0..b8238dc8786 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -273,6 +273,7 @@ typedef struct page *pgtable_t;
extern pteval_t __supported_pte_mask;
extern int nx_enabled;
+extern void set_nx(void);
#define pgprot_writecombine pgprot_writecombine
extern pgprot_t pgprot_writecombine(pgprot_t prot);
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 76139506c3e..ae85a8d66a3 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -75,9 +75,9 @@ struct cpuinfo_x86 {
#else
/* Number of 4K pages in DTLB/ITLB combined(in pages): */
int x86_tlbsize;
+#endif
__u8 x86_virt_bits;
__u8 x86_phys_bits;
-#endif
/* CPUID returned core id bits: */
__u8 x86_coreid_bits;
/* Max extended CPUID function supported: */
@@ -391,6 +391,9 @@ DECLARE_PER_CPU(union irq_stack_union, irq_stack_union);
DECLARE_INIT_PER_CPU(irq_stack_union);
DECLARE_PER_CPU(char *, irq_stack_ptr);
+DECLARE_PER_CPU(unsigned int, irq_count);
+extern unsigned long kernel_eflags;
+extern asmlinkage void ignore_sysret(void);
#else /* X86_64 */
#ifdef CONFIG_CC_STACKPROTECTOR
DECLARE_PER_CPU(unsigned long, stack_canary);
diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h
index 2b8c5160388..1b7ee5d673c 100644
--- a/arch/x86/include/asm/sections.h
+++ b/arch/x86/include/asm/sections.h
@@ -1 +1,8 @@
+#ifndef _ASM_X86_SECTIONS_H
+#define _ASM_X86_SECTIONS_H
+
#include <asm-generic/sections.h>
+
+extern char __brk_base[], __brk_limit[];
+
+#endif /* _ASM_X86_SECTIONS_H */
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 05c6f6b11fd..bdc2ada05ae 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -64,7 +64,7 @@ extern void x86_quirk_time_init(void);
#include <asm/bootparam.h>
/* Interrupt control for vSMPowered x86_64 systems */
-#ifdef CONFIG_X86_VSMP
+#ifdef CONFIG_X86_64
void vsmp_init(void);
#else
static inline void vsmp_init(void) { }
@@ -100,20 +100,51 @@ extern struct boot_params boot_params;
*/
#define LOWMEMSIZE() (0x9f000)
+/* exceedingly early brk-like allocator */
+extern unsigned long _brk_end;
+void *extend_brk(size_t size, size_t align);
+
+/*
+ * Reserve space in the brk section. The name must be unique within
+ * the file, and somewhat descriptive. The size is in bytes. Must be
+ * used at file scope.
+ *
+ * (This uses a temp function to wrap the asm so we can pass it the
+ * size parameter; otherwise we wouldn't be able to. We can't use a
+ * "section" attribute on a normal variable because it always ends up
+ * being @progbits, which ends up allocating space in the vmlinux
+ * executable.)
+ */
+#define RESERVE_BRK(name,sz) \
+ static void __section(.discard) __used \
+ __brk_reservation_fn_##name##__(void) { \
+ asm volatile ( \
+ ".pushsection .brk_reservation,\"aw\",@nobits;" \
+ ".brk." #name ":" \
+ " 1:.skip %c0;" \
+ " .size .brk." #name ", . - 1b;" \
+ " .popsection" \
+ : : "i" (sz)); \
+ }
+
#ifdef __i386__
void __init i386_start_kernel(void);
extern void probe_roms(void);
-extern unsigned long init_pg_tables_start;
-extern unsigned long init_pg_tables_end;
-
#else
void __init x86_64_start_kernel(char *real_mode);
void __init x86_64_start_reservations(char *real_mode_data);
#endif /* __i386__ */
#endif /* _SETUP */
+#else
+#define RESERVE_BRK(name,sz) \
+ .pushsection .brk_reservation,"aw",@nobits; \
+.brk.name: \
+1: .skip sz; \
+ .size .brk.name,.-1b; \
+ .popsection
#endif /* __ASSEMBLY__ */
#endif /* __KERNEL__ */
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index 777327ef05c..9f4dfba33b2 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -199,6 +199,10 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
#define SCIR_CPU_ACTIVITY 0x02 /* not idle */
#define SCIR_CPU_HB_INTERVAL (HZ) /* once per second */
+/* Loop through all installed blades */
+#define for_each_possible_blade(bid) \
+ for ((bid) = 0; (bid) < uv_num_possible_blades(); (bid)++)
+
/*
* Macros for converting between kernel virtual addresses, socket local physical
* addresses, and UV global physical addresses.
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 5e79ca69432..9c371e4a9fa 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -296,6 +296,8 @@ HYPERVISOR_get_debugreg(int reg)
static inline int
HYPERVISOR_update_descriptor(u64 ma, u64 desc)
{
+ if (sizeof(u64) == sizeof(long))
+ return _hypercall2(int, update_descriptor, ma, desc);
return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32);
}
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 95f216bbfaf..6e9c1f320ac 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -70,7 +70,6 @@ obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
-obj-$(CONFIG_X86_VSMP) += vsmp_64.o
obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_MODULES) += module_$(BITS).o
obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o
@@ -111,7 +110,7 @@ obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o # NB rename without _64
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
- obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o
+ obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o
obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
obj-$(CONFIG_AUDIT) += audit_64.o
@@ -120,4 +119,5 @@ ifeq ($(CONFIG_X86_64),y)
obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o
obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
+ obj-y += vsmp_64.o
endif
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 6907b8e85d5..4c80f155743 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -414,9 +414,17 @@ void __init alternative_instructions(void)
that might execute the to be patched code.
Other CPUs are not running. */
stop_nmi();
-#ifdef CONFIG_X86_MCE
- stop_mce();
-#endif
+
+ /*
+ * Don't stop machine check exceptions while patching.
+ * MCEs only happen when something got corrupted and in this
+ * case we must do something about the corruption.
+ * Ignoring it is worse than a unlikely patching race.
+ * Also machine checks tend to be broadcast and if one CPU
+ * goes into machine check the others follow quickly, so we don't
+ * expect a machine check to cause undue problems during to code
+ * patching.
+ */
apply_alternatives(__alt_instructions, __alt_instructions_end);
@@ -456,9 +464,6 @@ void __init alternative_instructions(void)
(unsigned long)__smp_locks_end);
restart_nmi();
-#ifdef CONFIG_X86_MCE
- restart_mce();
-#endif
}
/**
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index f9cecdfd05c..85eb8e10081 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -46,6 +46,7 @@
#include <asm/idle.h>
#include <asm/mtrr.h>
#include <asm/smp.h>
+#include <asm/mce.h>
unsigned int num_processors;
@@ -808,7 +809,7 @@ void clear_local_APIC(void)
u32 v;
/* APIC hasn't been mapped yet */
- if (!apic_phys)
+ if (!x2apic && !apic_phys)
return;
maxlvt = lapic_get_maxlvt();
@@ -842,6 +843,14 @@ void clear_local_APIC(void)
apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
}
#endif
+#ifdef CONFIG_X86_MCE_INTEL
+ if (maxlvt >= 6) {
+ v = apic_read(APIC_LVTCMCI);
+ if (!(v & APIC_LVT_MASKED))
+ apic_write(APIC_LVTCMCI, v | APIC_LVT_MASKED);
+ }
+#endif
+
/*
* Clean APIC state for other OSs:
*/
@@ -1241,6 +1250,12 @@ void __cpuinit setup_local_APIC(void)
apic_write(APIC_LVT1, value);
preempt_enable();
+
+#ifdef CONFIG_X86_MCE_INTEL
+ /* Recheck CMCI information after local APIC is up on CPU #0 */
+ if (smp_processor_id() == 0)
+ cmci_recheck();
+#endif
}
void __cpuinit end_local_APIC_setup(void)
@@ -1319,15 +1334,16 @@ void __init enable_IR_x2apic(void)
return;
}
- local_irq_save(flags);
- mask_8259A();
-
- ret = save_mask_IO_APIC_setup();
+ ret = save_IO_APIC_setup();
if (ret) {
pr_info("Saving IO-APIC state failed: %d\n", ret);
goto end;
}
+ local_irq_save(flags);
+ mask_IO_APIC_setup();
+ mask_8259A();
+
ret = enable_intr_remapping(1);
if (ret && x2apic_preenabled) {
@@ -1352,10 +1368,10 @@ end_restore:
else
reinit_intr_remapped_IO_APIC(x2apic_preenabled);
-end:
unmask_8259A();
local_irq_restore(flags);
+end:
if (!ret) {
if (!x2apic_preenabled)
pr_info("Enabled x2apic and interrupt-remapping\n");
@@ -1508,12 +1524,10 @@ void __init early_init_lapic_mapping(void)
*/
void __init init_apic_mappings(void)
{
-#ifdef CONFIG_X86_X2APIC
if (x2apic) {
boot_cpu_physical_apicid = read_apic_id();
return;
}
-#endif
/*
* If no local APIC can be found then set up a fake all
@@ -1957,12 +1971,9 @@ static int lapic_resume(struct sys_device *dev)
local_irq_save(flags);
-#ifdef CONFIG_X86_X2APIC
if (x2apic)
enable_x2apic();
- else
-#endif
- {
+ else {
/*
* Make sure the APICBASE points to the right address
*
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index f933822dba1..0014714ea97 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -159,20 +159,6 @@ static int flat_apic_id_registered(void)
return physid_isset(read_xapic_id(), phys_cpu_present_map);
}
-static unsigned int flat_cpu_mask_to_apicid(const struct cpumask *cpumask)
-{
- return cpumask_bits(cpumask)[0] & APIC_ALL_CPUS;
-}
-
-static unsigned int flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
- const struct cpumask *andmask)
-{
- unsigned long mask1 = cpumask_bits(cpumask)[0] & APIC_ALL_CPUS;
- unsigned long mask2 = cpumask_bits(andmask)[0] & APIC_ALL_CPUS;
-
- return mask1 & mask2;
-}
-
static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
{
return hard_smp_processor_id() >> index_msb;
@@ -213,8 +199,8 @@ struct apic apic_flat = {
.set_apic_id = set_apic_id,
.apic_id_mask = 0xFFu << 24,
- .cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
- .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
+ .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
.send_IPI_mask = flat_send_IPI_mask,
.send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself,
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 00e6071cefc..da99ffcdfde 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -389,6 +389,8 @@ struct io_apic {
unsigned int index;
unsigned int unused[3];
unsigned int data;
+ unsigned int unused2[11];
+ unsigned int eoi;
};
static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
@@ -397,6 +399,12 @@ static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
+ (mp_ioapics[idx].apicaddr & ~PAGE_MASK);
}
+static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
+{
+ struct io_apic __iomem *io_apic = io_apic_base(apic);
+ writel(vector, &io_apic->eoi);
+}
+
static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
{
struct io_apic __iomem *io_apic = io_apic_base(apic);
@@ -546,16 +554,12 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
apic = entry->apic;
pin = entry->pin;
-#ifdef CONFIG_INTR_REMAP
/*
* With interrupt-remapping, destination information comes
* from interrupt-remapping table entry.
*/
if (!irq_remapped(irq))
io_apic_write(apic, 0x11 + pin*2, dest);
-#else
- io_apic_write(apic, 0x11 + pin*2, dest);
-#endif
reg = io_apic_read(apic, 0x10 + pin*2);
reg &= ~IO_APIC_REDIR_VECTOR_MASK;
reg |= vector;
@@ -588,10 +592,12 @@ set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
if (assign_irq_vector(irq, cfg, mask))
return BAD_APICID;
- cpumask_and(desc->affinity, cfg->domain, mask);
+ /* check that before desc->addinity get updated */
set_extra_move_desc(desc, mask);
- return apic->cpu_mask_to_apicid_and(desc->affinity, cpu_online_mask);
+ cpumask_copy(desc->affinity, mask);
+
+ return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
}
static void
@@ -849,9 +855,9 @@ __setup("pirq=", ioapic_pirq_setup);
static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS];
/*
- * Saves and masks all the unmasked IO-APIC RTE's
+ * Saves all the IO-APIC RTE's
*/
-int save_mask_IO_APIC_setup(void)
+int save_IO_APIC_setup(void)
{
union IO_APIC_reg_01 reg_01;
unsigned long flags;
@@ -876,16 +882,9 @@ int save_mask_IO_APIC_setup(void)
}
for (apic = 0; apic < nr_ioapics; apic++)
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
- struct IO_APIC_route_entry entry;
-
- entry = early_ioapic_entries[apic][pin] =
+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
+ early_ioapic_entries[apic][pin] =
ioapic_read_entry(apic, pin);
- if (!entry.mask) {
- entry.mask = 1;
- ioapic_write_entry(apic, pin, entry);
- }
- }
return 0;
@@ -898,6 +897,25 @@ nomem:
return -ENOMEM;
}
+void mask_IO_APIC_setup(void)
+{
+ int apic, pin;
+
+ for (apic = 0; apic < nr_ioapics; apic++) {
+ if (!early_ioapic_entries[apic])
+ break;
+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+ struct IO_APIC_route_entry entry;
+
+ entry = early_ioapic_entries[apic][pin];
+ if (!entry.mask) {
+ entry.mask = 1;
+ ioapic_write_entry(apic, pin, entry);
+ }
+ }
+ }
+}
+
void restore_IO_APIC_setup(void)
{
int apic, pin;
@@ -1411,9 +1429,7 @@ void __setup_vector_irq(int cpu)
}
static struct irq_chip ioapic_chip;
-#ifdef CONFIG_INTR_REMAP
static struct irq_chip ir_ioapic_chip;
-#endif
#define IOAPIC_AUTO -1
#define IOAPIC_EDGE 0
@@ -1452,7 +1468,6 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t
else
desc->status &= ~IRQ_LEVEL;
-#ifdef CONFIG_INTR_REMAP
if (irq_remapped(irq)) {
desc->status |= IRQ_MOVE_PCNTXT;
if (trigger)
@@ -1464,7 +1479,7 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t
handle_edge_irq, "edge");
return;
}
-#endif
+
if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
trigger == IOAPIC_LEVEL)
set_irq_chip_and_handler_name(irq, &ioapic_chip,
@@ -1478,14 +1493,13 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t
int setup_ioapic_entry(int apic_id, int irq,
struct IO_APIC_route_entry *entry,
unsigned int destination, int trigger,
- int polarity, int vector)
+ int polarity, int vector, int pin)
{
/*
* add it to the IO-APIC irq-routing table:
*/
memset(entry,0,sizeof(*entry));
-#ifdef CONFIG_INTR_REMAP
if (intr_remapping_enabled) {
struct intel_iommu *iommu = map_ioapic_to_ir(apic_id);
struct irte irte;
@@ -1504,7 +1518,14 @@ int setup_ioapic_entry(int apic_id, int irq,
irte.present = 1;
irte.dst_mode = apic->irq_dest_mode;
- irte.trigger_mode = trigger;
+ /*
+ * Trigger mode in the IRTE will always be edge, and the
+ * actual level or edge trigger will be setup in the IO-APIC
+ * RTE. This will help simplify level triggered irq migration.
+ * For more details, see the comments above explainig IO-APIC
+ * irq migration in the presence of interrupt-remapping.
+ */
+ irte.trigger_mode = 0;
irte.dlvry_mode = apic->irq_delivery_mode;
irte.vector = vector;
irte.dest_id = IRTE_DEST(destination);
@@ -1515,18 +1536,21 @@ int setup_ioapic_entry(int apic_id, int irq,
ir_entry->zero = 0;
ir_entry->format = 1;
ir_entry->index = (index & 0x7fff);
- } else
-#endif
- {
+ /*
+ * IO-APIC RTE will be configured with virtual vector.
+ * irq handler will do the explicit EOI to the io-apic.
+ */
+ ir_entry->vector = pin;
+ } else {
entry->delivery_mode = apic->irq_delivery_mode;
entry->dest_mode = apic->irq_dest_mode;
entry->dest = destination;
+ entry->vector = vector;
}
entry->mask = 0; /* enable IRQ */
entry->trigger = trigger;
entry->polarity = polarity;
- entry->vector = vector;
/* Mask level triggered irqs.
* Use IRQ_DELAYED_DISABLE for edge triggered irqs.
@@ -1561,7 +1585,7 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
if (setup_ioapic_entry(mp_ioapics[apic_id].apicid, irq, &entry,
- dest, trigger, polarity, cfg->vector)) {
+ dest, trigger, polarity, cfg->vector, pin)) {
printk("Failed to setup ioapic entry for ioapic %d, pin %d\n",
mp_ioapics[apic_id].apicid, pin);
__clear_irq_vector(irq, cfg);
@@ -1642,10 +1666,8 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin,
{
struct IO_APIC_route_entry entry;
-#ifdef CONFIG_INTR_REMAP
if (intr_remapping_enabled)
return;
-#endif
memset(&entry, 0, sizeof(entry));
@@ -2040,8 +2062,13 @@ void disable_IO_APIC(void)
* If the i8259 is routed through an IOAPIC
* Put that IOAPIC in virtual wire mode
* so legacy interrupts can be delivered.
+ *
+ * With interrupt-remapping, for now we will use virtual wire A mode,
+ * as virtual wire B is little complex (need to configure both
+ * IOAPIC RTE aswell as interrupt-remapping table entry).
+ * As this gets called during crash dump, keep this simple for now.
*/
- if (ioapic_i8259.pin != -1) {
+ if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) {
struct IO_APIC_route_entry entry;
memset(&entry, 0, sizeof(entry));
@@ -2061,7 +2088,10 @@ void disable_IO_APIC(void)
ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
}
- disconnect_bsp_APIC(ioapic_i8259.pin != -1);
+ /*
+ * Use virtual wire A mode when interrupt remapping is enabled.
+ */
+ disconnect_bsp_APIC(!intr_remapping_enabled && ioapic_i8259.pin != -1);
}
#ifdef CONFIG_X86_32
@@ -2303,37 +2333,24 @@ static int ioapic_retrigger_irq(unsigned int irq)
#ifdef CONFIG_SMP
#ifdef CONFIG_INTR_REMAP
-static void ir_irq_migration(struct work_struct *work);
-
-static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration);
/*
* Migrate the IO-APIC irq in the presence of intr-remapping.
*
- * For edge triggered, irq migration is a simple atomic update(of vector
- * and cpu destination) of IRTE and flush the hardware cache.
- *
- * For level triggered, we need to modify the io-apic RTE aswell with the update
- * vector information, along with modifying IRTE with vector and destination.
- * So irq migration for level triggered is little bit more complex compared to
- * edge triggered migration. But the good news is, we use the same algorithm
- * for level triggered migration as we have today, only difference being,
- * we now initiate the irq migration from process context instead of the
- * interrupt context.
+ * For both level and edge triggered, irq migration is a simple atomic
+ * update(of vector and cpu destination) of IRTE and flush the hardware cache.
*
- * In future, when we do a directed EOI (combined with cpu EOI broadcast
- * suppression) to the IO-APIC, level triggered irq migration will also be
- * as simple as edge triggered migration and we can do the irq migration
- * with a simple atomic update to IO-APIC RTE.
+ * For level triggered, we eliminate the io-apic RTE modification (with the
+ * updated vector information), by using a virtual vector (io-apic pin number).
+ * Real vector that is used for interrupting cpu will be coming from
+ * the interrupt-remapping table entry.
*/
static void
migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
{
struct irq_cfg *cfg;
struct irte irte;
- int modify_ioapic_rte;
unsigned int dest;
- unsigned long flags;
unsigned int irq;
if (!cpumask_intersects(mask, cpu_online_mask))
@@ -2351,13 +2368,6 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
- modify_ioapic_rte = desc->status & IRQ_LEVEL;
- if (modify_ioapic_rte) {
- spin_lock_irqsave(&ioapic_lock, flags);
- __target_IO_APIC_irq(irq, dest, cfg);
- spin_unlock_irqrestore(&ioapic_lock, flags);
- }
-
irte.vector = cfg->vector;
irte.dest_id = IRTE_DEST(dest);
@@ -2372,73 +2382,12 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
cpumask_copy(desc->affinity, mask);
}
-static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
-{
- int ret = -1;
- struct irq_cfg *cfg = desc->chip_data;
-
- mask_IO_APIC_irq_desc(desc);
-
- if (io_apic_level_ack_pending(cfg)) {
- /*
- * Interrupt in progress. Migrating irq now will change the
- * vector information in the IO-APIC RTE and that will confuse
- * the EOI broadcast performed by cpu.
- * So, delay the irq migration to the next instance.
- */
- schedule_delayed_work(&ir_migration_work, 1);
- goto unmask;
- }
-
- /* everthing is clear. we have right of way */
- migrate_ioapic_irq_desc(desc, desc->pending_mask);
-
- ret = 0;
- desc->status &= ~IRQ_MOVE_PENDING;
- cpumask_clear(desc->pending_mask);
-
-unmask:
- unmask_IO_APIC_irq_desc(desc);
-
- return ret;
-}
-
-static void ir_irq_migration(struct work_struct *work)
-{
- unsigned int irq;
- struct irq_desc *desc;
-
- for_each_irq_desc(irq, desc) {
- if (desc->status & IRQ_MOVE_PENDING) {
- unsigned long flags;
-
- spin_lock_irqsave(&desc->lock, flags);
- if (!desc->chip->set_affinity ||
- !(desc->status & IRQ_MOVE_PENDING)) {
- desc->status &= ~IRQ_MOVE_PENDING;
- spin_unlock_irqrestore(&desc->lock, flags);
- continue;
- }
-
- desc->chip->set_affinity(irq, desc->pending_mask);
- spin_unlock_irqrestore(&desc->lock, flags);
- }
- }
-}
-
/*
* Migrates the IRQ destination in the process context.
*/
static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
const struct cpumask *mask)
{
- if (desc->status & IRQ_LEVEL) {
- desc->status |= IRQ_MOVE_PENDING;
- cpumask_copy(desc->pending_mask, mask);
- migrate_irq_remapped_level_desc(desc);
- return;
- }
-
migrate_ioapic_irq_desc(desc, mask);
}
static void set_ir_ioapic_affinity_irq(unsigned int irq,
@@ -2448,6 +2397,11 @@ static void set_ir_ioapic_affinity_irq(unsigned int irq,
set_ir_ioapic_affinity_irq_desc(desc, mask);
}
+#else
+static inline void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
+ const struct cpumask *mask)
+{
+}
#endif
asmlinkage void smp_irq_move_cleanup_interrupt(void)
@@ -2461,6 +2415,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
me = smp_processor_id();
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
unsigned int irq;
+ unsigned int irr;
struct irq_desc *desc;
struct irq_cfg *cfg;
irq = __get_cpu_var(vector_irq)[vector];
@@ -2480,6 +2435,18 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
goto unlock;
+ irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
+ /*
+ * Check if the vector that needs to be cleanedup is
+ * registered at the cpu's IRR. If so, then this is not
+ * the best time to clean it up. Lets clean it up in the
+ * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR
+ * to myself.
+ */
+ if (irr & (1 << (vector % 32))) {
+ apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
+ goto unlock;
+ }
__get_cpu_var(vector_irq)[vector] = -1;
cfg->move_cleanup_count--;
unlock:
@@ -2529,9 +2496,44 @@ static inline void irq_complete_move(struct irq_desc **descp) {}
#endif
#ifdef CONFIG_INTR_REMAP
+static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
+{
+ int apic, pin;
+ struct irq_pin_list *entry;
+
+ entry = cfg->irq_2_pin;
+ for (;;) {
+
+ if (!entry)
+ break;
+
+ apic = entry->apic;
+ pin = entry->pin;
+ io_apic_eoi(apic, pin);
+ entry = entry->next;
+ }
+}
+
+static void
+eoi_ioapic_irq(struct irq_desc *desc)
+{
+ struct irq_cfg *cfg;
+ unsigned long flags;
+ unsigned int irq;
+
+ irq = desc->irq;
+ cfg = desc->chip_data;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ __eoi_ioapic_irq(irq, cfg);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
static void ack_x2apic_level(unsigned int irq)
{
+ struct irq_desc *desc = irq_to_desc(irq);
ack_x2APIC_irq();
+ eoi_ioapic_irq(desc);
}
static void ack_x2apic_edge(unsigned int irq)
@@ -2662,20 +2664,20 @@ static struct irq_chip ioapic_chip __read_mostly = {
.retrigger = ioapic_retrigger_irq,
};
-#ifdef CONFIG_INTR_REMAP
static struct irq_chip ir_ioapic_chip __read_mostly = {
.name = "IR-IO-APIC",
.startup = startup_ioapic_irq,
.mask = mask_IO_APIC_irq,
.unmask = unmask_IO_APIC_irq,
+#ifdef CONFIG_INTR_REMAP
.ack = ack_x2apic_edge,
.eoi = ack_x2apic_level,
#ifdef CONFIG_SMP
.set_affinity = set_ir_ioapic_affinity_irq,
#endif
+#endif
.retrigger = ioapic_retrigger_irq,
};
-#endif
static inline void init_IO_APIC_traps(void)
{
@@ -2901,10 +2903,8 @@ static inline void __init check_timer(void)
* 8259A.
*/
if (pin1 == -1) {
-#ifdef CONFIG_INTR_REMAP
if (intr_remapping_enabled)
panic("BIOS bug: timer not connected to IO-APIC");
-#endif
pin1 = pin2;
apic1 = apic2;
no_pin1 = 1;
@@ -2940,10 +2940,8 @@ static inline void __init check_timer(void)
clear_IO_APIC_pin(0, pin1);
goto out;
}
-#ifdef CONFIG_INTR_REMAP
if (intr_remapping_enabled)
panic("timer doesn't work through Interrupt-remapped IO-APIC");
-#endif
local_irq_disable();
clear_IO_APIC_pin(apic1, pin1);
if (!no_pin1)
@@ -3237,9 +3235,7 @@ void destroy_irq(unsigned int irq)
if (desc)
desc->chip_data = cfg;
-#ifdef CONFIG_INTR_REMAP
free_irte(irq);
-#endif
spin_lock_irqsave(&vector_lock, flags);
__clear_irq_vector(irq, cfg);
spin_unlock_irqrestore(&vector_lock, flags);
@@ -3265,7 +3261,6 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
-#ifdef CONFIG_INTR_REMAP
if (irq_remapped(irq)) {
struct irte irte;
int ir_index;
@@ -3291,10 +3286,13 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
MSI_ADDR_IR_SHV |
MSI_ADDR_IR_INDEX1(ir_index) |
MSI_ADDR_IR_INDEX2(ir_index);
- } else
-#endif
- {
- msg->address_hi = MSI_ADDR_BASE_HI;
+ } else {
+ if (x2apic_enabled())
+ msg->address_hi = MSI_ADDR_BASE_HI |
+ MSI_ADDR_EXT_DEST_ID(dest);
+ else
+ msg->address_hi = MSI_ADDR_BASE_HI;
+
msg->address_lo =
MSI_ADDR_BASE_LO |
((apic->irq_dest_mode == 0) ?
@@ -3394,15 +3392,16 @@ static struct irq_chip msi_chip = {
.retrigger = ioapic_retrigger_irq,
};
-#ifdef CONFIG_INTR_REMAP
static struct irq_chip msi_ir_chip = {
.name = "IR-PCI-MSI",
.unmask = unmask_msi_irq,
.mask = mask_msi_irq,
+#ifdef CONFIG_INTR_REMAP
.ack = ack_x2apic_edge,
#ifdef CONFIG_SMP
.set_affinity = ir_set_msi_irq_affinity,
#endif
+#endif
.retrigger = ioapic_retrigger_irq,
};
@@ -3432,7 +3431,6 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
}
return index;
}
-#endif
static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
{
@@ -3446,7 +3444,6 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
set_irq_msi(irq, msidesc);
write_msi_msg(irq, &msg);
-#ifdef CONFIG_INTR_REMAP
if (irq_remapped(irq)) {
struct irq_desc *desc = irq_to_desc(irq);
/*
@@ -3455,7 +3452,6 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
desc->status |= IRQ_MOVE_PCNTXT;
set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge");
} else
-#endif
set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
@@ -3469,11 +3465,8 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
int ret, sub_handle;
struct msi_desc *msidesc;
unsigned int irq_want;
-
-#ifdef CONFIG_INTR_REMAP
- struct intel_iommu *iommu = 0;
+ struct intel_iommu *iommu = NULL;
int index = 0;
-#endif
irq_want = nr_irqs_gsi;
sub_handle = 0;
@@ -3482,7 +3475,6 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
if (irq == 0)
return -1;
irq_want = irq + 1;
-#ifdef CONFIG_INTR_REMAP
if (!intr_remapping_enabled)
goto no_ir;
@@ -3510,7 +3502,6 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
set_irte_irq(irq, iommu, index, sub_handle);
}
no_ir:
-#endif
ret = setup_msi_irq(dev, msidesc, irq);
if (ret < 0)
goto error;
@@ -3528,7 +3519,7 @@ void arch_teardown_msi_irq(unsigned int irq)
destroy_irq(irq);
}
-#ifdef CONFIG_DMAR
+#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)
#ifdef CONFIG_SMP
static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
{
@@ -3609,7 +3600,7 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
#endif /* CONFIG_SMP */
-struct irq_chip hpet_msi_type = {
+static struct irq_chip hpet_msi_type = {
.name = "HPET_MSI",
.unmask = hpet_msi_unmask,
.mask = hpet_msi_mask,
@@ -4045,11 +4036,9 @@ void __init setup_ioapic_dest(void)
else
mask = apic->target_cpus();
-#ifdef CONFIG_INTR_REMAP
if (intr_remapping_enabled)
set_ir_ioapic_affinity_irq_desc(desc, mask);
else
-#endif
set_ioapic_affinity_irq_desc(desc, mask);
}
@@ -4142,9 +4131,12 @@ static int __init ioapic_insert_resources(void)
struct resource *r = ioapic_resources;
if (!r) {
- printk(KERN_ERR
- "IO APIC resources could be not be allocated.\n");
- return -1;
+ if (nr_ioapics > 0) {
+ printk(KERN_ERR
+ "IO APIC resources couldn't be allocated.\n");
+ return -1;
+ }
+ return 0;
}
for (i = 0; i < nr_ioapics; i++) {
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index 8d7748efe6a..1783652bb0e 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -68,6 +68,13 @@ void __init default_setup_apic_routing(void)
apic = &apic_physflat;
printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
}
+
+ /*
+ * Now that apic routing model is selected, configure the
+ * fault handling for intr remapping.
+ */
+ if (intr_remapping_enabled)
+ enable_drhd_fault_handling();
}
/* Same for both flat and physical. */
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 8fb87b6dd63..4a903e2f0d1 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -57,6 +57,8 @@ static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
unsigned long query_cpu;
unsigned long flags;
+ x2apic_wrmsr_fence();
+
local_irq_save(flags);
for_each_cpu(query_cpu, mask) {
__x2apic_send_IPI_dest(
@@ -73,6 +75,8 @@ static void
unsigned long query_cpu;
unsigned long flags;
+ x2apic_wrmsr_fence();
+
local_irq_save(flags);
for_each_cpu(query_cpu, mask) {
if (query_cpu == this_cpu)
@@ -90,6 +94,8 @@ static void x2apic_send_IPI_allbutself(int vector)
unsigned long query_cpu;
unsigned long flags;
+ x2apic_wrmsr_fence();
+
local_irq_save(flags);
for_each_online_cpu(query_cpu) {
if (query_cpu == this_cpu)
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 23625b9f98b..a284359627e 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -58,6 +58,8 @@ static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
unsigned long query_cpu;
unsigned long flags;
+ x2apic_wrmsr_fence();
+
local_irq_save(flags);
for_each_cpu(query_cpu, mask) {
__x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
@@ -73,6 +75,8 @@ static void
unsigned long query_cpu;
unsigned long flags;
+ x2apic_wrmsr_fence();
+
local_irq_save(flags);
for_each_cpu(query_cpu, mask) {
if (query_cpu != this_cpu)
@@ -89,6 +93,8 @@ static void x2apic_send_IPI_allbutself(int vector)
unsigned long query_cpu;
unsigned long flags;
+ x2apic_wrmsr_fence();
+
local_irq_save(flags);
for_each_online_cpu(query_cpu) {
if (query_cpu == this_cpu)
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index 2ac0ab71412..fc999e6fc46 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -83,15 +83,15 @@ void __init setup_bios_corruption_check(void)
u64 size;
addr = find_e820_area_size(addr, &size, PAGE_SIZE);
- if (addr == 0)
+ if (!(addr + 1))
+ break;
+
+ if (addr >= corruption_check_size)
break;
if ((addr + size) > corruption_check_size)
size = corruption_check_size - addr;
- if (size == 0)
- break;
-
e820_update_range(addr, size, E820_RAM, E820_RESERVED);
scan_areas[num_scan_areas].addr = addr;
scan_areas[num_scan_areas].size = size;
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 82db7f45e2d..4e242f9a06e 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -14,11 +14,12 @@ obj-y += vmware.o hypervisor.o
obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
obj-$(CONFIG_X86_64) += bugs_64.o
+obj-$(CONFIG_X86_CPU_DEBUG) += cpu_debug.o
+
obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
obj-$(CONFIG_CPU_SUP_AMD) += amd.o
obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
-obj-$(CONFIG_CPU_SUP_CENTAUR_32) += centaur.o
-obj-$(CONFIG_CPU_SUP_CENTAUR_64) += centaur_64.o
+obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 6882a735d9c..8220ae69849 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -29,7 +29,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
u32 regs[4];
const struct cpuid_bit *cb;
- static const struct cpuid_bit cpuid_bits[] = {
+ static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
{ X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
{ 0, 0, 0, 0 }
};
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 25423a5b80e..7e4a459daa6 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -5,6 +5,7 @@
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/apic.h>
+#include <asm/cpu.h>
#ifdef CONFIG_X86_64
# include <asm/numa_64.h>
@@ -141,6 +142,55 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
}
}
+static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+ /* calling is from identify_secondary_cpu() ? */
+ if (c->cpu_index == boot_cpu_id)
+ return;
+
+ /*
+ * Certain Athlons might work (for various values of 'work') in SMP
+ * but they are not certified as MP capable.
+ */
+ /* Athlon 660/661 is valid. */
+ if ((c->x86_model == 6) && ((c->x86_mask == 0) ||
+ (c->x86_mask == 1)))
+ goto valid_k7;
+
+ /* Duron 670 is valid */
+ if ((c->x86_model == 7) && (c->x86_mask == 0))
+ goto valid_k7;
+
+ /*
+ * Athlon 662, Duron 671, and Athlon >model 7 have capability
+ * bit. It's worth noting that the A5 stepping (662) of some
+ * Athlon XP's have the MP bit set.
+ * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for
+ * more.
+ */
+ if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
+ ((c->x86_model == 7) && (c->x86_mask >= 1)) ||
+ (c->x86_model > 7))
+ if (cpu_has_mp)
+ goto valid_k7;
+
+ /* If we get here, not a certified SMP capable AMD system. */
+
+ /*
+ * Don't taint if we are running SMP kernel on a single non-MP
+ * approved Athlon
+ */
+ WARN_ONCE(1, "WARNING: This combination of AMD"
+ "processors is not suitable for SMP.\n");
+ if (!test_taint(TAINT_UNSAFE_SMP))
+ add_taint(TAINT_UNSAFE_SMP);
+
+valid_k7:
+ ;
+#endif
+}
+
static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
{
u32 l, h;
@@ -175,6 +225,8 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
}
set_cpu_cap(c, X86_FEATURE_K7);
+
+ amd_k7_smp_check(c);
}
#endif
@@ -450,7 +502,7 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int
}
#endif
-static struct cpu_dev amd_cpu_dev __cpuinitdata = {
+static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
.c_vendor = "AMD",
.c_ident = { "AuthenticAMD" },
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index 89bfdd9cacc..c95e831bb09 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -1,11 +1,11 @@
+#include <linux/bitops.h>
#include <linux/kernel.h>
#include <linux/init.h>
-#include <linux/bitops.h>
#include <asm/processor.h>
-#include <asm/msr.h>
#include <asm/e820.h>
#include <asm/mtrr.h>
+#include <asm/msr.h>
#include "cpu.h"
@@ -276,7 +276,7 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
*/
c->x86_capability[5] = cpuid_edx(0xC0000001);
}
-
+#ifdef CONFIG_X86_32
/* Cyrix III family needs CX8 & PGE explicitly enabled. */
if (c->x86_model >= 6 && c->x86_model <= 9) {
rdmsr(MSR_VIA_FCR, lo, hi);
@@ -288,6 +288,11 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
/* Before Nehemiah, the C3's had 3dNOW! */
if (c->x86_model >= 6 && c->x86_model < 9)
set_cpu_cap(c, X86_FEATURE_3DNOW);
+#endif
+ if (c->x86 == 0x6 && c->x86_model >= 0xf) {
+ c->x86_cache_alignment = c->x86_clflush_size * 2;
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+ }
display_cacheinfo(c);
}
@@ -316,16 +321,25 @@ enum {
static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
{
switch (c->x86) {
+#ifdef CONFIG_X86_32
case 5:
/* Emulate MTRRs using Centaur's MCR. */
set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR);
break;
+#endif
+ case 6:
+ if (c->x86_model >= 0xf)
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+ break;
}
+#ifdef CONFIG_X86_64
+ set_cpu_cap(c, X86_FEATURE_SYSENTER32);
+#endif
}
static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
{
-
+#ifdef CONFIG_X86_32
char *name;
u32 fcr_set = 0;
u32 fcr_clr = 0;
@@ -337,8 +351,10 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
* 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
*/
clear_cpu_cap(c, 0*32+31);
-
+#endif
+ early_init_centaur(c);
switch (c->x86) {
+#ifdef CONFIG_X86_32
case 5:
switch (c->x86_model) {
case 4:
@@ -442,16 +458,20 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
}
sprintf(c->x86_model_id, "WinChip %s", name);
break;
-
+#endif
case 6:
init_c3(c);
break;
}
+#ifdef CONFIG_X86_64
+ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+#endif
}
static unsigned int __cpuinit
centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
{
+#ifdef CONFIG_X86_32
/* VIA C3 CPUs (670-68F) need further shifting. */
if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8)))
size >>= 8;
@@ -464,11 +484,11 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
if ((c->x86 == 6) && (c->x86_model == 9) &&
(c->x86_mask == 1) && (size == 65))
size -= 1;
-
+#endif
return size;
}
-static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
+static const struct cpu_dev __cpuinitconst centaur_cpu_dev = {
.c_vendor = "Centaur",
.c_ident = { "CentaurHauls" },
.c_early_init = early_init_centaur,
diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c
deleted file mode 100644
index a1625f5a1e7..00000000000
--- a/arch/x86/kernel/cpu/centaur_64.c
+++ /dev/null
@@ -1,37 +0,0 @@
-#include <linux/init.h>
-#include <linux/smp.h>
-
-#include <asm/cpufeature.h>
-#include <asm/processor.h>
-
-#include "cpu.h"
-
-static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
-{
- if (c->x86 == 0x6 && c->x86_model >= 0xf)
- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
-
- set_cpu_cap(c, X86_FEATURE_SYSENTER32);
-}
-
-static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
-{
- early_init_centaur(c);
-
- if (c->x86 == 0x6 && c->x86_model >= 0xf) {
- c->x86_cache_alignment = c->x86_clflush_size * 2;
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
- }
- set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
-}
-
-static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
- .c_vendor = "Centaur",
- .c_ident = { "CentaurHauls" },
- .c_early_init = early_init_centaur,
- .c_init = init_centaur,
- .c_x86_vendor = X86_VENDOR_CENTAUR,
-};
-
-cpu_dev_register(centaur_cpu_dev);
-
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 826d5c87627..e2962cc1e27 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1,52 +1,52 @@
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/string.h>
#include <linux/bootmem.h>
+#include <linux/linkage.h>
#include <linux/bitops.h>
+#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/kgdb.h>
-#include <linux/topology.h>
+#include <linux/percpu.h>
+#include <linux/string.h>
#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/kgdb.h>
#include <linux/smp.h>
-#include <linux/percpu.h>
-#include <asm/i387.h>
-#include <asm/msr.h>
-#include <asm/io.h>
-#include <asm/linkage.h>
+#include <linux/io.h>
+
+#include <asm/stackprotector.h>
#include <asm/mmu_context.h>
+#include <asm/hypervisor.h>
+#include <asm/processor.h>
+#include <asm/sections.h>
+#include <asm/topology.h>
+#include <asm/cpumask.h>
+#include <asm/pgtable.h>
+#include <asm/atomic.h>
+#include <asm/proto.h>
+#include <asm/setup.h>
+#include <asm/apic.h>
+#include <asm/desc.h>
+#include <asm/i387.h>
#include <asm/mtrr.h>
+#include <asm/numa.h>
+#include <asm/asm.h>
+#include <asm/cpu.h>
#include <asm/mce.h>
+#include <asm/msr.h>
#include <asm/pat.h>
-#include <asm/asm.h>
-#include <asm/numa.h>
#include <asm/smp.h>
-#include <asm/cpu.h>
-#include <asm/cpumask.h>
-#include <asm/apic.h>
#ifdef CONFIG_X86_LOCAL_APIC
#include <asm/uv/uv.h>
#endif
-#include <asm/pgtable.h>
-#include <asm/processor.h>
-#include <asm/desc.h>
-#include <asm/atomic.h>
-#include <asm/proto.h>
-#include <asm/sections.h>
-#include <asm/setup.h>
-#include <asm/hypervisor.h>
-#include <asm/stackprotector.h>
-
#include "cpu.h"
#ifdef CONFIG_X86_64
/* all of these masks are initialized in setup_cpu_local_masks() */
-cpumask_var_t cpu_callin_mask;
-cpumask_var_t cpu_callout_mask;
cpumask_var_t cpu_initialized_mask;
+cpumask_var_t cpu_callout_mask;
+cpumask_var_t cpu_callin_mask;
/* representing cpus for which sibling maps can be computed */
cpumask_var_t cpu_sibling_setup_mask;
@@ -62,15 +62,15 @@ void __init setup_cpu_local_masks(void)
#else /* CONFIG_X86_32 */
-cpumask_t cpu_callin_map;
+cpumask_t cpu_sibling_setup_map;
cpumask_t cpu_callout_map;
cpumask_t cpu_initialized;
-cpumask_t cpu_sibling_setup_map;
+cpumask_t cpu_callin_map;
#endif /* CONFIG_X86_32 */
-static struct cpu_dev *this_cpu __cpuinitdata;
+static const struct cpu_dev *this_cpu __cpuinitdata;
DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
#ifdef CONFIG_X86_64
@@ -79,48 +79,48 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
* IRET will check the segment types kkeil 2000/10/28
* Also sysret mandates a special GDT layout
*
- * The TLS descriptors are currently at a different place compared to i386.
+ * TLS descriptors are currently at a different place compared to i386.
* Hopefully nobody expects them at a fixed place (Wine?)
*/
- [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
- [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
- [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
- [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
- [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
- [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
+ [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
+ [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
+ [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
+ [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
+ [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
+ [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
#else
- [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
- [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
- [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
- [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } },
+ [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
+ [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
+ [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
+ [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } },
/*
* Segments used for calling PnP BIOS have byte granularity.
* They code segments and data segments have fixed 64k limits,
* the transfer segment sizes are set at run time.
*/
/* 32-bit code */
- [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } },
+ [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } },
/* 16-bit code */
- [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } },
+ [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } },
/* 16-bit data */
- [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } },
+ [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } },
/* 16-bit data */
- [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } },
+ [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } },
/* 16-bit data */
- [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } },
+ [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } },
/*
* The APM segments have byte granularity and their bases
* are set at run time. All have 64k limits.
*/
/* 32-bit code */
- [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } },
+ [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } },
/* 16-bit code */
- [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } },
+ [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } },
/* data */
- [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
+ [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
- [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
- [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } },
+ [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
+ [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } },
GDT_STACK_CANARY_INIT
#endif
} };
@@ -164,16 +164,17 @@ static inline int flag_is_changeable_p(u32 flag)
* the CPUID. Add "volatile" to not allow gcc to
* optimize the subsequent calls to this function.
*/
- asm volatile ("pushfl\n\t"
- "pushfl\n\t"
- "popl %0\n\t"
- "movl %0,%1\n\t"
- "xorl %2,%0\n\t"
- "pushl %0\n\t"
- "popfl\n\t"
- "pushfl\n\t"
- "popl %0\n\t"
- "popfl\n\t"
+ asm volatile ("pushfl \n\t"
+ "pushfl \n\t"
+ "popl %0 \n\t"
+ "movl %0, %1 \n\t"
+ "xorl %2, %0 \n\t"
+ "pushl %0 \n\t"
+ "popfl \n\t"
+ "pushfl \n\t"
+ "popl %0 \n\t"
+ "popfl \n\t"
+
: "=&r" (f1), "=&r" (f2)
: "ir" (flag));
@@ -188,18 +189,22 @@ static int __cpuinit have_cpuid_p(void)
static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
{
- if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) {
- /* Disable processor serial number */
- unsigned long lo, hi;
- rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
- lo |= 0x200000;
- wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
- printk(KERN_NOTICE "CPU serial number disabled.\n");
- clear_cpu_cap(c, X86_FEATURE_PN);
-
- /* Disabling the serial number may affect the cpuid level */
- c->cpuid_level = cpuid_eax(0);
- }
+ unsigned long lo, hi;
+
+ if (!cpu_has(c, X86_FEATURE_PN) || !disable_x86_serial_nr)
+ return;
+
+ /* Disable processor serial number: */
+
+ rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
+ lo |= 0x200000;
+ wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
+
+ printk(KERN_NOTICE "CPU serial number disabled.\n");
+ clear_cpu_cap(c, X86_FEATURE_PN);
+
+ /* Disabling the serial number may affect the cpuid level */
+ c->cpuid_level = cpuid_eax(0);
}
static int __init x86_serial_nr_setup(char *s)
@@ -232,6 +237,7 @@ struct cpuid_dependent_feature {
u32 feature;
u32 level;
};
+
static const struct cpuid_dependent_feature __cpuinitconst
cpuid_dependent_features[] = {
{ X86_FEATURE_MWAIT, 0x00000005 },
@@ -243,7 +249,11 @@ cpuid_dependent_features[] = {
static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
{
const struct cpuid_dependent_feature *df;
+
for (df = cpuid_dependent_features; df->feature; df++) {
+
+ if (!cpu_has(c, df->feature))
+ continue;
/*
* Note: cpuid_level is set to -1 if unavailable, but
* extended_extended_level is set to 0 if unavailable
@@ -251,32 +261,32 @@ static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
* when signed; hence the weird messing around with
* signs here...
*/
- if (cpu_has(c, df->feature) &&
- ((s32)df->level < 0 ?
+ if (!((s32)df->level < 0 ?
(u32)df->level > (u32)c->extended_cpuid_level :
- (s32)df->level > (s32)c->cpuid_level)) {
- clear_cpu_cap(c, df->feature);
- if (warn)
- printk(KERN_WARNING
- "CPU: CPU feature %s disabled "
- "due to lack of CPUID level 0x%x\n",
- x86_cap_flags[df->feature],
- df->level);
- }
+ (s32)df->level > (s32)c->cpuid_level))
+ continue;
+
+ clear_cpu_cap(c, df->feature);
+ if (!warn)
+ continue;
+
+ printk(KERN_WARNING
+ "CPU: CPU feature %s disabled, no CPUID level 0x%x\n",
+ x86_cap_flags[df->feature], df->level);
}
}
/*
* Naming convention should be: <Name> [(<Codename>)]
* This table only is used unless init_<vendor>() below doesn't set it;
- * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used
- *
+ * in particular, if CPUID levels 0x80000002..4 are supported, this
+ * isn't used
*/
/* Look up CPU names by table lookup. */
-static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
+static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c)
{
- struct cpu_model_info *info;
+ const struct cpu_model_info *info;
if (c->x86_model >= 16)
return NULL; /* Range check */
@@ -307,8 +317,10 @@ void load_percpu_segment(int cpu)
load_stack_canary_segment();
}
-/* Current gdt points %fs at the "master" per-cpu area: after this,
- * it's on the real one. */
+/*
+ * Current gdt points %fs at the "master" per-cpu area: after this,
+ * it's on the real one.
+ */
void switch_to_new_gdt(int cpu)
{
struct desc_ptr gdt_descr;
@@ -321,7 +333,7 @@ void switch_to_new_gdt(int cpu)
load_percpu_segment(cpu);
}
-static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
+static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {};
static void __cpuinit default_init(struct cpuinfo_x86 *c)
{
@@ -340,7 +352,7 @@ static void __cpuinit default_init(struct cpuinfo_x86 *c)
#endif
}
-static struct cpu_dev __cpuinitdata default_cpu = {
+static const struct cpu_dev __cpuinitconst default_cpu = {
.c_init = default_init,
.c_vendor = "Unknown",
.c_x86_vendor = X86_VENDOR_UNKNOWN,
@@ -354,22 +366,24 @@ static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
if (c->extended_cpuid_level < 0x80000004)
return;
- v = (unsigned int *) c->x86_model_id;
+ v = (unsigned int *)c->x86_model_id;
cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
c->x86_model_id[48] = 0;
- /* Intel chips right-justify this string for some dumb reason;
- undo that brain damage */
+ /*
+ * Intel chips right-justify this string for some dumb reason;
+ * undo that brain damage:
+ */
p = q = &c->x86_model_id[0];
while (*p == ' ')
- p++;
+ p++;
if (p != q) {
- while (*p)
- *q++ = *p++;
- while (q <= &c->x86_model_id[48])
- *q++ = '\0'; /* Zero-pad the rest */
+ while (*p)
+ *q++ = *p++;
+ while (q <= &c->x86_model_id[48])
+ *q++ = '\0'; /* Zero-pad the rest */
}
}
@@ -438,27 +452,30 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
if (smp_num_siblings == 1) {
printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
- } else if (smp_num_siblings > 1) {
+ goto out;
+ }
- if (smp_num_siblings > nr_cpu_ids) {
- printk(KERN_WARNING "CPU: Unsupported number of siblings %d",
- smp_num_siblings);
- smp_num_siblings = 1;
- return;
- }
+ if (smp_num_siblings <= 1)
+ goto out;
+
+ if (smp_num_siblings > nr_cpu_ids) {
+ pr_warning("CPU: Unsupported number of siblings %d",
+ smp_num_siblings);
+ smp_num_siblings = 1;
+ return;
+ }
- index_msb = get_count_order(smp_num_siblings);
- c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
+ index_msb = get_count_order(smp_num_siblings);
+ c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
- smp_num_siblings = smp_num_siblings / c->x86_max_cores;
+ smp_num_siblings = smp_num_siblings / c->x86_max_cores;
- index_msb = get_count_order(smp_num_siblings);
+ index_msb = get_count_order(smp_num_siblings);
- core_bits = get_count_order(c->x86_max_cores);
+ core_bits = get_count_order(c->x86_max_cores);
- c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) &
- ((1 << core_bits) - 1);
- }
+ c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) &
+ ((1 << core_bits) - 1);
out:
if ((c->x86_max_cores * smp_num_siblings) > 1) {
@@ -473,8 +490,8 @@ out:
static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
{
char *v = c->x86_vendor_id;
- int i;
static int printed;
+ int i;
for (i = 0; i < X86_VENDOR_NUM; i++) {
if (!cpu_devs[i])
@@ -483,6 +500,7 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
(cpu_devs[i]->c_ident[1] &&
!strcmp(v, cpu_devs[i]->c_ident[1]))) {
+
this_cpu = cpu_devs[i];
c->x86_vendor = this_cpu->c_x86_vendor;
return;
@@ -491,7 +509,9 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
if (!printed) {
printed++;
- printk(KERN_ERR "CPU: vendor_id '%s' unknown, using generic init.\n", v);
+ printk(KERN_ERR
+ "CPU: vendor_id '%s' unknown, using generic init.\n", v);
+
printk(KERN_ERR "CPU: Your system may be unstable.\n");
}
@@ -511,14 +531,17 @@ void __cpuinit cpu_detect(struct cpuinfo_x86 *c)
/* Intel-defined flags: level 0x00000001 */
if (c->cpuid_level >= 0x00000001) {
u32 junk, tfms, cap0, misc;
+
cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
c->x86 = (tfms >> 8) & 0xf;
c->x86_model = (tfms >> 4) & 0xf;
c->x86_mask = tfms & 0xf;
+
if (c->x86 == 0xf)
c->x86 += (tfms >> 20) & 0xff;
if (c->x86 >= 0x6)
c->x86_model += ((tfms >> 16) & 0xf) << 4;
+
if (cap0 & (1<<19)) {
c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
c->x86_cache_alignment = c->x86_clflush_size;
@@ -534,6 +557,7 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
/* Intel-defined flags: level 0x00000001 */
if (c->cpuid_level >= 0x00000001) {
u32 capability, excap;
+
cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
c->x86_capability[0] = capability;
c->x86_capability[4] = excap;
@@ -542,6 +566,7 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
/* AMD-defined flags: level 0x80000001 */
xlvl = cpuid_eax(0x80000000);
c->extended_cpuid_level = xlvl;
+
if ((xlvl & 0xffff0000) == 0x80000000) {
if (xlvl >= 0x80000001) {
c->x86_capability[1] = cpuid_edx(0x80000001);
@@ -549,13 +574,15 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
}
}
-#ifdef CONFIG_X86_64
if (c->extended_cpuid_level >= 0x80000008) {
u32 eax = cpuid_eax(0x80000008);
c->x86_virt_bits = (eax >> 8) & 0xff;
c->x86_phys_bits = eax & 0xff;
}
+#ifdef CONFIG_X86_32
+ else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36))
+ c->x86_phys_bits = 36;
#endif
if (c->extended_cpuid_level >= 0x80000007)
@@ -602,8 +629,12 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_64
c->x86_clflush_size = 64;
+ c->x86_phys_bits = 36;
+ c->x86_virt_bits = 48;
#else
c->x86_clflush_size = 32;
+ c->x86_phys_bits = 32;
+ c->x86_virt_bits = 32;
#endif
c->x86_cache_alignment = c->x86_clflush_size;
@@ -634,12 +665,12 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
void __init early_cpu_init(void)
{
- struct cpu_dev **cdev;
+ const struct cpu_dev *const *cdev;
int count = 0;
- printk("KERNEL supported cpus:\n");
+ printk(KERN_INFO "KERNEL supported cpus:\n");
for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
- struct cpu_dev *cpudev = *cdev;
+ const struct cpu_dev *cpudev = *cdev;
unsigned int j;
if (count >= X86_VENDOR_NUM)
@@ -650,7 +681,7 @@ void __init early_cpu_init(void)
for (j = 0; j < 2; j++) {
if (!cpudev->c_ident[j])
continue;
- printk(" %s %s\n", cpudev->c_vendor,
+ printk(KERN_INFO " %s %s\n", cpudev->c_vendor,
cpudev->c_ident[j]);
}
}
@@ -726,9 +757,13 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
c->x86_coreid_bits = 0;
#ifdef CONFIG_X86_64
c->x86_clflush_size = 64;
+ c->x86_phys_bits = 36;
+ c->x86_virt_bits = 48;
#else
c->cpuid_level = -1; /* CPUID not detected */
c->x86_clflush_size = 32;
+ c->x86_phys_bits = 32;
+ c->x86_virt_bits = 32;
#endif
c->x86_cache_alignment = c->x86_clflush_size;
memset(&c->x86_capability, 0, sizeof c->x86_capability);
@@ -759,8 +794,8 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
squash_the_stupid_serial_number(c);
/*
- * The vendor-specific functions might have changed features. Now
- * we do "generic changes."
+ * The vendor-specific functions might have changed features.
+ * Now we do "generic changes."
*/
/* Filter out anything that depends on CPUID levels we don't have */
@@ -768,7 +803,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
/* If the model name is still unset, do table lookup. */
if (!c->x86_model_id[0]) {
- char *p;
+ const char *p;
p = table_lookup_model(c);
if (p)
strcpy(c->x86_model_id, p);
@@ -843,11 +878,11 @@ void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
}
struct msr_range {
- unsigned min;
- unsigned max;
+ unsigned min;
+ unsigned max;
};
-static struct msr_range msr_range_array[] __cpuinitdata = {
+static const struct msr_range msr_range_array[] __cpuinitconst = {
{ 0x00000000, 0x00000418},
{ 0xc0000000, 0xc000040b},
{ 0xc0010000, 0xc0010142},
@@ -856,14 +891,15 @@ static struct msr_range msr_range_array[] __cpuinitdata = {
static void __cpuinit print_cpu_msr(void)
{
+ unsigned index_min, index_max;
unsigned index;
u64 val;
int i;
- unsigned index_min, index_max;
for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) {
index_min = msr_range_array[i].min;
index_max = msr_range_array[i].max;
+
for (index = index_min; index < index_max; index++) {
if (rdmsrl_amd_safe(index, &val))
continue;
@@ -873,6 +909,7 @@ static void __cpuinit print_cpu_msr(void)
}
static int show_msr __cpuinitdata;
+
static __init int setup_show_msr(char *arg)
{
int num;
@@ -894,12 +931,14 @@ __setup("noclflush", setup_noclflush);
void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
{
- char *vendor = NULL;
+ const char *vendor = NULL;
- if (c->x86_vendor < X86_VENDOR_NUM)
+ if (c->x86_vendor < X86_VENDOR_NUM) {
vendor = this_cpu->c_vendor;
- else if (c->cpuid_level >= 0)
- vendor = c->x86_vendor_id;
+ } else {
+ if (c->cpuid_level >= 0)
+ vendor = c->x86_vendor_id;
+ }
if (vendor && !strstr(c->x86_model_id, vendor))
printk(KERN_CONT "%s ", vendor);
@@ -926,10 +965,12 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
static __init int setup_disablecpuid(char *arg)
{
int bit;
+
if (get_option(&arg, &bit) && bit < NCAPINTS*32)
setup_clear_cpu_cap(bit);
else
return 0;
+
return 1;
}
__setup("clearcpuid=", setup_disablecpuid);
@@ -939,6 +980,7 @@ struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
DEFINE_PER_CPU_FIRST(union irq_stack_union,
irq_stack_union) __aligned(PAGE_SIZE);
+
DEFINE_PER_CPU(char *, irq_stack_ptr) =
init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
@@ -948,12 +990,21 @@ EXPORT_PER_CPU_SYMBOL(kernel_stack);
DEFINE_PER_CPU(unsigned int, irq_count) = -1;
+/*
+ * Special IST stacks which the CPU switches to when it calls
+ * an IST-marked descriptor entry. Up to 7 stacks (hardware
+ * limit), all of them are 4K, except the debug stack which
+ * is 8K.
+ */
+static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
+ [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
+ [DEBUG_STACK - 1] = DEBUG_STKSZ
+};
+
static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ])
__aligned(PAGE_SIZE);
-extern asmlinkage void ignore_sysret(void);
-
/* May not be marked __init: used by software suspend */
void syscall_init(void)
{
@@ -983,7 +1034,7 @@ unsigned long kernel_eflags;
*/
DEFINE_PER_CPU(struct orig_ist, orig_ist);
-#else /* x86_64 */
+#else /* CONFIG_X86_64 */
#ifdef CONFIG_CC_STACKPROTECTOR
DEFINE_PER_CPU(unsigned long, stack_canary);
@@ -995,9 +1046,26 @@ struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
memset(regs, 0, sizeof(struct pt_regs));
regs->fs = __KERNEL_PERCPU;
regs->gs = __KERNEL_STACK_CANARY;
+
return regs;
}
-#endif /* x86_64 */
+#endif /* CONFIG_X86_64 */
+
+/*
+ * Clear all 6 debug registers:
+ */
+static void clear_all_debug_regs(void)
+{
+ int i;
+
+ for (i = 0; i < 8; i++) {
+ /* Ignore db4, db5 */
+ if ((i == 4) || (i == 5))
+ continue;
+
+ set_debugreg(0, i);
+ }
+}
/*
* cpu_init() initializes state that is per-CPU. Some data is already
@@ -1007,15 +1075,20 @@ struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
* A lot of state is already set up in PDA init for 64 bit
*/
#ifdef CONFIG_X86_64
+
void __cpuinit cpu_init(void)
{
- int cpu = stack_smp_processor_id();
- struct tss_struct *t = &per_cpu(init_tss, cpu);
- struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
- unsigned long v;
+ struct orig_ist *orig_ist;
struct task_struct *me;
+ struct tss_struct *t;
+ unsigned long v;
+ int cpu;
int i;
+ cpu = stack_smp_processor_id();
+ t = &per_cpu(init_tss, cpu);
+ orig_ist = &per_cpu(orig_ist, cpu);
+
#ifdef CONFIG_NUMA
if (cpu != 0 && percpu_read(node_number) == 0 &&
cpu_to_node(cpu) != NUMA_NO_NODE)
@@ -1056,19 +1129,17 @@ void __cpuinit cpu_init(void)
* set up and load the per-CPU TSS
*/
if (!orig_ist->ist[0]) {
- static const unsigned int sizes[N_EXCEPTION_STACKS] = {
- [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
- [DEBUG_STACK - 1] = DEBUG_STKSZ
- };
char *estacks = per_cpu(exception_stacks, cpu);
+
for (v = 0; v < N_EXCEPTION_STACKS; v++) {
- estacks += sizes[v];
+ estacks += exception_stack_sizes[v];
orig_ist->ist[v] = t->x86_tss.ist[v] =
(unsigned long)estacks;
}
}
t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+
/*
* <= is required because the CPU will access up to
* 8 bits beyond the end of the IO permission bitmap.
@@ -1078,8 +1149,7 @@ void __cpuinit cpu_init(void)
atomic_inc(&init_mm.mm_count);
me->active_mm = &init_mm;
- if (me->mm)
- BUG();
+ BUG_ON(me->mm);
enter_lazy_tlb(&init_mm, me);
load_sp0(t, &current->thread);
@@ -1098,17 +1168,7 @@ void __cpuinit cpu_init(void)
arch_kgdb_ops.correct_hw_break();
else
#endif
- {
- /*
- * Clear all 6 debug registers:
- */
- set_debugreg(0UL, 0);
- set_debugreg(0UL, 1);
- set_debugreg(0UL, 2);
- set_debugreg(0UL, 3);
- set_debugreg(0UL, 6);
- set_debugreg(0UL, 7);
- }
+ clear_all_debug_regs();
fpu_init();
@@ -1129,7 +1189,8 @@ void __cpuinit cpu_init(void)
if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) {
printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
- for (;;) local_irq_enable();
+ for (;;)
+ local_irq_enable();
}
printk(KERN_INFO "Initializing CPU#%d\n", cpu);
@@ -1145,8 +1206,7 @@ void __cpuinit cpu_init(void)
*/
atomic_inc(&init_mm.mm_count);
curr->active_mm = &init_mm;
- if (curr->mm)
- BUG();
+ BUG_ON(curr->mm);
enter_lazy_tlb(&init_mm, curr);
load_sp0(t, thread);
@@ -1159,13 +1219,7 @@ void __cpuinit cpu_init(void)
__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
#endif
- /* Clear all 6 debug registers: */
- set_debugreg(0, 0);
- set_debugreg(0, 1);
- set_debugreg(0, 2);
- set_debugreg(0, 3);
- set_debugreg(0, 6);
- set_debugreg(0, 7);
+ clear_all_debug_regs();
/*
* Force FPU initialization:
@@ -1185,6 +1239,4 @@ void __cpuinit cpu_init(void)
xsave_init();
}
-
-
#endif
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index de4094a3921..6de9a908e40 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -3,33 +3,34 @@
#define ARCH_X86_CPU_H
struct cpu_model_info {
- int vendor;
- int family;
- char *model_names[16];
+ int vendor;
+ int family;
+ const char *model_names[16];
};
/* attempt to consolidate cpu attributes */
struct cpu_dev {
- char * c_vendor;
+ const char *c_vendor;
/* some have two possibilities for cpuid string */
- char * c_ident[2];
+ const char *c_ident[2];
struct cpu_model_info c_models[4];
- void (*c_early_init)(struct cpuinfo_x86 *c);
- void (*c_init)(struct cpuinfo_x86 * c);
- void (*c_identify)(struct cpuinfo_x86 * c);
- unsigned int (*c_size_cache)(struct cpuinfo_x86 * c, unsigned int size);
- int c_x86_vendor;
+ void (*c_early_init)(struct cpuinfo_x86 *);
+ void (*c_init)(struct cpuinfo_x86 *);
+ void (*c_identify)(struct cpuinfo_x86 *);
+ unsigned int (*c_size_cache)(struct cpuinfo_x86 *, unsigned int);
+ int c_x86_vendor;
};
#define cpu_dev_register(cpu_devX) \
- static struct cpu_dev *__cpu_dev_##cpu_devX __used \
+ static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \
__attribute__((__section__(".x86_cpu_dev.init"))) = \
&cpu_devX;
-extern struct cpu_dev *__x86_cpu_dev_start[], *__x86_cpu_dev_end[];
+extern const struct cpu_dev *const __x86_cpu_dev_start[],
+ *const __x86_cpu_dev_end[];
extern void display_cacheinfo(struct cpuinfo_x86 *c);
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
new file mode 100755
index 00000000000..46e29ab96c6
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpu_debug.c
@@ -0,0 +1,901 @@
+/*
+ * CPU x86 architecture debug code
+ *
+ * Copyright(C) 2009 Jaswinder Singh Rajput
+ *
+ * For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/interrupt.h>
+#include <linux/compiler.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/kprobes.h>
+#include <linux/uaccess.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/signal.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+
+#include <asm/cpu_debug.h>
+#include <asm/paravirt.h>
+#include <asm/system.h>
+#include <asm/traps.h>
+#include <asm/apic.h>
+#include <asm/desc.h>
+
+static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]);
+static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]);
+static DEFINE_PER_CPU(unsigned, cpu_modelflag);
+static DEFINE_PER_CPU(int, cpu_priv_count);
+static DEFINE_PER_CPU(unsigned, cpu_model);
+
+static DEFINE_MUTEX(cpu_debug_lock);
+
+static struct dentry *cpu_debugfs_dir;
+
+static struct cpu_debug_base cpu_base[] = {
+ { "mc", CPU_MC, 0 },
+ { "monitor", CPU_MONITOR, 0 },
+ { "time", CPU_TIME, 0 },
+ { "pmc", CPU_PMC, 1 },
+ { "platform", CPU_PLATFORM, 0 },
+ { "apic", CPU_APIC, 0 },
+ { "poweron", CPU_POWERON, 0 },
+ { "control", CPU_CONTROL, 0 },
+ { "features", CPU_FEATURES, 0 },
+ { "lastbranch", CPU_LBRANCH, 0 },
+ { "bios", CPU_BIOS, 0 },
+ { "freq", CPU_FREQ, 0 },
+ { "mtrr", CPU_MTRR, 0 },
+ { "perf", CPU_PERF, 0 },
+ { "cache", CPU_CACHE, 0 },
+ { "sysenter", CPU_SYSENTER, 0 },
+ { "therm", CPU_THERM, 0 },
+ { "misc", CPU_MISC, 0 },
+ { "debug", CPU_DEBUG, 0 },
+ { "pat", CPU_PAT, 0 },
+ { "vmx", CPU_VMX, 0 },
+ { "call", CPU_CALL, 0 },
+ { "base", CPU_BASE, 0 },
+ { "ver", CPU_VER, 0 },
+ { "conf", CPU_CONF, 0 },
+ { "smm", CPU_SMM, 0 },
+ { "svm", CPU_SVM, 0 },
+ { "osvm", CPU_OSVM, 0 },
+ { "tss", CPU_TSS, 0 },
+ { "cr", CPU_CR, 0 },
+ { "dt", CPU_DT, 0 },
+ { "registers", CPU_REG_ALL, 0 },
+};
+
+static struct cpu_file_base cpu_file[] = {
+ { "index", CPU_REG_ALL, 0 },
+ { "value", CPU_REG_ALL, 1 },
+};
+
+/* Intel Registers Range */
+static struct cpu_debug_range cpu_intel_range[] = {
+ { 0x00000000, 0x00000001, CPU_MC, CPU_INTEL_ALL },
+ { 0x00000006, 0x00000007, CPU_MONITOR, CPU_CX_AT_XE },
+ { 0x00000010, 0x00000010, CPU_TIME, CPU_INTEL_ALL },
+ { 0x00000011, 0x00000013, CPU_PMC, CPU_INTEL_PENTIUM },
+ { 0x00000017, 0x00000017, CPU_PLATFORM, CPU_PX_CX_AT_XE },
+ { 0x0000001B, 0x0000001B, CPU_APIC, CPU_P6_CX_AT_XE },
+
+ { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_PX_CX_AT_XE },
+ { 0x0000002B, 0x0000002B, CPU_POWERON, CPU_INTEL_XEON },
+ { 0x0000002C, 0x0000002C, CPU_FREQ, CPU_INTEL_XEON },
+ { 0x0000003A, 0x0000003A, CPU_CONTROL, CPU_CX_AT_XE },
+
+ { 0x00000040, 0x00000043, CPU_LBRANCH, CPU_PM_CX_AT_XE },
+ { 0x00000044, 0x00000047, CPU_LBRANCH, CPU_PM_CO_AT },
+ { 0x00000060, 0x00000063, CPU_LBRANCH, CPU_C2_AT },
+ { 0x00000064, 0x00000067, CPU_LBRANCH, CPU_INTEL_ATOM },
+
+ { 0x00000079, 0x00000079, CPU_BIOS, CPU_P6_CX_AT_XE },
+ { 0x00000088, 0x0000008A, CPU_CACHE, CPU_INTEL_P6 },
+ { 0x0000008B, 0x0000008B, CPU_BIOS, CPU_P6_CX_AT_XE },
+ { 0x0000009B, 0x0000009B, CPU_MONITOR, CPU_INTEL_XEON },
+
+ { 0x000000C1, 0x000000C2, CPU_PMC, CPU_P6_CX_AT },
+ { 0x000000CD, 0x000000CD, CPU_FREQ, CPU_CX_AT },
+ { 0x000000E7, 0x000000E8, CPU_PERF, CPU_CX_AT },
+ { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_P6_CX_XE },
+
+ { 0x00000116, 0x00000116, CPU_CACHE, CPU_INTEL_P6 },
+ { 0x00000118, 0x00000118, CPU_CACHE, CPU_INTEL_P6 },
+ { 0x00000119, 0x00000119, CPU_CACHE, CPU_INTEL_PX },
+ { 0x0000011A, 0x0000011B, CPU_CACHE, CPU_INTEL_P6 },
+ { 0x0000011E, 0x0000011E, CPU_CACHE, CPU_PX_CX_AT },
+
+ { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_P6_CX_AT_XE },
+ { 0x00000179, 0x0000017A, CPU_MC, CPU_PX_CX_AT_XE },
+ { 0x0000017B, 0x0000017B, CPU_MC, CPU_P6_XE },
+ { 0x00000186, 0x00000187, CPU_PMC, CPU_P6_CX_AT },
+ { 0x00000198, 0x00000199, CPU_PERF, CPU_PM_CX_AT_XE },
+ { 0x0000019A, 0x0000019A, CPU_TIME, CPU_PM_CX_AT_XE },
+ { 0x0000019B, 0x0000019D, CPU_THERM, CPU_PM_CX_AT_XE },
+ { 0x000001A0, 0x000001A0, CPU_MISC, CPU_PM_CX_AT_XE },
+
+ { 0x000001C9, 0x000001C9, CPU_LBRANCH, CPU_PM_CX_AT },
+ { 0x000001D7, 0x000001D8, CPU_LBRANCH, CPU_INTEL_XEON },
+ { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_CX_AT_XE },
+ { 0x000001DA, 0x000001DA, CPU_LBRANCH, CPU_INTEL_XEON },
+ { 0x000001DB, 0x000001DB, CPU_LBRANCH, CPU_P6_XE },
+ { 0x000001DC, 0x000001DC, CPU_LBRANCH, CPU_INTEL_P6 },
+ { 0x000001DD, 0x000001DE, CPU_LBRANCH, CPU_PX_CX_AT_XE },
+ { 0x000001E0, 0x000001E0, CPU_LBRANCH, CPU_INTEL_P6 },
+
+ { 0x00000200, 0x0000020F, CPU_MTRR, CPU_P6_CX_XE },
+ { 0x00000250, 0x00000250, CPU_MTRR, CPU_P6_CX_XE },
+ { 0x00000258, 0x00000259, CPU_MTRR, CPU_P6_CX_XE },
+ { 0x00000268, 0x0000026F, CPU_MTRR, CPU_P6_CX_XE },
+ { 0x00000277, 0x00000277, CPU_PAT, CPU_C2_AT_XE },
+ { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_P6_CX_XE },
+
+ { 0x00000300, 0x00000308, CPU_PMC, CPU_INTEL_XEON },
+ { 0x00000309, 0x0000030B, CPU_PMC, CPU_C2_AT_XE },
+ { 0x0000030C, 0x00000311, CPU_PMC, CPU_INTEL_XEON },
+ { 0x00000345, 0x00000345, CPU_PMC, CPU_C2_AT },
+ { 0x00000360, 0x00000371, CPU_PMC, CPU_INTEL_XEON },
+ { 0x0000038D, 0x00000390, CPU_PMC, CPU_C2_AT },
+ { 0x000003A0, 0x000003BE, CPU_PMC, CPU_INTEL_XEON },
+ { 0x000003C0, 0x000003CD, CPU_PMC, CPU_INTEL_XEON },
+ { 0x000003E0, 0x000003E1, CPU_PMC, CPU_INTEL_XEON },
+ { 0x000003F0, 0x000003F0, CPU_PMC, CPU_INTEL_XEON },
+ { 0x000003F1, 0x000003F1, CPU_PMC, CPU_C2_AT_XE },
+ { 0x000003F2, 0x000003F2, CPU_PMC, CPU_INTEL_XEON },
+
+ { 0x00000400, 0x00000402, CPU_MC, CPU_PM_CX_AT_XE },
+ { 0x00000403, 0x00000403, CPU_MC, CPU_INTEL_XEON },
+ { 0x00000404, 0x00000406, CPU_MC, CPU_PM_CX_AT_XE },
+ { 0x00000407, 0x00000407, CPU_MC, CPU_INTEL_XEON },
+ { 0x00000408, 0x0000040A, CPU_MC, CPU_PM_CX_AT_XE },
+ { 0x0000040B, 0x0000040B, CPU_MC, CPU_INTEL_XEON },
+ { 0x0000040C, 0x0000040E, CPU_MC, CPU_PM_CX_XE },
+ { 0x0000040F, 0x0000040F, CPU_MC, CPU_INTEL_XEON },
+ { 0x00000410, 0x00000412, CPU_MC, CPU_PM_CX_AT_XE },
+ { 0x00000413, 0x00000417, CPU_MC, CPU_CX_AT_XE },
+ { 0x00000480, 0x0000048B, CPU_VMX, CPU_CX_AT_XE },
+
+ { 0x00000600, 0x00000600, CPU_DEBUG, CPU_PM_CX_AT_XE },
+ { 0x00000680, 0x0000068F, CPU_LBRANCH, CPU_INTEL_XEON },
+ { 0x000006C0, 0x000006CF, CPU_LBRANCH, CPU_INTEL_XEON },
+
+ { 0x000107CC, 0x000107D3, CPU_PMC, CPU_INTEL_XEON_MP },
+
+ { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_INTEL_XEON },
+ { 0xC0000081, 0xC0000082, CPU_CALL, CPU_INTEL_XEON },
+ { 0xC0000084, 0xC0000084, CPU_CALL, CPU_INTEL_XEON },
+ { 0xC0000100, 0xC0000102, CPU_BASE, CPU_INTEL_XEON },
+};
+
+/* AMD Registers Range */
+static struct cpu_debug_range cpu_amd_range[] = {
+ { 0x00000000, 0x00000001, CPU_MC, CPU_K10_PLUS, },
+ { 0x00000010, 0x00000010, CPU_TIME, CPU_K8_PLUS, },
+ { 0x0000001B, 0x0000001B, CPU_APIC, CPU_K8_PLUS, },
+ { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_K7_PLUS },
+ { 0x0000008B, 0x0000008B, CPU_VER, CPU_K8_PLUS },
+ { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_K8_PLUS, },
+
+ { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_K8_PLUS, },
+ { 0x00000179, 0x0000017B, CPU_MC, CPU_K8_PLUS, },
+ { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_K8_PLUS, },
+ { 0x000001DB, 0x000001DE, CPU_LBRANCH, CPU_K8_PLUS, },
+
+ { 0x00000200, 0x0000020F, CPU_MTRR, CPU_K8_PLUS, },
+ { 0x00000250, 0x00000250, CPU_MTRR, CPU_K8_PLUS, },
+ { 0x00000258, 0x00000259, CPU_MTRR, CPU_K8_PLUS, },
+ { 0x00000268, 0x0000026F, CPU_MTRR, CPU_K8_PLUS, },
+ { 0x00000277, 0x00000277, CPU_PAT, CPU_K8_PLUS, },
+ { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_K8_PLUS, },
+
+ { 0x00000400, 0x00000413, CPU_MC, CPU_K8_PLUS, },
+
+ { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_AMD_ALL, },
+ { 0xC0000081, 0xC0000084, CPU_CALL, CPU_K8_PLUS, },
+ { 0xC0000100, 0xC0000102, CPU_BASE, CPU_K8_PLUS, },
+ { 0xC0000103, 0xC0000103, CPU_TIME, CPU_K10_PLUS, },
+
+ { 0xC0010000, 0xC0010007, CPU_PMC, CPU_K8_PLUS, },
+ { 0xC0010010, 0xC0010010, CPU_CONF, CPU_K7_PLUS, },
+ { 0xC0010015, 0xC0010015, CPU_CONF, CPU_K7_PLUS, },
+ { 0xC0010016, 0xC001001A, CPU_MTRR, CPU_K8_PLUS, },
+ { 0xC001001D, 0xC001001D, CPU_MTRR, CPU_K8_PLUS, },
+ { 0xC001001F, 0xC001001F, CPU_CONF, CPU_K8_PLUS, },
+ { 0xC0010030, 0xC0010035, CPU_BIOS, CPU_K8_PLUS, },
+ { 0xC0010044, 0xC0010048, CPU_MC, CPU_K8_PLUS, },
+ { 0xC0010050, 0xC0010056, CPU_SMM, CPU_K0F_PLUS, },
+ { 0xC0010058, 0xC0010058, CPU_CONF, CPU_K10_PLUS, },
+ { 0xC0010060, 0xC0010060, CPU_CACHE, CPU_AMD_11, },
+ { 0xC0010061, 0xC0010068, CPU_SMM, CPU_K10_PLUS, },
+ { 0xC0010069, 0xC001006B, CPU_SMM, CPU_AMD_11, },
+ { 0xC0010070, 0xC0010071, CPU_SMM, CPU_K10_PLUS, },
+ { 0xC0010111, 0xC0010113, CPU_SMM, CPU_K8_PLUS, },
+ { 0xC0010114, 0xC0010118, CPU_SVM, CPU_K10_PLUS, },
+ { 0xC0010140, 0xC0010141, CPU_OSVM, CPU_K10_PLUS, },
+ { 0xC0011022, 0xC0011023, CPU_CONF, CPU_K10_PLUS, },
+};
+
+
+/* Intel */
+static int get_intel_modelflag(unsigned model)
+{
+ int flag;
+
+ switch (model) {
+ case 0x0501:
+ case 0x0502:
+ case 0x0504:
+ flag = CPU_INTEL_PENTIUM;
+ break;
+ case 0x0601:
+ case 0x0603:
+ case 0x0605:
+ case 0x0607:
+ case 0x0608:
+ case 0x060A:
+ case 0x060B:
+ flag = CPU_INTEL_P6;
+ break;
+ case 0x0609:
+ case 0x060D:
+ flag = CPU_INTEL_PENTIUM_M;
+ break;
+ case 0x060E:
+ flag = CPU_INTEL_CORE;
+ break;
+ case 0x060F:
+ case 0x0617:
+ flag = CPU_INTEL_CORE2;
+ break;
+ case 0x061C:
+ flag = CPU_INTEL_ATOM;
+ break;
+ case 0x0F00:
+ case 0x0F01:
+ case 0x0F02:
+ case 0x0F03:
+ case 0x0F04:
+ flag = CPU_INTEL_XEON_P4;
+ break;
+ case 0x0F06:
+ flag = CPU_INTEL_XEON_MP;
+ break;
+ default:
+ flag = CPU_NONE;
+ break;
+ }
+
+ return flag;
+}
+
+/* AMD */
+static int get_amd_modelflag(unsigned model)
+{
+ int flag;
+
+ switch (model >> 8) {
+ case 0x6:
+ flag = CPU_AMD_K6;
+ break;
+ case 0x7:
+ flag = CPU_AMD_K7;
+ break;
+ case 0x8:
+ flag = CPU_AMD_K8;
+ break;
+ case 0xf:
+ flag = CPU_AMD_0F;
+ break;
+ case 0x10:
+ flag = CPU_AMD_10;
+ break;
+ case 0x11:
+ flag = CPU_AMD_11;
+ break;
+ default:
+ flag = CPU_NONE;
+ break;
+ }
+
+ return flag;
+}
+
+static int get_cpu_modelflag(unsigned cpu)
+{
+ int flag;
+
+ flag = per_cpu(cpu_model, cpu);
+
+ switch (flag >> 16) {
+ case X86_VENDOR_INTEL:
+ flag = get_intel_modelflag(flag);
+ break;
+ case X86_VENDOR_AMD:
+ flag = get_amd_modelflag(flag & 0xffff);
+ break;
+ default:
+ flag = CPU_NONE;
+ break;
+ }
+
+ return flag;
+}
+
+static int get_cpu_range_count(unsigned cpu)
+{
+ int index;
+
+ switch (per_cpu(cpu_model, cpu) >> 16) {
+ case X86_VENDOR_INTEL:
+ index = ARRAY_SIZE(cpu_intel_range);
+ break;
+ case X86_VENDOR_AMD:
+ index = ARRAY_SIZE(cpu_amd_range);
+ break;
+ default:
+ index = 0;
+ break;
+ }
+
+ return index;
+}
+
+static int is_typeflag_valid(unsigned cpu, unsigned flag)
+{
+ unsigned vendor, modelflag;
+ int i, index;
+
+ /* Standard Registers should be always valid */
+ if (flag >= CPU_TSS)
+ return 1;
+
+ modelflag = per_cpu(cpu_modelflag, cpu);
+ vendor = per_cpu(cpu_model, cpu) >> 16;
+ index = get_cpu_range_count(cpu);
+
+ for (i = 0; i < index; i++) {
+ switch (vendor) {
+ case X86_VENDOR_INTEL:
+ if ((cpu_intel_range[i].model & modelflag) &&
+ (cpu_intel_range[i].flag & flag))
+ return 1;
+ break;
+ case X86_VENDOR_AMD:
+ if ((cpu_amd_range[i].model & modelflag) &&
+ (cpu_amd_range[i].flag & flag))
+ return 1;
+ break;
+ }
+ }
+
+ /* Invalid */
+ return 0;
+}
+
+static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max,
+ int index, unsigned flag)
+{
+ unsigned modelflag;
+
+ modelflag = per_cpu(cpu_modelflag, cpu);
+ *max = 0;
+ switch (per_cpu(cpu_model, cpu) >> 16) {
+ case X86_VENDOR_INTEL:
+ if ((cpu_intel_range[index].model & modelflag) &&
+ (cpu_intel_range[index].flag & flag)) {
+ *min = cpu_intel_range[index].min;
+ *max = cpu_intel_range[index].max;
+ }
+ break;
+ case X86_VENDOR_AMD:
+ if ((cpu_amd_range[index].model & modelflag) &&
+ (cpu_amd_range[index].flag & flag)) {
+ *min = cpu_amd_range[index].min;
+ *max = cpu_amd_range[index].max;
+ }
+ break;
+ }
+
+ return *max;
+}
+
+/* This function can also be called with seq = NULL for printk */
+static void print_cpu_data(struct seq_file *seq, unsigned type,
+ u32 low, u32 high)
+{
+ struct cpu_private *priv;
+ u64 val = high;
+
+ if (seq) {
+ priv = seq->private;
+ if (priv->file) {
+ val = (val << 32) | low;
+ seq_printf(seq, "0x%llx\n", val);
+ } else
+ seq_printf(seq, " %08x: %08x_%08x\n",
+ type, high, low);
+ } else
+ printk(KERN_INFO " %08x: %08x_%08x\n", type, high, low);
+}
+
+/* This function can also be called with seq = NULL for printk */
+static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
+{
+ unsigned msr, msr_min, msr_max;
+ struct cpu_private *priv;
+ u32 low, high;
+ int i, range;
+
+ if (seq) {
+ priv = seq->private;
+ if (priv->file) {
+ if (!rdmsr_safe_on_cpu(priv->cpu, priv->reg,
+ &low, &high))
+ print_cpu_data(seq, priv->reg, low, high);
+ return;
+ }
+ }
+
+ range = get_cpu_range_count(cpu);
+
+ for (i = 0; i < range; i++) {
+ if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag))
+ continue;
+
+ for (msr = msr_min; msr <= msr_max; msr++) {
+ if (rdmsr_safe_on_cpu(cpu, msr, &low, &high))
+ continue;
+ print_cpu_data(seq, msr, low, high);
+ }
+ }
+}
+
+static void print_tss(void *arg)
+{
+ struct pt_regs *regs = task_pt_regs(current);
+ struct seq_file *seq = arg;
+ unsigned int seg;
+
+ seq_printf(seq, " RAX\t: %016lx\n", regs->ax);
+ seq_printf(seq, " RBX\t: %016lx\n", regs->bx);
+ seq_printf(seq, " RCX\t: %016lx\n", regs->cx);
+ seq_printf(seq, " RDX\t: %016lx\n", regs->dx);
+
+ seq_printf(seq, " RSI\t: %016lx\n", regs->si);
+ seq_printf(seq, " RDI\t: %016lx\n", regs->di);
+ seq_printf(seq, " RBP\t: %016lx\n", regs->bp);
+ seq_printf(seq, " ESP\t: %016lx\n", regs->sp);
+
+#ifdef CONFIG_X86_64
+ seq_printf(seq, " R08\t: %016lx\n", regs->r8);
+ seq_printf(seq, " R09\t: %016lx\n", regs->r9);
+ seq_printf(seq, " R10\t: %016lx\n", regs->r10);
+ seq_printf(seq, " R11\t: %016lx\n", regs->r11);
+ seq_printf(seq, " R12\t: %016lx\n", regs->r12);
+ seq_printf(seq, " R13\t: %016lx\n", regs->r13);
+ seq_printf(seq, " R14\t: %016lx\n", regs->r14);
+ seq_printf(seq, " R15\t: %016lx\n", regs->r15);
+#endif
+
+ asm("movl %%cs,%0" : "=r" (seg));
+ seq_printf(seq, " CS\t: %04x\n", seg);
+ asm("movl %%ds,%0" : "=r" (seg));
+ seq_printf(seq, " DS\t: %04x\n", seg);
+ seq_printf(seq, " SS\t: %04lx\n", regs->ss & 0xffff);
+ asm("movl %%es,%0" : "=r" (seg));
+ seq_printf(seq, " ES\t: %04x\n", seg);
+ asm("movl %%fs,%0" : "=r" (seg));
+ seq_printf(seq, " FS\t: %04x\n", seg);
+ asm("movl %%gs,%0" : "=r" (seg));
+ seq_printf(seq, " GS\t: %04x\n", seg);
+
+ seq_printf(seq, " EFLAGS\t: %016lx\n", regs->flags);
+
+ seq_printf(seq, " EIP\t: %016lx\n", regs->ip);
+}
+
+static void print_cr(void *arg)
+{
+ struct seq_file *seq = arg;
+
+ seq_printf(seq, " cr0\t: %016lx\n", read_cr0());
+ seq_printf(seq, " cr2\t: %016lx\n", read_cr2());
+ seq_printf(seq, " cr3\t: %016lx\n", read_cr3());
+ seq_printf(seq, " cr4\t: %016lx\n", read_cr4_safe());
+#ifdef CONFIG_X86_64
+ seq_printf(seq, " cr8\t: %016lx\n", read_cr8());
+#endif
+}
+
+static void print_desc_ptr(char *str, struct seq_file *seq, struct desc_ptr dt)
+{
+ seq_printf(seq, " %s\t: %016llx\n", str, (u64)(dt.address | dt.size));
+}
+
+static void print_dt(void *seq)
+{
+ struct desc_ptr dt;
+ unsigned long ldt;
+
+ /* IDT */
+ store_idt((struct desc_ptr *)&dt);
+ print_desc_ptr("IDT", seq, dt);
+
+ /* GDT */
+ store_gdt((struct desc_ptr *)&dt);
+ print_desc_ptr("GDT", seq, dt);
+
+ /* LDT */
+ store_ldt(ldt);
+ seq_printf(seq, " LDT\t: %016lx\n", ldt);
+
+ /* TR */
+ store_tr(ldt);
+ seq_printf(seq, " TR\t: %016lx\n", ldt);
+}
+
+static void print_dr(void *arg)
+{
+ struct seq_file *seq = arg;
+ unsigned long dr;
+ int i;
+
+ for (i = 0; i < 8; i++) {
+ /* Ignore db4, db5 */
+ if ((i == 4) || (i == 5))
+ continue;
+ get_debugreg(dr, i);
+ seq_printf(seq, " dr%d\t: %016lx\n", i, dr);
+ }
+
+ seq_printf(seq, "\n MSR\t:\n");
+}
+
+static void print_apic(void *arg)
+{
+ struct seq_file *seq = arg;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ seq_printf(seq, " LAPIC\t:\n");
+ seq_printf(seq, " ID\t\t: %08x\n", apic_read(APIC_ID) >> 24);
+ seq_printf(seq, " LVR\t\t: %08x\n", apic_read(APIC_LVR));
+ seq_printf(seq, " TASKPRI\t: %08x\n", apic_read(APIC_TASKPRI));
+ seq_printf(seq, " ARBPRI\t\t: %08x\n", apic_read(APIC_ARBPRI));
+ seq_printf(seq, " PROCPRI\t: %08x\n", apic_read(APIC_PROCPRI));
+ seq_printf(seq, " LDR\t\t: %08x\n", apic_read(APIC_LDR));
+ seq_printf(seq, " DFR\t\t: %08x\n", apic_read(APIC_DFR));
+ seq_printf(seq, " SPIV\t\t: %08x\n", apic_read(APIC_SPIV));
+ seq_printf(seq, " ISR\t\t: %08x\n", apic_read(APIC_ISR));
+ seq_printf(seq, " ESR\t\t: %08x\n", apic_read(APIC_ESR));
+ seq_printf(seq, " ICR\t\t: %08x\n", apic_read(APIC_ICR));
+ seq_printf(seq, " ICR2\t\t: %08x\n", apic_read(APIC_ICR2));
+ seq_printf(seq, " LVTT\t\t: %08x\n", apic_read(APIC_LVTT));
+ seq_printf(seq, " LVTTHMR\t: %08x\n", apic_read(APIC_LVTTHMR));
+ seq_printf(seq, " LVTPC\t\t: %08x\n", apic_read(APIC_LVTPC));
+ seq_printf(seq, " LVT0\t\t: %08x\n", apic_read(APIC_LVT0));
+ seq_printf(seq, " LVT1\t\t: %08x\n", apic_read(APIC_LVT1));
+ seq_printf(seq, " LVTERR\t\t: %08x\n", apic_read(APIC_LVTERR));
+ seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT));
+ seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT));
+ seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR));
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+ seq_printf(seq, "\n MSR\t:\n");
+}
+
+static int cpu_seq_show(struct seq_file *seq, void *v)
+{
+ struct cpu_private *priv = seq->private;
+
+ if (priv == NULL)
+ return -EINVAL;
+
+ switch (cpu_base[priv->type].flag) {
+ case CPU_TSS:
+ smp_call_function_single(priv->cpu, print_tss, seq, 1);
+ break;
+ case CPU_CR:
+ smp_call_function_single(priv->cpu, print_cr, seq, 1);
+ break;
+ case CPU_DT:
+ smp_call_function_single(priv->cpu, print_dt, seq, 1);
+ break;
+ case CPU_DEBUG:
+ if (priv->file == CPU_INDEX_BIT)
+ smp_call_function_single(priv->cpu, print_dr, seq, 1);
+ print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
+ break;
+ case CPU_APIC:
+ if (priv->file == CPU_INDEX_BIT)
+ smp_call_function_single(priv->cpu, print_apic, seq, 1);
+ print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
+ break;
+
+ default:
+ print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
+ break;
+ }
+ seq_printf(seq, "\n");
+
+ return 0;
+}
+
+static void *cpu_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ if (*pos == 0) /* One time is enough ;-) */
+ return seq;
+
+ return NULL;
+}
+
+static void *cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ (*pos)++;
+
+ return cpu_seq_start(seq, pos);
+}
+
+static void cpu_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static const struct seq_operations cpu_seq_ops = {
+ .start = cpu_seq_start,
+ .next = cpu_seq_next,
+ .stop = cpu_seq_stop,
+ .show = cpu_seq_show,
+};
+
+static int cpu_seq_open(struct inode *inode, struct file *file)
+{
+ struct cpu_private *priv = inode->i_private;
+ struct seq_file *seq;
+ int err;
+
+ err = seq_open(file, &cpu_seq_ops);
+ if (!err) {
+ seq = file->private_data;
+ seq->private = priv;
+ }
+
+ return err;
+}
+
+static int write_msr(struct cpu_private *priv, u64 val)
+{
+ u32 low, high;
+
+ high = (val >> 32) & 0xffffffff;
+ low = val & 0xffffffff;
+
+ if (!wrmsr_safe_on_cpu(priv->cpu, priv->reg, low, high))
+ return 0;
+
+ return -EPERM;
+}
+
+static int write_cpu_register(struct cpu_private *priv, const char *buf)
+{
+ int ret = -EPERM;
+ u64 val;
+
+ ret = strict_strtoull(buf, 0, &val);
+ if (ret < 0)
+ return ret;
+
+ /* Supporting only MSRs */
+ if (priv->type < CPU_TSS_BIT)
+ return write_msr(priv, val);
+
+ return ret;
+}
+
+static ssize_t cpu_write(struct file *file, const char __user *ubuf,
+ size_t count, loff_t *off)
+{
+ struct seq_file *seq = file->private_data;
+ struct cpu_private *priv = seq->private;
+ char buf[19];
+
+ if ((priv == NULL) || (count >= sizeof(buf)))
+ return -EINVAL;
+
+ if (copy_from_user(&buf, ubuf, count))
+ return -EFAULT;
+
+ buf[count] = 0;
+
+ if ((cpu_base[priv->type].write) && (cpu_file[priv->file].write))
+ if (!write_cpu_register(priv, buf))
+ return count;
+
+ return -EACCES;
+}
+
+static const struct file_operations cpu_fops = {
+ .owner = THIS_MODULE,
+ .open = cpu_seq_open,
+ .read = seq_read,
+ .write = cpu_write,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int cpu_create_file(unsigned cpu, unsigned type, unsigned reg,
+ unsigned file, struct dentry *dentry)
+{
+ struct cpu_private *priv = NULL;
+
+ /* Already intialized */
+ if (file == CPU_INDEX_BIT)
+ if (per_cpu(cpu_arr[type].init, cpu))
+ return 0;
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (priv == NULL)
+ return -ENOMEM;
+
+ priv->cpu = cpu;
+ priv->type = type;
+ priv->reg = reg;
+ priv->file = file;
+ mutex_lock(&cpu_debug_lock);
+ per_cpu(priv_arr[type], cpu) = priv;
+ per_cpu(cpu_priv_count, cpu)++;
+ mutex_unlock(&cpu_debug_lock);
+
+ if (file)
+ debugfs_create_file(cpu_file[file].name, S_IRUGO,
+ dentry, (void *)priv, &cpu_fops);
+ else {
+ debugfs_create_file(cpu_base[type].name, S_IRUGO,
+ per_cpu(cpu_arr[type].dentry, cpu),
+ (void *)priv, &cpu_fops);
+ mutex_lock(&cpu_debug_lock);
+ per_cpu(cpu_arr[type].init, cpu) = 1;
+ mutex_unlock(&cpu_debug_lock);
+ }
+
+ return 0;
+}
+
+static int cpu_init_regfiles(unsigned cpu, unsigned int type, unsigned reg,
+ struct dentry *dentry)
+{
+ unsigned file;
+ int err = 0;
+
+ for (file = 0; file < ARRAY_SIZE(cpu_file); file++) {
+ err = cpu_create_file(cpu, type, reg, file, dentry);
+ if (err)
+ return err;
+ }
+
+ return err;
+}
+
+static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry)
+{
+ struct dentry *cpu_dentry = NULL;
+ unsigned reg, reg_min, reg_max;
+ int i, range, err = 0;
+ char reg_dir[12];
+ u32 low, high;
+
+ range = get_cpu_range_count(cpu);
+
+ for (i = 0; i < range; i++) {
+ if (!get_cpu_range(cpu, &reg_min, &reg_max, i,
+ cpu_base[type].flag))
+ continue;
+
+ for (reg = reg_min; reg <= reg_max; reg++) {
+ if (rdmsr_safe_on_cpu(cpu, reg, &low, &high))
+ continue;
+
+ sprintf(reg_dir, "0x%x", reg);
+ cpu_dentry = debugfs_create_dir(reg_dir, dentry);
+ err = cpu_init_regfiles(cpu, type, reg, cpu_dentry);
+ if (err)
+ return err;
+ }
+ }
+
+ return err;
+}
+
+static int cpu_init_allreg(unsigned cpu, struct dentry *dentry)
+{
+ struct dentry *cpu_dentry = NULL;
+ unsigned type;
+ int err = 0;
+
+ for (type = 0; type < ARRAY_SIZE(cpu_base) - 1; type++) {
+ if (!is_typeflag_valid(cpu, cpu_base[type].flag))
+ continue;
+ cpu_dentry = debugfs_create_dir(cpu_base[type].name, dentry);
+ per_cpu(cpu_arr[type].dentry, cpu) = cpu_dentry;
+
+ if (type < CPU_TSS_BIT)
+ err = cpu_init_msr(cpu, type, cpu_dentry);
+ else
+ err = cpu_create_file(cpu, type, 0, CPU_INDEX_BIT,
+ cpu_dentry);
+ if (err)
+ return err;
+ }
+
+ return err;
+}
+
+static int cpu_init_cpu(void)
+{
+ struct dentry *cpu_dentry = NULL;
+ struct cpuinfo_x86 *cpui;
+ char cpu_dir[12];
+ unsigned cpu;
+ int err = 0;
+
+ for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
+ cpui = &cpu_data(cpu);
+ if (!cpu_has(cpui, X86_FEATURE_MSR))
+ continue;
+ per_cpu(cpu_model, cpu) = ((cpui->x86_vendor << 16) |
+ (cpui->x86 << 8) |
+ (cpui->x86_model));
+ per_cpu(cpu_modelflag, cpu) = get_cpu_modelflag(cpu);
+
+ sprintf(cpu_dir, "cpu%d", cpu);
+ cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir);
+ err = cpu_init_allreg(cpu, cpu_dentry);
+
+ pr_info("cpu%d(%d) debug files %d\n",
+ cpu, nr_cpu_ids, per_cpu(cpu_priv_count, cpu));
+ if (per_cpu(cpu_priv_count, cpu) > MAX_CPU_FILES) {
+ pr_err("Register files count %d exceeds limit %d\n",
+ per_cpu(cpu_priv_count, cpu), MAX_CPU_FILES);
+ per_cpu(cpu_priv_count, cpu) = MAX_CPU_FILES;
+ err = -ENFILE;
+ }
+ if (err)
+ return err;
+ }
+
+ return err;
+}
+
+static int __init cpu_debug_init(void)
+{
+ cpu_debugfs_dir = debugfs_create_dir("cpu", arch_debugfs_dir);
+
+ return cpu_init_cpu();
+}
+
+static void __exit cpu_debug_exit(void)
+{
+ int i, cpu;
+
+ if (cpu_debugfs_dir)
+ debugfs_remove_recursive(cpu_debugfs_dir);
+
+ for (cpu = 0; cpu < nr_cpu_ids; cpu++)
+ for (i = 0; i < per_cpu(cpu_priv_count, cpu); i++)
+ kfree(per_cpu(priv_arr[i], cpu));
+}
+
+module_init(cpu_debug_init);
+module_exit(cpu_debug_exit);
+
+MODULE_AUTHOR("Jaswinder Singh Rajput");
+MODULE_DESCRIPTION("CPU Debug module");
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index ffd0f5ed071..593171e967e 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -61,23 +61,23 @@ static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
*/
static unsigned char Cx86_dir0_msb __cpuinitdata = 0;
-static char Cx86_model[][9] __cpuinitdata = {
+static const char __cpuinitconst Cx86_model[][9] = {
"Cx486", "Cx486", "5x86 ", "6x86", "MediaGX ", "6x86MX ",
"M II ", "Unknown"
};
-static char Cx486_name[][5] __cpuinitdata = {
+static const char __cpuinitconst Cx486_name[][5] = {
"SLC", "DLC", "SLC2", "DLC2", "SRx", "DRx",
"SRx2", "DRx2"
};
-static char Cx486S_name[][4] __cpuinitdata = {
+static const char __cpuinitconst Cx486S_name[][4] = {
"S", "S2", "Se", "S2e"
};
-static char Cx486D_name[][4] __cpuinitdata = {
+static const char __cpuinitconst Cx486D_name[][4] = {
"DX", "DX2", "?", "?", "?", "DX4"
};
static char Cx86_cb[] __cpuinitdata = "?.5x Core/Bus Clock";
-static char cyrix_model_mult1[] __cpuinitdata = "12??43";
-static char cyrix_model_mult2[] __cpuinitdata = "12233445";
+static const char __cpuinitconst cyrix_model_mult1[] = "12??43";
+static const char __cpuinitconst cyrix_model_mult2[] = "12233445";
/*
* Reset the slow-loop (SLOP) bit on the 686(L) which is set by some old
@@ -435,7 +435,7 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c)
}
}
-static struct cpu_dev cyrix_cpu_dev __cpuinitdata = {
+static const struct cpu_dev __cpuinitconst cyrix_cpu_dev = {
.c_vendor = "Cyrix",
.c_ident = { "CyrixInstead" },
.c_early_init = early_init_cyrix,
@@ -446,7 +446,7 @@ static struct cpu_dev cyrix_cpu_dev __cpuinitdata = {
cpu_dev_register(cyrix_cpu_dev);
-static struct cpu_dev nsc_cpu_dev __cpuinitdata = {
+static const struct cpu_dev __cpuinitconst nsc_cpu_dev = {
.c_vendor = "NSC",
.c_ident = { "Geode by NSC" },
.c_init = init_nsc,
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 1a89a2b68d1..7437fa133c0 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -14,6 +14,7 @@
#include <asm/uaccess.h>
#include <asm/ds.h>
#include <asm/bugs.h>
+#include <asm/cpu.h>
#ifdef CONFIG_X86_64
#include <asm/topology.h>
@@ -54,6 +55,11 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
c->x86_cache_alignment = 128;
#endif
+ /* CPUID workaround for 0F33/0F34 CPU */
+ if (c->x86 == 0xF && c->x86_model == 0x3
+ && (c->x86_mask == 0x3 || c->x86_mask == 0x4))
+ c->x86_phys_bits = 36;
+
/*
* c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
* with P/T states and does not stop in deep C-states.
@@ -116,6 +122,28 @@ static void __cpuinit trap_init_f00f_bug(void)
}
#endif
+static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+ /* calling is from identify_secondary_cpu() ? */
+ if (c->cpu_index == boot_cpu_id)
+ return;
+
+ /*
+ * Mask B, Pentium, but not Pentium MMX
+ */
+ if (c->x86 == 5 &&
+ c->x86_mask >= 1 && c->x86_mask <= 4 &&
+ c->x86_model <= 3) {
+ /*
+ * Remember we have B step Pentia with bugs
+ */
+ WARN_ONCE(1, "WARNING: SMP operation may be unreliable"
+ "with B stepping processors.\n");
+ }
+#endif
+}
+
static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
{
unsigned long lo, hi;
@@ -192,6 +220,8 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
#ifdef CONFIG_X86_NUMAQ
numaq_tsc_disable();
#endif
+
+ intel_smp_check(c);
}
#else
static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
@@ -391,7 +421,7 @@ static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned i
}
#endif
-static struct cpu_dev intel_cpu_dev __cpuinitdata = {
+static const struct cpu_dev __cpuinitconst intel_cpu_dev = {
.c_vendor = "Intel",
.c_ident = { "GenuineIntel" },
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 7293508d8f5..c471eb1a389 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -32,7 +32,7 @@ struct _cache_table
};
/* all the cache descriptor types we care about (no TLB or trace cache entries) */
-static struct _cache_table cache_table[] __cpuinitdata =
+static const struct _cache_table __cpuinitconst cache_table[] =
{
{ 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */
{ 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */
@@ -206,15 +206,15 @@ union l3_cache {
unsigned val;
};
-static unsigned short assocs[] __cpuinitdata = {
+static const unsigned short __cpuinitconst assocs[] = {
[1] = 1, [2] = 2, [4] = 4, [6] = 8,
[8] = 16, [0xa] = 32, [0xb] = 48,
[0xc] = 64,
[0xf] = 0xffff // ??
};
-static unsigned char levels[] __cpuinitdata = { 1, 1, 2, 3 };
-static unsigned char types[] __cpuinitdata = { 1, 2, 3, 3 };
+static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 };
+static const unsigned char __cpuinitconst types[] = { 1, 2, 3, 3 };
static void __cpuinit
amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index d7d2323bbb6..b2f89829bbe 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -4,3 +4,4 @@ obj-$(CONFIG_X86_32) += k7.o p4.o p5.o p6.o winchip.o
obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o
obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o
obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o
+obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
index dfaebce3633..3552119b091 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_32.c
@@ -60,20 +60,6 @@ void mcheck_init(struct cpuinfo_x86 *c)
}
}
-static unsigned long old_cr4 __initdata;
-
-void __init stop_mce(void)
-{
- old_cr4 = read_cr4();
- clear_in_cr4(X86_CR4_MCE);
-}
-
-void __init restart_mce(void)
-{
- if (old_cr4 & X86_CR4_MCE)
- set_in_cr4(X86_CR4_MCE);
-}
-
static int __init mcheck_disable(char *str)
{
mce_disabled = 1;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index fe79985ce0f..ca14604611e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -3,6 +3,8 @@
* K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
* Rest from unknown author(s).
* 2004 Andi Kleen. Rewrote most of it.
+ * Copyright 2008 Intel Corporation
+ * Author: Andi Kleen
*/
#include <linux/init.h>
@@ -24,6 +26,9 @@
#include <linux/ctype.h>
#include <linux/kmod.h>
#include <linux/kdebug.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/ratelimit.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
@@ -32,7 +37,6 @@
#include <asm/idle.h>
#define MISC_MCELOG_MINOR 227
-#define NR_SYSFS_BANKS 6
atomic_t mce_entry;
@@ -47,7 +51,7 @@ static int mce_dont_init;
*/
static int tolerant = 1;
static int banks;
-static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL };
+static u64 *bank;
static unsigned long notify_user;
static int rip_msr;
static int mce_bootlog = -1;
@@ -58,6 +62,19 @@ static char *trigger_argv[2] = { trigger, NULL };
static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
+/* MCA banks polled by the period polling timer for corrected events */
+DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
+ [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
+};
+
+/* Do initial initialization of a struct mce */
+void mce_setup(struct mce *m)
+{
+ memset(m, 0, sizeof(struct mce));
+ m->cpu = smp_processor_id();
+ rdtscll(m->tsc);
+}
+
/*
* Lockless MCE logging infrastructure.
* This avoids deadlocks on printk locks without having to break locks. Also
@@ -119,11 +136,11 @@ static void print_mce(struct mce *m)
print_symbol("{%s}", m->ip);
printk("\n");
}
- printk(KERN_EMERG "TSC %Lx ", m->tsc);
+ printk(KERN_EMERG "TSC %llx ", m->tsc);
if (m->addr)
- printk("ADDR %Lx ", m->addr);
+ printk("ADDR %llx ", m->addr);
if (m->misc)
- printk("MISC %Lx ", m->misc);
+ printk("MISC %llx ", m->misc);
printk("\n");
printk(KERN_EMERG "This is not a software problem!\n");
printk(KERN_EMERG "Run through mcelog --ascii to decode "
@@ -149,8 +166,10 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start)
panic(msg);
}
-static int mce_available(struct cpuinfo_x86 *c)
+int mce_available(struct cpuinfo_x86 *c)
{
+ if (mce_dont_init)
+ return 0;
return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}
@@ -172,7 +191,77 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
}
/*
- * The actual machine check handler
+ * Poll for corrected events or events that happened before reset.
+ * Those are just logged through /dev/mcelog.
+ *
+ * This is executed in standard interrupt context.
+ */
+void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
+{
+ struct mce m;
+ int i;
+
+ mce_setup(&m);
+
+ rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
+ for (i = 0; i < banks; i++) {
+ if (!bank[i] || !test_bit(i, *b))
+ continue;
+
+ m.misc = 0;
+ m.addr = 0;
+ m.bank = i;
+ m.tsc = 0;
+
+ barrier();
+ rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
+ if (!(m.status & MCI_STATUS_VAL))
+ continue;
+
+ /*
+ * Uncorrected events are handled by the exception handler
+ * when it is enabled. But when the exception is disabled log
+ * everything.
+ *
+ * TBD do the same check for MCI_STATUS_EN here?
+ */
+ if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
+ continue;
+
+ if (m.status & MCI_STATUS_MISCV)
+ rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
+ if (m.status & MCI_STATUS_ADDRV)
+ rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
+
+ if (!(flags & MCP_TIMESTAMP))
+ m.tsc = 0;
+ /*
+ * Don't get the IP here because it's unlikely to
+ * have anything to do with the actual error location.
+ */
+
+ mce_log(&m);
+ add_taint(TAINT_MACHINE_CHECK);
+
+ /*
+ * Clear state for this bank.
+ */
+ wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+ }
+
+ /*
+ * Don't clear MCG_STATUS here because it's only defined for
+ * exceptions.
+ */
+}
+
+/*
+ * The actual machine check handler. This only handles real
+ * exceptions when something got corrupted coming in through int 18.
+ *
+ * This is executed in NMI context not subject to normal locking rules. This
+ * implies that most kernel services cannot be safely used. Don't even
+ * think about putting a printk in there!
*/
void do_machine_check(struct pt_regs * regs, long error_code)
{
@@ -190,17 +279,18 @@ void do_machine_check(struct pt_regs * regs, long error_code)
* error.
*/
int kill_it = 0;
+ DECLARE_BITMAP(toclear, MAX_NR_BANKS);
atomic_inc(&mce_entry);
- if ((regs
- && notify_die(DIE_NMI, "machine check", regs, error_code,
+ if (notify_die(DIE_NMI, "machine check", regs, error_code,
18, SIGKILL) == NOTIFY_STOP)
- || !banks)
+ goto out2;
+ if (!banks)
goto out2;
- memset(&m, 0, sizeof(struct mce));
- m.cpu = smp_processor_id();
+ mce_setup(&m);
+
rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
/* if the restart IP is not valid, we're done for */
if (!(m.mcgstatus & MCG_STATUS_RIPV))
@@ -210,18 +300,32 @@ void do_machine_check(struct pt_regs * regs, long error_code)
barrier();
for (i = 0; i < banks; i++) {
- if (i < NR_SYSFS_BANKS && !bank[i])
+ __clear_bit(i, toclear);
+ if (!bank[i])
continue;
m.misc = 0;
m.addr = 0;
m.bank = i;
- m.tsc = 0;
rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
if ((m.status & MCI_STATUS_VAL) == 0)
continue;
+ /*
+ * Non uncorrected errors are handled by machine_check_poll
+ * Leave them alone.
+ */
+ if ((m.status & MCI_STATUS_UC) == 0)
+ continue;
+
+ /*
+ * Set taint even when machine check was not enabled.
+ */
+ add_taint(TAINT_MACHINE_CHECK);
+
+ __set_bit(i, toclear);
+
if (m.status & MCI_STATUS_EN) {
/* if PCC was set, there's no way out */
no_way_out |= !!(m.status & MCI_STATUS_PCC);
@@ -235,6 +339,12 @@ void do_machine_check(struct pt_regs * regs, long error_code)
no_way_out = 1;
kill_it = 1;
}
+ } else {
+ /*
+ * Machine check event was not enabled. Clear, but
+ * ignore.
+ */
+ continue;
}
if (m.status & MCI_STATUS_MISCV)
@@ -243,10 +353,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
mce_get_rip(&m, regs);
- if (error_code >= 0)
- rdtscll(m.tsc);
- if (error_code != -2)
- mce_log(&m);
+ mce_log(&m);
/* Did this bank cause the exception? */
/* Assume that the bank with uncorrectable errors did it,
@@ -255,14 +362,8 @@ void do_machine_check(struct pt_regs * regs, long error_code)
panicm = m;
panicm_found = 1;
}
-
- add_taint(TAINT_MACHINE_CHECK);
}
- /* Never do anything final in the polling timer */
- if (!regs)
- goto out;
-
/* If we didn't find an uncorrectable error, pick
the last one (shouldn't happen, just being safe). */
if (!panicm_found)
@@ -309,10 +410,11 @@ void do_machine_check(struct pt_regs * regs, long error_code)
/* notify userspace ASAP */
set_thread_flag(TIF_MCE_NOTIFY);
- out:
/* the last thing we do is clear state */
- for (i = 0; i < banks; i++)
- wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+ for (i = 0; i < banks; i++) {
+ if (test_bit(i, toclear))
+ wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+ }
wrmsrl(MSR_IA32_MCG_STATUS, 0);
out2:
atomic_dec(&mce_entry);
@@ -332,15 +434,13 @@ void do_machine_check(struct pt_regs * regs, long error_code)
* and historically has been the register value of the
* MSR_IA32_THERMAL_STATUS (Intel) msr.
*/
-void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
+void mce_log_therm_throt_event(__u64 status)
{
struct mce m;
- memset(&m, 0, sizeof(m));
- m.cpu = cpu;
+ mce_setup(&m);
m.bank = MCE_THERMAL_BANK;
m.status = status;
- rdtscll(m.tsc);
mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */
@@ -353,18 +453,18 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
static int check_interval = 5 * 60; /* 5 minutes */
static int next_interval; /* in jiffies */
-static void mcheck_timer(struct work_struct *work);
-static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer);
+static void mcheck_timer(unsigned long);
+static DEFINE_PER_CPU(struct timer_list, mce_timer);
-static void mcheck_check_cpu(void *info)
+static void mcheck_timer(unsigned long data)
{
- if (mce_available(&current_cpu_data))
- do_machine_check(NULL, 0);
-}
+ struct timer_list *t = &per_cpu(mce_timer, data);
-static void mcheck_timer(struct work_struct *work)
-{
- on_each_cpu(mcheck_check_cpu, NULL, 1);
+ WARN_ON(smp_processor_id() != data);
+
+ if (mce_available(&current_cpu_data))
+ machine_check_poll(MCP_TIMESTAMP,
+ &__get_cpu_var(mce_poll_banks));
/*
* Alert userspace if needed. If we logged an MCE, reduce the
@@ -377,31 +477,41 @@ static void mcheck_timer(struct work_struct *work)
(int)round_jiffies_relative(check_interval*HZ));
}
- schedule_delayed_work(&mcheck_work, next_interval);
+ t->expires = jiffies + next_interval;
+ add_timer(t);
+}
+
+static void mce_do_trigger(struct work_struct *work)
+{
+ call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
}
+static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
+
/*
- * This is only called from process context. This is where we do
- * anything we need to alert userspace about new MCEs. This is called
- * directly from the poller and also from entry.S and idle, thanks to
- * TIF_MCE_NOTIFY.
+ * Notify the user(s) about new machine check events.
+ * Can be called from interrupt context, but not from machine check/NMI
+ * context.
*/
int mce_notify_user(void)
{
+ /* Not more than two messages every minute */
+ static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
+
clear_thread_flag(TIF_MCE_NOTIFY);
if (test_and_clear_bit(0, &notify_user)) {
- static unsigned long last_print;
- unsigned long now = jiffies;
-
wake_up_interruptible(&mce_wait);
- if (trigger[0])
- call_usermodehelper(trigger, trigger_argv, NULL,
- UMH_NO_WAIT);
- if (time_after_eq(now, last_print + (check_interval*HZ))) {
- last_print = now;
+ /*
+ * There is no risk of missing notifications because
+ * work_pending is always cleared before the function is
+ * executed.
+ */
+ if (trigger[0] && !work_pending(&mce_trigger_work))
+ schedule_work(&mce_trigger_work);
+
+ if (__ratelimit(&ratelimit))
printk(KERN_INFO "Machine check events logged\n");
- }
return 1;
}
@@ -425,63 +535,78 @@ static struct notifier_block mce_idle_notifier = {
static __init int periodic_mcheck_init(void)
{
- next_interval = check_interval * HZ;
- if (next_interval)
- schedule_delayed_work(&mcheck_work,
- round_jiffies_relative(next_interval));
- idle_notifier_register(&mce_idle_notifier);
- return 0;
+ idle_notifier_register(&mce_idle_notifier);
+ return 0;
}
__initcall(periodic_mcheck_init);
-
/*
* Initialize Machine Checks for a CPU.
*/
-static void mce_init(void *dummy)
+static int mce_cap_init(void)
{
u64 cap;
- int i;
+ unsigned b;
rdmsrl(MSR_IA32_MCG_CAP, cap);
- banks = cap & 0xff;
- if (banks > MCE_EXTENDED_BANK) {
- banks = MCE_EXTENDED_BANK;
- printk(KERN_INFO "MCE: warning: using only %d banks\n",
- MCE_EXTENDED_BANK);
+ b = cap & 0xff;
+ if (b > MAX_NR_BANKS) {
+ printk(KERN_WARNING
+ "MCE: Using only %u machine check banks out of %u\n",
+ MAX_NR_BANKS, b);
+ b = MAX_NR_BANKS;
}
+
+ /* Don't support asymmetric configurations today */
+ WARN_ON(banks != 0 && b != banks);
+ banks = b;
+ if (!bank) {
+ bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
+ if (!bank)
+ return -ENOMEM;
+ memset(bank, 0xff, banks * sizeof(u64));
+ }
+
/* Use accurate RIP reporting if available. */
if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
rip_msr = MSR_IA32_MCG_EIP;
- /* Log the machine checks left over from the previous reset.
- This also clears all registers */
- do_machine_check(NULL, mce_bootlog ? -1 : -2);
+ return 0;
+}
+
+static void mce_init(void *dummy)
+{
+ u64 cap;
+ int i;
+ mce_banks_t all_banks;
+
+ /*
+ * Log the machine checks left over from the previous reset.
+ */
+ bitmap_fill(all_banks, MAX_NR_BANKS);
+ machine_check_poll(MCP_UC, &all_banks);
set_in_cr4(X86_CR4_MCE);
+ rdmsrl(MSR_IA32_MCG_CAP, cap);
if (cap & MCG_CTL_P)
wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
for (i = 0; i < banks; i++) {
- if (i < NR_SYSFS_BANKS)
- wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
- else
- wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL);
-
+ wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
}
}
/* Add per CPU specific workarounds here */
-static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
+static void mce_cpu_quirks(struct cpuinfo_x86 *c)
{
/* This should be disabled by the BIOS, but isn't always */
if (c->x86_vendor == X86_VENDOR_AMD) {
- if(c->x86 == 15)
+ if (c->x86 == 15 && banks > 4)
/* disable GART TBL walk error reporting, which trips off
incorrectly with the IOMMU & 3ware & Cerberus. */
- clear_bit(10, &bank[4]);
+ clear_bit(10, (unsigned long *)&bank[4]);
if(c->x86 <= 17 && mce_bootlog < 0)
/* Lots of broken BIOS around that don't clear them
by default and leave crap in there. Don't log. */
@@ -504,20 +629,38 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
}
}
+static void mce_init_timer(void)
+{
+ struct timer_list *t = &__get_cpu_var(mce_timer);
+
+ /* data race harmless because everyone sets to the same value */
+ if (!next_interval)
+ next_interval = check_interval * HZ;
+ if (!next_interval)
+ return;
+ setup_timer(t, mcheck_timer, smp_processor_id());
+ t->expires = round_jiffies(jiffies + next_interval);
+ add_timer(t);
+}
+
/*
* Called for each booted CPU to set up machine checks.
* Must be called with preempt off.
*/
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
- mce_cpu_quirks(c);
+ if (!mce_available(c))
+ return;
- if (mce_dont_init ||
- !mce_available(c))
+ if (mce_cap_init() < 0) {
+ mce_dont_init = 1;
return;
+ }
+ mce_cpu_quirks(c);
mce_init(NULL);
mce_cpu_features(c);
+ mce_init_timer();
}
/*
@@ -573,7 +716,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
{
unsigned long *cpu_tsc;
static DEFINE_MUTEX(mce_read_mutex);
- unsigned next;
+ unsigned prev, next;
char __user *buf = ubuf;
int i, err;
@@ -592,25 +735,32 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
}
err = 0;
- for (i = 0; i < next; i++) {
- unsigned long start = jiffies;
-
- while (!mcelog.entry[i].finished) {
- if (time_after_eq(jiffies, start + 2)) {
- memset(mcelog.entry + i,0, sizeof(struct mce));
- goto timeout;
+ prev = 0;
+ do {
+ for (i = prev; i < next; i++) {
+ unsigned long start = jiffies;
+
+ while (!mcelog.entry[i].finished) {
+ if (time_after_eq(jiffies, start + 2)) {
+ memset(mcelog.entry + i, 0,
+ sizeof(struct mce));
+ goto timeout;
+ }
+ cpu_relax();
}
- cpu_relax();
+ smp_rmb();
+ err |= copy_to_user(buf, mcelog.entry + i,
+ sizeof(struct mce));
+ buf += sizeof(struct mce);
+timeout:
+ ;
}
- smp_rmb();
- err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
- buf += sizeof(struct mce);
- timeout:
- ;
- }
- memset(mcelog.entry, 0, next * sizeof(struct mce));
- mcelog.next = 0;
+ memset(mcelog.entry + prev, 0,
+ (next - prev) * sizeof(struct mce));
+ prev = next;
+ next = cmpxchg(&mcelog.next, prev, 0);
+ } while (next != prev);
synchronize_sched();
@@ -680,20 +830,6 @@ static struct miscdevice mce_log_device = {
&mce_chrdev_ops,
};
-static unsigned long old_cr4 __initdata;
-
-void __init stop_mce(void)
-{
- old_cr4 = read_cr4();
- clear_in_cr4(X86_CR4_MCE);
-}
-
-void __init restart_mce(void)
-{
- if (old_cr4 & X86_CR4_MCE)
- set_in_cr4(X86_CR4_MCE);
-}
-
/*
* Old style boot options parsing. Only for compatibility.
*/
@@ -703,8 +839,7 @@ static int __init mcheck_disable(char *str)
return 1;
}
-/* mce=off disables machine check. Note you can re-enable it later
- using sysfs.
+/* mce=off disables machine check.
mce=TOLERANCELEVEL (number, see above)
mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
mce=nobootlog Don't log MCEs from before booting. */
@@ -728,6 +863,29 @@ __setup("mce=", mcheck_enable);
* Sysfs support
*/
+/*
+ * Disable machine checks on suspend and shutdown. We can't really handle
+ * them later.
+ */
+static int mce_disable(void)
+{
+ int i;
+
+ for (i = 0; i < banks; i++)
+ wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
+ return 0;
+}
+
+static int mce_suspend(struct sys_device *dev, pm_message_t state)
+{
+ return mce_disable();
+}
+
+static int mce_shutdown(struct sys_device *dev)
+{
+ return mce_disable();
+}
+
/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
Only one CPU is active at this time, the others get readded later using
CPU hotplug. */
@@ -738,20 +896,24 @@ static int mce_resume(struct sys_device *dev)
return 0;
}
+static void mce_cpu_restart(void *data)
+{
+ del_timer_sync(&__get_cpu_var(mce_timer));
+ if (mce_available(&current_cpu_data))
+ mce_init(NULL);
+ mce_init_timer();
+}
+
/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
- if (next_interval)
- cancel_delayed_work(&mcheck_work);
- /* Timer race is harmless here */
- on_each_cpu(mce_init, NULL, 1);
next_interval = check_interval * HZ;
- if (next_interval)
- schedule_delayed_work(&mcheck_work,
- round_jiffies_relative(next_interval));
+ on_each_cpu(mce_cpu_restart, NULL, 1);
}
static struct sysdev_class mce_sysclass = {
+ .suspend = mce_suspend,
+ .shutdown = mce_shutdown,
.resume = mce_resume,
.name = "machinecheck",
};
@@ -778,16 +940,26 @@ void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinit
} \
static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
-/*
- * TBD should generate these dynamically based on number of available banks.
- * Have only 6 contol banks in /sysfs until then.
- */
-ACCESSOR(bank0ctl,bank[0],mce_restart())
-ACCESSOR(bank1ctl,bank[1],mce_restart())
-ACCESSOR(bank2ctl,bank[2],mce_restart())
-ACCESSOR(bank3ctl,bank[3],mce_restart())
-ACCESSOR(bank4ctl,bank[4],mce_restart())
-ACCESSOR(bank5ctl,bank[5],mce_restart())
+static struct sysdev_attribute *bank_attrs;
+
+static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
+ char *buf)
+{
+ u64 b = bank[attr - bank_attrs];
+ return sprintf(buf, "%llx\n", b);
+}
+
+static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
+ const char *buf, size_t siz)
+{
+ char *end;
+ u64 new = simple_strtoull(buf, &end, 0);
+ if (end == buf)
+ return -EINVAL;
+ bank[attr - bank_attrs] = new;
+ mce_restart();
+ return end-buf;
+}
static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
char *buf)
@@ -814,8 +986,6 @@ static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
ACCESSOR(check_interval,check_interval,mce_restart())
static struct sysdev_attribute *mce_attributes[] = {
- &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
- &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
&attr_tolerant.attr, &attr_check_interval, &attr_trigger,
NULL
};
@@ -845,11 +1015,22 @@ static __cpuinit int mce_create_device(unsigned int cpu)
if (err)
goto error;
}
+ for (i = 0; i < banks; i++) {
+ err = sysdev_create_file(&per_cpu(device_mce, cpu),
+ &bank_attrs[i]);
+ if (err)
+ goto error2;
+ }
cpu_set(cpu, mce_device_initialized);
return 0;
+error2:
+ while (--i >= 0) {
+ sysdev_remove_file(&per_cpu(device_mce, cpu),
+ &bank_attrs[i]);
+ }
error:
- while (i--) {
+ while (--i >= 0) {
sysdev_remove_file(&per_cpu(device_mce,cpu),
mce_attributes[i]);
}
@@ -868,15 +1049,46 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
for (i = 0; mce_attributes[i]; i++)
sysdev_remove_file(&per_cpu(device_mce,cpu),
mce_attributes[i]);
+ for (i = 0; i < banks; i++)
+ sysdev_remove_file(&per_cpu(device_mce, cpu),
+ &bank_attrs[i]);
sysdev_unregister(&per_cpu(device_mce,cpu));
cpu_clear(cpu, mce_device_initialized);
}
+/* Make sure there are no machine checks on offlined CPUs. */
+static void mce_disable_cpu(void *h)
+{
+ int i;
+ unsigned long action = *(unsigned long *)h;
+
+ if (!mce_available(&current_cpu_data))
+ return;
+ if (!(action & CPU_TASKS_FROZEN))
+ cmci_clear();
+ for (i = 0; i < banks; i++)
+ wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
+}
+
+static void mce_reenable_cpu(void *h)
+{
+ int i;
+ unsigned long action = *(unsigned long *)h;
+
+ if (!mce_available(&current_cpu_data))
+ return;
+ if (!(action & CPU_TASKS_FROZEN))
+ cmci_reenable();
+ for (i = 0; i < banks; i++)
+ wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
+}
+
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
+ struct timer_list *t = &per_cpu(mce_timer, cpu);
switch (action) {
case CPU_ONLINE:
@@ -891,6 +1103,21 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
threshold_cpu_callback(action, cpu);
mce_remove_device(cpu);
break;
+ case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
+ del_timer_sync(t);
+ smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
+ break;
+ case CPU_DOWN_FAILED:
+ case CPU_DOWN_FAILED_FROZEN:
+ t->expires = round_jiffies(jiffies + next_interval);
+ add_timer_on(t, cpu);
+ smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
+ break;
+ case CPU_POST_DEAD:
+ /* intentionally ignoring frozen here */
+ cmci_rediscover(cpu);
+ break;
}
return NOTIFY_OK;
}
@@ -899,6 +1126,34 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = {
.notifier_call = mce_cpu_callback,
};
+static __init int mce_init_banks(void)
+{
+ int i;
+
+ bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
+ GFP_KERNEL);
+ if (!bank_attrs)
+ return -ENOMEM;
+
+ for (i = 0; i < banks; i++) {
+ struct sysdev_attribute *a = &bank_attrs[i];
+ a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
+ if (!a->attr.name)
+ goto nomem;
+ a->attr.mode = 0644;
+ a->show = show_bank;
+ a->store = set_bank;
+ }
+ return 0;
+
+nomem:
+ while (--i >= 0)
+ kfree(bank_attrs[i].attr.name);
+ kfree(bank_attrs);
+ bank_attrs = NULL;
+ return -ENOMEM;
+}
+
static __init int mce_init_device(void)
{
int err;
@@ -906,6 +1161,11 @@ static __init int mce_init_device(void)
if (!mce_available(&boot_cpu_data))
return -EIO;
+
+ err = mce_init_banks();
+ if (err)
+ return err;
+
err = sysdev_class_register(&mce_sysclass);
if (err)
return err;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 9817506dd46..7d01be86887 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -79,6 +79,8 @@ static unsigned char shared_bank[NR_BANKS] = {
static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
+static void amd_threshold_interrupt(void);
+
/*
* CPU Initialization
*/
@@ -90,7 +92,8 @@ struct thresh_restart {
};
/* must be called with correct cpu affinity */
-static long threshold_restart_bank(void *_tr)
+/* Called via smp_call_function_single() */
+static void threshold_restart_bank(void *_tr)
{
struct thresh_restart *tr = _tr;
u32 mci_misc_hi, mci_misc_lo;
@@ -117,7 +120,6 @@ static long threshold_restart_bank(void *_tr)
mci_misc_hi |= MASK_COUNT_EN_HI;
wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
- return 0;
}
/* cpu init entry point, called from mce.c with preempt off */
@@ -174,6 +176,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
tr.reset = 0;
tr.old_limit = 0;
threshold_restart_bank(&tr);
+
+ mce_threshold_vector = amd_threshold_interrupt;
}
}
}
@@ -187,19 +191,13 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
* the interrupt goes off when error_count reaches threshold_limit.
* the handler will simply log mcelog w/ software defined bank number.
*/
-asmlinkage void mce_threshold_interrupt(void)
+static void amd_threshold_interrupt(void)
{
unsigned int bank, block;
struct mce m;
u32 low = 0, high = 0, address = 0;
- ack_APIC_irq();
- exit_idle();
- irq_enter();
-
- memset(&m, 0, sizeof(m));
- rdtscll(m.tsc);
- m.cpu = smp_processor_id();
+ mce_setup(&m);
/* assume first bank caused it */
for (bank = 0; bank < NR_BANKS; ++bank) {
@@ -233,7 +231,8 @@ asmlinkage void mce_threshold_interrupt(void)
/* Log the machine check that caused the threshold
event. */
- do_machine_check(NULL, 0);
+ machine_check_poll(MCP_TIMESTAMP,
+ &__get_cpu_var(mce_poll_banks));
if (high & MASK_OVERFLOW_HI) {
rdmsrl(address, m.misc);
@@ -243,13 +242,10 @@ asmlinkage void mce_threshold_interrupt(void)
+ bank * NR_BLOCKS
+ block;
mce_log(&m);
- goto out;
+ return;
}
}
}
-out:
- inc_irq_stat(irq_threshold_count);
- irq_exit();
}
/*
@@ -283,7 +279,7 @@ static ssize_t store_interrupt_enable(struct threshold_block *b,
tr.b = b;
tr.reset = 0;
tr.old_limit = 0;
- work_on_cpu(b->cpu, threshold_restart_bank, &tr);
+ smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
return end - buf;
}
@@ -305,23 +301,32 @@ static ssize_t store_threshold_limit(struct threshold_block *b,
tr.b = b;
tr.reset = 0;
- work_on_cpu(b->cpu, threshold_restart_bank, &tr);
+ smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
return end - buf;
}
-static long local_error_count(void *_b)
+struct threshold_block_cross_cpu {
+ struct threshold_block *tb;
+ long retval;
+};
+
+static void local_error_count_handler(void *_tbcc)
{
- struct threshold_block *b = _b;
+ struct threshold_block_cross_cpu *tbcc = _tbcc;
+ struct threshold_block *b = tbcc->tb;
u32 low, high;
rdmsr(b->address, low, high);
- return (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit);
+ tbcc->retval = (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit);
}
static ssize_t show_error_count(struct threshold_block *b, char *buf)
{
- return sprintf(buf, "%lx\n", work_on_cpu(b->cpu, local_error_count, b));
+ struct threshold_block_cross_cpu tbcc = { .tb = b, };
+
+ smp_call_function_single(b->cpu, local_error_count_handler, &tbcc, 1);
+ return sprintf(buf, "%lx\n", tbcc.retval);
}
static ssize_t store_error_count(struct threshold_block *b,
@@ -329,7 +334,7 @@ static ssize_t store_error_count(struct threshold_block *b,
{
struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 };
- work_on_cpu(b->cpu, threshold_restart_bank, &tr);
+ smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
return 1;
}
@@ -398,7 +403,7 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
if ((bank >= NR_BANKS) || (block >= NR_BLOCKS))
return 0;
- if (rdmsr_safe(address, &low, &high))
+ if (rdmsr_safe_on_cpu(cpu, address, &low, &high))
return 0;
if (!(high & MASK_VALID_HI)) {
@@ -462,12 +467,11 @@ out_free:
return err;
}
-static __cpuinit long local_allocate_threshold_blocks(void *_bank)
+static __cpuinit long
+local_allocate_threshold_blocks(int cpu, unsigned int bank)
{
- unsigned int *bank = _bank;
-
- return allocate_threshold_blocks(smp_processor_id(), *bank, 0,
- MSR_IA32_MC0_MISC + *bank * 4);
+ return allocate_threshold_blocks(cpu, bank, 0,
+ MSR_IA32_MC0_MISC + bank * 4);
}
/* symlinks sibling shared banks to first core. first core owns dir/files. */
@@ -530,7 +534,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
per_cpu(threshold_banks, cpu)[bank] = b;
- err = work_on_cpu(cpu, local_allocate_threshold_blocks, &bank);
+ err = local_allocate_threshold_blocks(cpu, bank);
if (err)
goto out_free;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index aa5e287c98e..57df3d38347 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -1,6 +1,8 @@
/*
* Intel specific MCE features.
* Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
+ * Copyright (C) 2008, 2009 Intel Corporation
+ * Author: Andi Kleen
*/
#include <linux/init.h>
@@ -13,6 +15,7 @@
#include <asm/hw_irq.h>
#include <asm/idle.h>
#include <asm/therm_throt.h>
+#include <asm/apic.h>
asmlinkage void smp_thermal_interrupt(void)
{
@@ -25,7 +28,7 @@ asmlinkage void smp_thermal_interrupt(void)
rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
if (therm_throt_process(msr_val & 1))
- mce_log_therm_throt_event(smp_processor_id(), msr_val);
+ mce_log_therm_throt_event(msr_val);
inc_irq_stat(irq_thermal_count);
irq_exit();
@@ -85,7 +88,209 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
return;
}
+/*
+ * Support for Intel Correct Machine Check Interrupts. This allows
+ * the CPU to raise an interrupt when a corrected machine check happened.
+ * Normally we pick those up using a regular polling timer.
+ * Also supports reliable discovery of shared banks.
+ */
+
+static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
+
+/*
+ * cmci_discover_lock protects against parallel discovery attempts
+ * which could race against each other.
+ */
+static DEFINE_SPINLOCK(cmci_discover_lock);
+
+#define CMCI_THRESHOLD 1
+
+static int cmci_supported(int *banks)
+{
+ u64 cap;
+
+ /*
+ * Vendor check is not strictly needed, but the initial
+ * initialization is vendor keyed and this
+ * makes sure none of the backdoors are entered otherwise.
+ */
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return 0;
+ if (!cpu_has_apic || lapic_get_maxlvt() < 6)
+ return 0;
+ rdmsrl(MSR_IA32_MCG_CAP, cap);
+ *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff);
+ return !!(cap & MCG_CMCI_P);
+}
+
+/*
+ * The interrupt handler. This is called on every event.
+ * Just call the poller directly to log any events.
+ * This could in theory increase the threshold under high load,
+ * but doesn't for now.
+ */
+static void intel_threshold_interrupt(void)
+{
+ machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
+ mce_notify_user();
+}
+
+static void print_update(char *type, int *hdr, int num)
+{
+ if (*hdr == 0)
+ printk(KERN_INFO "CPU %d MCA banks", smp_processor_id());
+ *hdr = 1;
+ printk(KERN_CONT " %s:%d", type, num);
+}
+
+/*
+ * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
+ * on this CPU. Use the algorithm recommended in the SDM to discover shared
+ * banks.
+ */
+static void cmci_discover(int banks, int boot)
+{
+ unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);
+ int hdr = 0;
+ int i;
+
+ spin_lock(&cmci_discover_lock);
+ for (i = 0; i < banks; i++) {
+ u64 val;
+
+ if (test_bit(i, owned))
+ continue;
+
+ rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
+
+ /* Already owned by someone else? */
+ if (val & CMCI_EN) {
+ if (test_and_clear_bit(i, owned) || boot)
+ print_update("SHD", &hdr, i);
+ __clear_bit(i, __get_cpu_var(mce_poll_banks));
+ continue;
+ }
+
+ val |= CMCI_EN | CMCI_THRESHOLD;
+ wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
+ rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
+
+ /* Did the enable bit stick? -- the bank supports CMCI */
+ if (val & CMCI_EN) {
+ if (!test_and_set_bit(i, owned) || boot)
+ print_update("CMCI", &hdr, i);
+ __clear_bit(i, __get_cpu_var(mce_poll_banks));
+ } else {
+ WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
+ }
+ }
+ spin_unlock(&cmci_discover_lock);
+ if (hdr)
+ printk(KERN_CONT "\n");
+}
+
+/*
+ * Just in case we missed an event during initialization check
+ * all the CMCI owned banks.
+ */
+void cmci_recheck(void)
+{
+ unsigned long flags;
+ int banks;
+
+ if (!mce_available(&current_cpu_data) || !cmci_supported(&banks))
+ return;
+ local_irq_save(flags);
+ machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
+ local_irq_restore(flags);
+}
+
+/*
+ * Disable CMCI on this CPU for all banks it owns when it goes down.
+ * This allows other CPUs to claim the banks on rediscovery.
+ */
+void cmci_clear(void)
+{
+ int i;
+ int banks;
+ u64 val;
+
+ if (!cmci_supported(&banks))
+ return;
+ spin_lock(&cmci_discover_lock);
+ for (i = 0; i < banks; i++) {
+ if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
+ continue;
+ /* Disable CMCI */
+ rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
+ val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK);
+ wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
+ __clear_bit(i, __get_cpu_var(mce_banks_owned));
+ }
+ spin_unlock(&cmci_discover_lock);
+}
+
+/*
+ * After a CPU went down cycle through all the others and rediscover
+ * Must run in process context.
+ */
+void cmci_rediscover(int dying)
+{
+ int banks;
+ int cpu;
+ cpumask_var_t old;
+
+ if (!cmci_supported(&banks))
+ return;
+ if (!alloc_cpumask_var(&old, GFP_KERNEL))
+ return;
+ cpumask_copy(old, &current->cpus_allowed);
+
+ for_each_online_cpu (cpu) {
+ if (cpu == dying)
+ continue;
+ if (set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)))
+ continue;
+ /* Recheck banks in case CPUs don't all have the same */
+ if (cmci_supported(&banks))
+ cmci_discover(banks, 0);
+ }
+
+ set_cpus_allowed_ptr(current, old);
+ free_cpumask_var(old);
+}
+
+/*
+ * Reenable CMCI on this CPU in case a CPU down failed.
+ */
+void cmci_reenable(void)
+{
+ int banks;
+ if (cmci_supported(&banks))
+ cmci_discover(banks, 0);
+}
+
+static void intel_init_cmci(void)
+{
+ int banks;
+
+ if (!cmci_supported(&banks))
+ return;
+
+ mce_threshold_vector = intel_threshold_interrupt;
+ cmci_discover(banks, 1);
+ /*
+ * For CPU #0 this runs with still disabled APIC, but that's
+ * ok because only the vector is set up. We still do another
+ * check for the banks later for CPU #0 just to make sure
+ * to not miss any events.
+ */
+ apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED);
+ cmci_recheck();
+}
+
void mce_intel_feature_init(struct cpuinfo_x86 *c)
{
intel_init_thermal(c);
+ intel_init_cmci();
}
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
new file mode 100644
index 00000000000..23ee9e730f7
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -0,0 +1,29 @@
+/*
+ * Common corrected MCE threshold handler code:
+ */
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+
+#include <asm/irq_vectors.h>
+#include <asm/apic.h>
+#include <asm/idle.h>
+#include <asm/mce.h>
+
+static void default_threshold_interrupt(void)
+{
+ printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n",
+ THRESHOLD_APIC_VECTOR);
+}
+
+void (*mce_threshold_vector)(void) = default_threshold_interrupt;
+
+asmlinkage void mce_threshold_interrupt(void)
+{
+ exit_idle();
+ irq_enter();
+ inc_irq_stat(irq_threshold_count);
+ mce_threshold_vector();
+ irq_exit();
+ /* Ack only at the end to avoid potential reentry */
+ ack_APIC_irq();
+}
diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile
index 191fc053364..f4361b56f8e 100644
--- a/arch/x86/kernel/cpu/mtrr/Makefile
+++ b/arch/x86/kernel/cpu/mtrr/Makefile
@@ -1,3 +1,3 @@
-obj-y := main.o if.o generic.o state.o
+obj-y := main.o if.o generic.o state.o cleanup.o
obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
new file mode 100644
index 00000000000..ce0fe4b5c04
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -0,0 +1,1101 @@
+/* MTRR (Memory Type Range Register) cleanup
+
+ Copyright (C) 2009 Yinghai Lu
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Library General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Library General Public License for more details.
+
+ You should have received a copy of the GNU Library General Public
+ License along with this library; if not, write to the Free
+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/smp.h>
+#include <linux/cpu.h>
+#include <linux/mutex.h>
+#include <linux/sort.h>
+
+#include <asm/e820.h>
+#include <asm/mtrr.h>
+#include <asm/uaccess.h>
+#include <asm/processor.h>
+#include <asm/msr.h>
+#include <asm/kvm_para.h>
+#include "mtrr.h"
+
+/* should be related to MTRR_VAR_RANGES nums */
+#define RANGE_NUM 256
+
+struct res_range {
+ unsigned long start;
+ unsigned long end;
+};
+
+static int __init
+add_range(struct res_range *range, int nr_range, unsigned long start,
+ unsigned long end)
+{
+ /* out of slots */
+ if (nr_range >= RANGE_NUM)
+ return nr_range;
+
+ range[nr_range].start = start;
+ range[nr_range].end = end;
+
+ nr_range++;
+
+ return nr_range;
+}
+
+static int __init
+add_range_with_merge(struct res_range *range, int nr_range, unsigned long start,
+ unsigned long end)
+{
+ int i;
+
+ /* try to merge it with old one */
+ for (i = 0; i < nr_range; i++) {
+ unsigned long final_start, final_end;
+ unsigned long common_start, common_end;
+
+ if (!range[i].end)
+ continue;
+
+ common_start = max(range[i].start, start);
+ common_end = min(range[i].end, end);
+ if (common_start > common_end + 1)
+ continue;
+
+ final_start = min(range[i].start, start);
+ final_end = max(range[i].end, end);
+
+ range[i].start = final_start;
+ range[i].end = final_end;
+ return nr_range;
+ }
+
+ /* need to add that */
+ return add_range(range, nr_range, start, end);
+}
+
+static void __init
+subtract_range(struct res_range *range, unsigned long start, unsigned long end)
+{
+ int i, j;
+
+ for (j = 0; j < RANGE_NUM; j++) {
+ if (!range[j].end)
+ continue;
+
+ if (start <= range[j].start && end >= range[j].end) {
+ range[j].start = 0;
+ range[j].end = 0;
+ continue;
+ }
+
+ if (start <= range[j].start && end < range[j].end &&
+ range[j].start < end + 1) {
+ range[j].start = end + 1;
+ continue;
+ }
+
+
+ if (start > range[j].start && end >= range[j].end &&
+ range[j].end > start - 1) {
+ range[j].end = start - 1;
+ continue;
+ }
+
+ if (start > range[j].start && end < range[j].end) {
+ /* find the new spare */
+ for (i = 0; i < RANGE_NUM; i++) {
+ if (range[i].end == 0)
+ break;
+ }
+ if (i < RANGE_NUM) {
+ range[i].end = range[j].end;
+ range[i].start = end + 1;
+ } else {
+ printk(KERN_ERR "run of slot in ranges\n");
+ }
+ range[j].end = start - 1;
+ continue;
+ }
+ }
+}
+
+static int __init cmp_range(const void *x1, const void *x2)
+{
+ const struct res_range *r1 = x1;
+ const struct res_range *r2 = x2;
+ long start1, start2;
+
+ start1 = r1->start;
+ start2 = r2->start;
+
+ return start1 - start2;
+}
+
+struct var_mtrr_range_state {
+ unsigned long base_pfn;
+ unsigned long size_pfn;
+ mtrr_type type;
+};
+
+static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
+static int __initdata debug_print;
+
+static int __init
+x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
+ unsigned long extra_remove_base,
+ unsigned long extra_remove_size)
+{
+ unsigned long base, size;
+ mtrr_type type;
+ int i;
+
+ for (i = 0; i < num_var_ranges; i++) {
+ type = range_state[i].type;
+ if (type != MTRR_TYPE_WRBACK)
+ continue;
+ base = range_state[i].base_pfn;
+ size = range_state[i].size_pfn;
+ nr_range = add_range_with_merge(range, nr_range, base,
+ base + size - 1);
+ }
+ if (debug_print) {
+ printk(KERN_DEBUG "After WB checking\n");
+ for (i = 0; i < nr_range; i++)
+ printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
+ range[i].start, range[i].end + 1);
+ }
+
+ /* take out UC ranges */
+ for (i = 0; i < num_var_ranges; i++) {
+ type = range_state[i].type;
+ if (type != MTRR_TYPE_UNCACHABLE &&
+ type != MTRR_TYPE_WRPROT)
+ continue;
+ size = range_state[i].size_pfn;
+ if (!size)
+ continue;
+ base = range_state[i].base_pfn;
+ if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed &&
+ (mtrr_state.enabled & 1)) {
+ /* Var MTRR contains UC entry below 1M? Skip it: */
+ printk(KERN_WARNING "WARNING: BIOS bug: VAR MTRR %d "
+ "contains strange UC entry under 1M, check "
+ "with your system vendor!\n", i);
+ if (base + size <= (1<<(20-PAGE_SHIFT)))
+ continue;
+ size -= (1<<(20-PAGE_SHIFT)) - base;
+ base = 1<<(20-PAGE_SHIFT);
+ }
+ subtract_range(range, base, base + size - 1);
+ }
+ if (extra_remove_size)
+ subtract_range(range, extra_remove_base,
+ extra_remove_base + extra_remove_size - 1);
+
+ /* get new range num */
+ nr_range = 0;
+ for (i = 0; i < RANGE_NUM; i++) {
+ if (!range[i].end)
+ continue;
+ nr_range++;
+ }
+ if (debug_print) {
+ printk(KERN_DEBUG "After UC checking\n");
+ for (i = 0; i < nr_range; i++)
+ printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
+ range[i].start, range[i].end + 1);
+ }
+
+ /* sort the ranges */
+ sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
+ if (debug_print) {
+ printk(KERN_DEBUG "After sorting\n");
+ for (i = 0; i < nr_range; i++)
+ printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
+ range[i].start, range[i].end + 1);
+ }
+
+ /* clear those is not used */
+ for (i = nr_range; i < RANGE_NUM; i++)
+ memset(&range[i], 0, sizeof(range[i]));
+
+ return nr_range;
+}
+
+static struct res_range __initdata range[RANGE_NUM];
+static int __initdata nr_range;
+
+#ifdef CONFIG_MTRR_SANITIZER
+
+static unsigned long __init sum_ranges(struct res_range *range, int nr_range)
+{
+ unsigned long sum;
+ int i;
+
+ sum = 0;
+ for (i = 0; i < nr_range; i++)
+ sum += range[i].end + 1 - range[i].start;
+
+ return sum;
+}
+
+static int enable_mtrr_cleanup __initdata =
+ CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT;
+
+static int __init disable_mtrr_cleanup_setup(char *str)
+{
+ enable_mtrr_cleanup = 0;
+ return 0;
+}
+early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup);
+
+static int __init enable_mtrr_cleanup_setup(char *str)
+{
+ enable_mtrr_cleanup = 1;
+ return 0;
+}
+early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
+
+static int __init mtrr_cleanup_debug_setup(char *str)
+{
+ debug_print = 1;
+ return 0;
+}
+early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
+
+struct var_mtrr_state {
+ unsigned long range_startk;
+ unsigned long range_sizek;
+ unsigned long chunk_sizek;
+ unsigned long gran_sizek;
+ unsigned int reg;
+};
+
+static void __init
+set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
+ unsigned char type, unsigned int address_bits)
+{
+ u32 base_lo, base_hi, mask_lo, mask_hi;
+ u64 base, mask;
+
+ if (!sizek) {
+ fill_mtrr_var_range(reg, 0, 0, 0, 0);
+ return;
+ }
+
+ mask = (1ULL << address_bits) - 1;
+ mask &= ~((((u64)sizek) << 10) - 1);
+
+ base = ((u64)basek) << 10;
+
+ base |= type;
+ mask |= 0x800;
+
+ base_lo = base & ((1ULL<<32) - 1);
+ base_hi = base >> 32;
+
+ mask_lo = mask & ((1ULL<<32) - 1);
+ mask_hi = mask >> 32;
+
+ fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi);
+}
+
+static void __init
+save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
+ unsigned char type)
+{
+ range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10);
+ range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10);
+ range_state[reg].type = type;
+}
+
+static void __init
+set_var_mtrr_all(unsigned int address_bits)
+{
+ unsigned long basek, sizek;
+ unsigned char type;
+ unsigned int reg;
+
+ for (reg = 0; reg < num_var_ranges; reg++) {
+ basek = range_state[reg].base_pfn << (PAGE_SHIFT - 10);
+ sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10);
+ type = range_state[reg].type;
+
+ set_var_mtrr(reg, basek, sizek, type, address_bits);
+ }
+}
+
+static unsigned long to_size_factor(unsigned long sizek, char *factorp)
+{
+ char factor;
+ unsigned long base = sizek;
+
+ if (base & ((1<<10) - 1)) {
+ /* not MB alignment */
+ factor = 'K';
+ } else if (base & ((1<<20) - 1)) {
+ factor = 'M';
+ base >>= 10;
+ } else {
+ factor = 'G';
+ base >>= 20;
+ }
+
+ *factorp = factor;
+
+ return base;
+}
+
+static unsigned int __init
+range_to_mtrr(unsigned int reg, unsigned long range_startk,
+ unsigned long range_sizek, unsigned char type)
+{
+ if (!range_sizek || (reg >= num_var_ranges))
+ return reg;
+
+ while (range_sizek) {
+ unsigned long max_align, align;
+ unsigned long sizek;
+
+ /* Compute the maximum size I can make a range */
+ if (range_startk)
+ max_align = ffs(range_startk) - 1;
+ else
+ max_align = 32;
+ align = fls(range_sizek) - 1;
+ if (align > max_align)
+ align = max_align;
+
+ sizek = 1 << align;
+ if (debug_print) {
+ char start_factor = 'K', size_factor = 'K';
+ unsigned long start_base, size_base;
+
+ start_base = to_size_factor(range_startk,
+ &start_factor),
+ size_base = to_size_factor(sizek, &size_factor),
+
+ printk(KERN_DEBUG "Setting variable MTRR %d, "
+ "base: %ld%cB, range: %ld%cB, type %s\n",
+ reg, start_base, start_factor,
+ size_base, size_factor,
+ (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
+ ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other")
+ );
+ }
+ save_var_mtrr(reg++, range_startk, sizek, type);
+ range_startk += sizek;
+ range_sizek -= sizek;
+ if (reg >= num_var_ranges)
+ break;
+ }
+ return reg;
+}
+
+static unsigned __init
+range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
+ unsigned long sizek)
+{
+ unsigned long hole_basek, hole_sizek;
+ unsigned long second_basek, second_sizek;
+ unsigned long range0_basek, range0_sizek;
+ unsigned long range_basek, range_sizek;
+ unsigned long chunk_sizek;
+ unsigned long gran_sizek;
+
+ hole_basek = 0;
+ hole_sizek = 0;
+ second_basek = 0;
+ second_sizek = 0;
+ chunk_sizek = state->chunk_sizek;
+ gran_sizek = state->gran_sizek;
+
+ /* align with gran size, prevent small block used up MTRRs */
+ range_basek = ALIGN(state->range_startk, gran_sizek);
+ if ((range_basek > basek) && basek)
+ return second_sizek;
+ state->range_sizek -= (range_basek - state->range_startk);
+ range_sizek = ALIGN(state->range_sizek, gran_sizek);
+
+ while (range_sizek > state->range_sizek) {
+ range_sizek -= gran_sizek;
+ if (!range_sizek)
+ return 0;
+ }
+ state->range_sizek = range_sizek;
+
+ /* try to append some small hole */
+ range0_basek = state->range_startk;
+ range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
+
+ /* no increase */
+ if (range0_sizek == state->range_sizek) {
+ if (debug_print)
+ printk(KERN_DEBUG "rangeX: %016lx - %016lx\n",
+ range0_basek<<10,
+ (range0_basek + state->range_sizek)<<10);
+ state->reg = range_to_mtrr(state->reg, range0_basek,
+ state->range_sizek, MTRR_TYPE_WRBACK);
+ return 0;
+ }
+
+ /* only cut back, when it is not the last */
+ if (sizek) {
+ while (range0_basek + range0_sizek > (basek + sizek)) {
+ if (range0_sizek >= chunk_sizek)
+ range0_sizek -= chunk_sizek;
+ else
+ range0_sizek = 0;
+
+ if (!range0_sizek)
+ break;
+ }
+ }
+
+second_try:
+ range_basek = range0_basek + range0_sizek;
+
+ /* one hole in the middle */
+ if (range_basek > basek && range_basek <= (basek + sizek))
+ second_sizek = range_basek - basek;
+
+ if (range0_sizek > state->range_sizek) {
+
+ /* one hole in middle or at end */
+ hole_sizek = range0_sizek - state->range_sizek - second_sizek;
+
+ /* hole size should be less than half of range0 size */
+ if (hole_sizek >= (range0_sizek >> 1) &&
+ range0_sizek >= chunk_sizek) {
+ range0_sizek -= chunk_sizek;
+ second_sizek = 0;
+ hole_sizek = 0;
+
+ goto second_try;
+ }
+ }
+
+ if (range0_sizek) {
+ if (debug_print)
+ printk(KERN_DEBUG "range0: %016lx - %016lx\n",
+ range0_basek<<10,
+ (range0_basek + range0_sizek)<<10);
+ state->reg = range_to_mtrr(state->reg, range0_basek,
+ range0_sizek, MTRR_TYPE_WRBACK);
+ }
+
+ if (range0_sizek < state->range_sizek) {
+ /* need to handle left over */
+ range_sizek = state->range_sizek - range0_sizek;
+
+ if (debug_print)
+ printk(KERN_DEBUG "range: %016lx - %016lx\n",
+ range_basek<<10,
+ (range_basek + range_sizek)<<10);
+ state->reg = range_to_mtrr(state->reg, range_basek,
+ range_sizek, MTRR_TYPE_WRBACK);
+ }
+
+ if (hole_sizek) {
+ hole_basek = range_basek - hole_sizek - second_sizek;
+ if (debug_print)
+ printk(KERN_DEBUG "hole: %016lx - %016lx\n",
+ hole_basek<<10,
+ (hole_basek + hole_sizek)<<10);
+ state->reg = range_to_mtrr(state->reg, hole_basek,
+ hole_sizek, MTRR_TYPE_UNCACHABLE);
+ }
+
+ return second_sizek;
+}
+
+static void __init
+set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn,
+ unsigned long size_pfn)
+{
+ unsigned long basek, sizek;
+ unsigned long second_sizek = 0;
+
+ if (state->reg >= num_var_ranges)
+ return;
+
+ basek = base_pfn << (PAGE_SHIFT - 10);
+ sizek = size_pfn << (PAGE_SHIFT - 10);
+
+ /* See if I can merge with the last range */
+ if ((basek <= 1024) ||
+ (state->range_startk + state->range_sizek == basek)) {
+ unsigned long endk = basek + sizek;
+ state->range_sizek = endk - state->range_startk;
+ return;
+ }
+ /* Write the range mtrrs */
+ if (state->range_sizek != 0)
+ second_sizek = range_to_mtrr_with_hole(state, basek, sizek);
+
+ /* Allocate an msr */
+ state->range_startk = basek + second_sizek;
+ state->range_sizek = sizek - second_sizek;
+}
+
+/* mininum size of mtrr block that can take hole */
+static u64 mtrr_chunk_size __initdata = (256ULL<<20);
+
+static int __init parse_mtrr_chunk_size_opt(char *p)
+{
+ if (!p)
+ return -EINVAL;
+ mtrr_chunk_size = memparse(p, &p);
+ return 0;
+}
+early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt);
+
+/* granity of mtrr of block */
+static u64 mtrr_gran_size __initdata;
+
+static int __init parse_mtrr_gran_size_opt(char *p)
+{
+ if (!p)
+ return -EINVAL;
+ mtrr_gran_size = memparse(p, &p);
+ return 0;
+}
+early_param("mtrr_gran_size", parse_mtrr_gran_size_opt);
+
+static int nr_mtrr_spare_reg __initdata =
+ CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT;
+
+static int __init parse_mtrr_spare_reg(char *arg)
+{
+ if (arg)
+ nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0);
+ return 0;
+}
+
+early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
+
+static int __init
+x86_setup_var_mtrrs(struct res_range *range, int nr_range,
+ u64 chunk_size, u64 gran_size)
+{
+ struct var_mtrr_state var_state;
+ int i;
+ int num_reg;
+
+ var_state.range_startk = 0;
+ var_state.range_sizek = 0;
+ var_state.reg = 0;
+ var_state.chunk_sizek = chunk_size >> 10;
+ var_state.gran_sizek = gran_size >> 10;
+
+ memset(range_state, 0, sizeof(range_state));
+
+ /* Write the range etc */
+ for (i = 0; i < nr_range; i++)
+ set_var_mtrr_range(&var_state, range[i].start,
+ range[i].end - range[i].start + 1);
+
+ /* Write the last range */
+ if (var_state.range_sizek != 0)
+ range_to_mtrr_with_hole(&var_state, 0, 0);
+
+ num_reg = var_state.reg;
+ /* Clear out the extra MTRR's */
+ while (var_state.reg < num_var_ranges) {
+ save_var_mtrr(var_state.reg, 0, 0, 0);
+ var_state.reg++;
+ }
+
+ return num_reg;
+}
+
+struct mtrr_cleanup_result {
+ unsigned long gran_sizek;
+ unsigned long chunk_sizek;
+ unsigned long lose_cover_sizek;
+ unsigned int num_reg;
+ int bad;
+};
+
+/*
+ * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G
+ * chunk size: gran_size, ..., 2G
+ * so we need (1+16)*8
+ */
+#define NUM_RESULT 136
+#define PSHIFT (PAGE_SHIFT - 10)
+
+static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
+static unsigned long __initdata min_loss_pfn[RANGE_NUM];
+
+static void __init print_out_mtrr_range_state(void)
+{
+ int i;
+ char start_factor = 'K', size_factor = 'K';
+ unsigned long start_base, size_base;
+ mtrr_type type;
+
+ for (i = 0; i < num_var_ranges; i++) {
+
+ size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
+ if (!size_base)
+ continue;
+
+ size_base = to_size_factor(size_base, &size_factor),
+ start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
+ start_base = to_size_factor(start_base, &start_factor),
+ type = range_state[i].type;
+
+ printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
+ i, start_base, start_factor,
+ size_base, size_factor,
+ (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
+ ((type == MTRR_TYPE_WRPROT) ? "WP" :
+ ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
+ );
+ }
+}
+
+static int __init mtrr_need_cleanup(void)
+{
+ int i;
+ mtrr_type type;
+ unsigned long size;
+ /* extra one for all 0 */
+ int num[MTRR_NUM_TYPES + 1];
+
+ /* check entries number */
+ memset(num, 0, sizeof(num));
+ for (i = 0; i < num_var_ranges; i++) {
+ type = range_state[i].type;
+ size = range_state[i].size_pfn;
+ if (type >= MTRR_NUM_TYPES)
+ continue;
+ if (!size)
+ type = MTRR_NUM_TYPES;
+ if (type == MTRR_TYPE_WRPROT)
+ type = MTRR_TYPE_UNCACHABLE;
+ num[type]++;
+ }
+
+ /* check if we got UC entries */
+ if (!num[MTRR_TYPE_UNCACHABLE])
+ return 0;
+
+ /* check if we only had WB and UC */
+ if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
+ num_var_ranges - num[MTRR_NUM_TYPES])
+ return 0;
+
+ return 1;
+}
+
+static unsigned long __initdata range_sums;
+static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
+ unsigned long extra_remove_base,
+ unsigned long extra_remove_size,
+ int i)
+{
+ int num_reg;
+ static struct res_range range_new[RANGE_NUM];
+ static int nr_range_new;
+ unsigned long range_sums_new;
+
+ /* convert ranges to var ranges state */
+ num_reg = x86_setup_var_mtrrs(range, nr_range,
+ chunk_size, gran_size);
+
+ /* we got new setting in range_state, check it */
+ memset(range_new, 0, sizeof(range_new));
+ nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
+ extra_remove_base, extra_remove_size);
+ range_sums_new = sum_ranges(range_new, nr_range_new);
+
+ result[i].chunk_sizek = chunk_size >> 10;
+ result[i].gran_sizek = gran_size >> 10;
+ result[i].num_reg = num_reg;
+ if (range_sums < range_sums_new) {
+ result[i].lose_cover_sizek =
+ (range_sums_new - range_sums) << PSHIFT;
+ result[i].bad = 1;
+ } else
+ result[i].lose_cover_sizek =
+ (range_sums - range_sums_new) << PSHIFT;
+
+ /* double check it */
+ if (!result[i].bad && !result[i].lose_cover_sizek) {
+ if (nr_range_new != nr_range ||
+ memcmp(range, range_new, sizeof(range)))
+ result[i].bad = 1;
+ }
+
+ if (!result[i].bad && (range_sums - range_sums_new <
+ min_loss_pfn[num_reg])) {
+ min_loss_pfn[num_reg] =
+ range_sums - range_sums_new;
+ }
+}
+
+static void __init mtrr_print_out_one_result(int i)
+{
+ char gran_factor, chunk_factor, lose_factor;
+ unsigned long gran_base, chunk_base, lose_base;
+
+ gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
+ chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
+ lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
+ printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
+ result[i].bad ? "*BAD*" : " ",
+ gran_base, gran_factor, chunk_base, chunk_factor);
+ printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
+ result[i].num_reg, result[i].bad ? "-" : "",
+ lose_base, lose_factor);
+}
+
+static int __init mtrr_search_optimal_index(void)
+{
+ int i;
+ int num_reg_good;
+ int index_good;
+
+ if (nr_mtrr_spare_reg >= num_var_ranges)
+ nr_mtrr_spare_reg = num_var_ranges - 1;
+ num_reg_good = -1;
+ for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
+ if (!min_loss_pfn[i])
+ num_reg_good = i;
+ }
+
+ index_good = -1;
+ if (num_reg_good != -1) {
+ for (i = 0; i < NUM_RESULT; i++) {
+ if (!result[i].bad &&
+ result[i].num_reg == num_reg_good &&
+ !result[i].lose_cover_sizek) {
+ index_good = i;
+ break;
+ }
+ }
+ }
+
+ return index_good;
+}
+
+
+int __init mtrr_cleanup(unsigned address_bits)
+{
+ unsigned long extra_remove_base, extra_remove_size;
+ unsigned long base, size, def, dummy;
+ mtrr_type type;
+ u64 chunk_size, gran_size;
+ int index_good;
+ int i;
+
+ if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
+ return 0;
+ rdmsr(MTRRdefType_MSR, def, dummy);
+ def &= 0xff;
+ if (def != MTRR_TYPE_UNCACHABLE)
+ return 0;
+
+ /* get it and store it aside */
+ memset(range_state, 0, sizeof(range_state));
+ for (i = 0; i < num_var_ranges; i++) {
+ mtrr_if->get(i, &base, &size, &type);
+ range_state[i].base_pfn = base;
+ range_state[i].size_pfn = size;
+ range_state[i].type = type;
+ }
+
+ /* check if we need handle it and can handle it */
+ if (!mtrr_need_cleanup())
+ return 0;
+
+ /* print original var MTRRs at first, for debugging: */
+ printk(KERN_DEBUG "original variable MTRRs\n");
+ print_out_mtrr_range_state();
+
+ memset(range, 0, sizeof(range));
+ extra_remove_size = 0;
+ extra_remove_base = 1 << (32 - PAGE_SHIFT);
+ if (mtrr_tom2)
+ extra_remove_size =
+ (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base;
+ nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base,
+ extra_remove_size);
+ /*
+ * [0, 1M) should always be coverred by var mtrr with WB
+ * and fixed mtrrs should take effective before var mtrr for it
+ */
+ nr_range = add_range_with_merge(range, nr_range, 0,
+ (1ULL<<(20 - PAGE_SHIFT)) - 1);
+ /* sort the ranges */
+ sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
+
+ range_sums = sum_ranges(range, nr_range);
+ printk(KERN_INFO "total RAM coverred: %ldM\n",
+ range_sums >> (20 - PAGE_SHIFT));
+
+ if (mtrr_chunk_size && mtrr_gran_size) {
+ i = 0;
+ mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size,
+ extra_remove_base, extra_remove_size, i);
+
+ mtrr_print_out_one_result(i);
+
+ if (!result[i].bad) {
+ set_var_mtrr_all(address_bits);
+ printk(KERN_DEBUG "New variable MTRRs\n");
+ print_out_mtrr_range_state();
+ return 1;
+ }
+ printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
+ "will find optimal one\n");
+ }
+
+ i = 0;
+ memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
+ memset(result, 0, sizeof(result));
+ for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
+
+ for (chunk_size = gran_size; chunk_size < (1ULL<<32);
+ chunk_size <<= 1) {
+
+ if (i >= NUM_RESULT)
+ continue;
+
+ mtrr_calc_range_state(chunk_size, gran_size,
+ extra_remove_base, extra_remove_size, i);
+ if (debug_print) {
+ mtrr_print_out_one_result(i);
+ printk(KERN_INFO "\n");
+ }
+
+ i++;
+ }
+ }
+
+ /* try to find the optimal index */
+ index_good = mtrr_search_optimal_index();
+
+ if (index_good != -1) {
+ printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
+ i = index_good;
+ mtrr_print_out_one_result(i);
+
+ /* convert ranges to var ranges state */
+ chunk_size = result[i].chunk_sizek;
+ chunk_size <<= 10;
+ gran_size = result[i].gran_sizek;
+ gran_size <<= 10;
+ x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
+ set_var_mtrr_all(address_bits);
+ printk(KERN_DEBUG "New variable MTRRs\n");
+ print_out_mtrr_range_state();
+ return 1;
+ } else {
+ /* print out all */
+ for (i = 0; i < NUM_RESULT; i++)
+ mtrr_print_out_one_result(i);
+ }
+
+ printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n");
+ printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n");
+
+ return 0;
+}
+#else
+int __init mtrr_cleanup(unsigned address_bits)
+{
+ return 0;
+}
+#endif
+
+static int disable_mtrr_trim;
+
+static int __init disable_mtrr_trim_setup(char *str)
+{
+ disable_mtrr_trim = 1;
+ return 0;
+}
+early_param("disable_mtrr_trim", disable_mtrr_trim_setup);
+
+/*
+ * Newer AMD K8s and later CPUs have a special magic MSR way to force WB
+ * for memory >4GB. Check for that here.
+ * Note this won't check if the MTRRs < 4GB where the magic bit doesn't
+ * apply to are wrong, but so far we don't know of any such case in the wild.
+ */
+#define Tom2Enabled (1U << 21)
+#define Tom2ForceMemTypeWB (1U << 22)
+
+int __init amd_special_default_mtrr(void)
+{
+ u32 l, h;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+ return 0;
+ if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
+ return 0;
+ /* In case some hypervisor doesn't pass SYSCFG through */
+ if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
+ return 0;
+ /*
+ * Memory between 4GB and top of mem is forced WB by this magic bit.
+ * Reserved before K8RevF, but should be zero there.
+ */
+ if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) ==
+ (Tom2Enabled | Tom2ForceMemTypeWB))
+ return 1;
+ return 0;
+}
+
+static u64 __init real_trim_memory(unsigned long start_pfn,
+ unsigned long limit_pfn)
+{
+ u64 trim_start, trim_size;
+ trim_start = start_pfn;
+ trim_start <<= PAGE_SHIFT;
+ trim_size = limit_pfn;
+ trim_size <<= PAGE_SHIFT;
+ trim_size -= trim_start;
+
+ return e820_update_range(trim_start, trim_size, E820_RAM,
+ E820_RESERVED);
+}
+/**
+ * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
+ * @end_pfn: ending page frame number
+ *
+ * Some buggy BIOSes don't setup the MTRRs properly for systems with certain
+ * memory configurations. This routine checks that the highest MTRR matches
+ * the end of memory, to make sure the MTRRs having a write back type cover
+ * all of the memory the kernel is intending to use. If not, it'll trim any
+ * memory off the end by adjusting end_pfn, removing it from the kernel's
+ * allocation pools, warning the user with an obnoxious message.
+ */
+int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
+{
+ unsigned long i, base, size, highest_pfn = 0, def, dummy;
+ mtrr_type type;
+ u64 total_trim_size;
+
+ /* extra one for all 0 */
+ int num[MTRR_NUM_TYPES + 1];
+ /*
+ * Make sure we only trim uncachable memory on machines that
+ * support the Intel MTRR architecture:
+ */
+ if (!is_cpu(INTEL) || disable_mtrr_trim)
+ return 0;
+ rdmsr(MTRRdefType_MSR, def, dummy);
+ def &= 0xff;
+ if (def != MTRR_TYPE_UNCACHABLE)
+ return 0;
+
+ /* get it and store it aside */
+ memset(range_state, 0, sizeof(range_state));
+ for (i = 0; i < num_var_ranges; i++) {
+ mtrr_if->get(i, &base, &size, &type);
+ range_state[i].base_pfn = base;
+ range_state[i].size_pfn = size;
+ range_state[i].type = type;
+ }
+
+ /* Find highest cached pfn */
+ for (i = 0; i < num_var_ranges; i++) {
+ type = range_state[i].type;
+ if (type != MTRR_TYPE_WRBACK)
+ continue;
+ base = range_state[i].base_pfn;
+ size = range_state[i].size_pfn;
+ if (highest_pfn < base + size)
+ highest_pfn = base + size;
+ }
+
+ /* kvm/qemu doesn't have mtrr set right, don't trim them all */
+ if (!highest_pfn) {
+ printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n");
+ return 0;
+ }
+
+ /* check entries number */
+ memset(num, 0, sizeof(num));
+ for (i = 0; i < num_var_ranges; i++) {
+ type = range_state[i].type;
+ if (type >= MTRR_NUM_TYPES)
+ continue;
+ size = range_state[i].size_pfn;
+ if (!size)
+ type = MTRR_NUM_TYPES;
+ num[type]++;
+ }
+
+ /* no entry for WB? */
+ if (!num[MTRR_TYPE_WRBACK])
+ return 0;
+
+ /* check if we only had WB and UC */
+ if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
+ num_var_ranges - num[MTRR_NUM_TYPES])
+ return 0;
+
+ memset(range, 0, sizeof(range));
+ nr_range = 0;
+ if (mtrr_tom2) {
+ range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
+ range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1;
+ if (highest_pfn < range[nr_range].end + 1)
+ highest_pfn = range[nr_range].end + 1;
+ nr_range++;
+ }
+ nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
+
+ total_trim_size = 0;
+ /* check the head */
+ if (range[0].start)
+ total_trim_size += real_trim_memory(0, range[0].start);
+ /* check the holes */
+ for (i = 0; i < nr_range - 1; i++) {
+ if (range[i].end + 1 < range[i+1].start)
+ total_trim_size += real_trim_memory(range[i].end + 1,
+ range[i+1].start);
+ }
+ /* check the top */
+ i = nr_range - 1;
+ if (range[i].end + 1 < end_pfn)
+ total_trim_size += real_trim_memory(range[i].end + 1,
+ end_pfn);
+
+ if (total_trim_size) {
+ printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover"
+ " all of memory, losing %lluMB of RAM.\n",
+ total_trim_size >> 20);
+
+ if (!changed_by_mtrr_cleanup)
+ WARN_ON(1);
+
+ printk(KERN_INFO "update e820 for mtrr\n");
+ update_e820();
+
+ return 1;
+ }
+
+ return 0;
+}
+
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 0c0a455fe95..37f28fc7cf9 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -33,13 +33,31 @@ u64 mtrr_tom2;
struct mtrr_state_type mtrr_state = {};
EXPORT_SYMBOL_GPL(mtrr_state);
-static int __initdata mtrr_show;
-static int __init mtrr_debug(char *opt)
+/**
+ * BIOS is expected to clear MtrrFixDramModEn bit, see for example
+ * "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD
+ * Opteron Processors" (26094 Rev. 3.30 February 2006), section
+ * "13.2.1.2 SYSCFG Register": "The MtrrFixDramModEn bit should be set
+ * to 1 during BIOS initalization of the fixed MTRRs, then cleared to
+ * 0 for operation."
+ */
+static inline void k8_check_syscfg_dram_mod_en(void)
{
- mtrr_show = 1;
- return 0;
+ u32 lo, hi;
+
+ if (!((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) &&
+ (boot_cpu_data.x86 >= 0x0f)))
+ return;
+
+ rdmsr(MSR_K8_SYSCFG, lo, hi);
+ if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) {
+ printk(KERN_ERR FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]"
+ " not cleared by BIOS, clearing this bit\n",
+ smp_processor_id());
+ lo &= ~K8_MTRRFIXRANGE_DRAM_MODIFY;
+ mtrr_wrmsr(MSR_K8_SYSCFG, lo, hi);
+ }
}
-early_param("mtrr.show", mtrr_debug);
/*
* Returns the effective MTRR type for the region
@@ -174,6 +192,8 @@ get_fixed_ranges(mtrr_type * frs)
unsigned int *p = (unsigned int *) frs;
int i;
+ k8_check_syscfg_dram_mod_en();
+
rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]);
for (i = 0; i < 2; i++)
@@ -188,18 +208,94 @@ void mtrr_save_fixed_ranges(void *info)
get_fixed_ranges(mtrr_state.fixed_ranges);
}
-static void print_fixed(unsigned base, unsigned step, const mtrr_type*types)
+static unsigned __initdata last_fixed_start;
+static unsigned __initdata last_fixed_end;
+static mtrr_type __initdata last_fixed_type;
+
+static void __init print_fixed_last(void)
+{
+ if (!last_fixed_end)
+ return;
+
+ printk(KERN_DEBUG " %05X-%05X %s\n", last_fixed_start,
+ last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type));
+
+ last_fixed_end = 0;
+}
+
+static void __init update_fixed_last(unsigned base, unsigned end,
+ mtrr_type type)
+{
+ last_fixed_start = base;
+ last_fixed_end = end;
+ last_fixed_type = type;
+}
+
+static void __init print_fixed(unsigned base, unsigned step,
+ const mtrr_type *types)
{
unsigned i;
- for (i = 0; i < 8; ++i, ++types, base += step)
- printk(KERN_INFO "MTRR %05X-%05X %s\n",
- base, base + step - 1, mtrr_attrib_to_str(*types));
+ for (i = 0; i < 8; ++i, ++types, base += step) {
+ if (last_fixed_end == 0) {
+ update_fixed_last(base, base + step, *types);
+ continue;
+ }
+ if (last_fixed_end == base && last_fixed_type == *types) {
+ last_fixed_end = base + step;
+ continue;
+ }
+ /* new segments: gap or different type */
+ print_fixed_last();
+ update_fixed_last(base, base + step, *types);
+ }
}
static void prepare_set(void);
static void post_set(void);
+static void __init print_mtrr_state(void)
+{
+ unsigned int i;
+ int high_width;
+
+ printk(KERN_DEBUG "MTRR default type: %s\n",
+ mtrr_attrib_to_str(mtrr_state.def_type));
+ if (mtrr_state.have_fixed) {
+ printk(KERN_DEBUG "MTRR fixed ranges %sabled:\n",
+ mtrr_state.enabled & 1 ? "en" : "dis");
+ print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
+ for (i = 0; i < 2; ++i)
+ print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8);
+ for (i = 0; i < 8; ++i)
+ print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8);
+
+ /* tail */
+ print_fixed_last();
+ }
+ printk(KERN_DEBUG "MTRR variable ranges %sabled:\n",
+ mtrr_state.enabled & 2 ? "en" : "dis");
+ high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4;
+ for (i = 0; i < num_var_ranges; ++i) {
+ if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
+ printk(KERN_DEBUG " %u base %0*X%05X000 mask %0*X%05X000 %s\n",
+ i,
+ high_width,
+ mtrr_state.var_ranges[i].base_hi,
+ mtrr_state.var_ranges[i].base_lo >> 12,
+ high_width,
+ mtrr_state.var_ranges[i].mask_hi,
+ mtrr_state.var_ranges[i].mask_lo >> 12,
+ mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff));
+ else
+ printk(KERN_DEBUG " %u disabled\n", i);
+ }
+ if (mtrr_tom2) {
+ printk(KERN_DEBUG "TOM2: %016llx aka %lldM\n",
+ mtrr_tom2, mtrr_tom2>>20);
+ }
+}
+
/* Grab all of the MTRR state for this CPU into *state */
void __init get_mtrr_state(void)
{
@@ -231,41 +327,9 @@ void __init get_mtrr_state(void)
mtrr_tom2 |= low;
mtrr_tom2 &= 0xffffff800000ULL;
}
- if (mtrr_show) {
- int high_width;
-
- printk(KERN_INFO "MTRR default type: %s\n", mtrr_attrib_to_str(mtrr_state.def_type));
- if (mtrr_state.have_fixed) {
- printk(KERN_INFO "MTRR fixed ranges %sabled:\n",
- mtrr_state.enabled & 1 ? "en" : "dis");
- print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
- for (i = 0; i < 2; ++i)
- print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8);
- for (i = 0; i < 8; ++i)
- print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8);
- }
- printk(KERN_INFO "MTRR variable ranges %sabled:\n",
- mtrr_state.enabled & 2 ? "en" : "dis");
- high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4;
- for (i = 0; i < num_var_ranges; ++i) {
- if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
- printk(KERN_INFO "MTRR %u base %0*X%05X000 mask %0*X%05X000 %s\n",
- i,
- high_width,
- mtrr_state.var_ranges[i].base_hi,
- mtrr_state.var_ranges[i].base_lo >> 12,
- high_width,
- mtrr_state.var_ranges[i].mask_hi,
- mtrr_state.var_ranges[i].mask_lo >> 12,
- mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff));
- else
- printk(KERN_INFO "MTRR %u disabled\n", i);
- }
- if (mtrr_tom2) {
- printk(KERN_INFO "TOM2: %016llx aka %lldM\n",
- mtrr_tom2, mtrr_tom2>>20);
- }
- }
+
+ print_mtrr_state();
+
mtrr_state_set = 1;
/* PAT setup for BP. We need to go through sync steps here */
@@ -308,27 +372,10 @@ void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b)
}
/**
- * Enable and allow read/write of extended fixed-range MTRR bits on K8 CPUs
- * see AMD publication no. 24593, chapter 3.2.1 for more information
- */
-static inline void k8_enable_fixed_iorrs(void)
-{
- unsigned lo, hi;
-
- rdmsr(MSR_K8_SYSCFG, lo, hi);
- mtrr_wrmsr(MSR_K8_SYSCFG, lo
- | K8_MTRRFIXRANGE_DRAM_ENABLE
- | K8_MTRRFIXRANGE_DRAM_MODIFY, hi);
-}
-
-/**
* set_fixed_range - checks & updates a fixed-range MTRR if it differs from the value it should have
* @msr: MSR address of the MTTR which should be checked and updated
* @changed: pointer which indicates whether the MTRR needed to be changed
* @msrwords: pointer to the MSR values which the MSR should have
- *
- * If K8 extentions are wanted, update the K8 SYSCFG MSR also.
- * See AMD publication no. 24593, chapter 7.8.1, page 233 for more information.
*/
static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
{
@@ -337,10 +384,6 @@ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
rdmsr(msr, lo, hi);
if (lo != msrwords[0] || hi != msrwords[1]) {
- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
- (boot_cpu_data.x86 >= 0x0f && boot_cpu_data.x86 <= 0x11) &&
- ((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK))
- k8_enable_fixed_iorrs();
mtrr_wrmsr(msr, msrwords[0], msrwords[1]);
*changed = true;
}
@@ -376,22 +419,31 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
{
unsigned int mask_lo, mask_hi, base_lo, base_hi;
unsigned int tmp, hi;
+ int cpu;
+
+ /*
+ * get_mtrr doesn't need to update mtrr_state, also it could be called
+ * from any cpu, so try to print it out directly.
+ */
+ cpu = get_cpu();
rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi);
+
if ((mask_lo & 0x800) == 0) {
/* Invalid (i.e. free) range */
*base = 0;
*size = 0;
*type = 0;
- return;
+ goto out_put_cpu;
}
rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi);
- /* Work out the shifted address mask. */
+ /* Work out the shifted address mask: */
tmp = mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT;
mask_lo = size_or_mask | tmp;
- /* Expand tmp with high bits to all 1s*/
+
+ /* Expand tmp with high bits to all 1s: */
hi = fls(tmp);
if (hi > 0) {
tmp |= ~((1<<(hi - 1)) - 1);
@@ -402,11 +454,19 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
}
}
- /* This works correctly if size is a power of two, i.e. a
- contiguous range. */
+ /*
+ * This works correctly if size is a power of two, i.e. a
+ * contiguous range:
+ */
*size = -mask_lo;
*base = base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT;
*type = base_lo & 0xff;
+
+ printk(KERN_DEBUG " get_mtrr: cpu%d reg%02d base=%010lx size=%010lx %s\n",
+ cpu, reg, *base, *size,
+ mtrr_attrib_to_str(*type & 0xff));
+out_put_cpu:
+ put_cpu();
}
/**
@@ -419,6 +479,8 @@ static int set_fixed_ranges(mtrr_type * frs)
bool changed = false;
int block=-1, range;
+ k8_check_syscfg_dram_mod_en();
+
while (fixed_range_blocks[++block].ranges)
for (range=0; range < fixed_range_blocks[block].ranges; range++)
set_fixed_range(fixed_range_blocks[block].base_msr + range,
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 236a401b825..03cda01f57c 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -574,7 +574,7 @@ struct mtrr_value {
unsigned long lsize;
};
-static struct mtrr_value mtrr_state[MTRR_MAX_VAR_RANGES];
+static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES];
static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
{
@@ -582,9 +582,9 @@ static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
for (i = 0; i < num_var_ranges; i++) {
mtrr_if->get(i,
- &mtrr_state[i].lbase,
- &mtrr_state[i].lsize,
- &mtrr_state[i].ltype);
+ &mtrr_value[i].lbase,
+ &mtrr_value[i].lsize,
+ &mtrr_value[i].ltype);
}
return 0;
}
@@ -594,11 +594,11 @@ static int mtrr_restore(struct sys_device * sysdev)
int i;
for (i = 0; i < num_var_ranges; i++) {
- if (mtrr_state[i].lsize)
+ if (mtrr_value[i].lsize)
set_mtrr(i,
- mtrr_state[i].lbase,
- mtrr_state[i].lsize,
- mtrr_state[i].ltype);
+ mtrr_value[i].lbase,
+ mtrr_value[i].lsize,
+ mtrr_value[i].ltype);
}
return 0;
}
@@ -610,1058 +610,7 @@ static struct sysdev_driver mtrr_sysdev_driver = {
.resume = mtrr_restore,
};
-/* should be related to MTRR_VAR_RANGES nums */
-#define RANGE_NUM 256
-
-struct res_range {
- unsigned long start;
- unsigned long end;
-};
-
-static int __init
-add_range(struct res_range *range, int nr_range, unsigned long start,
- unsigned long end)
-{
- /* out of slots */
- if (nr_range >= RANGE_NUM)
- return nr_range;
-
- range[nr_range].start = start;
- range[nr_range].end = end;
-
- nr_range++;
-
- return nr_range;
-}
-
-static int __init
-add_range_with_merge(struct res_range *range, int nr_range, unsigned long start,
- unsigned long end)
-{
- int i;
-
- /* try to merge it with old one */
- for (i = 0; i < nr_range; i++) {
- unsigned long final_start, final_end;
- unsigned long common_start, common_end;
-
- if (!range[i].end)
- continue;
-
- common_start = max(range[i].start, start);
- common_end = min(range[i].end, end);
- if (common_start > common_end + 1)
- continue;
-
- final_start = min(range[i].start, start);
- final_end = max(range[i].end, end);
-
- range[i].start = final_start;
- range[i].end = final_end;
- return nr_range;
- }
-
- /* need to add that */
- return add_range(range, nr_range, start, end);
-}
-
-static void __init
-subtract_range(struct res_range *range, unsigned long start, unsigned long end)
-{
- int i, j;
-
- for (j = 0; j < RANGE_NUM; j++) {
- if (!range[j].end)
- continue;
-
- if (start <= range[j].start && end >= range[j].end) {
- range[j].start = 0;
- range[j].end = 0;
- continue;
- }
-
- if (start <= range[j].start && end < range[j].end &&
- range[j].start < end + 1) {
- range[j].start = end + 1;
- continue;
- }
-
-
- if (start > range[j].start && end >= range[j].end &&
- range[j].end > start - 1) {
- range[j].end = start - 1;
- continue;
- }
-
- if (start > range[j].start && end < range[j].end) {
- /* find the new spare */
- for (i = 0; i < RANGE_NUM; i++) {
- if (range[i].end == 0)
- break;
- }
- if (i < RANGE_NUM) {
- range[i].end = range[j].end;
- range[i].start = end + 1;
- } else {
- printk(KERN_ERR "run of slot in ranges\n");
- }
- range[j].end = start - 1;
- continue;
- }
- }
-}
-
-static int __init cmp_range(const void *x1, const void *x2)
-{
- const struct res_range *r1 = x1;
- const struct res_range *r2 = x2;
- long start1, start2;
-
- start1 = r1->start;
- start2 = r2->start;
-
- return start1 - start2;
-}
-
-struct var_mtrr_range_state {
- unsigned long base_pfn;
- unsigned long size_pfn;
- mtrr_type type;
-};
-
-static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
-static int __initdata debug_print;
-
-static int __init
-x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
- unsigned long extra_remove_base,
- unsigned long extra_remove_size)
-{
- unsigned long i, base, size;
- mtrr_type type;
-
- for (i = 0; i < num_var_ranges; i++) {
- type = range_state[i].type;
- if (type != MTRR_TYPE_WRBACK)
- continue;
- base = range_state[i].base_pfn;
- size = range_state[i].size_pfn;
- nr_range = add_range_with_merge(range, nr_range, base,
- base + size - 1);
- }
- if (debug_print) {
- printk(KERN_DEBUG "After WB checking\n");
- for (i = 0; i < nr_range; i++)
- printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
- range[i].start, range[i].end + 1);
- }
-
- /* take out UC ranges */
- for (i = 0; i < num_var_ranges; i++) {
- type = range_state[i].type;
- if (type != MTRR_TYPE_UNCACHABLE &&
- type != MTRR_TYPE_WRPROT)
- continue;
- size = range_state[i].size_pfn;
- if (!size)
- continue;
- base = range_state[i].base_pfn;
- subtract_range(range, base, base + size - 1);
- }
- if (extra_remove_size)
- subtract_range(range, extra_remove_base,
- extra_remove_base + extra_remove_size - 1);
-
- /* get new range num */
- nr_range = 0;
- for (i = 0; i < RANGE_NUM; i++) {
- if (!range[i].end)
- continue;
- nr_range++;
- }
- if (debug_print) {
- printk(KERN_DEBUG "After UC checking\n");
- for (i = 0; i < nr_range; i++)
- printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
- range[i].start, range[i].end + 1);
- }
-
- /* sort the ranges */
- sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
- if (debug_print) {
- printk(KERN_DEBUG "After sorting\n");
- for (i = 0; i < nr_range; i++)
- printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
- range[i].start, range[i].end + 1);
- }
-
- /* clear those is not used */
- for (i = nr_range; i < RANGE_NUM; i++)
- memset(&range[i], 0, sizeof(range[i]));
-
- return nr_range;
-}
-
-static struct res_range __initdata range[RANGE_NUM];
-static int __initdata nr_range;
-
-#ifdef CONFIG_MTRR_SANITIZER
-
-static unsigned long __init sum_ranges(struct res_range *range, int nr_range)
-{
- unsigned long sum;
- int i;
-
- sum = 0;
- for (i = 0; i < nr_range; i++)
- sum += range[i].end + 1 - range[i].start;
-
- return sum;
-}
-
-static int enable_mtrr_cleanup __initdata =
- CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT;
-
-static int __init disable_mtrr_cleanup_setup(char *str)
-{
- enable_mtrr_cleanup = 0;
- return 0;
-}
-early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup);
-
-static int __init enable_mtrr_cleanup_setup(char *str)
-{
- enable_mtrr_cleanup = 1;
- return 0;
-}
-early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
-
-static int __init mtrr_cleanup_debug_setup(char *str)
-{
- debug_print = 1;
- return 0;
-}
-early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
-
-struct var_mtrr_state {
- unsigned long range_startk;
- unsigned long range_sizek;
- unsigned long chunk_sizek;
- unsigned long gran_sizek;
- unsigned int reg;
-};
-
-static void __init
-set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
- unsigned char type, unsigned int address_bits)
-{
- u32 base_lo, base_hi, mask_lo, mask_hi;
- u64 base, mask;
-
- if (!sizek) {
- fill_mtrr_var_range(reg, 0, 0, 0, 0);
- return;
- }
-
- mask = (1ULL << address_bits) - 1;
- mask &= ~((((u64)sizek) << 10) - 1);
-
- base = ((u64)basek) << 10;
-
- base |= type;
- mask |= 0x800;
-
- base_lo = base & ((1ULL<<32) - 1);
- base_hi = base >> 32;
-
- mask_lo = mask & ((1ULL<<32) - 1);
- mask_hi = mask >> 32;
-
- fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi);
-}
-
-static void __init
-save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
- unsigned char type)
-{
- range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10);
- range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10);
- range_state[reg].type = type;
-}
-
-static void __init
-set_var_mtrr_all(unsigned int address_bits)
-{
- unsigned long basek, sizek;
- unsigned char type;
- unsigned int reg;
-
- for (reg = 0; reg < num_var_ranges; reg++) {
- basek = range_state[reg].base_pfn << (PAGE_SHIFT - 10);
- sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10);
- type = range_state[reg].type;
-
- set_var_mtrr(reg, basek, sizek, type, address_bits);
- }
-}
-
-static unsigned long to_size_factor(unsigned long sizek, char *factorp)
-{
- char factor;
- unsigned long base = sizek;
-
- if (base & ((1<<10) - 1)) {
- /* not MB alignment */
- factor = 'K';
- } else if (base & ((1<<20) - 1)){
- factor = 'M';
- base >>= 10;
- } else {
- factor = 'G';
- base >>= 20;
- }
-
- *factorp = factor;
-
- return base;
-}
-
-static unsigned int __init
-range_to_mtrr(unsigned int reg, unsigned long range_startk,
- unsigned long range_sizek, unsigned char type)
-{
- if (!range_sizek || (reg >= num_var_ranges))
- return reg;
-
- while (range_sizek) {
- unsigned long max_align, align;
- unsigned long sizek;
-
- /* Compute the maximum size I can make a range */
- if (range_startk)
- max_align = ffs(range_startk) - 1;
- else
- max_align = 32;
- align = fls(range_sizek) - 1;
- if (align > max_align)
- align = max_align;
-
- sizek = 1 << align;
- if (debug_print) {
- char start_factor = 'K', size_factor = 'K';
- unsigned long start_base, size_base;
-
- start_base = to_size_factor(range_startk, &start_factor),
- size_base = to_size_factor(sizek, &size_factor),
-
- printk(KERN_DEBUG "Setting variable MTRR %d, "
- "base: %ld%cB, range: %ld%cB, type %s\n",
- reg, start_base, start_factor,
- size_base, size_factor,
- (type == MTRR_TYPE_UNCACHABLE)?"UC":
- ((type == MTRR_TYPE_WRBACK)?"WB":"Other")
- );
- }
- save_var_mtrr(reg++, range_startk, sizek, type);
- range_startk += sizek;
- range_sizek -= sizek;
- if (reg >= num_var_ranges)
- break;
- }
- return reg;
-}
-
-static unsigned __init
-range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
- unsigned long sizek)
-{
- unsigned long hole_basek, hole_sizek;
- unsigned long second_basek, second_sizek;
- unsigned long range0_basek, range0_sizek;
- unsigned long range_basek, range_sizek;
- unsigned long chunk_sizek;
- unsigned long gran_sizek;
-
- hole_basek = 0;
- hole_sizek = 0;
- second_basek = 0;
- second_sizek = 0;
- chunk_sizek = state->chunk_sizek;
- gran_sizek = state->gran_sizek;
-
- /* align with gran size, prevent small block used up MTRRs */
- range_basek = ALIGN(state->range_startk, gran_sizek);
- if ((range_basek > basek) && basek)
- return second_sizek;
- state->range_sizek -= (range_basek - state->range_startk);
- range_sizek = ALIGN(state->range_sizek, gran_sizek);
-
- while (range_sizek > state->range_sizek) {
- range_sizek -= gran_sizek;
- if (!range_sizek)
- return 0;
- }
- state->range_sizek = range_sizek;
-
- /* try to append some small hole */
- range0_basek = state->range_startk;
- range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
-
- /* no increase */
- if (range0_sizek == state->range_sizek) {
- if (debug_print)
- printk(KERN_DEBUG "rangeX: %016lx - %016lx\n",
- range0_basek<<10,
- (range0_basek + state->range_sizek)<<10);
- state->reg = range_to_mtrr(state->reg, range0_basek,
- state->range_sizek, MTRR_TYPE_WRBACK);
- return 0;
- }
-
- /* only cut back, when it is not the last */
- if (sizek) {
- while (range0_basek + range0_sizek > (basek + sizek)) {
- if (range0_sizek >= chunk_sizek)
- range0_sizek -= chunk_sizek;
- else
- range0_sizek = 0;
-
- if (!range0_sizek)
- break;
- }
- }
-
-second_try:
- range_basek = range0_basek + range0_sizek;
-
- /* one hole in the middle */
- if (range_basek > basek && range_basek <= (basek + sizek))
- second_sizek = range_basek - basek;
-
- if (range0_sizek > state->range_sizek) {
-
- /* one hole in middle or at end */
- hole_sizek = range0_sizek - state->range_sizek - second_sizek;
-
- /* hole size should be less than half of range0 size */
- if (hole_sizek >= (range0_sizek >> 1) &&
- range0_sizek >= chunk_sizek) {
- range0_sizek -= chunk_sizek;
- second_sizek = 0;
- hole_sizek = 0;
-
- goto second_try;
- }
- }
-
- if (range0_sizek) {
- if (debug_print)
- printk(KERN_DEBUG "range0: %016lx - %016lx\n",
- range0_basek<<10,
- (range0_basek + range0_sizek)<<10);
- state->reg = range_to_mtrr(state->reg, range0_basek,
- range0_sizek, MTRR_TYPE_WRBACK);
- }
-
- if (range0_sizek < state->range_sizek) {
- /* need to handle left over */
- range_sizek = state->range_sizek - range0_sizek;
-
- if (debug_print)
- printk(KERN_DEBUG "range: %016lx - %016lx\n",
- range_basek<<10,
- (range_basek + range_sizek)<<10);
- state->reg = range_to_mtrr(state->reg, range_basek,
- range_sizek, MTRR_TYPE_WRBACK);
- }
-
- if (hole_sizek) {
- hole_basek = range_basek - hole_sizek - second_sizek;
- if (debug_print)
- printk(KERN_DEBUG "hole: %016lx - %016lx\n",
- hole_basek<<10,
- (hole_basek + hole_sizek)<<10);
- state->reg = range_to_mtrr(state->reg, hole_basek,
- hole_sizek, MTRR_TYPE_UNCACHABLE);
- }
-
- return second_sizek;
-}
-
-static void __init
-set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn,
- unsigned long size_pfn)
-{
- unsigned long basek, sizek;
- unsigned long second_sizek = 0;
-
- if (state->reg >= num_var_ranges)
- return;
-
- basek = base_pfn << (PAGE_SHIFT - 10);
- sizek = size_pfn << (PAGE_SHIFT - 10);
-
- /* See if I can merge with the last range */
- if ((basek <= 1024) ||
- (state->range_startk + state->range_sizek == basek)) {
- unsigned long endk = basek + sizek;
- state->range_sizek = endk - state->range_startk;
- return;
- }
- /* Write the range mtrrs */
- if (state->range_sizek != 0)
- second_sizek = range_to_mtrr_with_hole(state, basek, sizek);
-
- /* Allocate an msr */
- state->range_startk = basek + second_sizek;
- state->range_sizek = sizek - second_sizek;
-}
-
-/* mininum size of mtrr block that can take hole */
-static u64 mtrr_chunk_size __initdata = (256ULL<<20);
-
-static int __init parse_mtrr_chunk_size_opt(char *p)
-{
- if (!p)
- return -EINVAL;
- mtrr_chunk_size = memparse(p, &p);
- return 0;
-}
-early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt);
-
-/* granity of mtrr of block */
-static u64 mtrr_gran_size __initdata;
-
-static int __init parse_mtrr_gran_size_opt(char *p)
-{
- if (!p)
- return -EINVAL;
- mtrr_gran_size = memparse(p, &p);
- return 0;
-}
-early_param("mtrr_gran_size", parse_mtrr_gran_size_opt);
-
-static int nr_mtrr_spare_reg __initdata =
- CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT;
-
-static int __init parse_mtrr_spare_reg(char *arg)
-{
- if (arg)
- nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0);
- return 0;
-}
-
-early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
-
-static int __init
-x86_setup_var_mtrrs(struct res_range *range, int nr_range,
- u64 chunk_size, u64 gran_size)
-{
- struct var_mtrr_state var_state;
- int i;
- int num_reg;
-
- var_state.range_startk = 0;
- var_state.range_sizek = 0;
- var_state.reg = 0;
- var_state.chunk_sizek = chunk_size >> 10;
- var_state.gran_sizek = gran_size >> 10;
-
- memset(range_state, 0, sizeof(range_state));
-
- /* Write the range etc */
- for (i = 0; i < nr_range; i++)
- set_var_mtrr_range(&var_state, range[i].start,
- range[i].end - range[i].start + 1);
-
- /* Write the last range */
- if (var_state.range_sizek != 0)
- range_to_mtrr_with_hole(&var_state, 0, 0);
-
- num_reg = var_state.reg;
- /* Clear out the extra MTRR's */
- while (var_state.reg < num_var_ranges) {
- save_var_mtrr(var_state.reg, 0, 0, 0);
- var_state.reg++;
- }
-
- return num_reg;
-}
-
-struct mtrr_cleanup_result {
- unsigned long gran_sizek;
- unsigned long chunk_sizek;
- unsigned long lose_cover_sizek;
- unsigned int num_reg;
- int bad;
-};
-
-/*
- * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G
- * chunk size: gran_size, ..., 2G
- * so we need (1+16)*8
- */
-#define NUM_RESULT 136
-#define PSHIFT (PAGE_SHIFT - 10)
-
-static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
-static unsigned long __initdata min_loss_pfn[RANGE_NUM];
-
-static void __init print_out_mtrr_range_state(void)
-{
- int i;
- char start_factor = 'K', size_factor = 'K';
- unsigned long start_base, size_base;
- mtrr_type type;
-
- for (i = 0; i < num_var_ranges; i++) {
-
- size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
- if (!size_base)
- continue;
-
- size_base = to_size_factor(size_base, &size_factor),
- start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
- start_base = to_size_factor(start_base, &start_factor),
- type = range_state[i].type;
-
- printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
- i, start_base, start_factor,
- size_base, size_factor,
- (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
- ((type == MTRR_TYPE_WRPROT) ? "WP" :
- ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
- );
- }
-}
-
-static int __init mtrr_need_cleanup(void)
-{
- int i;
- mtrr_type type;
- unsigned long size;
- /* extra one for all 0 */
- int num[MTRR_NUM_TYPES + 1];
-
- /* check entries number */
- memset(num, 0, sizeof(num));
- for (i = 0; i < num_var_ranges; i++) {
- type = range_state[i].type;
- size = range_state[i].size_pfn;
- if (type >= MTRR_NUM_TYPES)
- continue;
- if (!size)
- type = MTRR_NUM_TYPES;
- if (type == MTRR_TYPE_WRPROT)
- type = MTRR_TYPE_UNCACHABLE;
- num[type]++;
- }
-
- /* check if we got UC entries */
- if (!num[MTRR_TYPE_UNCACHABLE])
- return 0;
-
- /* check if we only had WB and UC */
- if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
- num_var_ranges - num[MTRR_NUM_TYPES])
- return 0;
-
- return 1;
-}
-
-static unsigned long __initdata range_sums;
-static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
- unsigned long extra_remove_base,
- unsigned long extra_remove_size,
- int i)
-{
- int num_reg;
- static struct res_range range_new[RANGE_NUM];
- static int nr_range_new;
- unsigned long range_sums_new;
-
- /* convert ranges to var ranges state */
- num_reg = x86_setup_var_mtrrs(range, nr_range,
- chunk_size, gran_size);
-
- /* we got new setting in range_state, check it */
- memset(range_new, 0, sizeof(range_new));
- nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
- extra_remove_base, extra_remove_size);
- range_sums_new = sum_ranges(range_new, nr_range_new);
-
- result[i].chunk_sizek = chunk_size >> 10;
- result[i].gran_sizek = gran_size >> 10;
- result[i].num_reg = num_reg;
- if (range_sums < range_sums_new) {
- result[i].lose_cover_sizek =
- (range_sums_new - range_sums) << PSHIFT;
- result[i].bad = 1;
- } else
- result[i].lose_cover_sizek =
- (range_sums - range_sums_new) << PSHIFT;
-
- /* double check it */
- if (!result[i].bad && !result[i].lose_cover_sizek) {
- if (nr_range_new != nr_range ||
- memcmp(range, range_new, sizeof(range)))
- result[i].bad = 1;
- }
-
- if (!result[i].bad && (range_sums - range_sums_new <
- min_loss_pfn[num_reg])) {
- min_loss_pfn[num_reg] =
- range_sums - range_sums_new;
- }
-}
-
-static void __init mtrr_print_out_one_result(int i)
-{
- char gran_factor, chunk_factor, lose_factor;
- unsigned long gran_base, chunk_base, lose_base;
-
- gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
- chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
- lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
- printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
- result[i].bad ? "*BAD*" : " ",
- gran_base, gran_factor, chunk_base, chunk_factor);
- printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
- result[i].num_reg, result[i].bad ? "-" : "",
- lose_base, lose_factor);
-}
-
-static int __init mtrr_search_optimal_index(void)
-{
- int i;
- int num_reg_good;
- int index_good;
-
- if (nr_mtrr_spare_reg >= num_var_ranges)
- nr_mtrr_spare_reg = num_var_ranges - 1;
- num_reg_good = -1;
- for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
- if (!min_loss_pfn[i])
- num_reg_good = i;
- }
-
- index_good = -1;
- if (num_reg_good != -1) {
- for (i = 0; i < NUM_RESULT; i++) {
- if (!result[i].bad &&
- result[i].num_reg == num_reg_good &&
- !result[i].lose_cover_sizek) {
- index_good = i;
- break;
- }
- }
- }
-
- return index_good;
-}
-
-
-static int __init mtrr_cleanup(unsigned address_bits)
-{
- unsigned long extra_remove_base, extra_remove_size;
- unsigned long base, size, def, dummy;
- mtrr_type type;
- u64 chunk_size, gran_size;
- int index_good;
- int i;
-
- if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
- return 0;
- rdmsr(MTRRdefType_MSR, def, dummy);
- def &= 0xff;
- if (def != MTRR_TYPE_UNCACHABLE)
- return 0;
-
- /* get it and store it aside */
- memset(range_state, 0, sizeof(range_state));
- for (i = 0; i < num_var_ranges; i++) {
- mtrr_if->get(i, &base, &size, &type);
- range_state[i].base_pfn = base;
- range_state[i].size_pfn = size;
- range_state[i].type = type;
- }
-
- /* check if we need handle it and can handle it */
- if (!mtrr_need_cleanup())
- return 0;
-
- /* print original var MTRRs at first, for debugging: */
- printk(KERN_DEBUG "original variable MTRRs\n");
- print_out_mtrr_range_state();
-
- memset(range, 0, sizeof(range));
- extra_remove_size = 0;
- extra_remove_base = 1 << (32 - PAGE_SHIFT);
- if (mtrr_tom2)
- extra_remove_size =
- (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base;
- nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base,
- extra_remove_size);
- /*
- * [0, 1M) should always be coverred by var mtrr with WB
- * and fixed mtrrs should take effective before var mtrr for it
- */
- nr_range = add_range_with_merge(range, nr_range, 0,
- (1ULL<<(20 - PAGE_SHIFT)) - 1);
- /* sort the ranges */
- sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
-
- range_sums = sum_ranges(range, nr_range);
- printk(KERN_INFO "total RAM coverred: %ldM\n",
- range_sums >> (20 - PAGE_SHIFT));
-
- if (mtrr_chunk_size && mtrr_gran_size) {
- i = 0;
- mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size,
- extra_remove_base, extra_remove_size, i);
-
- mtrr_print_out_one_result(i);
-
- if (!result[i].bad) {
- set_var_mtrr_all(address_bits);
- return 1;
- }
- printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
- "will find optimal one\n");
- }
-
- i = 0;
- memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
- memset(result, 0, sizeof(result));
- for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
-
- for (chunk_size = gran_size; chunk_size < (1ULL<<32);
- chunk_size <<= 1) {
-
- if (i >= NUM_RESULT)
- continue;
-
- mtrr_calc_range_state(chunk_size, gran_size,
- extra_remove_base, extra_remove_size, i);
- if (debug_print) {
- mtrr_print_out_one_result(i);
- printk(KERN_INFO "\n");
- }
-
- i++;
- }
- }
-
- /* try to find the optimal index */
- index_good = mtrr_search_optimal_index();
-
- if (index_good != -1) {
- printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
- i = index_good;
- mtrr_print_out_one_result(i);
-
- /* convert ranges to var ranges state */
- chunk_size = result[i].chunk_sizek;
- chunk_size <<= 10;
- gran_size = result[i].gran_sizek;
- gran_size <<= 10;
- x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
- set_var_mtrr_all(address_bits);
- printk(KERN_DEBUG "New variable MTRRs\n");
- print_out_mtrr_range_state();
- return 1;
- } else {
- /* print out all */
- for (i = 0; i < NUM_RESULT; i++)
- mtrr_print_out_one_result(i);
- }
-
- printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n");
- printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n");
-
- return 0;
-}
-#else
-static int __init mtrr_cleanup(unsigned address_bits)
-{
- return 0;
-}
-#endif
-
-static int __initdata changed_by_mtrr_cleanup;
-
-static int disable_mtrr_trim;
-
-static int __init disable_mtrr_trim_setup(char *str)
-{
- disable_mtrr_trim = 1;
- return 0;
-}
-early_param("disable_mtrr_trim", disable_mtrr_trim_setup);
-
-/*
- * Newer AMD K8s and later CPUs have a special magic MSR way to force WB
- * for memory >4GB. Check for that here.
- * Note this won't check if the MTRRs < 4GB where the magic bit doesn't
- * apply to are wrong, but so far we don't know of any such case in the wild.
- */
-#define Tom2Enabled (1U << 21)
-#define Tom2ForceMemTypeWB (1U << 22)
-
-int __init amd_special_default_mtrr(void)
-{
- u32 l, h;
-
- if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
- return 0;
- if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
- return 0;
- /* In case some hypervisor doesn't pass SYSCFG through */
- if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
- return 0;
- /*
- * Memory between 4GB and top of mem is forced WB by this magic bit.
- * Reserved before K8RevF, but should be zero there.
- */
- if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) ==
- (Tom2Enabled | Tom2ForceMemTypeWB))
- return 1;
- return 0;
-}
-
-static u64 __init real_trim_memory(unsigned long start_pfn,
- unsigned long limit_pfn)
-{
- u64 trim_start, trim_size;
- trim_start = start_pfn;
- trim_start <<= PAGE_SHIFT;
- trim_size = limit_pfn;
- trim_size <<= PAGE_SHIFT;
- trim_size -= trim_start;
-
- return e820_update_range(trim_start, trim_size, E820_RAM,
- E820_RESERVED);
-}
-/**
- * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
- * @end_pfn: ending page frame number
- *
- * Some buggy BIOSes don't setup the MTRRs properly for systems with certain
- * memory configurations. This routine checks that the highest MTRR matches
- * the end of memory, to make sure the MTRRs having a write back type cover
- * all of the memory the kernel is intending to use. If not, it'll trim any
- * memory off the end by adjusting end_pfn, removing it from the kernel's
- * allocation pools, warning the user with an obnoxious message.
- */
-int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
-{
- unsigned long i, base, size, highest_pfn = 0, def, dummy;
- mtrr_type type;
- u64 total_trim_size;
-
- /* extra one for all 0 */
- int num[MTRR_NUM_TYPES + 1];
- /*
- * Make sure we only trim uncachable memory on machines that
- * support the Intel MTRR architecture:
- */
- if (!is_cpu(INTEL) || disable_mtrr_trim)
- return 0;
- rdmsr(MTRRdefType_MSR, def, dummy);
- def &= 0xff;
- if (def != MTRR_TYPE_UNCACHABLE)
- return 0;
-
- /* get it and store it aside */
- memset(range_state, 0, sizeof(range_state));
- for (i = 0; i < num_var_ranges; i++) {
- mtrr_if->get(i, &base, &size, &type);
- range_state[i].base_pfn = base;
- range_state[i].size_pfn = size;
- range_state[i].type = type;
- }
-
- /* Find highest cached pfn */
- for (i = 0; i < num_var_ranges; i++) {
- type = range_state[i].type;
- if (type != MTRR_TYPE_WRBACK)
- continue;
- base = range_state[i].base_pfn;
- size = range_state[i].size_pfn;
- if (highest_pfn < base + size)
- highest_pfn = base + size;
- }
-
- /* kvm/qemu doesn't have mtrr set right, don't trim them all */
- if (!highest_pfn) {
- printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n");
- return 0;
- }
-
- /* check entries number */
- memset(num, 0, sizeof(num));
- for (i = 0; i < num_var_ranges; i++) {
- type = range_state[i].type;
- if (type >= MTRR_NUM_TYPES)
- continue;
- size = range_state[i].size_pfn;
- if (!size)
- type = MTRR_NUM_TYPES;
- num[type]++;
- }
-
- /* no entry for WB? */
- if (!num[MTRR_TYPE_WRBACK])
- return 0;
-
- /* check if we only had WB and UC */
- if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
- num_var_ranges - num[MTRR_NUM_TYPES])
- return 0;
-
- memset(range, 0, sizeof(range));
- nr_range = 0;
- if (mtrr_tom2) {
- range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
- range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1;
- if (highest_pfn < range[nr_range].end + 1)
- highest_pfn = range[nr_range].end + 1;
- nr_range++;
- }
- nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
-
- total_trim_size = 0;
- /* check the head */
- if (range[0].start)
- total_trim_size += real_trim_memory(0, range[0].start);
- /* check the holes */
- for (i = 0; i < nr_range - 1; i++) {
- if (range[i].end + 1 < range[i+1].start)
- total_trim_size += real_trim_memory(range[i].end + 1,
- range[i+1].start);
- }
- /* check the top */
- i = nr_range - 1;
- if (range[i].end + 1 < end_pfn)
- total_trim_size += real_trim_memory(range[i].end + 1,
- end_pfn);
-
- if (total_trim_size) {
- printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover"
- " all of memory, losing %lluMB of RAM.\n",
- total_trim_size >> 20);
-
- if (!changed_by_mtrr_cleanup)
- WARN_ON(1);
-
- printk(KERN_INFO "update e820 for mtrr\n");
- update_e820();
-
- return 1;
- }
-
- return 0;
-}
+int __initdata changed_by_mtrr_cleanup;
/**
* mtrr_bp_init - initialize mtrrs on the boot CPU
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index ffd60409cc6..77f67f7b347 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -79,6 +79,7 @@ extern struct mtrr_ops * mtrr_if;
extern unsigned int num_var_ranges;
extern u64 mtrr_tom2;
+extern struct mtrr_state_type mtrr_state;
void mtrr_state_warn(void);
const char *mtrr_attrib_to_str(int x);
@@ -88,3 +89,6 @@ void mtrr_wrmsr(unsigned, unsigned, unsigned);
int amd_init_mtrr(void);
int cyrix_init_mtrr(void);
int centaur_init_mtrr(void);
+
+extern int changed_by_mtrr_cleanup;
+extern int mtrr_cleanup(unsigned address_bits);
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index 52b3fefbd5a..bb62b3e5caa 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -98,7 +98,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
#endif
}
-static struct cpu_dev transmeta_cpu_dev __cpuinitdata = {
+static const struct cpu_dev __cpuinitconst transmeta_cpu_dev = {
.c_vendor = "Transmeta",
.c_ident = { "GenuineTMx86", "TransmetaCPU" },
.c_early_init = early_init_transmeta,
diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c
index e777f79e096..fd2c37bf7ac 100644
--- a/arch/x86/kernel/cpu/umc.c
+++ b/arch/x86/kernel/cpu/umc.c
@@ -8,7 +8,7 @@
* so no special init takes place.
*/
-static struct cpu_dev umc_cpu_dev __cpuinitdata = {
+static const struct cpu_dev __cpuinitconst umc_cpu_dev = {
.c_vendor = "UMC",
.c_ident = { "UMC UMC UMC" },
.c_models = {
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 508bec1cee2..ef2c3563357 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -110,19 +110,50 @@ int __init e820_all_mapped(u64 start, u64 end, unsigned type)
/*
* Add a memory region to the kernel e820 map.
*/
-void __init e820_add_region(u64 start, u64 size, int type)
+static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
+ int type)
{
- int x = e820.nr_map;
+ int x = e820x->nr_map;
- if (x == ARRAY_SIZE(e820.map)) {
+ if (x == ARRAY_SIZE(e820x->map)) {
printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
return;
}
- e820.map[x].addr = start;
- e820.map[x].size = size;
- e820.map[x].type = type;
- e820.nr_map++;
+ e820x->map[x].addr = start;
+ e820x->map[x].size = size;
+ e820x->map[x].type = type;
+ e820x->nr_map++;
+}
+
+void __init e820_add_region(u64 start, u64 size, int type)
+{
+ __e820_add_region(&e820, start, size, type);
+}
+
+static void __init e820_print_type(u32 type)
+{
+ switch (type) {
+ case E820_RAM:
+ case E820_RESERVED_KERN:
+ printk(KERN_CONT "(usable)");
+ break;
+ case E820_RESERVED:
+ printk(KERN_CONT "(reserved)");
+ break;
+ case E820_ACPI:
+ printk(KERN_CONT "(ACPI data)");
+ break;
+ case E820_NVS:
+ printk(KERN_CONT "(ACPI NVS)");
+ break;
+ case E820_UNUSABLE:
+ printk(KERN_CONT "(unusable)");
+ break;
+ default:
+ printk(KERN_CONT "type %u", type);
+ break;
+ }
}
void __init e820_print_map(char *who)
@@ -134,27 +165,8 @@ void __init e820_print_map(char *who)
(unsigned long long) e820.map[i].addr,
(unsigned long long)
(e820.map[i].addr + e820.map[i].size));
- switch (e820.map[i].type) {
- case E820_RAM:
- case E820_RESERVED_KERN:
- printk(KERN_CONT "(usable)\n");
- break;
- case E820_RESERVED:
- printk(KERN_CONT "(reserved)\n");
- break;
- case E820_ACPI:
- printk(KERN_CONT "(ACPI data)\n");
- break;
- case E820_NVS:
- printk(KERN_CONT "(ACPI NVS)\n");
- break;
- case E820_UNUSABLE:
- printk("(unusable)\n");
- break;
- default:
- printk(KERN_CONT "type %u\n", e820.map[i].type);
- break;
- }
+ e820_print_type(e820.map[i].type);
+ printk(KERN_CONT "\n");
}
}
@@ -221,7 +233,7 @@ void __init e820_print_map(char *who)
*/
int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
- int *pnr_map)
+ u32 *pnr_map)
{
struct change_member {
struct e820entry *pbios; /* pointer to original bios entry */
@@ -417,11 +429,12 @@ static int __init append_e820_map(struct e820entry *biosmap, int nr_map)
return __append_e820_map(biosmap, nr_map);
}
-static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
+static u64 __init __e820_update_range(struct e820map *e820x, u64 start,
u64 size, unsigned old_type,
unsigned new_type)
{
- int i;
+ u64 end;
+ unsigned int i;
u64 real_updated_size = 0;
BUG_ON(old_type == new_type);
@@ -429,27 +442,55 @@ static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
if (size > (ULLONG_MAX - start))
size = ULLONG_MAX - start;
- for (i = 0; i < e820.nr_map; i++) {
+ end = start + size;
+ printk(KERN_DEBUG "e820 update range: %016Lx - %016Lx ",
+ (unsigned long long) start,
+ (unsigned long long) end);
+ e820_print_type(old_type);
+ printk(KERN_CONT " ==> ");
+ e820_print_type(new_type);
+ printk(KERN_CONT "\n");
+
+ for (i = 0; i < e820x->nr_map; i++) {
struct e820entry *ei = &e820x->map[i];
u64 final_start, final_end;
+ u64 ei_end;
+
if (ei->type != old_type)
continue;
- /* totally covered? */
- if (ei->addr >= start &&
- (ei->addr + ei->size) <= (start + size)) {
+
+ ei_end = ei->addr + ei->size;
+ /* totally covered by new range? */
+ if (ei->addr >= start && ei_end <= end) {
ei->type = new_type;
real_updated_size += ei->size;
continue;
}
+
+ /* new range is totally covered? */
+ if (ei->addr < start && ei_end > end) {
+ __e820_add_region(e820x, start, size, new_type);
+ __e820_add_region(e820x, end, ei_end - end, ei->type);
+ ei->size = start - ei->addr;
+ real_updated_size += size;
+ continue;
+ }
+
/* partially covered */
final_start = max(start, ei->addr);
- final_end = min(start + size, ei->addr + ei->size);
+ final_end = min(end, ei_end);
if (final_start >= final_end)
continue;
- e820_add_region(final_start, final_end - final_start,
- new_type);
+
+ __e820_add_region(e820x, final_start, final_end - final_start,
+ new_type);
+
real_updated_size += final_end - final_start;
+ /*
+ * left range could be head or tail, so need to update
+ * size at first.
+ */
ei->size -= final_end - final_start;
if (ei->addr < final_start)
continue;
@@ -461,13 +502,13 @@ static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
unsigned new_type)
{
- return e820_update_range_map(&e820, start, size, old_type, new_type);
+ return __e820_update_range(&e820, start, size, old_type, new_type);
}
static u64 __init e820_update_range_saved(u64 start, u64 size,
unsigned old_type, unsigned new_type)
{
- return e820_update_range_map(&e820_saved, start, size, old_type,
+ return __e820_update_range(&e820_saved, start, size, old_type,
new_type);
}
@@ -511,7 +552,7 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
void __init update_e820(void)
{
- int nr_map;
+ u32 nr_map;
nr_map = e820.nr_map;
if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map))
@@ -522,7 +563,7 @@ void __init update_e820(void)
}
static void __init update_e820_saved(void)
{
- int nr_map;
+ u32 nr_map;
nr_map = e820_saved.nr_map;
if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map))
@@ -1020,8 +1061,8 @@ u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
continue;
return addr;
}
- return -1UL;
+ return -1ULL;
}
/*
@@ -1034,13 +1075,22 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
u64 start;
start = startt;
- while (size < sizet)
+ while (size < sizet && (start + 1))
start = find_e820_area_size(start, &size, align);
if (size < sizet)
return 0;
+#ifdef CONFIG_X86_32
+ if (start >= MAXMEM)
+ return 0;
+ if (start + size > MAXMEM)
+ size = MAXMEM - start;
+#endif
+
addr = round_down(start + size - sizet, align);
+ if (addr < start)
+ return 0;
e820_update_range(addr, sizet, E820_RAM, E820_RESERVED);
e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED);
printk(KERN_INFO "update e820 for early_reserve_e820\n");
@@ -1253,7 +1303,7 @@ early_param("memmap", parse_memmap_opt);
void __init finish_e820_parsing(void)
{
if (userdef) {
- int nr = e820.nr_map;
+ u32 nr = e820.nr_map;
if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0)
early_panic("Invalid user supplied memory map");
@@ -1336,7 +1386,7 @@ void __init e820_reserve_resources_late(void)
char *__init default_machine_specific_memory_setup(void)
{
char *who = "BIOS-e820";
- int new_nr;
+ u32 new_nr;
/*
* Try to copy the BIOS-supplied E820-map.
*
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 639ad98238a..335f049d110 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -250,7 +250,7 @@ static int dbgp_wait_until_complete(void)
return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl);
}
-static void dbgp_mdelay(int ms)
+static void __init dbgp_mdelay(int ms)
{
int i;
@@ -311,7 +311,7 @@ static void dbgp_set_data(const void *buf, int size)
writel(hi, &ehci_debug->data47);
}
-static void dbgp_get_data(void *buf, int size)
+static void __init dbgp_get_data(void *buf, int size)
{
unsigned char *bytes = buf;
u32 lo, hi;
@@ -355,7 +355,7 @@ static int dbgp_bulk_write(unsigned devnum, unsigned endpoint,
return ret;
}
-static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
+static int __init dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
int size)
{
u32 pids, addr, ctrl;
@@ -386,8 +386,8 @@ static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
return ret;
}
-static int dbgp_control_msg(unsigned devnum, int requesttype, int request,
- int value, int index, void *data, int size)
+static int __init dbgp_control_msg(unsigned devnum, int requesttype,
+ int request, int value, int index, void *data, int size)
{
u32 pids, addr, ctrl;
struct usb_ctrlrequest req;
@@ -489,7 +489,7 @@ static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc)
return 0;
}
-static int ehci_reset_port(int port)
+static int __init ehci_reset_port(int port)
{
u32 portsc;
u32 delay_time, delay;
@@ -532,7 +532,7 @@ static int ehci_reset_port(int port)
return -EBUSY;
}
-static int ehci_wait_for_port(int port)
+static int __init ehci_wait_for_port(int port)
{
u32 status;
int ret, reps;
@@ -557,13 +557,13 @@ static inline void dbgp_printk(const char *fmt, ...) { }
typedef void (*set_debug_port_t)(int port);
-static void default_set_debug_port(int port)
+static void __init default_set_debug_port(int port)
{
}
-static set_debug_port_t set_debug_port = default_set_debug_port;
+static set_debug_port_t __initdata set_debug_port = default_set_debug_port;
-static void nvidia_set_debug_port(int port)
+static void __init nvidia_set_debug_port(int port)
{
u32 dword;
dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 899e8938e79..c929add475c 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -442,8 +442,7 @@ sysenter_past_esp:
GET_THREAD_INFO(%ebp)
- /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
- testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
jnz sysenter_audit
sysenter_do_call:
cmpl $(nr_syscalls), %eax
@@ -454,7 +453,7 @@ sysenter_do_call:
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
- testw $_TIF_ALLWORK_MASK, %cx
+ testl $_TIF_ALLWORK_MASK, %ecx
jne sysexit_audit
sysenter_exit:
/* if something modifies registers it must also disable sysexit */
@@ -468,7 +467,7 @@ sysenter_exit:
#ifdef CONFIG_AUDITSYSCALL
sysenter_audit:
- testw $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
jnz syscall_trace_entry
addl $4,%esp
CFI_ADJUST_CFA_OFFSET -4
@@ -485,7 +484,7 @@ sysenter_audit:
jmp sysenter_do_call
sysexit_audit:
- testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
+ testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
jne syscall_exit_work
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_ANY)
@@ -498,7 +497,7 @@ sysexit_audit:
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
- testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
+ testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
jne syscall_exit_work
movl PT_EAX(%esp),%eax /* reload syscall return value */
jmp sysenter_exit
@@ -523,8 +522,7 @@ ENTRY(system_call)
SAVE_ALL
GET_THREAD_INFO(%ebp)
# system call tracing in operation / emulation
- /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
- testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
jnz syscall_trace_entry
cmpl $(nr_syscalls), %eax
jae syscall_badsys
@@ -538,7 +536,7 @@ syscall_exit:
# between sampling and the iret
TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
- testw $_TIF_ALLWORK_MASK, %cx # current->work
+ testl $_TIF_ALLWORK_MASK, %ecx # current->work
jne syscall_exit_work
restore_all:
@@ -673,7 +671,7 @@ END(syscall_trace_entry)
# perform syscall exit tracing
ALIGN
syscall_exit_work:
- testb $_TIF_WORK_SYSCALL_EXIT, %cl
+ testl $_TIF_WORK_SYSCALL_EXIT, %ecx
jz work_pending
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 83d1836b946..a331ec38af9 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -368,6 +368,7 @@ ENTRY(save_rest)
END(save_rest)
/* save complete stack frame */
+ .pushsection .kprobes.text, "ax"
ENTRY(save_paranoid)
XCPT_FRAME 1 RDI+8
cld
@@ -396,6 +397,7 @@ ENTRY(save_paranoid)
1: ret
CFI_ENDPROC
END(save_paranoid)
+ .popsection
/*
* A newly forked process directly context switches into this address.
@@ -416,7 +418,6 @@ ENTRY(ret_from_fork)
GET_THREAD_INFO(%rcx)
- CFI_REMEMBER_STATE
RESTORE_REST
testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
@@ -428,7 +429,6 @@ ENTRY(ret_from_fork)
RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
jmp ret_from_sys_call # go to the SYSRET fastpath
- CFI_RESTORE_STATE
CFI_ENDPROC
END(ret_from_fork)
@@ -984,6 +984,8 @@ apicinterrupt UV_BAU_MESSAGE \
#endif
apicinterrupt LOCAL_TIMER_VECTOR \
apic_timer_interrupt smp_apic_timer_interrupt
+apicinterrupt GENERIC_INTERRUPT_VECTOR \
+ generic_interrupt smp_generic_interrupt
#ifdef CONFIG_SMP
apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index ac108d1fe18..3f8579f8d42 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -18,7 +18,7 @@ void __init i386_start_kernel(void)
{
reserve_trampoline_memory();
- reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
+ reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
#ifdef CONFIG_BLK_DEV_INITRD
/* Reserve INITRD */
@@ -29,9 +29,6 @@ void __init i386_start_kernel(void)
reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
}
#endif
- reserve_early(init_pg_tables_start, init_pg_tables_end,
- "INIT_PG_TABLE");
-
reserve_ebda_region();
/*
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index f5b27224769..70eaa852c73 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -100,7 +100,7 @@ void __init x86_64_start_reservations(char *real_mode_data)
reserve_trampoline_memory();
- reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
+ reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
#ifdef CONFIG_BLK_DEV_INITRD
/* Reserve INITRD */
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index c32ca19d591..30683883e0c 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -38,42 +38,40 @@
#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id
/*
- * This is how much memory *in addition to the memory covered up to
- * and including _end* we need mapped initially.
+ * This is how much memory in addition to the memory covered up to
+ * and including _end we need mapped initially.
* We need:
- * - one bit for each possible page, but only in low memory, which means
- * 2^32/4096/8 = 128K worst case (4G/4G split.)
- * - enough space to map all low memory, which means
- * (2^32/4096) / 1024 pages (worst case, non PAE)
- * (2^32/4096) / 512 + 4 pages (worst case for PAE)
- * - a few pages for allocator use before the kernel pagetable has
- * been set up
+ * (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE)
+ * (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE)
*
* Modulo rounding, each megabyte assigned here requires a kilobyte of
* memory, which is currently unreclaimed.
*
* This should be a multiple of a page.
+ *
+ * KERNEL_IMAGE_SIZE should be greater than pa(_end)
+ * and small than max_low_pfn, otherwise will waste some page table entries
*/
-LOW_PAGES = 1<<(32-PAGE_SHIFT_asm)
-
-/*
- * To preserve the DMA pool in PAGEALLOC kernels, we'll allocate
- * pagetables from above the 16MB DMA limit, so we'll have to set
- * up pagetables 16MB more (worst-case):
- */
-#ifdef CONFIG_DEBUG_PAGEALLOC
-LOW_PAGES = LOW_PAGES + 0x1000000
-#endif
#if PTRS_PER_PMD > 1
-PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD
+#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD)
#else
-PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PGD)
+#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
#endif
-BOOTBITMAP_SIZE = LOW_PAGES / 8
-ALLOCATOR_SLOP = 4
-INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm
+/* Enough space to fit pagetables for the low memory linear map */
+MAPPING_BEYOND_END = \
+ PAGE_TABLE_SIZE(((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) << PAGE_SHIFT
+
+/*
+ * Worst-case size of the kernel mapping we need to make:
+ * the worst-case size of the kernel itself, plus the extra we need
+ * to map for the linear map.
+ */
+KERNEL_PAGES = (KERNEL_IMAGE_SIZE + MAPPING_BEYOND_END)>>PAGE_SHIFT
+
+INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE_asm
+RESERVE_BRK(pagetables, INIT_MAP_SIZE)
/*
* 32-bit kernel entrypoint; only used by the boot CPU. On entry,
@@ -166,10 +164,10 @@ num_subarch_entries = (. - subarch_entries) / 4
/*
* Initialize page tables. This creates a PDE and a set of page
- * tables, which are located immediately beyond _end. The variable
- * init_pg_tables_end is set up to point to the first "safe" location.
+ * tables, which are located immediately beyond __brk_base. The variable
+ * _brk_end is set up to point to the first "safe" location.
* Mappings are created both at virtual address 0 (identity mapping)
- * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END.
+ * and PAGE_OFFSET for up to _end.
*
* Note that the stack is not yet set up!
*/
@@ -190,8 +188,7 @@ default_entry:
xorl %ebx,%ebx /* %ebx is kept at zero */
- movl $pa(pg0), %edi
- movl %edi, pa(init_pg_tables_start)
+ movl $pa(__brk_base), %edi
movl $pa(swapper_pg_pmd), %edx
movl $PTE_IDENT_ATTR, %eax
10:
@@ -209,14 +206,14 @@ default_entry:
loop 11b
/*
- * End condition: we must map up to and including INIT_MAP_BEYOND_END
- * bytes beyond the end of our own page tables.
+ * End condition: we must map up to the end + MAPPING_BEYOND_END.
*/
- leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
+ movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
cmpl %ebp,%eax
jb 10b
1:
- movl %edi,pa(init_pg_tables_end)
+ addl $__PAGE_OFFSET, %edi
+ movl %edi, pa(_brk_end)
shrl $12, %eax
movl %eax, pa(max_pfn_mapped)
@@ -227,8 +224,7 @@ default_entry:
page_pde_offset = (__PAGE_OFFSET >> 20);
- movl $pa(pg0), %edi
- movl %edi, pa(init_pg_tables_start)
+ movl $pa(__brk_base), %edi
movl $pa(swapper_pg_dir), %edx
movl $PTE_IDENT_ATTR, %eax
10:
@@ -242,14 +238,13 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
addl $0x1000,%eax
loop 11b
/*
- * End condition: we must map up to and including INIT_MAP_BEYOND_END
- * bytes beyond the end of our own page tables; the +0x007 is
- * the attribute bits
+ * End condition: we must map up to the end + MAPPING_BEYOND_END.
*/
- leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
+ movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
cmpl %ebp,%eax
jb 10b
- movl %edi,pa(init_pg_tables_end)
+ addl $__PAGE_OFFSET, %edi
+ movl %edi, pa(_brk_end)
shrl $12, %eax
movl %eax, pa(max_pfn_mapped)
@@ -636,6 +631,7 @@ swapper_pg_fixmap:
.fill 1024,4,0
ENTRY(empty_zero_page)
.fill 4096,1,0
+
/*
* This starts the data section.
*/
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 10f92fb532f..3475440baa5 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -3,17 +3,17 @@
*
*/
#include <linux/clockchips.h>
-#include <linux/init.h>
#include <linux/interrupt.h>
+#include <linux/spinlock.h>
#include <linux/jiffies.h>
#include <linux/module.h>
-#include <linux/spinlock.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/io.h>
-#include <asm/smp.h>
-#include <asm/delay.h>
#include <asm/i8253.h>
-#include <asm/io.h>
#include <asm/hpet.h>
+#include <asm/smp.h>
DEFINE_SPINLOCK(i8253_lock);
EXPORT_SYMBOL(i8253_lock);
@@ -40,7 +40,7 @@ static void init_pit_timer(enum clock_event_mode mode,
{
spin_lock(&i8253_lock);
- switch(mode) {
+ switch (mode) {
case CLOCK_EVT_MODE_PERIODIC:
/* binary, mode 2, LSB/MSB, ch 0 */
outb_pit(0x34, PIT_MODE);
@@ -95,7 +95,7 @@ static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
* registered. This mechanism replaces the previous #ifdef LOCAL_APIC -
* !using_apic_timer decisions in do_timer_interrupt_hook()
*/
-static struct clock_event_device pit_clockevent = {
+static struct clock_event_device pit_ce = {
.name = "pit",
.features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
.set_mode = init_pit_timer,
@@ -114,15 +114,13 @@ void __init setup_pit_timer(void)
* Start pit with the boot cpu mask and make it global after the
* IO_APIC has been initialized.
*/
- pit_clockevent.cpumask = cpumask_of(smp_processor_id());
- pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC,
- pit_clockevent.shift);
- pit_clockevent.max_delta_ns =
- clockevent_delta2ns(0x7FFF, &pit_clockevent);
- pit_clockevent.min_delta_ns =
- clockevent_delta2ns(0xF, &pit_clockevent);
- clockevents_register_device(&pit_clockevent);
- global_clock_event = &pit_clockevent;
+ pit_ce.cpumask = cpumask_of(smp_processor_id());
+ pit_ce.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, pit_ce.shift);
+ pit_ce.max_delta_ns = clockevent_delta2ns(0x7FFF, &pit_ce);
+ pit_ce.min_delta_ns = clockevent_delta2ns(0xF, &pit_ce);
+
+ clockevents_register_device(&pit_ce);
+ global_clock_event = &pit_ce;
}
#ifndef CONFIG_X86_64
@@ -133,11 +131,11 @@ void __init setup_pit_timer(void)
*/
static cycle_t pit_read(void)
{
+ static int old_count;
+ static u32 old_jifs;
unsigned long flags;
int count;
u32 jifs;
- static int old_count;
- static u32 old_jifs;
spin_lock_irqsave(&i8253_lock, flags);
/*
@@ -179,9 +177,9 @@ static cycle_t pit_read(void)
* Previous attempts to handle these cases intelligently were
* buggy, so we just do the simple thing now.
*/
- if (count > old_count && jifs == old_jifs) {
+ if (count > old_count && jifs == old_jifs)
count = old_count;
- }
+
old_count = count;
old_jifs = jifs;
@@ -192,13 +190,13 @@ static cycle_t pit_read(void)
return (cycle_t)(jifs * LATCH) + count;
}
-static struct clocksource clocksource_pit = {
- .name = "pit",
- .rating = 110,
- .read = pit_read,
- .mask = CLOCKSOURCE_MASK(32),
- .mult = 0,
- .shift = 20,
+static struct clocksource pit_cs = {
+ .name = "pit",
+ .rating = 110,
+ .read = pit_read,
+ .mask = CLOCKSOURCE_MASK(32),
+ .mult = 0,
+ .shift = 20,
};
static void pit_disable_clocksource(void)
@@ -206,9 +204,9 @@ static void pit_disable_clocksource(void)
/*
* Use mult to check whether it is registered or not
*/
- if (clocksource_pit.mult) {
- clocksource_unregister(&clocksource_pit);
- clocksource_pit.mult = 0;
+ if (pit_cs.mult) {
+ clocksource_unregister(&pit_cs);
+ pit_cs.mult = 0;
}
}
@@ -222,13 +220,13 @@ static int __init init_pit_clocksource(void)
* - when local APIC timer is active (PIT is switched off)
*/
if (num_possible_cpus() > 1 || is_hpet_enabled() ||
- pit_clockevent.mode != CLOCK_EVT_MODE_PERIODIC)
+ pit_ce.mode != CLOCK_EVT_MODE_PERIODIC)
return 0;
- clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE,
- clocksource_pit.shift);
- return clocksource_register(&clocksource_pit);
+ pit_cs.mult = clocksource_hz2mult(CLOCK_TICK_RATE, pit_cs.shift);
+
+ return clocksource_register(&pit_cs);
}
arch_initcall(init_pit_clocksource);
-#endif
+#endif /* !CONFIG_X86_64 */
diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c
index 720d2607aac..a979b5bd2fc 100644
--- a/arch/x86/kernel/io_delay.c
+++ b/arch/x86/kernel/io_delay.c
@@ -7,10 +7,10 @@
*/
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/init.h>
#include <linux/delay.h>
+#include <linux/init.h>
#include <linux/dmi.h>
-#include <asm/io.h>
+#include <linux/io.h>
int io_delay_type __read_mostly = CONFIG_DEFAULT_IO_DELAY_TYPE;
@@ -47,8 +47,7 @@ EXPORT_SYMBOL(native_io_delay);
static int __init dmi_io_delay_0xed_port(const struct dmi_system_id *id)
{
if (io_delay_type == CONFIG_IO_DELAY_TYPE_0X80) {
- printk(KERN_NOTICE "%s: using 0xed I/O delay port\n",
- id->ident);
+ pr_notice("%s: using 0xed I/O delay port\n", id->ident);
io_delay_type = CONFIG_IO_DELAY_TYPE_0XED;
}
@@ -64,40 +63,40 @@ static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = {
.callback = dmi_io_delay_0xed_port,
.ident = "Compaq Presario V6000",
.matches = {
- DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
- DMI_MATCH(DMI_BOARD_NAME, "30B7")
+ DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
+ DMI_MATCH(DMI_BOARD_NAME, "30B7")
}
},
{
.callback = dmi_io_delay_0xed_port,
.ident = "HP Pavilion dv9000z",
.matches = {
- DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
- DMI_MATCH(DMI_BOARD_NAME, "30B9")
+ DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
+ DMI_MATCH(DMI_BOARD_NAME, "30B9")
}
},
{
.callback = dmi_io_delay_0xed_port,
.ident = "HP Pavilion dv6000",
.matches = {
- DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
- DMI_MATCH(DMI_BOARD_NAME, "30B8")
+ DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
+ DMI_MATCH(DMI_BOARD_NAME, "30B8")
}
},
{
.callback = dmi_io_delay_0xed_port,
.ident = "HP Pavilion tx1000",
.matches = {
- DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
- DMI_MATCH(DMI_BOARD_NAME, "30BF")
+ DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
+ DMI_MATCH(DMI_BOARD_NAME, "30BF")
}
},
{
.callback = dmi_io_delay_0xed_port,
.ident = "Presario F700",
.matches = {
- DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
- DMI_MATCH(DMI_BOARD_NAME, "30D3")
+ DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
+ DMI_MATCH(DMI_BOARD_NAME, "30D3")
}
},
{ }
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index f13ca1650aa..3aaf7b9e3a8 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -15,6 +15,9 @@
atomic_t irq_err_count;
+/* Function pointer for generic interrupt vector handling */
+void (*generic_interrupt_extension)(void) = NULL;
+
/*
* 'what should we do if we get a hw irq event on an illegal vector'.
* each architecture has to answer this themselves.
@@ -42,55 +45,60 @@ void ack_bad_irq(unsigned int irq)
/*
* /proc/interrupts printing:
*/
-static int show_other_interrupts(struct seq_file *p)
+static int show_other_interrupts(struct seq_file *p, int prec)
{
int j;
- seq_printf(p, "NMI: ");
+ seq_printf(p, "%*s: ", prec, "NMI");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->__nmi_count);
seq_printf(p, " Non-maskable interrupts\n");
#ifdef CONFIG_X86_LOCAL_APIC
- seq_printf(p, "LOC: ");
+ seq_printf(p, "%*s: ", prec, "LOC");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
seq_printf(p, " Local timer interrupts\n");
+
+ seq_printf(p, "%*s: ", prec, "SPU");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
+ seq_printf(p, " Spurious interrupts\n");
#endif
+ if (generic_interrupt_extension) {
+ seq_printf(p, "PLT: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->generic_irqs);
+ seq_printf(p, " Platform interrupts\n");
+ }
#ifdef CONFIG_SMP
- seq_printf(p, "RES: ");
+ seq_printf(p, "%*s: ", prec, "RES");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count);
seq_printf(p, " Rescheduling interrupts\n");
- seq_printf(p, "CAL: ");
+ seq_printf(p, "%*s: ", prec, "CAL");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_call_count);
seq_printf(p, " Function call interrupts\n");
- seq_printf(p, "TLB: ");
+ seq_printf(p, "%*s: ", prec, "TLB");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
seq_printf(p, " TLB shootdowns\n");
#endif
#ifdef CONFIG_X86_MCE
- seq_printf(p, "TRM: ");
+ seq_printf(p, "%*s: ", prec, "TRM");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
seq_printf(p, " Thermal event interrupts\n");
# ifdef CONFIG_X86_64
- seq_printf(p, "THR: ");
+ seq_printf(p, "%*s: ", prec, "THR");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
seq_printf(p, " Threshold APIC interrupts\n");
# endif
#endif
-#ifdef CONFIG_X86_LOCAL_APIC
- seq_printf(p, "SPU: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
- seq_printf(p, " Spurious interrupts\n");
-#endif
- seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
+ seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
#if defined(CONFIG_X86_IO_APIC)
- seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
+ seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count));
#endif
return 0;
}
@@ -98,19 +106,22 @@ static int show_other_interrupts(struct seq_file *p)
int show_interrupts(struct seq_file *p, void *v)
{
unsigned long flags, any_count = 0;
- int i = *(loff_t *) v, j;
+ int i = *(loff_t *) v, j, prec;
struct irqaction *action;
struct irq_desc *desc;
if (i > nr_irqs)
return 0;
+ for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
+ j *= 10;
+
if (i == nr_irqs)
- return show_other_interrupts(p);
+ return show_other_interrupts(p, prec);
/* print header */
if (i == 0) {
- seq_printf(p, " ");
+ seq_printf(p, "%*s", prec + 8, "");
for_each_online_cpu(j)
seq_printf(p, "CPU%-8d", j);
seq_putc(p, '\n');
@@ -121,23 +132,15 @@ int show_interrupts(struct seq_file *p, void *v)
return 0;
spin_lock_irqsave(&desc->lock, flags);
-#ifndef CONFIG_SMP
- any_count = kstat_irqs(i);
-#else
for_each_online_cpu(j)
any_count |= kstat_irqs_cpu(i, j);
-#endif
action = desc->action;
if (!action && !any_count)
goto out;
- seq_printf(p, "%3d: ", i);
-#ifndef CONFIG_SMP
- seq_printf(p, "%10u ", kstat_irqs(i));
-#else
+ seq_printf(p, "%*d: ", prec, i);
for_each_online_cpu(j)
seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
-#endif
seq_printf(p, " %8s", desc->chip->name);
seq_printf(p, "-%-8s", desc->name);
@@ -162,7 +165,10 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
#ifdef CONFIG_X86_LOCAL_APIC
sum += irq_stats(cpu)->apic_timer_irqs;
+ sum += irq_stats(cpu)->irq_spurious_count;
#endif
+ if (generic_interrupt_extension)
+ sum += irq_stats(cpu)->generic_irqs;
#ifdef CONFIG_SMP
sum += irq_stats(cpu)->irq_resched_count;
sum += irq_stats(cpu)->irq_call_count;
@@ -174,9 +180,6 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
sum += irq_stats(cpu)->irq_threshold_count;
#endif
#endif
-#ifdef CONFIG_X86_LOCAL_APIC
- sum += irq_stats(cpu)->irq_spurious_count;
-#endif
return sum;
}
@@ -226,4 +229,27 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
return 1;
}
+/*
+ * Handler for GENERIC_INTERRUPT_VECTOR.
+ */
+void smp_generic_interrupt(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ ack_APIC_irq();
+
+ exit_idle();
+
+ irq_enter();
+
+ inc_irq_stat(generic_irqs);
+
+ if (generic_interrupt_extension)
+ generic_interrupt_extension();
+
+ irq_exit();
+
+ set_irq_regs(old_regs);
+}
+
EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 50b8c3a3006..bc132610544 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -175,6 +175,9 @@ void __init native_init_IRQ(void)
/* self generated IPI for local APIC timer */
alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
+ /* generic IPI for platform specific use */
+ alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
+
/* IPI vectors for APIC spurious and error interrupts */
alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index da481a1e3f3..c7a49e0ffbf 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -147,6 +147,9 @@ static void __init apic_intr_init(void)
/* self generated IPI for local APIC timer */
alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
+ /* generic IPI for platform specific use */
+ alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
+
/* IPI vectors for APIC spurious and error interrupts */
alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index ff7d3b0124f..e444357375c 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -8,11 +8,11 @@
*/
#include <linux/debugfs.h>
#include <linux/uaccess.h>
-#include <linux/stat.h>
+#include <linux/module.h>
#include <linux/init.h>
+#include <linux/stat.h>
#include <linux/io.h>
#include <linux/mm.h>
-#include <linux/module.h>
#include <asm/setup.h>
@@ -26,9 +26,8 @@ struct setup_data_node {
u32 len;
};
-static ssize_t
-setup_data_read(struct file *file, char __user *user_buf, size_t count,
- loff_t *ppos)
+static ssize_t setup_data_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
{
struct setup_data_node *node = file->private_data;
unsigned long remain;
@@ -39,20 +38,21 @@ setup_data_read(struct file *file, char __user *user_buf, size_t count,
if (pos < 0)
return -EINVAL;
+
if (pos >= node->len)
return 0;
if (count > node->len - pos)
count = node->len - pos;
+
pa = node->paddr + sizeof(struct setup_data) + pos;
pg = pfn_to_page((pa + count - 1) >> PAGE_SHIFT);
if (PageHighMem(pg)) {
p = ioremap_cache(pa, count);
if (!p)
return -ENXIO;
- } else {
+ } else
p = __va(pa);
- }
remain = copy_to_user(user_buf, p, count);
@@ -70,12 +70,13 @@ setup_data_read(struct file *file, char __user *user_buf, size_t count,
static int setup_data_open(struct inode *inode, struct file *file)
{
file->private_data = inode->i_private;
+
return 0;
}
static const struct file_operations fops_setup_data = {
- .read = setup_data_read,
- .open = setup_data_open,
+ .read = setup_data_read,
+ .open = setup_data_open,
};
static int __init
@@ -84,57 +85,50 @@ create_setup_data_node(struct dentry *parent, int no,
{
struct dentry *d, *type, *data;
char buf[16];
- int error;
sprintf(buf, "%d", no);
d = debugfs_create_dir(buf, parent);
- if (!d) {
- error = -ENOMEM;
- goto err_return;
- }
+ if (!d)
+ return -ENOMEM;
+
type = debugfs_create_x32("type", S_IRUGO, d, &node->type);
- if (!type) {
- error = -ENOMEM;
+ if (!type)
goto err_dir;
- }
+
data = debugfs_create_file("data", S_IRUGO, d, node, &fops_setup_data);
- if (!data) {
- error = -ENOMEM;
+ if (!data)
goto err_type;
- }
+
return 0;
err_type:
debugfs_remove(type);
err_dir:
debugfs_remove(d);
-err_return:
- return error;
+ return -ENOMEM;
}
static int __init create_setup_data_nodes(struct dentry *parent)
{
struct setup_data_node *node;
struct setup_data *data;
- int error, no = 0;
+ int error = -ENOMEM;
struct dentry *d;
struct page *pg;
u64 pa_data;
+ int no = 0;
d = debugfs_create_dir("setup_data", parent);
- if (!d) {
- error = -ENOMEM;
- goto err_return;
- }
+ if (!d)
+ return -ENOMEM;
pa_data = boot_params.hdr.setup_data;
while (pa_data) {
node = kmalloc(sizeof(*node), GFP_KERNEL);
- if (!node) {
- error = -ENOMEM;
+ if (!node)
goto err_dir;
- }
+
pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT);
if (PageHighMem(pg)) {
data = ioremap_cache(pa_data, sizeof(*data));
@@ -143,9 +137,8 @@ static int __init create_setup_data_nodes(struct dentry *parent)
error = -ENXIO;
goto err_dir;
}
- } else {
+ } else
data = __va(pa_data);
- }
node->paddr = pa_data;
node->type = data->type;
@@ -159,11 +152,11 @@ static int __init create_setup_data_nodes(struct dentry *parent)
goto err_dir;
no++;
}
+
return 0;
err_dir:
debugfs_remove(d);
-err_return:
return error;
}
@@ -175,28 +168,26 @@ static struct debugfs_blob_wrapper boot_params_blob = {
static int __init boot_params_kdebugfs_init(void)
{
struct dentry *dbp, *version, *data;
- int error;
+ int error = -ENOMEM;
dbp = debugfs_create_dir("boot_params", NULL);
- if (!dbp) {
- error = -ENOMEM;
- goto err_return;
- }
+ if (!dbp)
+ return -ENOMEM;
+
version = debugfs_create_x16("version", S_IRUGO, dbp,
&boot_params.hdr.version);
- if (!version) {
- error = -ENOMEM;
+ if (!version)
goto err_dir;
- }
+
data = debugfs_create_blob("data", S_IRUGO, dbp,
&boot_params_blob);
- if (!data) {
- error = -ENOMEM;
+ if (!data)
goto err_version;
- }
+
error = create_setup_data_nodes(dbp);
if (error)
goto err_data;
+
return 0;
err_data:
@@ -205,10 +196,9 @@ err_version:
debugfs_remove(version);
err_dir:
debugfs_remove(dbp);
-err_return:
return error;
}
-#endif
+#endif /* CONFIG_DEBUG_BOOT_PARAMS */
static int __init arch_kdebugfs_init(void)
{
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 4558dd3918c..55b94614e34 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -193,7 +193,7 @@ static int __kprobes can_boost(kprobe_opcode_t *opcodes)
kprobe_opcode_t opcode;
kprobe_opcode_t *orig_opcodes = opcodes;
- if (search_exception_tables(opcodes))
+ if (search_exception_tables((unsigned long)opcodes))
return 0; /* Page fault may occur on this address. */
retry:
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 478bca986ec..33019ddb56b 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -138,12 +138,6 @@ static void kvm_set_pte_atomic(pte_t *ptep, pte_t pte)
kvm_mmu_write(ptep, pte_val(pte));
}
-static void kvm_set_pte_present(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pte)
-{
- kvm_mmu_write(ptep, pte_val(pte));
-}
-
static void kvm_pte_clear(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
@@ -220,7 +214,6 @@ static void paravirt_ops_setup(void)
#if PAGETABLE_LEVELS >= 3
#ifdef CONFIG_X86_PAE
pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic;
- pv_mmu_ops.set_pte_present = kvm_set_pte_present;
pv_mmu_ops.pte_clear = kvm_pte_clear;
pv_mmu_ops.pmd_clear = kvm_pmd_clear;
#endif
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index f5fc8c781a6..e7368c1da01 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -14,12 +14,12 @@
#include <linux/ftrace.h>
#include <linux/suspend.h>
#include <linux/gfp.h>
+#include <linux/io.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
-#include <asm/io.h>
#include <asm/apic.h>
#include <asm/cpufeature.h>
#include <asm/desc.h>
@@ -63,7 +63,7 @@ static void load_segments(void)
"\tmovl %%eax,%%fs\n"
"\tmovl %%eax,%%gs\n"
"\tmovl %%eax,%%ss\n"
- ::: "eax", "memory");
+ : : : "eax", "memory");
#undef STR
#undef __STR
}
@@ -205,7 +205,8 @@ void machine_kexec(struct kimage *image)
if (image->preserve_context) {
#ifdef CONFIG_X86_IO_APIC
- /* We need to put APICs in legacy mode so that we can
+ /*
+ * We need to put APICs in legacy mode so that we can
* get timer interrupts in second kernel. kexec/kdump
* paths already have calls to disable_IO_APIC() in
* one form or other. kexec jump path also need
@@ -227,7 +228,8 @@ void machine_kexec(struct kimage *image)
page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
<< PAGE_SHIFT);
- /* The segment registers are funny things, they have both a
+ /*
+ * The segment registers are funny things, they have both a
* visible and an invisible part. Whenever the visible part is
* set to a specific selector, the invisible part is loaded
* with from a table in memory. At no other time is the
@@ -237,11 +239,12 @@ void machine_kexec(struct kimage *image)
* segments, before I zap the gdt with an invalid value.
*/
load_segments();
- /* The gdt & idt are now invalid.
+ /*
+ * The gdt & idt are now invalid.
* If you want to load them you must set up your own idt & gdt.
*/
- set_gdt(phys_to_virt(0),0);
- set_idt(phys_to_virt(0),0);
+ set_gdt(phys_to_virt(0), 0);
+ set_idt(phys_to_virt(0), 0);
/* now call it */
image->start = relocate_kernel_ptr((unsigned long)image->head,
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 6993d51b7fd..89cea4d4467 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -12,11 +12,47 @@
#include <linux/reboot.h>
#include <linux/numa.h>
#include <linux/ftrace.h>
+#include <linux/io.h>
+#include <linux/suspend.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
-#include <asm/io.h>
+
+static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
+ unsigned long addr)
+{
+ pud_t *pud;
+ pmd_t *pmd;
+ struct page *page;
+ int result = -ENOMEM;
+
+ addr &= PMD_MASK;
+ pgd += pgd_index(addr);
+ if (!pgd_present(*pgd)) {
+ page = kimage_alloc_control_pages(image, 0);
+ if (!page)
+ goto out;
+ pud = (pud_t *)page_address(page);
+ memset(pud, 0, PAGE_SIZE);
+ set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
+ }
+ pud = pud_offset(pgd, addr);
+ if (!pud_present(*pud)) {
+ page = kimage_alloc_control_pages(image, 0);
+ if (!page)
+ goto out;
+ pmd = (pmd_t *)page_address(page);
+ memset(pmd, 0, PAGE_SIZE);
+ set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+ }
+ pmd = pmd_offset(pud, addr);
+ if (!pmd_present(*pmd))
+ set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
+ result = 0;
+out:
+ return result;
+}
static void init_level2_page(pmd_t *level2p, unsigned long addr)
{
@@ -83,9 +119,8 @@ static int init_level4_page(struct kimage *image, pgd_t *level4p,
}
level3p = (pud_t *)page_address(page);
result = init_level3_page(image, level3p, addr, last_addr);
- if (result) {
+ if (result)
goto out;
- }
set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
addr += PGDIR_SIZE;
}
@@ -156,6 +191,13 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT);
if (result)
return result;
+ /*
+ * image->start may be outside 0 ~ max_pfn, for example when
+ * jump back to original kernel from kexeced kernel
+ */
+ result = init_one_level2_page(image, level4p, image->start);
+ if (result)
+ return result;
return init_transition_pgtable(image, level4p);
}
@@ -229,20 +271,45 @@ void machine_kexec(struct kimage *image)
{
unsigned long page_list[PAGES_NR];
void *control_page;
+ int save_ftrace_enabled;
- tracer_disable();
+#ifdef CONFIG_KEXEC_JUMP
+ if (kexec_image->preserve_context)
+ save_processor_state();
+#endif
+
+ save_ftrace_enabled = __ftrace_enabled_save();
/* Interrupts aren't acceptable while we reboot */
local_irq_disable();
+ if (image->preserve_context) {
+#ifdef CONFIG_X86_IO_APIC
+ /*
+ * We need to put APICs in legacy mode so that we can
+ * get timer interrupts in second kernel. kexec/kdump
+ * paths already have calls to disable_IO_APIC() in
+ * one form or other. kexec jump path also need
+ * one.
+ */
+ disable_IO_APIC();
+#endif
+ }
+
control_page = page_address(image->control_code_page) + PAGE_SIZE;
- memcpy(control_page, relocate_kernel, PAGE_SIZE);
+ memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
+ page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
page_list[PA_TABLE_PAGE] =
(unsigned long)__pa(page_address(image->control_code_page));
- /* The segment registers are funny things, they have both a
+ if (image->type == KEXEC_TYPE_DEFAULT)
+ page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
+ << PAGE_SHIFT);
+
+ /*
+ * The segment registers are funny things, they have both a
* visible and an invisible part. Whenever the visible part is
* set to a specific selector, the invisible part is loaded
* with from a table in memory. At no other time is the
@@ -252,15 +319,25 @@ void machine_kexec(struct kimage *image)
* segments, before I zap the gdt with an invalid value.
*/
load_segments();
- /* The gdt & idt are now invalid.
+ /*
+ * The gdt & idt are now invalid.
* If you want to load them you must set up your own idt & gdt.
*/
- set_gdt(phys_to_virt(0),0);
- set_idt(phys_to_virt(0),0);
+ set_gdt(phys_to_virt(0), 0);
+ set_idt(phys_to_virt(0), 0);
/* now call it */
- relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
- image->start);
+ image->start = relocate_kernel((unsigned long)image->head,
+ (unsigned long)page_list,
+ image->start,
+ image->preserve_context);
+
+#ifdef CONFIG_KEXEC_JUMP
+ if (kexec_image->preserve_context)
+ restore_processor_state();
+#endif
+
+ __ftrace_enabled_restore(save_ftrace_enabled);
}
void arch_crash_save_vmcoreinfo(void)
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index 666e43df51f..712d15fdc41 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -226,7 +226,7 @@ static int __devinit set_check_enable_amd_mmconf(const struct dmi_system_id *d)
return 0;
}
-static struct dmi_system_id __devinitdata mmconf_dmi_table[] = {
+static const struct dmi_system_id __cpuinitconst mmconf_dmi_table[] = {
{
.callback = set_check_enable_amd_mmconf,
.ident = "Sun Microsystems Machine",
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 37cb1bda1ba..dce99dca6cf 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -109,9 +109,6 @@ static void __init MP_bus_info(struct mpc_bus *m)
} else
printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
}
-#endif
-
-#ifdef CONFIG_X86_IO_APIC
static int bad_ioapic(unsigned long address)
{
@@ -224,8 +221,12 @@ static void __init MP_intsrc_info(struct mpc_intsrc *m)
if (++mp_irq_entries == MAX_IRQ_SOURCES)
panic("Max # of irq sources exceeded!!\n");
}
+#else /* CONFIG_X86_IO_APIC */
+static inline void __init MP_bus_info(struct mpc_bus *m) {}
+static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {}
+static inline void __init MP_intsrc_info(struct mpc_intsrc *m) {}
+#endif /* CONFIG_X86_IO_APIC */
-#endif
static void __init MP_lintsrc_info(struct mpc_lintsrc *m)
{
@@ -275,6 +276,20 @@ static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str)
return 1;
}
+static void skip_entry(unsigned char **ptr, int *count, int size)
+{
+ *ptr += size;
+ *count += size;
+}
+
+static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt)
+{
+ printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"
+ "type %x\n", *mpt);
+ print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
+ 1, mpc, mpc->length, 1);
+}
+
static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
{
char str[16];
@@ -310,61 +325,30 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
while (count < mpc->length) {
switch (*mpt) {
case MP_PROCESSOR:
- {
- struct mpc_cpu *m = (struct mpc_cpu *)mpt;
- /* ACPI may have already provided this data */
- if (!acpi_lapic)
- MP_processor_info(m);
- mpt += sizeof(*m);
- count += sizeof(*m);
- break;
- }
+ /* ACPI may have already provided this data */
+ if (!acpi_lapic)
+ MP_processor_info((struct mpc_cpu *)mpt);
+ skip_entry(&mpt, &count, sizeof(struct mpc_cpu));
+ break;
case MP_BUS:
- {
- struct mpc_bus *m = (struct mpc_bus *)mpt;
-#ifdef CONFIG_X86_IO_APIC
- MP_bus_info(m);
-#endif
- mpt += sizeof(*m);
- count += sizeof(*m);
- break;
- }
+ MP_bus_info((struct mpc_bus *)mpt);
+ skip_entry(&mpt, &count, sizeof(struct mpc_bus));
+ break;
case MP_IOAPIC:
- {
-#ifdef CONFIG_X86_IO_APIC
- struct mpc_ioapic *m = (struct mpc_ioapic *)mpt;
- MP_ioapic_info(m);
-#endif
- mpt += sizeof(struct mpc_ioapic);
- count += sizeof(struct mpc_ioapic);
- break;
- }
+ MP_ioapic_info((struct mpc_ioapic *)mpt);
+ skip_entry(&mpt, &count, sizeof(struct mpc_ioapic));
+ break;
case MP_INTSRC:
- {
-#ifdef CONFIG_X86_IO_APIC
- struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
-
- MP_intsrc_info(m);
-#endif
- mpt += sizeof(struct mpc_intsrc);
- count += sizeof(struct mpc_intsrc);
- break;
- }
+ MP_intsrc_info((struct mpc_intsrc *)mpt);
+ skip_entry(&mpt, &count, sizeof(struct mpc_intsrc));
+ break;
case MP_LINTSRC:
- {
- struct mpc_lintsrc *m =
- (struct mpc_lintsrc *)mpt;
- MP_lintsrc_info(m);
- mpt += sizeof(*m);
- count += sizeof(*m);
- break;
- }
+ MP_lintsrc_info((struct mpc_lintsrc *)mpt);
+ skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc));
+ break;
default:
/* wrong mptable */
- printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
- printk(KERN_ERR "type %x\n", *mpt);
- print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
- 1, mpc, mpc->length, 1);
+ smp_dump_mptable(mpc, mpt);
count = mpc->length;
break;
}
@@ -558,6 +542,68 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
static struct mpf_intel *mpf_found;
+static unsigned long __init get_mpc_size(unsigned long physptr)
+{
+ struct mpc_table *mpc;
+ unsigned long size;
+
+ mpc = early_ioremap(physptr, PAGE_SIZE);
+ size = mpc->length;
+ early_iounmap(mpc, PAGE_SIZE);
+ apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size);
+
+ return size;
+}
+
+static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
+{
+ struct mpc_table *mpc;
+ unsigned long size;
+
+ size = get_mpc_size(mpf->physptr);
+ mpc = early_ioremap(mpf->physptr, size);
+ /*
+ * Read the physical hardware table. Anything here will
+ * override the defaults.
+ */
+ if (!smp_read_mpc(mpc, early)) {
+#ifdef CONFIG_X86_LOCAL_APIC
+ smp_found_config = 0;
+#endif
+ printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"
+ "... disabling SMP support. (tell your hw vendor)\n");
+ early_iounmap(mpc, size);
+ return -1;
+ }
+ early_iounmap(mpc, size);
+
+ if (early)
+ return -1;
+
+#ifdef CONFIG_X86_IO_APIC
+ /*
+ * If there are no explicit MP IRQ entries, then we are
+ * broken. We set up most of the low 16 IO-APIC pins to
+ * ISA defaults and hope it will work.
+ */
+ if (!mp_irq_entries) {
+ struct mpc_bus bus;
+
+ printk(KERN_ERR "BIOS bug, no explicit IRQ entries, "
+ "using default mptable. (tell your hw vendor)\n");
+
+ bus.type = MP_BUS;
+ bus.busid = 0;
+ memcpy(bus.bustype, "ISA ", 6);
+ MP_bus_info(&bus);
+
+ construct_default_ioirq_mptable(0);
+ }
+#endif
+
+ return 0;
+}
+
/*
* Scan the memory blocks for an SMP configuration block.
*/
@@ -611,45 +657,8 @@ static void __init __get_smp_config(unsigned int early)
construct_default_ISA_mptable(mpf->feature1);
} else if (mpf->physptr) {
-
- /*
- * Read the physical hardware table. Anything here will
- * override the defaults.
- */
- if (!smp_read_mpc(phys_to_virt(mpf->physptr), early)) {
-#ifdef CONFIG_X86_LOCAL_APIC
- smp_found_config = 0;
-#endif
- printk(KERN_ERR
- "BIOS bug, MP table errors detected!...\n");
- printk(KERN_ERR "... disabling SMP support. "
- "(tell your hw vendor)\n");
- return;
- }
-
- if (early)
+ if (check_physptr(mpf, early))
return;
-#ifdef CONFIG_X86_IO_APIC
- /*
- * If there are no explicit MP IRQ entries, then we are
- * broken. We set up most of the low 16 IO-APIC pins to
- * ISA defaults and hope it will work.
- */
- if (!mp_irq_entries) {
- struct mpc_bus bus;
-
- printk(KERN_ERR "BIOS bug, no explicit IRQ entries, "
- "using default mptable. "
- "(tell your hw vendor)\n");
-
- bus.type = MP_BUS;
- bus.busid = 0;
- memcpy(bus.bustype, "ISA ", 6);
- MP_bus_info(&bus);
-
- construct_default_ioirq_mptable(0);
- }
-#endif
} else
BUG();
@@ -670,6 +679,31 @@ void __init get_smp_config(void)
__get_smp_config(0);
}
+static void smp_reserve_bootmem(struct mpf_intel *mpf)
+{
+ unsigned long size = get_mpc_size(mpf->physptr);
+#ifdef CONFIG_X86_32
+ /*
+ * We cannot access to MPC table to compute table size yet,
+ * as only few megabytes from the bottom is mapped now.
+ * PC-9800's MPC table places on the very last of physical
+ * memory; so that simply reserving PAGE_SIZE from mpf->physptr
+ * yields BUG() in reserve_bootmem.
+ * also need to make sure physptr is below than max_low_pfn
+ * we don't need reserve the area above max_low_pfn
+ */
+ unsigned long end = max_low_pfn * PAGE_SIZE;
+
+ if (mpf->physptr < end) {
+ if (mpf->physptr + size > end)
+ size = end - mpf->physptr;
+ reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
+ }
+#else
+ reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
+#endif
+}
+
static int __init smp_scan_config(unsigned long base, unsigned long length,
unsigned reserve)
{
@@ -697,36 +731,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
if (!reserve)
return 1;
- reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
- BOOTMEM_DEFAULT);
- if (mpf->physptr) {
- unsigned long size = PAGE_SIZE;
-#ifdef CONFIG_X86_32
- /*
- * We cannot access to MPC table to compute
- * table size yet, as only few megabytes from
- * the bottom is mapped now.
- * PC-9800's MPC table places on the very last
- * of physical memory; so that simply reserving
- * PAGE_SIZE from mpf->physptr yields BUG()
- * in reserve_bootmem.
- * also need to make sure physptr is below than
- * max_low_pfn
- * we don't need reserve the area above max_low_pfn
- */
- unsigned long end = max_low_pfn * PAGE_SIZE;
-
- if (mpf->physptr < end) {
- if (mpf->physptr + size > end)
- size = end - mpf->physptr;
- reserve_bootmem_generic(mpf->physptr, size,
- BOOTMEM_DEFAULT);
- }
-#else
- reserve_bootmem_generic(mpf->physptr, size,
+ reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf),
BOOTMEM_DEFAULT);
-#endif
- }
+ if (mpf->physptr)
+ smp_reserve_bootmem(mpf);
return 1;
}
@@ -829,7 +837,57 @@ static int __init get_MP_intsrc_index(struct mpc_intsrc *m)
#define SPARE_SLOT_NUM 20
static struct mpc_intsrc __initdata *m_spare[SPARE_SLOT_NUM];
-#endif
+
+static void check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
+{
+ int i;
+
+ apic_printk(APIC_VERBOSE, "OLD ");
+ print_MP_intsrc_info(m);
+
+ i = get_MP_intsrc_index(m);
+ if (i > 0) {
+ assign_to_mpc_intsrc(&mp_irqs[i], m);
+ apic_printk(APIC_VERBOSE, "NEW ");
+ print_mp_irq_info(&mp_irqs[i]);
+ return;
+ }
+ if (!i) {
+ /* legacy, do nothing */
+ return;
+ }
+ if (*nr_m_spare < SPARE_SLOT_NUM) {
+ /*
+ * not found (-1), or duplicated (-2) are invalid entries,
+ * we need to use the slot later
+ */
+ m_spare[*nr_m_spare] = m;
+ *nr_m_spare += 1;
+ }
+}
+#else /* CONFIG_X86_IO_APIC */
+static inline void check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
+#endif /* CONFIG_X86_IO_APIC */
+
+static int check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length,
+ int count)
+{
+ if (!mpc_new_phys) {
+ pr_info("No spare slots, try to append...take your risk, "
+ "new mpc_length %x\n", count);
+ } else {
+ if (count <= mpc_new_length)
+ pr_info("No spare slots, try to append..., "
+ "new mpc_length %x\n", count);
+ else {
+ pr_err("mpc_new_length %lx is too small\n",
+ mpc_new_length);
+ return -1;
+ }
+ }
+
+ return 0;
+}
static int __init replace_intsrc_all(struct mpc_table *mpc,
unsigned long mpc_new_phys,
@@ -837,77 +895,33 @@ static int __init replace_intsrc_all(struct mpc_table *mpc,
{
#ifdef CONFIG_X86_IO_APIC
int i;
- int nr_m_spare = 0;
#endif
-
int count = sizeof(*mpc);
+ int nr_m_spare = 0;
unsigned char *mpt = ((unsigned char *)mpc) + count;
printk(KERN_INFO "mpc_length %x\n", mpc->length);
while (count < mpc->length) {
switch (*mpt) {
case MP_PROCESSOR:
- {
- struct mpc_cpu *m = (struct mpc_cpu *)mpt;
- mpt += sizeof(*m);
- count += sizeof(*m);
- break;
- }
+ skip_entry(&mpt, &count, sizeof(struct mpc_cpu));
+ break;
case MP_BUS:
- {
- struct mpc_bus *m = (struct mpc_bus *)mpt;
- mpt += sizeof(*m);
- count += sizeof(*m);
- break;
- }
+ skip_entry(&mpt, &count, sizeof(struct mpc_bus));
+ break;
case MP_IOAPIC:
- {
- mpt += sizeof(struct mpc_ioapic);
- count += sizeof(struct mpc_ioapic);
- break;
- }
+ skip_entry(&mpt, &count, sizeof(struct mpc_ioapic));
+ break;
case MP_INTSRC:
- {
-#ifdef CONFIG_X86_IO_APIC
- struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
-
- printk(KERN_INFO "OLD ");
- print_MP_intsrc_info(m);
- i = get_MP_intsrc_index(m);
- if (i > 0) {
- assign_to_mpc_intsrc(&mp_irqs[i], m);
- printk(KERN_INFO "NEW ");
- print_mp_irq_info(&mp_irqs[i]);
- } else if (!i) {
- /* legacy, do nothing */
- } else if (nr_m_spare < SPARE_SLOT_NUM) {
- /*
- * not found (-1), or duplicated (-2)
- * are invalid entries,
- * we need to use the slot later
- */
- m_spare[nr_m_spare] = m;
- nr_m_spare++;
- }
-#endif
- mpt += sizeof(struct mpc_intsrc);
- count += sizeof(struct mpc_intsrc);
- break;
- }
+ check_irq_src((struct mpc_intsrc *)mpt, &nr_m_spare);
+ skip_entry(&mpt, &count, sizeof(struct mpc_intsrc));
+ break;
case MP_LINTSRC:
- {
- struct mpc_lintsrc *m =
- (struct mpc_lintsrc *)mpt;
- mpt += sizeof(*m);
- count += sizeof(*m);
- break;
- }
+ skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc));
+ break;
default:
/* wrong mptable */
- printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
- printk(KERN_ERR "type %x\n", *mpt);
- print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
- 1, mpc, mpc->length, 1);
+ smp_dump_mptable(mpc, mpt);
goto out;
}
}
@@ -924,23 +938,15 @@ static int __init replace_intsrc_all(struct mpc_table *mpc,
continue;
if (nr_m_spare > 0) {
- printk(KERN_INFO "*NEW* found ");
+ apic_printk(APIC_VERBOSE, "*NEW* found\n");
nr_m_spare--;
assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]);
m_spare[nr_m_spare] = NULL;
} else {
struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
count += sizeof(struct mpc_intsrc);
- if (!mpc_new_phys) {
- printk(KERN_INFO "No spare slots, try to append...take your risk, new mpc_length %x\n", count);
- } else {
- if (count <= mpc_new_length)
- printk(KERN_INFO "No spare slots, try to append..., new mpc_length %x\n", count);
- else {
- printk(KERN_ERR "mpc_new_length %lx is too small\n", mpc_new_length);
- goto out;
- }
- }
+ if (!check_slot(mpc_new_phys, mpc_new_length, count))
+ goto out;
assign_to_mpc_intsrc(&mp_irqs[i], m);
mpc->length = count;
mpt += sizeof(struct mpc_intsrc);
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 63dd358d8ee..8e45f446488 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -470,7 +470,6 @@ struct pv_mmu_ops pv_mmu_ops = {
#if PAGETABLE_LEVELS >= 3
#ifdef CONFIG_X86_PAE
.set_pte_atomic = native_set_pte_atomic,
- .set_pte_present = native_set_pte_present,
.pte_clear = native_pte_clear,
.pmd_clear = native_pmd_clear,
#endif
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index c70ab5a5d4c..8b02a3936d4 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -1,14 +1,14 @@
/* Fallback functions when the main IOMMU code is not compiled in. This
code is roughly equivalent to i386. */
-#include <linux/mm.h>
-#include <linux/init.h>
-#include <linux/pci.h>
-#include <linux/string.h>
#include <linux/dma-mapping.h>
#include <linux/scatterlist.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/mm.h>
-#include <asm/iommu.h>
#include <asm/processor.h>
+#include <asm/iommu.h>
#include <asm/dma.h>
static int
@@ -79,11 +79,11 @@ static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr,
}
struct dma_mapping_ops nommu_dma_ops = {
- .alloc_coherent = dma_generic_alloc_coherent,
- .free_coherent = nommu_free_coherent,
- .map_single = nommu_map_single,
- .map_sg = nommu_map_sg,
- .is_phys = 1,
+ .alloc_coherent = dma_generic_alloc_coherent,
+ .free_coherent = nommu_free_coherent,
+ .map_single = nommu_map_single,
+ .map_sg = nommu_map_sg,
+ .is_phys = 1,
};
void __init no_iommu_init(void)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 6afa5232dbb..156f87582c6 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -65,11 +65,11 @@ void exit_thread(void)
{
struct task_struct *me = current;
struct thread_struct *t = &me->thread;
+ unsigned long *bp = t->io_bitmap_ptr;
- if (me->thread.io_bitmap_ptr) {
+ if (bp) {
struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
- kfree(t->io_bitmap_ptr);
t->io_bitmap_ptr = NULL;
clear_thread_flag(TIF_IO_BITMAP);
/*
@@ -78,6 +78,7 @@ void exit_thread(void)
memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
t->io_bitmap_max = 0;
put_cpu();
+ kfree(bp);
}
ds_exit_thread(current);
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 3d9672e59c1..19378715f41 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -685,9 +685,8 @@ static int ptrace_bts_config(struct task_struct *child,
if (!cfg.signal)
return -EINVAL;
- return -EOPNOTSUPP;
-
child->thread.bts_ovfl_signal = cfg.signal;
+ return -EOPNOTSUPP;
}
if ((cfg.flags & PTRACE_BTS_O_ALLOC) &&
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 697d1b78cfb..e95022e4f5d 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -74,8 +74,7 @@ static void ich_force_hpet_resume(void)
if (!force_hpet_address)
return;
- if (rcba_base == NULL)
- BUG();
+ BUG_ON(rcba_base == NULL);
/* read the Function Disable register, dword mode only */
val = readl(rcba_base + 0x3404);
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index 2064d0aa8d2..41235531b11 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -17,7 +17,8 @@
#define PTR(x) (x << 2)
-/* control_page + KEXEC_CONTROL_CODE_MAX_SIZE
+/*
+ * control_page + KEXEC_CONTROL_CODE_MAX_SIZE
* ~ control_page + PAGE_SIZE are used as data storage and stack for
* jumping back
*/
@@ -76,8 +77,10 @@ relocate_kernel:
movl %eax, CP_PA_SWAP_PAGE(%edi)
movl %ebx, CP_PA_BACKUP_PAGES_MAP(%edi)
- /* get physical address of control page now */
- /* this is impossible after page table switch */
+ /*
+ * get physical address of control page now
+ * this is impossible after page table switch
+ */
movl PTR(PA_CONTROL_PAGE)(%ebp), %edi
/* switch to new set of page tables */
@@ -97,7 +100,8 @@ identity_mapped:
/* store the start address on the stack */
pushl %edx
- /* Set cr0 to a known state:
+ /*
+ * Set cr0 to a known state:
* - Paging disabled
* - Alignment check disabled
* - Write protect disabled
@@ -113,7 +117,8 @@ identity_mapped:
/* clear cr4 if applicable */
testl %ecx, %ecx
jz 1f
- /* Set cr4 to a known state:
+ /*
+ * Set cr4 to a known state:
* Setting everything to zero seems safe.
*/
xorl %eax, %eax
@@ -132,15 +137,18 @@ identity_mapped:
call swap_pages
addl $8, %esp
- /* To be certain of avoiding problems with self-modifying code
+ /*
+ * To be certain of avoiding problems with self-modifying code
* I need to execute a serializing instruction here.
* So I flush the TLB, it's handy, and not processor dependent.
*/
xorl %eax, %eax
movl %eax, %cr3
- /* set all of the registers to known values */
- /* leave %esp alone */
+ /*
+ * set all of the registers to known values
+ * leave %esp alone
+ */
testl %esi, %esi
jnz 1f
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index d32cfb27a47..4de8f5b3d47 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -19,29 +19,77 @@
#define PTR(x) (x << 3)
#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+/*
+ * control_page + KEXEC_CONTROL_CODE_MAX_SIZE
+ * ~ control_page + PAGE_SIZE are used as data storage and stack for
+ * jumping back
+ */
+#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset))
+
+/* Minimal CPU state */
+#define RSP DATA(0x0)
+#define CR0 DATA(0x8)
+#define CR3 DATA(0x10)
+#define CR4 DATA(0x18)
+
+/* other data */
+#define CP_PA_TABLE_PAGE DATA(0x20)
+#define CP_PA_SWAP_PAGE DATA(0x28)
+#define CP_PA_BACKUP_PAGES_MAP DATA(0x30)
+
.text
.align PAGE_SIZE
.code64
.globl relocate_kernel
relocate_kernel:
- /* %rdi indirection_page
+ /*
+ * %rdi indirection_page
* %rsi page_list
* %rdx start address
+ * %rcx preserve_context
*/
+ /* Save the CPU context, used for jumping back */
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushf
+
+ movq PTR(VA_CONTROL_PAGE)(%rsi), %r11
+ movq %rsp, RSP(%r11)
+ movq %cr0, %rax
+ movq %rax, CR0(%r11)
+ movq %cr3, %rax
+ movq %rax, CR3(%r11)
+ movq %cr4, %rax
+ movq %rax, CR4(%r11)
+
/* zero out flags, and disable interrupts */
pushq $0
popfq
- /* get physical address of control page now */
- /* this is impossible after page table switch */
+ /*
+ * get physical address of control page now
+ * this is impossible after page table switch
+ */
movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
/* get physical address of page table now too */
- movq PTR(PA_TABLE_PAGE)(%rsi), %rcx
+ movq PTR(PA_TABLE_PAGE)(%rsi), %r9
+
+ /* get physical address of swap page now */
+ movq PTR(PA_SWAP_PAGE)(%rsi), %r10
+
+ /* save some information for jumping back */
+ movq %r9, CP_PA_TABLE_PAGE(%r11)
+ movq %r10, CP_PA_SWAP_PAGE(%r11)
+ movq %rdi, CP_PA_BACKUP_PAGES_MAP(%r11)
/* Switch to the identity mapped page tables */
- movq %rcx, %cr3
+ movq %r9, %cr3
/* setup a new stack at the end of the physical control page */
lea PAGE_SIZE(%r8), %rsp
@@ -55,7 +103,8 @@ identity_mapped:
/* store the start address on the stack */
pushq %rdx
- /* Set cr0 to a known state:
+ /*
+ * Set cr0 to a known state:
* - Paging enabled
* - Alignment check disabled
* - Write protect disabled
@@ -68,7 +117,8 @@ identity_mapped:
orl $(X86_CR0_PG | X86_CR0_PE), %eax
movq %rax, %cr0
- /* Set cr4 to a known state:
+ /*
+ * Set cr4 to a known state:
* - physical address extension enabled
*/
movq $X86_CR4_PAE, %rax
@@ -78,9 +128,87 @@ identity_mapped:
1:
/* Flush the TLB (needed?) */
- movq %rcx, %cr3
+ movq %r9, %cr3
+
+ movq %rcx, %r11
+ call swap_pages
+
+ /*
+ * To be certain of avoiding problems with self-modifying code
+ * I need to execute a serializing instruction here.
+ * So I flush the TLB by reloading %cr3 here, it's handy,
+ * and not processor dependent.
+ */
+ movq %cr3, %rax
+ movq %rax, %cr3
+
+ /*
+ * set all of the registers to known values
+ * leave %rsp alone
+ */
+
+ testq %r11, %r11
+ jnz 1f
+ xorq %rax, %rax
+ xorq %rbx, %rbx
+ xorq %rcx, %rcx
+ xorq %rdx, %rdx
+ xorq %rsi, %rsi
+ xorq %rdi, %rdi
+ xorq %rbp, %rbp
+ xorq %r8, %r8
+ xorq %r9, %r9
+ xorq %r10, %r9
+ xorq %r11, %r11
+ xorq %r12, %r12
+ xorq %r13, %r13
+ xorq %r14, %r14
+ xorq %r15, %r15
+
+ ret
+
+1:
+ popq %rdx
+ leaq PAGE_SIZE(%r10), %rsp
+ call *%rdx
+
+ /* get the re-entry point of the peer system */
+ movq 0(%rsp), %rbp
+ call 1f
+1:
+ popq %r8
+ subq $(1b - relocate_kernel), %r8
+ movq CP_PA_SWAP_PAGE(%r8), %r10
+ movq CP_PA_BACKUP_PAGES_MAP(%r8), %rdi
+ movq CP_PA_TABLE_PAGE(%r8), %rax
+ movq %rax, %cr3
+ lea PAGE_SIZE(%r8), %rsp
+ call swap_pages
+ movq $virtual_mapped, %rax
+ pushq %rax
+ ret
+
+virtual_mapped:
+ movq RSP(%r8), %rsp
+ movq CR4(%r8), %rax
+ movq %rax, %cr4
+ movq CR3(%r8), %rax
+ movq CR0(%r8), %r8
+ movq %rax, %cr3
+ movq %r8, %cr0
+ movq %rbp, %rax
+
+ popf
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbp
+ popq %rbx
+ ret
/* Do the copies */
+swap_pages:
movq %rdi, %rcx /* Put the page_list in %rcx */
xorq %rdi, %rdi
xorq %rsi, %rsi
@@ -112,36 +240,27 @@ identity_mapped:
movq %rcx, %rsi /* For ever source page do a copy */
andq $0xfffffffffffff000, %rsi
+ movq %rdi, %rdx
+ movq %rsi, %rax
+
+ movq %r10, %rdi
movq $512, %rcx
rep ; movsq
- jmp 0b
-3:
-
- /* To be certain of avoiding problems with self-modifying code
- * I need to execute a serializing instruction here.
- * So I flush the TLB by reloading %cr3 here, it's handy,
- * and not processor dependent.
- */
- movq %cr3, %rax
- movq %rax, %cr3
- /* set all of the registers to known values */
- /* leave %rsp alone */
+ movq %rax, %rdi
+ movq %rdx, %rsi
+ movq $512, %rcx
+ rep ; movsq
- xorq %rax, %rax
- xorq %rbx, %rbx
- xorq %rcx, %rcx
- xorq %rdx, %rdx
- xorq %rsi, %rsi
- xorq %rdi, %rdi
- xorq %rbp, %rbp
- xorq %r8, %r8
- xorq %r9, %r9
- xorq %r10, %r9
- xorq %r11, %r11
- xorq %r12, %r12
- xorq %r13, %r13
- xorq %r14, %r14
- xorq %r15, %r15
+ movq %rdx, %rdi
+ movq %r10, %rsi
+ movq $512, %rcx
+ rep ; movsq
+ lea PAGE_SIZE(%rax), %rsi
+ jmp 0b
+3:
ret
+
+ .globl kexec_control_code_size
+.set kexec_control_code_size, . - relocate_kernel
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index dd6f2b71561..5d465b207e7 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -1,14 +1,14 @@
/*
* RTC related functions
*/
+#include <linux/platform_device.h>
+#include <linux/mc146818rtc.h>
#include <linux/acpi.h>
#include <linux/bcd.h>
-#include <linux/mc146818rtc.h>
-#include <linux/platform_device.h>
#include <linux/pnp.h>
-#include <asm/time.h>
#include <asm/vsyscall.h>
+#include <asm/time.h>
#ifdef CONFIG_X86_32
/*
@@ -16,9 +16,9 @@
* register we are working with. It is required for NMI access to the
* CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details.
*/
-volatile unsigned long cmos_lock = 0;
+volatile unsigned long cmos_lock;
EXPORT_SYMBOL(cmos_lock);
-#endif
+#endif /* CONFIG_X86_32 */
/* For two digit years assume time is always after that */
#define CMOS_YEARS_OFFS 2000
@@ -38,9 +38,9 @@ EXPORT_SYMBOL(rtc_lock);
*/
int mach_set_rtc_mmss(unsigned long nowtime)
{
- int retval = 0;
int real_seconds, real_minutes, cmos_minutes;
unsigned char save_control, save_freq_select;
+ int retval = 0;
/* tell the clock it's being set */
save_control = CMOS_READ(RTC_CONTROL);
@@ -72,8 +72,8 @@ int mach_set_rtc_mmss(unsigned long nowtime)
real_seconds = bin2bcd(real_seconds);
real_minutes = bin2bcd(real_minutes);
}
- CMOS_WRITE(real_seconds,RTC_SECONDS);
- CMOS_WRITE(real_minutes,RTC_MINUTES);
+ CMOS_WRITE(real_seconds, RTC_SECONDS);
+ CMOS_WRITE(real_minutes, RTC_MINUTES);
} else {
printk(KERN_WARNING
"set_rtc_mmss: can't update from %d to %d\n",
@@ -151,6 +151,7 @@ unsigned char rtc_cmos_read(unsigned char addr)
outb(addr, RTC_PORT(0));
val = inb(RTC_PORT(1));
lock_cmos_suffix(addr);
+
return val;
}
EXPORT_SYMBOL(rtc_cmos_read);
@@ -166,8 +167,8 @@ EXPORT_SYMBOL(rtc_cmos_write);
static int set_rtc_mmss(unsigned long nowtime)
{
- int retval;
unsigned long flags;
+ int retval;
spin_lock_irqsave(&rtc_lock, flags);
retval = set_wallclock(nowtime);
@@ -242,6 +243,7 @@ static __init int add_rtc_cmos(void)
platform_device_register(&rtc_device);
dev_info(&rtc_device.dev,
"registered platform RTC device (no PNP device found)\n");
+
return 0;
}
device_initcall(add_rtc_cmos);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b746deb9ebc..a0d26237d7c 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -112,8 +112,13 @@
#define ARCH_SETUP
#endif
+RESERVE_BRK(dmi_alloc, 65536);
+
unsigned int boot_cpu_id __read_mostly;
+static __initdata unsigned long _brk_start = (unsigned long)__brk_base;
+unsigned long _brk_end = (unsigned long)__brk_base;
+
#ifdef CONFIG_X86_64
int default_cpu_present_to_apicid(int mps_cpu)
{
@@ -158,12 +163,6 @@ static struct resource bss_resource = {
#ifdef CONFIG_X86_32
-/* This value is set up by the early boot code to point to the value
- immediately after the boot time page tables. It contains a *physical*
- address, and must not be in the .bss segment! */
-unsigned long init_pg_tables_start __initdata = ~0UL;
-unsigned long init_pg_tables_end __initdata = ~0UL;
-
static struct resource video_ram_resource = {
.name = "Video RAM area",
.start = 0xa0000,
@@ -202,7 +201,9 @@ struct ist_info ist_info;
#endif
#else
-struct cpuinfo_x86 boot_cpu_data __read_mostly;
+struct cpuinfo_x86 boot_cpu_data __read_mostly = {
+ .x86_phys_bits = MAX_PHYSMEM_BITS,
+};
EXPORT_SYMBOL(boot_cpu_data);
#endif
@@ -217,12 +218,6 @@ unsigned long mmu_cr4_features = X86_CR4_PAE;
int bootloader_type;
/*
- * Early DMI memory
- */
-int dmi_alloc_index;
-char dmi_alloc_data[DMI_MAX_DATA];
-
-/*
* Setup options
*/
struct screen_info screen_info;
@@ -267,6 +262,35 @@ static inline void copy_edd(void)
}
#endif
+void * __init extend_brk(size_t size, size_t align)
+{
+ size_t mask = align - 1;
+ void *ret;
+
+ BUG_ON(_brk_start == 0);
+ BUG_ON(align & mask);
+
+ _brk_end = (_brk_end + mask) & ~mask;
+ BUG_ON((char *)(_brk_end + size) > __brk_limit);
+
+ ret = (void *)_brk_end;
+ _brk_end += size;
+
+ memset(ret, 0, size);
+
+ return ret;
+}
+
+static void __init reserve_brk(void)
+{
+ if (_brk_end > _brk_start)
+ reserve_early(__pa(_brk_start), __pa(_brk_end), "BRK");
+
+ /* Mark brk area as locked down and no longer taking any
+ new allocations */
+ _brk_start = 0;
+}
+
#ifdef CONFIG_BLK_DEV_INITRD
#ifdef CONFIG_X86_32
@@ -715,11 +739,7 @@ void __init setup_arch(char **cmdline_p)
init_mm.start_code = (unsigned long) _text;
init_mm.end_code = (unsigned long) _etext;
init_mm.end_data = (unsigned long) _edata;
-#ifdef CONFIG_X86_32
- init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
-#else
- init_mm.brk = (unsigned long) &_end;
-#endif
+ init_mm.brk = _brk_end;
code_resource.start = virt_to_phys(_text);
code_resource.end = virt_to_phys(_etext)-1;
@@ -840,6 +860,8 @@ void __init setup_arch(char **cmdline_p)
setup_bios_corruption_check();
#endif
+ reserve_brk();
+
/* max_pfn_mapped is updated here */
max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
max_pfn_mapped = max_low_pfn_mapped;
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index d2cc6428c58..dfcc74ab0ab 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -211,31 +211,27 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
{
/* Default to using normal stack */
unsigned long sp = regs->sp;
+ int onsigstack = on_sig_stack(sp);
#ifdef CONFIG_X86_64
/* redzone */
sp -= 128;
#endif /* CONFIG_X86_64 */
- /*
- * If we are on the alternate signal stack and would overflow it, don't.
- * Return an always-bogus address instead so we will die with SIGSEGV.
- */
- if (on_sig_stack(sp) && !likely(on_sig_stack(sp - frame_size)))
- return (void __user *) -1L;
-
- /* This is the X/Open sanctioned signal stack switching. */
- if (ka->sa.sa_flags & SA_ONSTACK) {
- if (sas_ss_flags(sp) == 0)
- sp = current->sas_ss_sp + current->sas_ss_size;
- } else {
+ if (!onsigstack) {
+ /* This is the X/Open sanctioned signal stack switching. */
+ if (ka->sa.sa_flags & SA_ONSTACK) {
+ if (sas_ss_flags(sp) == 0)
+ sp = current->sas_ss_sp + current->sas_ss_size;
+ } else {
#ifdef CONFIG_X86_32
- /* This is the legacy signal stack switching. */
- if ((regs->ss & 0xffff) != __USER_DS &&
- !(ka->sa.sa_flags & SA_RESTORER) &&
- ka->sa.sa_restorer)
- sp = (unsigned long) ka->sa.sa_restorer;
+ /* This is the legacy signal stack switching. */
+ if ((regs->ss & 0xffff) != __USER_DS &&
+ !(ka->sa.sa_flags & SA_RESTORER) &&
+ ka->sa.sa_restorer)
+ sp = (unsigned long) ka->sa.sa_restorer;
#endif /* CONFIG_X86_32 */
+ }
}
if (used_math()) {
@@ -244,12 +240,22 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
sp = round_down(sp, 64);
#endif /* CONFIG_X86_64 */
*fpstate = (void __user *)sp;
-
- if (save_i387_xstate(*fpstate) < 0)
- return (void __user *)-1L;
}
- return (void __user *)align_sigframe(sp - frame_size);
+ sp = align_sigframe(sp - frame_size);
+
+ /*
+ * If we are on the alternate signal stack and would overflow it, don't.
+ * Return an always-bogus address instead so we will die with SIGSEGV.
+ */
+ if (onsigstack && !likely(on_sig_stack(sp)))
+ return (void __user *)-1L;
+
+ /* save i387 state */
+ if (used_math() && save_i387_xstate(*fpstate) < 0)
+ return (void __user *)-1L;
+
+ return (void __user *)sp;
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 249334f5080..ef7d10170c3 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -114,10 +114,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_info);
atomic_t init_deasserted;
-
-/* Set if we find a B stepping CPU */
-static int __cpuinitdata smp_b_stepping;
-
#if defined(CONFIG_NUMA) && defined(CONFIG_X86_32)
/* which logical CPUs are on which nodes */
@@ -271,8 +267,6 @@ static void __cpuinit smp_callin(void)
cpumask_set_cpu(cpuid, cpu_callin_mask);
}
-static int __cpuinitdata unsafe_smp;
-
/*
* Activate a secondary processor.
*/
@@ -340,76 +334,6 @@ notrace static void __cpuinit start_secondary(void *unused)
cpu_idle();
}
-static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c)
-{
- /*
- * Mask B, Pentium, but not Pentium MMX
- */
- if (c->x86_vendor == X86_VENDOR_INTEL &&
- c->x86 == 5 &&
- c->x86_mask >= 1 && c->x86_mask <= 4 &&
- c->x86_model <= 3)
- /*
- * Remember we have B step Pentia with bugs
- */
- smp_b_stepping = 1;
-
- /*
- * Certain Athlons might work (for various values of 'work') in SMP
- * but they are not certified as MP capable.
- */
- if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) {
-
- if (num_possible_cpus() == 1)
- goto valid_k7;
-
- /* Athlon 660/661 is valid. */
- if ((c->x86_model == 6) && ((c->x86_mask == 0) ||
- (c->x86_mask == 1)))
- goto valid_k7;
-
- /* Duron 670 is valid */
- if ((c->x86_model == 7) && (c->x86_mask == 0))
- goto valid_k7;
-
- /*
- * Athlon 662, Duron 671, and Athlon >model 7 have capability
- * bit. It's worth noting that the A5 stepping (662) of some
- * Athlon XP's have the MP bit set.
- * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for
- * more.
- */
- if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
- ((c->x86_model == 7) && (c->x86_mask >= 1)) ||
- (c->x86_model > 7))
- if (cpu_has_mp)
- goto valid_k7;
-
- /* If we get here, not a certified SMP capable AMD system. */
- unsafe_smp = 1;
- }
-
-valid_k7:
- ;
-}
-
-static void __cpuinit smp_checks(void)
-{
- if (smp_b_stepping)
- printk(KERN_WARNING "WARNING: SMP operation may be unreliable"
- "with B stepping processors.\n");
-
- /*
- * Don't taint if we are running SMP kernel on a single non-MP
- * approved Athlon
- */
- if (unsafe_smp && num_online_cpus() > 1) {
- printk(KERN_INFO "WARNING: This combination of AMD"
- "processors is not suitable for SMP.\n");
- add_taint(TAINT_UNSAFE_SMP);
- }
-}
-
/*
* The bootstrap kernel entry code has set these up. Save them for
* a given CPU
@@ -423,7 +347,6 @@ void __cpuinit smp_store_cpu_info(int id)
c->cpu_index = id;
if (id != 0)
identify_secondary_cpu(c);
- smp_apply_quirks(c);
}
@@ -1193,7 +1116,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
pr_debug("Boot done.\n");
impress_friends();
- smp_checks();
#ifdef CONFIG_X86_IO_APIC
setup_ioapic_dest();
#endif
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index d038b9c45cf..79c07324728 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -750,7 +750,7 @@ static int __init uv_bau_init(void)
int node;
int nblades;
int last_blade;
- int cur_cpu = 0;
+ int cur_cpu;
if (!is_uv_system())
return 0;
@@ -760,6 +760,7 @@ static int __init uv_bau_init(void)
uv_mmask = (1UL << uv_hub_info->n_val) - 1;
nblades = 0;
last_blade = -1;
+ cur_cpu = 0;
for_each_online_node(node) {
blade = uv_node_to_blade_id(node);
if (blade == last_blade)
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index 0fcc95a354f..7e4515957a1 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -25,10 +25,10 @@
*
* Send feedback to <colpatch@us.ibm.com>
*/
-#include <linux/init.h>
-#include <linux/smp.h>
#include <linux/nodemask.h>
#include <linux/mmzone.h>
+#include <linux/init.h>
+#include <linux/smp.h>
#include <asm/cpu.h>
static DEFINE_PER_CPU(struct x86_cpu, cpu_devices);
@@ -47,6 +47,7 @@ int __ref arch_register_cpu(int num)
*/
if (num)
per_cpu(cpu_devices, num).cpu.hotpluggable = 1;
+
return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
}
EXPORT_SYMBOL(arch_register_cpu);
@@ -56,12 +57,13 @@ void arch_unregister_cpu(int num)
unregister_cpu(&per_cpu(cpu_devices, num).cpu);
}
EXPORT_SYMBOL(arch_unregister_cpu);
-#else
+#else /* CONFIG_HOTPLUG_CPU */
+
static int __init arch_register_cpu(int num)
{
return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
}
-#endif /*CONFIG_HOTPLUG_CPU*/
+#endif /* CONFIG_HOTPLUG_CPU */
static int __init topology_init(void)
{
@@ -70,11 +72,11 @@ static int __init topology_init(void)
#ifdef CONFIG_NUMA
for_each_online_node(i)
register_one_node(i);
-#endif /* CONFIG_NUMA */
+#endif
for_each_present_cpu(i)
arch_register_cpu(i);
+
return 0;
}
-
subsys_initcall(topology_init);
diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c
new file mode 100644
index 00000000000..2ffb6c53326
--- /dev/null
+++ b/arch/x86/kernel/uv_time.c
@@ -0,0 +1,393 @@
+/*
+ * SGI RTC clock/timer routines.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Copyright (c) 2009 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) Dimitri Sivanich
+ */
+#include <linux/clockchips.h>
+
+#include <asm/uv/uv_mmrs.h>
+#include <asm/uv/uv_hub.h>
+#include <asm/uv/bios.h>
+#include <asm/uv/uv.h>
+#include <asm/apic.h>
+#include <asm/cpu.h>
+
+#define RTC_NAME "sgi_rtc"
+
+static cycle_t uv_read_rtc(void);
+static int uv_rtc_next_event(unsigned long, struct clock_event_device *);
+static void uv_rtc_timer_setup(enum clock_event_mode,
+ struct clock_event_device *);
+
+static struct clocksource clocksource_uv = {
+ .name = RTC_NAME,
+ .rating = 400,
+ .read = uv_read_rtc,
+ .mask = (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK,
+ .shift = 10,
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS,
+};
+
+static struct clock_event_device clock_event_device_uv = {
+ .name = RTC_NAME,
+ .features = CLOCK_EVT_FEAT_ONESHOT,
+ .shift = 20,
+ .rating = 400,
+ .irq = -1,
+ .set_next_event = uv_rtc_next_event,
+ .set_mode = uv_rtc_timer_setup,
+ .event_handler = NULL,
+};
+
+static DEFINE_PER_CPU(struct clock_event_device, cpu_ced);
+
+/* There is one of these allocated per node */
+struct uv_rtc_timer_head {
+ spinlock_t lock;
+ /* next cpu waiting for timer, local node relative: */
+ int next_cpu;
+ /* number of cpus on this node: */
+ int ncpus;
+ struct {
+ int lcpu; /* systemwide logical cpu number */
+ u64 expires; /* next timer expiration for this cpu */
+ } cpu[1];
+};
+
+/*
+ * Access to uv_rtc_timer_head via blade id.
+ */
+static struct uv_rtc_timer_head **blade_info __read_mostly;
+
+static int uv_rtc_enable;
+
+/*
+ * Hardware interface routines
+ */
+
+/* Send IPIs to another node */
+static void uv_rtc_send_IPI(int cpu)
+{
+ unsigned long apicid, val;
+ int pnode;
+
+ apicid = cpu_physical_id(cpu);
+ pnode = uv_apicid_to_pnode(apicid);
+ val = (1UL << UVH_IPI_INT_SEND_SHFT) |
+ (apicid << UVH_IPI_INT_APIC_ID_SHFT) |
+ (GENERIC_INTERRUPT_VECTOR << UVH_IPI_INT_VECTOR_SHFT);
+
+ uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
+}
+
+/* Check for an RTC interrupt pending */
+static int uv_intr_pending(int pnode)
+{
+ return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) &
+ UVH_EVENT_OCCURRED0_RTC1_MASK;
+}
+
+/* Setup interrupt and return non-zero if early expiration occurred. */
+static int uv_setup_intr(int cpu, u64 expires)
+{
+ u64 val;
+ int pnode = uv_cpu_to_pnode(cpu);
+
+ uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG,
+ UVH_RTC1_INT_CONFIG_M_MASK);
+ uv_write_global_mmr64(pnode, UVH_INT_CMPB, -1L);
+
+ uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS,
+ UVH_EVENT_OCCURRED0_RTC1_MASK);
+
+ val = (GENERIC_INTERRUPT_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) |
+ ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT);
+
+ /* Set configuration */
+ uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, val);
+ /* Initialize comparator value */
+ uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires);
+
+ return (expires < uv_read_rtc() && !uv_intr_pending(pnode));
+}
+
+/*
+ * Per-cpu timer tracking routines
+ */
+
+static __init void uv_rtc_deallocate_timers(void)
+{
+ int bid;
+
+ for_each_possible_blade(bid) {
+ kfree(blade_info[bid]);
+ }
+ kfree(blade_info);
+}
+
+/* Allocate per-node list of cpu timer expiration times. */
+static __init int uv_rtc_allocate_timers(void)
+{
+ int cpu;
+
+ blade_info = kmalloc(uv_possible_blades * sizeof(void *), GFP_KERNEL);
+ if (!blade_info)
+ return -ENOMEM;
+ memset(blade_info, 0, uv_possible_blades * sizeof(void *));
+
+ for_each_present_cpu(cpu) {
+ int nid = cpu_to_node(cpu);
+ int bid = uv_cpu_to_blade_id(cpu);
+ int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
+ struct uv_rtc_timer_head *head = blade_info[bid];
+
+ if (!head) {
+ head = kmalloc_node(sizeof(struct uv_rtc_timer_head) +
+ (uv_blade_nr_possible_cpus(bid) *
+ 2 * sizeof(u64)),
+ GFP_KERNEL, nid);
+ if (!head) {
+ uv_rtc_deallocate_timers();
+ return -ENOMEM;
+ }
+ spin_lock_init(&head->lock);
+ head->ncpus = uv_blade_nr_possible_cpus(bid);
+ head->next_cpu = -1;
+ blade_info[bid] = head;
+ }
+
+ head->cpu[bcpu].lcpu = cpu;
+ head->cpu[bcpu].expires = ULLONG_MAX;
+ }
+
+ return 0;
+}
+
+/* Find and set the next expiring timer. */
+static void uv_rtc_find_next_timer(struct uv_rtc_timer_head *head, int pnode)
+{
+ u64 lowest = ULLONG_MAX;
+ int c, bcpu = -1;
+
+ head->next_cpu = -1;
+ for (c = 0; c < head->ncpus; c++) {
+ u64 exp = head->cpu[c].expires;
+ if (exp < lowest) {
+ bcpu = c;
+ lowest = exp;
+ }
+ }
+ if (bcpu >= 0) {
+ head->next_cpu = bcpu;
+ c = head->cpu[bcpu].lcpu;
+ if (uv_setup_intr(c, lowest))
+ /* If we didn't set it up in time, trigger */
+ uv_rtc_send_IPI(c);
+ } else {
+ uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG,
+ UVH_RTC1_INT_CONFIG_M_MASK);
+ }
+}
+
+/*
+ * Set expiration time for current cpu.
+ *
+ * Returns 1 if we missed the expiration time.
+ */
+static int uv_rtc_set_timer(int cpu, u64 expires)
+{
+ int pnode = uv_cpu_to_pnode(cpu);
+ int bid = uv_cpu_to_blade_id(cpu);
+ struct uv_rtc_timer_head *head = blade_info[bid];
+ int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
+ u64 *t = &head->cpu[bcpu].expires;
+ unsigned long flags;
+ int next_cpu;
+
+ spin_lock_irqsave(&head->lock, flags);
+
+ next_cpu = head->next_cpu;
+ *t = expires;
+ /* Will this one be next to go off? */
+ if (next_cpu < 0 || bcpu == next_cpu ||
+ expires < head->cpu[next_cpu].expires) {
+ head->next_cpu = bcpu;
+ if (uv_setup_intr(cpu, expires)) {
+ *t = ULLONG_MAX;
+ uv_rtc_find_next_timer(head, pnode);
+ spin_unlock_irqrestore(&head->lock, flags);
+ return 1;
+ }
+ }
+
+ spin_unlock_irqrestore(&head->lock, flags);
+ return 0;
+}
+
+/*
+ * Unset expiration time for current cpu.
+ *
+ * Returns 1 if this timer was pending.
+ */
+static int uv_rtc_unset_timer(int cpu)
+{
+ int pnode = uv_cpu_to_pnode(cpu);
+ int bid = uv_cpu_to_blade_id(cpu);
+ struct uv_rtc_timer_head *head = blade_info[bid];
+ int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
+ u64 *t = &head->cpu[bcpu].expires;
+ unsigned long flags;
+ int rc = 0;
+
+ spin_lock_irqsave(&head->lock, flags);
+
+ if (head->next_cpu == bcpu && uv_read_rtc() >= *t)
+ rc = 1;
+
+ *t = ULLONG_MAX;
+
+ /* Was the hardware setup for this timer? */
+ if (head->next_cpu == bcpu)
+ uv_rtc_find_next_timer(head, pnode);
+
+ spin_unlock_irqrestore(&head->lock, flags);
+
+ return rc;
+}
+
+
+/*
+ * Kernel interface routines.
+ */
+
+/*
+ * Read the RTC.
+ */
+static cycle_t uv_read_rtc(void)
+{
+ return (cycle_t)uv_read_local_mmr(UVH_RTC);
+}
+
+/*
+ * Program the next event, relative to now
+ */
+static int uv_rtc_next_event(unsigned long delta,
+ struct clock_event_device *ced)
+{
+ int ced_cpu = cpumask_first(ced->cpumask);
+
+ return uv_rtc_set_timer(ced_cpu, delta + uv_read_rtc());
+}
+
+/*
+ * Setup the RTC timer in oneshot mode
+ */
+static void uv_rtc_timer_setup(enum clock_event_mode mode,
+ struct clock_event_device *evt)
+{
+ int ced_cpu = cpumask_first(evt->cpumask);
+
+ switch (mode) {
+ case CLOCK_EVT_MODE_PERIODIC:
+ case CLOCK_EVT_MODE_ONESHOT:
+ case CLOCK_EVT_MODE_RESUME:
+ /* Nothing to do here yet */
+ break;
+ case CLOCK_EVT_MODE_UNUSED:
+ case CLOCK_EVT_MODE_SHUTDOWN:
+ uv_rtc_unset_timer(ced_cpu);
+ break;
+ }
+}
+
+static void uv_rtc_interrupt(void)
+{
+ struct clock_event_device *ced = &__get_cpu_var(cpu_ced);
+ int cpu = smp_processor_id();
+
+ if (!ced || !ced->event_handler)
+ return;
+
+ if (uv_rtc_unset_timer(cpu) != 1)
+ return;
+
+ ced->event_handler(ced);
+}
+
+static int __init uv_enable_rtc(char *str)
+{
+ uv_rtc_enable = 1;
+
+ return 1;
+}
+__setup("uvrtc", uv_enable_rtc);
+
+static __init void uv_rtc_register_clockevents(struct work_struct *dummy)
+{
+ struct clock_event_device *ced = &__get_cpu_var(cpu_ced);
+
+ *ced = clock_event_device_uv;
+ ced->cpumask = cpumask_of(smp_processor_id());
+ clockevents_register_device(ced);
+}
+
+static __init int uv_rtc_setup_clock(void)
+{
+ int rc;
+
+ if (!uv_rtc_enable || !is_uv_system() || generic_interrupt_extension)
+ return -ENODEV;
+
+ generic_interrupt_extension = uv_rtc_interrupt;
+
+ clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second,
+ clocksource_uv.shift);
+
+ rc = clocksource_register(&clocksource_uv);
+ if (rc) {
+ generic_interrupt_extension = NULL;
+ return rc;
+ }
+
+ /* Setup and register clockevents */
+ rc = uv_rtc_allocate_timers();
+ if (rc) {
+ clocksource_unregister(&clocksource_uv);
+ generic_interrupt_extension = NULL;
+ return rc;
+ }
+
+ clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second,
+ NSEC_PER_SEC, clock_event_device_uv.shift);
+
+ clock_event_device_uv.min_delta_ns = NSEC_PER_SEC /
+ sn_rtc_cycles_per_second;
+
+ clock_event_device_uv.max_delta_ns = clocksource_uv.mask *
+ (NSEC_PER_SEC / sn_rtc_cycles_per_second);
+
+ rc = schedule_on_each_cpu(uv_rtc_register_clockevents);
+ if (rc) {
+ clocksource_unregister(&clocksource_uv);
+ generic_interrupt_extension = NULL;
+ uv_rtc_deallocate_timers();
+ }
+
+ return rc;
+}
+arch_initcall(uv_rtc_setup_clock);
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index 191a876e9e8..31ffc24eec4 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -578,7 +578,7 @@ static struct irq_chip piix4_virtual_irq_type = {
static irqreturn_t piix4_master_intr(int irq, void *dev_id)
{
int realirq;
- irq_desc_t *desc;
+ struct irq_desc *desc;
unsigned long flags;
spin_lock_irqsave(&i8259A_lock, flags);
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 2cc4a90e2cb..95deb9f2211 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -395,11 +395,6 @@ static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval)
vmi_ops.update_pte(ptep, VMI_PAGE_PT);
}
-static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
-{
- vmi_ops.set_pte(pte, ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1));
-}
-
static void vmi_set_pud(pud_t *pudp, pud_t pudval)
{
/* Um, eww */
@@ -750,7 +745,6 @@ static inline int __init activate_vmi(void)
pv_mmu_ops.set_pmd = vmi_set_pmd;
#ifdef CONFIG_X86_PAE
pv_mmu_ops.set_pte_atomic = vmi_set_pte_atomic;
- pv_mmu_ops.set_pte_present = vmi_set_pte_present;
pv_mmu_ops.set_pud = vmi_set_pud;
pv_mmu_ops.pte_clear = vmi_pte_clear;
pv_mmu_ops.pmd_clear = vmi_pmd_clear;
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 0d860963f26..62ad500d55f 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -189,15 +189,24 @@ SECTIONS
*(.bss)
. = ALIGN(4);
__bss_stop = .;
- _end = . ;
- /* This is where the kernel creates the early boot page tables */
+ }
+
+ .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
. = ALIGN(PAGE_SIZE);
- pg0 = . ;
+ __brk_base = . ;
+ . += 64 * 1024 ; /* 64k alignment slop space */
+ *(.brk_reservation) /* areas brk users have reserved */
+ __brk_limit = . ;
+ }
+
+ .end : AT(ADDR(.end) - LOAD_OFFSET) {
+ _end = . ;
}
/* Sections to be discarded */
/DISCARD/ : {
*(.exitcall.exit)
+ *(.discard)
}
STABS_DEBUG
@@ -205,6 +214,12 @@ SECTIONS
DWARF_DEBUG
}
+/*
+ * Build-time check on the image size:
+ */
+ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
+ "kernel image bigger than KERNEL_IMAGE_SIZE")
+
#ifdef CONFIG_KEXEC
/* Link time checks */
#include <asm/kexec.h>
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index fbfced6f680..c8742507b03 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -29,8 +29,8 @@ SECTIONS
{
. = __START_KERNEL;
phys_startup_64 = startup_64 - LOAD_OFFSET;
- _text = .; /* Text and read-only data */
.text : AT(ADDR(.text) - LOAD_OFFSET) {
+ _text = .; /* Text and read-only data */
/* First the code that has to be first for bootstrapping */
*(.text.head)
_stext = .;
@@ -61,13 +61,13 @@ SECTIONS
.data : AT(ADDR(.data) - LOAD_OFFSET) {
DATA_DATA
CONSTRUCTORS
+ _edata = .; /* End of data section */
} :data
- _edata = .; /* End of data section */
- . = ALIGN(PAGE_SIZE);
- . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
.data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
+ . = ALIGN(PAGE_SIZE);
+ . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
*(.data.cacheline_aligned)
}
. = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
@@ -125,29 +125,29 @@ SECTIONS
#undef VVIRT_OFFSET
#undef VVIRT
- . = ALIGN(THREAD_SIZE); /* init_task */
.data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
+ . = ALIGN(THREAD_SIZE); /* init_task */
*(.data.init_task)
}:data.init
- . = ALIGN(PAGE_SIZE);
.data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
+ . = ALIGN(PAGE_SIZE);
*(.data.page_aligned)
}
- /* might get freed after init */
- . = ALIGN(PAGE_SIZE);
- __smp_alt_begin = .;
- __smp_locks = .;
.smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
+ /* might get freed after init */
+ . = ALIGN(PAGE_SIZE);
+ __smp_alt_begin = .;
+ __smp_locks = .;
*(.smp_locks)
+ __smp_locks_end = .;
+ . = ALIGN(PAGE_SIZE);
+ __smp_alt_end = .;
}
- __smp_locks_end = .;
- . = ALIGN(PAGE_SIZE);
- __smp_alt_end = .;
. = ALIGN(PAGE_SIZE); /* Init code and data */
- __init_begin = .;
+ __init_begin = .; /* paired with __init_end */
.init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
_sinittext = .;
INIT_TEXT
@@ -159,40 +159,42 @@ SECTIONS
__initdata_end = .;
}
- . = ALIGN(16);
- __setup_start = .;
- .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) }
- __setup_end = .;
- __initcall_start = .;
+ .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
+ . = ALIGN(16);
+ __setup_start = .;
+ *(.init.setup)
+ __setup_end = .;
+ }
.initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
+ __initcall_start = .;
INITCALLS
+ __initcall_end = .;
}
- __initcall_end = .;
- __con_initcall_start = .;
.con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
+ __con_initcall_start = .;
*(.con_initcall.init)
+ __con_initcall_end = .;
}
- __con_initcall_end = .;
- __x86_cpu_dev_start = .;
.x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
+ __x86_cpu_dev_start = .;
*(.x86_cpu_dev.init)
+ __x86_cpu_dev_end = .;
}
- __x86_cpu_dev_end = .;
SECURITY_INIT
. = ALIGN(8);
.parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
- __parainstructions = .;
+ __parainstructions = .;
*(.parainstructions)
- __parainstructions_end = .;
+ __parainstructions_end = .;
}
- . = ALIGN(8);
- __alt_instructions = .;
.altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
+ . = ALIGN(8);
+ __alt_instructions = .;
*(.altinstructions)
+ __alt_instructions_end = .;
}
- __alt_instructions_end = .;
.altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
*(.altinstr_replacement)
}
@@ -207,9 +209,11 @@ SECTIONS
#ifdef CONFIG_BLK_DEV_INITRD
. = ALIGN(PAGE_SIZE);
- __initramfs_start = .;
- .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) }
- __initramfs_end = .;
+ .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
+ __initramfs_start = .;
+ *(.init.ramfs)
+ __initramfs_end = .;
+ }
#endif
#ifdef CONFIG_SMP
@@ -229,20 +233,29 @@ SECTIONS
. = ALIGN(PAGE_SIZE);
__init_end = .;
- . = ALIGN(PAGE_SIZE);
- __nosave_begin = .;
.data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
- *(.data.nosave)
+ . = ALIGN(PAGE_SIZE);
+ __nosave_begin = .;
+ *(.data.nosave)
+ . = ALIGN(PAGE_SIZE);
+ __nosave_end = .;
} :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */
- . = ALIGN(PAGE_SIZE);
- __nosave_end = .;
- __bss_start = .; /* BSS */
.bss : AT(ADDR(.bss) - LOAD_OFFSET) {
+ . = ALIGN(PAGE_SIZE);
+ __bss_start = .; /* BSS */
*(.bss.page_aligned)
*(.bss)
- }
- __bss_stop = .;
+ __bss_stop = .;
+ }
+
+ .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
+ . = ALIGN(PAGE_SIZE);
+ __brk_base = . ;
+ . += 64 * 1024 ; /* 64k alignment slop space */
+ *(.brk_reservation) /* areas brk users have reserved */
+ __brk_limit = . ;
+ }
_end = . ;
@@ -250,6 +263,7 @@ SECTIONS
/DISCARD/ : {
*(.exitcall.exit)
*(.eh_frame)
+ *(.discard)
}
STABS_DEBUG
@@ -275,3 +289,10 @@ ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
ASSERT((per_cpu__irq_stack_union == 0),
"irq_stack_union is not at start of per-cpu area");
#endif
+
+#ifdef CONFIG_KEXEC
+#include <asm/kexec.h>
+
+ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
+ "kexec control code size is too big")
+#endif
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index 74de562812c..a1d804bcd48 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -22,7 +22,7 @@
#include <asm/paravirt.h>
#include <asm/setup.h>
-#ifdef CONFIG_PARAVIRT
+#if defined CONFIG_PCI && defined CONFIG_PARAVIRT
/*
* Interrupt control on vSMPowered systems:
* ~AC is a shadow of IF. If IF is 'on' AC should be 'off'
@@ -114,6 +114,7 @@ static void __init set_vsmp_pv_ops(void)
}
#endif
+#ifdef CONFIG_PCI
static int is_vsmp = -1;
static void __init detect_vsmp_box(void)
@@ -139,6 +140,15 @@ int is_vsmp_box(void)
}
}
+#else
+static void __init detect_vsmp_box(void)
+{
+}
+int is_vsmp_box(void)
+{
+ return 0;
+}
+#endif
void __init vsmp_init(void)
{
detect_vsmp_box();
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 9fe4ddaa8f6..90e44a10e68 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -1058,14 +1058,6 @@ __init void lguest_init(void)
* lguest_init() where the rest of the fairly chaotic boot setup
* occurs. */
- /* The native boot code sets up initial page tables immediately after
- * the kernel itself, and sets init_pg_tables_end so they're not
- * clobbered. The Launcher places our initial pagetables somewhere at
- * the top of our physical memory, so we don't need extra space: set
- * init_pg_tables_end to the end of the kernel. */
- init_pg_tables_start = __pa(pg0);
- init_pg_tables_end = __pa(pg0);
-
/* As described in head_32.S, we map the first 128M of memory. */
max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT;
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index c22981fa2f3..ad5441ed1b5 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -1,30 +1,38 @@
/* Copyright 2002 Andi Kleen */
#include <linux/linkage.h>
-#include <asm/dwarf2.h>
+
#include <asm/cpufeature.h>
+#include <asm/dwarf2.h>
/*
* memcpy - Copy a memory block.
*
- * Input:
- * rdi destination
- * rsi source
- * rdx count
- *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ *
* Output:
* rax original destination
- */
+ */
+/*
+ * memcpy_c() - fast string ops (REP MOVSQ) based variant.
+ *
+ * Calls to this get patched into the kernel image via the
+ * alternative instructions framework:
+ */
ALIGN
memcpy_c:
CFI_STARTPROC
- movq %rdi,%rax
- movl %edx,%ecx
- shrl $3,%ecx
- andl $7,%edx
+ movq %rdi, %rax
+
+ movl %edx, %ecx
+ shrl $3, %ecx
+ andl $7, %edx
rep movsq
- movl %edx,%ecx
+ movl %edx, %ecx
rep movsb
ret
CFI_ENDPROC
@@ -33,99 +41,110 @@ ENDPROC(memcpy_c)
ENTRY(__memcpy)
ENTRY(memcpy)
CFI_STARTPROC
- pushq %rbx
- CFI_ADJUST_CFA_OFFSET 8
- CFI_REL_OFFSET rbx, 0
- movq %rdi,%rax
- movl %edx,%ecx
- shrl $6,%ecx
+ /*
+ * Put the number of full 64-byte blocks into %ecx.
+ * Tail portion is handled at the end:
+ */
+ movq %rdi, %rax
+ movl %edx, %ecx
+ shrl $6, %ecx
jz .Lhandle_tail
.p2align 4
.Lloop_64:
+ /*
+ * We decrement the loop index here - and the zero-flag is
+ * checked at the end of the loop (instructions inbetween do
+ * not change the zero flag):
+ */
decl %ecx
- movq (%rsi),%r11
- movq 8(%rsi),%r8
+ /*
+ * Move in blocks of 4x16 bytes:
+ */
+ movq 0*8(%rsi), %r11
+ movq 1*8(%rsi), %r8
+ movq %r11, 0*8(%rdi)
+ movq %r8, 1*8(%rdi)
- movq %r11,(%rdi)
- movq %r8,1*8(%rdi)
+ movq 2*8(%rsi), %r9
+ movq 3*8(%rsi), %r10
+ movq %r9, 2*8(%rdi)
+ movq %r10, 3*8(%rdi)
- movq 2*8(%rsi),%r9
- movq 3*8(%rsi),%r10
+ movq 4*8(%rsi), %r11
+ movq 5*8(%rsi), %r8
+ movq %r11, 4*8(%rdi)
+ movq %r8, 5*8(%rdi)
- movq %r9,2*8(%rdi)
- movq %r10,3*8(%rdi)
+ movq 6*8(%rsi), %r9
+ movq 7*8(%rsi), %r10
+ movq %r9, 6*8(%rdi)
+ movq %r10, 7*8(%rdi)
- movq 4*8(%rsi),%r11
- movq 5*8(%rsi),%r8
+ leaq 64(%rsi), %rsi
+ leaq 64(%rdi), %rdi
- movq %r11,4*8(%rdi)
- movq %r8,5*8(%rdi)
-
- movq 6*8(%rsi),%r9
- movq 7*8(%rsi),%r10
-
- movq %r9,6*8(%rdi)
- movq %r10,7*8(%rdi)
-
- leaq 64(%rsi),%rsi
- leaq 64(%rdi),%rdi
jnz .Lloop_64
.Lhandle_tail:
- movl %edx,%ecx
- andl $63,%ecx
- shrl $3,%ecx
+ movl %edx, %ecx
+ andl $63, %ecx
+ shrl $3, %ecx
jz .Lhandle_7
+
.p2align 4
.Lloop_8:
decl %ecx
- movq (%rsi),%r8
- movq %r8,(%rdi)
- leaq 8(%rdi),%rdi
- leaq 8(%rsi),%rsi
+ movq (%rsi), %r8
+ movq %r8, (%rdi)
+ leaq 8(%rdi), %rdi
+ leaq 8(%rsi), %rsi
jnz .Lloop_8
.Lhandle_7:
- movl %edx,%ecx
- andl $7,%ecx
- jz .Lende
+ movl %edx, %ecx
+ andl $7, %ecx
+ jz .Lend
+
.p2align 4
.Lloop_1:
- movb (%rsi),%r8b
- movb %r8b,(%rdi)
+ movb (%rsi), %r8b
+ movb %r8b, (%rdi)
incq %rdi
incq %rsi
decl %ecx
jnz .Lloop_1
-.Lende:
- popq %rbx
- CFI_ADJUST_CFA_OFFSET -8
- CFI_RESTORE rbx
+.Lend:
ret
-.Lfinal:
CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)
- /* Some CPUs run faster using the string copy instructions.
- It is also a lot simpler. Use this when possible */
+ /*
+ * Some CPUs run faster using the string copy instructions.
+ * It is also a lot simpler. Use this when possible:
+ */
- .section .altinstr_replacement,"ax"
+ .section .altinstr_replacement, "ax"
1: .byte 0xeb /* jmp <disp8> */
.byte (memcpy_c - memcpy) - (2f - 1b) /* offset */
2:
.previous
- .section .altinstructions,"a"
+
+ .section .altinstructions, "a"
.align 8
.quad memcpy
.quad 1b
.byte X86_FEATURE_REP_GOOD
- /* Replace only beginning, memcpy is used to apply alternatives, so it
- * is silly to overwrite itself with nops - reboot is only outcome... */
+
+ /*
+ * Replace only beginning, memcpy is used to apply alternatives,
+ * so it is silly to overwrite itself with nops - reboot is the
+ * only outcome...
+ */
.byte 2b - 1b
.byte 2b - 1b
.previous
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 00f127c80b0..522db5e3d0b 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -121,22 +121,13 @@ void kunmap_atomic(void *kvaddr, enum km_type type)
pagefault_enable();
}
-/* This is the same as kmap_atomic() but can map memory that doesn't
+/*
+ * This is the same as kmap_atomic() but can map memory that doesn't
* have a struct page associated with it.
*/
void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
{
- enum fixed_addresses idx;
- unsigned long vaddr;
-
- pagefault_disable();
-
- idx = type + KM_TYPE_NR*smp_processor_id();
- vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
- set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot));
- arch_flush_lazy_mmu_mode();
-
- return (void*) vaddr;
+ return kmap_atomic_prot_pfn(pfn, type, kmap_prot);
}
EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */
@@ -158,7 +149,6 @@ EXPORT_SYMBOL(kunmap);
EXPORT_SYMBOL(kmap_atomic);
EXPORT_SYMBOL(kunmap_atomic);
-#ifdef CONFIG_NUMA
void __init set_highmem_pages_init(void)
{
struct zone *zone;
@@ -182,11 +172,3 @@ void __init set_highmem_pages_init(void)
}
totalram_pages += totalhigh_pages;
}
-#else
-void __init set_highmem_pages_init(void)
-{
- add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);
-
- totalram_pages += totalhigh_pages;
-}
-#endif /* CONFIG_NUMA */
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index ce6a722587d..fd3da1dda1c 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -1,8 +1,345 @@
+#include <linux/ioport.h>
#include <linux/swap.h>
+
#include <asm/cacheflush.h>
+#include <asm/e820.h>
+#include <asm/init.h>
#include <asm/page.h>
+#include <asm/page_types.h>
#include <asm/sections.h>
#include <asm/system.h>
+#include <asm/tlbflush.h>
+
+unsigned long __initdata e820_table_start;
+unsigned long __meminitdata e820_table_end;
+unsigned long __meminitdata e820_table_top;
+
+int after_bootmem;
+
+int direct_gbpages
+#ifdef CONFIG_DIRECT_GBPAGES
+ = 1
+#endif
+;
+
+static void __init find_early_table_space(unsigned long end, int use_pse,
+ int use_gbpages)
+{
+ unsigned long puds, pmds, ptes, tables, start;
+
+ puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
+ tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
+
+ if (use_gbpages) {
+ unsigned long extra;
+
+ extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
+ pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
+ } else
+ pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
+
+ tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
+
+ if (use_pse) {
+ unsigned long extra;
+
+ extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
+#ifdef CONFIG_X86_32
+ extra += PMD_SIZE;
+#endif
+ ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ } else
+ ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+ tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
+
+#ifdef CONFIG_X86_32
+ /* for fixmap */
+ tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
+#endif
+
+ /*
+ * RED-PEN putting page tables only on node 0 could
+ * cause a hotspot and fill up ZONE_DMA. The page tables
+ * need roughly 0.5KB per GB.
+ */
+#ifdef CONFIG_X86_32
+ start = 0x7000;
+ e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
+ tables, PAGE_SIZE);
+#else /* CONFIG_X86_64 */
+ start = 0x8000;
+ e820_table_start = find_e820_area(start, end, tables, PAGE_SIZE);
+#endif
+ if (e820_table_start == -1UL)
+ panic("Cannot find space for the kernel page tables");
+
+ e820_table_start >>= PAGE_SHIFT;
+ e820_table_end = e820_table_start;
+ e820_table_top = e820_table_start + (tables >> PAGE_SHIFT);
+
+ printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
+ end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT);
+}
+
+struct map_range {
+ unsigned long start;
+ unsigned long end;
+ unsigned page_size_mask;
+};
+
+#ifdef CONFIG_X86_32
+#define NR_RANGE_MR 3
+#else /* CONFIG_X86_64 */
+#define NR_RANGE_MR 5
+#endif
+
+static int __meminit save_mr(struct map_range *mr, int nr_range,
+ unsigned long start_pfn, unsigned long end_pfn,
+ unsigned long page_size_mask)
+{
+ if (start_pfn < end_pfn) {
+ if (nr_range >= NR_RANGE_MR)
+ panic("run out of range for init_memory_mapping\n");
+ mr[nr_range].start = start_pfn<<PAGE_SHIFT;
+ mr[nr_range].end = end_pfn<<PAGE_SHIFT;
+ mr[nr_range].page_size_mask = page_size_mask;
+ nr_range++;
+ }
+
+ return nr_range;
+}
+
+#ifdef CONFIG_X86_64
+static void __init init_gbpages(void)
+{
+ if (direct_gbpages && cpu_has_gbpages)
+ printk(KERN_INFO "Using GB pages for direct mapping\n");
+ else
+ direct_gbpages = 0;
+}
+#else
+static inline void init_gbpages(void)
+{
+}
+#endif
+
+/*
+ * Setup the direct mapping of the physical memory at PAGE_OFFSET.
+ * This runs before bootmem is initialized and gets pages directly from
+ * the physical memory. To access them they are temporarily mapped.
+ */
+unsigned long __init_refok init_memory_mapping(unsigned long start,
+ unsigned long end)
+{
+ unsigned long page_size_mask = 0;
+ unsigned long start_pfn, end_pfn;
+ unsigned long ret = 0;
+ unsigned long pos;
+
+ struct map_range mr[NR_RANGE_MR];
+ int nr_range, i;
+ int use_pse, use_gbpages;
+
+ printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
+
+ if (!after_bootmem)
+ init_gbpages();
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ /*
+ * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
+ * This will simplify cpa(), which otherwise needs to support splitting
+ * large pages into small in interrupt context, etc.
+ */
+ use_pse = use_gbpages = 0;
+#else
+ use_pse = cpu_has_pse;
+ use_gbpages = direct_gbpages;
+#endif
+
+#ifdef CONFIG_X86_32
+#ifdef CONFIG_X86_PAE
+ set_nx();
+ if (nx_enabled)
+ printk(KERN_INFO "NX (Execute Disable) protection: active\n");
+#endif
+
+ /* Enable PSE if available */
+ if (cpu_has_pse)
+ set_in_cr4(X86_CR4_PSE);
+
+ /* Enable PGE if available */
+ if (cpu_has_pge) {
+ set_in_cr4(X86_CR4_PGE);
+ __supported_pte_mask |= _PAGE_GLOBAL;
+ }
+#endif
+
+ if (use_gbpages)
+ page_size_mask |= 1 << PG_LEVEL_1G;
+ if (use_pse)
+ page_size_mask |= 1 << PG_LEVEL_2M;
+
+ memset(mr, 0, sizeof(mr));
+ nr_range = 0;
+
+ /* head if not big page alignment ? */
+ start_pfn = start >> PAGE_SHIFT;
+ pos = start_pfn << PAGE_SHIFT;
+#ifdef CONFIG_X86_32
+ /*
+ * Don't use a large page for the first 2/4MB of memory
+ * because there are often fixed size MTRRs in there
+ * and overlapping MTRRs into large pages can cause
+ * slowdowns.
+ */
+ if (pos == 0)
+ end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT);
+ else
+ end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
+ << (PMD_SHIFT - PAGE_SHIFT);
+#else /* CONFIG_X86_64 */
+ end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
+ << (PMD_SHIFT - PAGE_SHIFT);
+#endif
+ if (end_pfn > (end >> PAGE_SHIFT))
+ end_pfn = end >> PAGE_SHIFT;
+ if (start_pfn < end_pfn) {
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+ pos = end_pfn << PAGE_SHIFT;
+ }
+
+ /* big page (2M) range */
+ start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
+ << (PMD_SHIFT - PAGE_SHIFT);
+#ifdef CONFIG_X86_32
+ end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+#else /* CONFIG_X86_64 */
+ end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
+ << (PUD_SHIFT - PAGE_SHIFT);
+ if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
+ end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
+#endif
+
+ if (start_pfn < end_pfn) {
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+ page_size_mask & (1<<PG_LEVEL_2M));
+ pos = end_pfn << PAGE_SHIFT;
+ }
+
+#ifdef CONFIG_X86_64
+ /* big page (1G) range */
+ start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
+ << (PUD_SHIFT - PAGE_SHIFT);
+ end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
+ if (start_pfn < end_pfn) {
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+ page_size_mask &
+ ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
+ pos = end_pfn << PAGE_SHIFT;
+ }
+
+ /* tail is not big page (1G) alignment */
+ start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
+ << (PMD_SHIFT - PAGE_SHIFT);
+ end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+ if (start_pfn < end_pfn) {
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+ page_size_mask & (1<<PG_LEVEL_2M));
+ pos = end_pfn << PAGE_SHIFT;
+ }
+#endif
+
+ /* tail is not big page (2M) alignment */
+ start_pfn = pos>>PAGE_SHIFT;
+ end_pfn = end>>PAGE_SHIFT;
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+
+ /* try to merge same page size and continuous */
+ for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
+ unsigned long old_start;
+ if (mr[i].end != mr[i+1].start ||
+ mr[i].page_size_mask != mr[i+1].page_size_mask)
+ continue;
+ /* move it */
+ old_start = mr[i].start;
+ memmove(&mr[i], &mr[i+1],
+ (nr_range - 1 - i) * sizeof(struct map_range));
+ mr[i--].start = old_start;
+ nr_range--;
+ }
+
+ for (i = 0; i < nr_range; i++)
+ printk(KERN_DEBUG " %010lx - %010lx page %s\n",
+ mr[i].start, mr[i].end,
+ (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
+ (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
+
+ /*
+ * Find space for the kernel direct mapping tables.
+ *
+ * Later we should allocate these tables in the local node of the
+ * memory mapped. Unfortunately this is done currently before the
+ * nodes are discovered.
+ */
+ if (!after_bootmem)
+ find_early_table_space(end, use_pse, use_gbpages);
+
+#ifdef CONFIG_X86_32
+ for (i = 0; i < nr_range; i++)
+ kernel_physical_mapping_init(mr[i].start, mr[i].end,
+ mr[i].page_size_mask);
+ ret = end;
+#else /* CONFIG_X86_64 */
+ for (i = 0; i < nr_range; i++)
+ ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
+ mr[i].page_size_mask);
+#endif
+
+#ifdef CONFIG_X86_32
+ early_ioremap_page_table_range_init();
+
+ load_cr3(swapper_pg_dir);
+#endif
+
+#ifdef CONFIG_X86_64
+ if (!after_bootmem)
+ mmu_cr4_features = read_cr4();
+#endif
+ __flush_tlb_all();
+
+ if (!after_bootmem && e820_table_end > e820_table_start)
+ reserve_early(e820_table_start << PAGE_SHIFT,
+ e820_table_end << PAGE_SHIFT, "PGTABLE");
+
+ if (!after_bootmem)
+ early_memtest(start, end);
+
+ return ret >> PAGE_SHIFT;
+}
+
+
+/*
+ * devmem_is_allowed() checks to see if /dev/mem access to a certain address
+ * is valid. The argument is a physical page number.
+ *
+ *
+ * On x86, access has to be given to the first megabyte of ram because that area
+ * contains bios code and data regions used by X and dosemu and similar apps.
+ * Access has to be given to non-kernel-ram areas as well, these contain the PCI
+ * mmio resources as well as potential bios/acpi data regions.
+ */
+int devmem_is_allowed(unsigned long pagenr)
+{
+ if (pagenr <= 256)
+ return 1;
+ if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
+ return 0;
+ if (!page_is_ram(pagenr))
+ return 1;
+ return 0;
+}
void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
@@ -47,3 +384,10 @@ void free_initmem(void)
(unsigned long)(&__init_begin),
(unsigned long)(&__init_end));
}
+
+#ifdef CONFIG_BLK_DEV_INITRD
+void free_initrd_mem(unsigned long start, unsigned long end)
+{
+ free_init_pages("initrd memory", start, end);
+}
+#endif
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 47df0e1bbeb..db81e9a8556 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -49,6 +49,7 @@
#include <asm/paravirt.h>
#include <asm/setup.h>
#include <asm/cacheflush.h>
+#include <asm/init.h>
unsigned long max_low_pfn_mapped;
unsigned long max_pfn_mapped;
@@ -58,19 +59,14 @@ unsigned long highstart_pfn, highend_pfn;
static noinline int do_test_wp_bit(void);
-
-static unsigned long __initdata table_start;
-static unsigned long __meminitdata table_end;
-static unsigned long __meminitdata table_top;
-
-static int __initdata after_init_bootmem;
+bool __read_mostly __vmalloc_start_set = false;
static __init void *alloc_low_page(void)
{
- unsigned long pfn = table_end++;
+ unsigned long pfn = e820_table_end++;
void *adr;
- if (pfn >= table_top)
+ if (pfn >= e820_table_top)
panic("alloc_low_page: ran out of memory");
adr = __va(pfn * PAGE_SIZE);
@@ -90,7 +86,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
#ifdef CONFIG_X86_PAE
if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
- if (after_init_bootmem)
+ if (after_bootmem)
pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
else
pmd_table = (pmd_t *)alloc_low_page();
@@ -117,7 +113,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
pte_t *page_table = NULL;
- if (after_init_bootmem) {
+ if (after_bootmem) {
#ifdef CONFIG_DEBUG_PAGEALLOC
page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
#endif
@@ -168,12 +164,12 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
if (pmd_idx_kmap_begin != pmd_idx_kmap_end
&& (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
&& (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
- && ((__pa(pte) >> PAGE_SHIFT) < table_start
- || (__pa(pte) >> PAGE_SHIFT) >= table_end)) {
+ && ((__pa(pte) >> PAGE_SHIFT) < e820_table_start
+ || (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) {
pte_t *newpte;
int i;
- BUG_ON(after_init_bootmem);
+ BUG_ON(after_bootmem);
newpte = alloc_low_page();
for (i = 0; i < PTRS_PER_PTE; i++)
set_pte(newpte + i, pte[i]);
@@ -242,11 +238,14 @@ static inline int is_kernel_text(unsigned long addr)
* of max_low_pfn pages, by creating page tables starting from address
* PAGE_OFFSET:
*/
-static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
- unsigned long start_pfn,
- unsigned long end_pfn,
- int use_pse)
+unsigned long __init
+kernel_physical_mapping_init(unsigned long start,
+ unsigned long end,
+ unsigned long page_size_mask)
{
+ int use_pse = page_size_mask == (1<<PG_LEVEL_2M);
+ unsigned long start_pfn, end_pfn;
+ pgd_t *pgd_base = swapper_pg_dir;
int pgd_idx, pmd_idx, pte_ofs;
unsigned long pfn;
pgd_t *pgd;
@@ -255,6 +254,9 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
unsigned pages_2m, pages_4k;
int mapping_iter;
+ start_pfn = start >> PAGE_SHIFT;
+ end_pfn = end >> PAGE_SHIFT;
+
/*
* First iteration will setup identity mapping using large/small pages
* based on use_pse, with other attributes same as set by
@@ -369,26 +371,6 @@ repeat:
mapping_iter = 2;
goto repeat;
}
-}
-
-/*
- * devmem_is_allowed() checks to see if /dev/mem access to a certain address
- * is valid. The argument is a physical page number.
- *
- *
- * On x86, access has to be given to the first megabyte of ram because that area
- * contains bios code and data regions used by X and dosemu and similar apps.
- * Access has to be given to non-kernel-ram areas as well, these contain the PCI
- * mmio resources as well as potential bios/acpi data regions.
- */
-int devmem_is_allowed(unsigned long pagenr)
-{
- if (pagenr <= 256)
- return 1;
- if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
- return 0;
- if (!page_is_ram(pagenr))
- return 1;
return 0;
}
@@ -545,8 +527,9 @@ void __init native_pagetable_setup_done(pgd_t *base)
* be partially populated, and so it avoids stomping on any existing
* mappings.
*/
-static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base)
+void __init early_ioremap_page_table_range_init(void)
{
+ pgd_t *pgd_base = swapper_pg_dir;
unsigned long vaddr, end;
/*
@@ -641,7 +624,7 @@ static int __init noexec_setup(char *str)
}
early_param("noexec", noexec_setup);
-static void __init set_nx(void)
+void __init set_nx(void)
{
unsigned int v[4], l, h;
@@ -793,6 +776,8 @@ void __init initmem_init(unsigned long start_pfn,
#ifdef CONFIG_FLATMEM
max_mapnr = num_physpages;
#endif
+ __vmalloc_start_set = true;
+
printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
pages_to_mb(max_low_pfn));
@@ -814,176 +799,66 @@ static void __init zone_sizes_init(void)
free_area_init_nodes(max_zone_pfns);
}
+static unsigned long __init setup_node_bootmem(int nodeid,
+ unsigned long start_pfn,
+ unsigned long end_pfn,
+ unsigned long bootmap)
+{
+ unsigned long bootmap_size;
+
+ /* don't touch min_low_pfn */
+ bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
+ bootmap >> PAGE_SHIFT,
+ start_pfn, end_pfn);
+ printk(KERN_INFO " node %d low ram: %08lx - %08lx\n",
+ nodeid, start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
+ printk(KERN_INFO " node %d bootmap %08lx - %08lx\n",
+ nodeid, bootmap, bootmap + bootmap_size);
+ free_bootmem_with_active_regions(nodeid, end_pfn);
+ early_res_to_bootmem(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
+
+ return bootmap + bootmap_size;
+}
+
void __init setup_bootmem_allocator(void)
{
- int i;
+ int nodeid;
unsigned long bootmap_size, bootmap;
/*
* Initialize the boot-time allocator (with low memory only):
*/
bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;
- bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT,
- max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
+ bootmap = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
PAGE_SIZE);
if (bootmap == -1L)
panic("Cannot find bootmem map of size %ld\n", bootmap_size);
reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
- /* don't touch min_low_pfn */
- bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
- min_low_pfn, max_low_pfn);
printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
max_pfn_mapped<<PAGE_SHIFT);
- printk(KERN_INFO " low ram: %08lx - %08lx\n",
- min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
- printk(KERN_INFO " bootmap %08lx - %08lx\n",
- bootmap, bootmap + bootmap_size);
- for_each_online_node(i)
- free_bootmem_with_active_regions(i, max_low_pfn);
- early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
-
- after_init_bootmem = 1;
-}
-
-static void __init find_early_table_space(unsigned long end, int use_pse)
-{
- unsigned long puds, pmds, ptes, tables, start;
+ printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
- puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
- tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
+ for_each_online_node(nodeid) {
+ unsigned long start_pfn, end_pfn;
- pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
- tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
-
- if (use_pse) {
- unsigned long extra;
-
- extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
- extra += PMD_SIZE;
- ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
- } else
- ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
-
- tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
-
- /* for fixmap */
- tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
-
- /*
- * RED-PEN putting page tables only on node 0 could
- * cause a hotspot and fill up ZONE_DMA. The page tables
- * need roughly 0.5KB per GB.
- */
- start = 0x7000;
- table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
- tables, PAGE_SIZE);
- if (table_start == -1UL)
- panic("Cannot find space for the kernel page tables");
-
- table_start >>= PAGE_SHIFT;
- table_end = table_start;
- table_top = table_start + (tables>>PAGE_SHIFT);
-
- printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
- end, table_start << PAGE_SHIFT,
- (table_start << PAGE_SHIFT) + tables);
-}
-
-unsigned long __init_refok init_memory_mapping(unsigned long start,
- unsigned long end)
-{
- pgd_t *pgd_base = swapper_pg_dir;
- unsigned long start_pfn, end_pfn;
- unsigned long big_page_start;
-#ifdef CONFIG_DEBUG_PAGEALLOC
- /*
- * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
- * This will simplify cpa(), which otherwise needs to support splitting
- * large pages into small in interrupt context, etc.
- */
- int use_pse = 0;
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+ start_pfn = node_start_pfn[nodeid];
+ end_pfn = node_end_pfn[nodeid];
+ if (start_pfn > max_low_pfn)
+ continue;
+ if (end_pfn > max_low_pfn)
+ end_pfn = max_low_pfn;
#else
- int use_pse = cpu_has_pse;
+ start_pfn = 0;
+ end_pfn = max_low_pfn;
#endif
-
- /*
- * Find space for the kernel direct mapping tables.
- */
- if (!after_init_bootmem)
- find_early_table_space(end, use_pse);
-
-#ifdef CONFIG_X86_PAE
- set_nx();
- if (nx_enabled)
- printk(KERN_INFO "NX (Execute Disable) protection: active\n");
-#endif
-
- /* Enable PSE if available */
- if (cpu_has_pse)
- set_in_cr4(X86_CR4_PSE);
-
- /* Enable PGE if available */
- if (cpu_has_pge) {
- set_in_cr4(X86_CR4_PGE);
- __supported_pte_mask |= _PAGE_GLOBAL;
- }
-
- /*
- * Don't use a large page for the first 2/4MB of memory
- * because there are often fixed size MTRRs in there
- * and overlapping MTRRs into large pages can cause
- * slowdowns.
- */
- big_page_start = PMD_SIZE;
-
- if (start < big_page_start) {
- start_pfn = start >> PAGE_SHIFT;
- end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT);
- } else {
- /* head is not big page alignment ? */
- start_pfn = start >> PAGE_SHIFT;
- end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
+ bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn,
+ bootmap);
}
- if (start_pfn < end_pfn)
- kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0);
-
- /* big page range */
- start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
- if (start_pfn < (big_page_start >> PAGE_SHIFT))
- start_pfn = big_page_start >> PAGE_SHIFT;
- end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
- if (start_pfn < end_pfn)
- kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
- use_pse);
-
- /* tail is not big page alignment ? */
- start_pfn = end_pfn;
- if (start_pfn > (big_page_start>>PAGE_SHIFT)) {
- end_pfn = end >> PAGE_SHIFT;
- if (start_pfn < end_pfn)
- kernel_physical_mapping_init(pgd_base, start_pfn,
- end_pfn, 0);
- }
-
- early_ioremap_page_table_range_init(pgd_base);
- load_cr3(swapper_pg_dir);
-
- __flush_tlb_all();
-
- if (!after_init_bootmem)
- reserve_early(table_start << PAGE_SHIFT,
- table_end << PAGE_SHIFT, "PGTABLE");
-
- if (!after_init_bootmem)
- early_memtest(start, end);
-
- return end >> PAGE_SHIFT;
+ after_bootmem = 1;
}
-
/*
* paging_init() sets up the page tables - note that the first 8MB are
* already mapped by head.S.
@@ -1217,13 +1092,6 @@ void mark_rodata_ro(void)
}
#endif
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_init_pages("initrd memory", start, end);
-}
-#endif
-
int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
int flags)
{
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 07f44d491df..54efa57d1c0 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -48,6 +48,7 @@
#include <asm/kdebug.h>
#include <asm/numa.h>
#include <asm/cacheflush.h>
+#include <asm/init.h>
/*
* end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -61,12 +62,6 @@ static unsigned long dma_reserve __initdata;
DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-int direct_gbpages
-#ifdef CONFIG_DIRECT_GBPAGES
- = 1
-#endif
-;
-
static int __init parse_direct_gbpages_off(char *arg)
{
direct_gbpages = 0;
@@ -87,12 +82,10 @@ early_param("gbpages", parse_direct_gbpages_on);
* around without checking the pgd every time.
*/
-int after_bootmem;
-
pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
EXPORT_SYMBOL_GPL(__supported_pte_mask);
-static int do_not_nx __cpuinitdata;
+static int disable_nx __cpuinitdata;
/*
* noexec=on|off
@@ -107,9 +100,9 @@ static int __init nonx_setup(char *str)
return -EINVAL;
if (!strncmp(str, "on", 2)) {
__supported_pte_mask |= _PAGE_NX;
- do_not_nx = 0;
+ disable_nx = 0;
} else if (!strncmp(str, "off", 3)) {
- do_not_nx = 1;
+ disable_nx = 1;
__supported_pte_mask &= ~_PAGE_NX;
}
return 0;
@@ -121,7 +114,7 @@ void __cpuinit check_efer(void)
unsigned long efer;
rdmsrl(MSR_EFER, efer);
- if (!(efer & EFER_NX) || do_not_nx)
+ if (!(efer & EFER_NX) || disable_nx)
__supported_pte_mask &= ~_PAGE_NX;
}
@@ -325,13 +318,9 @@ void __init cleanup_highmap(void)
}
}
-static unsigned long __initdata table_start;
-static unsigned long __meminitdata table_end;
-static unsigned long __meminitdata table_top;
-
static __ref void *alloc_low_page(unsigned long *phys)
{
- unsigned long pfn = table_end++;
+ unsigned long pfn = e820_table_end++;
void *adr;
if (after_bootmem) {
@@ -341,7 +330,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
return adr;
}
- if (pfn >= table_top)
+ if (pfn >= e820_table_top)
panic("alloc_low_page: ran out of memory");
adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
@@ -581,58 +570,10 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
return phys_pud_init(pud, addr, end, page_size_mask);
}
-static void __init find_early_table_space(unsigned long end, int use_pse,
- int use_gbpages)
-{
- unsigned long puds, pmds, ptes, tables, start;
-
- puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
- tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
- if (use_gbpages) {
- unsigned long extra;
- extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
- pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
- } else
- pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
- tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
-
- if (use_pse) {
- unsigned long extra;
- extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
- ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
- } else
- ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
- tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
-
- /*
- * RED-PEN putting page tables only on node 0 could
- * cause a hotspot and fill up ZONE_DMA. The page tables
- * need roughly 0.5KB per GB.
- */
- start = 0x8000;
- table_start = find_e820_area(start, end, tables, PAGE_SIZE);
- if (table_start == -1UL)
- panic("Cannot find space for the kernel page tables");
-
- table_start >>= PAGE_SHIFT;
- table_end = table_start;
- table_top = table_start + (tables >> PAGE_SHIFT);
-
- printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
- end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT);
-}
-
-static void __init init_gbpages(void)
-{
- if (direct_gbpages && cpu_has_gbpages)
- printk(KERN_INFO "Using GB pages for direct mapping\n");
- else
- direct_gbpages = 0;
-}
-
-static unsigned long __meminit kernel_physical_mapping_init(unsigned long start,
- unsigned long end,
- unsigned long page_size_mask)
+unsigned long __init
+kernel_physical_mapping_init(unsigned long start,
+ unsigned long end,
+ unsigned long page_size_mask)
{
unsigned long next, last_map_addr = end;
@@ -669,176 +610,6 @@ static unsigned long __meminit kernel_physical_mapping_init(unsigned long start,
return last_map_addr;
}
-struct map_range {
- unsigned long start;
- unsigned long end;
- unsigned page_size_mask;
-};
-
-#define NR_RANGE_MR 5
-
-static int save_mr(struct map_range *mr, int nr_range,
- unsigned long start_pfn, unsigned long end_pfn,
- unsigned long page_size_mask)
-{
-
- if (start_pfn < end_pfn) {
- if (nr_range >= NR_RANGE_MR)
- panic("run out of range for init_memory_mapping\n");
- mr[nr_range].start = start_pfn<<PAGE_SHIFT;
- mr[nr_range].end = end_pfn<<PAGE_SHIFT;
- mr[nr_range].page_size_mask = page_size_mask;
- nr_range++;
- }
-
- return nr_range;
-}
-
-/*
- * Setup the direct mapping of the physical memory at PAGE_OFFSET.
- * This runs before bootmem is initialized and gets pages directly from
- * the physical memory. To access them they are temporarily mapped.
- */
-unsigned long __init_refok init_memory_mapping(unsigned long start,
- unsigned long end)
-{
- unsigned long last_map_addr = 0;
- unsigned long page_size_mask = 0;
- unsigned long start_pfn, end_pfn;
- unsigned long pos;
-
- struct map_range mr[NR_RANGE_MR];
- int nr_range, i;
- int use_pse, use_gbpages;
-
- printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
-
- /*
- * Find space for the kernel direct mapping tables.
- *
- * Later we should allocate these tables in the local node of the
- * memory mapped. Unfortunately this is done currently before the
- * nodes are discovered.
- */
- if (!after_bootmem)
- init_gbpages();
-
-#ifdef CONFIG_DEBUG_PAGEALLOC
- /*
- * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
- * This will simplify cpa(), which otherwise needs to support splitting
- * large pages into small in interrupt context, etc.
- */
- use_pse = use_gbpages = 0;
-#else
- use_pse = cpu_has_pse;
- use_gbpages = direct_gbpages;
-#endif
-
- if (use_gbpages)
- page_size_mask |= 1 << PG_LEVEL_1G;
- if (use_pse)
- page_size_mask |= 1 << PG_LEVEL_2M;
-
- memset(mr, 0, sizeof(mr));
- nr_range = 0;
-
- /* head if not big page alignment ?*/
- start_pfn = start >> PAGE_SHIFT;
- pos = start_pfn << PAGE_SHIFT;
- end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
- if (end_pfn > (end >> PAGE_SHIFT))
- end_pfn = end >> PAGE_SHIFT;
- if (start_pfn < end_pfn) {
- nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
- pos = end_pfn << PAGE_SHIFT;
- }
-
- /* big page (2M) range*/
- start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
- end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
- << (PUD_SHIFT - PAGE_SHIFT);
- if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
- end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
- if (start_pfn < end_pfn) {
- nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
- page_size_mask & (1<<PG_LEVEL_2M));
- pos = end_pfn << PAGE_SHIFT;
- }
-
- /* big page (1G) range */
- start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
- << (PUD_SHIFT - PAGE_SHIFT);
- end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
- if (start_pfn < end_pfn) {
- nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
- page_size_mask &
- ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
- pos = end_pfn << PAGE_SHIFT;
- }
-
- /* tail is not big page (1G) alignment */
- start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
- end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
- if (start_pfn < end_pfn) {
- nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
- page_size_mask & (1<<PG_LEVEL_2M));
- pos = end_pfn << PAGE_SHIFT;
- }
-
- /* tail is not big page (2M) alignment */
- start_pfn = pos>>PAGE_SHIFT;
- end_pfn = end>>PAGE_SHIFT;
- nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
-
- /* try to merge same page size and continuous */
- for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
- unsigned long old_start;
- if (mr[i].end != mr[i+1].start ||
- mr[i].page_size_mask != mr[i+1].page_size_mask)
- continue;
- /* move it */
- old_start = mr[i].start;
- memmove(&mr[i], &mr[i+1],
- (nr_range - 1 - i) * sizeof (struct map_range));
- mr[i--].start = old_start;
- nr_range--;
- }
-
- for (i = 0; i < nr_range; i++)
- printk(KERN_DEBUG " %010lx - %010lx page %s\n",
- mr[i].start, mr[i].end,
- (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
- (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
-
- if (!after_bootmem)
- find_early_table_space(end, use_pse, use_gbpages);
-
- for (i = 0; i < nr_range; i++)
- last_map_addr = kernel_physical_mapping_init(
- mr[i].start, mr[i].end,
- mr[i].page_size_mask);
-
- if (!after_bootmem)
- mmu_cr4_features = read_cr4();
- __flush_tlb_all();
-
- if (!after_bootmem && table_end > table_start)
- reserve_early(table_start << PAGE_SHIFT,
- table_end << PAGE_SHIFT, "PGTABLE");
-
- printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
- last_map_addr, end);
-
- if (!after_bootmem)
- early_memtest(start, end);
-
- return last_map_addr >> PAGE_SHIFT;
-}
-
#ifndef CONFIG_NUMA
void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
@@ -910,28 +681,6 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif /* CONFIG_MEMORY_HOTPLUG */
-/*
- * devmem_is_allowed() checks to see if /dev/mem access to a certain address
- * is valid. The argument is a physical page number.
- *
- *
- * On x86, access has to be given to the first megabyte of ram because that area
- * contains bios code and data regions used by X and dosemu and similar apps.
- * Access has to be given to non-kernel-ram areas as well, these contain the PCI
- * mmio resources as well as potential bios/acpi data regions.
- */
-int devmem_is_allowed(unsigned long pagenr)
-{
- if (pagenr <= 256)
- return 1;
- if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
- return 0;
- if (!page_is_ram(pagenr))
- return 1;
- return 0;
-}
-
-
static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
kcore_modules, kcore_vsyscall;
@@ -1019,13 +768,6 @@ void mark_rodata_ro(void)
#endif
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_init_pages("initrd memory", start, end);
-}
-#endif
-
int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
int flags)
{
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index 04102d42ff4..699c9b2895a 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -31,16 +31,27 @@ int is_io_mapping_possible(resource_size_t base, unsigned long size)
}
EXPORT_SYMBOL_GPL(is_io_mapping_possible);
-/* Map 'pfn' using fixed map 'type' and protections 'prot'
- */
-void *
-iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
+void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
{
enum fixed_addresses idx;
unsigned long vaddr;
pagefault_disable();
+ idx = type + KM_TYPE_NR * smp_processor_id();
+ vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+ set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
+ arch_flush_lazy_mmu_mode();
+
+ return (void *)vaddr;
+}
+
+/*
+ * Map 'pfn' using fixed map 'type' and protections 'prot'
+ */
+void *
+iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
+{
/*
* For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS.
* PAGE_KERNEL_WC maps to PWT, which translates to uncached if the
@@ -50,12 +61,7 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC))
prot = PAGE_KERNEL_UC_MINUS;
- idx = type + KM_TYPE_NR*smp_processor_id();
- vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
- set_pte(kmap_pte-idx, pfn_pte(pfn, prot));
- arch_flush_lazy_mmu_mode();
-
- return (void*) vaddr;
+ return kmap_atomic_prot_pfn(pfn, type, prot);
}
EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn);
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 433f7bd4648..0dfa09d69e8 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -22,13 +22,17 @@
#include <asm/pgalloc.h>
#include <asm/pat.h>
-#ifdef CONFIG_X86_64
-
-static inline int phys_addr_valid(unsigned long addr)
+static inline int phys_addr_valid(resource_size_t addr)
{
- return addr < (1UL << boot_cpu_data.x86_phys_bits);
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+ return !(addr >> boot_cpu_data.x86_phys_bits);
+#else
+ return 1;
+#endif
}
+#ifdef CONFIG_X86_64
+
unsigned long __phys_addr(unsigned long x)
{
if (x >= __START_KERNEL_map) {
@@ -38,8 +42,7 @@ unsigned long __phys_addr(unsigned long x)
} else {
VIRTUAL_BUG_ON(x < PAGE_OFFSET);
x -= PAGE_OFFSET;
- VIRTUAL_BUG_ON(system_state == SYSTEM_BOOTING ? x > MAXMEM :
- !phys_addr_valid(x));
+ VIRTUAL_BUG_ON(!phys_addr_valid(x));
}
return x;
}
@@ -56,10 +59,8 @@ bool __virt_addr_valid(unsigned long x)
if (x < PAGE_OFFSET)
return false;
x -= PAGE_OFFSET;
- if (system_state == SYSTEM_BOOTING ?
- x > MAXMEM : !phys_addr_valid(x)) {
+ if (!phys_addr_valid(x))
return false;
- }
}
return pfn_valid(x >> PAGE_SHIFT);
@@ -68,18 +69,12 @@ EXPORT_SYMBOL(__virt_addr_valid);
#else
-static inline int phys_addr_valid(unsigned long addr)
-{
- return 1;
-}
-
#ifdef CONFIG_DEBUG_VIRTUAL
unsigned long __phys_addr(unsigned long x)
{
- /* VMALLOC_* aren't constants; not available at the boot time */
+ /* VMALLOC_* aren't constants */
VIRTUAL_BUG_ON(x < PAGE_OFFSET);
- VIRTUAL_BUG_ON(system_state != SYSTEM_BOOTING &&
- is_vmalloc_addr((void *) x));
+ VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x));
return x - PAGE_OFFSET;
}
EXPORT_SYMBOL(__phys_addr);
@@ -89,7 +84,9 @@ bool __virt_addr_valid(unsigned long x)
{
if (x < PAGE_OFFSET)
return false;
- if (system_state != SYSTEM_BOOTING && is_vmalloc_addr((void *) x))
+ if (__vmalloc_start_set && is_vmalloc_addr((void *) x))
+ return false;
+ if (x >= FIXADDR_START)
return false;
return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT);
}
@@ -508,13 +505,19 @@ static inline pte_t * __init early_ioremap_pte(unsigned long addr)
return &bm_pte[pte_index(addr)];
}
+static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata;
+
void __init early_ioremap_init(void)
{
pmd_t *pmd;
+ int i;
if (early_ioremap_debug)
printk(KERN_INFO "early_ioremap_init()\n");
+ for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
+ slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);
+
pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
memset(bm_pte, 0, sizeof(bm_pte));
pmd_populate_kernel(&init_mm, pmd, bm_pte);
@@ -581,6 +584,7 @@ static inline void __init early_clear_fixmap(enum fixed_addresses idx)
static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata;
static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
+
static int __init check_early_ioremap_leak(void)
{
int count = 0;
@@ -602,7 +606,8 @@ static int __init check_early_ioremap_leak(void)
}
late_initcall(check_early_ioremap_leak);
-static void __init __iomem *__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot)
+static void __init __iomem *
+__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot)
{
unsigned long offset, last_addr;
unsigned int nrpages;
@@ -668,9 +673,9 @@ static void __init __iomem *__early_ioremap(unsigned long phys_addr, unsigned lo
--nrpages;
}
if (early_ioremap_debug)
- printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0));
+ printk(KERN_CONT "%08lx + %08lx\n", offset, slot_virt[slot]);
- prev_map[slot] = (void __iomem *)(offset + fix_to_virt(idx0));
+ prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]);
return prev_map[slot];
}
@@ -738,8 +743,3 @@ void __init early_iounmap(void __iomem *addr, unsigned long size)
}
prev_map[slot] = NULL;
}
-
-void __this_fixmap_does_not_exist(void)
-{
- WARN_ON(1);
-}
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 6a518dd08a3..4f115e00486 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -310,7 +310,7 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);
if (!ctx->active) {
- pr_warning("kmmio: spurious debug trap on CPU %d.\n",
+ pr_debug("kmmio: spurious debug trap on CPU %d.\n",
smp_processor_id());
goto out;
}
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index 0bcd7883d03..605c8be0621 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -100,6 +100,9 @@ static int __init parse_memtest(char *arg)
{
if (arg)
memtest_pattern = simple_strtoul(arg, NULL, 0);
+ else
+ memtest_pattern = ARRAY_SIZE(patterns);
+
return 0;
}
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 451fe95a035..3daefa04ace 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -416,10 +416,11 @@ void __init initmem_init(unsigned long start_pfn,
for_each_online_node(nid)
propagate_e820_map_node(nid);
- for_each_online_node(nid)
+ for_each_online_node(nid) {
memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
+ NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
+ }
- NODE_DATA(0)->bdata = &bootmem_node_data[0];
setup_bootmem_allocator();
}
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 9c4294986af..d71e1b636ce 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -16,6 +16,7 @@
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
+#include <asm/setup.h>
#include <asm/uaccess.h>
#include <asm/pgalloc.h>
#include <asm/proto.h>
@@ -33,6 +34,7 @@ struct cpa_data {
unsigned long pfn;
unsigned force_split : 1;
int curpage;
+ struct page **pages;
};
/*
@@ -45,6 +47,7 @@ static DEFINE_SPINLOCK(cpa_lock);
#define CPA_FLUSHTLB 1
#define CPA_ARRAY 2
+#define CPA_PAGES_ARRAY 4
#ifdef CONFIG_PROC_FS
static unsigned long direct_pages_count[PG_LEVEL_NUM];
@@ -95,7 +98,7 @@ static inline unsigned long highmap_start_pfn(void)
static inline unsigned long highmap_end_pfn(void)
{
- return __pa(roundup((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT;
+ return __pa(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT;
}
#endif
@@ -201,10 +204,10 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
}
}
-static void cpa_flush_array(unsigned long *start, int numpages, int cache)
+static void cpa_flush_array(unsigned long *start, int numpages, int cache,
+ int in_flags, struct page **pages)
{
unsigned int i, level;
- unsigned long *addr;
BUG_ON(irqs_disabled());
@@ -225,14 +228,22 @@ static void cpa_flush_array(unsigned long *start, int numpages, int cache)
* will cause all other CPUs to flush the same
* cachelines:
*/
- for (i = 0, addr = start; i < numpages; i++, addr++) {
- pte_t *pte = lookup_address(*addr, &level);
+ for (i = 0; i < numpages; i++) {
+ unsigned long addr;
+ pte_t *pte;
+
+ if (in_flags & CPA_PAGES_ARRAY)
+ addr = (unsigned long)page_address(pages[i]);
+ else
+ addr = start[i];
+
+ pte = lookup_address(addr, &level);
/*
* Only flush present addresses:
*/
if (pte && (pte_val(*pte) & _PAGE_PRESENT))
- clflush_cache_range((void *) *addr, PAGE_SIZE);
+ clflush_cache_range((void *)addr, PAGE_SIZE);
}
}
@@ -584,7 +595,9 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
unsigned int level;
pte_t *kpte, old_pte;
- if (cpa->flags & CPA_ARRAY)
+ if (cpa->flags & CPA_PAGES_ARRAY)
+ address = (unsigned long)page_address(cpa->pages[cpa->curpage]);
+ else if (cpa->flags & CPA_ARRAY)
address = cpa->vaddr[cpa->curpage];
else
address = *cpa->vaddr;
@@ -687,7 +700,9 @@ static int cpa_process_alias(struct cpa_data *cpa)
* No need to redo, when the primary call touched the direct
* mapping already:
*/
- if (cpa->flags & CPA_ARRAY)
+ if (cpa->flags & CPA_PAGES_ARRAY)
+ vaddr = (unsigned long)page_address(cpa->pages[cpa->curpage]);
+ else if (cpa->flags & CPA_ARRAY)
vaddr = cpa->vaddr[cpa->curpage];
else
vaddr = *cpa->vaddr;
@@ -698,7 +713,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
alias_cpa = *cpa;
temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
alias_cpa.vaddr = &temp_cpa_vaddr;
- alias_cpa.flags &= ~CPA_ARRAY;
+ alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
ret = __change_page_attr_set_clr(&alias_cpa, 0);
@@ -711,7 +726,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
* No need to redo, when the primary call touched the high
* mapping already:
*/
- if (within(vaddr, (unsigned long) _text, (unsigned long) _end))
+ if (within(vaddr, (unsigned long) _text, _brk_end))
return 0;
/*
@@ -724,7 +739,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
alias_cpa = *cpa;
temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base;
alias_cpa.vaddr = &temp_cpa_vaddr;
- alias_cpa.flags &= ~CPA_ARRAY;
+ alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
/*
* The high mapping range is imprecise, so ignore the return value.
@@ -745,7 +760,7 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
*/
cpa->numpages = numpages;
/* for array changes, we can't use large page */
- if (cpa->flags & CPA_ARRAY)
+ if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY))
cpa->numpages = 1;
if (!debug_pagealloc)
@@ -769,7 +784,7 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
*/
BUG_ON(cpa->numpages > numpages);
numpages -= cpa->numpages;
- if (cpa->flags & CPA_ARRAY)
+ if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY))
cpa->curpage++;
else
*cpa->vaddr += cpa->numpages * PAGE_SIZE;
@@ -786,7 +801,8 @@ static inline int cache_attr(pgprot_t attr)
static int change_page_attr_set_clr(unsigned long *addr, int numpages,
pgprot_t mask_set, pgprot_t mask_clr,
- int force_split, int array)
+ int force_split, int in_flag,
+ struct page **pages)
{
struct cpa_data cpa;
int ret, cache, checkalias;
@@ -801,15 +817,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
return 0;
/* Ensure we are PAGE_SIZE aligned */
- if (!array) {
- if (*addr & ~PAGE_MASK) {
- *addr &= PAGE_MASK;
- /*
- * People should not be passing in unaligned addresses:
- */
- WARN_ON_ONCE(1);
- }
- } else {
+ if (in_flag & CPA_ARRAY) {
int i;
for (i = 0; i < numpages; i++) {
if (addr[i] & ~PAGE_MASK) {
@@ -817,6 +825,18 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
WARN_ON_ONCE(1);
}
}
+ } else if (!(in_flag & CPA_PAGES_ARRAY)) {
+ /*
+ * in_flag of CPA_PAGES_ARRAY implies it is aligned.
+ * No need to cehck in that case
+ */
+ if (*addr & ~PAGE_MASK) {
+ *addr &= PAGE_MASK;
+ /*
+ * People should not be passing in unaligned addresses:
+ */
+ WARN_ON_ONCE(1);
+ }
}
/* Must avoid aliasing mappings in the highmem code */
@@ -832,6 +852,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
arch_flush_lazy_mmu_mode();
cpa.vaddr = addr;
+ cpa.pages = pages;
cpa.numpages = numpages;
cpa.mask_set = mask_set;
cpa.mask_clr = mask_clr;
@@ -839,8 +860,8 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
cpa.curpage = 0;
cpa.force_split = force_split;
- if (array)
- cpa.flags |= CPA_ARRAY;
+ if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY))
+ cpa.flags |= in_flag;
/* No alias checking for _NX bit modifications */
checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
@@ -866,9 +887,10 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
* wbindv):
*/
if (!ret && cpu_has_clflush) {
- if (cpa.flags & CPA_ARRAY)
- cpa_flush_array(addr, numpages, cache);
- else
+ if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
+ cpa_flush_array(addr, numpages, cache,
+ cpa.flags, pages);
+ } else
cpa_flush_range(*addr, numpages, cache);
} else
cpa_flush_all(cache);
@@ -888,14 +910,28 @@ static inline int change_page_attr_set(unsigned long *addr, int numpages,
pgprot_t mask, int array)
{
return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
- array);
+ (array ? CPA_ARRAY : 0), NULL);
}
static inline int change_page_attr_clear(unsigned long *addr, int numpages,
pgprot_t mask, int array)
{
return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
- array);
+ (array ? CPA_ARRAY : 0), NULL);
+}
+
+static inline int cpa_set_pages_array(struct page **pages, int numpages,
+ pgprot_t mask)
+{
+ return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0,
+ CPA_PAGES_ARRAY, pages);
+}
+
+static inline int cpa_clear_pages_array(struct page **pages, int numpages,
+ pgprot_t mask)
+{
+ return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0,
+ CPA_PAGES_ARRAY, pages);
}
int _set_memory_uc(unsigned long addr, int numpages)
@@ -1043,7 +1079,7 @@ int set_memory_np(unsigned long addr, int numpages)
int set_memory_4k(unsigned long addr, int numpages)
{
return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
- __pgprot(0), 1, 0);
+ __pgprot(0), 1, 0, NULL);
}
int set_pages_uc(struct page *page, int numpages)
@@ -1054,6 +1090,35 @@ int set_pages_uc(struct page *page, int numpages)
}
EXPORT_SYMBOL(set_pages_uc);
+int set_pages_array_uc(struct page **pages, int addrinarray)
+{
+ unsigned long start;
+ unsigned long end;
+ int i;
+ int free_idx;
+
+ for (i = 0; i < addrinarray; i++) {
+ start = (unsigned long)page_address(pages[i]);
+ end = start + PAGE_SIZE;
+ if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL))
+ goto err_out;
+ }
+
+ if (cpa_set_pages_array(pages, addrinarray,
+ __pgprot(_PAGE_CACHE_UC_MINUS)) == 0) {
+ return 0; /* Success */
+ }
+err_out:
+ free_idx = i;
+ for (i = 0; i < free_idx; i++) {
+ start = (unsigned long)page_address(pages[i]);
+ end = start + PAGE_SIZE;
+ free_memtype(start, end);
+ }
+ return -EINVAL;
+}
+EXPORT_SYMBOL(set_pages_array_uc);
+
int set_pages_wb(struct page *page, int numpages)
{
unsigned long addr = (unsigned long)page_address(page);
@@ -1062,6 +1127,26 @@ int set_pages_wb(struct page *page, int numpages)
}
EXPORT_SYMBOL(set_pages_wb);
+int set_pages_array_wb(struct page **pages, int addrinarray)
+{
+ int retval;
+ unsigned long start;
+ unsigned long end;
+ int i;
+
+ retval = cpa_clear_pages_array(pages, addrinarray,
+ __pgprot(_PAGE_CACHE_MASK));
+
+ for (i = 0; i < addrinarray; i++) {
+ start = (unsigned long)page_address(pages[i]);
+ end = start + PAGE_SIZE;
+ free_memtype(start, end);
+ }
+
+ return retval;
+}
+EXPORT_SYMBOL(set_pages_array_wb);
+
int set_pages_x(struct page *page, int numpages)
{
unsigned long addr = (unsigned long)page_address(page);
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 2ed37158012..640339ee4fb 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -677,10 +677,11 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
is_ram = pat_pagerange_is_ram(paddr, paddr + size);
/*
- * reserve_pfn_range() doesn't support RAM pages.
+ * reserve_pfn_range() doesn't support RAM pages. Maintain the current
+ * behavior with RAM pages by returning success.
*/
if (is_ram != 0)
- return -EINVAL;
+ return 0;
ret = reserve_memtype(paddr, paddr + size, want_flags, &flags);
if (ret)
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index f2e477c91c1..46c8834aedc 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -50,7 +50,7 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
}
pte = pte_offset_kernel(pmd, vaddr);
if (pte_val(pteval))
- set_pte_present(&init_mm, vaddr, pte, pteval);
+ set_pte_at(&init_mm, vaddr, pte, pteval);
else
pte_clear(&init_mm, vaddr, pte);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index a654d59e448..821e97017e9 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -187,11 +187,6 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
cpumask, cpumask_of(smp_processor_id()));
/*
- * Make the above memory operations globally visible before
- * sending the IPI.
- */
- smp_mb();
- /*
* We have to send the IPI only to
* CPUs affected.
*/
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 82d22fc601a..8c362b96b64 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -90,7 +90,7 @@ static int __devinit can_skip_ioresource_align(const struct dmi_system_id *d)
return 0;
}
-static struct dmi_system_id can_skip_pciprobe_dmi_table[] __devinitdata = {
+static const struct dmi_system_id can_skip_pciprobe_dmi_table[] __devinitconst = {
/*
* Systems where PCI IO resource ISA alignment can be skipped
* when the ISA enable bit in the bridge control is not set
@@ -183,7 +183,7 @@ static int __devinit assign_all_busses(const struct dmi_system_id *d)
}
#endif
-static struct dmi_system_id __devinitdata pciprobe_dmi_table[] = {
+static const struct dmi_system_id __devinitconst pciprobe_dmi_table[] = {
#ifdef __i386__
/*
* Laptops which need pci=assign-busses to see Cardbus cards
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 7d388d5cf54..9c49919e4d1 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -356,7 +356,7 @@ static void __devinit pci_fixup_video(struct pci_dev *pdev)
DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pci_fixup_video);
-static struct dmi_system_id __devinitdata msi_k8t_dmi_table[] = {
+static const struct dmi_system_id __devinitconst msi_k8t_dmi_table[] = {
{
.ident = "MSI-K8T-Neo2Fir",
.matches = {
@@ -413,7 +413,7 @@ DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237,
*/
static u16 toshiba_line_size;
-static struct dmi_system_id __devinitdata toshiba_ohci1394_dmi_table[] = {
+static const struct dmi_system_id __devinitconst toshiba_ohci1394_dmi_table[] = {
{
.ident = "Toshiba PS5 based laptop",
.matches = {
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 5ead808dd70..f234a37bd42 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -319,6 +319,9 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
return -EINVAL;
}
flags = new_flags;
+ vma->vm_page_prot = __pgprot(
+ (pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK) |
+ flags);
}
if (((vma->vm_pgoff < max_low_pfn_mapped) ||
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index cb6afa4ec95..db3802fb7b8 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1723,9 +1723,9 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
{
pmd_t *kernel_pmd;
- init_pg_tables_start = __pa(pgd);
- init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
- max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
+ max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
+ xen_start_info->nr_pt_frames * PAGE_SIZE +
+ 512*1024);
kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
@@ -1870,7 +1870,6 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = {
#ifdef CONFIG_X86_PAE
.set_pte_atomic = xen_set_pte_atomic,
- .set_pte_present = xen_set_pte_at,
.pte_clear = xen_pte_clear,
.pmd_clear = xen_pmd_clear,
#endif /* CONFIG_X86_PAE */
diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c
index 5f333403c2e..d313039e2fd 100644
--- a/drivers/pci/dmar.c
+++ b/drivers/pci/dmar.c
@@ -31,6 +31,8 @@
#include <linux/iova.h>
#include <linux/intel-iommu.h>
#include <linux/timer.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
#undef PREFIX
#define PREFIX "DMAR:"
@@ -509,6 +511,7 @@ int alloc_iommu(struct dmar_drhd_unit *drhd)
return -ENOMEM;
iommu->seq_id = iommu_allocated++;
+ sprintf (iommu->name, "dmar%d", iommu->seq_id);
iommu->reg = ioremap(drhd->reg_base_addr, VTD_PAGE_SIZE);
if (!iommu->reg) {
@@ -751,6 +754,42 @@ int qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
}
/*
+ * Disable Queued Invalidation interface.
+ */
+void dmar_disable_qi(struct intel_iommu *iommu)
+{
+ unsigned long flags;
+ u32 sts;
+ cycles_t start_time = get_cycles();
+
+ if (!ecap_qis(iommu->ecap))
+ return;
+
+ spin_lock_irqsave(&iommu->register_lock, flags);
+
+ sts = dmar_readq(iommu->reg + DMAR_GSTS_REG);
+ if (!(sts & DMA_GSTS_QIES))
+ goto end;
+
+ /*
+ * Give a chance to HW to complete the pending invalidation requests.
+ */
+ while ((readl(iommu->reg + DMAR_IQT_REG) !=
+ readl(iommu->reg + DMAR_IQH_REG)) &&
+ (DMAR_OPERATION_TIMEOUT > (get_cycles() - start_time)))
+ cpu_relax();
+
+ iommu->gcmd &= ~DMA_GCMD_QIE;
+
+ writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
+
+ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl,
+ !(sts & DMA_GSTS_QIES), sts);
+end:
+ spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
+/*
* Enable Queued Invalidation interface. This is a must to support
* interrupt-remapping. Also used by DMA-remapping, which replaces
* register based IOTLB invalidation.
@@ -770,20 +809,20 @@ int dmar_enable_qi(struct intel_iommu *iommu)
if (iommu->qi)
return 0;
- iommu->qi = kmalloc(sizeof(*qi), GFP_KERNEL);
+ iommu->qi = kmalloc(sizeof(*qi), GFP_ATOMIC);
if (!iommu->qi)
return -ENOMEM;
qi = iommu->qi;
- qi->desc = (void *)(get_zeroed_page(GFP_KERNEL));
+ qi->desc = (void *)(get_zeroed_page(GFP_ATOMIC));
if (!qi->desc) {
kfree(qi);
iommu->qi = 0;
return -ENOMEM;
}
- qi->desc_status = kmalloc(QI_LENGTH * sizeof(int), GFP_KERNEL);
+ qi->desc_status = kmalloc(QI_LENGTH * sizeof(int), GFP_ATOMIC);
if (!qi->desc_status) {
free_page((unsigned long) qi->desc);
kfree(qi);
@@ -812,3 +851,254 @@ int dmar_enable_qi(struct intel_iommu *iommu)
return 0;
}
+
+/* iommu interrupt handling. Most stuff are MSI-like. */
+
+enum faulttype {
+ DMA_REMAP,
+ INTR_REMAP,
+ UNKNOWN,
+};
+
+static const char *dma_remap_fault_reasons[] =
+{
+ "Software",
+ "Present bit in root entry is clear",
+ "Present bit in context entry is clear",
+ "Invalid context entry",
+ "Access beyond MGAW",
+ "PTE Write access is not set",
+ "PTE Read access is not set",
+ "Next page table ptr is invalid",
+ "Root table address invalid",
+ "Context table ptr is invalid",
+ "non-zero reserved fields in RTP",
+ "non-zero reserved fields in CTP",
+ "non-zero reserved fields in PTE",
+};
+
+static const char *intr_remap_fault_reasons[] =
+{
+ "Detected reserved fields in the decoded interrupt-remapped request",
+ "Interrupt index exceeded the interrupt-remapping table size",
+ "Present field in the IRTE entry is clear",
+ "Error accessing interrupt-remapping table pointed by IRTA_REG",
+ "Detected reserved fields in the IRTE entry",
+ "Blocked a compatibility format interrupt request",
+ "Blocked an interrupt request due to source-id verification failure",
+};
+
+#define MAX_FAULT_REASON_IDX (ARRAY_SIZE(fault_reason_strings) - 1)
+
+const char *dmar_get_fault_reason(u8 fault_reason, int *fault_type)
+{
+ if (fault_reason >= 0x20 && (fault_reason <= 0x20 +
+ ARRAY_SIZE(intr_remap_fault_reasons))) {
+ *fault_type = INTR_REMAP;
+ return intr_remap_fault_reasons[fault_reason - 0x20];
+ } else if (fault_reason < ARRAY_SIZE(dma_remap_fault_reasons)) {
+ *fault_type = DMA_REMAP;
+ return dma_remap_fault_reasons[fault_reason];
+ } else {
+ *fault_type = UNKNOWN;
+ return "Unknown";
+ }
+}
+
+void dmar_msi_unmask(unsigned int irq)
+{
+ struct intel_iommu *iommu = get_irq_data(irq);
+ unsigned long flag;
+
+ /* unmask it */
+ spin_lock_irqsave(&iommu->register_lock, flag);
+ writel(0, iommu->reg + DMAR_FECTL_REG);
+ /* Read a reg to force flush the post write */
+ readl(iommu->reg + DMAR_FECTL_REG);
+ spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+void dmar_msi_mask(unsigned int irq)
+{
+ unsigned long flag;
+ struct intel_iommu *iommu = get_irq_data(irq);
+
+ /* mask it */
+ spin_lock_irqsave(&iommu->register_lock, flag);
+ writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
+ /* Read a reg to force flush the post write */
+ readl(iommu->reg + DMAR_FECTL_REG);
+ spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+void dmar_msi_write(int irq, struct msi_msg *msg)
+{
+ struct intel_iommu *iommu = get_irq_data(irq);
+ unsigned long flag;
+
+ spin_lock_irqsave(&iommu->register_lock, flag);
+ writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
+ writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
+ writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
+ spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+void dmar_msi_read(int irq, struct msi_msg *msg)
+{
+ struct intel_iommu *iommu = get_irq_data(irq);
+ unsigned long flag;
+
+ spin_lock_irqsave(&iommu->register_lock, flag);
+ msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
+ msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
+ msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
+ spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+static int dmar_fault_do_one(struct intel_iommu *iommu, int type,
+ u8 fault_reason, u16 source_id, unsigned long long addr)
+{
+ const char *reason;
+ int fault_type;
+
+ reason = dmar_get_fault_reason(fault_reason, &fault_type);
+
+ if (fault_type == INTR_REMAP)
+ printk(KERN_ERR "INTR-REMAP: Request device [[%02x:%02x.%d] "
+ "fault index %llx\n"
+ "INTR-REMAP:[fault reason %02d] %s\n",
+ (source_id >> 8), PCI_SLOT(source_id & 0xFF),
+ PCI_FUNC(source_id & 0xFF), addr >> 48,
+ fault_reason, reason);
+ else
+ printk(KERN_ERR
+ "DMAR:[%s] Request device [%02x:%02x.%d] "
+ "fault addr %llx \n"
+ "DMAR:[fault reason %02d] %s\n",
+ (type ? "DMA Read" : "DMA Write"),
+ (source_id >> 8), PCI_SLOT(source_id & 0xFF),
+ PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
+ return 0;
+}
+
+#define PRIMARY_FAULT_REG_LEN (16)
+irqreturn_t dmar_fault(int irq, void *dev_id)
+{
+ struct intel_iommu *iommu = dev_id;
+ int reg, fault_index;
+ u32 fault_status;
+ unsigned long flag;
+
+ spin_lock_irqsave(&iommu->register_lock, flag);
+ fault_status = readl(iommu->reg + DMAR_FSTS_REG);
+ if (fault_status)
+ printk(KERN_ERR "DRHD: handling fault status reg %x\n",
+ fault_status);
+
+ /* TBD: ignore advanced fault log currently */
+ if (!(fault_status & DMA_FSTS_PPF))
+ goto clear_rest;
+
+ fault_index = dma_fsts_fault_record_index(fault_status);
+ reg = cap_fault_reg_offset(iommu->cap);
+ while (1) {
+ u8 fault_reason;
+ u16 source_id;
+ u64 guest_addr;
+ int type;
+ u32 data;
+
+ /* highest 32 bits */
+ data = readl(iommu->reg + reg +
+ fault_index * PRIMARY_FAULT_REG_LEN + 12);
+ if (!(data & DMA_FRCD_F))
+ break;
+
+ fault_reason = dma_frcd_fault_reason(data);
+ type = dma_frcd_type(data);
+
+ data = readl(iommu->reg + reg +
+ fault_index * PRIMARY_FAULT_REG_LEN + 8);
+ source_id = dma_frcd_source_id(data);
+
+ guest_addr = dmar_readq(iommu->reg + reg +
+ fault_index * PRIMARY_FAULT_REG_LEN);
+ guest_addr = dma_frcd_page_addr(guest_addr);
+ /* clear the fault */
+ writel(DMA_FRCD_F, iommu->reg + reg +
+ fault_index * PRIMARY_FAULT_REG_LEN + 12);
+
+ spin_unlock_irqrestore(&iommu->register_lock, flag);
+
+ dmar_fault_do_one(iommu, type, fault_reason,
+ source_id, guest_addr);
+
+ fault_index++;
+ if (fault_index > cap_num_fault_regs(iommu->cap))
+ fault_index = 0;
+ spin_lock_irqsave(&iommu->register_lock, flag);
+ }
+clear_rest:
+ /* clear all the other faults */
+ fault_status = readl(iommu->reg + DMAR_FSTS_REG);
+ writel(fault_status, iommu->reg + DMAR_FSTS_REG);
+
+ spin_unlock_irqrestore(&iommu->register_lock, flag);
+ return IRQ_HANDLED;
+}
+
+int dmar_set_interrupt(struct intel_iommu *iommu)
+{
+ int irq, ret;
+
+ /*
+ * Check if the fault interrupt is already initialized.
+ */
+ if (iommu->irq)
+ return 0;
+
+ irq = create_irq();
+ if (!irq) {
+ printk(KERN_ERR "IOMMU: no free vectors\n");
+ return -EINVAL;
+ }
+
+ set_irq_data(irq, iommu);
+ iommu->irq = irq;
+
+ ret = arch_setup_dmar_msi(irq);
+ if (ret) {
+ set_irq_data(irq, NULL);
+ iommu->irq = 0;
+ destroy_irq(irq);
+ return 0;
+ }
+
+ ret = request_irq(irq, dmar_fault, 0, iommu->name, iommu);
+ if (ret)
+ printk(KERN_ERR "IOMMU: can't request irq\n");
+ return ret;
+}
+
+int __init enable_drhd_fault_handling(void)
+{
+ struct dmar_drhd_unit *drhd;
+
+ /*
+ * Enable fault control interrupt.
+ */
+ for_each_drhd_unit(drhd) {
+ int ret;
+ struct intel_iommu *iommu = drhd->iommu;
+ ret = dmar_set_interrupt(iommu);
+
+ if (ret) {
+ printk(KERN_ERR "DRHD %Lx: failed to enable fault, "
+ " interrupt, ret %d\n",
+ (unsigned long long)drhd->reg_base_addr, ret);
+ return -1;
+ }
+ }
+
+ return 0;
+}
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index f3f686581a9..ef167b8b047 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -1004,194 +1004,6 @@ static int iommu_disable_translation(struct intel_iommu *iommu)
return 0;
}
-/* iommu interrupt handling. Most stuff are MSI-like. */
-
-static const char *fault_reason_strings[] =
-{
- "Software",
- "Present bit in root entry is clear",
- "Present bit in context entry is clear",
- "Invalid context entry",
- "Access beyond MGAW",
- "PTE Write access is not set",
- "PTE Read access is not set",
- "Next page table ptr is invalid",
- "Root table address invalid",
- "Context table ptr is invalid",
- "non-zero reserved fields in RTP",
- "non-zero reserved fields in CTP",
- "non-zero reserved fields in PTE",
-};
-#define MAX_FAULT_REASON_IDX (ARRAY_SIZE(fault_reason_strings) - 1)
-
-const char *dmar_get_fault_reason(u8 fault_reason)
-{
- if (fault_reason > MAX_FAULT_REASON_IDX)
- return "Unknown";
- else
- return fault_reason_strings[fault_reason];
-}
-
-void dmar_msi_unmask(unsigned int irq)
-{
- struct intel_iommu *iommu = get_irq_data(irq);
- unsigned long flag;
-
- /* unmask it */
- spin_lock_irqsave(&iommu->register_lock, flag);
- writel(0, iommu->reg + DMAR_FECTL_REG);
- /* Read a reg to force flush the post write */
- readl(iommu->reg + DMAR_FECTL_REG);
- spin_unlock_irqrestore(&iommu->register_lock, flag);
-}
-
-void dmar_msi_mask(unsigned int irq)
-{
- unsigned long flag;
- struct intel_iommu *iommu = get_irq_data(irq);
-
- /* mask it */
- spin_lock_irqsave(&iommu->register_lock, flag);
- writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
- /* Read a reg to force flush the post write */
- readl(iommu->reg + DMAR_FECTL_REG);
- spin_unlock_irqrestore(&iommu->register_lock, flag);
-}
-
-void dmar_msi_write(int irq, struct msi_msg *msg)
-{
- struct intel_iommu *iommu = get_irq_data(irq);
- unsigned long flag;
-
- spin_lock_irqsave(&iommu->register_lock, flag);
- writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
- writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
- writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
- spin_unlock_irqrestore(&iommu->register_lock, flag);
-}
-
-void dmar_msi_read(int irq, struct msi_msg *msg)
-{
- struct intel_iommu *iommu = get_irq_data(irq);
- unsigned long flag;
-
- spin_lock_irqsave(&iommu->register_lock, flag);
- msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
- msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
- msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
- spin_unlock_irqrestore(&iommu->register_lock, flag);
-}
-
-static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
- u8 fault_reason, u16 source_id, unsigned long long addr)
-{
- const char *reason;
-
- reason = dmar_get_fault_reason(fault_reason);
-
- printk(KERN_ERR
- "DMAR:[%s] Request device [%02x:%02x.%d] "
- "fault addr %llx \n"
- "DMAR:[fault reason %02d] %s\n",
- (type ? "DMA Read" : "DMA Write"),
- (source_id >> 8), PCI_SLOT(source_id & 0xFF),
- PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
- return 0;
-}
-
-#define PRIMARY_FAULT_REG_LEN (16)
-static irqreturn_t iommu_page_fault(int irq, void *dev_id)
-{
- struct intel_iommu *iommu = dev_id;
- int reg, fault_index;
- u32 fault_status;
- unsigned long flag;
-
- spin_lock_irqsave(&iommu->register_lock, flag);
- fault_status = readl(iommu->reg + DMAR_FSTS_REG);
-
- /* TBD: ignore advanced fault log currently */
- if (!(fault_status & DMA_FSTS_PPF))
- goto clear_overflow;
-
- fault_index = dma_fsts_fault_record_index(fault_status);
- reg = cap_fault_reg_offset(iommu->cap);
- while (1) {
- u8 fault_reason;
- u16 source_id;
- u64 guest_addr;
- int type;
- u32 data;
-
- /* highest 32 bits */
- data = readl(iommu->reg + reg +
- fault_index * PRIMARY_FAULT_REG_LEN + 12);
- if (!(data & DMA_FRCD_F))
- break;
-
- fault_reason = dma_frcd_fault_reason(data);
- type = dma_frcd_type(data);
-
- data = readl(iommu->reg + reg +
- fault_index * PRIMARY_FAULT_REG_LEN + 8);
- source_id = dma_frcd_source_id(data);
-
- guest_addr = dmar_readq(iommu->reg + reg +
- fault_index * PRIMARY_FAULT_REG_LEN);
- guest_addr = dma_frcd_page_addr(guest_addr);
- /* clear the fault */
- writel(DMA_FRCD_F, iommu->reg + reg +
- fault_index * PRIMARY_FAULT_REG_LEN + 12);
-
- spin_unlock_irqrestore(&iommu->register_lock, flag);
-
- iommu_page_fault_do_one(iommu, type, fault_reason,
- source_id, guest_addr);
-
- fault_index++;
- if (fault_index > cap_num_fault_regs(iommu->cap))
- fault_index = 0;
- spin_lock_irqsave(&iommu->register_lock, flag);
- }
-clear_overflow:
- /* clear primary fault overflow */
- fault_status = readl(iommu->reg + DMAR_FSTS_REG);
- if (fault_status & DMA_FSTS_PFO)
- writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
-
- spin_unlock_irqrestore(&iommu->register_lock, flag);
- return IRQ_HANDLED;
-}
-
-int dmar_set_interrupt(struct intel_iommu *iommu)
-{
- int irq, ret;
-
- irq = create_irq();
- if (!irq) {
- printk(KERN_ERR "IOMMU: no free vectors\n");
- return -EINVAL;
- }
-
- set_irq_data(irq, iommu);
- iommu->irq = irq;
-
- ret = arch_setup_dmar_msi(irq);
- if (ret) {
- set_irq_data(irq, NULL);
- iommu->irq = 0;
- destroy_irq(irq);
- return 0;
- }
-
- /* Force fault register is cleared */
- iommu_page_fault(irq, iommu);
-
- ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
- if (ret)
- printk(KERN_ERR "IOMMU: can't request irq\n");
- return ret;
-}
static int iommu_init_domains(struct intel_iommu *iommu)
{
@@ -1987,7 +1799,7 @@ static int __init init_dmars(void)
struct dmar_rmrr_unit *rmrr;
struct pci_dev *pdev;
struct intel_iommu *iommu;
- int i, ret, unit = 0;
+ int i, ret;
/*
* for each drhd
@@ -2043,11 +1855,40 @@ static int __init init_dmars(void)
}
}
+ /*
+ * Start from the sane iommu hardware state.
+ */
+ for_each_drhd_unit(drhd) {
+ if (drhd->ignored)
+ continue;
+
+ iommu = drhd->iommu;
+
+ /*
+ * If the queued invalidation is already initialized by us
+ * (for example, while enabling interrupt-remapping) then
+ * we got the things already rolling from a sane state.
+ */
+ if (iommu->qi)
+ continue;
+
+ /*
+ * Clear any previous faults.
+ */
+ dmar_fault(-1, iommu);
+ /*
+ * Disable queued invalidation if supported and already enabled
+ * before OS handover.
+ */
+ dmar_disable_qi(iommu);
+ }
+
for_each_drhd_unit(drhd) {
if (drhd->ignored)
continue;
iommu = drhd->iommu;
+
if (dmar_enable_qi(iommu)) {
/*
* Queued Invalidate not enabled, use Register Based
@@ -2109,7 +1950,6 @@ static int __init init_dmars(void)
if (drhd->ignored)
continue;
iommu = drhd->iommu;
- sprintf (iommu->name, "dmar%d", unit++);
iommu_flush_write_buffer(iommu);
diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c
index 9d07a05d26f..b041a409f4a 100644
--- a/drivers/pci/intr_remapping.c
+++ b/drivers/pci/intr_remapping.c
@@ -117,21 +117,22 @@ int get_irte(int irq, struct irte *entry)
{
int index;
struct irq_2_iommu *irq_iommu;
+ unsigned long flags;
if (!entry)
return -1;
- spin_lock(&irq_2_ir_lock);
+ spin_lock_irqsave(&irq_2_ir_lock, flags);
irq_iommu = valid_irq_2_iommu(irq);
if (!irq_iommu) {
- spin_unlock(&irq_2_ir_lock);
+ spin_unlock_irqrestore(&irq_2_ir_lock, flags);
return -1;
}
index = irq_iommu->irte_index + irq_iommu->sub_handle;
*entry = *(irq_iommu->iommu->ir_table->base + index);
- spin_unlock(&irq_2_ir_lock);
+ spin_unlock_irqrestore(&irq_2_ir_lock, flags);
return 0;
}
@@ -141,6 +142,7 @@ int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
struct irq_2_iommu *irq_iommu;
u16 index, start_index;
unsigned int mask = 0;
+ unsigned long flags;
int i;
if (!count)
@@ -170,7 +172,7 @@ int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
return -1;
}
- spin_lock(&irq_2_ir_lock);
+ spin_lock_irqsave(&irq_2_ir_lock, flags);
do {
for (i = index; i < index + count; i++)
if (table->base[i].present)
@@ -182,7 +184,7 @@ int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
index = (index + count) % INTR_REMAP_TABLE_ENTRIES;
if (index == start_index) {
- spin_unlock(&irq_2_ir_lock);
+ spin_unlock_irqrestore(&irq_2_ir_lock, flags);
printk(KERN_ERR "can't allocate an IRTE\n");
return -1;
}
@@ -193,7 +195,7 @@ int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
irq_iommu = irq_2_iommu_alloc(irq);
if (!irq_iommu) {
- spin_unlock(&irq_2_ir_lock);
+ spin_unlock_irqrestore(&irq_2_ir_lock, flags);
printk(KERN_ERR "can't allocate irq_2_iommu\n");
return -1;
}
@@ -203,7 +205,7 @@ int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
irq_iommu->sub_handle = 0;
irq_iommu->irte_mask = mask;
- spin_unlock(&irq_2_ir_lock);
+ spin_unlock_irqrestore(&irq_2_ir_lock, flags);
return index;
}
@@ -223,30 +225,32 @@ int map_irq_to_irte_handle(int irq, u16 *sub_handle)
{
int index;
struct irq_2_iommu *irq_iommu;
+ unsigned long flags;
- spin_lock(&irq_2_ir_lock);
+ spin_lock_irqsave(&irq_2_ir_lock, flags);
irq_iommu = valid_irq_2_iommu(irq);
if (!irq_iommu) {
- spin_unlock(&irq_2_ir_lock);
+ spin_unlock_irqrestore(&irq_2_ir_lock, flags);
return -1;
}
*sub_handle = irq_iommu->sub_handle;
index = irq_iommu->irte_index;
- spin_unlock(&irq_2_ir_lock);
+ spin_unlock_irqrestore(&irq_2_ir_lock, flags);
return index;
}
int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 subhandle)
{
struct irq_2_iommu *irq_iommu;
+ unsigned long flags;
- spin_lock(&irq_2_ir_lock);
+ spin_lock_irqsave(&irq_2_ir_lock, flags);
irq_iommu = irq_2_iommu_alloc(irq);
if (!irq_iommu) {
- spin_unlock(&irq_2_ir_lock);
+ spin_unlock_irqrestore(&irq_2_ir_lock, flags);
printk(KERN_ERR "can't allocate irq_2_iommu\n");
return -1;
}
@@ -256,7 +260,7 @@ int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 subhandle)
irq_iommu->sub_handle = subhandle;
irq_iommu->irte_mask = 0;
- spin_unlock(&irq_2_ir_lock);
+ spin_unlock_irqrestore(&irq_2_ir_lock, flags);
return 0;
}
@@ -264,11 +268,12 @@ int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 subhandle)
int clear_irte_irq(int irq, struct intel_iommu *iommu, u16 index)
{
struct irq_2_iommu *irq_iommu;
+ unsigned long flags;
- spin_lock(&irq_2_ir_lock);
+ spin_lock_irqsave(&irq_2_ir_lock, flags);
irq_iommu = valid_irq_2_iommu(irq);
if (!irq_iommu) {
- spin_unlock(&irq_2_ir_lock);
+ spin_unlock_irqrestore(&irq_2_ir_lock, flags);
return -1;
}
@@ -277,7 +282,7 @@ int clear_irte_irq(int irq, struct intel_iommu *iommu, u16 index)
irq_iommu->sub_handle = 0;
irq_2_iommu(irq)->irte_mask = 0;
- spin_unlock(&irq_2_ir_lock);
+ spin_unlock_irqrestore(&irq_2_ir_lock, flags);
return 0;
}
@@ -289,11 +294,12 @@ int modify_irte(int irq, struct irte *irte_modified)
struct irte *irte;
struct intel_iommu *iommu;
struct irq_2_iommu *irq_iommu;
+ unsigned long flags;
- spin_lock(&irq_2_ir_lock);
+ spin_lock_irqsave(&irq_2_ir_lock, flags);
irq_iommu = valid_irq_2_iommu(irq);
if (!irq_iommu) {
- spin_unlock(&irq_2_ir_lock);
+ spin_unlock_irqrestore(&irq_2_ir_lock, flags);
return -1;
}
@@ -302,11 +308,11 @@ int modify_irte(int irq, struct irte *irte_modified)
index = irq_iommu->irte_index + irq_iommu->sub_handle;
irte = &iommu->ir_table->base[index];
- set_64bit((unsigned long *)irte, irte_modified->low | (1 << 1));
+ set_64bit((unsigned long *)irte, irte_modified->low);
__iommu_flush_cache(iommu, irte, sizeof(*irte));
rc = qi_flush_iec(iommu, index, 0);
- spin_unlock(&irq_2_ir_lock);
+ spin_unlock_irqrestore(&irq_2_ir_lock, flags);
return rc;
}
@@ -317,11 +323,12 @@ int flush_irte(int irq)
int index;
struct intel_iommu *iommu;
struct irq_2_iommu *irq_iommu;
+ unsigned long flags;
- spin_lock(&irq_2_ir_lock);
+ spin_lock_irqsave(&irq_2_ir_lock, flags);
irq_iommu = valid_irq_2_iommu(irq);
if (!irq_iommu) {
- spin_unlock(&irq_2_ir_lock);
+ spin_unlock_irqrestore(&irq_2_ir_lock, flags);
return -1;
}
@@ -330,7 +337,7 @@ int flush_irte(int irq)
index = irq_iommu->irte_index + irq_iommu->sub_handle;
rc = qi_flush_iec(iommu, index, irq_iommu->irte_mask);
- spin_unlock(&irq_2_ir_lock);
+ spin_unlock_irqrestore(&irq_2_ir_lock, flags);
return rc;
}
@@ -363,11 +370,12 @@ int free_irte(int irq)
struct irte *irte;
struct intel_iommu *iommu;
struct irq_2_iommu *irq_iommu;
+ unsigned long flags;
- spin_lock(&irq_2_ir_lock);
+ spin_lock_irqsave(&irq_2_ir_lock, flags);
irq_iommu = valid_irq_2_iommu(irq);
if (!irq_iommu) {
- spin_unlock(&irq_2_ir_lock);
+ spin_unlock_irqrestore(&irq_2_ir_lock, flags);
return -1;
}
@@ -378,7 +386,7 @@ int free_irte(int irq)
if (!irq_iommu->sub_handle) {
for (i = 0; i < (1 << irq_iommu->irte_mask); i++)
- set_64bit((unsigned long *)irte, 0);
+ set_64bit((unsigned long *)(irte + i), 0);
rc = qi_flush_iec(iommu, index, irq_iommu->irte_mask);
}
@@ -387,7 +395,7 @@ int free_irte(int irq)
irq_iommu->sub_handle = 0;
irq_iommu->irte_mask = 0;
- spin_unlock(&irq_2_ir_lock);
+ spin_unlock_irqrestore(&irq_2_ir_lock, flags);
return rc;
}
@@ -439,12 +447,12 @@ static int setup_intr_remapping(struct intel_iommu *iommu, int mode)
struct page *pages;
ir_table = iommu->ir_table = kzalloc(sizeof(struct ir_table),
- GFP_KERNEL);
+ GFP_ATOMIC);
if (!iommu->ir_table)
return -ENOMEM;
- pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, INTR_REMAP_PAGE_ORDER);
+ pages = alloc_pages(GFP_ATOMIC | __GFP_ZERO, INTR_REMAP_PAGE_ORDER);
if (!pages) {
printk(KERN_ERR "failed to allocate pages of order %d\n",
@@ -459,11 +467,55 @@ static int setup_intr_remapping(struct intel_iommu *iommu, int mode)
return 0;
}
+/*
+ * Disable Interrupt Remapping.
+ */
+static void disable_intr_remapping(struct intel_iommu *iommu)
+{
+ unsigned long flags;
+ u32 sts;
+
+ if (!ecap_ir_support(iommu->ecap))
+ return;
+
+ spin_lock_irqsave(&iommu->register_lock, flags);
+
+ sts = dmar_readq(iommu->reg + DMAR_GSTS_REG);
+ if (!(sts & DMA_GSTS_IRES))
+ goto end;
+
+ iommu->gcmd &= ~DMA_GCMD_IRE;
+ writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
+
+ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+ readl, !(sts & DMA_GSTS_IRES), sts);
+
+end:
+ spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
int __init enable_intr_remapping(int eim)
{
struct dmar_drhd_unit *drhd;
int setup = 0;
+ for_each_drhd_unit(drhd) {
+ struct intel_iommu *iommu = drhd->iommu;
+
+ /*
+ * Clear previous faults.
+ */
+ dmar_fault(-1, iommu);
+
+ /*
+ * Disable intr remapping and queued invalidation, if already
+ * enabled prior to OS handover.
+ */
+ disable_intr_remapping(iommu);
+
+ dmar_disable_qi(iommu);
+ }
+
/*
* check for the Interrupt-remapping support
*/
diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index f28440784cf..2f342746895 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -24,10 +24,10 @@
#include <linux/acpi.h>
#include <linux/types.h>
#include <linux/msi.h>
+#include <linux/irqreturn.h>
-#if defined(CONFIG_DMAR) || defined(CONFIG_INTR_REMAP)
struct intel_iommu;
-
+#if defined(CONFIG_DMAR) || defined(CONFIG_INTR_REMAP)
struct dmar_drhd_unit {
struct list_head list; /* list of drhd units */
struct acpi_dmar_header *hdr; /* ACPI header */
@@ -49,7 +49,7 @@ extern int dmar_dev_scope_init(void);
/* Intel IOMMU detection */
extern void detect_intel_iommu(void);
-
+extern int enable_drhd_fault_handling(void);
extern int parse_ioapics_under_ir(void);
extern int alloc_iommu(struct dmar_drhd_unit *);
@@ -63,12 +63,12 @@ static inline int dmar_table_init(void)
{
return -ENODEV;
}
+static inline int enable_drhd_fault_handling(void)
+{
+ return -1;
+}
#endif /* !CONFIG_DMAR && !CONFIG_INTR_REMAP */
-#ifdef CONFIG_INTR_REMAP
-extern int intr_remapping_enabled;
-extern int enable_intr_remapping(int);
-
struct irte {
union {
struct {
@@ -97,6 +97,10 @@ struct irte {
__u64 high;
};
};
+#ifdef CONFIG_INTR_REMAP
+extern int intr_remapping_enabled;
+extern int enable_intr_remapping(int);
+
extern int get_irte(int irq, struct irte *entry);
extern int modify_irte(int irq, struct irte *irte_modified);
extern int alloc_irte(struct intel_iommu *iommu, int irq, u16 count);
@@ -111,14 +115,40 @@ extern int irq_remapped(int irq);
extern struct intel_iommu *map_dev_to_ir(struct pci_dev *dev);
extern struct intel_iommu *map_ioapic_to_ir(int apic);
#else
+static inline int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
+{
+ return -1;
+}
+static inline int modify_irte(int irq, struct irte *irte_modified)
+{
+ return -1;
+}
+static inline int free_irte(int irq)
+{
+ return -1;
+}
+static inline int map_irq_to_irte_handle(int irq, u16 *sub_handle)
+{
+ return -1;
+}
+static inline int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index,
+ u16 sub_handle)
+{
+ return -1;
+}
+static inline struct intel_iommu *map_dev_to_ir(struct pci_dev *dev)
+{
+ return NULL;
+}
+static inline struct intel_iommu *map_ioapic_to_ir(int apic)
+{
+ return NULL;
+}
#define irq_remapped(irq) (0)
#define enable_intr_remapping(mode) (-1)
#define intr_remapping_enabled (0)
#endif
-#ifdef CONFIG_DMAR
-extern const char *dmar_get_fault_reason(u8 fault_reason);
-
/* Can't use the common MSI interrupt functions
* since DMAR is not a pci device
*/
@@ -127,8 +157,10 @@ extern void dmar_msi_mask(unsigned int irq);
extern void dmar_msi_read(int irq, struct msi_msg *msg);
extern void dmar_msi_write(int irq, struct msi_msg *msg);
extern int dmar_set_interrupt(struct intel_iommu *iommu);
+extern irqreturn_t dmar_fault(int irq, void *dev_id);
extern int arch_setup_dmar_msi(unsigned int irq);
+#ifdef CONFIG_DMAR
extern int iommu_detected, no_iommu;
extern struct list_head dmar_rmrr_units;
struct dmar_rmrr_unit {
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index d2e3cbfba14..78c1262e870 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -292,6 +292,8 @@ struct intel_iommu {
spinlock_t register_lock; /* protect register handling */
int seq_id; /* sequence id of the iommu */
int agaw; /* agaw of this iommu */
+ unsigned int irq;
+ unsigned char name[13]; /* Device Name */
#ifdef CONFIG_DMAR
unsigned long *domain_ids; /* bitmap of domains */
@@ -299,8 +301,6 @@ struct intel_iommu {
spinlock_t lock; /* protect context, domain ids */
struct root_entry *root_entry; /* virtual address */
- unsigned int irq;
- unsigned char name[7]; /* Device Name */
struct iommu_flush flush;
#endif
struct q_inval *qi; /* Queued invalidation info */
@@ -321,6 +321,7 @@ extern struct dmar_drhd_unit * dmar_find_matched_drhd_unit(struct pci_dev *dev);
extern int alloc_iommu(struct dmar_drhd_unit *drhd);
extern void free_iommu(struct intel_iommu *iommu);
extern int dmar_enable_qi(struct intel_iommu *iommu);
+extern void dmar_disable_qi(struct intel_iommu *iommu);
extern void qi_global_iec(struct intel_iommu *iommu);
extern int qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 065cdf8c09f..b1ea37fc7a2 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -104,6 +104,7 @@ extern unsigned int kobjsize(const void *objp);
#define VM_CAN_NONLINEAR 0x08000000 /* Has ->fault & does nonlinear pages */
#define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */
#define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */
+#define VM_PFN_AT_MMAP 0x40000000 /* PFNMAP vma that is fully mapped at mmap time */
#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */
#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
@@ -145,7 +146,7 @@ extern pgprot_t protection_map[16];
*/
static inline int is_linear_pfn_mapping(struct vm_area_struct *vma)
{
- return ((vma->vm_flags & VM_PFNMAP) && vma->vm_pgoff);
+ return (vma->vm_flags & VM_PFN_AT_MMAP);
}
static inline int is_pfn_mapping(struct vm_area_struct *vma)
diff --git a/mm/memory.c b/mm/memory.c
index baa999e87cd..2032ad2fc34 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1665,9 +1665,10 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
* behaviour that some programs depend on. We mark the "original"
* un-COW'ed pages by matching them up with "vma->vm_pgoff".
*/
- if (addr == vma->vm_start && end == vma->vm_end)
+ if (addr == vma->vm_start && end == vma->vm_end) {
vma->vm_pgoff = pfn;
- else if (is_cow_mapping(vma->vm_flags))
+ vma->vm_flags |= VM_PFN_AT_MMAP;
+ } else if (is_cow_mapping(vma->vm_flags))
return -EINVAL;
vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
@@ -1679,6 +1680,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
* needed from higher level routine calling unmap_vmas
*/
vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP);
+ vma->vm_flags &= ~VM_PFN_AT_MMAP;
return -EINVAL;
}