aboutsummaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig76
-rw-r--r--arch/x86/boot/compressed/head_32.S5
-rw-r--r--arch/x86/boot/compressed/misc.c10
-rw-r--r--arch/x86/boot/compressed/relocs.c2
-rw-r--r--arch/x86/boot/header.S1
-rw-r--r--arch/x86/configs/i386_defconfig19
-rw-r--r--arch/x86/configs/x86_64_defconfig29
-rw-r--r--arch/x86/ia32/ia32_aout.c11
-rw-r--r--arch/x86/ia32/ia32_signal.c21
-rw-r--r--arch/x86/ia32/sys_ia32.c9
-rw-r--r--arch/x86/kernel/acpi/boot.c25
-rw-r--r--arch/x86/kernel/alternative.c8
-rw-r--r--arch/x86/kernel/amd_iommu.c350
-rw-r--r--arch/x86/kernel/amd_iommu_init.c194
-rw-r--r--arch/x86/kernel/aperture_64.c6
-rw-r--r--arch/x86/kernel/apic_32.c332
-rw-r--r--arch/x86/kernel/apic_64.c389
-rw-r--r--arch/x86/kernel/apm_32.c4
-rw-r--r--arch/x86/kernel/bios_uv.c10
-rw-r--r--arch/x86/kernel/cpu/common.c28
-rw-r--r--arch/x86/kernel/cpu/cpufreq/p4-clockmod.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c7
-rw-r--r--arch/x86/kernel/cpu/mtrr/if.c4
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c274
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c86
-rw-r--r--arch/x86/kernel/cpuid.c1
-rw-r--r--arch/x86/kernel/crash_dump_64.c13
-rw-r--r--arch/x86/kernel/ds.c4
-rw-r--r--arch/x86/kernel/early-quirks.c18
-rw-r--r--arch/x86/kernel/efi.c6
-rw-r--r--arch/x86/kernel/head64.c5
-rw-r--r--arch/x86/kernel/head_32.S34
-rw-r--r--arch/x86/kernel/head_64.S4
-rw-r--r--arch/x86/kernel/irq_32.c2
-rw-r--r--arch/x86/kernel/irq_64.c2
-rw-r--r--arch/x86/kernel/k8.c5
-rw-r--r--arch/x86/kernel/kdebugfs.c1
-rw-r--r--arch/x86/kernel/kgdb.c50
-rw-r--r--arch/x86/kernel/kvm.c2
-rw-r--r--arch/x86/kernel/nmi.c11
-rw-r--r--arch/x86/kernel/olpc.c6
-rw-r--r--arch/x86/kernel/paravirt_patch_32.c2
-rw-r--r--arch/x86/kernel/pci-calgary_64.c18
-rw-r--r--arch/x86/kernel/pci-dma.c179
-rw-r--r--arch/x86/kernel/pci-gart_64.c162
-rw-r--r--arch/x86/kernel/pci-nommu.c10
-rw-r--r--arch/x86/kernel/pcspeaker.c13
-rw-r--r--arch/x86/kernel/process.c20
-rw-r--r--arch/x86/kernel/process_32.c12
-rw-r--r--arch/x86/kernel/process_64.c133
-rw-r--r--arch/x86/kernel/ptrace.c40
-rw-r--r--arch/x86/kernel/reboot.c6
-rw-r--r--arch/x86/kernel/setup.c21
-rw-r--r--arch/x86/kernel/sigframe.h5
-rw-r--r--arch/x86/kernel/signal_32.c11
-rw-r--r--arch/x86/kernel/signal_64.c105
-rw-r--r--arch/x86/kernel/smpboot.c5
-rw-r--r--arch/x86/kernel/sys_x86_64.c43
-rw-r--r--arch/x86/kernel/traps_64.c61
-rw-r--r--arch/x86/kernel/tsc.c290
-rw-r--r--arch/x86/kernel/visws_quirks.c16
-rw-r--r--arch/x86/kernel/vmi_32.c12
-rw-r--r--arch/x86/kernel/vsmp_64.c2
-rw-r--r--arch/x86/kvm/mmu.c4
-rw-r--r--arch/x86/kvm/svm.c12
-rw-r--r--arch/x86/kvm/vmx.c3
-rw-r--r--arch/x86/kvm/vmx.h2
-rw-r--r--arch/x86/lib/msr-on-cpu.c78
-rw-r--r--arch/x86/lib/string_32.c42
-rw-r--r--arch/x86/lib/strstr_32.c6
-rw-r--r--arch/x86/mm/discontig_32.c2
-rw-r--r--arch/x86/mm/dump_pagetables.c4
-rw-r--r--arch/x86/mm/init_32.c88
-rw-r--r--arch/x86/mm/init_64.c118
-rw-r--r--arch/x86/mm/ioremap.c19
-rw-r--r--arch/x86/mm/numa_64.c10
-rw-r--r--arch/x86/mm/pageattr-test.c9
-rw-r--r--arch/x86/mm/pageattr.c463
-rw-r--r--arch/x86/mm/pat.c132
-rw-r--r--arch/x86/mm/pgtable.c6
-rw-r--r--arch/x86/mm/pgtable_32.c3
-rw-r--r--arch/x86/oprofile/nmi_int.c4
-rw-r--r--arch/x86/oprofile/op_model_p4.c175
-rw-r--r--arch/x86/pci/amd_bus.c2
-rw-r--r--arch/x86/pci/irq.c67
-rw-r--r--arch/x86/power/hibernate_asm_32.S14
-rw-r--r--arch/x86/xen/enlighten.c20
-rw-r--r--arch/x86/xen/setup.c2
89 files changed, 2905 insertions, 1614 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 21ef9dd3618..44d4f2130d0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -29,6 +29,7 @@ config X86
select HAVE_FTRACE
select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
select HAVE_ARCH_KGDB if !X86_VOYAGER
+ select HAVE_ARCH_TRACEHOOK
select HAVE_GENERIC_DMA_COHERENT if X86_32
select HAVE_EFFICIENT_UNALIGNED_ACCESS
@@ -553,6 +554,7 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
config AMD_IOMMU
bool "AMD IOMMU support"
select SWIOTLB
+ select PCI_MSI
depends on X86_64 && PCI && ACPI
help
With this option you can enable support for AMD IOMMU hardware in
@@ -1020,7 +1022,7 @@ config HAVE_ARCH_ALLOC_REMAP
config ARCH_FLATMEM_ENABLE
def_bool y
- depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && X86_PC && !NUMA
+ depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && !NUMA
config ARCH_DISCONTIGMEM_ENABLE
def_bool y
@@ -1036,7 +1038,7 @@ config ARCH_SPARSEMEM_DEFAULT
config ARCH_SPARSEMEM_ENABLE
def_bool y
- depends on X86_64 || NUMA || (EXPERIMENTAL && X86_PC)
+ depends on X86_64 || NUMA || (EXPERIMENTAL && X86_PC) || X86_GENERICARCH
select SPARSEMEM_STATIC if X86_32
select SPARSEMEM_VMEMMAP_ENABLE if X86_64
@@ -1117,10 +1119,10 @@ config MTRR
You can safely say Y even if your machine doesn't have MTRRs, you'll
just add about 9 KB to your kernel.
- See <file:Documentation/mtrr.txt> for more information.
+ See <file:Documentation/x86/mtrr.txt> for more information.
config MTRR_SANITIZER
- bool
+ def_bool y
prompt "MTRR cleanup support"
depends on MTRR
help
@@ -1131,7 +1133,7 @@ config MTRR_SANITIZER
The largest mtrr entry size for a continous block can be set with
mtrr_chunk_size.
- If unsure, say N.
+ If unsure, say Y.
config MTRR_SANITIZER_ENABLE_DEFAULT
int "MTRR cleanup enable value (0-1)"
@@ -1191,7 +1193,6 @@ config IRQBALANCE
config SECCOMP
def_bool y
prompt "Enable seccomp to safely compute untrusted bytecode"
- depends on PROC_FS
help
This kernel feature is useful for number crunching applications
that may need to compute untrusted bytecode during their
@@ -1199,7 +1200,7 @@ config SECCOMP
the process as file descriptors supporting the read/write
syscalls, it's possible to isolate those applications in
their own address space using seccomp. Once seccomp is
- enabled via /proc/<pid>/seccomp, it cannot be disabled
+ enabled via prctl(PR_SET_SECCOMP), it cannot be disabled
and the task is only allowed to execute a few safe syscalls
defined by each seccomp mode.
@@ -1356,14 +1357,14 @@ config PHYSICAL_ALIGN
Don't change this unless you know what you are doing.
config HOTPLUG_CPU
- bool "Support for suspend on SMP and hot-pluggable CPUs (EXPERIMENTAL)"
- depends on SMP && HOTPLUG && EXPERIMENTAL && !X86_VOYAGER
+ bool "Support for hot-pluggable CPUs"
+ depends on SMP && HOTPLUG && !X86_VOYAGER
---help---
- Say Y here to experiment with turning CPUs off and on, and to
- enable suspend on SMP systems. CPUs can be controlled through
- /sys/devices/system/cpu.
- Say N if you want to disable CPU hotplug and don't need to
- suspend.
+ Say Y here to allow turning CPUs off and on. CPUs can be
+ controlled through /sys/devices/system/cpu.
+ ( Note: power management support will enable this option
+ automatically on SMP systems. )
+ Say N if you want to disable CPU hotplug.
config COMPAT_VDSO
def_bool y
@@ -1378,6 +1379,51 @@ config COMPAT_VDSO
If unsure, say Y.
+config CMDLINE_BOOL
+ bool "Built-in kernel command line"
+ default n
+ help
+ Allow for specifying boot arguments to the kernel at
+ build time. On some systems (e.g. embedded ones), it is
+ necessary or convenient to provide some or all of the
+ kernel boot arguments with the kernel itself (that is,
+ to not rely on the boot loader to provide them.)
+
+ To compile command line arguments into the kernel,
+ set this option to 'Y', then fill in the
+ the boot arguments in CONFIG_CMDLINE.
+
+ Systems with fully functional boot loaders (i.e. non-embedded)
+ should leave this option set to 'N'.
+
+config CMDLINE
+ string "Built-in kernel command string"
+ depends on CMDLINE_BOOL
+ default ""
+ help
+ Enter arguments here that should be compiled into the kernel
+ image and used at boot time. If the boot loader provides a
+ command line at boot time, it is appended to this string to
+ form the full kernel command line, when the system boots.
+
+ However, you can use the CONFIG_CMDLINE_OVERRIDE option to
+ change this behavior.
+
+ In most cases, the command line (whether built-in or provided
+ by the boot loader) should specify the device for the root
+ file system.
+
+config CMDLINE_OVERRIDE
+ bool "Built-in command line overrides boot loader arguments"
+ default n
+ depends on CMDLINE_BOOL
+ help
+ Set this option to 'Y' to have the kernel ignore the boot loader
+ command line, and use ONLY the built-in command line.
+
+ This is used to work around broken boot loaders. This should
+ be set to 'N' under normal conditions.
+
endmenu
config ARCH_ENABLE_MEMORY_HOTPLUG
@@ -1781,7 +1827,7 @@ config COMPAT_FOR_U64_ALIGNMENT
config SYSVIPC_COMPAT
def_bool y
- depends on X86_64 && COMPAT && SYSVIPC
+ depends on COMPAT && SYSVIPC
endmenu
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index ba7736cf2ec..29c5fbf0839 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -137,14 +137,15 @@ relocated:
*/
movl output_len(%ebx), %eax
pushl %eax
+ # push arguments for decompress_kernel:
pushl %ebp # output address
movl input_len(%ebx), %eax
pushl %eax # input_len
leal input_data(%ebx), %eax
pushl %eax # input_data
leal boot_heap(%ebx), %eax
- pushl %eax # heap area as third argument
- pushl %esi # real mode pointer as second arg
+ pushl %eax # heap area
+ pushl %esi # real mode pointer
call decompress_kernel
addl $20, %esp
popl %ecx
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index aaf5a2131ef..5780d361105 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -27,7 +27,7 @@
#include <linux/linkage.h>
#include <linux/screen_info.h>
#include <linux/elf.h>
-#include <asm/io.h>
+#include <linux/io.h>
#include <asm/page.h>
#include <asm/boot.h>
#include <asm/bootparam.h>
@@ -251,7 +251,7 @@ static void __putstr(int error, const char *s)
y--;
}
} else {
- vidmem [(x + cols * y) * 2] = c;
+ vidmem[(x + cols * y) * 2] = c;
if (++x >= cols) {
x = 0;
if (++y >= lines) {
@@ -277,7 +277,8 @@ static void *memset(void *s, int c, unsigned n)
int i;
char *ss = s;
- for (i = 0; i < n; i++) ss[i] = c;
+ for (i = 0; i < n; i++)
+ ss[i] = c;
return s;
}
@@ -287,7 +288,8 @@ static void *memcpy(void *dest, const void *src, unsigned n)
const char *s = src;
char *d = dest;
- for (i = 0; i < n; i++) d[i] = s[i];
+ for (i = 0; i < n; i++)
+ d[i] = s[i];
return dest;
}
diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c
index a1310c52fc0..857e492c571 100644
--- a/arch/x86/boot/compressed/relocs.c
+++ b/arch/x86/boot/compressed/relocs.c
@@ -492,7 +492,7 @@ static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym))
continue;
}
sh_symtab = sec_symtab->symtab;
- sym_strtab = sec->link->strtab;
+ sym_strtab = sec_symtab->link->strtab;
for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) {
Elf32_Rel *rel;
Elf32_Sym *sym;
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index af86e431acf..b993062e9a5 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -30,7 +30,6 @@ SYSSEG = DEF_SYSSEG /* system loaded at 0x10000 (65536) */
SYSSIZE = DEF_SYSSIZE /* system size: # of 16-byte clicks */
/* to be loaded */
ROOT_DEV = 0 /* ROOT_DEV is now written by "build" */
-SWAP_DEV = 0 /* SWAP_DEV is now written by "build" */
#ifndef SVGA_MODE
#define SVGA_MODE ASK_VGA
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 104275e191a..ef9a52005ec 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -1,7 +1,7 @@
#
# Automatically generated make config: don't edit
-# Linux kernel version: 2.6.27-rc4
-# Mon Aug 25 15:04:00 2008
+# Linux kernel version: 2.6.27-rc5
+# Wed Sep 3 17:23:09 2008
#
# CONFIG_64BIT is not set
CONFIG_X86_32=y
@@ -202,7 +202,7 @@ CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y
# CONFIG_M586 is not set
# CONFIG_M586TSC is not set
# CONFIG_M586MMX is not set
-# CONFIG_M686 is not set
+CONFIG_M686=y
# CONFIG_MPENTIUMII is not set
# CONFIG_MPENTIUMIII is not set
# CONFIG_MPENTIUMM is not set
@@ -221,13 +221,14 @@ CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y
# CONFIG_MVIAC3_2 is not set
# CONFIG_MVIAC7 is not set
# CONFIG_MPSC is not set
-CONFIG_MCORE2=y
+# CONFIG_MCORE2 is not set
# CONFIG_GENERIC_CPU is not set
CONFIG_X86_GENERIC=y
CONFIG_X86_CPU=y
CONFIG_X86_CMPXCHG=y
CONFIG_X86_L1_CACHE_SHIFT=7
CONFIG_X86_XADD=y
+# CONFIG_X86_PPRO_FENCE is not set
CONFIG_X86_WP_WORKS_OK=y
CONFIG_X86_INVLPG=y
CONFIG_X86_BSWAP=y
@@ -235,14 +236,15 @@ CONFIG_X86_POPAD_OK=y
CONFIG_X86_INTEL_USERCOPY=y
CONFIG_X86_USE_PPRO_CHECKSUM=y
CONFIG_X86_TSC=y
+CONFIG_X86_CMOV=y
CONFIG_X86_MINIMUM_CPU_FAMILY=4
CONFIG_X86_DEBUGCTLMSR=y
CONFIG_HPET_TIMER=y
CONFIG_HPET_EMULATE_RTC=y
CONFIG_DMI=y
# CONFIG_IOMMU_HELPER is not set
-CONFIG_NR_CPUS=4
-# CONFIG_SCHED_SMT is not set
+CONFIG_NR_CPUS=64
+CONFIG_SCHED_SMT=y
CONFIG_SCHED_MC=y
# CONFIG_PREEMPT_NONE is not set
CONFIG_PREEMPT_VOLUNTARY=y
@@ -254,7 +256,8 @@ CONFIG_VM86=y
# CONFIG_TOSHIBA is not set
# CONFIG_I8K is not set
CONFIG_X86_REBOOTFIXUPS=y
-# CONFIG_MICROCODE is not set
+CONFIG_MICROCODE=y
+CONFIG_MICROCODE_OLD_INTERFACE=y
CONFIG_X86_MSR=y
CONFIG_X86_CPUID=y
# CONFIG_NOHIGHMEM is not set
@@ -2115,7 +2118,7 @@ CONFIG_IO_DELAY_0X80=y
CONFIG_DEFAULT_IO_DELAY_TYPE=0
CONFIG_DEBUG_BOOT_PARAMS=y
# CONFIG_CPA_DEBUG is not set
-# CONFIG_OPTIMIZE_INLINING is not set
+CONFIG_OPTIMIZE_INLINING=y
#
# Security options
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 678c8acefe0..e620ea6e2a7 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -1,7 +1,7 @@
#
# Automatically generated make config: don't edit
-# Linux kernel version: 2.6.27-rc4
-# Mon Aug 25 14:40:46 2008
+# Linux kernel version: 2.6.27-rc5
+# Wed Sep 3 17:13:39 2008
#
CONFIG_64BIT=y
# CONFIG_X86_32 is not set
@@ -218,17 +218,14 @@ CONFIG_X86_PC=y
# CONFIG_MVIAC3_2 is not set
# CONFIG_MVIAC7 is not set
# CONFIG_MPSC is not set
-CONFIG_MCORE2=y
-# CONFIG_GENERIC_CPU is not set
+# CONFIG_MCORE2 is not set
+CONFIG_GENERIC_CPU=y
CONFIG_X86_CPU=y
-CONFIG_X86_L1_CACHE_BYTES=64
-CONFIG_X86_INTERNODE_CACHE_BYTES=64
+CONFIG_X86_L1_CACHE_BYTES=128
+CONFIG_X86_INTERNODE_CACHE_BYTES=128
CONFIG_X86_CMPXCHG=y
-CONFIG_X86_L1_CACHE_SHIFT=6
+CONFIG_X86_L1_CACHE_SHIFT=7
CONFIG_X86_WP_WORKS_OK=y
-CONFIG_X86_INTEL_USERCOPY=y
-CONFIG_X86_USE_PPRO_CHECKSUM=y
-CONFIG_X86_P6_NOP=y
CONFIG_X86_TSC=y
CONFIG_X86_CMPXCHG64=y
CONFIG_X86_CMOV=y
@@ -243,9 +240,8 @@ CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT=y
CONFIG_AMD_IOMMU=y
CONFIG_SWIOTLB=y
CONFIG_IOMMU_HELPER=y
-# CONFIG_MAXSMP is not set
-CONFIG_NR_CPUS=4
-# CONFIG_SCHED_SMT is not set
+CONFIG_NR_CPUS=64
+CONFIG_SCHED_SMT=y
CONFIG_SCHED_MC=y
# CONFIG_PREEMPT_NONE is not set
CONFIG_PREEMPT_VOLUNTARY=y
@@ -254,7 +250,8 @@ CONFIG_X86_LOCAL_APIC=y
CONFIG_X86_IO_APIC=y
# CONFIG_X86_MCE is not set
# CONFIG_I8K is not set
-# CONFIG_MICROCODE is not set
+CONFIG_MICROCODE=y
+CONFIG_MICROCODE_OLD_INTERFACE=y
CONFIG_X86_MSR=y
CONFIG_X86_CPUID=y
CONFIG_NUMA=y
@@ -290,7 +287,7 @@ CONFIG_BOUNCE=y
CONFIG_VIRT_TO_BUS=y
CONFIG_MTRR=y
# CONFIG_MTRR_SANITIZER is not set
-# CONFIG_X86_PAT is not set
+CONFIG_X86_PAT=y
CONFIG_EFI=y
CONFIG_SECCOMP=y
# CONFIG_HZ_100 is not set
@@ -2089,7 +2086,7 @@ CONFIG_IO_DELAY_0X80=y
CONFIG_DEFAULT_IO_DELAY_TYPE=0
CONFIG_DEBUG_BOOT_PARAMS=y
# CONFIG_CPA_DEBUG is not set
-# CONFIG_OPTIMIZE_INLINING is not set
+CONFIG_OPTIMIZE_INLINING=y
#
# Security options
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index a0e1dbe67dc..127ec3f0721 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -85,8 +85,10 @@ static void dump_thread32(struct pt_regs *regs, struct user32 *dump)
dump->regs.ax = regs->ax;
dump->regs.ds = current->thread.ds;
dump->regs.es = current->thread.es;
- asm("movl %%fs,%0" : "=r" (fs)); dump->regs.fs = fs;
- asm("movl %%gs,%0" : "=r" (gs)); dump->regs.gs = gs;
+ savesegment(fs, fs);
+ dump->regs.fs = fs;
+ savesegment(gs, gs);
+ dump->regs.gs = gs;
dump->regs.orig_ax = regs->orig_ax;
dump->regs.ip = regs->ip;
dump->regs.cs = regs->cs;
@@ -430,8 +432,9 @@ beyond_if:
current->mm->start_stack =
(unsigned long)create_aout_tables((char __user *)bprm->p, bprm);
/* start thread */
- asm volatile("movl %0,%%fs" :: "r" (0)); \
- asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS));
+ loadsegment(fs, 0);
+ loadsegment(ds, __USER32_DS);
+ loadsegment(es, __USER32_DS);
load_gs_index(0);
(regs)->ip = ex.a_entry;
(regs)->sp = current->mm->start_stack;
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index f25a1012400..8d64c1bc847 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -207,7 +207,7 @@ struct rt_sigframe
{ unsigned int cur; \
unsigned short pre; \
err |= __get_user(pre, &sc->seg); \
- asm volatile("movl %%" #seg ",%0" : "=r" (cur)); \
+ savesegment(seg, cur); \
pre |= mask; \
if (pre != cur) loadsegment(seg, pre); }
@@ -236,7 +236,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
*/
err |= __get_user(gs, &sc->gs);
gs |= 3;
- asm("movl %%gs,%0" : "=r" (oldgs));
+ savesegment(gs, oldgs);
if (gs != oldgs)
load_gs_index(gs);
@@ -342,14 +342,13 @@ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc,
{
int tmp, err = 0;
- tmp = 0;
- __asm__("movl %%gs,%0" : "=r"(tmp): "0"(tmp));
+ savesegment(gs, tmp);
err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
- __asm__("movl %%fs,%0" : "=r"(tmp): "0"(tmp));
+ savesegment(fs, tmp);
err |= __put_user(tmp, (unsigned int __user *)&sc->fs);
- __asm__("movl %%ds,%0" : "=r"(tmp): "0"(tmp));
+ savesegment(ds, tmp);
err |= __put_user(tmp, (unsigned int __user *)&sc->ds);
- __asm__("movl %%es,%0" : "=r"(tmp): "0"(tmp));
+ savesegment(es, tmp);
err |= __put_user(tmp, (unsigned int __user *)&sc->es);
err |= __put_user((u32)regs->di, &sc->di);
@@ -491,8 +490,8 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
regs->dx = 0;
regs->cx = 0;
- asm volatile("movl %0,%%ds" :: "r" (__USER32_DS));
- asm volatile("movl %0,%%es" :: "r" (__USER32_DS));
+ loadsegment(ds, __USER32_DS);
+ loadsegment(es, __USER32_DS);
regs->cs = __USER32_CS;
regs->ss = __USER32_DS;
@@ -588,8 +587,8 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
regs->dx = (unsigned long) &frame->info;
regs->cx = (unsigned long) &frame->uc;
- asm volatile("movl %0,%%ds" :: "r" (__USER32_DS));
- asm volatile("movl %0,%%es" :: "r" (__USER32_DS));
+ loadsegment(ds, __USER32_DS);
+ loadsegment(es, __USER32_DS);
regs->cs = __USER32_CS;
regs->ss = __USER32_DS;
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index d3c64088b98..beda4232ce6 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -556,15 +556,6 @@ asmlinkage long sys32_rt_sigqueueinfo(int pid, int sig,
return ret;
}
-/* These are here just in case some old ia32 binary calls it. */
-asmlinkage long sys32_pause(void)
-{
- current->state = TASK_INTERRUPTIBLE;
- schedule();
- return -ERESTARTNOHAND;
-}
-
-
#ifdef CONFIG_SYSCTL_SYSCALL
struct sysctl_ia32 {
unsigned int name;
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 27ef365e757..c2ac1b4515a 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -58,7 +58,6 @@ EXPORT_SYMBOL(acpi_disabled);
#ifdef CONFIG_X86_64
#include <asm/proto.h>
-#include <asm/genapic.h>
#else /* X86 */
@@ -97,8 +96,6 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
#warning ACPI uses CMPXCHG, i486 and later hardware
#endif
-static int acpi_mcfg_64bit_base_addr __initdata = FALSE;
-
/* --------------------------------------------------------------------------
Boot-time Configuration
-------------------------------------------------------------------------- */
@@ -160,6 +157,8 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size)
struct acpi_mcfg_allocation *pci_mmcfg_config;
int pci_mmcfg_config_num;
+static int acpi_mcfg_64bit_base_addr __initdata = FALSE;
+
static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg)
{
if (!strcmp(mcfg->header.oem_id, "SGI"))
@@ -253,10 +252,8 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled)
return;
}
-#ifdef CONFIG_X86_32
if (boot_cpu_physical_apicid != -1U)
ver = apic_version[boot_cpu_physical_apicid];
-#endif
generic_processor_info(id, ver);
}
@@ -776,10 +773,8 @@ static void __init acpi_register_lapic_address(unsigned long address)
set_fixmap_nocache(FIX_APIC_BASE, address);
if (boot_cpu_physical_apicid == -1U) {
boot_cpu_physical_apicid = read_apic_id();
-#ifdef CONFIG_X86_32
apic_version[boot_cpu_physical_apicid] =
GET_APIC_VERSION(apic_read(APIC_LVR));
-#endif
}
}
@@ -1607,6 +1602,14 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
*/
{
.callback = dmi_ignore_irq0_timer_override,
+ .ident = "HP nx6115 laptop",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6115"),
+ },
+ },
+ {
+ .callback = dmi_ignore_irq0_timer_override,
.ident = "HP NX6125 laptop",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
@@ -1621,6 +1624,14 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6325"),
},
},
+ {
+ .callback = dmi_ignore_irq0_timer_override,
+ .ident = "HP 6715b laptop",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"),
+ },
+ },
{}
};
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 65a0c1b4869..fb04e49776b 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -231,25 +231,25 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
continue;
if (*ptr > text_end)
continue;
- text_poke(*ptr, ((unsigned char []){0xf0}), 1); /* add lock prefix */
+ /* turn DS segment override prefix into lock prefix */
+ text_poke(*ptr, ((unsigned char []){0xf0}), 1);
};
}
static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
{
u8 **ptr;
- char insn[1];
if (noreplace_smp)
return;
- add_nops(insn, 1);
for (ptr = start; ptr < end; ptr++) {
if (*ptr < text)
continue;
if (*ptr > text_end)
continue;
- text_poke(*ptr, insn, 1);
+ /* turn lock prefix into DS segment override prefix */
+ text_poke(*ptr, ((unsigned char []){0x3E}), 1);
};
}
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 69b4d060b21..34e4d112b1e 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -33,6 +33,10 @@
static DEFINE_RWLOCK(amd_iommu_devtable_lock);
+/* A list of preallocated protection domains */
+static LIST_HEAD(iommu_pd_list);
+static DEFINE_SPINLOCK(iommu_pd_list_lock);
+
/*
* general struct to manage commands send to an IOMMU
*/
@@ -51,6 +55,102 @@ static int iommu_has_npcache(struct amd_iommu *iommu)
/****************************************************************************
*
+ * Interrupt handling functions
+ *
+ ****************************************************************************/
+
+static void iommu_print_event(void *__evt)
+{
+ u32 *event = __evt;
+ int type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK;
+ int devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
+ int domid = (event[1] >> EVENT_DOMID_SHIFT) & EVENT_DOMID_MASK;
+ int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
+ u64 address = (u64)(((u64)event[3]) << 32) | event[2];
+
+ printk(KERN_ERR "AMD IOMMU: Event logged [");
+
+ switch (type) {
+ case EVENT_TYPE_ILL_DEV:
+ printk("ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x "
+ "address=0x%016llx flags=0x%04x]\n",
+ PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
+ address, flags);
+ break;
+ case EVENT_TYPE_IO_FAULT:
+ printk("IO_PAGE_FAULT device=%02x:%02x.%x "
+ "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
+ PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
+ domid, address, flags);
+ break;
+ case EVENT_TYPE_DEV_TAB_ERR:
+ printk("DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
+ "address=0x%016llx flags=0x%04x]\n",
+ PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
+ address, flags);
+ break;
+ case EVENT_TYPE_PAGE_TAB_ERR:
+ printk("PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
+ "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
+ PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
+ domid, address, flags);
+ break;
+ case EVENT_TYPE_ILL_CMD:
+ printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
+ break;
+ case EVENT_TYPE_CMD_HARD_ERR:
+ printk("COMMAND_HARDWARE_ERROR address=0x%016llx "
+ "flags=0x%04x]\n", address, flags);
+ break;
+ case EVENT_TYPE_IOTLB_INV_TO:
+ printk("IOTLB_INV_TIMEOUT device=%02x:%02x.%x "
+ "address=0x%016llx]\n",
+ PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
+ address);
+ break;
+ case EVENT_TYPE_INV_DEV_REQ:
+ printk("INVALID_DEVICE_REQUEST device=%02x:%02x.%x "
+ "address=0x%016llx flags=0x%04x]\n",
+ PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
+ address, flags);
+ break;
+ default:
+ printk(KERN_ERR "UNKNOWN type=0x%02x]\n", type);
+ }
+}
+
+static void iommu_poll_events(struct amd_iommu *iommu)
+{
+ u32 head, tail;
+ unsigned long flags;
+
+ spin_lock_irqsave(&iommu->lock, flags);
+
+ head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
+ tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
+
+ while (head != tail) {
+ iommu_print_event(iommu->evt_buf + head);
+ head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size;
+ }
+
+ writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
+
+ spin_unlock_irqrestore(&iommu->lock, flags);
+}
+
+irqreturn_t amd_iommu_int_handler(int irq, void *data)
+{
+ struct amd_iommu *iommu;
+
+ list_for_each_entry(iommu, &amd_iommu_list, list)
+ iommu_poll_events(iommu);
+
+ return IRQ_HANDLED;
+}
+
+/****************************************************************************
+ *
* IOMMU command queuing functions
*
****************************************************************************/
@@ -101,10 +201,10 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
*/
static int iommu_completion_wait(struct amd_iommu *iommu)
{
- int ret, ready = 0;
+ int ret = 0, ready = 0;
unsigned status = 0;
struct iommu_cmd cmd;
- unsigned long i = 0;
+ unsigned long flags, i = 0;
memset(&cmd, 0, sizeof(cmd));
cmd.data[0] = CMD_COMPL_WAIT_INT_MASK;
@@ -112,10 +212,12 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
iommu->need_sync = 0;
- ret = iommu_queue_command(iommu, &cmd);
+ spin_lock_irqsave(&iommu->lock, flags);
+
+ ret = __iommu_queue_command(iommu, &cmd);
if (ret)
- return ret;
+ goto out;
while (!ready && (i < EXIT_LOOP_COUNT)) {
++i;
@@ -130,6 +232,8 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit()))
printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n");
+out:
+ spin_unlock_irqrestore(&iommu->lock, flags);
return 0;
}
@@ -140,6 +244,7 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
{
struct iommu_cmd cmd;
+ int ret;
BUG_ON(iommu == NULL);
@@ -147,9 +252,11 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY);
cmd.data[0] = devid;
+ ret = iommu_queue_command(iommu, &cmd);
+
iommu->need_sync = 1;
- return iommu_queue_command(iommu, &cmd);
+ return ret;
}
/*
@@ -159,6 +266,7 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
u64 address, u16 domid, int pde, int s)
{
struct iommu_cmd cmd;
+ int ret;
memset(&cmd, 0, sizeof(cmd));
address &= PAGE_MASK;
@@ -171,9 +279,11 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
if (pde) /* PDE bit - we wan't flush everything not only the PTEs */
cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
+ ret = iommu_queue_command(iommu, &cmd);
+
iommu->need_sync = 1;
- return iommu_queue_command(iommu, &cmd);
+ return ret;
}
/*
@@ -203,6 +313,14 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
return 0;
}
+/* Flush the whole IO/TLB for a given protection domain */
+static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid)
+{
+ u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
+
+ iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1);
+}
+
/****************************************************************************
*
* The functions below are used the create the page table mappings for
@@ -362,11 +480,6 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
* efficient allocator.
*
****************************************************************************/
-static unsigned long dma_mask_to_pages(unsigned long mask)
-{
- return (mask >> PAGE_SHIFT) +
- (PAGE_ALIGN(mask & ~PAGE_MASK) >> PAGE_SHIFT);
-}
/*
* The address allocator core function.
@@ -375,25 +488,31 @@ static unsigned long dma_mask_to_pages(unsigned long mask)
*/
static unsigned long dma_ops_alloc_addresses(struct device *dev,
struct dma_ops_domain *dom,
- unsigned int pages)
+ unsigned int pages,
+ unsigned long align_mask,
+ u64 dma_mask)
{
- unsigned long limit = dma_mask_to_pages(*dev->dma_mask);
+ unsigned long limit;
unsigned long address;
- unsigned long size = dom->aperture_size >> PAGE_SHIFT;
unsigned long boundary_size;
boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
PAGE_SIZE) >> PAGE_SHIFT;
- limit = limit < size ? limit : size;
+ limit = iommu_device_max_index(dom->aperture_size >> PAGE_SHIFT, 0,
+ dma_mask >> PAGE_SHIFT);
- if (dom->next_bit >= limit)
+ if (dom->next_bit >= limit) {
dom->next_bit = 0;
+ dom->need_flush = true;
+ }
address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages,
- 0 , boundary_size, 0);
- if (address == -1)
+ 0 , boundary_size, align_mask);
+ if (address == -1) {
address = iommu_area_alloc(dom->bitmap, limit, 0, pages,
- 0, boundary_size, 0);
+ 0, boundary_size, align_mask);
+ dom->need_flush = true;
+ }
if (likely(address != -1)) {
dom->next_bit = address + pages;
@@ -459,7 +578,7 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
if (start_page + pages > last_page)
pages = last_page - start_page;
- set_bit_string(dom->bitmap, start_page, pages);
+ iommu_area_reserve(dom->bitmap, start_page, pages);
}
static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
@@ -553,6 +672,9 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
dma_dom->bitmap[0] = 1;
dma_dom->next_bit = 0;
+ dma_dom->need_flush = false;
+ dma_dom->target_dev = 0xffff;
+
/* Intialize the exclusion range if necessary */
if (iommu->exclusion_start &&
iommu->exclusion_start < dma_dom->aperture_size) {
@@ -623,12 +745,13 @@ static void set_device_domain(struct amd_iommu *iommu,
u64 pte_root = virt_to_phys(domain->pt_root);
- pte_root |= (domain->mode & 0x07) << 9;
- pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | 2;
+ pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
+ << DEV_ENTRY_MODE_SHIFT;
+ pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
write_lock_irqsave(&amd_iommu_devtable_lock, flags);
- amd_iommu_dev_table[devid].data[0] = pte_root;
- amd_iommu_dev_table[devid].data[1] = pte_root >> 32;
+ amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
+ amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
amd_iommu_dev_table[devid].data[2] = domain->id;
amd_iommu_pd_table[devid] = domain;
@@ -646,6 +769,45 @@ static void set_device_domain(struct amd_iommu *iommu,
*****************************************************************************/
/*
+ * This function checks if the driver got a valid device from the caller to
+ * avoid dereferencing invalid pointers.
+ */
+static bool check_device(struct device *dev)
+{
+ if (!dev || !dev->dma_mask)
+ return false;
+
+ return true;
+}
+
+/*
+ * In this function the list of preallocated protection domains is traversed to
+ * find the domain for a specific device
+ */
+static struct dma_ops_domain *find_protection_domain(u16 devid)
+{
+ struct dma_ops_domain *entry, *ret = NULL;
+ unsigned long flags;
+
+ if (list_empty(&iommu_pd_list))
+ return NULL;
+
+ spin_lock_irqsave(&iommu_pd_list_lock, flags);
+
+ list_for_each_entry(entry, &iommu_pd_list, list) {
+ if (entry->target_dev == devid) {
+ ret = entry;
+ list_del(&ret->list);
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
+
+ return ret;
+}
+
+/*
* In the dma_ops path we only have the struct device. This function
* finds the corresponding IOMMU, the protection domain and the
* requestor id for a given device.
@@ -661,27 +823,30 @@ static int get_device_resources(struct device *dev,
struct pci_dev *pcidev;
u16 _bdf;
- BUG_ON(!dev || dev->bus != &pci_bus_type || !dev->dma_mask);
+ *iommu = NULL;
+ *domain = NULL;
+ *bdf = 0xffff;
+
+ if (dev->bus != &pci_bus_type)
+ return 0;
pcidev = to_pci_dev(dev);
_bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
/* device not translated by any IOMMU in the system? */
- if (_bdf > amd_iommu_last_bdf) {
- *iommu = NULL;
- *domain = NULL;
- *bdf = 0xffff;
+ if (_bdf > amd_iommu_last_bdf)
return 0;
- }
*bdf = amd_iommu_alias_table[_bdf];
*iommu = amd_iommu_rlookup_table[*bdf];
if (*iommu == NULL)
return 0;
- dma_dom = (*iommu)->default_dom;
*domain = domain_for_device(*bdf);
if (*domain == NULL) {
+ dma_dom = find_protection_domain(*bdf);
+ if (!dma_dom)
+ dma_dom = (*iommu)->default_dom;
*domain = &dma_dom->domain;
set_device_domain(*iommu, *domain, *bdf);
printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
@@ -760,17 +925,24 @@ static dma_addr_t __map_single(struct device *dev,
struct dma_ops_domain *dma_dom,
phys_addr_t paddr,
size_t size,
- int dir)
+ int dir,
+ bool align,
+ u64 dma_mask)
{
dma_addr_t offset = paddr & ~PAGE_MASK;
dma_addr_t address, start;
unsigned int pages;
+ unsigned long align_mask = 0;
int i;
pages = iommu_num_pages(paddr, size);
paddr &= PAGE_MASK;
- address = dma_ops_alloc_addresses(dev, dma_dom, pages);
+ if (align)
+ align_mask = (1UL << get_order(size)) - 1;
+
+ address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
+ dma_mask);
if (unlikely(address == bad_dma_address))
goto out;
@@ -782,6 +954,12 @@ static dma_addr_t __map_single(struct device *dev,
}
address += offset;
+ if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
+ iommu_flush_tlb(iommu, dma_dom->domain.id);
+ dma_dom->need_flush = false;
+ } else if (unlikely(iommu_has_npcache(iommu)))
+ iommu_flush_pages(iommu, dma_dom->domain.id, address, size);
+
out:
return address;
}
@@ -812,6 +990,9 @@ static void __unmap_single(struct amd_iommu *iommu,
}
dma_ops_free_addresses(dma_dom, dma_addr, pages);
+
+ if (amd_iommu_unmap_flush)
+ iommu_flush_pages(iommu, dma_dom->domain.id, dma_addr, size);
}
/*
@@ -825,6 +1006,12 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
struct protection_domain *domain;
u16 devid;
dma_addr_t addr;
+ u64 dma_mask;
+
+ if (!check_device(dev))
+ return bad_dma_address;
+
+ dma_mask = *dev->dma_mask;
get_device_resources(dev, &iommu, &domain, &devid);
@@ -833,14 +1020,12 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
return (dma_addr_t)paddr;
spin_lock_irqsave(&domain->lock, flags);
- addr = __map_single(dev, iommu, domain->priv, paddr, size, dir);
+ addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false,
+ dma_mask);
if (addr == bad_dma_address)
goto out;
- if (iommu_has_npcache(iommu))
- iommu_flush_pages(iommu, domain->id, addr, size);
-
- if (iommu->need_sync)
+ if (unlikely(iommu->need_sync))
iommu_completion_wait(iommu);
out:
@@ -860,7 +1045,8 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
struct protection_domain *domain;
u16 devid;
- if (!get_device_resources(dev, &iommu, &domain, &devid))
+ if (!check_device(dev) ||
+ !get_device_resources(dev, &iommu, &domain, &devid))
/* device not handled by any AMD IOMMU */
return;
@@ -868,9 +1054,7 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
__unmap_single(iommu, domain->priv, dma_addr, size, dir);
- iommu_flush_pages(iommu, domain->id, dma_addr, size);
-
- if (iommu->need_sync)
+ if (unlikely(iommu->need_sync))
iommu_completion_wait(iommu);
spin_unlock_irqrestore(&domain->lock, flags);
@@ -909,6 +1093,12 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
struct scatterlist *s;
phys_addr_t paddr;
int mapped_elems = 0;
+ u64 dma_mask;
+
+ if (!check_device(dev))
+ return 0;
+
+ dma_mask = *dev->dma_mask;
get_device_resources(dev, &iommu, &domain, &devid);
@@ -921,19 +1111,17 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
paddr = sg_phys(s);
s->dma_address = __map_single(dev, iommu, domain->priv,
- paddr, s->length, dir);
+ paddr, s->length, dir, false,
+ dma_mask);
if (s->dma_address) {
s->dma_length = s->length;
mapped_elems++;
} else
goto unmap;
- if (iommu_has_npcache(iommu))
- iommu_flush_pages(iommu, domain->id, s->dma_address,
- s->dma_length);
}
- if (iommu->need_sync)
+ if (unlikely(iommu->need_sync))
iommu_completion_wait(iommu);
out:
@@ -967,7 +1155,8 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
u16 devid;
int i;
- if (!get_device_resources(dev, &iommu, &domain, &devid))
+ if (!check_device(dev) ||
+ !get_device_resources(dev, &iommu, &domain, &devid))
return;
spin_lock_irqsave(&domain->lock, flags);
@@ -975,12 +1164,10 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
for_each_sg(sglist, s, nelems, i) {
__unmap_single(iommu, domain->priv, s->dma_address,
s->dma_length, dir);
- iommu_flush_pages(iommu, domain->id, s->dma_address,
- s->dma_length);
s->dma_address = s->dma_length = 0;
}
- if (iommu->need_sync)
+ if (unlikely(iommu->need_sync))
iommu_completion_wait(iommu);
spin_unlock_irqrestore(&domain->lock, flags);
@@ -998,25 +1185,33 @@ static void *alloc_coherent(struct device *dev, size_t size,
struct protection_domain *domain;
u16 devid;
phys_addr_t paddr;
+ u64 dma_mask = dev->coherent_dma_mask;
+
+ if (!check_device(dev))
+ return NULL;
+ if (!get_device_resources(dev, &iommu, &domain, &devid))
+ flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
+
+ flag |= __GFP_ZERO;
virt_addr = (void *)__get_free_pages(flag, get_order(size));
if (!virt_addr)
return 0;
- memset(virt_addr, 0, size);
paddr = virt_to_phys(virt_addr);
- get_device_resources(dev, &iommu, &domain, &devid);
-
if (!iommu || !domain) {
*dma_addr = (dma_addr_t)paddr;
return virt_addr;
}
+ if (!dma_mask)
+ dma_mask = *dev->dma_mask;
+
spin_lock_irqsave(&domain->lock, flags);
*dma_addr = __map_single(dev, iommu, domain->priv, paddr,
- size, DMA_BIDIRECTIONAL);
+ size, DMA_BIDIRECTIONAL, true, dma_mask);
if (*dma_addr == bad_dma_address) {
free_pages((unsigned long)virt_addr, get_order(size));
@@ -1024,10 +1219,7 @@ static void *alloc_coherent(struct device *dev, size_t size,
goto out;
}
- if (iommu_has_npcache(iommu))
- iommu_flush_pages(iommu, domain->id, *dma_addr, size);
-
- if (iommu->need_sync)
+ if (unlikely(iommu->need_sync))
iommu_completion_wait(iommu);
out:
@@ -1038,8 +1230,6 @@ out:
/*
* The exported free_coherent function for dma_ops.
- * FIXME: fix the generic x86 DMA layer so that it actually calls that
- * function.
*/
static void free_coherent(struct device *dev, size_t size,
void *virt_addr, dma_addr_t dma_addr)
@@ -1049,6 +1239,9 @@ static void free_coherent(struct device *dev, size_t size,
struct protection_domain *domain;
u16 devid;
+ if (!check_device(dev))
+ return;
+
get_device_resources(dev, &iommu, &domain, &devid);
if (!iommu || !domain)
@@ -1057,9 +1250,8 @@ static void free_coherent(struct device *dev, size_t size,
spin_lock_irqsave(&domain->lock, flags);
__unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
- iommu_flush_pages(iommu, domain->id, dma_addr, size);
- if (iommu->need_sync)
+ if (unlikely(iommu->need_sync))
iommu_completion_wait(iommu);
spin_unlock_irqrestore(&domain->lock, flags);
@@ -1069,6 +1261,30 @@ free_mem:
}
/*
+ * This function is called by the DMA layer to find out if we can handle a
+ * particular device. It is part of the dma_ops.
+ */
+static int amd_iommu_dma_supported(struct device *dev, u64 mask)
+{
+ u16 bdf;
+ struct pci_dev *pcidev;
+
+ /* No device or no PCI device */
+ if (!dev || dev->bus != &pci_bus_type)
+ return 0;
+
+ pcidev = to_pci_dev(dev);
+
+ bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
+
+ /* Out of our scope? */
+ if (bdf > amd_iommu_last_bdf)
+ return 0;
+
+ return 1;
+}
+
+/*
* The function for pre-allocating protection domains.
*
* If the driver core informs the DMA layer if a driver grabs a device
@@ -1097,10 +1313,9 @@ void prealloc_protection_domains(void)
if (!dma_dom)
continue;
init_unity_mappings_for_device(dma_dom, devid);
- set_device_domain(iommu, &dma_dom->domain, devid);
- printk(KERN_INFO "AMD IOMMU: Allocated domain %d for device ",
- dma_dom->domain.id);
- print_devid(devid, 1);
+ dma_dom->target_dev = devid;
+
+ list_add_tail(&dma_dom->list, &iommu_pd_list);
}
}
@@ -1111,6 +1326,7 @@ static struct dma_mapping_ops amd_iommu_dma_ops = {
.unmap_single = unmap_single,
.map_sg = map_sg,
.unmap_sg = unmap_sg,
+ .dma_supported = amd_iommu_dma_supported,
};
/*
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index a69cc0f5204..148fcfe22f1 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -22,6 +22,8 @@
#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/sysdev.h>
+#include <linux/interrupt.h>
+#include <linux/msi.h>
#include <asm/pci-direct.h>
#include <asm/amd_iommu_types.h>
#include <asm/amd_iommu.h>
@@ -30,7 +32,6 @@
/*
* definitions for the ACPI scanning code
*/
-#define PCI_BUS(x) (((x) >> 8) & 0xff)
#define IVRS_HEADER_LENGTH 48
#define ACPI_IVHD_TYPE 0x10
@@ -121,6 +122,7 @@ LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
we find in ACPI */
unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */
int amd_iommu_isolate; /* if 1, device isolation is enabled */
+bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
system */
@@ -234,7 +236,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
{
u32 ctrl;
- ctrl = (u64)readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
+ ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
ctrl &= ~(1 << bit);
writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
}
@@ -242,13 +244,23 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
/* Function to enable the hardware */
void __init iommu_enable(struct amd_iommu *iommu)
{
- printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at ");
- print_devid(iommu->devid, 0);
- printk(" cap 0x%hx\n", iommu->cap_ptr);
+ printk(KERN_INFO "AMD IOMMU: Enabling IOMMU "
+ "at %02x:%02x.%x cap 0x%hx\n",
+ iommu->dev->bus->number,
+ PCI_SLOT(iommu->dev->devfn),
+ PCI_FUNC(iommu->dev->devfn),
+ iommu->cap_ptr);
iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
}
+/* Function to enable IOMMU event logging and event interrupts */
+void __init iommu_enable_event_logging(struct amd_iommu *iommu)
+{
+ iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
+ iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
+}
+
/*
* mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in
* the system has one.
@@ -286,6 +298,14 @@ static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu)
****************************************************************************/
/*
+ * This function calculates the length of a given IVHD entry
+ */
+static inline int ivhd_entry_length(u8 *ivhd)
+{
+ return 0x04 << (*ivhd >> 6);
+}
+
+/*
* This function reads the last device id the IOMMU has to handle from the PCI
* capability header for this IOMMU
*/
@@ -329,7 +349,7 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
default:
break;
}
- p += 0x04 << (*p >> 6);
+ p += ivhd_entry_length(p);
}
WARN_ON(p != end);
@@ -414,7 +434,32 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
static void __init free_command_buffer(struct amd_iommu *iommu)
{
- free_pages((unsigned long)iommu->cmd_buf, get_order(CMD_BUFFER_SIZE));
+ free_pages((unsigned long)iommu->cmd_buf,
+ get_order(iommu->cmd_buf_size));
+}
+
+/* allocates the memory where the IOMMU will log its events to */
+static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
+{
+ u64 entry;
+ iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(EVT_BUFFER_SIZE));
+
+ if (iommu->evt_buf == NULL)
+ return NULL;
+
+ entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
+ memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
+ &entry, sizeof(entry));
+
+ iommu->evt_buf_size = EVT_BUFFER_SIZE;
+
+ return iommu->evt_buf;
+}
+
+static void __init free_event_buffer(struct amd_iommu *iommu)
+{
+ free_pages((unsigned long)iommu->evt_buf, get_order(EVT_BUFFER_SIZE));
}
/* sets a specific bit in the device table entry. */
@@ -487,19 +532,21 @@ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
*/
static void __init init_iommu_from_pci(struct amd_iommu *iommu)
{
- int bus = PCI_BUS(iommu->devid);
- int dev = PCI_SLOT(iommu->devid);
- int fn = PCI_FUNC(iommu->devid);
int cap_ptr = iommu->cap_ptr;
- u32 range;
+ u32 range, misc;
- iommu->cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_CAP_HDR_OFFSET);
+ pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
+ &iommu->cap);
+ pci_read_config_dword(iommu->dev, cap_ptr + MMIO_RANGE_OFFSET,
+ &range);
+ pci_read_config_dword(iommu->dev, cap_ptr + MMIO_MISC_OFFSET,
+ &misc);
- range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
iommu->first_device = calc_devid(MMIO_GET_BUS(range),
MMIO_GET_FD(range));
iommu->last_device = calc_devid(MMIO_GET_BUS(range),
MMIO_GET_LD(range));
+ iommu->evt_msi_num = MMIO_MSI_NUM(misc);
}
/*
@@ -604,7 +651,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
break;
}
- p += 0x04 << (e->type >> 6);
+ p += ivhd_entry_length(p);
}
}
@@ -622,6 +669,7 @@ static int __init init_iommu_devices(struct amd_iommu *iommu)
static void __init free_iommu_one(struct amd_iommu *iommu)
{
free_command_buffer(iommu);
+ free_event_buffer(iommu);
iommu_unmap_mmio_space(iommu);
}
@@ -649,8 +697,12 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
/*
* Copy data from ACPI table entry to the iommu struct
*/
- iommu->devid = h->devid;
+ iommu->dev = pci_get_bus_and_slot(PCI_BUS(h->devid), h->devid & 0xff);
+ if (!iommu->dev)
+ return 1;
+
iommu->cap_ptr = h->cap_ptr;
+ iommu->pci_seg = h->pci_seg;
iommu->mmio_phys = h->mmio_phys;
iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys);
if (!iommu->mmio_base)
@@ -661,10 +713,18 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
if (!iommu->cmd_buf)
return -ENOMEM;
+ iommu->evt_buf = alloc_event_buffer(iommu);
+ if (!iommu->evt_buf)
+ return -ENOMEM;
+
+ iommu->int_enabled = false;
+
init_iommu_from_pci(iommu);
init_iommu_from_acpi(iommu, h);
init_iommu_devices(iommu);
+ pci_enable_device(iommu->dev);
+
return 0;
}
@@ -706,6 +766,95 @@ static int __init init_iommu_all(struct acpi_table_header *table)
/****************************************************************************
*
+ * The following functions initialize the MSI interrupts for all IOMMUs
+ * in the system. Its a bit challenging because there could be multiple
+ * IOMMUs per PCI BDF but we can call pci_enable_msi(x) only once per
+ * pci_dev.
+ *
+ ****************************************************************************/
+
+static int __init iommu_setup_msix(struct amd_iommu *iommu)
+{
+ struct amd_iommu *curr;
+ struct msix_entry entries[32]; /* only 32 supported by AMD IOMMU */
+ int nvec = 0, i;
+
+ list_for_each_entry(curr, &amd_iommu_list, list) {
+ if (curr->dev == iommu->dev) {
+ entries[nvec].entry = curr->evt_msi_num;
+ entries[nvec].vector = 0;
+ curr->int_enabled = true;
+ nvec++;
+ }
+ }
+
+ if (pci_enable_msix(iommu->dev, entries, nvec)) {
+ pci_disable_msix(iommu->dev);
+ return 1;
+ }
+
+ for (i = 0; i < nvec; ++i) {
+ int r = request_irq(entries->vector, amd_iommu_int_handler,
+ IRQF_SAMPLE_RANDOM,
+ "AMD IOMMU",
+ NULL);
+ if (r)
+ goto out_free;
+ }
+
+ return 0;
+
+out_free:
+ for (i -= 1; i >= 0; --i)
+ free_irq(entries->vector, NULL);
+
+ pci_disable_msix(iommu->dev);
+
+ return 1;
+}
+
+static int __init iommu_setup_msi(struct amd_iommu *iommu)
+{
+ int r;
+ struct amd_iommu *curr;
+
+ list_for_each_entry(curr, &amd_iommu_list, list) {
+ if (curr->dev == iommu->dev)
+ curr->int_enabled = true;
+ }
+
+
+ if (pci_enable_msi(iommu->dev))
+ return 1;
+
+ r = request_irq(iommu->dev->irq, amd_iommu_int_handler,
+ IRQF_SAMPLE_RANDOM,
+ "AMD IOMMU",
+ NULL);
+
+ if (r) {
+ pci_disable_msi(iommu->dev);
+ return 1;
+ }
+
+ return 0;
+}
+
+static int __init iommu_init_msi(struct amd_iommu *iommu)
+{
+ if (iommu->int_enabled)
+ return 0;
+
+ if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSIX))
+ return iommu_setup_msix(iommu);
+ else if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
+ return iommu_setup_msi(iommu);
+
+ return 1;
+}
+
+/****************************************************************************
+ *
* The next functions belong to the third pass of parsing the ACPI
* table. In this last pass the memory mapping requirements are
* gathered (like exclusion and unity mapping reanges).
@@ -811,7 +960,6 @@ static void init_device_table(void)
for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) {
set_dev_entry_bit(devid, DEV_ENTRY_VALID);
set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION);
- set_dev_entry_bit(devid, DEV_ENTRY_NO_PAGE_FAULT);
}
}
@@ -825,6 +973,8 @@ static void __init enable_iommus(void)
list_for_each_entry(iommu, &amd_iommu_list, list) {
iommu_set_exclusion_range(iommu);
+ iommu_init_msi(iommu);
+ iommu_enable_event_logging(iommu);
iommu_enable(iommu);
}
}
@@ -995,11 +1145,17 @@ int __init amd_iommu_init(void)
else
printk("disabled\n");
+ if (amd_iommu_unmap_flush)
+ printk(KERN_INFO "AMD IOMMU: IO/TLB flush on unmap enabled\n");
+ else
+ printk(KERN_INFO "AMD IOMMU: Lazy IO/TLB flushing enabled\n");
+
out:
return ret;
free:
- free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1);
+ free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
+ get_order(MAX_DOMAIN_ID/8));
free_pages((unsigned long)amd_iommu_pd_table,
get_order(rlookup_table_size));
@@ -1057,8 +1213,10 @@ void __init amd_iommu_detect(void)
static int __init parse_amd_iommu_options(char *str)
{
for (; *str; ++str) {
- if (strcmp(str, "isolate") == 0)
+ if (strncmp(str, "isolate", 7) == 0)
amd_iommu_isolate = 1;
+ if (strncmp(str, "fullflush", 11) == 0)
+ amd_iommu_unmap_flush = true;
}
return 1;
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 44e21826db1..9a32b37ee2e 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -455,11 +455,11 @@ out:
force_iommu ||
valid_agp ||
fallback_aper_force) {
- printk(KERN_ERR
+ printk(KERN_INFO
"Your BIOS doesn't leave a aperture memory hole\n");
- printk(KERN_ERR
+ printk(KERN_INFO
"Please enable the IOMMU option in the BIOS setup\n");
- printk(KERN_ERR
+ printk(KERN_INFO
"This costs you %d MB of RAM\n",
32 << fallback_aper_order);
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index 58427210505..a91c57cb666 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -60,10 +60,8 @@ unsigned long mp_lapic_addr;
static int force_enable_local_apic;
int disable_apic;
-/* Local APIC timer verification ok */
-static int local_apic_timer_verify_ok;
/* Disable local APIC timer from the kernel commandline or via dmi quirk */
-static int local_apic_timer_disabled;
+static int disable_apic_timer __cpuinitdata;
/* Local APIC timer works in C2 */
int local_apic_timer_c2_ok;
EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
@@ -130,7 +128,11 @@ static inline int lapic_get_version(void)
*/
static inline int lapic_is_integrated(void)
{
+#ifdef CONFIG_X86_64
+ return 1;
+#else
return APIC_INTEGRATED(lapic_get_version());
+#endif
}
/*
@@ -244,8 +246,12 @@ int lapic_get_maxlvt(void)
* Local APIC timer
*/
-/* Clock divisor is set to 16 */
+/* Clock divisor */
+#ifdef CONFG_X86_64
+#define APIC_DIVISOR 1
+#else
#define APIC_DIVISOR 16
+#endif
/*
* This function sets up the local APIC timer, with a timeout of
@@ -253,6 +259,9 @@ int lapic_get_maxlvt(void)
* this function twice on the boot CPU, once with a bogus timeout
* value, second time for real. The other (noncalibrating) CPUs
* call this function only once, with the real, calibrated value.
+ *
+ * We do reads before writes even if unnecessary, to get around the
+ * P5 APIC double write bug.
*/
static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
{
@@ -274,14 +283,44 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
*/
tmp_value = apic_read(APIC_TDCR);
apic_write(APIC_TDCR,
- (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) |
- APIC_TDR_DIV_16);
+ (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) |
+ APIC_TDR_DIV_16);
if (!oneshot)
apic_write(APIC_TMICT, clocks / APIC_DIVISOR);
}
/*
+ * Setup extended LVT, AMD specific (K8, family 10h)
+ *
+ * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
+ * MCE interrupts are supported. Thus MCE offset must be set to 0.
+ */
+
+#define APIC_EILVT_LVTOFF_MCE 0
+#define APIC_EILVT_LVTOFF_IBS 1
+
+static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
+{
+ unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
+ unsigned int v = (mask << 16) | (msg_type << 8) | vector;
+
+ apic_write(reg, v);
+}
+
+u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
+{
+ setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
+ return APIC_EILVT_LVTOFF_MCE;
+}
+
+u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
+{
+ setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
+ return APIC_EILVT_LVTOFF_IBS;
+}
+
+/*
* Program the next event, relative to now
*/
static int lapic_next_event(unsigned long delta,
@@ -300,8 +339,8 @@ static void lapic_timer_setup(enum clock_event_mode mode,
unsigned long flags;
unsigned int v;
- /* Lapic used for broadcast ? */
- if (!local_apic_timer_verify_ok)
+ /* Lapic used as dummy for broadcast ? */
+ if (evt->features & CLOCK_EVT_FEAT_DUMMY)
return;
local_irq_save(flags);
@@ -514,7 +553,7 @@ static int __init calibrate_APIC_clock(void)
return -1;
}
- local_apic_timer_verify_ok = 1;
+ levt->features &= ~CLOCK_EVT_FEAT_DUMMY;
/* We trust the pm timer based calibration */
if (!pm_referenced) {
@@ -548,11 +587,11 @@ static int __init calibrate_APIC_clock(void)
if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2)
apic_printk(APIC_VERBOSE, "... jiffies result ok\n");
else
- local_apic_timer_verify_ok = 0;
+ levt->features |= CLOCK_EVT_FEAT_DUMMY;
} else
local_irq_enable();
- if (!local_apic_timer_verify_ok) {
+ if (levt->features & CLOCK_EVT_FEAT_DUMMY) {
printk(KERN_WARNING
"APIC timer disabled due to verification failure.\n");
return -1;
@@ -574,7 +613,8 @@ void __init setup_boot_APIC_clock(void)
* timer as a dummy clock event source on SMP systems, so the
* broadcast mechanism is used. On UP systems simply ignore it.
*/
- if (local_apic_timer_disabled) {
+ if (disable_apic_timer) {
+ printk(KERN_INFO "Disabling APIC timer\n");
/* No broadcast on UP ! */
if (num_possible_cpus() > 1) {
lapic_clockevent.mult = 1;
@@ -643,7 +683,11 @@ static void local_apic_timer_interrupt(void)
/*
* the NMI deadlock-detector uses this.
*/
+#ifdef CONFIG_X86_64
+ add_pda(apic_timer_irqs, 1);
+#else
per_cpu(irq_stat, cpu).apic_timer_irqs++;
+#endif
evt->event_handler(evt);
}
@@ -683,35 +727,6 @@ int setup_profiling_timer(unsigned int multiplier)
}
/*
- * Setup extended LVT, AMD specific (K8, family 10h)
- *
- * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
- * MCE interrupts are supported. Thus MCE offset must be set to 0.
- */
-
-#define APIC_EILVT_LVTOFF_MCE 0
-#define APIC_EILVT_LVTOFF_IBS 1
-
-static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
-{
- unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
- unsigned int v = (mask << 16) | (msg_type << 8) | vector;
- apic_write(reg, v);
-}
-
-u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
-{
- setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
- return APIC_EILVT_LVTOFF_MCE;
-}
-
-u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
-{
- setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
- return APIC_EILVT_LVTOFF_IBS;
-}
-
-/*
* Local APIC start and shutdown
*/
@@ -756,7 +771,7 @@ void clear_local_APIC(void)
}
/* lets not touch this if we didn't frob it */
-#ifdef CONFIG_X86_MCE_P4THERMAL
+#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(X86_MCE_INTEL)
if (maxlvt >= 5) {
v = apic_read(APIC_LVTTHMR);
apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
@@ -773,10 +788,6 @@ void clear_local_APIC(void)
if (maxlvt >= 4)
apic_write(APIC_LVTPC, APIC_LVT_MASKED);
-#ifdef CONFIG_X86_MCE_P4THERMAL
- if (maxlvt >= 5)
- apic_write(APIC_LVTTHMR, APIC_LVT_MASKED);
-#endif
/* Integrated APIC (!82489DX) ? */
if (lapic_is_integrated()) {
if (maxlvt > 3)
@@ -791,7 +802,7 @@ void clear_local_APIC(void)
*/
void disable_local_APIC(void)
{
- unsigned long value;
+ unsigned int value;
clear_local_APIC();
@@ -803,6 +814,7 @@ void disable_local_APIC(void)
value &= ~APIC_SPIV_APIC_ENABLED;
apic_write(APIC_SPIV, value);
+#ifdef CONFIG_X86_32
/*
* When LAPIC was disabled by the BIOS and enabled by the kernel,
* restore the disabled state.
@@ -814,6 +826,7 @@ void disable_local_APIC(void)
l &= ~MSR_IA32_APICBASE_ENABLE;
wrmsr(MSR_IA32_APICBASE, l, h);
}
+#endif
}
/*
@@ -830,11 +843,15 @@ void lapic_shutdown(void)
return;
local_irq_save(flags);
- clear_local_APIC();
- if (enabled_via_apicbase)
+#ifdef CONFIG_X86_32
+ if (!enabled_via_apicbase)
+ clear_local_APIC();
+ else
+#endif
disable_local_APIC();
+
local_irq_restore(flags);
}
@@ -879,6 +896,12 @@ int __init verify_local_APIC(void)
*/
reg0 = apic_read(APIC_ID);
apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
+ apic_write(APIC_ID, reg0 ^ APIC_ID_MASK);
+ reg1 = apic_read(APIC_ID);
+ apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
+ apic_write(APIC_ID, reg0);
+ if (reg1 != (reg0 ^ APIC_ID_MASK))
+ return 0;
/*
* The next two are just to see if we have sane values.
@@ -904,14 +927,15 @@ void __init sync_Arb_IDs(void)
*/
if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
return;
+
/*
* Wait for idle.
*/
apic_wait_icr_idle();
apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
- apic_write(APIC_ICR,
- APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | APIC_DM_INIT);
+ apic_write(APIC_ICR, APIC_DEST_ALLINC |
+ APIC_INT_LEVELTRIG | APIC_DM_INIT);
}
/*
@@ -919,7 +943,7 @@ void __init sync_Arb_IDs(void)
*/
void __init init_bsp_APIC(void)
{
- unsigned long value;
+ unsigned int value;
/*
* Don't do the setup now if we have a SMP BIOS as the
@@ -940,11 +964,13 @@ void __init init_bsp_APIC(void)
value &= ~APIC_VECTOR_MASK;
value |= APIC_SPIV_APIC_ENABLED;
+#ifdef CONFIG_X86_32
/* This bit is reserved on P4/Xeon and should be cleared */
if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
(boot_cpu_data.x86 == 15))
value &= ~APIC_SPIV_FOCUS_DISABLED;
else
+#endif
value |= APIC_SPIV_FOCUS_DISABLED;
value |= SPURIOUS_APIC_VECTOR;
apic_write(APIC_SPIV, value);
@@ -963,6 +989,16 @@ static void __cpuinit lapic_setup_esr(void)
{
unsigned long oldvalue, value, maxlvt;
if (lapic_is_integrated() && !esr_disable) {
+ if (esr_disable) {
+ /*
+ * Something untraceable is creating bad interrupts on
+ * secondary quads ... for the moment, just leave the
+ * ESR disabled - we can't do anything useful with the
+ * errors anyway - mbligh
+ */
+ printk(KERN_INFO "Leaving ESR disabled.\n");
+ return;
+ }
/* !82489DX */
maxlvt = lapic_get_maxlvt();
if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
@@ -983,16 +1019,7 @@ static void __cpuinit lapic_setup_esr(void)
"vector: 0x%08lx after: 0x%08lx\n",
oldvalue, value);
} else {
- if (esr_disable)
- /*
- * Something untraceable is creating bad interrupts on
- * secondary quads ... for the moment, just leave the
- * ESR disabled - we can't do anything useful with the
- * errors anyway - mbligh
- */
- printk(KERN_INFO "Leaving ESR disabled.\n");
- else
- printk(KERN_INFO "No ESR for 82489DX.\n");
+ printk(KERN_INFO "No ESR for 82489DX.\n");
}
}
@@ -1130,13 +1157,17 @@ void __cpuinit setup_local_APIC(void)
void __cpuinit end_local_APIC_setup(void)
{
- unsigned long value;
-
lapic_setup_esr();
- /* Disable the local apic timer */
- value = apic_read(APIC_LVTT);
- value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
- apic_write(APIC_LVTT, value);
+
+#ifdef CONFIG_X86_32
+ {
+ unsigned int value;
+ /* Disable the local apic timer */
+ value = apic_read(APIC_LVTT);
+ value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
+ apic_write(APIC_LVTT, value);
+ }
+#endif
setup_apic_nmi_watchdog(NULL);
apic_pm_activate();
@@ -1367,6 +1398,7 @@ void smp_error_interrupt(struct pt_regs *regs)
*/
void __init connect_bsp_APIC(void)
{
+#ifdef CONFIG_X86_32
if (pic_mode) {
/*
* Do not trust the local APIC being empty at bootup.
@@ -1381,6 +1413,7 @@ void __init connect_bsp_APIC(void)
outb(0x70, 0x22);
outb(0x01, 0x23);
}
+#endif
enable_apic_mode();
}
@@ -1393,6 +1426,9 @@ void __init connect_bsp_APIC(void)
*/
void disconnect_bsp_APIC(int virt_wire_setup)
{
+ unsigned int value;
+
+#ifdef CONFIG_X86_32
if (pic_mode) {
/*
* Put the board back into PIC mode (has an effect only on
@@ -1404,54 +1440,53 @@ void disconnect_bsp_APIC(int virt_wire_setup)
"entering PIC mode.\n");
outb(0x70, 0x22);
outb(0x00, 0x23);
- } else {
- /* Go back to Virtual Wire compatibility mode */
- unsigned long value;
+ return;
+ }
+#endif
- /* For the spurious interrupt use vector F, and enable it */
- value = apic_read(APIC_SPIV);
- value &= ~APIC_VECTOR_MASK;
- value |= APIC_SPIV_APIC_ENABLED;
- value |= 0xf;
- apic_write(APIC_SPIV, value);
+ /* Go back to Virtual Wire compatibility mode */
- if (!virt_wire_setup) {
- /*
- * For LVT0 make it edge triggered, active high,
- * external and enabled
- */
- value = apic_read(APIC_LVT0);
- value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
- APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
- APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
- value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
- value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
- apic_write(APIC_LVT0, value);
- } else {
- /* Disable LVT0 */
- apic_write(APIC_LVT0, APIC_LVT_MASKED);
- }
+ /* For the spurious interrupt use vector F, and enable it */
+ value = apic_read(APIC_SPIV);
+ value &= ~APIC_VECTOR_MASK;
+ value |= APIC_SPIV_APIC_ENABLED;
+ value |= 0xf;
+ apic_write(APIC_SPIV, value);
+ if (!virt_wire_setup) {
/*
- * For LVT1 make it edge triggered, active high, nmi and
- * enabled
+ * For LVT0 make it edge triggered, active high,
+ * external and enabled
*/
- value = apic_read(APIC_LVT1);
- value &= ~(
- APIC_MODE_MASK | APIC_SEND_PENDING |
+ value = apic_read(APIC_LVT0);
+ value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
- value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
- apic_write(APIC_LVT1, value);
+ value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
+ apic_write(APIC_LVT0, value);
+ } else {
+ /* Disable LVT0 */
+ apic_write(APIC_LVT0, APIC_LVT_MASKED);
}
+
+ /*
+ * For LVT1 make it edge triggered, active high,
+ * nmi and enabled
+ */
+ value = apic_read(APIC_LVT1);
+ value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
+ APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+ APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
+ value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+ value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
+ apic_write(APIC_LVT1, value);
}
void __cpuinit generic_processor_info(int apicid, int version)
{
int cpu;
cpumask_t tmp_map;
- physid_mask_t phys_cpu;
/*
* Validate version
@@ -1464,9 +1499,6 @@ void __cpuinit generic_processor_info(int apicid, int version)
}
apic_version[apicid] = version;
- phys_cpu = apicid_to_cpu_present(apicid);
- physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu);
-
if (num_processors >= NR_CPUS) {
printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
" Processor ignored.\n", NR_CPUS);
@@ -1477,17 +1509,19 @@ void __cpuinit generic_processor_info(int apicid, int version)
cpus_complement(tmp_map, cpu_present_map);
cpu = first_cpu(tmp_map);
- if (apicid == boot_cpu_physical_apicid)
+ physid_set(apicid, phys_cpu_present_map);
+ if (apicid == boot_cpu_physical_apicid) {
/*
* x86_bios_cpu_apicid is required to have processors listed
* in same order as logical cpu numbers. Hence the first
* entry is BSP, and so on.
*/
cpu = 0;
-
+ }
if (apicid > max_physical_apicid)
max_physical_apicid = apicid;
+#ifdef CONFIG_X86_32
/*
* Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
* but we need to work other dependencies like SMP_SUSPEND etc
@@ -1507,7 +1541,9 @@ void __cpuinit generic_processor_info(int apicid, int version)
def_to_bigsmp = 1;
}
}
-#ifdef CONFIG_SMP
+#endif
+
+#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64)
/* are we being called early in kernel startup? */
if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
@@ -1520,6 +1556,7 @@ void __cpuinit generic_processor_info(int apicid, int version)
per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
}
#endif
+
cpu_set(cpu, cpu_possible_map);
cpu_set(cpu, cpu_present_map);
}
@@ -1530,6 +1567,11 @@ void __cpuinit generic_processor_info(int apicid, int version)
#ifdef CONFIG_PM
static struct {
+ /*
+ * 'active' is true if the local APIC was enabled by us and
+ * not the BIOS; this signifies that we are also responsible
+ * for disabling it before entering apm/acpi suspend
+ */
int active;
/* r/w apic fields */
unsigned int apic_id;
@@ -1570,7 +1612,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
-#ifdef CONFIG_X86_MCE_P4THERMAL
+#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
if (maxlvt >= 5)
apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
#endif
@@ -1594,16 +1636,23 @@ static int lapic_resume(struct sys_device *dev)
local_irq_save(flags);
- /*
- * Make sure the APICBASE points to the right address
- *
- * FIXME! This will be wrong if we ever support suspend on
- * SMP! We'll need to do this as part of the CPU restore!
- */
- rdmsr(MSR_IA32_APICBASE, l, h);
- l &= ~MSR_IA32_APICBASE_BASE;
- l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
- wrmsr(MSR_IA32_APICBASE, l, h);
+#ifdef CONFIG_X86_64
+ if (x2apic)
+ enable_x2apic();
+ else
+#endif
+ {
+ /*
+ * Make sure the APICBASE points to the right address
+ *
+ * FIXME! This will be wrong if we ever support suspend on
+ * SMP! We'll need to do this as part of the CPU restore!
+ */
+ rdmsr(MSR_IA32_APICBASE, l, h);
+ l &= ~MSR_IA32_APICBASE_BASE;
+ l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
+ wrmsr(MSR_IA32_APICBASE, l, h);
+ }
apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
apic_write(APIC_ID, apic_pm_state.apic_id);
@@ -1613,7 +1662,7 @@ static int lapic_resume(struct sys_device *dev)
apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
-#ifdef CONFIG_X86_MCE_P4THERMAL
+#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
if (maxlvt >= 5)
apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
#endif
@@ -1627,7 +1676,9 @@ static int lapic_resume(struct sys_device *dev)
apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
+
local_irq_restore(flags);
+
return 0;
}
@@ -1683,20 +1734,20 @@ static int __init parse_lapic(char *arg)
}
early_param("lapic", parse_lapic);
-static int __init parse_nolapic(char *arg)
+static int __init setup_disableapic(char *arg)
{
disable_apic = 1;
setup_clear_cpu_cap(X86_FEATURE_APIC);
return 0;
}
-early_param("nolapic", parse_nolapic);
+early_param("disableapic", setup_disableapic);
-static int __init parse_disable_lapic_timer(char *arg)
+/* same as disableapic, for compatibility */
+static int __init setup_nolapic(char *arg)
{
- local_apic_timer_disabled = 1;
- return 0;
+ return setup_disableapic(arg);
}
-early_param("nolapic_timer", parse_disable_lapic_timer);
+early_param("nolapic", setup_nolapic);
static int __init parse_lapic_timer_c2_ok(char *arg)
{
@@ -1705,15 +1756,40 @@ static int __init parse_lapic_timer_c2_ok(char *arg)
}
early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
+static int __init parse_disable_apic_timer(char *arg)
+{
+ disable_apic_timer = 1;
+ return 0;
+}
+early_param("noapictimer", parse_disable_apic_timer);
+
+static int __init parse_nolapic_timer(char *arg)
+{
+ disable_apic_timer = 1;
+ return 0;
+}
+early_param("nolapic_timer", parse_nolapic_timer);
+
static int __init apic_set_verbosity(char *arg)
{
- if (!arg)
+ if (!arg) {
+#ifdef CONFIG_X86_64
+ skip_ioapic_setup = 0;
+ ioapic_force = 1;
+ return 0;
+#endif
return -EINVAL;
+ }
- if (strcmp(arg, "debug") == 0)
+ if (strcmp("debug", arg) == 0)
apic_verbosity = APIC_DEBUG;
- else if (strcmp(arg, "verbose") == 0)
+ else if (strcmp("verbose", arg) == 0)
apic_verbosity = APIC_VERBOSE;
+ else {
+ printk(KERN_WARNING "APIC Verbosity level %s not recognised"
+ " use apic=verbose or apic=debug\n", arg);
+ return -EINVAL;
+ }
return 0;
}
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
index 1a6011855af..53898b65a6a 100644
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -45,6 +45,7 @@
#include <mach_ipi.h>
#include <mach_apic.h>
+/* Disable local APIC timer from the kernel commandline or via dmi quirk */
static int disable_apic_timer __cpuinitdata;
static int apic_calibrate_pmtmr __initdata;
int disable_apic;
@@ -80,6 +81,9 @@ static void lapic_timer_setup(enum clock_event_mode mode,
static void lapic_timer_broadcast(cpumask_t mask);
static void apic_pm_activate(void);
+/*
+ * The local apic timer can be used for any function which is CPU local.
+ */
static struct clock_event_device lapic_clockevent = {
.name = "lapic",
.features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
@@ -106,11 +110,15 @@ static inline int lapic_get_version(void)
}
/*
- * Check, if the APIC is integrated or a seperate chip
+ * Check, if the APIC is integrated or a separate chip
*/
static inline int lapic_is_integrated(void)
{
+#ifdef CONFIG_X86_64
return 1;
+#else
+ return APIC_INTEGRATED(lapic_get_version());
+#endif
}
/*
@@ -125,6 +133,11 @@ static int modern_apic(void)
return lapic_get_version() >= 0x14;
}
+/*
+ * Paravirt kernels also might be using these below ops. So we still
+ * use generic apic_read()/apic_write(), which might be pointing to different
+ * ops in PARAVIRT case.
+ */
void xapic_wait_icr_idle(void)
{
while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
@@ -149,7 +162,7 @@ u32 safe_xapic_wait_icr_idle(void)
void xapic_icr_write(u32 low, u32 id)
{
- apic_write(APIC_ICR2, id << 24);
+ apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id));
apic_write(APIC_ICR, low);
}
@@ -160,7 +173,7 @@ u64 xapic_icr_read(void)
icr2 = apic_read(APIC_ICR2);
icr1 = apic_read(APIC_ICR);
- return (icr1 | ((u64)icr2 << 32));
+ return icr1 | ((u64)icr2 << 32);
}
static struct apic_ops xapic_ops = {
@@ -173,7 +186,6 @@ static struct apic_ops xapic_ops = {
};
struct apic_ops __read_mostly *apic_ops = &xapic_ops;
-
EXPORT_SYMBOL_GPL(apic_ops);
static void x2apic_wait_icr_idle(void)
@@ -243,6 +255,17 @@ int lapic_get_maxlvt(void)
}
/*
+ * Local APIC timer
+ */
+
+/* Clock divisor */
+#ifdef CONFG_X86_64
+#define APIC_DIVISOR 1
+#else
+#define APIC_DIVISOR 16
+#endif
+
+/*
* This function sets up the local APIC timer, with a timeout of
* 'clocks' APIC bus clock. During calibration we actually call
* this function twice on the boot CPU, once with a bogus timeout
@@ -252,7 +275,6 @@ int lapic_get_maxlvt(void)
* We do reads before writes even if unnecessary, to get around the
* P5 APIC double write bug.
*/
-
static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
{
unsigned int lvtt_value, tmp_value;
@@ -260,6 +282,9 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
lvtt_value = LOCAL_TIMER_VECTOR;
if (!oneshot)
lvtt_value |= APIC_LVT_TIMER_PERIODIC;
+ if (!lapic_is_integrated())
+ lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
+
if (!irqen)
lvtt_value |= APIC_LVT_MASKED;
@@ -269,12 +294,12 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
* Divide PICLK by 16
*/
tmp_value = apic_read(APIC_TDCR);
- apic_write(APIC_TDCR, (tmp_value
- & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
- | APIC_TDR_DIV_16);
+ apic_write(APIC_TDCR,
+ (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) |
+ APIC_TDR_DIV_16);
if (!oneshot)
- apic_write(APIC_TMICT, clocks);
+ apic_write(APIC_TMICT, clocks / APIC_DIVISOR);
}
/*
@@ -444,7 +469,7 @@ static int __init calibrate_APIC_clock(void)
lapic_clockevent.min_delta_ns =
clockevent_delta2ns(0xF, &lapic_clockevent);
- calibration_result = result / HZ;
+ calibration_result = (result * APIC_DIVISOR) / HZ;
/*
* Do a sanity check on the APIC calibration result
@@ -466,10 +491,10 @@ static int __init calibrate_APIC_clock(void)
void __init setup_boot_APIC_clock(void)
{
/*
- * The local apic timer can be disabled via the kernel commandline.
- * Register the lapic timer as a dummy clock event source on SMP
- * systems, so the broadcast mechanism is used. On UP systems simply
- * ignore it.
+ * The local apic timer can be disabled via the kernel
+ * commandline or from the CPU detection code. Register the lapic
+ * timer as a dummy clock event source on SMP systems, so the
+ * broadcast mechanism is used. On UP systems simply ignore it.
*/
if (disable_apic_timer) {
printk(KERN_INFO "Disabling APIC timer\n");
@@ -481,7 +506,9 @@ void __init setup_boot_APIC_clock(void)
return;
}
- printk(KERN_INFO "Using local APIC timer interrupts.\n");
+ apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
+ "calibrating APIC timer ...\n");
+
if (calibrate_APIC_clock()) {
/* No broadcast on UP ! */
if (num_possible_cpus() > 1)
@@ -500,6 +527,7 @@ void __init setup_boot_APIC_clock(void)
printk(KERN_WARNING "APIC timer registered as dummy,"
" due to nmi_watchdog=%d!\n", nmi_watchdog);
+ /* Setup the lapic or request the broadcast */
setup_APIC_timer();
}
@@ -538,7 +566,11 @@ static void local_apic_timer_interrupt(void)
/*
* the NMI deadlock-detector uses this.
*/
+#ifdef CONFIG_X86_64
add_pda(apic_timer_irqs, 1);
+#else
+ per_cpu(irq_stat, cpu).apic_timer_irqs++;
+#endif
evt->event_handler(evt);
}
@@ -569,6 +601,7 @@ void smp_apic_timer_interrupt(struct pt_regs *regs)
irq_enter();
local_apic_timer_interrupt();
irq_exit();
+
set_irq_regs(old_regs);
}
@@ -622,6 +655,13 @@ void clear_local_APIC(void)
apic_write(APIC_LVTPC, v | APIC_LVT_MASKED);
}
+ /* lets not touch this if we didn't frob it */
+#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(X86_MCE_INTEL)
+ if (maxlvt >= 5) {
+ v = apic_read(APIC_LVTTHMR);
+ apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
+ }
+#endif
/*
* Clean APIC state for other OSs:
*/
@@ -632,8 +672,14 @@ void clear_local_APIC(void)
apic_write(APIC_LVTERR, APIC_LVT_MASKED);
if (maxlvt >= 4)
apic_write(APIC_LVTPC, APIC_LVT_MASKED);
- apic_write(APIC_ESR, 0);
- apic_read(APIC_ESR);
+
+ /* Integrated APIC (!82489DX) ? */
+ if (lapic_is_integrated()) {
+ if (maxlvt > 3)
+ /* Clear ESR due to Pentium errata 3AP and 11AP */
+ apic_write(APIC_ESR, 0);
+ apic_read(APIC_ESR);
+ }
}
/**
@@ -652,8 +698,28 @@ void disable_local_APIC(void)
value = apic_read(APIC_SPIV);
value &= ~APIC_SPIV_APIC_ENABLED;
apic_write(APIC_SPIV, value);
+
+#ifdef CONFIG_X86_32
+ /*
+ * When LAPIC was disabled by the BIOS and enabled by the kernel,
+ * restore the disabled state.
+ */
+ if (enabled_via_apicbase) {
+ unsigned int l, h;
+
+ rdmsr(MSR_IA32_APICBASE, l, h);
+ l &= ~MSR_IA32_APICBASE_ENABLE;
+ wrmsr(MSR_IA32_APICBASE, l, h);
+ }
+#endif
}
+/*
+ * If Linux enabled the LAPIC against the BIOS default disable it down before
+ * re-entering the BIOS on shutdown. Otherwise the BIOS may get confused and
+ * not power-off. Additionally clear all LVT entries before disable_local_APIC
+ * for the case where Linux didn't enable the LAPIC.
+ */
void lapic_shutdown(void)
{
unsigned long flags;
@@ -663,7 +729,13 @@ void lapic_shutdown(void)
local_irq_save(flags);
- disable_local_APIC();
+#ifdef CONFIG_X86_32
+ if (!enabled_via_apicbase)
+ clear_local_APIC();
+ else
+#endif
+ disable_local_APIC();
+
local_irq_restore(flags);
}
@@ -734,8 +806,11 @@ int __init verify_local_APIC(void)
*/
void __init sync_Arb_IDs(void)
{
- /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */
- if (modern_apic())
+ /*
+ * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not
+ * needed on AMD.
+ */
+ if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
return;
/*
@@ -744,8 +819,8 @@ void __init sync_Arb_IDs(void)
apic_wait_icr_idle();
apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
- apic_write(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG
- | APIC_DM_INIT);
+ apic_write(APIC_ICR, APIC_DEST_ALLINC |
+ APIC_INT_LEVELTRIG | APIC_DM_INIT);
}
/*
@@ -762,8 +837,6 @@ void __init init_bsp_APIC(void)
if (smp_found_config || !cpu_has_apic)
return;
- value = apic_read(APIC_LVR);
-
/*
* Do not trust the local APIC being empty at bootup.
*/
@@ -775,7 +848,15 @@ void __init init_bsp_APIC(void)
value = apic_read(APIC_SPIV);
value &= ~APIC_VECTOR_MASK;
value |= APIC_SPIV_APIC_ENABLED;
- value |= APIC_SPIV_FOCUS_DISABLED;
+
+#ifdef CONFIG_X86_32
+ /* This bit is reserved on P4/Xeon and should be cleared */
+ if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
+ (boot_cpu_data.x86 == 15))
+ value &= ~APIC_SPIV_FOCUS_DISABLED;
+ else
+#endif
+ value |= APIC_SPIV_FOCUS_DISABLED;
value |= SPURIOUS_APIC_VECTOR;
apic_write(APIC_SPIV, value);
@@ -784,9 +865,50 @@ void __init init_bsp_APIC(void)
*/
apic_write(APIC_LVT0, APIC_DM_EXTINT);
value = APIC_DM_NMI;
+ if (!lapic_is_integrated()) /* 82489DX */
+ value |= APIC_LVT_LEVEL_TRIGGER;
apic_write(APIC_LVT1, value);
}
+static void __cpuinit lapic_setup_esr(void)
+{
+ unsigned long oldvalue, value, maxlvt;
+ if (lapic_is_integrated() && !esr_disable) {
+ if (esr_disable) {
+ /*
+ * Something untraceable is creating bad interrupts on
+ * secondary quads ... for the moment, just leave the
+ * ESR disabled - we can't do anything useful with the
+ * errors anyway - mbligh
+ */
+ printk(KERN_INFO "Leaving ESR disabled.\n");
+ return;
+ }
+ /* !82489DX */
+ maxlvt = lapic_get_maxlvt();
+ if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
+ apic_write(APIC_ESR, 0);
+ oldvalue = apic_read(APIC_ESR);
+
+ /* enables sending errors */
+ value = ERROR_APIC_VECTOR;
+ apic_write(APIC_LVTERR, value);
+ /*
+ * spec says clear errors after enabling vector.
+ */
+ if (maxlvt > 3)
+ apic_write(APIC_ESR, 0);
+ value = apic_read(APIC_ESR);
+ if (value != oldvalue)
+ apic_printk(APIC_VERBOSE, "ESR value before enabling "
+ "vector: 0x%08lx after: 0x%08lx\n",
+ oldvalue, value);
+ } else {
+ printk(KERN_INFO "No ESR for 82489DX.\n");
+ }
+}
+
+
/**
* setup_local_APIC - setup the local APIC
*/
@@ -892,21 +1014,20 @@ void __cpuinit setup_local_APIC(void)
preempt_enable();
}
-static void __cpuinit lapic_setup_esr(void)
-{
- unsigned maxlvt = lapic_get_maxlvt();
-
- apic_write(APIC_LVTERR, ERROR_APIC_VECTOR);
- /*
- * spec says clear errors after enabling vector.
- */
- if (maxlvt > 3)
- apic_write(APIC_ESR, 0);
-}
-
void __cpuinit end_local_APIC_setup(void)
{
lapic_setup_esr();
+
+#ifdef CONFIG_X86_32
+ {
+ unsigned int value;
+ /* Disable the local apic timer */
+ value = apic_read(APIC_LVTT);
+ value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
+ apic_write(APIC_LVTT, value);
+ }
+#endif
+
setup_apic_nmi_watchdog(NULL);
apic_pm_activate();
}
@@ -1108,6 +1229,8 @@ void __init init_apic_mappings(void)
* This initializes the IO-APIC and APIC hardware if this is
* a UP kernel.
*/
+int apic_version[MAX_APICS];
+
int __init APIC_init_uniprocessor(void)
{
if (disable_apic) {
@@ -1209,17 +1332,57 @@ asmlinkage void smp_error_interrupt(void)
}
/**
- * * connect_bsp_APIC - attach the APIC to the interrupt system
- * */
+ * connect_bsp_APIC - attach the APIC to the interrupt system
+ */
void __init connect_bsp_APIC(void)
{
+#ifdef CONFIG_X86_32
+ if (pic_mode) {
+ /*
+ * Do not trust the local APIC being empty at bootup.
+ */
+ clear_local_APIC();
+ /*
+ * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's
+ * local APIC to INT and NMI lines.
+ */
+ apic_printk(APIC_VERBOSE, "leaving PIC mode, "
+ "enabling APIC mode.\n");
+ outb(0x70, 0x22);
+ outb(0x01, 0x23);
+ }
+#endif
enable_apic_mode();
}
+/**
+ * disconnect_bsp_APIC - detach the APIC from the interrupt system
+ * @virt_wire_setup: indicates, whether virtual wire mode is selected
+ *
+ * Virtual wire mode is necessary to deliver legacy interrupts even when the
+ * APIC is disabled.
+ */
void disconnect_bsp_APIC(int virt_wire_setup)
{
+ unsigned int value;
+
+#ifdef CONFIG_X86_32
+ if (pic_mode) {
+ /*
+ * Put the board back into PIC mode (has an effect only on
+ * certain older boards). Note that APIC interrupts, including
+ * IPIs, won't work beyond this point! The only exception are
+ * INIT IPIs.
+ */
+ apic_printk(APIC_VERBOSE, "disabling APIC mode, "
+ "entering PIC mode.\n");
+ outb(0x70, 0x22);
+ outb(0x00, 0x23);
+ return;
+ }
+#endif
+
/* Go back to Virtual Wire compatibility mode */
- unsigned long value;
/* For the spurious interrupt use vector F, and enable it */
value = apic_read(APIC_SPIV);
@@ -1245,7 +1408,10 @@ void disconnect_bsp_APIC(int virt_wire_setup)
apic_write(APIC_LVT0, APIC_LVT_MASKED);
}
- /* For LVT1 make it edge triggered, active high, nmi and enabled */
+ /*
+ * For LVT1 make it edge triggered, active high,
+ * nmi and enabled
+ */
value = apic_read(APIC_LVT1);
value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
@@ -1260,9 +1426,20 @@ void __cpuinit generic_processor_info(int apicid, int version)
int cpu;
cpumask_t tmp_map;
+ /*
+ * Validate version
+ */
+ if (version == 0x0) {
+ printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! "
+ "fixing up to 0x10. (tell your hw vendor)\n",
+ version);
+ version = 0x10;
+ }
+ apic_version[apicid] = version;
+
if (num_processors >= NR_CPUS) {
printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
- " Processor ignored.\n", NR_CPUS);
+ " Processor ignored.\n", NR_CPUS);
return;
}
@@ -1282,6 +1459,29 @@ void __cpuinit generic_processor_info(int apicid, int version)
if (apicid > max_physical_apicid)
max_physical_apicid = apicid;
+#ifdef CONFIG_X86_32
+ /*
+ * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
+ * but we need to work other dependencies like SMP_SUSPEND etc
+ * before this can be done without some confusion.
+ * if (CPU_HOTPLUG_ENABLED || num_processors > 8)
+ * - Ashok Raj <ashok.raj@intel.com>
+ */
+ if (max_physical_apicid >= 8) {
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_INTEL:
+ if (!APIC_XAPIC(version)) {
+ def_to_bigsmp = 0;
+ break;
+ }
+ /* If P4 and above fall through */
+ case X86_VENDOR_AMD:
+ def_to_bigsmp = 1;
+ }
+ }
+#endif
+
+#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64)
/* are we being called early in kernel startup? */
if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
@@ -1293,6 +1493,7 @@ void __cpuinit generic_processor_info(int apicid, int version)
per_cpu(x86_cpu_to_apicid, cpu) = apicid;
per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
}
+#endif
cpu_set(cpu, cpu_possible_map);
cpu_set(cpu, cpu_present_map);
@@ -1309,9 +1510,11 @@ int hard_smp_processor_id(void)
#ifdef CONFIG_PM
static struct {
- /* 'active' is true if the local APIC was enabled by us and
- not the BIOS; this signifies that we are also responsible
- for disabling it before entering apm/acpi suspend */
+ /*
+ * 'active' is true if the local APIC was enabled by us and
+ * not the BIOS; this signifies that we are also responsible
+ * for disabling it before entering apm/acpi suspend
+ */
int active;
/* r/w apic fields */
unsigned int apic_id;
@@ -1352,10 +1555,11 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
-#ifdef CONFIG_X86_MCE_INTEL
+#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
if (maxlvt >= 5)
apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
#endif
+
local_irq_save(flags);
disable_local_APIC();
local_irq_restore(flags);
@@ -1374,13 +1578,24 @@ static int lapic_resume(struct sys_device *dev)
maxlvt = lapic_get_maxlvt();
local_irq_save(flags);
- if (!x2apic) {
+
+#ifdef CONFIG_X86_64
+ if (x2apic)
+ enable_x2apic();
+ else
+#endif
+ {
+ /*
+ * Make sure the APICBASE points to the right address
+ *
+ * FIXME! This will be wrong if we ever support suspend on
+ * SMP! We'll need to do this as part of the CPU restore!
+ */
rdmsr(MSR_IA32_APICBASE, l, h);
l &= ~MSR_IA32_APICBASE_BASE;
l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
wrmsr(MSR_IA32_APICBASE, l, h);
- } else
- enable_x2apic();
+ }
apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
apic_write(APIC_ID, apic_pm_state.apic_id);
@@ -1390,7 +1605,7 @@ static int lapic_resume(struct sys_device *dev)
apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
-#ifdef CONFIG_X86_MCE_INTEL
+#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
if (maxlvt >= 5)
apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
#endif
@@ -1404,10 +1619,17 @@ static int lapic_resume(struct sys_device *dev)
apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
+
local_irq_restore(flags);
+
return 0;
}
+/*
+ * This device has no shutdown method - fully functioning local APICs
+ * are needed on every CPU up until machine_halt/restart/poweroff.
+ */
+
static struct sysdev_class lapic_sysclass = {
.name = "lapic",
.resume = lapic_resume,
@@ -1533,28 +1755,7 @@ early_param("nox2apic", setup_nox2apic);
/*
* APIC command line parameters
*/
-static int __init apic_set_verbosity(char *str)
-{
- if (str == NULL) {
- skip_ioapic_setup = 0;
- ioapic_force = 1;
- return 0;
- }
- if (strcmp("debug", str) == 0)
- apic_verbosity = APIC_DEBUG;
- else if (strcmp("verbose", str) == 0)
- apic_verbosity = APIC_VERBOSE;
- else {
- printk(KERN_WARNING "APIC Verbosity level %s not recognised"
- " use apic=verbose or apic=debug\n", str);
- return -EINVAL;
- }
-
- return 0;
-}
-early_param("apic", apic_set_verbosity);
-
-static __init int setup_disableapic(char *str)
+static int __init setup_disableapic(char *arg)
{
disable_apic = 1;
setup_clear_cpu_cap(X86_FEATURE_APIC);
@@ -1563,9 +1764,9 @@ static __init int setup_disableapic(char *str)
early_param("disableapic", setup_disableapic);
/* same as disableapic, for compatibility */
-static __init int setup_nolapic(char *str)
+static int __init setup_nolapic(char *arg)
{
- return setup_disableapic(str);
+ return setup_disableapic(arg);
}
early_param("nolapic", setup_nolapic);
@@ -1576,14 +1777,19 @@ static int __init parse_lapic_timer_c2_ok(char *arg)
}
early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
-static __init int setup_noapictimer(char *str)
+static int __init parse_disable_apic_timer(char *arg)
{
- if (str[0] != ' ' && str[0] != 0)
- return 0;
disable_apic_timer = 1;
- return 1;
+ return 0;
+}
+early_param("noapictimer", parse_disable_apic_timer);
+
+static int __init parse_nolapic_timer(char *arg)
+{
+ disable_apic_timer = 1;
+ return 0;
}
-__setup("noapictimer", setup_noapictimer);
+early_param("nolapic_timer", parse_nolapic_timer);
static __init int setup_apicpmtimer(char *s)
{
@@ -1593,6 +1799,31 @@ static __init int setup_apicpmtimer(char *s)
}
__setup("apicpmtimer", setup_apicpmtimer);
+static int __init apic_set_verbosity(char *arg)
+{
+ if (!arg) {
+#ifdef CONFIG_X86_64
+ skip_ioapic_setup = 0;
+ ioapic_force = 1;
+ return 0;
+#endif
+ return -EINVAL;
+ }
+
+ if (strcmp("debug", arg) == 0)
+ apic_verbosity = APIC_DEBUG;
+ else if (strcmp("verbose", arg) == 0)
+ apic_verbosity = APIC_VERBOSE;
+ else {
+ printk(KERN_WARNING "APIC Verbosity level %s not recognised"
+ " use apic=verbose or apic=debug\n", arg);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+early_param("apic", apic_set_verbosity);
+
static int __init lapic_insert_resource(void)
{
if (!apic_phys)
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 9ee24e6bc4b..5145a6e72bb 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -228,12 +228,12 @@
#include <linux/suspend.h>
#include <linux/kthread.h>
#include <linux/jiffies.h>
-#include <linux/smp_lock.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/desc.h>
#include <asm/i8253.h>
+#include <asm/olpc.h>
#include <asm/paravirt.h>
#include <asm/reboot.h>
@@ -2217,7 +2217,7 @@ static int __init apm_init(void)
dmi_check_system(apm_dmi_table);
- if (apm_info.bios.version == 0 || paravirt_enabled()) {
+ if (apm_info.bios.version == 0 || paravirt_enabled() || machine_is_olpc()) {
printk(KERN_INFO "apm: BIOS not found.\n");
return -ENODEV;
}
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
index c639bd55391..fdd585f9c53 100644
--- a/arch/x86/kernel/bios_uv.c
+++ b/arch/x86/kernel/bios_uv.c
@@ -25,11 +25,11 @@ x86_bios_strerror(long status)
{
const char *str;
switch (status) {
- case 0: str = "Call completed without error"; break;
- case -1: str = "Not implemented"; break;
- case -2: str = "Invalid argument"; break;
- case -3: str = "Call completed with error"; break;
- default: str = "Unknown BIOS status code"; break;
+ case 0: str = "Call completed without error"; break;
+ case -1: str = "Not implemented"; break;
+ case -2: str = "Invalid argument"; break;
+ case -3: str = "Call completed with error"; break;
+ default: str = "Unknown BIOS status code"; break;
}
return str;
}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 84c4040efa8..7581b62df18 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -572,35 +572,15 @@ void __init early_cpu_init(void)
/*
* The NOPL instruction is supposed to exist on all CPUs with
- * family >= 6, unfortunately, that's not true in practice because
+ * family >= 6; unfortunately, that's not true in practice because
* of early VIA chips and (more importantly) broken virtualizers that
- * are not easy to detect. Hence, probe for it based on first
- * principles.
- *
- * Note: no 64-bit chip is known to lack these, but put the code here
- * for consistency with 32 bits, and to make it utterly trivial to
- * diagnose the problem should it ever surface.
+ * are not easy to detect. In the latter case it doesn't even *fail*
+ * reliably, so probing for it doesn't even work. Disable it completely
+ * unless we can find a reliable way to detect all the broken cases.
*/
static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
{
- const u32 nopl_signature = 0x888c53b1; /* Random number */
- u32 has_nopl = nopl_signature;
-
clear_cpu_cap(c, X86_FEATURE_NOPL);
- if (c->x86 >= 6) {
- asm volatile("\n"
- "1: .byte 0x0f,0x1f,0xc0\n" /* nopl %eax */
- "2:\n"
- " .section .fixup,\"ax\"\n"
- "3: xor %0,%0\n"
- " jmp 2b\n"
- " .previous\n"
- _ASM_EXTABLE(1b,3b)
- : "+a" (has_nopl));
-
- if (has_nopl == nopl_signature)
- set_cpu_cap(c, X86_FEATURE_NOPL);
- }
}
static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index f1685fb91fb..b8e05ee4f73 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -171,7 +171,7 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
}
if (c->x86 != 0xF) {
- printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to <cpufreq@lists.linux.org.uk>\n");
+ printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to <cpufreq@vger.kernel.org>\n");
return 0;
}
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index 15e13c01cc3..3b5f06423e7 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -26,7 +26,7 @@
#include <asm/cpufeature.h>
#define PFX "speedstep-centrino: "
-#define MAINTAINER "cpufreq@lists.linux.org.uk"
+#define MAINTAINER "cpufreq@vger.kernel.org"
#define dprintk(msg...) \
cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg)
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index cb7d3b6a80e..4e8d77f01ee 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -401,12 +401,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
tmp |= ~((1<<(hi - 1)) - 1);
if (tmp != mask_lo) {
- static int once = 1;
-
- if (once) {
- printk(KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n");
- once = 0;
- }
+ WARN_ONCE(1, KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n");
mask_lo = tmp;
}
}
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index 84c480bb371..4c4214690dd 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -405,9 +405,9 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
}
/* RED-PEN: base can be > 32bit */
len += seq_printf(seq,
- "reg%02i: base=0x%05lx000 (%4luMB), size=%4lu%cB: %s, count=%d\n",
+ "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n",
i, base, base >> (20 - PAGE_SHIFT), size, factor,
- mtrr_attrib_to_str(type), mtrr_usage_table[i]);
+ mtrr_usage_table[i], mtrr_attrib_to_str(type));
}
}
return 0;
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 58ac5d3d436..c78c04821ea 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -759,7 +759,8 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
/* take out UC ranges */
for (i = 0; i < num_var_ranges; i++) {
type = range_state[i].type;
- if (type != MTRR_TYPE_UNCACHABLE)
+ if (type != MTRR_TYPE_UNCACHABLE &&
+ type != MTRR_TYPE_WRPROT)
continue;
size = range_state[i].size_pfn;
if (!size)
@@ -834,7 +835,14 @@ static int __init enable_mtrr_cleanup_setup(char *str)
enable_mtrr_cleanup = 1;
return 0;
}
-early_param("enble_mtrr_cleanup", enable_mtrr_cleanup_setup);
+early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
+
+static int __init mtrr_cleanup_debug_setup(char *str)
+{
+ debug_print = 1;
+ return 0;
+}
+early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
struct var_mtrr_state {
unsigned long range_startk;
@@ -898,6 +906,27 @@ set_var_mtrr_all(unsigned int address_bits)
}
}
+static unsigned long to_size_factor(unsigned long sizek, char *factorp)
+{
+ char factor;
+ unsigned long base = sizek;
+
+ if (base & ((1<<10) - 1)) {
+ /* not MB alignment */
+ factor = 'K';
+ } else if (base & ((1<<20) - 1)){
+ factor = 'M';
+ base >>= 10;
+ } else {
+ factor = 'G';
+ base >>= 20;
+ }
+
+ *factorp = factor;
+
+ return base;
+}
+
static unsigned int __init
range_to_mtrr(unsigned int reg, unsigned long range_startk,
unsigned long range_sizek, unsigned char type)
@@ -919,13 +948,21 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk,
align = max_align;
sizek = 1 << align;
- if (debug_print)
+ if (debug_print) {
+ char start_factor = 'K', size_factor = 'K';
+ unsigned long start_base, size_base;
+
+ start_base = to_size_factor(range_startk, &start_factor),
+ size_base = to_size_factor(sizek, &size_factor),
+
printk(KERN_DEBUG "Setting variable MTRR %d, "
- "base: %ldMB, range: %ldMB, type %s\n",
- reg, range_startk >> 10, sizek >> 10,
+ "base: %ld%cB, range: %ld%cB, type %s\n",
+ reg, start_base, start_factor,
+ size_base, size_factor,
(type == MTRR_TYPE_UNCACHABLE)?"UC":
((type == MTRR_TYPE_WRBACK)?"WB":"Other")
);
+ }
save_var_mtrr(reg++, range_startk, sizek, type);
range_startk += sizek;
range_sizek -= sizek;
@@ -970,6 +1007,8 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
/* try to append some small hole */
range0_basek = state->range_startk;
range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
+
+ /* no increase */
if (range0_sizek == state->range_sizek) {
if (debug_print)
printk(KERN_DEBUG "rangeX: %016lx - %016lx\n",
@@ -980,13 +1019,40 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
return 0;
}
- range0_sizek -= chunk_sizek;
- if (range0_sizek && sizek) {
- while (range0_basek + range0_sizek > (basek + sizek)) {
- range0_sizek -= chunk_sizek;
- if (!range0_sizek)
- break;
- }
+ /* only cut back, when it is not the last */
+ if (sizek) {
+ while (range0_basek + range0_sizek > (basek + sizek)) {
+ if (range0_sizek >= chunk_sizek)
+ range0_sizek -= chunk_sizek;
+ else
+ range0_sizek = 0;
+
+ if (!range0_sizek)
+ break;
+ }
+ }
+
+second_try:
+ range_basek = range0_basek + range0_sizek;
+
+ /* one hole in the middle */
+ if (range_basek > basek && range_basek <= (basek + sizek))
+ second_sizek = range_basek - basek;
+
+ if (range0_sizek > state->range_sizek) {
+
+ /* one hole in middle or at end */
+ hole_sizek = range0_sizek - state->range_sizek - second_sizek;
+
+ /* hole size should be less than half of range0 size */
+ if (hole_sizek >= (range0_sizek >> 1) &&
+ range0_sizek >= chunk_sizek) {
+ range0_sizek -= chunk_sizek;
+ second_sizek = 0;
+ hole_sizek = 0;
+
+ goto second_try;
+ }
}
if (range0_sizek) {
@@ -996,50 +1062,28 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
(range0_basek + range0_sizek)<<10);
state->reg = range_to_mtrr(state->reg, range0_basek,
range0_sizek, MTRR_TYPE_WRBACK);
-
- }
-
- range_basek = range0_basek + range0_sizek;
- range_sizek = chunk_sizek;
-
- if (range_basek + range_sizek > basek &&
- range_basek + range_sizek <= (basek + sizek)) {
- /* one hole */
- second_basek = basek;
- second_sizek = range_basek + range_sizek - basek;
}
- /* if last piece, only could one hole near end */
- if ((second_basek || !basek) &&
- range_sizek - (state->range_sizek - range0_sizek) - second_sizek <
- (chunk_sizek >> 1)) {
- /*
- * one hole in middle (second_sizek is 0) or at end
- * (second_sizek is 0 )
- */
- hole_sizek = range_sizek - (state->range_sizek - range0_sizek)
- - second_sizek;
- hole_basek = range_basek + range_sizek - hole_sizek
- - second_sizek;
- } else {
- /* fallback for big hole, or several holes */
+ if (range0_sizek < state->range_sizek) {
+ /* need to handle left over */
range_sizek = state->range_sizek - range0_sizek;
- second_basek = 0;
- second_sizek = 0;
+
+ if (debug_print)
+ printk(KERN_DEBUG "range: %016lx - %016lx\n",
+ range_basek<<10,
+ (range_basek + range_sizek)<<10);
+ state->reg = range_to_mtrr(state->reg, range_basek,
+ range_sizek, MTRR_TYPE_WRBACK);
}
- if (debug_print)
- printk(KERN_DEBUG "range: %016lx - %016lx\n", range_basek<<10,
- (range_basek + range_sizek)<<10);
- state->reg = range_to_mtrr(state->reg, range_basek, range_sizek,
- MTRR_TYPE_WRBACK);
if (hole_sizek) {
+ hole_basek = range_basek - hole_sizek - second_sizek;
if (debug_print)
printk(KERN_DEBUG "hole: %016lx - %016lx\n",
- hole_basek<<10, (hole_basek + hole_sizek)<<10);
- state->reg = range_to_mtrr(state->reg, hole_basek, hole_sizek,
- MTRR_TYPE_UNCACHABLE);
-
+ hole_basek<<10,
+ (hole_basek + hole_sizek)<<10);
+ state->reg = range_to_mtrr(state->reg, hole_basek,
+ hole_sizek, MTRR_TYPE_UNCACHABLE);
}
return second_sizek;
@@ -1154,11 +1198,11 @@ struct mtrr_cleanup_result {
};
/*
- * gran_size: 1M, 2M, ..., 2G
- * chunk size: gran_size, ..., 4G
- * so we need (2+13)*6
+ * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G
+ * chunk size: gran_size, ..., 2G
+ * so we need (1+16)*8
*/
-#define NUM_RESULT 90
+#define NUM_RESULT 136
#define PSHIFT (PAGE_SHIFT - 10)
static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
@@ -1168,13 +1212,14 @@ static unsigned long __initdata min_loss_pfn[RANGE_NUM];
static int __init mtrr_cleanup(unsigned address_bits)
{
unsigned long extra_remove_base, extra_remove_size;
- unsigned long i, base, size, def, dummy;
+ unsigned long base, size, def, dummy;
mtrr_type type;
int nr_range, nr_range_new;
u64 chunk_size, gran_size;
unsigned long range_sums, range_sums_new;
int index_good;
int num_reg_good;
+ int i;
/* extra one for all 0 */
int num[MTRR_NUM_TYPES + 1];
@@ -1204,6 +1249,8 @@ static int __init mtrr_cleanup(unsigned address_bits)
continue;
if (!size)
type = MTRR_NUM_TYPES;
+ if (type == MTRR_TYPE_WRPROT)
+ type = MTRR_TYPE_UNCACHABLE;
num[type]++;
}
@@ -1216,23 +1263,57 @@ static int __init mtrr_cleanup(unsigned address_bits)
num_var_ranges - num[MTRR_NUM_TYPES])
return 0;
+ /* print original var MTRRs at first, for debugging: */
+ printk(KERN_DEBUG "original variable MTRRs\n");
+ for (i = 0; i < num_var_ranges; i++) {
+ char start_factor = 'K', size_factor = 'K';
+ unsigned long start_base, size_base;
+
+ size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
+ if (!size_base)
+ continue;
+
+ size_base = to_size_factor(size_base, &size_factor),
+ start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
+ start_base = to_size_factor(start_base, &start_factor),
+ type = range_state[i].type;
+
+ printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
+ i, start_base, start_factor,
+ size_base, size_factor,
+ (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
+ ((type == MTRR_TYPE_WRPROT) ? "WP" :
+ ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
+ );
+ }
+
memset(range, 0, sizeof(range));
extra_remove_size = 0;
- if (mtrr_tom2) {
- extra_remove_base = 1 << (32 - PAGE_SHIFT);
+ extra_remove_base = 1 << (32 - PAGE_SHIFT);
+ if (mtrr_tom2)
extra_remove_size =
(mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base;
- }
nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base,
extra_remove_size);
+ /*
+ * [0, 1M) should always be coverred by var mtrr with WB
+ * and fixed mtrrs should take effective before var mtrr for it
+ */
+ nr_range = add_range_with_merge(range, nr_range, 0,
+ (1ULL<<(20 - PAGE_SHIFT)) - 1);
+ /* sort the ranges */
+ sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
+
range_sums = sum_ranges(range, nr_range);
printk(KERN_INFO "total RAM coverred: %ldM\n",
range_sums >> (20 - PAGE_SHIFT));
if (mtrr_chunk_size && mtrr_gran_size) {
int num_reg;
+ char gran_factor, chunk_factor, lose_factor;
+ unsigned long gran_base, chunk_base, lose_base;
- debug_print = 1;
+ debug_print++;
/* convert ranges to var ranges state */
num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size,
mtrr_gran_size);
@@ -1256,34 +1337,48 @@ static int __init mtrr_cleanup(unsigned address_bits)
result[i].lose_cover_sizek =
(range_sums - range_sums_new) << PSHIFT;
- printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t",
- result[i].bad?"*BAD*":" ", result[i].gran_sizek >> 10,
- result[i].chunk_sizek >> 10);
- printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ldM \n",
+ gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
+ chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
+ lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
+ printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
+ result[i].bad?"*BAD*":" ",
+ gran_base, gran_factor, chunk_base, chunk_factor);
+ printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
result[i].num_reg, result[i].bad?"-":"",
- result[i].lose_cover_sizek >> 10);
+ lose_base, lose_factor);
if (!result[i].bad) {
set_var_mtrr_all(address_bits);
return 1;
}
printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
"will find optimal one\n");
- debug_print = 0;
+ debug_print--;
memset(result, 0, sizeof(result[0]));
}
i = 0;
memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
memset(result, 0, sizeof(result));
- for (gran_size = (1ULL<<20); gran_size < (1ULL<<32); gran_size <<= 1) {
- for (chunk_size = gran_size; chunk_size < (1ULL<<33);
+ for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
+ char gran_factor;
+ unsigned long gran_base;
+
+ if (debug_print)
+ gran_base = to_size_factor(gran_size >> 10, &gran_factor);
+
+ for (chunk_size = gran_size; chunk_size < (1ULL<<32);
chunk_size <<= 1) {
int num_reg;
- if (debug_print)
- printk(KERN_INFO
- "\ngran_size: %lldM chunk_size_size: %lldM\n",
- gran_size >> 20, chunk_size >> 20);
+ if (debug_print) {
+ char chunk_factor;
+ unsigned long chunk_base;
+
+ chunk_base = to_size_factor(chunk_size>>10, &chunk_factor),
+ printk(KERN_INFO "\n");
+ printk(KERN_INFO "gran_size: %ld%c chunk_size: %ld%c \n",
+ gran_base, gran_factor, chunk_base, chunk_factor);
+ }
if (i >= NUM_RESULT)
continue;
@@ -1326,12 +1421,18 @@ static int __init mtrr_cleanup(unsigned address_bits)
/* print out all */
for (i = 0; i < NUM_RESULT; i++) {
- printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t",
- result[i].bad?"*BAD* ":" ", result[i].gran_sizek >> 10,
- result[i].chunk_sizek >> 10);
- printk(KERN_CONT "num_reg: %d \tlose RAM: %s%ldM\n",
- result[i].num_reg, result[i].bad?"-":"",
- result[i].lose_cover_sizek >> 10);
+ char gran_factor, chunk_factor, lose_factor;
+ unsigned long gran_base, chunk_base, lose_base;
+
+ gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
+ chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
+ lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
+ printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
+ result[i].bad?"*BAD*":" ",
+ gran_base, gran_factor, chunk_base, chunk_factor);
+ printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
+ result[i].num_reg, result[i].bad?"-":"",
+ lose_base, lose_factor);
}
/* try to find the optimal index */
@@ -1339,10 +1440,8 @@ static int __init mtrr_cleanup(unsigned address_bits)
nr_mtrr_spare_reg = num_var_ranges - 1;
num_reg_good = -1;
for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
- if (!min_loss_pfn[i]) {
+ if (!min_loss_pfn[i])
num_reg_good = i;
- break;
- }
}
index_good = -1;
@@ -1358,21 +1457,26 @@ static int __init mtrr_cleanup(unsigned address_bits)
}
if (index_good != -1) {
+ char gran_factor, chunk_factor, lose_factor;
+ unsigned long gran_base, chunk_base, lose_base;
+
printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
i = index_good;
- printk(KERN_INFO "gran_size: %ldM \tchunk_size: %ldM \t",
- result[i].gran_sizek >> 10,
- result[i].chunk_sizek >> 10);
- printk(KERN_CONT "num_reg: %d \tlose RAM: %ldM\n",
- result[i].num_reg,
- result[i].lose_cover_sizek >> 10);
+ gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
+ chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
+ lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
+ printk(KERN_INFO "gran_size: %ld%c \tchunk_size: %ld%c \t",
+ gran_base, gran_factor, chunk_base, chunk_factor);
+ printk(KERN_CONT "num_reg: %d \tlose RAM: %ld%c\n",
+ result[i].num_reg, lose_base, lose_factor);
/* convert ranges to var ranges state */
chunk_size = result[i].chunk_sizek;
chunk_size <<= 10;
gran_size = result[i].gran_sizek;
gran_size <<= 10;
- debug_print = 1;
+ debug_print++;
x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
+ debug_print--;
set_var_mtrr_all(address_bits);
return 1;
}
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 05cc22dbd4f..6bff382094f 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -295,13 +295,19 @@ static int setup_k7_watchdog(unsigned nmi_hz)
/* setup the timer */
wrmsr(evntsel_msr, evntsel, 0);
write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz);
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- evntsel |= K7_EVNTSEL_ENABLE;
- wrmsr(evntsel_msr, evntsel, 0);
+ /* initialize the wd struct before enabling */
wd->perfctr_msr = perfctr_msr;
wd->evntsel_msr = evntsel_msr;
wd->cccr_msr = 0; /* unused */
+
+ /* ok, everything is initialized, announce that we're set */
+ cpu_nmi_set_wd_enabled();
+
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ evntsel |= K7_EVNTSEL_ENABLE;
+ wrmsr(evntsel_msr, evntsel, 0);
+
return 1;
}
@@ -379,13 +385,19 @@ static int setup_p6_watchdog(unsigned nmi_hz)
wrmsr(evntsel_msr, evntsel, 0);
nmi_hz = adjust_for_32bit_ctr(nmi_hz);
write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz);
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- evntsel |= P6_EVNTSEL0_ENABLE;
- wrmsr(evntsel_msr, evntsel, 0);
+ /* initialize the wd struct before enabling */
wd->perfctr_msr = perfctr_msr;
wd->evntsel_msr = evntsel_msr;
wd->cccr_msr = 0; /* unused */
+
+ /* ok, everything is initialized, announce that we're set */
+ cpu_nmi_set_wd_enabled();
+
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ evntsel |= P6_EVNTSEL0_ENABLE;
+ wrmsr(evntsel_msr, evntsel, 0);
+
return 1;
}
@@ -432,6 +444,27 @@ static const struct wd_ops p6_wd_ops = {
#define P4_CCCR_ENABLE (1 << 12)
#define P4_CCCR_OVF (1 << 31)
+#define P4_CONTROLS 18
+static unsigned int p4_controls[18] = {
+ MSR_P4_BPU_CCCR0,
+ MSR_P4_BPU_CCCR1,
+ MSR_P4_BPU_CCCR2,
+ MSR_P4_BPU_CCCR3,
+ MSR_P4_MS_CCCR0,
+ MSR_P4_MS_CCCR1,
+ MSR_P4_MS_CCCR2,
+ MSR_P4_MS_CCCR3,
+ MSR_P4_FLAME_CCCR0,
+ MSR_P4_FLAME_CCCR1,
+ MSR_P4_FLAME_CCCR2,
+ MSR_P4_FLAME_CCCR3,
+ MSR_P4_IQ_CCCR0,
+ MSR_P4_IQ_CCCR1,
+ MSR_P4_IQ_CCCR2,
+ MSR_P4_IQ_CCCR3,
+ MSR_P4_IQ_CCCR4,
+ MSR_P4_IQ_CCCR5,
+};
/*
* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
* CRU_ESCR0 (with any non-null event selector) through a complemented
@@ -473,6 +506,26 @@ static int setup_p4_watchdog(unsigned nmi_hz)
evntsel_msr = MSR_P4_CRU_ESCR0;
cccr_msr = MSR_P4_IQ_CCCR0;
cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
+
+ /*
+ * If we're on the kdump kernel or other situation, we may
+ * still have other performance counter registers set to
+ * interrupt and they'll keep interrupting forever because
+ * of the P4_CCCR_OVF quirk. So we need to ACK all the
+ * pending interrupts and disable all the registers here,
+ * before reenabling the NMI delivery. Refer to p4_rearm()
+ * about the P4_CCCR_OVF quirk.
+ */
+ if (reset_devices) {
+ unsigned int low, high;
+ int i;
+
+ for (i = 0; i < P4_CONTROLS; i++) {
+ rdmsr(p4_controls[i], low, high);
+ low &= ~(P4_CCCR_ENABLE | P4_CCCR_OVF);
+ wrmsr(p4_controls[i], low, high);
+ }
+ }
} else {
/* logical cpu 1 */
perfctr_msr = MSR_P4_IQ_PERFCTR1;
@@ -499,12 +552,17 @@ static int setup_p4_watchdog(unsigned nmi_hz)
wrmsr(evntsel_msr, evntsel, 0);
wrmsr(cccr_msr, cccr_val, 0);
write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz);
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- cccr_val |= P4_CCCR_ENABLE;
- wrmsr(cccr_msr, cccr_val, 0);
+
wd->perfctr_msr = perfctr_msr;
wd->evntsel_msr = evntsel_msr;
wd->cccr_msr = cccr_msr;
+
+ /* ok, everything is initialized, announce that we're set */
+ cpu_nmi_set_wd_enabled();
+
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ cccr_val |= P4_CCCR_ENABLE;
+ wrmsr(cccr_msr, cccr_val, 0);
return 1;
}
@@ -620,13 +678,17 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz)
wrmsr(evntsel_msr, evntsel, 0);
nmi_hz = adjust_for_32bit_ctr(nmi_hz);
write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz);
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
- wrmsr(evntsel_msr, evntsel, 0);
wd->perfctr_msr = perfctr_msr;
wd->evntsel_msr = evntsel_msr;
wd->cccr_msr = 0; /* unused */
+
+ /* ok, everything is initialized, announce that we're set */
+ cpu_nmi_set_wd_enabled();
+
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+ wrmsr(evntsel_msr, evntsel, 0);
intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
return 1;
}
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 8e9cd6a8ec1..6a44d646599 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -36,7 +36,6 @@
#include <linux/smp_lock.h>
#include <linux/major.h>
#include <linux/fs.h>
-#include <linux/smp_lock.h>
#include <linux/device.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 15e6c6bc4a4..e90a60ef10c 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -7,9 +7,8 @@
#include <linux/errno.h>
#include <linux/crash_dump.h>
-
-#include <asm/uaccess.h>
-#include <asm/io.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
/**
* copy_oldmem_page - copy one page from "oldmem"
@@ -25,7 +24,7 @@
* in the current kernel. We stitch up a pte, similar to kmap_atomic.
*/
ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
- size_t csize, unsigned long offset, int userbuf)
+ size_t csize, unsigned long offset, int userbuf)
{
void *vaddr;
@@ -33,14 +32,16 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
return 0;
vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE);
+ if (!vaddr)
+ return -ENOMEM;
if (userbuf) {
- if (copy_to_user(buf, (vaddr + offset), csize)) {
+ if (copy_to_user(buf, vaddr + offset, csize)) {
iounmap(vaddr);
return -EFAULT;
}
} else
- memcpy(buf, (vaddr + offset), csize);
+ memcpy(buf, vaddr + offset, csize);
iounmap(vaddr);
return csize;
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index ab21c270bfa..2b69994fd3a 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -479,12 +479,12 @@ static int ds_release(struct task_struct *task, enum ds_qualifier qual)
goto out;
kfree(context->buffer[qual]);
- context->buffer[qual] = 0;
+ context->buffer[qual] = NULL;
current->mm->total_vm -= context->pages[qual];
current->mm->locked_vm -= context->pages[qual];
context->pages[qual] = 0;
- context->owner[qual] = 0;
+ context->owner[qual] = NULL;
/*
* we put the context twice:
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 4353cf5e6fa..24bb5faf5ef 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -95,6 +95,20 @@ static void __init nvidia_bugs(int num, int slot, int func)
}
+#ifdef CONFIG_DMAR
+static void __init intel_g33_dmar(int num, int slot, int func)
+{
+ struct acpi_table_header *dmar_tbl;
+ acpi_status status;
+
+ status = acpi_get_table(ACPI_SIG_DMAR, 0, &dmar_tbl);
+ if (ACPI_SUCCESS(status)) {
+ printk(KERN_INFO "BIOS BUG: DMAR advertised on Intel G31/G33 chipset -- ignoring\n");
+ dmar_disabled = 1;
+ }
+}
+#endif
+
#define QFLAG_APPLY_ONCE 0x1
#define QFLAG_APPLIED 0x2
#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED)
@@ -114,6 +128,10 @@ static struct chipset early_qrk[] __initdata = {
PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs },
{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config },
+#ifdef CONFIG_DMAR
+ { PCI_VENDOR_ID_INTEL, 0x29c0,
+ PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, intel_g33_dmar },
+#endif
{}
};
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 06cc8d4254b..945a31cdd81 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -414,9 +414,11 @@ void __init efi_init(void)
if (memmap.map == NULL)
printk(KERN_ERR "Could not map the EFI memory map!\n");
memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
+
if (memmap.desc_size != sizeof(efi_memory_desc_t))
- printk(KERN_WARNING "Kernel-defined memdesc"
- "doesn't match the one from EFI!\n");
+ printk(KERN_WARNING
+ "Kernel-defined memdesc doesn't match the one from EFI!\n");
+
if (add_efi_memmap)
do_add_efi_memmap();
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 9bfc4d72fb2..d16084f9064 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -108,12 +108,11 @@ void __init x86_64_start_kernel(char * real_mode_data)
}
load_idt((const struct desc_ptr *)&idt_descr);
- early_printk("Kernel alive\n");
+ if (console_loglevel == 10)
+ early_printk("Kernel alive\n");
x86_64_init_pda();
- early_printk("Kernel really alive\n");
-
x86_64_start_reservations(real_mode_data);
}
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index a7010c3a377..e835b4eea70 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -172,10 +172,6 @@ num_subarch_entries = (. - subarch_entries) / 4
*
* Note that the stack is not yet set up!
*/
-#define PTE_ATTR 0x007 /* PRESENT+RW+USER */
-#define PDE_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */
-#define PGD_ATTR 0x001 /* PRESENT (no other attributes) */
-
default_entry:
#ifdef CONFIG_X86_PAE
@@ -196,9 +192,9 @@ default_entry:
movl $pa(pg0), %edi
movl %edi, pa(init_pg_tables_start)
movl $pa(swapper_pg_pmd), %edx
- movl $PTE_ATTR, %eax
+ movl $PTE_IDENT_ATTR, %eax
10:
- leal PDE_ATTR(%edi),%ecx /* Create PMD entry */
+ leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */
movl %ecx,(%edx) /* Store PMD entry */
/* Upper half already zero */
addl $8,%edx
@@ -215,7 +211,7 @@ default_entry:
* End condition: we must map up to and including INIT_MAP_BEYOND_END
* bytes beyond the end of our own page tables.
*/
- leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp
+ leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
cmpl %ebp,%eax
jb 10b
1:
@@ -224,7 +220,7 @@ default_entry:
movl %eax, pa(max_pfn_mapped)
/* Do early initialization of the fixmap area */
- movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax
+ movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8)
#else /* Not PAE */
@@ -233,9 +229,9 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
movl $pa(pg0), %edi
movl %edi, pa(init_pg_tables_start)
movl $pa(swapper_pg_dir), %edx
- movl $PTE_ATTR, %eax
+ movl $PTE_IDENT_ATTR, %eax
10:
- leal PDE_ATTR(%edi),%ecx /* Create PDE entry */
+ leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */
movl %ecx,(%edx) /* Store identity PDE entry */
movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */
addl $4,%edx
@@ -249,7 +245,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
* bytes beyond the end of our own page tables; the +0x007 is
* the attribute bits
*/
- leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp
+ leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
cmpl %ebp,%eax
jb 10b
movl %edi,pa(init_pg_tables_end)
@@ -257,7 +253,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
movl %eax, pa(max_pfn_mapped)
/* Do early initialization of the fixmap area */
- movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax
+ movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
movl %eax,pa(swapper_pg_dir+0xffc)
#endif
jmp 3f
@@ -634,19 +630,19 @@ ENTRY(empty_zero_page)
/* Page-aligned for the benefit of paravirt? */
.align PAGE_SIZE_asm
ENTRY(swapper_pg_dir)
- .long pa(swapper_pg_pmd+PGD_ATTR),0 /* low identity map */
+ .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */
# if KPMDS == 3
- .long pa(swapper_pg_pmd+PGD_ATTR),0
- .long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0
- .long pa(swapper_pg_pmd+PGD_ATTR+0x2000),0
+ .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
+ .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0
+ .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x2000),0
# elif KPMDS == 2
.long 0,0
- .long pa(swapper_pg_pmd+PGD_ATTR),0
- .long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0
+ .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
+ .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0
# elif KPMDS == 1
.long 0,0
.long 0,0
- .long pa(swapper_pg_pmd+PGD_ATTR),0
+ .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
# else
# error "Kernel PMDs should be 1, 2 or 3"
# endif
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index db3280afe88..26cfdc1d7c7 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -110,7 +110,7 @@ startup_64:
movq %rdi, %rax
shrq $PMD_SHIFT, %rax
andq $(PTRS_PER_PMD - 1), %rax
- leaq __PAGE_KERNEL_LARGE_EXEC(%rdi), %rdx
+ leaq __PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx
leaq level2_spare_pgt(%rip), %rbx
movq %rdx, 0(%rbx, %rax, 8)
ident_complete:
@@ -374,7 +374,7 @@ NEXT_PAGE(level2_ident_pgt)
/* Since I easily can, map the first 1G.
* Don't set NX because code runs from these pages.
*/
- PMDS(0, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD)
+ PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
NEXT_PAGE(level2_kernel_pgt)
/*
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 1cf8c1fcc08..b71e02d42f4 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -325,7 +325,7 @@ skip:
for_each_online_cpu(j)
seq_printf(p, "%10u ",
per_cpu(irq_stat,j).irq_call_count);
- seq_printf(p, " function call interrupts\n");
+ seq_printf(p, " Function call interrupts\n");
seq_printf(p, "TLB: ");
for_each_online_cpu(j)
seq_printf(p, "%10u ",
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 1f78b238d8d..f065fe9071b 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -129,7 +129,7 @@ skip:
seq_printf(p, "CAL: ");
for_each_online_cpu(j)
seq_printf(p, "%10u ", cpu_pda(j)->irq_call_count);
- seq_printf(p, " function call interrupts\n");
+ seq_printf(p, " Function call interrupts\n");
seq_printf(p, "TLB: ");
for_each_online_cpu(j)
seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count);
diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c
index 7377ccb2133..304d8bad655 100644
--- a/arch/x86/kernel/k8.c
+++ b/arch/x86/kernel/k8.c
@@ -16,8 +16,9 @@ EXPORT_SYMBOL(num_k8_northbridges);
static u32 *flush_words;
struct pci_device_id k8_nb_ids[] = {
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_11H_NB_MISC) },
{}
};
EXPORT_SYMBOL(k8_nb_ids);
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index f2d43bc7551..ff7d3b0124f 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -139,6 +139,7 @@ static int __init create_setup_data_nodes(struct dentry *parent)
if (PageHighMem(pg)) {
data = ioremap_cache(pa_data, sizeof(*data));
if (!data) {
+ kfree(node);
error = -ENXIO;
goto err_dir;
}
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index f47f0eb886b..10435a120d2 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -69,6 +69,9 @@ static int gdb_x86vector = -1;
*/
void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
{
+#ifndef CONFIG_X86_32
+ u32 *gdb_regs32 = (u32 *)gdb_regs;
+#endif
gdb_regs[GDB_AX] = regs->ax;
gdb_regs[GDB_BX] = regs->bx;
gdb_regs[GDB_CX] = regs->cx;
@@ -76,9 +79,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
gdb_regs[GDB_SI] = regs->si;
gdb_regs[GDB_DI] = regs->di;
gdb_regs[GDB_BP] = regs->bp;
- gdb_regs[GDB_PS] = regs->flags;
gdb_regs[GDB_PC] = regs->ip;
#ifdef CONFIG_X86_32
+ gdb_regs[GDB_PS] = regs->flags;
gdb_regs[GDB_DS] = regs->ds;
gdb_regs[GDB_ES] = regs->es;
gdb_regs[GDB_CS] = regs->cs;
@@ -94,6 +97,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
gdb_regs[GDB_R13] = regs->r13;
gdb_regs[GDB_R14] = regs->r14;
gdb_regs[GDB_R15] = regs->r15;
+ gdb_regs32[GDB_PS] = regs->flags;
+ gdb_regs32[GDB_CS] = regs->cs;
+ gdb_regs32[GDB_SS] = regs->ss;
#endif
gdb_regs[GDB_SP] = regs->sp;
}
@@ -112,6 +118,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
*/
void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
{
+#ifndef CONFIG_X86_32
+ u32 *gdb_regs32 = (u32 *)gdb_regs;
+#endif
gdb_regs[GDB_AX] = 0;
gdb_regs[GDB_BX] = 0;
gdb_regs[GDB_CX] = 0;
@@ -129,8 +138,10 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
gdb_regs[GDB_FS] = 0xFFFF;
gdb_regs[GDB_GS] = 0xFFFF;
#else
- gdb_regs[GDB_PS] = *(unsigned long *)(p->thread.sp + 8);
- gdb_regs[GDB_PC] = 0;
+ gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8);
+ gdb_regs32[GDB_CS] = __KERNEL_CS;
+ gdb_regs32[GDB_SS] = __KERNEL_DS;
+ gdb_regs[GDB_PC] = p->thread.ip;
gdb_regs[GDB_R8] = 0;
gdb_regs[GDB_R9] = 0;
gdb_regs[GDB_R10] = 0;
@@ -153,6 +164,9 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
*/
void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
{
+#ifndef CONFIG_X86_32
+ u32 *gdb_regs32 = (u32 *)gdb_regs;
+#endif
regs->ax = gdb_regs[GDB_AX];
regs->bx = gdb_regs[GDB_BX];
regs->cx = gdb_regs[GDB_CX];
@@ -160,9 +174,9 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
regs->si = gdb_regs[GDB_SI];
regs->di = gdb_regs[GDB_DI];
regs->bp = gdb_regs[GDB_BP];
- regs->flags = gdb_regs[GDB_PS];
regs->ip = gdb_regs[GDB_PC];
#ifdef CONFIG_X86_32
+ regs->flags = gdb_regs[GDB_PS];
regs->ds = gdb_regs[GDB_DS];
regs->es = gdb_regs[GDB_ES];
regs->cs = gdb_regs[GDB_CS];
@@ -175,6 +189,9 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
regs->r13 = gdb_regs[GDB_R13];
regs->r14 = gdb_regs[GDB_R14];
regs->r15 = gdb_regs[GDB_R15];
+ regs->flags = gdb_regs32[GDB_PS];
+ regs->cs = gdb_regs32[GDB_CS];
+ regs->ss = gdb_regs32[GDB_SS];
#endif
}
@@ -378,10 +395,8 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
if (remcomInBuffer[0] == 's') {
linux_regs->flags |= X86_EFLAGS_TF;
kgdb_single_step = 1;
- if (kgdb_contthread) {
- atomic_set(&kgdb_cpu_doing_single_step,
- raw_smp_processor_id());
- }
+ atomic_set(&kgdb_cpu_doing_single_step,
+ raw_smp_processor_id());
}
get_debugreg(dr6, 6);
@@ -440,12 +455,7 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
return NOTIFY_DONE;
case DIE_NMI_IPI:
- if (atomic_read(&kgdb_active) != -1) {
- /* KGDB CPU roundup */
- kgdb_nmicallback(raw_smp_processor_id(), regs);
- was_in_debug_nmi[raw_smp_processor_id()] = 1;
- touch_nmi_watchdog();
- }
+ /* Just ignore, we will handle the roundup on DIE_NMI. */
return NOTIFY_DONE;
case DIE_NMIUNKNOWN:
@@ -466,9 +476,15 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
case DIE_DEBUG:
if (atomic_read(&kgdb_cpu_doing_single_step) ==
- raw_smp_processor_id() &&
- user_mode(regs))
- return single_step_cont(regs, args);
+ raw_smp_processor_id()) {
+ if (user_mode(regs))
+ return single_step_cont(regs, args);
+ break;
+ } else if (test_thread_flag(TIF_SINGLESTEP))
+ /* This means a user thread is single stepping
+ * a system call which should be ignored
+ */
+ return NOTIFY_DONE;
/* fall through */
default:
if (user_mode(regs))
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 8b7a3cf37d2..478bca986ec 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -178,7 +178,7 @@ static void kvm_flush_tlb(void)
kvm_deferred_mmu_op(&ftlb, sizeof ftlb);
}
-static void kvm_release_pt(u32 pfn)
+static void kvm_release_pt(unsigned long pfn)
{
struct kvm_mmu_op_release_pt rpt = {
.header.op = KVM_MMU_OP_RELEASE_PT,
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index abb78a2cc4a..2c97f07f1c2 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -299,6 +299,15 @@ void acpi_nmi_disable(void)
on_each_cpu(__acpi_nmi_disable, NULL, 1);
}
+/*
+ * This function is called as soon the LAPIC NMI watchdog driver has everything
+ * in place and it's ready to check if the NMIs belong to the NMI watchdog
+ */
+void cpu_nmi_set_wd_enabled(void)
+{
+ __get_cpu_var(wd_enabled) = 1;
+}
+
void setup_apic_nmi_watchdog(void *unused)
{
if (__get_cpu_var(wd_enabled))
@@ -311,8 +320,6 @@ void setup_apic_nmi_watchdog(void *unused)
switch (nmi_watchdog) {
case NMI_LOCAL_APIC:
- /* enable it before to avoid race with handler */
- __get_cpu_var(wd_enabled) = 1;
if (lapic_watchdog_init(nmi_hz) < 0) {
__get_cpu_var(wd_enabled) = 0;
return;
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c
index 3e667227480..7a13fac63a1 100644
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/kernel/olpc.c
@@ -190,12 +190,12 @@ EXPORT_SYMBOL_GPL(olpc_ec_cmd);
static void __init platform_detect(void)
{
size_t propsize;
- u32 rev;
+ __be32 rev;
if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4,
&propsize) || propsize != 4) {
printk(KERN_ERR "ofw: getprop call failed!\n");
- rev = 0;
+ rev = cpu_to_be32(0);
}
olpc_platform_info.boardrev = be32_to_cpu(rev);
}
@@ -203,7 +203,7 @@ static void __init platform_detect(void)
static void __init platform_detect(void)
{
/* stopgap until OFW support is added to the kernel */
- olpc_platform_info.boardrev = be32_to_cpu(0xc2);
+ olpc_platform_info.boardrev = 0xc2;
}
#endif
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index 58262218781..9fe644f4861 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -23,7 +23,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
start = start_##ops##_##x; \
end = end_##ops##_##x; \
goto patch_site
- switch(type) {
+ switch (type) {
PATCH_SITE(pv_irq_ops, irq_disable);
PATCH_SITE(pv_irq_ops, irq_enable);
PATCH_SITE(pv_irq_ops, restore_fl);
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index dcdac6c826e..080d1d27f37 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -261,7 +261,7 @@ static void iommu_range_reserve(struct iommu_table *tbl,
badbit, tbl, start_addr, npages);
}
- set_bit_string(tbl->it_map, index, npages);
+ iommu_area_reserve(tbl->it_map, index, npages);
spin_unlock_irqrestore(&tbl->it_lock, flags);
}
@@ -491,6 +491,8 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size,
npages = size >> PAGE_SHIFT;
order = get_order(size);
+ flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
+
/* alloc enough pages (and possibly more) */
ret = (void *)__get_free_pages(flag, order);
if (!ret)
@@ -510,8 +512,22 @@ error:
return ret;
}
+static void calgary_free_coherent(struct device *dev, size_t size,
+ void *vaddr, dma_addr_t dma_handle)
+{
+ unsigned int npages;
+ struct iommu_table *tbl = find_iommu_table(dev);
+
+ size = PAGE_ALIGN(size);
+ npages = size >> PAGE_SHIFT;
+
+ iommu_free(tbl, dma_handle, npages);
+ free_pages((unsigned long)vaddr, get_order(size));
+}
+
static struct dma_mapping_ops calgary_dma_ops = {
.alloc_coherent = calgary_alloc_coherent,
+ .free_coherent = calgary_free_coherent,
.map_single = calgary_map_single,
.unmap_single = calgary_unmap_single,
.map_sg = calgary_map_sg,
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 87d4d6964ec..0a3824e837b 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -41,11 +41,12 @@ EXPORT_SYMBOL(bad_dma_address);
/* Dummy device used for NULL arguments (normally ISA). Better would
be probably a smaller DMA mask, but this is bug-to-bug compatible
to older i386. */
-struct device fallback_dev = {
+struct device x86_dma_fallback_dev = {
.bus_id = "fallback device",
.coherent_dma_mask = DMA_32BIT_MASK,
- .dma_mask = &fallback_dev.coherent_dma_mask,
+ .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask,
};
+EXPORT_SYMBOL(x86_dma_fallback_dev);
int dma_set_mask(struct device *dev, u64 mask)
{
@@ -82,7 +83,7 @@ void __init dma32_reserve_bootmem(void)
* using 512M as goal
*/
align = 64ULL<<20;
- size = round_up(dma32_bootmem_size, align);
+ size = roundup(dma32_bootmem_size, align);
dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
512ULL<<20);
if (dma32_bootmem_ptr)
@@ -133,6 +134,37 @@ unsigned long iommu_num_pages(unsigned long addr, unsigned long len)
EXPORT_SYMBOL(iommu_num_pages);
#endif
+void *dma_generic_alloc_coherent(struct device *dev, size_t size,
+ dma_addr_t *dma_addr, gfp_t flag)
+{
+ unsigned long dma_mask;
+ struct page *page;
+ dma_addr_t addr;
+
+ dma_mask = dma_alloc_coherent_mask(dev, flag);
+
+ flag |= __GFP_ZERO;
+again:
+ page = alloc_pages_node(dev_to_node(dev), flag, get_order(size));
+ if (!page)
+ return NULL;
+
+ addr = page_to_phys(page);
+ if (!is_buffer_dma_capable(dma_mask, addr, size)) {
+ __free_pages(page, get_order(size));
+
+ if (dma_mask < DMA_32BIT_MASK && !(flag & GFP_DMA)) {
+ flag = (flag & ~GFP_DMA32) | GFP_DMA;
+ goto again;
+ }
+
+ return NULL;
+ }
+
+ *dma_addr = addr;
+ return page_address(page);
+}
+
/*
* See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
* documentation.
@@ -241,147 +273,6 @@ int dma_supported(struct device *dev, u64 mask)
}
EXPORT_SYMBOL(dma_supported);
-/* Allocate DMA memory on node near device */
-static noinline struct page *
-dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
-{
- int node;
-
- node = dev_to_node(dev);
-
- return alloc_pages_node(node, gfp, order);
-}
-
-/*
- * Allocate memory for a coherent mapping.
- */
-void *
-dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
- gfp_t gfp)
-{
- struct dma_mapping_ops *ops = get_dma_ops(dev);
- void *memory = NULL;
- struct page *page;
- unsigned long dma_mask = 0;
- dma_addr_t bus;
- int noretry = 0;
-
- /* ignore region specifiers */
- gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
-
- if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
- return memory;
-
- if (!dev) {
- dev = &fallback_dev;
- gfp |= GFP_DMA;
- }
- dma_mask = dev->coherent_dma_mask;
- if (dma_mask == 0)
- dma_mask = (gfp & GFP_DMA) ? DMA_24BIT_MASK : DMA_32BIT_MASK;
-
- /* Device not DMA able */
- if (dev->dma_mask == NULL)
- return NULL;
-
- /* Don't invoke OOM killer or retry in lower 16MB DMA zone */
- if (gfp & __GFP_DMA)
- noretry = 1;
-
-#ifdef CONFIG_X86_64
- /* Why <=? Even when the mask is smaller than 4GB it is often
- larger than 16MB and in this case we have a chance of
- finding fitting memory in the next higher zone first. If
- not retry with true GFP_DMA. -AK */
- if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
- gfp |= GFP_DMA32;
- if (dma_mask < DMA_32BIT_MASK)
- noretry = 1;
- }
-#endif
-
- again:
- page = dma_alloc_pages(dev,
- noretry ? gfp | __GFP_NORETRY : gfp, get_order(size));
- if (page == NULL)
- return NULL;
-
- {
- int high, mmu;
- bus = page_to_phys(page);
- memory = page_address(page);
- high = (bus + size) >= dma_mask;
- mmu = high;
- if (force_iommu && !(gfp & GFP_DMA))
- mmu = 1;
- else if (high) {
- free_pages((unsigned long)memory,
- get_order(size));
-
- /* Don't use the 16MB ZONE_DMA unless absolutely
- needed. It's better to use remapping first. */
- if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
- gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
- goto again;
- }
-
- /* Let low level make its own zone decisions */
- gfp &= ~(GFP_DMA32|GFP_DMA);
-
- if (ops->alloc_coherent)
- return ops->alloc_coherent(dev, size,
- dma_handle, gfp);
- return NULL;
- }
-
- memset(memory, 0, size);
- if (!mmu) {
- *dma_handle = bus;
- return memory;
- }
- }
-
- if (ops->alloc_coherent) {
- free_pages((unsigned long)memory, get_order(size));
- gfp &= ~(GFP_DMA|GFP_DMA32);
- return ops->alloc_coherent(dev, size, dma_handle, gfp);
- }
-
- if (ops->map_simple) {
- *dma_handle = ops->map_simple(dev, virt_to_phys(memory),
- size,
- PCI_DMA_BIDIRECTIONAL);
- if (*dma_handle != bad_dma_address)
- return memory;
- }
-
- if (panic_on_overflow)
- panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n",
- (unsigned long)size);
- free_pages((unsigned long)memory, get_order(size));
- return NULL;
-}
-EXPORT_SYMBOL(dma_alloc_coherent);
-
-/*
- * Unmap coherent memory.
- * The caller must ensure that the device has finished accessing the mapping.
- */
-void dma_free_coherent(struct device *dev, size_t size,
- void *vaddr, dma_addr_t bus)
-{
- struct dma_mapping_ops *ops = get_dma_ops(dev);
-
- int order = get_order(size);
- WARN_ON(irqs_disabled()); /* for portability */
- if (dma_release_from_coherent(dev, order, vaddr))
- return;
- if (ops->unmap_single)
- ops->unmap_single(dev, bus, size, 0);
- free_pages((unsigned long)vaddr, order);
-}
-EXPORT_SYMBOL(dma_free_coherent);
-
static int __init pci_iommu_init(void)
{
calgary_iommu_init();
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index 49285f8fd4d..145f1c83369 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -27,8 +27,8 @@
#include <linux/scatterlist.h>
#include <linux/iommu-helper.h>
#include <linux/sysdev.h>
+#include <linux/io.h>
#include <asm/atomic.h>
-#include <asm/io.h>
#include <asm/mtrr.h>
#include <asm/pgtable.h>
#include <asm/proto.h>
@@ -80,9 +80,10 @@ AGPEXTERN int agp_memory_reserved;
AGPEXTERN __u32 *agp_gatt_table;
static unsigned long next_bit; /* protected by iommu_bitmap_lock */
-static int need_flush; /* global flush state. set for each gart wrap */
+static bool need_flush; /* global flush state. set for each gart wrap */
-static unsigned long alloc_iommu(struct device *dev, int size)
+static unsigned long alloc_iommu(struct device *dev, int size,
+ unsigned long align_mask)
{
unsigned long offset, flags;
unsigned long boundary_size;
@@ -90,26 +91,27 @@ static unsigned long alloc_iommu(struct device *dev, int size)
base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev),
PAGE_SIZE) >> PAGE_SHIFT;
- boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
+ boundary_size = ALIGN((unsigned long long)dma_get_seg_boundary(dev) + 1,
PAGE_SIZE) >> PAGE_SHIFT;
spin_lock_irqsave(&iommu_bitmap_lock, flags);
offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit,
- size, base_index, boundary_size, 0);
+ size, base_index, boundary_size, align_mask);
if (offset == -1) {
- need_flush = 1;
+ need_flush = true;
offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0,
- size, base_index, boundary_size, 0);
+ size, base_index, boundary_size,
+ align_mask);
}
if (offset != -1) {
next_bit = offset+size;
if (next_bit >= iommu_pages) {
next_bit = 0;
- need_flush = 1;
+ need_flush = true;
}
}
if (iommu_fullflush)
- need_flush = 1;
+ need_flush = true;
spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
return offset;
@@ -134,7 +136,7 @@ static void flush_gart(void)
spin_lock_irqsave(&iommu_bitmap_lock, flags);
if (need_flush) {
k8_flush_garts();
- need_flush = 0;
+ need_flush = false;
}
spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
}
@@ -173,7 +175,8 @@ static void dump_leak(void)
iommu_leak_pages);
for (i = 0; i < iommu_leak_pages; i += 2) {
printk(KERN_DEBUG "%lu: ", iommu_pages-i);
- printk_address((unsigned long) iommu_leak_tab[iommu_pages-i], 0);
+ printk_address((unsigned long) iommu_leak_tab[iommu_pages-i],
+ 0);
printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' ');
}
printk(KERN_DEBUG "\n");
@@ -212,34 +215,24 @@ static void iommu_full(struct device *dev, size_t size, int dir)
static inline int
need_iommu(struct device *dev, unsigned long addr, size_t size)
{
- u64 mask = *dev->dma_mask;
- int high = addr + size > mask;
- int mmu = high;
-
- if (force_iommu)
- mmu = 1;
-
- return mmu;
+ return force_iommu ||
+ !is_buffer_dma_capable(*dev->dma_mask, addr, size);
}
static inline int
nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
{
- u64 mask = *dev->dma_mask;
- int high = addr + size > mask;
- int mmu = high;
-
- return mmu;
+ return !is_buffer_dma_capable(*dev->dma_mask, addr, size);
}
/* Map a single continuous physical area into the IOMMU.
* Caller needs to check if the iommu is needed and flush.
*/
static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
- size_t size, int dir)
+ size_t size, int dir, unsigned long align_mask)
{
unsigned long npages = iommu_num_pages(phys_mem, size);
- unsigned long iommu_page = alloc_iommu(dev, npages);
+ unsigned long iommu_page = alloc_iommu(dev, npages, align_mask);
int i;
if (iommu_page == -1) {
@@ -259,16 +252,6 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
}
-static dma_addr_t
-gart_map_simple(struct device *dev, phys_addr_t paddr, size_t size, int dir)
-{
- dma_addr_t map = dma_map_area(dev, paddr, size, dir);
-
- flush_gart();
-
- return map;
-}
-
/* Map a single area into the IOMMU */
static dma_addr_t
gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir)
@@ -276,12 +259,13 @@ gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir)
unsigned long bus;
if (!dev)
- dev = &fallback_dev;
+ dev = &x86_dma_fallback_dev;
if (!need_iommu(dev, paddr, size))
return paddr;
- bus = gart_map_simple(dev, paddr, size, dir);
+ bus = dma_map_area(dev, paddr, size, dir, 0);
+ flush_gart();
return bus;
}
@@ -340,7 +324,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
unsigned long addr = sg_phys(s);
if (nonforced_iommu(dev, addr, s->length)) {
- addr = dma_map_area(dev, addr, s->length, dir);
+ addr = dma_map_area(dev, addr, s->length, dir, 0);
if (addr == bad_dma_address) {
if (i > 0)
gart_unmap_sg(dev, sg, i, dir);
@@ -362,7 +346,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
int nelems, struct scatterlist *sout,
unsigned long pages)
{
- unsigned long iommu_start = alloc_iommu(dev, pages);
+ unsigned long iommu_start = alloc_iommu(dev, pages, 0);
unsigned long iommu_page = iommu_start;
struct scatterlist *s;
int i;
@@ -427,7 +411,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
return 0;
if (!dev)
- dev = &fallback_dev;
+ dev = &x86_dma_fallback_dev;
out = 0;
start = 0;
@@ -499,6 +483,46 @@ error:
return 0;
}
+/* allocate and map a coherent mapping */
+static void *
+gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
+ gfp_t flag)
+{
+ dma_addr_t paddr;
+ unsigned long align_mask;
+ struct page *page;
+
+ if (force_iommu && !(flag & GFP_DMA)) {
+ flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
+ page = alloc_pages(flag | __GFP_ZERO, get_order(size));
+ if (!page)
+ return NULL;
+
+ align_mask = (1UL << get_order(size)) - 1;
+ paddr = dma_map_area(dev, page_to_phys(page), size,
+ DMA_BIDIRECTIONAL, align_mask);
+
+ flush_gart();
+ if (paddr != bad_dma_address) {
+ *dma_addr = paddr;
+ return page_address(page);
+ }
+ __free_pages(page, get_order(size));
+ } else
+ return dma_generic_alloc_coherent(dev, size, dma_addr, flag);
+
+ return NULL;
+}
+
+/* free a coherent mapping */
+static void
+gart_free_coherent(struct device *dev, size_t size, void *vaddr,
+ dma_addr_t dma_addr)
+{
+ gart_unmap_single(dev, dma_addr, size, DMA_BIDIRECTIONAL);
+ free_pages((unsigned long)vaddr, get_order(size));
+}
+
static int no_agp;
static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
@@ -626,7 +650,6 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
struct pci_dev *dev;
void *gatt;
int i, error;
- unsigned long start_pfn, end_pfn;
printk(KERN_INFO "PCI-DMA: Disabling AGP.\n");
aper_size = aper_base = info->aper_size = 0;
@@ -650,13 +673,13 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
info->aper_size = aper_size >> 20;
gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32);
- gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size));
+ gatt = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(gatt_size));
if (!gatt)
panic("Cannot allocate GATT table");
if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT))
panic("Could not set GART PTEs to uncacheable pages");
- memset(gatt, 0, gatt_size);
agp_gatt_table = gatt;
enable_gart_translations();
@@ -665,19 +688,14 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
if (!error)
error = sysdev_register(&device_gart);
if (error)
- panic("Could not register gart_sysdev -- would corrupt data on next suspend");
+ panic("Could not register gart_sysdev -- "
+ "would corrupt data on next suspend");
flush_gart();
printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n",
aper_base, aper_size>>10);
- /* need to map that range */
- end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
- if (end_pfn > max_low_pfn_mapped) {
- start_pfn = (aper_base>>PAGE_SHIFT);
- init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
- }
return 0;
nommu:
@@ -687,20 +705,13 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
return -1;
}
-extern int agp_amd64_init(void);
-
static struct dma_mapping_ops gart_dma_ops = {
.map_single = gart_map_single,
- .map_simple = gart_map_simple,
.unmap_single = gart_unmap_single,
- .sync_single_for_cpu = NULL,
- .sync_single_for_device = NULL,
- .sync_single_range_for_cpu = NULL,
- .sync_single_range_for_device = NULL,
- .sync_sg_for_cpu = NULL,
- .sync_sg_for_device = NULL,
.map_sg = gart_map_sg,
.unmap_sg = gart_unmap_sg,
+ .alloc_coherent = gart_alloc_coherent,
+ .free_coherent = gart_free_coherent,
};
void gart_iommu_shutdown(void)
@@ -727,7 +738,8 @@ void __init gart_iommu_init(void)
{
struct agp_kern_info info;
unsigned long iommu_start;
- unsigned long aper_size;
+ unsigned long aper_base, aper_size;
+ unsigned long start_pfn, end_pfn;
unsigned long scratch;
long i;
@@ -759,30 +771,35 @@ void __init gart_iommu_init(void)
(no_agp && init_k8_gatt(&info) < 0)) {
if (max_pfn > MAX_DMA32_PFN) {
printk(KERN_WARNING "More than 4GB of memory "
- "but GART IOMMU not available.\n"
- KERN_WARNING "falling back to iommu=soft.\n");
+ "but GART IOMMU not available.\n");
+ printk(KERN_WARNING "falling back to iommu=soft.\n");
}
return;
}
+ /* need to map that range */
+ aper_size = info.aper_size << 20;
+ aper_base = info.aper_base;
+ end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
+ if (end_pfn > max_low_pfn_mapped) {
+ start_pfn = (aper_base>>PAGE_SHIFT);
+ init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
+ }
+
printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n");
- aper_size = info.aper_size * 1024 * 1024;
iommu_size = check_iommu_size(info.aper_base, aper_size);
iommu_pages = iommu_size >> PAGE_SHIFT;
- iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL,
+ iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL | __GFP_ZERO,
get_order(iommu_pages/8));
if (!iommu_gart_bitmap)
panic("Cannot allocate iommu bitmap\n");
- memset(iommu_gart_bitmap, 0, iommu_pages/8);
#ifdef CONFIG_IOMMU_LEAK
if (leak_trace) {
- iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL,
+ iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO,
get_order(iommu_pages*sizeof(void *)));
- if (iommu_leak_tab)
- memset(iommu_leak_tab, 0, iommu_pages * 8);
- else
+ if (!iommu_leak_tab)
printk(KERN_DEBUG
"PCI-DMA: Cannot allocate leak trace area\n");
}
@@ -792,7 +809,7 @@ void __init gart_iommu_init(void)
* Out of IOMMU space handling.
* Reserve some invalid pages at the beginning of the GART.
*/
- set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
+ iommu_area_reserve(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
agp_memory_reserved = iommu_size;
printk(KERN_INFO
@@ -850,7 +867,8 @@ void __init gart_parse_options(char *p)
if (!strncmp(p, "leak", 4)) {
leak_trace = 1;
p += 4;
- if (*p == '=') ++p;
+ if (*p == '=')
+ ++p;
if (isdigit(*p) && get_option(&p, &arg))
iommu_leak_pages = arg;
}
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index 3f91f71cdc3..c70ab5a5d4c 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -14,7 +14,7 @@
static int
check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size)
{
- if (hwdev && bus + size > *hwdev->dma_mask) {
+ if (hwdev && !is_buffer_dma_capable(*hwdev->dma_mask, bus, size)) {
if (*hwdev->dma_mask >= DMA_32BIT_MASK)
printk(KERN_ERR
"nommu_%s: overflow %Lx+%zu of device mask %Lx\n",
@@ -72,7 +72,15 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
return nents;
}
+static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr,
+ dma_addr_t dma_addr)
+{
+ free_pages((unsigned long)vaddr, get_order(size));
+}
+
struct dma_mapping_ops nommu_dma_ops = {
+ .alloc_coherent = dma_generic_alloc_coherent,
+ .free_coherent = nommu_free_coherent,
.map_single = nommu_map_single,
.map_sg = nommu_map_sg,
.is_phys = 1,
diff --git a/arch/x86/kernel/pcspeaker.c b/arch/x86/kernel/pcspeaker.c
index bc1f2d3ea27..a311ffcaad1 100644
--- a/arch/x86/kernel/pcspeaker.c
+++ b/arch/x86/kernel/pcspeaker.c
@@ -1,20 +1,13 @@
#include <linux/platform_device.h>
-#include <linux/errno.h>
+#include <linux/err.h>
#include <linux/init.h>
static __init int add_pcspkr(void)
{
struct platform_device *pd;
- int ret;
- pd = platform_device_alloc("pcspkr", -1);
- if (!pd)
- return -ENOMEM;
+ pd = platform_device_register_simple("pcspkr", -1, NULL, 0);
- ret = platform_device_add(pd);
- if (ret)
- platform_device_put(pd);
-
- return ret;
+ return IS_ERR(pd) ? PTR_ERR(pd) : 0;
}
device_initcall(add_pcspkr);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 9f94bb1c811..c622772744d 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -184,7 +184,8 @@ static void mwait_idle(void)
static void poll_idle(void)
{
local_irq_enable();
- cpu_relax();
+ while (!need_resched())
+ cpu_relax();
}
/*
@@ -245,6 +246,14 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
return 1;
}
+static cpumask_t c1e_mask = CPU_MASK_NONE;
+static int c1e_detected;
+
+void c1e_remove_cpu(int cpu)
+{
+ cpu_clear(cpu, c1e_mask);
+}
+
/*
* C1E aware idle routine. We check for C1E active in the interrupt
* pending message MSR. If we detect C1E, then we handle it the same
@@ -252,9 +261,6 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
*/
static void c1e_idle(void)
{
- static cpumask_t c1e_mask = CPU_MASK_NONE;
- static int c1e_detected;
-
if (need_resched())
return;
@@ -264,8 +270,10 @@ static void c1e_idle(void)
rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
if (lo & K8_INTP_C1E_ACTIVE_MASK) {
c1e_detected = 1;
- mark_tsc_unstable("TSC halt in C1E");
- printk(KERN_INFO "System has C1E enabled\n");
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+ mark_tsc_unstable("TSC halt in AMD C1E");
+ printk(KERN_INFO "System has AMD C1E enabled\n");
+ set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
}
}
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 1962d27c6b8..205188db962 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -37,6 +37,7 @@
#include <linux/tick.h>
#include <linux/percpu.h>
#include <linux/prctl.h>
+#include <linux/dmi.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -55,6 +56,7 @@
#include <asm/tlbflush.h>
#include <asm/cpu.h>
#include <asm/kdebug.h>
+#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/smp.h>
@@ -90,6 +92,7 @@ static void cpu_exit_clear(void)
cpu_clear(cpu, cpu_callin_map);
numa_remove_cpu(cpu);
+ c1e_remove_cpu(cpu);
}
/* We don't actually take CPU down, just spin without interrupts. */
@@ -161,6 +164,7 @@ void __show_registers(struct pt_regs *regs, int all)
unsigned long d0, d1, d2, d3, d6, d7;
unsigned long sp;
unsigned short ss, gs;
+ const char *board;
if (user_mode_vm(regs)) {
sp = regs->sp;
@@ -173,11 +177,15 @@ void __show_registers(struct pt_regs *regs, int all)
}
printk("\n");
- printk("Pid: %d, comm: %s %s (%s %.*s)\n",
+
+ board = dmi_get_system_info(DMI_PRODUCT_NAME);
+ if (!board)
+ board = "";
+ printk("Pid: %d, comm: %s %s (%s %.*s) %s\n",
task_pid_nr(current), current->comm,
print_tainted(), init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
- init_utsname()->version);
+ init_utsname()->version, board);
printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
(u16)regs->cs, regs->ip, regs->flags,
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 08a9df08adb..2a8ccb9238b 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -37,11 +37,11 @@
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
-#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
-#include <asm/io.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
@@ -89,11 +89,13 @@ void exit_idle(void)
#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);
-#include <asm/nmi.h>
+#include <linux/nmi.h>
/* We halt the CPU with physical CPU hotplug */
static inline void play_dead(void)
{
idle_task_exit();
+ c1e_remove_cpu(raw_smp_processor_id());
+
mb();
/* Ack it */
__get_cpu_var(cpu_state) = CPU_DEAD;
@@ -152,7 +154,7 @@ void cpu_idle(void)
}
/* Prints also some state that isn't saved in the pt_regs */
-void __show_regs(struct pt_regs * regs)
+void __show_regs(struct pt_regs *regs)
{
unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
unsigned long d0, d1, d2, d3, d6, d7;
@@ -161,59 +163,61 @@ void __show_regs(struct pt_regs * regs)
printk("\n");
print_modules();
- printk("Pid: %d, comm: %.20s %s %s %.*s\n",
+ printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s\n",
current->pid, current->comm, print_tainted(),
init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
init_utsname()->version);
- printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
+ printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
printk_address(regs->ip, 1);
- printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp,
- regs->flags);
- printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
+ printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
+ regs->sp, regs->flags);
+ printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
regs->ax, regs->bx, regs->cx);
- printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
+ printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
regs->dx, regs->si, regs->di);
- printk("RBP: %016lx R08: %016lx R09: %016lx\n",
+ printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
regs->bp, regs->r8, regs->r9);
- printk("R10: %016lx R11: %016lx R12: %016lx\n",
- regs->r10, regs->r11, regs->r12);
- printk("R13: %016lx R14: %016lx R15: %016lx\n",
- regs->r13, regs->r14, regs->r15);
-
- asm("movl %%ds,%0" : "=r" (ds));
- asm("movl %%cs,%0" : "=r" (cs));
- asm("movl %%es,%0" : "=r" (es));
+ printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
+ regs->r10, regs->r11, regs->r12);
+ printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
+ regs->r13, regs->r14, regs->r15);
+
+ asm("movl %%ds,%0" : "=r" (ds));
+ asm("movl %%cs,%0" : "=r" (cs));
+ asm("movl %%es,%0" : "=r" (es));
asm("movl %%fs,%0" : "=r" (fsindex));
asm("movl %%gs,%0" : "=r" (gsindex));
rdmsrl(MSR_FS_BASE, fs);
- rdmsrl(MSR_GS_BASE, gs);
- rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
+ rdmsrl(MSR_GS_BASE, gs);
+ rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
cr0 = read_cr0();
cr2 = read_cr2();
cr3 = read_cr3();
cr4 = read_cr4();
- printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
- fs,fsindex,gs,gsindex,shadowgs);
- printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
- printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
+ printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
+ fs, fsindex, gs, gsindex, shadowgs);
+ printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
+ es, cr0);
+ printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
+ cr4);
get_debugreg(d0, 0);
get_debugreg(d1, 1);
get_debugreg(d2, 2);
- printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
+ printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
get_debugreg(d3, 3);
get_debugreg(d6, 6);
get_debugreg(d7, 7);
- printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
+ printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}
void show_regs(struct pt_regs *regs)
{
- printk("CPU %d:", smp_processor_id());
+ printk(KERN_INFO "CPU %d:", smp_processor_id());
__show_regs(regs);
show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}
@@ -322,10 +326,10 @@ void prepare_to_copy(struct task_struct *tsk)
int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
unsigned long unused,
- struct task_struct * p, struct pt_regs * regs)
+ struct task_struct *p, struct pt_regs *regs)
{
int err;
- struct pt_regs * childregs;
+ struct pt_regs *childregs;
struct task_struct *me = current;
childregs = ((struct pt_regs *)
@@ -370,10 +374,10 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
if (test_thread_flag(TIF_IA32))
err = do_set_thread_area(p, -1,
(struct user_desc __user *)childregs->si, 0);
- else
-#endif
- err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
- if (err)
+ else
+#endif
+ err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
+ if (err)
goto out;
}
err = 0;
@@ -566,7 +570,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
unsigned fsindex, gsindex;
/* we're going to use this soon, after a few expensive things */
- if (next_p->fpu_counter>5)
+ if (next_p->fpu_counter > 5)
prefetch(next->xstate);
/*
@@ -574,13 +578,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*/
load_sp0(tss, next);
- /*
+ /*
* Switch DS and ES.
* This won't pick up thread selector changes, but I guess that is ok.
*/
savesegment(es, prev->es);
if (unlikely(next->es | prev->es))
- loadsegment(es, next->es);
+ loadsegment(es, next->es);
savesegment(ds, prev->ds);
if (unlikely(next->ds | prev->ds))
@@ -606,7 +610,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*/
arch_leave_lazy_cpu_mode();
- /*
+ /*
* Switch FS and GS.
*
* Segment register != 0 always requires a reload. Also
@@ -615,13 +619,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*/
if (unlikely(fsindex | next->fsindex | prev->fs)) {
loadsegment(fs, next->fsindex);
- /*
+ /*
* Check if the user used a selector != 0; if yes
* clear 64bit base, since overloaded base is always
* mapped to the Null selector
*/
if (fsindex)
- prev->fs = 0;
+ prev->fs = 0;
}
/* when next process has a 64bit base use it */
if (next->fs)
@@ -631,7 +635,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
if (unlikely(gsindex | next->gsindex | prev->gs)) {
load_gs_index(next->gsindex);
if (gsindex)
- prev->gs = 0;
+ prev->gs = 0;
}
if (next->gs)
wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
@@ -640,12 +644,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/* Must be after DS reload */
unlazy_fpu(prev_p);
- /*
+ /*
* Switch the PDA and FPU contexts.
*/
prev->usersp = read_pda(oldrsp);
write_pda(oldrsp, next->usersp);
- write_pda(pcurrent, next_p);
+ write_pda(pcurrent, next_p);
write_pda(kernelstack,
(unsigned long)task_stack_page(next_p) +
@@ -686,7 +690,7 @@ long sys_execve(char __user *name, char __user * __user *argv,
char __user * __user *envp, struct pt_regs *regs)
{
long error;
- char * filename;
+ char *filename;
filename = getname(name);
error = PTR_ERR(filename);
@@ -744,55 +748,55 @@ asmlinkage long sys_vfork(struct pt_regs *regs)
unsigned long get_wchan(struct task_struct *p)
{
unsigned long stack;
- u64 fp,ip;
+ u64 fp, ip;
int count = 0;
- if (!p || p == current || p->state==TASK_RUNNING)
- return 0;
+ if (!p || p == current || p->state == TASK_RUNNING)
+ return 0;
stack = (unsigned long)task_stack_page(p);
if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE)
return 0;
fp = *(u64 *)(p->thread.sp);
- do {
+ do {
if (fp < (unsigned long)stack ||
fp > (unsigned long)stack+THREAD_SIZE)
- return 0;
+ return 0;
ip = *(u64 *)(fp+8);
if (!in_sched_functions(ip))
return ip;
- fp = *(u64 *)fp;
- } while (count++ < 16);
+ fp = *(u64 *)fp;
+ } while (count++ < 16);
return 0;
}
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
-{
- int ret = 0;
+{
+ int ret = 0;
int doit = task == current;
int cpu;
- switch (code) {
+ switch (code) {
case ARCH_SET_GS:
if (addr >= TASK_SIZE_OF(task))
- return -EPERM;
+ return -EPERM;
cpu = get_cpu();
- /* handle small bases via the GDT because that's faster to
+ /* handle small bases via the GDT because that's faster to
switch. */
- if (addr <= 0xffffffff) {
- set_32bit_tls(task, GS_TLS, addr);
- if (doit) {
+ if (addr <= 0xffffffff) {
+ set_32bit_tls(task, GS_TLS, addr);
+ if (doit) {
load_TLS(&task->thread, cpu);
- load_gs_index(GS_TLS_SEL);
+ load_gs_index(GS_TLS_SEL);
}
- task->thread.gsindex = GS_TLS_SEL;
+ task->thread.gsindex = GS_TLS_SEL;
task->thread.gs = 0;
- } else {
+ } else {
task->thread.gsindex = 0;
task->thread.gs = addr;
if (doit) {
load_gs_index(0);
ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
- }
+ }
}
put_cpu();
break;
@@ -846,8 +850,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
rdmsrl(MSR_KERNEL_GS_BASE, base);
else
base = task->thread.gs;
- }
- else
+ } else
base = task->thread.gs;
ret = put_user(base, (unsigned long __user *)addr);
break;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 58ce4b50211..e375b658efc 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -14,6 +14,7 @@
#include <linux/errno.h>
#include <linux/ptrace.h>
#include <linux/regset.h>
+#include <linux/tracehook.h>
#include <linux/user.h>
#include <linux/elf.h>
#include <linux/security.h>
@@ -734,7 +735,7 @@ static int ptrace_bts_config(struct task_struct *child,
goto errout;
if (cfg.flags & PTRACE_BTS_O_ALLOC) {
- ds_ovfl_callback_t ovfl = 0;
+ ds_ovfl_callback_t ovfl = NULL;
unsigned int sig = 0;
/* we ignore the error in case we were not tracing child */
@@ -748,7 +749,7 @@ static int ptrace_bts_config(struct task_struct *child,
ovfl = ptrace_bts_ovfl;
}
- error = ds_request_bts(child, /* base = */ 0, cfg.size, ovfl);
+ error = ds_request_bts(child, /* base = */ NULL, cfg.size, ovfl);
if (error < 0)
goto errout;
@@ -1086,7 +1087,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
break;
case PTRACE_BTS_SIZE:
- ret = ds_get_bts_index(child, /* pos = */ 0);
+ ret = ds_get_bts_index(child, /* pos = */ NULL);
break;
case PTRACE_BTS_GET:
@@ -1469,30 +1470,6 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
force_sig_info(SIGTRAP, &info, tsk);
}
-static void syscall_trace(struct pt_regs *regs)
-{
- if (!(current->ptrace & PT_PTRACED))
- return;
-
-#if 0
- printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n",
- current->comm,
- regs->ip, regs->sp, regs->ax, regs->orig_ax, __builtin_return_address(0),
- current_thread_info()->flags, current->ptrace);
-#endif
-
- ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
- ? 0x80 : 0));
- /*
- * this isn't the same as continuing with a signal, but it will do
- * for normal use. strace only continues with a signal if the
- * stopping signal is not SIGTRAP. -brl
- */
- if (current->exit_code) {
- send_sig(current->exit_code, current, 1);
- current->exit_code = 0;
- }
-}
#ifdef CONFIG_X86_32
# define IS_IA32 1
@@ -1526,8 +1503,9 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs)
if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
ret = -1L;
- if (ret || test_thread_flag(TIF_SYSCALL_TRACE))
- syscall_trace(regs);
+ if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) &&
+ tracehook_report_syscall_entry(regs))
+ ret = -1L;
if (unlikely(current->audit_context)) {
if (IS_IA32)
@@ -1553,7 +1531,7 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs)
audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
if (test_thread_flag(TIF_SYSCALL_TRACE))
- syscall_trace(regs);
+ tracehook_report_syscall_exit(regs, 0);
/*
* If TIF_SYSCALL_EMU is set, we only get here because of
@@ -1569,6 +1547,6 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs)
* system call instruction.
*/
if (test_thread_flag(TIF_SINGLESTEP) &&
- (current->ptrace & PT_PTRACED))
+ tracehook_consider_fatal_signal(current, SIGTRAP, SIG_DFL))
send_sigtrap(current, regs, 0);
}
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 724adfc63cb..f4c93f1cfc1 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -29,7 +29,11 @@ EXPORT_SYMBOL(pm_power_off);
static const struct desc_ptr no_idt = {};
static int reboot_mode;
-enum reboot_type reboot_type = BOOT_KBD;
+/*
+ * Keyboard reset and triple fault may result in INIT, not RESET, which
+ * doesn't work when we're in vmx root mode. Try ACPI first.
+ */
+enum reboot_type reboot_type = BOOT_ACPI;
int reboot_force;
#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 673f12cf6eb..46c98efbbf8 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -223,6 +223,9 @@ unsigned long saved_video_mode;
#define RAMDISK_LOAD_FLAG 0x4000
static char __initdata command_line[COMMAND_LINE_SIZE];
+#ifdef CONFIG_CMDLINE_BOOL
+static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE;
+#endif
#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
struct edd edd;
@@ -665,11 +668,28 @@ void __init setup_arch(char **cmdline_p)
bss_resource.start = virt_to_phys(&__bss_start);
bss_resource.end = virt_to_phys(&__bss_stop)-1;
+#ifdef CONFIG_CMDLINE_BOOL
+#ifdef CONFIG_CMDLINE_OVERRIDE
+ strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
+#else
+ if (builtin_cmdline[0]) {
+ /* append boot loader cmdline to builtin */
+ strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
+ strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
+ strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
+ }
+#endif
+#endif
+
strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
*cmdline_p = command_line;
parse_early_param();
+#ifdef CONFIG_X86_64
+ check_efer();
+#endif
+
#if defined(CONFIG_VMI) && defined(CONFIG_X86_32)
/*
* Must be before kernel pagetables are setup
@@ -738,7 +758,6 @@ void __init setup_arch(char **cmdline_p)
#else
num_physpages = max_pfn;
- check_efer();
if (cpu_has_x2apic)
check_x2apic();
diff --git a/arch/x86/kernel/sigframe.h b/arch/x86/kernel/sigframe.h
index 6dd7e2b70a4..cc673aa55ce 100644
--- a/arch/x86/kernel/sigframe.h
+++ b/arch/x86/kernel/sigframe.h
@@ -34,4 +34,9 @@ struct rt_sigframe {
struct siginfo info;
/* fp state follows here */
};
+
+int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+ sigset_t *set, struct pt_regs *regs);
+int ia32_setup_frame(int sig, struct k_sigaction *ka,
+ sigset_t *set, struct pt_regs *regs);
#endif
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index 8d380b699c0..b21070ea33a 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -17,6 +17,7 @@
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/wait.h>
+#include <linux/tracehook.h>
#include <linux/elf.h>
#include <linux/smp.h>
#include <linux/mm.h>
@@ -556,8 +557,6 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
* handler too.
*/
regs->flags &= ~X86_EFLAGS_TF;
- if (test_thread_flag(TIF_SINGLESTEP))
- ptrace_notify(SIGTRAP);
spin_lock_irq(&current->sighand->siglock);
sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
@@ -566,6 +565,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
+ tracehook_signal_handler(sig, info, ka, regs,
+ test_thread_flag(TIF_SINGLESTEP));
+
return 0;
}
@@ -659,5 +661,10 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
if (thread_info_flags & _TIF_SIGPENDING)
do_signal(regs);
+ if (thread_info_flags & _TIF_NOTIFY_RESUME) {
+ clear_thread_flag(TIF_NOTIFY_RESUME);
+ tracehook_notify_resume(regs);
+ }
+
clear_thread_flag(TIF_IRET);
}
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index 4665b598a37..823a55bf8c3 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -15,17 +15,20 @@
#include <linux/errno.h>
#include <linux/wait.h>
#include <linux/ptrace.h>
+#include <linux/tracehook.h>
#include <linux/unistd.h>
#include <linux/stddef.h>
#include <linux/personality.h>
#include <linux/compiler.h>
+#include <linux/uaccess.h>
+
#include <asm/processor.h>
#include <asm/ucontext.h>
-#include <asm/uaccess.h>
#include <asm/i387.h>
#include <asm/proto.h>
#include <asm/ia32_unistd.h>
#include <asm/mce.h>
+#include <asm/syscall.h>
#include <asm/syscalls.h>
#include "sigframe.h"
@@ -42,11 +45,6 @@
# define FIX_EFLAGS __FIX_EFLAGS
#endif
-int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
- sigset_t *set, struct pt_regs * regs);
-int ia32_setup_frame(int sig, struct k_sigaction *ka,
- sigset_t *set, struct pt_regs * regs);
-
asmlinkage long
sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
struct pt_regs *regs)
@@ -66,7 +64,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
/* Always make any pending restarted system calls return -EINTR */
current_thread_info()->restart_block.fn = do_no_restart_syscall;
-#define COPY(x) err |= __get_user(regs->x, &sc->x)
+#define COPY(x) (err |= __get_user(regs->x, &sc->x))
COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
COPY(dx); COPY(cx); COPY(ip);
@@ -96,7 +94,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
}
{
- struct _fpstate __user * buf;
+ struct _fpstate __user *buf;
err |= __get_user(buf, &sc->fpstate);
err |= restore_i387_xstate(buf);
}
@@ -122,7 +120,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
current->blocked = set;
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
-
+
if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
goto badframe;
@@ -132,16 +130,17 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
return ax;
badframe:
- signal_fault(regs,frame,"sigreturn");
+ signal_fault(regs, frame, "sigreturn");
return 0;
-}
+}
/*
* Set up a signal frame.
*/
static inline int
-setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned long mask, struct task_struct *me)
+setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs,
+ unsigned long mask, struct task_struct *me)
{
int err = 0;
@@ -197,7 +196,7 @@ get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size)
}
static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
- sigset_t *set, struct pt_regs * regs)
+ sigset_t *set, struct pt_regs *regs)
{
struct rt_sigframe __user *frame;
void __user *fp = NULL;
@@ -210,19 +209,19 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
(unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
if (save_i387_xstate(fp) < 0)
- err |= -1;
+ err |= -1;
} else
frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8;
if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
goto give_sigsegv;
- if (ka->sa.sa_flags & SA_SIGINFO) {
+ if (ka->sa.sa_flags & SA_SIGINFO) {
err |= copy_siginfo_to_user(&frame->info, info);
if (err)
goto give_sigsegv;
}
-
+
/* Create the ucontext. */
if (cpu_has_xsave)
err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
@@ -235,9 +234,9 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me);
err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate);
- if (sizeof(*set) == 16) {
+ if (sizeof(*set) == 16) {
__put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]);
- __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]);
+ __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]);
} else
err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
@@ -248,7 +247,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
err |= __put_user(ka->sa.sa_restorer, &frame->pretcode);
} else {
/* could use a vstub here */
- goto give_sigsegv;
+ goto give_sigsegv;
}
if (err)
@@ -256,7 +255,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
/* Set up registers for signal handler */
regs->di = sig;
- /* In case the signal handler was declared without prototypes */
+ /* In case the signal handler was declared without prototypes */
regs->ax = 0;
/* This also works for non SA_SIGINFO handlers because they expect the
@@ -279,37 +278,8 @@ give_sigsegv:
}
/*
- * Return -1L or the syscall number that @regs is executing.
- */
-static long current_syscall(struct pt_regs *regs)
-{
- /*
- * We always sign-extend a -1 value being set here,
- * so this is always either -1L or a syscall number.
- */
- return regs->orig_ax;
-}
-
-/*
- * Return a value that is -EFOO if the system call in @regs->orig_ax
- * returned an error. This only works for @regs from @current.
- */
-static long current_syscall_ret(struct pt_regs *regs)
-{
-#ifdef CONFIG_IA32_EMULATION
- if (test_thread_flag(TIF_IA32))
- /*
- * Sign-extend the value so (int)-EFOO becomes (long)-EFOO
- * and will match correctly in comparisons.
- */
- return (int) regs->ax;
-#endif
- return regs->ax;
-}
-
-/*
* OK, we're invoking a handler
- */
+ */
static int
handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
@@ -318,9 +288,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
int ret;
/* Are we from a system call? */
- if (current_syscall(regs) >= 0) {
+ if (syscall_get_nr(current, regs) >= 0) {
/* If so, check system call restarting.. */
- switch (current_syscall_ret(regs)) {
+ switch (syscall_get_error(current, regs)) {
case -ERESTART_RESTARTBLOCK:
case -ERESTARTNOHAND:
regs->ax = -EINTR;
@@ -353,7 +323,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
ret = ia32_setup_rt_frame(sig, ka, info, oldset, regs);
else
ret = ia32_setup_frame(sig, ka, oldset, regs);
- } else
+ } else
#endif
ret = setup_rt_frame(sig, ka, info, oldset, regs);
@@ -377,15 +347,16 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
* handler too.
*/
regs->flags &= ~X86_EFLAGS_TF;
- if (test_thread_flag(TIF_SINGLESTEP))
- ptrace_notify(SIGTRAP);
spin_lock_irq(&current->sighand->siglock);
- sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask);
+ sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
if (!(ka->sa.sa_flags & SA_NODEFER))
- sigaddset(&current->blocked,sig);
+ sigaddset(&current->blocked, sig);
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
+
+ tracehook_signal_handler(sig, info, ka, regs,
+ test_thread_flag(TIF_SINGLESTEP));
}
return ret;
@@ -442,9 +413,9 @@ static void do_signal(struct pt_regs *regs)
}
/* Did we come from a system call? */
- if (current_syscall(regs) >= 0) {
+ if (syscall_get_nr(current, regs) >= 0) {
/* Restart the system call - no handlers present */
- switch (current_syscall_ret(regs)) {
+ switch (syscall_get_error(current, regs)) {
case -ERESTARTNOHAND:
case -ERESTARTSYS:
case -ERESTARTNOINTR:
@@ -482,17 +453,23 @@ void do_notify_resume(struct pt_regs *regs, void *unused,
/* deal with pending signal delivery */
if (thread_info_flags & _TIF_SIGPENDING)
do_signal(regs);
+
+ if (thread_info_flags & _TIF_NOTIFY_RESUME) {
+ clear_thread_flag(TIF_NOTIFY_RESUME);
+ tracehook_notify_resume(regs);
+ }
}
void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
-{
- struct task_struct *me = current;
+{
+ struct task_struct *me = current;
if (show_unhandled_signals && printk_ratelimit()) {
printk("%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
- me->comm,me->pid,where,frame,regs->ip,regs->sp,regs->orig_ax);
+ me->comm, me->pid, where, frame, regs->ip,
+ regs->sp, regs->orig_ax);
print_vma_addr(" in ", regs->ip);
printk("\n");
}
- force_sig(SIGSEGV, me);
-}
+ force_sig(SIGSEGV, me);
+}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index aa804c64b16..2ff0bbcd5bd 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1313,16 +1313,13 @@ __init void prefill_possible_map(void)
if (!num_processors)
num_processors = 1;
-#ifdef CONFIG_HOTPLUG_CPU
if (additional_cpus == -1) {
if (disabled_cpus > 0)
additional_cpus = disabled_cpus;
else
additional_cpus = 0;
}
-#else
- additional_cpus = 0;
-#endif
+
possible = num_processors + additional_cpus;
if (possible > NR_CPUS)
possible = NR_CPUS;
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index c9288c883e2..6bc211accf0 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -13,16 +13,17 @@
#include <linux/utsname.h>
#include <linux/personality.h>
#include <linux/random.h>
+#include <linux/uaccess.h>
-#include <asm/uaccess.h>
#include <asm/ia32.h>
#include <asm/syscalls.h>
-asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags,
- unsigned long fd, unsigned long off)
+asmlinkage long sys_mmap(unsigned long addr, unsigned long len,
+ unsigned long prot, unsigned long flags,
+ unsigned long fd, unsigned long off)
{
long error;
- struct file * file;
+ struct file *file;
error = -EINVAL;
if (off & ~PAGE_MASK)
@@ -57,9 +58,9 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
unmapped base down for this case. This can give
conflicts with the heap, but we assume that glibc
malloc knows how to fall back to mmap. Give it 1GB
- of playground for now. -AK */
- *begin = 0x40000000;
- *end = 0x80000000;
+ of playground for now. -AK */
+ *begin = 0x40000000;
+ *end = 0x80000000;
if (current->flags & PF_RANDOMIZE) {
new_begin = randomize_range(*begin, *begin + 0x02000000, 0);
if (new_begin)
@@ -67,9 +68,9 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
}
} else {
*begin = TASK_UNMAPPED_BASE;
- *end = TASK_SIZE;
+ *end = TASK_SIZE;
}
-}
+}
unsigned long
arch_get_unmapped_area(struct file *filp, unsigned long addr,
@@ -79,11 +80,11 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
struct vm_area_struct *vma;
unsigned long start_addr;
unsigned long begin, end;
-
+
if (flags & MAP_FIXED)
return addr;
- find_start_end(flags, &begin, &end);
+ find_start_end(flags, &begin, &end);
if (len > end)
return -ENOMEM;
@@ -97,12 +98,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
}
if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32))
&& len <= mm->cached_hole_size) {
- mm->cached_hole_size = 0;
+ mm->cached_hole_size = 0;
mm->free_area_cache = begin;
}
addr = mm->free_area_cache;
- if (addr < begin)
- addr = begin;
+ if (addr < begin)
+ addr = begin;
start_addr = addr;
full_search:
@@ -128,7 +129,7 @@ full_search:
return addr;
}
if (addr + mm->cached_hole_size < vma->vm_start)
- mm->cached_hole_size = vma->vm_start - addr;
+ mm->cached_hole_size = vma->vm_start - addr;
addr = vma->vm_end;
}
@@ -178,7 +179,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
vma = find_vma(mm, addr-len);
if (!vma || addr <= vma->vm_start)
/* remember the address as a hint for next time */
- return (mm->free_area_cache = addr-len);
+ return mm->free_area_cache = addr-len;
}
if (mm->mmap_base < len)
@@ -195,7 +196,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
vma = find_vma(mm, addr);
if (!vma || addr+len <= vma->vm_start)
/* remember the address as a hint for next time */
- return (mm->free_area_cache = addr);
+ return mm->free_area_cache = addr;
/* remember the largest hole we saw so far */
if (addr + mm->cached_hole_size < vma->vm_start)
@@ -225,13 +226,13 @@ bottomup:
}
-asmlinkage long sys_uname(struct new_utsname __user * name)
+asmlinkage long sys_uname(struct new_utsname __user *name)
{
int err;
down_read(&uts_sem);
- err = copy_to_user(name, utsname(), sizeof (*name));
+ err = copy_to_user(name, utsname(), sizeof(*name));
up_read(&uts_sem);
- if (personality(current->personality) == PER_LINUX32)
- err |= copy_to_user(&name->machine, "i686", 5);
+ if (personality(current->personality) == PER_LINUX32)
+ err |= copy_to_user(&name->machine, "i686", 5);
return err ? -EFAULT : 0;
}
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
index b42068fb7b7..2887a789e38 100644
--- a/arch/x86/kernel/traps_64.c
+++ b/arch/x86/kernel/traps_64.c
@@ -32,6 +32,8 @@
#include <linux/bug.h>
#include <linux/nmi.h>
#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/io.h>
#if defined(CONFIG_EDAC)
#include <linux/edac.h>
@@ -45,9 +47,6 @@
#include <asm/unwind.h>
#include <asm/desc.h>
#include <asm/i387.h>
-#include <asm/nmi.h>
-#include <asm/smp.h>
-#include <asm/io.h>
#include <asm/pgalloc.h>
#include <asm/proto.h>
#include <asm/pda.h>
@@ -85,7 +84,8 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
void printk_address(unsigned long address, int reliable)
{
- printk(" [<%016lx>] %s%pS\n", address, reliable ? "": "? ", (void *) address);
+ printk(" [<%016lx>] %s%pS\n",
+ address, reliable ? "" : "? ", (void *) address);
}
static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
@@ -98,7 +98,8 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
[STACKFAULT_STACK - 1] = "#SS",
[MCE_STACK - 1] = "#MC",
#if DEBUG_STKSZ > EXCEPTION_STKSZ
- [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
+ [N_EXCEPTION_STACKS ...
+ N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
#endif
};
unsigned k;
@@ -163,7 +164,7 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
}
/*
- * x86-64 can have up to three kernel stacks:
+ * x86-64 can have up to three kernel stacks:
* process stack
* interrupt stack
* severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
@@ -219,7 +220,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
const struct stacktrace_ops *ops, void *data)
{
const unsigned cpu = get_cpu();
- unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr;
+ unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
unsigned used = 0;
struct thread_info *tinfo;
@@ -237,7 +238,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
if (!bp) {
if (task == current) {
/* Grab bp right from our regs */
- asm("movq %%rbp, %0" : "=r" (bp) :);
+ asm("movq %%rbp, %0" : "=r" (bp) : );
} else {
/* bp is the last reg pushed by switch_to */
bp = *(unsigned long *) task->thread.sp;
@@ -356,11 +357,15 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack;
int i;
const int cpu = smp_processor_id();
- unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr);
- unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
+ unsigned long *irqstack_end =
+ (unsigned long *) (cpu_pda(cpu)->irqstackptr);
+ unsigned long *irqstack =
+ (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
- // debugging aid: "show_stack(NULL, NULL);" prints the
- // back trace for this cpu.
+ /*
+ * debugging aid: "show_stack(NULL, NULL);" prints the
+ * back trace for this cpu.
+ */
if (sp == NULL) {
if (task)
@@ -404,7 +409,7 @@ void dump_stack(void)
#ifdef CONFIG_FRAME_POINTER
if (!bp)
- asm("movq %%rbp, %0" : "=r" (bp):);
+ asm("movq %%rbp, %0" : "=r" (bp) : );
#endif
printk("Pid: %d, comm: %.20s %s %s %.*s\n",
@@ -414,7 +419,6 @@ void dump_stack(void)
init_utsname()->version);
show_trace(NULL, NULL, &stack, bp);
}
-
EXPORT_SYMBOL(dump_stack);
void show_registers(struct pt_regs *regs)
@@ -492,7 +496,7 @@ unsigned __kprobes long oops_begin(void)
raw_local_irq_save(flags);
cpu = smp_processor_id();
if (!__raw_spin_trylock(&die_lock)) {
- if (cpu == die_owner)
+ if (cpu == die_owner)
/* nested oops. should stop eventually */;
else
__raw_spin_lock(&die_lock);
@@ -637,7 +641,7 @@ kernel_trap:
}
#define DO_ERROR(trapnr, signr, str, name) \
-asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
+asmlinkage void do_##name(struct pt_regs *regs, long error_code) \
{ \
if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
== NOTIFY_STOP) \
@@ -647,7 +651,7 @@ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
}
#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
-asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
+asmlinkage void do_##name(struct pt_regs *regs, long error_code) \
{ \
siginfo_t info; \
info.si_signo = signr; \
@@ -682,7 +686,7 @@ asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
preempt_conditional_cli(regs);
}
-asmlinkage void do_double_fault(struct pt_regs * regs, long error_code)
+asmlinkage void do_double_fault(struct pt_regs *regs, long error_code)
{
static const char str[] = "double fault";
struct task_struct *tsk = current;
@@ -777,9 +781,10 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
}
static notrace __kprobes void
-unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
+unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
{
- if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
+ if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) ==
+ NOTIFY_STOP)
return;
printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
reason);
@@ -881,7 +886,7 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
else if (user_mode(eregs))
regs = task_pt_regs(current);
/* Exception from kernel and interrupts are enabled. Move to
- kernel process stack. */
+ kernel process stack. */
else if (eregs->flags & X86_EFLAGS_IF)
regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
if (eregs != regs)
@@ -890,7 +895,7 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
}
/* runs on IST stack. */
-asmlinkage void __kprobes do_debug(struct pt_regs * regs,
+asmlinkage void __kprobes do_debug(struct pt_regs *regs,
unsigned long error_code)
{
struct task_struct *tsk = current;
@@ -1034,7 +1039,7 @@ asmlinkage void do_coprocessor_error(struct pt_regs *regs)
asmlinkage void bad_intr(void)
{
- printk("bad interrupt");
+ printk("bad interrupt");
}
asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
@@ -1046,7 +1051,7 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
conditional_sti(regs);
if (!user_mode(regs) &&
- kernel_math_error(regs, "kernel simd math error", 19))
+ kernel_math_error(regs, "kernel simd math error", 19))
return;
/*
@@ -1091,7 +1096,7 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
force_sig_info(SIGFPE, &info, task);
}
-asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs)
+asmlinkage void do_spurious_interrupt_bug(struct pt_regs *regs)
{
}
@@ -1148,8 +1153,10 @@ void __init trap_init(void)
set_intr_gate(0, &divide_error);
set_intr_gate_ist(1, &debug, DEBUG_STACK);
set_intr_gate_ist(2, &nmi, NMI_STACK);
- set_system_gate_ist(3, &int3, DEBUG_STACK); /* int3 can be called from all */
- set_system_gate(4, &overflow); /* int4 can be called from all */
+ /* int3 can be called from all */
+ set_system_gate_ist(3, &int3, DEBUG_STACK);
+ /* int4 can be called from all */
+ set_system_gate(4, &overflow);
set_intr_gate(5, &bounds);
set_intr_gate(6, &invalid_op);
set_intr_gate(7, &device_not_available);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 8f98e9de1b8..161bb850fc4 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -104,7 +104,7 @@ __setup("notsc", notsc_setup);
/*
* Read TSC and the reference counters. Take care of SMI disturbance
*/
-static u64 tsc_read_refs(u64 *pm, u64 *hpet)
+static u64 tsc_read_refs(u64 *p, int hpet)
{
u64 t1, t2;
int i;
@@ -112,9 +112,9 @@ static u64 tsc_read_refs(u64 *pm, u64 *hpet)
for (i = 0; i < MAX_RETRIES; i++) {
t1 = get_cycles();
if (hpet)
- *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
+ *p = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
else
- *pm = acpi_pm_read_early();
+ *p = acpi_pm_read_early();
t2 = get_cycles();
if ((t2 - t1) < SMI_TRESHOLD)
return t2;
@@ -123,13 +123,59 @@ static u64 tsc_read_refs(u64 *pm, u64 *hpet)
}
/*
+ * Calculate the TSC frequency from HPET reference
+ */
+static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2)
+{
+ u64 tmp;
+
+ if (hpet2 < hpet1)
+ hpet2 += 0x100000000ULL;
+ hpet2 -= hpet1;
+ tmp = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
+ do_div(tmp, 1000000);
+ do_div(deltatsc, tmp);
+
+ return (unsigned long) deltatsc;
+}
+
+/*
+ * Calculate the TSC frequency from PMTimer reference
+ */
+static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2)
+{
+ u64 tmp;
+
+ if (!pm1 && !pm2)
+ return ULONG_MAX;
+
+ if (pm2 < pm1)
+ pm2 += (u64)ACPI_PM_OVRRUN;
+ pm2 -= pm1;
+ tmp = pm2 * 1000000000LL;
+ do_div(tmp, PMTMR_TICKS_PER_SEC);
+ do_div(deltatsc, tmp);
+
+ return (unsigned long) deltatsc;
+}
+
+#define CAL_MS 10
+#define CAL_LATCH (CLOCK_TICK_RATE / (1000 / CAL_MS))
+#define CAL_PIT_LOOPS 1000
+
+#define CAL2_MS 50
+#define CAL2_LATCH (CLOCK_TICK_RATE / (1000 / CAL2_MS))
+#define CAL2_PIT_LOOPS 5000
+
+
+/*
* Try to calibrate the TSC against the Programmable
* Interrupt Timer and return the frequency of the TSC
* in kHz.
*
* Return ULONG_MAX on failure to calibrate.
*/
-static unsigned long pit_calibrate_tsc(void)
+static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
{
u64 tsc, t1, t2, delta;
unsigned long tscmin, tscmax;
@@ -144,8 +190,8 @@ static unsigned long pit_calibrate_tsc(void)
* (LSB then MSB) to begin countdown.
*/
outb(0xb0, 0x43);
- outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42);
- outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42);
+ outb(latch & 0xff, 0x42);
+ outb(latch >> 8, 0x42);
tsc = t1 = t2 = get_cycles();
@@ -166,31 +212,154 @@ static unsigned long pit_calibrate_tsc(void)
/*
* Sanity checks:
*
- * If we were not able to read the PIT more than 5000
+ * If we were not able to read the PIT more than loopmin
* times, then we have been hit by a massive SMI
*
* If the maximum is 10 times larger than the minimum,
* then we got hit by an SMI as well.
*/
- if (pitcnt < 5000 || tscmax > 10 * tscmin)
+ if (pitcnt < loopmin || tscmax > 10 * tscmin)
return ULONG_MAX;
/* Calculate the PIT value */
delta = t2 - t1;
- do_div(delta, 50);
+ do_div(delta, ms);
return delta;
}
+/*
+ * This reads the current MSB of the PIT counter, and
+ * checks if we are running on sufficiently fast and
+ * non-virtualized hardware.
+ *
+ * Our expectations are:
+ *
+ * - the PIT is running at roughly 1.19MHz
+ *
+ * - each IO is going to take about 1us on real hardware,
+ * but we allow it to be much faster (by a factor of 10) or
+ * _slightly_ slower (ie we allow up to a 2us read+counter
+ * update - anything else implies a unacceptably slow CPU
+ * or PIT for the fast calibration to work.
+ *
+ * - with 256 PIT ticks to read the value, we have 214us to
+ * see the same MSB (and overhead like doing a single TSC
+ * read per MSB value etc).
+ *
+ * - We're doing 2 reads per loop (LSB, MSB), and we expect
+ * them each to take about a microsecond on real hardware.
+ * So we expect a count value of around 100. But we'll be
+ * generous, and accept anything over 50.
+ *
+ * - if the PIT is stuck, and we see *many* more reads, we
+ * return early (and the next caller of pit_expect_msb()
+ * then consider it a failure when they don't see the
+ * next expected value).
+ *
+ * These expectations mean that we know that we have seen the
+ * transition from one expected value to another with a fairly
+ * high accuracy, and we didn't miss any events. We can thus
+ * use the TSC value at the transitions to calculate a pretty
+ * good value for the TSC frequencty.
+ */
+static inline int pit_expect_msb(unsigned char val)
+{
+ int count = 0;
+
+ for (count = 0; count < 50000; count++) {
+ /* Ignore LSB */
+ inb(0x42);
+ if (inb(0x42) != val)
+ break;
+ }
+ return count > 50;
+}
+
+/*
+ * How many MSB values do we want to see? We aim for a
+ * 15ms calibration, which assuming a 2us counter read
+ * error should give us roughly 150 ppm precision for
+ * the calibration.
+ */
+#define QUICK_PIT_MS 15
+#define QUICK_PIT_ITERATIONS (QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
+
+static unsigned long quick_pit_calibrate(void)
+{
+ /* Set the Gate high, disable speaker */
+ outb((inb(0x61) & ~0x02) | 0x01, 0x61);
+
+ /*
+ * Counter 2, mode 0 (one-shot), binary count
+ *
+ * NOTE! Mode 2 decrements by two (and then the
+ * output is flipped each time, giving the same
+ * final output frequency as a decrement-by-one),
+ * so mode 0 is much better when looking at the
+ * individual counts.
+ */
+ outb(0xb0, 0x43);
+
+ /* Start at 0xffff */
+ outb(0xff, 0x42);
+ outb(0xff, 0x42);
+
+ if (pit_expect_msb(0xff)) {
+ int i;
+ u64 t1, t2, delta;
+ unsigned char expect = 0xfe;
+
+ t1 = get_cycles();
+ for (i = 0; i < QUICK_PIT_ITERATIONS; i++, expect--) {
+ if (!pit_expect_msb(expect))
+ goto failed;
+ }
+ t2 = get_cycles();
+
+ /*
+ * Make sure we can rely on the second TSC timestamp:
+ */
+ if (!pit_expect_msb(expect))
+ goto failed;
+
+ /*
+ * Ok, if we get here, then we've seen the
+ * MSB of the PIT decrement QUICK_PIT_ITERATIONS
+ * times, and each MSB had many hits, so we never
+ * had any sudden jumps.
+ *
+ * As a result, we can depend on there not being
+ * any odd delays anywhere, and the TSC reads are
+ * reliable.
+ *
+ * kHz = ticks / time-in-seconds / 1000;
+ * kHz = (t2 - t1) / (QPI * 256 / PIT_TICK_RATE) / 1000
+ * kHz = ((t2 - t1) * PIT_TICK_RATE) / (QPI * 256 * 1000)
+ */
+ delta = (t2 - t1)*PIT_TICK_RATE;
+ do_div(delta, QUICK_PIT_ITERATIONS*256*1000);
+ printk("Fast TSC calibration using PIT\n");
+ return delta;
+ }
+failed:
+ return 0;
+}
/**
* native_calibrate_tsc - calibrate the tsc on boot
*/
unsigned long native_calibrate_tsc(void)
{
- u64 tsc1, tsc2, delta, pm1, pm2, hpet1, hpet2;
+ u64 tsc1, tsc2, delta, ref1, ref2;
unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
- unsigned long flags;
- int hpet = is_hpet_enabled(), i;
+ unsigned long flags, latch, ms, fast_calibrate;
+ int hpet = is_hpet_enabled(), i, loopmin;
+
+ local_irq_save(flags);
+ fast_calibrate = quick_pit_calibrate();
+ local_irq_restore(flags);
+ if (fast_calibrate)
+ return fast_calibrate;
/*
* Run 5 calibration loops to get the lowest frequency value
@@ -216,7 +385,13 @@ unsigned long native_calibrate_tsc(void)
* calibration delay loop as we have to wait for a certain
* amount of time anyway.
*/
- for (i = 0; i < 5; i++) {
+
+ /* Preset PIT loop values */
+ latch = CAL_LATCH;
+ ms = CAL_MS;
+ loopmin = CAL_PIT_LOOPS;
+
+ for (i = 0; i < 3; i++) {
unsigned long tsc_pit_khz;
/*
@@ -226,16 +401,16 @@ unsigned long native_calibrate_tsc(void)
* read the end value.
*/
local_irq_save(flags);
- tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL);
- tsc_pit_khz = pit_calibrate_tsc();
- tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL);
+ tsc1 = tsc_read_refs(&ref1, hpet);
+ tsc_pit_khz = pit_calibrate_tsc(latch, ms, loopmin);
+ tsc2 = tsc_read_refs(&ref2, hpet);
local_irq_restore(flags);
/* Pick the lowest PIT TSC calibration so far */
tsc_pit_min = min(tsc_pit_min, tsc_pit_khz);
/* hpet or pmtimer available ? */
- if (!hpet && !pm1 && !pm2)
+ if (!hpet && !ref1 && !ref2)
continue;
/* Check, whether the sampling was disturbed by an SMI */
@@ -243,23 +418,41 @@ unsigned long native_calibrate_tsc(void)
continue;
tsc2 = (tsc2 - tsc1) * 1000000LL;
+ if (hpet)
+ tsc2 = calc_hpet_ref(tsc2, ref1, ref2);
+ else
+ tsc2 = calc_pmtimer_ref(tsc2, ref1, ref2);
- if (hpet) {
- if (hpet2 < hpet1)
- hpet2 += 0x100000000ULL;
- hpet2 -= hpet1;
- tsc1 = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
- do_div(tsc1, 1000000);
- } else {
- if (pm2 < pm1)
- pm2 += (u64)ACPI_PM_OVRRUN;
- pm2 -= pm1;
- tsc1 = pm2 * 1000000000LL;
- do_div(tsc1, PMTMR_TICKS_PER_SEC);
+ tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2);
+
+ /* Check the reference deviation */
+ delta = ((u64) tsc_pit_min) * 100;
+ do_div(delta, tsc_ref_min);
+
+ /*
+ * If both calibration results are inside a 10% window
+ * then we can be sure, that the calibration
+ * succeeded. We break out of the loop right away. We
+ * use the reference value, as it is more precise.
+ */
+ if (delta >= 90 && delta <= 110) {
+ printk(KERN_INFO
+ "TSC: PIT calibration matches %s. %d loops\n",
+ hpet ? "HPET" : "PMTIMER", i + 1);
+ return tsc_ref_min;
}
- do_div(tsc2, tsc1);
- tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2);
+ /*
+ * Check whether PIT failed more than once. This
+ * happens in virtualized environments. We need to
+ * give the virtual PC a slightly longer timeframe for
+ * the HPET/PMTIMER to make the result precise.
+ */
+ if (i == 1 && tsc_pit_min == ULONG_MAX) {
+ latch = CAL2_LATCH;
+ ms = CAL2_MS;
+ loopmin = CAL2_PIT_LOOPS;
+ }
}
/*
@@ -270,7 +463,7 @@ unsigned long native_calibrate_tsc(void)
printk(KERN_WARNING "TSC: Unable to calibrate against PIT\n");
/* We don't have an alternative source, disable TSC */
- if (!hpet && !pm1 && !pm2) {
+ if (!hpet && !ref1 && !ref2) {
printk("TSC: No reference (HPET/PMTIMER) available\n");
return 0;
}
@@ -278,7 +471,7 @@ unsigned long native_calibrate_tsc(void)
/* The alternative source failed as well, disable TSC */
if (tsc_ref_min == ULONG_MAX) {
printk(KERN_WARNING "TSC: HPET/PMTIMER calibration "
- "failed due to SMI disturbance.\n");
+ "failed.\n");
return 0;
}
@@ -290,44 +483,25 @@ unsigned long native_calibrate_tsc(void)
}
/* We don't have an alternative source, use the PIT calibration value */
- if (!hpet && !pm1 && !pm2) {
+ if (!hpet && !ref1 && !ref2) {
printk(KERN_INFO "TSC: Using PIT calibration value\n");
return tsc_pit_min;
}
/* The alternative source failed, use the PIT calibration value */
if (tsc_ref_min == ULONG_MAX) {
- printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed due "
- "to SMI disturbance. Using PIT calibration\n");
+ printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed. "
+ "Using PIT calibration\n");
return tsc_pit_min;
}
- /* Check the reference deviation */
- delta = ((u64) tsc_pit_min) * 100;
- do_div(delta, tsc_ref_min);
-
- /*
- * If both calibration results are inside a 5% window, the we
- * use the lower frequency of those as it is probably the
- * closest estimate.
- */
- if (delta >= 95 && delta <= 105) {
- printk(KERN_INFO "TSC: PIT calibration confirmed by %s.\n",
- hpet ? "HPET" : "PMTIMER");
- printk(KERN_INFO "TSC: using %s calibration value\n",
- tsc_pit_min <= tsc_ref_min ? "PIT" :
- hpet ? "HPET" : "PMTIMER");
- return tsc_pit_min <= tsc_ref_min ? tsc_pit_min : tsc_ref_min;
- }
-
- printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n",
- hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
-
/*
* The calibration values differ too much. In doubt, we use
* the PIT value as we know that there are PMTIMERs around
- * running at double speed.
+ * running at double speed. At least we let the user know:
*/
+ printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n",
+ hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
printk(KERN_INFO "TSC: Using PIT calibration value\n");
return tsc_pit_min;
}
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index 594ef47f0a6..61a97e616f7 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -25,45 +25,31 @@
#include <asm/visws/cobalt.h>
#include <asm/visws/piix4.h>
#include <asm/arch_hooks.h>
+#include <asm/io_apic.h>
#include <asm/fixmap.h>
#include <asm/reboot.h>
#include <asm/setup.h>
#include <asm/e820.h>
-#include <asm/smp.h>
#include <asm/io.h>
#include <mach_ipi.h>
#include "mach_apic.h"
-#include <linux/init.h>
-#include <linux/smp.h>
-
#include <linux/kernel_stat.h>
-#include <linux/interrupt.h>
-#include <linux/init.h>
-#include <asm/io.h>
-#include <asm/apic.h>
#include <asm/i8259.h>
#include <asm/irq_vectors.h>
-#include <asm/visws/cobalt.h>
#include <asm/visws/lithium.h>
-#include <asm/visws/piix4.h>
#include <linux/sched.h>
#include <linux/kernel.h>
-#include <linux/init.h>
#include <linux/pci.h>
#include <linux/pci_ids.h>
extern int no_broadcast;
-#include <asm/io.h>
#include <asm/apic.h>
-#include <asm/arch_hooks.h>
-#include <asm/visws/cobalt.h>
-#include <asm/visws/lithium.h>
char visws_board_type = -1;
char visws_board_rev = -1;
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 61531d5c950..8b6c393ab9f 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -235,7 +235,7 @@ static void vmi_write_ldt_entry(struct desc_struct *dt, int entry,
const void *desc)
{
u32 *ldt_entry = (u32 *)desc;
- vmi_ops.write_idt_entry(dt, entry, ldt_entry[0], ldt_entry[1]);
+ vmi_ops.write_ldt_entry(dt, entry, ldt_entry[0], ldt_entry[1]);
}
static void vmi_load_sp0(struct tss_struct *tss,
@@ -393,13 +393,13 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
}
#endif
-static void vmi_allocate_pte(struct mm_struct *mm, u32 pfn)
+static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
{
vmi_set_page_type(pfn, VMI_PAGE_L1);
vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
}
-static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn)
+static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn)
{
/*
* This call comes in very early, before mem_map is setup.
@@ -410,20 +410,20 @@ static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn)
vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
}
-static void vmi_allocate_pmd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count)
+static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count)
{
vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE);
vmi_check_page_type(clonepfn, VMI_PAGE_L2);
vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
}
-static void vmi_release_pte(u32 pfn)
+static void vmi_release_pte(unsigned long pfn)
{
vmi_ops.release_page(pfn, VMI_PAGE_L1);
vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
}
-static void vmi_release_pmd(u32 pfn)
+static void vmi_release_pmd(unsigned long pfn)
{
vmi_ops.release_page(pfn, VMI_PAGE_L2);
vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index 0c029e8959c..7766d36983f 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -61,7 +61,7 @@ static void vsmp_irq_enable(void)
native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
}
-static unsigned __init vsmp_patch(u8 type, u16 clobbers, void *ibuf,
+static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf,
unsigned long addr, unsigned len)
{
switch (type) {
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 0bfe2bd305e..3da2508eb22 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -711,6 +711,10 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
u64 *spte;
int young = 0;
+ /* always return old for EPT */
+ if (!shadow_accessed_mask)
+ return 0;
+
spte = rmap_next(kvm, rmapp, NULL);
while (spte) {
int _young;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e2ee264740c..8233b86c778 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -62,6 +62,7 @@ static int npt = 1;
module_param(npt, int, S_IRUGO);
static void kvm_reput_irq(struct vcpu_svm *svm);
+static void svm_flush_tlb(struct kvm_vcpu *vcpu);
static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
{
@@ -878,6 +879,10 @@ set:
static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE;
+ unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
+
+ if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
+ force_new_asid(vcpu);
vcpu->arch.cr4 = cr4;
if (!npt_enabled)
@@ -1027,6 +1032,13 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
KVMTRACE_3D(TDP_FAULT, &svm->vcpu, error_code,
(u32)fault_address, (u32)(fault_address >> 32),
handler);
+ /*
+ * FIXME: Tis shouldn't be necessary here, but there is a flush
+ * missing in the MMU code. Until we find this bug, flush the
+ * complete TLB here on an NPF
+ */
+ if (npt_enabled)
+ svm_flush_tlb(&svm->vcpu);
if (event_injection)
kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2a69773e3b2..7041cc52b56 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3301,8 +3301,7 @@ static int __init vmx_init(void)
kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
VMX_EPT_WRITABLE_MASK |
VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT);
- kvm_mmu_set_mask_ptes(0ull, VMX_EPT_FAKE_ACCESSED_MASK,
- VMX_EPT_FAKE_DIRTY_MASK, 0ull,
+ kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
VMX_EPT_EXECUTABLE_MASK);
kvm_enable_tdp();
} else
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h
index b32d4e5b123..17e25995b65 100644
--- a/arch/x86/kvm/vmx.h
+++ b/arch/x86/kvm/vmx.h
@@ -355,8 +355,6 @@ enum vmcs_field {
#define VMX_EPT_READABLE_MASK 0x1ull
#define VMX_EPT_WRITABLE_MASK 0x2ull
#define VMX_EPT_EXECUTABLE_MASK 0x4ull
-#define VMX_EPT_FAKE_ACCESSED_MASK (1ull << 62)
-#define VMX_EPT_FAKE_DIRTY_MASK (1ull << 63)
#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul
diff --git a/arch/x86/lib/msr-on-cpu.c b/arch/x86/lib/msr-on-cpu.c
index 01b868ba82f..321cf720dbb 100644
--- a/arch/x86/lib/msr-on-cpu.c
+++ b/arch/x86/lib/msr-on-cpu.c
@@ -16,37 +16,46 @@ static void __rdmsr_on_cpu(void *info)
rdmsr(rv->msr_no, rv->l, rv->h);
}
-static void __rdmsr_safe_on_cpu(void *info)
+static void __wrmsr_on_cpu(void *info)
{
struct msr_info *rv = info;
- rv->err = rdmsr_safe(rv->msr_no, &rv->l, &rv->h);
+ wrmsr(rv->msr_no, rv->l, rv->h);
}
-static int _rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h, int safe)
+int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
{
- int err = 0;
+ int err;
struct msr_info rv;
rv.msr_no = msr_no;
- if (safe) {
- err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu,
- &rv, 1);
- err = err ? err : rv.err;
- } else {
- err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
- }
+ err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
*l = rv.l;
*h = rv.h;
return err;
}
-static void __wrmsr_on_cpu(void *info)
+int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
+{
+ int err;
+ struct msr_info rv;
+
+ rv.msr_no = msr_no;
+ rv.l = l;
+ rv.h = h;
+ err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
+
+ return err;
+}
+
+/* These "safe" variants are slower and should be used when the target MSR
+ may not actually exist. */
+static void __rdmsr_safe_on_cpu(void *info)
{
struct msr_info *rv = info;
- wrmsr(rv->msr_no, rv->l, rv->h);
+ rv->err = rdmsr_safe(rv->msr_no, &rv->l, &rv->h);
}
static void __wrmsr_safe_on_cpu(void *info)
@@ -56,45 +65,30 @@ static void __wrmsr_safe_on_cpu(void *info)
rv->err = wrmsr_safe(rv->msr_no, rv->l, rv->h);
}
-static int _wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h, int safe)
+int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
{
- int err = 0;
+ int err;
struct msr_info rv;
rv.msr_no = msr_no;
- rv.l = l;
- rv.h = h;
- if (safe) {
- err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu,
- &rv, 1);
- err = err ? err : rv.err;
- } else {
- err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
- }
-
- return err;
-}
+ err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
+ *l = rv.l;
+ *h = rv.h;
-int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
-{
- return _wrmsr_on_cpu(cpu, msr_no, l, h, 0);
+ return err ? err : rv.err;
}
-int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
-{
- return _rdmsr_on_cpu(cpu, msr_no, l, h, 0);
-}
-
-/* These "safe" variants are slower and should be used when the target MSR
- may not actually exist. */
int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
{
- return _wrmsr_on_cpu(cpu, msr_no, l, h, 1);
-}
+ int err;
+ struct msr_info rv;
-int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
-{
- return _rdmsr_on_cpu(cpu, msr_no, l, h, 1);
+ rv.msr_no = msr_no;
+ rv.l = l;
+ rv.h = h;
+ err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1);
+
+ return err ? err : rv.err;
}
EXPORT_SYMBOL(rdmsr_on_cpu);
diff --git a/arch/x86/lib/string_32.c b/arch/x86/lib/string_32.c
index 94972e7c094..82004d2bf05 100644
--- a/arch/x86/lib/string_32.c
+++ b/arch/x86/lib/string_32.c
@@ -22,7 +22,7 @@ char *strcpy(char *dest, const char *src)
"testb %%al,%%al\n\t"
"jne 1b"
: "=&S" (d0), "=&D" (d1), "=&a" (d2)
- :"0" (src), "1" (dest) : "memory");
+ : "0" (src), "1" (dest) : "memory");
return dest;
}
EXPORT_SYMBOL(strcpy);
@@ -42,7 +42,7 @@ char *strncpy(char *dest, const char *src, size_t count)
"stosb\n"
"2:"
: "=&S" (d0), "=&D" (d1), "=&c" (d2), "=&a" (d3)
- :"0" (src), "1" (dest), "2" (count) : "memory");
+ : "0" (src), "1" (dest), "2" (count) : "memory");
return dest;
}
EXPORT_SYMBOL(strncpy);
@@ -60,7 +60,7 @@ char *strcat(char *dest, const char *src)
"testb %%al,%%al\n\t"
"jne 1b"
: "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3)
- : "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu): "memory");
+ : "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu) : "memory");
return dest;
}
EXPORT_SYMBOL(strcat);
@@ -105,9 +105,9 @@ int strcmp(const char *cs, const char *ct)
"2:\tsbbl %%eax,%%eax\n\t"
"orb $1,%%al\n"
"3:"
- :"=a" (res), "=&S" (d0), "=&D" (d1)
- :"1" (cs), "2" (ct)
- :"memory");
+ : "=a" (res), "=&S" (d0), "=&D" (d1)
+ : "1" (cs), "2" (ct)
+ : "memory");
return res;
}
EXPORT_SYMBOL(strcmp);
@@ -130,9 +130,9 @@ int strncmp(const char *cs, const char *ct, size_t count)
"3:\tsbbl %%eax,%%eax\n\t"
"orb $1,%%al\n"
"4:"
- :"=a" (res), "=&S" (d0), "=&D" (d1), "=&c" (d2)
- :"1" (cs), "2" (ct), "3" (count)
- :"memory");
+ : "=a" (res), "=&S" (d0), "=&D" (d1), "=&c" (d2)
+ : "1" (cs), "2" (ct), "3" (count)
+ : "memory");
return res;
}
EXPORT_SYMBOL(strncmp);
@@ -152,9 +152,9 @@ char *strchr(const char *s, int c)
"movl $1,%1\n"
"2:\tmovl %1,%0\n\t"
"decl %0"
- :"=a" (res), "=&S" (d0)
- :"1" (s), "0" (c)
- :"memory");
+ : "=a" (res), "=&S" (d0)
+ : "1" (s), "0" (c)
+ : "memory");
return res;
}
EXPORT_SYMBOL(strchr);
@@ -169,9 +169,9 @@ size_t strlen(const char *s)
"scasb\n\t"
"notl %0\n\t"
"decl %0"
- :"=c" (res), "=&D" (d0)
- :"1" (s), "a" (0), "0" (0xffffffffu)
- :"memory");
+ : "=c" (res), "=&D" (d0)
+ : "1" (s), "a" (0), "0" (0xffffffffu)
+ : "memory");
return res;
}
EXPORT_SYMBOL(strlen);
@@ -189,9 +189,9 @@ void *memchr(const void *cs, int c, size_t count)
"je 1f\n\t"
"movl $1,%0\n"
"1:\tdecl %0"
- :"=D" (res), "=&c" (d0)
- :"a" (c), "0" (cs), "1" (count)
- :"memory");
+ : "=D" (res), "=&c" (d0)
+ : "a" (c), "0" (cs), "1" (count)
+ : "memory");
return res;
}
EXPORT_SYMBOL(memchr);
@@ -228,9 +228,9 @@ size_t strnlen(const char *s, size_t count)
"cmpl $-1,%1\n\t"
"jne 1b\n"
"3:\tsubl %2,%0"
- :"=a" (res), "=&d" (d0)
- :"c" (s), "1" (count)
- :"memory");
+ : "=a" (res), "=&d" (d0)
+ : "c" (s), "1" (count)
+ : "memory");
return res;
}
EXPORT_SYMBOL(strnlen);
diff --git a/arch/x86/lib/strstr_32.c b/arch/x86/lib/strstr_32.c
index 42e8a50303f..8e2d55f754b 100644
--- a/arch/x86/lib/strstr_32.c
+++ b/arch/x86/lib/strstr_32.c
@@ -23,9 +23,9 @@ __asm__ __volatile__(
"jne 1b\n\t"
"xorl %%eax,%%eax\n\t"
"2:"
- :"=a" (__res), "=&c" (d0), "=&S" (d1)
- :"0" (0), "1" (0xffffffff), "2" (cs), "g" (ct)
- :"dx", "di");
+ : "=a" (__res), "=&c" (d0), "=&S" (d1)
+ : "0" (0), "1" (0xffffffff), "2" (cs), "g" (ct)
+ : "dx", "di");
return __res;
}
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 62fa440678d..847c164725f 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -328,7 +328,7 @@ void __init initmem_init(unsigned long start_pfn,
get_memcfg_numa();
- kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE);
+ kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE);
kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
do {
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index a20d1fa64b4..e7277cbcfb4 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -148,8 +148,8 @@ static void note_page(struct seq_file *m, struct pg_state *st,
* we have now. "break" is either changing perms, levels or
* address space marker.
*/
- prot = pgprot_val(new_prot) & ~(PTE_PFN_MASK);
- cur = pgprot_val(st->current_prot) & ~(PTE_PFN_MASK);
+ prot = pgprot_val(new_prot) & PTE_FLAGS_MASK;
+ cur = pgprot_val(st->current_prot) & PTE_FLAGS_MASK;
if (!st->level) {
/* First entry */
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 4974e97dedf..c3789bb1930 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -195,11 +195,30 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
pgd_t *pgd;
pmd_t *pmd;
pte_t *pte;
- unsigned pages_2m = 0, pages_4k = 0;
+ unsigned pages_2m, pages_4k;
+ int mapping_iter;
+
+ /*
+ * First iteration will setup identity mapping using large/small pages
+ * based on use_pse, with other attributes same as set by
+ * the early code in head_32.S
+ *
+ * Second iteration will setup the appropriate attributes (NX, GLOBAL..)
+ * as desired for the kernel identity mapping.
+ *
+ * This two pass mechanism conforms to the TLB app note which says:
+ *
+ * "Software should not write to a paging-structure entry in a way
+ * that would change, for any linear address, both the page size
+ * and either the page frame or attributes."
+ */
+ mapping_iter = 1;
if (!cpu_has_pse)
use_pse = 0;
+repeat:
+ pages_2m = pages_4k = 0;
pfn = start_pfn;
pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
pgd = pgd_base + pgd_idx;
@@ -225,6 +244,13 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
if (use_pse) {
unsigned int addr2;
pgprot_t prot = PAGE_KERNEL_LARGE;
+ /*
+ * first pass will use the same initial
+ * identity mapping attribute + _PAGE_PSE.
+ */
+ pgprot_t init_prot =
+ __pgprot(PTE_IDENT_ATTR |
+ _PAGE_PSE);
addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
PAGE_OFFSET + PAGE_SIZE-1;
@@ -234,7 +260,10 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
prot = PAGE_KERNEL_LARGE_EXEC;
pages_2m++;
- set_pmd(pmd, pfn_pmd(pfn, prot));
+ if (mapping_iter == 1)
+ set_pmd(pmd, pfn_pmd(pfn, init_prot));
+ else
+ set_pmd(pmd, pfn_pmd(pfn, prot));
pfn += PTRS_PER_PTE;
continue;
@@ -246,17 +275,43 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
pgprot_t prot = PAGE_KERNEL;
+ /*
+ * first pass will use the same initial
+ * identity mapping attribute.
+ */
+ pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR);
if (is_kernel_text(addr))
prot = PAGE_KERNEL_EXEC;
pages_4k++;
- set_pte(pte, pfn_pte(pfn, prot));
+ if (mapping_iter == 1)
+ set_pte(pte, pfn_pte(pfn, init_prot));
+ else
+ set_pte(pte, pfn_pte(pfn, prot));
}
}
}
- update_page_count(PG_LEVEL_2M, pages_2m);
- update_page_count(PG_LEVEL_4K, pages_4k);
+ if (mapping_iter == 1) {
+ /*
+ * update direct mapping page count only in the first
+ * iteration.
+ */
+ update_page_count(PG_LEVEL_2M, pages_2m);
+ update_page_count(PG_LEVEL_4K, pages_4k);
+
+ /*
+ * local global flush tlb, which will flush the previous
+ * mappings present in both small and large page TLB's.
+ */
+ __flush_tlb_all();
+
+ /*
+ * Second iteration will set the actual desired PTE attributes.
+ */
+ mapping_iter = 2;
+ goto repeat;
+ }
}
/*
@@ -459,11 +514,7 @@ static void __init pagetable_init(void)
{
pgd_t *pgd_base = swapper_pg_dir;
- paravirt_pagetable_setup_start(pgd_base);
-
permanent_kmaps_init(pgd_base);
-
- paravirt_pagetable_setup_done(pgd_base);
}
#ifdef CONFIG_ACPI_SLEEP
@@ -723,7 +774,7 @@ void __init setup_bootmem_allocator(void)
after_init_bootmem = 1;
}
-static void __init find_early_table_space(unsigned long end)
+static void __init find_early_table_space(unsigned long end, int use_pse)
{
unsigned long puds, pmds, ptes, tables, start;
@@ -733,7 +784,7 @@ static void __init find_early_table_space(unsigned long end)
pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
tables += PAGE_ALIGN(pmds * sizeof(pmd_t));
- if (cpu_has_pse) {
+ if (use_pse) {
unsigned long extra;
extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
@@ -773,12 +824,22 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
pgd_t *pgd_base = swapper_pg_dir;
unsigned long start_pfn, end_pfn;
unsigned long big_page_start;
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ /*
+ * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
+ * This will simplify cpa(), which otherwise needs to support splitting
+ * large pages into small in interrupt context, etc.
+ */
+ int use_pse = 0;
+#else
+ int use_pse = cpu_has_pse;
+#endif
/*
* Find space for the kernel direct mapping tables.
*/
if (!after_init_bootmem)
- find_early_table_space(end);
+ find_early_table_space(end, use_pse);
#ifdef CONFIG_X86_PAE
set_nx();
@@ -824,7 +885,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
if (start_pfn < end_pfn)
kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
- cpu_has_pse);
+ use_pse);
/* tail is not big page alignment ? */
start_pfn = end_pfn;
@@ -987,7 +1048,6 @@ void __init mem_init(void)
if (boot_cpu_data.wp_works_ok < 0)
test_wp_bit();
- cpa_init();
save_pg_dir();
zap_low_mappings();
}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 1f6806b62eb..83e13f2d53d 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -281,7 +281,7 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
void __init cleanup_highmap(void)
{
unsigned long vaddr = __START_KERNEL_map;
- unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1;
+ unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1;
pmd_t *pmd = level2_kernel_pgt;
pmd_t *last_pmd = pmd + PTRS_PER_PMD;
@@ -327,7 +327,8 @@ static __ref void unmap_low_page(void *adr)
}
static unsigned long __meminit
-phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
+phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
+ pgprot_t prot)
{
unsigned pages = 0;
unsigned long last_map_addr = end;
@@ -345,36 +346,43 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
break;
}
+ /*
+ * We will re-use the existing mapping.
+ * Xen for example has some special requirements, like mapping
+ * pagetable pages as RO. So assume someone who pre-setup
+ * these mappings are more intelligent.
+ */
if (pte_val(*pte))
continue;
if (0)
printk(" pte=%p addr=%lx pte=%016lx\n",
pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
- set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL));
- last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
pages++;
+ set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot));
+ last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
}
+
update_page_count(PG_LEVEL_4K, pages);
return last_map_addr;
}
static unsigned long __meminit
-phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end)
+phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end,
+ pgprot_t prot)
{
pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
- return phys_pte_init(pte, address, end);
+ return phys_pte_init(pte, address, end, prot);
}
static unsigned long __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
- unsigned long page_size_mask)
+ unsigned long page_size_mask, pgprot_t prot)
{
unsigned long pages = 0;
unsigned long last_map_addr = end;
- unsigned long start = address;
int i = pmd_index(address);
@@ -382,6 +390,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
unsigned long pte_phys;
pmd_t *pmd = pmd_page + pmd_index(address);
pte_t *pte;
+ pgprot_t new_prot = prot;
if (address >= end) {
if (!after_bootmem) {
@@ -395,27 +404,40 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
if (!pmd_large(*pmd)) {
spin_lock(&init_mm.page_table_lock);
last_map_addr = phys_pte_update(pmd, address,
- end);
+ end, prot);
spin_unlock(&init_mm.page_table_lock);
+ continue;
}
- /* Count entries we're using from level2_ident_pgt */
- if (start == 0)
- pages++;
- continue;
+ /*
+ * If we are ok with PG_LEVEL_2M mapping, then we will
+ * use the existing mapping,
+ *
+ * Otherwise, we will split the large page mapping but
+ * use the same existing protection bits except for
+ * large page, so that we don't violate Intel's TLB
+ * Application note (317080) which says, while changing
+ * the page sizes, new and old translations should
+ * not differ with respect to page frame and
+ * attributes.
+ */
+ if (page_size_mask & (1 << PG_LEVEL_2M))
+ continue;
+ new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
}
if (page_size_mask & (1<<PG_LEVEL_2M)) {
pages++;
spin_lock(&init_mm.page_table_lock);
set_pte((pte_t *)pmd,
- pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
+ pfn_pte(address >> PAGE_SHIFT,
+ __pgprot(pgprot_val(prot) | _PAGE_PSE)));
spin_unlock(&init_mm.page_table_lock);
last_map_addr = (address & PMD_MASK) + PMD_SIZE;
continue;
}
pte = alloc_low_page(&pte_phys);
- last_map_addr = phys_pte_init(pte, address, end);
+ last_map_addr = phys_pte_init(pte, address, end, new_prot);
unmap_low_page(pte);
spin_lock(&init_mm.page_table_lock);
@@ -428,12 +450,12 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
static unsigned long __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
- unsigned long page_size_mask)
+ unsigned long page_size_mask, pgprot_t prot)
{
pmd_t *pmd = pmd_offset(pud, 0);
unsigned long last_map_addr;
- last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask);
+ last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot);
__flush_tlb_all();
return last_map_addr;
}
@@ -450,6 +472,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
unsigned long pmd_phys;
pud_t *pud = pud_page + pud_index(addr);
pmd_t *pmd;
+ pgprot_t prot = PAGE_KERNEL;
if (addr >= end)
break;
@@ -461,10 +484,26 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
}
if (pud_val(*pud)) {
- if (!pud_large(*pud))
+ if (!pud_large(*pud)) {
last_map_addr = phys_pmd_update(pud, addr, end,
- page_size_mask);
- continue;
+ page_size_mask, prot);
+ continue;
+ }
+ /*
+ * If we are ok with PG_LEVEL_1G mapping, then we will
+ * use the existing mapping.
+ *
+ * Otherwise, we will split the gbpage mapping but use
+ * the same existing protection bits except for large
+ * page, so that we don't violate Intel's TLB
+ * Application note (317080) which says, while changing
+ * the page sizes, new and old translations should
+ * not differ with respect to page frame and
+ * attributes.
+ */
+ if (page_size_mask & (1 << PG_LEVEL_1G))
+ continue;
+ prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
}
if (page_size_mask & (1<<PG_LEVEL_1G)) {
@@ -478,7 +517,8 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
}
pmd = alloc_low_page(&pmd_phys);
- last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask);
+ last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
+ prot);
unmap_low_page(pmd);
spin_lock(&init_mm.page_table_lock);
@@ -486,6 +526,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
spin_unlock(&init_mm.page_table_lock);
}
__flush_tlb_all();
+
update_page_count(PG_LEVEL_1G, pages);
return last_map_addr;
@@ -502,27 +543,28 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
return phys_pud_init(pud, addr, end, page_size_mask);
}
-static void __init find_early_table_space(unsigned long end)
+static void __init find_early_table_space(unsigned long end, int use_pse,
+ int use_gbpages)
{
unsigned long puds, pmds, ptes, tables, start;
puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
- tables = round_up(puds * sizeof(pud_t), PAGE_SIZE);
- if (direct_gbpages) {
+ tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
+ if (use_gbpages) {
unsigned long extra;
extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
} else
pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
- tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
+ tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
- if (cpu_has_pse) {
+ if (use_pse) {
unsigned long extra;
extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
} else
ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
- tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE);
+ tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
/*
* RED-PEN putting page tables only on node 0 could
@@ -584,6 +626,7 @@ static unsigned long __init kernel_physical_mapping_init(unsigned long start,
pgd_populate(&init_mm, pgd, __va(pud_phys));
spin_unlock(&init_mm.page_table_lock);
}
+ __flush_tlb_all();
return last_map_addr;
}
@@ -627,6 +670,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
struct map_range mr[NR_RANGE_MR];
int nr_range, i;
+ int use_pse, use_gbpages;
printk(KERN_INFO "init_memory_mapping\n");
@@ -640,9 +684,21 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
if (!after_bootmem)
init_gbpages();
- if (direct_gbpages)
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ /*
+ * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
+ * This will simplify cpa(), which otherwise needs to support splitting
+ * large pages into small in interrupt context, etc.
+ */
+ use_pse = use_gbpages = 0;
+#else
+ use_pse = cpu_has_pse;
+ use_gbpages = direct_gbpages;
+#endif
+
+ if (use_gbpages)
page_size_mask |= 1 << PG_LEVEL_1G;
- if (cpu_has_pse)
+ if (use_pse)
page_size_mask |= 1 << PG_LEVEL_2M;
memset(mr, 0, sizeof(mr));
@@ -703,7 +759,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
(mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
if (!after_bootmem)
- find_early_table_space(end);
+ find_early_table_space(end, use_pse, use_gbpages);
for (i = 0; i < nr_range; i++)
last_map_addr = kernel_physical_mapping_init(
@@ -862,8 +918,6 @@ void __init mem_init(void)
reservedpages << (PAGE_SHIFT-10),
datasize >> 10,
initsize >> 10);
-
- cpa_init();
}
void free_init_pages(char *what, unsigned long begin, unsigned long end)
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index cac6da54203..6ab3196d12b 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -83,6 +83,25 @@ int page_is_ram(unsigned long pagenr)
return 0;
}
+int pagerange_is_ram(unsigned long start, unsigned long end)
+{
+ int ram_page = 0, not_rampage = 0;
+ unsigned long page_nr;
+
+ for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT);
+ ++page_nr) {
+ if (page_is_ram(page_nr))
+ ram_page = 1;
+ else
+ not_rampage = 1;
+
+ if (ram_page == not_rampage)
+ return -1;
+ }
+
+ return ram_page;
+}
+
/*
* Fix up the linear direct mapping of the kernel to avoid cache attribute
* conflicts.
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index a4dd793d600..cebcbf152d4 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -79,7 +79,7 @@ static int __init allocate_cachealigned_memnodemap(void)
return 0;
addr = 0x8000;
- nodemap_size = round_up(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
+ nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT,
nodemap_size, L1_CACHE_BYTES);
if (nodemap_addr == -1UL) {
@@ -176,10 +176,10 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size;
unsigned long bootmap_start, nodedata_phys;
void *bootmap;
- const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
+ const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
int nid;
- start = round_up(start, ZONE_ALIGN);
+ start = roundup(start, ZONE_ALIGN);
printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
start, end);
@@ -210,9 +210,9 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
nid = phys_to_nid(nodedata_phys);
if (nid == nodeid)
- bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
+ bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
else
- bootmap_start = round_up(start, PAGE_SIZE);
+ bootmap_start = roundup(start, PAGE_SIZE);
/*
* SMP_CACHE_BYTES could be enough, but init_bootmem_node like
* to use that to align to PAGE_SIZE
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index d4aa503caaa..e1d10690921 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -32,7 +32,7 @@ enum {
GPS = (1<<30)
};
-#define PAGE_TESTBIT __pgprot(_PAGE_UNUSED1)
+#define PAGE_CPA_TEST __pgprot(_PAGE_CPA_TEST)
static int pte_testbit(pte_t pte)
{
@@ -118,6 +118,7 @@ static int pageattr_test(void)
unsigned int level;
int i, k;
int err;
+ unsigned long test_addr;
if (print)
printk(KERN_INFO "CPA self-test:\n");
@@ -172,7 +173,8 @@ static int pageattr_test(void)
continue;
}
- err = change_page_attr_set(addr[i], len[i], PAGE_TESTBIT);
+ test_addr = addr[i];
+ err = change_page_attr_set(&test_addr, len[i], PAGE_CPA_TEST, 0);
if (err < 0) {
printk(KERN_ERR "CPA %d failed %d\n", i, err);
failed++;
@@ -204,7 +206,8 @@ static int pageattr_test(void)
failed++;
continue;
}
- err = change_page_attr_clear(addr[i], len[i], PAGE_TESTBIT);
+ test_addr = addr[i];
+ err = change_page_attr_clear(&test_addr, len[i], PAGE_CPA_TEST, 0);
if (err < 0) {
printk(KERN_ERR "CPA reverting failed: %d\n", err);
failed++;
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 43e2f8483e4..a9ec89c3fbc 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -25,15 +25,27 @@
* The current flushing context - we pass it instead of 5 arguments:
*/
struct cpa_data {
- unsigned long vaddr;
+ unsigned long *vaddr;
pgprot_t mask_set;
pgprot_t mask_clr;
int numpages;
- int flushtlb;
+ int flags;
unsigned long pfn;
unsigned force_split : 1;
+ int curpage;
};
+/*
+ * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings)
+ * using cpa_lock. So that we don't allow any other cpu, with stale large tlb
+ * entries change the page attribute in parallel to some other cpu
+ * splitting a large page entry along with changing the attribute.
+ */
+static DEFINE_SPINLOCK(cpa_lock);
+
+#define CPA_FLUSHTLB 1
+#define CPA_ARRAY 2
+
#ifdef CONFIG_PROC_FS
static unsigned long direct_pages_count[PG_LEVEL_NUM];
@@ -84,7 +96,7 @@ static inline unsigned long highmap_start_pfn(void)
static inline unsigned long highmap_end_pfn(void)
{
- return __pa(round_up((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT;
+ return __pa(roundup((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT;
}
#endif
@@ -190,6 +202,41 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
}
}
+static void cpa_flush_array(unsigned long *start, int numpages, int cache)
+{
+ unsigned int i, level;
+ unsigned long *addr;
+
+ BUG_ON(irqs_disabled());
+
+ on_each_cpu(__cpa_flush_range, NULL, 1);
+
+ if (!cache)
+ return;
+
+ /* 4M threshold */
+ if (numpages >= 1024) {
+ if (boot_cpu_data.x86_model >= 4)
+ wbinvd();
+ return;
+ }
+ /*
+ * We only need to flush on one CPU,
+ * clflush is a MESI-coherent instruction that
+ * will cause all other CPUs to flush the same
+ * cachelines:
+ */
+ for (i = 0, addr = start; i < numpages; i++, addr++) {
+ pte_t *pte = lookup_address(*addr, &level);
+
+ /*
+ * Only flush present addresses:
+ */
+ if (pte && (pte_val(*pte) & _PAGE_PRESENT))
+ clflush_cache_range((void *) *addr, PAGE_SIZE);
+ }
+}
+
/*
* Certain areas of memory on x86 require very specific protection flags,
* for example the BIOS area or kernel text. Callers don't always get this
@@ -398,7 +445,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
*/
new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot));
__set_pmd_pte(kpte, address, new_pte);
- cpa->flushtlb = 1;
+ cpa->flags |= CPA_FLUSHTLB;
do_split = 0;
}
@@ -408,84 +455,6 @@ out_unlock:
return do_split;
}
-static LIST_HEAD(page_pool);
-static unsigned long pool_size, pool_pages, pool_low;
-static unsigned long pool_used, pool_failed;
-
-static void cpa_fill_pool(struct page **ret)
-{
- gfp_t gfp = GFP_KERNEL;
- unsigned long flags;
- struct page *p;
-
- /*
- * Avoid recursion (on debug-pagealloc) and also signal
- * our priority to get to these pagetables:
- */
- if (current->flags & PF_MEMALLOC)
- return;
- current->flags |= PF_MEMALLOC;
-
- /*
- * Allocate atomically from atomic contexts:
- */
- if (in_atomic() || irqs_disabled() || debug_pagealloc)
- gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN;
-
- while (pool_pages < pool_size || (ret && !*ret)) {
- p = alloc_pages(gfp, 0);
- if (!p) {
- pool_failed++;
- break;
- }
- /*
- * If the call site needs a page right now, provide it:
- */
- if (ret && !*ret) {
- *ret = p;
- continue;
- }
- spin_lock_irqsave(&pgd_lock, flags);
- list_add(&p->lru, &page_pool);
- pool_pages++;
- spin_unlock_irqrestore(&pgd_lock, flags);
- }
-
- current->flags &= ~PF_MEMALLOC;
-}
-
-#define SHIFT_MB (20 - PAGE_SHIFT)
-#define ROUND_MB_GB ((1 << 10) - 1)
-#define SHIFT_MB_GB 10
-#define POOL_PAGES_PER_GB 16
-
-void __init cpa_init(void)
-{
- struct sysinfo si;
- unsigned long gb;
-
- si_meminfo(&si);
- /*
- * Calculate the number of pool pages:
- *
- * Convert totalram (nr of pages) to MiB and round to the next
- * GiB. Shift MiB to Gib and multiply the result by
- * POOL_PAGES_PER_GB:
- */
- if (debug_pagealloc) {
- gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB;
- pool_size = POOL_PAGES_PER_GB * gb;
- } else {
- pool_size = 1;
- }
- pool_low = pool_size;
-
- cpa_fill_pool(NULL);
- printk(KERN_DEBUG
- "CPA: page pool initialized %lu of %lu pages preallocated\n",
- pool_pages, pool_size);
-}
-
static int split_large_page(pte_t *kpte, unsigned long address)
{
unsigned long flags, pfn, pfninc = 1;
@@ -494,28 +463,15 @@ static int split_large_page(pte_t *kpte, unsigned long address)
pgprot_t ref_prot;
struct page *base;
- /*
- * Get a page from the pool. The pool list is protected by the
- * pgd_lock, which we have to take anyway for the split
- * operation:
- */
- spin_lock_irqsave(&pgd_lock, flags);
- if (list_empty(&page_pool)) {
- spin_unlock_irqrestore(&pgd_lock, flags);
- base = NULL;
- cpa_fill_pool(&base);
- if (!base)
- return -ENOMEM;
- spin_lock_irqsave(&pgd_lock, flags);
- } else {
- base = list_first_entry(&page_pool, struct page, lru);
- list_del(&base->lru);
- pool_pages--;
-
- if (pool_pages < pool_low)
- pool_low = pool_pages;
- }
+ if (!debug_pagealloc)
+ spin_unlock(&cpa_lock);
+ base = alloc_pages(GFP_KERNEL, 0);
+ if (!debug_pagealloc)
+ spin_lock(&cpa_lock);
+ if (!base)
+ return -ENOMEM;
+ spin_lock_irqsave(&pgd_lock, flags);
/*
* Check for races, another CPU might have split this page
* up for us already:
@@ -572,11 +528,8 @@ out_unlock:
* If we dropped out via the lookup_address check under
* pgd_lock then stick the page back into the pool:
*/
- if (base) {
- list_add(&base->lru, &page_pool);
- pool_pages++;
- } else
- pool_used++;
+ if (base)
+ __free_page(base);
spin_unlock_irqrestore(&pgd_lock, flags);
return 0;
@@ -584,11 +537,16 @@ out_unlock:
static int __change_page_attr(struct cpa_data *cpa, int primary)
{
- unsigned long address = cpa->vaddr;
+ unsigned long address;
int do_split, err;
unsigned int level;
pte_t *kpte, old_pte;
+ if (cpa->flags & CPA_ARRAY)
+ address = cpa->vaddr[cpa->curpage];
+ else
+ address = *cpa->vaddr;
+
repeat:
kpte = lookup_address(address, &level);
if (!kpte)
@@ -600,7 +558,7 @@ repeat:
return 0;
WARN(1, KERN_WARNING "CPA: called for zero pte. "
"vaddr = %lx cpa->vaddr = %lx\n", address,
- cpa->vaddr);
+ *cpa->vaddr);
return -EINVAL;
}
@@ -626,7 +584,7 @@ repeat:
*/
if (pte_val(old_pte) != pte_val(new_pte)) {
set_pte_atomic(kpte, new_pte);
- cpa->flushtlb = 1;
+ cpa->flags |= CPA_FLUSHTLB;
}
cpa->numpages = 1;
return 0;
@@ -650,7 +608,25 @@ repeat:
*/
err = split_large_page(kpte, address);
if (!err) {
- cpa->flushtlb = 1;
+ /*
+ * Do a global flush tlb after splitting the large page
+ * and before we do the actual change page attribute in the PTE.
+ *
+ * With out this, we violate the TLB application note, that says
+ * "The TLBs may contain both ordinary and large-page
+ * translations for a 4-KByte range of linear addresses. This
+ * may occur if software modifies the paging structures so that
+ * the page size used for the address range changes. If the two
+ * translations differ with respect to page frame or attributes
+ * (e.g., permissions), processor behavior is undefined and may
+ * be implementation-specific."
+ *
+ * We do this global tlb flush inside the cpa_lock, so that we
+ * don't allow any other cpu, with stale tlb entries change the
+ * page attribute in parallel, that also falls into the
+ * just split large page entry.
+ */
+ flush_tlb_all();
goto repeat;
}
@@ -663,6 +639,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
{
struct cpa_data alias_cpa;
int ret = 0;
+ unsigned long temp_cpa_vaddr, vaddr;
if (cpa->pfn >= max_pfn_mapped)
return 0;
@@ -675,16 +652,24 @@ static int cpa_process_alias(struct cpa_data *cpa)
* No need to redo, when the primary call touched the direct
* mapping already:
*/
- if (!(within(cpa->vaddr, PAGE_OFFSET,
+ if (cpa->flags & CPA_ARRAY)
+ vaddr = cpa->vaddr[cpa->curpage];
+ else
+ vaddr = *cpa->vaddr;
+
+ if (!(within(vaddr, PAGE_OFFSET,
PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT))
#ifdef CONFIG_X86_64
- || within(cpa->vaddr, PAGE_OFFSET + (1UL<<32),
+ || within(vaddr, PAGE_OFFSET + (1UL<<32),
PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))
#endif
)) {
alias_cpa = *cpa;
- alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
+ temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
+ alias_cpa.vaddr = &temp_cpa_vaddr;
+ alias_cpa.flags &= ~CPA_ARRAY;
+
ret = __change_page_attr_set_clr(&alias_cpa, 0);
}
@@ -696,7 +681,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
* No need to redo, when the primary call touched the high
* mapping already:
*/
- if (within(cpa->vaddr, (unsigned long) _text, (unsigned long) _end))
+ if (within(vaddr, (unsigned long) _text, (unsigned long) _end))
return 0;
/*
@@ -707,8 +692,9 @@ static int cpa_process_alias(struct cpa_data *cpa)
return 0;
alias_cpa = *cpa;
- alias_cpa.vaddr =
- (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base;
+ temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base;
+ alias_cpa.vaddr = &temp_cpa_vaddr;
+ alias_cpa.flags &= ~CPA_ARRAY;
/*
* The high mapping range is imprecise, so ignore the return value.
@@ -728,8 +714,15 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
* preservation check.
*/
cpa->numpages = numpages;
+ /* for array changes, we can't use large page */
+ if (cpa->flags & CPA_ARRAY)
+ cpa->numpages = 1;
+ if (!debug_pagealloc)
+ spin_lock(&cpa_lock);
ret = __change_page_attr(cpa, checkalias);
+ if (!debug_pagealloc)
+ spin_unlock(&cpa_lock);
if (ret)
return ret;
@@ -746,7 +739,11 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
*/
BUG_ON(cpa->numpages > numpages);
numpages -= cpa->numpages;
- cpa->vaddr += cpa->numpages * PAGE_SIZE;
+ if (cpa->flags & CPA_ARRAY)
+ cpa->curpage++;
+ else
+ *cpa->vaddr += cpa->numpages * PAGE_SIZE;
+
}
return 0;
}
@@ -757,9 +754,9 @@ static inline int cache_attr(pgprot_t attr)
(_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD);
}
-static int change_page_attr_set_clr(unsigned long addr, int numpages,
+static int change_page_attr_set_clr(unsigned long *addr, int numpages,
pgprot_t mask_set, pgprot_t mask_clr,
- int force_split)
+ int force_split, int array)
{
struct cpa_data cpa;
int ret, cache, checkalias;
@@ -774,21 +771,38 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
return 0;
/* Ensure we are PAGE_SIZE aligned */
- if (addr & ~PAGE_MASK) {
- addr &= PAGE_MASK;
- /*
- * People should not be passing in unaligned addresses:
- */
- WARN_ON_ONCE(1);
+ if (!array) {
+ if (*addr & ~PAGE_MASK) {
+ *addr &= PAGE_MASK;
+ /*
+ * People should not be passing in unaligned addresses:
+ */
+ WARN_ON_ONCE(1);
+ }
+ } else {
+ int i;
+ for (i = 0; i < numpages; i++) {
+ if (addr[i] & ~PAGE_MASK) {
+ addr[i] &= PAGE_MASK;
+ WARN_ON_ONCE(1);
+ }
+ }
}
+ /* Must avoid aliasing mappings in the highmem code */
+ kmap_flush_unused();
+
cpa.vaddr = addr;
cpa.numpages = numpages;
cpa.mask_set = mask_set;
cpa.mask_clr = mask_clr;
- cpa.flushtlb = 0;
+ cpa.flags = 0;
+ cpa.curpage = 0;
cpa.force_split = force_split;
+ if (array)
+ cpa.flags |= CPA_ARRAY;
+
/* No alias checking for _NX bit modifications */
checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
@@ -797,7 +811,7 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
/*
* Check whether we really changed something:
*/
- if (!cpa.flushtlb)
+ if (!(cpa.flags & CPA_FLUSHTLB))
goto out;
/*
@@ -812,27 +826,30 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
* error case we fall back to cpa_flush_all (which uses
* wbindv):
*/
- if (!ret && cpu_has_clflush)
- cpa_flush_range(addr, numpages, cache);
- else
+ if (!ret && cpu_has_clflush) {
+ if (cpa.flags & CPA_ARRAY)
+ cpa_flush_array(addr, numpages, cache);
+ else
+ cpa_flush_range(*addr, numpages, cache);
+ } else
cpa_flush_all(cache);
out:
- cpa_fill_pool(NULL);
-
return ret;
}
-static inline int change_page_attr_set(unsigned long addr, int numpages,
- pgprot_t mask)
+static inline int change_page_attr_set(unsigned long *addr, int numpages,
+ pgprot_t mask, int array)
{
- return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0);
+ return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
+ array);
}
-static inline int change_page_attr_clear(unsigned long addr, int numpages,
- pgprot_t mask)
+static inline int change_page_attr_clear(unsigned long *addr, int numpages,
+ pgprot_t mask, int array)
{
- return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0);
+ return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
+ array);
}
int _set_memory_uc(unsigned long addr, int numpages)
@@ -840,8 +857,8 @@ int _set_memory_uc(unsigned long addr, int numpages)
/*
* for now UC MINUS. see comments in ioremap_nocache()
*/
- return change_page_attr_set(addr, numpages,
- __pgprot(_PAGE_CACHE_UC_MINUS));
+ return change_page_attr_set(&addr, numpages,
+ __pgprot(_PAGE_CACHE_UC_MINUS), 0);
}
int set_memory_uc(unsigned long addr, int numpages)
@@ -857,10 +874,48 @@ int set_memory_uc(unsigned long addr, int numpages)
}
EXPORT_SYMBOL(set_memory_uc);
+int set_memory_array_uc(unsigned long *addr, int addrinarray)
+{
+ unsigned long start;
+ unsigned long end;
+ int i;
+ /*
+ * for now UC MINUS. see comments in ioremap_nocache()
+ */
+ for (i = 0; i < addrinarray; i++) {
+ start = __pa(addr[i]);
+ for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
+ if (end != __pa(addr[i + 1]))
+ break;
+ i++;
+ }
+ if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL))
+ goto out;
+ }
+
+ return change_page_attr_set(addr, addrinarray,
+ __pgprot(_PAGE_CACHE_UC_MINUS), 1);
+out:
+ for (i = 0; i < addrinarray; i++) {
+ unsigned long tmp = __pa(addr[i]);
+
+ if (tmp == start)
+ break;
+ for (end = tmp + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
+ if (end != __pa(addr[i + 1]))
+ break;
+ i++;
+ }
+ free_memtype(tmp, end);
+ }
+ return -EINVAL;
+}
+EXPORT_SYMBOL(set_memory_array_uc);
+
int _set_memory_wc(unsigned long addr, int numpages)
{
- return change_page_attr_set(addr, numpages,
- __pgprot(_PAGE_CACHE_WC));
+ return change_page_attr_set(&addr, numpages,
+ __pgprot(_PAGE_CACHE_WC), 0);
}
int set_memory_wc(unsigned long addr, int numpages)
@@ -878,8 +933,8 @@ EXPORT_SYMBOL(set_memory_wc);
int _set_memory_wb(unsigned long addr, int numpages)
{
- return change_page_attr_clear(addr, numpages,
- __pgprot(_PAGE_CACHE_MASK));
+ return change_page_attr_clear(&addr, numpages,
+ __pgprot(_PAGE_CACHE_MASK), 0);
}
int set_memory_wb(unsigned long addr, int numpages)
@@ -890,37 +945,59 @@ int set_memory_wb(unsigned long addr, int numpages)
}
EXPORT_SYMBOL(set_memory_wb);
+int set_memory_array_wb(unsigned long *addr, int addrinarray)
+{
+ int i;
+
+ for (i = 0; i < addrinarray; i++) {
+ unsigned long start = __pa(addr[i]);
+ unsigned long end;
+
+ for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
+ if (end != __pa(addr[i + 1]))
+ break;
+ i++;
+ }
+ free_memtype(start, end);
+ }
+ return change_page_attr_clear(addr, addrinarray,
+ __pgprot(_PAGE_CACHE_MASK), 1);
+}
+EXPORT_SYMBOL(set_memory_array_wb);
+
int set_memory_x(unsigned long addr, int numpages)
{
- return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX));
+ return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
}
EXPORT_SYMBOL(set_memory_x);
int set_memory_nx(unsigned long addr, int numpages)
{
- return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX));
+ return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
}
EXPORT_SYMBOL(set_memory_nx);
int set_memory_ro(unsigned long addr, int numpages)
{
- return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW));
+ return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
}
+EXPORT_SYMBOL_GPL(set_memory_ro);
int set_memory_rw(unsigned long addr, int numpages)
{
- return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW));
+ return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
}
+EXPORT_SYMBOL_GPL(set_memory_rw);
int set_memory_np(unsigned long addr, int numpages)
{
- return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT));
+ return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
}
int set_memory_4k(unsigned long addr, int numpages)
{
- return change_page_attr_set_clr(addr, numpages, __pgprot(0),
- __pgprot(0), 1);
+ return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
+ __pgprot(0), 1, 0);
}
int set_pages_uc(struct page *page, int numpages)
@@ -973,22 +1050,38 @@ int set_pages_rw(struct page *page, int numpages)
static int __set_pages_p(struct page *page, int numpages)
{
- struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
+ unsigned long tempaddr = (unsigned long) page_address(page);
+ struct cpa_data cpa = { .vaddr = &tempaddr,
.numpages = numpages,
.mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
- .mask_clr = __pgprot(0)};
+ .mask_clr = __pgprot(0),
+ .flags = 0};
- return __change_page_attr_set_clr(&cpa, 1);
+ /*
+ * No alias checking needed for setting present flag. otherwise,
+ * we may need to break large pages for 64-bit kernel text
+ * mappings (this adds to complexity if we want to do this from
+ * atomic context especially). Let's keep it simple!
+ */
+ return __change_page_attr_set_clr(&cpa, 0);
}
static int __set_pages_np(struct page *page, int numpages)
{
- struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
+ unsigned long tempaddr = (unsigned long) page_address(page);
+ struct cpa_data cpa = { .vaddr = &tempaddr,
.numpages = numpages,
.mask_set = __pgprot(0),
- .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)};
+ .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
+ .flags = 0};
- return __change_page_attr_set_clr(&cpa, 1);
+ /*
+ * No alias checking needed for setting not present flag. otherwise,
+ * we may need to break large pages for 64-bit kernel text
+ * mappings (this adds to complexity if we want to do this from
+ * atomic context especially). Let's keep it simple!
+ */
+ return __change_page_attr_set_clr(&cpa, 0);
}
void kernel_map_pages(struct page *page, int numpages, int enable)
@@ -1008,11 +1101,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
/*
* The return value is ignored as the calls cannot fail.
- * Large pages are kept enabled at boot time, and are
- * split up quickly with DEBUG_PAGEALLOC. If a splitup
- * fails here (due to temporary memory shortage) no damage
- * is done because we just keep the largepage intact up
- * to the next attempt when it will likely be split up:
+ * Large pages for identity mappings are not used at boot time
+ * and hence no memory allocations during large page split.
*/
if (enable)
__set_pages_p(page, numpages);
@@ -1024,53 +1114,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
* but that can deadlock->flush only current cpu:
*/
__flush_tlb_all();
-
- /*
- * Try to refill the page pool here. We can do this only after
- * the tlb flush.
- */
- cpa_fill_pool(NULL);
}
-#ifdef CONFIG_DEBUG_FS
-static int dpa_show(struct seq_file *m, void *v)
-{
- seq_puts(m, "DEBUG_PAGEALLOC\n");
- seq_printf(m, "pool_size : %lu\n", pool_size);
- seq_printf(m, "pool_pages : %lu\n", pool_pages);
- seq_printf(m, "pool_low : %lu\n", pool_low);
- seq_printf(m, "pool_used : %lu\n", pool_used);
- seq_printf(m, "pool_failed : %lu\n", pool_failed);
-
- return 0;
-}
-
-static int dpa_open(struct inode *inode, struct file *filp)
-{
- return single_open(filp, dpa_show, NULL);
-}
-
-static const struct file_operations dpa_fops = {
- .open = dpa_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-static int __init debug_pagealloc_proc_init(void)
-{
- struct dentry *de;
-
- de = debugfs_create_file("debug_pagealloc", 0600, NULL, NULL,
- &dpa_fops);
- if (!de)
- return -ENOMEM;
-
- return 0;
-}
-__initcall(debug_pagealloc_proc_init);
-#endif
-
#ifdef CONFIG_HIBERNATION
bool kernel_page_present(struct page *page)
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 2a50e0fa64a..738fd0f2495 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -7,24 +7,24 @@
* Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
*/
-#include <linux/mm.h>
+#include <linux/seq_file.h>
+#include <linux/bootmem.h>
+#include <linux/debugfs.h>
#include <linux/kernel.h>
#include <linux/gfp.h>
+#include <linux/mm.h>
#include <linux/fs.h>
-#include <linux/bootmem.h>
-#include <linux/debugfs.h>
-#include <linux/seq_file.h>
-#include <asm/msr.h>
-#include <asm/tlbflush.h>
+#include <asm/cacheflush.h>
#include <asm/processor.h>
-#include <asm/page.h>
+#include <asm/tlbflush.h>
#include <asm/pgtable.h>
-#include <asm/pat.h>
-#include <asm/e820.h>
-#include <asm/cacheflush.h>
#include <asm/fcntl.h>
+#include <asm/e820.h>
#include <asm/mtrr.h>
+#include <asm/page.h>
+#include <asm/msr.h>
+#include <asm/pat.h>
#include <asm/io.h>
#ifdef CONFIG_X86_PAT
@@ -46,6 +46,7 @@ early_param("nopat", nopat);
static int debug_enable;
+
static int __init pat_debug_setup(char *str)
{
debug_enable = 1;
@@ -145,14 +146,14 @@ static char *cattr_name(unsigned long flags)
*/
struct memtype {
- u64 start;
- u64 end;
- unsigned long type;
- struct list_head nd;
+ u64 start;
+ u64 end;
+ unsigned long type;
+ struct list_head nd;
};
static LIST_HEAD(memtype_list);
-static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */
+static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */
/*
* Does intersection of PAT memory type and MTRR memory type and returns
@@ -180,8 +181,8 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
return req_type;
}
-static int chk_conflict(struct memtype *new, struct memtype *entry,
- unsigned long *type)
+static int
+chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type)
{
if (new->type != entry->type) {
if (type) {
@@ -211,6 +212,66 @@ static struct memtype *cached_entry;
static u64 cached_start;
/*
+ * For RAM pages, mark the pages as non WB memory type using
+ * PageNonWB (PG_arch_1). We allow only one set_memory_uc() or
+ * set_memory_wc() on a RAM page at a time before marking it as WB again.
+ * This is ok, because only one driver will be owning the page and
+ * doing set_memory_*() calls.
+ *
+ * For now, we use PageNonWB to track that the RAM page is being mapped
+ * as non WB. In future, we will have to use one more flag
+ * (or some other mechanism in page_struct) to distinguish between
+ * UC and WC mapping.
+ */
+static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type,
+ unsigned long *new_type)
+{
+ struct page *page;
+ u64 pfn, end_pfn;
+
+ for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
+ page = pfn_to_page(pfn);
+ if (page_mapped(page) || PageNonWB(page))
+ goto out;
+
+ SetPageNonWB(page);
+ }
+ return 0;
+
+out:
+ end_pfn = pfn;
+ for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) {
+ page = pfn_to_page(pfn);
+ ClearPageNonWB(page);
+ }
+
+ return -EINVAL;
+}
+
+static int free_ram_pages_type(u64 start, u64 end)
+{
+ struct page *page;
+ u64 pfn, end_pfn;
+
+ for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
+ page = pfn_to_page(pfn);
+ if (page_mapped(page) || !PageNonWB(page))
+ goto out;
+
+ ClearPageNonWB(page);
+ }
+ return 0;
+
+out:
+ end_pfn = pfn;
+ for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) {
+ page = pfn_to_page(pfn);
+ SetPageNonWB(page);
+ }
+ return -EINVAL;
+}
+
+/*
* req_type typically has one of the:
* - _PAGE_CACHE_WB
* - _PAGE_CACHE_WC
@@ -226,14 +287,15 @@ static u64 cached_start;
* it will return a negative return value.
*/
int reserve_memtype(u64 start, u64 end, unsigned long req_type,
- unsigned long *new_type)
+ unsigned long *new_type)
{
struct memtype *new, *entry;
unsigned long actual_type;
struct list_head *where;
+ int is_range_ram;
int err = 0;
- BUG_ON(start >= end); /* end is exclusive */
+ BUG_ON(start >= end); /* end is exclusive */
if (!pat_enabled) {
/* This is identical to page table setting without PAT */
@@ -266,17 +328,24 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
actual_type = _PAGE_CACHE_WB;
else
actual_type = _PAGE_CACHE_UC_MINUS;
- } else
+ } else {
actual_type = pat_x_mtrr_type(start, end,
req_type & _PAGE_CACHE_MASK);
+ }
+
+ is_range_ram = pagerange_is_ram(start, end);
+ if (is_range_ram == 1)
+ return reserve_ram_pages_type(start, end, req_type, new_type);
+ else if (is_range_ram < 0)
+ return -EINVAL;
new = kmalloc(sizeof(struct memtype), GFP_KERNEL);
if (!new)
return -ENOMEM;
- new->start = start;
- new->end = end;
- new->type = actual_type;
+ new->start = start;
+ new->end = end;
+ new->type = actual_type;
if (new_type)
*new_type = actual_type;
@@ -335,6 +404,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
start, end, cattr_name(new->type), cattr_name(req_type));
kfree(new);
spin_unlock(&memtype_lock);
+
return err;
}
@@ -358,6 +428,7 @@ int free_memtype(u64 start, u64 end)
{
struct memtype *entry;
int err = -EINVAL;
+ int is_range_ram;
if (!pat_enabled)
return 0;
@@ -366,6 +437,12 @@ int free_memtype(u64 start, u64 end)
if (is_ISA_range(start, end - 1))
return 0;
+ is_range_ram = pagerange_is_ram(start, end);
+ if (is_range_ram == 1)
+ return free_ram_pages_type(start, end);
+ else if (is_range_ram < 0)
+ return -EINVAL;
+
spin_lock(&memtype_lock);
list_for_each_entry(entry, &memtype_list, nd) {
if (entry->start == start && entry->end == end) {
@@ -386,6 +463,7 @@ int free_memtype(u64 start, u64 end)
}
dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);
+
return err;
}
@@ -492,9 +570,9 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
{
+ unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
u64 addr = (u64)pfn << PAGE_SHIFT;
unsigned long flags;
- unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
reserve_memtype(addr, addr + size, want_flags, &flags);
if (flags != want_flags) {
@@ -514,7 +592,7 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
free_memtype(addr, addr + size);
}
-#if defined(CONFIG_DEBUG_FS)
+#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)
/* get Nth element of the linked list */
static struct memtype *memtype_get_idx(loff_t pos)
@@ -537,6 +615,7 @@ static struct memtype *memtype_get_idx(loff_t pos)
}
spin_unlock(&memtype_lock);
kfree(print_entry);
+
return NULL;
}
@@ -567,6 +646,7 @@ static int memtype_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
print_entry->start, print_entry->end);
kfree(print_entry);
+
return 0;
}
@@ -598,4 +678,4 @@ static int __init pat_memtype_list_init(void)
late_initcall(pat_memtype_list_init);
-#endif /* CONFIG_DEBUG_FS */
+#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index d50302774fe..86f2ffc43c3 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -63,10 +63,8 @@ static inline void pgd_list_del(pgd_t *pgd)
#define UNSHARED_PTRS_PER_PGD \
(SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
-static void pgd_ctor(void *p)
+static void pgd_ctor(pgd_t *pgd)
{
- pgd_t *pgd = p;
-
/* If the pgd points to a shared pagetable level (either the
ptes in non-PAE, or shared PMD in PAE), then just copy the
references from swapper_pg_dir. */
@@ -87,7 +85,7 @@ static void pgd_ctor(void *p)
pgd_list_add(pgd);
}
-static void pgd_dtor(void *pgd)
+static void pgd_dtor(pgd_t *pgd)
{
unsigned long flags; /* can be called from interrupt context */
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index cab0abbd1eb..0951db9ee51 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -123,7 +123,8 @@ static int __init parse_vmalloc(char *arg)
if (!arg)
return -EINVAL;
- __VMALLOC_RESERVE = memparse(arg, &arg);
+ /* Add VMALLOC_OFFSET to the parsed value due to vm area guard hole*/
+ __VMALLOC_RESERVE = memparse(arg, &arg) + VMALLOC_OFFSET;
return 0;
}
early_param("vmalloc", parse_vmalloc);
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 0227694f7da..8a5f1614a3d 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -295,10 +295,12 @@ static void nmi_cpu_shutdown(void *dummy)
static void nmi_shutdown(void)
{
- struct op_msrs *msrs = &get_cpu_var(cpu_msrs);
+ struct op_msrs *msrs;
+
nmi_enabled = 0;
on_each_cpu(nmi_cpu_shutdown, NULL, 1);
unregister_die_notifier(&profile_exceptions_nb);
+ msrs = &get_cpu_var(cpu_msrs);
model->shutdown(msrs);
free_msrs();
put_cpu_var(cpu_msrs);
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
index 56b4757a1f4..43ac5af338d 100644
--- a/arch/x86/oprofile/op_model_p4.c
+++ b/arch/x86/oprofile/op_model_p4.c
@@ -10,11 +10,12 @@
#include <linux/oprofile.h>
#include <linux/smp.h>
+#include <linux/ptrace.h>
+#include <linux/nmi.h>
#include <asm/msr.h>
-#include <asm/ptrace.h>
#include <asm/fixmap.h>
#include <asm/apic.h>
-#include <asm/nmi.h>
+
#include "op_x86_model.h"
#include "op_counter.h"
@@ -40,7 +41,7 @@ static unsigned int num_controls = NUM_CONTROLS_NON_HT;
static inline void setup_num_counters(void)
{
#ifdef CONFIG_SMP
- if (smp_num_siblings == 2){
+ if (smp_num_siblings == 2) {
num_counters = NUM_COUNTERS_HT2;
num_controls = NUM_CONTROLS_HT2;
}
@@ -86,7 +87,7 @@ struct p4_event_binding {
#define CTR_FLAME_2 (1 << 6)
#define CTR_IQ_5 (1 << 7)
-static struct p4_counter_binding p4_counters [NUM_COUNTERS_NON_HT] = {
+static struct p4_counter_binding p4_counters[NUM_COUNTERS_NON_HT] = {
{ CTR_BPU_0, MSR_P4_BPU_PERFCTR0, MSR_P4_BPU_CCCR0 },
{ CTR_MS_0, MSR_P4_MS_PERFCTR0, MSR_P4_MS_CCCR0 },
{ CTR_FLAME_0, MSR_P4_FLAME_PERFCTR0, MSR_P4_FLAME_CCCR0 },
@@ -97,32 +98,32 @@ static struct p4_counter_binding p4_counters [NUM_COUNTERS_NON_HT] = {
{ CTR_IQ_5, MSR_P4_IQ_PERFCTR5, MSR_P4_IQ_CCCR5 }
};
-#define NUM_UNUSED_CCCRS NUM_CCCRS_NON_HT - NUM_COUNTERS_NON_HT
+#define NUM_UNUSED_CCCRS (NUM_CCCRS_NON_HT - NUM_COUNTERS_NON_HT)
/* p4 event codes in libop/op_event.h are indices into this table. */
static struct p4_event_binding p4_events[NUM_EVENTS] = {
-
+
{ /* BRANCH_RETIRED */
- 0x05, 0x06,
+ 0x05, 0x06,
{ {CTR_IQ_4, MSR_P4_CRU_ESCR2},
{CTR_IQ_5, MSR_P4_CRU_ESCR3} }
},
-
+
{ /* MISPRED_BRANCH_RETIRED */
- 0x04, 0x03,
+ 0x04, 0x03,
{ { CTR_IQ_4, MSR_P4_CRU_ESCR0},
{ CTR_IQ_5, MSR_P4_CRU_ESCR1} }
},
-
+
{ /* TC_DELIVER_MODE */
0x01, 0x01,
- { { CTR_MS_0, MSR_P4_TC_ESCR0},
+ { { CTR_MS_0, MSR_P4_TC_ESCR0},
{ CTR_MS_2, MSR_P4_TC_ESCR1} }
},
-
+
{ /* BPU_FETCH_REQUEST */
- 0x00, 0x03,
+ 0x00, 0x03,
{ { CTR_BPU_0, MSR_P4_BPU_ESCR0},
{ CTR_BPU_2, MSR_P4_BPU_ESCR1} }
},
@@ -146,7 +147,7 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
},
{ /* LOAD_PORT_REPLAY */
- 0x02, 0x04,
+ 0x02, 0x04,
{ { CTR_FLAME_0, MSR_P4_SAAT_ESCR0},
{ CTR_FLAME_2, MSR_P4_SAAT_ESCR1} }
},
@@ -170,43 +171,43 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
},
{ /* BSQ_CACHE_REFERENCE */
- 0x07, 0x0c,
+ 0x07, 0x0c,
{ { CTR_BPU_0, MSR_P4_BSU_ESCR0},
{ CTR_BPU_2, MSR_P4_BSU_ESCR1} }
},
{ /* IOQ_ALLOCATION */
- 0x06, 0x03,
+ 0x06, 0x03,
{ { CTR_BPU_0, MSR_P4_FSB_ESCR0},
{ 0, 0 } }
},
{ /* IOQ_ACTIVE_ENTRIES */
- 0x06, 0x1a,
+ 0x06, 0x1a,
{ { CTR_BPU_2, MSR_P4_FSB_ESCR1},
{ 0, 0 } }
},
{ /* FSB_DATA_ACTIVITY */
- 0x06, 0x17,
+ 0x06, 0x17,
{ { CTR_BPU_0, MSR_P4_FSB_ESCR0},
{ CTR_BPU_2, MSR_P4_FSB_ESCR1} }
},
{ /* BSQ_ALLOCATION */
- 0x07, 0x05,
+ 0x07, 0x05,
{ { CTR_BPU_0, MSR_P4_BSU_ESCR0},
{ 0, 0 } }
},
{ /* BSQ_ACTIVE_ENTRIES */
0x07, 0x06,
- { { CTR_BPU_2, MSR_P4_BSU_ESCR1 /* guess */},
+ { { CTR_BPU_2, MSR_P4_BSU_ESCR1 /* guess */},
{ 0, 0 } }
},
{ /* X87_ASSIST */
- 0x05, 0x03,
+ 0x05, 0x03,
{ { CTR_IQ_4, MSR_P4_CRU_ESCR2},
{ CTR_IQ_5, MSR_P4_CRU_ESCR3} }
},
@@ -216,21 +217,21 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
{ { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
{ CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
},
-
+
{ /* PACKED_SP_UOP */
- 0x01, 0x08,
+ 0x01, 0x08,
{ { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
{ CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
},
-
+
{ /* PACKED_DP_UOP */
- 0x01, 0x0c,
+ 0x01, 0x0c,
{ { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
{ CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
},
{ /* SCALAR_SP_UOP */
- 0x01, 0x0a,
+ 0x01, 0x0a,
{ { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
{ CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
},
@@ -242,31 +243,31 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
},
{ /* 64BIT_MMX_UOP */
- 0x01, 0x02,
+ 0x01, 0x02,
{ { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
{ CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
},
-
+
{ /* 128BIT_MMX_UOP */
- 0x01, 0x1a,
+ 0x01, 0x1a,
{ { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
{ CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
},
{ /* X87_FP_UOP */
- 0x01, 0x04,
+ 0x01, 0x04,
{ { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
{ CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
},
-
+
{ /* X87_SIMD_MOVES_UOP */
- 0x01, 0x2e,
+ 0x01, 0x2e,
{ { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
{ CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
},
-
+
{ /* MACHINE_CLEAR */
- 0x05, 0x02,
+ 0x05, 0x02,
{ { CTR_IQ_4, MSR_P4_CRU_ESCR2},
{ CTR_IQ_5, MSR_P4_CRU_ESCR3} }
},
@@ -276,9 +277,9 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
{ { CTR_BPU_0, MSR_P4_FSB_ESCR0},
{ CTR_BPU_2, MSR_P4_FSB_ESCR1} }
},
-
+
{ /* TC_MS_XFER */
- 0x00, 0x05,
+ 0x00, 0x05,
{ { CTR_MS_0, MSR_P4_MS_ESCR0},
{ CTR_MS_2, MSR_P4_MS_ESCR1} }
},
@@ -308,7 +309,7 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
},
{ /* INSTR_RETIRED */
- 0x04, 0x02,
+ 0x04, 0x02,
{ { CTR_IQ_4, MSR_P4_CRU_ESCR0},
{ CTR_IQ_5, MSR_P4_CRU_ESCR1} }
},
@@ -319,14 +320,14 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
{ CTR_IQ_5, MSR_P4_CRU_ESCR1} }
},
- { /* UOP_TYPE */
- 0x02, 0x02,
+ { /* UOP_TYPE */
+ 0x02, 0x02,
{ { CTR_IQ_4, MSR_P4_RAT_ESCR0},
{ CTR_IQ_5, MSR_P4_RAT_ESCR1} }
},
{ /* RETIRED_MISPRED_BRANCH_TYPE */
- 0x02, 0x05,
+ 0x02, 0x05,
{ { CTR_MS_0, MSR_P4_TBPU_ESCR0},
{ CTR_MS_2, MSR_P4_TBPU_ESCR1} }
},
@@ -349,8 +350,8 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
#define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1))
#define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x3f) << 25))
#define ESCR_SET_EVENT_MASK(escr, mask) ((escr) |= (((mask) & 0xffff) << 9))
-#define ESCR_READ(escr,high,ev,i) do {rdmsr(ev->bindings[(i)].escr_address, (escr), (high));} while (0)
-#define ESCR_WRITE(escr,high,ev,i) do {wrmsr(ev->bindings[(i)].escr_address, (escr), (high));} while (0)
+#define ESCR_READ(escr, high, ev, i) do {rdmsr(ev->bindings[(i)].escr_address, (escr), (high)); } while (0)
+#define ESCR_WRITE(escr, high, ev, i) do {wrmsr(ev->bindings[(i)].escr_address, (escr), (high)); } while (0)
#define CCCR_RESERVED_BITS 0x38030FFF
#define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS)
@@ -360,15 +361,15 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
#define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27))
#define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12))
#define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12))
-#define CCCR_READ(low, high, i) do {rdmsr(p4_counters[(i)].cccr_address, (low), (high));} while (0)
-#define CCCR_WRITE(low, high, i) do {wrmsr(p4_counters[(i)].cccr_address, (low), (high));} while (0)
+#define CCCR_READ(low, high, i) do {rdmsr(p4_counters[(i)].cccr_address, (low), (high)); } while (0)
+#define CCCR_WRITE(low, high, i) do {wrmsr(p4_counters[(i)].cccr_address, (low), (high)); } while (0)
#define CCCR_OVF_P(cccr) ((cccr) & (1U<<31))
#define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31)))
-#define CTRL_IS_RESERVED(msrs,c) (msrs->controls[(c)].addr ? 1 : 0)
-#define CTR_IS_RESERVED(msrs,c) (msrs->counters[(c)].addr ? 1 : 0)
-#define CTR_READ(l,h,i) do {rdmsr(p4_counters[(i)].counter_address, (l), (h));} while (0)
-#define CTR_WRITE(l,i) do {wrmsr(p4_counters[(i)].counter_address, -(u32)(l), -1);} while (0)
+#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0)
+#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0)
+#define CTR_READ(l, h, i) do {rdmsr(p4_counters[(i)].counter_address, (l), (h)); } while (0)
+#define CTR_WRITE(l, i) do {wrmsr(p4_counters[(i)].counter_address, -(u32)(l), -1); } while (0)
#define CTR_OVERFLOW_P(ctr) (!((ctr) & 0x80000000))
@@ -380,7 +381,7 @@ static unsigned int get_stagger(void)
#ifdef CONFIG_SMP
int cpu = smp_processor_id();
return (cpu != first_cpu(per_cpu(cpu_sibling_map, cpu)));
-#endif
+#endif
return 0;
}
@@ -395,25 +396,23 @@ static unsigned long reset_value[NUM_COUNTERS_NON_HT];
static void p4_fill_in_addresses(struct op_msrs * const msrs)
{
- unsigned int i;
+ unsigned int i;
unsigned int addr, cccraddr, stag;
setup_num_counters();
stag = get_stagger();
/* initialize some registers */
- for (i = 0; i < num_counters; ++i) {
+ for (i = 0; i < num_counters; ++i)
msrs->counters[i].addr = 0;
- }
- for (i = 0; i < num_controls; ++i) {
+ for (i = 0; i < num_controls; ++i)
msrs->controls[i].addr = 0;
- }
-
+
/* the counter & cccr registers we pay attention to */
for (i = 0; i < num_counters; ++i) {
addr = p4_counters[VIRT_CTR(stag, i)].counter_address;
cccraddr = p4_counters[VIRT_CTR(stag, i)].cccr_address;
- if (reserve_perfctr_nmi(addr)){
+ if (reserve_perfctr_nmi(addr)) {
msrs->counters[i].addr = addr;
msrs->controls[i].addr = cccraddr;
}
@@ -447,22 +446,22 @@ static void p4_fill_in_addresses(struct op_msrs * const msrs)
if (reserve_evntsel_nmi(addr))
msrs->controls[i].addr = addr;
}
-
+
for (addr = MSR_P4_MS_ESCR0 + stag;
- addr <= MSR_P4_TC_ESCR1; ++i, addr += addr_increment()) {
+ addr <= MSR_P4_TC_ESCR1; ++i, addr += addr_increment()) {
if (reserve_evntsel_nmi(addr))
msrs->controls[i].addr = addr;
}
-
+
for (addr = MSR_P4_IX_ESCR0 + stag;
- addr <= MSR_P4_CRU_ESCR3; ++i, addr += addr_increment()) {
+ addr <= MSR_P4_CRU_ESCR3; ++i, addr += addr_increment()) {
if (reserve_evntsel_nmi(addr))
msrs->controls[i].addr = addr;
}
/* there are 2 remaining non-contiguously located ESCRs */
- if (num_counters == NUM_COUNTERS_NON_HT) {
+ if (num_counters == NUM_COUNTERS_NON_HT) {
/* standard non-HT CPUs handle both remaining ESCRs*/
if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR5))
msrs->controls[i++].addr = MSR_P4_CRU_ESCR5;
@@ -498,20 +497,20 @@ static void pmc_setup_one_p4_counter(unsigned int ctr)
unsigned int stag;
stag = get_stagger();
-
+
/* convert from counter *number* to counter *bit* */
counter_bit = 1 << VIRT_CTR(stag, ctr);
-
+
/* find our event binding structure. */
if (counter_config[ctr].event <= 0 || counter_config[ctr].event > NUM_EVENTS) {
- printk(KERN_ERR
- "oprofile: P4 event code 0x%lx out of range\n",
+ printk(KERN_ERR
+ "oprofile: P4 event code 0x%lx out of range\n",
counter_config[ctr].event);
return;
}
-
+
ev = &(p4_events[counter_config[ctr].event - 1]);
-
+
for (i = 0; i < maxbind; i++) {
if (ev->bindings[i].virt_counter & counter_bit) {
@@ -526,25 +525,24 @@ static void pmc_setup_one_p4_counter(unsigned int ctr)
ESCR_SET_OS_1(escr, counter_config[ctr].kernel);
}
ESCR_SET_EVENT_SELECT(escr, ev->event_select);
- ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask);
+ ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask);
ESCR_WRITE(escr, high, ev, i);
-
+
/* modify CCCR */
CCCR_READ(cccr, high, VIRT_CTR(stag, ctr));
CCCR_CLEAR(cccr);
CCCR_SET_REQUIRED_BITS(cccr);
CCCR_SET_ESCR_SELECT(cccr, ev->escr_select);
- if (stag == 0) {
+ if (stag == 0)
CCCR_SET_PMI_OVF_0(cccr);
- } else {
+ else
CCCR_SET_PMI_OVF_1(cccr);
- }
CCCR_WRITE(cccr, high, VIRT_CTR(stag, ctr));
return;
}
}
- printk(KERN_ERR
+ printk(KERN_ERR
"oprofile: P4 event code 0x%lx no binding, stag %d ctr %d\n",
counter_config[ctr].event, stag, ctr);
}
@@ -559,14 +557,14 @@ static void p4_setup_ctrs(struct op_msrs const * const msrs)
stag = get_stagger();
rdmsr(MSR_IA32_MISC_ENABLE, low, high);
- if (! MISC_PMC_ENABLED_P(low)) {
+ if (!MISC_PMC_ENABLED_P(low)) {
printk(KERN_ERR "oprofile: P4 PMC not available\n");
return;
}
/* clear the cccrs we will use */
for (i = 0 ; i < num_counters ; i++) {
- if (unlikely(!CTRL_IS_RESERVED(msrs,i)))
+ if (unlikely(!CTRL_IS_RESERVED(msrs, i)))
continue;
rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
CCCR_CLEAR(low);
@@ -576,14 +574,14 @@ static void p4_setup_ctrs(struct op_msrs const * const msrs)
/* clear all escrs (including those outside our concern) */
for (i = num_counters; i < num_controls; i++) {
- if (unlikely(!CTRL_IS_RESERVED(msrs,i)))
+ if (unlikely(!CTRL_IS_RESERVED(msrs, i)))
continue;
wrmsr(msrs->controls[i].addr, 0, 0);
}
/* setup all counters */
for (i = 0 ; i < num_counters ; ++i) {
- if ((counter_config[i].enabled) && (CTRL_IS_RESERVED(msrs,i))) {
+ if ((counter_config[i].enabled) && (CTRL_IS_RESERVED(msrs, i))) {
reset_value[i] = counter_config[i].count;
pmc_setup_one_p4_counter(i);
CTR_WRITE(counter_config[i].count, VIRT_CTR(stag, i));
@@ -603,11 +601,11 @@ static int p4_check_ctrs(struct pt_regs * const regs,
stag = get_stagger();
for (i = 0; i < num_counters; ++i) {
-
- if (!reset_value[i])
+
+ if (!reset_value[i])
continue;
- /*
+ /*
* there is some eccentricity in the hardware which
* requires that we perform 2 extra corrections:
*
@@ -616,24 +614,24 @@ static int p4_check_ctrs(struct pt_regs * const regs,
*
* - write the counter back twice to ensure it gets
* updated properly.
- *
+ *
* the former seems to be related to extra NMIs happening
* during the current NMI; the latter is reported as errata
* N15 in intel doc 249199-029, pentium 4 specification
* update, though their suggested work-around does not
* appear to solve the problem.
*/
-
+
real = VIRT_CTR(stag, i);
CCCR_READ(low, high, real);
- CTR_READ(ctr, high, real);
+ CTR_READ(ctr, high, real);
if (CCCR_OVF_P(low) || CTR_OVERFLOW_P(ctr)) {
oprofile_add_sample(regs, i);
- CTR_WRITE(reset_value[i], real);
+ CTR_WRITE(reset_value[i], real);
CCCR_CLEAR_OVF(low);
CCCR_WRITE(low, high, real);
- CTR_WRITE(reset_value[i], real);
+ CTR_WRITE(reset_value[i], real);
}
}
@@ -683,15 +681,16 @@ static void p4_shutdown(struct op_msrs const * const msrs)
int i;
for (i = 0 ; i < num_counters ; ++i) {
- if (CTR_IS_RESERVED(msrs,i))
+ if (CTR_IS_RESERVED(msrs, i))
release_perfctr_nmi(msrs->counters[i].addr);
}
- /* some of the control registers are specially reserved in
+ /*
+ * some of the control registers are specially reserved in
* conjunction with the counter registers (hence the starting offset).
* This saves a few bits.
*/
for (i = num_counters ; i < num_controls ; ++i) {
- if (CTRL_IS_RESERVED(msrs,i))
+ if (CTRL_IS_RESERVED(msrs, i))
release_evntsel_nmi(msrs->controls[i].addr);
}
}
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 6a0fca78c36..22e057665e5 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -580,7 +580,7 @@ static int __cpuinit amd_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
int cpu = (long)hcpu;
- switch(action) {
+ switch (action) {
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
smp_call_function_single(cpu, enable_pci_io_ecs, NULL, 0);
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 8e077185e18..006599db0dc 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -1043,35 +1043,44 @@ static void __init pcibios_fixup_irqs(void)
if (io_apic_assign_pci_irqs) {
int irq;
- if (pin) {
- /*
- * interrupt pins are numbered starting
- * from 1
- */
- pin--;
- irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
- PCI_SLOT(dev->devfn), pin);
- /*
- * Busses behind bridges are typically not listed in the MP-table.
- * In this case we have to look up the IRQ based on the parent bus,
- * parent slot, and pin number. The SMP code detects such bridged
- * busses itself so we should get into this branch reliably.
- */
- if (irq < 0 && dev->bus->parent) { /* go back to the bridge */
- struct pci_dev *bridge = dev->bus->self;
-
- pin = (pin + PCI_SLOT(dev->devfn)) % 4;
- irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
- PCI_SLOT(bridge->devfn), pin);
- if (irq >= 0)
- dev_warn(&dev->dev, "using bridge %s INT %c to get IRQ %d\n",
- pci_name(bridge),
- 'A' + pin, irq);
- }
- if (irq >= 0) {
- dev_info(&dev->dev, "PCI->APIC IRQ transform: INT %c -> IRQ %d\n", 'A' + pin, irq);
- dev->irq = irq;
- }
+ if (!pin)
+ continue;
+
+ /*
+ * interrupt pins are numbered starting from 1
+ */
+ pin--;
+ irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
+ PCI_SLOT(dev->devfn), pin);
+ /*
+ * Busses behind bridges are typically not listed in the
+ * MP-table. In this case we have to look up the IRQ
+ * based on the parent bus, parent slot, and pin number.
+ * The SMP code detects such bridged busses itself so we
+ * should get into this branch reliably.
+ */
+ if (irq < 0 && dev->bus->parent) {
+ /* go back to the bridge */
+ struct pci_dev *bridge = dev->bus->self;
+ int bus;
+
+ pin = (pin + PCI_SLOT(dev->devfn)) % 4;
+ bus = bridge->bus->number;
+ irq = IO_APIC_get_PCI_irq_vector(bus,
+ PCI_SLOT(bridge->devfn), pin);
+ if (irq >= 0)
+ dev_warn(&dev->dev,
+ "using bridge %s INT %c to "
+ "get IRQ %d\n",
+ pci_name(bridge),
+ 'A' + pin, irq);
+ }
+ if (irq >= 0) {
+ dev_info(&dev->dev,
+ "PCI->APIC IRQ transform: INT %c "
+ "-> IRQ %d\n",
+ 'A' + pin, irq);
+ dev->irq = irq;
}
}
#endif
diff --git a/arch/x86/power/hibernate_asm_32.S b/arch/x86/power/hibernate_asm_32.S
index 4fc7e872c85..d1e9b53f9d3 100644
--- a/arch/x86/power/hibernate_asm_32.S
+++ b/arch/x86/power/hibernate_asm_32.S
@@ -1,5 +1,3 @@
-.text
-
/*
* This may not use any stack, nor any variable that is not "NoSave":
*
@@ -12,17 +10,18 @@
#include <asm/segment.h>
#include <asm/page.h>
#include <asm/asm-offsets.h>
+#include <asm/processor-flags.h>
- .text
+.text
ENTRY(swsusp_arch_suspend)
-
movl %esp, saved_context_esp
movl %ebx, saved_context_ebx
movl %ebp, saved_context_ebp
movl %esi, saved_context_esi
movl %edi, saved_context_edi
- pushfl ; popl saved_context_eflags
+ pushfl
+ popl saved_context_eflags
call swsusp_save
ret
@@ -59,7 +58,7 @@ done:
movl mmu_cr4_features, %ecx
jecxz 1f # cr4 Pentium and higher, skip if zero
movl %ecx, %edx
- andl $~(1<<7), %edx; # PGE
+ andl $~(X86_CR4_PGE), %edx
movl %edx, %cr4; # turn off PGE
1:
movl %cr3, %eax; # flush TLB
@@ -74,7 +73,8 @@ done:
movl saved_context_esi, %esi
movl saved_context_edi, %edi
- pushl saved_context_eflags ; popfl
+ pushl saved_context_eflags
+ popfl
xorl %eax, %eax
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 7b3508952b9..a27d562a974 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -844,7 +844,7 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
/* Early in boot, while setting up the initial pagetable, assume
everything is pinned. */
-static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn)
+static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
{
#ifdef CONFIG_FLATMEM
BUG_ON(mem_map); /* should only be used early */
@@ -854,7 +854,7 @@ static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn)
/* Early release_pte assumes that all pts are pinned, since there's
only init_mm and anything attached to that is pinned. */
-static void xen_release_pte_init(u32 pfn)
+static void xen_release_pte_init(unsigned long pfn)
{
make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
}
@@ -870,7 +870,7 @@ static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
/* This needs to make sure the new pte page is pinned iff its being
attached to a pinned pagetable. */
-static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level)
+static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
{
struct page *page = pfn_to_page(pfn);
@@ -888,12 +888,12 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level)
}
}
-static void xen_alloc_pte(struct mm_struct *mm, u32 pfn)
+static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
{
xen_alloc_ptpage(mm, pfn, PT_PTE);
}
-static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn)
+static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
{
xen_alloc_ptpage(mm, pfn, PT_PMD);
}
@@ -941,7 +941,7 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
}
/* This should never happen until we're OK to use struct page */
-static void xen_release_ptpage(u32 pfn, unsigned level)
+static void xen_release_ptpage(unsigned long pfn, unsigned level)
{
struct page *page = pfn_to_page(pfn);
@@ -955,23 +955,23 @@ static void xen_release_ptpage(u32 pfn, unsigned level)
}
}
-static void xen_release_pte(u32 pfn)
+static void xen_release_pte(unsigned long pfn)
{
xen_release_ptpage(pfn, PT_PTE);
}
-static void xen_release_pmd(u32 pfn)
+static void xen_release_pmd(unsigned long pfn)
{
xen_release_ptpage(pfn, PT_PMD);
}
#if PAGETABLE_LEVELS == 4
-static void xen_alloc_pud(struct mm_struct *mm, u32 pfn)
+static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
{
xen_alloc_ptpage(mm, pfn, PT_PUD);
}
-static void xen_release_pud(u32 pfn)
+static void xen_release_pud(unsigned long pfn)
{
xen_release_ptpage(pfn, PT_PUD);
}
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index b6acc3a0af4..d6790108388 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -42,7 +42,7 @@ char * __init xen_memory_setup(void)
e820.nr_map = 0;
- e820_add_region(0, PFN_PHYS(max_pfn), E820_RAM);
+ e820_add_region(0, PFN_PHYS((u64)max_pfn), E820_RAM);
/*
* Even though this is normal, usable memory under Xen, reserve