aboutsummaryrefslogtreecommitdiff
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/Makefile42
-rw-r--r--arch/x86/kernel/acpi/boot.c169
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.S4
-rw-r--r--arch/x86/kernel/acpi/sleep.c1
-rw-r--r--arch/x86/kernel/acpi/wakeup_32.S2
-rw-r--r--arch/x86/kernel/acpi/wakeup_64.S4
-rw-r--r--arch/x86/kernel/alternative.c23
-rw-r--r--arch/x86/kernel/apic/Makefile19
-rw-r--r--arch/x86/kernel/apic/apic.c (renamed from arch/x86/kernel/apic.c)330
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c (renamed from arch/x86/kernel/genapic_flat_64.c)192
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c267
-rw-r--r--arch/x86/kernel/apic/es7000_32.c780
-rw-r--r--arch/x86/kernel/apic/io_apic.c (renamed from arch/x86/kernel/io_apic.c)701
-rw-r--r--arch/x86/kernel/apic/ipi.c164
-rw-r--r--arch/x86/kernel/apic/nmi.c (renamed from arch/x86/kernel/nmi.c)12
-rw-r--r--arch/x86/kernel/apic/numaq_32.c557
-rw-r--r--arch/x86/kernel/apic/probe_32.c284
-rw-r--r--arch/x86/kernel/apic/probe_64.c100
-rw-r--r--arch/x86/kernel/apic/summit_32.c579
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c245
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c (renamed from arch/x86/kernel/genx2apic_phys.c)156
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c (renamed from arch/x86/kernel/genx2apic_uv_x.c)161
-rw-r--r--arch/x86/kernel/apm_32.c2
-rw-r--r--arch/x86/kernel/asm-offsets_32.c1
-rw-r--r--arch/x86/kernel/asm-offsets_64.c11
-rw-r--r--arch/x86/kernel/check.c8
-rw-r--r--arch/x86/kernel/cpu/Makefile5
-rw-r--r--arch/x86/kernel/cpu/addon_cpuid_features.c56
-rw-r--r--arch/x86/kernel/cpu/amd.c56
-rw-r--r--arch/x86/kernel/cpu/centaur.c36
-rw-r--r--arch/x86/kernel/cpu/centaur_64.c37
-rw-r--r--arch/x86/kernel/cpu/common.c579
-rw-r--r--arch/x86/kernel/cpu/cpu.h11
-rwxr-xr-xarch/x86/kernel/cpu/cpu_debug.c901
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/e_powersaver.c6
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c6
-rw-r--r--arch/x86/kernel/cpu/cyrix.c16
-rw-r--r--arch/x86/kernel/cpu/intel.c49
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c71
-rw-r--r--arch/x86/kernel/cpu/mcheck/Makefile1
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_32.c14
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_64.c530
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd_64.c83
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel_64.c214
-rw-r--r--arch/x86/kernel/cpu/mcheck/p4.c4
-rw-r--r--arch/x86/kernel/cpu/mcheck/threshold.c29
-rw-r--r--arch/x86/kernel/cpu/mtrr/Makefile2
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c1101
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c202
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c1069
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.h4
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c2
-rw-r--r--arch/x86/kernel/cpu/proc.c20
-rw-r--r--arch/x86/kernel/cpu/transmeta.c2
-rw-r--r--arch/x86/kernel/cpu/umc.c2
-rw-r--r--arch/x86/kernel/crash.c4
-rw-r--r--arch/x86/kernel/dumpstack.c2
-rw-r--r--arch/x86/kernel/dumpstack_64.c35
-rw-r--r--arch/x86/kernel/e820.c135
-rw-r--r--arch/x86/kernel/early_printk.c22
-rw-r--r--arch/x86/kernel/efi.c2
-rw-r--r--arch/x86/kernel/efi_64.c1
-rw-r--r--arch/x86/kernel/efi_stub_32.S3
-rw-r--r--arch/x86/kernel/efi_stub_64.S7
-rw-r--r--arch/x86/kernel/entry_32.S473
-rw-r--r--arch/x86/kernel/entry_64.S78
-rw-r--r--arch/x86/kernel/es7000_32.c378
-rw-r--r--arch/x86/kernel/genapic_64.c82
-rw-r--r--arch/x86/kernel/genx2apic_cluster.c198
-rw-r--r--arch/x86/kernel/head32.c5
-rw-r--r--arch/x86/kernel/head64.c25
-rw-r--r--arch/x86/kernel/head_32.S120
-rw-r--r--arch/x86/kernel/head_64.S23
-rw-r--r--arch/x86/kernel/i8259.c1
-rw-r--r--arch/x86/kernel/ioport.c14
-rw-r--r--arch/x86/kernel/ipi.c190
-rw-r--r--arch/x86/kernel/irq.c111
-rw-r--r--arch/x86/kernel/irq_32.c61
-rw-r--r--arch/x86/kernel/irq_64.c43
-rw-r--r--arch/x86/kernel/irqinit_32.c39
-rw-r--r--arch/x86/kernel/irqinit_64.c3
-rw-r--r--arch/x86/kernel/kgdb.c4
-rw-r--r--arch/x86/kernel/kprobes.c2
-rw-r--r--arch/x86/kernel/kvm.c7
-rw-r--r--arch/x86/kernel/kvmclock.c1
-rw-r--r--arch/x86/kernel/machine_kexec_32.c19
-rw-r--r--arch/x86/kernel/machine_kexec_64.c179
-rw-r--r--arch/x86/kernel/mca_32.c5
-rw-r--r--arch/x86/kernel/microcode_intel.c10
-rw-r--r--arch/x86/kernel/mmconf-fam10h_64.c2
-rw-r--r--arch/x86/kernel/module_32.c6
-rw-r--r--arch/x86/kernel/module_64.c32
-rw-r--r--arch/x86/kernel/mpparse.c443
-rw-r--r--arch/x86/kernel/msr.c2
-rw-r--r--arch/x86/kernel/numaq_32.c293
-rw-r--r--arch/x86/kernel/paravirt-spinlocks.c10
-rw-r--r--arch/x86/kernel/paravirt.c57
-rw-r--r--arch/x86/kernel/paravirt_patch_32.c12
-rw-r--r--arch/x86/kernel/paravirt_patch_64.c15
-rw-r--r--arch/x86/kernel/probe_roms_32.c2
-rw-r--r--arch/x86/kernel/process.c194
-rw-r--r--arch/x86/kernel/process_32.c241
-rw-r--r--arch/x86/kernel/process_64.c230
-rw-r--r--arch/x86/kernel/ptrace.c24
-rw-r--r--arch/x86/kernel/quirks.c3
-rw-r--r--arch/x86/kernel/reboot.c5
-rw-r--r--arch/x86/kernel/relocate_kernel_32.S26
-rw-r--r--arch/x86/kernel/relocate_kernel_64.S312
-rw-r--r--arch/x86/kernel/setup.c197
-rw-r--r--arch/x86/kernel/setup_percpu.c679
-rw-r--r--arch/x86/kernel/signal.c463
-rw-r--r--arch/x86/kernel/smp.c15
-rw-r--r--arch/x86/kernel/smpboot.c225
-rw-r--r--arch/x86/kernel/smpcommon.c30
-rw-r--r--arch/x86/kernel/stacktrace.c2
-rw-r--r--arch/x86/kernel/summit_32.c188
-rw-r--r--arch/x86/kernel/syscall_table_32.S20
-rw-r--r--arch/x86/kernel/time_32.c8
-rw-r--r--arch/x86/kernel/tlb_32.c256
-rw-r--r--arch/x86/kernel/tlb_64.c284
-rw-r--r--arch/x86/kernel/tlb_uv.c75
-rw-r--r--arch/x86/kernel/trampoline_32.S2
-rw-r--r--arch/x86/kernel/trampoline_64.S23
-rw-r--r--arch/x86/kernel/traps.c64
-rw-r--r--arch/x86/kernel/tsc.c2
-rw-r--r--arch/x86/kernel/uv_time.c393
-rw-r--r--arch/x86/kernel/visws_quirks.c12
-rw-r--r--arch/x86/kernel/vm86_32.c20
-rw-r--r--arch/x86/kernel/vmi_32.c19
-rw-r--r--arch/x86/kernel/vmiclock_32.c6
-rw-r--r--arch/x86/kernel/vmlinux_32.lds.S32
-rw-r--r--arch/x86/kernel/vmlinux_64.lds.S136
-rw-r--r--arch/x86/kernel/vsmp_64.c24
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c2
135 files changed, 10332 insertions, 6960 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index d364df03c1d..339ce35648e 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -23,11 +23,12 @@ nostackp := $(call cc-option, -fno-stack-protector)
CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp)
CFLAGS_hpet.o := $(nostackp)
CFLAGS_tsc.o := $(nostackp)
+CFLAGS_paravirt.o := $(nostackp)
obj-y := process_$(BITS).o signal.o entry_$(BITS).o
obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o
-obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o
+obj-y += setup.o i8259.o irqinit_$(BITS).o
obj-$(CONFIG_X86_VISWS) += visws_quirks.o
obj-$(CONFIG_X86_32) += probe_roms_32.o
obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
@@ -49,31 +50,27 @@ obj-y += step.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += cpu/
obj-y += acpi/
-obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o
+obj-y += reboot.o
obj-$(CONFIG_MCA) += mca_32.o
obj-$(CONFIG_X86_MSR) += msr.o
obj-$(CONFIG_X86_CPUID) += cpuid.o
obj-$(CONFIG_PCI) += early-quirks.o
apm-y := apm_32.o
obj-$(CONFIG_APM) += apm.o
-obj-$(CONFIG_X86_SMP) += smp.o
-obj-$(CONFIG_X86_SMP) += smpboot.o tsc_sync.o ipi.o tlb_$(BITS).o
-obj-$(CONFIG_X86_32_SMP) += smpcommon.o
-obj-$(CONFIG_X86_64_SMP) += tsc_sync.o smpcommon.o
+obj-$(CONFIG_SMP) += smp.o
+obj-$(CONFIG_SMP) += smpboot.o tsc_sync.o
+obj-$(CONFIG_SMP) += setup_percpu.o
+obj-$(CONFIG_X86_64_SMP) += tsc_sync.o
obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o
obj-$(CONFIG_X86_MPPARSE) += mpparse.o
-obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
-obj-$(CONFIG_X86_IO_APIC) += io_apic.o
+obj-y += apic/
obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
-obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
-obj-$(CONFIG_X86_ES7000) += es7000_32.o
-obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o
-obj-y += vsmp_64.o
+obj-$(CONFIG_X86_VSMP) += vsmp_64.o
obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_MODULES) += module_$(BITS).o
obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o
@@ -114,16 +111,13 @@ obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o # NB rename without _64
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
- obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o
- obj-y += bios_uv.o uv_irq.o uv_sysfs.o
- obj-y += genx2apic_cluster.o
- obj-y += genx2apic_phys.o
- obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
- obj-$(CONFIG_AUDIT) += audit_64.o
-
- obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o
- obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
- obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o
-
- obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
+ obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o
+ obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
+ obj-$(CONFIG_AUDIT) += audit_64.o
+
+ obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o
+ obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
+ obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o
+
+ obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
endif
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 7678f10c456..a18eb7ce223 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -37,15 +37,10 @@
#include <asm/pgtable.h>
#include <asm/io_apic.h>
#include <asm/apic.h>
-#include <asm/genapic.h>
#include <asm/io.h>
#include <asm/mpspec.h>
#include <asm/smp.h>
-#ifdef CONFIG_X86_LOCAL_APIC
-# include <mach_apic.h>
-#endif
-
static int __initdata acpi_force = 0;
u32 acpi_rsdt_forced;
#ifdef CONFIG_ACPI
@@ -56,16 +51,7 @@ int acpi_disabled = 1;
EXPORT_SYMBOL(acpi_disabled);
#ifdef CONFIG_X86_64
-
-#include <asm/proto.h>
-
-#else /* X86 */
-
-#ifdef CONFIG_X86_LOCAL_APIC
-#include <mach_apic.h>
-#include <mach_mpparse.h>
-#endif /* CONFIG_X86_LOCAL_APIC */
-
+# include <asm/proto.h>
#endif /* X86 */
#define BAD_MADT_ENTRY(entry, end) ( \
@@ -121,35 +107,18 @@ enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC;
*/
char *__init __acpi_map_table(unsigned long phys, unsigned long size)
{
- unsigned long base, offset, mapped_size;
- int idx;
if (!phys || !size)
return NULL;
- if (phys+size <= (max_low_pfn_mapped << PAGE_SHIFT))
- return __va(phys);
-
- offset = phys & (PAGE_SIZE - 1);
- mapped_size = PAGE_SIZE - offset;
- clear_fixmap(FIX_ACPI_END);
- set_fixmap(FIX_ACPI_END, phys);
- base = fix_to_virt(FIX_ACPI_END);
-
- /*
- * Most cases can be covered by the below.
- */
- idx = FIX_ACPI_END;
- while (mapped_size < size) {
- if (--idx < FIX_ACPI_BEGIN)
- return NULL; /* cannot handle this */
- phys += PAGE_SIZE;
- clear_fixmap(idx);
- set_fixmap(idx, phys);
- mapped_size += PAGE_SIZE;
- }
+ return early_ioremap(phys, size);
+}
+void __init __acpi_unmap_table(char *map, unsigned long size)
+{
+ if (!map || !size)
+ return;
- return ((unsigned char *)base + offset);
+ early_iounmap(map, size);
}
#ifdef CONFIG_PCI_MMCONFIG
@@ -239,7 +208,8 @@ static int __init acpi_parse_madt(struct acpi_table_header *table)
madt->address);
}
- acpi_madt_oem_check(madt->header.oem_id, madt->header.oem_table_id);
+ default_acpi_madt_oem_check(madt->header.oem_id,
+ madt->header.oem_table_id);
return 0;
}
@@ -884,7 +854,7 @@ static struct {
DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
} mp_ioapic_routing[MAX_IO_APICS];
-static int mp_find_ioapic(int gsi)
+int mp_find_ioapic(int gsi)
{
int i = 0;
@@ -899,6 +869,16 @@ static int mp_find_ioapic(int gsi)
return -1;
}
+int mp_find_ioapic_pin(int ioapic, int gsi)
+{
+ if (WARN_ON(ioapic == -1))
+ return -1;
+ if (WARN_ON(gsi > mp_ioapic_routing[ioapic].gsi_end))
+ return -1;
+
+ return gsi - mp_ioapic_routing[ioapic].gsi_base;
+}
+
static u8 __init uniq_ioapic_id(u8 id)
{
#ifdef CONFIG_X86_32
@@ -912,8 +892,8 @@ static u8 __init uniq_ioapic_id(u8 id)
DECLARE_BITMAP(used, 256);
bitmap_zero(used, 256);
for (i = 0; i < nr_ioapics; i++) {
- struct mp_config_ioapic *ia = &mp_ioapics[i];
- __set_bit(ia->mp_apicid, used);
+ struct mpc_ioapic *ia = &mp_ioapics[i];
+ __set_bit(ia->apicid, used);
}
if (!test_bit(id, used))
return id;
@@ -945,29 +925,29 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
idx = nr_ioapics;
- mp_ioapics[idx].mp_type = MP_IOAPIC;
- mp_ioapics[idx].mp_flags = MPC_APIC_USABLE;
- mp_ioapics[idx].mp_apicaddr = address;
+ mp_ioapics[idx].type = MP_IOAPIC;
+ mp_ioapics[idx].flags = MPC_APIC_USABLE;
+ mp_ioapics[idx].apicaddr = address;
set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
- mp_ioapics[idx].mp_apicid = uniq_ioapic_id(id);
+ mp_ioapics[idx].apicid = uniq_ioapic_id(id);
#ifdef CONFIG_X86_32
- mp_ioapics[idx].mp_apicver = io_apic_get_version(idx);
+ mp_ioapics[idx].apicver = io_apic_get_version(idx);
#else
- mp_ioapics[idx].mp_apicver = 0;
+ mp_ioapics[idx].apicver = 0;
#endif
/*
* Build basic GSI lookup table to facilitate gsi->io_apic lookups
* and to prevent reprogramming of IOAPIC pins (PCI GSIs).
*/
- mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mp_apicid;
+ mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].apicid;
mp_ioapic_routing[idx].gsi_base = gsi_base;
mp_ioapic_routing[idx].gsi_end = gsi_base +
io_apic_get_redir_entries(idx);
- printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, "
- "GSI %d-%d\n", idx, mp_ioapics[idx].mp_apicid,
- mp_ioapics[idx].mp_apicver, mp_ioapics[idx].mp_apicaddr,
+ printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
+ "GSI %d-%d\n", idx, mp_ioapics[idx].apicid,
+ mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr,
mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
nr_ioapics++;
@@ -996,19 +976,19 @@ int __init acpi_probe_gsi(void)
return max_gsi + 1;
}
-static void assign_to_mp_irq(struct mp_config_intsrc *m,
- struct mp_config_intsrc *mp_irq)
+static void assign_to_mp_irq(struct mpc_intsrc *m,
+ struct mpc_intsrc *mp_irq)
{
- memcpy(mp_irq, m, sizeof(struct mp_config_intsrc));
+ memcpy(mp_irq, m, sizeof(struct mpc_intsrc));
}
-static int mp_irq_cmp(struct mp_config_intsrc *mp_irq,
- struct mp_config_intsrc *m)
+static int mp_irq_cmp(struct mpc_intsrc *mp_irq,
+ struct mpc_intsrc *m)
{
- return memcmp(mp_irq, m, sizeof(struct mp_config_intsrc));
+ return memcmp(mp_irq, m, sizeof(struct mpc_intsrc));
}
-static void save_mp_irq(struct mp_config_intsrc *m)
+static void save_mp_irq(struct mpc_intsrc *m)
{
int i;
@@ -1026,7 +1006,7 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
{
int ioapic;
int pin;
- struct mp_config_intsrc mp_irq;
+ struct mpc_intsrc mp_irq;
/*
* Convert 'gsi' to 'ioapic.pin'.
@@ -1034,7 +1014,7 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
ioapic = mp_find_ioapic(gsi);
if (ioapic < 0)
return;
- pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+ pin = mp_find_ioapic_pin(ioapic, gsi);
/*
* TBD: This check is for faulty timer entries, where the override
@@ -1044,13 +1024,13 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
if ((bus_irq == 0) && (trigger == 3))
trigger = 1;
- mp_irq.mp_type = MP_INTSRC;
- mp_irq.mp_irqtype = mp_INT;
- mp_irq.mp_irqflag = (trigger << 2) | polarity;
- mp_irq.mp_srcbus = MP_ISA_BUS;
- mp_irq.mp_srcbusirq = bus_irq; /* IRQ */
- mp_irq.mp_dstapic = mp_ioapics[ioapic].mp_apicid; /* APIC ID */
- mp_irq.mp_dstirq = pin; /* INTIN# */
+ mp_irq.type = MP_INTSRC;
+ mp_irq.irqtype = mp_INT;
+ mp_irq.irqflag = (trigger << 2) | polarity;
+ mp_irq.srcbus = MP_ISA_BUS;
+ mp_irq.srcbusirq = bus_irq; /* IRQ */
+ mp_irq.dstapic = mp_ioapics[ioapic].apicid; /* APIC ID */
+ mp_irq.dstirq = pin; /* INTIN# */
save_mp_irq(&mp_irq);
}
@@ -1060,7 +1040,7 @@ void __init mp_config_acpi_legacy_irqs(void)
int i;
int ioapic;
unsigned int dstapic;
- struct mp_config_intsrc mp_irq;
+ struct mpc_intsrc mp_irq;
#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
/*
@@ -1085,7 +1065,7 @@ void __init mp_config_acpi_legacy_irqs(void)
ioapic = mp_find_ioapic(0);
if (ioapic < 0)
return;
- dstapic = mp_ioapics[ioapic].mp_apicid;
+ dstapic = mp_ioapics[ioapic].apicid;
/*
* Use the default configuration for the IRQs 0-15. Unless
@@ -1095,16 +1075,14 @@ void __init mp_config_acpi_legacy_irqs(void)
int idx;
for (idx = 0; idx < mp_irq_entries; idx++) {
- struct mp_config_intsrc *irq = mp_irqs + idx;
+ struct mpc_intsrc *irq = mp_irqs + idx;
/* Do we already have a mapping for this ISA IRQ? */
- if (irq->mp_srcbus == MP_ISA_BUS
- && irq->mp_srcbusirq == i)
+ if (irq->srcbus == MP_ISA_BUS && irq->srcbusirq == i)
break;
/* Do we already have a mapping for this IOAPIC pin */
- if (irq->mp_dstapic == dstapic &&
- irq->mp_dstirq == i)
+ if (irq->dstapic == dstapic && irq->dstirq == i)
break;
}
@@ -1113,13 +1091,13 @@ void __init mp_config_acpi_legacy_irqs(void)
continue; /* IRQ already used */
}
- mp_irq.mp_type = MP_INTSRC;
- mp_irq.mp_irqflag = 0; /* Conforming */
- mp_irq.mp_srcbus = MP_ISA_BUS;
- mp_irq.mp_dstapic = dstapic;
- mp_irq.mp_irqtype = mp_INT;
- mp_irq.mp_srcbusirq = i; /* Identity mapped */
- mp_irq.mp_dstirq = i;
+ mp_irq.type = MP_INTSRC;
+ mp_irq.irqflag = 0; /* Conforming */
+ mp_irq.srcbus = MP_ISA_BUS;
+ mp_irq.dstapic = dstapic;
+ mp_irq.irqtype = mp_INT;
+ mp_irq.srcbusirq = i; /* Identity mapped */
+ mp_irq.dstirq = i;
save_mp_irq(&mp_irq);
}
@@ -1156,7 +1134,7 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
return gsi;
}
- ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+ ioapic_pin = mp_find_ioapic_pin(ioapic, gsi);
#ifdef CONFIG_X86_32
if (ioapic_renumber_irq)
@@ -1230,22 +1208,22 @@ int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
u32 gsi, int triggering, int polarity)
{
#ifdef CONFIG_X86_MPPARSE
- struct mp_config_intsrc mp_irq;
+ struct mpc_intsrc mp_irq;
int ioapic;
if (!acpi_ioapic)
return 0;
/* print the entry should happen on mptable identically */
- mp_irq.mp_type = MP_INTSRC;
- mp_irq.mp_irqtype = mp_INT;
- mp_irq.mp_irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
+ mp_irq.type = MP_INTSRC;
+ mp_irq.irqtype = mp_INT;
+ mp_irq.irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
(polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
- mp_irq.mp_srcbus = number;
- mp_irq.mp_srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
+ mp_irq.srcbus = number;
+ mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
ioapic = mp_find_ioapic(gsi);
- mp_irq.mp_dstapic = mp_ioapic_routing[ioapic].apic_id;
- mp_irq.mp_dstirq = gsi - mp_ioapic_routing[ioapic].gsi_base;
+ mp_irq.dstapic = mp_ioapic_routing[ioapic].apic_id;
+ mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
save_mp_irq(&mp_irq);
#endif
@@ -1372,7 +1350,7 @@ static void __init acpi_process_madt(void)
if (!error) {
acpi_lapic = 1;
-#ifdef CONFIG_X86_GENERICARCH
+#ifdef CONFIG_X86_BIGSMP
generic_bigsmp_probe();
#endif
/*
@@ -1384,9 +1362,8 @@ static void __init acpi_process_madt(void)
acpi_ioapic = 1;
smp_found_config = 1;
-#ifdef CONFIG_X86_32
- setup_apic_routing();
-#endif
+ if (apic->setup_apic_routing)
+ apic->setup_apic_routing();
}
}
if (error == -EINVAL) {
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S
index 3355973b12a..580b4e29601 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.S
@@ -3,8 +3,8 @@
*/
#include <asm/segment.h>
#include <asm/msr-index.h>
-#include <asm/page.h>
-#include <asm/pgtable.h>
+#include <asm/page_types.h>
+#include <asm/pgtable_types.h>
#include <asm/processor-flags.h>
.code16
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index a60c1f3bcb8..7c243a2c511 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -101,6 +101,7 @@ int acpi_save_state_mem(void)
stack_start.sp = temp_stack + sizeof(temp_stack);
early_gdt_descr.address =
(unsigned long)get_cpu_gdt_table(smp_processor_id());
+ initial_gs = per_cpu_offset(smp_processor_id());
#endif
initial_code = (unsigned long)wakeup_long64;
saved_magic = 0x123456789abcdef0;
diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S
index a12e6a9fb65..8ded418b059 100644
--- a/arch/x86/kernel/acpi/wakeup_32.S
+++ b/arch/x86/kernel/acpi/wakeup_32.S
@@ -1,7 +1,7 @@
.section .text.page_aligned
#include <linux/linkage.h>
#include <asm/segment.h>
-#include <asm/page.h>
+#include <asm/page_types.h>
# Copyright 2003, 2008 Pavel Machek <pavel@suse.cz>, distribute under GPLv2
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index 96258d9dc97..8ea5164cbd0 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -1,8 +1,8 @@
.text
#include <linux/linkage.h>
#include <asm/segment.h>
-#include <asm/pgtable.h>
-#include <asm/page.h>
+#include <asm/pgtable_types.h>
+#include <asm/page_types.h>
#include <asm/msr.h>
#include <asm/asm-offsets.h>
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index a84ac7b570e..4c80f155743 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -414,9 +414,17 @@ void __init alternative_instructions(void)
that might execute the to be patched code.
Other CPUs are not running. */
stop_nmi();
-#ifdef CONFIG_X86_MCE
- stop_mce();
-#endif
+
+ /*
+ * Don't stop machine check exceptions while patching.
+ * MCEs only happen when something got corrupted and in this
+ * case we must do something about the corruption.
+ * Ignoring it is worse than a unlikely patching race.
+ * Also machine checks tend to be broadcast and if one CPU
+ * goes into machine check the others follow quickly, so we don't
+ * expect a machine check to cause undue problems during to code
+ * patching.
+ */
apply_alternatives(__alt_instructions, __alt_instructions_end);
@@ -456,9 +464,6 @@ void __init alternative_instructions(void)
(unsigned long)__smp_locks_end);
restart_nmi();
-#ifdef CONFIG_X86_MCE
- restart_mce();
-#endif
}
/**
@@ -498,12 +503,12 @@ void *text_poke_early(void *addr, const void *opcode, size_t len)
*/
void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
{
- unsigned long flags;
char *vaddr;
int nr_pages = 2;
struct page *pages[2];
int i;
+ might_sleep();
if (!core_kernel_text((unsigned long)addr)) {
pages[0] = vmalloc_to_page(addr);
pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
@@ -517,9 +522,9 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
nr_pages = 1;
vaddr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
BUG_ON(!vaddr);
- local_irq_save(flags);
+ local_irq_disable();
memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len);
- local_irq_restore(flags);
+ local_irq_enable();
vunmap(vaddr);
sync_core();
/* Could also do a CLFLUSH here to speed up CPU recovery; but
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
new file mode 100644
index 00000000000..da7b7b9f8bd
--- /dev/null
+++ b/arch/x86/kernel/apic/Makefile
@@ -0,0 +1,19 @@
+#
+# Makefile for local APIC drivers and for the IO-APIC code
+#
+
+obj-$(CONFIG_X86_LOCAL_APIC) += apic.o probe_$(BITS).o ipi.o nmi.o
+obj-$(CONFIG_X86_IO_APIC) += io_apic.o
+obj-$(CONFIG_SMP) += ipi.o
+
+ifeq ($(CONFIG_X86_64),y)
+obj-y += apic_flat_64.o
+obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o
+obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o
+obj-$(CONFIG_X86_UV) += x2apic_uv_x.o
+endif
+
+obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o
+obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
+obj-$(CONFIG_X86_ES7000) += es7000_32.o
+obj-$(CONFIG_X86_SUMMIT) += summit_32.o
diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic/apic.c
index 570f36e44e5..85eb8e10081 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1,7 +1,7 @@
/*
* Local APIC handling, local APIC timers
*
- * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
+ * (c) 1999, 2000, 2009 Ingo Molnar <mingo@redhat.com>
*
* Fixes
* Maciej W. Rozycki : Bits for genuine 82489DX APICs;
@@ -14,51 +14,70 @@
* Mikael Pettersson : PM converted to driver model.
*/
-#include <linux/init.h>
-
-#include <linux/mm.h>
-#include <linux/delay.h>
-#include <linux/bootmem.h>
-#include <linux/interrupt.h>
-#include <linux/mc146818rtc.h>
#include <linux/kernel_stat.h>
-#include <linux/sysdev.h>
-#include <linux/ioport.h>
-#include <linux/cpu.h>
-#include <linux/clockchips.h>
+#include <linux/mc146818rtc.h>
#include <linux/acpi_pmtmr.h>
+#include <linux/clockchips.h>
+#include <linux/interrupt.h>
+#include <linux/bootmem.h>
+#include <linux/ftrace.h>
+#include <linux/ioport.h>
#include <linux/module.h>
-#include <linux/dmi.h>
+#include <linux/sysdev.h>
+#include <linux/delay.h>
+#include <linux/timex.h>
#include <linux/dmar.h>
-#include <linux/ftrace.h>
-#include <linux/smp.h>
+#include <linux/init.h>
+#include <linux/cpu.h>
+#include <linux/dmi.h>
#include <linux/nmi.h>
-#include <linux/timex.h>
+#include <linux/smp.h>
+#include <linux/mm.h>
+#include <asm/pgalloc.h>
#include <asm/atomic.h>
-#include <asm/mtrr.h>
#include <asm/mpspec.h>
-#include <asm/desc.h>
-#include <asm/arch_hooks.h>
-#include <asm/hpet.h>
-#include <asm/pgalloc.h>
#include <asm/i8253.h>
-#include <asm/idle.h>
+#include <asm/i8259.h>
#include <asm/proto.h>
#include <asm/apic.h>
-#include <asm/i8259.h>
+#include <asm/desc.h>
+#include <asm/hpet.h>
+#include <asm/idle.h>
+#include <asm/mtrr.h>
#include <asm/smp.h>
+#include <asm/mce.h>
+
+unsigned int num_processors;
+
+unsigned disabled_cpus __cpuinitdata;
-#include <mach_apic.h>
-#include <mach_apicdef.h>
-#include <mach_ipi.h>
+/* Processor that is doing the boot up */
+unsigned int boot_cpu_physical_apicid = -1U;
/*
- * Sanity check
+ * The highest APIC ID seen during enumeration.
+ *
+ * This determines the messaging protocol we can use: if all APIC IDs
+ * are in the 0 ... 7 range, then we can use logical addressing which
+ * has some performance advantages (better broadcasting).
+ *
+ * If there's an APIC ID above 8, we use physical addressing.
*/
-#if ((SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F)
-# error SPURIOUS_APIC_VECTOR definition error
-#endif
+unsigned int max_physical_apicid;
+
+/*
+ * Bitmask of physically existing CPUs:
+ */
+physid_mask_t phys_cpu_present_map;
+
+/*
+ * Map cpu index to physical APIC ID
+ */
+DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
+DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
#ifdef CONFIG_X86_32
/*
@@ -92,11 +111,7 @@ static __init int setup_apicpmtimer(char *s)
__setup("apicpmtimer", setup_apicpmtimer);
#endif
-#ifdef CONFIG_X86_64
-#define HAVE_X2APIC
-#endif
-
-#ifdef HAVE_X2APIC
+#ifdef CONFIG_X86_X2APIC
int x2apic;
/* x2apic enabled before OS handover */
static int x2apic_preenabled;
@@ -194,18 +209,13 @@ static int modern_apic(void)
return lapic_get_version() >= 0x14;
}
-/*
- * Paravirt kernels also might be using these below ops. So we still
- * use generic apic_read()/apic_write(), which might be pointing to different
- * ops in PARAVIRT case.
- */
-void xapic_wait_icr_idle(void)
+void native_apic_wait_icr_idle(void)
{
while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
cpu_relax();
}
-u32 safe_xapic_wait_icr_idle(void)
+u32 native_safe_apic_wait_icr_idle(void)
{
u32 send_status;
int timeout;
@@ -221,13 +231,13 @@ u32 safe_xapic_wait_icr_idle(void)
return send_status;
}
-void xapic_icr_write(u32 low, u32 id)
+void native_apic_icr_write(u32 low, u32 id)
{
apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id));
apic_write(APIC_ICR, low);
}
-static u64 xapic_icr_read(void)
+u64 native_apic_icr_read(void)
{
u32 icr1, icr2;
@@ -237,54 +247,6 @@ static u64 xapic_icr_read(void)
return icr1 | ((u64)icr2 << 32);
}
-static struct apic_ops xapic_ops = {
- .read = native_apic_mem_read,
- .write = native_apic_mem_write,
- .icr_read = xapic_icr_read,
- .icr_write = xapic_icr_write,
- .wait_icr_idle = xapic_wait_icr_idle,
- .safe_wait_icr_idle = safe_xapic_wait_icr_idle,
-};
-
-struct apic_ops __read_mostly *apic_ops = &xapic_ops;
-EXPORT_SYMBOL_GPL(apic_ops);
-
-#ifdef HAVE_X2APIC
-static void x2apic_wait_icr_idle(void)
-{
- /* no need to wait for icr idle in x2apic */
- return;
-}
-
-static u32 safe_x2apic_wait_icr_idle(void)
-{
- /* no need to wait for icr idle in x2apic */
- return 0;
-}
-
-void x2apic_icr_write(u32 low, u32 id)
-{
- wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low);
-}
-
-static u64 x2apic_icr_read(void)
-{
- unsigned long val;
-
- rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val);
- return val;
-}
-
-static struct apic_ops x2apic_ops = {
- .read = native_apic_msr_read,
- .write = native_apic_msr_write,
- .icr_read = x2apic_icr_read,
- .icr_write = x2apic_icr_write,
- .wait_icr_idle = x2apic_wait_icr_idle,
- .safe_wait_icr_idle = safe_x2apic_wait_icr_idle,
-};
-#endif
-
/**
* enable_NMI_through_LVT0 - enable NMI through local vector table 0
*/
@@ -457,7 +419,7 @@ static void lapic_timer_setup(enum clock_event_mode mode,
static void lapic_timer_broadcast(const struct cpumask *mask)
{
#ifdef CONFIG_SMP
- send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
+ apic->send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
#endif
}
@@ -535,7 +497,8 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
}
}
-static int __init calibrate_by_pmtimer(long deltapm, long *delta)
+static int __init
+calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
{
const long pm_100ms = PMTMR_TICKS_PER_SEC / 10;
const long pm_thresh = pm_100ms / 100;
@@ -546,7 +509,7 @@ static int __init calibrate_by_pmtimer(long deltapm, long *delta)
return -1;
#endif
- apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm);
+ apic_printk(APIC_VERBOSE, "... PM-Timer delta = %ld\n", deltapm);
/* Check, if the PM timer is available */
if (!deltapm)
@@ -556,19 +519,30 @@ static int __init calibrate_by_pmtimer(long deltapm, long *delta)
if (deltapm > (pm_100ms - pm_thresh) &&
deltapm < (pm_100ms + pm_thresh)) {
- apic_printk(APIC_VERBOSE, "... PM timer result ok\n");
- } else {
- res = (((u64)deltapm) * mult) >> 22;
- do_div(res, 1000000);
- pr_warning("APIC calibration not consistent "
- "with PM Timer: %ldms instead of 100ms\n",
- (long)res);
- /* Correct the lapic counter value */
- res = (((u64)(*delta)) * pm_100ms);
+ apic_printk(APIC_VERBOSE, "... PM-Timer result ok\n");
+ return 0;
+ }
+
+ res = (((u64)deltapm) * mult) >> 22;
+ do_div(res, 1000000);
+ pr_warning("APIC calibration not consistent "
+ "with PM-Timer: %ldms instead of 100ms\n",(long)res);
+
+ /* Correct the lapic counter value */
+ res = (((u64)(*delta)) * pm_100ms);
+ do_div(res, deltapm);
+ pr_info("APIC delta adjusted to PM-Timer: "
+ "%lu (%ld)\n", (unsigned long)res, *delta);
+ *delta = (long)res;
+
+ /* Correct the tsc counter value */
+ if (cpu_has_tsc) {
+ res = (((u64)(*deltatsc)) * pm_100ms);
do_div(res, deltapm);
- pr_info("APIC delta adjusted to PM-Timer: "
- "%lu (%ld)\n", (unsigned long)res, *delta);
- *delta = (long)res;
+ apic_printk(APIC_VERBOSE, "TSC delta adjusted to "
+ "PM-Timer: %lu (%ld) \n",
+ (unsigned long)res, *deltatsc);
+ *deltatsc = (long)res;
}
return 0;
@@ -579,7 +553,7 @@ static int __init calibrate_APIC_clock(void)
struct clock_event_device *levt = &__get_cpu_var(lapic_events);
void (*real_handler)(struct clock_event_device *dev);
unsigned long deltaj;
- long delta;
+ long delta, deltatsc;
int pm_referenced = 0;
local_irq_disable();
@@ -609,9 +583,11 @@ static int __init calibrate_APIC_clock(void)
delta = lapic_cal_t1 - lapic_cal_t2;
apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta);
+ deltatsc = (long)(lapic_cal_tsc2 - lapic_cal_tsc1);
+
/* we trust the PM based calibration if possible */
pm_referenced = !calibrate_by_pmtimer(lapic_cal_pm2 - lapic_cal_pm1,
- &delta);
+ &delta, &deltatsc);
/* Calculate the scaled math multiplication factor */
lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS,
@@ -629,11 +605,10 @@ static int __init calibrate_APIC_clock(void)
calibration_result);
if (cpu_has_tsc) {
- delta = (long)(lapic_cal_tsc2 - lapic_cal_tsc1);
apic_printk(APIC_VERBOSE, "..... CPU clock speed is "
"%ld.%04ld MHz.\n",
- (delta / LAPIC_CAL_LOOPS) / (1000000 / HZ),
- (delta / LAPIC_CAL_LOOPS) % (1000000 / HZ));
+ (deltatsc / LAPIC_CAL_LOOPS) / (1000000 / HZ),
+ (deltatsc / LAPIC_CAL_LOOPS) % (1000000 / HZ));
}
apic_printk(APIC_VERBOSE, "..... host bus clock speed is "
@@ -834,7 +809,7 @@ void clear_local_APIC(void)
u32 v;
/* APIC hasn't been mapped yet */
- if (!apic_phys)
+ if (!x2apic && !apic_phys)
return;
maxlvt = lapic_get_maxlvt();
@@ -868,6 +843,14 @@ void clear_local_APIC(void)
apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
}
#endif
+#ifdef CONFIG_X86_MCE_INTEL
+ if (maxlvt >= 6) {
+ v = apic_read(APIC_LVTCMCI);
+ if (!(v & APIC_LVT_MASKED))
+ apic_write(APIC_LVTCMCI, v | APIC_LVT_MASKED);
+ }
+#endif
+
/*
* Clean APIC state for other OSs:
*/
@@ -991,11 +974,11 @@ int __init verify_local_APIC(void)
*/
reg0 = apic_read(APIC_ID);
apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
- apic_write(APIC_ID, reg0 ^ APIC_ID_MASK);
+ apic_write(APIC_ID, reg0 ^ apic->apic_id_mask);
reg1 = apic_read(APIC_ID);
apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
apic_write(APIC_ID, reg0);
- if (reg1 != (reg0 ^ APIC_ID_MASK))
+ if (reg1 != (reg0 ^ apic->apic_id_mask))
return 0;
/*
@@ -1089,7 +1072,7 @@ static void __cpuinit lapic_setup_esr(void)
return;
}
- if (esr_disable) {
+ if (apic->disable_esr) {
/*
* Something untraceable is creating bad interrupts on
* secondary quads ... for the moment, just leave the
@@ -1130,9 +1113,14 @@ void __cpuinit setup_local_APIC(void)
unsigned int value;
int i, j;
+ if (disable_apic) {
+ arch_disable_smp_support();
+ return;
+ }
+
#ifdef CONFIG_X86_32
/* Pound the ESR really hard over the head with a big hammer - mbligh */
- if (lapic_is_integrated() && esr_disable) {
+ if (lapic_is_integrated() && apic->disable_esr) {
apic_write(APIC_ESR, 0);
apic_write(APIC_ESR, 0);
apic_write(APIC_ESR, 0);
@@ -1146,7 +1134,7 @@ void __cpuinit setup_local_APIC(void)
* Double-check whether this APIC is really registered.
* This is meaningless in clustered apic mode, so we skip it.
*/
- if (!apic_id_registered())
+ if (!apic->apic_id_registered())
BUG();
/*
@@ -1154,7 +1142,7 @@ void __cpuinit setup_local_APIC(void)
* an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
* document number 292116). So here it goes...
*/
- init_apic_ldr();
+ apic->init_apic_ldr();
/*
* Set Task Priority to 'accept all'. We never change this
@@ -1262,6 +1250,12 @@ void __cpuinit setup_local_APIC(void)
apic_write(APIC_LVT1, value);
preempt_enable();
+
+#ifdef CONFIG_X86_MCE_INTEL
+ /* Recheck CMCI information after local APIC is up on CPU #0 */
+ if (smp_processor_id() == 0)
+ cmci_recheck();
+#endif
}
void __cpuinit end_local_APIC_setup(void)
@@ -1282,17 +1276,12 @@ void __cpuinit end_local_APIC_setup(void)
apic_pm_activate();
}
-#ifdef HAVE_X2APIC
+#ifdef CONFIG_X86_X2APIC
void check_x2apic(void)
{
- int msr, msr2;
-
- rdmsr(MSR_IA32_APICBASE, msr, msr2);
-
- if (msr & X2APIC_ENABLE) {
+ if (x2apic_enabled()) {
pr_info("x2apic enabled by BIOS, switching to x2apic ops\n");
x2apic_preenabled = x2apic = 1;
- apic_ops = &x2apic_ops;
}
}
@@ -1300,6 +1289,9 @@ void enable_x2apic(void)
{
int msr, msr2;
+ if (!x2apic)
+ return;
+
rdmsr(MSR_IA32_APICBASE, msr, msr2);
if (!(msr & X2APIC_ENABLE)) {
pr_info("Enabling x2apic\n");
@@ -1342,15 +1334,16 @@ void __init enable_IR_x2apic(void)
return;
}
- local_irq_save(flags);
- mask_8259A();
-
- ret = save_mask_IO_APIC_setup();
+ ret = save_IO_APIC_setup();
if (ret) {
pr_info("Saving IO-APIC state failed: %d\n", ret);
goto end;
}
+ local_irq_save(flags);
+ mask_IO_APIC_setup();
+ mask_8259A();
+
ret = enable_intr_remapping(1);
if (ret && x2apic_preenabled) {
@@ -1363,7 +1356,6 @@ void __init enable_IR_x2apic(void)
if (!x2apic) {
x2apic = 1;
- apic_ops = &x2apic_ops;
enable_x2apic();
}
@@ -1376,10 +1368,10 @@ end_restore:
else
reinit_intr_remapped_IO_APIC(x2apic_preenabled);
-end:
unmask_8259A();
local_irq_restore(flags);
+end:
if (!ret) {
if (!x2apic_preenabled)
pr_info("Enabled x2apic and interrupt-remapping\n");
@@ -1401,7 +1393,7 @@ end:
return;
}
-#endif /* HAVE_X2APIC */
+#endif /* CONFIG_X86_X2APIC */
#ifdef CONFIG_X86_64
/*
@@ -1532,12 +1524,10 @@ void __init early_init_lapic_mapping(void)
*/
void __init init_apic_mappings(void)
{
-#ifdef HAVE_X2APIC
if (x2apic) {
boot_cpu_physical_apicid = read_apic_id();
return;
}
-#endif
/*
* If no local APIC can be found then set up a fake all
@@ -1570,11 +1560,11 @@ int apic_version[MAX_APICS];
int __init APIC_init_uniprocessor(void)
{
-#ifdef CONFIG_X86_64
if (disable_apic) {
pr_info("Apic disabled\n");
return -1;
}
+#ifdef CONFIG_X86_64
if (!cpu_has_apic) {
disable_apic = 1;
pr_info("Apic disabled by BIOS\n");
@@ -1596,11 +1586,9 @@ int __init APIC_init_uniprocessor(void)
}
#endif
-#ifdef HAVE_X2APIC
enable_IR_x2apic();
-#endif
#ifdef CONFIG_X86_64
- setup_apic_routing();
+ default_setup_apic_routing();
#endif
verify_local_APIC();
@@ -1621,35 +1609,31 @@ int __init APIC_init_uniprocessor(void)
physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
setup_local_APIC();
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_X86_IO_APIC
/*
* Now enable IO-APICs, actually call clear_IO_APIC
- * We need clear_IO_APIC before enabling vector on BP
+ * We need clear_IO_APIC before enabling error vector
*/
if (!skip_ioapic_setup && nr_ioapics)
enable_IO_APIC();
#endif
-#ifdef CONFIG_X86_IO_APIC
- if (!smp_found_config || skip_ioapic_setup || !nr_ioapics)
-#endif
- localise_nmi_watchdog();
end_local_APIC_setup();
#ifdef CONFIG_X86_IO_APIC
if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
setup_IO_APIC();
-# ifdef CONFIG_X86_64
- else
+ else {
nr_ioapics = 0;
-# endif
+ localise_nmi_watchdog();
+ }
+#else
+ localise_nmi_watchdog();
#endif
+ setup_boot_clock();
#ifdef CONFIG_X86_64
- setup_boot_APIC_clock();
check_nmi_watchdog();
-#else
- setup_boot_clock();
#endif
return 0;
@@ -1738,7 +1722,8 @@ void __init connect_bsp_APIC(void)
outb(0x01, 0x23);
}
#endif
- enable_apic_mode();
+ if (apic->enable_apic_mode)
+ apic->enable_apic_mode();
}
/**
@@ -1876,29 +1861,39 @@ void __cpuinit generic_processor_info(int apicid, int version)
}
#endif
-#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64)
- /* are we being called early in kernel startup? */
- if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
- u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
- u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
-
- cpu_to_apicid[cpu] = apicid;
- bios_cpu_apicid[cpu] = apicid;
- } else {
- per_cpu(x86_cpu_to_apicid, cpu) = apicid;
- per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
- }
+#if defined(CONFIG_SMP) || defined(CONFIG_X86_64)
+ early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;
+ early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
#endif
set_cpu_possible(cpu, true);
set_cpu_present(cpu, true);
}
-#ifdef CONFIG_X86_64
int hard_smp_processor_id(void)
{
return read_apic_id();
}
+
+void default_init_apic_ldr(void)
+{
+ unsigned long val;
+
+ apic_write(APIC_DFR, APIC_DFR_VALUE);
+ val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
+ val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id());
+ apic_write(APIC_LDR, val);
+}
+
+#ifdef CONFIG_X86_32
+int default_apicid_to_node(int logical_apicid)
+{
+#ifdef CONFIG_SMP
+ return apicid_2_node[hard_smp_processor_id()];
+#else
+ return 0;
+#endif
+}
#endif
/*
@@ -1976,12 +1971,9 @@ static int lapic_resume(struct sys_device *dev)
local_irq_save(flags);
-#ifdef HAVE_X2APIC
if (x2apic)
enable_x2apic();
- else
-#endif
- {
+ else {
/*
* Make sure the APICBASE points to the right address
*
diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 34185488e4f..f933822dba1 100644
--- a/arch/x86/kernel/genapic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -17,9 +17,8 @@
#include <linux/init.h>
#include <linux/hardirq.h>
#include <asm/smp.h>
+#include <asm/apic.h>
#include <asm/ipi.h>
-#include <asm/genapic.h>
-#include <mach_apicdef.h>
#ifdef CONFIG_ACPI
#include <acpi/acpi_bus.h>
@@ -74,7 +73,7 @@ static inline void _flat_send_IPI_mask(unsigned long mask, int vector)
unsigned long flags;
local_irq_save(flags);
- __send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL);
+ __default_send_IPI_dest_field(mask, vector, apic->dest_logical);
local_irq_restore(flags);
}
@@ -85,14 +84,15 @@ static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector)
_flat_send_IPI_mask(mask, vector);
}
-static void flat_send_IPI_mask_allbutself(const struct cpumask *cpumask,
- int vector)
+static void
+ flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector)
{
unsigned long mask = cpumask_bits(cpumask)[0];
int cpu = smp_processor_id();
if (cpu < BITS_PER_LONG)
clear_bit(cpu, &mask);
+
_flat_send_IPI_mask(mask, vector);
}
@@ -114,23 +114,27 @@ static void flat_send_IPI_allbutself(int vector)
_flat_send_IPI_mask(mask, vector);
}
} else if (num_online_cpus() > 1) {
- __send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL);
+ __default_send_IPI_shortcut(APIC_DEST_ALLBUT,
+ vector, apic->dest_logical);
}
}
static void flat_send_IPI_all(int vector)
{
- if (vector == NMI_VECTOR)
+ if (vector == NMI_VECTOR) {
flat_send_IPI_mask(cpu_online_mask, vector);
- else
- __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
+ } else {
+ __default_send_IPI_shortcut(APIC_DEST_ALLINC,
+ vector, apic->dest_logical);
+ }
}
-static unsigned int get_apic_id(unsigned long x)
+static unsigned int flat_get_apic_id(unsigned long x)
{
unsigned int id;
id = (((x)>>24) & 0xFFu);
+
return id;
}
@@ -146,7 +150,7 @@ static unsigned int read_xapic_id(void)
{
unsigned int id;
- id = get_apic_id(apic_read(APIC_ID));
+ id = flat_get_apic_id(apic_read(APIC_ID));
return id;
}
@@ -169,31 +173,67 @@ static unsigned int flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
return mask1 & mask2;
}
-static unsigned int phys_pkg_id(int index_msb)
+static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
{
return hard_smp_processor_id() >> index_msb;
}
-struct genapic apic_flat = {
- .name = "flat",
- .acpi_madt_oem_check = flat_acpi_madt_oem_check,
- .int_delivery_mode = dest_LowestPrio,
- .int_dest_mode = (APIC_DEST_LOGICAL != 0),
- .target_cpus = flat_target_cpus,
- .vector_allocation_domain = flat_vector_allocation_domain,
- .apic_id_registered = flat_apic_id_registered,
- .init_apic_ldr = flat_init_apic_ldr,
- .send_IPI_all = flat_send_IPI_all,
- .send_IPI_allbutself = flat_send_IPI_allbutself,
- .send_IPI_mask = flat_send_IPI_mask,
- .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself,
- .send_IPI_self = apic_send_IPI_self,
- .cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
- .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
- .phys_pkg_id = phys_pkg_id,
- .get_apic_id = get_apic_id,
- .set_apic_id = set_apic_id,
- .apic_id_mask = (0xFFu<<24),
+struct apic apic_flat = {
+ .name = "flat",
+ .probe = NULL,
+ .acpi_madt_oem_check = flat_acpi_madt_oem_check,
+ .apic_id_registered = flat_apic_id_registered,
+
+ .irq_delivery_mode = dest_LowestPrio,
+ .irq_dest_mode = 1, /* logical */
+
+ .target_cpus = flat_target_cpus,
+ .disable_esr = 0,
+ .dest_logical = APIC_DEST_LOGICAL,
+ .check_apicid_used = NULL,
+ .check_apicid_present = NULL,
+
+ .vector_allocation_domain = flat_vector_allocation_domain,
+ .init_apic_ldr = flat_init_apic_ldr,
+
+ .ioapic_phys_id_map = NULL,
+ .setup_apic_routing = NULL,
+ .multi_timer_check = NULL,
+ .apicid_to_node = NULL,
+ .cpu_to_logical_apicid = NULL,
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = NULL,
+ .setup_portio_remap = NULL,
+ .check_phys_apicid_present = default_check_phys_apicid_present,
+ .enable_apic_mode = NULL,
+ .phys_pkg_id = flat_phys_pkg_id,
+ .mps_oem_check = NULL,
+
+ .get_apic_id = flat_get_apic_id,
+ .set_apic_id = set_apic_id,
+ .apic_id_mask = 0xFFu << 24,
+
+ .cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
+
+ .send_IPI_mask = flat_send_IPI_mask,
+ .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = flat_send_IPI_allbutself,
+ .send_IPI_all = flat_send_IPI_all,
+ .send_IPI_self = apic_send_IPI_self,
+
+ .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
+ .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
+ .wait_for_init_deassert = NULL,
+ .smp_callin_clear_local_apic = NULL,
+ .inquire_remote_apic = NULL,
+
+ .read = native_apic_mem_read,
+ .write = native_apic_mem_write,
+ .icr_read = native_apic_icr_read,
+ .icr_write = native_apic_icr_write,
+ .wait_icr_idle = native_apic_wait_icr_idle,
+ .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
};
/*
@@ -232,18 +272,18 @@ static void physflat_vector_allocation_domain(int cpu, struct cpumask *retmask)
static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector)
{
- send_IPI_mask_sequence(cpumask, vector);
+ default_send_IPI_mask_sequence_phys(cpumask, vector);
}
static void physflat_send_IPI_mask_allbutself(const struct cpumask *cpumask,
int vector)
{
- send_IPI_mask_allbutself(cpumask, vector);
+ default_send_IPI_mask_allbutself_phys(cpumask, vector);
}
static void physflat_send_IPI_allbutself(int vector)
{
- send_IPI_mask_allbutself(cpu_online_mask, vector);
+ default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
}
static void physflat_send_IPI_all(int vector)
@@ -276,32 +316,72 @@ physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
* We're using fixed IRQ delivery, can only return one phys APIC ID.
* May as well be the first.
*/
- for_each_cpu_and(cpu, cpumask, andmask)
+ for_each_cpu_and(cpu, cpumask, andmask) {
if (cpumask_test_cpu(cpu, cpu_online_mask))
break;
+ }
if (cpu < nr_cpu_ids)
return per_cpu(x86_cpu_to_apicid, cpu);
+
return BAD_APICID;
}
-struct genapic apic_physflat = {
- .name = "physical flat",
- .acpi_madt_oem_check = physflat_acpi_madt_oem_check,
- .int_delivery_mode = dest_Fixed,
- .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
- .target_cpus = physflat_target_cpus,
- .vector_allocation_domain = physflat_vector_allocation_domain,
- .apic_id_registered = flat_apic_id_registered,
- .init_apic_ldr = flat_init_apic_ldr,/*not needed, but shouldn't hurt*/
- .send_IPI_all = physflat_send_IPI_all,
- .send_IPI_allbutself = physflat_send_IPI_allbutself,
- .send_IPI_mask = physflat_send_IPI_mask,
- .send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself,
- .send_IPI_self = apic_send_IPI_self,
- .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid,
- .cpu_mask_to_apicid_and = physflat_cpu_mask_to_apicid_and,
- .phys_pkg_id = phys_pkg_id,
- .get_apic_id = get_apic_id,
- .set_apic_id = set_apic_id,
- .apic_id_mask = (0xFFu<<24),
+struct apic apic_physflat = {
+
+ .name = "physical flat",
+ .probe = NULL,
+ .acpi_madt_oem_check = physflat_acpi_madt_oem_check,
+ .apic_id_registered = flat_apic_id_registered,
+
+ .irq_delivery_mode = dest_Fixed,
+ .irq_dest_mode = 0, /* physical */
+
+ .target_cpus = physflat_target_cpus,
+ .disable_esr = 0,
+ .dest_logical = 0,
+ .check_apicid_used = NULL,
+ .check_apicid_present = NULL,
+
+ .vector_allocation_domain = physflat_vector_allocation_domain,
+ /* not needed, but shouldn't hurt: */
+ .init_apic_ldr = flat_init_apic_ldr,
+
+ .ioapic_phys_id_map = NULL,
+ .setup_apic_routing = NULL,
+ .multi_timer_check = NULL,
+ .apicid_to_node = NULL,
+ .cpu_to_logical_apicid = NULL,
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = NULL,
+ .setup_portio_remap = NULL,
+ .check_phys_apicid_present = default_check_phys_apicid_present,
+ .enable_apic_mode = NULL,
+ .phys_pkg_id = flat_phys_pkg_id,
+ .mps_oem_check = NULL,
+
+ .get_apic_id = flat_get_apic_id,
+ .set_apic_id = set_apic_id,
+ .apic_id_mask = 0xFFu << 24,
+
+ .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = physflat_cpu_mask_to_apicid_and,
+
+ .send_IPI_mask = physflat_send_IPI_mask,
+ .send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = physflat_send_IPI_allbutself,
+ .send_IPI_all = physflat_send_IPI_all,
+ .send_IPI_self = apic_send_IPI_self,
+
+ .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
+ .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
+ .wait_for_init_deassert = NULL,
+ .smp_callin_clear_local_apic = NULL,
+ .inquire_remote_apic = NULL,
+
+ .read = native_apic_mem_read,
+ .write = native_apic_mem_write,
+ .icr_read = native_apic_icr_read,
+ .icr_write = native_apic_icr_write,
+ .wait_icr_idle = native_apic_wait_icr_idle,
+ .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
};
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
new file mode 100644
index 00000000000..d806ecaa948
--- /dev/null
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -0,0 +1,267 @@
+/*
+ * APIC driver for "bigsmp" xAPIC machines with more than 8 virtual CPUs.
+ *
+ * Drives the local APIC in "clustered mode".
+ */
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/dmi.h>
+#include <linux/smp.h>
+
+#include <asm/apicdef.h>
+#include <asm/fixmap.h>
+#include <asm/mpspec.h>
+#include <asm/apic.h>
+#include <asm/ipi.h>
+
+static unsigned bigsmp_get_apic_id(unsigned long x)
+{
+ return (x >> 24) & 0xFF;
+}
+
+static int bigsmp_apic_id_registered(void)
+{
+ return 1;
+}
+
+static const cpumask_t *bigsmp_target_cpus(void)
+{
+#ifdef CONFIG_SMP
+ return &cpu_online_map;
+#else
+ return &cpumask_of_cpu(0);
+#endif
+}
+
+static unsigned long bigsmp_check_apicid_used(physid_mask_t bitmap, int apicid)
+{
+ return 0;
+}
+
+static unsigned long bigsmp_check_apicid_present(int bit)
+{
+ return 1;
+}
+
+static inline unsigned long calculate_ldr(int cpu)
+{
+ unsigned long val, id;
+
+ val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
+ id = per_cpu(x86_bios_cpu_apicid, cpu);
+ val |= SET_APIC_LOGICAL_ID(id);
+
+ return val;
+}
+
+/*
+ * Set up the logical destination ID.
+ *
+ * Intel recommends to set DFR, LDR and TPR before enabling
+ * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
+ * document number 292116). So here it goes...
+ */
+static void bigsmp_init_apic_ldr(void)
+{
+ unsigned long val;
+ int cpu = smp_processor_id();
+
+ apic_write(APIC_DFR, APIC_DFR_FLAT);
+ val = calculate_ldr(cpu);
+ apic_write(APIC_LDR, val);
+}
+
+static void bigsmp_setup_apic_routing(void)
+{
+ printk(KERN_INFO
+ "Enabling APIC mode: Physflat. Using %d I/O APICs\n",
+ nr_ioapics);
+}
+
+static int bigsmp_apicid_to_node(int logical_apicid)
+{
+ return apicid_2_node[hard_smp_processor_id()];
+}
+
+static int bigsmp_cpu_present_to_apicid(int mps_cpu)
+{
+ if (mps_cpu < nr_cpu_ids)
+ return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu);
+
+ return BAD_APICID;
+}
+
+static physid_mask_t bigsmp_apicid_to_cpu_present(int phys_apicid)
+{
+ return physid_mask_of_physid(phys_apicid);
+}
+
+/* Mapping from cpu number to logical apicid */
+static inline int bigsmp_cpu_to_logical_apicid(int cpu)
+{
+ if (cpu >= nr_cpu_ids)
+ return BAD_APICID;
+ return cpu_physical_id(cpu);
+}
+
+static physid_mask_t bigsmp_ioapic_phys_id_map(physid_mask_t phys_map)
+{
+ /* For clustered we don't have a good way to do this yet - hack */
+ return physids_promote(0xFFL);
+}
+
+static int bigsmp_check_phys_apicid_present(int boot_cpu_physical_apicid)
+{
+ return 1;
+}
+
+/* As we are using single CPU as destination, pick only one CPU here */
+static unsigned int bigsmp_cpu_mask_to_apicid(const cpumask_t *cpumask)
+{
+ return bigsmp_cpu_to_logical_apicid(first_cpu(*cpumask));
+}
+
+static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+ const struct cpumask *andmask)
+{
+ int cpu;
+
+ /*
+ * We're using fixed IRQ delivery, can only return one phys APIC ID.
+ * May as well be the first.
+ */
+ for_each_cpu_and(cpu, cpumask, andmask) {
+ if (cpumask_test_cpu(cpu, cpu_online_mask))
+ break;
+ }
+ if (cpu < nr_cpu_ids)
+ return bigsmp_cpu_to_logical_apicid(cpu);
+
+ return BAD_APICID;
+}
+
+static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
+{
+ return cpuid_apic >> index_msb;
+}
+
+static inline void bigsmp_send_IPI_mask(const struct cpumask *mask, int vector)
+{
+ default_send_IPI_mask_sequence_phys(mask, vector);
+}
+
+static void bigsmp_send_IPI_allbutself(int vector)
+{
+ default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
+}
+
+static void bigsmp_send_IPI_all(int vector)
+{
+ bigsmp_send_IPI_mask(cpu_online_mask, vector);
+}
+
+static int dmi_bigsmp; /* can be set by dmi scanners */
+
+static int hp_ht_bigsmp(const struct dmi_system_id *d)
+{
+ printk(KERN_NOTICE "%s detected: force use of apic=bigsmp\n", d->ident);
+ dmi_bigsmp = 1;
+
+ return 0;
+}
+
+
+static const struct dmi_system_id bigsmp_dmi_table[] = {
+ { hp_ht_bigsmp, "HP ProLiant DL760 G2",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
+ DMI_MATCH(DMI_BIOS_VERSION, "P44-"),
+ }
+ },
+
+ { hp_ht_bigsmp, "HP ProLiant DL740",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
+ DMI_MATCH(DMI_BIOS_VERSION, "P47-"),
+ }
+ },
+ { } /* NULL entry stops DMI scanning */
+};
+
+static void bigsmp_vector_allocation_domain(int cpu, cpumask_t *retmask)
+{
+ cpus_clear(*retmask);
+ cpu_set(cpu, *retmask);
+}
+
+static int probe_bigsmp(void)
+{
+ if (def_to_bigsmp)
+ dmi_bigsmp = 1;
+ else
+ dmi_check_system(bigsmp_dmi_table);
+
+ return dmi_bigsmp;
+}
+
+struct apic apic_bigsmp = {
+
+ .name = "bigsmp",
+ .probe = probe_bigsmp,
+ .acpi_madt_oem_check = NULL,
+ .apic_id_registered = bigsmp_apic_id_registered,
+
+ .irq_delivery_mode = dest_Fixed,
+ /* phys delivery to target CPU: */
+ .irq_dest_mode = 0,
+
+ .target_cpus = bigsmp_target_cpus,
+ .disable_esr = 1,
+ .dest_logical = 0,
+ .check_apicid_used = bigsmp_check_apicid_used,
+ .check_apicid_present = bigsmp_check_apicid_present,
+
+ .vector_allocation_domain = bigsmp_vector_allocation_domain,
+ .init_apic_ldr = bigsmp_init_apic_ldr,
+
+ .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map,
+ .setup_apic_routing = bigsmp_setup_apic_routing,
+ .multi_timer_check = NULL,
+ .apicid_to_node = bigsmp_apicid_to_node,
+ .cpu_to_logical_apicid = bigsmp_cpu_to_logical_apicid,
+ .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid,
+ .apicid_to_cpu_present = bigsmp_apicid_to_cpu_present,
+ .setup_portio_remap = NULL,
+ .check_phys_apicid_present = bigsmp_check_phys_apicid_present,
+ .enable_apic_mode = NULL,
+ .phys_pkg_id = bigsmp_phys_pkg_id,
+ .mps_oem_check = NULL,
+
+ .get_apic_id = bigsmp_get_apic_id,
+ .set_apic_id = NULL,
+ .apic_id_mask = 0xFF << 24,
+
+ .cpu_mask_to_apicid = bigsmp_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = bigsmp_cpu_mask_to_apicid_and,
+
+ .send_IPI_mask = bigsmp_send_IPI_mask,
+ .send_IPI_mask_allbutself = NULL,
+ .send_IPI_allbutself = bigsmp_send_IPI_allbutself,
+ .send_IPI_all = bigsmp_send_IPI_all,
+ .send_IPI_self = default_send_IPI_self,
+
+ .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
+ .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
+
+ .wait_for_init_deassert = default_wait_for_init_deassert,
+
+ .smp_callin_clear_local_apic = NULL,
+ .inquire_remote_apic = default_inquire_remote_apic,
+
+ .read = native_apic_mem_read,
+ .write = native_apic_mem_write,
+ .icr_read = native_apic_icr_read,
+ .icr_write = native_apic_icr_write,
+ .wait_icr_idle = native_apic_wait_icr_idle,
+ .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
+};
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
new file mode 100644
index 00000000000..19588f2770e
--- /dev/null
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -0,0 +1,780 @@
+/*
+ * Written by: Garry Forsgren, Unisys Corporation
+ * Natalie Protasevich, Unisys Corporation
+ *
+ * This file contains the code to configure and interface
+ * with Unisys ES7000 series hardware system manager.
+ *
+ * Copyright (c) 2003 Unisys Corporation.
+ * Copyright (C) 2009, Red Hat, Inc., Ingo Molnar
+ *
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Unisys Corporation, Township Line & Union Meeting
+ * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or:
+ *
+ * http://www.unisys.com
+ */
+#include <linux/notifier.h>
+#include <linux/spinlock.h>
+#include <linux/cpumask.h>
+#include <linux/threads.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/reboot.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/acpi.h>
+#include <linux/init.h>
+#include <linux/nmi.h>
+#include <linux/smp.h>
+#include <linux/io.h>
+
+#include <asm/apicdef.h>
+#include <asm/atomic.h>
+#include <asm/fixmap.h>
+#include <asm/mpspec.h>
+#include <asm/setup.h>
+#include <asm/apic.h>
+#include <asm/ipi.h>
+
+/*
+ * ES7000 chipsets
+ */
+
+#define NON_UNISYS 0
+#define ES7000_CLASSIC 1
+#define ES7000_ZORRO 2
+
+#define MIP_REG 1
+#define MIP_PSAI_REG 4
+
+#define MIP_BUSY 1
+#define MIP_SPIN 0xf0000
+#define MIP_VALID 0x0100000000000000ULL
+#define MIP_SW_APIC 0x1020b
+
+#define MIP_PORT(val) ((val >> 32) & 0xffff)
+
+#define MIP_RD_LO(val) (val & 0xffffffff)
+
+struct mip_reg {
+ unsigned long long off_0x00;
+ unsigned long long off_0x08;
+ unsigned long long off_0x10;
+ unsigned long long off_0x18;
+ unsigned long long off_0x20;
+ unsigned long long off_0x28;
+ unsigned long long off_0x30;
+ unsigned long long off_0x38;
+};
+
+struct mip_reg_info {
+ unsigned long long mip_info;
+ unsigned long long delivery_info;
+ unsigned long long host_reg;
+ unsigned long long mip_reg;
+};
+
+struct psai {
+ unsigned long long entry_type;
+ unsigned long long addr;
+ unsigned long long bep_addr;
+};
+
+#ifdef CONFIG_ACPI
+
+struct es7000_oem_table {
+ struct acpi_table_header Header;
+ u32 OEMTableAddr;
+ u32 OEMTableSize;
+};
+
+static unsigned long oem_addrX;
+static unsigned long oem_size;
+
+#endif
+
+/*
+ * ES7000 Globals
+ */
+
+static volatile unsigned long *psai;
+static struct mip_reg *mip_reg;
+static struct mip_reg *host_reg;
+static int mip_port;
+static unsigned long mip_addr;
+static unsigned long host_addr;
+
+int es7000_plat;
+
+/*
+ * GSI override for ES7000 platforms.
+ */
+
+static unsigned int base;
+
+static int
+es7000_rename_gsi(int ioapic, int gsi)
+{
+ if (es7000_plat == ES7000_ZORRO)
+ return gsi;
+
+ if (!base) {
+ int i;
+ for (i = 0; i < nr_ioapics; i++)
+ base += nr_ioapic_registers[i];
+ }
+
+ if (!ioapic && (gsi < 16))
+ gsi += base;
+
+ return gsi;
+}
+
+static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
+{
+ unsigned long vect = 0, psaival = 0;
+
+ if (psai == NULL)
+ return -1;
+
+ vect = ((unsigned long)__pa(eip)/0x1000) << 16;
+ psaival = (0x1000000 | vect | cpu);
+
+ while (*psai & 0x1000000)
+ ;
+
+ *psai = psaival;
+
+ return 0;
+}
+
+static int es7000_apic_is_cluster(void)
+{
+ /* MPENTIUMIII */
+ if (boot_cpu_data.x86 == 6 &&
+ (boot_cpu_data.x86_model >= 7 || boot_cpu_data.x86_model <= 11))
+ return 1;
+
+ return 0;
+}
+
+static void setup_unisys(void)
+{
+ /*
+ * Determine the generation of the ES7000 currently running.
+ *
+ * es7000_plat = 1 if the machine is a 5xx ES7000 box
+ * es7000_plat = 2 if the machine is a x86_64 ES7000 box
+ *
+ */
+ if (!(boot_cpu_data.x86 <= 15 && boot_cpu_data.x86_model <= 2))
+ es7000_plat = ES7000_ZORRO;
+ else
+ es7000_plat = ES7000_CLASSIC;
+ ioapic_renumber_irq = es7000_rename_gsi;
+}
+
+/*
+ * Parse the OEM Table:
+ */
+static int parse_unisys_oem(char *oemptr)
+{
+ int i;
+ int success = 0;
+ unsigned char type, size;
+ unsigned long val;
+ char *tp = NULL;
+ struct psai *psaip = NULL;
+ struct mip_reg_info *mi;
+ struct mip_reg *host, *mip;
+
+ tp = oemptr;
+
+ tp += 8;
+
+ for (i = 0; i <= 6; i++) {
+ type = *tp++;
+ size = *tp++;
+ tp -= 2;
+ switch (type) {
+ case MIP_REG:
+ mi = (struct mip_reg_info *)tp;
+ val = MIP_RD_LO(mi->host_reg);
+ host_addr = val;
+ host = (struct mip_reg *)val;
+ host_reg = __va(host);
+ val = MIP_RD_LO(mi->mip_reg);
+ mip_port = MIP_PORT(mi->mip_info);
+ mip_addr = val;
+ mip = (struct mip_reg *)val;
+ mip_reg = __va(mip);
+ pr_debug("es7000_mipcfg: host_reg = 0x%lx \n",
+ (unsigned long)host_reg);
+ pr_debug("es7000_mipcfg: mip_reg = 0x%lx \n",
+ (unsigned long)mip_reg);
+ success++;
+ break;
+ case MIP_PSAI_REG:
+ psaip = (struct psai *)tp;
+ if (tp != NULL) {
+ if (psaip->addr)
+ psai = __va(psaip->addr);
+ else
+ psai = NULL;
+ success++;
+ }
+ break;
+ default:
+ break;
+ }
+ tp += size;
+ }
+
+ if (success < 2)
+ es7000_plat = NON_UNISYS;
+ else
+ setup_unisys();
+
+ return es7000_plat;
+}
+
+#ifdef CONFIG_ACPI
+static int find_unisys_acpi_oem_table(unsigned long *oem_addr)
+{
+ struct acpi_table_header *header = NULL;
+ struct es7000_oem_table *table;
+ acpi_size tbl_size;
+ acpi_status ret;
+ int i = 0;
+
+ for (;;) {
+ ret = acpi_get_table_with_size("OEM1", i++, &header, &tbl_size);
+ if (!ACPI_SUCCESS(ret))
+ return -1;
+
+ if (!memcmp((char *) &header->oem_id, "UNISYS", 6))
+ break;
+
+ early_acpi_os_unmap_memory(header, tbl_size);
+ }
+
+ table = (void *)header;
+
+ oem_addrX = table->OEMTableAddr;
+ oem_size = table->OEMTableSize;
+
+ early_acpi_os_unmap_memory(header, tbl_size);
+
+ *oem_addr = (unsigned long)__acpi_map_table(oem_addrX, oem_size);
+
+ return 0;
+}
+
+static void unmap_unisys_acpi_oem_table(unsigned long oem_addr)
+{
+ if (!oem_addr)
+ return;
+
+ __acpi_unmap_table((char *)oem_addr, oem_size);
+}
+
+static int es7000_check_dsdt(void)
+{
+ struct acpi_table_header header;
+
+ if (ACPI_SUCCESS(acpi_get_table_header(ACPI_SIG_DSDT, 0, &header)) &&
+ !strncmp(header.oem_id, "UNISYS", 6))
+ return 1;
+ return 0;
+}
+
+static int es7000_acpi_ret;
+
+/* Hook from generic ACPI tables.c */
+static int es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ unsigned long oem_addr = 0;
+ int check_dsdt;
+ int ret = 0;
+
+ /* check dsdt at first to avoid clear fix_map for oem_addr */
+ check_dsdt = es7000_check_dsdt();
+
+ if (!find_unisys_acpi_oem_table(&oem_addr)) {
+ if (check_dsdt) {
+ ret = parse_unisys_oem((char *)oem_addr);
+ } else {
+ setup_unisys();
+ ret = 1;
+ }
+ /*
+ * we need to unmap it
+ */
+ unmap_unisys_acpi_oem_table(oem_addr);
+ }
+
+ es7000_acpi_ret = ret;
+
+ return ret && !es7000_apic_is_cluster();
+}
+
+static int es7000_acpi_madt_oem_check_cluster(char *oem_id, char *oem_table_id)
+{
+ int ret = es7000_acpi_ret;
+
+ return ret && es7000_apic_is_cluster();
+}
+
+#else /* !CONFIG_ACPI: */
+static int es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ return 0;
+}
+
+static int es7000_acpi_madt_oem_check_cluster(char *oem_id, char *oem_table_id)
+{
+ return 0;
+}
+#endif /* !CONFIG_ACPI */
+
+static void es7000_spin(int n)
+{
+ int i = 0;
+
+ while (i++ < n)
+ rep_nop();
+}
+
+static int es7000_mip_write(struct mip_reg *mip_reg)
+{
+ int status = 0;
+ int spin;
+
+ spin = MIP_SPIN;
+ while ((host_reg->off_0x38 & MIP_VALID) != 0) {
+ if (--spin <= 0) {
+ WARN(1, "Timeout waiting for Host Valid Flag\n");
+ return -1;
+ }
+ es7000_spin(MIP_SPIN);
+ }
+
+ memcpy(host_reg, mip_reg, sizeof(struct mip_reg));
+ outb(1, mip_port);
+
+ spin = MIP_SPIN;
+
+ while ((mip_reg->off_0x38 & MIP_VALID) == 0) {
+ if (--spin <= 0) {
+ WARN(1, "Timeout waiting for MIP Valid Flag\n");
+ return -1;
+ }
+ es7000_spin(MIP_SPIN);
+ }
+
+ status = (mip_reg->off_0x00 & 0xffff0000000000ULL) >> 48;
+ mip_reg->off_0x38 &= ~MIP_VALID;
+
+ return status;
+}
+
+static void es7000_enable_apic_mode(void)
+{
+ struct mip_reg es7000_mip_reg;
+ int mip_status;
+
+ if (!es7000_plat)
+ return;
+
+ printk(KERN_INFO "ES7000: Enabling APIC mode.\n");
+ memset(&es7000_mip_reg, 0, sizeof(struct mip_reg));
+ es7000_mip_reg.off_0x00 = MIP_SW_APIC;
+ es7000_mip_reg.off_0x38 = MIP_VALID;
+
+ while ((mip_status = es7000_mip_write(&es7000_mip_reg)) != 0)
+ WARN(1, "Command failed, status = %x\n", mip_status);
+}
+
+static void es7000_vector_allocation_domain(int cpu, cpumask_t *retmask)
+{
+ /* Careful. Some cpus do not strictly honor the set of cpus
+ * specified in the interrupt destination when using lowest
+ * priority interrupt delivery mode.
+ *
+ * In particular there was a hyperthreading cpu observed to
+ * deliver interrupts to the wrong hyperthread when only one
+ * hyperthread was specified in the interrupt desitination.
+ */
+ *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } };
+}
+
+
+static void es7000_wait_for_init_deassert(atomic_t *deassert)
+{
+ while (!atomic_read(deassert))
+ cpu_relax();
+}
+
+static unsigned int es7000_get_apic_id(unsigned long x)
+{
+ return (x >> 24) & 0xFF;
+}
+
+static void es7000_send_IPI_mask(const struct cpumask *mask, int vector)
+{
+ default_send_IPI_mask_sequence_phys(mask, vector);
+}
+
+static void es7000_send_IPI_allbutself(int vector)
+{
+ default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
+}
+
+static void es7000_send_IPI_all(int vector)
+{
+ es7000_send_IPI_mask(cpu_online_mask, vector);
+}
+
+static int es7000_apic_id_registered(void)
+{
+ return 1;
+}
+
+static const cpumask_t *target_cpus_cluster(void)
+{
+ return &CPU_MASK_ALL;
+}
+
+static const cpumask_t *es7000_target_cpus(void)
+{
+ return &cpumask_of_cpu(smp_processor_id());
+}
+
+static unsigned long
+es7000_check_apicid_used(physid_mask_t bitmap, int apicid)
+{
+ return 0;
+}
+static unsigned long es7000_check_apicid_present(int bit)
+{
+ return physid_isset(bit, phys_cpu_present_map);
+}
+
+static unsigned long calculate_ldr(int cpu)
+{
+ unsigned long id = per_cpu(x86_bios_cpu_apicid, cpu);
+
+ return SET_APIC_LOGICAL_ID(id);
+}
+
+/*
+ * Set up the logical destination ID.
+ *
+ * Intel recommends to set DFR, LdR and TPR before enabling
+ * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
+ * document number 292116). So here it goes...
+ */
+static void es7000_init_apic_ldr_cluster(void)
+{
+ unsigned long val;
+ int cpu = smp_processor_id();
+
+ apic_write(APIC_DFR, APIC_DFR_CLUSTER);
+ val = calculate_ldr(cpu);
+ apic_write(APIC_LDR, val);
+}
+
+static void es7000_init_apic_ldr(void)
+{
+ unsigned long val;
+ int cpu = smp_processor_id();
+
+ apic_write(APIC_DFR, APIC_DFR_FLAT);
+ val = calculate_ldr(cpu);
+ apic_write(APIC_LDR, val);
+}
+
+static void es7000_setup_apic_routing(void)
+{
+ int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id());
+
+ printk(KERN_INFO
+ "Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n",
+ (apic_version[apic] == 0x14) ?
+ "Physical Cluster" : "Logical Cluster",
+ nr_ioapics, cpus_addr(*es7000_target_cpus())[0]);
+}
+
+static int es7000_apicid_to_node(int logical_apicid)
+{
+ return 0;
+}
+
+
+static int es7000_cpu_present_to_apicid(int mps_cpu)
+{
+ if (!mps_cpu)
+ return boot_cpu_physical_apicid;
+ else if (mps_cpu < nr_cpu_ids)
+ return per_cpu(x86_bios_cpu_apicid, mps_cpu);
+ else
+ return BAD_APICID;
+}
+
+static int cpu_id;
+
+static physid_mask_t es7000_apicid_to_cpu_present(int phys_apicid)
+{
+ physid_mask_t mask;
+
+ mask = physid_mask_of_physid(cpu_id);
+ ++cpu_id;
+
+ return mask;
+}
+
+/* Mapping from cpu number to logical apicid */
+static int es7000_cpu_to_logical_apicid(int cpu)
+{
+#ifdef CONFIG_SMP
+ if (cpu >= nr_cpu_ids)
+ return BAD_APICID;
+ return cpu_2_logical_apicid[cpu];
+#else
+ return logical_smp_processor_id();
+#endif
+}
+
+static physid_mask_t es7000_ioapic_phys_id_map(physid_mask_t phys_map)
+{
+ /* For clustered we don't have a good way to do this yet - hack */
+ return physids_promote(0xff);
+}
+
+static int es7000_check_phys_apicid_present(int cpu_physical_apicid)
+{
+ boot_cpu_physical_apicid = read_apic_id();
+ return 1;
+}
+
+static unsigned int es7000_cpu_mask_to_apicid(const cpumask_t *cpumask)
+{
+ unsigned int round = 0;
+ int cpu, uninitialized_var(apicid);
+
+ /*
+ * The cpus in the mask must all be on the apic cluster.
+ */
+ for_each_cpu(cpu, cpumask) {
+ int new_apicid = es7000_cpu_to_logical_apicid(cpu);
+
+ if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
+ WARN(1, "Not a valid mask!");
+
+ return BAD_APICID;
+ }
+ apicid = new_apicid;
+ round++;
+ }
+ return apicid;
+}
+
+static unsigned int
+es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask,
+ const struct cpumask *andmask)
+{
+ int apicid = es7000_cpu_to_logical_apicid(0);
+ cpumask_var_t cpumask;
+
+ if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
+ return apicid;
+
+ cpumask_and(cpumask, inmask, andmask);
+ cpumask_and(cpumask, cpumask, cpu_online_mask);
+ apicid = es7000_cpu_mask_to_apicid(cpumask);
+
+ free_cpumask_var(cpumask);
+
+ return apicid;
+}
+
+static int es7000_phys_pkg_id(int cpuid_apic, int index_msb)
+{
+ return cpuid_apic >> index_msb;
+}
+
+static int probe_es7000(void)
+{
+ /* probed later in mptable/ACPI hooks */
+ return 0;
+}
+
+static int es7000_mps_ret;
+static int es7000_mps_oem_check(struct mpc_table *mpc, char *oem,
+ char *productid)
+{
+ int ret = 0;
+
+ if (mpc->oemptr) {
+ struct mpc_oemtable *oem_table =
+ (struct mpc_oemtable *)mpc->oemptr;
+
+ if (!strncmp(oem, "UNISYS", 6))
+ ret = parse_unisys_oem((char *)oem_table);
+ }
+
+ es7000_mps_ret = ret;
+
+ return ret && !es7000_apic_is_cluster();
+}
+
+static int es7000_mps_oem_check_cluster(struct mpc_table *mpc, char *oem,
+ char *productid)
+{
+ int ret = es7000_mps_ret;
+
+ return ret && es7000_apic_is_cluster();
+}
+
+struct apic apic_es7000_cluster = {
+
+ .name = "es7000",
+ .probe = probe_es7000,
+ .acpi_madt_oem_check = es7000_acpi_madt_oem_check_cluster,
+ .apic_id_registered = es7000_apic_id_registered,
+
+ .irq_delivery_mode = dest_LowestPrio,
+ /* logical delivery broadcast to all procs: */
+ .irq_dest_mode = 1,
+
+ .target_cpus = target_cpus_cluster,
+ .disable_esr = 1,
+ .dest_logical = 0,
+ .check_apicid_used = es7000_check_apicid_used,
+ .check_apicid_present = es7000_check_apicid_present,
+
+ .vector_allocation_domain = es7000_vector_allocation_domain,
+ .init_apic_ldr = es7000_init_apic_ldr_cluster,
+
+ .ioapic_phys_id_map = es7000_ioapic_phys_id_map,
+ .setup_apic_routing = es7000_setup_apic_routing,
+ .multi_timer_check = NULL,
+ .apicid_to_node = es7000_apicid_to_node,
+ .cpu_to_logical_apicid = es7000_cpu_to_logical_apicid,
+ .cpu_present_to_apicid = es7000_cpu_present_to_apicid,
+ .apicid_to_cpu_present = es7000_apicid_to_cpu_present,
+ .setup_portio_remap = NULL,
+ .check_phys_apicid_present = es7000_check_phys_apicid_present,
+ .enable_apic_mode = es7000_enable_apic_mode,
+ .phys_pkg_id = es7000_phys_pkg_id,
+ .mps_oem_check = es7000_mps_oem_check_cluster,
+
+ .get_apic_id = es7000_get_apic_id,
+ .set_apic_id = NULL,
+ .apic_id_mask = 0xFF << 24,
+
+ .cpu_mask_to_apicid = es7000_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and,
+
+ .send_IPI_mask = es7000_send_IPI_mask,
+ .send_IPI_mask_allbutself = NULL,
+ .send_IPI_allbutself = es7000_send_IPI_allbutself,
+ .send_IPI_all = es7000_send_IPI_all,
+ .send_IPI_self = default_send_IPI_self,
+
+ .wakeup_secondary_cpu = wakeup_secondary_cpu_via_mip,
+
+ .trampoline_phys_low = 0x467,
+ .trampoline_phys_high = 0x469,
+
+ .wait_for_init_deassert = NULL,
+
+ /* Nothing to do for most platforms, since cleared by the INIT cycle: */
+ .smp_callin_clear_local_apic = NULL,
+ .inquire_remote_apic = default_inquire_remote_apic,
+
+ .read = native_apic_mem_read,
+ .write = native_apic_mem_write,
+ .icr_read = native_apic_icr_read,
+ .icr_write = native_apic_icr_write,
+ .wait_icr_idle = native_apic_wait_icr_idle,
+ .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
+};
+
+struct apic apic_es7000 = {
+
+ .name = "es7000",
+ .probe = probe_es7000,
+ .acpi_madt_oem_check = es7000_acpi_madt_oem_check,
+ .apic_id_registered = es7000_apic_id_registered,
+
+ .irq_delivery_mode = dest_Fixed,
+ /* phys delivery to target CPUs: */
+ .irq_dest_mode = 0,
+
+ .target_cpus = es7000_target_cpus,
+ .disable_esr = 1,
+ .dest_logical = 0,
+ .check_apicid_used = es7000_check_apicid_used,
+ .check_apicid_present = es7000_check_apicid_present,
+
+ .vector_allocation_domain = es7000_vector_allocation_domain,
+ .init_apic_ldr = es7000_init_apic_ldr,
+
+ .ioapic_phys_id_map = es7000_ioapic_phys_id_map,
+ .setup_apic_routing = es7000_setup_apic_routing,
+ .multi_timer_check = NULL,
+ .apicid_to_node = es7000_apicid_to_node,
+ .cpu_to_logical_apicid = es7000_cpu_to_logical_apicid,
+ .cpu_present_to_apicid = es7000_cpu_present_to_apicid,
+ .apicid_to_cpu_present = es7000_apicid_to_cpu_present,
+ .setup_portio_remap = NULL,
+ .check_phys_apicid_present = es7000_check_phys_apicid_present,
+ .enable_apic_mode = es7000_enable_apic_mode,
+ .phys_pkg_id = es7000_phys_pkg_id,
+ .mps_oem_check = es7000_mps_oem_check,
+
+ .get_apic_id = es7000_get_apic_id,
+ .set_apic_id = NULL,
+ .apic_id_mask = 0xFF << 24,
+
+ .cpu_mask_to_apicid = es7000_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and,
+
+ .send_IPI_mask = es7000_send_IPI_mask,
+ .send_IPI_mask_allbutself = NULL,
+ .send_IPI_allbutself = es7000_send_IPI_allbutself,
+ .send_IPI_all = es7000_send_IPI_all,
+ .send_IPI_self = default_send_IPI_self,
+
+ .trampoline_phys_low = 0x467,
+ .trampoline_phys_high = 0x469,
+
+ .wait_for_init_deassert = es7000_wait_for_init_deassert,
+
+ /* Nothing to do for most platforms, since cleared by the INIT cycle: */
+ .smp_callin_clear_local_apic = NULL,
+ .inquire_remote_apic = default_inquire_remote_apic,
+
+ .read = native_apic_mem_read,
+ .write = native_apic_mem_write,
+ .icr_read = native_apic_icr_read,
+ .icr_write = native_apic_icr_write,
+ .wait_icr_idle = native_apic_wait_icr_idle,
+ .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
+};
diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index bc7ac4da90d..42cdc78427a 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1,7 +1,7 @@
/*
* Intel IO-APIC support for multi-Pentium hosts.
*
- * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
+ * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo
*
* Many thanks to Stig Venaas for trying out countless experimental
* patches and reporting/debugging problems patiently!
@@ -46,6 +46,7 @@
#include <asm/idle.h>
#include <asm/io.h>
#include <asm/smp.h>
+#include <asm/cpu.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/acpi.h>
@@ -61,9 +62,7 @@
#include <asm/uv/uv_hub.h>
#include <asm/uv/uv_irq.h>
-#include <mach_ipi.h>
-#include <mach_apic.h>
-#include <mach_apicdef.h>
+#include <asm/apic.h>
#define __apicdebuginit(type) static type __init
@@ -82,11 +81,11 @@ static DEFINE_SPINLOCK(vector_lock);
int nr_ioapic_registers[MAX_IO_APICS];
/* I/O APIC entries */
-struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
+struct mpc_ioapic mp_ioapics[MAX_IO_APICS];
int nr_ioapics;
/* MP IRQ source entries */
-struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
/* # of MP IRQ source entries */
int mp_irq_entries;
@@ -99,10 +98,19 @@ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
int skip_ioapic_setup;
+void arch_disable_smp_support(void)
+{
+#ifdef CONFIG_PCI
+ noioapicquirk = 1;
+ noioapicreroute = -1;
+#endif
+ skip_ioapic_setup = 1;
+}
+
static int __init parse_noapic(char *str)
{
/* disable IO-APIC */
- disable_ioapic_setup();
+ arch_disable_smp_support();
return 0;
}
early_param("noapic", parse_noapic);
@@ -356,7 +364,7 @@ set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
if (!cfg->move_in_progress) {
/* it means that domain is not changed */
- if (!cpumask_intersects(&desc->affinity, mask))
+ if (!cpumask_intersects(desc->affinity, mask))
cfg->move_desc_pending = 1;
}
}
@@ -381,12 +389,20 @@ struct io_apic {
unsigned int index;
unsigned int unused[3];
unsigned int data;
+ unsigned int unused2[11];
+ unsigned int eoi;
};
static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
{
return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
- + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
+ + (mp_ioapics[idx].apicaddr & ~PAGE_MASK);
+}
+
+static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
+{
+ struct io_apic __iomem *io_apic = io_apic_base(apic);
+ writel(vector, &io_apic->eoi);
}
static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
@@ -478,7 +494,7 @@ __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
io_apic_write(apic, 0x10 + 2*pin, eu.w1);
}
-static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
{
unsigned long flags;
spin_lock_irqsave(&ioapic_lock, flags);
@@ -513,11 +529,11 @@ static void send_cleanup_vector(struct irq_cfg *cfg)
for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
cfg->move_cleanup_count++;
for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
- send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
+ apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
} else {
cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
- send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
+ apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
free_cpumask_var(cleanup_mask);
}
cfg->move_in_progress = 0;
@@ -538,16 +554,12 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
apic = entry->apic;
pin = entry->pin;
-#ifdef CONFIG_INTR_REMAP
/*
* With interrupt-remapping, destination information comes
* from interrupt-remapping table entry.
*/
if (!irq_remapped(irq))
io_apic_write(apic, 0x11 + pin*2, dest);
-#else
- io_apic_write(apic, 0x11 + pin*2, dest);
-#endif
reg = io_apic_read(apic, 0x10 + pin*2);
reg &= ~IO_APIC_REDIR_VECTOR_MASK;
reg |= vector;
@@ -562,8 +574,9 @@ static int
assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
/*
- * Either sets desc->affinity to a valid value, and returns cpu_mask_to_apicid
- * of that, or returns BAD_APICID and leaves desc->affinity untouched.
+ * Either sets desc->affinity to a valid value, and returns
+ * ->cpu_mask_to_apicid of that, or returns BAD_APICID and
+ * leaves desc->affinity untouched.
*/
static unsigned int
set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
@@ -579,9 +592,10 @@ set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
if (assign_irq_vector(irq, cfg, mask))
return BAD_APICID;
- cpumask_and(&desc->affinity, cfg->domain, mask);
+ cpumask_and(desc->affinity, cfg->domain, mask);
set_extra_move_desc(desc, mask);
- return cpu_mask_to_apicid_and(&desc->affinity, cpu_online_mask);
+
+ return apic->cpu_mask_to_apicid_and(desc->affinity, cpu_online_mask);
}
static void
@@ -796,23 +810,6 @@ static void clear_IO_APIC (void)
clear_IO_APIC_pin(apic, pin);
}
-#if !defined(CONFIG_SMP) && defined(CONFIG_X86_32)
-void send_IPI_self(int vector)
-{
- unsigned int cfg;
-
- /*
- * Wait for idle.
- */
- apic_wait_icr_idle();
- cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL;
- /*
- * Send the IPI. The write to APIC_ICR fires this off.
- */
- apic_write(APIC_ICR, cfg);
-}
-#endif /* !CONFIG_SMP && CONFIG_X86_32*/
-
#ifdef CONFIG_X86_32
/*
* support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
@@ -820,8 +817,9 @@ void send_IPI_self(int vector)
*/
#define MAX_PIRQS 8
-static int pirq_entries [MAX_PIRQS];
-static int pirqs_enabled;
+static int pirq_entries[MAX_PIRQS] = {
+ [0 ... MAX_PIRQS - 1] = -1
+};
static int __init ioapic_pirq_setup(char *str)
{
@@ -830,10 +828,6 @@ static int __init ioapic_pirq_setup(char *str)
get_options(str, ARRAY_SIZE(ints), ints);
- for (i = 0; i < MAX_PIRQS; i++)
- pirq_entries[i] = -1;
-
- pirqs_enabled = 1;
apic_printk(APIC_VERBOSE, KERN_INFO
"PIRQ redirection, working around broken MP-BIOS.\n");
max = MAX_PIRQS;
@@ -859,9 +853,9 @@ __setup("pirq=", ioapic_pirq_setup);
static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS];
/*
- * Saves and masks all the unmasked IO-APIC RTE's
+ * Saves all the IO-APIC RTE's
*/
-int save_mask_IO_APIC_setup(void)
+int save_IO_APIC_setup(void)
{
union IO_APIC_reg_01 reg_01;
unsigned long flags;
@@ -886,16 +880,9 @@ int save_mask_IO_APIC_setup(void)
}
for (apic = 0; apic < nr_ioapics; apic++)
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
- struct IO_APIC_route_entry entry;
-
- entry = early_ioapic_entries[apic][pin] =
+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
+ early_ioapic_entries[apic][pin] =
ioapic_read_entry(apic, pin);
- if (!entry.mask) {
- entry.mask = 1;
- ioapic_write_entry(apic, pin, entry);
- }
- }
return 0;
@@ -908,6 +895,25 @@ nomem:
return -ENOMEM;
}
+void mask_IO_APIC_setup(void)
+{
+ int apic, pin;
+
+ for (apic = 0; apic < nr_ioapics; apic++) {
+ if (!early_ioapic_entries[apic])
+ break;
+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+ struct IO_APIC_route_entry entry;
+
+ entry = early_ioapic_entries[apic][pin];
+ if (!entry.mask) {
+ entry.mask = 1;
+ ioapic_write_entry(apic, pin, entry);
+ }
+ }
+ }
+}
+
void restore_IO_APIC_setup(void)
{
int apic, pin;
@@ -944,10 +950,10 @@ static int find_irq_entry(int apic, int pin, int type)
int i;
for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mp_irqtype == type &&
- (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
- mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
- mp_irqs[i].mp_dstirq == pin)
+ if (mp_irqs[i].irqtype == type &&
+ (mp_irqs[i].dstapic == mp_ioapics[apic].apicid ||
+ mp_irqs[i].dstapic == MP_APIC_ALL) &&
+ mp_irqs[i].dstirq == pin)
return i;
return -1;
@@ -961,13 +967,13 @@ static int __init find_isa_irq_pin(int irq, int type)
int i;
for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mp_srcbus;
+ int lbus = mp_irqs[i].srcbus;
if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].mp_irqtype == type) &&
- (mp_irqs[i].mp_srcbusirq == irq))
+ (mp_irqs[i].irqtype == type) &&
+ (mp_irqs[i].srcbusirq == irq))
- return mp_irqs[i].mp_dstirq;
+ return mp_irqs[i].dstirq;
}
return -1;
}
@@ -977,17 +983,17 @@ static int __init find_isa_irq_apic(int irq, int type)
int i;
for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mp_srcbus;
+ int lbus = mp_irqs[i].srcbus;
if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].mp_irqtype == type) &&
- (mp_irqs[i].mp_srcbusirq == irq))
+ (mp_irqs[i].irqtype == type) &&
+ (mp_irqs[i].srcbusirq == irq))
break;
}
if (i < mp_irq_entries) {
int apic;
for(apic = 0; apic < nr_ioapics; apic++) {
- if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
+ if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic)
return apic;
}
}
@@ -1012,23 +1018,23 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
return -1;
}
for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mp_srcbus;
+ int lbus = mp_irqs[i].srcbus;
for (apic = 0; apic < nr_ioapics; apic++)
- if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
- mp_irqs[i].mp_dstapic == MP_APIC_ALL)
+ if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic ||
+ mp_irqs[i].dstapic == MP_APIC_ALL)
break;
if (!test_bit(lbus, mp_bus_not_pci) &&
- !mp_irqs[i].mp_irqtype &&
+ !mp_irqs[i].irqtype &&
(bus == lbus) &&
- (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
- int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq);
+ (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
+ int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq);
if (!(apic || IO_APIC_IRQ(irq)))
continue;
- if (pin == (mp_irqs[i].mp_srcbusirq & 3))
+ if (pin == (mp_irqs[i].srcbusirq & 3))
return irq;
/*
* Use the first all-but-pin matching entry as a
@@ -1071,7 +1077,7 @@ static int EISA_ELCR(unsigned int irq)
* EISA conforming in the MP table, that means its trigger type must
* be read in from the ELCR */
-#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq))
+#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].srcbusirq))
#define default_EISA_polarity(idx) default_ISA_polarity(idx)
/* PCI interrupts are always polarity one level triggered,
@@ -1088,13 +1094,13 @@ static int EISA_ELCR(unsigned int irq)
static int MPBIOS_polarity(int idx)
{
- int bus = mp_irqs[idx].mp_srcbus;
+ int bus = mp_irqs[idx].srcbus;
int polarity;
/*
* Determine IRQ line polarity (high active or low active):
*/
- switch (mp_irqs[idx].mp_irqflag & 3)
+ switch (mp_irqs[idx].irqflag & 3)
{
case 0: /* conforms, ie. bus-type dependent polarity */
if (test_bit(bus, mp_bus_not_pci))
@@ -1130,13 +1136,13 @@ static int MPBIOS_polarity(int idx)
static int MPBIOS_trigger(int idx)
{
- int bus = mp_irqs[idx].mp_srcbus;
+ int bus = mp_irqs[idx].srcbus;
int trigger;
/*
* Determine IRQ trigger mode (edge or level sensitive):
*/
- switch ((mp_irqs[idx].mp_irqflag>>2) & 3)
+ switch ((mp_irqs[idx].irqflag>>2) & 3)
{
case 0: /* conforms, ie. bus-type dependent */
if (test_bit(bus, mp_bus_not_pci))
@@ -1214,16 +1220,16 @@ int (*ioapic_renumber_irq)(int ioapic, int irq);
static int pin_2_irq(int idx, int apic, int pin)
{
int irq, i;
- int bus = mp_irqs[idx].mp_srcbus;
+ int bus = mp_irqs[idx].srcbus;
/*
* Debugging check, we are in big trouble if this message pops up!
*/
- if (mp_irqs[idx].mp_dstirq != pin)
+ if (mp_irqs[idx].dstirq != pin)
printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
if (test_bit(bus, mp_bus_not_pci)) {
- irq = mp_irqs[idx].mp_srcbusirq;
+ irq = mp_irqs[idx].srcbusirq;
} else {
/*
* PCI IRQs are mapped in order
@@ -1315,7 +1321,7 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
int new_cpu;
int vector, offset;
- vector_allocation_domain(cpu, tmp_mask);
+ apic->vector_allocation_domain(cpu, tmp_mask);
vector = current_vector;
offset = current_offset;
@@ -1421,9 +1427,8 @@ void __setup_vector_irq(int cpu)
}
static struct irq_chip ioapic_chip;
-#ifdef CONFIG_INTR_REMAP
static struct irq_chip ir_ioapic_chip;
-#endif
+static struct irq_chip msi_ir_chip;
#define IOAPIC_AUTO -1
#define IOAPIC_EDGE 0
@@ -1462,7 +1467,6 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t
else
desc->status &= ~IRQ_LEVEL;
-#ifdef CONFIG_INTR_REMAP
if (irq_remapped(irq)) {
desc->status |= IRQ_MOVE_PCNTXT;
if (trigger)
@@ -1474,7 +1478,7 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t
handle_edge_irq, "edge");
return;
}
-#endif
+
if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
trigger == IOAPIC_LEVEL)
set_irq_chip_and_handler_name(irq, &ioapic_chip,
@@ -1485,37 +1489,43 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t
handle_edge_irq, "edge");
}
-static int setup_ioapic_entry(int apic, int irq,
- struct IO_APIC_route_entry *entry,
- unsigned int destination, int trigger,
- int polarity, int vector)
+int setup_ioapic_entry(int apic_id, int irq,
+ struct IO_APIC_route_entry *entry,
+ unsigned int destination, int trigger,
+ int polarity, int vector, int pin)
{
/*
* add it to the IO-APIC irq-routing table:
*/
memset(entry,0,sizeof(*entry));
-#ifdef CONFIG_INTR_REMAP
if (intr_remapping_enabled) {
- struct intel_iommu *iommu = map_ioapic_to_ir(apic);
+ struct intel_iommu *iommu = map_ioapic_to_ir(apic_id);
struct irte irte;
struct IR_IO_APIC_route_entry *ir_entry =
(struct IR_IO_APIC_route_entry *) entry;
int index;
if (!iommu)
- panic("No mapping iommu for ioapic %d\n", apic);
+ panic("No mapping iommu for ioapic %d\n", apic_id);
index = alloc_irte(iommu, irq, 1);
if (index < 0)
- panic("Failed to allocate IRTE for ioapic %d\n", apic);
+ panic("Failed to allocate IRTE for ioapic %d\n", apic_id);
memset(&irte, 0, sizeof(irte));
irte.present = 1;
- irte.dst_mode = INT_DEST_MODE;
- irte.trigger_mode = trigger;
- irte.dlvry_mode = INT_DELIVERY_MODE;
+ irte.dst_mode = apic->irq_dest_mode;
+ /*
+ * Trigger mode in the IRTE will always be edge, and the
+ * actual level or edge trigger will be setup in the IO-APIC
+ * RTE. This will help simplify level triggered irq migration.
+ * For more details, see the comments above explainig IO-APIC
+ * irq migration in the presence of interrupt-remapping.
+ */
+ irte.trigger_mode = 0;
+ irte.dlvry_mode = apic->irq_delivery_mode;
irte.vector = vector;
irte.dest_id = IRTE_DEST(destination);
@@ -1525,18 +1535,21 @@ static int setup_ioapic_entry(int apic, int irq,
ir_entry->zero = 0;
ir_entry->format = 1;
ir_entry->index = (index & 0x7fff);
- } else
-#endif
- {
- entry->delivery_mode = INT_DELIVERY_MODE;
- entry->dest_mode = INT_DEST_MODE;
+ /*
+ * IO-APIC RTE will be configured with virtual vector.
+ * irq handler will do the explicit EOI to the io-apic.
+ */
+ ir_entry->vector = pin;
+ } else {
+ entry->delivery_mode = apic->irq_delivery_mode;
+ entry->dest_mode = apic->irq_dest_mode;
entry->dest = destination;
+ entry->vector = vector;
}
entry->mask = 0; /* enable IRQ */
entry->trigger = trigger;
entry->polarity = polarity;
- entry->vector = vector;
/* Mask level triggered irqs.
* Use IRQ_DELAYED_DISABLE for edge triggered irqs.
@@ -1546,7 +1559,7 @@ static int setup_ioapic_entry(int apic, int irq,
return 0;
}
-static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_desc *desc,
+static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq_desc *desc,
int trigger, int polarity)
{
struct irq_cfg *cfg;
@@ -1558,22 +1571,22 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_de
cfg = desc->chip_data;
- if (assign_irq_vector(irq, cfg, TARGET_CPUS))
+ if (assign_irq_vector(irq, cfg, apic->target_cpus()))
return;
- dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
+ dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
apic_printk(APIC_VERBOSE,KERN_DEBUG
"IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
"IRQ %d Mode:%i Active:%i)\n",
- apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector,
+ apic_id, mp_ioapics[apic_id].apicid, pin, cfg->vector,
irq, trigger, polarity);
- if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry,
- dest, trigger, polarity, cfg->vector)) {
+ if (setup_ioapic_entry(mp_ioapics[apic_id].apicid, irq, &entry,
+ dest, trigger, polarity, cfg->vector, pin)) {
printk("Failed to setup ioapic entry for ioapic %d, pin %d\n",
- mp_ioapics[apic].mp_apicid, pin);
+ mp_ioapics[apic_id].apicid, pin);
__clear_irq_vector(irq, cfg);
return;
}
@@ -1582,12 +1595,12 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_de
if (irq < NR_IRQS_LEGACY)
disable_8259A_irq(irq);
- ioapic_write_entry(apic, pin, entry);
+ ioapic_write_entry(apic_id, pin, entry);
}
static void __init setup_IO_APIC_irqs(void)
{
- int apic, pin, idx, irq;
+ int apic_id, pin, idx, irq;
int notcon = 0;
struct irq_desc *desc;
struct irq_cfg *cfg;
@@ -1595,21 +1608,19 @@ static void __init setup_IO_APIC_irqs(void)
apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
- for (apic = 0; apic < nr_ioapics; apic++) {
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+ for (apic_id = 0; apic_id < nr_ioapics; apic_id++) {
+ for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
- idx = find_irq_entry(apic, pin, mp_INT);
+ idx = find_irq_entry(apic_id, pin, mp_INT);
if (idx == -1) {
if (!notcon) {
notcon = 1;
apic_printk(APIC_VERBOSE,
KERN_DEBUG " %d-%d",
- mp_ioapics[apic].mp_apicid,
- pin);
+ mp_ioapics[apic_id].apicid, pin);
} else
apic_printk(APIC_VERBOSE, " %d-%d",
- mp_ioapics[apic].mp_apicid,
- pin);
+ mp_ioapics[apic_id].apicid, pin);
continue;
}
if (notcon) {
@@ -1618,20 +1629,25 @@ static void __init setup_IO_APIC_irqs(void)
notcon = 0;
}
- irq = pin_2_irq(idx, apic, pin);
-#ifdef CONFIG_X86_32
- if (multi_timer_check(apic, irq))
+ irq = pin_2_irq(idx, apic_id, pin);
+
+ /*
+ * Skip the timer IRQ if there's a quirk handler
+ * installed and if it returns 1:
+ */
+ if (apic->multi_timer_check &&
+ apic->multi_timer_check(apic_id, irq))
continue;
-#endif
+
desc = irq_to_desc_alloc_cpu(irq, cpu);
if (!desc) {
printk(KERN_INFO "can not get irq_desc for %d\n", irq);
continue;
}
cfg = desc->chip_data;
- add_pin_to_irq_cpu(cfg, cpu, apic, pin);
+ add_pin_to_irq_cpu(cfg, cpu, apic_id, pin);
- setup_IO_APIC_irq(apic, pin, irq, desc,
+ setup_IO_APIC_irq(apic_id, pin, irq, desc,
irq_trigger(idx), irq_polarity(idx));
}
}
@@ -1644,15 +1660,13 @@ static void __init setup_IO_APIC_irqs(void)
/*
* Set up the timer pin, possibly with the 8259A-master behind.
*/
-static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
+static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin,
int vector)
{
struct IO_APIC_route_entry entry;
-#ifdef CONFIG_INTR_REMAP
if (intr_remapping_enabled)
return;
-#endif
memset(&entry, 0, sizeof(entry));
@@ -1660,10 +1674,10 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
* We use logical delivery to get the timer IRQ
* to the first CPU.
*/
- entry.dest_mode = INT_DEST_MODE;
- entry.mask = 1; /* mask IRQ now */
- entry.dest = cpu_mask_to_apicid(TARGET_CPUS);
- entry.delivery_mode = INT_DELIVERY_MODE;
+ entry.dest_mode = apic->irq_dest_mode;
+ entry.mask = 0; /* don't mask IRQ for edge */
+ entry.dest = apic->cpu_mask_to_apicid(apic->target_cpus());
+ entry.delivery_mode = apic->irq_delivery_mode;
entry.polarity = 0;
entry.trigger = 0;
entry.vector = vector;
@@ -1677,7 +1691,7 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
/*
* Add it to the IO-APIC irq-routing table:
*/
- ioapic_write_entry(apic, pin, entry);
+ ioapic_write_entry(apic_id, pin, entry);
}
@@ -1699,7 +1713,7 @@ __apicdebuginit(void) print_IO_APIC(void)
printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
for (i = 0; i < nr_ioapics; i++)
printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
- mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
+ mp_ioapics[i].apicid, nr_ioapic_registers[i]);
/*
* We are a bit conservative about what we expect. We have to
@@ -1719,7 +1733,7 @@ __apicdebuginit(void) print_IO_APIC(void)
spin_unlock_irqrestore(&ioapic_lock, flags);
printk("\n");
- printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
+ printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid);
printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
@@ -1980,13 +1994,6 @@ void __init enable_IO_APIC(void)
int apic;
unsigned long flags;
-#ifdef CONFIG_X86_32
- int i;
- if (!pirqs_enabled)
- for (i = 0; i < MAX_PIRQS; i++)
- pirq_entries[i] = -1;
-#endif
-
/*
* The number of IO-APIC IRQ registers (== #pins):
*/
@@ -2054,8 +2061,13 @@ void disable_IO_APIC(void)
* If the i8259 is routed through an IOAPIC
* Put that IOAPIC in virtual wire mode
* so legacy interrupts can be delivered.
+ *
+ * With interrupt-remapping, for now we will use virtual wire A mode,
+ * as virtual wire B is little complex (need to configure both
+ * IOAPIC RTE aswell as interrupt-remapping table entry).
+ * As this gets called during crash dump, keep this simple for now.
*/
- if (ioapic_i8259.pin != -1) {
+ if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) {
struct IO_APIC_route_entry entry;
memset(&entry, 0, sizeof(entry));
@@ -2075,7 +2087,10 @@ void disable_IO_APIC(void)
ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
}
- disconnect_bsp_APIC(ioapic_i8259.pin != -1);
+ /*
+ * Use virtual wire A mode when interrupt remapping is enabled.
+ */
+ disconnect_bsp_APIC(!intr_remapping_enabled && ioapic_i8259.pin != -1);
}
#ifdef CONFIG_X86_32
@@ -2090,7 +2105,7 @@ static void __init setup_ioapic_ids_from_mpc(void)
{
union IO_APIC_reg_00 reg_00;
physid_mask_t phys_id_present_map;
- int apic;
+ int apic_id;
int i;
unsigned char old_id;
unsigned long flags;
@@ -2109,26 +2124,26 @@ static void __init setup_ioapic_ids_from_mpc(void)
* This is broken; anything with a real cpu count has to
* circumvent this idiocy regardless.
*/
- phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map);
+ phys_id_present_map = apic->ioapic_phys_id_map(phys_cpu_present_map);
/*
* Set the IOAPIC ID to the value stored in the MPC table.
*/
- for (apic = 0; apic < nr_ioapics; apic++) {
+ for (apic_id = 0; apic_id < nr_ioapics; apic_id++) {
/* Read the register 0 value */
spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(apic, 0);
+ reg_00.raw = io_apic_read(apic_id, 0);
spin_unlock_irqrestore(&ioapic_lock, flags);
- old_id = mp_ioapics[apic].mp_apicid;
+ old_id = mp_ioapics[apic_id].apicid;
- if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) {
+ if (mp_ioapics[apic_id].apicid >= get_physical_broadcast()) {
printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
- apic, mp_ioapics[apic].mp_apicid);
+ apic_id, mp_ioapics[apic_id].apicid);
printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
reg_00.bits.ID);
- mp_ioapics[apic].mp_apicid = reg_00.bits.ID;
+ mp_ioapics[apic_id].apicid = reg_00.bits.ID;
}
/*
@@ -2136,10 +2151,10 @@ static void __init setup_ioapic_ids_from_mpc(void)
* system must have a unique ID or we get lots of nice
* 'stuck on smp_invalidate_needed IPI wait' messages.
*/
- if (check_apicid_used(phys_id_present_map,
- mp_ioapics[apic].mp_apicid)) {
+ if (apic->check_apicid_used(phys_id_present_map,
+ mp_ioapics[apic_id].apicid)) {
printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
- apic, mp_ioapics[apic].mp_apicid);
+ apic_id, mp_ioapics[apic_id].apicid);
for (i = 0; i < get_physical_broadcast(); i++)
if (!physid_isset(i, phys_id_present_map))
break;
@@ -2148,13 +2163,13 @@ static void __init setup_ioapic_ids_from_mpc(void)
printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
i);
physid_set(i, phys_id_present_map);
- mp_ioapics[apic].mp_apicid = i;
+ mp_ioapics[apic_id].apicid = i;
} else {
physid_mask_t tmp;
- tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid);
+ tmp = apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid);
apic_printk(APIC_VERBOSE, "Setting %d in the "
"phys_id_present_map\n",
- mp_ioapics[apic].mp_apicid);
+ mp_ioapics[apic_id].apicid);
physids_or(phys_id_present_map, phys_id_present_map, tmp);
}
@@ -2163,11 +2178,11 @@ static void __init setup_ioapic_ids_from_mpc(void)
* We need to adjust the IRQ routing table
* if the ID changed.
*/
- if (old_id != mp_ioapics[apic].mp_apicid)
+ if (old_id != mp_ioapics[apic_id].apicid)
for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mp_dstapic == old_id)
- mp_irqs[i].mp_dstapic
- = mp_ioapics[apic].mp_apicid;
+ if (mp_irqs[i].dstapic == old_id)
+ mp_irqs[i].dstapic
+ = mp_ioapics[apic_id].apicid;
/*
* Read the right value from the MPC table and
@@ -2175,20 +2190,20 @@ static void __init setup_ioapic_ids_from_mpc(void)
*/
apic_printk(APIC_VERBOSE, KERN_INFO
"...changing IO-APIC physical APIC ID to %d ...",
- mp_ioapics[apic].mp_apicid);
+ mp_ioapics[apic_id].apicid);
- reg_00.bits.ID = mp_ioapics[apic].mp_apicid;
+ reg_00.bits.ID = mp_ioapics[apic_id].apicid;
spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(apic, 0, reg_00.raw);
+ io_apic_write(apic_id, 0, reg_00.raw);
spin_unlock_irqrestore(&ioapic_lock, flags);
/*
* Sanity check
*/
spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(apic, 0);
+ reg_00.raw = io_apic_read(apic_id, 0);
spin_unlock_irqrestore(&ioapic_lock, flags);
- if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid)
+ if (reg_00.bits.ID != mp_ioapics[apic_id].apicid)
printk("could not set ID!\n");
else
apic_printk(APIC_VERBOSE, " ok.\n");
@@ -2291,7 +2306,7 @@ static int ioapic_retrigger_irq(unsigned int irq)
unsigned long flags;
spin_lock_irqsave(&vector_lock, flags);
- send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector);
+ apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector);
spin_unlock_irqrestore(&vector_lock, flags);
return 1;
@@ -2299,7 +2314,7 @@ static int ioapic_retrigger_irq(unsigned int irq)
#else
static int ioapic_retrigger_irq(unsigned int irq)
{
- send_IPI_self(irq_cfg(irq)->vector);
+ apic->send_IPI_self(irq_cfg(irq)->vector);
return 1;
}
@@ -2317,37 +2332,24 @@ static int ioapic_retrigger_irq(unsigned int irq)
#ifdef CONFIG_SMP
#ifdef CONFIG_INTR_REMAP
-static void ir_irq_migration(struct work_struct *work);
-
-static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration);
/*
* Migrate the IO-APIC irq in the presence of intr-remapping.
*
- * For edge triggered, irq migration is a simple atomic update(of vector
- * and cpu destination) of IRTE and flush the hardware cache.
+ * For both level and edge triggered, irq migration is a simple atomic
+ * update(of vector and cpu destination) of IRTE and flush the hardware cache.
*
- * For level triggered, we need to modify the io-apic RTE aswell with the update
- * vector information, along with modifying IRTE with vector and destination.
- * So irq migration for level triggered is little bit more complex compared to
- * edge triggered migration. But the good news is, we use the same algorithm
- * for level triggered migration as we have today, only difference being,
- * we now initiate the irq migration from process context instead of the
- * interrupt context.
- *
- * In future, when we do a directed EOI (combined with cpu EOI broadcast
- * suppression) to the IO-APIC, level triggered irq migration will also be
- * as simple as edge triggered migration and we can do the irq migration
- * with a simple atomic update to IO-APIC RTE.
+ * For level triggered, we eliminate the io-apic RTE modification (with the
+ * updated vector information), by using a virtual vector (io-apic pin number).
+ * Real vector that is used for interrupting cpu will be coming from
+ * the interrupt-remapping table entry.
*/
static void
migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
{
struct irq_cfg *cfg;
struct irte irte;
- int modify_ioapic_rte;
unsigned int dest;
- unsigned long flags;
unsigned int irq;
if (!cpumask_intersects(mask, cpu_online_mask))
@@ -2363,14 +2365,7 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
set_extra_move_desc(desc, mask);
- dest = cpu_mask_to_apicid_and(cfg->domain, mask);
-
- modify_ioapic_rte = desc->status & IRQ_LEVEL;
- if (modify_ioapic_rte) {
- spin_lock_irqsave(&ioapic_lock, flags);
- __target_IO_APIC_irq(irq, dest, cfg);
- spin_unlock_irqrestore(&ioapic_lock, flags);
- }
+ dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
irte.vector = cfg->vector;
irte.dest_id = IRTE_DEST(dest);
@@ -2383,61 +2378,7 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
if (cfg->move_in_progress)
send_cleanup_vector(cfg);
- cpumask_copy(&desc->affinity, mask);
-}
-
-static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
-{
- int ret = -1;
- struct irq_cfg *cfg = desc->chip_data;
-
- mask_IO_APIC_irq_desc(desc);
-
- if (io_apic_level_ack_pending(cfg)) {
- /*
- * Interrupt in progress. Migrating irq now will change the
- * vector information in the IO-APIC RTE and that will confuse
- * the EOI broadcast performed by cpu.
- * So, delay the irq migration to the next instance.
- */
- schedule_delayed_work(&ir_migration_work, 1);
- goto unmask;
- }
-
- /* everthing is clear. we have right of way */
- migrate_ioapic_irq_desc(desc, &desc->pending_mask);
-
- ret = 0;
- desc->status &= ~IRQ_MOVE_PENDING;
- cpumask_clear(&desc->pending_mask);
-
-unmask:
- unmask_IO_APIC_irq_desc(desc);
-
- return ret;
-}
-
-static void ir_irq_migration(struct work_struct *work)
-{
- unsigned int irq;
- struct irq_desc *desc;
-
- for_each_irq_desc(irq, desc) {
- if (desc->status & IRQ_MOVE_PENDING) {
- unsigned long flags;
-
- spin_lock_irqsave(&desc->lock, flags);
- if (!desc->chip->set_affinity ||
- !(desc->status & IRQ_MOVE_PENDING)) {
- desc->status &= ~IRQ_MOVE_PENDING;
- spin_unlock_irqrestore(&desc->lock, flags);
- continue;
- }
-
- desc->chip->set_affinity(irq, &desc->pending_mask);
- spin_unlock_irqrestore(&desc->lock, flags);
- }
- }
+ cpumask_copy(desc->affinity, mask);
}
/*
@@ -2446,13 +2387,6 @@ static void ir_irq_migration(struct work_struct *work)
static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
const struct cpumask *mask)
{
- if (desc->status & IRQ_LEVEL) {
- desc->status |= IRQ_MOVE_PENDING;
- cpumask_copy(&desc->pending_mask, mask);
- migrate_irq_remapped_level_desc(desc);
- return;
- }
-
migrate_ioapic_irq_desc(desc, mask);
}
static void set_ir_ioapic_affinity_irq(unsigned int irq,
@@ -2462,6 +2396,11 @@ static void set_ir_ioapic_affinity_irq(unsigned int irq,
set_ir_ioapic_affinity_irq_desc(desc, mask);
}
+#else
+static inline void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
+ const struct cpumask *mask)
+{
+}
#endif
asmlinkage void smp_irq_move_cleanup_interrupt(void)
@@ -2475,6 +2414,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
me = smp_processor_id();
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
unsigned int irq;
+ unsigned int irr;
struct irq_desc *desc;
struct irq_cfg *cfg;
irq = __get_cpu_var(vector_irq)[vector];
@@ -2494,6 +2434,18 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
goto unlock;
+ irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
+ /*
+ * Check if the vector that needs to be cleanedup is
+ * registered at the cpu's IRR. If so, then this is not
+ * the best time to clean it up. Lets clean it up in the
+ * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR
+ * to myself.
+ */
+ if (irr & (1 << (vector % 32))) {
+ apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
+ goto unlock;
+ }
__get_cpu_var(vector_irq)[vector] = -1;
cfg->move_cleanup_count--;
unlock:
@@ -2516,7 +2468,7 @@ static void irq_complete_move(struct irq_desc **descp)
/* domain has not changed, but affinity did */
me = smp_processor_id();
- if (cpu_isset(me, desc->affinity)) {
+ if (cpumask_test_cpu(me, desc->affinity)) {
*descp = desc = move_irq_desc(desc, me);
/* get the new one */
cfg = desc->chip_data;
@@ -2543,9 +2495,44 @@ static inline void irq_complete_move(struct irq_desc **descp) {}
#endif
#ifdef CONFIG_INTR_REMAP
+static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
+{
+ int apic, pin;
+ struct irq_pin_list *entry;
+
+ entry = cfg->irq_2_pin;
+ for (;;) {
+
+ if (!entry)
+ break;
+
+ apic = entry->apic;
+ pin = entry->pin;
+ io_apic_eoi(apic, pin);
+ entry = entry->next;
+ }
+}
+
+static void
+eoi_ioapic_irq(struct irq_desc *desc)
+{
+ struct irq_cfg *cfg;
+ unsigned long flags;
+ unsigned int irq;
+
+ irq = desc->irq;
+ cfg = desc->chip_data;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ __eoi_ioapic_irq(irq, cfg);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
static void ack_x2apic_level(unsigned int irq)
{
+ struct irq_desc *desc = irq_to_desc(irq);
ack_x2APIC_irq();
+ eoi_ioapic_irq(desc);
}
static void ack_x2apic_edge(unsigned int irq)
@@ -2867,19 +2854,15 @@ static inline void __init check_timer(void)
int cpu = boot_cpu_id;
int apic1, pin1, apic2, pin2;
unsigned long flags;
- unsigned int ver;
int no_pin1 = 0;
local_irq_save(flags);
- ver = apic_read(APIC_LVR);
- ver = GET_APIC_VERSION(ver);
-
/*
* get/set the timer IRQ vector:
*/
disable_8259A_irq(0);
- assign_irq_vector(0, cfg, TARGET_CPUS);
+ assign_irq_vector(0, cfg, apic->target_cpus());
/*
* As IRQ0 is to be enabled in the 8259A, the virtual
@@ -2893,7 +2876,13 @@ static inline void __init check_timer(void)
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
init_8259A(1);
#ifdef CONFIG_X86_32
- timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
+ {
+ unsigned int ver;
+
+ ver = apic_read(APIC_LVR);
+ ver = GET_APIC_VERSION(ver);
+ timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
+ }
#endif
pin1 = find_isa_irq_pin(0, mp_INT);
@@ -2913,10 +2902,8 @@ static inline void __init check_timer(void)
* 8259A.
*/
if (pin1 == -1) {
-#ifdef CONFIG_INTR_REMAP
if (intr_remapping_enabled)
panic("BIOS bug: timer not connected to IO-APIC");
-#endif
pin1 = pin2;
apic1 = apic2;
no_pin1 = 1;
@@ -2932,8 +2919,17 @@ static inline void __init check_timer(void)
if (no_pin1) {
add_pin_to_irq_cpu(cfg, cpu, apic1, pin1);
setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
+ } else {
+ /* for edge trigger, setup_IO_APIC_irq already
+ * leave it unmasked.
+ * so only need to unmask if it is level-trigger
+ * do we really have level trigger timer?
+ */
+ int idx;
+ idx = find_irq_entry(apic1, pin1, mp_INT);
+ if (idx != -1 && irq_trigger(idx))
+ unmask_IO_APIC_irq_desc(desc);
}
- unmask_IO_APIC_irq_desc(desc);
if (timer_irq_works()) {
if (nmi_watchdog == NMI_IO_APIC) {
setup_nmi();
@@ -2943,10 +2939,9 @@ static inline void __init check_timer(void)
clear_IO_APIC_pin(0, pin1);
goto out;
}
-#ifdef CONFIG_INTR_REMAP
if (intr_remapping_enabled)
panic("timer doesn't work through Interrupt-remapped IO-APIC");
-#endif
+ local_irq_disable();
clear_IO_APIC_pin(apic1, pin1);
if (!no_pin1)
apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
@@ -2961,7 +2956,6 @@ static inline void __init check_timer(void)
*/
replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2);
setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
- unmask_IO_APIC_irq_desc(desc);
enable_8259A_irq(0);
if (timer_irq_works()) {
apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
@@ -2976,6 +2970,7 @@ static inline void __init check_timer(void)
/*
* Cleanup, just in case ...
*/
+ local_irq_disable();
disable_8259A_irq(0);
clear_IO_APIC_pin(apic2, pin2);
apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
@@ -3001,6 +2996,7 @@ static inline void __init check_timer(void)
apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
goto out;
}
+ local_irq_disable();
disable_8259A_irq(0);
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
@@ -3018,6 +3014,7 @@ static inline void __init check_timer(void)
apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
goto out;
}
+ local_irq_disable();
apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
"report. Then try booting with the 'noapic' option.\n");
@@ -3047,13 +3044,9 @@ out:
void __init setup_IO_APIC(void)
{
-#ifdef CONFIG_X86_32
- enable_IO_APIC();
-#else
/*
* calling enable_IO_APIC() is moved to setup_local_APIC for BP
*/
-#endif
io_apic_irqs = ~PIC_IRQS;
@@ -3118,8 +3111,8 @@ static int ioapic_resume(struct sys_device *dev)
spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(dev->id, 0);
- if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
- reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
+ if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) {
+ reg_00.bits.ID = mp_ioapics[dev->id].apicid;
io_apic_write(dev->id, 0, reg_00.raw);
}
spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -3169,6 +3162,7 @@ static int __init ioapic_init_sysfs(void)
device_initcall(ioapic_init_sysfs);
+static int nr_irqs_gsi = NR_IRQS_LEGACY;
/*
* Dynamic irq allocate and deallocation
*/
@@ -3183,11 +3177,11 @@ unsigned int create_irq_nr(unsigned int irq_want)
struct irq_desc *desc_new = NULL;
irq = 0;
- spin_lock_irqsave(&vector_lock, flags);
- for (new = irq_want; new < NR_IRQS; new++) {
- if (platform_legacy_irq(new))
- continue;
+ if (irq_want < nr_irqs_gsi)
+ irq_want = nr_irqs_gsi;
+ spin_lock_irqsave(&vector_lock, flags);
+ for (new = irq_want; new < nr_irqs; new++) {
desc_new = irq_to_desc_alloc_cpu(new, cpu);
if (!desc_new) {
printk(KERN_INFO "can not get irq_desc for %d\n", new);
@@ -3197,7 +3191,7 @@ unsigned int create_irq_nr(unsigned int irq_want)
if (cfg_new->vector != 0)
continue;
- if (__assign_irq_vector(new, cfg_new, TARGET_CPUS) == 0)
+ if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0)
irq = new;
break;
}
@@ -3212,7 +3206,6 @@ unsigned int create_irq_nr(unsigned int irq_want)
return irq;
}
-static int nr_irqs_gsi = NR_IRQS_LEGACY;
int create_irq(void)
{
unsigned int irq_want;
@@ -3241,9 +3234,7 @@ void destroy_irq(unsigned int irq)
if (desc)
desc->chip_data = cfg;
-#ifdef CONFIG_INTR_REMAP
free_irte(irq);
-#endif
spin_lock_irqsave(&vector_lock, flags);
__clear_irq_vector(irq, cfg);
spin_unlock_irqrestore(&vector_lock, flags);
@@ -3259,14 +3250,16 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
int err;
unsigned dest;
+ if (disable_apic)
+ return -ENXIO;
+
cfg = irq_cfg(irq);
- err = assign_irq_vector(irq, cfg, TARGET_CPUS);
+ err = assign_irq_vector(irq, cfg, apic->target_cpus());
if (err)
return err;
- dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
+ dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
-#ifdef CONFIG_INTR_REMAP
if (irq_remapped(irq)) {
struct irte irte;
int ir_index;
@@ -3278,9 +3271,9 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
memset (&irte, 0, sizeof(irte));
irte.present = 1;
- irte.dst_mode = INT_DEST_MODE;
+ irte.dst_mode = apic->irq_dest_mode;
irte.trigger_mode = 0; /* edge */
- irte.dlvry_mode = INT_DELIVERY_MODE;
+ irte.dlvry_mode = apic->irq_delivery_mode;
irte.vector = cfg->vector;
irte.dest_id = IRTE_DEST(dest);
@@ -3292,16 +3285,19 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
MSI_ADDR_IR_SHV |
MSI_ADDR_IR_INDEX1(ir_index) |
MSI_ADDR_IR_INDEX2(ir_index);
- } else
-#endif
- {
- msg->address_hi = MSI_ADDR_BASE_HI;
+ } else {
+ if (x2apic_enabled())
+ msg->address_hi = MSI_ADDR_BASE_HI |
+ MSI_ADDR_EXT_DEST_ID(dest);
+ else
+ msg->address_hi = MSI_ADDR_BASE_HI;
+
msg->address_lo =
MSI_ADDR_BASE_LO |
- ((INT_DEST_MODE == 0) ?
+ ((apic->irq_dest_mode == 0) ?
MSI_ADDR_DEST_MODE_PHYSICAL:
MSI_ADDR_DEST_MODE_LOGICAL) |
- ((INT_DELIVERY_MODE != dest_LowestPrio) ?
+ ((apic->irq_delivery_mode != dest_LowestPrio) ?
MSI_ADDR_REDIRECTION_CPU:
MSI_ADDR_REDIRECTION_LOWPRI) |
MSI_ADDR_DEST_ID(dest);
@@ -3309,7 +3305,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
msg->data =
MSI_DATA_TRIGGER_EDGE |
MSI_DATA_LEVEL_ASSERT |
- ((INT_DELIVERY_MODE != dest_LowestPrio) ?
+ ((apic->irq_delivery_mode != dest_LowestPrio) ?
MSI_DATA_DELIVERY_FIXED:
MSI_DATA_DELIVERY_LOWPRI) |
MSI_DATA_VECTOR(cfg->vector);
@@ -3406,6 +3402,7 @@ static struct irq_chip msi_ir_chip = {
#endif
.retrigger = ioapic_retrigger_irq,
};
+#endif
/*
* Map the PCI dev to the corresponding remapping hardware unit
@@ -3433,7 +3430,6 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
}
return index;
}
-#endif
static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
{
@@ -3447,7 +3443,6 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
set_irq_msi(irq, msidesc);
write_msi_msg(irq, &msg);
-#ifdef CONFIG_INTR_REMAP
if (irq_remapped(irq)) {
struct irq_desc *desc = irq_to_desc(irq);
/*
@@ -3456,7 +3451,6 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
desc->status |= IRQ_MOVE_PCNTXT;
set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge");
} else
-#endif
set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
@@ -3464,60 +3458,22 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
return 0;
}
-int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc)
-{
- unsigned int irq;
- int ret;
- unsigned int irq_want;
-
- irq_want = nr_irqs_gsi;
- irq = create_irq_nr(irq_want);
- if (irq == 0)
- return -1;
-
-#ifdef CONFIG_INTR_REMAP
- if (!intr_remapping_enabled)
- goto no_ir;
-
- ret = msi_alloc_irte(dev, irq, 1);
- if (ret < 0)
- goto error;
-no_ir:
-#endif
- ret = setup_msi_irq(dev, msidesc, irq);
- if (ret < 0) {
- destroy_irq(irq);
- return ret;
- }
- return 0;
-
-#ifdef CONFIG_INTR_REMAP
-error:
- destroy_irq(irq);
- return ret;
-#endif
-}
-
int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
{
unsigned int irq;
int ret, sub_handle;
struct msi_desc *msidesc;
unsigned int irq_want;
-
-#ifdef CONFIG_INTR_REMAP
struct intel_iommu *iommu = 0;
int index = 0;
-#endif
irq_want = nr_irqs_gsi;
sub_handle = 0;
list_for_each_entry(msidesc, &dev->msi_list, list) {
irq = create_irq_nr(irq_want);
- irq_want++;
if (irq == 0)
return -1;
-#ifdef CONFIG_INTR_REMAP
+ irq_want = irq + 1;
if (!intr_remapping_enabled)
goto no_ir;
@@ -3545,7 +3501,6 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
set_irte_irq(irq, iommu, index, sub_handle);
}
no_ir:
-#endif
ret = setup_msi_irq(dev, msidesc, irq);
if (ret < 0)
goto error;
@@ -3563,7 +3518,7 @@ void arch_teardown_msi_irq(unsigned int irq)
destroy_irq(irq);
}
-#ifdef CONFIG_DMAR
+#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)
#ifdef CONFIG_SMP
static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
{
@@ -3727,13 +3682,17 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
struct irq_cfg *cfg;
int err;
+ if (disable_apic)
+ return -ENXIO;
+
cfg = irq_cfg(irq);
- err = assign_irq_vector(irq, cfg, TARGET_CPUS);
+ err = assign_irq_vector(irq, cfg, apic->target_cpus());
if (!err) {
struct ht_irq_msg msg;
unsigned dest;
- dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
+ dest = apic->cpu_mask_to_apicid_and(cfg->domain,
+ apic->target_cpus());
msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
@@ -3741,11 +3700,11 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
HT_IRQ_LOW_BASE |
HT_IRQ_LOW_DEST_ID(dest) |
HT_IRQ_LOW_VECTOR(cfg->vector) |
- ((INT_DEST_MODE == 0) ?
+ ((apic->irq_dest_mode == 0) ?
HT_IRQ_LOW_DM_PHYSICAL :
HT_IRQ_LOW_DM_LOGICAL) |
HT_IRQ_LOW_RQEOI_EDGE |
- ((INT_DELIVERY_MODE != dest_LowestPrio) ?
+ ((apic->irq_delivery_mode != dest_LowestPrio) ?
HT_IRQ_LOW_MT_FIXED :
HT_IRQ_LOW_MT_ARBITRATED) |
HT_IRQ_LOW_IRQ_MASKED;
@@ -3761,7 +3720,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
}
#endif /* CONFIG_HT_IRQ */
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_X86_UV
/*
* Re-target the irq to the specified CPU and enable the specified MMR located
* on the specified blade to allow the sending of MSIs to the specified CPU.
@@ -3793,12 +3752,12 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
entry->vector = cfg->vector;
- entry->delivery_mode = INT_DELIVERY_MODE;
- entry->dest_mode = INT_DEST_MODE;
+ entry->delivery_mode = apic->irq_delivery_mode;
+ entry->dest_mode = apic->irq_dest_mode;
entry->polarity = 0;
entry->trigger = 0;
entry->mask = 0;
- entry->dest = cpu_mask_to_apicid(eligible_cpu);
+ entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
mmr_pnode = uv_blade_to_pnode(mmr_blade);
uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
@@ -3861,6 +3820,28 @@ void __init probe_nr_irqs_gsi(void)
printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
}
+#ifdef CONFIG_SPARSE_IRQ
+int __init arch_probe_nr_irqs(void)
+{
+ int nr;
+
+ if (nr_irqs > (NR_VECTORS * nr_cpu_ids))
+ nr_irqs = NR_VECTORS * nr_cpu_ids;
+
+ nr = nr_irqs_gsi + 8 * nr_cpu_ids;
+#if defined(CONFIG_PCI_MSI) || defined(CONFIG_HT_IRQ)
+ /*
+ * for MSI and HT dyn irq
+ */
+ nr += nr_irqs_gsi * 16;
+#endif
+ if (nr < nr_irqs)
+ nr_irqs = nr;
+
+ return 0;
+}
+#endif
+
/* --------------------------------------------------------------------------
ACPI-based IOAPIC Configuration
-------------------------------------------------------------------------- */
@@ -3886,7 +3867,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
*/
if (physids_empty(apic_id_map))
- apic_id_map = ioapic_phys_id_map(phys_cpu_present_map);
+ apic_id_map = apic->ioapic_phys_id_map(phys_cpu_present_map);
spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(ioapic, 0);
@@ -3902,10 +3883,10 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
* Every APIC in a system must have a unique ID or we get lots of nice
* 'stuck on smp_invalidate_needed IPI wait' messages.
*/
- if (check_apicid_used(apic_id_map, apic_id)) {
+ if (apic->check_apicid_used(apic_id_map, apic_id)) {
for (i = 0; i < get_physical_broadcast(); i++) {
- if (!check_apicid_used(apic_id_map, i))
+ if (!apic->check_apicid_used(apic_id_map, i))
break;
}
@@ -3918,7 +3899,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
apic_id = i;
}
- tmp = apicid_to_cpu_present(apic_id);
+ tmp = apic->apicid_to_cpu_present(apic_id);
physids_or(apic_id_map, apic_id_map, tmp);
if (reg_00.bits.ID != apic_id) {
@@ -3995,8 +3976,8 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
return -1;
for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mp_irqtype == mp_INT &&
- mp_irqs[i].mp_srcbusirq == bus_irq)
+ if (mp_irqs[i].irqtype == mp_INT &&
+ mp_irqs[i].srcbusirq == bus_irq)
break;
if (i >= mp_irq_entries)
return -1;
@@ -4011,7 +3992,7 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
/*
* This function currently is only a helper for the i386 smp boot process where
* we need to reprogram the ioredtbls to cater for the cpus which have come online
- * so mask in all cases should simply be TARGET_CPUS
+ * so mask in all cases should simply be apic->target_cpus()
*/
#ifdef CONFIG_SMP
void __init setup_ioapic_dest(void)
@@ -4050,15 +4031,13 @@ void __init setup_ioapic_dest(void)
*/
if (desc->status &
(IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
- mask = &desc->affinity;
+ mask = desc->affinity;
else
- mask = TARGET_CPUS;
+ mask = apic->target_cpus();
-#ifdef CONFIG_INTR_REMAP
if (intr_remapping_enabled)
set_ir_ioapic_affinity_irq_desc(desc, mask);
else
-#endif
set_ioapic_affinity_irq_desc(desc, mask);
}
@@ -4111,7 +4090,7 @@ void __init ioapic_init_mappings(void)
ioapic_res = ioapic_setup_resources();
for (i = 0; i < nr_ioapics; i++) {
if (smp_found_config) {
- ioapic_phys = mp_ioapics[i].mp_apicaddr;
+ ioapic_phys = mp_ioapics[i].apicaddr;
#ifdef CONFIG_X86_32
if (!ioapic_phys) {
printk(KERN_ERR
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
new file mode 100644
index 00000000000..dbf5445727a
--- /dev/null
+++ b/arch/x86/kernel/apic/ipi.c
@@ -0,0 +1,164 @@
+#include <linux/cpumask.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+
+#include <linux/mm.h>
+#include <linux/delay.h>
+#include <linux/spinlock.h>
+#include <linux/kernel_stat.h>
+#include <linux/mc146818rtc.h>
+#include <linux/cache.h>
+#include <linux/cpu.h>
+#include <linux/module.h>
+
+#include <asm/smp.h>
+#include <asm/mtrr.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/apic.h>
+#include <asm/proto.h>
+#include <asm/ipi.h>
+
+void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector)
+{
+ unsigned long query_cpu;
+ unsigned long flags;
+
+ /*
+ * Hack. The clustered APIC addressing mode doesn't allow us to send
+ * to an arbitrary mask, so I do a unicast to each CPU instead.
+ * - mbligh
+ */
+ local_irq_save(flags);
+ for_each_cpu(query_cpu, mask) {
+ __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid,
+ query_cpu), vector, APIC_DEST_PHYSICAL);
+ }
+ local_irq_restore(flags);
+}
+
+void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask,
+ int vector)
+{
+ unsigned int this_cpu = smp_processor_id();
+ unsigned int query_cpu;
+ unsigned long flags;
+
+ /* See Hack comment above */
+
+ local_irq_save(flags);
+ for_each_cpu(query_cpu, mask) {
+ if (query_cpu == this_cpu)
+ continue;
+ __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid,
+ query_cpu), vector, APIC_DEST_PHYSICAL);
+ }
+ local_irq_restore(flags);
+}
+
+void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
+ int vector)
+{
+ unsigned long flags;
+ unsigned int query_cpu;
+
+ /*
+ * Hack. The clustered APIC addressing mode doesn't allow us to send
+ * to an arbitrary mask, so I do a unicasts to each CPU instead. This
+ * should be modified to do 1 message per cluster ID - mbligh
+ */
+
+ local_irq_save(flags);
+ for_each_cpu(query_cpu, mask)
+ __default_send_IPI_dest_field(
+ apic->cpu_to_logical_apicid(query_cpu), vector,
+ apic->dest_logical);
+ local_irq_restore(flags);
+}
+
+void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
+ int vector)
+{
+ unsigned long flags;
+ unsigned int query_cpu;
+ unsigned int this_cpu = smp_processor_id();
+
+ /* See Hack comment above */
+
+ local_irq_save(flags);
+ for_each_cpu(query_cpu, mask) {
+ if (query_cpu == this_cpu)
+ continue;
+ __default_send_IPI_dest_field(
+ apic->cpu_to_logical_apicid(query_cpu), vector,
+ apic->dest_logical);
+ }
+ local_irq_restore(flags);
+}
+
+#ifdef CONFIG_X86_32
+
+/*
+ * This is only used on smaller machines.
+ */
+void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector)
+{
+ unsigned long mask = cpumask_bits(cpumask)[0];
+ unsigned long flags;
+
+ local_irq_save(flags);
+ WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]);
+ __default_send_IPI_dest_field(mask, vector, apic->dest_logical);
+ local_irq_restore(flags);
+}
+
+void default_send_IPI_allbutself(int vector)
+{
+ /*
+ * if there are no other CPUs in the system then we get an APIC send
+ * error if we try to broadcast, thus avoid sending IPIs in this case.
+ */
+ if (!(num_online_cpus() > 1))
+ return;
+
+ __default_local_send_IPI_allbutself(vector);
+}
+
+void default_send_IPI_all(int vector)
+{
+ __default_local_send_IPI_all(vector);
+}
+
+void default_send_IPI_self(int vector)
+{
+ __default_send_IPI_shortcut(APIC_DEST_SELF, vector, apic->dest_logical);
+}
+
+/* must come after the send_IPI functions above for inlining */
+static int convert_apicid_to_cpu(int apic_id)
+{
+ int i;
+
+ for_each_possible_cpu(i) {
+ if (per_cpu(x86_cpu_to_apicid, i) == apic_id)
+ return i;
+ }
+ return -1;
+}
+
+int safe_smp_processor_id(void)
+{
+ int apicid, cpuid;
+
+ if (!boot_cpu_has(X86_FEATURE_APIC))
+ return 0;
+
+ apicid = hard_smp_processor_id();
+ if (apicid == BAD_APICID)
+ return 0;
+
+ cpuid = convert_apicid_to_cpu(apicid);
+
+ return cpuid >= 0 ? cpuid : 0;
+}
+#endif
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/apic/nmi.c
index 7228979f1e7..bdfad80c3cf 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -34,7 +34,7 @@
#include <asm/mce.h>
-#include <mach_traps.h>
+#include <asm/mach_traps.h>
int unknown_nmi_panic;
int nmi_watchdog_enabled;
@@ -61,11 +61,7 @@ static int endflag __initdata;
static inline unsigned int get_nmi_count(int cpu)
{
-#ifdef CONFIG_X86_64
- return cpu_pda(cpu)->__nmi_count;
-#else
- return nmi_count(cpu);
-#endif
+ return per_cpu(irq_stat, cpu).__nmi_count;
}
static inline int mce_in_progress(void)
@@ -82,12 +78,8 @@ static inline int mce_in_progress(void)
*/
static inline unsigned int get_timer_irqs(int cpu)
{
-#ifdef CONFIG_X86_64
- return read_pda(apic_timer_irqs) + read_pda(irq0_irqs);
-#else
return per_cpu(irq_stat, cpu).apic_timer_irqs +
per_cpu(irq_stat, cpu).irq0_irqs;
-#endif
}
#ifdef CONFIG_SMP
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
new file mode 100644
index 00000000000..ba2fc646553
--- /dev/null
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -0,0 +1,557 @@
+/*
+ * Written by: Patricia Gaughen, IBM Corporation
+ *
+ * Copyright (C) 2002, IBM Corp.
+ * Copyright (C) 2009, Red Hat, Inc., Ingo Molnar
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to <gone@us.ibm.com>
+ */
+#include <linux/nodemask.h>
+#include <linux/topology.h>
+#include <linux/bootmem.h>
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/kernel.h>
+#include <linux/mmzone.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/numa.h>
+#include <linux/smp.h>
+#include <linux/io.h>
+#include <linux/mm.h>
+
+#include <asm/processor.h>
+#include <asm/fixmap.h>
+#include <asm/mpspec.h>
+#include <asm/numaq.h>
+#include <asm/setup.h>
+#include <asm/apic.h>
+#include <asm/e820.h>
+#include <asm/ipi.h>
+
+#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
+
+int found_numaq;
+
+/*
+ * Have to match translation table entries to main table entries by counter
+ * hence the mpc_record variable .... can't see a less disgusting way of
+ * doing this ....
+ */
+struct mpc_trans {
+ unsigned char mpc_type;
+ unsigned char trans_len;
+ unsigned char trans_type;
+ unsigned char trans_quad;
+ unsigned char trans_global;
+ unsigned char trans_local;
+ unsigned short trans_reserved;
+};
+
+/* x86_quirks member */
+static int mpc_record;
+
+static struct mpc_trans *translation_table[MAX_MPC_ENTRY];
+
+int mp_bus_id_to_node[MAX_MP_BUSSES];
+int mp_bus_id_to_local[MAX_MP_BUSSES];
+int quad_local_to_mp_bus_id[NR_CPUS/4][4];
+
+
+static inline void numaq_register_node(int node, struct sys_cfg_data *scd)
+{
+ struct eachquadmem *eq = scd->eq + node;
+
+ node_set_online(node);
+
+ /* Convert to pages */
+ node_start_pfn[node] =
+ MB_TO_PAGES(eq->hi_shrd_mem_start - eq->priv_mem_size);
+
+ node_end_pfn[node] =
+ MB_TO_PAGES(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
+
+ e820_register_active_regions(node, node_start_pfn[node],
+ node_end_pfn[node]);
+
+ memory_present(node, node_start_pfn[node], node_end_pfn[node]);
+
+ node_remap_size[node] = node_memmap_size_bytes(node,
+ node_start_pfn[node],
+ node_end_pfn[node]);
+}
+
+/*
+ * Function: smp_dump_qct()
+ *
+ * Description: gets memory layout from the quad config table. This
+ * function also updates node_online_map with the nodes (quads) present.
+ */
+static void __init smp_dump_qct(void)
+{
+ struct sys_cfg_data *scd;
+ int node;
+
+ scd = (void *)__va(SYS_CFG_DATA_PRIV_ADDR);
+
+ nodes_clear(node_online_map);
+ for_each_node(node) {
+ if (scd->quads_present31_0 & (1 << node))
+ numaq_register_node(node, scd);
+ }
+}
+
+void __cpuinit numaq_tsc_disable(void)
+{
+ if (!found_numaq)
+ return;
+
+ if (num_online_nodes() > 1) {
+ printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
+ setup_clear_cpu_cap(X86_FEATURE_TSC);
+ }
+}
+
+static int __init numaq_pre_time_init(void)
+{
+ numaq_tsc_disable();
+ return 0;
+}
+
+static inline int generate_logical_apicid(int quad, int phys_apicid)
+{
+ return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
+}
+
+/* x86_quirks member */
+static int mpc_apic_id(struct mpc_cpu *m)
+{
+ int quad = translation_table[mpc_record]->trans_quad;
+ int logical_apicid = generate_logical_apicid(quad, m->apicid);
+
+ printk(KERN_DEBUG
+ "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
+ m->apicid, (m->cpufeature & CPU_FAMILY_MASK) >> 8,
+ (m->cpufeature & CPU_MODEL_MASK) >> 4,
+ m->apicver, quad, logical_apicid);
+
+ return logical_apicid;
+}
+
+/* x86_quirks member */
+static void mpc_oem_bus_info(struct mpc_bus *m, char *name)
+{
+ int quad = translation_table[mpc_record]->trans_quad;
+ int local = translation_table[mpc_record]->trans_local;
+
+ mp_bus_id_to_node[m->busid] = quad;
+ mp_bus_id_to_local[m->busid] = local;
+
+ printk(KERN_INFO "Bus #%d is %s (node %d)\n", m->busid, name, quad);
+}
+
+/* x86_quirks member */
+static void mpc_oem_pci_bus(struct mpc_bus *m)
+{
+ int quad = translation_table[mpc_record]->trans_quad;
+ int local = translation_table[mpc_record]->trans_local;
+
+ quad_local_to_mp_bus_id[quad][local] = m->busid;
+}
+
+static void __init MP_translation_info(struct mpc_trans *m)
+{
+ printk(KERN_INFO
+ "Translation: record %d, type %d, quad %d, global %d, local %d\n",
+ mpc_record, m->trans_type, m->trans_quad, m->trans_global,
+ m->trans_local);
+
+ if (mpc_record >= MAX_MPC_ENTRY)
+ printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
+ else
+ translation_table[mpc_record] = m; /* stash this for later */
+
+ if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
+ node_set_online(m->trans_quad);
+}
+
+static int __init mpf_checksum(unsigned char *mp, int len)
+{
+ int sum = 0;
+
+ while (len--)
+ sum += *mp++;
+
+ return sum & 0xFF;
+}
+
+/*
+ * Read/parse the MPC oem tables
+ */
+static void __init
+ smp_read_mpc_oem(struct mpc_oemtable *oemtable, unsigned short oemsize)
+{
+ int count = sizeof(*oemtable); /* the header size */
+ unsigned char *oemptr = ((unsigned char *)oemtable) + count;
+
+ mpc_record = 0;
+ printk(KERN_INFO
+ "Found an OEM MPC table at %8p - parsing it ... \n", oemtable);
+
+ if (memcmp(oemtable->signature, MPC_OEM_SIGNATURE, 4)) {
+ printk(KERN_WARNING
+ "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
+ oemtable->signature[0], oemtable->signature[1],
+ oemtable->signature[2], oemtable->signature[3]);
+ return;
+ }
+
+ if (mpf_checksum((unsigned char *)oemtable, oemtable->length)) {
+ printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
+ return;
+ }
+
+ while (count < oemtable->length) {
+ switch (*oemptr) {
+ case MP_TRANSLATION:
+ {
+ struct mpc_trans *m = (void *)oemptr;
+
+ MP_translation_info(m);
+ oemptr += sizeof(*m);
+ count += sizeof(*m);
+ ++mpc_record;
+ break;
+ }
+ default:
+ printk(KERN_WARNING
+ "Unrecognised OEM table entry type! - %d\n",
+ (int)*oemptr);
+ return;
+ }
+ }
+}
+
+static int __init numaq_setup_ioapic_ids(void)
+{
+ /* so can skip it */
+ return 1;
+}
+
+static struct x86_quirks numaq_x86_quirks __initdata = {
+ .arch_pre_time_init = numaq_pre_time_init,
+ .arch_time_init = NULL,
+ .arch_pre_intr_init = NULL,
+ .arch_memory_setup = NULL,
+ .arch_intr_init = NULL,
+ .arch_trap_init = NULL,
+ .mach_get_smp_config = NULL,
+ .mach_find_smp_config = NULL,
+ .mpc_record = &mpc_record,
+ .mpc_apic_id = mpc_apic_id,
+ .mpc_oem_bus_info = mpc_oem_bus_info,
+ .mpc_oem_pci_bus = mpc_oem_pci_bus,
+ .smp_read_mpc_oem = smp_read_mpc_oem,
+ .setup_ioapic_ids = numaq_setup_ioapic_ids,
+};
+
+static __init void early_check_numaq(void)
+{
+ /*
+ * Find possible boot-time SMP configuration:
+ */
+ early_find_smp_config();
+
+ /*
+ * get boot-time SMP configuration:
+ */
+ if (smp_found_config)
+ early_get_smp_config();
+
+ if (found_numaq)
+ x86_quirks = &numaq_x86_quirks;
+}
+
+int __init get_memcfg_numaq(void)
+{
+ early_check_numaq();
+ if (!found_numaq)
+ return 0;
+ smp_dump_qct();
+
+ return 1;
+}
+
+#define NUMAQ_APIC_DFR_VALUE (APIC_DFR_CLUSTER)
+
+static inline unsigned int numaq_get_apic_id(unsigned long x)
+{
+ return (x >> 24) & 0x0F;
+}
+
+static inline void numaq_send_IPI_mask(const struct cpumask *mask, int vector)
+{
+ default_send_IPI_mask_sequence_logical(mask, vector);
+}
+
+static inline void numaq_send_IPI_allbutself(int vector)
+{
+ default_send_IPI_mask_allbutself_logical(cpu_online_mask, vector);
+}
+
+static inline void numaq_send_IPI_all(int vector)
+{
+ numaq_send_IPI_mask(cpu_online_mask, vector);
+}
+
+#define NUMAQ_TRAMPOLINE_PHYS_LOW (0x8)
+#define NUMAQ_TRAMPOLINE_PHYS_HIGH (0xa)
+
+/*
+ * Because we use NMIs rather than the INIT-STARTUP sequence to
+ * bootstrap the CPUs, the APIC may be in a weird state. Kick it:
+ */
+static inline void numaq_smp_callin_clear_local_apic(void)
+{
+ clear_local_APIC();
+}
+
+static inline const cpumask_t *numaq_target_cpus(void)
+{
+ return &CPU_MASK_ALL;
+}
+
+static inline unsigned long
+numaq_check_apicid_used(physid_mask_t bitmap, int apicid)
+{
+ return physid_isset(apicid, bitmap);
+}
+
+static inline unsigned long numaq_check_apicid_present(int bit)
+{
+ return physid_isset(bit, phys_cpu_present_map);
+}
+
+static inline int numaq_apic_id_registered(void)
+{
+ return 1;
+}
+
+static inline void numaq_init_apic_ldr(void)
+{
+ /* Already done in NUMA-Q firmware */
+}
+
+static inline void numaq_setup_apic_routing(void)
+{
+ printk(KERN_INFO
+ "Enabling APIC mode: NUMA-Q. Using %d I/O APICs\n",
+ nr_ioapics);
+}
+
+/*
+ * Skip adding the timer int on secondary nodes, which causes
+ * a small but painful rift in the time-space continuum.
+ */
+static inline int numaq_multi_timer_check(int apic, int irq)
+{
+ return apic != 0 && irq == 0;
+}
+
+static inline physid_mask_t numaq_ioapic_phys_id_map(physid_mask_t phys_map)
+{
+ /* We don't have a good way to do this yet - hack */
+ return physids_promote(0xFUL);
+}
+
+static inline int numaq_cpu_to_logical_apicid(int cpu)
+{
+ if (cpu >= nr_cpu_ids)
+ return BAD_APICID;
+ return cpu_2_logical_apicid[cpu];
+}
+
+/*
+ * Supporting over 60 cpus on NUMA-Q requires a locality-dependent
+ * cpu to APIC ID relation to properly interact with the intelligent
+ * mode of the cluster controller.
+ */
+static inline int numaq_cpu_present_to_apicid(int mps_cpu)
+{
+ if (mps_cpu < 60)
+ return ((mps_cpu >> 2) << 4) | (1 << (mps_cpu & 0x3));
+ else
+ return BAD_APICID;
+}
+
+static inline int numaq_apicid_to_node(int logical_apicid)
+{
+ return logical_apicid >> 4;
+}
+
+static inline physid_mask_t numaq_apicid_to_cpu_present(int logical_apicid)
+{
+ int node = numaq_apicid_to_node(logical_apicid);
+ int cpu = __ffs(logical_apicid & 0xf);
+
+ return physid_mask_of_physid(cpu + 4*node);
+}
+
+/* Where the IO area was mapped on multiquad, always 0 otherwise */
+void *xquad_portio;
+
+static inline int numaq_check_phys_apicid_present(int boot_cpu_physical_apicid)
+{
+ return 1;
+}
+
+/*
+ * We use physical apicids here, not logical, so just return the default
+ * physical broadcast to stop people from breaking us
+ */
+static inline unsigned int numaq_cpu_mask_to_apicid(const cpumask_t *cpumask)
+{
+ return 0x0F;
+}
+
+static inline unsigned int
+numaq_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+ const struct cpumask *andmask)
+{
+ return 0x0F;
+}
+
+/* No NUMA-Q box has a HT CPU, but it can't hurt to use the default code. */
+static inline int numaq_phys_pkg_id(int cpuid_apic, int index_msb)
+{
+ return cpuid_apic >> index_msb;
+}
+
+static int
+numaq_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
+{
+ if (strncmp(oem, "IBM NUMA", 8))
+ printk(KERN_ERR "Warning! Not a NUMA-Q system!\n");
+ else
+ found_numaq = 1;
+
+ return found_numaq;
+}
+
+static int probe_numaq(void)
+{
+ /* already know from get_memcfg_numaq() */
+ return found_numaq;
+}
+
+static void numaq_vector_allocation_domain(int cpu, cpumask_t *retmask)
+{
+ /* Careful. Some cpus do not strictly honor the set of cpus
+ * specified in the interrupt destination when using lowest
+ * priority interrupt delivery mode.
+ *
+ * In particular there was a hyperthreading cpu observed to
+ * deliver interrupts to the wrong hyperthread when only one
+ * hyperthread was specified in the interrupt desitination.
+ */
+ *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } };
+}
+
+static void numaq_setup_portio_remap(void)
+{
+ int num_quads = num_online_nodes();
+
+ if (num_quads <= 1)
+ return;
+
+ printk(KERN_INFO
+ "Remapping cross-quad port I/O for %d quads\n", num_quads);
+
+ xquad_portio = ioremap(XQUAD_PORTIO_BASE, num_quads*XQUAD_PORTIO_QUAD);
+
+ printk(KERN_INFO
+ "xquad_portio vaddr 0x%08lx, len %08lx\n",
+ (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD);
+}
+
+struct apic apic_numaq = {
+
+ .name = "NUMAQ",
+ .probe = probe_numaq,
+ .acpi_madt_oem_check = NULL,
+ .apic_id_registered = numaq_apic_id_registered,
+
+ .irq_delivery_mode = dest_LowestPrio,
+ /* physical delivery on LOCAL quad: */
+ .irq_dest_mode = 0,
+
+ .target_cpus = numaq_target_cpus,
+ .disable_esr = 1,
+ .dest_logical = APIC_DEST_LOGICAL,
+ .check_apicid_used = numaq_check_apicid_used,
+ .check_apicid_present = numaq_check_apicid_present,
+
+ .vector_allocation_domain = numaq_vector_allocation_domain,
+ .init_apic_ldr = numaq_init_apic_ldr,
+
+ .ioapic_phys_id_map = numaq_ioapic_phys_id_map,
+ .setup_apic_routing = numaq_setup_apic_routing,
+ .multi_timer_check = numaq_multi_timer_check,
+ .apicid_to_node = numaq_apicid_to_node,
+ .cpu_to_logical_apicid = numaq_cpu_to_logical_apicid,
+ .cpu_present_to_apicid = numaq_cpu_present_to_apicid,
+ .apicid_to_cpu_present = numaq_apicid_to_cpu_present,
+ .setup_portio_remap = numaq_setup_portio_remap,
+ .check_phys_apicid_present = numaq_check_phys_apicid_present,
+ .enable_apic_mode = NULL,
+ .phys_pkg_id = numaq_phys_pkg_id,
+ .mps_oem_check = numaq_mps_oem_check,
+
+ .get_apic_id = numaq_get_apic_id,
+ .set_apic_id = NULL,
+ .apic_id_mask = 0x0F << 24,
+
+ .cpu_mask_to_apicid = numaq_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = numaq_cpu_mask_to_apicid_and,
+
+ .send_IPI_mask = numaq_send_IPI_mask,
+ .send_IPI_mask_allbutself = NULL,
+ .send_IPI_allbutself = numaq_send_IPI_allbutself,
+ .send_IPI_all = numaq_send_IPI_all,
+ .send_IPI_self = default_send_IPI_self,
+
+ .wakeup_secondary_cpu = wakeup_secondary_cpu_via_nmi,
+ .trampoline_phys_low = NUMAQ_TRAMPOLINE_PHYS_LOW,
+ .trampoline_phys_high = NUMAQ_TRAMPOLINE_PHYS_HIGH,
+
+ /* We don't do anything here because we use NMI's to boot instead */
+ .wait_for_init_deassert = NULL,
+
+ .smp_callin_clear_local_apic = numaq_smp_callin_clear_local_apic,
+ .inquire_remote_apic = NULL,
+
+ .read = native_apic_mem_read,
+ .write = native_apic_mem_write,
+ .icr_read = native_apic_icr_read,
+ .icr_write = native_apic_icr_write,
+ .wait_icr_idle = native_apic_wait_icr_idle,
+ .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
+};
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
new file mode 100644
index 00000000000..141c99a1c26
--- /dev/null
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -0,0 +1,284 @@
+/*
+ * Default generic APIC driver. This handles up to 8 CPUs.
+ *
+ * Copyright 2003 Andi Kleen, SuSE Labs.
+ * Subject to the GNU Public License, v.2
+ *
+ * Generic x86 APIC driver probe layer.
+ */
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <asm/fixmap.h>
+#include <asm/mpspec.h>
+#include <asm/apicdef.h>
+#include <asm/apic.h>
+#include <asm/setup.h>
+
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <asm/mpspec.h>
+#include <asm/fixmap.h>
+#include <asm/apicdef.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/smp.h>
+#include <linux/init.h>
+#include <asm/ipi.h>
+
+#include <linux/smp.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <asm/acpi.h>
+#include <asm/e820.h>
+#include <asm/setup.h>
+
+#ifdef CONFIG_HOTPLUG_CPU
+#define DEFAULT_SEND_IPI (1)
+#else
+#define DEFAULT_SEND_IPI (0)
+#endif
+
+int no_broadcast = DEFAULT_SEND_IPI;
+
+static __init int no_ipi_broadcast(char *str)
+{
+ get_option(&str, &no_broadcast);
+ pr_info("Using %s mode\n",
+ no_broadcast ? "No IPI Broadcast" : "IPI Broadcast");
+ return 1;
+}
+__setup("no_ipi_broadcast=", no_ipi_broadcast);
+
+static int __init print_ipi_mode(void)
+{
+ pr_info("Using IPI %s mode\n",
+ no_broadcast ? "No-Shortcut" : "Shortcut");
+ return 0;
+}
+late_initcall(print_ipi_mode);
+
+void default_setup_apic_routing(void)
+{
+#ifdef CONFIG_X86_IO_APIC
+ printk(KERN_INFO
+ "Enabling APIC mode: Flat. Using %d I/O APICs\n",
+ nr_ioapics);
+#endif
+}
+
+static void default_vector_allocation_domain(int cpu, struct cpumask *retmask)
+{
+ /*
+ * Careful. Some cpus do not strictly honor the set of cpus
+ * specified in the interrupt destination when using lowest
+ * priority interrupt delivery mode.
+ *
+ * In particular there was a hyperthreading cpu observed to
+ * deliver interrupts to the wrong hyperthread when only one
+ * hyperthread was specified in the interrupt desitination.
+ */
+ *retmask = (cpumask_t) { { [0] = APIC_ALL_CPUS } };
+}
+
+/* should be called last. */
+static int probe_default(void)
+{
+ return 1;
+}
+
+struct apic apic_default = {
+
+ .name = "default",
+ .probe = probe_default,
+ .acpi_madt_oem_check = NULL,
+ .apic_id_registered = default_apic_id_registered,
+
+ .irq_delivery_mode = dest_LowestPrio,
+ /* logical delivery broadcast to all CPUs: */
+ .irq_dest_mode = 1,
+
+ .target_cpus = default_target_cpus,
+ .disable_esr = 0,
+ .dest_logical = APIC_DEST_LOGICAL,
+ .check_apicid_used = default_check_apicid_used,
+ .check_apicid_present = default_check_apicid_present,
+
+ .vector_allocation_domain = default_vector_allocation_domain,
+ .init_apic_ldr = default_init_apic_ldr,
+
+ .ioapic_phys_id_map = default_ioapic_phys_id_map,
+ .setup_apic_routing = default_setup_apic_routing,
+ .multi_timer_check = NULL,
+ .apicid_to_node = default_apicid_to_node,
+ .cpu_to_logical_apicid = default_cpu_to_logical_apicid,
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = default_apicid_to_cpu_present,
+ .setup_portio_remap = NULL,
+ .check_phys_apicid_present = default_check_phys_apicid_present,
+ .enable_apic_mode = NULL,
+ .phys_pkg_id = default_phys_pkg_id,
+ .mps_oem_check = NULL,
+
+ .get_apic_id = default_get_apic_id,
+ .set_apic_id = NULL,
+ .apic_id_mask = 0x0F << 24,
+
+ .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
+
+ .send_IPI_mask = default_send_IPI_mask_logical,
+ .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_logical,
+ .send_IPI_allbutself = default_send_IPI_allbutself,
+ .send_IPI_all = default_send_IPI_all,
+ .send_IPI_self = default_send_IPI_self,
+
+ .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
+ .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
+
+ .wait_for_init_deassert = default_wait_for_init_deassert,
+
+ .smp_callin_clear_local_apic = NULL,
+ .inquire_remote_apic = default_inquire_remote_apic,
+
+ .read = native_apic_mem_read,
+ .write = native_apic_mem_write,
+ .icr_read = native_apic_icr_read,
+ .icr_write = native_apic_icr_write,
+ .wait_icr_idle = native_apic_wait_icr_idle,
+ .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
+};
+
+extern struct apic apic_numaq;
+extern struct apic apic_summit;
+extern struct apic apic_bigsmp;
+extern struct apic apic_es7000;
+extern struct apic apic_es7000_cluster;
+extern struct apic apic_default;
+
+struct apic *apic = &apic_default;
+EXPORT_SYMBOL_GPL(apic);
+
+static struct apic *apic_probe[] __initdata = {
+#ifdef CONFIG_X86_NUMAQ
+ &apic_numaq,
+#endif
+#ifdef CONFIG_X86_SUMMIT
+ &apic_summit,
+#endif
+#ifdef CONFIG_X86_BIGSMP
+ &apic_bigsmp,
+#endif
+#ifdef CONFIG_X86_ES7000
+ &apic_es7000,
+ &apic_es7000_cluster,
+#endif
+ &apic_default, /* must be last */
+ NULL,
+};
+
+static int cmdline_apic __initdata;
+static int __init parse_apic(char *arg)
+{
+ int i;
+
+ if (!arg)
+ return -EINVAL;
+
+ for (i = 0; apic_probe[i]; i++) {
+ if (!strcmp(apic_probe[i]->name, arg)) {
+ apic = apic_probe[i];
+ cmdline_apic = 1;
+ return 0;
+ }
+ }
+
+ /* Parsed again by __setup for debug/verbose */
+ return 0;
+}
+early_param("apic", parse_apic);
+
+void __init generic_bigsmp_probe(void)
+{
+#ifdef CONFIG_X86_BIGSMP
+ /*
+ * This routine is used to switch to bigsmp mode when
+ * - There is no apic= option specified by the user
+ * - generic_apic_probe() has chosen apic_default as the sub_arch
+ * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support
+ */
+
+ if (!cmdline_apic && apic == &apic_default) {
+ if (apic_bigsmp.probe()) {
+ apic = &apic_bigsmp;
+ printk(KERN_INFO "Overriding APIC driver with %s\n",
+ apic->name);
+ }
+ }
+#endif
+}
+
+void __init generic_apic_probe(void)
+{
+ if (!cmdline_apic) {
+ int i;
+ for (i = 0; apic_probe[i]; i++) {
+ if (apic_probe[i]->probe()) {
+ apic = apic_probe[i];
+ break;
+ }
+ }
+ /* Not visible without early console */
+ if (!apic_probe[i])
+ panic("Didn't find an APIC driver");
+ }
+ printk(KERN_INFO "Using APIC driver %s\n", apic->name);
+}
+
+/* These functions can switch the APIC even after the initial ->probe() */
+
+int __init
+generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
+{
+ int i;
+
+ for (i = 0; apic_probe[i]; ++i) {
+ if (!apic_probe[i]->mps_oem_check)
+ continue;
+ if (!apic_probe[i]->mps_oem_check(mpc, oem, productid))
+ continue;
+
+ if (!cmdline_apic) {
+ apic = apic_probe[i];
+ printk(KERN_INFO "Switched to APIC driver `%s'.\n",
+ apic->name);
+ }
+ return 1;
+ }
+ return 0;
+}
+
+int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ int i;
+
+ for (i = 0; apic_probe[i]; ++i) {
+ if (!apic_probe[i]->acpi_madt_oem_check)
+ continue;
+ if (!apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id))
+ continue;
+
+ if (!cmdline_apic) {
+ apic = apic_probe[i];
+ printk(KERN_INFO "Switched to APIC driver `%s'.\n",
+ apic->name);
+ }
+ return 1;
+ }
+ return 0;
+}
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
new file mode 100644
index 00000000000..1783652bb0e
--- /dev/null
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -0,0 +1,100 @@
+/*
+ * Copyright 2004 James Cleverdon, IBM.
+ * Subject to the GNU Public License, v.2
+ *
+ * Generic APIC sub-arch probe layer.
+ *
+ * Hacked for x86-64 by James Cleverdon from i386 architecture code by
+ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
+ * James Cleverdon.
+ */
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/hardirq.h>
+#include <linux/dmar.h>
+
+#include <asm/smp.h>
+#include <asm/apic.h>
+#include <asm/ipi.h>
+#include <asm/setup.h>
+
+extern struct apic apic_flat;
+extern struct apic apic_physflat;
+extern struct apic apic_x2xpic_uv_x;
+extern struct apic apic_x2apic_phys;
+extern struct apic apic_x2apic_cluster;
+
+struct apic __read_mostly *apic = &apic_flat;
+EXPORT_SYMBOL_GPL(apic);
+
+static struct apic *apic_probe[] __initdata = {
+#ifdef CONFIG_X86_UV
+ &apic_x2apic_uv_x,
+#endif
+#ifdef CONFIG_X86_X2APIC
+ &apic_x2apic_phys,
+ &apic_x2apic_cluster,
+#endif
+ &apic_physflat,
+ NULL,
+};
+
+/*
+ * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
+ */
+void __init default_setup_apic_routing(void)
+{
+#ifdef CONFIG_X86_X2APIC
+ if (x2apic && (apic != &apic_x2apic_phys &&
+#ifdef CONFIG_X86_UV
+ apic != &apic_x2apic_uv_x &&
+#endif
+ apic != &apic_x2apic_cluster)) {
+ if (x2apic_phys)
+ apic = &apic_x2apic_phys;
+ else
+ apic = &apic_x2apic_cluster;
+ printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
+ }
+#endif
+
+ if (apic == &apic_flat) {
+ if (max_physical_apicid >= 8)
+ apic = &apic_physflat;
+ printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
+ }
+
+ /*
+ * Now that apic routing model is selected, configure the
+ * fault handling for intr remapping.
+ */
+ if (intr_remapping_enabled)
+ enable_drhd_fault_handling();
+}
+
+/* Same for both flat and physical. */
+
+void apic_send_IPI_self(int vector)
+{
+ __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
+}
+
+int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ int i;
+
+ for (i = 0; apic_probe[i]; ++i) {
+ if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) {
+ apic = apic_probe[i];
+ printk(KERN_INFO "Setting APIC routing to %s.\n",
+ apic->name);
+ return 1;
+ }
+ }
+ return 0;
+}
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
new file mode 100644
index 00000000000..aac52fa873f
--- /dev/null
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -0,0 +1,579 @@
+/*
+ * IBM Summit-Specific Code
+ *
+ * Written By: Matthew Dobson, IBM Corporation
+ *
+ * Copyright (c) 2003 IBM Corp.
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to <colpatch@us.ibm.com>
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <asm/io.h>
+#include <asm/bios_ebda.h>
+
+/*
+ * APIC driver for the IBM "Summit" chipset.
+ */
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <asm/mpspec.h>
+#include <asm/apic.h>
+#include <asm/smp.h>
+#include <asm/fixmap.h>
+#include <asm/apicdef.h>
+#include <asm/ipi.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/gfp.h>
+#include <linux/smp.h>
+
+static unsigned summit_get_apic_id(unsigned long x)
+{
+ return (x >> 24) & 0xFF;
+}
+
+static inline void summit_send_IPI_mask(const cpumask_t *mask, int vector)
+{
+ default_send_IPI_mask_sequence_logical(mask, vector);
+}
+
+static void summit_send_IPI_allbutself(int vector)
+{
+ cpumask_t mask = cpu_online_map;
+ cpu_clear(smp_processor_id(), mask);
+
+ if (!cpus_empty(mask))
+ summit_send_IPI_mask(&mask, vector);
+}
+
+static void summit_send_IPI_all(int vector)
+{
+ summit_send_IPI_mask(&cpu_online_map, vector);
+}
+
+#include <asm/tsc.h>
+
+extern int use_cyclone;
+
+#ifdef CONFIG_X86_SUMMIT_NUMA
+static void setup_summit(void);
+#else
+static inline void setup_summit(void) {}
+#endif
+
+static int summit_mps_oem_check(struct mpc_table *mpc, char *oem,
+ char *productid)
+{
+ if (!strncmp(oem, "IBM ENSW", 8) &&
+ (!strncmp(productid, "VIGIL SMP", 9)
+ || !strncmp(productid, "EXA", 3)
+ || !strncmp(productid, "RUTHLESS SMP", 12))){
+ mark_tsc_unstable("Summit based system");
+ use_cyclone = 1; /*enable cyclone-timer*/
+ setup_summit();
+ return 1;
+ }
+ return 0;
+}
+
+/* Hook from generic ACPI tables.c */
+static int summit_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ if (!strncmp(oem_id, "IBM", 3) &&
+ (!strncmp(oem_table_id, "SERVIGIL", 8)
+ || !strncmp(oem_table_id, "EXA", 3))){
+ mark_tsc_unstable("Summit based system");
+ use_cyclone = 1; /*enable cyclone-timer*/
+ setup_summit();
+ return 1;
+ }
+ return 0;
+}
+
+struct rio_table_hdr {
+ unsigned char version; /* Version number of this data structure */
+ /* Version 3 adds chassis_num & WP_index */
+ unsigned char num_scal_dev; /* # of Scalability devices (Twisters for Vigil) */
+ unsigned char num_rio_dev; /* # of RIO I/O devices (Cyclones and Winnipegs) */
+} __attribute__((packed));
+
+struct scal_detail {
+ unsigned char node_id; /* Scalability Node ID */
+ unsigned long CBAR; /* Address of 1MB register space */
+ unsigned char port0node; /* Node ID port connected to: 0xFF=None */
+ unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */
+ unsigned char port1node; /* Node ID port connected to: 0xFF = None */
+ unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */
+ unsigned char port2node; /* Node ID port connected to: 0xFF = None */
+ unsigned char port2port; /* Port num port connected to: 0,1,2, or 0xFF=None */
+ unsigned char chassis_num; /* 1 based Chassis number (1 = boot node) */
+} __attribute__((packed));
+
+struct rio_detail {
+ unsigned char node_id; /* RIO Node ID */
+ unsigned long BBAR; /* Address of 1MB register space */
+ unsigned char type; /* Type of device */
+ unsigned char owner_id; /* For WPEG: Node ID of Cyclone that owns this WPEG*/
+ /* For CYC: Node ID of Twister that owns this CYC */
+ unsigned char port0node; /* Node ID port connected to: 0xFF=None */
+ unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */
+ unsigned char port1node; /* Node ID port connected to: 0xFF=None */
+ unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */
+ unsigned char first_slot; /* For WPEG: Lowest slot number below this WPEG */
+ /* For CYC: 0 */
+ unsigned char status; /* For WPEG: Bit 0 = 1 : the XAPIC is used */
+ /* = 0 : the XAPIC is not used, ie:*/
+ /* ints fwded to another XAPIC */
+ /* Bits1:7 Reserved */
+ /* For CYC: Bits0:7 Reserved */
+ unsigned char WP_index; /* For WPEG: WPEG instance index - lower ones have */
+ /* lower slot numbers/PCI bus numbers */
+ /* For CYC: No meaning */
+ unsigned char chassis_num; /* 1 based Chassis number */
+ /* For LookOut WPEGs this field indicates the */
+ /* Expansion Chassis #, enumerated from Boot */
+ /* Node WPEG external port, then Boot Node CYC */
+ /* external port, then Next Vigil chassis WPEG */
+ /* external port, etc. */
+ /* Shared Lookouts have only 1 chassis number (the */
+ /* first one assigned) */
+} __attribute__((packed));
+
+
+typedef enum {
+ CompatTwister = 0, /* Compatibility Twister */
+ AltTwister = 1, /* Alternate Twister of internal 8-way */
+ CompatCyclone = 2, /* Compatibility Cyclone */
+ AltCyclone = 3, /* Alternate Cyclone of internal 8-way */
+ CompatWPEG = 4, /* Compatibility WPEG */
+ AltWPEG = 5, /* Second Planar WPEG */
+ LookOutAWPEG = 6, /* LookOut WPEG */
+ LookOutBWPEG = 7, /* LookOut WPEG */
+} node_type;
+
+static inline int is_WPEG(struct rio_detail *rio){
+ return (rio->type == CompatWPEG || rio->type == AltWPEG ||
+ rio->type == LookOutAWPEG || rio->type == LookOutBWPEG);
+}
+
+
+/* In clustered mode, the high nibble of APIC ID is a cluster number.
+ * The low nibble is a 4-bit bitmap. */
+#define XAPIC_DEST_CPUS_SHIFT 4
+#define XAPIC_DEST_CPUS_MASK ((1u << XAPIC_DEST_CPUS_SHIFT) - 1)
+#define XAPIC_DEST_CLUSTER_MASK (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT)
+
+#define SUMMIT_APIC_DFR_VALUE (APIC_DFR_CLUSTER)
+
+static const cpumask_t *summit_target_cpus(void)
+{
+ /* CPU_MASK_ALL (0xff) has undefined behaviour with
+ * dest_LowestPrio mode logical clustered apic interrupt routing
+ * Just start on cpu 0. IRQ balancing will spread load
+ */
+ return &cpumask_of_cpu(0);
+}
+
+static unsigned long summit_check_apicid_used(physid_mask_t bitmap, int apicid)
+{
+ return 0;
+}
+
+/* we don't use the phys_cpu_present_map to indicate apicid presence */
+static unsigned long summit_check_apicid_present(int bit)
+{
+ return 1;
+}
+
+static void summit_init_apic_ldr(void)
+{
+ unsigned long val, id;
+ int count = 0;
+ u8 my_id = (u8)hard_smp_processor_id();
+ u8 my_cluster = APIC_CLUSTER(my_id);
+#ifdef CONFIG_SMP
+ u8 lid;
+ int i;
+
+ /* Create logical APIC IDs by counting CPUs already in cluster. */
+ for (count = 0, i = nr_cpu_ids; --i >= 0; ) {
+ lid = cpu_2_logical_apicid[i];
+ if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster)
+ ++count;
+ }
+#endif
+ /* We only have a 4 wide bitmap in cluster mode. If a deranged
+ * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */
+ BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT);
+ id = my_cluster | (1UL << count);
+ apic_write(APIC_DFR, SUMMIT_APIC_DFR_VALUE);
+ val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
+ val |= SET_APIC_LOGICAL_ID(id);
+ apic_write(APIC_LDR, val);
+}
+
+static int summit_apic_id_registered(void)
+{
+ return 1;
+}
+
+static void summit_setup_apic_routing(void)
+{
+ printk("Enabling APIC mode: Summit. Using %d I/O APICs\n",
+ nr_ioapics);
+}
+
+static int summit_apicid_to_node(int logical_apicid)
+{
+#ifdef CONFIG_SMP
+ return apicid_2_node[hard_smp_processor_id()];
+#else
+ return 0;
+#endif
+}
+
+/* Mapping from cpu number to logical apicid */
+static inline int summit_cpu_to_logical_apicid(int cpu)
+{
+#ifdef CONFIG_SMP
+ if (cpu >= nr_cpu_ids)
+ return BAD_APICID;
+ return cpu_2_logical_apicid[cpu];
+#else
+ return logical_smp_processor_id();
+#endif
+}
+
+static int summit_cpu_present_to_apicid(int mps_cpu)
+{
+ if (mps_cpu < nr_cpu_ids)
+ return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
+ else
+ return BAD_APICID;
+}
+
+static physid_mask_t summit_ioapic_phys_id_map(physid_mask_t phys_id_map)
+{
+ /* For clustered we don't have a good way to do this yet - hack */
+ return physids_promote(0x0F);
+}
+
+static physid_mask_t summit_apicid_to_cpu_present(int apicid)
+{
+ return physid_mask_of_physid(0);
+}
+
+static int summit_check_phys_apicid_present(int boot_cpu_physical_apicid)
+{
+ return 1;
+}
+
+static unsigned int summit_cpu_mask_to_apicid(const cpumask_t *cpumask)
+{
+ unsigned int round = 0;
+ int cpu, apicid = 0;
+
+ /*
+ * The cpus in the mask must all be on the apic cluster.
+ */
+ for_each_cpu(cpu, cpumask) {
+ int new_apicid = summit_cpu_to_logical_apicid(cpu);
+
+ if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
+ printk("%s: Not a valid mask!\n", __func__);
+ return BAD_APICID;
+ }
+ apicid |= new_apicid;
+ round++;
+ }
+ return apicid;
+}
+
+static unsigned int summit_cpu_mask_to_apicid_and(const struct cpumask *inmask,
+ const struct cpumask *andmask)
+{
+ int apicid = summit_cpu_to_logical_apicid(0);
+ cpumask_var_t cpumask;
+
+ if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
+ return apicid;
+
+ cpumask_and(cpumask, inmask, andmask);
+ cpumask_and(cpumask, cpumask, cpu_online_mask);
+ apicid = summit_cpu_mask_to_apicid(cpumask);
+
+ free_cpumask_var(cpumask);
+
+ return apicid;
+}
+
+/*
+ * cpuid returns the value latched in the HW at reset, not the APIC ID
+ * register's value. For any box whose BIOS changes APIC IDs, like
+ * clustered APIC systems, we must use hard_smp_processor_id.
+ *
+ * See Intel's IA-32 SW Dev's Manual Vol2 under CPUID.
+ */
+static int summit_phys_pkg_id(int cpuid_apic, int index_msb)
+{
+ return hard_smp_processor_id() >> index_msb;
+}
+
+static int probe_summit(void)
+{
+ /* probed later in mptable/ACPI hooks */
+ return 0;
+}
+
+static void summit_vector_allocation_domain(int cpu, cpumask_t *retmask)
+{
+ /* Careful. Some cpus do not strictly honor the set of cpus
+ * specified in the interrupt destination when using lowest
+ * priority interrupt delivery mode.
+ *
+ * In particular there was a hyperthreading cpu observed to
+ * deliver interrupts to the wrong hyperthread when only one
+ * hyperthread was specified in the interrupt desitination.
+ */
+ *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } };
+}
+
+#ifdef CONFIG_X86_SUMMIT_NUMA
+static struct rio_table_hdr *rio_table_hdr;
+static struct scal_detail *scal_devs[MAX_NUMNODES];
+static struct rio_detail *rio_devs[MAX_NUMNODES*4];
+
+#ifndef CONFIG_X86_NUMAQ
+static int mp_bus_id_to_node[MAX_MP_BUSSES];
+#endif
+
+static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
+{
+ int twister = 0, node = 0;
+ int i, bus, num_buses;
+
+ for (i = 0; i < rio_table_hdr->num_rio_dev; i++) {
+ if (rio_devs[i]->node_id == rio_devs[wpeg_num]->owner_id) {
+ twister = rio_devs[i]->owner_id;
+ break;
+ }
+ }
+ if (i == rio_table_hdr->num_rio_dev) {
+ printk(KERN_ERR "%s: Couldn't find owner Cyclone for Winnipeg!\n", __func__);
+ return last_bus;
+ }
+
+ for (i = 0; i < rio_table_hdr->num_scal_dev; i++) {
+ if (scal_devs[i]->node_id == twister) {
+ node = scal_devs[i]->node_id;
+ break;
+ }
+ }
+ if (i == rio_table_hdr->num_scal_dev) {
+ printk(KERN_ERR "%s: Couldn't find owner Twister for Cyclone!\n", __func__);
+ return last_bus;
+ }
+
+ switch (rio_devs[wpeg_num]->type) {
+ case CompatWPEG:
+ /*
+ * The Compatibility Winnipeg controls the 2 legacy buses,
+ * the 66MHz PCI bus [2 slots] and the 2 "extra" buses in case
+ * a PCI-PCI bridge card is used in either slot: total 5 buses.
+ */
+ num_buses = 5;
+ break;
+ case AltWPEG:
+ /*
+ * The Alternate Winnipeg controls the 2 133MHz buses [1 slot
+ * each], their 2 "extra" buses, the 100MHz bus [2 slots] and
+ * the "extra" buses for each of those slots: total 7 buses.
+ */
+ num_buses = 7;
+ break;
+ case LookOutAWPEG:
+ case LookOutBWPEG:
+ /*
+ * A Lookout Winnipeg controls 3 100MHz buses [2 slots each]
+ * & the "extra" buses for each of those slots: total 9 buses.
+ */
+ num_buses = 9;
+ break;
+ default:
+ printk(KERN_INFO "%s: Unsupported Winnipeg type!\n", __func__);
+ return last_bus;
+ }
+
+ for (bus = last_bus; bus < last_bus + num_buses; bus++)
+ mp_bus_id_to_node[bus] = node;
+ return bus;
+}
+
+static int build_detail_arrays(void)
+{
+ unsigned long ptr;
+ int i, scal_detail_size, rio_detail_size;
+
+ if (rio_table_hdr->num_scal_dev > MAX_NUMNODES) {
+ printk(KERN_WARNING "%s: MAX_NUMNODES too low! Defined as %d, but system has %d nodes.\n", __func__, MAX_NUMNODES, rio_table_hdr->num_scal_dev);
+ return 0;
+ }
+
+ switch (rio_table_hdr->version) {
+ default:
+ printk(KERN_WARNING "%s: Invalid Rio Grande Table Version: %d\n", __func__, rio_table_hdr->version);
+ return 0;
+ case 2:
+ scal_detail_size = 11;
+ rio_detail_size = 13;
+ break;
+ case 3:
+ scal_detail_size = 12;
+ rio_detail_size = 15;
+ break;
+ }
+
+ ptr = (unsigned long)rio_table_hdr + 3;
+ for (i = 0; i < rio_table_hdr->num_scal_dev; i++, ptr += scal_detail_size)
+ scal_devs[i] = (struct scal_detail *)ptr;
+
+ for (i = 0; i < rio_table_hdr->num_rio_dev; i++, ptr += rio_detail_size)
+ rio_devs[i] = (struct rio_detail *)ptr;
+
+ return 1;
+}
+
+void setup_summit(void)
+{
+ unsigned long ptr;
+ unsigned short offset;
+ int i, next_wpeg, next_bus = 0;
+
+ /* The pointer to the EBDA is stored in the word @ phys 0x40E(40:0E) */
+ ptr = get_bios_ebda();
+ ptr = (unsigned long)phys_to_virt(ptr);
+
+ rio_table_hdr = NULL;
+ offset = 0x180;
+ while (offset) {
+ /* The block id is stored in the 2nd word */
+ if (*((unsigned short *)(ptr + offset + 2)) == 0x4752) {
+ /* set the pointer past the offset & block id */
+ rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4);
+ break;
+ }
+ /* The next offset is stored in the 1st word. 0 means no more */
+ offset = *((unsigned short *)(ptr + offset));
+ }
+ if (!rio_table_hdr) {
+ printk(KERN_ERR "%s: Unable to locate Rio Grande Table in EBDA - bailing!\n", __func__);
+ return;
+ }
+
+ if (!build_detail_arrays())
+ return;
+
+ /* The first Winnipeg we're looking for has an index of 0 */
+ next_wpeg = 0;
+ do {
+ for (i = 0; i < rio_table_hdr->num_rio_dev; i++) {
+ if (is_WPEG(rio_devs[i]) && rio_devs[i]->WP_index == next_wpeg) {
+ /* It's the Winnipeg we're looking for! */
+ next_bus = setup_pci_node_map_for_wpeg(i, next_bus);
+ next_wpeg++;
+ break;
+ }
+ }
+ /*
+ * If we go through all Rio devices and don't find one with
+ * the next index, it means we've found all the Winnipegs,
+ * and thus all the PCI buses.
+ */
+ if (i == rio_table_hdr->num_rio_dev)
+ next_wpeg = 0;
+ } while (next_wpeg != 0);
+}
+#endif
+
+struct apic apic_summit = {
+
+ .name = "summit",
+ .probe = probe_summit,
+ .acpi_madt_oem_check = summit_acpi_madt_oem_check,
+ .apic_id_registered = summit_apic_id_registered,
+
+ .irq_delivery_mode = dest_LowestPrio,
+ /* logical delivery broadcast to all CPUs: */
+ .irq_dest_mode = 1,
+
+ .target_cpus = summit_target_cpus,
+ .disable_esr = 1,
+ .dest_logical = APIC_DEST_LOGICAL,
+ .check_apicid_used = summit_check_apicid_used,
+ .check_apicid_present = summit_check_apicid_present,
+
+ .vector_allocation_domain = summit_vector_allocation_domain,
+ .init_apic_ldr = summit_init_apic_ldr,
+
+ .ioapic_phys_id_map = summit_ioapic_phys_id_map,
+ .setup_apic_routing = summit_setup_apic_routing,
+ .multi_timer_check = NULL,
+ .apicid_to_node = summit_apicid_to_node,
+ .cpu_to_logical_apicid = summit_cpu_to_logical_apicid,
+ .cpu_present_to_apicid = summit_cpu_present_to_apicid,
+ .apicid_to_cpu_present = summit_apicid_to_cpu_present,
+ .setup_portio_remap = NULL,
+ .check_phys_apicid_present = summit_check_phys_apicid_present,
+ .enable_apic_mode = NULL,
+ .phys_pkg_id = summit_phys_pkg_id,
+ .mps_oem_check = summit_mps_oem_check,
+
+ .get_apic_id = summit_get_apic_id,
+ .set_apic_id = NULL,
+ .apic_id_mask = 0xFF << 24,
+
+ .cpu_mask_to_apicid = summit_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = summit_cpu_mask_to_apicid_and,
+
+ .send_IPI_mask = summit_send_IPI_mask,
+ .send_IPI_mask_allbutself = NULL,
+ .send_IPI_allbutself = summit_send_IPI_allbutself,
+ .send_IPI_all = summit_send_IPI_all,
+ .send_IPI_self = default_send_IPI_self,
+
+ .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
+ .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
+
+ .wait_for_init_deassert = default_wait_for_init_deassert,
+
+ .smp_callin_clear_local_apic = NULL,
+ .inquire_remote_apic = default_inquire_remote_apic,
+
+ .read = native_apic_mem_read,
+ .write = native_apic_mem_write,
+ .icr_read = native_apic_icr_read,
+ .icr_write = native_apic_icr_write,
+ .wait_icr_idle = native_apic_wait_icr_idle,
+ .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
+};
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
new file mode 100644
index 00000000000..4a903e2f0d1
--- /dev/null
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -0,0 +1,245 @@
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/dmar.h>
+
+#include <asm/smp.h>
+#include <asm/apic.h>
+#include <asm/ipi.h>
+
+DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
+
+static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ return x2apic_enabled();
+}
+
+/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
+
+static const struct cpumask *x2apic_target_cpus(void)
+{
+ return cpumask_of(0);
+}
+
+/*
+ * for now each logical cpu is in its own vector allocation domain.
+ */
+static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
+{
+ cpumask_clear(retmask);
+ cpumask_set_cpu(cpu, retmask);
+}
+
+static void
+ __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
+{
+ unsigned long cfg;
+
+ cfg = __prepare_ICR(0, vector, dest);
+
+ /*
+ * send the IPI.
+ */
+ native_x2apic_icr_write(cfg, apicid);
+}
+
+/*
+ * for now, we send the IPI's one by one in the cpumask.
+ * TBD: Based on the cpu mask, we can send the IPI's to the cluster group
+ * at once. We have 16 cpu's in a cluster. This will minimize IPI register
+ * writes.
+ */
+static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
+{
+ unsigned long query_cpu;
+ unsigned long flags;
+
+ x2apic_wrmsr_fence();
+
+ local_irq_save(flags);
+ for_each_cpu(query_cpu, mask) {
+ __x2apic_send_IPI_dest(
+ per_cpu(x86_cpu_to_logical_apicid, query_cpu),
+ vector, apic->dest_logical);
+ }
+ local_irq_restore(flags);
+}
+
+static void
+ x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
+{
+ unsigned long this_cpu = smp_processor_id();
+ unsigned long query_cpu;
+ unsigned long flags;
+
+ x2apic_wrmsr_fence();
+
+ local_irq_save(flags);
+ for_each_cpu(query_cpu, mask) {
+ if (query_cpu == this_cpu)
+ continue;
+ __x2apic_send_IPI_dest(
+ per_cpu(x86_cpu_to_logical_apicid, query_cpu),
+ vector, apic->dest_logical);
+ }
+ local_irq_restore(flags);
+}
+
+static void x2apic_send_IPI_allbutself(int vector)
+{
+ unsigned long this_cpu = smp_processor_id();
+ unsigned long query_cpu;
+ unsigned long flags;
+
+ x2apic_wrmsr_fence();
+
+ local_irq_save(flags);
+ for_each_online_cpu(query_cpu) {
+ if (query_cpu == this_cpu)
+ continue;
+ __x2apic_send_IPI_dest(
+ per_cpu(x86_cpu_to_logical_apicid, query_cpu),
+ vector, apic->dest_logical);
+ }
+ local_irq_restore(flags);
+}
+
+static void x2apic_send_IPI_all(int vector)
+{
+ x2apic_send_IPI_mask(cpu_online_mask, vector);
+}
+
+static int x2apic_apic_id_registered(void)
+{
+ return 1;
+}
+
+static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
+{
+ /*
+ * We're using fixed IRQ delivery, can only return one logical APIC ID.
+ * May as well be the first.
+ */
+ int cpu = cpumask_first(cpumask);
+
+ if ((unsigned)cpu < nr_cpu_ids)
+ return per_cpu(x86_cpu_to_logical_apicid, cpu);
+ else
+ return BAD_APICID;
+}
+
+static unsigned int
+x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+ const struct cpumask *andmask)
+{
+ int cpu;
+
+ /*
+ * We're using fixed IRQ delivery, can only return one logical APIC ID.
+ * May as well be the first.
+ */
+ for_each_cpu_and(cpu, cpumask, andmask) {
+ if (cpumask_test_cpu(cpu, cpu_online_mask))
+ break;
+ }
+
+ if (cpu < nr_cpu_ids)
+ return per_cpu(x86_cpu_to_logical_apicid, cpu);
+
+ return BAD_APICID;
+}
+
+static unsigned int x2apic_cluster_phys_get_apic_id(unsigned long x)
+{
+ unsigned int id;
+
+ id = x;
+ return id;
+}
+
+static unsigned long set_apic_id(unsigned int id)
+{
+ unsigned long x;
+
+ x = id;
+ return x;
+}
+
+static int x2apic_cluster_phys_pkg_id(int initial_apicid, int index_msb)
+{
+ return current_cpu_data.initial_apicid >> index_msb;
+}
+
+static void x2apic_send_IPI_self(int vector)
+{
+ apic_write(APIC_SELF_IPI, vector);
+}
+
+static void init_x2apic_ldr(void)
+{
+ int cpu = smp_processor_id();
+
+ per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR);
+}
+
+struct apic apic_x2apic_cluster = {
+
+ .name = "cluster x2apic",
+ .probe = NULL,
+ .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
+ .apic_id_registered = x2apic_apic_id_registered,
+
+ .irq_delivery_mode = dest_LowestPrio,
+ .irq_dest_mode = 1, /* logical */
+
+ .target_cpus = x2apic_target_cpus,
+ .disable_esr = 0,
+ .dest_logical = APIC_DEST_LOGICAL,
+ .check_apicid_used = NULL,
+ .check_apicid_present = NULL,
+
+ .vector_allocation_domain = x2apic_vector_allocation_domain,
+ .init_apic_ldr = init_x2apic_ldr,
+
+ .ioapic_phys_id_map = NULL,
+ .setup_apic_routing = NULL,
+ .multi_timer_check = NULL,
+ .apicid_to_node = NULL,
+ .cpu_to_logical_apicid = NULL,
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = NULL,
+ .setup_portio_remap = NULL,
+ .check_phys_apicid_present = default_check_phys_apicid_present,
+ .enable_apic_mode = NULL,
+ .phys_pkg_id = x2apic_cluster_phys_pkg_id,
+ .mps_oem_check = NULL,
+
+ .get_apic_id = x2apic_cluster_phys_get_apic_id,
+ .set_apic_id = set_apic_id,
+ .apic_id_mask = 0xFFFFFFFFu,
+
+ .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
+
+ .send_IPI_mask = x2apic_send_IPI_mask,
+ .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = x2apic_send_IPI_allbutself,
+ .send_IPI_all = x2apic_send_IPI_all,
+ .send_IPI_self = x2apic_send_IPI_self,
+
+ .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
+ .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
+ .wait_for_init_deassert = NULL,
+ .smp_callin_clear_local_apic = NULL,
+ .inquire_remote_apic = NULL,
+
+ .read = native_apic_msr_read,
+ .write = native_apic_msr_write,
+ .icr_read = native_x2apic_icr_read,
+ .icr_write = native_x2apic_icr_write,
+ .wait_icr_idle = native_x2apic_wait_icr_idle,
+ .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
+};
diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 21bcc0e098b..a284359627e 100644
--- a/arch/x86/kernel/genx2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -7,10 +7,10 @@
#include <linux/dmar.h>
#include <asm/smp.h>
+#include <asm/apic.h>
#include <asm/ipi.h>
-#include <asm/genapic.h>
-static int x2apic_phys;
+int x2apic_phys;
static int set_x2apic_phys_mode(char *arg)
{
@@ -21,10 +21,10 @@ early_param("x2apic_phys", set_x2apic_phys_mode);
static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
- if (cpu_has_x2apic && x2apic_phys)
- return 1;
-
- return 0;
+ if (x2apic_phys)
+ return x2apic_enabled();
+ else
+ return 0;
}
/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
@@ -50,13 +50,15 @@ static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
/*
* send the IPI.
*/
- x2apic_icr_write(cfg, apicid);
+ native_x2apic_icr_write(cfg, apicid);
}
static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
{
- unsigned long flags;
unsigned long query_cpu;
+ unsigned long flags;
+
+ x2apic_wrmsr_fence();
local_irq_save(flags);
for_each_cpu(query_cpu, mask) {
@@ -66,12 +68,14 @@ static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
local_irq_restore(flags);
}
-static void x2apic_send_IPI_mask_allbutself(const struct cpumask *mask,
- int vector)
+static void
+ x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
{
- unsigned long flags;
- unsigned long query_cpu;
unsigned long this_cpu = smp_processor_id();
+ unsigned long query_cpu;
+ unsigned long flags;
+
+ x2apic_wrmsr_fence();
local_irq_save(flags);
for_each_cpu(query_cpu, mask) {
@@ -85,16 +89,19 @@ static void x2apic_send_IPI_mask_allbutself(const struct cpumask *mask,
static void x2apic_send_IPI_allbutself(int vector)
{
- unsigned long flags;
- unsigned long query_cpu;
unsigned long this_cpu = smp_processor_id();
+ unsigned long query_cpu;
+ unsigned long flags;
+
+ x2apic_wrmsr_fence();
local_irq_save(flags);
- for_each_online_cpu(query_cpu)
- if (query_cpu != this_cpu)
- __x2apic_send_IPI_dest(
- per_cpu(x86_cpu_to_apicid, query_cpu),
- vector, APIC_DEST_PHYSICAL);
+ for_each_online_cpu(query_cpu) {
+ if (query_cpu == this_cpu)
+ continue;
+ __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
+ vector, APIC_DEST_PHYSICAL);
+ }
local_irq_restore(flags);
}
@@ -110,21 +117,21 @@ static int x2apic_apic_id_registered(void)
static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
{
- int cpu;
-
/*
* We're using fixed IRQ delivery, can only return one phys APIC ID.
* May as well be the first.
*/
- cpu = cpumask_first(cpumask);
+ int cpu = cpumask_first(cpumask);
+
if ((unsigned)cpu < nr_cpu_ids)
return per_cpu(x86_cpu_to_apicid, cpu);
else
return BAD_APICID;
}
-static unsigned int x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
- const struct cpumask *andmask)
+static unsigned int
+x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+ const struct cpumask *andmask)
{
int cpu;
@@ -132,31 +139,28 @@ static unsigned int x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
* We're using fixed IRQ delivery, can only return one phys APIC ID.
* May as well be the first.
*/
- for_each_cpu_and(cpu, cpumask, andmask)
+ for_each_cpu_and(cpu, cpumask, andmask) {
if (cpumask_test_cpu(cpu, cpu_online_mask))
break;
+ }
+
if (cpu < nr_cpu_ids)
return per_cpu(x86_cpu_to_apicid, cpu);
+
return BAD_APICID;
}
-static unsigned int get_apic_id(unsigned long x)
+static unsigned int x2apic_phys_get_apic_id(unsigned long x)
{
- unsigned int id;
-
- id = x;
- return id;
+ return x;
}
static unsigned long set_apic_id(unsigned int id)
{
- unsigned long x;
-
- x = id;
- return x;
+ return id;
}
-static unsigned int phys_pkg_id(int index_msb)
+static int x2apic_phys_pkg_id(int initial_apicid, int index_msb)
{
return current_cpu_data.initial_apicid >> index_msb;
}
@@ -168,27 +172,63 @@ static void x2apic_send_IPI_self(int vector)
static void init_x2apic_ldr(void)
{
- return;
-}
-
-struct genapic apic_x2apic_phys = {
- .name = "physical x2apic",
- .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
- .int_delivery_mode = dest_Fixed,
- .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
- .target_cpus = x2apic_target_cpus,
- .vector_allocation_domain = x2apic_vector_allocation_domain,
- .apic_id_registered = x2apic_apic_id_registered,
- .init_apic_ldr = init_x2apic_ldr,
- .send_IPI_all = x2apic_send_IPI_all,
- .send_IPI_allbutself = x2apic_send_IPI_allbutself,
- .send_IPI_mask = x2apic_send_IPI_mask,
- .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
- .send_IPI_self = x2apic_send_IPI_self,
- .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
- .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
- .phys_pkg_id = phys_pkg_id,
- .get_apic_id = get_apic_id,
- .set_apic_id = set_apic_id,
- .apic_id_mask = (0xFFFFFFFFu),
+}
+
+struct apic apic_x2apic_phys = {
+
+ .name = "physical x2apic",
+ .probe = NULL,
+ .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
+ .apic_id_registered = x2apic_apic_id_registered,
+
+ .irq_delivery_mode = dest_Fixed,
+ .irq_dest_mode = 0, /* physical */
+
+ .target_cpus = x2apic_target_cpus,
+ .disable_esr = 0,
+ .dest_logical = 0,
+ .check_apicid_used = NULL,
+ .check_apicid_present = NULL,
+
+ .vector_allocation_domain = x2apic_vector_allocation_domain,
+ .init_apic_ldr = init_x2apic_ldr,
+
+ .ioapic_phys_id_map = NULL,
+ .setup_apic_routing = NULL,
+ .multi_timer_check = NULL,
+ .apicid_to_node = NULL,
+ .cpu_to_logical_apicid = NULL,
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = NULL,
+ .setup_portio_remap = NULL,
+ .check_phys_apicid_present = default_check_phys_apicid_present,
+ .enable_apic_mode = NULL,
+ .phys_pkg_id = x2apic_phys_pkg_id,
+ .mps_oem_check = NULL,
+
+ .get_apic_id = x2apic_phys_get_apic_id,
+ .set_apic_id = set_apic_id,
+ .apic_id_mask = 0xFFFFFFFFu,
+
+ .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
+
+ .send_IPI_mask = x2apic_send_IPI_mask,
+ .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = x2apic_send_IPI_allbutself,
+ .send_IPI_all = x2apic_send_IPI_all,
+ .send_IPI_self = x2apic_send_IPI_self,
+
+ .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
+ .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
+ .wait_for_init_deassert = NULL,
+ .smp_callin_clear_local_apic = NULL,
+ .inquire_remote_apic = NULL,
+
+ .read = native_apic_msr_read,
+ .write = native_apic_msr_write,
+ .icr_read = native_x2apic_icr_read,
+ .icr_write = native_x2apic_icr_write,
+ .wait_icr_idle = native_x2apic_wait_icr_idle,
+ .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
};
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index b193e082f6c..1bd6da1f8fa 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -7,27 +7,28 @@
*
* Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved.
*/
-
-#include <linux/kernel.h>
-#include <linux/threads.h>
-#include <linux/cpu.h>
#include <linux/cpumask.h>
+#include <linux/hardirq.h>
+#include <linux/proc_fs.h>
+#include <linux/threads.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
#include <linux/string.h>
#include <linux/ctype.h>
-#include <linux/init.h>
#include <linux/sched.h>
-#include <linux/module.h>
-#include <linux/hardirq.h>
#include <linux/timer.h>
-#include <linux/proc_fs.h>
-#include <asm/current.h>
-#include <asm/smp.h>
-#include <asm/ipi.h>
-#include <asm/genapic.h>
-#include <asm/pgtable.h>
+#include <linux/cpu.h>
+#include <linux/init.h>
+
#include <asm/uv/uv_mmrs.h>
#include <asm/uv/uv_hub.h>
+#include <asm/current.h>
+#include <asm/pgtable.h>
#include <asm/uv/bios.h>
+#include <asm/uv/uv.h>
+#include <asm/apic.h>
+#include <asm/ipi.h>
+#include <asm/smp.h>
DEFINE_PER_CPU(int, x2apic_extra_bits);
@@ -90,39 +91,43 @@ static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)
cpumask_set_cpu(cpu, retmask);
}
-int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip)
+static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
{
+#ifdef CONFIG_SMP
unsigned long val;
int pnode;
pnode = uv_apicid_to_pnode(phys_apicid);
val = (1UL << UVH_IPI_INT_SEND_SHFT) |
(phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
- (((long)start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
+ ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
APIC_DM_INIT;
uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
mdelay(10);
val = (1UL << UVH_IPI_INT_SEND_SHFT) |
(phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
- (((long)start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
+ ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
APIC_DM_STARTUP;
uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
+
+ atomic_set(&init_deasserted, 1);
+#endif
return 0;
}
static void uv_send_IPI_one(int cpu, int vector)
{
- unsigned long val, apicid, lapicid;
+ unsigned long val, apicid;
int pnode;
apicid = per_cpu(x86_cpu_to_apicid, cpu);
- lapicid = apicid & 0x3f; /* ZZZ macro needed */
pnode = uv_apicid_to_pnode(apicid);
- val =
- (1UL << UVH_IPI_INT_SEND_SHFT) | (lapicid <<
- UVH_IPI_INT_APIC_ID_SHFT) |
- (vector << UVH_IPI_INT_VECTOR_SHFT);
+
+ val = (1UL << UVH_IPI_INT_SEND_SHFT) |
+ (apicid << UVH_IPI_INT_APIC_ID_SHFT) |
+ (vector << UVH_IPI_INT_VECTOR_SHFT);
+
uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
}
@@ -136,22 +141,24 @@ static void uv_send_IPI_mask(const struct cpumask *mask, int vector)
static void uv_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
{
- unsigned int cpu;
unsigned int this_cpu = smp_processor_id();
+ unsigned int cpu;
- for_each_cpu(cpu, mask)
+ for_each_cpu(cpu, mask) {
if (cpu != this_cpu)
uv_send_IPI_one(cpu, vector);
+ }
}
static void uv_send_IPI_allbutself(int vector)
{
- unsigned int cpu;
unsigned int this_cpu = smp_processor_id();
+ unsigned int cpu;
- for_each_online_cpu(cpu)
+ for_each_online_cpu(cpu) {
if (cpu != this_cpu)
uv_send_IPI_one(cpu, vector);
+ }
}
static void uv_send_IPI_all(int vector)
@@ -170,21 +177,21 @@ static void uv_init_apic_ldr(void)
static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask)
{
- int cpu;
-
/*
* We're using fixed IRQ delivery, can only return one phys APIC ID.
* May as well be the first.
*/
- cpu = cpumask_first(cpumask);
+ int cpu = cpumask_first(cpumask);
+
if ((unsigned)cpu < nr_cpu_ids)
return per_cpu(x86_cpu_to_apicid, cpu);
else
return BAD_APICID;
}
-static unsigned int uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
- const struct cpumask *andmask)
+static unsigned int
+uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+ const struct cpumask *andmask)
{
int cpu;
@@ -192,15 +199,17 @@ static unsigned int uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
* We're using fixed IRQ delivery, can only return one phys APIC ID.
* May as well be the first.
*/
- for_each_cpu_and(cpu, cpumask, andmask)
+ for_each_cpu_and(cpu, cpumask, andmask) {
if (cpumask_test_cpu(cpu, cpu_online_mask))
break;
+ }
if (cpu < nr_cpu_ids)
return per_cpu(x86_cpu_to_apicid, cpu);
+
return BAD_APICID;
}
-static unsigned int get_apic_id(unsigned long x)
+static unsigned int x2apic_get_apic_id(unsigned long x)
{
unsigned int id;
@@ -222,10 +231,10 @@ static unsigned long set_apic_id(unsigned int id)
static unsigned int uv_read_apic_id(void)
{
- return get_apic_id(apic_read(APIC_ID));
+ return x2apic_get_apic_id(apic_read(APIC_ID));
}
-static unsigned int phys_pkg_id(int index_msb)
+static int uv_phys_pkg_id(int initial_apicid, int index_msb)
{
return uv_read_apic_id() >> index_msb;
}
@@ -235,26 +244,64 @@ static void uv_send_IPI_self(int vector)
apic_write(APIC_SELF_IPI, vector);
}
-struct genapic apic_x2apic_uv_x = {
- .name = "UV large system",
- .acpi_madt_oem_check = uv_acpi_madt_oem_check,
- .int_delivery_mode = dest_Fixed,
- .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
- .target_cpus = uv_target_cpus,
- .vector_allocation_domain = uv_vector_allocation_domain,
- .apic_id_registered = uv_apic_id_registered,
- .init_apic_ldr = uv_init_apic_ldr,
- .send_IPI_all = uv_send_IPI_all,
- .send_IPI_allbutself = uv_send_IPI_allbutself,
- .send_IPI_mask = uv_send_IPI_mask,
- .send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself,
- .send_IPI_self = uv_send_IPI_self,
- .cpu_mask_to_apicid = uv_cpu_mask_to_apicid,
- .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and,
- .phys_pkg_id = phys_pkg_id,
- .get_apic_id = get_apic_id,
- .set_apic_id = set_apic_id,
- .apic_id_mask = (0xFFFFFFFFu),
+struct apic apic_x2apic_uv_x = {
+
+ .name = "UV large system",
+ .probe = NULL,
+ .acpi_madt_oem_check = uv_acpi_madt_oem_check,
+ .apic_id_registered = uv_apic_id_registered,
+
+ .irq_delivery_mode = dest_Fixed,
+ .irq_dest_mode = 1, /* logical */
+
+ .target_cpus = uv_target_cpus,
+ .disable_esr = 0,
+ .dest_logical = APIC_DEST_LOGICAL,
+ .check_apicid_used = NULL,
+ .check_apicid_present = NULL,
+
+ .vector_allocation_domain = uv_vector_allocation_domain,
+ .init_apic_ldr = uv_init_apic_ldr,
+
+ .ioapic_phys_id_map = NULL,
+ .setup_apic_routing = NULL,
+ .multi_timer_check = NULL,
+ .apicid_to_node = NULL,
+ .cpu_to_logical_apicid = NULL,
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = NULL,
+ .setup_portio_remap = NULL,
+ .check_phys_apicid_present = default_check_phys_apicid_present,
+ .enable_apic_mode = NULL,
+ .phys_pkg_id = uv_phys_pkg_id,
+ .mps_oem_check = NULL,
+
+ .get_apic_id = x2apic_get_apic_id,
+ .set_apic_id = set_apic_id,
+ .apic_id_mask = 0xFFFFFFFFu,
+
+ .cpu_mask_to_apicid = uv_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and,
+
+ .send_IPI_mask = uv_send_IPI_mask,
+ .send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = uv_send_IPI_allbutself,
+ .send_IPI_all = uv_send_IPI_all,
+ .send_IPI_self = uv_send_IPI_self,
+
+ .wakeup_secondary_cpu = uv_wakeup_secondary,
+ .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
+ .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
+ .wait_for_init_deassert = NULL,
+ .smp_callin_clear_local_apic = NULL,
+ .inquire_remote_apic = NULL,
+
+ .read = native_apic_msr_read,
+ .write = native_apic_msr_write,
+ .icr_read = native_x2apic_icr_read,
+ .icr_write = native_x2apic_icr_write,
+ .wait_icr_idle = native_x2apic_wait_icr_idle,
+ .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
};
static __cpuinit void set_x2apic_extra_bits(int pnode)
@@ -322,7 +369,7 @@ static __init void map_high(char *id, unsigned long base, int shift,
paddr = base << shift;
bytes = (1UL << shift) * (max_pnode + 1);
printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr,
- paddr + bytes);
+ paddr + bytes);
if (map_type == map_uc)
init_extra_mapping_uc(paddr, bytes);
else
@@ -485,7 +532,7 @@ late_initcall(uv_init_heartbeat);
/*
* Called on each cpu to initialize the per_cpu UV data area.
- * ZZZ hotplug not supported yet
+ * FIXME: hotplug not supported yet
*/
void __cpuinit uv_cpu_init(void)
{
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 266ec6c18b6..10033fe718e 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -301,7 +301,7 @@ extern int (*console_blank_hook)(int);
*/
#define APM_ZERO_SEGS
-#include "apm.h"
+#include <asm/apm.h>
/*
* Define to re-initialize the interrupt 0 timer to 100 Hz after a suspend.
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index ee4df08feee..fbf2f33e308 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -75,6 +75,7 @@ void foo(void)
OFFSET(PT_DS, pt_regs, ds);
OFFSET(PT_ES, pt_regs, es);
OFFSET(PT_FS, pt_regs, fs);
+ OFFSET(PT_GS, pt_regs, gs);
OFFSET(PT_ORIG_EAX, pt_regs, orig_ax);
OFFSET(PT_EIP, pt_regs, ip);
OFFSET(PT_CS, pt_regs, cs);
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 1d41d3f1edb..8793ab33e2c 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -11,7 +11,6 @@
#include <linux/hardirq.h>
#include <linux/suspend.h>
#include <linux/kbuild.h>
-#include <asm/pda.h>
#include <asm/processor.h>
#include <asm/segment.h>
#include <asm/thread_info.h>
@@ -48,16 +47,6 @@ int main(void)
#endif
BLANK();
#undef ENTRY
-#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry))
- ENTRY(kernelstack);
- ENTRY(oldrsp);
- ENTRY(pcurrent);
- ENTRY(irqcount);
- ENTRY(cpunumber);
- ENTRY(irqstackptr);
- ENTRY(data_offset);
- BLANK();
-#undef ENTRY
#ifdef CONFIG_PARAVIRT
BLANK();
OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index 2ac0ab71412..fc999e6fc46 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -83,15 +83,15 @@ void __init setup_bios_corruption_check(void)
u64 size;
addr = find_e820_area_size(addr, &size, PAGE_SIZE);
- if (addr == 0)
+ if (!(addr + 1))
+ break;
+
+ if (addr >= corruption_check_size)
break;
if ((addr + size) > corruption_check_size)
size = corruption_check_size - addr;
- if (size == 0)
- break;
-
e820_update_range(addr, size, E820_RAM, E820_RESERVED);
scan_areas[num_scan_areas].addr = addr;
scan_areas[num_scan_areas].size = size;
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 82db7f45e2d..4e242f9a06e 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -14,11 +14,12 @@ obj-y += vmware.o hypervisor.o
obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
obj-$(CONFIG_X86_64) += bugs_64.o
+obj-$(CONFIG_X86_CPU_DEBUG) += cpu_debug.o
+
obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
obj-$(CONFIG_CPU_SUP_AMD) += amd.o
obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
-obj-$(CONFIG_CPU_SUP_CENTAUR_32) += centaur.o
-obj-$(CONFIG_CPU_SUP_CENTAUR_64) += centaur_64.o
+obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 2cf23634b6d..8220ae69849 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -7,7 +7,7 @@
#include <asm/pat.h>
#include <asm/processor.h>
-#include <mach_apic.h>
+#include <asm/apic.h>
struct cpuid_bit {
u16 feature;
@@ -29,7 +29,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
u32 regs[4];
const struct cpuid_bit *cb;
- static const struct cpuid_bit cpuid_bits[] = {
+ static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
{ X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
{ 0, 0, 0, 0 }
};
@@ -69,7 +69,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
*/
void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_X86_SMP
+#ifdef CONFIG_SMP
unsigned int eax, ebx, ecx, edx, sub_index;
unsigned int ht_mask_width, core_plus_mask_width;
unsigned int core_select_mask, core_level_siblings;
@@ -116,22 +116,14 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width;
-#ifdef CONFIG_X86_32
- c->cpu_core_id = phys_pkg_id(c->initial_apicid, ht_mask_width)
+ c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, ht_mask_width)
& core_select_mask;
- c->phys_proc_id = phys_pkg_id(c->initial_apicid, core_plus_mask_width);
+ c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, core_plus_mask_width);
/*
* Reinit the apicid, now that we have extended initial_apicid.
*/
- c->apicid = phys_pkg_id(c->initial_apicid, 0);
-#else
- c->cpu_core_id = phys_pkg_id(ht_mask_width) & core_select_mask;
- c->phys_proc_id = phys_pkg_id(core_plus_mask_width);
- /*
- * Reinit the apicid, now that we have extended initial_apicid.
- */
- c->apicid = phys_pkg_id(0);
-#endif
+ c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
+
c->x86_max_cores = (core_level_siblings / smp_num_siblings);
@@ -143,37 +135,3 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
return;
#endif
}
-
-#ifdef CONFIG_X86_PAT
-void __cpuinit validate_pat_support(struct cpuinfo_x86 *c)
-{
- if (!cpu_has_pat)
- pat_disable("PAT not supported by CPU.");
-
- switch (c->x86_vendor) {
- case X86_VENDOR_INTEL:
- /*
- * There is a known erratum on Pentium III and Core Solo
- * and Core Duo CPUs.
- * " Page with PAT set to WC while associated MTRR is UC
- * may consolidate to UC "
- * Because of this erratum, it is better to stick with
- * setting WC in MTRR rather than using PAT on these CPUs.
- *
- * Enable PAT WC only on P4, Core 2 or later CPUs.
- */
- if (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 15))
- return;
-
- pat_disable("PAT WC disabled due to known CPU erratum.");
- return;
-
- case X86_VENDOR_AMD:
- case X86_VENDOR_CENTAUR:
- case X86_VENDOR_TRANSMETA:
- return;
- }
-
- pat_disable("PAT disabled. Not yet verified on this CPU type.");
-}
-#endif
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 7c878f6aa91..7e4a459daa6 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -5,6 +5,7 @@
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/apic.h>
+#include <asm/cpu.h>
#ifdef CONFIG_X86_64
# include <asm/numa_64.h>
@@ -12,8 +13,6 @@
# include <asm/cacheflush.h>
#endif
-#include <mach_apic.h>
-
#include "cpu.h"
#ifdef CONFIG_X86_32
@@ -143,6 +142,55 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
}
}
+static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+ /* calling is from identify_secondary_cpu() ? */
+ if (c->cpu_index == boot_cpu_id)
+ return;
+
+ /*
+ * Certain Athlons might work (for various values of 'work') in SMP
+ * but they are not certified as MP capable.
+ */
+ /* Athlon 660/661 is valid. */
+ if ((c->x86_model == 6) && ((c->x86_mask == 0) ||
+ (c->x86_mask == 1)))
+ goto valid_k7;
+
+ /* Duron 670 is valid */
+ if ((c->x86_model == 7) && (c->x86_mask == 0))
+ goto valid_k7;
+
+ /*
+ * Athlon 662, Duron 671, and Athlon >model 7 have capability
+ * bit. It's worth noting that the A5 stepping (662) of some
+ * Athlon XP's have the MP bit set.
+ * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for
+ * more.
+ */
+ if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
+ ((c->x86_model == 7) && (c->x86_mask >= 1)) ||
+ (c->x86_model > 7))
+ if (cpu_has_mp)
+ goto valid_k7;
+
+ /* If we get here, not a certified SMP capable AMD system. */
+
+ /*
+ * Don't taint if we are running SMP kernel on a single non-MP
+ * approved Athlon
+ */
+ WARN_ONCE(1, "WARNING: This combination of AMD"
+ "processors is not suitable for SMP.\n");
+ if (!test_taint(TAINT_UNSAFE_SMP))
+ add_taint(TAINT_UNSAFE_SMP);
+
+valid_k7:
+ ;
+#endif
+}
+
static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
{
u32 l, h;
@@ -177,6 +225,8 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
}
set_cpu_cap(c, X86_FEATURE_K7);
+
+ amd_k7_smp_check(c);
}
#endif
@@ -452,7 +502,7 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int
}
#endif
-static struct cpu_dev amd_cpu_dev __cpuinitdata = {
+static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
.c_vendor = "AMD",
.c_ident = { "AuthenticAMD" },
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index 89bfdd9cacc..c95e831bb09 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -1,11 +1,11 @@
+#include <linux/bitops.h>
#include <linux/kernel.h>
#include <linux/init.h>
-#include <linux/bitops.h>
#include <asm/processor.h>
-#include <asm/msr.h>
#include <asm/e820.h>
#include <asm/mtrr.h>
+#include <asm/msr.h>
#include "cpu.h"
@@ -276,7 +276,7 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
*/
c->x86_capability[5] = cpuid_edx(0xC0000001);
}
-
+#ifdef CONFIG_X86_32
/* Cyrix III family needs CX8 & PGE explicitly enabled. */
if (c->x86_model >= 6 && c->x86_model <= 9) {
rdmsr(MSR_VIA_FCR, lo, hi);
@@ -288,6 +288,11 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
/* Before Nehemiah, the C3's had 3dNOW! */
if (c->x86_model >= 6 && c->x86_model < 9)
set_cpu_cap(c, X86_FEATURE_3DNOW);
+#endif
+ if (c->x86 == 0x6 && c->x86_model >= 0xf) {
+ c->x86_cache_alignment = c->x86_clflush_size * 2;
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+ }
display_cacheinfo(c);
}
@@ -316,16 +321,25 @@ enum {
static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
{
switch (c->x86) {
+#ifdef CONFIG_X86_32
case 5:
/* Emulate MTRRs using Centaur's MCR. */
set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR);
break;
+#endif
+ case 6:
+ if (c->x86_model >= 0xf)
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+ break;
}
+#ifdef CONFIG_X86_64
+ set_cpu_cap(c, X86_FEATURE_SYSENTER32);
+#endif
}
static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
{
-
+#ifdef CONFIG_X86_32
char *name;
u32 fcr_set = 0;
u32 fcr_clr = 0;
@@ -337,8 +351,10 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
* 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
*/
clear_cpu_cap(c, 0*32+31);
-
+#endif
+ early_init_centaur(c);
switch (c->x86) {
+#ifdef CONFIG_X86_32
case 5:
switch (c->x86_model) {
case 4:
@@ -442,16 +458,20 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
}
sprintf(c->x86_model_id, "WinChip %s", name);
break;
-
+#endif
case 6:
init_c3(c);
break;
}
+#ifdef CONFIG_X86_64
+ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+#endif
}
static unsigned int __cpuinit
centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
{
+#ifdef CONFIG_X86_32
/* VIA C3 CPUs (670-68F) need further shifting. */
if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8)))
size >>= 8;
@@ -464,11 +484,11 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
if ((c->x86 == 6) && (c->x86_model == 9) &&
(c->x86_mask == 1) && (size == 65))
size -= 1;
-
+#endif
return size;
}
-static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
+static const struct cpu_dev __cpuinitconst centaur_cpu_dev = {
.c_vendor = "Centaur",
.c_ident = { "CentaurHauls" },
.c_early_init = early_init_centaur,
diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c
deleted file mode 100644
index a1625f5a1e7..00000000000
--- a/arch/x86/kernel/cpu/centaur_64.c
+++ /dev/null
@@ -1,37 +0,0 @@
-#include <linux/init.h>
-#include <linux/smp.h>
-
-#include <asm/cpufeature.h>
-#include <asm/processor.h>
-
-#include "cpu.h"
-
-static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
-{
- if (c->x86 == 0x6 && c->x86_model >= 0xf)
- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
-
- set_cpu_cap(c, X86_FEATURE_SYSENTER32);
-}
-
-static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
-{
- early_init_centaur(c);
-
- if (c->x86 == 0x6 && c->x86_model >= 0xf) {
- c->x86_cache_alignment = c->x86_clflush_size * 2;
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
- }
- set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
-}
-
-static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
- .c_vendor = "Centaur",
- .c_ident = { "CentaurHauls" },
- .c_early_init = early_init_centaur,
- .c_init = init_centaur,
- .c_x86_vendor = X86_VENDOR_CENTAUR,
-};
-
-cpu_dev_register(centaur_cpu_dev);
-
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 83492b1f93b..e2962cc1e27 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1,118 +1,129 @@
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/string.h>
#include <linux/bootmem.h>
+#include <linux/linkage.h>
#include <linux/bitops.h>
+#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/kgdb.h>
-#include <linux/topology.h>
+#include <linux/percpu.h>
+#include <linux/string.h>
#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/kgdb.h>
#include <linux/smp.h>
-#include <linux/percpu.h>
-#include <asm/i387.h>
-#include <asm/msr.h>
-#include <asm/io.h>
-#include <asm/linkage.h>
+#include <linux/io.h>
+
+#include <asm/stackprotector.h>
#include <asm/mmu_context.h>
+#include <asm/hypervisor.h>
+#include <asm/processor.h>
+#include <asm/sections.h>
+#include <asm/topology.h>
+#include <asm/cpumask.h>
+#include <asm/pgtable.h>
+#include <asm/atomic.h>
+#include <asm/proto.h>
+#include <asm/setup.h>
+#include <asm/apic.h>
+#include <asm/desc.h>
+#include <asm/i387.h>
#include <asm/mtrr.h>
+#include <asm/numa.h>
+#include <asm/asm.h>
+#include <asm/cpu.h>
#include <asm/mce.h>
+#include <asm/msr.h>
#include <asm/pat.h>
-#include <asm/asm.h>
-#include <asm/numa.h>
#include <asm/smp.h>
+
#ifdef CONFIG_X86_LOCAL_APIC
-#include <asm/mpspec.h>
-#include <asm/apic.h>
-#include <mach_apic.h>
-#include <asm/genapic.h>
+#include <asm/uv/uv.h>
#endif
-#include <asm/pda.h>
-#include <asm/pgtable.h>
-#include <asm/processor.h>
-#include <asm/desc.h>
-#include <asm/atomic.h>
-#include <asm/proto.h>
-#include <asm/sections.h>
-#include <asm/setup.h>
-#include <asm/hypervisor.h>
-
#include "cpu.h"
#ifdef CONFIG_X86_64
/* all of these masks are initialized in setup_cpu_local_masks() */
-cpumask_var_t cpu_callin_mask;
-cpumask_var_t cpu_callout_mask;
cpumask_var_t cpu_initialized_mask;
+cpumask_var_t cpu_callout_mask;
+cpumask_var_t cpu_callin_mask;
/* representing cpus for which sibling maps can be computed */
cpumask_var_t cpu_sibling_setup_mask;
+/* correctly size the local cpu masks */
+void __init setup_cpu_local_masks(void)
+{
+ alloc_bootmem_cpumask_var(&cpu_initialized_mask);
+ alloc_bootmem_cpumask_var(&cpu_callin_mask);
+ alloc_bootmem_cpumask_var(&cpu_callout_mask);
+ alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
+}
+
#else /* CONFIG_X86_32 */
-cpumask_t cpu_callin_map;
+cpumask_t cpu_sibling_setup_map;
cpumask_t cpu_callout_map;
cpumask_t cpu_initialized;
-cpumask_t cpu_sibling_setup_map;
+cpumask_t cpu_callin_map;
#endif /* CONFIG_X86_32 */
-static struct cpu_dev *this_cpu __cpuinitdata;
+static const struct cpu_dev *this_cpu __cpuinitdata;
+DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
#ifdef CONFIG_X86_64
-/* We need valid kernel segments for data and code in long mode too
- * IRET will check the segment types kkeil 2000/10/28
- * Also sysret mandates a special GDT layout
- */
-/* The TLS descriptors are currently at a different place compared to i386.
- Hopefully nobody expects them at a fixed place (Wine?) */
-DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
- [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
- [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
- [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
- [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
- [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
- [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
-} };
+ /*
+ * We need valid kernel segments for data and code in long mode too
+ * IRET will check the segment types kkeil 2000/10/28
+ * Also sysret mandates a special GDT layout
+ *
+ * TLS descriptors are currently at a different place compared to i386.
+ * Hopefully nobody expects them at a fixed place (Wine?)
+ */
+ [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
+ [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
+ [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
+ [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
+ [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
+ [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
#else
-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
- [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
- [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
- [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
- [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } },
+ [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
+ [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
+ [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
+ [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } },
/*
* Segments used for calling PnP BIOS have byte granularity.
* They code segments and data segments have fixed 64k limits,
* the transfer segment sizes are set at run time.
*/
/* 32-bit code */
- [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } },
+ [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } },
/* 16-bit code */
- [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } },
+ [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } },
/* 16-bit data */
- [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } },
+ [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } },
/* 16-bit data */
- [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } },
+ [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } },
/* 16-bit data */
- [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } },
+ [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } },
/*
* The APM segments have byte granularity and their bases
* are set at run time. All have 64k limits.
*/
/* 32-bit code */
- [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } },
+ [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } },
/* 16-bit code */
- [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } },
+ [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } },
/* data */
- [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
+ [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
- [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
- [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } },
-} };
+ [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
+ [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } },
+ GDT_STACK_CANARY_INIT
#endif
+} };
EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
#ifdef CONFIG_X86_32
@@ -153,16 +164,17 @@ static inline int flag_is_changeable_p(u32 flag)
* the CPUID. Add "volatile" to not allow gcc to
* optimize the subsequent calls to this function.
*/
- asm volatile ("pushfl\n\t"
- "pushfl\n\t"
- "popl %0\n\t"
- "movl %0,%1\n\t"
- "xorl %2,%0\n\t"
- "pushl %0\n\t"
- "popfl\n\t"
- "pushfl\n\t"
- "popl %0\n\t"
- "popfl\n\t"
+ asm volatile ("pushfl \n\t"
+ "pushfl \n\t"
+ "popl %0 \n\t"
+ "movl %0, %1 \n\t"
+ "xorl %2, %0 \n\t"
+ "pushl %0 \n\t"
+ "popfl \n\t"
+ "pushfl \n\t"
+ "popl %0 \n\t"
+ "popfl \n\t"
+
: "=&r" (f1), "=&r" (f2)
: "ir" (flag));
@@ -177,18 +189,22 @@ static int __cpuinit have_cpuid_p(void)
static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
{
- if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) {
- /* Disable processor serial number */
- unsigned long lo, hi;
- rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
- lo |= 0x200000;
- wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
- printk(KERN_NOTICE "CPU serial number disabled.\n");
- clear_cpu_cap(c, X86_FEATURE_PN);
-
- /* Disabling the serial number may affect the cpuid level */
- c->cpuid_level = cpuid_eax(0);
- }
+ unsigned long lo, hi;
+
+ if (!cpu_has(c, X86_FEATURE_PN) || !disable_x86_serial_nr)
+ return;
+
+ /* Disable processor serial number: */
+
+ rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
+ lo |= 0x200000;
+ wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
+
+ printk(KERN_NOTICE "CPU serial number disabled.\n");
+ clear_cpu_cap(c, X86_FEATURE_PN);
+
+ /* Disabling the serial number may affect the cpuid level */
+ c->cpuid_level = cpuid_eax(0);
}
static int __init x86_serial_nr_setup(char *s)
@@ -213,16 +229,64 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
#endif
/*
+ * Some CPU features depend on higher CPUID levels, which may not always
+ * be available due to CPUID level capping or broken virtualization
+ * software. Add those features to this table to auto-disable them.
+ */
+struct cpuid_dependent_feature {
+ u32 feature;
+ u32 level;
+};
+
+static const struct cpuid_dependent_feature __cpuinitconst
+cpuid_dependent_features[] = {
+ { X86_FEATURE_MWAIT, 0x00000005 },
+ { X86_FEATURE_DCA, 0x00000009 },
+ { X86_FEATURE_XSAVE, 0x0000000d },
+ { 0, 0 }
+};
+
+static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
+{
+ const struct cpuid_dependent_feature *df;
+
+ for (df = cpuid_dependent_features; df->feature; df++) {
+
+ if (!cpu_has(c, df->feature))
+ continue;
+ /*
+ * Note: cpuid_level is set to -1 if unavailable, but
+ * extended_extended_level is set to 0 if unavailable
+ * and the legitimate extended levels are all negative
+ * when signed; hence the weird messing around with
+ * signs here...
+ */
+ if (!((s32)df->level < 0 ?
+ (u32)df->level > (u32)c->extended_cpuid_level :
+ (s32)df->level > (s32)c->cpuid_level))
+ continue;
+
+ clear_cpu_cap(c, df->feature);
+ if (!warn)
+ continue;
+
+ printk(KERN_WARNING
+ "CPU: CPU feature %s disabled, no CPUID level 0x%x\n",
+ x86_cap_flags[df->feature], df->level);
+ }
+}
+
+/*
* Naming convention should be: <Name> [(<Codename>)]
* This table only is used unless init_<vendor>() below doesn't set it;
- * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used
- *
+ * in particular, if CPUID levels 0x80000002..4 are supported, this
+ * isn't used
*/
/* Look up CPU names by table lookup. */
-static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
+static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c)
{
- struct cpu_model_info *info;
+ const struct cpu_model_info *info;
if (c->x86_model >= 16)
return NULL; /* Range check */
@@ -242,21 +306,34 @@ static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
-/* Current gdt points %fs at the "master" per-cpu area: after this,
- * it's on the real one. */
-void switch_to_new_gdt(void)
+void load_percpu_segment(int cpu)
+{
+#ifdef CONFIG_X86_32
+ loadsegment(fs, __KERNEL_PERCPU);
+#else
+ loadsegment(gs, 0);
+ wrmsrl(MSR_GS_BASE, (unsigned long)per_cpu(irq_stack_union.gs_base, cpu));
+#endif
+ load_stack_canary_segment();
+}
+
+/*
+ * Current gdt points %fs at the "master" per-cpu area: after this,
+ * it's on the real one.
+ */
+void switch_to_new_gdt(int cpu)
{
struct desc_ptr gdt_descr;
- gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
+ gdt_descr.address = (long)get_cpu_gdt_table(cpu);
gdt_descr.size = GDT_SIZE - 1;
load_gdt(&gdt_descr);
-#ifdef CONFIG_X86_32
- asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
-#endif
+ /* Reload the per-cpu base */
+
+ load_percpu_segment(cpu);
}
-static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
+static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {};
static void __cpuinit default_init(struct cpuinfo_x86 *c)
{
@@ -275,7 +352,7 @@ static void __cpuinit default_init(struct cpuinfo_x86 *c)
#endif
}
-static struct cpu_dev __cpuinitdata default_cpu = {
+static const struct cpu_dev __cpuinitconst default_cpu = {
.c_init = default_init,
.c_vendor = "Unknown",
.c_x86_vendor = X86_VENDOR_UNKNOWN,
@@ -289,22 +366,24 @@ static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
if (c->extended_cpuid_level < 0x80000004)
return;
- v = (unsigned int *) c->x86_model_id;
+ v = (unsigned int *)c->x86_model_id;
cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
c->x86_model_id[48] = 0;
- /* Intel chips right-justify this string for some dumb reason;
- undo that brain damage */
+ /*
+ * Intel chips right-justify this string for some dumb reason;
+ * undo that brain damage:
+ */
p = q = &c->x86_model_id[0];
while (*p == ' ')
- p++;
+ p++;
if (p != q) {
- while (*p)
- *q++ = *p++;
- while (q <= &c->x86_model_id[48])
- *q++ = '\0'; /* Zero-pad the rest */
+ while (*p)
+ *q++ = *p++;
+ while (q <= &c->x86_model_id[48])
+ *q++ = '\0'; /* Zero-pad the rest */
}
}
@@ -373,36 +452,30 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
if (smp_num_siblings == 1) {
printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
- } else if (smp_num_siblings > 1) {
+ goto out;
+ }
- if (smp_num_siblings > nr_cpu_ids) {
- printk(KERN_WARNING "CPU: Unsupported number of siblings %d",
- smp_num_siblings);
- smp_num_siblings = 1;
- return;
- }
+ if (smp_num_siblings <= 1)
+ goto out;
- index_msb = get_count_order(smp_num_siblings);
-#ifdef CONFIG_X86_64
- c->phys_proc_id = phys_pkg_id(index_msb);
-#else
- c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb);
-#endif
+ if (smp_num_siblings > nr_cpu_ids) {
+ pr_warning("CPU: Unsupported number of siblings %d",
+ smp_num_siblings);
+ smp_num_siblings = 1;
+ return;
+ }
- smp_num_siblings = smp_num_siblings / c->x86_max_cores;
+ index_msb = get_count_order(smp_num_siblings);
+ c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
- index_msb = get_count_order(smp_num_siblings);
+ smp_num_siblings = smp_num_siblings / c->x86_max_cores;
- core_bits = get_count_order(c->x86_max_cores);
+ index_msb = get_count_order(smp_num_siblings);
-#ifdef CONFIG_X86_64
- c->cpu_core_id = phys_pkg_id(index_msb) &
- ((1 << core_bits) - 1);
-#else
- c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) &
- ((1 << core_bits) - 1);
-#endif
- }
+ core_bits = get_count_order(c->x86_max_cores);
+
+ c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) &
+ ((1 << core_bits) - 1);
out:
if ((c->x86_max_cores * smp_num_siblings) > 1) {
@@ -417,8 +490,8 @@ out:
static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
{
char *v = c->x86_vendor_id;
- int i;
static int printed;
+ int i;
for (i = 0; i < X86_VENDOR_NUM; i++) {
if (!cpu_devs[i])
@@ -427,6 +500,7 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
(cpu_devs[i]->c_ident[1] &&
!strcmp(v, cpu_devs[i]->c_ident[1]))) {
+
this_cpu = cpu_devs[i];
c->x86_vendor = this_cpu->c_x86_vendor;
return;
@@ -435,7 +509,9 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
if (!printed) {
printed++;
- printk(KERN_ERR "CPU: vendor_id '%s' unknown, using generic init.\n", v);
+ printk(KERN_ERR
+ "CPU: vendor_id '%s' unknown, using generic init.\n", v);
+
printk(KERN_ERR "CPU: Your system may be unstable.\n");
}
@@ -455,14 +531,17 @@ void __cpuinit cpu_detect(struct cpuinfo_x86 *c)
/* Intel-defined flags: level 0x00000001 */
if (c->cpuid_level >= 0x00000001) {
u32 junk, tfms, cap0, misc;
+
cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
c->x86 = (tfms >> 8) & 0xf;
c->x86_model = (tfms >> 4) & 0xf;
c->x86_mask = tfms & 0xf;
+
if (c->x86 == 0xf)
c->x86 += (tfms >> 20) & 0xff;
if (c->x86 >= 0x6)
c->x86_model += ((tfms >> 16) & 0xf) << 4;
+
if (cap0 & (1<<19)) {
c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
c->x86_cache_alignment = c->x86_clflush_size;
@@ -478,6 +557,7 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
/* Intel-defined flags: level 0x00000001 */
if (c->cpuid_level >= 0x00000001) {
u32 capability, excap;
+
cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
c->x86_capability[0] = capability;
c->x86_capability[4] = excap;
@@ -486,6 +566,7 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
/* AMD-defined flags: level 0x80000001 */
xlvl = cpuid_eax(0x80000000);
c->extended_cpuid_level = xlvl;
+
if ((xlvl & 0xffff0000) == 0x80000000) {
if (xlvl >= 0x80000001) {
c->x86_capability[1] = cpuid_edx(0x80000001);
@@ -493,13 +574,15 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
}
}
-#ifdef CONFIG_X86_64
if (c->extended_cpuid_level >= 0x80000008) {
u32 eax = cpuid_eax(0x80000008);
c->x86_virt_bits = (eax >> 8) & 0xff;
c->x86_phys_bits = eax & 0xff;
}
+#ifdef CONFIG_X86_32
+ else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36))
+ c->x86_phys_bits = 36;
#endif
if (c->extended_cpuid_level >= 0x80000007)
@@ -546,8 +629,12 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_64
c->x86_clflush_size = 64;
+ c->x86_phys_bits = 36;
+ c->x86_virt_bits = 48;
#else
c->x86_clflush_size = 32;
+ c->x86_phys_bits = 32;
+ c->x86_virt_bits = 32;
#endif
c->x86_cache_alignment = c->x86_clflush_size;
@@ -570,21 +657,20 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
if (this_cpu->c_early_init)
this_cpu->c_early_init(c);
- validate_pat_support(c);
-
#ifdef CONFIG_SMP
c->cpu_index = boot_cpu_id;
#endif
+ filter_cpuid_features(c, false);
}
void __init early_cpu_init(void)
{
- struct cpu_dev **cdev;
+ const struct cpu_dev *const *cdev;
int count = 0;
- printk("KERNEL supported cpus:\n");
+ printk(KERN_INFO "KERNEL supported cpus:\n");
for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
- struct cpu_dev *cpudev = *cdev;
+ const struct cpu_dev *cpudev = *cdev;
unsigned int j;
if (count >= X86_VENDOR_NUM)
@@ -595,7 +681,7 @@ void __init early_cpu_init(void)
for (j = 0; j < 2; j++) {
if (!cpudev->c_ident[j])
continue;
- printk(" %s %s\n", cpudev->c_vendor,
+ printk(KERN_INFO " %s %s\n", cpudev->c_vendor,
cpudev->c_ident[j]);
}
}
@@ -637,7 +723,7 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF;
#ifdef CONFIG_X86_32
# ifdef CONFIG_X86_HT
- c->apicid = phys_pkg_id(c->initial_apicid, 0);
+ c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
# else
c->apicid = c->initial_apicid;
# endif
@@ -671,9 +757,13 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
c->x86_coreid_bits = 0;
#ifdef CONFIG_X86_64
c->x86_clflush_size = 64;
+ c->x86_phys_bits = 36;
+ c->x86_virt_bits = 48;
#else
c->cpuid_level = -1; /* CPUID not detected */
c->x86_clflush_size = 32;
+ c->x86_phys_bits = 32;
+ c->x86_virt_bits = 32;
#endif
c->x86_cache_alignment = c->x86_clflush_size;
memset(&c->x86_capability, 0, sizeof c->x86_capability);
@@ -684,7 +774,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
this_cpu->c_identify(c);
#ifdef CONFIG_X86_64
- c->apicid = phys_pkg_id(0);
+ c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
#endif
/*
@@ -704,13 +794,16 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
squash_the_stupid_serial_number(c);
/*
- * The vendor-specific functions might have changed features. Now
- * we do "generic changes."
+ * The vendor-specific functions might have changed features.
+ * Now we do "generic changes."
*/
+ /* Filter out anything that depends on CPUID levels we don't have */
+ filter_cpuid_features(c, true);
+
/* If the model name is still unset, do table lookup. */
if (!c->x86_model_id[0]) {
- char *p;
+ const char *p;
p = table_lookup_model(c);
if (p)
strcpy(c->x86_model_id, p);
@@ -785,11 +878,11 @@ void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
}
struct msr_range {
- unsigned min;
- unsigned max;
+ unsigned min;
+ unsigned max;
};
-static struct msr_range msr_range_array[] __cpuinitdata = {
+static const struct msr_range msr_range_array[] __cpuinitconst = {
{ 0x00000000, 0x00000418},
{ 0xc0000000, 0xc000040b},
{ 0xc0010000, 0xc0010142},
@@ -798,14 +891,15 @@ static struct msr_range msr_range_array[] __cpuinitdata = {
static void __cpuinit print_cpu_msr(void)
{
+ unsigned index_min, index_max;
unsigned index;
u64 val;
int i;
- unsigned index_min, index_max;
for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) {
index_min = msr_range_array[i].min;
index_max = msr_range_array[i].max;
+
for (index = index_min; index < index_max; index++) {
if (rdmsrl_amd_safe(index, &val))
continue;
@@ -815,6 +909,7 @@ static void __cpuinit print_cpu_msr(void)
}
static int show_msr __cpuinitdata;
+
static __init int setup_show_msr(char *arg)
{
int num;
@@ -836,12 +931,14 @@ __setup("noclflush", setup_noclflush);
void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
{
- char *vendor = NULL;
+ const char *vendor = NULL;
- if (c->x86_vendor < X86_VENDOR_NUM)
+ if (c->x86_vendor < X86_VENDOR_NUM) {
vendor = this_cpu->c_vendor;
- else if (c->cpuid_level >= 0)
- vendor = c->x86_vendor_id;
+ } else {
+ if (c->cpuid_level >= 0)
+ vendor = c->x86_vendor_id;
+ }
if (vendor && !strstr(c->x86_model_id, vendor))
printk(KERN_CONT "%s ", vendor);
@@ -868,65 +965,45 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
static __init int setup_disablecpuid(char *arg)
{
int bit;
+
if (get_option(&arg, &bit) && bit < NCAPINTS*32)
setup_clear_cpu_cap(bit);
else
return 0;
+
return 1;
}
__setup("clearcpuid=", setup_disablecpuid);
#ifdef CONFIG_X86_64
-struct x8664_pda **_cpu_pda __read_mostly;
-EXPORT_SYMBOL(_cpu_pda);
-
struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
-static char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
+DEFINE_PER_CPU_FIRST(union irq_stack_union,
+ irq_stack_union) __aligned(PAGE_SIZE);
-void __cpuinit pda_init(int cpu)
-{
- struct x8664_pda *pda = cpu_pda(cpu);
+DEFINE_PER_CPU(char *, irq_stack_ptr) =
+ init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
- /* Setup up data that may be needed in __get_free_pages early */
- loadsegment(fs, 0);
- loadsegment(gs, 0);
- /* Memory clobbers used to order PDA accessed */
- mb();
- wrmsrl(MSR_GS_BASE, pda);
- mb();
-
- pda->cpunumber = cpu;
- pda->irqcount = -1;
- pda->kernelstack = (unsigned long)stack_thread_info() -
- PDA_STACKOFFSET + THREAD_SIZE;
- pda->active_mm = &init_mm;
- pda->mmu_state = 0;
-
- if (cpu == 0) {
- /* others are initialized in smpboot.c */
- pda->pcurrent = &init_task;
- pda->irqstackptr = boot_cpu_stack;
- pda->irqstackptr += IRQSTACKSIZE - 64;
- } else {
- if (!pda->irqstackptr) {
- pda->irqstackptr = (char *)
- __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
- if (!pda->irqstackptr)
- panic("cannot allocate irqstack for cpu %d",
- cpu);
- pda->irqstackptr += IRQSTACKSIZE - 64;
- }
+DEFINE_PER_CPU(unsigned long, kernel_stack) =
+ (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE;
+EXPORT_PER_CPU_SYMBOL(kernel_stack);
- if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
- pda->nodenumber = cpu_to_node(cpu);
- }
-}
+DEFINE_PER_CPU(unsigned int, irq_count) = -1;
-static char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
- DEBUG_STKSZ] __page_aligned_bss;
+/*
+ * Special IST stacks which the CPU switches to when it calls
+ * an IST-marked descriptor entry. Up to 7 stacks (hardware
+ * limit), all of them are 4K, except the debug stack which
+ * is 8K.
+ */
+static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
+ [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
+ [DEBUG_STACK - 1] = DEBUG_STKSZ
+};
-extern asmlinkage void ignore_sysret(void);
+static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+ [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ])
+ __aligned(PAGE_SIZE);
/* May not be marked __init: used by software suspend */
void syscall_init(void)
@@ -957,16 +1034,38 @@ unsigned long kernel_eflags;
*/
DEFINE_PER_CPU(struct orig_ist, orig_ist);
-#else
+#else /* CONFIG_X86_64 */
+
+#ifdef CONFIG_CC_STACKPROTECTOR
+DEFINE_PER_CPU(unsigned long, stack_canary);
+#endif
-/* Make sure %fs is initialized properly in idle threads */
+/* Make sure %fs and %gs are initialized properly in idle threads */
struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
{
memset(regs, 0, sizeof(struct pt_regs));
regs->fs = __KERNEL_PERCPU;
+ regs->gs = __KERNEL_STACK_CANARY;
+
return regs;
}
-#endif
+#endif /* CONFIG_X86_64 */
+
+/*
+ * Clear all 6 debug registers:
+ */
+static void clear_all_debug_regs(void)
+{
+ int i;
+
+ for (i = 0; i < 8; i++) {
+ /* Ignore db4, db5 */
+ if ((i == 4) || (i == 5))
+ continue;
+
+ set_debugreg(0, i);
+ }
+}
/*
* cpu_init() initializes state that is per-CPU. Some data is already
@@ -976,21 +1075,25 @@ struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
* A lot of state is already set up in PDA init for 64 bit
*/
#ifdef CONFIG_X86_64
+
void __cpuinit cpu_init(void)
{
- int cpu = stack_smp_processor_id();
- struct tss_struct *t = &per_cpu(init_tss, cpu);
- struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
- unsigned long v;
- char *estacks = NULL;
+ struct orig_ist *orig_ist;
struct task_struct *me;
+ struct tss_struct *t;
+ unsigned long v;
+ int cpu;
int i;
- /* CPU 0 is initialised in head64.c */
- if (cpu != 0)
- pda_init(cpu);
- else
- estacks = boot_exception_stacks;
+ cpu = stack_smp_processor_id();
+ t = &per_cpu(init_tss, cpu);
+ orig_ist = &per_cpu(orig_ist, cpu);
+
+#ifdef CONFIG_NUMA
+ if (cpu != 0 && percpu_read(node_number) == 0 &&
+ cpu_to_node(cpu) != NUMA_NO_NODE)
+ percpu_write(node_number, cpu_to_node(cpu));
+#endif
me = current;
@@ -1006,7 +1109,9 @@ void __cpuinit cpu_init(void)
* and set up the GDT descriptor:
*/
- switch_to_new_gdt();
+ switch_to_new_gdt(cpu);
+ loadsegment(fs, 0);
+
load_idt((const struct desc_ptr *)&idt_descr);
memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
@@ -1017,31 +1122,24 @@ void __cpuinit cpu_init(void)
barrier();
check_efer();
- if (cpu != 0 && x2apic)
+ if (cpu != 0)
enable_x2apic();
/*
* set up and load the per-CPU TSS
*/
if (!orig_ist->ist[0]) {
- static const unsigned int order[N_EXCEPTION_STACKS] = {
- [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
- [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
- };
+ char *estacks = per_cpu(exception_stacks, cpu);
+
for (v = 0; v < N_EXCEPTION_STACKS; v++) {
- if (cpu) {
- estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
- if (!estacks)
- panic("Cannot allocate exception "
- "stack %ld %d\n", v, cpu);
- }
- estacks += PAGE_SIZE << order[v];
+ estacks += exception_stack_sizes[v];
orig_ist->ist[v] = t->x86_tss.ist[v] =
(unsigned long)estacks;
}
}
t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+
/*
* <= is required because the CPU will access up to
* 8 bits beyond the end of the IO permission bitmap.
@@ -1051,8 +1149,7 @@ void __cpuinit cpu_init(void)
atomic_inc(&init_mm.mm_count);
me->active_mm = &init_mm;
- if (me->mm)
- BUG();
+ BUG_ON(me->mm);
enter_lazy_tlb(&init_mm, me);
load_sp0(t, &current->thread);
@@ -1069,22 +1166,9 @@ void __cpuinit cpu_init(void)
*/
if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
arch_kgdb_ops.correct_hw_break();
- else {
-#endif
- /*
- * Clear all 6 debug registers:
- */
-
- set_debugreg(0UL, 0);
- set_debugreg(0UL, 1);
- set_debugreg(0UL, 2);
- set_debugreg(0UL, 3);
- set_debugreg(0UL, 6);
- set_debugreg(0UL, 7);
-#ifdef CONFIG_KGDB
- /* If the kgdb is connected no debug regs should be altered. */
- }
+ else
#endif
+ clear_all_debug_regs();
fpu_init();
@@ -1105,7 +1189,8 @@ void __cpuinit cpu_init(void)
if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) {
printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
- for (;;) local_irq_enable();
+ for (;;)
+ local_irq_enable();
}
printk(KERN_INFO "Initializing CPU#%d\n", cpu);
@@ -1114,15 +1199,14 @@ void __cpuinit cpu_init(void)
clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
load_idt(&idt_descr);
- switch_to_new_gdt();
+ switch_to_new_gdt(cpu);
/*
* Set up and load the per-CPU TSS and LDT
*/
atomic_inc(&init_mm.mm_count);
curr->active_mm = &init_mm;
- if (curr->mm)
- BUG();
+ BUG_ON(curr->mm);
enter_lazy_tlb(&init_mm, curr);
load_sp0(t, thread);
@@ -1135,16 +1219,7 @@ void __cpuinit cpu_init(void)
__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
#endif
- /* Clear %gs. */
- asm volatile ("mov %0, %%gs" : : "r" (0));
-
- /* Clear all 6 debug registers: */
- set_debugreg(0, 0);
- set_debugreg(0, 1);
- set_debugreg(0, 2);
- set_debugreg(0, 3);
- set_debugreg(0, 6);
- set_debugreg(0, 7);
+ clear_all_debug_regs();
/*
* Force FPU initialization:
@@ -1164,6 +1239,4 @@ void __cpuinit cpu_init(void)
xsave_init();
}
-
-
#endif
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index de4094a3921..9469ecb5aeb 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -5,15 +5,15 @@
struct cpu_model_info {
int vendor;
int family;
- char *model_names[16];
+ const char *model_names[16];
};
/* attempt to consolidate cpu attributes */
struct cpu_dev {
- char * c_vendor;
+ const char * c_vendor;
/* some have two possibilities for cpuid string */
- char * c_ident[2];
+ const char * c_ident[2];
struct cpu_model_info c_models[4];
@@ -25,11 +25,12 @@ struct cpu_dev {
};
#define cpu_dev_register(cpu_devX) \
- static struct cpu_dev *__cpu_dev_##cpu_devX __used \
+ static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \
__attribute__((__section__(".x86_cpu_dev.init"))) = \
&cpu_devX;
-extern struct cpu_dev *__x86_cpu_dev_start[], *__x86_cpu_dev_end[];
+extern const struct cpu_dev *const __x86_cpu_dev_start[],
+ *const __x86_cpu_dev_end[];
extern void display_cacheinfo(struct cpuinfo_x86 *c);
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
new file mode 100755
index 00000000000..46e29ab96c6
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpu_debug.c
@@ -0,0 +1,901 @@
+/*
+ * CPU x86 architecture debug code
+ *
+ * Copyright(C) 2009 Jaswinder Singh Rajput
+ *
+ * For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/interrupt.h>
+#include <linux/compiler.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/kprobes.h>
+#include <linux/uaccess.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/signal.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+
+#include <asm/cpu_debug.h>
+#include <asm/paravirt.h>
+#include <asm/system.h>
+#include <asm/traps.h>
+#include <asm/apic.h>
+#include <asm/desc.h>
+
+static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]);
+static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]);
+static DEFINE_PER_CPU(unsigned, cpu_modelflag);
+static DEFINE_PER_CPU(int, cpu_priv_count);
+static DEFINE_PER_CPU(unsigned, cpu_model);
+
+static DEFINE_MUTEX(cpu_debug_lock);
+
+static struct dentry *cpu_debugfs_dir;
+
+static struct cpu_debug_base cpu_base[] = {
+ { "mc", CPU_MC, 0 },
+ { "monitor", CPU_MONITOR, 0 },
+ { "time", CPU_TIME, 0 },
+ { "pmc", CPU_PMC, 1 },
+ { "platform", CPU_PLATFORM, 0 },
+ { "apic", CPU_APIC, 0 },
+ { "poweron", CPU_POWERON, 0 },
+ { "control", CPU_CONTROL, 0 },
+ { "features", CPU_FEATURES, 0 },
+ { "lastbranch", CPU_LBRANCH, 0 },
+ { "bios", CPU_BIOS, 0 },
+ { "freq", CPU_FREQ, 0 },
+ { "mtrr", CPU_MTRR, 0 },
+ { "perf", CPU_PERF, 0 },
+ { "cache", CPU_CACHE, 0 },
+ { "sysenter", CPU_SYSENTER, 0 },
+ { "therm", CPU_THERM, 0 },
+ { "misc", CPU_MISC, 0 },
+ { "debug", CPU_DEBUG, 0 },
+ { "pat", CPU_PAT, 0 },
+ { "vmx", CPU_VMX, 0 },
+ { "call", CPU_CALL, 0 },
+ { "base", CPU_BASE, 0 },
+ { "ver", CPU_VER, 0 },
+ { "conf", CPU_CONF, 0 },
+ { "smm", CPU_SMM, 0 },
+ { "svm", CPU_SVM, 0 },
+ { "osvm", CPU_OSVM, 0 },
+ { "tss", CPU_TSS, 0 },
+ { "cr", CPU_CR, 0 },
+ { "dt", CPU_DT, 0 },
+ { "registers", CPU_REG_ALL, 0 },
+};
+
+static struct cpu_file_base cpu_file[] = {
+ { "index", CPU_REG_ALL, 0 },
+ { "value", CPU_REG_ALL, 1 },
+};
+
+/* Intel Registers Range */
+static struct cpu_debug_range cpu_intel_range[] = {
+ { 0x00000000, 0x00000001, CPU_MC, CPU_INTEL_ALL },
+ { 0x00000006, 0x00000007, CPU_MONITOR, CPU_CX_AT_XE },
+ { 0x00000010, 0x00000010, CPU_TIME, CPU_INTEL_ALL },
+ { 0x00000011, 0x00000013, CPU_PMC, CPU_INTEL_PENTIUM },
+ { 0x00000017, 0x00000017, CPU_PLATFORM, CPU_PX_CX_AT_XE },
+ { 0x0000001B, 0x0000001B, CPU_APIC, CPU_P6_CX_AT_XE },
+
+ { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_PX_CX_AT_XE },
+ { 0x0000002B, 0x0000002B, CPU_POWERON, CPU_INTEL_XEON },
+ { 0x0000002C, 0x0000002C, CPU_FREQ, CPU_INTEL_XEON },
+ { 0x0000003A, 0x0000003A, CPU_CONTROL, CPU_CX_AT_XE },
+
+ { 0x00000040, 0x00000043, CPU_LBRANCH, CPU_PM_CX_AT_XE },
+ { 0x00000044, 0x00000047, CPU_LBRANCH, CPU_PM_CO_AT },
+ { 0x00000060, 0x00000063, CPU_LBRANCH, CPU_C2_AT },
+ { 0x00000064, 0x00000067, CPU_LBRANCH, CPU_INTEL_ATOM },
+
+ { 0x00000079, 0x00000079, CPU_BIOS, CPU_P6_CX_AT_XE },
+ { 0x00000088, 0x0000008A, CPU_CACHE, CPU_INTEL_P6 },
+ { 0x0000008B, 0x0000008B, CPU_BIOS, CPU_P6_CX_AT_XE },
+ { 0x0000009B, 0x0000009B, CPU_MONITOR, CPU_INTEL_XEON },
+
+ { 0x000000C1, 0x000000C2, CPU_PMC, CPU_P6_CX_AT },
+ { 0x000000CD, 0x000000CD, CPU_FREQ, CPU_CX_AT },
+ { 0x000000E7, 0x000000E8, CPU_PERF, CPU_CX_AT },
+ { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_P6_CX_XE },
+
+ { 0x00000116, 0x00000116, CPU_CACHE, CPU_INTEL_P6 },
+ { 0x00000118, 0x00000118, CPU_CACHE, CPU_INTEL_P6 },
+ { 0x00000119, 0x00000119, CPU_CACHE, CPU_INTEL_PX },
+ { 0x0000011A, 0x0000011B, CPU_CACHE, CPU_INTEL_P6 },
+ { 0x0000011E, 0x0000011E, CPU_CACHE, CPU_PX_CX_AT },
+
+ { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_P6_CX_AT_XE },
+ { 0x00000179, 0x0000017A, CPU_MC, CPU_PX_CX_AT_XE },
+ { 0x0000017B, 0x0000017B, CPU_MC, CPU_P6_XE },
+ { 0x00000186, 0x00000187, CPU_PMC, CPU_P6_CX_AT },
+ { 0x00000198, 0x00000199, CPU_PERF, CPU_PM_CX_AT_XE },
+ { 0x0000019A, 0x0000019A, CPU_TIME, CPU_PM_CX_AT_XE },
+ { 0x0000019B, 0x0000019D, CPU_THERM, CPU_PM_CX_AT_XE },
+ { 0x000001A0, 0x000001A0, CPU_MISC, CPU_PM_CX_AT_XE },
+
+ { 0x000001C9, 0x000001C9, CPU_LBRANCH, CPU_PM_CX_AT },
+ { 0x000001D7, 0x000001D8, CPU_LBRANCH, CPU_INTEL_XEON },
+ { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_CX_AT_XE },
+ { 0x000001DA, 0x000001DA, CPU_LBRANCH, CPU_INTEL_XEON },
+ { 0x000001DB, 0x000001DB, CPU_LBRANCH, CPU_P6_XE },
+ { 0x000001DC, 0x000001DC, CPU_LBRANCH, CPU_INTEL_P6 },
+ { 0x000001DD, 0x000001DE, CPU_LBRANCH, CPU_PX_CX_AT_XE },
+ { 0x000001E0, 0x000001E0, CPU_LBRANCH, CPU_INTEL_P6 },
+
+ { 0x00000200, 0x0000020F, CPU_MTRR, CPU_P6_CX_XE },
+ { 0x00000250, 0x00000250, CPU_MTRR, CPU_P6_CX_XE },
+ { 0x00000258, 0x00000259, CPU_MTRR, CPU_P6_CX_XE },
+ { 0x00000268, 0x0000026F, CPU_MTRR, CPU_P6_CX_XE },
+ { 0x00000277, 0x00000277, CPU_PAT, CPU_C2_AT_XE },
+ { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_P6_CX_XE },
+
+ { 0x00000300, 0x00000308, CPU_PMC, CPU_INTEL_XEON },
+ { 0x00000309, 0x0000030B, CPU_PMC, CPU_C2_AT_XE },
+ { 0x0000030C, 0x00000311, CPU_PMC, CPU_INTEL_XEON },
+ { 0x00000345, 0x00000345, CPU_PMC, CPU_C2_AT },
+ { 0x00000360, 0x00000371, CPU_PMC, CPU_INTEL_XEON },
+ { 0x0000038D, 0x00000390, CPU_PMC, CPU_C2_AT },
+ { 0x000003A0, 0x000003BE, CPU_PMC, CPU_INTEL_XEON },
+ { 0x000003C0, 0x000003CD, CPU_PMC, CPU_INTEL_XEON },
+ { 0x000003E0, 0x000003E1, CPU_PMC, CPU_INTEL_XEON },
+ { 0x000003F0, 0x000003F0, CPU_PMC, CPU_INTEL_XEON },
+ { 0x000003F1, 0x000003F1, CPU_PMC, CPU_C2_AT_XE },
+ { 0x000003F2, 0x000003F2, CPU_PMC, CPU_INTEL_XEON },
+
+ { 0x00000400, 0x00000402, CPU_MC, CPU_PM_CX_AT_XE },
+ { 0x00000403, 0x00000403, CPU_MC, CPU_INTEL_XEON },
+ { 0x00000404, 0x00000406, CPU_MC, CPU_PM_CX_AT_XE },
+ { 0x00000407, 0x00000407, CPU_MC, CPU_INTEL_XEON },
+ { 0x00000408, 0x0000040A, CPU_MC, CPU_PM_CX_AT_XE },
+ { 0x0000040B, 0x0000040B, CPU_MC, CPU_INTEL_XEON },
+ { 0x0000040C, 0x0000040E, CPU_MC, CPU_PM_CX_XE },
+ { 0x0000040F, 0x0000040F, CPU_MC, CPU_INTEL_XEON },
+ { 0x00000410, 0x00000412, CPU_MC, CPU_PM_CX_AT_XE },
+ { 0x00000413, 0x00000417, CPU_MC, CPU_CX_AT_XE },
+ { 0x00000480, 0x0000048B, CPU_VMX, CPU_CX_AT_XE },
+
+ { 0x00000600, 0x00000600, CPU_DEBUG, CPU_PM_CX_AT_XE },
+ { 0x00000680, 0x0000068F, CPU_LBRANCH, CPU_INTEL_XEON },
+ { 0x000006C0, 0x000006CF, CPU_LBRANCH, CPU_INTEL_XEON },
+
+ { 0x000107CC, 0x000107D3, CPU_PMC, CPU_INTEL_XEON_MP },
+
+ { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_INTEL_XEON },
+ { 0xC0000081, 0xC0000082, CPU_CALL, CPU_INTEL_XEON },
+ { 0xC0000084, 0xC0000084, CPU_CALL, CPU_INTEL_XEON },
+ { 0xC0000100, 0xC0000102, CPU_BASE, CPU_INTEL_XEON },
+};
+
+/* AMD Registers Range */
+static struct cpu_debug_range cpu_amd_range[] = {
+ { 0x00000000, 0x00000001, CPU_MC, CPU_K10_PLUS, },
+ { 0x00000010, 0x00000010, CPU_TIME, CPU_K8_PLUS, },
+ { 0x0000001B, 0x0000001B, CPU_APIC, CPU_K8_PLUS, },
+ { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_K7_PLUS },
+ { 0x0000008B, 0x0000008B, CPU_VER, CPU_K8_PLUS },
+ { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_K8_PLUS, },
+
+ { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_K8_PLUS, },
+ { 0x00000179, 0x0000017B, CPU_MC, CPU_K8_PLUS, },
+ { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_K8_PLUS, },
+ { 0x000001DB, 0x000001DE, CPU_LBRANCH, CPU_K8_PLUS, },
+
+ { 0x00000200, 0x0000020F, CPU_MTRR, CPU_K8_PLUS, },
+ { 0x00000250, 0x00000250, CPU_MTRR, CPU_K8_PLUS, },
+ { 0x00000258, 0x00000259, CPU_MTRR, CPU_K8_PLUS, },
+ { 0x00000268, 0x0000026F, CPU_MTRR, CPU_K8_PLUS, },
+ { 0x00000277, 0x00000277, CPU_PAT, CPU_K8_PLUS, },
+ { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_K8_PLUS, },
+
+ { 0x00000400, 0x00000413, CPU_MC, CPU_K8_PLUS, },
+
+ { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_AMD_ALL, },
+ { 0xC0000081, 0xC0000084, CPU_CALL, CPU_K8_PLUS, },
+ { 0xC0000100, 0xC0000102, CPU_BASE, CPU_K8_PLUS, },
+ { 0xC0000103, 0xC0000103, CPU_TIME, CPU_K10_PLUS, },
+
+ { 0xC0010000, 0xC0010007, CPU_PMC, CPU_K8_PLUS, },
+ { 0xC0010010, 0xC0010010, CPU_CONF, CPU_K7_PLUS, },
+ { 0xC0010015, 0xC0010015, CPU_CONF, CPU_K7_PLUS, },
+ { 0xC0010016, 0xC001001A, CPU_MTRR, CPU_K8_PLUS, },
+ { 0xC001001D, 0xC001001D, CPU_MTRR, CPU_K8_PLUS, },
+ { 0xC001001F, 0xC001001F, CPU_CONF, CPU_K8_PLUS, },
+ { 0xC0010030, 0xC0010035, CPU_BIOS, CPU_K8_PLUS, },
+ { 0xC0010044, 0xC0010048, CPU_MC, CPU_K8_PLUS, },
+ { 0xC0010050, 0xC0010056, CPU_SMM, CPU_K0F_PLUS, },
+ { 0xC0010058, 0xC0010058, CPU_CONF, CPU_K10_PLUS, },
+ { 0xC0010060, 0xC0010060, CPU_CACHE, CPU_AMD_11, },
+ { 0xC0010061, 0xC0010068, CPU_SMM, CPU_K10_PLUS, },
+ { 0xC0010069, 0xC001006B, CPU_SMM, CPU_AMD_11, },
+ { 0xC0010070, 0xC0010071, CPU_SMM, CPU_K10_PLUS, },
+ { 0xC0010111, 0xC0010113, CPU_SMM, CPU_K8_PLUS, },
+ { 0xC0010114, 0xC0010118, CPU_SVM, CPU_K10_PLUS, },
+ { 0xC0010140, 0xC0010141, CPU_OSVM, CPU_K10_PLUS, },
+ { 0xC0011022, 0xC0011023, CPU_CONF, CPU_K10_PLUS, },
+};
+
+
+/* Intel */
+static int get_intel_modelflag(unsigned model)
+{
+ int flag;
+
+ switch (model) {
+ case 0x0501:
+ case 0x0502:
+ case 0x0504:
+ flag = CPU_INTEL_PENTIUM;
+ break;
+ case 0x0601:
+ case 0x0603:
+ case 0x0605:
+ case 0x0607:
+ case 0x0608:
+ case 0x060A:
+ case 0x060B:
+ flag = CPU_INTEL_P6;
+ break;
+ case 0x0609:
+ case 0x060D:
+ flag = CPU_INTEL_PENTIUM_M;
+ break;
+ case 0x060E:
+ flag = CPU_INTEL_CORE;
+ break;
+ case 0x060F:
+ case 0x0617:
+ flag = CPU_INTEL_CORE2;
+ break;
+ case 0x061C:
+ flag = CPU_INTEL_ATOM;
+ break;
+ case 0x0F00:
+ case 0x0F01:
+ case 0x0F02:
+ case 0x0F03:
+ case 0x0F04:
+ flag = CPU_INTEL_XEON_P4;
+ break;
+ case 0x0F06:
+ flag = CPU_INTEL_XEON_MP;
+ break;
+ default:
+ flag = CPU_NONE;
+ break;
+ }
+
+ return flag;
+}
+
+/* AMD */
+static int get_amd_modelflag(unsigned model)
+{
+ int flag;
+
+ switch (model >> 8) {
+ case 0x6:
+ flag = CPU_AMD_K6;
+ break;
+ case 0x7:
+ flag = CPU_AMD_K7;
+ break;
+ case 0x8:
+ flag = CPU_AMD_K8;
+ break;
+ case 0xf:
+ flag = CPU_AMD_0F;
+ break;
+ case 0x10:
+ flag = CPU_AMD_10;
+ break;
+ case 0x11:
+ flag = CPU_AMD_11;
+ break;
+ default:
+ flag = CPU_NONE;
+ break;
+ }
+
+ return flag;
+}
+
+static int get_cpu_modelflag(unsigned cpu)
+{
+ int flag;
+
+ flag = per_cpu(cpu_model, cpu);
+
+ switch (flag >> 16) {
+ case X86_VENDOR_INTEL:
+ flag = get_intel_modelflag(flag);
+ break;
+ case X86_VENDOR_AMD:
+ flag = get_amd_modelflag(flag & 0xffff);
+ break;
+ default:
+ flag = CPU_NONE;
+ break;
+ }
+
+ return flag;
+}
+
+static int get_cpu_range_count(unsigned cpu)
+{
+ int index;
+
+ switch (per_cpu(cpu_model, cpu) >> 16) {
+ case X86_VENDOR_INTEL:
+ index = ARRAY_SIZE(cpu_intel_range);
+ break;
+ case X86_VENDOR_AMD:
+ index = ARRAY_SIZE(cpu_amd_range);
+ break;
+ default:
+ index = 0;
+ break;
+ }
+
+ return index;
+}
+
+static int is_typeflag_valid(unsigned cpu, unsigned flag)
+{
+ unsigned vendor, modelflag;
+ int i, index;
+
+ /* Standard Registers should be always valid */
+ if (flag >= CPU_TSS)
+ return 1;
+
+ modelflag = per_cpu(cpu_modelflag, cpu);
+ vendor = per_cpu(cpu_model, cpu) >> 16;
+ index = get_cpu_range_count(cpu);
+
+ for (i = 0; i < index; i++) {
+ switch (vendor) {
+ case X86_VENDOR_INTEL:
+ if ((cpu_intel_range[i].model & modelflag) &&
+ (cpu_intel_range[i].flag & flag))
+ return 1;
+ break;
+ case X86_VENDOR_AMD:
+ if ((cpu_amd_range[i].model & modelflag) &&
+ (cpu_amd_range[i].flag & flag))
+ return 1;
+ break;
+ }
+ }
+
+ /* Invalid */
+ return 0;
+}
+
+static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max,
+ int index, unsigned flag)
+{
+ unsigned modelflag;
+
+ modelflag = per_cpu(cpu_modelflag, cpu);
+ *max = 0;
+ switch (per_cpu(cpu_model, cpu) >> 16) {
+ case X86_VENDOR_INTEL:
+ if ((cpu_intel_range[index].model & modelflag) &&
+ (cpu_intel_range[index].flag & flag)) {
+ *min = cpu_intel_range[index].min;
+ *max = cpu_intel_range[index].max;
+ }
+ break;
+ case X86_VENDOR_AMD:
+ if ((cpu_amd_range[index].model & modelflag) &&
+ (cpu_amd_range[index].flag & flag)) {
+ *min = cpu_amd_range[index].min;
+ *max = cpu_amd_range[index].max;
+ }
+ break;
+ }
+
+ return *max;
+}
+
+/* This function can also be called with seq = NULL for printk */
+static void print_cpu_data(struct seq_file *seq, unsigned type,
+ u32 low, u32 high)
+{
+ struct cpu_private *priv;
+ u64 val = high;
+
+ if (seq) {
+ priv = seq->private;
+ if (priv->file) {
+ val = (val << 32) | low;
+ seq_printf(seq, "0x%llx\n", val);
+ } else
+ seq_printf(seq, " %08x: %08x_%08x\n",
+ type, high, low);
+ } else
+ printk(KERN_INFO " %08x: %08x_%08x\n", type, high, low);
+}
+
+/* This function can also be called with seq = NULL for printk */
+static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
+{
+ unsigned msr, msr_min, msr_max;
+ struct cpu_private *priv;
+ u32 low, high;
+ int i, range;
+
+ if (seq) {
+ priv = seq->private;
+ if (priv->file) {
+ if (!rdmsr_safe_on_cpu(priv->cpu, priv->reg,
+ &low, &high))
+ print_cpu_data(seq, priv->reg, low, high);
+ return;
+ }
+ }
+
+ range = get_cpu_range_count(cpu);
+
+ for (i = 0; i < range; i++) {
+ if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag))
+ continue;
+
+ for (msr = msr_min; msr <= msr_max; msr++) {
+ if (rdmsr_safe_on_cpu(cpu, msr, &low, &high))
+ continue;
+ print_cpu_data(seq, msr, low, high);
+ }
+ }
+}
+
+static void print_tss(void *arg)
+{
+ struct pt_regs *regs = task_pt_regs(current);
+ struct seq_file *seq = arg;
+ unsigned int seg;
+
+ seq_printf(seq, " RAX\t: %016lx\n", regs->ax);
+ seq_printf(seq, " RBX\t: %016lx\n", regs->bx);
+ seq_printf(seq, " RCX\t: %016lx\n", regs->cx);
+ seq_printf(seq, " RDX\t: %016lx\n", regs->dx);
+
+ seq_printf(seq, " RSI\t: %016lx\n", regs->si);
+ seq_printf(seq, " RDI\t: %016lx\n", regs->di);
+ seq_printf(seq, " RBP\t: %016lx\n", regs->bp);
+ seq_printf(seq, " ESP\t: %016lx\n", regs->sp);
+
+#ifdef CONFIG_X86_64
+ seq_printf(seq, " R08\t: %016lx\n", regs->r8);
+ seq_printf(seq, " R09\t: %016lx\n", regs->r9);
+ seq_printf(seq, " R10\t: %016lx\n", regs->r10);
+ seq_printf(seq, " R11\t: %016lx\n", regs->r11);
+ seq_printf(seq, " R12\t: %016lx\n", regs->r12);
+ seq_printf(seq, " R13\t: %016lx\n", regs->r13);
+ seq_printf(seq, " R14\t: %016lx\n", regs->r14);
+ seq_printf(seq, " R15\t: %016lx\n", regs->r15);
+#endif
+
+ asm("movl %%cs,%0" : "=r" (seg));
+ seq_printf(seq, " CS\t: %04x\n", seg);
+ asm("movl %%ds,%0" : "=r" (seg));
+ seq_printf(seq, " DS\t: %04x\n", seg);
+ seq_printf(seq, " SS\t: %04lx\n", regs->ss & 0xffff);
+ asm("movl %%es,%0" : "=r" (seg));
+ seq_printf(seq, " ES\t: %04x\n", seg);
+ asm("movl %%fs,%0" : "=r" (seg));
+ seq_printf(seq, " FS\t: %04x\n", seg);
+ asm("movl %%gs,%0" : "=r" (seg));
+ seq_printf(seq, " GS\t: %04x\n", seg);
+
+ seq_printf(seq, " EFLAGS\t: %016lx\n", regs->flags);
+
+ seq_printf(seq, " EIP\t: %016lx\n", regs->ip);
+}
+
+static void print_cr(void *arg)
+{
+ struct seq_file *seq = arg;
+
+ seq_printf(seq, " cr0\t: %016lx\n", read_cr0());
+ seq_printf(seq, " cr2\t: %016lx\n", read_cr2());
+ seq_printf(seq, " cr3\t: %016lx\n", read_cr3());
+ seq_printf(seq, " cr4\t: %016lx\n", read_cr4_safe());
+#ifdef CONFIG_X86_64
+ seq_printf(seq, " cr8\t: %016lx\n", read_cr8());
+#endif
+}
+
+static void print_desc_ptr(char *str, struct seq_file *seq, struct desc_ptr dt)
+{
+ seq_printf(seq, " %s\t: %016llx\n", str, (u64)(dt.address | dt.size));
+}
+
+static void print_dt(void *seq)
+{
+ struct desc_ptr dt;
+ unsigned long ldt;
+
+ /* IDT */
+ store_idt((struct desc_ptr *)&dt);
+ print_desc_ptr("IDT", seq, dt);
+
+ /* GDT */
+ store_gdt((struct desc_ptr *)&dt);
+ print_desc_ptr("GDT", seq, dt);
+
+ /* LDT */
+ store_ldt(ldt);
+ seq_printf(seq, " LDT\t: %016lx\n", ldt);
+
+ /* TR */
+ store_tr(ldt);
+ seq_printf(seq, " TR\t: %016lx\n", ldt);
+}
+
+static void print_dr(void *arg)
+{
+ struct seq_file *seq = arg;
+ unsigned long dr;
+ int i;
+
+ for (i = 0; i < 8; i++) {
+ /* Ignore db4, db5 */
+ if ((i == 4) || (i == 5))
+ continue;
+ get_debugreg(dr, i);
+ seq_printf(seq, " dr%d\t: %016lx\n", i, dr);
+ }
+
+ seq_printf(seq, "\n MSR\t:\n");
+}
+
+static void print_apic(void *arg)
+{
+ struct seq_file *seq = arg;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ seq_printf(seq, " LAPIC\t:\n");
+ seq_printf(seq, " ID\t\t: %08x\n", apic_read(APIC_ID) >> 24);
+ seq_printf(seq, " LVR\t\t: %08x\n", apic_read(APIC_LVR));
+ seq_printf(seq, " TASKPRI\t: %08x\n", apic_read(APIC_TASKPRI));
+ seq_printf(seq, " ARBPRI\t\t: %08x\n", apic_read(APIC_ARBPRI));
+ seq_printf(seq, " PROCPRI\t: %08x\n", apic_read(APIC_PROCPRI));
+ seq_printf(seq, " LDR\t\t: %08x\n", apic_read(APIC_LDR));
+ seq_printf(seq, " DFR\t\t: %08x\n", apic_read(APIC_DFR));
+ seq_printf(seq, " SPIV\t\t: %08x\n", apic_read(APIC_SPIV));
+ seq_printf(seq, " ISR\t\t: %08x\n", apic_read(APIC_ISR));
+ seq_printf(seq, " ESR\t\t: %08x\n", apic_read(APIC_ESR));
+ seq_printf(seq, " ICR\t\t: %08x\n", apic_read(APIC_ICR));
+ seq_printf(seq, " ICR2\t\t: %08x\n", apic_read(APIC_ICR2));
+ seq_printf(seq, " LVTT\t\t: %08x\n", apic_read(APIC_LVTT));
+ seq_printf(seq, " LVTTHMR\t: %08x\n", apic_read(APIC_LVTTHMR));
+ seq_printf(seq, " LVTPC\t\t: %08x\n", apic_read(APIC_LVTPC));
+ seq_printf(seq, " LVT0\t\t: %08x\n", apic_read(APIC_LVT0));
+ seq_printf(seq, " LVT1\t\t: %08x\n", apic_read(APIC_LVT1));
+ seq_printf(seq, " LVTERR\t\t: %08x\n", apic_read(APIC_LVTERR));
+ seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT));
+ seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT));
+ seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR));
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+ seq_printf(seq, "\n MSR\t:\n");
+}
+
+static int cpu_seq_show(struct seq_file *seq, void *v)
+{
+ struct cpu_private *priv = seq->private;
+
+ if (priv == NULL)
+ return -EINVAL;
+
+ switch (cpu_base[priv->type].flag) {
+ case CPU_TSS:
+ smp_call_function_single(priv->cpu, print_tss, seq, 1);
+ break;
+ case CPU_CR:
+ smp_call_function_single(priv->cpu, print_cr, seq, 1);
+ break;
+ case CPU_DT:
+ smp_call_function_single(priv->cpu, print_dt, seq, 1);
+ break;
+ case CPU_DEBUG:
+ if (priv->file == CPU_INDEX_BIT)
+ smp_call_function_single(priv->cpu, print_dr, seq, 1);
+ print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
+ break;
+ case CPU_APIC:
+ if (priv->file == CPU_INDEX_BIT)
+ smp_call_function_single(priv->cpu, print_apic, seq, 1);
+ print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
+ break;
+
+ default:
+ print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
+ break;
+ }
+ seq_printf(seq, "\n");
+
+ return 0;
+}
+
+static void *cpu_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ if (*pos == 0) /* One time is enough ;-) */
+ return seq;
+
+ return NULL;
+}
+
+static void *cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ (*pos)++;
+
+ return cpu_seq_start(seq, pos);
+}
+
+static void cpu_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static const struct seq_operations cpu_seq_ops = {
+ .start = cpu_seq_start,
+ .next = cpu_seq_next,
+ .stop = cpu_seq_stop,
+ .show = cpu_seq_show,
+};
+
+static int cpu_seq_open(struct inode *inode, struct file *file)
+{
+ struct cpu_private *priv = inode->i_private;
+ struct seq_file *seq;
+ int err;
+
+ err = seq_open(file, &cpu_seq_ops);
+ if (!err) {
+ seq = file->private_data;
+ seq->private = priv;
+ }
+
+ return err;
+}
+
+static int write_msr(struct cpu_private *priv, u64 val)
+{
+ u32 low, high;
+
+ high = (val >> 32) & 0xffffffff;
+ low = val & 0xffffffff;
+
+ if (!wrmsr_safe_on_cpu(priv->cpu, priv->reg, low, high))
+ return 0;
+
+ return -EPERM;
+}
+
+static int write_cpu_register(struct cpu_private *priv, const char *buf)
+{
+ int ret = -EPERM;
+ u64 val;
+
+ ret = strict_strtoull(buf, 0, &val);
+ if (ret < 0)
+ return ret;
+
+ /* Supporting only MSRs */
+ if (priv->type < CPU_TSS_BIT)
+ return write_msr(priv, val);
+
+ return ret;
+}
+
+static ssize_t cpu_write(struct file *file, const char __user *ubuf,
+ size_t count, loff_t *off)
+{
+ struct seq_file *seq = file->private_data;
+ struct cpu_private *priv = seq->private;
+ char buf[19];
+
+ if ((priv == NULL) || (count >= sizeof(buf)))
+ return -EINVAL;
+
+ if (copy_from_user(&buf, ubuf, count))
+ return -EFAULT;
+
+ buf[count] = 0;
+
+ if ((cpu_base[priv->type].write) && (cpu_file[priv->file].write))
+ if (!write_cpu_register(priv, buf))
+ return count;
+
+ return -EACCES;
+}
+
+static const struct file_operations cpu_fops = {
+ .owner = THIS_MODULE,
+ .open = cpu_seq_open,
+ .read = seq_read,
+ .write = cpu_write,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int cpu_create_file(unsigned cpu, unsigned type, unsigned reg,
+ unsigned file, struct dentry *dentry)
+{
+ struct cpu_private *priv = NULL;
+
+ /* Already intialized */
+ if (file == CPU_INDEX_BIT)
+ if (per_cpu(cpu_arr[type].init, cpu))
+ return 0;
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (priv == NULL)
+ return -ENOMEM;
+
+ priv->cpu = cpu;
+ priv->type = type;
+ priv->reg = reg;
+ priv->file = file;
+ mutex_lock(&cpu_debug_lock);
+ per_cpu(priv_arr[type], cpu) = priv;
+ per_cpu(cpu_priv_count, cpu)++;
+ mutex_unlock(&cpu_debug_lock);
+
+ if (file)
+ debugfs_create_file(cpu_file[file].name, S_IRUGO,
+ dentry, (void *)priv, &cpu_fops);
+ else {
+ debugfs_create_file(cpu_base[type].name, S_IRUGO,
+ per_cpu(cpu_arr[type].dentry, cpu),
+ (void *)priv, &cpu_fops);
+ mutex_lock(&cpu_debug_lock);
+ per_cpu(cpu_arr[type].init, cpu) = 1;
+ mutex_unlock(&cpu_debug_lock);
+ }
+
+ return 0;
+}
+
+static int cpu_init_regfiles(unsigned cpu, unsigned int type, unsigned reg,
+ struct dentry *dentry)
+{
+ unsigned file;
+ int err = 0;
+
+ for (file = 0; file < ARRAY_SIZE(cpu_file); file++) {
+ err = cpu_create_file(cpu, type, reg, file, dentry);
+ if (err)
+ return err;
+ }
+
+ return err;
+}
+
+static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry)
+{
+ struct dentry *cpu_dentry = NULL;
+ unsigned reg, reg_min, reg_max;
+ int i, range, err = 0;
+ char reg_dir[12];
+ u32 low, high;
+
+ range = get_cpu_range_count(cpu);
+
+ for (i = 0; i < range; i++) {
+ if (!get_cpu_range(cpu, &reg_min, &reg_max, i,
+ cpu_base[type].flag))
+ continue;
+
+ for (reg = reg_min; reg <= reg_max; reg++) {
+ if (rdmsr_safe_on_cpu(cpu, reg, &low, &high))
+ continue;
+
+ sprintf(reg_dir, "0x%x", reg);
+ cpu_dentry = debugfs_create_dir(reg_dir, dentry);
+ err = cpu_init_regfiles(cpu, type, reg, cpu_dentry);
+ if (err)
+ return err;
+ }
+ }
+
+ return err;
+}
+
+static int cpu_init_allreg(unsigned cpu, struct dentry *dentry)
+{
+ struct dentry *cpu_dentry = NULL;
+ unsigned type;
+ int err = 0;
+
+ for (type = 0; type < ARRAY_SIZE(cpu_base) - 1; type++) {
+ if (!is_typeflag_valid(cpu, cpu_base[type].flag))
+ continue;
+ cpu_dentry = debugfs_create_dir(cpu_base[type].name, dentry);
+ per_cpu(cpu_arr[type].dentry, cpu) = cpu_dentry;
+
+ if (type < CPU_TSS_BIT)
+ err = cpu_init_msr(cpu, type, cpu_dentry);
+ else
+ err = cpu_create_file(cpu, type, 0, CPU_INDEX_BIT,
+ cpu_dentry);
+ if (err)
+ return err;
+ }
+
+ return err;
+}
+
+static int cpu_init_cpu(void)
+{
+ struct dentry *cpu_dentry = NULL;
+ struct cpuinfo_x86 *cpui;
+ char cpu_dir[12];
+ unsigned cpu;
+ int err = 0;
+
+ for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
+ cpui = &cpu_data(cpu);
+ if (!cpu_has(cpui, X86_FEATURE_MSR))
+ continue;
+ per_cpu(cpu_model, cpu) = ((cpui->x86_vendor << 16) |
+ (cpui->x86 << 8) |
+ (cpui->x86_model));
+ per_cpu(cpu_modelflag, cpu) = get_cpu_modelflag(cpu);
+
+ sprintf(cpu_dir, "cpu%d", cpu);
+ cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir);
+ err = cpu_init_allreg(cpu, cpu_dentry);
+
+ pr_info("cpu%d(%d) debug files %d\n",
+ cpu, nr_cpu_ids, per_cpu(cpu_priv_count, cpu));
+ if (per_cpu(cpu_priv_count, cpu) > MAX_CPU_FILES) {
+ pr_err("Register files count %d exceeds limit %d\n",
+ per_cpu(cpu_priv_count, cpu), MAX_CPU_FILES);
+ per_cpu(cpu_priv_count, cpu) = MAX_CPU_FILES;
+ err = -ENFILE;
+ }
+ if (err)
+ return err;
+ }
+
+ return err;
+}
+
+static int __init cpu_debug_init(void)
+{
+ cpu_debugfs_dir = debugfs_create_dir("cpu", arch_debugfs_dir);
+
+ return cpu_init_cpu();
+}
+
+static void __exit cpu_debug_exit(void)
+{
+ int i, cpu;
+
+ if (cpu_debugfs_dir)
+ debugfs_remove_recursive(cpu_debugfs_dir);
+
+ for (cpu = 0; cpu < nr_cpu_ids; cpu++)
+ for (i = 0; i < per_cpu(cpu_priv_count, cpu); i++)
+ kfree(per_cpu(priv_arr[i], cpu));
+}
+
+module_init(cpu_debug_init);
+module_exit(cpu_debug_exit);
+
+MODULE_AUTHOR("Jaswinder Singh Rajput");
+MODULE_DESCRIPTION("CPU Debug module");
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 4b1c319d30c..22590cf688a 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -601,7 +601,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
if (!data)
return -ENOMEM;
- data->acpi_data = percpu_ptr(acpi_perf_data, cpu);
+ data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu);
per_cpu(drv_data, cpu) = data;
if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
diff --git a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
index c2f930d8664..41ab3f064cb 100644
--- a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
+++ b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
@@ -204,12 +204,12 @@ static int eps_cpu_init(struct cpufreq_policy *policy)
}
/* Enable Enhanced PowerSaver */
rdmsrl(MSR_IA32_MISC_ENABLE, val);
- if (!(val & 1 << 16)) {
- val |= 1 << 16;
+ if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
+ val |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP;
wrmsrl(MSR_IA32_MISC_ENABLE, val);
/* Can be locked at 0 */
rdmsrl(MSR_IA32_MISC_ENABLE, val);
- if (!(val & 1 << 16)) {
+ if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
printk(KERN_INFO "eps: Can't enable Enhanced PowerSaver\n");
return -ENODEV;
}
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index f08998278a3..c9f1fdc0283 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -390,14 +390,14 @@ static int centrino_cpu_init(struct cpufreq_policy *policy)
enable it if not. */
rdmsr(MSR_IA32_MISC_ENABLE, l, h);
- if (!(l & (1<<16))) {
- l |= (1<<16);
+ if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
+ l |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP;
dprintk("trying to enable Enhanced SpeedStep (%x)\n", l);
wrmsr(MSR_IA32_MISC_ENABLE, l, h);
/* check to see if it stuck */
rdmsr(MSR_IA32_MISC_ENABLE, l, h);
- if (!(l & (1<<16))) {
+ if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
printk(KERN_INFO PFX
"couldn't enable Enhanced SpeedStep\n");
return -ENODEV;
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index ffd0f5ed071..593171e967e 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -61,23 +61,23 @@ static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
*/
static unsigned char Cx86_dir0_msb __cpuinitdata = 0;
-static char Cx86_model[][9] __cpuinitdata = {
+static const char __cpuinitconst Cx86_model[][9] = {
"Cx486", "Cx486", "5x86 ", "6x86", "MediaGX ", "6x86MX ",
"M II ", "Unknown"
};
-static char Cx486_name[][5] __cpuinitdata = {
+static const char __cpuinitconst Cx486_name[][5] = {
"SLC", "DLC", "SLC2", "DLC2", "SRx", "DRx",
"SRx2", "DRx2"
};
-static char Cx486S_name[][4] __cpuinitdata = {
+static const char __cpuinitconst Cx486S_name[][4] = {
"S", "S2", "Se", "S2e"
};
-static char Cx486D_name[][4] __cpuinitdata = {
+static const char __cpuinitconst Cx486D_name[][4] = {
"DX", "DX2", "?", "?", "?", "DX4"
};
static char Cx86_cb[] __cpuinitdata = "?.5x Core/Bus Clock";
-static char cyrix_model_mult1[] __cpuinitdata = "12??43";
-static char cyrix_model_mult2[] __cpuinitdata = "12233445";
+static const char __cpuinitconst cyrix_model_mult1[] = "12??43";
+static const char __cpuinitconst cyrix_model_mult2[] = "12233445";
/*
* Reset the slow-loop (SLOP) bit on the 686(L) which is set by some old
@@ -435,7 +435,7 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c)
}
}
-static struct cpu_dev cyrix_cpu_dev __cpuinitdata = {
+static const struct cpu_dev __cpuinitconst cyrix_cpu_dev = {
.c_vendor = "Cyrix",
.c_ident = { "CyrixInstead" },
.c_early_init = early_init_cyrix,
@@ -446,7 +446,7 @@ static struct cpu_dev cyrix_cpu_dev __cpuinitdata = {
cpu_dev_register(cyrix_cpu_dev);
-static struct cpu_dev nsc_cpu_dev __cpuinitdata = {
+static const struct cpu_dev __cpuinitconst nsc_cpu_dev = {
.c_vendor = "NSC",
.c_ident = { "Geode by NSC" },
.c_init = init_nsc,
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 24ff26a38ad..b09d4eb52bb 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -13,6 +13,7 @@
#include <asm/uaccess.h>
#include <asm/ds.h>
#include <asm/bugs.h>
+#include <asm/cpu.h>
#ifdef CONFIG_X86_64
#include <asm/topology.h>
@@ -24,7 +25,6 @@
#ifdef CONFIG_X86_LOCAL_APIC
#include <asm/mpspec.h>
#include <asm/apic.h>
-#include <mach_apic.h>
#endif
static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
@@ -54,6 +54,11 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
c->x86_cache_alignment = 128;
#endif
+ /* CPUID workaround for 0F33/0F34 CPU */
+ if (c->x86 == 0xF && c->x86_model == 0x3
+ && (c->x86_mask == 0x3 || c->x86_mask == 0x4))
+ c->x86_phys_bits = 36;
+
/*
* c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
* with P/T states and does not stop in deep C-states
@@ -63,6 +68,18 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
}
+ /*
+ * There is a known erratum on Pentium III and Core Solo
+ * and Core Duo CPUs.
+ * " Page with PAT set to WC while associated MTRR is UC
+ * may consolidate to UC "
+ * Because of this erratum, it is better to stick with
+ * setting WC in MTRR rather than using PAT on these CPUs.
+ *
+ * Enable PAT WC only on P4, Core 2 or later CPUs.
+ */
+ if (c->x86 == 6 && c->x86_model < 15)
+ clear_cpu_cap(c, X86_FEATURE_PAT);
}
#ifdef CONFIG_X86_32
@@ -99,6 +116,28 @@ static void __cpuinit trap_init_f00f_bug(void)
}
#endif
+static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+ /* calling is from identify_secondary_cpu() ? */
+ if (c->cpu_index == boot_cpu_id)
+ return;
+
+ /*
+ * Mask B, Pentium, but not Pentium MMX
+ */
+ if (c->x86 == 5 &&
+ c->x86_mask >= 1 && c->x86_mask <= 4 &&
+ c->x86_model <= 3) {
+ /*
+ * Remember we have B step Pentia with bugs
+ */
+ WARN_ONCE(1, "WARNING: SMP operation may be unreliable"
+ "with B stepping processors.\n");
+ }
+#endif
+}
+
static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
{
unsigned long lo, hi;
@@ -135,10 +174,10 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
*/
if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) {
rdmsr(MSR_IA32_MISC_ENABLE, lo, hi);
- if ((lo & (1<<9)) == 0) {
+ if ((lo & MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE) == 0) {
printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n");
printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n");
- lo |= (1<<9); /* Disable hw prefetching */
+ lo |= MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE;
wrmsr (MSR_IA32_MISC_ENABLE, lo, hi);
}
}
@@ -175,6 +214,8 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
#ifdef CONFIG_X86_NUMAQ
numaq_tsc_disable();
#endif
+
+ intel_smp_check(c);
}
#else
static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
@@ -374,7 +415,7 @@ static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned i
}
#endif
-static struct cpu_dev intel_cpu_dev __cpuinitdata = {
+static const struct cpu_dev __cpuinitconst intel_cpu_dev = {
.c_vendor = "Intel",
.c_ident = { "GenuineIntel" },
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index da299eb85fc..c471eb1a389 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -32,7 +32,7 @@ struct _cache_table
};
/* all the cache descriptor types we care about (no TLB or trace cache entries) */
-static struct _cache_table cache_table[] __cpuinitdata =
+static const struct _cache_table __cpuinitconst cache_table[] =
{
{ 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */
{ 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */
@@ -147,7 +147,16 @@ struct _cpuid4_info {
union _cpuid4_leaf_ecx ecx;
unsigned long size;
unsigned long can_disable;
- cpumask_t shared_cpu_map; /* future?: only cpus/node is needed */
+ DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
+};
+
+/* subset of above _cpuid4_info w/o shared_cpu_map */
+struct _cpuid4_info_regs {
+ union _cpuid4_leaf_eax eax;
+ union _cpuid4_leaf_ebx ebx;
+ union _cpuid4_leaf_ecx ecx;
+ unsigned long size;
+ unsigned long can_disable;
};
#ifdef CONFIG_PCI
@@ -197,15 +206,15 @@ union l3_cache {
unsigned val;
};
-static unsigned short assocs[] __cpuinitdata = {
+static const unsigned short __cpuinitconst assocs[] = {
[1] = 1, [2] = 2, [4] = 4, [6] = 8,
[8] = 16, [0xa] = 32, [0xb] = 48,
[0xc] = 64,
[0xf] = 0xffff // ??
};
-static unsigned char levels[] __cpuinitdata = { 1, 1, 2, 3 };
-static unsigned char types[] __cpuinitdata = { 1, 2, 3, 3 };
+static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 };
+static const unsigned char __cpuinitconst types[] = { 1, 2, 3, 3 };
static void __cpuinit
amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
@@ -278,7 +287,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
}
static void __cpuinit
-amd_check_l3_disable(int index, struct _cpuid4_info *this_leaf)
+amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
{
if (index < 3)
return;
@@ -286,7 +295,8 @@ amd_check_l3_disable(int index, struct _cpuid4_info *this_leaf)
}
static int
-__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
+__cpuinit cpuid4_cache_lookup_regs(int index,
+ struct _cpuid4_info_regs *this_leaf)
{
union _cpuid4_leaf_eax eax;
union _cpuid4_leaf_ebx ebx;
@@ -314,6 +324,15 @@ __cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
return 0;
}
+static int
+__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
+{
+ struct _cpuid4_info_regs *leaf_regs =
+ (struct _cpuid4_info_regs *)this_leaf;
+
+ return cpuid4_cache_lookup_regs(index, leaf_regs);
+}
+
static int __cpuinit find_num_cache_leaves(void)
{
unsigned int eax, ebx, ecx, edx;
@@ -353,11 +372,10 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
* parameters cpuid leaf to find the cache details
*/
for (i = 0; i < num_cache_leaves; i++) {
- struct _cpuid4_info this_leaf;
-
+ struct _cpuid4_info_regs this_leaf;
int retval;
- retval = cpuid4_cache_lookup(i, &this_leaf);
+ retval = cpuid4_cache_lookup_regs(i, &this_leaf);
if (retval >= 0) {
switch(this_leaf.eax.split.level) {
case 1:
@@ -506,17 +524,20 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing;
if (num_threads_sharing == 1)
- cpu_set(cpu, this_leaf->shared_cpu_map);
+ cpumask_set_cpu(cpu, to_cpumask(this_leaf->shared_cpu_map));
else {
index_msb = get_count_order(num_threads_sharing);
for_each_online_cpu(i) {
if (cpu_data(i).apicid >> index_msb ==
c->apicid >> index_msb) {
- cpu_set(i, this_leaf->shared_cpu_map);
+ cpumask_set_cpu(i,
+ to_cpumask(this_leaf->shared_cpu_map));
if (i != cpu && per_cpu(cpuid4_info, i)) {
- sibling_leaf = CPUID4_INFO_IDX(i, index);
- cpu_set(cpu, sibling_leaf->shared_cpu_map);
+ sibling_leaf =
+ CPUID4_INFO_IDX(i, index);
+ cpumask_set_cpu(cpu, to_cpumask(
+ sibling_leaf->shared_cpu_map));
}
}
}
@@ -528,9 +549,10 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index)
int sibling;
this_leaf = CPUID4_INFO_IDX(cpu, index);
- for_each_cpu_mask_nr(sibling, this_leaf->shared_cpu_map) {
+ for_each_cpu(sibling, to_cpumask(this_leaf->shared_cpu_map)) {
sibling_leaf = CPUID4_INFO_IDX(sibling, index);
- cpu_clear(cpu, sibling_leaf->shared_cpu_map);
+ cpumask_clear_cpu(cpu,
+ to_cpumask(sibling_leaf->shared_cpu_map));
}
}
#else
@@ -635,8 +657,9 @@ static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
int n = 0;
if (len > 1) {
- cpumask_t *mask = &this_leaf->shared_cpu_map;
+ const struct cpumask *mask;
+ mask = to_cpumask(this_leaf->shared_cpu_map);
n = type?
cpulist_scnprintf(buf, len-2, mask) :
cpumask_scnprintf(buf, len-2, mask);
@@ -699,7 +722,8 @@ static struct pci_dev *get_k8_northbridge(int node)
static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf)
{
- int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map));
+ const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
+ int node = cpu_to_node(cpumask_first(mask));
struct pci_dev *dev = NULL;
ssize_t ret = 0;
int i;
@@ -733,7 +757,8 @@ static ssize_t
store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf,
size_t count)
{
- int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map));
+ const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
+ int node = cpu_to_node(cpumask_first(mask));
struct pci_dev *dev = NULL;
unsigned int ret, index, val;
@@ -878,7 +903,7 @@ err_out:
return -ENOMEM;
}
-static cpumask_t cache_dev_map = CPU_MASK_NONE;
+static DECLARE_BITMAP(cache_dev_map, NR_CPUS);
/* Add/Remove cache interface for CPU device */
static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
@@ -918,7 +943,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
}
kobject_uevent(&(this_object->kobj), KOBJ_ADD);
}
- cpu_set(cpu, cache_dev_map);
+ cpumask_set_cpu(cpu, to_cpumask(cache_dev_map));
kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD);
return 0;
@@ -931,9 +956,9 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
if (per_cpu(cpuid4_info, cpu) == NULL)
return;
- if (!cpu_isset(cpu, cache_dev_map))
+ if (!cpumask_test_cpu(cpu, to_cpumask(cache_dev_map)))
return;
- cpu_clear(cpu, cache_dev_map);
+ cpumask_clear_cpu(cpu, to_cpumask(cache_dev_map));
for (i = 0; i < num_cache_leaves; i++)
kobject_put(&(INDEX_KOBJECT_PTR(cpu,i)->kobj));
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index d7d2323bbb6..b2f89829bbe 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -4,3 +4,4 @@ obj-$(CONFIG_X86_32) += k7.o p4.o p5.o p6.o winchip.o
obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o
obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o
obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o
+obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
index dfaebce3633..3552119b091 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_32.c
@@ -60,20 +60,6 @@ void mcheck_init(struct cpuinfo_x86 *c)
}
}
-static unsigned long old_cr4 __initdata;
-
-void __init stop_mce(void)
-{
- old_cr4 = read_cr4();
- clear_in_cr4(X86_CR4_MCE);
-}
-
-void __init restart_mce(void)
-{
- if (old_cr4 & X86_CR4_MCE)
- set_in_cr4(X86_CR4_MCE);
-}
-
static int __init mcheck_disable(char *str)
{
mce_disabled = 1;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index fe79985ce0f..ca14604611e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -3,6 +3,8 @@
* K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
* Rest from unknown author(s).
* 2004 Andi Kleen. Rewrote most of it.
+ * Copyright 2008 Intel Corporation
+ * Author: Andi Kleen
*/
#include <linux/init.h>
@@ -24,6 +26,9 @@
#include <linux/ctype.h>
#include <linux/kmod.h>
#include <linux/kdebug.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/ratelimit.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
@@ -32,7 +37,6 @@
#include <asm/idle.h>
#define MISC_MCELOG_MINOR 227
-#define NR_SYSFS_BANKS 6
atomic_t mce_entry;
@@ -47,7 +51,7 @@ static int mce_dont_init;
*/
static int tolerant = 1;
static int banks;
-static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL };
+static u64 *bank;
static unsigned long notify_user;
static int rip_msr;
static int mce_bootlog = -1;
@@ -58,6 +62,19 @@ static char *trigger_argv[2] = { trigger, NULL };
static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
+/* MCA banks polled by the period polling timer for corrected events */
+DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
+ [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
+};
+
+/* Do initial initialization of a struct mce */
+void mce_setup(struct mce *m)
+{
+ memset(m, 0, sizeof(struct mce));
+ m->cpu = smp_processor_id();
+ rdtscll(m->tsc);
+}
+
/*
* Lockless MCE logging infrastructure.
* This avoids deadlocks on printk locks without having to break locks. Also
@@ -119,11 +136,11 @@ static void print_mce(struct mce *m)
print_symbol("{%s}", m->ip);
printk("\n");
}
- printk(KERN_EMERG "TSC %Lx ", m->tsc);
+ printk(KERN_EMERG "TSC %llx ", m->tsc);
if (m->addr)
- printk("ADDR %Lx ", m->addr);
+ printk("ADDR %llx ", m->addr);
if (m->misc)
- printk("MISC %Lx ", m->misc);
+ printk("MISC %llx ", m->misc);
printk("\n");
printk(KERN_EMERG "This is not a software problem!\n");
printk(KERN_EMERG "Run through mcelog --ascii to decode "
@@ -149,8 +166,10 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start)
panic(msg);
}
-static int mce_available(struct cpuinfo_x86 *c)
+int mce_available(struct cpuinfo_x86 *c)
{
+ if (mce_dont_init)
+ return 0;
return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}
@@ -172,7 +191,77 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
}
/*
- * The actual machine check handler
+ * Poll for corrected events or events that happened before reset.
+ * Those are just logged through /dev/mcelog.
+ *
+ * This is executed in standard interrupt context.
+ */
+void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
+{
+ struct mce m;
+ int i;
+
+ mce_setup(&m);
+
+ rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
+ for (i = 0; i < banks; i++) {
+ if (!bank[i] || !test_bit(i, *b))
+ continue;
+
+ m.misc = 0;
+ m.addr = 0;
+ m.bank = i;
+ m.tsc = 0;
+
+ barrier();
+ rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
+ if (!(m.status & MCI_STATUS_VAL))
+ continue;
+
+ /*
+ * Uncorrected events are handled by the exception handler
+ * when it is enabled. But when the exception is disabled log
+ * everything.
+ *
+ * TBD do the same check for MCI_STATUS_EN here?
+ */
+ if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
+ continue;
+
+ if (m.status & MCI_STATUS_MISCV)
+ rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
+ if (m.status & MCI_STATUS_ADDRV)
+ rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
+
+ if (!(flags & MCP_TIMESTAMP))
+ m.tsc = 0;
+ /*
+ * Don't get the IP here because it's unlikely to
+ * have anything to do with the actual error location.
+ */
+
+ mce_log(&m);
+ add_taint(TAINT_MACHINE_CHECK);
+
+ /*
+ * Clear state for this bank.
+ */
+ wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+ }
+
+ /*
+ * Don't clear MCG_STATUS here because it's only defined for
+ * exceptions.
+ */
+}
+
+/*
+ * The actual machine check handler. This only handles real
+ * exceptions when something got corrupted coming in through int 18.
+ *
+ * This is executed in NMI context not subject to normal locking rules. This
+ * implies that most kernel services cannot be safely used. Don't even
+ * think about putting a printk in there!
*/
void do_machine_check(struct pt_regs * regs, long error_code)
{
@@ -190,17 +279,18 @@ void do_machine_check(struct pt_regs * regs, long error_code)
* error.
*/
int kill_it = 0;
+ DECLARE_BITMAP(toclear, MAX_NR_BANKS);
atomic_inc(&mce_entry);
- if ((regs
- && notify_die(DIE_NMI, "machine check", regs, error_code,
+ if (notify_die(DIE_NMI, "machine check", regs, error_code,
18, SIGKILL) == NOTIFY_STOP)
- || !banks)
+ goto out2;
+ if (!banks)
goto out2;
- memset(&m, 0, sizeof(struct mce));
- m.cpu = smp_processor_id();
+ mce_setup(&m);
+
rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
/* if the restart IP is not valid, we're done for */
if (!(m.mcgstatus & MCG_STATUS_RIPV))
@@ -210,18 +300,32 @@ void do_machine_check(struct pt_regs * regs, long error_code)
barrier();
for (i = 0; i < banks; i++) {
- if (i < NR_SYSFS_BANKS && !bank[i])
+ __clear_bit(i, toclear);
+ if (!bank[i])
continue;
m.misc = 0;
m.addr = 0;
m.bank = i;
- m.tsc = 0;
rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
if ((m.status & MCI_STATUS_VAL) == 0)
continue;
+ /*
+ * Non uncorrected errors are handled by machine_check_poll
+ * Leave them alone.
+ */
+ if ((m.status & MCI_STATUS_UC) == 0)
+ continue;
+
+ /*
+ * Set taint even when machine check was not enabled.
+ */
+ add_taint(TAINT_MACHINE_CHECK);
+
+ __set_bit(i, toclear);
+
if (m.status & MCI_STATUS_EN) {
/* if PCC was set, there's no way out */
no_way_out |= !!(m.status & MCI_STATUS_PCC);
@@ -235,6 +339,12 @@ void do_machine_check(struct pt_regs * regs, long error_code)
no_way_out = 1;
kill_it = 1;
}
+ } else {
+ /*
+ * Machine check event was not enabled. Clear, but
+ * ignore.
+ */
+ continue;
}
if (m.status & MCI_STATUS_MISCV)
@@ -243,10 +353,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
mce_get_rip(&m, regs);
- if (error_code >= 0)
- rdtscll(m.tsc);
- if (error_code != -2)
- mce_log(&m);
+ mce_log(&m);
/* Did this bank cause the exception? */
/* Assume that the bank with uncorrectable errors did it,
@@ -255,14 +362,8 @@ void do_machine_check(struct pt_regs * regs, long error_code)
panicm = m;
panicm_found = 1;
}
-
- add_taint(TAINT_MACHINE_CHECK);
}
- /* Never do anything final in the polling timer */
- if (!regs)
- goto out;
-
/* If we didn't find an uncorrectable error, pick
the last one (shouldn't happen, just being safe). */
if (!panicm_found)
@@ -309,10 +410,11 @@ void do_machine_check(struct pt_regs * regs, long error_code)
/* notify userspace ASAP */
set_thread_flag(TIF_MCE_NOTIFY);
- out:
/* the last thing we do is clear state */
- for (i = 0; i < banks; i++)
- wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+ for (i = 0; i < banks; i++) {
+ if (test_bit(i, toclear))
+ wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+ }
wrmsrl(MSR_IA32_MCG_STATUS, 0);
out2:
atomic_dec(&mce_entry);
@@ -332,15 +434,13 @@ void do_machine_check(struct pt_regs * regs, long error_code)
* and historically has been the register value of the
* MSR_IA32_THERMAL_STATUS (Intel) msr.
*/
-void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
+void mce_log_therm_throt_event(__u64 status)
{
struct mce m;
- memset(&m, 0, sizeof(m));
- m.cpu = cpu;
+ mce_setup(&m);
m.bank = MCE_THERMAL_BANK;
m.status = status;
- rdtscll(m.tsc);
mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */
@@ -353,18 +453,18 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
static int check_interval = 5 * 60; /* 5 minutes */
static int next_interval; /* in jiffies */
-static void mcheck_timer(struct work_struct *work);
-static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer);
+static void mcheck_timer(unsigned long);
+static DEFINE_PER_CPU(struct timer_list, mce_timer);
-static void mcheck_check_cpu(void *info)
+static void mcheck_timer(unsigned long data)
{
- if (mce_available(&current_cpu_data))
- do_machine_check(NULL, 0);
-}
+ struct timer_list *t = &per_cpu(mce_timer, data);
-static void mcheck_timer(struct work_struct *work)
-{
- on_each_cpu(mcheck_check_cpu, NULL, 1);
+ WARN_ON(smp_processor_id() != data);
+
+ if (mce_available(&current_cpu_data))
+ machine_check_poll(MCP_TIMESTAMP,
+ &__get_cpu_var(mce_poll_banks));
/*
* Alert userspace if needed. If we logged an MCE, reduce the
@@ -377,31 +477,41 @@ static void mcheck_timer(struct work_struct *work)
(int)round_jiffies_relative(check_interval*HZ));
}
- schedule_delayed_work(&mcheck_work, next_interval);
+ t->expires = jiffies + next_interval;
+ add_timer(t);
+}
+
+static void mce_do_trigger(struct work_struct *work)
+{
+ call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
}
+static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
+
/*
- * This is only called from process context. This is where we do
- * anything we need to alert userspace about new MCEs. This is called
- * directly from the poller and also from entry.S and idle, thanks to
- * TIF_MCE_NOTIFY.
+ * Notify the user(s) about new machine check events.
+ * Can be called from interrupt context, but not from machine check/NMI
+ * context.
*/
int mce_notify_user(void)
{
+ /* Not more than two messages every minute */
+ static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
+
clear_thread_flag(TIF_MCE_NOTIFY);
if (test_and_clear_bit(0, &notify_user)) {
- static unsigned long last_print;
- unsigned long now = jiffies;
-
wake_up_interruptible(&mce_wait);
- if (trigger[0])
- call_usermodehelper(trigger, trigger_argv, NULL,
- UMH_NO_WAIT);
- if (time_after_eq(now, last_print + (check_interval*HZ))) {
- last_print = now;
+ /*
+ * There is no risk of missing notifications because
+ * work_pending is always cleared before the function is
+ * executed.
+ */
+ if (trigger[0] && !work_pending(&mce_trigger_work))
+ schedule_work(&mce_trigger_work);
+
+ if (__ratelimit(&ratelimit))
printk(KERN_INFO "Machine check events logged\n");
- }
return 1;
}
@@ -425,63 +535,78 @@ static struct notifier_block mce_idle_notifier = {
static __init int periodic_mcheck_init(void)
{
- next_interval = check_interval * HZ;
- if (next_interval)
- schedule_delayed_work(&mcheck_work,
- round_jiffies_relative(next_interval));
- idle_notifier_register(&mce_idle_notifier);
- return 0;
+ idle_notifier_register(&mce_idle_notifier);
+ return 0;
}
__initcall(periodic_mcheck_init);
-
/*
* Initialize Machine Checks for a CPU.
*/
-static void mce_init(void *dummy)
+static int mce_cap_init(void)
{
u64 cap;
- int i;
+ unsigned b;
rdmsrl(MSR_IA32_MCG_CAP, cap);
- banks = cap & 0xff;
- if (banks > MCE_EXTENDED_BANK) {
- banks = MCE_EXTENDED_BANK;
- printk(KERN_INFO "MCE: warning: using only %d banks\n",
- MCE_EXTENDED_BANK);
+ b = cap & 0xff;
+ if (b > MAX_NR_BANKS) {
+ printk(KERN_WARNING
+ "MCE: Using only %u machine check banks out of %u\n",
+ MAX_NR_BANKS, b);
+ b = MAX_NR_BANKS;
}
+
+ /* Don't support asymmetric configurations today */
+ WARN_ON(banks != 0 && b != banks);
+ banks = b;
+ if (!bank) {
+ bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
+ if (!bank)
+ return -ENOMEM;
+ memset(bank, 0xff, banks * sizeof(u64));
+ }
+
/* Use accurate RIP reporting if available. */
if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
rip_msr = MSR_IA32_MCG_EIP;
- /* Log the machine checks left over from the previous reset.
- This also clears all registers */
- do_machine_check(NULL, mce_bootlog ? -1 : -2);
+ return 0;
+}
+
+static void mce_init(void *dummy)
+{
+ u64 cap;
+ int i;
+ mce_banks_t all_banks;
+
+ /*
+ * Log the machine checks left over from the previous reset.
+ */
+ bitmap_fill(all_banks, MAX_NR_BANKS);
+ machine_check_poll(MCP_UC, &all_banks);
set_in_cr4(X86_CR4_MCE);
+ rdmsrl(MSR_IA32_MCG_CAP, cap);
if (cap & MCG_CTL_P)
wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
for (i = 0; i < banks; i++) {
- if (i < NR_SYSFS_BANKS)
- wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
- else
- wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL);
-
+ wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
}
}
/* Add per CPU specific workarounds here */
-static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
+static void mce_cpu_quirks(struct cpuinfo_x86 *c)
{
/* This should be disabled by the BIOS, but isn't always */
if (c->x86_vendor == X86_VENDOR_AMD) {
- if(c->x86 == 15)
+ if (c->x86 == 15 && banks > 4)
/* disable GART TBL walk error reporting, which trips off
incorrectly with the IOMMU & 3ware & Cerberus. */
- clear_bit(10, &bank[4]);
+ clear_bit(10, (unsigned long *)&bank[4]);
if(c->x86 <= 17 && mce_bootlog < 0)
/* Lots of broken BIOS around that don't clear them
by default and leave crap in there. Don't log. */
@@ -504,20 +629,38 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
}
}
+static void mce_init_timer(void)
+{
+ struct timer_list *t = &__get_cpu_var(mce_timer);
+
+ /* data race harmless because everyone sets to the same value */
+ if (!next_interval)
+ next_interval = check_interval * HZ;
+ if (!next_interval)
+ return;
+ setup_timer(t, mcheck_timer, smp_processor_id());
+ t->expires = round_jiffies(jiffies + next_interval);
+ add_timer(t);
+}
+
/*
* Called for each booted CPU to set up machine checks.
* Must be called with preempt off.
*/
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
- mce_cpu_quirks(c);
+ if (!mce_available(c))
+ return;
- if (mce_dont_init ||
- !mce_available(c))
+ if (mce_cap_init() < 0) {
+ mce_dont_init = 1;
return;
+ }
+ mce_cpu_quirks(c);
mce_init(NULL);
mce_cpu_features(c);
+ mce_init_timer();
}
/*
@@ -573,7 +716,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
{
unsigned long *cpu_tsc;
static DEFINE_MUTEX(mce_read_mutex);
- unsigned next;
+ unsigned prev, next;
char __user *buf = ubuf;
int i, err;
@@ -592,25 +735,32 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
}
err = 0;
- for (i = 0; i < next; i++) {
- unsigned long start = jiffies;
-
- while (!mcelog.entry[i].finished) {
- if (time_after_eq(jiffies, start + 2)) {
- memset(mcelog.entry + i,0, sizeof(struct mce));
- goto timeout;
+ prev = 0;
+ do {
+ for (i = prev; i < next; i++) {
+ unsigned long start = jiffies;
+
+ while (!mcelog.entry[i].finished) {
+ if (time_after_eq(jiffies, start + 2)) {
+ memset(mcelog.entry + i, 0,
+ sizeof(struct mce));
+ goto timeout;
+ }
+ cpu_relax();
}
- cpu_relax();
+ smp_rmb();
+ err |= copy_to_user(buf, mcelog.entry + i,
+ sizeof(struct mce));
+ buf += sizeof(struct mce);
+timeout:
+ ;
}
- smp_rmb();
- err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
- buf += sizeof(struct mce);
- timeout:
- ;
- }
- memset(mcelog.entry, 0, next * sizeof(struct mce));
- mcelog.next = 0;
+ memset(mcelog.entry + prev, 0,
+ (next - prev) * sizeof(struct mce));
+ prev = next;
+ next = cmpxchg(&mcelog.next, prev, 0);
+ } while (next != prev);
synchronize_sched();
@@ -680,20 +830,6 @@ static struct miscdevice mce_log_device = {
&mce_chrdev_ops,
};
-static unsigned long old_cr4 __initdata;
-
-void __init stop_mce(void)
-{
- old_cr4 = read_cr4();
- clear_in_cr4(X86_CR4_MCE);
-}
-
-void __init restart_mce(void)
-{
- if (old_cr4 & X86_CR4_MCE)
- set_in_cr4(X86_CR4_MCE);
-}
-
/*
* Old style boot options parsing. Only for compatibility.
*/
@@ -703,8 +839,7 @@ static int __init mcheck_disable(char *str)
return 1;
}
-/* mce=off disables machine check. Note you can re-enable it later
- using sysfs.
+/* mce=off disables machine check.
mce=TOLERANCELEVEL (number, see above)
mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
mce=nobootlog Don't log MCEs from before booting. */
@@ -728,6 +863,29 @@ __setup("mce=", mcheck_enable);
* Sysfs support
*/
+/*
+ * Disable machine checks on suspend and shutdown. We can't really handle
+ * them later.
+ */
+static int mce_disable(void)
+{
+ int i;
+
+ for (i = 0; i < banks; i++)
+ wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
+ return 0;
+}
+
+static int mce_suspend(struct sys_device *dev, pm_message_t state)
+{
+ return mce_disable();
+}
+
+static int mce_shutdown(struct sys_device *dev)
+{
+ return mce_disable();
+}
+
/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
Only one CPU is active at this time, the others get readded later using
CPU hotplug. */
@@ -738,20 +896,24 @@ static int mce_resume(struct sys_device *dev)
return 0;
}
+static void mce_cpu_restart(void *data)
+{
+ del_timer_sync(&__get_cpu_var(mce_timer));
+ if (mce_available(&current_cpu_data))
+ mce_init(NULL);
+ mce_init_timer();
+}
+
/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
- if (next_interval)
- cancel_delayed_work(&mcheck_work);
- /* Timer race is harmless here */
- on_each_cpu(mce_init, NULL, 1);
next_interval = check_interval * HZ;
- if (next_interval)
- schedule_delayed_work(&mcheck_work,
- round_jiffies_relative(next_interval));
+ on_each_cpu(mce_cpu_restart, NULL, 1);
}
static struct sysdev_class mce_sysclass = {
+ .suspend = mce_suspend,
+ .shutdown = mce_shutdown,
.resume = mce_resume,
.name = "machinecheck",
};
@@ -778,16 +940,26 @@ void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinit
} \
static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
-/*
- * TBD should generate these dynamically based on number of available banks.
- * Have only 6 contol banks in /sysfs until then.
- */
-ACCESSOR(bank0ctl,bank[0],mce_restart())
-ACCESSOR(bank1ctl,bank[1],mce_restart())
-ACCESSOR(bank2ctl,bank[2],mce_restart())
-ACCESSOR(bank3ctl,bank[3],mce_restart())
-ACCESSOR(bank4ctl,bank[4],mce_restart())
-ACCESSOR(bank5ctl,bank[5],mce_restart())
+static struct sysdev_attribute *bank_attrs;
+
+static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
+ char *buf)
+{
+ u64 b = bank[attr - bank_attrs];
+ return sprintf(buf, "%llx\n", b);
+}
+
+static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
+ const char *buf, size_t siz)
+{
+ char *end;
+ u64 new = simple_strtoull(buf, &end, 0);
+ if (end == buf)
+ return -EINVAL;
+ bank[attr - bank_attrs] = new;
+ mce_restart();
+ return end-buf;
+}
static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
char *buf)
@@ -814,8 +986,6 @@ static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
ACCESSOR(check_interval,check_interval,mce_restart())
static struct sysdev_attribute *mce_attributes[] = {
- &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
- &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
&attr_tolerant.attr, &attr_check_interval, &attr_trigger,
NULL
};
@@ -845,11 +1015,22 @@ static __cpuinit int mce_create_device(unsigned int cpu)
if (err)
goto error;
}
+ for (i = 0; i < banks; i++) {
+ err = sysdev_create_file(&per_cpu(device_mce, cpu),
+ &bank_attrs[i]);
+ if (err)
+ goto error2;
+ }
cpu_set(cpu, mce_device_initialized);
return 0;
+error2:
+ while (--i >= 0) {
+ sysdev_remove_file(&per_cpu(device_mce, cpu),
+ &bank_attrs[i]);
+ }
error:
- while (i--) {
+ while (--i >= 0) {
sysdev_remove_file(&per_cpu(device_mce,cpu),
mce_attributes[i]);
}
@@ -868,15 +1049,46 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
for (i = 0; mce_attributes[i]; i++)
sysdev_remove_file(&per_cpu(device_mce,cpu),
mce_attributes[i]);
+ for (i = 0; i < banks; i++)
+ sysdev_remove_file(&per_cpu(device_mce, cpu),
+ &bank_attrs[i]);
sysdev_unregister(&per_cpu(device_mce,cpu));
cpu_clear(cpu, mce_device_initialized);
}
+/* Make sure there are no machine checks on offlined CPUs. */
+static void mce_disable_cpu(void *h)
+{
+ int i;
+ unsigned long action = *(unsigned long *)h;
+
+ if (!mce_available(&current_cpu_data))
+ return;
+ if (!(action & CPU_TASKS_FROZEN))
+ cmci_clear();
+ for (i = 0; i < banks; i++)
+ wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
+}
+
+static void mce_reenable_cpu(void *h)
+{
+ int i;
+ unsigned long action = *(unsigned long *)h;
+
+ if (!mce_available(&current_cpu_data))
+ return;
+ if (!(action & CPU_TASKS_FROZEN))
+ cmci_reenable();
+ for (i = 0; i < banks; i++)
+ wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
+}
+
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
+ struct timer_list *t = &per_cpu(mce_timer, cpu);
switch (action) {
case CPU_ONLINE:
@@ -891,6 +1103,21 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
threshold_cpu_callback(action, cpu);
mce_remove_device(cpu);
break;
+ case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
+ del_timer_sync(t);
+ smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
+ break;
+ case CPU_DOWN_FAILED:
+ case CPU_DOWN_FAILED_FROZEN:
+ t->expires = round_jiffies(jiffies + next_interval);
+ add_timer_on(t, cpu);
+ smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
+ break;
+ case CPU_POST_DEAD:
+ /* intentionally ignoring frozen here */
+ cmci_rediscover(cpu);
+ break;
}
return NOTIFY_OK;
}
@@ -899,6 +1126,34 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = {
.notifier_call = mce_cpu_callback,
};
+static __init int mce_init_banks(void)
+{
+ int i;
+
+ bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
+ GFP_KERNEL);
+ if (!bank_attrs)
+ return -ENOMEM;
+
+ for (i = 0; i < banks; i++) {
+ struct sysdev_attribute *a = &bank_attrs[i];
+ a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
+ if (!a->attr.name)
+ goto nomem;
+ a->attr.mode = 0644;
+ a->show = show_bank;
+ a->store = set_bank;
+ }
+ return 0;
+
+nomem:
+ while (--i >= 0)
+ kfree(bank_attrs[i].attr.name);
+ kfree(bank_attrs);
+ bank_attrs = NULL;
+ return -ENOMEM;
+}
+
static __init int mce_init_device(void)
{
int err;
@@ -906,6 +1161,11 @@ static __init int mce_init_device(void)
if (!mce_available(&boot_cpu_data))
return -EIO;
+
+ err = mce_init_banks();
+ if (err)
+ return err;
+
err = sysdev_class_register(&mce_sysclass);
if (err)
return err;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index f2ee0ae29bd..7d01be86887 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -67,7 +67,7 @@ static struct threshold_block threshold_defaults = {
struct threshold_bank {
struct kobject *kobj;
struct threshold_block *blocks;
- cpumask_t cpus;
+ cpumask_var_t cpus;
};
static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]);
@@ -79,6 +79,8 @@ static unsigned char shared_bank[NR_BANKS] = {
static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
+static void amd_threshold_interrupt(void);
+
/*
* CPU Initialization
*/
@@ -90,7 +92,8 @@ struct thresh_restart {
};
/* must be called with correct cpu affinity */
-static long threshold_restart_bank(void *_tr)
+/* Called via smp_call_function_single() */
+static void threshold_restart_bank(void *_tr)
{
struct thresh_restart *tr = _tr;
u32 mci_misc_hi, mci_misc_lo;
@@ -117,7 +120,6 @@ static long threshold_restart_bank(void *_tr)
mci_misc_hi |= MASK_COUNT_EN_HI;
wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
- return 0;
}
/* cpu init entry point, called from mce.c with preempt off */
@@ -174,6 +176,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
tr.reset = 0;
tr.old_limit = 0;
threshold_restart_bank(&tr);
+
+ mce_threshold_vector = amd_threshold_interrupt;
}
}
}
@@ -187,19 +191,13 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
* the interrupt goes off when error_count reaches threshold_limit.
* the handler will simply log mcelog w/ software defined bank number.
*/
-asmlinkage void mce_threshold_interrupt(void)
+static void amd_threshold_interrupt(void)
{
unsigned int bank, block;
struct mce m;
u32 low = 0, high = 0, address = 0;
- ack_APIC_irq();
- exit_idle();
- irq_enter();
-
- memset(&m, 0, sizeof(m));
- rdtscll(m.tsc);
- m.cpu = smp_processor_id();
+ mce_setup(&m);
/* assume first bank caused it */
for (bank = 0; bank < NR_BANKS; ++bank) {
@@ -233,7 +231,8 @@ asmlinkage void mce_threshold_interrupt(void)
/* Log the machine check that caused the threshold
event. */
- do_machine_check(NULL, 0);
+ machine_check_poll(MCP_TIMESTAMP,
+ &__get_cpu_var(mce_poll_banks));
if (high & MASK_OVERFLOW_HI) {
rdmsrl(address, m.misc);
@@ -243,13 +242,10 @@ asmlinkage void mce_threshold_interrupt(void)
+ bank * NR_BLOCKS
+ block;
mce_log(&m);
- goto out;
+ return;
}
}
}
-out:
- inc_irq_stat(irq_threshold_count);
- irq_exit();
}
/*
@@ -283,7 +279,7 @@ static ssize_t store_interrupt_enable(struct threshold_block *b,
tr.b = b;
tr.reset = 0;
tr.old_limit = 0;
- work_on_cpu(b->cpu, threshold_restart_bank, &tr);
+ smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
return end - buf;
}
@@ -305,23 +301,32 @@ static ssize_t store_threshold_limit(struct threshold_block *b,
tr.b = b;
tr.reset = 0;
- work_on_cpu(b->cpu, threshold_restart_bank, &tr);
+ smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
return end - buf;
}
-static long local_error_count(void *_b)
+struct threshold_block_cross_cpu {
+ struct threshold_block *tb;
+ long retval;
+};
+
+static void local_error_count_handler(void *_tbcc)
{
- struct threshold_block *b = _b;
+ struct threshold_block_cross_cpu *tbcc = _tbcc;
+ struct threshold_block *b = tbcc->tb;
u32 low, high;
rdmsr(b->address, low, high);
- return (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit);
+ tbcc->retval = (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit);
}
static ssize_t show_error_count(struct threshold_block *b, char *buf)
{
- return sprintf(buf, "%lx\n", work_on_cpu(b->cpu, local_error_count, b));
+ struct threshold_block_cross_cpu tbcc = { .tb = b, };
+
+ smp_call_function_single(b->cpu, local_error_count_handler, &tbcc, 1);
+ return sprintf(buf, "%lx\n", tbcc.retval);
}
static ssize_t store_error_count(struct threshold_block *b,
@@ -329,7 +334,7 @@ static ssize_t store_error_count(struct threshold_block *b,
{
struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 };
- work_on_cpu(b->cpu, threshold_restart_bank, &tr);
+ smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
return 1;
}
@@ -398,7 +403,7 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
if ((bank >= NR_BANKS) || (block >= NR_BLOCKS))
return 0;
- if (rdmsr_safe(address, &low, &high))
+ if (rdmsr_safe_on_cpu(cpu, address, &low, &high))
return 0;
if (!(high & MASK_VALID_HI)) {
@@ -462,12 +467,11 @@ out_free:
return err;
}
-static __cpuinit long local_allocate_threshold_blocks(void *_bank)
+static __cpuinit long
+local_allocate_threshold_blocks(int cpu, unsigned int bank)
{
- unsigned int *bank = _bank;
-
- return allocate_threshold_blocks(smp_processor_id(), *bank, 0,
- MSR_IA32_MC0_MISC + *bank * 4);
+ return allocate_threshold_blocks(cpu, bank, 0,
+ MSR_IA32_MC0_MISC + bank * 4);
}
/* symlinks sibling shared banks to first core. first core owns dir/files. */
@@ -481,7 +485,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
#ifdef CONFIG_SMP
if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */
- i = first_cpu(per_cpu(cpu_core_map, cpu));
+ i = cpumask_first(&per_cpu(cpu_core_map, cpu));
/* first core not up yet */
if (cpu_data(i).cpu_core_id)
@@ -501,7 +505,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
if (err)
goto out;
- b->cpus = per_cpu(cpu_core_map, cpu);
+ cpumask_copy(b->cpus, &per_cpu(cpu_core_map, cpu));
per_cpu(threshold_banks, cpu)[bank] = b;
goto out;
}
@@ -512,24 +516,29 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
err = -ENOMEM;
goto out;
}
+ if (!alloc_cpumask_var(&b->cpus, GFP_KERNEL)) {
+ kfree(b);
+ err = -ENOMEM;
+ goto out;
+ }
b->kobj = kobject_create_and_add(name, &per_cpu(device_mce, cpu).kobj);
if (!b->kobj)
goto out_free;
#ifndef CONFIG_SMP
- b->cpus = CPU_MASK_ALL;
+ cpumask_setall(b->cpus);
#else
- b->cpus = per_cpu(cpu_core_map, cpu);
+ cpumask_copy(b->cpus, &per_cpu(cpu_core_map, cpu));
#endif
per_cpu(threshold_banks, cpu)[bank] = b;
- err = work_on_cpu(cpu, local_allocate_threshold_blocks, &bank);
+ err = local_allocate_threshold_blocks(cpu, bank);
if (err)
goto out_free;
- for_each_cpu_mask_nr(i, b->cpus) {
+ for_each_cpu(i, b->cpus) {
if (i == cpu)
continue;
@@ -545,6 +554,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
out_free:
per_cpu(threshold_banks, cpu)[bank] = NULL;
+ free_cpumask_var(b->cpus);
kfree(b);
out:
return err;
@@ -619,7 +629,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
#endif
/* remove all sibling symlinks before unregistering */
- for_each_cpu_mask_nr(i, b->cpus) {
+ for_each_cpu(i, b->cpus) {
if (i == cpu)
continue;
@@ -632,6 +642,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
free_out:
kobject_del(b->kobj);
kobject_put(b->kobj);
+ free_cpumask_var(b->cpus);
kfree(b);
per_cpu(threshold_banks, cpu)[bank] = NULL;
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index f44c3662436..57df3d38347 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -1,17 +1,21 @@
/*
* Intel specific MCE features.
* Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
+ * Copyright (C) 2008, 2009 Intel Corporation
+ * Author: Andi Kleen
*/
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/percpu.h>
#include <asm/processor.h>
+#include <asm/apic.h>
#include <asm/msr.h>
#include <asm/mce.h>
#include <asm/hw_irq.h>
#include <asm/idle.h>
#include <asm/therm_throt.h>
+#include <asm/apic.h>
asmlinkage void smp_thermal_interrupt(void)
{
@@ -24,7 +28,7 @@ asmlinkage void smp_thermal_interrupt(void)
rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
if (therm_throt_process(msr_val & 1))
- mce_log_therm_throt_event(smp_processor_id(), msr_val);
+ mce_log_therm_throt_event(msr_val);
inc_irq_stat(irq_thermal_count);
irq_exit();
@@ -48,13 +52,13 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
*/
rdmsr(MSR_IA32_MISC_ENABLE, l, h);
h = apic_read(APIC_LVTTHMR);
- if ((l & (1 << 3)) && (h & APIC_DM_SMI)) {
+ if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
printk(KERN_DEBUG
"CPU%d: Thermal monitoring handled by SMI\n", cpu);
return;
}
- if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13)))
+ if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2))
tm2 = 1;
if (h & APIC_VECTOR_MASK) {
@@ -72,7 +76,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h);
rdmsr(MSR_IA32_MISC_ENABLE, l, h);
- wrmsr(MSR_IA32_MISC_ENABLE, l | (1 << 3), h);
+ wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
l = apic_read(APIC_LVTTHMR);
apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
@@ -84,7 +88,209 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
return;
}
+/*
+ * Support for Intel Correct Machine Check Interrupts. This allows
+ * the CPU to raise an interrupt when a corrected machine check happened.
+ * Normally we pick those up using a regular polling timer.
+ * Also supports reliable discovery of shared banks.
+ */
+
+static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
+
+/*
+ * cmci_discover_lock protects against parallel discovery attempts
+ * which could race against each other.
+ */
+static DEFINE_SPINLOCK(cmci_discover_lock);
+
+#define CMCI_THRESHOLD 1
+
+static int cmci_supported(int *banks)
+{
+ u64 cap;
+
+ /*
+ * Vendor check is not strictly needed, but the initial
+ * initialization is vendor keyed and this
+ * makes sure none of the backdoors are entered otherwise.
+ */
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return 0;
+ if (!cpu_has_apic || lapic_get_maxlvt() < 6)
+ return 0;
+ rdmsrl(MSR_IA32_MCG_CAP, cap);
+ *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff);
+ return !!(cap & MCG_CMCI_P);
+}
+
+/*
+ * The interrupt handler. This is called on every event.
+ * Just call the poller directly to log any events.
+ * This could in theory increase the threshold under high load,
+ * but doesn't for now.
+ */
+static void intel_threshold_interrupt(void)
+{
+ machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
+ mce_notify_user();
+}
+
+static void print_update(char *type, int *hdr, int num)
+{
+ if (*hdr == 0)
+ printk(KERN_INFO "CPU %d MCA banks", smp_processor_id());
+ *hdr = 1;
+ printk(KERN_CONT " %s:%d", type, num);
+}
+
+/*
+ * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
+ * on this CPU. Use the algorithm recommended in the SDM to discover shared
+ * banks.
+ */
+static void cmci_discover(int banks, int boot)
+{
+ unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);
+ int hdr = 0;
+ int i;
+
+ spin_lock(&cmci_discover_lock);
+ for (i = 0; i < banks; i++) {
+ u64 val;
+
+ if (test_bit(i, owned))
+ continue;
+
+ rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
+
+ /* Already owned by someone else? */
+ if (val & CMCI_EN) {
+ if (test_and_clear_bit(i, owned) || boot)
+ print_update("SHD", &hdr, i);
+ __clear_bit(i, __get_cpu_var(mce_poll_banks));
+ continue;
+ }
+
+ val |= CMCI_EN | CMCI_THRESHOLD;
+ wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
+ rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
+
+ /* Did the enable bit stick? -- the bank supports CMCI */
+ if (val & CMCI_EN) {
+ if (!test_and_set_bit(i, owned) || boot)
+ print_update("CMCI", &hdr, i);
+ __clear_bit(i, __get_cpu_var(mce_poll_banks));
+ } else {
+ WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
+ }
+ }
+ spin_unlock(&cmci_discover_lock);
+ if (hdr)
+ printk(KERN_CONT "\n");
+}
+
+/*
+ * Just in case we missed an event during initialization check
+ * all the CMCI owned banks.
+ */
+void cmci_recheck(void)
+{
+ unsigned long flags;
+ int banks;
+
+ if (!mce_available(&current_cpu_data) || !cmci_supported(&banks))
+ return;
+ local_irq_save(flags);
+ machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
+ local_irq_restore(flags);
+}
+
+/*
+ * Disable CMCI on this CPU for all banks it owns when it goes down.
+ * This allows other CPUs to claim the banks on rediscovery.
+ */
+void cmci_clear(void)
+{
+ int i;
+ int banks;
+ u64 val;
+
+ if (!cmci_supported(&banks))
+ return;
+ spin_lock(&cmci_discover_lock);
+ for (i = 0; i < banks; i++) {
+ if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
+ continue;
+ /* Disable CMCI */
+ rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
+ val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK);
+ wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
+ __clear_bit(i, __get_cpu_var(mce_banks_owned));
+ }
+ spin_unlock(&cmci_discover_lock);
+}
+
+/*
+ * After a CPU went down cycle through all the others and rediscover
+ * Must run in process context.
+ */
+void cmci_rediscover(int dying)
+{
+ int banks;
+ int cpu;
+ cpumask_var_t old;
+
+ if (!cmci_supported(&banks))
+ return;
+ if (!alloc_cpumask_var(&old, GFP_KERNEL))
+ return;
+ cpumask_copy(old, &current->cpus_allowed);
+
+ for_each_online_cpu (cpu) {
+ if (cpu == dying)
+ continue;
+ if (set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)))
+ continue;
+ /* Recheck banks in case CPUs don't all have the same */
+ if (cmci_supported(&banks))
+ cmci_discover(banks, 0);
+ }
+
+ set_cpus_allowed_ptr(current, old);
+ free_cpumask_var(old);
+}
+
+/*
+ * Reenable CMCI on this CPU in case a CPU down failed.
+ */
+void cmci_reenable(void)
+{
+ int banks;
+ if (cmci_supported(&banks))
+ cmci_discover(banks, 0);
+}
+
+static void intel_init_cmci(void)
+{
+ int banks;
+
+ if (!cmci_supported(&banks))
+ return;
+
+ mce_threshold_vector = intel_threshold_interrupt;
+ cmci_discover(banks, 1);
+ /*
+ * For CPU #0 this runs with still disabled APIC, but that's
+ * ok because only the vector is set up. We still do another
+ * check for the banks later for CPU #0 just to make sure
+ * to not miss any events.
+ */
+ apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED);
+ cmci_recheck();
+}
+
void mce_intel_feature_init(struct cpuinfo_x86 *c)
{
intel_init_thermal(c);
+ intel_init_cmci();
}
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
index 9b60fce09f7..f53bdcbaf38 100644
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ b/arch/x86/kernel/cpu/mcheck/p4.c
@@ -85,7 +85,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
*/
rdmsr(MSR_IA32_MISC_ENABLE, l, h);
h = apic_read(APIC_LVTTHMR);
- if ((l & (1<<3)) && (h & APIC_DM_SMI)) {
+ if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n",
cpu);
return; /* -EBUSY */
@@ -111,7 +111,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
vendor_thermal_interrupt = intel_thermal_interrupt;
rdmsr(MSR_IA32_MISC_ENABLE, l, h);
- wrmsr(MSR_IA32_MISC_ENABLE, l | (1<<3), h);
+ wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
l = apic_read(APIC_LVTTHMR);
apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
new file mode 100644
index 00000000000..23ee9e730f7
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -0,0 +1,29 @@
+/*
+ * Common corrected MCE threshold handler code:
+ */
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+
+#include <asm/irq_vectors.h>
+#include <asm/apic.h>
+#include <asm/idle.h>
+#include <asm/mce.h>
+
+static void default_threshold_interrupt(void)
+{
+ printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n",
+ THRESHOLD_APIC_VECTOR);
+}
+
+void (*mce_threshold_vector)(void) = default_threshold_interrupt;
+
+asmlinkage void mce_threshold_interrupt(void)
+{
+ exit_idle();
+ irq_enter();
+ inc_irq_stat(irq_threshold_count);
+ mce_threshold_vector();
+ irq_exit();
+ /* Ack only at the end to avoid potential reentry */
+ ack_APIC_irq();
+}
diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile
index 191fc053364..f4361b56f8e 100644
--- a/arch/x86/kernel/cpu/mtrr/Makefile
+++ b/arch/x86/kernel/cpu/mtrr/Makefile
@@ -1,3 +1,3 @@
-obj-y := main.o if.o generic.o state.o
+obj-y := main.o if.o generic.o state.o cleanup.o
obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
new file mode 100644
index 00000000000..ce0fe4b5c04
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -0,0 +1,1101 @@
+/* MTRR (Memory Type Range Register) cleanup
+
+ Copyright (C) 2009 Yinghai Lu
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Library General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Library General Public License for more details.
+
+ You should have received a copy of the GNU Library General Public
+ License along with this library; if not, write to the Free
+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/smp.h>
+#include <linux/cpu.h>
+#include <linux/mutex.h>
+#include <linux/sort.h>
+
+#include <asm/e820.h>
+#include <asm/mtrr.h>
+#include <asm/uaccess.h>
+#include <asm/processor.h>
+#include <asm/msr.h>
+#include <asm/kvm_para.h>
+#include "mtrr.h"
+
+/* should be related to MTRR_VAR_RANGES nums */
+#define RANGE_NUM 256
+
+struct res_range {
+ unsigned long start;
+ unsigned long end;
+};
+
+static int __init
+add_range(struct res_range *range, int nr_range, unsigned long start,
+ unsigned long end)
+{
+ /* out of slots */
+ if (nr_range >= RANGE_NUM)
+ return nr_range;
+
+ range[nr_range].start = start;
+ range[nr_range].end = end;
+
+ nr_range++;
+
+ return nr_range;
+}
+
+static int __init
+add_range_with_merge(struct res_range *range, int nr_range, unsigned long start,
+ unsigned long end)
+{
+ int i;
+
+ /* try to merge it with old one */
+ for (i = 0; i < nr_range; i++) {
+ unsigned long final_start, final_end;
+ unsigned long common_start, common_end;
+
+ if (!range[i].end)
+ continue;
+
+ common_start = max(range[i].start, start);
+ common_end = min(range[i].end, end);
+ if (common_start > common_end + 1)
+ continue;
+
+ final_start = min(range[i].start, start);
+ final_end = max(range[i].end, end);
+
+ range[i].start = final_start;
+ range[i].end = final_end;
+ return nr_range;
+ }
+
+ /* need to add that */
+ return add_range(range, nr_range, start, end);
+}
+
+static void __init
+subtract_range(struct res_range *range, unsigned long start, unsigned long end)
+{
+ int i, j;
+
+ for (j = 0; j < RANGE_NUM; j++) {
+ if (!range[j].end)
+ continue;
+
+ if (start <= range[j].start && end >= range[j].end) {
+ range[j].start = 0;
+ range[j].end = 0;
+ continue;
+ }
+
+ if (start <= range[j].start && end < range[j].end &&
+ range[j].start < end + 1) {
+ range[j].start = end + 1;
+ continue;
+ }
+
+
+ if (start > range[j].start && end >= range[j].end &&
+ range[j].end > start - 1) {
+ range[j].end = start - 1;
+ continue;
+ }
+
+ if (start > range[j].start && end < range[j].end) {
+ /* find the new spare */
+ for (i = 0; i < RANGE_NUM; i++) {
+ if (range[i].end == 0)
+ break;
+ }
+ if (i < RANGE_NUM) {
+ range[i].end = range[j].end;
+ range[i].start = end + 1;
+ } else {
+ printk(KERN_ERR "run of slot in ranges\n");
+ }
+ range[j].end = start - 1;
+ continue;
+ }
+ }
+}
+
+static int __init cmp_range(const void *x1, const void *x2)
+{
+ const struct res_range *r1 = x1;
+ const struct res_range *r2 = x2;
+ long start1, start2;
+
+ start1 = r1->start;
+ start2 = r2->start;
+
+ return start1 - start2;
+}
+
+struct var_mtrr_range_state {
+ unsigned long base_pfn;
+ unsigned long size_pfn;
+ mtrr_type type;
+};
+
+static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
+static int __initdata debug_print;
+
+static int __init
+x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
+ unsigned long extra_remove_base,
+ unsigned long extra_remove_size)
+{
+ unsigned long base, size;
+ mtrr_type type;
+ int i;
+
+ for (i = 0; i < num_var_ranges; i++) {
+ type = range_state[i].type;
+ if (type != MTRR_TYPE_WRBACK)
+ continue;
+ base = range_state[i].base_pfn;
+ size = range_state[i].size_pfn;
+ nr_range = add_range_with_merge(range, nr_range, base,
+ base + size - 1);
+ }
+ if (debug_print) {
+ printk(KERN_DEBUG "After WB checking\n");
+ for (i = 0; i < nr_range; i++)
+ printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
+ range[i].start, range[i].end + 1);
+ }
+
+ /* take out UC ranges */
+ for (i = 0; i < num_var_ranges; i++) {
+ type = range_state[i].type;
+ if (type != MTRR_TYPE_UNCACHABLE &&
+ type != MTRR_TYPE_WRPROT)
+ continue;
+ size = range_state[i].size_pfn;
+ if (!size)
+ continue;
+ base = range_state[i].base_pfn;
+ if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed &&
+ (mtrr_state.enabled & 1)) {
+ /* Var MTRR contains UC entry below 1M? Skip it: */
+ printk(KERN_WARNING "WARNING: BIOS bug: VAR MTRR %d "
+ "contains strange UC entry under 1M, check "
+ "with your system vendor!\n", i);
+ if (base + size <= (1<<(20-PAGE_SHIFT)))
+ continue;
+ size -= (1<<(20-PAGE_SHIFT)) - base;
+ base = 1<<(20-PAGE_SHIFT);
+ }
+ subtract_range(range, base, base + size - 1);
+ }
+ if (extra_remove_size)
+ subtract_range(range, extra_remove_base,
+ extra_remove_base + extra_remove_size - 1);
+
+ /* get new range num */
+ nr_range = 0;
+ for (i = 0; i < RANGE_NUM; i++) {
+ if (!range[i].end)
+ continue;
+ nr_range++;
+ }
+ if (debug_print) {
+ printk(KERN_DEBUG "After UC checking\n");
+ for (i = 0; i < nr_range; i++)
+ printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
+ range[i].start, range[i].end + 1);
+ }
+
+ /* sort the ranges */
+ sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
+ if (debug_print) {
+ printk(KERN_DEBUG "After sorting\n");
+ for (i = 0; i < nr_range; i++)
+ printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
+ range[i].start, range[i].end + 1);
+ }
+
+ /* clear those is not used */
+ for (i = nr_range; i < RANGE_NUM; i++)
+ memset(&range[i], 0, sizeof(range[i]));
+
+ return nr_range;
+}
+
+static struct res_range __initdata range[RANGE_NUM];
+static int __initdata nr_range;
+
+#ifdef CONFIG_MTRR_SANITIZER
+
+static unsigned long __init sum_ranges(struct res_range *range, int nr_range)
+{
+ unsigned long sum;
+ int i;
+
+ sum = 0;
+ for (i = 0; i < nr_range; i++)
+ sum += range[i].end + 1 - range[i].start;
+
+ return sum;
+}
+
+static int enable_mtrr_cleanup __initdata =
+ CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT;
+
+static int __init disable_mtrr_cleanup_setup(char *str)
+{
+ enable_mtrr_cleanup = 0;
+ return 0;
+}
+early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup);
+
+static int __init enable_mtrr_cleanup_setup(char *str)
+{
+ enable_mtrr_cleanup = 1;
+ return 0;
+}
+early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
+
+static int __init mtrr_cleanup_debug_setup(char *str)
+{
+ debug_print = 1;
+ return 0;
+}
+early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
+
+struct var_mtrr_state {
+ unsigned long range_startk;
+ unsigned long range_sizek;
+ unsigned long chunk_sizek;
+ unsigned long gran_sizek;
+ unsigned int reg;
+};
+
+static void __init
+set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
+ unsigned char type, unsigned int address_bits)
+{
+ u32 base_lo, base_hi, mask_lo, mask_hi;
+ u64 base, mask;
+
+ if (!sizek) {
+ fill_mtrr_var_range(reg, 0, 0, 0, 0);
+ return;
+ }
+
+ mask = (1ULL << address_bits) - 1;
+ mask &= ~((((u64)sizek) << 10) - 1);
+
+ base = ((u64)basek) << 10;
+
+ base |= type;
+ mask |= 0x800;
+
+ base_lo = base & ((1ULL<<32) - 1);
+ base_hi = base >> 32;
+
+ mask_lo = mask & ((1ULL<<32) - 1);
+ mask_hi = mask >> 32;
+
+ fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi);
+}
+
+static void __init
+save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
+ unsigned char type)
+{
+ range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10);
+ range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10);
+ range_state[reg].type = type;
+}
+
+static void __init
+set_var_mtrr_all(unsigned int address_bits)
+{
+ unsigned long basek, sizek;
+ unsigned char type;
+ unsigned int reg;
+
+ for (reg = 0; reg < num_var_ranges; reg++) {
+ basek = range_state[reg].base_pfn << (PAGE_SHIFT - 10);
+ sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10);
+ type = range_state[reg].type;
+
+ set_var_mtrr(reg, basek, sizek, type, address_bits);
+ }
+}
+
+static unsigned long to_size_factor(unsigned long sizek, char *factorp)
+{
+ char factor;
+ unsigned long base = sizek;
+
+ if (base & ((1<<10) - 1)) {
+ /* not MB alignment */
+ factor = 'K';
+ } else if (base & ((1<<20) - 1)) {
+ factor = 'M';
+ base >>= 10;
+ } else {
+ factor = 'G';
+ base >>= 20;
+ }
+
+ *factorp = factor;
+
+ return base;
+}
+
+static unsigned int __init
+range_to_mtrr(unsigned int reg, unsigned long range_startk,
+ unsigned long range_sizek, unsigned char type)
+{
+ if (!range_sizek || (reg >= num_var_ranges))
+ return reg;
+
+ while (range_sizek) {
+ unsigned long max_align, align;
+ unsigned long sizek;
+
+ /* Compute the maximum size I can make a range */
+ if (range_startk)
+ max_align = ffs(range_startk) - 1;
+ else
+ max_align = 32;
+ align = fls(range_sizek) - 1;
+ if (align > max_align)
+ align = max_align;
+
+ sizek = 1 << align;
+ if (debug_print) {
+ char start_factor = 'K', size_factor = 'K';
+ unsigned long start_base, size_base;
+
+ start_base = to_size_factor(range_startk,
+ &start_factor),
+ size_base = to_size_factor(sizek, &size_factor),
+
+ printk(KERN_DEBUG "Setting variable MTRR %d, "
+ "base: %ld%cB, range: %ld%cB, type %s\n",
+ reg, start_base, start_factor,
+ size_base, size_factor,
+ (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
+ ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other")
+ );
+ }
+ save_var_mtrr(reg++, range_startk, sizek, type);
+ range_startk += sizek;
+ range_sizek -= sizek;
+ if (reg >= num_var_ranges)
+ break;
+ }
+ return reg;
+}
+
+static unsigned __init
+range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
+ unsigned long sizek)
+{
+ unsigned long hole_basek, hole_sizek;
+ unsigned long second_basek, second_sizek;
+ unsigned long range0_basek, range0_sizek;
+ unsigned long range_basek, range_sizek;
+ unsigned long chunk_sizek;
+ unsigned long gran_sizek;
+
+ hole_basek = 0;
+ hole_sizek = 0;
+ second_basek = 0;
+ second_sizek = 0;
+ chunk_sizek = state->chunk_sizek;
+ gran_sizek = state->gran_sizek;
+
+ /* align with gran size, prevent small block used up MTRRs */
+ range_basek = ALIGN(state->range_startk, gran_sizek);
+ if ((range_basek > basek) && basek)
+ return second_sizek;
+ state->range_sizek -= (range_basek - state->range_startk);
+ range_sizek = ALIGN(state->range_sizek, gran_sizek);
+
+ while (range_sizek > state->range_sizek) {
+ range_sizek -= gran_sizek;
+ if (!range_sizek)
+ return 0;
+ }
+ state->range_sizek = range_sizek;
+
+ /* try to append some small hole */
+ range0_basek = state->range_startk;
+ range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
+
+ /* no increase */
+ if (range0_sizek == state->range_sizek) {
+ if (debug_print)
+ printk(KERN_DEBUG "rangeX: %016lx - %016lx\n",
+ range0_basek<<10,
+ (range0_basek + state->range_sizek)<<10);
+ state->reg = range_to_mtrr(state->reg, range0_basek,
+ state->range_sizek, MTRR_TYPE_WRBACK);
+ return 0;
+ }
+
+ /* only cut back, when it is not the last */
+ if (sizek) {
+ while (range0_basek + range0_sizek > (basek + sizek)) {
+ if (range0_sizek >= chunk_sizek)
+ range0_sizek -= chunk_sizek;
+ else
+ range0_sizek = 0;
+
+ if (!range0_sizek)
+ break;
+ }
+ }
+
+second_try:
+ range_basek = range0_basek + range0_sizek;
+
+ /* one hole in the middle */
+ if (range_basek > basek && range_basek <= (basek + sizek))
+ second_sizek = range_basek - basek;
+
+ if (range0_sizek > state->range_sizek) {
+
+ /* one hole in middle or at end */
+ hole_sizek = range0_sizek - state->range_sizek - second_sizek;
+
+ /* hole size should be less than half of range0 size */
+ if (hole_sizek >= (range0_sizek >> 1) &&
+ range0_sizek >= chunk_sizek) {
+ range0_sizek -= chunk_sizek;
+ second_sizek = 0;
+ hole_sizek = 0;
+
+ goto second_try;
+ }
+ }
+
+ if (range0_sizek) {
+ if (debug_print)
+ printk(KERN_DEBUG "range0: %016lx - %016lx\n",
+ range0_basek<<10,
+ (range0_basek + range0_sizek)<<10);
+ state->reg = range_to_mtrr(state->reg, range0_basek,
+ range0_sizek, MTRR_TYPE_WRBACK);
+ }
+
+ if (range0_sizek < state->range_sizek) {
+ /* need to handle left over */
+ range_sizek = state->range_sizek - range0_sizek;
+
+ if (debug_print)
+ printk(KERN_DEBUG "range: %016lx - %016lx\n",
+ range_basek<<10,
+ (range_basek + range_sizek)<<10);
+ state->reg = range_to_mtrr(state->reg, range_basek,
+ range_sizek, MTRR_TYPE_WRBACK);
+ }
+
+ if (hole_sizek) {
+ hole_basek = range_basek - hole_sizek - second_sizek;
+ if (debug_print)
+ printk(KERN_DEBUG "hole: %016lx - %016lx\n",
+ hole_basek<<10,
+ (hole_basek + hole_sizek)<<10);
+ state->reg = range_to_mtrr(state->reg, hole_basek,
+ hole_sizek, MTRR_TYPE_UNCACHABLE);
+ }
+
+ return second_sizek;
+}
+
+static void __init
+set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn,
+ unsigned long size_pfn)
+{
+ unsigned long basek, sizek;
+ unsigned long second_sizek = 0;
+
+ if (state->reg >= num_var_ranges)
+ return;
+
+ basek = base_pfn << (PAGE_SHIFT - 10);
+ sizek = size_pfn << (PAGE_SHIFT - 10);
+
+ /* See if I can merge with the last range */
+ if ((basek <= 1024) ||
+ (state->range_startk + state->range_sizek == basek)) {
+ unsigned long endk = basek + sizek;
+ state->range_sizek = endk - state->range_startk;
+ return;
+ }
+ /* Write the range mtrrs */
+ if (state->range_sizek != 0)
+ second_sizek = range_to_mtrr_with_hole(state, basek, sizek);
+
+ /* Allocate an msr */
+ state->range_startk = basek + second_sizek;
+ state->range_sizek = sizek - second_sizek;
+}
+
+/* mininum size of mtrr block that can take hole */
+static u64 mtrr_chunk_size __initdata = (256ULL<<20);
+
+static int __init parse_mtrr_chunk_size_opt(char *p)
+{
+ if (!p)
+ return -EINVAL;
+ mtrr_chunk_size = memparse(p, &p);
+ return 0;
+}
+early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt);
+
+/* granity of mtrr of block */
+static u64 mtrr_gran_size __initdata;
+
+static int __init parse_mtrr_gran_size_opt(char *p)
+{
+ if (!p)
+ return -EINVAL;
+ mtrr_gran_size = memparse(p, &p);
+ return 0;
+}
+early_param("mtrr_gran_size", parse_mtrr_gran_size_opt);
+
+static int nr_mtrr_spare_reg __initdata =
+ CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT;
+
+static int __init parse_mtrr_spare_reg(char *arg)
+{
+ if (arg)
+ nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0);
+ return 0;
+}
+
+early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
+
+static int __init
+x86_setup_var_mtrrs(struct res_range *range, int nr_range,
+ u64 chunk_size, u64 gran_size)
+{
+ struct var_mtrr_state var_state;
+ int i;
+ int num_reg;
+
+ var_state.range_startk = 0;
+ var_state.range_sizek = 0;
+ var_state.reg = 0;
+ var_state.chunk_sizek = chunk_size >> 10;
+ var_state.gran_sizek = gran_size >> 10;
+
+ memset(range_state, 0, sizeof(range_state));
+
+ /* Write the range etc */
+ for (i = 0; i < nr_range; i++)
+ set_var_mtrr_range(&var_state, range[i].start,
+ range[i].end - range[i].start + 1);
+
+ /* Write the last range */
+ if (var_state.range_sizek != 0)
+ range_to_mtrr_with_hole(&var_state, 0, 0);
+
+ num_reg = var_state.reg;
+ /* Clear out the extra MTRR's */
+ while (var_state.reg < num_var_ranges) {
+ save_var_mtrr(var_state.reg, 0, 0, 0);
+ var_state.reg++;
+ }
+
+ return num_reg;
+}
+
+struct mtrr_cleanup_result {
+ unsigned long gran_sizek;
+ unsigned long chunk_sizek;
+ unsigned long lose_cover_sizek;
+ unsigned int num_reg;
+ int bad;
+};
+
+/*
+ * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G
+ * chunk size: gran_size, ..., 2G
+ * so we need (1+16)*8
+ */
+#define NUM_RESULT 136
+#define PSHIFT (PAGE_SHIFT - 10)
+
+static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
+static unsigned long __initdata min_loss_pfn[RANGE_NUM];
+
+static void __init print_out_mtrr_range_state(void)
+{
+ int i;
+ char start_factor = 'K', size_factor = 'K';
+ unsigned long start_base, size_base;
+ mtrr_type type;
+
+ for (i = 0; i < num_var_ranges; i++) {
+
+ size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
+ if (!size_base)
+ continue;
+
+ size_base = to_size_factor(size_base, &size_factor),
+ start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
+ start_base = to_size_factor(start_base, &start_factor),
+ type = range_state[i].type;
+
+ printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
+ i, start_base, start_factor,
+ size_base, size_factor,
+ (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
+ ((type == MTRR_TYPE_WRPROT) ? "WP" :
+ ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
+ );
+ }
+}
+
+static int __init mtrr_need_cleanup(void)
+{
+ int i;
+ mtrr_type type;
+ unsigned long size;
+ /* extra one for all 0 */
+ int num[MTRR_NUM_TYPES + 1];
+
+ /* check entries number */
+ memset(num, 0, sizeof(num));
+ for (i = 0; i < num_var_ranges; i++) {
+ type = range_state[i].type;
+ size = range_state[i].size_pfn;
+ if (type >= MTRR_NUM_TYPES)
+ continue;
+ if (!size)
+ type = MTRR_NUM_TYPES;
+ if (type == MTRR_TYPE_WRPROT)
+ type = MTRR_TYPE_UNCACHABLE;
+ num[type]++;
+ }
+
+ /* check if we got UC entries */
+ if (!num[MTRR_TYPE_UNCACHABLE])
+ return 0;
+
+ /* check if we only had WB and UC */
+ if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
+ num_var_ranges - num[MTRR_NUM_TYPES])
+ return 0;
+
+ return 1;
+}
+
+static unsigned long __initdata range_sums;
+static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
+ unsigned long extra_remove_base,
+ unsigned long extra_remove_size,
+ int i)
+{
+ int num_reg;
+ static struct res_range range_new[RANGE_NUM];
+ static int nr_range_new;
+ unsigned long range_sums_new;
+
+ /* convert ranges to var ranges state */
+ num_reg = x86_setup_var_mtrrs(range, nr_range,
+ chunk_size, gran_size);
+
+ /* we got new setting in range_state, check it */
+ memset(range_new, 0, sizeof(range_new));
+ nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
+ extra_remove_base, extra_remove_size);
+ range_sums_new = sum_ranges(range_new, nr_range_new);
+
+ result[i].chunk_sizek = chunk_size >> 10;
+ result[i].gran_sizek = gran_size >> 10;
+ result[i].num_reg = num_reg;
+ if (range_sums < range_sums_new) {
+ result[i].lose_cover_sizek =
+ (range_sums_new - range_sums) << PSHIFT;
+ result[i].bad = 1;
+ } else
+ result[i].lose_cover_sizek =
+ (range_sums - range_sums_new) << PSHIFT;
+
+ /* double check it */
+ if (!result[i].bad && !result[i].lose_cover_sizek) {
+ if (nr_range_new != nr_range ||
+ memcmp(range, range_new, sizeof(range)))
+ result[i].bad = 1;
+ }
+
+ if (!result[i].bad && (range_sums - range_sums_new <
+ min_loss_pfn[num_reg])) {
+ min_loss_pfn[num_reg] =
+ range_sums - range_sums_new;
+ }
+}
+
+static void __init mtrr_print_out_one_result(int i)
+{
+ char gran_factor, chunk_factor, lose_factor;
+ unsigned long gran_base, chunk_base, lose_base;
+
+ gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
+ chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
+ lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
+ printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
+ result[i].bad ? "*BAD*" : " ",
+ gran_base, gran_factor, chunk_base, chunk_factor);
+ printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
+ result[i].num_reg, result[i].bad ? "-" : "",
+ lose_base, lose_factor);
+}
+
+static int __init mtrr_search_optimal_index(void)
+{
+ int i;
+ int num_reg_good;
+ int index_good;
+
+ if (nr_mtrr_spare_reg >= num_var_ranges)
+ nr_mtrr_spare_reg = num_var_ranges - 1;
+ num_reg_good = -1;
+ for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
+ if (!min_loss_pfn[i])
+ num_reg_good = i;
+ }
+
+ index_good = -1;
+ if (num_reg_good != -1) {
+ for (i = 0; i < NUM_RESULT; i++) {
+ if (!result[i].bad &&
+ result[i].num_reg == num_reg_good &&
+ !result[i].lose_cover_sizek) {
+ index_good = i;
+ break;
+ }
+ }
+ }
+
+ return index_good;
+}
+
+
+int __init mtrr_cleanup(unsigned address_bits)
+{
+ unsigned long extra_remove_base, extra_remove_size;
+ unsigned long base, size, def, dummy;
+ mtrr_type type;
+ u64 chunk_size, gran_size;
+ int index_good;
+ int i;
+
+ if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
+ return 0;
+ rdmsr(MTRRdefType_MSR, def, dummy);
+ def &= 0xff;
+ if (def != MTRR_TYPE_UNCACHABLE)
+ return 0;
+
+ /* get it and store it aside */
+ memset(range_state, 0, sizeof(range_state));
+ for (i = 0; i < num_var_ranges; i++) {
+ mtrr_if->get(i, &base, &size, &type);
+ range_state[i].base_pfn = base;
+ range_state[i].size_pfn = size;
+ range_state[i].type = type;
+ }
+
+ /* check if we need handle it and can handle it */
+ if (!mtrr_need_cleanup())
+ return 0;
+
+ /* print original var MTRRs at first, for debugging: */
+ printk(KERN_DEBUG "original variable MTRRs\n");
+ print_out_mtrr_range_state();
+
+ memset(range, 0, sizeof(range));
+ extra_remove_size = 0;
+ extra_remove_base = 1 << (32 - PAGE_SHIFT);
+ if (mtrr_tom2)
+ extra_remove_size =
+ (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base;
+ nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base,
+ extra_remove_size);
+ /*
+ * [0, 1M) should always be coverred by var mtrr with WB
+ * and fixed mtrrs should take effective before var mtrr for it
+ */
+ nr_range = add_range_with_merge(range, nr_range, 0,
+ (1ULL<<(20 - PAGE_SHIFT)) - 1);
+ /* sort the ranges */
+ sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
+
+ range_sums = sum_ranges(range, nr_range);
+ printk(KERN_INFO "total RAM coverred: %ldM\n",
+ range_sums >> (20 - PAGE_SHIFT));
+
+ if (mtrr_chunk_size && mtrr_gran_size) {
+ i = 0;
+ mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size,
+ extra_remove_base, extra_remove_size, i);
+
+ mtrr_print_out_one_result(i);
+
+ if (!result[i].bad) {
+ set_var_mtrr_all(address_bits);
+ printk(KERN_DEBUG "New variable MTRRs\n");
+ print_out_mtrr_range_state();
+ return 1;
+ }
+ printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
+ "will find optimal one\n");
+ }
+
+ i = 0;
+ memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
+ memset(result, 0, sizeof(result));
+ for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
+
+ for (chunk_size = gran_size; chunk_size < (1ULL<<32);
+ chunk_size <<= 1) {
+
+ if (i >= NUM_RESULT)
+ continue;
+
+ mtrr_calc_range_state(chunk_size, gran_size,
+ extra_remove_base, extra_remove_size, i);
+ if (debug_print) {
+ mtrr_print_out_one_result(i);
+ printk(KERN_INFO "\n");
+ }
+
+ i++;
+ }
+ }
+
+ /* try to find the optimal index */
+ index_good = mtrr_search_optimal_index();
+
+ if (index_good != -1) {
+ printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
+ i = index_good;
+ mtrr_print_out_one_result(i);
+
+ /* convert ranges to var ranges state */
+ chunk_size = result[i].chunk_sizek;
+ chunk_size <<= 10;
+ gran_size = result[i].gran_sizek;
+ gran_size <<= 10;
+ x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
+ set_var_mtrr_all(address_bits);
+ printk(KERN_DEBUG "New variable MTRRs\n");
+ print_out_mtrr_range_state();
+ return 1;
+ } else {
+ /* print out all */
+ for (i = 0; i < NUM_RESULT; i++)
+ mtrr_print_out_one_result(i);
+ }
+
+ printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n");
+ printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n");
+
+ return 0;
+}
+#else
+int __init mtrr_cleanup(unsigned address_bits)
+{
+ return 0;
+}
+#endif
+
+static int disable_mtrr_trim;
+
+static int __init disable_mtrr_trim_setup(char *str)
+{
+ disable_mtrr_trim = 1;
+ return 0;
+}
+early_param("disable_mtrr_trim", disable_mtrr_trim_setup);
+
+/*
+ * Newer AMD K8s and later CPUs have a special magic MSR way to force WB
+ * for memory >4GB. Check for that here.
+ * Note this won't check if the MTRRs < 4GB where the magic bit doesn't
+ * apply to are wrong, but so far we don't know of any such case in the wild.
+ */
+#define Tom2Enabled (1U << 21)
+#define Tom2ForceMemTypeWB (1U << 22)
+
+int __init amd_special_default_mtrr(void)
+{
+ u32 l, h;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+ return 0;
+ if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
+ return 0;
+ /* In case some hypervisor doesn't pass SYSCFG through */
+ if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
+ return 0;
+ /*
+ * Memory between 4GB and top of mem is forced WB by this magic bit.
+ * Reserved before K8RevF, but should be zero there.
+ */
+ if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) ==
+ (Tom2Enabled | Tom2ForceMemTypeWB))
+ return 1;
+ return 0;
+}
+
+static u64 __init real_trim_memory(unsigned long start_pfn,
+ unsigned long limit_pfn)
+{
+ u64 trim_start, trim_size;
+ trim_start = start_pfn;
+ trim_start <<= PAGE_SHIFT;
+ trim_size = limit_pfn;
+ trim_size <<= PAGE_SHIFT;
+ trim_size -= trim_start;
+
+ return e820_update_range(trim_start, trim_size, E820_RAM,
+ E820_RESERVED);
+}
+/**
+ * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
+ * @end_pfn: ending page frame number
+ *
+ * Some buggy BIOSes don't setup the MTRRs properly for systems with certain
+ * memory configurations. This routine checks that the highest MTRR matches
+ * the end of memory, to make sure the MTRRs having a write back type cover
+ * all of the memory the kernel is intending to use. If not, it'll trim any
+ * memory off the end by adjusting end_pfn, removing it from the kernel's
+ * allocation pools, warning the user with an obnoxious message.
+ */
+int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
+{
+ unsigned long i, base, size, highest_pfn = 0, def, dummy;
+ mtrr_type type;
+ u64 total_trim_size;
+
+ /* extra one for all 0 */
+ int num[MTRR_NUM_TYPES + 1];
+ /*
+ * Make sure we only trim uncachable memory on machines that
+ * support the Intel MTRR architecture:
+ */
+ if (!is_cpu(INTEL) || disable_mtrr_trim)
+ return 0;
+ rdmsr(MTRRdefType_MSR, def, dummy);
+ def &= 0xff;
+ if (def != MTRR_TYPE_UNCACHABLE)
+ return 0;
+
+ /* get it and store it aside */
+ memset(range_state, 0, sizeof(range_state));
+ for (i = 0; i < num_var_ranges; i++) {
+ mtrr_if->get(i, &base, &size, &type);
+ range_state[i].base_pfn = base;
+ range_state[i].size_pfn = size;
+ range_state[i].type = type;
+ }
+
+ /* Find highest cached pfn */
+ for (i = 0; i < num_var_ranges; i++) {
+ type = range_state[i].type;
+ if (type != MTRR_TYPE_WRBACK)
+ continue;
+ base = range_state[i].base_pfn;
+ size = range_state[i].size_pfn;
+ if (highest_pfn < base + size)
+ highest_pfn = base + size;
+ }
+
+ /* kvm/qemu doesn't have mtrr set right, don't trim them all */
+ if (!highest_pfn) {
+ printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n");
+ return 0;
+ }
+
+ /* check entries number */
+ memset(num, 0, sizeof(num));
+ for (i = 0; i < num_var_ranges; i++) {
+ type = range_state[i].type;
+ if (type >= MTRR_NUM_TYPES)
+ continue;
+ size = range_state[i].size_pfn;
+ if (!size)
+ type = MTRR_NUM_TYPES;
+ num[type]++;
+ }
+
+ /* no entry for WB? */
+ if (!num[MTRR_TYPE_WRBACK])
+ return 0;
+
+ /* check if we only had WB and UC */
+ if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
+ num_var_ranges - num[MTRR_NUM_TYPES])
+ return 0;
+
+ memset(range, 0, sizeof(range));
+ nr_range = 0;
+ if (mtrr_tom2) {
+ range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
+ range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1;
+ if (highest_pfn < range[nr_range].end + 1)
+ highest_pfn = range[nr_range].end + 1;
+ nr_range++;
+ }
+ nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
+
+ total_trim_size = 0;
+ /* check the head */
+ if (range[0].start)
+ total_trim_size += real_trim_memory(0, range[0].start);
+ /* check the holes */
+ for (i = 0; i < nr_range - 1; i++) {
+ if (range[i].end + 1 < range[i+1].start)
+ total_trim_size += real_trim_memory(range[i].end + 1,
+ range[i+1].start);
+ }
+ /* check the top */
+ i = nr_range - 1;
+ if (range[i].end + 1 < end_pfn)
+ total_trim_size += real_trim_memory(range[i].end + 1,
+ end_pfn);
+
+ if (total_trim_size) {
+ printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover"
+ " all of memory, losing %lluMB of RAM.\n",
+ total_trim_size >> 20);
+
+ if (!changed_by_mtrr_cleanup)
+ WARN_ON(1);
+
+ printk(KERN_INFO "update e820 for mtrr\n");
+ update_e820();
+
+ return 1;
+ }
+
+ return 0;
+}
+
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 0c0a455fe95..37f28fc7cf9 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -33,13 +33,31 @@ u64 mtrr_tom2;
struct mtrr_state_type mtrr_state = {};
EXPORT_SYMBOL_GPL(mtrr_state);
-static int __initdata mtrr_show;
-static int __init mtrr_debug(char *opt)
+/**
+ * BIOS is expected to clear MtrrFixDramModEn bit, see for example
+ * "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD
+ * Opteron Processors" (26094 Rev. 3.30 February 2006), section
+ * "13.2.1.2 SYSCFG Register": "The MtrrFixDramModEn bit should be set
+ * to 1 during BIOS initalization of the fixed MTRRs, then cleared to
+ * 0 for operation."
+ */
+static inline void k8_check_syscfg_dram_mod_en(void)
{
- mtrr_show = 1;
- return 0;
+ u32 lo, hi;
+
+ if (!((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) &&
+ (boot_cpu_data.x86 >= 0x0f)))
+ return;
+
+ rdmsr(MSR_K8_SYSCFG, lo, hi);
+ if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) {
+ printk(KERN_ERR FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]"
+ " not cleared by BIOS, clearing this bit\n",
+ smp_processor_id());
+ lo &= ~K8_MTRRFIXRANGE_DRAM_MODIFY;
+ mtrr_wrmsr(MSR_K8_SYSCFG, lo, hi);
+ }
}
-early_param("mtrr.show", mtrr_debug);
/*
* Returns the effective MTRR type for the region
@@ -174,6 +192,8 @@ get_fixed_ranges(mtrr_type * frs)
unsigned int *p = (unsigned int *) frs;
int i;
+ k8_check_syscfg_dram_mod_en();
+
rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]);
for (i = 0; i < 2; i++)
@@ -188,18 +208,94 @@ void mtrr_save_fixed_ranges(void *info)
get_fixed_ranges(mtrr_state.fixed_ranges);
}
-static void print_fixed(unsigned base, unsigned step, const mtrr_type*types)
+static unsigned __initdata last_fixed_start;
+static unsigned __initdata last_fixed_end;
+static mtrr_type __initdata last_fixed_type;
+
+static void __init print_fixed_last(void)
+{
+ if (!last_fixed_end)
+ return;
+
+ printk(KERN_DEBUG " %05X-%05X %s\n", last_fixed_start,
+ last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type));
+
+ last_fixed_end = 0;
+}
+
+static void __init update_fixed_last(unsigned base, unsigned end,
+ mtrr_type type)
+{
+ last_fixed_start = base;
+ last_fixed_end = end;
+ last_fixed_type = type;
+}
+
+static void __init print_fixed(unsigned base, unsigned step,
+ const mtrr_type *types)
{
unsigned i;
- for (i = 0; i < 8; ++i, ++types, base += step)
- printk(KERN_INFO "MTRR %05X-%05X %s\n",
- base, base + step - 1, mtrr_attrib_to_str(*types));
+ for (i = 0; i < 8; ++i, ++types, base += step) {
+ if (last_fixed_end == 0) {
+ update_fixed_last(base, base + step, *types);
+ continue;
+ }
+ if (last_fixed_end == base && last_fixed_type == *types) {
+ last_fixed_end = base + step;
+ continue;
+ }
+ /* new segments: gap or different type */
+ print_fixed_last();
+ update_fixed_last(base, base + step, *types);
+ }
}
static void prepare_set(void);
static void post_set(void);
+static void __init print_mtrr_state(void)
+{
+ unsigned int i;
+ int high_width;
+
+ printk(KERN_DEBUG "MTRR default type: %s\n",
+ mtrr_attrib_to_str(mtrr_state.def_type));
+ if (mtrr_state.have_fixed) {
+ printk(KERN_DEBUG "MTRR fixed ranges %sabled:\n",
+ mtrr_state.enabled & 1 ? "en" : "dis");
+ print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
+ for (i = 0; i < 2; ++i)
+ print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8);
+ for (i = 0; i < 8; ++i)
+ print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8);
+
+ /* tail */
+ print_fixed_last();
+ }
+ printk(KERN_DEBUG "MTRR variable ranges %sabled:\n",
+ mtrr_state.enabled & 2 ? "en" : "dis");
+ high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4;
+ for (i = 0; i < num_var_ranges; ++i) {
+ if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
+ printk(KERN_DEBUG " %u base %0*X%05X000 mask %0*X%05X000 %s\n",
+ i,
+ high_width,
+ mtrr_state.var_ranges[i].base_hi,
+ mtrr_state.var_ranges[i].base_lo >> 12,
+ high_width,
+ mtrr_state.var_ranges[i].mask_hi,
+ mtrr_state.var_ranges[i].mask_lo >> 12,
+ mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff));
+ else
+ printk(KERN_DEBUG " %u disabled\n", i);
+ }
+ if (mtrr_tom2) {
+ printk(KERN_DEBUG "TOM2: %016llx aka %lldM\n",
+ mtrr_tom2, mtrr_tom2>>20);
+ }
+}
+
/* Grab all of the MTRR state for this CPU into *state */
void __init get_mtrr_state(void)
{
@@ -231,41 +327,9 @@ void __init get_mtrr_state(void)
mtrr_tom2 |= low;
mtrr_tom2 &= 0xffffff800000ULL;
}
- if (mtrr_show) {
- int high_width;
-
- printk(KERN_INFO "MTRR default type: %s\n", mtrr_attrib_to_str(mtrr_state.def_type));
- if (mtrr_state.have_fixed) {
- printk(KERN_INFO "MTRR fixed ranges %sabled:\n",
- mtrr_state.enabled & 1 ? "en" : "dis");
- print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
- for (i = 0; i < 2; ++i)
- print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8);
- for (i = 0; i < 8; ++i)
- print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8);
- }
- printk(KERN_INFO "MTRR variable ranges %sabled:\n",
- mtrr_state.enabled & 2 ? "en" : "dis");
- high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4;
- for (i = 0; i < num_var_ranges; ++i) {
- if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
- printk(KERN_INFO "MTRR %u base %0*X%05X000 mask %0*X%05X000 %s\n",
- i,
- high_width,
- mtrr_state.var_ranges[i].base_hi,
- mtrr_state.var_ranges[i].base_lo >> 12,
- high_width,
- mtrr_state.var_ranges[i].mask_hi,
- mtrr_state.var_ranges[i].mask_lo >> 12,
- mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff));
- else
- printk(KERN_INFO "MTRR %u disabled\n", i);
- }
- if (mtrr_tom2) {
- printk(KERN_INFO "TOM2: %016llx aka %lldM\n",
- mtrr_tom2, mtrr_tom2>>20);
- }
- }
+
+ print_mtrr_state();
+
mtrr_state_set = 1;
/* PAT setup for BP. We need to go through sync steps here */
@@ -308,27 +372,10 @@ void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b)
}
/**
- * Enable and allow read/write of extended fixed-range MTRR bits on K8 CPUs
- * see AMD publication no. 24593, chapter 3.2.1 for more information
- */
-static inline void k8_enable_fixed_iorrs(void)
-{
- unsigned lo, hi;
-
- rdmsr(MSR_K8_SYSCFG, lo, hi);
- mtrr_wrmsr(MSR_K8_SYSCFG, lo
- | K8_MTRRFIXRANGE_DRAM_ENABLE
- | K8_MTRRFIXRANGE_DRAM_MODIFY, hi);
-}
-
-/**
* set_fixed_range - checks & updates a fixed-range MTRR if it differs from the value it should have
* @msr: MSR address of the MTTR which should be checked and updated
* @changed: pointer which indicates whether the MTRR needed to be changed
* @msrwords: pointer to the MSR values which the MSR should have
- *
- * If K8 extentions are wanted, update the K8 SYSCFG MSR also.
- * See AMD publication no. 24593, chapter 7.8.1, page 233 for more information.
*/
static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
{
@@ -337,10 +384,6 @@ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
rdmsr(msr, lo, hi);
if (lo != msrwords[0] || hi != msrwords[1]) {
- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
- (boot_cpu_data.x86 >= 0x0f && boot_cpu_data.x86 <= 0x11) &&
- ((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK))
- k8_enable_fixed_iorrs();
mtrr_wrmsr(msr, msrwords[0], msrwords[1]);
*changed = true;
}
@@ -376,22 +419,31 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
{
unsigned int mask_lo, mask_hi, base_lo, base_hi;
unsigned int tmp, hi;
+ int cpu;
+
+ /*
+ * get_mtrr doesn't need to update mtrr_state, also it could be called
+ * from any cpu, so try to print it out directly.
+ */
+ cpu = get_cpu();
rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi);
+
if ((mask_lo & 0x800) == 0) {
/* Invalid (i.e. free) range */
*base = 0;
*size = 0;
*type = 0;
- return;
+ goto out_put_cpu;
}
rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi);
- /* Work out the shifted address mask. */
+ /* Work out the shifted address mask: */
tmp = mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT;
mask_lo = size_or_mask | tmp;
- /* Expand tmp with high bits to all 1s*/
+
+ /* Expand tmp with high bits to all 1s: */
hi = fls(tmp);
if (hi > 0) {
tmp |= ~((1<<(hi - 1)) - 1);
@@ -402,11 +454,19 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
}
}
- /* This works correctly if size is a power of two, i.e. a
- contiguous range. */
+ /*
+ * This works correctly if size is a power of two, i.e. a
+ * contiguous range:
+ */
*size = -mask_lo;
*base = base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT;
*type = base_lo & 0xff;
+
+ printk(KERN_DEBUG " get_mtrr: cpu%d reg%02d base=%010lx size=%010lx %s\n",
+ cpu, reg, *base, *size,
+ mtrr_attrib_to_str(*type & 0xff));
+out_put_cpu:
+ put_cpu();
}
/**
@@ -419,6 +479,8 @@ static int set_fixed_ranges(mtrr_type * frs)
bool changed = false;
int block=-1, range;
+ k8_check_syscfg_dram_mod_en();
+
while (fixed_range_blocks[++block].ranges)
for (range=0; range < fixed_range_blocks[block].ranges; range++)
set_fixed_range(fixed_range_blocks[block].base_msr + range,
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 236a401b825..03cda01f57c 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -574,7 +574,7 @@ struct mtrr_value {
unsigned long lsize;
};
-static struct mtrr_value mtrr_state[MTRR_MAX_VAR_RANGES];
+static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES];
static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
{
@@ -582,9 +582,9 @@ static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
for (i = 0; i < num_var_ranges; i++) {
mtrr_if->get(i,
- &mtrr_state[i].lbase,
- &mtrr_state[i].lsize,
- &mtrr_state[i].ltype);
+ &mtrr_value[i].lbase,
+ &mtrr_value[i].lsize,
+ &mtrr_value[i].ltype);
}
return 0;
}
@@ -594,11 +594,11 @@ static int mtrr_restore(struct sys_device * sysdev)
int i;
for (i = 0; i < num_var_ranges; i++) {
- if (mtrr_state[i].lsize)
+ if (mtrr_value[i].lsize)
set_mtrr(i,
- mtrr_state[i].lbase,
- mtrr_state[i].lsize,
- mtrr_state[i].ltype);
+ mtrr_value[i].lbase,
+ mtrr_value[i].lsize,
+ mtrr_value[i].ltype);
}
return 0;
}
@@ -610,1058 +610,7 @@ static struct sysdev_driver mtrr_sysdev_driver = {
.resume = mtrr_restore,
};
-/* should be related to MTRR_VAR_RANGES nums */
-#define RANGE_NUM 256
-
-struct res_range {
- unsigned long start;
- unsigned long end;
-};
-
-static int __init
-add_range(struct res_range *range, int nr_range, unsigned long start,
- unsigned long end)
-{
- /* out of slots */
- if (nr_range >= RANGE_NUM)
- return nr_range;
-
- range[nr_range].start = start;
- range[nr_range].end = end;
-
- nr_range++;
-
- return nr_range;
-}
-
-static int __init
-add_range_with_merge(struct res_range *range, int nr_range, unsigned long start,
- unsigned long end)
-{
- int i;
-
- /* try to merge it with old one */
- for (i = 0; i < nr_range; i++) {
- unsigned long final_start, final_end;
- unsigned long common_start, common_end;
-
- if (!range[i].end)
- continue;
-
- common_start = max(range[i].start, start);
- common_end = min(range[i].end, end);
- if (common_start > common_end + 1)
- continue;
-
- final_start = min(range[i].start, start);
- final_end = max(range[i].end, end);
-
- range[i].start = final_start;
- range[i].end = final_end;
- return nr_range;
- }
-
- /* need to add that */
- return add_range(range, nr_range, start, end);
-}
-
-static void __init
-subtract_range(struct res_range *range, unsigned long start, unsigned long end)
-{
- int i, j;
-
- for (j = 0; j < RANGE_NUM; j++) {
- if (!range[j].end)
- continue;
-
- if (start <= range[j].start && end >= range[j].end) {
- range[j].start = 0;
- range[j].end = 0;
- continue;
- }
-
- if (start <= range[j].start && end < range[j].end &&
- range[j].start < end + 1) {
- range[j].start = end + 1;
- continue;
- }
-
-
- if (start > range[j].start && end >= range[j].end &&
- range[j].end > start - 1) {
- range[j].end = start - 1;
- continue;
- }
-
- if (start > range[j].start && end < range[j].end) {
- /* find the new spare */
- for (i = 0; i < RANGE_NUM; i++) {
- if (range[i].end == 0)
- break;
- }
- if (i < RANGE_NUM) {
- range[i].end = range[j].end;
- range[i].start = end + 1;
- } else {
- printk(KERN_ERR "run of slot in ranges\n");
- }
- range[j].end = start - 1;
- continue;
- }
- }
-}
-
-static int __init cmp_range(const void *x1, const void *x2)
-{
- const struct res_range *r1 = x1;
- const struct res_range *r2 = x2;
- long start1, start2;
-
- start1 = r1->start;
- start2 = r2->start;
-
- return start1 - start2;
-}
-
-struct var_mtrr_range_state {
- unsigned long base_pfn;
- unsigned long size_pfn;
- mtrr_type type;
-};
-
-static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
-static int __initdata debug_print;
-
-static int __init
-x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
- unsigned long extra_remove_base,
- unsigned long extra_remove_size)
-{
- unsigned long i, base, size;
- mtrr_type type;
-
- for (i = 0; i < num_var_ranges; i++) {
- type = range_state[i].type;
- if (type != MTRR_TYPE_WRBACK)
- continue;
- base = range_state[i].base_pfn;
- size = range_state[i].size_pfn;
- nr_range = add_range_with_merge(range, nr_range, base,
- base + size - 1);
- }
- if (debug_print) {
- printk(KERN_DEBUG "After WB checking\n");
- for (i = 0; i < nr_range; i++)
- printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
- range[i].start, range[i].end + 1);
- }
-
- /* take out UC ranges */
- for (i = 0; i < num_var_ranges; i++) {
- type = range_state[i].type;
- if (type != MTRR_TYPE_UNCACHABLE &&
- type != MTRR_TYPE_WRPROT)
- continue;
- size = range_state[i].size_pfn;
- if (!size)
- continue;
- base = range_state[i].base_pfn;
- subtract_range(range, base, base + size - 1);
- }
- if (extra_remove_size)
- subtract_range(range, extra_remove_base,
- extra_remove_base + extra_remove_size - 1);
-
- /* get new range num */
- nr_range = 0;
- for (i = 0; i < RANGE_NUM; i++) {
- if (!range[i].end)
- continue;
- nr_range++;
- }
- if (debug_print) {
- printk(KERN_DEBUG "After UC checking\n");
- for (i = 0; i < nr_range; i++)
- printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
- range[i].start, range[i].end + 1);
- }
-
- /* sort the ranges */
- sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
- if (debug_print) {
- printk(KERN_DEBUG "After sorting\n");
- for (i = 0; i < nr_range; i++)
- printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
- range[i].start, range[i].end + 1);
- }
-
- /* clear those is not used */
- for (i = nr_range; i < RANGE_NUM; i++)
- memset(&range[i], 0, sizeof(range[i]));
-
- return nr_range;
-}
-
-static struct res_range __initdata range[RANGE_NUM];
-static int __initdata nr_range;
-
-#ifdef CONFIG_MTRR_SANITIZER
-
-static unsigned long __init sum_ranges(struct res_range *range, int nr_range)
-{
- unsigned long sum;
- int i;
-
- sum = 0;
- for (i = 0; i < nr_range; i++)
- sum += range[i].end + 1 - range[i].start;
-
- return sum;
-}
-
-static int enable_mtrr_cleanup __initdata =
- CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT;
-
-static int __init disable_mtrr_cleanup_setup(char *str)
-{
- enable_mtrr_cleanup = 0;
- return 0;
-}
-early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup);
-
-static int __init enable_mtrr_cleanup_setup(char *str)
-{
- enable_mtrr_cleanup = 1;
- return 0;
-}
-early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
-
-static int __init mtrr_cleanup_debug_setup(char *str)
-{
- debug_print = 1;
- return 0;
-}
-early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
-
-struct var_mtrr_state {
- unsigned long range_startk;
- unsigned long range_sizek;
- unsigned long chunk_sizek;
- unsigned long gran_sizek;
- unsigned int reg;
-};
-
-static void __init
-set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
- unsigned char type, unsigned int address_bits)
-{
- u32 base_lo, base_hi, mask_lo, mask_hi;
- u64 base, mask;
-
- if (!sizek) {
- fill_mtrr_var_range(reg, 0, 0, 0, 0);
- return;
- }
-
- mask = (1ULL << address_bits) - 1;
- mask &= ~((((u64)sizek) << 10) - 1);
-
- base = ((u64)basek) << 10;
-
- base |= type;
- mask |= 0x800;
-
- base_lo = base & ((1ULL<<32) - 1);
- base_hi = base >> 32;
-
- mask_lo = mask & ((1ULL<<32) - 1);
- mask_hi = mask >> 32;
-
- fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi);
-}
-
-static void __init
-save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
- unsigned char type)
-{
- range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10);
- range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10);
- range_state[reg].type = type;
-}
-
-static void __init
-set_var_mtrr_all(unsigned int address_bits)
-{
- unsigned long basek, sizek;
- unsigned char type;
- unsigned int reg;
-
- for (reg = 0; reg < num_var_ranges; reg++) {
- basek = range_state[reg].base_pfn << (PAGE_SHIFT - 10);
- sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10);
- type = range_state[reg].type;
-
- set_var_mtrr(reg, basek, sizek, type, address_bits);
- }
-}
-
-static unsigned long to_size_factor(unsigned long sizek, char *factorp)
-{
- char factor;
- unsigned long base = sizek;
-
- if (base & ((1<<10) - 1)) {
- /* not MB alignment */
- factor = 'K';
- } else if (base & ((1<<20) - 1)){
- factor = 'M';
- base >>= 10;
- } else {
- factor = 'G';
- base >>= 20;
- }
-
- *factorp = factor;
-
- return base;
-}
-
-static unsigned int __init
-range_to_mtrr(unsigned int reg, unsigned long range_startk,
- unsigned long range_sizek, unsigned char type)
-{
- if (!range_sizek || (reg >= num_var_ranges))
- return reg;
-
- while (range_sizek) {
- unsigned long max_align, align;
- unsigned long sizek;
-
- /* Compute the maximum size I can make a range */
- if (range_startk)
- max_align = ffs(range_startk) - 1;
- else
- max_align = 32;
- align = fls(range_sizek) - 1;
- if (align > max_align)
- align = max_align;
-
- sizek = 1 << align;
- if (debug_print) {
- char start_factor = 'K', size_factor = 'K';
- unsigned long start_base, size_base;
-
- start_base = to_size_factor(range_startk, &start_factor),
- size_base = to_size_factor(sizek, &size_factor),
-
- printk(KERN_DEBUG "Setting variable MTRR %d, "
- "base: %ld%cB, range: %ld%cB, type %s\n",
- reg, start_base, start_factor,
- size_base, size_factor,
- (type == MTRR_TYPE_UNCACHABLE)?"UC":
- ((type == MTRR_TYPE_WRBACK)?"WB":"Other")
- );
- }
- save_var_mtrr(reg++, range_startk, sizek, type);
- range_startk += sizek;
- range_sizek -= sizek;
- if (reg >= num_var_ranges)
- break;
- }
- return reg;
-}
-
-static unsigned __init
-range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
- unsigned long sizek)
-{
- unsigned long hole_basek, hole_sizek;
- unsigned long second_basek, second_sizek;
- unsigned long range0_basek, range0_sizek;
- unsigned long range_basek, range_sizek;
- unsigned long chunk_sizek;
- unsigned long gran_sizek;
-
- hole_basek = 0;
- hole_sizek = 0;
- second_basek = 0;
- second_sizek = 0;
- chunk_sizek = state->chunk_sizek;
- gran_sizek = state->gran_sizek;
-
- /* align with gran size, prevent small block used up MTRRs */
- range_basek = ALIGN(state->range_startk, gran_sizek);
- if ((range_basek > basek) && basek)
- return second_sizek;
- state->range_sizek -= (range_basek - state->range_startk);
- range_sizek = ALIGN(state->range_sizek, gran_sizek);
-
- while (range_sizek > state->range_sizek) {
- range_sizek -= gran_sizek;
- if (!range_sizek)
- return 0;
- }
- state->range_sizek = range_sizek;
-
- /* try to append some small hole */
- range0_basek = state->range_startk;
- range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
-
- /* no increase */
- if (range0_sizek == state->range_sizek) {
- if (debug_print)
- printk(KERN_DEBUG "rangeX: %016lx - %016lx\n",
- range0_basek<<10,
- (range0_basek + state->range_sizek)<<10);
- state->reg = range_to_mtrr(state->reg, range0_basek,
- state->range_sizek, MTRR_TYPE_WRBACK);
- return 0;
- }
-
- /* only cut back, when it is not the last */
- if (sizek) {
- while (range0_basek + range0_sizek > (basek + sizek)) {
- if (range0_sizek >= chunk_sizek)
- range0_sizek -= chunk_sizek;
- else
- range0_sizek = 0;
-
- if (!range0_sizek)
- break;
- }
- }
-
-second_try:
- range_basek = range0_basek + range0_sizek;
-
- /* one hole in the middle */
- if (range_basek > basek && range_basek <= (basek + sizek))
- second_sizek = range_basek - basek;
-
- if (range0_sizek > state->range_sizek) {
-
- /* one hole in middle or at end */
- hole_sizek = range0_sizek - state->range_sizek - second_sizek;
-
- /* hole size should be less than half of range0 size */
- if (hole_sizek >= (range0_sizek >> 1) &&
- range0_sizek >= chunk_sizek) {
- range0_sizek -= chunk_sizek;
- second_sizek = 0;
- hole_sizek = 0;
-
- goto second_try;
- }
- }
-
- if (range0_sizek) {
- if (debug_print)
- printk(KERN_DEBUG "range0: %016lx - %016lx\n",
- range0_basek<<10,
- (range0_basek + range0_sizek)<<10);
- state->reg = range_to_mtrr(state->reg, range0_basek,
- range0_sizek, MTRR_TYPE_WRBACK);
- }
-
- if (range0_sizek < state->range_sizek) {
- /* need to handle left over */
- range_sizek = state->range_sizek - range0_sizek;
-
- if (debug_print)
- printk(KERN_DEBUG "range: %016lx - %016lx\n",
- range_basek<<10,
- (range_basek + range_sizek)<<10);
- state->reg = range_to_mtrr(state->reg, range_basek,
- range_sizek, MTRR_TYPE_WRBACK);
- }
-
- if (hole_sizek) {
- hole_basek = range_basek - hole_sizek - second_sizek;
- if (debug_print)
- printk(KERN_DEBUG "hole: %016lx - %016lx\n",
- hole_basek<<10,
- (hole_basek + hole_sizek)<<10);
- state->reg = range_to_mtrr(state->reg, hole_basek,
- hole_sizek, MTRR_TYPE_UNCACHABLE);
- }
-
- return second_sizek;
-}
-
-static void __init
-set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn,
- unsigned long size_pfn)
-{
- unsigned long basek, sizek;
- unsigned long second_sizek = 0;
-
- if (state->reg >= num_var_ranges)
- return;
-
- basek = base_pfn << (PAGE_SHIFT - 10);
- sizek = size_pfn << (PAGE_SHIFT - 10);
-
- /* See if I can merge with the last range */
- if ((basek <= 1024) ||
- (state->range_startk + state->range_sizek == basek)) {
- unsigned long endk = basek + sizek;
- state->range_sizek = endk - state->range_startk;
- return;
- }
- /* Write the range mtrrs */
- if (state->range_sizek != 0)
- second_sizek = range_to_mtrr_with_hole(state, basek, sizek);
-
- /* Allocate an msr */
- state->range_startk = basek + second_sizek;
- state->range_sizek = sizek - second_sizek;
-}
-
-/* mininum size of mtrr block that can take hole */
-static u64 mtrr_chunk_size __initdata = (256ULL<<20);
-
-static int __init parse_mtrr_chunk_size_opt(char *p)
-{
- if (!p)
- return -EINVAL;
- mtrr_chunk_size = memparse(p, &p);
- return 0;
-}
-early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt);
-
-/* granity of mtrr of block */
-static u64 mtrr_gran_size __initdata;
-
-static int __init parse_mtrr_gran_size_opt(char *p)
-{
- if (!p)
- return -EINVAL;
- mtrr_gran_size = memparse(p, &p);
- return 0;
-}
-early_param("mtrr_gran_size", parse_mtrr_gran_size_opt);
-
-static int nr_mtrr_spare_reg __initdata =
- CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT;
-
-static int __init parse_mtrr_spare_reg(char *arg)
-{
- if (arg)
- nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0);
- return 0;
-}
-
-early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
-
-static int __init
-x86_setup_var_mtrrs(struct res_range *range, int nr_range,
- u64 chunk_size, u64 gran_size)
-{
- struct var_mtrr_state var_state;
- int i;
- int num_reg;
-
- var_state.range_startk = 0;
- var_state.range_sizek = 0;
- var_state.reg = 0;
- var_state.chunk_sizek = chunk_size >> 10;
- var_state.gran_sizek = gran_size >> 10;
-
- memset(range_state, 0, sizeof(range_state));
-
- /* Write the range etc */
- for (i = 0; i < nr_range; i++)
- set_var_mtrr_range(&var_state, range[i].start,
- range[i].end - range[i].start + 1);
-
- /* Write the last range */
- if (var_state.range_sizek != 0)
- range_to_mtrr_with_hole(&var_state, 0, 0);
-
- num_reg = var_state.reg;
- /* Clear out the extra MTRR's */
- while (var_state.reg < num_var_ranges) {
- save_var_mtrr(var_state.reg, 0, 0, 0);
- var_state.reg++;
- }
-
- return num_reg;
-}
-
-struct mtrr_cleanup_result {
- unsigned long gran_sizek;
- unsigned long chunk_sizek;
- unsigned long lose_cover_sizek;
- unsigned int num_reg;
- int bad;
-};
-
-/*
- * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G
- * chunk size: gran_size, ..., 2G
- * so we need (1+16)*8
- */
-#define NUM_RESULT 136
-#define PSHIFT (PAGE_SHIFT - 10)
-
-static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
-static unsigned long __initdata min_loss_pfn[RANGE_NUM];
-
-static void __init print_out_mtrr_range_state(void)
-{
- int i;
- char start_factor = 'K', size_factor = 'K';
- unsigned long start_base, size_base;
- mtrr_type type;
-
- for (i = 0; i < num_var_ranges; i++) {
-
- size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
- if (!size_base)
- continue;
-
- size_base = to_size_factor(size_base, &size_factor),
- start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
- start_base = to_size_factor(start_base, &start_factor),
- type = range_state[i].type;
-
- printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
- i, start_base, start_factor,
- size_base, size_factor,
- (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
- ((type == MTRR_TYPE_WRPROT) ? "WP" :
- ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
- );
- }
-}
-
-static int __init mtrr_need_cleanup(void)
-{
- int i;
- mtrr_type type;
- unsigned long size;
- /* extra one for all 0 */
- int num[MTRR_NUM_TYPES + 1];
-
- /* check entries number */
- memset(num, 0, sizeof(num));
- for (i = 0; i < num_var_ranges; i++) {
- type = range_state[i].type;
- size = range_state[i].size_pfn;
- if (type >= MTRR_NUM_TYPES)
- continue;
- if (!size)
- type = MTRR_NUM_TYPES;
- if (type == MTRR_TYPE_WRPROT)
- type = MTRR_TYPE_UNCACHABLE;
- num[type]++;
- }
-
- /* check if we got UC entries */
- if (!num[MTRR_TYPE_UNCACHABLE])
- return 0;
-
- /* check if we only had WB and UC */
- if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
- num_var_ranges - num[MTRR_NUM_TYPES])
- return 0;
-
- return 1;
-}
-
-static unsigned long __initdata range_sums;
-static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
- unsigned long extra_remove_base,
- unsigned long extra_remove_size,
- int i)
-{
- int num_reg;
- static struct res_range range_new[RANGE_NUM];
- static int nr_range_new;
- unsigned long range_sums_new;
-
- /* convert ranges to var ranges state */
- num_reg = x86_setup_var_mtrrs(range, nr_range,
- chunk_size, gran_size);
-
- /* we got new setting in range_state, check it */
- memset(range_new, 0, sizeof(range_new));
- nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
- extra_remove_base, extra_remove_size);
- range_sums_new = sum_ranges(range_new, nr_range_new);
-
- result[i].chunk_sizek = chunk_size >> 10;
- result[i].gran_sizek = gran_size >> 10;
- result[i].num_reg = num_reg;
- if (range_sums < range_sums_new) {
- result[i].lose_cover_sizek =
- (range_sums_new - range_sums) << PSHIFT;
- result[i].bad = 1;
- } else
- result[i].lose_cover_sizek =
- (range_sums - range_sums_new) << PSHIFT;
-
- /* double check it */
- if (!result[i].bad && !result[i].lose_cover_sizek) {
- if (nr_range_new != nr_range ||
- memcmp(range, range_new, sizeof(range)))
- result[i].bad = 1;
- }
-
- if (!result[i].bad && (range_sums - range_sums_new <
- min_loss_pfn[num_reg])) {
- min_loss_pfn[num_reg] =
- range_sums - range_sums_new;
- }
-}
-
-static void __init mtrr_print_out_one_result(int i)
-{
- char gran_factor, chunk_factor, lose_factor;
- unsigned long gran_base, chunk_base, lose_base;
-
- gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
- chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
- lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
- printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
- result[i].bad ? "*BAD*" : " ",
- gran_base, gran_factor, chunk_base, chunk_factor);
- printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
- result[i].num_reg, result[i].bad ? "-" : "",
- lose_base, lose_factor);
-}
-
-static int __init mtrr_search_optimal_index(void)
-{
- int i;
- int num_reg_good;
- int index_good;
-
- if (nr_mtrr_spare_reg >= num_var_ranges)
- nr_mtrr_spare_reg = num_var_ranges - 1;
- num_reg_good = -1;
- for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
- if (!min_loss_pfn[i])
- num_reg_good = i;
- }
-
- index_good = -1;
- if (num_reg_good != -1) {
- for (i = 0; i < NUM_RESULT; i++) {
- if (!result[i].bad &&
- result[i].num_reg == num_reg_good &&
- !result[i].lose_cover_sizek) {
- index_good = i;
- break;
- }
- }
- }
-
- return index_good;
-}
-
-
-static int __init mtrr_cleanup(unsigned address_bits)
-{
- unsigned long extra_remove_base, extra_remove_size;
- unsigned long base, size, def, dummy;
- mtrr_type type;
- u64 chunk_size, gran_size;
- int index_good;
- int i;
-
- if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
- return 0;
- rdmsr(MTRRdefType_MSR, def, dummy);
- def &= 0xff;
- if (def != MTRR_TYPE_UNCACHABLE)
- return 0;
-
- /* get it and store it aside */
- memset(range_state, 0, sizeof(range_state));
- for (i = 0; i < num_var_ranges; i++) {
- mtrr_if->get(i, &base, &size, &type);
- range_state[i].base_pfn = base;
- range_state[i].size_pfn = size;
- range_state[i].type = type;
- }
-
- /* check if we need handle it and can handle it */
- if (!mtrr_need_cleanup())
- return 0;
-
- /* print original var MTRRs at first, for debugging: */
- printk(KERN_DEBUG "original variable MTRRs\n");
- print_out_mtrr_range_state();
-
- memset(range, 0, sizeof(range));
- extra_remove_size = 0;
- extra_remove_base = 1 << (32 - PAGE_SHIFT);
- if (mtrr_tom2)
- extra_remove_size =
- (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base;
- nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base,
- extra_remove_size);
- /*
- * [0, 1M) should always be coverred by var mtrr with WB
- * and fixed mtrrs should take effective before var mtrr for it
- */
- nr_range = add_range_with_merge(range, nr_range, 0,
- (1ULL<<(20 - PAGE_SHIFT)) - 1);
- /* sort the ranges */
- sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
-
- range_sums = sum_ranges(range, nr_range);
- printk(KERN_INFO "total RAM coverred: %ldM\n",
- range_sums >> (20 - PAGE_SHIFT));
-
- if (mtrr_chunk_size && mtrr_gran_size) {
- i = 0;
- mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size,
- extra_remove_base, extra_remove_size, i);
-
- mtrr_print_out_one_result(i);
-
- if (!result[i].bad) {
- set_var_mtrr_all(address_bits);
- return 1;
- }
- printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
- "will find optimal one\n");
- }
-
- i = 0;
- memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
- memset(result, 0, sizeof(result));
- for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
-
- for (chunk_size = gran_size; chunk_size < (1ULL<<32);
- chunk_size <<= 1) {
-
- if (i >= NUM_RESULT)
- continue;
-
- mtrr_calc_range_state(chunk_size, gran_size,
- extra_remove_base, extra_remove_size, i);
- if (debug_print) {
- mtrr_print_out_one_result(i);
- printk(KERN_INFO "\n");
- }
-
- i++;
- }
- }
-
- /* try to find the optimal index */
- index_good = mtrr_search_optimal_index();
-
- if (index_good != -1) {
- printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
- i = index_good;
- mtrr_print_out_one_result(i);
-
- /* convert ranges to var ranges state */
- chunk_size = result[i].chunk_sizek;
- chunk_size <<= 10;
- gran_size = result[i].gran_sizek;
- gran_size <<= 10;
- x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
- set_var_mtrr_all(address_bits);
- printk(KERN_DEBUG "New variable MTRRs\n");
- print_out_mtrr_range_state();
- return 1;
- } else {
- /* print out all */
- for (i = 0; i < NUM_RESULT; i++)
- mtrr_print_out_one_result(i);
- }
-
- printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n");
- printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n");
-
- return 0;
-}
-#else
-static int __init mtrr_cleanup(unsigned address_bits)
-{
- return 0;
-}
-#endif
-
-static int __initdata changed_by_mtrr_cleanup;
-
-static int disable_mtrr_trim;
-
-static int __init disable_mtrr_trim_setup(char *str)
-{
- disable_mtrr_trim = 1;
- return 0;
-}
-early_param("disable_mtrr_trim", disable_mtrr_trim_setup);
-
-/*
- * Newer AMD K8s and later CPUs have a special magic MSR way to force WB
- * for memory >4GB. Check for that here.
- * Note this won't check if the MTRRs < 4GB where the magic bit doesn't
- * apply to are wrong, but so far we don't know of any such case in the wild.
- */
-#define Tom2Enabled (1U << 21)
-#define Tom2ForceMemTypeWB (1U << 22)
-
-int __init amd_special_default_mtrr(void)
-{
- u32 l, h;
-
- if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
- return 0;
- if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
- return 0;
- /* In case some hypervisor doesn't pass SYSCFG through */
- if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
- return 0;
- /*
- * Memory between 4GB and top of mem is forced WB by this magic bit.
- * Reserved before K8RevF, but should be zero there.
- */
- if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) ==
- (Tom2Enabled | Tom2ForceMemTypeWB))
- return 1;
- return 0;
-}
-
-static u64 __init real_trim_memory(unsigned long start_pfn,
- unsigned long limit_pfn)
-{
- u64 trim_start, trim_size;
- trim_start = start_pfn;
- trim_start <<= PAGE_SHIFT;
- trim_size = limit_pfn;
- trim_size <<= PAGE_SHIFT;
- trim_size -= trim_start;
-
- return e820_update_range(trim_start, trim_size, E820_RAM,
- E820_RESERVED);
-}
-/**
- * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
- * @end_pfn: ending page frame number
- *
- * Some buggy BIOSes don't setup the MTRRs properly for systems with certain
- * memory configurations. This routine checks that the highest MTRR matches
- * the end of memory, to make sure the MTRRs having a write back type cover
- * all of the memory the kernel is intending to use. If not, it'll trim any
- * memory off the end by adjusting end_pfn, removing it from the kernel's
- * allocation pools, warning the user with an obnoxious message.
- */
-int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
-{
- unsigned long i, base, size, highest_pfn = 0, def, dummy;
- mtrr_type type;
- u64 total_trim_size;
-
- /* extra one for all 0 */
- int num[MTRR_NUM_TYPES + 1];
- /*
- * Make sure we only trim uncachable memory on machines that
- * support the Intel MTRR architecture:
- */
- if (!is_cpu(INTEL) || disable_mtrr_trim)
- return 0;
- rdmsr(MTRRdefType_MSR, def, dummy);
- def &= 0xff;
- if (def != MTRR_TYPE_UNCACHABLE)
- return 0;
-
- /* get it and store it aside */
- memset(range_state, 0, sizeof(range_state));
- for (i = 0; i < num_var_ranges; i++) {
- mtrr_if->get(i, &base, &size, &type);
- range_state[i].base_pfn = base;
- range_state[i].size_pfn = size;
- range_state[i].type = type;
- }
-
- /* Find highest cached pfn */
- for (i = 0; i < num_var_ranges; i++) {
- type = range_state[i].type;
- if (type != MTRR_TYPE_WRBACK)
- continue;
- base = range_state[i].base_pfn;
- size = range_state[i].size_pfn;
- if (highest_pfn < base + size)
- highest_pfn = base + size;
- }
-
- /* kvm/qemu doesn't have mtrr set right, don't trim them all */
- if (!highest_pfn) {
- printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n");
- return 0;
- }
-
- /* check entries number */
- memset(num, 0, sizeof(num));
- for (i = 0; i < num_var_ranges; i++) {
- type = range_state[i].type;
- if (type >= MTRR_NUM_TYPES)
- continue;
- size = range_state[i].size_pfn;
- if (!size)
- type = MTRR_NUM_TYPES;
- num[type]++;
- }
-
- /* no entry for WB? */
- if (!num[MTRR_TYPE_WRBACK])
- return 0;
-
- /* check if we only had WB and UC */
- if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
- num_var_ranges - num[MTRR_NUM_TYPES])
- return 0;
-
- memset(range, 0, sizeof(range));
- nr_range = 0;
- if (mtrr_tom2) {
- range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
- range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1;
- if (highest_pfn < range[nr_range].end + 1)
- highest_pfn = range[nr_range].end + 1;
- nr_range++;
- }
- nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
-
- total_trim_size = 0;
- /* check the head */
- if (range[0].start)
- total_trim_size += real_trim_memory(0, range[0].start);
- /* check the holes */
- for (i = 0; i < nr_range - 1; i++) {
- if (range[i].end + 1 < range[i+1].start)
- total_trim_size += real_trim_memory(range[i].end + 1,
- range[i+1].start);
- }
- /* check the top */
- i = nr_range - 1;
- if (range[i].end + 1 < end_pfn)
- total_trim_size += real_trim_memory(range[i].end + 1,
- end_pfn);
-
- if (total_trim_size) {
- printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover"
- " all of memory, losing %lluMB of RAM.\n",
- total_trim_size >> 20);
-
- if (!changed_by_mtrr_cleanup)
- WARN_ON(1);
-
- printk(KERN_INFO "update e820 for mtrr\n");
- update_e820();
-
- return 1;
- }
-
- return 0;
-}
+int __initdata changed_by_mtrr_cleanup;
/**
* mtrr_bp_init - initialize mtrrs on the boot CPU
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index ffd60409cc6..77f67f7b347 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -79,6 +79,7 @@ extern struct mtrr_ops * mtrr_if;
extern unsigned int num_var_ranges;
extern u64 mtrr_tom2;
+extern struct mtrr_state_type mtrr_state;
void mtrr_state_warn(void);
const char *mtrr_attrib_to_str(int x);
@@ -88,3 +89,6 @@ void mtrr_wrmsr(unsigned, unsigned, unsigned);
int amd_init_mtrr(void);
int cyrix_init_mtrr(void);
int centaur_init_mtrr(void);
+
+extern int changed_by_mtrr_cleanup;
+extern int mtrr_cleanup(unsigned address_bits);
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 9abd48b2267..f6c70a164e3 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -19,7 +19,7 @@
#include <linux/nmi.h>
#include <linux/kprobes.h>
-#include <asm/apic.h>
+#include <asm/genapic.h>
#include <asm/intel_arch_perfmon.h>
struct nmi_watchdog_ctlblk {
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 01b1244ef1c..d67e0e48bc2 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -7,11 +7,10 @@
/*
* Get CPU information for use by the procfs.
*/
-#ifdef CONFIG_X86_32
static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
unsigned int cpu)
{
-#ifdef CONFIG_X86_HT
+#ifdef CONFIG_SMP
if (c->x86_max_cores * smp_num_siblings > 1) {
seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
seq_printf(m, "siblings\t: %d\n",
@@ -24,6 +23,7 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
#endif
}
+#ifdef CONFIG_X86_32
static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
{
/*
@@ -50,22 +50,6 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
c->wp_works_ok ? "yes" : "no");
}
#else
-static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
- unsigned int cpu)
-{
-#ifdef CONFIG_SMP
- if (c->x86_max_cores * smp_num_siblings > 1) {
- seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
- seq_printf(m, "siblings\t: %d\n",
- cpus_weight(per_cpu(cpu_core_map, cpu)));
- seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
- seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
- seq_printf(m, "apicid\t\t: %d\n", c->apicid);
- seq_printf(m, "initial apicid\t: %d\n", c->initial_apicid);
- }
-#endif
-}
-
static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
{
seq_printf(m,
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index 52b3fefbd5a..bb62b3e5caa 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -98,7 +98,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
#endif
}
-static struct cpu_dev transmeta_cpu_dev __cpuinitdata = {
+static const struct cpu_dev __cpuinitconst transmeta_cpu_dev = {
.c_vendor = "Transmeta",
.c_ident = { "GenuineTMx86", "TransmetaCPU" },
.c_early_init = early_init_transmeta,
diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c
index e777f79e096..fd2c37bf7ac 100644
--- a/arch/x86/kernel/cpu/umc.c
+++ b/arch/x86/kernel/cpu/umc.c
@@ -8,7 +8,7 @@
* so no special init takes place.
*/
-static struct cpu_dev umc_cpu_dev __cpuinitdata = {
+static const struct cpu_dev __cpuinitconst umc_cpu_dev = {
.c_vendor = "UMC",
.c_ident = { "UMC UMC UMC" },
.c_models = {
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index c689d19e35a..ff958248e61 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -24,12 +24,10 @@
#include <asm/apic.h>
#include <asm/hpet.h>
#include <linux/kdebug.h>
-#include <asm/smp.h>
+#include <asm/cpu.h>
#include <asm/reboot.h>
#include <asm/virtext.h>
-#include <mach_ipi.h>
-
#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 6b1f6f6f866..87d103ded1c 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -99,7 +99,7 @@ print_context_stack(struct thread_info *tinfo,
frame = frame->next_frame;
bp = (unsigned long) frame;
} else {
- ops->address(data, addr, bp == 0);
+ ops->address(data, addr, 0);
}
print_ftrace_graph_addr(addr, data, ops, tinfo, graph);
}
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index c302d070704..d35db5993fd 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -106,7 +106,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
const struct stacktrace_ops *ops, void *data)
{
const unsigned cpu = get_cpu();
- unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
+ unsigned long *irq_stack_end =
+ (unsigned long *)per_cpu(irq_stack_ptr, cpu);
unsigned used = 0;
struct thread_info *tinfo;
int graph = 0;
@@ -160,23 +161,23 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
stack = (unsigned long *) estack_end[-2];
continue;
}
- if (irqstack_end) {
- unsigned long *irqstack;
- irqstack = irqstack_end -
- (IRQSTACKSIZE - 64) / sizeof(*irqstack);
+ if (irq_stack_end) {
+ unsigned long *irq_stack;
+ irq_stack = irq_stack_end -
+ (IRQ_STACK_SIZE - 64) / sizeof(*irq_stack);
- if (stack >= irqstack && stack < irqstack_end) {
+ if (stack >= irq_stack && stack < irq_stack_end) {
if (ops->stack(data, "IRQ") < 0)
break;
bp = print_context_stack(tinfo, stack, bp,
- ops, data, irqstack_end, &graph);
+ ops, data, irq_stack_end, &graph);
/*
* We link to the next stack (which would be
* the process stack normally) the last
* pointer (index -1 to end) in the IRQ stack:
*/
- stack = (unsigned long *) (irqstack_end[-1]);
- irqstack_end = NULL;
+ stack = (unsigned long *) (irq_stack_end[-1]);
+ irq_stack_end = NULL;
ops->stack(data, "EOI");
continue;
}
@@ -199,10 +200,10 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack;
int i;
const int cpu = smp_processor_id();
- unsigned long *irqstack_end =
- (unsigned long *) (cpu_pda(cpu)->irqstackptr);
- unsigned long *irqstack =
- (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
+ unsigned long *irq_stack_end =
+ (unsigned long *)(per_cpu(irq_stack_ptr, cpu));
+ unsigned long *irq_stack =
+ (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE);
/*
* debugging aid: "show_stack(NULL, NULL);" prints the
@@ -218,9 +219,9 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
stack = sp;
for (i = 0; i < kstack_depth_to_print; i++) {
- if (stack >= irqstack && stack <= irqstack_end) {
- if (stack == irqstack_end) {
- stack = (unsigned long *) (irqstack_end[-1]);
+ if (stack >= irq_stack && stack <= irq_stack_end) {
+ if (stack == irq_stack_end) {
+ stack = (unsigned long *) (irq_stack_end[-1]);
printk(" <EOI> ");
}
} else {
@@ -241,7 +242,7 @@ void show_registers(struct pt_regs *regs)
int i;
unsigned long sp;
const int cpu = smp_processor_id();
- struct task_struct *cur = cpu_pda(cpu)->pcurrent;
+ struct task_struct *cur = current;
sp = regs->sp;
printk("CPU %d ", cpu);
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index e85826829cf..fb638d9ce6d 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -110,19 +110,50 @@ int __init e820_all_mapped(u64 start, u64 end, unsigned type)
/*
* Add a memory region to the kernel e820 map.
*/
-void __init e820_add_region(u64 start, u64 size, int type)
+static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
+ int type)
{
- int x = e820.nr_map;
+ int x = e820x->nr_map;
- if (x == ARRAY_SIZE(e820.map)) {
+ if (x == ARRAY_SIZE(e820x->map)) {
printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
return;
}
- e820.map[x].addr = start;
- e820.map[x].size = size;
- e820.map[x].type = type;
- e820.nr_map++;
+ e820x->map[x].addr = start;
+ e820x->map[x].size = size;
+ e820x->map[x].type = type;
+ e820x->nr_map++;
+}
+
+void __init e820_add_region(u64 start, u64 size, int type)
+{
+ __e820_add_region(&e820, start, size, type);
+}
+
+static void __init e820_print_type(u32 type)
+{
+ switch (type) {
+ case E820_RAM:
+ case E820_RESERVED_KERN:
+ printk(KERN_CONT "(usable)");
+ break;
+ case E820_RESERVED:
+ printk(KERN_CONT "(reserved)");
+ break;
+ case E820_ACPI:
+ printk(KERN_CONT "(ACPI data)");
+ break;
+ case E820_NVS:
+ printk(KERN_CONT "(ACPI NVS)");
+ break;
+ case E820_UNUSABLE:
+ printk(KERN_CONT "(unusable)");
+ break;
+ default:
+ printk(KERN_CONT "type %u", type);
+ break;
+ }
}
void __init e820_print_map(char *who)
@@ -134,27 +165,8 @@ void __init e820_print_map(char *who)
(unsigned long long) e820.map[i].addr,
(unsigned long long)
(e820.map[i].addr + e820.map[i].size));
- switch (e820.map[i].type) {
- case E820_RAM:
- case E820_RESERVED_KERN:
- printk(KERN_CONT "(usable)\n");
- break;
- case E820_RESERVED:
- printk(KERN_CONT "(reserved)\n");
- break;
- case E820_ACPI:
- printk(KERN_CONT "(ACPI data)\n");
- break;
- case E820_NVS:
- printk(KERN_CONT "(ACPI NVS)\n");
- break;
- case E820_UNUSABLE:
- printk("(unusable)\n");
- break;
- default:
- printk(KERN_CONT "type %u\n", e820.map[i].type);
- break;
- }
+ e820_print_type(e820.map[i].type);
+ printk(KERN_CONT "\n");
}
}
@@ -417,11 +429,12 @@ static int __init append_e820_map(struct e820entry *biosmap, int nr_map)
return __append_e820_map(biosmap, nr_map);
}
-static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
+static u64 __init __e820_update_range(struct e820map *e820x, u64 start,
u64 size, unsigned old_type,
unsigned new_type)
{
- int i;
+ u64 end;
+ unsigned int i;
u64 real_updated_size = 0;
BUG_ON(old_type == new_type);
@@ -429,27 +442,55 @@ static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
if (size > (ULLONG_MAX - start))
size = ULLONG_MAX - start;
- for (i = 0; i < e820.nr_map; i++) {
+ end = start + size;
+ printk(KERN_DEBUG "e820 update range: %016Lx - %016Lx ",
+ (unsigned long long) start,
+ (unsigned long long) end);
+ e820_print_type(old_type);
+ printk(KERN_CONT " ==> ");
+ e820_print_type(new_type);
+ printk(KERN_CONT "\n");
+
+ for (i = 0; i < e820x->nr_map; i++) {
struct e820entry *ei = &e820x->map[i];
u64 final_start, final_end;
+ u64 ei_end;
+
if (ei->type != old_type)
continue;
- /* totally covered? */
- if (ei->addr >= start &&
- (ei->addr + ei->size) <= (start + size)) {
+
+ ei_end = ei->addr + ei->size;
+ /* totally covered by new range? */
+ if (ei->addr >= start && ei_end <= end) {
ei->type = new_type;
real_updated_size += ei->size;
continue;
}
+
+ /* new range is totally covered? */
+ if (ei->addr < start && ei_end > end) {
+ __e820_add_region(e820x, start, size, new_type);
+ __e820_add_region(e820x, end, ei_end - end, ei->type);
+ ei->size = start - ei->addr;
+ real_updated_size += size;
+ continue;
+ }
+
/* partially covered */
final_start = max(start, ei->addr);
- final_end = min(start + size, ei->addr + ei->size);
+ final_end = min(end, ei_end);
if (final_start >= final_end)
continue;
- e820_add_region(final_start, final_end - final_start,
- new_type);
+
+ __e820_add_region(e820x, final_start, final_end - final_start,
+ new_type);
+
real_updated_size += final_end - final_start;
+ /*
+ * left range could be head or tail, so need to update
+ * size at first.
+ */
ei->size -= final_end - final_start;
if (ei->addr < final_start)
continue;
@@ -461,13 +502,13 @@ static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
unsigned new_type)
{
- return e820_update_range_map(&e820, start, size, old_type, new_type);
+ return __e820_update_range(&e820, start, size, old_type, new_type);
}
static u64 __init e820_update_range_saved(u64 start, u64 size,
unsigned old_type, unsigned new_type)
{
- return e820_update_range_map(&e820_saved, start, size, old_type,
+ return __e820_update_range(&e820_saved, start, size, old_type,
new_type);
}
@@ -858,6 +899,9 @@ void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
*/
void __init reserve_early(u64 start, u64 end, char *name)
{
+ if (start >= end)
+ return;
+
drop_overlaps_that_are_ok(start, end);
__reserve_early(start, end, name, 0);
}
@@ -1017,8 +1061,8 @@ u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
continue;
return addr;
}
- return -1UL;
+ return -1ULL;
}
/*
@@ -1031,13 +1075,22 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
u64 start;
start = startt;
- while (size < sizet)
+ while (size < sizet && (start + 1))
start = find_e820_area_size(start, &size, align);
if (size < sizet)
return 0;
+#ifdef CONFIG_X86_32
+ if (start >= MAXMEM)
+ return 0;
+ if (start + size > MAXMEM)
+ size = MAXMEM - start;
+#endif
+
addr = round_down(start + size - sizet, align);
+ if (addr < start)
+ return 0;
e820_update_range(addr, sizet, E820_RAM, E820_RESERVED);
e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED);
printk(KERN_INFO "update e820 for early_reserve_e820\n");
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 504ad198e4a..335f049d110 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -13,8 +13,8 @@
#include <asm/setup.h>
#include <xen/hvc-console.h>
#include <asm/pci-direct.h>
-#include <asm/pgtable.h>
#include <asm/fixmap.h>
+#include <asm/pgtable.h>
#include <linux/usb/ehci_def.h>
/* Simple VGA output */
@@ -250,7 +250,7 @@ static int dbgp_wait_until_complete(void)
return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl);
}
-static void dbgp_mdelay(int ms)
+static void __init dbgp_mdelay(int ms)
{
int i;
@@ -311,7 +311,7 @@ static void dbgp_set_data(const void *buf, int size)
writel(hi, &ehci_debug->data47);
}
-static void dbgp_get_data(void *buf, int size)
+static void __init dbgp_get_data(void *buf, int size)
{
unsigned char *bytes = buf;
u32 lo, hi;
@@ -355,7 +355,7 @@ static int dbgp_bulk_write(unsigned devnum, unsigned endpoint,
return ret;
}
-static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
+static int __init dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
int size)
{
u32 pids, addr, ctrl;
@@ -386,8 +386,8 @@ static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
return ret;
}
-static int dbgp_control_msg(unsigned devnum, int requesttype, int request,
- int value, int index, void *data, int size)
+static int __init dbgp_control_msg(unsigned devnum, int requesttype,
+ int request, int value, int index, void *data, int size)
{
u32 pids, addr, ctrl;
struct usb_ctrlrequest req;
@@ -489,7 +489,7 @@ static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc)
return 0;
}
-static int ehci_reset_port(int port)
+static int __init ehci_reset_port(int port)
{
u32 portsc;
u32 delay_time, delay;
@@ -532,7 +532,7 @@ static int ehci_reset_port(int port)
return -EBUSY;
}
-static int ehci_wait_for_port(int port)
+static int __init ehci_wait_for_port(int port)
{
u32 status;
int ret, reps;
@@ -557,13 +557,13 @@ static inline void dbgp_printk(const char *fmt, ...) { }
typedef void (*set_debug_port_t)(int port);
-static void default_set_debug_port(int port)
+static void __init default_set_debug_port(int port)
{
}
-static set_debug_port_t set_debug_port = default_set_debug_port;
+static set_debug_port_t __initdata set_debug_port = default_set_debug_port;
-static void nvidia_set_debug_port(int port)
+static void __init nvidia_set_debug_port(int port)
{
u32 dword;
dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index eb1ef3b67dd..1736acc4d7a 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -366,10 +366,12 @@ void __init efi_init(void)
SMBIOS_TABLE_GUID)) {
efi.smbios = config_tables[i].table;
printk(" SMBIOS=0x%lx ", config_tables[i].table);
+#ifdef CONFIG_X86_UV
} else if (!efi_guidcmp(config_tables[i].guid,
UV_SYSTEM_TABLE_GUID)) {
efi.uv_systab = config_tables[i].table;
printk(" UVsystab=0x%lx ", config_tables[i].table);
+#endif
} else if (!efi_guidcmp(config_tables[i].guid,
HCDP_TABLE_GUID)) {
efi.hcdp = config_tables[i].table;
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
index cb783b92c50..22c3b7828c5 100644
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/kernel/efi_64.c
@@ -36,6 +36,7 @@
#include <asm/proto.h>
#include <asm/efi.h>
#include <asm/cacheflush.h>
+#include <asm/fixmap.h>
static pgd_t save_pgd __initdata;
static unsigned long efi_flags __initdata;
diff --git a/arch/x86/kernel/efi_stub_32.S b/arch/x86/kernel/efi_stub_32.S
index ef00bb77d7e..fbe66e626c0 100644
--- a/arch/x86/kernel/efi_stub_32.S
+++ b/arch/x86/kernel/efi_stub_32.S
@@ -6,7 +6,7 @@
*/
#include <linux/linkage.h>
-#include <asm/page.h>
+#include <asm/page_types.h>
/*
* efi_call_phys(void *, ...) is a function with variable parameters.
@@ -113,6 +113,7 @@ ENTRY(efi_call_phys)
movl (%edx), %ecx
pushl %ecx
ret
+ENDPROC(efi_call_phys)
.previous
.data
diff --git a/arch/x86/kernel/efi_stub_64.S b/arch/x86/kernel/efi_stub_64.S
index 99b47d48c9f..4c07ccab814 100644
--- a/arch/x86/kernel/efi_stub_64.S
+++ b/arch/x86/kernel/efi_stub_64.S
@@ -41,6 +41,7 @@ ENTRY(efi_call0)
addq $32, %rsp
RESTORE_XMM
ret
+ENDPROC(efi_call0)
ENTRY(efi_call1)
SAVE_XMM
@@ -50,6 +51,7 @@ ENTRY(efi_call1)
addq $32, %rsp
RESTORE_XMM
ret
+ENDPROC(efi_call1)
ENTRY(efi_call2)
SAVE_XMM
@@ -59,6 +61,7 @@ ENTRY(efi_call2)
addq $32, %rsp
RESTORE_XMM
ret
+ENDPROC(efi_call2)
ENTRY(efi_call3)
SAVE_XMM
@@ -69,6 +72,7 @@ ENTRY(efi_call3)
addq $32, %rsp
RESTORE_XMM
ret
+ENDPROC(efi_call3)
ENTRY(efi_call4)
SAVE_XMM
@@ -80,6 +84,7 @@ ENTRY(efi_call4)
addq $32, %rsp
RESTORE_XMM
ret
+ENDPROC(efi_call4)
ENTRY(efi_call5)
SAVE_XMM
@@ -92,6 +97,7 @@ ENTRY(efi_call5)
addq $48, %rsp
RESTORE_XMM
ret
+ENDPROC(efi_call5)
ENTRY(efi_call6)
SAVE_XMM
@@ -107,3 +113,4 @@ ENTRY(efi_call6)
addq $48, %rsp
RESTORE_XMM
ret
+ENDPROC(efi_call6)
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 46469029e9d..c929add475c 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -30,12 +30,13 @@
* 1C(%esp) - %ds
* 20(%esp) - %es
* 24(%esp) - %fs
- * 28(%esp) - orig_eax
- * 2C(%esp) - %eip
- * 30(%esp) - %cs
- * 34(%esp) - %eflags
- * 38(%esp) - %oldesp
- * 3C(%esp) - %oldss
+ * 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS
+ * 2C(%esp) - orig_eax
+ * 30(%esp) - %eip
+ * 34(%esp) - %cs
+ * 38(%esp) - %eflags
+ * 3C(%esp) - %oldesp
+ * 40(%esp) - %oldss
*
* "current" is in register %ebx during any slow entries.
*/
@@ -46,7 +47,7 @@
#include <asm/errno.h>
#include <asm/segment.h>
#include <asm/smp.h>
-#include <asm/page.h>
+#include <asm/page_types.h>
#include <asm/desc.h>
#include <asm/percpu.h>
#include <asm/dwarf2.h>
@@ -101,121 +102,221 @@
#define resume_userspace_sig resume_userspace
#endif
-#define SAVE_ALL \
- cld; \
- pushl %fs; \
- CFI_ADJUST_CFA_OFFSET 4;\
- /*CFI_REL_OFFSET fs, 0;*/\
- pushl %es; \
- CFI_ADJUST_CFA_OFFSET 4;\
- /*CFI_REL_OFFSET es, 0;*/\
- pushl %ds; \
- CFI_ADJUST_CFA_OFFSET 4;\
- /*CFI_REL_OFFSET ds, 0;*/\
- pushl %eax; \
- CFI_ADJUST_CFA_OFFSET 4;\
- CFI_REL_OFFSET eax, 0;\
- pushl %ebp; \
- CFI_ADJUST_CFA_OFFSET 4;\
- CFI_REL_OFFSET ebp, 0;\
- pushl %edi; \
- CFI_ADJUST_CFA_OFFSET 4;\
- CFI_REL_OFFSET edi, 0;\
- pushl %esi; \
- CFI_ADJUST_CFA_OFFSET 4;\
- CFI_REL_OFFSET esi, 0;\
- pushl %edx; \
- CFI_ADJUST_CFA_OFFSET 4;\
- CFI_REL_OFFSET edx, 0;\
- pushl %ecx; \
- CFI_ADJUST_CFA_OFFSET 4;\
- CFI_REL_OFFSET ecx, 0;\
- pushl %ebx; \
- CFI_ADJUST_CFA_OFFSET 4;\
- CFI_REL_OFFSET ebx, 0;\
- movl $(__USER_DS), %edx; \
- movl %edx, %ds; \
- movl %edx, %es; \
- movl $(__KERNEL_PERCPU), %edx; \
+/*
+ * User gs save/restore
+ *
+ * %gs is used for userland TLS and kernel only uses it for stack
+ * canary which is required to be at %gs:20 by gcc. Read the comment
+ * at the top of stackprotector.h for more info.
+ *
+ * Local labels 98 and 99 are used.
+ */
+#ifdef CONFIG_X86_32_LAZY_GS
+
+ /* unfortunately push/pop can't be no-op */
+.macro PUSH_GS
+ pushl $0
+ CFI_ADJUST_CFA_OFFSET 4
+.endm
+.macro POP_GS pop=0
+ addl $(4 + \pop), %esp
+ CFI_ADJUST_CFA_OFFSET -(4 + \pop)
+.endm
+.macro POP_GS_EX
+.endm
+
+ /* all the rest are no-op */
+.macro PTGS_TO_GS
+.endm
+.macro PTGS_TO_GS_EX
+.endm
+.macro GS_TO_REG reg
+.endm
+.macro REG_TO_PTGS reg
+.endm
+.macro SET_KERNEL_GS reg
+.endm
+
+#else /* CONFIG_X86_32_LAZY_GS */
+
+.macro PUSH_GS
+ pushl %gs
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET gs, 0*/
+.endm
+
+.macro POP_GS pop=0
+98: popl %gs
+ CFI_ADJUST_CFA_OFFSET -4
+ /*CFI_RESTORE gs*/
+ .if \pop <> 0
+ add $\pop, %esp
+ CFI_ADJUST_CFA_OFFSET -\pop
+ .endif
+.endm
+.macro POP_GS_EX
+.pushsection .fixup, "ax"
+99: movl $0, (%esp)
+ jmp 98b
+.section __ex_table, "a"
+ .align 4
+ .long 98b, 99b
+.popsection
+.endm
+
+.macro PTGS_TO_GS
+98: mov PT_GS(%esp), %gs
+.endm
+.macro PTGS_TO_GS_EX
+.pushsection .fixup, "ax"
+99: movl $0, PT_GS(%esp)
+ jmp 98b
+.section __ex_table, "a"
+ .align 4
+ .long 98b, 99b
+.popsection
+.endm
+
+.macro GS_TO_REG reg
+ movl %gs, \reg
+ /*CFI_REGISTER gs, \reg*/
+.endm
+.macro REG_TO_PTGS reg
+ movl \reg, PT_GS(%esp)
+ /*CFI_REL_OFFSET gs, PT_GS*/
+.endm
+.macro SET_KERNEL_GS reg
+ movl $(__KERNEL_STACK_CANARY), \reg
+ movl \reg, %gs
+.endm
+
+#endif /* CONFIG_X86_32_LAZY_GS */
+
+.macro SAVE_ALL
+ cld
+ PUSH_GS
+ pushl %fs
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET fs, 0;*/
+ pushl %es
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET es, 0;*/
+ pushl %ds
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET ds, 0;*/
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET eax, 0
+ pushl %ebp
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ebp, 0
+ pushl %edi
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET edi, 0
+ pushl %esi
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET esi, 0
+ pushl %edx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET edx, 0
+ pushl %ecx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ecx, 0
+ pushl %ebx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ebx, 0
+ movl $(__USER_DS), %edx
+ movl %edx, %ds
+ movl %edx, %es
+ movl $(__KERNEL_PERCPU), %edx
movl %edx, %fs
+ SET_KERNEL_GS %edx
+.endm
-#define RESTORE_INT_REGS \
- popl %ebx; \
- CFI_ADJUST_CFA_OFFSET -4;\
- CFI_RESTORE ebx;\
- popl %ecx; \
- CFI_ADJUST_CFA_OFFSET -4;\
- CFI_RESTORE ecx;\
- popl %edx; \
- CFI_ADJUST_CFA_OFFSET -4;\
- CFI_RESTORE edx;\
- popl %esi; \
- CFI_ADJUST_CFA_OFFSET -4;\
- CFI_RESTORE esi;\
- popl %edi; \
- CFI_ADJUST_CFA_OFFSET -4;\
- CFI_RESTORE edi;\
- popl %ebp; \
- CFI_ADJUST_CFA_OFFSET -4;\
- CFI_RESTORE ebp;\
- popl %eax; \
- CFI_ADJUST_CFA_OFFSET -4;\
+.macro RESTORE_INT_REGS
+ popl %ebx
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE ebx
+ popl %ecx
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE ecx
+ popl %edx
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE edx
+ popl %esi
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE esi
+ popl %edi
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE edi
+ popl %ebp
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE ebp
+ popl %eax
+ CFI_ADJUST_CFA_OFFSET -4
CFI_RESTORE eax
+.endm
-#define RESTORE_REGS \
- RESTORE_INT_REGS; \
-1: popl %ds; \
- CFI_ADJUST_CFA_OFFSET -4;\
- /*CFI_RESTORE ds;*/\
-2: popl %es; \
- CFI_ADJUST_CFA_OFFSET -4;\
- /*CFI_RESTORE es;*/\
-3: popl %fs; \
- CFI_ADJUST_CFA_OFFSET -4;\
- /*CFI_RESTORE fs;*/\
-.pushsection .fixup,"ax"; \
-4: movl $0,(%esp); \
- jmp 1b; \
-5: movl $0,(%esp); \
- jmp 2b; \
-6: movl $0,(%esp); \
- jmp 3b; \
-.section __ex_table,"a";\
- .align 4; \
- .long 1b,4b; \
- .long 2b,5b; \
- .long 3b,6b; \
+.macro RESTORE_REGS pop=0
+ RESTORE_INT_REGS
+1: popl %ds
+ CFI_ADJUST_CFA_OFFSET -4
+ /*CFI_RESTORE ds;*/
+2: popl %es
+ CFI_ADJUST_CFA_OFFSET -4
+ /*CFI_RESTORE es;*/
+3: popl %fs
+ CFI_ADJUST_CFA_OFFSET -4
+ /*CFI_RESTORE fs;*/
+ POP_GS \pop
+.pushsection .fixup, "ax"
+4: movl $0, (%esp)
+ jmp 1b
+5: movl $0, (%esp)
+ jmp 2b
+6: movl $0, (%esp)
+ jmp 3b
+.section __ex_table, "a"
+ .align 4
+ .long 1b, 4b
+ .long 2b, 5b
+ .long 3b, 6b
.popsection
+ POP_GS_EX
+.endm
-#define RING0_INT_FRAME \
- CFI_STARTPROC simple;\
- CFI_SIGNAL_FRAME;\
- CFI_DEF_CFA esp, 3*4;\
- /*CFI_OFFSET cs, -2*4;*/\
+.macro RING0_INT_FRAME
+ CFI_STARTPROC simple
+ CFI_SIGNAL_FRAME
+ CFI_DEF_CFA esp, 3*4
+ /*CFI_OFFSET cs, -2*4;*/
CFI_OFFSET eip, -3*4
+.endm
-#define RING0_EC_FRAME \
- CFI_STARTPROC simple;\
- CFI_SIGNAL_FRAME;\
- CFI_DEF_CFA esp, 4*4;\
- /*CFI_OFFSET cs, -2*4;*/\
+.macro RING0_EC_FRAME
+ CFI_STARTPROC simple
+ CFI_SIGNAL_FRAME
+ CFI_DEF_CFA esp, 4*4
+ /*CFI_OFFSET cs, -2*4;*/
CFI_OFFSET eip, -3*4
+.endm
-#define RING0_PTREGS_FRAME \
- CFI_STARTPROC simple;\
- CFI_SIGNAL_FRAME;\
- CFI_DEF_CFA esp, PT_OLDESP-PT_EBX;\
- /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/\
- CFI_OFFSET eip, PT_EIP-PT_OLDESP;\
- /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/\
- /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/\
- CFI_OFFSET eax, PT_EAX-PT_OLDESP;\
- CFI_OFFSET ebp, PT_EBP-PT_OLDESP;\
- CFI_OFFSET edi, PT_EDI-PT_OLDESP;\
- CFI_OFFSET esi, PT_ESI-PT_OLDESP;\
- CFI_OFFSET edx, PT_EDX-PT_OLDESP;\
- CFI_OFFSET ecx, PT_ECX-PT_OLDESP;\
+.macro RING0_PTREGS_FRAME
+ CFI_STARTPROC simple
+ CFI_SIGNAL_FRAME
+ CFI_DEF_CFA esp, PT_OLDESP-PT_EBX
+ /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/
+ CFI_OFFSET eip, PT_EIP-PT_OLDESP
+ /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/
+ /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/
+ CFI_OFFSET eax, PT_EAX-PT_OLDESP
+ CFI_OFFSET ebp, PT_EBP-PT_OLDESP
+ CFI_OFFSET edi, PT_EDI-PT_OLDESP
+ CFI_OFFSET esi, PT_ESI-PT_OLDESP
+ CFI_OFFSET edx, PT_EDX-PT_OLDESP
+ CFI_OFFSET ecx, PT_ECX-PT_OLDESP
CFI_OFFSET ebx, PT_EBX-PT_OLDESP
+.endm
ENTRY(ret_from_fork)
CFI_STARTPROC
@@ -341,8 +442,7 @@ sysenter_past_esp:
GET_THREAD_INFO(%ebp)
- /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
- testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
jnz sysenter_audit
sysenter_do_call:
cmpl $(nr_syscalls), %eax
@@ -353,7 +453,7 @@ sysenter_do_call:
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
- testw $_TIF_ALLWORK_MASK, %cx
+ testl $_TIF_ALLWORK_MASK, %ecx
jne sysexit_audit
sysenter_exit:
/* if something modifies registers it must also disable sysexit */
@@ -362,11 +462,12 @@ sysenter_exit:
xorl %ebp,%ebp
TRACE_IRQS_ON
1: mov PT_FS(%esp), %fs
+ PTGS_TO_GS
ENABLE_INTERRUPTS_SYSEXIT
#ifdef CONFIG_AUDITSYSCALL
sysenter_audit:
- testw $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
jnz syscall_trace_entry
addl $4,%esp
CFI_ADJUST_CFA_OFFSET -4
@@ -383,7 +484,7 @@ sysenter_audit:
jmp sysenter_do_call
sysexit_audit:
- testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
+ testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
jne syscall_exit_work
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_ANY)
@@ -396,7 +497,7 @@ sysexit_audit:
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
- testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
+ testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
jne syscall_exit_work
movl PT_EAX(%esp),%eax /* reload syscall return value */
jmp sysenter_exit
@@ -410,6 +511,7 @@ sysexit_audit:
.align 4
.long 1b,2b
.popsection
+ PTGS_TO_GS_EX
ENDPROC(ia32_sysenter_target)
# system call handler stub
@@ -420,8 +522,7 @@ ENTRY(system_call)
SAVE_ALL
GET_THREAD_INFO(%ebp)
# system call tracing in operation / emulation
- /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
- testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
jnz syscall_trace_entry
cmpl $(nr_syscalls), %eax
jae syscall_badsys
@@ -435,7 +536,7 @@ syscall_exit:
# between sampling and the iret
TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
- testw $_TIF_ALLWORK_MASK, %cx # current->work
+ testl $_TIF_ALLWORK_MASK, %ecx # current->work
jne syscall_exit_work
restore_all:
@@ -452,8 +553,7 @@ restore_all:
restore_nocheck:
TRACE_IRQS_IRET
restore_nocheck_notrace:
- RESTORE_REGS
- addl $4, %esp # skip orig_eax/error_code
+ RESTORE_REGS 4 # skip orig_eax/error_code
CFI_ADJUST_CFA_OFFSET -4
irq_return:
INTERRUPT_RETURN
@@ -571,7 +671,7 @@ END(syscall_trace_entry)
# perform syscall exit tracing
ALIGN
syscall_exit_work:
- testb $_TIF_WORK_SYSCALL_EXIT, %cl
+ testl $_TIF_WORK_SYSCALL_EXIT, %ecx
jz work_pending
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
@@ -595,28 +695,50 @@ syscall_badsys:
END(syscall_badsys)
CFI_ENDPROC
-#define FIXUP_ESPFIX_STACK \
- /* since we are on a wrong stack, we cant make it a C code :( */ \
- PER_CPU(gdt_page, %ebx); \
- GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
- addl %esp, %eax; \
- pushl $__KERNEL_DS; \
- CFI_ADJUST_CFA_OFFSET 4; \
- pushl %eax; \
- CFI_ADJUST_CFA_OFFSET 4; \
- lss (%esp), %esp; \
- CFI_ADJUST_CFA_OFFSET -8;
-#define UNWIND_ESPFIX_STACK \
- movl %ss, %eax; \
- /* see if on espfix stack */ \
- cmpw $__ESPFIX_SS, %ax; \
- jne 27f; \
- movl $__KERNEL_DS, %eax; \
- movl %eax, %ds; \
- movl %eax, %es; \
- /* switch to normal stack */ \
- FIXUP_ESPFIX_STACK; \
-27:;
+/*
+ * System calls that need a pt_regs pointer.
+ */
+#define PTREGSCALL(name) \
+ ALIGN; \
+ptregs_##name: \
+ leal 4(%esp),%eax; \
+ jmp sys_##name;
+
+PTREGSCALL(iopl)
+PTREGSCALL(fork)
+PTREGSCALL(clone)
+PTREGSCALL(vfork)
+PTREGSCALL(execve)
+PTREGSCALL(sigaltstack)
+PTREGSCALL(sigreturn)
+PTREGSCALL(rt_sigreturn)
+PTREGSCALL(vm86)
+PTREGSCALL(vm86old)
+
+.macro FIXUP_ESPFIX_STACK
+ /* since we are on a wrong stack, we cant make it a C code :( */
+ PER_CPU(gdt_page, %ebx)
+ GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
+ addl %esp, %eax
+ pushl $__KERNEL_DS
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ lss (%esp), %esp
+ CFI_ADJUST_CFA_OFFSET -8
+.endm
+.macro UNWIND_ESPFIX_STACK
+ movl %ss, %eax
+ /* see if on espfix stack */
+ cmpw $__ESPFIX_SS, %ax
+ jne 27f
+ movl $__KERNEL_DS, %eax
+ movl %eax, %ds
+ movl %eax, %es
+ /* switch to normal stack */
+ FIXUP_ESPFIX_STACK
+27:
+.endm
/*
* Build the entry stubs and pointer table with some assembler magic.
@@ -672,7 +794,7 @@ common_interrupt:
ENDPROC(common_interrupt)
CFI_ENDPROC
-#define BUILD_INTERRUPT(name, nr) \
+#define BUILD_INTERRUPT3(name, nr, fn) \
ENTRY(name) \
RING0_INT_FRAME; \
pushl $~(nr); \
@@ -680,13 +802,15 @@ ENTRY(name) \
SAVE_ALL; \
TRACE_IRQS_OFF \
movl %esp,%eax; \
- call smp_##name; \
+ call fn; \
jmp ret_from_intr; \
CFI_ENDPROC; \
ENDPROC(name)
+#define BUILD_INTERRUPT(name, nr) BUILD_INTERRUPT3(name, nr, smp_##name)
+
/* The include is where all of the SMP etc. interrupts come from */
-#include "entry_arch.h"
+#include <asm/entry_arch.h>
ENTRY(coprocessor_error)
RING0_INT_FRAME
@@ -1068,7 +1192,10 @@ ENTRY(page_fault)
CFI_ADJUST_CFA_OFFSET 4
ALIGN
error_code:
- /* the function address is in %fs's slot on the stack */
+ /* the function address is in %gs's slot on the stack */
+ pushl %fs
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET fs, 0*/
pushl %es
CFI_ADJUST_CFA_OFFSET 4
/*CFI_REL_OFFSET es, 0*/
@@ -1097,20 +1224,15 @@ error_code:
CFI_ADJUST_CFA_OFFSET 4
CFI_REL_OFFSET ebx, 0
cld
- pushl %fs
- CFI_ADJUST_CFA_OFFSET 4
- /*CFI_REL_OFFSET fs, 0*/
movl $(__KERNEL_PERCPU), %ecx
movl %ecx, %fs
UNWIND_ESPFIX_STACK
- popl %ecx
- CFI_ADJUST_CFA_OFFSET -4
- /*CFI_REGISTER es, ecx*/
- movl PT_FS(%esp), %edi # get the function address
+ GS_TO_REG %ecx
+ movl PT_GS(%esp), %edi # get the function address
movl PT_ORIG_EAX(%esp), %edx # get the error code
movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
- mov %ecx, PT_FS(%esp)
- /*CFI_REL_OFFSET fs, ES*/
+ REG_TO_PTGS %ecx
+ SET_KERNEL_GS %ecx
movl $(__USER_DS), %ecx
movl %ecx, %ds
movl %ecx, %es
@@ -1134,26 +1256,27 @@ END(page_fault)
* by hand onto the new stack - while updating the return eip past
* the instruction that would have done it for sysenter.
*/
-#define FIX_STACK(offset, ok, label) \
- cmpw $__KERNEL_CS,4(%esp); \
- jne ok; \
-label: \
- movl TSS_sysenter_sp0+offset(%esp),%esp; \
- CFI_DEF_CFA esp, 0; \
- CFI_UNDEFINED eip; \
- pushfl; \
- CFI_ADJUST_CFA_OFFSET 4; \
- pushl $__KERNEL_CS; \
- CFI_ADJUST_CFA_OFFSET 4; \
- pushl $sysenter_past_esp; \
- CFI_ADJUST_CFA_OFFSET 4; \
+.macro FIX_STACK offset ok label
+ cmpw $__KERNEL_CS, 4(%esp)
+ jne \ok
+\label:
+ movl TSS_sysenter_sp0 + \offset(%esp), %esp
+ CFI_DEF_CFA esp, 0
+ CFI_UNDEFINED eip
+ pushfl
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl $__KERNEL_CS
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl $sysenter_past_esp
+ CFI_ADJUST_CFA_OFFSET 4
CFI_REL_OFFSET eip, 0
+.endm
ENTRY(debug)
RING0_INT_FRAME
cmpl $ia32_sysenter_target,(%esp)
jne debug_stack_correct
- FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
+ FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
debug_stack_correct:
pushl $-1 # mark this as an int
CFI_ADJUST_CFA_OFFSET 4
@@ -1211,7 +1334,7 @@ nmi_stack_correct:
nmi_stack_fixup:
RING0_INT_FRAME
- FIX_STACK(12,nmi_stack_correct, 1)
+ FIX_STACK 12, nmi_stack_correct, 1
jmp nmi_stack_correct
nmi_debug_stack_check:
@@ -1222,7 +1345,7 @@ nmi_debug_stack_check:
jb nmi_stack_correct
cmpl $debug_esp_fix_insn,(%esp)
ja nmi_stack_correct
- FIX_STACK(24,nmi_stack_correct, 1)
+ FIX_STACK 24, nmi_stack_correct, 1
jmp nmi_stack_correct
nmi_espfix_stack:
@@ -1234,7 +1357,7 @@ nmi_espfix_stack:
CFI_ADJUST_CFA_OFFSET 4
pushl %esp
CFI_ADJUST_CFA_OFFSET 4
- addw $4, (%esp)
+ addl $4, (%esp)
/* copy the iret frame of 12 bytes */
.rept 3
pushl 16(%esp)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index a1346217e43..a331ec38af9 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -48,10 +48,11 @@
#include <asm/unistd.h>
#include <asm/thread_info.h>
#include <asm/hw_irq.h>
-#include <asm/page.h>
+#include <asm/page_types.h>
#include <asm/irqflags.h>
#include <asm/paravirt.h>
#include <asm/ftrace.h>
+#include <asm/percpu.h>
/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
#include <linux/elf-em.h>
@@ -76,20 +77,17 @@ ENTRY(ftrace_caller)
movq 8(%rbp), %rsi
subq $MCOUNT_INSN_SIZE, %rdi
-.globl ftrace_call
-ftrace_call:
+GLOBAL(ftrace_call)
call ftrace_stub
MCOUNT_RESTORE_FRAME
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-.globl ftrace_graph_call
-ftrace_graph_call:
+GLOBAL(ftrace_graph_call)
jmp ftrace_stub
#endif
-.globl ftrace_stub
-ftrace_stub:
+GLOBAL(ftrace_stub)
retq
END(ftrace_caller)
@@ -109,8 +107,7 @@ ENTRY(mcount)
jnz ftrace_graph_caller
#endif
-.globl ftrace_stub
-ftrace_stub:
+GLOBAL(ftrace_stub)
retq
trace:
@@ -147,9 +144,7 @@ ENTRY(ftrace_graph_caller)
retq
END(ftrace_graph_caller)
-
-.globl return_to_handler
-return_to_handler:
+GLOBAL(return_to_handler)
subq $80, %rsp
movq %rax, (%rsp)
@@ -187,6 +182,7 @@ return_to_handler:
ENTRY(native_usergs_sysret64)
swapgs
sysretq
+ENDPROC(native_usergs_sysret64)
#endif /* CONFIG_PARAVIRT */
@@ -209,7 +205,7 @@ ENTRY(native_usergs_sysret64)
/* %rsp:at FRAMEEND */
.macro FIXUP_TOP_OF_STACK tmp offset=0
- movq %gs:pda_oldrsp,\tmp
+ movq PER_CPU_VAR(old_rsp),\tmp
movq \tmp,RSP+\offset(%rsp)
movq $__USER_DS,SS+\offset(%rsp)
movq $__USER_CS,CS+\offset(%rsp)
@@ -220,7 +216,7 @@ ENTRY(native_usergs_sysret64)
.macro RESTORE_TOP_OF_STACK tmp offset=0
movq RSP+\offset(%rsp),\tmp
- movq \tmp,%gs:pda_oldrsp
+ movq \tmp,PER_CPU_VAR(old_rsp)
movq EFLAGS+\offset(%rsp),\tmp
movq \tmp,R11+\offset(%rsp)
.endm
@@ -336,15 +332,15 @@ ENTRY(save_args)
je 1f
SWAPGS
/*
- * irqcount is used to check if a CPU is already on an interrupt stack
+ * irq_count is used to check if a CPU is already on an interrupt stack
* or not. While this is essentially redundant with preempt_count it is
* a little cheaper to use a separate counter in the PDA (short of
* moving irq_enter into assembly, which would be too much work)
*/
-1: incl %gs:pda_irqcount
+1: incl PER_CPU_VAR(irq_count)
jne 2f
popq_cfi %rax /* move return address... */
- mov %gs:pda_irqstackptr,%rsp
+ mov PER_CPU_VAR(irq_stack_ptr),%rsp
EMPTY_FRAME 0
pushq_cfi %rbp /* backlink for unwinder */
pushq_cfi %rax /* ... to the new stack */
@@ -372,6 +368,7 @@ ENTRY(save_rest)
END(save_rest)
/* save complete stack frame */
+ .pushsection .kprobes.text, "ax"
ENTRY(save_paranoid)
XCPT_FRAME 1 RDI+8
cld
@@ -400,6 +397,7 @@ ENTRY(save_paranoid)
1: ret
CFI_ENDPROC
END(save_paranoid)
+ .popsection
/*
* A newly forked process directly context switches into this address.
@@ -409,6 +407,8 @@ END(save_paranoid)
ENTRY(ret_from_fork)
DEFAULT_FRAME
+ LOCK ; btr $TIF_FORK,TI_flags(%r8)
+
push kernel_eflags(%rip)
CFI_ADJUST_CFA_OFFSET 8
popf # reset kernel eflags
@@ -418,7 +418,6 @@ ENTRY(ret_from_fork)
GET_THREAD_INFO(%rcx)
- CFI_REMEMBER_STATE
RESTORE_REST
testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
@@ -430,7 +429,6 @@ ENTRY(ret_from_fork)
RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
jmp ret_from_sys_call # go to the SYSRET fastpath
- CFI_RESTORE_STATE
CFI_ENDPROC
END(ret_from_fork)
@@ -468,7 +466,7 @@ END(ret_from_fork)
ENTRY(system_call)
CFI_STARTPROC simple
CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,PDA_STACKOFFSET
+ CFI_DEF_CFA rsp,KERNEL_STACK_OFFSET
CFI_REGISTER rip,rcx
/*CFI_REGISTER rflags,r11*/
SWAPGS_UNSAFE_STACK
@@ -479,8 +477,8 @@ ENTRY(system_call)
*/
ENTRY(system_call_after_swapgs)
- movq %rsp,%gs:pda_oldrsp
- movq %gs:pda_kernelstack,%rsp
+ movq %rsp,PER_CPU_VAR(old_rsp)
+ movq PER_CPU_VAR(kernel_stack),%rsp
/*
* No need to follow this irqs off/on section - it's straight
* and short:
@@ -523,7 +521,7 @@ sysret_check:
CFI_REGISTER rip,rcx
RESTORE_ARGS 0,-ARG_SKIP,1
/*CFI_REGISTER rflags,r11*/
- movq %gs:pda_oldrsp, %rsp
+ movq PER_CPU_VAR(old_rsp), %rsp
USERGS_SYSRET64
CFI_RESTORE_STATE
@@ -630,16 +628,14 @@ tracesys:
* Syscall return path ending with IRET.
* Has correct top of stack, but partial stack frame.
*/
- .globl int_ret_from_sys_call
- .globl int_with_check
-int_ret_from_sys_call:
+GLOBAL(int_ret_from_sys_call)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
testl $3,CS-ARGOFFSET(%rsp)
je retint_restore_args
movl $_TIF_ALLWORK_MASK,%edi
/* edi: mask to check */
-int_with_check:
+GLOBAL(int_with_check)
LOCKDEP_SYS_EXIT_IRQ
GET_THREAD_INFO(%rcx)
movl TI_flags(%rcx),%edx
@@ -833,11 +829,11 @@ common_interrupt:
XCPT_FRAME
addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */
interrupt do_IRQ
- /* 0(%rsp): oldrsp-ARGOFFSET */
+ /* 0(%rsp): old_rsp-ARGOFFSET */
ret_from_intr:
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- decl %gs:pda_irqcount
+ decl PER_CPU_VAR(irq_count)
leaveq
CFI_DEF_CFA_REGISTER rsp
CFI_ADJUST_CFA_OFFSET -8
@@ -982,10 +978,14 @@ apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
#endif
+#ifdef CONFIG_X86_UV
apicinterrupt UV_BAU_MESSAGE \
uv_bau_message_intr1 uv_bau_message_interrupt
+#endif
apicinterrupt LOCAL_TIMER_VECTOR \
apic_timer_interrupt smp_apic_timer_interrupt
+apicinterrupt GENERIC_INTERRUPT_VECTOR \
+ generic_interrupt smp_generic_interrupt
#ifdef CONFIG_SMP
apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
@@ -1073,10 +1073,10 @@ ENTRY(\sym)
TRACE_IRQS_OFF
movq %rsp,%rdi /* pt_regs pointer */
xorl %esi,%esi /* no error code */
- movq %gs:pda_data_offset, %rbp
- subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
+ PER_CPU(init_tss, %rbp)
+ subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp)
call \do_sym
- addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
+ addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp)
jmp paranoid_exit /* %ebx: no swapgs flag */
CFI_ENDPROC
END(\sym)
@@ -1138,7 +1138,7 @@ ENTRY(native_load_gs_index)
CFI_STARTPROC
pushf
CFI_ADJUST_CFA_OFFSET 8
- DISABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI))
+ DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
SWAPGS
gs_change:
movl %edi,%gs
@@ -1260,14 +1260,14 @@ ENTRY(call_softirq)
CFI_REL_OFFSET rbp,0
mov %rsp,%rbp
CFI_DEF_CFA_REGISTER rbp
- incl %gs:pda_irqcount
- cmove %gs:pda_irqstackptr,%rsp
+ incl PER_CPU_VAR(irq_count)
+ cmove PER_CPU_VAR(irq_stack_ptr),%rsp
push %rbp # backlink for old unwinder
call __do_softirq
leaveq
CFI_DEF_CFA_REGISTER rsp
CFI_ADJUST_CFA_OFFSET -8
- decl %gs:pda_irqcount
+ decl PER_CPU_VAR(irq_count)
ret
CFI_ENDPROC
END(call_softirq)
@@ -1297,15 +1297,15 @@ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
movq %rdi, %rsp # we don't return, adjust the stack frame
CFI_ENDPROC
DEFAULT_FRAME
-11: incl %gs:pda_irqcount
+11: incl PER_CPU_VAR(irq_count)
movq %rsp,%rbp
CFI_DEF_CFA_REGISTER rbp
- cmovzq %gs:pda_irqstackptr,%rsp
+ cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
pushq %rbp # backlink for old unwinder
call xen_evtchn_do_upcall
popq %rsp
CFI_DEF_CFA_REGISTER rsp
- decl %gs:pda_irqcount
+ decl PER_CPU_VAR(irq_count)
jmp error_exit
CFI_ENDPROC
END(do_hypervisor_callback)
diff --git a/arch/x86/kernel/es7000_32.c b/arch/x86/kernel/es7000_32.c
deleted file mode 100644
index 53699c931ad..00000000000
--- a/arch/x86/kernel/es7000_32.c
+++ /dev/null
@@ -1,378 +0,0 @@
-/*
- * Written by: Garry Forsgren, Unisys Corporation
- * Natalie Protasevich, Unisys Corporation
- * This file contains the code to configure and interface
- * with Unisys ES7000 series hardware system manager.
- *
- * Copyright (c) 2003 Unisys Corporation. All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston MA 02111-1307, USA.
- *
- * Contact information: Unisys Corporation, Township Line & Union Meeting
- * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or:
- *
- * http://www.unisys.com
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/smp.h>
-#include <linux/string.h>
-#include <linux/spinlock.h>
-#include <linux/errno.h>
-#include <linux/notifier.h>
-#include <linux/reboot.h>
-#include <linux/init.h>
-#include <linux/acpi.h>
-#include <asm/io.h>
-#include <asm/nmi.h>
-#include <asm/smp.h>
-#include <asm/atomic.h>
-#include <asm/apicdef.h>
-#include <mach_mpparse.h>
-#include <asm/genapic.h>
-#include <asm/setup.h>
-
-/*
- * ES7000 chipsets
- */
-
-#define NON_UNISYS 0
-#define ES7000_CLASSIC 1
-#define ES7000_ZORRO 2
-
-
-#define MIP_REG 1
-#define MIP_PSAI_REG 4
-
-#define MIP_BUSY 1
-#define MIP_SPIN 0xf0000
-#define MIP_VALID 0x0100000000000000ULL
-#define MIP_PORT(VALUE) ((VALUE >> 32) & 0xffff)
-
-#define MIP_RD_LO(VALUE) (VALUE & 0xffffffff)
-
-struct mip_reg_info {
- unsigned long long mip_info;
- unsigned long long delivery_info;
- unsigned long long host_reg;
- unsigned long long mip_reg;
-};
-
-struct part_info {
- unsigned char type;
- unsigned char length;
- unsigned char part_id;
- unsigned char apic_mode;
- unsigned long snum;
- char ptype[16];
- char sname[64];
- char pname[64];
-};
-
-struct psai {
- unsigned long long entry_type;
- unsigned long long addr;
- unsigned long long bep_addr;
-};
-
-struct es7000_mem_info {
- unsigned char type;
- unsigned char length;
- unsigned char resv[6];
- unsigned long long start;
- unsigned long long size;
-};
-
-struct es7000_oem_table {
- unsigned long long hdr;
- struct mip_reg_info mip;
- struct part_info pif;
- struct es7000_mem_info shm;
- struct psai psai;
-};
-
-#ifdef CONFIG_ACPI
-
-struct oem_table {
- struct acpi_table_header Header;
- u32 OEMTableAddr;
- u32 OEMTableSize;
-};
-
-extern int find_unisys_acpi_oem_table(unsigned long *oem_addr);
-extern void unmap_unisys_acpi_oem_table(unsigned long oem_addr);
-#endif
-
-struct mip_reg {
- unsigned long long off_0;
- unsigned long long off_8;
- unsigned long long off_10;
- unsigned long long off_18;
- unsigned long long off_20;
- unsigned long long off_28;
- unsigned long long off_30;
- unsigned long long off_38;
-};
-
-#define MIP_SW_APIC 0x1020b
-#define MIP_FUNC(VALUE) (VALUE & 0xff)
-
-/*
- * ES7000 Globals
- */
-
-static volatile unsigned long *psai = NULL;
-static struct mip_reg *mip_reg;
-static struct mip_reg *host_reg;
-static int mip_port;
-static unsigned long mip_addr, host_addr;
-
-int es7000_plat;
-
-/*
- * GSI override for ES7000 platforms.
- */
-
-static unsigned int base;
-
-static int
-es7000_rename_gsi(int ioapic, int gsi)
-{
- if (es7000_plat == ES7000_ZORRO)
- return gsi;
-
- if (!base) {
- int i;
- for (i = 0; i < nr_ioapics; i++)
- base += nr_ioapic_registers[i];
- }
-
- if (!ioapic && (gsi < 16))
- gsi += base;
- return gsi;
-}
-
-static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
-{
- unsigned long vect = 0, psaival = 0;
-
- if (psai == NULL)
- return -1;
-
- vect = ((unsigned long)__pa(eip)/0x1000) << 16;
- psaival = (0x1000000 | vect | cpu);
-
- while (*psai & 0x1000000)
- ;
-
- *psai = psaival;
-
- return 0;
-}
-
-static void noop_wait_for_deassert(atomic_t *deassert_not_used)
-{
-}
-
-static int __init es7000_update_genapic(void)
-{
- genapic->wakeup_cpu = wakeup_secondary_cpu_via_mip;
-
- /* MPENTIUMIII */
- if (boot_cpu_data.x86 == 6 &&
- (boot_cpu_data.x86_model >= 7 || boot_cpu_data.x86_model <= 11)) {
- es7000_update_genapic_to_cluster();
- genapic->wait_for_init_deassert = noop_wait_for_deassert;
- genapic->wakeup_cpu = wakeup_secondary_cpu_via_mip;
- }
-
- return 0;
-}
-
-void __init
-setup_unisys(void)
-{
- /*
- * Determine the generation of the ES7000 currently running.
- *
- * es7000_plat = 1 if the machine is a 5xx ES7000 box
- * es7000_plat = 2 if the machine is a x86_64 ES7000 box
- *
- */
- if (!(boot_cpu_data.x86 <= 15 && boot_cpu_data.x86_model <= 2))
- es7000_plat = ES7000_ZORRO;
- else
- es7000_plat = ES7000_CLASSIC;
- ioapic_renumber_irq = es7000_rename_gsi;
-
- x86_quirks->update_genapic = es7000_update_genapic;
-}
-
-/*
- * Parse the OEM Table
- */
-
-int __init
-parse_unisys_oem (char *oemptr)
-{
- int i;
- int success = 0;
- unsigned char type, size;
- unsigned long val;
- char *tp = NULL;
- struct psai *psaip = NULL;
- struct mip_reg_info *mi;
- struct mip_reg *host, *mip;
-
- tp = oemptr;
-
- tp += 8;
-
- for (i=0; i <= 6; i++) {
- type = *tp++;
- size = *tp++;
- tp -= 2;
- switch (type) {
- case MIP_REG:
- mi = (struct mip_reg_info *)tp;
- val = MIP_RD_LO(mi->host_reg);
- host_addr = val;
- host = (struct mip_reg *)val;
- host_reg = __va(host);
- val = MIP_RD_LO(mi->mip_reg);
- mip_port = MIP_PORT(mi->mip_info);
- mip_addr = val;
- mip = (struct mip_reg *)val;
- mip_reg = __va(mip);
- pr_debug("es7000_mipcfg: host_reg = 0x%lx \n",
- (unsigned long)host_reg);
- pr_debug("es7000_mipcfg: mip_reg = 0x%lx \n",
- (unsigned long)mip_reg);
- success++;
- break;
- case MIP_PSAI_REG:
- psaip = (struct psai *)tp;
- if (tp != NULL) {
- if (psaip->addr)
- psai = __va(psaip->addr);
- else
- psai = NULL;
- success++;
- }
- break;
- default:
- break;
- }
- tp += size;
- }
-
- if (success < 2) {
- es7000_plat = NON_UNISYS;
- } else
- setup_unisys();
- return es7000_plat;
-}
-
-#ifdef CONFIG_ACPI
-static unsigned long oem_addrX;
-static unsigned long oem_size;
-int __init find_unisys_acpi_oem_table(unsigned long *oem_addr)
-{
- struct acpi_table_header *header = NULL;
- int i = 0;
-
- while (ACPI_SUCCESS(acpi_get_table("OEM1", i++, &header))) {
- if (!memcmp((char *) &header->oem_id, "UNISYS", 6)) {
- struct oem_table *t = (struct oem_table *)header;
-
- oem_addrX = t->OEMTableAddr;
- oem_size = t->OEMTableSize;
-
- *oem_addr = (unsigned long)__acpi_map_table(oem_addrX,
- oem_size);
- return 0;
- }
- }
- return -1;
-}
-
-void __init unmap_unisys_acpi_oem_table(unsigned long oem_addr)
-{
-}
-#endif
-
-static void
-es7000_spin(int n)
-{
- int i = 0;
-
- while (i++ < n)
- rep_nop();
-}
-
-static int __init
-es7000_mip_write(struct mip_reg *mip_reg)
-{
- int status = 0;
- int spin;
-
- spin = MIP_SPIN;
- while (((unsigned long long)host_reg->off_38 &
- (unsigned long long)MIP_VALID) != 0) {
- if (--spin <= 0) {
- printk("es7000_mip_write: Timeout waiting for Host Valid Flag");
- return -1;
- }
- es7000_spin(MIP_SPIN);
- }
-
- memcpy(host_reg, mip_reg, sizeof(struct mip_reg));
- outb(1, mip_port);
-
- spin = MIP_SPIN;
-
- while (((unsigned long long)mip_reg->off_38 &
- (unsigned long long)MIP_VALID) == 0) {
- if (--spin <= 0) {
- printk("es7000_mip_write: Timeout waiting for MIP Valid Flag");
- return -1;
- }
- es7000_spin(MIP_SPIN);
- }
-
- status = ((unsigned long long)mip_reg->off_0 &
- (unsigned long long)0xffff0000000000ULL) >> 48;
- mip_reg->off_38 = ((unsigned long long)mip_reg->off_38 &
- (unsigned long long)~MIP_VALID);
- return status;
-}
-
-void __init
-es7000_sw_apic(void)
-{
- if (es7000_plat) {
- int mip_status;
- struct mip_reg es7000_mip_reg;
-
- printk("ES7000: Enabling APIC mode.\n");
- memset(&es7000_mip_reg, 0, sizeof(struct mip_reg));
- es7000_mip_reg.off_0 = MIP_SW_APIC;
- es7000_mip_reg.off_38 = (MIP_VALID);
- while ((mip_status = es7000_mip_write(&es7000_mip_reg)) != 0)
- printk("es7000_sw_apic: command failed, status = %x\n",
- mip_status);
- return;
- }
-}
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
deleted file mode 100644
index 2bced78b0b8..00000000000
--- a/arch/x86/kernel/genapic_64.c
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright 2004 James Cleverdon, IBM.
- * Subject to the GNU Public License, v.2
- *
- * Generic APIC sub-arch probe layer.
- *
- * Hacked for x86-64 by James Cleverdon from i386 architecture code by
- * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
- * James Cleverdon.
- */
-#include <linux/threads.h>
-#include <linux/cpumask.h>
-#include <linux/string.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/ctype.h>
-#include <linux/init.h>
-#include <linux/hardirq.h>
-#include <linux/dmar.h>
-
-#include <asm/smp.h>
-#include <asm/ipi.h>
-#include <asm/genapic.h>
-#include <asm/setup.h>
-
-extern struct genapic apic_flat;
-extern struct genapic apic_physflat;
-extern struct genapic apic_x2xpic_uv_x;
-extern struct genapic apic_x2apic_phys;
-extern struct genapic apic_x2apic_cluster;
-
-struct genapic __read_mostly *genapic = &apic_flat;
-
-static struct genapic *apic_probe[] __initdata = {
- &apic_x2apic_uv_x,
- &apic_x2apic_phys,
- &apic_x2apic_cluster,
- &apic_physflat,
- NULL,
-};
-
-/*
- * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
- */
-void __init setup_apic_routing(void)
-{
- if (genapic == &apic_x2apic_phys || genapic == &apic_x2apic_cluster) {
- if (!intr_remapping_enabled)
- genapic = &apic_flat;
- }
-
- if (genapic == &apic_flat) {
- if (max_physical_apicid >= 8)
- genapic = &apic_physflat;
- printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
- }
-
- if (x86_quirks->update_genapic)
- x86_quirks->update_genapic();
-}
-
-/* Same for both flat and physical. */
-
-void apic_send_IPI_self(int vector)
-{
- __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
-}
-
-int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
-{
- int i;
-
- for (i = 0; apic_probe[i]; ++i) {
- if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) {
- genapic = apic_probe[i];
- printk(KERN_INFO "Setting APIC routing to %s.\n",
- genapic->name);
- return 1;
- }
- }
- return 0;
-}
diff --git a/arch/x86/kernel/genx2apic_cluster.c b/arch/x86/kernel/genx2apic_cluster.c
deleted file mode 100644
index 6ce497cc372..00000000000
--- a/arch/x86/kernel/genx2apic_cluster.c
+++ /dev/null
@@ -1,198 +0,0 @@
-#include <linux/threads.h>
-#include <linux/cpumask.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/ctype.h>
-#include <linux/init.h>
-#include <linux/dmar.h>
-
-#include <asm/smp.h>
-#include <asm/ipi.h>
-#include <asm/genapic.h>
-
-DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
-
-static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
-{
- if (cpu_has_x2apic)
- return 1;
-
- return 0;
-}
-
-/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
-
-static const struct cpumask *x2apic_target_cpus(void)
-{
- return cpumask_of(0);
-}
-
-/*
- * for now each logical cpu is in its own vector allocation domain.
- */
-static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
- cpumask_clear(retmask);
- cpumask_set_cpu(cpu, retmask);
-}
-
-static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
- unsigned int dest)
-{
- unsigned long cfg;
-
- cfg = __prepare_ICR(0, vector, dest);
-
- /*
- * send the IPI.
- */
- x2apic_icr_write(cfg, apicid);
-}
-
-/*
- * for now, we send the IPI's one by one in the cpumask.
- * TBD: Based on the cpu mask, we can send the IPI's to the cluster group
- * at once. We have 16 cpu's in a cluster. This will minimize IPI register
- * writes.
- */
-static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
-{
- unsigned long flags;
- unsigned long query_cpu;
-
- local_irq_save(flags);
- for_each_cpu(query_cpu, mask)
- __x2apic_send_IPI_dest(
- per_cpu(x86_cpu_to_logical_apicid, query_cpu),
- vector, APIC_DEST_LOGICAL);
- local_irq_restore(flags);
-}
-
-static void x2apic_send_IPI_mask_allbutself(const struct cpumask *mask,
- int vector)
-{
- unsigned long flags;
- unsigned long query_cpu;
- unsigned long this_cpu = smp_processor_id();
-
- local_irq_save(flags);
- for_each_cpu(query_cpu, mask)
- if (query_cpu != this_cpu)
- __x2apic_send_IPI_dest(
- per_cpu(x86_cpu_to_logical_apicid, query_cpu),
- vector, APIC_DEST_LOGICAL);
- local_irq_restore(flags);
-}
-
-static void x2apic_send_IPI_allbutself(int vector)
-{
- unsigned long flags;
- unsigned long query_cpu;
- unsigned long this_cpu = smp_processor_id();
-
- local_irq_save(flags);
- for_each_online_cpu(query_cpu)
- if (query_cpu != this_cpu)
- __x2apic_send_IPI_dest(
- per_cpu(x86_cpu_to_logical_apicid, query_cpu),
- vector, APIC_DEST_LOGICAL);
- local_irq_restore(flags);
-}
-
-static void x2apic_send_IPI_all(int vector)
-{
- x2apic_send_IPI_mask(cpu_online_mask, vector);
-}
-
-static int x2apic_apic_id_registered(void)
-{
- return 1;
-}
-
-static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
-{
- int cpu;
-
- /*
- * We're using fixed IRQ delivery, can only return one logical APIC ID.
- * May as well be the first.
- */
- cpu = cpumask_first(cpumask);
- if ((unsigned)cpu < nr_cpu_ids)
- return per_cpu(x86_cpu_to_logical_apicid, cpu);
- else
- return BAD_APICID;
-}
-
-static unsigned int x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
- const struct cpumask *andmask)
-{
- int cpu;
-
- /*
- * We're using fixed IRQ delivery, can only return one logical APIC ID.
- * May as well be the first.
- */
- for_each_cpu_and(cpu, cpumask, andmask)
- if (cpumask_test_cpu(cpu, cpu_online_mask))
- break;
- if (cpu < nr_cpu_ids)
- return per_cpu(x86_cpu_to_logical_apicid, cpu);
- return BAD_APICID;
-}
-
-static unsigned int get_apic_id(unsigned long x)
-{
- unsigned int id;
-
- id = x;
- return id;
-}
-
-static unsigned long set_apic_id(unsigned int id)
-{
- unsigned long x;
-
- x = id;
- return x;
-}
-
-static unsigned int phys_pkg_id(int index_msb)
-{
- return current_cpu_data.initial_apicid >> index_msb;
-}
-
-static void x2apic_send_IPI_self(int vector)
-{
- apic_write(APIC_SELF_IPI, vector);
-}
-
-static void init_x2apic_ldr(void)
-{
- int cpu = smp_processor_id();
-
- per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR);
- return;
-}
-
-struct genapic apic_x2apic_cluster = {
- .name = "cluster x2apic",
- .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
- .int_delivery_mode = dest_LowestPrio,
- .int_dest_mode = (APIC_DEST_LOGICAL != 0),
- .target_cpus = x2apic_target_cpus,
- .vector_allocation_domain = x2apic_vector_allocation_domain,
- .apic_id_registered = x2apic_apic_id_registered,
- .init_apic_ldr = init_x2apic_ldr,
- .send_IPI_all = x2apic_send_IPI_all,
- .send_IPI_allbutself = x2apic_send_IPI_allbutself,
- .send_IPI_mask = x2apic_send_IPI_mask,
- .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
- .send_IPI_self = x2apic_send_IPI_self,
- .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
- .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
- .phys_pkg_id = phys_pkg_id,
- .get_apic_id = get_apic_id,
- .set_apic_id = set_apic_id,
- .apic_id_mask = (0xFFFFFFFFu),
-};
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index ac108d1fe18..3f8579f8d42 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -18,7 +18,7 @@ void __init i386_start_kernel(void)
{
reserve_trampoline_memory();
- reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
+ reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
#ifdef CONFIG_BLK_DEV_INITRD
/* Reserve INITRD */
@@ -29,9 +29,6 @@ void __init i386_start_kernel(void)
reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
}
#endif
- reserve_early(init_pg_tables_start, init_pg_tables_end,
- "INIT_PG_TABLE");
-
reserve_ebda_region();
/*
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index b9a4d8c4b93..70eaa852c73 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -26,27 +26,6 @@
#include <asm/bios_ebda.h>
#include <asm/trampoline.h>
-/* boot cpu pda */
-static struct x8664_pda _boot_cpu_pda;
-
-#ifdef CONFIG_SMP
-/*
- * We install an empty cpu_pda pointer table to indicate to early users
- * (numa_set_node) that the cpu_pda pointer table for cpus other than
- * the boot cpu is not yet setup.
- */
-static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
-#else
-static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
-#endif
-
-void __init x86_64_init_pda(void)
-{
- _cpu_pda = __cpu_pda;
- cpu_pda(0) = &_boot_cpu_pda;
- pda_init(0);
-}
-
static void __init zap_identity_mappings(void)
{
pgd_t *pgd = pgd_offset_k(0UL);
@@ -112,8 +91,6 @@ void __init x86_64_start_kernel(char * real_mode_data)
if (console_loglevel == 10)
early_printk("Kernel alive\n");
- x86_64_init_pda();
-
x86_64_start_reservations(real_mode_data);
}
@@ -123,7 +100,7 @@ void __init x86_64_start_reservations(char *real_mode_data)
reserve_trampoline_memory();
- reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
+ reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
#ifdef CONFIG_BLK_DEV_INITRD
/* Reserve INITRD */
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index e835b4eea70..30683883e0c 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -11,14 +11,15 @@
#include <linux/init.h>
#include <linux/linkage.h>
#include <asm/segment.h>
-#include <asm/page.h>
-#include <asm/pgtable.h>
+#include <asm/page_types.h>
+#include <asm/pgtable_types.h>
#include <asm/desc.h>
#include <asm/cache.h>
#include <asm/thread_info.h>
#include <asm/asm-offsets.h>
#include <asm/setup.h>
#include <asm/processor-flags.h>
+#include <asm/percpu.h>
/* Physical address */
#define pa(X) ((X) - __PAGE_OFFSET)
@@ -37,42 +38,40 @@
#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id
/*
- * This is how much memory *in addition to the memory covered up to
- * and including _end* we need mapped initially.
+ * This is how much memory in addition to the memory covered up to
+ * and including _end we need mapped initially.
* We need:
- * - one bit for each possible page, but only in low memory, which means
- * 2^32/4096/8 = 128K worst case (4G/4G split.)
- * - enough space to map all low memory, which means
- * (2^32/4096) / 1024 pages (worst case, non PAE)
- * (2^32/4096) / 512 + 4 pages (worst case for PAE)
- * - a few pages for allocator use before the kernel pagetable has
- * been set up
+ * (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE)
+ * (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE)
*
* Modulo rounding, each megabyte assigned here requires a kilobyte of
* memory, which is currently unreclaimed.
*
* This should be a multiple of a page.
+ *
+ * KERNEL_IMAGE_SIZE should be greater than pa(_end)
+ * and small than max_low_pfn, otherwise will waste some page table entries
*/
-LOW_PAGES = 1<<(32-PAGE_SHIFT_asm)
-
-/*
- * To preserve the DMA pool in PAGEALLOC kernels, we'll allocate
- * pagetables from above the 16MB DMA limit, so we'll have to set
- * up pagetables 16MB more (worst-case):
- */
-#ifdef CONFIG_DEBUG_PAGEALLOC
-LOW_PAGES = LOW_PAGES + 0x1000000
-#endif
#if PTRS_PER_PMD > 1
-PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD
+#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD)
#else
-PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PGD)
+#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
#endif
-BOOTBITMAP_SIZE = LOW_PAGES / 8
-ALLOCATOR_SLOP = 4
-INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm
+/* Enough space to fit pagetables for the low memory linear map */
+MAPPING_BEYOND_END = \
+ PAGE_TABLE_SIZE(((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) << PAGE_SHIFT
+
+/*
+ * Worst-case size of the kernel mapping we need to make:
+ * the worst-case size of the kernel itself, plus the extra we need
+ * to map for the linear map.
+ */
+KERNEL_PAGES = (KERNEL_IMAGE_SIZE + MAPPING_BEYOND_END)>>PAGE_SHIFT
+
+INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE_asm
+RESERVE_BRK(pagetables, INIT_MAP_SIZE)
/*
* 32-bit kernel entrypoint; only used by the boot CPU. On entry,
@@ -165,10 +164,10 @@ num_subarch_entries = (. - subarch_entries) / 4
/*
* Initialize page tables. This creates a PDE and a set of page
- * tables, which are located immediately beyond _end. The variable
- * init_pg_tables_end is set up to point to the first "safe" location.
+ * tables, which are located immediately beyond __brk_base. The variable
+ * _brk_end is set up to point to the first "safe" location.
* Mappings are created both at virtual address 0 (identity mapping)
- * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END.
+ * and PAGE_OFFSET for up to _end.
*
* Note that the stack is not yet set up!
*/
@@ -189,8 +188,7 @@ default_entry:
xorl %ebx,%ebx /* %ebx is kept at zero */
- movl $pa(pg0), %edi
- movl %edi, pa(init_pg_tables_start)
+ movl $pa(__brk_base), %edi
movl $pa(swapper_pg_pmd), %edx
movl $PTE_IDENT_ATTR, %eax
10:
@@ -208,14 +206,14 @@ default_entry:
loop 11b
/*
- * End condition: we must map up to and including INIT_MAP_BEYOND_END
- * bytes beyond the end of our own page tables.
+ * End condition: we must map up to the end + MAPPING_BEYOND_END.
*/
- leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
+ movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
cmpl %ebp,%eax
jb 10b
1:
- movl %edi,pa(init_pg_tables_end)
+ addl $__PAGE_OFFSET, %edi
+ movl %edi, pa(_brk_end)
shrl $12, %eax
movl %eax, pa(max_pfn_mapped)
@@ -226,8 +224,7 @@ default_entry:
page_pde_offset = (__PAGE_OFFSET >> 20);
- movl $pa(pg0), %edi
- movl %edi, pa(init_pg_tables_start)
+ movl $pa(__brk_base), %edi
movl $pa(swapper_pg_dir), %edx
movl $PTE_IDENT_ATTR, %eax
10:
@@ -241,14 +238,13 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
addl $0x1000,%eax
loop 11b
/*
- * End condition: we must map up to and including INIT_MAP_BEYOND_END
- * bytes beyond the end of our own page tables; the +0x007 is
- * the attribute bits
+ * End condition: we must map up to the end + MAPPING_BEYOND_END.
*/
- leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
+ movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
cmpl %ebp,%eax
jb 10b
- movl %edi,pa(init_pg_tables_end)
+ addl $__PAGE_OFFSET, %edi
+ movl %edi, pa(_brk_end)
shrl $12, %eax
movl %eax, pa(max_pfn_mapped)
@@ -429,14 +425,34 @@ is386: movl $2,%ecx # set MP
ljmp $(__KERNEL_CS),$1f
1: movl $(__KERNEL_DS),%eax # reload all the segment registers
movl %eax,%ss # after changing gdt.
- movl %eax,%fs # gets reset once there's real percpu
movl $(__USER_DS),%eax # DS/ES contains default USER segment
movl %eax,%ds
movl %eax,%es
- xorl %eax,%eax # Clear GS and LDT
+ movl $(__KERNEL_PERCPU), %eax
+ movl %eax,%fs # set this cpu's percpu
+
+#ifdef CONFIG_CC_STACKPROTECTOR
+ /*
+ * The linker can't handle this by relocation. Manually set
+ * base address in stack canary segment descriptor.
+ */
+ cmpb $0,ready
+ jne 1f
+ movl $per_cpu__gdt_page,%eax
+ movl $per_cpu__stack_canary,%ecx
+ subl $20, %ecx
+ movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
+ shrl $16, %ecx
+ movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
+ movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax)
+1:
+#endif
+ movl $(__KERNEL_STACK_CANARY),%eax
movl %eax,%gs
+
+ xorl %eax,%eax # Clear LDT
lldt %ax
cld # gcc2 wants the direction flag cleared at all times
@@ -446,8 +462,6 @@ is386: movl $2,%ecx # set MP
movb $1, ready
cmpb $0,%cl # the first CPU calls start_kernel
je 1f
- movl $(__KERNEL_PERCPU), %eax
- movl %eax,%fs # set this cpu's percpu
movl (stack_start), %esp
1:
#endif /* CONFIG_SMP */
@@ -548,12 +562,8 @@ early_fault:
pushl %eax
pushl %edx /* trapno */
pushl $fault_msg
-#ifdef CONFIG_EARLY_PRINTK
- call early_printk
-#else
call printk
#endif
-#endif
call dump_stack
hlt_loop:
hlt
@@ -580,11 +590,10 @@ ignore_int:
pushl 32(%esp)
pushl 40(%esp)
pushl $int_msg
-#ifdef CONFIG_EARLY_PRINTK
- call early_printk
-#else
call printk
-#endif
+
+ call dump_stack
+
addl $(5*4),%esp
popl %ds
popl %es
@@ -622,6 +631,7 @@ swapper_pg_fixmap:
.fill 1024,4,0
ENTRY(empty_zero_page)
.fill 4096,1,0
+
/*
* This starts the data section.
*/
@@ -660,7 +670,7 @@ early_recursion_flag:
.long 0
int_msg:
- .asciz "Unknown interrupt or fault at EIP %p %p %p\n"
+ .asciz "Unknown interrupt or fault at: %p %p %p\n"
fault_msg:
/* fault info: */
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 0e275d49556..54b29bb24e7 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -19,6 +19,7 @@
#include <asm/msr.h>
#include <asm/cache.h>
#include <asm/processor-flags.h>
+#include <asm/percpu.h>
#ifdef CONFIG_PARAVIRT
#include <asm/asm-offsets.h>
@@ -226,12 +227,15 @@ ENTRY(secondary_startup_64)
movl %eax,%fs
movl %eax,%gs
- /*
- * Setup up a dummy PDA. this is just for some early bootup code
- * that does in_interrupt()
- */
+ /* Set up %gs.
+ *
+ * The base of %gs always points to the bottom of the irqstack
+ * union. If the stack protector canary is enabled, it is
+ * located at %gs:40. Note that, on SMP, the boot cpu uses
+ * init data section till per cpu areas are set up.
+ */
movl $MSR_GS_BASE,%ecx
- movq $empty_zero_page,%rax
+ movq initial_gs(%rip),%rax
movq %rax,%rdx
shrq $32,%rdx
wrmsr
@@ -257,6 +261,8 @@ ENTRY(secondary_startup_64)
.align 8
ENTRY(initial_code)
.quad x86_64_start_kernel
+ ENTRY(initial_gs)
+ .quad INIT_PER_CPU_VAR(irq_stack_union)
__FINITDATA
ENTRY(stack_start)
@@ -323,8 +329,6 @@ early_idt_ripmsg:
#endif /* CONFIG_EARLY_PRINTK */
.previous
-.balign PAGE_SIZE
-
#define NEXT_PAGE(name) \
.balign PAGE_SIZE; \
ENTRY(name)
@@ -401,7 +405,8 @@ NEXT_PAGE(level2_spare_pgt)
.globl early_gdt_descr
early_gdt_descr:
.word GDT_ENTRIES*8-1
- .quad per_cpu__gdt_page
+early_gdt_descr_base:
+ .quad INIT_PER_CPU_VAR(gdt_page)
ENTRY(phys_base)
/* This must match the first entry in level2_kernel_pgt */
@@ -412,7 +417,7 @@ ENTRY(phys_base)
.section .bss, "aw", @nobits
.align L1_CACHE_BYTES
ENTRY(idt_table)
- .skip 256 * 16
+ .skip IDT_ENTRIES * 16
.section .bss.page_aligned, "aw", @nobits
.align PAGE_SIZE
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 11d5093eb28..df89102bef8 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -22,7 +22,6 @@
#include <asm/pgtable.h>
#include <asm/desc.h>
#include <asm/apic.h>
-#include <asm/arch_hooks.h>
#include <asm/i8259.h>
/*
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index b12208f4dfe..99c4d308f16 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -85,19 +85,8 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
t->io_bitmap_max = bytes;
-#ifdef CONFIG_X86_32
- /*
- * Sets the lazy trigger so that the next I/O operation will
- * reload the correct bitmap.
- * Reset the owner so that a process switch will not set
- * tss->io_bitmap_base to IO_BITMAP_OFFSET.
- */
- tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
- tss->io_bitmap_owner = NULL;
-#else
/* Update the TSS: */
memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated);
-#endif
put_cpu();
@@ -131,9 +120,8 @@ static int do_iopl(unsigned int level, struct pt_regs *regs)
}
#ifdef CONFIG_X86_32
-asmlinkage long sys_iopl(unsigned long regsp)
+long sys_iopl(struct pt_regs *regs)
{
- struct pt_regs *regs = (struct pt_regs *)&regsp;
unsigned int level = regs->bx;
struct thread_struct *t = &current->thread;
int rc;
diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c
deleted file mode 100644
index 285bbf8831f..00000000000
--- a/arch/x86/kernel/ipi.c
+++ /dev/null
@@ -1,190 +0,0 @@
-#include <linux/cpumask.h>
-#include <linux/interrupt.h>
-#include <linux/init.h>
-
-#include <linux/mm.h>
-#include <linux/delay.h>
-#include <linux/spinlock.h>
-#include <linux/kernel_stat.h>
-#include <linux/mc146818rtc.h>
-#include <linux/cache.h>
-#include <linux/cpu.h>
-#include <linux/module.h>
-
-#include <asm/smp.h>
-#include <asm/mtrr.h>
-#include <asm/tlbflush.h>
-#include <asm/mmu_context.h>
-#include <asm/apic.h>
-#include <asm/proto.h>
-
-#ifdef CONFIG_X86_32
-#include <mach_apic.h>
-#include <mach_ipi.h>
-
-/*
- * the following functions deal with sending IPIs between CPUs.
- *
- * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
- */
-
-static inline int __prepare_ICR(unsigned int shortcut, int vector)
-{
- unsigned int icr = shortcut | APIC_DEST_LOGICAL;
-
- switch (vector) {
- default:
- icr |= APIC_DM_FIXED | vector;
- break;
- case NMI_VECTOR:
- icr |= APIC_DM_NMI;
- break;
- }
- return icr;
-}
-
-static inline int __prepare_ICR2(unsigned int mask)
-{
- return SET_APIC_DEST_FIELD(mask);
-}
-
-void __send_IPI_shortcut(unsigned int shortcut, int vector)
-{
- /*
- * Subtle. In the case of the 'never do double writes' workaround
- * we have to lock out interrupts to be safe. As we don't care
- * of the value read we use an atomic rmw access to avoid costly
- * cli/sti. Otherwise we use an even cheaper single atomic write
- * to the APIC.
- */
- unsigned int cfg;
-
- /*
- * Wait for idle.
- */
- apic_wait_icr_idle();
-
- /*
- * No need to touch the target chip field
- */
- cfg = __prepare_ICR(shortcut, vector);
-
- /*
- * Send the IPI. The write to APIC_ICR fires this off.
- */
- apic_write(APIC_ICR, cfg);
-}
-
-void send_IPI_self(int vector)
-{
- __send_IPI_shortcut(APIC_DEST_SELF, vector);
-}
-
-/*
- * This is used to send an IPI with no shorthand notation (the destination is
- * specified in bits 56 to 63 of the ICR).
- */
-static inline void __send_IPI_dest_field(unsigned long mask, int vector)
-{
- unsigned long cfg;
-
- /*
- * Wait for idle.
- */
- if (unlikely(vector == NMI_VECTOR))
- safe_apic_wait_icr_idle();
- else
- apic_wait_icr_idle();
-
- /*
- * prepare target chip field
- */
- cfg = __prepare_ICR2(mask);
- apic_write(APIC_ICR2, cfg);
-
- /*
- * program the ICR
- */
- cfg = __prepare_ICR(0, vector);
-
- /*
- * Send the IPI. The write to APIC_ICR fires this off.
- */
- apic_write(APIC_ICR, cfg);
-}
-
-/*
- * This is only used on smaller machines.
- */
-void send_IPI_mask_bitmask(const struct cpumask *cpumask, int vector)
-{
- unsigned long mask = cpumask_bits(cpumask)[0];
- unsigned long flags;
-
- local_irq_save(flags);
- WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]);
- __send_IPI_dest_field(mask, vector);
- local_irq_restore(flags);
-}
-
-void send_IPI_mask_sequence(const struct cpumask *mask, int vector)
-{
- unsigned long flags;
- unsigned int query_cpu;
-
- /*
- * Hack. The clustered APIC addressing mode doesn't allow us to send
- * to an arbitrary mask, so I do a unicasts to each CPU instead. This
- * should be modified to do 1 message per cluster ID - mbligh
- */
-
- local_irq_save(flags);
- for_each_cpu(query_cpu, mask)
- __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu), vector);
- local_irq_restore(flags);
-}
-
-void send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
-{
- unsigned long flags;
- unsigned int query_cpu;
- unsigned int this_cpu = smp_processor_id();
-
- /* See Hack comment above */
-
- local_irq_save(flags);
- for_each_cpu(query_cpu, mask)
- if (query_cpu != this_cpu)
- __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu),
- vector);
- local_irq_restore(flags);
-}
-
-/* must come after the send_IPI functions above for inlining */
-static int convert_apicid_to_cpu(int apic_id)
-{
- int i;
-
- for_each_possible_cpu(i) {
- if (per_cpu(x86_cpu_to_apicid, i) == apic_id)
- return i;
- }
- return -1;
-}
-
-int safe_smp_processor_id(void)
-{
- int apicid, cpuid;
-
- if (!boot_cpu_has(X86_FEATURE_APIC))
- return 0;
-
- apicid = hard_smp_processor_id();
- if (apicid == BAD_APICID)
- return 0;
-
- cpuid = convert_apicid_to_cpu(apicid);
-
- return cpuid >= 0 ? cpuid : 0;
-}
-#endif
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 3973e2df7f8..b8ac3b6cf77 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -6,13 +6,18 @@
#include <linux/kernel_stat.h>
#include <linux/seq_file.h>
#include <linux/smp.h>
+#include <linux/ftrace.h>
#include <asm/apic.h>
#include <asm/io_apic.h>
#include <asm/irq.h>
+#include <asm/idle.h>
atomic_t irq_err_count;
+/* Function pointer for generic interrupt vector handling */
+void (*generic_interrupt_extension)(void) = NULL;
+
/*
* 'what should we do if we get a hw irq event on an illegal vector'.
* each architecture has to answer this themselves.
@@ -36,63 +41,65 @@ void ack_bad_irq(unsigned int irq)
#endif
}
-#ifdef CONFIG_X86_32
-# define irq_stats(x) (&per_cpu(irq_stat, x))
-#else
-# define irq_stats(x) cpu_pda(x)
-#endif
+#define irq_stats(x) (&per_cpu(irq_stat, x))
/*
* /proc/interrupts printing:
*/
-static int show_other_interrupts(struct seq_file *p)
+static int show_other_interrupts(struct seq_file *p, int prec)
{
int j;
- seq_printf(p, "NMI: ");
+ seq_printf(p, "%*s: ", prec, "NMI");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->__nmi_count);
seq_printf(p, " Non-maskable interrupts\n");
#ifdef CONFIG_X86_LOCAL_APIC
- seq_printf(p, "LOC: ");
+ seq_printf(p, "%*s: ", prec, "LOC");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
seq_printf(p, " Local timer interrupts\n");
#endif
+ if (generic_interrupt_extension) {
+ seq_printf(p, "PLT: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->generic_irqs);
+ seq_printf(p, " Platform interrupts\n");
+ }
#ifdef CONFIG_SMP
- seq_printf(p, "RES: ");
+ seq_printf(p, "%*s: ", prec, "RES");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count);
seq_printf(p, " Rescheduling interrupts\n");
- seq_printf(p, "CAL: ");
+ seq_printf(p, "%*s: ", prec, "CAL");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_call_count);
seq_printf(p, " Function call interrupts\n");
- seq_printf(p, "TLB: ");
+ seq_printf(p, "%*s: ", prec, "TLB");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
seq_printf(p, " TLB shootdowns\n");
#endif
#ifdef CONFIG_X86_MCE
- seq_printf(p, "TRM: ");
+ seq_printf(p, "%*s: ", prec, "TRM");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
seq_printf(p, " Thermal event interrupts\n");
# ifdef CONFIG_X86_64
- seq_printf(p, "THR: ");
+ seq_printf(p, "%*s: ", prec, "THR");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
seq_printf(p, " Threshold APIC interrupts\n");
# endif
#endif
#ifdef CONFIG_X86_LOCAL_APIC
- seq_printf(p, "SPU: ");
+ seq_printf(p, "%*s: ", prec, "SPU");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
seq_printf(p, " Spurious interrupts\n");
#endif
- seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
+ seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
#if defined(CONFIG_X86_IO_APIC)
- seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
+ seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count));
#endif
return 0;
}
@@ -100,19 +107,22 @@ static int show_other_interrupts(struct seq_file *p)
int show_interrupts(struct seq_file *p, void *v)
{
unsigned long flags, any_count = 0;
- int i = *(loff_t *) v, j;
+ int i = *(loff_t *) v, j, prec;
struct irqaction *action;
struct irq_desc *desc;
if (i > nr_irqs)
return 0;
+ for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
+ j *= 10;
+
if (i == nr_irqs)
- return show_other_interrupts(p);
+ return show_other_interrupts(p, prec);
/* print header */
if (i == 0) {
- seq_printf(p, " ");
+ seq_printf(p, "%*s", prec + 8, "");
for_each_online_cpu(j)
seq_printf(p, "CPU%-8d", j);
seq_putc(p, '\n');
@@ -133,7 +143,7 @@ int show_interrupts(struct seq_file *p, void *v)
if (!action && !any_count)
goto out;
- seq_printf(p, "%3d: ", i);
+ seq_printf(p, "%*d: ", prec, i);
#ifndef CONFIG_SMP
seq_printf(p, "%10u ", kstat_irqs(i));
#else
@@ -165,6 +175,8 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
#ifdef CONFIG_X86_LOCAL_APIC
sum += irq_stats(cpu)->apic_timer_irqs;
#endif
+ if (generic_interrupt_extension)
+ sum += irq_stats(cpu)->generic_irqs;
#ifdef CONFIG_SMP
sum += irq_stats(cpu)->irq_resched_count;
sum += irq_stats(cpu)->irq_call_count;
@@ -192,4 +204,63 @@ u64 arch_irq_stat(void)
return sum;
}
+
+/*
+ * do_IRQ handles all normal device IRQ's (the special
+ * SMP cross-CPU interrupts have their own specific
+ * handlers).
+ */
+unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ /* high bit used in ret_from_ code */
+ unsigned vector = ~regs->orig_ax;
+ unsigned irq;
+
+ exit_idle();
+ irq_enter();
+
+ irq = __get_cpu_var(vector_irq)[vector];
+
+ if (!handle_irq(irq, regs)) {
+#ifdef CONFIG_X86_64
+ if (!disable_apic)
+ ack_APIC_irq();
+#endif
+
+ if (printk_ratelimit())
+ printk(KERN_EMERG "%s: %d.%d No irq handler for vector (irq %d)\n",
+ __func__, smp_processor_id(), vector, irq);
+ }
+
+ irq_exit();
+
+ set_irq_regs(old_regs);
+ return 1;
+}
+
+/*
+ * Handler for GENERIC_INTERRUPT_VECTOR.
+ */
+void smp_generic_interrupt(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ ack_APIC_irq();
+
+ exit_idle();
+
+ irq_enter();
+
+ inc_irq_stat(generic_irqs);
+
+ if (generic_interrupt_extension)
+ generic_interrupt_extension();
+
+ irq_exit();
+
+ set_irq_regs(old_regs);
+}
+
EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 74b9ff7341e..3b09634a515 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -16,6 +16,7 @@
#include <linux/cpu.h>
#include <linux/delay.h>
#include <linux/uaccess.h>
+#include <linux/percpu.h>
#include <asm/apic.h>
@@ -55,13 +56,13 @@ static inline void print_stack_overflow(void) { }
union irq_ctx {
struct thread_info tinfo;
u32 stack[THREAD_SIZE/sizeof(u32)];
-};
+} __attribute__((aligned(PAGE_SIZE)));
-static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
-static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
+static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx);
+static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx);
-static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
-static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
+static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, hardirq_stack);
+static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, softirq_stack);
static void call_on_stack(void *func, void *stack)
{
@@ -81,7 +82,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
u32 *isp, arg1, arg2;
curctx = (union irq_ctx *) current_thread_info();
- irqctx = hardirq_ctx[smp_processor_id()];
+ irqctx = __get_cpu_var(hardirq_ctx);
/*
* this is where we switch to the IRQ stack. However, if we are
@@ -125,34 +126,34 @@ void __cpuinit irq_ctx_init(int cpu)
{
union irq_ctx *irqctx;
- if (hardirq_ctx[cpu])
+ if (per_cpu(hardirq_ctx, cpu))
return;
- irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
+ irqctx = &per_cpu(hardirq_stack, cpu);
irqctx->tinfo.task = NULL;
irqctx->tinfo.exec_domain = NULL;
irqctx->tinfo.cpu = cpu;
irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
- hardirq_ctx[cpu] = irqctx;
+ per_cpu(hardirq_ctx, cpu) = irqctx;
- irqctx = (union irq_ctx *) &softirq_stack[cpu*THREAD_SIZE];
+ irqctx = &per_cpu(softirq_stack, cpu);
irqctx->tinfo.task = NULL;
irqctx->tinfo.exec_domain = NULL;
irqctx->tinfo.cpu = cpu;
irqctx->tinfo.preempt_count = 0;
irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
- softirq_ctx[cpu] = irqctx;
+ per_cpu(softirq_ctx, cpu) = irqctx;
printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
- cpu, hardirq_ctx[cpu], softirq_ctx[cpu]);
+ cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu));
}
void irq_ctx_exit(int cpu)
{
- hardirq_ctx[cpu] = NULL;
+ per_cpu(hardirq_ctx, cpu) = NULL;
}
asmlinkage void do_softirq(void)
@@ -169,7 +170,7 @@ asmlinkage void do_softirq(void)
if (local_softirq_pending()) {
curctx = current_thread_info();
- irqctx = softirq_ctx[smp_processor_id()];
+ irqctx = __get_cpu_var(softirq_ctx);
irqctx->tinfo.task = curctx->task;
irqctx->tinfo.previous_esp = current_stack_pointer;
@@ -191,33 +192,16 @@ static inline int
execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; }
#endif
-/*
- * do_IRQ handles all normal device IRQ's (the special
- * SMP cross-CPU interrupts have their own specific
- * handlers).
- */
-unsigned int do_IRQ(struct pt_regs *regs)
+bool handle_irq(unsigned irq, struct pt_regs *regs)
{
- struct pt_regs *old_regs;
- /* high bit used in ret_from_ code */
- int overflow;
- unsigned vector = ~regs->orig_ax;
struct irq_desc *desc;
- unsigned irq;
-
-
- old_regs = set_irq_regs(regs);
- irq_enter();
- irq = __get_cpu_var(vector_irq)[vector];
+ int overflow;
overflow = check_stack_overflow();
desc = irq_to_desc(irq);
- if (unlikely(!desc)) {
- printk(KERN_EMERG "%s: cannot handle IRQ %d vector %#x cpu %d\n",
- __func__, irq, vector, smp_processor_id());
- BUG();
- }
+ if (unlikely(!desc))
+ return false;
if (!execute_on_irq_stack(overflow, desc, irq)) {
if (unlikely(overflow))
@@ -225,13 +209,10 @@ unsigned int do_IRQ(struct pt_regs *regs)
desc->handle_irq(irq, desc);
}
- irq_exit();
- set_irq_regs(old_regs);
- return 1;
+ return true;
}
#ifdef CONFIG_HOTPLUG_CPU
-#include <mach_apic.h>
/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
void fixup_irqs(void)
@@ -248,7 +229,7 @@ void fixup_irqs(void)
if (irq == 2)
continue;
- affinity = &desc->affinity;
+ affinity = desc->affinity;
if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
printk("Breaking affinity for irq %i\n", irq);
affinity = cpu_all_mask;
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 63c88e6ec02..977d8b43a0d 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -18,6 +18,13 @@
#include <linux/smp.h>
#include <asm/io_apic.h>
#include <asm/idle.h>
+#include <asm/apic.h>
+
+DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
+EXPORT_PER_CPU_SYMBOL(irq_stat);
+
+DEFINE_PER_CPU(struct pt_regs *, irq_regs);
+EXPORT_PER_CPU_SYMBOL(irq_regs);
/*
* Probabilistic stack overflow check:
@@ -41,42 +48,18 @@ static inline void stack_overflow_check(struct pt_regs *regs)
#endif
}
-/*
- * do_IRQ handles all normal device IRQ's (the special
- * SMP cross-CPU interrupts have their own specific
- * handlers).
- */
-asmlinkage unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
+bool handle_irq(unsigned irq, struct pt_regs *regs)
{
- struct pt_regs *old_regs = set_irq_regs(regs);
struct irq_desc *desc;
- /* high bit used in ret_from_ code */
- unsigned vector = ~regs->orig_ax;
- unsigned irq;
-
- exit_idle();
- irq_enter();
- irq = __get_cpu_var(vector_irq)[vector];
-
stack_overflow_check(regs);
desc = irq_to_desc(irq);
- if (likely(desc))
- generic_handle_irq_desc(irq, desc);
- else {
- if (!disable_apic)
- ack_APIC_irq();
-
- if (printk_ratelimit())
- printk(KERN_EMERG "%s: %d.%d No irq handler for vector\n",
- __func__, smp_processor_id(), vector);
- }
-
- irq_exit();
+ if (unlikely(!desc))
+ return false;
- set_irq_regs(old_regs);
- return 1;
+ generic_handle_irq_desc(irq, desc);
+ return true;
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -100,7 +83,7 @@ void fixup_irqs(void)
/* interrupt's are disabled at this point */
spin_lock(&desc->lock);
- affinity = &desc->affinity;
+ affinity = desc->affinity;
if (!irq_has_action(irq) ||
cpumask_equal(affinity, cpu_online_mask)) {
spin_unlock(&desc->lock);
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 10a09c2f182..bc132610544 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -18,7 +18,7 @@
#include <asm/pgtable.h>
#include <asm/desc.h>
#include <asm/apic.h>
-#include <asm/arch_hooks.h>
+#include <asm/setup.h>
#include <asm/i8259.h>
#include <asm/traps.h>
@@ -78,6 +78,15 @@ void __init init_ISA_irqs(void)
}
}
+/*
+ * IRQ2 is cascade interrupt to second interrupt controller
+ */
+static struct irqaction irq2 = {
+ .handler = no_action,
+ .mask = CPU_MASK_NONE,
+ .name = "cascade",
+};
+
DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
[0 ... IRQ0_VECTOR - 1] = -1,
[IRQ0_VECTOR] = 0,
@@ -118,8 +127,8 @@ void __init native_init_IRQ(void)
{
int i;
- /* all the set up before the call gates are initialised */
- pre_intr_init_hook();
+ /* Execute any quirks before the call gates are initialised: */
+ x86_quirk_pre_intr_init();
/*
* Cover the whole vector space, no vector can escape
@@ -140,8 +149,15 @@ void __init native_init_IRQ(void)
*/
alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
- /* IPI for invalidation */
- alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
+ /* IPIs for invalidation */
+ alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
+ alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
+ alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
+ alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
+ alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
+ alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
+ alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
+ alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
/* IPI for generic function call */
alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
@@ -159,6 +175,9 @@ void __init native_init_IRQ(void)
/* self generated IPI for local APIC timer */
alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
+ /* generic IPI for platform specific use */
+ alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
+
/* IPI vectors for APIC spurious and error interrupts */
alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
@@ -169,10 +188,14 @@ void __init native_init_IRQ(void)
alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
#endif
- /* setup after call gates are initialised (usually add in
- * the architecture specific gates)
+ if (!acpi_ioapic)
+ setup_irq(2, &irq2);
+
+ /*
+ * Call quirks after call gates are initialised (usually add in
+ * the architecture specific gates):
*/
- intr_init_hook();
+ x86_quirk_intr_init();
/*
* External FPU? Set up irq13 if so, for
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index da481a1e3f3..c7a49e0ffbf 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -147,6 +147,9 @@ static void __init apic_intr_init(void)
/* self generated IPI for local APIC timer */
alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
+ /* generic IPI for platform specific use */
+ alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
+
/* IPI vectors for APIC spurious and error interrupts */
alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 10435a120d2..eedfaebe106 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -46,7 +46,7 @@
#include <asm/apicdef.h>
#include <asm/system.h>
-#include <mach_ipi.h>
+#include <asm/apic.h>
/*
* Put the error code here just in case the user cares:
@@ -347,7 +347,7 @@ void kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code)
*/
void kgdb_roundup_cpus(unsigned long flags)
{
- send_IPI_allbutself(APIC_DM_NMI);
+ apic->send_IPI_allbutself(APIC_DM_NMI);
}
#endif
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 4558dd3918c..55b94614e34 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -193,7 +193,7 @@ static int __kprobes can_boost(kprobe_opcode_t *opcodes)
kprobe_opcode_t opcode;
kprobe_opcode_t *orig_opcodes = opcodes;
- if (search_exception_tables(opcodes))
+ if (search_exception_tables((unsigned long)opcodes))
return 0; /* Page fault may occur on this address. */
retry:
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 478bca986ec..33019ddb56b 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -138,12 +138,6 @@ static void kvm_set_pte_atomic(pte_t *ptep, pte_t pte)
kvm_mmu_write(ptep, pte_val(pte));
}
-static void kvm_set_pte_present(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pte)
-{
- kvm_mmu_write(ptep, pte_val(pte));
-}
-
static void kvm_pte_clear(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
@@ -220,7 +214,6 @@ static void paravirt_ops_setup(void)
#if PAGETABLE_LEVELS >= 3
#ifdef CONFIG_X86_PAE
pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic;
- pv_mmu_ops.set_pte_present = kvm_set_pte_present;
pv_mmu_ops.pte_clear = kvm_pte_clear;
pv_mmu_ops.pmd_clear = kvm_pmd_clear;
#endif
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 652fce6d2cc..137f2e8132d 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -19,7 +19,6 @@
#include <linux/clocksource.h>
#include <linux/kvm_para.h>
#include <asm/pvclock.h>
-#include <asm/arch_hooks.h>
#include <asm/msr.h>
#include <asm/apic.h>
#include <linux/percpu.h>
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 37f420018a4..e7368c1da01 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -14,12 +14,12 @@
#include <linux/ftrace.h>
#include <linux/suspend.h>
#include <linux/gfp.h>
+#include <linux/io.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
-#include <asm/io.h>
#include <asm/apic.h>
#include <asm/cpufeature.h>
#include <asm/desc.h>
@@ -63,7 +63,7 @@ static void load_segments(void)
"\tmovl %%eax,%%fs\n"
"\tmovl %%eax,%%gs\n"
"\tmovl %%eax,%%ss\n"
- ::: "eax", "memory");
+ : : : "eax", "memory");
#undef STR
#undef __STR
}
@@ -121,7 +121,7 @@ static void machine_kexec_page_table_set_one(
static void machine_kexec_prepare_page_tables(struct kimage *image)
{
void *control_page;
- pmd_t *pmd = 0;
+ pmd_t *pmd = NULL;
control_page = page_address(image->control_code_page);
#ifdef CONFIG_X86_PAE
@@ -205,7 +205,8 @@ void machine_kexec(struct kimage *image)
if (image->preserve_context) {
#ifdef CONFIG_X86_IO_APIC
- /* We need to put APICs in legacy mode so that we can
+ /*
+ * We need to put APICs in legacy mode so that we can
* get timer interrupts in second kernel. kexec/kdump
* paths already have calls to disable_IO_APIC() in
* one form or other. kexec jump path also need
@@ -227,7 +228,8 @@ void machine_kexec(struct kimage *image)
page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
<< PAGE_SHIFT);
- /* The segment registers are funny things, they have both a
+ /*
+ * The segment registers are funny things, they have both a
* visible and an invisible part. Whenever the visible part is
* set to a specific selector, the invisible part is loaded
* with from a table in memory. At no other time is the
@@ -237,11 +239,12 @@ void machine_kexec(struct kimage *image)
* segments, before I zap the gdt with an invalid value.
*/
load_segments();
- /* The gdt & idt are now invalid.
+ /*
+ * The gdt & idt are now invalid.
* If you want to load them you must set up your own idt & gdt.
*/
- set_gdt(phys_to_virt(0),0);
- set_idt(phys_to_virt(0),0);
+ set_gdt(phys_to_virt(0), 0);
+ set_idt(phys_to_virt(0), 0);
/* now call it */
image->start = relocate_kernel_ptr((unsigned long)image->head,
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index c43caa3a91f..89cea4d4467 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -12,20 +12,47 @@
#include <linux/reboot.h>
#include <linux/numa.h>
#include <linux/ftrace.h>
+#include <linux/io.h>
+#include <linux/suspend.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
-#include <asm/io.h>
-#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
-static u64 kexec_pgd[512] PAGE_ALIGNED;
-static u64 kexec_pud0[512] PAGE_ALIGNED;
-static u64 kexec_pmd0[512] PAGE_ALIGNED;
-static u64 kexec_pte0[512] PAGE_ALIGNED;
-static u64 kexec_pud1[512] PAGE_ALIGNED;
-static u64 kexec_pmd1[512] PAGE_ALIGNED;
-static u64 kexec_pte1[512] PAGE_ALIGNED;
+static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
+ unsigned long addr)
+{
+ pud_t *pud;
+ pmd_t *pmd;
+ struct page *page;
+ int result = -ENOMEM;
+
+ addr &= PMD_MASK;
+ pgd += pgd_index(addr);
+ if (!pgd_present(*pgd)) {
+ page = kimage_alloc_control_pages(image, 0);
+ if (!page)
+ goto out;
+ pud = (pud_t *)page_address(page);
+ memset(pud, 0, PAGE_SIZE);
+ set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
+ }
+ pud = pud_offset(pgd, addr);
+ if (!pud_present(*pud)) {
+ page = kimage_alloc_control_pages(image, 0);
+ if (!page)
+ goto out;
+ pmd = (pmd_t *)page_address(page);
+ memset(pmd, 0, PAGE_SIZE);
+ set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+ }
+ pmd = pmd_offset(pud, addr);
+ if (!pmd_present(*pmd))
+ set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
+ result = 0;
+out:
+ return result;
+}
static void init_level2_page(pmd_t *level2p, unsigned long addr)
{
@@ -92,9 +119,8 @@ static int init_level4_page(struct kimage *image, pgd_t *level4p,
}
level3p = (pud_t *)page_address(page);
result = init_level3_page(image, level3p, addr, last_addr);
- if (result) {
+ if (result)
goto out;
- }
set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
addr += PGDIR_SIZE;
}
@@ -107,12 +133,72 @@ out:
return result;
}
+static void free_transition_pgtable(struct kimage *image)
+{
+ free_page((unsigned long)image->arch.pud);
+ free_page((unsigned long)image->arch.pmd);
+ free_page((unsigned long)image->arch.pte);
+}
+
+static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
+{
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ unsigned long vaddr, paddr;
+ int result = -ENOMEM;
+
+ vaddr = (unsigned long)relocate_kernel;
+ paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE);
+ pgd += pgd_index(vaddr);
+ if (!pgd_present(*pgd)) {
+ pud = (pud_t *)get_zeroed_page(GFP_KERNEL);
+ if (!pud)
+ goto err;
+ image->arch.pud = pud;
+ set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
+ }
+ pud = pud_offset(pgd, vaddr);
+ if (!pud_present(*pud)) {
+ pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
+ if (!pmd)
+ goto err;
+ image->arch.pmd = pmd;
+ set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+ }
+ pmd = pmd_offset(pud, vaddr);
+ if (!pmd_present(*pmd)) {
+ pte = (pte_t *)get_zeroed_page(GFP_KERNEL);
+ if (!pte)
+ goto err;
+ image->arch.pte = pte;
+ set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
+ }
+ pte = pte_offset_kernel(pmd, vaddr);
+ set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
+ return 0;
+err:
+ free_transition_pgtable(image);
+ return result;
+}
+
static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
{
pgd_t *level4p;
+ int result;
level4p = (pgd_t *)__va(start_pgtable);
- return init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT);
+ result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT);
+ if (result)
+ return result;
+ /*
+ * image->start may be outside 0 ~ max_pfn, for example when
+ * jump back to original kernel from kexeced kernel
+ */
+ result = init_one_level2_page(image, level4p, image->start);
+ if (result)
+ return result;
+ return init_transition_pgtable(image, level4p);
}
static void set_idt(void *newidt, u16 limit)
@@ -174,7 +260,7 @@ int machine_kexec_prepare(struct kimage *image)
void machine_kexec_cleanup(struct kimage *image)
{
- return;
+ free_transition_pgtable(image);
}
/*
@@ -185,36 +271,45 @@ void machine_kexec(struct kimage *image)
{
unsigned long page_list[PAGES_NR];
void *control_page;
+ int save_ftrace_enabled;
- tracer_disable();
+#ifdef CONFIG_KEXEC_JUMP
+ if (kexec_image->preserve_context)
+ save_processor_state();
+#endif
+
+ save_ftrace_enabled = __ftrace_enabled_save();
/* Interrupts aren't acceptable while we reboot */
local_irq_disable();
+ if (image->preserve_context) {
+#ifdef CONFIG_X86_IO_APIC
+ /*
+ * We need to put APICs in legacy mode so that we can
+ * get timer interrupts in second kernel. kexec/kdump
+ * paths already have calls to disable_IO_APIC() in
+ * one form or other. kexec jump path also need
+ * one.
+ */
+ disable_IO_APIC();
+#endif
+ }
+
control_page = page_address(image->control_code_page) + PAGE_SIZE;
- memcpy(control_page, relocate_kernel, PAGE_SIZE);
+ memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
- page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
- page_list[PA_PGD] = virt_to_phys(&kexec_pgd);
- page_list[VA_PGD] = (unsigned long)kexec_pgd;
- page_list[PA_PUD_0] = virt_to_phys(&kexec_pud0);
- page_list[VA_PUD_0] = (unsigned long)kexec_pud0;
- page_list[PA_PMD_0] = virt_to_phys(&kexec_pmd0);
- page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
- page_list[PA_PTE_0] = virt_to_phys(&kexec_pte0);
- page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
- page_list[PA_PUD_1] = virt_to_phys(&kexec_pud1);
- page_list[VA_PUD_1] = (unsigned long)kexec_pud1;
- page_list[PA_PMD_1] = virt_to_phys(&kexec_pmd1);
- page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
- page_list[PA_PTE_1] = virt_to_phys(&kexec_pte1);
- page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
-
+ page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
page_list[PA_TABLE_PAGE] =
(unsigned long)__pa(page_address(image->control_code_page));
- /* The segment registers are funny things, they have both a
+ if (image->type == KEXEC_TYPE_DEFAULT)
+ page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
+ << PAGE_SHIFT);
+
+ /*
+ * The segment registers are funny things, they have both a
* visible and an invisible part. Whenever the visible part is
* set to a specific selector, the invisible part is loaded
* with from a table in memory. At no other time is the
@@ -224,15 +319,25 @@ void machine_kexec(struct kimage *image)
* segments, before I zap the gdt with an invalid value.
*/
load_segments();
- /* The gdt & idt are now invalid.
+ /*
+ * The gdt & idt are now invalid.
* If you want to load them you must set up your own idt & gdt.
*/
- set_gdt(phys_to_virt(0),0);
- set_idt(phys_to_virt(0),0);
+ set_gdt(phys_to_virt(0), 0);
+ set_idt(phys_to_virt(0), 0);
/* now call it */
- relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
- image->start);
+ image->start = relocate_kernel((unsigned long)image->head,
+ (unsigned long)page_list,
+ image->start,
+ image->preserve_context);
+
+#ifdef CONFIG_KEXEC_JUMP
+ if (kexec_image->preserve_context)
+ restore_processor_state();
+#endif
+
+ __ftrace_enabled_restore(save_ftrace_enabled);
}
void arch_crash_save_vmcoreinfo(void)
diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c
index 2dc183758be..845d80ce1ef 100644
--- a/arch/x86/kernel/mca_32.c
+++ b/arch/x86/kernel/mca_32.c
@@ -51,7 +51,6 @@
#include <linux/ioport.h>
#include <asm/uaccess.h>
#include <linux/init.h>
-#include <asm/arch_hooks.h>
static unsigned char which_scsi;
@@ -474,6 +473,4 @@ void __kprobes mca_handle_nmi(void)
* adapter was responsible for the error.
*/
bus_for_each_dev(&mca_bus_type, NULL, NULL, mca_handle_nmi_callback);
-
- mca_nmi_hook();
-} /* mca_handle_nmi */
+}
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index b7f4c929e61..5e9f4fc5138 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -87,9 +87,9 @@
#include <linux/cpu.h>
#include <linux/firmware.h>
#include <linux/platform_device.h>
+#include <linux/uaccess.h>
#include <asm/msr.h>
-#include <asm/uaccess.h>
#include <asm/processor.h>
#include <asm/microcode.h>
@@ -196,7 +196,7 @@ static inline int update_match_cpu(struct cpu_signature *csig, int sig, int pf)
return (!sigmatch(sig, csig->sig, pf, csig->pf)) ? 0 : 1;
}
-static inline int
+static inline int
update_match_revision(struct microcode_header_intel *mc_header, int rev)
{
return (mc_header->rev <= rev) ? 0 : 1;
@@ -442,8 +442,8 @@ static int request_microcode_fw(int cpu, struct device *device)
return ret;
}
- ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size,
- &get_ucode_fw);
+ ret = generic_load_microcode(cpu, (void *)firmware->data,
+ firmware->size, &get_ucode_fw);
release_firmware(firmware);
@@ -460,7 +460,7 @@ static int request_microcode_user(int cpu, const void __user *buf, size_t size)
/* We should bind the task to the CPU */
BUG_ON(cpu != raw_smp_processor_id());
- return generic_load_microcode(cpu, (void*)buf, size, &get_ucode_user);
+ return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user);
}
static void microcode_fini_cpu(int cpu)
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index 666e43df51f..712d15fdc41 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -226,7 +226,7 @@ static int __devinit set_check_enable_amd_mmconf(const struct dmi_system_id *d)
return 0;
}
-static struct dmi_system_id __devinitdata mmconf_dmi_table[] = {
+static const struct dmi_system_id __cpuinitconst mmconf_dmi_table[] = {
{
.callback = set_check_enable_amd_mmconf,
.ident = "Sun Microsystems Machine",
diff --git a/arch/x86/kernel/module_32.c b/arch/x86/kernel/module_32.c
index 3db0a5442eb..0edd819050e 100644
--- a/arch/x86/kernel/module_32.c
+++ b/arch/x86/kernel/module_32.c
@@ -42,7 +42,7 @@ void module_free(struct module *mod, void *module_region)
{
vfree(module_region);
/* FIXME: If module_region == mod->init_region, trim exception
- table entries. */
+ table entries. */
}
/* We don't need anything special. */
@@ -113,13 +113,13 @@ int module_finalize(const Elf_Ehdr *hdr,
*para = NULL;
char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
- for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
+ for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
if (!strcmp(".text", secstrings + s->sh_name))
text = s;
if (!strcmp(".altinstructions", secstrings + s->sh_name))
alt = s;
if (!strcmp(".smp_locks", secstrings + s->sh_name))
- locks= s;
+ locks = s;
if (!strcmp(".parainstructions", secstrings + s->sh_name))
para = s;
}
diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module_64.c
index 6ba87830d4b..c23880b90b5 100644
--- a/arch/x86/kernel/module_64.c
+++ b/arch/x86/kernel/module_64.c
@@ -30,14 +30,14 @@
#include <asm/page.h>
#include <asm/pgtable.h>
-#define DEBUGP(fmt...)
+#define DEBUGP(fmt...)
#ifndef CONFIG_UML
void module_free(struct module *mod, void *module_region)
{
vfree(module_region);
/* FIXME: If module_region == mod->init_region, trim exception
- table entries. */
+ table entries. */
}
void *module_alloc(unsigned long size)
@@ -77,7 +77,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr;
Elf64_Sym *sym;
void *loc;
- u64 val;
+ u64 val;
DEBUGP("Applying relocate section %u to %u\n", relsec,
sechdrs[relsec].sh_info);
@@ -91,11 +91,11 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
sym = (Elf64_Sym *)sechdrs[symindex].sh_addr
+ ELF64_R_SYM(rel[i].r_info);
- DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n",
- (int)ELF64_R_TYPE(rel[i].r_info),
- sym->st_value, rel[i].r_addend, (u64)loc);
+ DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n",
+ (int)ELF64_R_TYPE(rel[i].r_info),
+ sym->st_value, rel[i].r_addend, (u64)loc);
- val = sym->st_value + rel[i].r_addend;
+ val = sym->st_value + rel[i].r_addend;
switch (ELF64_R_TYPE(rel[i].r_info)) {
case R_X86_64_NONE:
@@ -113,16 +113,16 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
if ((s64)val != *(s32 *)loc)
goto overflow;
break;
- case R_X86_64_PC32:
+ case R_X86_64_PC32:
val -= (u64)loc;
*(u32 *)loc = val;
#if 0
if ((s64)val != *(s32 *)loc)
- goto overflow;
+ goto overflow;
#endif
break;
default:
- printk(KERN_ERR "module %s: Unknown rela relocation: %Lu\n",
+ printk(KERN_ERR "module %s: Unknown rela relocation: %llu\n",
me->name, ELF64_R_TYPE(rel[i].r_info));
return -ENOEXEC;
}
@@ -130,7 +130,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
return 0;
overflow:
- printk(KERN_ERR "overflow in relocation type %d val %Lx\n",
+ printk(KERN_ERR "overflow in relocation type %d val %Lx\n",
(int)ELF64_R_TYPE(rel[i].r_info), val);
printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n",
me->name);
@@ -143,13 +143,13 @@ int apply_relocate(Elf_Shdr *sechdrs,
unsigned int relsec,
struct module *me)
{
- printk("non add relocation not supported\n");
+ printk(KERN_ERR "non add relocation not supported\n");
return -ENOSYS;
-}
+}
int module_finalize(const Elf_Ehdr *hdr,
- const Elf_Shdr *sechdrs,
- struct module *me)
+ const Elf_Shdr *sechdrs,
+ struct module *me)
{
const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
*para = NULL;
@@ -161,7 +161,7 @@ int module_finalize(const Elf_Ehdr *hdr,
if (!strcmp(".altinstructions", secstrings + s->sh_name))
alt = s;
if (!strcmp(".smp_locks", secstrings + s->sh_name))
- locks= s;
+ locks = s;
if (!strcmp(".parainstructions", secstrings + s->sh_name))
para = s;
}
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index a649a4ccad4..290cb57f469 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -3,7 +3,7 @@
* compliant MP-table parsing routines.
*
* (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
- * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
+ * (c) 1998, 1999, 2000, 2009 Ingo Molnar <mingo@redhat.com>
* (c) 2008 Alexey Starikovskiy <astarikovskiy@suse.de>
*/
@@ -29,12 +29,7 @@
#include <asm/setup.h>
#include <asm/smp.h>
-#include <mach_apic.h>
-#ifdef CONFIG_X86_32
-#include <mach_apicdef.h>
-#include <mach_mpparse.h>
-#endif
-
+#include <asm/apic.h>
/*
* Checksum an MP configuration block.
*/
@@ -114,9 +109,6 @@ static void __init MP_bus_info(struct mpc_bus *m)
} else
printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
}
-#endif
-
-#ifdef CONFIG_X86_IO_APIC
static int bad_ioapic(unsigned long address)
{
@@ -144,11 +136,11 @@ static void __init MP_ioapic_info(struct mpc_ioapic *m)
if (bad_ioapic(m->apicaddr))
return;
- mp_ioapics[nr_ioapics].mp_apicaddr = m->apicaddr;
- mp_ioapics[nr_ioapics].mp_apicid = m->apicid;
- mp_ioapics[nr_ioapics].mp_type = m->type;
- mp_ioapics[nr_ioapics].mp_apicver = m->apicver;
- mp_ioapics[nr_ioapics].mp_flags = m->flags;
+ mp_ioapics[nr_ioapics].apicaddr = m->apicaddr;
+ mp_ioapics[nr_ioapics].apicid = m->apicid;
+ mp_ioapics[nr_ioapics].type = m->type;
+ mp_ioapics[nr_ioapics].apicver = m->apicver;
+ mp_ioapics[nr_ioapics].flags = m->flags;
nr_ioapics++;
}
@@ -160,55 +152,55 @@ static void print_MP_intsrc_info(struct mpc_intsrc *m)
m->srcbusirq, m->dstapic, m->dstirq);
}
-static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq)
+static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq)
{
apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
" IRQ %02x, APIC ID %x, APIC INT %02x\n",
- mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3,
- (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus,
- mp_irq->mp_srcbusirq, mp_irq->mp_dstapic, mp_irq->mp_dstirq);
+ mp_irq->irqtype, mp_irq->irqflag & 3,
+ (mp_irq->irqflag >> 2) & 3, mp_irq->srcbus,
+ mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq);
}
static void __init assign_to_mp_irq(struct mpc_intsrc *m,
- struct mp_config_intsrc *mp_irq)
+ struct mpc_intsrc *mp_irq)
{
- mp_irq->mp_dstapic = m->dstapic;
- mp_irq->mp_type = m->type;
- mp_irq->mp_irqtype = m->irqtype;
- mp_irq->mp_irqflag = m->irqflag;
- mp_irq->mp_srcbus = m->srcbus;
- mp_irq->mp_srcbusirq = m->srcbusirq;
- mp_irq->mp_dstirq = m->dstirq;
+ mp_irq->dstapic = m->dstapic;
+ mp_irq->type = m->type;
+ mp_irq->irqtype = m->irqtype;
+ mp_irq->irqflag = m->irqflag;
+ mp_irq->srcbus = m->srcbus;
+ mp_irq->srcbusirq = m->srcbusirq;
+ mp_irq->dstirq = m->dstirq;
}
-static void __init assign_to_mpc_intsrc(struct mp_config_intsrc *mp_irq,
+static void __init assign_to_mpc_intsrc(struct mpc_intsrc *mp_irq,
struct mpc_intsrc *m)
{
- m->dstapic = mp_irq->mp_dstapic;
- m->type = mp_irq->mp_type;
- m->irqtype = mp_irq->mp_irqtype;
- m->irqflag = mp_irq->mp_irqflag;
- m->srcbus = mp_irq->mp_srcbus;
- m->srcbusirq = mp_irq->mp_srcbusirq;
- m->dstirq = mp_irq->mp_dstirq;
+ m->dstapic = mp_irq->dstapic;
+ m->type = mp_irq->type;
+ m->irqtype = mp_irq->irqtype;
+ m->irqflag = mp_irq->irqflag;
+ m->srcbus = mp_irq->srcbus;
+ m->srcbusirq = mp_irq->srcbusirq;
+ m->dstirq = mp_irq->dstirq;
}
-static int __init mp_irq_mpc_intsrc_cmp(struct mp_config_intsrc *mp_irq,
+static int __init mp_irq_mpc_intsrc_cmp(struct mpc_intsrc *mp_irq,
struct mpc_intsrc *m)
{
- if (mp_irq->mp_dstapic != m->dstapic)
+ if (mp_irq->dstapic != m->dstapic)
return 1;
- if (mp_irq->mp_type != m->type)
+ if (mp_irq->type != m->type)
return 2;
- if (mp_irq->mp_irqtype != m->irqtype)
+ if (mp_irq->irqtype != m->irqtype)
return 3;
- if (mp_irq->mp_irqflag != m->irqflag)
+ if (mp_irq->irqflag != m->irqflag)
return 4;
- if (mp_irq->mp_srcbus != m->srcbus)
+ if (mp_irq->srcbus != m->srcbus)
return 5;
- if (mp_irq->mp_srcbusirq != m->srcbusirq)
+ if (mp_irq->srcbusirq != m->srcbusirq)
return 6;
- if (mp_irq->mp_dstirq != m->dstirq)
+ if (mp_irq->dstirq != m->dstirq)
return 7;
return 0;
@@ -229,8 +221,12 @@ static void __init MP_intsrc_info(struct mpc_intsrc *m)
if (++mp_irq_entries == MAX_IRQ_SOURCES)
panic("Max # of irq sources exceeded!!\n");
}
+#else /* CONFIG_X86_IO_APIC */
+static inline void __init MP_bus_info(struct mpc_bus *m) {}
+static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {}
+static inline void __init MP_intsrc_info(struct mpc_intsrc *m) {}
+#endif /* CONFIG_X86_IO_APIC */
-#endif
static void __init MP_lintsrc_info(struct mpc_lintsrc *m)
{
@@ -280,6 +276,12 @@ static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str)
return 1;
}
+static void skip_entry(unsigned char **ptr, int *count, int size)
+{
+ *ptr += size;
+ *count += size;
+}
+
static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
{
char str[16];
@@ -292,16 +294,7 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
return 0;
#ifdef CONFIG_X86_32
- /*
- * need to make sure summit and es7000's mps_oem_check is safe to be
- * called early via genericarch 's mps_oem_check
- */
- if (early) {
-#ifdef CONFIG_X86_NUMAQ
- numaq_mps_oem_check(mpc, oem, str);
-#endif
- } else
- mps_oem_check(mpc, oem, str);
+ generic_mps_oem_check(mpc, oem, str);
#endif
/* save the local APIC address, it might be non-default */
if (!acpi_lapic)
@@ -324,55 +317,27 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
while (count < mpc->length) {
switch (*mpt) {
case MP_PROCESSOR:
- {
- struct mpc_cpu *m = (struct mpc_cpu *)mpt;
- /* ACPI may have already provided this data */
- if (!acpi_lapic)
- MP_processor_info(m);
- mpt += sizeof(*m);
- count += sizeof(*m);
- break;
- }
+ /* ACPI may have already provided this data */
+ if (!acpi_lapic)
+ MP_processor_info((struct mpc_cpu *)mpt);
+ skip_entry(&mpt, &count, sizeof(struct mpc_cpu));
+ break;
case MP_BUS:
- {
- struct mpc_bus *m = (struct mpc_bus *)mpt;
-#ifdef CONFIG_X86_IO_APIC
- MP_bus_info(m);
-#endif
- mpt += sizeof(*m);
- count += sizeof(*m);
- break;
- }
+ MP_bus_info((struct mpc_bus *)mpt);
+ skip_entry(&mpt, &count, sizeof(struct mpc_bus));
+ break;
case MP_IOAPIC:
- {
-#ifdef CONFIG_X86_IO_APIC
- struct mpc_ioapic *m = (struct mpc_ioapic *)mpt;
- MP_ioapic_info(m);
-#endif
- mpt += sizeof(struct mpc_ioapic);
- count += sizeof(struct mpc_ioapic);
- break;
- }
+ MP_ioapic_info((struct mpc_ioapic *)mpt);
+ skip_entry(&mpt, &count, sizeof(struct mpc_ioapic));
+ break;
case MP_INTSRC:
- {
-#ifdef CONFIG_X86_IO_APIC
- struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
-
- MP_intsrc_info(m);
-#endif
- mpt += sizeof(struct mpc_intsrc);
- count += sizeof(struct mpc_intsrc);
- break;
- }
+ MP_intsrc_info((struct mpc_intsrc *)mpt);
+ skip_entry(&mpt, &count, sizeof(struct mpc_intsrc));
+ break;
case MP_LINTSRC:
- {
- struct mpc_lintsrc *m =
- (struct mpc_lintsrc *)mpt;
- MP_lintsrc_info(m);
- mpt += sizeof(*m);
- count += sizeof(*m);
- break;
- }
+ MP_lintsrc_info((struct mpc_lintsrc *)mpt);
+ skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc));
+ break;
default:
/* wrong mptable */
printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
@@ -386,13 +351,13 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
(*x86_quirks->mpc_record)++;
}
-#ifdef CONFIG_X86_GENERICARCH
- generic_bigsmp_probe();
+#ifdef CONFIG_X86_BIGSMP
+ generic_bigsmp_probe();
#endif
-#ifdef CONFIG_X86_32
- setup_apic_routing();
-#endif
+ if (apic->setup_apic_routing)
+ apic->setup_apic_routing();
+
if (!num_processors)
printk(KERN_ERR "MPTABLE: no processors registered!\n");
return num_processors;
@@ -417,7 +382,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
intsrc.type = MP_INTSRC;
intsrc.irqflag = 0; /* conforming */
intsrc.srcbus = 0;
- intsrc.dstapic = mp_ioapics[0].mp_apicid;
+ intsrc.dstapic = mp_ioapics[0].apicid;
intsrc.irqtype = mp_INT;
@@ -570,14 +535,27 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
}
}
-static struct intel_mp_floating *mpf_found;
+static struct mpf_intel *mpf_found;
+
+static unsigned long __init get_mpc_size(unsigned long physptr)
+{
+ struct mpc_table *mpc;
+ unsigned long size;
+
+ mpc = early_ioremap(physptr, PAGE_SIZE);
+ size = mpc->length;
+ early_iounmap(mpc, PAGE_SIZE);
+ apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size);
+
+ return size;
+}
/*
* Scan the memory blocks for an SMP configuration block.
*/
static void __init __get_smp_config(unsigned int early)
{
- struct intel_mp_floating *mpf = mpf_found;
+ struct mpf_intel *mpf = mpf_found;
if (!mpf)
return;
@@ -598,9 +576,9 @@ static void __init __get_smp_config(unsigned int early)
}
printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
- mpf->mpf_specification);
+ mpf->specification);
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
- if (mpf->mpf_feature2 & (1 << 7)) {
+ if (mpf->feature2 & (1 << 7)) {
printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
pic_mode = 1;
} else {
@@ -611,7 +589,7 @@ static void __init __get_smp_config(unsigned int early)
/*
* Now see if we need to read further.
*/
- if (mpf->mpf_feature1 != 0) {
+ if (mpf->feature1 != 0) {
if (early) {
/*
* local APIC has default address
@@ -621,16 +599,20 @@ static void __init __get_smp_config(unsigned int early)
}
printk(KERN_INFO "Default MP configuration #%d\n",
- mpf->mpf_feature1);
- construct_default_ISA_mptable(mpf->mpf_feature1);
+ mpf->feature1);
+ construct_default_ISA_mptable(mpf->feature1);
- } else if (mpf->mpf_physptr) {
+ } else if (mpf->physptr) {
+ struct mpc_table *mpc;
+ unsigned long size;
+ size = get_mpc_size(mpf->physptr);
+ mpc = early_ioremap(mpf->physptr, size);
/*
* Read the physical hardware table. Anything here will
* override the defaults.
*/
- if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr), early)) {
+ if (!smp_read_mpc(mpc, early)) {
#ifdef CONFIG_X86_LOCAL_APIC
smp_found_config = 0;
#endif
@@ -638,8 +620,10 @@ static void __init __get_smp_config(unsigned int early)
"BIOS bug, MP table errors detected!...\n");
printk(KERN_ERR "... disabling SMP support. "
"(tell your hw vendor)\n");
+ early_iounmap(mpc, size);
return;
}
+ early_iounmap(mpc, size);
if (early)
return;
@@ -684,54 +668,62 @@ void __init get_smp_config(void)
__get_smp_config(0);
}
+static void smp_reserve_bootmem(struct mpf_intel *mpf)
+{
+ unsigned long size = get_mpc_size(mpf->physptr);
+#ifdef CONFIG_X86_32
+ /*
+ * We cannot access to MPC table to compute table size yet,
+ * as only few megabytes from the bottom is mapped now.
+ * PC-9800's MPC table places on the very last of physical
+ * memory; so that simply reserving PAGE_SIZE from mpf->physptr
+ * yields BUG() in reserve_bootmem.
+ * also need to make sure physptr is below than max_low_pfn
+ * we don't need reserve the area above max_low_pfn
+ */
+ unsigned long end = max_low_pfn * PAGE_SIZE;
+
+ if (mpf->physptr < end) {
+ if (mpf->physptr + size > end)
+ size = end - mpf->physptr;
+ reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
+ }
+#else
+ reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
+#endif
+}
+
static int __init smp_scan_config(unsigned long base, unsigned long length,
unsigned reserve)
{
unsigned int *bp = phys_to_virt(base);
- struct intel_mp_floating *mpf;
+ struct mpf_intel *mpf;
apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
bp, length);
BUILD_BUG_ON(sizeof(*mpf) != 16);
while (length > 0) {
- mpf = (struct intel_mp_floating *)bp;
+ mpf = (struct mpf_intel *)bp;
if ((*bp == SMP_MAGIC_IDENT) &&
- (mpf->mpf_length == 1) &&
+ (mpf->length == 1) &&
!mpf_checksum((unsigned char *)bp, 16) &&
- ((mpf->mpf_specification == 1)
- || (mpf->mpf_specification == 4))) {
+ ((mpf->specification == 1)
+ || (mpf->specification == 4))) {
#ifdef CONFIG_X86_LOCAL_APIC
smp_found_config = 1;
#endif
mpf_found = mpf;
- printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
- mpf, virt_to_phys(mpf));
+ printk(KERN_INFO "found SMP MP-table at [%p] %llx\n",
+ mpf, (u64)virt_to_phys(mpf));
if (!reserve)
return 1;
- reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
- BOOTMEM_DEFAULT);
- if (mpf->mpf_physptr) {
- unsigned long size = PAGE_SIZE;
-#ifdef CONFIG_X86_32
- /*
- * We cannot access to MPC table to compute
- * table size yet, as only few megabytes from
- * the bottom is mapped now.
- * PC-9800's MPC table places on the very last
- * of physical memory; so that simply reserving
- * PAGE_SIZE from mpg->mpf_physptr yields BUG()
- * in reserve_bootmem.
- */
- unsigned long end = max_low_pfn * PAGE_SIZE;
- if (mpf->mpf_physptr + size > end)
- size = end - mpf->mpf_physptr;
-#endif
- reserve_bootmem_generic(mpf->mpf_physptr, size,
+ reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf),
BOOTMEM_DEFAULT);
- }
+ if (mpf->physptr)
+ smp_reserve_bootmem(mpf);
return 1;
}
@@ -809,15 +801,15 @@ static int __init get_MP_intsrc_index(struct mpc_intsrc *m)
/* not legacy */
for (i = 0; i < mp_irq_entries; i++) {
- if (mp_irqs[i].mp_irqtype != mp_INT)
+ if (mp_irqs[i].irqtype != mp_INT)
continue;
- if (mp_irqs[i].mp_irqflag != 0x0f)
+ if (mp_irqs[i].irqflag != 0x0f)
continue;
- if (mp_irqs[i].mp_srcbus != m->srcbus)
+ if (mp_irqs[i].srcbus != m->srcbus)
continue;
- if (mp_irqs[i].mp_srcbusirq != m->srcbusirq)
+ if (mp_irqs[i].srcbusirq != m->srcbusirq)
continue;
if (irq_used[i]) {
/* already claimed */
@@ -834,7 +826,57 @@ static int __init get_MP_intsrc_index(struct mpc_intsrc *m)
#define SPARE_SLOT_NUM 20
static struct mpc_intsrc __initdata *m_spare[SPARE_SLOT_NUM];
-#endif
+
+static void check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
+{
+ int i;
+
+ apic_printk(APIC_VERBOSE, "OLD ");
+ print_MP_intsrc_info(m);
+
+ i = get_MP_intsrc_index(m);
+ if (i > 0) {
+ assign_to_mpc_intsrc(&mp_irqs[i], m);
+ apic_printk(APIC_VERBOSE, "NEW ");
+ print_mp_irq_info(&mp_irqs[i]);
+ return;
+ }
+ if (!i) {
+ /* legacy, do nothing */
+ return;
+ }
+ if (*nr_m_spare < SPARE_SLOT_NUM) {
+ /*
+ * not found (-1), or duplicated (-2) are invalid entries,
+ * we need to use the slot later
+ */
+ m_spare[*nr_m_spare] = m;
+ *nr_m_spare += 1;
+ }
+}
+#else /* CONFIG_X86_IO_APIC */
+static inline void check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
+#endif /* CONFIG_X86_IO_APIC */
+
+static int check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length,
+ int count)
+{
+ if (!mpc_new_phys) {
+ pr_info("No spare slots, try to append...take your risk, "
+ "new mpc_length %x\n", count);
+ } else {
+ if (count <= mpc_new_length)
+ pr_info("No spare slots, try to append..., "
+ "new mpc_length %x\n", count);
+ else {
+ pr_err("mpc_new_length %lx is too small\n",
+ mpc_new_length);
+ return -1;
+ }
+ }
+
+ return 0;
+}
static int __init replace_intsrc_all(struct mpc_table *mpc,
unsigned long mpc_new_phys,
@@ -842,71 +884,30 @@ static int __init replace_intsrc_all(struct mpc_table *mpc,
{
#ifdef CONFIG_X86_IO_APIC
int i;
- int nr_m_spare = 0;
#endif
-
int count = sizeof(*mpc);
+ int nr_m_spare = 0;
unsigned char *mpt = ((unsigned char *)mpc) + count;
printk(KERN_INFO "mpc_length %x\n", mpc->length);
while (count < mpc->length) {
switch (*mpt) {
case MP_PROCESSOR:
- {
- struct mpc_cpu *m = (struct mpc_cpu *)mpt;
- mpt += sizeof(*m);
- count += sizeof(*m);
- break;
- }
+ skip_entry(&mpt, &count, sizeof(struct mpc_cpu));
+ break;
case MP_BUS:
- {
- struct mpc_bus *m = (struct mpc_bus *)mpt;
- mpt += sizeof(*m);
- count += sizeof(*m);
- break;
- }
+ skip_entry(&mpt, &count, sizeof(struct mpc_bus));
+ break;
case MP_IOAPIC:
- {
- mpt += sizeof(struct mpc_ioapic);
- count += sizeof(struct mpc_ioapic);
- break;
- }
+ skip_entry(&mpt, &count, sizeof(struct mpc_ioapic));
+ break;
case MP_INTSRC:
- {
-#ifdef CONFIG_X86_IO_APIC
- struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
-
- printk(KERN_INFO "OLD ");
- print_MP_intsrc_info(m);
- i = get_MP_intsrc_index(m);
- if (i > 0) {
- assign_to_mpc_intsrc(&mp_irqs[i], m);
- printk(KERN_INFO "NEW ");
- print_mp_irq_info(&mp_irqs[i]);
- } else if (!i) {
- /* legacy, do nothing */
- } else if (nr_m_spare < SPARE_SLOT_NUM) {
- /*
- * not found (-1), or duplicated (-2)
- * are invalid entries,
- * we need to use the slot later
- */
- m_spare[nr_m_spare] = m;
- nr_m_spare++;
- }
-#endif
- mpt += sizeof(struct mpc_intsrc);
- count += sizeof(struct mpc_intsrc);
- break;
- }
+ check_irq_src((struct mpc_intsrc *)mpt, &nr_m_spare);
+ skip_entry(&mpt, &count, sizeof(struct mpc_intsrc));
+ break;
case MP_LINTSRC:
- {
- struct mpc_lintsrc *m =
- (struct mpc_lintsrc *)mpt;
- mpt += sizeof(*m);
- count += sizeof(*m);
- break;
- }
+ skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc));
+ break;
default:
/* wrong mptable */
printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
@@ -922,30 +923,22 @@ static int __init replace_intsrc_all(struct mpc_table *mpc,
if (irq_used[i])
continue;
- if (mp_irqs[i].mp_irqtype != mp_INT)
+ if (mp_irqs[i].irqtype != mp_INT)
continue;
- if (mp_irqs[i].mp_irqflag != 0x0f)
+ if (mp_irqs[i].irqflag != 0x0f)
continue;
if (nr_m_spare > 0) {
- printk(KERN_INFO "*NEW* found ");
+ apic_printk(APIC_VERBOSE, "*NEW* found\n");
nr_m_spare--;
assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]);
m_spare[nr_m_spare] = NULL;
} else {
struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
count += sizeof(struct mpc_intsrc);
- if (!mpc_new_phys) {
- printk(KERN_INFO "No spare slots, try to append...take your risk, new mpc_length %x\n", count);
- } else {
- if (count <= mpc_new_length)
- printk(KERN_INFO "No spare slots, try to append..., new mpc_length %x\n", count);
- else {
- printk(KERN_ERR "mpc_new_length %lx is too small\n", mpc_new_length);
- goto out;
- }
- }
+ if (!check_slot(mpc_new_phys, mpc_new_length, count))
+ goto out;
assign_to_mpc_intsrc(&mp_irqs[i], m);
mpc->length = count;
mpt += sizeof(struct mpc_intsrc);
@@ -1001,7 +994,7 @@ static int __init update_mp_table(void)
{
char str[16];
char oem[10];
- struct intel_mp_floating *mpf;
+ struct mpf_intel *mpf;
struct mpc_table *mpc, *mpc_new;
if (!enable_update_mptable)
@@ -1014,19 +1007,19 @@ static int __init update_mp_table(void)
/*
* Now see if we need to go further.
*/
- if (mpf->mpf_feature1 != 0)
+ if (mpf->feature1 != 0)
return 0;
- if (!mpf->mpf_physptr)
+ if (!mpf->physptr)
return 0;
- mpc = phys_to_virt(mpf->mpf_physptr);
+ mpc = phys_to_virt(mpf->physptr);
if (!smp_check_mpc(mpc, oem, str))
return 0;
- printk(KERN_INFO "mpf: %lx\n", virt_to_phys(mpf));
- printk(KERN_INFO "mpf_physptr: %x\n", mpf->mpf_physptr);
+ printk(KERN_INFO "mpf: %llx\n", (u64)virt_to_phys(mpf));
+ printk(KERN_INFO "physptr: %x\n", mpf->physptr);
if (mpc_new_phys && mpc->length > mpc_new_length) {
mpc_new_phys = 0;
@@ -1047,23 +1040,23 @@ static int __init update_mp_table(void)
}
printk(KERN_INFO "use in-positon replacing\n");
} else {
- mpf->mpf_physptr = mpc_new_phys;
+ mpf->physptr = mpc_new_phys;
mpc_new = phys_to_virt(mpc_new_phys);
memcpy(mpc_new, mpc, mpc->length);
mpc = mpc_new;
/* check if we can modify that */
- if (mpc_new_phys - mpf->mpf_physptr) {
- struct intel_mp_floating *mpf_new;
+ if (mpc_new_phys - mpf->physptr) {
+ struct mpf_intel *mpf_new;
/* steal 16 bytes from [0, 1k) */
printk(KERN_INFO "mpf new: %x\n", 0x400 - 16);
mpf_new = phys_to_virt(0x400 - 16);
memcpy(mpf_new, mpf, 16);
mpf = mpf_new;
- mpf->mpf_physptr = mpc_new_phys;
+ mpf->physptr = mpc_new_phys;
}
- mpf->mpf_checksum = 0;
- mpf->mpf_checksum -= mpf_checksum((unsigned char *)mpf, 16);
- printk(KERN_INFO "mpf_physptr new: %x\n", mpf->mpf_physptr);
+ mpf->checksum = 0;
+ mpf->checksum -= mpf_checksum((unsigned char *)mpf, 16);
+ printk(KERN_INFO "physptr new: %x\n", mpf->physptr);
}
/*
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 726266695b2..3cf3413ec62 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -35,10 +35,10 @@
#include <linux/device.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
+#include <linux/uaccess.h>
#include <asm/processor.h>
#include <asm/msr.h>
-#include <asm/uaccess.h>
#include <asm/system.h>
static struct class *msr_class;
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
deleted file mode 100644
index f2191d4f271..00000000000
--- a/arch/x86/kernel/numaq_32.c
+++ /dev/null
@@ -1,293 +0,0 @@
-/*
- * Written by: Patricia Gaughen, IBM Corporation
- *
- * Copyright (C) 2002, IBM Corp.
- *
- * All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to <gone@us.ibm.com>
- */
-
-#include <linux/mm.h>
-#include <linux/bootmem.h>
-#include <linux/mmzone.h>
-#include <linux/module.h>
-#include <linux/nodemask.h>
-#include <asm/numaq.h>
-#include <asm/topology.h>
-#include <asm/processor.h>
-#include <asm/genapic.h>
-#include <asm/e820.h>
-#include <asm/setup.h>
-
-#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
-
-/*
- * Function: smp_dump_qct()
- *
- * Description: gets memory layout from the quad config table. This
- * function also updates node_online_map with the nodes (quads) present.
- */
-static void __init smp_dump_qct(void)
-{
- int node;
- struct eachquadmem *eq;
- struct sys_cfg_data *scd =
- (struct sys_cfg_data *)__va(SYS_CFG_DATA_PRIV_ADDR);
-
- nodes_clear(node_online_map);
- for_each_node(node) {
- if (scd->quads_present31_0 & (1 << node)) {
- node_set_online(node);
- eq = &scd->eq[node];
- /* Convert to pages */
- node_start_pfn[node] = MB_TO_PAGES(
- eq->hi_shrd_mem_start - eq->priv_mem_size);
- node_end_pfn[node] = MB_TO_PAGES(
- eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
-
- e820_register_active_regions(node, node_start_pfn[node],
- node_end_pfn[node]);
- memory_present(node,
- node_start_pfn[node], node_end_pfn[node]);
- node_remap_size[node] = node_memmap_size_bytes(node,
- node_start_pfn[node],
- node_end_pfn[node]);
- }
- }
-}
-
-
-void __cpuinit numaq_tsc_disable(void)
-{
- if (!found_numaq)
- return;
-
- if (num_online_nodes() > 1) {
- printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
- setup_clear_cpu_cap(X86_FEATURE_TSC);
- }
-}
-
-static int __init numaq_pre_time_init(void)
-{
- numaq_tsc_disable();
- return 0;
-}
-
-int found_numaq;
-/*
- * Have to match translation table entries to main table entries by counter
- * hence the mpc_record variable .... can't see a less disgusting way of
- * doing this ....
- */
-struct mpc_config_translation {
- unsigned char mpc_type;
- unsigned char trans_len;
- unsigned char trans_type;
- unsigned char trans_quad;
- unsigned char trans_global;
- unsigned char trans_local;
- unsigned short trans_reserved;
-};
-
-/* x86_quirks member */
-static int mpc_record;
-static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
- __cpuinitdata;
-
-static inline int generate_logical_apicid(int quad, int phys_apicid)
-{
- return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
-}
-
-/* x86_quirks member */
-static int mpc_apic_id(struct mpc_cpu *m)
-{
- int quad = translation_table[mpc_record]->trans_quad;
- int logical_apicid = generate_logical_apicid(quad, m->apicid);
-
- printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
- m->apicid, (m->cpufeature & CPU_FAMILY_MASK) >> 8,
- (m->cpufeature & CPU_MODEL_MASK) >> 4,
- m->apicver, quad, logical_apicid);
- return logical_apicid;
-}
-
-int mp_bus_id_to_node[MAX_MP_BUSSES];
-
-int mp_bus_id_to_local[MAX_MP_BUSSES];
-
-/* x86_quirks member */
-static void mpc_oem_bus_info(struct mpc_bus *m, char *name)
-{
- int quad = translation_table[mpc_record]->trans_quad;
- int local = translation_table[mpc_record]->trans_local;
-
- mp_bus_id_to_node[m->busid] = quad;
- mp_bus_id_to_local[m->busid] = local;
- printk(KERN_INFO "Bus #%d is %s (node %d)\n",
- m->busid, name, quad);
-}
-
-int quad_local_to_mp_bus_id [NR_CPUS/4][4];
-
-/* x86_quirks member */
-static void mpc_oem_pci_bus(struct mpc_bus *m)
-{
- int quad = translation_table[mpc_record]->trans_quad;
- int local = translation_table[mpc_record]->trans_local;
-
- quad_local_to_mp_bus_id[quad][local] = m->busid;
-}
-
-static void __init MP_translation_info(struct mpc_config_translation *m)
-{
- printk(KERN_INFO
- "Translation: record %d, type %d, quad %d, global %d, local %d\n",
- mpc_record, m->trans_type, m->trans_quad, m->trans_global,
- m->trans_local);
-
- if (mpc_record >= MAX_MPC_ENTRY)
- printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
- else
- translation_table[mpc_record] = m; /* stash this for later */
- if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
- node_set_online(m->trans_quad);
-}
-
-static int __init mpf_checksum(unsigned char *mp, int len)
-{
- int sum = 0;
-
- while (len--)
- sum += *mp++;
-
- return sum & 0xFF;
-}
-
-/*
- * Read/parse the MPC oem tables
- */
-
-static void __init smp_read_mpc_oem(struct mpc_oemtable *oemtable,
- unsigned short oemsize)
-{
- int count = sizeof(*oemtable); /* the header size */
- unsigned char *oemptr = ((unsigned char *)oemtable) + count;
-
- mpc_record = 0;
- printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
- oemtable);
- if (memcmp(oemtable->signature, MPC_OEM_SIGNATURE, 4)) {
- printk(KERN_WARNING
- "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
- oemtable->signature[0], oemtable->signature[1],
- oemtable->signature[2], oemtable->signature[3]);
- return;
- }
- if (mpf_checksum((unsigned char *)oemtable, oemtable->length)) {
- printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
- return;
- }
- while (count < oemtable->length) {
- switch (*oemptr) {
- case MP_TRANSLATION:
- {
- struct mpc_config_translation *m =
- (struct mpc_config_translation *)oemptr;
- MP_translation_info(m);
- oemptr += sizeof(*m);
- count += sizeof(*m);
- ++mpc_record;
- break;
- }
- default:
- {
- printk(KERN_WARNING
- "Unrecognised OEM table entry type! - %d\n",
- (int)*oemptr);
- return;
- }
- }
- }
-}
-
-static int __init numaq_setup_ioapic_ids(void)
-{
- /* so can skip it */
- return 1;
-}
-
-static int __init numaq_update_genapic(void)
-{
- genapic->wakeup_cpu = wakeup_secondary_cpu_via_nmi;
-
- return 0;
-}
-
-static struct x86_quirks numaq_x86_quirks __initdata = {
- .arch_pre_time_init = numaq_pre_time_init,
- .arch_time_init = NULL,
- .arch_pre_intr_init = NULL,
- .arch_memory_setup = NULL,
- .arch_intr_init = NULL,
- .arch_trap_init = NULL,
- .mach_get_smp_config = NULL,
- .mach_find_smp_config = NULL,
- .mpc_record = &mpc_record,
- .mpc_apic_id = mpc_apic_id,
- .mpc_oem_bus_info = mpc_oem_bus_info,
- .mpc_oem_pci_bus = mpc_oem_pci_bus,
- .smp_read_mpc_oem = smp_read_mpc_oem,
- .setup_ioapic_ids = numaq_setup_ioapic_ids,
- .update_genapic = numaq_update_genapic,
-};
-
-void numaq_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
-{
- if (strncmp(oem, "IBM NUMA", 8))
- printk("Warning! Not a NUMA-Q system!\n");
- else
- found_numaq = 1;
-}
-
-static __init void early_check_numaq(void)
-{
- /*
- * Find possible boot-time SMP configuration:
- */
- early_find_smp_config();
- /*
- * get boot-time SMP configuration:
- */
- if (smp_found_config)
- early_get_smp_config();
-
- if (found_numaq)
- x86_quirks = &numaq_x86_quirks;
-}
-
-int __init get_memcfg_numaq(void)
-{
- early_check_numaq();
- if (!found_numaq)
- return 0;
- smp_dump_qct();
- return 1;
-}
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index 95777b0faa7..3a7c5a44082 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -26,13 +26,3 @@ struct pv_lock_ops pv_lock_ops = {
};
EXPORT_SYMBOL(pv_lock_ops);
-void __init paravirt_use_bytelocks(void)
-{
-#ifdef CONFIG_SMP
- pv_lock_ops.spin_is_locked = __byte_spin_is_locked;
- pv_lock_ops.spin_is_contended = __byte_spin_is_contended;
- pv_lock_ops.spin_lock = __byte_spin_lock;
- pv_lock_ops.spin_trylock = __byte_spin_trylock;
- pv_lock_ops.spin_unlock = __byte_spin_unlock;
-#endif
-}
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index c6520a4e85d..8e45f446488 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -28,7 +28,6 @@
#include <asm/paravirt.h>
#include <asm/desc.h>
#include <asm/setup.h>
-#include <asm/arch_hooks.h>
#include <asm/pgtable.h>
#include <asm/time.h>
#include <asm/pgalloc.h>
@@ -44,6 +43,17 @@ void _paravirt_nop(void)
{
}
+/* identity function, which can be inlined */
+u32 _paravirt_ident_32(u32 x)
+{
+ return x;
+}
+
+u64 _paravirt_ident_64(u64 x)
+{
+ return x;
+}
+
static void __init default_banner(void)
{
printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
@@ -138,9 +148,16 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
if (opfunc == NULL)
/* If there's no function, patch it with a ud2a (BUG) */
ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a));
- else if (opfunc == paravirt_nop)
+ else if (opfunc == _paravirt_nop)
/* If the operation is a nop, then nop the callsite */
ret = paravirt_patch_nop();
+
+ /* identity functions just return their single argument */
+ else if (opfunc == _paravirt_ident_32)
+ ret = paravirt_patch_ident_32(insnbuf, len);
+ else if (opfunc == _paravirt_ident_64)
+ ret = paravirt_patch_ident_64(insnbuf, len);
+
else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) ||
type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) ||
@@ -318,10 +335,10 @@ struct pv_time_ops pv_time_ops = {
struct pv_irq_ops pv_irq_ops = {
.init_IRQ = native_init_IRQ,
- .save_fl = native_save_fl,
- .restore_fl = native_restore_fl,
- .irq_disable = native_irq_disable,
- .irq_enable = native_irq_enable,
+ .save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
+ .restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),
+ .irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable),
+ .irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable),
.safe_halt = native_safe_halt,
.halt = native_halt,
#ifdef CONFIG_X86_64
@@ -399,6 +416,14 @@ struct pv_apic_ops pv_apic_ops = {
#endif
};
+#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE)
+/* 32-bit pagetable entries */
+#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_32)
+#else
+/* 64-bit pagetable entries */
+#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64)
+#endif
+
struct pv_mmu_ops pv_mmu_ops = {
#ifndef CONFIG_X86_64
.pagetable_setup_start = native_pagetable_setup_start,
@@ -445,27 +470,27 @@ struct pv_mmu_ops pv_mmu_ops = {
#if PAGETABLE_LEVELS >= 3
#ifdef CONFIG_X86_PAE
.set_pte_atomic = native_set_pte_atomic,
- .set_pte_present = native_set_pte_present,
.pte_clear = native_pte_clear,
.pmd_clear = native_pmd_clear,
#endif
.set_pud = native_set_pud,
- .pmd_val = native_pmd_val,
- .make_pmd = native_make_pmd,
+
+ .pmd_val = PTE_IDENT,
+ .make_pmd = PTE_IDENT,
#if PAGETABLE_LEVELS == 4
- .pud_val = native_pud_val,
- .make_pud = native_make_pud,
+ .pud_val = PTE_IDENT,
+ .make_pud = PTE_IDENT,
+
.set_pgd = native_set_pgd,
#endif
#endif /* PAGETABLE_LEVELS >= 3 */
- .pte_val = native_pte_val,
- .pte_flags = native_pte_flags,
- .pgd_val = native_pgd_val,
+ .pte_val = PTE_IDENT,
+ .pgd_val = PTE_IDENT,
- .make_pte = native_make_pte,
- .make_pgd = native_make_pgd,
+ .make_pte = PTE_IDENT,
+ .make_pgd = PTE_IDENT,
.dup_mmap = paravirt_nop,
.exit_mmap = paravirt_nop,
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index 9fe644f4861..d9f32e6d6ab 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -12,6 +12,18 @@ DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
DEF_NATIVE(pv_cpu_ops, clts, "clts");
DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");
+unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
+{
+ /* arg in %eax, return in %eax */
+ return 0;
+}
+
+unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
+{
+ /* arg in %edx:%eax, return in %edx:%eax */
+ return 0;
+}
+
unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
unsigned long addr, unsigned len)
{
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index 061d01df9ae..3f08f34f93e 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -19,6 +19,21 @@ DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq");
DEF_NATIVE(pv_cpu_ops, usergs_sysret32, "swapgs; sysretl");
DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs");
+DEF_NATIVE(, mov32, "mov %edi, %eax");
+DEF_NATIVE(, mov64, "mov %rdi, %rax");
+
+unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
+{
+ return paravirt_patch_insns(insnbuf, len,
+ start__mov32, end__mov32);
+}
+
+unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
+{
+ return paravirt_patch_insns(insnbuf, len,
+ start__mov64, end__mov64);
+}
+
unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
unsigned long addr, unsigned len)
{
diff --git a/arch/x86/kernel/probe_roms_32.c b/arch/x86/kernel/probe_roms_32.c
index 675a48c404a..071e7fea42e 100644
--- a/arch/x86/kernel/probe_roms_32.c
+++ b/arch/x86/kernel/probe_roms_32.c
@@ -18,7 +18,7 @@
#include <asm/setup.h>
#include <asm/sections.h>
#include <asm/io.h>
-#include <setup_arch.h>
+#include <asm/setup_arch.h>
static struct resource system_rom_resource = {
.name = "System ROM",
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 6d12f7e37f8..156f87582c6 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -1,8 +1,8 @@
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/mm.h>
-#include <asm/idle.h>
#include <linux/smp.h>
+#include <linux/prctl.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/module.h>
@@ -11,6 +11,9 @@
#include <linux/ftrace.h>
#include <asm/system.h>
#include <asm/apic.h>
+#include <asm/idle.h>
+#include <asm/uaccess.h>
+#include <asm/i387.h>
unsigned long idle_halt;
EXPORT_SYMBOL(idle_halt);
@@ -56,6 +59,193 @@ void arch_task_cache_init(void)
}
/*
+ * Free current thread data structures etc..
+ */
+void exit_thread(void)
+{
+ struct task_struct *me = current;
+ struct thread_struct *t = &me->thread;
+ unsigned long *bp = t->io_bitmap_ptr;
+
+ if (bp) {
+ struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
+
+ t->io_bitmap_ptr = NULL;
+ clear_thread_flag(TIF_IO_BITMAP);
+ /*
+ * Careful, clear this in the TSS too:
+ */
+ memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
+ t->io_bitmap_max = 0;
+ put_cpu();
+ kfree(bp);
+ }
+
+ ds_exit_thread(current);
+}
+
+void flush_thread(void)
+{
+ struct task_struct *tsk = current;
+
+#ifdef CONFIG_X86_64
+ if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
+ clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
+ if (test_tsk_thread_flag(tsk, TIF_IA32)) {
+ clear_tsk_thread_flag(tsk, TIF_IA32);
+ } else {
+ set_tsk_thread_flag(tsk, TIF_IA32);
+ current_thread_info()->status |= TS_COMPAT;
+ }
+ }
+#endif
+
+ clear_tsk_thread_flag(tsk, TIF_DEBUG);
+
+ tsk->thread.debugreg0 = 0;
+ tsk->thread.debugreg1 = 0;
+ tsk->thread.debugreg2 = 0;
+ tsk->thread.debugreg3 = 0;
+ tsk->thread.debugreg6 = 0;
+ tsk->thread.debugreg7 = 0;
+ memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
+ /*
+ * Forget coprocessor state..
+ */
+ tsk->fpu_counter = 0;
+ clear_fpu(tsk);
+ clear_used_math();
+}
+
+static void hard_disable_TSC(void)
+{
+ write_cr4(read_cr4() | X86_CR4_TSD);
+}
+
+void disable_TSC(void)
+{
+ preempt_disable();
+ if (!test_and_set_thread_flag(TIF_NOTSC))
+ /*
+ * Must flip the CPU state synchronously with
+ * TIF_NOTSC in the current running context.
+ */
+ hard_disable_TSC();
+ preempt_enable();
+}
+
+static void hard_enable_TSC(void)
+{
+ write_cr4(read_cr4() & ~X86_CR4_TSD);
+}
+
+static void enable_TSC(void)
+{
+ preempt_disable();
+ if (test_and_clear_thread_flag(TIF_NOTSC))
+ /*
+ * Must flip the CPU state synchronously with
+ * TIF_NOTSC in the current running context.
+ */
+ hard_enable_TSC();
+ preempt_enable();
+}
+
+int get_tsc_mode(unsigned long adr)
+{
+ unsigned int val;
+
+ if (test_thread_flag(TIF_NOTSC))
+ val = PR_TSC_SIGSEGV;
+ else
+ val = PR_TSC_ENABLE;
+
+ return put_user(val, (unsigned int __user *)adr);
+}
+
+int set_tsc_mode(unsigned int val)
+{
+ if (val == PR_TSC_SIGSEGV)
+ disable_TSC();
+ else if (val == PR_TSC_ENABLE)
+ enable_TSC();
+ else
+ return -EINVAL;
+
+ return 0;
+}
+
+void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
+ struct tss_struct *tss)
+{
+ struct thread_struct *prev, *next;
+
+ prev = &prev_p->thread;
+ next = &next_p->thread;
+
+ if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
+ test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
+ ds_switch_to(prev_p, next_p);
+ else if (next->debugctlmsr != prev->debugctlmsr)
+ update_debugctlmsr(next->debugctlmsr);
+
+ if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
+ set_debugreg(next->debugreg0, 0);
+ set_debugreg(next->debugreg1, 1);
+ set_debugreg(next->debugreg2, 2);
+ set_debugreg(next->debugreg3, 3);
+ /* no 4 and 5 */
+ set_debugreg(next->debugreg6, 6);
+ set_debugreg(next->debugreg7, 7);
+ }
+
+ if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
+ test_tsk_thread_flag(next_p, TIF_NOTSC)) {
+ /* prev and next are different */
+ if (test_tsk_thread_flag(next_p, TIF_NOTSC))
+ hard_disable_TSC();
+ else
+ hard_enable_TSC();
+ }
+
+ if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
+ /*
+ * Copy the relevant range of the IO bitmap.
+ * Normally this is 128 bytes or less:
+ */
+ memcpy(tss->io_bitmap, next->io_bitmap_ptr,
+ max(prev->io_bitmap_max, next->io_bitmap_max));
+ } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
+ /*
+ * Clear any possible leftover bits:
+ */
+ memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
+ }
+}
+
+int sys_fork(struct pt_regs *regs)
+{
+ return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
+}
+
+/*
+ * This is trivial, and on the face of it looks like it
+ * could equally well be done in user mode.
+ *
+ * Not so, for quite unobvious reasons - register pressure.
+ * In user mode vfork() cannot have a stack frame, and if
+ * done by calling the "clone()" system call directly, you
+ * do not have enough call-clobbered registers to hold all
+ * the information you need.
+ */
+int sys_vfork(struct pt_regs *regs)
+{
+ return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
+ NULL, NULL);
+}
+
+
+/*
* Idle related variables and functions
*/
unsigned long boot_option_idle_override = 0;
@@ -350,7 +540,7 @@ static void c1e_idle(void)
void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_X86_SMP
+#ifdef CONFIG_SMP
if (pm_idle == poll_idle && smp_num_siblings > 1) {
printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
" performance may degrade.\n");
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index bd4da2af08a..14014d766ca 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -11,6 +11,7 @@
#include <stdarg.h>
+#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
@@ -66,9 +67,6 @@ asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
EXPORT_PER_CPU_SYMBOL(current_task);
-DEFINE_PER_CPU(int, cpu_number);
-EXPORT_PER_CPU_SYMBOL(cpu_number);
-
/*
* Return saved PC of a blocked thread.
*/
@@ -94,6 +92,15 @@ void cpu_idle(void)
{
int cpu = smp_processor_id();
+ /*
+ * If we're the non-boot CPU, nothing set the stack canary up
+ * for us. CPU0 already has it initialized but no harm in
+ * doing it again. This is a good place for updating it, as
+ * we wont ever return from this function (so the invalid
+ * canaries already on the stack wont ever trigger).
+ */
+ boot_init_stack_canary();
+
current_thread_info()->status |= TS_POLLING;
/* endless idle loop with no priority at all */
@@ -108,7 +115,6 @@ void cpu_idle(void)
play_dead();
local_irq_disable();
- __get_cpu_var(irq_stat).idle_timestamp = jiffies;
/* Don't trace irqs off for idle */
stop_critical_timings();
pm_idle();
@@ -132,7 +138,7 @@ void __show_regs(struct pt_regs *regs, int all)
if (user_mode_vm(regs)) {
sp = regs->sp;
ss = regs->ss & 0xffff;
- savesegment(gs, gs);
+ gs = get_user_gs(regs);
} else {
sp = (unsigned long) (&regs->sp);
savesegment(ss, ss);
@@ -213,6 +219,7 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
regs.ds = __USER_DS;
regs.es = __USER_DS;
regs.fs = __KERNEL_PERCPU;
+ regs.gs = __KERNEL_STACK_CANARY;
regs.orig_ax = -1;
regs.ip = (unsigned long) kernel_thread_helper;
regs.cs = __KERNEL_CS | get_kernel_rpl();
@@ -223,55 +230,6 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
}
EXPORT_SYMBOL(kernel_thread);
-/*
- * Free current thread data structures etc..
- */
-void exit_thread(void)
-{
- /* The process may have allocated an io port bitmap... nuke it. */
- if (unlikely(test_thread_flag(TIF_IO_BITMAP))) {
- struct task_struct *tsk = current;
- struct thread_struct *t = &tsk->thread;
- int cpu = get_cpu();
- struct tss_struct *tss = &per_cpu(init_tss, cpu);
-
- kfree(t->io_bitmap_ptr);
- t->io_bitmap_ptr = NULL;
- clear_thread_flag(TIF_IO_BITMAP);
- /*
- * Careful, clear this in the TSS too:
- */
- memset(tss->io_bitmap, 0xff, tss->io_bitmap_max);
- t->io_bitmap_max = 0;
- tss->io_bitmap_owner = NULL;
- tss->io_bitmap_max = 0;
- tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
- put_cpu();
- }
-
- ds_exit_thread(current);
-}
-
-void flush_thread(void)
-{
- struct task_struct *tsk = current;
-
- tsk->thread.debugreg0 = 0;
- tsk->thread.debugreg1 = 0;
- tsk->thread.debugreg2 = 0;
- tsk->thread.debugreg3 = 0;
- tsk->thread.debugreg6 = 0;
- tsk->thread.debugreg7 = 0;
- memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
- clear_tsk_thread_flag(tsk, TIF_DEBUG);
- /*
- * Forget coprocessor state..
- */
- tsk->fpu_counter = 0;
- clear_fpu(tsk);
- clear_used_math();
-}
-
void release_thread(struct task_struct *dead_task)
{
BUG_ON(dead_task->mm);
@@ -305,7 +263,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
p->thread.ip = (unsigned long) ret_from_fork;
- savesegment(gs, p->thread.gs);
+ task_user_gs(p) = get_user_gs(regs);
tsk = current;
if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
@@ -343,7 +301,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
- __asm__("movl %0, %%gs" : : "r"(0));
+ set_user_gs(regs, 0);
regs->fs = 0;
set_fs(USER_DS);
regs->ds = __USER_DS;
@@ -359,127 +317,6 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
}
EXPORT_SYMBOL_GPL(start_thread);
-static void hard_disable_TSC(void)
-{
- write_cr4(read_cr4() | X86_CR4_TSD);
-}
-
-void disable_TSC(void)
-{
- preempt_disable();
- if (!test_and_set_thread_flag(TIF_NOTSC))
- /*
- * Must flip the CPU state synchronously with
- * TIF_NOTSC in the current running context.
- */
- hard_disable_TSC();
- preempt_enable();
-}
-
-static void hard_enable_TSC(void)
-{
- write_cr4(read_cr4() & ~X86_CR4_TSD);
-}
-
-static void enable_TSC(void)
-{
- preempt_disable();
- if (test_and_clear_thread_flag(TIF_NOTSC))
- /*
- * Must flip the CPU state synchronously with
- * TIF_NOTSC in the current running context.
- */
- hard_enable_TSC();
- preempt_enable();
-}
-
-int get_tsc_mode(unsigned long adr)
-{
- unsigned int val;
-
- if (test_thread_flag(TIF_NOTSC))
- val = PR_TSC_SIGSEGV;
- else
- val = PR_TSC_ENABLE;
-
- return put_user(val, (unsigned int __user *)adr);
-}
-
-int set_tsc_mode(unsigned int val)
-{
- if (val == PR_TSC_SIGSEGV)
- disable_TSC();
- else if (val == PR_TSC_ENABLE)
- enable_TSC();
- else
- return -EINVAL;
-
- return 0;
-}
-
-static noinline void
-__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
- struct tss_struct *tss)
-{
- struct thread_struct *prev, *next;
-
- prev = &prev_p->thread;
- next = &next_p->thread;
-
- if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
- test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
- ds_switch_to(prev_p, next_p);
- else if (next->debugctlmsr != prev->debugctlmsr)
- update_debugctlmsr(next->debugctlmsr);
-
- if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
- set_debugreg(next->debugreg0, 0);
- set_debugreg(next->debugreg1, 1);
- set_debugreg(next->debugreg2, 2);
- set_debugreg(next->debugreg3, 3);
- /* no 4 and 5 */
- set_debugreg(next->debugreg6, 6);
- set_debugreg(next->debugreg7, 7);
- }
-
- if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
- test_tsk_thread_flag(next_p, TIF_NOTSC)) {
- /* prev and next are different */
- if (test_tsk_thread_flag(next_p, TIF_NOTSC))
- hard_disable_TSC();
- else
- hard_enable_TSC();
- }
-
- if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
- /*
- * Disable the bitmap via an invalid offset. We still cache
- * the previous bitmap owner and the IO bitmap contents:
- */
- tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
- return;
- }
-
- if (likely(next == tss->io_bitmap_owner)) {
- /*
- * Previous owner of the bitmap (hence the bitmap content)
- * matches the next task, we dont have to do anything but
- * to set a valid offset in the TSS:
- */
- tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
- return;
- }
- /*
- * Lazy TSS's I/O bitmap copy. We set an invalid offset here
- * and we let the task to get a GPF in case an I/O instruction
- * is performed. The handler of the GPF will verify that the
- * faulting task has a valid I/O bitmap and, it true, does the
- * real copy and restart the instruction. This will save us
- * redundant copies when the currently switched task does not
- * perform any I/O during its timeslice.
- */
- tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
-}
/*
* switch_to(x,yn) should switch tasks from x to y.
@@ -540,7 +377,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
* used %fs or %gs (it does not today), or if the kernel is
* running inside of a hypervisor layer.
*/
- savesegment(gs, prev->gs);
+ lazy_save_gs(prev->gs);
/*
* Load the per-thread Thread-Local Storage descriptor.
@@ -586,64 +423,44 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
* Restore %gs if needed (which is common)
*/
if (prev->gs | next->gs)
- loadsegment(gs, next->gs);
+ lazy_load_gs(next->gs);
- x86_write_percpu(current_task, next_p);
+ percpu_write(current_task, next_p);
return prev_p;
}
-asmlinkage int sys_fork(struct pt_regs regs)
-{
- return do_fork(SIGCHLD, regs.sp, &regs, 0, NULL, NULL);
-}
-
-asmlinkage int sys_clone(struct pt_regs regs)
+int sys_clone(struct pt_regs *regs)
{
unsigned long clone_flags;
unsigned long newsp;
int __user *parent_tidptr, *child_tidptr;
- clone_flags = regs.bx;
- newsp = regs.cx;
- parent_tidptr = (int __user *)regs.dx;
- child_tidptr = (int __user *)regs.di;
+ clone_flags = regs->bx;
+ newsp = regs->cx;
+ parent_tidptr = (int __user *)regs->dx;
+ child_tidptr = (int __user *)regs->di;
if (!newsp)
- newsp = regs.sp;
- return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
-}
-
-/*
- * This is trivial, and on the face of it looks like it
- * could equally well be done in user mode.
- *
- * Not so, for quite unobvious reasons - register pressure.
- * In user mode vfork() cannot have a stack frame, and if
- * done by calling the "clone()" system call directly, you
- * do not have enough call-clobbered registers to hold all
- * the information you need.
- */
-asmlinkage int sys_vfork(struct pt_regs regs)
-{
- return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.sp, &regs, 0, NULL, NULL);
+ newsp = regs->sp;
+ return do_fork(clone_flags, newsp, regs, 0, parent_tidptr, child_tidptr);
}
/*
* sys_execve() executes a new program.
*/
-asmlinkage int sys_execve(struct pt_regs regs)
+int sys_execve(struct pt_regs *regs)
{
int error;
char *filename;
- filename = getname((char __user *) regs.bx);
+ filename = getname((char __user *) regs->bx);
error = PTR_ERR(filename);
if (IS_ERR(filename))
goto out;
error = do_execve(filename,
- (char __user * __user *) regs.cx,
- (char __user * __user *) regs.dx,
- &regs);
+ (char __user * __user *) regs->cx,
+ (char __user * __user *) regs->dx,
+ regs);
if (error == 0) {
/* Make sure we don't return using sysenter.. */
set_thread_flag(TIF_IRET);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 85b4cb5c198..abb7e6a7f0c 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -16,6 +16,7 @@
#include <stdarg.h>
+#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
@@ -47,7 +48,6 @@
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
-#include <asm/pda.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
@@ -58,6 +58,12 @@
asmlinkage extern void ret_from_fork(void);
+DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
+EXPORT_PER_CPU_SYMBOL(current_task);
+
+DEFINE_PER_CPU(unsigned long, old_rsp);
+static DEFINE_PER_CPU(unsigned char, is_idle);
+
unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
static ATOMIC_NOTIFIER_HEAD(idle_notifier);
@@ -76,13 +82,13 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregister);
void enter_idle(void)
{
- write_pda(isidle, 1);
+ percpu_write(is_idle, 1);
atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}
static void __exit_idle(void)
{
- if (test_and_clear_bit_pda(0, isidle) == 0)
+ if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
return;
atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}
@@ -112,6 +118,16 @@ static inline void play_dead(void)
void cpu_idle(void)
{
current_thread_info()->status |= TS_POLLING;
+
+ /*
+ * If we're the non-boot CPU, nothing set the stack canary up
+ * for us. CPU0 already has it initialized but no harm in
+ * doing it again. This is a good place for updating it, as
+ * we wont ever return from this function (so the invalid
+ * canaries already on the stack wont ever trigger).
+ */
+ boot_init_stack_canary();
+
/* endless idle loop with no priority at all */
while (1) {
tick_nohz_stop_sched_tick(1);
@@ -221,61 +237,6 @@ void show_regs(struct pt_regs *regs)
show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}
-/*
- * Free current thread data structures etc..
- */
-void exit_thread(void)
-{
- struct task_struct *me = current;
- struct thread_struct *t = &me->thread;
-
- if (me->thread.io_bitmap_ptr) {
- struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
-
- kfree(t->io_bitmap_ptr);
- t->io_bitmap_ptr = NULL;
- clear_thread_flag(TIF_IO_BITMAP);
- /*
- * Careful, clear this in the TSS too:
- */
- memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
- t->io_bitmap_max = 0;
- put_cpu();
- }
-
- ds_exit_thread(current);
-}
-
-void flush_thread(void)
-{
- struct task_struct *tsk = current;
-
- if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
- clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
- if (test_tsk_thread_flag(tsk, TIF_IA32)) {
- clear_tsk_thread_flag(tsk, TIF_IA32);
- } else {
- set_tsk_thread_flag(tsk, TIF_IA32);
- current_thread_info()->status |= TS_COMPAT;
- }
- }
- clear_tsk_thread_flag(tsk, TIF_DEBUG);
-
- tsk->thread.debugreg0 = 0;
- tsk->thread.debugreg1 = 0;
- tsk->thread.debugreg2 = 0;
- tsk->thread.debugreg3 = 0;
- tsk->thread.debugreg6 = 0;
- tsk->thread.debugreg7 = 0;
- memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
- /*
- * Forget coprocessor state..
- */
- tsk->fpu_counter = 0;
- clear_fpu(tsk);
- clear_used_math();
-}
-
void release_thread(struct task_struct *dead_task)
{
if (dead_task->mm) {
@@ -397,7 +358,7 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
load_gs_index(0);
regs->ip = new_ip;
regs->sp = new_sp;
- write_pda(oldrsp, new_sp);
+ percpu_write(old_rsp, new_sp);
regs->cs = __USER_CS;
regs->ss = __USER_DS;
regs->flags = 0x200;
@@ -409,118 +370,6 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
}
EXPORT_SYMBOL_GPL(start_thread);
-static void hard_disable_TSC(void)
-{
- write_cr4(read_cr4() | X86_CR4_TSD);
-}
-
-void disable_TSC(void)
-{
- preempt_disable();
- if (!test_and_set_thread_flag(TIF_NOTSC))
- /*
- * Must flip the CPU state synchronously with
- * TIF_NOTSC in the current running context.
- */
- hard_disable_TSC();
- preempt_enable();
-}
-
-static void hard_enable_TSC(void)
-{
- write_cr4(read_cr4() & ~X86_CR4_TSD);
-}
-
-static void enable_TSC(void)
-{
- preempt_disable();
- if (test_and_clear_thread_flag(TIF_NOTSC))
- /*
- * Must flip the CPU state synchronously with
- * TIF_NOTSC in the current running context.
- */
- hard_enable_TSC();
- preempt_enable();
-}
-
-int get_tsc_mode(unsigned long adr)
-{
- unsigned int val;
-
- if (test_thread_flag(TIF_NOTSC))
- val = PR_TSC_SIGSEGV;
- else
- val = PR_TSC_ENABLE;
-
- return put_user(val, (unsigned int __user *)adr);
-}
-
-int set_tsc_mode(unsigned int val)
-{
- if (val == PR_TSC_SIGSEGV)
- disable_TSC();
- else if (val == PR_TSC_ENABLE)
- enable_TSC();
- else
- return -EINVAL;
-
- return 0;
-}
-
-/*
- * This special macro can be used to load a debugging register
- */
-#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
-
-static inline void __switch_to_xtra(struct task_struct *prev_p,
- struct task_struct *next_p,
- struct tss_struct *tss)
-{
- struct thread_struct *prev, *next;
-
- prev = &prev_p->thread,
- next = &next_p->thread;
-
- if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
- test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
- ds_switch_to(prev_p, next_p);
- else if (next->debugctlmsr != prev->debugctlmsr)
- update_debugctlmsr(next->debugctlmsr);
-
- if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
- loaddebug(next, 0);
- loaddebug(next, 1);
- loaddebug(next, 2);
- loaddebug(next, 3);
- /* no 4 and 5 */
- loaddebug(next, 6);
- loaddebug(next, 7);
- }
-
- if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
- test_tsk_thread_flag(next_p, TIF_NOTSC)) {
- /* prev and next are different */
- if (test_tsk_thread_flag(next_p, TIF_NOTSC))
- hard_disable_TSC();
- else
- hard_enable_TSC();
- }
-
- if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
- /*
- * Copy the relevant range of the IO bitmap.
- * Normally this is 128 bytes or less:
- */
- memcpy(tss->io_bitmap, next->io_bitmap_ptr,
- max(prev->io_bitmap_max, next->io_bitmap_max));
- } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
- /*
- * Clear any possible leftover bits:
- */
- memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
- }
-}
-
/*
* switch_to(x,y) should switch tasks from x to y.
*
@@ -618,21 +467,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/*
* Switch the PDA and FPU contexts.
*/
- prev->usersp = read_pda(oldrsp);
- write_pda(oldrsp, next->usersp);
- write_pda(pcurrent, next_p);
+ prev->usersp = percpu_read(old_rsp);
+ percpu_write(old_rsp, next->usersp);
+ percpu_write(current_task, next_p);
- write_pda(kernelstack,
+ percpu_write(kernel_stack,
(unsigned long)task_stack_page(next_p) +
- THREAD_SIZE - PDA_STACKOFFSET);
-#ifdef CONFIG_CC_STACKPROTECTOR
- write_pda(stack_canary, next_p->stack_canary);
- /*
- * Build time only check to make sure the stack_canary is at
- * offset 40 in the pda; this is a gcc ABI requirement
- */
- BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
-#endif
+ THREAD_SIZE - KERNEL_STACK_OFFSET);
/*
* Now maybe reload the debug registers and handle I/O bitmaps
@@ -686,11 +527,6 @@ void set_personality_64bit(void)
current->personality &= ~READ_IMPLIES_EXEC;
}
-asmlinkage long sys_fork(struct pt_regs *regs)
-{
- return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
-}
-
asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
@@ -700,22 +536,6 @@ sys_clone(unsigned long clone_flags, unsigned long newsp,
return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}
-/*
- * This is trivial, and on the face of it looks like it
- * could equally well be done in user mode.
- *
- * Not so, for quite unobvious reasons - register pressure.
- * In user mode vfork() cannot have a stack frame, and if
- * done by calling the "clone()" system call directly, you
- * do not have enough call-clobbered registers to hold all
- * the information you need.
- */
-asmlinkage long sys_vfork(struct pt_regs *regs)
-{
- return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
- NULL, NULL);
-}
-
unsigned long get_wchan(struct task_struct *p)
{
unsigned long stack;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 06ca07f6ad8..19378715f41 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -75,10 +75,7 @@ static inline bool invalid_selector(u16 value)
static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno)
{
BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0);
- regno >>= 2;
- if (regno > FS)
- --regno;
- return &regs->bx + regno;
+ return &regs->bx + (regno >> 2);
}
static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
@@ -90,9 +87,10 @@ static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
if (offset != offsetof(struct user_regs_struct, gs))
retval = *pt_regs_access(task_pt_regs(task), offset);
else {
- retval = task->thread.gs;
if (task == current)
- savesegment(gs, retval);
+ retval = get_user_gs(task_pt_regs(task));
+ else
+ retval = task_user_gs(task);
}
return retval;
}
@@ -126,13 +124,10 @@ static int set_segment_reg(struct task_struct *task,
break;
case offsetof(struct user_regs_struct, gs):
- task->thread.gs = value;
if (task == current)
- /*
- * The user-mode %gs is not affected by
- * kernel entry, so we must update the CPU.
- */
- loadsegment(gs, value);
+ set_user_gs(task_pt_regs(task), value);
+ else
+ task_user_gs(task) = value;
}
return 0;
@@ -273,7 +268,7 @@ static unsigned long debugreg_addr_limit(struct task_struct *task)
if (test_tsk_thread_flag(task, TIF_IA32))
return IA32_PAGE_OFFSET - 3;
#endif
- return TASK_SIZE64 - 7;
+ return TASK_SIZE_MAX - 7;
}
#endif /* CONFIG_X86_32 */
@@ -690,9 +685,8 @@ static int ptrace_bts_config(struct task_struct *child,
if (!cfg.signal)
return -EINVAL;
- return -EOPNOTSUPP;
-
child->thread.bts_ovfl_signal = cfg.signal;
+ return -EOPNOTSUPP;
}
if ((cfg.flags & PTRACE_BTS_O_ALLOC) &&
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 309949e9e1c..6a5a2970f4c 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -74,8 +74,7 @@ static void ich_force_hpet_resume(void)
if (!force_hpet_address)
return;
- if (rcba_base == NULL)
- BUG();
+ BUG_ON(rcba_base == NULL);
/* read the Function Disable register, dword mode only */
val = readl(rcba_base + 0x3404);
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 4526b3a75ed..2aef36d8aca 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -14,6 +14,7 @@
#include <asm/reboot.h>
#include <asm/pci_x86.h>
#include <asm/virtext.h>
+#include <asm/cpu.h>
#ifdef CONFIG_X86_32
# include <linux/dmi.h>
@@ -23,8 +24,6 @@
# include <asm/iommu.h>
#endif
-#include <mach_ipi.h>
-
/*
* Power off function, if any
*/
@@ -658,7 +657,7 @@ static int crash_nmi_callback(struct notifier_block *self,
static void smp_send_nmi_allbutself(void)
{
- send_IPI_allbutself(NMI_VECTOR);
+ apic->send_IPI_allbutself(NMI_VECTOR);
}
static struct notifier_block crash_nmi_nb = {
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index a160f311972..41235531b11 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -7,7 +7,7 @@
*/
#include <linux/linkage.h>
-#include <asm/page.h>
+#include <asm/page_types.h>
#include <asm/kexec.h>
#include <asm/processor-flags.h>
@@ -17,7 +17,8 @@
#define PTR(x) (x << 2)
-/* control_page + KEXEC_CONTROL_CODE_MAX_SIZE
+/*
+ * control_page + KEXEC_CONTROL_CODE_MAX_SIZE
* ~ control_page + PAGE_SIZE are used as data storage and stack for
* jumping back
*/
@@ -76,8 +77,10 @@ relocate_kernel:
movl %eax, CP_PA_SWAP_PAGE(%edi)
movl %ebx, CP_PA_BACKUP_PAGES_MAP(%edi)
- /* get physical address of control page now */
- /* this is impossible after page table switch */
+ /*
+ * get physical address of control page now
+ * this is impossible after page table switch
+ */
movl PTR(PA_CONTROL_PAGE)(%ebp), %edi
/* switch to new set of page tables */
@@ -97,7 +100,8 @@ identity_mapped:
/* store the start address on the stack */
pushl %edx
- /* Set cr0 to a known state:
+ /*
+ * Set cr0 to a known state:
* - Paging disabled
* - Alignment check disabled
* - Write protect disabled
@@ -113,7 +117,8 @@ identity_mapped:
/* clear cr4 if applicable */
testl %ecx, %ecx
jz 1f
- /* Set cr4 to a known state:
+ /*
+ * Set cr4 to a known state:
* Setting everything to zero seems safe.
*/
xorl %eax, %eax
@@ -132,15 +137,18 @@ identity_mapped:
call swap_pages
addl $8, %esp
- /* To be certain of avoiding problems with self-modifying code
+ /*
+ * To be certain of avoiding problems with self-modifying code
* I need to execute a serializing instruction here.
* So I flush the TLB, it's handy, and not processor dependent.
*/
xorl %eax, %eax
movl %eax, %cr3
- /* set all of the registers to known values */
- /* leave %esp alone */
+ /*
+ * set all of the registers to known values
+ * leave %esp alone
+ */
testl %esi, %esi
jnz 1f
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index f5afe665a82..4de8f5b3d47 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -7,10 +7,10 @@
*/
#include <linux/linkage.h>
-#include <asm/page.h>
+#include <asm/page_types.h>
#include <asm/kexec.h>
#include <asm/processor-flags.h>
-#include <asm/pgtable.h>
+#include <asm/pgtable_types.h>
/*
* Must be relocatable PIC code callable as a C function
@@ -19,145 +19,76 @@
#define PTR(x) (x << 3)
#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+/*
+ * control_page + KEXEC_CONTROL_CODE_MAX_SIZE
+ * ~ control_page + PAGE_SIZE are used as data storage and stack for
+ * jumping back
+ */
+#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset))
+
+/* Minimal CPU state */
+#define RSP DATA(0x0)
+#define CR0 DATA(0x8)
+#define CR3 DATA(0x10)
+#define CR4 DATA(0x18)
+
+/* other data */
+#define CP_PA_TABLE_PAGE DATA(0x20)
+#define CP_PA_SWAP_PAGE DATA(0x28)
+#define CP_PA_BACKUP_PAGES_MAP DATA(0x30)
+
.text
.align PAGE_SIZE
.code64
.globl relocate_kernel
relocate_kernel:
- /* %rdi indirection_page
+ /*
+ * %rdi indirection_page
* %rsi page_list
* %rdx start address
+ * %rcx preserve_context
*/
- /* map the control page at its virtual address */
-
- movq $0x0000ff8000000000, %r10 /* mask */
- mov $(39 - 3), %cl /* bits to shift */
- movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 /* address to map */
-
- movq %r11, %r9
- andq %r10, %r9
- shrq %cl, %r9
-
- movq PTR(VA_PGD)(%rsi), %r8
- addq %r8, %r9
- movq PTR(PA_PUD_0)(%rsi), %r8
- orq $PAGE_ATTR, %r8
- movq %r8, (%r9)
-
- shrq $9, %r10
- sub $9, %cl
-
- movq %r11, %r9
- andq %r10, %r9
- shrq %cl, %r9
-
- movq PTR(VA_PUD_0)(%rsi), %r8
- addq %r8, %r9
- movq PTR(PA_PMD_0)(%rsi), %r8
- orq $PAGE_ATTR, %r8
- movq %r8, (%r9)
-
- shrq $9, %r10
- sub $9, %cl
-
- movq %r11, %r9
- andq %r10, %r9
- shrq %cl, %r9
-
- movq PTR(VA_PMD_0)(%rsi), %r8
- addq %r8, %r9
- movq PTR(PA_PTE_0)(%rsi), %r8
- orq $PAGE_ATTR, %r8
- movq %r8, (%r9)
-
- shrq $9, %r10
- sub $9, %cl
-
- movq %r11, %r9
- andq %r10, %r9
- shrq %cl, %r9
-
- movq PTR(VA_PTE_0)(%rsi), %r8
- addq %r8, %r9
- movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
- orq $PAGE_ATTR, %r8
- movq %r8, (%r9)
-
- /* identity map the control page at its physical address */
-
- movq $0x0000ff8000000000, %r10 /* mask */
- mov $(39 - 3), %cl /* bits to shift */
- movq PTR(PA_CONTROL_PAGE)(%rsi), %r11 /* address to map */
-
- movq %r11, %r9
- andq %r10, %r9
- shrq %cl, %r9
-
- movq PTR(VA_PGD)(%rsi), %r8
- addq %r8, %r9
- movq PTR(PA_PUD_1)(%rsi), %r8
- orq $PAGE_ATTR, %r8
- movq %r8, (%r9)
-
- shrq $9, %r10
- sub $9, %cl
-
- movq %r11, %r9
- andq %r10, %r9
- shrq %cl, %r9
-
- movq PTR(VA_PUD_1)(%rsi), %r8
- addq %r8, %r9
- movq PTR(PA_PMD_1)(%rsi), %r8
- orq $PAGE_ATTR, %r8
- movq %r8, (%r9)
-
- shrq $9, %r10
- sub $9, %cl
-
- movq %r11, %r9
- andq %r10, %r9
- shrq %cl, %r9
-
- movq PTR(VA_PMD_1)(%rsi), %r8
- addq %r8, %r9
- movq PTR(PA_PTE_1)(%rsi), %r8
- orq $PAGE_ATTR, %r8
- movq %r8, (%r9)
-
- shrq $9, %r10
- sub $9, %cl
-
- movq %r11, %r9
- andq %r10, %r9
- shrq %cl, %r9
-
- movq PTR(VA_PTE_1)(%rsi), %r8
- addq %r8, %r9
- movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
- orq $PAGE_ATTR, %r8
- movq %r8, (%r9)
-
-relocate_new_kernel:
- /* %rdi indirection_page
- * %rsi page_list
- * %rdx start address
- */
+ /* Save the CPU context, used for jumping back */
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushf
+
+ movq PTR(VA_CONTROL_PAGE)(%rsi), %r11
+ movq %rsp, RSP(%r11)
+ movq %cr0, %rax
+ movq %rax, CR0(%r11)
+ movq %cr3, %rax
+ movq %rax, CR3(%r11)
+ movq %cr4, %rax
+ movq %rax, CR4(%r11)
/* zero out flags, and disable interrupts */
pushq $0
popfq
- /* get physical address of control page now */
- /* this is impossible after page table switch */
+ /*
+ * get physical address of control page now
+ * this is impossible after page table switch
+ */
movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
/* get physical address of page table now too */
- movq PTR(PA_TABLE_PAGE)(%rsi), %rcx
+ movq PTR(PA_TABLE_PAGE)(%rsi), %r9
+
+ /* get physical address of swap page now */
+ movq PTR(PA_SWAP_PAGE)(%rsi), %r10
- /* switch to new set of page tables */
- movq PTR(PA_PGD)(%rsi), %r9
+ /* save some information for jumping back */
+ movq %r9, CP_PA_TABLE_PAGE(%r11)
+ movq %r10, CP_PA_SWAP_PAGE(%r11)
+ movq %rdi, CP_PA_BACKUP_PAGES_MAP(%r11)
+
+ /* Switch to the identity mapped page tables */
movq %r9, %cr3
/* setup a new stack at the end of the physical control page */
@@ -172,7 +103,8 @@ identity_mapped:
/* store the start address on the stack */
pushq %rdx
- /* Set cr0 to a known state:
+ /*
+ * Set cr0 to a known state:
* - Paging enabled
* - Alignment check disabled
* - Write protect disabled
@@ -185,7 +117,8 @@ identity_mapped:
orl $(X86_CR0_PG | X86_CR0_PE), %eax
movq %rax, %cr0
- /* Set cr4 to a known state:
+ /*
+ * Set cr4 to a known state:
* - physical address extension enabled
*/
movq $X86_CR4_PAE, %rax
@@ -194,12 +127,88 @@ identity_mapped:
jmp 1f
1:
- /* Switch to the identity mapped page tables,
- * and flush the TLB.
- */
- movq %rcx, %cr3
+ /* Flush the TLB (needed?) */
+ movq %r9, %cr3
+
+ movq %rcx, %r11
+ call swap_pages
+
+ /*
+ * To be certain of avoiding problems with self-modifying code
+ * I need to execute a serializing instruction here.
+ * So I flush the TLB by reloading %cr3 here, it's handy,
+ * and not processor dependent.
+ */
+ movq %cr3, %rax
+ movq %rax, %cr3
+
+ /*
+ * set all of the registers to known values
+ * leave %rsp alone
+ */
+
+ testq %r11, %r11
+ jnz 1f
+ xorq %rax, %rax
+ xorq %rbx, %rbx
+ xorq %rcx, %rcx
+ xorq %rdx, %rdx
+ xorq %rsi, %rsi
+ xorq %rdi, %rdi
+ xorq %rbp, %rbp
+ xorq %r8, %r8
+ xorq %r9, %r9
+ xorq %r10, %r9
+ xorq %r11, %r11
+ xorq %r12, %r12
+ xorq %r13, %r13
+ xorq %r14, %r14
+ xorq %r15, %r15
+
+ ret
+
+1:
+ popq %rdx
+ leaq PAGE_SIZE(%r10), %rsp
+ call *%rdx
+
+ /* get the re-entry point of the peer system */
+ movq 0(%rsp), %rbp
+ call 1f
+1:
+ popq %r8
+ subq $(1b - relocate_kernel), %r8
+ movq CP_PA_SWAP_PAGE(%r8), %r10
+ movq CP_PA_BACKUP_PAGES_MAP(%r8), %rdi
+ movq CP_PA_TABLE_PAGE(%r8), %rax
+ movq %rax, %cr3
+ lea PAGE_SIZE(%r8), %rsp
+ call swap_pages
+ movq $virtual_mapped, %rax
+ pushq %rax
+ ret
+
+virtual_mapped:
+ movq RSP(%r8), %rsp
+ movq CR4(%r8), %rax
+ movq %rax, %cr4
+ movq CR3(%r8), %rax
+ movq CR0(%r8), %r8
+ movq %rax, %cr3
+ movq %r8, %cr0
+ movq %rbp, %rax
+
+ popf
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbp
+ popq %rbx
+ ret
/* Do the copies */
+swap_pages:
movq %rdi, %rcx /* Put the page_list in %rcx */
xorq %rdi, %rdi
xorq %rsi, %rsi
@@ -231,36 +240,27 @@ identity_mapped:
movq %rcx, %rsi /* For ever source page do a copy */
andq $0xfffffffffffff000, %rsi
+ movq %rdi, %rdx
+ movq %rsi, %rax
+
+ movq %r10, %rdi
movq $512, %rcx
rep ; movsq
- jmp 0b
-3:
-
- /* To be certain of avoiding problems with self-modifying code
- * I need to execute a serializing instruction here.
- * So I flush the TLB by reloading %cr3 here, it's handy,
- * and not processor dependent.
- */
- movq %cr3, %rax
- movq %rax, %cr3
- /* set all of the registers to known values */
- /* leave %rsp alone */
+ movq %rax, %rdi
+ movq %rdx, %rsi
+ movq $512, %rcx
+ rep ; movsq
- xorq %rax, %rax
- xorq %rbx, %rbx
- xorq %rcx, %rcx
- xorq %rdx, %rdx
- xorq %rsi, %rsi
- xorq %rdi, %rdi
- xorq %rbp, %rbp
- xorq %r8, %r8
- xorq %r9, %r9
- xorq %r10, %r9
- xorq %r11, %r11
- xorq %r12, %r12
- xorq %r13, %r13
- xorq %r14, %r14
- xorq %r15, %r15
+ movq %rdx, %rdi
+ movq %r10, %rsi
+ movq $512, %rcx
+ rep ; movsq
+ lea PAGE_SIZE(%rax), %rsi
+ jmp 0b
+3:
ret
+
+ .globl kexec_control_code_size
+.set kexec_control_code_size, . - relocate_kernel
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 6a8811a6932..a0d26237d7c 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -74,14 +74,15 @@
#include <asm/e820.h>
#include <asm/mpspec.h>
#include <asm/setup.h>
-#include <asm/arch_hooks.h>
#include <asm/efi.h>
+#include <asm/timer.h>
+#include <asm/i8259.h>
#include <asm/sections.h>
#include <asm/dmi.h>
#include <asm/io_apic.h>
#include <asm/ist.h>
#include <asm/vmi.h>
-#include <setup_arch.h>
+#include <asm/setup_arch.h>
#include <asm/bios_ebda.h>
#include <asm/cacheflush.h>
#include <asm/processor.h>
@@ -89,7 +90,7 @@
#include <asm/system.h>
#include <asm/vsyscall.h>
-#include <asm/smp.h>
+#include <asm/cpu.h>
#include <asm/desc.h>
#include <asm/dma.h>
#include <asm/iommu.h>
@@ -97,7 +98,6 @@
#include <asm/mmu_context.h>
#include <asm/proto.h>
-#include <mach_apic.h>
#include <asm/paravirt.h>
#include <asm/hypervisor.h>
@@ -112,6 +112,25 @@
#define ARCH_SETUP
#endif
+RESERVE_BRK(dmi_alloc, 65536);
+
+unsigned int boot_cpu_id __read_mostly;
+
+static __initdata unsigned long _brk_start = (unsigned long)__brk_base;
+unsigned long _brk_end = (unsigned long)__brk_base;
+
+#ifdef CONFIG_X86_64
+int default_cpu_present_to_apicid(int mps_cpu)
+{
+ return __default_cpu_present_to_apicid(mps_cpu);
+}
+
+int default_check_phys_apicid_present(int boot_cpu_physical_apicid)
+{
+ return __default_check_phys_apicid_present(boot_cpu_physical_apicid);
+}
+#endif
+
#ifndef CONFIG_DEBUG_BOOT_PARAMS
struct boot_params __initdata boot_params;
#else
@@ -144,12 +163,6 @@ static struct resource bss_resource = {
#ifdef CONFIG_X86_32
-/* This value is set up by the early boot code to point to the value
- immediately after the boot time page tables. It contains a *physical*
- address, and must not be in the .bss segment! */
-unsigned long init_pg_tables_start __initdata = ~0UL;
-unsigned long init_pg_tables_end __initdata = ~0UL;
-
static struct resource video_ram_resource = {
.name = "Video RAM area",
.start = 0xa0000,
@@ -188,7 +201,9 @@ struct ist_info ist_info;
#endif
#else
-struct cpuinfo_x86 boot_cpu_data __read_mostly;
+struct cpuinfo_x86 boot_cpu_data __read_mostly = {
+ .x86_phys_bits = MAX_PHYSMEM_BITS,
+};
EXPORT_SYMBOL(boot_cpu_data);
#endif
@@ -203,12 +218,6 @@ unsigned long mmu_cr4_features = X86_CR4_PAE;
int bootloader_type;
/*
- * Early DMI memory
- */
-int dmi_alloc_index;
-char dmi_alloc_data[DMI_MAX_DATA];
-
-/*
* Setup options
*/
struct screen_info screen_info;
@@ -253,6 +262,35 @@ static inline void copy_edd(void)
}
#endif
+void * __init extend_brk(size_t size, size_t align)
+{
+ size_t mask = align - 1;
+ void *ret;
+
+ BUG_ON(_brk_start == 0);
+ BUG_ON(align & mask);
+
+ _brk_end = (_brk_end + mask) & ~mask;
+ BUG_ON((char *)(_brk_end + size) > __brk_limit);
+
+ ret = (void *)_brk_end;
+ _brk_end += size;
+
+ memset(ret, 0, size);
+
+ return ret;
+}
+
+static void __init reserve_brk(void)
+{
+ if (_brk_end > _brk_start)
+ reserve_early(__pa(_brk_start), __pa(_brk_end), "BRK");
+
+ /* Mark brk area as locked down and no longer taking any
+ new allocations */
+ _brk_start = 0;
+}
+
#ifdef CONFIG_BLK_DEV_INITRD
#ifdef CONFIG_X86_32
@@ -586,20 +624,7 @@ static int __init setup_elfcorehdr(char *arg)
early_param("elfcorehdr", setup_elfcorehdr);
#endif
-static int __init default_update_genapic(void)
-{
-#ifdef CONFIG_X86_SMP
-# if defined(CONFIG_X86_GENERICARCH) || defined(CONFIG_X86_64)
- genapic->wakeup_cpu = wakeup_secondary_cpu_via_init;
-# endif
-#endif
-
- return 0;
-}
-
-static struct x86_quirks default_x86_quirks __initdata = {
- .update_genapic = default_update_genapic,
-};
+static struct x86_quirks default_x86_quirks __initdata;
struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
@@ -656,7 +681,6 @@ void __init setup_arch(char **cmdline_p)
#ifdef CONFIG_X86_32
memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
visws_early_detect();
- pre_setup_arch_hook();
#else
printk(KERN_INFO "Command line: %s\n", boot_command_line);
#endif
@@ -715,11 +739,7 @@ void __init setup_arch(char **cmdline_p)
init_mm.start_code = (unsigned long) _text;
init_mm.end_code = (unsigned long) _etext;
init_mm.end_data = (unsigned long) _edata;
-#ifdef CONFIG_X86_32
- init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
-#else
- init_mm.brk = (unsigned long) &_end;
-#endif
+ init_mm.brk = _brk_end;
code_resource.start = virt_to_phys(_text);
code_resource.end = virt_to_phys(_etext)-1;
@@ -824,8 +844,7 @@ void __init setup_arch(char **cmdline_p)
#else
num_physpages = max_pfn;
- if (cpu_has_x2apic)
- check_x2apic();
+ check_x2apic();
/* How many end-of-memory variables you have, grandma! */
/* need this before calling reserve_initrd */
@@ -841,6 +860,8 @@ void __init setup_arch(char **cmdline_p)
setup_bios_corruption_check();
#endif
+ reserve_brk();
+
/* max_pfn_mapped is updated here */
max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
max_pfn_mapped = max_low_pfn_mapped;
@@ -865,9 +886,7 @@ void __init setup_arch(char **cmdline_p)
reserve_initrd();
-#ifdef CONFIG_X86_64
vsmp_init();
-#endif
io_delay_init();
@@ -893,12 +912,11 @@ void __init setup_arch(char **cmdline_p)
*/
acpi_reserve_bootmem();
#endif
-#ifdef CONFIG_X86_FIND_SMP_CONFIG
/*
* Find and reserve possible boot-time SMP configuration:
*/
find_smp_config();
-#endif
+
reserve_crashkernel();
#ifdef CONFIG_X86_64
@@ -925,9 +943,7 @@ void __init setup_arch(char **cmdline_p)
map_vsyscall();
#endif
-#ifdef CONFIG_X86_GENERICARCH
generic_apic_probe();
-#endif
early_quirks();
@@ -978,4 +994,95 @@ void __init setup_arch(char **cmdline_p)
#endif
}
+#ifdef CONFIG_X86_32
+
+/**
+ * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
+ *
+ * Description:
+ * Perform any necessary interrupt initialisation prior to setting up
+ * the "ordinary" interrupt call gates. For legacy reasons, the ISA
+ * interrupts should be initialised here if the machine emulates a PC
+ * in any way.
+ **/
+void __init x86_quirk_pre_intr_init(void)
+{
+ if (x86_quirks->arch_pre_intr_init) {
+ if (x86_quirks->arch_pre_intr_init())
+ return;
+ }
+ init_ISA_irqs();
+}
+/**
+ * x86_quirk_intr_init - post gate setup interrupt initialisation
+ *
+ * Description:
+ * Fill in any interrupts that may have been left out by the general
+ * init_IRQ() routine. interrupts having to do with the machine rather
+ * than the devices on the I/O bus (like APIC interrupts in intel MP
+ * systems) are started here.
+ **/
+void __init x86_quirk_intr_init(void)
+{
+ if (x86_quirks->arch_intr_init) {
+ if (x86_quirks->arch_intr_init())
+ return;
+ }
+}
+
+/**
+ * x86_quirk_trap_init - initialise system specific traps
+ *
+ * Description:
+ * Called as the final act of trap_init(). Used in VISWS to initialise
+ * the various board specific APIC traps.
+ **/
+void __init x86_quirk_trap_init(void)
+{
+ if (x86_quirks->arch_trap_init) {
+ if (x86_quirks->arch_trap_init())
+ return;
+ }
+}
+
+static struct irqaction irq0 = {
+ .handler = timer_interrupt,
+ .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER,
+ .mask = CPU_MASK_NONE,
+ .name = "timer"
+};
+
+/**
+ * x86_quirk_pre_time_init - do any specific initialisations before.
+ *
+ **/
+void __init x86_quirk_pre_time_init(void)
+{
+ if (x86_quirks->arch_pre_time_init)
+ x86_quirks->arch_pre_time_init();
+}
+
+/**
+ * x86_quirk_time_init - do any specific initialisations for the system timer.
+ *
+ * Description:
+ * Must plug the system timer interrupt source at HZ into the IRQ listed
+ * in irq_vectors.h:TIMER_IRQ
+ **/
+void __init x86_quirk_time_init(void)
+{
+ if (x86_quirks->arch_time_init) {
+ /*
+ * A nonzero return code does not mean failure, it means
+ * that the architecture quirk does not want any
+ * generic (timer) setup to be performed after this:
+ */
+ if (x86_quirks->arch_time_init())
+ return;
+ }
+
+ irq0.mask = cpumask_of_cpu(0);
+ setup_irq(0, &irq0);
+}
+#endif /* CONFIG_X86_32 */
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 01161077a49..400331b50a5 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -7,402 +7,439 @@
#include <linux/crash_dump.h>
#include <linux/smp.h>
#include <linux/topology.h>
+#include <linux/pfn.h>
#include <asm/sections.h>
#include <asm/processor.h>
#include <asm/setup.h>
#include <asm/mpspec.h>
#include <asm/apicdef.h>
#include <asm/highmem.h>
+#include <asm/proto.h>
+#include <asm/cpumask.h>
+#include <asm/cpu.h>
+#include <asm/stackprotector.h>
-#ifdef CONFIG_X86_LOCAL_APIC
-unsigned int num_processors;
-unsigned disabled_cpus __cpuinitdata;
-/* Processor that is doing the boot up */
-unsigned int boot_cpu_physical_apicid = -1U;
-EXPORT_SYMBOL(boot_cpu_physical_apicid);
-unsigned int max_physical_apicid;
-
-/* Bitmask of physically existing CPUs */
-physid_mask_t phys_cpu_present_map;
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+# define DBG(x...) printk(KERN_DEBUG x)
+#else
+# define DBG(x...)
#endif
-/* map cpu index to physical APIC ID */
-DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
-DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
-EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
-EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
-
-#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
-#define X86_64_NUMA 1
+DEFINE_PER_CPU(int, cpu_number);
+EXPORT_PER_CPU_SYMBOL(cpu_number);
-/* map cpu index to node index */
-DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
-EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
+#ifdef CONFIG_X86_64
+#define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load)
+#else
+#define BOOT_PERCPU_OFFSET 0
+#endif
-/* which logical CPUs are on which nodes */
-cpumask_t *node_to_cpumask_map;
-EXPORT_SYMBOL(node_to_cpumask_map);
+DEFINE_PER_CPU(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET;
+EXPORT_PER_CPU_SYMBOL(this_cpu_off);
-/* setup node_to_cpumask_map */
-static void __init setup_node_to_cpumask_map(void);
+unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
+ [0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET,
+};
+EXPORT_SYMBOL(__per_cpu_offset);
+/*
+ * On x86_64 symbols referenced from code should be reachable using
+ * 32bit relocations. Reserve space for static percpu variables in
+ * modules so that they are always served from the first chunk which
+ * is located at the percpu segment base. On x86_32, anything can
+ * address anywhere. No need to reserve space in the first chunk.
+ */
+#ifdef CONFIG_X86_64
+#define PERCPU_FIRST_CHUNK_RESERVE PERCPU_MODULE_RESERVE
#else
-static inline void setup_node_to_cpumask_map(void) { }
+#define PERCPU_FIRST_CHUNK_RESERVE 0
#endif
-#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP)
-/*
- * Copy data used in early init routines from the initial arrays to the
- * per cpu data areas. These arrays then become expendable and the
- * *_early_ptr's are zeroed indicating that the static arrays are gone.
+/**
+ * pcpu_need_numa - determine percpu allocation needs to consider NUMA
+ *
+ * If NUMA is not configured or there is only one NUMA node available,
+ * there is no reason to consider NUMA. This function determines
+ * whether percpu allocation should consider NUMA or not.
+ *
+ * RETURNS:
+ * true if NUMA should be considered; otherwise, false.
*/
-static void __init setup_per_cpu_maps(void)
+static bool __init pcpu_need_numa(void)
{
- int cpu;
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+ pg_data_t *last = NULL;
+ unsigned int cpu;
for_each_possible_cpu(cpu) {
- per_cpu(x86_cpu_to_apicid, cpu) =
- early_per_cpu_map(x86_cpu_to_apicid, cpu);
- per_cpu(x86_bios_cpu_apicid, cpu) =
- early_per_cpu_map(x86_bios_cpu_apicid, cpu);
-#ifdef X86_64_NUMA
- per_cpu(x86_cpu_to_node_map, cpu) =
- early_per_cpu_map(x86_cpu_to_node_map, cpu);
-#endif
- }
+ int node = early_cpu_to_node(cpu);
- /* indicate the early static arrays will soon be gone */
- early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
- early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
-#ifdef X86_64_NUMA
- early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
+ if (node_online(node) && NODE_DATA(node) &&
+ last && last != NODE_DATA(node))
+ return true;
+
+ last = NODE_DATA(node);
+ }
#endif
+ return false;
}
-#ifdef CONFIG_X86_32
-/*
- * Great future not-so-futuristic plan: make i386 and x86_64 do it
- * the same way
- */
-unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
-EXPORT_SYMBOL(__per_cpu_offset);
-static inline void setup_cpu_pda_map(void) { }
-
-#elif !defined(CONFIG_SMP)
-static inline void setup_cpu_pda_map(void) { }
-
-#else /* CONFIG_SMP && CONFIG_X86_64 */
-
-/*
- * Allocate cpu_pda pointer table and array via alloc_bootmem.
+/**
+ * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
+ * @cpu: cpu to allocate for
+ * @size: size allocation in bytes
+ * @align: alignment
+ *
+ * Allocate @size bytes aligned at @align for cpu @cpu. This wrapper
+ * does the right thing for NUMA regardless of the current
+ * configuration.
+ *
+ * RETURNS:
+ * Pointer to the allocated area on success, NULL on failure.
*/
-static void __init setup_cpu_pda_map(void)
+static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
+ unsigned long align)
{
- char *pda;
- struct x8664_pda **new_cpu_pda;
- unsigned long size;
- int cpu;
-
- size = roundup(sizeof(struct x8664_pda), cache_line_size());
-
- /* allocate cpu_pda array and pointer table */
- {
- unsigned long tsize = nr_cpu_ids * sizeof(void *);
- unsigned long asize = size * (nr_cpu_ids - 1);
-
- tsize = roundup(tsize, cache_line_size());
- new_cpu_pda = alloc_bootmem(tsize + asize);
- pda = (char *)new_cpu_pda + tsize;
- }
-
- /* initialize pointer table to static pda's */
- for_each_possible_cpu(cpu) {
- if (cpu == 0) {
- /* leave boot cpu pda in place */
- new_cpu_pda[0] = cpu_pda(0);
- continue;
- }
- new_cpu_pda[cpu] = (struct x8664_pda *)pda;
- new_cpu_pda[cpu]->in_bootmem = 1;
- pda += size;
+ const unsigned long goal = __pa(MAX_DMA_ADDRESS);
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+ int node = early_cpu_to_node(cpu);
+ void *ptr;
+
+ if (!node_online(node) || !NODE_DATA(node)) {
+ ptr = __alloc_bootmem_nopanic(size, align, goal);
+ pr_info("cpu %d has no node %d or node-local memory\n",
+ cpu, node);
+ pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
+ cpu, size, __pa(ptr));
+ } else {
+ ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
+ size, align, goal);
+ pr_debug("per cpu data for cpu%d %lu bytes on node%d at "
+ "%016lx\n", cpu, size, node, __pa(ptr));
}
-
- /* point to new pointer table */
- _cpu_pda = new_cpu_pda;
+ return ptr;
+#else
+ return __alloc_bootmem_nopanic(size, align, goal);
+#endif
}
-#endif /* CONFIG_SMP && CONFIG_X86_64 */
-
-#ifdef CONFIG_X86_64
+/*
+ * Remap allocator
+ *
+ * This allocator uses PMD page as unit. A PMD page is allocated for
+ * each cpu and each is remapped into vmalloc area using PMD mapping.
+ * As PMD page is quite large, only part of it is used for the first
+ * chunk. Unused part is returned to the bootmem allocator.
+ *
+ * So, the PMD pages are mapped twice - once to the physical mapping
+ * and to the vmalloc area for the first percpu chunk. The double
+ * mapping does add one more PMD TLB entry pressure but still is much
+ * better than only using 4k mappings while still being NUMA friendly.
+ */
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+static size_t pcpur_size __initdata;
+static void **pcpur_ptrs __initdata;
-/* correctly size the local cpu masks */
-static void __init setup_cpu_local_masks(void)
+static struct page * __init pcpur_get_page(unsigned int cpu, int pageno)
{
- alloc_bootmem_cpumask_var(&cpu_initialized_mask);
- alloc_bootmem_cpumask_var(&cpu_callin_mask);
- alloc_bootmem_cpumask_var(&cpu_callout_mask);
- alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
-}
+ size_t off = (size_t)pageno << PAGE_SHIFT;
-#else /* CONFIG_X86_32 */
+ if (off >= pcpur_size)
+ return NULL;
-static inline void setup_cpu_local_masks(void)
-{
+ return virt_to_page(pcpur_ptrs[cpu] + off);
}
-#endif /* CONFIG_X86_32 */
-
-/*
- * Great future plan:
- * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
- * Always point %gs to its beginning
- */
-void __init setup_per_cpu_areas(void)
+static ssize_t __init setup_pcpu_remap(size_t static_size)
{
- ssize_t size, old_size;
- char *ptr;
- int cpu;
- unsigned long align = 1;
-
- /* Setup cpu_pda map */
- setup_cpu_pda_map();
+ static struct vm_struct vm;
+ pg_data_t *last;
+ size_t ptrs_size, dyn_size;
+ unsigned int cpu;
+ ssize_t ret;
+
+ /*
+ * If large page isn't supported, there's no benefit in doing
+ * this. Also, on non-NUMA, embedding is better.
+ */
+ if (!cpu_has_pse || pcpu_need_numa())
+ return -EINVAL;
+
+ last = NULL;
+ for_each_possible_cpu(cpu) {
+ int node = early_cpu_to_node(cpu);
- /* Copy section for each CPU (we discard the original) */
- old_size = PERCPU_ENOUGH_ROOM;
- align = max_t(unsigned long, PAGE_SIZE, align);
- size = roundup(old_size, align);
+ if (node_online(node) && NODE_DATA(node) &&
+ last && last != NODE_DATA(node))
+ goto proceed;
- pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
- NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
+ last = NODE_DATA(node);
+ }
+ return -EINVAL;
+
+proceed:
+ /*
+ * Currently supports only single page. Supporting multiple
+ * pages won't be too difficult if it ever becomes necessary.
+ */
+ pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
+ PERCPU_DYNAMIC_RESERVE);
+ if (pcpur_size > PMD_SIZE) {
+ pr_warning("PERCPU: static data is larger than large page, "
+ "can't use large page\n");
+ return -EINVAL;
+ }
+ dyn_size = pcpur_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
- pr_info("PERCPU: Allocating %zd bytes of per cpu data\n", size);
+ /* allocate pointer array and alloc large pages */
+ ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0]));
+ pcpur_ptrs = alloc_bootmem(ptrs_size);
for_each_possible_cpu(cpu) {
-#ifndef CONFIG_NEED_MULTIPLE_NODES
- ptr = __alloc_bootmem(size, align,
- __pa(MAX_DMA_ADDRESS));
-#else
- int node = early_cpu_to_node(cpu);
- if (!node_online(node) || !NODE_DATA(node)) {
- ptr = __alloc_bootmem(size, align,
- __pa(MAX_DMA_ADDRESS));
- pr_info("cpu %d has no node %d or node-local memory\n",
- cpu, node);
- pr_debug("per cpu data for cpu%d at %016lx\n",
- cpu, __pa(ptr));
- } else {
- ptr = __alloc_bootmem_node(NODE_DATA(node), size, align,
- __pa(MAX_DMA_ADDRESS));
- pr_debug("per cpu data for cpu%d on node%d at %016lx\n",
- cpu, node, __pa(ptr));
- }
-#endif
- per_cpu_offset(cpu) = ptr - __per_cpu_start;
- memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
+ pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE);
+ if (!pcpur_ptrs[cpu])
+ goto enomem;
+
+ /*
+ * Only use pcpur_size bytes and give back the rest.
+ *
+ * Ingo: The 2MB up-rounding bootmem is needed to make
+ * sure the partial 2MB page is still fully RAM - it's
+ * not well-specified to have a PAT-incompatible area
+ * (unmapped RAM, device memory, etc.) in that hole.
+ */
+ free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size),
+ PMD_SIZE - pcpur_size);
+
+ memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size);
}
- /* Setup percpu data maps */
- setup_per_cpu_maps();
-
- /* Setup node to cpumask map */
- setup_node_to_cpumask_map();
-
- /* Setup cpu initialized, callin, callout masks */
- setup_cpu_local_masks();
-}
-
-#endif
+ /* allocate address and map */
+ vm.flags = VM_ALLOC;
+ vm.size = num_possible_cpus() * PMD_SIZE;
+ vm_area_register_early(&vm, PMD_SIZE);
-#ifdef X86_64_NUMA
+ for_each_possible_cpu(cpu) {
+ pmd_t *pmd;
-/*
- * Allocate node_to_cpumask_map based on number of available nodes
- * Requires node_possible_map to be valid.
- *
- * Note: node_to_cpumask() is not valid until after this is done.
- */
-static void __init setup_node_to_cpumask_map(void)
-{
- unsigned int node, num = 0;
- cpumask_t *map;
-
- /* setup nr_node_ids if not done yet */
- if (nr_node_ids == MAX_NUMNODES) {
- for_each_node_mask(node, node_possible_map)
- num = node;
- nr_node_ids = num + 1;
+ pmd = populate_extra_pmd((unsigned long)vm.addr
+ + cpu * PMD_SIZE);
+ set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])),
+ PAGE_KERNEL_LARGE));
}
- /* allocate the map */
- map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
-
- pr_debug("Node to cpumask map at %p for %d nodes\n",
- map, nr_node_ids);
-
- /* node_to_cpumask() will now work */
- node_to_cpumask_map = map;
+ /* we're ready, commit */
+ pr_info("PERCPU: Remapped at %p with large pages, static data "
+ "%zu bytes\n", vm.addr, static_size);
+
+ ret = pcpu_setup_first_chunk(pcpur_get_page, static_size,
+ PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
+ PMD_SIZE, vm.addr, NULL);
+ goto out_free_ar;
+
+enomem:
+ for_each_possible_cpu(cpu)
+ if (pcpur_ptrs[cpu])
+ free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE);
+ ret = -ENOMEM;
+out_free_ar:
+ free_bootmem(__pa(pcpur_ptrs), ptrs_size);
+ return ret;
}
-
-void __cpuinit numa_set_node(int cpu, int node)
+#else
+static ssize_t __init setup_pcpu_remap(size_t static_size)
{
- int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
-
- if (cpu_pda(cpu) && node != NUMA_NO_NODE)
- cpu_pda(cpu)->nodenumber = node;
-
- if (cpu_to_node_map)
- cpu_to_node_map[cpu] = node;
-
- else if (per_cpu_offset(cpu))
- per_cpu(x86_cpu_to_node_map, cpu) = node;
-
- else
- pr_debug("Setting node for non-present cpu %d\n", cpu);
+ return -EINVAL;
}
+#endif
-void __cpuinit numa_clear_node(int cpu)
+/*
+ * Embedding allocator
+ *
+ * The first chunk is sized to just contain the static area plus
+ * module and dynamic reserves and embedded into linear physical
+ * mapping so that it can use PMD mapping without additional TLB
+ * pressure.
+ */
+static ssize_t __init setup_pcpu_embed(size_t static_size)
{
- numa_set_node(cpu, NUMA_NO_NODE);
+ size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
+
+ /*
+ * If large page isn't supported, there's no benefit in doing
+ * this. Also, embedding allocation doesn't play well with
+ * NUMA.
+ */
+ if (!cpu_has_pse || pcpu_need_numa())
+ return -EINVAL;
+
+ return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
+ reserve - PERCPU_FIRST_CHUNK_RESERVE, -1);
}
-#ifndef CONFIG_DEBUG_PER_CPU_MAPS
+/*
+ * 4k page allocator
+ *
+ * This is the basic allocator. Static percpu area is allocated
+ * page-by-page and most of initialization is done by the generic
+ * setup function.
+ */
+static struct page **pcpu4k_pages __initdata;
+static int pcpu4k_nr_static_pages __initdata;
-void __cpuinit numa_add_cpu(int cpu)
+static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
{
- cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
+ if (pageno < pcpu4k_nr_static_pages)
+ return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno];
+ return NULL;
}
-void __cpuinit numa_remove_cpu(int cpu)
+static void __init pcpu4k_populate_pte(unsigned long addr)
{
- cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]);
+ populate_extra_pte(addr);
}
-#else /* CONFIG_DEBUG_PER_CPU_MAPS */
-
-/*
- * --------- debug versions of the numa functions ---------
- */
-static void __cpuinit numa_set_cpumask(int cpu, int enable)
+static ssize_t __init setup_pcpu_4k(size_t static_size)
{
- int node = cpu_to_node(cpu);
- cpumask_t *mask;
- char buf[64];
-
- if (node_to_cpumask_map == NULL) {
- printk(KERN_ERR "node_to_cpumask_map NULL\n");
- dump_stack();
- return;
- }
-
- mask = &node_to_cpumask_map[node];
- if (enable)
- cpu_set(cpu, *mask);
- else
- cpu_clear(cpu, *mask);
+ size_t pages_size;
+ unsigned int cpu;
+ int i, j;
+ ssize_t ret;
+
+ pcpu4k_nr_static_pages = PFN_UP(static_size);
+
+ /* unaligned allocations can't be freed, round up to page size */
+ pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus()
+ * sizeof(pcpu4k_pages[0]));
+ pcpu4k_pages = alloc_bootmem(pages_size);
+
+ /* allocate and copy */
+ j = 0;
+ for_each_possible_cpu(cpu)
+ for (i = 0; i < pcpu4k_nr_static_pages; i++) {
+ void *ptr;
+
+ ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE);
+ if (!ptr)
+ goto enomem;
+
+ memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
+ pcpu4k_pages[j++] = virt_to_page(ptr);
+ }
- cpulist_scnprintf(buf, sizeof(buf), mask);
- printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
- enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf);
+ /* we're ready, commit */
+ pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
+ pcpu4k_nr_static_pages, static_size);
+
+ ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size,
+ PERCPU_FIRST_CHUNK_RESERVE, -1,
+ -1, NULL, pcpu4k_populate_pte);
+ goto out_free_ar;
+
+enomem:
+ while (--j >= 0)
+ free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE);
+ ret = -ENOMEM;
+out_free_ar:
+ free_bootmem(__pa(pcpu4k_pages), pages_size);
+ return ret;
}
-void __cpuinit numa_add_cpu(int cpu)
+static inline void setup_percpu_segment(int cpu)
{
- numa_set_cpumask(cpu, 1);
-}
-
-void __cpuinit numa_remove_cpu(int cpu)
-{
- numa_set_cpumask(cpu, 0);
-}
+#ifdef CONFIG_X86_32
+ struct desc_struct gdt;
-int cpu_to_node(int cpu)
-{
- if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
- printk(KERN_WARNING
- "cpu_to_node(%d): usage too early!\n", cpu);
- dump_stack();
- return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
- }
- return per_cpu(x86_cpu_to_node_map, cpu);
+ pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF,
+ 0x2 | DESCTYPE_S, 0x8);
+ gdt.s = 1;
+ write_gdt_entry(get_cpu_gdt_table(cpu),
+ GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
+#endif
}
-EXPORT_SYMBOL(cpu_to_node);
/*
- * Same function as cpu_to_node() but used if called before the
- * per_cpu areas are setup.
+ * Great future plan:
+ * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
+ * Always point %gs to its beginning
*/
-int early_cpu_to_node(int cpu)
+void __init setup_per_cpu_areas(void)
{
- if (early_per_cpu_ptr(x86_cpu_to_node_map))
- return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
-
- if (!per_cpu_offset(cpu)) {
- printk(KERN_WARNING
- "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
- dump_stack();
- return NUMA_NO_NODE;
- }
- return per_cpu(x86_cpu_to_node_map, cpu);
-}
+ size_t static_size = __per_cpu_end - __per_cpu_start;
+ unsigned int cpu;
+ unsigned long delta;
+ size_t pcpu_unit_size;
+ ssize_t ret;
+ pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
+ NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
-/* empty cpumask */
-static const cpumask_t cpu_mask_none;
-
-/*
- * Returns a pointer to the bitmask of CPUs on Node 'node'.
- */
-const cpumask_t *cpumask_of_node(int node)
-{
- if (node_to_cpumask_map == NULL) {
- printk(KERN_WARNING
- "cpumask_of_node(%d): no node_to_cpumask_map!\n",
- node);
- dump_stack();
- return (const cpumask_t *)&cpu_online_map;
- }
- if (node >= nr_node_ids) {
- printk(KERN_WARNING
- "cpumask_of_node(%d): node > nr_node_ids(%d)\n",
- node, nr_node_ids);
- dump_stack();
- return &cpu_mask_none;
- }
- return &node_to_cpumask_map[node];
-}
-EXPORT_SYMBOL(cpumask_of_node);
-
-/*
- * Returns a bitmask of CPUs on Node 'node'.
- *
- * Side note: this function creates the returned cpumask on the stack
- * so with a high NR_CPUS count, excessive stack space is used. The
- * node_to_cpumask_ptr function should be used whenever possible.
- */
-cpumask_t node_to_cpumask(int node)
-{
- if (node_to_cpumask_map == NULL) {
- printk(KERN_WARNING
- "node_to_cpumask(%d): no node_to_cpumask_map!\n", node);
- dump_stack();
- return cpu_online_map;
- }
- if (node >= nr_node_ids) {
- printk(KERN_WARNING
- "node_to_cpumask(%d): node > nr_node_ids(%d)\n",
- node, nr_node_ids);
- dump_stack();
- return cpu_mask_none;
+ /*
+ * Allocate percpu area. If PSE is supported, try to make use
+ * of large page mappings. Please read comments on top of
+ * each allocator for details.
+ */
+ ret = setup_pcpu_remap(static_size);
+ if (ret < 0)
+ ret = setup_pcpu_embed(static_size);
+ if (ret < 0)
+ ret = setup_pcpu_4k(static_size);
+ if (ret < 0)
+ panic("cannot allocate static percpu area (%zu bytes, err=%zd)",
+ static_size, ret);
+
+ pcpu_unit_size = ret;
+
+ /* alrighty, percpu areas up and running */
+ delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
+ for_each_possible_cpu(cpu) {
+ per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size;
+ per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
+ per_cpu(cpu_number, cpu) = cpu;
+ setup_percpu_segment(cpu);
+ setup_stack_canary_segment(cpu);
+ /*
+ * Copy data used in early init routines from the
+ * initial arrays to the per cpu data areas. These
+ * arrays then become expendable and the *_early_ptr's
+ * are zeroed indicating that the static arrays are
+ * gone.
+ */
+#ifdef CONFIG_X86_LOCAL_APIC
+ per_cpu(x86_cpu_to_apicid, cpu) =
+ early_per_cpu_map(x86_cpu_to_apicid, cpu);
+ per_cpu(x86_bios_cpu_apicid, cpu) =
+ early_per_cpu_map(x86_bios_cpu_apicid, cpu);
+#endif
+#ifdef CONFIG_X86_64
+ per_cpu(irq_stack_ptr, cpu) =
+ per_cpu(irq_stack_union.irq_stack, cpu) +
+ IRQ_STACK_SIZE - 64;
+#ifdef CONFIG_NUMA
+ per_cpu(x86_cpu_to_node_map, cpu) =
+ early_per_cpu_map(x86_cpu_to_node_map, cpu);
+#endif
+#endif
+ /*
+ * Up to this point, the boot CPU has been using .data.init
+ * area. Reload any changed state for the boot CPU.
+ */
+ if (cpu == boot_cpu_id)
+ switch_to_new_gdt(cpu);
}
- return node_to_cpumask_map[node];
-}
-EXPORT_SYMBOL(node_to_cpumask);
-/*
- * --------- end of debug versions of the numa functions ---------
- */
-
-#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
+ /* indicate the early static arrays will soon be gone */
+#ifdef CONFIG_X86_LOCAL_APIC
+ early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
+ early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
+#endif
+#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
+ early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
+#endif
-#endif /* X86_64_NUMA */
+ /* Setup node to cpumask map */
+ setup_node_to_cpumask_map();
+ /* Setup cpu initialized, callin, callout masks */
+ setup_cpu_local_masks();
+}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index df0587f24c5..d2cc6428c58 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -50,27 +50,23 @@
# define FIX_EFLAGS __FIX_EFLAGS
#endif
-#define COPY(x) { \
- err |= __get_user(regs->x, &sc->x); \
-}
+#define COPY(x) do { \
+ get_user_ex(regs->x, &sc->x); \
+} while (0)
-#define COPY_SEG(seg) { \
- unsigned short tmp; \
- err |= __get_user(tmp, &sc->seg); \
- regs->seg = tmp; \
-}
+#define GET_SEG(seg) ({ \
+ unsigned short tmp; \
+ get_user_ex(tmp, &sc->seg); \
+ tmp; \
+})
-#define COPY_SEG_CPL3(seg) { \
- unsigned short tmp; \
- err |= __get_user(tmp, &sc->seg); \
- regs->seg = tmp | 3; \
-}
+#define COPY_SEG(seg) do { \
+ regs->seg = GET_SEG(seg); \
+} while (0)
-#define GET_SEG(seg) { \
- unsigned short tmp; \
- err |= __get_user(tmp, &sc->seg); \
- loadsegment(seg, tmp); \
-}
+#define COPY_SEG_CPL3(seg) do { \
+ regs->seg = GET_SEG(seg) | 3; \
+} while (0)
static int
restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
@@ -83,45 +79,49 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
/* Always make any pending restarted system calls return -EINTR */
current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ get_user_try {
+
#ifdef CONFIG_X86_32
- GET_SEG(gs);
- COPY_SEG(fs);
- COPY_SEG(es);
- COPY_SEG(ds);
+ set_user_gs(regs, GET_SEG(gs));
+ COPY_SEG(fs);
+ COPY_SEG(es);
+ COPY_SEG(ds);
#endif /* CONFIG_X86_32 */
- COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
- COPY(dx); COPY(cx); COPY(ip);
+ COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
+ COPY(dx); COPY(cx); COPY(ip);
#ifdef CONFIG_X86_64
- COPY(r8);
- COPY(r9);
- COPY(r10);
- COPY(r11);
- COPY(r12);
- COPY(r13);
- COPY(r14);
- COPY(r15);
+ COPY(r8);
+ COPY(r9);
+ COPY(r10);
+ COPY(r11);
+ COPY(r12);
+ COPY(r13);
+ COPY(r14);
+ COPY(r15);
#endif /* CONFIG_X86_64 */
#ifdef CONFIG_X86_32
- COPY_SEG_CPL3(cs);
- COPY_SEG_CPL3(ss);
+ COPY_SEG_CPL3(cs);
+ COPY_SEG_CPL3(ss);
#else /* !CONFIG_X86_32 */
- /* Kernel saves and restores only the CS segment register on signals,
- * which is the bare minimum needed to allow mixed 32/64-bit code.
- * App's signal handler can save/restore other segments if needed. */
- COPY_SEG_CPL3(cs);
+ /* Kernel saves and restores only the CS segment register on signals,
+ * which is the bare minimum needed to allow mixed 32/64-bit code.
+ * App's signal handler can save/restore other segments if needed. */
+ COPY_SEG_CPL3(cs);
#endif /* CONFIG_X86_32 */
- err |= __get_user(tmpflags, &sc->flags);
- regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
- regs->orig_ax = -1; /* disable syscall checks */
+ get_user_ex(tmpflags, &sc->flags);
+ regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
+ regs->orig_ax = -1; /* disable syscall checks */
- err |= __get_user(buf, &sc->fpstate);
- err |= restore_i387_xstate(buf);
+ get_user_ex(buf, &sc->fpstate);
+ err |= restore_i387_xstate(buf);
+
+ get_user_ex(*pax, &sc->ax);
+ } get_user_catch(err);
- err |= __get_user(*pax, &sc->ax);
return err;
}
@@ -131,57 +131,55 @@ setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
{
int err = 0;
-#ifdef CONFIG_X86_32
- {
- unsigned int tmp;
+ put_user_try {
- savesegment(gs, tmp);
- err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
- }
- err |= __put_user(regs->fs, (unsigned int __user *)&sc->fs);
- err |= __put_user(regs->es, (unsigned int __user *)&sc->es);
- err |= __put_user(regs->ds, (unsigned int __user *)&sc->ds);
+#ifdef CONFIG_X86_32
+ put_user_ex(get_user_gs(regs), (unsigned int __user *)&sc->gs);
+ put_user_ex(regs->fs, (unsigned int __user *)&sc->fs);
+ put_user_ex(regs->es, (unsigned int __user *)&sc->es);
+ put_user_ex(regs->ds, (unsigned int __user *)&sc->ds);
#endif /* CONFIG_X86_32 */
- err |= __put_user(regs->di, &sc->di);
- err |= __put_user(regs->si, &sc->si);
- err |= __put_user(regs->bp, &sc->bp);
- err |= __put_user(regs->sp, &sc->sp);
- err |= __put_user(regs->bx, &sc->bx);
- err |= __put_user(regs->dx, &sc->dx);
- err |= __put_user(regs->cx, &sc->cx);
- err |= __put_user(regs->ax, &sc->ax);
+ put_user_ex(regs->di, &sc->di);
+ put_user_ex(regs->si, &sc->si);
+ put_user_ex(regs->bp, &sc->bp);
+ put_user_ex(regs->sp, &sc->sp);
+ put_user_ex(regs->bx, &sc->bx);
+ put_user_ex(regs->dx, &sc->dx);
+ put_user_ex(regs->cx, &sc->cx);
+ put_user_ex(regs->ax, &sc->ax);
#ifdef CONFIG_X86_64
- err |= __put_user(regs->r8, &sc->r8);
- err |= __put_user(regs->r9, &sc->r9);
- err |= __put_user(regs->r10, &sc->r10);
- err |= __put_user(regs->r11, &sc->r11);
- err |= __put_user(regs->r12, &sc->r12);
- err |= __put_user(regs->r13, &sc->r13);
- err |= __put_user(regs->r14, &sc->r14);
- err |= __put_user(regs->r15, &sc->r15);
+ put_user_ex(regs->r8, &sc->r8);
+ put_user_ex(regs->r9, &sc->r9);
+ put_user_ex(regs->r10, &sc->r10);
+ put_user_ex(regs->r11, &sc->r11);
+ put_user_ex(regs->r12, &sc->r12);
+ put_user_ex(regs->r13, &sc->r13);
+ put_user_ex(regs->r14, &sc->r14);
+ put_user_ex(regs->r15, &sc->r15);
#endif /* CONFIG_X86_64 */
- err |= __put_user(current->thread.trap_no, &sc->trapno);
- err |= __put_user(current->thread.error_code, &sc->err);
- err |= __put_user(regs->ip, &sc->ip);
+ put_user_ex(current->thread.trap_no, &sc->trapno);
+ put_user_ex(current->thread.error_code, &sc->err);
+ put_user_ex(regs->ip, &sc->ip);
#ifdef CONFIG_X86_32
- err |= __put_user(regs->cs, (unsigned int __user *)&sc->cs);
- err |= __put_user(regs->flags, &sc->flags);
- err |= __put_user(regs->sp, &sc->sp_at_signal);
- err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss);
+ put_user_ex(regs->cs, (unsigned int __user *)&sc->cs);
+ put_user_ex(regs->flags, &sc->flags);
+ put_user_ex(regs->sp, &sc->sp_at_signal);
+ put_user_ex(regs->ss, (unsigned int __user *)&sc->ss);
#else /* !CONFIG_X86_32 */
- err |= __put_user(regs->flags, &sc->flags);
- err |= __put_user(regs->cs, &sc->cs);
- err |= __put_user(0, &sc->gs);
- err |= __put_user(0, &sc->fs);
+ put_user_ex(regs->flags, &sc->flags);
+ put_user_ex(regs->cs, &sc->cs);
+ put_user_ex(0, &sc->gs);
+ put_user_ex(0, &sc->fs);
#endif /* CONFIG_X86_32 */
- err |= __put_user(fpstate, &sc->fpstate);
+ put_user_ex(fpstate, &sc->fpstate);
- /* non-iBCS2 extensions.. */
- err |= __put_user(mask, &sc->oldmask);
- err |= __put_user(current->thread.cr2, &sc->cr2);
+ /* non-iBCS2 extensions.. */
+ put_user_ex(mask, &sc->oldmask);
+ put_user_ex(current->thread.cr2, &sc->cr2);
+ } put_user_catch(err);
return err;
}
@@ -189,40 +187,35 @@ setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
/*
* Set up a signal frame.
*/
-#ifdef CONFIG_X86_32
-static const struct {
- u16 poplmovl;
- u32 val;
- u16 int80;
-} __attribute__((packed)) retcode = {
- 0xb858, /* popl %eax; movl $..., %eax */
- __NR_sigreturn,
- 0x80cd, /* int $0x80 */
-};
-
-static const struct {
- u8 movl;
- u32 val;
- u16 int80;
- u8 pad;
-} __attribute__((packed)) rt_retcode = {
- 0xb8, /* movl $..., %eax */
- __NR_rt_sigreturn,
- 0x80cd, /* int $0x80 */
- 0
-};
/*
* Determine which stack to use..
*/
+static unsigned long align_sigframe(unsigned long sp)
+{
+#ifdef CONFIG_X86_32
+ /*
+ * Align the stack pointer according to the i386 ABI,
+ * i.e. so that on function entry ((sp + 4) & 15) == 0.
+ */
+ sp = ((sp + 4) & -16ul) - 4;
+#else /* !CONFIG_X86_32 */
+ sp = round_down(sp, 16) - 8;
+#endif
+ return sp;
+}
+
static inline void __user *
get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
- void **fpstate)
+ void __user **fpstate)
{
- unsigned long sp;
-
/* Default to using normal stack */
- sp = regs->sp;
+ unsigned long sp = regs->sp;
+
+#ifdef CONFIG_X86_64
+ /* redzone */
+ sp -= 128;
+#endif /* CONFIG_X86_64 */
/*
* If we are on the alternate signal stack and would overflow it, don't.
@@ -236,30 +229,52 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
if (sas_ss_flags(sp) == 0)
sp = current->sas_ss_sp + current->sas_ss_size;
} else {
+#ifdef CONFIG_X86_32
/* This is the legacy signal stack switching. */
if ((regs->ss & 0xffff) != __USER_DS &&
!(ka->sa.sa_flags & SA_RESTORER) &&
ka->sa.sa_restorer)
sp = (unsigned long) ka->sa.sa_restorer;
+#endif /* CONFIG_X86_32 */
}
if (used_math()) {
- sp = sp - sig_xstate_size;
- *fpstate = (struct _fpstate *) sp;
+ sp -= sig_xstate_size;
+#ifdef CONFIG_X86_64
+ sp = round_down(sp, 64);
+#endif /* CONFIG_X86_64 */
+ *fpstate = (void __user *)sp;
+
if (save_i387_xstate(*fpstate) < 0)
return (void __user *)-1L;
}
- sp -= frame_size;
- /*
- * Align the stack pointer according to the i386 ABI,
- * i.e. so that on function entry ((sp + 4) & 15) == 0.
- */
- sp = ((sp + 4) & -16ul) - 4;
-
- return (void __user *) sp;
+ return (void __user *)align_sigframe(sp - frame_size);
}
+#ifdef CONFIG_X86_32
+static const struct {
+ u16 poplmovl;
+ u32 val;
+ u16 int80;
+} __attribute__((packed)) retcode = {
+ 0xb858, /* popl %eax; movl $..., %eax */
+ __NR_sigreturn,
+ 0x80cd, /* int $0x80 */
+};
+
+static const struct {
+ u8 movl;
+ u32 val;
+ u16 int80;
+ u8 pad;
+} __attribute__((packed)) rt_retcode = {
+ 0xb8, /* movl $..., %eax */
+ __NR_rt_sigreturn,
+ 0x80cd, /* int $0x80 */
+ 0
+};
+
static int
__setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
struct pt_regs *regs)
@@ -336,43 +351,41 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
return -EFAULT;
- err |= __put_user(sig, &frame->sig);
- err |= __put_user(&frame->info, &frame->pinfo);
- err |= __put_user(&frame->uc, &frame->puc);
- err |= copy_siginfo_to_user(&frame->info, info);
- if (err)
- return -EFAULT;
+ put_user_try {
+ put_user_ex(sig, &frame->sig);
+ put_user_ex(&frame->info, &frame->pinfo);
+ put_user_ex(&frame->uc, &frame->puc);
+ err |= copy_siginfo_to_user(&frame->info, info);
- /* Create the ucontext. */
- if (cpu_has_xsave)
- err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
- else
- err |= __put_user(0, &frame->uc.uc_flags);
- err |= __put_user(0, &frame->uc.uc_link);
- err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
- err |= __put_user(sas_ss_flags(regs->sp),
- &frame->uc.uc_stack.ss_flags);
- err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
- err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
- regs, set->sig[0]);
- err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
- if (err)
- return -EFAULT;
-
- /* Set up to return from userspace. */
- restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn);
- if (ka->sa.sa_flags & SA_RESTORER)
- restorer = ka->sa.sa_restorer;
- err |= __put_user(restorer, &frame->pretcode);
+ /* Create the ucontext. */
+ if (cpu_has_xsave)
+ put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags);
+ else
+ put_user_ex(0, &frame->uc.uc_flags);
+ put_user_ex(0, &frame->uc.uc_link);
+ put_user_ex(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
+ put_user_ex(sas_ss_flags(regs->sp),
+ &frame->uc.uc_stack.ss_flags);
+ put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
+ err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
+ regs, set->sig[0]);
+ err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
+
+ /* Set up to return from userspace. */
+ restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn);
+ if (ka->sa.sa_flags & SA_RESTORER)
+ restorer = ka->sa.sa_restorer;
+ put_user_ex(restorer, &frame->pretcode);
- /*
- * This is movl $__NR_rt_sigreturn, %ax ; int $0x80
- *
- * WE DO NOT USE IT ANY MORE! It's only left here for historical
- * reasons and because gdb uses it as a signature to notice
- * signal handler stack frames.
- */
- err |= __put_user(*((u64 *)&rt_retcode), (u64 *)frame->retcode);
+ /*
+ * This is movl $__NR_rt_sigreturn, %ax ; int $0x80
+ *
+ * WE DO NOT USE IT ANY MORE! It's only left here for historical
+ * reasons and because gdb uses it as a signature to notice
+ * signal handler stack frames.
+ */
+ put_user_ex(*((u64 *)&rt_retcode), (u64 *)frame->retcode);
+ } put_user_catch(err);
if (err)
return -EFAULT;
@@ -392,24 +405,6 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
return 0;
}
#else /* !CONFIG_X86_32 */
-/*
- * Determine which stack to use..
- */
-static void __user *
-get_stack(struct k_sigaction *ka, unsigned long sp, unsigned long size)
-{
- /* Default to using normal stack - redzone*/
- sp -= 128;
-
- /* This is the X/Open sanctioned signal stack switching. */
- if (ka->sa.sa_flags & SA_ONSTACK) {
- if (sas_ss_flags(sp) == 0)
- sp = current->sas_ss_sp + current->sas_ss_size;
- }
-
- return (void __user *)round_down(sp - size, 64);
-}
-
static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
sigset_t *set, struct pt_regs *regs)
{
@@ -418,15 +413,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
int err = 0;
struct task_struct *me = current;
- if (used_math()) {
- fp = get_stack(ka, regs->sp, sig_xstate_size);
- frame = (void __user *)round_down(
- (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
-
- if (save_i387_xstate(fp) < 0)
- return -EFAULT;
- } else
- frame = get_stack(ka, regs->sp, sizeof(struct rt_sigframe)) - 8;
+ frame = get_sigframe(ka, regs, sizeof(struct rt_sigframe), &fp);
if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
return -EFAULT;
@@ -436,28 +423,30 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
return -EFAULT;
}
- /* Create the ucontext. */
- if (cpu_has_xsave)
- err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
- else
- err |= __put_user(0, &frame->uc.uc_flags);
- err |= __put_user(0, &frame->uc.uc_link);
- err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
- err |= __put_user(sas_ss_flags(regs->sp),
- &frame->uc.uc_stack.ss_flags);
- err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
- err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]);
- err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
-
- /* Set up to return from userspace. If provided, use a stub
- already in userspace. */
- /* x86-64 should always use SA_RESTORER. */
- if (ka->sa.sa_flags & SA_RESTORER) {
- err |= __put_user(ka->sa.sa_restorer, &frame->pretcode);
- } else {
- /* could use a vstub here */
- return -EFAULT;
- }
+ put_user_try {
+ /* Create the ucontext. */
+ if (cpu_has_xsave)
+ put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags);
+ else
+ put_user_ex(0, &frame->uc.uc_flags);
+ put_user_ex(0, &frame->uc.uc_link);
+ put_user_ex(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
+ put_user_ex(sas_ss_flags(regs->sp),
+ &frame->uc.uc_stack.ss_flags);
+ put_user_ex(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
+ err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]);
+ err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
+
+ /* Set up to return from userspace. If provided, use a stub
+ already in userspace. */
+ /* x86-64 should always use SA_RESTORER. */
+ if (ka->sa.sa_flags & SA_RESTORER) {
+ put_user_ex(ka->sa.sa_restorer, &frame->pretcode);
+ } else {
+ /* could use a vstub here */
+ err |= -EFAULT;
+ }
+ } put_user_catch(err);
if (err)
return -EFAULT;
@@ -509,31 +498,41 @@ sys_sigaction(int sig, const struct old_sigaction __user *act,
struct old_sigaction __user *oact)
{
struct k_sigaction new_ka, old_ka;
- int ret;
+ int ret = 0;
if (act) {
old_sigset_t mask;
- if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
- __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
- __get_user(new_ka.sa.sa_restorer, &act->sa_restorer))
+ if (!access_ok(VERIFY_READ, act, sizeof(*act)))
return -EFAULT;
- __get_user(new_ka.sa.sa_flags, &act->sa_flags);
- __get_user(mask, &act->sa_mask);
+ get_user_try {
+ get_user_ex(new_ka.sa.sa_handler, &act->sa_handler);
+ get_user_ex(new_ka.sa.sa_flags, &act->sa_flags);
+ get_user_ex(mask, &act->sa_mask);
+ get_user_ex(new_ka.sa.sa_restorer, &act->sa_restorer);
+ } get_user_catch(ret);
+
+ if (ret)
+ return -EFAULT;
siginitset(&new_ka.sa.sa_mask, mask);
}
ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
if (!ret && oact) {
- if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
- __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
- __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer))
+ if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)))
return -EFAULT;
- __put_user(old_ka.sa.sa_flags, &oact->sa_flags);
- __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask);
+ put_user_try {
+ put_user_ex(old_ka.sa.sa_handler, &oact->sa_handler);
+ put_user_ex(old_ka.sa.sa_flags, &oact->sa_flags);
+ put_user_ex(old_ka.sa.sa_mask.sig[0], &oact->sa_mask);
+ put_user_ex(old_ka.sa.sa_restorer, &oact->sa_restorer);
+ } put_user_catch(ret);
+
+ if (ret)
+ return -EFAULT;
}
return ret;
@@ -541,14 +540,9 @@ sys_sigaction(int sig, const struct old_sigaction __user *act,
#endif /* CONFIG_X86_32 */
#ifdef CONFIG_X86_32
-asmlinkage int sys_sigaltstack(unsigned long bx)
+int sys_sigaltstack(struct pt_regs *regs)
{
- /*
- * This is needed to make gcc realize it doesn't own the
- * "struct pt_regs"
- */
- struct pt_regs *regs = (struct pt_regs *)&bx;
- const stack_t __user *uss = (const stack_t __user *)bx;
+ const stack_t __user *uss = (const stack_t __user *)regs->bx;
stack_t __user *uoss = (stack_t __user *)regs->cx;
return do_sigaltstack(uss, uoss, regs->sp);
@@ -566,14 +560,12 @@ sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
* Do a signal return; undo the signal stack.
*/
#ifdef CONFIG_X86_32
-asmlinkage unsigned long sys_sigreturn(unsigned long __unused)
+unsigned long sys_sigreturn(struct pt_regs *regs)
{
struct sigframe __user *frame;
- struct pt_regs *regs;
unsigned long ax;
sigset_t set;
- regs = (struct pt_regs *) &__unused;
frame = (struct sigframe __user *)(regs->sp - 8);
if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
@@ -600,7 +592,7 @@ badframe:
}
#endif /* CONFIG_X86_32 */
-static long do_rt_sigreturn(struct pt_regs *regs)
+long sys_rt_sigreturn(struct pt_regs *regs)
{
struct rt_sigframe __user *frame;
unsigned long ax;
@@ -631,25 +623,6 @@ badframe:
return 0;
}
-#ifdef CONFIG_X86_32
-/*
- * Note: do not pass in pt_regs directly as with tail-call optimization
- * GCC will incorrectly stomp on the caller's frame and corrupt user-space
- * register state:
- */
-asmlinkage int sys_rt_sigreturn(unsigned long __unused)
-{
- struct pt_regs *regs = (struct pt_regs *)&__unused;
-
- return do_rt_sigreturn(regs);
-}
-#else /* !CONFIG_X86_32 */
-asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
-{
- return do_rt_sigreturn(regs);
-}
-#endif /* CONFIG_X86_32 */
-
/*
* OK, we're invoking a handler:
*/
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index e6faa3316bd..13f33ea8cca 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -2,7 +2,7 @@
* Intel SMP support routines.
*
* (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
- * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
+ * (c) 1998-99, 2000, 2009 Ingo Molnar <mingo@redhat.com>
* (c) 2002,2003 Andi Kleen, SuSE Labs.
*
* i386 and x86_64 integration by Glauber Costa <gcosta@redhat.com>
@@ -26,8 +26,7 @@
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
-#include <mach_ipi.h>
-#include <mach_apic.h>
+#include <asm/apic.h>
/*
* Some notes on x86 processor bugs affecting SMP operation:
*
@@ -118,12 +117,12 @@ static void native_smp_send_reschedule(int cpu)
WARN_ON(1);
return;
}
- send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR);
+ apic->send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR);
}
void native_send_call_func_single_ipi(int cpu)
{
- send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR);
+ apic->send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR);
}
void native_send_call_func_ipi(const struct cpumask *mask)
@@ -131,7 +130,7 @@ void native_send_call_func_ipi(const struct cpumask *mask)
cpumask_var_t allbutself;
if (!alloc_cpumask_var(&allbutself, GFP_ATOMIC)) {
- send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
+ apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
return;
}
@@ -140,9 +139,9 @@ void native_send_call_func_ipi(const struct cpumask *mask)
if (cpumask_equal(mask, allbutself) &&
cpumask_equal(cpu_online_mask, cpu_callout_mask))
- send_IPI_allbutself(CALL_FUNCTION_VECTOR);
+ apic->send_IPI_allbutself(CALL_FUNCTION_VECTOR);
else
- send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
+ apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
free_cpumask_var(allbutself);
}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index bb1a3b1fc87..ef7d10170c3 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -2,7 +2,7 @@
* x86 SMP booting functions
*
* (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
- * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
+ * (c) 1998, 1999, 2000, 2009 Ingo Molnar <mingo@redhat.com>
* Copyright 2001 Andi Kleen, SuSE Labs.
*
* Much of the core SMP work is based on previous work by Thomas Radke, to
@@ -53,7 +53,6 @@
#include <asm/nmi.h>
#include <asm/irq.h>
#include <asm/idle.h>
-#include <asm/smp.h>
#include <asm/trampoline.h>
#include <asm/cpu.h>
#include <asm/numa.h>
@@ -61,13 +60,12 @@
#include <asm/tlbflush.h>
#include <asm/mtrr.h>
#include <asm/vmi.h>
-#include <asm/genapic.h>
+#include <asm/apic.h>
#include <asm/setup.h>
+#include <asm/uv/uv.h>
#include <linux/mc146818rtc.h>
-#include <mach_apic.h>
-#include <mach_wakecpu.h>
-#include <smpboot_hooks.h>
+#include <asm/smpboot_hooks.h>
#ifdef CONFIG_X86_32
u8 apicid_2_node[MAX_APICID];
@@ -114,11 +112,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_core_map);
DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
EXPORT_PER_CPU_SYMBOL(cpu_info);
-static atomic_t init_deasserted;
-
-
-/* Set if we find a B stepping CPU */
-static int __cpuinitdata smp_b_stepping;
+atomic_t init_deasserted;
#if defined(CONFIG_NUMA) && defined(CONFIG_X86_32)
@@ -163,7 +157,7 @@ static void map_cpu_to_logical_apicid(void)
{
int cpu = smp_processor_id();
int apicid = logical_smp_processor_id();
- int node = apicid_to_node(apicid);
+ int node = apic->apicid_to_node(apicid);
if (!node_online(node))
node = first_online_node;
@@ -196,7 +190,8 @@ static void __cpuinit smp_callin(void)
* our local APIC. We have to wait for the IPI or we'll
* lock up on an APIC access.
*/
- wait_for_init_deassert(&init_deasserted);
+ if (apic->wait_for_init_deassert)
+ apic->wait_for_init_deassert(&init_deasserted);
/*
* (This works even if the APIC is not enabled.)
@@ -243,7 +238,8 @@ static void __cpuinit smp_callin(void)
*/
pr_debug("CALLIN, before setup_local_APIC().\n");
- smp_callin_clear_local_apic();
+ if (apic->smp_callin_clear_local_apic)
+ apic->smp_callin_clear_local_apic();
setup_local_APIC();
end_local_APIC_setup();
map_cpu_to_logical_apicid();
@@ -271,8 +267,6 @@ static void __cpuinit smp_callin(void)
cpumask_set_cpu(cpuid, cpu_callin_mask);
}
-static int __cpuinitdata unsafe_smp;
-
/*
* Activate a secondary processor.
*/
@@ -340,76 +334,6 @@ notrace static void __cpuinit start_secondary(void *unused)
cpu_idle();
}
-static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c)
-{
- /*
- * Mask B, Pentium, but not Pentium MMX
- */
- if (c->x86_vendor == X86_VENDOR_INTEL &&
- c->x86 == 5 &&
- c->x86_mask >= 1 && c->x86_mask <= 4 &&
- c->x86_model <= 3)
- /*
- * Remember we have B step Pentia with bugs
- */
- smp_b_stepping = 1;
-
- /*
- * Certain Athlons might work (for various values of 'work') in SMP
- * but they are not certified as MP capable.
- */
- if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) {
-
- if (num_possible_cpus() == 1)
- goto valid_k7;
-
- /* Athlon 660/661 is valid. */
- if ((c->x86_model == 6) && ((c->x86_mask == 0) ||
- (c->x86_mask == 1)))
- goto valid_k7;
-
- /* Duron 670 is valid */
- if ((c->x86_model == 7) && (c->x86_mask == 0))
- goto valid_k7;
-
- /*
- * Athlon 662, Duron 671, and Athlon >model 7 have capability
- * bit. It's worth noting that the A5 stepping (662) of some
- * Athlon XP's have the MP bit set.
- * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for
- * more.
- */
- if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
- ((c->x86_model == 7) && (c->x86_mask >= 1)) ||
- (c->x86_model > 7))
- if (cpu_has_mp)
- goto valid_k7;
-
- /* If we get here, not a certified SMP capable AMD system. */
- unsafe_smp = 1;
- }
-
-valid_k7:
- ;
-}
-
-static void __cpuinit smp_checks(void)
-{
- if (smp_b_stepping)
- printk(KERN_WARNING "WARNING: SMP operation may be unreliable"
- "with B stepping processors.\n");
-
- /*
- * Don't taint if we are running SMP kernel on a single non-MP
- * approved Athlon
- */
- if (unsafe_smp && num_online_cpus() > 1) {
- printk(KERN_INFO "WARNING: This combination of AMD"
- "processors is not suitable for SMP.\n");
- add_taint(TAINT_UNSAFE_SMP);
- }
-}
-
/*
* The bootstrap kernel entry code has set these up. Save them for
* a given CPU
@@ -423,7 +347,6 @@ void __cpuinit smp_store_cpu_info(int id)
c->cpu_index = id;
if (id != 0)
identify_secondary_cpu(c);
- smp_apply_quirks(c);
}
@@ -583,7 +506,7 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
/* Target chip */
/* Boot on the stack */
/* Kick the second */
- apic_icr_write(APIC_DM_NMI | APIC_DEST_LOGICAL, logical_apicid);
+ apic_icr_write(APIC_DM_NMI | apic->dest_logical, logical_apicid);
pr_debug("Waiting for send to finish...\n");
send_status = safe_apic_wait_icr_idle();
@@ -614,12 +537,6 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
unsigned long send_status, accept_status = 0;
int maxlvt, num_starts, j;
- if (get_uv_system_type() == UV_NON_UNIQUE_APIC) {
- send_status = uv_wakeup_secondary(phys_apicid, start_eip);
- atomic_set(&init_deasserted, 1);
- return send_status;
- }
-
maxlvt = lapic_get_maxlvt();
/*
@@ -745,78 +662,23 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
complete(&c_idle->done);
}
-#ifdef CONFIG_X86_64
-
-/* __ref because it's safe to call free_bootmem when after_bootmem == 0. */
-static void __ref free_bootmem_pda(struct x8664_pda *oldpda)
-{
- if (!after_bootmem)
- free_bootmem((unsigned long)oldpda, sizeof(*oldpda));
-}
-
-/*
- * Allocate node local memory for the AP pda.
- *
- * Must be called after the _cpu_pda pointer table is initialized.
- */
-int __cpuinit get_local_pda(int cpu)
-{
- struct x8664_pda *oldpda, *newpda;
- unsigned long size = sizeof(struct x8664_pda);
- int node = cpu_to_node(cpu);
-
- if (cpu_pda(cpu) && !cpu_pda(cpu)->in_bootmem)
- return 0;
-
- oldpda = cpu_pda(cpu);
- newpda = kmalloc_node(size, GFP_ATOMIC, node);
- if (!newpda) {
- printk(KERN_ERR "Could not allocate node local PDA "
- "for CPU %d on node %d\n", cpu, node);
-
- if (oldpda)
- return 0; /* have a usable pda */
- else
- return -1;
- }
-
- if (oldpda) {
- memcpy(newpda, oldpda, size);
- free_bootmem_pda(oldpda);
- }
-
- newpda->in_bootmem = 0;
- cpu_pda(cpu) = newpda;
- return 0;
-}
-#endif /* CONFIG_X86_64 */
-
-static int __cpuinit do_boot_cpu(int apicid, int cpu)
/*
* NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
* (ie clustered apic addressing mode), this is a LOGICAL apic ID.
- * Returns zero if CPU booted OK, else error code from wakeup_secondary_cpu.
+ * Returns zero if CPU booted OK, else error code from
+ * ->wakeup_secondary_cpu.
*/
+static int __cpuinit do_boot_cpu(int apicid, int cpu)
{
unsigned long boot_error = 0;
- int timeout;
unsigned long start_ip;
- unsigned short nmi_high = 0, nmi_low = 0;
+ int timeout;
struct create_idle c_idle = {
- .cpu = cpu,
- .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
+ .cpu = cpu,
+ .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
};
- INIT_WORK(&c_idle.work, do_fork_idle);
-#ifdef CONFIG_X86_64
- /* Allocate node local memory for AP pdas */
- if (cpu > 0) {
- boot_error = get_local_pda(cpu);
- if (boot_error)
- goto restore_state;
- /* if can't get pda memory, can't start cpu */
- }
-#endif
+ INIT_WORK(&c_idle.work, do_fork_idle);
alternatives_smp_switch(1);
@@ -847,14 +709,16 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
set_idle_for_cpu(cpu, c_idle.idle);
do_rest:
-#ifdef CONFIG_X86_32
per_cpu(current_task, cpu) = c_idle.idle;
- init_gdt(cpu);
+#ifdef CONFIG_X86_32
/* Stack for startup_32 can be just as for start_secondary onwards */
irq_ctx_init(cpu);
#else
- cpu_pda(cpu)->pcurrent = c_idle.idle;
clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
+ initial_gs = per_cpu_offset(cpu);
+ per_cpu(kernel_stack, cpu) =
+ (unsigned long)task_stack_page(c_idle.idle) -
+ KERNEL_STACK_OFFSET + THREAD_SIZE;
#endif
early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
initial_code = (unsigned long)start_secondary;
@@ -878,8 +742,6 @@ do_rest:
pr_debug("Setting warm reset code and vector.\n");
- store_NMI_vector(&nmi_high, &nmi_low);
-
smpboot_setup_warm_reset_vector(start_ip);
/*
* Be paranoid about clearing APIC errors.
@@ -891,9 +753,13 @@ do_rest:
}
/*
- * Starting actual IPI sequence...
+ * Kick the secondary CPU. Use the method in the APIC driver
+ * if it's defined - or use an INIT boot APIC message otherwise:
*/
- boot_error = wakeup_secondary_cpu(apicid, start_ip);
+ if (apic->wakeup_secondary_cpu)
+ boot_error = apic->wakeup_secondary_cpu(apicid, start_ip);
+ else
+ boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip);
if (!boot_error) {
/*
@@ -927,13 +793,11 @@ do_rest:
else
/* trampoline code not run */
printk(KERN_ERR "Not responding.\n");
- if (get_uv_system_type() != UV_NON_UNIQUE_APIC)
- inquire_remote_apic(apicid);
+ if (apic->inquire_remote_apic)
+ apic->inquire_remote_apic(apicid);
}
}
-#ifdef CONFIG_X86_64
-restore_state:
-#endif
+
if (boot_error) {
/* Try to put things back the way they were before ... */
numa_remove_cpu(cpu); /* was set by numa_add_cpu */
@@ -961,7 +825,7 @@ restore_state:
int __cpuinit native_cpu_up(unsigned int cpu)
{
- int apicid = cpu_present_to_apicid(cpu);
+ int apicid = apic->cpu_present_to_apicid(cpu);
unsigned long flags;
int err;
@@ -1054,14 +918,14 @@ static int __init smp_sanity_check(unsigned max_cpus)
{
preempt_disable();
-#if defined(CONFIG_X86_PC) && defined(CONFIG_X86_32)
+#if !defined(CONFIG_X86_BIGSMP) && defined(CONFIG_X86_32)
if (def_to_bigsmp && nr_cpu_ids > 8) {
unsigned int cpu;
unsigned nr;
printk(KERN_WARNING
"More than 8 CPUs detected - skipping them.\n"
- "Use CONFIG_X86_GENERICARCH and CONFIG_X86_BIGSMP.\n");
+ "Use CONFIG_X86_BIGSMP.\n");
nr = 0;
for_each_present_cpu(cpu) {
@@ -1107,7 +971,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
* Should not be necessary because the MP table should list the boot
* CPU too, but we do it for the sake of robustness anyway.
*/
- if (!check_phys_apicid_present(boot_cpu_physical_apicid)) {
+ if (!apic->check_phys_apicid_present(boot_cpu_physical_apicid)) {
printk(KERN_NOTICE
"weird, boot CPU (#%d) not listed by the BIOS.\n",
boot_cpu_physical_apicid);
@@ -1125,6 +989,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
printk(KERN_ERR "... forcing use of dummy APIC emulation."
"(tell your hw vendor)\n");
smpboot_clear_io_apic();
+ arch_disable_smp_support();
return -1;
}
@@ -1181,9 +1046,9 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
current_thread_info()->cpu = 0; /* needed? */
set_cpu_sibling_map(0);
-#ifdef CONFIG_X86_64
enable_IR_x2apic();
- setup_apic_routing();
+#ifdef CONFIG_X86_64
+ default_setup_apic_routing();
#endif
if (smp_sanity_check(max_cpus) < 0) {
@@ -1207,18 +1072,18 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
*/
setup_local_APIC();
-#ifdef CONFIG_X86_64
/*
* Enable IO APIC before setting up error vector
*/
if (!skip_ioapic_setup && nr_ioapics)
enable_IO_APIC();
-#endif
+
end_local_APIC_setup();
map_cpu_to_logical_apicid();
- setup_portio_remap();
+ if (apic->setup_portio_remap)
+ apic->setup_portio_remap();
smpboot_setup_io_apic();
/*
@@ -1240,10 +1105,7 @@ out:
void __init native_smp_prepare_boot_cpu(void)
{
int me = smp_processor_id();
-#ifdef CONFIG_X86_32
- init_gdt(me);
-#endif
- switch_to_new_gdt();
+ switch_to_new_gdt(me);
/* already set me in cpu_online_mask in boot_cpu_init() */
cpumask_set_cpu(me, cpu_callout_mask);
per_cpu(cpu_state, me) = CPU_ONLINE;
@@ -1254,7 +1116,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
pr_debug("Boot done.\n");
impress_friends();
- smp_checks();
#ifdef CONFIG_X86_IO_APIC
setup_ioapic_dest();
#endif
diff --git a/arch/x86/kernel/smpcommon.c b/arch/x86/kernel/smpcommon.c
deleted file mode 100644
index 397e309839d..00000000000
--- a/arch/x86/kernel/smpcommon.c
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * SMP stuff which is common to all sub-architectures.
- */
-#include <linux/module.h>
-#include <asm/smp.h>
-
-#ifdef CONFIG_X86_32
-DEFINE_PER_CPU(unsigned long, this_cpu_off);
-EXPORT_PER_CPU_SYMBOL(this_cpu_off);
-
-/*
- * Initialize the CPU's GDT. This is either the boot CPU doing itself
- * (still using the master per-cpu area), or a CPU doing it for a
- * secondary which will soon come up.
- */
-__cpuinit void init_gdt(int cpu)
-{
- struct desc_struct gdt;
-
- pack_descriptor(&gdt, __per_cpu_offset[cpu], 0xFFFFF,
- 0x2 | DESCTYPE_S, 0x8);
- gdt.s = 1;
-
- write_gdt_entry(get_cpu_gdt_table(cpu),
- GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
-
- per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu];
- per_cpu(cpu_number, cpu) = cpu;
-}
-#endif
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 10786af9554..f7bddc2e37d 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -1,7 +1,7 @@
/*
* Stack trace management functions
*
- * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2006-2009 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
*/
#include <linux/sched.h>
#include <linux/stacktrace.h>
diff --git a/arch/x86/kernel/summit_32.c b/arch/x86/kernel/summit_32.c
deleted file mode 100644
index 7b987852e87..00000000000
--- a/arch/x86/kernel/summit_32.c
+++ /dev/null
@@ -1,188 +0,0 @@
-/*
- * IBM Summit-Specific Code
- *
- * Written By: Matthew Dobson, IBM Corporation
- *
- * Copyright (c) 2003 IBM Corp.
- *
- * All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to <colpatch@us.ibm.com>
- *
- */
-
-#include <linux/mm.h>
-#include <linux/init.h>
-#include <asm/io.h>
-#include <asm/bios_ebda.h>
-#include <asm/summit/mpparse.h>
-
-static struct rio_table_hdr *rio_table_hdr __initdata;
-static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata;
-static struct rio_detail *rio_devs[MAX_NUMNODES*4] __initdata;
-
-#ifndef CONFIG_X86_NUMAQ
-static int mp_bus_id_to_node[MAX_MP_BUSSES] __initdata;
-#endif
-
-static int __init setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
-{
- int twister = 0, node = 0;
- int i, bus, num_buses;
-
- for (i = 0; i < rio_table_hdr->num_rio_dev; i++) {
- if (rio_devs[i]->node_id == rio_devs[wpeg_num]->owner_id) {
- twister = rio_devs[i]->owner_id;
- break;
- }
- }
- if (i == rio_table_hdr->num_rio_dev) {
- printk(KERN_ERR "%s: Couldn't find owner Cyclone for Winnipeg!\n", __func__);
- return last_bus;
- }
-
- for (i = 0; i < rio_table_hdr->num_scal_dev; i++) {
- if (scal_devs[i]->node_id == twister) {
- node = scal_devs[i]->node_id;
- break;
- }
- }
- if (i == rio_table_hdr->num_scal_dev) {
- printk(KERN_ERR "%s: Couldn't find owner Twister for Cyclone!\n", __func__);
- return last_bus;
- }
-
- switch (rio_devs[wpeg_num]->type) {
- case CompatWPEG:
- /*
- * The Compatibility Winnipeg controls the 2 legacy buses,
- * the 66MHz PCI bus [2 slots] and the 2 "extra" buses in case
- * a PCI-PCI bridge card is used in either slot: total 5 buses.
- */
- num_buses = 5;
- break;
- case AltWPEG:
- /*
- * The Alternate Winnipeg controls the 2 133MHz buses [1 slot
- * each], their 2 "extra" buses, the 100MHz bus [2 slots] and
- * the "extra" buses for each of those slots: total 7 buses.
- */
- num_buses = 7;
- break;
- case LookOutAWPEG:
- case LookOutBWPEG:
- /*
- * A Lookout Winnipeg controls 3 100MHz buses [2 slots each]
- * & the "extra" buses for each of those slots: total 9 buses.
- */
- num_buses = 9;
- break;
- default:
- printk(KERN_INFO "%s: Unsupported Winnipeg type!\n", __func__);
- return last_bus;
- }
-
- for (bus = last_bus; bus < last_bus + num_buses; bus++)
- mp_bus_id_to_node[bus] = node;
- return bus;
-}
-
-static int __init build_detail_arrays(void)
-{
- unsigned long ptr;
- int i, scal_detail_size, rio_detail_size;
-
- if (rio_table_hdr->num_scal_dev > MAX_NUMNODES) {
- printk(KERN_WARNING "%s: MAX_NUMNODES too low! Defined as %d, but system has %d nodes.\n", __func__, MAX_NUMNODES, rio_table_hdr->num_scal_dev);
- return 0;
- }
-
- switch (rio_table_hdr->version) {
- default:
- printk(KERN_WARNING "%s: Invalid Rio Grande Table Version: %d\n", __func__, rio_table_hdr->version);
- return 0;
- case 2:
- scal_detail_size = 11;
- rio_detail_size = 13;
- break;
- case 3:
- scal_detail_size = 12;
- rio_detail_size = 15;
- break;
- }
-
- ptr = (unsigned long)rio_table_hdr + 3;
- for (i = 0; i < rio_table_hdr->num_scal_dev; i++, ptr += scal_detail_size)
- scal_devs[i] = (struct scal_detail *)ptr;
-
- for (i = 0; i < rio_table_hdr->num_rio_dev; i++, ptr += rio_detail_size)
- rio_devs[i] = (struct rio_detail *)ptr;
-
- return 1;
-}
-
-void __init setup_summit(void)
-{
- unsigned long ptr;
- unsigned short offset;
- int i, next_wpeg, next_bus = 0;
-
- /* The pointer to the EBDA is stored in the word @ phys 0x40E(40:0E) */
- ptr = get_bios_ebda();
- ptr = (unsigned long)phys_to_virt(ptr);
-
- rio_table_hdr = NULL;
- offset = 0x180;
- while (offset) {
- /* The block id is stored in the 2nd word */
- if (*((unsigned short *)(ptr + offset + 2)) == 0x4752) {
- /* set the pointer past the offset & block id */
- rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4);
- break;
- }
- /* The next offset is stored in the 1st word. 0 means no more */
- offset = *((unsigned short *)(ptr + offset));
- }
- if (!rio_table_hdr) {
- printk(KERN_ERR "%s: Unable to locate Rio Grande Table in EBDA - bailing!\n", __func__);
- return;
- }
-
- if (!build_detail_arrays())
- return;
-
- /* The first Winnipeg we're looking for has an index of 0 */
- next_wpeg = 0;
- do {
- for (i = 0; i < rio_table_hdr->num_rio_dev; i++) {
- if (is_WPEG(rio_devs[i]) && rio_devs[i]->WP_index == next_wpeg) {
- /* It's the Winnipeg we're looking for! */
- next_bus = setup_pci_node_map_for_wpeg(i, next_bus);
- next_wpeg++;
- break;
- }
- }
- /*
- * If we go through all Rio devices and don't find one with
- * the next index, it means we've found all the Winnipegs,
- * and thus all the PCI buses.
- */
- if (i == rio_table_hdr->num_rio_dev)
- next_wpeg = 0;
- } while (next_wpeg != 0);
-}
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index e2e86a08f31..3bdb64829b8 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -1,7 +1,7 @@
ENTRY(sys_call_table)
.long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */
.long sys_exit
- .long sys_fork
+ .long ptregs_fork
.long sys_read
.long sys_write
.long sys_open /* 5 */
@@ -10,7 +10,7 @@ ENTRY(sys_call_table)
.long sys_creat
.long sys_link
.long sys_unlink /* 10 */
- .long sys_execve
+ .long ptregs_execve
.long sys_chdir
.long sys_time
.long sys_mknod
@@ -109,17 +109,17 @@ ENTRY(sys_call_table)
.long sys_newlstat
.long sys_newfstat
.long sys_uname
- .long sys_iopl /* 110 */
+ .long ptregs_iopl /* 110 */
.long sys_vhangup
.long sys_ni_syscall /* old "idle" system call */
- .long sys_vm86old
+ .long ptregs_vm86old
.long sys_wait4
.long sys_swapoff /* 115 */
.long sys_sysinfo
.long sys_ipc
.long sys_fsync
- .long sys_sigreturn
- .long sys_clone /* 120 */
+ .long ptregs_sigreturn
+ .long ptregs_clone /* 120 */
.long sys_setdomainname
.long sys_newuname
.long sys_modify_ldt
@@ -165,14 +165,14 @@ ENTRY(sys_call_table)
.long sys_mremap
.long sys_setresuid16
.long sys_getresuid16 /* 165 */
- .long sys_vm86
+ .long ptregs_vm86
.long sys_ni_syscall /* Old sys_query_module */
.long sys_poll
.long sys_nfsservctl
.long sys_setresgid16 /* 170 */
.long sys_getresgid16
.long sys_prctl
- .long sys_rt_sigreturn
+ .long ptregs_rt_sigreturn
.long sys_rt_sigaction
.long sys_rt_sigprocmask /* 175 */
.long sys_rt_sigpending
@@ -185,11 +185,11 @@ ENTRY(sys_call_table)
.long sys_getcwd
.long sys_capget
.long sys_capset /* 185 */
- .long sys_sigaltstack
+ .long ptregs_sigaltstack
.long sys_sendfile
.long sys_ni_syscall /* reserved for streams1 */
.long sys_ni_syscall /* reserved for streams2 */
- .long sys_vfork /* 190 */
+ .long ptregs_vfork /* 190 */
.long sys_getrlimit
.long sys_mmap2
.long sys_truncate64
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
index 3985cac0ed4..5c5d87f0b2e 100644
--- a/arch/x86/kernel/time_32.c
+++ b/arch/x86/kernel/time_32.c
@@ -33,12 +33,12 @@
#include <linux/time.h>
#include <linux/mca.h>
-#include <asm/arch_hooks.h>
+#include <asm/setup.h>
#include <asm/hpet.h>
#include <asm/time.h>
#include <asm/timer.h>
-#include "do_timer.h"
+#include <asm/do_timer.h>
int timer_ack;
@@ -118,7 +118,7 @@ void __init hpet_time_init(void)
{
if (!hpet_enable())
setup_pit_timer();
- time_init_hook();
+ x86_quirk_time_init();
}
/*
@@ -131,7 +131,7 @@ void __init hpet_time_init(void)
*/
void __init time_init(void)
{
- pre_time_init_hook();
+ x86_quirk_pre_time_init();
tsc_init();
late_time_init = choose_time_init();
}
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
deleted file mode 100644
index ce505464224..00000000000
--- a/arch/x86/kernel/tlb_32.c
+++ /dev/null
@@ -1,256 +0,0 @@
-#include <linux/spinlock.h>
-#include <linux/cpu.h>
-#include <linux/interrupt.h>
-
-#include <asm/tlbflush.h>
-
-DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate)
- ____cacheline_aligned = { &init_mm, 0, };
-
-/* must come after the send_IPI functions above for inlining */
-#include <mach_ipi.h>
-
-/*
- * Smarter SMP flushing macros.
- * c/o Linus Torvalds.
- *
- * These mean you can really definitely utterly forget about
- * writing to user space from interrupts. (Its not allowed anyway).
- *
- * Optimizations Manfred Spraul <manfred@colorfullife.com>
- */
-
-static cpumask_t flush_cpumask;
-static struct mm_struct *flush_mm;
-static unsigned long flush_va;
-static DEFINE_SPINLOCK(tlbstate_lock);
-
-/*
- * We cannot call mmdrop() because we are in interrupt context,
- * instead update mm->cpu_vm_mask.
- *
- * We need to reload %cr3 since the page tables may be going
- * away from under us..
- */
-void leave_mm(int cpu)
-{
- BUG_ON(x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK);
- cpu_clear(cpu, x86_read_percpu(cpu_tlbstate.active_mm)->cpu_vm_mask);
- load_cr3(swapper_pg_dir);
-}
-EXPORT_SYMBOL_GPL(leave_mm);
-
-/*
- *
- * The flush IPI assumes that a thread switch happens in this order:
- * [cpu0: the cpu that switches]
- * 1) switch_mm() either 1a) or 1b)
- * 1a) thread switch to a different mm
- * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
- * Stop ipi delivery for the old mm. This is not synchronized with
- * the other cpus, but smp_invalidate_interrupt ignore flush ipis
- * for the wrong mm, and in the worst case we perform a superfluous
- * tlb flush.
- * 1a2) set cpu_tlbstate to TLBSTATE_OK
- * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
- * was in lazy tlb mode.
- * 1a3) update cpu_tlbstate[].active_mm
- * Now cpu0 accepts tlb flushes for the new mm.
- * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
- * Now the other cpus will send tlb flush ipis.
- * 1a4) change cr3.
- * 1b) thread switch without mm change
- * cpu_tlbstate[].active_mm is correct, cpu0 already handles
- * flush ipis.
- * 1b1) set cpu_tlbstate to TLBSTATE_OK
- * 1b2) test_and_set the cpu bit in cpu_vm_mask.
- * Atomically set the bit [other cpus will start sending flush ipis],
- * and test the bit.
- * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
- * 2) switch %%esp, ie current
- *
- * The interrupt must handle 2 special cases:
- * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
- * - the cpu performs speculative tlb reads, i.e. even if the cpu only
- * runs in kernel space, the cpu could load tlb entries for user space
- * pages.
- *
- * The good news is that cpu_tlbstate is local to each cpu, no
- * write/read ordering problems.
- */
-
-/*
- * TLB flush IPI:
- *
- * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
- * 2) Leave the mm if we are in the lazy tlb mode.
- */
-
-void smp_invalidate_interrupt(struct pt_regs *regs)
-{
- unsigned long cpu;
-
- cpu = get_cpu();
-
- if (!cpu_isset(cpu, flush_cpumask))
- goto out;
- /*
- * This was a BUG() but until someone can quote me the
- * line from the intel manual that guarantees an IPI to
- * multiple CPUs is retried _only_ on the erroring CPUs
- * its staying as a return
- *
- * BUG();
- */
-
- if (flush_mm == x86_read_percpu(cpu_tlbstate.active_mm)) {
- if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK) {
- if (flush_va == TLB_FLUSH_ALL)
- local_flush_tlb();
- else
- __flush_tlb_one(flush_va);
- } else
- leave_mm(cpu);
- }
- ack_APIC_irq();
- smp_mb__before_clear_bit();
- cpu_clear(cpu, flush_cpumask);
- smp_mb__after_clear_bit();
-out:
- put_cpu_no_resched();
- inc_irq_stat(irq_tlb_count);
-}
-
-void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
- unsigned long va)
-{
- cpumask_t cpumask = *cpumaskp;
-
- /*
- * A couple of (to be removed) sanity checks:
- *
- * - current CPU must not be in mask
- * - mask must exist :)
- */
- BUG_ON(cpus_empty(cpumask));
- BUG_ON(cpu_isset(smp_processor_id(), cpumask));
- BUG_ON(!mm);
-
-#ifdef CONFIG_HOTPLUG_CPU
- /* If a CPU which we ran on has gone down, OK. */
- cpus_and(cpumask, cpumask, cpu_online_map);
- if (unlikely(cpus_empty(cpumask)))
- return;
-#endif
-
- /*
- * i'm not happy about this global shared spinlock in the
- * MM hot path, but we'll see how contended it is.
- * AK: x86-64 has a faster method that could be ported.
- */
- spin_lock(&tlbstate_lock);
-
- flush_mm = mm;
- flush_va = va;
- cpus_or(flush_cpumask, cpumask, flush_cpumask);
-
- /*
- * Make the above memory operations globally visible before
- * sending the IPI.
- */
- smp_mb();
- /*
- * We have to send the IPI only to
- * CPUs affected.
- */
- send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR);
-
- while (!cpus_empty(flush_cpumask))
- /* nothing. lockup detection does not belong here */
- cpu_relax();
-
- flush_mm = NULL;
- flush_va = 0;
- spin_unlock(&tlbstate_lock);
-}
-
-void flush_tlb_current_task(void)
-{
- struct mm_struct *mm = current->mm;
- cpumask_t cpu_mask;
-
- preempt_disable();
- cpu_mask = mm->cpu_vm_mask;
- cpu_clear(smp_processor_id(), cpu_mask);
-
- local_flush_tlb();
- if (!cpus_empty(cpu_mask))
- flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
- preempt_enable();
-}
-
-void flush_tlb_mm(struct mm_struct *mm)
-{
- cpumask_t cpu_mask;
-
- preempt_disable();
- cpu_mask = mm->cpu_vm_mask;
- cpu_clear(smp_processor_id(), cpu_mask);
-
- if (current->active_mm == mm) {
- if (current->mm)
- local_flush_tlb();
- else
- leave_mm(smp_processor_id());
- }
- if (!cpus_empty(cpu_mask))
- flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
-
- preempt_enable();
-}
-
-void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
-{
- struct mm_struct *mm = vma->vm_mm;
- cpumask_t cpu_mask;
-
- preempt_disable();
- cpu_mask = mm->cpu_vm_mask;
- cpu_clear(smp_processor_id(), cpu_mask);
-
- if (current->active_mm == mm) {
- if (current->mm)
- __flush_tlb_one(va);
- else
- leave_mm(smp_processor_id());
- }
-
- if (!cpus_empty(cpu_mask))
- flush_tlb_others(cpu_mask, mm, va);
-
- preempt_enable();
-}
-EXPORT_SYMBOL(flush_tlb_page);
-
-static void do_flush_tlb_all(void *info)
-{
- unsigned long cpu = smp_processor_id();
-
- __flush_tlb_all();
- if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_LAZY)
- leave_mm(cpu);
-}
-
-void flush_tlb_all(void)
-{
- on_each_cpu(do_flush_tlb_all, NULL, 1);
-}
-
-void reset_lazy_tlbstate(void)
-{
- int cpu = raw_smp_processor_id();
-
- per_cpu(cpu_tlbstate, cpu).state = 0;
- per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
-}
-
diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c
deleted file mode 100644
index f8be6f1d2e4..00000000000
--- a/arch/x86/kernel/tlb_64.c
+++ /dev/null
@@ -1,284 +0,0 @@
-#include <linux/init.h>
-
-#include <linux/mm.h>
-#include <linux/delay.h>
-#include <linux/spinlock.h>
-#include <linux/smp.h>
-#include <linux/kernel_stat.h>
-#include <linux/mc146818rtc.h>
-#include <linux/interrupt.h>
-
-#include <asm/mtrr.h>
-#include <asm/pgalloc.h>
-#include <asm/tlbflush.h>
-#include <asm/mmu_context.h>
-#include <asm/proto.h>
-#include <asm/apicdef.h>
-#include <asm/idle.h>
-#include <asm/uv/uv_hub.h>
-#include <asm/uv/uv_bau.h>
-
-#include <mach_ipi.h>
-/*
- * Smarter SMP flushing macros.
- * c/o Linus Torvalds.
- *
- * These mean you can really definitely utterly forget about
- * writing to user space from interrupts. (Its not allowed anyway).
- *
- * Optimizations Manfred Spraul <manfred@colorfullife.com>
- *
- * More scalable flush, from Andi Kleen
- *
- * To avoid global state use 8 different call vectors.
- * Each CPU uses a specific vector to trigger flushes on other
- * CPUs. Depending on the received vector the target CPUs look into
- * the right per cpu variable for the flush data.
- *
- * With more than 8 CPUs they are hashed to the 8 available
- * vectors. The limited global vector space forces us to this right now.
- * In future when interrupts are split into per CPU domains this could be
- * fixed, at the cost of triggering multiple IPIs in some cases.
- */
-
-union smp_flush_state {
- struct {
- cpumask_t flush_cpumask;
- struct mm_struct *flush_mm;
- unsigned long flush_va;
- spinlock_t tlbstate_lock;
- };
- char pad[SMP_CACHE_BYTES];
-} ____cacheline_aligned;
-
-/* State is put into the per CPU data section, but padded
- to a full cache line because other CPUs can access it and we don't
- want false sharing in the per cpu data segment. */
-static DEFINE_PER_CPU(union smp_flush_state, flush_state);
-
-/*
- * We cannot call mmdrop() because we are in interrupt context,
- * instead update mm->cpu_vm_mask.
- */
-void leave_mm(int cpu)
-{
- if (read_pda(mmu_state) == TLBSTATE_OK)
- BUG();
- cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
- load_cr3(swapper_pg_dir);
-}
-EXPORT_SYMBOL_GPL(leave_mm);
-
-/*
- *
- * The flush IPI assumes that a thread switch happens in this order:
- * [cpu0: the cpu that switches]
- * 1) switch_mm() either 1a) or 1b)
- * 1a) thread switch to a different mm
- * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
- * Stop ipi delivery for the old mm. This is not synchronized with
- * the other cpus, but smp_invalidate_interrupt ignore flush ipis
- * for the wrong mm, and in the worst case we perform a superfluous
- * tlb flush.
- * 1a2) set cpu mmu_state to TLBSTATE_OK
- * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
- * was in lazy tlb mode.
- * 1a3) update cpu active_mm
- * Now cpu0 accepts tlb flushes for the new mm.
- * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
- * Now the other cpus will send tlb flush ipis.
- * 1a4) change cr3.
- * 1b) thread switch without mm change
- * cpu active_mm is correct, cpu0 already handles
- * flush ipis.
- * 1b1) set cpu mmu_state to TLBSTATE_OK
- * 1b2) test_and_set the cpu bit in cpu_vm_mask.
- * Atomically set the bit [other cpus will start sending flush ipis],
- * and test the bit.
- * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
- * 2) switch %%esp, ie current
- *
- * The interrupt must handle 2 special cases:
- * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
- * - the cpu performs speculative tlb reads, i.e. even if the cpu only
- * runs in kernel space, the cpu could load tlb entries for user space
- * pages.
- *
- * The good news is that cpu mmu_state is local to each cpu, no
- * write/read ordering problems.
- */
-
-/*
- * TLB flush IPI:
- *
- * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
- * 2) Leave the mm if we are in the lazy tlb mode.
- *
- * Interrupts are disabled.
- */
-
-asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
-{
- int cpu;
- int sender;
- union smp_flush_state *f;
-
- cpu = smp_processor_id();
- /*
- * orig_rax contains the negated interrupt vector.
- * Use that to determine where the sender put the data.
- */
- sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
- f = &per_cpu(flush_state, sender);
-
- if (!cpu_isset(cpu, f->flush_cpumask))
- goto out;
- /*
- * This was a BUG() but until someone can quote me the
- * line from the intel manual that guarantees an IPI to
- * multiple CPUs is retried _only_ on the erroring CPUs
- * its staying as a return
- *
- * BUG();
- */
-
- if (f->flush_mm == read_pda(active_mm)) {
- if (read_pda(mmu_state) == TLBSTATE_OK) {
- if (f->flush_va == TLB_FLUSH_ALL)
- local_flush_tlb();
- else
- __flush_tlb_one(f->flush_va);
- } else
- leave_mm(cpu);
- }
-out:
- ack_APIC_irq();
- cpu_clear(cpu, f->flush_cpumask);
- inc_irq_stat(irq_tlb_count);
-}
-
-void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
- unsigned long va)
-{
- int sender;
- union smp_flush_state *f;
- cpumask_t cpumask = *cpumaskp;
-
- if (is_uv_system() && uv_flush_tlb_others(&cpumask, mm, va))
- return;
-
- /* Caller has disabled preemption */
- sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
- f = &per_cpu(flush_state, sender);
-
- /*
- * Could avoid this lock when
- * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
- * probably not worth checking this for a cache-hot lock.
- */
- spin_lock(&f->tlbstate_lock);
-
- f->flush_mm = mm;
- f->flush_va = va;
- cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask);
-
- /*
- * Make the above memory operations globally visible before
- * sending the IPI.
- */
- smp_mb();
- /*
- * We have to send the IPI only to
- * CPUs affected.
- */
- send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR_START + sender);
-
- while (!cpus_empty(f->flush_cpumask))
- cpu_relax();
-
- f->flush_mm = NULL;
- f->flush_va = 0;
- spin_unlock(&f->tlbstate_lock);
-}
-
-static int __cpuinit init_smp_flush(void)
-{
- int i;
-
- for_each_possible_cpu(i)
- spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock);
-
- return 0;
-}
-core_initcall(init_smp_flush);
-
-void flush_tlb_current_task(void)
-{
- struct mm_struct *mm = current->mm;
- cpumask_t cpu_mask;
-
- preempt_disable();
- cpu_mask = mm->cpu_vm_mask;
- cpu_clear(smp_processor_id(), cpu_mask);
-
- local_flush_tlb();
- if (!cpus_empty(cpu_mask))
- flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
- preempt_enable();
-}
-
-void flush_tlb_mm(struct mm_struct *mm)
-{
- cpumask_t cpu_mask;
-
- preempt_disable();
- cpu_mask = mm->cpu_vm_mask;
- cpu_clear(smp_processor_id(), cpu_mask);
-
- if (current->active_mm == mm) {
- if (current->mm)
- local_flush_tlb();
- else
- leave_mm(smp_processor_id());
- }
- if (!cpus_empty(cpu_mask))
- flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
-
- preempt_enable();
-}
-
-void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
-{
- struct mm_struct *mm = vma->vm_mm;
- cpumask_t cpu_mask;
-
- preempt_disable();
- cpu_mask = mm->cpu_vm_mask;
- cpu_clear(smp_processor_id(), cpu_mask);
-
- if (current->active_mm == mm) {
- if (current->mm)
- __flush_tlb_one(va);
- else
- leave_mm(smp_processor_id());
- }
-
- if (!cpus_empty(cpu_mask))
- flush_tlb_others(cpu_mask, mm, va);
-
- preempt_enable();
-}
-
-static void do_flush_tlb_all(void *info)
-{
- unsigned long cpu = smp_processor_id();
-
- __flush_tlb_all();
- if (read_pda(mmu_state) == TLBSTATE_LAZY)
- leave_mm(cpu);
-}
-
-void flush_tlb_all(void)
-{
- on_each_cpu(do_flush_tlb_all, NULL, 1);
-}
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 6812b829ed8..79c07324728 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -11,16 +11,15 @@
#include <linux/kernel.h>
#include <asm/mmu_context.h>
+#include <asm/uv/uv.h>
#include <asm/uv/uv_mmrs.h>
#include <asm/uv/uv_hub.h>
#include <asm/uv/uv_bau.h>
-#include <asm/genapic.h>
+#include <asm/apic.h>
#include <asm/idle.h>
#include <asm/tsc.h>
#include <asm/irq_vectors.h>
-#include <mach_apic.h>
-
static struct bau_control **uv_bau_table_bases __read_mostly;
static int uv_bau_retry_limit __read_mostly;
@@ -210,14 +209,15 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
*
* Send a broadcast and wait for a broadcast message to complete.
*
- * The cpumaskp mask contains the cpus the broadcast was sent to.
+ * The flush_mask contains the cpus the broadcast was sent to.
*
- * Returns 1 if all remote flushing was done. The mask is zeroed.
- * Returns 0 if some remote flushing remains to be done. The mask is left
- * unchanged.
+ * Returns NULL if all remote flushing was done. The mask is zeroed.
+ * Returns @flush_mask if some remote flushing remains to be done. The
+ * mask will have some bits still set.
*/
-int uv_flush_send_and_wait(int cpu, int this_blade, struct bau_desc *bau_desc,
- cpumask_t *cpumaskp)
+const struct cpumask *uv_flush_send_and_wait(int cpu, int this_blade,
+ struct bau_desc *bau_desc,
+ struct cpumask *flush_mask)
{
int completion_status = 0;
int right_shift;
@@ -257,66 +257,74 @@ int uv_flush_send_and_wait(int cpu, int this_blade, struct bau_desc *bau_desc,
* the cpu's, all of which are still in the mask.
*/
__get_cpu_var(ptcstats).ptc_i++;
- return 0;
+ return flush_mask;
}
/*
* Success, so clear the remote cpu's from the mask so we don't
* use the IPI method of shootdown on them.
*/
- for_each_cpu_mask(bit, *cpumaskp) {
+ for_each_cpu(bit, flush_mask) {
blade = uv_cpu_to_blade_id(bit);
if (blade == this_blade)
continue;
- cpu_clear(bit, *cpumaskp);
+ cpumask_clear_cpu(bit, flush_mask);
}
- if (!cpus_empty(*cpumaskp))
- return 0;
- return 1;
+ if (!cpumask_empty(flush_mask))
+ return flush_mask;
+ return NULL;
}
/**
* uv_flush_tlb_others - globally purge translation cache of a virtual
* address or all TLB's
- * @cpumaskp: mask of all cpu's in which the address is to be removed
+ * @cpumask: mask of all cpu's in which the address is to be removed
* @mm: mm_struct containing virtual address range
* @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu)
+ * @cpu: the current cpu
*
* This is the entry point for initiating any UV global TLB shootdown.
*
* Purges the translation caches of all specified processors of the given
* virtual address, or purges all TLB's on specified processors.
*
- * The caller has derived the cpumaskp from the mm_struct and has subtracted
- * the local cpu from the mask. This function is called only if there
- * are bits set in the mask. (e.g. flush_tlb_page())
+ * The caller has derived the cpumask from the mm_struct. This function
+ * is called only if there are bits set in the mask. (e.g. flush_tlb_page())
*
- * The cpumaskp is converted into a nodemask of the nodes containing
+ * The cpumask is converted into a nodemask of the nodes containing
* the cpus.
*
- * Returns 1 if all remote flushing was done.
- * Returns 0 if some remote flushing remains to be done.
+ * Note that this function should be called with preemption disabled.
+ *
+ * Returns NULL if all remote flushing was done.
+ * Returns pointer to cpumask if some remote flushing remains to be
+ * done. The returned pointer is valid till preemption is re-enabled.
*/
-int uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm,
- unsigned long va)
+const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
+ struct mm_struct *mm,
+ unsigned long va, unsigned int cpu)
{
+ static DEFINE_PER_CPU(cpumask_t, flush_tlb_mask);
+ struct cpumask *flush_mask = &__get_cpu_var(flush_tlb_mask);
int i;
int bit;
int blade;
- int cpu;
+ int uv_cpu;
int this_blade;
int locals = 0;
struct bau_desc *bau_desc;
- cpu = uv_blade_processor_id();
+ cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
+
+ uv_cpu = uv_blade_processor_id();
this_blade = uv_numa_blade_id();
bau_desc = __get_cpu_var(bau_control).descriptor_base;
- bau_desc += UV_ITEMS_PER_DESCRIPTOR * cpu;
+ bau_desc += UV_ITEMS_PER_DESCRIPTOR * uv_cpu;
bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
i = 0;
- for_each_cpu_mask(bit, *cpumaskp) {
+ for_each_cpu(bit, flush_mask) {
blade = uv_cpu_to_blade_id(bit);
BUG_ON(blade > (UV_DISTRIBUTION_SIZE - 1));
if (blade == this_blade) {
@@ -331,17 +339,17 @@ int uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm,
* no off_node flushing; return status for local node
*/
if (locals)
- return 0;
+ return flush_mask;
else
- return 1;
+ return NULL;
}
__get_cpu_var(ptcstats).requestor++;
__get_cpu_var(ptcstats).ntargeted += i;
bau_desc->payload.address = va;
- bau_desc->payload.sending_cpu = smp_processor_id();
+ bau_desc->payload.sending_cpu = cpu;
- return uv_flush_send_and_wait(cpu, this_blade, bau_desc, cpumaskp);
+ return uv_flush_send_and_wait(uv_cpu, this_blade, bau_desc, flush_mask);
}
/*
@@ -742,7 +750,7 @@ static int __init uv_bau_init(void)
int node;
int nblades;
int last_blade;
- int cur_cpu = 0;
+ int cur_cpu;
if (!is_uv_system())
return 0;
@@ -752,6 +760,7 @@ static int __init uv_bau_init(void)
uv_mmask = (1UL << uv_hub_info->n_val) - 1;
nblades = 0;
last_blade = -1;
+ cur_cpu = 0;
for_each_online_node(node) {
blade = uv_node_to_blade_id(node);
if (blade == last_blade)
diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S
index d8ccc3c6552..66d874e5404 100644
--- a/arch/x86/kernel/trampoline_32.S
+++ b/arch/x86/kernel/trampoline_32.S
@@ -29,7 +29,7 @@
#include <linux/linkage.h>
#include <asm/segment.h>
-#include <asm/page.h>
+#include <asm/page_types.h>
/* We can free up trampoline after bootup if cpu hotplug is not supported. */
#ifndef CONFIG_HOTPLUG_CPU
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S
index 894293c598d..cddfb8d386b 100644
--- a/arch/x86/kernel/trampoline_64.S
+++ b/arch/x86/kernel/trampoline_64.S
@@ -25,10 +25,11 @@
*/
#include <linux/linkage.h>
-#include <asm/pgtable.h>
-#include <asm/page.h>
+#include <asm/pgtable_types.h>
+#include <asm/page_types.h>
#include <asm/msr.h>
#include <asm/segment.h>
+#include <asm/processor-flags.h>
.section .rodata, "a", @progbits
@@ -37,7 +38,7 @@
ENTRY(trampoline_data)
r_base = .
cli # We should be safe anyway
- wbinvd
+ wbinvd
mov %cs, %ax # Code and data in the same place
mov %ax, %ds
mov %ax, %es
@@ -73,9 +74,8 @@ r_base = .
lidtl tidt - r_base # load idt with 0, 0
lgdtl tgdt - r_base # load gdt with whatever is appropriate
- xor %ax, %ax
- inc %ax # protected mode (PE) bit
- lmsw %ax # into protected mode
+ mov $X86_CR0_PE, %ax # protected mode (PE) bit
+ lmsw %ax # into protected mode
# flush prefetch and jump to startup_32
ljmpl *(startup_32_vector - r_base)
@@ -86,9 +86,8 @@ startup_32:
movl $__KERNEL_DS, %eax # Initialize the %ds segment register
movl %eax, %ds
- xorl %eax, %eax
- btsl $5, %eax # Enable PAE mode
- movl %eax, %cr4
+ movl $X86_CR4_PAE, %eax
+ movl %eax, %cr4 # Enable PAE mode
# Setup trampoline 4 level pagetables
leal (trampoline_level4_pgt - r_base)(%esi), %eax
@@ -99,9 +98,9 @@ startup_32:
xorl %edx, %edx
wrmsr
- xorl %eax, %eax
- btsl $31, %eax # Enable paging and in turn activate Long Mode
- btsl $0, %eax # Enable protected mode
+ # Enable paging and in turn activate Long Mode
+ # Enable protected mode
+ movl $(X86_CR0_PG | X86_CR0_PE), %eax
movl %eax, %cr0
/*
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a9e7548e179..a1d288327ff 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -54,15 +54,14 @@
#include <asm/desc.h>
#include <asm/i387.h>
-#include <mach_traps.h>
+#include <asm/mach_traps.h>
#ifdef CONFIG_X86_64
#include <asm/pgalloc.h>
#include <asm/proto.h>
-#include <asm/pda.h>
#else
#include <asm/processor-flags.h>
-#include <asm/arch_hooks.h>
+#include <asm/setup.h>
#include <asm/traps.h>
#include "cpu/mcheck/mce.h"
@@ -119,47 +118,6 @@ die_if_kernel(const char *str, struct pt_regs *regs, long err)
if (!user_mode_vm(regs))
die(str, regs, err);
}
-
-/*
- * Perform the lazy TSS's I/O bitmap copy. If the TSS has an
- * invalid offset set (the LAZY one) and the faulting thread has
- * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS,
- * we set the offset field correctly and return 1.
- */
-static int lazy_iobitmap_copy(void)
-{
- struct thread_struct *thread;
- struct tss_struct *tss;
- int cpu;
-
- cpu = get_cpu();
- tss = &per_cpu(init_tss, cpu);
- thread = &current->thread;
-
- if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY &&
- thread->io_bitmap_ptr) {
- memcpy(tss->io_bitmap, thread->io_bitmap_ptr,
- thread->io_bitmap_max);
- /*
- * If the previously set map was extending to higher ports
- * than the current one, pad extra space with 0xff (no access).
- */
- if (thread->io_bitmap_max < tss->io_bitmap_max) {
- memset((char *) tss->io_bitmap +
- thread->io_bitmap_max, 0xff,
- tss->io_bitmap_max - thread->io_bitmap_max);
- }
- tss->io_bitmap_max = thread->io_bitmap_max;
- tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
- tss->io_bitmap_owner = thread;
- put_cpu();
-
- return 1;
- }
- put_cpu();
-
- return 0;
-}
#endif
static void __kprobes
@@ -310,11 +268,6 @@ do_general_protection(struct pt_regs *regs, long error_code)
conditional_sti(regs);
#ifdef CONFIG_X86_32
- if (lazy_iobitmap_copy()) {
- /* restart the faulting instruction */
- return;
- }
-
if (regs->flags & X86_VM_MASK)
goto gp_in_vm86;
#endif
@@ -914,19 +867,20 @@ void math_emulate(struct math_emu_info *info)
}
#endif /* CONFIG_MATH_EMULATION */
-dotraplinkage void __kprobes do_device_not_available(struct pt_regs regs)
+dotraplinkage void __kprobes
+do_device_not_available(struct pt_regs *regs, long error_code)
{
#ifdef CONFIG_X86_32
if (read_cr0() & X86_CR0_EM) {
struct math_emu_info info = { };
- conditional_sti(&regs);
+ conditional_sti(regs);
- info.regs = &regs;
+ info.regs = regs;
math_emulate(&info);
} else {
math_state_restore(); /* interrupts still off */
- conditional_sti(&regs);
+ conditional_sti(regs);
}
#else
math_state_restore();
@@ -942,7 +896,7 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
info.si_signo = SIGILL;
info.si_errno = 0;
info.si_code = ILL_BADSTK;
- info.si_addr = 0;
+ info.si_addr = NULL;
if (notify_die(DIE_TRAP, "iret exception",
regs, error_code, 32, SIGILL) == NOTIFY_STOP)
return;
@@ -1026,6 +980,6 @@ void __init trap_init(void)
cpu_init();
#ifdef CONFIG_X86_32
- trap_init_hook();
+ x86_quirk_trap_init();
#endif
}
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index d5cebb52d45..462b9ba67e9 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -793,7 +793,7 @@ __cpuinit int unsynchronized_tsc(void)
if (!cpu_has_tsc || tsc_unstable)
return 1;
-#ifdef CONFIG_X86_SMP
+#ifdef CONFIG_SMP
if (apic_is_clustered_box())
return 1;
#endif
diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c
new file mode 100644
index 00000000000..2ffb6c53326
--- /dev/null
+++ b/arch/x86/kernel/uv_time.c
@@ -0,0 +1,393 @@
+/*
+ * SGI RTC clock/timer routines.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Copyright (c) 2009 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) Dimitri Sivanich
+ */
+#include <linux/clockchips.h>
+
+#include <asm/uv/uv_mmrs.h>
+#include <asm/uv/uv_hub.h>
+#include <asm/uv/bios.h>
+#include <asm/uv/uv.h>
+#include <asm/apic.h>
+#include <asm/cpu.h>
+
+#define RTC_NAME "sgi_rtc"
+
+static cycle_t uv_read_rtc(void);
+static int uv_rtc_next_event(unsigned long, struct clock_event_device *);
+static void uv_rtc_timer_setup(enum clock_event_mode,
+ struct clock_event_device *);
+
+static struct clocksource clocksource_uv = {
+ .name = RTC_NAME,
+ .rating = 400,
+ .read = uv_read_rtc,
+ .mask = (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK,
+ .shift = 10,
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS,
+};
+
+static struct clock_event_device clock_event_device_uv = {
+ .name = RTC_NAME,
+ .features = CLOCK_EVT_FEAT_ONESHOT,
+ .shift = 20,
+ .rating = 400,
+ .irq = -1,
+ .set_next_event = uv_rtc_next_event,
+ .set_mode = uv_rtc_timer_setup,
+ .event_handler = NULL,
+};
+
+static DEFINE_PER_CPU(struct clock_event_device, cpu_ced);
+
+/* There is one of these allocated per node */
+struct uv_rtc_timer_head {
+ spinlock_t lock;
+ /* next cpu waiting for timer, local node relative: */
+ int next_cpu;
+ /* number of cpus on this node: */
+ int ncpus;
+ struct {
+ int lcpu; /* systemwide logical cpu number */
+ u64 expires; /* next timer expiration for this cpu */
+ } cpu[1];
+};
+
+/*
+ * Access to uv_rtc_timer_head via blade id.
+ */
+static struct uv_rtc_timer_head **blade_info __read_mostly;
+
+static int uv_rtc_enable;
+
+/*
+ * Hardware interface routines
+ */
+
+/* Send IPIs to another node */
+static void uv_rtc_send_IPI(int cpu)
+{
+ unsigned long apicid, val;
+ int pnode;
+
+ apicid = cpu_physical_id(cpu);
+ pnode = uv_apicid_to_pnode(apicid);
+ val = (1UL << UVH_IPI_INT_SEND_SHFT) |
+ (apicid << UVH_IPI_INT_APIC_ID_SHFT) |
+ (GENERIC_INTERRUPT_VECTOR << UVH_IPI_INT_VECTOR_SHFT);
+
+ uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
+}
+
+/* Check for an RTC interrupt pending */
+static int uv_intr_pending(int pnode)
+{
+ return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) &
+ UVH_EVENT_OCCURRED0_RTC1_MASK;
+}
+
+/* Setup interrupt and return non-zero if early expiration occurred. */
+static int uv_setup_intr(int cpu, u64 expires)
+{
+ u64 val;
+ int pnode = uv_cpu_to_pnode(cpu);
+
+ uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG,
+ UVH_RTC1_INT_CONFIG_M_MASK);
+ uv_write_global_mmr64(pnode, UVH_INT_CMPB, -1L);
+
+ uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS,
+ UVH_EVENT_OCCURRED0_RTC1_MASK);
+
+ val = (GENERIC_INTERRUPT_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) |
+ ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT);
+
+ /* Set configuration */
+ uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, val);
+ /* Initialize comparator value */
+ uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires);
+
+ return (expires < uv_read_rtc() && !uv_intr_pending(pnode));
+}
+
+/*
+ * Per-cpu timer tracking routines
+ */
+
+static __init void uv_rtc_deallocate_timers(void)
+{
+ int bid;
+
+ for_each_possible_blade(bid) {
+ kfree(blade_info[bid]);
+ }
+ kfree(blade_info);
+}
+
+/* Allocate per-node list of cpu timer expiration times. */
+static __init int uv_rtc_allocate_timers(void)
+{
+ int cpu;
+
+ blade_info = kmalloc(uv_possible_blades * sizeof(void *), GFP_KERNEL);
+ if (!blade_info)
+ return -ENOMEM;
+ memset(blade_info, 0, uv_possible_blades * sizeof(void *));
+
+ for_each_present_cpu(cpu) {
+ int nid = cpu_to_node(cpu);
+ int bid = uv_cpu_to_blade_id(cpu);
+ int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
+ struct uv_rtc_timer_head *head = blade_info[bid];
+
+ if (!head) {
+ head = kmalloc_node(sizeof(struct uv_rtc_timer_head) +
+ (uv_blade_nr_possible_cpus(bid) *
+ 2 * sizeof(u64)),
+ GFP_KERNEL, nid);
+ if (!head) {
+ uv_rtc_deallocate_timers();
+ return -ENOMEM;
+ }
+ spin_lock_init(&head->lock);
+ head->ncpus = uv_blade_nr_possible_cpus(bid);
+ head->next_cpu = -1;
+ blade_info[bid] = head;
+ }
+
+ head->cpu[bcpu].lcpu = cpu;
+ head->cpu[bcpu].expires = ULLONG_MAX;
+ }
+
+ return 0;
+}
+
+/* Find and set the next expiring timer. */
+static void uv_rtc_find_next_timer(struct uv_rtc_timer_head *head, int pnode)
+{
+ u64 lowest = ULLONG_MAX;
+ int c, bcpu = -1;
+
+ head->next_cpu = -1;
+ for (c = 0; c < head->ncpus; c++) {
+ u64 exp = head->cpu[c].expires;
+ if (exp < lowest) {
+ bcpu = c;
+ lowest = exp;
+ }
+ }
+ if (bcpu >= 0) {
+ head->next_cpu = bcpu;
+ c = head->cpu[bcpu].lcpu;
+ if (uv_setup_intr(c, lowest))
+ /* If we didn't set it up in time, trigger */
+ uv_rtc_send_IPI(c);
+ } else {
+ uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG,
+ UVH_RTC1_INT_CONFIG_M_MASK);
+ }
+}
+
+/*
+ * Set expiration time for current cpu.
+ *
+ * Returns 1 if we missed the expiration time.
+ */
+static int uv_rtc_set_timer(int cpu, u64 expires)
+{
+ int pnode = uv_cpu_to_pnode(cpu);
+ int bid = uv_cpu_to_blade_id(cpu);
+ struct uv_rtc_timer_head *head = blade_info[bid];
+ int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
+ u64 *t = &head->cpu[bcpu].expires;
+ unsigned long flags;
+ int next_cpu;
+
+ spin_lock_irqsave(&head->lock, flags);
+
+ next_cpu = head->next_cpu;
+ *t = expires;
+ /* Will this one be next to go off? */
+ if (next_cpu < 0 || bcpu == next_cpu ||
+ expires < head->cpu[next_cpu].expires) {
+ head->next_cpu = bcpu;
+ if (uv_setup_intr(cpu, expires)) {
+ *t = ULLONG_MAX;
+ uv_rtc_find_next_timer(head, pnode);
+ spin_unlock_irqrestore(&head->lock, flags);
+ return 1;
+ }
+ }
+
+ spin_unlock_irqrestore(&head->lock, flags);
+ return 0;
+}
+
+/*
+ * Unset expiration time for current cpu.
+ *
+ * Returns 1 if this timer was pending.
+ */
+static int uv_rtc_unset_timer(int cpu)
+{
+ int pnode = uv_cpu_to_pnode(cpu);
+ int bid = uv_cpu_to_blade_id(cpu);
+ struct uv_rtc_timer_head *head = blade_info[bid];
+ int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
+ u64 *t = &head->cpu[bcpu].expires;
+ unsigned long flags;
+ int rc = 0;
+
+ spin_lock_irqsave(&head->lock, flags);
+
+ if (head->next_cpu == bcpu && uv_read_rtc() >= *t)
+ rc = 1;
+
+ *t = ULLONG_MAX;
+
+ /* Was the hardware setup for this timer? */
+ if (head->next_cpu == bcpu)
+ uv_rtc_find_next_timer(head, pnode);
+
+ spin_unlock_irqrestore(&head->lock, flags);
+
+ return rc;
+}
+
+
+/*
+ * Kernel interface routines.
+ */
+
+/*
+ * Read the RTC.
+ */
+static cycle_t uv_read_rtc(void)
+{
+ return (cycle_t)uv_read_local_mmr(UVH_RTC);
+}
+
+/*
+ * Program the next event, relative to now
+ */
+static int uv_rtc_next_event(unsigned long delta,
+ struct clock_event_device *ced)
+{
+ int ced_cpu = cpumask_first(ced->cpumask);
+
+ return uv_rtc_set_timer(ced_cpu, delta + uv_read_rtc());
+}
+
+/*
+ * Setup the RTC timer in oneshot mode
+ */
+static void uv_rtc_timer_setup(enum clock_event_mode mode,
+ struct clock_event_device *evt)
+{
+ int ced_cpu = cpumask_first(evt->cpumask);
+
+ switch (mode) {
+ case CLOCK_EVT_MODE_PERIODIC:
+ case CLOCK_EVT_MODE_ONESHOT:
+ case CLOCK_EVT_MODE_RESUME:
+ /* Nothing to do here yet */
+ break;
+ case CLOCK_EVT_MODE_UNUSED:
+ case CLOCK_EVT_MODE_SHUTDOWN:
+ uv_rtc_unset_timer(ced_cpu);
+ break;
+ }
+}
+
+static void uv_rtc_interrupt(void)
+{
+ struct clock_event_device *ced = &__get_cpu_var(cpu_ced);
+ int cpu = smp_processor_id();
+
+ if (!ced || !ced->event_handler)
+ return;
+
+ if (uv_rtc_unset_timer(cpu) != 1)
+ return;
+
+ ced->event_handler(ced);
+}
+
+static int __init uv_enable_rtc(char *str)
+{
+ uv_rtc_enable = 1;
+
+ return 1;
+}
+__setup("uvrtc", uv_enable_rtc);
+
+static __init void uv_rtc_register_clockevents(struct work_struct *dummy)
+{
+ struct clock_event_device *ced = &__get_cpu_var(cpu_ced);
+
+ *ced = clock_event_device_uv;
+ ced->cpumask = cpumask_of(smp_processor_id());
+ clockevents_register_device(ced);
+}
+
+static __init int uv_rtc_setup_clock(void)
+{
+ int rc;
+
+ if (!uv_rtc_enable || !is_uv_system() || generic_interrupt_extension)
+ return -ENODEV;
+
+ generic_interrupt_extension = uv_rtc_interrupt;
+
+ clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second,
+ clocksource_uv.shift);
+
+ rc = clocksource_register(&clocksource_uv);
+ if (rc) {
+ generic_interrupt_extension = NULL;
+ return rc;
+ }
+
+ /* Setup and register clockevents */
+ rc = uv_rtc_allocate_timers();
+ if (rc) {
+ clocksource_unregister(&clocksource_uv);
+ generic_interrupt_extension = NULL;
+ return rc;
+ }
+
+ clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second,
+ NSEC_PER_SEC, clock_event_device_uv.shift);
+
+ clock_event_device_uv.min_delta_ns = NSEC_PER_SEC /
+ sn_rtc_cycles_per_second;
+
+ clock_event_device_uv.max_delta_ns = clocksource_uv.mask *
+ (NSEC_PER_SEC / sn_rtc_cycles_per_second);
+
+ rc = schedule_on_each_cpu(uv_rtc_register_clockevents);
+ if (rc) {
+ clocksource_unregister(&clocksource_uv);
+ generic_interrupt_extension = NULL;
+ uv_rtc_deallocate_timers();
+ }
+
+ return rc;
+}
+arch_initcall(uv_rtc_setup_clock);
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index d801d06af06..31ffc24eec4 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -24,18 +24,14 @@
#include <asm/visws/cobalt.h>
#include <asm/visws/piix4.h>
-#include <asm/arch_hooks.h>
#include <asm/io_apic.h>
#include <asm/fixmap.h>
#include <asm/reboot.h>
#include <asm/setup.h>
+#include <asm/apic.h>
#include <asm/e820.h>
#include <asm/io.h>
-#include <mach_ipi.h>
-
-#include "mach_apic.h"
-
#include <linux/kernel_stat.h>
#include <asm/i8259.h>
@@ -49,8 +45,6 @@
extern int no_broadcast;
-#include <asm/apic.h>
-
char visws_board_type = -1;
char visws_board_rev = -1;
@@ -200,7 +194,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
return;
}
- apic_cpus = apicid_to_cpu_present(m->apicid);
+ apic_cpus = apic->apicid_to_cpu_present(m->apicid);
physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus);
/*
* Validate version
@@ -584,7 +578,7 @@ static struct irq_chip piix4_virtual_irq_type = {
static irqreturn_t piix4_master_intr(int irq, void *dev_id)
{
int realirq;
- irq_desc_t *desc;
+ struct irq_desc *desc;
unsigned long flags;
spin_lock_irqsave(&i8259A_lock, flags);
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 4eeb5cf9720..d7ac84e7fc1 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -158,7 +158,7 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
ret = KVM86->regs32;
ret->fs = current->thread.saved_fs;
- loadsegment(gs, current->thread.saved_gs);
+ set_user_gs(ret, current->thread.saved_gs);
return ret;
}
@@ -197,9 +197,9 @@ out:
static int do_vm86_irq_handling(int subfunction, int irqnumber);
static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk);
-asmlinkage int sys_vm86old(struct pt_regs regs)
+int sys_vm86old(struct pt_regs *regs)
{
- struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs.bx;
+ struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs->bx;
struct kernel_vm86_struct info; /* declare this _on top_,
* this avoids wasting of stack space.
* This remains on the stack until we
@@ -218,7 +218,7 @@ asmlinkage int sys_vm86old(struct pt_regs regs)
if (tmp)
goto out;
memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus);
- info.regs32 = &regs;
+ info.regs32 = regs;
tsk->thread.vm86_info = v86;
do_sys_vm86(&info, tsk);
ret = 0; /* we never return here */
@@ -227,7 +227,7 @@ out:
}
-asmlinkage int sys_vm86(struct pt_regs regs)
+int sys_vm86(struct pt_regs *regs)
{
struct kernel_vm86_struct info; /* declare this _on top_,
* this avoids wasting of stack space.
@@ -239,12 +239,12 @@ asmlinkage int sys_vm86(struct pt_regs regs)
struct vm86plus_struct __user *v86;
tsk = current;
- switch (regs.bx) {
+ switch (regs->bx) {
case VM86_REQUEST_IRQ:
case VM86_FREE_IRQ:
case VM86_GET_IRQ_BITS:
case VM86_GET_AND_RESET_IRQ:
- ret = do_vm86_irq_handling(regs.bx, (int)regs.cx);
+ ret = do_vm86_irq_handling(regs->bx, (int)regs->cx);
goto out;
case VM86_PLUS_INSTALL_CHECK:
/*
@@ -261,14 +261,14 @@ asmlinkage int sys_vm86(struct pt_regs regs)
ret = -EPERM;
if (tsk->thread.saved_sp0)
goto out;
- v86 = (struct vm86plus_struct __user *)regs.cx;
+ v86 = (struct vm86plus_struct __user *)regs->cx;
tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
offsetof(struct kernel_vm86_struct, regs32) -
sizeof(info.regs));
ret = -EFAULT;
if (tmp)
goto out;
- info.regs32 = &regs;
+ info.regs32 = regs;
info.vm86plus.is_vm86pus = 1;
tsk->thread.vm86_info = (struct vm86_struct __user *)v86;
do_sys_vm86(&info, tsk);
@@ -323,7 +323,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
info->regs32->ax = 0;
tsk->thread.saved_sp0 = tsk->thread.sp0;
tsk->thread.saved_fs = info->regs32->fs;
- savesegment(gs, tsk->thread.saved_gs);
+ tsk->thread.saved_gs = get_user_gs(info->regs32);
tss = &per_cpu(init_tss, get_cpu());
tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0;
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index bef58b4982d..95deb9f2211 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -395,11 +395,6 @@ static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval)
vmi_ops.update_pte(ptep, VMI_PAGE_PT);
}
-static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
-{
- vmi_ops.set_pte(pte, ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1));
-}
-
static void vmi_set_pud(pud_t *pudp, pud_t pudval)
{
/* Um, eww */
@@ -680,10 +675,11 @@ static inline int __init activate_vmi(void)
para_fill(pv_mmu_ops.write_cr2, SetCR2);
para_fill(pv_mmu_ops.write_cr3, SetCR3);
para_fill(pv_cpu_ops.write_cr4, SetCR4);
- para_fill(pv_irq_ops.save_fl, GetInterruptMask);
- para_fill(pv_irq_ops.restore_fl, SetInterruptMask);
- para_fill(pv_irq_ops.irq_disable, DisableInterrupts);
- para_fill(pv_irq_ops.irq_enable, EnableInterrupts);
+
+ para_fill(pv_irq_ops.save_fl.func, GetInterruptMask);
+ para_fill(pv_irq_ops.restore_fl.func, SetInterruptMask);
+ para_fill(pv_irq_ops.irq_disable.func, DisableInterrupts);
+ para_fill(pv_irq_ops.irq_enable.func, EnableInterrupts);
para_fill(pv_cpu_ops.wbinvd, WBINVD);
para_fill(pv_cpu_ops.read_tsc, RDTSC);
@@ -749,7 +745,6 @@ static inline int __init activate_vmi(void)
pv_mmu_ops.set_pmd = vmi_set_pmd;
#ifdef CONFIG_X86_PAE
pv_mmu_ops.set_pte_atomic = vmi_set_pte_atomic;
- pv_mmu_ops.set_pte_present = vmi_set_pte_present;
pv_mmu_ops.set_pud = vmi_set_pud;
pv_mmu_ops.pte_clear = vmi_pte_clear;
pv_mmu_ops.pmd_clear = vmi_pmd_clear;
@@ -797,8 +792,8 @@ static inline int __init activate_vmi(void)
#endif
#ifdef CONFIG_X86_LOCAL_APIC
- para_fill(apic_ops->read, APICRead);
- para_fill(apic_ops->write, APICWrite);
+ para_fill(apic->read, APICRead);
+ para_fill(apic->write, APICWrite);
#endif
/*
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index e5b088fffa4..33a788d5879 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -28,7 +28,6 @@
#include <asm/vmi.h>
#include <asm/vmi_time.h>
-#include <asm/arch_hooks.h>
#include <asm/apicdef.h>
#include <asm/apic.h>
#include <asm/timer.h>
@@ -256,7 +255,7 @@ void __devinit vmi_time_bsp_init(void)
*/
clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
local_irq_disable();
-#ifdef CONFIG_X86_SMP
+#ifdef CONFIG_SMP
/*
* XXX handle_percpu_irq only defined for SMP; we need to switch over
* to using it, since this is a local interrupt, which each CPU must
@@ -288,8 +287,7 @@ static struct clocksource clocksource_vmi;
static cycle_t read_real_cycles(void)
{
cycle_t ret = (cycle_t)vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
- return ret >= clocksource_vmi.cycle_last ?
- ret : clocksource_vmi.cycle_last;
+ return max(ret, clocksource_vmi.cycle_last);
}
static struct clocksource clocksource_vmi = {
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 82c67559dde..62ad500d55f 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -12,7 +12,7 @@
#include <asm-generic/vmlinux.lds.h>
#include <asm/thread_info.h>
-#include <asm/page.h>
+#include <asm/page_types.h>
#include <asm/cache.h>
#include <asm/boot.h>
@@ -178,14 +178,7 @@ SECTIONS
__initramfs_end = .;
}
#endif
- . = ALIGN(PAGE_SIZE);
- .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
- __per_cpu_start = .;
- *(.data.percpu.page_aligned)
- *(.data.percpu)
- *(.data.percpu.shared_aligned)
- __per_cpu_end = .;
- }
+ PERCPU(PAGE_SIZE)
. = ALIGN(PAGE_SIZE);
/* freed after init ends here */
@@ -196,15 +189,24 @@ SECTIONS
*(.bss)
. = ALIGN(4);
__bss_stop = .;
- _end = . ;
- /* This is where the kernel creates the early boot page tables */
+ }
+
+ .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
. = ALIGN(PAGE_SIZE);
- pg0 = . ;
+ __brk_base = . ;
+ . += 64 * 1024 ; /* 64k alignment slop space */
+ *(.brk_reservation) /* areas brk users have reserved */
+ __brk_limit = . ;
+ }
+
+ .end : AT(ADDR(.end) - LOAD_OFFSET) {
+ _end = . ;
}
/* Sections to be discarded */
/DISCARD/ : {
*(.exitcall.exit)
+ *(.discard)
}
STABS_DEBUG
@@ -212,6 +214,12 @@ SECTIONS
DWARF_DEBUG
}
+/*
+ * Build-time check on the image size:
+ */
+ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
+ "kernel image bigger than KERNEL_IMAGE_SIZE")
+
#ifdef CONFIG_KEXEC
/* Link time checks */
#include <asm/kexec.h>
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 1a614c0e6be..c8742507b03 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -5,7 +5,8 @@
#define LOAD_OFFSET __START_KERNEL_map
#include <asm-generic/vmlinux.lds.h>
-#include <asm/page.h>
+#include <asm/asm-offsets.h>
+#include <asm/page_types.h>
#undef i386 /* in case the preprocessor is a 32bit one */
@@ -13,20 +14,23 @@ OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
OUTPUT_ARCH(i386:x86-64)
ENTRY(phys_startup_64)
jiffies_64 = jiffies;
-_proxy_pda = 1;
PHDRS {
text PT_LOAD FLAGS(5); /* R_E */
data PT_LOAD FLAGS(7); /* RWE */
user PT_LOAD FLAGS(7); /* RWE */
data.init PT_LOAD FLAGS(7); /* RWE */
+#ifdef CONFIG_SMP
+ percpu PT_LOAD FLAGS(7); /* RWE */
+#endif
+ data.init2 PT_LOAD FLAGS(7); /* RWE */
note PT_NOTE FLAGS(0); /* ___ */
}
SECTIONS
{
. = __START_KERNEL;
phys_startup_64 = startup_64 - LOAD_OFFSET;
- _text = .; /* Text and read-only data */
.text : AT(ADDR(.text) - LOAD_OFFSET) {
+ _text = .; /* Text and read-only data */
/* First the code that has to be first for bootstrapping */
*(.text.head)
_stext = .;
@@ -57,13 +61,13 @@ SECTIONS
.data : AT(ADDR(.data) - LOAD_OFFSET) {
DATA_DATA
CONSTRUCTORS
+ _edata = .; /* End of data section */
} :data
- _edata = .; /* End of data section */
- . = ALIGN(PAGE_SIZE);
- . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
.data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
+ . = ALIGN(PAGE_SIZE);
+ . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
*(.data.cacheline_aligned)
}
. = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
@@ -121,29 +125,29 @@ SECTIONS
#undef VVIRT_OFFSET
#undef VVIRT
- . = ALIGN(THREAD_SIZE); /* init_task */
.data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
+ . = ALIGN(THREAD_SIZE); /* init_task */
*(.data.init_task)
}:data.init
- . = ALIGN(PAGE_SIZE);
.data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
+ . = ALIGN(PAGE_SIZE);
*(.data.page_aligned)
}
- /* might get freed after init */
- . = ALIGN(PAGE_SIZE);
- __smp_alt_begin = .;
- __smp_locks = .;
.smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
+ /* might get freed after init */
+ . = ALIGN(PAGE_SIZE);
+ __smp_alt_begin = .;
+ __smp_locks = .;
*(.smp_locks)
+ __smp_locks_end = .;
+ . = ALIGN(PAGE_SIZE);
+ __smp_alt_end = .;
}
- __smp_locks_end = .;
- . = ALIGN(PAGE_SIZE);
- __smp_alt_end = .;
. = ALIGN(PAGE_SIZE); /* Init code and data */
- __init_begin = .;
+ __init_begin = .; /* paired with __init_end */
.init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
_sinittext = .;
INIT_TEXT
@@ -155,40 +159,42 @@ SECTIONS
__initdata_end = .;
}
- . = ALIGN(16);
- __setup_start = .;
- .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) }
- __setup_end = .;
- __initcall_start = .;
+ .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
+ . = ALIGN(16);
+ __setup_start = .;
+ *(.init.setup)
+ __setup_end = .;
+ }
.initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
+ __initcall_start = .;
INITCALLS
+ __initcall_end = .;
}
- __initcall_end = .;
- __con_initcall_start = .;
.con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
+ __con_initcall_start = .;
*(.con_initcall.init)
+ __con_initcall_end = .;
}
- __con_initcall_end = .;
- __x86_cpu_dev_start = .;
.x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
+ __x86_cpu_dev_start = .;
*(.x86_cpu_dev.init)
+ __x86_cpu_dev_end = .;
}
- __x86_cpu_dev_end = .;
SECURITY_INIT
. = ALIGN(8);
.parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
- __parainstructions = .;
+ __parainstructions = .;
*(.parainstructions)
- __parainstructions_end = .;
+ __parainstructions_end = .;
}
- . = ALIGN(8);
- __alt_instructions = .;
.altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
+ . = ALIGN(8);
+ __alt_instructions = .;
*(.altinstructions)
+ __alt_instructions_end = .;
}
- __alt_instructions_end = .;
.altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
*(.altinstr_replacement)
}
@@ -203,28 +209,53 @@ SECTIONS
#ifdef CONFIG_BLK_DEV_INITRD
. = ALIGN(PAGE_SIZE);
- __initramfs_start = .;
- .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) }
- __initramfs_end = .;
+ .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
+ __initramfs_start = .;
+ *(.init.ramfs)
+ __initramfs_end = .;
+ }
#endif
+#ifdef CONFIG_SMP
+ /*
+ * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the
+ * output PHDR, so the next output section - __data_nosave - should
+ * start another section data.init2. Also, pda should be at the head of
+ * percpu area. Preallocate it and define the percpu offset symbol
+ * so that it can be accessed as a percpu variable.
+ */
+ . = ALIGN(PAGE_SIZE);
+ PERCPU_VADDR(0, :percpu)
+#else
PERCPU(PAGE_SIZE)
+#endif
. = ALIGN(PAGE_SIZE);
__init_end = .;
- . = ALIGN(PAGE_SIZE);
- __nosave_begin = .;
- .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) }
- . = ALIGN(PAGE_SIZE);
- __nosave_end = .;
+ .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
+ . = ALIGN(PAGE_SIZE);
+ __nosave_begin = .;
+ *(.data.nosave)
+ . = ALIGN(PAGE_SIZE);
+ __nosave_end = .;
+ } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */
- __bss_start = .; /* BSS */
.bss : AT(ADDR(.bss) - LOAD_OFFSET) {
+ . = ALIGN(PAGE_SIZE);
+ __bss_start = .; /* BSS */
*(.bss.page_aligned)
*(.bss)
- }
- __bss_stop = .;
+ __bss_stop = .;
+ }
+
+ .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
+ . = ALIGN(PAGE_SIZE);
+ __brk_base = . ;
+ . += 64 * 1024 ; /* 64k alignment slop space */
+ *(.brk_reservation) /* areas brk users have reserved */
+ __brk_limit = . ;
+ }
_end = . ;
@@ -232,6 +263,7 @@ SECTIONS
/DISCARD/ : {
*(.exitcall.exit)
*(.eh_frame)
+ *(.discard)
}
STABS_DEBUG
@@ -239,8 +271,28 @@ SECTIONS
DWARF_DEBUG
}
+ /*
+ * Per-cpu symbols which need to be offset from __per_cpu_load
+ * for the boot processor.
+ */
+#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load
+INIT_PER_CPU(gdt_page);
+INIT_PER_CPU(irq_stack_union);
+
/*
* Build-time check on the image size:
*/
ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
"kernel image bigger than KERNEL_IMAGE_SIZE")
+
+#ifdef CONFIG_SMP
+ASSERT((per_cpu__irq_stack_union == 0),
+ "irq_stack_union is not at start of per-cpu area");
+#endif
+
+#ifdef CONFIG_KEXEC
+#include <asm/kexec.h>
+
+ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
+ "kexec control code size is too big")
+#endif
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index a688f3bfaec..74de562812c 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -22,7 +22,7 @@
#include <asm/paravirt.h>
#include <asm/setup.h>
-#if defined CONFIG_PCI && defined CONFIG_PARAVIRT
+#ifdef CONFIG_PARAVIRT
/*
* Interrupt control on vSMPowered systems:
* ~AC is a shadow of IF. If IF is 'on' AC should be 'off'
@@ -37,6 +37,7 @@ static unsigned long vsmp_save_fl(void)
flags &= ~X86_EFLAGS_IF;
return flags;
}
+PV_CALLEE_SAVE_REGS_THUNK(vsmp_save_fl);
static void vsmp_restore_fl(unsigned long flags)
{
@@ -46,6 +47,7 @@ static void vsmp_restore_fl(unsigned long flags)
flags |= X86_EFLAGS_AC;
native_restore_fl(flags);
}
+PV_CALLEE_SAVE_REGS_THUNK(vsmp_restore_fl);
static void vsmp_irq_disable(void)
{
@@ -53,6 +55,7 @@ static void vsmp_irq_disable(void)
native_restore_fl((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC);
}
+PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_disable);
static void vsmp_irq_enable(void)
{
@@ -60,6 +63,7 @@ static void vsmp_irq_enable(void)
native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
}
+PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_enable);
static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf,
unsigned long addr, unsigned len)
@@ -90,10 +94,10 @@ static void __init set_vsmp_pv_ops(void)
cap, ctl);
if (cap & ctl & (1 << 4)) {
/* Setup irq ops and turn on vSMP IRQ fastpath handling */
- pv_irq_ops.irq_disable = vsmp_irq_disable;
- pv_irq_ops.irq_enable = vsmp_irq_enable;
- pv_irq_ops.save_fl = vsmp_save_fl;
- pv_irq_ops.restore_fl = vsmp_restore_fl;
+ pv_irq_ops.irq_disable = PV_CALLEE_SAVE(vsmp_irq_disable);
+ pv_irq_ops.irq_enable = PV_CALLEE_SAVE(vsmp_irq_enable);
+ pv_irq_ops.save_fl = PV_CALLEE_SAVE(vsmp_save_fl);
+ pv_irq_ops.restore_fl = PV_CALLEE_SAVE(vsmp_restore_fl);
pv_init_ops.patch = vsmp_patch;
ctl &= ~(1 << 4);
@@ -110,7 +114,6 @@ static void __init set_vsmp_pv_ops(void)
}
#endif
-#ifdef CONFIG_PCI
static int is_vsmp = -1;
static void __init detect_vsmp_box(void)
@@ -135,15 +138,6 @@ int is_vsmp_box(void)
return 0;
}
}
-#else
-static void __init detect_vsmp_box(void)
-{
-}
-int is_vsmp_box(void)
-{
- return 0;
-}
-#endif
void __init vsmp_init(void)
{
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 695e426aa35..3909e3ba5ce 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -58,5 +58,3 @@ EXPORT_SYMBOL(__memcpy);
EXPORT_SYMBOL(empty_zero_page);
EXPORT_SYMBOL(init_level4_pgt);
EXPORT_SYMBOL(load_gs_index);
-
-EXPORT_SYMBOL(_proxy_pda);