aboutsummaryrefslogtreecommitdiff
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/.gitignore1
-rw-r--r--arch/x86/kernel/Makefile29
-rw-r--r--arch/x86/kernel/acpi/boot.c442
-rw-r--r--arch/x86/kernel/acpi/processor.c6
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.S38
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.h5
-rw-r--r--arch/x86/kernel/acpi/sleep.c40
-rw-r--r--arch/x86/kernel/alternative.c22
-rw-r--r--arch/x86/kernel/amd_iommu.c962
-rw-r--r--arch/x86/kernel/amd_iommu_init.c875
-rw-r--r--arch/x86/kernel/aperture_64.c313
-rw-r--r--arch/x86/kernel/apic_32.c127
-rw-r--r--arch/x86/kernel/apic_64.c68
-rw-r--r--arch/x86/kernel/apm_32.c37
-rw-r--r--arch/x86/kernel/asm-offsets_32.c2
-rw-r--r--arch/x86/kernel/asm-offsets_64.c7
-rw-r--r--arch/x86/kernel/cpu/Makefile4
-rw-r--r--arch/x86/kernel/cpu/addon_cpuid_features.c17
-rw-r--r--arch/x86/kernel/cpu/amd.c42
-rw-r--r--arch/x86/kernel/cpu/amd_64.c222
-rw-r--r--arch/x86/kernel/cpu/bugs.c27
-rw-r--r--arch/x86/kernel/cpu/bugs_64.c (renamed from arch/x86/kernel/bugs_64.c)0
-rw-r--r--arch/x86/kernel/cpu/centaur_64.c35
-rw-r--r--arch/x86/kernel/cpu/common.c2
-rw-r--r--arch/x86/kernel/cpu/common_64.c681
-rw-r--r--arch/x86/kernel/cpu/cpu.h5
-rw-r--r--arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c44
-rw-r--r--arch/x86/kernel/cpu/intel.c4
-rw-r--r--arch/x86/kernel/cpu/intel_64.c95
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c1
-rw-r--r--arch/x86/kernel/cpu/mcheck/k7.c36
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_64.c34
-rw-r--r--arch/x86/kernel/cpu/mcheck/non-fatal.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/p4.c90
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c38
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c905
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.h3
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c210
-rw-r--r--arch/x86/kernel/cpuid.c27
-rw-r--r--arch/x86/kernel/e820.c1390
-rw-r--r--arch/x86/kernel/e820_32.c775
-rw-r--r--arch/x86/kernel/e820_64.c952
-rw-r--r--arch/x86/kernel/early-quirks.c41
-rw-r--r--arch/x86/kernel/early_printk.c2
-rw-r--r--arch/x86/kernel/efi.c67
-rw-r--r--arch/x86/kernel/efi_32.c8
-rw-r--r--arch/x86/kernel/efi_64.c8
-rw-r--r--arch/x86/kernel/entry_32.S83
-rw-r--r--arch/x86/kernel/entry_64.S170
-rw-r--r--arch/x86/kernel/ftrace.c141
-rw-r--r--arch/x86/kernel/genapic_64.c2
-rw-r--r--arch/x86/kernel/genx2apic_uv_x.c214
-rw-r--r--arch/x86/kernel/geode_32.c5
-rw-r--r--arch/x86/kernel/head.c55
-rw-r--r--arch/x86/kernel/head32.c27
-rw-r--r--arch/x86/kernel/head64.c96
-rw-r--r--arch/x86/kernel/head_32.S13
-rw-r--r--arch/x86/kernel/head_64.S101
-rw-r--r--arch/x86/kernel/hpet.c63
-rw-r--r--arch/x86/kernel/i386_ksyms_32.c9
-rw-r--r--arch/x86/kernel/i8259.c (renamed from arch/x86/kernel/i8259_32.c)136
-rw-r--r--arch/x86/kernel/i8259_64.c512
-rw-r--r--arch/x86/kernel/io_apic_32.c678
-rw-r--r--arch/x86/kernel/io_apic_64.c282
-rw-r--r--arch/x86/kernel/ipi.c1
-rw-r--r--arch/x86/kernel/irq_32.c254
-rw-r--r--arch/x86/kernel/irq_64.c28
-rw-r--r--arch/x86/kernel/irqinit_32.c114
-rw-r--r--arch/x86/kernel/irqinit_64.c221
-rw-r--r--arch/x86/kernel/kvmclock.c89
-rw-r--r--arch/x86/kernel/ldt.c6
-rw-r--r--arch/x86/kernel/machine_kexec_32.c8
-rw-r--r--arch/x86/kernel/machine_kexec_64.c6
-rw-r--r--arch/x86/kernel/microcode.c35
-rw-r--r--arch/x86/kernel/mmconf-fam10h_64.c1
-rw-r--r--arch/x86/kernel/mpparse.c847
-rw-r--r--arch/x86/kernel/msr.c16
-rw-r--r--arch/x86/kernel/nmi.c (renamed from arch/x86/kernel/nmi_32.c)233
-rw-r--r--arch/x86/kernel/nmi_64.c482
-rw-r--r--arch/x86/kernel/numaq_32.c32
-rw-r--r--arch/x86/kernel/paravirt.c36
-rw-r--r--arch/x86/kernel/paravirt_patch_32.c4
-rw-r--r--arch/x86/kernel/paravirt_patch_64.c9
-rw-r--r--arch/x86/kernel/pci-calgary_64.c4
-rw-r--r--arch/x86/kernel/pci-dma.c22
-rw-r--r--arch/x86/kernel/pci-gart_64.c95
-rw-r--r--arch/x86/kernel/pci-swiotlb_64.c2
-rw-r--r--arch/x86/kernel/probe_roms_32.c166
-rw-r--r--arch/x86/kernel/process.c220
-rw-r--r--arch/x86/kernel/process_32.c69
-rw-r--r--arch/x86/kernel/process_64.c89
-rw-r--r--arch/x86/kernel/ptrace.c4
-rw-r--r--arch/x86/kernel/pvclock.c141
-rw-r--r--arch/x86/kernel/quirks.c60
-rw-r--r--arch/x86/kernel/reboot.c18
-rw-r--r--arch/x86/kernel/reboot_fixups_32.c4
-rw-r--r--arch/x86/kernel/setup.c929
-rw-r--r--arch/x86/kernel/setup64.c287
-rw-r--r--arch/x86/kernel/setup_32.c958
-rw-r--r--arch/x86/kernel/setup_64.c1194
-rw-r--r--arch/x86/kernel/setup_percpu.c399
-rw-r--r--arch/x86/kernel/smp.c158
-rw-r--r--arch/x86/kernel/smpboot.c217
-rw-r--r--arch/x86/kernel/smpcommon.c56
-rw-r--r--arch/x86/kernel/srat_32.c358
-rw-r--r--arch/x86/kernel/stacktrace.c2
-rw-r--r--arch/x86/kernel/summit_32.c2
-rw-r--r--arch/x86/kernel/sys_i386_32.c64
-rw-r--r--arch/x86/kernel/time_32.c6
-rw-r--r--arch/x86/kernel/time_64.c16
-rw-r--r--arch/x86/kernel/tlb_32.c2
-rw-r--r--arch/x86/kernel/tlb_64.c7
-rw-r--r--arch/x86/kernel/tlb_uv.c792
-rw-r--r--arch/x86/kernel/trampoline.c2
-rw-r--r--arch/x86/kernel/traps_32.c190
-rw-r--r--arch/x86/kernel/traps_64.c541
-rw-r--r--arch/x86/kernel/tsc.c535
-rw-r--r--arch/x86/kernel/tsc_32.c453
-rw-r--r--arch/x86/kernel/tsc_64.c357
-rw-r--r--arch/x86/kernel/visws_quirks.c709
-rw-r--r--arch/x86/kernel/vmi_32.c6
-rw-r--r--arch/x86/kernel/vmiclock_32.c7
-rw-r--r--arch/x86/kernel/vmlinux_32.lds.S15
-rw-r--r--arch/x86/kernel/vmlinux_64.lds.S18
-rw-r--r--arch/x86/kernel/vsmp_64.c3
-rw-r--r--arch/x86/kernel/vsyscall_64.c19
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c16
127 files changed, 13528 insertions, 9434 deletions
diff --git a/arch/x86/kernel/.gitignore b/arch/x86/kernel/.gitignore
index 4ea38a39aed..08f4fd73146 100644
--- a/arch/x86/kernel/.gitignore
+++ b/arch/x86/kernel/.gitignore
@@ -1,2 +1,3 @@
vsyscall.lds
vsyscall_32.lds
+vmlinux.lds
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5e618c3b472..da140611bb5 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -2,10 +2,16 @@
# Makefile for the linux kernel.
#
-extra-y := head_$(BITS).o head$(BITS).o init_task.o vmlinux.lds
+extra-y := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinux.lds
CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
+ifdef CONFIG_FTRACE
+# Do not profile debug utilities
+CFLAGS_REMOVE_tsc.o = -pg
+CFLAGS_REMOVE_rtc.o = -pg
+endif
+
#
# vsyscalls (which work on the user stack) should have
# no stack-protector checks:
@@ -13,20 +19,21 @@ CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
nostackp := $(call cc-option, -fno-stack-protector)
CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp)
CFLAGS_hpet.o := $(nostackp)
-CFLAGS_tsc_64.o := $(nostackp)
+CFLAGS_tsc.o := $(nostackp)
obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
obj-y += traps_$(BITS).o irq_$(BITS).o
obj-y += time_$(BITS).o ioport.o ldt.o
-obj-y += setup_$(BITS).o i8259_$(BITS).o setup.o
+obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o
+obj-$(CONFIG_X86_VISWS) += visws_quirks.o
+obj-$(CONFIG_X86_32) += probe_roms_32.o
obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
-obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o setup64.o
-obj-y += bootflag.o e820_$(BITS).o
+obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o
+obj-y += bootflag.o e820.o
obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
obj-y += alternative.o i8253.o pci-nommu.o
-obj-$(CONFIG_X86_64) += bugs_64.o
-obj-y += tsc_$(BITS).o io_delay.o rtc.o
+obj-y += tsc.o io_delay.o rtc.o
obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
obj-y += process.o
@@ -53,9 +60,10 @@ obj-$(CONFIG_X86_32_SMP) += smpcommon.o
obj-$(CONFIG_X86_64_SMP) += tsc_sync.o smpcommon.o
obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o
obj-$(CONFIG_X86_MPPARSE) += mpparse.o
-obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi_$(BITS).o
+obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi.o
obj-$(CONFIG_X86_IO_APIC) += io_apic_$(BITS).o
obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
+obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
@@ -64,7 +72,6 @@ obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o
obj-y += vsmp_64.o
obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_MODULES) += module_$(BITS).o
-obj-$(CONFIG_ACPI_SRAT) += srat_32.o
obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o
obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
obj-$(CONFIG_KGDB) += kgdb.o
@@ -82,6 +89,7 @@ obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
obj-$(CONFIG_KVM_GUEST) += kvm.o
obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
+obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
@@ -93,12 +101,13 @@ obj-$(CONFIG_OLPC) += olpc.o
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
- obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o
+ obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o
obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
obj-$(CONFIG_AUDIT) += audit_64.o
obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o
obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
+ obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o
obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o
obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 33c5216fd3e..f489d7a9be9 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -37,6 +37,7 @@
#include <asm/pgtable.h>
#include <asm/io_apic.h>
#include <asm/apic.h>
+#include <asm/genapic.h>
#include <asm/io.h>
#include <asm/mpspec.h>
#include <asm/smp.h>
@@ -106,21 +107,6 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
*/
enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC;
-#ifdef CONFIG_X86_64
-
-/* rely on all ACPI tables being in the direct mapping */
-char *__init __acpi_map_table(unsigned long phys_addr, unsigned long size)
-{
- if (!phys_addr || !size)
- return NULL;
-
- if (phys_addr+size <= (max_pfn_mapped << PAGE_SHIFT) + PAGE_SIZE)
- return __va(phys_addr);
-
- return NULL;
-}
-
-#else
/*
* Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END,
@@ -139,11 +125,15 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size)
unsigned long base, offset, mapped_size;
int idx;
- if (phys + size < 8 * 1024 * 1024)
+ if (!phys || !size)
+ return NULL;
+
+ if (phys+size <= (max_low_pfn_mapped << PAGE_SHIFT))
return __va(phys);
offset = phys & (PAGE_SIZE - 1);
mapped_size = PAGE_SIZE - offset;
+ clear_fixmap(FIX_ACPI_END);
set_fixmap(FIX_ACPI_END, phys);
base = fix_to_virt(FIX_ACPI_END);
@@ -155,13 +145,13 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size)
if (--idx < FIX_ACPI_BEGIN)
return NULL; /* cannot handle this */
phys += PAGE_SIZE;
+ clear_fixmap(idx);
set_fixmap(idx, phys);
mapped_size += PAGE_SIZE;
}
return ((unsigned char *)base + offset);
}
-#endif
#ifdef CONFIG_PCI_MMCONFIG
/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */
@@ -338,8 +328,6 @@ acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long e
#ifdef CONFIG_X86_IO_APIC
-struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS];
-
static int __init
acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
{
@@ -514,8 +502,6 @@ int acpi_register_gsi(u32 gsi, int triggering, int polarity)
* Make sure all (legacy) PCI IRQs are set as level-triggered.
*/
if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
- extern void eisa_set_level_irq(unsigned int irq);
-
if (triggering == ACPI_LEVEL_SENSITIVE)
eisa_set_level_irq(gsi);
}
@@ -860,6 +846,364 @@ static int __init acpi_parse_madt_lapic_entries(void)
#endif /* CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_X86_IO_APIC
+#define MP_ISA_BUS 0
+
+#ifdef CONFIG_X86_ES7000
+extern int es7000_plat;
+#endif
+
+static struct {
+ int apic_id;
+ int gsi_base;
+ int gsi_end;
+ DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
+} mp_ioapic_routing[MAX_IO_APICS];
+
+static int mp_find_ioapic(int gsi)
+{
+ int i = 0;
+
+ /* Find the IOAPIC that manages this GSI. */
+ for (i = 0; i < nr_ioapics; i++) {
+ if ((gsi >= mp_ioapic_routing[i].gsi_base)
+ && (gsi <= mp_ioapic_routing[i].gsi_end))
+ return i;
+ }
+
+ printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
+ return -1;
+}
+
+static u8 __init uniq_ioapic_id(u8 id)
+{
+#ifdef CONFIG_X86_32
+ if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
+ !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+ return io_apic_get_unique_id(nr_ioapics, id);
+ else
+ return id;
+#else
+ int i;
+ DECLARE_BITMAP(used, 256);
+ bitmap_zero(used, 256);
+ for (i = 0; i < nr_ioapics; i++) {
+ struct mp_config_ioapic *ia = &mp_ioapics[i];
+ __set_bit(ia->mp_apicid, used);
+ }
+ if (!test_bit(id, used))
+ return id;
+ return find_first_zero_bit(used, 256);
+#endif
+}
+
+static int bad_ioapic(unsigned long address)
+{
+ if (nr_ioapics >= MAX_IO_APICS) {
+ printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
+ "(found %d)\n", MAX_IO_APICS, nr_ioapics);
+ panic("Recompile kernel with bigger MAX_IO_APICS!\n");
+ }
+ if (!address) {
+ printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
+ " found in table, skipping!\n");
+ return 1;
+ }
+ return 0;
+}
+
+void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
+{
+ int idx = 0;
+
+ if (bad_ioapic(address))
+ return;
+
+ idx = nr_ioapics;
+
+ mp_ioapics[idx].mp_type = MP_IOAPIC;
+ mp_ioapics[idx].mp_flags = MPC_APIC_USABLE;
+ mp_ioapics[idx].mp_apicaddr = address;
+
+ set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
+ mp_ioapics[idx].mp_apicid = uniq_ioapic_id(id);
+#ifdef CONFIG_X86_32
+ mp_ioapics[idx].mp_apicver = io_apic_get_version(idx);
+#else
+ mp_ioapics[idx].mp_apicver = 0;
+#endif
+ /*
+ * Build basic GSI lookup table to facilitate gsi->io_apic lookups
+ * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
+ */
+ mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mp_apicid;
+ mp_ioapic_routing[idx].gsi_base = gsi_base;
+ mp_ioapic_routing[idx].gsi_end = gsi_base +
+ io_apic_get_redir_entries(idx);
+
+ printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, "
+ "GSI %d-%d\n", idx, mp_ioapics[idx].mp_apicid,
+ mp_ioapics[idx].mp_apicver, mp_ioapics[idx].mp_apicaddr,
+ mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
+
+ nr_ioapics++;
+}
+
+static void assign_to_mp_irq(struct mp_config_intsrc *m,
+ struct mp_config_intsrc *mp_irq)
+{
+ memcpy(mp_irq, m, sizeof(struct mp_config_intsrc));
+}
+
+static int mp_irq_cmp(struct mp_config_intsrc *mp_irq,
+ struct mp_config_intsrc *m)
+{
+ return memcmp(mp_irq, m, sizeof(struct mp_config_intsrc));
+}
+
+static void save_mp_irq(struct mp_config_intsrc *m)
+{
+ int i;
+
+ for (i = 0; i < mp_irq_entries; i++) {
+ if (!mp_irq_cmp(&mp_irqs[i], m))
+ return;
+ }
+
+ assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
+ if (++mp_irq_entries == MAX_IRQ_SOURCES)
+ panic("Max # of irq sources exceeded!!\n");
+}
+
+void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
+{
+ int ioapic;
+ int pin;
+ struct mp_config_intsrc mp_irq;
+
+ /*
+ * Convert 'gsi' to 'ioapic.pin'.
+ */
+ ioapic = mp_find_ioapic(gsi);
+ if (ioapic < 0)
+ return;
+ pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+
+ /*
+ * TBD: This check is for faulty timer entries, where the override
+ * erroneously sets the trigger to level, resulting in a HUGE
+ * increase of timer interrupts!
+ */
+ if ((bus_irq == 0) && (trigger == 3))
+ trigger = 1;
+
+ mp_irq.mp_type = MP_INTSRC;
+ mp_irq.mp_irqtype = mp_INT;
+ mp_irq.mp_irqflag = (trigger << 2) | polarity;
+ mp_irq.mp_srcbus = MP_ISA_BUS;
+ mp_irq.mp_srcbusirq = bus_irq; /* IRQ */
+ mp_irq.mp_dstapic = mp_ioapics[ioapic].mp_apicid; /* APIC ID */
+ mp_irq.mp_dstirq = pin; /* INTIN# */
+
+ save_mp_irq(&mp_irq);
+}
+
+void __init mp_config_acpi_legacy_irqs(void)
+{
+ int i;
+ int ioapic;
+ unsigned int dstapic;
+ struct mp_config_intsrc mp_irq;
+
+#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
+ /*
+ * Fabricate the legacy ISA bus (bus #31).
+ */
+ mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
+#endif
+ set_bit(MP_ISA_BUS, mp_bus_not_pci);
+ Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
+
+#ifdef CONFIG_X86_ES7000
+ /*
+ * Older generations of ES7000 have no legacy identity mappings
+ */
+ if (es7000_plat == 1)
+ return;
+#endif
+
+ /*
+ * Locate the IOAPIC that manages the ISA IRQs (0-15).
+ */
+ ioapic = mp_find_ioapic(0);
+ if (ioapic < 0)
+ return;
+ dstapic = mp_ioapics[ioapic].mp_apicid;
+
+ /*
+ * Use the default configuration for the IRQs 0-15. Unless
+ * overridden by (MADT) interrupt source override entries.
+ */
+ for (i = 0; i < 16; i++) {
+ int idx;
+
+ for (idx = 0; idx < mp_irq_entries; idx++) {
+ struct mp_config_intsrc *irq = mp_irqs + idx;
+
+ /* Do we already have a mapping for this ISA IRQ? */
+ if (irq->mp_srcbus == MP_ISA_BUS
+ && irq->mp_srcbusirq == i)
+ break;
+
+ /* Do we already have a mapping for this IOAPIC pin */
+ if (irq->mp_dstapic == dstapic &&
+ irq->mp_dstirq == i)
+ break;
+ }
+
+ if (idx != mp_irq_entries) {
+ printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
+ continue; /* IRQ already used */
+ }
+
+ mp_irq.mp_type = MP_INTSRC;
+ mp_irq.mp_irqflag = 0; /* Conforming */
+ mp_irq.mp_srcbus = MP_ISA_BUS;
+ mp_irq.mp_dstapic = dstapic;
+ mp_irq.mp_irqtype = mp_INT;
+ mp_irq.mp_srcbusirq = i; /* Identity mapped */
+ mp_irq.mp_dstirq = i;
+
+ save_mp_irq(&mp_irq);
+ }
+}
+
+int mp_register_gsi(u32 gsi, int triggering, int polarity)
+{
+ int ioapic;
+ int ioapic_pin;
+#ifdef CONFIG_X86_32
+#define MAX_GSI_NUM 4096
+#define IRQ_COMPRESSION_START 64
+
+ static int pci_irq = IRQ_COMPRESSION_START;
+ /*
+ * Mapping between Global System Interrupts, which
+ * represent all possible interrupts, and IRQs
+ * assigned to actual devices.
+ */
+ static int gsi_to_irq[MAX_GSI_NUM];
+#else
+
+ if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
+ return gsi;
+#endif
+
+ /* Don't set up the ACPI SCI because it's already set up */
+ if (acpi_gbl_FADT.sci_interrupt == gsi)
+ return gsi;
+
+ ioapic = mp_find_ioapic(gsi);
+ if (ioapic < 0) {
+ printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
+ return gsi;
+ }
+
+ ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+
+#ifdef CONFIG_X86_32
+ if (ioapic_renumber_irq)
+ gsi = ioapic_renumber_irq(ioapic, gsi);
+#endif
+
+ /*
+ * Avoid pin reprogramming. PRTs typically include entries
+ * with redundant pin->gsi mappings (but unique PCI devices);
+ * we only program the IOAPIC on the first.
+ */
+ if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
+ printk(KERN_ERR "Invalid reference to IOAPIC pin "
+ "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
+ ioapic_pin);
+ return gsi;
+ }
+ if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
+ Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
+ mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
+#ifdef CONFIG_X86_32
+ return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
+#else
+ return gsi;
+#endif
+ }
+
+ set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
+#ifdef CONFIG_X86_32
+ /*
+ * For GSI >= 64, use IRQ compression
+ */
+ if ((gsi >= IRQ_COMPRESSION_START)
+ && (triggering == ACPI_LEVEL_SENSITIVE)) {
+ /*
+ * For PCI devices assign IRQs in order, avoiding gaps
+ * due to unused I/O APIC pins.
+ */
+ int irq = gsi;
+ if (gsi < MAX_GSI_NUM) {
+ /*
+ * Retain the VIA chipset work-around (gsi > 15), but
+ * avoid a problem where the 8254 timer (IRQ0) is setup
+ * via an override (so it's not on pin 0 of the ioapic),
+ * and at the same time, the pin 0 interrupt is a PCI
+ * type. The gsi > 15 test could cause these two pins
+ * to be shared as IRQ0, and they are not shareable.
+ * So test for this condition, and if necessary, avoid
+ * the pin collision.
+ */
+ gsi = pci_irq++;
+ /*
+ * Don't assign IRQ used by ACPI SCI
+ */
+ if (gsi == acpi_gbl_FADT.sci_interrupt)
+ gsi = pci_irq++;
+ gsi_to_irq[irq] = gsi;
+ } else {
+ printk(KERN_ERR "GSI %u is too high\n", gsi);
+ return gsi;
+ }
+ }
+#endif
+ io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
+ triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
+ polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
+ return gsi;
+}
+
+int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
+ u32 gsi, int triggering, int polarity)
+{
+#ifdef CONFIG_X86_MPPARSE
+ struct mp_config_intsrc mp_irq;
+ int ioapic;
+
+ if (!acpi_ioapic)
+ return 0;
+
+ /* print the entry should happen on mptable identically */
+ mp_irq.mp_type = MP_INTSRC;
+ mp_irq.mp_irqtype = mp_INT;
+ mp_irq.mp_irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
+ (polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
+ mp_irq.mp_srcbus = number;
+ mp_irq.mp_srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
+ ioapic = mp_find_ioapic(gsi);
+ mp_irq.mp_dstapic = mp_ioapic_routing[ioapic].apic_id;
+ mp_irq.mp_dstirq = gsi - mp_ioapic_routing[ioapic].gsi_base;
+
+ save_mp_irq(&mp_irq);
+#endif
+ return 0;
+}
+
/*
* Parse IOAPIC related entries in MADT
* returns 0 on success, < 0 on error
@@ -1009,8 +1353,6 @@ static void __init acpi_process_madt(void)
return;
}
-#ifdef __i386__
-
static int __init disable_acpi_irq(const struct dmi_system_id *d)
{
if (!acpi_force) {
@@ -1061,6 +1403,16 @@ static int __init force_acpi_ht(const struct dmi_system_id *d)
}
/*
+ * Force ignoring BIOS IRQ0 pin2 override
+ */
+static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
+{
+ pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n", d->ident);
+ acpi_skip_timer_override = 1;
+ return 0;
+}
+
+/*
* If your system is blacklisted here, but you find that acpi=force
* works for you, please contact acpi-devel@sourceforge.net
*/
@@ -1227,11 +1579,35 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
},
},
+ /*
+ * HP laptops which use a DSDT reporting as HP/SB400/10000,
+ * which includes some code which overrides all temperature
+ * trip points to 16C if the INTIN2 input of the I/O APIC
+ * is enabled. This input is incorrectly designated the
+ * ISA IRQ 0 via an interrupt source override even though
+ * it is wired to the output of the master 8259A and INTIN0
+ * is not connected at all. Force ignoring BIOS IRQ0 pin2
+ * override in that cases.
+ */
+ {
+ .callback = dmi_ignore_irq0_timer_override,
+ .ident = "HP NX6125 laptop",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6125"),
+ },
+ },
+ {
+ .callback = dmi_ignore_irq0_timer_override,
+ .ident = "HP NX6325 laptop",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6325"),
+ },
+ },
{}
};
-#endif /* __i386__ */
-
/*
* acpi_boot_table_init() and acpi_boot_init()
* called from setup_arch(), always.
@@ -1259,9 +1635,7 @@ int __init acpi_boot_table_init(void)
{
int error;
-#ifdef __i386__
dmi_check_system(acpi_dmi_table);
-#endif
/*
* If acpi_disabled, bail out
@@ -1386,6 +1760,20 @@ static int __init parse_pci(char *arg)
}
early_param("pci", parse_pci);
+int __init acpi_mps_check(void)
+{
+#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_X86_MPPARSE)
+/* mptable code is not built-in*/
+ if (acpi_disabled || acpi_noirq) {
+ printk(KERN_WARNING "MPS support code is not built-in.\n"
+ "Using acpi=off or acpi=noirq or pci=noacpi "
+ "may have problem\n");
+ return 1;
+ }
+#endif
+ return 0;
+}
+
#ifdef CONFIG_X86_IO_APIC
static int __init parse_acpi_skip_timer_override(char *arg)
{
diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c
index de2d2e4ebad..7c074eec39f 100644
--- a/arch/x86/kernel/acpi/processor.c
+++ b/arch/x86/kernel/acpi/processor.c
@@ -56,6 +56,12 @@ static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c)
if (cpu_has(c, X86_FEATURE_ACPI))
buf[2] |= ACPI_PDC_T_FFH;
+ /*
+ * If mwait/monitor is unsupported, C2/C3_FFH will be disabled
+ */
+ if (!cpu_has(c, X86_FEATURE_MWAIT))
+ buf[2] &= ~(ACPI_PDC_C_C2C3_FFH);
+
obj->type = ACPI_TYPE_BUFFER;
obj->buffer.length = 12;
obj->buffer.pointer = (u8 *) buf;
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S
index f9b77fb37e5..3355973b12a 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.S
@@ -5,6 +5,7 @@
#include <asm/msr-index.h>
#include <asm/page.h>
#include <asm/pgtable.h>
+#include <asm/processor-flags.h>
.code16
.section ".header", "a"
@@ -24,6 +25,11 @@ pmode_gdt: .quad 0
realmode_flags: .long 0
real_magic: .long 0
trampoline_segment: .word 0
+_pad1: .byte 0
+wakeup_jmp: .byte 0xea /* ljmpw */
+wakeup_jmp_off: .word 3f
+wakeup_jmp_seg: .word 0
+wakeup_gdt: .quad 0, 0, 0
signature: .long 0x51ee1111
.text
@@ -34,11 +40,34 @@ _start:
cli
cld
+ /* Apparently some dimwit BIOS programmers don't know how to
+ program a PM to RM transition, and we might end up here with
+ junk in the data segment descriptor registers. The only way
+ to repair that is to go into PM and fix it ourselves... */
+ movw $16, %cx
+ lgdtl %cs:wakeup_gdt
+ movl %cr0, %eax
+ orb $X86_CR0_PE, %al
+ movl %eax, %cr0
+ jmp 1f
+1: ljmpw $8, $2f
+2:
+ movw %cx, %ds
+ movw %cx, %es
+ movw %cx, %ss
+ movw %cx, %fs
+ movw %cx, %gs
+
+ andb $~X86_CR0_PE, %al
+ movl %eax, %cr0
+ jmp wakeup_jmp
+3:
/* Set up segments */
movw %cs, %ax
movw %ax, %ds
movw %ax, %es
movw %ax, %ss
+ lidtl wakeup_idt
movl $wakeup_stack_end, %esp
@@ -98,7 +127,14 @@ bogus_real_magic:
jmp 1b
.data
- .balign 4
+ .balign 8
+
+ /* This is the standard real-mode IDT */
+wakeup_idt:
+ .word 0xffff /* limit */
+ .long 0 /* address */
+ .word 0
+
.globl HEAP, heap_end
HEAP:
.long wakeup_heap
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.h b/arch/x86/kernel/acpi/realmode/wakeup.h
index ef8166fe802..69d38d0b2b6 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.h
+++ b/arch/x86/kernel/acpi/realmode/wakeup.h
@@ -24,6 +24,11 @@ struct wakeup_header {
u32 realmode_flags;
u32 real_magic;
u16 trampoline_segment; /* segment with trampoline code, 64-bit only */
+ u8 _pad1;
+ u8 wakeup_jmp;
+ u16 wakeup_jmp_off;
+ u16 wakeup_jmp_seg;
+ u64 wakeup_gdt[3];
u32 signature; /* To check we have correct structure */
} __attribute__((__packed__));
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index afc25ee9964..868de3d5c39 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -23,6 +23,15 @@ static unsigned long acpi_realmode;
static char temp_stack[10240];
#endif
+/* XXX: this macro should move to asm-x86/segment.h and be shared with the
+ boot code... */
+#define GDT_ENTRY(flags, base, limit) \
+ (((u64)(base & 0xff000000) << 32) | \
+ ((u64)flags << 40) | \
+ ((u64)(limit & 0x00ff0000) << 32) | \
+ ((u64)(base & 0x00ffffff) << 16) | \
+ ((u64)(limit & 0x0000ffff)))
+
/**
* acpi_save_state_mem - save kernel state
*
@@ -50,6 +59,29 @@ int acpi_save_state_mem(void)
header->video_mode = saved_video_mode;
+ header->wakeup_jmp_seg = acpi_wakeup_address >> 4;
+
+ /*
+ * Set up the wakeup GDT. We set these up as Big Real Mode,
+ * that is, with limits set to 4 GB. At least the Lenovo
+ * Thinkpad X61 is known to need this for the video BIOS
+ * initialization quirk to work; this is likely to also
+ * be the case for other laptops or integrated video devices.
+ */
+
+ /* GDT[0]: GDT self-pointer */
+ header->wakeup_gdt[0] =
+ (u64)(sizeof(header->wakeup_gdt) - 1) +
+ ((u64)(acpi_wakeup_address +
+ ((char *)&header->wakeup_gdt - (char *)acpi_realmode))
+ << 16);
+ /* GDT[1]: big real mode-like code segment */
+ header->wakeup_gdt[1] =
+ GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff);
+ /* GDT[2]: big real mode-like data segment */
+ header->wakeup_gdt[2] =
+ GDT_ENTRY(0x8093, acpi_wakeup_address, 0xfffff);
+
#ifndef CONFIG_64BIT
store_gdt((struct desc_ptr *)&header->pmode_gdt);
@@ -72,7 +104,9 @@ int acpi_save_state_mem(void)
saved_magic = 0x12345678;
#else /* CONFIG_64BIT */
header->trampoline_segment = setup_trampoline() >> 4;
- init_rsp = (unsigned long)temp_stack + 4096;
+#ifdef CONFIG_SMP
+ stack_start.sp = temp_stack + 4096;
+#endif
initial_code = (unsigned long)wakeup_long64;
saved_magic = 0x123456789abcdef0;
#endif /* CONFIG_64BIT */
@@ -111,7 +145,7 @@ void __init acpi_reserve_bootmem(void)
return;
}
- acpi_wakeup_address = acpi_realmode;
+ acpi_wakeup_address = virt_to_phys((void *)acpi_realmode);
}
@@ -124,6 +158,8 @@ static int __init acpi_sleep_setup(char *str)
acpi_realmode_flags |= 2;
if (strncmp(str, "s3_beep", 7) == 0)
acpi_realmode_flags |= 4;
+ if (strncmp(str, "old_ordering", 12) == 0)
+ acpi_old_suspend_ordering();
str = strchr(str, ',');
if (str != NULL)
str += strspn(str, ", \t");
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 65c7857a90d..2763cb37b55 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1,6 +1,6 @@
#include <linux/module.h>
#include <linux/sched.h>
-#include <linux/spinlock.h>
+#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/kprobes.h>
#include <linux/mm.h>
@@ -143,7 +143,7 @@ static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = {
#ifdef CONFIG_X86_64
extern char __vsyscall_0;
-static inline const unsigned char*const * find_nop_table(void)
+const unsigned char *const *find_nop_table(void)
{
return boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
boot_cpu_data.x86 < 6 ? k8_nops : p6_nops;
@@ -162,7 +162,7 @@ static const struct nop {
{ -1, NULL }
};
-static const unsigned char*const * find_nop_table(void)
+const unsigned char *const *find_nop_table(void)
{
const unsigned char *const *noptable = intel_nops;
int i;
@@ -279,7 +279,7 @@ struct smp_alt_module {
struct list_head next;
};
static LIST_HEAD(smp_alt_modules);
-static DEFINE_SPINLOCK(smp_alt);
+static DEFINE_MUTEX(smp_alt);
static int smp_mode = 1; /* protected by smp_alt */
void alternatives_smp_module_add(struct module *mod, char *name,
@@ -312,12 +312,12 @@ void alternatives_smp_module_add(struct module *mod, char *name,
__func__, smp->locks, smp->locks_end,
smp->text, smp->text_end, smp->name);
- spin_lock(&smp_alt);
+ mutex_lock(&smp_alt);
list_add_tail(&smp->next, &smp_alt_modules);
if (boot_cpu_has(X86_FEATURE_UP))
alternatives_smp_unlock(smp->locks, smp->locks_end,
smp->text, smp->text_end);
- spin_unlock(&smp_alt);
+ mutex_unlock(&smp_alt);
}
void alternatives_smp_module_del(struct module *mod)
@@ -327,17 +327,17 @@ void alternatives_smp_module_del(struct module *mod)
if (smp_alt_once || noreplace_smp)
return;
- spin_lock(&smp_alt);
+ mutex_lock(&smp_alt);
list_for_each_entry(item, &smp_alt_modules, next) {
if (mod != item->mod)
continue;
list_del(&item->next);
- spin_unlock(&smp_alt);
+ mutex_unlock(&smp_alt);
DPRINTK("%s: %s\n", __func__, item->name);
kfree(item);
return;
}
- spin_unlock(&smp_alt);
+ mutex_unlock(&smp_alt);
}
void alternatives_smp_switch(int smp)
@@ -359,7 +359,7 @@ void alternatives_smp_switch(int smp)
return;
BUG_ON(!smp && (num_online_cpus() > 1));
- spin_lock(&smp_alt);
+ mutex_lock(&smp_alt);
/*
* Avoid unnecessary switches because it forces JIT based VMs to
@@ -383,7 +383,7 @@ void alternatives_smp_switch(int smp)
mod->text, mod->text_end);
}
smp_mode = smp;
- spin_unlock(&smp_alt);
+ mutex_unlock(&smp_alt);
}
#endif
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
new file mode 100644
index 00000000000..f2766d84c7a
--- /dev/null
+++ b/arch/x86/kernel/amd_iommu.c
@@ -0,0 +1,962 @@
+/*
+ * Copyright (C) 2007-2008 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <joerg.roedel@amd.com>
+ * Leo Duran <leo.duran@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/pci.h>
+#include <linux/gfp.h>
+#include <linux/bitops.h>
+#include <linux/scatterlist.h>
+#include <linux/iommu-helper.h>
+#include <asm/proto.h>
+#include <asm/gart.h>
+#include <asm/amd_iommu_types.h>
+#include <asm/amd_iommu.h>
+
+#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
+
+#define to_pages(addr, size) \
+ (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
+
+static DEFINE_RWLOCK(amd_iommu_devtable_lock);
+
+struct command {
+ u32 data[4];
+};
+
+static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
+ struct unity_map_entry *e);
+
+static int iommu_has_npcache(struct amd_iommu *iommu)
+{
+ return iommu->cap & IOMMU_CAP_NPCACHE;
+}
+
+static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
+{
+ u32 tail, head;
+ u8 *target;
+
+ tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
+ target = (iommu->cmd_buf + tail);
+ memcpy_toio(target, cmd, sizeof(*cmd));
+ tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
+ head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
+ if (tail == head)
+ return -ENOMEM;
+ writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
+
+ return 0;
+}
+
+static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&iommu->lock, flags);
+ ret = __iommu_queue_command(iommu, cmd);
+ spin_unlock_irqrestore(&iommu->lock, flags);
+
+ return ret;
+}
+
+static int iommu_completion_wait(struct amd_iommu *iommu)
+{
+ int ret;
+ struct command cmd;
+ volatile u64 ready = 0;
+ unsigned long ready_phys = virt_to_phys(&ready);
+
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.data[0] = LOW_U32(ready_phys) | CMD_COMPL_WAIT_STORE_MASK;
+ cmd.data[1] = HIGH_U32(ready_phys);
+ cmd.data[2] = 1; /* value written to 'ready' */
+ CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
+
+ iommu->need_sync = 0;
+
+ ret = iommu_queue_command(iommu, &cmd);
+
+ if (ret)
+ return ret;
+
+ while (!ready)
+ cpu_relax();
+
+ return 0;
+}
+
+static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
+{
+ struct command cmd;
+
+ BUG_ON(iommu == NULL);
+
+ memset(&cmd, 0, sizeof(cmd));
+ CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY);
+ cmd.data[0] = devid;
+
+ iommu->need_sync = 1;
+
+ return iommu_queue_command(iommu, &cmd);
+}
+
+static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
+ u64 address, u16 domid, int pde, int s)
+{
+ struct command cmd;
+
+ memset(&cmd, 0, sizeof(cmd));
+ address &= PAGE_MASK;
+ CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES);
+ cmd.data[1] |= domid;
+ cmd.data[2] = LOW_U32(address);
+ cmd.data[3] = HIGH_U32(address);
+ if (s)
+ cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
+ if (pde)
+ cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
+
+ iommu->need_sync = 1;
+
+ return iommu_queue_command(iommu, &cmd);
+}
+
+static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
+ u64 address, size_t size)
+{
+ int s = 0;
+ unsigned pages = to_pages(address, size);
+
+ address &= PAGE_MASK;
+
+ if (pages > 1) {
+ /*
+ * If we have to flush more than one page, flush all
+ * TLB entries for this domain
+ */
+ address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
+ s = 1;
+ }
+
+ iommu_queue_inv_iommu_pages(iommu, address, domid, 0, s);
+
+ return 0;
+}
+
+static int iommu_map(struct protection_domain *dom,
+ unsigned long bus_addr,
+ unsigned long phys_addr,
+ int prot)
+{
+ u64 __pte, *pte, *page;
+
+ bus_addr = PAGE_ALIGN(bus_addr);
+ phys_addr = PAGE_ALIGN(bus_addr);
+
+ /* only support 512GB address spaces for now */
+ if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK))
+ return -EINVAL;
+
+ pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)];
+
+ if (!IOMMU_PTE_PRESENT(*pte)) {
+ page = (u64 *)get_zeroed_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+ *pte = IOMMU_L2_PDE(virt_to_phys(page));
+ }
+
+ pte = IOMMU_PTE_PAGE(*pte);
+ pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
+
+ if (!IOMMU_PTE_PRESENT(*pte)) {
+ page = (u64 *)get_zeroed_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+ *pte = IOMMU_L1_PDE(virt_to_phys(page));
+ }
+
+ pte = IOMMU_PTE_PAGE(*pte);
+ pte = &pte[IOMMU_PTE_L0_INDEX(bus_addr)];
+
+ if (IOMMU_PTE_PRESENT(*pte))
+ return -EBUSY;
+
+ __pte = phys_addr | IOMMU_PTE_P;
+ if (prot & IOMMU_PROT_IR)
+ __pte |= IOMMU_PTE_IR;
+ if (prot & IOMMU_PROT_IW)
+ __pte |= IOMMU_PTE_IW;
+
+ *pte = __pte;
+
+ return 0;
+}
+
+static int iommu_for_unity_map(struct amd_iommu *iommu,
+ struct unity_map_entry *entry)
+{
+ u16 bdf, i;
+
+ for (i = entry->devid_start; i <= entry->devid_end; ++i) {
+ bdf = amd_iommu_alias_table[i];
+ if (amd_iommu_rlookup_table[bdf] == iommu)
+ return 1;
+ }
+
+ return 0;
+}
+
+static int iommu_init_unity_mappings(struct amd_iommu *iommu)
+{
+ struct unity_map_entry *entry;
+ int ret;
+
+ list_for_each_entry(entry, &amd_iommu_unity_map, list) {
+ if (!iommu_for_unity_map(iommu, entry))
+ continue;
+ ret = dma_ops_unity_map(iommu->default_dom, entry);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
+ struct unity_map_entry *e)
+{
+ u64 addr;
+ int ret;
+
+ for (addr = e->address_start; addr < e->address_end;
+ addr += PAGE_SIZE) {
+ ret = iommu_map(&dma_dom->domain, addr, addr, e->prot);
+ if (ret)
+ return ret;
+ /*
+ * if unity mapping is in aperture range mark the page
+ * as allocated in the aperture
+ */
+ if (addr < dma_dom->aperture_size)
+ __set_bit(addr >> PAGE_SHIFT, dma_dom->bitmap);
+ }
+
+ return 0;
+}
+
+static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
+ u16 devid)
+{
+ struct unity_map_entry *e;
+ int ret;
+
+ list_for_each_entry(e, &amd_iommu_unity_map, list) {
+ if (!(devid >= e->devid_start && devid <= e->devid_end))
+ continue;
+ ret = dma_ops_unity_map(dma_dom, e);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static unsigned long dma_mask_to_pages(unsigned long mask)
+{
+ return (mask >> PAGE_SHIFT) +
+ (PAGE_ALIGN(mask & ~PAGE_MASK) >> PAGE_SHIFT);
+}
+
+static unsigned long dma_ops_alloc_addresses(struct device *dev,
+ struct dma_ops_domain *dom,
+ unsigned int pages)
+{
+ unsigned long limit = dma_mask_to_pages(*dev->dma_mask);
+ unsigned long address;
+ unsigned long size = dom->aperture_size >> PAGE_SHIFT;
+ unsigned long boundary_size;
+
+ boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
+ PAGE_SIZE) >> PAGE_SHIFT;
+ limit = limit < size ? limit : size;
+
+ if (dom->next_bit >= limit)
+ dom->next_bit = 0;
+
+ address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages,
+ 0 , boundary_size, 0);
+ if (address == -1)
+ address = iommu_area_alloc(dom->bitmap, limit, 0, pages,
+ 0, boundary_size, 0);
+
+ if (likely(address != -1)) {
+ dom->next_bit = address + pages;
+ address <<= PAGE_SHIFT;
+ } else
+ address = bad_dma_address;
+
+ WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
+
+ return address;
+}
+
+static void dma_ops_free_addresses(struct dma_ops_domain *dom,
+ unsigned long address,
+ unsigned int pages)
+{
+ address >>= PAGE_SHIFT;
+ iommu_area_free(dom->bitmap, address, pages);
+}
+
+static u16 domain_id_alloc(void)
+{
+ unsigned long flags;
+ int id;
+
+ write_lock_irqsave(&amd_iommu_devtable_lock, flags);
+ id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID);
+ BUG_ON(id == 0);
+ if (id > 0 && id < MAX_DOMAIN_ID)
+ __set_bit(id, amd_iommu_pd_alloc_bitmap);
+ else
+ id = 0;
+ write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+
+ return id;
+}
+
+static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
+ unsigned long start_page,
+ unsigned int pages)
+{
+ unsigned int last_page = dom->aperture_size >> PAGE_SHIFT;
+
+ if (start_page + pages > last_page)
+ pages = last_page - start_page;
+
+ set_bit_string(dom->bitmap, start_page, pages);
+}
+
+static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
+{
+ int i, j;
+ u64 *p1, *p2, *p3;
+
+ p1 = dma_dom->domain.pt_root;
+
+ if (!p1)
+ return;
+
+ for (i = 0; i < 512; ++i) {
+ if (!IOMMU_PTE_PRESENT(p1[i]))
+ continue;
+
+ p2 = IOMMU_PTE_PAGE(p1[i]);
+ for (j = 0; j < 512; ++i) {
+ if (!IOMMU_PTE_PRESENT(p2[j]))
+ continue;
+ p3 = IOMMU_PTE_PAGE(p2[j]);
+ free_page((unsigned long)p3);
+ }
+
+ free_page((unsigned long)p2);
+ }
+
+ free_page((unsigned long)p1);
+}
+
+static void dma_ops_domain_free(struct dma_ops_domain *dom)
+{
+ if (!dom)
+ return;
+
+ dma_ops_free_pagetable(dom);
+
+ kfree(dom->pte_pages);
+
+ kfree(dom->bitmap);
+
+ kfree(dom);
+}
+
+static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
+ unsigned order)
+{
+ struct dma_ops_domain *dma_dom;
+ unsigned i, num_pte_pages;
+ u64 *l2_pde;
+ u64 address;
+
+ /*
+ * Currently the DMA aperture must be between 32 MB and 1GB in size
+ */
+ if ((order < 25) || (order > 30))
+ return NULL;
+
+ dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL);
+ if (!dma_dom)
+ return NULL;
+
+ spin_lock_init(&dma_dom->domain.lock);
+
+ dma_dom->domain.id = domain_id_alloc();
+ if (dma_dom->domain.id == 0)
+ goto free_dma_dom;
+ dma_dom->domain.mode = PAGE_MODE_3_LEVEL;
+ dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
+ dma_dom->domain.priv = dma_dom;
+ if (!dma_dom->domain.pt_root)
+ goto free_dma_dom;
+ dma_dom->aperture_size = (1ULL << order);
+ dma_dom->bitmap = kzalloc(dma_dom->aperture_size / (PAGE_SIZE * 8),
+ GFP_KERNEL);
+ if (!dma_dom->bitmap)
+ goto free_dma_dom;
+ /*
+ * mark the first page as allocated so we never return 0 as
+ * a valid dma-address. So we can use 0 as error value
+ */
+ dma_dom->bitmap[0] = 1;
+ dma_dom->next_bit = 0;
+
+ if (iommu->exclusion_start &&
+ iommu->exclusion_start < dma_dom->aperture_size) {
+ unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
+ int pages = to_pages(iommu->exclusion_start,
+ iommu->exclusion_length);
+ dma_ops_reserve_addresses(dma_dom, startpage, pages);
+ }
+
+ num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512);
+ dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *),
+ GFP_KERNEL);
+ if (!dma_dom->pte_pages)
+ goto free_dma_dom;
+
+ l2_pde = (u64 *)get_zeroed_page(GFP_KERNEL);
+ if (l2_pde == NULL)
+ goto free_dma_dom;
+
+ dma_dom->domain.pt_root[0] = IOMMU_L2_PDE(virt_to_phys(l2_pde));
+
+ for (i = 0; i < num_pte_pages; ++i) {
+ dma_dom->pte_pages[i] = (u64 *)get_zeroed_page(GFP_KERNEL);
+ if (!dma_dom->pte_pages[i])
+ goto free_dma_dom;
+ address = virt_to_phys(dma_dom->pte_pages[i]);
+ l2_pde[i] = IOMMU_L1_PDE(address);
+ }
+
+ return dma_dom;
+
+free_dma_dom:
+ dma_ops_domain_free(dma_dom);
+
+ return NULL;
+}
+
+static struct protection_domain *domain_for_device(u16 devid)
+{
+ struct protection_domain *dom;
+ unsigned long flags;
+
+ read_lock_irqsave(&amd_iommu_devtable_lock, flags);
+ dom = amd_iommu_pd_table[devid];
+ read_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+
+ return dom;
+}
+
+static void set_device_domain(struct amd_iommu *iommu,
+ struct protection_domain *domain,
+ u16 devid)
+{
+ unsigned long flags;
+
+ u64 pte_root = virt_to_phys(domain->pt_root);
+
+ pte_root |= (domain->mode & 0x07) << 9;
+ pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | 2;
+
+ write_lock_irqsave(&amd_iommu_devtable_lock, flags);
+ amd_iommu_dev_table[devid].data[0] = pte_root;
+ amd_iommu_dev_table[devid].data[1] = pte_root >> 32;
+ amd_iommu_dev_table[devid].data[2] = domain->id;
+
+ amd_iommu_pd_table[devid] = domain;
+ write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+
+ iommu_queue_inv_dev_entry(iommu, devid);
+
+ iommu->need_sync = 1;
+}
+
+static int get_device_resources(struct device *dev,
+ struct amd_iommu **iommu,
+ struct protection_domain **domain,
+ u16 *bdf)
+{
+ struct dma_ops_domain *dma_dom;
+ struct pci_dev *pcidev;
+ u16 _bdf;
+
+ BUG_ON(!dev || dev->bus != &pci_bus_type || !dev->dma_mask);
+
+ pcidev = to_pci_dev(dev);
+ _bdf = (pcidev->bus->number << 8) | pcidev->devfn;
+
+ if (_bdf >= amd_iommu_last_bdf) {
+ *iommu = NULL;
+ *domain = NULL;
+ *bdf = 0xffff;
+ return 0;
+ }
+
+ *bdf = amd_iommu_alias_table[_bdf];
+
+ *iommu = amd_iommu_rlookup_table[*bdf];
+ if (*iommu == NULL)
+ return 0;
+ dma_dom = (*iommu)->default_dom;
+ *domain = domain_for_device(*bdf);
+ if (*domain == NULL) {
+ *domain = &dma_dom->domain;
+ set_device_domain(*iommu, *domain, *bdf);
+ printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
+ "device ", (*domain)->id);
+ print_devid(_bdf, 1);
+ }
+
+ return 1;
+}
+
+static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
+ struct dma_ops_domain *dom,
+ unsigned long address,
+ phys_addr_t paddr,
+ int direction)
+{
+ u64 *pte, __pte;
+
+ WARN_ON(address > dom->aperture_size);
+
+ paddr &= PAGE_MASK;
+
+ pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)];
+ pte += IOMMU_PTE_L0_INDEX(address);
+
+ __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
+
+ if (direction == DMA_TO_DEVICE)
+ __pte |= IOMMU_PTE_IR;
+ else if (direction == DMA_FROM_DEVICE)
+ __pte |= IOMMU_PTE_IW;
+ else if (direction == DMA_BIDIRECTIONAL)
+ __pte |= IOMMU_PTE_IR | IOMMU_PTE_IW;
+
+ WARN_ON(*pte);
+
+ *pte = __pte;
+
+ return (dma_addr_t)address;
+}
+
+static void dma_ops_domain_unmap(struct amd_iommu *iommu,
+ struct dma_ops_domain *dom,
+ unsigned long address)
+{
+ u64 *pte;
+
+ if (address >= dom->aperture_size)
+ return;
+
+ WARN_ON(address & 0xfffULL || address > dom->aperture_size);
+
+ pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)];
+ pte += IOMMU_PTE_L0_INDEX(address);
+
+ WARN_ON(!*pte);
+
+ *pte = 0ULL;
+}
+
+static dma_addr_t __map_single(struct device *dev,
+ struct amd_iommu *iommu,
+ struct dma_ops_domain *dma_dom,
+ phys_addr_t paddr,
+ size_t size,
+ int dir)
+{
+ dma_addr_t offset = paddr & ~PAGE_MASK;
+ dma_addr_t address, start;
+ unsigned int pages;
+ int i;
+
+ pages = to_pages(paddr, size);
+ paddr &= PAGE_MASK;
+
+ address = dma_ops_alloc_addresses(dev, dma_dom, pages);
+ if (unlikely(address == bad_dma_address))
+ goto out;
+
+ start = address;
+ for (i = 0; i < pages; ++i) {
+ dma_ops_domain_map(iommu, dma_dom, start, paddr, dir);
+ paddr += PAGE_SIZE;
+ start += PAGE_SIZE;
+ }
+ address += offset;
+
+out:
+ return address;
+}
+
+static void __unmap_single(struct amd_iommu *iommu,
+ struct dma_ops_domain *dma_dom,
+ dma_addr_t dma_addr,
+ size_t size,
+ int dir)
+{
+ dma_addr_t i, start;
+ unsigned int pages;
+
+ if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size))
+ return;
+
+ pages = to_pages(dma_addr, size);
+ dma_addr &= PAGE_MASK;
+ start = dma_addr;
+
+ for (i = 0; i < pages; ++i) {
+ dma_ops_domain_unmap(iommu, dma_dom, start);
+ start += PAGE_SIZE;
+ }
+
+ dma_ops_free_addresses(dma_dom, dma_addr, pages);
+}
+
+static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
+ size_t size, int dir)
+{
+ unsigned long flags;
+ struct amd_iommu *iommu;
+ struct protection_domain *domain;
+ u16 devid;
+ dma_addr_t addr;
+
+ get_device_resources(dev, &iommu, &domain, &devid);
+
+ if (iommu == NULL || domain == NULL)
+ return (dma_addr_t)paddr;
+
+ spin_lock_irqsave(&domain->lock, flags);
+ addr = __map_single(dev, iommu, domain->priv, paddr, size, dir);
+ if (addr == bad_dma_address)
+ goto out;
+
+ if (iommu_has_npcache(iommu))
+ iommu_flush_pages(iommu, domain->id, addr, size);
+
+ if (iommu->need_sync)
+ iommu_completion_wait(iommu);
+
+out:
+ spin_unlock_irqrestore(&domain->lock, flags);
+
+ return addr;
+}
+
+static void unmap_single(struct device *dev, dma_addr_t dma_addr,
+ size_t size, int dir)
+{
+ unsigned long flags;
+ struct amd_iommu *iommu;
+ struct protection_domain *domain;
+ u16 devid;
+
+ if (!get_device_resources(dev, &iommu, &domain, &devid))
+ return;
+
+ spin_lock_irqsave(&domain->lock, flags);
+
+ __unmap_single(iommu, domain->priv, dma_addr, size, dir);
+
+ iommu_flush_pages(iommu, domain->id, dma_addr, size);
+
+ if (iommu->need_sync)
+ iommu_completion_wait(iommu);
+
+ spin_unlock_irqrestore(&domain->lock, flags);
+}
+
+static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
+ int nelems, int dir)
+{
+ struct scatterlist *s;
+ int i;
+
+ for_each_sg(sglist, s, nelems, i) {
+ s->dma_address = (dma_addr_t)sg_phys(s);
+ s->dma_length = s->length;
+ }
+
+ return nelems;
+}
+
+static int map_sg(struct device *dev, struct scatterlist *sglist,
+ int nelems, int dir)
+{
+ unsigned long flags;
+ struct amd_iommu *iommu;
+ struct protection_domain *domain;
+ u16 devid;
+ int i;
+ struct scatterlist *s;
+ phys_addr_t paddr;
+ int mapped_elems = 0;
+
+ get_device_resources(dev, &iommu, &domain, &devid);
+
+ if (!iommu || !domain)
+ return map_sg_no_iommu(dev, sglist, nelems, dir);
+
+ spin_lock_irqsave(&domain->lock, flags);
+
+ for_each_sg(sglist, s, nelems, i) {
+ paddr = sg_phys(s);
+
+ s->dma_address = __map_single(dev, iommu, domain->priv,
+ paddr, s->length, dir);
+
+ if (s->dma_address) {
+ s->dma_length = s->length;
+ mapped_elems++;
+ } else
+ goto unmap;
+ if (iommu_has_npcache(iommu))
+ iommu_flush_pages(iommu, domain->id, s->dma_address,
+ s->dma_length);
+ }
+
+ if (iommu->need_sync)
+ iommu_completion_wait(iommu);
+
+out:
+ spin_unlock_irqrestore(&domain->lock, flags);
+
+ return mapped_elems;
+unmap:
+ for_each_sg(sglist, s, mapped_elems, i) {
+ if (s->dma_address)
+ __unmap_single(iommu, domain->priv, s->dma_address,
+ s->dma_length, dir);
+ s->dma_address = s->dma_length = 0;
+ }
+
+ mapped_elems = 0;
+
+ goto out;
+}
+
+static void unmap_sg(struct device *dev, struct scatterlist *sglist,
+ int nelems, int dir)
+{
+ unsigned long flags;
+ struct amd_iommu *iommu;
+ struct protection_domain *domain;
+ struct scatterlist *s;
+ u16 devid;
+ int i;
+
+ if (!get_device_resources(dev, &iommu, &domain, &devid))
+ return;
+
+ spin_lock_irqsave(&domain->lock, flags);
+
+ for_each_sg(sglist, s, nelems, i) {
+ __unmap_single(iommu, domain->priv, s->dma_address,
+ s->dma_length, dir);
+ iommu_flush_pages(iommu, domain->id, s->dma_address,
+ s->dma_length);
+ s->dma_address = s->dma_length = 0;
+ }
+
+ if (iommu->need_sync)
+ iommu_completion_wait(iommu);
+
+ spin_unlock_irqrestore(&domain->lock, flags);
+}
+
+static void *alloc_coherent(struct device *dev, size_t size,
+ dma_addr_t *dma_addr, gfp_t flag)
+{
+ unsigned long flags;
+ void *virt_addr;
+ struct amd_iommu *iommu;
+ struct protection_domain *domain;
+ u16 devid;
+ phys_addr_t paddr;
+
+ virt_addr = (void *)__get_free_pages(flag, get_order(size));
+ if (!virt_addr)
+ return 0;
+
+ memset(virt_addr, 0, size);
+ paddr = virt_to_phys(virt_addr);
+
+ get_device_resources(dev, &iommu, &domain, &devid);
+
+ if (!iommu || !domain) {
+ *dma_addr = (dma_addr_t)paddr;
+ return virt_addr;
+ }
+
+ spin_lock_irqsave(&domain->lock, flags);
+
+ *dma_addr = __map_single(dev, iommu, domain->priv, paddr,
+ size, DMA_BIDIRECTIONAL);
+
+ if (*dma_addr == bad_dma_address) {
+ free_pages((unsigned long)virt_addr, get_order(size));
+ virt_addr = NULL;
+ goto out;
+ }
+
+ if (iommu_has_npcache(iommu))
+ iommu_flush_pages(iommu, domain->id, *dma_addr, size);
+
+ if (iommu->need_sync)
+ iommu_completion_wait(iommu);
+
+out:
+ spin_unlock_irqrestore(&domain->lock, flags);
+
+ return virt_addr;
+}
+
+static void free_coherent(struct device *dev, size_t size,
+ void *virt_addr, dma_addr_t dma_addr)
+{
+ unsigned long flags;
+ struct amd_iommu *iommu;
+ struct protection_domain *domain;
+ u16 devid;
+
+ get_device_resources(dev, &iommu, &domain, &devid);
+
+ if (!iommu || !domain)
+ goto free_mem;
+
+ spin_lock_irqsave(&domain->lock, flags);
+
+ __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
+ iommu_flush_pages(iommu, domain->id, dma_addr, size);
+
+ if (iommu->need_sync)
+ iommu_completion_wait(iommu);
+
+ spin_unlock_irqrestore(&domain->lock, flags);
+
+free_mem:
+ free_pages((unsigned long)virt_addr, get_order(size));
+}
+
+/*
+ * If the driver core informs the DMA layer if a driver grabs a device
+ * we don't need to preallocate the protection domains anymore.
+ * For now we have to.
+ */
+void prealloc_protection_domains(void)
+{
+ struct pci_dev *dev = NULL;
+ struct dma_ops_domain *dma_dom;
+ struct amd_iommu *iommu;
+ int order = amd_iommu_aperture_order;
+ u16 devid;
+
+ while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
+ devid = (dev->bus->number << 8) | dev->devfn;
+ if (devid >= amd_iommu_last_bdf)
+ continue;
+ devid = amd_iommu_alias_table[devid];
+ if (domain_for_device(devid))
+ continue;
+ iommu = amd_iommu_rlookup_table[devid];
+ if (!iommu)
+ continue;
+ dma_dom = dma_ops_domain_alloc(iommu, order);
+ if (!dma_dom)
+ continue;
+ init_unity_mappings_for_device(dma_dom, devid);
+ set_device_domain(iommu, &dma_dom->domain, devid);
+ printk(KERN_INFO "AMD IOMMU: Allocated domain %d for device ",
+ dma_dom->domain.id);
+ print_devid(devid, 1);
+ }
+}
+
+static struct dma_mapping_ops amd_iommu_dma_ops = {
+ .alloc_coherent = alloc_coherent,
+ .free_coherent = free_coherent,
+ .map_single = map_single,
+ .unmap_single = unmap_single,
+ .map_sg = map_sg,
+ .unmap_sg = unmap_sg,
+};
+
+int __init amd_iommu_init_dma_ops(void)
+{
+ struct amd_iommu *iommu;
+ int order = amd_iommu_aperture_order;
+ int ret;
+
+ list_for_each_entry(iommu, &amd_iommu_list, list) {
+ iommu->default_dom = dma_ops_domain_alloc(iommu, order);
+ if (iommu->default_dom == NULL)
+ return -ENOMEM;
+ ret = iommu_init_unity_mappings(iommu);
+ if (ret)
+ goto free_domains;
+ }
+
+ if (amd_iommu_isolate)
+ prealloc_protection_domains();
+
+ iommu_detected = 1;
+ force_iommu = 1;
+ bad_dma_address = 0;
+#ifdef CONFIG_GART_IOMMU
+ gart_iommu_aperture_disabled = 1;
+ gart_iommu_aperture = 0;
+#endif
+
+ dma_ops = &amd_iommu_dma_ops;
+
+ return 0;
+
+free_domains:
+
+ list_for_each_entry(iommu, &amd_iommu_list, list) {
+ if (iommu->default_dom)
+ dma_ops_domain_free(iommu->default_dom);
+ }
+
+ return ret;
+}
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
new file mode 100644
index 00000000000..2a13e430437
--- /dev/null
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -0,0 +1,875 @@
+/*
+ * Copyright (C) 2007-2008 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <joerg.roedel@amd.com>
+ * Leo Duran <leo.duran@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/pci.h>
+#include <linux/acpi.h>
+#include <linux/gfp.h>
+#include <linux/list.h>
+#include <linux/sysdev.h>
+#include <asm/pci-direct.h>
+#include <asm/amd_iommu_types.h>
+#include <asm/amd_iommu.h>
+#include <asm/gart.h>
+
+/*
+ * definitions for the ACPI scanning code
+ */
+#define UPDATE_LAST_BDF(x) do {\
+ if ((x) > amd_iommu_last_bdf) \
+ amd_iommu_last_bdf = (x); \
+ } while (0);
+
+#define DEVID(bus, devfn) (((bus) << 8) | (devfn))
+#define PCI_BUS(x) (((x) >> 8) & 0xff)
+#define IVRS_HEADER_LENGTH 48
+#define TBL_SIZE(x) (1 << (PAGE_SHIFT + get_order(amd_iommu_last_bdf * (x))))
+
+#define ACPI_IVHD_TYPE 0x10
+#define ACPI_IVMD_TYPE_ALL 0x20
+#define ACPI_IVMD_TYPE 0x21
+#define ACPI_IVMD_TYPE_RANGE 0x22
+
+#define IVHD_DEV_ALL 0x01
+#define IVHD_DEV_SELECT 0x02
+#define IVHD_DEV_SELECT_RANGE_START 0x03
+#define IVHD_DEV_RANGE_END 0x04
+#define IVHD_DEV_ALIAS 0x42
+#define IVHD_DEV_ALIAS_RANGE 0x43
+#define IVHD_DEV_EXT_SELECT 0x46
+#define IVHD_DEV_EXT_SELECT_RANGE 0x47
+
+#define IVHD_FLAG_HT_TUN_EN 0x00
+#define IVHD_FLAG_PASSPW_EN 0x01
+#define IVHD_FLAG_RESPASSPW_EN 0x02
+#define IVHD_FLAG_ISOC_EN 0x03
+
+#define IVMD_FLAG_EXCL_RANGE 0x08
+#define IVMD_FLAG_UNITY_MAP 0x01
+
+#define ACPI_DEVFLAG_INITPASS 0x01
+#define ACPI_DEVFLAG_EXTINT 0x02
+#define ACPI_DEVFLAG_NMI 0x04
+#define ACPI_DEVFLAG_SYSMGT1 0x10
+#define ACPI_DEVFLAG_SYSMGT2 0x20
+#define ACPI_DEVFLAG_LINT0 0x40
+#define ACPI_DEVFLAG_LINT1 0x80
+#define ACPI_DEVFLAG_ATSDIS 0x10000000
+
+struct ivhd_header {
+ u8 type;
+ u8 flags;
+ u16 length;
+ u16 devid;
+ u16 cap_ptr;
+ u64 mmio_phys;
+ u16 pci_seg;
+ u16 info;
+ u32 reserved;
+} __attribute__((packed));
+
+struct ivhd_entry {
+ u8 type;
+ u16 devid;
+ u8 flags;
+ u32 ext;
+} __attribute__((packed));
+
+struct ivmd_header {
+ u8 type;
+ u8 flags;
+ u16 length;
+ u16 devid;
+ u16 aux;
+ u64 resv;
+ u64 range_start;
+ u64 range_length;
+} __attribute__((packed));
+
+static int __initdata amd_iommu_detected;
+
+u16 amd_iommu_last_bdf;
+struct list_head amd_iommu_unity_map;
+unsigned amd_iommu_aperture_order = 26;
+int amd_iommu_isolate;
+
+struct list_head amd_iommu_list;
+struct dev_table_entry *amd_iommu_dev_table;
+u16 *amd_iommu_alias_table;
+struct amd_iommu **amd_iommu_rlookup_table;
+struct protection_domain **amd_iommu_pd_table;
+unsigned long *amd_iommu_pd_alloc_bitmap;
+
+static u32 dev_table_size;
+static u32 alias_table_size;
+static u32 rlookup_table_size;
+
+static void __init iommu_set_exclusion_range(struct amd_iommu *iommu)
+{
+ u64 start = iommu->exclusion_start & PAGE_MASK;
+ u64 limit = (start + iommu->exclusion_length) & PAGE_MASK;
+ u64 entry;
+
+ if (!iommu->exclusion_start)
+ return;
+
+ entry = start | MMIO_EXCL_ENABLE_MASK;
+ memcpy_toio(iommu->mmio_base + MMIO_EXCL_BASE_OFFSET,
+ &entry, sizeof(entry));
+
+ entry = limit;
+ memcpy_toio(iommu->mmio_base + MMIO_EXCL_LIMIT_OFFSET,
+ &entry, sizeof(entry));
+}
+
+static void __init iommu_set_device_table(struct amd_iommu *iommu)
+{
+ u32 entry;
+
+ BUG_ON(iommu->mmio_base == NULL);
+
+ entry = virt_to_phys(amd_iommu_dev_table);
+ entry |= (dev_table_size >> 12) - 1;
+ memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET,
+ &entry, sizeof(entry));
+}
+
+static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
+{
+ u32 ctrl;
+
+ ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
+ ctrl |= (1 << bit);
+ writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
+}
+
+static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
+{
+ u32 ctrl;
+
+ ctrl = (u64)readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
+ ctrl &= ~(1 << bit);
+ writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
+}
+
+void __init iommu_enable(struct amd_iommu *iommu)
+{
+ printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at ");
+ print_devid(iommu->devid, 0);
+ printk(" cap 0x%hx\n", iommu->cap_ptr);
+
+ iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
+}
+
+static u8 * __init iommu_map_mmio_space(u64 address)
+{
+ u8 *ret;
+
+ if (!request_mem_region(address, MMIO_REGION_LENGTH, "amd_iommu"))
+ return NULL;
+
+ ret = ioremap_nocache(address, MMIO_REGION_LENGTH);
+ if (ret != NULL)
+ return ret;
+
+ release_mem_region(address, MMIO_REGION_LENGTH);
+
+ return NULL;
+}
+
+static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu)
+{
+ if (iommu->mmio_base)
+ iounmap(iommu->mmio_base);
+ release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH);
+}
+
+static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr)
+{
+ u32 cap;
+
+ cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
+ UPDATE_LAST_BDF(DEVID(MMIO_GET_BUS(cap), MMIO_GET_LD(cap)));
+
+ return 0;
+}
+
+static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
+{
+ u8 *p = (void *)h, *end = (void *)h;
+ struct ivhd_entry *dev;
+
+ p += sizeof(*h);
+ end += h->length;
+
+ find_last_devid_on_pci(PCI_BUS(h->devid),
+ PCI_SLOT(h->devid),
+ PCI_FUNC(h->devid),
+ h->cap_ptr);
+
+ while (p < end) {
+ dev = (struct ivhd_entry *)p;
+ switch (dev->type) {
+ case IVHD_DEV_SELECT:
+ case IVHD_DEV_RANGE_END:
+ case IVHD_DEV_ALIAS:
+ case IVHD_DEV_EXT_SELECT:
+ UPDATE_LAST_BDF(dev->devid);
+ break;
+ default:
+ break;
+ }
+ p += 0x04 << (*p >> 6);
+ }
+
+ WARN_ON(p != end);
+
+ return 0;
+}
+
+static int __init find_last_devid_acpi(struct acpi_table_header *table)
+{
+ int i;
+ u8 checksum = 0, *p = (u8 *)table, *end = (u8 *)table;
+ struct ivhd_header *h;
+
+ /*
+ * Validate checksum here so we don't need to do it when
+ * we actually parse the table
+ */
+ for (i = 0; i < table->length; ++i)
+ checksum += p[i];
+ if (checksum != 0)
+ /* ACPI table corrupt */
+ return -ENODEV;
+
+ p += IVRS_HEADER_LENGTH;
+
+ end += table->length;
+ while (p < end) {
+ h = (struct ivhd_header *)p;
+ switch (h->type) {
+ case ACPI_IVHD_TYPE:
+ find_last_devid_from_ivhd(h);
+ break;
+ default:
+ break;
+ }
+ p += h->length;
+ }
+ WARN_ON(p != end);
+
+ return 0;
+}
+
+static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
+{
+ u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL,
+ get_order(CMD_BUFFER_SIZE));
+ u64 entry = 0;
+
+ if (cmd_buf == NULL)
+ return NULL;
+
+ iommu->cmd_buf_size = CMD_BUFFER_SIZE;
+
+ memset(cmd_buf, 0, CMD_BUFFER_SIZE);
+
+ entry = (u64)virt_to_phys(cmd_buf);
+ entry |= MMIO_CMD_SIZE_512;
+ memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
+ &entry, sizeof(entry));
+
+ iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
+
+ return cmd_buf;
+}
+
+static void __init free_command_buffer(struct amd_iommu *iommu)
+{
+ if (iommu->cmd_buf)
+ free_pages((unsigned long)iommu->cmd_buf,
+ get_order(CMD_BUFFER_SIZE));
+}
+
+static void set_dev_entry_bit(u16 devid, u8 bit)
+{
+ int i = (bit >> 5) & 0x07;
+ int _bit = bit & 0x1f;
+
+ amd_iommu_dev_table[devid].data[i] |= (1 << _bit);
+}
+
+static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags)
+{
+ if (flags & ACPI_DEVFLAG_INITPASS)
+ set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS);
+ if (flags & ACPI_DEVFLAG_EXTINT)
+ set_dev_entry_bit(devid, DEV_ENTRY_EINT_PASS);
+ if (flags & ACPI_DEVFLAG_NMI)
+ set_dev_entry_bit(devid, DEV_ENTRY_NMI_PASS);
+ if (flags & ACPI_DEVFLAG_SYSMGT1)
+ set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1);
+ if (flags & ACPI_DEVFLAG_SYSMGT2)
+ set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2);
+ if (flags & ACPI_DEVFLAG_LINT0)
+ set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS);
+ if (flags & ACPI_DEVFLAG_LINT1)
+ set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS);
+}
+
+static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
+{
+ amd_iommu_rlookup_table[devid] = iommu;
+}
+
+static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
+{
+ struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
+
+ if (!(m->flags & IVMD_FLAG_EXCL_RANGE))
+ return;
+
+ if (iommu) {
+ set_dev_entry_bit(m->devid, DEV_ENTRY_EX);
+ iommu->exclusion_start = m->range_start;
+ iommu->exclusion_length = m->range_length;
+ }
+}
+
+static void __init init_iommu_from_pci(struct amd_iommu *iommu)
+{
+ int bus = PCI_BUS(iommu->devid);
+ int dev = PCI_SLOT(iommu->devid);
+ int fn = PCI_FUNC(iommu->devid);
+ int cap_ptr = iommu->cap_ptr;
+ u32 range;
+
+ iommu->cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_CAP_HDR_OFFSET);
+
+ range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
+ iommu->first_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_FD(range));
+ iommu->last_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_LD(range));
+}
+
+static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
+ struct ivhd_header *h)
+{
+ u8 *p = (u8 *)h;
+ u8 *end = p, flags = 0;
+ u16 dev_i, devid = 0, devid_start = 0, devid_to = 0;
+ u32 ext_flags = 0;
+ bool alias = 0;
+ struct ivhd_entry *e;
+
+ /*
+ * First set the recommended feature enable bits from ACPI
+ * into the IOMMU control registers
+ */
+ h->flags & IVHD_FLAG_HT_TUN_EN ?
+ iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) :
+ iommu_feature_disable(iommu, CONTROL_HT_TUN_EN);
+
+ h->flags & IVHD_FLAG_PASSPW_EN ?
+ iommu_feature_enable(iommu, CONTROL_PASSPW_EN) :
+ iommu_feature_disable(iommu, CONTROL_PASSPW_EN);
+
+ h->flags & IVHD_FLAG_RESPASSPW_EN ?
+ iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) :
+ iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN);
+
+ h->flags & IVHD_FLAG_ISOC_EN ?
+ iommu_feature_enable(iommu, CONTROL_ISOC_EN) :
+ iommu_feature_disable(iommu, CONTROL_ISOC_EN);
+
+ /*
+ * make IOMMU memory accesses cache coherent
+ */
+ iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
+
+ /*
+ * Done. Now parse the device entries
+ */
+ p += sizeof(struct ivhd_header);
+ end += h->length;
+
+ while (p < end) {
+ e = (struct ivhd_entry *)p;
+ switch (e->type) {
+ case IVHD_DEV_ALL:
+ for (dev_i = iommu->first_device;
+ dev_i <= iommu->last_device; ++dev_i)
+ set_dev_entry_from_acpi(dev_i, e->flags, 0);
+ break;
+ case IVHD_DEV_SELECT:
+ devid = e->devid;
+ set_dev_entry_from_acpi(devid, e->flags, 0);
+ break;
+ case IVHD_DEV_SELECT_RANGE_START:
+ devid_start = e->devid;
+ flags = e->flags;
+ ext_flags = 0;
+ alias = 0;
+ break;
+ case IVHD_DEV_ALIAS:
+ devid = e->devid;
+ devid_to = e->ext >> 8;
+ set_dev_entry_from_acpi(devid, e->flags, 0);
+ amd_iommu_alias_table[devid] = devid_to;
+ break;
+ case IVHD_DEV_ALIAS_RANGE:
+ devid_start = e->devid;
+ flags = e->flags;
+ devid_to = e->ext >> 8;
+ ext_flags = 0;
+ alias = 1;
+ break;
+ case IVHD_DEV_EXT_SELECT:
+ devid = e->devid;
+ set_dev_entry_from_acpi(devid, e->flags, e->ext);
+ break;
+ case IVHD_DEV_EXT_SELECT_RANGE:
+ devid_start = e->devid;
+ flags = e->flags;
+ ext_flags = e->ext;
+ alias = 0;
+ break;
+ case IVHD_DEV_RANGE_END:
+ devid = e->devid;
+ for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
+ if (alias)
+ amd_iommu_alias_table[dev_i] = devid_to;
+ set_dev_entry_from_acpi(
+ amd_iommu_alias_table[dev_i],
+ flags, ext_flags);
+ }
+ break;
+ default:
+ break;
+ }
+
+ p += 0x04 << (e->type >> 6);
+ }
+}
+
+static int __init init_iommu_devices(struct amd_iommu *iommu)
+{
+ u16 i;
+
+ for (i = iommu->first_device; i <= iommu->last_device; ++i)
+ set_iommu_for_device(iommu, i);
+
+ return 0;
+}
+
+static void __init free_iommu_one(struct amd_iommu *iommu)
+{
+ free_command_buffer(iommu);
+ iommu_unmap_mmio_space(iommu);
+}
+
+static void __init free_iommu_all(void)
+{
+ struct amd_iommu *iommu, *next;
+
+ list_for_each_entry_safe(iommu, next, &amd_iommu_list, list) {
+ list_del(&iommu->list);
+ free_iommu_one(iommu);
+ kfree(iommu);
+ }
+}
+
+static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
+{
+ spin_lock_init(&iommu->lock);
+ list_add_tail(&iommu->list, &amd_iommu_list);
+
+ /*
+ * Copy data from ACPI table entry to the iommu struct
+ */
+ iommu->devid = h->devid;
+ iommu->cap_ptr = h->cap_ptr;
+ iommu->mmio_phys = h->mmio_phys;
+ iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys);
+ if (!iommu->mmio_base)
+ return -ENOMEM;
+
+ iommu_set_device_table(iommu);
+ iommu->cmd_buf = alloc_command_buffer(iommu);
+ if (!iommu->cmd_buf)
+ return -ENOMEM;
+
+ init_iommu_from_pci(iommu);
+ init_iommu_from_acpi(iommu, h);
+ init_iommu_devices(iommu);
+
+ return 0;
+}
+
+static int __init init_iommu_all(struct acpi_table_header *table)
+{
+ u8 *p = (u8 *)table, *end = (u8 *)table;
+ struct ivhd_header *h;
+ struct amd_iommu *iommu;
+ int ret;
+
+ INIT_LIST_HEAD(&amd_iommu_list);
+
+ end += table->length;
+ p += IVRS_HEADER_LENGTH;
+
+ while (p < end) {
+ h = (struct ivhd_header *)p;
+ switch (*p) {
+ case ACPI_IVHD_TYPE:
+ iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL);
+ if (iommu == NULL)
+ return -ENOMEM;
+ ret = init_iommu_one(iommu, h);
+ if (ret)
+ return ret;
+ break;
+ default:
+ break;
+ }
+ p += h->length;
+
+ }
+ WARN_ON(p != end);
+
+ return 0;
+}
+
+static void __init free_unity_maps(void)
+{
+ struct unity_map_entry *entry, *next;
+
+ list_for_each_entry_safe(entry, next, &amd_iommu_unity_map, list) {
+ list_del(&entry->list);
+ kfree(entry);
+ }
+}
+
+static int __init init_exclusion_range(struct ivmd_header *m)
+{
+ int i;
+
+ switch (m->type) {
+ case ACPI_IVMD_TYPE:
+ set_device_exclusion_range(m->devid, m);
+ break;
+ case ACPI_IVMD_TYPE_ALL:
+ for (i = 0; i < amd_iommu_last_bdf; ++i)
+ set_device_exclusion_range(i, m);
+ break;
+ case ACPI_IVMD_TYPE_RANGE:
+ for (i = m->devid; i <= m->aux; ++i)
+ set_device_exclusion_range(i, m);
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+static int __init init_unity_map_range(struct ivmd_header *m)
+{
+ struct unity_map_entry *e = 0;
+
+ e = kzalloc(sizeof(*e), GFP_KERNEL);
+ if (e == NULL)
+ return -ENOMEM;
+
+ switch (m->type) {
+ default:
+ case ACPI_IVMD_TYPE:
+ e->devid_start = e->devid_end = m->devid;
+ break;
+ case ACPI_IVMD_TYPE_ALL:
+ e->devid_start = 0;
+ e->devid_end = amd_iommu_last_bdf;
+ break;
+ case ACPI_IVMD_TYPE_RANGE:
+ e->devid_start = m->devid;
+ e->devid_end = m->aux;
+ break;
+ }
+ e->address_start = PAGE_ALIGN(m->range_start);
+ e->address_end = e->address_start + PAGE_ALIGN(m->range_length);
+ e->prot = m->flags >> 1;
+
+ list_add_tail(&e->list, &amd_iommu_unity_map);
+
+ return 0;
+}
+
+static int __init init_memory_definitions(struct acpi_table_header *table)
+{
+ u8 *p = (u8 *)table, *end = (u8 *)table;
+ struct ivmd_header *m;
+
+ INIT_LIST_HEAD(&amd_iommu_unity_map);
+
+ end += table->length;
+ p += IVRS_HEADER_LENGTH;
+
+ while (p < end) {
+ m = (struct ivmd_header *)p;
+ if (m->flags & IVMD_FLAG_EXCL_RANGE)
+ init_exclusion_range(m);
+ else if (m->flags & IVMD_FLAG_UNITY_MAP)
+ init_unity_map_range(m);
+
+ p += m->length;
+ }
+
+ return 0;
+}
+
+static void __init enable_iommus(void)
+{
+ struct amd_iommu *iommu;
+
+ list_for_each_entry(iommu, &amd_iommu_list, list) {
+ iommu_set_exclusion_range(iommu);
+ iommu_enable(iommu);
+ }
+}
+
+/*
+ * Suspend/Resume support
+ * disable suspend until real resume implemented
+ */
+
+static int amd_iommu_resume(struct sys_device *dev)
+{
+ return 0;
+}
+
+static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state)
+{
+ return -EINVAL;
+}
+
+static struct sysdev_class amd_iommu_sysdev_class = {
+ .name = "amd_iommu",
+ .suspend = amd_iommu_suspend,
+ .resume = amd_iommu_resume,
+};
+
+static struct sys_device device_amd_iommu = {
+ .id = 0,
+ .cls = &amd_iommu_sysdev_class,
+};
+
+int __init amd_iommu_init(void)
+{
+ int i, ret = 0;
+
+
+ if (no_iommu) {
+ printk(KERN_INFO "AMD IOMMU disabled by kernel command line\n");
+ return 0;
+ }
+
+ if (!amd_iommu_detected)
+ return -ENODEV;
+
+ /*
+ * First parse ACPI tables to find the largest Bus/Dev/Func
+ * we need to handle. Upon this information the shared data
+ * structures for the IOMMUs in the system will be allocated
+ */
+ if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0)
+ return -ENODEV;
+
+ dev_table_size = TBL_SIZE(DEV_TABLE_ENTRY_SIZE);
+ alias_table_size = TBL_SIZE(ALIAS_TABLE_ENTRY_SIZE);
+ rlookup_table_size = TBL_SIZE(RLOOKUP_TABLE_ENTRY_SIZE);
+
+ ret = -ENOMEM;
+
+ /* Device table - directly used by all IOMMUs */
+ amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL,
+ get_order(dev_table_size));
+ if (amd_iommu_dev_table == NULL)
+ goto out;
+
+ /*
+ * Alias table - map PCI Bus/Dev/Func to Bus/Dev/Func the
+ * IOMMU see for that device
+ */
+ amd_iommu_alias_table = (void *)__get_free_pages(GFP_KERNEL,
+ get_order(alias_table_size));
+ if (amd_iommu_alias_table == NULL)
+ goto free;
+
+ /* IOMMU rlookup table - find the IOMMU for a specific device */
+ amd_iommu_rlookup_table = (void *)__get_free_pages(GFP_KERNEL,
+ get_order(rlookup_table_size));
+ if (amd_iommu_rlookup_table == NULL)
+ goto free;
+
+ /*
+ * Protection Domain table - maps devices to protection domains
+ * This table has the same size as the rlookup_table
+ */
+ amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL,
+ get_order(rlookup_table_size));
+ if (amd_iommu_pd_table == NULL)
+ goto free;
+
+ amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(GFP_KERNEL,
+ get_order(MAX_DOMAIN_ID/8));
+ if (amd_iommu_pd_alloc_bitmap == NULL)
+ goto free;
+
+ /*
+ * memory is allocated now; initialize the device table with all zeroes
+ * and let all alias entries point to itself
+ */
+ memset(amd_iommu_dev_table, 0, dev_table_size);
+ for (i = 0; i < amd_iommu_last_bdf; ++i)
+ amd_iommu_alias_table[i] = i;
+
+ memset(amd_iommu_pd_table, 0, rlookup_table_size);
+ memset(amd_iommu_pd_alloc_bitmap, 0, MAX_DOMAIN_ID / 8);
+
+ /*
+ * never allocate domain 0 because its used as the non-allocated and
+ * error value placeholder
+ */
+ amd_iommu_pd_alloc_bitmap[0] = 1;
+
+ /*
+ * now the data structures are allocated and basically initialized
+ * start the real acpi table scan
+ */
+ ret = -ENODEV;
+ if (acpi_table_parse("IVRS", init_iommu_all) != 0)
+ goto free;
+
+ if (acpi_table_parse("IVRS", init_memory_definitions) != 0)
+ goto free;
+
+ ret = amd_iommu_init_dma_ops();
+ if (ret)
+ goto free;
+
+ ret = sysdev_class_register(&amd_iommu_sysdev_class);
+ if (ret)
+ goto free;
+
+ ret = sysdev_register(&device_amd_iommu);
+ if (ret)
+ goto free;
+
+ enable_iommus();
+
+ printk(KERN_INFO "AMD IOMMU: aperture size is %d MB\n",
+ (1 << (amd_iommu_aperture_order-20)));
+
+ printk(KERN_INFO "AMD IOMMU: device isolation ");
+ if (amd_iommu_isolate)
+ printk("enabled\n");
+ else
+ printk("disabled\n");
+
+out:
+ return ret;
+
+free:
+ if (amd_iommu_pd_alloc_bitmap)
+ free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1);
+
+ if (amd_iommu_pd_table)
+ free_pages((unsigned long)amd_iommu_pd_table,
+ get_order(rlookup_table_size));
+
+ if (amd_iommu_rlookup_table)
+ free_pages((unsigned long)amd_iommu_rlookup_table,
+ get_order(rlookup_table_size));
+
+ if (amd_iommu_alias_table)
+ free_pages((unsigned long)amd_iommu_alias_table,
+ get_order(alias_table_size));
+
+ if (amd_iommu_dev_table)
+ free_pages((unsigned long)amd_iommu_dev_table,
+ get_order(dev_table_size));
+
+ free_iommu_all();
+
+ free_unity_maps();
+
+ goto out;
+}
+
+static int __init early_amd_iommu_detect(struct acpi_table_header *table)
+{
+ return 0;
+}
+
+void __init amd_iommu_detect(void)
+{
+ if (swiotlb || no_iommu || iommu_detected)
+ return;
+
+ if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
+ iommu_detected = 1;
+ amd_iommu_detected = 1;
+#ifdef CONFIG_GART_IOMMU
+ gart_iommu_aperture_disabled = 1;
+ gart_iommu_aperture = 0;
+#endif
+ }
+}
+
+static int __init parse_amd_iommu_options(char *str)
+{
+ for (; *str; ++str) {
+ if (strcmp(str, "isolate") == 0)
+ amd_iommu_isolate = 1;
+ }
+
+ return 1;
+}
+
+static int __init parse_amd_iommu_size_options(char *str)
+{
+ for (; *str; ++str) {
+ if (strcmp(str, "32M") == 0)
+ amd_iommu_aperture_order = 25;
+ if (strcmp(str, "64M") == 0)
+ amd_iommu_aperture_order = 26;
+ if (strcmp(str, "128M") == 0)
+ amd_iommu_aperture_order = 27;
+ if (strcmp(str, "256M") == 0)
+ amd_iommu_aperture_order = 28;
+ if (strcmp(str, "512M") == 0)
+ amd_iommu_aperture_order = 29;
+ if (strcmp(str, "1G") == 0)
+ amd_iommu_aperture_order = 30;
+ }
+
+ return 1;
+}
+
+__setup("amd_iommu=", parse_amd_iommu_options);
+__setup("amd_iommu_size=", parse_amd_iommu_size_options);
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 479926d9e00..9f907806c1a 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -35,6 +35,18 @@ int fallback_aper_force __initdata;
int fix_aperture __initdata = 1;
+struct bus_dev_range {
+ int bus;
+ int dev_base;
+ int dev_limit;
+};
+
+static struct bus_dev_range bus_dev_ranges[] __initdata = {
+ { 0x00, 0x18, 0x20},
+ { 0xff, 0x00, 0x20},
+ { 0xfe, 0x00, 0x20}
+};
+
static struct resource gart_resource = {
.name = "GART",
.flags = IORESOURCE_MEM,
@@ -55,8 +67,9 @@ static u32 __init allocate_aperture(void)
u32 aper_size;
void *p;
- if (fallback_aper_order > 7)
- fallback_aper_order = 7;
+ /* aper_size should <= 1G */
+ if (fallback_aper_order > 5)
+ fallback_aper_order = 5;
aper_size = (32 * 1024 * 1024) << fallback_aper_order;
/*
@@ -65,7 +78,20 @@ static u32 __init allocate_aperture(void)
* memory. Unfortunately we cannot move it up because that would
* make the IOMMU useless.
*/
- p = __alloc_bootmem_nopanic(aper_size, aper_size, 0);
+ /*
+ * using 512M as goal, in case kexec will load kernel_big
+ * that will do the on position decompress, and could overlap with
+ * that positon with gart that is used.
+ * sequende:
+ * kernel_small
+ * ==> kexec (with kdump trigger path or previous doesn't shutdown gart)
+ * ==> kernel_small(gart area become e820_reserved)
+ * ==> kexec (with kdump trigger path or previous doesn't shutdown gart)
+ * ==> kerne_big (uncompressed size will be big than 64M or 128M)
+ * so don't use 512M below as gart iommu, leave the space for kernel
+ * code for safe
+ */
+ p = __alloc_bootmem_nopanic(aper_size, aper_size, 512ULL<<20);
if (!p || __pa(p)+aper_size > 0xffffffff) {
printk(KERN_ERR
"Cannot allocate aperture memory hole (%p,%uK)\n",
@@ -83,69 +109,53 @@ static u32 __init allocate_aperture(void)
return (u32)__pa(p);
}
-static int __init aperture_valid(u64 aper_base, u32 aper_size)
-{
- if (!aper_base)
- return 0;
-
- if (aper_base + aper_size > 0x100000000UL) {
- printk(KERN_ERR "Aperture beyond 4GB. Ignoring.\n");
- return 0;
- }
- if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) {
- printk(KERN_ERR "Aperture pointing to e820 RAM. Ignoring.\n");
- return 0;
- }
- if (aper_size < 64*1024*1024) {
- printk(KERN_ERR "Aperture too small (%d MB)\n", aper_size>>20);
- return 0;
- }
-
- return 1;
-}
/* Find a PCI capability */
-static __u32 __init find_cap(int num, int slot, int func, int cap)
+static u32 __init find_cap(int bus, int slot, int func, int cap)
{
int bytes;
u8 pos;
- if (!(read_pci_config_16(num, slot, func, PCI_STATUS) &
+ if (!(read_pci_config_16(bus, slot, func, PCI_STATUS) &
PCI_STATUS_CAP_LIST))
return 0;
- pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST);
+ pos = read_pci_config_byte(bus, slot, func, PCI_CAPABILITY_LIST);
for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) {
u8 id;
pos &= ~3;
- id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID);
+ id = read_pci_config_byte(bus, slot, func, pos+PCI_CAP_LIST_ID);
if (id == 0xff)
break;
if (id == cap)
return pos;
- pos = read_pci_config_byte(num, slot, func,
+ pos = read_pci_config_byte(bus, slot, func,
pos+PCI_CAP_LIST_NEXT);
}
return 0;
}
/* Read a standard AGPv3 bridge header */
-static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order)
+static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order)
{
u32 apsize;
u32 apsizereg;
int nbits;
u32 aper_low, aper_hi;
u64 aper;
+ u32 old_order;
- printk(KERN_INFO "AGP bridge at %02x:%02x:%02x\n", num, slot, func);
- apsizereg = read_pci_config_16(num, slot, func, cap + 0x14);
+ printk(KERN_INFO "AGP bridge at %02x:%02x:%02x\n", bus, slot, func);
+ apsizereg = read_pci_config_16(bus, slot, func, cap + 0x14);
if (apsizereg == 0xffffffff) {
printk(KERN_ERR "APSIZE in AGP bridge unreadable\n");
return 0;
}
+ /* old_order could be the value from NB gart setting */
+ old_order = *order;
+
apsize = apsizereg & 0xfff;
/* Some BIOS use weird encodings not in the AGPv3 table. */
if (apsize & 0xff)
@@ -155,14 +165,26 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order)
if ((int)*order < 0) /* < 32MB */
*order = 0;
- aper_low = read_pci_config(num, slot, func, 0x10);
- aper_hi = read_pci_config(num, slot, func, 0x14);
+ aper_low = read_pci_config(bus, slot, func, 0x10);
+ aper_hi = read_pci_config(bus, slot, func, 0x14);
aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32);
+ /*
+ * On some sick chips, APSIZE is 0. It means it wants 4G
+ * so let double check that order, and lets trust AMD NB settings:
+ */
+ printk(KERN_INFO "Aperture from AGP @ %Lx old size %u MB\n",
+ aper, 32 << old_order);
+ if (aper + (32ULL<<(20 + *order)) > 0x100000000ULL) {
+ printk(KERN_INFO "Aperture size %u MB (APSIZE %x) is not right, using settings from NB\n",
+ 32 << *order, apsizereg);
+ *order = old_order;
+ }
+
printk(KERN_INFO "Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n",
aper, 32 << *order, apsizereg);
- if (!aperture_valid(aper, (32*1024*1024) << *order))
+ if (!aperture_valid(aper, (32*1024*1024) << *order, 32<<20))
return 0;
return (u32)aper;
}
@@ -180,17 +202,17 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order)
* the AGP bridges should be always an own bus on the HT hierarchy,
* but do it here for future safety.
*/
-static __u32 __init search_agp_bridge(u32 *order, int *valid_agp)
+static u32 __init search_agp_bridge(u32 *order, int *valid_agp)
{
- int num, slot, func;
+ int bus, slot, func;
/* Poor man's PCI discovery */
- for (num = 0; num < 256; num++) {
+ for (bus = 0; bus < 256; bus++) {
for (slot = 0; slot < 32; slot++) {
for (func = 0; func < 8; func++) {
u32 class, cap;
u8 type;
- class = read_pci_config(num, slot, func,
+ class = read_pci_config(bus, slot, func,
PCI_CLASS_REVISION);
if (class == 0xffffffff)
break;
@@ -199,17 +221,17 @@ static __u32 __init search_agp_bridge(u32 *order, int *valid_agp)
case PCI_CLASS_BRIDGE_HOST:
case PCI_CLASS_BRIDGE_OTHER: /* needed? */
/* AGP bridge? */
- cap = find_cap(num, slot, func,
+ cap = find_cap(bus, slot, func,
PCI_CAP_ID_AGP);
if (!cap)
break;
*valid_agp = 1;
- return read_agp(num, slot, func, cap,
+ return read_agp(bus, slot, func, cap,
order);
}
/* No multi-function device? */
- type = read_pci_config_byte(num, slot, func,
+ type = read_pci_config_byte(bus, slot, func,
PCI_HEADER_TYPE);
if (!(type & 0x80))
break;
@@ -249,36 +271,50 @@ void __init early_gart_iommu_check(void)
* or BIOS forget to put that in reserved.
* try to update e820 to make that region as reserved.
*/
- int fix, num;
+ int i, fix, slot;
u32 ctl;
u32 aper_size = 0, aper_order = 0, last_aper_order = 0;
u64 aper_base = 0, last_aper_base = 0;
- int aper_enabled = 0, last_aper_enabled = 0;
+ int aper_enabled = 0, last_aper_enabled = 0, last_valid = 0;
if (!early_pci_allowed())
return;
+ /* This is mostly duplicate of iommu_hole_init */
fix = 0;
- for (num = 24; num < 32; num++) {
- if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
- continue;
-
- ctl = read_pci_config(0, num, 3, 0x90);
- aper_enabled = ctl & 1;
- aper_order = (ctl >> 1) & 7;
- aper_size = (32 * 1024 * 1024) << aper_order;
- aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff;
- aper_base <<= 25;
-
- if ((last_aper_order && aper_order != last_aper_order) ||
- (last_aper_base && aper_base != last_aper_base) ||
- (last_aper_enabled && aper_enabled != last_aper_enabled)) {
- fix = 1;
- break;
+ for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
+ int bus;
+ int dev_base, dev_limit;
+
+ bus = bus_dev_ranges[i].bus;
+ dev_base = bus_dev_ranges[i].dev_base;
+ dev_limit = bus_dev_ranges[i].dev_limit;
+
+ for (slot = dev_base; slot < dev_limit; slot++) {
+ if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+ continue;
+
+ ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
+ aper_enabled = ctl & AMD64_GARTEN;
+ aper_order = (ctl >> 1) & 7;
+ aper_size = (32 * 1024 * 1024) << aper_order;
+ aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
+ aper_base <<= 25;
+
+ if (last_valid) {
+ if ((aper_order != last_aper_order) ||
+ (aper_base != last_aper_base) ||
+ (aper_enabled != last_aper_enabled)) {
+ fix = 1;
+ break;
+ }
+ }
+
+ last_aper_order = aper_order;
+ last_aper_base = aper_base;
+ last_aper_enabled = aper_enabled;
+ last_valid = 1;
}
- last_aper_order = aper_order;
- last_aper_base = aper_base;
- last_aper_enabled = aper_enabled;
}
if (!fix && !aper_enabled)
@@ -290,32 +326,46 @@ void __init early_gart_iommu_check(void)
if (gart_fix_e820 && !fix && aper_enabled) {
if (e820_any_mapped(aper_base, aper_base + aper_size,
E820_RAM)) {
- /* reserved it, so we can resuse it in second kernel */
+ /* reserve it, so we can reuse it in second kernel */
printk(KERN_INFO "update e820 for GART\n");
- add_memory_region(aper_base, aper_size, E820_RESERVED);
+ e820_add_region(aper_base, aper_size, E820_RESERVED);
update_e820();
}
- return;
}
+ if (!fix)
+ return;
+
/* different nodes have different setting, disable them all at first*/
- for (num = 24; num < 32; num++) {
- if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
- continue;
+ for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
+ int bus;
+ int dev_base, dev_limit;
+
+ bus = bus_dev_ranges[i].bus;
+ dev_base = bus_dev_ranges[i].dev_base;
+ dev_limit = bus_dev_ranges[i].dev_limit;
+
+ for (slot = dev_base; slot < dev_limit; slot++) {
+ if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+ continue;
- ctl = read_pci_config(0, num, 3, 0x90);
- ctl &= ~1;
- write_pci_config(0, num, 3, 0x90, ctl);
+ ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
+ ctl &= ~AMD64_GARTEN;
+ write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
+ }
}
}
+static int __initdata printed_gart_size_msg;
+
void __init gart_iommu_hole_init(void)
{
+ u32 agp_aper_base = 0, agp_aper_order = 0;
u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0;
u64 aper_base, last_aper_base = 0;
- int fix, num, valid_agp = 0;
- int node;
+ int fix, slot, valid_agp = 0;
+ int i, node;
if (gart_iommu_aperture_disabled || !fix_aperture ||
!early_pci_allowed())
@@ -323,38 +373,65 @@ void __init gart_iommu_hole_init(void)
printk(KERN_INFO "Checking aperture...\n");
+ if (!fallback_aper_force)
+ agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp);
+
fix = 0;
node = 0;
- for (num = 24; num < 32; num++) {
- if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
- continue;
-
- iommu_detected = 1;
- gart_iommu_aperture = 1;
-
- aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7;
- aper_size = (32 * 1024 * 1024) << aper_order;
- aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff;
- aper_base <<= 25;
-
- printk(KERN_INFO "Node %d: aperture @ %Lx size %u MB\n",
- node, aper_base, aper_size >> 20);
- node++;
-
- if (!aperture_valid(aper_base, aper_size)) {
- fix = 1;
- break;
- }
+ for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
+ int bus;
+ int dev_base, dev_limit;
+
+ bus = bus_dev_ranges[i].bus;
+ dev_base = bus_dev_ranges[i].dev_base;
+ dev_limit = bus_dev_ranges[i].dev_limit;
+
+ for (slot = dev_base; slot < dev_limit; slot++) {
+ if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+ continue;
+
+ iommu_detected = 1;
+ gart_iommu_aperture = 1;
+
+ aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7;
+ aper_size = (32 * 1024 * 1024) << aper_order;
+ aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
+ aper_base <<= 25;
+
+ printk(KERN_INFO "Node %d: aperture @ %Lx size %u MB\n",
+ node, aper_base, aper_size >> 20);
+ node++;
+
+ if (!aperture_valid(aper_base, aper_size, 64<<20)) {
+ if (valid_agp && agp_aper_base &&
+ agp_aper_base == aper_base &&
+ agp_aper_order == aper_order) {
+ /* the same between two setting from NB and agp */
+ if (!no_iommu &&
+ max_pfn > MAX_DMA32_PFN &&
+ !printed_gart_size_msg) {
+ printk(KERN_ERR "you are using iommu with agp, but GART size is less than 64M\n");
+ printk(KERN_ERR "please increase GART size in your BIOS setup\n");
+ printk(KERN_ERR "if BIOS doesn't have that option, contact your HW vendor!\n");
+ printed_gart_size_msg = 1;
+ }
+ } else {
+ fix = 1;
+ goto out;
+ }
+ }
- if ((last_aper_order && aper_order != last_aper_order) ||
- (last_aper_base && aper_base != last_aper_base)) {
- fix = 1;
- break;
+ if ((last_aper_order && aper_order != last_aper_order) ||
+ (last_aper_base && aper_base != last_aper_base)) {
+ fix = 1;
+ goto out;
+ }
+ last_aper_order = aper_order;
+ last_aper_base = aper_base;
}
- last_aper_order = aper_order;
- last_aper_base = aper_base;
}
+out:
if (!fix && !fallback_aper_force) {
if (last_aper_base) {
unsigned long n = (32 * 1024 * 1024) << last_aper_order;
@@ -364,14 +441,16 @@ void __init gart_iommu_hole_init(void)
return;
}
- if (!fallback_aper_force)
- aper_alloc = search_agp_bridge(&aper_order, &valid_agp);
+ if (!fallback_aper_force) {
+ aper_alloc = agp_aper_base;
+ aper_order = agp_aper_order;
+ }
if (aper_alloc) {
/* Got the aperture from the AGP bridge */
} else if (swiotlb && !valid_agp) {
/* Do nothing */
- } else if ((!no_iommu && end_pfn > MAX_DMA32_PFN) ||
+ } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) ||
force_iommu ||
valid_agp ||
fallback_aper_force) {
@@ -401,16 +480,24 @@ void __init gart_iommu_hole_init(void)
}
/* Fix up the north bridges */
- for (num = 24; num < 32; num++) {
- if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
- continue;
-
- /*
- * Don't enable translation yet. That is done later.
- * Assume this BIOS didn't initialise the GART so
- * just overwrite all previous bits
- */
- write_pci_config(0, num, 3, 0x90, aper_order<<1);
- write_pci_config(0, num, 3, 0x94, aper_alloc>>25);
+ for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
+ int bus;
+ int dev_base, dev_limit;
+
+ bus = bus_dev_ranges[i].bus;
+ dev_base = bus_dev_ranges[i].dev_base;
+ dev_limit = bus_dev_ranges[i].dev_limit;
+ for (slot = dev_base; slot < dev_limit; slot++) {
+ if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+ continue;
+
+ /* Don't enable translation yet. That is done later.
+ Assume this BIOS didn't initialise the GART so
+ just overwrite all previous bits */
+ write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, aper_order << 1);
+ write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25);
+ }
}
+
+ set_up_gart_resume(aper_order, aper_alloc);
}
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index 4b99b1bdeb6..a437d027f20 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -52,30 +52,41 @@
unsigned long mp_lapic_addr;
-DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID;
-EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
-
/*
* Knob to control our willingness to enable the local APIC.
*
- * -1=force-disable, +1=force-enable
+ * +1=force-enable
*/
-static int enable_local_apic __initdata;
+static int force_enable_local_apic;
+int disable_apic;
/* Local APIC timer verification ok */
static int local_apic_timer_verify_ok;
-/* Disable local APIC timer from the kernel commandline or via dmi quirk
- or using CPU MSR check */
-int local_apic_timer_disabled;
+/* Disable local APIC timer from the kernel commandline or via dmi quirk */
+static int local_apic_timer_disabled;
/* Local APIC timer works in C2 */
int local_apic_timer_c2_ok;
EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
+int first_system_vector = 0xfe;
+
+char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
+
/*
* Debug level, exported for io_apic.c
*/
int apic_verbosity;
+int pic_mode;
+
+/* Have we found an MP table */
+int smp_found_config;
+
+static struct resource lapic_resource = {
+ .name = "Local APIC",
+ .flags = IORESOURCE_MEM | IORESOURCE_BUSY,
+};
+
static unsigned int calibration_result;
static int lapic_next_event(unsigned long delta,
@@ -545,7 +556,7 @@ void __init setup_boot_APIC_clock(void)
lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
else
printk(KERN_WARNING "APIC timer registered as dummy,"
- " due to nmi_watchdog=1!\n");
+ " due to nmi_watchdog=%d!\n", nmi_watchdog);
}
/* Setup the lapic or request the broadcast */
@@ -963,7 +974,7 @@ void __cpuinit setup_local_APIC(void)
* Double-check whether this APIC is really registered.
*/
if (!apic_id_registered())
- BUG();
+ WARN_ON_ONCE(1);
/*
* Intel recommends to set DFR, LDR and TPR before enabling
@@ -1094,7 +1105,7 @@ static int __init detect_init_APIC(void)
u32 h, l, features;
/* Disabled by kernel option? */
- if (enable_local_apic < 0)
+ if (disable_apic)
return -1;
switch (boot_cpu_data.x86_vendor) {
@@ -1117,7 +1128,7 @@ static int __init detect_init_APIC(void)
* Over-ride BIOS and try to enable the local APIC only if
* "lapic" specified.
*/
- if (enable_local_apic <= 0) {
+ if (!force_enable_local_apic) {
printk(KERN_INFO "Local APIC disabled by BIOS -- "
"you can enable it with \"lapic\"\n");
return -1;
@@ -1154,9 +1165,6 @@ static int __init detect_init_APIC(void)
if (l & MSR_IA32_APICBASE_ENABLE)
mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
- if (nmi_watchdog != NMI_NONE && nmi_watchdog != NMI_DISABLED)
- nmi_watchdog = NMI_LOCAL_APIC;
-
printk(KERN_INFO "Found and enabled local APIC!\n");
apic_pm_activate();
@@ -1195,36 +1203,6 @@ void __init init_apic_mappings(void)
if (boot_cpu_physical_apicid == -1U)
boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id());
-#ifdef CONFIG_X86_IO_APIC
- {
- unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
- int i;
-
- for (i = 0; i < nr_ioapics; i++) {
- if (smp_found_config) {
- ioapic_phys = mp_ioapics[i].mpc_apicaddr;
- if (!ioapic_phys) {
- printk(KERN_ERR
- "WARNING: bogus zero IO-APIC "
- "address found in MPTABLE, "
- "disabling IO/APIC support!\n");
- smp_found_config = 0;
- skip_ioapic_setup = 1;
- goto fake_ioapic_page;
- }
- } else {
-fake_ioapic_page:
- ioapic_phys = (unsigned long)
- alloc_bootmem_pages(PAGE_SIZE);
- ioapic_phys = __pa(ioapic_phys);
- }
- set_fixmap_nocache(idx, ioapic_phys);
- printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n",
- __fix_to_virt(idx), ioapic_phys);
- idx++;
- }
- }
-#endif
}
/*
@@ -1236,7 +1214,7 @@ int apic_version[MAX_APICS];
int __init APIC_init_uniprocessor(void)
{
- if (enable_local_apic < 0)
+ if (disable_apic)
clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
if (!smp_found_config && !cpu_has_apic)
@@ -1265,10 +1243,14 @@ int __init APIC_init_uniprocessor(void)
#ifdef CONFIG_CRASH_DUMP
boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id());
#endif
- phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
+ physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
setup_local_APIC();
+#ifdef CONFIG_X86_IO_APIC
+ if (!smp_found_config || skip_ioapic_setup || !nr_ioapics)
+#endif
+ localise_nmi_watchdog();
end_local_APIC_setup();
#ifdef CONFIG_X86_IO_APIC
if (smp_found_config)
@@ -1351,13 +1333,17 @@ void __init smp_intr_init(void)
* The reschedule interrupt is a CPU-to-CPU reschedule-helper
* IPI, driven by wakeup.
*/
- set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
+ alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
/* IPI for invalidation */
- set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
+ alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
/* IPI for generic function call */
- set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
+ alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
+
+ /* IPI for single call function */
+ set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
+ call_function_single_interrupt);
}
#endif
@@ -1370,15 +1356,15 @@ void __init apic_intr_init(void)
smp_intr_init();
#endif
/* self generated IPI for local APIC timer */
- set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
+ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
/* IPI vectors for APIC spurious and error interrupts */
- set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
- set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
+ alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
/* thermal monitor LVT interrupt */
#ifdef CONFIG_X86_MCE_P4THERMAL
- set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+ alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
#endif
}
@@ -1513,6 +1499,9 @@ void __cpuinit generic_processor_info(int apicid, int version)
*/
cpu = 0;
+ if (apicid > max_physical_apicid)
+ max_physical_apicid = apicid;
+
/*
* Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
* but we need to work other dependencies like SMP_SUSPEND etc
@@ -1520,7 +1509,7 @@ void __cpuinit generic_processor_info(int apicid, int version)
* if (CPU_HOTPLUG_ENABLED || num_processors > 8)
* - Ashok Raj <ashok.raj@intel.com>
*/
- if (num_processors > 8) {
+ if (max_physical_apicid >= 8) {
switch (boot_cpu_data.x86_vendor) {
case X86_VENDOR_INTEL:
if (!APIC_XAPIC(version)) {
@@ -1534,9 +1523,9 @@ void __cpuinit generic_processor_info(int apicid, int version)
}
#ifdef CONFIG_SMP
/* are we being called early in kernel startup? */
- if (x86_cpu_to_apicid_early_ptr) {
- u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr;
- u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr;
+ if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
+ u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
+ u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
cpu_to_apicid[cpu] = apicid;
bios_cpu_apicid[cpu] = apicid;
@@ -1703,14 +1692,14 @@ static void apic_pm_activate(void) { }
*/
static int __init parse_lapic(char *arg)
{
- enable_local_apic = 1;
+ force_enable_local_apic = 1;
return 0;
}
early_param("lapic", parse_lapic);
static int __init parse_nolapic(char *arg)
{
- enable_local_apic = -1;
+ disable_apic = 1;
clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
return 0;
}
@@ -1740,3 +1729,21 @@ static int __init apic_set_verbosity(char *str)
}
__setup("apic=", apic_set_verbosity);
+static int __init lapic_insert_resource(void)
+{
+ if (!apic_phys)
+ return -1;
+
+ /* Put local APIC into the resource map. */
+ lapic_resource.start = apic_phys;
+ lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1;
+ insert_resource(&iomem_resource, &lapic_resource);
+
+ return 0;
+}
+
+/*
+ * need call insert after e820_reserve_resources()
+ * that is using request_resource
+ */
+late_initcall(lapic_insert_resource);
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
index 0633cfd0dc2..1e3d32e27c1 100644
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -43,7 +43,7 @@
#include <mach_ipi.h>
#include <mach_apic.h>
-int disable_apic_timer __cpuinitdata;
+static int disable_apic_timer __cpuinitdata;
static int apic_calibrate_pmtmr __initdata;
int disable_apic;
@@ -56,6 +56,9 @@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
*/
int apic_verbosity;
+/* Have we found an MP table */
+int smp_found_config;
+
static struct resource lapic_resource = {
.name = "Local APIC",
.flags = IORESOURCE_MEM | IORESOURCE_BUSY,
@@ -87,9 +90,6 @@ static unsigned long apic_phys;
unsigned long mp_lapic_addr;
-DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID;
-EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
-
unsigned int __cpuinitdata maxcpus = NR_CPUS;
/*
* Get the LAPIC version
@@ -417,37 +417,13 @@ void __init setup_boot_APIC_clock(void)
lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
else
printk(KERN_WARNING "APIC timer registered as dummy,"
- " due to nmi_watchdog=1!\n");
+ " due to nmi_watchdog=%d!\n", nmi_watchdog);
setup_APIC_timer();
}
-/*
- * AMD C1E enabled CPUs have a real nasty problem: Some BIOSes set the
- * C1E flag only in the secondary CPU, so when we detect the wreckage
- * we already have enabled the boot CPU local apic timer. Check, if
- * disable_apic_timer is set and the DUMMY flag is cleared. If yes,
- * set the DUMMY flag again and force the broadcast mode in the
- * clockevents layer.
- */
-static void __cpuinit check_boot_apic_timer_broadcast(void)
-{
- if (!disable_apic_timer ||
- (lapic_clockevent.features & CLOCK_EVT_FEAT_DUMMY))
- return;
-
- printk(KERN_INFO "AMD C1E detected late. Force timer broadcast.\n");
- lapic_clockevent.features |= CLOCK_EVT_FEAT_DUMMY;
-
- local_irq_enable();
- clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
- &boot_cpu_physical_apicid);
- local_irq_disable();
-}
-
void __cpuinit setup_secondary_APIC_clock(void)
{
- check_boot_apic_timer_broadcast();
setup_APIC_timer();
}
@@ -850,7 +826,6 @@ static void __cpuinit lapic_setup_esr(void)
void __cpuinit end_local_APIC_setup(void)
{
lapic_setup_esr();
- nmi_watchdog_default();
setup_apic_nmi_watchdog(NULL);
apic_pm_activate();
}
@@ -875,7 +850,7 @@ static int __init detect_init_APIC(void)
void __init early_init_lapic_mapping(void)
{
- unsigned long apic_phys;
+ unsigned long phys_addr;
/*
* If no local APIC can be found then go out
@@ -884,11 +859,11 @@ void __init early_init_lapic_mapping(void)
if (!smp_found_config)
return;
- apic_phys = mp_lapic_addr;
+ phys_addr = mp_lapic_addr;
- set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
+ set_fixmap_nocache(FIX_APIC_BASE, phys_addr);
apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
- APIC_BASE, apic_phys);
+ APIC_BASE, phys_addr);
/*
* Fetch the APIC ID of the BSP in case we have a
@@ -942,7 +917,9 @@ int __init APIC_init_uniprocessor(void)
verify_local_APIC();
- phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
+ connect_bsp_APIC();
+
+ physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid));
setup_local_APIC();
@@ -954,6 +931,8 @@ int __init APIC_init_uniprocessor(void)
if (!skip_ioapic_setup && nr_ioapics)
enable_IO_APIC();
+ if (!smp_found_config || skip_ioapic_setup || !nr_ioapics)
+ localise_nmi_watchdog();
end_local_APIC_setup();
if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
@@ -1021,6 +1000,14 @@ asmlinkage void smp_error_interrupt(void)
irq_exit();
}
+/**
+ * * connect_bsp_APIC - attach the APIC to the interrupt system
+ * */
+void __init connect_bsp_APIC(void)
+{
+ enable_apic_mode();
+}
+
void disconnect_bsp_APIC(int virt_wire_setup)
{
/* Go back to Virtual Wire compatibility mode */
@@ -1090,10 +1077,13 @@ void __cpuinit generic_processor_info(int apicid, int version)
*/
cpu = 0;
}
+ if (apicid > max_physical_apicid)
+ max_physical_apicid = apicid;
+
/* are we being called early in kernel startup? */
- if (x86_cpu_to_apicid_early_ptr) {
- u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr;
- u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr;
+ if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
+ u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
+ u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
cpu_to_apicid[cpu] = apicid;
bios_cpu_apicid[cpu] = apicid;
@@ -1269,7 +1259,7 @@ __cpuinit int apic_is_clustered_box(void)
if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box())
return 0;
- bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr;
+ bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
for (i = 0; i < NR_CPUS; i++) {
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index bf9290e2901..bf9b441331e 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -204,6 +204,7 @@
#include <linux/module.h>
#include <linux/poll.h>
+#include <linux/smp_lock.h>
#include <linux/types.h>
#include <linux/stddef.h>
#include <linux/timer.h>
@@ -228,6 +229,7 @@
#include <linux/suspend.h>
#include <linux/kthread.h>
#include <linux/jiffies.h>
+#include <linux/smp_lock.h>
#include <asm/system.h>
#include <asm/uaccess.h>
@@ -1149,7 +1151,7 @@ static void queue_event(apm_event_t event, struct apm_user *sender)
as->event_tail = 0;
}
as->events[as->event_head] = event;
- if ((!as->suser) || (!as->writer))
+ if (!as->suser || !as->writer)
continue;
switch (event) {
case APM_SYS_SUSPEND:
@@ -1211,9 +1213,9 @@ static int suspend(int vetoable)
if (err != APM_SUCCESS)
apm_error("suspend", err);
err = (err == APM_SUCCESS) ? 0 : -EIO;
- device_power_up();
+ device_power_up(PMSG_RESUME);
local_irq_enable();
- device_resume();
+ device_resume(PMSG_RESUME);
queue_event(APM_NORMAL_RESUME, NULL);
spin_lock(&user_list_lock);
for (as = user_list; as != NULL; as = as->next) {
@@ -1238,7 +1240,7 @@ static void standby(void)
apm_error("standby", err);
local_irq_disable();
- device_power_up();
+ device_power_up(PMSG_RESUME);
local_irq_enable();
}
@@ -1324,7 +1326,7 @@ static void check_events(void)
ignore_bounce = 1;
if ((event != APM_NORMAL_RESUME)
|| (ignore_normal_resume == 0)) {
- device_resume();
+ device_resume(PMSG_RESUME);
queue_event(event, NULL);
}
ignore_normal_resume = 0;
@@ -1396,7 +1398,7 @@ static void apm_mainloop(void)
static int check_apm_user(struct apm_user *as, const char *func)
{
- if ((as == NULL) || (as->magic != APM_BIOS_MAGIC)) {
+ if (as == NULL || as->magic != APM_BIOS_MAGIC) {
printk(KERN_ERR "apm: %s passed bad filp\n", func);
return 1;
}
@@ -1459,18 +1461,19 @@ static unsigned int do_poll(struct file *fp, poll_table *wait)
return 0;
}
-static int do_ioctl(struct inode *inode, struct file *filp,
- u_int cmd, u_long arg)
+static long do_ioctl(struct file *filp, u_int cmd, u_long arg)
{
struct apm_user *as;
+ int ret;
as = filp->private_data;
if (check_apm_user(as, "ioctl"))
return -EIO;
- if ((!as->suser) || (!as->writer))
+ if (!as->suser || !as->writer)
return -EPERM;
switch (cmd) {
case APM_IOC_STANDBY:
+ lock_kernel();
if (as->standbys_read > 0) {
as->standbys_read--;
as->standbys_pending--;
@@ -1479,8 +1482,10 @@ static int do_ioctl(struct inode *inode, struct file *filp,
queue_event(APM_USER_STANDBY, as);
if (standbys_pending <= 0)
standby();
+ unlock_kernel();
break;
case APM_IOC_SUSPEND:
+ lock_kernel();
if (as->suspends_read > 0) {
as->suspends_read--;
as->suspends_pending--;
@@ -1488,16 +1493,17 @@ static int do_ioctl(struct inode *inode, struct file *filp,
} else
queue_event(APM_USER_SUSPEND, as);
if (suspends_pending <= 0) {
- return suspend(1);
+ ret = suspend(1);
} else {
as->suspend_wait = 1;
wait_event_interruptible(apm_suspend_waitqueue,
as->suspend_wait == 0);
- return as->suspend_result;
+ ret = as->suspend_result;
}
- break;
+ unlock_kernel();
+ return ret;
default:
- return -EINVAL;
+ return -ENOTTY;
}
return 0;
}
@@ -1544,10 +1550,12 @@ static int do_open(struct inode *inode, struct file *filp)
{
struct apm_user *as;
+ lock_kernel();
as = kmalloc(sizeof(*as), GFP_KERNEL);
if (as == NULL) {
printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n",
sizeof(*as));
+ unlock_kernel();
return -ENOMEM;
}
as->magic = APM_BIOS_MAGIC;
@@ -1569,6 +1577,7 @@ static int do_open(struct inode *inode, struct file *filp)
user_list = as;
spin_unlock(&user_list_lock);
filp->private_data = as;
+ unlock_kernel();
return 0;
}
@@ -1860,7 +1869,7 @@ static const struct file_operations apm_bios_fops = {
.owner = THIS_MODULE,
.read = do_read,
.poll = do_poll,
- .ioctl = do_ioctl,
+ .unlocked_ioctl = do_ioctl,
.open = do_open,
.release = do_release,
};
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 92588083950..6649d09ad88 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -111,7 +111,7 @@ void foo(void)
OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
- OFFSET(PV_CPU_irq_enable_syscall_ret, pv_cpu_ops, irq_enable_syscall_ret);
+ OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
#endif
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index f126c05d617..bacf5deeec2 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -34,7 +34,7 @@ int main(void)
ENTRY(pid);
BLANK();
#undef ENTRY
-#define ENTRY(entry) DEFINE(threadinfo_ ## entry, offsetof(struct thread_info, entry))
+#define ENTRY(entry) DEFINE(TI_ ## entry, offsetof(struct thread_info, entry))
ENTRY(flags);
ENTRY(addr_limit);
ENTRY(preempt_count);
@@ -61,8 +61,11 @@ int main(void)
OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
+ OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame);
OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
- OFFSET(PV_CPU_irq_enable_syscall_ret, pv_cpu_ops, irq_enable_syscall_ret);
+ OFFSET(PV_CPU_usergs_sysret32, pv_cpu_ops, usergs_sysret32);
+ OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
+ OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2);
#endif
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index a0c6f819088..ee76eaad300 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -6,11 +6,15 @@ obj-y := intel_cacheinfo.o addon_cpuid_features.o
obj-y += proc.o feature_names.o
obj-$(CONFIG_X86_32) += common.o bugs.o
+obj-$(CONFIG_X86_64) += common_64.o bugs_64.o
obj-$(CONFIG_X86_32) += amd.o
+obj-$(CONFIG_X86_64) += amd_64.o
obj-$(CONFIG_X86_32) += cyrix.o
obj-$(CONFIG_X86_32) += centaur.o
+obj-$(CONFIG_X86_64) += centaur_64.o
obj-$(CONFIG_X86_32) += transmeta.o
obj-$(CONFIG_X86_32) += intel.o
+obj-$(CONFIG_X86_64) += intel_64.o
obj-$(CONFIG_X86_32) += umc.o
obj-$(CONFIG_X86_MCE) += mcheck/
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index c2e1ce33c7c..84a8220a607 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -1,9 +1,7 @@
-
/*
* Routines to indentify additional cpu features that are scattered in
* cpuid space.
*/
-
#include <linux/cpu.h>
#include <asm/pat.h>
@@ -53,19 +51,20 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
#ifdef CONFIG_X86_PAT
void __cpuinit validate_pat_support(struct cpuinfo_x86 *c)
{
+ if (!cpu_has_pat)
+ pat_disable("PAT not supported by CPU.");
+
switch (c->x86_vendor) {
- case X86_VENDOR_AMD:
- if (c->x86 >= 0xf && c->x86 <= 0x11)
- return;
- break;
case X86_VENDOR_INTEL:
if (c->x86 == 0xF || (c->x86 == 6 && c->x86_model >= 15))
return;
break;
+ case X86_VENDOR_AMD:
+ case X86_VENDOR_CENTAUR:
+ case X86_VENDOR_TRANSMETA:
+ return;
}
- pat_disable(cpu_has_pat ?
- "PAT disabled. Not yet verified on this CPU type." :
- "PAT not supported by CPU.");
+ pat_disable("PAT disabled. Not yet verified on this CPU type.");
}
#endif
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 24586682829..81a07ca65d4 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -24,43 +24,6 @@
extern void vide(void);
__asm__(".align 4\nvide: ret");
-#ifdef CONFIG_X86_LOCAL_APIC
-#define ENABLE_C1E_MASK 0x18000000
-#define CPUID_PROCESSOR_SIGNATURE 1
-#define CPUID_XFAM 0x0ff00000
-#define CPUID_XFAM_K8 0x00000000
-#define CPUID_XFAM_10H 0x00100000
-#define CPUID_XFAM_11H 0x00200000
-#define CPUID_XMOD 0x000f0000
-#define CPUID_XMOD_REV_F 0x00040000
-
-/* AMD systems with C1E don't have a working lAPIC timer. Check for that. */
-static __cpuinit int amd_apic_timer_broken(void)
-{
- u32 lo, hi;
- u32 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
- switch (eax & CPUID_XFAM) {
- case CPUID_XFAM_K8:
- if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F)
- break;
- case CPUID_XFAM_10H:
- case CPUID_XFAM_11H:
- rdmsr(MSR_K8_ENABLE_C1E, lo, hi);
- if (lo & ENABLE_C1E_MASK) {
- if (smp_processor_id() != boot_cpu_physical_apicid)
- printk(KERN_INFO "AMD C1E detected late. "
- " Force timer broadcast.\n");
- return 1;
- }
- break;
- default:
- /* err on the side of caution */
- return 1;
- }
- return 0;
-}
-#endif
-
int force_mwait __cpuinitdata;
static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
@@ -297,11 +260,6 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
num_cache_leaves = 3;
}
-#ifdef CONFIG_X86_LOCAL_APIC
- if (amd_apic_timer_broken())
- local_apic_timer_disabled = 1;
-#endif
-
/* K6s reports MCEs but don't actually have all the MSRs */
if (c->x86 < 6)
clear_cpu_cap(c, X86_FEATURE_MCE);
diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c
new file mode 100644
index 00000000000..7c36fb8a28d
--- /dev/null
+++ b/arch/x86/kernel/cpu/amd_64.c
@@ -0,0 +1,222 @@
+#include <linux/init.h>
+#include <linux/mm.h>
+
+#include <asm/numa_64.h>
+#include <asm/mmconfig.h>
+#include <asm/cacheflush.h>
+
+#include <mach_apic.h>
+
+#include "cpu.h"
+
+int force_mwait __cpuinitdata;
+
+#ifdef CONFIG_NUMA
+static int __cpuinit nearby_node(int apicid)
+{
+ int i, node;
+
+ for (i = apicid - 1; i >= 0; i--) {
+ node = apicid_to_node[i];
+ if (node != NUMA_NO_NODE && node_online(node))
+ return node;
+ }
+ for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
+ node = apicid_to_node[i];
+ if (node != NUMA_NO_NODE && node_online(node))
+ return node;
+ }
+ return first_node(node_online_map); /* Shouldn't happen */
+}
+#endif
+
+/*
+ * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
+ * Assumes number of cores is a power of two.
+ */
+static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+ unsigned bits;
+#ifdef CONFIG_NUMA
+ int cpu = smp_processor_id();
+ int node = 0;
+ unsigned apicid = hard_smp_processor_id();
+#endif
+ bits = c->x86_coreid_bits;
+
+ /* Low order bits define the core id (index of core in socket) */
+ c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
+ /* Convert the initial APIC ID into the socket ID */
+ c->phys_proc_id = c->initial_apicid >> bits;
+
+#ifdef CONFIG_NUMA
+ node = c->phys_proc_id;
+ if (apicid_to_node[apicid] != NUMA_NO_NODE)
+ node = apicid_to_node[apicid];
+ if (!node_online(node)) {
+ /* Two possibilities here:
+ - The CPU is missing memory and no node was created.
+ In that case try picking one from a nearby CPU
+ - The APIC IDs differ from the HyperTransport node IDs
+ which the K8 northbridge parsing fills in.
+ Assume they are all increased by a constant offset,
+ but in the same order as the HT nodeids.
+ If that doesn't result in a usable node fall back to the
+ path for the previous case. */
+
+ int ht_nodeid = c->initial_apicid;
+
+ if (ht_nodeid >= 0 &&
+ apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
+ node = apicid_to_node[ht_nodeid];
+ /* Pick a nearby node */
+ if (!node_online(node))
+ node = nearby_node(apicid);
+ }
+ numa_set_node(cpu, node);
+
+ printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
+#endif
+#endif
+}
+
+static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+ unsigned bits, ecx;
+
+ /* Multi core CPU? */
+ if (c->extended_cpuid_level < 0x80000008)
+ return;
+
+ ecx = cpuid_ecx(0x80000008);
+
+ c->x86_max_cores = (ecx & 0xff) + 1;
+
+ /* CPU telling us the core id bits shift? */
+ bits = (ecx >> 12) & 0xF;
+
+ /* Otherwise recompute */
+ if (bits == 0) {
+ while ((1 << bits) < c->x86_max_cores)
+ bits++;
+ }
+
+ c->x86_coreid_bits = bits;
+
+#endif
+}
+
+static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
+{
+ early_init_amd_mc(c);
+
+ /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
+ if (c->x86_power & (1<<8))
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+}
+
+static void __cpuinit init_amd(struct cpuinfo_x86 *c)
+{
+ unsigned level;
+
+#ifdef CONFIG_SMP
+ unsigned long value;
+
+ /*
+ * Disable TLB flush filter by setting HWCR.FFDIS on K8
+ * bit 6 of msr C001_0015
+ *
+ * Errata 63 for SH-B3 steppings
+ * Errata 122 for all steppings (F+ have it disabled by default)
+ */
+ if (c->x86 == 0xf) {
+ rdmsrl(MSR_K8_HWCR, value);
+ value |= 1 << 6;
+ wrmsrl(MSR_K8_HWCR, value);
+ }
+#endif
+
+ /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
+ 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
+ clear_cpu_cap(c, 0*32+31);
+
+ /* On C+ stepping K8 rep microcode works well for copy/memset */
+ if (c->x86 == 0xf) {
+ level = cpuid_eax(1);
+ if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+ }
+ if (c->x86 == 0x10 || c->x86 == 0x11)
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+
+ /* Enable workaround for FXSAVE leak */
+ if (c->x86 >= 6)
+ set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
+
+ level = get_model_name(c);
+ if (!level) {
+ switch (c->x86) {
+ case 0xf:
+ /* Should distinguish Models here, but this is only
+ a fallback anyways. */
+ strcpy(c->x86_model_id, "Hammer");
+ break;
+ }
+ }
+ display_cacheinfo(c);
+
+ /* Multi core CPU? */
+ if (c->extended_cpuid_level >= 0x80000008)
+ amd_detect_cmp(c);
+
+ if (c->extended_cpuid_level >= 0x80000006 &&
+ (cpuid_edx(0x80000006) & 0xf000))
+ num_cache_leaves = 4;
+ else
+ num_cache_leaves = 3;
+
+ if (c->x86 >= 0xf && c->x86 <= 0x11)
+ set_cpu_cap(c, X86_FEATURE_K8);
+
+ /* MFENCE stops RDTSC speculation */
+ set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
+
+ if (c->x86 == 0x10) {
+ /* do this for boot cpu */
+ if (c == &boot_cpu_data)
+ check_enable_amd_mmconf_dmi();
+
+ fam10h_check_enable_mmcfg();
+ }
+
+ if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
+ unsigned long long tseg;
+
+ /*
+ * Split up direct mapping around the TSEG SMM area.
+ * Don't do it for gbpages because there seems very little
+ * benefit in doing so.
+ */
+ if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
+ printk(KERN_DEBUG "tseg: %010llx\n", tseg);
+ if ((tseg>>PMD_SHIFT) <
+ (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
+ ((tseg>>PMD_SHIFT) <
+ (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
+ (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT))))
+ set_memory_4k((unsigned long)__va(tseg), 1);
+ }
+ }
+}
+
+static struct cpu_dev amd_cpu_dev __cpuinitdata = {
+ .c_vendor = "AMD",
+ .c_ident = { "AuthenticAMD" },
+ .c_early_init = early_init_amd,
+ .c_init = init_amd,
+};
+
+cpu_vendor_dev_register(X86_VENDOR_AMD, &amd_cpu_dev);
+
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 170d2f5523b..1b1c56bb338 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -59,8 +59,12 @@ static void __init check_fpu(void)
return;
}
-/* trap_init() enabled FXSR and company _before_ testing for FP problems here. */
- /* Test for the divl bug.. */
+ /*
+ * trap_init() enabled FXSR and company _before_ testing for FP
+ * problems here.
+ *
+ * Test for the divl bug..
+ */
__asm__("fninit\n\t"
"fldl %1\n\t"
"fdivl %2\n\t"
@@ -108,10 +112,15 @@ static void __init check_popad(void)
"movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx "
: "=&a" (res)
: "d" (inp)
- : "ecx", "edi" );
- /* If this fails, it means that any user program may lock the CPU hard. Too bad. */
- if (res != 12345678) printk( "Buggy.\n" );
- else printk( "OK.\n" );
+ : "ecx", "edi");
+ /*
+ * If this fails, it means that any user program may lock the
+ * CPU hard. Too bad.
+ */
+ if (res != 12345678)
+ printk("Buggy.\n");
+ else
+ printk("OK.\n");
#endif
}
@@ -137,7 +146,8 @@ static void __init check_config(void)
* i486+ only features! (WP works in supervisor mode and the
* new "invlpg" and "bswap" instructions)
*/
-#if defined(CONFIG_X86_WP_WORKS_OK) || defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_BSWAP)
+#if defined(CONFIG_X86_WP_WORKS_OK) || defined(CONFIG_X86_INVLPG) || \
+ defined(CONFIG_X86_BSWAP)
if (boot_cpu_data.x86 == 3)
panic("Kernel requires i486+ for 'invlpg' and other features");
#endif
@@ -170,6 +180,7 @@ void __init check_bugs(void)
check_fpu();
check_hlt();
check_popad();
- init_utsname()->machine[1] = '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
+ init_utsname()->machine[1] =
+ '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
alternative_instructions();
}
diff --git a/arch/x86/kernel/bugs_64.c b/arch/x86/kernel/cpu/bugs_64.c
index 9a3ed0649d4..9a3ed0649d4 100644
--- a/arch/x86/kernel/bugs_64.c
+++ b/arch/x86/kernel/cpu/bugs_64.c
diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c
new file mode 100644
index 00000000000..1d181c40e2e
--- /dev/null
+++ b/arch/x86/kernel/cpu/centaur_64.c
@@ -0,0 +1,35 @@
+#include <linux/init.h>
+#include <linux/smp.h>
+
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+
+#include "cpu.h"
+
+static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
+{
+ if (c->x86 == 0x6 && c->x86_model >= 0xf)
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+
+ set_cpu_cap(c, X86_FEATURE_SYSENTER32);
+}
+
+static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
+{
+ if (c->x86 == 0x6 && c->x86_model >= 0xf) {
+ c->x86_cache_alignment = c->x86_clflush_size * 2;
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+ }
+ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+}
+
+static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
+ .c_vendor = "Centaur",
+ .c_ident = { "CentaurHauls" },
+ .c_early_init = early_init_centaur,
+ .c_init = init_centaur,
+};
+
+cpu_vendor_dev_register(X86_VENDOR_CENTAUR, &centaur_cpu_dev);
+
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index d0463a94624..80ab20d4fa3 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -427,7 +427,7 @@ __setup("serialnumber", x86_serial_nr_setup);
/*
* This does the hard work of actually picking apart the CPU stuff...
*/
-void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
+static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
{
int i;
diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c
new file mode 100644
index 00000000000..7b8cc72feb4
--- /dev/null
+++ b/arch/x86/kernel/cpu/common_64.c
@@ -0,0 +1,681 @@
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/bootmem.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/kgdb.h>
+#include <linux/topology.h>
+#include <linux/string.h>
+#include <linux/delay.h>
+#include <linux/smp.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <asm/processor.h>
+#include <asm/i387.h>
+#include <asm/msr.h>
+#include <asm/io.h>
+#include <asm/mmu_context.h>
+#include <asm/mtrr.h>
+#include <asm/mce.h>
+#include <asm/pat.h>
+#include <asm/numa.h>
+#ifdef CONFIG_X86_LOCAL_APIC
+#include <asm/mpspec.h>
+#include <asm/apic.h>
+#include <mach_apic.h>
+#endif
+#include <asm/pda.h>
+#include <asm/pgtable.h>
+#include <asm/processor.h>
+#include <asm/desc.h>
+#include <asm/atomic.h>
+#include <asm/proto.h>
+#include <asm/sections.h>
+#include <asm/setup.h>
+#include <asm/genapic.h>
+
+#include "cpu.h"
+
+/* We need valid kernel segments for data and code in long mode too
+ * IRET will check the segment types kkeil 2000/10/28
+ * Also sysret mandates a special GDT layout
+ */
+/* The TLS descriptors are currently at a different place compared to i386.
+ Hopefully nobody expects them at a fixed place (Wine?) */
+DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
+ [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
+ [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
+ [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
+ [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
+ [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
+ [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
+} };
+EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
+
+__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
+
+/* Current gdt points %fs at the "master" per-cpu area: after this,
+ * it's on the real one. */
+void switch_to_new_gdt(void)
+{
+ struct desc_ptr gdt_descr;
+
+ gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
+ gdt_descr.size = GDT_SIZE - 1;
+ load_gdt(&gdt_descr);
+}
+
+struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
+
+static void __cpuinit default_init(struct cpuinfo_x86 *c)
+{
+ display_cacheinfo(c);
+}
+
+static struct cpu_dev __cpuinitdata default_cpu = {
+ .c_init = default_init,
+ .c_vendor = "Unknown",
+};
+static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
+
+int __cpuinit get_model_name(struct cpuinfo_x86 *c)
+{
+ unsigned int *v;
+
+ if (c->extended_cpuid_level < 0x80000004)
+ return 0;
+
+ v = (unsigned int *) c->x86_model_id;
+ cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
+ cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
+ cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
+ c->x86_model_id[48] = 0;
+ return 1;
+}
+
+
+void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
+{
+ unsigned int n, dummy, ebx, ecx, edx;
+
+ n = c->extended_cpuid_level;
+
+ if (n >= 0x80000005) {
+ cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
+ printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
+ "D cache %dK (%d bytes/line)\n",
+ edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
+ c->x86_cache_size = (ecx>>24) + (edx>>24);
+ /* On K8 L1 TLB is inclusive, so don't count it */
+ c->x86_tlbsize = 0;
+ }
+
+ if (n >= 0x80000006) {
+ cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
+ ecx = cpuid_ecx(0x80000006);
+ c->x86_cache_size = ecx >> 16;
+ c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
+
+ printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
+ c->x86_cache_size, ecx & 0xFF);
+ }
+}
+
+void __cpuinit detect_ht(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+ u32 eax, ebx, ecx, edx;
+ int index_msb, core_bits;
+
+ cpuid(1, &eax, &ebx, &ecx, &edx);
+
+
+ if (!cpu_has(c, X86_FEATURE_HT))
+ return;
+ if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
+ goto out;
+
+ smp_num_siblings = (ebx & 0xff0000) >> 16;
+
+ if (smp_num_siblings == 1) {
+ printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
+ } else if (smp_num_siblings > 1) {
+
+ if (smp_num_siblings > NR_CPUS) {
+ printk(KERN_WARNING "CPU: Unsupported number of "
+ "siblings %d", smp_num_siblings);
+ smp_num_siblings = 1;
+ return;
+ }
+
+ index_msb = get_count_order(smp_num_siblings);
+ c->phys_proc_id = phys_pkg_id(index_msb);
+
+ smp_num_siblings = smp_num_siblings / c->x86_max_cores;
+
+ index_msb = get_count_order(smp_num_siblings);
+
+ core_bits = get_count_order(c->x86_max_cores);
+
+ c->cpu_core_id = phys_pkg_id(index_msb) &
+ ((1 << core_bits) - 1);
+ }
+out:
+ if ((c->x86_max_cores * smp_num_siblings) > 1) {
+ printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
+ c->phys_proc_id);
+ printk(KERN_INFO "CPU: Processor Core ID: %d\n",
+ c->cpu_core_id);
+ }
+
+#endif
+}
+
+static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
+{
+ char *v = c->x86_vendor_id;
+ int i;
+ static int printed;
+
+ for (i = 0; i < X86_VENDOR_NUM; i++) {
+ if (cpu_devs[i]) {
+ if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
+ (cpu_devs[i]->c_ident[1] &&
+ !strcmp(v, cpu_devs[i]->c_ident[1]))) {
+ c->x86_vendor = i;
+ this_cpu = cpu_devs[i];
+ return;
+ }
+ }
+ }
+ if (!printed) {
+ printed++;
+ printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
+ printk(KERN_ERR "CPU: Your system may be unstable.\n");
+ }
+ c->x86_vendor = X86_VENDOR_UNKNOWN;
+}
+
+static void __init early_cpu_support_print(void)
+{
+ int i,j;
+ struct cpu_dev *cpu_devx;
+
+ printk("KERNEL supported cpus:\n");
+ for (i = 0; i < X86_VENDOR_NUM; i++) {
+ cpu_devx = cpu_devs[i];
+ if (!cpu_devx)
+ continue;
+ for (j = 0; j < 2; j++) {
+ if (!cpu_devx->c_ident[j])
+ continue;
+ printk(" %s %s\n", cpu_devx->c_vendor,
+ cpu_devx->c_ident[j]);
+ }
+ }
+}
+
+static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
+
+void __init early_cpu_init(void)
+{
+ struct cpu_vendor_dev *cvdev;
+
+ for (cvdev = __x86cpuvendor_start ;
+ cvdev < __x86cpuvendor_end ;
+ cvdev++)
+ cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
+ early_cpu_support_print();
+ early_identify_cpu(&boot_cpu_data);
+}
+
+/* Do some early cpuid on the boot CPU to get some parameter that are
+ needed before check_bugs. Everything advanced is in identify_cpu
+ below. */
+static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
+{
+ u32 tfms, xlvl;
+
+ c->loops_per_jiffy = loops_per_jiffy;
+ c->x86_cache_size = -1;
+ c->x86_vendor = X86_VENDOR_UNKNOWN;
+ c->x86_model = c->x86_mask = 0; /* So far unknown... */
+ c->x86_vendor_id[0] = '\0'; /* Unset */
+ c->x86_model_id[0] = '\0'; /* Unset */
+ c->x86_clflush_size = 64;
+ c->x86_cache_alignment = c->x86_clflush_size;
+ c->x86_max_cores = 1;
+ c->x86_coreid_bits = 0;
+ c->extended_cpuid_level = 0;
+ memset(&c->x86_capability, 0, sizeof c->x86_capability);
+
+ /* Get vendor name */
+ cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
+ (unsigned int *)&c->x86_vendor_id[0],
+ (unsigned int *)&c->x86_vendor_id[8],
+ (unsigned int *)&c->x86_vendor_id[4]);
+
+ get_cpu_vendor(c);
+
+ /* Initialize the standard set of capabilities */
+ /* Note that the vendor-specific code below might override */
+
+ /* Intel-defined flags: level 0x00000001 */
+ if (c->cpuid_level >= 0x00000001) {
+ __u32 misc;
+ cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
+ &c->x86_capability[0]);
+ c->x86 = (tfms >> 8) & 0xf;
+ c->x86_model = (tfms >> 4) & 0xf;
+ c->x86_mask = tfms & 0xf;
+ if (c->x86 == 0xf)
+ c->x86 += (tfms >> 20) & 0xff;
+ if (c->x86 >= 0x6)
+ c->x86_model += ((tfms >> 16) & 0xF) << 4;
+ if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
+ c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
+ } else {
+ /* Have CPUID level 0 only - unheard of */
+ c->x86 = 4;
+ }
+
+ c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
+#ifdef CONFIG_SMP
+ c->phys_proc_id = c->initial_apicid;
+#endif
+ /* AMD-defined flags: level 0x80000001 */
+ xlvl = cpuid_eax(0x80000000);
+ c->extended_cpuid_level = xlvl;
+ if ((xlvl & 0xffff0000) == 0x80000000) {
+ if (xlvl >= 0x80000001) {
+ c->x86_capability[1] = cpuid_edx(0x80000001);
+ c->x86_capability[6] = cpuid_ecx(0x80000001);
+ }
+ if (xlvl >= 0x80000004)
+ get_model_name(c); /* Default name */
+ }
+
+ /* Transmeta-defined flags: level 0x80860001 */
+ xlvl = cpuid_eax(0x80860000);
+ if ((xlvl & 0xffff0000) == 0x80860000) {
+ /* Don't set x86_cpuid_level here for now to not confuse. */
+ if (xlvl >= 0x80860001)
+ c->x86_capability[2] = cpuid_edx(0x80860001);
+ }
+
+ c->extended_cpuid_level = cpuid_eax(0x80000000);
+ if (c->extended_cpuid_level >= 0x80000007)
+ c->x86_power = cpuid_edx(0x80000007);
+
+ if (c->extended_cpuid_level >= 0x80000008) {
+ u32 eax = cpuid_eax(0x80000008);
+
+ c->x86_virt_bits = (eax >> 8) & 0xff;
+ c->x86_phys_bits = eax & 0xff;
+ }
+
+ /* Assume all 64-bit CPUs support 32-bit syscall */
+ set_cpu_cap(c, X86_FEATURE_SYSCALL32);
+
+ if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
+ cpu_devs[c->x86_vendor]->c_early_init)
+ cpu_devs[c->x86_vendor]->c_early_init(c);
+
+ validate_pat_support(c);
+
+ /* early_param could clear that, but recall get it set again */
+ if (disable_apic)
+ clear_cpu_cap(c, X86_FEATURE_APIC);
+}
+
+/*
+ * This does the hard work of actually picking apart the CPU stuff...
+ */
+static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
+{
+ int i;
+
+ early_identify_cpu(c);
+
+ init_scattered_cpuid_features(c);
+
+ c->apicid = phys_pkg_id(0);
+
+ /*
+ * Vendor-specific initialization. In this section we
+ * canonicalize the feature flags, meaning if there are
+ * features a certain CPU supports which CPUID doesn't
+ * tell us, CPUID claiming incorrect flags, or other bugs,
+ * we handle them here.
+ *
+ * At the end of this section, c->x86_capability better
+ * indicate the features this CPU genuinely supports!
+ */
+ if (this_cpu->c_init)
+ this_cpu->c_init(c);
+
+ detect_ht(c);
+
+ /*
+ * On SMP, boot_cpu_data holds the common feature set between
+ * all CPUs; so make sure that we indicate which features are
+ * common between the CPUs. The first time this routine gets
+ * executed, c == &boot_cpu_data.
+ */
+ if (c != &boot_cpu_data) {
+ /* AND the already accumulated flags with these */
+ for (i = 0; i < NCAPINTS; i++)
+ boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
+ }
+
+ /* Clear all flags overriden by options */
+ for (i = 0; i < NCAPINTS; i++)
+ c->x86_capability[i] &= ~cleared_cpu_caps[i];
+
+#ifdef CONFIG_X86_MCE
+ mcheck_init(c);
+#endif
+ select_idle_routine(c);
+
+#ifdef CONFIG_NUMA
+ numa_add_cpu(smp_processor_id());
+#endif
+
+}
+
+void __cpuinit identify_boot_cpu(void)
+{
+ identify_cpu(&boot_cpu_data);
+}
+
+void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
+{
+ BUG_ON(c == &boot_cpu_data);
+ identify_cpu(c);
+ mtrr_ap_init();
+}
+
+static __init int setup_noclflush(char *arg)
+{
+ setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
+ return 1;
+}
+__setup("noclflush", setup_noclflush);
+
+void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
+{
+ if (c->x86_model_id[0])
+ printk(KERN_CONT "%s", c->x86_model_id);
+
+ if (c->x86_mask || c->cpuid_level >= 0)
+ printk(KERN_CONT " stepping %02x\n", c->x86_mask);
+ else
+ printk(KERN_CONT "\n");
+}
+
+static __init int setup_disablecpuid(char *arg)
+{
+ int bit;
+ if (get_option(&arg, &bit) && bit < NCAPINTS*32)
+ setup_clear_cpu_cap(bit);
+ else
+ return 0;
+ return 1;
+}
+__setup("clearcpuid=", setup_disablecpuid);
+
+cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
+
+struct x8664_pda **_cpu_pda __read_mostly;
+EXPORT_SYMBOL(_cpu_pda);
+
+struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
+
+char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
+
+unsigned long __supported_pte_mask __read_mostly = ~0UL;
+EXPORT_SYMBOL_GPL(__supported_pte_mask);
+
+static int do_not_nx __cpuinitdata;
+
+/* noexec=on|off
+Control non executable mappings for 64bit processes.
+
+on Enable(default)
+off Disable
+*/
+static int __init nonx_setup(char *str)
+{
+ if (!str)
+ return -EINVAL;
+ if (!strncmp(str, "on", 2)) {
+ __supported_pte_mask |= _PAGE_NX;
+ do_not_nx = 0;
+ } else if (!strncmp(str, "off", 3)) {
+ do_not_nx = 1;
+ __supported_pte_mask &= ~_PAGE_NX;
+ }
+ return 0;
+}
+early_param("noexec", nonx_setup);
+
+int force_personality32;
+
+/* noexec32=on|off
+Control non executable heap for 32bit processes.
+To control the stack too use noexec=off
+
+on PROT_READ does not imply PROT_EXEC for 32bit processes (default)
+off PROT_READ implies PROT_EXEC
+*/
+static int __init nonx32_setup(char *str)
+{
+ if (!strcmp(str, "on"))
+ force_personality32 &= ~READ_IMPLIES_EXEC;
+ else if (!strcmp(str, "off"))
+ force_personality32 |= READ_IMPLIES_EXEC;
+ return 1;
+}
+__setup("noexec32=", nonx32_setup);
+
+void pda_init(int cpu)
+{
+ struct x8664_pda *pda = cpu_pda(cpu);
+
+ /* Setup up data that may be needed in __get_free_pages early */
+ loadsegment(fs, 0);
+ loadsegment(gs, 0);
+ /* Memory clobbers used to order PDA accessed */
+ mb();
+ wrmsrl(MSR_GS_BASE, pda);
+ mb();
+
+ pda->cpunumber = cpu;
+ pda->irqcount = -1;
+ pda->kernelstack = (unsigned long)stack_thread_info() -
+ PDA_STACKOFFSET + THREAD_SIZE;
+ pda->active_mm = &init_mm;
+ pda->mmu_state = 0;
+
+ if (cpu == 0) {
+ /* others are initialized in smpboot.c */
+ pda->pcurrent = &init_task;
+ pda->irqstackptr = boot_cpu_stack;
+ } else {
+ pda->irqstackptr = (char *)
+ __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
+ if (!pda->irqstackptr)
+ panic("cannot allocate irqstack for cpu %d", cpu);
+
+ if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
+ pda->nodenumber = cpu_to_node(cpu);
+ }
+
+ pda->irqstackptr += IRQSTACKSIZE-64;
+}
+
+char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
+ DEBUG_STKSZ]
+__attribute__((section(".bss.page_aligned")));
+
+extern asmlinkage void ignore_sysret(void);
+
+/* May not be marked __init: used by software suspend */
+void syscall_init(void)
+{
+ /*
+ * LSTAR and STAR live in a bit strange symbiosis.
+ * They both write to the same internal register. STAR allows to
+ * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
+ */
+ wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
+ wrmsrl(MSR_LSTAR, system_call);
+ wrmsrl(MSR_CSTAR, ignore_sysret);
+
+#ifdef CONFIG_IA32_EMULATION
+ syscall32_cpu_init();
+#endif
+
+ /* Flags to clear on syscall */
+ wrmsrl(MSR_SYSCALL_MASK,
+ X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
+}
+
+void __cpuinit check_efer(void)
+{
+ unsigned long efer;
+
+ rdmsrl(MSR_EFER, efer);
+ if (!(efer & EFER_NX) || do_not_nx)
+ __supported_pte_mask &= ~_PAGE_NX;
+}
+
+unsigned long kernel_eflags;
+
+/*
+ * Copies of the original ist values from the tss are only accessed during
+ * debugging, no special alignment required.
+ */
+DEFINE_PER_CPU(struct orig_ist, orig_ist);
+
+/*
+ * cpu_init() initializes state that is per-CPU. Some data is already
+ * initialized (naturally) in the bootstrap process, such as the GDT
+ * and IDT. We reload them nevertheless, this function acts as a
+ * 'CPU state barrier', nothing should get across.
+ * A lot of state is already set up in PDA init.
+ */
+void __cpuinit cpu_init(void)
+{
+ int cpu = stack_smp_processor_id();
+ struct tss_struct *t = &per_cpu(init_tss, cpu);
+ struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
+ unsigned long v;
+ char *estacks = NULL;
+ struct task_struct *me;
+ int i;
+
+ /* CPU 0 is initialised in head64.c */
+ if (cpu != 0)
+ pda_init(cpu);
+ else
+ estacks = boot_exception_stacks;
+
+ me = current;
+
+ if (cpu_test_and_set(cpu, cpu_initialized))
+ panic("CPU#%d already initialized!\n", cpu);
+
+ printk(KERN_INFO "Initializing CPU#%d\n", cpu);
+
+ clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+
+ /*
+ * Initialize the per-CPU GDT with the boot GDT,
+ * and set up the GDT descriptor:
+ */
+
+ switch_to_new_gdt();
+ load_idt((const struct desc_ptr *)&idt_descr);
+
+ memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
+ syscall_init();
+
+ wrmsrl(MSR_FS_BASE, 0);
+ wrmsrl(MSR_KERNEL_GS_BASE, 0);
+ barrier();
+
+ check_efer();
+
+ /*
+ * set up and load the per-CPU TSS
+ */
+ for (v = 0; v < N_EXCEPTION_STACKS; v++) {
+ static const unsigned int order[N_EXCEPTION_STACKS] = {
+ [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
+ [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
+ };
+ if (cpu) {
+ estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
+ if (!estacks)
+ panic("Cannot allocate exception stack %ld %d\n",
+ v, cpu);
+ }
+ estacks += PAGE_SIZE << order[v];
+ orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks;
+ }
+
+ t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+ /*
+ * <= is required because the CPU will access up to
+ * 8 bits beyond the end of the IO permission bitmap.
+ */
+ for (i = 0; i <= IO_BITMAP_LONGS; i++)
+ t->io_bitmap[i] = ~0UL;
+
+ atomic_inc(&init_mm.mm_count);
+ me->active_mm = &init_mm;
+ if (me->mm)
+ BUG();
+ enter_lazy_tlb(&init_mm, me);
+
+ load_sp0(t, &current->thread);
+ set_tss_desc(cpu, t);
+ load_TR_desc();
+ load_LDT(&init_mm.context);
+
+#ifdef CONFIG_KGDB
+ /*
+ * If the kgdb is connected no debug regs should be altered. This
+ * is only applicable when KGDB and a KGDB I/O module are built
+ * into the kernel and you are using early debugging with
+ * kgdbwait. KGDB will control the kernel HW breakpoint registers.
+ */
+ if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
+ arch_kgdb_ops.correct_hw_break();
+ else {
+#endif
+ /*
+ * Clear all 6 debug registers:
+ */
+
+ set_debugreg(0UL, 0);
+ set_debugreg(0UL, 1);
+ set_debugreg(0UL, 2);
+ set_debugreg(0UL, 3);
+ set_debugreg(0UL, 6);
+ set_debugreg(0UL, 7);
+#ifdef CONFIG_KGDB
+ /* If the kgdb is connected no debug regs should be altered. */
+ }
+#endif
+
+ fpu_init();
+
+ raw_local_save_flags(kernel_eflags);
+
+ if (is_uv_system())
+ uv_cpu_init();
+}
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 783691b2a73..4d894e8565f 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -1,3 +1,6 @@
+#ifndef ARCH_X86_CPU_H
+
+#define ARCH_X86_CPU_H
struct cpu_model_info {
int vendor;
@@ -36,3 +39,5 @@ extern struct cpu_vendor_dev __x86cpuvendor_start[], __x86cpuvendor_end[];
extern int get_model_name(struct cpuinfo_x86 *c);
extern void display_cacheinfo(struct cpuinfo_x86 *c);
+
+#endif
diff --git a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
index f03e9153618..965ea52767a 100644
--- a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
+++ b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
@@ -26,9 +26,10 @@
#define NFORCE2_SAFE_DISTANCE 50
/* Delay in ms between FSB changes */
-//#define NFORCE2_DELAY 10
+/* #define NFORCE2_DELAY 10 */
-/* nforce2_chipset:
+/*
+ * nforce2_chipset:
* FSB is changed using the chipset
*/
static struct pci_dev *nforce2_chipset_dev;
@@ -36,13 +37,13 @@ static struct pci_dev *nforce2_chipset_dev;
/* fid:
* multiplier * 10
*/
-static int fid = 0;
+static int fid;
/* min_fsb, max_fsb:
* minimum and maximum FSB (= FSB at boot time)
*/
-static int min_fsb = 0;
-static int max_fsb = 0;
+static int min_fsb;
+static int max_fsb;
MODULE_AUTHOR("Sebastian Witt <se.witt@gmx.net>");
MODULE_DESCRIPTION("nForce2 FSB changing cpufreq driver");
@@ -53,7 +54,7 @@ module_param(min_fsb, int, 0444);
MODULE_PARM_DESC(fid, "CPU multiplier to use (11.5 = 115)");
MODULE_PARM_DESC(min_fsb,
- "Minimum FSB to use, if not defined: current FSB - 50");
+ "Minimum FSB to use, if not defined: current FSB - 50");
#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "cpufreq-nforce2", msg)
@@ -139,7 +140,7 @@ static unsigned int nforce2_fsb_read(int bootfsb)
/* Get chipset boot FSB from subdevice 5 (FSB at boot-time) */
nforce2_sub5 = pci_get_subsys(PCI_VENDOR_ID_NVIDIA,
- 0x01EF,PCI_ANY_ID,PCI_ANY_ID,NULL);
+ 0x01EF, PCI_ANY_ID, PCI_ANY_ID, NULL);
if (!nforce2_sub5)
return 0;
@@ -147,13 +148,13 @@ static unsigned int nforce2_fsb_read(int bootfsb)
fsb /= 1000000;
/* Check if PLL register is already set */
- pci_read_config_byte(nforce2_chipset_dev,NFORCE2_PLLENABLE, (u8 *)&temp);
+ pci_read_config_byte(nforce2_chipset_dev, NFORCE2_PLLENABLE, (u8 *)&temp);
- if(bootfsb || !temp)
+ if (bootfsb || !temp)
return fsb;
-
+
/* Use PLL register FSB value */
- pci_read_config_dword(nforce2_chipset_dev,NFORCE2_PLLREG, &temp);
+ pci_read_config_dword(nforce2_chipset_dev, NFORCE2_PLLREG, &temp);
fsb = nforce2_calc_fsb(temp);
return fsb;
@@ -184,7 +185,7 @@ static int nforce2_set_fsb(unsigned int fsb)
}
/* First write? Then set actual value */
- pci_read_config_byte(nforce2_chipset_dev,NFORCE2_PLLENABLE, (u8 *)&temp);
+ pci_read_config_byte(nforce2_chipset_dev, NFORCE2_PLLENABLE, (u8 *)&temp);
if (!temp) {
pll = nforce2_calc_pll(tfsb);
@@ -210,7 +211,8 @@ static int nforce2_set_fsb(unsigned int fsb)
tfsb--;
/* Calculate the PLL reg. value */
- if ((pll = nforce2_calc_pll(tfsb)) == -1)
+ pll = nforce2_calc_pll(tfsb);
+ if (pll == -1)
return -EINVAL;
nforce2_write_pll(pll);
@@ -249,7 +251,7 @@ static unsigned int nforce2_get(unsigned int cpu)
static int nforce2_target(struct cpufreq_policy *policy,
unsigned int target_freq, unsigned int relation)
{
-// unsigned long flags;
+/* unsigned long flags; */
struct cpufreq_freqs freqs;
unsigned int target_fsb;
@@ -271,17 +273,17 @@ static int nforce2_target(struct cpufreq_policy *policy,
cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
/* Disable IRQs */
- //local_irq_save(flags);
+ /* local_irq_save(flags); */
if (nforce2_set_fsb(target_fsb) < 0)
printk(KERN_ERR "cpufreq: Changing FSB to %d failed\n",
- target_fsb);
+ target_fsb);
else
dprintk("Changed FSB successfully to %d\n",
- target_fsb);
+ target_fsb);
/* Enable IRQs */
- //local_irq_restore(flags);
+ /* local_irq_restore(flags); */
cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
@@ -302,8 +304,8 @@ static int nforce2_verify(struct cpufreq_policy *policy)
policy->max = (fsb_pol_max + 1) * fid * 100;
cpufreq_verify_within_limits(policy,
- policy->cpuinfo.min_freq,
- policy->cpuinfo.max_freq);
+ policy->cpuinfo.min_freq,
+ policy->cpuinfo.max_freq);
return 0;
}
@@ -347,7 +349,7 @@ static int nforce2_cpu_init(struct cpufreq_policy *policy)
/* Set maximum FSB to FSB at boot time */
max_fsb = nforce2_fsb_read(1);
- if(!max_fsb)
+ if (!max_fsb)
return -EIO;
if (!min_fsb)
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index fe9224c51d3..70609efdf1d 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -226,6 +226,10 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
if (cpu_has_bts)
ds_init_intel(c);
+
+#ifdef CONFIG_X86_NUMAQ
+ numaq_tsc_disable();
+#endif
}
static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
diff --git a/arch/x86/kernel/cpu/intel_64.c b/arch/x86/kernel/cpu/intel_64.c
new file mode 100644
index 00000000000..1019c58d39f
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_64.c
@@ -0,0 +1,95 @@
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <asm/processor.h>
+#include <asm/ptrace.h>
+#include <asm/topology.h>
+#include <asm/numa_64.h>
+
+#include "cpu.h"
+
+static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
+{
+ if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
+ (c->x86 == 0x6 && c->x86_model >= 0x0e))
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+
+ set_cpu_cap(c, X86_FEATURE_SYSENTER32);
+}
+
+/*
+ * find out the number of processor cores on the die
+ */
+static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
+{
+ unsigned int eax, t;
+
+ if (c->cpuid_level < 4)
+ return 1;
+
+ cpuid_count(4, 0, &eax, &t, &t, &t);
+
+ if (eax & 0x1f)
+ return ((eax >> 26) + 1);
+ else
+ return 1;
+}
+
+static void __cpuinit srat_detect_node(void)
+{
+#ifdef CONFIG_NUMA
+ unsigned node;
+ int cpu = smp_processor_id();
+ int apicid = hard_smp_processor_id();
+
+ /* Don't do the funky fallback heuristics the AMD version employs
+ for now. */
+ node = apicid_to_node[apicid];
+ if (node == NUMA_NO_NODE || !node_online(node))
+ node = first_node(node_online_map);
+ numa_set_node(cpu, node);
+
+ printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
+#endif
+}
+
+static void __cpuinit init_intel(struct cpuinfo_x86 *c)
+{
+ init_intel_cacheinfo(c);
+ if (c->cpuid_level > 9) {
+ unsigned eax = cpuid_eax(10);
+ /* Check for version and the number of counters */
+ if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
+ set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
+ }
+
+ if (cpu_has_ds) {
+ unsigned int l1, l2;
+ rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
+ if (!(l1 & (1<<11)))
+ set_cpu_cap(c, X86_FEATURE_BTS);
+ if (!(l1 & (1<<12)))
+ set_cpu_cap(c, X86_FEATURE_PEBS);
+ }
+
+
+ if (cpu_has_bts)
+ ds_init_intel(c);
+
+ if (c->x86 == 15)
+ c->x86_cache_alignment = c->x86_clflush_size * 2;
+ if (c->x86 == 6)
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+ c->x86_max_cores = intel_num_cpu_cores(c);
+
+ srat_detect_node();
+}
+
+static struct cpu_dev intel_cpu_dev __cpuinitdata = {
+ .c_vendor = "Intel",
+ .c_ident = { "GenuineIntel" },
+ .c_early_init = early_init_intel,
+ .c_init = init_intel,
+};
+cpu_vendor_dev_register(X86_VENDOR_INTEL, &intel_cpu_dev);
+
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 26d615dcb14..2c8afafa18e 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -62,6 +62,7 @@ static struct _cache_table cache_table[] __cpuinitdata =
{ 0x4b, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */
{ 0x4c, LVL_3, 12288 }, /* 12-way set assoc, 64 byte line size */
{ 0x4d, LVL_3, 16384 }, /* 16-way set assoc, 64 byte line size */
+ { 0x4e, LVL_2, 6144 }, /* 24-way set assoc, 64 byte line size */
{ 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */
{ 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */
{ 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */
diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c
index e633c9c2b76..f390c9f6635 100644
--- a/arch/x86/kernel/cpu/mcheck/k7.c
+++ b/arch/x86/kernel/cpu/mcheck/k7.c
@@ -9,23 +9,23 @@
#include <linux/interrupt.h>
#include <linux/smp.h>
-#include <asm/processor.h>
+#include <asm/processor.h>
#include <asm/system.h>
#include <asm/msr.h>
#include "mce.h"
/* Machine Check Handler For AMD Athlon/Duron */
-static void k7_machine_check(struct pt_regs * regs, long error_code)
+static void k7_machine_check(struct pt_regs *regs, long error_code)
{
- int recover=1;
+ int recover = 1;
u32 alow, ahigh, high, low;
u32 mcgstl, mcgsth;
int i;
- rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
+ rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
if (mcgstl & (1<<0)) /* Recoverable ? */
- recover=0;
+ recover = 0;
printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
smp_processor_id(), mcgsth, mcgstl);
@@ -60,12 +60,12 @@ static void k7_machine_check(struct pt_regs * regs, long error_code)
}
if (recover&2)
- panic ("CPU context corrupt");
+ panic("CPU context corrupt");
if (recover&1)
- panic ("Unable to continue");
- printk (KERN_EMERG "Attempting to continue.\n");
+ panic("Unable to continue");
+ printk(KERN_EMERG "Attempting to continue.\n");
mcgstl &= ~(1<<2);
- wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
+ wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
}
@@ -81,25 +81,25 @@ void amd_mcheck_init(struct cpuinfo_x86 *c)
machine_check_vector = k7_machine_check;
wmb();
- printk (KERN_INFO "Intel machine check architecture supported.\n");
- rdmsr (MSR_IA32_MCG_CAP, l, h);
+ printk(KERN_INFO "Intel machine check architecture supported.\n");
+ rdmsr(MSR_IA32_MCG_CAP, l, h);
if (l & (1<<8)) /* Control register present ? */
- wrmsr (MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
+ wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
nr_mce_banks = l & 0xff;
/* Clear status for MC index 0 separately, we don't touch CTL,
* as some K7 Athlons cause spurious MCEs when its enabled. */
if (boot_cpu_data.x86 == 6) {
- wrmsr (MSR_IA32_MC0_STATUS, 0x0, 0x0);
+ wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0);
i = 1;
} else
i = 0;
- for (; i<nr_mce_banks; i++) {
- wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
- wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
+ for (; i < nr_mce_banks; i++) {
+ wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
+ wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
}
- set_in_cr4 (X86_CR4_MCE);
- printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
+ set_in_cr4(X86_CR4_MCE);
+ printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
smp_processor_id());
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index e07e8c068ae..c4a7ec31394 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -9,6 +9,7 @@
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
+#include <linux/smp_lock.h>
#include <linux/string.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
@@ -31,7 +32,7 @@
#include <asm/idle.h>
#define MISC_MCELOG_MINOR 227
-#define NR_BANKS 6
+#define NR_SYSFS_BANKS 6
atomic_t mce_entry;
@@ -46,7 +47,7 @@ static int mce_dont_init;
*/
static int tolerant = 1;
static int banks;
-static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
+static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL };
static unsigned long notify_user;
static int rip_msr;
static int mce_bootlog = -1;
@@ -209,7 +210,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
barrier();
for (i = 0; i < banks; i++) {
- if (!bank[i])
+ if (i < NR_SYSFS_BANKS && !bank[i])
continue;
m.misc = 0;
@@ -363,7 +364,7 @@ static void mcheck_check_cpu(void *info)
static void mcheck_timer(struct work_struct *work)
{
- on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
+ on_each_cpu(mcheck_check_cpu, NULL, 1);
/*
* Alert userspace if needed. If we logged an MCE, reduce the
@@ -444,9 +445,10 @@ static void mce_init(void *dummy)
rdmsrl(MSR_IA32_MCG_CAP, cap);
banks = cap & 0xff;
- if (banks > NR_BANKS) {
- printk(KERN_INFO "MCE: warning: using only %d banks\n", banks);
- banks = NR_BANKS;
+ if (banks > MCE_EXTENDED_BANK) {
+ banks = MCE_EXTENDED_BANK;
+ printk(KERN_INFO "MCE: warning: using only %d banks\n",
+ MCE_EXTENDED_BANK);
}
/* Use accurate RIP reporting if available. */
if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
@@ -462,7 +464,11 @@ static void mce_init(void *dummy)
wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
for (i = 0; i < banks; i++) {
- wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
+ if (i < NR_SYSFS_BANKS)
+ wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
+ else
+ wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL);
+
wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
}
}
@@ -527,10 +533,12 @@ static int open_exclu; /* already open exclusive? */
static int mce_open(struct inode *inode, struct file *file)
{
+ lock_kernel();
spin_lock(&mce_state_lock);
if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
spin_unlock(&mce_state_lock);
+ unlock_kernel();
return -EBUSY;
}
@@ -539,6 +547,7 @@ static int mce_open(struct inode *inode, struct file *file)
open_count++;
spin_unlock(&mce_state_lock);
+ unlock_kernel();
return nonseekable_open(inode, file);
}
@@ -612,7 +621,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
* Collect entries that were still getting written before the
* synchronize.
*/
- on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
+ on_each_cpu(collect_tscs, cpu_tsc, 1);
for (i = next; i < MCE_LOG_LEN; i++) {
if (mcelog.entry[i].finished &&
mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
@@ -737,7 +746,7 @@ static void mce_restart(void)
if (next_interval)
cancel_delayed_work(&mcheck_work);
/* Timer race is harmless here */
- on_each_cpu(mce_init, NULL, 1, 1);
+ on_each_cpu(mce_init, NULL, 1);
next_interval = check_interval * HZ;
if (next_interval)
schedule_delayed_work(&mcheck_work,
@@ -766,7 +775,10 @@ DEFINE_PER_CPU(struct sys_device, device_mce);
} \
static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
-/* TBD should generate these dynamically based on number of available banks */
+/*
+ * TBD should generate these dynamically based on number of available banks.
+ * Have only 6 contol banks in /sysfs until then.
+ */
ACCESSOR(bank0ctl,bank[0],mce_restart())
ACCESSOR(bank1ctl,bank[1],mce_restart())
ACCESSOR(bank2ctl,bank[2],mce_restart())
diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c
index 00ccb6c14ec..cc1fccdd31e 100644
--- a/arch/x86/kernel/cpu/mcheck/non-fatal.c
+++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c
@@ -59,7 +59,7 @@ static DECLARE_DELAYED_WORK(mce_work, mce_work_fn);
static void mce_work_fn(struct work_struct *work)
{
- on_each_cpu(mce_checkregs, NULL, 1, 1);
+ on_each_cpu(mce_checkregs, NULL, 1);
schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE));
}
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
index cb03345554a..eef001ad3bd 100644
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ b/arch/x86/kernel/cpu/mcheck/p4.c
@@ -8,7 +8,7 @@
#include <linux/interrupt.h>
#include <linux/smp.h>
-#include <asm/processor.h>
+#include <asm/processor.h>
#include <asm/system.h>
#include <asm/msr.h>
#include <asm/apic.h>
@@ -32,12 +32,12 @@ struct intel_mce_extended_msrs {
/* u32 *reserved[]; */
};
-static int mce_num_extended_msrs = 0;
+static int mce_num_extended_msrs;
#ifdef CONFIG_X86_MCE_P4THERMAL
static void unexpected_thermal_interrupt(struct pt_regs *regs)
-{
+{
printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n",
smp_processor_id());
add_taint(TAINT_MACHINE_CHECK);
@@ -83,7 +83,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
* be some SMM goo which handles it, so we can't even put a handler
* since it might be delivered via SMI already -zwanem.
*/
- rdmsr (MSR_IA32_MISC_ENABLE, l, h);
+ rdmsr(MSR_IA32_MISC_ENABLE, l, h);
h = apic_read(APIC_LVTTHMR);
if ((l & (1<<3)) && (h & APIC_DM_SMI)) {
printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n",
@@ -91,7 +91,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
return; /* -EBUSY */
}
- /* check whether a vector already exists, temporarily masked? */
+ /* check whether a vector already exists, temporarily masked? */
if (h & APIC_VECTOR_MASK) {
printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already "
"installed\n",
@@ -104,18 +104,18 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */
apic_write_around(APIC_LVTTHMR, h);
- rdmsr (MSR_IA32_THERM_INTERRUPT, l, h);
- wrmsr (MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
+ rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
+ wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
/* ok we're good to go... */
vendor_thermal_interrupt = intel_thermal_interrupt;
-
- rdmsr (MSR_IA32_MISC_ENABLE, l, h);
- wrmsr (MSR_IA32_MISC_ENABLE, l | (1<<3), h);
- l = apic_read (APIC_LVTTHMR);
- apic_write_around (APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
- printk (KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu);
+ rdmsr(MSR_IA32_MISC_ENABLE, l, h);
+ wrmsr(MSR_IA32_MISC_ENABLE, l | (1<<3), h);
+
+ l = apic_read(APIC_LVTTHMR);
+ apic_write_around(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
+ printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu);
/* enable thermal throttle processing */
atomic_set(&therm_throt_en, 1);
@@ -129,28 +129,28 @@ static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
{
u32 h;
- rdmsr (MSR_IA32_MCG_EAX, r->eax, h);
- rdmsr (MSR_IA32_MCG_EBX, r->ebx, h);
- rdmsr (MSR_IA32_MCG_ECX, r->ecx, h);
- rdmsr (MSR_IA32_MCG_EDX, r->edx, h);
- rdmsr (MSR_IA32_MCG_ESI, r->esi, h);
- rdmsr (MSR_IA32_MCG_EDI, r->edi, h);
- rdmsr (MSR_IA32_MCG_EBP, r->ebp, h);
- rdmsr (MSR_IA32_MCG_ESP, r->esp, h);
- rdmsr (MSR_IA32_MCG_EFLAGS, r->eflags, h);
- rdmsr (MSR_IA32_MCG_EIP, r->eip, h);
+ rdmsr(MSR_IA32_MCG_EAX, r->eax, h);
+ rdmsr(MSR_IA32_MCG_EBX, r->ebx, h);
+ rdmsr(MSR_IA32_MCG_ECX, r->ecx, h);
+ rdmsr(MSR_IA32_MCG_EDX, r->edx, h);
+ rdmsr(MSR_IA32_MCG_ESI, r->esi, h);
+ rdmsr(MSR_IA32_MCG_EDI, r->edi, h);
+ rdmsr(MSR_IA32_MCG_EBP, r->ebp, h);
+ rdmsr(MSR_IA32_MCG_ESP, r->esp, h);
+ rdmsr(MSR_IA32_MCG_EFLAGS, r->eflags, h);
+ rdmsr(MSR_IA32_MCG_EIP, r->eip, h);
}
-static void intel_machine_check(struct pt_regs * regs, long error_code)
+static void intel_machine_check(struct pt_regs *regs, long error_code)
{
- int recover=1;
+ int recover = 1;
u32 alow, ahigh, high, low;
u32 mcgstl, mcgsth;
int i;
- rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
+ rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
if (mcgstl & (1<<0)) /* Recoverable ? */
- recover=0;
+ recover = 0;
printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
smp_processor_id(), mcgsth, mcgstl);
@@ -191,20 +191,20 @@ static void intel_machine_check(struct pt_regs * regs, long error_code)
}
if (recover & 2)
- panic ("CPU context corrupt");
+ panic("CPU context corrupt");
if (recover & 1)
- panic ("Unable to continue");
+ panic("Unable to continue");
printk(KERN_EMERG "Attempting to continue.\n");
- /*
- * Do not clear the MSR_IA32_MCi_STATUS if the error is not
+ /*
+ * Do not clear the MSR_IA32_MCi_STATUS if the error is not
* recoverable/continuable.This will allow BIOS to look at the MSRs
* for errors if the OS could not log the error.
*/
- for (i=0; i<nr_mce_banks; i++) {
+ for (i = 0; i < nr_mce_banks; i++) {
u32 msr;
msr = MSR_IA32_MC0_STATUS+i*4;
- rdmsr (msr, low, high);
+ rdmsr(msr, low, high);
if (high&(1<<31)) {
/* Clear it */
wrmsr(msr, 0UL, 0UL);
@@ -214,7 +214,7 @@ static void intel_machine_check(struct pt_regs * regs, long error_code)
}
}
mcgstl &= ~(1<<2);
- wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
+ wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
}
@@ -222,30 +222,30 @@ void intel_p4_mcheck_init(struct cpuinfo_x86 *c)
{
u32 l, h;
int i;
-
+
machine_check_vector = intel_machine_check;
wmb();
- printk (KERN_INFO "Intel machine check architecture supported.\n");
- rdmsr (MSR_IA32_MCG_CAP, l, h);
+ printk(KERN_INFO "Intel machine check architecture supported.\n");
+ rdmsr(MSR_IA32_MCG_CAP, l, h);
if (l & (1<<8)) /* Control register present ? */
- wrmsr (MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
+ wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
nr_mce_banks = l & 0xff;
- for (i=0; i<nr_mce_banks; i++) {
- wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
- wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
+ for (i = 0; i < nr_mce_banks; i++) {
+ wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
+ wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
}
- set_in_cr4 (X86_CR4_MCE);
- printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
+ set_in_cr4(X86_CR4_MCE);
+ printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
smp_processor_id());
/* Check for P4/Xeon extended MCE MSRs */
- rdmsr (MSR_IA32_MCG_CAP, l, h);
+ rdmsr(MSR_IA32_MCG_CAP, l, h);
if (l & (1<<9)) {/* MCG_EXT_P */
mce_num_extended_msrs = (l >> 16) & 0xff;
- printk (KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)"
+ printk(KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)"
" available\n",
smp_processor_id(), mce_num_extended_msrs);
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 5d241ce94a4..509bd3d9eac 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -37,7 +37,7 @@ static struct fixed_range_block fixed_range_blocks[] = {
static unsigned long smp_changes_mask;
static struct mtrr_state mtrr_state = {};
static int mtrr_state_set;
-static u64 tom2;
+u64 mtrr_tom2;
#undef MODULE_PARAM_PREFIX
#define MODULE_PARAM_PREFIX "mtrr."
@@ -139,8 +139,8 @@ u8 mtrr_type_lookup(u64 start, u64 end)
}
}
- if (tom2) {
- if (start >= (1ULL<<32) && (end < tom2))
+ if (mtrr_tom2) {
+ if (start >= (1ULL<<32) && (end < mtrr_tom2))
return MTRR_TYPE_WRBACK;
}
@@ -158,6 +158,20 @@ get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi);
}
+/* fill the MSR pair relating to a var range */
+void fill_mtrr_var_range(unsigned int index,
+ u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi)
+{
+ struct mtrr_var_range *vr;
+
+ vr = mtrr_state.var_ranges;
+
+ vr[index].base_lo = base_lo;
+ vr[index].base_hi = base_hi;
+ vr[index].mask_lo = mask_lo;
+ vr[index].mask_hi = mask_hi;
+}
+
static void
get_fixed_ranges(mtrr_type * frs)
{
@@ -213,13 +227,13 @@ void __init get_mtrr_state(void)
mtrr_state.enabled = (lo & 0xc00) >> 10;
if (amd_special_default_mtrr()) {
- unsigned lo, hi;
+ unsigned low, high;
/* TOP_MEM2 */
- rdmsr(MSR_K8_TOP_MEM2, lo, hi);
- tom2 = hi;
- tom2 <<= 32;
- tom2 |= lo;
- tom2 &= 0xffffff8000000ULL;
+ rdmsr(MSR_K8_TOP_MEM2, low, high);
+ mtrr_tom2 = high;
+ mtrr_tom2 <<= 32;
+ mtrr_tom2 |= low;
+ mtrr_tom2 &= 0xffffff800000ULL;
}
if (mtrr_show) {
int high_width;
@@ -251,9 +265,9 @@ void __init get_mtrr_state(void)
else
printk(KERN_INFO "MTRR %u disabled\n", i);
}
- if (tom2) {
+ if (mtrr_tom2) {
printk(KERN_INFO "TOM2: %016llx aka %lldM\n",
- tom2, tom2>>20);
+ mtrr_tom2, mtrr_tom2>>20);
}
}
mtrr_state_set = 1;
@@ -328,7 +342,7 @@ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
if (lo != msrwords[0] || hi != msrwords[1]) {
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
- boot_cpu_data.x86 == 15 &&
+ (boot_cpu_data.x86 >= 0x0f && boot_cpu_data.x86 <= 0x11) &&
((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK))
k8_enable_fixed_iorrs();
mtrr_wrmsr(msr, msrwords[0], msrwords[1]);
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 6a1e278d932..6f23969c8fa 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -37,6 +37,7 @@
#include <linux/smp.h>
#include <linux/cpu.h>
#include <linux/mutex.h>
+#include <linux/sort.h>
#include <asm/e820.h>
#include <asm/mtrr.h>
@@ -222,7 +223,7 @@ static void set_mtrr(unsigned int reg, unsigned long base,
atomic_set(&data.gate,0);
/* Start the ball rolling on other CPUs */
- if (smp_call_function(ipi_handler, &data, 1, 0) != 0)
+ if (smp_call_function(ipi_handler, &data, 0) != 0)
panic("mtrr: timed out waiting for other CPUs\n");
local_irq_save(flags);
@@ -609,6 +610,787 @@ static struct sysdev_driver mtrr_sysdev_driver = {
.resume = mtrr_restore,
};
+/* should be related to MTRR_VAR_RANGES nums */
+#define RANGE_NUM 256
+
+struct res_range {
+ unsigned long start;
+ unsigned long end;
+};
+
+static int __init
+add_range(struct res_range *range, int nr_range, unsigned long start,
+ unsigned long end)
+{
+ /* out of slots */
+ if (nr_range >= RANGE_NUM)
+ return nr_range;
+
+ range[nr_range].start = start;
+ range[nr_range].end = end;
+
+ nr_range++;
+
+ return nr_range;
+}
+
+static int __init
+add_range_with_merge(struct res_range *range, int nr_range, unsigned long start,
+ unsigned long end)
+{
+ int i;
+
+ /* try to merge it with old one */
+ for (i = 0; i < nr_range; i++) {
+ unsigned long final_start, final_end;
+ unsigned long common_start, common_end;
+
+ if (!range[i].end)
+ continue;
+
+ common_start = max(range[i].start, start);
+ common_end = min(range[i].end, end);
+ if (common_start > common_end + 1)
+ continue;
+
+ final_start = min(range[i].start, start);
+ final_end = max(range[i].end, end);
+
+ range[i].start = final_start;
+ range[i].end = final_end;
+ return nr_range;
+ }
+
+ /* need to add that */
+ return add_range(range, nr_range, start, end);
+}
+
+static void __init
+subtract_range(struct res_range *range, unsigned long start, unsigned long end)
+{
+ int i, j;
+
+ for (j = 0; j < RANGE_NUM; j++) {
+ if (!range[j].end)
+ continue;
+
+ if (start <= range[j].start && end >= range[j].end) {
+ range[j].start = 0;
+ range[j].end = 0;
+ continue;
+ }
+
+ if (start <= range[j].start && end < range[j].end &&
+ range[j].start < end + 1) {
+ range[j].start = end + 1;
+ continue;
+ }
+
+
+ if (start > range[j].start && end >= range[j].end &&
+ range[j].end > start - 1) {
+ range[j].end = start - 1;
+ continue;
+ }
+
+ if (start > range[j].start && end < range[j].end) {
+ /* find the new spare */
+ for (i = 0; i < RANGE_NUM; i++) {
+ if (range[i].end == 0)
+ break;
+ }
+ if (i < RANGE_NUM) {
+ range[i].end = range[j].end;
+ range[i].start = end + 1;
+ } else {
+ printk(KERN_ERR "run of slot in ranges\n");
+ }
+ range[j].end = start - 1;
+ continue;
+ }
+ }
+}
+
+static int __init cmp_range(const void *x1, const void *x2)
+{
+ const struct res_range *r1 = x1;
+ const struct res_range *r2 = x2;
+ long start1, start2;
+
+ start1 = r1->start;
+ start2 = r2->start;
+
+ return start1 - start2;
+}
+
+struct var_mtrr_range_state {
+ unsigned long base_pfn;
+ unsigned long size_pfn;
+ mtrr_type type;
+};
+
+struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
+static int __initdata debug_print;
+
+static int __init
+x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
+ unsigned long extra_remove_base,
+ unsigned long extra_remove_size)
+{
+ unsigned long i, base, size;
+ mtrr_type type;
+
+ for (i = 0; i < num_var_ranges; i++) {
+ type = range_state[i].type;
+ if (type != MTRR_TYPE_WRBACK)
+ continue;
+ base = range_state[i].base_pfn;
+ size = range_state[i].size_pfn;
+ nr_range = add_range_with_merge(range, nr_range, base,
+ base + size - 1);
+ }
+ if (debug_print) {
+ printk(KERN_DEBUG "After WB checking\n");
+ for (i = 0; i < nr_range; i++)
+ printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
+ range[i].start, range[i].end + 1);
+ }
+
+ /* take out UC ranges */
+ for (i = 0; i < num_var_ranges; i++) {
+ type = range_state[i].type;
+ if (type != MTRR_TYPE_UNCACHABLE)
+ continue;
+ size = range_state[i].size_pfn;
+ if (!size)
+ continue;
+ base = range_state[i].base_pfn;
+ subtract_range(range, base, base + size - 1);
+ }
+ if (extra_remove_size)
+ subtract_range(range, extra_remove_base,
+ extra_remove_base + extra_remove_size - 1);
+
+ /* get new range num */
+ nr_range = 0;
+ for (i = 0; i < RANGE_NUM; i++) {
+ if (!range[i].end)
+ continue;
+ nr_range++;
+ }
+ if (debug_print) {
+ printk(KERN_DEBUG "After UC checking\n");
+ for (i = 0; i < nr_range; i++)
+ printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
+ range[i].start, range[i].end + 1);
+ }
+
+ /* sort the ranges */
+ sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
+ if (debug_print) {
+ printk(KERN_DEBUG "After sorting\n");
+ for (i = 0; i < nr_range; i++)
+ printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
+ range[i].start, range[i].end + 1);
+ }
+
+ /* clear those is not used */
+ for (i = nr_range; i < RANGE_NUM; i++)
+ memset(&range[i], 0, sizeof(range[i]));
+
+ return nr_range;
+}
+
+static struct res_range __initdata range[RANGE_NUM];
+
+#ifdef CONFIG_MTRR_SANITIZER
+
+static unsigned long __init sum_ranges(struct res_range *range, int nr_range)
+{
+ unsigned long sum;
+ int i;
+
+ sum = 0;
+ for (i = 0; i < nr_range; i++)
+ sum += range[i].end + 1 - range[i].start;
+
+ return sum;
+}
+
+static int enable_mtrr_cleanup __initdata =
+ CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT;
+
+static int __init disable_mtrr_cleanup_setup(char *str)
+{
+ if (enable_mtrr_cleanup != -1)
+ enable_mtrr_cleanup = 0;
+ return 0;
+}
+early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup);
+
+static int __init enable_mtrr_cleanup_setup(char *str)
+{
+ if (enable_mtrr_cleanup != -1)
+ enable_mtrr_cleanup = 1;
+ return 0;
+}
+early_param("enble_mtrr_cleanup", enable_mtrr_cleanup_setup);
+
+struct var_mtrr_state {
+ unsigned long range_startk;
+ unsigned long range_sizek;
+ unsigned long chunk_sizek;
+ unsigned long gran_sizek;
+ unsigned int reg;
+};
+
+static void __init
+set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
+ unsigned char type, unsigned int address_bits)
+{
+ u32 base_lo, base_hi, mask_lo, mask_hi;
+ u64 base, mask;
+
+ if (!sizek) {
+ fill_mtrr_var_range(reg, 0, 0, 0, 0);
+ return;
+ }
+
+ mask = (1ULL << address_bits) - 1;
+ mask &= ~((((u64)sizek) << 10) - 1);
+
+ base = ((u64)basek) << 10;
+
+ base |= type;
+ mask |= 0x800;
+
+ base_lo = base & ((1ULL<<32) - 1);
+ base_hi = base >> 32;
+
+ mask_lo = mask & ((1ULL<<32) - 1);
+ mask_hi = mask >> 32;
+
+ fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi);
+}
+
+static void __init
+save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
+ unsigned char type)
+{
+ range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10);
+ range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10);
+ range_state[reg].type = type;
+}
+
+static void __init
+set_var_mtrr_all(unsigned int address_bits)
+{
+ unsigned long basek, sizek;
+ unsigned char type;
+ unsigned int reg;
+
+ for (reg = 0; reg < num_var_ranges; reg++) {
+ basek = range_state[reg].base_pfn << (PAGE_SHIFT - 10);
+ sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10);
+ type = range_state[reg].type;
+
+ set_var_mtrr(reg, basek, sizek, type, address_bits);
+ }
+}
+
+static unsigned int __init
+range_to_mtrr(unsigned int reg, unsigned long range_startk,
+ unsigned long range_sizek, unsigned char type)
+{
+ if (!range_sizek || (reg >= num_var_ranges))
+ return reg;
+
+ while (range_sizek) {
+ unsigned long max_align, align;
+ unsigned long sizek;
+
+ /* Compute the maximum size I can make a range */
+ if (range_startk)
+ max_align = ffs(range_startk) - 1;
+ else
+ max_align = 32;
+ align = fls(range_sizek) - 1;
+ if (align > max_align)
+ align = max_align;
+
+ sizek = 1 << align;
+ if (debug_print)
+ printk(KERN_DEBUG "Setting variable MTRR %d, "
+ "base: %ldMB, range: %ldMB, type %s\n",
+ reg, range_startk >> 10, sizek >> 10,
+ (type == MTRR_TYPE_UNCACHABLE)?"UC":
+ ((type == MTRR_TYPE_WRBACK)?"WB":"Other")
+ );
+ save_var_mtrr(reg++, range_startk, sizek, type);
+ range_startk += sizek;
+ range_sizek -= sizek;
+ if (reg >= num_var_ranges)
+ break;
+ }
+ return reg;
+}
+
+static unsigned __init
+range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
+ unsigned long sizek)
+{
+ unsigned long hole_basek, hole_sizek;
+ unsigned long second_basek, second_sizek;
+ unsigned long range0_basek, range0_sizek;
+ unsigned long range_basek, range_sizek;
+ unsigned long chunk_sizek;
+ unsigned long gran_sizek;
+
+ hole_basek = 0;
+ hole_sizek = 0;
+ second_basek = 0;
+ second_sizek = 0;
+ chunk_sizek = state->chunk_sizek;
+ gran_sizek = state->gran_sizek;
+
+ /* align with gran size, prevent small block used up MTRRs */
+ range_basek = ALIGN(state->range_startk, gran_sizek);
+ if ((range_basek > basek) && basek)
+ return second_sizek;
+ state->range_sizek -= (range_basek - state->range_startk);
+ range_sizek = ALIGN(state->range_sizek, gran_sizek);
+
+ while (range_sizek > state->range_sizek) {
+ range_sizek -= gran_sizek;
+ if (!range_sizek)
+ return 0;
+ }
+ state->range_sizek = range_sizek;
+
+ /* try to append some small hole */
+ range0_basek = state->range_startk;
+ range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
+ if (range0_sizek == state->range_sizek) {
+ if (debug_print)
+ printk(KERN_DEBUG "rangeX: %016lx - %016lx\n",
+ range0_basek<<10,
+ (range0_basek + state->range_sizek)<<10);
+ state->reg = range_to_mtrr(state->reg, range0_basek,
+ state->range_sizek, MTRR_TYPE_WRBACK);
+ return 0;
+ }
+
+ range0_sizek -= chunk_sizek;
+ if (range0_sizek && sizek) {
+ while (range0_basek + range0_sizek > (basek + sizek)) {
+ range0_sizek -= chunk_sizek;
+ if (!range0_sizek)
+ break;
+ }
+ }
+
+ if (range0_sizek) {
+ if (debug_print)
+ printk(KERN_DEBUG "range0: %016lx - %016lx\n",
+ range0_basek<<10,
+ (range0_basek + range0_sizek)<<10);
+ state->reg = range_to_mtrr(state->reg, range0_basek,
+ range0_sizek, MTRR_TYPE_WRBACK);
+
+ }
+
+ range_basek = range0_basek + range0_sizek;
+ range_sizek = chunk_sizek;
+
+ if (range_basek + range_sizek > basek &&
+ range_basek + range_sizek <= (basek + sizek)) {
+ /* one hole */
+ second_basek = basek;
+ second_sizek = range_basek + range_sizek - basek;
+ }
+
+ /* if last piece, only could one hole near end */
+ if ((second_basek || !basek) &&
+ range_sizek - (state->range_sizek - range0_sizek) - second_sizek <
+ (chunk_sizek >> 1)) {
+ /*
+ * one hole in middle (second_sizek is 0) or at end
+ * (second_sizek is 0 )
+ */
+ hole_sizek = range_sizek - (state->range_sizek - range0_sizek)
+ - second_sizek;
+ hole_basek = range_basek + range_sizek - hole_sizek
+ - second_sizek;
+ } else {
+ /* fallback for big hole, or several holes */
+ range_sizek = state->range_sizek - range0_sizek;
+ second_basek = 0;
+ second_sizek = 0;
+ }
+
+ if (debug_print)
+ printk(KERN_DEBUG "range: %016lx - %016lx\n", range_basek<<10,
+ (range_basek + range_sizek)<<10);
+ state->reg = range_to_mtrr(state->reg, range_basek, range_sizek,
+ MTRR_TYPE_WRBACK);
+ if (hole_sizek) {
+ if (debug_print)
+ printk(KERN_DEBUG "hole: %016lx - %016lx\n",
+ hole_basek<<10, (hole_basek + hole_sizek)<<10);
+ state->reg = range_to_mtrr(state->reg, hole_basek, hole_sizek,
+ MTRR_TYPE_UNCACHABLE);
+
+ }
+
+ return second_sizek;
+}
+
+static void __init
+set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn,
+ unsigned long size_pfn)
+{
+ unsigned long basek, sizek;
+ unsigned long second_sizek = 0;
+
+ if (state->reg >= num_var_ranges)
+ return;
+
+ basek = base_pfn << (PAGE_SHIFT - 10);
+ sizek = size_pfn << (PAGE_SHIFT - 10);
+
+ /* See if I can merge with the last range */
+ if ((basek <= 1024) ||
+ (state->range_startk + state->range_sizek == basek)) {
+ unsigned long endk = basek + sizek;
+ state->range_sizek = endk - state->range_startk;
+ return;
+ }
+ /* Write the range mtrrs */
+ if (state->range_sizek != 0)
+ second_sizek = range_to_mtrr_with_hole(state, basek, sizek);
+
+ /* Allocate an msr */
+ state->range_startk = basek + second_sizek;
+ state->range_sizek = sizek - second_sizek;
+}
+
+/* mininum size of mtrr block that can take hole */
+static u64 mtrr_chunk_size __initdata = (256ULL<<20);
+
+static int __init parse_mtrr_chunk_size_opt(char *p)
+{
+ if (!p)
+ return -EINVAL;
+ mtrr_chunk_size = memparse(p, &p);
+ return 0;
+}
+early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt);
+
+/* granity of mtrr of block */
+static u64 mtrr_gran_size __initdata;
+
+static int __init parse_mtrr_gran_size_opt(char *p)
+{
+ if (!p)
+ return -EINVAL;
+ mtrr_gran_size = memparse(p, &p);
+ return 0;
+}
+early_param("mtrr_gran_size", parse_mtrr_gran_size_opt);
+
+static int nr_mtrr_spare_reg __initdata =
+ CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT;
+
+static int __init parse_mtrr_spare_reg(char *arg)
+{
+ if (arg)
+ nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0);
+ return 0;
+}
+
+early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
+
+static int __init
+x86_setup_var_mtrrs(struct res_range *range, int nr_range,
+ u64 chunk_size, u64 gran_size)
+{
+ struct var_mtrr_state var_state;
+ int i;
+ int num_reg;
+
+ var_state.range_startk = 0;
+ var_state.range_sizek = 0;
+ var_state.reg = 0;
+ var_state.chunk_sizek = chunk_size >> 10;
+ var_state.gran_sizek = gran_size >> 10;
+
+ memset(range_state, 0, sizeof(range_state));
+
+ /* Write the range etc */
+ for (i = 0; i < nr_range; i++)
+ set_var_mtrr_range(&var_state, range[i].start,
+ range[i].end - range[i].start + 1);
+
+ /* Write the last range */
+ if (var_state.range_sizek != 0)
+ range_to_mtrr_with_hole(&var_state, 0, 0);
+
+ num_reg = var_state.reg;
+ /* Clear out the extra MTRR's */
+ while (var_state.reg < num_var_ranges) {
+ save_var_mtrr(var_state.reg, 0, 0, 0);
+ var_state.reg++;
+ }
+
+ return num_reg;
+}
+
+struct mtrr_cleanup_result {
+ unsigned long gran_sizek;
+ unsigned long chunk_sizek;
+ unsigned long lose_cover_sizek;
+ unsigned int num_reg;
+ int bad;
+};
+
+/*
+ * gran_size: 1M, 2M, ..., 2G
+ * chunk size: gran_size, ..., 4G
+ * so we need (2+13)*6
+ */
+#define NUM_RESULT 90
+#define PSHIFT (PAGE_SHIFT - 10)
+
+static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
+static struct res_range __initdata range_new[RANGE_NUM];
+static unsigned long __initdata min_loss_pfn[RANGE_NUM];
+
+static int __init mtrr_cleanup(unsigned address_bits)
+{
+ unsigned long extra_remove_base, extra_remove_size;
+ unsigned long i, base, size, def, dummy;
+ mtrr_type type;
+ int nr_range, nr_range_new;
+ u64 chunk_size, gran_size;
+ unsigned long range_sums, range_sums_new;
+ int index_good;
+ int num_reg_good;
+
+ /* extra one for all 0 */
+ int num[MTRR_NUM_TYPES + 1];
+
+ if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
+ return 0;
+ rdmsr(MTRRdefType_MSR, def, dummy);
+ def &= 0xff;
+ if (def != MTRR_TYPE_UNCACHABLE)
+ return 0;
+
+ /* get it and store it aside */
+ memset(range_state, 0, sizeof(range_state));
+ for (i = 0; i < num_var_ranges; i++) {
+ mtrr_if->get(i, &base, &size, &type);
+ range_state[i].base_pfn = base;
+ range_state[i].size_pfn = size;
+ range_state[i].type = type;
+ }
+
+ /* check entries number */
+ memset(num, 0, sizeof(num));
+ for (i = 0; i < num_var_ranges; i++) {
+ type = range_state[i].type;
+ size = range_state[i].size_pfn;
+ if (type >= MTRR_NUM_TYPES)
+ continue;
+ if (!size)
+ type = MTRR_NUM_TYPES;
+ num[type]++;
+ }
+
+ /* check if we got UC entries */
+ if (!num[MTRR_TYPE_UNCACHABLE])
+ return 0;
+
+ /* check if we only had WB and UC */
+ if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
+ num_var_ranges - num[MTRR_NUM_TYPES])
+ return 0;
+
+ memset(range, 0, sizeof(range));
+ extra_remove_size = 0;
+ if (mtrr_tom2) {
+ extra_remove_base = 1 << (32 - PAGE_SHIFT);
+ extra_remove_size =
+ (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base;
+ }
+ nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base,
+ extra_remove_size);
+ range_sums = sum_ranges(range, nr_range);
+ printk(KERN_INFO "total RAM coverred: %ldM\n",
+ range_sums >> (20 - PAGE_SHIFT));
+
+ if (mtrr_chunk_size && mtrr_gran_size) {
+ int num_reg;
+
+ debug_print = 1;
+ /* convert ranges to var ranges state */
+ num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size,
+ mtrr_gran_size);
+
+ /* we got new setting in range_state, check it */
+ memset(range_new, 0, sizeof(range_new));
+ nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
+ extra_remove_base,
+ extra_remove_size);
+ range_sums_new = sum_ranges(range_new, nr_range_new);
+
+ i = 0;
+ result[i].chunk_sizek = mtrr_chunk_size >> 10;
+ result[i].gran_sizek = mtrr_gran_size >> 10;
+ result[i].num_reg = num_reg;
+ if (range_sums < range_sums_new) {
+ result[i].lose_cover_sizek =
+ (range_sums_new - range_sums) << PSHIFT;
+ result[i].bad = 1;
+ } else
+ result[i].lose_cover_sizek =
+ (range_sums - range_sums_new) << PSHIFT;
+
+ printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t",
+ result[i].bad?"*BAD*":" ", result[i].gran_sizek >> 10,
+ result[i].chunk_sizek >> 10);
+ printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ldM \n",
+ result[i].num_reg, result[i].bad?"-":"",
+ result[i].lose_cover_sizek >> 10);
+ if (!result[i].bad) {
+ set_var_mtrr_all(address_bits);
+ return 1;
+ }
+ printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
+ "will find optimal one\n");
+ debug_print = 0;
+ memset(result, 0, sizeof(result[0]));
+ }
+
+ i = 0;
+ memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
+ memset(result, 0, sizeof(result));
+ for (gran_size = (1ULL<<20); gran_size < (1ULL<<32); gran_size <<= 1) {
+ for (chunk_size = gran_size; chunk_size < (1ULL<<33);
+ chunk_size <<= 1) {
+ int num_reg;
+
+ if (debug_print)
+ printk(KERN_INFO
+ "\ngran_size: %lldM chunk_size_size: %lldM\n",
+ gran_size >> 20, chunk_size >> 20);
+ if (i >= NUM_RESULT)
+ continue;
+
+ /* convert ranges to var ranges state */
+ num_reg = x86_setup_var_mtrrs(range, nr_range,
+ chunk_size, gran_size);
+
+ /* we got new setting in range_state, check it */
+ memset(range_new, 0, sizeof(range_new));
+ nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
+ extra_remove_base, extra_remove_size);
+ range_sums_new = sum_ranges(range_new, nr_range_new);
+
+ result[i].chunk_sizek = chunk_size >> 10;
+ result[i].gran_sizek = gran_size >> 10;
+ result[i].num_reg = num_reg;
+ if (range_sums < range_sums_new) {
+ result[i].lose_cover_sizek =
+ (range_sums_new - range_sums) << PSHIFT;
+ result[i].bad = 1;
+ } else
+ result[i].lose_cover_sizek =
+ (range_sums - range_sums_new) << PSHIFT;
+
+ /* double check it */
+ if (!result[i].bad && !result[i].lose_cover_sizek) {
+ if (nr_range_new != nr_range ||
+ memcmp(range, range_new, sizeof(range)))
+ result[i].bad = 1;
+ }
+
+ if (!result[i].bad && (range_sums - range_sums_new <
+ min_loss_pfn[num_reg])) {
+ min_loss_pfn[num_reg] =
+ range_sums - range_sums_new;
+ }
+ i++;
+ }
+ }
+
+ /* print out all */
+ for (i = 0; i < NUM_RESULT; i++) {
+ printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t",
+ result[i].bad?"*BAD* ":" ", result[i].gran_sizek >> 10,
+ result[i].chunk_sizek >> 10);
+ printk(KERN_CONT "num_reg: %d \tlose RAM: %s%ldM\n",
+ result[i].num_reg, result[i].bad?"-":"",
+ result[i].lose_cover_sizek >> 10);
+ }
+
+ /* try to find the optimal index */
+ if (nr_mtrr_spare_reg >= num_var_ranges)
+ nr_mtrr_spare_reg = num_var_ranges - 1;
+ num_reg_good = -1;
+ for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
+ if (!min_loss_pfn[i]) {
+ num_reg_good = i;
+ break;
+ }
+ }
+
+ index_good = -1;
+ if (num_reg_good != -1) {
+ for (i = 0; i < NUM_RESULT; i++) {
+ if (!result[i].bad &&
+ result[i].num_reg == num_reg_good &&
+ !result[i].lose_cover_sizek) {
+ index_good = i;
+ break;
+ }
+ }
+ }
+
+ if (index_good != -1) {
+ printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
+ i = index_good;
+ printk(KERN_INFO "gran_size: %ldM \tchunk_size: %ldM \t",
+ result[i].gran_sizek >> 10,
+ result[i].chunk_sizek >> 10);
+ printk(KERN_CONT "num_reg: %d \tlose RAM: %ldM\n",
+ result[i].num_reg,
+ result[i].lose_cover_sizek >> 10);
+ /* convert ranges to var ranges state */
+ chunk_size = result[i].chunk_sizek;
+ chunk_size <<= 10;
+ gran_size = result[i].gran_sizek;
+ gran_size <<= 10;
+ debug_print = 1;
+ x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
+ set_var_mtrr_all(address_bits);
+ return 1;
+ }
+
+ printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n");
+ printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n");
+
+ return 0;
+}
+#else
+static int __init mtrr_cleanup(unsigned address_bits)
+{
+ return 0;
+}
+#endif
+
+static int __initdata changed_by_mtrr_cleanup;
+
static int disable_mtrr_trim;
static int __init disable_mtrr_trim_setup(char *str)
@@ -648,6 +1430,19 @@ int __init amd_special_default_mtrr(void)
return 0;
}
+static u64 __init real_trim_memory(unsigned long start_pfn,
+ unsigned long limit_pfn)
+{
+ u64 trim_start, trim_size;
+ trim_start = start_pfn;
+ trim_start <<= PAGE_SHIFT;
+ trim_size = limit_pfn;
+ trim_size <<= PAGE_SHIFT;
+ trim_size -= trim_start;
+
+ return e820_update_range(trim_start, trim_size, E820_RAM,
+ E820_RESERVED);
+}
/**
* mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
* @end_pfn: ending page frame number
@@ -663,8 +1458,11 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
{
unsigned long i, base, size, highest_pfn = 0, def, dummy;
mtrr_type type;
- u64 trim_start, trim_size;
+ int nr_range;
+ u64 total_trim_size;
+ /* extra one for all 0 */
+ int num[MTRR_NUM_TYPES + 1];
/*
* Make sure we only trim uncachable memory on machines that
* support the Intel MTRR architecture:
@@ -676,14 +1474,22 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
if (def != MTRR_TYPE_UNCACHABLE)
return 0;
- if (amd_special_default_mtrr())
- return 0;
+ /* get it and store it aside */
+ memset(range_state, 0, sizeof(range_state));
+ for (i = 0; i < num_var_ranges; i++) {
+ mtrr_if->get(i, &base, &size, &type);
+ range_state[i].base_pfn = base;
+ range_state[i].size_pfn = size;
+ range_state[i].type = type;
+ }
/* Find highest cached pfn */
for (i = 0; i < num_var_ranges; i++) {
- mtrr_if->get(i, &base, &size, &type);
+ type = range_state[i].type;
if (type != MTRR_TYPE_WRBACK)
continue;
+ base = range_state[i].base_pfn;
+ size = range_state[i].size_pfn;
if (highest_pfn < base + size)
highest_pfn = base + size;
}
@@ -698,22 +1504,65 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
return 0;
}
- if (highest_pfn < end_pfn) {
+ /* check entries number */
+ memset(num, 0, sizeof(num));
+ for (i = 0; i < num_var_ranges; i++) {
+ type = range_state[i].type;
+ if (type >= MTRR_NUM_TYPES)
+ continue;
+ size = range_state[i].size_pfn;
+ if (!size)
+ type = MTRR_NUM_TYPES;
+ num[type]++;
+ }
+
+ /* no entry for WB? */
+ if (!num[MTRR_TYPE_WRBACK])
+ return 0;
+
+ /* check if we only had WB and UC */
+ if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
+ num_var_ranges - num[MTRR_NUM_TYPES])
+ return 0;
+
+ memset(range, 0, sizeof(range));
+ nr_range = 0;
+ if (mtrr_tom2) {
+ range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
+ range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1;
+ if (highest_pfn < range[nr_range].end + 1)
+ highest_pfn = range[nr_range].end + 1;
+ nr_range++;
+ }
+ nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
+
+ total_trim_size = 0;
+ /* check the head */
+ if (range[0].start)
+ total_trim_size += real_trim_memory(0, range[0].start);
+ /* check the holes */
+ for (i = 0; i < nr_range - 1; i++) {
+ if (range[i].end + 1 < range[i+1].start)
+ total_trim_size += real_trim_memory(range[i].end + 1,
+ range[i+1].start);
+ }
+ /* check the top */
+ i = nr_range - 1;
+ if (range[i].end + 1 < end_pfn)
+ total_trim_size += real_trim_memory(range[i].end + 1,
+ end_pfn);
+
+ if (total_trim_size) {
printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover"
- " all of memory, losing %luMB of RAM.\n",
- (end_pfn - highest_pfn) >> (20 - PAGE_SHIFT));
+ " all of memory, losing %lluMB of RAM.\n",
+ total_trim_size >> 20);
- WARN_ON(1);
+ if (!changed_by_mtrr_cleanup)
+ WARN_ON(1);
printk(KERN_INFO "update e820 for mtrr\n");
- trim_start = highest_pfn;
- trim_start <<= PAGE_SHIFT;
- trim_size = end_pfn;
- trim_size <<= PAGE_SHIFT;
- trim_size -= trim_start;
- update_memory_range(trim_start, trim_size, E820_RAM,
- E820_RESERVED);
update_e820();
+
return 1;
}
@@ -729,18 +1578,21 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
*/
void __init mtrr_bp_init(void)
{
+ u32 phys_addr;
init_ifs();
+ phys_addr = 32;
+
if (cpu_has_mtrr) {
mtrr_if = &generic_mtrr_ops;
size_or_mask = 0xff000000; /* 36 bits */
size_and_mask = 0x00f00000;
+ phys_addr = 36;
/* This is an AMD specific MSR, but we assume(hope?) that
Intel will implement it to when they extend the address
bus of the Xeon. */
if (cpuid_eax(0x80000000) >= 0x80000008) {
- u32 phys_addr;
phys_addr = cpuid_eax(0x80000008) & 0xff;
/* CPUID workaround for Intel 0F33/0F34 CPU */
if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
@@ -758,6 +1610,7 @@ void __init mtrr_bp_init(void)
don't support PAE */
size_or_mask = 0xfff00000; /* 32 bits */
size_and_mask = 0;
+ phys_addr = 32;
}
} else {
switch (boot_cpu_data.x86_vendor) {
@@ -791,8 +1644,15 @@ void __init mtrr_bp_init(void)
if (mtrr_if) {
set_num_var_ranges();
init_table();
- if (use_intel())
+ if (use_intel()) {
get_mtrr_state();
+
+ if (mtrr_cleanup(phys_addr)) {
+ changed_by_mtrr_cleanup = 1;
+ mtrr_if->set_all();
+ }
+
+ }
}
}
@@ -822,16 +1682,17 @@ void mtrr_ap_init(void)
*/
void mtrr_save_state(void)
{
- smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1, 1);
+ smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1);
}
static int __init mtrr_init_finialize(void)
{
if (!mtrr_if)
return 0;
- if (use_intel())
- mtrr_state_warn();
- else {
+ if (use_intel()) {
+ if (!changed_by_mtrr_cleanup)
+ mtrr_state_warn();
+ } else {
/* The CPUs haven't MTRR and seem to not support SMP. They have
* specific drivers, we use a tricky method to support
* suspend/resume for them.
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 2cc77eb6fea..2dc4ec656b2 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -81,6 +81,8 @@ void set_mtrr_done(struct set_mtrr_context *ctxt);
void set_mtrr_cache_disable(struct set_mtrr_context *ctxt);
void set_mtrr_prepare_save(struct set_mtrr_context *ctxt);
+void fill_mtrr_var_range(unsigned int index,
+ u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi);
void get_mtrr_state(void);
extern void set_mtrr_ops(struct mtrr_ops * ops);
@@ -92,6 +94,7 @@ extern struct mtrr_ops * mtrr_if;
#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1)
extern unsigned int num_var_ranges;
+extern u64 mtrr_tom2;
void mtrr_state_warn(void);
const char *mtrr_attrib_to_str(int x);
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index f9ae93adffe..6d4bdc02388 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -1,11 +1,15 @@
-/* local apic based NMI watchdog for various CPUs.
- This file also handles reservation of performance counters for coordination
- with other users (like oprofile).
-
- Note that these events normally don't tick when the CPU idles. This means
- the frequency varies with CPU load.
-
- Original code for K7/P6 written by Keith Owens */
+/*
+ * local apic based NMI watchdog for various CPUs.
+ *
+ * This file also handles reservation of performance counters for coordination
+ * with other users (like oprofile).
+ *
+ * Note that these events normally don't tick when the CPU idles. This means
+ * the frequency varies with CPU load.
+ *
+ * Original code for K7/P6 written by Keith Owens
+ *
+ */
#include <linux/percpu.h>
#include <linux/module.h>
@@ -36,12 +40,16 @@ struct wd_ops {
static const struct wd_ops *wd_ops;
-/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
- * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now)
+/*
+ * this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
+ * offset from MSR_P4_BSU_ESCR0.
+ *
+ * It will be the max for all platforms (for now)
*/
#define NMI_MAX_COUNTER_BITS 66
-/* perfctr_nmi_owner tracks the ownership of the perfctr registers:
+/*
+ * perfctr_nmi_owner tracks the ownership of the perfctr registers:
* evtsel_nmi_owner tracks the ownership of the event selection
* - different performance counters/ event selection may be reserved for
* different subsystems this reservation system just tries to coordinate
@@ -73,8 +81,10 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
return 0;
}
-/* converts an msr to an appropriate reservation bit */
-/* returns the bit offset of the event selection register */
+/*
+ * converts an msr to an appropriate reservation bit
+ * returns the bit offset of the event selection register
+ */
static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
{
/* returns the bit offset of the event selection register */
@@ -114,6 +124,7 @@ int avail_to_resrv_perfctr_nmi(unsigned int msr)
return (!test_bit(counter, perfctr_nmi_owner));
}
+EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
int reserve_perfctr_nmi(unsigned int msr)
{
@@ -128,6 +139,7 @@ int reserve_perfctr_nmi(unsigned int msr)
return 1;
return 0;
}
+EXPORT_SYMBOL(reserve_perfctr_nmi);
void release_perfctr_nmi(unsigned int msr)
{
@@ -140,6 +152,7 @@ void release_perfctr_nmi(unsigned int msr)
clear_bit(counter, perfctr_nmi_owner);
}
+EXPORT_SYMBOL(release_perfctr_nmi);
int reserve_evntsel_nmi(unsigned int msr)
{
@@ -154,6 +167,7 @@ int reserve_evntsel_nmi(unsigned int msr)
return 1;
return 0;
}
+EXPORT_SYMBOL(reserve_evntsel_nmi);
void release_evntsel_nmi(unsigned int msr)
{
@@ -166,11 +180,6 @@ void release_evntsel_nmi(unsigned int msr)
clear_bit(counter, evntsel_nmi_owner);
}
-
-EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
-EXPORT_SYMBOL(reserve_perfctr_nmi);
-EXPORT_SYMBOL(release_perfctr_nmi);
-EXPORT_SYMBOL(reserve_evntsel_nmi);
EXPORT_SYMBOL(release_evntsel_nmi);
void disable_lapic_nmi_watchdog(void)
@@ -180,8 +189,10 @@ void disable_lapic_nmi_watchdog(void)
if (atomic_read(&nmi_active) <= 0)
return;
- on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
- wd_ops->unreserve();
+ on_each_cpu(stop_apic_nmi_watchdog, NULL, 1);
+
+ if (wd_ops)
+ wd_ops->unreserve();
BUG_ON(atomic_read(&nmi_active) != 0);
}
@@ -202,7 +213,7 @@ void enable_lapic_nmi_watchdog(void)
return;
}
- on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
+ on_each_cpu(setup_apic_nmi_watchdog, NULL, 1);
touch_nmi_watchdog();
}
@@ -232,8 +243,8 @@ static unsigned int adjust_for_32bit_ctr(unsigned int hz)
return retval;
}
-static void
-write_watchdog_counter(unsigned int perfctr_msr, const char *descr, unsigned nmi_hz)
+static void write_watchdog_counter(unsigned int perfctr_msr,
+ const char *descr, unsigned nmi_hz)
{
u64 count = (u64)cpu_khz * 1000;
@@ -244,7 +255,7 @@ write_watchdog_counter(unsigned int perfctr_msr, const char *descr, unsigned nmi
}
static void write_watchdog_counter32(unsigned int perfctr_msr,
- const char *descr, unsigned nmi_hz)
+ const char *descr, unsigned nmi_hz)
{
u64 count = (u64)cpu_khz * 1000;
@@ -254,9 +265,10 @@ static void write_watchdog_counter32(unsigned int perfctr_msr,
wrmsr(perfctr_msr, (u32)(-count), 0);
}
-/* AMD K7/K8/Family10h/Family11h support. AMD keeps this interface
- nicely stable so there is not much variety */
-
+/*
+ * AMD K7/K8/Family10h/Family11h support.
+ * AMD keeps this interface nicely stable so there is not much variety
+ */
#define K7_EVNTSEL_ENABLE (1 << 22)
#define K7_EVNTSEL_INT (1 << 20)
#define K7_EVNTSEL_OS (1 << 17)
@@ -289,7 +301,7 @@ static int setup_k7_watchdog(unsigned nmi_hz)
wd->perfctr_msr = perfctr_msr;
wd->evntsel_msr = evntsel_msr;
- wd->cccr_msr = 0; //unused
+ wd->cccr_msr = 0; /* unused */
return 1;
}
@@ -325,18 +337,19 @@ static void single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
}
static const struct wd_ops k7_wd_ops = {
- .reserve = single_msr_reserve,
- .unreserve = single_msr_unreserve,
- .setup = setup_k7_watchdog,
- .rearm = single_msr_rearm,
- .stop = single_msr_stop_watchdog,
- .perfctr = MSR_K7_PERFCTR0,
- .evntsel = MSR_K7_EVNTSEL0,
- .checkbit = 1ULL<<47,
+ .reserve = single_msr_reserve,
+ .unreserve = single_msr_unreserve,
+ .setup = setup_k7_watchdog,
+ .rearm = single_msr_rearm,
+ .stop = single_msr_stop_watchdog,
+ .perfctr = MSR_K7_PERFCTR0,
+ .evntsel = MSR_K7_EVNTSEL0,
+ .checkbit = 1ULL << 47,
};
-/* Intel Model 6 (PPro+,P2,P3,P-M,Core1) */
-
+/*
+ * Intel Model 6 (PPro+,P2,P3,P-M,Core1)
+ */
#define P6_EVNTSEL0_ENABLE (1 << 22)
#define P6_EVNTSEL_INT (1 << 20)
#define P6_EVNTSEL_OS (1 << 17)
@@ -372,52 +385,58 @@ static int setup_p6_watchdog(unsigned nmi_hz)
wd->perfctr_msr = perfctr_msr;
wd->evntsel_msr = evntsel_msr;
- wd->cccr_msr = 0; //unused
+ wd->cccr_msr = 0; /* unused */
return 1;
}
static void p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
{
- /* P6 based Pentium M need to re-unmask
+ /*
+ * P6 based Pentium M need to re-unmask
* the apic vector but it doesn't hurt
* other P6 variant.
- * ArchPerfom/Core Duo also needs this */
+ * ArchPerfom/Core Duo also needs this
+ */
apic_write(APIC_LVTPC, APIC_DM_NMI);
+
/* P6/ARCH_PERFMON has 32 bit counter write */
write_watchdog_counter32(wd->perfctr_msr, NULL,nmi_hz);
}
static const struct wd_ops p6_wd_ops = {
- .reserve = single_msr_reserve,
- .unreserve = single_msr_unreserve,
- .setup = setup_p6_watchdog,
- .rearm = p6_rearm,
- .stop = single_msr_stop_watchdog,
- .perfctr = MSR_P6_PERFCTR0,
- .evntsel = MSR_P6_EVNTSEL0,
- .checkbit = 1ULL<<39,
+ .reserve = single_msr_reserve,
+ .unreserve = single_msr_unreserve,
+ .setup = setup_p6_watchdog,
+ .rearm = p6_rearm,
+ .stop = single_msr_stop_watchdog,
+ .perfctr = MSR_P6_PERFCTR0,
+ .evntsel = MSR_P6_EVNTSEL0,
+ .checkbit = 1ULL << 39,
};
-/* Intel P4 performance counters. By far the most complicated of all. */
-
-#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7)
-#define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
-#define P4_ESCR_OS (1<<3)
-#define P4_ESCR_USR (1<<2)
-#define P4_CCCR_OVF_PMI0 (1<<26)
-#define P4_CCCR_OVF_PMI1 (1<<27)
-#define P4_CCCR_THRESHOLD(N) ((N)<<20)
-#define P4_CCCR_COMPLEMENT (1<<19)
-#define P4_CCCR_COMPARE (1<<18)
-#define P4_CCCR_REQUIRED (3<<16)
-#define P4_CCCR_ESCR_SELECT(N) ((N)<<13)
-#define P4_CCCR_ENABLE (1<<12)
-#define P4_CCCR_OVF (1<<31)
-
-/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
- CRU_ESCR0 (with any non-null event selector) through a complemented
- max threshold. [IA32-Vol3, Section 14.9.9] */
+/*
+ * Intel P4 performance counters.
+ * By far the most complicated of all.
+ */
+#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1 << 7)
+#define P4_ESCR_EVENT_SELECT(N) ((N) << 25)
+#define P4_ESCR_OS (1 << 3)
+#define P4_ESCR_USR (1 << 2)
+#define P4_CCCR_OVF_PMI0 (1 << 26)
+#define P4_CCCR_OVF_PMI1 (1 << 27)
+#define P4_CCCR_THRESHOLD(N) ((N) << 20)
+#define P4_CCCR_COMPLEMENT (1 << 19)
+#define P4_CCCR_COMPARE (1 << 18)
+#define P4_CCCR_REQUIRED (3 << 16)
+#define P4_CCCR_ESCR_SELECT(N) ((N) << 13)
+#define P4_CCCR_ENABLE (1 << 12)
+#define P4_CCCR_OVF (1 << 31)
+/*
+ * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
+ * CRU_ESCR0 (with any non-null event selector) through a complemented
+ * max threshold. [IA32-Vol3, Section 14.9.9]
+ */
static int setup_p4_watchdog(unsigned nmi_hz)
{
unsigned int perfctr_msr, evntsel_msr, cccr_msr;
@@ -442,7 +461,8 @@ static int setup_p4_watchdog(unsigned nmi_hz)
#endif
ht_num = 0;
- /* performance counters are shared resources
+ /*
+ * performance counters are shared resources
* assign each hyperthread its own set
* (re-use the ESCR0 register, seems safe
* and keeps the cccr_val the same)
@@ -540,20 +560,21 @@ static void p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
}
static const struct wd_ops p4_wd_ops = {
- .reserve = p4_reserve,
- .unreserve = p4_unreserve,
- .setup = setup_p4_watchdog,
- .rearm = p4_rearm,
- .stop = stop_p4_watchdog,
+ .reserve = p4_reserve,
+ .unreserve = p4_unreserve,
+ .setup = setup_p4_watchdog,
+ .rearm = p4_rearm,
+ .stop = stop_p4_watchdog,
/* RED-PEN this is wrong for the other sibling */
- .perfctr = MSR_P4_BPU_PERFCTR0,
- .evntsel = MSR_P4_BSU_ESCR0,
- .checkbit = 1ULL<<39,
+ .perfctr = MSR_P4_BPU_PERFCTR0,
+ .evntsel = MSR_P4_BSU_ESCR0,
+ .checkbit = 1ULL << 39,
};
-/* Watchdog using the Intel architected PerfMon. Used for Core2 and hopefully
- all future Intel CPUs. */
-
+/*
+ * Watchdog using the Intel architected PerfMon.
+ * Used for Core2 and hopefully all future Intel CPUs.
+ */
#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
@@ -599,19 +620,19 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz)
wd->perfctr_msr = perfctr_msr;
wd->evntsel_msr = evntsel_msr;
- wd->cccr_msr = 0; //unused
+ wd->cccr_msr = 0; /* unused */
intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
return 1;
}
static struct wd_ops intel_arch_wd_ops __read_mostly = {
- .reserve = single_msr_reserve,
- .unreserve = single_msr_unreserve,
- .setup = setup_intel_arch_watchdog,
- .rearm = p6_rearm,
- .stop = single_msr_stop_watchdog,
- .perfctr = MSR_ARCH_PERFMON_PERFCTR1,
- .evntsel = MSR_ARCH_PERFMON_EVENTSEL1,
+ .reserve = single_msr_reserve,
+ .unreserve = single_msr_unreserve,
+ .setup = setup_intel_arch_watchdog,
+ .rearm = p6_rearm,
+ .stop = single_msr_stop_watchdog,
+ .perfctr = MSR_ARCH_PERFMON_PERFCTR1,
+ .evntsel = MSR_ARCH_PERFMON_EVENTSEL1,
};
static void probe_nmi_watchdog(void)
@@ -624,8 +645,10 @@ static void probe_nmi_watchdog(void)
wd_ops = &k7_wd_ops;
break;
case X86_VENDOR_INTEL:
- /* Work around Core Duo (Yonah) errata AE49 where perfctr1
- doesn't have a working enable bit. */
+ /*
+ * Work around Core Duo (Yonah) errata AE49 where perfctr1
+ * doesn't have a working enable bit.
+ */
if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) {
intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0;
intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0;
@@ -636,7 +659,7 @@ static void probe_nmi_watchdog(void)
}
switch (boot_cpu_data.x86) {
case 6:
- if (boot_cpu_data.x86_model > 0xd)
+ if (boot_cpu_data.x86_model > 13)
return;
wd_ops = &p6_wd_ops;
@@ -697,10 +720,11 @@ int lapic_wd_event(unsigned nmi_hz)
{
struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
u64 ctr;
+
rdmsrl(wd->perfctr_msr, ctr);
- if (ctr & wd_ops->checkbit) { /* perfctr still running? */
+ if (ctr & wd_ops->checkbit) /* perfctr still running? */
return 0;
- }
+
wd_ops->rearm(wd, nmi_hz);
return 1;
}
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index daff52a6224..2de5fa2bbf7 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -33,6 +33,7 @@
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/smp.h>
+#include <linux/smp_lock.h>
#include <linux/major.h>
#include <linux/fs.h>
#include <linux/smp_lock.h>
@@ -95,7 +96,7 @@ static ssize_t cpuid_read(struct file *file, char __user *buf,
for (; count; count -= 16) {
cmd.eax = pos;
cmd.ecx = pos >> 32;
- smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1, 1);
+ smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1);
if (copy_to_user(tmp, &cmd, 16))
return -EFAULT;
tmp += 16;
@@ -107,15 +108,23 @@ static ssize_t cpuid_read(struct file *file, char __user *buf,
static int cpuid_open(struct inode *inode, struct file *file)
{
- unsigned int cpu = iminor(file->f_path.dentry->d_inode);
- struct cpuinfo_x86 *c = &cpu_data(cpu);
-
- if (cpu >= NR_CPUS || !cpu_online(cpu))
- return -ENXIO; /* No such CPU */
+ unsigned int cpu;
+ struct cpuinfo_x86 *c;
+ int ret = 0;
+
+ lock_kernel();
+
+ cpu = iminor(file->f_path.dentry->d_inode);
+ if (cpu >= NR_CPUS || !cpu_online(cpu)) {
+ ret = -ENXIO; /* No such CPU */
+ goto out;
+ }
+ c = &cpu_data(cpu);
if (c->cpuid_level < 0)
- return -EIO; /* CPUID not supported */
-
- return 0;
+ ret = -EIO; /* CPUID not supported */
+out:
+ unlock_kernel();
+ return ret;
}
/*
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
new file mode 100644
index 00000000000..28c29180b38
--- /dev/null
+++ b/arch/x86/kernel/e820.c
@@ -0,0 +1,1390 @@
+/*
+ * Handle the memory map.
+ * The functions here do the job until bootmem takes over.
+ *
+ * Getting sanitize_e820_map() in sync with i386 version by applying change:
+ * - Provisions for empty E820 memory regions (reported by certain BIOSes).
+ * Alex Achenbach <xela@slit.de>, December 2002.
+ * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/ioport.h>
+#include <linux/string.h>
+#include <linux/kexec.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/pfn.h>
+#include <linux/suspend.h>
+#include <linux/firmware-map.h>
+
+#include <asm/pgtable.h>
+#include <asm/page.h>
+#include <asm/e820.h>
+#include <asm/proto.h>
+#include <asm/setup.h>
+#include <asm/trampoline.h>
+
+/*
+ * The e820 map is the map that gets modified e.g. with command line parameters
+ * and that is also registered with modifications in the kernel resource tree
+ * with the iomem_resource as parent.
+ *
+ * The e820_saved is directly saved after the BIOS-provided memory map is
+ * copied. It doesn't get modified afterwards. It's registered for the
+ * /sys/firmware/memmap interface.
+ *
+ * That memory map is not modified and is used as base for kexec. The kexec'd
+ * kernel should get the same memory map as the firmware provides. Then the
+ * user can e.g. boot the original kernel with mem=1G while still booting the
+ * next kernel with full memory.
+ */
+struct e820map e820;
+struct e820map e820_saved;
+
+/* For PCI or other memory-mapped resources */
+unsigned long pci_mem_start = 0xaeedbabe;
+#ifdef CONFIG_PCI
+EXPORT_SYMBOL(pci_mem_start);
+#endif
+
+/*
+ * This function checks if any part of the range <start,end> is mapped
+ * with type.
+ */
+int
+e820_any_mapped(u64 start, u64 end, unsigned type)
+{
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+
+ if (type && ei->type != type)
+ continue;
+ if (ei->addr >= end || ei->addr + ei->size <= start)
+ continue;
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(e820_any_mapped);
+
+/*
+ * This function checks if the entire range <start,end> is mapped with type.
+ *
+ * Note: this function only works correct if the e820 table is sorted and
+ * not-overlapping, which is the case
+ */
+int __init e820_all_mapped(u64 start, u64 end, unsigned type)
+{
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+
+ if (type && ei->type != type)
+ continue;
+ /* is the region (part) in overlap with the current region ?*/
+ if (ei->addr >= end || ei->addr + ei->size <= start)
+ continue;
+
+ /* if the region is at the beginning of <start,end> we move
+ * start to the end of the region since it's ok until there
+ */
+ if (ei->addr <= start)
+ start = ei->addr + ei->size;
+ /*
+ * if start is now at or beyond end, we're done, full
+ * coverage
+ */
+ if (start >= end)
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Add a memory region to the kernel e820 map.
+ */
+void __init e820_add_region(u64 start, u64 size, int type)
+{
+ int x = e820.nr_map;
+
+ if (x == ARRAY_SIZE(e820.map)) {
+ printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
+ return;
+ }
+
+ e820.map[x].addr = start;
+ e820.map[x].size = size;
+ e820.map[x].type = type;
+ e820.nr_map++;
+}
+
+void __init e820_print_map(char *who)
+{
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
+ (unsigned long long) e820.map[i].addr,
+ (unsigned long long)
+ (e820.map[i].addr + e820.map[i].size));
+ switch (e820.map[i].type) {
+ case E820_RAM:
+ case E820_RESERVED_KERN:
+ printk(KERN_CONT "(usable)\n");
+ break;
+ case E820_RESERVED:
+ printk(KERN_CONT "(reserved)\n");
+ break;
+ case E820_ACPI:
+ printk(KERN_CONT "(ACPI data)\n");
+ break;
+ case E820_NVS:
+ printk(KERN_CONT "(ACPI NVS)\n");
+ break;
+ default:
+ printk(KERN_CONT "type %u\n", e820.map[i].type);
+ break;
+ }
+ }
+}
+
+/*
+ * Sanitize the BIOS e820 map.
+ *
+ * Some e820 responses include overlapping entries. The following
+ * replaces the original e820 map with a new one, removing overlaps,
+ * and resolving conflicting memory types in favor of highest
+ * numbered type.
+ *
+ * The input parameter biosmap points to an array of 'struct
+ * e820entry' which on entry has elements in the range [0, *pnr_map)
+ * valid, and which has space for up to max_nr_map entries.
+ * On return, the resulting sanitized e820 map entries will be in
+ * overwritten in the same location, starting at biosmap.
+ *
+ * The integer pointed to by pnr_map must be valid on entry (the
+ * current number of valid entries located at biosmap) and will
+ * be updated on return, with the new number of valid entries
+ * (something no more than max_nr_map.)
+ *
+ * The return value from sanitize_e820_map() is zero if it
+ * successfully 'sanitized' the map entries passed in, and is -1
+ * if it did nothing, which can happen if either of (1) it was
+ * only passed one map entry, or (2) any of the input map entries
+ * were invalid (start + size < start, meaning that the size was
+ * so big the described memory range wrapped around through zero.)
+ *
+ * Visually we're performing the following
+ * (1,2,3,4 = memory types)...
+ *
+ * Sample memory map (w/overlaps):
+ * ____22__________________
+ * ______________________4_
+ * ____1111________________
+ * _44_____________________
+ * 11111111________________
+ * ____________________33__
+ * ___________44___________
+ * __________33333_________
+ * ______________22________
+ * ___________________2222_
+ * _________111111111______
+ * _____________________11_
+ * _________________4______
+ *
+ * Sanitized equivalent (no overlap):
+ * 1_______________________
+ * _44_____________________
+ * ___1____________________
+ * ____22__________________
+ * ______11________________
+ * _________1______________
+ * __________3_____________
+ * ___________44___________
+ * _____________33_________
+ * _______________2________
+ * ________________1_______
+ * _________________4______
+ * ___________________2____
+ * ____________________33__
+ * ______________________4_
+ */
+
+int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
+ int *pnr_map)
+{
+ struct change_member {
+ struct e820entry *pbios; /* pointer to original bios entry */
+ unsigned long long addr; /* address for this change point */
+ };
+ static struct change_member change_point_list[2*E820_X_MAX] __initdata;
+ static struct change_member *change_point[2*E820_X_MAX] __initdata;
+ static struct e820entry *overlap_list[E820_X_MAX] __initdata;
+ static struct e820entry new_bios[E820_X_MAX] __initdata;
+ struct change_member *change_tmp;
+ unsigned long current_type, last_type;
+ unsigned long long last_addr;
+ int chgidx, still_changing;
+ int overlap_entries;
+ int new_bios_entry;
+ int old_nr, new_nr, chg_nr;
+ int i;
+
+ /* if there's only one memory region, don't bother */
+ if (*pnr_map < 2)
+ return -1;
+
+ old_nr = *pnr_map;
+ BUG_ON(old_nr > max_nr_map);
+
+ /* bail out if we find any unreasonable addresses in bios map */
+ for (i = 0; i < old_nr; i++)
+ if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
+ return -1;
+
+ /* create pointers for initial change-point information (for sorting) */
+ for (i = 0; i < 2 * old_nr; i++)
+ change_point[i] = &change_point_list[i];
+
+ /* record all known change-points (starting and ending addresses),
+ omitting those that are for empty memory regions */
+ chgidx = 0;
+ for (i = 0; i < old_nr; i++) {
+ if (biosmap[i].size != 0) {
+ change_point[chgidx]->addr = biosmap[i].addr;
+ change_point[chgidx++]->pbios = &biosmap[i];
+ change_point[chgidx]->addr = biosmap[i].addr +
+ biosmap[i].size;
+ change_point[chgidx++]->pbios = &biosmap[i];
+ }
+ }
+ chg_nr = chgidx;
+
+ /* sort change-point list by memory addresses (low -> high) */
+ still_changing = 1;
+ while (still_changing) {
+ still_changing = 0;
+ for (i = 1; i < chg_nr; i++) {
+ unsigned long long curaddr, lastaddr;
+ unsigned long long curpbaddr, lastpbaddr;
+
+ curaddr = change_point[i]->addr;
+ lastaddr = change_point[i - 1]->addr;
+ curpbaddr = change_point[i]->pbios->addr;
+ lastpbaddr = change_point[i - 1]->pbios->addr;
+
+ /*
+ * swap entries, when:
+ *
+ * curaddr > lastaddr or
+ * curaddr == lastaddr and curaddr == curpbaddr and
+ * lastaddr != lastpbaddr
+ */
+ if (curaddr < lastaddr ||
+ (curaddr == lastaddr && curaddr == curpbaddr &&
+ lastaddr != lastpbaddr)) {
+ change_tmp = change_point[i];
+ change_point[i] = change_point[i-1];
+ change_point[i-1] = change_tmp;
+ still_changing = 1;
+ }
+ }
+ }
+
+ /* create a new bios memory map, removing overlaps */
+ overlap_entries = 0; /* number of entries in the overlap table */
+ new_bios_entry = 0; /* index for creating new bios map entries */
+ last_type = 0; /* start with undefined memory type */
+ last_addr = 0; /* start with 0 as last starting address */
+
+ /* loop through change-points, determining affect on the new bios map */
+ for (chgidx = 0; chgidx < chg_nr; chgidx++) {
+ /* keep track of all overlapping bios entries */
+ if (change_point[chgidx]->addr ==
+ change_point[chgidx]->pbios->addr) {
+ /*
+ * add map entry to overlap list (> 1 entry
+ * implies an overlap)
+ */
+ overlap_list[overlap_entries++] =
+ change_point[chgidx]->pbios;
+ } else {
+ /*
+ * remove entry from list (order independent,
+ * so swap with last)
+ */
+ for (i = 0; i < overlap_entries; i++) {
+ if (overlap_list[i] ==
+ change_point[chgidx]->pbios)
+ overlap_list[i] =
+ overlap_list[overlap_entries-1];
+ }
+ overlap_entries--;
+ }
+ /*
+ * if there are overlapping entries, decide which
+ * "type" to use (larger value takes precedence --
+ * 1=usable, 2,3,4,4+=unusable)
+ */
+ current_type = 0;
+ for (i = 0; i < overlap_entries; i++)
+ if (overlap_list[i]->type > current_type)
+ current_type = overlap_list[i]->type;
+ /*
+ * continue building up new bios map based on this
+ * information
+ */
+ if (current_type != last_type) {
+ if (last_type != 0) {
+ new_bios[new_bios_entry].size =
+ change_point[chgidx]->addr - last_addr;
+ /*
+ * move forward only if the new size
+ * was non-zero
+ */
+ if (new_bios[new_bios_entry].size != 0)
+ /*
+ * no more space left for new
+ * bios entries ?
+ */
+ if (++new_bios_entry >= max_nr_map)
+ break;
+ }
+ if (current_type != 0) {
+ new_bios[new_bios_entry].addr =
+ change_point[chgidx]->addr;
+ new_bios[new_bios_entry].type = current_type;
+ last_addr = change_point[chgidx]->addr;
+ }
+ last_type = current_type;
+ }
+ }
+ /* retain count for new bios entries */
+ new_nr = new_bios_entry;
+
+ /* copy new bios mapping into original location */
+ memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
+ *pnr_map = new_nr;
+
+ return 0;
+}
+
+static int __init __append_e820_map(struct e820entry *biosmap, int nr_map)
+{
+ while (nr_map) {
+ u64 start = biosmap->addr;
+ u64 size = biosmap->size;
+ u64 end = start + size;
+ u32 type = biosmap->type;
+
+ /* Overflow in 64 bits? Ignore the memory map. */
+ if (start > end)
+ return -1;
+
+ e820_add_region(start, size, type);
+
+ biosmap++;
+ nr_map--;
+ }
+ return 0;
+}
+
+/*
+ * Copy the BIOS e820 map into a safe place.
+ *
+ * Sanity-check it while we're at it..
+ *
+ * If we're lucky and live on a modern system, the setup code
+ * will have given us a memory map that we can use to properly
+ * set up memory. If we aren't, we'll fake a memory map.
+ */
+static int __init append_e820_map(struct e820entry *biosmap, int nr_map)
+{
+ /* Only one memory region (or negative)? Ignore it */
+ if (nr_map < 2)
+ return -1;
+
+ return __append_e820_map(biosmap, nr_map);
+}
+
+static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
+ u64 size, unsigned old_type,
+ unsigned new_type)
+{
+ int i;
+ u64 real_updated_size = 0;
+
+ BUG_ON(old_type == new_type);
+
+ if (size > (ULLONG_MAX - start))
+ size = ULLONG_MAX - start;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820x->map[i];
+ u64 final_start, final_end;
+ if (ei->type != old_type)
+ continue;
+ /* totally covered? */
+ if (ei->addr >= start &&
+ (ei->addr + ei->size) <= (start + size)) {
+ ei->type = new_type;
+ real_updated_size += ei->size;
+ continue;
+ }
+ /* partially covered */
+ final_start = max(start, ei->addr);
+ final_end = min(start + size, ei->addr + ei->size);
+ if (final_start >= final_end)
+ continue;
+ e820_add_region(final_start, final_end - final_start,
+ new_type);
+ real_updated_size += final_end - final_start;
+
+ ei->size -= final_end - final_start;
+ if (ei->addr < final_start)
+ continue;
+ ei->addr = final_end;
+ }
+ return real_updated_size;
+}
+
+u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
+ unsigned new_type)
+{
+ return e820_update_range_map(&e820, start, size, old_type, new_type);
+}
+
+static u64 __init e820_update_range_saved(u64 start, u64 size,
+ unsigned old_type, unsigned new_type)
+{
+ return e820_update_range_map(&e820_saved, start, size, old_type,
+ new_type);
+}
+
+/* make e820 not cover the range */
+u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
+ int checktype)
+{
+ int i;
+ u64 real_removed_size = 0;
+
+ if (size > (ULLONG_MAX - start))
+ size = ULLONG_MAX - start;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ u64 final_start, final_end;
+
+ if (checktype && ei->type != old_type)
+ continue;
+ /* totally covered? */
+ if (ei->addr >= start &&
+ (ei->addr + ei->size) <= (start + size)) {
+ real_removed_size += ei->size;
+ memset(ei, 0, sizeof(struct e820entry));
+ continue;
+ }
+ /* partially covered */
+ final_start = max(start, ei->addr);
+ final_end = min(start + size, ei->addr + ei->size);
+ if (final_start >= final_end)
+ continue;
+ real_removed_size += final_end - final_start;
+
+ ei->size -= final_end - final_start;
+ if (ei->addr < final_start)
+ continue;
+ ei->addr = final_end;
+ }
+ return real_removed_size;
+}
+
+void __init update_e820(void)
+{
+ int nr_map;
+
+ nr_map = e820.nr_map;
+ if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map))
+ return;
+ e820.nr_map = nr_map;
+ printk(KERN_INFO "modified physical RAM map:\n");
+ e820_print_map("modified");
+}
+static void __init update_e820_saved(void)
+{
+ int nr_map;
+
+ nr_map = e820_saved.nr_map;
+ if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map))
+ return;
+ e820_saved.nr_map = nr_map;
+}
+#define MAX_GAP_END 0x100000000ull
+/*
+ * Search for a gap in the e820 memory space from start_addr to end_addr.
+ */
+__init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
+ unsigned long start_addr, unsigned long long end_addr)
+{
+ unsigned long long last;
+ int i = e820.nr_map;
+ int found = 0;
+
+ last = (end_addr && end_addr < MAX_GAP_END) ? end_addr : MAX_GAP_END;
+
+ while (--i >= 0) {
+ unsigned long long start = e820.map[i].addr;
+ unsigned long long end = start + e820.map[i].size;
+
+ if (end < start_addr)
+ continue;
+
+ /*
+ * Since "last" is at most 4GB, we know we'll
+ * fit in 32 bits if this condition is true
+ */
+ if (last > end) {
+ unsigned long gap = last - end;
+
+ if (gap >= *gapsize) {
+ *gapsize = gap;
+ *gapstart = end;
+ found = 1;
+ }
+ }
+ if (start < last)
+ last = start;
+ }
+ return found;
+}
+
+/*
+ * Search for the biggest gap in the low 32 bits of the e820
+ * memory space. We pass this space to PCI to assign MMIO resources
+ * for hotplug or unconfigured devices in.
+ * Hopefully the BIOS let enough space left.
+ */
+__init void e820_setup_gap(void)
+{
+ unsigned long gapstart, gapsize, round;
+ int found;
+
+ gapstart = 0x10000000;
+ gapsize = 0x400000;
+ found = e820_search_gap(&gapstart, &gapsize, 0, MAX_GAP_END);
+
+#ifdef CONFIG_X86_64
+ if (!found) {
+ gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024;
+ printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
+ "address range\n"
+ KERN_ERR "PCI: Unassigned devices with 32bit resource "
+ "registers may break!\n");
+ }
+#endif
+
+ /*
+ * See how much we want to round up: start off with
+ * rounding to the next 1MB area.
+ */
+ round = 0x100000;
+ while ((gapsize >> 4) > round)
+ round += round;
+ /* Fun with two's complement */
+ pci_mem_start = (gapstart + round) & -round;
+
+ printk(KERN_INFO
+ "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
+ pci_mem_start, gapstart, gapsize);
+}
+
+/**
+ * Because of the size limitation of struct boot_params, only first
+ * 128 E820 memory entries are passed to kernel via
+ * boot_params.e820_map, others are passed via SETUP_E820_EXT node of
+ * linked list of struct setup_data, which is parsed here.
+ */
+void __init parse_e820_ext(struct setup_data *sdata, unsigned long pa_data)
+{
+ u32 map_len;
+ int entries;
+ struct e820entry *extmap;
+
+ entries = sdata->len / sizeof(struct e820entry);
+ map_len = sdata->len + sizeof(struct setup_data);
+ if (map_len > PAGE_SIZE)
+ sdata = early_ioremap(pa_data, map_len);
+ extmap = (struct e820entry *)(sdata->data);
+ __append_e820_map(extmap, entries);
+ sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+ if (map_len > PAGE_SIZE)
+ early_iounmap(sdata, map_len);
+ printk(KERN_INFO "extended physical RAM map:\n");
+ e820_print_map("extended");
+}
+
+#if defined(CONFIG_X86_64) || \
+ (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
+/**
+ * Find the ranges of physical addresses that do not correspond to
+ * e820 RAM areas and mark the corresponding pages as nosave for
+ * hibernation (32 bit) or software suspend and suspend to RAM (64 bit).
+ *
+ * This function requires the e820 map to be sorted and without any
+ * overlapping entries and assumes the first e820 area to be RAM.
+ */
+void __init e820_mark_nosave_regions(unsigned long limit_pfn)
+{
+ int i;
+ unsigned long pfn;
+
+ pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
+ for (i = 1; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+
+ if (pfn < PFN_UP(ei->addr))
+ register_nosave_region(pfn, PFN_UP(ei->addr));
+
+ pfn = PFN_DOWN(ei->addr + ei->size);
+ if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
+ register_nosave_region(PFN_UP(ei->addr), pfn);
+
+ if (pfn >= limit_pfn)
+ break;
+ }
+}
+#endif
+
+/*
+ * Early reserved memory areas.
+ */
+#define MAX_EARLY_RES 20
+
+struct early_res {
+ u64 start, end;
+ char name[16];
+ char overlap_ok;
+};
+static struct early_res early_res[MAX_EARLY_RES] __initdata = {
+ { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
+#if defined(CONFIG_X86_64) && defined(CONFIG_X86_TRAMPOLINE)
+ { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
+#endif
+#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
+ /*
+ * But first pinch a few for the stack/trampoline stuff
+ * FIXME: Don't need the extra page at 4K, but need to fix
+ * trampoline before removing it. (see the GDT stuff)
+ */
+ { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE" },
+ /*
+ * Has to be in very low memory so we can execute
+ * real-mode AP code.
+ */
+ { TRAMPOLINE_BASE, TRAMPOLINE_BASE + PAGE_SIZE, "TRAMPOLINE" },
+#endif
+ {}
+};
+
+static int __init find_overlapped_early(u64 start, u64 end)
+{
+ int i;
+ struct early_res *r;
+
+ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+ r = &early_res[i];
+ if (end > r->start && start < r->end)
+ break;
+ }
+
+ return i;
+}
+
+/*
+ * Drop the i-th range from the early reservation map,
+ * by copying any higher ranges down one over it, and
+ * clearing what had been the last slot.
+ */
+static void __init drop_range(int i)
+{
+ int j;
+
+ for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
+ ;
+
+ memmove(&early_res[i], &early_res[i + 1],
+ (j - 1 - i) * sizeof(struct early_res));
+
+ early_res[j - 1].end = 0;
+}
+
+/*
+ * Split any existing ranges that:
+ * 1) are marked 'overlap_ok', and
+ * 2) overlap with the stated range [start, end)
+ * into whatever portion (if any) of the existing range is entirely
+ * below or entirely above the stated range. Drop the portion
+ * of the existing range that overlaps with the stated range,
+ * which will allow the caller of this routine to then add that
+ * stated range without conflicting with any existing range.
+ */
+static void __init drop_overlaps_that_are_ok(u64 start, u64 end)
+{
+ int i;
+ struct early_res *r;
+ u64 lower_start, lower_end;
+ u64 upper_start, upper_end;
+ char name[16];
+
+ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+ r = &early_res[i];
+
+ /* Continue past non-overlapping ranges */
+ if (end <= r->start || start >= r->end)
+ continue;
+
+ /*
+ * Leave non-ok overlaps as is; let caller
+ * panic "Overlapping early reservations"
+ * when it hits this overlap.
+ */
+ if (!r->overlap_ok)
+ return;
+
+ /*
+ * We have an ok overlap. We will drop it from the early
+ * reservation map, and add back in any non-overlapping
+ * portions (lower or upper) as separate, overlap_ok,
+ * non-overlapping ranges.
+ */
+
+ /* 1. Note any non-overlapping (lower or upper) ranges. */
+ strncpy(name, r->name, sizeof(name) - 1);
+
+ lower_start = lower_end = 0;
+ upper_start = upper_end = 0;
+ if (r->start < start) {
+ lower_start = r->start;
+ lower_end = start;
+ }
+ if (r->end > end) {
+ upper_start = end;
+ upper_end = r->end;
+ }
+
+ /* 2. Drop the original ok overlapping range */
+ drop_range(i);
+
+ i--; /* resume for-loop on copied down entry */
+
+ /* 3. Add back in any non-overlapping ranges. */
+ if (lower_end)
+ reserve_early_overlap_ok(lower_start, lower_end, name);
+ if (upper_end)
+ reserve_early_overlap_ok(upper_start, upper_end, name);
+ }
+}
+
+static void __init __reserve_early(u64 start, u64 end, char *name,
+ int overlap_ok)
+{
+ int i;
+ struct early_res *r;
+
+ i = find_overlapped_early(start, end);
+ if (i >= MAX_EARLY_RES)
+ panic("Too many early reservations");
+ r = &early_res[i];
+ if (r->end)
+ panic("Overlapping early reservations "
+ "%llx-%llx %s to %llx-%llx %s\n",
+ start, end - 1, name?name:"", r->start,
+ r->end - 1, r->name);
+ r->start = start;
+ r->end = end;
+ r->overlap_ok = overlap_ok;
+ if (name)
+ strncpy(r->name, name, sizeof(r->name) - 1);
+}
+
+/*
+ * A few early reservtations come here.
+ *
+ * The 'overlap_ok' in the name of this routine does -not- mean it
+ * is ok for these reservations to overlap an earlier reservation.
+ * Rather it means that it is ok for subsequent reservations to
+ * overlap this one.
+ *
+ * Use this entry point to reserve early ranges when you are doing
+ * so out of "Paranoia", reserving perhaps more memory than you need,
+ * just in case, and don't mind a subsequent overlapping reservation
+ * that is known to be needed.
+ *
+ * The drop_overlaps_that_are_ok() call here isn't really needed.
+ * It would be needed if we had two colliding 'overlap_ok'
+ * reservations, so that the second such would not panic on the
+ * overlap with the first. We don't have any such as of this
+ * writing, but might as well tolerate such if it happens in
+ * the future.
+ */
+void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
+{
+ drop_overlaps_that_are_ok(start, end);
+ __reserve_early(start, end, name, 1);
+}
+
+/*
+ * Most early reservations come here.
+ *
+ * We first have drop_overlaps_that_are_ok() drop any pre-existing
+ * 'overlap_ok' ranges, so that we can then reserve this memory
+ * range without risk of panic'ing on an overlapping overlap_ok
+ * early reservation.
+ */
+void __init reserve_early(u64 start, u64 end, char *name)
+{
+ drop_overlaps_that_are_ok(start, end);
+ __reserve_early(start, end, name, 0);
+}
+
+void __init free_early(u64 start, u64 end)
+{
+ struct early_res *r;
+ int i;
+
+ i = find_overlapped_early(start, end);
+ r = &early_res[i];
+ if (i >= MAX_EARLY_RES || r->end != end || r->start != start)
+ panic("free_early on not reserved area: %llx-%llx!",
+ start, end - 1);
+
+ drop_range(i);
+}
+
+void __init early_res_to_bootmem(u64 start, u64 end)
+{
+ int i, count;
+ u64 final_start, final_end;
+
+ count = 0;
+ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
+ count++;
+
+ printk(KERN_INFO "(%d early reservations) ==> bootmem\n", count);
+ for (i = 0; i < count; i++) {
+ struct early_res *r = &early_res[i];
+ printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
+ r->start, r->end, r->name);
+ final_start = max(start, r->start);
+ final_end = min(end, r->end);
+ if (final_start >= final_end) {
+ printk(KERN_CONT "\n");
+ continue;
+ }
+ printk(KERN_CONT " ==> [%010llx - %010llx]\n",
+ final_start, final_end);
+ reserve_bootmem_generic(final_start, final_end - final_start,
+ BOOTMEM_DEFAULT);
+ }
+}
+
+/* Check for already reserved areas */
+static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
+{
+ int i;
+ u64 addr = *addrp;
+ int changed = 0;
+ struct early_res *r;
+again:
+ i = find_overlapped_early(addr, addr + size);
+ r = &early_res[i];
+ if (i < MAX_EARLY_RES && r->end) {
+ *addrp = addr = round_up(r->end, align);
+ changed = 1;
+ goto again;
+ }
+ return changed;
+}
+
+/* Check for already reserved areas */
+static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
+{
+ int i;
+ u64 addr = *addrp, last;
+ u64 size = *sizep;
+ int changed = 0;
+again:
+ last = addr + size;
+ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+ struct early_res *r = &early_res[i];
+ if (last > r->start && addr < r->start) {
+ size = r->start - addr;
+ changed = 1;
+ goto again;
+ }
+ if (last > r->end && addr < r->end) {
+ addr = round_up(r->end, align);
+ size = last - addr;
+ changed = 1;
+ goto again;
+ }
+ if (last <= r->end && addr >= r->start) {
+ (*sizep)++;
+ return 0;
+ }
+ }
+ if (changed) {
+ *addrp = addr;
+ *sizep = size;
+ }
+ return changed;
+}
+
+/*
+ * Find a free area with specified alignment in a specific range.
+ */
+u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
+{
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ u64 addr, last;
+ u64 ei_last;
+
+ if (ei->type != E820_RAM)
+ continue;
+ addr = round_up(ei->addr, align);
+ ei_last = ei->addr + ei->size;
+ if (addr < start)
+ addr = round_up(start, align);
+ if (addr >= ei_last)
+ continue;
+ while (bad_addr(&addr, size, align) && addr+size <= ei_last)
+ ;
+ last = addr + size;
+ if (last > ei_last)
+ continue;
+ if (last > end)
+ continue;
+ return addr;
+ }
+ return -1ULL;
+}
+
+/*
+ * Find next free range after *start
+ */
+u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
+{
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ u64 addr, last;
+ u64 ei_last;
+
+ if (ei->type != E820_RAM)
+ continue;
+ addr = round_up(ei->addr, align);
+ ei_last = ei->addr + ei->size;
+ if (addr < start)
+ addr = round_up(start, align);
+ if (addr >= ei_last)
+ continue;
+ *sizep = ei_last - addr;
+ while (bad_addr_size(&addr, sizep, align) &&
+ addr + *sizep <= ei_last)
+ ;
+ last = addr + *sizep;
+ if (last > ei_last)
+ continue;
+ return addr;
+ }
+ return -1UL;
+
+}
+
+/*
+ * pre allocated 4k and reserved it in e820
+ */
+u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
+{
+ u64 size = 0;
+ u64 addr;
+ u64 start;
+
+ start = startt;
+ while (size < sizet)
+ start = find_e820_area_size(start, &size, align);
+
+ if (size < sizet)
+ return 0;
+
+ addr = round_down(start + size - sizet, align);
+ e820_update_range(addr, sizet, E820_RAM, E820_RESERVED);
+ e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED);
+ printk(KERN_INFO "update e820 for early_reserve_e820\n");
+ update_e820();
+ update_e820_saved();
+
+ return addr;
+}
+
+#ifdef CONFIG_X86_32
+# ifdef CONFIG_X86_PAE
+# define MAX_ARCH_PFN (1ULL<<(36-PAGE_SHIFT))
+# else
+# define MAX_ARCH_PFN (1ULL<<(32-PAGE_SHIFT))
+# endif
+#else /* CONFIG_X86_32 */
+# define MAX_ARCH_PFN MAXMEM>>PAGE_SHIFT
+#endif
+
+/*
+ * Find the highest page frame number we have available
+ */
+static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
+{
+ int i;
+ unsigned long last_pfn = 0;
+ unsigned long max_arch_pfn = MAX_ARCH_PFN;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ unsigned long start_pfn;
+ unsigned long end_pfn;
+
+ if (ei->type != type)
+ continue;
+
+ start_pfn = ei->addr >> PAGE_SHIFT;
+ end_pfn = (ei->addr + ei->size) >> PAGE_SHIFT;
+
+ if (start_pfn >= limit_pfn)
+ continue;
+ if (end_pfn > limit_pfn) {
+ last_pfn = limit_pfn;
+ break;
+ }
+ if (end_pfn > last_pfn)
+ last_pfn = end_pfn;
+ }
+
+ if (last_pfn > max_arch_pfn)
+ last_pfn = max_arch_pfn;
+
+ printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n",
+ last_pfn, max_arch_pfn);
+ return last_pfn;
+}
+unsigned long __init e820_end_of_ram_pfn(void)
+{
+ return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
+}
+
+unsigned long __init e820_end_of_low_ram_pfn(void)
+{
+ return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM);
+}
+/*
+ * Finds an active region in the address range from start_pfn to last_pfn and
+ * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
+ */
+int __init e820_find_active_region(const struct e820entry *ei,
+ unsigned long start_pfn,
+ unsigned long last_pfn,
+ unsigned long *ei_startpfn,
+ unsigned long *ei_endpfn)
+{
+ u64 align = PAGE_SIZE;
+
+ *ei_startpfn = round_up(ei->addr, align) >> PAGE_SHIFT;
+ *ei_endpfn = round_down(ei->addr + ei->size, align) >> PAGE_SHIFT;
+
+ /* Skip map entries smaller than a page */
+ if (*ei_startpfn >= *ei_endpfn)
+ return 0;
+
+ /* Skip if map is outside the node */
+ if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
+ *ei_startpfn >= last_pfn)
+ return 0;
+
+ /* Check for overlaps */
+ if (*ei_startpfn < start_pfn)
+ *ei_startpfn = start_pfn;
+ if (*ei_endpfn > last_pfn)
+ *ei_endpfn = last_pfn;
+
+ return 1;
+}
+
+/* Walk the e820 map and register active regions within a node */
+void __init e820_register_active_regions(int nid, unsigned long start_pfn,
+ unsigned long last_pfn)
+{
+ unsigned long ei_startpfn;
+ unsigned long ei_endpfn;
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++)
+ if (e820_find_active_region(&e820.map[i],
+ start_pfn, last_pfn,
+ &ei_startpfn, &ei_endpfn))
+ add_active_range(nid, ei_startpfn, ei_endpfn);
+}
+
+/*
+ * Find the hole size (in bytes) in the memory range.
+ * @start: starting address of the memory range to scan
+ * @end: ending address of the memory range to scan
+ */
+u64 __init e820_hole_size(u64 start, u64 end)
+{
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long last_pfn = end >> PAGE_SHIFT;
+ unsigned long ei_startpfn, ei_endpfn, ram = 0;
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ if (e820_find_active_region(&e820.map[i],
+ start_pfn, last_pfn,
+ &ei_startpfn, &ei_endpfn))
+ ram += ei_endpfn - ei_startpfn;
+ }
+ return end - start - ((u64)ram << PAGE_SHIFT);
+}
+
+static void early_panic(char *msg)
+{
+ early_printk(msg);
+ panic(msg);
+}
+
+static int userdef __initdata;
+
+/* "mem=nopentium" disables the 4MB page tables. */
+static int __init parse_memopt(char *p)
+{
+ u64 mem_size;
+
+ if (!p)
+ return -EINVAL;
+
+#ifdef CONFIG_X86_32
+ if (!strcmp(p, "nopentium")) {
+ setup_clear_cpu_cap(X86_FEATURE_PSE);
+ return 0;
+ }
+#endif
+
+ userdef = 1;
+ mem_size = memparse(p, &p);
+ e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
+
+ return 0;
+}
+early_param("mem", parse_memopt);
+
+static int __init parse_memmap_opt(char *p)
+{
+ char *oldp;
+ u64 start_at, mem_size;
+
+ if (!p)
+ return -EINVAL;
+
+ if (!strcmp(p, "exactmap")) {
+#ifdef CONFIG_CRASH_DUMP
+ /*
+ * If we are doing a crash dump, we still need to know
+ * the real mem size before original memory map is
+ * reset.
+ */
+ saved_max_pfn = e820_end_of_ram_pfn();
+#endif
+ e820.nr_map = 0;
+ userdef = 1;
+ return 0;
+ }
+
+ oldp = p;
+ mem_size = memparse(p, &p);
+ if (p == oldp)
+ return -EINVAL;
+
+ userdef = 1;
+ if (*p == '@') {
+ start_at = memparse(p+1, &p);
+ e820_add_region(start_at, mem_size, E820_RAM);
+ } else if (*p == '#') {
+ start_at = memparse(p+1, &p);
+ e820_add_region(start_at, mem_size, E820_ACPI);
+ } else if (*p == '$') {
+ start_at = memparse(p+1, &p);
+ e820_add_region(start_at, mem_size, E820_RESERVED);
+ } else
+ e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
+
+ return *p == '\0' ? 0 : -EINVAL;
+}
+early_param("memmap", parse_memmap_opt);
+
+void __init finish_e820_parsing(void)
+{
+ if (userdef) {
+ int nr = e820.nr_map;
+
+ if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0)
+ early_panic("Invalid user supplied memory map");
+ e820.nr_map = nr;
+
+ printk(KERN_INFO "user-defined physical RAM map:\n");
+ e820_print_map("user");
+ }
+}
+
+static inline const char *e820_type_to_string(int e820_type)
+{
+ switch (e820_type) {
+ case E820_RESERVED_KERN:
+ case E820_RAM: return "System RAM";
+ case E820_ACPI: return "ACPI Tables";
+ case E820_NVS: return "ACPI Non-volatile Storage";
+ default: return "reserved";
+ }
+}
+
+/*
+ * Mark e820 reserved areas as busy for the resource manager.
+ */
+void __init e820_reserve_resources(void)
+{
+ int i;
+ struct resource *res;
+ u64 end;
+
+ res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map);
+ for (i = 0; i < e820.nr_map; i++) {
+ end = e820.map[i].addr + e820.map[i].size - 1;
+#ifndef CONFIG_RESOURCES_64BIT
+ if (end > 0x100000000ULL) {
+ res++;
+ continue;
+ }
+#endif
+ res->name = e820_type_to_string(e820.map[i].type);
+ res->start = e820.map[i].addr;
+ res->end = end;
+
+ res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ insert_resource(&iomem_resource, res);
+ res++;
+ }
+
+ for (i = 0; i < e820_saved.nr_map; i++) {
+ struct e820entry *entry = &e820_saved.map[i];
+ firmware_map_add_early(entry->addr,
+ entry->addr + entry->size - 1,
+ e820_type_to_string(entry->type));
+ }
+}
+
+/*
+ * Non-standard memory setup can be specified via this quirk:
+ */
+char * (*arch_memory_setup_quirk)(void);
+
+char *__init default_machine_specific_memory_setup(void)
+{
+ char *who = "BIOS-e820";
+ int new_nr;
+ /*
+ * Try to copy the BIOS-supplied E820-map.
+ *
+ * Otherwise fake a memory map; one section from 0k->640k,
+ * the next section from 1mb->appropriate_mem_k
+ */
+ new_nr = boot_params.e820_entries;
+ sanitize_e820_map(boot_params.e820_map,
+ ARRAY_SIZE(boot_params.e820_map),
+ &new_nr);
+ boot_params.e820_entries = new_nr;
+ if (append_e820_map(boot_params.e820_map, boot_params.e820_entries)
+ < 0) {
+ u64 mem_size;
+
+ /* compare results from other methods and take the greater */
+ if (boot_params.alt_mem_k
+ < boot_params.screen_info.ext_mem_k) {
+ mem_size = boot_params.screen_info.ext_mem_k;
+ who = "BIOS-88";
+ } else {
+ mem_size = boot_params.alt_mem_k;
+ who = "BIOS-e801";
+ }
+
+ e820.nr_map = 0;
+ e820_add_region(0, LOWMEMSIZE(), E820_RAM);
+ e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
+ }
+
+ /* In case someone cares... */
+ return who;
+}
+
+char *__init __attribute__((weak)) machine_specific_memory_setup(void)
+{
+ if (arch_memory_setup_quirk) {
+ char *who = arch_memory_setup_quirk();
+
+ if (who)
+ return who;
+ }
+ return default_machine_specific_memory_setup();
+}
+
+/* Overridden in paravirt.c if CONFIG_PARAVIRT */
+char * __init __attribute__((weak)) memory_setup(void)
+{
+ return machine_specific_memory_setup();
+}
+
+void __init setup_memory_map(void)
+{
+ char *who;
+
+ who = memory_setup();
+ memcpy(&e820_saved, &e820, sizeof(struct e820map));
+ printk(KERN_INFO "BIOS-provided physical RAM map:\n");
+ e820_print_map(who);
+}
+
+#ifdef CONFIG_X86_64
+int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
+{
+ int i;
+
+ if (slot < 0 || slot >= e820.nr_map)
+ return -1;
+ for (i = slot; i < e820.nr_map; i++) {
+ if (e820.map[i].type != E820_RAM)
+ continue;
+ break;
+ }
+ if (i == e820.nr_map || e820.map[i].addr > (max_pfn << PAGE_SHIFT))
+ return -1;
+ *addr = e820.map[i].addr;
+ *size = min_t(u64, e820.map[i].size + e820.map[i].addr,
+ max_pfn << PAGE_SHIFT) - *addr;
+ return i + 1;
+}
+#endif
diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c
deleted file mode 100644
index ed733e7cf4e..00000000000
--- a/arch/x86/kernel/e820_32.c
+++ /dev/null
@@ -1,775 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/bootmem.h>
-#include <linux/ioport.h>
-#include <linux/string.h>
-#include <linux/kexec.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/pfn.h>
-#include <linux/uaccess.h>
-#include <linux/suspend.h>
-
-#include <asm/pgtable.h>
-#include <asm/page.h>
-#include <asm/e820.h>
-#include <asm/setup.h>
-
-struct e820map e820;
-struct change_member {
- struct e820entry *pbios; /* pointer to original bios entry */
- unsigned long long addr; /* address for this change point */
-};
-static struct change_member change_point_list[2*E820MAX] __initdata;
-static struct change_member *change_point[2*E820MAX] __initdata;
-static struct e820entry *overlap_list[E820MAX] __initdata;
-static struct e820entry new_bios[E820MAX] __initdata;
-/* For PCI or other memory-mapped resources */
-unsigned long pci_mem_start = 0x10000000;
-#ifdef CONFIG_PCI
-EXPORT_SYMBOL(pci_mem_start);
-#endif
-extern int user_defined_memmap;
-
-static struct resource system_rom_resource = {
- .name = "System ROM",
- .start = 0xf0000,
- .end = 0xfffff,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-};
-
-static struct resource extension_rom_resource = {
- .name = "Extension ROM",
- .start = 0xe0000,
- .end = 0xeffff,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-};
-
-static struct resource adapter_rom_resources[] = { {
- .name = "Adapter ROM",
- .start = 0xc8000,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
- .name = "Adapter ROM",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
- .name = "Adapter ROM",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
- .name = "Adapter ROM",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
- .name = "Adapter ROM",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
- .name = "Adapter ROM",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-} };
-
-static struct resource video_rom_resource = {
- .name = "Video ROM",
- .start = 0xc0000,
- .end = 0xc7fff,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-};
-
-#define ROMSIGNATURE 0xaa55
-
-static int __init romsignature(const unsigned char *rom)
-{
- const unsigned short * const ptr = (const unsigned short *)rom;
- unsigned short sig;
-
- return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
-}
-
-static int __init romchecksum(const unsigned char *rom, unsigned long length)
-{
- unsigned char sum, c;
-
- for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
- sum += c;
- return !length && !sum;
-}
-
-static void __init probe_roms(void)
-{
- const unsigned char *rom;
- unsigned long start, length, upper;
- unsigned char c;
- int i;
-
- /* video rom */
- upper = adapter_rom_resources[0].start;
- for (start = video_rom_resource.start; start < upper; start += 2048) {
- rom = isa_bus_to_virt(start);
- if (!romsignature(rom))
- continue;
-
- video_rom_resource.start = start;
-
- if (probe_kernel_address(rom + 2, c) != 0)
- continue;
-
- /* 0 < length <= 0x7f * 512, historically */
- length = c * 512;
-
- /* if checksum okay, trust length byte */
- if (length && romchecksum(rom, length))
- video_rom_resource.end = start + length - 1;
-
- request_resource(&iomem_resource, &video_rom_resource);
- break;
- }
-
- start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
- if (start < upper)
- start = upper;
-
- /* system rom */
- request_resource(&iomem_resource, &system_rom_resource);
- upper = system_rom_resource.start;
-
- /* check for extension rom (ignore length byte!) */
- rom = isa_bus_to_virt(extension_rom_resource.start);
- if (romsignature(rom)) {
- length = extension_rom_resource.end - extension_rom_resource.start + 1;
- if (romchecksum(rom, length)) {
- request_resource(&iomem_resource, &extension_rom_resource);
- upper = extension_rom_resource.start;
- }
- }
-
- /* check for adapter roms on 2k boundaries */
- for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
- rom = isa_bus_to_virt(start);
- if (!romsignature(rom))
- continue;
-
- if (probe_kernel_address(rom + 2, c) != 0)
- continue;
-
- /* 0 < length <= 0x7f * 512, historically */
- length = c * 512;
-
- /* but accept any length that fits if checksum okay */
- if (!length || start + length > upper || !romchecksum(rom, length))
- continue;
-
- adapter_rom_resources[i].start = start;
- adapter_rom_resources[i].end = start + length - 1;
- request_resource(&iomem_resource, &adapter_rom_resources[i]);
-
- start = adapter_rom_resources[i++].end & ~2047UL;
- }
-}
-
-/*
- * Request address space for all standard RAM and ROM resources
- * and also for regions reported as reserved by the e820.
- */
-void __init init_iomem_resources(struct resource *code_resource,
- struct resource *data_resource,
- struct resource *bss_resource)
-{
- int i;
-
- probe_roms();
- for (i = 0; i < e820.nr_map; i++) {
- struct resource *res;
-#ifndef CONFIG_RESOURCES_64BIT
- if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
- continue;
-#endif
- res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
- switch (e820.map[i].type) {
- case E820_RAM: res->name = "System RAM"; break;
- case E820_ACPI: res->name = "ACPI Tables"; break;
- case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
- default: res->name = "reserved";
- }
- res->start = e820.map[i].addr;
- res->end = res->start + e820.map[i].size - 1;
- res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
- if (request_resource(&iomem_resource, res)) {
- kfree(res);
- continue;
- }
- if (e820.map[i].type == E820_RAM) {
- /*
- * We don't know which RAM region contains kernel data,
- * so we try it repeatedly and let the resource manager
- * test it.
- */
- request_resource(res, code_resource);
- request_resource(res, data_resource);
- request_resource(res, bss_resource);
-#ifdef CONFIG_KEXEC
- if (crashk_res.start != crashk_res.end)
- request_resource(res, &crashk_res);
-#endif
- }
- }
-}
-
-#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION)
-/**
- * e820_mark_nosave_regions - Find the ranges of physical addresses that do not
- * correspond to e820 RAM areas and mark the corresponding pages as nosave for
- * hibernation.
- *
- * This function requires the e820 map to be sorted and without any
- * overlapping entries and assumes the first e820 area to be RAM.
- */
-void __init e820_mark_nosave_regions(void)
-{
- int i;
- unsigned long pfn;
-
- pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
- for (i = 1; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
-
- if (pfn < PFN_UP(ei->addr))
- register_nosave_region(pfn, PFN_UP(ei->addr));
-
- pfn = PFN_DOWN(ei->addr + ei->size);
- if (ei->type != E820_RAM)
- register_nosave_region(PFN_UP(ei->addr), pfn);
-
- if (pfn >= max_low_pfn)
- break;
- }
-}
-#endif
-
-void __init add_memory_region(unsigned long long start,
- unsigned long long size, int type)
-{
- int x;
-
- x = e820.nr_map;
-
- if (x == E820MAX) {
- printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
- return;
- }
-
- e820.map[x].addr = start;
- e820.map[x].size = size;
- e820.map[x].type = type;
- e820.nr_map++;
-} /* add_memory_region */
-
-/*
- * Sanitize the BIOS e820 map.
- *
- * Some e820 responses include overlapping entries. The following
- * replaces the original e820 map with a new one, removing overlaps.
- *
- */
-int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
-{
- struct change_member *change_tmp;
- unsigned long current_type, last_type;
- unsigned long long last_addr;
- int chgidx, still_changing;
- int overlap_entries;
- int new_bios_entry;
- int old_nr, new_nr, chg_nr;
- int i;
-
- /*
- Visually we're performing the following (1,2,3,4 = memory types)...
-
- Sample memory map (w/overlaps):
- ____22__________________
- ______________________4_
- ____1111________________
- _44_____________________
- 11111111________________
- ____________________33__
- ___________44___________
- __________33333_________
- ______________22________
- ___________________2222_
- _________111111111______
- _____________________11_
- _________________4______
-
- Sanitized equivalent (no overlap):
- 1_______________________
- _44_____________________
- ___1____________________
- ____22__________________
- ______11________________
- _________1______________
- __________3_____________
- ___________44___________
- _____________33_________
- _______________2________
- ________________1_______
- _________________4______
- ___________________2____
- ____________________33__
- ______________________4_
- */
- /* if there's only one memory region, don't bother */
- if (*pnr_map < 2) {
- return -1;
- }
-
- old_nr = *pnr_map;
-
- /* bail out if we find any unreasonable addresses in bios map */
- for (i=0; i<old_nr; i++)
- if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
- return -1;
- }
-
- /* create pointers for initial change-point information (for sorting) */
- for (i=0; i < 2*old_nr; i++)
- change_point[i] = &change_point_list[i];
-
- /* record all known change-points (starting and ending addresses),
- omitting those that are for empty memory regions */
- chgidx = 0;
- for (i=0; i < old_nr; i++) {
- if (biosmap[i].size != 0) {
- change_point[chgidx]->addr = biosmap[i].addr;
- change_point[chgidx++]->pbios = &biosmap[i];
- change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
- change_point[chgidx++]->pbios = &biosmap[i];
- }
- }
- chg_nr = chgidx; /* true number of change-points */
-
- /* sort change-point list by memory addresses (low -> high) */
- still_changing = 1;
- while (still_changing) {
- still_changing = 0;
- for (i=1; i < chg_nr; i++) {
- /* if <current_addr> > <last_addr>, swap */
- /* or, if current=<start_addr> & last=<end_addr>, swap */
- if ((change_point[i]->addr < change_point[i-1]->addr) ||
- ((change_point[i]->addr == change_point[i-1]->addr) &&
- (change_point[i]->addr == change_point[i]->pbios->addr) &&
- (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
- )
- {
- change_tmp = change_point[i];
- change_point[i] = change_point[i-1];
- change_point[i-1] = change_tmp;
- still_changing=1;
- }
- }
- }
-
- /* create a new bios memory map, removing overlaps */
- overlap_entries=0; /* number of entries in the overlap table */
- new_bios_entry=0; /* index for creating new bios map entries */
- last_type = 0; /* start with undefined memory type */
- last_addr = 0; /* start with 0 as last starting address */
- /* loop through change-points, determining affect on the new bios map */
- for (chgidx=0; chgidx < chg_nr; chgidx++)
- {
- /* keep track of all overlapping bios entries */
- if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
- {
- /* add map entry to overlap list (> 1 entry implies an overlap) */
- overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
- }
- else
- {
- /* remove entry from list (order independent, so swap with last) */
- for (i=0; i<overlap_entries; i++)
- {
- if (overlap_list[i] == change_point[chgidx]->pbios)
- overlap_list[i] = overlap_list[overlap_entries-1];
- }
- overlap_entries--;
- }
- /* if there are overlapping entries, decide which "type" to use */
- /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
- current_type = 0;
- for (i=0; i<overlap_entries; i++)
- if (overlap_list[i]->type > current_type)
- current_type = overlap_list[i]->type;
- /* continue building up new bios map based on this information */
- if (current_type != last_type) {
- if (last_type != 0) {
- new_bios[new_bios_entry].size =
- change_point[chgidx]->addr - last_addr;
- /* move forward only if the new size was non-zero */
- if (new_bios[new_bios_entry].size != 0)
- if (++new_bios_entry >= E820MAX)
- break; /* no more space left for new bios entries */
- }
- if (current_type != 0) {
- new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
- new_bios[new_bios_entry].type = current_type;
- last_addr=change_point[chgidx]->addr;
- }
- last_type = current_type;
- }
- }
- new_nr = new_bios_entry; /* retain count for new bios entries */
-
- /* copy new bios mapping into original location */
- memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
- *pnr_map = new_nr;
-
- return 0;
-}
-
-/*
- * Copy the BIOS e820 map into a safe place.
- *
- * Sanity-check it while we're at it..
- *
- * If we're lucky and live on a modern system, the setup code
- * will have given us a memory map that we can use to properly
- * set up memory. If we aren't, we'll fake a memory map.
- *
- * We check to see that the memory map contains at least 2 elements
- * before we'll use it, because the detection code in setup.S may
- * not be perfect and most every PC known to man has two memory
- * regions: one from 0 to 640k, and one from 1mb up. (The IBM
- * thinkpad 560x, for example, does not cooperate with the memory
- * detection code.)
- */
-int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
-{
- /* Only one memory region (or negative)? Ignore it */
- if (nr_map < 2)
- return -1;
-
- do {
- u64 start = biosmap->addr;
- u64 size = biosmap->size;
- u64 end = start + size;
- u32 type = biosmap->type;
-
- /* Overflow in 64 bits? Ignore the memory map. */
- if (start > end)
- return -1;
-
- add_memory_region(start, size, type);
- } while (biosmap++, --nr_map);
-
- return 0;
-}
-
-/*
- * Find the highest page frame number we have available
- */
-void __init propagate_e820_map(void)
-{
- int i;
-
- max_pfn = 0;
-
- for (i = 0; i < e820.nr_map; i++) {
- unsigned long start, end;
- /* RAM? */
- if (e820.map[i].type != E820_RAM)
- continue;
- start = PFN_UP(e820.map[i].addr);
- end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
- if (start >= end)
- continue;
- if (end > max_pfn)
- max_pfn = end;
- memory_present(0, start, end);
- }
-}
-
-/*
- * Register fully available low RAM pages with the bootmem allocator.
- */
-void __init register_bootmem_low_pages(unsigned long max_low_pfn)
-{
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- unsigned long curr_pfn, last_pfn, size;
- /*
- * Reserve usable low memory
- */
- if (e820.map[i].type != E820_RAM)
- continue;
- /*
- * We are rounding up the start address of usable memory:
- */
- curr_pfn = PFN_UP(e820.map[i].addr);
- if (curr_pfn >= max_low_pfn)
- continue;
- /*
- * ... and at the end of the usable range downwards:
- */
- last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
-
- if (last_pfn > max_low_pfn)
- last_pfn = max_low_pfn;
-
- /*
- * .. finally, did all the rounding and playing
- * around just make the area go away?
- */
- if (last_pfn <= curr_pfn)
- continue;
-
- size = last_pfn - curr_pfn;
- free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
- }
-}
-
-void __init e820_register_memory(void)
-{
- unsigned long gapstart, gapsize, round;
- unsigned long long last;
- int i;
-
- /*
- * Search for the biggest gap in the low 32 bits of the e820
- * memory space.
- */
- last = 0x100000000ull;
- gapstart = 0x10000000;
- gapsize = 0x400000;
- i = e820.nr_map;
- while (--i >= 0) {
- unsigned long long start = e820.map[i].addr;
- unsigned long long end = start + e820.map[i].size;
-
- /*
- * Since "last" is at most 4GB, we know we'll
- * fit in 32 bits if this condition is true
- */
- if (last > end) {
- unsigned long gap = last - end;
-
- if (gap > gapsize) {
- gapsize = gap;
- gapstart = end;
- }
- }
- if (start < last)
- last = start;
- }
-
- /*
- * See how much we want to round up: start off with
- * rounding to the next 1MB area.
- */
- round = 0x100000;
- while ((gapsize >> 4) > round)
- round += round;
- /* Fun with two's complement */
- pci_mem_start = (gapstart + round) & -round;
-
- printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
- pci_mem_start, gapstart, gapsize);
-}
-
-void __init print_memory_map(char *who)
-{
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- printk(" %s: %016Lx - %016Lx ", who,
- e820.map[i].addr,
- e820.map[i].addr + e820.map[i].size);
- switch (e820.map[i].type) {
- case E820_RAM: printk("(usable)\n");
- break;
- case E820_RESERVED:
- printk("(reserved)\n");
- break;
- case E820_ACPI:
- printk("(ACPI data)\n");
- break;
- case E820_NVS:
- printk("(ACPI NVS)\n");
- break;
- default: printk("type %u\n", e820.map[i].type);
- break;
- }
- }
-}
-
-void __init limit_regions(unsigned long long size)
-{
- unsigned long long current_addr;
- int i;
-
- print_memory_map("limit_regions start");
- for (i = 0; i < e820.nr_map; i++) {
- current_addr = e820.map[i].addr + e820.map[i].size;
- if (current_addr < size)
- continue;
-
- if (e820.map[i].type != E820_RAM)
- continue;
-
- if (e820.map[i].addr >= size) {
- /*
- * This region starts past the end of the
- * requested size, skip it completely.
- */
- e820.nr_map = i;
- } else {
- e820.nr_map = i + 1;
- e820.map[i].size -= current_addr - size;
- }
- print_memory_map("limit_regions endfor");
- return;
- }
- print_memory_map("limit_regions endfunc");
-}
-
-/*
- * This function checks if any part of the range <start,end> is mapped
- * with type.
- */
-int
-e820_any_mapped(u64 start, u64 end, unsigned type)
-{
- int i;
- for (i = 0; i < e820.nr_map; i++) {
- const struct e820entry *ei = &e820.map[i];
- if (type && ei->type != type)
- continue;
- if (ei->addr >= end || ei->addr + ei->size <= start)
- continue;
- return 1;
- }
- return 0;
-}
-EXPORT_SYMBOL_GPL(e820_any_mapped);
-
- /*
- * This function checks if the entire range <start,end> is mapped with type.
- *
- * Note: this function only works correct if the e820 table is sorted and
- * not-overlapping, which is the case
- */
-int __init
-e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
-{
- u64 start = s;
- u64 end = e;
- int i;
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- if (type && ei->type != type)
- continue;
- /* is the region (part) in overlap with the current region ?*/
- if (ei->addr >= end || ei->addr + ei->size <= start)
- continue;
- /* if the region is at the beginning of <start,end> we move
- * start to the end of the region since it's ok until there
- */
- if (ei->addr <= start)
- start = ei->addr + ei->size;
- /* if start is now at or beyond end, we're done, full
- * coverage */
- if (start >= end)
- return 1; /* we're done */
- }
- return 0;
-}
-
-static int __init parse_memmap(char *arg)
-{
- if (!arg)
- return -EINVAL;
-
- if (strcmp(arg, "exactmap") == 0) {
-#ifdef CONFIG_CRASH_DUMP
- /* If we are doing a crash dump, we
- * still need to know the real mem
- * size before original memory map is
- * reset.
- */
- propagate_e820_map();
- saved_max_pfn = max_pfn;
-#endif
- e820.nr_map = 0;
- user_defined_memmap = 1;
- } else {
- /* If the user specifies memory size, we
- * limit the BIOS-provided memory map to
- * that size. exactmap can be used to specify
- * the exact map. mem=number can be used to
- * trim the existing memory map.
- */
- unsigned long long start_at, mem_size;
-
- mem_size = memparse(arg, &arg);
- if (*arg == '@') {
- start_at = memparse(arg+1, &arg);
- add_memory_region(start_at, mem_size, E820_RAM);
- } else if (*arg == '#') {
- start_at = memparse(arg+1, &arg);
- add_memory_region(start_at, mem_size, E820_ACPI);
- } else if (*arg == '$') {
- start_at = memparse(arg+1, &arg);
- add_memory_region(start_at, mem_size, E820_RESERVED);
- } else {
- limit_regions(mem_size);
- user_defined_memmap = 1;
- }
- }
- return 0;
-}
-early_param("memmap", parse_memmap);
-void __init update_memory_range(u64 start, u64 size, unsigned old_type,
- unsigned new_type)
-{
- int i;
-
- BUG_ON(old_type == new_type);
-
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- u64 final_start, final_end;
- if (ei->type != old_type)
- continue;
- /* totally covered? */
- if (ei->addr >= start && ei->size <= size) {
- ei->type = new_type;
- continue;
- }
- /* partially covered */
- final_start = max(start, ei->addr);
- final_end = min(start + size, ei->addr + ei->size);
- if (final_start >= final_end)
- continue;
- add_memory_region(final_start, final_end - final_start,
- new_type);
- }
-}
-void __init update_e820(void)
-{
- u8 nr_map;
-
- nr_map = e820.nr_map;
- if (sanitize_e820_map(e820.map, &nr_map))
- return;
- e820.nr_map = nr_map;
- printk(KERN_INFO "modified physical RAM map:\n");
- print_memory_map("modified");
-}
diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c
deleted file mode 100644
index 124480c0008..00000000000
--- a/arch/x86/kernel/e820_64.c
+++ /dev/null
@@ -1,952 +0,0 @@
-/*
- * Handle the memory map.
- * The functions here do the job until bootmem takes over.
- *
- * Getting sanitize_e820_map() in sync with i386 version by applying change:
- * - Provisions for empty E820 memory regions (reported by certain BIOSes).
- * Alex Achenbach <xela@slit.de>, December 2002.
- * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
- *
- */
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/bootmem.h>
-#include <linux/ioport.h>
-#include <linux/string.h>
-#include <linux/kexec.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/suspend.h>
-#include <linux/pfn.h>
-
-#include <asm/pgtable.h>
-#include <asm/page.h>
-#include <asm/e820.h>
-#include <asm/proto.h>
-#include <asm/setup.h>
-#include <asm/sections.h>
-#include <asm/kdebug.h>
-#include <asm/trampoline.h>
-
-struct e820map e820;
-
-/*
- * PFN of last memory page.
- */
-unsigned long end_pfn;
-
-/*
- * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
- * The direct mapping extends to max_pfn_mapped, so that we can directly access
- * apertures, ACPI and other tables without having to play with fixmaps.
- */
-unsigned long max_pfn_mapped;
-
-/*
- * Last pfn which the user wants to use.
- */
-static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
-
-/*
- * Early reserved memory areas.
- */
-#define MAX_EARLY_RES 20
-
-struct early_res {
- unsigned long start, end;
- char name[16];
-};
-static struct early_res early_res[MAX_EARLY_RES] __initdata = {
- { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
-#ifdef CONFIG_X86_TRAMPOLINE
- { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
-#endif
- {}
-};
-
-void __init reserve_early(unsigned long start, unsigned long end, char *name)
-{
- int i;
- struct early_res *r;
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
- r = &early_res[i];
- if (end > r->start && start < r->end)
- panic("Overlapping early reservations %lx-%lx %s to %lx-%lx %s\n",
- start, end - 1, name?name:"", r->start, r->end - 1, r->name);
- }
- if (i >= MAX_EARLY_RES)
- panic("Too many early reservations");
- r = &early_res[i];
- r->start = start;
- r->end = end;
- if (name)
- strncpy(r->name, name, sizeof(r->name) - 1);
-}
-
-void __init free_early(unsigned long start, unsigned long end)
-{
- struct early_res *r;
- int i, j;
-
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
- r = &early_res[i];
- if (start == r->start && end == r->end)
- break;
- }
- if (i >= MAX_EARLY_RES || !early_res[i].end)
- panic("free_early on not reserved area: %lx-%lx!", start, end);
-
- for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
- ;
-
- memmove(&early_res[i], &early_res[i + 1],
- (j - 1 - i) * sizeof(struct early_res));
-
- early_res[j - 1].end = 0;
-}
-
-void __init early_res_to_bootmem(unsigned long start, unsigned long end)
-{
- int i;
- unsigned long final_start, final_end;
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
- struct early_res *r = &early_res[i];
- final_start = max(start, r->start);
- final_end = min(end, r->end);
- if (final_start >= final_end)
- continue;
- printk(KERN_INFO " early res: %d [%lx-%lx] %s\n", i,
- final_start, final_end - 1, r->name);
- reserve_bootmem_generic(final_start, final_end - final_start);
- }
-}
-
-/* Check for already reserved areas */
-static inline int __init
-bad_addr(unsigned long *addrp, unsigned long size, unsigned long align)
-{
- int i;
- unsigned long addr = *addrp, last;
- int changed = 0;
-again:
- last = addr + size;
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
- struct early_res *r = &early_res[i];
- if (last >= r->start && addr < r->end) {
- *addrp = addr = round_up(r->end, align);
- changed = 1;
- goto again;
- }
- }
- return changed;
-}
-
-/* Check for already reserved areas */
-static inline int __init
-bad_addr_size(unsigned long *addrp, unsigned long *sizep, unsigned long align)
-{
- int i;
- unsigned long addr = *addrp, last;
- unsigned long size = *sizep;
- int changed = 0;
-again:
- last = addr + size;
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
- struct early_res *r = &early_res[i];
- if (last > r->start && addr < r->start) {
- size = r->start - addr;
- changed = 1;
- goto again;
- }
- if (last > r->end && addr < r->end) {
- addr = round_up(r->end, align);
- size = last - addr;
- changed = 1;
- goto again;
- }
- if (last <= r->end && addr >= r->start) {
- (*sizep)++;
- return 0;
- }
- }
- if (changed) {
- *addrp = addr;
- *sizep = size;
- }
- return changed;
-}
-/*
- * This function checks if any part of the range <start,end> is mapped
- * with type.
- */
-int
-e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
-{
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
-
- if (type && ei->type != type)
- continue;
- if (ei->addr >= end || ei->addr + ei->size <= start)
- continue;
- return 1;
- }
- return 0;
-}
-EXPORT_SYMBOL_GPL(e820_any_mapped);
-
-/*
- * This function checks if the entire range <start,end> is mapped with type.
- *
- * Note: this function only works correct if the e820 table is sorted and
- * not-overlapping, which is the case
- */
-int __init e820_all_mapped(unsigned long start, unsigned long end,
- unsigned type)
-{
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
-
- if (type && ei->type != type)
- continue;
- /* is the region (part) in overlap with the current region ?*/
- if (ei->addr >= end || ei->addr + ei->size <= start)
- continue;
-
- /* if the region is at the beginning of <start,end> we move
- * start to the end of the region since it's ok until there
- */
- if (ei->addr <= start)
- start = ei->addr + ei->size;
- /*
- * if start is now at or beyond end, we're done, full
- * coverage
- */
- if (start >= end)
- return 1;
- }
- return 0;
-}
-
-/*
- * Find a free area with specified alignment in a specific range.
- */
-unsigned long __init find_e820_area(unsigned long start, unsigned long end,
- unsigned long size, unsigned long align)
-{
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- unsigned long addr, last;
- unsigned long ei_last;
-
- if (ei->type != E820_RAM)
- continue;
- addr = round_up(ei->addr, align);
- ei_last = ei->addr + ei->size;
- if (addr < start)
- addr = round_up(start, align);
- if (addr >= ei_last)
- continue;
- while (bad_addr(&addr, size, align) && addr+size <= ei_last)
- ;
- last = addr + size;
- if (last > ei_last)
- continue;
- if (last > end)
- continue;
- return addr;
- }
- return -1UL;
-}
-
-/*
- * Find next free range after *start
- */
-unsigned long __init find_e820_area_size(unsigned long start,
- unsigned long *sizep,
- unsigned long align)
-{
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- unsigned long addr, last;
- unsigned long ei_last;
-
- if (ei->type != E820_RAM)
- continue;
- addr = round_up(ei->addr, align);
- ei_last = ei->addr + ei->size;
- if (addr < start)
- addr = round_up(start, align);
- if (addr >= ei_last)
- continue;
- *sizep = ei_last - addr;
- while (bad_addr_size(&addr, sizep, align) &&
- addr + *sizep <= ei_last)
- ;
- last = addr + *sizep;
- if (last > ei_last)
- continue;
- return addr;
- }
- return -1UL;
-
-}
-/*
- * Find the highest page frame number we have available
- */
-unsigned long __init e820_end_of_ram(void)
-{
- unsigned long end_pfn;
-
- end_pfn = find_max_pfn_with_active_regions();
-
- if (end_pfn > max_pfn_mapped)
- max_pfn_mapped = end_pfn;
- if (max_pfn_mapped > MAXMEM>>PAGE_SHIFT)
- max_pfn_mapped = MAXMEM>>PAGE_SHIFT;
- if (end_pfn > end_user_pfn)
- end_pfn = end_user_pfn;
- if (end_pfn > max_pfn_mapped)
- end_pfn = max_pfn_mapped;
-
- printk(KERN_INFO "max_pfn_mapped = %lu\n", max_pfn_mapped);
- return end_pfn;
-}
-
-/*
- * Mark e820 reserved areas as busy for the resource manager.
- */
-void __init e820_reserve_resources(void)
-{
- int i;
- struct resource *res;
-
- res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map);
- for (i = 0; i < e820.nr_map; i++) {
- switch (e820.map[i].type) {
- case E820_RAM: res->name = "System RAM"; break;
- case E820_ACPI: res->name = "ACPI Tables"; break;
- case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
- default: res->name = "reserved";
- }
- res->start = e820.map[i].addr;
- res->end = res->start + e820.map[i].size - 1;
- res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
- insert_resource(&iomem_resource, res);
- res++;
- }
-}
-
-/*
- * Find the ranges of physical addresses that do not correspond to
- * e820 RAM areas and mark the corresponding pages as nosave for software
- * suspend and suspend to RAM.
- *
- * This function requires the e820 map to be sorted and without any
- * overlapping entries and assumes the first e820 area to be RAM.
- */
-void __init e820_mark_nosave_regions(void)
-{
- int i;
- unsigned long paddr;
-
- paddr = round_down(e820.map[0].addr + e820.map[0].size, PAGE_SIZE);
- for (i = 1; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
-
- if (paddr < ei->addr)
- register_nosave_region(PFN_DOWN(paddr),
- PFN_UP(ei->addr));
-
- paddr = round_down(ei->addr + ei->size, PAGE_SIZE);
- if (ei->type != E820_RAM)
- register_nosave_region(PFN_UP(ei->addr),
- PFN_DOWN(paddr));
-
- if (paddr >= (end_pfn << PAGE_SHIFT))
- break;
- }
-}
-
-/*
- * Finds an active region in the address range from start_pfn to end_pfn and
- * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
- */
-static int __init e820_find_active_region(const struct e820entry *ei,
- unsigned long start_pfn,
- unsigned long end_pfn,
- unsigned long *ei_startpfn,
- unsigned long *ei_endpfn)
-{
- *ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT;
- *ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE) >> PAGE_SHIFT;
-
- /* Skip map entries smaller than a page */
- if (*ei_startpfn >= *ei_endpfn)
- return 0;
-
- /* Check if max_pfn_mapped should be updated */
- if (ei->type != E820_RAM && *ei_endpfn > max_pfn_mapped)
- max_pfn_mapped = *ei_endpfn;
-
- /* Skip if map is outside the node */
- if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
- *ei_startpfn >= end_pfn)
- return 0;
-
- /* Check for overlaps */
- if (*ei_startpfn < start_pfn)
- *ei_startpfn = start_pfn;
- if (*ei_endpfn > end_pfn)
- *ei_endpfn = end_pfn;
-
- /* Obey end_user_pfn to save on memmap */
- if (*ei_startpfn >= end_user_pfn)
- return 0;
- if (*ei_endpfn > end_user_pfn)
- *ei_endpfn = end_user_pfn;
-
- return 1;
-}
-
-/* Walk the e820 map and register active regions within a node */
-void __init
-e820_register_active_regions(int nid, unsigned long start_pfn,
- unsigned long end_pfn)
-{
- unsigned long ei_startpfn;
- unsigned long ei_endpfn;
- int i;
-
- for (i = 0; i < e820.nr_map; i++)
- if (e820_find_active_region(&e820.map[i],
- start_pfn, end_pfn,
- &ei_startpfn, &ei_endpfn))
- add_active_range(nid, ei_startpfn, ei_endpfn);
-}
-
-/*
- * Add a memory region to the kernel e820 map.
- */
-void __init add_memory_region(unsigned long start, unsigned long size, int type)
-{
- int x = e820.nr_map;
-
- if (x == E820MAX) {
- printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
- return;
- }
-
- e820.map[x].addr = start;
- e820.map[x].size = size;
- e820.map[x].type = type;
- e820.nr_map++;
-}
-
-/*
- * Find the hole size (in bytes) in the memory range.
- * @start: starting address of the memory range to scan
- * @end: ending address of the memory range to scan
- */
-unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
-{
- unsigned long start_pfn = start >> PAGE_SHIFT;
- unsigned long end_pfn = end >> PAGE_SHIFT;
- unsigned long ei_startpfn, ei_endpfn, ram = 0;
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- if (e820_find_active_region(&e820.map[i],
- start_pfn, end_pfn,
- &ei_startpfn, &ei_endpfn))
- ram += ei_endpfn - ei_startpfn;
- }
- return end - start - (ram << PAGE_SHIFT);
-}
-
-static void __init e820_print_map(char *who)
-{
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
- (unsigned long long) e820.map[i].addr,
- (unsigned long long)
- (e820.map[i].addr + e820.map[i].size));
- switch (e820.map[i].type) {
- case E820_RAM:
- printk(KERN_CONT "(usable)\n");
- break;
- case E820_RESERVED:
- printk(KERN_CONT "(reserved)\n");
- break;
- case E820_ACPI:
- printk(KERN_CONT "(ACPI data)\n");
- break;
- case E820_NVS:
- printk(KERN_CONT "(ACPI NVS)\n");
- break;
- default:
- printk(KERN_CONT "type %u\n", e820.map[i].type);
- break;
- }
- }
-}
-
-/*
- * Sanitize the BIOS e820 map.
- *
- * Some e820 responses include overlapping entries. The following
- * replaces the original e820 map with a new one, removing overlaps.
- *
- */
-static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map)
-{
- struct change_member {
- struct e820entry *pbios; /* pointer to original bios entry */
- unsigned long long addr; /* address for this change point */
- };
- static struct change_member change_point_list[2*E820MAX] __initdata;
- static struct change_member *change_point[2*E820MAX] __initdata;
- static struct e820entry *overlap_list[E820MAX] __initdata;
- static struct e820entry new_bios[E820MAX] __initdata;
- struct change_member *change_tmp;
- unsigned long current_type, last_type;
- unsigned long long last_addr;
- int chgidx, still_changing;
- int overlap_entries;
- int new_bios_entry;
- int old_nr, new_nr, chg_nr;
- int i;
-
- /*
- Visually we're performing the following
- (1,2,3,4 = memory types)...
-
- Sample memory map (w/overlaps):
- ____22__________________
- ______________________4_
- ____1111________________
- _44_____________________
- 11111111________________
- ____________________33__
- ___________44___________
- __________33333_________
- ______________22________
- ___________________2222_
- _________111111111______
- _____________________11_
- _________________4______
-
- Sanitized equivalent (no overlap):
- 1_______________________
- _44_____________________
- ___1____________________
- ____22__________________
- ______11________________
- _________1______________
- __________3_____________
- ___________44___________
- _____________33_________
- _______________2________
- ________________1_______
- _________________4______
- ___________________2____
- ____________________33__
- ______________________4_
- */
-
- /* if there's only one memory region, don't bother */
- if (*pnr_map < 2)
- return -1;
-
- old_nr = *pnr_map;
-
- /* bail out if we find any unreasonable addresses in bios map */
- for (i = 0; i < old_nr; i++)
- if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
- return -1;
-
- /* create pointers for initial change-point information (for sorting) */
- for (i = 0; i < 2 * old_nr; i++)
- change_point[i] = &change_point_list[i];
-
- /* record all known change-points (starting and ending addresses),
- omitting those that are for empty memory regions */
- chgidx = 0;
- for (i = 0; i < old_nr; i++) {
- if (biosmap[i].size != 0) {
- change_point[chgidx]->addr = biosmap[i].addr;
- change_point[chgidx++]->pbios = &biosmap[i];
- change_point[chgidx]->addr = biosmap[i].addr +
- biosmap[i].size;
- change_point[chgidx++]->pbios = &biosmap[i];
- }
- }
- chg_nr = chgidx;
-
- /* sort change-point list by memory addresses (low -> high) */
- still_changing = 1;
- while (still_changing) {
- still_changing = 0;
- for (i = 1; i < chg_nr; i++) {
- unsigned long long curaddr, lastaddr;
- unsigned long long curpbaddr, lastpbaddr;
-
- curaddr = change_point[i]->addr;
- lastaddr = change_point[i - 1]->addr;
- curpbaddr = change_point[i]->pbios->addr;
- lastpbaddr = change_point[i - 1]->pbios->addr;
-
- /*
- * swap entries, when:
- *
- * curaddr > lastaddr or
- * curaddr == lastaddr and curaddr == curpbaddr and
- * lastaddr != lastpbaddr
- */
- if (curaddr < lastaddr ||
- (curaddr == lastaddr && curaddr == curpbaddr &&
- lastaddr != lastpbaddr)) {
- change_tmp = change_point[i];
- change_point[i] = change_point[i-1];
- change_point[i-1] = change_tmp;
- still_changing = 1;
- }
- }
- }
-
- /* create a new bios memory map, removing overlaps */
- overlap_entries = 0; /* number of entries in the overlap table */
- new_bios_entry = 0; /* index for creating new bios map entries */
- last_type = 0; /* start with undefined memory type */
- last_addr = 0; /* start with 0 as last starting address */
-
- /* loop through change-points, determining affect on the new bios map */
- for (chgidx = 0; chgidx < chg_nr; chgidx++) {
- /* keep track of all overlapping bios entries */
- if (change_point[chgidx]->addr ==
- change_point[chgidx]->pbios->addr) {
- /*
- * add map entry to overlap list (> 1 entry
- * implies an overlap)
- */
- overlap_list[overlap_entries++] =
- change_point[chgidx]->pbios;
- } else {
- /*
- * remove entry from list (order independent,
- * so swap with last)
- */
- for (i = 0; i < overlap_entries; i++) {
- if (overlap_list[i] ==
- change_point[chgidx]->pbios)
- overlap_list[i] =
- overlap_list[overlap_entries-1];
- }
- overlap_entries--;
- }
- /*
- * if there are overlapping entries, decide which
- * "type" to use (larger value takes precedence --
- * 1=usable, 2,3,4,4+=unusable)
- */
- current_type = 0;
- for (i = 0; i < overlap_entries; i++)
- if (overlap_list[i]->type > current_type)
- current_type = overlap_list[i]->type;
- /*
- * continue building up new bios map based on this
- * information
- */
- if (current_type != last_type) {
- if (last_type != 0) {
- new_bios[new_bios_entry].size =
- change_point[chgidx]->addr - last_addr;
- /*
- * move forward only if the new size
- * was non-zero
- */
- if (new_bios[new_bios_entry].size != 0)
- /*
- * no more space left for new
- * bios entries ?
- */
- if (++new_bios_entry >= E820MAX)
- break;
- }
- if (current_type != 0) {
- new_bios[new_bios_entry].addr =
- change_point[chgidx]->addr;
- new_bios[new_bios_entry].type = current_type;
- last_addr = change_point[chgidx]->addr;
- }
- last_type = current_type;
- }
- }
- /* retain count for new bios entries */
- new_nr = new_bios_entry;
-
- /* copy new bios mapping into original location */
- memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
- *pnr_map = new_nr;
-
- return 0;
-}
-
-/*
- * Copy the BIOS e820 map into a safe place.
- *
- * Sanity-check it while we're at it..
- *
- * If we're lucky and live on a modern system, the setup code
- * will have given us a memory map that we can use to properly
- * set up memory. If we aren't, we'll fake a memory map.
- */
-static int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
-{
- /* Only one memory region (or negative)? Ignore it */
- if (nr_map < 2)
- return -1;
-
- do {
- u64 start = biosmap->addr;
- u64 size = biosmap->size;
- u64 end = start + size;
- u32 type = biosmap->type;
-
- /* Overflow in 64 bits? Ignore the memory map. */
- if (start > end)
- return -1;
-
- add_memory_region(start, size, type);
- } while (biosmap++, --nr_map);
- return 0;
-}
-
-static void early_panic(char *msg)
-{
- early_printk(msg);
- panic(msg);
-}
-
-/* We're not void only for x86 32-bit compat */
-char * __init machine_specific_memory_setup(void)
-{
- char *who = "BIOS-e820";
- /*
- * Try to copy the BIOS-supplied E820-map.
- *
- * Otherwise fake a memory map; one section from 0k->640k,
- * the next section from 1mb->appropriate_mem_k
- */
- sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries);
- if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0)
- early_panic("Cannot find a valid memory map");
- printk(KERN_INFO "BIOS-provided physical RAM map:\n");
- e820_print_map(who);
-
- /* In case someone cares... */
- return who;
-}
-
-static int __init parse_memopt(char *p)
-{
- if (!p)
- return -EINVAL;
- end_user_pfn = memparse(p, &p);
- end_user_pfn >>= PAGE_SHIFT;
- return 0;
-}
-early_param("mem", parse_memopt);
-
-static int userdef __initdata;
-
-static int __init parse_memmap_opt(char *p)
-{
- char *oldp;
- unsigned long long start_at, mem_size;
-
- if (!strcmp(p, "exactmap")) {
-#ifdef CONFIG_CRASH_DUMP
- /*
- * If we are doing a crash dump, we still need to know
- * the real mem size before original memory map is
- * reset.
- */
- e820_register_active_regions(0, 0, -1UL);
- saved_max_pfn = e820_end_of_ram();
- remove_all_active_ranges();
-#endif
- max_pfn_mapped = 0;
- e820.nr_map = 0;
- userdef = 1;
- return 0;
- }
-
- oldp = p;
- mem_size = memparse(p, &p);
- if (p == oldp)
- return -EINVAL;
-
- userdef = 1;
- if (*p == '@') {
- start_at = memparse(p+1, &p);
- add_memory_region(start_at, mem_size, E820_RAM);
- } else if (*p == '#') {
- start_at = memparse(p+1, &p);
- add_memory_region(start_at, mem_size, E820_ACPI);
- } else if (*p == '$') {
- start_at = memparse(p+1, &p);
- add_memory_region(start_at, mem_size, E820_RESERVED);
- } else {
- end_user_pfn = (mem_size >> PAGE_SHIFT);
- }
- return *p == '\0' ? 0 : -EINVAL;
-}
-early_param("memmap", parse_memmap_opt);
-
-void __init finish_e820_parsing(void)
-{
- if (userdef) {
- char nr = e820.nr_map;
-
- if (sanitize_e820_map(e820.map, &nr) < 0)
- early_panic("Invalid user supplied memory map");
- e820.nr_map = nr;
-
- printk(KERN_INFO "user-defined physical RAM map:\n");
- e820_print_map("user");
- }
-}
-
-void __init update_memory_range(u64 start, u64 size, unsigned old_type,
- unsigned new_type)
-{
- int i;
-
- BUG_ON(old_type == new_type);
-
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- u64 final_start, final_end;
- if (ei->type != old_type)
- continue;
- /* totally covered? */
- if (ei->addr >= start && ei->size <= size) {
- ei->type = new_type;
- continue;
- }
- /* partially covered */
- final_start = max(start, ei->addr);
- final_end = min(start + size, ei->addr + ei->size);
- if (final_start >= final_end)
- continue;
- add_memory_region(final_start, final_end - final_start,
- new_type);
- }
-}
-
-void __init update_e820(void)
-{
- u8 nr_map;
-
- nr_map = e820.nr_map;
- if (sanitize_e820_map(e820.map, &nr_map))
- return;
- e820.nr_map = nr_map;
- printk(KERN_INFO "modified physical RAM map:\n");
- e820_print_map("modified");
-}
-
-unsigned long pci_mem_start = 0xaeedbabe;
-EXPORT_SYMBOL(pci_mem_start);
-
-/*
- * Search for the biggest gap in the low 32 bits of the e820
- * memory space. We pass this space to PCI to assign MMIO resources
- * for hotplug or unconfigured devices in.
- * Hopefully the BIOS let enough space left.
- */
-__init void e820_setup_gap(void)
-{
- unsigned long gapstart, gapsize, round;
- unsigned long last;
- int i;
- int found = 0;
-
- last = 0x100000000ull;
- gapstart = 0x10000000;
- gapsize = 0x400000;
- i = e820.nr_map;
- while (--i >= 0) {
- unsigned long long start = e820.map[i].addr;
- unsigned long long end = start + e820.map[i].size;
-
- /*
- * Since "last" is at most 4GB, we know we'll
- * fit in 32 bits if this condition is true
- */
- if (last > end) {
- unsigned long gap = last - end;
-
- if (gap > gapsize) {
- gapsize = gap;
- gapstart = end;
- found = 1;
- }
- }
- if (start < last)
- last = start;
- }
-
- if (!found) {
- gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
- printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
- "address range\n"
- KERN_ERR "PCI: Unassigned devices with 32bit resource "
- "registers may break!\n");
- }
-
- /*
- * See how much we want to round up: start off with
- * rounding to the next 1MB area.
- */
- round = 0x100000;
- while ((gapsize >> 4) > round)
- round += round;
- /* Fun with two's complement */
- pci_mem_start = (gapstart + round) & -round;
-
- printk(KERN_INFO
- "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
- pci_mem_start, gapstart, gapsize);
-}
-
-int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
-{
- int i;
-
- if (slot < 0 || slot >= e820.nr_map)
- return -1;
- for (i = slot; i < e820.nr_map; i++) {
- if (e820.map[i].type != E820_RAM)
- continue;
- break;
- }
- if (i == e820.nr_map || e820.map[i].addr > (max_pfn << PAGE_SHIFT))
- return -1;
- *addr = e820.map[i].addr;
- *size = min_t(u64, e820.map[i].size + e820.map[i].addr,
- max_pfn << PAGE_SHIFT) - *addr;
- return i + 1;
-}
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 9f51e1ea9e8..a0e11c0cc87 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -50,7 +50,7 @@ static void __init fix_hypertransport_config(int num, int slot, int func)
static void __init via_bugs(int num, int slot, int func)
{
#ifdef CONFIG_GART_IOMMU
- if ((end_pfn > MAX_DMA32_PFN || force_iommu) &&
+ if ((max_pfn > MAX_DMA32_PFN || force_iommu) &&
!gart_iommu_aperture_allowed) {
printk(KERN_INFO
"Looks like a VIA chipset. Disabling IOMMU."
@@ -98,17 +98,6 @@ static void __init nvidia_bugs(int num, int slot, int func)
}
-static void __init ati_bugs(int num, int slot, int func)
-{
-#ifdef CONFIG_X86_IO_APIC
- if (timer_over_8254 == 1) {
- timer_over_8254 = 0;
- printk(KERN_INFO
- "ATI board detected. Disabling timer routing over 8254.\n");
- }
-#endif
-}
-
#define QFLAG_APPLY_ONCE 0x1
#define QFLAG_APPLIED 0x2
#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED)
@@ -126,14 +115,23 @@ static struct chipset early_qrk[] __initdata = {
PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, nvidia_bugs },
{ PCI_VENDOR_ID_VIA, PCI_ANY_ID,
PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs },
- { PCI_VENDOR_ID_ATI, PCI_ANY_ID,
- PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, ati_bugs },
{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config },
{}
};
-static void __init check_dev_quirk(int num, int slot, int func)
+/**
+ * check_dev_quirk - apply early quirks to a given PCI device
+ * @num: bus number
+ * @slot: slot number
+ * @func: PCI function
+ *
+ * Check the vendor & device ID against the early quirks table.
+ *
+ * If the device is single function, let early_quirks() know so we don't
+ * poke at this device again.
+ */
+static int __init check_dev_quirk(int num, int slot, int func)
{
u16 class;
u16 vendor;
@@ -144,7 +142,7 @@ static void __init check_dev_quirk(int num, int slot, int func)
class = read_pci_config_16(num, slot, func, PCI_CLASS_DEVICE);
if (class == 0xffff)
- return;
+ return -1; /* no class, treat as single function */
vendor = read_pci_config_16(num, slot, func, PCI_VENDOR_ID);
@@ -167,7 +165,9 @@ static void __init check_dev_quirk(int num, int slot, int func)
type = read_pci_config_byte(num, slot, func,
PCI_HEADER_TYPE);
if (!(type & 0x80))
- return;
+ return -1;
+
+ return 0;
}
void __init early_quirks(void)
@@ -180,6 +180,9 @@ void __init early_quirks(void)
/* Poor man's PCI discovery */
for (num = 0; num < 32; num++)
for (slot = 0; slot < 32; slot++)
- for (func = 0; func < 8; func++)
- check_dev_quirk(num, slot, func);
+ for (func = 0; func < 8; func++) {
+ /* Only probe function 0 on single fn devices */
+ if (check_dev_quirk(num, slot, func))
+ break;
+ }
}
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 643fd861b72..ff9e7350da5 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -196,7 +196,7 @@ static struct console simnow_console = {
static struct console *early_console = &early_vga_console;
static int early_console_initialized;
-void early_printk(const char *fmt, ...)
+asmlinkage void early_printk(const char *fmt, ...)
{
char buf[512];
int n;
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 77d424cf68b..06cc8d4254b 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -64,6 +64,17 @@ static int __init setup_noefi(char *arg)
}
early_param("noefi", setup_noefi);
+int add_efi_memmap;
+EXPORT_SYMBOL(add_efi_memmap);
+
+static int __init setup_add_efi_memmap(char *arg)
+{
+ add_efi_memmap = 1;
+ return 0;
+}
+early_param("add_efi_memmap", setup_add_efi_memmap);
+
+
static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
{
return efi_call_virt2(get_time, tm, tc);
@@ -213,6 +224,50 @@ unsigned long efi_get_time(void)
eft.minute, eft.second);
}
+/*
+ * Tell the kernel about the EFI memory map. This might include
+ * more than the max 128 entries that can fit in the e820 legacy
+ * (zeropage) memory map.
+ */
+
+static void __init do_add_efi_memmap(void)
+{
+ void *p;
+
+ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+ efi_memory_desc_t *md = p;
+ unsigned long long start = md->phys_addr;
+ unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
+ int e820_type;
+
+ if (md->attribute & EFI_MEMORY_WB)
+ e820_type = E820_RAM;
+ else
+ e820_type = E820_RESERVED;
+ e820_add_region(start, size, e820_type);
+ }
+ sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+}
+
+void __init efi_reserve_early(void)
+{
+ unsigned long pmap;
+
+#ifdef CONFIG_X86_32
+ pmap = boot_params.efi_info.efi_memmap;
+#else
+ pmap = (boot_params.efi_info.efi_memmap |
+ ((__u64)boot_params.efi_info.efi_memmap_hi<<32));
+#endif
+ memmap.phys_map = (void *)pmap;
+ memmap.nr_map = boot_params.efi_info.efi_memmap_size /
+ boot_params.efi_info.efi_memdesc_size;
+ memmap.desc_version = boot_params.efi_info.efi_memdesc_version;
+ memmap.desc_size = boot_params.efi_info.efi_memdesc_size;
+ reserve_early(pmap, pmap + memmap.nr_map * memmap.desc_size,
+ "EFI memmap");
+}
+
#if EFI_DEBUG
static void __init print_efi_memmap(void)
{
@@ -244,19 +299,11 @@ void __init efi_init(void)
#ifdef CONFIG_X86_32
efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab;
- memmap.phys_map = (void *)boot_params.efi_info.efi_memmap;
#else
efi_phys.systab = (efi_system_table_t *)
(boot_params.efi_info.efi_systab |
((__u64)boot_params.efi_info.efi_systab_hi<<32));
- memmap.phys_map = (void *)
- (boot_params.efi_info.efi_memmap |
- ((__u64)boot_params.efi_info.efi_memmap_hi<<32));
#endif
- memmap.nr_map = boot_params.efi_info.efi_memmap_size /
- boot_params.efi_info.efi_memdesc_size;
- memmap.desc_version = boot_params.efi_info.efi_memdesc_version;
- memmap.desc_size = boot_params.efi_info.efi_memdesc_size;
efi.systab = early_ioremap((unsigned long)efi_phys.systab,
sizeof(efi_system_table_t));
@@ -370,6 +417,8 @@ void __init efi_init(void)
if (memmap.desc_size != sizeof(efi_memory_desc_t))
printk(KERN_WARNING "Kernel-defined memdesc"
"doesn't match the one from EFI!\n");
+ if (add_efi_memmap)
+ do_add_efi_memmap();
/* Setup for EFI runtime service */
reboot_type = BOOT_EFI;
@@ -424,7 +473,7 @@ void __init efi_enter_virtual_mode(void)
size = md->num_pages << EFI_PAGE_SHIFT;
end = md->phys_addr + size;
- if (PFN_UP(end) <= max_pfn_mapped)
+ if (PFN_UP(end) <= max_low_pfn_mapped)
va = __va(md->phys_addr);
else
va = efi_ioremap(md->phys_addr, size);
diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/kernel/efi_32.c
index 5d23d85624d..4b63c8e1f13 100644
--- a/arch/x86/kernel/efi_32.c
+++ b/arch/x86/kernel/efi_32.c
@@ -49,13 +49,13 @@ void efi_call_phys_prelog(void)
local_irq_save(efi_rt_eflags);
/*
- * If I don't have PSE, I should just duplicate two entries in page
- * directory. If I have PSE, I just need to duplicate one entry in
+ * If I don't have PAE, I should just duplicate two entries in page
+ * directory. If I have PAE, I just need to duplicate one entry in
* page directory.
*/
cr4 = read_cr4();
- if (cr4 & X86_CR4_PSE) {
+ if (cr4 & X86_CR4_PAE) {
efi_bak_pg_dir_pointer[0].pgd =
swapper_pg_dir[pgd_index(0)].pgd;
swapper_pg_dir[0].pgd =
@@ -93,7 +93,7 @@ void efi_call_phys_epilog(void)
cr4 = read_cr4();
- if (cr4 & X86_CR4_PSE) {
+ if (cr4 & X86_CR4_PAE) {
swapper_pg_dir[pgd_index(0)].pgd =
efi_bak_pg_dir_pointer[0].pgd;
} else {
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
index d0060fdccca..652c5287215 100644
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/kernel/efi_64.c
@@ -97,13 +97,7 @@ void __init efi_call_phys_epilog(void)
early_runtime_code_mapping_set_exec(0);
}
-void __init efi_reserve_bootmem(void)
-{
- reserve_bootmem_generic((unsigned long)memmap.phys_map,
- memmap.nr_map * memmap.desc_size);
-}
-
-void __iomem * __init efi_ioremap(unsigned long phys_addr, unsigned long size)
+void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size)
{
static unsigned pages_mapped __initdata;
unsigned i, pages;
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c778e4fa55a..6bc07f0f120 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -51,14 +51,15 @@
#include <asm/percpu.h>
#include <asm/dwarf2.h>
#include <asm/processor-flags.h>
-#include "irq_vectors.h"
+#include <asm/ftrace.h>
+#include <asm/irq_vectors.h>
/*
* We use macros for low-level operations which need to be overridden
* for paravirtualization. The following will never clobber any registers:
* INTERRUPT_RETURN (aka. "iret")
* GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
- * ENABLE_INTERRUPTS_SYSCALL_RET (aka "sti; sysexit").
+ * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
*
* For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
* specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
@@ -349,7 +350,7 @@ sysenter_past_esp:
xorl %ebp,%ebp
TRACE_IRQS_ON
1: mov PT_FS(%esp), %fs
- ENABLE_INTERRUPTS_SYSCALL_RET
+ ENABLE_INTERRUPTS_SYSEXIT
CFI_ENDPROC
.pushsection .fixup,"ax"
2: movl $0,PT_FS(%esp)
@@ -874,10 +875,10 @@ ENTRY(native_iret)
.previous
END(native_iret)
-ENTRY(native_irq_enable_syscall_ret)
+ENTRY(native_irq_enable_sysexit)
sti
sysexit
-END(native_irq_enable_syscall_ret)
+END(native_irq_enable_sysexit)
#endif
KPROBE_ENTRY(int3)
@@ -1024,6 +1025,7 @@ ENTRY(xen_sysenter_target)
RING0_INT_FRAME
addl $5*4, %esp /* remove xen-provided frame */
jmp sysenter_past_esp
+ CFI_ENDPROC
ENTRY(xen_hypervisor_callback)
CFI_STARTPROC
@@ -1110,6 +1112,77 @@ ENDPROC(xen_failsafe_callback)
#endif /* CONFIG_XEN */
+#ifdef CONFIG_FTRACE
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+ENTRY(mcount)
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ movl 0xc(%esp), %eax
+ subl $MCOUNT_INSN_SIZE, %eax
+
+.globl mcount_call
+mcount_call:
+ call ftrace_stub
+
+ popl %edx
+ popl %ecx
+ popl %eax
+
+ ret
+END(mcount)
+
+ENTRY(ftrace_caller)
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ movl 0xc(%esp), %eax
+ movl 0x4(%ebp), %edx
+ subl $MCOUNT_INSN_SIZE, %eax
+
+.globl ftrace_call
+ftrace_call:
+ call ftrace_stub
+
+ popl %edx
+ popl %ecx
+ popl %eax
+
+.globl ftrace_stub
+ftrace_stub:
+ ret
+END(ftrace_caller)
+
+#else /* ! CONFIG_DYNAMIC_FTRACE */
+
+ENTRY(mcount)
+ cmpl $ftrace_stub, ftrace_trace_function
+ jnz trace
+.globl ftrace_stub
+ftrace_stub:
+ ret
+
+ /* taken from glibc */
+trace:
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ movl 0xc(%esp), %eax
+ movl 0x4(%ebp), %edx
+ subl $MCOUNT_INSN_SIZE, %eax
+
+ call *ftrace_trace_function
+
+ popl %edx
+ popl %ecx
+ popl %eax
+
+ jmp ftrace_stub
+END(mcount)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#endif /* CONFIG_FTRACE */
+
.section .rodata,"a"
#include "syscall_table_32.S"
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 556a8df522a..ae63e584c34 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -51,16 +51,121 @@
#include <asm/page.h>
#include <asm/irqflags.h>
#include <asm/paravirt.h>
+#include <asm/ftrace.h>
.code64
+#ifdef CONFIG_FTRACE
+#ifdef CONFIG_DYNAMIC_FTRACE
+ENTRY(mcount)
+
+ subq $0x38, %rsp
+ movq %rax, (%rsp)
+ movq %rcx, 8(%rsp)
+ movq %rdx, 16(%rsp)
+ movq %rsi, 24(%rsp)
+ movq %rdi, 32(%rsp)
+ movq %r8, 40(%rsp)
+ movq %r9, 48(%rsp)
+
+ movq 0x38(%rsp), %rdi
+ subq $MCOUNT_INSN_SIZE, %rdi
+
+.globl mcount_call
+mcount_call:
+ call ftrace_stub
+
+ movq 48(%rsp), %r9
+ movq 40(%rsp), %r8
+ movq 32(%rsp), %rdi
+ movq 24(%rsp), %rsi
+ movq 16(%rsp), %rdx
+ movq 8(%rsp), %rcx
+ movq (%rsp), %rax
+ addq $0x38, %rsp
+
+ retq
+END(mcount)
+
+ENTRY(ftrace_caller)
+
+ /* taken from glibc */
+ subq $0x38, %rsp
+ movq %rax, (%rsp)
+ movq %rcx, 8(%rsp)
+ movq %rdx, 16(%rsp)
+ movq %rsi, 24(%rsp)
+ movq %rdi, 32(%rsp)
+ movq %r8, 40(%rsp)
+ movq %r9, 48(%rsp)
+
+ movq 0x38(%rsp), %rdi
+ movq 8(%rbp), %rsi
+ subq $MCOUNT_INSN_SIZE, %rdi
+
+.globl ftrace_call
+ftrace_call:
+ call ftrace_stub
+
+ movq 48(%rsp), %r9
+ movq 40(%rsp), %r8
+ movq 32(%rsp), %rdi
+ movq 24(%rsp), %rsi
+ movq 16(%rsp), %rdx
+ movq 8(%rsp), %rcx
+ movq (%rsp), %rax
+ addq $0x38, %rsp
+
+.globl ftrace_stub
+ftrace_stub:
+ retq
+END(ftrace_caller)
+
+#else /* ! CONFIG_DYNAMIC_FTRACE */
+ENTRY(mcount)
+ cmpq $ftrace_stub, ftrace_trace_function
+ jnz trace
+.globl ftrace_stub
+ftrace_stub:
+ retq
+
+trace:
+ /* taken from glibc */
+ subq $0x38, %rsp
+ movq %rax, (%rsp)
+ movq %rcx, 8(%rsp)
+ movq %rdx, 16(%rsp)
+ movq %rsi, 24(%rsp)
+ movq %rdi, 32(%rsp)
+ movq %r8, 40(%rsp)
+ movq %r9, 48(%rsp)
+
+ movq 0x38(%rsp), %rdi
+ movq 8(%rbp), %rsi
+ subq $MCOUNT_INSN_SIZE, %rdi
+
+ call *ftrace_trace_function
+
+ movq 48(%rsp), %r9
+ movq 40(%rsp), %r8
+ movq 32(%rsp), %rdi
+ movq 24(%rsp), %rsi
+ movq 16(%rsp), %rdx
+ movq 8(%rsp), %rcx
+ movq (%rsp), %rax
+ addq $0x38, %rsp
+
+ jmp ftrace_stub
+END(mcount)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#endif /* CONFIG_FTRACE */
+
#ifndef CONFIG_PREEMPT
#define retint_kernel retint_restore_args
#endif
#ifdef CONFIG_PARAVIRT
-ENTRY(native_irq_enable_syscall_ret)
- movq %gs:pda_oldrsp,%rsp
+ENTRY(native_usergs_sysret64)
swapgs
sysretq
#endif /* CONFIG_PARAVIRT */
@@ -104,7 +209,7 @@ ENTRY(native_irq_enable_syscall_ret)
.macro FAKE_STACK_FRAME child_rip
/* push in order ss, rsp, eflags, cs, rip */
xorl %eax, %eax
- pushq %rax /* ss */
+ pushq $__KERNEL_DS /* ss */
CFI_ADJUST_CFA_OFFSET 8
/*CFI_REL_OFFSET ss,0*/
pushq %rax /* rsp */
@@ -169,13 +274,13 @@ ENTRY(ret_from_fork)
CFI_ADJUST_CFA_OFFSET -4
call schedule_tail
GET_THREAD_INFO(%rcx)
- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
+ testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
jnz rff_trace
rff_action:
RESTORE_REST
testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread?
je int_ret_from_sys_call
- testl $_TIF_IA32,threadinfo_flags(%rcx)
+ testl $_TIF_IA32,TI_flags(%rcx)
jnz int_ret_from_sys_call
RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
jmp ret_from_sys_call
@@ -244,7 +349,8 @@ ENTRY(system_call_after_swapgs)
movq %rcx,RIP-ARGOFFSET(%rsp)
CFI_REL_OFFSET rip,RIP-ARGOFFSET
GET_THREAD_INFO(%rcx)
- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
+ testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \
+ TI_flags(%rcx)
jnz tracesys
cmpq $__NR_syscall_max,%rax
ja badsys
@@ -263,7 +369,7 @@ sysret_check:
GET_THREAD_INFO(%rcx)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- movl threadinfo_flags(%rcx),%edx
+ movl TI_flags(%rcx),%edx
andl %edi,%edx
jnz sysret_careful
CFI_REMEMBER_STATE
@@ -275,7 +381,8 @@ sysret_check:
CFI_REGISTER rip,rcx
RESTORE_ARGS 0,-ARG_SKIP,1
/*CFI_REGISTER rflags,r11*/
- ENABLE_INTERRUPTS_SYSCALL_RET
+ movq %gs:pda_oldrsp, %rsp
+ USERGS_SYSRET64
CFI_RESTORE_STATE
/* Handle reschedules */
@@ -305,7 +412,7 @@ sysret_signal:
leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
xorl %esi,%esi # oldset -> arg2
call ptregscall_common
-1: movl $_TIF_NEED_RESCHED,%edi
+1: movl $_TIF_WORK_MASK,%edi
/* Use IRET because user could have changed frame. This
works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
DISABLE_INTERRUPTS(CLBR_NONE)
@@ -347,10 +454,10 @@ int_ret_from_sys_call:
int_with_check:
LOCKDEP_SYS_EXIT_IRQ
GET_THREAD_INFO(%rcx)
- movl threadinfo_flags(%rcx),%edx
+ movl TI_flags(%rcx),%edx
andl %edi,%edx
jnz int_careful
- andl $~TS_COMPAT,threadinfo_status(%rcx)
+ andl $~TS_COMPAT,TI_status(%rcx)
jmp retint_swapgs
/* Either reschedule or signal or syscall exit tracking needed. */
@@ -393,7 +500,7 @@ int_signal:
movq %rsp,%rdi # &ptregs -> arg1
xorl %esi,%esi # oldset -> arg2
call do_notify_resume
-1: movl $_TIF_NEED_RESCHED,%edi
+1: movl $_TIF_WORK_MASK,%edi
int_restore_rest:
RESTORE_REST
DISABLE_INTERRUPTS(CLBR_NONE)
@@ -420,7 +527,6 @@ END(\label)
PTREGSCALL stub_clone, sys_clone, %r8
PTREGSCALL stub_fork, sys_fork, %rdi
PTREGSCALL stub_vfork, sys_vfork, %rdi
- PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx
PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
PTREGSCALL stub_iopl, sys_iopl, %rsi
@@ -559,7 +665,7 @@ retint_with_reschedule:
movl $_TIF_WORK_MASK,%edi
retint_check:
LOCKDEP_SYS_EXIT_IRQ
- movl threadinfo_flags(%rcx),%edx
+ movl TI_flags(%rcx),%edx
andl %edi,%edx
CFI_REMEMBER_STATE
jnz retint_careful
@@ -647,17 +753,16 @@ retint_signal:
RESTORE_REST
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- movl $_TIF_NEED_RESCHED,%edi
GET_THREAD_INFO(%rcx)
- jmp retint_check
+ jmp retint_with_reschedule
#ifdef CONFIG_PREEMPT
/* Returning to kernel space. Check if we need preemption */
/* rcx: threadinfo. interrupts off. */
ENTRY(retint_kernel)
- cmpl $0,threadinfo_preempt_count(%rcx)
+ cmpl $0,TI_preempt_count(%rcx)
jnz retint_restore_args
- bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx)
+ bt $TIF_NEED_RESCHED,TI_flags(%rcx)
jnc retint_restore_args
bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
jnc retint_restore_args
@@ -711,6 +816,9 @@ END(invalidate_interrupt\num)
ENTRY(call_function_interrupt)
apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
END(call_function_interrupt)
+ENTRY(call_function_single_interrupt)
+ apicinterrupt CALL_FUNCTION_SINGLE_VECTOR,smp_call_function_single_interrupt
+END(call_function_single_interrupt)
ENTRY(irq_move_cleanup_interrupt)
apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt
END(irq_move_cleanup_interrupt)
@@ -720,6 +828,10 @@ ENTRY(apic_timer_interrupt)
apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
END(apic_timer_interrupt)
+ENTRY(uv_bau_message_intr1)
+ apicinterrupt 220,uv_bau_message_interrupt
+END(uv_bau_message_intr1)
+
ENTRY(error_interrupt)
apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
END(error_interrupt)
@@ -733,6 +845,7 @@ END(spurious_interrupt)
*/
.macro zeroentry sym
INTR_FRAME
+ PARAVIRT_ADJUST_EXCEPTION_FRAME
pushq $0 /* push error code/oldrax */
CFI_ADJUST_CFA_OFFSET 8
pushq %rax /* push real oldrax to the rdi slot */
@@ -745,6 +858,7 @@ END(spurious_interrupt)
.macro errorentry sym
XCPT_FRAME
+ PARAVIRT_ADJUST_EXCEPTION_FRAME
pushq %rax
CFI_ADJUST_CFA_OFFSET 8
CFI_REL_OFFSET rax,0
@@ -814,7 +928,7 @@ paranoid_restore\trace:
jmp irq_return
paranoid_userspace\trace:
GET_THREAD_INFO(%rcx)
- movl threadinfo_flags(%rcx),%ebx
+ movl TI_flags(%rcx),%ebx
andl $_TIF_WORK_MASK,%ebx
jz paranoid_swapgs\trace
movq %rsp,%rdi /* &pt_regs */
@@ -912,7 +1026,7 @@ error_exit:
testl %eax,%eax
jne retint_kernel
LOCKDEP_SYS_EXIT_IRQ
- movl threadinfo_flags(%rcx),%edx
+ movl TI_flags(%rcx),%edx
movl $_TIF_WORK_MASK,%edi
andl %edi,%edx
jnz retint_careful
@@ -926,11 +1040,11 @@ error_kernelspace:
iret run with kernel gs again, so don't set the user space flag.
B stepping K8s sometimes report an truncated RIP for IRET
exceptions returning to compat mode. Check for these here too. */
- leaq irq_return(%rip),%rbp
- cmpq %rbp,RIP(%rsp)
+ leaq irq_return(%rip),%rcx
+ cmpq %rcx,RIP(%rsp)
je error_swapgs
- movl %ebp,%ebp /* zero extend */
- cmpq %rbp,RIP(%rsp)
+ movl %ecx,%ecx /* zero extend */
+ cmpq %rcx,RIP(%rsp)
je error_swapgs
cmpq $gs_change,RIP(%rsp)
je error_swapgs
@@ -939,7 +1053,7 @@ KPROBE_END(error_entry)
/* Reload gs selector with exception handling */
/* edi: new selector */
-ENTRY(load_gs_index)
+ENTRY(native_load_gs_index)
CFI_STARTPROC
pushf
CFI_ADJUST_CFA_OFFSET 8
@@ -953,7 +1067,7 @@ gs_change:
CFI_ADJUST_CFA_OFFSET -8
ret
CFI_ENDPROC
-ENDPROC(load_gs_index)
+ENDPROC(native_load_gs_index)
.section __ex_table,"a"
.align 8
@@ -1120,10 +1234,6 @@ ENTRY(coprocessor_segment_overrun)
zeroentry do_coprocessor_segment_overrun
END(coprocessor_segment_overrun)
-ENTRY(reserved)
- zeroentry do_reserved
-END(reserved)
-
/* runs on exception stack */
ENTRY(double_fault)
XCPT_FRAME
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
new file mode 100644
index 00000000000..ab115cd15fd
--- /dev/null
+++ b/arch/x86/kernel/ftrace.c
@@ -0,0 +1,141 @@
+/*
+ * Code for replacing ftrace calls with jumps.
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ *
+ * Thanks goes to Ingo Molnar, for suggesting the idea.
+ * Mathieu Desnoyers, for suggesting postponing the modifications.
+ * Arjan van de Ven, for keeping me straight, and explaining to me
+ * the dangers of modifying code on the run.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/hardirq.h>
+#include <linux/ftrace.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
+#include <linux/list.h>
+
+#include <asm/alternative.h>
+#include <asm/ftrace.h>
+
+
+/* Long is fine, even if it is only 4 bytes ;-) */
+static long *ftrace_nop;
+
+union ftrace_code_union {
+ char code[MCOUNT_INSN_SIZE];
+ struct {
+ char e8;
+ int offset;
+ } __attribute__((packed));
+};
+
+
+static int notrace ftrace_calc_offset(long ip, long addr)
+{
+ return (int)(addr - ip);
+}
+
+notrace unsigned char *ftrace_nop_replace(void)
+{
+ return (char *)ftrace_nop;
+}
+
+notrace unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
+{
+ static union ftrace_code_union calc;
+
+ calc.e8 = 0xe8;
+ calc.offset = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr);
+
+ /*
+ * No locking needed, this must be called via kstop_machine
+ * which in essence is like running on a uniprocessor machine.
+ */
+ return calc.code;
+}
+
+notrace int
+ftrace_modify_code(unsigned long ip, unsigned char *old_code,
+ unsigned char *new_code)
+{
+ unsigned replaced;
+ unsigned old = *(unsigned *)old_code; /* 4 bytes */
+ unsigned new = *(unsigned *)new_code; /* 4 bytes */
+ unsigned char newch = new_code[4];
+ int faulted = 0;
+
+ /*
+ * Note: Due to modules and __init, code can
+ * disappear and change, we need to protect against faulting
+ * as well as code changing.
+ *
+ * No real locking needed, this code is run through
+ * kstop_machine.
+ */
+ asm volatile (
+ "1: lock\n"
+ " cmpxchg %3, (%2)\n"
+ " jnz 2f\n"
+ " movb %b4, 4(%2)\n"
+ "2:\n"
+ ".section .fixup, \"ax\"\n"
+ "3: movl $1, %0\n"
+ " jmp 2b\n"
+ ".previous\n"
+ _ASM_EXTABLE(1b, 3b)
+ : "=r"(faulted), "=a"(replaced)
+ : "r"(ip), "r"(new), "c"(newch),
+ "0"(faulted), "a"(old)
+ : "memory");
+ sync_core();
+
+ if (replaced != old && replaced != new)
+ faulted = 2;
+
+ return faulted;
+}
+
+notrace int ftrace_update_ftrace_func(ftrace_func_t func)
+{
+ unsigned long ip = (unsigned long)(&ftrace_call);
+ unsigned char old[MCOUNT_INSN_SIZE], *new;
+ int ret;
+
+ memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE);
+ new = ftrace_call_replace(ip, (unsigned long)func);
+ ret = ftrace_modify_code(ip, old, new);
+
+ return ret;
+}
+
+notrace int ftrace_mcount_set(unsigned long *data)
+{
+ unsigned long ip = (long)(&mcount_call);
+ unsigned long *addr = data;
+ unsigned char old[MCOUNT_INSN_SIZE], *new;
+
+ /*
+ * Replace the mcount stub with a pointer to the
+ * ip recorder function.
+ */
+ memcpy(old, &mcount_call, MCOUNT_INSN_SIZE);
+ new = ftrace_call_replace(ip, *addr);
+ *addr = ftrace_modify_code(ip, old, new);
+
+ return 0;
+}
+
+int __init ftrace_dyn_arch_init(void *data)
+{
+ const unsigned char *const *noptable = find_nop_table();
+
+ /* This is running in kstop_machine */
+
+ ftrace_mcount_set(data);
+
+ ftrace_nop = (unsigned long *)noptable[MCOUNT_INSN_SIZE];
+
+ return 0;
+}
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
index cbaaf69bedb..1fa8be5bd21 100644
--- a/arch/x86/kernel/genapic_64.c
+++ b/arch/x86/kernel/genapic_64.c
@@ -51,7 +51,7 @@ void __init setup_apic_routing(void)
else
#endif
- if (num_possible_cpus() <= 8)
+ if (max_physical_apicid < 8)
genapic = &apic_flat;
else
genapic = &apic_physflat;
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index ebf13908a74..711f11c30b0 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -5,9 +5,10 @@
*
* SGI UV APIC functions (note: not an Intel compatible APIC)
*
- * Copyright (C) 2007 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved.
*/
+#include <linux/kernel.h>
#include <linux/threads.h>
#include <linux/cpumask.h>
#include <linux/string.h>
@@ -20,6 +21,7 @@
#include <asm/smp.h>
#include <asm/ipi.h>
#include <asm/genapic.h>
+#include <asm/pgtable.h>
#include <asm/uv/uv_mmrs.h>
#include <asm/uv/uv_hub.h>
@@ -55,37 +57,37 @@ static cpumask_t uv_vector_allocation_domain(int cpu)
int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip)
{
unsigned long val;
- int nasid;
+ int pnode;
- nasid = uv_apicid_to_nasid(phys_apicid);
+ pnode = uv_apicid_to_pnode(phys_apicid);
val = (1UL << UVH_IPI_INT_SEND_SHFT) |
(phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
(((long)start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
APIC_DM_INIT;
- uv_write_global_mmr64(nasid, UVH_IPI_INT, val);
+ uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
mdelay(10);
val = (1UL << UVH_IPI_INT_SEND_SHFT) |
(phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
(((long)start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
APIC_DM_STARTUP;
- uv_write_global_mmr64(nasid, UVH_IPI_INT, val);
+ uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
return 0;
}
static void uv_send_IPI_one(int cpu, int vector)
{
unsigned long val, apicid, lapicid;
- int nasid;
+ int pnode;
apicid = per_cpu(x86_cpu_to_apicid, cpu); /* ZZZ - cache node-local ? */
lapicid = apicid & 0x3f; /* ZZZ macro needed */
- nasid = uv_apicid_to_nasid(apicid);
+ pnode = uv_apicid_to_pnode(apicid);
val =
(1UL << UVH_IPI_INT_SEND_SHFT) | (lapicid <<
UVH_IPI_INT_APIC_ID_SHFT) |
(vector << UVH_IPI_INT_VECTOR_SHFT);
- uv_write_global_mmr64(nasid, UVH_IPI_INT, val);
+ uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
}
static void uv_send_IPI_mask(cpumask_t mask, int vector)
@@ -159,39 +161,146 @@ struct genapic apic_x2apic_uv_x = {
.phys_pkg_id = phys_pkg_id, /* Fixme ZZZ */
};
-static __cpuinit void set_x2apic_extra_bits(int nasid)
+static __cpuinit void set_x2apic_extra_bits(int pnode)
{
- __get_cpu_var(x2apic_extra_bits) = ((nasid >> 1) << 6);
+ __get_cpu_var(x2apic_extra_bits) = (pnode << 6);
}
/*
* Called on boot cpu.
*/
+static __init int boot_pnode_to_blade(int pnode)
+{
+ int blade;
+
+ for (blade = 0; blade < uv_num_possible_blades(); blade++)
+ if (pnode == uv_blade_info[blade].pnode)
+ return blade;
+ BUG();
+}
+
+struct redir_addr {
+ unsigned long redirect;
+ unsigned long alias;
+};
+
+#define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT
+
+static __initdata struct redir_addr redir_addrs[] = {
+ {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_SI_ALIAS0_OVERLAY_CONFIG},
+ {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_SI_ALIAS1_OVERLAY_CONFIG},
+ {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_SI_ALIAS2_OVERLAY_CONFIG},
+};
+
+static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
+{
+ union uvh_si_alias0_overlay_config_u alias;
+ union uvh_rh_gam_alias210_redirect_config_2_mmr_u redirect;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(redir_addrs); i++) {
+ alias.v = uv_read_local_mmr(redir_addrs[i].alias);
+ if (alias.s.base == 0) {
+ *size = (1UL << alias.s.m_alias);
+ redirect.v = uv_read_local_mmr(redir_addrs[i].redirect);
+ *base = (unsigned long)redirect.s.dest_base << DEST_SHIFT;
+ return;
+ }
+ }
+ BUG();
+}
+
+static __init void map_low_mmrs(void)
+{
+ init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE);
+ init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE);
+}
+
+enum map_type {map_wb, map_uc};
+
+static void map_high(char *id, unsigned long base, int shift, enum map_type map_type)
+{
+ unsigned long bytes, paddr;
+
+ paddr = base << shift;
+ bytes = (1UL << shift);
+ printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr,
+ paddr + bytes);
+ if (map_type == map_uc)
+ init_extra_mapping_uc(paddr, bytes);
+ else
+ init_extra_mapping_wb(paddr, bytes);
+
+}
+static __init void map_gru_high(int max_pnode)
+{
+ union uvh_rh_gam_gru_overlay_config_mmr_u gru;
+ int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT;
+
+ gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR);
+ if (gru.s.enable)
+ map_high("GRU", gru.s.base, shift, map_wb);
+}
+
+static __init void map_config_high(int max_pnode)
+{
+ union uvh_rh_gam_cfg_overlay_config_mmr_u cfg;
+ int shift = UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_SHFT;
+
+ cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR);
+ if (cfg.s.enable)
+ map_high("CONFIG", cfg.s.base, shift, map_uc);
+}
+
+static __init void map_mmr_high(int max_pnode)
+{
+ union uvh_rh_gam_mmr_overlay_config_mmr_u mmr;
+ int shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT;
+
+ mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR);
+ if (mmr.s.enable)
+ map_high("MMR", mmr.s.base, shift, map_uc);
+}
+
+static __init void map_mmioh_high(int max_pnode)
+{
+ union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
+ int shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
+
+ mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
+ if (mmioh.s.enable)
+ map_high("MMIOH", mmioh.s.base, shift, map_uc);
+}
+
static __init void uv_system_init(void)
{
union uvh_si_addr_map_config_u m_n_config;
- int bytes, nid, cpu, lcpu, nasid, last_nasid, blade;
- unsigned long mmr_base;
+ union uvh_node_id_u node_id;
+ unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;
+ int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val;
+ int max_pnode = 0;
+ unsigned long mmr_base, present;
+
+ map_low_mmrs();
m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG);
+ m_val = m_n_config.s.m_skt;
+ n_val = m_n_config.s.n_skt;
mmr_base =
uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
~UV_MMR_ENABLE;
printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base);
- last_nasid = -1;
- for_each_possible_cpu(cpu) {
- nid = cpu_to_node(cpu);
- nasid = uv_apicid_to_nasid(per_cpu(x86_cpu_to_apicid, cpu));
- if (nasid != last_nasid)
- uv_possible_blades++;
- last_nasid = nasid;
- }
+ for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++)
+ uv_possible_blades +=
+ hweight64(uv_read_local_mmr( UVH_NODE_PRESENT_TABLE + i * 8));
printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades());
bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
uv_blade_info = alloc_bootmem_pages(bytes);
+ get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size);
+
bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes();
uv_node_to_blade = alloc_bootmem_pages(bytes);
memset(uv_node_to_blade, 255, bytes);
@@ -200,43 +309,62 @@ static __init void uv_system_init(void)
uv_cpu_to_blade = alloc_bootmem_pages(bytes);
memset(uv_cpu_to_blade, 255, bytes);
- last_nasid = -1;
- blade = -1;
- lcpu = -1;
- for_each_possible_cpu(cpu) {
- nid = cpu_to_node(cpu);
- nasid = uv_apicid_to_nasid(per_cpu(x86_cpu_to_apicid, cpu));
- if (nasid != last_nasid) {
- blade++;
- lcpu = -1;
- uv_blade_info[blade].nr_posible_cpus = 0;
+ blade = 0;
+ for (i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) {
+ present = uv_read_local_mmr(UVH_NODE_PRESENT_TABLE + i * 8);
+ for (j = 0; j < 64; j++) {
+ if (!test_bit(j, &present))
+ continue;
+ uv_blade_info[blade].pnode = (i * 64 + j);
+ uv_blade_info[blade].nr_possible_cpus = 0;
uv_blade_info[blade].nr_online_cpus = 0;
+ blade++;
}
- last_nasid = nasid;
- lcpu++;
+ }
- uv_cpu_hub_info(cpu)->m_val = m_n_config.s.m_skt;
- uv_cpu_hub_info(cpu)->n_val = m_n_config.s.n_skt;
+ node_id.v = uv_read_local_mmr(UVH_NODE_ID);
+ gnode_upper = (((unsigned long)node_id.s.node_id) &
+ ~((1 << n_val) - 1)) << m_val;
+
+ for_each_present_cpu(cpu) {
+ nid = cpu_to_node(cpu);
+ pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu));
+ blade = boot_pnode_to_blade(pnode);
+ lcpu = uv_blade_info[blade].nr_possible_cpus;
+ uv_blade_info[blade].nr_possible_cpus++;
+
+ uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base;
+ uv_cpu_hub_info(cpu)->lowmem_remap_top =
+ lowmem_redir_base + lowmem_redir_size;
+ uv_cpu_hub_info(cpu)->m_val = m_val;
+ uv_cpu_hub_info(cpu)->n_val = m_val;
uv_cpu_hub_info(cpu)->numa_blade_id = blade;
uv_cpu_hub_info(cpu)->blade_processor_id = lcpu;
- uv_cpu_hub_info(cpu)->local_nasid = nasid;
- uv_cpu_hub_info(cpu)->gnode_upper =
- nasid & ~((1 << uv_hub_info->n_val) - 1);
+ uv_cpu_hub_info(cpu)->pnode = pnode;
+ uv_cpu_hub_info(cpu)->pnode_mask = (1 << n_val) - 1;
+ uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
+ uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
uv_cpu_hub_info(cpu)->coherency_domain_number = 0;/* ZZZ */
- uv_blade_info[blade].nasid = nasid;
- uv_blade_info[blade].nr_posible_cpus++;
uv_node_to_blade[nid] = blade;
uv_cpu_to_blade[cpu] = blade;
+ max_pnode = max(pnode, max_pnode);
- printk(KERN_DEBUG "UV cpu %d, apicid 0x%x, nasid %d, nid %d\n",
- cpu, per_cpu(x86_cpu_to_apicid, cpu), nasid, nid);
- printk(KERN_DEBUG "UV lcpu %d, blade %d\n", lcpu, blade);
+ printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, "
+ "lcpu %d, blade %d\n",
+ cpu, per_cpu(x86_cpu_to_apicid, cpu), pnode, nid,
+ lcpu, blade);
}
+
+ map_gru_high(max_pnode);
+ map_mmr_high(max_pnode);
+ map_config_high(max_pnode);
+ map_mmioh_high(max_pnode);
}
/*
* Called on each cpu to initialize the per_cpu UV data area.
+ * ZZZ hotplug not supported yet
*/
void __cpuinit uv_cpu_init(void)
{
@@ -246,5 +374,5 @@ void __cpuinit uv_cpu_init(void)
uv_blade_info[uv_numa_blade_id()].nr_online_cpus++;
if (get_uv_system_type() == UV_NON_UNIQUE_APIC)
- set_x2apic_extra_bits(uv_hub_info->local_nasid);
+ set_x2apic_extra_bits(uv_hub_info->pnode);
}
diff --git a/arch/x86/kernel/geode_32.c b/arch/x86/kernel/geode_32.c
index e8edd63ab00..9b08e852fd1 100644
--- a/arch/x86/kernel/geode_32.c
+++ b/arch/x86/kernel/geode_32.c
@@ -166,6 +166,8 @@ int geode_has_vsa2(void)
static int has_vsa2 = -1;
if (has_vsa2 == -1) {
+ u16 val;
+
/*
* The VSA has virtual registers that we can query for a
* signature.
@@ -173,7 +175,8 @@ int geode_has_vsa2(void)
outw(VSA_VR_UNLOCK, VSA_VRC_INDEX);
outw(VSA_VR_SIGNATURE, VSA_VRC_INDEX);
- has_vsa2 = (inw(VSA_VRC_DATA) == VSA_SIG);
+ val = inw(VSA_VRC_DATA);
+ has_vsa2 = (val == AMD_VSA_SIG || val == GSW_VSA_SIG);
}
return has_vsa2;
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c
new file mode 100644
index 00000000000..3e66bd364a9
--- /dev/null
+++ b/arch/x86/kernel/head.c
@@ -0,0 +1,55 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+
+#include <asm/setup.h>
+#include <asm/bios_ebda.h>
+
+#define BIOS_LOWMEM_KILOBYTES 0x413
+
+/*
+ * The BIOS places the EBDA/XBDA at the top of conventional
+ * memory, and usually decreases the reported amount of
+ * conventional memory (int 0x12) too. This also contains a
+ * workaround for Dell systems that neglect to reserve EBDA.
+ * The same workaround also avoids a problem with the AMD768MPX
+ * chipset: reserve a page before VGA to prevent PCI prefetch
+ * into it (errata #56). Usually the page is reserved anyways,
+ * unless you have no PS/2 mouse plugged in.
+ */
+void __init reserve_ebda_region(void)
+{
+ unsigned int lowmem, ebda_addr;
+
+ /* To determine the position of the EBDA and the */
+ /* end of conventional memory, we need to look at */
+ /* the BIOS data area. In a paravirtual environment */
+ /* that area is absent. We'll just have to assume */
+ /* that the paravirt case can handle memory setup */
+ /* correctly, without our help. */
+ if (paravirt_enabled())
+ return;
+
+ /* end of low (conventional) memory */
+ lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
+ lowmem <<= 10;
+
+ /* start of EBDA area */
+ ebda_addr = get_bios_ebda();
+
+ /* Fixup: bios puts an EBDA in the top 64K segment */
+ /* of conventional memory, but does not adjust lowmem. */
+ if ((lowmem - ebda_addr) <= 0x10000)
+ lowmem = ebda_addr;
+
+ /* Fixup: bios does not report an EBDA at all. */
+ /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
+ if ((ebda_addr == 0) && (lowmem >= 0x9f000))
+ lowmem = 0x9f000;
+
+ /* Paranoia: should never happen, but... */
+ if ((lowmem == 0) || (lowmem >= 0x100000))
+ lowmem = 0x9f000;
+
+ /* reserve all memory between lowmem and the 1MB mark */
+ reserve_early_overlap_ok(lowmem, 0x100000, "BIOS reserved");
+}
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 3db05905892..fa1d25dd83e 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -8,7 +8,34 @@
#include <linux/init.h>
#include <linux/start_kernel.h>
+#include <asm/setup.h>
+#include <asm/sections.h>
+#include <asm/e820.h>
+#include <asm/bios_ebda.h>
+
void __init i386_start_kernel(void)
{
+ reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
+
+#ifdef CONFIG_BLK_DEV_INITRD
+ /* Reserve INITRD */
+ if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
+ u64 ramdisk_image = boot_params.hdr.ramdisk_image;
+ u64 ramdisk_size = boot_params.hdr.ramdisk_size;
+ u64 ramdisk_end = ramdisk_image + ramdisk_size;
+ reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
+ }
+#endif
+ reserve_early(init_pg_tables_start, init_pg_tables_end,
+ "INIT_PG_TABLE");
+
+ reserve_ebda_region();
+
+ /*
+ * At this point everything still needed from the boot loader
+ * or BIOS or kernel text should be early reserved or marked not
+ * RAM in e820. All other memory is free game.
+ */
+
start_kernel();
}
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index e25c57b8aa8..c9781982914 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -25,6 +25,20 @@
#include <asm/e820.h>
#include <asm/bios_ebda.h>
+/* boot cpu pda */
+static struct x8664_pda _boot_cpu_pda __read_mostly;
+
+#ifdef CONFIG_SMP
+/*
+ * We install an empty cpu_pda pointer table to indicate to early users
+ * (numa_set_node) that the cpu_pda pointer table for cpus other than
+ * the boot cpu is not yet setup.
+ */
+static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
+#else
+static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
+#endif
+
static void __init zap_identity_mappings(void)
{
pgd_t *pgd = pgd_offset_k(0UL);
@@ -51,74 +65,6 @@ static void __init copy_bootdata(char *real_mode_data)
}
}
-#define BIOS_LOWMEM_KILOBYTES 0x413
-
-/*
- * The BIOS places the EBDA/XBDA at the top of conventional
- * memory, and usually decreases the reported amount of
- * conventional memory (int 0x12) too. This also contains a
- * workaround for Dell systems that neglect to reserve EBDA.
- * The same workaround also avoids a problem with the AMD768MPX
- * chipset: reserve a page before VGA to prevent PCI prefetch
- * into it (errata #56). Usually the page is reserved anyways,
- * unless you have no PS/2 mouse plugged in.
- */
-static void __init reserve_ebda_region(void)
-{
- unsigned int lowmem, ebda_addr;
-
- /* To determine the position of the EBDA and the */
- /* end of conventional memory, we need to look at */
- /* the BIOS data area. In a paravirtual environment */
- /* that area is absent. We'll just have to assume */
- /* that the paravirt case can handle memory setup */
- /* correctly, without our help. */
- if (paravirt_enabled())
- return;
-
- /* end of low (conventional) memory */
- lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
- lowmem <<= 10;
-
- /* start of EBDA area */
- ebda_addr = get_bios_ebda();
-
- /* Fixup: bios puts an EBDA in the top 64K segment */
- /* of conventional memory, but does not adjust lowmem. */
- if ((lowmem - ebda_addr) <= 0x10000)
- lowmem = ebda_addr;
-
- /* Fixup: bios does not report an EBDA at all. */
- /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
- if ((ebda_addr == 0) && (lowmem >= 0x9f000))
- lowmem = 0x9f000;
-
- /* Paranoia: should never happen, but... */
- if ((lowmem == 0) || (lowmem >= 0x100000))
- lowmem = 0x9f000;
-
- /* reserve all memory between lowmem and the 1MB mark */
- reserve_early(lowmem, 0x100000, "BIOS reserved");
-}
-
-static void __init reserve_setup_data(void)
-{
- struct setup_data *data;
- unsigned long pa_data;
- char buf[32];
-
- if (boot_params.hdr.version < 0x0209)
- return;
- pa_data = boot_params.hdr.setup_data;
- while (pa_data) {
- data = early_ioremap(pa_data, sizeof(*data));
- sprintf(buf, "setup data %x", data->type);
- reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
- pa_data = data->next;
- early_iounmap(data, sizeof(*data));
- }
-}
-
void __init x86_64_start_kernel(char * real_mode_data)
{
int i;
@@ -156,10 +102,17 @@ void __init x86_64_start_kernel(char * real_mode_data)
early_printk("Kernel alive\n");
- for (i = 0; i < NR_CPUS; i++)
- cpu_pda(i) = &boot_cpu_pda[i];
-
+ _cpu_pda = __cpu_pda;
+ cpu_pda(0) = &_boot_cpu_pda;
pda_init(0);
+
+ early_printk("Kernel really alive\n");
+
+ x86_64_start_reservations(real_mode_data);
+}
+
+void __init x86_64_start_reservations(char *real_mode_data)
+{
copy_bootdata(__va(real_mode_data));
reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
@@ -175,7 +128,6 @@ void __init x86_64_start_kernel(char * real_mode_data)
#endif
reserve_ebda_region();
- reserve_setup_data();
/*
* At this point everything still needed from the boot loader
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index f7357cc0162..f67e93441ca 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -194,6 +194,7 @@ default_entry:
xorl %ebx,%ebx /* %ebx is kept at zero */
movl $pa(pg0), %edi
+ movl %edi, pa(init_pg_tables_start)
movl $pa(swapper_pg_pmd), %edx
movl $PTE_ATTR, %eax
10:
@@ -219,6 +220,8 @@ default_entry:
jb 10b
1:
movl %edi,pa(init_pg_tables_end)
+ shrl $12, %eax
+ movl %eax, pa(max_pfn_mapped)
/* Do early initialization of the fixmap area */
movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax
@@ -228,6 +231,7 @@ default_entry:
page_pde_offset = (__PAGE_OFFSET >> 20);
movl $pa(pg0), %edi
+ movl %edi, pa(init_pg_tables_start)
movl $pa(swapper_pg_dir), %edx
movl $PTE_ATTR, %eax
10:
@@ -249,6 +253,8 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
cmpl %ebp,%eax
jb 10b
movl %edi,pa(init_pg_tables_end)
+ shrl $12, %eax
+ movl %eax, pa(max_pfn_mapped)
/* Do early initialization of the fixmap area */
movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax
@@ -446,10 +452,13 @@ is386: movl $2,%ecx # set MP
je 1f
movl $(__KERNEL_PERCPU), %eax
movl %eax,%fs # set this cpu's percpu
- jmp initialize_secondary # all other CPUs call initialize_secondary
+ movl (stack_start), %esp
1:
#endif /* CONFIG_SMP */
- jmp i386_start_kernel
+ jmp *(initial_code)
+.align 4
+ENTRY(initial_code)
+ .long i386_start_kernel
/*
* We depend on ET to be correct. This checks for 287/387.
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 10a1955bb1d..b07ac7b217c 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -18,6 +18,7 @@
#include <asm/page.h>
#include <asm/msr.h>
#include <asm/cache.h>
+#include <asm/processor-flags.h>
#ifdef CONFIG_PARAVIRT
#include <asm/asm-offsets.h>
@@ -31,6 +32,13 @@
*
*/
+#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
+
+L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET)
+L3_PAGE_OFFSET = pud_index(__PAGE_OFFSET)
+L4_START_KERNEL = pgd_index(__START_KERNEL_map)
+L3_START_KERNEL = pud_index(__START_KERNEL_map)
+
.text
.section .text.head
.code64
@@ -76,8 +84,8 @@ startup_64:
/* Fixup the physical addresses in the page table
*/
addq %rbp, init_level4_pgt + 0(%rip)
- addq %rbp, init_level4_pgt + (258*8)(%rip)
- addq %rbp, init_level4_pgt + (511*8)(%rip)
+ addq %rbp, init_level4_pgt + (L4_PAGE_OFFSET*8)(%rip)
+ addq %rbp, init_level4_pgt + (L4_START_KERNEL*8)(%rip)
addq %rbp, level3_ident_pgt + 0(%rip)
@@ -128,7 +136,7 @@ ident_complete:
/* Fixup phys_base */
addq %rbp, phys_base(%rip)
-#ifdef CONFIG_SMP
+#ifdef CONFIG_X86_TRAMPOLINE
addq %rbp, trampoline_level4_pgt + 0(%rip)
addq %rbp, trampoline_level4_pgt + (511*8)(%rip)
#endif
@@ -154,9 +162,7 @@ ENTRY(secondary_startup_64)
*/
/* Enable PAE mode and PGE */
- xorq %rax, %rax
- btsq $5, %rax
- btsq $7, %rax
+ movl $(X86_CR4_PAE | X86_CR4_PGE), %eax
movq %rax, %cr4
/* Setup early boot stage 4 level pagetables. */
@@ -184,19 +190,15 @@ ENTRY(secondary_startup_64)
1: wrmsr /* Make changes effective */
/* Setup cr0 */
-#define CR0_PM 1 /* protected mode */
-#define CR0_MP (1<<1)
-#define CR0_ET (1<<4)
-#define CR0_NE (1<<5)
-#define CR0_WP (1<<16)
-#define CR0_AM (1<<18)
-#define CR0_PAGING (1<<31)
- movl $CR0_PM|CR0_MP|CR0_ET|CR0_NE|CR0_WP|CR0_AM|CR0_PAGING,%eax
+#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \
+ X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \
+ X86_CR0_PG)
+ movl $CR0_STATE, %eax
/* Make changes effective */
movq %rax, %cr0
/* Setup a boot time stack */
- movq init_rsp(%rip),%rsp
+ movq stack_start(%rip),%rsp
/* zero EFLAGS after setting rsp */
pushq $0
@@ -208,7 +210,7 @@ ENTRY(secondary_startup_64)
* addresses where we're currently running on. We have to do that here
* because in 32bit we couldn't load a 64bit linear address.
*/
- lgdt cpu_gdt_descr(%rip)
+ lgdt early_gdt_descr(%rip)
/* set up data segments. actually 0 would do too */
movl $__KERNEL_DS,%eax
@@ -257,8 +259,9 @@ ENTRY(secondary_startup_64)
.quad x86_64_start_kernel
__FINITDATA
- ENTRY(init_rsp)
+ ENTRY(stack_start)
.quad init_thread_union+THREAD_SIZE-8
+ .word 0
bad_address:
jmp bad_address
@@ -327,11 +330,11 @@ early_idt_ripmsg:
ENTRY(name)
/* Automate the creation of 1 to 1 mapping pmd entries */
-#define PMDS(START, PERM, COUNT) \
- i = 0 ; \
- .rept (COUNT) ; \
- .quad (START) + (i << 21) + (PERM) ; \
- i = i + 1 ; \
+#define PMDS(START, PERM, COUNT) \
+ i = 0 ; \
+ .rept (COUNT) ; \
+ .quad (START) + (i << PMD_SHIFT) + (PERM) ; \
+ i = i + 1 ; \
.endr
/*
@@ -342,9 +345,9 @@ ENTRY(name)
*/
NEXT_PAGE(init_level4_pgt)
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
- .fill 257,8,0
+ .org init_level4_pgt + L4_PAGE_OFFSET*8, 0
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
- .fill 252,8,0
+ .org init_level4_pgt + L4_START_KERNEL*8, 0
/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
@@ -353,7 +356,7 @@ NEXT_PAGE(level3_ident_pgt)
.fill 511,8,0
NEXT_PAGE(level3_kernel_pgt)
- .fill 510,8,0
+ .fill L3_START_KERNEL,8,0
/* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
.quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
.quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
@@ -384,7 +387,7 @@ NEXT_PAGE(level2_kernel_pgt)
* If you want to increase this then increase MODULES_VADDR
* too.)
*/
- PMDS(0, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL,
+ PMDS(0, __PAGE_KERNEL_LARGE_EXEC,
KERNEL_IMAGE_SIZE/PMD_SIZE)
NEXT_PAGE(level2_spare_pgt)
@@ -395,54 +398,16 @@ NEXT_PAGE(level2_spare_pgt)
.data
.align 16
- .globl cpu_gdt_descr
-cpu_gdt_descr:
- .word gdt_end-cpu_gdt_table-1
-gdt:
- .quad cpu_gdt_table
-#ifdef CONFIG_SMP
- .rept NR_CPUS-1
- .word 0
- .quad 0
- .endr
-#endif
+ .globl early_gdt_descr
+early_gdt_descr:
+ .word GDT_ENTRIES*8-1
+ .quad per_cpu__gdt_page
ENTRY(phys_base)
/* This must match the first entry in level2_kernel_pgt */
.quad 0x0000000000000000
-/* We need valid kernel segments for data and code in long mode too
- * IRET will check the segment types kkeil 2000/10/28
- * Also sysret mandates a special GDT layout
- */
-
- .section .data.page_aligned, "aw"
- .align PAGE_SIZE
-
-/* The TLS descriptors are currently at a different place compared to i386.
- Hopefully nobody expects them at a fixed place (Wine?) */
-ENTRY(cpu_gdt_table)
- .quad 0x0000000000000000 /* NULL descriptor */
- .quad 0x00cf9b000000ffff /* __KERNEL32_CS */
- .quad 0x00af9b000000ffff /* __KERNEL_CS */
- .quad 0x00cf93000000ffff /* __KERNEL_DS */
- .quad 0x00cffb000000ffff /* __USER32_CS */
- .quad 0x00cff3000000ffff /* __USER_DS, __USER32_DS */
- .quad 0x00affb000000ffff /* __USER_CS */
- .quad 0x0 /* unused */
- .quad 0,0 /* TSS */
- .quad 0,0 /* LDT */
- .quad 0,0,0 /* three TLS descriptors */
- .quad 0x0000f40000000000 /* node/CPU stored in limit */
-gdt_end:
- /* asm/segment.h:GDT_ENTRIES must match this */
- /* This should be a multiple of the cache line size */
- /* GDTs of other CPUs are now dynamically allocated */
-
- /* zero the remaining page */
- .fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0
-
.section .bss, "aw", @nobits
.align L1_CACHE_BYTES
ENTRY(idt_table)
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 9b5cfcdfc42..0ea6a19bfdf 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -17,7 +17,7 @@
/* FSEC = 10^-15
NSEC = 10^-9 */
-#define FSEC_PER_NSEC 1000000
+#define FSEC_PER_NSEC 1000000L
/*
* HPET address is set in acpi/boot.c, when an ACPI entry exists
@@ -36,26 +36,15 @@ static inline void hpet_writel(unsigned long d, unsigned long a)
}
#ifdef CONFIG_X86_64
-
#include <asm/pgtable.h>
-
-static inline void hpet_set_mapping(void)
-{
- set_fixmap_nocache(FIX_HPET_BASE, hpet_address);
- __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE);
- hpet_virt_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE);
-}
-
-static inline void hpet_clear_mapping(void)
-{
- hpet_virt_address = NULL;
-}
-
-#else
+#endif
static inline void hpet_set_mapping(void)
{
hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
+#ifdef CONFIG_X86_64
+ __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE);
+#endif
}
static inline void hpet_clear_mapping(void)
@@ -63,7 +52,6 @@ static inline void hpet_clear_mapping(void)
iounmap(hpet_virt_address);
hpet_virt_address = NULL;
}
-#endif
/*
* HPET command line enable / disable
@@ -206,20 +194,19 @@ static void hpet_enable_legacy_int(void)
static void hpet_legacy_clockevent_register(void)
{
- uint64_t hpet_freq;
-
/* Start HPET legacy interrupts */
hpet_enable_legacy_int();
/*
- * The period is a femto seconds value. We need to calculate the
- * scaled math multiplication factor for nanosecond to hpet tick
- * conversion.
+ * The mult factor is defined as (include/linux/clockchips.h)
+ * mult/2^shift = cyc/ns (in contrast to ns/cyc in clocksource.h)
+ * hpet_period is in units of femtoseconds (per cycle), so
+ * mult/2^shift = cyc/ns = 10^6/hpet_period
+ * mult = (10^6 * 2^shift)/hpet_period
+ * mult = (FSEC_PER_NSEC << hpet_clockevent.shift)/hpet_period
*/
- hpet_freq = 1000000000000000ULL;
- do_div(hpet_freq, hpet_period);
- hpet_clockevent.mult = div_sc((unsigned long) hpet_freq,
- NSEC_PER_SEC, hpet_clockevent.shift);
+ hpet_clockevent.mult = div_sc((unsigned long) FSEC_PER_NSEC,
+ hpet_period, hpet_clockevent.shift);
/* Calculate the min / max delta */
hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
&hpet_clockevent);
@@ -324,7 +311,7 @@ static struct clocksource clocksource_hpet = {
static int hpet_clocksource_register(void)
{
- u64 tmp, start, now;
+ u64 start, now;
cycle_t t1;
/* Start the counter */
@@ -351,21 +338,15 @@ static int hpet_clocksource_register(void)
return -ENODEV;
}
- /* Initialize and register HPET clocksource
- *
- * hpet period is in femto seconds per cycle
- * so we need to convert this to ns/cyc units
- * approximated by mult/2^shift
- *
- * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift
- * fsec/cyc * 1ns/1000000fsec * 2^shift = mult
- * fsec/cyc * 2^shift * 1nsec/1000000fsec = mult
- * (fsec/cyc << shift)/1000000 = mult
- * (hpet_period << shift)/FSEC_PER_NSEC = mult
+ /*
+ * The definition of mult is (include/linux/clocksource.h)
+ * mult/2^shift = ns/cyc and hpet_period is in units of fsec/cyc
+ * so we first need to convert hpet_period to ns/cyc units:
+ * mult/2^shift = ns/cyc = hpet_period/10^6
+ * mult = (hpet_period * 2^shift)/10^6
+ * mult = (hpet_period << shift)/FSEC_PER_NSEC
*/
- tmp = (u64)hpet_period << HPET_SHIFT;
- do_div(tmp, FSEC_PER_NSEC);
- clocksource_hpet.mult = (u32)tmp;
+ clocksource_hpet.mult = div_sc(hpet_period, FSEC_PER_NSEC, HPET_SHIFT);
clocksource_register(&clocksource_hpet);
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
index deb43785e92..dd7ebee446a 100644
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -1,7 +1,14 @@
#include <linux/module.h>
+
#include <asm/checksum.h>
-#include <asm/desc.h>
#include <asm/pgtable.h>
+#include <asm/desc.h>
+#include <asm/ftrace.h>
+
+#ifdef CONFIG_FTRACE
+/* mcount is defined in assembly */
+EXPORT_SYMBOL(mcount);
+#endif
/* Networking helper routines. */
EXPORT_SYMBOL(csum_partial_copy_generic);
diff --git a/arch/x86/kernel/i8259_32.c b/arch/x86/kernel/i8259.c
index fe631967d62..dc92b49d920 100644
--- a/arch/x86/kernel/i8259_32.c
+++ b/arch/x86/kernel/i8259.c
@@ -1,8 +1,10 @@
+#include <linux/linkage.h>
#include <linux/errno.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/ioport.h>
#include <linux/interrupt.h>
+#include <linux/timex.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/init.h>
@@ -10,10 +12,12 @@
#include <linux/sysdev.h>
#include <linux/bitops.h>
+#include <asm/acpi.h>
#include <asm/atomic.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/timer.h>
+#include <asm/hw_irq.h>
#include <asm/pgtable.h>
#include <asm/delay.h>
#include <asm/desc.h>
@@ -32,7 +36,7 @@ static int i8259A_auto_eoi;
DEFINE_SPINLOCK(i8259A_lock);
static void mask_and_ack_8259A(unsigned int);
-static struct irq_chip i8259A_chip = {
+struct irq_chip i8259A_chip = {
.name = "XT-PIC",
.mask = disable_8259A_irq,
.disable = disable_8259A_irq,
@@ -125,14 +129,14 @@ static inline int i8259A_irq_real(unsigned int irq)
int irqmask = 1<<irq;
if (irq < 8) {
- outb(0x0B,PIC_MASTER_CMD); /* ISR register */
+ outb(0x0B, PIC_MASTER_CMD); /* ISR register */
value = inb(PIC_MASTER_CMD) & irqmask;
- outb(0x0A,PIC_MASTER_CMD); /* back to the IRR register */
+ outb(0x0A, PIC_MASTER_CMD); /* back to the IRR register */
return value;
}
- outb(0x0B,PIC_SLAVE_CMD); /* ISR register */
+ outb(0x0B, PIC_SLAVE_CMD); /* ISR register */
value = inb(PIC_SLAVE_CMD) & (irqmask >> 8);
- outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */
+ outb(0x0A, PIC_SLAVE_CMD); /* back to the IRR register */
return value;
}
@@ -171,12 +175,14 @@ handle_real_irq:
if (irq & 8) {
inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */
outb(cached_slave_mask, PIC_SLAVE_IMR);
- outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */
- outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */
+ /* 'Specific EOI' to slave */
+ outb(0x60+(irq&7), PIC_SLAVE_CMD);
+ /* 'Specific EOI' to master-IRQ2 */
+ outb(0x60+PIC_CASCADE_IR, PIC_MASTER_CMD);
} else {
inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */
outb(cached_master_mask, PIC_MASTER_IMR);
- outb(0x60+irq,PIC_MASTER_CMD); /* 'Specific EOI to master */
+ outb(0x60+irq, PIC_MASTER_CMD); /* 'Specific EOI to master */
}
spin_unlock_irqrestore(&i8259A_lock, flags);
return;
@@ -199,7 +205,8 @@ spurious_8259A_irq:
* lets ACK and report it. [once per IRQ]
*/
if (!(spurious_irq_mask & irqmask)) {
- printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq);
+ printk(KERN_DEBUG
+ "spurious 8259A interrupt: IRQ%d.\n", irq);
spurious_irq_mask |= irqmask;
}
atomic_inc(&irq_err_count);
@@ -290,17 +297,28 @@ void init_8259A(int auto_eoi)
* outb_pic - this has to work on a wide range of PC hardware.
*/
outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */
- outb_pic(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */
- outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */
+
+ /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 on x86-64,
+ to 0x20-0x27 on i386 */
+ outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR);
+
+ /* 8259A-1 (the master) has a slave on IR2 */
+ outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR);
+
if (auto_eoi) /* master does Auto EOI */
outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR);
else /* master expects normal EOI */
outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR);
outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */
- outb_pic(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */
- outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */
- outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */
+
+ /* ICW2: 8259A-2 IR0-7 mapped to IRQ8_VECTOR */
+ outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR);
+ /* 8259A-2 is a slave on master's IR2 */
+ outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR);
+ /* (slave's support for AEOI in flat mode is to be investigated) */
+ outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR);
+
if (auto_eoi)
/*
* In AEOI mode we just have to mask the interrupt
@@ -317,93 +335,3 @@ void init_8259A(int auto_eoi)
spin_unlock_irqrestore(&i8259A_lock, flags);
}
-
-/*
- * Note that on a 486, we don't want to do a SIGFPE on an irq13
- * as the irq is unreliable, and exception 16 works correctly
- * (ie as explained in the intel literature). On a 386, you
- * can't use exception 16 due to bad IBM design, so we have to
- * rely on the less exact irq13.
- *
- * Careful.. Not only is IRQ13 unreliable, but it is also
- * leads to races. IBM designers who came up with it should
- * be shot.
- */
-
-
-static irqreturn_t math_error_irq(int cpl, void *dev_id)
-{
- extern void math_error(void __user *);
- outb(0,0xF0);
- if (ignore_fpu_irq || !boot_cpu_data.hard_math)
- return IRQ_NONE;
- math_error((void __user *)get_irq_regs()->ip);
- return IRQ_HANDLED;
-}
-
-/*
- * New motherboards sometimes make IRQ 13 be a PCI interrupt,
- * so allow interrupt sharing.
- */
-static struct irqaction fpu_irq = {
- .handler = math_error_irq,
- .mask = CPU_MASK_NONE,
- .name = "fpu",
-};
-
-void __init init_ISA_irqs (void)
-{
- int i;
-
-#ifdef CONFIG_X86_LOCAL_APIC
- init_bsp_APIC();
-#endif
- init_8259A(0);
-
- /*
- * 16 old-style INTA-cycle interrupts:
- */
- for (i = 0; i < 16; i++) {
- set_irq_chip_and_handler_name(i, &i8259A_chip,
- handle_level_irq, "XT");
- }
-}
-
-/* Overridden in paravirt.c */
-void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
-
-void __init native_init_IRQ(void)
-{
- int i;
-
- /* all the set up before the call gates are initialised */
- pre_intr_init_hook();
-
- /*
- * Cover the whole vector space, no vector can escape
- * us. (some of these will be overridden and become
- * 'special' SMP interrupts)
- */
- for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
- int vector = FIRST_EXTERNAL_VECTOR + i;
- if (i >= NR_IRQS)
- break;
- /* SYSCALL_VECTOR was reserved in trap_init. */
- if (!test_bit(vector, used_vectors))
- set_intr_gate(vector, interrupt[i]);
- }
-
- /* setup after call gates are initialised (usually add in
- * the architecture specific gates)
- */
- intr_init_hook();
-
- /*
- * External FPU? Set up irq13 if so, for
- * original braindamaged IBM FERR coupling.
- */
- if (boot_cpu_data.hard_math && !cpu_has_fpu)
- setup_irq(FPU_IRQ, &fpu_irq);
-
- irq_ctx_init(smp_processor_id());
-}
diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c
deleted file mode 100644
index fa57a156850..00000000000
--- a/arch/x86/kernel/i8259_64.c
+++ /dev/null
@@ -1,512 +0,0 @@
-#include <linux/linkage.h>
-#include <linux/errno.h>
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/ioport.h>
-#include <linux/interrupt.h>
-#include <linux/timex.h>
-#include <linux/slab.h>
-#include <linux/random.h>
-#include <linux/init.h>
-#include <linux/kernel_stat.h>
-#include <linux/sysdev.h>
-#include <linux/bitops.h>
-
-#include <asm/acpi.h>
-#include <asm/atomic.h>
-#include <asm/system.h>
-#include <asm/io.h>
-#include <asm/hw_irq.h>
-#include <asm/pgtable.h>
-#include <asm/delay.h>
-#include <asm/desc.h>
-#include <asm/apic.h>
-#include <asm/i8259.h>
-
-/*
- * Common place to define all x86 IRQ vectors
- *
- * This builds up the IRQ handler stubs using some ugly macros in irq.h
- *
- * These macros create the low-level assembly IRQ routines that save
- * register context and call do_IRQ(). do_IRQ() then does all the
- * operations that are needed to keep the AT (or SMP IOAPIC)
- * interrupt-controller happy.
- */
-
-#define BI(x,y) \
- BUILD_IRQ(x##y)
-
-#define BUILD_16_IRQS(x) \
- BI(x,0) BI(x,1) BI(x,2) BI(x,3) \
- BI(x,4) BI(x,5) BI(x,6) BI(x,7) \
- BI(x,8) BI(x,9) BI(x,a) BI(x,b) \
- BI(x,c) BI(x,d) BI(x,e) BI(x,f)
-
-/*
- * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
- * (these are usually mapped to vectors 0x30-0x3f)
- */
-
-/*
- * The IO-APIC gives us many more interrupt sources. Most of these
- * are unused but an SMP system is supposed to have enough memory ...
- * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
- * across the spectrum, so we really want to be prepared to get all
- * of these. Plus, more powerful systems might have more than 64
- * IO-APIC registers.
- *
- * (these are usually mapped into the 0x30-0xff vector range)
- */
- BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3)
-BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7)
-BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb)
-BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf)
-
-#undef BUILD_16_IRQS
-#undef BI
-
-
-#define IRQ(x,y) \
- IRQ##x##y##_interrupt
-
-#define IRQLIST_16(x) \
- IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \
- IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \
- IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
- IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
-
-/* for the irq vectors */
-static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = {
- IRQLIST_16(0x2), IRQLIST_16(0x3),
- IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
- IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
- IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf)
-};
-
-#undef IRQ
-#undef IRQLIST_16
-
-/*
- * This is the 'legacy' 8259A Programmable Interrupt Controller,
- * present in the majority of PC/AT boxes.
- * plus some generic x86 specific things if generic specifics makes
- * any sense at all.
- * this file should become arch/i386/kernel/irq.c when the old irq.c
- * moves to arch independent land
- */
-
-static int i8259A_auto_eoi;
-DEFINE_SPINLOCK(i8259A_lock);
-static void mask_and_ack_8259A(unsigned int);
-
-static struct irq_chip i8259A_chip = {
- .name = "XT-PIC",
- .mask = disable_8259A_irq,
- .disable = disable_8259A_irq,
- .unmask = enable_8259A_irq,
- .mask_ack = mask_and_ack_8259A,
-};
-
-/*
- * 8259A PIC functions to handle ISA devices:
- */
-
-/*
- * This contains the irq mask for both 8259A irq controllers,
- */
-unsigned int cached_irq_mask = 0xffff;
-
-/*
- * Not all IRQs can be routed through the IO-APIC, eg. on certain (older)
- * boards the timer interrupt is not really connected to any IO-APIC pin,
- * it's fed to the master 8259A's IR0 line only.
- *
- * Any '1' bit in this mask means the IRQ is routed through the IO-APIC.
- * this 'mixed mode' IRQ handling costs nothing because it's only used
- * at IRQ setup time.
- */
-unsigned long io_apic_irqs;
-
-void disable_8259A_irq(unsigned int irq)
-{
- unsigned int mask = 1 << irq;
- unsigned long flags;
-
- spin_lock_irqsave(&i8259A_lock, flags);
- cached_irq_mask |= mask;
- if (irq & 8)
- outb(cached_slave_mask, PIC_SLAVE_IMR);
- else
- outb(cached_master_mask, PIC_MASTER_IMR);
- spin_unlock_irqrestore(&i8259A_lock, flags);
-}
-
-void enable_8259A_irq(unsigned int irq)
-{
- unsigned int mask = ~(1 << irq);
- unsigned long flags;
-
- spin_lock_irqsave(&i8259A_lock, flags);
- cached_irq_mask &= mask;
- if (irq & 8)
- outb(cached_slave_mask, PIC_SLAVE_IMR);
- else
- outb(cached_master_mask, PIC_MASTER_IMR);
- spin_unlock_irqrestore(&i8259A_lock, flags);
-}
-
-int i8259A_irq_pending(unsigned int irq)
-{
- unsigned int mask = 1<<irq;
- unsigned long flags;
- int ret;
-
- spin_lock_irqsave(&i8259A_lock, flags);
- if (irq < 8)
- ret = inb(PIC_MASTER_CMD) & mask;
- else
- ret = inb(PIC_SLAVE_CMD) & (mask >> 8);
- spin_unlock_irqrestore(&i8259A_lock, flags);
-
- return ret;
-}
-
-void make_8259A_irq(unsigned int irq)
-{
- disable_irq_nosync(irq);
- io_apic_irqs &= ~(1<<irq);
- set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
- "XT");
- enable_irq(irq);
-}
-
-/*
- * This function assumes to be called rarely. Switching between
- * 8259A registers is slow.
- * This has to be protected by the irq controller spinlock
- * before being called.
- */
-static inline int i8259A_irq_real(unsigned int irq)
-{
- int value;
- int irqmask = 1<<irq;
-
- if (irq < 8) {
- outb(0x0B,PIC_MASTER_CMD); /* ISR register */
- value = inb(PIC_MASTER_CMD) & irqmask;
- outb(0x0A,PIC_MASTER_CMD); /* back to the IRR register */
- return value;
- }
- outb(0x0B,PIC_SLAVE_CMD); /* ISR register */
- value = inb(PIC_SLAVE_CMD) & (irqmask >> 8);
- outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */
- return value;
-}
-
-/*
- * Careful! The 8259A is a fragile beast, it pretty
- * much _has_ to be done exactly like this (mask it
- * first, _then_ send the EOI, and the order of EOI
- * to the two 8259s is important!
- */
-static void mask_and_ack_8259A(unsigned int irq)
-{
- unsigned int irqmask = 1 << irq;
- unsigned long flags;
-
- spin_lock_irqsave(&i8259A_lock, flags);
- /*
- * Lightweight spurious IRQ detection. We do not want
- * to overdo spurious IRQ handling - it's usually a sign
- * of hardware problems, so we only do the checks we can
- * do without slowing down good hardware unnecessarily.
- *
- * Note that IRQ7 and IRQ15 (the two spurious IRQs
- * usually resulting from the 8259A-1|2 PICs) occur
- * even if the IRQ is masked in the 8259A. Thus we
- * can check spurious 8259A IRQs without doing the
- * quite slow i8259A_irq_real() call for every IRQ.
- * This does not cover 100% of spurious interrupts,
- * but should be enough to warn the user that there
- * is something bad going on ...
- */
- if (cached_irq_mask & irqmask)
- goto spurious_8259A_irq;
- cached_irq_mask |= irqmask;
-
-handle_real_irq:
- if (irq & 8) {
- inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */
- outb(cached_slave_mask, PIC_SLAVE_IMR);
- /* 'Specific EOI' to slave */
- outb(0x60+(irq&7),PIC_SLAVE_CMD);
- /* 'Specific EOI' to master-IRQ2 */
- outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD);
- } else {
- inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */
- outb(cached_master_mask, PIC_MASTER_IMR);
- /* 'Specific EOI' to master */
- outb(0x60+irq,PIC_MASTER_CMD);
- }
- spin_unlock_irqrestore(&i8259A_lock, flags);
- return;
-
-spurious_8259A_irq:
- /*
- * this is the slow path - should happen rarely.
- */
- if (i8259A_irq_real(irq))
- /*
- * oops, the IRQ _is_ in service according to the
- * 8259A - not spurious, go handle it.
- */
- goto handle_real_irq;
-
- {
- static int spurious_irq_mask;
- /*
- * At this point we can be sure the IRQ is spurious,
- * lets ACK and report it. [once per IRQ]
- */
- if (!(spurious_irq_mask & irqmask)) {
- printk(KERN_DEBUG
- "spurious 8259A interrupt: IRQ%d.\n", irq);
- spurious_irq_mask |= irqmask;
- }
- atomic_inc(&irq_err_count);
- /*
- * Theoretically we do not have to handle this IRQ,
- * but in Linux this does not cause problems and is
- * simpler for us.
- */
- goto handle_real_irq;
- }
-}
-
-static char irq_trigger[2];
-/**
- * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
- */
-static void restore_ELCR(char *trigger)
-{
- outb(trigger[0], 0x4d0);
- outb(trigger[1], 0x4d1);
-}
-
-static void save_ELCR(char *trigger)
-{
- /* IRQ 0,1,2,8,13 are marked as reserved */
- trigger[0] = inb(0x4d0) & 0xF8;
- trigger[1] = inb(0x4d1) & 0xDE;
-}
-
-static int i8259A_resume(struct sys_device *dev)
-{
- init_8259A(i8259A_auto_eoi);
- restore_ELCR(irq_trigger);
- return 0;
-}
-
-static int i8259A_suspend(struct sys_device *dev, pm_message_t state)
-{
- save_ELCR(irq_trigger);
- return 0;
-}
-
-static int i8259A_shutdown(struct sys_device *dev)
-{
- /* Put the i8259A into a quiescent state that
- * the kernel initialization code can get it
- * out of.
- */
- outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
- outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */
- return 0;
-}
-
-static struct sysdev_class i8259_sysdev_class = {
- .name = "i8259",
- .suspend = i8259A_suspend,
- .resume = i8259A_resume,
- .shutdown = i8259A_shutdown,
-};
-
-static struct sys_device device_i8259A = {
- .id = 0,
- .cls = &i8259_sysdev_class,
-};
-
-static int __init i8259A_init_sysfs(void)
-{
- int error = sysdev_class_register(&i8259_sysdev_class);
- if (!error)
- error = sysdev_register(&device_i8259A);
- return error;
-}
-
-device_initcall(i8259A_init_sysfs);
-
-void init_8259A(int auto_eoi)
-{
- unsigned long flags;
-
- i8259A_auto_eoi = auto_eoi;
-
- spin_lock_irqsave(&i8259A_lock, flags);
-
- outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
- outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
-
- /*
- * outb_pic - this has to work on a wide range of PC hardware.
- */
- outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */
- /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */
- outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR);
- /* 8259A-1 (the master) has a slave on IR2 */
- outb_pic(0x04, PIC_MASTER_IMR);
- if (auto_eoi) /* master does Auto EOI */
- outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR);
- else /* master expects normal EOI */
- outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR);
-
- outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */
- /* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */
- outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR);
- /* 8259A-2 is a slave on master's IR2 */
- outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR);
- /* (slave's support for AEOI in flat mode is to be investigated) */
- outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR);
-
- if (auto_eoi)
- /*
- * In AEOI mode we just have to mask the interrupt
- * when acking.
- */
- i8259A_chip.mask_ack = disable_8259A_irq;
- else
- i8259A_chip.mask_ack = mask_and_ack_8259A;
-
- udelay(100); /* wait for 8259A to initialize */
-
- outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
- outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */
-
- spin_unlock_irqrestore(&i8259A_lock, flags);
-}
-
-
-
-
-/*
- * IRQ2 is cascade interrupt to second interrupt controller
- */
-
-static struct irqaction irq2 = {
- .handler = no_action,
- .mask = CPU_MASK_NONE,
- .name = "cascade",
-};
-DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
- [0 ... IRQ0_VECTOR - 1] = -1,
- [IRQ0_VECTOR] = 0,
- [IRQ1_VECTOR] = 1,
- [IRQ2_VECTOR] = 2,
- [IRQ3_VECTOR] = 3,
- [IRQ4_VECTOR] = 4,
- [IRQ5_VECTOR] = 5,
- [IRQ6_VECTOR] = 6,
- [IRQ7_VECTOR] = 7,
- [IRQ8_VECTOR] = 8,
- [IRQ9_VECTOR] = 9,
- [IRQ10_VECTOR] = 10,
- [IRQ11_VECTOR] = 11,
- [IRQ12_VECTOR] = 12,
- [IRQ13_VECTOR] = 13,
- [IRQ14_VECTOR] = 14,
- [IRQ15_VECTOR] = 15,
- [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
-};
-
-void __init init_ISA_irqs (void)
-{
- int i;
-
- init_bsp_APIC();
- init_8259A(0);
-
- for (i = 0; i < NR_IRQS; i++) {
- irq_desc[i].status = IRQ_DISABLED;
- irq_desc[i].action = NULL;
- irq_desc[i].depth = 1;
-
- if (i < 16) {
- /*
- * 16 old-style INTA-cycle interrupts:
- */
- set_irq_chip_and_handler_name(i, &i8259A_chip,
- handle_level_irq, "XT");
- } else {
- /*
- * 'high' PCI IRQs filled in on demand
- */
- irq_desc[i].chip = &no_irq_chip;
- }
- }
-}
-
-void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
-
-void __init native_init_IRQ(void)
-{
- int i;
-
- init_ISA_irqs();
- /*
- * Cover the whole vector space, no vector can escape
- * us. (some of these will be overridden and become
- * 'special' SMP interrupts)
- */
- for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
- int vector = FIRST_EXTERNAL_VECTOR + i;
- if (vector != IA32_SYSCALL_VECTOR)
- set_intr_gate(vector, interrupt[i]);
- }
-
-#ifdef CONFIG_SMP
- /*
- * The reschedule interrupt is a CPU-to-CPU reschedule-helper
- * IPI, driven by wakeup.
- */
- set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
-
- /* IPIs for invalidation */
- set_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
- set_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
- set_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
- set_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
- set_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
- set_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
- set_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
- set_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
-
- /* IPI for generic function call */
- set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
-
- /* Low priority IPI to cleanup after moving an irq */
- set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
-#endif
- set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
- set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
-
- /* self generated IPI for local APIC timer */
- set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
-
- /* IPI vectors for APIC spurious and error interrupts */
- set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
- set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
-
- if (!acpi_ioapic)
- setup_irq(2, &irq2);
-}
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index 4dc8600d9d2..558abf4c796 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -25,6 +25,7 @@
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/sched.h>
+#include <linux/bootmem.h>
#include <linux/mc146818rtc.h>
#include <linux/compiler.h>
#include <linux/acpi.h>
@@ -58,7 +59,7 @@ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
static DEFINE_SPINLOCK(ioapic_lock);
static DEFINE_SPINLOCK(vector_lock);
-int timer_over_8254 __initdata = 1;
+int timer_through_8259 __initdata;
/*
* Is the SiS APIC rmw bug present ?
@@ -72,15 +73,21 @@ int sis_apic_bug = -1;
int nr_ioapic_registers[MAX_IO_APICS];
/* I/O APIC entries */
-struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
+struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
int nr_ioapics;
/* MP IRQ source entries */
-struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
/* # of MP IRQ source entries */
int mp_irq_entries;
+#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
+int mp_bus_id_to_type[MAX_MP_BUSSES];
+#endif
+
+DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
+
static int disable_timer_pin_1 __initdata;
/*
@@ -110,7 +117,7 @@ struct io_apic {
static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
{
return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
- + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
+ + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
}
static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
@@ -239,7 +246,7 @@ static void __init replace_pin_at_irq(unsigned int irq,
}
}
-static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable)
+static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable)
{
struct irq_pin_list *entry = irq_2_pin + irq;
unsigned int pin, reg;
@@ -259,30 +266,32 @@ static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsign
}
/* mask = 1 */
-static void __mask_IO_APIC_irq (unsigned int irq)
+static void __mask_IO_APIC_irq(unsigned int irq)
{
- __modify_IO_APIC_irq(irq, 0x00010000, 0);
+ __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, 0);
}
/* mask = 0 */
-static void __unmask_IO_APIC_irq (unsigned int irq)
+static void __unmask_IO_APIC_irq(unsigned int irq)
{
- __modify_IO_APIC_irq(irq, 0, 0x00010000);
+ __modify_IO_APIC_irq(irq, 0, IO_APIC_REDIR_MASKED);
}
/* mask = 1, trigger = 0 */
-static void __mask_and_edge_IO_APIC_irq (unsigned int irq)
+static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
{
- __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000);
+ __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED,
+ IO_APIC_REDIR_LEVEL_TRIGGER);
}
/* mask = 0, trigger = 1 */
-static void __unmask_and_level_IO_APIC_irq (unsigned int irq)
+static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
{
- __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000);
+ __modify_IO_APIC_irq(irq, IO_APIC_REDIR_LEVEL_TRIGGER,
+ IO_APIC_REDIR_MASKED);
}
-static void mask_IO_APIC_irq (unsigned int irq)
+static void mask_IO_APIC_irq(unsigned int irq)
{
unsigned long flags;
@@ -291,7 +300,7 @@ static void mask_IO_APIC_irq (unsigned int irq)
spin_unlock_irqrestore(&ioapic_lock, flags);
}
-static void unmask_IO_APIC_irq (unsigned int irq)
+static void unmask_IO_APIC_irq(unsigned int irq)
{
unsigned long flags;
@@ -303,7 +312,7 @@ static void unmask_IO_APIC_irq (unsigned int irq)
static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
{
struct IO_APIC_route_entry entry;
-
+
/* Check delivery_mode to be sure we're not clearing an SMI pin */
entry = ioapic_read_entry(apic, pin);
if (entry.delivery_mode == dest_SMI)
@@ -315,7 +324,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
ioapic_mask_entry(apic, pin);
}
-static void clear_IO_APIC (void)
+static void clear_IO_APIC(void)
{
int apic, pin;
@@ -332,7 +341,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
struct irq_pin_list *entry = irq_2_pin + irq;
unsigned int apicid_value;
cpumask_t tmp;
-
+
cpus_and(tmp, cpumask, cpu_online_map);
if (cpus_empty(tmp))
tmp = TARGET_CPUS;
@@ -361,7 +370,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
# include <linux/kernel_stat.h> /* kstat */
# include <linux/slab.h> /* kmalloc() */
# include <linux/timer.h>
-
+
#define IRQBALANCE_CHECK_ARCH -999
#define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
#define MIN_BALANCED_IRQ_INTERVAL (HZ/2)
@@ -373,14 +382,14 @@ static int physical_balance __read_mostly;
static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;
static struct irq_cpu_info {
- unsigned long * last_irq;
- unsigned long * irq_delta;
+ unsigned long *last_irq;
+ unsigned long *irq_delta;
unsigned long irq;
} irq_cpu_data[NR_CPUS];
#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq)
-#define LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq])
-#define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq])
+#define LAST_CPU_IRQ(cpu, irq) (irq_cpu_data[cpu].last_irq[irq])
+#define IRQ_DELTA(cpu, irq) (irq_cpu_data[cpu].irq_delta[irq])
#define IDLE_ENOUGH(cpu,now) \
(idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
@@ -419,8 +428,8 @@ inside:
if (cpu == -1)
cpu = NR_CPUS-1;
}
- } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) ||
- (search_idle && !IDLE_ENOUGH(cpu,now)));
+ } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu, allowed_mask) ||
+ (search_idle && !IDLE_ENOUGH(cpu, now)));
return cpu;
}
@@ -430,15 +439,14 @@ static inline void balance_irq(int cpu, int irq)
unsigned long now = jiffies;
cpumask_t allowed_mask;
unsigned int new_cpu;
-
+
if (irqbalance_disabled)
- return;
+ return;
cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]);
new_cpu = move(cpu, allowed_mask, now, 1);
- if (cpu != new_cpu) {
+ if (cpu != new_cpu)
set_pending_irq(irq, cpumask_of_cpu(new_cpu));
- }
}
static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
@@ -450,14 +458,14 @@ static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
if (!irq_desc[j].action)
continue;
/* Is it a significant load ? */
- if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) <
+ if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i), j) <
useful_load_threshold)
continue;
balance_irq(i, j);
}
}
balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
- balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
+ balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
return;
}
@@ -486,22 +494,22 @@ static void do_irq_balance(void)
/* Is this an active IRQ or balancing disabled ? */
if (!irq_desc[j].action || irq_balancing_disabled(j))
continue;
- if ( package_index == i )
- IRQ_DELTA(package_index,j) = 0;
+ if (package_index == i)
+ IRQ_DELTA(package_index, j) = 0;
/* Determine the total count per processor per IRQ */
value_now = (unsigned long) kstat_cpu(i).irqs[j];
/* Determine the activity per processor per IRQ */
- delta = value_now - LAST_CPU_IRQ(i,j);
+ delta = value_now - LAST_CPU_IRQ(i, j);
/* Update last_cpu_irq[][] for the next time */
- LAST_CPU_IRQ(i,j) = value_now;
+ LAST_CPU_IRQ(i, j) = value_now;
/* Ignore IRQs whose rate is less than the clock */
if (delta < useful_load_threshold)
continue;
/* update the load for the processor or package total */
- IRQ_DELTA(package_index,j) += delta;
+ IRQ_DELTA(package_index, j) += delta;
/* Keep track of the higher numbered sibling as well */
if (i != package_index)
@@ -527,7 +535,8 @@ static void do_irq_balance(void)
max_cpu_irq = ULONG_MAX;
tryanothercpu:
- /* Look for heaviest loaded processor.
+ /*
+ * Look for heaviest loaded processor.
* We may come back to get the next heaviest loaded processor.
* Skip processors with trivial loads.
*/
@@ -536,7 +545,7 @@ tryanothercpu:
for_each_online_cpu(i) {
if (i != CPU_TO_PACKAGEINDEX(i))
continue;
- if (max_cpu_irq <= CPU_IRQ(i))
+ if (max_cpu_irq <= CPU_IRQ(i))
continue;
if (tmp_cpu_irq < CPU_IRQ(i)) {
tmp_cpu_irq = CPU_IRQ(i);
@@ -545,8 +554,9 @@ tryanothercpu:
}
if (tmp_loaded == -1) {
- /* In the case of small number of heavy interrupt sources,
- * loading some of the cpus too much. We use Ingo's original
+ /*
+ * In the case of small number of heavy interrupt sources,
+ * loading some of the cpus too much. We use Ingo's original
* approach to rotate them around.
*/
if (!first_attempt && imbalance >= useful_load_threshold) {
@@ -555,13 +565,14 @@ tryanothercpu:
}
goto not_worth_the_effort;
}
-
+
first_attempt = 0; /* heaviest search */
max_cpu_irq = tmp_cpu_irq; /* load */
max_loaded = tmp_loaded; /* processor */
imbalance = (max_cpu_irq - min_cpu_irq) / 2;
-
- /* if imbalance is less than approx 10% of max load, then
+
+ /*
+ * if imbalance is less than approx 10% of max load, then
* observe diminishing returns action. - quit
*/
if (imbalance < (max_cpu_irq >> 3))
@@ -577,26 +588,25 @@ tryanotherirq:
/* Is this an active IRQ? */
if (!irq_desc[j].action)
continue;
- if (imbalance <= IRQ_DELTA(max_loaded,j))
+ if (imbalance <= IRQ_DELTA(max_loaded, j))
continue;
/* Try to find the IRQ that is closest to the imbalance
* without going over.
*/
- if (move_this_load < IRQ_DELTA(max_loaded,j)) {
- move_this_load = IRQ_DELTA(max_loaded,j);
+ if (move_this_load < IRQ_DELTA(max_loaded, j)) {
+ move_this_load = IRQ_DELTA(max_loaded, j);
selected_irq = j;
}
}
- if (selected_irq == -1) {
+ if (selected_irq == -1)
goto tryanothercpu;
- }
imbalance = move_this_load;
-
+
/* For physical_balance case, we accumulated both load
* values in the one of the siblings cpu_irq[],
* to use the same code for physical and logical processors
- * as much as possible.
+ * as much as possible.
*
* NOTE: the cpu_irq[] array holds the sum of the load for
* sibling A and sibling B in the slot for the lowest numbered
@@ -625,11 +635,11 @@ tryanotherirq:
/* mark for change destination */
set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
- /* Since we made a change, come back sooner to
+ /* Since we made a change, come back sooner to
* check for more variation.
*/
balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
- balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
+ balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
return;
}
goto tryanotherirq;
@@ -640,7 +650,7 @@ not_worth_the_effort:
* upward
*/
balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
- balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
+ balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
return;
}
@@ -679,13 +689,13 @@ static int __init balanced_irq_init(void)
cpumask_t tmp;
cpus_shift_right(tmp, cpu_online_map, 2);
- c = &boot_cpu_data;
+ c = &boot_cpu_data;
/* When not overwritten by the command line ask subarchitecture. */
if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
irqbalance_disabled = NO_BALANCE_IRQ;
if (irqbalance_disabled)
return 0;
-
+
/* disable irqbalance completely if there is only one processor online */
if (num_online_cpus() < 2) {
irqbalance_disabled = 1;
@@ -699,16 +709,14 @@ static int __init balanced_irq_init(void)
physical_balance = 1;
for_each_online_cpu(i) {
- irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
- irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
+ irq_cpu_data[i].irq_delta = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
+ irq_cpu_data[i].last_irq = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
printk(KERN_ERR "balanced_irq_init: out of memory");
goto failed;
}
- memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS);
- memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS);
}
-
+
printk(KERN_INFO "Starting balanced_irq\n");
if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd")))
return 0;
@@ -801,10 +809,10 @@ static int find_irq_entry(int apic, int pin, int type)
int i;
for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mpc_irqtype == type &&
- (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
- mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
- mp_irqs[i].mpc_dstirq == pin)
+ if (mp_irqs[i].mp_irqtype == type &&
+ (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
+ mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
+ mp_irqs[i].mp_dstirq == pin)
return i;
return -1;
@@ -818,13 +826,13 @@ static int __init find_isa_irq_pin(int irq, int type)
int i;
for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mpc_srcbus;
+ int lbus = mp_irqs[i].mp_srcbus;
if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].mpc_irqtype == type) &&
- (mp_irqs[i].mpc_srcbusirq == irq))
+ (mp_irqs[i].mp_irqtype == type) &&
+ (mp_irqs[i].mp_srcbusirq == irq))
- return mp_irqs[i].mpc_dstirq;
+ return mp_irqs[i].mp_dstirq;
}
return -1;
}
@@ -834,17 +842,17 @@ static int __init find_isa_irq_apic(int irq, int type)
int i;
for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mpc_srcbus;
+ int lbus = mp_irqs[i].mp_srcbus;
if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].mpc_irqtype == type) &&
- (mp_irqs[i].mpc_srcbusirq == irq))
+ (mp_irqs[i].mp_irqtype == type) &&
+ (mp_irqs[i].mp_srcbusirq == irq))
break;
}
if (i < mp_irq_entries) {
int apic;
- for(apic = 0; apic < nr_ioapics; apic++) {
- if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
+ for (apic = 0; apic < nr_ioapics; apic++) {
+ if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
return apic;
}
}
@@ -864,28 +872,28 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
"slot:%d, pin:%d.\n", bus, slot, pin);
- if (mp_bus_id_to_pci_bus[bus] == -1) {
+ if (test_bit(bus, mp_bus_not_pci)) {
printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
return -1;
}
for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mpc_srcbus;
+ int lbus = mp_irqs[i].mp_srcbus;
for (apic = 0; apic < nr_ioapics; apic++)
- if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
- mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
+ if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
+ mp_irqs[i].mp_dstapic == MP_APIC_ALL)
break;
if (!test_bit(lbus, mp_bus_not_pci) &&
- !mp_irqs[i].mpc_irqtype &&
+ !mp_irqs[i].mp_irqtype &&
(bus == lbus) &&
- (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
- int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
+ (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
+ int irq = pin_2_irq(i, apic, mp_irqs[i].mp_dstirq);
if (!(apic || IO_APIC_IRQ(irq)))
continue;
- if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
+ if (pin == (mp_irqs[i].mp_srcbusirq & 3))
return irq;
/*
* Use the first all-but-pin matching entry as a
@@ -900,7 +908,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
/*
- * This function currently is only a helper for the i386 smp boot process where
+ * This function currently is only a helper for the i386 smp boot process where
* we need to reprogram the ioredtbls to cater for the cpus which have come online
* so mask in all cases should simply be TARGET_CPUS
*/
@@ -952,7 +960,7 @@ static int EISA_ELCR(unsigned int irq)
* EISA conforming in the MP table, that means its trigger type must
* be read in from the ELCR */
-#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
+#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq))
#define default_EISA_polarity(idx) default_ISA_polarity(idx)
/* PCI interrupts are always polarity one level triggered,
@@ -969,118 +977,115 @@ static int EISA_ELCR(unsigned int irq)
static int MPBIOS_polarity(int idx)
{
- int bus = mp_irqs[idx].mpc_srcbus;
+ int bus = mp_irqs[idx].mp_srcbus;
int polarity;
/*
* Determine IRQ line polarity (high active or low active):
*/
- switch (mp_irqs[idx].mpc_irqflag & 3)
+ switch (mp_irqs[idx].mp_irqflag & 3) {
+ case 0: /* conforms, ie. bus-type dependent polarity */
{
- case 0: /* conforms, ie. bus-type dependent polarity */
- {
- polarity = test_bit(bus, mp_bus_not_pci)?
- default_ISA_polarity(idx):
- default_PCI_polarity(idx);
- break;
- }
- case 1: /* high active */
- {
- polarity = 0;
- break;
- }
- case 2: /* reserved */
- {
- printk(KERN_WARNING "broken BIOS!!\n");
- polarity = 1;
- break;
- }
- case 3: /* low active */
- {
- polarity = 1;
- break;
- }
- default: /* invalid */
- {
- printk(KERN_WARNING "broken BIOS!!\n");
- polarity = 1;
- break;
- }
+ polarity = test_bit(bus, mp_bus_not_pci)?
+ default_ISA_polarity(idx):
+ default_PCI_polarity(idx);
+ break;
+ }
+ case 1: /* high active */
+ {
+ polarity = 0;
+ break;
+ }
+ case 2: /* reserved */
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ polarity = 1;
+ break;
+ }
+ case 3: /* low active */
+ {
+ polarity = 1;
+ break;
+ }
+ default: /* invalid */
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ polarity = 1;
+ break;
+ }
}
return polarity;
}
static int MPBIOS_trigger(int idx)
{
- int bus = mp_irqs[idx].mpc_srcbus;
+ int bus = mp_irqs[idx].mp_srcbus;
int trigger;
/*
* Determine IRQ trigger mode (edge or level sensitive):
*/
- switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
+ switch ((mp_irqs[idx].mp_irqflag>>2) & 3) {
+ case 0: /* conforms, ie. bus-type dependent */
{
- case 0: /* conforms, ie. bus-type dependent */
- {
- trigger = test_bit(bus, mp_bus_not_pci)?
- default_ISA_trigger(idx):
- default_PCI_trigger(idx);
+ trigger = test_bit(bus, mp_bus_not_pci)?
+ default_ISA_trigger(idx):
+ default_PCI_trigger(idx);
#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
- switch (mp_bus_id_to_type[bus])
- {
- case MP_BUS_ISA: /* ISA pin */
- {
- /* set before the switch */
- break;
- }
- case MP_BUS_EISA: /* EISA pin */
- {
- trigger = default_EISA_trigger(idx);
- break;
- }
- case MP_BUS_PCI: /* PCI pin */
- {
- /* set before the switch */
- break;
- }
- case MP_BUS_MCA: /* MCA pin */
- {
- trigger = default_MCA_trigger(idx);
- break;
- }
- default:
- {
- printk(KERN_WARNING "broken BIOS!!\n");
- trigger = 1;
- break;
- }
- }
-#endif
+ switch (mp_bus_id_to_type[bus]) {
+ case MP_BUS_ISA: /* ISA pin */
+ {
+ /* set before the switch */
break;
}
- case 1: /* edge */
+ case MP_BUS_EISA: /* EISA pin */
{
- trigger = 0;
+ trigger = default_EISA_trigger(idx);
break;
}
- case 2: /* reserved */
+ case MP_BUS_PCI: /* PCI pin */
{
- printk(KERN_WARNING "broken BIOS!!\n");
- trigger = 1;
+ /* set before the switch */
break;
}
- case 3: /* level */
+ case MP_BUS_MCA: /* MCA pin */
{
- trigger = 1;
+ trigger = default_MCA_trigger(idx);
break;
}
- default: /* invalid */
+ default:
{
printk(KERN_WARNING "broken BIOS!!\n");
- trigger = 0;
+ trigger = 1;
break;
}
}
+#endif
+ break;
+ }
+ case 1: /* edge */
+ {
+ trigger = 0;
+ break;
+ }
+ case 2: /* reserved */
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ trigger = 1;
+ break;
+ }
+ case 3: /* level */
+ {
+ trigger = 1;
+ break;
+ }
+ default: /* invalid */
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ trigger = 0;
+ break;
+ }
+ }
return trigger;
}
@@ -1097,16 +1102,16 @@ static inline int irq_trigger(int idx)
static int pin_2_irq(int idx, int apic, int pin)
{
int irq, i;
- int bus = mp_irqs[idx].mpc_srcbus;
+ int bus = mp_irqs[idx].mp_srcbus;
/*
* Debugging check, we are in big trouble if this message pops up!
*/
- if (mp_irqs[idx].mpc_dstirq != pin)
+ if (mp_irqs[idx].mp_dstirq != pin)
printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
if (test_bit(bus, mp_bus_not_pci))
- irq = mp_irqs[idx].mpc_srcbusirq;
+ irq = mp_irqs[idx].mp_srcbusirq;
else {
/*
* PCI IRQs are mapped in order
@@ -1148,8 +1153,8 @@ static inline int IO_APIC_irq_trigger(int irq)
for (apic = 0; apic < nr_ioapics; apic++) {
for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
- idx = find_irq_entry(apic,pin,mp_INT);
- if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
+ idx = find_irq_entry(apic, pin, mp_INT);
+ if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
return irq_trigger(idx);
}
}
@@ -1164,7 +1169,7 @@ static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 }
static int __assign_irq_vector(int irq)
{
- static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
+ static int current_vector = FIRST_DEVICE_VECTOR, current_offset;
int vector, offset;
BUG_ON((unsigned)irq >= NR_IRQ_VECTORS);
@@ -1176,7 +1181,7 @@ static int __assign_irq_vector(int irq)
offset = current_offset;
next:
vector += 8;
- if (vector >= FIRST_SYSTEM_VECTOR) {
+ if (vector >= first_system_vector) {
offset = (offset + 1) % 8;
vector = FIRST_DEVICE_VECTOR + offset;
}
@@ -1203,6 +1208,11 @@ static int assign_irq_vector(int irq)
return vector;
}
+
+void setup_vector_irq(int cpu)
+{
+}
+
static struct irq_chip ioapic_chip;
#define IOAPIC_AUTO -1
@@ -1237,25 +1247,25 @@ static void __init setup_IO_APIC_irqs(void)
/*
* add it to the IO-APIC irq-routing table:
*/
- memset(&entry,0,sizeof(entry));
+ memset(&entry, 0, sizeof(entry));
entry.delivery_mode = INT_DELIVERY_MODE;
entry.dest_mode = INT_DEST_MODE;
entry.mask = 0; /* enable IRQ */
- entry.dest.logical.logical_dest =
+ entry.dest.logical.logical_dest =
cpu_mask_to_apicid(TARGET_CPUS);
- idx = find_irq_entry(apic,pin,mp_INT);
+ idx = find_irq_entry(apic, pin, mp_INT);
if (idx == -1) {
if (first_notcon) {
apic_printk(APIC_VERBOSE, KERN_DEBUG
" IO-APIC (apicid-pin) %d-%d",
- mp_ioapics[apic].mpc_apicid,
+ mp_ioapics[apic].mp_apicid,
pin);
first_notcon = 0;
} else
apic_printk(APIC_VERBOSE, ", %d-%d",
- mp_ioapics[apic].mpc_apicid, pin);
+ mp_ioapics[apic].mp_apicid, pin);
continue;
}
@@ -1289,7 +1299,7 @@ static void __init setup_IO_APIC_irqs(void)
vector = assign_irq_vector(irq);
entry.vector = vector;
ioapic_register_intr(irq, vector, IOAPIC_AUTO);
-
+
if (!apic && (irq < 16))
disable_8259A_irq(irq);
}
@@ -1302,25 +1312,21 @@ static void __init setup_IO_APIC_irqs(void)
}
/*
- * Set up the 8259A-master output pin:
+ * Set up the timer pin, possibly with the 8259A-master behind.
*/
-static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
+static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
+ int vector)
{
struct IO_APIC_route_entry entry;
- memset(&entry,0,sizeof(entry));
-
- disable_8259A_irq(0);
-
- /* mask LVT0 */
- apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
+ memset(&entry, 0, sizeof(entry));
/*
* We use logical delivery to get the timer IRQ
* to the first CPU.
*/
entry.dest_mode = INT_DEST_MODE;
- entry.mask = 0; /* unmask IRQ now */
+ entry.mask = 1; /* mask IRQ now */
entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
entry.delivery_mode = INT_DELIVERY_MODE;
entry.polarity = 0;
@@ -1329,17 +1335,14 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in
/*
* The timer IRQ doesn't have to know that behind the
- * scene we have a 8259A-master in AEOI mode ...
+ * scene we may have a 8259A-master in AEOI mode ...
*/
- irq_desc[0].chip = &ioapic_chip;
- set_irq_handler(0, handle_edge_irq);
+ ioapic_register_intr(0, vector, IOAPIC_EDGE);
/*
* Add it to the IO-APIC irq-routing table:
*/
ioapic_write_entry(apic, pin, entry);
-
- enable_8259A_irq(0);
}
void __init print_IO_APIC(void)
@@ -1354,10 +1357,10 @@ void __init print_IO_APIC(void)
if (apic_verbosity == APIC_QUIET)
return;
- printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
+ printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
for (i = 0; i < nr_ioapics; i++)
printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
- mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
+ mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
/*
* We are a bit conservative about what we expect. We have to
@@ -1376,7 +1379,7 @@ void __init print_IO_APIC(void)
reg_03.raw = io_apic_read(apic, 3);
spin_unlock_irqrestore(&ioapic_lock, flags);
- printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
+ printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
@@ -1459,7 +1462,7 @@ void __init print_IO_APIC(void)
#if 0
-static void print_APIC_bitfield (int base)
+static void print_APIC_bitfield(int base)
{
unsigned int v;
int i, j;
@@ -1480,7 +1483,7 @@ static void print_APIC_bitfield (int base)
}
}
-void /*__init*/ print_local_APIC(void * dummy)
+void /*__init*/ print_local_APIC(void *dummy)
{
unsigned int v, ver, maxlvt;
@@ -1489,6 +1492,7 @@ void /*__init*/ print_local_APIC(void * dummy)
printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
smp_processor_id(), hard_smp_processor_id());
+ v = apic_read(APIC_ID);
printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v,
GET_APIC_ID(read_apic_id()));
v = apic_read(APIC_LVR);
@@ -1563,9 +1567,9 @@ void /*__init*/ print_local_APIC(void * dummy)
printk("\n");
}
-void print_all_local_APICs (void)
+void print_all_local_APICs(void)
{
- on_each_cpu(print_local_APIC, NULL, 1, 1);
+ on_each_cpu(print_local_APIC, NULL, 1);
}
void /*__init*/ print_PIC(void)
@@ -1586,11 +1590,11 @@ void /*__init*/ print_PIC(void)
v = inb(0xa0) << 8 | inb(0x20);
printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
- outb(0x0b,0xa0);
- outb(0x0b,0x20);
+ outb(0x0b, 0xa0);
+ outb(0x0b, 0x20);
v = inb(0xa0) << 8 | inb(0x20);
- outb(0x0a,0xa0);
- outb(0x0a,0x20);
+ outb(0x0a, 0xa0);
+ outb(0x0a, 0x20);
spin_unlock_irqrestore(&i8259A_lock, flags);
@@ -1626,7 +1630,7 @@ static void __init enable_IO_APIC(void)
spin_unlock_irqrestore(&ioapic_lock, flags);
nr_ioapic_registers[apic] = reg_01.bits.entries+1;
}
- for(apic = 0; apic < nr_ioapics; apic++) {
+ for (apic = 0; apic < nr_ioapics; apic++) {
int pin;
/* See if any of the pins is in ExtINT mode */
for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
@@ -1716,7 +1720,6 @@ void disable_IO_APIC(void)
* by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
*/
-#ifndef CONFIG_X86_NUMAQ
static void __init setup_ioapic_ids_from_mpc(void)
{
union IO_APIC_reg_00 reg_00;
@@ -1726,6 +1729,11 @@ static void __init setup_ioapic_ids_from_mpc(void)
unsigned char old_id;
unsigned long flags;
+#ifdef CONFIG_X86_NUMAQ
+ if (found_numaq)
+ return;
+#endif
+
/*
* Don't check I/O APIC IDs for xAPIC systems. They have
* no meaning without the serial APIC bus.
@@ -1748,15 +1756,15 @@ static void __init setup_ioapic_ids_from_mpc(void)
spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(apic, 0);
spin_unlock_irqrestore(&ioapic_lock, flags);
-
- old_id = mp_ioapics[apic].mpc_apicid;
- if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) {
+ old_id = mp_ioapics[apic].mp_apicid;
+
+ if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) {
printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
- apic, mp_ioapics[apic].mpc_apicid);
+ apic, mp_ioapics[apic].mp_apicid);
printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
reg_00.bits.ID);
- mp_ioapics[apic].mpc_apicid = reg_00.bits.ID;
+ mp_ioapics[apic].mp_apicid = reg_00.bits.ID;
}
/*
@@ -1765,9 +1773,9 @@ static void __init setup_ioapic_ids_from_mpc(void)
* 'stuck on smp_invalidate_needed IPI wait' messages.
*/
if (check_apicid_used(phys_id_present_map,
- mp_ioapics[apic].mpc_apicid)) {
+ mp_ioapics[apic].mp_apicid)) {
printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
- apic, mp_ioapics[apic].mpc_apicid);
+ apic, mp_ioapics[apic].mp_apicid);
for (i = 0; i < get_physical_broadcast(); i++)
if (!physid_isset(i, phys_id_present_map))
break;
@@ -1776,13 +1784,13 @@ static void __init setup_ioapic_ids_from_mpc(void)
printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
i);
physid_set(i, phys_id_present_map);
- mp_ioapics[apic].mpc_apicid = i;
+ mp_ioapics[apic].mp_apicid = i;
} else {
physid_mask_t tmp;
- tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid);
+ tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid);
apic_printk(APIC_VERBOSE, "Setting %d in the "
"phys_id_present_map\n",
- mp_ioapics[apic].mpc_apicid);
+ mp_ioapics[apic].mp_apicid);
physids_or(phys_id_present_map, phys_id_present_map, tmp);
}
@@ -1791,21 +1799,21 @@ static void __init setup_ioapic_ids_from_mpc(void)
* We need to adjust the IRQ routing table
* if the ID changed.
*/
- if (old_id != mp_ioapics[apic].mpc_apicid)
+ if (old_id != mp_ioapics[apic].mp_apicid)
for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mpc_dstapic == old_id)
- mp_irqs[i].mpc_dstapic
- = mp_ioapics[apic].mpc_apicid;
+ if (mp_irqs[i].mp_dstapic == old_id)
+ mp_irqs[i].mp_dstapic
+ = mp_ioapics[apic].mp_apicid;
/*
* Read the right value from the MPC table and
* write it into the ID register.
- */
+ */
apic_printk(APIC_VERBOSE, KERN_INFO
"...changing IO-APIC physical APIC ID to %d ...",
- mp_ioapics[apic].mpc_apicid);
+ mp_ioapics[apic].mp_apicid);
- reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
+ reg_00.bits.ID = mp_ioapics[apic].mp_apicid;
spin_lock_irqsave(&ioapic_lock, flags);
io_apic_write(apic, 0, reg_00.raw);
spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -1816,15 +1824,12 @@ static void __init setup_ioapic_ids_from_mpc(void)
spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(apic, 0);
spin_unlock_irqrestore(&ioapic_lock, flags);
- if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
+ if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid)
printk("could not set ID!\n");
else
apic_printk(APIC_VERBOSE, " ok.\n");
}
}
-#else
-static void __init setup_ioapic_ids_from_mpc(void) { }
-#endif
int no_timer_check __initdata;
@@ -2015,12 +2020,12 @@ static inline void init_IO_APIC_traps(void)
* The local APIC irq-chip implementation:
*/
-static void ack_apic(unsigned int irq)
+static void ack_lapic_irq(unsigned int irq)
{
ack_APIC_irq();
}
-static void mask_lapic_irq (unsigned int irq)
+static void mask_lapic_irq(unsigned int irq)
{
unsigned long v;
@@ -2028,7 +2033,7 @@ static void mask_lapic_irq (unsigned int irq)
apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
}
-static void unmask_lapic_irq (unsigned int irq)
+static void unmask_lapic_irq(unsigned int irq)
{
unsigned long v;
@@ -2037,23 +2042,31 @@ static void unmask_lapic_irq (unsigned int irq)
}
static struct irq_chip lapic_chip __read_mostly = {
- .name = "local-APIC-edge",
+ .name = "local-APIC",
.mask = mask_lapic_irq,
.unmask = unmask_lapic_irq,
- .eoi = ack_apic,
+ .ack = ack_lapic_irq,
};
+static void lapic_register_intr(int irq, int vector)
+{
+ irq_desc[irq].status &= ~IRQ_LEVEL;
+ set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
+ "edge");
+ set_intr_gate(vector, interrupt[irq]);
+}
+
static void __init setup_nmi(void)
{
/*
- * Dirty trick to enable the NMI watchdog ...
+ * Dirty trick to enable the NMI watchdog ...
* We put the 8259A master into AEOI mode and
* unmask on all local APICs LVT0 as NMI.
*
* The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
* is from Maciej W. Rozycki - so we do not have to EOI from
* the NMI handler or the timer interrupt.
- */
+ */
apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
enable_NMI_through_LVT0();
@@ -2129,11 +2142,16 @@ static inline void __init unlock_ExtINT_logic(void)
static inline void __init check_timer(void)
{
int apic1, pin1, apic2, pin2;
+ int no_pin1 = 0;
int vector;
+ unsigned int ver;
unsigned long flags;
local_irq_save(flags);
+ ver = apic_read(APIC_LVR);
+ ver = GET_APIC_VERSION(ver);
+
/*
* get/set the timer IRQ vector:
*/
@@ -2142,17 +2160,17 @@ static inline void __init check_timer(void)
set_intr_gate(vector, interrupt[0]);
/*
- * Subtle, code in do_timer_interrupt() expects an AEOI
- * mode for the 8259A whenever interrupts are routed
- * through I/O APICs. Also IRQ0 has to be enabled in
- * the 8259A which implies the virtual wire has to be
- * disabled in the local APIC.
+ * As IRQ0 is to be enabled in the 8259A, the virtual
+ * wire has to be disabled in the local APIC. Also
+ * timer interrupts need to be acknowledged manually in
+ * the 8259A for the i82489DX when using the NMI
+ * watchdog as that APIC treats NMIs as level-triggered.
+ * The AEOI mode will finish them in the 8259A
+ * automatically.
*/
apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
init_8259A(1);
- timer_ack = 1;
- if (timer_over_8254 > 0)
- enable_8259A_irq(0);
+ timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
pin1 = find_isa_irq_pin(0, mp_INT);
apic1 = find_isa_irq_apic(0, mp_INT);
@@ -2162,14 +2180,33 @@ static inline void __init check_timer(void)
printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
vector, apic1, pin1, apic2, pin2);
+ /*
+ * Some BIOS writers are clueless and report the ExtINTA
+ * I/O APIC input from the cascaded 8259A as the timer
+ * interrupt input. So just in case, if only one pin
+ * was found above, try it both directly and through the
+ * 8259A.
+ */
+ if (pin1 == -1) {
+ pin1 = pin2;
+ apic1 = apic2;
+ no_pin1 = 1;
+ } else if (pin2 == -1) {
+ pin2 = pin1;
+ apic2 = apic1;
+ }
+
if (pin1 != -1) {
/*
* Ok, does IRQ0 through the IOAPIC work?
*/
+ if (no_pin1) {
+ add_pin_to_irq(0, apic1, pin1);
+ setup_timer_IRQ0_pin(apic1, pin1, vector);
+ }
unmask_IO_APIC_irq(0);
if (timer_irq_works()) {
if (nmi_watchdog == NMI_IO_APIC) {
- disable_8259A_irq(0);
setup_nmi();
enable_8259A_irq(0);
}
@@ -2178,45 +2215,47 @@ static inline void __init check_timer(void)
goto out;
}
clear_IO_APIC_pin(apic1, pin1);
- printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to "
- "IO-APIC\n");
- }
+ if (!no_pin1)
+ printk(KERN_ERR "..MP-BIOS bug: "
+ "8254 timer not connected to IO-APIC\n");
- printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... ");
- if (pin2 != -1) {
+ printk(KERN_INFO "...trying to set up timer (IRQ0) "
+ "through the 8259A ... ");
printk("\n..... (found pin %d) ...", pin2);
/*
* legacy devices should be connected to IO APIC #0
*/
- setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
+ replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
+ setup_timer_IRQ0_pin(apic2, pin2, vector);
+ unmask_IO_APIC_irq(0);
+ enable_8259A_irq(0);
if (timer_irq_works()) {
printk("works.\n");
- if (pin1 != -1)
- replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
- else
- add_pin_to_irq(0, apic2, pin2);
+ timer_through_8259 = 1;
if (nmi_watchdog == NMI_IO_APIC) {
+ disable_8259A_irq(0);
setup_nmi();
+ enable_8259A_irq(0);
}
goto out;
}
/*
* Cleanup, just in case ...
*/
+ disable_8259A_irq(0);
clear_IO_APIC_pin(apic2, pin2);
+ printk(" failed.\n");
}
- printk(" failed.\n");
if (nmi_watchdog == NMI_IO_APIC) {
printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
- nmi_watchdog = 0;
+ nmi_watchdog = NMI_NONE;
}
+ timer_ack = 0;
printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
- disable_8259A_irq(0);
- set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq,
- "fasteoi");
+ lapic_register_intr(0, vector);
apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
enable_8259A_irq(0);
@@ -2224,12 +2263,12 @@ static inline void __init check_timer(void)
printk(" works.\n");
goto out;
}
+ disable_8259A_irq(0);
apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
printk(" failed.\n");
printk(KERN_INFO "...trying to set up timer as ExtINT IRQ...");
- timer_ack = 0;
init_8259A(0);
make_8259A_irq(0);
apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
@@ -2248,11 +2287,21 @@ out:
}
/*
- *
- * IRQ's that are handled by the PIC in the MPS IOAPIC case.
- * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ.
- * Linux doesn't really care, as it's not actually used
- * for any interrupt handling anyway.
+ * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
+ * to devices. However there may be an I/O APIC pin available for
+ * this interrupt regardless. The pin may be left unconnected, but
+ * typically it will be reused as an ExtINT cascade interrupt for
+ * the master 8259A. In the MPS case such a pin will normally be
+ * reported as an ExtINT interrupt in the MP table. With ACPI
+ * there is no provision for ExtINT interrupts, and in the absence
+ * of an override it would be treated as an ordinary ISA I/O APIC
+ * interrupt, that is edge-triggered and unmasked by default. We
+ * used to do this, but it caused problems on some systems because
+ * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
+ * the same ExtINT cascade interrupt to drive the local APIC of the
+ * bootstrap processor. Therefore we refrain from routing IRQ2 to
+ * the I/O APIC in all cases now. No actual device should request
+ * it anyway. --macro
*/
#define PIC_IRQS (1 << PIC_CASCADE_IR)
@@ -2261,15 +2310,12 @@ void __init setup_IO_APIC(void)
int i;
/* Reserve all the system vectors. */
- for (i = FIRST_SYSTEM_VECTOR; i < NR_VECTORS; i++)
+ for (i = first_system_vector; i < NR_VECTORS; i++)
set_bit(i, used_vectors);
enable_IO_APIC();
- if (acpi_ioapic)
- io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
- else
- io_apic_irqs = ~PIC_IRQS;
+ io_apic_irqs = ~PIC_IRQS;
printk("ENABLING IO-APIC IRQs\n");
@@ -2286,28 +2332,14 @@ void __init setup_IO_APIC(void)
print_IO_APIC();
}
-static int __init setup_disable_8254_timer(char *s)
-{
- timer_over_8254 = -1;
- return 1;
-}
-static int __init setup_enable_8254_timer(char *s)
-{
- timer_over_8254 = 2;
- return 1;
-}
-
-__setup("disable_8254_timer", setup_disable_8254_timer);
-__setup("enable_8254_timer", setup_enable_8254_timer);
-
/*
* Called after all the initialization is done. If we didnt find any
* APIC bugs then we can allow the modify fast path
*/
-
+
static int __init io_apic_bug_finalize(void)
{
- if(sis_apic_bug == -1)
+ if (sis_apic_bug == -1)
sis_apic_bug = 0;
return 0;
}
@@ -2318,17 +2350,17 @@ struct sysfs_ioapic_data {
struct sys_device dev;
struct IO_APIC_route_entry entry[0];
};
-static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
+static struct sysfs_ioapic_data *mp_ioapic_data[MAX_IO_APICS];
static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
{
struct IO_APIC_route_entry *entry;
struct sysfs_ioapic_data *data;
int i;
-
+
data = container_of(dev, struct sysfs_ioapic_data, dev);
entry = data->entry;
- for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
+ for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
entry[i] = ioapic_read_entry(dev->id, i);
return 0;
@@ -2341,18 +2373,18 @@ static int ioapic_resume(struct sys_device *dev)
unsigned long flags;
union IO_APIC_reg_00 reg_00;
int i;
-
+
data = container_of(dev, struct sysfs_ioapic_data, dev);
entry = data->entry;
spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(dev->id, 0);
- if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
- reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
+ if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
+ reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
io_apic_write(dev->id, 0, reg_00.raw);
}
spin_unlock_irqrestore(&ioapic_lock, flags);
- for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
+ for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
ioapic_write_entry(dev->id, i, entry[i]);
return 0;
@@ -2366,24 +2398,23 @@ static struct sysdev_class ioapic_sysdev_class = {
static int __init ioapic_init_sysfs(void)
{
- struct sys_device * dev;
+ struct sys_device *dev;
int i, size, error = 0;
error = sysdev_class_register(&ioapic_sysdev_class);
if (error)
return error;
- for (i = 0; i < nr_ioapics; i++ ) {
- size = sizeof(struct sys_device) + nr_ioapic_registers[i]
+ for (i = 0; i < nr_ioapics; i++) {
+ size = sizeof(struct sys_device) + nr_ioapic_registers[i]
* sizeof(struct IO_APIC_route_entry);
- mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
+ mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL);
if (!mp_ioapic_data[i]) {
printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
continue;
}
- memset(mp_ioapic_data[i], 0, size);
dev = &mp_ioapic_data[i]->dev;
- dev->id = i;
+ dev->id = i;
dev->cls = &ioapic_sysdev_class;
error = sysdev_register(dev);
if (error) {
@@ -2458,7 +2489,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
msg->address_lo =
MSI_ADDR_BASE_LO |
((INT_DEST_MODE == 0) ?
- MSI_ADDR_DEST_MODE_PHYSICAL:
+MSI_ADDR_DEST_MODE_PHYSICAL:
MSI_ADDR_DEST_MODE_LOGICAL) |
((INT_DELIVERY_MODE != dest_LowestPrio) ?
MSI_ADDR_REDIRECTION_CPU:
@@ -2469,7 +2500,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
MSI_DATA_TRIGGER_EDGE |
MSI_DATA_LEVEL_ASSERT |
((INT_DELIVERY_MODE != dest_LowestPrio) ?
- MSI_DATA_DELIVERY_FIXED:
+MSI_DATA_DELIVERY_FIXED:
MSI_DATA_DELIVERY_LOWPRI) |
MSI_DATA_VECTOR(vector);
}
@@ -2640,12 +2671,12 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
#endif /* CONFIG_HT_IRQ */
/* --------------------------------------------------------------------------
- ACPI-based IOAPIC Configuration
+ ACPI-based IOAPIC Configuration
-------------------------------------------------------------------------- */
#ifdef CONFIG_ACPI
-int __init io_apic_get_unique_id (int ioapic, int apic_id)
+int __init io_apic_get_unique_id(int ioapic, int apic_id)
{
union IO_APIC_reg_00 reg_00;
static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
@@ -2654,10 +2685,10 @@ int __init io_apic_get_unique_id (int ioapic, int apic_id)
int i = 0;
/*
- * The P4 platform supports up to 256 APIC IDs on two separate APIC
- * buses (one for LAPICs, one for IOAPICs), where predecessors only
+ * The P4 platform supports up to 256 APIC IDs on two separate APIC
+ * buses (one for LAPICs, one for IOAPICs), where predecessors only
* supports up to 16 on one shared APIC bus.
- *
+ *
* TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
* advantage of new APIC bus architecture.
*/
@@ -2676,7 +2707,7 @@ int __init io_apic_get_unique_id (int ioapic, int apic_id)
}
/*
- * Every APIC in a system must have a unique ID or we get lots of nice
+ * Every APIC in a system must have a unique ID or we get lots of nice
* 'stuck on smp_invalidate_needed IPI wait' messages.
*/
if (check_apicid_used(apic_id_map, apic_id)) {
@@ -2693,7 +2724,7 @@ int __init io_apic_get_unique_id (int ioapic, int apic_id)
"trying %d\n", ioapic, apic_id, i);
apic_id = i;
- }
+ }
tmp = apicid_to_cpu_present(apic_id);
physids_or(apic_id_map, apic_id_map, tmp);
@@ -2720,7 +2751,7 @@ int __init io_apic_get_unique_id (int ioapic, int apic_id)
}
-int __init io_apic_get_version (int ioapic)
+int __init io_apic_get_version(int ioapic)
{
union IO_APIC_reg_01 reg_01;
unsigned long flags;
@@ -2733,7 +2764,7 @@ int __init io_apic_get_version (int ioapic)
}
-int __init io_apic_get_redir_entries (int ioapic)
+int __init io_apic_get_redir_entries(int ioapic)
{
union IO_APIC_reg_01 reg_01;
unsigned long flags;
@@ -2746,7 +2777,7 @@ int __init io_apic_get_redir_entries (int ioapic)
}
-int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
+int io_apic_set_pci_routing(int ioapic, int pin, int irq, int edge_level, int active_high_low)
{
struct IO_APIC_route_entry entry;
@@ -2762,7 +2793,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
* corresponding device driver registers for this IRQ.
*/
- memset(&entry,0,sizeof(entry));
+ memset(&entry, 0, sizeof(entry));
entry.delivery_mode = INT_DELIVERY_MODE;
entry.dest_mode = INT_DEST_MODE;
@@ -2781,7 +2812,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
"(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
- mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
+ mp_ioapics[ioapic].mp_apicid, pin, entry.vector, irq,
edge_level, active_high_low);
ioapic_register_intr(irq, entry.vector, edge_level);
@@ -2802,8 +2833,8 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
return -1;
for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mpc_irqtype == mp_INT &&
- mp_irqs[i].mpc_srcbusirq == bus_irq)
+ if (mp_irqs[i].mp_irqtype == mp_INT &&
+ mp_irqs[i].mp_srcbusirq == bus_irq)
break;
if (i >= mp_irq_entries)
return -1;
@@ -2836,3 +2867,34 @@ static int __init parse_noapic(char *arg)
return 0;
}
early_param("noapic", parse_noapic);
+
+void __init ioapic_init_mappings(void)
+{
+ unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
+ int i;
+
+ for (i = 0; i < nr_ioapics; i++) {
+ if (smp_found_config) {
+ ioapic_phys = mp_ioapics[i].mp_apicaddr;
+ if (!ioapic_phys) {
+ printk(KERN_ERR
+ "WARNING: bogus zero IO-APIC "
+ "address found in MPTABLE, "
+ "disabling IO/APIC support!\n");
+ smp_found_config = 0;
+ skip_ioapic_setup = 1;
+ goto fake_ioapic_page;
+ }
+ } else {
+fake_ioapic_page:
+ ioapic_phys = (unsigned long)
+ alloc_bootmem_pages(PAGE_SIZE);
+ ioapic_phys = __pa(ioapic_phys);
+ }
+ set_fixmap_nocache(idx, ioapic_phys);
+ printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n",
+ __fix_to_virt(idx), ioapic_phys);
+ idx++;
+ }
+}
+
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index ef1a8dfcc52..6510cde36b3 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -61,7 +61,7 @@ struct irq_cfg {
};
/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
-struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
+static struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
[0] = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, },
[1] = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, },
[2] = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, },
@@ -82,6 +82,10 @@ struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
static int assign_irq_vector(int irq, cpumask_t mask);
+int first_system_vector = 0xfe;
+
+char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
+
#define __apicdebuginit __init
int sis_apic_bug; /* not actually supported, dummy for compile */
@@ -90,7 +94,7 @@ static int no_timer_check;
static int disable_timer_pin_1 __initdata;
-int timer_over_8254 __initdata = 1;
+int timer_through_8259 __initdata;
/* Where if anywhere is the i8259 connect in external int mode */
static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
@@ -104,15 +108,17 @@ DEFINE_SPINLOCK(vector_lock);
int nr_ioapic_registers[MAX_IO_APICS];
/* I/O APIC entries */
-struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
+struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
int nr_ioapics;
/* MP IRQ source entries */
-struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
/* # of MP IRQ source entries */
int mp_irq_entries;
+DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
+
/*
* Rough estimation of how many shared IRQs there are, can
* be changed anytime.
@@ -140,7 +146,7 @@ struct io_apic {
static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
{
return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
- + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
+ + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
}
static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
@@ -183,7 +189,7 @@ static bool io_apic_level_ack_pending(unsigned int irq)
break;
reg = io_apic_read(entry->apic, 0x10 + pin*2);
/* Is the remote IRR bit set? */
- if ((reg >> 14) & 1) {
+ if (reg & IO_APIC_REDIR_REMOTE_IRR) {
spin_unlock_irqrestore(&ioapic_lock, flags);
return true;
}
@@ -298,7 +304,7 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
break;
io_apic_write(apic, 0x11 + pin*2, dest);
reg = io_apic_read(apic, 0x10 + pin*2);
- reg &= ~0x000000ff;
+ reg &= ~IO_APIC_REDIR_VECTOR_MASK;
reg |= vector;
io_apic_modify(apic, reg);
if (!entry->next)
@@ -360,16 +366,37 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
entry->pin = pin;
}
+/*
+ * Reroute an IRQ to a different pin.
+ */
+static void __init replace_pin_at_irq(unsigned int irq,
+ int oldapic, int oldpin,
+ int newapic, int newpin)
+{
+ struct irq_pin_list *entry = irq_2_pin + irq;
+
+ while (1) {
+ if (entry->apic == oldapic && entry->pin == oldpin) {
+ entry->apic = newapic;
+ entry->pin = newpin;
+ }
+ if (!entry->next)
+ break;
+ entry = irq_2_pin + entry->next;
+ }
+}
+
#define DO_ACTION(name,R,ACTION, FINAL) \
\
static void name##_IO_APIC_irq (unsigned int irq) \
__DO_ACTION(R, ACTION, FINAL)
-DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) )
- /* mask = 1 */
-DO_ACTION( __unmask, 0, &= 0xfffeffff, )
- /* mask = 0 */
+/* mask = 1 */
+DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic))
+
+/* mask = 0 */
+DO_ACTION(__unmask, 0, &= ~IO_APIC_REDIR_MASKED, )
static void mask_IO_APIC_irq (unsigned int irq)
{
@@ -430,20 +457,6 @@ static int __init disable_timer_pin_setup(char *arg)
}
__setup("disable_timer_pin_1", disable_timer_pin_setup);
-static int __init setup_disable_8254_timer(char *s)
-{
- timer_over_8254 = -1;
- return 1;
-}
-static int __init setup_enable_8254_timer(char *s)
-{
- timer_over_8254 = 2;
- return 1;
-}
-
-__setup("disable_8254_timer", setup_disable_8254_timer);
-__setup("enable_8254_timer", setup_enable_8254_timer);
-
/*
* Find the IRQ entry number of a certain pin.
@@ -453,10 +466,10 @@ static int find_irq_entry(int apic, int pin, int type)
int i;
for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mpc_irqtype == type &&
- (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
- mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
- mp_irqs[i].mpc_dstirq == pin)
+ if (mp_irqs[i].mp_irqtype == type &&
+ (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
+ mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
+ mp_irqs[i].mp_dstirq == pin)
return i;
return -1;
@@ -470,13 +483,13 @@ static int __init find_isa_irq_pin(int irq, int type)
int i;
for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mpc_srcbus;
+ int lbus = mp_irqs[i].mp_srcbus;
if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].mpc_irqtype == type) &&
- (mp_irqs[i].mpc_srcbusirq == irq))
+ (mp_irqs[i].mp_irqtype == type) &&
+ (mp_irqs[i].mp_srcbusirq == irq))
- return mp_irqs[i].mpc_dstirq;
+ return mp_irqs[i].mp_dstirq;
}
return -1;
}
@@ -486,17 +499,17 @@ static int __init find_isa_irq_apic(int irq, int type)
int i;
for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mpc_srcbus;
+ int lbus = mp_irqs[i].mp_srcbus;
if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].mpc_irqtype == type) &&
- (mp_irqs[i].mpc_srcbusirq == irq))
+ (mp_irqs[i].mp_irqtype == type) &&
+ (mp_irqs[i].mp_srcbusirq == irq))
break;
}
if (i < mp_irq_entries) {
int apic;
for(apic = 0; apic < nr_ioapics; apic++) {
- if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
+ if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
return apic;
}
}
@@ -516,28 +529,28 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
bus, slot, pin);
- if (mp_bus_id_to_pci_bus[bus] == -1) {
+ if (test_bit(bus, mp_bus_not_pci)) {
apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
return -1;
}
for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mpc_srcbus;
+ int lbus = mp_irqs[i].mp_srcbus;
for (apic = 0; apic < nr_ioapics; apic++)
- if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
- mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
+ if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
+ mp_irqs[i].mp_dstapic == MP_APIC_ALL)
break;
if (!test_bit(lbus, mp_bus_not_pci) &&
- !mp_irqs[i].mpc_irqtype &&
+ !mp_irqs[i].mp_irqtype &&
(bus == lbus) &&
- (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
- int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
+ (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
+ int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq);
if (!(apic || IO_APIC_IRQ(irq)))
continue;
- if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
+ if (pin == (mp_irqs[i].mp_srcbusirq & 3))
return irq;
/*
* Use the first all-but-pin matching entry as a
@@ -565,13 +578,13 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
static int MPBIOS_polarity(int idx)
{
- int bus = mp_irqs[idx].mpc_srcbus;
+ int bus = mp_irqs[idx].mp_srcbus;
int polarity;
/*
* Determine IRQ line polarity (high active or low active):
*/
- switch (mp_irqs[idx].mpc_irqflag & 3)
+ switch (mp_irqs[idx].mp_irqflag & 3)
{
case 0: /* conforms, ie. bus-type dependent polarity */
if (test_bit(bus, mp_bus_not_pci))
@@ -607,13 +620,13 @@ static int MPBIOS_polarity(int idx)
static int MPBIOS_trigger(int idx)
{
- int bus = mp_irqs[idx].mpc_srcbus;
+ int bus = mp_irqs[idx].mp_srcbus;
int trigger;
/*
* Determine IRQ trigger mode (edge or level sensitive):
*/
- switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
+ switch ((mp_irqs[idx].mp_irqflag>>2) & 3)
{
case 0: /* conforms, ie. bus-type dependent */
if (test_bit(bus, mp_bus_not_pci))
@@ -660,16 +673,16 @@ static inline int irq_trigger(int idx)
static int pin_2_irq(int idx, int apic, int pin)
{
int irq, i;
- int bus = mp_irqs[idx].mpc_srcbus;
+ int bus = mp_irqs[idx].mp_srcbus;
/*
* Debugging check, we are in big trouble if this message pops up!
*/
- if (mp_irqs[idx].mpc_dstirq != pin)
+ if (mp_irqs[idx].mp_dstirq != pin)
printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
if (test_bit(bus, mp_bus_not_pci)) {
- irq = mp_irqs[idx].mpc_srcbusirq;
+ irq = mp_irqs[idx].mp_srcbusirq;
} else {
/*
* PCI IRQs are mapped in order
@@ -730,7 +743,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask)
offset = current_offset;
next:
vector += 8;
- if (vector >= FIRST_SYSTEM_VECTOR) {
+ if (vector >= first_system_vector) {
/* If we run out of vectors on large boxen, must share them. */
offset = (offset + 1) % 8;
vector = FIRST_DEVICE_VECTOR + offset;
@@ -788,7 +801,7 @@ static void __clear_irq_vector(int irq)
cpus_clear(cfg->domain);
}
-void __setup_vector_irq(int cpu)
+static void __setup_vector_irq(int cpu)
{
/* Initialize vector_irq on a new cpu */
/* This function must be called with vector_lock held */
@@ -811,6 +824,13 @@ void __setup_vector_irq(int cpu)
}
}
+void setup_vector_irq(int cpu)
+{
+ spin_lock(&vector_lock);
+ __setup_vector_irq(smp_processor_id());
+ spin_unlock(&vector_lock);
+}
+
static struct irq_chip ioapic_chip;
@@ -846,7 +866,7 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
apic_printk(APIC_VERBOSE,KERN_DEBUG
"IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
"IRQ %d Mode:%i Active:%i)\n",
- apic, mp_ioapics[apic].mpc_apicid, pin, cfg->vector,
+ apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector,
irq, trigger, polarity);
/*
@@ -887,10 +907,10 @@ static void __init setup_IO_APIC_irqs(void)
idx = find_irq_entry(apic,pin,mp_INT);
if (idx == -1) {
if (first_notcon) {
- apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin);
+ apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin);
first_notcon = 0;
} else
- apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin);
+ apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin);
continue;
}
if (!first_notcon) {
@@ -911,26 +931,21 @@ static void __init setup_IO_APIC_irqs(void)
}
/*
- * Set up the 8259A-master output pin as broadcast to all
- * CPUs.
+ * Set up the timer pin, possibly with the 8259A-master behind.
*/
-static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
+static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
+ int vector)
{
struct IO_APIC_route_entry entry;
memset(&entry, 0, sizeof(entry));
- disable_8259A_irq(0);
-
- /* mask LVT0 */
- apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
-
/*
* We use logical delivery to get the timer IRQ
* to the first CPU.
*/
entry.dest_mode = INT_DEST_MODE;
- entry.mask = 0; /* unmask IRQ now */
+ entry.mask = 1; /* mask IRQ now */
entry.dest = cpu_mask_to_apicid(TARGET_CPUS);
entry.delivery_mode = INT_DELIVERY_MODE;
entry.polarity = 0;
@@ -939,7 +954,7 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in
/*
* The timer IRQ doesn't have to know that behind the
- * scene we have a 8259A-master in AEOI mode ...
+ * scene we may have a 8259A-master in AEOI mode ...
*/
set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
@@ -947,8 +962,6 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in
* Add it to the IO-APIC irq-routing table:
*/
ioapic_write_entry(apic, pin, entry);
-
- enable_8259A_irq(0);
}
void __apicdebuginit print_IO_APIC(void)
@@ -965,7 +978,7 @@ void __apicdebuginit print_IO_APIC(void)
printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
for (i = 0; i < nr_ioapics; i++)
printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
- mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
+ mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
/*
* We are a bit conservative about what we expect. We have to
@@ -983,7 +996,7 @@ void __apicdebuginit print_IO_APIC(void)
spin_unlock_irqrestore(&ioapic_lock, flags);
printk("\n");
- printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
+ printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
@@ -1077,6 +1090,7 @@ void __apicdebuginit print_local_APIC(void * dummy)
printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
smp_processor_id(), hard_smp_processor_id());
+ v = apic_read(APIC_ID);
printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(read_apic_id()));
v = apic_read(APIC_LVR);
printk(KERN_INFO "... APIC VERSION: %08x\n", v);
@@ -1146,7 +1160,7 @@ void __apicdebuginit print_local_APIC(void * dummy)
void print_all_local_APICs (void)
{
- on_each_cpu(print_local_APIC, NULL, 1, 1);
+ on_each_cpu(print_local_APIC, NULL, 1);
}
void __apicdebuginit print_PIC(void)
@@ -1540,7 +1554,7 @@ static inline void init_IO_APIC_traps(void)
}
}
-static void enable_lapic_irq (unsigned int irq)
+static void unmask_lapic_irq(unsigned int irq)
{
unsigned long v;
@@ -1548,7 +1562,7 @@ static void enable_lapic_irq (unsigned int irq)
apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
}
-static void disable_lapic_irq (unsigned int irq)
+static void mask_lapic_irq(unsigned int irq)
{
unsigned long v;
@@ -1561,19 +1575,20 @@ static void ack_lapic_irq (unsigned int irq)
ack_APIC_irq();
}
-static void end_lapic_irq (unsigned int i) { /* nothing */ }
-
-static struct hw_interrupt_type lapic_irq_type __read_mostly = {
- .name = "local-APIC",
- .typename = "local-APIC-edge",
- .startup = NULL, /* startup_irq() not used for IRQ0 */
- .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
- .enable = enable_lapic_irq,
- .disable = disable_lapic_irq,
- .ack = ack_lapic_irq,
- .end = end_lapic_irq,
+static struct irq_chip lapic_chip __read_mostly = {
+ .name = "local-APIC",
+ .mask = mask_lapic_irq,
+ .unmask = unmask_lapic_irq,
+ .ack = ack_lapic_irq,
};
+static void lapic_register_intr(int irq)
+{
+ irq_desc[irq].status &= ~IRQ_LEVEL;
+ set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
+ "edge");
+}
+
static void __init setup_nmi(void)
{
/*
@@ -1659,6 +1674,7 @@ static inline void __init check_timer(void)
struct irq_cfg *cfg = irq_cfg + 0;
int apic1, pin1, apic2, pin2;
unsigned long flags;
+ int no_pin1 = 0;
local_irq_save(flags);
@@ -1669,16 +1685,11 @@ static inline void __init check_timer(void)
assign_irq_vector(0, TARGET_CPUS);
/*
- * Subtle, code in do_timer_interrupt() expects an AEOI
- * mode for the 8259A whenever interrupts are routed
- * through I/O APICs. Also IRQ0 has to be enabled in
- * the 8259A which implies the virtual wire has to be
- * disabled in the local APIC.
+ * As IRQ0 is to be enabled in the 8259A, the virtual
+ * wire has to be disabled in the local APIC.
*/
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
init_8259A(1);
- if (timer_over_8254 > 0)
- enable_8259A_irq(0);
pin1 = find_isa_irq_pin(0, mp_INT);
apic1 = find_isa_irq_apic(0, mp_INT);
@@ -1688,15 +1699,33 @@ static inline void __init check_timer(void)
apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
cfg->vector, apic1, pin1, apic2, pin2);
+ /*
+ * Some BIOS writers are clueless and report the ExtINTA
+ * I/O APIC input from the cascaded 8259A as the timer
+ * interrupt input. So just in case, if only one pin
+ * was found above, try it both directly and through the
+ * 8259A.
+ */
+ if (pin1 == -1) {
+ pin1 = pin2;
+ apic1 = apic2;
+ no_pin1 = 1;
+ } else if (pin2 == -1) {
+ pin2 = pin1;
+ apic2 = apic1;
+ }
+
if (pin1 != -1) {
/*
* Ok, does IRQ0 through the IOAPIC work?
*/
+ if (no_pin1) {
+ add_pin_to_irq(0, apic1, pin1);
+ setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
+ }
unmask_IO_APIC_irq(0);
if (!no_timer_check && timer_irq_works()) {
- nmi_watchdog_default();
if (nmi_watchdog == NMI_IO_APIC) {
- disable_8259A_irq(0);
setup_nmi();
enable_8259A_irq(0);
}
@@ -1705,43 +1734,48 @@ static inline void __init check_timer(void)
goto out;
}
clear_IO_APIC_pin(apic1, pin1);
- apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not "
- "connected to IO-APIC\n");
- }
+ if (!no_pin1)
+ apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: "
+ "8254 timer not connected to IO-APIC\n");
- apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) "
- "through the 8259A ... ");
- if (pin2 != -1) {
+ apic_printk(APIC_VERBOSE,KERN_INFO
+ "...trying to set up timer (IRQ0) "
+ "through the 8259A ... ");
apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
apic2, pin2);
/*
* legacy devices should be connected to IO APIC #0
*/
- setup_ExtINT_IRQ0_pin(apic2, pin2, cfg->vector);
+ replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
+ setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
+ unmask_IO_APIC_irq(0);
+ enable_8259A_irq(0);
if (timer_irq_works()) {
apic_printk(APIC_VERBOSE," works.\n");
- nmi_watchdog_default();
+ timer_through_8259 = 1;
if (nmi_watchdog == NMI_IO_APIC) {
+ disable_8259A_irq(0);
setup_nmi();
+ enable_8259A_irq(0);
}
goto out;
}
/*
* Cleanup, just in case ...
*/
+ disable_8259A_irq(0);
clear_IO_APIC_pin(apic2, pin2);
+ apic_printk(APIC_VERBOSE," failed.\n");
}
- apic_printk(APIC_VERBOSE," failed.\n");
if (nmi_watchdog == NMI_IO_APIC) {
printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
- nmi_watchdog = 0;
+ nmi_watchdog = NMI_NONE;
}
apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
- disable_8259A_irq(0);
- irq_desc[0].chip = &lapic_irq_type;
+ lapic_register_intr(0);
apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
enable_8259A_irq(0);
@@ -1749,6 +1783,7 @@ static inline void __init check_timer(void)
apic_printk(APIC_VERBOSE," works.\n");
goto out;
}
+ disable_8259A_irq(0);
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
apic_printk(APIC_VERBOSE," failed.\n");
@@ -1778,11 +1813,21 @@ static int __init notimercheck(char *s)
__setup("no_timer_check", notimercheck);
/*
- *
- * IRQs that are handled by the PIC in the MPS IOAPIC case.
- * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ.
- * Linux doesn't really care, as it's not actually used
- * for any interrupt handling anyway.
+ * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
+ * to devices. However there may be an I/O APIC pin available for
+ * this interrupt regardless. The pin may be left unconnected, but
+ * typically it will be reused as an ExtINT cascade interrupt for
+ * the master 8259A. In the MPS case such a pin will normally be
+ * reported as an ExtINT interrupt in the MP table. With ACPI
+ * there is no provision for ExtINT interrupts, and in the absence
+ * of an override it would be treated as an ordinary ISA I/O APIC
+ * interrupt, that is edge-triggered and unmasked by default. We
+ * used to do this, but it caused problems on some systems because
+ * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
+ * the same ExtINT cascade interrupt to drive the local APIC of the
+ * bootstrap processor. Therefore we refrain from routing IRQ2 to
+ * the I/O APIC in all cases now. No actual device should request
+ * it anyway. --macro
*/
#define PIC_IRQS (1<<2)
@@ -1793,10 +1838,7 @@ void __init setup_IO_APIC(void)
* calling enable_IO_APIC() is moved to setup_local_APIC for BP
*/
- if (acpi_ioapic)
- io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
- else
- io_apic_irqs = ~PIC_IRQS;
+ io_apic_irqs = ~PIC_IRQS;
apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
@@ -1841,8 +1883,8 @@ static int ioapic_resume(struct sys_device *dev)
spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(dev->id, 0);
- if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
- reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
+ if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
+ reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
io_apic_write(dev->id, 0, reg_00.raw);
}
spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -2242,8 +2284,8 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
return -1;
for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mpc_irqtype == mp_INT &&
- mp_irqs[i].mpc_srcbusirq == bus_irq)
+ if (mp_irqs[i].mp_irqtype == mp_INT &&
+ mp_irqs[i].mp_srcbusirq == bus_irq)
break;
if (i >= mp_irq_entries)
return -1;
@@ -2336,7 +2378,7 @@ void __init ioapic_init_mappings(void)
ioapic_res = ioapic_setup_resources();
for (i = 0; i < nr_ioapics; i++) {
if (smp_found_config) {
- ioapic_phys = mp_ioapics[i].mpc_apicaddr;
+ ioapic_phys = mp_ioapics[i].mp_apicaddr;
} else {
ioapic_phys = (unsigned long)
alloc_bootmem_pages(PAGE_SIZE);
diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c
index c0df7b89ca2..9d98cda39ad 100644
--- a/arch/x86/kernel/ipi.c
+++ b/arch/x86/kernel/ipi.c
@@ -8,7 +8,6 @@
#include <linux/kernel_stat.h>
#include <linux/mc146818rtc.h>
#include <linux/cache.h>
-#include <linux/interrupt.h>
#include <linux/cpu.h>
#include <linux/module.h>
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 147352df28b..47a6f6f1247 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -48,6 +48,29 @@ void ack_bad_irq(unsigned int irq)
#endif
}
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+/* Debugging check for stack overflow: is there less than 1KB free? */
+static int check_stack_overflow(void)
+{
+ long sp;
+
+ __asm__ __volatile__("andl %%esp,%0" :
+ "=r" (sp) : "0" (THREAD_SIZE - 1));
+
+ return sp < (sizeof(struct thread_info) + STACK_WARN);
+}
+
+static void print_stack_overflow(void)
+{
+ printk(KERN_WARNING "low stack detected by irq handler\n");
+ dump_stack();
+}
+
+#else
+static inline int check_stack_overflow(void) { return 0; }
+static inline void print_stack_overflow(void) { }
+#endif
+
#ifdef CONFIG_4KSTACKS
/*
* per-CPU IRQ handling contexts (thread information and stack)
@@ -59,48 +82,29 @@ union irq_ctx {
static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
-#endif
-/*
- * do_IRQ handles all normal device IRQ's (the special
- * SMP cross-CPU interrupts have their own specific
- * handlers).
- */
-unsigned int do_IRQ(struct pt_regs *regs)
-{
- struct pt_regs *old_regs;
- /* high bit used in ret_from_ code */
- int irq = ~regs->orig_ax;
- struct irq_desc *desc = irq_desc + irq;
-#ifdef CONFIG_4KSTACKS
- union irq_ctx *curctx, *irqctx;
- u32 *isp;
-#endif
+static char softirq_stack[NR_CPUS * THREAD_SIZE]
+ __attribute__((__section__(".bss.page_aligned")));
- if (unlikely((unsigned)irq >= NR_IRQS)) {
- printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
- __func__, irq);
- BUG();
- }
+static char hardirq_stack[NR_CPUS * THREAD_SIZE]
+ __attribute__((__section__(".bss.page_aligned")));
- old_regs = set_irq_regs(regs);
- irq_enter();
-#ifdef CONFIG_DEBUG_STACKOVERFLOW
- /* Debugging check for stack overflow: is there less than 1KB free? */
- {
- long sp;
-
- __asm__ __volatile__("andl %%esp,%0" :
- "=r" (sp) : "0" (THREAD_SIZE - 1));
- if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) {
- printk("do_IRQ: stack overflow: %ld\n",
- sp - sizeof(struct thread_info));
- dump_stack();
- }
- }
-#endif
+static void call_on_stack(void *func, void *stack)
+{
+ asm volatile("xchgl %%ebx,%%esp \n"
+ "call *%%edi \n"
+ "movl %%ebx,%%esp \n"
+ : "=b" (stack)
+ : "0" (stack),
+ "D"(func)
+ : "memory", "cc", "edx", "ecx", "eax");
+}
-#ifdef CONFIG_4KSTACKS
+static inline int
+execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
+{
+ union irq_ctx *curctx, *irqctx;
+ u32 *isp, arg1, arg2;
curctx = (union irq_ctx *) current_thread_info();
irqctx = hardirq_ctx[smp_processor_id()];
@@ -111,52 +115,39 @@ unsigned int do_IRQ(struct pt_regs *regs)
* handler) we can't do that and just have to keep using the
* current stack (which is the irq stack already after all)
*/
- if (curctx != irqctx) {
- int arg1, arg2, bx;
-
- /* build the stack frame on the IRQ stack */
- isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
- irqctx->tinfo.task = curctx->tinfo.task;
- irqctx->tinfo.previous_esp = current_stack_pointer;
+ if (unlikely(curctx == irqctx))
+ return 0;
- /*
- * Copy the softirq bits in preempt_count so that the
- * softirq checks work in the hardirq context.
- */
- irqctx->tinfo.preempt_count =
- (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
- (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
-
- asm volatile(
- " xchgl %%ebx,%%esp \n"
- " call *%%edi \n"
- " movl %%ebx,%%esp \n"
- : "=a" (arg1), "=d" (arg2), "=b" (bx)
- : "0" (irq), "1" (desc), "2" (isp),
- "D" (desc->handle_irq)
- : "memory", "cc", "ecx"
- );
- } else
-#endif
- desc->handle_irq(irq, desc);
+ /* build the stack frame on the IRQ stack */
+ isp = (u32 *) ((char*)irqctx + sizeof(*irqctx));
+ irqctx->tinfo.task = curctx->tinfo.task;
+ irqctx->tinfo.previous_esp = current_stack_pointer;
- irq_exit();
- set_irq_regs(old_regs);
+ /*
+ * Copy the softirq bits in preempt_count so that the
+ * softirq checks work in the hardirq context.
+ */
+ irqctx->tinfo.preempt_count =
+ (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
+ (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
+
+ if (unlikely(overflow))
+ call_on_stack(print_stack_overflow, isp);
+
+ asm volatile("xchgl %%ebx,%%esp \n"
+ "call *%%edi \n"
+ "movl %%ebx,%%esp \n"
+ : "=a" (arg1), "=d" (arg2), "=b" (isp)
+ : "0" (irq), "1" (desc), "2" (isp),
+ "D" (desc->handle_irq)
+ : "memory", "cc", "ecx");
return 1;
}
-#ifdef CONFIG_4KSTACKS
-
-static char softirq_stack[NR_CPUS * THREAD_SIZE]
- __attribute__((__section__(".bss.page_aligned")));
-
-static char hardirq_stack[NR_CPUS * THREAD_SIZE]
- __attribute__((__section__(".bss.page_aligned")));
-
/*
* allocate per-cpu stacks for hardirq and for softirq processing
*/
-void irq_ctx_init(int cpu)
+void __cpuinit irq_ctx_init(int cpu)
{
union irq_ctx *irqctx;
@@ -164,25 +155,25 @@ void irq_ctx_init(int cpu)
return;
irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
- irqctx->tinfo.task = NULL;
- irqctx->tinfo.exec_domain = NULL;
- irqctx->tinfo.cpu = cpu;
- irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
- irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
+ irqctx->tinfo.task = NULL;
+ irqctx->tinfo.exec_domain = NULL;
+ irqctx->tinfo.cpu = cpu;
+ irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
+ irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
hardirq_ctx[cpu] = irqctx;
irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE];
- irqctx->tinfo.task = NULL;
- irqctx->tinfo.exec_domain = NULL;
- irqctx->tinfo.cpu = cpu;
- irqctx->tinfo.preempt_count = 0;
- irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
+ irqctx->tinfo.task = NULL;
+ irqctx->tinfo.exec_domain = NULL;
+ irqctx->tinfo.cpu = cpu;
+ irqctx->tinfo.preempt_count = 0;
+ irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
softirq_ctx[cpu] = irqctx;
- printk("CPU %u irqstacks, hard=%p soft=%p\n",
- cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
+ printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
+ cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
}
void irq_ctx_exit(int cpu)
@@ -211,25 +202,56 @@ asmlinkage void do_softirq(void)
/* build the stack frame on the softirq stack */
isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
- asm volatile(
- " xchgl %%ebx,%%esp \n"
- " call __do_softirq \n"
- " movl %%ebx,%%esp \n"
- : "=b"(isp)
- : "0"(isp)
- : "memory", "cc", "edx", "ecx", "eax"
- );
+ call_on_stack(__do_softirq, isp);
/*
* Shouldnt happen, we returned above if in_interrupt():
- */
+ */
WARN_ON_ONCE(softirq_count());
}
local_irq_restore(flags);
}
+
+#else
+static inline int
+execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; }
#endif
/*
+ * do_IRQ handles all normal device IRQ's (the special
+ * SMP cross-CPU interrupts have their own specific
+ * handlers).
+ */
+unsigned int do_IRQ(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs;
+ /* high bit used in ret_from_ code */
+ int overflow, irq = ~regs->orig_ax;
+ struct irq_desc *desc = irq_desc + irq;
+
+ if (unlikely((unsigned)irq >= NR_IRQS)) {
+ printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
+ __func__, irq);
+ BUG();
+ }
+
+ old_regs = set_irq_regs(regs);
+ irq_enter();
+
+ overflow = check_stack_overflow();
+
+ if (!execute_on_irq_stack(overflow, desc, irq)) {
+ if (unlikely(overflow))
+ print_stack_overflow();
+ desc->handle_irq(irq, desc);
+ }
+
+ irq_exit();
+ set_irq_regs(old_regs);
+ return 1;
+}
+
+/*
* Interrupt statistics:
*/
@@ -313,16 +335,20 @@ skip:
per_cpu(irq_stat,j).irq_tlb_count);
seq_printf(p, " TLB shootdowns\n");
#endif
+#ifdef CONFIG_X86_MCE
seq_printf(p, "TRM: ");
for_each_online_cpu(j)
seq_printf(p, "%10u ",
per_cpu(irq_stat,j).irq_thermal_count);
seq_printf(p, " Thermal event interrupts\n");
+#endif
+#ifdef CONFIG_X86_LOCAL_APIC
seq_printf(p, "SPU: ");
for_each_online_cpu(j)
seq_printf(p, "%10u ",
per_cpu(irq_stat,j).irq_spurious_count);
seq_printf(p, " Spurious interrupts\n");
+#endif
seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
#if defined(CONFIG_X86_IO_APIC)
seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
@@ -331,6 +357,40 @@ skip:
return 0;
}
+/*
+ * /proc/stat helpers
+ */
+u64 arch_irq_stat_cpu(unsigned int cpu)
+{
+ u64 sum = nmi_count(cpu);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ sum += per_cpu(irq_stat, cpu).apic_timer_irqs;
+#endif
+#ifdef CONFIG_SMP
+ sum += per_cpu(irq_stat, cpu).irq_resched_count;
+ sum += per_cpu(irq_stat, cpu).irq_call_count;
+ sum += per_cpu(irq_stat, cpu).irq_tlb_count;
+#endif
+#ifdef CONFIG_X86_MCE
+ sum += per_cpu(irq_stat, cpu).irq_thermal_count;
+#endif
+#ifdef CONFIG_X86_LOCAL_APIC
+ sum += per_cpu(irq_stat, cpu).irq_spurious_count;
+#endif
+ return sum;
+}
+
+u64 arch_irq_stat(void)
+{
+ u64 sum = atomic_read(&irq_err_count);
+
+#ifdef CONFIG_X86_IO_APIC
+ sum += atomic_read(&irq_mis_count);
+#endif
+ return sum;
+}
+
#ifdef CONFIG_HOTPLUG_CPU
#include <mach_apic.h>
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 3aac15466a9..1f78b238d8d 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -135,6 +135,7 @@ skip:
seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count);
seq_printf(p, " TLB shootdowns\n");
#endif
+#ifdef CONFIG_X86_MCE
seq_printf(p, "TRM: ");
for_each_online_cpu(j)
seq_printf(p, "%10u ", cpu_pda(j)->irq_thermal_count);
@@ -143,6 +144,7 @@ skip:
for_each_online_cpu(j)
seq_printf(p, "%10u ", cpu_pda(j)->irq_threshold_count);
seq_printf(p, " Threshold APIC interrupts\n");
+#endif
seq_printf(p, "SPU: ");
for_each_online_cpu(j)
seq_printf(p, "%10u ", cpu_pda(j)->irq_spurious_count);
@@ -153,6 +155,32 @@ skip:
}
/*
+ * /proc/stat helpers
+ */
+u64 arch_irq_stat_cpu(unsigned int cpu)
+{
+ u64 sum = cpu_pda(cpu)->__nmi_count;
+
+ sum += cpu_pda(cpu)->apic_timer_irqs;
+#ifdef CONFIG_SMP
+ sum += cpu_pda(cpu)->irq_resched_count;
+ sum += cpu_pda(cpu)->irq_call_count;
+ sum += cpu_pda(cpu)->irq_tlb_count;
+#endif
+#ifdef CONFIG_X86_MCE
+ sum += cpu_pda(cpu)->irq_thermal_count;
+ sum += cpu_pda(cpu)->irq_threshold_count;
+#endif
+ sum += cpu_pda(cpu)->irq_spurious_count;
+ return sum;
+}
+
+u64 arch_irq_stat(void)
+{
+ return atomic_read(&irq_err_count);
+}
+
+/*
* do_IRQ handles all normal device IRQ's (the special
* SMP cross-CPU interrupts have their own specific
* handlers).
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
new file mode 100644
index 00000000000..d66914287ee
--- /dev/null
+++ b/arch/x86/kernel/irqinit_32.c
@@ -0,0 +1,114 @@
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/ioport.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/init.h>
+#include <linux/kernel_stat.h>
+#include <linux/sysdev.h>
+#include <linux/bitops.h>
+
+#include <asm/atomic.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/timer.h>
+#include <asm/pgtable.h>
+#include <asm/delay.h>
+#include <asm/desc.h>
+#include <asm/apic.h>
+#include <asm/arch_hooks.h>
+#include <asm/i8259.h>
+
+
+
+/*
+ * Note that on a 486, we don't want to do a SIGFPE on an irq13
+ * as the irq is unreliable, and exception 16 works correctly
+ * (ie as explained in the intel literature). On a 386, you
+ * can't use exception 16 due to bad IBM design, so we have to
+ * rely on the less exact irq13.
+ *
+ * Careful.. Not only is IRQ13 unreliable, but it is also
+ * leads to races. IBM designers who came up with it should
+ * be shot.
+ */
+
+
+static irqreturn_t math_error_irq(int cpl, void *dev_id)
+{
+ extern void math_error(void __user *);
+ outb(0,0xF0);
+ if (ignore_fpu_irq || !boot_cpu_data.hard_math)
+ return IRQ_NONE;
+ math_error((void __user *)get_irq_regs()->ip);
+ return IRQ_HANDLED;
+}
+
+/*
+ * New motherboards sometimes make IRQ 13 be a PCI interrupt,
+ * so allow interrupt sharing.
+ */
+static struct irqaction fpu_irq = {
+ .handler = math_error_irq,
+ .mask = CPU_MASK_NONE,
+ .name = "fpu",
+};
+
+void __init init_ISA_irqs (void)
+{
+ int i;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ init_bsp_APIC();
+#endif
+ init_8259A(0);
+
+ /*
+ * 16 old-style INTA-cycle interrupts:
+ */
+ for (i = 0; i < 16; i++) {
+ set_irq_chip_and_handler_name(i, &i8259A_chip,
+ handle_level_irq, "XT");
+ }
+}
+
+/* Overridden in paravirt.c */
+void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
+
+void __init native_init_IRQ(void)
+{
+ int i;
+
+ /* all the set up before the call gates are initialised */
+ pre_intr_init_hook();
+
+ /*
+ * Cover the whole vector space, no vector can escape
+ * us. (some of these will be overridden and become
+ * 'special' SMP interrupts)
+ */
+ for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
+ int vector = FIRST_EXTERNAL_VECTOR + i;
+ if (i >= NR_IRQS)
+ break;
+ /* SYSCALL_VECTOR was reserved in trap_init. */
+ if (!test_bit(vector, used_vectors))
+ set_intr_gate(vector, interrupt[i]);
+ }
+
+ /* setup after call gates are initialised (usually add in
+ * the architecture specific gates)
+ */
+ intr_init_hook();
+
+ /*
+ * External FPU? Set up irq13 if so, for
+ * original braindamaged IBM FERR coupling.
+ */
+ if (boot_cpu_data.hard_math && !cpu_has_fpu)
+ setup_irq(FPU_IRQ, &fpu_irq);
+
+ irq_ctx_init(smp_processor_id());
+}
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
new file mode 100644
index 00000000000..0373e88de95
--- /dev/null
+++ b/arch/x86/kernel/irqinit_64.c
@@ -0,0 +1,221 @@
+#include <linux/linkage.h>
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/ioport.h>
+#include <linux/interrupt.h>
+#include <linux/timex.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/init.h>
+#include <linux/kernel_stat.h>
+#include <linux/sysdev.h>
+#include <linux/bitops.h>
+
+#include <asm/acpi.h>
+#include <asm/atomic.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/hw_irq.h>
+#include <asm/pgtable.h>
+#include <asm/delay.h>
+#include <asm/desc.h>
+#include <asm/apic.h>
+#include <asm/i8259.h>
+
+/*
+ * Common place to define all x86 IRQ vectors
+ *
+ * This builds up the IRQ handler stubs using some ugly macros in irq.h
+ *
+ * These macros create the low-level assembly IRQ routines that save
+ * register context and call do_IRQ(). do_IRQ() then does all the
+ * operations that are needed to keep the AT (or SMP IOAPIC)
+ * interrupt-controller happy.
+ */
+
+#define IRQ_NAME2(nr) nr##_interrupt(void)
+#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr)
+
+/*
+ * SMP has a few special interrupts for IPI messages
+ */
+
+#define BUILD_IRQ(nr) \
+ asmlinkage void IRQ_NAME(nr); \
+ asm("\n.p2align\n" \
+ "IRQ" #nr "_interrupt:\n\t" \
+ "push $~(" #nr ") ; " \
+ "jmp common_interrupt");
+
+#define BI(x,y) \
+ BUILD_IRQ(x##y)
+
+#define BUILD_16_IRQS(x) \
+ BI(x,0) BI(x,1) BI(x,2) BI(x,3) \
+ BI(x,4) BI(x,5) BI(x,6) BI(x,7) \
+ BI(x,8) BI(x,9) BI(x,a) BI(x,b) \
+ BI(x,c) BI(x,d) BI(x,e) BI(x,f)
+
+/*
+ * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
+ * (these are usually mapped to vectors 0x30-0x3f)
+ */
+
+/*
+ * The IO-APIC gives us many more interrupt sources. Most of these
+ * are unused but an SMP system is supposed to have enough memory ...
+ * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
+ * across the spectrum, so we really want to be prepared to get all
+ * of these. Plus, more powerful systems might have more than 64
+ * IO-APIC registers.
+ *
+ * (these are usually mapped into the 0x30-0xff vector range)
+ */
+ BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3)
+BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7)
+BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb)
+BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf)
+
+#undef BUILD_16_IRQS
+#undef BI
+
+
+#define IRQ(x,y) \
+ IRQ##x##y##_interrupt
+
+#define IRQLIST_16(x) \
+ IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \
+ IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \
+ IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
+ IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
+
+/* for the irq vectors */
+static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = {
+ IRQLIST_16(0x2), IRQLIST_16(0x3),
+ IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
+ IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
+ IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf)
+};
+
+#undef IRQ
+#undef IRQLIST_16
+
+
+
+
+/*
+ * IRQ2 is cascade interrupt to second interrupt controller
+ */
+
+static struct irqaction irq2 = {
+ .handler = no_action,
+ .mask = CPU_MASK_NONE,
+ .name = "cascade",
+};
+DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
+ [0 ... IRQ0_VECTOR - 1] = -1,
+ [IRQ0_VECTOR] = 0,
+ [IRQ1_VECTOR] = 1,
+ [IRQ2_VECTOR] = 2,
+ [IRQ3_VECTOR] = 3,
+ [IRQ4_VECTOR] = 4,
+ [IRQ5_VECTOR] = 5,
+ [IRQ6_VECTOR] = 6,
+ [IRQ7_VECTOR] = 7,
+ [IRQ8_VECTOR] = 8,
+ [IRQ9_VECTOR] = 9,
+ [IRQ10_VECTOR] = 10,
+ [IRQ11_VECTOR] = 11,
+ [IRQ12_VECTOR] = 12,
+ [IRQ13_VECTOR] = 13,
+ [IRQ14_VECTOR] = 14,
+ [IRQ15_VECTOR] = 15,
+ [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
+};
+
+static void __init init_ISA_irqs (void)
+{
+ int i;
+
+ init_bsp_APIC();
+ init_8259A(0);
+
+ for (i = 0; i < NR_IRQS; i++) {
+ irq_desc[i].status = IRQ_DISABLED;
+ irq_desc[i].action = NULL;
+ irq_desc[i].depth = 1;
+
+ if (i < 16) {
+ /*
+ * 16 old-style INTA-cycle interrupts:
+ */
+ set_irq_chip_and_handler_name(i, &i8259A_chip,
+ handle_level_irq, "XT");
+ } else {
+ /*
+ * 'high' PCI IRQs filled in on demand
+ */
+ irq_desc[i].chip = &no_irq_chip;
+ }
+ }
+}
+
+void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
+
+void __init native_init_IRQ(void)
+{
+ int i;
+
+ init_ISA_irqs();
+ /*
+ * Cover the whole vector space, no vector can escape
+ * us. (some of these will be overridden and become
+ * 'special' SMP interrupts)
+ */
+ for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
+ int vector = FIRST_EXTERNAL_VECTOR + i;
+ if (vector != IA32_SYSCALL_VECTOR)
+ set_intr_gate(vector, interrupt[i]);
+ }
+
+#ifdef CONFIG_SMP
+ /*
+ * The reschedule interrupt is a CPU-to-CPU reschedule-helper
+ * IPI, driven by wakeup.
+ */
+ alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
+
+ /* IPIs for invalidation */
+ alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
+ alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
+ alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
+ alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
+ alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
+ alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
+ alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
+ alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
+
+ /* IPI for generic function call */
+ alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
+
+ /* IPI for generic single function call */
+ alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
+ call_function_single_interrupt);
+
+ /* Low priority IPI to cleanup after moving an irq */
+ set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
+#endif
+ alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+ alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
+
+ /* self generated IPI for local APIC timer */
+ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
+
+ /* IPI vectors for APIC spurious and error interrupts */
+ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
+ alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+
+ if (!acpi_ioapic)
+ setup_irq(2, &irq2);
+}
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 08a30986d47..87edf1ceb1d 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -18,6 +18,7 @@
#include <linux/clocksource.h>
#include <linux/kvm_para.h>
+#include <asm/pvclock.h>
#include <asm/arch_hooks.h>
#include <asm/msr.h>
#include <asm/apic.h>
@@ -36,18 +37,9 @@ static int parse_no_kvmclock(char *arg)
early_param("no-kvmclock", parse_no_kvmclock);
/* The hypervisor will put information about time periodically here */
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct kvm_vcpu_time_info, hv_clock);
-#define get_clock(cpu, field) per_cpu(hv_clock, cpu).field
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct pvclock_vcpu_time_info, hv_clock);
+static struct pvclock_wall_clock wall_clock;
-static inline u64 kvm_get_delta(u64 last_tsc)
-{
- int cpu = smp_processor_id();
- u64 delta = native_read_tsc() - last_tsc;
- return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE;
-}
-
-static struct kvm_wall_clock wall_clock;
-static cycle_t kvm_clock_read(void);
/*
* The wallclock is the time of day when we booted. Since then, some time may
* have elapsed since the hypervisor wrote the data. So we try to account for
@@ -55,64 +47,37 @@ static cycle_t kvm_clock_read(void);
*/
static unsigned long kvm_get_wallclock(void)
{
- u32 wc_sec, wc_nsec;
- u64 delta;
+ struct pvclock_vcpu_time_info *vcpu_time;
struct timespec ts;
- int version, nsec;
int low, high;
low = (int)__pa(&wall_clock);
high = ((u64)__pa(&wall_clock) >> 32);
+ native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
- delta = kvm_clock_read();
+ vcpu_time = &get_cpu_var(hv_clock);
+ pvclock_read_wallclock(&wall_clock, vcpu_time, &ts);
+ put_cpu_var(hv_clock);
- native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
- do {
- version = wall_clock.wc_version;
- rmb();
- wc_sec = wall_clock.wc_sec;
- wc_nsec = wall_clock.wc_nsec;
- rmb();
- } while ((wall_clock.wc_version != version) || (version & 1));
-
- delta = kvm_clock_read() - delta;
- delta += wc_nsec;
- nsec = do_div(delta, NSEC_PER_SEC);
- set_normalized_timespec(&ts, wc_sec + delta, nsec);
- /*
- * Of all mechanisms of time adjustment I've tested, this one
- * was the champion!
- */
- return ts.tv_sec + 1;
+ return ts.tv_sec;
}
static int kvm_set_wallclock(unsigned long now)
{
- return 0;
+ return -1;
}
-/*
- * This is our read_clock function. The host puts an tsc timestamp each time
- * it updates a new time. Without the tsc adjustment, we can have a situation
- * in which a vcpu starts to run earlier (smaller system_time), but probes
- * time later (compared to another vcpu), leading to backwards time
- */
static cycle_t kvm_clock_read(void)
{
- u64 last_tsc, now;
- int cpu;
+ struct pvclock_vcpu_time_info *src;
+ cycle_t ret;
- preempt_disable();
- cpu = smp_processor_id();
-
- last_tsc = get_clock(cpu, tsc_timestamp);
- now = get_clock(cpu, system_time);
-
- now += kvm_get_delta(last_tsc);
- preempt_enable();
-
- return now;
+ src = &get_cpu_var(hv_clock);
+ ret = pvclock_clocksource_read(src);
+ put_cpu_var(hv_clock);
+ return ret;
}
+
static struct clocksource kvm_clock = {
.name = "kvm-clock",
.read = kvm_clock_read,
@@ -123,13 +88,14 @@ static struct clocksource kvm_clock = {
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
-static int kvm_register_clock(void)
+static int kvm_register_clock(char *txt)
{
int cpu = smp_processor_id();
int low, high;
low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1;
high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
-
+ printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
+ cpu, high, low, txt);
return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high);
}
@@ -140,12 +106,20 @@ static void kvm_setup_secondary_clock(void)
* Now that the first cpu already had this clocksource initialized,
* we shouldn't fail.
*/
- WARN_ON(kvm_register_clock());
+ WARN_ON(kvm_register_clock("secondary cpu clock"));
/* ok, done with our trickery, call native */
setup_secondary_APIC_clock();
}
#endif
+#ifdef CONFIG_SMP
+void __init kvm_smp_prepare_boot_cpu(void)
+{
+ WARN_ON(kvm_register_clock("primary cpu clock"));
+ native_smp_prepare_boot_cpu();
+}
+#endif
+
/*
* After the clock is registered, the host will keep writing to the
* registered memory location. If the guest happens to shutdown, this memory
@@ -174,7 +148,7 @@ void __init kvmclock_init(void)
return;
if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
- if (kvm_register_clock())
+ if (kvm_register_clock("boot clock"))
return;
pv_time_ops.get_wallclock = kvm_get_wallclock;
pv_time_ops.set_wallclock = kvm_set_wallclock;
@@ -182,6 +156,9 @@ void __init kvmclock_init(void)
#ifdef CONFIG_X86_LOCAL_APIC
pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
#endif
+#ifdef CONFIG_SMP
+ smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
+#endif
machine_ops.shutdown = kvm_shutdown;
#ifdef CONFIG_KEXEC
machine_ops.crash_shutdown = kvm_crash_shutdown;
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 0224c3637c7..a8449571858 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -20,9 +20,9 @@
#include <asm/mmu_context.h>
#ifdef CONFIG_SMP
-static void flush_ldt(void *null)
+static void flush_ldt(void *current_mm)
{
- if (current->active_mm)
+ if (current->active_mm == current_mm)
load_LDT(&current->active_mm->context);
}
#endif
@@ -68,7 +68,7 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
load_LDT(pc);
mask = cpumask_of_cpu(smp_processor_id());
if (!cpus_equal(current->mm->cpu_vm_mask, mask))
- smp_call_function(flush_ldt, NULL, 1, 1);
+ smp_call_function(flush_ldt, current->mm, 1);
preempt_enable();
#else
load_LDT(pc);
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index d0b234c9fc3..8864230d55a 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -11,6 +11,8 @@
#include <linux/delay.h>
#include <linux/init.h>
#include <linux/numa.h>
+#include <linux/ftrace.h>
+
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
@@ -39,7 +41,7 @@ static void set_idt(void *newidt, __u16 limit)
curidt.address = (unsigned long)newidt;
load_idt(&curidt);
-};
+}
static void set_gdt(void *newgdt, __u16 limit)
@@ -51,7 +53,7 @@ static void set_gdt(void *newgdt, __u16 limit)
curgdt.address = (unsigned long)newgdt;
load_gdt(&curgdt);
-};
+}
static void load_segments(void)
{
@@ -107,6 +109,8 @@ NORET_TYPE void machine_kexec(struct kimage *image)
unsigned long page_list[PAGES_NR];
void *control_page;
+ tracer_disable();
+
/* Interrupts aren't acceptable while we reboot */
local_irq_disable();
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 576a03db451..9dd9262693a 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -11,6 +11,8 @@
#include <linux/string.h>
#include <linux/reboot.h>
#include <linux/numa.h>
+#include <linux/ftrace.h>
+
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
@@ -110,7 +112,7 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
{
pgd_t *level4p;
level4p = (pgd_t *)__va(start_pgtable);
- return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
+ return init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT);
}
static void set_idt(void *newidt, u16 limit)
@@ -184,6 +186,8 @@ NORET_TYPE void machine_kexec(struct kimage *image)
unsigned long page_list[PAGES_NR];
void *control_page;
+ tracer_disable();
+
/* Interrupts aren't acceptable while we reboot */
local_irq_disable();
diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c
index 69729e38b78..56b933119a0 100644
--- a/arch/x86/kernel/microcode.c
+++ b/arch/x86/kernel/microcode.c
@@ -5,13 +5,14 @@
* 2006 Shaohua Li <shaohua.li@intel.com>
*
* This driver allows to upgrade microcode on Intel processors
- * belonging to IA-32 family - PentiumPro, Pentium II,
+ * belonging to IA-32 family - PentiumPro, Pentium II,
* Pentium III, Xeon, Pentium 4, etc.
*
- * Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual,
- * Order Number 245472 or free download from:
- *
- * http://developer.intel.com/design/pentium4/manuals/245472.htm
+ * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture
+ * Software Developer's Manual
+ * Order Number 253668 or free download from:
+ *
+ * http://developer.intel.com/design/pentium4/manuals/253668.htm
*
* For more information, go to http://www.urbanmyth.org/microcode
*
@@ -58,12 +59,12 @@
* nature of implementation.
* 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
* Fix the panic when writing zero-length microcode chunk.
- * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
+ * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
* Jun Nakajima <jun.nakajima@intel.com>
* Support for the microcode updates in the new format.
* 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
* Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
- * because we no longer hold a copy of applied microcode
+ * because we no longer hold a copy of applied microcode
* in kernel memory.
* 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
* Fix sigmatch() macro to handle old CPUs with pf == 0.
@@ -75,6 +76,7 @@
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/sched.h>
+#include <linux/smp_lock.h>
#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/slab.h>
@@ -320,11 +322,11 @@ static void apply_microcode(int cpu)
return;
/* serialize access to the physical write to MSR 0x79 */
- spin_lock_irqsave(&microcode_update_lock, flags);
+ spin_lock_irqsave(&microcode_update_lock, flags);
/* write microcode via MSR 0x79 */
wrmsr(MSR_IA32_UCODE_WRITE,
- (unsigned long) uci->mc->bits,
+ (unsigned long) uci->mc->bits,
(unsigned long) uci->mc->bits >> 16 >> 16);
wrmsr(MSR_IA32_UCODE_REV, 0, 0);
@@ -341,7 +343,7 @@ static void apply_microcode(int cpu)
return;
}
printk(KERN_INFO "microcode: CPU%d updated from revision "
- "0x%x to 0x%x, date = %08x \n",
+ "0x%x to 0x%x, date = %08x \n",
cpu_num, uci->rev, val[1], uci->mc->hdr.date);
uci->rev = val[1];
}
@@ -422,6 +424,7 @@ out:
static int microcode_open (struct inode *unused1, struct file *unused2)
{
+ cycle_kernel_lock();
return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
}
@@ -488,7 +491,7 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
#define microcode_dev_exit() do { } while(0)
#endif
-static long get_next_ucode_from_buffer(void **mc, void *buf,
+static long get_next_ucode_from_buffer(void **mc, const u8 *buf,
unsigned long size, long offset)
{
microcode_header_t *mc_header;
@@ -522,7 +525,7 @@ static int cpu_request_microcode(int cpu)
char name[30];
struct cpuinfo_x86 *c = &cpu_data(cpu);
const struct firmware *firmware;
- void *buf;
+ const u8 *buf;
unsigned long size;
long offset = 0;
int error;
@@ -534,7 +537,7 @@ static int cpu_request_microcode(int cpu)
c->x86, c->x86_model, c->x86_mask);
error = request_firmware(&firmware, name, &microcode_pdev->dev);
if (error) {
- pr_debug("microcode: ucode data file %s load failed\n", name);
+ pr_debug("microcode: data file %s load failed\n", name);
return error;
}
buf = firmware->data;
@@ -805,6 +808,9 @@ static int __init microcode_init (void)
{
int error;
+ printk(KERN_INFO
+ "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
+
error = microcode_dev_init();
if (error)
return error;
@@ -825,9 +831,6 @@ static int __init microcode_init (void)
}
register_hotcpu_notifier(&mc_cpu_notifier);
-
- printk(KERN_INFO
- "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
return 0;
}
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index edc5fbfe85c..fdfdc550b36 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -12,6 +12,7 @@
#include <asm/io.h>
#include <asm/msr.h>
#include <asm/acpi.h>
+#include <asm/mmconfig.h>
#include "../pci/pci.h"
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 404683b94e7..3b25e49380c 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -25,6 +25,8 @@
#include <asm/proto.h>
#include <asm/acpi.h>
#include <asm/bios_ebda.h>
+#include <asm/e820.h>
+#include <asm/trampoline.h>
#include <mach_apic.h>
#ifdef CONFIG_X86_32
@@ -32,28 +34,6 @@
#include <mach_mpparse.h>
#endif
-/* Have we found an MP table */
-int smp_found_config;
-
-/*
- * Various Linux-internal data structures created from the
- * MP-table.
- */
-#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
-int mp_bus_id_to_type[MAX_MP_BUSSES];
-#endif
-
-DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
-int mp_bus_id_to_pci_bus[MAX_MP_BUSSES] = {[0 ... MAX_MP_BUSSES - 1] = -1 };
-
-static int mp_current_pci_id;
-
-int pic_mode;
-
-/*
- * Intel MP BIOS table parsing routines:
- */
-
/*
* Checksum an MP configuration block.
*/
@@ -69,15 +49,73 @@ static int __init mpf_checksum(unsigned char *mp, int len)
}
#ifdef CONFIG_X86_NUMAQ
+int found_numaq;
/*
* Have to match translation table entries to main table entries by counter
* hence the mpc_record variable .... can't see a less disgusting way of
* doing this ....
*/
+struct mpc_config_translation {
+ unsigned char mpc_type;
+ unsigned char trans_len;
+ unsigned char trans_type;
+ unsigned char trans_quad;
+ unsigned char trans_global;
+ unsigned char trans_local;
+ unsigned short trans_reserved;
+};
+
static int mpc_record;
static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
__cpuinitdata;
+
+static inline int generate_logical_apicid(int quad, int phys_apicid)
+{
+ return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
+}
+
+
+static inline int mpc_apic_id(struct mpc_config_processor *m,
+ struct mpc_config_translation *translation_record)
+{
+ int quad = translation_record->trans_quad;
+ int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid);
+
+ printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
+ m->mpc_apicid,
+ (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
+ (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
+ m->mpc_apicver, quad, logical_apicid);
+ return logical_apicid;
+}
+
+int mp_bus_id_to_node[MAX_MP_BUSSES];
+
+int mp_bus_id_to_local[MAX_MP_BUSSES];
+
+static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name,
+ struct mpc_config_translation *translation)
+{
+ int quad = translation->trans_quad;
+ int local = translation->trans_local;
+
+ mp_bus_id_to_node[m->mpc_busid] = quad;
+ mp_bus_id_to_local[m->mpc_busid] = local;
+ printk(KERN_INFO "Bus #%d is %s (node %d)\n",
+ m->mpc_busid, name, quad);
+}
+
+int quad_local_to_mp_bus_id [NR_CPUS/4][4];
+static void mpc_oem_pci_bus(struct mpc_config_bus *m,
+ struct mpc_config_translation *translation)
+{
+ int quad = translation->trans_quad;
+ int local = translation->trans_local;
+
+ quad_local_to_mp_bus_id[quad][local] = m->mpc_busid;
+}
+
#endif
static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
@@ -90,7 +128,10 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
return;
}
#ifdef CONFIG_X86_NUMAQ
- apicid = mpc_apic_id(m, translation_table[mpc_record]);
+ if (found_numaq)
+ apicid = mpc_apic_id(m, translation_table[mpc_record]);
+ else
+ apicid = m->mpc_apicid;
#else
apicid = m->mpc_apicid;
#endif
@@ -103,17 +144,18 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
generic_processor_info(apicid, m->mpc_apicver);
}
+#ifdef CONFIG_X86_IO_APIC
static void __init MP_bus_info(struct mpc_config_bus *m)
{
char str[7];
-
memcpy(str, m->mpc_bustype, 6);
str[6] = 0;
#ifdef CONFIG_X86_NUMAQ
- mpc_oem_bus_info(m, str, translation_table[mpc_record]);
+ if (found_numaq)
+ mpc_oem_bus_info(m, str, translation_table[mpc_record]);
#else
- Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
+ printk(KERN_INFO "Bus #%d is %s\n", m->mpc_busid, str);
#endif
#if MAX_MP_BUSSES < 256
@@ -132,11 +174,10 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
#endif
} else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
#ifdef CONFIG_X86_NUMAQ
- mpc_oem_pci_bus(m, translation_table[mpc_record]);
+ if (found_numaq)
+ mpc_oem_pci_bus(m, translation_table[mpc_record]);
#endif
clear_bit(m->mpc_busid, mp_bus_not_pci);
- mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
- mp_current_pci_id++;
#if defined(CONFIG_EISA) || defined (CONFIG_MCA)
mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
} else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
@@ -147,6 +188,7 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
} else
printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
}
+#endif
#ifdef CONFIG_X86_IO_APIC
@@ -176,18 +218,89 @@ static void __init MP_ioapic_info(struct mpc_config_ioapic *m)
if (bad_ioapic(m->mpc_apicaddr))
return;
- mp_ioapics[nr_ioapics] = *m;
+ mp_ioapics[nr_ioapics].mp_apicaddr = m->mpc_apicaddr;
+ mp_ioapics[nr_ioapics].mp_apicid = m->mpc_apicid;
+ mp_ioapics[nr_ioapics].mp_type = m->mpc_type;
+ mp_ioapics[nr_ioapics].mp_apicver = m->mpc_apicver;
+ mp_ioapics[nr_ioapics].mp_flags = m->mpc_flags;
nr_ioapics++;
}
-static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
+static void print_MP_intsrc_info(struct mpc_config_intsrc *m)
{
- mp_irqs[mp_irq_entries] = *m;
- Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
+ printk(KERN_CONT "Int: type %d, pol %d, trig %d, bus %02x,"
" IRQ %02x, APIC ID %x, APIC INT %02x\n",
m->mpc_irqtype, m->mpc_irqflag & 3,
(m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
+}
+
+static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq)
+{
+ printk(KERN_CONT "Int: type %d, pol %d, trig %d, bus %02x,"
+ " IRQ %02x, APIC ID %x, APIC INT %02x\n",
+ mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3,
+ (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus,
+ mp_irq->mp_srcbusirq, mp_irq->mp_dstapic, mp_irq->mp_dstirq);
+}
+
+static void __init assign_to_mp_irq(struct mpc_config_intsrc *m,
+ struct mp_config_intsrc *mp_irq)
+{
+ mp_irq->mp_dstapic = m->mpc_dstapic;
+ mp_irq->mp_type = m->mpc_type;
+ mp_irq->mp_irqtype = m->mpc_irqtype;
+ mp_irq->mp_irqflag = m->mpc_irqflag;
+ mp_irq->mp_srcbus = m->mpc_srcbus;
+ mp_irq->mp_srcbusirq = m->mpc_srcbusirq;
+ mp_irq->mp_dstirq = m->mpc_dstirq;
+}
+
+static void __init assign_to_mpc_intsrc(struct mp_config_intsrc *mp_irq,
+ struct mpc_config_intsrc *m)
+{
+ m->mpc_dstapic = mp_irq->mp_dstapic;
+ m->mpc_type = mp_irq->mp_type;
+ m->mpc_irqtype = mp_irq->mp_irqtype;
+ m->mpc_irqflag = mp_irq->mp_irqflag;
+ m->mpc_srcbus = mp_irq->mp_srcbus;
+ m->mpc_srcbusirq = mp_irq->mp_srcbusirq;
+ m->mpc_dstirq = mp_irq->mp_dstirq;
+}
+
+static int __init mp_irq_mpc_intsrc_cmp(struct mp_config_intsrc *mp_irq,
+ struct mpc_config_intsrc *m)
+{
+ if (mp_irq->mp_dstapic != m->mpc_dstapic)
+ return 1;
+ if (mp_irq->mp_type != m->mpc_type)
+ return 2;
+ if (mp_irq->mp_irqtype != m->mpc_irqtype)
+ return 3;
+ if (mp_irq->mp_irqflag != m->mpc_irqflag)
+ return 4;
+ if (mp_irq->mp_srcbus != m->mpc_srcbus)
+ return 5;
+ if (mp_irq->mp_srcbusirq != m->mpc_srcbusirq)
+ return 6;
+ if (mp_irq->mp_dstirq != m->mpc_dstirq)
+ return 7;
+
+ return 0;
+}
+
+static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
+{
+ int i;
+
+ print_MP_intsrc_info(m);
+
+ for (i = 0; i < mp_irq_entries; i++) {
+ if (!mp_irq_mpc_intsrc_cmp(&mp_irqs[i], m))
+ return;
+ }
+
+ assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
if (++mp_irq_entries == MAX_IRQ_SOURCES)
panic("Max # of irq sources exceeded!!\n");
}
@@ -196,7 +309,7 @@ static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
{
- Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
+ printk(KERN_INFO "Lint: type %d, pol %d, trig %d, bus %02x,"
" IRQ %02x, APIC ID %x, APIC LINT %02x\n",
m->mpc_irqtype, m->mpc_irqflag & 3,
(m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
@@ -266,11 +379,14 @@ static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
}
}
-static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
+void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
char *productid)
{
if (strncmp(oem, "IBM NUMA", 8))
- printk("Warning! May not be a NUMA-Q system!\n");
+ printk("Warning! Not a NUMA-Q system!\n");
+ else
+ found_numaq = 1;
+
if (mpc->mpc_oemptr)
smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr,
mpc->mpc_oemsize);
@@ -281,12 +397,9 @@ static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
* Read/parse the MPC
*/
-static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
+static int __init smp_check_mpc(struct mp_config_table *mpc, char *oem,
+ char *str)
{
- char str[16];
- char oem[10];
- int count = sizeof(*mpc);
- unsigned char *mpt = ((unsigned char *)mpc) + count;
if (memcmp(mpc->mpc_signature, MPC_SIGNATURE, 4)) {
printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n",
@@ -309,19 +422,42 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
}
memcpy(oem, mpc->mpc_oem, 8);
oem[8] = 0;
- printk(KERN_INFO "MPTABLE: OEM ID: %s ", oem);
+ printk(KERN_INFO "MPTABLE: OEM ID: %s\n", oem);
memcpy(str, mpc->mpc_productid, 12);
str[12] = 0;
- printk("Product ID: %s ", str);
-#ifdef CONFIG_X86_32
- mps_oem_check(mpc, oem, str);
-#endif
- printk(KERN_INFO "MPTABLE: Product ID: %s ", str);
+ printk(KERN_INFO "MPTABLE: Product ID: %s\n", str);
printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->mpc_lapic);
+ return 1;
+}
+
+static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
+{
+ char str[16];
+ char oem[10];
+
+ int count = sizeof(*mpc);
+ unsigned char *mpt = ((unsigned char *)mpc) + count;
+
+ if (!smp_check_mpc(mpc, oem, str))
+ return 0;
+
+#ifdef CONFIG_X86_32
+ /*
+ * need to make sure summit and es7000's mps_oem_check is safe to be
+ * called early via genericarch 's mps_oem_check
+ */
+ if (early) {
+#ifdef CONFIG_X86_NUMAQ
+ numaq_mps_oem_check(mpc, oem, str);
+#endif
+ } else
+ mps_oem_check(mpc, oem, str);
+#endif
+
/* save the local APIC address, it might be non-default */
if (!acpi_lapic)
mp_lapic_addr = mpc->mpc_lapic;
@@ -352,7 +488,9 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
{
struct mpc_config_bus *m =
(struct mpc_config_bus *)mpt;
+#ifdef CONFIG_X86_IO_APIC
MP_bus_info(m);
+#endif
mpt += sizeof(*m);
count += sizeof(*m);
break;
@@ -402,6 +540,11 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
++mpc_record;
#endif
}
+
+#ifdef CONFIG_X86_GENERICARCH
+ generic_bigsmp_probe();
+#endif
+
setup_apic_routing();
if (!num_processors)
printk(KERN_ERR "MPTABLE: no processors registered!\n");
@@ -427,7 +570,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
intsrc.mpc_type = MP_INTSRC;
intsrc.mpc_irqflag = 0; /* conforming */
intsrc.mpc_srcbus = 0;
- intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
+ intsrc.mpc_dstapic = mp_ioapics[0].mp_apicid;
intsrc.mpc_irqtype = mp_INT;
@@ -488,40 +631,11 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
MP_intsrc_info(&intsrc);
}
-#endif
-static inline void __init construct_default_ISA_mptable(int mpc_default_type)
+static void construct_ioapic_table(int mpc_default_type)
{
- struct mpc_config_processor processor;
- struct mpc_config_bus bus;
-#ifdef CONFIG_X86_IO_APIC
struct mpc_config_ioapic ioapic;
-#endif
- struct mpc_config_lintsrc lintsrc;
- int linttypes[2] = { mp_ExtINT, mp_NMI };
- int i;
-
- /*
- * local APIC has default address
- */
- mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-
- /*
- * 2 CPUs, numbered 0 & 1.
- */
- processor.mpc_type = MP_PROCESSOR;
- /* Either an integrated APIC or a discrete 82489DX. */
- processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
- processor.mpc_cpuflag = CPU_ENABLED;
- processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
- (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
- processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
- processor.mpc_reserved[0] = 0;
- processor.mpc_reserved[1] = 0;
- for (i = 0; i < 2; i++) {
- processor.mpc_apicid = i;
- MP_processor_info(&processor);
- }
+ struct mpc_config_bus bus;
bus.mpc_type = MP_BUS;
bus.mpc_busid = 0;
@@ -550,7 +664,6 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
MP_bus_info(&bus);
}
-#ifdef CONFIG_X86_IO_APIC
ioapic.mpc_type = MP_IOAPIC;
ioapic.mpc_apicid = 2;
ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
@@ -562,7 +675,42 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
* We set up most of the low 16 IO-APIC pins according to MPS rules.
*/
construct_default_ioirq_mptable(mpc_default_type);
+}
+#else
+static inline void construct_ioapic_table(int mpc_default_type) { }
#endif
+
+static inline void __init construct_default_ISA_mptable(int mpc_default_type)
+{
+ struct mpc_config_processor processor;
+ struct mpc_config_lintsrc lintsrc;
+ int linttypes[2] = { mp_ExtINT, mp_NMI };
+ int i;
+
+ /*
+ * local APIC has default address
+ */
+ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+
+ /*
+ * 2 CPUs, numbered 0 & 1.
+ */
+ processor.mpc_type = MP_PROCESSOR;
+ /* Either an integrated APIC or a discrete 82489DX. */
+ processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
+ processor.mpc_cpuflag = CPU_ENABLED;
+ processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
+ (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
+ processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
+ processor.mpc_reserved[0] = 0;
+ processor.mpc_reserved[1] = 0;
+ for (i = 0; i < 2; i++) {
+ processor.mpc_apicid = i;
+ MP_processor_info(&processor);
+ }
+
+ construct_ioapic_table(mpc_default_type);
+
lintsrc.mpc_type = MP_LINTSRC;
lintsrc.mpc_irqflag = 0; /* conforming */
lintsrc.mpc_srcbusid = 0;
@@ -578,12 +726,22 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
static struct intel_mp_floating *mpf_found;
/*
+ * Machine specific quirk for finding the SMP config before other setup
+ * activities destroy the table:
+ */
+int (*mach_get_smp_config_quirk)(unsigned int early);
+
+/*
* Scan the memory blocks for an SMP configuration block.
*/
-static void __init __get_smp_config(unsigned early)
+static void __init __get_smp_config(unsigned int early)
{
struct intel_mp_floating *mpf = mpf_found;
+ if (mach_get_smp_config_quirk) {
+ if (mach_get_smp_config_quirk(early))
+ return;
+ }
if (acpi_lapic && early)
return;
/*
@@ -600,7 +758,7 @@ static void __init __get_smp_config(unsigned early)
printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
mpf->mpf_specification);
-#ifdef CONFIG_X86_32
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
if (mpf->mpf_feature2 & (1 << 7)) {
printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
pic_mode = 1;
@@ -632,7 +790,9 @@ static void __init __get_smp_config(unsigned early)
* override the defaults.
*/
if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr), early)) {
+#ifdef CONFIG_X86_LOCAL_APIC
smp_found_config = 0;
+#endif
printk(KERN_ERR
"BIOS bug, MP table errors detected!...\n");
printk(KERN_ERR "... disabling SMP support. "
@@ -689,7 +849,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
unsigned int *bp = phys_to_virt(base);
struct intel_mp_floating *mpf;
- Dprintk("Scan SMP from %p for %ld bytes.\n", bp, length);
+ printk(KERN_DEBUG "Scan SMP from %p for %ld bytes.\n", bp, length);
BUILD_BUG_ON(sizeof(*mpf) != 16);
while (length > 0) {
@@ -699,15 +859,21 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
!mpf_checksum((unsigned char *)bp, 16) &&
((mpf->mpf_specification == 1)
|| (mpf->mpf_specification == 4))) {
-
+#ifdef CONFIG_X86_LOCAL_APIC
smp_found_config = 1;
+#endif
mpf_found = mpf;
-#ifdef CONFIG_X86_32
+
printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
mpf, virt_to_phys(mpf));
- reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE,
+
+ if (!reserve)
+ return 1;
+ reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
BOOTMEM_DEFAULT);
if (mpf->mpf_physptr) {
+ unsigned long size = PAGE_SIZE;
+#ifdef CONFIG_X86_32
/*
* We cannot access to MPC table to compute
* table size yet, as only few megabytes from
@@ -717,24 +883,15 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
* PAGE_SIZE from mpg->mpf_physptr yields BUG()
* in reserve_bootmem.
*/
- unsigned long size = PAGE_SIZE;
unsigned long end = max_low_pfn * PAGE_SIZE;
if (mpf->mpf_physptr + size > end)
size = end - mpf->mpf_physptr;
- reserve_bootmem(mpf->mpf_physptr, size,
+#endif
+ reserve_bootmem_generic(mpf->mpf_physptr, size,
BOOTMEM_DEFAULT);
}
-#else
- if (!reserve)
- return 1;
-
- reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE);
- if (mpf->mpf_physptr)
- reserve_bootmem_generic(mpf->mpf_physptr,
- PAGE_SIZE);
-#endif
- return 1;
+ return 1;
}
bp += 4;
length -= 16;
@@ -742,10 +899,16 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
return 0;
}
-static void __init __find_smp_config(unsigned reserve)
+int (*mach_find_smp_config_quirk)(unsigned int reserve);
+
+static void __init __find_smp_config(unsigned int reserve)
{
unsigned int address;
+ if (mach_find_smp_config_quirk) {
+ if (mach_find_smp_config_quirk(reserve))
+ return;
+ }
/*
* FIXME: Linux assumes you have 640K of base ram..
* this continues the error...
@@ -790,298 +953,294 @@ void __init find_smp_config(void)
__find_smp_config(1);
}
-/* --------------------------------------------------------------------------
- ACPI-based MP Configuration
- -------------------------------------------------------------------------- */
+#ifdef CONFIG_X86_IO_APIC
+static u8 __initdata irq_used[MAX_IRQ_SOURCES];
-/*
- * Keep this outside and initialized to 0, for !CONFIG_ACPI builds:
- */
-int es7000_plat;
+static int __init get_MP_intsrc_index(struct mpc_config_intsrc *m)
+{
+ int i;
-#ifdef CONFIG_ACPI
+ if (m->mpc_irqtype != mp_INT)
+ return 0;
-#ifdef CONFIG_X86_IO_APIC
+ if (m->mpc_irqflag != 0x0f)
+ return 0;
-#define MP_ISA_BUS 0
+ /* not legacy */
-extern struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS];
+ for (i = 0; i < mp_irq_entries; i++) {
+ if (mp_irqs[i].mp_irqtype != mp_INT)
+ continue;
-static int mp_find_ioapic(int gsi)
-{
- int i = 0;
+ if (mp_irqs[i].mp_irqflag != 0x0f)
+ continue;
- /* Find the IOAPIC that manages this GSI. */
- for (i = 0; i < nr_ioapics; i++) {
- if ((gsi >= mp_ioapic_routing[i].gsi_base)
- && (gsi <= mp_ioapic_routing[i].gsi_end))
- return i;
+ if (mp_irqs[i].mp_srcbus != m->mpc_srcbus)
+ continue;
+ if (mp_irqs[i].mp_srcbusirq != m->mpc_srcbusirq)
+ continue;
+ if (irq_used[i]) {
+ /* already claimed */
+ return -2;
+ }
+ irq_used[i] = 1;
+ return i;
}
- printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
+ /* not found */
return -1;
}
-static u8 __init uniq_ioapic_id(u8 id)
-{
-#ifdef CONFIG_X86_32
- if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
- !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
- return io_apic_get_unique_id(nr_ioapics, id);
- else
- return id;
-#else
- int i;
- DECLARE_BITMAP(used, 256);
- bitmap_zero(used, 256);
- for (i = 0; i < nr_ioapics; i++) {
- struct mpc_config_ioapic *ia = &mp_ioapics[i];
- __set_bit(ia->mpc_apicid, used);
- }
- if (!test_bit(id, used))
- return id;
- return find_first_zero_bit(used, 256);
+#define SPARE_SLOT_NUM 20
+
+static struct mpc_config_intsrc __initdata *m_spare[SPARE_SLOT_NUM];
#endif
-}
-void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
+static int __init replace_intsrc_all(struct mp_config_table *mpc,
+ unsigned long mpc_new_phys,
+ unsigned long mpc_new_length)
{
- int idx = 0;
-
- if (bad_ioapic(address))
- return;
+#ifdef CONFIG_X86_IO_APIC
+ int i;
+ int nr_m_spare = 0;
+#endif
- idx = nr_ioapics;
+ int count = sizeof(*mpc);
+ unsigned char *mpt = ((unsigned char *)mpc) + count;
- mp_ioapics[idx].mpc_type = MP_IOAPIC;
- mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
- mp_ioapics[idx].mpc_apicaddr = address;
+ printk(KERN_INFO "mpc_length %x\n", mpc->mpc_length);
+ while (count < mpc->mpc_length) {
+ switch (*mpt) {
+ case MP_PROCESSOR:
+ {
+ struct mpc_config_processor *m =
+ (struct mpc_config_processor *)mpt;
+ mpt += sizeof(*m);
+ count += sizeof(*m);
+ break;
+ }
+ case MP_BUS:
+ {
+ struct mpc_config_bus *m =
+ (struct mpc_config_bus *)mpt;
+ mpt += sizeof(*m);
+ count += sizeof(*m);
+ break;
+ }
+ case MP_IOAPIC:
+ {
+ mpt += sizeof(struct mpc_config_ioapic);
+ count += sizeof(struct mpc_config_ioapic);
+ break;
+ }
+ case MP_INTSRC:
+ {
+#ifdef CONFIG_X86_IO_APIC
+ struct mpc_config_intsrc *m =
+ (struct mpc_config_intsrc *)mpt;
- set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
- mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id);
-#ifdef CONFIG_X86_32
- mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
-#else
- mp_ioapics[idx].mpc_apicver = 0;
+ printk(KERN_INFO "OLD ");
+ print_MP_intsrc_info(m);
+ i = get_MP_intsrc_index(m);
+ if (i > 0) {
+ assign_to_mpc_intsrc(&mp_irqs[i], m);
+ printk(KERN_INFO "NEW ");
+ print_mp_irq_info(&mp_irqs[i]);
+ } else if (!i) {
+ /* legacy, do nothing */
+ } else if (nr_m_spare < SPARE_SLOT_NUM) {
+ /*
+ * not found (-1), or duplicated (-2)
+ * are invalid entries,
+ * we need to use the slot later
+ */
+ m_spare[nr_m_spare] = m;
+ nr_m_spare++;
+ }
#endif
- /*
- * Build basic GSI lookup table to facilitate gsi->io_apic lookups
- * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
- */
- mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
- mp_ioapic_routing[idx].gsi_base = gsi_base;
- mp_ioapic_routing[idx].gsi_end = gsi_base +
- io_apic_get_redir_entries(idx);
-
- printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
- "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
- mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
- mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
-
- nr_ioapics++;
-}
+ mpt += sizeof(struct mpc_config_intsrc);
+ count += sizeof(struct mpc_config_intsrc);
+ break;
+ }
+ case MP_LINTSRC:
+ {
+ struct mpc_config_lintsrc *m =
+ (struct mpc_config_lintsrc *)mpt;
+ mpt += sizeof(*m);
+ count += sizeof(*m);
+ break;
+ }
+ default:
+ /* wrong mptable */
+ printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
+ printk(KERN_ERR "type %x\n", *mpt);
+ print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
+ 1, mpc, mpc->mpc_length, 1);
+ goto out;
+ }
+ }
-void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
-{
- struct mpc_config_intsrc intsrc;
- int ioapic = -1;
- int pin = -1;
+#ifdef CONFIG_X86_IO_APIC
+ for (i = 0; i < mp_irq_entries; i++) {
+ if (irq_used[i])
+ continue;
- /*
- * Convert 'gsi' to 'ioapic.pin'.
- */
- ioapic = mp_find_ioapic(gsi);
- if (ioapic < 0)
- return;
- pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+ if (mp_irqs[i].mp_irqtype != mp_INT)
+ continue;
- /*
- * TBD: This check is for faulty timer entries, where the override
- * erroneously sets the trigger to level, resulting in a HUGE
- * increase of timer interrupts!
- */
- if ((bus_irq == 0) && (trigger == 3))
- trigger = 1;
+ if (mp_irqs[i].mp_irqflag != 0x0f)
+ continue;
- intsrc.mpc_type = MP_INTSRC;
- intsrc.mpc_irqtype = mp_INT;
- intsrc.mpc_irqflag = (trigger << 2) | polarity;
- intsrc.mpc_srcbus = MP_ISA_BUS;
- intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
- intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
- intsrc.mpc_dstirq = pin; /* INTIN# */
+ if (nr_m_spare > 0) {
+ printk(KERN_INFO "*NEW* found ");
+ nr_m_spare--;
+ assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]);
+ m_spare[nr_m_spare] = NULL;
+ } else {
+ struct mpc_config_intsrc *m =
+ (struct mpc_config_intsrc *)mpt;
+ count += sizeof(struct mpc_config_intsrc);
+ if (!mpc_new_phys) {
+ printk(KERN_INFO "No spare slots, try to append...take your risk, new mpc_length %x\n", count);
+ } else {
+ if (count <= mpc_new_length)
+ printk(KERN_INFO "No spare slots, try to append..., new mpc_length %x\n", count);
+ else {
+ printk(KERN_ERR "mpc_new_length %lx is too small\n", mpc_new_length);
+ goto out;
+ }
+ }
+ assign_to_mpc_intsrc(&mp_irqs[i], m);
+ mpc->mpc_length = count;
+ mpt += sizeof(struct mpc_config_intsrc);
+ }
+ print_mp_irq_info(&mp_irqs[i]);
+ }
+#endif
+out:
+ /* update checksum */
+ mpc->mpc_checksum = 0;
+ mpc->mpc_checksum -= mpf_checksum((unsigned char *)mpc,
+ mpc->mpc_length);
- MP_intsrc_info(&intsrc);
+ return 0;
}
-void __init mp_config_acpi_legacy_irqs(void)
-{
- struct mpc_config_intsrc intsrc;
- int i = 0;
- int ioapic = -1;
+static int __initdata enable_update_mptable;
-#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
- /*
- * Fabricate the legacy ISA bus (bus #31).
- */
- mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
-#endif
- set_bit(MP_ISA_BUS, mp_bus_not_pci);
- Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
+static int __init update_mptable_setup(char *str)
+{
+ enable_update_mptable = 1;
+ return 0;
+}
+early_param("update_mptable", update_mptable_setup);
- /*
- * Older generations of ES7000 have no legacy identity mappings
- */
- if (es7000_plat == 1)
- return;
+static unsigned long __initdata mpc_new_phys;
+static unsigned long mpc_new_length __initdata = 4096;
- /*
- * Locate the IOAPIC that manages the ISA IRQs (0-15).
- */
- ioapic = mp_find_ioapic(0);
- if (ioapic < 0)
- return;
+/* alloc_mptable or alloc_mptable=4k */
+static int __initdata alloc_mptable;
+static int __init parse_alloc_mptable_opt(char *p)
+{
+ enable_update_mptable = 1;
+ alloc_mptable = 1;
+ if (!p)
+ return 0;
+ mpc_new_length = memparse(p, &p);
+ return 0;
+}
+early_param("alloc_mptable", parse_alloc_mptable_opt);
- intsrc.mpc_type = MP_INTSRC;
- intsrc.mpc_irqflag = 0; /* Conforming */
- intsrc.mpc_srcbus = MP_ISA_BUS;
-#ifdef CONFIG_X86_IO_APIC
- intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
+void __init early_reserve_e820_mpc_new(void)
+{
+ if (enable_update_mptable && alloc_mptable) {
+ u64 startt = 0;
+#ifdef CONFIG_X86_TRAMPOLINE
+ startt = TRAMPOLINE_BASE;
#endif
- /*
- * Use the default configuration for the IRQs 0-15. Unless
- * overridden by (MADT) interrupt source override entries.
- */
- for (i = 0; i < 16; i++) {
- int idx;
-
- for (idx = 0; idx < mp_irq_entries; idx++) {
- struct mpc_config_intsrc *irq = mp_irqs + idx;
-
- /* Do we already have a mapping for this ISA IRQ? */
- if (irq->mpc_srcbus == MP_ISA_BUS
- && irq->mpc_srcbusirq == i)
- break;
-
- /* Do we already have a mapping for this IOAPIC pin */
- if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
- (irq->mpc_dstirq == i))
- break;
- }
-
- if (idx != mp_irq_entries) {
- printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
- continue; /* IRQ already used */
- }
-
- intsrc.mpc_irqtype = mp_INT;
- intsrc.mpc_srcbusirq = i; /* Identity mapped */
- intsrc.mpc_dstirq = i;
-
- MP_intsrc_info(&intsrc);
+ mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4);
}
}
-int mp_register_gsi(u32 gsi, int triggering, int polarity)
+static int __init update_mp_table(void)
{
- int ioapic;
- int ioapic_pin;
-#ifdef CONFIG_X86_32
-#define MAX_GSI_NUM 4096
-#define IRQ_COMPRESSION_START 64
+ char str[16];
+ char oem[10];
+ struct intel_mp_floating *mpf;
+ struct mp_config_table *mpc;
+ struct mp_config_table *mpc_new;
+
+ if (!enable_update_mptable)
+ return 0;
+
+ mpf = mpf_found;
+ if (!mpf)
+ return 0;
- static int pci_irq = IRQ_COMPRESSION_START;
/*
- * Mapping between Global System Interrupts, which
- * represent all possible interrupts, and IRQs
- * assigned to actual devices.
+ * Now see if we need to go further.
*/
- static int gsi_to_irq[MAX_GSI_NUM];
-#else
-
- if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
- return gsi;
-#endif
+ if (mpf->mpf_feature1 != 0)
+ return 0;
- /* Don't set up the ACPI SCI because it's already set up */
- if (acpi_gbl_FADT.sci_interrupt == gsi)
- return gsi;
+ if (!mpf->mpf_physptr)
+ return 0;
- ioapic = mp_find_ioapic(gsi);
- if (ioapic < 0) {
- printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
- return gsi;
- }
+ mpc = phys_to_virt(mpf->mpf_physptr);
- ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+ if (!smp_check_mpc(mpc, oem, str))
+ return 0;
-#ifdef CONFIG_X86_32
- if (ioapic_renumber_irq)
- gsi = ioapic_renumber_irq(ioapic, gsi);
-#endif
+ printk(KERN_INFO "mpf: %lx\n", virt_to_phys(mpf));
+ printk(KERN_INFO "mpf_physptr: %x\n", mpf->mpf_physptr);
- /*
- * Avoid pin reprogramming. PRTs typically include entries
- * with redundant pin->gsi mappings (but unique PCI devices);
- * we only program the IOAPIC on the first.
- */
- if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
- printk(KERN_ERR "Invalid reference to IOAPIC pin "
- "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
- ioapic_pin);
- return gsi;
+ if (mpc_new_phys && mpc->mpc_length > mpc_new_length) {
+ mpc_new_phys = 0;
+ printk(KERN_INFO "mpc_new_length is %ld, please use alloc_mptable=8k\n",
+ mpc_new_length);
}
- if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
- Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
- mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
-#ifdef CONFIG_X86_32
- return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
-#else
- return gsi;
-#endif
+
+ if (!mpc_new_phys) {
+ unsigned char old, new;
+ /* check if we can change the postion */
+ mpc->mpc_checksum = 0;
+ old = mpf_checksum((unsigned char *)mpc, mpc->mpc_length);
+ mpc->mpc_checksum = 0xff;
+ new = mpf_checksum((unsigned char *)mpc, mpc->mpc_length);
+ if (old == new) {
+ printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n");
+ return 0;
+ }
+ printk(KERN_INFO "use in-positon replacing\n");
+ } else {
+ mpf->mpf_physptr = mpc_new_phys;
+ mpc_new = phys_to_virt(mpc_new_phys);
+ memcpy(mpc_new, mpc, mpc->mpc_length);
+ mpc = mpc_new;
+ /* check if we can modify that */
+ if (mpc_new_phys - mpf->mpf_physptr) {
+ struct intel_mp_floating *mpf_new;
+ /* steal 16 bytes from [0, 1k) */
+ printk(KERN_INFO "mpf new: %x\n", 0x400 - 16);
+ mpf_new = phys_to_virt(0x400 - 16);
+ memcpy(mpf_new, mpf, 16);
+ mpf = mpf_new;
+ mpf->mpf_physptr = mpc_new_phys;
+ }
+ mpf->mpf_checksum = 0;
+ mpf->mpf_checksum -= mpf_checksum((unsigned char *)mpf, 16);
+ printk(KERN_INFO "mpf_physptr new: %x\n", mpf->mpf_physptr);
}
- set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
-#ifdef CONFIG_X86_32
/*
- * For GSI >= 64, use IRQ compression
+ * only replace the one with mp_INT and
+ * MP_IRQ_TRIGGER_LEVEL|MP_IRQ_POLARITY_LOW,
+ * already in mp_irqs , stored by ... and mp_config_acpi_gsi,
+ * may need pci=routeirq for all coverage
*/
- if ((gsi >= IRQ_COMPRESSION_START)
- && (triggering == ACPI_LEVEL_SENSITIVE)) {
- /*
- * For PCI devices assign IRQs in order, avoiding gaps
- * due to unused I/O APIC pins.
- */
- int irq = gsi;
- if (gsi < MAX_GSI_NUM) {
- /*
- * Retain the VIA chipset work-around (gsi > 15), but
- * avoid a problem where the 8254 timer (IRQ0) is setup
- * via an override (so it's not on pin 0 of the ioapic),
- * and at the same time, the pin 0 interrupt is a PCI
- * type. The gsi > 15 test could cause these two pins
- * to be shared as IRQ0, and they are not shareable.
- * So test for this condition, and if necessary, avoid
- * the pin collision.
- */
- gsi = pci_irq++;
- /*
- * Don't assign IRQ used by ACPI SCI
- */
- if (gsi == acpi_gbl_FADT.sci_interrupt)
- gsi = pci_irq++;
- gsi_to_irq[irq] = gsi;
- } else {
- printk(KERN_ERR "GSI %u is too high\n", gsi);
- return gsi;
- }
- }
-#endif
- io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
- triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
- polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
- return gsi;
+ replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length);
+
+ return 0;
}
-#endif /* CONFIG_X86_IO_APIC */
-#endif /* CONFIG_ACPI */
+late_initcall(update_mp_table);
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 1f3abe048e9..a153b3905f6 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -117,12 +117,20 @@ static int msr_open(struct inode *inode, struct file *file)
{
unsigned int cpu = iminor(file->f_path.dentry->d_inode);
struct cpuinfo_x86 *c = &cpu_data(cpu);
+ int ret = 0;
- if (cpu >= NR_CPUS || !cpu_online(cpu))
- return -ENXIO; /* No such CPU */
- if (!cpu_has(c, X86_FEATURE_MSR))
- return -EIO; /* MSR not supported */
+ lock_kernel();
+ cpu = iminor(file->f_path.dentry->d_inode);
+ if (cpu >= NR_CPUS || !cpu_online(cpu)) {
+ ret = -ENXIO; /* No such CPU */
+ goto out;
+ }
+ c = &cpu_data(cpu);
+ if (!cpu_has(c, X86_FEATURE_MSR))
+ ret = -EIO; /* MSR not supported */
+out:
+ unlock_kernel();
return 0;
}
diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi.c
index 84160f74eeb..ec024b3baad 100644
--- a/arch/x86/kernel/nmi_32.c
+++ b/arch/x86/kernel/nmi.c
@@ -11,10 +11,13 @@
* Mikael Pettersson : PM converted to driver model. Disable/enable API.
*/
+#include <asm/apic.h>
+
+#include <linux/nmi.h>
+#include <linux/mm.h>
#include <linux/delay.h>
#include <linux/interrupt.h>
#include <linux/module.h>
-#include <linux/nmi.h>
#include <linux/sysdev.h>
#include <linux/sysctl.h>
#include <linux/percpu.h>
@@ -22,12 +25,18 @@
#include <linux/cpumask.h>
#include <linux/kernel_stat.h>
#include <linux/kdebug.h>
-#include <linux/slab.h>
+#include <linux/smp.h>
+#include <asm/i8259.h>
+#include <asm/io_apic.h>
#include <asm/smp.h>
#include <asm/nmi.h>
+#include <asm/proto.h>
+#include <asm/timer.h>
-#include "mach_traps.h"
+#include <asm/mce.h>
+
+#include <mach_traps.h>
int unknown_nmi_panic;
int nmi_watchdog_enabled;
@@ -41,28 +50,65 @@ static cpumask_t backtrace_mask = CPU_MASK_NONE;
* 0: the lapic NMI watchdog is disabled, but can be enabled
*/
atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
+EXPORT_SYMBOL(nmi_active);
-unsigned int nmi_watchdog = NMI_DEFAULT;
-static unsigned int nmi_hz = HZ;
+unsigned int nmi_watchdog = NMI_NONE;
+EXPORT_SYMBOL(nmi_watchdog);
+static int panic_on_timeout;
+
+static unsigned int nmi_hz = HZ;
static DEFINE_PER_CPU(short, wd_enabled);
+static int endflag __initdata;
-static int endflag __initdata = 0;
+static inline unsigned int get_nmi_count(int cpu)
+{
+#ifdef CONFIG_X86_64
+ return cpu_pda(cpu)->__nmi_count;
+#else
+ return nmi_count(cpu);
+#endif
+}
+
+static inline int mce_in_progress(void)
+{
+#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
+ return atomic_read(&mce_entry) > 0;
+#endif
+ return 0;
+}
+
+/*
+ * Take the local apic timer and PIT/HPET into account. We don't
+ * know which one is active, when we have highres/dyntick on
+ */
+static inline unsigned int get_timer_irqs(int cpu)
+{
+#ifdef CONFIG_X86_64
+ return read_pda(apic_timer_irqs) + read_pda(irq0_irqs);
+#else
+ return per_cpu(irq_stat, cpu).apic_timer_irqs +
+ per_cpu(irq_stat, cpu).irq0_irqs;
+#endif
+}
#ifdef CONFIG_SMP
-/* The performance counters used by NMI_LOCAL_APIC don't trigger when
+/*
+ * The performance counters used by NMI_LOCAL_APIC don't trigger when
* the CPU is idle. To make sure the NMI watchdog really ticks on all
* CPUs during the test make them busy.
*/
static __init void nmi_cpu_busy(void *data)
{
local_irq_enable_in_hardirq();
- /* Intentionally don't use cpu_relax here. This is
- to make sure that the performance counter really ticks,
- even if there is a simulator or similar that catches the
- pause instruction. On a real HT machine this is fine because
- all other CPUs are busy with "useless" delay loops and don't
- care if they get somewhat less cycles. */
+ /*
+ * Intentionally don't use cpu_relax here. This is
+ * to make sure that the performance counter really ticks,
+ * even if there is a simulator or similar that catches the
+ * pause instruction. On a real HT machine this is fine because
+ * all other CPUs are busy with "useless" delay loops and don't
+ * care if they get somewhat less cycles.
+ */
while (endflag == 0)
mb();
}
@@ -73,43 +119,34 @@ int __init check_nmi_watchdog(void)
unsigned int *prev_nmi_count;
int cpu;
- if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED))
- return 0;
-
- if (!atomic_read(&nmi_active))
+ if (!nmi_watchdog_active() || !atomic_read(&nmi_active))
return 0;
- prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
+ prev_nmi_count = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL);
if (!prev_nmi_count)
- return -1;
+ goto error;
printk(KERN_INFO "Testing NMI watchdog ... ");
#ifdef CONFIG_SMP
if (nmi_watchdog == NMI_LOCAL_APIC)
- smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
+ smp_call_function(nmi_cpu_busy, (void *)&endflag, 0);
#endif
for_each_possible_cpu(cpu)
- prev_nmi_count[cpu] = nmi_count(cpu);
+ prev_nmi_count[cpu] = get_nmi_count(cpu);
local_irq_enable();
- mdelay((20*1000)/nmi_hz); // wait 20 ticks
+ mdelay((20 * 1000) / nmi_hz); /* wait 20 ticks */
- for_each_possible_cpu(cpu) {
-#ifdef CONFIG_SMP
- /* Check cpu_callin_map here because that is set
- after the timer is started. */
- if (!cpu_isset(cpu, cpu_callin_map))
- continue;
-#endif
+ for_each_online_cpu(cpu) {
if (!per_cpu(wd_enabled, cpu))
continue;
- if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
+ if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
printk(KERN_WARNING "WARNING: CPU#%d: NMI "
"appears to be stuck (%d->%d)!\n",
cpu,
prev_nmi_count[cpu],
- nmi_count(cpu));
+ get_nmi_count(cpu));
per_cpu(wd_enabled, cpu) = 0;
atomic_dec(&nmi_active);
}
@@ -118,37 +155,53 @@ int __init check_nmi_watchdog(void)
if (!atomic_read(&nmi_active)) {
kfree(prev_nmi_count);
atomic_set(&nmi_active, -1);
- return -1;
+ goto error;
}
printk("OK.\n");
- /* now that we know it works we can reduce NMI frequency to
- something more reasonable; makes a difference in some configs */
+ /*
+ * now that we know it works we can reduce NMI frequency to
+ * something more reasonable; makes a difference in some configs
+ */
if (nmi_watchdog == NMI_LOCAL_APIC)
nmi_hz = lapic_adjust_nmi_hz(1);
kfree(prev_nmi_count);
return 0;
+error:
+ if (nmi_watchdog == NMI_IO_APIC && !timer_through_8259)
+ disable_8259A_irq(0);
+#ifdef CONFIG_X86_32
+ timer_ack = 0;
+#endif
+ return -1;
}
static int __init setup_nmi_watchdog(char *str)
{
- int nmi;
+ unsigned int nmi;
+
+ if (!strncmp(str, "panic", 5)) {
+ panic_on_timeout = 1;
+ str = strchr(str, ',');
+ if (!str)
+ return 1;
+ ++str;
+ }
get_option(&str, &nmi);
- if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE))
+ if (nmi >= NMI_INVALID)
return 0;
nmi_watchdog = nmi;
return 1;
}
-
__setup("nmi_watchdog=", setup_nmi_watchdog);
-
-/* Suspend/resume support */
-
+/*
+ * Suspend/resume support
+ */
#ifdef CONFIG_PM
static int nmi_pm_active; /* nmi_active before suspend */
@@ -172,7 +225,6 @@ static int lapic_nmi_resume(struct sys_device *dev)
return 0;
}
-
static struct sysdev_class nmi_sysclass = {
.name = "lapic_nmi",
.resume = lapic_nmi_resume,
@@ -188,7 +240,8 @@ static int __init init_lapic_nmi_sysfs(void)
{
int error;
- /* should really be a BUG_ON but b/c this is an
+ /*
+ * should really be a BUG_ON but b/c this is an
* init call, it just doesn't work. -dcz
*/
if (nmi_watchdog != NMI_LOCAL_APIC)
@@ -202,6 +255,7 @@ static int __init init_lapic_nmi_sysfs(void)
error = sysdev_register(&device_lapic_nmi);
return error;
}
+
/* must come after the local APIC's device_initcall() */
late_initcall(init_lapic_nmi_sysfs);
@@ -218,12 +272,12 @@ static void __acpi_nmi_enable(void *__unused)
void acpi_nmi_enable(void)
{
if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
- on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
+ on_each_cpu(__acpi_nmi_enable, NULL, 1);
}
static void __acpi_nmi_disable(void *__unused)
{
- apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
+ apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
}
/*
@@ -232,7 +286,7 @@ static void __acpi_nmi_disable(void *__unused)
void acpi_nmi_disable(void)
{
if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
- on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
+ on_each_cpu(__acpi_nmi_disable, NULL, 1);
}
void setup_apic_nmi_watchdog(void *unused)
@@ -242,12 +296,13 @@ void setup_apic_nmi_watchdog(void *unused)
/* cheap hack to support suspend/resume */
/* if cpu0 is not active neither should the other cpus */
- if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
+ if (smp_processor_id() != 0 && atomic_read(&nmi_active) <= 0)
return;
switch (nmi_watchdog) {
case NMI_LOCAL_APIC:
- __get_cpu_var(wd_enabled) = 1; /* enable it before to avoid race with handler */
+ /* enable it before to avoid race with handler */
+ __get_cpu_var(wd_enabled) = 1;
if (lapic_watchdog_init(nmi_hz) < 0) {
__get_cpu_var(wd_enabled) = 0;
return;
@@ -262,9 +317,8 @@ void setup_apic_nmi_watchdog(void *unused)
void stop_apic_nmi_watchdog(void *unused)
{
/* only support LOCAL and IO APICs for now */
- if ((nmi_watchdog != NMI_LOCAL_APIC) &&
- (nmi_watchdog != NMI_IO_APIC))
- return;
+ if (!nmi_watchdog_active())
+ return;
if (__get_cpu_var(wd_enabled) == 0)
return;
if (nmi_watchdog == NMI_LOCAL_APIC)
@@ -284,26 +338,26 @@ void stop_apic_nmi_watchdog(void *unused)
* since NMIs don't listen to _any_ locks, we have to be extremely
* careful not to rely on unsafe variables. The printk might lock
* up though, so we have to break up any console locks first ...
- * [when there will be more tty-related locks, break them up
- * here too!]
+ * [when there will be more tty-related locks, break them up here too!]
*/
-static unsigned int
- last_irq_sums [NR_CPUS],
- alert_counter [NR_CPUS];
+static DEFINE_PER_CPU(unsigned, last_irq_sum);
+static DEFINE_PER_CPU(local_t, alert_counter);
+static DEFINE_PER_CPU(int, nmi_touch);
void touch_nmi_watchdog(void)
{
- if (nmi_watchdog > 0) {
+ if (nmi_watchdog_active()) {
unsigned cpu;
/*
- * Just reset the alert counters, (other CPUs might be
- * spinning on locks we hold):
+ * Tell other CPUs to reset their alert counters. We cannot
+ * do it ourselves because the alert count increase is not
+ * atomic.
*/
for_each_present_cpu(cpu) {
- if (alert_counter[cpu])
- alert_counter[cpu] = 0;
+ if (per_cpu(nmi_touch, cpu) != 1)
+ per_cpu(nmi_touch, cpu) = 1;
}
}
@@ -314,12 +368,9 @@ void touch_nmi_watchdog(void)
}
EXPORT_SYMBOL(touch_nmi_watchdog);
-extern void die_nmi(struct pt_regs *, const char *msg);
-
notrace __kprobes int
nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
{
-
/*
* Since current_thread_info()-> is always on the stack, and we
* always switch the stack NMI-atomically, it's safe to use
@@ -337,39 +388,45 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
touched = 1;
}
+ sum = get_timer_irqs(cpu);
+
+ if (__get_cpu_var(nmi_touch)) {
+ __get_cpu_var(nmi_touch) = 0;
+ touched = 1;
+ }
+
if (cpu_isset(cpu, backtrace_mask)) {
static DEFINE_SPINLOCK(lock); /* Serialise the printks */
spin_lock(&lock);
- printk("NMI backtrace for cpu %d\n", cpu);
+ printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
dump_stack();
spin_unlock(&lock);
cpu_clear(cpu, backtrace_mask);
}
- /*
- * Take the local apic timer and PIT/HPET into account. We don't
- * know which one is active, when we have highres/dyntick on
- */
- sum = per_cpu(irq_stat, cpu).apic_timer_irqs +
- per_cpu(irq_stat, cpu).irq0_irqs;
+ /* Could check oops_in_progress here too, but it's safer not to */
+ if (mce_in_progress())
+ touched = 1;
/* if the none of the timers isn't firing, this cpu isn't doing much */
- if (!touched && last_irq_sums[cpu] == sum) {
+ if (!touched && __get_cpu_var(last_irq_sum) == sum) {
/*
* Ayiee, looks like this CPU is stuck ...
* wait a few IRQs (5 seconds) before doing the oops ...
*/
- alert_counter[cpu]++;
- if (alert_counter[cpu] == 5*nmi_hz)
+ local_inc(&__get_cpu_var(alert_counter));
+ if (local_read(&__get_cpu_var(alert_counter)) == 5 * nmi_hz)
/*
* die_nmi will return ONLY if NOTIFY_STOP happens..
*/
- die_nmi(regs, "BUG: NMI Watchdog detected LOCKUP");
+ die_nmi("BUG: NMI Watchdog detected LOCKUP",
+ regs, panic_on_timeout);
} else {
- last_irq_sums[cpu] = sum;
- alert_counter[cpu] = 0;
+ __get_cpu_var(last_irq_sum) = sum;
+ local_set(&__get_cpu_var(alert_counter), 0);
}
+
/* see if the nmi watchdog went off */
if (!__get_cpu_var(wd_enabled))
return rc;
@@ -378,7 +435,8 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
rc |= lapic_wd_event(nmi_hz);
break;
case NMI_IO_APIC:
- /* don't know how to accurately check for this.
+ /*
+ * don't know how to accurately check for this.
* just assume it was a watchdog timer interrupt
* This matches the old behaviour.
*/
@@ -396,7 +454,7 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
char buf[64];
sprintf(buf, "NMI received for unknown reason %02x\n", reason);
- die_nmi(regs, buf);
+ die_nmi(buf, regs, 1); /* Always panic here */
return 0;
}
@@ -414,32 +472,26 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
if (!!old_state == !!nmi_watchdog_enabled)
return 0;
- if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_DISABLED) {
- printk( KERN_WARNING "NMI watchdog is permanently disabled\n");
+ if (atomic_read(&nmi_active) < 0 || !nmi_watchdog_active()) {
+ printk(KERN_WARNING
+ "NMI watchdog is permanently disabled\n");
return -EIO;
}
- if (nmi_watchdog == NMI_DEFAULT) {
- if (lapic_watchdog_ok())
- nmi_watchdog = NMI_LOCAL_APIC;
- else
- nmi_watchdog = NMI_IO_APIC;
- }
-
if (nmi_watchdog == NMI_LOCAL_APIC) {
if (nmi_watchdog_enabled)
enable_lapic_nmi_watchdog();
else
disable_lapic_nmi_watchdog();
} else {
- printk( KERN_WARNING
+ printk(KERN_WARNING
"NMI watchdog doesn't know what hardware to touch\n");
return -EIO;
}
return 0;
}
-#endif
+#endif /* CONFIG_SYSCTL */
int do_nmi_callback(struct pt_regs *regs, int cpu)
{
@@ -462,6 +514,3 @@ void __trigger_all_cpu_backtrace(void)
mdelay(1);
}
}
-
-EXPORT_SYMBOL(nmi_active);
-EXPORT_SYMBOL(nmi_watchdog);
diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c
deleted file mode 100644
index 5a29ded994f..00000000000
--- a/arch/x86/kernel/nmi_64.c
+++ /dev/null
@@ -1,482 +0,0 @@
-/*
- * NMI watchdog support on APIC systems
- *
- * Started by Ingo Molnar <mingo@redhat.com>
- *
- * Fixes:
- * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog.
- * Mikael Pettersson : Power Management for local APIC NMI watchdog.
- * Pavel Machek and
- * Mikael Pettersson : PM converted to driver model. Disable/enable API.
- */
-
-#include <linux/nmi.h>
-#include <linux/mm.h>
-#include <linux/delay.h>
-#include <linux/interrupt.h>
-#include <linux/module.h>
-#include <linux/sysdev.h>
-#include <linux/sysctl.h>
-#include <linux/kprobes.h>
-#include <linux/cpumask.h>
-#include <linux/kdebug.h>
-
-#include <asm/smp.h>
-#include <asm/nmi.h>
-#include <asm/proto.h>
-#include <asm/mce.h>
-
-#include <mach_traps.h>
-
-int unknown_nmi_panic;
-int nmi_watchdog_enabled;
-int panic_on_unrecovered_nmi;
-
-static cpumask_t backtrace_mask = CPU_MASK_NONE;
-
-/* nmi_active:
- * >0: the lapic NMI watchdog is active, but can be disabled
- * <0: the lapic NMI watchdog has not been set up, and cannot
- * be enabled
- * 0: the lapic NMI watchdog is disabled, but can be enabled
- */
-atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
-static int panic_on_timeout;
-
-unsigned int nmi_watchdog = NMI_DEFAULT;
-static unsigned int nmi_hz = HZ;
-
-static DEFINE_PER_CPU(short, wd_enabled);
-
-/* Run after command line and cpu_init init, but before all other checks */
-void nmi_watchdog_default(void)
-{
- if (nmi_watchdog != NMI_DEFAULT)
- return;
- nmi_watchdog = NMI_NONE;
-}
-
-static int endflag __initdata = 0;
-
-#ifdef CONFIG_SMP
-/* The performance counters used by NMI_LOCAL_APIC don't trigger when
- * the CPU is idle. To make sure the NMI watchdog really ticks on all
- * CPUs during the test make them busy.
- */
-static __init void nmi_cpu_busy(void *data)
-{
- local_irq_enable_in_hardirq();
- /* Intentionally don't use cpu_relax here. This is
- to make sure that the performance counter really ticks,
- even if there is a simulator or similar that catches the
- pause instruction. On a real HT machine this is fine because
- all other CPUs are busy with "useless" delay loops and don't
- care if they get somewhat less cycles. */
- while (endflag == 0)
- mb();
-}
-#endif
-
-int __init check_nmi_watchdog(void)
-{
- int *prev_nmi_count;
- int cpu;
-
- if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED))
- return 0;
-
- if (!atomic_read(&nmi_active))
- return 0;
-
- prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
- if (!prev_nmi_count)
- return -1;
-
- printk(KERN_INFO "Testing NMI watchdog ... ");
-
-#ifdef CONFIG_SMP
- if (nmi_watchdog == NMI_LOCAL_APIC)
- smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
-#endif
-
- for (cpu = 0; cpu < NR_CPUS; cpu++)
- prev_nmi_count[cpu] = cpu_pda(cpu)->__nmi_count;
- local_irq_enable();
- mdelay((20*1000)/nmi_hz); // wait 20 ticks
-
- for_each_online_cpu(cpu) {
- if (!per_cpu(wd_enabled, cpu))
- continue;
- if (cpu_pda(cpu)->__nmi_count - prev_nmi_count[cpu] <= 5) {
- printk(KERN_WARNING "WARNING: CPU#%d: NMI "
- "appears to be stuck (%d->%d)!\n",
- cpu,
- prev_nmi_count[cpu],
- cpu_pda(cpu)->__nmi_count);
- per_cpu(wd_enabled, cpu) = 0;
- atomic_dec(&nmi_active);
- }
- }
- endflag = 1;
- if (!atomic_read(&nmi_active)) {
- kfree(prev_nmi_count);
- atomic_set(&nmi_active, -1);
- return -1;
- }
- printk("OK.\n");
-
- /* now that we know it works we can reduce NMI frequency to
- something more reasonable; makes a difference in some configs */
- if (nmi_watchdog == NMI_LOCAL_APIC)
- nmi_hz = lapic_adjust_nmi_hz(1);
-
- kfree(prev_nmi_count);
- return 0;
-}
-
-static int __init setup_nmi_watchdog(char *str)
-{
- int nmi;
-
- if (!strncmp(str,"panic",5)) {
- panic_on_timeout = 1;
- str = strchr(str, ',');
- if (!str)
- return 1;
- ++str;
- }
-
- get_option(&str, &nmi);
-
- if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE))
- return 0;
-
- nmi_watchdog = nmi;
- return 1;
-}
-
-__setup("nmi_watchdog=", setup_nmi_watchdog);
-
-#ifdef CONFIG_PM
-
-static int nmi_pm_active; /* nmi_active before suspend */
-
-static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
-{
- /* only CPU0 goes here, other CPUs should be offline */
- nmi_pm_active = atomic_read(&nmi_active);
- stop_apic_nmi_watchdog(NULL);
- BUG_ON(atomic_read(&nmi_active) != 0);
- return 0;
-}
-
-static int lapic_nmi_resume(struct sys_device *dev)
-{
- /* only CPU0 goes here, other CPUs should be offline */
- if (nmi_pm_active > 0) {
- setup_apic_nmi_watchdog(NULL);
- touch_nmi_watchdog();
- }
- return 0;
-}
-
-static struct sysdev_class nmi_sysclass = {
- .name = "lapic_nmi",
- .resume = lapic_nmi_resume,
- .suspend = lapic_nmi_suspend,
-};
-
-static struct sys_device device_lapic_nmi = {
- .id = 0,
- .cls = &nmi_sysclass,
-};
-
-static int __init init_lapic_nmi_sysfs(void)
-{
- int error;
-
- /* should really be a BUG_ON but b/c this is an
- * init call, it just doesn't work. -dcz
- */
- if (nmi_watchdog != NMI_LOCAL_APIC)
- return 0;
-
- if (atomic_read(&nmi_active) < 0)
- return 0;
-
- error = sysdev_class_register(&nmi_sysclass);
- if (!error)
- error = sysdev_register(&device_lapic_nmi);
- return error;
-}
-/* must come after the local APIC's device_initcall() */
-late_initcall(init_lapic_nmi_sysfs);
-
-#endif /* CONFIG_PM */
-
-static void __acpi_nmi_enable(void *__unused)
-{
- apic_write(APIC_LVT0, APIC_DM_NMI);
-}
-
-/*
- * Enable timer based NMIs on all CPUs:
- */
-void acpi_nmi_enable(void)
-{
- if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
- on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
-}
-
-static void __acpi_nmi_disable(void *__unused)
-{
- apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
-}
-
-/*
- * Disable timer based NMIs on all CPUs:
- */
-void acpi_nmi_disable(void)
-{
- if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
- on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
-}
-
-void setup_apic_nmi_watchdog(void *unused)
-{
- if (__get_cpu_var(wd_enabled))
- return;
-
- /* cheap hack to support suspend/resume */
- /* if cpu0 is not active neither should the other cpus */
- if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
- return;
-
- switch (nmi_watchdog) {
- case NMI_LOCAL_APIC:
- __get_cpu_var(wd_enabled) = 1;
- if (lapic_watchdog_init(nmi_hz) < 0) {
- __get_cpu_var(wd_enabled) = 0;
- return;
- }
- /* FALL THROUGH */
- case NMI_IO_APIC:
- __get_cpu_var(wd_enabled) = 1;
- atomic_inc(&nmi_active);
- }
-}
-
-void stop_apic_nmi_watchdog(void *unused)
-{
- /* only support LOCAL and IO APICs for now */
- if ((nmi_watchdog != NMI_LOCAL_APIC) &&
- (nmi_watchdog != NMI_IO_APIC))
- return;
- if (__get_cpu_var(wd_enabled) == 0)
- return;
- if (nmi_watchdog == NMI_LOCAL_APIC)
- lapic_watchdog_stop();
- __get_cpu_var(wd_enabled) = 0;
- atomic_dec(&nmi_active);
-}
-
-/*
- * the best way to detect whether a CPU has a 'hard lockup' problem
- * is to check it's local APIC timer IRQ counts. If they are not
- * changing then that CPU has some problem.
- *
- * as these watchdog NMI IRQs are generated on every CPU, we only
- * have to check the current processor.
- */
-
-static DEFINE_PER_CPU(unsigned, last_irq_sum);
-static DEFINE_PER_CPU(local_t, alert_counter);
-static DEFINE_PER_CPU(int, nmi_touch);
-
-void touch_nmi_watchdog(void)
-{
- if (nmi_watchdog > 0) {
- unsigned cpu;
-
- /*
- * Tell other CPUs to reset their alert counters. We cannot
- * do it ourselves because the alert count increase is not
- * atomic.
- */
- for_each_present_cpu(cpu) {
- if (per_cpu(nmi_touch, cpu) != 1)
- per_cpu(nmi_touch, cpu) = 1;
- }
- }
-
- touch_softlockup_watchdog();
-}
-EXPORT_SYMBOL(touch_nmi_watchdog);
-
-notrace __kprobes int
-nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
-{
- int sum;
- int touched = 0;
- int cpu = smp_processor_id();
- int rc = 0;
-
- /* check for other users first */
- if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
- == NOTIFY_STOP) {
- rc = 1;
- touched = 1;
- }
-
- sum = read_pda(apic_timer_irqs) + read_pda(irq0_irqs);
- if (__get_cpu_var(nmi_touch)) {
- __get_cpu_var(nmi_touch) = 0;
- touched = 1;
- }
-
- if (cpu_isset(cpu, backtrace_mask)) {
- static DEFINE_SPINLOCK(lock); /* Serialise the printks */
-
- spin_lock(&lock);
- printk("NMI backtrace for cpu %d\n", cpu);
- dump_stack();
- spin_unlock(&lock);
- cpu_clear(cpu, backtrace_mask);
- }
-
-#ifdef CONFIG_X86_MCE
- /* Could check oops_in_progress here too, but it's safer
- not too */
- if (atomic_read(&mce_entry) > 0)
- touched = 1;
-#endif
- /* if the apic timer isn't firing, this cpu isn't doing much */
- if (!touched && __get_cpu_var(last_irq_sum) == sum) {
- /*
- * Ayiee, looks like this CPU is stuck ...
- * wait a few IRQs (5 seconds) before doing the oops ...
- */
- local_inc(&__get_cpu_var(alert_counter));
- if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz)
- die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs,
- panic_on_timeout);
- } else {
- __get_cpu_var(last_irq_sum) = sum;
- local_set(&__get_cpu_var(alert_counter), 0);
- }
-
- /* see if the nmi watchdog went off */
- if (!__get_cpu_var(wd_enabled))
- return rc;
- switch (nmi_watchdog) {
- case NMI_LOCAL_APIC:
- rc |= lapic_wd_event(nmi_hz);
- break;
- case NMI_IO_APIC:
- /* don't know how to accurately check for this.
- * just assume it was a watchdog timer interrupt
- * This matches the old behaviour.
- */
- rc = 1;
- break;
- }
- return rc;
-}
-
-static unsigned ignore_nmis;
-
-asmlinkage notrace __kprobes void
-do_nmi(struct pt_regs *regs, long error_code)
-{
- nmi_enter();
- add_pda(__nmi_count,1);
- if (!ignore_nmis)
- default_do_nmi(regs);
- nmi_exit();
-}
-
-void stop_nmi(void)
-{
- acpi_nmi_disable();
- ignore_nmis++;
-}
-
-void restart_nmi(void)
-{
- ignore_nmis--;
- acpi_nmi_enable();
-}
-
-#ifdef CONFIG_SYSCTL
-
-static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
-{
- unsigned char reason = get_nmi_reason();
- char buf[64];
-
- sprintf(buf, "NMI received for unknown reason %02x\n", reason);
- die_nmi(buf, regs, 1); /* Always panic here */
- return 0;
-}
-
-/*
- * proc handler for /proc/sys/kernel/nmi
- */
-int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
- void __user *buffer, size_t *length, loff_t *ppos)
-{
- int old_state;
-
- nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
- old_state = nmi_watchdog_enabled;
- proc_dointvec(table, write, file, buffer, length, ppos);
- if (!!old_state == !!nmi_watchdog_enabled)
- return 0;
-
- if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_DISABLED) {
- printk( KERN_WARNING "NMI watchdog is permanently disabled\n");
- return -EIO;
- }
-
- /* if nmi_watchdog is not set yet, then set it */
- nmi_watchdog_default();
-
- if (nmi_watchdog == NMI_LOCAL_APIC) {
- if (nmi_watchdog_enabled)
- enable_lapic_nmi_watchdog();
- else
- disable_lapic_nmi_watchdog();
- } else {
- printk( KERN_WARNING
- "NMI watchdog doesn't know what hardware to touch\n");
- return -EIO;
- }
- return 0;
-}
-
-#endif
-
-int do_nmi_callback(struct pt_regs *regs, int cpu)
-{
-#ifdef CONFIG_SYSCTL
- if (unknown_nmi_panic)
- return unknown_nmi_panic_callback(regs, cpu);
-#endif
- return 0;
-}
-
-void __trigger_all_cpu_backtrace(void)
-{
- int i;
-
- backtrace_mask = cpu_online_map;
- /* Wait for up to 10 seconds for all CPUs to do the backtrace */
- for (i = 0; i < 10 * 1000; i++) {
- if (cpus_empty(backtrace_mask))
- break;
- mdelay(1);
- }
-}
-
-EXPORT_SYMBOL(nmi_active);
-EXPORT_SYMBOL(nmi_watchdog);
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
index e65281b1634..a23e8233b9a 100644
--- a/arch/x86/kernel/numaq_32.c
+++ b/arch/x86/kernel/numaq_32.c
@@ -31,6 +31,8 @@
#include <asm/numaq.h>
#include <asm/topology.h>
#include <asm/processor.h>
+#include <asm/mpspec.h>
+#include <asm/e820.h>
#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
@@ -58,6 +60,8 @@ static void __init smp_dump_qct(void)
node_end_pfn[node] = MB_TO_PAGES(
eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
+ e820_register_active_regions(node, node_start_pfn[node],
+ node_end_pfn[node]);
memory_present(node,
node_start_pfn[node], node_end_pfn[node]);
node_remap_size[node] = node_memmap_size_bytes(node,
@@ -67,23 +71,35 @@ static void __init smp_dump_qct(void)
}
}
-/*
- * Unlike Summit, we don't really care to let the NUMA-Q
- * fall back to flat mode. Don't compile for NUMA-Q
- * unless you really need it!
- */
+static __init void early_check_numaq(void)
+{
+ /*
+ * Find possible boot-time SMP configuration:
+ */
+ early_find_smp_config();
+ /*
+ * get boot-time SMP configuration:
+ */
+ if (smp_found_config)
+ early_get_smp_config();
+}
+
int __init get_memcfg_numaq(void)
{
+ early_check_numaq();
+ if (!found_numaq)
+ return 0;
smp_dump_qct();
return 1;
}
-static int __init numaq_tsc_disable(void)
+void __init numaq_tsc_disable(void)
{
+ if (!found_numaq)
+ return;
+
if (num_online_nodes() > 1) {
printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
setup_clear_cpu_cap(X86_FEATURE_TSC);
}
- return 0;
}
-arch_initcall(numaq_tsc_disable);
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 74f0c5ea2a0..e0f571d58c1 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -30,6 +30,7 @@
#include <asm/setup.h>
#include <asm/arch_hooks.h>
#include <asm/time.h>
+#include <asm/pgalloc.h>
#include <asm/irq.h>
#include <asm/delay.h>
#include <asm/fixmap.h>
@@ -139,7 +140,9 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
/* If the operation is a nop, then nop the callsite */
ret = paravirt_patch_nop();
else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
- type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_syscall_ret))
+ type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) ||
+ type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) ||
+ type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret64))
/* If operation requires a jmp, then jmp */
ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len);
else
@@ -190,7 +193,9 @@ static void native_flush_tlb_single(unsigned long addr)
/* These are in entry.S */
extern void native_iret(void);
-extern void native_irq_enable_syscall_ret(void);
+extern void native_irq_enable_sysexit(void);
+extern void native_usergs_sysret32(void);
+extern void native_usergs_sysret64(void);
static int __init print_banner(void)
{
@@ -280,7 +285,7 @@ struct pv_time_ops pv_time_ops = {
.get_wallclock = native_get_wallclock,
.set_wallclock = native_set_wallclock,
.sched_clock = native_sched_clock,
- .get_cpu_khz = native_calculate_cpu_khz,
+ .get_tsc_khz = native_calibrate_tsc,
};
struct pv_irq_ops pv_irq_ops = {
@@ -291,6 +296,9 @@ struct pv_irq_ops pv_irq_ops = {
.irq_enable = native_irq_enable,
.safe_halt = native_safe_halt,
.halt = native_halt,
+#ifdef CONFIG_X86_64
+ .adjust_exception_frame = paravirt_nop,
+#endif
};
struct pv_cpu_ops pv_cpu_ops = {
@@ -321,12 +329,23 @@ struct pv_cpu_ops pv_cpu_ops = {
.store_idt = native_store_idt,
.store_tr = native_store_tr,
.load_tls = native_load_tls,
+#ifdef CONFIG_X86_64
+ .load_gs_index = native_load_gs_index,
+#endif
.write_ldt_entry = native_write_ldt_entry,
.write_gdt_entry = native_write_gdt_entry,
.write_idt_entry = native_write_idt_entry,
.load_sp0 = native_load_sp0,
- .irq_enable_syscall_ret = native_irq_enable_syscall_ret,
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+ .irq_enable_sysexit = native_irq_enable_sysexit,
+#endif
+#ifdef CONFIG_X86_64
+#ifdef CONFIG_IA32_EMULATION
+ .usergs_sysret32 = native_usergs_sysret32,
+#endif
+ .usergs_sysret64 = native_usergs_sysret64,
+#endif
.iret = native_iret,
.swapgs = native_swapgs,
@@ -366,6 +385,9 @@ struct pv_mmu_ops pv_mmu_ops = {
.flush_tlb_single = native_flush_tlb_single,
.flush_tlb_others = native_flush_tlb_others,
+ .pgd_alloc = __paravirt_pgd_alloc,
+ .pgd_free = paravirt_nop,
+
.alloc_pte = paravirt_nop,
.alloc_pmd = paravirt_nop,
.alloc_pmd_clone = paravirt_nop,
@@ -380,6 +402,9 @@ struct pv_mmu_ops pv_mmu_ops = {
.pte_update = paravirt_nop,
.pte_update_defer = paravirt_nop,
+ .ptep_modify_prot_start = __ptep_modify_prot_start,
+ .ptep_modify_prot_commit = __ptep_modify_prot_commit,
+
#ifdef CONFIG_HIGHPTE
.kmap_atomic_pte = kmap_atomic,
#endif
@@ -403,6 +428,7 @@ struct pv_mmu_ops pv_mmu_ops = {
#endif /* PAGETABLE_LEVELS >= 3 */
.pte_val = native_pte_val,
+ .pte_flags = native_pte_val,
.pgd_val = native_pgd_val,
.make_pte = native_make_pte,
@@ -416,6 +442,8 @@ struct pv_mmu_ops pv_mmu_ops = {
.enter = paravirt_nop,
.leave = paravirt_nop,
},
+
+ .set_fixmap = native_set_fixmap,
};
EXPORT_SYMBOL_GPL(pv_time_ops);
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index 82fc5fcab4f..58262218781 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -5,7 +5,7 @@ DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf");
DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax");
DEF_NATIVE(pv_cpu_ops, iret, "iret");
-DEF_NATIVE(pv_cpu_ops, irq_enable_syscall_ret, "sti; sysexit");
+DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "sti; sysexit");
DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
@@ -29,7 +29,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_irq_ops, restore_fl);
PATCH_SITE(pv_irq_ops, save_fl);
PATCH_SITE(pv_cpu_ops, iret);
- PATCH_SITE(pv_cpu_ops, irq_enable_syscall_ret);
+ PATCH_SITE(pv_cpu_ops, irq_enable_sysexit);
PATCH_SITE(pv_mmu_ops, read_cr2);
PATCH_SITE(pv_mmu_ops, read_cr3);
PATCH_SITE(pv_mmu_ops, write_cr3);
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index 7d904e138d7..061d01df9ae 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -14,8 +14,9 @@ DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
DEF_NATIVE(pv_cpu_ops, clts, "clts");
DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
-/* the three commands give us more control to how to return from a syscall */
-DEF_NATIVE(pv_cpu_ops, irq_enable_syscall_ret, "movq %gs:" __stringify(pda_oldrsp) ", %rsp; swapgs; sysretq;");
+DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "swapgs; sti; sysexit");
+DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq");
+DEF_NATIVE(pv_cpu_ops, usergs_sysret32, "swapgs; sysretl");
DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs");
unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
@@ -35,7 +36,9 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_irq_ops, irq_enable);
PATCH_SITE(pv_irq_ops, irq_disable);
PATCH_SITE(pv_cpu_ops, iret);
- PATCH_SITE(pv_cpu_ops, irq_enable_syscall_ret);
+ PATCH_SITE(pv_cpu_ops, irq_enable_sysexit);
+ PATCH_SITE(pv_cpu_ops, usergs_sysret32);
+ PATCH_SITE(pv_cpu_ops, usergs_sysret64);
PATCH_SITE(pv_cpu_ops, swapgs);
PATCH_SITE(pv_mmu_ops, read_cr2);
PATCH_SITE(pv_mmu_ops, read_cr3);
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index e28ec497e14..6959b5c45df 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -1394,7 +1394,7 @@ void __init detect_calgary(void)
return;
}
- specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE);
+ specified_table_size = determine_tce_table_size(max_pfn * PAGE_SIZE);
for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
struct calgary_bus_info *info = &bus_info[bus];
@@ -1459,7 +1459,7 @@ int __init calgary_iommu_init(void)
if (ret) {
printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
"falling back to no_iommu\n", ret);
- if (end_pfn > MAX_DMA32_PFN)
+ if (max_pfn > MAX_DMA32_PFN)
printk(KERN_ERR "WARNING more than 4GB of memory, "
"32bit PCI may malfunction.\n");
return ret;
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index dc00a1331ac..8467ec2320f 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -7,6 +7,7 @@
#include <asm/dma.h>
#include <asm/gart.h>
#include <asm/calgary.h>
+#include <asm/amd_iommu.h>
int forbid_dac __read_mostly;
EXPORT_SYMBOL(forbid_dac);
@@ -74,13 +75,17 @@ early_param("dma32_size", parse_dma32_size_opt);
void __init dma32_reserve_bootmem(void)
{
unsigned long size, align;
- if (end_pfn <= MAX_DMA32_PFN)
+ if (max_pfn <= MAX_DMA32_PFN)
return;
+ /*
+ * check aperture_64.c allocate_aperture() for reason about
+ * using 512M as goal
+ */
align = 64ULL<<20;
size = round_up(dma32_bootmem_size, align);
dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
- __pa(MAX_DMA_ADDRESS));
+ 512ULL<<20);
if (dma32_bootmem_ptr)
dma32_bootmem_size = size;
else
@@ -88,17 +93,14 @@ void __init dma32_reserve_bootmem(void)
}
static void __init dma32_free_bootmem(void)
{
- int node;
- if (end_pfn <= MAX_DMA32_PFN)
+ if (max_pfn <= MAX_DMA32_PFN)
return;
if (!dma32_bootmem_ptr)
return;
- for_each_online_node(node)
- free_bootmem_node(NODE_DATA(node), __pa(dma32_bootmem_ptr),
- dma32_bootmem_size);
+ free_bootmem(__pa(dma32_bootmem_ptr), dma32_bootmem_size);
dma32_bootmem_ptr = NULL;
dma32_bootmem_size = 0;
@@ -122,6 +124,8 @@ void __init pci_iommu_alloc(void)
detect_intel_iommu();
+ amd_iommu_detect();
+
#ifdef CONFIG_SWIOTLB
pci_swiotlb_init();
#endif
@@ -357,7 +361,7 @@ int dma_supported(struct device *dev, u64 mask)
EXPORT_SYMBOL(dma_supported);
/* Allocate DMA memory on node near device */
-noinline struct page *
+static noinline struct page *
dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
{
int node;
@@ -502,6 +506,8 @@ static int __init pci_iommu_init(void)
intel_iommu_init();
+ amd_iommu_init();
+
#ifdef CONFIG_GART_IOMMU
gart_iommu_init();
#endif
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index aa8ec928caa..c3fe78406d1 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -104,7 +104,6 @@ static unsigned long alloc_iommu(struct device *dev, int size)
size, base_index, boundary_size, 0);
}
if (offset != -1) {
- set_bit_string(iommu_gart_bitmap, offset, size);
next_bit = offset+size;
if (next_bit >= iommu_pages) {
next_bit = 0;
@@ -534,8 +533,8 @@ static __init unsigned read_aperture(struct pci_dev *dev, u32 *size)
unsigned aper_size = 0, aper_base_32, aper_order;
u64 aper_base;
- pci_read_config_dword(dev, 0x94, &aper_base_32);
- pci_read_config_dword(dev, 0x90, &aper_order);
+ pci_read_config_dword(dev, AMD64_GARTAPERTUREBASE, &aper_base_32);
+ pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &aper_order);
aper_order = (aper_order >> 1) & 7;
aper_base = aper_base_32 & 0x7fff;
@@ -549,14 +548,63 @@ static __init unsigned read_aperture(struct pci_dev *dev, u32 *size)
return aper_base;
}
+static void enable_gart_translations(void)
+{
+ int i;
+
+ for (i = 0; i < num_k8_northbridges; i++) {
+ struct pci_dev *dev = k8_northbridges[i];
+
+ enable_gart_translation(dev, __pa(agp_gatt_table));
+ }
+}
+
+/*
+ * If fix_up_north_bridges is set, the north bridges have to be fixed up on
+ * resume in the same way as they are handled in gart_iommu_hole_init().
+ */
+static bool fix_up_north_bridges;
+static u32 aperture_order;
+static u32 aperture_alloc;
+
+void set_up_gart_resume(u32 aper_order, u32 aper_alloc)
+{
+ fix_up_north_bridges = true;
+ aperture_order = aper_order;
+ aperture_alloc = aper_alloc;
+}
+
static int gart_resume(struct sys_device *dev)
{
+ printk(KERN_INFO "PCI-DMA: Resuming GART IOMMU\n");
+
+ if (fix_up_north_bridges) {
+ int i;
+
+ printk(KERN_INFO "PCI-DMA: Restoring GART aperture settings\n");
+
+ for (i = 0; i < num_k8_northbridges; i++) {
+ struct pci_dev *dev = k8_northbridges[i];
+
+ /*
+ * Don't enable translations just yet. That is the next
+ * step. Restore the pre-suspend aperture settings.
+ */
+ pci_write_config_dword(dev, AMD64_GARTAPERTURECTL,
+ aperture_order << 1);
+ pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE,
+ aperture_alloc >> 25);
+ }
+ }
+
+ enable_gart_translations();
+
return 0;
}
static int gart_suspend(struct sys_device *dev, pm_message_t state)
{
- return -EINVAL;
+ return 0;
}
static struct sysdev_class gart_sysdev_class = {
@@ -582,6 +630,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
struct pci_dev *dev;
void *gatt;
int i, error;
+ unsigned long start_pfn, end_pfn;
printk(KERN_INFO "PCI-DMA: Disabling AGP.\n");
aper_size = aper_base = info->aper_size = 0;
@@ -614,31 +663,25 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
memset(gatt, 0, gatt_size);
agp_gatt_table = gatt;
- for (i = 0; i < num_k8_northbridges; i++) {
- u32 gatt_reg;
- u32 ctl;
-
- dev = k8_northbridges[i];
- gatt_reg = __pa(gatt) >> 12;
- gatt_reg <<= 4;
- pci_write_config_dword(dev, 0x98, gatt_reg);
- pci_read_config_dword(dev, 0x90, &ctl);
-
- ctl |= 1;
- ctl &= ~((1<<4) | (1<<5));
-
- pci_write_config_dword(dev, 0x90, ctl);
- }
+ enable_gart_translations();
error = sysdev_class_register(&gart_sysdev_class);
if (!error)
error = sysdev_register(&device_gart);
if (error)
panic("Could not register gart_sysdev -- would corrupt data on next suspend");
+
flush_gart();
printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n",
aper_base, aper_size>>10);
+
+ /* need to map that range */
+ end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
+ if (end_pfn > max_low_pfn_mapped) {
+ start_pfn = (aper_base>>PAGE_SHIFT);
+ init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
+ }
return 0;
nommu:
@@ -677,11 +720,11 @@ void gart_iommu_shutdown(void)
u32 ctl;
dev = k8_northbridges[i];
- pci_read_config_dword(dev, 0x90, &ctl);
+ pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
- ctl &= ~1;
+ ctl &= ~GARTEN;
- pci_write_config_dword(dev, 0x90, ctl);
+ pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
}
}
@@ -716,10 +759,10 @@ void __init gart_iommu_init(void)
return;
if (no_iommu ||
- (!force_iommu && end_pfn <= MAX_DMA32_PFN) ||
+ (!force_iommu && max_pfn <= MAX_DMA32_PFN) ||
!gart_iommu_aperture ||
(no_agp && init_k8_gatt(&info) < 0)) {
- if (end_pfn > MAX_DMA32_PFN) {
+ if (max_pfn > MAX_DMA32_PFN) {
printk(KERN_WARNING "More than 4GB of memory "
"but GART IOMMU not available.\n"
KERN_WARNING "falling back to iommu=soft.\n");
@@ -788,10 +831,10 @@ void __init gart_iommu_init(void)
wbinvd();
/*
- * Try to workaround a bug (thanks to BenH)
+ * Try to workaround a bug (thanks to BenH):
* Set unmapped entries to a scratch page instead of 0.
* Any prefetches that hit unmapped entries won't get an bus abort
- * then.
+ * then. (P2P bridge may be prefetching on DMA reads).
*/
scratch = get_zeroed_page(GFP_KERNEL);
if (!scratch)
diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c
index 490da7f4b8d..82299cd1d04 100644
--- a/arch/x86/kernel/pci-swiotlb_64.c
+++ b/arch/x86/kernel/pci-swiotlb_64.c
@@ -38,7 +38,7 @@ const struct dma_mapping_ops swiotlb_dma_ops = {
void __init pci_swiotlb_init(void)
{
/* don't initialize swiotlb if iommu=off (no_iommu=1) */
- if (!iommu_detected && !no_iommu && end_pfn > MAX_DMA32_PFN)
+ if (!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN)
swiotlb = 1;
if (swiotlb_force)
swiotlb = 1;
diff --git a/arch/x86/kernel/probe_roms_32.c b/arch/x86/kernel/probe_roms_32.c
new file mode 100644
index 00000000000..675a48c404a
--- /dev/null
+++ b/arch/x86/kernel/probe_roms_32.c
@@ -0,0 +1,166 @@
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/uaccess.h>
+#include <linux/mmzone.h>
+#include <linux/ioport.h>
+#include <linux/seq_file.h>
+#include <linux/console.h>
+#include <linux/init.h>
+#include <linux/edd.h>
+#include <linux/dmi.h>
+#include <linux/pfn.h>
+#include <linux/pci.h>
+#include <asm/pci-direct.h>
+
+
+#include <asm/e820.h>
+#include <asm/mmzone.h>
+#include <asm/setup.h>
+#include <asm/sections.h>
+#include <asm/io.h>
+#include <setup_arch.h>
+
+static struct resource system_rom_resource = {
+ .name = "System ROM",
+ .start = 0xf0000,
+ .end = 0xfffff,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+};
+
+static struct resource extension_rom_resource = {
+ .name = "Extension ROM",
+ .start = 0xe0000,
+ .end = 0xeffff,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+};
+
+static struct resource adapter_rom_resources[] = { {
+ .name = "Adapter ROM",
+ .start = 0xc8000,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+ .name = "Adapter ROM",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+ .name = "Adapter ROM",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+ .name = "Adapter ROM",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+ .name = "Adapter ROM",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+ .name = "Adapter ROM",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+} };
+
+static struct resource video_rom_resource = {
+ .name = "Video ROM",
+ .start = 0xc0000,
+ .end = 0xc7fff,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+};
+
+#define ROMSIGNATURE 0xaa55
+
+static int __init romsignature(const unsigned char *rom)
+{
+ const unsigned short * const ptr = (const unsigned short *)rom;
+ unsigned short sig;
+
+ return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
+}
+
+static int __init romchecksum(const unsigned char *rom, unsigned long length)
+{
+ unsigned char sum, c;
+
+ for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
+ sum += c;
+ return !length && !sum;
+}
+
+void __init probe_roms(void)
+{
+ const unsigned char *rom;
+ unsigned long start, length, upper;
+ unsigned char c;
+ int i;
+
+ /* video rom */
+ upper = adapter_rom_resources[0].start;
+ for (start = video_rom_resource.start; start < upper; start += 2048) {
+ rom = isa_bus_to_virt(start);
+ if (!romsignature(rom))
+ continue;
+
+ video_rom_resource.start = start;
+
+ if (probe_kernel_address(rom + 2, c) != 0)
+ continue;
+
+ /* 0 < length <= 0x7f * 512, historically */
+ length = c * 512;
+
+ /* if checksum okay, trust length byte */
+ if (length && romchecksum(rom, length))
+ video_rom_resource.end = start + length - 1;
+
+ request_resource(&iomem_resource, &video_rom_resource);
+ break;
+ }
+
+ start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
+ if (start < upper)
+ start = upper;
+
+ /* system rom */
+ request_resource(&iomem_resource, &system_rom_resource);
+ upper = system_rom_resource.start;
+
+ /* check for extension rom (ignore length byte!) */
+ rom = isa_bus_to_virt(extension_rom_resource.start);
+ if (romsignature(rom)) {
+ length = extension_rom_resource.end - extension_rom_resource.start + 1;
+ if (romchecksum(rom, length)) {
+ request_resource(&iomem_resource, &extension_rom_resource);
+ upper = extension_rom_resource.start;
+ }
+ }
+
+ /* check for adapter roms on 2k boundaries */
+ for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
+ rom = isa_bus_to_virt(start);
+ if (!romsignature(rom))
+ continue;
+
+ if (probe_kernel_address(rom + 2, c) != 0)
+ continue;
+
+ /* 0 < length <= 0x7f * 512, historically */
+ length = c * 512;
+
+ /* but accept any length that fits if checksum okay */
+ if (!length || start + length > upper || !romchecksum(rom, length))
+ continue;
+
+ adapter_rom_resources[i].start = start;
+ adapter_rom_resources[i].end = start + length - 1;
+ request_resource(&iomem_resource, &adapter_rom_resources[i]);
+
+ start = adapter_rom_resources[i++].end & ~2047UL;
+ }
+}
+
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ba370dc8685..4d629c62f4f 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -6,6 +6,13 @@
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/pm.h>
+#include <linux/clockchips.h>
+#include <asm/system.h>
+
+unsigned long idle_halt;
+EXPORT_SYMBOL(idle_halt);
+unsigned long idle_nomwait;
+EXPORT_SYMBOL(idle_nomwait);
struct kmem_cache *task_xstate_cachep;
@@ -45,6 +52,76 @@ void arch_task_cache_init(void)
SLAB_PANIC, NULL);
}
+/*
+ * Idle related variables and functions
+ */
+unsigned long boot_option_idle_override = 0;
+EXPORT_SYMBOL(boot_option_idle_override);
+
+/*
+ * Powermanagement idle function, if any..
+ */
+void (*pm_idle)(void);
+EXPORT_SYMBOL(pm_idle);
+
+#ifdef CONFIG_X86_32
+/*
+ * This halt magic was a workaround for ancient floppy DMA
+ * wreckage. It should be safe to remove.
+ */
+static int hlt_counter;
+void disable_hlt(void)
+{
+ hlt_counter++;
+}
+EXPORT_SYMBOL(disable_hlt);
+
+void enable_hlt(void)
+{
+ hlt_counter--;
+}
+EXPORT_SYMBOL(enable_hlt);
+
+static inline int hlt_use_halt(void)
+{
+ return (!hlt_counter && boot_cpu_data.hlt_works_ok);
+}
+#else
+static inline int hlt_use_halt(void)
+{
+ return 1;
+}
+#endif
+
+/*
+ * We use this if we don't have any better
+ * idle routine..
+ */
+void default_idle(void)
+{
+ if (hlt_use_halt()) {
+ current_thread_info()->status &= ~TS_POLLING;
+ /*
+ * TS_POLLING-cleared state must be visible before we
+ * test NEED_RESCHED:
+ */
+ smp_mb();
+
+ if (!need_resched())
+ safe_halt(); /* enables interrupts racelessly */
+ else
+ local_irq_enable();
+ current_thread_info()->status |= TS_POLLING;
+ } else {
+ local_irq_enable();
+ /* loop is done by the caller */
+ cpu_relax();
+ }
+}
+#ifdef CONFIG_APM_MODULE
+EXPORT_SYMBOL(default_idle);
+#endif
+
static void do_nothing(void *unused)
{
}
@@ -61,7 +138,7 @@ void cpu_idle_wait(void)
{
smp_mb();
/* kick all the CPUs so that they exit out of pm_idle */
- smp_call_function(do_nothing, NULL, 0, 1);
+ smp_call_function(do_nothing, NULL, 1);
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);
@@ -122,44 +199,129 @@ static void poll_idle(void)
*
* idle=mwait overrides this decision and forces the usage of mwait.
*/
+
+#define MWAIT_INFO 0x05
+#define MWAIT_ECX_EXTENDED_INFO 0x01
+#define MWAIT_EDX_C1 0xf0
+
static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
{
+ u32 eax, ebx, ecx, edx;
+
if (force_mwait)
return 1;
- if (c->x86_vendor == X86_VENDOR_AMD) {
- switch(c->x86) {
- case 0x10:
- case 0x11:
- return 0;
- }
- }
+ if (c->cpuid_level < MWAIT_INFO)
+ return 0;
+
+ cpuid(MWAIT_INFO, &eax, &ebx, &ecx, &edx);
+ /* Check, whether EDX has extended info about MWAIT */
+ if (!(ecx & MWAIT_ECX_EXTENDED_INFO))
+ return 1;
+
+ /*
+ * edx enumeratios MONITOR/MWAIT extensions. Check, whether
+ * C1 supports MWAIT
+ */
+ return (edx & MWAIT_EDX_C1);
+}
+
+/*
+ * Check for AMD CPUs, which have potentially C1E support
+ */
+static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
+{
+ if (c->x86_vendor != X86_VENDOR_AMD)
+ return 0;
+
+ if (c->x86 < 0x0F)
+ return 0;
+
+ /* Family 0x0f models < rev F do not have C1E */
+ if (c->x86 == 0x0f && c->x86_model < 0x40)
+ return 0;
+
return 1;
}
-void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
+/*
+ * C1E aware idle routine. We check for C1E active in the interrupt
+ * pending message MSR. If we detect C1E, then we handle it the same
+ * way as C3 power states (local apic timer and TSC stop)
+ */
+static void c1e_idle(void)
{
- static int selected;
+ static cpumask_t c1e_mask = CPU_MASK_NONE;
+ static int c1e_detected;
- if (selected)
+ if (need_resched())
return;
+
+ if (!c1e_detected) {
+ u32 lo, hi;
+
+ rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
+ if (lo & K8_INTP_C1E_ACTIVE_MASK) {
+ c1e_detected = 1;
+ mark_tsc_unstable("TSC halt in C1E");
+ printk(KERN_INFO "System has C1E enabled\n");
+ }
+ }
+
+ if (c1e_detected) {
+ int cpu = smp_processor_id();
+
+ if (!cpu_isset(cpu, c1e_mask)) {
+ cpu_set(cpu, c1e_mask);
+ /*
+ * Force broadcast so ACPI can not interfere. Needs
+ * to run with interrupts enabled as it uses
+ * smp_function_call.
+ */
+ local_irq_enable();
+ clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
+ &cpu);
+ printk(KERN_INFO "Switch to broadcast mode on CPU%d\n",
+ cpu);
+ local_irq_disable();
+ }
+ clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
+
+ default_idle();
+
+ /*
+ * The switch back from broadcast mode needs to be
+ * called with interrupts disabled.
+ */
+ local_irq_disable();
+ clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
+ local_irq_enable();
+ } else
+ default_idle();
+}
+
+void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
+{
#ifdef CONFIG_X86_SMP
if (pm_idle == poll_idle && smp_num_siblings > 1) {
printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
" performance may degrade.\n");
}
#endif
+ if (pm_idle)
+ return;
+
if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
/*
- * Skip, if setup has overridden idle.
* One CPU supports mwait => All CPUs supports mwait
*/
- if (!pm_idle) {
- printk(KERN_INFO "using mwait in idle threads.\n");
- pm_idle = mwait_idle;
- }
- }
- selected = 1;
+ printk(KERN_INFO "using mwait in idle threads.\n");
+ pm_idle = mwait_idle;
+ } else if (check_c1e_idle(c)) {
+ printk(KERN_INFO "using C1E aware idle routine\n");
+ pm_idle = c1e_idle;
+ } else
+ pm_idle = default_idle;
}
static int __init idle_setup(char *str)
@@ -169,7 +331,27 @@ static int __init idle_setup(char *str)
pm_idle = poll_idle;
} else if (!strcmp(str, "mwait"))
force_mwait = 1;
- else
+ else if (!strcmp(str, "halt")) {
+ /*
+ * When the boot option of idle=halt is added, halt is
+ * forced to be used for CPU idle. In such case CPU C2/C3
+ * won't be used again.
+ * To continue to load the CPU idle driver, don't touch
+ * the boot_option_idle_override.
+ */
+ pm_idle = default_idle;
+ idle_halt = 1;
+ return 0;
+ } else if (!strcmp(str, "nomwait")) {
+ /*
+ * If the boot option of "idle=nomwait" is added,
+ * it means that mwait will be disabled for CPU C2/C3
+ * states. In such case it won't touch the variable
+ * of boot_option_idle_override.
+ */
+ idle_nomwait = 1;
+ return 0;
+ } else
return -1;
boot_option_idle_override = 1;
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 6d5483356e7..0c3927accb0 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -58,11 +58,6 @@
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
-static int hlt_counter;
-
-unsigned long boot_option_idle_override = 0;
-EXPORT_SYMBOL(boot_option_idle_override);
-
DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
EXPORT_PER_CPU_SYMBOL(current_task);
@@ -77,57 +72,24 @@ unsigned long thread_saved_pc(struct task_struct *tsk)
return ((unsigned long *)tsk->thread.sp)[3];
}
-/*
- * Powermanagement idle function, if any..
- */
-void (*pm_idle)(void);
-EXPORT_SYMBOL(pm_idle);
+#ifdef CONFIG_HOTPLUG_CPU
+#include <asm/nmi.h>
-void disable_hlt(void)
+static void cpu_exit_clear(void)
{
- hlt_counter++;
-}
+ int cpu = raw_smp_processor_id();
-EXPORT_SYMBOL(disable_hlt);
+ idle_task_exit();
-void enable_hlt(void)
-{
- hlt_counter--;
-}
+ cpu_uninit();
+ irq_ctx_exit(cpu);
-EXPORT_SYMBOL(enable_hlt);
-
-/*
- * We use this if we don't have any better
- * idle routine..
- */
-void default_idle(void)
-{
- if (!hlt_counter && boot_cpu_data.hlt_works_ok) {
- current_thread_info()->status &= ~TS_POLLING;
- /*
- * TS_POLLING-cleared state must be visible before we
- * test NEED_RESCHED:
- */
- smp_mb();
+ cpu_clear(cpu, cpu_callout_map);
+ cpu_clear(cpu, cpu_callin_map);
- if (!need_resched())
- safe_halt(); /* enables interrupts racelessly */
- else
- local_irq_enable();
- current_thread_info()->status |= TS_POLLING;
- } else {
- local_irq_enable();
- /* loop is done by the caller */
- cpu_relax();
- }
+ numa_remove_cpu(cpu);
}
-#ifdef CONFIG_APM_MODULE
-EXPORT_SYMBOL(default_idle);
-#endif
-#ifdef CONFIG_HOTPLUG_CPU
-#include <asm/nmi.h>
/* We don't actually take CPU down, just spin without interrupts. */
static inline void play_dead(void)
{
@@ -168,24 +130,22 @@ void cpu_idle(void)
while (1) {
tick_nohz_stop_sched_tick();
while (!need_resched()) {
- void (*idle)(void);
check_pgt_cache();
rmb();
- idle = pm_idle;
if (rcu_pending(cpu))
rcu_check_callbacks(cpu, 0);
- if (!idle)
- idle = default_idle;
-
if (cpu_is_offline(cpu))
play_dead();
local_irq_disable();
__get_cpu_var(irq_stat).idle_timestamp = jiffies;
- idle();
+ /* Don't trace irqs off for idle */
+ stop_critical_timings();
+ pm_idle();
+ start_critical_timings();
}
tick_nohz_restart_sched_tick();
preempt_enable_no_resched();
@@ -333,6 +293,7 @@ void flush_thread(void)
/*
* Forget coprocessor state..
*/
+ tsk->fpu_counter = 0;
clear_fpu(tsk);
clear_used_math();
}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ac54ff56df8..a8e53626ac9 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -56,15 +56,6 @@ asmlinkage extern void ret_from_fork(void);
unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
-unsigned long boot_option_idle_override = 0;
-EXPORT_SYMBOL(boot_option_idle_override);
-
-/*
- * Powermanagement idle function, if any..
- */
-void (*pm_idle)(void);
-EXPORT_SYMBOL(pm_idle);
-
static ATOMIC_NOTIFIER_HEAD(idle_notifier);
void idle_notifier_register(struct notifier_block *n)
@@ -94,25 +85,6 @@ void exit_idle(void)
__exit_idle();
}
-/*
- * We use this if we don't have any better
- * idle routine..
- */
-void default_idle(void)
-{
- current_thread_info()->status &= ~TS_POLLING;
- /*
- * TS_POLLING-cleared state must be visible before we
- * test NEED_RESCHED:
- */
- smp_mb();
- if (!need_resched())
- safe_halt(); /* enables interrupts racelessly */
- else
- local_irq_enable();
- current_thread_info()->status |= TS_POLLING;
-}
-
#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);
@@ -150,12 +122,9 @@ void cpu_idle(void)
while (1) {
tick_nohz_stop_sched_tick();
while (!need_resched()) {
- void (*idle)(void);
rmb();
- idle = pm_idle;
- if (!idle)
- idle = default_idle;
+
if (cpu_is_offline(smp_processor_id()))
play_dead();
/*
@@ -165,7 +134,10 @@ void cpu_idle(void)
*/
local_irq_disable();
enter_idle();
- idle();
+ /* Don't trace irqs off for idle */
+ stop_critical_timings();
+ pm_idle();
+ start_critical_timings();
/* In many cases the interrupt that ended idle
has already called exit_idle. But some idle
loops can be woken up without interrupt. */
@@ -294,6 +266,7 @@ void flush_thread(void)
/*
* Forget coprocessor state..
*/
+ tsk->fpu_counter = 0;
clear_fpu(tsk);
clear_used_math();
}
@@ -365,10 +338,10 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
p->thread.fs = me->thread.fs;
p->thread.gs = me->thread.gs;
- asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
- asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
- asm("mov %%es,%0" : "=m" (p->thread.es));
- asm("mov %%ds,%0" : "=m" (p->thread.ds));
+ savesegment(gs, p->thread.gsindex);
+ savesegment(fs, p->thread.fsindex);
+ savesegment(es, p->thread.es);
+ savesegment(ds, p->thread.ds);
if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
@@ -407,7 +380,9 @@ out:
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
- asm volatile("movl %0, %%fs; movl %0, %%es; movl %0, %%ds" :: "r"(0));
+ loadsegment(fs, 0);
+ loadsegment(es, 0);
+ loadsegment(ds, 0);
load_gs_index(0);
regs->ip = new_ip;
regs->sp = new_sp;
@@ -566,6 +541,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*next = &next_p->thread;
int cpu = smp_processor_id();
struct tss_struct *tss = &per_cpu(init_tss, cpu);
+ unsigned fsindex, gsindex;
/* we're going to use this soon, after a few expensive things */
if (next_p->fpu_counter>5)
@@ -580,22 +556,38 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
* Switch DS and ES.
* This won't pick up thread selector changes, but I guess that is ok.
*/
- asm volatile("mov %%es,%0" : "=m" (prev->es));
+ savesegment(es, prev->es);
if (unlikely(next->es | prev->es))
loadsegment(es, next->es);
-
- asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
+
+ savesegment(ds, prev->ds);
if (unlikely(next->ds | prev->ds))
loadsegment(ds, next->ds);
+
+ /* We must save %fs and %gs before load_TLS() because
+ * %fs and %gs may be cleared by load_TLS().
+ *
+ * (e.g. xen_load_tls())
+ */
+ savesegment(fs, fsindex);
+ savesegment(gs, gsindex);
+
load_TLS(next, cpu);
+ /*
+ * Leave lazy mode, flushing any hypercalls made here.
+ * This must be done before restoring TLS segments so
+ * the GDT and LDT are properly updated, and must be
+ * done before math_state_restore, so the TS bit is up
+ * to date.
+ */
+ arch_leave_lazy_cpu_mode();
+
/*
* Switch FS and GS.
*/
{
- unsigned fsindex;
- asm volatile("movl %%fs,%0" : "=r" (fsindex));
/* segment register != 0 always requires a reload.
also reload when it has changed.
when prev process used 64bit base always reload
@@ -613,10 +605,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
if (next->fs)
wrmsrl(MSR_FS_BASE, next->fs);
prev->fsindex = fsindex;
- }
- {
- unsigned gsindex;
- asm volatile("movl %%gs,%0" : "=r" (gsindex));
+
if (unlikely(gsindex | next->gsindex | prev->gs)) {
load_gs_index(next->gsindex);
if (gsindex)
@@ -797,7 +786,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
set_32bit_tls(task, FS_TLS, addr);
if (doit) {
load_TLS(&task->thread, cpu);
- asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
+ loadsegment(fs, FS_TLS_SEL);
}
task->thread.fsindex = FS_TLS_SEL;
task->thread.fs = 0;
@@ -807,7 +796,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
if (doit) {
/* set the selector to 0 to not confuse
__switch_to */
- asm volatile("movl %0,%%fs" :: "r" (0));
+ loadsegment(fs, 0);
ret = checking_wrmsrl(MSR_FS_BASE, addr);
}
}
@@ -830,7 +819,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
if (task->thread.gsindex == GS_TLS_SEL)
base = read_32bit_tls(task, GS_TLS);
else if (doit) {
- asm("movl %%gs,%0" : "=r" (gsindex));
+ savesegment(gs, gsindex);
if (gsindex)
rdmsrl(MSR_KERNEL_GS_BASE, base);
else
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index a7835f28293..77040b6070e 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -943,13 +943,13 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
return copy_regset_to_user(child, &user_x86_32_view,
REGSET_XFP,
0, sizeof(struct user_fxsr_struct),
- datap);
+ datap) ? -EIO : 0;
case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */
return copy_regset_from_user(child, &user_x86_32_view,
REGSET_XFP,
0, sizeof(struct user_fxsr_struct),
- datap);
+ datap) ? -EIO : 0;
#endif
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
new file mode 100644
index 00000000000..05fbe9a0325
--- /dev/null
+++ b/arch/x86/kernel/pvclock.c
@@ -0,0 +1,141 @@
+/* paravirtual clock -- common code used by kvm/xen
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <asm/pvclock.h>
+
+/*
+ * These are perodically updated
+ * xen: magic shared_info page
+ * kvm: gpa registered via msr
+ * and then copied here.
+ */
+struct pvclock_shadow_time {
+ u64 tsc_timestamp; /* TSC at last update of time vals. */
+ u64 system_timestamp; /* Time, in nanosecs, since boot. */
+ u32 tsc_to_nsec_mul;
+ int tsc_shift;
+ u32 version;
+};
+
+/*
+ * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
+ * yielding a 64-bit result.
+ */
+static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
+{
+ u64 product;
+#ifdef __i386__
+ u32 tmp1, tmp2;
+#endif
+
+ if (shift < 0)
+ delta >>= -shift;
+ else
+ delta <<= shift;
+
+#ifdef __i386__
+ __asm__ (
+ "mul %5 ; "
+ "mov %4,%%eax ; "
+ "mov %%edx,%4 ; "
+ "mul %5 ; "
+ "xor %5,%5 ; "
+ "add %4,%%eax ; "
+ "adc %5,%%edx ; "
+ : "=A" (product), "=r" (tmp1), "=r" (tmp2)
+ : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
+#elif __x86_64__
+ __asm__ (
+ "mul %%rdx ; shrd $32,%%rdx,%%rax"
+ : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
+#else
+#error implement me!
+#endif
+
+ return product;
+}
+
+static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
+{
+ u64 delta = native_read_tsc() - shadow->tsc_timestamp;
+ return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
+}
+
+/*
+ * Reads a consistent set of time-base values from hypervisor,
+ * into a shadow data area.
+ */
+static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
+ struct pvclock_vcpu_time_info *src)
+{
+ do {
+ dst->version = src->version;
+ rmb(); /* fetch version before data */
+ dst->tsc_timestamp = src->tsc_timestamp;
+ dst->system_timestamp = src->system_time;
+ dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
+ dst->tsc_shift = src->tsc_shift;
+ rmb(); /* test version after fetching data */
+ } while ((src->version & 1) || (dst->version != src->version));
+
+ return dst->version;
+}
+
+cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
+{
+ struct pvclock_shadow_time shadow;
+ unsigned version;
+ cycle_t ret, offset;
+
+ do {
+ version = pvclock_get_time_values(&shadow, src);
+ barrier();
+ offset = pvclock_get_nsec_offset(&shadow);
+ ret = shadow.system_timestamp + offset;
+ barrier();
+ } while (version != src->version);
+
+ return ret;
+}
+
+void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
+ struct pvclock_vcpu_time_info *vcpu_time,
+ struct timespec *ts)
+{
+ u32 version;
+ u64 delta;
+ struct timespec now;
+
+ /* get wallclock at system boot */
+ do {
+ version = wall_clock->version;
+ rmb(); /* fetch version before time */
+ now.tv_sec = wall_clock->sec;
+ now.tv_nsec = wall_clock->nsec;
+ rmb(); /* fetch time before checking version */
+ } while ((wall_clock->version & 1) || (version != wall_clock->version));
+
+ delta = pvclock_clocksource_read(vcpu_time); /* time since system boot */
+ delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
+
+ now.tv_nsec = do_div(delta, NSEC_PER_SEC);
+ now.tv_sec = delta;
+
+ set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
+}
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index d89a648fe71..d1385881810 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -65,6 +65,7 @@ static enum {
ICH_FORCE_HPET_RESUME,
VT8237_FORCE_HPET_RESUME,
NVIDIA_FORCE_HPET_RESUME,
+ ATI_FORCE_HPET_RESUME,
} force_hpet_resume_type;
static void __iomem *rcba_base;
@@ -158,6 +159,8 @@ static void ich_force_enable_hpet(struct pci_dev *dev)
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB2_0,
ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_0,
+ ich_force_enable_hpet);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_1,
ich_force_enable_hpet);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_0,
@@ -174,6 +177,12 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7,
static struct pci_dev *cached_dev;
+static void hpet_print_force_info(void)
+{
+ printk(KERN_INFO "HPET not enabled in BIOS. "
+ "You might try hpet=force boot option\n");
+}
+
static void old_ich_force_hpet_resume(void)
{
u32 val;
@@ -253,8 +262,12 @@ static void old_ich_force_enable_hpet_user(struct pci_dev *dev)
{
if (hpet_force_user)
old_ich_force_enable_hpet(dev);
+ else
+ hpet_print_force_info();
}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB_1,
+ old_ich_force_enable_hpet_user);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_0,
old_ich_force_enable_hpet_user);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_12,
@@ -290,9 +303,14 @@ static void vt8237_force_enable_hpet(struct pci_dev *dev)
{
u32 uninitialized_var(val);
- if (!hpet_force_user || hpet_address || force_hpet_address)
+ if (hpet_address || force_hpet_address)
return;
+ if (!hpet_force_user) {
+ hpet_print_force_info();
+ return;
+ }
+
pci_read_config_dword(dev, 0x68, &val);
/*
* Bit 7 is HPET enable bit.
@@ -330,6 +348,36 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235,
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237,
vt8237_force_enable_hpet);
+static void ati_force_hpet_resume(void)
+{
+ pci_write_config_dword(cached_dev, 0x14, 0xfed00000);
+ printk(KERN_DEBUG "Force enabled HPET at resume\n");
+}
+
+static void ati_force_enable_hpet(struct pci_dev *dev)
+{
+ u32 uninitialized_var(val);
+
+ if (hpet_address || force_hpet_address)
+ return;
+
+ if (!hpet_force_user) {
+ hpet_print_force_info();
+ return;
+ }
+
+ pci_write_config_dword(dev, 0x14, 0xfed00000);
+ pci_read_config_dword(dev, 0x14, &val);
+ force_hpet_address = val;
+ force_hpet_resume_type = ATI_FORCE_HPET_RESUME;
+ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n",
+ force_hpet_address);
+ cached_dev = dev;
+ return;
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS,
+ ati_force_enable_hpet);
+
/*
* Undocumented chipset feature taken from LinuxBIOS.
*/
@@ -343,8 +391,13 @@ static void nvidia_force_enable_hpet(struct pci_dev *dev)
{
u32 uninitialized_var(val);
- if (!hpet_force_user || hpet_address || force_hpet_address)
+ if (hpet_address || force_hpet_address)
+ return;
+
+ if (!hpet_force_user) {
+ hpet_print_force_info();
return;
+ }
pci_write_config_dword(dev, 0x44, 0xfed00001);
pci_read_config_dword(dev, 0x44, &val);
@@ -397,6 +450,9 @@ void force_hpet_resume(void)
case NVIDIA_FORCE_HPET_RESUME:
nvidia_force_hpet_resume();
return;
+ case ATI_FORCE_HPET_RESUME:
+ ati_force_hpet_resume();
+ return;
default:
break;
}
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index f6be7d5f82f..f8a62160e15 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -27,7 +27,7 @@
void (*pm_power_off)(void);
EXPORT_SYMBOL(pm_power_off);
-static long no_idt[3];
+static const struct desc_ptr no_idt = {};
static int reboot_mode;
enum reboot_type reboot_type = BOOT_KBD;
int reboot_force;
@@ -201,15 +201,15 @@ core_initcall(reboot_init);
controller to pulse the CPU reset line, which is more thorough, but
doesn't work with at least one type of 486 motherboard. It is easy
to stop this code working; hence the copious comments. */
-static unsigned long long
+static const unsigned long long
real_mode_gdt_entries [3] =
{
0x0000000000000000ULL, /* Null descriptor */
- 0x00009a000000ffffULL, /* 16-bit real-mode 64k code at 0x00000000 */
- 0x000092000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */
+ 0x00009b000000ffffULL, /* 16-bit real-mode 64k code at 0x00000000 */
+ 0x000093000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */
};
-static struct desc_ptr
+static const struct desc_ptr
real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries },
real_mode_idt = { 0x3ff, 0 };
@@ -231,7 +231,7 @@ real_mode_idt = { 0x3ff, 0 };
More could be done here to set up the registers as if a CPU reset had
occurred; hopefully real BIOSs don't assume much. */
-static unsigned char real_mode_switch [] =
+static const unsigned char real_mode_switch [] =
{
0x66, 0x0f, 0x20, 0xc0, /* movl %cr0,%eax */
0x66, 0x83, 0xe0, 0x11, /* andl $0x00000011,%eax */
@@ -245,7 +245,7 @@ static unsigned char real_mode_switch [] =
0x24, 0x10, /* f: andb $0x10,al */
0x66, 0x0f, 0x22, 0xc0 /* movl %eax,%cr0 */
};
-static unsigned char jump_to_bios [] =
+static const unsigned char jump_to_bios [] =
{
0xea, 0x00, 0x00, 0xff, 0xff /* ljmp $0xffff,$0x0000 */
};
@@ -255,7 +255,7 @@ static unsigned char jump_to_bios [] =
* specified by the code and length parameters.
* We assume that length will aways be less that 100!
*/
-void machine_real_restart(unsigned char *code, int length)
+void machine_real_restart(const unsigned char *code, int length)
{
local_irq_disable();
@@ -368,7 +368,7 @@ static void native_machine_emergency_restart(void)
}
case BOOT_TRIPLE:
- load_idt((const struct desc_ptr *)&no_idt);
+ load_idt(&no_idt);
__asm__ __volatile__("int3");
reboot_type = BOOT_KBD;
diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c
index dec0b5ec25c..61a837743fe 100644
--- a/arch/x86/kernel/reboot_fixups_32.c
+++ b/arch/x86/kernel/reboot_fixups_32.c
@@ -49,7 +49,7 @@ struct device_fixup {
void (*reboot_fixup)(struct pci_dev *);
};
-static struct device_fixup fixups_table[] = {
+static const struct device_fixup fixups_table[] = {
{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset },
{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset },
{ PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset },
@@ -64,7 +64,7 @@ static struct device_fixup fixups_table[] = {
*/
void mach_reboot_fixups(void)
{
- struct device_fixup *cur;
+ const struct device_fixup *cur;
struct pci_dev *dev;
int i;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 6f80b852a19..531b55b8e81 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1,139 +1,894 @@
-#include <linux/kernel.h>
+/*
+ * Copyright (C) 1995 Linus Torvalds
+ *
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ *
+ * Memory region support
+ * David Parsons <orc@pell.chi.il.us>, July-August 1999
+ *
+ * Added E820 sanitization routine (removes overlapping memory regions);
+ * Brian Moyle <bmoyle@mvista.com>, February 2001
+ *
+ * Moved CPU detection code to cpu/${cpu}.c
+ * Patrick Mochel <mochel@osdl.org>, March 2002
+ *
+ * Provisions for empty E820 memory regions (reported by certain BIOSes).
+ * Alex Achenbach <xela@slit.de>, December 2002.
+ *
+ */
+
+/*
+ * This file handles the architecture-dependent parts of initialization
+ */
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/screen_info.h>
+#include <linux/ioport.h>
+#include <linux/acpi.h>
+#include <linux/apm_bios.h>
+#include <linux/initrd.h>
+#include <linux/bootmem.h>
+#include <linux/seq_file.h>
+#include <linux/console.h>
+#include <linux/mca.h>
+#include <linux/root_dev.h>
+#include <linux/highmem.h>
#include <linux/module.h>
+#include <linux/efi.h>
#include <linux/init.h>
-#include <linux/bootmem.h>
+#include <linux/edd.h>
+#include <linux/iscsi_ibft.h>
+#include <linux/nodemask.h>
+#include <linux/kexec.h>
+#include <linux/dmi.h>
+#include <linux/pfn.h>
+#include <linux/pci.h>
+#include <asm/pci-direct.h>
+#include <linux/init_ohci1394_dma.h>
+#include <linux/kvm_para.h>
+
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/stddef.h>
+#include <linux/unistd.h>
+#include <linux/ptrace.h>
+#include <linux/slab.h>
+#include <linux/user.h>
+#include <linux/delay.h>
+#include <linux/highmem.h>
+
+#include <linux/kallsyms.h>
+#include <linux/edd.h>
+#include <linux/iscsi_ibft.h>
+#include <linux/kexec.h>
+#include <linux/cpufreq.h>
+#include <linux/dma-mapping.h>
+#include <linux/ctype.h>
+#include <linux/uaccess.h>
+
#include <linux/percpu.h>
+#include <linux/crash_dump.h>
+
+#include <video/edid.h>
+
+#include <asm/mtrr.h>
+#include <asm/apic.h>
+#include <asm/e820.h>
+#include <asm/mpspec.h>
+#include <asm/setup.h>
+#include <asm/arch_hooks.h>
+#include <asm/efi.h>
+#include <asm/sections.h>
+#include <asm/dmi.h>
+#include <asm/io_apic.h>
+#include <asm/ist.h>
+#include <asm/vmi.h>
+#include <setup_arch.h>
+#include <asm/bios_ebda.h>
+#include <asm/cacheflush.h>
+#include <asm/processor.h>
+#include <asm/bugs.h>
+
+#include <asm/system.h>
+#include <asm/vsyscall.h>
#include <asm/smp.h>
+#include <asm/desc.h>
+#include <asm/dma.h>
+#include <asm/gart.h>
+#include <asm/mmu_context.h>
+#include <asm/proto.h>
+
+#include <mach_apic.h>
+#include <asm/paravirt.h>
+
#include <asm/percpu.h>
#include <asm/sections.h>
-#include <asm/processor.h>
-#include <asm/setup.h>
#include <asm/topology.h>
-#include <asm/mpspec.h>
#include <asm/apicdef.h>
+#ifdef CONFIG_X86_64
+#include <asm/numa_64.h>
+#endif
-#ifdef CONFIG_X86_LOCAL_APIC
-unsigned int num_processors;
-unsigned disabled_cpus __cpuinitdata;
-/* Processor that is doing the boot up */
-unsigned int boot_cpu_physical_apicid = -1U;
-EXPORT_SYMBOL(boot_cpu_physical_apicid);
+#ifndef ARCH_SETUP
+#define ARCH_SETUP
+#endif
+
+#ifndef CONFIG_DEBUG_BOOT_PARAMS
+struct boot_params __initdata boot_params;
+#else
+struct boot_params boot_params;
+#endif
+
+/*
+ * Machine setup..
+ */
+static struct resource data_resource = {
+ .name = "Kernel data",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+static struct resource code_resource = {
+ .name = "Kernel code",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+static struct resource bss_resource = {
+ .name = "Kernel bss",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+
+#ifdef CONFIG_X86_32
+/* This value is set up by the early boot code to point to the value
+ immediately after the boot time page tables. It contains a *physical*
+ address, and must not be in the .bss segment! */
+unsigned long init_pg_tables_start __initdata = ~0UL;
+unsigned long init_pg_tables_end __initdata = ~0UL;
+
+static struct resource video_ram_resource = {
+ .name = "Video RAM area",
+ .start = 0xa0000,
+ .end = 0xbffff,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+/* cpu data as detected by the assembly code in head.S */
+struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1};
+/* common cpu data for all cpus */
+struct cpuinfo_x86 boot_cpu_data __read_mostly = {0, 0, 0, 0, -1, 1, 0, 0, -1};
+EXPORT_SYMBOL(boot_cpu_data);
+static void set_mca_bus(int x)
+{
+#ifdef CONFIG_MCA
+ MCA_bus = x;
+#endif
+}
+
+unsigned int def_to_bigsmp;
+
+/* for MCA, but anyone else can use it if they want */
+unsigned int machine_id;
+unsigned int machine_submodel_id;
+unsigned int BIOS_revision;
+
+struct apm_info apm_info;
+EXPORT_SYMBOL(apm_info);
+
+#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
+ defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
+struct ist_info ist_info;
+EXPORT_SYMBOL(ist_info);
+#else
+struct ist_info ist_info;
+#endif
+
+#else
+struct cpuinfo_x86 boot_cpu_data __read_mostly;
+EXPORT_SYMBOL(boot_cpu_data);
+#endif
-DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
-EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
-/* Bitmask of physically existing CPUs */
-physid_mask_t phys_cpu_present_map;
+#if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
+unsigned long mmu_cr4_features;
+#else
+unsigned long mmu_cr4_features = X86_CR4_PAE;
#endif
-#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP)
+/* Boot loader ID as an integer, for the benefit of proc_dointvec */
+int bootloader_type;
+
/*
- * Copy data used in early init routines from the initial arrays to the
- * per cpu data areas. These arrays then become expendable and the
- * *_early_ptr's are zeroed indicating that the static arrays are gone.
+ * Early DMI memory
+ */
+int dmi_alloc_index;
+char dmi_alloc_data[DMI_MAX_DATA];
+
+/*
+ * Setup options
+ */
+struct screen_info screen_info;
+EXPORT_SYMBOL(screen_info);
+struct edid_info edid_info;
+EXPORT_SYMBOL_GPL(edid_info);
+
+extern int root_mountflags;
+
+unsigned long saved_video_mode;
+
+#define RAMDISK_IMAGE_START_MASK 0x07FF
+#define RAMDISK_PROMPT_FLAG 0x8000
+#define RAMDISK_LOAD_FLAG 0x4000
+
+static char __initdata command_line[COMMAND_LINE_SIZE];
+
+#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
+struct edd edd;
+#ifdef CONFIG_EDD_MODULE
+EXPORT_SYMBOL(edd);
+#endif
+/**
+ * copy_edd() - Copy the BIOS EDD information
+ * from boot_params into a safe place.
+ *
*/
-static void __init setup_per_cpu_maps(void)
+static inline void copy_edd(void)
+{
+ memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
+ sizeof(edd.mbr_signature));
+ memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
+ edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
+ edd.edd_info_nr = boot_params.eddbuf_entries;
+}
+#else
+static inline void copy_edd(void)
+{
+}
+#endif
+
+#ifdef CONFIG_BLK_DEV_INITRD
+
+#ifdef CONFIG_X86_32
+
+#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
+static void __init relocate_initrd(void)
{
- int cpu;
- for_each_possible_cpu(cpu) {
- per_cpu(x86_cpu_to_apicid, cpu) = x86_cpu_to_apicid_init[cpu];
- per_cpu(x86_bios_cpu_apicid, cpu) =
- x86_bios_cpu_apicid_init[cpu];
-#ifdef CONFIG_NUMA
- per_cpu(x86_cpu_to_node_map, cpu) =
- x86_cpu_to_node_map_init[cpu];
+ u64 ramdisk_image = boot_params.hdr.ramdisk_image;
+ u64 ramdisk_size = boot_params.hdr.ramdisk_size;
+ u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
+ u64 ramdisk_here;
+ unsigned long slop, clen, mapaddr;
+ char *p, *q;
+
+ /* We need to move the initrd down into lowmem */
+ ramdisk_here = find_e820_area(0, end_of_lowmem, ramdisk_size,
+ PAGE_SIZE);
+
+ if (ramdisk_here == -1ULL)
+ panic("Cannot find place for new RAMDISK of size %lld\n",
+ ramdisk_size);
+
+ /* Note: this includes all the lowmem currently occupied by
+ the initrd, we rely on that fact to keep the data intact. */
+ reserve_early(ramdisk_here, ramdisk_here + ramdisk_size,
+ "NEW RAMDISK");
+ initrd_start = ramdisk_here + PAGE_OFFSET;
+ initrd_end = initrd_start + ramdisk_size;
+ printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
+ ramdisk_here, ramdisk_here + ramdisk_size);
+
+ q = (char *)initrd_start;
+
+ /* Copy any lowmem portion of the initrd */
+ if (ramdisk_image < end_of_lowmem) {
+ clen = end_of_lowmem - ramdisk_image;
+ p = (char *)__va(ramdisk_image);
+ memcpy(q, p, clen);
+ q += clen;
+ ramdisk_image += clen;
+ ramdisk_size -= clen;
+ }
+
+ /* Copy the highmem portion of the initrd */
+ while (ramdisk_size) {
+ slop = ramdisk_image & ~PAGE_MASK;
+ clen = ramdisk_size;
+ if (clen > MAX_MAP_CHUNK-slop)
+ clen = MAX_MAP_CHUNK-slop;
+ mapaddr = ramdisk_image & PAGE_MASK;
+ p = early_ioremap(mapaddr, clen+slop);
+ memcpy(q, p+slop, clen);
+ early_iounmap(p, clen+slop);
+ q += clen;
+ ramdisk_image += clen;
+ ramdisk_size -= clen;
+ }
+ /* high pages is not converted by early_res_to_bootmem */
+ ramdisk_image = boot_params.hdr.ramdisk_image;
+ ramdisk_size = boot_params.hdr.ramdisk_size;
+ printk(KERN_INFO "Move RAMDISK from %016llx - %016llx to"
+ " %08llx - %08llx\n",
+ ramdisk_image, ramdisk_image + ramdisk_size - 1,
+ ramdisk_here, ramdisk_here + ramdisk_size - 1);
+}
#endif
+
+static void __init reserve_initrd(void)
+{
+ u64 ramdisk_image = boot_params.hdr.ramdisk_image;
+ u64 ramdisk_size = boot_params.hdr.ramdisk_size;
+ u64 ramdisk_end = ramdisk_image + ramdisk_size;
+ u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
+
+ if (!boot_params.hdr.type_of_loader ||
+ !ramdisk_image || !ramdisk_size)
+ return; /* No initrd provided by bootloader */
+
+ initrd_start = 0;
+
+ if (ramdisk_size >= (end_of_lowmem>>1)) {
+ free_early(ramdisk_image, ramdisk_end);
+ printk(KERN_ERR "initrd too large to handle, "
+ "disabling initrd\n");
+ return;
+ }
+
+ printk(KERN_INFO "RAMDISK: %08llx - %08llx\n", ramdisk_image,
+ ramdisk_end);
+
+
+ if (ramdisk_end <= end_of_lowmem) {
+ /* All in lowmem, easy case */
+ /*
+ * don't need to reserve again, already reserved early
+ * in i386_start_kernel
+ */
+ initrd_start = ramdisk_image + PAGE_OFFSET;
+ initrd_end = initrd_start + ramdisk_size;
+ return;
}
- /* indicate the early static arrays will soon be gone */
- x86_cpu_to_apicid_early_ptr = NULL;
- x86_bios_cpu_apicid_early_ptr = NULL;
-#ifdef CONFIG_NUMA
- x86_cpu_to_node_map_early_ptr = NULL;
+#ifdef CONFIG_X86_32
+ relocate_initrd();
+#else
+ printk(KERN_ERR "initrd extends beyond end of memory "
+ "(0x%08llx > 0x%08llx)\ndisabling initrd\n",
+ ramdisk_end, end_of_lowmem);
+ initrd_start = 0;
#endif
+ free_early(ramdisk_image, ramdisk_end);
}
+#else
+static void __init reserve_initrd(void)
+{
+}
+#endif /* CONFIG_BLK_DEV_INITRD */
+
+static void __init parse_setup_data(void)
+{
+ struct setup_data *data;
+ u64 pa_data;
+
+ if (boot_params.hdr.version < 0x0209)
+ return;
+ pa_data = boot_params.hdr.setup_data;
+ while (pa_data) {
+ data = early_ioremap(pa_data, PAGE_SIZE);
+ switch (data->type) {
+ case SETUP_E820_EXT:
+ parse_e820_ext(data, pa_data);
+ break;
+ default:
+ break;
+ }
+ pa_data = data->next;
+ early_iounmap(data, PAGE_SIZE);
+ }
+}
+
+static void __init e820_reserve_setup_data(void)
+{
+ struct setup_data *data;
+ u64 pa_data;
+ int found = 0;
+
+ if (boot_params.hdr.version < 0x0209)
+ return;
+ pa_data = boot_params.hdr.setup_data;
+ while (pa_data) {
+ data = early_ioremap(pa_data, sizeof(*data));
+ e820_update_range(pa_data, sizeof(*data)+data->len,
+ E820_RAM, E820_RESERVED_KERN);
+ found = 1;
+ pa_data = data->next;
+ early_iounmap(data, sizeof(*data));
+ }
+ if (!found)
+ return;
-#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
-cpumask_t *cpumask_of_cpu_map __read_mostly;
-EXPORT_SYMBOL(cpumask_of_cpu_map);
+ sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+ memcpy(&e820_saved, &e820, sizeof(struct e820map));
+ printk(KERN_INFO "extended physical RAM map:\n");
+ e820_print_map("reserve setup_data");
+}
-/* requires nr_cpu_ids to be initialized */
-static void __init setup_cpumask_of_cpu(void)
+static void __init reserve_early_setup_data(void)
{
- int i;
+ struct setup_data *data;
+ u64 pa_data;
+ char buf[32];
+
+ if (boot_params.hdr.version < 0x0209)
+ return;
+ pa_data = boot_params.hdr.setup_data;
+ while (pa_data) {
+ data = early_ioremap(pa_data, sizeof(*data));
+ sprintf(buf, "setup data %x", data->type);
+ reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
+ pa_data = data->next;
+ early_iounmap(data, sizeof(*data));
+ }
+}
+
+/*
+ * --------- Crashkernel reservation ------------------------------
+ */
+
+#ifdef CONFIG_KEXEC
+
+/**
+ * Reserve @size bytes of crashkernel memory at any suitable offset.
+ *
+ * @size: Size of the crashkernel memory to reserve.
+ * Returns the base address on success, and -1ULL on failure.
+ */
+unsigned long long find_and_reserve_crashkernel(unsigned long long size)
+{
+ const unsigned long long alignment = 16<<20; /* 16M */
+ unsigned long long start = 0LL;
+
+ while (1) {
+ int ret;
+
+ start = find_e820_area(start, ULONG_MAX, size, alignment);
+ if (start == -1ULL)
+ return start;
+
+ /* try to reserve it */
+ ret = reserve_bootmem_generic(start, size, BOOTMEM_EXCLUSIVE);
+ if (ret >= 0)
+ return start;
+
+ start += alignment;
+ }
+}
+
+static inline unsigned long long get_total_mem(void)
+{
+ unsigned long long total;
+
+ total = max_low_pfn - min_low_pfn;
+#ifdef CONFIG_HIGHMEM
+ total += highend_pfn - highstart_pfn;
+#endif
+
+ return total << PAGE_SHIFT;
+}
+
+static void __init reserve_crashkernel(void)
+{
+ unsigned long long total_mem;
+ unsigned long long crash_size, crash_base;
+ int ret;
+
+ total_mem = get_total_mem();
- /* alloc_bootmem zeroes memory */
- cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids);
- for (i = 0; i < nr_cpu_ids; i++)
- cpu_set(i, cpumask_of_cpu_map[i]);
+ ret = parse_crashkernel(boot_command_line, total_mem,
+ &crash_size, &crash_base);
+ if (ret != 0 || crash_size <= 0)
+ return;
+
+ /* 0 means: find the address automatically */
+ if (crash_base <= 0) {
+ crash_base = find_and_reserve_crashkernel(crash_size);
+ if (crash_base == -1ULL) {
+ pr_info("crashkernel reservation failed. "
+ "No suitable area found.\n");
+ return;
+ }
+ } else {
+ ret = reserve_bootmem_generic(crash_base, crash_size,
+ BOOTMEM_EXCLUSIVE);
+ if (ret < 0) {
+ pr_info("crashkernel reservation failed - "
+ "memory is in use\n");
+ return;
+ }
+ }
+
+ printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
+ "for crashkernel (System RAM: %ldMB)\n",
+ (unsigned long)(crash_size >> 20),
+ (unsigned long)(crash_base >> 20),
+ (unsigned long)(total_mem >> 20));
+
+ crashk_res.start = crash_base;
+ crashk_res.end = crash_base + crash_size - 1;
+ insert_resource(&iomem_resource, &crashk_res);
}
#else
-static inline void setup_cpumask_of_cpu(void) { }
+static void __init reserve_crashkernel(void)
+{
+}
#endif
-#ifdef CONFIG_X86_32
-/*
- * Great future not-so-futuristic plan: make i386 and x86_64 do it
- * the same way
+static struct resource standard_io_resources[] = {
+ { .name = "dma1", .start = 0x00, .end = 0x1f,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "pic1", .start = 0x20, .end = 0x21,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "timer0", .start = 0x40, .end = 0x43,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "timer1", .start = 0x50, .end = 0x53,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "keyboard", .start = 0x60, .end = 0x60,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "keyboard", .start = 0x64, .end = 0x64,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "dma page reg", .start = 0x80, .end = 0x8f,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "pic2", .start = 0xa0, .end = 0xa1,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "dma2", .start = 0xc0, .end = 0xdf,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "fpu", .start = 0xf0, .end = 0xff,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO }
+};
+
+static void __init reserve_standard_io_resources(void)
+{
+ int i;
+
+ /* request I/O space for devices used on all i[345]86 PCs */
+ for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
+ request_resource(&ioport_resource, &standard_io_resources[i]);
+
+}
+
+#ifdef CONFIG_PROC_VMCORE
+/* elfcorehdr= specifies the location of elf core header
+ * stored by the crashed kernel. This option will be passed
+ * by kexec loader to the capture kernel.
*/
-unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
-EXPORT_SYMBOL(__per_cpu_offset);
+static int __init setup_elfcorehdr(char *arg)
+{
+ char *end;
+ if (!arg)
+ return -EINVAL;
+ elfcorehdr_addr = memparse(arg, &end);
+ return end > arg ? 0 : -EINVAL;
+}
+early_param("elfcorehdr", setup_elfcorehdr);
#endif
/*
- * Great future plan:
- * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
- * Always point %gs to its beginning
+ * Determine if we were loaded by an EFI loader. If so, then we have also been
+ * passed the efi memmap, systab, etc., so we should use these data structures
+ * for initialization. Note, the efi init code path is determined by the
+ * global efi_enabled. This allows the same kernel image to be used on existing
+ * systems (with a traditional BIOS) as well as on EFI systems.
+ */
+/*
+ * setup_arch - architecture-specific boot-time initializations
+ *
+ * Note: On x86_64, fixmaps are ready for use even before this is called.
*/
-void __init setup_per_cpu_areas(void)
+
+void __init setup_arch(char **cmdline_p)
{
- int i, highest_cpu = 0;
- unsigned long size;
+#ifdef CONFIG_X86_32
+ memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
+ visws_early_detect();
+ pre_setup_arch_hook();
+ early_cpu_init();
+#else
+ printk(KERN_INFO "Command line: %s\n", boot_command_line);
+#endif
-#ifdef CONFIG_HOTPLUG_CPU
- prefill_possible_map();
+ early_ioremap_init();
+
+ ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
+ screen_info = boot_params.screen_info;
+ edid_info = boot_params.edid_info;
+#ifdef CONFIG_X86_32
+ apm_info.bios = boot_params.apm_bios_info;
+ ist_info = boot_params.ist_info;
+ if (boot_params.sys_desc_table.length != 0) {
+ set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
+ machine_id = boot_params.sys_desc_table.table[0];
+ machine_submodel_id = boot_params.sys_desc_table.table[1];
+ BIOS_revision = boot_params.sys_desc_table.table[2];
+ }
+#endif
+ saved_video_mode = boot_params.hdr.vid_mode;
+ bootloader_type = boot_params.hdr.type_of_loader;
+
+#ifdef CONFIG_BLK_DEV_RAM
+ rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
+ rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
+ rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
+#endif
+#ifdef CONFIG_EFI
+ if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
+#ifdef CONFIG_X86_32
+ "EL32",
+#else
+ "EL64",
+#endif
+ 4)) {
+ efi_enabled = 1;
+ efi_reserve_early();
+ }
#endif
- /* Copy section for each CPU (we discard the original) */
- size = PERCPU_ENOUGH_ROOM;
- printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n",
- size);
+ ARCH_SETUP
- for_each_possible_cpu(i) {
- char *ptr;
-#ifndef CONFIG_NEED_MULTIPLE_NODES
- ptr = alloc_bootmem_pages(size);
+ setup_memory_map();
+ parse_setup_data();
+ /* update the e820_saved too */
+ e820_reserve_setup_data();
+
+ copy_edd();
+
+ if (!boot_params.hdr.root_flags)
+ root_mountflags &= ~MS_RDONLY;
+ init_mm.start_code = (unsigned long) _text;
+ init_mm.end_code = (unsigned long) _etext;
+ init_mm.end_data = (unsigned long) _edata;
+#ifdef CONFIG_X86_32
+ init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
#else
- int node = early_cpu_to_node(i);
- if (!node_online(node) || !NODE_DATA(node)) {
- ptr = alloc_bootmem_pages(size);
- printk(KERN_INFO
- "cpu %d has no node or node-local memory\n", i);
- }
- else
- ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
+ init_mm.brk = (unsigned long) &_end;
#endif
- if (!ptr)
- panic("Cannot allocate cpu data for CPU %d\n", i);
+
+ code_resource.start = virt_to_phys(_text);
+ code_resource.end = virt_to_phys(_etext)-1;
+ data_resource.start = virt_to_phys(_etext);
+ data_resource.end = virt_to_phys(_edata)-1;
+ bss_resource.start = virt_to_phys(&__bss_start);
+ bss_resource.end = virt_to_phys(&__bss_stop)-1;
+
#ifdef CONFIG_X86_64
- cpu_pda(i)->data_offset = ptr - __per_cpu_start;
+ early_cpu_init();
+#endif
+ strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
+ *cmdline_p = command_line;
+
+ parse_early_param();
+
+ /* after early param, so could get panic from serial */
+ reserve_early_setup_data();
+
+ if (acpi_mps_check()) {
+#ifdef CONFIG_X86_LOCAL_APIC
+ disable_apic = 1;
+#endif
+ clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
+ }
+
+#ifdef CONFIG_PCI
+ if (pci_early_dump_regs)
+ early_dump_pci_devices();
+#endif
+
+ finish_e820_parsing();
+
+#ifdef CONFIG_X86_32
+ probe_roms();
+#endif
+
+ /* after parse_early_param, so could debug it */
+ insert_resource(&iomem_resource, &code_resource);
+ insert_resource(&iomem_resource, &data_resource);
+ insert_resource(&iomem_resource, &bss_resource);
+
+ if (efi_enabled)
+ efi_init();
+
+#ifdef CONFIG_X86_32
+ if (ppro_with_ram_bug()) {
+ e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM,
+ E820_RESERVED);
+ sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+ printk(KERN_INFO "fixed physical RAM map:\n");
+ e820_print_map("bad_ppro");
+ }
+#else
+ early_gart_iommu_check();
+#endif
+
+ /*
+ * partially used pages are not usable - thus
+ * we are rounding upwards:
+ */
+ max_pfn = e820_end_of_ram_pfn();
+
+ /* preallocate 4k for mptable mpc */
+ early_reserve_e820_mpc_new();
+ /* update e820 for memory not covered by WB MTRRs */
+ mtrr_bp_init();
+ if (mtrr_trim_uncached_memory(max_pfn))
+ max_pfn = e820_end_of_ram_pfn();
+
+#ifdef CONFIG_X86_32
+ /* max_low_pfn get updated here */
+ find_low_pfn_range();
#else
- __per_cpu_offset[i] = ptr - __per_cpu_start;
+ num_physpages = max_pfn;
+
+ check_efer();
+
+ /* How many end-of-memory variables you have, grandma! */
+ /* need this before calling reserve_initrd */
+ if (max_pfn > (1UL<<(32 - PAGE_SHIFT)))
+ max_low_pfn = e820_end_of_low_ram_pfn();
+ else
+ max_low_pfn = max_pfn;
+
+ high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
#endif
- memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
- highest_cpu = i;
+ /* max_pfn_mapped is updated here */
+ max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
+ max_pfn_mapped = max_low_pfn_mapped;
+
+#ifdef CONFIG_X86_64
+ if (max_pfn > max_low_pfn) {
+ max_pfn_mapped = init_memory_mapping(1UL<<32,
+ max_pfn<<PAGE_SHIFT);
+ /* can we preseve max_low_pfn ?*/
+ max_low_pfn = max_pfn;
}
+#endif
- nr_cpu_ids = highest_cpu + 1;
- printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d\n", NR_CPUS, nr_cpu_ids);
+ /*
+ * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
+ */
- /* Setup percpu data maps */
- setup_per_cpu_maps();
+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+ if (init_ohci1394_dma_early)
+ init_ohci1394_dma_on_all_controllers();
+#endif
- /* Setup cpumask_of_cpu map */
- setup_cpumask_of_cpu();
-}
+ reserve_initrd();
+
+#ifdef CONFIG_X86_64
+ vsmp_init();
+#endif
+
+ dmi_scan_machine();
+
+ io_delay_init();
+
+ /*
+ * Parse the ACPI tables for possible boot-time SMP configuration.
+ */
+ acpi_boot_table_init();
+
+#ifdef CONFIG_ACPI_NUMA
+ /*
+ * Parse SRAT to discover nodes.
+ */
+ acpi_numa_init();
+#endif
+
+ initmem_init(0, max_pfn);
+
+#ifdef CONFIG_X86_64
+ dma32_reserve_bootmem();
+#endif
+
+#ifdef CONFIG_ACPI_SLEEP
+ /*
+ * Reserve low memory region for sleep support.
+ */
+ acpi_reserve_bootmem();
+#endif
+#ifdef CONFIG_X86_FIND_SMP_CONFIG
+ /*
+ * Find and reserve possible boot-time SMP configuration:
+ */
+ find_smp_config();
+#endif
+ reserve_crashkernel();
+
+ reserve_ibft_region();
+
+#ifdef CONFIG_KVM_CLOCK
+ kvmclock_init();
+#endif
+
+#if defined(CONFIG_VMI) && defined(CONFIG_X86_32)
+ /*
+ * Must be after max_low_pfn is determined, and before kernel
+ * pagetables are setup.
+ */
+ vmi_init();
+#endif
+
+ paging_init();
+
+#ifdef CONFIG_X86_64
+ map_vsyscall();
+#endif
+
+#ifdef CONFIG_X86_GENERICARCH
+ generic_apic_probe();
+#endif
+
+ early_quirks();
+
+ /*
+ * Read APIC and some other early information from ACPI tables.
+ */
+ acpi_boot_init();
+
+#if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS)
+ /*
+ * get boot-time SMP configuration:
+ */
+ if (smp_found_config)
+ get_smp_config();
+#endif
+ prefill_possible_map();
+#ifdef CONFIG_X86_64
+ init_cpu_to_node();
+#endif
+
+#ifdef CONFIG_X86_NUMAQ
+ /*
+ * need to check online nodes num, call it
+ * here before time_init/tsc_init
+ */
+ numaq_tsc_disable();
+#endif
+
+ init_apic_mappings();
+ ioapic_init_mappings();
+
+#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC) && defined(CONFIG_X86_32)
+ if (def_to_bigsmp)
+ printk(KERN_WARNING "More than 8 CPUs detected and "
+ "CONFIG_X86_PC cannot handle it.\nUse "
+ "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
+#endif
+ kvm_guest_init();
+
+ e820_reserve_resources();
+ e820_mark_nosave_regions(max_low_pfn);
+
+#ifdef CONFIG_X86_32
+ request_resource(&iomem_resource, &video_ram_resource);
+#endif
+ reserve_standard_io_resources();
+
+ e820_setup_gap();
+
+#ifdef CONFIG_VT
+#if defined(CONFIG_VGA_CONSOLE)
+ if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
+ conswitchp = &vga_con;
+#elif defined(CONFIG_DUMMY_CONSOLE)
+ conswitchp = &dummy_con;
#endif
+#endif
+}
diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c
deleted file mode 100644
index aee0e820077..00000000000
--- a/arch/x86/kernel/setup64.c
+++ /dev/null
@@ -1,287 +0,0 @@
-/*
- * X86-64 specific CPU setup.
- * Copyright (C) 1995 Linus Torvalds
- * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen.
- * See setup.c for older changelog.
- */
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/string.h>
-#include <linux/bootmem.h>
-#include <linux/bitops.h>
-#include <linux/module.h>
-#include <linux/kgdb.h>
-#include <asm/pda.h>
-#include <asm/pgtable.h>
-#include <asm/processor.h>
-#include <asm/desc.h>
-#include <asm/atomic.h>
-#include <asm/mmu_context.h>
-#include <asm/smp.h>
-#include <asm/i387.h>
-#include <asm/percpu.h>
-#include <asm/proto.h>
-#include <asm/sections.h>
-#include <asm/setup.h>
-#include <asm/genapic.h>
-
-#ifndef CONFIG_DEBUG_BOOT_PARAMS
-struct boot_params __initdata boot_params;
-#else
-struct boot_params boot_params;
-#endif
-
-cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
-
-struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly;
-EXPORT_SYMBOL(_cpu_pda);
-struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned;
-
-struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
-
-char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
-
-unsigned long __supported_pte_mask __read_mostly = ~0UL;
-EXPORT_SYMBOL_GPL(__supported_pte_mask);
-
-static int do_not_nx __cpuinitdata = 0;
-
-/* noexec=on|off
-Control non executable mappings for 64bit processes.
-
-on Enable(default)
-off Disable
-*/
-static int __init nonx_setup(char *str)
-{
- if (!str)
- return -EINVAL;
- if (!strncmp(str, "on", 2)) {
- __supported_pte_mask |= _PAGE_NX;
- do_not_nx = 0;
- } else if (!strncmp(str, "off", 3)) {
- do_not_nx = 1;
- __supported_pte_mask &= ~_PAGE_NX;
- }
- return 0;
-}
-early_param("noexec", nonx_setup);
-
-int force_personality32 = 0;
-
-/* noexec32=on|off
-Control non executable heap for 32bit processes.
-To control the stack too use noexec=off
-
-on PROT_READ does not imply PROT_EXEC for 32bit processes (default)
-off PROT_READ implies PROT_EXEC
-*/
-static int __init nonx32_setup(char *str)
-{
- if (!strcmp(str, "on"))
- force_personality32 &= ~READ_IMPLIES_EXEC;
- else if (!strcmp(str, "off"))
- force_personality32 |= READ_IMPLIES_EXEC;
- return 1;
-}
-__setup("noexec32=", nonx32_setup);
-
-void pda_init(int cpu)
-{
- struct x8664_pda *pda = cpu_pda(cpu);
-
- /* Setup up data that may be needed in __get_free_pages early */
- asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
- /* Memory clobbers used to order PDA accessed */
- mb();
- wrmsrl(MSR_GS_BASE, pda);
- mb();
-
- pda->cpunumber = cpu;
- pda->irqcount = -1;
- pda->kernelstack =
- (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE;
- pda->active_mm = &init_mm;
- pda->mmu_state = 0;
-
- if (cpu == 0) {
- /* others are initialized in smpboot.c */
- pda->pcurrent = &init_task;
- pda->irqstackptr = boot_cpu_stack;
- } else {
- pda->irqstackptr = (char *)
- __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
- if (!pda->irqstackptr)
- panic("cannot allocate irqstack for cpu %d", cpu);
- }
-
-
- pda->irqstackptr += IRQSTACKSIZE-64;
-}
-
-char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]
-__attribute__((section(".bss.page_aligned")));
-
-extern asmlinkage void ignore_sysret(void);
-
-/* May not be marked __init: used by software suspend */
-void syscall_init(void)
-{
- /*
- * LSTAR and STAR live in a bit strange symbiosis.
- * They both write to the same internal register. STAR allows to set CS/DS
- * but only a 32bit target. LSTAR sets the 64bit rip.
- */
- wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
- wrmsrl(MSR_LSTAR, system_call);
- wrmsrl(MSR_CSTAR, ignore_sysret);
-
-#ifdef CONFIG_IA32_EMULATION
- syscall32_cpu_init ();
-#endif
-
- /* Flags to clear on syscall */
- wrmsrl(MSR_SYSCALL_MASK,
- X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
-}
-
-void __cpuinit check_efer(void)
-{
- unsigned long efer;
-
- rdmsrl(MSR_EFER, efer);
- if (!(efer & EFER_NX) || do_not_nx) {
- __supported_pte_mask &= ~_PAGE_NX;
- }
-}
-
-unsigned long kernel_eflags;
-
-/*
- * Copies of the original ist values from the tss are only accessed during
- * debugging, no special alignment required.
- */
-DEFINE_PER_CPU(struct orig_ist, orig_ist);
-
-/*
- * cpu_init() initializes state that is per-CPU. Some data is already
- * initialized (naturally) in the bootstrap process, such as the GDT
- * and IDT. We reload them nevertheless, this function acts as a
- * 'CPU state barrier', nothing should get across.
- * A lot of state is already set up in PDA init.
- */
-void __cpuinit cpu_init (void)
-{
- int cpu = stack_smp_processor_id();
- struct tss_struct *t = &per_cpu(init_tss, cpu);
- struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
- unsigned long v;
- char *estacks = NULL;
- struct task_struct *me;
- int i;
-
- /* CPU 0 is initialised in head64.c */
- if (cpu != 0) {
- pda_init(cpu);
- } else
- estacks = boot_exception_stacks;
-
- me = current;
-
- if (cpu_test_and_set(cpu, cpu_initialized))
- panic("CPU#%d already initialized!\n", cpu);
-
- printk("Initializing CPU#%d\n", cpu);
-
- clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
-
- /*
- * Initialize the per-CPU GDT with the boot GDT,
- * and set up the GDT descriptor:
- */
- if (cpu)
- memcpy(get_cpu_gdt_table(cpu), cpu_gdt_table, GDT_SIZE);
-
- cpu_gdt_descr[cpu].size = GDT_SIZE;
- load_gdt((const struct desc_ptr *)&cpu_gdt_descr[cpu]);
- load_idt((const struct desc_ptr *)&idt_descr);
-
- memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
- syscall_init();
-
- wrmsrl(MSR_FS_BASE, 0);
- wrmsrl(MSR_KERNEL_GS_BASE, 0);
- barrier();
-
- check_efer();
-
- /*
- * set up and load the per-CPU TSS
- */
- for (v = 0; v < N_EXCEPTION_STACKS; v++) {
- static const unsigned int order[N_EXCEPTION_STACKS] = {
- [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
- [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
- };
- if (cpu) {
- estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
- if (!estacks)
- panic("Cannot allocate exception stack %ld %d\n",
- v, cpu);
- }
- estacks += PAGE_SIZE << order[v];
- orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks;
- }
-
- t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
- /*
- * <= is required because the CPU will access up to
- * 8 bits beyond the end of the IO permission bitmap.
- */
- for (i = 0; i <= IO_BITMAP_LONGS; i++)
- t->io_bitmap[i] = ~0UL;
-
- atomic_inc(&init_mm.mm_count);
- me->active_mm = &init_mm;
- if (me->mm)
- BUG();
- enter_lazy_tlb(&init_mm, me);
-
- set_tss_desc(cpu, t);
- load_TR_desc();
- load_LDT(&init_mm.context);
-
-#ifdef CONFIG_KGDB
- /*
- * If the kgdb is connected no debug regs should be altered. This
- * is only applicable when KGDB and a KGDB I/O module are built
- * into the kernel and you are using early debugging with
- * kgdbwait. KGDB will control the kernel HW breakpoint registers.
- */
- if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
- arch_kgdb_ops.correct_hw_break();
- else {
-#endif
- /*
- * Clear all 6 debug registers:
- */
-
- set_debugreg(0UL, 0);
- set_debugreg(0UL, 1);
- set_debugreg(0UL, 2);
- set_debugreg(0UL, 3);
- set_debugreg(0UL, 6);
- set_debugreg(0UL, 7);
-#ifdef CONFIG_KGDB
- /* If the kgdb is connected no debug regs should be altered. */
- }
-#endif
-
- fpu_init();
-
- raw_local_save_flags(kernel_eflags);
-
- if (is_uv_system())
- uv_cpu_init();
-}
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
deleted file mode 100644
index 2c5f8b213e8..00000000000
--- a/arch/x86/kernel/setup_32.c
+++ /dev/null
@@ -1,958 +0,0 @@
-/*
- * Copyright (C) 1995 Linus Torvalds
- *
- * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
- *
- * Memory region support
- * David Parsons <orc@pell.chi.il.us>, July-August 1999
- *
- * Added E820 sanitization routine (removes overlapping memory regions);
- * Brian Moyle <bmoyle@mvista.com>, February 2001
- *
- * Moved CPU detection code to cpu/${cpu}.c
- * Patrick Mochel <mochel@osdl.org>, March 2002
- *
- * Provisions for empty E820 memory regions (reported by certain BIOSes).
- * Alex Achenbach <xela@slit.de>, December 2002.
- *
- */
-
-/*
- * This file handles the architecture-dependent parts of initialization
- */
-
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/mmzone.h>
-#include <linux/screen_info.h>
-#include <linux/ioport.h>
-#include <linux/acpi.h>
-#include <linux/apm_bios.h>
-#include <linux/initrd.h>
-#include <linux/bootmem.h>
-#include <linux/seq_file.h>
-#include <linux/console.h>
-#include <linux/mca.h>
-#include <linux/root_dev.h>
-#include <linux/highmem.h>
-#include <linux/module.h>
-#include <linux/efi.h>
-#include <linux/init.h>
-#include <linux/edd.h>
-#include <linux/iscsi_ibft.h>
-#include <linux/nodemask.h>
-#include <linux/kexec.h>
-#include <linux/crash_dump.h>
-#include <linux/dmi.h>
-#include <linux/pfn.h>
-#include <linux/pci.h>
-#include <linux/init_ohci1394_dma.h>
-#include <linux/kvm_para.h>
-
-#include <video/edid.h>
-
-#include <asm/mtrr.h>
-#include <asm/apic.h>
-#include <asm/e820.h>
-#include <asm/mpspec.h>
-#include <asm/mmzone.h>
-#include <asm/setup.h>
-#include <asm/arch_hooks.h>
-#include <asm/sections.h>
-#include <asm/io_apic.h>
-#include <asm/ist.h>
-#include <asm/io.h>
-#include <asm/vmi.h>
-#include <setup_arch.h>
-#include <asm/bios_ebda.h>
-#include <asm/cacheflush.h>
-#include <asm/processor.h>
-
-/* This value is set up by the early boot code to point to the value
- immediately after the boot time page tables. It contains a *physical*
- address, and must not be in the .bss segment! */
-unsigned long init_pg_tables_end __initdata = ~0UL;
-
-/*
- * Machine setup..
- */
-static struct resource data_resource = {
- .name = "Kernel data",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-static struct resource code_resource = {
- .name = "Kernel code",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-static struct resource bss_resource = {
- .name = "Kernel bss",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-static struct resource video_ram_resource = {
- .name = "Video RAM area",
- .start = 0xa0000,
- .end = 0xbffff,
- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-static struct resource standard_io_resources[] = { {
- .name = "dma1",
- .start = 0x0000,
- .end = 0x001f,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
- .name = "pic1",
- .start = 0x0020,
- .end = 0x0021,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
- .name = "timer0",
- .start = 0x0040,
- .end = 0x0043,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
- .name = "timer1",
- .start = 0x0050,
- .end = 0x0053,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
- .name = "keyboard",
- .start = 0x0060,
- .end = 0x0060,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
- .name = "keyboard",
- .start = 0x0064,
- .end = 0x0064,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
- .name = "dma page reg",
- .start = 0x0080,
- .end = 0x008f,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
- .name = "pic2",
- .start = 0x00a0,
- .end = 0x00a1,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
- .name = "dma2",
- .start = 0x00c0,
- .end = 0x00df,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
- .name = "fpu",
- .start = 0x00f0,
- .end = 0x00ff,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO
-} };
-
-/* cpu data as detected by the assembly code in head.S */
-struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
-/* common cpu data for all cpus */
-struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
-EXPORT_SYMBOL(boot_cpu_data);
-
-unsigned int def_to_bigsmp;
-
-#ifndef CONFIG_X86_PAE
-unsigned long mmu_cr4_features;
-#else
-unsigned long mmu_cr4_features = X86_CR4_PAE;
-#endif
-
-/* for MCA, but anyone else can use it if they want */
-unsigned int machine_id;
-unsigned int machine_submodel_id;
-unsigned int BIOS_revision;
-
-/* Boot loader ID as an integer, for the benefit of proc_dointvec */
-int bootloader_type;
-
-/* user-defined highmem size */
-static unsigned int highmem_pages = -1;
-
-/*
- * Setup options
- */
-struct screen_info screen_info;
-EXPORT_SYMBOL(screen_info);
-struct apm_info apm_info;
-EXPORT_SYMBOL(apm_info);
-struct edid_info edid_info;
-EXPORT_SYMBOL_GPL(edid_info);
-struct ist_info ist_info;
-#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
- defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
-EXPORT_SYMBOL(ist_info);
-#endif
-
-extern void early_cpu_init(void);
-extern int root_mountflags;
-
-unsigned long saved_video_mode;
-
-#define RAMDISK_IMAGE_START_MASK 0x07FF
-#define RAMDISK_PROMPT_FLAG 0x8000
-#define RAMDISK_LOAD_FLAG 0x4000
-
-static char __initdata command_line[COMMAND_LINE_SIZE];
-
-#ifndef CONFIG_DEBUG_BOOT_PARAMS
-struct boot_params __initdata boot_params;
-#else
-struct boot_params boot_params;
-#endif
-
-#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
-struct edd edd;
-#ifdef CONFIG_EDD_MODULE
-EXPORT_SYMBOL(edd);
-#endif
-/**
- * copy_edd() - Copy the BIOS EDD information
- * from boot_params into a safe place.
- *
- */
-static inline void copy_edd(void)
-{
- memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
- sizeof(edd.mbr_signature));
- memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
- edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
- edd.edd_info_nr = boot_params.eddbuf_entries;
-}
-#else
-static inline void copy_edd(void)
-{
-}
-#endif
-
-int __initdata user_defined_memmap;
-
-/*
- * "mem=nopentium" disables the 4MB page tables.
- * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
- * to <mem>, overriding the bios size.
- * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
- * <start> to <start>+<mem>, overriding the bios size.
- *
- * HPA tells me bootloaders need to parse mem=, so no new
- * option should be mem= [also see Documentation/i386/boot.txt]
- */
-static int __init parse_mem(char *arg)
-{
- if (!arg)
- return -EINVAL;
-
- if (strcmp(arg, "nopentium") == 0) {
- setup_clear_cpu_cap(X86_FEATURE_PSE);
- } else {
- /* If the user specifies memory size, we
- * limit the BIOS-provided memory map to
- * that size. exactmap can be used to specify
- * the exact map. mem=number can be used to
- * trim the existing memory map.
- */
- unsigned long long mem_size;
-
- mem_size = memparse(arg, &arg);
- limit_regions(mem_size);
- user_defined_memmap = 1;
- }
- return 0;
-}
-early_param("mem", parse_mem);
-
-#ifdef CONFIG_PROC_VMCORE
-/* elfcorehdr= specifies the location of elf core header
- * stored by the crashed kernel.
- */
-static int __init parse_elfcorehdr(char *arg)
-{
- if (!arg)
- return -EINVAL;
-
- elfcorehdr_addr = memparse(arg, &arg);
- return 0;
-}
-early_param("elfcorehdr", parse_elfcorehdr);
-#endif /* CONFIG_PROC_VMCORE */
-
-/*
- * highmem=size forces highmem to be exactly 'size' bytes.
- * This works even on boxes that have no highmem otherwise.
- * This also works to reduce highmem size on bigger boxes.
- */
-static int __init parse_highmem(char *arg)
-{
- if (!arg)
- return -EINVAL;
-
- highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
- return 0;
-}
-early_param("highmem", parse_highmem);
-
-/*
- * vmalloc=size forces the vmalloc area to be exactly 'size'
- * bytes. This can be used to increase (or decrease) the
- * vmalloc area - the default is 128m.
- */
-static int __init parse_vmalloc(char *arg)
-{
- if (!arg)
- return -EINVAL;
-
- __VMALLOC_RESERVE = memparse(arg, &arg);
- return 0;
-}
-early_param("vmalloc", parse_vmalloc);
-
-/*
- * reservetop=size reserves a hole at the top of the kernel address space which
- * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
- * so relocating the fixmap can be done before paging initialization.
- */
-static int __init parse_reservetop(char *arg)
-{
- unsigned long address;
-
- if (!arg)
- return -EINVAL;
-
- address = memparse(arg, &arg);
- reserve_top_address(address);
- return 0;
-}
-early_param("reservetop", parse_reservetop);
-
-/*
- * Determine low and high memory ranges:
- */
-unsigned long __init find_max_low_pfn(void)
-{
- unsigned long max_low_pfn;
-
- max_low_pfn = max_pfn;
- if (max_low_pfn > MAXMEM_PFN) {
- if (highmem_pages == -1)
- highmem_pages = max_pfn - MAXMEM_PFN;
- if (highmem_pages + MAXMEM_PFN < max_pfn)
- max_pfn = MAXMEM_PFN + highmem_pages;
- if (highmem_pages + MAXMEM_PFN > max_pfn) {
- printk("only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages));
- highmem_pages = 0;
- }
- max_low_pfn = MAXMEM_PFN;
-#ifndef CONFIG_HIGHMEM
- /* Maximum memory usable is what is directly addressable */
- printk(KERN_WARNING "Warning only %ldMB will be used.\n",
- MAXMEM>>20);
- if (max_pfn > MAX_NONPAE_PFN)
- printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
- else
- printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
- max_pfn = MAXMEM_PFN;
-#else /* !CONFIG_HIGHMEM */
-#ifndef CONFIG_HIGHMEM64G
- if (max_pfn > MAX_NONPAE_PFN) {
- max_pfn = MAX_NONPAE_PFN;
- printk(KERN_WARNING "Warning only 4GB will be used.\n");
- printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
- }
-#endif /* !CONFIG_HIGHMEM64G */
-#endif /* !CONFIG_HIGHMEM */
- } else {
- if (highmem_pages == -1)
- highmem_pages = 0;
-#ifdef CONFIG_HIGHMEM
- if (highmem_pages >= max_pfn) {
- printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
- highmem_pages = 0;
- }
- if (highmem_pages) {
- if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){
- printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages));
- highmem_pages = 0;
- }
- max_low_pfn -= highmem_pages;
- }
-#else
- if (highmem_pages)
- printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
-#endif
- }
- return max_low_pfn;
-}
-
-#define BIOS_LOWMEM_KILOBYTES 0x413
-
-/*
- * The BIOS places the EBDA/XBDA at the top of conventional
- * memory, and usually decreases the reported amount of
- * conventional memory (int 0x12) too. This also contains a
- * workaround for Dell systems that neglect to reserve EBDA.
- * The same workaround also avoids a problem with the AMD768MPX
- * chipset: reserve a page before VGA to prevent PCI prefetch
- * into it (errata #56). Usually the page is reserved anyways,
- * unless you have no PS/2 mouse plugged in.
- */
-static void __init reserve_ebda_region(void)
-{
- unsigned int lowmem, ebda_addr;
-
- /* To determine the position of the EBDA and the */
- /* end of conventional memory, we need to look at */
- /* the BIOS data area. In a paravirtual environment */
- /* that area is absent. We'll just have to assume */
- /* that the paravirt case can handle memory setup */
- /* correctly, without our help. */
- if (paravirt_enabled())
- return;
-
- /* end of low (conventional) memory */
- lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
- lowmem <<= 10;
-
- /* start of EBDA area */
- ebda_addr = get_bios_ebda();
-
- /* Fixup: bios puts an EBDA in the top 64K segment */
- /* of conventional memory, but does not adjust lowmem. */
- if ((lowmem - ebda_addr) <= 0x10000)
- lowmem = ebda_addr;
-
- /* Fixup: bios does not report an EBDA at all. */
- /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
- if ((ebda_addr == 0) && (lowmem >= 0x9f000))
- lowmem = 0x9f000;
-
- /* Paranoia: should never happen, but... */
- if ((lowmem == 0) || (lowmem >= 0x100000))
- lowmem = 0x9f000;
-
- /* reserve all memory between lowmem and the 1MB mark */
- reserve_bootmem(lowmem, 0x100000 - lowmem, BOOTMEM_DEFAULT);
-}
-
-#ifndef CONFIG_NEED_MULTIPLE_NODES
-static void __init setup_bootmem_allocator(void);
-static unsigned long __init setup_memory(void)
-{
- /*
- * partially used pages are not usable - thus
- * we are rounding upwards:
- */
- min_low_pfn = PFN_UP(init_pg_tables_end);
-
- max_low_pfn = find_max_low_pfn();
-
-#ifdef CONFIG_HIGHMEM
- highstart_pfn = highend_pfn = max_pfn;
- if (max_pfn > max_low_pfn) {
- highstart_pfn = max_low_pfn;
- }
- printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
- pages_to_mb(highend_pfn - highstart_pfn));
- num_physpages = highend_pfn;
- high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
-#else
- num_physpages = max_low_pfn;
- high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
-#endif
-#ifdef CONFIG_FLATMEM
- max_mapnr = num_physpages;
-#endif
- printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
- pages_to_mb(max_low_pfn));
-
- setup_bootmem_allocator();
-
- return max_low_pfn;
-}
-
-static void __init zone_sizes_init(void)
-{
- unsigned long max_zone_pfns[MAX_NR_ZONES];
- memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
- max_zone_pfns[ZONE_DMA] =
- virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
- max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
-#ifdef CONFIG_HIGHMEM
- max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
- add_active_range(0, 0, highend_pfn);
-#else
- add_active_range(0, 0, max_low_pfn);
-#endif
-
- free_area_init_nodes(max_zone_pfns);
-}
-#else
-extern unsigned long __init setup_memory(void);
-extern void zone_sizes_init(void);
-#endif /* !CONFIG_NEED_MULTIPLE_NODES */
-
-static inline unsigned long long get_total_mem(void)
-{
- unsigned long long total;
-
- total = max_low_pfn - min_low_pfn;
-#ifdef CONFIG_HIGHMEM
- total += highend_pfn - highstart_pfn;
-#endif
-
- return total << PAGE_SHIFT;
-}
-
-#ifdef CONFIG_KEXEC
-static void __init reserve_crashkernel(void)
-{
- unsigned long long total_mem;
- unsigned long long crash_size, crash_base;
- int ret;
-
- total_mem = get_total_mem();
-
- ret = parse_crashkernel(boot_command_line, total_mem,
- &crash_size, &crash_base);
- if (ret == 0 && crash_size > 0) {
- if (crash_base > 0) {
- printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
- "for crashkernel (System RAM: %ldMB)\n",
- (unsigned long)(crash_size >> 20),
- (unsigned long)(crash_base >> 20),
- (unsigned long)(total_mem >> 20));
- crashk_res.start = crash_base;
- crashk_res.end = crash_base + crash_size - 1;
- reserve_bootmem(crash_base, crash_size,
- BOOTMEM_DEFAULT);
- } else
- printk(KERN_INFO "crashkernel reservation failed - "
- "you have to specify a base address\n");
- }
-}
-#else
-static inline void __init reserve_crashkernel(void)
-{}
-#endif
-
-#ifdef CONFIG_BLK_DEV_INITRD
-
-static bool do_relocate_initrd = false;
-
-static void __init reserve_initrd(void)
-{
- unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
- unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
- unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
- unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
- unsigned long ramdisk_here;
-
- initrd_start = 0;
-
- if (!boot_params.hdr.type_of_loader ||
- !ramdisk_image || !ramdisk_size)
- return; /* No initrd provided by bootloader */
-
- if (ramdisk_end < ramdisk_image) {
- printk(KERN_ERR "initrd wraps around end of memory, "
- "disabling initrd\n");
- return;
- }
- if (ramdisk_size >= end_of_lowmem/2) {
- printk(KERN_ERR "initrd too large to handle, "
- "disabling initrd\n");
- return;
- }
- if (ramdisk_end <= end_of_lowmem) {
- /* All in lowmem, easy case */
- reserve_bootmem(ramdisk_image, ramdisk_size, BOOTMEM_DEFAULT);
- initrd_start = ramdisk_image + PAGE_OFFSET;
- initrd_end = initrd_start+ramdisk_size;
- return;
- }
-
- /* We need to move the initrd down into lowmem */
- ramdisk_here = (end_of_lowmem - ramdisk_size) & PAGE_MASK;
-
- /* Note: this includes all the lowmem currently occupied by
- the initrd, we rely on that fact to keep the data intact. */
- reserve_bootmem(ramdisk_here, ramdisk_size, BOOTMEM_DEFAULT);
- initrd_start = ramdisk_here + PAGE_OFFSET;
- initrd_end = initrd_start + ramdisk_size;
-
- do_relocate_initrd = true;
-}
-
-#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
-
-static void __init relocate_initrd(void)
-{
- unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
- unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
- unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
- unsigned long ramdisk_here;
- unsigned long slop, clen, mapaddr;
- char *p, *q;
-
- if (!do_relocate_initrd)
- return;
-
- ramdisk_here = initrd_start - PAGE_OFFSET;
-
- q = (char *)initrd_start;
-
- /* Copy any lowmem portion of the initrd */
- if (ramdisk_image < end_of_lowmem) {
- clen = end_of_lowmem - ramdisk_image;
- p = (char *)__va(ramdisk_image);
- memcpy(q, p, clen);
- q += clen;
- ramdisk_image += clen;
- ramdisk_size -= clen;
- }
-
- /* Copy the highmem portion of the initrd */
- while (ramdisk_size) {
- slop = ramdisk_image & ~PAGE_MASK;
- clen = ramdisk_size;
- if (clen > MAX_MAP_CHUNK-slop)
- clen = MAX_MAP_CHUNK-slop;
- mapaddr = ramdisk_image & PAGE_MASK;
- p = early_ioremap(mapaddr, clen+slop);
- memcpy(q, p+slop, clen);
- early_iounmap(p, clen+slop);
- q += clen;
- ramdisk_image += clen;
- ramdisk_size -= clen;
- }
-}
-
-#endif /* CONFIG_BLK_DEV_INITRD */
-
-void __init setup_bootmem_allocator(void)
-{
- unsigned long bootmap_size;
- /*
- * Initialize the boot-time allocator (with low memory only):
- */
- bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);
-
- register_bootmem_low_pages(max_low_pfn);
-
- /*
- * Reserve the bootmem bitmap itself as well. We do this in two
- * steps (first step was init_bootmem()) because this catches
- * the (very unlikely) case of us accidentally initializing the
- * bootmem allocator with an invalid RAM area.
- */
- reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
- bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text),
- BOOTMEM_DEFAULT);
-
- /*
- * reserve physical page 0 - it's a special BIOS page on many boxes,
- * enabling clean reboots, SMP operation, laptop functions.
- */
- reserve_bootmem(0, PAGE_SIZE, BOOTMEM_DEFAULT);
-
- /* reserve EBDA region */
- reserve_ebda_region();
-
-#ifdef CONFIG_SMP
- /*
- * But first pinch a few for the stack/trampoline stuff
- * FIXME: Don't need the extra page at 4K, but need to fix
- * trampoline before removing it. (see the GDT stuff)
- */
- reserve_bootmem(PAGE_SIZE, PAGE_SIZE, BOOTMEM_DEFAULT);
-#endif
-#ifdef CONFIG_ACPI_SLEEP
- /*
- * Reserve low memory region for sleep support.
- */
- acpi_reserve_bootmem();
-#endif
-#ifdef CONFIG_X86_FIND_SMP_CONFIG
- /*
- * Find and reserve possible boot-time SMP configuration:
- */
- find_smp_config();
-#endif
-#ifdef CONFIG_BLK_DEV_INITRD
- reserve_initrd();
-#endif
- numa_kva_reserve();
- reserve_crashkernel();
-
- reserve_ibft_region();
-}
-
-/*
- * The node 0 pgdat is initialized before all of these because
- * it's needed for bootmem. node>0 pgdats have their virtual
- * space allocated before the pagetables are in place to access
- * them, so they can't be cleared then.
- *
- * This should all compile down to nothing when NUMA is off.
- */
-static void __init remapped_pgdat_init(void)
-{
- int nid;
-
- for_each_online_node(nid) {
- if (nid != 0)
- memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
- }
-}
-
-#ifdef CONFIG_MCA
-static void set_mca_bus(int x)
-{
- MCA_bus = x;
-}
-#else
-static void set_mca_bus(int x) { }
-#endif
-
-/* Overridden in paravirt.c if CONFIG_PARAVIRT */
-char * __init __attribute__((weak)) memory_setup(void)
-{
- return machine_specific_memory_setup();
-}
-
-#ifdef CONFIG_NUMA
-/*
- * In the golden day, when everything among i386 and x86_64 will be
- * integrated, this will not live here
- */
-void *x86_cpu_to_node_map_early_ptr;
-int x86_cpu_to_node_map_init[NR_CPUS] = {
- [0 ... NR_CPUS-1] = NUMA_NO_NODE
-};
-DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
-#endif
-
-/*
- * Determine if we were loaded by an EFI loader. If so, then we have also been
- * passed the efi memmap, systab, etc., so we should use these data structures
- * for initialization. Note, the efi init code path is determined by the
- * global efi_enabled. This allows the same kernel image to be used on existing
- * systems (with a traditional BIOS) as well as on EFI systems.
- */
-void __init setup_arch(char **cmdline_p)
-{
- unsigned long max_low_pfn;
-
- memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
- pre_setup_arch_hook();
- early_cpu_init();
- early_ioremap_init();
-
-#ifdef CONFIG_EFI
- if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
- "EL32", 4))
- efi_enabled = 1;
-#endif
-
- ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
- screen_info = boot_params.screen_info;
- edid_info = boot_params.edid_info;
- apm_info.bios = boot_params.apm_bios_info;
- ist_info = boot_params.ist_info;
- saved_video_mode = boot_params.hdr.vid_mode;
- if( boot_params.sys_desc_table.length != 0 ) {
- set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
- machine_id = boot_params.sys_desc_table.table[0];
- machine_submodel_id = boot_params.sys_desc_table.table[1];
- BIOS_revision = boot_params.sys_desc_table.table[2];
- }
- bootloader_type = boot_params.hdr.type_of_loader;
-
-#ifdef CONFIG_BLK_DEV_RAM
- rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
- rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
- rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
-#endif
- ARCH_SETUP
-
- printk(KERN_INFO "BIOS-provided physical RAM map:\n");
- print_memory_map(memory_setup());
-
- copy_edd();
-
- if (!boot_params.hdr.root_flags)
- root_mountflags &= ~MS_RDONLY;
- init_mm.start_code = (unsigned long) _text;
- init_mm.end_code = (unsigned long) _etext;
- init_mm.end_data = (unsigned long) _edata;
- init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
-
- code_resource.start = virt_to_phys(_text);
- code_resource.end = virt_to_phys(_etext)-1;
- data_resource.start = virt_to_phys(_etext);
- data_resource.end = virt_to_phys(_edata)-1;
- bss_resource.start = virt_to_phys(&__bss_start);
- bss_resource.end = virt_to_phys(&__bss_stop)-1;
-
- parse_early_param();
-
- if (user_defined_memmap) {
- printk(KERN_INFO "user-defined physical RAM map:\n");
- print_memory_map("user");
- }
-
- strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
- *cmdline_p = command_line;
-
- if (efi_enabled)
- efi_init();
-
- /* update e820 for memory not covered by WB MTRRs */
- propagate_e820_map();
- mtrr_bp_init();
- if (mtrr_trim_uncached_memory(max_pfn))
- propagate_e820_map();
-
- max_low_pfn = setup_memory();
-
-#ifdef CONFIG_KVM_CLOCK
- kvmclock_init();
-#endif
-
-#ifdef CONFIG_VMI
- /*
- * Must be after max_low_pfn is determined, and before kernel
- * pagetables are setup.
- */
- vmi_init();
-#endif
- kvm_guest_init();
-
- /*
- * NOTE: before this point _nobody_ is allowed to allocate
- * any memory using the bootmem allocator. Although the
- * allocator is now initialised only the first 8Mb of the kernel
- * virtual address space has been mapped. All allocations before
- * paging_init() has completed must use the alloc_bootmem_low_pages()
- * variant (which allocates DMA'able memory) and care must be taken
- * not to exceed the 8Mb limit.
- */
-
-#ifdef CONFIG_SMP
- smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
-#endif
- paging_init();
-
- /*
- * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
- */
-
-#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
- if (init_ohci1394_dma_early)
- init_ohci1394_dma_on_all_controllers();
-#endif
-
- remapped_pgdat_init();
- sparse_init();
- zone_sizes_init();
-
- /*
- * NOTE: at this point the bootmem allocator is fully available.
- */
-
-#ifdef CONFIG_BLK_DEV_INITRD
- relocate_initrd();
-#endif
-
- paravirt_post_allocator_init();
-
- dmi_scan_machine();
-
- io_delay_init();
-
-#ifdef CONFIG_X86_SMP
- /*
- * setup to use the early static init tables during kernel startup
- * X86_SMP will exclude sub-arches that don't deal well with it.
- */
- x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
- x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
-#ifdef CONFIG_NUMA
- x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
-#endif
-#endif
-
-#ifdef CONFIG_X86_GENERICARCH
- generic_apic_probe();
-#endif
-
-#ifdef CONFIG_ACPI
- /*
- * Parse the ACPI tables for possible boot-time SMP configuration.
- */
- acpi_boot_table_init();
-#endif
-
- early_quirks();
-
-#ifdef CONFIG_ACPI
- acpi_boot_init();
-
-#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
- if (def_to_bigsmp)
- printk(KERN_WARNING "More than 8 CPUs detected and "
- "CONFIG_X86_PC cannot handle it.\nUse "
- "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
-#endif
-#endif
-#ifdef CONFIG_X86_LOCAL_APIC
- if (smp_found_config)
- get_smp_config();
-#endif
-
- e820_register_memory();
- e820_mark_nosave_regions();
-
-#ifdef CONFIG_VT
-#if defined(CONFIG_VGA_CONSOLE)
- if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
- conswitchp = &vga_con;
-#elif defined(CONFIG_DUMMY_CONSOLE)
- conswitchp = &dummy_con;
-#endif
-#endif
-}
-
-/*
- * Request address space for all standard resources
- *
- * This is called just before pcibios_init(), which is also a
- * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
- */
-static int __init request_standard_resources(void)
-{
- int i;
-
- printk(KERN_INFO "Setting up standard PCI resources\n");
- init_iomem_resources(&code_resource, &data_resource, &bss_resource);
-
- request_resource(&iomem_resource, &video_ram_resource);
-
- /* request I/O space for devices used on all i[345]86 PCs */
- for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
- request_resource(&ioport_resource, &standard_io_resources[i]);
- return 0;
-}
-
-subsys_initcall(request_standard_resources);
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
deleted file mode 100644
index 6dff1286ad8..00000000000
--- a/arch/x86/kernel/setup_64.c
+++ /dev/null
@@ -1,1194 +0,0 @@
-/*
- * Copyright (C) 1995 Linus Torvalds
- */
-
-/*
- * This file handles the architecture-dependent parts of initialization
- */
-
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/stddef.h>
-#include <linux/unistd.h>
-#include <linux/ptrace.h>
-#include <linux/slab.h>
-#include <linux/user.h>
-#include <linux/screen_info.h>
-#include <linux/ioport.h>
-#include <linux/delay.h>
-#include <linux/init.h>
-#include <linux/initrd.h>
-#include <linux/highmem.h>
-#include <linux/bootmem.h>
-#include <linux/module.h>
-#include <asm/processor.h>
-#include <linux/console.h>
-#include <linux/seq_file.h>
-#include <linux/crash_dump.h>
-#include <linux/root_dev.h>
-#include <linux/pci.h>
-#include <asm/pci-direct.h>
-#include <linux/efi.h>
-#include <linux/acpi.h>
-#include <linux/kallsyms.h>
-#include <linux/edd.h>
-#include <linux/iscsi_ibft.h>
-#include <linux/mmzone.h>
-#include <linux/kexec.h>
-#include <linux/cpufreq.h>
-#include <linux/dmi.h>
-#include <linux/dma-mapping.h>
-#include <linux/ctype.h>
-#include <linux/sort.h>
-#include <linux/uaccess.h>
-#include <linux/init_ohci1394_dma.h>
-#include <linux/kvm_para.h>
-
-#include <asm/mtrr.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <asm/vsyscall.h>
-#include <asm/io.h>
-#include <asm/smp.h>
-#include <asm/msr.h>
-#include <asm/desc.h>
-#include <video/edid.h>
-#include <asm/e820.h>
-#include <asm/dma.h>
-#include <asm/gart.h>
-#include <asm/mpspec.h>
-#include <asm/mmu_context.h>
-#include <asm/proto.h>
-#include <asm/setup.h>
-#include <asm/numa.h>
-#include <asm/sections.h>
-#include <asm/dmi.h>
-#include <asm/cacheflush.h>
-#include <asm/mce.h>
-#include <asm/ds.h>
-#include <asm/topology.h>
-#include <asm/trampoline.h>
-#include <asm/pat.h>
-
-#include <mach_apic.h>
-#ifdef CONFIG_PARAVIRT
-#include <asm/paravirt.h>
-#else
-#define ARCH_SETUP
-#endif
-
-/*
- * Machine setup..
- */
-
-struct cpuinfo_x86 boot_cpu_data __read_mostly;
-EXPORT_SYMBOL(boot_cpu_data);
-
-__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
-
-unsigned long mmu_cr4_features;
-
-/* Boot loader ID as an integer, for the benefit of proc_dointvec */
-int bootloader_type;
-
-unsigned long saved_video_mode;
-
-int force_mwait __cpuinitdata;
-
-/*
- * Early DMI memory
- */
-int dmi_alloc_index;
-char dmi_alloc_data[DMI_MAX_DATA];
-
-/*
- * Setup options
- */
-struct screen_info screen_info;
-EXPORT_SYMBOL(screen_info);
-struct sys_desc_table_struct {
- unsigned short length;
- unsigned char table[0];
-};
-
-struct edid_info edid_info;
-EXPORT_SYMBOL_GPL(edid_info);
-
-extern int root_mountflags;
-
-char __initdata command_line[COMMAND_LINE_SIZE];
-
-static struct resource standard_io_resources[] = {
- { .name = "dma1", .start = 0x00, .end = 0x1f,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
- { .name = "pic1", .start = 0x20, .end = 0x21,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
- { .name = "timer0", .start = 0x40, .end = 0x43,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
- { .name = "timer1", .start = 0x50, .end = 0x53,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
- { .name = "keyboard", .start = 0x60, .end = 0x60,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
- { .name = "keyboard", .start = 0x64, .end = 0x64,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
- { .name = "dma page reg", .start = 0x80, .end = 0x8f,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
- { .name = "pic2", .start = 0xa0, .end = 0xa1,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
- { .name = "dma2", .start = 0xc0, .end = 0xdf,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO },
- { .name = "fpu", .start = 0xf0, .end = 0xff,
- .flags = IORESOURCE_BUSY | IORESOURCE_IO }
-};
-
-#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
-
-static struct resource data_resource = {
- .name = "Kernel data",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_RAM,
-};
-static struct resource code_resource = {
- .name = "Kernel code",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_RAM,
-};
-static struct resource bss_resource = {
- .name = "Kernel bss",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_RAM,
-};
-
-static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
-
-#ifdef CONFIG_PROC_VMCORE
-/* elfcorehdr= specifies the location of elf core header
- * stored by the crashed kernel. This option will be passed
- * by kexec loader to the capture kernel.
- */
-static int __init setup_elfcorehdr(char *arg)
-{
- char *end;
- if (!arg)
- return -EINVAL;
- elfcorehdr_addr = memparse(arg, &end);
- return end > arg ? 0 : -EINVAL;
-}
-early_param("elfcorehdr", setup_elfcorehdr);
-#endif
-
-#ifndef CONFIG_NUMA
-static void __init
-contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
-{
- unsigned long bootmap_size, bootmap;
-
- bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
- bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
- PAGE_SIZE);
- if (bootmap == -1L)
- panic("Cannot find bootmem map of size %ld\n", bootmap_size);
- bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
- e820_register_active_regions(0, start_pfn, end_pfn);
- free_bootmem_with_active_regions(0, end_pfn);
- early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
- reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
-}
-#endif
-
-#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
-struct edd edd;
-#ifdef CONFIG_EDD_MODULE
-EXPORT_SYMBOL(edd);
-#endif
-/**
- * copy_edd() - Copy the BIOS EDD information
- * from boot_params into a safe place.
- *
- */
-static inline void copy_edd(void)
-{
- memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
- sizeof(edd.mbr_signature));
- memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
- edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
- edd.edd_info_nr = boot_params.eddbuf_entries;
-}
-#else
-static inline void copy_edd(void)
-{
-}
-#endif
-
-#ifdef CONFIG_KEXEC
-static void __init reserve_crashkernel(void)
-{
- unsigned long long total_mem;
- unsigned long long crash_size, crash_base;
- int ret;
-
- total_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT;
-
- ret = parse_crashkernel(boot_command_line, total_mem,
- &crash_size, &crash_base);
- if (ret == 0 && crash_size) {
- if (crash_base <= 0) {
- printk(KERN_INFO "crashkernel reservation failed - "
- "you have to specify a base address\n");
- return;
- }
-
- if (reserve_bootmem(crash_base, crash_size,
- BOOTMEM_EXCLUSIVE) < 0) {
- printk(KERN_INFO "crashkernel reservation failed - "
- "memory is in use\n");
- return;
- }
-
- printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
- "for crashkernel (System RAM: %ldMB)\n",
- (unsigned long)(crash_size >> 20),
- (unsigned long)(crash_base >> 20),
- (unsigned long)(total_mem >> 20));
- crashk_res.start = crash_base;
- crashk_res.end = crash_base + crash_size - 1;
- insert_resource(&iomem_resource, &crashk_res);
- }
-}
-#else
-static inline void __init reserve_crashkernel(void)
-{}
-#endif
-
-/* Overridden in paravirt.c if CONFIG_PARAVIRT */
-void __attribute__((weak)) __init memory_setup(void)
-{
- machine_specific_memory_setup();
-}
-
-static void __init parse_setup_data(void)
-{
- struct setup_data *data;
- unsigned long pa_data;
-
- if (boot_params.hdr.version < 0x0209)
- return;
- pa_data = boot_params.hdr.setup_data;
- while (pa_data) {
- data = early_ioremap(pa_data, PAGE_SIZE);
- switch (data->type) {
- default:
- break;
- }
-#ifndef CONFIG_DEBUG_BOOT_PARAMS
- free_early(pa_data, pa_data+sizeof(*data)+data->len);
-#endif
- pa_data = data->next;
- early_iounmap(data, PAGE_SIZE);
- }
-}
-
-#ifdef CONFIG_PCI_MMCONFIG
-extern void __cpuinit fam10h_check_enable_mmcfg(void);
-extern void __init check_enable_amd_mmconf_dmi(void);
-#else
-void __cpuinit fam10h_check_enable_mmcfg(void)
-{
-}
-void __init check_enable_amd_mmconf_dmi(void)
-{
-}
-#endif
-
-/*
- * setup_arch - architecture-specific boot-time initializations
- *
- * Note: On x86_64, fixmaps are ready for use even before this is called.
- */
-void __init setup_arch(char **cmdline_p)
-{
- unsigned i;
-
- printk(KERN_INFO "Command line: %s\n", boot_command_line);
-
- ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
- screen_info = boot_params.screen_info;
- edid_info = boot_params.edid_info;
- saved_video_mode = boot_params.hdr.vid_mode;
- bootloader_type = boot_params.hdr.type_of_loader;
-
-#ifdef CONFIG_BLK_DEV_RAM
- rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
- rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
- rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
-#endif
-#ifdef CONFIG_EFI
- if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
- "EL64", 4))
- efi_enabled = 1;
-#endif
-
- ARCH_SETUP
-
- memory_setup();
- copy_edd();
-
- if (!boot_params.hdr.root_flags)
- root_mountflags &= ~MS_RDONLY;
- init_mm.start_code = (unsigned long) &_text;
- init_mm.end_code = (unsigned long) &_etext;
- init_mm.end_data = (unsigned long) &_edata;
- init_mm.brk = (unsigned long) &_end;
-
- code_resource.start = virt_to_phys(&_text);
- code_resource.end = virt_to_phys(&_etext)-1;
- data_resource.start = virt_to_phys(&_etext);
- data_resource.end = virt_to_phys(&_edata)-1;
- bss_resource.start = virt_to_phys(&__bss_start);
- bss_resource.end = virt_to_phys(&__bss_stop)-1;
-
- early_identify_cpu(&boot_cpu_data);
-
- strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
- *cmdline_p = command_line;
-
- parse_setup_data();
-
- parse_early_param();
-
-#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
- if (init_ohci1394_dma_early)
- init_ohci1394_dma_on_all_controllers();
-#endif
-
- finish_e820_parsing();
-
- /* after parse_early_param, so could debug it */
- insert_resource(&iomem_resource, &code_resource);
- insert_resource(&iomem_resource, &data_resource);
- insert_resource(&iomem_resource, &bss_resource);
-
- early_gart_iommu_check();
-
- e820_register_active_regions(0, 0, -1UL);
- /*
- * partially used pages are not usable - thus
- * we are rounding upwards:
- */
- end_pfn = e820_end_of_ram();
- /* update e820 for memory not covered by WB MTRRs */
- mtrr_bp_init();
- if (mtrr_trim_uncached_memory(end_pfn)) {
- e820_register_active_regions(0, 0, -1UL);
- end_pfn = e820_end_of_ram();
- }
-
- num_physpages = end_pfn;
-
- check_efer();
-
- max_pfn_mapped = init_memory_mapping(0, (max_pfn_mapped << PAGE_SHIFT));
- if (efi_enabled)
- efi_init();
-
- vsmp_init();
-
- dmi_scan_machine();
-
- io_delay_init();
-
-#ifdef CONFIG_KVM_CLOCK
- kvmclock_init();
-#endif
-
-#ifdef CONFIG_SMP
- /* setup to use the early static init tables during kernel startup */
- x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
- x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
-#ifdef CONFIG_NUMA
- x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
-#endif
-#endif
-
-#ifdef CONFIG_ACPI
- /*
- * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
- * Call this early for SRAT node setup.
- */
- acpi_boot_table_init();
-#endif
-
- /* How many end-of-memory variables you have, grandma! */
- max_low_pfn = end_pfn;
- max_pfn = end_pfn;
- high_memory = (void *)__va(end_pfn * PAGE_SIZE - 1) + 1;
-
- /* Remove active ranges so rediscovery with NUMA-awareness happens */
- remove_all_active_ranges();
-
-#ifdef CONFIG_ACPI_NUMA
- /*
- * Parse SRAT to discover nodes.
- */
- acpi_numa_init();
-#endif
-
-#ifdef CONFIG_NUMA
- numa_initmem_init(0, end_pfn);
-#else
- contig_initmem_init(0, end_pfn);
-#endif
-
- dma32_reserve_bootmem();
-
-#ifdef CONFIG_ACPI_SLEEP
- /*
- * Reserve low memory region for sleep support.
- */
- acpi_reserve_bootmem();
-#endif
-
- if (efi_enabled)
- efi_reserve_bootmem();
-
- /*
- * Find and reserve possible boot-time SMP configuration:
- */
- find_smp_config();
-#ifdef CONFIG_BLK_DEV_INITRD
- if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
- unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
- unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
- unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
- unsigned long end_of_mem = end_pfn << PAGE_SHIFT;
-
- if (ramdisk_end <= end_of_mem) {
- /*
- * don't need to reserve again, already reserved early
- * in x86_64_start_kernel, and early_res_to_bootmem
- * convert that to reserved in bootmem
- */
- initrd_start = ramdisk_image + PAGE_OFFSET;
- initrd_end = initrd_start+ramdisk_size;
- } else {
- free_bootmem(ramdisk_image, ramdisk_size);
- printk(KERN_ERR "initrd extends beyond end of memory "
- "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
- ramdisk_end, end_of_mem);
- initrd_start = 0;
- }
- }
-#endif
- reserve_crashkernel();
-
- reserve_ibft_region();
-
- paging_init();
- map_vsyscall();
-
- early_quirks();
-
-#ifdef CONFIG_ACPI
- /*
- * Read APIC and some other early information from ACPI tables.
- */
- acpi_boot_init();
-#endif
-
- init_cpu_to_node();
-
- /*
- * get boot-time SMP configuration:
- */
- if (smp_found_config)
- get_smp_config();
- init_apic_mappings();
- ioapic_init_mappings();
-
- kvm_guest_init();
-
- /*
- * We trust e820 completely. No explicit ROM probing in memory.
- */
- e820_reserve_resources();
- e820_mark_nosave_regions();
-
- /* request I/O space for devices used on all i[345]86 PCs */
- for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
- request_resource(&ioport_resource, &standard_io_resources[i]);
-
- e820_setup_gap();
-
-#ifdef CONFIG_VT
-#if defined(CONFIG_VGA_CONSOLE)
- if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
- conswitchp = &vga_con;
-#elif defined(CONFIG_DUMMY_CONSOLE)
- conswitchp = &dummy_con;
-#endif
-#endif
-
- /* do this before identify_cpu for boot cpu */
- check_enable_amd_mmconf_dmi();
-}
-
-static int __cpuinit get_model_name(struct cpuinfo_x86 *c)
-{
- unsigned int *v;
-
- if (c->extended_cpuid_level < 0x80000004)
- return 0;
-
- v = (unsigned int *) c->x86_model_id;
- cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
- cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
- cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
- c->x86_model_id[48] = 0;
- return 1;
-}
-
-
-static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
-{
- unsigned int n, dummy, eax, ebx, ecx, edx;
-
- n = c->extended_cpuid_level;
-
- if (n >= 0x80000005) {
- cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
- printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
- "D cache %dK (%d bytes/line)\n",
- edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
- c->x86_cache_size = (ecx>>24) + (edx>>24);
- /* On K8 L1 TLB is inclusive, so don't count it */
- c->x86_tlbsize = 0;
- }
-
- if (n >= 0x80000006) {
- cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
- ecx = cpuid_ecx(0x80000006);
- c->x86_cache_size = ecx >> 16;
- c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
-
- printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
- c->x86_cache_size, ecx & 0xFF);
- }
- if (n >= 0x80000008) {
- cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
- c->x86_virt_bits = (eax >> 8) & 0xff;
- c->x86_phys_bits = eax & 0xff;
- }
-}
-
-#ifdef CONFIG_NUMA
-static int __cpuinit nearby_node(int apicid)
-{
- int i, node;
-
- for (i = apicid - 1; i >= 0; i--) {
- node = apicid_to_node[i];
- if (node != NUMA_NO_NODE && node_online(node))
- return node;
- }
- for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
- node = apicid_to_node[i];
- if (node != NUMA_NO_NODE && node_online(node))
- return node;
- }
- return first_node(node_online_map); /* Shouldn't happen */
-}
-#endif
-
-/*
- * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
- * Assumes number of cores is a power of two.
- */
-static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_SMP
- unsigned bits;
-#ifdef CONFIG_NUMA
- int cpu = smp_processor_id();
- int node = 0;
- unsigned apicid = hard_smp_processor_id();
-#endif
- bits = c->x86_coreid_bits;
-
- /* Low order bits define the core id (index of core in socket) */
- c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
- /* Convert the initial APIC ID into the socket ID */
- c->phys_proc_id = c->initial_apicid >> bits;
-
-#ifdef CONFIG_NUMA
- node = c->phys_proc_id;
- if (apicid_to_node[apicid] != NUMA_NO_NODE)
- node = apicid_to_node[apicid];
- if (!node_online(node)) {
- /* Two possibilities here:
- - The CPU is missing memory and no node was created.
- In that case try picking one from a nearby CPU
- - The APIC IDs differ from the HyperTransport node IDs
- which the K8 northbridge parsing fills in.
- Assume they are all increased by a constant offset,
- but in the same order as the HT nodeids.
- If that doesn't result in a usable node fall back to the
- path for the previous case. */
-
- int ht_nodeid = c->initial_apicid;
-
- if (ht_nodeid >= 0 &&
- apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
- node = apicid_to_node[ht_nodeid];
- /* Pick a nearby node */
- if (!node_online(node))
- node = nearby_node(apicid);
- }
- numa_set_node(cpu, node);
-
- printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
-#endif
-#endif
-}
-
-static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_SMP
- unsigned bits, ecx;
-
- /* Multi core CPU? */
- if (c->extended_cpuid_level < 0x80000008)
- return;
-
- ecx = cpuid_ecx(0x80000008);
-
- c->x86_max_cores = (ecx & 0xff) + 1;
-
- /* CPU telling us the core id bits shift? */
- bits = (ecx >> 12) & 0xF;
-
- /* Otherwise recompute */
- if (bits == 0) {
- while ((1 << bits) < c->x86_max_cores)
- bits++;
- }
-
- c->x86_coreid_bits = bits;
-
-#endif
-}
-
-#define ENABLE_C1E_MASK 0x18000000
-#define CPUID_PROCESSOR_SIGNATURE 1
-#define CPUID_XFAM 0x0ff00000
-#define CPUID_XFAM_K8 0x00000000
-#define CPUID_XFAM_10H 0x00100000
-#define CPUID_XFAM_11H 0x00200000
-#define CPUID_XMOD 0x000f0000
-#define CPUID_XMOD_REV_F 0x00040000
-
-/* AMD systems with C1E don't have a working lAPIC timer. Check for that. */
-static __cpuinit int amd_apic_timer_broken(void)
-{
- u32 lo, hi, eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
-
- switch (eax & CPUID_XFAM) {
- case CPUID_XFAM_K8:
- if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F)
- break;
- case CPUID_XFAM_10H:
- case CPUID_XFAM_11H:
- rdmsr(MSR_K8_ENABLE_C1E, lo, hi);
- if (lo & ENABLE_C1E_MASK)
- return 1;
- break;
- default:
- /* err on the side of caution */
- return 1;
- }
- return 0;
-}
-
-static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
-{
- early_init_amd_mc(c);
-
- /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
- if (c->x86_power & (1<<8))
- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
-}
-
-static void __cpuinit init_amd(struct cpuinfo_x86 *c)
-{
- unsigned level;
-
-#ifdef CONFIG_SMP
- unsigned long value;
-
- /*
- * Disable TLB flush filter by setting HWCR.FFDIS on K8
- * bit 6 of msr C001_0015
- *
- * Errata 63 for SH-B3 steppings
- * Errata 122 for all steppings (F+ have it disabled by default)
- */
- if (c->x86 == 15) {
- rdmsrl(MSR_K8_HWCR, value);
- value |= 1 << 6;
- wrmsrl(MSR_K8_HWCR, value);
- }
-#endif
-
- /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
- 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
- clear_cpu_cap(c, 0*32+31);
-
- /* On C+ stepping K8 rep microcode works well for copy/memset */
- level = cpuid_eax(1);
- if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) ||
- level >= 0x0f58))
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
- if (c->x86 == 0x10 || c->x86 == 0x11)
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
-
- /* Enable workaround for FXSAVE leak */
- if (c->x86 >= 6)
- set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
-
- level = get_model_name(c);
- if (!level) {
- switch (c->x86) {
- case 15:
- /* Should distinguish Models here, but this is only
- a fallback anyways. */
- strcpy(c->x86_model_id, "Hammer");
- break;
- }
- }
- display_cacheinfo(c);
-
- /* Multi core CPU? */
- if (c->extended_cpuid_level >= 0x80000008)
- amd_detect_cmp(c);
-
- if (c->extended_cpuid_level >= 0x80000006 &&
- (cpuid_edx(0x80000006) & 0xf000))
- num_cache_leaves = 4;
- else
- num_cache_leaves = 3;
-
- if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11)
- set_cpu_cap(c, X86_FEATURE_K8);
-
- /* MFENCE stops RDTSC speculation */
- set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
-
- if (c->x86 == 0x10)
- fam10h_check_enable_mmcfg();
-
- if (amd_apic_timer_broken())
- disable_apic_timer = 1;
-
- if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
- unsigned long long tseg;
-
- /*
- * Split up direct mapping around the TSEG SMM area.
- * Don't do it for gbpages because there seems very little
- * benefit in doing so.
- */
- if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) &&
- (tseg >> PMD_SHIFT) < (max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT)))
- set_memory_4k((unsigned long)__va(tseg), 1);
- }
-}
-
-void __cpuinit detect_ht(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_SMP
- u32 eax, ebx, ecx, edx;
- int index_msb, core_bits;
-
- cpuid(1, &eax, &ebx, &ecx, &edx);
-
-
- if (!cpu_has(c, X86_FEATURE_HT))
- return;
- if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
- goto out;
-
- smp_num_siblings = (ebx & 0xff0000) >> 16;
-
- if (smp_num_siblings == 1) {
- printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
- } else if (smp_num_siblings > 1) {
-
- if (smp_num_siblings > NR_CPUS) {
- printk(KERN_WARNING "CPU: Unsupported number of "
- "siblings %d", smp_num_siblings);
- smp_num_siblings = 1;
- return;
- }
-
- index_msb = get_count_order(smp_num_siblings);
- c->phys_proc_id = phys_pkg_id(index_msb);
-
- smp_num_siblings = smp_num_siblings / c->x86_max_cores;
-
- index_msb = get_count_order(smp_num_siblings);
-
- core_bits = get_count_order(c->x86_max_cores);
-
- c->cpu_core_id = phys_pkg_id(index_msb) &
- ((1 << core_bits) - 1);
- }
-out:
- if ((c->x86_max_cores * smp_num_siblings) > 1) {
- printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
- c->phys_proc_id);
- printk(KERN_INFO "CPU: Processor Core ID: %d\n",
- c->cpu_core_id);
- }
-
-#endif
-}
-
-/*
- * find out the number of processor cores on the die
- */
-static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
-{
- unsigned int eax, t;
-
- if (c->cpuid_level < 4)
- return 1;
-
- cpuid_count(4, 0, &eax, &t, &t, &t);
-
- if (eax & 0x1f)
- return ((eax >> 26) + 1);
- else
- return 1;
-}
-
-static void __cpuinit srat_detect_node(void)
-{
-#ifdef CONFIG_NUMA
- unsigned node;
- int cpu = smp_processor_id();
- int apicid = hard_smp_processor_id();
-
- /* Don't do the funky fallback heuristics the AMD version employs
- for now. */
- node = apicid_to_node[apicid];
- if (node == NUMA_NO_NODE || !node_online(node))
- node = first_node(node_online_map);
- numa_set_node(cpu, node);
-
- printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
-#endif
-}
-
-static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
-{
- if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
- (c->x86 == 0x6 && c->x86_model >= 0x0e))
- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
-}
-
-static void __cpuinit init_intel(struct cpuinfo_x86 *c)
-{
- /* Cache sizes */
- unsigned n;
-
- init_intel_cacheinfo(c);
- if (c->cpuid_level > 9) {
- unsigned eax = cpuid_eax(10);
- /* Check for version and the number of counters */
- if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
- set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
- }
-
- if (cpu_has_ds) {
- unsigned int l1, l2;
- rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
- if (!(l1 & (1<<11)))
- set_cpu_cap(c, X86_FEATURE_BTS);
- if (!(l1 & (1<<12)))
- set_cpu_cap(c, X86_FEATURE_PEBS);
- }
-
-
- if (cpu_has_bts)
- ds_init_intel(c);
-
- n = c->extended_cpuid_level;
- if (n >= 0x80000008) {
- unsigned eax = cpuid_eax(0x80000008);
- c->x86_virt_bits = (eax >> 8) & 0xff;
- c->x86_phys_bits = eax & 0xff;
- /* CPUID workaround for Intel 0F34 CPU */
- if (c->x86_vendor == X86_VENDOR_INTEL &&
- c->x86 == 0xF && c->x86_model == 0x3 &&
- c->x86_mask == 0x4)
- c->x86_phys_bits = 36;
- }
-
- if (c->x86 == 15)
- c->x86_cache_alignment = c->x86_clflush_size * 2;
- if (c->x86 == 6)
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
- set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
- c->x86_max_cores = intel_num_cpu_cores(c);
-
- srat_detect_node();
-}
-
-static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
-{
- if (c->x86 == 0x6 && c->x86_model >= 0xf)
- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
-}
-
-static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
-{
- /* Cache sizes */
- unsigned n;
-
- n = c->extended_cpuid_level;
- if (n >= 0x80000008) {
- unsigned eax = cpuid_eax(0x80000008);
- c->x86_virt_bits = (eax >> 8) & 0xff;
- c->x86_phys_bits = eax & 0xff;
- }
-
- if (c->x86 == 0x6 && c->x86_model >= 0xf) {
- c->x86_cache_alignment = c->x86_clflush_size * 2;
- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
- }
- set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
-}
-
-static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
-{
- char *v = c->x86_vendor_id;
-
- if (!strcmp(v, "AuthenticAMD"))
- c->x86_vendor = X86_VENDOR_AMD;
- else if (!strcmp(v, "GenuineIntel"))
- c->x86_vendor = X86_VENDOR_INTEL;
- else if (!strcmp(v, "CentaurHauls"))
- c->x86_vendor = X86_VENDOR_CENTAUR;
- else
- c->x86_vendor = X86_VENDOR_UNKNOWN;
-}
-
-/* Do some early cpuid on the boot CPU to get some parameter that are
- needed before check_bugs. Everything advanced is in identify_cpu
- below. */
-static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
-{
- u32 tfms, xlvl;
-
- c->loops_per_jiffy = loops_per_jiffy;
- c->x86_cache_size = -1;
- c->x86_vendor = X86_VENDOR_UNKNOWN;
- c->x86_model = c->x86_mask = 0; /* So far unknown... */
- c->x86_vendor_id[0] = '\0'; /* Unset */
- c->x86_model_id[0] = '\0'; /* Unset */
- c->x86_clflush_size = 64;
- c->x86_cache_alignment = c->x86_clflush_size;
- c->x86_max_cores = 1;
- c->x86_coreid_bits = 0;
- c->extended_cpuid_level = 0;
- memset(&c->x86_capability, 0, sizeof c->x86_capability);
-
- /* Get vendor name */
- cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
- (unsigned int *)&c->x86_vendor_id[0],
- (unsigned int *)&c->x86_vendor_id[8],
- (unsigned int *)&c->x86_vendor_id[4]);
-
- get_cpu_vendor(c);
-
- /* Initialize the standard set of capabilities */
- /* Note that the vendor-specific code below might override */
-
- /* Intel-defined flags: level 0x00000001 */
- if (c->cpuid_level >= 0x00000001) {
- __u32 misc;
- cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
- &c->x86_capability[0]);
- c->x86 = (tfms >> 8) & 0xf;
- c->x86_model = (tfms >> 4) & 0xf;
- c->x86_mask = tfms & 0xf;
- if (c->x86 == 0xf)
- c->x86 += (tfms >> 20) & 0xff;
- if (c->x86 >= 0x6)
- c->x86_model += ((tfms >> 16) & 0xF) << 4;
- if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
- c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
- } else {
- /* Have CPUID level 0 only - unheard of */
- c->x86 = 4;
- }
-
- c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
-#ifdef CONFIG_SMP
- c->phys_proc_id = c->initial_apicid;
-#endif
- /* AMD-defined flags: level 0x80000001 */
- xlvl = cpuid_eax(0x80000000);
- c->extended_cpuid_level = xlvl;
- if ((xlvl & 0xffff0000) == 0x80000000) {
- if (xlvl >= 0x80000001) {
- c->x86_capability[1] = cpuid_edx(0x80000001);
- c->x86_capability[6] = cpuid_ecx(0x80000001);
- }
- if (xlvl >= 0x80000004)
- get_model_name(c); /* Default name */
- }
-
- /* Transmeta-defined flags: level 0x80860001 */
- xlvl = cpuid_eax(0x80860000);
- if ((xlvl & 0xffff0000) == 0x80860000) {
- /* Don't set x86_cpuid_level here for now to not confuse. */
- if (xlvl >= 0x80860001)
- c->x86_capability[2] = cpuid_edx(0x80860001);
- }
-
- c->extended_cpuid_level = cpuid_eax(0x80000000);
- if (c->extended_cpuid_level >= 0x80000007)
- c->x86_power = cpuid_edx(0x80000007);
-
- switch (c->x86_vendor) {
- case X86_VENDOR_AMD:
- early_init_amd(c);
- break;
- case X86_VENDOR_INTEL:
- early_init_intel(c);
- break;
- case X86_VENDOR_CENTAUR:
- early_init_centaur(c);
- break;
- }
-
- validate_pat_support(c);
-}
-
-/*
- * This does the hard work of actually picking apart the CPU stuff...
- */
-void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
-{
- int i;
-
- early_identify_cpu(c);
-
- init_scattered_cpuid_features(c);
-
- c->apicid = phys_pkg_id(0);
-
- /*
- * Vendor-specific initialization. In this section we
- * canonicalize the feature flags, meaning if there are
- * features a certain CPU supports which CPUID doesn't
- * tell us, CPUID claiming incorrect flags, or other bugs,
- * we handle them here.
- *
- * At the end of this section, c->x86_capability better
- * indicate the features this CPU genuinely supports!
- */
- switch (c->x86_vendor) {
- case X86_VENDOR_AMD:
- init_amd(c);
- break;
-
- case X86_VENDOR_INTEL:
- init_intel(c);
- break;
-
- case X86_VENDOR_CENTAUR:
- init_centaur(c);
- break;
-
- case X86_VENDOR_UNKNOWN:
- default:
- display_cacheinfo(c);
- break;
- }
-
- detect_ht(c);
-
- /*
- * On SMP, boot_cpu_data holds the common feature set between
- * all CPUs; so make sure that we indicate which features are
- * common between the CPUs. The first time this routine gets
- * executed, c == &boot_cpu_data.
- */
- if (c != &boot_cpu_data) {
- /* AND the already accumulated flags with these */
- for (i = 0; i < NCAPINTS; i++)
- boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
- }
-
- /* Clear all flags overriden by options */
- for (i = 0; i < NCAPINTS; i++)
- c->x86_capability[i] &= ~cleared_cpu_caps[i];
-
-#ifdef CONFIG_X86_MCE
- mcheck_init(c);
-#endif
- select_idle_routine(c);
-
-#ifdef CONFIG_NUMA
- numa_add_cpu(smp_processor_id());
-#endif
-
-}
-
-void __cpuinit identify_boot_cpu(void)
-{
- identify_cpu(&boot_cpu_data);
-}
-
-void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
-{
- BUG_ON(c == &boot_cpu_data);
- identify_cpu(c);
- mtrr_ap_init();
-}
-
-static __init int setup_noclflush(char *arg)
-{
- setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
- return 1;
-}
-__setup("noclflush", setup_noclflush);
-
-void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
-{
- if (c->x86_model_id[0])
- printk(KERN_CONT "%s", c->x86_model_id);
-
- if (c->x86_mask || c->cpuid_level >= 0)
- printk(KERN_CONT " stepping %02x\n", c->x86_mask);
- else
- printk(KERN_CONT "\n");
-}
-
-static __init int setup_disablecpuid(char *arg)
-{
- int bit;
- if (get_option(&arg, &bit) && bit < NCAPINTS*32)
- setup_clear_cpu_cap(bit);
- else
- return 0;
- return 1;
-}
-__setup("clearcpuid=", setup_disablecpuid);
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
new file mode 100644
index 00000000000..cac68430d31
--- /dev/null
+++ b/arch/x86/kernel/setup_percpu.c
@@ -0,0 +1,399 @@
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/percpu.h>
+#include <linux/kexec.h>
+#include <linux/crash_dump.h>
+#include <asm/smp.h>
+#include <asm/percpu.h>
+#include <asm/sections.h>
+#include <asm/processor.h>
+#include <asm/setup.h>
+#include <asm/topology.h>
+#include <asm/mpspec.h>
+#include <asm/apicdef.h>
+#include <asm/highmem.h>
+
+#ifdef CONFIG_X86_LOCAL_APIC
+unsigned int num_processors;
+unsigned disabled_cpus __cpuinitdata;
+/* Processor that is doing the boot up */
+unsigned int boot_cpu_physical_apicid = -1U;
+unsigned int max_physical_apicid;
+EXPORT_SYMBOL(boot_cpu_physical_apicid);
+
+/* Bitmask of physically existing CPUs */
+physid_mask_t phys_cpu_present_map;
+#endif
+
+/* map cpu index to physical APIC ID */
+DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
+DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
+
+#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
+#define X86_64_NUMA 1
+
+/* map cpu index to node index */
+DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
+
+/* which logical CPUs are on which nodes */
+cpumask_t *node_to_cpumask_map;
+EXPORT_SYMBOL(node_to_cpumask_map);
+
+/* setup node_to_cpumask_map */
+static void __init setup_node_to_cpumask_map(void);
+
+#else
+static inline void setup_node_to_cpumask_map(void) { }
+#endif
+
+#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP)
+/*
+ * Copy data used in early init routines from the initial arrays to the
+ * per cpu data areas. These arrays then become expendable and the
+ * *_early_ptr's are zeroed indicating that the static arrays are gone.
+ */
+static void __init setup_per_cpu_maps(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ per_cpu(x86_cpu_to_apicid, cpu) =
+ early_per_cpu_map(x86_cpu_to_apicid, cpu);
+ per_cpu(x86_bios_cpu_apicid, cpu) =
+ early_per_cpu_map(x86_bios_cpu_apicid, cpu);
+#ifdef X86_64_NUMA
+ per_cpu(x86_cpu_to_node_map, cpu) =
+ early_per_cpu_map(x86_cpu_to_node_map, cpu);
+#endif
+ }
+
+ /* indicate the early static arrays will soon be gone */
+ early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
+ early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
+#ifdef X86_64_NUMA
+ early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
+#endif
+}
+
+#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
+cpumask_t *cpumask_of_cpu_map __read_mostly;
+EXPORT_SYMBOL(cpumask_of_cpu_map);
+
+/* requires nr_cpu_ids to be initialized */
+static void __init setup_cpumask_of_cpu(void)
+{
+ int i;
+
+ /* alloc_bootmem zeroes memory */
+ cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids);
+ for (i = 0; i < nr_cpu_ids; i++)
+ cpu_set(i, cpumask_of_cpu_map[i]);
+}
+#else
+static inline void setup_cpumask_of_cpu(void) { }
+#endif
+
+#ifdef CONFIG_X86_32
+/*
+ * Great future not-so-futuristic plan: make i386 and x86_64 do it
+ * the same way
+ */
+unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
+EXPORT_SYMBOL(__per_cpu_offset);
+static inline void setup_cpu_pda_map(void) { }
+
+#elif !defined(CONFIG_SMP)
+static inline void setup_cpu_pda_map(void) { }
+
+#else /* CONFIG_SMP && CONFIG_X86_64 */
+
+/*
+ * Allocate cpu_pda pointer table and array via alloc_bootmem.
+ */
+static void __init setup_cpu_pda_map(void)
+{
+ char *pda;
+ struct x8664_pda **new_cpu_pda;
+ unsigned long size;
+ int cpu;
+
+ size = roundup(sizeof(struct x8664_pda), cache_line_size());
+
+ /* allocate cpu_pda array and pointer table */
+ {
+ unsigned long tsize = nr_cpu_ids * sizeof(void *);
+ unsigned long asize = size * (nr_cpu_ids - 1);
+
+ tsize = roundup(tsize, cache_line_size());
+ new_cpu_pda = alloc_bootmem(tsize + asize);
+ pda = (char *)new_cpu_pda + tsize;
+ }
+
+ /* initialize pointer table to static pda's */
+ for_each_possible_cpu(cpu) {
+ if (cpu == 0) {
+ /* leave boot cpu pda in place */
+ new_cpu_pda[0] = cpu_pda(0);
+ continue;
+ }
+ new_cpu_pda[cpu] = (struct x8664_pda *)pda;
+ new_cpu_pda[cpu]->in_bootmem = 1;
+ pda += size;
+ }
+
+ /* point to new pointer table */
+ _cpu_pda = new_cpu_pda;
+}
+#endif
+
+/*
+ * Great future plan:
+ * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
+ * Always point %gs to its beginning
+ */
+void __init setup_per_cpu_areas(void)
+{
+ ssize_t size = PERCPU_ENOUGH_ROOM;
+ char *ptr;
+ int cpu;
+
+ /* Setup cpu_pda map */
+ setup_cpu_pda_map();
+
+ /* Copy section for each CPU (we discard the original) */
+ size = PERCPU_ENOUGH_ROOM;
+ printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
+ size);
+
+ for_each_possible_cpu(cpu) {
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+ ptr = alloc_bootmem_pages(size);
+#else
+ int node = early_cpu_to_node(cpu);
+ if (!node_online(node) || !NODE_DATA(node)) {
+ ptr = alloc_bootmem_pages(size);
+ printk(KERN_INFO
+ "cpu %d has no node %d or node-local memory\n",
+ cpu, node);
+ }
+ else
+ ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
+#endif
+ per_cpu_offset(cpu) = ptr - __per_cpu_start;
+ memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
+
+ }
+
+ printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
+ NR_CPUS, nr_cpu_ids, nr_node_ids);
+
+ /* Setup percpu data maps */
+ setup_per_cpu_maps();
+
+ /* Setup node to cpumask map */
+ setup_node_to_cpumask_map();
+
+ /* Setup cpumask_of_cpu map */
+ setup_cpumask_of_cpu();
+}
+
+#endif
+
+#ifdef X86_64_NUMA
+
+/*
+ * Allocate node_to_cpumask_map based on number of available nodes
+ * Requires node_possible_map to be valid.
+ *
+ * Note: node_to_cpumask() is not valid until after this is done.
+ */
+static void __init setup_node_to_cpumask_map(void)
+{
+ unsigned int node, num = 0;
+ cpumask_t *map;
+
+ /* setup nr_node_ids if not done yet */
+ if (nr_node_ids == MAX_NUMNODES) {
+ for_each_node_mask(node, node_possible_map)
+ num = node;
+ nr_node_ids = num + 1;
+ }
+
+ /* allocate the map */
+ map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
+
+ Dprintk(KERN_DEBUG "Node to cpumask map at %p for %d nodes\n",
+ map, nr_node_ids);
+
+ /* node_to_cpumask() will now work */
+ node_to_cpumask_map = map;
+}
+
+void __cpuinit numa_set_node(int cpu, int node)
+{
+ int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
+
+ if (cpu_pda(cpu) && node != NUMA_NO_NODE)
+ cpu_pda(cpu)->nodenumber = node;
+
+ if (cpu_to_node_map)
+ cpu_to_node_map[cpu] = node;
+
+ else if (per_cpu_offset(cpu))
+ per_cpu(x86_cpu_to_node_map, cpu) = node;
+
+ else
+ Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu);
+}
+
+void __cpuinit numa_clear_node(int cpu)
+{
+ numa_set_node(cpu, NUMA_NO_NODE);
+}
+
+#ifndef CONFIG_DEBUG_PER_CPU_MAPS
+
+void __cpuinit numa_add_cpu(int cpu)
+{
+ cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
+}
+
+void __cpuinit numa_remove_cpu(int cpu)
+{
+ cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]);
+}
+
+#else /* CONFIG_DEBUG_PER_CPU_MAPS */
+
+/*
+ * --------- debug versions of the numa functions ---------
+ */
+static void __cpuinit numa_set_cpumask(int cpu, int enable)
+{
+ int node = cpu_to_node(cpu);
+ cpumask_t *mask;
+ char buf[64];
+
+ if (node_to_cpumask_map == NULL) {
+ printk(KERN_ERR "node_to_cpumask_map NULL\n");
+ dump_stack();
+ return;
+ }
+
+ mask = &node_to_cpumask_map[node];
+ if (enable)
+ cpu_set(cpu, *mask);
+ else
+ cpu_clear(cpu, *mask);
+
+ cpulist_scnprintf(buf, sizeof(buf), *mask);
+ printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
+ enable? "numa_add_cpu":"numa_remove_cpu", cpu, node, buf);
+ }
+
+void __cpuinit numa_add_cpu(int cpu)
+{
+ numa_set_cpumask(cpu, 1);
+}
+
+void __cpuinit numa_remove_cpu(int cpu)
+{
+ numa_set_cpumask(cpu, 0);
+}
+
+int cpu_to_node(int cpu)
+{
+ if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
+ printk(KERN_WARNING
+ "cpu_to_node(%d): usage too early!\n", cpu);
+ dump_stack();
+ return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
+ }
+ return per_cpu(x86_cpu_to_node_map, cpu);
+}
+EXPORT_SYMBOL(cpu_to_node);
+
+/*
+ * Same function as cpu_to_node() but used if called before the
+ * per_cpu areas are setup.
+ */
+int early_cpu_to_node(int cpu)
+{
+ if (early_per_cpu_ptr(x86_cpu_to_node_map))
+ return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
+
+ if (!per_cpu_offset(cpu)) {
+ printk(KERN_WARNING
+ "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
+ dump_stack();
+ return NUMA_NO_NODE;
+ }
+ return per_cpu(x86_cpu_to_node_map, cpu);
+}
+
+
+/* empty cpumask */
+static const cpumask_t cpu_mask_none;
+
+/*
+ * Returns a pointer to the bitmask of CPUs on Node 'node'.
+ */
+const cpumask_t *_node_to_cpumask_ptr(int node)
+{
+ if (node_to_cpumask_map == NULL) {
+ printk(KERN_WARNING
+ "_node_to_cpumask_ptr(%d): no node_to_cpumask_map!\n",
+ node);
+ dump_stack();
+ return (const cpumask_t *)&cpu_online_map;
+ }
+ if (node >= nr_node_ids) {
+ printk(KERN_WARNING
+ "_node_to_cpumask_ptr(%d): node > nr_node_ids(%d)\n",
+ node, nr_node_ids);
+ dump_stack();
+ return &cpu_mask_none;
+ }
+ return &node_to_cpumask_map[node];
+}
+EXPORT_SYMBOL(_node_to_cpumask_ptr);
+
+/*
+ * Returns a bitmask of CPUs on Node 'node'.
+ *
+ * Side note: this function creates the returned cpumask on the stack
+ * so with a high NR_CPUS count, excessive stack space is used. The
+ * node_to_cpumask_ptr function should be used whenever possible.
+ */
+cpumask_t node_to_cpumask(int node)
+{
+ if (node_to_cpumask_map == NULL) {
+ printk(KERN_WARNING
+ "node_to_cpumask(%d): no node_to_cpumask_map!\n", node);
+ dump_stack();
+ return cpu_online_map;
+ }
+ if (node >= nr_node_ids) {
+ printk(KERN_WARNING
+ "node_to_cpumask(%d): node > nr_node_ids(%d)\n",
+ node, nr_node_ids);
+ dump_stack();
+ return cpu_mask_none;
+ }
+ return node_to_cpumask_map[node];
+}
+EXPORT_SYMBOL(node_to_cpumask);
+
+/*
+ * --------- end of debug versions of the numa functions ---------
+ */
+
+#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
+
+#endif /* X86_64_NUMA */
+
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 0cb7aadc87c..361b7a4c640 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -121,132 +121,23 @@ static void native_smp_send_reschedule(int cpu)
send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
}
-/*
- * Structure and data for smp_call_function(). This is designed to minimise
- * static memory requirements. It also looks cleaner.
- */
-static DEFINE_SPINLOCK(call_lock);
-
-struct call_data_struct {
- void (*func) (void *info);
- void *info;
- atomic_t started;
- atomic_t finished;
- int wait;
-};
-
-void lock_ipi_call_lock(void)
+void native_send_call_func_single_ipi(int cpu)
{
- spin_lock_irq(&call_lock);
-}
-
-void unlock_ipi_call_lock(void)
-{
- spin_unlock_irq(&call_lock);
-}
-
-static struct call_data_struct *call_data;
-
-static void __smp_call_function(void (*func) (void *info), void *info,
- int nonatomic, int wait)
-{
- struct call_data_struct data;
- int cpus = num_online_cpus() - 1;
-
- if (!cpus)
- return;
-
- data.func = func;
- data.info = info;
- atomic_set(&data.started, 0);
- data.wait = wait;
- if (wait)
- atomic_set(&data.finished, 0);
-
- call_data = &data;
- mb();
-
- /* Send a message to all other CPUs and wait for them to respond */
- send_IPI_allbutself(CALL_FUNCTION_VECTOR);
-
- /* Wait for response */
- while (atomic_read(&data.started) != cpus)
- cpu_relax();
-
- if (wait)
- while (atomic_read(&data.finished) != cpus)
- cpu_relax();
+ send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_SINGLE_VECTOR);
}
-
-/**
- * smp_call_function_mask(): Run a function on a set of other CPUs.
- * @mask: The set of cpus to run on. Must not include the current cpu.
- * @func: The function to run. This must be fast and non-blocking.
- * @info: An arbitrary pointer to pass to the function.
- * @wait: If true, wait (atomically) until function has completed on other CPUs.
- *
- * Returns 0 on success, else a negative status code.
- *
- * If @wait is true, then returns once @func has returned; otherwise
- * it returns just before the target cpu calls @func.
- *
- * You must not call this function with disabled interrupts or from a
- * hardware interrupt handler or from a bottom half handler.
- */
-static int
-native_smp_call_function_mask(cpumask_t mask,
- void (*func)(void *), void *info,
- int wait)
+void native_send_call_func_ipi(cpumask_t mask)
{
- struct call_data_struct data;
cpumask_t allbutself;
- int cpus;
-
- /* Can deadlock when called with interrupts disabled */
- WARN_ON(irqs_disabled());
-
- /* Holding any lock stops cpus from going down. */
- spin_lock(&call_lock);
allbutself = cpu_online_map;
cpu_clear(smp_processor_id(), allbutself);
- cpus_and(mask, mask, allbutself);
- cpus = cpus_weight(mask);
-
- if (!cpus) {
- spin_unlock(&call_lock);
- return 0;
- }
-
- data.func = func;
- data.info = info;
- atomic_set(&data.started, 0);
- data.wait = wait;
- if (wait)
- atomic_set(&data.finished, 0);
-
- call_data = &data;
- wmb();
-
- /* Send a message to other CPUs */
if (cpus_equal(mask, allbutself) &&
cpus_equal(cpu_online_map, cpu_callout_map))
send_IPI_allbutself(CALL_FUNCTION_VECTOR);
else
send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
-
- /* Wait for response */
- while (atomic_read(&data.started) != cpus)
- cpu_relax();
-
- if (wait)
- while (atomic_read(&data.finished) != cpus)
- cpu_relax();
- spin_unlock(&call_lock);
-
- return 0;
}
static void stop_this_cpu(void *dummy)
@@ -268,18 +159,13 @@ static void stop_this_cpu(void *dummy)
static void native_smp_send_stop(void)
{
- int nolock;
unsigned long flags;
if (reboot_force)
return;
- /* Don't deadlock on the call lock in panic */
- nolock = !spin_trylock(&call_lock);
+ smp_call_function(stop_this_cpu, NULL, 0);
local_irq_save(flags);
- __smp_call_function(stop_this_cpu, NULL, 0, 0);
- if (!nolock)
- spin_unlock(&call_lock);
disable_local_APIC();
local_irq_restore(flags);
}
@@ -301,33 +187,28 @@ void smp_reschedule_interrupt(struct pt_regs *regs)
void smp_call_function_interrupt(struct pt_regs *regs)
{
- void (*func) (void *info) = call_data->func;
- void *info = call_data->info;
- int wait = call_data->wait;
-
ack_APIC_irq();
- /*
- * Notify initiating CPU that I've grabbed the data and am
- * about to execute the function
- */
- mb();
- atomic_inc(&call_data->started);
- /*
- * At this point the info structure may be out of scope unless wait==1
- */
irq_enter();
- (*func)(info);
+ generic_smp_call_function_interrupt();
#ifdef CONFIG_X86_32
__get_cpu_var(irq_stat).irq_call_count++;
#else
add_pda(irq_call_count, 1);
#endif
irq_exit();
+}
- if (wait) {
- mb();
- atomic_inc(&call_data->finished);
- }
+void smp_call_function_single_interrupt(struct pt_regs *regs)
+{
+ ack_APIC_irq();
+ irq_enter();
+ generic_smp_call_function_single_interrupt();
+#ifdef CONFIG_X86_32
+ __get_cpu_var(irq_stat).irq_call_count++;
+#else
+ add_pda(irq_call_count, 1);
+#endif
+ irq_exit();
}
struct smp_ops smp_ops = {
@@ -338,7 +219,8 @@ struct smp_ops smp_ops = {
.smp_send_stop = native_smp_send_stop,
.smp_send_reschedule = native_smp_send_reschedule,
- .smp_call_function_mask = native_smp_call_function_mask,
+
+ .send_call_func_ipi = native_send_call_func_ipi,
+ .send_call_func_single_ipi = native_send_call_func_single_ipi,
};
EXPORT_SYMBOL_GPL(smp_ops);
-
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 56078d61c79..687376ab07e 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -59,7 +59,6 @@
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/mtrr.h>
-#include <asm/nmi.h>
#include <asm/vmi.h>
#include <asm/genapic.h>
#include <linux/mc146818rtc.h>
@@ -68,22 +67,6 @@
#include <mach_wakecpu.h>
#include <smpboot_hooks.h>
-/*
- * FIXME: For x86_64, those are defined in other files. But moving them here,
- * would make the setup areas dependent on smp, which is a loss. When we
- * integrate apic between arches, we can probably do a better job, but
- * right now, they'll stay here -- glommer
- */
-
-/* which logical CPU number maps to which CPU (physical APIC ID) */
-u16 x86_cpu_to_apicid_init[NR_CPUS] __initdata =
- { [0 ... NR_CPUS-1] = BAD_APICID };
-void *x86_cpu_to_apicid_early_ptr;
-
-u16 x86_bios_cpu_apicid_init[NR_CPUS] __initdata
- = { [0 ... NR_CPUS-1] = BAD_APICID };
-void *x86_bios_cpu_apicid_early_ptr;
-
#ifdef CONFIG_X86_32
u8 apicid_2_node[MAX_APICID];
static int low_mappings;
@@ -198,13 +181,12 @@ static void map_cpu_to_logical_apicid(void)
map_cpu_to_node(cpu, node);
}
-static void unmap_cpu_to_logical_apicid(int cpu)
+void numa_remove_cpu(int cpu)
{
cpu_2_logical_apicid[cpu] = BAD_APICID;
unmap_cpu_to_node(cpu);
}
#else
-#define unmap_cpu_to_logical_apicid(cpu) do {} while (0)
#define map_cpu_to_logical_apicid() do {} while (0)
#endif
@@ -345,19 +327,12 @@ static void __cpuinit start_secondary(void *unused)
* lock helps us to not include this cpu in a currently in progress
* smp_call_function().
*/
- lock_ipi_call_lock();
-#ifdef CONFIG_X86_64
- spin_lock(&vector_lock);
-
- /* Setup the per cpu irq handling data structures */
- __setup_vector_irq(smp_processor_id());
- /*
- * Allow the master to continue.
- */
- spin_unlock(&vector_lock);
+ ipi_call_lock_irq();
+#ifdef CONFIG_X86_IO_APIC
+ setup_vector_irq(smp_processor_id());
#endif
cpu_set(smp_processor_id(), cpu_online_map);
- unlock_ipi_call_lock();
+ ipi_call_unlock_irq();
per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
setup_secondary_clock();
@@ -366,31 +341,8 @@ static void __cpuinit start_secondary(void *unused)
cpu_idle();
}
-#ifdef CONFIG_X86_32
-/*
- * Everything has been set up for the secondary
- * CPUs - they just need to reload everything
- * from the task structure
- * This function must not return.
- */
-void __devinit initialize_secondary(void)
-{
- /*
- * We don't actually need to load the full TSS,
- * basically just the stack pointer and the ip.
- */
-
- asm volatile(
- "movl %0,%%esp\n\t"
- "jmp *%1"
- :
- :"m" (current->thread.sp), "m" (current->thread.ip));
-}
-#endif
-
static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_X86_32
/*
* Mask B, Pentium, but not Pentium MMX
*/
@@ -440,7 +392,6 @@ static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c)
valid_k7:
;
-#endif
}
static void __cpuinit smp_checks(void)
@@ -555,23 +506,6 @@ cpumask_t cpu_coregroup_map(int cpu)
return c->llc_shared_map;
}
-#ifdef CONFIG_X86_32
-/*
- * We are called very early to get the low memory for the
- * SMP bootup trampoline page.
- */
-void __init smp_alloc_memory(void)
-{
- trampoline_base = alloc_bootmem_low_pages(PAGE_SIZE);
- /*
- * Has to be in very low memory so we can execute
- * real-mode AP code.
- */
- if (__pa(trampoline_base) >= 0x9F000)
- BUG();
-}
-#endif
-
static void impress_friends(void)
{
int cpu;
@@ -748,11 +682,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
* target processor state.
*/
startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
-#ifdef CONFIG_X86_64
- (unsigned long)init_rsp);
-#else
(unsigned long)stack_start.sp);
-#endif
/*
* Run STARTUP IPI loop.
@@ -832,6 +762,45 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
complete(&c_idle->done);
}
+#ifdef CONFIG_X86_64
+/*
+ * Allocate node local memory for the AP pda.
+ *
+ * Must be called after the _cpu_pda pointer table is initialized.
+ */
+static int __cpuinit get_local_pda(int cpu)
+{
+ struct x8664_pda *oldpda, *newpda;
+ unsigned long size = sizeof(struct x8664_pda);
+ int node = cpu_to_node(cpu);
+
+ if (cpu_pda(cpu) && !cpu_pda(cpu)->in_bootmem)
+ return 0;
+
+ oldpda = cpu_pda(cpu);
+ newpda = kmalloc_node(size, GFP_ATOMIC, node);
+ if (!newpda) {
+ printk(KERN_ERR "Could not allocate node local PDA "
+ "for CPU %d on node %d\n", cpu, node);
+
+ if (oldpda)
+ return 0; /* have a usable pda */
+ else
+ return -1;
+ }
+
+ if (oldpda) {
+ memcpy(newpda, oldpda, size);
+ if (!after_bootmem)
+ free_bootmem((unsigned long)oldpda, size);
+ }
+
+ newpda->in_bootmem = 0;
+ cpu_pda(cpu) = newpda;
+ return 0;
+}
+#endif /* CONFIG_X86_64 */
+
static int __cpuinit do_boot_cpu(int apicid, int cpu)
/*
* NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
@@ -848,28 +817,14 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
.done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
};
INIT_WORK(&c_idle.work, do_fork_idle);
-#ifdef CONFIG_X86_64
- /* allocate memory for gdts of secondary cpus. Hotplug is considered */
- if (!cpu_gdt_descr[cpu].address &&
- !(cpu_gdt_descr[cpu].address = get_zeroed_page(GFP_KERNEL))) {
- printk(KERN_ERR "Failed to allocate GDT for CPU %d\n", cpu);
- return -1;
- }
+#ifdef CONFIG_X86_64
/* Allocate node local memory for AP pdas */
- if (cpu_pda(cpu) == &boot_cpu_pda[cpu]) {
- struct x8664_pda *newpda, *pda;
- int node = cpu_to_node(cpu);
- pda = cpu_pda(cpu);
- newpda = kmalloc_node(sizeof(struct x8664_pda), GFP_ATOMIC,
- node);
- if (newpda) {
- memcpy(newpda, pda, sizeof(struct x8664_pda));
- cpu_pda(cpu) = newpda;
- } else
- printk(KERN_ERR
- "Could not allocate node local PDA for CPU %d on node %d\n",
- cpu, node);
+ if (cpu > 0) {
+ boot_error = get_local_pda(cpu);
+ if (boot_error)
+ goto restore_state;
+ /* if can't get pda memory, can't start cpu */
}
#endif
@@ -905,18 +860,15 @@ do_rest:
#ifdef CONFIG_X86_32
per_cpu(current_task, cpu) = c_idle.idle;
init_gdt(cpu);
- early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
- c_idle.idle->thread.ip = (unsigned long) start_secondary;
/* Stack for startup_32 can be just as for start_secondary onwards */
- stack_start.sp = (void *) c_idle.idle->thread.sp;
irq_ctx_init(cpu);
#else
cpu_pda(cpu)->pcurrent = c_idle.idle;
- init_rsp = c_idle.idle->thread.sp;
- load_sp0(&per_cpu(init_tss, cpu), &c_idle.idle->thread);
- initial_code = (unsigned long)start_secondary;
clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
#endif
+ early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
+ initial_code = (unsigned long)start_secondary;
+ stack_start.sp = (void *) c_idle.idle->thread.sp;
/* start_ip had better be page-aligned! */
start_ip = setup_trampoline();
@@ -987,16 +939,14 @@ do_rest:
inquire_remote_apic(apicid);
}
}
-
- if (boot_error) {
- /* Try to put things back the way they were before ... */
- unmap_cpu_to_logical_apicid(cpu);
#ifdef CONFIG_X86_64
- clear_node_cpumask(cpu); /* was set by numa_add_cpu */
+restore_state:
#endif
+ if (boot_error) {
+ /* Try to put things back the way they were before ... */
+ numa_remove_cpu(cpu); /* was set by numa_add_cpu */
cpu_clear(cpu, cpu_callout_map); /* was set by do_boot_cpu() */
cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */
- cpu_clear(cpu, cpu_possible_map);
cpu_clear(cpu, cpu_present_map);
per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID;
}
@@ -1088,14 +1038,12 @@ static __init void disable_smp(void)
{
cpu_present_map = cpumask_of_cpu(0);
cpu_possible_map = cpumask_of_cpu(0);
-#ifdef CONFIG_X86_32
smpboot_clear_io_apic_irqs();
-#endif
+
if (smp_found_config)
- phys_cpu_present_map =
- physid_mask_of_physid(boot_cpu_physical_apicid);
+ physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
else
- phys_cpu_present_map = physid_mask_of_physid(0);
+ physid_set_mask_of_physid(0, &phys_cpu_present_map);
map_cpu_to_logical_apicid();
cpu_set(0, per_cpu(cpu_sibling_map, 0));
cpu_set(0, per_cpu(cpu_core_map, 0));
@@ -1158,12 +1106,12 @@ static int __init smp_sanity_check(unsigned max_cpus)
* If SMP should be disabled, then really disable it!
*/
if (!max_cpus) {
- printk(KERN_INFO "SMP mode deactivated,"
- "forcing use of dummy APIC emulation.\n");
+ printk(KERN_INFO "SMP mode deactivated.\n");
smpboot_clear_io_apic();
-#ifdef CONFIG_X86_32
+
+ localise_nmi_watchdog();
+
connect_bsp_APIC();
-#endif
setup_local_APIC();
end_local_APIC_setup();
return -1;
@@ -1191,7 +1139,6 @@ static void __init smp_cpu_index_default(void)
void __init native_smp_prepare_cpus(unsigned int max_cpus)
{
preempt_disable();
- nmi_watchdog_default();
smp_cpu_index_default();
current_cpu_data = boot_cpu_data;
cpu_callin_map = cpumask_of_cpu(0);
@@ -1218,9 +1165,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
}
preempt_enable();
-#ifdef CONFIG_X86_32
connect_bsp_APIC();
-#endif
+
/*
* Switch from PIC to APIC mode.
*/
@@ -1258,8 +1204,8 @@ void __init native_smp_prepare_boot_cpu(void)
int me = smp_processor_id();
#ifdef CONFIG_X86_32
init_gdt(me);
- switch_to_new_gdt();
#endif
+ switch_to_new_gdt();
/* already set me in cpu_online_map in boot_cpu_init() */
cpu_set(me, cpu_callout_map);
per_cpu(cpu_state, me) = CPU_ONLINE;
@@ -1279,23 +1225,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
#ifdef CONFIG_HOTPLUG_CPU
-# ifdef CONFIG_X86_32
-void cpu_exit_clear(void)
-{
- int cpu = raw_smp_processor_id();
-
- idle_task_exit();
-
- cpu_uninit();
- irq_ctx_exit(cpu);
-
- cpu_clear(cpu, cpu_callout_map);
- cpu_clear(cpu, cpu_callin_map);
-
- unmap_cpu_to_logical_apicid(cpu);
-}
-# endif /* CONFIG_X86_32 */
-
static void remove_siblinginfo(int cpu)
{
int sibling;
@@ -1349,12 +1278,20 @@ __init void prefill_possible_map(void)
int i;
int possible;
+ /* no processor from mptable or madt */
+ if (!num_processors)
+ num_processors = 1;
+
+#ifdef CONFIG_HOTPLUG_CPU
if (additional_cpus == -1) {
if (disabled_cpus > 0)
additional_cpus = disabled_cpus;
else
additional_cpus = 0;
}
+#else
+ additional_cpus = 0;
+#endif
possible = num_processors + additional_cpus;
if (possible > NR_CPUS)
possible = NR_CPUS;
@@ -1364,18 +1301,18 @@ __init void prefill_possible_map(void)
for (i = 0; i < possible; i++)
cpu_set(i, cpu_possible_map);
+
+ nr_cpu_ids = possible;
}
static void __ref remove_cpu_from_maps(int cpu)
{
cpu_clear(cpu, cpu_online_map);
-#ifdef CONFIG_X86_64
cpu_clear(cpu, cpu_callout_map);
cpu_clear(cpu, cpu_callin_map);
/* was set by cpu_init() */
clear_bit(cpu, (unsigned long *)&cpu_initialized);
- clear_node_cpumask(cpu);
-#endif
+ numa_remove_cpu(cpu);
}
int __cpu_disable(void)
diff --git a/arch/x86/kernel/smpcommon.c b/arch/x86/kernel/smpcommon.c
index 3449064d141..99941b37eca 100644
--- a/arch/x86/kernel/smpcommon.c
+++ b/arch/x86/kernel/smpcommon.c
@@ -25,59 +25,3 @@ __cpuinit void init_gdt(int cpu)
per_cpu(cpu_number, cpu) = cpu;
}
#endif
-
-/**
- * smp_call_function(): Run a function on all other CPUs.
- * @func: The function to run. This must be fast and non-blocking.
- * @info: An arbitrary pointer to pass to the function.
- * @nonatomic: Unused.
- * @wait: If true, wait (atomically) until function has completed on other CPUs.
- *
- * Returns 0 on success, else a negative status code.
- *
- * If @wait is true, then returns once @func has returned; otherwise
- * it returns just before the target cpu calls @func.
- *
- * You must not call this function with disabled interrupts or from a
- * hardware interrupt handler or from a bottom half handler.
- */
-int smp_call_function(void (*func) (void *info), void *info, int nonatomic,
- int wait)
-{
- return smp_call_function_mask(cpu_online_map, func, info, wait);
-}
-EXPORT_SYMBOL(smp_call_function);
-
-/**
- * smp_call_function_single - Run a function on a specific CPU
- * @cpu: The target CPU. Cannot be the calling CPU.
- * @func: The function to run. This must be fast and non-blocking.
- * @info: An arbitrary pointer to pass to the function.
- * @nonatomic: Unused.
- * @wait: If true, wait until function has completed on other CPUs.
- *
- * Returns 0 on success, else a negative status code.
- *
- * If @wait is true, then returns once @func has returned; otherwise
- * it returns just before the target cpu calls @func.
- */
-int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
- int nonatomic, int wait)
-{
- /* prevent preemption and reschedule on another processor */
- int ret;
- int me = get_cpu();
- if (cpu == me) {
- local_irq_disable();
- func(info);
- local_irq_enable();
- put_cpu();
- return 0;
- }
-
- ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait);
-
- put_cpu();
- return ret;
-}
-EXPORT_SYMBOL(smp_call_function_single);
diff --git a/arch/x86/kernel/srat_32.c b/arch/x86/kernel/srat_32.c
deleted file mode 100644
index 70e4a374b4e..00000000000
--- a/arch/x86/kernel/srat_32.c
+++ /dev/null
@@ -1,358 +0,0 @@
-/*
- * Some of the code in this file has been gleaned from the 64 bit
- * discontigmem support code base.
- *
- * Copyright (C) 2002, IBM Corp.
- *
- * All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to Pat Gaughen <gone@us.ibm.com>
- */
-#include <linux/mm.h>
-#include <linux/bootmem.h>
-#include <linux/mmzone.h>
-#include <linux/acpi.h>
-#include <linux/nodemask.h>
-#include <asm/srat.h>
-#include <asm/topology.h>
-#include <asm/smp.h>
-
-/*
- * proximity macros and definitions
- */
-#define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */
-#define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */
-#define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit))
-#define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))
-/* bitmap length; _PXM is at most 255 */
-#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8)
-static u8 pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */
-
-#define MAX_CHUNKS_PER_NODE 3
-#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
-struct node_memory_chunk_s {
- unsigned long start_pfn;
- unsigned long end_pfn;
- u8 pxm; // proximity domain of node
- u8 nid; // which cnode contains this chunk?
- u8 bank; // which mem bank on this node
-};
-static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS];
-
-static int num_memory_chunks; /* total number of memory chunks */
-static u8 __initdata apicid_to_pxm[MAX_APICID];
-
-/* Identify CPU proximity domains */
-static void __init parse_cpu_affinity_structure(char *p)
-{
- struct acpi_srat_cpu_affinity *cpu_affinity =
- (struct acpi_srat_cpu_affinity *) p;
-
- if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0)
- return; /* empty entry */
-
- /* mark this node as "seen" in node bitmap */
- BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo);
-
- apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo;
-
- printk("CPU 0x%02X in proximity domain 0x%02X\n",
- cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo);
-}
-
-/*
- * Identify memory proximity domains and hot-remove capabilities.
- * Fill node memory chunk list structure.
- */
-static void __init parse_memory_affinity_structure (char *sratp)
-{
- unsigned long long paddr, size;
- unsigned long start_pfn, end_pfn;
- u8 pxm;
- struct node_memory_chunk_s *p, *q, *pend;
- struct acpi_srat_mem_affinity *memory_affinity =
- (struct acpi_srat_mem_affinity *) sratp;
-
- if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0)
- return; /* empty entry */
-
- pxm = memory_affinity->proximity_domain & 0xff;
-
- /* mark this node as "seen" in node bitmap */
- BMAP_SET(pxm_bitmap, pxm);
-
- /* calculate info for memory chunk structure */
- paddr = memory_affinity->base_address;
- size = memory_affinity->length;
-
- start_pfn = paddr >> PAGE_SHIFT;
- end_pfn = (paddr + size) >> PAGE_SHIFT;
-
-
- if (num_memory_chunks >= MAXCHUNKS) {
- printk("Too many mem chunks in SRAT. Ignoring %lld MBytes at %llx\n",
- size/(1024*1024), paddr);
- return;
- }
-
- /* Insertion sort based on base address */
- pend = &node_memory_chunk[num_memory_chunks];
- for (p = &node_memory_chunk[0]; p < pend; p++) {
- if (start_pfn < p->start_pfn)
- break;
- }
- if (p < pend) {
- for (q = pend; q >= p; q--)
- *(q + 1) = *q;
- }
- p->start_pfn = start_pfn;
- p->end_pfn = end_pfn;
- p->pxm = pxm;
-
- num_memory_chunks++;
-
- printk("Memory range 0x%lX to 0x%lX (type 0x%X) in proximity domain 0x%02X %s\n",
- start_pfn, end_pfn,
- memory_affinity->memory_type,
- pxm,
- ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ?
- "enabled and removable" : "enabled" ) );
-}
-
-/*
- * The SRAT table always lists ascending addresses, so can always
- * assume that the first "start" address that you see is the real
- * start of the node, and that the current "end" address is after
- * the previous one.
- */
-static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk)
-{
- /*
- * Only add present memory as told by the e820.
- * There is no guarantee from the SRAT that the memory it
- * enumerates is present at boot time because it represents
- * *possible* memory hotplug areas the same as normal RAM.
- */
- if (memory_chunk->start_pfn >= max_pfn) {
- printk (KERN_INFO "Ignoring SRAT pfns: 0x%08lx -> %08lx\n",
- memory_chunk->start_pfn, memory_chunk->end_pfn);
- return;
- }
- if (memory_chunk->nid != nid)
- return;
-
- if (!node_has_online_mem(nid))
- node_start_pfn[nid] = memory_chunk->start_pfn;
-
- if (node_start_pfn[nid] > memory_chunk->start_pfn)
- node_start_pfn[nid] = memory_chunk->start_pfn;
-
- if (node_end_pfn[nid] < memory_chunk->end_pfn)
- node_end_pfn[nid] = memory_chunk->end_pfn;
-}
-
-/* Parse the ACPI Static Resource Affinity Table */
-static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
-{
- u8 *start, *end, *p;
- int i, j, nid;
-
- start = (u8 *)(&(sratp->reserved) + 1); /* skip header */
- p = start;
- end = (u8 *)sratp + sratp->header.length;
-
- memset(pxm_bitmap, 0, sizeof(pxm_bitmap)); /* init proximity domain bitmap */
- memset(node_memory_chunk, 0, sizeof(node_memory_chunk));
-
- num_memory_chunks = 0;
- while (p < end) {
- switch (*p) {
- case ACPI_SRAT_TYPE_CPU_AFFINITY:
- parse_cpu_affinity_structure(p);
- break;
- case ACPI_SRAT_TYPE_MEMORY_AFFINITY:
- parse_memory_affinity_structure(p);
- break;
- default:
- printk("ACPI 2.0 SRAT: unknown entry skipped: type=0x%02X, len=%d\n", p[0], p[1]);
- break;
- }
- p += p[1];
- if (p[1] == 0) {
- printk("acpi20_parse_srat: Entry length value is zero;"
- " can't parse any further!\n");
- break;
- }
- }
-
- if (num_memory_chunks == 0) {
- printk("could not finy any ACPI SRAT memory areas.\n");
- goto out_fail;
- }
-
- /* Calculate total number of nodes in system from PXM bitmap and create
- * a set of sequential node IDs starting at zero. (ACPI doesn't seem
- * to specify the range of _PXM values.)
- */
- /*
- * MCD - we no longer HAVE to number nodes sequentially. PXM domain
- * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically
- * 32, so we will continue numbering them in this manner until MAX_NUMNODES
- * approaches MAX_PXM_DOMAINS for i386.
- */
- nodes_clear(node_online_map);
- for (i = 0; i < MAX_PXM_DOMAINS; i++) {
- if (BMAP_TEST(pxm_bitmap, i)) {
- int nid = acpi_map_pxm_to_node(i);
- node_set_online(nid);
- }
- }
- BUG_ON(num_online_nodes() == 0);
-
- /* set cnode id in memory chunk structure */
- for (i = 0; i < num_memory_chunks; i++)
- node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm);
-
- printk("pxm bitmap: ");
- for (i = 0; i < sizeof(pxm_bitmap); i++) {
- printk("%02X ", pxm_bitmap[i]);
- }
- printk("\n");
- printk("Number of logical nodes in system = %d\n", num_online_nodes());
- printk("Number of memory chunks in system = %d\n", num_memory_chunks);
-
- for (i = 0; i < MAX_APICID; i++)
- apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]);
-
- for (j = 0; j < num_memory_chunks; j++){
- struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
- printk("chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
- j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
- node_read_chunk(chunk->nid, chunk);
- add_active_range(chunk->nid, chunk->start_pfn, chunk->end_pfn);
- }
-
- for_each_online_node(nid) {
- unsigned long start = node_start_pfn[nid];
- unsigned long end = node_end_pfn[nid];
-
- memory_present(nid, start, end);
- node_remap_size[nid] = node_memmap_size_bytes(nid, start, end);
- }
- return 1;
-out_fail:
- return 0;
-}
-
-struct acpi_static_rsdt {
- struct acpi_table_rsdt table;
- u32 padding[7]; /* Allow for 7 more table entries */
-};
-
-int __init get_memcfg_from_srat(void)
-{
- struct acpi_table_header *header = NULL;
- struct acpi_table_rsdp *rsdp = NULL;
- struct acpi_table_rsdt *rsdt = NULL;
- acpi_native_uint rsdp_address = 0;
- struct acpi_static_rsdt saved_rsdt;
- int tables = 0;
- int i = 0;
-
- rsdp_address = acpi_os_get_root_pointer();
- if (!rsdp_address) {
- printk("%s: System description tables not found\n",
- __func__);
- goto out_err;
- }
-
- printk("%s: assigning address to rsdp\n", __func__);
- rsdp = (struct acpi_table_rsdp *)(u32)rsdp_address;
- if (!rsdp) {
- printk("%s: Didn't find ACPI root!\n", __func__);
- goto out_err;
- }
-
- printk(KERN_INFO "%.8s v%d [%.6s]\n", rsdp->signature, rsdp->revision,
- rsdp->oem_id);
-
- if (strncmp(rsdp->signature, ACPI_SIG_RSDP,strlen(ACPI_SIG_RSDP))) {
- printk(KERN_WARNING "%s: RSDP table signature incorrect\n", __func__);
- goto out_err;
- }
-
- rsdt = (struct acpi_table_rsdt *)
- early_ioremap(rsdp->rsdt_physical_address, sizeof(struct acpi_table_rsdt));
-
- if (!rsdt) {
- printk(KERN_WARNING
- "%s: ACPI: Invalid root system description tables (RSDT)\n",
- __func__);
- goto out_err;
- }
-
- header = &rsdt->header;
-
- if (strncmp(header->signature, ACPI_SIG_RSDT, strlen(ACPI_SIG_RSDT))) {
- printk(KERN_WARNING "ACPI: RSDT signature incorrect\n");
- goto out_err;
- }
-
- /*
- * The number of tables is computed by taking the
- * size of all entries (header size minus total
- * size of RSDT) divided by the size of each entry
- * (4-byte table pointers).
- */
- tables = (header->length - sizeof(struct acpi_table_header)) / 4;
-
- if (!tables)
- goto out_err;
-
- memcpy(&saved_rsdt, rsdt, sizeof(saved_rsdt));
-
- if (saved_rsdt.table.header.length > sizeof(saved_rsdt)) {
- printk(KERN_WARNING "ACPI: Too big length in RSDT: %d\n",
- saved_rsdt.table.header.length);
- goto out_err;
- }
-
- printk("Begin SRAT table scan....\n");
-
- for (i = 0; i < tables; i++) {
- /* Map in header, then map in full table length. */
- header = (struct acpi_table_header *)
- early_ioremap(saved_rsdt.table.table_offset_entry[i], sizeof(struct acpi_table_header));
- if (!header)
- break;
- header = (struct acpi_table_header *)
- early_ioremap(saved_rsdt.table.table_offset_entry[i], header->length);
- if (!header)
- break;
-
- if (strncmp((char *) &header->signature, ACPI_SIG_SRAT, 4))
- continue;
-
- /* we've found the srat table. don't need to look at any more tables */
- return acpi20_parse_srat((struct acpi_table_srat *)header);
- }
-out_err:
- remove_all_active_ranges();
- printk("failed to get NUMA memory information from SRAT table\n");
- return 0;
-}
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index c28c342c162..a03e7f6d90c 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -74,6 +74,7 @@ void save_stack_trace(struct stack_trace *trace)
if (trace->nr_entries < trace->max_entries)
trace->entries[trace->nr_entries++] = ULONG_MAX;
}
+EXPORT_SYMBOL_GPL(save_stack_trace);
void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
{
@@ -81,3 +82,4 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
if (trace->nr_entries < trace->max_entries)
trace->entries[trace->nr_entries++] = ULONG_MAX;
}
+EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
diff --git a/arch/x86/kernel/summit_32.c b/arch/x86/kernel/summit_32.c
index ae751094eba..d67ce5f044b 100644
--- a/arch/x86/kernel/summit_32.c
+++ b/arch/x86/kernel/summit_32.c
@@ -36,7 +36,9 @@ static struct rio_table_hdr *rio_table_hdr __initdata;
static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata;
static struct rio_detail *rio_devs[MAX_NUMNODES*4] __initdata;
+#ifndef CONFIG_X86_NUMAQ
static int mp_bus_id_to_node[MAX_MP_BUSSES] __initdata;
+#endif
static int __init setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
{
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
index d2ab52cc1d6..7066cb855a6 100644
--- a/arch/x86/kernel/sys_i386_32.c
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -19,8 +19,8 @@
#include <linux/utsname.h>
#include <linux/ipc.h>
-#include <asm/uaccess.h>
-#include <asm/unistd.h>
+#include <linux/uaccess.h>
+#include <linux/unistd.h>
asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
unsigned long prot, unsigned long flags,
@@ -103,7 +103,7 @@ asmlinkage int old_select(struct sel_arg_struct __user *arg)
*
* This is really horribly ugly.
*/
-asmlinkage int sys_ipc (uint call, int first, int second,
+asmlinkage int sys_ipc(uint call, int first, int second,
int third, void __user *ptr, long fifth)
{
int version, ret;
@@ -113,24 +113,24 @@ asmlinkage int sys_ipc (uint call, int first, int second,
switch (call) {
case SEMOP:
- return sys_semtimedop (first, (struct sembuf __user *)ptr, second, NULL);
+ return sys_semtimedop(first, (struct sembuf __user *)ptr, second, NULL);
case SEMTIMEDOP:
return sys_semtimedop(first, (struct sembuf __user *)ptr, second,
(const struct timespec __user *)fifth);
case SEMGET:
- return sys_semget (first, second, third);
+ return sys_semget(first, second, third);
case SEMCTL: {
union semun fourth;
if (!ptr)
return -EINVAL;
if (get_user(fourth.__pad, (void __user * __user *) ptr))
return -EFAULT;
- return sys_semctl (first, second, third, fourth);
+ return sys_semctl(first, second, third, fourth);
}
case MSGSND:
- return sys_msgsnd (first, (struct msgbuf __user *) ptr,
+ return sys_msgsnd(first, (struct msgbuf __user *) ptr,
second, third);
case MSGRCV:
switch (version) {
@@ -138,45 +138,45 @@ asmlinkage int sys_ipc (uint call, int first, int second,
struct ipc_kludge tmp;
if (!ptr)
return -EINVAL;
-
+
if (copy_from_user(&tmp,
- (struct ipc_kludge __user *) ptr,
- sizeof (tmp)))
+ (struct ipc_kludge __user *) ptr,
+ sizeof(tmp)))
return -EFAULT;
- return sys_msgrcv (first, tmp.msgp, second,
+ return sys_msgrcv(first, tmp.msgp, second,
tmp.msgtyp, third);
}
default:
- return sys_msgrcv (first,
+ return sys_msgrcv(first,
(struct msgbuf __user *) ptr,
second, fifth, third);
}
case MSGGET:
- return sys_msgget ((key_t) first, second);
+ return sys_msgget((key_t) first, second);
case MSGCTL:
- return sys_msgctl (first, second, (struct msqid_ds __user *) ptr);
+ return sys_msgctl(first, second, (struct msqid_ds __user *) ptr);
case SHMAT:
switch (version) {
default: {
ulong raddr;
- ret = do_shmat (first, (char __user *) ptr, second, &raddr);
+ ret = do_shmat(first, (char __user *) ptr, second, &raddr);
if (ret)
return ret;
- return put_user (raddr, (ulong __user *) third);
+ return put_user(raddr, (ulong __user *) third);
}
case 1: /* iBCS2 emulator entry point */
if (!segment_eq(get_fs(), get_ds()))
return -EINVAL;
/* The "(ulong *) third" is valid _only_ because of the kernel segment thing */
- return do_shmat (first, (char __user *) ptr, second, (ulong *) third);
+ return do_shmat(first, (char __user *) ptr, second, (ulong *) third);
}
- case SHMDT:
- return sys_shmdt ((char __user *)ptr);
+ case SHMDT:
+ return sys_shmdt((char __user *)ptr);
case SHMGET:
- return sys_shmget (first, second, third);
+ return sys_shmget(first, second, third);
case SHMCTL:
- return sys_shmctl (first, second,
+ return sys_shmctl(first, second,
(struct shmid_ds __user *) ptr);
default:
return -ENOSYS;
@@ -186,28 +186,28 @@ asmlinkage int sys_ipc (uint call, int first, int second,
/*
* Old cruft
*/
-asmlinkage int sys_uname(struct old_utsname __user * name)
+asmlinkage int sys_uname(struct old_utsname __user *name)
{
int err;
if (!name)
return -EFAULT;
down_read(&uts_sem);
- err = copy_to_user(name, utsname(), sizeof (*name));
+ err = copy_to_user(name, utsname(), sizeof(*name));
up_read(&uts_sem);
- return err?-EFAULT:0;
+ return err? -EFAULT:0;
}
-asmlinkage int sys_olduname(struct oldold_utsname __user * name)
+asmlinkage int sys_olduname(struct oldold_utsname __user *name)
{
int error;
if (!name)
return -EFAULT;
- if (!access_ok(VERIFY_WRITE,name,sizeof(struct oldold_utsname)))
+ if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname)))
return -EFAULT;
-
- down_read(&uts_sem);
-
+
+ down_read(&uts_sem);
+
error = __copy_to_user(&name->sysname, &utsname()->sysname,
__OLD_UTS_LEN);
error |= __put_user(0, name->sysname + __OLD_UTS_LEN);
@@ -223,9 +223,9 @@ asmlinkage int sys_olduname(struct oldold_utsname __user * name)
error |= __copy_to_user(&name->machine, &utsname()->machine,
__OLD_UTS_LEN);
error |= __put_user(0, name->machine + __OLD_UTS_LEN);
-
+
up_read(&uts_sem);
-
+
error = error ? -EFAULT : 0;
return error;
@@ -241,6 +241,6 @@ int kernel_execve(const char *filename, char *const argv[], char *const envp[])
long __res;
asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx"
: "=a" (__res)
- : "0" (__NR_execve),"ri" (filename),"c" (argv), "d" (envp) : "memory");
+ : "0" (__NR_execve), "ri" (filename), "c" (argv), "d" (envp) : "memory");
return __res;
}
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
index 2ff21f39893..059ca6ee59b 100644
--- a/arch/x86/kernel/time_32.c
+++ b/arch/x86/kernel/time_32.c
@@ -39,9 +39,6 @@
#include "do_timer.h"
-unsigned int cpu_khz; /* Detected as we calibrate the TSC */
-EXPORT_SYMBOL(cpu_khz);
-
int timer_ack;
unsigned long profile_pc(struct pt_regs *regs)
@@ -84,8 +81,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id)
if (timer_ack) {
/*
* Subtle, when I/O APICs are used we have to ack timer IRQ
- * manually to reset the IRR bit for do_slow_gettimeoffset().
- * This will also deassert NMI lines for the watchdog if run
+ * manually to deassert NMI lines for the watchdog if run
* on an 82489DX-based system.
*/
spin_lock(&i8259A_lock);
diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c
index c737849e2ef..e3d49c553af 100644
--- a/arch/x86/kernel/time_64.c
+++ b/arch/x86/kernel/time_64.c
@@ -56,7 +56,7 @@ static irqreturn_t timer_event_interrupt(int irq, void *dev_id)
/* calibrate_cpu is used on systems with fixed rate TSCs to determine
* processor frequency */
#define TICK_COUNT 100000000
-unsigned long __init native_calculate_cpu_khz(void)
+unsigned long __init calibrate_cpu(void)
{
int tsc_start, tsc_now;
int i, no_ctr_free;
@@ -116,23 +116,11 @@ void __init hpet_time_init(void)
void __init time_init(void)
{
- tsc_calibrate();
-
- cpu_khz = tsc_khz;
- if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&
- (boot_cpu_data.x86_vendor == X86_VENDOR_AMD))
- cpu_khz = calculate_cpu_khz();
-
- if (unsynchronized_tsc())
- mark_tsc_unstable("TSCs unsynchronized");
-
+ tsc_init();
if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
vgetcpu_mode = VGETCPU_RDTSCP;
else
vgetcpu_mode = VGETCPU_LSL;
- printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n",
- cpu_khz / 1000, cpu_khz % 1000);
- init_tsc_clocksource();
late_time_init = choose_time_init();
}
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
index 9bb2363851a..fec1ecedc9b 100644
--- a/arch/x86/kernel/tlb_32.c
+++ b/arch/x86/kernel/tlb_32.c
@@ -238,6 +238,6 @@ static void do_flush_tlb_all(void *info)
void flush_tlb_all(void)
{
- on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
+ on_each_cpu(do_flush_tlb_all, NULL, 1);
}
diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c
index a1f07d79320..dcbf7a1159e 100644
--- a/arch/x86/kernel/tlb_64.c
+++ b/arch/x86/kernel/tlb_64.c
@@ -15,6 +15,8 @@
#include <asm/proto.h>
#include <asm/apicdef.h>
#include <asm/idle.h>
+#include <asm/uv/uv_hub.h>
+#include <asm/uv/uv_bau.h>
#include <mach_ipi.h>
/*
@@ -162,6 +164,9 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
union smp_flush_state *f;
cpumask_t cpumask = *cpumaskp;
+ if (is_uv_system() && uv_flush_tlb_others(&cpumask, mm, va))
+ return;
+
/* Caller has disabled preemption */
sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
f = &per_cpu(flush_state, sender);
@@ -270,5 +275,5 @@ static void do_flush_tlb_all(void *info)
void flush_tlb_all(void)
{
- on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
+ on_each_cpu(do_flush_tlb_all, NULL, 1);
}
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
new file mode 100644
index 00000000000..d0fbb7712ab
--- /dev/null
+++ b/arch/x86/kernel/tlb_uv.c
@@ -0,0 +1,792 @@
+/*
+ * SGI UltraViolet TLB flush routines.
+ *
+ * (c) 2008 Cliff Wickman <cpw@sgi.com>, SGI.
+ *
+ * This code is released under the GNU General Public License version 2 or
+ * later.
+ */
+#include <linux/mc146818rtc.h>
+#include <linux/proc_fs.h>
+#include <linux/kernel.h>
+
+#include <asm/mmu_context.h>
+#include <asm/uv/uv_mmrs.h>
+#include <asm/uv/uv_hub.h>
+#include <asm/uv/uv_bau.h>
+#include <asm/genapic.h>
+#include <asm/idle.h>
+#include <asm/tsc.h>
+
+#include <mach_apic.h>
+
+static struct bau_control **uv_bau_table_bases __read_mostly;
+static int uv_bau_retry_limit __read_mostly;
+
+/* position of pnode (which is nasid>>1): */
+static int uv_nshift __read_mostly;
+
+static unsigned long uv_mmask __read_mostly;
+
+static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
+static DEFINE_PER_CPU(struct bau_control, bau_control);
+
+/*
+ * Free a software acknowledge hardware resource by clearing its Pending
+ * bit. This will return a reply to the sender.
+ * If the message has timed out, a reply has already been sent by the
+ * hardware but the resource has not been released. In that case our
+ * clear of the Timeout bit (as well) will free the resource. No reply will
+ * be sent (the hardware will only do one reply per message).
+ */
+static void uv_reply_to_message(int resource,
+ struct bau_payload_queue_entry *msg,
+ struct bau_msg_status *msp)
+{
+ unsigned long dw;
+
+ dw = (1 << (resource + UV_SW_ACK_NPENDING)) | (1 << resource);
+ msg->replied_to = 1;
+ msg->sw_ack_vector = 0;
+ if (msp)
+ msp->seen_by.bits = 0;
+ uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw);
+}
+
+/*
+ * Do all the things a cpu should do for a TLB shootdown message.
+ * Other cpu's may come here at the same time for this message.
+ */
+static void uv_bau_process_message(struct bau_payload_queue_entry *msg,
+ int msg_slot, int sw_ack_slot)
+{
+ unsigned long this_cpu_mask;
+ struct bau_msg_status *msp;
+ int cpu;
+
+ msp = __get_cpu_var(bau_control).msg_statuses + msg_slot;
+ cpu = uv_blade_processor_id();
+ msg->number_of_cpus =
+ uv_blade_nr_online_cpus(uv_node_to_blade_id(numa_node_id()));
+ this_cpu_mask = 1UL << cpu;
+ if (msp->seen_by.bits & this_cpu_mask)
+ return;
+ atomic_or_long(&msp->seen_by.bits, this_cpu_mask);
+
+ if (msg->replied_to == 1)
+ return;
+
+ if (msg->address == TLB_FLUSH_ALL) {
+ local_flush_tlb();
+ __get_cpu_var(ptcstats).alltlb++;
+ } else {
+ __flush_tlb_one(msg->address);
+ __get_cpu_var(ptcstats).onetlb++;
+ }
+
+ __get_cpu_var(ptcstats).requestee++;
+
+ atomic_inc_short(&msg->acknowledge_count);
+ if (msg->number_of_cpus == msg->acknowledge_count)
+ uv_reply_to_message(sw_ack_slot, msg, msp);
+}
+
+/*
+ * Examine the payload queue on one distribution node to see
+ * which messages have not been seen, and which cpu(s) have not seen them.
+ *
+ * Returns the number of cpu's that have not responded.
+ */
+static int uv_examine_destination(struct bau_control *bau_tablesp, int sender)
+{
+ struct bau_payload_queue_entry *msg;
+ struct bau_msg_status *msp;
+ int count = 0;
+ int i;
+ int j;
+
+ for (msg = bau_tablesp->va_queue_first, i = 0; i < DEST_Q_SIZE;
+ msg++, i++) {
+ if ((msg->sending_cpu == sender) && (!msg->replied_to)) {
+ msp = bau_tablesp->msg_statuses + i;
+ printk(KERN_DEBUG
+ "blade %d: address:%#lx %d of %d, not cpu(s): ",
+ i, msg->address, msg->acknowledge_count,
+ msg->number_of_cpus);
+ for (j = 0; j < msg->number_of_cpus; j++) {
+ if (!((1L << j) & msp->seen_by.bits)) {
+ count++;
+ printk("%d ", j);
+ }
+ }
+ printk("\n");
+ }
+ }
+ return count;
+}
+
+/*
+ * Examine the payload queue on all the distribution nodes to see
+ * which messages have not been seen, and which cpu(s) have not seen them.
+ *
+ * Returns the number of cpu's that have not responded.
+ */
+static int uv_examine_destinations(struct bau_target_nodemask *distribution)
+{
+ int sender;
+ int i;
+ int count = 0;
+
+ sender = smp_processor_id();
+ for (i = 0; i < sizeof(struct bau_target_nodemask) * BITSPERBYTE; i++) {
+ if (!bau_node_isset(i, distribution))
+ continue;
+ count += uv_examine_destination(uv_bau_table_bases[i], sender);
+ }
+ return count;
+}
+
+/*
+ * wait for completion of a broadcast message
+ *
+ * return COMPLETE, RETRY or GIVEUP
+ */
+static int uv_wait_completion(struct bau_desc *bau_desc,
+ unsigned long mmr_offset, int right_shift)
+{
+ int exams = 0;
+ long destination_timeouts = 0;
+ long source_timeouts = 0;
+ unsigned long descriptor_status;
+
+ while ((descriptor_status = (((unsigned long)
+ uv_read_local_mmr(mmr_offset) >>
+ right_shift) & UV_ACT_STATUS_MASK)) !=
+ DESC_STATUS_IDLE) {
+ if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) {
+ source_timeouts++;
+ if (source_timeouts > SOURCE_TIMEOUT_LIMIT)
+ source_timeouts = 0;
+ __get_cpu_var(ptcstats).s_retry++;
+ return FLUSH_RETRY;
+ }
+ /*
+ * spin here looking for progress at the destinations
+ */
+ if (descriptor_status == DESC_STATUS_DESTINATION_TIMEOUT) {
+ destination_timeouts++;
+ if (destination_timeouts > DESTINATION_TIMEOUT_LIMIT) {
+ /*
+ * returns number of cpus not responding
+ */
+ if (uv_examine_destinations
+ (&bau_desc->distribution) == 0) {
+ __get_cpu_var(ptcstats).d_retry++;
+ return FLUSH_RETRY;
+ }
+ exams++;
+ if (exams >= uv_bau_retry_limit) {
+ printk(KERN_DEBUG
+ "uv_flush_tlb_others");
+ printk("giving up on cpu %d\n",
+ smp_processor_id());
+ return FLUSH_GIVEUP;
+ }
+ /*
+ * delays can hang the simulator
+ udelay(1000);
+ */
+ destination_timeouts = 0;
+ }
+ }
+ }
+ return FLUSH_COMPLETE;
+}
+
+/**
+ * uv_flush_send_and_wait
+ *
+ * Send a broadcast and wait for a broadcast message to complete.
+ *
+ * The cpumaskp mask contains the cpus the broadcast was sent to.
+ *
+ * Returns 1 if all remote flushing was done. The mask is zeroed.
+ * Returns 0 if some remote flushing remains to be done. The mask is left
+ * unchanged.
+ */
+int uv_flush_send_and_wait(int cpu, int this_blade, struct bau_desc *bau_desc,
+ cpumask_t *cpumaskp)
+{
+ int completion_status = 0;
+ int right_shift;
+ int tries = 0;
+ int blade;
+ int bit;
+ unsigned long mmr_offset;
+ unsigned long index;
+ cycles_t time1;
+ cycles_t time2;
+
+ if (cpu < UV_CPUS_PER_ACT_STATUS) {
+ mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
+ right_shift = cpu * UV_ACT_STATUS_SIZE;
+ } else {
+ mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
+ right_shift =
+ ((cpu - UV_CPUS_PER_ACT_STATUS) * UV_ACT_STATUS_SIZE);
+ }
+ time1 = get_cycles();
+ do {
+ tries++;
+ index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) |
+ cpu;
+ uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
+ completion_status = uv_wait_completion(bau_desc, mmr_offset,
+ right_shift);
+ } while (completion_status == FLUSH_RETRY);
+ time2 = get_cycles();
+ __get_cpu_var(ptcstats).sflush += (time2 - time1);
+ if (tries > 1)
+ __get_cpu_var(ptcstats).retriesok++;
+
+ if (completion_status == FLUSH_GIVEUP) {
+ /*
+ * Cause the caller to do an IPI-style TLB shootdown on
+ * the cpu's, all of which are still in the mask.
+ */
+ __get_cpu_var(ptcstats).ptc_i++;
+ return 0;
+ }
+
+ /*
+ * Success, so clear the remote cpu's from the mask so we don't
+ * use the IPI method of shootdown on them.
+ */
+ for_each_cpu_mask(bit, *cpumaskp) {
+ blade = uv_cpu_to_blade_id(bit);
+ if (blade == this_blade)
+ continue;
+ cpu_clear(bit, *cpumaskp);
+ }
+ if (!cpus_empty(*cpumaskp))
+ return 0;
+ return 1;
+}
+
+/**
+ * uv_flush_tlb_others - globally purge translation cache of a virtual
+ * address or all TLB's
+ * @cpumaskp: mask of all cpu's in which the address is to be removed
+ * @mm: mm_struct containing virtual address range
+ * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu)
+ *
+ * This is the entry point for initiating any UV global TLB shootdown.
+ *
+ * Purges the translation caches of all specified processors of the given
+ * virtual address, or purges all TLB's on specified processors.
+ *
+ * The caller has derived the cpumaskp from the mm_struct and has subtracted
+ * the local cpu from the mask. This function is called only if there
+ * are bits set in the mask. (e.g. flush_tlb_page())
+ *
+ * The cpumaskp is converted into a nodemask of the nodes containing
+ * the cpus.
+ *
+ * Returns 1 if all remote flushing was done.
+ * Returns 0 if some remote flushing remains to be done.
+ */
+int uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm,
+ unsigned long va)
+{
+ int i;
+ int bit;
+ int blade;
+ int cpu;
+ int this_blade;
+ int locals = 0;
+ struct bau_desc *bau_desc;
+
+ cpu = uv_blade_processor_id();
+ this_blade = uv_numa_blade_id();
+ bau_desc = __get_cpu_var(bau_control).descriptor_base;
+ bau_desc += UV_ITEMS_PER_DESCRIPTOR * cpu;
+
+ bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
+
+ i = 0;
+ for_each_cpu_mask(bit, *cpumaskp) {
+ blade = uv_cpu_to_blade_id(bit);
+ BUG_ON(blade > (UV_DISTRIBUTION_SIZE - 1));
+ if (blade == this_blade) {
+ locals++;
+ continue;
+ }
+ bau_node_set(blade, &bau_desc->distribution);
+ i++;
+ }
+ if (i == 0) {
+ /*
+ * no off_node flushing; return status for local node
+ */
+ if (locals)
+ return 0;
+ else
+ return 1;
+ }
+ __get_cpu_var(ptcstats).requestor++;
+ __get_cpu_var(ptcstats).ntargeted += i;
+
+ bau_desc->payload.address = va;
+ bau_desc->payload.sending_cpu = smp_processor_id();
+
+ return uv_flush_send_and_wait(cpu, this_blade, bau_desc, cpumaskp);
+}
+
+/*
+ * The BAU message interrupt comes here. (registered by set_intr_gate)
+ * See entry_64.S
+ *
+ * We received a broadcast assist message.
+ *
+ * Interrupts may have been disabled; this interrupt could represent
+ * the receipt of several messages.
+ *
+ * All cores/threads on this node get this interrupt.
+ * The last one to see it does the s/w ack.
+ * (the resource will not be freed until noninterruptable cpus see this
+ * interrupt; hardware will timeout the s/w ack and reply ERROR)
+ */
+void uv_bau_message_interrupt(struct pt_regs *regs)
+{
+ struct bau_payload_queue_entry *va_queue_first;
+ struct bau_payload_queue_entry *va_queue_last;
+ struct bau_payload_queue_entry *msg;
+ struct pt_regs *old_regs = set_irq_regs(regs);
+ cycles_t time1;
+ cycles_t time2;
+ int msg_slot;
+ int sw_ack_slot;
+ int fw;
+ int count = 0;
+ unsigned long local_pnode;
+
+ ack_APIC_irq();
+ exit_idle();
+ irq_enter();
+
+ time1 = get_cycles();
+
+ local_pnode = uv_blade_to_pnode(uv_numa_blade_id());
+
+ va_queue_first = __get_cpu_var(bau_control).va_queue_first;
+ va_queue_last = __get_cpu_var(bau_control).va_queue_last;
+
+ msg = __get_cpu_var(bau_control).bau_msg_head;
+ while (msg->sw_ack_vector) {
+ count++;
+ fw = msg->sw_ack_vector;
+ msg_slot = msg - va_queue_first;
+ sw_ack_slot = ffs(fw) - 1;
+
+ uv_bau_process_message(msg, msg_slot, sw_ack_slot);
+
+ msg++;
+ if (msg > va_queue_last)
+ msg = va_queue_first;
+ __get_cpu_var(bau_control).bau_msg_head = msg;
+ }
+ if (!count)
+ __get_cpu_var(ptcstats).nomsg++;
+ else if (count > 1)
+ __get_cpu_var(ptcstats).multmsg++;
+
+ time2 = get_cycles();
+ __get_cpu_var(ptcstats).dflush += (time2 - time1);
+
+ irq_exit();
+ set_irq_regs(old_regs);
+}
+
+static void uv_enable_timeouts(void)
+{
+ int i;
+ int blade;
+ int last_blade;
+ int pnode;
+ int cur_cpu = 0;
+ unsigned long apicid;
+
+ last_blade = -1;
+ for_each_online_node(i) {
+ blade = uv_node_to_blade_id(i);
+ if (blade == last_blade)
+ continue;
+ last_blade = blade;
+ apicid = per_cpu(x86_cpu_to_apicid, cur_cpu);
+ pnode = uv_blade_to_pnode(blade);
+ cur_cpu += uv_blade_nr_possible_cpus(i);
+ }
+}
+
+static void *uv_ptc_seq_start(struct seq_file *file, loff_t *offset)
+{
+ if (*offset < num_possible_cpus())
+ return offset;
+ return NULL;
+}
+
+static void *uv_ptc_seq_next(struct seq_file *file, void *data, loff_t *offset)
+{
+ (*offset)++;
+ if (*offset < num_possible_cpus())
+ return offset;
+ return NULL;
+}
+
+static void uv_ptc_seq_stop(struct seq_file *file, void *data)
+{
+}
+
+/*
+ * Display the statistics thru /proc
+ * data points to the cpu number
+ */
+static int uv_ptc_seq_show(struct seq_file *file, void *data)
+{
+ struct ptc_stats *stat;
+ int cpu;
+
+ cpu = *(loff_t *)data;
+
+ if (!cpu) {
+ seq_printf(file,
+ "# cpu requestor requestee one all sretry dretry ptc_i ");
+ seq_printf(file,
+ "sw_ack sflush dflush sok dnomsg dmult starget\n");
+ }
+ if (cpu < num_possible_cpus() && cpu_online(cpu)) {
+ stat = &per_cpu(ptcstats, cpu);
+ seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld ",
+ cpu, stat->requestor,
+ stat->requestee, stat->onetlb, stat->alltlb,
+ stat->s_retry, stat->d_retry, stat->ptc_i);
+ seq_printf(file, "%lx %ld %ld %ld %ld %ld %ld\n",
+ uv_read_global_mmr64(uv_blade_to_pnode
+ (uv_cpu_to_blade_id(cpu)),
+ UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE),
+ stat->sflush, stat->dflush,
+ stat->retriesok, stat->nomsg,
+ stat->multmsg, stat->ntargeted);
+ }
+
+ return 0;
+}
+
+/*
+ * 0: display meaning of the statistics
+ * >0: retry limit
+ */
+static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
+ size_t count, loff_t *data)
+{
+ long newmode;
+ char optstr[64];
+
+ if (count == 0 || count > sizeof(optstr))
+ return -EINVAL;
+ if (copy_from_user(optstr, user, count))
+ return -EFAULT;
+ optstr[count - 1] = '\0';
+ if (strict_strtoul(optstr, 10, &newmode) < 0) {
+ printk(KERN_DEBUG "%s is invalid\n", optstr);
+ return -EINVAL;
+ }
+
+ if (newmode == 0) {
+ printk(KERN_DEBUG "# cpu: cpu number\n");
+ printk(KERN_DEBUG
+ "requestor: times this cpu was the flush requestor\n");
+ printk(KERN_DEBUG
+ "requestee: times this cpu was requested to flush its TLBs\n");
+ printk(KERN_DEBUG
+ "one: times requested to flush a single address\n");
+ printk(KERN_DEBUG
+ "all: times requested to flush all TLB's\n");
+ printk(KERN_DEBUG
+ "sretry: number of retries of source-side timeouts\n");
+ printk(KERN_DEBUG
+ "dretry: number of retries of destination-side timeouts\n");
+ printk(KERN_DEBUG
+ "ptc_i: times UV fell through to IPI-style flushes\n");
+ printk(KERN_DEBUG
+ "sw_ack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n");
+ printk(KERN_DEBUG
+ "sflush_us: cycles spent in uv_flush_tlb_others()\n");
+ printk(KERN_DEBUG
+ "dflush_us: cycles spent in handling flush requests\n");
+ printk(KERN_DEBUG "sok: successes on retry\n");
+ printk(KERN_DEBUG "dnomsg: interrupts with no message\n");
+ printk(KERN_DEBUG
+ "dmult: interrupts with multiple messages\n");
+ printk(KERN_DEBUG "starget: nodes targeted\n");
+ } else {
+ uv_bau_retry_limit = newmode;
+ printk(KERN_DEBUG "timeout retry limit:%d\n",
+ uv_bau_retry_limit);
+ }
+
+ return count;
+}
+
+static const struct seq_operations uv_ptc_seq_ops = {
+ .start = uv_ptc_seq_start,
+ .next = uv_ptc_seq_next,
+ .stop = uv_ptc_seq_stop,
+ .show = uv_ptc_seq_show
+};
+
+static int uv_ptc_proc_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &uv_ptc_seq_ops);
+}
+
+static const struct file_operations proc_uv_ptc_operations = {
+ .open = uv_ptc_proc_open,
+ .read = seq_read,
+ .write = uv_ptc_proc_write,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int __init uv_ptc_init(void)
+{
+ struct proc_dir_entry *proc_uv_ptc;
+
+ if (!is_uv_system())
+ return 0;
+
+ if (!proc_mkdir("sgi_uv", NULL))
+ return -EINVAL;
+
+ proc_uv_ptc = create_proc_entry(UV_PTC_BASENAME, 0444, NULL);
+ if (!proc_uv_ptc) {
+ printk(KERN_ERR "unable to create %s proc entry\n",
+ UV_PTC_BASENAME);
+ remove_proc_entry("sgi_uv", NULL);
+ return -EINVAL;
+ }
+ proc_uv_ptc->proc_fops = &proc_uv_ptc_operations;
+ return 0;
+}
+
+/*
+ * begin the initialization of the per-blade control structures
+ */
+static struct bau_control * __init uv_table_bases_init(int blade, int node)
+{
+ int i;
+ int *ip;
+ struct bau_msg_status *msp;
+ struct bau_control *bau_tabp;
+
+ bau_tabp =
+ kmalloc_node(sizeof(struct bau_control), GFP_KERNEL, node);
+ BUG_ON(!bau_tabp);
+
+ bau_tabp->msg_statuses =
+ kmalloc_node(sizeof(struct bau_msg_status) *
+ DEST_Q_SIZE, GFP_KERNEL, node);
+ BUG_ON(!bau_tabp->msg_statuses);
+
+ for (i = 0, msp = bau_tabp->msg_statuses; i < DEST_Q_SIZE; i++, msp++)
+ bau_cpubits_clear(&msp->seen_by, (int)
+ uv_blade_nr_possible_cpus(blade));
+
+ bau_tabp->watching =
+ kmalloc_node(sizeof(int) * DEST_NUM_RESOURCES, GFP_KERNEL, node);
+ BUG_ON(!bau_tabp->watching);
+
+ for (i = 0, ip = bau_tabp->watching; i < DEST_Q_SIZE; i++, ip++)
+ *ip = 0;
+
+ uv_bau_table_bases[blade] = bau_tabp;
+
+ return bau_tabp;
+}
+
+/*
+ * finish the initialization of the per-blade control structures
+ */
+static void __init
+uv_table_bases_finish(int blade, int node, int cur_cpu,
+ struct bau_control *bau_tablesp,
+ struct bau_desc *adp)
+{
+ struct bau_control *bcp;
+ int i;
+
+ for (i = cur_cpu; i < cur_cpu + uv_blade_nr_possible_cpus(blade); i++) {
+ bcp = (struct bau_control *)&per_cpu(bau_control, i);
+
+ bcp->bau_msg_head = bau_tablesp->va_queue_first;
+ bcp->va_queue_first = bau_tablesp->va_queue_first;
+ bcp->va_queue_last = bau_tablesp->va_queue_last;
+ bcp->watching = bau_tablesp->watching;
+ bcp->msg_statuses = bau_tablesp->msg_statuses;
+ bcp->descriptor_base = adp;
+ }
+}
+
+/*
+ * initialize the sending side's sending buffers
+ */
+static struct bau_desc * __init
+uv_activation_descriptor_init(int node, int pnode)
+{
+ int i;
+ unsigned long pa;
+ unsigned long m;
+ unsigned long n;
+ unsigned long mmr_image;
+ struct bau_desc *adp;
+ struct bau_desc *ad2;
+
+ adp = (struct bau_desc *)
+ kmalloc_node(16384, GFP_KERNEL, node);
+ BUG_ON(!adp);
+
+ pa = __pa((unsigned long)adp);
+ n = pa >> uv_nshift;
+ m = pa & uv_mmask;
+
+ mmr_image = uv_read_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE);
+ if (mmr_image) {
+ uv_write_global_mmr64(pnode, (unsigned long)
+ UVH_LB_BAU_SB_DESCRIPTOR_BASE,
+ (n << UV_DESC_BASE_PNODE_SHIFT | m));
+ }
+
+ for (i = 0, ad2 = adp; i < UV_ACTIVATION_DESCRIPTOR_SIZE; i++, ad2++) {
+ memset(ad2, 0, sizeof(struct bau_desc));
+ ad2->header.sw_ack_flag = 1;
+ ad2->header.base_dest_nodeid =
+ uv_blade_to_pnode(uv_cpu_to_blade_id(0));
+ ad2->header.command = UV_NET_ENDPOINT_INTD;
+ ad2->header.int_both = 1;
+ /*
+ * all others need to be set to zero:
+ * fairness chaining multilevel count replied_to
+ */
+ }
+ return adp;
+}
+
+/*
+ * initialize the destination side's receiving buffers
+ */
+static struct bau_payload_queue_entry * __init
+uv_payload_queue_init(int node, int pnode, struct bau_control *bau_tablesp)
+{
+ struct bau_payload_queue_entry *pqp;
+ char *cp;
+
+ pqp = (struct bau_payload_queue_entry *) kmalloc_node(
+ (DEST_Q_SIZE + 1) * sizeof(struct bau_payload_queue_entry),
+ GFP_KERNEL, node);
+ BUG_ON(!pqp);
+
+ cp = (char *)pqp + 31;
+ pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5);
+ bau_tablesp->va_queue_first = pqp;
+ uv_write_global_mmr64(pnode,
+ UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST,
+ ((unsigned long)pnode <<
+ UV_PAYLOADQ_PNODE_SHIFT) |
+ uv_physnodeaddr(pqp));
+ uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL,
+ uv_physnodeaddr(pqp));
+ bau_tablesp->va_queue_last = pqp + (DEST_Q_SIZE - 1);
+ uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST,
+ (unsigned long)
+ uv_physnodeaddr(bau_tablesp->va_queue_last));
+ memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE);
+
+ return pqp;
+}
+
+/*
+ * Initialization of each UV blade's structures
+ */
+static int __init uv_init_blade(int blade, int node, int cur_cpu)
+{
+ int pnode;
+ unsigned long pa;
+ unsigned long apicid;
+ struct bau_desc *adp;
+ struct bau_payload_queue_entry *pqp;
+ struct bau_control *bau_tablesp;
+
+ bau_tablesp = uv_table_bases_init(blade, node);
+ pnode = uv_blade_to_pnode(blade);
+ adp = uv_activation_descriptor_init(node, pnode);
+ pqp = uv_payload_queue_init(node, pnode, bau_tablesp);
+ uv_table_bases_finish(blade, node, cur_cpu, bau_tablesp, adp);
+ /*
+ * the below initialization can't be in firmware because the
+ * messaging IRQ will be determined by the OS
+ */
+ apicid = per_cpu(x86_cpu_to_apicid, cur_cpu);
+ pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG);
+ if ((pa & 0xff) != UV_BAU_MESSAGE) {
+ uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
+ ((apicid << 32) | UV_BAU_MESSAGE));
+ }
+ return 0;
+}
+
+/*
+ * Initialization of BAU-related structures
+ */
+static int __init uv_bau_init(void)
+{
+ int blade;
+ int node;
+ int nblades;
+ int last_blade;
+ int cur_cpu = 0;
+
+ if (!is_uv_system())
+ return 0;
+
+ uv_bau_retry_limit = 1;
+ uv_nshift = uv_hub_info->n_val;
+ uv_mmask = (1UL << uv_hub_info->n_val) - 1;
+ nblades = 0;
+ last_blade = -1;
+ for_each_online_node(node) {
+ blade = uv_node_to_blade_id(node);
+ if (blade == last_blade)
+ continue;
+ last_blade = blade;
+ nblades++;
+ }
+ uv_bau_table_bases = (struct bau_control **)
+ kmalloc(nblades * sizeof(struct bau_control *), GFP_KERNEL);
+ BUG_ON(!uv_bau_table_bases);
+
+ last_blade = -1;
+ for_each_online_node(node) {
+ blade = uv_node_to_blade_id(node);
+ if (blade == last_blade)
+ continue;
+ last_blade = blade;
+ uv_init_blade(blade, node, cur_cpu);
+ cur_cpu += uv_blade_nr_possible_cpus(blade);
+ }
+ set_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1);
+ uv_enable_timeouts();
+
+ return 0;
+}
+__initcall(uv_bau_init);
+__initcall(uv_ptc_init);
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index abbf199adeb..1106fac6024 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -2,7 +2,7 @@
#include <asm/trampoline.h>
-/* ready for x86_64, no harm for x86, since it will overwrite after alloc */
+/* ready for x86_64 and x86 */
unsigned char *trampoline_base = __va(TRAMPOLINE_BASE);
/*
diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c
index 08d752de4ee..8a768973c4f 100644
--- a/arch/x86/kernel/traps_32.c
+++ b/arch/x86/kernel/traps_32.c
@@ -1,5 +1,6 @@
/*
* Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
*
* Pentium III FXSR, SSE support
* Gareth Hughes <gareth@valinux.com>, May 2000
@@ -60,8 +61,6 @@
#include "mach_traps.h"
-int panic_on_unrecovered_nmi;
-
DECLARE_BITMAP(used_vectors, NR_VECTORS);
EXPORT_SYMBOL_GPL(used_vectors);
@@ -98,19 +97,22 @@ asmlinkage void alignment_check(void);
asmlinkage void spurious_interrupt_bug(void);
asmlinkage void machine_check(void);
+int panic_on_unrecovered_nmi;
int kstack_depth_to_print = 24;
static unsigned int code_bytes = 64;
+static int ignore_nmis;
+static int die_counter;
void printk_address(unsigned long address, int reliable)
{
#ifdef CONFIG_KALLSYMS
- char namebuf[KSYM_NAME_LEN];
unsigned long offset = 0;
unsigned long symsize;
const char *symname;
- char reliab[4] = "";
- char *delim = ":";
char *modname;
+ char *delim = ":";
+ char namebuf[KSYM_NAME_LEN];
+ char reliab[4] = "";
symname = kallsyms_lookup(address, &symsize, &offset,
&modname, namebuf);
@@ -130,22 +132,23 @@ void printk_address(unsigned long address, int reliable)
#endif
}
-static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size)
+static inline int valid_stack_ptr(struct thread_info *tinfo,
+ void *p, unsigned int size)
{
- return p > (void *)tinfo &&
- p <= (void *)tinfo + THREAD_SIZE - size;
+ void *t = tinfo;
+ return p > t && p <= t + THREAD_SIZE - size;
}
/* The form of the top of the frame on the stack */
struct stack_frame {
- struct stack_frame *next_frame;
- unsigned long return_address;
+ struct stack_frame *next_frame;
+ unsigned long return_address;
};
static inline unsigned long
print_context_stack(struct thread_info *tinfo,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data)
+ unsigned long *stack, unsigned long bp,
+ const struct stacktrace_ops *ops, void *data)
{
struct stack_frame *frame = (struct stack_frame *)bp;
@@ -167,8 +170,6 @@ print_context_stack(struct thread_info *tinfo,
return bp;
}
-#define MSG(msg) ops->warning(data, msg)
-
void dump_trace(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, unsigned long bp,
const struct stacktrace_ops *ops, void *data)
@@ -178,7 +179,6 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
if (!stack) {
unsigned long dummy;
-
stack = &dummy;
if (task != current)
stack = (unsigned long *)task->thread.sp;
@@ -196,7 +196,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
}
#endif
- while (1) {
+ for (;;) {
struct thread_info *context;
context = (struct thread_info *)
@@ -248,10 +248,10 @@ static void print_trace_address(void *data, unsigned long addr, int reliable)
}
static const struct stacktrace_ops print_trace_ops = {
- .warning = print_trace_warning,
- .warning_symbol = print_trace_warning_symbol,
- .stack = print_trace_stack,
- .address = print_trace_address,
+ .warning = print_trace_warning,
+ .warning_symbol = print_trace_warning_symbol,
+ .stack = print_trace_stack,
+ .address = print_trace_address,
};
static void
@@ -351,15 +351,14 @@ void show_registers(struct pt_regs *regs)
printk(KERN_EMERG "Code: ");
ip = (u8 *)regs->ip - code_prologue;
- if (ip < (u8 *)PAGE_OFFSET ||
- probe_kernel_address(ip, c)) {
+ if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
/* try starting at EIP */
ip = (u8 *)regs->ip;
code_len = code_len - code_prologue + 1;
}
for (i = 0; i < code_len; i++, ip++) {
if (ip < (u8 *)PAGE_OFFSET ||
- probe_kernel_address(ip, c)) {
+ probe_kernel_address(ip, c)) {
printk(" Bad EIP value.");
break;
}
@@ -384,8 +383,6 @@ int is_valid_bugaddr(unsigned long ip)
return ud2 == 0x0b0f;
}
-static int die_counter;
-
int __kprobes __die(const char *str, struct pt_regs *regs, long err)
{
unsigned short ss;
@@ -402,26 +399,22 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
printk("DEBUG_PAGEALLOC");
#endif
printk("\n");
-
if (notify_die(DIE_OOPS, str, regs, err,
- current->thread.trap_no, SIGSEGV) != NOTIFY_STOP) {
-
- show_registers(regs);
- /* Executive summary in case the oops scrolled away */
- sp = (unsigned long) (&regs->sp);
- savesegment(ss, ss);
- if (user_mode(regs)) {
- sp = regs->sp;
- ss = regs->ss & 0xffff;
- }
- printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
- print_symbol("%s", regs->ip);
- printk(" SS:ESP %04x:%08lx\n", ss, sp);
+ current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
+ return 1;
- return 0;
+ show_registers(regs);
+ /* Executive summary in case the oops scrolled away */
+ sp = (unsigned long) (&regs->sp);
+ savesegment(ss, ss);
+ if (user_mode(regs)) {
+ sp = regs->sp;
+ ss = regs->ss & 0xffff;
}
-
- return 1;
+ printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
+ print_symbol("%s", regs->ip);
+ printk(" SS:ESP %04x:%08lx\n", ss, sp);
+ return 0;
}
/*
@@ -546,7 +539,7 @@ void do_##name(struct pt_regs *regs, long error_code) \
{ \
trace_hardirqs_fixup(); \
if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
- == NOTIFY_STOP) \
+ == NOTIFY_STOP) \
return; \
do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
}
@@ -562,7 +555,7 @@ void do_##name(struct pt_regs *regs, long error_code) \
info.si_code = sicode; \
info.si_addr = (void __user *)siaddr; \
if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
- == NOTIFY_STOP) \
+ == NOTIFY_STOP) \
return; \
do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
}
@@ -571,7 +564,7 @@ void do_##name(struct pt_regs *regs, long error_code) \
void do_##name(struct pt_regs *regs, long error_code) \
{ \
if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
- == NOTIFY_STOP) \
+ == NOTIFY_STOP) \
return; \
do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
}
@@ -586,27 +579,29 @@ void do_##name(struct pt_regs *regs, long error_code) \
info.si_addr = (void __user *)siaddr; \
trace_hardirqs_fixup(); \
if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
- == NOTIFY_STOP) \
+ == NOTIFY_STOP) \
return; \
do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
}
-DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
+DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
#ifndef CONFIG_KPROBES
DO_VM86_ERROR(3, SIGTRAP, "int3", int3)
#endif
DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow)
DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds)
-DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
-DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
+DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
+DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
-DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
-DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
+DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
+DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1)
-void __kprobes do_general_protection(struct pt_regs *regs, long error_code)
+void __kprobes
+do_general_protection(struct pt_regs *regs, long error_code)
{
+ struct task_struct *tsk;
struct thread_struct *thread;
struct tss_struct *tss;
int cpu;
@@ -647,23 +642,24 @@ void __kprobes do_general_protection(struct pt_regs *regs, long error_code)
if (regs->flags & X86_VM_MASK)
goto gp_in_vm86;
+ tsk = current;
if (!user_mode(regs))
goto gp_in_kernel;
- current->thread.error_code = error_code;
- current->thread.trap_no = 13;
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_no = 13;
- if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) &&
- printk_ratelimit()) {
+ if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+ printk_ratelimit()) {
printk(KERN_INFO
- "%s[%d] general protection ip:%lx sp:%lx error:%lx",
- current->comm, task_pid_nr(current),
- regs->ip, regs->sp, error_code);
+ "%s[%d] general protection ip:%lx sp:%lx error:%lx",
+ tsk->comm, task_pid_nr(tsk),
+ regs->ip, regs->sp, error_code);
print_vma_addr(" in ", regs->ip);
printk("\n");
}
- force_sig(SIGSEGV, current);
+ force_sig(SIGSEGV, tsk);
return;
gp_in_vm86:
@@ -672,14 +668,15 @@ gp_in_vm86:
return;
gp_in_kernel:
- if (!fixup_exception(regs)) {
- current->thread.error_code = error_code;
- current->thread.trap_no = 13;
- if (notify_die(DIE_GPF, "general protection fault", regs,
+ if (fixup_exception(regs))
+ return;
+
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_no = 13;
+ if (notify_die(DIE_GPF, "general protection fault", regs,
error_code, 13, SIGSEGV) == NOTIFY_STOP)
- return;
- die("general protection fault", regs, error_code);
- }
+ return;
+ die("general protection fault", regs, error_code);
}
static notrace __kprobes void
@@ -756,9 +753,9 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
static DEFINE_SPINLOCK(nmi_print_lock);
-void notrace __kprobes die_nmi(struct pt_regs *regs, const char *msg)
+void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
{
- if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) == NOTIFY_STOP)
+ if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
return;
spin_lock(&nmi_print_lock);
@@ -767,10 +764,12 @@ void notrace __kprobes die_nmi(struct pt_regs *regs, const char *msg)
* to get a message out:
*/
bust_spinlocks(1);
- printk(KERN_EMERG "%s", msg);
+ printk(KERN_EMERG "%s", str);
printk(" on CPU%d, ip %08lx, registers:\n",
smp_processor_id(), regs->ip);
show_registers(regs);
+ if (do_panic)
+ panic("Non maskable interrupt");
console_silent();
spin_unlock(&nmi_print_lock);
bust_spinlocks(0);
@@ -790,14 +789,17 @@ void notrace __kprobes die_nmi(struct pt_regs *regs, const char *msg)
static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
{
unsigned char reason = 0;
+ int cpu;
+
+ cpu = smp_processor_id();
- /* Only the BSP gets external NMIs from the system: */
- if (!smp_processor_id())
+ /* Only the BSP gets external NMIs from the system. */
+ if (!cpu)
reason = get_nmi_reason();
if (!(reason & 0xc0)) {
if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
- == NOTIFY_STOP)
+ == NOTIFY_STOP)
return;
#ifdef CONFIG_X86_LOCAL_APIC
/*
@@ -806,7 +808,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
*/
if (nmi_watchdog_tick(regs, reason))
return;
- if (!do_nmi_callback(regs, smp_processor_id()))
+ if (!do_nmi_callback(regs, cpu))
unknown_nmi_error(reason, regs);
#else
unknown_nmi_error(reason, regs);
@@ -816,6 +818,8 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
}
if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
return;
+
+ /* AK: following checks seem to be broken on modern chipsets. FIXME */
if (reason & 0x80)
mem_parity_error(reason, regs);
if (reason & 0x40)
@@ -827,8 +831,6 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
reassert_nmi();
}
-static int ignore_nmis;
-
notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code)
{
int cpu;
@@ -913,7 +915,7 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code)
tsk->thread.debugctlmsr = 0;
if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
- SIGTRAP) == NOTIFY_STOP)
+ SIGTRAP) == NOTIFY_STOP)
return;
/* It's safe to allow irq's after DR6 has been saved */
if (regs->flags & X86_EFLAGS_IF)
@@ -974,9 +976,8 @@ clear_TF_reenable:
void math_error(void __user *ip)
{
struct task_struct *task;
- unsigned short cwd;
- unsigned short swd;
siginfo_t info;
+ unsigned short cwd, swd;
/*
* Save the info for the exception handler and clear the error.
@@ -995,7 +996,7 @@ void math_error(void __user *ip)
* C1 reg you need in case of a stack fault, 0x040 is the stack
* fault bit. We should only be taking one exception at a time,
* so if this combination doesn't produce any single exception,
- * then we have a bad program that isn't syncronizing its FPU usage
+ * then we have a bad program that isn't synchronizing its FPU usage
* and it will suffer the consequences since we won't be able to
* fully reproduce the context of the exception
*/
@@ -1004,7 +1005,7 @@ void math_error(void __user *ip)
switch (swd & ~cwd & 0x3f) {
case 0x000: /* No unmasked exception */
return;
- default: /* Multiple exceptions */
+ default: /* Multiple exceptions */
break;
case 0x001: /* Invalid Op */
/*
@@ -1040,8 +1041,8 @@ void do_coprocessor_error(struct pt_regs *regs, long error_code)
static void simd_math_error(void __user *ip)
{
struct task_struct *task;
- unsigned short mxcsr;
siginfo_t info;
+ unsigned short mxcsr;
/*
* Save the info for the exception handler and clear the error.
@@ -1117,7 +1118,7 @@ void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
{
- struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
+ struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id());
unsigned long base = (kesp - uesp) & -THREAD_SIZE;
unsigned long new_kesp = kesp - base;
unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
@@ -1196,19 +1197,16 @@ void __init trap_init(void)
early_iounmap(p, 4);
#endif
-#ifdef CONFIG_X86_LOCAL_APIC
- init_apic_mappings();
-#endif
- set_trap_gate(0, &divide_error);
- set_intr_gate(1, &debug);
- set_intr_gate(2, &nmi);
- set_system_intr_gate(3, &int3); /* int3/4 can be called from all */
- set_system_gate(4, &overflow);
- set_trap_gate(5, &bounds);
- set_trap_gate(6, &invalid_op);
- set_trap_gate(7, &device_not_available);
- set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS);
- set_trap_gate(9, &coprocessor_segment_overrun);
+ set_trap_gate(0, &divide_error);
+ set_intr_gate(1, &debug);
+ set_intr_gate(2, &nmi);
+ set_system_intr_gate(3, &int3); /* int3 can be called from all */
+ set_system_gate(4, &overflow); /* int4 can be called from all */
+ set_trap_gate(5, &bounds);
+ set_trap_gate(6, &invalid_op);
+ set_trap_gate(7, &device_not_available);
+ set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS);
+ set_trap_gate(9, &coprocessor_segment_overrun);
set_trap_gate(10, &invalid_TSS);
set_trap_gate(11, &segment_not_present);
set_trap_gate(12, &stack_segment);
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
index adff76ea97c..2696a683778 100644
--- a/arch/x86/kernel/traps_64.c
+++ b/arch/x86/kernel/traps_64.c
@@ -10,49 +10,49 @@
* 'Traps.c' handles hardware traps and faults after we have saved some
* state in 'entry.S'.
*/
-#include <linux/sched.h>
+#include <linux/moduleparam.h>
+#include <linux/interrupt.h>
+#include <linux/kallsyms.h>
+#include <linux/spinlock.h>
+#include <linux/kprobes.h>
+#include <linux/uaccess.h>
+#include <linux/utsname.h>
+#include <linux/kdebug.h>
#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/ptrace.h>
#include <linux/string.h>
+#include <linux/unwind.h>
+#include <linux/delay.h>
#include <linux/errno.h>
-#include <linux/ptrace.h>
+#include <linux/kexec.h>
+#include <linux/sched.h>
#include <linux/timer.h>
-#include <linux/mm.h>
#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/spinlock.h>
-#include <linux/interrupt.h>
-#include <linux/kallsyms.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/nmi.h>
-#include <linux/kprobes.h>
-#include <linux/kexec.h>
-#include <linux/unwind.h>
-#include <linux/uaccess.h>
#include <linux/bug.h>
-#include <linux/kdebug.h>
-#include <linux/utsname.h>
-
-#include <mach_traps.h>
+#include <linux/nmi.h>
+#include <linux/mm.h>
#if defined(CONFIG_EDAC)
#include <linux/edac.h>
#endif
-#include <asm/system.h>
-#include <asm/io.h>
-#include <asm/atomic.h>
+#include <asm/stacktrace.h>
+#include <asm/processor.h>
#include <asm/debugreg.h>
+#include <asm/atomic.h>
+#include <asm/system.h>
+#include <asm/unwind.h>
#include <asm/desc.h>
#include <asm/i387.h>
-#include <asm/processor.h>
-#include <asm/unwind.h>
+#include <asm/nmi.h>
#include <asm/smp.h>
+#include <asm/io.h>
#include <asm/pgalloc.h>
-#include <asm/pda.h>
#include <asm/proto.h>
-#include <asm/nmi.h>
-#include <asm/stacktrace.h>
+#include <asm/pda.h>
+
+#include <mach_traps.h>
asmlinkage void divide_error(void);
asmlinkage void debug(void);
@@ -71,12 +71,15 @@ asmlinkage void general_protection(void);
asmlinkage void page_fault(void);
asmlinkage void coprocessor_error(void);
asmlinkage void simd_coprocessor_error(void);
-asmlinkage void reserved(void);
asmlinkage void alignment_check(void);
-asmlinkage void machine_check(void);
asmlinkage void spurious_interrupt_bug(void);
+asmlinkage void machine_check(void);
+int panic_on_unrecovered_nmi;
+int kstack_depth_to_print = 12;
static unsigned int code_bytes = 64;
+static int ignore_nmis;
+static int die_counter;
static inline void conditional_sti(struct pt_regs *regs)
{
@@ -100,34 +103,9 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
dec_preempt_count();
}
-int kstack_depth_to_print = 12;
-
void printk_address(unsigned long address, int reliable)
{
-#ifdef CONFIG_KALLSYMS
- unsigned long offset = 0, symsize;
- const char *symname;
- char *modname;
- char *delim = ":";
- char namebuf[KSYM_NAME_LEN];
- char reliab[4] = "";
-
- symname = kallsyms_lookup(address, &symsize, &offset,
- &modname, namebuf);
- if (!symname) {
- printk(" [<%016lx>]\n", address);
- return;
- }
- if (!reliable)
- strcpy(reliab, "? ");
-
- if (!modname)
- modname = delim = "";
- printk(" [<%016lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
- address, reliab, delim, modname, delim, symname, offset, symsize);
-#else
- printk(" [<%016lx>]\n", address);
-#endif
+ printk(" [<%016lx>] %s%pS\n", address, reliable ? "": "? ", (void *) address);
}
static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
@@ -204,8 +182,6 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
return NULL;
}
-#define MSG(txt) ops->warning(data, txt)
-
/*
* x86-64 can have up to three kernel stacks:
* process stack
@@ -232,11 +208,11 @@ struct stack_frame {
unsigned long return_address;
};
-
-static inline unsigned long print_context_stack(struct thread_info *tinfo,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data,
- unsigned long *end)
+static inline unsigned long
+print_context_stack(struct thread_info *tinfo,
+ unsigned long *stack, unsigned long bp,
+ const struct stacktrace_ops *ops, void *data,
+ unsigned long *end)
{
struct stack_frame *frame = (struct stack_frame *)bp;
@@ -258,7 +234,7 @@ static inline unsigned long print_context_stack(struct thread_info *tinfo,
return bp;
}
-void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
+void dump_trace(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, unsigned long bp,
const struct stacktrace_ops *ops, void *data)
{
@@ -267,36 +243,34 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
unsigned used = 0;
struct thread_info *tinfo;
- if (!tsk)
- tsk = current;
- tinfo = task_thread_info(tsk);
+ if (!task)
+ task = current;
if (!stack) {
unsigned long dummy;
stack = &dummy;
- if (tsk && tsk != current)
- stack = (unsigned long *)tsk->thread.sp;
+ if (task && task != current)
+ stack = (unsigned long *)task->thread.sp;
}
#ifdef CONFIG_FRAME_POINTER
if (!bp) {
- if (tsk == current) {
+ if (task == current) {
/* Grab bp right from our regs */
- asm("movq %%rbp, %0" : "=r" (bp):);
+ asm("movq %%rbp, %0" : "=r" (bp) :);
} else {
/* bp is the last reg pushed by switch_to */
- bp = *(unsigned long *) tsk->thread.sp;
+ bp = *(unsigned long *) task->thread.sp;
}
}
#endif
-
-
/*
* Print function call entries in all stacks, starting at the
* current stack address. If the stacks consist of nested
* exceptions
*/
+ tinfo = task_thread_info(task);
for (;;) {
char *id;
unsigned long *estack_end;
@@ -381,18 +355,17 @@ static const struct stacktrace_ops print_trace_ops = {
.address = print_trace_address,
};
-void
-show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack,
- unsigned long bp)
+void show_trace(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, unsigned long bp)
{
printk("\nCall Trace:\n");
- dump_trace(tsk, regs, stack, bp, &print_trace_ops, NULL);
+ dump_trace(task, regs, stack, bp, &print_trace_ops, NULL);
printk("\n");
}
static void
-_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp,
- unsigned long bp)
+_show_stack(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *sp, unsigned long bp)
{
unsigned long *stack;
int i;
@@ -404,14 +377,14 @@ _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp,
// back trace for this cpu.
if (sp == NULL) {
- if (tsk)
- sp = (unsigned long *)tsk->thread.sp;
+ if (task)
+ sp = (unsigned long *)task->thread.sp;
else
sp = (unsigned long *)&sp;
}
stack = sp;
- for(i=0; i < kstack_depth_to_print; i++) {
+ for (i = 0; i < kstack_depth_to_print; i++) {
if (stack >= irqstack && stack <= irqstack_end) {
if (stack == irqstack_end) {
stack = (unsigned long *) (irqstack_end[-1]);
@@ -426,12 +399,12 @@ _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp,
printk(" %016lx", *stack++);
touch_nmi_watchdog();
}
- show_trace(tsk, regs, sp, bp);
+ show_trace(task, regs, sp, bp);
}
-void show_stack(struct task_struct *tsk, unsigned long * sp)
+void show_stack(struct task_struct *task, unsigned long *sp)
{
- _show_stack(tsk, NULL, sp, 0);
+ _show_stack(task, NULL, sp, 0);
}
/*
@@ -439,8 +412,8 @@ void show_stack(struct task_struct *tsk, unsigned long * sp)
*/
void dump_stack(void)
{
- unsigned long dummy;
unsigned long bp = 0;
+ unsigned long stack;
#ifdef CONFIG_FRAME_POINTER
if (!bp)
@@ -452,7 +425,7 @@ void dump_stack(void)
init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
init_utsname()->version);
- show_trace(NULL, NULL, &dummy, bp);
+ show_trace(NULL, NULL, &stack, bp);
}
EXPORT_SYMBOL(dump_stack);
@@ -463,12 +436,8 @@ void show_registers(struct pt_regs *regs)
unsigned long sp;
const int cpu = smp_processor_id();
struct task_struct *cur = cpu_pda(cpu)->pcurrent;
- u8 *ip;
- unsigned int code_prologue = code_bytes * 43 / 64;
- unsigned int code_len = code_bytes;
sp = regs->sp;
- ip = (u8 *) regs->ip - code_prologue;
printk("CPU %d ", cpu);
__show_regs(regs);
printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
@@ -479,15 +448,21 @@ void show_registers(struct pt_regs *regs)
* time of the fault..
*/
if (!user_mode(regs)) {
+ unsigned int code_prologue = code_bytes * 43 / 64;
+ unsigned int code_len = code_bytes;
unsigned char c;
+ u8 *ip;
+
printk("Stack: ");
_show_stack(NULL, regs, (unsigned long *)sp, regs->bp);
printk("\n");
printk(KERN_EMERG "Code: ");
+
+ ip = (u8 *)regs->ip - code_prologue;
if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
/* try starting at RIP */
- ip = (u8 *) regs->ip;
+ ip = (u8 *)regs->ip;
code_len = code_len - code_prologue + 1;
}
for (i = 0; i < code_len; i++, ip++) {
@@ -503,7 +478,7 @@ void show_registers(struct pt_regs *regs)
}
}
printk("\n");
-}
+}
int is_valid_bugaddr(unsigned long ip)
{
@@ -561,10 +536,9 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
do_exit(signr);
}
-int __kprobes __die(const char * str, struct pt_regs * regs, long err)
+int __kprobes __die(const char *str, struct pt_regs *regs, long err)
{
- static int die_counter;
- printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
+ printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff, ++die_counter);
#ifdef CONFIG_PREEMPT
printk("PREEMPT ");
#endif
@@ -575,8 +549,10 @@ int __kprobes __die(const char * str, struct pt_regs * regs, long err)
printk("DEBUG_PAGEALLOC");
#endif
printk("\n");
- if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
+ if (notify_die(DIE_OOPS, str, regs, err,
+ current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
return 1;
+
show_registers(regs);
add_taint(TAINT_DIE);
/* Executive summary in case the oops scrolled away */
@@ -588,7 +564,7 @@ int __kprobes __die(const char * str, struct pt_regs * regs, long err)
return 0;
}
-void die(const char * str, struct pt_regs * regs, long err)
+void die(const char *str, struct pt_regs *regs, long err)
{
unsigned long flags = oops_begin();
@@ -605,8 +581,7 @@ die_nmi(char *str, struct pt_regs *regs, int do_panic)
{
unsigned long flags;
- if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) ==
- NOTIFY_STOP)
+ if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
return;
flags = oops_begin();
@@ -614,7 +589,9 @@ die_nmi(char *str, struct pt_regs *regs, int do_panic)
* We are in trouble anyway, lets at least try
* to get a message out.
*/
- printk(str, smp_processor_id());
+ printk(KERN_EMERG "%s", str);
+ printk(" on CPU%d, ip %08lx, registers:\n",
+ smp_processor_id(), regs->ip);
show_registers(regs);
if (kexec_should_crash(current))
crash_kexec(regs);
@@ -626,44 +603,44 @@ die_nmi(char *str, struct pt_regs *regs, int do_panic)
do_exit(SIGBUS);
}
-static void __kprobes do_trap(int trapnr, int signr, char *str,
- struct pt_regs * regs, long error_code,
- siginfo_t *info)
+static void __kprobes
+do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
+ long error_code, siginfo_t *info)
{
struct task_struct *tsk = current;
- if (user_mode(regs)) {
- /*
- * We want error_code and trap_no set for userspace
- * faults and kernelspace faults which result in
- * die(), but not kernelspace faults which are fixed
- * up. die() gives the process no chance to handle
- * the signal and notice the kernel fault information,
- * so that won't result in polluting the information
- * about previously queued, but not yet delivered,
- * faults. See also do_general_protection below.
- */
- tsk->thread.error_code = error_code;
- tsk->thread.trap_no = trapnr;
-
- if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
- printk_ratelimit()) {
- printk(KERN_INFO
- "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
- tsk->comm, tsk->pid, str,
- regs->ip, regs->sp, error_code);
- print_vma_addr(" in ", regs->ip);
- printk("\n");
- }
+ if (!user_mode(regs))
+ goto kernel_trap;
- if (info)
- force_sig_info(signr, info, tsk);
- else
- force_sig(signr, tsk);
- return;
+ /*
+ * We want error_code and trap_no set for userspace faults and
+ * kernelspace faults which result in die(), but not
+ * kernelspace faults which are fixed up. die() gives the
+ * process no chance to handle the signal and notice the
+ * kernel fault information, so that won't result in polluting
+ * the information about previously queued, but not yet
+ * delivered, faults. See also do_general_protection below.
+ */
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_no = trapnr;
+
+ if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
+ printk_ratelimit()) {
+ printk(KERN_INFO
+ "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
+ tsk->comm, tsk->pid, str,
+ regs->ip, regs->sp, error_code);
+ print_vma_addr(" in ", regs->ip);
+ printk("\n");
}
+ if (info)
+ force_sig_info(signr, info, tsk);
+ else
+ force_sig(signr, tsk);
+ return;
+kernel_trap:
if (!fixup_exception(regs)) {
tsk->thread.error_code = error_code;
tsk->thread.trap_no = trapnr;
@@ -673,41 +650,39 @@ static void __kprobes do_trap(int trapnr, int signr, char *str,
}
#define DO_ERROR(trapnr, signr, str, name) \
-asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
-{ \
- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
- == NOTIFY_STOP) \
- return; \
+asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
+{ \
+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+ == NOTIFY_STOP) \
+ return; \
conditional_sti(regs); \
- do_trap(trapnr, signr, str, regs, error_code, NULL); \
+ do_trap(trapnr, signr, str, regs, error_code, NULL); \
}
-#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
-asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
-{ \
- siginfo_t info; \
- info.si_signo = signr; \
- info.si_errno = 0; \
- info.si_code = sicode; \
- info.si_addr = (void __user *)siaddr; \
- trace_hardirqs_fixup(); \
- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
- == NOTIFY_STOP) \
- return; \
+#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
+asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
+{ \
+ siginfo_t info; \
+ info.si_signo = signr; \
+ info.si_errno = 0; \
+ info.si_code = sicode; \
+ info.si_addr = (void __user *)siaddr; \
+ trace_hardirqs_fixup(); \
+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+ == NOTIFY_STOP) \
+ return; \
conditional_sti(regs); \
- do_trap(trapnr, signr, str, regs, error_code, &info); \
+ do_trap(trapnr, signr, str, regs, error_code, &info); \
}
-DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
-DO_ERROR( 4, SIGSEGV, "overflow", overflow)
-DO_ERROR( 5, SIGSEGV, "bounds", bounds)
-DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
-DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
-DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
+DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
+DO_ERROR(4, SIGSEGV, "overflow", overflow)
+DO_ERROR(5, SIGSEGV, "bounds", bounds)
+DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
+DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
-DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
+DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
-DO_ERROR(18, SIGSEGV, "reserved", reserved)
/* Runs on IST stack */
asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
@@ -737,31 +712,34 @@ asmlinkage void do_double_fault(struct pt_regs * regs, long error_code)
die(str, regs, error_code);
}
-asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
- long error_code)
+asmlinkage void __kprobes
+do_general_protection(struct pt_regs *regs, long error_code)
{
- struct task_struct *tsk = current;
+ struct task_struct *tsk;
conditional_sti(regs);
- if (user_mode(regs)) {
- tsk->thread.error_code = error_code;
- tsk->thread.trap_no = 13;
-
- if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
- printk_ratelimit()) {
- printk(KERN_INFO
- "%s[%d] general protection ip:%lx sp:%lx error:%lx",
- tsk->comm, tsk->pid,
- regs->ip, regs->sp, error_code);
- print_vma_addr(" in ", regs->ip);
- printk("\n");
- }
+ tsk = current;
+ if (!user_mode(regs))
+ goto gp_in_kernel;
- force_sig(SIGSEGV, tsk);
- return;
- }
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_no = 13;
+
+ if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+ printk_ratelimit()) {
+ printk(KERN_INFO
+ "%s[%d] general protection ip:%lx sp:%lx error:%lx",
+ tsk->comm, tsk->pid,
+ regs->ip, regs->sp, error_code);
+ print_vma_addr(" in ", regs->ip);
+ printk("\n");
+ }
+ force_sig(SIGSEGV, tsk);
+ return;
+
+gp_in_kernel:
if (fixup_exception(regs))
return;
@@ -774,14 +752,14 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
}
static notrace __kprobes void
-mem_parity_error(unsigned char reason, struct pt_regs * regs)
+mem_parity_error(unsigned char reason, struct pt_regs *regs)
{
printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
reason);
printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
#if defined(CONFIG_EDAC)
- if(edac_handler_set()) {
+ if (edac_handler_set()) {
edac_atomic_assert_error();
return;
}
@@ -798,7 +776,7 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs)
}
static notrace __kprobes void
-io_check_error(unsigned char reason, struct pt_regs * regs)
+io_check_error(unsigned char reason, struct pt_regs *regs)
{
printk("NMI: IOCK error (debug interrupt?)\n");
show_registers(regs);
@@ -828,14 +806,14 @@ unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
/* Runs on IST stack. This code must keep interrupts off all the time.
Nested NMIs are prevented by the CPU. */
-asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
+asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
{
unsigned char reason = 0;
int cpu;
cpu = smp_processor_id();
- /* Only the BSP gets external NMIs from the system. */
+ /* Only the BSP gets external NMIs from the system. */
if (!cpu)
reason = get_nmi_reason();
@@ -847,32 +825,57 @@ asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
* Ok, so this is none of the documented NMI sources,
* so it must be the NMI watchdog.
*/
- if (nmi_watchdog_tick(regs,reason))
+ if (nmi_watchdog_tick(regs, reason))
return;
- if (!do_nmi_callback(regs,cpu))
+ if (!do_nmi_callback(regs, cpu))
unknown_nmi_error(reason, regs);
return;
}
if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
- return;
+ return;
/* AK: following checks seem to be broken on modern chipsets. FIXME */
-
if (reason & 0x80)
mem_parity_error(reason, regs);
if (reason & 0x40)
io_check_error(reason, regs);
}
+asmlinkage notrace __kprobes void
+do_nmi(struct pt_regs *regs, long error_code)
+{
+ nmi_enter();
+
+ add_pda(__nmi_count, 1);
+
+ if (!ignore_nmis)
+ default_do_nmi(regs);
+
+ nmi_exit();
+}
+
+void stop_nmi(void)
+{
+ acpi_nmi_disable();
+ ignore_nmis++;
+}
+
+void restart_nmi(void)
+{
+ ignore_nmis--;
+ acpi_nmi_enable();
+}
+
/* runs on IST stack. */
-asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code)
+asmlinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
{
trace_hardirqs_fixup();
- if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) {
+ if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
+ == NOTIFY_STOP)
return;
- }
+
preempt_conditional_sti(regs);
do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
preempt_conditional_cli(regs);
@@ -903,8 +906,8 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
asmlinkage void __kprobes do_debug(struct pt_regs * regs,
unsigned long error_code)
{
- unsigned long condition;
struct task_struct *tsk = current;
+ unsigned long condition;
siginfo_t info;
trace_hardirqs_fixup();
@@ -925,21 +928,19 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs,
/* Mask out spurious debug traps due to lazy DR7 setting */
if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
- if (!tsk->thread.debugreg7) {
+ if (!tsk->thread.debugreg7)
goto clear_dr7;
- }
}
tsk->thread.debugreg6 = condition;
-
/*
* Single-stepping through TF: make sure we ignore any events in
* kernel space (but re-enable TF when returning to user mode).
*/
if (condition & DR_STEP) {
- if (!user_mode(regs))
- goto clear_TF_reenable;
+ if (!user_mode(regs))
+ goto clear_TF_reenable;
}
/* Ok, finally something we can handle */
@@ -952,7 +953,7 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs,
force_sig_info(SIGTRAP, &info, tsk);
clear_dr7:
- set_debugreg(0UL, 7);
+ set_debugreg(0, 7);
preempt_conditional_cli(regs);
return;
@@ -960,6 +961,7 @@ clear_TF_reenable:
set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
regs->flags &= ~X86_EFLAGS_TF;
preempt_conditional_cli(regs);
+ return;
}
static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
@@ -982,7 +984,7 @@ static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
asmlinkage void do_coprocessor_error(struct pt_regs *regs)
{
void __user *ip = (void __user *)(regs->ip);
- struct task_struct * task;
+ struct task_struct *task;
siginfo_t info;
unsigned short cwd, swd;
@@ -1015,30 +1017,30 @@ asmlinkage void do_coprocessor_error(struct pt_regs *regs)
cwd = get_fpu_cwd(task);
swd = get_fpu_swd(task);
switch (swd & ~cwd & 0x3f) {
- case 0x000:
- default:
- break;
- case 0x001: /* Invalid Op */
- /*
- * swd & 0x240 == 0x040: Stack Underflow
- * swd & 0x240 == 0x240: Stack Overflow
- * User must clear the SF bit (0x40) if set
- */
- info.si_code = FPE_FLTINV;
- break;
- case 0x002: /* Denormalize */
- case 0x010: /* Underflow */
- info.si_code = FPE_FLTUND;
- break;
- case 0x004: /* Zero Divide */
- info.si_code = FPE_FLTDIV;
- break;
- case 0x008: /* Overflow */
- info.si_code = FPE_FLTOVF;
- break;
- case 0x020: /* Precision */
- info.si_code = FPE_FLTRES;
- break;
+ case 0x000: /* No unmasked exception */
+ default: /* Multiple exceptions */
+ break;
+ case 0x001: /* Invalid Op */
+ /*
+ * swd & 0x240 == 0x040: Stack Underflow
+ * swd & 0x240 == 0x240: Stack Overflow
+ * User must clear the SF bit (0x40) if set
+ */
+ info.si_code = FPE_FLTINV;
+ break;
+ case 0x002: /* Denormalize */
+ case 0x010: /* Underflow */
+ info.si_code = FPE_FLTUND;
+ break;
+ case 0x004: /* Zero Divide */
+ info.si_code = FPE_FLTDIV;
+ break;
+ case 0x008: /* Overflow */
+ info.si_code = FPE_FLTOVF;
+ break;
+ case 0x020: /* Precision */
+ info.si_code = FPE_FLTRES;
+ break;
}
force_sig_info(SIGFPE, &info, task);
}
@@ -1051,7 +1053,7 @@ asmlinkage void bad_intr(void)
asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
{
void __user *ip = (void __user *)(regs->ip);
- struct task_struct * task;
+ struct task_struct *task;
siginfo_t info;
unsigned short mxcsr;
@@ -1079,25 +1081,25 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
*/
mxcsr = get_fpu_mxcsr(task);
switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
- case 0x000:
- default:
- break;
- case 0x001: /* Invalid Op */
- info.si_code = FPE_FLTINV;
- break;
- case 0x002: /* Denormalize */
- case 0x010: /* Underflow */
- info.si_code = FPE_FLTUND;
- break;
- case 0x004: /* Zero Divide */
- info.si_code = FPE_FLTDIV;
- break;
- case 0x008: /* Overflow */
- info.si_code = FPE_FLTOVF;
- break;
- case 0x020: /* Precision */
- info.si_code = FPE_FLTRES;
- break;
+ case 0x000:
+ default:
+ break;
+ case 0x001: /* Invalid Op */
+ info.si_code = FPE_FLTINV;
+ break;
+ case 0x002: /* Denormalize */
+ case 0x010: /* Underflow */
+ info.si_code = FPE_FLTUND;
+ break;
+ case 0x004: /* Zero Divide */
+ info.si_code = FPE_FLTDIV;
+ break;
+ case 0x008: /* Overflow */
+ info.si_code = FPE_FLTOVF;
+ break;
+ case 0x020: /* Precision */
+ info.si_code = FPE_FLTRES;
+ break;
}
force_sig_info(SIGFPE, &info, task);
}
@@ -1115,7 +1117,7 @@ asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
}
/*
- * 'math_state_restore()' saves the current math information in the
+ * 'math_state_restore()' saves the current math information in the
* old math state array, and gets the new ones from the current task
*
* Careful.. There are problems with IBM-designed IRQ13 behaviour.
@@ -1140,7 +1142,7 @@ asmlinkage void math_state_restore(void)
local_irq_disable();
}
- clts(); /* Allow maths ops (or we recurse) */
+ clts(); /* Allow maths ops (or we recurse) */
restore_fpu_checking(&me->thread.xstate->fxsave);
task_thread_info(me)->status |= TS_USEDFPU;
me->fpu_counter++;
@@ -1149,64 +1151,61 @@ EXPORT_SYMBOL_GPL(math_state_restore);
void __init trap_init(void)
{
- set_intr_gate(0,&divide_error);
- set_intr_gate_ist(1,&debug,DEBUG_STACK);
- set_intr_gate_ist(2,&nmi,NMI_STACK);
- set_system_gate_ist(3,&int3,DEBUG_STACK); /* int3 can be called from all */
- set_system_gate(4,&overflow); /* int4 can be called from all */
- set_intr_gate(5,&bounds);
- set_intr_gate(6,&invalid_op);
- set_intr_gate(7,&device_not_available);
- set_intr_gate_ist(8,&double_fault, DOUBLEFAULT_STACK);
- set_intr_gate(9,&coprocessor_segment_overrun);
- set_intr_gate(10,&invalid_TSS);
- set_intr_gate(11,&segment_not_present);
- set_intr_gate_ist(12,&stack_segment,STACKFAULT_STACK);
- set_intr_gate(13,&general_protection);
- set_intr_gate(14,&page_fault);
- set_intr_gate(15,&spurious_interrupt_bug);
- set_intr_gate(16,&coprocessor_error);
- set_intr_gate(17,&alignment_check);
+ set_intr_gate(0, &divide_error);
+ set_intr_gate_ist(1, &debug, DEBUG_STACK);
+ set_intr_gate_ist(2, &nmi, NMI_STACK);
+ set_system_gate_ist(3, &int3, DEBUG_STACK); /* int3 can be called from all */
+ set_system_gate(4, &overflow); /* int4 can be called from all */
+ set_intr_gate(5, &bounds);
+ set_intr_gate(6, &invalid_op);
+ set_intr_gate(7, &device_not_available);
+ set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK);
+ set_intr_gate(9, &coprocessor_segment_overrun);
+ set_intr_gate(10, &invalid_TSS);
+ set_intr_gate(11, &segment_not_present);
+ set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK);
+ set_intr_gate(13, &general_protection);
+ set_intr_gate(14, &page_fault);
+ set_intr_gate(15, &spurious_interrupt_bug);
+ set_intr_gate(16, &coprocessor_error);
+ set_intr_gate(17, &alignment_check);
#ifdef CONFIG_X86_MCE
- set_intr_gate_ist(18,&machine_check, MCE_STACK);
+ set_intr_gate_ist(18, &machine_check, MCE_STACK);
#endif
- set_intr_gate(19,&simd_coprocessor_error);
+ set_intr_gate(19, &simd_coprocessor_error);
#ifdef CONFIG_IA32_EMULATION
set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
#endif
-
/*
* initialize the per thread extended state:
*/
- init_thread_xstate();
+ init_thread_xstate();
/*
- * Should be a barrier for any external CPU state.
+ * Should be a barrier for any external CPU state:
*/
cpu_init();
}
-
static int __init oops_setup(char *s)
-{
+{
if (!s)
return -EINVAL;
if (!strcmp(s, "panic"))
panic_on_oops = 1;
return 0;
-}
+}
early_param("oops", oops_setup);
static int __init kstack_setup(char *s)
{
if (!s)
return -EINVAL;
- kstack_depth_to_print = simple_strtoul(s,NULL,0);
+ kstack_depth_to_print = simple_strtoul(s, NULL, 0);
return 0;
}
early_param("kstack", kstack_setup);
-
static int __init code_bytes_setup(char *s)
{
code_bytes = simple_strtoul(s, NULL, 0);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
new file mode 100644
index 00000000000..7603c055390
--- /dev/null
+++ b/arch/x86/kernel/tsc.c
@@ -0,0 +1,535 @@
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/timer.h>
+#include <linux/acpi_pmtmr.h>
+#include <linux/cpufreq.h>
+#include <linux/dmi.h>
+#include <linux/delay.h>
+#include <linux/clocksource.h>
+#include <linux/percpu.h>
+
+#include <asm/hpet.h>
+#include <asm/timer.h>
+#include <asm/vgtod.h>
+#include <asm/time.h>
+#include <asm/delay.h>
+
+unsigned int cpu_khz; /* TSC clocks / usec, not used here */
+EXPORT_SYMBOL(cpu_khz);
+unsigned int tsc_khz;
+EXPORT_SYMBOL(tsc_khz);
+
+/*
+ * TSC can be unstable due to cpufreq or due to unsynced TSCs
+ */
+static int tsc_unstable;
+
+/* native_sched_clock() is called before tsc_init(), so
+ we must start with the TSC soft disabled to prevent
+ erroneous rdtsc usage on !cpu_has_tsc processors */
+static int tsc_disabled = -1;
+
+/*
+ * Scheduler clock - returns current time in nanosec units.
+ */
+u64 native_sched_clock(void)
+{
+ u64 this_offset;
+
+ /*
+ * Fall back to jiffies if there's no TSC available:
+ * ( But note that we still use it if the TSC is marked
+ * unstable. We do this because unlike Time Of Day,
+ * the scheduler clock tolerates small errors and it's
+ * very important for it to be as fast as the platform
+ * can achive it. )
+ */
+ if (unlikely(tsc_disabled)) {
+ /* No locking but a rare wrong value is not a big deal: */
+ return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
+ }
+
+ /* read the Time Stamp Counter: */
+ rdtscll(this_offset);
+
+ /* return the value in ns */
+ return cycles_2_ns(this_offset);
+}
+
+/* We need to define a real function for sched_clock, to override the
+ weak default version */
+#ifdef CONFIG_PARAVIRT
+unsigned long long sched_clock(void)
+{
+ return paravirt_sched_clock();
+}
+#else
+unsigned long long
+sched_clock(void) __attribute__((alias("native_sched_clock")));
+#endif
+
+int check_tsc_unstable(void)
+{
+ return tsc_unstable;
+}
+EXPORT_SYMBOL_GPL(check_tsc_unstable);
+
+#ifdef CONFIG_X86_TSC
+int __init notsc_setup(char *str)
+{
+ printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, "
+ "cannot disable TSC completely.\n");
+ tsc_disabled = 1;
+ return 1;
+}
+#else
+/*
+ * disable flag for tsc. Takes effect by clearing the TSC cpu flag
+ * in cpu/common.c
+ */
+int __init notsc_setup(char *str)
+{
+ setup_clear_cpu_cap(X86_FEATURE_TSC);
+ return 1;
+}
+#endif
+
+__setup("notsc", notsc_setup);
+
+#define MAX_RETRIES 5
+#define SMI_TRESHOLD 50000
+
+/*
+ * Read TSC and the reference counters. Take care of SMI disturbance
+ */
+static u64 __init tsc_read_refs(u64 *pm, u64 *hpet)
+{
+ u64 t1, t2;
+ int i;
+
+ for (i = 0; i < MAX_RETRIES; i++) {
+ t1 = get_cycles();
+ if (hpet)
+ *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
+ else
+ *pm = acpi_pm_read_early();
+ t2 = get_cycles();
+ if ((t2 - t1) < SMI_TRESHOLD)
+ return t2;
+ }
+ return ULLONG_MAX;
+}
+
+/**
+ * native_calibrate_tsc - calibrate the tsc on boot
+ */
+unsigned long native_calibrate_tsc(void)
+{
+ unsigned long flags;
+ u64 tsc1, tsc2, tr1, tr2, delta, pm1, pm2, hpet1, hpet2;
+ int hpet = is_hpet_enabled();
+ unsigned int tsc_khz_val = 0;
+
+ local_irq_save(flags);
+
+ tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL);
+
+ outb((inb(0x61) & ~0x02) | 0x01, 0x61);
+
+ outb(0xb0, 0x43);
+ outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42);
+ outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42);
+ tr1 = get_cycles();
+ while ((inb(0x61) & 0x20) == 0);
+ tr2 = get_cycles();
+
+ tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL);
+
+ local_irq_restore(flags);
+
+ /*
+ * Preset the result with the raw and inaccurate PIT
+ * calibration value
+ */
+ delta = (tr2 - tr1);
+ do_div(delta, 50);
+ tsc_khz_val = delta;
+
+ /* hpet or pmtimer available ? */
+ if (!hpet && !pm1 && !pm2) {
+ printk(KERN_INFO "TSC calibrated against PIT\n");
+ goto out;
+ }
+
+ /* Check, whether the sampling was disturbed by an SMI */
+ if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX) {
+ printk(KERN_WARNING "TSC calibration disturbed by SMI, "
+ "using PIT calibration result\n");
+ goto out;
+ }
+
+ tsc2 = (tsc2 - tsc1) * 1000000LL;
+
+ if (hpet) {
+ printk(KERN_INFO "TSC calibrated against HPET\n");
+ if (hpet2 < hpet1)
+ hpet2 += 0x100000000ULL;
+ hpet2 -= hpet1;
+ tsc1 = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
+ do_div(tsc1, 1000000);
+ } else {
+ printk(KERN_INFO "TSC calibrated against PM_TIMER\n");
+ if (pm2 < pm1)
+ pm2 += (u64)ACPI_PM_OVRRUN;
+ pm2 -= pm1;
+ tsc1 = pm2 * 1000000000LL;
+ do_div(tsc1, PMTMR_TICKS_PER_SEC);
+ }
+
+ do_div(tsc2, tsc1);
+ tsc_khz_val = tsc2;
+
+out:
+ return tsc_khz_val;
+}
+
+
+#ifdef CONFIG_X86_32
+/* Only called from the Powernow K7 cpu freq driver */
+int recalibrate_cpu_khz(void)
+{
+#ifndef CONFIG_SMP
+ unsigned long cpu_khz_old = cpu_khz;
+
+ if (cpu_has_tsc) {
+ tsc_khz = calibrate_tsc();
+ cpu_khz = tsc_khz;
+ cpu_data(0).loops_per_jiffy =
+ cpufreq_scale(cpu_data(0).loops_per_jiffy,
+ cpu_khz_old, cpu_khz);
+ return 0;
+ } else
+ return -ENODEV;
+#else
+ return -ENODEV;
+#endif
+}
+
+EXPORT_SYMBOL(recalibrate_cpu_khz);
+
+#endif /* CONFIG_X86_32 */
+
+/* Accelerators for sched_clock()
+ * convert from cycles(64bits) => nanoseconds (64bits)
+ * basic equation:
+ * ns = cycles / (freq / ns_per_sec)
+ * ns = cycles * (ns_per_sec / freq)
+ * ns = cycles * (10^9 / (cpu_khz * 10^3))
+ * ns = cycles * (10^6 / cpu_khz)
+ *
+ * Then we use scaling math (suggested by george@mvista.com) to get:
+ * ns = cycles * (10^6 * SC / cpu_khz) / SC
+ * ns = cycles * cyc2ns_scale / SC
+ *
+ * And since SC is a constant power of two, we can convert the div
+ * into a shift.
+ *
+ * We can use khz divisor instead of mhz to keep a better precision, since
+ * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
+ * (mathieu.desnoyers@polymtl.ca)
+ *
+ * -johnstul@us.ibm.com "math is hard, lets go shopping!"
+ */
+
+DEFINE_PER_CPU(unsigned long, cyc2ns);
+
+static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
+{
+ unsigned long long tsc_now, ns_now;
+ unsigned long flags, *scale;
+
+ local_irq_save(flags);
+ sched_clock_idle_sleep_event();
+
+ scale = &per_cpu(cyc2ns, cpu);
+
+ rdtscll(tsc_now);
+ ns_now = __cycles_2_ns(tsc_now);
+
+ if (cpu_khz)
+ *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
+
+ sched_clock_idle_wakeup_event(0);
+ local_irq_restore(flags);
+}
+
+#ifdef CONFIG_CPU_FREQ
+
+/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
+ * changes.
+ *
+ * RED-PEN: On SMP we assume all CPUs run with the same frequency. It's
+ * not that important because current Opteron setups do not support
+ * scaling on SMP anyroads.
+ *
+ * Should fix up last_tsc too. Currently gettimeofday in the
+ * first tick after the change will be slightly wrong.
+ */
+
+static unsigned int ref_freq;
+static unsigned long loops_per_jiffy_ref;
+static unsigned long tsc_khz_ref;
+
+static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct cpufreq_freqs *freq = data;
+ unsigned long *lpj, dummy;
+
+ if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC))
+ return 0;
+
+ lpj = &dummy;
+ if (!(freq->flags & CPUFREQ_CONST_LOOPS))
+#ifdef CONFIG_SMP
+ lpj = &cpu_data(freq->cpu).loops_per_jiffy;
+#else
+ lpj = &boot_cpu_data.loops_per_jiffy;
+#endif
+
+ if (!ref_freq) {
+ ref_freq = freq->old;
+ loops_per_jiffy_ref = *lpj;
+ tsc_khz_ref = tsc_khz;
+ }
+ if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
+ (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
+ (val == CPUFREQ_RESUMECHANGE)) {
+ *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
+
+ tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
+ if (!(freq->flags & CPUFREQ_CONST_LOOPS))
+ mark_tsc_unstable("cpufreq changes");
+ }
+
+ set_cyc2ns_scale(tsc_khz_ref, freq->cpu);
+
+ return 0;
+}
+
+static struct notifier_block time_cpufreq_notifier_block = {
+ .notifier_call = time_cpufreq_notifier
+};
+
+static int __init cpufreq_tsc(void)
+{
+ cpufreq_register_notifier(&time_cpufreq_notifier_block,
+ CPUFREQ_TRANSITION_NOTIFIER);
+ return 0;
+}
+
+core_initcall(cpufreq_tsc);
+
+#endif /* CONFIG_CPU_FREQ */
+
+/* clocksource code */
+
+static struct clocksource clocksource_tsc;
+
+/*
+ * We compare the TSC to the cycle_last value in the clocksource
+ * structure to avoid a nasty time-warp. This can be observed in a
+ * very small window right after one CPU updated cycle_last under
+ * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which
+ * is smaller than the cycle_last reference value due to a TSC which
+ * is slighty behind. This delta is nowhere else observable, but in
+ * that case it results in a forward time jump in the range of hours
+ * due to the unsigned delta calculation of the time keeping core
+ * code, which is necessary to support wrapping clocksources like pm
+ * timer.
+ */
+static cycle_t read_tsc(void)
+{
+ cycle_t ret = (cycle_t)get_cycles();
+
+ return ret >= clocksource_tsc.cycle_last ?
+ ret : clocksource_tsc.cycle_last;
+}
+
+#ifdef CONFIG_X86_64
+static cycle_t __vsyscall_fn vread_tsc(void)
+{
+ cycle_t ret = (cycle_t)vget_cycles();
+
+ return ret >= __vsyscall_gtod_data.clock.cycle_last ?
+ ret : __vsyscall_gtod_data.clock.cycle_last;
+}
+#endif
+
+static struct clocksource clocksource_tsc = {
+ .name = "tsc",
+ .rating = 300,
+ .read = read_tsc,
+ .mask = CLOCKSOURCE_MASK(64),
+ .shift = 22,
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS |
+ CLOCK_SOURCE_MUST_VERIFY,
+#ifdef CONFIG_X86_64
+ .vread = vread_tsc,
+#endif
+};
+
+void mark_tsc_unstable(char *reason)
+{
+ if (!tsc_unstable) {
+ tsc_unstable = 1;
+ printk("Marking TSC unstable due to %s\n", reason);
+ /* Change only the rating, when not registered */
+ if (clocksource_tsc.mult)
+ clocksource_change_rating(&clocksource_tsc, 0);
+ else
+ clocksource_tsc.rating = 0;
+ }
+}
+
+EXPORT_SYMBOL_GPL(mark_tsc_unstable);
+
+static int __init dmi_mark_tsc_unstable(const struct dmi_system_id *d)
+{
+ printk(KERN_NOTICE "%s detected: marking TSC unstable.\n",
+ d->ident);
+ tsc_unstable = 1;
+ return 0;
+}
+
+/* List of systems that have known TSC problems */
+static struct dmi_system_id __initdata bad_tsc_dmi_table[] = {
+ {
+ .callback = dmi_mark_tsc_unstable,
+ .ident = "IBM Thinkpad 380XD",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
+ DMI_MATCH(DMI_BOARD_NAME, "2635FA0"),
+ },
+ },
+ {}
+};
+
+/*
+ * Geode_LX - the OLPC CPU has a possibly a very reliable TSC
+ */
+#ifdef CONFIG_MGEODE_LX
+/* RTSC counts during suspend */
+#define RTSC_SUSP 0x100
+
+static void __init check_geode_tsc_reliable(void)
+{
+ unsigned long res_low, res_high;
+
+ rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
+ if (res_low & RTSC_SUSP)
+ clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+}
+#else
+static inline void check_geode_tsc_reliable(void) { }
+#endif
+
+/*
+ * Make an educated guess if the TSC is trustworthy and synchronized
+ * over all CPUs.
+ */
+__cpuinit int unsynchronized_tsc(void)
+{
+ if (!cpu_has_tsc || tsc_unstable)
+ return 1;
+
+#ifdef CONFIG_SMP
+ if (apic_is_clustered_box())
+ return 1;
+#endif
+
+ if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+ return 0;
+ /*
+ * Intel systems are normally all synchronized.
+ * Exceptions must mark TSC as unstable:
+ */
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
+ /* assume multi socket systems are not synchronized: */
+ if (num_possible_cpus() > 1)
+ tsc_unstable = 1;
+ }
+
+ return tsc_unstable;
+}
+
+static void __init init_tsc_clocksource(void)
+{
+ clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
+ clocksource_tsc.shift);
+ /* lower the rating if we already know its unstable: */
+ if (check_tsc_unstable()) {
+ clocksource_tsc.rating = 0;
+ clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
+ }
+ clocksource_register(&clocksource_tsc);
+}
+
+void __init tsc_init(void)
+{
+ u64 lpj;
+ int cpu;
+
+ if (!cpu_has_tsc)
+ return;
+
+ tsc_khz = calibrate_tsc();
+ cpu_khz = tsc_khz;
+
+ if (!tsc_khz) {
+ mark_tsc_unstable("could not calculate TSC khz");
+ return;
+ }
+
+#ifdef CONFIG_X86_64
+ if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&
+ (boot_cpu_data.x86_vendor == X86_VENDOR_AMD))
+ cpu_khz = calibrate_cpu();
+#endif
+
+ lpj = ((u64)tsc_khz * 1000);
+ do_div(lpj, HZ);
+ lpj_fine = lpj;
+
+ printk("Detected %lu.%03lu MHz processor.\n",
+ (unsigned long)cpu_khz / 1000,
+ (unsigned long)cpu_khz % 1000);
+
+ /*
+ * Secondary CPUs do not run through tsc_init(), so set up
+ * all the scale factors for all CPUs, assuming the same
+ * speed as the bootup CPU. (cpufreq notifiers will fix this
+ * up if their speed diverges)
+ */
+ for_each_possible_cpu(cpu)
+ set_cyc2ns_scale(cpu_khz, cpu);
+
+ if (tsc_disabled > 0)
+ return;
+
+ /* now allow native_sched_clock() to use rdtsc */
+ tsc_disabled = 0;
+
+ use_tsc_delay();
+ /* Check and install the TSC clocksource */
+ dmi_check_system(bad_tsc_dmi_table);
+
+ if (unsynchronized_tsc())
+ mark_tsc_unstable("TSCs unsynchronized");
+
+ check_geode_tsc_reliable();
+ init_tsc_clocksource();
+}
+
diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c
deleted file mode 100644
index 068759db63d..00000000000
--- a/arch/x86/kernel/tsc_32.c
+++ /dev/null
@@ -1,453 +0,0 @@
-#include <linux/sched.h>
-#include <linux/clocksource.h>
-#include <linux/workqueue.h>
-#include <linux/cpufreq.h>
-#include <linux/jiffies.h>
-#include <linux/init.h>
-#include <linux/dmi.h>
-#include <linux/percpu.h>
-
-#include <asm/delay.h>
-#include <asm/tsc.h>
-#include <asm/io.h>
-#include <asm/timer.h>
-
-#include "mach_timer.h"
-
-static int tsc_disabled;
-
-/*
- * On some systems the TSC frequency does not
- * change with the cpu frequency. So we need
- * an extra value to store the TSC freq
- */
-unsigned int tsc_khz;
-EXPORT_SYMBOL_GPL(tsc_khz);
-
-#ifdef CONFIG_X86_TSC
-static int __init tsc_setup(char *str)
-{
- printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, "
- "cannot disable TSC completely.\n");
- tsc_disabled = 1;
- return 1;
-}
-#else
-/*
- * disable flag for tsc. Takes effect by clearing the TSC cpu flag
- * in cpu/common.c
- */
-static int __init tsc_setup(char *str)
-{
- setup_clear_cpu_cap(X86_FEATURE_TSC);
- return 1;
-}
-#endif
-
-__setup("notsc", tsc_setup);
-
-/*
- * code to mark and check if the TSC is unstable
- * due to cpufreq or due to unsynced TSCs
- */
-static int tsc_unstable;
-
-int check_tsc_unstable(void)
-{
- return tsc_unstable;
-}
-EXPORT_SYMBOL_GPL(check_tsc_unstable);
-
-/* Accelerators for sched_clock()
- * convert from cycles(64bits) => nanoseconds (64bits)
- * basic equation:
- * ns = cycles / (freq / ns_per_sec)
- * ns = cycles * (ns_per_sec / freq)
- * ns = cycles * (10^9 / (cpu_khz * 10^3))
- * ns = cycles * (10^6 / cpu_khz)
- *
- * Then we use scaling math (suggested by george@mvista.com) to get:
- * ns = cycles * (10^6 * SC / cpu_khz) / SC
- * ns = cycles * cyc2ns_scale / SC
- *
- * And since SC is a constant power of two, we can convert the div
- * into a shift.
- *
- * We can use khz divisor instead of mhz to keep a better precision, since
- * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
- * (mathieu.desnoyers@polymtl.ca)
- *
- * -johnstul@us.ibm.com "math is hard, lets go shopping!"
- */
-
-DEFINE_PER_CPU(unsigned long, cyc2ns);
-
-static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
-{
- unsigned long long tsc_now, ns_now;
- unsigned long flags, *scale;
-
- local_irq_save(flags);
- sched_clock_idle_sleep_event();
-
- scale = &per_cpu(cyc2ns, cpu);
-
- rdtscll(tsc_now);
- ns_now = __cycles_2_ns(tsc_now);
-
- if (cpu_khz)
- *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
-
- /*
- * Start smoothly with the new frequency:
- */
- sched_clock_idle_wakeup_event(0);
- local_irq_restore(flags);
-}
-
-/*
- * Scheduler clock - returns current time in nanosec units.
- */
-unsigned long long native_sched_clock(void)
-{
- unsigned long long this_offset;
-
- /*
- * Fall back to jiffies if there's no TSC available:
- * ( But note that we still use it if the TSC is marked
- * unstable. We do this because unlike Time Of Day,
- * the scheduler clock tolerates small errors and it's
- * very important for it to be as fast as the platform
- * can achive it. )
- */
- if (unlikely(tsc_disabled))
- /* No locking but a rare wrong value is not a big deal: */
- return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
-
- /* read the Time Stamp Counter: */
- rdtscll(this_offset);
-
- /* return the value in ns */
- return cycles_2_ns(this_offset);
-}
-
-/* We need to define a real function for sched_clock, to override the
- weak default version */
-#ifdef CONFIG_PARAVIRT
-unsigned long long sched_clock(void)
-{
- return paravirt_sched_clock();
-}
-#else
-unsigned long long sched_clock(void)
- __attribute__((alias("native_sched_clock")));
-#endif
-
-unsigned long native_calculate_cpu_khz(void)
-{
- unsigned long long start, end;
- unsigned long count;
- u64 delta64 = (u64)ULLONG_MAX;
- int i;
- unsigned long flags;
-
- local_irq_save(flags);
-
- /* run 3 times to ensure the cache is warm and to get an accurate reading */
- for (i = 0; i < 3; i++) {
- mach_prepare_counter();
- rdtscll(start);
- mach_countup(&count);
- rdtscll(end);
-
- /*
- * Error: ECTCNEVERSET
- * The CTC wasn't reliable: we got a hit on the very first read,
- * or the CPU was so fast/slow that the quotient wouldn't fit in
- * 32 bits..
- */
- if (count <= 1)
- continue;
-
- /* cpu freq too slow: */
- if ((end - start) <= CALIBRATE_TIME_MSEC)
- continue;
-
- /*
- * We want the minimum time of all runs in case one of them
- * is inaccurate due to SMI or other delay
- */
- delta64 = min(delta64, (end - start));
- }
-
- /* cpu freq too fast (or every run was bad): */
- if (delta64 > (1ULL<<32))
- goto err;
-
- delta64 += CALIBRATE_TIME_MSEC/2; /* round for do_div */
- do_div(delta64,CALIBRATE_TIME_MSEC);
-
- local_irq_restore(flags);
- return (unsigned long)delta64;
-err:
- local_irq_restore(flags);
- return 0;
-}
-
-int recalibrate_cpu_khz(void)
-{
-#ifndef CONFIG_SMP
- unsigned long cpu_khz_old = cpu_khz;
-
- if (cpu_has_tsc) {
- cpu_khz = calculate_cpu_khz();
- tsc_khz = cpu_khz;
- cpu_data(0).loops_per_jiffy =
- cpufreq_scale(cpu_data(0).loops_per_jiffy,
- cpu_khz_old, cpu_khz);
- return 0;
- } else
- return -ENODEV;
-#else
- return -ENODEV;
-#endif
-}
-
-EXPORT_SYMBOL(recalibrate_cpu_khz);
-
-#ifdef CONFIG_CPU_FREQ
-
-/*
- * if the CPU frequency is scaled, TSC-based delays will need a different
- * loops_per_jiffy value to function properly.
- */
-static unsigned int ref_freq;
-static unsigned long loops_per_jiffy_ref;
-static unsigned long cpu_khz_ref;
-
-static int
-time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data)
-{
- struct cpufreq_freqs *freq = data;
-
- if (!ref_freq) {
- if (!freq->old){
- ref_freq = freq->new;
- return 0;
- }
- ref_freq = freq->old;
- loops_per_jiffy_ref = cpu_data(freq->cpu).loops_per_jiffy;
- cpu_khz_ref = cpu_khz;
- }
-
- if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
- (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
- (val == CPUFREQ_RESUMECHANGE)) {
- if (!(freq->flags & CPUFREQ_CONST_LOOPS))
- cpu_data(freq->cpu).loops_per_jiffy =
- cpufreq_scale(loops_per_jiffy_ref,
- ref_freq, freq->new);
-
- if (cpu_khz) {
-
- if (num_online_cpus() == 1)
- cpu_khz = cpufreq_scale(cpu_khz_ref,
- ref_freq, freq->new);
- if (!(freq->flags & CPUFREQ_CONST_LOOPS)) {
- tsc_khz = cpu_khz;
- set_cyc2ns_scale(cpu_khz, freq->cpu);
- /*
- * TSC based sched_clock turns
- * to junk w/ cpufreq
- */
- mark_tsc_unstable("cpufreq changes");
- }
- }
- }
-
- return 0;
-}
-
-static struct notifier_block time_cpufreq_notifier_block = {
- .notifier_call = time_cpufreq_notifier
-};
-
-static int __init cpufreq_tsc(void)
-{
- return cpufreq_register_notifier(&time_cpufreq_notifier_block,
- CPUFREQ_TRANSITION_NOTIFIER);
-}
-core_initcall(cpufreq_tsc);
-
-#endif
-
-/* clock source code */
-
-static unsigned long current_tsc_khz;
-static struct clocksource clocksource_tsc;
-
-/*
- * We compare the TSC to the cycle_last value in the clocksource
- * structure to avoid a nasty time-warp issue. This can be observed in
- * a very small window right after one CPU updated cycle_last under
- * xtime lock and the other CPU reads a TSC value which is smaller
- * than the cycle_last reference value due to a TSC which is slighty
- * behind. This delta is nowhere else observable, but in that case it
- * results in a forward time jump in the range of hours due to the
- * unsigned delta calculation of the time keeping core code, which is
- * necessary to support wrapping clocksources like pm timer.
- */
-static cycle_t read_tsc(void)
-{
- cycle_t ret;
-
- rdtscll(ret);
-
- return ret >= clocksource_tsc.cycle_last ?
- ret : clocksource_tsc.cycle_last;
-}
-
-static struct clocksource clocksource_tsc = {
- .name = "tsc",
- .rating = 300,
- .read = read_tsc,
- .mask = CLOCKSOURCE_MASK(64),
- .mult = 0, /* to be set */
- .shift = 22,
- .flags = CLOCK_SOURCE_IS_CONTINUOUS |
- CLOCK_SOURCE_MUST_VERIFY,
-};
-
-void mark_tsc_unstable(char *reason)
-{
- if (!tsc_unstable) {
- tsc_unstable = 1;
- printk("Marking TSC unstable due to: %s.\n", reason);
- /* Can be called before registration */
- if (clocksource_tsc.mult)
- clocksource_change_rating(&clocksource_tsc, 0);
- else
- clocksource_tsc.rating = 0;
- }
-}
-EXPORT_SYMBOL_GPL(mark_tsc_unstable);
-
-static int __init dmi_mark_tsc_unstable(const struct dmi_system_id *d)
-{
- printk(KERN_NOTICE "%s detected: marking TSC unstable.\n",
- d->ident);
- tsc_unstable = 1;
- return 0;
-}
-
-/* List of systems that have known TSC problems */
-static struct dmi_system_id __initdata bad_tsc_dmi_table[] = {
- {
- .callback = dmi_mark_tsc_unstable,
- .ident = "IBM Thinkpad 380XD",
- .matches = {
- DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
- DMI_MATCH(DMI_BOARD_NAME, "2635FA0"),
- },
- },
- {}
-};
-
-/*
- * Make an educated guess if the TSC is trustworthy and synchronized
- * over all CPUs.
- */
-__cpuinit int unsynchronized_tsc(void)
-{
- if (!cpu_has_tsc || tsc_unstable)
- return 1;
-
- /* Anything with constant TSC should be synchronized */
- if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
- return 0;
-
- /*
- * Intel systems are normally all synchronized.
- * Exceptions must mark TSC as unstable:
- */
- if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
- /* assume multi socket systems are not synchronized: */
- if (num_possible_cpus() > 1)
- tsc_unstable = 1;
- }
- return tsc_unstable;
-}
-
-/*
- * Geode_LX - the OLPC CPU has a possibly a very reliable TSC
- */
-#ifdef CONFIG_MGEODE_LX
-/* RTSC counts during suspend */
-#define RTSC_SUSP 0x100
-
-static void __init check_geode_tsc_reliable(void)
-{
- unsigned long res_low, res_high;
-
- rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
- if (res_low & RTSC_SUSP)
- clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
-}
-#else
-static inline void check_geode_tsc_reliable(void) { }
-#endif
-
-
-void __init tsc_init(void)
-{
- int cpu;
-
- if (!cpu_has_tsc || tsc_disabled) {
- /* Disable the TSC in case of !cpu_has_tsc */
- tsc_disabled = 1;
- return;
- }
-
- cpu_khz = calculate_cpu_khz();
- tsc_khz = cpu_khz;
-
- if (!cpu_khz) {
- mark_tsc_unstable("could not calculate TSC khz");
- /*
- * We need to disable the TSC completely in this case
- * to prevent sched_clock() from using it.
- */
- tsc_disabled = 1;
- return;
- }
-
- printk("Detected %lu.%03lu MHz processor.\n",
- (unsigned long)cpu_khz / 1000,
- (unsigned long)cpu_khz % 1000);
-
- /*
- * Secondary CPUs do not run through tsc_init(), so set up
- * all the scale factors for all CPUs, assuming the same
- * speed as the bootup CPU. (cpufreq notifiers will fix this
- * up if their speed diverges)
- */
- for_each_possible_cpu(cpu)
- set_cyc2ns_scale(cpu_khz, cpu);
-
- use_tsc_delay();
-
- /* Check and install the TSC clocksource */
- dmi_check_system(bad_tsc_dmi_table);
-
- unsynchronized_tsc();
- check_geode_tsc_reliable();
- current_tsc_khz = tsc_khz;
- clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz,
- clocksource_tsc.shift);
- /* lower the rating if we already know its unstable: */
- if (check_tsc_unstable()) {
- clocksource_tsc.rating = 0;
- clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
- }
- clocksource_register(&clocksource_tsc);
-}
diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c
deleted file mode 100644
index 1784b8077a1..00000000000
--- a/arch/x86/kernel/tsc_64.c
+++ /dev/null
@@ -1,357 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/interrupt.h>
-#include <linux/init.h>
-#include <linux/clocksource.h>
-#include <linux/time.h>
-#include <linux/acpi.h>
-#include <linux/cpufreq.h>
-#include <linux/acpi_pmtmr.h>
-
-#include <asm/hpet.h>
-#include <asm/timex.h>
-#include <asm/timer.h>
-#include <asm/vgtod.h>
-
-static int notsc __initdata = 0;
-
-unsigned int cpu_khz; /* TSC clocks / usec, not used here */
-EXPORT_SYMBOL(cpu_khz);
-unsigned int tsc_khz;
-EXPORT_SYMBOL(tsc_khz);
-
-/* Accelerators for sched_clock()
- * convert from cycles(64bits) => nanoseconds (64bits)
- * basic equation:
- * ns = cycles / (freq / ns_per_sec)
- * ns = cycles * (ns_per_sec / freq)
- * ns = cycles * (10^9 / (cpu_khz * 10^3))
- * ns = cycles * (10^6 / cpu_khz)
- *
- * Then we use scaling math (suggested by george@mvista.com) to get:
- * ns = cycles * (10^6 * SC / cpu_khz) / SC
- * ns = cycles * cyc2ns_scale / SC
- *
- * And since SC is a constant power of two, we can convert the div
- * into a shift.
- *
- * We can use khz divisor instead of mhz to keep a better precision, since
- * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
- * (mathieu.desnoyers@polymtl.ca)
- *
- * -johnstul@us.ibm.com "math is hard, lets go shopping!"
- */
-DEFINE_PER_CPU(unsigned long, cyc2ns);
-
-static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
-{
- unsigned long long tsc_now, ns_now;
- unsigned long flags, *scale;
-
- local_irq_save(flags);
- sched_clock_idle_sleep_event();
-
- scale = &per_cpu(cyc2ns, cpu);
-
- rdtscll(tsc_now);
- ns_now = __cycles_2_ns(tsc_now);
-
- if (cpu_khz)
- *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
-
- sched_clock_idle_wakeup_event(0);
- local_irq_restore(flags);
-}
-
-unsigned long long native_sched_clock(void)
-{
- unsigned long a = 0;
-
- /* Could do CPU core sync here. Opteron can execute rdtsc speculatively,
- * which means it is not completely exact and may not be monotonous
- * between CPUs. But the errors should be too small to matter for
- * scheduling purposes.
- */
-
- rdtscll(a);
- return cycles_2_ns(a);
-}
-
-/* We need to define a real function for sched_clock, to override the
- weak default version */
-#ifdef CONFIG_PARAVIRT
-unsigned long long sched_clock(void)
-{
- return paravirt_sched_clock();
-}
-#else
-unsigned long long
-sched_clock(void) __attribute__((alias("native_sched_clock")));
-#endif
-
-
-static int tsc_unstable;
-
-int check_tsc_unstable(void)
-{
- return tsc_unstable;
-}
-EXPORT_SYMBOL_GPL(check_tsc_unstable);
-
-#ifdef CONFIG_CPU_FREQ
-
-/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
- * changes.
- *
- * RED-PEN: On SMP we assume all CPUs run with the same frequency. It's
- * not that important because current Opteron setups do not support
- * scaling on SMP anyroads.
- *
- * Should fix up last_tsc too. Currently gettimeofday in the
- * first tick after the change will be slightly wrong.
- */
-
-static unsigned int ref_freq;
-static unsigned long loops_per_jiffy_ref;
-static unsigned long tsc_khz_ref;
-
-static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
- void *data)
-{
- struct cpufreq_freqs *freq = data;
- unsigned long *lpj, dummy;
-
- if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC))
- return 0;
-
- lpj = &dummy;
- if (!(freq->flags & CPUFREQ_CONST_LOOPS))
-#ifdef CONFIG_SMP
- lpj = &cpu_data(freq->cpu).loops_per_jiffy;
-#else
- lpj = &boot_cpu_data.loops_per_jiffy;
-#endif
-
- if (!ref_freq) {
- ref_freq = freq->old;
- loops_per_jiffy_ref = *lpj;
- tsc_khz_ref = tsc_khz;
- }
- if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
- (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
- (val == CPUFREQ_RESUMECHANGE)) {
- *lpj =
- cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
-
- tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
- if (!(freq->flags & CPUFREQ_CONST_LOOPS))
- mark_tsc_unstable("cpufreq changes");
- }
-
- set_cyc2ns_scale(tsc_khz_ref, freq->cpu);
-
- return 0;
-}
-
-static struct notifier_block time_cpufreq_notifier_block = {
- .notifier_call = time_cpufreq_notifier
-};
-
-static int __init cpufreq_tsc(void)
-{
- cpufreq_register_notifier(&time_cpufreq_notifier_block,
- CPUFREQ_TRANSITION_NOTIFIER);
- return 0;
-}
-
-core_initcall(cpufreq_tsc);
-
-#endif
-
-#define MAX_RETRIES 5
-#define SMI_TRESHOLD 50000
-
-/*
- * Read TSC and the reference counters. Take care of SMI disturbance
- */
-static unsigned long __init tsc_read_refs(unsigned long *pm,
- unsigned long *hpet)
-{
- unsigned long t1, t2;
- int i;
-
- for (i = 0; i < MAX_RETRIES; i++) {
- t1 = get_cycles();
- if (hpet)
- *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
- else
- *pm = acpi_pm_read_early();
- t2 = get_cycles();
- if ((t2 - t1) < SMI_TRESHOLD)
- return t2;
- }
- return ULONG_MAX;
-}
-
-/**
- * tsc_calibrate - calibrate the tsc on boot
- */
-void __init tsc_calibrate(void)
-{
- unsigned long flags, tsc1, tsc2, tr1, tr2, pm1, pm2, hpet1, hpet2;
- int hpet = is_hpet_enabled(), cpu;
-
- local_irq_save(flags);
-
- tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL);
-
- outb((inb(0x61) & ~0x02) | 0x01, 0x61);
-
- outb(0xb0, 0x43);
- outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42);
- outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42);
- tr1 = get_cycles();
- while ((inb(0x61) & 0x20) == 0);
- tr2 = get_cycles();
-
- tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL);
-
- local_irq_restore(flags);
-
- /*
- * Preset the result with the raw and inaccurate PIT
- * calibration value
- */
- tsc_khz = (tr2 - tr1) / 50;
-
- /* hpet or pmtimer available ? */
- if (!hpet && !pm1 && !pm2) {
- printk(KERN_INFO "TSC calibrated against PIT\n");
- goto out;
- }
-
- /* Check, whether the sampling was disturbed by an SMI */
- if (tsc1 == ULONG_MAX || tsc2 == ULONG_MAX) {
- printk(KERN_WARNING "TSC calibration disturbed by SMI, "
- "using PIT calibration result\n");
- goto out;
- }
-
- tsc2 = (tsc2 - tsc1) * 1000000L;
-
- if (hpet) {
- printk(KERN_INFO "TSC calibrated against HPET\n");
- if (hpet2 < hpet1)
- hpet2 += 0x100000000;
- hpet2 -= hpet1;
- tsc1 = (hpet2 * hpet_readl(HPET_PERIOD)) / 1000000;
- } else {
- printk(KERN_INFO "TSC calibrated against PM_TIMER\n");
- if (pm2 < pm1)
- pm2 += ACPI_PM_OVRRUN;
- pm2 -= pm1;
- tsc1 = (pm2 * 1000000000) / PMTMR_TICKS_PER_SEC;
- }
-
- tsc_khz = tsc2 / tsc1;
-
-out:
- for_each_possible_cpu(cpu)
- set_cyc2ns_scale(tsc_khz, cpu);
-}
-
-/*
- * Make an educated guess if the TSC is trustworthy and synchronized
- * over all CPUs.
- */
-__cpuinit int unsynchronized_tsc(void)
-{
- if (tsc_unstable)
- return 1;
-
-#ifdef CONFIG_SMP
- if (apic_is_clustered_box())
- return 1;
-#endif
-
- if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
- return 0;
-
- /* Assume multi socket systems are not synchronized */
- return num_present_cpus() > 1;
-}
-
-int __init notsc_setup(char *s)
-{
- notsc = 1;
- return 1;
-}
-
-__setup("notsc", notsc_setup);
-
-static struct clocksource clocksource_tsc;
-
-/*
- * We compare the TSC to the cycle_last value in the clocksource
- * structure to avoid a nasty time-warp. This can be observed in a
- * very small window right after one CPU updated cycle_last under
- * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which
- * is smaller than the cycle_last reference value due to a TSC which
- * is slighty behind. This delta is nowhere else observable, but in
- * that case it results in a forward time jump in the range of hours
- * due to the unsigned delta calculation of the time keeping core
- * code, which is necessary to support wrapping clocksources like pm
- * timer.
- */
-static cycle_t read_tsc(void)
-{
- cycle_t ret = (cycle_t)get_cycles();
-
- return ret >= clocksource_tsc.cycle_last ?
- ret : clocksource_tsc.cycle_last;
-}
-
-static cycle_t __vsyscall_fn vread_tsc(void)
-{
- cycle_t ret = (cycle_t)vget_cycles();
-
- return ret >= __vsyscall_gtod_data.clock.cycle_last ?
- ret : __vsyscall_gtod_data.clock.cycle_last;
-}
-
-static struct clocksource clocksource_tsc = {
- .name = "tsc",
- .rating = 300,
- .read = read_tsc,
- .mask = CLOCKSOURCE_MASK(64),
- .shift = 22,
- .flags = CLOCK_SOURCE_IS_CONTINUOUS |
- CLOCK_SOURCE_MUST_VERIFY,
- .vread = vread_tsc,
-};
-
-void mark_tsc_unstable(char *reason)
-{
- if (!tsc_unstable) {
- tsc_unstable = 1;
- printk("Marking TSC unstable due to %s\n", reason);
- /* Change only the rating, when not registered */
- if (clocksource_tsc.mult)
- clocksource_change_rating(&clocksource_tsc, 0);
- else
- clocksource_tsc.rating = 0;
- }
-}
-EXPORT_SYMBOL_GPL(mark_tsc_unstable);
-
-void __init init_tsc_clocksource(void)
-{
- if (!notsc) {
- clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
- clocksource_tsc.shift);
- if (check_tsc_unstable())
- clocksource_tsc.rating = 0;
-
- clocksource_register(&clocksource_tsc);
- }
-}
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
new file mode 100644
index 00000000000..e94bdb6add1
--- /dev/null
+++ b/arch/x86/kernel/visws_quirks.c
@@ -0,0 +1,709 @@
+/*
+ * SGI Visual Workstation support and quirks, unmaintained.
+ *
+ * Split out from setup.c by davej@suse.de
+ *
+ * Copyright (C) 1999 Bent Hagemark, Ingo Molnar
+ *
+ * SGI Visual Workstation interrupt controller
+ *
+ * The Cobalt system ASIC in the Visual Workstation contains a "Cobalt" APIC
+ * which serves as the main interrupt controller in the system. Non-legacy
+ * hardware in the system uses this controller directly. Legacy devices
+ * are connected to the PIIX4 which in turn has its 8259(s) connected to
+ * a of the Cobalt APIC entry.
+ *
+ * 09/02/2000 - Updated for 2.4 by jbarnes@sgi.com
+ *
+ * 25/11/2002 - Updated for 2.5 by Andrey Panin <pazke@orbita1.ru>
+ */
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+
+#include <asm/visws/cobalt.h>
+#include <asm/visws/piix4.h>
+#include <asm/arch_hooks.h>
+#include <asm/fixmap.h>
+#include <asm/reboot.h>
+#include <asm/setup.h>
+#include <asm/e820.h>
+#include <asm/smp.h>
+#include <asm/io.h>
+
+#include <mach_ipi.h>
+
+#include "mach_apic.h"
+
+#include <linux/init.h>
+#include <linux/smp.h>
+
+#include <linux/kernel_stat.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+
+#include <asm/io.h>
+#include <asm/apic.h>
+#include <asm/i8259.h>
+#include <asm/irq_vectors.h>
+#include <asm/visws/cobalt.h>
+#include <asm/visws/lithium.h>
+#include <asm/visws/piix4.h>
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/pci_ids.h>
+
+extern int no_broadcast;
+
+#include <asm/io.h>
+#include <asm/apic.h>
+#include <asm/arch_hooks.h>
+#include <asm/visws/cobalt.h>
+#include <asm/visws/lithium.h>
+
+char visws_board_type = -1;
+char visws_board_rev = -1;
+
+int is_visws_box(void)
+{
+ return visws_board_type >= 0;
+}
+
+static int __init visws_time_init_quirk(void)
+{
+ printk(KERN_INFO "Starting Cobalt Timer system clock\n");
+
+ /* Set the countdown value */
+ co_cpu_write(CO_CPU_TIMEVAL, CO_TIME_HZ/HZ);
+
+ /* Start the timer */
+ co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) | CO_CTRL_TIMERUN);
+
+ /* Enable (unmask) the timer interrupt */
+ co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK);
+
+ /*
+ * Zero return means the generic timer setup code will set up
+ * the standard vector:
+ */
+ return 0;
+}
+
+static int __init visws_pre_intr_init_quirk(void)
+{
+ init_VISWS_APIC_irqs();
+
+ /*
+ * We dont want ISA irqs to be set up by the generic code:
+ */
+ return 1;
+}
+
+/* Quirk for machine specific memory setup. */
+
+#define MB (1024 * 1024)
+
+unsigned long sgivwfb_mem_phys;
+unsigned long sgivwfb_mem_size;
+EXPORT_SYMBOL(sgivwfb_mem_phys);
+EXPORT_SYMBOL(sgivwfb_mem_size);
+
+long long mem_size __initdata = 0;
+
+static char * __init visws_memory_setup_quirk(void)
+{
+ long long gfx_mem_size = 8 * MB;
+
+ mem_size = boot_params.alt_mem_k;
+
+ if (!mem_size) {
+ printk(KERN_WARNING "Bootloader didn't set memory size, upgrade it !\n");
+ mem_size = 128 * MB;
+ }
+
+ /*
+ * this hardcodes the graphics memory to 8 MB
+ * it really should be sized dynamically (or at least
+ * set as a boot param)
+ */
+ if (!sgivwfb_mem_size) {
+ printk(KERN_WARNING "Defaulting to 8 MB framebuffer size\n");
+ sgivwfb_mem_size = 8 * MB;
+ }
+
+ /*
+ * Trim to nearest MB
+ */
+ sgivwfb_mem_size &= ~((1 << 20) - 1);
+ sgivwfb_mem_phys = mem_size - gfx_mem_size;
+
+ e820_add_region(0, LOWMEMSIZE(), E820_RAM);
+ e820_add_region(HIGH_MEMORY, mem_size - sgivwfb_mem_size - HIGH_MEMORY, E820_RAM);
+ e820_add_region(sgivwfb_mem_phys, sgivwfb_mem_size, E820_RESERVED);
+
+ return "PROM";
+}
+
+static void visws_machine_emergency_restart(void)
+{
+ /*
+ * Visual Workstations restart after this
+ * register is poked on the PIIX4
+ */
+ outb(PIIX4_RESET_VAL, PIIX4_RESET_PORT);
+}
+
+static void visws_machine_power_off(void)
+{
+ unsigned short pm_status;
+/* extern unsigned int pci_bus0; */
+
+ while ((pm_status = inw(PMSTS_PORT)) & 0x100)
+ outw(pm_status, PMSTS_PORT);
+
+ outw(PM_SUSPEND_ENABLE, PMCNTRL_PORT);
+
+ mdelay(10);
+
+#define PCI_CONF1_ADDRESS(bus, devfn, reg) \
+ (0x80000000 | (bus << 16) | (devfn << 8) | (reg & ~3))
+
+/* outl(PCI_CONF1_ADDRESS(pci_bus0, SPECIAL_DEV, SPECIAL_REG), 0xCF8); */
+ outl(PIIX_SPECIAL_STOP, 0xCFC);
+}
+
+static int __init visws_get_smp_config_quirk(unsigned int early)
+{
+ /*
+ * Prevent MP-table parsing by the generic code:
+ */
+ return 1;
+}
+
+extern unsigned int __cpuinitdata maxcpus;
+
+/*
+ * The Visual Workstation is Intel MP compliant in the hardware
+ * sense, but it doesn't have a BIOS(-configuration table).
+ * No problem for Linux.
+ */
+
+static void __init MP_processor_info (struct mpc_config_processor *m)
+{
+ int ver, logical_apicid;
+ physid_mask_t apic_cpus;
+
+ if (!(m->mpc_cpuflag & CPU_ENABLED))
+ return;
+
+ logical_apicid = m->mpc_apicid;
+ printk(KERN_INFO "%sCPU #%d %u:%u APIC version %d\n",
+ m->mpc_cpuflag & CPU_BOOTPROCESSOR ? "Bootup " : "",
+ m->mpc_apicid,
+ (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
+ (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
+ m->mpc_apicver);
+
+ if (m->mpc_cpuflag & CPU_BOOTPROCESSOR)
+ boot_cpu_physical_apicid = m->mpc_apicid;
+
+ ver = m->mpc_apicver;
+ if ((ver >= 0x14 && m->mpc_apicid >= 0xff) || m->mpc_apicid >= 0xf) {
+ printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
+ m->mpc_apicid, MAX_APICS);
+ return;
+ }
+
+ apic_cpus = apicid_to_cpu_present(m->mpc_apicid);
+ physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus);
+ /*
+ * Validate version
+ */
+ if (ver == 0x0) {
+ printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! "
+ "fixing up to 0x10. (tell your hw vendor)\n",
+ m->mpc_apicid);
+ ver = 0x10;
+ }
+ apic_version[m->mpc_apicid] = ver;
+}
+
+int __init visws_find_smp_config_quirk(unsigned int reserve)
+{
+ struct mpc_config_processor *mp = phys_to_virt(CO_CPU_TAB_PHYS);
+ unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
+
+ if (ncpus > CO_CPU_MAX) {
+ printk(KERN_WARNING "find_visws_smp: got cpu count of %d at %p\n",
+ ncpus, mp);
+
+ ncpus = CO_CPU_MAX;
+ }
+
+ if (ncpus > maxcpus)
+ ncpus = maxcpus;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ smp_found_config = 1;
+#endif
+ while (ncpus--)
+ MP_processor_info(mp++);
+
+ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+
+ return 1;
+}
+
+extern int visws_trap_init_quirk(void);
+
+void __init visws_early_detect(void)
+{
+ int raw;
+
+ visws_board_type = (char)(inb_p(PIIX_GPI_BD_REG) & PIIX_GPI_BD_REG)
+ >> PIIX_GPI_BD_SHIFT;
+
+ if (visws_board_type < 0)
+ return;
+
+ /*
+ * Install special quirks for timer, interrupt and memory setup:
+ */
+ arch_time_init_quirk = visws_time_init_quirk;
+ arch_pre_intr_init_quirk = visws_pre_intr_init_quirk;
+ arch_memory_setup_quirk = visws_memory_setup_quirk;
+
+ /*
+ * Fall back to generic behavior for traps:
+ */
+ arch_intr_init_quirk = NULL;
+ arch_trap_init_quirk = visws_trap_init_quirk;
+
+ /*
+ * Install reboot quirks:
+ */
+ pm_power_off = visws_machine_power_off;
+ machine_ops.emergency_restart = visws_machine_emergency_restart;
+
+ /*
+ * Do not use broadcast IPIs:
+ */
+ no_broadcast = 0;
+
+ /*
+ * Override generic MP-table parsing:
+ */
+ mach_get_smp_config_quirk = visws_get_smp_config_quirk;
+ mach_find_smp_config_quirk = visws_find_smp_config_quirk;
+
+#ifdef CONFIG_X86_IO_APIC
+ /*
+ * Turn off IO-APIC detection and initialization:
+ */
+ skip_ioapic_setup = 1;
+#endif
+
+ /*
+ * Get Board rev.
+ * First, we have to initialize the 307 part to allow us access
+ * to the GPIO registers. Let's map them at 0x0fc0 which is right
+ * after the PIIX4 PM section.
+ */
+ outb_p(SIO_DEV_SEL, SIO_INDEX);
+ outb_p(SIO_GP_DEV, SIO_DATA); /* Talk to GPIO regs. */
+
+ outb_p(SIO_DEV_MSB, SIO_INDEX);
+ outb_p(SIO_GP_MSB, SIO_DATA); /* MSB of GPIO base address */
+
+ outb_p(SIO_DEV_LSB, SIO_INDEX);
+ outb_p(SIO_GP_LSB, SIO_DATA); /* LSB of GPIO base address */
+
+ outb_p(SIO_DEV_ENB, SIO_INDEX);
+ outb_p(1, SIO_DATA); /* Enable GPIO registers. */
+
+ /*
+ * Now, we have to map the power management section to write
+ * a bit which enables access to the GPIO registers.
+ * What lunatic came up with this shit?
+ */
+ outb_p(SIO_DEV_SEL, SIO_INDEX);
+ outb_p(SIO_PM_DEV, SIO_DATA); /* Talk to GPIO regs. */
+
+ outb_p(SIO_DEV_MSB, SIO_INDEX);
+ outb_p(SIO_PM_MSB, SIO_DATA); /* MSB of PM base address */
+
+ outb_p(SIO_DEV_LSB, SIO_INDEX);
+ outb_p(SIO_PM_LSB, SIO_DATA); /* LSB of PM base address */
+
+ outb_p(SIO_DEV_ENB, SIO_INDEX);
+ outb_p(1, SIO_DATA); /* Enable PM registers. */
+
+ /*
+ * Now, write the PM register which enables the GPIO registers.
+ */
+ outb_p(SIO_PM_FER2, SIO_PM_INDEX);
+ outb_p(SIO_PM_GP_EN, SIO_PM_DATA);
+
+ /*
+ * Now, initialize the GPIO registers.
+ * We want them all to be inputs which is the
+ * power on default, so let's leave them alone.
+ * So, let's just read the board rev!
+ */
+ raw = inb_p(SIO_GP_DATA1);
+ raw &= 0x7f; /* 7 bits of valid board revision ID. */
+
+ if (visws_board_type == VISWS_320) {
+ if (raw < 0x6) {
+ visws_board_rev = 4;
+ } else if (raw < 0xc) {
+ visws_board_rev = 5;
+ } else {
+ visws_board_rev = 6;
+ }
+ } else if (visws_board_type == VISWS_540) {
+ visws_board_rev = 2;
+ } else {
+ visws_board_rev = raw;
+ }
+
+ printk(KERN_INFO "Silicon Graphics Visual Workstation %s (rev %d) detected\n",
+ (visws_board_type == VISWS_320 ? "320" :
+ (visws_board_type == VISWS_540 ? "540" :
+ "unknown")), visws_board_rev);
+}
+
+#define A01234 (LI_INTA_0 | LI_INTA_1 | LI_INTA_2 | LI_INTA_3 | LI_INTA_4)
+#define BCD (LI_INTB | LI_INTC | LI_INTD)
+#define ALLDEVS (A01234 | BCD)
+
+static __init void lithium_init(void)
+{
+ set_fixmap(FIX_LI_PCIA, LI_PCI_A_PHYS);
+ set_fixmap(FIX_LI_PCIB, LI_PCI_B_PHYS);
+
+ if ((li_pcia_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) ||
+ (li_pcia_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) {
+ printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'A');
+/* panic("This machine is not SGI Visual Workstation 320/540"); */
+ }
+
+ if ((li_pcib_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) ||
+ (li_pcib_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) {
+ printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'B');
+/* panic("This machine is not SGI Visual Workstation 320/540"); */
+ }
+
+ li_pcia_write16(LI_PCI_INTEN, ALLDEVS);
+ li_pcib_write16(LI_PCI_INTEN, ALLDEVS);
+}
+
+static __init void cobalt_init(void)
+{
+ /*
+ * On normal SMP PC this is used only with SMP, but we have to
+ * use it and set it up here to start the Cobalt clock
+ */
+ set_fixmap(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE);
+ setup_local_APIC();
+ printk(KERN_INFO "Local APIC Version %#x, ID %#x\n",
+ (unsigned int)apic_read(APIC_LVR),
+ (unsigned int)apic_read(APIC_ID));
+
+ set_fixmap(FIX_CO_CPU, CO_CPU_PHYS);
+ set_fixmap(FIX_CO_APIC, CO_APIC_PHYS);
+ printk(KERN_INFO "Cobalt Revision %#lx, APIC ID %#lx\n",
+ co_cpu_read(CO_CPU_REV), co_apic_read(CO_APIC_ID));
+
+ /* Enable Cobalt APIC being careful to NOT change the ID! */
+ co_apic_write(CO_APIC_ID, co_apic_read(CO_APIC_ID) | CO_APIC_ENABLE);
+
+ printk(KERN_INFO "Cobalt APIC enabled: ID reg %#lx\n",
+ co_apic_read(CO_APIC_ID));
+}
+
+int __init visws_trap_init_quirk(void)
+{
+ lithium_init();
+ cobalt_init();
+
+ return 1;
+}
+
+/*
+ * IRQ controller / APIC support:
+ */
+
+static DEFINE_SPINLOCK(cobalt_lock);
+
+/*
+ * Set the given Cobalt APIC Redirection Table entry to point
+ * to the given IDT vector/index.
+ */
+static inline void co_apic_set(int entry, int irq)
+{
+ co_apic_write(CO_APIC_LO(entry), CO_APIC_LEVEL | (irq + FIRST_EXTERNAL_VECTOR));
+ co_apic_write(CO_APIC_HI(entry), 0);
+}
+
+/*
+ * Cobalt (IO)-APIC functions to handle PCI devices.
+ */
+static inline int co_apic_ide0_hack(void)
+{
+ extern char visws_board_type;
+ extern char visws_board_rev;
+
+ if (visws_board_type == VISWS_320 && visws_board_rev == 5)
+ return 5;
+ return CO_APIC_IDE0;
+}
+
+static int is_co_apic(unsigned int irq)
+{
+ if (IS_CO_APIC(irq))
+ return CO_APIC(irq);
+
+ switch (irq) {
+ case 0: return CO_APIC_CPU;
+ case CO_IRQ_IDE0: return co_apic_ide0_hack();
+ case CO_IRQ_IDE1: return CO_APIC_IDE1;
+ default: return -1;
+ }
+}
+
+
+/*
+ * This is the SGI Cobalt (IO-)APIC:
+ */
+
+static void enable_cobalt_irq(unsigned int irq)
+{
+ co_apic_set(is_co_apic(irq), irq);
+}
+
+static void disable_cobalt_irq(unsigned int irq)
+{
+ int entry = is_co_apic(irq);
+
+ co_apic_write(CO_APIC_LO(entry), CO_APIC_MASK);
+ co_apic_read(CO_APIC_LO(entry));
+}
+
+/*
+ * "irq" really just serves to identify the device. Here is where we
+ * map this to the Cobalt APIC entry where it's physically wired.
+ * This is called via request_irq -> setup_irq -> irq_desc->startup()
+ */
+static unsigned int startup_cobalt_irq(unsigned int irq)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&cobalt_lock, flags);
+ if ((irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING)))
+ irq_desc[irq].status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING);
+ enable_cobalt_irq(irq);
+ spin_unlock_irqrestore(&cobalt_lock, flags);
+ return 0;
+}
+
+static void ack_cobalt_irq(unsigned int irq)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&cobalt_lock, flags);
+ disable_cobalt_irq(irq);
+ apic_write(APIC_EOI, APIC_EIO_ACK);
+ spin_unlock_irqrestore(&cobalt_lock, flags);
+}
+
+static void end_cobalt_irq(unsigned int irq)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&cobalt_lock, flags);
+ if (!(irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS)))
+ enable_cobalt_irq(irq);
+ spin_unlock_irqrestore(&cobalt_lock, flags);
+}
+
+static struct irq_chip cobalt_irq_type = {
+ .typename = "Cobalt-APIC",
+ .startup = startup_cobalt_irq,
+ .shutdown = disable_cobalt_irq,
+ .enable = enable_cobalt_irq,
+ .disable = disable_cobalt_irq,
+ .ack = ack_cobalt_irq,
+ .end = end_cobalt_irq,
+};
+
+
+/*
+ * This is the PIIX4-based 8259 that is wired up indirectly to Cobalt
+ * -- not the manner expected by the code in i8259.c.
+ *
+ * there is a 'master' physical interrupt source that gets sent to
+ * the CPU. But in the chipset there are various 'virtual' interrupts
+ * waiting to be handled. We represent this to Linux through a 'master'
+ * interrupt controller type, and through a special virtual interrupt-
+ * controller. Device drivers only see the virtual interrupt sources.
+ */
+static unsigned int startup_piix4_master_irq(unsigned int irq)
+{
+ init_8259A(0);
+
+ return startup_cobalt_irq(irq);
+}
+
+static void end_piix4_master_irq(unsigned int irq)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&cobalt_lock, flags);
+ enable_cobalt_irq(irq);
+ spin_unlock_irqrestore(&cobalt_lock, flags);
+}
+
+static struct irq_chip piix4_master_irq_type = {
+ .typename = "PIIX4-master",
+ .startup = startup_piix4_master_irq,
+ .ack = ack_cobalt_irq,
+ .end = end_piix4_master_irq,
+};
+
+
+static struct irq_chip piix4_virtual_irq_type = {
+ .typename = "PIIX4-virtual",
+ .shutdown = disable_8259A_irq,
+ .enable = enable_8259A_irq,
+ .disable = disable_8259A_irq,
+};
+
+
+/*
+ * PIIX4-8259 master/virtual functions to handle interrupt requests
+ * from legacy devices: floppy, parallel, serial, rtc.
+ *
+ * None of these get Cobalt APIC entries, neither do they have IDT
+ * entries. These interrupts are purely virtual and distributed from
+ * the 'master' interrupt source: CO_IRQ_8259.
+ *
+ * When the 8259 interrupts its handler figures out which of these
+ * devices is interrupting and dispatches to its handler.
+ *
+ * CAREFUL: devices see the 'virtual' interrupt only. Thus disable/
+ * enable_irq gets the right irq. This 'master' irq is never directly
+ * manipulated by any driver.
+ */
+static irqreturn_t piix4_master_intr(int irq, void *dev_id)
+{
+ int realirq;
+ irq_desc_t *desc;
+ unsigned long flags;
+
+ spin_lock_irqsave(&i8259A_lock, flags);
+
+ /* Find out what's interrupting in the PIIX4 master 8259 */
+ outb(0x0c, 0x20); /* OCW3 Poll command */
+ realirq = inb(0x20);
+
+ /*
+ * Bit 7 == 0 means invalid/spurious
+ */
+ if (unlikely(!(realirq & 0x80)))
+ goto out_unlock;
+
+ realirq &= 7;
+
+ if (unlikely(realirq == 2)) {
+ outb(0x0c, 0xa0);
+ realirq = inb(0xa0);
+
+ if (unlikely(!(realirq & 0x80)))
+ goto out_unlock;
+
+ realirq = (realirq & 7) + 8;
+ }
+
+ /* mask and ack interrupt */
+ cached_irq_mask |= 1 << realirq;
+ if (unlikely(realirq > 7)) {
+ inb(0xa1);
+ outb(cached_slave_mask, 0xa1);
+ outb(0x60 + (realirq & 7), 0xa0);
+ outb(0x60 + 2, 0x20);
+ } else {
+ inb(0x21);
+ outb(cached_master_mask, 0x21);
+ outb(0x60 + realirq, 0x20);
+ }
+
+ spin_unlock_irqrestore(&i8259A_lock, flags);
+
+ desc = irq_desc + realirq;
+
+ /*
+ * handle this 'virtual interrupt' as a Cobalt one now.
+ */
+ kstat_cpu(smp_processor_id()).irqs[realirq]++;
+
+ if (likely(desc->action != NULL))
+ handle_IRQ_event(realirq, desc->action);
+
+ if (!(desc->status & IRQ_DISABLED))
+ enable_8259A_irq(realirq);
+
+ return IRQ_HANDLED;
+
+out_unlock:
+ spin_unlock_irqrestore(&i8259A_lock, flags);
+ return IRQ_NONE;
+}
+
+static struct irqaction master_action = {
+ .handler = piix4_master_intr,
+ .name = "PIIX4-8259",
+};
+
+static struct irqaction cascade_action = {
+ .handler = no_action,
+ .name = "cascade",
+};
+
+
+void init_VISWS_APIC_irqs(void)
+{
+ int i;
+
+ for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) {
+ irq_desc[i].status = IRQ_DISABLED;
+ irq_desc[i].action = 0;
+ irq_desc[i].depth = 1;
+
+ if (i == 0) {
+ irq_desc[i].chip = &cobalt_irq_type;
+ }
+ else if (i == CO_IRQ_IDE0) {
+ irq_desc[i].chip = &cobalt_irq_type;
+ }
+ else if (i == CO_IRQ_IDE1) {
+ irq_desc[i].chip = &cobalt_irq_type;
+ }
+ else if (i == CO_IRQ_8259) {
+ irq_desc[i].chip = &piix4_master_irq_type;
+ }
+ else if (i < CO_IRQ_APIC0) {
+ irq_desc[i].chip = &piix4_virtual_irq_type;
+ }
+ else if (IS_CO_APIC(i)) {
+ irq_desc[i].chip = &cobalt_irq_type;
+ }
+ }
+
+ setup_irq(CO_IRQ_8259, &master_action);
+ setup_irq(2, &cascade_action);
+}
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 956f38927aa..b15346092b7 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -151,7 +151,7 @@ static unsigned vmi_patch(u8 type, u16 clobbers, void *insns,
insns, ip);
case PARAVIRT_PATCH(pv_cpu_ops.iret):
return patch_internal(VMI_CALL_IRET, len, insns, ip);
- case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_syscall_ret):
+ case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit):
return patch_internal(VMI_CALL_SYSEXIT, len, insns, ip);
default:
break;
@@ -896,7 +896,7 @@ static inline int __init activate_vmi(void)
* the backend. They are performance critical anyway, so requiring
* a patch is not a big problem.
*/
- pv_cpu_ops.irq_enable_syscall_ret = (void *)0xfeedbab0;
+ pv_cpu_ops.irq_enable_sysexit = (void *)0xfeedbab0;
pv_cpu_ops.iret = (void *)0xbadbab0;
#ifdef CONFIG_SMP
@@ -932,7 +932,7 @@ static inline int __init activate_vmi(void)
pv_apic_ops.setup_secondary_clock = vmi_time_ap_init;
#endif
pv_time_ops.sched_clock = vmi_sched_clock;
- pv_time_ops.get_cpu_khz = vmi_cpu_khz;
+ pv_time_ops.get_tsc_khz = vmi_tsc_khz;
/* We have true wallclock functions; disable CMOS clock sync */
no_sync_cmos_clock = 1;
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index a2b030780aa..6953859fe28 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -33,8 +33,7 @@
#include <asm/apic.h>
#include <asm/timer.h>
#include <asm/i8253.h>
-
-#include <irq_vectors.h>
+#include <asm/irq_vectors.h>
#define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
#define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
@@ -70,8 +69,8 @@ unsigned long long vmi_sched_clock(void)
return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE));
}
-/* paravirt_ops.get_cpu_khz = vmi_cpu_khz */
-unsigned long vmi_cpu_khz(void)
+/* paravirt_ops.get_tsc_khz = vmi_tsc_khz */
+unsigned long vmi_tsc_khz(void)
{
unsigned long long khz;
khz = vmi_timer_ops.get_cycle_frequency();
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index ce5ed083a1e..cdb2363697d 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -49,23 +49,14 @@ SECTIONS
_etext = .; /* End of text section */
} :text = 0x9090
+ NOTES :text :note
+
. = ALIGN(16); /* Exception table */
__ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
__start___ex_table = .;
*(__ex_table)
__stop___ex_table = .;
- }
-
- NOTES :text :note
-
- BUG_TABLE :text
-
- . = ALIGN(4);
- .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) {
- __tracedata_start = .;
- *(.tracedata)
- __tracedata_end = .;
- }
+ } :text = 0x9090
RODATA
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index fad3674b06a..63e5c1a22e8 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -19,7 +19,7 @@ PHDRS {
data PT_LOAD FLAGS(7); /* RWE */
user PT_LOAD FLAGS(7); /* RWE */
data.init PT_LOAD FLAGS(7); /* RWE */
- note PT_NOTE FLAGS(4); /* R__ */
+ note PT_NOTE FLAGS(0); /* ___ */
}
SECTIONS
{
@@ -40,26 +40,17 @@ SECTIONS
_etext = .; /* End of text section */
} :text = 0x9090
+ NOTES :text :note
+
. = ALIGN(16); /* Exception table */
__ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
__start___ex_table = .;
*(__ex_table)
__stop___ex_table = .;
- }
-
- NOTES :text :note
-
- BUG_TABLE :text
+ } :text = 0x9090
RODATA
- . = ALIGN(4);
- .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) {
- __tracedata_start = .;
- *(.tracedata)
- __tracedata_end = .;
- }
-
. = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */
/* Data */
.data : AT(ADDR(.data) - LOAD_OFFSET) {
@@ -177,6 +168,7 @@ SECTIONS
*(.con_initcall.init)
}
__con_initcall_end = .;
+ . = ALIGN(16);
__x86cpuvendor_start = .;
.x86cpuvendor.init : AT(ADDR(.x86cpuvendor.init) - LOAD_OFFSET) {
*(.x86cpuvendor.init)
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index ba8c0b75ab0..0c029e8959c 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -15,9 +15,12 @@
#include <linux/init.h>
#include <linux/pci_ids.h>
#include <linux/pci_regs.h>
+
+#include <asm/apic.h>
#include <asm/pci-direct.h>
#include <asm/io.h>
#include <asm/paravirt.h>
+#include <asm/setup.h>
#if defined CONFIG_PCI && defined CONFIG_PARAVIRT
/*
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 61efa2f7d56..0b8b6690a86 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -42,7 +42,8 @@
#include <asm/topology.h>
#include <asm/vgtod.h>
-#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
+#define __vsyscall(nr) \
+ __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
#define __syscall_clobber "r11","cx","memory"
/*
@@ -249,7 +250,7 @@ static ctl_table kernel_root_table2[] = {
doesn't violate that. We'll find out if it does. */
static void __cpuinit vsyscall_set_cpu(int cpu)
{
- unsigned long *d;
+ unsigned long d;
unsigned long node = 0;
#ifdef CONFIG_NUMA
node = cpu_to_node(cpu);
@@ -260,11 +261,11 @@ static void __cpuinit vsyscall_set_cpu(int cpu)
/* Store cpu number in limit so that it can be loaded quickly
in user space in vgetcpu.
12 bits for the CPU and 8 bits for the node. */
- d = (unsigned long *)(get_cpu_gdt_table(cpu) + GDT_ENTRY_PER_CPU);
- *d = 0x0f40000000000ULL;
- *d |= cpu;
- *d |= (node & 0xf) << 12;
- *d |= (node >> 4) << 48;
+ d = 0x0f40000000000ULL;
+ d |= cpu;
+ d |= (node & 0xf) << 12;
+ d |= (node >> 4) << 48;
+ write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
}
static void __cpuinit cpu_vsyscall_init(void *arg)
@@ -278,7 +279,7 @@ cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
{
long cpu = (long)arg;
if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
- smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
+ smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);
return NOTIFY_DONE;
}
@@ -301,7 +302,7 @@ static int __init vsyscall_init(void)
#ifdef CONFIG_SYSCTL
register_sysctl_table(kernel_root_table2);
#endif
- on_each_cpu(cpu_vsyscall_init, NULL, 0, 1);
+ on_each_cpu(cpu_vsyscall_init, NULL, 1);
hotcpu_notifier(cpu_vsyscall_notifier, 0);
return 0;
}
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index f6c05d0410f..b545f371b5f 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -2,13 +2,20 @@
All C exports should go in the respective C files. */
#include <linux/module.h>
-#include <net/checksum.h>
#include <linux/smp.h>
+#include <net/checksum.h>
+
#include <asm/processor.h>
-#include <asm/uaccess.h>
#include <asm/pgtable.h>
+#include <asm/uaccess.h>
#include <asm/desc.h>
+#include <asm/ftrace.h>
+
+#ifdef CONFIG_FTRACE
+/* mcount is defined in assembly */
+EXPORT_SYMBOL(mcount);
+#endif
EXPORT_SYMBOL(kernel_thread);
@@ -53,8 +60,3 @@ EXPORT_SYMBOL(init_level4_pgt);
EXPORT_SYMBOL(load_gs_index);
EXPORT_SYMBOL(_proxy_pda);
-
-#ifdef CONFIG_PARAVIRT
-/* Virtualized guests may want to use it */
-EXPORT_SYMBOL_GPL(cpu_gdt_descr);
-#endif