diff options
Diffstat (limited to 'arch/x86/kernel')
124 files changed, 4852 insertions, 4937 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 430d5b24af7..d8e5d0cdd67 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -31,8 +31,8 @@ GCOV_PROFILE_paravirt.o := n obj-y := process_$(BITS).o signal.o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o -obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o -obj-y += setup.o i8259.o irqinit.o +obj-y += time.o ioport.o ldt.o dumpstack.o +obj-y += setup.o x86_init.o i8259.o irqinit.o obj-$(CONFIG_X86_VISWS) += visws_quirks.o obj-$(CONFIG_X86_32) += probe_roms_32.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o @@ -52,9 +52,11 @@ obj-$(CONFIG_X86_DS_SELFTEST) += ds_selftest.o obj-$(CONFIG_X86_32) += tls.o obj-$(CONFIG_IA32_EMULATION) += tls.o obj-y += step.o +obj-$(CONFIG_INTEL_TXT) += tboot.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += cpu/ obj-y += acpi/ +obj-$(CONFIG_SFI) += sfi.o obj-y += reboot.o obj-$(CONFIG_MCA) += mca_32.o obj-$(CONFIG_X86_MSR) += msr.o @@ -104,6 +106,7 @@ obj-$(CONFIG_SCx200) += scx200.o scx200-y += scx200_32.o obj-$(CONFIG_OLPC) += olpc.o +obj-$(CONFIG_X86_MRST) += mrst.o microcode-y := microcode_core.o microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 6b8ca3a0285..67e929b8987 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -833,106 +833,6 @@ static int __init acpi_parse_madt_lapic_entries(void) extern int es7000_plat; #endif -static struct { - int gsi_base; - int gsi_end; -} mp_ioapic_routing[MAX_IO_APICS]; - -int mp_find_ioapic(int gsi) -{ - int i = 0; - - /* Find the IOAPIC that manages this GSI. */ - for (i = 0; i < nr_ioapics; i++) { - if ((gsi >= mp_ioapic_routing[i].gsi_base) - && (gsi <= mp_ioapic_routing[i].gsi_end)) - return i; - } - - printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); - return -1; -} - -int mp_find_ioapic_pin(int ioapic, int gsi) -{ - if (WARN_ON(ioapic == -1)) - return -1; - if (WARN_ON(gsi > mp_ioapic_routing[ioapic].gsi_end)) - return -1; - - return gsi - mp_ioapic_routing[ioapic].gsi_base; -} - -static u8 __init uniq_ioapic_id(u8 id) -{ -#ifdef CONFIG_X86_32 - if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && - !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) - return io_apic_get_unique_id(nr_ioapics, id); - else - return id; -#else - int i; - DECLARE_BITMAP(used, 256); - bitmap_zero(used, 256); - for (i = 0; i < nr_ioapics; i++) { - struct mpc_ioapic *ia = &mp_ioapics[i]; - __set_bit(ia->apicid, used); - } - if (!test_bit(id, used)) - return id; - return find_first_zero_bit(used, 256); -#endif -} - -static int bad_ioapic(unsigned long address) -{ - if (nr_ioapics >= MAX_IO_APICS) { - printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " - "(found %d)\n", MAX_IO_APICS, nr_ioapics); - panic("Recompile kernel with bigger MAX_IO_APICS!\n"); - } - if (!address) { - printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address" - " found in table, skipping!\n"); - return 1; - } - return 0; -} - -void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) -{ - int idx = 0; - - if (bad_ioapic(address)) - return; - - idx = nr_ioapics; - - mp_ioapics[idx].type = MP_IOAPIC; - mp_ioapics[idx].flags = MPC_APIC_USABLE; - mp_ioapics[idx].apicaddr = address; - - set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); - mp_ioapics[idx].apicid = uniq_ioapic_id(id); - mp_ioapics[idx].apicver = io_apic_get_version(idx); - - /* - * Build basic GSI lookup table to facilitate gsi->io_apic lookups - * and to prevent reprogramming of IOAPIC pins (PCI GSIs). - */ - mp_ioapic_routing[idx].gsi_base = gsi_base; - mp_ioapic_routing[idx].gsi_end = gsi_base + - io_apic_get_redir_entries(idx); - - printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " - "GSI %d-%d\n", idx, mp_ioapics[idx].apicid, - mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr, - mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end); - - nr_ioapics++; -} - int __init acpi_probe_gsi(void) { int idx; @@ -947,7 +847,7 @@ int __init acpi_probe_gsi(void) max_gsi = 0; for (idx = 0; idx < nr_ioapics; idx++) { - gsi = mp_ioapic_routing[idx].gsi_end; + gsi = mp_gsi_routing[idx].gsi_end; if (gsi > max_gsi) max_gsi = gsi; @@ -1179,9 +1079,8 @@ static int __init acpi_parse_madt_ioapic_entries(void) * If MPS is present, it will handle them, * otherwise the system will stay in PIC mode */ - if (acpi_disabled || acpi_noirq) { + if (acpi_disabled || acpi_noirq) return -ENODEV; - } if (!cpu_has_apic) return -ENODEV; diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 8c44c232efc..59cdfa4686b 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -48,7 +48,7 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags, * P4, Core and beyond CPUs */ if (c->x86_vendor == X86_VENDOR_INTEL && - (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 14))) + (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 14))) flags->bm_control = 0; } EXPORT_SYMBOL(acpi_processor_power_init_bm_check); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index f5765870257..de7353c0ce9 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2,6 +2,7 @@ #include <linux/sched.h> #include <linux/mutex.h> #include <linux/list.h> +#include <linux/stringify.h> #include <linux/kprobes.h> #include <linux/mm.h> #include <linux/vmalloc.h> @@ -32,7 +33,7 @@ __setup("smp-alt-boot", bootonly); #define smp_alt_once 1 #endif -static int debug_alternative; +static int __initdata_or_module debug_alternative; static int __init debug_alt(char *str) { @@ -51,7 +52,7 @@ static int __init setup_noreplace_smp(char *str) __setup("noreplace-smp", setup_noreplace_smp); #ifdef CONFIG_PARAVIRT -static int noreplace_paravirt = 0; +static int __initdata_or_module noreplace_paravirt = 0; static int __init setup_noreplace_paravirt(char *str) { @@ -64,16 +65,17 @@ __setup("noreplace-paravirt", setup_noreplace_paravirt); #define DPRINTK(fmt, args...) if (debug_alternative) \ printk(KERN_DEBUG fmt, args) -#ifdef GENERIC_NOP1 +#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64) /* Use inline assembly to define this because the nops are defined as inline assembly strings in the include files and we cannot get them easily into strings. */ -asm("\t.section .rodata, \"a\"\nintelnops: " +asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nintelnops: " GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6 GENERIC_NOP7 GENERIC_NOP8 "\t.previous"); extern const unsigned char intelnops[]; -static const unsigned char *const intel_nops[ASM_NOP_MAX+1] = { +static const unsigned char *const __initconst_or_module +intel_nops[ASM_NOP_MAX+1] = { NULL, intelnops, intelnops + 1, @@ -87,12 +89,13 @@ static const unsigned char *const intel_nops[ASM_NOP_MAX+1] = { #endif #ifdef K8_NOP1 -asm("\t.section .rodata, \"a\"\nk8nops: " +asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk8nops: " K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6 K8_NOP7 K8_NOP8 "\t.previous"); extern const unsigned char k8nops[]; -static const unsigned char *const k8_nops[ASM_NOP_MAX+1] = { +static const unsigned char *const __initconst_or_module +k8_nops[ASM_NOP_MAX+1] = { NULL, k8nops, k8nops + 1, @@ -105,13 +108,14 @@ static const unsigned char *const k8_nops[ASM_NOP_MAX+1] = { }; #endif -#ifdef K7_NOP1 -asm("\t.section .rodata, \"a\"\nk7nops: " +#if defined(K7_NOP1) && !defined(CONFIG_X86_64) +asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk7nops: " K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6 K7_NOP7 K7_NOP8 "\t.previous"); extern const unsigned char k7nops[]; -static const unsigned char *const k7_nops[ASM_NOP_MAX+1] = { +static const unsigned char *const __initconst_or_module +k7_nops[ASM_NOP_MAX+1] = { NULL, k7nops, k7nops + 1, @@ -125,12 +129,13 @@ static const unsigned char *const k7_nops[ASM_NOP_MAX+1] = { #endif #ifdef P6_NOP1 -asm("\t.section .rodata, \"a\"\np6nops: " +asm("\t" __stringify(__INITRODATA_OR_MODULE) "\np6nops: " P6_NOP1 P6_NOP2 P6_NOP3 P6_NOP4 P6_NOP5 P6_NOP6 P6_NOP7 P6_NOP8 "\t.previous"); extern const unsigned char p6nops[]; -static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = { +static const unsigned char *const __initconst_or_module +p6_nops[ASM_NOP_MAX+1] = { NULL, p6nops, p6nops + 1, @@ -146,7 +151,7 @@ static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = { #ifdef CONFIG_X86_64 extern char __vsyscall_0; -const unsigned char *const *find_nop_table(void) +static const unsigned char *const *__init_or_module find_nop_table(void) { if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && boot_cpu_has(X86_FEATURE_NOPL)) @@ -157,7 +162,7 @@ const unsigned char *const *find_nop_table(void) #else /* CONFIG_X86_64 */ -const unsigned char *const *find_nop_table(void) +static const unsigned char *const *__init_or_module find_nop_table(void) { if (boot_cpu_has(X86_FEATURE_K8)) return k8_nops; @@ -172,7 +177,7 @@ const unsigned char *const *find_nop_table(void) #endif /* CONFIG_X86_64 */ /* Use this to add nops to a buffer, then text_poke the whole buffer. */ -void add_nops(void *insns, unsigned int len) +static void __init_or_module add_nops(void *insns, unsigned int len) { const unsigned char *const *noptable = find_nop_table(); @@ -185,10 +190,10 @@ void add_nops(void *insns, unsigned int len) len -= noplen; } } -EXPORT_SYMBOL_GPL(add_nops); extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern u8 *__smp_locks[], *__smp_locks_end[]; +static void *text_poke_early(void *addr, const void *opcode, size_t len); /* Replace instructions with better alternatives for this CPU type. This runs before SMP is initialized to avoid SMP problems with @@ -196,7 +201,8 @@ extern u8 *__smp_locks[], *__smp_locks_end[]; APs have less capabilities than the boot processor are not handled. Tough. Make sure you disable such features by hand. */ -void apply_alternatives(struct alt_instr *start, struct alt_instr *end) +void __init_or_module apply_alternatives(struct alt_instr *start, + struct alt_instr *end) { struct alt_instr *a; char insnbuf[MAX_PATCH_LEN]; @@ -279,9 +285,10 @@ static LIST_HEAD(smp_alt_modules); static DEFINE_MUTEX(smp_alt); static int smp_mode = 1; /* protected by smp_alt */ -void alternatives_smp_module_add(struct module *mod, char *name, - void *locks, void *locks_end, - void *text, void *text_end) +void __init_or_module alternatives_smp_module_add(struct module *mod, + char *name, + void *locks, void *locks_end, + void *text, void *text_end) { struct smp_alt_module *smp; @@ -317,7 +324,7 @@ void alternatives_smp_module_add(struct module *mod, char *name, mutex_unlock(&smp_alt); } -void alternatives_smp_module_del(struct module *mod) +void __init_or_module alternatives_smp_module_del(struct module *mod) { struct smp_alt_module *item; @@ -386,8 +393,8 @@ void alternatives_smp_switch(int smp) #endif #ifdef CONFIG_PARAVIRT -void apply_paravirt(struct paravirt_patch_site *start, - struct paravirt_patch_site *end) +void __init_or_module apply_paravirt(struct paravirt_patch_site *start, + struct paravirt_patch_site *end) { struct paravirt_patch_site *p; char insnbuf[MAX_PATCH_LEN]; @@ -485,13 +492,14 @@ void __init alternative_instructions(void) * instructions. And on the local CPU you need to be protected again NMI or MCE * handlers seeing an inconsistent instruction while you patch. */ -void *text_poke_early(void *addr, const void *opcode, size_t len) +static void *__init_or_module text_poke_early(void *addr, const void *opcode, + size_t len) { unsigned long flags; local_irq_save(flags); memcpy(addr, opcode, len); - local_irq_restore(flags); sync_core(); + local_irq_restore(flags); /* Could also do a CLFLUSH here to speed up CPU recovery; but that causes hangs on some VIA CPUs. */ return addr; diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index f95dfe52644..0285521e0a9 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -41,9 +41,13 @@ static DEFINE_RWLOCK(amd_iommu_devtable_lock); static LIST_HEAD(iommu_pd_list); static DEFINE_SPINLOCK(iommu_pd_list_lock); -#ifdef CONFIG_IOMMU_API +/* + * Domain for untranslated devices - only allocated + * if iommu=pt passed on kernel cmd line. + */ +static struct protection_domain *pt_domain; + static struct iommu_ops amd_iommu_ops; -#endif /* * general struct to manage commands send to an IOMMU @@ -55,16 +59,16 @@ struct iommu_cmd { static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, struct unity_map_entry *e); static struct dma_ops_domain *find_protection_domain(u16 devid); -static u64* alloc_pte(struct protection_domain *dom, - unsigned long address, u64 - **pte_page, gfp_t gfp); +static u64 *alloc_pte(struct protection_domain *domain, + unsigned long address, int end_lvl, + u64 **pte_page, gfp_t gfp); static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, unsigned long start_page, unsigned int pages); - -#ifndef BUS_NOTIFY_UNBOUND_DRIVER -#define BUS_NOTIFY_UNBOUND_DRIVER 0x0005 -#endif +static void reset_iommu_command_buffer(struct amd_iommu *iommu); +static u64 *fetch_pte(struct protection_domain *domain, + unsigned long address, int map_size); +static void update_domain(struct protection_domain *domain); #ifdef CONFIG_AMD_IOMMU_STATS @@ -138,7 +142,25 @@ static int iommu_has_npcache(struct amd_iommu *iommu) * ****************************************************************************/ -static void iommu_print_event(void *__evt) +static void dump_dte_entry(u16 devid) +{ + int i; + + for (i = 0; i < 8; ++i) + pr_err("AMD-Vi: DTE[%d]: %08x\n", i, + amd_iommu_dev_table[devid].data[i]); +} + +static void dump_command(unsigned long phys_addr) +{ + struct iommu_cmd *cmd = phys_to_virt(phys_addr); + int i; + + for (i = 0; i < 4; ++i) + pr_err("AMD-Vi: CMD[%d]: %08x\n", i, cmd->data[i]); +} + +static void iommu_print_event(struct amd_iommu *iommu, void *__evt) { u32 *event = __evt; int type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK; @@ -147,7 +169,7 @@ static void iommu_print_event(void *__evt) int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK; u64 address = (u64)(((u64)event[3]) << 32) | event[2]; - printk(KERN_ERR "AMD IOMMU: Event logged ["); + printk(KERN_ERR "AMD-Vi: Event logged ["); switch (type) { case EVENT_TYPE_ILL_DEV: @@ -155,6 +177,7 @@ static void iommu_print_event(void *__evt) "address=0x%016llx flags=0x%04x]\n", PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), address, flags); + dump_dte_entry(devid); break; case EVENT_TYPE_IO_FAULT: printk("IO_PAGE_FAULT device=%02x:%02x.%x " @@ -176,6 +199,8 @@ static void iommu_print_event(void *__evt) break; case EVENT_TYPE_ILL_CMD: printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address); + reset_iommu_command_buffer(iommu); + dump_command(address); break; case EVENT_TYPE_CMD_HARD_ERR: printk("COMMAND_HARDWARE_ERROR address=0x%016llx " @@ -209,7 +234,7 @@ static void iommu_poll_events(struct amd_iommu *iommu) tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET); while (head != tail) { - iommu_print_event(iommu->evt_buf + head); + iommu_print_event(iommu, iommu->evt_buf + head); head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size; } @@ -296,8 +321,11 @@ static void __iommu_wait_for_completion(struct amd_iommu *iommu) status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); - if (unlikely(i == EXIT_LOOP_COUNT)) - panic("AMD IOMMU: Completion wait loop failed\n"); + if (unlikely(i == EXIT_LOOP_COUNT)) { + spin_unlock(&iommu->lock); + reset_iommu_command_buffer(iommu); + spin_lock(&iommu->lock); + } } /* @@ -445,47 +473,78 @@ static void iommu_flush_tlb_pde(struct amd_iommu *iommu, u16 domid) } /* + * This function flushes one domain on one IOMMU + */ +static void flush_domain_on_iommu(struct amd_iommu *iommu, u16 domid) +{ + struct iommu_cmd cmd; + unsigned long flags; + + __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, + domid, 1, 1); + + spin_lock_irqsave(&iommu->lock, flags); + __iommu_queue_command(iommu, &cmd); + __iommu_completion_wait(iommu); + __iommu_wait_for_completion(iommu); + spin_unlock_irqrestore(&iommu->lock, flags); +} + +static void flush_all_domains_on_iommu(struct amd_iommu *iommu) +{ + int i; + + for (i = 1; i < MAX_DOMAIN_ID; ++i) { + if (!test_bit(i, amd_iommu_pd_alloc_bitmap)) + continue; + flush_domain_on_iommu(iommu, i); + } + +} + +/* * This function is used to flush the IO/TLB for a given protection domain * on every IOMMU in the system */ static void iommu_flush_domain(u16 domid) { - unsigned long flags; struct amd_iommu *iommu; - struct iommu_cmd cmd; INC_STATS_COUNTER(domain_flush_all); - __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, - domid, 1, 1); - - for_each_iommu(iommu) { - spin_lock_irqsave(&iommu->lock, flags); - __iommu_queue_command(iommu, &cmd); - __iommu_completion_wait(iommu); - __iommu_wait_for_completion(iommu); - spin_unlock_irqrestore(&iommu->lock, flags); - } + for_each_iommu(iommu) + flush_domain_on_iommu(iommu, domid); } void amd_iommu_flush_all_domains(void) { + struct amd_iommu *iommu; + + for_each_iommu(iommu) + flush_all_domains_on_iommu(iommu); +} + +static void flush_all_devices_for_iommu(struct amd_iommu *iommu) +{ int i; - for (i = 1; i < MAX_DOMAIN_ID; ++i) { - if (!test_bit(i, amd_iommu_pd_alloc_bitmap)) + for (i = 0; i <= amd_iommu_last_bdf; ++i) { + if (iommu != amd_iommu_rlookup_table[i]) continue; - iommu_flush_domain(i); + + iommu_queue_inv_dev_entry(iommu, i); + iommu_completion_wait(iommu); } } -void amd_iommu_flush_all_devices(void) +static void flush_devices_by_domain(struct protection_domain *domain) { struct amd_iommu *iommu; int i; for (i = 0; i <= amd_iommu_last_bdf; ++i) { - if (amd_iommu_pd_table[i] == NULL) + if ((domain == NULL && amd_iommu_pd_table[i] == NULL) || + (amd_iommu_pd_table[i] != domain)) continue; iommu = amd_iommu_rlookup_table[i]; @@ -497,6 +556,27 @@ void amd_iommu_flush_all_devices(void) } } +static void reset_iommu_command_buffer(struct amd_iommu *iommu) +{ + pr_err("AMD-Vi: Resetting IOMMU command buffer\n"); + + if (iommu->reset_in_progress) + panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n"); + + iommu->reset_in_progress = true; + + amd_iommu_reset_cmd_buffer(iommu); + flush_all_devices_for_iommu(iommu); + flush_all_domains_on_iommu(iommu); + + iommu->reset_in_progress = false; +} + +void amd_iommu_flush_all_devices(void) +{ + flush_devices_by_domain(NULL); +} + /**************************************************************************** * * The functions below are used the create the page table mappings for @@ -514,18 +594,21 @@ void amd_iommu_flush_all_devices(void) static int iommu_map_page(struct protection_domain *dom, unsigned long bus_addr, unsigned long phys_addr, - int prot) + int prot, + int map_size) { u64 __pte, *pte; bus_addr = PAGE_ALIGN(bus_addr); phys_addr = PAGE_ALIGN(phys_addr); - /* only support 512GB address spaces for now */ - if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK)) + BUG_ON(!PM_ALIGNED(map_size, bus_addr)); + BUG_ON(!PM_ALIGNED(map_size, phys_addr)); + + if (!(prot & IOMMU_PROT_MASK)) return -EINVAL; - pte = alloc_pte(dom, bus_addr, NULL, GFP_KERNEL); + pte = alloc_pte(dom, bus_addr, map_size, NULL, GFP_KERNEL); if (IOMMU_PTE_PRESENT(*pte)) return -EBUSY; @@ -538,29 +621,18 @@ static int iommu_map_page(struct protection_domain *dom, *pte = __pte; + update_domain(dom); + return 0; } static void iommu_unmap_page(struct protection_domain *dom, - unsigned long bus_addr) + unsigned long bus_addr, int map_size) { - u64 *pte; - - pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)]; - - if (!IOMMU_PTE_PRESENT(*pte)) - return; - - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)]; + u64 *pte = fetch_pte(dom, bus_addr, map_size); - if (!IOMMU_PTE_PRESENT(*pte)) - return; - - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)]; - - *pte = 0; + if (pte) + *pte = 0; } /* @@ -615,7 +687,8 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, for (addr = e->address_start; addr < e->address_end; addr += PAGE_SIZE) { - ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot); + ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot, + PM_MAP_4k); if (ret) return ret; /* @@ -670,24 +743,29 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, * This function checks if there is a PTE for a given dma address. If * there is one, it returns the pointer to it. */ -static u64* fetch_pte(struct protection_domain *domain, - unsigned long address) +static u64 *fetch_pte(struct protection_domain *domain, + unsigned long address, int map_size) { + int level; u64 *pte; - pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(address)]; + level = domain->mode - 1; + pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; - if (!IOMMU_PTE_PRESENT(*pte)) - return NULL; + while (level > map_size) { + if (!IOMMU_PTE_PRESENT(*pte)) + return NULL; - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[IOMMU_PTE_L1_INDEX(address)]; + level -= 1; - if (!IOMMU_PTE_PRESENT(*pte)) - return NULL; + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[PM_LEVEL_INDEX(level, address)]; - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[IOMMU_PTE_L0_INDEX(address)]; + if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) { + pte = NULL; + break; + } + } return pte; } @@ -727,7 +805,7 @@ static int alloc_new_range(struct amd_iommu *iommu, u64 *pte, *pte_page; for (i = 0; i < num_ptes; ++i) { - pte = alloc_pte(&dma_dom->domain, address, + pte = alloc_pte(&dma_dom->domain, address, PM_MAP_4k, &pte_page, gfp); if (!pte) goto out_free; @@ -760,16 +838,20 @@ static int alloc_new_range(struct amd_iommu *iommu, for (i = dma_dom->aperture[index]->offset; i < dma_dom->aperture_size; i += PAGE_SIZE) { - u64 *pte = fetch_pte(&dma_dom->domain, i); + u64 *pte = fetch_pte(&dma_dom->domain, i, PM_MAP_4k); if (!pte || !IOMMU_PTE_PRESENT(*pte)) continue; dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1); } + update_domain(&dma_dom->domain); + return 0; out_free: + update_domain(&dma_dom->domain); + free_page((unsigned long)dma_dom->aperture[index]->bitmap); kfree(dma_dom->aperture[index]); @@ -1009,7 +1091,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu) dma_dom->domain.id = domain_id_alloc(); if (dma_dom->domain.id == 0) goto free_dma_dom; - dma_dom->domain.mode = PAGE_MODE_3_LEVEL; + dma_dom->domain.mode = PAGE_MODE_2_LEVEL; dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL); dma_dom->domain.flags = PD_DMA_OPS_MASK; dma_dom->domain.priv = dma_dom; @@ -1063,6 +1145,41 @@ static struct protection_domain *domain_for_device(u16 devid) return dom; } +static void set_dte_entry(u16 devid, struct protection_domain *domain) +{ + u64 pte_root = virt_to_phys(domain->pt_root); + + pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) + << DEV_ENTRY_MODE_SHIFT; + pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV; + + amd_iommu_dev_table[devid].data[2] = domain->id; + amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); + amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); + + amd_iommu_pd_table[devid] = domain; +} + +/* + * If a device is not yet associated with a domain, this function does + * assigns it visible for the hardware + */ +static void __attach_device(struct amd_iommu *iommu, + struct protection_domain *domain, + u16 devid) +{ + /* lock domain */ + spin_lock(&domain->lock); + + /* update DTE entry */ + set_dte_entry(devid, domain); + + domain->dev_cnt += 1; + + /* ready */ + spin_unlock(&domain->lock); +} + /* * If a device is not yet associated with a domain, this function does * assigns it visible for the hardware @@ -1072,27 +1189,16 @@ static void attach_device(struct amd_iommu *iommu, u16 devid) { unsigned long flags; - u64 pte_root = virt_to_phys(domain->pt_root); - - domain->dev_cnt += 1; - - pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) - << DEV_ENTRY_MODE_SHIFT; - pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV; write_lock_irqsave(&amd_iommu_devtable_lock, flags); - amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); - amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); - amd_iommu_dev_table[devid].data[2] = domain->id; - - amd_iommu_pd_table[devid] = domain; + __attach_device(iommu, domain, devid); write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); - /* - * We might boot into a crash-kernel here. The crashed kernel - * left the caches in the IOMMU dirty. So we have to flush - * here to evict all dirty stuff. - */ + /* + * We might boot into a crash-kernel here. The crashed kernel + * left the caches in the IOMMU dirty. So we have to flush + * here to evict all dirty stuff. + */ iommu_queue_inv_dev_entry(iommu, devid); iommu_flush_tlb_pde(iommu, domain->id); } @@ -1121,6 +1227,15 @@ static void __detach_device(struct protection_domain *domain, u16 devid) /* ready */ spin_unlock(&domain->lock); + + /* + * If we run in passthrough mode the device must be assigned to the + * passthrough domain if it is detached from any other domain + */ + if (iommu_pass_through) { + struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; + __attach_device(iommu, pt_domain, devid); + } } /* @@ -1166,6 +1281,8 @@ static int device_change_notifier(struct notifier_block *nb, case BUS_NOTIFY_UNBOUND_DRIVER: if (!domain) goto out; + if (iommu_pass_through) + break; detach_device(domain, devid); break; case BUS_NOTIFY_ADD_DEVICE: @@ -1294,39 +1411,91 @@ static int get_device_resources(struct device *dev, return 1; } +static void update_device_table(struct protection_domain *domain) +{ + unsigned long flags; + int i; + + for (i = 0; i <= amd_iommu_last_bdf; ++i) { + if (amd_iommu_pd_table[i] != domain) + continue; + write_lock_irqsave(&amd_iommu_devtable_lock, flags); + set_dte_entry(i, domain); + write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + } +} + +static void update_domain(struct protection_domain *domain) +{ + if (!domain->updated) + return; + + update_device_table(domain); + flush_devices_by_domain(domain); + iommu_flush_domain(domain->id); + + domain->updated = false; +} + /* - * If the pte_page is not yet allocated this function is called + * This function is used to add another level to an IO page table. Adding + * another level increases the size of the address space by 9 bits to a size up + * to 64 bits. */ -static u64* alloc_pte(struct protection_domain *dom, - unsigned long address, u64 **pte_page, gfp_t gfp) +static bool increase_address_space(struct protection_domain *domain, + gfp_t gfp) +{ + u64 *pte; + + if (domain->mode == PAGE_MODE_6_LEVEL) + /* address space already 64 bit large */ + return false; + + pte = (void *)get_zeroed_page(gfp); + if (!pte) + return false; + + *pte = PM_LEVEL_PDE(domain->mode, + virt_to_phys(domain->pt_root)); + domain->pt_root = pte; + domain->mode += 1; + domain->updated = true; + + return true; +} + +static u64 *alloc_pte(struct protection_domain *domain, + unsigned long address, + int end_lvl, + u64 **pte_page, + gfp_t gfp) { u64 *pte, *page; + int level; - pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(address)]; + while (address > PM_LEVEL_SIZE(domain->mode)) + increase_address_space(domain, gfp); - if (!IOMMU_PTE_PRESENT(*pte)) { - page = (u64 *)get_zeroed_page(gfp); - if (!page) - return NULL; - *pte = IOMMU_L2_PDE(virt_to_phys(page)); - } + level = domain->mode - 1; + pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[IOMMU_PTE_L1_INDEX(address)]; + while (level > end_lvl) { + if (!IOMMU_PTE_PRESENT(*pte)) { + page = (u64 *)get_zeroed_page(gfp); + if (!page) + return NULL; + *pte = PM_LEVEL_PDE(level, virt_to_phys(page)); + } - if (!IOMMU_PTE_PRESENT(*pte)) { - page = (u64 *)get_zeroed_page(gfp); - if (!page) - return NULL; - *pte = IOMMU_L1_PDE(virt_to_phys(page)); - } + level -= 1; - pte = IOMMU_PTE_PAGE(*pte); + pte = IOMMU_PTE_PAGE(*pte); - if (pte_page) - *pte_page = pte; + if (pte_page && level == end_lvl) + *pte_page = pte; - pte = &pte[IOMMU_PTE_L0_INDEX(address)]; + pte = &pte[PM_LEVEL_INDEX(level, address)]; + } return pte; } @@ -1346,10 +1515,13 @@ static u64* dma_ops_get_pte(struct dma_ops_domain *dom, pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)]; if (!pte) { - pte = alloc_pte(&dom->domain, address, &pte_page, GFP_ATOMIC); + pte = alloc_pte(&dom->domain, address, PM_MAP_4k, &pte_page, + GFP_ATOMIC); aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page; } else - pte += IOMMU_PTE_L0_INDEX(address); + pte += PM_LEVEL_INDEX(0, address); + + update_domain(&dom->domain); return pte; } @@ -1411,7 +1583,7 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu, if (!pte) return; - pte += IOMMU_PTE_L0_INDEX(address); + pte += PM_LEVEL_INDEX(0, address); WARN_ON(!*pte); @@ -1990,19 +2162,47 @@ static void cleanup_domain(struct protection_domain *domain) write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); } -static int amd_iommu_domain_init(struct iommu_domain *dom) +static void protection_domain_free(struct protection_domain *domain) +{ + if (!domain) + return; + + if (domain->id) + domain_id_free(domain->id); + + kfree(domain); +} + +static struct protection_domain *protection_domain_alloc(void) { struct protection_domain *domain; domain = kzalloc(sizeof(*domain), GFP_KERNEL); if (!domain) - return -ENOMEM; + return NULL; spin_lock_init(&domain->lock); - domain->mode = PAGE_MODE_3_LEVEL; domain->id = domain_id_alloc(); if (!domain->id) + goto out_err; + + return domain; + +out_err: + kfree(domain); + + return NULL; +} + +static int amd_iommu_domain_init(struct iommu_domain *dom) +{ + struct protection_domain *domain; + + domain = protection_domain_alloc(); + if (!domain) goto out_free; + + domain->mode = PAGE_MODE_3_LEVEL; domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL); if (!domain->pt_root) goto out_free; @@ -2012,7 +2212,7 @@ static int amd_iommu_domain_init(struct iommu_domain *dom) return 0; out_free: - kfree(domain); + protection_domain_free(domain); return -ENOMEM; } @@ -2117,7 +2317,7 @@ static int amd_iommu_map_range(struct iommu_domain *dom, paddr &= PAGE_MASK; for (i = 0; i < npages; ++i) { - ret = iommu_map_page(domain, iova, paddr, prot); + ret = iommu_map_page(domain, iova, paddr, prot, PM_MAP_4k); if (ret) return ret; @@ -2138,7 +2338,7 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom, iova &= PAGE_MASK; for (i = 0; i < npages; ++i) { - iommu_unmap_page(domain, iova); + iommu_unmap_page(domain, iova, PM_MAP_4k); iova += PAGE_SIZE; } @@ -2153,21 +2353,9 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, phys_addr_t paddr; u64 *pte; - pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(iova)]; - - if (!IOMMU_PTE_PRESENT(*pte)) - return 0; - - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[IOMMU_PTE_L1_INDEX(iova)]; - - if (!IOMMU_PTE_PRESENT(*pte)) - return 0; - - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[IOMMU_PTE_L0_INDEX(iova)]; + pte = fetch_pte(domain, iova, PM_MAP_4k); - if (!IOMMU_PTE_PRESENT(*pte)) + if (!pte || !IOMMU_PTE_PRESENT(*pte)) return 0; paddr = *pte & IOMMU_PAGE_MASK; @@ -2193,3 +2381,46 @@ static struct iommu_ops amd_iommu_ops = { .domain_has_cap = amd_iommu_domain_has_cap, }; +/***************************************************************************** + * + * The next functions do a basic initialization of IOMMU for pass through + * mode + * + * In passthrough mode the IOMMU is initialized and enabled but not used for + * DMA-API translation. + * + *****************************************************************************/ + +int __init amd_iommu_init_passthrough(void) +{ + struct pci_dev *dev = NULL; + u16 devid, devid2; + + /* allocate passthroug domain */ + pt_domain = protection_domain_alloc(); + if (!pt_domain) + return -ENOMEM; + + pt_domain->mode |= PAGE_MODE_NONE; + + while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { + struct amd_iommu *iommu; + + devid = calc_devid(dev->bus->number, dev->devfn); + if (devid > amd_iommu_last_bdf) + continue; + + devid2 = amd_iommu_alias_table[devid]; + + iommu = amd_iommu_rlookup_table[devid2]; + if (!iommu) + continue; + + __attach_device(iommu, pt_domain, devid); + __attach_device(iommu, pt_domain, devid2); + } + + pr_info("AMD-Vi: Initialized for Passthrough Mode\n"); + + return 0; +} diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 1e423b2add1..c20001e4f55 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -252,7 +252,7 @@ static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit) /* Function to enable the hardware */ static void iommu_enable(struct amd_iommu *iommu) { - printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n", + printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx\n", dev_name(&iommu->dev->dev), iommu->cap_ptr); iommu_feature_enable(iommu, CONTROL_IOMMU_EN); @@ -435,6 +435,20 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) } /* + * This function resets the command buffer if the IOMMU stopped fetching + * commands from it. + */ +void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu) +{ + iommu_feature_disable(iommu, CONTROL_CMDBUF_EN); + + writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); + writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); + + iommu_feature_enable(iommu, CONTROL_CMDBUF_EN); +} + +/* * This function writes the command buffer address to the hardware and * enables it. */ @@ -450,11 +464,7 @@ static void iommu_enable_command_buffer(struct amd_iommu *iommu) memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, &entry, sizeof(entry)); - /* set head and tail to zero manually */ - writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); - writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); - - iommu_feature_enable(iommu, CONTROL_CMDBUF_EN); + amd_iommu_reset_cmd_buffer(iommu); } static void __init free_command_buffer(struct amd_iommu *iommu) @@ -880,7 +890,7 @@ static int __init init_iommu_all(struct acpi_table_header *table) switch (*p) { case ACPI_IVHD_TYPE: - DUMP_printk("IOMMU: device: %02x:%02x.%01x cap: %04x " + DUMP_printk("device: %02x:%02x.%01x cap: %04x " "seg: %d flags: %01x info %04x\n", PCI_BUS(h->devid), PCI_SLOT(h->devid), PCI_FUNC(h->devid), h->cap_ptr, @@ -924,7 +934,7 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu) r = request_irq(iommu->dev->irq, amd_iommu_int_handler, IRQF_SAMPLE_RANDOM, - "AMD IOMMU", + "AMD-Vi", NULL); if (r) { @@ -1172,7 +1182,7 @@ int __init amd_iommu_init(void) if (no_iommu) { - printk(KERN_INFO "AMD IOMMU disabled by kernel command line\n"); + printk(KERN_INFO "AMD-Vi disabled by kernel command line\n"); return 0; } @@ -1264,22 +1274,28 @@ int __init amd_iommu_init(void) if (ret) goto free; - ret = amd_iommu_init_dma_ops(); + if (iommu_pass_through) + ret = amd_iommu_init_passthrough(); + else + ret = amd_iommu_init_dma_ops(); if (ret) goto free; enable_iommus(); - printk(KERN_INFO "AMD IOMMU: device isolation "); + if (iommu_pass_through) + goto out; + + printk(KERN_INFO "AMD-Vi: device isolation "); if (amd_iommu_isolate) printk("enabled\n"); else printk("disabled\n"); if (amd_iommu_unmap_flush) - printk(KERN_INFO "AMD IOMMU: IO/TLB flush on unmap enabled\n"); + printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n"); else - printk(KERN_INFO "AMD IOMMU: Lazy IO/TLB flushing enabled\n"); + printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n"); out: return ret; diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 676debfc170..128111d8ffe 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -20,6 +20,7 @@ #include <linux/bitops.h> #include <linux/ioport.h> #include <linux/suspend.h> +#include <linux/kmemleak.h> #include <asm/e820.h> #include <asm/io.h> #include <asm/iommu.h> @@ -94,6 +95,11 @@ static u32 __init allocate_aperture(void) * code for safe */ p = __alloc_bootmem_nopanic(aper_size, aper_size, 512ULL<<20); + /* + * Kmemleak should not scan this block as it may not be mapped via the + * kernel direct mapping. + */ + kmemleak_ignore(p); if (!p || __pa(p)+aper_size > 0xffffffff) { printk(KERN_ERR "Cannot allocate aperture memory hole (%p,%uK)\n", diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 0a1c2830ec6..894aa97f071 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -14,7 +14,7 @@ * Mikael Pettersson : PM converted to driver model. */ -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <linux/kernel_stat.h> #include <linux/mc146818rtc.h> #include <linux/acpi_pmtmr.h> @@ -35,7 +35,8 @@ #include <linux/smp.h> #include <linux/mm.h> -#include <asm/perf_counter.h> +#include <asm/perf_event.h> +#include <asm/x86_init.h> #include <asm/pgalloc.h> #include <asm/atomic.h> #include <asm/mpspec.h> @@ -49,6 +50,7 @@ #include <asm/mtrr.h> #include <asm/smp.h> #include <asm/mce.h> +#include <asm/kvm_para.h> unsigned int num_processors; @@ -60,7 +62,7 @@ unsigned int boot_cpu_physical_apicid = -1U; /* * The highest APIC ID seen during enumeration. * - * This determines the messaging protocol we can use: if all APIC IDs + * On AMD, this determines the messaging protocol we can use: if all APIC IDs * are in the 0 ... 7 range, then we can use logical addressing which * has some performance advantages (better broadcasting). * @@ -977,7 +979,7 @@ void lapic_shutdown(void) { unsigned long flags; - if (!cpu_has_apic) + if (!cpu_has_apic && !apic_from_smp_config()) return; local_irq_save(flags); @@ -1187,7 +1189,7 @@ void __cpuinit setup_local_APIC(void) apic_write(APIC_ESR, 0); } #endif - perf_counters_lapic_init(); + perf_events_lapic_init(); preempt_disable(); @@ -1195,8 +1197,7 @@ void __cpuinit setup_local_APIC(void) * Double-check whether this APIC is really registered. * This is meaningless in clustered apic mode, so we skip it. */ - if (!apic->apic_id_registered()) - BUG(); + BUG_ON(!apic->apic_id_registered()); /* * Intel recommends to set DFR, LDR and TPR before enabling @@ -1361,52 +1362,80 @@ void enable_x2apic(void) } #endif /* CONFIG_X86_X2APIC */ -void __init enable_IR_x2apic(void) +int __init enable_IR(void) { #ifdef CONFIG_INTR_REMAP - int ret; - unsigned long flags; - struct IO_APIC_route_entry **ioapic_entries = NULL; - - ret = dmar_table_init(); - if (ret) { - pr_debug("dmar_table_init() failed with %d:\n", ret); - goto ir_failed; - } - if (!intr_remapping_supported()) { pr_debug("intr-remapping not supported\n"); - goto ir_failed; + return 0; } - if (!x2apic_preenabled && skip_ioapic_setup) { pr_info("Skipped enabling intr-remap because of skipping " "io-apic setup\n"); - return; + return 0; } + if (enable_intr_remapping(x2apic_supported())) + return 0; + + pr_info("Enabled Interrupt-remapping\n"); + + return 1; + +#endif + return 0; +} + +void __init enable_IR_x2apic(void) +{ + unsigned long flags; + struct IO_APIC_route_entry **ioapic_entries = NULL; + int ret, x2apic_enabled = 0; + int dmar_table_init_ret = 0; + +#ifdef CONFIG_INTR_REMAP + dmar_table_init_ret = dmar_table_init(); + if (dmar_table_init_ret) + pr_debug("dmar_table_init() failed with %d:\n", + dmar_table_init_ret); +#endif + ioapic_entries = alloc_ioapic_entries(); if (!ioapic_entries) { - pr_info("Allocate ioapic_entries failed: %d\n", ret); - goto end; + pr_err("Allocate ioapic_entries failed\n"); + goto out; } ret = save_IO_APIC_setup(ioapic_entries); if (ret) { pr_info("Saving IO-APIC state failed: %d\n", ret); - goto end; + goto out; } local_irq_save(flags); - mask_IO_APIC_setup(ioapic_entries); mask_8259A(); + mask_IO_APIC_setup(ioapic_entries); - ret = enable_intr_remapping(x2apic_supported()); - if (ret) - goto end_restore; + if (dmar_table_init_ret) + ret = 0; + else + ret = enable_IR(); - pr_info("Enabled Interrupt-remapping\n"); + if (!ret) { + /* IR is required if there is APIC ID > 255 even when running + * under KVM + */ + if (max_physical_apicid > 255 || !kvm_para_available()) + goto nox2apic; + /* + * without IR all CPUs can be addressed by IOAPIC/MSI + * only in physical mode + */ + x2apic_force_phys(); + } + + x2apic_enabled = 1; if (x2apic_supported() && !x2apic_mode) { x2apic_mode = 1; @@ -1414,41 +1443,25 @@ void __init enable_IR_x2apic(void) pr_info("Enabled x2apic\n"); } -end_restore: - if (ret) - /* - * IR enabling failed - */ +nox2apic: + if (!ret) /* IR enabling failed */ restore_IO_APIC_setup(ioapic_entries); - unmask_8259A(); local_irq_restore(flags); -end: +out: if (ioapic_entries) free_ioapic_entries(ioapic_entries); - if (!ret) + if (x2apic_enabled) return; -ir_failed: if (x2apic_preenabled) - panic("x2apic enabled by bios. But IR enabling failed"); + panic("x2apic: enabled by BIOS but kernel init failed."); else if (cpu_has_x2apic) - pr_info("Not enabling x2apic,Intr-remapping\n"); -#else - if (!cpu_has_x2apic) - return; - - if (x2apic_preenabled) - panic("x2apic enabled prior OS handover," - " enable CONFIG_X86_X2APIC, CONFIG_INTR_REMAP"); -#endif - - return; + pr_info("Not enabling x2apic, Intr-remapping init failed.\n"); } - #ifdef CONFIG_X86_64 /* * Detect and enable local APICs on non-SMP boards. @@ -1549,8 +1562,6 @@ no_apic: #ifdef CONFIG_X86_64 void __init early_init_lapic_mapping(void) { - unsigned long phys_addr; - /* * If no local APIC can be found then go out * : it means there is no mpatable and MADT @@ -1558,11 +1569,9 @@ void __init early_init_lapic_mapping(void) if (!smp_found_config) return; - phys_addr = mp_lapic_addr; - - set_fixmap_nocache(FIX_APIC_BASE, phys_addr); + set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", - APIC_BASE, phys_addr); + APIC_BASE, mp_lapic_addr); /* * Fetch the APIC ID of the BSP in case we have a @@ -1651,7 +1660,6 @@ int __init APIC_init_uniprocessor(void) APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { pr_err("BIOS bug, local APIC 0x%x not detected!...\n", boot_cpu_physical_apicid); - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); return -1; } #endif @@ -1701,7 +1709,7 @@ int __init APIC_init_uniprocessor(void) localise_nmi_watchdog(); #endif - setup_boot_clock(); + x86_init.timers.setup_percpu_clockev(); #ifdef CONFIG_X86_64 check_nmi_watchdog(); #endif @@ -1908,24 +1916,14 @@ void __cpuinit generic_processor_info(int apicid, int version) max_physical_apicid = apicid; #ifdef CONFIG_X86_32 - /* - * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y - * but we need to work other dependencies like SMP_SUSPEND etc - * before this can be done without some confusion. - * if (CPU_HOTPLUG_ENABLED || num_processors > 8) - * - Ashok Raj <ashok.raj@intel.com> - */ - if (max_physical_apicid >= 8) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - if (!APIC_XAPIC(version)) { - def_to_bigsmp = 0; - break; - } - /* If P4 and above fall through */ - case X86_VENDOR_AMD: + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + if (num_processors > 8) + def_to_bigsmp = 1; + break; + case X86_VENDOR_AMD: + if (max_physical_apicid >= 8) def_to_bigsmp = 1; - } } #endif diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 676cdac385c..77a06413b6b 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -112,7 +112,7 @@ static physid_mask_t bigsmp_ioapic_phys_id_map(physid_mask_t phys_map) return physids_promote(0xFFL); } -static int bigsmp_check_phys_apicid_present(int boot_cpu_physical_apicid) +static int bigsmp_check_phys_apicid_present(int phys_apicid) { return 1; } diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 8952a589028..89174f847b4 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -167,7 +167,7 @@ static int es7000_apic_is_cluster(void) { /* MPENTIUMIII */ if (boot_cpu_data.x86 == 6 && - (boot_cpu_data.x86_model >= 7 || boot_cpu_data.x86_model <= 11)) + (boot_cpu_data.x86_model >= 7 && boot_cpu_data.x86_model <= 11)) return 1; return 0; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 2284a4812b6..dc69f28489f 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -66,6 +66,8 @@ #include <asm/apic.h> #define __apicdebuginit(type) static type __init +#define for_each_irq_pin(entry, head) \ + for (entry = head; entry; entry = entry->next) /* * Is the SiS APIC rmw bug present ? @@ -85,12 +87,20 @@ int nr_ioapic_registers[MAX_IO_APICS]; struct mpc_ioapic mp_ioapics[MAX_IO_APICS]; int nr_ioapics; +/* IO APIC gsi routing info */ +struct mp_ioapic_gsi mp_gsi_routing[MAX_IO_APICS]; + /* MP IRQ source entries */ struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; /* # of MP IRQ source entries */ int mp_irq_entries; +/* Number of legacy interrupts */ +static int nr_legacy_irqs __read_mostly = NR_IRQS_LEGACY; +/* GSI interrupts */ +static int nr_irqs_gsi = NR_IRQS_LEGACY; + #if defined (CONFIG_MCA) || defined (CONFIG_EISA) int mp_bus_id_to_type[MAX_MP_BUSSES]; #endif @@ -116,15 +126,6 @@ static int __init parse_noapic(char *str) } early_param("noapic", parse_noapic); -struct irq_pin_list; - -/* - * This is performance-critical, we want to do it O(1) - * - * the indexing order of this array favors 1:1 mappings - * between pins and IRQs. - */ - struct irq_pin_list { int apic, pin; struct irq_pin_list *next; @@ -139,6 +140,11 @@ static struct irq_pin_list *get_one_free_irq_2_pin(int node) return pin; } +/* + * This is performance-critical, we want to do it O(1) + * + * Most irqs are mapped 1:1 with pins. + */ struct irq_cfg { struct irq_pin_list *irq_2_pin; cpumask_var_t domain; @@ -172,6 +178,12 @@ static struct irq_cfg irq_cfgx[NR_IRQS] = { [15] = { .vector = IRQ15_VECTOR, }, }; +void __init io_apic_disable_legacy(void) +{ + nr_legacy_irqs = 0; + nr_irqs_gsi = 0; +} + int __init arch_early_irq_init(void) { struct irq_cfg *cfg; @@ -189,7 +201,7 @@ int __init arch_early_irq_init(void) desc->chip_data = &cfg[i]; zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); - if (i < NR_IRQS_LEGACY) + if (i < nr_legacy_irqs) cpumask_setall(cfg[i].domain); } @@ -215,17 +227,14 @@ static struct irq_cfg *get_one_free_irq_cfg(int node) cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); if (cfg) { - if (!alloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) { + if (!zalloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) { kfree(cfg); cfg = NULL; - } else if (!alloc_cpumask_var_node(&cfg->old_domain, + } else if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_ATOMIC, node)) { free_cpumask_var(cfg->domain); kfree(cfg); cfg = NULL; - } else { - cpumask_clear(cfg->domain); - cpumask_clear(cfg->old_domain); } } @@ -414,13 +423,10 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg) unsigned long flags; spin_lock_irqsave(&ioapic_lock, flags); - entry = cfg->irq_2_pin; - for (;;) { + for_each_irq_pin(entry, cfg->irq_2_pin) { unsigned int reg; int pin; - if (!entry) - break; pin = entry->pin; reg = io_apic_read(entry->apic, 0x10 + pin*2); /* Is the remote IRR bit set? */ @@ -428,9 +434,6 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg) spin_unlock_irqrestore(&ioapic_lock, flags); return true; } - if (!entry->next) - break; - entry = entry->next; } spin_unlock_irqrestore(&ioapic_lock, flags); @@ -498,72 +501,68 @@ static void ioapic_mask_entry(int apic, int pin) * shared ISA-space IRQs, so we have to support them. We are super * fast in the common case, and fast for shared ISA-space IRQs. */ -static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) +static int +add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin) { - struct irq_pin_list *entry; + struct irq_pin_list **last, *entry; - entry = cfg->irq_2_pin; - if (!entry) { - entry = get_one_free_irq_2_pin(node); - if (!entry) { - printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n", - apic, pin); - return; - } - cfg->irq_2_pin = entry; - entry->apic = apic; - entry->pin = pin; - return; - } - - while (entry->next) { - /* not again, please */ + /* don't allow duplicates */ + last = &cfg->irq_2_pin; + for_each_irq_pin(entry, cfg->irq_2_pin) { if (entry->apic == apic && entry->pin == pin) - return; - - entry = entry->next; + return 0; + last = &entry->next; } - entry->next = get_one_free_irq_2_pin(node); - entry = entry->next; + entry = get_one_free_irq_2_pin(node); + if (!entry) { + printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n", + node, apic, pin); + return -ENOMEM; + } entry->apic = apic; entry->pin = pin; + + *last = entry; + return 0; +} + +static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) +{ + if (add_pin_to_irq_node_nopanic(cfg, node, apic, pin)) + panic("IO-APIC: failed to add irq-pin. Can not proceed\n"); } /* * Reroute an IRQ to a different pin. */ static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, - int oldapic, int oldpin, - int newapic, int newpin) + int oldapic, int oldpin, + int newapic, int newpin) { - struct irq_pin_list *entry = cfg->irq_2_pin; - int replaced = 0; + struct irq_pin_list *entry; - while (entry) { + for_each_irq_pin(entry, cfg->irq_2_pin) { if (entry->apic == oldapic && entry->pin == oldpin) { entry->apic = newapic; entry->pin = newpin; - replaced = 1; /* every one is different, right? */ - break; + return; } - entry = entry->next; } - /* why? call replace before add? */ - if (!replaced) - add_pin_to_irq_node(cfg, node, newapic, newpin); + /* old apic/pin didn't exist, so just add new ones */ + add_pin_to_irq_node(cfg, node, newapic, newpin); } -static inline void io_apic_modify_irq(struct irq_cfg *cfg, - int mask_and, int mask_or, - void (*final)(struct irq_pin_list *entry)) +static void io_apic_modify_irq(struct irq_cfg *cfg, + int mask_and, int mask_or, + void (*final)(struct irq_pin_list *entry)) { int pin; struct irq_pin_list *entry; - for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) { + for_each_irq_pin(entry, cfg->irq_2_pin) { unsigned int reg; pin = entry->pin; reg = io_apic_read(entry->apic, 0x10 + pin * 2); @@ -580,7 +579,6 @@ static void __unmask_IO_APIC_irq(struct irq_cfg *cfg) io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL); } -#ifdef CONFIG_X86_64 static void io_apic_sync(struct irq_pin_list *entry) { /* @@ -596,11 +594,6 @@ static void __mask_IO_APIC_irq(struct irq_cfg *cfg) { io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); } -#else /* CONFIG_X86_32 */ -static void __mask_IO_APIC_irq(struct irq_cfg *cfg) -{ - io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, NULL); -} static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg) { @@ -613,7 +606,6 @@ static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg) io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, IO_APIC_REDIR_LEVEL_TRIGGER, NULL); } -#endif /* CONFIG_X86_32 */ static void mask_IO_APIC_irq_desc(struct irq_desc *desc) { @@ -883,7 +875,7 @@ static int __init find_isa_irq_apic(int irq, int type) */ static int EISA_ELCR(unsigned int irq) { - if (irq < NR_IRQS_LEGACY) { + if (irq < nr_legacy_irqs) { unsigned int port = 0x4d0 + (irq >> 3); return (inb(port) >> (irq & 7)) & 1; } @@ -1480,7 +1472,7 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq } ioapic_register_intr(irq, desc, trigger); - if (irq < NR_IRQS_LEGACY) + if (irq < nr_legacy_irqs) disable_8259A_irq(irq); ioapic_write_entry(apic_id, pin, entry); @@ -1702,12 +1694,8 @@ __apicdebuginit(void) print_IO_APIC(void) if (!entry) continue; printk(KERN_DEBUG "IRQ%d ", irq); - for (;;) { + for_each_irq_pin(entry, cfg->irq_2_pin) printk("-> %d:%d", entry->apic, entry->pin); - if (!entry->next) - break; - entry = entry->next; - } printk("\n"); } @@ -1851,7 +1839,7 @@ __apicdebuginit(void) print_PIC(void) unsigned int v; unsigned long flags; - if (apic_verbosity == APIC_QUIET) + if (apic_verbosity == APIC_QUIET || !nr_legacy_irqs) return; printk(KERN_DEBUG "\nprinting PIC contents\n"); @@ -1883,7 +1871,7 @@ __apicdebuginit(int) print_all_ICs(void) print_PIC(); /* don't print out if apic is not there */ - if (!cpu_has_apic || disable_apic) + if (!cpu_has_apic && !apic_from_smp_config()) return 0; print_all_local_APICs(); @@ -1914,6 +1902,10 @@ void __init enable_IO_APIC(void) spin_unlock_irqrestore(&ioapic_lock, flags); nr_ioapic_registers[apic] = reg_01.bits.entries+1; } + + if (!nr_legacy_irqs) + return; + for(apic = 0; apic < nr_ioapics; apic++) { int pin; /* See if any of the pins is in ExtINT mode */ @@ -1968,6 +1960,9 @@ void disable_IO_APIC(void) */ clear_IO_APIC(); + if (!nr_legacy_irqs) + return; + /* * If the i8259 is routed through an IOAPIC * Put that IOAPIC in virtual wire mode @@ -2001,7 +1996,7 @@ void disable_IO_APIC(void) /* * Use virtual wire A mode when interrupt remapping is enabled. */ - if (cpu_has_apic) + if (cpu_has_apic || apic_from_smp_config()) disconnect_bsp_APIC(!intr_remapping_enabled && ioapic_i8259.pin != -1); } @@ -2014,7 +2009,7 @@ void disable_IO_APIC(void) * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 */ -static void __init setup_ioapic_ids_from_mpc(void) +void __init setup_ioapic_ids_from_mpc(void) { union IO_APIC_reg_00 reg_00; physid_mask_t phys_id_present_map; @@ -2023,9 +2018,8 @@ static void __init setup_ioapic_ids_from_mpc(void) unsigned char old_id; unsigned long flags; - if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids()) + if (acpi_ioapic) return; - /* * Don't check I/O APIC IDs for xAPIC systems. They have * no meaning without the serial APIC bus. @@ -2199,7 +2193,7 @@ static unsigned int startup_ioapic_irq(unsigned int irq) struct irq_cfg *cfg; spin_lock_irqsave(&ioapic_lock, flags); - if (irq < NR_IRQS_LEGACY) { + if (irq < nr_legacy_irqs) { disable_8259A_irq(irq); if (i8259A_irq_pending(irq)) was_pending = 1; @@ -2211,7 +2205,6 @@ static unsigned int startup_ioapic_irq(unsigned int irq) return was_pending; } -#ifdef CONFIG_X86_64 static int ioapic_retrigger_irq(unsigned int irq) { @@ -2224,14 +2217,6 @@ static int ioapic_retrigger_irq(unsigned int irq) return 1; } -#else -static int ioapic_retrigger_irq(unsigned int irq) -{ - apic->send_IPI_self(irq_cfg(irq)->vector); - - return 1; -} -#endif /* * Level and edge triggered IO-APIC interrupts need different handling, @@ -2269,13 +2254,9 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq struct irq_pin_list *entry; u8 vector = cfg->vector; - entry = cfg->irq_2_pin; - for (;;) { + for_each_irq_pin(entry, cfg->irq_2_pin) { unsigned int reg; - if (!entry) - break; - apic = entry->apic; pin = entry->pin; /* @@ -2288,9 +2269,6 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq reg &= ~IO_APIC_REDIR_VECTOR_MASK; reg |= vector; io_apic_modify(apic, 0x10 + pin*2, reg); - if (!entry->next) - break; - entry = entry->next; } } @@ -2515,11 +2493,8 @@ atomic_t irq_mis_count; static void ack_apic_level(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - -#ifdef CONFIG_X86_32 unsigned long v; int i; -#endif struct irq_cfg *cfg; int do_unmask_irq = 0; @@ -2532,31 +2507,28 @@ static void ack_apic_level(unsigned int irq) } #endif -#ifdef CONFIG_X86_32 /* - * It appears there is an erratum which affects at least version 0x11 - * of I/O APIC (that's the 82093AA and cores integrated into various - * chipsets). Under certain conditions a level-triggered interrupt is - * erroneously delivered as edge-triggered one but the respective IRR - * bit gets set nevertheless. As a result the I/O unit expects an EOI - * message but it will never arrive and further interrupts are blocked - * from the source. The exact reason is so far unknown, but the - * phenomenon was observed when two consecutive interrupt requests - * from a given source get delivered to the same CPU and the source is - * temporarily disabled in between. - * - * A workaround is to simulate an EOI message manually. We achieve it - * by setting the trigger mode to edge and then to level when the edge - * trigger mode gets detected in the TMR of a local APIC for a - * level-triggered interrupt. We mask the source for the time of the - * operation to prevent an edge-triggered interrupt escaping meanwhile. - * The idea is from Manfred Spraul. --macro - */ + * It appears there is an erratum which affects at least version 0x11 + * of I/O APIC (that's the 82093AA and cores integrated into various + * chipsets). Under certain conditions a level-triggered interrupt is + * erroneously delivered as edge-triggered one but the respective IRR + * bit gets set nevertheless. As a result the I/O unit expects an EOI + * message but it will never arrive and further interrupts are blocked + * from the source. The exact reason is so far unknown, but the + * phenomenon was observed when two consecutive interrupt requests + * from a given source get delivered to the same CPU and the source is + * temporarily disabled in between. + * + * A workaround is to simulate an EOI message manually. We achieve it + * by setting the trigger mode to edge and then to level when the edge + * trigger mode gets detected in the TMR of a local APIC for a + * level-triggered interrupt. We mask the source for the time of the + * operation to prevent an edge-triggered interrupt escaping meanwhile. + * The idea is from Manfred Spraul. --macro + */ cfg = desc->chip_data; i = cfg->vector; - v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); -#endif /* * We must acknowledge the irq before we move it or the acknowledge will @@ -2598,7 +2570,7 @@ static void ack_apic_level(unsigned int irq) unmask_IO_APIC_irq_desc(desc); } -#ifdef CONFIG_X86_32 + /* Tail end of version 0x11 I/O APIC bug workaround */ if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); spin_lock(&ioapic_lock); @@ -2606,26 +2578,15 @@ static void ack_apic_level(unsigned int irq) __unmask_and_level_IO_APIC_irq(cfg); spin_unlock(&ioapic_lock); } -#endif } #ifdef CONFIG_INTR_REMAP static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) { - int apic, pin; struct irq_pin_list *entry; - entry = cfg->irq_2_pin; - for (;;) { - - if (!entry) - break; - - apic = entry->apic; - pin = entry->pin; - io_apic_eoi(apic, pin); - entry = entry->next; - } + for_each_irq_pin(entry, cfg->irq_2_pin) + io_apic_eoi(entry->apic, entry->pin); } static void @@ -2710,7 +2671,7 @@ static inline void init_IO_APIC_traps(void) * so default to an old-fashioned 8259 * interrupt if we can.. */ - if (irq < NR_IRQS_LEGACY) + if (irq < nr_legacy_irqs) make_8259A_irq(irq); else /* Strange. Oh, well.. */ @@ -3046,7 +3007,7 @@ out: * the I/O APIC in all cases now. No actual device should request * it anyway. --macro */ -#define PIC_IRQS (1 << PIC_CASCADE_IR) +#define PIC_IRQS (1UL << PIC_CASCADE_IR) void __init setup_IO_APIC(void) { @@ -3054,21 +3015,19 @@ void __init setup_IO_APIC(void) /* * calling enable_IO_APIC() is moved to setup_local_APIC for BP */ - - io_apic_irqs = ~PIC_IRQS; + io_apic_irqs = nr_legacy_irqs ? ~PIC_IRQS : ~0UL; apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); /* * Set up IO-APIC IRQ routing. */ -#ifdef CONFIG_X86_32 - if (!acpi_ioapic) - setup_ioapic_ids_from_mpc(); -#endif + x86_init.mpparse.setup_ioapic_ids(); + sync_Arb_IDs(); setup_IO_APIC_irqs(); init_IO_APIC_traps(); - check_timer(); + if (nr_legacy_irqs) + check_timer(); } /* @@ -3169,7 +3128,6 @@ static int __init ioapic_init_sysfs(void) device_initcall(ioapic_init_sysfs); -static int nr_irqs_gsi = NR_IRQS_LEGACY; /* * Dynamic irq allocate and deallocation */ @@ -3241,8 +3199,7 @@ void destroy_irq(unsigned int irq) cfg = desc->chip_data; dynamic_irq_cleanup(irq); /* connect back irq_cfg */ - if (desc) - desc->chip_data = cfg; + desc->chip_data = cfg; free_irte(irq); spin_lock_irqsave(&vector_lock, flags); @@ -3793,6 +3750,9 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, mmr_pnode = uv_blade_to_pnode(mmr_blade); uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); + if (cfg->move_in_progress) + send_cleanup_vector(cfg); + return irq; } @@ -3907,9 +3867,13 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq, /* * IRQs < 16 are already in the irq_2_pin[] map */ - if (irq >= NR_IRQS_LEGACY) { + if (irq >= nr_legacy_irqs) { cfg = desc->chip_data; - add_pin_to_irq_node(cfg, node, ioapic, pin); + if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) { + printk(KERN_INFO "can not add pin %d for irq %d\n", + pin, irq); + return 0; + } } setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity); @@ -3938,11 +3902,28 @@ int io_apic_set_pci_routing(struct device *dev, int irq, return __io_apic_set_pci_routing(dev, irq, irq_attr); } -/* -------------------------------------------------------------------------- - ACPI-based IOAPIC Configuration - -------------------------------------------------------------------------- */ +u8 __init io_apic_unique_id(u8 id) +{ +#ifdef CONFIG_X86_32 + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && + !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + return io_apic_get_unique_id(nr_ioapics, id); + else + return id; +#else + int i; + DECLARE_BITMAP(used, 256); -#ifdef CONFIG_ACPI + bitmap_zero(used, 256); + for (i = 0; i < nr_ioapics; i++) { + struct mpc_ioapic *ia = &mp_ioapics[i]; + __set_bit(ia->apicid, used); + } + if (!test_bit(id, used)) + return id; + return find_first_zero_bit(used, 256); +#endif +} #ifdef CONFIG_X86_32 int __init io_apic_get_unique_id(int ioapic, int apic_id) @@ -4051,8 +4032,6 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) return 0; } -#endif /* CONFIG_ACPI */ - /* * This function currently is only a helper for the i386 smp boot process where * we need to reprogram the ioredtbls to cater for the cpus which have come online @@ -4106,7 +4085,7 @@ void __init setup_ioapic_dest(void) static struct resource *ioapic_resources; -static struct resource * __init ioapic_setup_resources(void) +static struct resource * __init ioapic_setup_resources(int nr_ioapics) { unsigned long n; struct resource *res; @@ -4122,15 +4101,13 @@ static struct resource * __init ioapic_setup_resources(void) mem = alloc_bootmem(n); res = (void *)mem; - if (mem != NULL) { - mem += sizeof(struct resource) * nr_ioapics; + mem += sizeof(struct resource) * nr_ioapics; - for (i = 0; i < nr_ioapics; i++) { - res[i].name = mem; - res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; - sprintf(mem, "IOAPIC %u", i); - mem += IOAPIC_RESOURCE_NAME_SIZE; - } + for (i = 0; i < nr_ioapics; i++) { + res[i].name = mem; + res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; + sprintf(mem, "IOAPIC %u", i); + mem += IOAPIC_RESOURCE_NAME_SIZE; } ioapic_resources = res; @@ -4144,7 +4121,7 @@ void __init ioapic_init_mappings(void) struct resource *ioapic_res; int i; - ioapic_res = ioapic_setup_resources(); + ioapic_res = ioapic_setup_resources(nr_ioapics); for (i = 0; i < nr_ioapics; i++) { if (smp_found_config) { ioapic_phys = mp_ioapics[i].apicaddr; @@ -4173,11 +4150,9 @@ fake_ioapic_page: __fix_to_virt(idx), ioapic_phys); idx++; - if (ioapic_res != NULL) { - ioapic_res->start = ioapic_phys; - ioapic_res->end = ioapic_phys + (4 * 1024) - 1; - ioapic_res++; - } + ioapic_res->start = ioapic_phys; + ioapic_res->end = ioapic_phys + (4 * 1024) - 1; + ioapic_res++; } } @@ -4198,3 +4173,76 @@ void __init ioapic_insert_resources(void) r++; } } + +int mp_find_ioapic(int gsi) +{ + int i = 0; + + /* Find the IOAPIC that manages this GSI. */ + for (i = 0; i < nr_ioapics; i++) { + if ((gsi >= mp_gsi_routing[i].gsi_base) + && (gsi <= mp_gsi_routing[i].gsi_end)) + return i; + } + + printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); + return -1; +} + +int mp_find_ioapic_pin(int ioapic, int gsi) +{ + if (WARN_ON(ioapic == -1)) + return -1; + if (WARN_ON(gsi > mp_gsi_routing[ioapic].gsi_end)) + return -1; + + return gsi - mp_gsi_routing[ioapic].gsi_base; +} + +static int bad_ioapic(unsigned long address) +{ + if (nr_ioapics >= MAX_IO_APICS) { + printk(KERN_WARNING "WARING: Max # of I/O APICs (%d) exceeded " + "(found %d), skipping\n", MAX_IO_APICS, nr_ioapics); + return 1; + } + if (!address) { + printk(KERN_WARNING "WARNING: Bogus (zero) I/O APIC address" + " found in table, skipping!\n"); + return 1; + } + return 0; +} + +void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) +{ + int idx = 0; + + if (bad_ioapic(address)) + return; + + idx = nr_ioapics; + + mp_ioapics[idx].type = MP_IOAPIC; + mp_ioapics[idx].flags = MPC_APIC_USABLE; + mp_ioapics[idx].apicaddr = address; + + set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); + mp_ioapics[idx].apicid = io_apic_unique_id(id); + mp_ioapics[idx].apicver = io_apic_get_version(idx); + + /* + * Build basic GSI lookup table to facilitate gsi->io_apic lookups + * and to prevent reprogramming of IOAPIC pins (PCI GSIs). + */ + mp_gsi_routing[idx].gsi_base = gsi_base; + mp_gsi_routing[idx].gsi_end = gsi_base + + io_apic_get_redir_entries(idx); + + printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " + "GSI %d-%d\n", idx, mp_ioapics[idx].apicid, + mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr, + mp_gsi_routing[idx].gsi_base, mp_gsi_routing[idx].gsi_end); + + nr_ioapics++; +} diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index dbf5445727a..08385e090a6 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -106,6 +106,9 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector) unsigned long mask = cpumask_bits(cpumask)[0]; unsigned long flags; + if (WARN_ONCE(!mask, "empty IPI mask")) + return; + local_irq_save(flags); WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]); __default_send_IPI_dest_field(mask, vector, apic->dest_logical); @@ -150,7 +153,7 @@ int safe_smp_processor_id(void) { int apicid, cpuid; - if (!boot_cpu_has(X86_FEATURE_APIC)) + if (!cpu_has_apic) return 0; apicid = hard_smp_processor_id(); diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index b3025b43b63..7ff61d6a188 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -39,7 +39,7 @@ int unknown_nmi_panic; int nmi_watchdog_enabled; -static cpumask_var_t backtrace_mask; +static cpumask_t backtrace_mask __read_mostly; /* nmi_active: * >0: the lapic NMI watchdog is active, but can be disabled @@ -66,7 +66,7 @@ static inline unsigned int get_nmi_count(int cpu) static inline int mce_in_progress(void) { -#if defined(CONFIG_X86_NEW_MCE) +#if defined(CONFIG_X86_MCE) return atomic_read(&mce_entry) > 0; #endif return 0; @@ -138,7 +138,6 @@ int __init check_nmi_watchdog(void) if (!prev_nmi_count) goto error; - alloc_cpumask_var(&backtrace_mask, GFP_KERNEL|__GFP_ZERO); printk(KERN_INFO "Testing NMI watchdog ... "); #ifdef CONFIG_SMP @@ -415,14 +414,17 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) } /* We can be called before check_nmi_watchdog, hence NULL check. */ - if (backtrace_mask != NULL && cpumask_test_cpu(cpu, backtrace_mask)) { + if (cpumask_test_cpu(cpu, &backtrace_mask)) { static DEFINE_SPINLOCK(lock); /* Serialise the printks */ spin_lock(&lock); printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); + show_regs(regs); dump_stack(); spin_unlock(&lock); - cpumask_clear_cpu(cpu, backtrace_mask); + cpumask_clear_cpu(cpu, &backtrace_mask); + + rc = 1; } /* Could check oops_in_progress here too, but it's safer not to */ @@ -506,14 +508,14 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) /* * proc handler for /proc/sys/kernel/nmi */ -int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, +int proc_nmi_enabled(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { int old_state; nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0; old_state = nmi_watchdog_enabled; - proc_dointvec(table, write, file, buffer, length, ppos); + proc_dointvec(table, write, buffer, length, ppos); if (!!old_state == !!nmi_watchdog_enabled) return 0; @@ -552,14 +554,18 @@ int do_nmi_callback(struct pt_regs *regs, int cpu) return 0; } -void __trigger_all_cpu_backtrace(void) +void arch_trigger_all_cpu_backtrace(void) { int i; - cpumask_copy(backtrace_mask, cpu_online_mask); + cpumask_copy(&backtrace_mask, cpu_online_mask); + + printk(KERN_INFO "sending NMI to all CPUs:\n"); + apic->send_IPI_all(NMI_VECTOR); + /* Wait for up to 10 seconds for all CPUs to do the backtrace */ for (i = 0; i < 10 * 1000; i++) { - if (cpumask_empty(backtrace_mask)) + if (cpumask_empty(&backtrace_mask)) break; mdelay(1); } diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index ca96e68f0d2..efa00e2b850 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -66,7 +66,6 @@ struct mpc_trans { unsigned short trans_reserved; }; -/* x86_quirks member */ static int mpc_record; static struct mpc_trans *translation_table[MAX_MPC_ENTRY]; @@ -130,10 +129,9 @@ void __cpuinit numaq_tsc_disable(void) } } -static int __init numaq_pre_time_init(void) +static void __init numaq_tsc_init(void) { numaq_tsc_disable(); - return 0; } static inline int generate_logical_apicid(int quad, int phys_apicid) @@ -177,6 +175,19 @@ static void mpc_oem_pci_bus(struct mpc_bus *m) quad_local_to_mp_bus_id[quad][local] = m->busid; } +/* + * Called from mpparse code. + * mode = 0: prescan + * mode = 1: one mpc entry scanned + */ +static void numaq_mpc_record(unsigned int mode) +{ + if (!mode) + mpc_record = 0; + else + mpc_record++; +} + static void __init MP_translation_info(struct mpc_trans *m) { printk(KERN_INFO @@ -206,9 +217,9 @@ static int __init mpf_checksum(unsigned char *mp, int len) /* * Read/parse the MPC oem tables */ -static void __init - smp_read_mpc_oem(struct mpc_oemtable *oemtable, unsigned short oemsize) +static void __init smp_read_mpc_oem(struct mpc_table *mpc) { + struct mpc_oemtable *oemtable = (void *)(long)mpc->oemptr; int count = sizeof(*oemtable); /* the header size */ unsigned char *oemptr = ((unsigned char *)oemtable) + count; @@ -250,29 +261,6 @@ static void __init } } -static int __init numaq_setup_ioapic_ids(void) -{ - /* so can skip it */ - return 1; -} - -static struct x86_quirks numaq_x86_quirks __initdata = { - .arch_pre_time_init = numaq_pre_time_init, - .arch_time_init = NULL, - .arch_pre_intr_init = NULL, - .arch_memory_setup = NULL, - .arch_intr_init = NULL, - .arch_trap_init = NULL, - .mach_get_smp_config = NULL, - .mach_find_smp_config = NULL, - .mpc_record = &mpc_record, - .mpc_apic_id = mpc_apic_id, - .mpc_oem_bus_info = mpc_oem_bus_info, - .mpc_oem_pci_bus = mpc_oem_pci_bus, - .smp_read_mpc_oem = smp_read_mpc_oem, - .setup_ioapic_ids = numaq_setup_ioapic_ids, -}; - static __init void early_check_numaq(void) { /* @@ -286,8 +274,15 @@ static __init void early_check_numaq(void) if (smp_found_config) early_get_smp_config(); - if (found_numaq) - x86_quirks = &numaq_x86_quirks; + if (found_numaq) { + x86_init.mpparse.mpc_record = numaq_mpc_record; + x86_init.mpparse.setup_ioapic_ids = x86_init_noop; + x86_init.mpparse.mpc_apic_id = mpc_apic_id; + x86_init.mpparse.smp_read_mpc_oem = smp_read_mpc_oem; + x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus; + x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info; + x86_init.timers.tsc_pre_init = numaq_tsc_init; + } } int __init get_memcfg_numaq(void) @@ -418,7 +413,7 @@ static inline physid_mask_t numaq_apicid_to_cpu_present(int logical_apicid) /* Where the IO area was mapped on multiquad, always 0 otherwise */ void *xquad_portio; -static inline int numaq_check_phys_apicid_present(int boot_cpu_physical_apicid) +static inline int numaq_check_phys_apicid_present(int phys_apicid) { return 1; } diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index bc3e880f9b8..c4cbd3080c1 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -44,29 +44,46 @@ static struct apic *apic_probe[] __initdata = { NULL, }; +static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) +{ + return hard_smp_processor_id() >> index_msb; +} + /* * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. */ void __init default_setup_apic_routing(void) { #ifdef CONFIG_X86_X2APIC - if (x2apic_mode && (apic != &apic_x2apic_phys && + if (x2apic_mode #ifdef CONFIG_X86_UV - apic != &apic_x2apic_uv_x && + && apic != &apic_x2apic_uv_x #endif - apic != &apic_x2apic_cluster)) { + ) { if (x2apic_phys) apic = &apic_x2apic_phys; else apic = &apic_x2apic_cluster; - printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); } #endif if (apic == &apic_flat) { - if (max_physical_apicid >= 8) - apic = &apic_physflat; - printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + if (num_processors > 8) + apic = &apic_physflat; + break; + case X86_VENDOR_AMD: + if (max_physical_apicid >= 8) + apic = &apic_physflat; + } + } + + printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); + + if (is_vsmp_box()) { + /* need to update phys_pkg_id */ + apic->phys_pkg_id = apicid_phys_pkg_id; } /* diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index eafdfbd1ea9..645ecc4ff0b 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -272,7 +272,7 @@ static physid_mask_t summit_apicid_to_cpu_present(int apicid) return physid_mask_of_physid(0); } -static int summit_check_phys_apicid_present(int boot_cpu_physical_apicid) +static int summit_check_phys_apicid_present(int physical_apicid) { return 1; } diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 8e4cbb255c3..a5371ec3677 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -17,11 +17,13 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return x2apic_enabled(); } -/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ - +/* + * need to use more than cpu 0, because we need more vectors when + * MSI-X are used. + */ static const struct cpumask *x2apic_target_cpus(void) { - return cpumask_of(0); + return cpu_online_mask; } /* @@ -170,7 +172,7 @@ static unsigned long set_apic_id(unsigned int id) static int x2apic_cluster_phys_pkg_id(int initial_apicid, int index_msb) { - return current_cpu_data.initial_apicid >> index_msb; + return initial_apicid >> index_msb; } static void x2apic_send_IPI_self(int vector) diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index a284359627e..a8989aadc99 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -27,11 +27,13 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 0; } -/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ - +/* + * need to use more than cpu 0, because we need more vectors when + * MSI-X are used. + */ static const struct cpumask *x2apic_target_cpus(void) { - return cpumask_of(0); + return cpu_online_mask; } static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask) @@ -162,7 +164,7 @@ static unsigned long set_apic_id(unsigned int id) static int x2apic_phys_pkg_id(int initial_apicid, int index_msb) { - return current_cpu_data.initial_apicid >> index_msb; + return initial_apicid >> index_msb; } static void x2apic_send_IPI_self(int vector) diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 096d19aea2f..f5f5886a6b5 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -46,7 +46,7 @@ static int early_get_nodeid(void) return node_id.s.node_id; } -static int uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { if (!strcmp(oem_id, "SGI")) { if (!strcmp(oem_table_id, "UVL")) @@ -253,7 +253,7 @@ static void uv_send_IPI_self(int vector) apic_write(APIC_SELF_IPI, vector); } -struct apic apic_x2apic_uv_x = { +struct apic __refdata apic_x2apic_uv_x = { .name = "UV large system", .probe = NULL, @@ -261,7 +261,7 @@ struct apic apic_x2apic_uv_x = { .apic_id_registered = uv_apic_id_registered, .irq_delivery_mode = dest_Fixed, - .irq_dest_mode = 1, /* logical */ + .irq_dest_mode = 0, /* physical */ .target_cpus = uv_target_cpus, .disable_esr = 0, @@ -362,12 +362,6 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) BUG(); } -static __init void map_low_mmrs(void) -{ - init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE); - init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE); -} - enum map_type {map_wb, map_uc}; static __init void map_high(char *id, unsigned long base, int shift, @@ -395,16 +389,6 @@ static __init void map_gru_high(int max_pnode) map_high("GRU", gru.s.base, shift, max_pnode, map_wb); } -static __init void map_config_high(int max_pnode) -{ - union uvh_rh_gam_cfg_overlay_config_mmr_u cfg; - int shift = UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_SHFT; - - cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR); - if (cfg.s.enable) - map_high("CONFIG", cfg.s.base, shift, max_pnode, map_uc); -} - static __init void map_mmr_high(int max_pnode) { union uvh_rh_gam_mmr_overlay_config_mmr_u mmr; @@ -566,8 +550,6 @@ void __init uv_system_init(void) unsigned long mmr_base, present, paddr; unsigned short pnode_mask; - map_low_mmrs(); - m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); m_val = m_n_config.s.m_skt; n_val = m_n_config.s.n_skt; @@ -591,6 +573,8 @@ void __init uv_system_init(void) bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); uv_blade_info = kmalloc(bytes, GFP_KERNEL); BUG_ON(!uv_blade_info); + for (blade = 0; blade < uv_num_possible_blades(); blade++) + uv_blade_info[blade].memory_nid = -1; get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size); @@ -629,6 +613,9 @@ void __init uv_system_init(void) lcpu = uv_blade_info[blade].nr_possible_cpus; uv_blade_info[blade].nr_possible_cpus++; + /* Any node on the blade, else will contain -1. */ + uv_blade_info[blade].memory_nid = nid; + uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base; uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size; uv_cpu_hub_info(cpu)->m_val = m_val; @@ -662,11 +649,11 @@ void __init uv_system_init(void) pnode = (paddr >> m_val) & pnode_mask; blade = boot_pnode_to_blade(pnode); uv_node_to_blade[nid] = blade; + max_pnode = max(pnode, max_pnode); } map_gru_high(max_pnode); map_mmr_high(max_pnode); - map_config_high(max_pnode); map_mmioh_high(max_pnode); uv_cpu_init(); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 79302e9a33a..151ace69a5a 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -403,7 +403,15 @@ static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue); static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); static struct apm_user *user_list; static DEFINE_SPINLOCK(user_list_lock); -static const struct desc_struct bad_bios_desc = { { { 0, 0x00409200 } } }; + +/* + * Set up a segment that references the real mode segment 0x40 + * that extends up to the end of page zero (that we have reserved). + * This is for buggy BIOS's that refer to (real mode) segment 0x40 + * even though they are called in protected mode. + */ +static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(0x4092, + (unsigned long)__va(0x400UL), PAGE_SIZE - 0x400 - 1); static const char driver_version[] = "1.16ac"; /* no spaces */ @@ -811,7 +819,7 @@ static int apm_do_idle(void) u8 ret = 0; int idled = 0; int polling; - int err; + int err = 0; polling = !!(current_thread_info()->status & TS_POLLING); if (polling) { @@ -2332,15 +2340,6 @@ static int __init apm_init(void) pm_flags |= PM_APM; /* - * Set up a segment that references the real mode segment 0x40 - * that extends up to the end of page zero (that we have reserved). - * This is for buggy BIOS's that refer to (real mode) segment 0x40 - * even though they are called in protected mode. - */ - set_base(bad_bios_desc, __va((unsigned long)0x40 << 4)); - _set_limit((char *)&bad_bios_desc, 4095 - (0x40 << 4)); - - /* * Set up the long jump entry point to the APM BIOS, which is called * from inline assembly. */ @@ -2358,12 +2357,12 @@ static int __init apm_init(void) * code to that CPU. */ gdt = get_cpu_gdt_table(0); - set_base(gdt[APM_CS >> 3], - __va((unsigned long)apm_info.bios.cseg << 4)); - set_base(gdt[APM_CS_16 >> 3], - __va((unsigned long)apm_info.bios.cseg_16 << 4)); - set_base(gdt[APM_DS >> 3], - __va((unsigned long)apm_info.bios.dseg << 4)); + set_desc_base(&gdt[APM_CS >> 3], + (unsigned long)__va((unsigned long)apm_info.bios.cseg << 4)); + set_desc_base(&gdt[APM_CS_16 >> 3], + (unsigned long)__va((unsigned long)apm_info.bios.cseg_16 << 4)); + set_desc_base(&gdt[APM_DS >> 3], + (unsigned long)__va((unsigned long)apm_info.bios.dseg << 4)); proc_create("apm", 0, NULL, &apm_file_ops); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 898ecc47e12..4a6aeedcd96 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -3,6 +3,7 @@ * This code generates raw asm output which is post-processed to extract * and format the required data. */ +#define COMPILE_OFFSETS #include <linux/crypto.h> #include <linux/sched.h> diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 3efcb2b96a1..68537e957a9 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -7,9 +7,13 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_common.o = -pg endif +# Make sure load_percpu_segment has no stackprotector +nostackp := $(call cc-option, -fno-stack-protector) +CFLAGS_common.o := $(nostackp) + obj-y := intel_cacheinfo.o addon_cpuid_features.o obj-y += proc.o capflags.o powerflags.o common.o -obj-y += vmware.o hypervisor.o +obj-y += vmware.o hypervisor.o sched.o obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o obj-$(CONFIG_X86_64) += bugs_64.o @@ -23,7 +27,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o -obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o +obj-$(CONFIG_PERF_EVENTS) += perf_event.o obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index e2485b03f1c..c910a716a71 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -2,7 +2,7 @@ #include <linux/bitops.h> #include <linux/mm.h> -#include <asm/io.h> +#include <linux/io.h> #include <asm/processor.h> #include <asm/apic.h> #include <asm/cpu.h> @@ -45,8 +45,8 @@ static void __cpuinit init_amd_k5(struct cpuinfo_x86 *c) #define CBAR_ENB (0x80000000) #define CBAR_KEY (0X000000CB) if (c->x86_model == 9 || c->x86_model == 10) { - if (inl (CBAR) & CBAR_ENB) - outl (0 | CBAR_KEY, CBAR); + if (inl(CBAR) & CBAR_ENB) + outl(0 | CBAR_KEY, CBAR); } } @@ -87,9 +87,10 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c) d = d2-d; if (d > 20*K6_BUG_LOOP) - printk("system stability may be impaired when more than 32 MB are used.\n"); + printk(KERN_CONT + "system stability may be impaired when more than 32 MB are used.\n"); else - printk("probably OK (after B9730xxxx).\n"); + printk(KERN_CONT "probably OK (after B9730xxxx).\n"); printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n"); } @@ -183,7 +184,7 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) * approved Athlon */ WARN_ONCE(1, "WARNING: This combination of AMD" - "processors is not suitable for SMP.\n"); + " processors is not suitable for SMP.\n"); if (!test_taint(TAINT_UNSAFE_SMP)) add_taint(TAINT_UNSAFE_SMP); @@ -219,8 +220,9 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) { rdmsr(MSR_K7_CLK_CTL, l, h); if ((l & 0xfff00000) != 0x20000000) { - printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l, - ((l & 0x000fffff)|0x20000000)); + printk(KERN_INFO + "CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", + l, ((l & 0x000fffff)|0x20000000)); wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h); } } @@ -251,6 +253,64 @@ static int __cpuinit nearby_node(int apicid) #endif /* + * Fixup core topology information for AMD multi-node processors. + * Assumption 1: Number of cores in each internal node is the same. + * Assumption 2: Mixed systems with both single-node and dual-node + * processors are not supported. + */ +#ifdef CONFIG_X86_HT +static void __cpuinit amd_fixup_dcm(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_PCI + u32 t, cpn; + u8 n, n_id; + int cpu = smp_processor_id(); + + /* fixup topology information only once for a core */ + if (cpu_has(c, X86_FEATURE_AMD_DCM)) + return; + + /* check for multi-node processor on boot cpu */ + t = read_pci_config(0, 24, 3, 0xe8); + if (!(t & (1 << 29))) + return; + + set_cpu_cap(c, X86_FEATURE_AMD_DCM); + + /* cores per node: each internal node has half the number of cores */ + cpn = c->x86_max_cores >> 1; + + /* even-numbered NB_id of this dual-node processor */ + n = c->phys_proc_id << 1; + + /* + * determine internal node id and assign cores fifty-fifty to + * each node of the dual-node processor + */ + t = read_pci_config(0, 24 + n, 3, 0xe8); + n = (t>>30) & 0x3; + if (n == 0) { + if (c->cpu_core_id < cpn) + n_id = 0; + else + n_id = 1; + } else { + if (c->cpu_core_id < cpn) + n_id = 1; + else + n_id = 0; + } + + /* compute entire NodeID, use llc_shared_map to store sibling info */ + per_cpu(cpu_llc_id, cpu) = (c->phys_proc_id << 1) + n_id; + + /* fixup core id to be in range from 0 to cpn */ + c->cpu_core_id = c->cpu_core_id % cpn; +#endif +} +#endif + +/* * On a AMD dual core setup the lower bits of the APIC id distingush the cores. * Assumes number of cores is a power of two. */ @@ -267,17 +327,31 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) c->phys_proc_id = c->initial_apicid >> bits; /* use socket ID also for last level cache */ per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; + /* fixup topology information on multi-node processors */ + if ((c->x86 == 0x10) && (c->x86_model == 9)) + amd_fixup_dcm(c); +#endif +} + +int amd_get_nb_id(int cpu) +{ + int id = 0; +#ifdef CONFIG_SMP + id = per_cpu(cpu_llc_id, cpu); #endif + return id; } +EXPORT_SYMBOL_GPL(amd_get_nb_id); static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) { #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) int cpu = smp_processor_id(); int node; - unsigned apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid; + unsigned apicid = c->apicid; + + node = per_cpu(cpu_llc_id, cpu); - node = c->phys_proc_id; if (apicid_to_node[apicid] != NUMA_NO_NODE) node = apicid_to_node[apicid]; if (!node_online(node)) { @@ -398,11 +472,30 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) u32 level; level = cpuid_eax(1); - if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) + if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) set_cpu_cap(c, X86_FEATURE_REP_GOOD); + + /* + * Some BIOSes incorrectly force this feature, but only K8 + * revision D (model = 0x14) and later actually support it. + * (AMD Erratum #110, docId: 25759). + */ + if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) { + u64 val; + + clear_cpu_cap(c, X86_FEATURE_LAHF_LM); + if (!rdmsrl_amd_safe(0xc001100d, &val)) { + val &= ~(1ULL << 32); + wrmsrl_amd_safe(0xc001100d, val); + } + } + } if (c->x86 == 0x10 || c->x86 == 0x11) set_cpu_cap(c, X86_FEATURE_REP_GOOD); + + /* get apicid instead of initial apic id from cpuid */ + c->apicid = hard_smp_processor_id(); #else /* @@ -487,27 +580,30 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) * benefit in doing so. */ if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { - printk(KERN_DEBUG "tseg: %010llx\n", tseg); - if ((tseg>>PMD_SHIFT) < + printk(KERN_DEBUG "tseg: %010llx\n", tseg); + if ((tseg>>PMD_SHIFT) < (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) || - ((tseg>>PMD_SHIFT) < + ((tseg>>PMD_SHIFT) < (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) && - (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT)))) - set_memory_4k((unsigned long)__va(tseg), 1); + (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT)))) + set_memory_4k((unsigned long)__va(tseg), 1); } } #endif } #ifdef CONFIG_X86_32 -static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) +static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, + unsigned int size) { /* AMD errata T13 (order #21922) */ if ((c->x86 == 6)) { - if (c->x86_model == 3 && c->x86_mask == 0) /* Duron Rev A0 */ + /* Duron Rev A0 */ + if (c->x86_model == 3 && c->x86_mask == 0) size = 64; + /* Tbird rev A1/A2 */ if (c->x86_model == 4 && - (c->x86_mask == 0 || c->x86_mask == 1)) /* Tbird rev A1/A2 */ + (c->x86_mask == 0 || c->x86_mask == 1)) size = 256; } return size; diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index c8e315f1aa8..01a26521239 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -81,7 +81,7 @@ static void __init check_fpu(void) boot_cpu_data.fdiv_bug = fdiv_bug; if (boot_cpu_data.fdiv_bug) - printk("Hmm, FPU with FDIV bug.\n"); + printk(KERN_WARNING "Hmm, FPU with FDIV bug.\n"); } static void __init check_hlt(void) @@ -98,7 +98,7 @@ static void __init check_hlt(void) halt(); halt(); halt(); - printk("OK.\n"); + printk(KERN_CONT "OK.\n"); } /* @@ -122,9 +122,9 @@ static void __init check_popad(void) * CPU hard. Too bad. */ if (res != 12345678) - printk("Buggy.\n"); + printk(KERN_CONT "Buggy.\n"); else - printk("OK.\n"); + printk(KERN_CONT "OK.\n"); #endif } @@ -156,7 +156,7 @@ void __init check_bugs(void) { identify_boot_cpu(); #ifndef CONFIG_SMP - printk("CPU: "); + printk(KERN_INFO "CPU: "); print_cpu_info(&boot_cpu_data); #endif check_config(); diff --git a/arch/x86/kernel/cpu/bugs_64.c b/arch/x86/kernel/cpu/bugs_64.c index 9a3ed0649d4..04f0fe5af83 100644 --- a/arch/x86/kernel/cpu/bugs_64.c +++ b/arch/x86/kernel/cpu/bugs_64.c @@ -15,7 +15,7 @@ void __init check_bugs(void) { identify_boot_cpu(); #if !defined(CONFIG_SMP) - printk("CPU: "); + printk(KERN_INFO "CPU: "); print_cpu_info(&boot_cpu_data); #endif alternative_instructions(); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f1961c07af9..cc25c2b4a56 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -13,13 +13,13 @@ #include <linux/io.h> #include <asm/stackprotector.h> -#include <asm/perf_counter.h> +#include <asm/perf_event.h> #include <asm/mmu_context.h> #include <asm/hypervisor.h> #include <asm/processor.h> #include <asm/sections.h> -#include <asm/topology.h> -#include <asm/cpumask.h> +#include <linux/topology.h> +#include <linux/cpumask.h> #include <asm/pgtable.h> #include <asm/atomic.h> #include <asm/proto.h> @@ -28,13 +28,12 @@ #include <asm/desc.h> #include <asm/i387.h> #include <asm/mtrr.h> -#include <asm/numa.h> +#include <linux/numa.h> #include <asm/asm.h> #include <asm/cpu.h> #include <asm/mce.h> #include <asm/msr.h> #include <asm/pat.h> -#include <asm/smp.h> #ifdef CONFIG_X86_LOCAL_APIC #include <asm/uv/uv.h> @@ -59,7 +58,30 @@ void __init setup_cpu_local_masks(void) alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask); } -static const struct cpu_dev *this_cpu __cpuinitdata; +static void __cpuinit default_init(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_64 + display_cacheinfo(c); +#else + /* Not much we can do here... */ + /* Check if at least it has cpuid */ + if (c->cpuid_level == -1) { + /* No cpuid. It must be an ancient CPU */ + if (c->x86 == 4) + strcpy(c->x86_model_id, "486"); + else if (c->x86 == 3) + strcpy(c->x86_model_id, "386"); + } +#endif +} + +static const struct cpu_dev __cpuinitconst default_cpu = { + .c_init = default_init, + .c_vendor = "Unknown", + .c_x86_vendor = X86_VENDOR_UNKNOWN, +}; + +static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu; DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { #ifdef CONFIG_X86_64 @@ -71,45 +93,45 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { * TLS descriptors are currently at a different place compared to i386. * Hopefully nobody expects them at a fixed place (Wine?) */ - [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } }, - [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } }, - [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } }, - [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } }, - [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } }, - [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } }, + [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff), + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff), + [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(0xc0fb, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f3, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xa0fb, 0, 0xfffff), #else - [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, - [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, - [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, - [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } }, + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xc09a, 0, 0xfffff), + [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xc0fa, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f2, 0, 0xfffff), /* * Segments used for calling PnP BIOS have byte granularity. * They code segments and data segments have fixed 64k limits, * the transfer segment sizes are set at run time. */ /* 32-bit code */ - [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } }, + [GDT_ENTRY_PNPBIOS_CS32] = GDT_ENTRY_INIT(0x409a, 0, 0xffff), /* 16-bit code */ - [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } }, + [GDT_ENTRY_PNPBIOS_CS16] = GDT_ENTRY_INIT(0x009a, 0, 0xffff), /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } }, + [GDT_ENTRY_PNPBIOS_DS] = GDT_ENTRY_INIT(0x0092, 0, 0xffff), /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } }, + [GDT_ENTRY_PNPBIOS_TS1] = GDT_ENTRY_INIT(0x0092, 0, 0), /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } }, + [GDT_ENTRY_PNPBIOS_TS2] = GDT_ENTRY_INIT(0x0092, 0, 0), /* * The APM segments have byte granularity and their bases * are set at run time. All have 64k limits. */ /* 32-bit code */ - [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } }, + [GDT_ENTRY_APMBIOS_BASE] = GDT_ENTRY_INIT(0x409a, 0, 0xffff), /* 16-bit code */ - [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } }, + [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(0x009a, 0, 0xffff), /* data */ - [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } }, + [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(0x4092, 0, 0xffff), - [GDT_ENTRY_ESPFIX_SS] = { { { 0x0000ffff, 0x00cf9200 } } }, - [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } }, + [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), + [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), GDT_STACK_CANARY_INIT #endif } }; @@ -332,29 +354,6 @@ void switch_to_new_gdt(int cpu) static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {}; -static void __cpuinit default_init(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_X86_64 - display_cacheinfo(c); -#else - /* Not much we can do here... */ - /* Check if at least it has cpuid */ - if (c->cpuid_level == -1) { - /* No cpuid. It must be an ancient CPU */ - if (c->x86 == 4) - strcpy(c->x86_model_id, "486"); - else if (c->x86 == 3) - strcpy(c->x86_model_id, "386"); - } -#endif -} - -static const struct cpu_dev __cpuinitconst default_cpu = { - .c_init = default_init, - .c_vendor = "Unknown", - .c_x86_vendor = X86_VENDOR_UNKNOWN, -}; - static void __cpuinit get_model_name(struct cpuinfo_x86 *c) { unsigned int *v; @@ -870,7 +869,7 @@ void __init identify_boot_cpu(void) #else vgetcpu_set_mode(); #endif - init_hw_perf_counters(); + init_hw_perf_events(); } void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) @@ -982,18 +981,26 @@ static __init int setup_disablecpuid(char *arg) __setup("clearcpuid=", setup_disablecpuid); #ifdef CONFIG_X86_64 -struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; +struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table }; DEFINE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __aligned(PAGE_SIZE); -DEFINE_PER_CPU(char *, irq_stack_ptr) = - init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64; +/* + * The following four percpu variables are hot. Align current_task to + * cacheline size such that all four fall in the same cacheline. + */ +DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = + &init_task; +EXPORT_PER_CPU_SYMBOL(current_task); DEFINE_PER_CPU(unsigned long, kernel_stack) = (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE; EXPORT_PER_CPU_SYMBOL(kernel_stack); +DEFINE_PER_CPU(char *, irq_stack_ptr) = + init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64; + DEFINE_PER_CPU(unsigned int, irq_count) = -1; /* @@ -1008,8 +1015,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { }; static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks - [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]) - __aligned(PAGE_SIZE); + [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); /* May not be marked __init: used by software suspend */ void syscall_init(void) @@ -1042,8 +1048,11 @@ DEFINE_PER_CPU(struct orig_ist, orig_ist); #else /* CONFIG_X86_64 */ +DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; +EXPORT_PER_CPU_SYMBOL(current_task); + #ifdef CONFIG_CC_STACKPROTECTOR -DEFINE_PER_CPU(unsigned long, stack_canary); +DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); #endif /* Make sure %fs and %gs are initialized properly in idle threads */ diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c index 6b2a52dd040..dca325c0399 100644 --- a/arch/x86/kernel/cpu/cpu_debug.c +++ b/arch/x86/kernel/cpu/cpu_debug.c @@ -30,8 +30,8 @@ #include <asm/apic.h> #include <asm/desc.h> -static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]); -static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]); +static DEFINE_PER_CPU(struct cpu_cpuX_base [CPU_REG_ALL_BIT], cpu_arr); +static DEFINE_PER_CPU(struct cpu_private * [MAX_CPU_FILES], priv_arr); static DEFINE_PER_CPU(int, cpu_priv_count); static DEFINE_MUTEX(cpu_debug_lock); diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index ae9b503220c..7d5c3b0ea8d 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -33,7 +33,7 @@ #include <linux/cpufreq.h> #include <linux/compiler.h> #include <linux/dmi.h> -#include <trace/power.h> +#include <trace/events/power.h> #include <linux/acpi.h> #include <linux/io.h> @@ -60,7 +60,6 @@ enum { }; #define INTEL_MSR_RANGE (0xffff) -#define CPUID_6_ECX_APERFMPERF_CAPABILITY (0x1) struct acpi_cpufreq_data { struct acpi_processor_performance *acpi_data; @@ -71,13 +70,7 @@ struct acpi_cpufreq_data { static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data); -struct acpi_msr_data { - u64 saved_aperf, saved_mperf; -}; - -static DEFINE_PER_CPU(struct acpi_msr_data, msr_data); - -DEFINE_TRACE(power_mark); +static DEFINE_PER_CPU(struct aperfmperf, old_perf); /* acpi_perf_data is a pointer to percpu data. */ static struct acpi_processor_performance *acpi_perf_data; @@ -244,23 +237,12 @@ static u32 get_cur_val(const struct cpumask *mask) return cmd.val; } -struct perf_pair { - union { - struct { - u32 lo; - u32 hi; - } split; - u64 whole; - } aperf, mperf; -}; - /* Called via smp_call_function_single(), on the target CPU */ static void read_measured_perf_ctrs(void *_cur) { - struct perf_pair *cur = _cur; + struct aperfmperf *am = _cur; - rdmsr(MSR_IA32_APERF, cur->aperf.split.lo, cur->aperf.split.hi); - rdmsr(MSR_IA32_MPERF, cur->mperf.split.lo, cur->mperf.split.hi); + get_aperfmperf(am); } /* @@ -279,63 +261,17 @@ static void read_measured_perf_ctrs(void *_cur) static unsigned int get_measured_perf(struct cpufreq_policy *policy, unsigned int cpu) { - struct perf_pair readin, cur; - unsigned int perf_percent; + struct aperfmperf perf; + unsigned long ratio; unsigned int retval; - if (smp_call_function_single(cpu, read_measured_perf_ctrs, &readin, 1)) + if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1)) return 0; - cur.aperf.whole = readin.aperf.whole - - per_cpu(msr_data, cpu).saved_aperf; - cur.mperf.whole = readin.mperf.whole - - per_cpu(msr_data, cpu).saved_mperf; - per_cpu(msr_data, cpu).saved_aperf = readin.aperf.whole; - per_cpu(msr_data, cpu).saved_mperf = readin.mperf.whole; - -#ifdef __i386__ - /* - * We dont want to do 64 bit divide with 32 bit kernel - * Get an approximate value. Return failure in case we cannot get - * an approximate value. - */ - if (unlikely(cur.aperf.split.hi || cur.mperf.split.hi)) { - int shift_count; - u32 h; - - h = max_t(u32, cur.aperf.split.hi, cur.mperf.split.hi); - shift_count = fls(h); - - cur.aperf.whole >>= shift_count; - cur.mperf.whole >>= shift_count; - } - - if (((unsigned long)(-1) / 100) < cur.aperf.split.lo) { - int shift_count = 7; - cur.aperf.split.lo >>= shift_count; - cur.mperf.split.lo >>= shift_count; - } - - if (cur.aperf.split.lo && cur.mperf.split.lo) - perf_percent = (cur.aperf.split.lo * 100) / cur.mperf.split.lo; - else - perf_percent = 0; - -#else - if (unlikely(((unsigned long)(-1) / 100) < cur.aperf.whole)) { - int shift_count = 7; - cur.aperf.whole >>= shift_count; - cur.mperf.whole >>= shift_count; - } - - if (cur.aperf.whole && cur.mperf.whole) - perf_percent = (cur.aperf.whole * 100) / cur.mperf.whole; - else - perf_percent = 0; - -#endif + ratio = calc_aperfmperf_ratio(&per_cpu(old_perf, cpu), &perf); + per_cpu(old_perf, cpu) = perf; - retval = (policy->cpuinfo.max_freq * perf_percent) / 100; + retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT; return retval; } @@ -394,7 +330,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, unsigned int next_perf_state = 0; /* Index into perf table */ unsigned int i; int result = 0; - struct power_trace it; dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu); @@ -426,7 +361,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, } } - trace_power_mark(&it, POWER_PSTATE, next_perf_state); + trace_power_frequency(POWER_PSTATE, data->freq_table[next_state].frequency); switch (data->cpu_feature) { case SYSTEM_INTEL_MSR_CAPABLE: @@ -588,6 +523,21 @@ static const struct dmi_system_id sw_any_bug_dmi_table[] = { }, { } }; + +static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c) +{ + /* http://www.intel.com/Assets/PDF/specupdate/314554.pdf + * AL30: A Machine Check Exception (MCE) Occurring during an + * Enhanced Intel SpeedStep Technology Ratio Change May Cause + * Both Processor Cores to Lock Up when HT is enabled*/ + if (c->x86_vendor == X86_VENDOR_INTEL) { + if ((c->x86 == 15) && + (c->x86_model == 6) && + (c->x86_mask == 8) && smt_capable()) + return -ENODEV; + } + return 0; +} #endif static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) @@ -602,6 +552,12 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) dprintk("acpi_cpufreq_cpu_init\n"); +#ifdef CONFIG_SMP + result = acpi_cpufreq_blacklist(c); + if (result) + return result; +#endif + data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL); if (!data) return -ENOMEM; @@ -731,12 +687,8 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) acpi_processor_notify_smm(THIS_MODULE); /* Check for APERF/MPERF support in hardware */ - if (c->x86_vendor == X86_VENDOR_INTEL && c->cpuid_level >= 6) { - unsigned int ecx; - ecx = cpuid_ecx(6); - if (ecx & CPUID_6_ECX_APERFMPERF_CAPABILITY) - acpi_cpufreq_driver.getavg = get_measured_perf; - } + if (cpu_has(c, X86_FEATURE_APERFMPERF)) + acpi_cpufreq_driver.getavg = get_measured_perf; dprintk("CPU%u - ACPI performance management activated.\n", cpu); for (i = 0; i < perf->state_count; i++) diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 2a50ef89100..6394aa5c798 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -605,9 +605,10 @@ static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, return 0; } -static void invalidate_entry(struct powernow_k8_data *data, unsigned int entry) +static void invalidate_entry(struct cpufreq_frequency_table *powernow_table, + unsigned int entry) { - data->powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID; + powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID; } static void print_basics(struct powernow_k8_data *data) @@ -854,6 +855,10 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) goto err_out; } + /* fill in data */ + data->numps = data->acpi_data.state_count; + powernow_k8_acpi_pst_values(data, 0); + if (cpu_family == CPU_HW_PSTATE) ret_val = fill_powernow_table_pstate(data, powernow_table); else @@ -866,11 +871,8 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) powernow_table[data->acpi_data.state_count].index = 0; data->powernow_table = powernow_table; - /* fill in data */ - data->numps = data->acpi_data.state_count; if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu) print_basics(data); - powernow_k8_acpi_pst_values(data, 0); /* notify BIOS that we exist */ acpi_processor_notify_smm(THIS_MODULE); @@ -914,13 +916,13 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data, "bad value %d.\n", i, index); printk(KERN_ERR PFX "Please report to BIOS " "manufacturer\n"); - invalidate_entry(data, i); + invalidate_entry(powernow_table, i); continue; } rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi); if (!(hi & HW_PSTATE_VALID_MASK)) { dprintk("invalid pstate %d, ignoring\n", index); - invalidate_entry(data, i); + invalidate_entry(powernow_table, i); continue; } @@ -941,7 +943,6 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table) { int i; - int cntlofreq = 0; for (i = 0; i < data->acpi_data.state_count; i++) { u32 fid; @@ -970,7 +971,7 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, /* verify frequency is OK */ if ((freq > (MAX_FREQ * 1000)) || (freq < (MIN_FREQ * 1000))) { dprintk("invalid freq %u kHz, ignoring\n", freq); - invalidate_entry(data, i); + invalidate_entry(powernow_table, i); continue; } @@ -978,38 +979,17 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, * BIOSs are using "off" to indicate invalid */ if (vid == VID_OFF) { dprintk("invalid vid %u, ignoring\n", vid); - invalidate_entry(data, i); + invalidate_entry(powernow_table, i); continue; } - /* verify only 1 entry from the lo frequency table */ - if (fid < HI_FID_TABLE_BOTTOM) { - if (cntlofreq) { - /* if both entries are the same, - * ignore this one ... */ - if ((freq != powernow_table[cntlofreq].frequency) || - (index != powernow_table[cntlofreq].index)) { - printk(KERN_ERR PFX - "Too many lo freq table " - "entries\n"); - return 1; - } - - dprintk("double low frequency table entry, " - "ignoring it.\n"); - invalidate_entry(data, i); - continue; - } else - cntlofreq = i; - } - if (freq != (data->acpi_data.states[i].core_frequency * 1000)) { printk(KERN_INFO PFX "invalid freq entries " "%u kHz vs. %u kHz\n", freq, (unsigned int) (data->acpi_data.states[i].core_frequency * 1000)); - invalidate_entry(data, i); + invalidate_entry(powernow_table, i); continue; } } diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 593171e967e..19807b89f05 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -3,10 +3,10 @@ #include <linux/delay.h> #include <linux/pci.h> #include <asm/dma.h> -#include <asm/io.h> +#include <linux/io.h> #include <asm/processor-cyrix.h> #include <asm/processor-flags.h> -#include <asm/timer.h> +#include <linux/timer.h> #include <asm/pci-direct.h> #include <asm/tsc.h> @@ -282,7 +282,8 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) * The 5510/5520 companion chips have a funky PIT. */ if (vendor == PCI_VENDOR_ID_CYRIX && - (device == PCI_DEVICE_ID_CYRIX_5510 || device == PCI_DEVICE_ID_CYRIX_5520)) + (device == PCI_DEVICE_ID_CYRIX_5510 || + device == PCI_DEVICE_ID_CYRIX_5520)) mark_tsc_unstable("cyrix 5510/5520 detected"); } #endif @@ -299,7 +300,8 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) * ? : 0x7x * GX1 : 0x8x GX1 datasheet 56 */ - if ((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <= dir1 && dir1 <= 0x8f)) + if ((0x30 <= dir1 && dir1 <= 0x6f) || + (0x80 <= dir1 && dir1 <= 0x8f)) geode_configure(); return; } else { /* MediaGX */ @@ -427,9 +429,12 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c) printk(KERN_INFO "Enabling CPUID on Cyrix processor.\n"); local_irq_save(flags); ccr3 = getCx86(CX86_CCR3); - setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ - setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x80); /* enable cpuid */ - setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ + /* enable MAPEN */ + setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); + /* enable cpuid */ + setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x80); + /* disable MAPEN */ + setCx86(CX86_CCR3, ccr3); local_irq_restore(flags); } } diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index fb5b86af0b0..08be922de33 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -28,18 +28,10 @@ static inline void __cpuinit detect_hypervisor_vendor(struct cpuinfo_x86 *c) { - if (vmware_platform()) { + if (vmware_platform()) c->x86_hyper_vendor = X86_HYPER_VENDOR_VMWARE; - } else { + else c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE; - } -} - -unsigned long get_hypervisor_tsc_freq(void) -{ - if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) - return vmware_get_tsc_khz(); - return 0; } static inline void __cpuinit @@ -56,3 +48,10 @@ void __cpuinit init_hypervisor(struct cpuinfo_x86 *c) detect_hypervisor_vendor(c); hypervisor_set_feature_bits(c); } + +void __init init_hypervisor_platform(void) +{ + init_hypervisor(&boot_cpu_data); + if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) + vmware_platform_setup(); +} diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 3260ab04499..40e1835b35e 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -7,17 +7,17 @@ #include <linux/sched.h> #include <linux/thread_info.h> #include <linux/module.h> +#include <linux/uaccess.h> #include <asm/processor.h> #include <asm/pgtable.h> #include <asm/msr.h> -#include <asm/uaccess.h> #include <asm/ds.h> #include <asm/bugs.h> #include <asm/cpu.h> #ifdef CONFIG_X86_64 -#include <asm/topology.h> +#include <linux/topology.h> #include <asm/numa_64.h> #endif @@ -174,7 +174,8 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_F00F_BUG /* * All current models of Pentium and Pentium with MMX technology CPUs - * have the F0 0F bug, which lets nonprivileged users lock up the system. + * have the F0 0F bug, which lets nonprivileged users lock up the + * system. * Note that the workaround only should be initialized once... */ c->f00f_bug = 0; @@ -207,7 +208,7 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n"); printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n"); lo |= MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE; - wrmsr (MSR_IA32_MISC_ENABLE, lo, hi); + wrmsr(MSR_IA32_MISC_ENABLE, lo, hi); } } @@ -283,7 +284,7 @@ static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) /* Intel has a non-standard dependency on %ecx for this CPUID level. */ cpuid_count(4, 0, &eax, &ebx, &ecx, &edx); if (eax & 0x1f) - return ((eax >> 26) + 1); + return (eax >> 26) + 1; else return 1; } @@ -349,6 +350,12 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); } + if (c->cpuid_level > 6) { + unsigned ecx = cpuid_ecx(6); + if (ecx & 0x01) + set_cpu_cap(c, X86_FEATURE_APERFMPERF); + } + if (cpu_has_xmm2) set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); if (cpu_has_ds) { diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 789efe217e1..804c40e2bc3 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -3,7 +3,7 @@ * * Changes: * Venkatesh Pallipadi : Adding cache identification through cpuid(4) - * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure. + * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure. * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. */ @@ -16,7 +16,7 @@ #include <linux/pci.h> #include <asm/processor.h> -#include <asm/smp.h> +#include <linux/smp.h> #include <asm/k8.h> #define LVL_1_INST 1 @@ -25,14 +25,15 @@ #define LVL_3 4 #define LVL_TRACE 5 -struct _cache_table -{ +struct _cache_table { unsigned char descriptor; char cache_type; short size; }; -/* all the cache descriptor types we care about (no TLB or trace cache entries) */ +/* All the cache descriptor types we care about (no TLB or + trace cache entries) */ + static const struct _cache_table __cpuinitconst cache_table[] = { { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */ @@ -105,8 +106,7 @@ static const struct _cache_table __cpuinitconst cache_table[] = }; -enum _cache_type -{ +enum _cache_type { CACHE_TYPE_NULL = 0, CACHE_TYPE_DATA = 1, CACHE_TYPE_INST = 2, @@ -170,31 +170,31 @@ unsigned short num_cache_leaves; Maybe later */ union l1_cache { struct { - unsigned line_size : 8; - unsigned lines_per_tag : 8; - unsigned assoc : 8; - unsigned size_in_kb : 8; + unsigned line_size:8; + unsigned lines_per_tag:8; + unsigned assoc:8; + unsigned size_in_kb:8; }; unsigned val; }; union l2_cache { struct { - unsigned line_size : 8; - unsigned lines_per_tag : 4; - unsigned assoc : 4; - unsigned size_in_kb : 16; + unsigned line_size:8; + unsigned lines_per_tag:4; + unsigned assoc:4; + unsigned size_in_kb:16; }; unsigned val; }; union l3_cache { struct { - unsigned line_size : 8; - unsigned lines_per_tag : 4; - unsigned assoc : 4; - unsigned res : 2; - unsigned size_encoded : 14; + unsigned line_size:8; + unsigned lines_per_tag:4; + unsigned assoc:4; + unsigned res:2; + unsigned size_encoded:14; }; unsigned val; }; @@ -241,7 +241,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, case 0: if (!l1->val) return; - assoc = l1->assoc; + assoc = assocs[l1->assoc]; line_size = l1->line_size; lines_per_tag = l1->lines_per_tag; size_in_kb = l1->size_in_kb; @@ -249,7 +249,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, case 2: if (!l2.val) return; - assoc = l2.assoc; + assoc = assocs[l2.assoc]; line_size = l2.line_size; lines_per_tag = l2.lines_per_tag; /* cpu_data has errata corrections for K7 applied */ @@ -258,10 +258,14 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, case 3: if (!l3.val) return; - assoc = l3.assoc; + assoc = assocs[l3.assoc]; line_size = l3.line_size; lines_per_tag = l3.lines_per_tag; size_in_kb = l3.size_encoded * 512; + if (boot_cpu_has(X86_FEATURE_AMD_DCM)) { + size_in_kb = size_in_kb >> 1; + assoc = assoc >> 1; + } break; default: return; @@ -270,18 +274,14 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, eax->split.is_self_initializing = 1; eax->split.type = types[leaf]; eax->split.level = levels[leaf]; - if (leaf == 3) - eax->split.num_threads_sharing = - current_cpu_data.x86_max_cores - 1; - else - eax->split.num_threads_sharing = 0; + eax->split.num_threads_sharing = 0; eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; - if (assoc == 0xf) + if (assoc == 0xffff) eax->split.is_fully_associative = 1; ebx->split.coherency_line_size = line_size - 1; - ebx->split.ways_of_associativity = assocs[assoc] - 1; + ebx->split.ways_of_associativity = assoc - 1; ebx->split.physical_line_partition = lines_per_tag - 1; ecx->split.number_of_sets = (size_in_kb * 1024) / line_size / (ebx->split.ways_of_associativity + 1) - 1; @@ -350,7 +350,8 @@ static int __cpuinit find_num_cache_leaves(void) unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) { - unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; /* Cache sizes */ + /* Cache sizes */ + unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */ unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb; @@ -377,8 +378,8 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) retval = cpuid4_cache_lookup_regs(i, &this_leaf); if (retval >= 0) { - switch(this_leaf.eax.split.level) { - case 1: + switch (this_leaf.eax.split.level) { + case 1: if (this_leaf.eax.split.type == CACHE_TYPE_DATA) new_l1d = this_leaf.size/1024; @@ -386,19 +387,20 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) CACHE_TYPE_INST) new_l1i = this_leaf.size/1024; break; - case 2: + case 2: new_l2 = this_leaf.size/1024; num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; index_msb = get_count_order(num_threads_sharing); l2_id = c->apicid >> index_msb; break; - case 3: + case 3: new_l3 = this_leaf.size/1024; num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; - index_msb = get_count_order(num_threads_sharing); + index_msb = get_count_order( + num_threads_sharing); l3_id = c->apicid >> index_msb; break; - default: + default: break; } } @@ -421,22 +423,21 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) /* Number of times to iterate */ n = cpuid_eax(2) & 0xFF; - for ( i = 0 ; i < n ; i++ ) { + for (i = 0 ; i < n ; i++) { cpuid(2, ®s[0], ®s[1], ®s[2], ®s[3]); /* If bit 31 is set, this is an unknown format */ - for ( j = 0 ; j < 3 ; j++ ) { - if (regs[j] & (1 << 31)) regs[j] = 0; - } + for (j = 0 ; j < 3 ; j++) + if (regs[j] & (1 << 31)) + regs[j] = 0; /* Byte 0 is level count, not a descriptor */ - for ( j = 1 ; j < 16 ; j++ ) { + for (j = 1 ; j < 16 ; j++) { unsigned char des = dp[j]; unsigned char k = 0; /* look up this descriptor in the table */ - while (cache_table[k].descriptor != 0) - { + while (cache_table[k].descriptor != 0) { if (cache_table[k].descriptor == des) { if (only_trace && cache_table[k].cache_type != LVL_TRACE) break; @@ -488,14 +489,14 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) } if (trace) - printk (KERN_INFO "CPU: Trace cache: %dK uops", trace); - else if ( l1i ) - printk (KERN_INFO "CPU: L1 I cache: %dK", l1i); + printk(KERN_INFO "CPU: Trace cache: %dK uops", trace); + else if (l1i) + printk(KERN_INFO "CPU: L1 I cache: %dK", l1i); if (l1d) - printk(", L1 D cache: %dK\n", l1d); + printk(KERN_CONT ", L1 D cache: %dK\n", l1d); else - printk("\n"); + printk(KERN_CONT "\n"); if (l2) printk(KERN_INFO "CPU: L2 cache: %dK\n", l2); @@ -522,6 +523,18 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) int index_msb, i; struct cpuinfo_x86 *c = &cpu_data(cpu); + if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) { + struct cpuinfo_x86 *d; + for_each_online_cpu(i) { + if (!per_cpu(cpuid4_info, i)) + continue; + d = &cpu_data(i); + this_leaf = CPUID4_INFO_IDX(i, index); + cpumask_copy(to_cpumask(this_leaf->shared_cpu_map), + d->llc_shared_map); + } + return; + } this_leaf = CPUID4_INFO_IDX(cpu, index); num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing; @@ -558,8 +571,13 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index) } } #else -static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) {} -static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index) {} +static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) +{ +} + +static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index) +{ +} #endif static void __cpuinit free_cache_attributes(unsigned int cpu) @@ -645,7 +663,7 @@ static DEFINE_PER_CPU(struct _index_kobject *, index_kobject); static ssize_t show_##file_name \ (struct _cpuid4_info *this_leaf, char *buf) \ { \ - return sprintf (buf, "%lu\n", (unsigned long)this_leaf->object + val); \ + return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \ } show_one_plus(level, eax.split.level, 0); @@ -656,7 +674,7 @@ show_one_plus(number_of_sets, ecx.split.number_of_sets, 1); static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf) { - return sprintf (buf, "%luK\n", this_leaf->size / 1024); + return sprintf(buf, "%luK\n", this_leaf->size / 1024); } static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf, @@ -669,7 +687,7 @@ static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf, const struct cpumask *mask; mask = to_cpumask(this_leaf->shared_cpu_map); - n = type? + n = type ? cpulist_scnprintf(buf, len-2, mask) : cpumask_scnprintf(buf, len-2, mask); buf[n++] = '\n'; @@ -800,7 +818,7 @@ static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, show_cache_disable_1, store_cache_disable_1); -static struct attribute * default_attrs[] = { +static struct attribute *default_attrs[] = { &type.attr, &level.attr, &coherency_line_size.attr, @@ -815,7 +833,7 @@ static struct attribute * default_attrs[] = { NULL }; -static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf) +static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) { struct _cache_attr *fattr = to_attr(attr); struct _index_kobject *this_leaf = to_object(kobj); @@ -828,8 +846,8 @@ static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf) return ret; } -static ssize_t store(struct kobject * kobj, struct attribute * attr, - const char * buf, size_t count) +static ssize_t store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) { struct _cache_attr *fattr = to_attr(attr); struct _index_kobject *this_leaf = to_object(kobj); @@ -883,7 +901,7 @@ static int __cpuinit cpuid4_cache_sysfs_init(unsigned int cpu) goto err_out; per_cpu(index_kobject, cpu) = kzalloc( - sizeof(struct _index_kobject ) * num_cache_leaves, GFP_KERNEL); + sizeof(struct _index_kobject) * num_cache_leaves, GFP_KERNEL); if (unlikely(per_cpu(index_kobject, cpu) == NULL)) goto err_out; @@ -917,7 +935,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) } for (i = 0; i < num_cache_leaves; i++) { - this_object = INDEX_KOBJECT_PTR(cpu,i); + this_object = INDEX_KOBJECT_PTR(cpu, i); this_object->cpu = cpu; this_object->index = i; retval = kobject_init_and_add(&(this_object->kobj), @@ -925,9 +943,8 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) per_cpu(cache_kobject, cpu), "index%1lu", i); if (unlikely(retval)) { - for (j = 0; j < i; j++) { - kobject_put(&(INDEX_KOBJECT_PTR(cpu,j)->kobj)); - } + for (j = 0; j < i; j++) + kobject_put(&(INDEX_KOBJECT_PTR(cpu, j)->kobj)); kobject_put(per_cpu(cache_kobject, cpu)); cpuid4_cache_sysfs_exit(cpu); return retval; @@ -952,7 +969,7 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) cpumask_clear_cpu(cpu, to_cpumask(cache_dev_map)); for (i = 0; i < num_cache_leaves; i++) - kobject_put(&(INDEX_KOBJECT_PTR(cpu,i)->kobj)); + kobject_put(&(INDEX_KOBJECT_PTR(cpu, i)->kobj)); kobject_put(per_cpu(cache_kobject, cpu)); cpuid4_cache_sysfs_exit(cpu); } @@ -977,8 +994,7 @@ static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata cacheinfo_cpu_notifier = -{ +static struct notifier_block __cpuinitdata cacheinfo_cpu_notifier = { .notifier_call = cacheinfo_cpu_callback, }; diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index 188a1ca5ad2..4ac6d48fe11 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -1,11 +1,8 @@ -obj-y = mce.o +obj-y = mce.o mce-severity.o -obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o -obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o -obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c deleted file mode 100644 index b945d5dbc60..00000000000 --- a/arch/x86/kernel/cpu/mcheck/k7.c +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Athlon specific Machine Check Exception Reporting - * (C) Copyright 2002 Dave Jones <davej@redhat.com> - */ -#include <linux/interrupt.h> -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/smp.h> - -#include <asm/processor.h> -#include <asm/system.h> -#include <asm/mce.h> -#include <asm/msr.h> - -/* Machine Check Handler For AMD Athlon/Duron: */ -static void k7_machine_check(struct pt_regs *regs, long error_code) -{ - u32 alow, ahigh, high, low; - u32 mcgstl, mcgsth; - int recover = 1; - int i; - - rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); - if (mcgstl & (1<<0)) /* Recoverable ? */ - recover = 0; - - printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", - smp_processor_id(), mcgsth, mcgstl); - - for (i = 1; i < nr_mce_banks; i++) { - rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - if (high & (1<<31)) { - char misc[20]; - char addr[24]; - - misc[0] = '\0'; - addr[0] = '\0'; - - if (high & (1<<29)) - recover |= 1; - if (high & (1<<25)) - recover |= 2; - high &= ~(1<<31); - - if (high & (1<<27)) { - rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); - snprintf(misc, 20, "[%08x%08x]", ahigh, alow); - } - if (high & (1<<26)) { - rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); - snprintf(addr, 24, " at %08x%08x", ahigh, alow); - } - - printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", - smp_processor_id(), i, high, low, misc, addr); - - /* Clear it: */ - wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); - /* Serialize: */ - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } - } - - if (recover & 2) - panic("CPU context corrupt"); - if (recover & 1) - panic("Unable to continue"); - - printk(KERN_EMERG "Attempting to continue.\n"); - - mcgstl &= ~(1<<2); - wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); -} - - -/* AMD K7 machine check is Intel like: */ -void amd_mcheck_init(struct cpuinfo_x86 *c) -{ - u32 l, h; - int i; - - if (!cpu_has(c, X86_FEATURE_MCE)) - return; - - machine_check_vector = k7_machine_check; - /* Make sure the vector pointer is visible before we enable MCEs: */ - wmb(); - - printk(KERN_INFO "Intel machine check architecture supported.\n"); - - rdmsr(MSR_IA32_MCG_CAP, l, h); - if (l & (1<<8)) /* Control register present ? */ - wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); - nr_mce_banks = l & 0xff; - - /* - * Clear status for MC index 0 separately, we don't touch CTL, - * as some K7 Athlons cause spurious MCEs when its enabled: - */ - if (boot_cpu_data.x86 == 6) { - wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0); - i = 1; - } else - i = 0; - - for (; i < nr_mce_banks; i++) { - wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); - wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); - } - - set_in_cr4(X86_CR4_MCE); - printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", - smp_processor_id()); -} diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index a3a235a53f0..472763d9209 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -18,7 +18,12 @@ #include <linux/string.h> #include <linux/fs.h> #include <linux/smp.h> +#include <linux/notifier.h> +#include <linux/kdebug.h> +#include <linux/cpu.h> +#include <linux/sched.h> #include <asm/mce.h> +#include <asm/apic.h> /* Update fake mce registers on current CPU. */ static void inject_mce(struct mce *m) @@ -39,44 +44,142 @@ static void inject_mce(struct mce *m) i->finished = 1; } -struct delayed_mce { - struct timer_list timer; - struct mce m; -}; +static void raise_poll(struct mce *m) +{ + unsigned long flags; + mce_banks_t b; -/* Inject mce on current CPU */ -static void raise_mce(unsigned long data) + memset(&b, 0xff, sizeof(mce_banks_t)); + local_irq_save(flags); + machine_check_poll(0, &b); + local_irq_restore(flags); + m->finished = 0; +} + +static void raise_exception(struct mce *m, struct pt_regs *pregs) { - struct delayed_mce *dm = (struct delayed_mce *)data; - struct mce *m = &dm->m; - int cpu = m->extcpu; + struct pt_regs regs; + unsigned long flags; - inject_mce(m); - if (m->status & MCI_STATUS_UC) { - struct pt_regs regs; + if (!pregs) { memset(®s, 0, sizeof(struct pt_regs)); regs.ip = m->ip; regs.cs = m->cs; + pregs = ®s; + } + /* in mcheck exeception handler, irq will be disabled */ + local_irq_save(flags); + do_machine_check(pregs, 0); + local_irq_restore(flags); + m->finished = 0; +} + +static cpumask_t mce_inject_cpumask; + +static int mce_raise_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct die_args *args = (struct die_args *)data; + int cpu = smp_processor_id(); + struct mce *m = &__get_cpu_var(injectm); + if (val != DIE_NMI_IPI || !cpu_isset(cpu, mce_inject_cpumask)) + return NOTIFY_DONE; + cpu_clear(cpu, mce_inject_cpumask); + if (m->inject_flags & MCJ_EXCEPTION) + raise_exception(m, args->regs); + else if (m->status) + raise_poll(m); + return NOTIFY_STOP; +} + +static struct notifier_block mce_raise_nb = { + .notifier_call = mce_raise_notify, + .priority = 1000, +}; + +/* Inject mce on current CPU */ +static int raise_local(void) +{ + struct mce *m = &__get_cpu_var(injectm); + int context = MCJ_CTX(m->inject_flags); + int ret = 0; + int cpu = m->extcpu; + + if (m->inject_flags & MCJ_EXCEPTION) { printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu); - do_machine_check(®s, 0); + switch (context) { + case MCJ_CTX_IRQ: + /* + * Could do more to fake interrupts like + * calling irq_enter, but the necessary + * machinery isn't exported currently. + */ + /*FALL THROUGH*/ + case MCJ_CTX_PROCESS: + raise_exception(m, NULL); + break; + default: + printk(KERN_INFO "Invalid MCE context\n"); + ret = -EINVAL; + } printk(KERN_INFO "MCE exception done on CPU %d\n", cpu); - } else { - mce_banks_t b; - memset(&b, 0xff, sizeof(mce_banks_t)); + } else if (m->status) { printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu); - machine_check_poll(0, &b); + raise_poll(m); mce_notify_irq(); - printk(KERN_INFO "Finished machine check poll on CPU %d\n", - cpu); - } - kfree(dm); + printk(KERN_INFO "Machine check poll done on CPU %d\n", cpu); + } else + m->finished = 0; + + return ret; +} + +static void raise_mce(struct mce *m) +{ + int context = MCJ_CTX(m->inject_flags); + + inject_mce(m); + + if (context == MCJ_CTX_RANDOM) + return; + +#ifdef CONFIG_X86_LOCAL_APIC + if (m->inject_flags & MCJ_NMI_BROADCAST) { + unsigned long start; + int cpu; + get_online_cpus(); + mce_inject_cpumask = cpu_online_map; + cpu_clear(get_cpu(), mce_inject_cpumask); + for_each_online_cpu(cpu) { + struct mce *mcpu = &per_cpu(injectm, cpu); + if (!mcpu->finished || + MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM) + cpu_clear(cpu, mce_inject_cpumask); + } + if (!cpus_empty(mce_inject_cpumask)) + apic->send_IPI_mask(&mce_inject_cpumask, NMI_VECTOR); + start = jiffies; + while (!cpus_empty(mce_inject_cpumask)) { + if (!time_before(jiffies, start + 2*HZ)) { + printk(KERN_ERR + "Timeout waiting for mce inject NMI %lx\n", + *cpus_addr(mce_inject_cpumask)); + break; + } + cpu_relax(); + } + raise_local(); + put_cpu(); + put_online_cpus(); + } else +#endif + raise_local(); } /* Error injection interface */ static ssize_t mce_write(struct file *filp, const char __user *ubuf, size_t usize, loff_t *off) { - struct delayed_mce *dm; struct mce m; if (!capable(CAP_SYS_ADMIN)) @@ -96,19 +199,12 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf, if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu)) return -EINVAL; - dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL); - if (!dm) - return -ENOMEM; - /* * Need to give user space some time to set everything up, * so do it a jiffie or two later everywhere. - * Should we use a hrtimer here for better synchronization? */ - memcpy(&dm->m, &m, sizeof(struct mce)); - setup_timer(&dm->timer, raise_mce, (unsigned long)dm); - dm->timer.expires = jiffies + 2; - add_timer_on(&dm->timer, m.extcpu); + schedule_timeout(2); + raise_mce(&m); return usize; } @@ -116,6 +212,7 @@ static int inject_init(void) { printk(KERN_INFO "Machine check injector initialized\n"); mce_chrdev_ops.write = mce_write; + register_die_notifier(&mce_raise_nb); return 0; } diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 54dcb8ff12e..32996f9fab6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -1,3 +1,4 @@ +#include <linux/sysdev.h> #include <asm/mce.h> enum severity_level { @@ -10,6 +11,20 @@ enum severity_level { MCE_PANIC_SEVERITY, }; +#define ATTR_LEN 16 + +/* One object for each MCE bank, shared by all CPUs */ +struct mce_bank { + u64 ctl; /* subevents to enable */ + unsigned char init; /* initialise bank? */ + struct sysdev_attribute attr; /* sysdev attribute */ + char attrname[ATTR_LEN]; /* attribute name */ +}; + int mce_severity(struct mce *a, int tolerant, char **msg); +struct dentry *mce_get_debugfs_dir(void); extern int mce_ser; + +extern struct mce_bank *mce_banks; + diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index ff0807f9705..8a85dd1b1aa 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -139,6 +139,7 @@ int mce_severity(struct mce *a, int tolerant, char **msg) } } +#ifdef CONFIG_DEBUG_FS static void *s_start(struct seq_file *f, loff_t *pos) { if (*pos >= ARRAY_SIZE(severities)) @@ -197,7 +198,7 @@ static int __init severities_debugfs_init(void) { struct dentry *dmce = NULL, *fseverities_coverage = NULL; - dmce = debugfs_create_dir("mce", NULL); + dmce = mce_get_debugfs_dir(); if (dmce == NULL) goto err_out; fseverities_coverage = debugfs_create_file("severities-coverage", @@ -209,10 +210,7 @@ static int __init severities_debugfs_init(void) return 0; err_out: - if (fseverities_coverage) - debugfs_remove(fseverities_coverage); - if (dmce) - debugfs_remove(dmce); return -ENOMEM; } late_initcall(severities_debugfs_init); +#endif diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 1cfb623ce11..b1598a9436d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -34,6 +34,7 @@ #include <linux/smp.h> #include <linux/fs.h> #include <linux/mm.h> +#include <linux/debugfs.h> #include <asm/processor.h> #include <asm/hw_irq.h> @@ -45,21 +46,8 @@ #include "mce-internal.h" -/* Handle unconfigured int18 (should never happen) */ -static void unexpected_machine_check(struct pt_regs *regs, long error_code) -{ - printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", - smp_processor_id()); -} - -/* Call the installed machine check handler for this CPU setup. */ -void (*machine_check_vector)(struct pt_regs *, long error_code) = - unexpected_machine_check; - int mce_disabled __read_mostly; -#ifdef CONFIG_X86_NEW_MCE - #define MISC_MCELOG_MINOR 227 #define SPINUNIT 100 /* 100ns */ @@ -77,7 +65,6 @@ DEFINE_PER_CPU(unsigned, mce_exception_count); */ static int tolerant __read_mostly = 1; static int banks __read_mostly; -static u64 *bank __read_mostly; static int rip_msr __read_mostly; static int mce_bootlog __read_mostly = -1; static int monarch_timeout __read_mostly = -1; @@ -87,28 +74,35 @@ int mce_cmci_disabled __read_mostly; int mce_ignore_ce __read_mostly; int mce_ser __read_mostly; +struct mce_bank *mce_banks __read_mostly; + /* User mode helper program triggered by machine check event */ static unsigned long mce_need_notify; static char mce_helper[128]; static char *mce_helper_argv[2] = { mce_helper, NULL }; -static unsigned long dont_init_banks; - static DECLARE_WAIT_QUEUE_HEAD(mce_wait); static DEFINE_PER_CPU(struct mce, mces_seen); static int cpu_missing; +static void default_decode_mce(struct mce *m) +{ + pr_emerg("No human readable MCE decoding support on this CPU type.\n"); + pr_emerg("Run the message through 'mcelog --ascii' to decode.\n"); +} + +/* + * CPU/chipset specific EDAC code can register a callback here to print + * MCE errors in a human-readable form: + */ +void (*x86_mce_decode_callback)(struct mce *m) = default_decode_mce; +EXPORT_SYMBOL(x86_mce_decode_callback); /* MCA banks polled by the period polling timer for corrected events */ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL }; -static inline int skip_bank_init(int i) -{ - return i < BITS_PER_LONG && test_bit(i, &dont_init_banks); -} - static DEFINE_PER_CPU(struct work_struct, mce_work); /* Do initial initialization of a struct mce */ @@ -185,47 +179,58 @@ void mce_log(struct mce *mce) static void print_mce(struct mce *m) { - printk(KERN_EMERG - "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", + pr_emerg("CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", m->extcpu, m->mcgstatus, m->bank, m->status); + if (m->ip) { - printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", - !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", - m->cs, m->ip); + pr_emerg("RIP%s %02x:<%016Lx> ", + !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", + m->cs, m->ip); + if (m->cs == __KERNEL_CS) print_symbol("{%s}", m->ip); - printk(KERN_CONT "\n"); + pr_cont("\n"); } - printk(KERN_EMERG "TSC %llx ", m->tsc); + + pr_emerg("TSC %llx ", m->tsc); if (m->addr) - printk(KERN_CONT "ADDR %llx ", m->addr); + pr_cont("ADDR %llx ", m->addr); if (m->misc) - printk(KERN_CONT "MISC %llx ", m->misc); - printk(KERN_CONT "\n"); - printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", - m->cpuvendor, m->cpuid, m->time, m->socketid, - m->apicid); + pr_cont("MISC %llx ", m->misc); + + pr_cont("\n"); + pr_emerg("PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", + m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); + + /* + * Print out human-readable details about the MCE error, + * (if the CPU has an implementation for that): + */ + x86_mce_decode_callback(m); } static void print_mce_head(void) { - printk(KERN_EMERG "\nHARDWARE ERROR\n"); + pr_emerg("\nHARDWARE ERROR\n"); } static void print_mce_tail(void) { - printk(KERN_EMERG "This is not a software problem!\n" - "Run through mcelog --ascii to decode and contact your hardware vendor\n"); + pr_emerg("This is not a software problem!\n"); } #define PANIC_TIMEOUT 5 /* 5 seconds */ static atomic_t mce_paniced; +static int fake_panic; +static atomic_t mce_fake_paniced; + /* Panic in progress. Enable interrupts and wait for final IPI */ static void wait_for_panic(void) { long timeout = PANIC_TIMEOUT*USEC_PER_SEC; + preempt_disable(); local_irq_enable(); while (timeout-- > 0) @@ -239,15 +244,21 @@ static void mce_panic(char *msg, struct mce *final, char *exp) { int i; - /* - * Make sure only one CPU runs in machine check panic - */ - if (atomic_add_return(1, &mce_paniced) > 1) - wait_for_panic(); - barrier(); + if (!fake_panic) { + /* + * Make sure only one CPU runs in machine check panic + */ + if (atomic_inc_return(&mce_paniced) > 1) + wait_for_panic(); + barrier(); - bust_spinlocks(1); - console_verbose(); + bust_spinlocks(1); + console_verbose(); + } else { + /* Don't log too much for fake panic */ + if (atomic_inc_return(&mce_fake_paniced) > 1) + return; + } print_mce_head(); /* First print corrected ones that are still unlogged */ for (i = 0; i < MCE_LOG_LEN; i++) { @@ -274,9 +285,12 @@ static void mce_panic(char *msg, struct mce *final, char *exp) print_mce_tail(); if (exp) printk(KERN_EMERG "Machine check: %s\n", exp); - if (panic_timeout == 0) - panic_timeout = mce_panic_timeout; - panic(msg); + if (!fake_panic) { + if (panic_timeout == 0) + panic_timeout = mce_panic_timeout; + panic(msg); + } else + printk(KERN_EMERG "Fake kernel panic: %s\n", msg); } /* Support code for software error injection */ @@ -284,13 +298,14 @@ static void mce_panic(char *msg, struct mce *final, char *exp) static int msr_to_offset(u32 msr) { unsigned bank = __get_cpu_var(injectm.bank); + if (msr == rip_msr) return offsetof(struct mce, ip); - if (msr == MSR_IA32_MC0_STATUS + bank*4) + if (msr == MSR_IA32_MCx_STATUS(bank)) return offsetof(struct mce, status); - if (msr == MSR_IA32_MC0_ADDR + bank*4) + if (msr == MSR_IA32_MCx_ADDR(bank)) return offsetof(struct mce, addr); - if (msr == MSR_IA32_MC0_MISC + bank*4) + if (msr == MSR_IA32_MCx_MISC(bank)) return offsetof(struct mce, misc); if (msr == MSR_IA32_MCG_STATUS) return offsetof(struct mce, mcgstatus); @@ -301,13 +316,25 @@ static int msr_to_offset(u32 msr) static u64 mce_rdmsrl(u32 msr) { u64 v; + if (__get_cpu_var(injectm).finished) { int offset = msr_to_offset(msr); + if (offset < 0) return 0; return *(u64 *)((char *)&__get_cpu_var(injectm) + offset); } - rdmsrl(msr, v); + + if (rdmsrl_safe(msr, &v)) { + WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr); + /* + * Return zero in case the access faulted. This should + * not happen normally but can happen if the CPU does + * something weird, or if the code is buggy. + */ + v = 0; + } + return v; } @@ -315,6 +342,7 @@ static void mce_wrmsrl(u32 msr, u64 v) { if (__get_cpu_var(injectm).finished) { int offset = msr_to_offset(msr); + if (offset >= 0) *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v; return; @@ -411,7 +439,7 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) m->ip = mce_rdmsrl(rip_msr); } -#ifdef CONFIG_X86_LOCAL_APIC +#ifdef CONFIG_X86_LOCAL_APIC /* * Called after interrupts have been reenabled again * when a MCE happened during an interrupts off region @@ -495,7 +523,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); for (i = 0; i < banks; i++) { - if (!bank[i] || !test_bit(i, *b)) + if (!mce_banks[i].ctl || !test_bit(i, *b)) continue; m.misc = 0; @@ -504,7 +532,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) m.tsc = 0; barrier(); - m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); + m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); if (!(m.status & MCI_STATUS_VAL)) continue; @@ -519,9 +547,9 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) continue; if (m.status & MCI_STATUS_MISCV) - m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); + m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); if (m.status & MCI_STATUS_ADDRV) - m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); + m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); if (!(flags & MCP_TIMESTAMP)) m.tsc = 0; @@ -537,7 +565,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) /* * Clear state for this bank. */ - mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); } /* @@ -558,7 +586,7 @@ static int mce_no_way_out(struct mce *m, char **msg) int i; for (i = 0; i < banks; i++) { - m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); + m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) return 1; } @@ -618,7 +646,7 @@ out: * This way we prevent any potential data corruption in a unrecoverable case * and also makes sure always all CPU's errors are examined. * - * Also this detects the case of an machine check event coming from outer + * Also this detects the case of a machine check event coming from outer * space (not detected by any CPUs) In this case some external agent wants * us to shut down, so panic too. * @@ -671,7 +699,7 @@ static void mce_reign(void) * No machine check event found. Must be some external * source or one CPU is hung. Panic. */ - if (!m && tolerant < 3) + if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3) mce_panic("Machine check from unknown source", NULL, NULL); /* @@ -705,7 +733,7 @@ static int mce_start(int *no_way_out) * global_nwo should be updated before mce_callin */ smp_wmb(); - order = atomic_add_return(1, &mce_callin); + order = atomic_inc_return(&mce_callin); /* * Wait for everyone. @@ -842,7 +870,7 @@ static void mce_clear_state(unsigned long *toclear) for (i = 0; i < banks; i++) { if (test_bit(i, toclear)) - mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); } } @@ -895,11 +923,11 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_setup(&m); m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); - no_way_out = mce_no_way_out(&m, &msg); - final = &__get_cpu_var(mces_seen); *final = m; + no_way_out = mce_no_way_out(&m, &msg); + barrier(); /* @@ -916,14 +944,14 @@ void do_machine_check(struct pt_regs *regs, long error_code) order = mce_start(&no_way_out); for (i = 0; i < banks; i++) { __clear_bit(i, toclear); - if (!bank[i]) + if (!mce_banks[i].ctl) continue; m.misc = 0; m.addr = 0; m.bank = i; - m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); + m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); if ((m.status & MCI_STATUS_VAL) == 0) continue; @@ -964,9 +992,9 @@ void do_machine_check(struct pt_regs *regs, long error_code) kill_it = 1; if (m.status & MCI_STATUS_MISCV) - m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); + m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); if (m.status & MCI_STATUS_ADDRV) - m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); + m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); /* * Action optional error. Queue address for later processing. @@ -1091,7 +1119,7 @@ void mce_log_therm_throt_event(__u64 status) */ static int check_interval = 5 * 60; /* 5 minutes */ -static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ +static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */ static DEFINE_PER_CPU(struct timer_list, mce_timer); static void mcheck_timer(unsigned long data) @@ -1110,7 +1138,7 @@ static void mcheck_timer(unsigned long data) * Alert userspace if needed. If we logged an MCE, reduce the * polling interval, otherwise increase the polling interval. */ - n = &__get_cpu_var(next_interval); + n = &__get_cpu_var(mce_next_interval); if (mce_notify_irq()) *n = max(*n/2, HZ/100); else @@ -1159,10 +1187,26 @@ int mce_notify_irq(void) } EXPORT_SYMBOL_GPL(mce_notify_irq); +static int mce_banks_init(void) +{ + int i; + + mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL); + if (!mce_banks) + return -ENOMEM; + for (i = 0; i < banks; i++) { + struct mce_bank *b = &mce_banks[i]; + + b->ctl = -1ULL; + b->init = 1; + } + return 0; +} + /* * Initialize Machine Checks for a CPU. */ -static int mce_cap_init(void) +static int __cpuinit mce_cap_init(void) { unsigned b; u64 cap; @@ -1182,11 +1226,11 @@ static int mce_cap_init(void) /* Don't support asymmetric configurations today */ WARN_ON(banks != 0 && b != banks); banks = b; - if (!bank) { - bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); - if (!bank) - return -ENOMEM; - memset(bank, 0xff, banks * sizeof(u64)); + if (!mce_banks) { + int err = mce_banks_init(); + + if (err) + return err; } /* Use accurate RIP reporting if available. */ @@ -1218,16 +1262,23 @@ static void mce_init(void) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); for (i = 0; i < banks; i++) { - if (skip_bank_init(i)) + struct mce_bank *b = &mce_banks[i]; + + if (!b->init) continue; - wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); - wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); + wrmsrl(MSR_IA32_MCx_STATUS(i), 0); } } /* Add per CPU specific workarounds here */ -static void mce_cpu_quirks(struct cpuinfo_x86 *c) +static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) { + if (c->x86_vendor == X86_VENDOR_UNKNOWN) { + pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); + return -EOPNOTSUPP; + } + /* This should be disabled by the BIOS, but isn't always */ if (c->x86_vendor == X86_VENDOR_AMD) { if (c->x86 == 15 && banks > 4) { @@ -1236,7 +1287,7 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) * trips off incorrectly with the IOMMU & 3ware * & Cerberus: */ - clear_bit(10, (unsigned long *)&bank[4]); + clear_bit(10, (unsigned long *)&mce_banks[4].ctl); } if (c->x86 <= 17 && mce_bootlog < 0) { /* @@ -1250,7 +1301,7 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) * by default. */ if (c->x86 == 6 && banks > 0) - bank[0] = 0; + mce_banks[0].ctl = 0; } if (c->x86_vendor == X86_VENDOR_INTEL) { @@ -1263,8 +1314,8 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) * valid event later, merely don't write CTL0. */ - if (c->x86 == 6 && c->x86_model < 0x1A) - __set_bit(0, &dont_init_banks); + if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0) + mce_banks[0].init = 0; /* * All newer Intel systems support MCE broadcasting. Enable @@ -1273,11 +1324,20 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && monarch_timeout < 0) monarch_timeout = USEC_PER_SEC; + + /* + * There are also broken BIOSes on some Pentium M and + * earlier systems: + */ + if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0) + mce_bootlog = 0; } if (monarch_timeout < 0) monarch_timeout = 0; if (mce_bootlog != 0) mce_panic_timeout = 30; + + return 0; } static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) @@ -1311,7 +1371,7 @@ static void mce_cpu_features(struct cpuinfo_x86 *c) static void mce_init_timer(void) { struct timer_list *t = &__get_cpu_var(mce_timer); - int *n = &__get_cpu_var(next_interval); + int *n = &__get_cpu_var(mce_next_interval); if (mce_ignore_ce) return; @@ -1324,6 +1384,17 @@ static void mce_init_timer(void) add_timer_on(t, smp_processor_id()); } +/* Handle unconfigured int18 (should never happen) */ +static void unexpected_machine_check(struct pt_regs *regs, long error_code) +{ + printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", + smp_processor_id()); +} + +/* Call the installed machine check handler for this CPU setup. */ +void (*machine_check_vector)(struct pt_regs *, long error_code) = + unexpected_machine_check; + /* * Called for each booted CPU to set up machine checks. * Must be called with preempt off: @@ -1338,11 +1409,10 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) if (!mce_available(c)) return; - if (mce_cap_init() < 0) { + if (mce_cap_init() < 0 || mce_cpu_quirks(c) < 0) { mce_disabled = 1; return; } - mce_cpu_quirks(c); machine_check_vector = do_machine_check; @@ -1538,8 +1608,10 @@ static struct miscdevice mce_log_device = { */ static int __init mcheck_enable(char *str) { - if (*str == 0) + if (*str == 0) { enable_p5_mce(); + return 1; + } if (*str == '=') str++; if (!strcmp(str, "off")) @@ -1580,8 +1652,10 @@ static int mce_disable(void) int i; for (i = 0; i < banks; i++) { - if (!skip_bank_init(i)) - wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); + struct mce_bank *b = &mce_banks[i]; + + if (b->init) + wrmsrl(MSR_IA32_MCx_CTL(i), 0); } return 0; } @@ -1656,14 +1730,15 @@ DEFINE_PER_CPU(struct sys_device, mce_dev); __cpuinitdata void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); -static struct sysdev_attribute *bank_attrs; +static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr) +{ + return container_of(attr, struct mce_bank, attr); +} static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, char *buf) { - u64 b = bank[attr - bank_attrs]; - - return sprintf(buf, "%llx\n", b); + return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl); } static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, @@ -1674,7 +1749,7 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, if (strict_strtoull(buf, 0, &new) < 0) return -EINVAL; - bank[attr - bank_attrs] = new; + attr_to_bank(attr)->ctl = new; mce_restart(); return size; @@ -1816,7 +1891,7 @@ static __cpuinit int mce_create_device(unsigned int cpu) } for (j = 0; j < banks; j++) { err = sysdev_create_file(&per_cpu(mce_dev, cpu), - &bank_attrs[j]); + &mce_banks[j].attr); if (err) goto error2; } @@ -1825,10 +1900,10 @@ static __cpuinit int mce_create_device(unsigned int cpu) return 0; error2: while (--j >= 0) - sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[j]); + sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr); error: while (--i >= 0) - sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); + sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr); sysdev_unregister(&per_cpu(mce_dev, cpu)); @@ -1846,7 +1921,7 @@ static __cpuinit void mce_remove_device(unsigned int cpu) sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); for (i = 0; i < banks; i++) - sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); + sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr); sysdev_unregister(&per_cpu(mce_dev, cpu)); cpumask_clear_cpu(cpu, mce_dev_initialized); @@ -1863,8 +1938,10 @@ static void mce_disable_cpu(void *h) if (!(action & CPU_TASKS_FROZEN)) cmci_clear(); for (i = 0; i < banks; i++) { - if (!skip_bank_init(i)) - wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); + struct mce_bank *b = &mce_banks[i]; + + if (b->init) + wrmsrl(MSR_IA32_MCx_CTL(i), 0); } } @@ -1879,8 +1956,10 @@ static void mce_reenable_cpu(void *h) if (!(action & CPU_TASKS_FROZEN)) cmci_reenable(); for (i = 0; i < banks; i++) { - if (!skip_bank_init(i)) - wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); + struct mce_bank *b = &mce_banks[i]; + + if (b->init) + wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); } } @@ -1912,7 +1991,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: t->expires = round_jiffies(jiffies + - __get_cpu_var(next_interval)); + __get_cpu_var(mce_next_interval)); add_timer_on(t, cpu); smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); break; @@ -1928,35 +2007,21 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = { .notifier_call = mce_cpu_callback, }; -static __init int mce_init_banks(void) +static __init void mce_init_banks(void) { int i; - bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, - GFP_KERNEL); - if (!bank_attrs) - return -ENOMEM; - for (i = 0; i < banks; i++) { - struct sysdev_attribute *a = &bank_attrs[i]; + struct mce_bank *b = &mce_banks[i]; + struct sysdev_attribute *a = &b->attr; - a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); - if (!a->attr.name) - goto nomem; + a->attr.name = b->attrname; + snprintf(b->attrname, ATTR_LEN, "bank%d", i); a->attr.mode = 0644; a->show = show_bank; a->store = set_bank; } - return 0; - -nomem: - while (--i >= 0) - kfree(bank_attrs[i].attr.name); - kfree(bank_attrs); - bank_attrs = NULL; - - return -ENOMEM; } static __init int mce_init_device(void) @@ -1969,9 +2034,7 @@ static __init int mce_init_device(void) zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); - err = mce_init_banks(); - if (err) - return err; + mce_init_banks(); err = sysdev_class_register(&mce_sysclass); if (err) @@ -1991,57 +2054,65 @@ static __init int mce_init_device(void) device_initcall(mce_init_device); -#else /* CONFIG_X86_OLD_MCE: */ - -int nr_mce_banks; -EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ +/* + * Old style boot options parsing. Only for compatibility. + */ +static int __init mcheck_disable(char *str) +{ + mce_disabled = 1; + return 1; +} +__setup("nomce", mcheck_disable); -/* This has to be run for each processor */ -void mcheck_init(struct cpuinfo_x86 *c) +#ifdef CONFIG_DEBUG_FS +struct dentry *mce_get_debugfs_dir(void) { - if (mce_disabled) - return; + static struct dentry *dmce; - switch (c->x86_vendor) { - case X86_VENDOR_AMD: - amd_mcheck_init(c); - break; + if (!dmce) + dmce = debugfs_create_dir("mce", NULL); - case X86_VENDOR_INTEL: - if (c->x86 == 5) - intel_p5_mcheck_init(c); - if (c->x86 == 6) - intel_p6_mcheck_init(c); - if (c->x86 == 15) - intel_p4_mcheck_init(c); - break; + return dmce; +} - case X86_VENDOR_CENTAUR: - if (c->x86 == 5) - winchip_mcheck_init(c); - break; +static void mce_reset(void) +{ + cpu_missing = 0; + atomic_set(&mce_fake_paniced, 0); + atomic_set(&mce_executing, 0); + atomic_set(&mce_callin, 0); + atomic_set(&global_nwo, 0); +} - default: - break; - } - printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks); +static int fake_panic_get(void *data, u64 *val) +{ + *val = fake_panic; + return 0; } -static int __init mcheck_enable(char *str) +static int fake_panic_set(void *data, u64 val) { - mce_p5_enabled = 1; - return 1; + mce_reset(); + fake_panic = val; + return 0; } -__setup("mce", mcheck_enable); -#endif /* CONFIG_X86_OLD_MCE */ +DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get, + fake_panic_set, "%llu\n"); -/* - * Old style boot options parsing. Only for compatibility. - */ -static int __init mcheck_disable(char *str) +static int __init mce_debugfs_init(void) { - mce_disabled = 1; - return 1; + struct dentry *dmce, *ffake_panic; + + dmce = mce_get_debugfs_dir(); + if (!dmce) + return -ENOMEM; + ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL, + &fake_panic_fops); + if (!ffake_panic) + return -ENOMEM; + + return 0; } -__setup("nomce", mcheck_disable); +late_initcall(mce_debugfs_init); +#endif diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index ddae21620bd..83a3d1f4efc 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -69,7 +69,7 @@ struct threshold_bank { struct threshold_block *blocks; cpumask_var_t cpus; }; -static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); +static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks); #ifdef CONFIG_SMP static unsigned char shared_bank[NR_BANKS] = { @@ -489,12 +489,15 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) int i, err = 0; struct threshold_bank *b = NULL; char name[32]; +#ifdef CONFIG_SMP + struct cpuinfo_x86 *c = &cpu_data(cpu); +#endif sprintf(name, "threshold_bank%i", bank); #ifdef CONFIG_SMP if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ - i = cpumask_first(cpu_core_mask(cpu)); + i = cpumask_first(c->llc_shared_map); /* first core not up yet */ if (cpu_data(i).cpu_core_id) @@ -514,7 +517,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (err) goto out; - cpumask_copy(b->cpus, cpu_core_mask(cpu)); + cpumask_copy(b->cpus, c->llc_shared_map); per_cpu(threshold_banks, cpu)[bank] = b; goto out; @@ -539,7 +542,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) #ifndef CONFIG_SMP cpumask_setall(b->cpus); #else - cpumask_copy(b->cpus, cpu_core_mask(cpu)); + cpumask_copy(b->cpus, c->llc_shared_map); #endif per_cpu(threshold_banks, cpu)[bank] = b; diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index e1acec0f7a3..7c785634af2 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -8,6 +8,7 @@ #include <linux/init.h> #include <linux/interrupt.h> #include <linux/percpu.h> +#include <linux/sched.h> #include <asm/apic.h> #include <asm/processor.h> #include <asm/msr.h> @@ -90,7 +91,7 @@ static void cmci_discover(int banks, int boot) if (test_bit(i, owned)) continue; - rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + rdmsrl(MSR_IA32_MCx_CTL2(i), val); /* Already owned by someone else? */ if (val & CMCI_EN) { @@ -101,8 +102,8 @@ static void cmci_discover(int banks, int boot) } val |= CMCI_EN | CMCI_THRESHOLD; - wrmsrl(MSR_IA32_MC0_CTL2 + i, val); - rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + wrmsrl(MSR_IA32_MCx_CTL2(i), val); + rdmsrl(MSR_IA32_MCx_CTL2(i), val); /* Did the enable bit stick? -- the bank supports CMCI */ if (val & CMCI_EN) { @@ -152,9 +153,9 @@ void cmci_clear(void) if (!test_bit(i, __get_cpu_var(mce_banks_owned))) continue; /* Disable CMCI */ - rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + rdmsrl(MSR_IA32_MCx_CTL2(i), val); val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); - wrmsrl(MSR_IA32_MC0_CTL2 + i, val); + wrmsrl(MSR_IA32_MCx_CTL2(i), val); __clear_bit(i, __get_cpu_var(mce_banks_owned)); } spin_unlock_irqrestore(&cmci_discover_lock, flags); diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c deleted file mode 100644 index f5f2d6f71fb..00000000000 --- a/arch/x86/kernel/cpu/mcheck/non-fatal.c +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Non Fatal Machine Check Exception Reporting - * - * (C) Copyright 2002 Dave Jones. <davej@redhat.com> - * - * This file contains routines to check for non-fatal MCEs every 15s - * - */ -#include <linux/interrupt.h> -#include <linux/workqueue.h> -#include <linux/jiffies.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/smp.h> - -#include <asm/processor.h> -#include <asm/system.h> -#include <asm/mce.h> -#include <asm/msr.h> - -static int firstbank; - -#define MCE_RATE (15*HZ) /* timer rate is 15s */ - -static void mce_checkregs(void *info) -{ - u32 low, high; - int i; - - for (i = firstbank; i < nr_mce_banks; i++) { - rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - - if (!(high & (1<<31))) - continue; - - printk(KERN_INFO "MCE: The hardware reports a non fatal, " - "correctable incident occurred on CPU %d.\n", - smp_processor_id()); - - printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low); - - /* - * Scrub the error so we don't pick it up in MCE_RATE - * seconds time: - */ - wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); - - /* Serialize: */ - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } -} - -static void mce_work_fn(struct work_struct *work); -static DECLARE_DELAYED_WORK(mce_work, mce_work_fn); - -static void mce_work_fn(struct work_struct *work) -{ - on_each_cpu(mce_checkregs, NULL, 1); - schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); -} - -static int __init init_nonfatal_mce_checker(void) -{ - struct cpuinfo_x86 *c = &boot_cpu_data; - - /* Check for MCE support */ - if (!cpu_has(c, X86_FEATURE_MCE)) - return -ENODEV; - - /* Check for PPro style MCA */ - if (!cpu_has(c, X86_FEATURE_MCA)) - return -ENODEV; - - /* Some Athlons misbehave when we frob bank 0 */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 == 6) - firstbank = 1; - else - firstbank = 0; - - /* - * Check for non-fatal errors every MCE_RATE s - */ - schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); - printk(KERN_INFO "Machine check exception polling timer started.\n"); - - return 0; -} -module_init(init_nonfatal_mce_checker); - -MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c deleted file mode 100644 index 4482aea9aa2..00000000000 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ /dev/null @@ -1,163 +0,0 @@ -/* - * P4 specific Machine Check Exception Reporting - */ -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/smp.h> - -#include <asm/processor.h> -#include <asm/mce.h> -#include <asm/msr.h> - -/* as supported by the P4/Xeon family */ -struct intel_mce_extended_msrs { - u32 eax; - u32 ebx; - u32 ecx; - u32 edx; - u32 esi; - u32 edi; - u32 ebp; - u32 esp; - u32 eflags; - u32 eip; - /* u32 *reserved[]; */ -}; - -static int mce_num_extended_msrs; - -/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ -static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) -{ - u32 h; - - rdmsr(MSR_IA32_MCG_EAX, r->eax, h); - rdmsr(MSR_IA32_MCG_EBX, r->ebx, h); - rdmsr(MSR_IA32_MCG_ECX, r->ecx, h); - rdmsr(MSR_IA32_MCG_EDX, r->edx, h); - rdmsr(MSR_IA32_MCG_ESI, r->esi, h); - rdmsr(MSR_IA32_MCG_EDI, r->edi, h); - rdmsr(MSR_IA32_MCG_EBP, r->ebp, h); - rdmsr(MSR_IA32_MCG_ESP, r->esp, h); - rdmsr(MSR_IA32_MCG_EFLAGS, r->eflags, h); - rdmsr(MSR_IA32_MCG_EIP, r->eip, h); -} - -static void intel_machine_check(struct pt_regs *regs, long error_code) -{ - u32 alow, ahigh, high, low; - u32 mcgstl, mcgsth; - int recover = 1; - int i; - - rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); - if (mcgstl & (1<<0)) /* Recoverable ? */ - recover = 0; - - printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", - smp_processor_id(), mcgsth, mcgstl); - - if (mce_num_extended_msrs > 0) { - struct intel_mce_extended_msrs dbg; - - intel_get_extended_msrs(&dbg); - - printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n" - "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n" - "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n", - smp_processor_id(), dbg.eip, dbg.eflags, - dbg.eax, dbg.ebx, dbg.ecx, dbg.edx, - dbg.esi, dbg.edi, dbg.ebp, dbg.esp); - } - - for (i = 0; i < nr_mce_banks; i++) { - rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - if (high & (1<<31)) { - char misc[20]; - char addr[24]; - - misc[0] = addr[0] = '\0'; - if (high & (1<<29)) - recover |= 1; - if (high & (1<<25)) - recover |= 2; - high &= ~(1<<31); - if (high & (1<<27)) { - rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); - snprintf(misc, 20, "[%08x%08x]", ahigh, alow); - } - if (high & (1<<26)) { - rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); - snprintf(addr, 24, " at %08x%08x", ahigh, alow); - } - printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", - smp_processor_id(), i, high, low, misc, addr); - } - } - - if (recover & 2) - panic("CPU context corrupt"); - if (recover & 1) - panic("Unable to continue"); - - printk(KERN_EMERG "Attempting to continue.\n"); - - /* - * Do not clear the MSR_IA32_MCi_STATUS if the error is not - * recoverable/continuable.This will allow BIOS to look at the MSRs - * for errors if the OS could not log the error. - */ - for (i = 0; i < nr_mce_banks; i++) { - u32 msr; - msr = MSR_IA32_MC0_STATUS+i*4; - rdmsr(msr, low, high); - if (high&(1<<31)) { - /* Clear it */ - wrmsr(msr, 0UL, 0UL); - /* Serialize */ - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } - } - mcgstl &= ~(1<<2); - wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); -} - -void intel_p4_mcheck_init(struct cpuinfo_x86 *c) -{ - u32 l, h; - int i; - - machine_check_vector = intel_machine_check; - wmb(); - - printk(KERN_INFO "Intel machine check architecture supported.\n"); - rdmsr(MSR_IA32_MCG_CAP, l, h); - if (l & (1<<8)) /* Control register present ? */ - wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); - nr_mce_banks = l & 0xff; - - for (i = 0; i < nr_mce_banks; i++) { - wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); - wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); - } - - set_in_cr4(X86_CR4_MCE); - printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", - smp_processor_id()); - - /* Check for P4/Xeon extended MCE MSRs */ - rdmsr(MSR_IA32_MCG_CAP, l, h); - if (l & (1<<9)) {/* MCG_EXT_P */ - mce_num_extended_msrs = (l >> 16) & 0xff; - printk(KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)" - " available\n", - smp_processor_id(), mce_num_extended_msrs); - -#ifdef CONFIG_X86_MCE_P4THERMAL - /* Check for P4/Xeon Thermal monitor */ - intel_init_thermal(c); -#endif - } -} diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c deleted file mode 100644 index 01e4f817818..00000000000 --- a/arch/x86/kernel/cpu/mcheck/p6.c +++ /dev/null @@ -1,127 +0,0 @@ -/* - * P6 specific Machine Check Exception Reporting - * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> - */ -#include <linux/interrupt.h> -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/smp.h> - -#include <asm/processor.h> -#include <asm/system.h> -#include <asm/mce.h> -#include <asm/msr.h> - -/* Machine Check Handler For PII/PIII */ -static void intel_machine_check(struct pt_regs *regs, long error_code) -{ - u32 alow, ahigh, high, low; - u32 mcgstl, mcgsth; - int recover = 1; - int i; - - rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); - if (mcgstl & (1<<0)) /* Recoverable ? */ - recover = 0; - - printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", - smp_processor_id(), mcgsth, mcgstl); - - for (i = 0; i < nr_mce_banks; i++) { - rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - if (high & (1<<31)) { - char misc[20]; - char addr[24]; - - misc[0] = '\0'; - addr[0] = '\0'; - - if (high & (1<<29)) - recover |= 1; - if (high & (1<<25)) - recover |= 2; - high &= ~(1<<31); - - if (high & (1<<27)) { - rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); - snprintf(misc, 20, "[%08x%08x]", ahigh, alow); - } - if (high & (1<<26)) { - rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); - snprintf(addr, 24, " at %08x%08x", ahigh, alow); - } - - printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", - smp_processor_id(), i, high, low, misc, addr); - } - } - - if (recover & 2) - panic("CPU context corrupt"); - if (recover & 1) - panic("Unable to continue"); - - printk(KERN_EMERG "Attempting to continue.\n"); - /* - * Do not clear the MSR_IA32_MCi_STATUS if the error is not - * recoverable/continuable.This will allow BIOS to look at the MSRs - * for errors if the OS could not log the error: - */ - for (i = 0; i < nr_mce_banks; i++) { - unsigned int msr; - - msr = MSR_IA32_MC0_STATUS+i*4; - rdmsr(msr, low, high); - if (high & (1<<31)) { - /* Clear it: */ - wrmsr(msr, 0UL, 0UL); - /* Serialize: */ - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } - } - mcgstl &= ~(1<<2); - wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); -} - -/* Set up machine check reporting for processors with Intel style MCE: */ -void intel_p6_mcheck_init(struct cpuinfo_x86 *c) -{ - u32 l, h; - int i; - - /* Check for MCE support */ - if (!cpu_has(c, X86_FEATURE_MCE)) - return; - - /* Check for PPro style MCA */ - if (!cpu_has(c, X86_FEATURE_MCA)) - return; - - /* Ok machine check is available */ - machine_check_vector = intel_machine_check; - /* Make sure the vector pointer is visible before we enable MCEs: */ - wmb(); - - printk(KERN_INFO "Intel machine check architecture supported.\n"); - rdmsr(MSR_IA32_MCG_CAP, l, h); - if (l & (1<<8)) /* Control register present ? */ - wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); - nr_mce_banks = l & 0xff; - - /* - * Following the example in IA-32 SDM Vol 3: - * - MC0_CTL should not be written - * - Status registers on all banks should be cleared on reset - */ - for (i = 1; i < nr_mce_banks; i++) - wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); - - for (i = 0; i < nr_mce_banks; i++) - wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); - - set_in_cr4(X86_CR4_MCE); - printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", - smp_processor_id()); -} diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index bff8dd191dd..b3a1dba7533 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -34,19 +34,31 @@ /* How long to wait between reporting thermal events */ #define CHECK_INTERVAL (300 * HZ) -static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; -static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); +/* + * Current thermal throttling state: + */ +struct thermal_state { + bool is_throttled; + + u64 next_check; + unsigned long throttle_count; + unsigned long last_throttle_count; +}; -static atomic_t therm_throt_en = ATOMIC_INIT(0); +static DEFINE_PER_CPU(struct thermal_state, thermal_state); + +static atomic_t therm_throt_en = ATOMIC_INIT(0); #ifdef CONFIG_SYSFS #define define_therm_throt_sysdev_one_ro(_name) \ static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) #define define_therm_throt_sysdev_show_func(name) \ -static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ - struct sysdev_attribute *attr, \ - char *buf) \ + \ +static ssize_t therm_throt_sysdev_show_##name( \ + struct sys_device *dev, \ + struct sysdev_attribute *attr, \ + char *buf) \ { \ unsigned int cpu = dev->id; \ ssize_t ret; \ @@ -54,7 +66,7 @@ static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ preempt_disable(); /* CPU hotplug */ \ if (cpu_online(cpu)) \ ret = sprintf(buf, "%lu\n", \ - per_cpu(thermal_throttle_##name, cpu)); \ + per_cpu(thermal_state, cpu).name); \ else \ ret = 0; \ preempt_enable(); \ @@ -62,11 +74,11 @@ static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ return ret; \ } -define_therm_throt_sysdev_show_func(count); -define_therm_throt_sysdev_one_ro(count); +define_therm_throt_sysdev_show_func(throttle_count); +define_therm_throt_sysdev_one_ro(throttle_count); static struct attribute *thermal_throttle_attrs[] = { - &attr_count.attr, + &attr_throttle_count.attr, NULL }; @@ -92,31 +104,43 @@ static struct attribute_group thermal_throttle_attr_group = { * 1 : Event should be logged further, and a message has been * printed to the syslog. */ -static int therm_throt_process(int curr) +static int therm_throt_process(bool is_throttled) { - unsigned int cpu = smp_processor_id(); - __u64 tmp_jiffs = get_jiffies_64(); + struct thermal_state *state; + unsigned int this_cpu; + bool was_throttled; + u64 now; - if (curr) - __get_cpu_var(thermal_throttle_count)++; + this_cpu = smp_processor_id(); + now = get_jiffies_64(); + state = &per_cpu(thermal_state, this_cpu); - if (time_before64(tmp_jiffs, __get_cpu_var(next_check))) + was_throttled = state->is_throttled; + state->is_throttled = is_throttled; + + if (is_throttled) + state->throttle_count++; + + if (time_before64(now, state->next_check) && + state->throttle_count != state->last_throttle_count) return 0; - __get_cpu_var(next_check) = tmp_jiffs + CHECK_INTERVAL; + state->next_check = now + CHECK_INTERVAL; + state->last_throttle_count = state->throttle_count; /* if we just entered the thermal event */ - if (curr) { - printk(KERN_CRIT "CPU%d: Temperature above threshold, " - "cpu clock throttled (total events = %lu)\n", cpu, - __get_cpu_var(thermal_throttle_count)); + if (is_throttled) { + printk(KERN_CRIT "CPU%d: Temperature above threshold, cpu clock throttled (total events = %lu)\n", this_cpu, state->throttle_count); add_taint(TAINT_MACHINE_CHECK); - } else { - printk(KERN_CRIT "CPU%d: Temperature/speed normal\n", cpu); + return 1; + } + if (was_throttled) { + printk(KERN_INFO "CPU%d: Temperature/speed normal\n", this_cpu); + return 1; } - return 1; + return 0; } #ifdef CONFIG_SYSFS @@ -206,7 +230,7 @@ static void intel_thermal_interrupt(void) __u64 msr_val; rdmsrl(MSR_IA32_THERM_STATUS, msr_val); - if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT)) + if (therm_throt_process((msr_val & THERM_STATUS_PROCHOT) != 0)) mce_log_therm_throt_event(msr_val); } @@ -253,9 +277,6 @@ void intel_init_thermal(struct cpuinfo_x86 *c) return; } - if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) - tm2 = 1; - /* Check whether a vector already exists */ if (h & APIC_VECTOR_MASK) { printk(KERN_DEBUG @@ -264,6 +285,16 @@ void intel_init_thermal(struct cpuinfo_x86 *c) return; } + /* early Pentium M models use different method for enabling TM2 */ + if (cpu_has(c, X86_FEATURE_TM2)) { + if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) { + rdmsr(MSR_THERM2_CTL, l, h); + if (l & MSR_THERM2_CTL_TM_SELECT) + tm2 = 1; + } else if (l & MSR_IA32_MISC_ENABLE_TM2) + tm2 = 1; + } + /* We'll mask the thermal vector in the lapic till we're ready: */ h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; apic_write(APIC_LVTTHMR, h); diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c index ee2331b0e58..33af14110df 100644 --- a/arch/x86/kernel/cpu/mtrr/amd.c +++ b/arch/x86/kernel/cpu/mtrr/amd.c @@ -7,15 +7,15 @@ static void amd_get_mtrr(unsigned int reg, unsigned long *base, - unsigned long *size, mtrr_type * type) + unsigned long *size, mtrr_type *type) { unsigned long low, high; rdmsr(MSR_K6_UWCCR, low, high); - /* Upper dword is region 1, lower is region 0 */ + /* Upper dword is region 1, lower is region 0 */ if (reg == 1) low = high; - /* The base masks off on the right alignment */ + /* The base masks off on the right alignment */ *base = (low & 0xFFFE0000) >> PAGE_SHIFT; *type = 0; if (low & 1) @@ -27,74 +27,81 @@ amd_get_mtrr(unsigned int reg, unsigned long *base, return; } /* - * This needs a little explaining. The size is stored as an - * inverted mask of bits of 128K granularity 15 bits long offset - * 2 bits + * This needs a little explaining. The size is stored as an + * inverted mask of bits of 128K granularity 15 bits long offset + * 2 bits. * - * So to get a size we do invert the mask and add 1 to the lowest - * mask bit (4 as its 2 bits in). This gives us a size we then shift - * to turn into 128K blocks + * So to get a size we do invert the mask and add 1 to the lowest + * mask bit (4 as its 2 bits in). This gives us a size we then shift + * to turn into 128K blocks. * - * eg 111 1111 1111 1100 is 512K + * eg 111 1111 1111 1100 is 512K * - * invert 000 0000 0000 0011 - * +1 000 0000 0000 0100 - * *128K ... + * invert 000 0000 0000 0011 + * +1 000 0000 0000 0100 + * *128K ... */ low = (~low) & 0x1FFFC; *size = (low + 4) << (15 - PAGE_SHIFT); - return; } -static void amd_set_mtrr(unsigned int reg, unsigned long base, - unsigned long size, mtrr_type type) -/* [SUMMARY] Set variable MTRR register on the local CPU. - <reg> The register to set. - <base> The base address of the region. - <size> The size of the region. If this is 0 the region is disabled. - <type> The type of the region. - [RETURNS] Nothing. -*/ +/** + * amd_set_mtrr - Set variable MTRR register on the local CPU. + * + * @reg The register to set. + * @base The base address of the region. + * @size The size of the region. If this is 0 the region is disabled. + * @type The type of the region. + * + * Returns nothing. + */ +static void +amd_set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) { u32 regs[2]; /* - * Low is MTRR0 , High MTRR 1 + * Low is MTRR0, High MTRR 1 */ rdmsr(MSR_K6_UWCCR, regs[0], regs[1]); /* - * Blank to disable + * Blank to disable */ - if (size == 0) + if (size == 0) { regs[reg] = 0; - else - /* Set the register to the base, the type (off by one) and an - inverted bitmask of the size The size is the only odd - bit. We are fed say 512K We invert this and we get 111 1111 - 1111 1011 but if you subtract one and invert you get the - desired 111 1111 1111 1100 mask - - But ~(x - 1) == ~x + 1 == -x. Two's complement rocks! */ + } else { + /* + * Set the register to the base, the type (off by one) and an + * inverted bitmask of the size The size is the only odd + * bit. We are fed say 512K We invert this and we get 111 1111 + * 1111 1011 but if you subtract one and invert you get the + * desired 111 1111 1111 1100 mask + * + * But ~(x - 1) == ~x + 1 == -x. Two's complement rocks! + */ regs[reg] = (-size >> (15 - PAGE_SHIFT) & 0x0001FFFC) | (base << PAGE_SHIFT) | (type + 1); + } /* - * The writeback rule is quite specific. See the manual. Its - * disable local interrupts, write back the cache, set the mtrr + * The writeback rule is quite specific. See the manual. Its + * disable local interrupts, write back the cache, set the mtrr */ wbinvd(); wrmsr(MSR_K6_UWCCR, regs[0], regs[1]); } -static int amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type) +static int +amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type) { - /* Apply the K6 block alignment and size rules - In order - o Uncached or gathering only - o 128K or bigger block - o Power of 2 block - o base suitably aligned to the power - */ + /* + * Apply the K6 block alignment and size rules + * In order + * o Uncached or gathering only + * o 128K or bigger block + * o Power of 2 block + * o base suitably aligned to the power + */ if (type > MTRR_TYPE_WRCOMB || size < (1 << (17 - PAGE_SHIFT)) || (size & ~(size - 1)) - size || (base & (size - 1))) return -EINVAL; @@ -115,5 +122,3 @@ int __init amd_init_mtrr(void) set_mtrr_ops(&amd_mtrr_ops); return 0; } - -//arch_initcall(amd_mtrr_init); diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c index cb9aa3a7a7a..de89f14eff3 100644 --- a/arch/x86/kernel/cpu/mtrr/centaur.c +++ b/arch/x86/kernel/cpu/mtrr/centaur.c @@ -1,7 +1,9 @@ #include <linux/init.h> #include <linux/mm.h> + #include <asm/mtrr.h> #include <asm/msr.h> + #include "mtrr.h" static struct { @@ -12,25 +14,25 @@ static struct { static u8 centaur_mcr_reserved; static u8 centaur_mcr_type; /* 0 for winchip, 1 for winchip2 */ -/* - * Report boot time MCR setups +/** + * centaur_get_free_region - Get a free MTRR. + * + * @base: The starting (base) address of the region. + * @size: The size (in bytes) of the region. + * + * Returns: the index of the region on success, else -1 on error. */ - static int centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg) -/* [SUMMARY] Get a free MTRR. - <base> The starting (base) address of the region. - <size> The size (in bytes) of the region. - [RETURNS] The index of the region on success, else -1 on error. -*/ { - int i, max; - mtrr_type ltype; unsigned long lbase, lsize; + mtrr_type ltype; + int i, max; max = num_var_ranges; if (replace_reg >= 0 && replace_reg < max) return replace_reg; + for (i = 0; i < max; ++i) { if (centaur_mcr_reserved & (1 << i)) continue; @@ -38,11 +40,14 @@ centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg) if (lsize == 0) return i; } + return -ENOSPC; } -void -mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) +/* + * Report boot time MCR setups + */ +void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) { centaur_mcr[mcr].low = lo; centaur_mcr[mcr].high = hi; @@ -54,33 +59,35 @@ centaur_get_mcr(unsigned int reg, unsigned long *base, { *base = centaur_mcr[reg].high >> PAGE_SHIFT; *size = -(centaur_mcr[reg].low & 0xfffff000) >> PAGE_SHIFT; - *type = MTRR_TYPE_WRCOMB; /* If it is there, it is write-combining */ + *type = MTRR_TYPE_WRCOMB; /* write-combining */ + if (centaur_mcr_type == 1 && ((centaur_mcr[reg].low & 31) & 2)) *type = MTRR_TYPE_UNCACHABLE; if (centaur_mcr_type == 1 && (centaur_mcr[reg].low & 31) == 25) *type = MTRR_TYPE_WRBACK; if (centaur_mcr_type == 0 && (centaur_mcr[reg].low & 31) == 31) *type = MTRR_TYPE_WRBACK; - } -static void centaur_set_mcr(unsigned int reg, unsigned long base, - unsigned long size, mtrr_type type) +static void +centaur_set_mcr(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type) { unsigned long low, high; if (size == 0) { - /* Disable */ + /* Disable */ high = low = 0; } else { high = base << PAGE_SHIFT; - if (centaur_mcr_type == 0) - low = -size << PAGE_SHIFT | 0x1f; /* only support write-combining... */ - else { + if (centaur_mcr_type == 0) { + /* Only support write-combining... */ + low = -size << PAGE_SHIFT | 0x1f; + } else { if (type == MTRR_TYPE_UNCACHABLE) - low = -size << PAGE_SHIFT | 0x02; /* NC */ + low = -size << PAGE_SHIFT | 0x02; /* NC */ else - low = -size << PAGE_SHIFT | 0x09; /* WWO,WC */ + low = -size << PAGE_SHIFT | 0x09; /* WWO, WC */ } } centaur_mcr[reg].high = high; @@ -88,118 +95,16 @@ static void centaur_set_mcr(unsigned int reg, unsigned long base, wrmsr(MSR_IDT_MCR0 + reg, low, high); } -#if 0 -/* - * Initialise the later (saner) Winchip MCR variant. In this version - * the BIOS can pass us the registers it has used (but not their values) - * and the control register is read/write - */ - -static void __init -centaur_mcr1_init(void) -{ - unsigned i; - u32 lo, hi; - - /* Unfortunately, MCR's are read-only, so there is no way to - * find out what the bios might have done. - */ - - rdmsr(MSR_IDT_MCR_CTRL, lo, hi); - if (((lo >> 17) & 7) == 1) { /* Type 1 Winchip2 MCR */ - lo &= ~0x1C0; /* clear key */ - lo |= 0x040; /* set key to 1 */ - wrmsr(MSR_IDT_MCR_CTRL, lo, hi); /* unlock MCR */ - } - - centaur_mcr_type = 1; - - /* - * Clear any unconfigured MCR's. - */ - - for (i = 0; i < 8; ++i) { - if (centaur_mcr[i].high == 0 && centaur_mcr[i].low == 0) { - if (!(lo & (1 << (9 + i)))) - wrmsr(MSR_IDT_MCR0 + i, 0, 0); - else - /* - * If the BIOS set up an MCR we cannot see it - * but we don't wish to obliterate it - */ - centaur_mcr_reserved |= (1 << i); - } - } - /* - * Throw the main write-combining switch... - * However if OOSTORE is enabled then people have already done far - * cleverer things and we should behave. - */ - - lo |= 15; /* Write combine enables */ - wrmsr(MSR_IDT_MCR_CTRL, lo, hi); -} - -/* - * Initialise the original winchip with read only MCR registers - * no used bitmask for the BIOS to pass on and write only control - */ - -static void __init -centaur_mcr0_init(void) -{ - unsigned i; - - /* Unfortunately, MCR's are read-only, so there is no way to - * find out what the bios might have done. - */ - - /* Clear any unconfigured MCR's. - * This way we are sure that the centaur_mcr array contains the actual - * values. The disadvantage is that any BIOS tweaks are thus undone. - * - */ - for (i = 0; i < 8; ++i) { - if (centaur_mcr[i].high == 0 && centaur_mcr[i].low == 0) - wrmsr(MSR_IDT_MCR0 + i, 0, 0); - } - - wrmsr(MSR_IDT_MCR_CTRL, 0x01F0001F, 0); /* Write only */ -} - -/* - * Initialise Winchip series MCR registers - */ - -static void __init -centaur_mcr_init(void) -{ - struct set_mtrr_context ctxt; - - set_mtrr_prepare_save(&ctxt); - set_mtrr_cache_disable(&ctxt); - - if (boot_cpu_data.x86_model == 4) - centaur_mcr0_init(); - else if (boot_cpu_data.x86_model == 8 || boot_cpu_data.x86_model == 9) - centaur_mcr1_init(); - - set_mtrr_done(&ctxt); -} -#endif - -static int centaur_validate_add_page(unsigned long base, - unsigned long size, unsigned int type) +static int +centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int type) { /* - * FIXME: Winchip2 supports uncached + * FIXME: Winchip2 supports uncached */ - if (type != MTRR_TYPE_WRCOMB && + if (type != MTRR_TYPE_WRCOMB && (centaur_mcr_type == 0 || type != MTRR_TYPE_UNCACHABLE)) { - printk(KERN_WARNING - "mtrr: only write-combining%s supported\n", - centaur_mcr_type ? " and uncacheable are" - : " is"); + pr_warning("mtrr: only write-combining%s supported\n", + centaur_mcr_type ? " and uncacheable are" : " is"); return -EINVAL; } return 0; @@ -207,7 +112,6 @@ static int centaur_validate_add_page(unsigned long base, static struct mtrr_ops centaur_mtrr_ops = { .vendor = X86_VENDOR_CENTAUR, -// .init = centaur_mcr_init, .set = centaur_set_mcr, .get = centaur_get_mcr, .get_free_region = centaur_get_free_region, @@ -220,5 +124,3 @@ int __init centaur_init_mtrr(void) set_mtrr_ops(¢aur_mtrr_ops); return 0; } - -//arch_initcall(centaur_init_mtrr); diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 1d584a18a50..315738c74aa 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -1,51 +1,75 @@ -/* MTRR (Memory Type Range Register) cleanup - - Copyright (C) 2009 Yinghai Lu - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Library General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Library General Public License for more details. - - You should have received a copy of the GNU Library General Public - License along with this library; if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -*/ - +/* + * MTRR (Memory Type Range Register) cleanup + * + * Copyright (C) 2009 Yinghai Lu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ #include <linux/module.h> #include <linux/init.h> #include <linux/pci.h> #include <linux/smp.h> #include <linux/cpu.h> -#include <linux/mutex.h> #include <linux/sort.h> +#include <linux/mutex.h> +#include <linux/uaccess.h> +#include <linux/kvm_para.h> +#include <asm/processor.h> #include <asm/e820.h> #include <asm/mtrr.h> -#include <asm/uaccess.h> -#include <asm/processor.h> #include <asm/msr.h> -#include <asm/kvm_para.h> -#include "mtrr.h" -/* should be related to MTRR_VAR_RANGES nums */ -#define RANGE_NUM 256 +#include "mtrr.h" struct res_range { - unsigned long start; - unsigned long end; + unsigned long start; + unsigned long end; +}; + +struct var_mtrr_range_state { + unsigned long base_pfn; + unsigned long size_pfn; + mtrr_type type; +}; + +struct var_mtrr_state { + unsigned long range_startk; + unsigned long range_sizek; + unsigned long chunk_sizek; + unsigned long gran_sizek; + unsigned int reg; }; +/* Should be related to MTRR_VAR_RANGES nums */ +#define RANGE_NUM 256 + +static struct res_range __initdata range[RANGE_NUM]; +static int __initdata nr_range; + +static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; + +static int __initdata debug_print; +#define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0) + + static int __init -add_range(struct res_range *range, int nr_range, unsigned long start, - unsigned long end) +add_range(struct res_range *range, int nr_range, + unsigned long start, unsigned long end) { - /* out of slots */ + /* Out of slots: */ if (nr_range >= RANGE_NUM) return nr_range; @@ -58,12 +82,12 @@ add_range(struct res_range *range, int nr_range, unsigned long start, } static int __init -add_range_with_merge(struct res_range *range, int nr_range, unsigned long start, - unsigned long end) +add_range_with_merge(struct res_range *range, int nr_range, + unsigned long start, unsigned long end) { int i; - /* try to merge it with old one */ + /* Try to merge it with old one: */ for (i = 0; i < nr_range; i++) { unsigned long final_start, final_end; unsigned long common_start, common_end; @@ -84,7 +108,7 @@ add_range_with_merge(struct res_range *range, int nr_range, unsigned long start, return nr_range; } - /* need to add that */ + /* Need to add it: */ return add_range(range, nr_range, start, end); } @@ -117,7 +141,7 @@ subtract_range(struct res_range *range, unsigned long start, unsigned long end) } if (start > range[j].start && end < range[j].end) { - /* find the new spare */ + /* Find the new spare: */ for (i = 0; i < RANGE_NUM; i++) { if (range[i].end == 0) break; @@ -146,14 +170,8 @@ static int __init cmp_range(const void *x1, const void *x2) return start1 - start2; } -struct var_mtrr_range_state { - unsigned long base_pfn; - unsigned long size_pfn; - mtrr_type type; -}; - -static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; -static int __initdata debug_print; +#define BIOS_BUG_MSG KERN_WARNING \ + "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n" static int __init x86_get_mtrr_mem_range(struct res_range *range, int nr_range, @@ -180,7 +198,7 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, range[i].start, range[i].end + 1); } - /* take out UC ranges */ + /* Take out UC ranges: */ for (i = 0; i < num_var_ranges; i++) { type = range_state[i].type; if (type != MTRR_TYPE_UNCACHABLE && @@ -193,9 +211,7 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed && (mtrr_state.enabled & 1)) { /* Var MTRR contains UC entry below 1M? Skip it: */ - printk(KERN_WARNING "WARNING: BIOS bug: VAR MTRR %d " - "contains strange UC entry under 1M, check " - "with your system vendor!\n", i); + printk(BIOS_BUG_MSG, i); if (base + size <= (1<<(20-PAGE_SHIFT))) continue; size -= (1<<(20-PAGE_SHIFT)) - base; @@ -237,17 +253,13 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, return nr_range; } -static struct res_range __initdata range[RANGE_NUM]; -static int __initdata nr_range; - #ifdef CONFIG_MTRR_SANITIZER static unsigned long __init sum_ranges(struct res_range *range, int nr_range) { - unsigned long sum; + unsigned long sum = 0; int i; - sum = 0; for (i = 0; i < nr_range; i++) sum += range[i].end + 1 - range[i].start; @@ -278,17 +290,9 @@ static int __init mtrr_cleanup_debug_setup(char *str) } early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup); -struct var_mtrr_state { - unsigned long range_startk; - unsigned long range_sizek; - unsigned long chunk_sizek; - unsigned long gran_sizek; - unsigned int reg; -}; - static void __init set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, - unsigned char type, unsigned int address_bits) + unsigned char type, unsigned int address_bits) { u32 base_lo, base_hi, mask_lo, mask_hi; u64 base, mask; @@ -301,7 +305,7 @@ set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, mask = (1ULL << address_bits) - 1; mask &= ~((((u64)sizek) << 10) - 1); - base = ((u64)basek) << 10; + base = ((u64)basek) << 10; base |= type; mask |= 0x800; @@ -317,15 +321,14 @@ set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, static void __init save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, - unsigned char type) + unsigned char type) { range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10); range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10); range_state[reg].type = type; } -static void __init -set_var_mtrr_all(unsigned int address_bits) +static void __init set_var_mtrr_all(unsigned int address_bits) { unsigned long basek, sizek; unsigned char type; @@ -342,11 +345,11 @@ set_var_mtrr_all(unsigned int address_bits) static unsigned long to_size_factor(unsigned long sizek, char *factorp) { - char factor; unsigned long base = sizek; + char factor; if (base & ((1<<10) - 1)) { - /* not MB alignment */ + /* Not MB-aligned: */ factor = 'K'; } else if (base & ((1<<20) - 1)) { factor = 'M'; @@ -372,11 +375,12 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk, unsigned long max_align, align; unsigned long sizek; - /* Compute the maximum size I can make a range */ + /* Compute the maximum size with which we can make a range: */ if (range_startk) max_align = ffs(range_startk) - 1; else max_align = 32; + align = fls(range_sizek) - 1; if (align > max_align) align = max_align; @@ -386,11 +390,10 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk, char start_factor = 'K', size_factor = 'K'; unsigned long start_base, size_base; - start_base = to_size_factor(range_startk, - &start_factor), - size_base = to_size_factor(sizek, &size_factor), + start_base = to_size_factor(range_startk, &start_factor); + size_base = to_size_factor(sizek, &size_factor); - printk(KERN_DEBUG "Setting variable MTRR %d, " + Dprintk("Setting variable MTRR %d, " "base: %ld%cB, range: %ld%cB, type %s\n", reg, start_base, start_factor, size_base, size_factor, @@ -425,10 +428,11 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, chunk_sizek = state->chunk_sizek; gran_sizek = state->gran_sizek; - /* align with gran size, prevent small block used up MTRRs */ + /* Align with gran size, prevent small block used up MTRRs: */ range_basek = ALIGN(state->range_startk, gran_sizek); if ((range_basek > basek) && basek) return second_sizek; + state->range_sizek -= (range_basek - state->range_startk); range_sizek = ALIGN(state->range_sizek, gran_sizek); @@ -439,22 +443,21 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, } state->range_sizek = range_sizek; - /* try to append some small hole */ + /* Try to append some small hole: */ range0_basek = state->range_startk; range0_sizek = ALIGN(state->range_sizek, chunk_sizek); - /* no increase */ + /* No increase: */ if (range0_sizek == state->range_sizek) { - if (debug_print) - printk(KERN_DEBUG "rangeX: %016lx - %016lx\n", - range0_basek<<10, - (range0_basek + state->range_sizek)<<10); + Dprintk("rangeX: %016lx - %016lx\n", + range0_basek<<10, + (range0_basek + state->range_sizek)<<10); state->reg = range_to_mtrr(state->reg, range0_basek, state->range_sizek, MTRR_TYPE_WRBACK); return 0; } - /* only cut back, when it is not the last */ + /* Only cut back when it is not the last: */ if (sizek) { while (range0_basek + range0_sizek > (basek + sizek)) { if (range0_sizek >= chunk_sizek) @@ -470,16 +473,16 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, second_try: range_basek = range0_basek + range0_sizek; - /* one hole in the middle */ + /* One hole in the middle: */ if (range_basek > basek && range_basek <= (basek + sizek)) second_sizek = range_basek - basek; if (range0_sizek > state->range_sizek) { - /* one hole in middle or at end */ + /* One hole in middle or at the end: */ hole_sizek = range0_sizek - state->range_sizek - second_sizek; - /* hole size should be less than half of range0 size */ + /* Hole size should be less than half of range0 size: */ if (hole_sizek >= (range0_sizek >> 1) && range0_sizek >= chunk_sizek) { range0_sizek -= chunk_sizek; @@ -491,32 +494,30 @@ second_try: } if (range0_sizek) { - if (debug_print) - printk(KERN_DEBUG "range0: %016lx - %016lx\n", - range0_basek<<10, - (range0_basek + range0_sizek)<<10); + Dprintk("range0: %016lx - %016lx\n", + range0_basek<<10, + (range0_basek + range0_sizek)<<10); state->reg = range_to_mtrr(state->reg, range0_basek, range0_sizek, MTRR_TYPE_WRBACK); } if (range0_sizek < state->range_sizek) { - /* need to handle left over */ + /* Need to handle left over range: */ range_sizek = state->range_sizek - range0_sizek; - if (debug_print) - printk(KERN_DEBUG "range: %016lx - %016lx\n", - range_basek<<10, - (range_basek + range_sizek)<<10); + Dprintk("range: %016lx - %016lx\n", + range_basek<<10, + (range_basek + range_sizek)<<10); + state->reg = range_to_mtrr(state->reg, range_basek, range_sizek, MTRR_TYPE_WRBACK); } if (hole_sizek) { hole_basek = range_basek - hole_sizek - second_sizek; - if (debug_print) - printk(KERN_DEBUG "hole: %016lx - %016lx\n", - hole_basek<<10, - (hole_basek + hole_sizek)<<10); + Dprintk("hole: %016lx - %016lx\n", + hole_basek<<10, + (hole_basek + hole_sizek)<<10); state->reg = range_to_mtrr(state->reg, hole_basek, hole_sizek, MTRR_TYPE_UNCACHABLE); } @@ -537,23 +538,23 @@ set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn, basek = base_pfn << (PAGE_SHIFT - 10); sizek = size_pfn << (PAGE_SHIFT - 10); - /* See if I can merge with the last range */ + /* See if I can merge with the last range: */ if ((basek <= 1024) || (state->range_startk + state->range_sizek == basek)) { unsigned long endk = basek + sizek; state->range_sizek = endk - state->range_startk; return; } - /* Write the range mtrrs */ + /* Write the range mtrrs: */ if (state->range_sizek != 0) second_sizek = range_to_mtrr_with_hole(state, basek, sizek); - /* Allocate an msr */ + /* Allocate an msr: */ state->range_startk = basek + second_sizek; state->range_sizek = sizek - second_sizek; } -/* mininum size of mtrr block that can take hole */ +/* Mininum size of mtrr block that can take hole: */ static u64 mtrr_chunk_size __initdata = (256ULL<<20); static int __init parse_mtrr_chunk_size_opt(char *p) @@ -565,7 +566,7 @@ static int __init parse_mtrr_chunk_size_opt(char *p) } early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt); -/* granity of mtrr of block */ +/* Granularity of mtrr of block: */ static u64 mtrr_gran_size __initdata; static int __init parse_mtrr_gran_size_opt(char *p) @@ -577,7 +578,7 @@ static int __init parse_mtrr_gran_size_opt(char *p) } early_param("mtrr_gran_size", parse_mtrr_gran_size_opt); -static int nr_mtrr_spare_reg __initdata = +static unsigned long nr_mtrr_spare_reg __initdata = CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT; static int __init parse_mtrr_spare_reg(char *arg) @@ -586,7 +587,6 @@ static int __init parse_mtrr_spare_reg(char *arg) nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0); return 0; } - early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg); static int __init @@ -594,8 +594,8 @@ x86_setup_var_mtrrs(struct res_range *range, int nr_range, u64 chunk_size, u64 gran_size) { struct var_mtrr_state var_state; - int i; int num_reg; + int i; var_state.range_startk = 0; var_state.range_sizek = 0; @@ -605,17 +605,18 @@ x86_setup_var_mtrrs(struct res_range *range, int nr_range, memset(range_state, 0, sizeof(range_state)); - /* Write the range etc */ - for (i = 0; i < nr_range; i++) + /* Write the range: */ + for (i = 0; i < nr_range; i++) { set_var_mtrr_range(&var_state, range[i].start, range[i].end - range[i].start + 1); + } - /* Write the last range */ + /* Write the last range: */ if (var_state.range_sizek != 0) range_to_mtrr_with_hole(&var_state, 0, 0); num_reg = var_state.reg; - /* Clear out the extra MTRR's */ + /* Clear out the extra MTRR's: */ while (var_state.reg < num_var_ranges) { save_var_mtrr(var_state.reg, 0, 0, 0); var_state.reg++; @@ -625,11 +626,11 @@ x86_setup_var_mtrrs(struct res_range *range, int nr_range, } struct mtrr_cleanup_result { - unsigned long gran_sizek; - unsigned long chunk_sizek; - unsigned long lose_cover_sizek; - unsigned int num_reg; - int bad; + unsigned long gran_sizek; + unsigned long chunk_sizek; + unsigned long lose_cover_sizek; + unsigned int num_reg; + int bad; }; /* @@ -645,10 +646,10 @@ static unsigned long __initdata min_loss_pfn[RANGE_NUM]; static void __init print_out_mtrr_range_state(void) { - int i; char start_factor = 'K', size_factor = 'K'; unsigned long start_base, size_base; mtrr_type type; + int i; for (i = 0; i < num_var_ranges; i++) { @@ -676,10 +677,10 @@ static int __init mtrr_need_cleanup(void) int i; mtrr_type type; unsigned long size; - /* extra one for all 0 */ + /* Extra one for all 0: */ int num[MTRR_NUM_TYPES + 1]; - /* check entries number */ + /* Check entries number: */ memset(num, 0, sizeof(num)); for (i = 0; i < num_var_ranges; i++) { type = range_state[i].type; @@ -693,88 +694,86 @@ static int __init mtrr_need_cleanup(void) num[type]++; } - /* check if we got UC entries */ + /* Check if we got UC entries: */ if (!num[MTRR_TYPE_UNCACHABLE]) return 0; - /* check if we only had WB and UC */ + /* Check if we only had WB and UC */ if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] != - num_var_ranges - num[MTRR_NUM_TYPES]) + num_var_ranges - num[MTRR_NUM_TYPES]) return 0; return 1; } static unsigned long __initdata range_sums; -static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size, - unsigned long extra_remove_base, - unsigned long extra_remove_size, - int i) + +static void __init +mtrr_calc_range_state(u64 chunk_size, u64 gran_size, + unsigned long x_remove_base, + unsigned long x_remove_size, int i) { - int num_reg; static struct res_range range_new[RANGE_NUM]; - static int nr_range_new; unsigned long range_sums_new; + static int nr_range_new; + int num_reg; - /* convert ranges to var ranges state */ - num_reg = x86_setup_var_mtrrs(range, nr_range, - chunk_size, gran_size); + /* Convert ranges to var ranges state: */ + num_reg = x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); - /* we got new setting in range_state, check it */ + /* We got new setting in range_state, check it: */ memset(range_new, 0, sizeof(range_new)); nr_range_new = x86_get_mtrr_mem_range(range_new, 0, - extra_remove_base, extra_remove_size); + x_remove_base, x_remove_size); range_sums_new = sum_ranges(range_new, nr_range_new); result[i].chunk_sizek = chunk_size >> 10; result[i].gran_sizek = gran_size >> 10; result[i].num_reg = num_reg; + if (range_sums < range_sums_new) { - result[i].lose_cover_sizek = - (range_sums_new - range_sums) << PSHIFT; + result[i].lose_cover_sizek = (range_sums_new - range_sums) << PSHIFT; result[i].bad = 1; - } else - result[i].lose_cover_sizek = - (range_sums - range_sums_new) << PSHIFT; + } else { + result[i].lose_cover_sizek = (range_sums - range_sums_new) << PSHIFT; + } - /* double check it */ + /* Double check it: */ if (!result[i].bad && !result[i].lose_cover_sizek) { - if (nr_range_new != nr_range || - memcmp(range, range_new, sizeof(range))) - result[i].bad = 1; + if (nr_range_new != nr_range || memcmp(range, range_new, sizeof(range))) + result[i].bad = 1; } - if (!result[i].bad && (range_sums - range_sums_new < - min_loss_pfn[num_reg])) { - min_loss_pfn[num_reg] = - range_sums - range_sums_new; - } + if (!result[i].bad && (range_sums - range_sums_new < min_loss_pfn[num_reg])) + min_loss_pfn[num_reg] = range_sums - range_sums_new; } static void __init mtrr_print_out_one_result(int i) { - char gran_factor, chunk_factor, lose_factor; unsigned long gran_base, chunk_base, lose_base; + char gran_factor, chunk_factor, lose_factor; gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), - printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t", - result[i].bad ? "*BAD*" : " ", - gran_base, gran_factor, chunk_base, chunk_factor); - printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n", - result[i].num_reg, result[i].bad ? "-" : "", - lose_base, lose_factor); + + pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t", + result[i].bad ? "*BAD*" : " ", + gran_base, gran_factor, chunk_base, chunk_factor); + pr_cont("num_reg: %d \tlose cover RAM: %s%ld%c\n", + result[i].num_reg, result[i].bad ? "-" : "", + lose_base, lose_factor); } static int __init mtrr_search_optimal_index(void) { - int i; int num_reg_good; int index_good; + int i; if (nr_mtrr_spare_reg >= num_var_ranges) nr_mtrr_spare_reg = num_var_ranges - 1; + num_reg_good = -1; for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) { if (!min_loss_pfn[i]) @@ -796,24 +795,24 @@ static int __init mtrr_search_optimal_index(void) return index_good; } - int __init mtrr_cleanup(unsigned address_bits) { - unsigned long extra_remove_base, extra_remove_size; + unsigned long x_remove_base, x_remove_size; unsigned long base, size, def, dummy; - mtrr_type type; u64 chunk_size, gran_size; + mtrr_type type; int index_good; int i; if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) return 0; + rdmsr(MSR_MTRRdefType, def, dummy); def &= 0xff; if (def != MTRR_TYPE_UNCACHABLE) return 0; - /* get it and store it aside */ + /* Get it and store it aside: */ memset(range_state, 0, sizeof(range_state)); for (i = 0; i < num_var_ranges; i++) { mtrr_if->get(i, &base, &size, &type); @@ -822,29 +821,28 @@ int __init mtrr_cleanup(unsigned address_bits) range_state[i].type = type; } - /* check if we need handle it and can handle it */ + /* Check if we need handle it and can handle it: */ if (!mtrr_need_cleanup()) return 0; - /* print original var MTRRs at first, for debugging: */ + /* Print original var MTRRs at first, for debugging: */ printk(KERN_DEBUG "original variable MTRRs\n"); print_out_mtrr_range_state(); memset(range, 0, sizeof(range)); - extra_remove_size = 0; - extra_remove_base = 1 << (32 - PAGE_SHIFT); + x_remove_size = 0; + x_remove_base = 1 << (32 - PAGE_SHIFT); if (mtrr_tom2) - extra_remove_size = - (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base; - nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base, - extra_remove_size); + x_remove_size = (mtrr_tom2 >> PAGE_SHIFT) - x_remove_base; + + nr_range = x86_get_mtrr_mem_range(range, 0, x_remove_base, x_remove_size); /* - * [0, 1M) should always be coverred by var mtrr with WB - * and fixed mtrrs should take effective before var mtrr for it + * [0, 1M) should always be covered by var mtrr with WB + * and fixed mtrrs should take effect before var mtrr for it: */ nr_range = add_range_with_merge(range, nr_range, 0, (1ULL<<(20 - PAGE_SHIFT)) - 1); - /* sort the ranges */ + /* Sort the ranges: */ sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); range_sums = sum_ranges(range, nr_range); @@ -854,7 +852,7 @@ int __init mtrr_cleanup(unsigned address_bits) if (mtrr_chunk_size && mtrr_gran_size) { i = 0; mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size, - extra_remove_base, extra_remove_size, i); + x_remove_base, x_remove_size, i); mtrr_print_out_one_result(i); @@ -880,7 +878,7 @@ int __init mtrr_cleanup(unsigned address_bits) continue; mtrr_calc_range_state(chunk_size, gran_size, - extra_remove_base, extra_remove_size, i); + x_remove_base, x_remove_size, i); if (debug_print) { mtrr_print_out_one_result(i); printk(KERN_INFO "\n"); @@ -890,7 +888,7 @@ int __init mtrr_cleanup(unsigned address_bits) } } - /* try to find the optimal index */ + /* Try to find the optimal index: */ index_good = mtrr_search_optimal_index(); if (index_good != -1) { @@ -898,7 +896,7 @@ int __init mtrr_cleanup(unsigned address_bits) i = index_good; mtrr_print_out_one_result(i); - /* convert ranges to var ranges state */ + /* Convert ranges to var ranges state: */ chunk_size = result[i].chunk_sizek; chunk_size <<= 10; gran_size = result[i].gran_sizek; @@ -941,8 +939,8 @@ early_param("disable_mtrr_trim", disable_mtrr_trim_setup); * Note this won't check if the MTRRs < 4GB where the magic bit doesn't * apply to are wrong, but so far we don't know of any such case in the wild. */ -#define Tom2Enabled (1U << 21) -#define Tom2ForceMemTypeWB (1U << 22) +#define Tom2Enabled (1U << 21) +#define Tom2ForceMemTypeWB (1U << 22) int __init amd_special_default_mtrr(void) { @@ -952,7 +950,7 @@ int __init amd_special_default_mtrr(void) return 0; if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) return 0; - /* In case some hypervisor doesn't pass SYSCFG through */ + /* In case some hypervisor doesn't pass SYSCFG through: */ if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0) return 0; /* @@ -965,19 +963,21 @@ int __init amd_special_default_mtrr(void) return 0; } -static u64 __init real_trim_memory(unsigned long start_pfn, - unsigned long limit_pfn) +static u64 __init +real_trim_memory(unsigned long start_pfn, unsigned long limit_pfn) { u64 trim_start, trim_size; + trim_start = start_pfn; trim_start <<= PAGE_SHIFT; + trim_size = limit_pfn; trim_size <<= PAGE_SHIFT; trim_size -= trim_start; - return e820_update_range(trim_start, trim_size, E820_RAM, - E820_RESERVED); + return e820_update_range(trim_start, trim_size, E820_RAM, E820_RESERVED); } + /** * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs * @end_pfn: ending page frame number @@ -985,7 +985,7 @@ static u64 __init real_trim_memory(unsigned long start_pfn, * Some buggy BIOSes don't setup the MTRRs properly for systems with certain * memory configurations. This routine checks that the highest MTRR matches * the end of memory, to make sure the MTRRs having a write back type cover - * all of the memory the kernel is intending to use. If not, it'll trim any + * all of the memory the kernel is intending to use. If not, it'll trim any * memory off the end by adjusting end_pfn, removing it from the kernel's * allocation pools, warning the user with an obnoxious message. */ @@ -994,21 +994,22 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) unsigned long i, base, size, highest_pfn = 0, def, dummy; mtrr_type type; u64 total_trim_size; - /* extra one for all 0 */ int num[MTRR_NUM_TYPES + 1]; + /* * Make sure we only trim uncachable memory on machines that * support the Intel MTRR architecture: */ if (!is_cpu(INTEL) || disable_mtrr_trim) return 0; + rdmsr(MSR_MTRRdefType, def, dummy); def &= 0xff; if (def != MTRR_TYPE_UNCACHABLE) return 0; - /* get it and store it aside */ + /* Get it and store it aside: */ memset(range_state, 0, sizeof(range_state)); for (i = 0; i < num_var_ranges; i++) { mtrr_if->get(i, &base, &size, &type); @@ -1017,7 +1018,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) range_state[i].type = type; } - /* Find highest cached pfn */ + /* Find highest cached pfn: */ for (i = 0; i < num_var_ranges; i++) { type = range_state[i].type; if (type != MTRR_TYPE_WRBACK) @@ -1028,13 +1029,13 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) highest_pfn = base + size; } - /* kvm/qemu doesn't have mtrr set right, don't trim them all */ + /* kvm/qemu doesn't have mtrr set right, don't trim them all: */ if (!highest_pfn) { printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n"); return 0; } - /* check entries number */ + /* Check entries number: */ memset(num, 0, sizeof(num)); for (i = 0; i < num_var_ranges; i++) { type = range_state[i].type; @@ -1046,11 +1047,11 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) num[type]++; } - /* no entry for WB? */ + /* No entry for WB? */ if (!num[MTRR_TYPE_WRBACK]) return 0; - /* check if we only had WB and UC */ + /* Check if we only had WB and UC: */ if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] != num_var_ranges - num[MTRR_NUM_TYPES]) return 0; @@ -1066,31 +1067,31 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) } nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0); + /* Check the head: */ total_trim_size = 0; - /* check the head */ if (range[0].start) total_trim_size += real_trim_memory(0, range[0].start); - /* check the holes */ + + /* Check the holes: */ for (i = 0; i < nr_range - 1; i++) { if (range[i].end + 1 < range[i+1].start) total_trim_size += real_trim_memory(range[i].end + 1, range[i+1].start); } - /* check the top */ + + /* Check the top: */ i = nr_range - 1; if (range[i].end + 1 < end_pfn) total_trim_size += real_trim_memory(range[i].end + 1, end_pfn); if (total_trim_size) { - printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover" - " all of memory, losing %lluMB of RAM.\n", - total_trim_size >> 20); + pr_warning("WARNING: BIOS bug: CPU MTRRs don't cover all of memory, losing %lluMB of RAM.\n", total_trim_size >> 20); if (!changed_by_mtrr_cleanup) WARN_ON(1); - printk(KERN_INFO "update e820 for mtrr\n"); + pr_info("update e820 for mtrr\n"); update_e820(); return 1; @@ -1098,4 +1099,3 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) return 0; } - diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c index ff14c320040..228d982ce09 100644 --- a/arch/x86/kernel/cpu/mtrr/cyrix.c +++ b/arch/x86/kernel/cpu/mtrr/cyrix.c @@ -1,38 +1,40 @@ #include <linux/init.h> +#include <linux/io.h> #include <linux/mm.h> -#include <asm/mtrr.h> -#include <asm/msr.h> -#include <asm/io.h> + #include <asm/processor-cyrix.h> #include <asm/processor-flags.h> +#include <asm/mtrr.h> +#include <asm/msr.h> + #include "mtrr.h" static void cyrix_get_arr(unsigned int reg, unsigned long *base, unsigned long *size, mtrr_type * type) { - unsigned long flags; unsigned char arr, ccr3, rcr, shift; + unsigned long flags; arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */ - /* Save flags and disable interrupts */ local_irq_save(flags); ccr3 = getCx86(CX86_CCR3); setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ - ((unsigned char *) base)[3] = getCx86(arr); - ((unsigned char *) base)[2] = getCx86(arr + 1); - ((unsigned char *) base)[1] = getCx86(arr + 2); + ((unsigned char *)base)[3] = getCx86(arr); + ((unsigned char *)base)[2] = getCx86(arr + 1); + ((unsigned char *)base)[1] = getCx86(arr + 2); rcr = getCx86(CX86_RCR_BASE + reg); - setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ + setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ - /* Enable interrupts if it was enabled previously */ local_irq_restore(flags); + shift = ((unsigned char *) base)[1] & 0x0f; *base >>= PAGE_SHIFT; - /* Power of two, at least 4K on ARR0-ARR6, 256K on ARR7 + /* + * Power of two, at least 4K on ARR0-ARR6, 256K on ARR7 * Note: shift==0xf means 4G, this is unsupported. */ if (shift) @@ -76,17 +78,20 @@ cyrix_get_arr(unsigned int reg, unsigned long *base, } } +/* + * cyrix_get_free_region - get a free ARR. + * + * @base: the starting (base) address of the region. + * @size: the size (in bytes) of the region. + * + * Returns: the index of the region on success, else -1 on error. +*/ static int cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg) -/* [SUMMARY] Get a free ARR. - <base> The starting (base) address of the region. - <size> The size (in bytes) of the region. - [RETURNS] The index of the region on success, else -1 on error. -*/ { - int i; - mtrr_type ltype; unsigned long lbase, lsize; + mtrr_type ltype; + int i; switch (replace_reg) { case 7: @@ -107,14 +112,17 @@ cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg) cyrix_get_arr(7, &lbase, &lsize, <ype); if (lsize == 0) return 7; - /* Else try ARR0-ARR6 first */ + /* Else try ARR0-ARR6 first */ } else { for (i = 0; i < 7; i++) { cyrix_get_arr(i, &lbase, &lsize, <ype); if (lsize == 0) return i; } - /* ARR0-ARR6 isn't free, try ARR7 but its size must be at least 256K */ + /* + * ARR0-ARR6 isn't free + * try ARR7 but its size must be at least 256K + */ cyrix_get_arr(i, &lbase, &lsize, <ype); if ((lsize == 0) && (size >= 0x40)) return i; @@ -122,21 +130,22 @@ cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg) return -ENOSPC; } -static u32 cr4 = 0; -static u32 ccr3; +static u32 cr4, ccr3; static void prepare_set(void) { u32 cr0; /* Save value of CR4 and clear Page Global Enable (bit 7) */ - if ( cpu_has_pge ) { + if (cpu_has_pge) { cr4 = read_cr4(); write_cr4(cr4 & ~X86_CR4_PGE); } - /* Disable and flush caches. Note that wbinvd flushes the TLBs as - a side-effect */ + /* + * Disable and flush caches. + * Note that wbinvd flushes the TLBs as a side-effect + */ cr0 = read_cr0() | X86_CR0_CD; wbinvd(); write_cr0(cr0); @@ -147,22 +156,21 @@ static void prepare_set(void) /* Cyrix ARRs - everything else was excluded at the top */ setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); - } static void post_set(void) { - /* Flush caches and TLBs */ + /* Flush caches and TLBs */ wbinvd(); /* Cyrix ARRs - everything else was excluded at the top */ setCx86(CX86_CCR3, ccr3); - - /* Enable caches */ + + /* Enable caches */ write_cr0(read_cr0() & 0xbfffffff); - /* Restore value of CR4 */ - if ( cpu_has_pge ) + /* Restore value of CR4 */ + if (cpu_has_pge) write_cr4(cr4); } @@ -178,7 +186,8 @@ static void cyrix_set_arr(unsigned int reg, unsigned long base, size >>= 6; size &= 0x7fff; /* make sure arr_size <= 14 */ - for (arr_size = 0; size; arr_size++, size >>= 1) ; + for (arr_size = 0; size; arr_size++, size >>= 1) + ; if (reg < 7) { switch (type) { @@ -215,18 +224,18 @@ static void cyrix_set_arr(unsigned int reg, unsigned long base, prepare_set(); base <<= PAGE_SHIFT; - setCx86(arr, ((unsigned char *) &base)[3]); - setCx86(arr + 1, ((unsigned char *) &base)[2]); - setCx86(arr + 2, (((unsigned char *) &base)[1]) | arr_size); + setCx86(arr + 0, ((unsigned char *)&base)[3]); + setCx86(arr + 1, ((unsigned char *)&base)[2]); + setCx86(arr + 2, (((unsigned char *)&base)[1]) | arr_size); setCx86(CX86_RCR_BASE + reg, arr_type); post_set(); } typedef struct { - unsigned long base; - unsigned long size; - mtrr_type type; + unsigned long base; + unsigned long size; + mtrr_type type; } arr_state_t; static arr_state_t arr_state[8] = { @@ -247,16 +256,17 @@ static void cyrix_set_all(void) setCx86(CX86_CCR0 + i, ccr_state[i]); for (; i < 7; i++) setCx86(CX86_CCR4 + i, ccr_state[i]); - for (i = 0; i < 8; i++) - cyrix_set_arr(i, arr_state[i].base, + + for (i = 0; i < 8; i++) { + cyrix_set_arr(i, arr_state[i].base, arr_state[i].size, arr_state[i].type); + } post_set(); } static struct mtrr_ops cyrix_mtrr_ops = { .vendor = X86_VENDOR_CYRIX, -// .init = cyrix_arr_init, .set_all = cyrix_set_all, .set = cyrix_set_arr, .get = cyrix_get_arr, @@ -270,5 +280,3 @@ int __init cyrix_init_mtrr(void) set_mtrr_ops(&cyrix_mtrr_ops); return 0; } - -//arch_initcall(cyrix_init_mtrr); diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 0543f69f0b2..55da0c5f68d 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -1,28 +1,34 @@ -/* This only handles 32bit MTRR on 32bit hosts. This is strictly wrong - because MTRRs can span upto 40 bits (36bits on most modern x86) */ +/* + * This only handles 32bit MTRR on 32bit hosts. This is strictly wrong + * because MTRRs can span upto 40 bits (36bits on most modern x86) + */ +#define DEBUG + +#include <linux/module.h> #include <linux/init.h> #include <linux/slab.h> +#include <linux/io.h> #include <linux/mm.h> -#include <linux/module.h> -#include <asm/io.h> -#include <asm/mtrr.h> -#include <asm/msr.h> -#include <asm/system.h> -#include <asm/cpufeature.h> + #include <asm/processor-flags.h> +#include <asm/cpufeature.h> #include <asm/tlbflush.h> +#include <asm/system.h> +#include <asm/mtrr.h> +#include <asm/msr.h> #include <asm/pat.h> + #include "mtrr.h" struct fixed_range_block { - int base_msr; /* start address of an MTRR block */ - int ranges; /* number of MTRRs in this block */ + int base_msr; /* start address of an MTRR block */ + int ranges; /* number of MTRRs in this block */ }; static struct fixed_range_block fixed_range_blocks[] = { - { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */ - { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */ - { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */ + { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */ + { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */ + { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */ {} }; @@ -30,10 +36,10 @@ static unsigned long smp_changes_mask; static int mtrr_state_set; u64 mtrr_tom2; -struct mtrr_state_type mtrr_state = {}; +struct mtrr_state_type mtrr_state; EXPORT_SYMBOL_GPL(mtrr_state); -/** +/* * BIOS is expected to clear MtrrFixDramModEn bit, see for example * "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD * Opteron Processors" (26094 Rev. 3.30 February 2006), section @@ -104,9 +110,8 @@ u8 mtrr_type_lookup(u64 start, u64 end) * Look of multiple ranges matching this address and pick type * as per MTRR precedence */ - if (!(mtrr_state.enabled & 2)) { + if (!(mtrr_state.enabled & 2)) return mtrr_state.def_type; - } prev_match = 0xFF; for (i = 0; i < num_var_ranges; ++i) { @@ -125,9 +130,8 @@ u8 mtrr_type_lookup(u64 start, u64 end) if (start_state != end_state) return 0xFE; - if ((start & mask) != (base & mask)) { + if ((start & mask) != (base & mask)) continue; - } curr_match = mtrr_state.var_ranges[i].base_lo & 0xff; if (prev_match == 0xFF) { @@ -148,9 +152,8 @@ u8 mtrr_type_lookup(u64 start, u64 end) curr_match = MTRR_TYPE_WRTHROUGH; } - if (prev_match != curr_match) { + if (prev_match != curr_match) return MTRR_TYPE_UNCACHABLE; - } } if (mtrr_tom2) { @@ -164,7 +167,7 @@ u8 mtrr_type_lookup(u64 start, u64 end) return mtrr_state.def_type; } -/* Get the MSR pair relating to a var range */ +/* Get the MSR pair relating to a var range */ static void get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) { @@ -172,7 +175,7 @@ get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi); } -/* fill the MSR pair relating to a var range */ +/* Fill the MSR pair relating to a var range */ void fill_mtrr_var_range(unsigned int index, u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi) { @@ -186,10 +189,9 @@ void fill_mtrr_var_range(unsigned int index, vr[index].mask_hi = mask_hi; } -static void -get_fixed_ranges(mtrr_type * frs) +static void get_fixed_ranges(mtrr_type *frs) { - unsigned int *p = (unsigned int *) frs; + unsigned int *p = (unsigned int *)frs; int i; k8_check_syscfg_dram_mod_en(); @@ -217,22 +219,22 @@ static void __init print_fixed_last(void) if (!last_fixed_end) return; - printk(KERN_DEBUG " %05X-%05X %s\n", last_fixed_start, - last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type)); + pr_debug(" %05X-%05X %s\n", last_fixed_start, + last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type)); last_fixed_end = 0; } static void __init update_fixed_last(unsigned base, unsigned end, - mtrr_type type) + mtrr_type type) { last_fixed_start = base; last_fixed_end = end; last_fixed_type = type; } -static void __init print_fixed(unsigned base, unsigned step, - const mtrr_type *types) +static void __init +print_fixed(unsigned base, unsigned step, const mtrr_type *types) { unsigned i; @@ -259,54 +261,55 @@ static void __init print_mtrr_state(void) unsigned int i; int high_width; - printk(KERN_DEBUG "MTRR default type: %s\n", - mtrr_attrib_to_str(mtrr_state.def_type)); + pr_debug("MTRR default type: %s\n", + mtrr_attrib_to_str(mtrr_state.def_type)); if (mtrr_state.have_fixed) { - printk(KERN_DEBUG "MTRR fixed ranges %sabled:\n", - mtrr_state.enabled & 1 ? "en" : "dis"); + pr_debug("MTRR fixed ranges %sabled:\n", + mtrr_state.enabled & 1 ? "en" : "dis"); print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0); for (i = 0; i < 2; ++i) - print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8); + print_fixed(0x80000 + i * 0x20000, 0x04000, + mtrr_state.fixed_ranges + (i + 1) * 8); for (i = 0; i < 8; ++i) - print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8); + print_fixed(0xC0000 + i * 0x08000, 0x01000, + mtrr_state.fixed_ranges + (i + 3) * 8); /* tail */ print_fixed_last(); } - printk(KERN_DEBUG "MTRR variable ranges %sabled:\n", - mtrr_state.enabled & 2 ? "en" : "dis"); + pr_debug("MTRR variable ranges %sabled:\n", + mtrr_state.enabled & 2 ? "en" : "dis"); if (size_or_mask & 0xffffffffUL) high_width = ffs(size_or_mask & 0xffffffffUL) - 1; else high_width = ffs(size_or_mask>>32) + 32 - 1; high_width = (high_width - (32 - PAGE_SHIFT) + 3) / 4; + for (i = 0; i < num_var_ranges; ++i) { if (mtrr_state.var_ranges[i].mask_lo & (1 << 11)) - printk(KERN_DEBUG " %u base %0*X%05X000 mask %0*X%05X000 %s\n", - i, - high_width, - mtrr_state.var_ranges[i].base_hi, - mtrr_state.var_ranges[i].base_lo >> 12, - high_width, - mtrr_state.var_ranges[i].mask_hi, - mtrr_state.var_ranges[i].mask_lo >> 12, - mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff)); + pr_debug(" %u base %0*X%05X000 mask %0*X%05X000 %s\n", + i, + high_width, + mtrr_state.var_ranges[i].base_hi, + mtrr_state.var_ranges[i].base_lo >> 12, + high_width, + mtrr_state.var_ranges[i].mask_hi, + mtrr_state.var_ranges[i].mask_lo >> 12, + mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff)); else - printk(KERN_DEBUG " %u disabled\n", i); - } - if (mtrr_tom2) { - printk(KERN_DEBUG "TOM2: %016llx aka %lldM\n", - mtrr_tom2, mtrr_tom2>>20); + pr_debug(" %u disabled\n", i); } + if (mtrr_tom2) + pr_debug("TOM2: %016llx aka %lldM\n", mtrr_tom2, mtrr_tom2>>20); } -/* Grab all of the MTRR state for this CPU into *state */ +/* Grab all of the MTRR state for this CPU into *state */ void __init get_mtrr_state(void) { - unsigned int i; struct mtrr_var_range *vrs; - unsigned lo, dummy; unsigned long flags; + unsigned lo, dummy; + unsigned int i; vrs = mtrr_state.var_ranges; @@ -324,6 +327,7 @@ void __init get_mtrr_state(void) if (amd_special_default_mtrr()) { unsigned low, high; + /* TOP_MEM2 */ rdmsr(MSR_K8_TOP_MEM2, low, high); mtrr_tom2 = high; @@ -344,10 +348,9 @@ void __init get_mtrr_state(void) post_set(); local_irq_restore(flags); - } -/* Some BIOS's are fucked and don't set all MTRRs the same! */ +/* Some BIOS's are messed up and don't set all MTRRs the same! */ void __init mtrr_state_warn(void) { unsigned long mask = smp_changes_mask; @@ -355,28 +358,33 @@ void __init mtrr_state_warn(void) if (!mask) return; if (mask & MTRR_CHANGE_MASK_FIXED) - printk(KERN_WARNING "mtrr: your CPUs had inconsistent fixed MTRR settings\n"); + pr_warning("mtrr: your CPUs had inconsistent fixed MTRR settings\n"); if (mask & MTRR_CHANGE_MASK_VARIABLE) - printk(KERN_WARNING "mtrr: your CPUs had inconsistent variable MTRR settings\n"); + pr_warning("mtrr: your CPUs had inconsistent variable MTRR settings\n"); if (mask & MTRR_CHANGE_MASK_DEFTYPE) - printk(KERN_WARNING "mtrr: your CPUs had inconsistent MTRRdefType settings\n"); + pr_warning("mtrr: your CPUs had inconsistent MTRRdefType settings\n"); + printk(KERN_INFO "mtrr: probably your BIOS does not setup all CPUs.\n"); printk(KERN_INFO "mtrr: corrected configuration.\n"); } -/* Doesn't attempt to pass an error out to MTRR users - because it's quite complicated in some cases and probably not - worth it because the best error handling is to ignore it. */ +/* + * Doesn't attempt to pass an error out to MTRR users + * because it's quite complicated in some cases and probably not + * worth it because the best error handling is to ignore it. + */ void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b) { - if (wrmsr_safe(msr, a, b) < 0) + if (wrmsr_safe(msr, a, b) < 0) { printk(KERN_ERR "MTRR: CPU %u: Writing MSR %x to %x:%x failed\n", smp_processor_id(), msr, a, b); + } } /** - * set_fixed_range - checks & updates a fixed-range MTRR if it differs from the value it should have + * set_fixed_range - checks & updates a fixed-range MTRR if it + * differs from the value it should have * @msr: MSR address of the MTTR which should be checked and updated * @changed: pointer which indicates whether the MTRR needed to be changed * @msrwords: pointer to the MSR values which the MSR should have @@ -401,20 +409,23 @@ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords) * * Returns: The index of the region on success, else negative on error. */ -int generic_get_free_region(unsigned long base, unsigned long size, int replace_reg) +int +generic_get_free_region(unsigned long base, unsigned long size, int replace_reg) { - int i, max; - mtrr_type ltype; unsigned long lbase, lsize; + mtrr_type ltype; + int i, max; max = num_var_ranges; if (replace_reg >= 0 && replace_reg < max) return replace_reg; + for (i = 0; i < max; ++i) { mtrr_if->get(i, &lbase, &lsize, <ype); if (lsize == 0) return i; } + return -ENOSPC; } @@ -434,7 +445,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); if ((mask_lo & 0x800) == 0) { - /* Invalid (i.e. free) range */ + /* Invalid (i.e. free) range */ *base = 0; *size = 0; *type = 0; @@ -471,27 +482,31 @@ out_put_cpu: } /** - * set_fixed_ranges - checks & updates the fixed-range MTRRs if they differ from the saved set + * set_fixed_ranges - checks & updates the fixed-range MTRRs if they + * differ from the saved set * @frs: pointer to fixed-range MTRR values, saved by get_fixed_ranges() */ -static int set_fixed_ranges(mtrr_type * frs) +static int set_fixed_ranges(mtrr_type *frs) { - unsigned long long *saved = (unsigned long long *) frs; + unsigned long long *saved = (unsigned long long *)frs; bool changed = false; - int block=-1, range; + int block = -1, range; k8_check_syscfg_dram_mod_en(); - while (fixed_range_blocks[++block].ranges) - for (range=0; range < fixed_range_blocks[block].ranges; range++) - set_fixed_range(fixed_range_blocks[block].base_msr + range, - &changed, (unsigned int *) saved++); + while (fixed_range_blocks[++block].ranges) { + for (range = 0; range < fixed_range_blocks[block].ranges; range++) + set_fixed_range(fixed_range_blocks[block].base_msr + range, + &changed, (unsigned int *)saved++); + } return changed; } -/* Set the MSR pair relating to a var range. Returns TRUE if - changes are made */ +/* + * Set the MSR pair relating to a var range. + * Returns true if changes are made. + */ static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) { unsigned int lo, hi; @@ -501,6 +516,7 @@ static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) if ((vr->base_lo & 0xfffff0ffUL) != (lo & 0xfffff0ffUL) || (vr->base_hi & (size_and_mask >> (32 - PAGE_SHIFT))) != (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) { + mtrr_wrmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi); changed = true; } @@ -526,21 +542,26 @@ static u32 deftype_lo, deftype_hi; */ static unsigned long set_mtrr_state(void) { - unsigned int i; unsigned long change_mask = 0; + unsigned int i; - for (i = 0; i < num_var_ranges; i++) + for (i = 0; i < num_var_ranges; i++) { if (set_mtrr_var_ranges(i, &mtrr_state.var_ranges[i])) change_mask |= MTRR_CHANGE_MASK_VARIABLE; + } if (mtrr_state.have_fixed && set_fixed_ranges(mtrr_state.fixed_ranges)) change_mask |= MTRR_CHANGE_MASK_FIXED; - /* Set_mtrr_restore restores the old value of MTRRdefType, - so to set it we fiddle with the saved value */ + /* + * Set_mtrr_restore restores the old value of MTRRdefType, + * so to set it we fiddle with the saved value: + */ if ((deftype_lo & 0xff) != mtrr_state.def_type || ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) { - deftype_lo = (deftype_lo & ~0xcff) | mtrr_state.def_type | (mtrr_state.enabled << 10); + + deftype_lo = (deftype_lo & ~0xcff) | mtrr_state.def_type | + (mtrr_state.enabled << 10); change_mask |= MTRR_CHANGE_MASK_DEFTYPE; } @@ -548,33 +569,36 @@ static unsigned long set_mtrr_state(void) } -static unsigned long cr4 = 0; +static unsigned long cr4; static DEFINE_SPINLOCK(set_atomicity_lock); /* - * Since we are disabling the cache don't allow any interrupts - they - * would run extremely slow and would only increase the pain. The caller must - * ensure that local interrupts are disabled and are reenabled after post_set() - * has been called. + * Since we are disabling the cache don't allow any interrupts, + * they would run extremely slow and would only increase the pain. + * + * The caller must ensure that local interrupts are disabled and + * are reenabled after post_set() has been called. */ - static void prepare_set(void) __acquires(set_atomicity_lock) { unsigned long cr0; - /* Note that this is not ideal, since the cache is only flushed/disabled - for this CPU while the MTRRs are changed, but changing this requires - more invasive changes to the way the kernel boots */ + /* + * Note that this is not ideal + * since the cache is only flushed/disabled for this CPU while the + * MTRRs are changed, but changing this requires more invasive + * changes to the way the kernel boots + */ spin_lock(&set_atomicity_lock); - /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ + /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ cr0 = read_cr0() | X86_CR0_CD; write_cr0(cr0); wbinvd(); - /* Save value of CR4 and clear Page Global Enable (bit 7) */ - if ( cpu_has_pge ) { + /* Save value of CR4 and clear Page Global Enable (bit 7) */ + if (cpu_has_pge) { cr4 = read_cr4(); write_cr4(cr4 & ~X86_CR4_PGE); } @@ -582,26 +606,26 @@ static void prepare_set(void) __acquires(set_atomicity_lock) /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */ __flush_tlb(); - /* Save MTRR state */ + /* Save MTRR state */ rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); - /* Disable MTRRs, and set the default type to uncached */ + /* Disable MTRRs, and set the default type to uncached */ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi); } static void post_set(void) __releases(set_atomicity_lock) { - /* Flush TLBs (no need to flush caches - they are disabled) */ + /* Flush TLBs (no need to flush caches - they are disabled) */ __flush_tlb(); /* Intel (P6) standard MTRRs */ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); - - /* Enable caches */ + + /* Enable caches */ write_cr0(read_cr0() & 0xbfffffff); - /* Restore value of CR4 */ - if ( cpu_has_pge ) + /* Restore value of CR4 */ + if (cpu_has_pge) write_cr4(cr4); spin_unlock(&set_atomicity_lock); } @@ -623,24 +647,27 @@ static void generic_set_all(void) post_set(); local_irq_restore(flags); - /* Use the atomic bitops to update the global mask */ + /* Use the atomic bitops to update the global mask */ for (count = 0; count < sizeof mask * 8; ++count) { if (mask & 0x01) set_bit(count, &smp_changes_mask); mask >>= 1; } - + } +/** + * generic_set_mtrr - set variable MTRR register on the local CPU. + * + * @reg: The register to set. + * @base: The base address of the region. + * @size: The size of the region. If this is 0 the region is disabled. + * @type: The type of the region. + * + * Returns nothing. + */ static void generic_set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) -/* [SUMMARY] Set variable MTRR register on the local CPU. - <reg> The register to set. - <base> The base address of the region. - <size> The size of the region. If this is 0 the region is disabled. - <type> The type of the region. - [RETURNS] Nothing. -*/ { unsigned long flags; struct mtrr_var_range *vr; @@ -651,8 +678,10 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base, prepare_set(); if (size == 0) { - /* The invalid bit is kept in the mask, so we simply clear the - relevant mask register to disable a range. */ + /* + * The invalid bit is kept in the mask, so we simply + * clear the relevant mask register to disable a range. + */ mtrr_wrmsr(MTRRphysMask_MSR(reg), 0, 0); memset(vr, 0, sizeof(struct mtrr_var_range)); } else { @@ -669,46 +698,50 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base, local_irq_restore(flags); } -int generic_validate_add_page(unsigned long base, unsigned long size, unsigned int type) +int generic_validate_add_page(unsigned long base, unsigned long size, + unsigned int type) { unsigned long lbase, last; - /* For Intel PPro stepping <= 7, must be 4 MiB aligned - and not touch 0x70000000->0x7003FFFF */ + /* + * For Intel PPro stepping <= 7 + * must be 4 MiB aligned and not touch 0x70000000 -> 0x7003FFFF + */ if (is_cpu(INTEL) && boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 1 && boot_cpu_data.x86_mask <= 7) { if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) { - printk(KERN_WARNING "mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); + pr_warning("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); return -EINVAL; } if (!(base + size < 0x70000 || base > 0x7003F) && (type == MTRR_TYPE_WRCOMB || type == MTRR_TYPE_WRBACK)) { - printk(KERN_WARNING "mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n"); + pr_warning("mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n"); return -EINVAL; } } - /* Check upper bits of base and last are equal and lower bits are 0 - for base and 1 for last */ + /* + * Check upper bits of base and last are equal and lower bits are 0 + * for base and 1 for last + */ last = base + size - 1; for (lbase = base; !(lbase & 1) && (last & 1); - lbase = lbase >> 1, last = last >> 1) ; + lbase = lbase >> 1, last = last >> 1) + ; if (lbase != last) { - printk(KERN_WARNING "mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", - base, size); + pr_warning("mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", base, size); return -EINVAL; } return 0; } - static int generic_have_wrcomb(void) { unsigned long config, dummy; rdmsr(MSR_MTRRcap, config, dummy); - return (config & (1 << 10)); + return config & (1 << 10); } int positive_have_wrcomb(void) @@ -716,14 +749,15 @@ int positive_have_wrcomb(void) return 1; } -/* generic structure... +/* + * Generic structure... */ struct mtrr_ops generic_mtrr_ops = { - .use_intel_if = 1, - .set_all = generic_set_all, - .get = generic_get_mtrr, - .get_free_region = generic_get_free_region, - .set = generic_set_mtrr, - .validate_add_page = generic_validate_add_page, - .have_wrcomb = generic_have_wrcomb, + .use_intel_if = 1, + .set_all = generic_set_all, + .get = generic_get_mtrr, + .get_free_region = generic_get_free_region, + .set = generic_set_mtrr, + .validate_add_page = generic_validate_add_page, + .have_wrcomb = generic_have_wrcomb, }; diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index fb73a52913a..3c1b12d461d 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -1,27 +1,28 @@ -#include <linux/init.h> -#include <linux/proc_fs.h> #include <linux/capability.h> -#include <linux/ctype.h> -#include <linux/module.h> #include <linux/seq_file.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> +#include <linux/proc_fs.h> +#include <linux/module.h> +#include <linux/ctype.h> +#include <linux/init.h> #define LINE_SIZE 80 #include <asm/mtrr.h> + #include "mtrr.h" #define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private) static const char *const mtrr_strings[MTRR_NUM_TYPES] = { - "uncachable", /* 0 */ - "write-combining", /* 1 */ - "?", /* 2 */ - "?", /* 3 */ - "write-through", /* 4 */ - "write-protect", /* 5 */ - "write-back", /* 6 */ + "uncachable", /* 0 */ + "write-combining", /* 1 */ + "?", /* 2 */ + "?", /* 3 */ + "write-through", /* 4 */ + "write-protect", /* 5 */ + "write-back", /* 6 */ }; const char *mtrr_attrib_to_str(int x) @@ -35,8 +36,8 @@ static int mtrr_file_add(unsigned long base, unsigned long size, unsigned int type, bool increment, struct file *file, int page) { + unsigned int *fcount = FILE_FCOUNT(file); int reg, max; - unsigned int *fcount = FILE_FCOUNT(file); max = num_var_ranges; if (fcount == NULL) { @@ -61,8 +62,8 @@ static int mtrr_file_del(unsigned long base, unsigned long size, struct file *file, int page) { - int reg; unsigned int *fcount = FILE_FCOUNT(file); + int reg; if (!page) { if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) @@ -81,34 +82,45 @@ mtrr_file_del(unsigned long base, unsigned long size, return reg; } -/* RED-PEN: seq_file can seek now. this is ignored. */ +/* + * seq_file can seek but we ignore it. + * + * Format of control line: + * "base=%Lx size=%Lx type=%s" or "disable=%d" + */ static ssize_t mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) -/* Format of control line: - "base=%Lx size=%Lx type=%s" OR: - "disable=%d" -*/ { int i, err; unsigned long reg; unsigned long long base, size; char *ptr; char line[LINE_SIZE]; + int length; size_t linelen; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!len) - return -EINVAL; + memset(line, 0, LINE_SIZE); - if (len > LINE_SIZE) - len = LINE_SIZE; - if (copy_from_user(line, buf, len - 1)) + + length = len; + length--; + + if (length > LINE_SIZE - 1) + length = LINE_SIZE - 1; + + if (length < 0) + return -EINVAL; + + if (copy_from_user(line, buf, length)) return -EFAULT; + linelen = strlen(line); ptr = line + linelen - 1; if (linelen && *ptr == '\n') *ptr = '\0'; + if (!strncmp(line, "disable=", 8)) { reg = simple_strtoul(line + 8, &ptr, 0); err = mtrr_del_page(reg, 0, 0); @@ -116,28 +128,35 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) return err; return len; } + if (strncmp(line, "base=", 5)) return -EINVAL; + base = simple_strtoull(line + 5, &ptr, 0); - for (; isspace(*ptr); ++ptr) ; + while (isspace(*ptr)) + ptr++; + if (strncmp(ptr, "size=", 5)) return -EINVAL; + size = simple_strtoull(ptr + 5, &ptr, 0); if ((base & 0xfff) || (size & 0xfff)) return -EINVAL; - for (; isspace(*ptr); ++ptr) ; + while (isspace(*ptr)) + ptr++; + if (strncmp(ptr, "type=", 5)) return -EINVAL; ptr += 5; - for (; isspace(*ptr); ++ptr) ; + while (isspace(*ptr)) + ptr++; + for (i = 0; i < MTRR_NUM_TYPES; ++i) { if (strcmp(ptr, mtrr_strings[i])) continue; base >>= PAGE_SHIFT; size >>= PAGE_SHIFT; - err = - mtrr_add_page((unsigned long) base, (unsigned long) size, i, - true); + err = mtrr_add_page((unsigned long)base, (unsigned long)size, i, true); if (err < 0) return err; return len; @@ -181,7 +200,9 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) case MTRRIOC32_SET_PAGE_ENTRY: case MTRRIOC32_DEL_PAGE_ENTRY: case MTRRIOC32_KILL_PAGE_ENTRY: { - struct mtrr_sentry32 __user *s32 = (struct mtrr_sentry32 __user *)__arg; + struct mtrr_sentry32 __user *s32; + + s32 = (struct mtrr_sentry32 __user *)__arg; err = get_user(sentry.base, &s32->base); err |= get_user(sentry.size, &s32->size); err |= get_user(sentry.type, &s32->type); @@ -191,7 +212,9 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) } case MTRRIOC32_GET_ENTRY: case MTRRIOC32_GET_PAGE_ENTRY: { - struct mtrr_gentry32 __user *g32 = (struct mtrr_gentry32 __user *)__arg; + struct mtrr_gentry32 __user *g32; + + g32 = (struct mtrr_gentry32 __user *)__arg; err = get_user(gentry.regnum, &g32->regnum); err |= get_user(gentry.base, &g32->base); err |= get_user(gentry.size, &g32->size); @@ -314,7 +337,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) if (err) return err; - switch(cmd) { + switch (cmd) { case MTRRIOC_GET_ENTRY: case MTRRIOC_GET_PAGE_ENTRY: if (copy_to_user(arg, &gentry, sizeof gentry)) @@ -323,7 +346,9 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) #ifdef CONFIG_COMPAT case MTRRIOC32_GET_ENTRY: case MTRRIOC32_GET_PAGE_ENTRY: { - struct mtrr_gentry32 __user *g32 = (struct mtrr_gentry32 __user *)__arg; + struct mtrr_gentry32 __user *g32; + + g32 = (struct mtrr_gentry32 __user *)__arg; err = put_user(gentry.base, &g32->base); err |= put_user(gentry.size, &g32->size); err |= put_user(gentry.regnum, &g32->regnum); @@ -335,11 +360,10 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) return err; } -static int -mtrr_close(struct inode *ino, struct file *file) +static int mtrr_close(struct inode *ino, struct file *file) { - int i, max; unsigned int *fcount = FILE_FCOUNT(file); + int i, max; if (fcount != NULL) { max = num_var_ranges; @@ -359,22 +383,22 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset); static int mtrr_open(struct inode *inode, struct file *file) { - if (!mtrr_if) + if (!mtrr_if) return -EIO; - if (!mtrr_if->get) - return -ENXIO; + if (!mtrr_if->get) + return -ENXIO; return single_open(file, mtrr_seq_show, NULL); } static const struct file_operations mtrr_fops = { - .owner = THIS_MODULE, - .open = mtrr_open, - .read = seq_read, - .llseek = seq_lseek, - .write = mtrr_write, - .unlocked_ioctl = mtrr_ioctl, - .compat_ioctl = mtrr_ioctl, - .release = mtrr_close, + .owner = THIS_MODULE, + .open = mtrr_open, + .read = seq_read, + .llseek = seq_lseek, + .write = mtrr_write, + .unlocked_ioctl = mtrr_ioctl, + .compat_ioctl = mtrr_ioctl, + .release = mtrr_close, }; static int mtrr_seq_show(struct seq_file *seq, void *offset) @@ -388,23 +412,24 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset) max = num_var_ranges; for (i = 0; i < max; i++) { mtrr_if->get(i, &base, &size, &type); - if (size == 0) + if (size == 0) { mtrr_usage_table[i] = 0; - else { - if (size < (0x100000 >> PAGE_SHIFT)) { - /* less than 1MB */ - factor = 'K'; - size <<= PAGE_SHIFT - 10; - } else { - factor = 'M'; - size >>= 20 - PAGE_SHIFT; - } - /* RED-PEN: base can be > 32bit */ - len += seq_printf(seq, - "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n", - i, base, base >> (20 - PAGE_SHIFT), size, factor, - mtrr_usage_table[i], mtrr_attrib_to_str(type)); + continue; } + if (size < (0x100000 >> PAGE_SHIFT)) { + /* less than 1MB */ + factor = 'K'; + size <<= PAGE_SHIFT - 10; + } else { + factor = 'M'; + size >>= 20 - PAGE_SHIFT; + } + /* Base can be > 32bit */ + len += seq_printf(seq, "reg%02i: base=0x%06lx000 " + "(%5luMB), size=%5lu%cB, count=%d: %s\n", + i, base, base >> (20 - PAGE_SHIFT), size, + factor, mtrr_usage_table[i], + mtrr_attrib_to_str(type)); } return 0; } @@ -422,6 +447,5 @@ static int __init mtrr_if_init(void) proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_fops); return 0; } - arch_initcall(mtrr_if_init); #endif /* CONFIG_PROC_FS */ diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 8fc248b5aea..84e83de5457 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -25,43 +25,49 @@ Operating System Writer's Guide" (Intel document number 242692), section 11.11.7 - This was cleaned and made readable by Patrick Mochel <mochel@osdl.org> - on 6-7 March 2002. - Source: Intel Architecture Software Developers Manual, Volume 3: + This was cleaned and made readable by Patrick Mochel <mochel@osdl.org> + on 6-7 March 2002. + Source: Intel Architecture Software Developers Manual, Volume 3: System Programming Guide; Section 9.11. (1997 edition - PPro). */ +#define DEBUG + +#include <linux/types.h> /* FIXME: kvm_para.h needs this */ + +#include <linux/kvm_para.h> +#include <linux/uaccess.h> #include <linux/module.h> +#include <linux/mutex.h> #include <linux/init.h> +#include <linux/sort.h> +#include <linux/cpu.h> #include <linux/pci.h> #include <linux/smp.h> -#include <linux/cpu.h> -#include <linux/mutex.h> -#include <linux/sort.h> +#include <asm/processor.h> #include <asm/e820.h> #include <asm/mtrr.h> -#include <asm/uaccess.h> -#include <asm/processor.h> #include <asm/msr.h> -#include <asm/kvm_para.h> + #include "mtrr.h" -u32 num_var_ranges = 0; +u32 num_var_ranges; unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; static DEFINE_MUTEX(mtrr_mutex); u64 size_or_mask, size_and_mask; +static bool mtrr_aps_delayed_init; -static struct mtrr_ops * mtrr_ops[X86_VENDOR_NUM] = {}; +static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; -struct mtrr_ops * mtrr_if = NULL; +struct mtrr_ops *mtrr_if; static void set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type); -void set_mtrr_ops(struct mtrr_ops * ops) +void set_mtrr_ops(struct mtrr_ops *ops) { if (ops->vendor && ops->vendor < X86_VENDOR_NUM) mtrr_ops[ops->vendor] = ops; @@ -72,30 +78,36 @@ static int have_wrcomb(void) { struct pci_dev *dev; u8 rev; - - if ((dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL)) != NULL) { - /* ServerWorks LE chipsets < rev 6 have problems with write-combining - Don't allow it and leave room for other chipsets to be tagged */ + + dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL); + if (dev != NULL) { + /* + * ServerWorks LE chipsets < rev 6 have problems with + * write-combining. Don't allow it and leave room for other + * chipsets to be tagged + */ if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS && dev->device == PCI_DEVICE_ID_SERVERWORKS_LE) { pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); if (rev <= 5) { - printk(KERN_INFO "mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n"); + pr_info("mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n"); pci_dev_put(dev); return 0; } } - /* Intel 450NX errata # 23. Non ascending cacheline evictions to - write combining memory may resulting in data corruption */ + /* + * Intel 450NX errata # 23. Non ascending cacheline evictions to + * write combining memory may resulting in data corruption + */ if (dev->vendor == PCI_VENDOR_ID_INTEL && dev->device == PCI_DEVICE_ID_INTEL_82451NX) { - printk(KERN_INFO "mtrr: Intel 450NX MMC detected. Write-combining disabled.\n"); + pr_info("mtrr: Intel 450NX MMC detected. Write-combining disabled.\n"); pci_dev_put(dev); return 0; } pci_dev_put(dev); - } - return (mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0); + } + return mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0; } /* This function returns the number of variable MTRRs */ @@ -103,12 +115,13 @@ static void __init set_num_var_ranges(void) { unsigned long config = 0, dummy; - if (use_intel()) { + if (use_intel()) rdmsr(MSR_MTRRcap, config, dummy); - } else if (is_cpu(AMD)) + else if (is_cpu(AMD)) config = 2; else if (is_cpu(CYRIX) || is_cpu(CENTAUR)) config = 8; + num_var_ranges = config & 0xff; } @@ -130,10 +143,12 @@ struct set_mtrr_data { mtrr_type smp_type; }; +/** + * ipi_handler - Synchronisation handler. Executed by "other" CPUs. + * + * Returns nothing. + */ static void ipi_handler(void *info) -/* [SUMMARY] Synchronisation handler. Executed by "other" CPUs. - [RETURNS] Nothing. -*/ { #ifdef CONFIG_SMP struct set_mtrr_data *data = info; @@ -142,18 +157,22 @@ static void ipi_handler(void *info) local_irq_save(flags); atomic_dec(&data->count); - while(!atomic_read(&data->gate)) + while (!atomic_read(&data->gate)) cpu_relax(); /* The master has cleared me to execute */ - if (data->smp_reg != ~0U) - mtrr_if->set(data->smp_reg, data->smp_base, + if (data->smp_reg != ~0U) { + mtrr_if->set(data->smp_reg, data->smp_base, data->smp_size, data->smp_type); - else + } else if (mtrr_aps_delayed_init) { + /* + * Initialize the MTRRs inaddition to the synchronisation. + */ mtrr_if->set_all(); + } atomic_dec(&data->count); - while(atomic_read(&data->gate)) + while (atomic_read(&data->gate)) cpu_relax(); atomic_dec(&data->count); @@ -161,7 +180,8 @@ static void ipi_handler(void *info) #endif } -static inline int types_compatible(mtrr_type type1, mtrr_type type2) { +static inline int types_compatible(mtrr_type type1, mtrr_type type2) +{ return type1 == MTRR_TYPE_UNCACHABLE || type2 == MTRR_TYPE_UNCACHABLE || (type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK) || @@ -176,10 +196,10 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2) { * @type: mtrr type * * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly: - * + * * 1. Send IPI to do the following: * 2. Disable Interrupts - * 3. Wait for all procs to do so + * 3. Wait for all procs to do so * 4. Enter no-fill cache mode * 5. Flush caches * 6. Clear PGE bit @@ -189,26 +209,27 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2) { * 10. Enable all range registers * 11. Flush all TLBs and caches again * 12. Enter normal cache mode and reenable caching - * 13. Set PGE + * 13. Set PGE * 14. Wait for buddies to catch up * 15. Enable interrupts. - * + * * What does that mean for us? Well, first we set data.count to the number * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait * until it hits 0 and proceed. We set the data.gate flag and reset data.count. - * Meanwhile, they are waiting for that flag to be set. Once it's set, each - * CPU goes through the transition of updating MTRRs. The CPU vendors may each do it - * differently, so we call mtrr_if->set() callback and let them take care of it. - * When they're done, they again decrement data->count and wait for data.gate to - * be reset. - * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag. + * Meanwhile, they are waiting for that flag to be set. Once it's set, each + * CPU goes through the transition of updating MTRRs. + * The CPU vendors may each do it differently, + * so we call mtrr_if->set() callback and let them take care of it. + * When they're done, they again decrement data->count and wait for data.gate + * to be reset. + * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag * Everyone then enables interrupts and we all continue on. * * Note that the mechanism is the same for UP systems, too; all the SMP stuff * becomes nops. */ -static void set_mtrr(unsigned int reg, unsigned long base, - unsigned long size, mtrr_type type) +static void +set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) { struct set_mtrr_data data; unsigned long flags; @@ -218,121 +239,124 @@ static void set_mtrr(unsigned int reg, unsigned long base, data.smp_size = size; data.smp_type = type; atomic_set(&data.count, num_booting_cpus() - 1); - /* make sure data.count is visible before unleashing other CPUs */ + + /* Make sure data.count is visible before unleashing other CPUs */ smp_wmb(); - atomic_set(&data.gate,0); + atomic_set(&data.gate, 0); - /* Start the ball rolling on other CPUs */ + /* Start the ball rolling on other CPUs */ if (smp_call_function(ipi_handler, &data, 0) != 0) panic("mtrr: timed out waiting for other CPUs\n"); local_irq_save(flags); - while(atomic_read(&data.count)) + while (atomic_read(&data.count)) cpu_relax(); - /* ok, reset count and toggle gate */ + /* Ok, reset count and toggle gate */ atomic_set(&data.count, num_booting_cpus() - 1); smp_wmb(); - atomic_set(&data.gate,1); + atomic_set(&data.gate, 1); - /* do our MTRR business */ + /* Do our MTRR business */ - /* HACK! + /* + * HACK! * We use this same function to initialize the mtrrs on boot. * The state of the boot cpu's mtrrs has been saved, and we want - * to replicate across all the APs. + * to replicate across all the APs. * If we're doing that @reg is set to something special... */ - if (reg != ~0U) - mtrr_if->set(reg,base,size,type); + if (reg != ~0U) + mtrr_if->set(reg, base, size, type); + else if (!mtrr_aps_delayed_init) + mtrr_if->set_all(); - /* wait for the others */ - while(atomic_read(&data.count)) + /* Wait for the others */ + while (atomic_read(&data.count)) cpu_relax(); atomic_set(&data.count, num_booting_cpus() - 1); smp_wmb(); - atomic_set(&data.gate,0); + atomic_set(&data.gate, 0); /* * Wait here for everyone to have seen the gate change * So we're the last ones to touch 'data' */ - while(atomic_read(&data.count)) + while (atomic_read(&data.count)) cpu_relax(); local_irq_restore(flags); } /** - * mtrr_add_page - Add a memory type region - * @base: Physical base address of region in pages (in units of 4 kB!) - * @size: Physical size of region in pages (4 kB) - * @type: Type of MTRR desired - * @increment: If this is true do usage counting on the region + * mtrr_add_page - Add a memory type region + * @base: Physical base address of region in pages (in units of 4 kB!) + * @size: Physical size of region in pages (4 kB) + * @type: Type of MTRR desired + * @increment: If this is true do usage counting on the region * - * Memory type region registers control the caching on newer Intel and - * non Intel processors. This function allows drivers to request an - * MTRR is added. The details and hardware specifics of each processor's - * implementation are hidden from the caller, but nevertheless the - * caller should expect to need to provide a power of two size on an - * equivalent power of two boundary. + * Memory type region registers control the caching on newer Intel and + * non Intel processors. This function allows drivers to request an + * MTRR is added. The details and hardware specifics of each processor's + * implementation are hidden from the caller, but nevertheless the + * caller should expect to need to provide a power of two size on an + * equivalent power of two boundary. * - * If the region cannot be added either because all regions are in use - * or the CPU cannot support it a negative value is returned. On success - * the register number for this entry is returned, but should be treated - * as a cookie only. + * If the region cannot be added either because all regions are in use + * or the CPU cannot support it a negative value is returned. On success + * the register number for this entry is returned, but should be treated + * as a cookie only. * - * On a multiprocessor machine the changes are made to all processors. - * This is required on x86 by the Intel processors. + * On a multiprocessor machine the changes are made to all processors. + * This is required on x86 by the Intel processors. * - * The available types are + * The available types are * - * %MTRR_TYPE_UNCACHABLE - No caching + * %MTRR_TYPE_UNCACHABLE - No caching * - * %MTRR_TYPE_WRBACK - Write data back in bursts whenever + * %MTRR_TYPE_WRBACK - Write data back in bursts whenever * - * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts + * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts * - * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes + * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes * - * BUGS: Needs a quiet flag for the cases where drivers do not mind - * failures and do not wish system log messages to be sent. + * BUGS: Needs a quiet flag for the cases where drivers do not mind + * failures and do not wish system log messages to be sent. */ - -int mtrr_add_page(unsigned long base, unsigned long size, +int mtrr_add_page(unsigned long base, unsigned long size, unsigned int type, bool increment) { + unsigned long lbase, lsize; int i, replace, error; mtrr_type ltype; - unsigned long lbase, lsize; if (!mtrr_if) return -ENXIO; - - if ((error = mtrr_if->validate_add_page(base,size,type))) + + error = mtrr_if->validate_add_page(base, size, type); + if (error) return error; if (type >= MTRR_NUM_TYPES) { - printk(KERN_WARNING "mtrr: type: %u invalid\n", type); + pr_warning("mtrr: type: %u invalid\n", type); return -EINVAL; } - /* If the type is WC, check that this processor supports it */ + /* If the type is WC, check that this processor supports it */ if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) { - printk(KERN_WARNING - "mtrr: your processor doesn't support write-combining\n"); + pr_warning("mtrr: your processor doesn't support write-combining\n"); return -ENOSYS; } if (!size) { - printk(KERN_WARNING "mtrr: zero sized request\n"); + pr_warning("mtrr: zero sized request\n"); return -EINVAL; } if (base & size_or_mask || size & size_or_mask) { - printk(KERN_WARNING "mtrr: base or size exceeds the MTRR width\n"); + pr_warning("mtrr: base or size exceeds the MTRR width\n"); return -EINVAL; } @@ -341,36 +365,40 @@ int mtrr_add_page(unsigned long base, unsigned long size, /* No CPU hotplug when we change MTRR entries */ get_online_cpus(); - /* Search for existing MTRR */ + + /* Search for existing MTRR */ mutex_lock(&mtrr_mutex); for (i = 0; i < num_var_ranges; ++i) { mtrr_if->get(i, &lbase, &lsize, <ype); - if (!lsize || base > lbase + lsize - 1 || base + size - 1 < lbase) + if (!lsize || base > lbase + lsize - 1 || + base + size - 1 < lbase) continue; - /* At this point we know there is some kind of overlap/enclosure */ + /* + * At this point we know there is some kind of + * overlap/enclosure + */ if (base < lbase || base + size - 1 > lbase + lsize - 1) { - if (base <= lbase && base + size - 1 >= lbase + lsize - 1) { + if (base <= lbase && + base + size - 1 >= lbase + lsize - 1) { /* New region encloses an existing region */ if (type == ltype) { replace = replace == -1 ? i : -2; continue; - } - else if (types_compatible(type, ltype)) + } else if (types_compatible(type, ltype)) continue; } - printk(KERN_WARNING - "mtrr: 0x%lx000,0x%lx000 overlaps existing" - " 0x%lx000,0x%lx000\n", base, size, lbase, - lsize); + pr_warning("mtrr: 0x%lx000,0x%lx000 overlaps existing" + " 0x%lx000,0x%lx000\n", base, size, lbase, + lsize); goto out; } - /* New region is enclosed by an existing region */ + /* New region is enclosed by an existing region */ if (ltype != type) { if (types_compatible(type, ltype)) continue; - printk (KERN_WARNING "mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n", - base, size, mtrr_attrib_to_str(ltype), - mtrr_attrib_to_str(type)); + pr_warning("mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n", + base, size, mtrr_attrib_to_str(ltype), + mtrr_attrib_to_str(type)); goto out; } if (increment) @@ -378,7 +406,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, error = i; goto out; } - /* Search for an empty MTRR */ + /* Search for an empty MTRR */ i = mtrr_if->get_free_region(base, size, replace); if (i >= 0) { set_mtrr(i, base, size, type); @@ -393,8 +421,9 @@ int mtrr_add_page(unsigned long base, unsigned long size, mtrr_usage_table[replace] = 0; } } - } else - printk(KERN_INFO "mtrr: no more MTRRs available\n"); + } else { + pr_info("mtrr: no more MTRRs available\n"); + } error = i; out: mutex_unlock(&mtrr_mutex); @@ -405,10 +434,8 @@ int mtrr_add_page(unsigned long base, unsigned long size, static int mtrr_check(unsigned long base, unsigned long size) { if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { - printk(KERN_WARNING - "mtrr: size and base must be multiples of 4 kiB\n"); - printk(KERN_DEBUG - "mtrr: size: 0x%lx base: 0x%lx\n", size, base); + pr_warning("mtrr: size and base must be multiples of 4 kiB\n"); + pr_debug("mtrr: size: 0x%lx base: 0x%lx\n", size, base); dump_stack(); return -1; } @@ -416,66 +443,64 @@ static int mtrr_check(unsigned long base, unsigned long size) } /** - * mtrr_add - Add a memory type region - * @base: Physical base address of region - * @size: Physical size of region - * @type: Type of MTRR desired - * @increment: If this is true do usage counting on the region + * mtrr_add - Add a memory type region + * @base: Physical base address of region + * @size: Physical size of region + * @type: Type of MTRR desired + * @increment: If this is true do usage counting on the region * - * Memory type region registers control the caching on newer Intel and - * non Intel processors. This function allows drivers to request an - * MTRR is added. The details and hardware specifics of each processor's - * implementation are hidden from the caller, but nevertheless the - * caller should expect to need to provide a power of two size on an - * equivalent power of two boundary. + * Memory type region registers control the caching on newer Intel and + * non Intel processors. This function allows drivers to request an + * MTRR is added. The details and hardware specifics of each processor's + * implementation are hidden from the caller, but nevertheless the + * caller should expect to need to provide a power of two size on an + * equivalent power of two boundary. * - * If the region cannot be added either because all regions are in use - * or the CPU cannot support it a negative value is returned. On success - * the register number for this entry is returned, but should be treated - * as a cookie only. + * If the region cannot be added either because all regions are in use + * or the CPU cannot support it a negative value is returned. On success + * the register number for this entry is returned, but should be treated + * as a cookie only. * - * On a multiprocessor machine the changes are made to all processors. - * This is required on x86 by the Intel processors. + * On a multiprocessor machine the changes are made to all processors. + * This is required on x86 by the Intel processors. * - * The available types are + * The available types are * - * %MTRR_TYPE_UNCACHABLE - No caching + * %MTRR_TYPE_UNCACHABLE - No caching * - * %MTRR_TYPE_WRBACK - Write data back in bursts whenever + * %MTRR_TYPE_WRBACK - Write data back in bursts whenever * - * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts + * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts * - * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes + * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes * - * BUGS: Needs a quiet flag for the cases where drivers do not mind - * failures and do not wish system log messages to be sent. + * BUGS: Needs a quiet flag for the cases where drivers do not mind + * failures and do not wish system log messages to be sent. */ - -int -mtrr_add(unsigned long base, unsigned long size, unsigned int type, - bool increment) +int mtrr_add(unsigned long base, unsigned long size, unsigned int type, + bool increment) { if (mtrr_check(base, size)) return -EINVAL; return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type, increment); } +EXPORT_SYMBOL(mtrr_add); /** - * mtrr_del_page - delete a memory type region - * @reg: Register returned by mtrr_add - * @base: Physical base address - * @size: Size of region + * mtrr_del_page - delete a memory type region + * @reg: Register returned by mtrr_add + * @base: Physical base address + * @size: Size of region * - * If register is supplied then base and size are ignored. This is - * how drivers should call it. + * If register is supplied then base and size are ignored. This is + * how drivers should call it. * - * Releases an MTRR region. If the usage count drops to zero the - * register is freed and the region returns to default state. - * On success the register is returned, on failure a negative error - * code. + * Releases an MTRR region. If the usage count drops to zero the + * register is freed and the region returns to default state. + * On success the register is returned, on failure a negative error + * code. */ - int mtrr_del_page(int reg, unsigned long base, unsigned long size) { int i, max; @@ -500,22 +525,22 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) } } if (reg < 0) { - printk(KERN_DEBUG "mtrr: no MTRR for %lx000,%lx000 found\n", base, - size); + pr_debug("mtrr: no MTRR for %lx000,%lx000 found\n", + base, size); goto out; } } if (reg >= max) { - printk(KERN_WARNING "mtrr: register: %d too big\n", reg); + pr_warning("mtrr: register: %d too big\n", reg); goto out; } mtrr_if->get(reg, &lbase, &lsize, <ype); if (lsize < 1) { - printk(KERN_WARNING "mtrr: MTRR %d not used\n", reg); + pr_warning("mtrr: MTRR %d not used\n", reg); goto out; } if (mtrr_usage_table[reg] < 1) { - printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg); + pr_warning("mtrr: reg: %d has count=0\n", reg); goto out; } if (--mtrr_usage_table[reg] < 1) @@ -526,33 +551,31 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) put_online_cpus(); return error; } + /** - * mtrr_del - delete a memory type region - * @reg: Register returned by mtrr_add - * @base: Physical base address - * @size: Size of region + * mtrr_del - delete a memory type region + * @reg: Register returned by mtrr_add + * @base: Physical base address + * @size: Size of region * - * If register is supplied then base and size are ignored. This is - * how drivers should call it. + * If register is supplied then base and size are ignored. This is + * how drivers should call it. * - * Releases an MTRR region. If the usage count drops to zero the - * register is freed and the region returns to default state. - * On success the register is returned, on failure a negative error - * code. + * Releases an MTRR region. If the usage count drops to zero the + * register is freed and the region returns to default state. + * On success the register is returned, on failure a negative error + * code. */ - -int -mtrr_del(int reg, unsigned long base, unsigned long size) +int mtrr_del(int reg, unsigned long base, unsigned long size) { if (mtrr_check(base, size)) return -EINVAL; return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT); } - -EXPORT_SYMBOL(mtrr_add); EXPORT_SYMBOL(mtrr_del); -/* HACK ALERT! +/* + * HACK ALERT! * These should be called implicitly, but we can't yet until all the initcall * stuff is done... */ @@ -576,29 +599,28 @@ struct mtrr_value { static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES]; -static int mtrr_save(struct sys_device * sysdev, pm_message_t state) +static int mtrr_save(struct sys_device *sysdev, pm_message_t state) { int i; for (i = 0; i < num_var_ranges; i++) { - mtrr_if->get(i, - &mtrr_value[i].lbase, - &mtrr_value[i].lsize, - &mtrr_value[i].ltype); + mtrr_if->get(i, &mtrr_value[i].lbase, + &mtrr_value[i].lsize, + &mtrr_value[i].ltype); } return 0; } -static int mtrr_restore(struct sys_device * sysdev) +static int mtrr_restore(struct sys_device *sysdev) { int i; for (i = 0; i < num_var_ranges; i++) { - if (mtrr_value[i].lsize) - set_mtrr(i, - mtrr_value[i].lbase, - mtrr_value[i].lsize, - mtrr_value[i].ltype); + if (mtrr_value[i].lsize) { + set_mtrr(i, mtrr_value[i].lbase, + mtrr_value[i].lsize, + mtrr_value[i].ltype); + } } return 0; } @@ -615,26 +637,29 @@ int __initdata changed_by_mtrr_cleanup; /** * mtrr_bp_init - initialize mtrrs on the boot CPU * - * This needs to be called early; before any of the other CPUs are + * This needs to be called early; before any of the other CPUs are * initialized (i.e. before smp_init()). - * + * */ void __init mtrr_bp_init(void) { u32 phys_addr; + init_ifs(); phys_addr = 32; if (cpu_has_mtrr) { mtrr_if = &generic_mtrr_ops; - size_or_mask = 0xff000000; /* 36 bits */ + size_or_mask = 0xff000000; /* 36 bits */ size_and_mask = 0x00f00000; phys_addr = 36; - /* This is an AMD specific MSR, but we assume(hope?) that - Intel will implement it to when they extend the address - bus of the Xeon. */ + /* + * This is an AMD specific MSR, but we assume(hope?) that + * Intel will implement it to when they extend the address + * bus of the Xeon. + */ if (cpuid_eax(0x80000000) >= 0x80000008) { phys_addr = cpuid_eax(0x80000008) & 0xff; /* CPUID workaround for Intel 0F33/0F34 CPU */ @@ -649,9 +674,11 @@ void __init mtrr_bp_init(void) size_and_mask = ~size_or_mask & 0xfffff00000ULL; } else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR && boot_cpu_data.x86 == 6) { - /* VIA C* family have Intel style MTRRs, but - don't support PAE */ - size_or_mask = 0xfff00000; /* 32 bits */ + /* + * VIA C* family have Intel style MTRRs, + * but don't support PAE + */ + size_or_mask = 0xfff00000; /* 32 bits */ size_and_mask = 0; phys_addr = 32; } @@ -694,30 +721,28 @@ void __init mtrr_bp_init(void) changed_by_mtrr_cleanup = 1; mtrr_if->set_all(); } - } } } void mtrr_ap_init(void) { - unsigned long flags; - - if (!mtrr_if || !use_intel()) + if (!use_intel() || mtrr_aps_delayed_init) return; /* - * Ideally we should hold mtrr_mutex here to avoid mtrr entries changed, - * but this routine will be called in cpu boot time, holding the lock - * breaks it. This routine is called in two cases: 1.very earily time - * of software resume, when there absolutely isn't mtrr entry changes; - * 2.cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug lock to - * prevent mtrr entry changes + * Ideally we should hold mtrr_mutex here to avoid mtrr entries + * changed, but this routine will be called in cpu boot time, + * holding the lock breaks it. + * + * This routine is called in two cases: + * + * 1. very earily time of software resume, when there absolutely + * isn't mtrr entry changes; + * + * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug + * lock to prevent mtrr entry changes */ - local_irq_save(flags); - - mtrr_if->set_all(); - - local_irq_restore(flags); + set_mtrr(~0U, 0, 0, 0); } /** @@ -728,23 +753,55 @@ void mtrr_save_state(void) smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1); } +void set_mtrr_aps_delayed_init(void) +{ + if (!use_intel()) + return; + + mtrr_aps_delayed_init = true; +} + +/* + * MTRR initialization for all AP's + */ +void mtrr_aps_init(void) +{ + if (!use_intel()) + return; + + set_mtrr(~0U, 0, 0, 0); + mtrr_aps_delayed_init = false; +} + +void mtrr_bp_restore(void) +{ + if (!use_intel()) + return; + + mtrr_if->set_all(); +} + static int __init mtrr_init_finialize(void) { if (!mtrr_if) return 0; + if (use_intel()) { if (!changed_by_mtrr_cleanup) mtrr_state_warn(); - } else { - /* The CPUs haven't MTRR and seem to not support SMP. They have - * specific drivers, we use a tricky method to support - * suspend/resume for them. - * TBD: is there any system with such CPU which supports - * suspend/resume? if no, we should remove the code. - */ - sysdev_driver_register(&cpu_sysdev_class, - &mtrr_sysdev_driver); + return 0; } + + /* + * The CPU has no MTRR and seems to not support SMP. They have + * specific drivers, we use a tricky method to support + * suspend/resume for them. + * + * TBD: is there any system with such CPU which supports + * suspend/resume? If no, we should remove the code. + */ + sysdev_driver_register(&cpu_sysdev_class, &mtrr_sysdev_driver); + return 0; } subsys_initcall(mtrr_init_finialize); diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 7538b767f20..a501dee9a87 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -1,5 +1,5 @@ /* - * local mtrr defines. + * local MTRR defines. */ #include <linux/types.h> @@ -14,13 +14,12 @@ extern unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; struct mtrr_ops { u32 vendor; u32 use_intel_if; -// void (*init)(void); void (*set)(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type); void (*set_all)(void); void (*get)(unsigned int reg, unsigned long *base, - unsigned long *size, mtrr_type * type); + unsigned long *size, mtrr_type *type); int (*get_free_region)(unsigned long base, unsigned long size, int replace_reg); int (*validate_add_page)(unsigned long base, unsigned long size, @@ -39,11 +38,11 @@ extern int positive_have_wrcomb(void); /* library functions for processor-specific routines */ struct set_mtrr_context { - unsigned long flags; - unsigned long cr4val; - u32 deftype_lo; - u32 deftype_hi; - u32 ccr3; + unsigned long flags; + unsigned long cr4val; + u32 deftype_lo; + u32 deftype_hi; + u32 ccr3; }; void set_mtrr_done(struct set_mtrr_context *ctxt); @@ -54,10 +53,10 @@ void fill_mtrr_var_range(unsigned int index, u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); void get_mtrr_state(void); -extern void set_mtrr_ops(struct mtrr_ops * ops); +extern void set_mtrr_ops(struct mtrr_ops *ops); extern u64 size_or_mask, size_and_mask; -extern struct mtrr_ops * mtrr_if; +extern struct mtrr_ops *mtrr_if; #define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd) #define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1) diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c index 1f5fb1588d1..dfc80b4e6b0 100644 --- a/arch/x86/kernel/cpu/mtrr/state.c +++ b/arch/x86/kernel/cpu/mtrr/state.c @@ -1,24 +1,25 @@ -#include <linux/mm.h> #include <linux/init.h> -#include <asm/io.h> -#include <asm/mtrr.h> -#include <asm/msr.h> +#include <linux/io.h> +#include <linux/mm.h> + #include <asm/processor-cyrix.h> #include <asm/processor-flags.h> -#include "mtrr.h" +#include <asm/mtrr.h> +#include <asm/msr.h> +#include "mtrr.h" -/* Put the processor into a state where MTRRs can be safely set */ +/* Put the processor into a state where MTRRs can be safely set */ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt) { unsigned int cr0; - /* Disable interrupts locally */ + /* Disable interrupts locally */ local_irq_save(ctxt->flags); if (use_intel() || is_cpu(CYRIX)) { - /* Save value of CR4 and clear Page Global Enable (bit 7) */ + /* Save value of CR4 and clear Page Global Enable (bit 7) */ if (cpu_has_pge) { ctxt->cr4val = read_cr4(); write_cr4(ctxt->cr4val & ~X86_CR4_PGE); @@ -33,50 +34,61 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt) write_cr0(cr0); wbinvd(); - if (use_intel()) - /* Save MTRR state */ + if (use_intel()) { + /* Save MTRR state */ rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi); - else - /* Cyrix ARRs - everything else were excluded at the top */ + } else { + /* + * Cyrix ARRs - + * everything else were excluded at the top + */ ctxt->ccr3 = getCx86(CX86_CCR3); + } } } void set_mtrr_cache_disable(struct set_mtrr_context *ctxt) { - if (use_intel()) - /* Disable MTRRs, and set the default type to uncached */ + if (use_intel()) { + /* Disable MTRRs, and set the default type to uncached */ mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL, ctxt->deftype_hi); - else if (is_cpu(CYRIX)) - /* Cyrix ARRs - everything else were excluded at the top */ - setCx86(CX86_CCR3, (ctxt->ccr3 & 0x0f) | 0x10); + } else { + if (is_cpu(CYRIX)) { + /* Cyrix ARRs - everything else were excluded at the top */ + setCx86(CX86_CCR3, (ctxt->ccr3 & 0x0f) | 0x10); + } + } } -/* Restore the processor after a set_mtrr_prepare */ +/* Restore the processor after a set_mtrr_prepare */ void set_mtrr_done(struct set_mtrr_context *ctxt) { if (use_intel() || is_cpu(CYRIX)) { - /* Flush caches and TLBs */ + /* Flush caches and TLBs */ wbinvd(); - /* Restore MTRRdefType */ - if (use_intel()) + /* Restore MTRRdefType */ + if (use_intel()) { /* Intel (P6) standard MTRRs */ - mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi); - else - /* Cyrix ARRs - everything else was excluded at the top */ + mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, + ctxt->deftype_hi); + } else { + /* + * Cyrix ARRs - + * everything else was excluded at the top + */ setCx86(CX86_CCR3, ctxt->ccr3); + } - /* Enable caches */ + /* Enable caches */ write_cr0(read_cr0() & 0xbfffffff); - /* Restore value of CR4 */ + /* Restore value of CR4 */ if (cpu_has_pge) write_cr4(ctxt->cr4val); } - /* Re-enable interrupts locally (if enabled previously) */ + /* Re-enable interrupts locally (if enabled previously) */ local_irq_restore(ctxt->flags); } - diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_event.c index a7aa8f90095..b5801c31184 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1,16 +1,17 @@ /* - * Performance counter x86 architecture code + * Performance events x86 architecture code * * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com> * * For licencing details see kernel-base/COPYING */ -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <linux/capability.h> #include <linux/notifier.h> #include <linux/hardirq.h> @@ -20,19 +21,60 @@ #include <linux/sched.h> #include <linux/uaccess.h> #include <linux/highmem.h> +#include <linux/cpu.h> #include <asm/apic.h> #include <asm/stacktrace.h> #include <asm/nmi.h> -static u64 perf_counter_mask __read_mostly; +static u64 perf_event_mask __read_mostly; -struct cpu_hw_counters { - struct perf_counter *counters[X86_PMC_IDX_MAX]; +/* The maximal number of PEBS events: */ +#define MAX_PEBS_EVENTS 4 + +/* The size of a BTS record in bytes: */ +#define BTS_RECORD_SIZE 24 + +/* The size of a per-cpu BTS buffer in bytes: */ +#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 2048) + +/* The BTS overflow threshold in bytes from the end of the buffer: */ +#define BTS_OVFL_TH (BTS_RECORD_SIZE * 128) + + +/* + * Bits in the debugctlmsr controlling branch tracing. + */ +#define X86_DEBUGCTL_TR (1 << 6) +#define X86_DEBUGCTL_BTS (1 << 7) +#define X86_DEBUGCTL_BTINT (1 << 8) +#define X86_DEBUGCTL_BTS_OFF_OS (1 << 9) +#define X86_DEBUGCTL_BTS_OFF_USR (1 << 10) + +/* + * A debug store configuration. + * + * We only support architectures that use 64bit fields. + */ +struct debug_store { + u64 bts_buffer_base; + u64 bts_index; + u64 bts_absolute_maximum; + u64 bts_interrupt_threshold; + u64 pebs_buffer_base; + u64 pebs_index; + u64 pebs_absolute_maximum; + u64 pebs_interrupt_threshold; + u64 pebs_event_reset[MAX_PEBS_EVENTS]; +}; + +struct cpu_hw_events { + struct perf_event *events[X86_PMC_IDX_MAX]; unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long interrupts; int enabled; + struct debug_store *ds; }; /* @@ -44,24 +86,27 @@ struct x86_pmu { int (*handle_irq)(struct pt_regs *); void (*disable_all)(void); void (*enable_all)(void); - void (*enable)(struct hw_perf_counter *, int); - void (*disable)(struct hw_perf_counter *, int); + void (*enable)(struct hw_perf_event *, int); + void (*disable)(struct hw_perf_event *, int); unsigned eventsel; unsigned perfctr; u64 (*event_map)(int); u64 (*raw_event)(u64); int max_events; - int num_counters; - int num_counters_fixed; - int counter_bits; - u64 counter_mask; + int num_events; + int num_events_fixed; + int event_bits; + u64 event_mask; + int apic; u64 max_period; u64 intel_ctrl; + void (*enable_bts)(u64 config); + void (*disable_bts)(void); }; static struct x86_pmu x86_pmu __read_mostly; -static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { +static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, }; @@ -72,42 +117,42 @@ static const u64 p6_perfmon_event_map[] = { [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0000, - [PERF_COUNT_HW_CACHE_MISSES] = 0x0000, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, + [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, }; -static u64 p6_pmu_event_map(int event) +static u64 p6_pmu_event_map(int hw_event) { - return p6_perfmon_event_map[event]; + return p6_perfmon_event_map[hw_event]; } /* - * Counter setting that is specified not to count anything. + * Event setting that is specified not to count anything. * We use this to effectively disable a counter. * * L2_RQSTS with 0 MESI unit mask. */ -#define P6_NOP_COUNTER 0x0000002EULL +#define P6_NOP_EVENT 0x0000002EULL -static u64 p6_pmu_raw_event(u64 event) +static u64 p6_pmu_raw_event(u64 hw_event) { #define P6_EVNTSEL_EVENT_MASK 0x000000FFULL #define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL #define P6_EVNTSEL_EDGE_MASK 0x00040000ULL #define P6_EVNTSEL_INV_MASK 0x00800000ULL -#define P6_EVNTSEL_COUNTER_MASK 0xFF000000ULL +#define P6_EVNTSEL_REG_MASK 0xFF000000ULL #define P6_EVNTSEL_MASK \ (P6_EVNTSEL_EVENT_MASK | \ P6_EVNTSEL_UNIT_MASK | \ P6_EVNTSEL_EDGE_MASK | \ P6_EVNTSEL_INV_MASK | \ - P6_EVNTSEL_COUNTER_MASK) + P6_EVNTSEL_REG_MASK) - return event & P6_EVNTSEL_MASK; + return hw_event & P6_EVNTSEL_MASK; } @@ -125,16 +170,16 @@ static const u64 intel_perfmon_event_map[] = [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, }; -static u64 intel_pmu_event_map(int event) +static u64 intel_pmu_event_map(int hw_event) { - return intel_perfmon_event_map[event]; + return intel_perfmon_event_map[hw_event]; } /* - * Generalized hw caching related event table, filled + * Generalized hw caching related hw_event table, filled * in on a per model basis. A value of 0 means - * 'not supported', -1 means 'event makes no sense on - * this CPU', any other value means the raw event + * 'not supported', -1 means 'hw_event makes no sense on + * this CPU', any other value means the raw hw_event * ID. */ @@ -418,22 +463,22 @@ static const u64 atom_hw_cache_event_ids }, }; -static u64 intel_pmu_raw_event(u64 event) +static u64 intel_pmu_raw_event(u64 hw_event) { #define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL #define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL #define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL #define CORE_EVNTSEL_INV_MASK 0x00800000ULL -#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL +#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL #define CORE_EVNTSEL_MASK \ (CORE_EVNTSEL_EVENT_MASK | \ CORE_EVNTSEL_UNIT_MASK | \ CORE_EVNTSEL_EDGE_MASK | \ CORE_EVNTSEL_INV_MASK | \ - CORE_EVNTSEL_COUNTER_MASK) + CORE_EVNTSEL_REG_MASK) - return event & CORE_EVNTSEL_MASK; + return hw_event & CORE_EVNTSEL_MASK; } static const u64 amd_hw_cache_event_ids @@ -540,52 +585,55 @@ static const u64 amd_perfmon_event_map[] = [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, }; -static u64 amd_pmu_event_map(int event) +static u64 amd_pmu_event_map(int hw_event) { - return amd_perfmon_event_map[event]; + return amd_perfmon_event_map[hw_event]; } -static u64 amd_pmu_raw_event(u64 event) +static u64 amd_pmu_raw_event(u64 hw_event) { #define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL #define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL #define K7_EVNTSEL_EDGE_MASK 0x000040000ULL #define K7_EVNTSEL_INV_MASK 0x000800000ULL -#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL +#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL #define K7_EVNTSEL_MASK \ (K7_EVNTSEL_EVENT_MASK | \ K7_EVNTSEL_UNIT_MASK | \ K7_EVNTSEL_EDGE_MASK | \ K7_EVNTSEL_INV_MASK | \ - K7_EVNTSEL_COUNTER_MASK) + K7_EVNTSEL_REG_MASK) - return event & K7_EVNTSEL_MASK; + return hw_event & K7_EVNTSEL_MASK; } /* - * Propagate counter elapsed time into the generic counter. - * Can only be executed on the CPU where the counter is active. + * Propagate event elapsed time into the generic event. + * Can only be executed on the CPU where the event is active. * Returns the delta events processed. */ static u64 -x86_perf_counter_update(struct perf_counter *counter, - struct hw_perf_counter *hwc, int idx) +x86_perf_event_update(struct perf_event *event, + struct hw_perf_event *hwc, int idx) { - int shift = 64 - x86_pmu.counter_bits; + int shift = 64 - x86_pmu.event_bits; u64 prev_raw_count, new_raw_count; s64 delta; + if (idx == X86_PMC_IDX_FIXED_BTS) + return 0; + /* - * Careful: an NMI might modify the previous counter value. + * Careful: an NMI might modify the previous event value. * * Our tactic to handle this is to first atomically read and * exchange a new raw count - then add that new-prev delta - * count to the generic counter atomically: + * count to the generic event atomically: */ again: prev_raw_count = atomic64_read(&hwc->prev_count); - rdmsrl(hwc->counter_base + idx, new_raw_count); + rdmsrl(hwc->event_base + idx, new_raw_count); if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, new_raw_count) != prev_raw_count) @@ -594,7 +642,7 @@ again: /* * Now we have the new raw value and have updated the prev * timestamp already. We can now calculate the elapsed delta - * (counter-)time and add that to the generic counter. + * (event-)time and add that to the generic event. * * Careful, not all hw sign-extends above the physical width * of the count. @@ -602,39 +650,42 @@ again: delta = (new_raw_count << shift) - (prev_raw_count << shift); delta >>= shift; - atomic64_add(delta, &counter->count); + atomic64_add(delta, &event->count); atomic64_sub(delta, &hwc->period_left); return new_raw_count; } -static atomic_t active_counters; +static atomic_t active_events; static DEFINE_MUTEX(pmc_reserve_mutex); static bool reserve_pmc_hardware(void) { +#ifdef CONFIG_X86_LOCAL_APIC int i; if (nmi_watchdog == NMI_LOCAL_APIC) disable_lapic_nmi_watchdog(); - for (i = 0; i < x86_pmu.num_counters; i++) { + for (i = 0; i < x86_pmu.num_events; i++) { if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) goto perfctr_fail; } - for (i = 0; i < x86_pmu.num_counters; i++) { + for (i = 0; i < x86_pmu.num_events; i++) { if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) goto eventsel_fail; } +#endif return true; +#ifdef CONFIG_X86_LOCAL_APIC eventsel_fail: for (i--; i >= 0; i--) release_evntsel_nmi(x86_pmu.eventsel + i); - i = x86_pmu.num_counters; + i = x86_pmu.num_events; perfctr_fail: for (i--; i >= 0; i--) @@ -644,25 +695,128 @@ perfctr_fail: enable_lapic_nmi_watchdog(); return false; +#endif } static void release_pmc_hardware(void) { +#ifdef CONFIG_X86_LOCAL_APIC int i; - for (i = 0; i < x86_pmu.num_counters; i++) { + for (i = 0; i < x86_pmu.num_events; i++) { release_perfctr_nmi(x86_pmu.perfctr + i); release_evntsel_nmi(x86_pmu.eventsel + i); } if (nmi_watchdog == NMI_LOCAL_APIC) enable_lapic_nmi_watchdog(); +#endif +} + +static inline bool bts_available(void) +{ + return x86_pmu.enable_bts != NULL; +} + +static inline void init_debug_store_on_cpu(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + + if (!ds) + return; + + wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, + (u32)((u64)(unsigned long)ds), + (u32)((u64)(unsigned long)ds >> 32)); +} + +static inline void fini_debug_store_on_cpu(int cpu) +{ + if (!per_cpu(cpu_hw_events, cpu).ds) + return; + + wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); +} + +static void release_bts_hardware(void) +{ + int cpu; + + if (!bts_available()) + return; + + get_online_cpus(); + + for_each_online_cpu(cpu) + fini_debug_store_on_cpu(cpu); + + for_each_possible_cpu(cpu) { + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + + if (!ds) + continue; + + per_cpu(cpu_hw_events, cpu).ds = NULL; + + kfree((void *)(unsigned long)ds->bts_buffer_base); + kfree(ds); + } + + put_online_cpus(); +} + +static int reserve_bts_hardware(void) +{ + int cpu, err = 0; + + if (!bts_available()) + return 0; + + get_online_cpus(); + + for_each_possible_cpu(cpu) { + struct debug_store *ds; + void *buffer; + + err = -ENOMEM; + buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL); + if (unlikely(!buffer)) + break; + + ds = kzalloc(sizeof(*ds), GFP_KERNEL); + if (unlikely(!ds)) { + kfree(buffer); + break; + } + + ds->bts_buffer_base = (u64)(unsigned long)buffer; + ds->bts_index = ds->bts_buffer_base; + ds->bts_absolute_maximum = + ds->bts_buffer_base + BTS_BUFFER_SIZE; + ds->bts_interrupt_threshold = + ds->bts_absolute_maximum - BTS_OVFL_TH; + + per_cpu(cpu_hw_events, cpu).ds = ds; + err = 0; + } + + if (err) + release_bts_hardware(); + else { + for_each_online_cpu(cpu) + init_debug_store_on_cpu(cpu); + } + + put_online_cpus(); + + return err; } -static void hw_perf_counter_destroy(struct perf_counter *counter) +static void hw_perf_event_destroy(struct perf_event *event) { - if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) { + if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) { release_pmc_hardware(); + release_bts_hardware(); mutex_unlock(&pmc_reserve_mutex); } } @@ -673,7 +827,7 @@ static inline int x86_pmu_initialized(void) } static inline int -set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr) +set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr) { unsigned int cache_type, cache_op, cache_result; u64 config, val; @@ -705,13 +859,49 @@ set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr) return 0; } +static void intel_pmu_enable_bts(u64 config) +{ + unsigned long debugctlmsr; + + debugctlmsr = get_debugctlmsr(); + + debugctlmsr |= X86_DEBUGCTL_TR; + debugctlmsr |= X86_DEBUGCTL_BTS; + debugctlmsr |= X86_DEBUGCTL_BTINT; + + if (!(config & ARCH_PERFMON_EVENTSEL_OS)) + debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS; + + if (!(config & ARCH_PERFMON_EVENTSEL_USR)) + debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR; + + update_debugctlmsr(debugctlmsr); +} + +static void intel_pmu_disable_bts(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + unsigned long debugctlmsr; + + if (!cpuc->ds) + return; + + debugctlmsr = get_debugctlmsr(); + + debugctlmsr &= + ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT | + X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR); + + update_debugctlmsr(debugctlmsr); +} + /* * Setup the hardware configuration for a given attr_type */ -static int __hw_perf_counter_init(struct perf_counter *counter) +static int __hw_perf_event_init(struct perf_event *event) { - struct perf_counter_attr *attr = &counter->attr; - struct hw_perf_counter *hwc = &counter->hw; + struct perf_event_attr *attr = &event->attr; + struct hw_perf_event *hwc = &event->hw; u64 config; int err; @@ -719,17 +909,23 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return -ENODEV; err = 0; - if (!atomic_inc_not_zero(&active_counters)) { + if (!atomic_inc_not_zero(&active_events)) { mutex_lock(&pmc_reserve_mutex); - if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware()) - err = -EBUSY; - else - atomic_inc(&active_counters); + if (atomic_read(&active_events) == 0) { + if (!reserve_pmc_hardware()) + err = -EBUSY; + else + err = reserve_bts_hardware(); + } + if (!err) + atomic_inc(&active_events); mutex_unlock(&pmc_reserve_mutex); } if (err) return err; + event->destroy = hw_perf_event_destroy; + /* * Generate PMC IRQs: * (keep 'enabled' bit clear for now) @@ -748,12 +944,19 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->sample_period = x86_pmu.max_period; hwc->last_period = hwc->sample_period; atomic64_set(&hwc->period_left, hwc->sample_period); + } else { + /* + * If we have a PMU initialized but no APIC + * interrupts, we cannot sample hardware + * events (user-space has to fall back and + * sample via a hrtimer based software event): + */ + if (!x86_pmu.apic) + return -EOPNOTSUPP; } - counter->destroy = hw_perf_counter_destroy; - /* - * Raw event type provide the config in the event structure + * Raw hw_event type provide the config in the hw_event structure */ if (attr->type == PERF_TYPE_RAW) { hwc->config |= x86_pmu.raw_event(attr->config); @@ -777,6 +980,20 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (config == -1LL) return -EINVAL; + /* + * Branch tracing: + */ + if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) && + (hwc->sample_period == 1)) { + /* BTS is not supported by this architecture. */ + if (!bts_available()) + return -EOPNOTSUPP; + + /* BTS is currently only allowed for user-mode. */ + if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) + return -EOPNOTSUPP; + } + hwc->config |= config; return 0; @@ -784,7 +1001,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter) static void p6_pmu_disable_all(void) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); u64 val; if (!cpuc->enabled) @@ -801,12 +1018,23 @@ static void p6_pmu_disable_all(void) static void intel_pmu_disable_all(void) { + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + if (!cpuc->enabled) + return; + + cpuc->enabled = 0; + barrier(); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); + + if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) + intel_pmu_disable_bts(); } static void amd_pmu_disable_all(void) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx; if (!cpuc->enabled) @@ -815,12 +1043,12 @@ static void amd_pmu_disable_all(void) cpuc->enabled = 0; /* * ensure we write the disable before we start disabling the - * counters proper, so that amd_pmu_enable_counter() does the + * events proper, so that amd_pmu_enable_event() does the * right thing. */ barrier(); - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for (idx = 0; idx < x86_pmu.num_events; idx++) { u64 val; if (!test_bit(idx, cpuc->active_mask)) @@ -842,7 +1070,7 @@ void hw_perf_disable(void) static void p6_pmu_enable_all(void) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); unsigned long val; if (cpuc->enabled) @@ -859,12 +1087,30 @@ static void p6_pmu_enable_all(void) static void intel_pmu_enable_all(void) { + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + if (cpuc->enabled) + return; + + cpuc->enabled = 1; + barrier(); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); + + if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { + struct perf_event *event = + cpuc->events[X86_PMC_IDX_FIXED_BTS]; + + if (WARN_ON_ONCE(!event)) + return; + + intel_pmu_enable_bts(event->hw.config); + } } static void amd_pmu_enable_all(void) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx; if (cpuc->enabled) @@ -873,14 +1119,14 @@ static void amd_pmu_enable_all(void) cpuc->enabled = 1; barrier(); - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - struct perf_counter *counter = cpuc->counters[idx]; + for (idx = 0; idx < x86_pmu.num_events; idx++) { + struct perf_event *event = cpuc->events[idx]; u64 val; if (!test_bit(idx, cpuc->active_mask)) continue; - val = counter->hw.config; + val = event->hw.config; val |= ARCH_PERFMON_EVENTSEL0_ENABLE; wrmsrl(MSR_K7_EVNTSEL0 + idx, val); } @@ -907,19 +1153,19 @@ static inline void intel_pmu_ack_status(u64 ack) wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); } -static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +static inline void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) { (void)checking_wrmsrl(hwc->config_base + idx, hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); } -static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +static inline void x86_pmu_disable_event(struct hw_perf_event *hwc, int idx) { (void)checking_wrmsrl(hwc->config_base + idx, hwc->config); } static inline void -intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx) +intel_pmu_disable_fixed(struct hw_perf_event *hwc, int __idx) { int idx = __idx - X86_PMC_IDX_FIXED; u64 ctrl_val, mask; @@ -932,10 +1178,10 @@ intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx) } static inline void -p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +p6_pmu_disable_event(struct hw_perf_event *hwc, int idx) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - u64 val = P6_NOP_COUNTER; + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + u64 val = P6_NOP_EVENT; if (cpuc->enabled) val |= ARCH_PERFMON_EVENTSEL0_ENABLE; @@ -944,36 +1190,44 @@ p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) } static inline void -intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +intel_pmu_disable_event(struct hw_perf_event *hwc, int idx) { + if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { + intel_pmu_disable_bts(); + return; + } + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { intel_pmu_disable_fixed(hwc, idx); return; } - x86_pmu_disable_counter(hwc, idx); + x86_pmu_disable_event(hwc, idx); } static inline void -amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +amd_pmu_disable_event(struct hw_perf_event *hwc, int idx) { - x86_pmu_disable_counter(hwc, idx); + x86_pmu_disable_event(hwc, idx); } -static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); +static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); /* * Set the next IRQ period, based on the hwc->period_left value. - * To be called with the counter disabled in hw: + * To be called with the event disabled in hw: */ static int -x86_perf_counter_set_period(struct perf_counter *counter, - struct hw_perf_counter *hwc, int idx) +x86_perf_event_set_period(struct perf_event *event, + struct hw_perf_event *hwc, int idx) { s64 left = atomic64_read(&hwc->period_left); s64 period = hwc->sample_period; int err, ret = 0; + if (idx == X86_PMC_IDX_FIXED_BTS) + return 0; + /* * If we are way outside a reasoable range then just skip forward: */ @@ -991,7 +1245,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, ret = 1; } /* - * Quirk: certain CPUs dont like it if just 1 event is left: + * Quirk: certain CPUs dont like it if just 1 hw_event is left: */ if (unlikely(left < 2)) left = 2; @@ -999,24 +1253,24 @@ x86_perf_counter_set_period(struct perf_counter *counter, if (left > x86_pmu.max_period) left = x86_pmu.max_period; - per_cpu(prev_left[idx], smp_processor_id()) = left; + per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; /* - * The hw counter starts counting from this counter offset, + * The hw event starts counting from this event offset, * mark it to be able to extra future deltas: */ atomic64_set(&hwc->prev_count, (u64)-left); - err = checking_wrmsrl(hwc->counter_base + idx, - (u64)(-left) & x86_pmu.counter_mask); + err = checking_wrmsrl(hwc->event_base + idx, + (u64)(-left) & x86_pmu.event_mask); - perf_counter_update_userpage(counter); + perf_event_update_userpage(event); return ret; } static inline void -intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx) +intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx) { int idx = __idx - X86_PMC_IDX_FIXED; u64 ctrl_val, bits, mask; @@ -1041,9 +1295,9 @@ intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx) err = checking_wrmsrl(hwc->config_base, ctrl_val); } -static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); u64 val; val = hwc->config; @@ -1054,128 +1308,149 @@ static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) } -static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx) { + if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { + if (!__get_cpu_var(cpu_hw_events).enabled) + return; + + intel_pmu_enable_bts(hwc->config); + return; + } + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { intel_pmu_enable_fixed(hwc, idx); return; } - x86_pmu_enable_counter(hwc, idx); + x86_pmu_enable_event(hwc, idx); } -static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); if (cpuc->enabled) - x86_pmu_enable_counter(hwc, idx); + x86_pmu_enable_event(hwc, idx); } static int -fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) +fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc) { - unsigned int event; + unsigned int hw_event; - if (!x86_pmu.num_counters_fixed) - return -1; + hw_event = hwc->config & ARCH_PERFMON_EVENT_MASK; - event = hwc->config & ARCH_PERFMON_EVENT_MASK; + if (unlikely((hw_event == + x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && + (hwc->sample_period == 1))) + return X86_PMC_IDX_FIXED_BTS; + + if (!x86_pmu.num_events_fixed) + return -1; - if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) + if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) return X86_PMC_IDX_FIXED_INSTRUCTIONS; - if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) + if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) return X86_PMC_IDX_FIXED_CPU_CYCLES; - if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES))) + if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES))) return X86_PMC_IDX_FIXED_BUS_CYCLES; return -1; } /* - * Find a PMC slot for the freshly enabled / scheduled in counter: + * Find a PMC slot for the freshly enabled / scheduled in event: */ -static int x86_pmu_enable(struct perf_counter *counter) +static int x86_pmu_enable(struct perf_event *event) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - struct hw_perf_counter *hwc = &counter->hw; + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; int idx; - idx = fixed_mode_idx(counter, hwc); - if (idx >= 0) { + idx = fixed_mode_idx(event, hwc); + if (idx == X86_PMC_IDX_FIXED_BTS) { + /* BTS is already occupied. */ + if (test_and_set_bit(idx, cpuc->used_mask)) + return -EAGAIN; + + hwc->config_base = 0; + hwc->event_base = 0; + hwc->idx = idx; + } else if (idx >= 0) { /* - * Try to get the fixed counter, if that is already taken - * then try to get a generic counter: + * Try to get the fixed event, if that is already taken + * then try to get a generic event: */ if (test_and_set_bit(idx, cpuc->used_mask)) goto try_generic; hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; /* - * We set it so that counter_base + idx in wrmsr/rdmsr maps to + * We set it so that event_base + idx in wrmsr/rdmsr maps to * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2: */ - hwc->counter_base = + hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; hwc->idx = idx; } else { idx = hwc->idx; - /* Try to get the previous generic counter again */ + /* Try to get the previous generic event again */ if (test_and_set_bit(idx, cpuc->used_mask)) { try_generic: idx = find_first_zero_bit(cpuc->used_mask, - x86_pmu.num_counters); - if (idx == x86_pmu.num_counters) + x86_pmu.num_events); + if (idx == x86_pmu.num_events) return -EAGAIN; set_bit(idx, cpuc->used_mask); hwc->idx = idx; } hwc->config_base = x86_pmu.eventsel; - hwc->counter_base = x86_pmu.perfctr; + hwc->event_base = x86_pmu.perfctr; } - perf_counters_lapic_init(); + perf_events_lapic_init(); x86_pmu.disable(hwc, idx); - cpuc->counters[idx] = counter; + cpuc->events[idx] = event; set_bit(idx, cpuc->active_mask); - x86_perf_counter_set_period(counter, hwc, idx); + x86_perf_event_set_period(event, hwc, idx); x86_pmu.enable(hwc, idx); - perf_counter_update_userpage(counter); + perf_event_update_userpage(event); return 0; } -static void x86_pmu_unthrottle(struct perf_counter *counter) +static void x86_pmu_unthrottle(struct perf_event *event) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - struct hw_perf_counter *hwc = &counter->hw; + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX || - cpuc->counters[hwc->idx] != counter)) + cpuc->events[hwc->idx] != event)) return; x86_pmu.enable(hwc, hwc->idx); } -void perf_counter_print_debug(void) +void perf_event_print_debug(void) { u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; - struct cpu_hw_counters *cpuc; + struct cpu_hw_events *cpuc; unsigned long flags; int cpu, idx; - if (!x86_pmu.num_counters) + if (!x86_pmu.num_events) return; local_irq_save(flags); cpu = smp_processor_id(); - cpuc = &per_cpu(cpu_hw_counters, cpu); + cpuc = &per_cpu(cpu_hw_events, cpu); if (x86_pmu.version >= 2) { rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); @@ -1191,11 +1466,11 @@ void perf_counter_print_debug(void) } pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask); - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for (idx = 0; idx < x86_pmu.num_events; idx++) { rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); rdmsrl(x86_pmu.perfctr + idx, pmc_count); - prev_left = per_cpu(prev_left[idx], cpu); + prev_left = per_cpu(pmc_prev_left[idx], cpu); pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", cpu, idx, pmc_ctrl); @@ -1204,7 +1479,7 @@ void perf_counter_print_debug(void) pr_info("CPU#%d: gen-PMC%d left: %016llx\n", cpu, idx, prev_left); } - for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { + for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", @@ -1213,10 +1488,69 @@ void perf_counter_print_debug(void) local_irq_restore(flags); } -static void x86_pmu_disable(struct perf_counter *counter) +static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - struct hw_perf_counter *hwc = &counter->hw; + struct debug_store *ds = cpuc->ds; + struct bts_record { + u64 from; + u64 to; + u64 flags; + }; + struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS]; + struct bts_record *at, *top; + struct perf_output_handle handle; + struct perf_event_header header; + struct perf_sample_data data; + struct pt_regs regs; + + if (!event) + return; + + if (!ds) + return; + + at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; + top = (struct bts_record *)(unsigned long)ds->bts_index; + + if (top <= at) + return; + + ds->bts_index = ds->bts_buffer_base; + + + data.period = event->hw.last_period; + data.addr = 0; + regs.ip = 0; + + /* + * Prepare a generic sample, i.e. fill in the invariant fields. + * We will overwrite the from and to address before we output + * the sample. + */ + perf_prepare_sample(&header, &data, event, ®s); + + if (perf_output_begin(&handle, event, + header.size * (top - at), 1, 1)) + return; + + for (; at < top; at++) { + data.ip = at->from; + data.addr = at->to; + + perf_output_sample(&handle, &header, &data, event); + } + + perf_output_end(&handle); + + /* There's new data available. */ + event->hw.interrupts++; + event->pending_kill = POLL_IN; +} + +static void x86_pmu_disable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; /* @@ -1228,59 +1562,67 @@ static void x86_pmu_disable(struct perf_counter *counter) /* * Make sure the cleared pointer becomes visible before we - * (potentially) free the counter: + * (potentially) free the event: */ barrier(); /* - * Drain the remaining delta count out of a counter + * Drain the remaining delta count out of a event * that we are disabling: */ - x86_perf_counter_update(counter, hwc, idx); - cpuc->counters[idx] = NULL; + x86_perf_event_update(event, hwc, idx); + + /* Drain the remaining BTS records. */ + if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) + intel_pmu_drain_bts_buffer(cpuc); + + cpuc->events[idx] = NULL; clear_bit(idx, cpuc->used_mask); - perf_counter_update_userpage(counter); + perf_event_update_userpage(event); } /* - * Save and restart an expired counter. Called by NMI contexts, - * so it has to be careful about preempting normal counter ops: + * Save and restart an expired event. Called by NMI contexts, + * so it has to be careful about preempting normal event ops: */ -static int intel_pmu_save_and_restart(struct perf_counter *counter) +static int intel_pmu_save_and_restart(struct perf_event *event) { - struct hw_perf_counter *hwc = &counter->hw; + struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; int ret; - x86_perf_counter_update(counter, hwc, idx); - ret = x86_perf_counter_set_period(counter, hwc, idx); + x86_perf_event_update(event, hwc, idx); + ret = x86_perf_event_set_period(event, hwc, idx); - if (counter->state == PERF_COUNTER_STATE_ACTIVE) - intel_pmu_enable_counter(hwc, idx); + if (event->state == PERF_EVENT_STATE_ACTIVE) + intel_pmu_enable_event(hwc, idx); return ret; } static void intel_pmu_reset(void) { + struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds; unsigned long flags; int idx; - if (!x86_pmu.num_counters) + if (!x86_pmu.num_events) return; local_irq_save(flags); printk("clearing PMU state on CPU#%d\n", smp_processor_id()); - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for (idx = 0; idx < x86_pmu.num_events; idx++) { checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); } - for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { + for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); } + if (ds) + ds->bts_index = ds->bts_buffer_base; local_irq_restore(flags); } @@ -1288,39 +1630,38 @@ static void intel_pmu_reset(void) static int p6_pmu_handle_irq(struct pt_regs *regs) { struct perf_sample_data data; - struct cpu_hw_counters *cpuc; - struct perf_counter *counter; - struct hw_perf_counter *hwc; + struct cpu_hw_events *cpuc; + struct perf_event *event; + struct hw_perf_event *hwc; int idx, handled = 0; u64 val; - data.regs = regs; data.addr = 0; - cpuc = &__get_cpu_var(cpu_hw_counters); + cpuc = &__get_cpu_var(cpu_hw_events); - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for (idx = 0; idx < x86_pmu.num_events; idx++) { if (!test_bit(idx, cpuc->active_mask)) continue; - counter = cpuc->counters[idx]; - hwc = &counter->hw; + event = cpuc->events[idx]; + hwc = &event->hw; - val = x86_perf_counter_update(counter, hwc, idx); - if (val & (1ULL << (x86_pmu.counter_bits - 1))) + val = x86_perf_event_update(event, hwc, idx); + if (val & (1ULL << (x86_pmu.event_bits - 1))) continue; /* - * counter overflow + * event overflow */ handled = 1; - data.period = counter->hw.last_period; + data.period = event->hw.last_period; - if (!x86_perf_counter_set_period(counter, hwc, idx)) + if (!x86_perf_event_set_period(event, hwc, idx)) continue; - if (perf_counter_overflow(counter, 1, &data)) - p6_pmu_disable_counter(hwc, idx); + if (perf_event_overflow(event, 1, &data, regs)) + p6_pmu_disable_event(hwc, idx); } if (handled) @@ -1336,16 +1677,16 @@ static int p6_pmu_handle_irq(struct pt_regs *regs) static int intel_pmu_handle_irq(struct pt_regs *regs) { struct perf_sample_data data; - struct cpu_hw_counters *cpuc; + struct cpu_hw_events *cpuc; int bit, loops; u64 ack, status; - data.regs = regs; data.addr = 0; - cpuc = &__get_cpu_var(cpu_hw_counters); + cpuc = &__get_cpu_var(cpu_hw_events); perf_disable(); + intel_pmu_drain_bts_buffer(cpuc); status = intel_pmu_get_status(); if (!status) { perf_enable(); @@ -1355,8 +1696,8 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) loops = 0; again: if (++loops > 100) { - WARN_ONCE(1, "perfcounters: irq loop stuck!\n"); - perf_counter_print_debug(); + WARN_ONCE(1, "perfevents: irq loop stuck!\n"); + perf_event_print_debug(); intel_pmu_reset(); perf_enable(); return 1; @@ -1365,19 +1706,19 @@ again: inc_irq_stat(apic_perf_irqs); ack = status; for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { - struct perf_counter *counter = cpuc->counters[bit]; + struct perf_event *event = cpuc->events[bit]; clear_bit(bit, (unsigned long *) &status); if (!test_bit(bit, cpuc->active_mask)) continue; - if (!intel_pmu_save_and_restart(counter)) + if (!intel_pmu_save_and_restart(event)) continue; - data.period = counter->hw.last_period; + data.period = event->hw.last_period; - if (perf_counter_overflow(counter, 1, &data)) - intel_pmu_disable_counter(&counter->hw, bit); + if (perf_event_overflow(event, 1, &data, regs)) + intel_pmu_disable_event(&event->hw, bit); } intel_pmu_ack_status(ack); @@ -1397,39 +1738,38 @@ again: static int amd_pmu_handle_irq(struct pt_regs *regs) { struct perf_sample_data data; - struct cpu_hw_counters *cpuc; - struct perf_counter *counter; - struct hw_perf_counter *hwc; + struct cpu_hw_events *cpuc; + struct perf_event *event; + struct hw_perf_event *hwc; int idx, handled = 0; u64 val; - data.regs = regs; data.addr = 0; - cpuc = &__get_cpu_var(cpu_hw_counters); + cpuc = &__get_cpu_var(cpu_hw_events); - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for (idx = 0; idx < x86_pmu.num_events; idx++) { if (!test_bit(idx, cpuc->active_mask)) continue; - counter = cpuc->counters[idx]; - hwc = &counter->hw; + event = cpuc->events[idx]; + hwc = &event->hw; - val = x86_perf_counter_update(counter, hwc, idx); - if (val & (1ULL << (x86_pmu.counter_bits - 1))) + val = x86_perf_event_update(event, hwc, idx); + if (val & (1ULL << (x86_pmu.event_bits - 1))) continue; /* - * counter overflow + * event overflow */ handled = 1; - data.period = counter->hw.last_period; + data.period = event->hw.last_period; - if (!x86_perf_counter_set_period(counter, hwc, idx)) + if (!x86_perf_event_set_period(event, hwc, idx)) continue; - if (perf_counter_overflow(counter, 1, &data)) - amd_pmu_disable_counter(hwc, idx); + if (perf_event_overflow(event, 1, &data, regs)) + amd_pmu_disable_event(hwc, idx); } if (handled) @@ -1443,34 +1783,41 @@ void smp_perf_pending_interrupt(struct pt_regs *regs) irq_enter(); ack_APIC_irq(); inc_irq_stat(apic_pending_irqs); - perf_counter_do_pending(); + perf_event_do_pending(); irq_exit(); } -void set_perf_counter_pending(void) +void set_perf_event_pending(void) { +#ifdef CONFIG_X86_LOCAL_APIC + if (!x86_pmu.apic || !x86_pmu_initialized()) + return; + apic->send_IPI_self(LOCAL_PENDING_VECTOR); +#endif } -void perf_counters_lapic_init(void) +void perf_events_lapic_init(void) { - if (!x86_pmu_initialized()) +#ifdef CONFIG_X86_LOCAL_APIC + if (!x86_pmu.apic || !x86_pmu_initialized()) return; /* * Always use NMI for PMU */ apic_write(APIC_LVTPC, APIC_DM_NMI); +#endif } static int __kprobes -perf_counter_nmi_handler(struct notifier_block *self, +perf_event_nmi_handler(struct notifier_block *self, unsigned long cmd, void *__args) { struct die_args *args = __args; struct pt_regs *regs; - if (!atomic_read(&active_counters)) + if (!atomic_read(&active_events)) return NOTIFY_DONE; switch (cmd) { @@ -1484,10 +1831,12 @@ perf_counter_nmi_handler(struct notifier_block *self, regs = args->regs; +#ifdef CONFIG_X86_LOCAL_APIC apic_write(APIC_LVTPC, APIC_DM_NMI); +#endif /* * Can't rely on the handled return value to say it was our NMI, two - * counters could trigger 'simultaneously' raising two back-to-back NMIs. + * events could trigger 'simultaneously' raising two back-to-back NMIs. * * If the first NMI handles both, the latter will be empty and daze * the CPU. @@ -1497,8 +1846,8 @@ perf_counter_nmi_handler(struct notifier_block *self, return NOTIFY_STOP; } -static __read_mostly struct notifier_block perf_counter_nmi_notifier = { - .notifier_call = perf_counter_nmi_handler, +static __read_mostly struct notifier_block perf_event_nmi_notifier = { + .notifier_call = perf_event_nmi_handler, .next = NULL, .priority = 1 }; @@ -1508,25 +1857,26 @@ static struct x86_pmu p6_pmu = { .handle_irq = p6_pmu_handle_irq, .disable_all = p6_pmu_disable_all, .enable_all = p6_pmu_enable_all, - .enable = p6_pmu_enable_counter, - .disable = p6_pmu_disable_counter, + .enable = p6_pmu_enable_event, + .disable = p6_pmu_disable_event, .eventsel = MSR_P6_EVNTSEL0, .perfctr = MSR_P6_PERFCTR0, .event_map = p6_pmu_event_map, .raw_event = p6_pmu_raw_event, .max_events = ARRAY_SIZE(p6_perfmon_event_map), + .apic = 1, .max_period = (1ULL << 31) - 1, .version = 0, - .num_counters = 2, + .num_events = 2, /* - * Counters have 40 bits implemented. However they are designed such + * Events have 40 bits implemented. However they are designed such * that bits [32-39] are sign extensions of bit 31. As such the - * effective width of a counter for P6-like PMU is 32 bits only. + * effective width of a event for P6-like PMU is 32 bits only. * * See IA-32 Intel Architecture Software developer manual Vol 3B */ - .counter_bits = 32, - .counter_mask = (1ULL << 32) - 1, + .event_bits = 32, + .event_mask = (1ULL << 32) - 1, }; static struct x86_pmu intel_pmu = { @@ -1534,19 +1884,22 @@ static struct x86_pmu intel_pmu = { .handle_irq = intel_pmu_handle_irq, .disable_all = intel_pmu_disable_all, .enable_all = intel_pmu_enable_all, - .enable = intel_pmu_enable_counter, - .disable = intel_pmu_disable_counter, + .enable = intel_pmu_enable_event, + .disable = intel_pmu_disable_event, .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, .perfctr = MSR_ARCH_PERFMON_PERFCTR0, .event_map = intel_pmu_event_map, .raw_event = intel_pmu_raw_event, .max_events = ARRAY_SIZE(intel_perfmon_event_map), + .apic = 1, /* * Intel PMCs cannot be accessed sanely above 32 bit width, * so we install an artificial 1<<31 period regardless of - * the generic counter period: + * the generic event period: */ .max_period = (1ULL << 31) - 1, + .enable_bts = intel_pmu_enable_bts, + .disable_bts = intel_pmu_disable_bts, }; static struct x86_pmu amd_pmu = { @@ -1554,16 +1907,17 @@ static struct x86_pmu amd_pmu = { .handle_irq = amd_pmu_handle_irq, .disable_all = amd_pmu_disable_all, .enable_all = amd_pmu_enable_all, - .enable = amd_pmu_enable_counter, - .disable = amd_pmu_disable_counter, + .enable = amd_pmu_enable_event, + .disable = amd_pmu_disable_event, .eventsel = MSR_K7_EVNTSEL0, .perfctr = MSR_K7_PERFCTR0, .event_map = amd_pmu_event_map, .raw_event = amd_pmu_raw_event, .max_events = ARRAY_SIZE(amd_perfmon_event_map), - .num_counters = 4, - .counter_bits = 48, - .counter_mask = (1ULL << 48) - 1, + .num_events = 4, + .event_bits = 48, + .event_mask = (1ULL << 48) - 1, + .apic = 1, /* use highest bit to detect overflow */ .max_period = (1ULL << 47) - 1, }; @@ -1589,13 +1943,14 @@ static int p6_pmu_init(void) return -ENODEV; } + x86_pmu = p6_pmu; + if (!cpu_has_apic) { - pr_info("no Local APIC, try rebooting with lapic"); - return -ENODEV; + pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); + pr_info("no hardware sampling interrupt available.\n"); + x86_pmu.apic = 0; } - x86_pmu = p6_pmu; - return 0; } @@ -1618,7 +1973,7 @@ static int intel_pmu_init(void) /* * Check whether the Architectural PerfMon supports - * Branch Misses Retired Event or not. + * Branch Misses Retired hw_event or not. */ cpuid(10, &eax.full, &ebx, &unused, &edx.full); if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) @@ -1630,15 +1985,15 @@ static int intel_pmu_init(void) x86_pmu = intel_pmu; x86_pmu.version = version; - x86_pmu.num_counters = eax.split.num_counters; - x86_pmu.counter_bits = eax.split.bit_width; - x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1; + x86_pmu.num_events = eax.split.num_events; + x86_pmu.event_bits = eax.split.bit_width; + x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1; /* - * Quirk: v2 perfmon does not report fixed-purpose counters, so - * assume at least 3 counters: + * Quirk: v2 perfmon does not report fixed-purpose events, so + * assume at least 3 events: */ - x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); + x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3); /* * Install the hw-cache-events table: @@ -1685,11 +2040,11 @@ static int amd_pmu_init(void) return 0; } -void __init init_hw_perf_counters(void) +void __init init_hw_perf_events(void) { int err; - pr_info("Performance Counters: "); + pr_info("Performance Events: "); switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: @@ -1702,45 +2057,45 @@ void __init init_hw_perf_counters(void) return; } if (err != 0) { - pr_cont("no PMU driver, software counters only.\n"); + pr_cont("no PMU driver, software events only.\n"); return; } pr_cont("%s PMU driver.\n", x86_pmu.name); - if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { - WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", - x86_pmu.num_counters, X86_PMC_MAX_GENERIC); - x86_pmu.num_counters = X86_PMC_MAX_GENERIC; + if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) { + WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", + x86_pmu.num_events, X86_PMC_MAX_GENERIC); + x86_pmu.num_events = X86_PMC_MAX_GENERIC; } - perf_counter_mask = (1 << x86_pmu.num_counters) - 1; - perf_max_counters = x86_pmu.num_counters; + perf_event_mask = (1 << x86_pmu.num_events) - 1; + perf_max_events = x86_pmu.num_events; - if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { - WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", - x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); - x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; + if (x86_pmu.num_events_fixed > X86_PMC_MAX_FIXED) { + WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", + x86_pmu.num_events_fixed, X86_PMC_MAX_FIXED); + x86_pmu.num_events_fixed = X86_PMC_MAX_FIXED; } - perf_counter_mask |= - ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; - x86_pmu.intel_ctrl = perf_counter_mask; + perf_event_mask |= + ((1LL << x86_pmu.num_events_fixed)-1) << X86_PMC_IDX_FIXED; + x86_pmu.intel_ctrl = perf_event_mask; - perf_counters_lapic_init(); - register_die_notifier(&perf_counter_nmi_notifier); + perf_events_lapic_init(); + register_die_notifier(&perf_event_nmi_notifier); - pr_info("... version: %d\n", x86_pmu.version); - pr_info("... bit width: %d\n", x86_pmu.counter_bits); - pr_info("... generic counters: %d\n", x86_pmu.num_counters); - pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask); - pr_info("... max period: %016Lx\n", x86_pmu.max_period); - pr_info("... fixed-purpose counters: %d\n", x86_pmu.num_counters_fixed); - pr_info("... counter mask: %016Lx\n", perf_counter_mask); + pr_info("... version: %d\n", x86_pmu.version); + pr_info("... bit width: %d\n", x86_pmu.event_bits); + pr_info("... generic registers: %d\n", x86_pmu.num_events); + pr_info("... value mask: %016Lx\n", x86_pmu.event_mask); + pr_info("... max period: %016Lx\n", x86_pmu.max_period); + pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed); + pr_info("... event mask: %016Lx\n", perf_event_mask); } -static inline void x86_pmu_read(struct perf_counter *counter) +static inline void x86_pmu_read(struct perf_event *event) { - x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); + x86_perf_event_update(event, &event->hw, event->hw.idx); } static const struct pmu pmu = { @@ -1750,13 +2105,16 @@ static const struct pmu pmu = { .unthrottle = x86_pmu_unthrottle, }; -const struct pmu *hw_perf_counter_init(struct perf_counter *counter) +const struct pmu *hw_perf_event_init(struct perf_event *event) { int err; - err = __hw_perf_counter_init(counter); - if (err) + err = __hw_perf_event_init(event); + if (err) { + if (event->destroy) + event->destroy(event); return ERR_PTR(err); + } return &pmu; } @@ -1772,8 +2130,8 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip) entry->ip[entry->nr++] = ip; } -static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry); -static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry); +static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); +static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); static DEFINE_PER_CPU(int, in_nmi_frame); @@ -1926,9 +2284,9 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) struct perf_callchain_entry *entry; if (in_nmi()) - entry = &__get_cpu_var(nmi_entry); + entry = &__get_cpu_var(pmc_nmi_entry); else - entry = &__get_cpu_var(irq_entry); + entry = &__get_cpu_var(pmc_irq_entry); entry->nr = 0; @@ -1936,3 +2294,8 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return entry; } + +void hw_perf_event_setup_online(int cpu) +{ + init_debug_store_on_cpu(cpu); +} diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index e60ed740d2b..fab786f60ed 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -20,7 +20,7 @@ #include <linux/kprobes.h> #include <asm/apic.h> -#include <asm/perf_counter.h> +#include <asm/perf_event.h> struct nmi_watchdog_ctlblk { unsigned int cccr_msr; @@ -68,16 +68,16 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) /* returns the bit offset of the performance counter register */ switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: - return (msr - MSR_K7_PERFCTR0); + return msr - MSR_K7_PERFCTR0; case X86_VENDOR_INTEL: if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return (msr - MSR_ARCH_PERFMON_PERFCTR0); + return msr - MSR_ARCH_PERFMON_PERFCTR0; switch (boot_cpu_data.x86) { case 6: - return (msr - MSR_P6_PERFCTR0); + return msr - MSR_P6_PERFCTR0; case 15: - return (msr - MSR_P4_BPU_PERFCTR0); + return msr - MSR_P4_BPU_PERFCTR0; } } return 0; @@ -92,16 +92,16 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) /* returns the bit offset of the event selection register */ switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: - return (msr - MSR_K7_EVNTSEL0); + return msr - MSR_K7_EVNTSEL0; case X86_VENDOR_INTEL: if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return (msr - MSR_ARCH_PERFMON_EVENTSEL0); + return msr - MSR_ARCH_PERFMON_EVENTSEL0; switch (boot_cpu_data.x86) { case 6: - return (msr - MSR_P6_EVNTSEL0); + return msr - MSR_P6_EVNTSEL0; case 15: - return (msr - MSR_P4_BSU_ESCR0); + return msr - MSR_P4_BSU_ESCR0; } } return 0; @@ -113,7 +113,7 @@ int avail_to_resrv_perfctr_nmi_bit(unsigned int counter) { BUG_ON(counter > NMI_MAX_COUNTER_BITS); - return (!test_bit(counter, perfctr_nmi_owner)); + return !test_bit(counter, perfctr_nmi_owner); } /* checks the an msr for availability */ @@ -124,7 +124,7 @@ int avail_to_resrv_perfctr_nmi(unsigned int msr) counter = nmi_perfctr_msr_to_bit(msr); BUG_ON(counter > NMI_MAX_COUNTER_BITS); - return (!test_bit(counter, perfctr_nmi_owner)); + return !test_bit(counter, perfctr_nmi_owner); } EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); @@ -237,7 +237,7 @@ static unsigned int adjust_for_32bit_ctr(unsigned int hz) */ counter_val = (u64)cpu_khz * 1000; do_div(counter_val, retval); - if (counter_val > 0x7fffffffULL) { + if (counter_val > 0x7fffffffULL) { u64 count = (u64)cpu_khz * 1000; do_div(count, 0x7fffffffUL); retval = count + 1; @@ -251,7 +251,7 @@ static void write_watchdog_counter(unsigned int perfctr_msr, u64 count = (u64)cpu_khz * 1000; do_div(count, nmi_hz); - if(descr) + if (descr) pr_debug("setting %s to -0x%08Lx\n", descr, count); wrmsrl(perfctr_msr, 0 - count); } @@ -262,7 +262,7 @@ static void write_watchdog_counter32(unsigned int perfctr_msr, u64 count = (u64)cpu_khz * 1000; do_div(count, nmi_hz); - if(descr) + if (descr) pr_debug("setting %s to -0x%08Lx\n", descr, count); wrmsr(perfctr_msr, (u32)(-count), 0); } @@ -296,7 +296,7 @@ static int setup_k7_watchdog(unsigned nmi_hz) /* setup the timer */ wrmsr(evntsel_msr, evntsel, 0); - write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz); + write_watchdog_counter(perfctr_msr, "K7_PERFCTR0", nmi_hz); /* initialize the wd struct before enabling */ wd->perfctr_msr = perfctr_msr; @@ -387,7 +387,7 @@ static int setup_p6_watchdog(unsigned nmi_hz) /* setup the timer */ wrmsr(evntsel_msr, evntsel, 0); nmi_hz = adjust_for_32bit_ctr(nmi_hz); - write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz); + write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0", nmi_hz); /* initialize the wd struct before enabling */ wd->perfctr_msr = perfctr_msr; @@ -415,7 +415,7 @@ static void __kprobes p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) apic_write(APIC_LVTPC, APIC_DM_NMI); /* P6/ARCH_PERFMON has 32 bit counter write */ - write_watchdog_counter32(wd->perfctr_msr, NULL,nmi_hz); + write_watchdog_counter32(wd->perfctr_msr, NULL, nmi_hz); } static const struct wd_ops p6_wd_ops = { @@ -490,9 +490,9 @@ static int setup_p4_watchdog(unsigned nmi_hz) if (smp_num_siblings == 2) { unsigned int ebx, apicid; - ebx = cpuid_ebx(1); - apicid = (ebx >> 24) & 0xff; - ht_num = apicid & 1; + ebx = cpuid_ebx(1); + apicid = (ebx >> 24) & 0xff; + ht_num = apicid & 1; } else #endif ht_num = 0; @@ -544,7 +544,7 @@ static int setup_p4_watchdog(unsigned nmi_hz) } evntsel = P4_ESCR_EVENT_SELECT(0x3F) - | P4_ESCR_OS + | P4_ESCR_OS | P4_ESCR_USR; cccr_val |= P4_CCCR_THRESHOLD(15) @@ -612,7 +612,7 @@ static void __kprobes p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) { unsigned dummy; /* - * P4 quirks: + * P4 quirks: * - An overflown perfctr will assert its interrupt * until the OVF flag in its CCCR is cleared. * - LVTPC is masked on interrupt and must be @@ -662,7 +662,8 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz) * NOTE: Corresponding bit = 0 in ebx indicates event present. */ cpuid(10, &(eax.full), &ebx, &unused, &unused); - if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || + if ((eax.split.mask_length < + (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) return 0; diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index d5e30397246..62ac8cb6ba2 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -116,11 +116,9 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); #endif seq_printf(m, "clflush size\t: %u\n", c->x86_clflush_size); -#ifdef CONFIG_X86_64 seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment); seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", c->x86_phys_bits, c->x86_virt_bits); -#endif seq_printf(m, "power management:"); for (i = 0; i < 32; i++) { @@ -128,7 +126,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) if (i < ARRAY_SIZE(x86_power_flags) && x86_power_flags[i]) seq_printf(m, "%s%s", - x86_power_flags[i][0]?" ":"", + x86_power_flags[i][0] ? " " : "", x86_power_flags[i]); else seq_printf(m, " [%d]", i); diff --git a/arch/x86/kernel/cpu/sched.c b/arch/x86/kernel/cpu/sched.c new file mode 100644 index 00000000000..a640ae5ad20 --- /dev/null +++ b/arch/x86/kernel/cpu/sched.c @@ -0,0 +1,55 @@ +#include <linux/sched.h> +#include <linux/math64.h> +#include <linux/percpu.h> +#include <linux/irqflags.h> + +#include <asm/cpufeature.h> +#include <asm/processor.h> + +#ifdef CONFIG_SMP + +static DEFINE_PER_CPU(struct aperfmperf, old_perf_sched); + +static unsigned long scale_aperfmperf(void) +{ + struct aperfmperf val, *old = &__get_cpu_var(old_perf_sched); + unsigned long ratio, flags; + + local_irq_save(flags); + get_aperfmperf(&val); + local_irq_restore(flags); + + ratio = calc_aperfmperf_ratio(old, &val); + *old = val; + + return ratio; +} + +unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu) +{ + /* + * do aperf/mperf on the cpu level because it includes things + * like turbo mode, which are relevant to full cores. + */ + if (boot_cpu_has(X86_FEATURE_APERFMPERF)) + return scale_aperfmperf(); + + /* + * maybe have something cpufreq here + */ + + return default_scale_freq_power(sd, cpu); +} + +unsigned long arch_scale_smt_power(struct sched_domain *sd, int cpu) +{ + /* + * aperf/mperf already includes the smt gain + */ + if (boot_cpu_has(X86_FEATURE_APERFMPERF)) + return SCHED_LOAD_SCALE; + + return default_scale_smt_power(sd, cpu); +} + +#endif diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 284c399e323..1cbed97b59c 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -24,6 +24,7 @@ #include <linux/dmi.h> #include <asm/div64.h> #include <asm/vmware.h> +#include <asm/x86_init.h> #define CPUID_VMWARE_INFO_LEAF 0x40000000 #define VMWARE_HYPERVISOR_MAGIC 0x564D5868 @@ -47,19 +48,33 @@ static inline int __vmware_platform(void) return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC; } -static unsigned long __vmware_get_tsc_khz(void) +static unsigned long vmware_get_tsc_khz(void) { - uint64_t tsc_hz; - uint32_t eax, ebx, ecx, edx; + uint64_t tsc_hz; + uint32_t eax, ebx, ecx, edx; + + VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); + + tsc_hz = eax | (((uint64_t)ebx) << 32); + do_div(tsc_hz, 1000); + BUG_ON(tsc_hz >> 32); + printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n", + (unsigned long) tsc_hz / 1000, + (unsigned long) tsc_hz % 1000); + return tsc_hz; +} + +void __init vmware_platform_setup(void) +{ + uint32_t eax, ebx, ecx, edx; - VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); + VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); - if (ebx == UINT_MAX) - return 0; - tsc_hz = eax | (((uint64_t)ebx) << 32); - do_div(tsc_hz, 1000); - BUG_ON(tsc_hz >> 32); - return tsc_hz; + if (ebx != UINT_MAX) + x86_platform.calibrate_tsc = vmware_get_tsc_khz; + else + printk(KERN_WARNING + "Failed to get TSC freq from the hypervisor\n"); } /* @@ -87,12 +102,6 @@ int vmware_platform(void) return 0; } -unsigned long vmware_get_tsc_khz(void) -{ - BUG_ON(!vmware_platform()); - return __vmware_get_tsc_khz(); -} - /* * VMware hypervisor takes care of exporting a reliable TSC to the guest. * Still, due to timing difference when running on virtual cpus, the TSC can diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index b07af886124..6a52d4b36a3 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -182,7 +182,7 @@ static struct notifier_block __refdata cpuid_class_cpu_notifier = .notifier_call = cpuid_class_cpu_callback, }; -static char *cpuid_nodename(struct device *dev) +static char *cpuid_devnode(struct device *dev, mode_t *mode) { return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); } @@ -203,7 +203,7 @@ static int __init cpuid_init(void) err = PTR_ERR(cpuid_class); goto out_chrdev; } - cpuid_class->nodename = cpuid_nodename; + cpuid_class->devnode = cpuid_devnode; for_each_online_cpu(i) { err = cpuid_device_create(i); if (err != 0) diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c index b4f14c6c09d..37250fe490b 100644 --- a/arch/x86/kernel/doublefault_32.c +++ b/arch/x86/kernel/doublefault_32.c @@ -27,9 +27,7 @@ static void doublefault_fn(void) if (ptr_ok(gdt)) { gdt += GDT_ENTRY_TSS << 3; - tss = *(u16 *)(gdt+2); - tss += *(u8 *)(gdt+4) << 16; - tss += *(u8 *)(gdt+7) << 24; + tss = get_desc_base((struct desc_struct *)gdt); printk(KERN_EMERG "double fault, tss at %08lx\n", tss); if (ptr_ok(tss)) { diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 48bfe138603..ef42a038f1a 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -509,15 +509,15 @@ enum bts_field { bts_escape = ((unsigned long)-1 & ~bts_qual_mask) }; -static inline unsigned long bts_get(const char *base, enum bts_field field) +static inline unsigned long bts_get(const char *base, unsigned long field) { base += (ds_cfg.sizeof_ptr_field * field); return *(unsigned long *)base; } -static inline void bts_set(char *base, enum bts_field field, unsigned long val) +static inline void bts_set(char *base, unsigned long field, unsigned long val) { - base += (ds_cfg.sizeof_ptr_field * field);; + base += (ds_cfg.sizeof_ptr_field * field); (*(unsigned long *)base) = val; } diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index c8405718a4c..2d8a371d433 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -15,7 +15,6 @@ #include <linux/bug.h> #include <linux/nmi.h> #include <linux/sysfs.h> -#include <linux/ftrace.h> #include <asm/stacktrace.h> diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index bca5fba91c9..f7dd2a7c3bf 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -5,7 +5,6 @@ #include <linux/kallsyms.h> #include <linux/kprobes.h> #include <linux/uaccess.h> -#include <linux/utsname.h> #include <linux/hardirq.h> #include <linux/kdebug.h> #include <linux/module.h> diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 54b0a327676..a071e6be177 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -5,7 +5,6 @@ #include <linux/kallsyms.h> #include <linux/kprobes.h> #include <linux/uaccess.h> -#include <linux/utsname.h> #include <linux/hardirq.h> #include <linux/kdebug.h> #include <linux/module.h> diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 5cb5725b2ba..d17d482a04f 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -115,7 +115,7 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size, { int x = e820x->nr_map; - if (x == ARRAY_SIZE(e820x->map)) { + if (x >= ARRAY_SIZE(e820x->map)) { printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); return; } @@ -1331,7 +1331,7 @@ void __init e820_reserve_resources(void) struct resource *res; u64 end; - res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map); + res = alloc_bootmem(sizeof(struct resource) * e820.nr_map); e820_res = res; for (i = 0; i < e820.nr_map; i++) { end = e820.map[i].addr + e820.map[i].size - 1; @@ -1378,8 +1378,8 @@ static unsigned long ram_alignment(resource_size_t pos) if (mb < 16) return 1024*1024; - /* To 32MB for anything above that */ - return 32*1024*1024; + /* To 64MB for anything above that */ + return 64*1024*1024; } #define MAX_RESOURCE_SIZE ((resource_size_t)-1) @@ -1455,28 +1455,11 @@ char *__init default_machine_specific_memory_setup(void) return who; } -char *__init __attribute__((weak)) machine_specific_memory_setup(void) -{ - if (x86_quirks->arch_memory_setup) { - char *who = x86_quirks->arch_memory_setup(); - - if (who) - return who; - } - return default_machine_specific_memory_setup(); -} - -/* Overridden in paravirt.c if CONFIG_PARAVIRT */ -char * __init __attribute__((weak)) memory_setup(void) -{ - return machine_specific_memory_setup(); -} - void __init setup_memory_map(void) { char *who; - who = memory_setup(); + who = x86_init.resources.memory_setup(); memcpy(&e820_saved, &e820, sizeof(struct e820map)); printk(KERN_INFO "BIOS-provided physical RAM map:\n"); e820_print_map(who); diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 335f049d110..b9c830c12b4 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -160,721 +160,6 @@ static struct console early_serial_console = { .index = -1, }; -#ifdef CONFIG_EARLY_PRINTK_DBGP - -static struct ehci_caps __iomem *ehci_caps; -static struct ehci_regs __iomem *ehci_regs; -static struct ehci_dbg_port __iomem *ehci_debug; -static unsigned int dbgp_endpoint_out; - -struct ehci_dev { - u32 bus; - u32 slot; - u32 func; -}; - -static struct ehci_dev ehci_dev; - -#define USB_DEBUG_DEVNUM 127 - -#define DBGP_DATA_TOGGLE 0x8800 - -static inline u32 dbgp_pid_update(u32 x, u32 tok) -{ - return ((x ^ DBGP_DATA_TOGGLE) & 0xffff00) | (tok & 0xff); -} - -static inline u32 dbgp_len_update(u32 x, u32 len) -{ - return (x & ~0x0f) | (len & 0x0f); -} - -/* - * USB Packet IDs (PIDs) - */ - -/* token */ -#define USB_PID_OUT 0xe1 -#define USB_PID_IN 0x69 -#define USB_PID_SOF 0xa5 -#define USB_PID_SETUP 0x2d -/* handshake */ -#define USB_PID_ACK 0xd2 -#define USB_PID_NAK 0x5a -#define USB_PID_STALL 0x1e -#define USB_PID_NYET 0x96 -/* data */ -#define USB_PID_DATA0 0xc3 -#define USB_PID_DATA1 0x4b -#define USB_PID_DATA2 0x87 -#define USB_PID_MDATA 0x0f -/* Special */ -#define USB_PID_PREAMBLE 0x3c -#define USB_PID_ERR 0x3c -#define USB_PID_SPLIT 0x78 -#define USB_PID_PING 0xb4 -#define USB_PID_UNDEF_0 0xf0 - -#define USB_PID_DATA_TOGGLE 0x88 -#define DBGP_CLAIM (DBGP_OWNER | DBGP_ENABLED | DBGP_INUSE) - -#define PCI_CAP_ID_EHCI_DEBUG 0xa - -#define HUB_ROOT_RESET_TIME 50 /* times are in msec */ -#define HUB_SHORT_RESET_TIME 10 -#define HUB_LONG_RESET_TIME 200 -#define HUB_RESET_TIMEOUT 500 - -#define DBGP_MAX_PACKET 8 - -static int dbgp_wait_until_complete(void) -{ - u32 ctrl; - int loop = 0x100000; - - do { - ctrl = readl(&ehci_debug->control); - /* Stop when the transaction is finished */ - if (ctrl & DBGP_DONE) - break; - } while (--loop > 0); - - if (!loop) - return -1; - - /* - * Now that we have observed the completed transaction, - * clear the done bit. - */ - writel(ctrl | DBGP_DONE, &ehci_debug->control); - return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl); -} - -static void __init dbgp_mdelay(int ms) -{ - int i; - - while (ms--) { - for (i = 0; i < 1000; i++) - outb(0x1, 0x80); - } -} - -static void dbgp_breath(void) -{ - /* Sleep to give the debug port a chance to breathe */ -} - -static int dbgp_wait_until_done(unsigned ctrl) -{ - u32 pids, lpid; - int ret; - int loop = 3; - -retry: - writel(ctrl | DBGP_GO, &ehci_debug->control); - ret = dbgp_wait_until_complete(); - pids = readl(&ehci_debug->pids); - lpid = DBGP_PID_GET(pids); - - if (ret < 0) - return ret; - - /* - * If the port is getting full or it has dropped data - * start pacing ourselves, not necessary but it's friendly. - */ - if ((lpid == USB_PID_NAK) || (lpid == USB_PID_NYET)) - dbgp_breath(); - - /* If I get a NACK reissue the transmission */ - if (lpid == USB_PID_NAK) { - if (--loop > 0) - goto retry; - } - - return ret; -} - -static void dbgp_set_data(const void *buf, int size) -{ - const unsigned char *bytes = buf; - u32 lo, hi; - int i; - - lo = hi = 0; - for (i = 0; i < 4 && i < size; i++) - lo |= bytes[i] << (8*i); - for (; i < 8 && i < size; i++) - hi |= bytes[i] << (8*(i - 4)); - writel(lo, &ehci_debug->data03); - writel(hi, &ehci_debug->data47); -} - -static void __init dbgp_get_data(void *buf, int size) -{ - unsigned char *bytes = buf; - u32 lo, hi; - int i; - - lo = readl(&ehci_debug->data03); - hi = readl(&ehci_debug->data47); - for (i = 0; i < 4 && i < size; i++) - bytes[i] = (lo >> (8*i)) & 0xff; - for (; i < 8 && i < size; i++) - bytes[i] = (hi >> (8*(i - 4))) & 0xff; -} - -static int dbgp_bulk_write(unsigned devnum, unsigned endpoint, - const char *bytes, int size) -{ - u32 pids, addr, ctrl; - int ret; - - if (size > DBGP_MAX_PACKET) - return -1; - - addr = DBGP_EPADDR(devnum, endpoint); - - pids = readl(&ehci_debug->pids); - pids = dbgp_pid_update(pids, USB_PID_OUT); - - ctrl = readl(&ehci_debug->control); - ctrl = dbgp_len_update(ctrl, size); - ctrl |= DBGP_OUT; - ctrl |= DBGP_GO; - - dbgp_set_data(bytes, size); - writel(addr, &ehci_debug->address); - writel(pids, &ehci_debug->pids); - - ret = dbgp_wait_until_done(ctrl); - if (ret < 0) - return ret; - - return ret; -} - -static int __init dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data, - int size) -{ - u32 pids, addr, ctrl; - int ret; - - if (size > DBGP_MAX_PACKET) - return -1; - - addr = DBGP_EPADDR(devnum, endpoint); - - pids = readl(&ehci_debug->pids); - pids = dbgp_pid_update(pids, USB_PID_IN); - - ctrl = readl(&ehci_debug->control); - ctrl = dbgp_len_update(ctrl, size); - ctrl &= ~DBGP_OUT; - ctrl |= DBGP_GO; - - writel(addr, &ehci_debug->address); - writel(pids, &ehci_debug->pids); - ret = dbgp_wait_until_done(ctrl); - if (ret < 0) - return ret; - - if (size > ret) - size = ret; - dbgp_get_data(data, size); - return ret; -} - -static int __init dbgp_control_msg(unsigned devnum, int requesttype, - int request, int value, int index, void *data, int size) -{ - u32 pids, addr, ctrl; - struct usb_ctrlrequest req; - int read; - int ret; - - read = (requesttype & USB_DIR_IN) != 0; - if (size > (read ? DBGP_MAX_PACKET:0)) - return -1; - - /* Compute the control message */ - req.bRequestType = requesttype; - req.bRequest = request; - req.wValue = cpu_to_le16(value); - req.wIndex = cpu_to_le16(index); - req.wLength = cpu_to_le16(size); - - pids = DBGP_PID_SET(USB_PID_DATA0, USB_PID_SETUP); - addr = DBGP_EPADDR(devnum, 0); - - ctrl = readl(&ehci_debug->control); - ctrl = dbgp_len_update(ctrl, sizeof(req)); - ctrl |= DBGP_OUT; - ctrl |= DBGP_GO; - - /* Send the setup message */ - dbgp_set_data(&req, sizeof(req)); - writel(addr, &ehci_debug->address); - writel(pids, &ehci_debug->pids); - ret = dbgp_wait_until_done(ctrl); - if (ret < 0) - return ret; - - /* Read the result */ - return dbgp_bulk_read(devnum, 0, data, size); -} - - -/* Find a PCI capability */ -static u32 __init find_cap(u32 num, u32 slot, u32 func, int cap) -{ - u8 pos; - int bytes; - - if (!(read_pci_config_16(num, slot, func, PCI_STATUS) & - PCI_STATUS_CAP_LIST)) - return 0; - - pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST); - for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { - u8 id; - - pos &= ~3; - id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID); - if (id == 0xff) - break; - if (id == cap) - return pos; - - pos = read_pci_config_byte(num, slot, func, - pos+PCI_CAP_LIST_NEXT); - } - return 0; -} - -static u32 __init __find_dbgp(u32 bus, u32 slot, u32 func) -{ - u32 class; - - class = read_pci_config(bus, slot, func, PCI_CLASS_REVISION); - if ((class >> 8) != PCI_CLASS_SERIAL_USB_EHCI) - return 0; - - return find_cap(bus, slot, func, PCI_CAP_ID_EHCI_DEBUG); -} - -static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc) -{ - u32 bus, slot, func; - - for (bus = 0; bus < 256; bus++) { - for (slot = 0; slot < 32; slot++) { - for (func = 0; func < 8; func++) { - unsigned cap; - - cap = __find_dbgp(bus, slot, func); - - if (!cap) - continue; - if (ehci_num-- != 0) - continue; - *rbus = bus; - *rslot = slot; - *rfunc = func; - return cap; - } - } - } - return 0; -} - -static int __init ehci_reset_port(int port) -{ - u32 portsc; - u32 delay_time, delay; - int loop; - - /* Reset the usb debug port */ - portsc = readl(&ehci_regs->port_status[port - 1]); - portsc &= ~PORT_PE; - portsc |= PORT_RESET; - writel(portsc, &ehci_regs->port_status[port - 1]); - - delay = HUB_ROOT_RESET_TIME; - for (delay_time = 0; delay_time < HUB_RESET_TIMEOUT; - delay_time += delay) { - dbgp_mdelay(delay); - - portsc = readl(&ehci_regs->port_status[port - 1]); - if (portsc & PORT_RESET) { - /* force reset to complete */ - loop = 2; - writel(portsc & ~(PORT_RWC_BITS | PORT_RESET), - &ehci_regs->port_status[port - 1]); - do { - portsc = readl(&ehci_regs->port_status[port-1]); - } while ((portsc & PORT_RESET) && (--loop > 0)); - } - - /* Device went away? */ - if (!(portsc & PORT_CONNECT)) - return -ENOTCONN; - - /* bomb out completely if something weird happend */ - if ((portsc & PORT_CSC)) - return -EINVAL; - - /* If we've finished resetting, then break out of the loop */ - if (!(portsc & PORT_RESET) && (portsc & PORT_PE)) - return 0; - } - return -EBUSY; -} - -static int __init ehci_wait_for_port(int port) -{ - u32 status; - int ret, reps; - - for (reps = 0; reps < 3; reps++) { - dbgp_mdelay(100); - status = readl(&ehci_regs->status); - if (status & STS_PCD) { - ret = ehci_reset_port(port); - if (ret == 0) - return 0; - } - } - return -ENOTCONN; -} - -#ifdef DBGP_DEBUG -# define dbgp_printk early_printk -#else -static inline void dbgp_printk(const char *fmt, ...) { } -#endif - -typedef void (*set_debug_port_t)(int port); - -static void __init default_set_debug_port(int port) -{ -} - -static set_debug_port_t __initdata set_debug_port = default_set_debug_port; - -static void __init nvidia_set_debug_port(int port) -{ - u32 dword; - dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, - 0x74); - dword &= ~(0x0f<<12); - dword |= ((port & 0x0f)<<12); - write_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, 0x74, - dword); - dbgp_printk("set debug port to %d\n", port); -} - -static void __init detect_set_debug_port(void) -{ - u32 vendorid; - - vendorid = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, - 0x00); - - if ((vendorid & 0xffff) == 0x10de) { - dbgp_printk("using nvidia set_debug_port\n"); - set_debug_port = nvidia_set_debug_port; - } -} - -static int __init ehci_setup(void) -{ - struct usb_debug_descriptor dbgp_desc; - u32 cmd, ctrl, status, portsc, hcs_params; - u32 debug_port, new_debug_port = 0, n_ports; - u32 devnum; - int ret, i; - int loop; - int port_map_tried; - int playtimes = 3; - -try_next_time: - port_map_tried = 0; - -try_next_port: - - hcs_params = readl(&ehci_caps->hcs_params); - debug_port = HCS_DEBUG_PORT(hcs_params); - n_ports = HCS_N_PORTS(hcs_params); - - dbgp_printk("debug_port: %d\n", debug_port); - dbgp_printk("n_ports: %d\n", n_ports); - - for (i = 1; i <= n_ports; i++) { - portsc = readl(&ehci_regs->port_status[i-1]); - dbgp_printk("portstatus%d: %08x\n", i, portsc); - } - - if (port_map_tried && (new_debug_port != debug_port)) { - if (--playtimes) { - set_debug_port(new_debug_port); - goto try_next_time; - } - return -1; - } - - loop = 10; - /* Reset the EHCI controller */ - cmd = readl(&ehci_regs->command); - cmd |= CMD_RESET; - writel(cmd, &ehci_regs->command); - do { - cmd = readl(&ehci_regs->command); - } while ((cmd & CMD_RESET) && (--loop > 0)); - - if (!loop) { - dbgp_printk("can not reset ehci\n"); - return -1; - } - dbgp_printk("ehci reset done\n"); - - /* Claim ownership, but do not enable yet */ - ctrl = readl(&ehci_debug->control); - ctrl |= DBGP_OWNER; - ctrl &= ~(DBGP_ENABLED | DBGP_INUSE); - writel(ctrl, &ehci_debug->control); - - /* Start the ehci running */ - cmd = readl(&ehci_regs->command); - cmd &= ~(CMD_LRESET | CMD_IAAD | CMD_PSE | CMD_ASE | CMD_RESET); - cmd |= CMD_RUN; - writel(cmd, &ehci_regs->command); - - /* Ensure everything is routed to the EHCI */ - writel(FLAG_CF, &ehci_regs->configured_flag); - - /* Wait until the controller is no longer halted */ - loop = 10; - do { - status = readl(&ehci_regs->status); - } while ((status & STS_HALT) && (--loop > 0)); - - if (!loop) { - dbgp_printk("ehci can be started\n"); - return -1; - } - dbgp_printk("ehci started\n"); - - /* Wait for a device to show up in the debug port */ - ret = ehci_wait_for_port(debug_port); - if (ret < 0) { - dbgp_printk("No device found in debug port\n"); - goto next_debug_port; - } - dbgp_printk("ehci wait for port done\n"); - - /* Enable the debug port */ - ctrl = readl(&ehci_debug->control); - ctrl |= DBGP_CLAIM; - writel(ctrl, &ehci_debug->control); - ctrl = readl(&ehci_debug->control); - if ((ctrl & DBGP_CLAIM) != DBGP_CLAIM) { - dbgp_printk("No device in debug port\n"); - writel(ctrl & ~DBGP_CLAIM, &ehci_debug->control); - goto err; - } - dbgp_printk("debug ported enabled\n"); - - /* Completely transfer the debug device to the debug controller */ - portsc = readl(&ehci_regs->port_status[debug_port - 1]); - portsc &= ~PORT_PE; - writel(portsc, &ehci_regs->port_status[debug_port - 1]); - - dbgp_mdelay(100); - - /* Find the debug device and make it device number 127 */ - for (devnum = 0; devnum <= 127; devnum++) { - ret = dbgp_control_msg(devnum, - USB_DIR_IN | USB_TYPE_STANDARD | USB_RECIP_DEVICE, - USB_REQ_GET_DESCRIPTOR, (USB_DT_DEBUG << 8), 0, - &dbgp_desc, sizeof(dbgp_desc)); - if (ret > 0) - break; - } - if (devnum > 127) { - dbgp_printk("Could not find attached debug device\n"); - goto err; - } - if (ret < 0) { - dbgp_printk("Attached device is not a debug device\n"); - goto err; - } - dbgp_endpoint_out = dbgp_desc.bDebugOutEndpoint; - - /* Move the device to 127 if it isn't already there */ - if (devnum != USB_DEBUG_DEVNUM) { - ret = dbgp_control_msg(devnum, - USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE, - USB_REQ_SET_ADDRESS, USB_DEBUG_DEVNUM, 0, NULL, 0); - if (ret < 0) { - dbgp_printk("Could not move attached device to %d\n", - USB_DEBUG_DEVNUM); - goto err; - } - devnum = USB_DEBUG_DEVNUM; - dbgp_printk("debug device renamed to 127\n"); - } - - /* Enable the debug interface */ - ret = dbgp_control_msg(USB_DEBUG_DEVNUM, - USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE, - USB_REQ_SET_FEATURE, USB_DEVICE_DEBUG_MODE, 0, NULL, 0); - if (ret < 0) { - dbgp_printk(" Could not enable the debug device\n"); - goto err; - } - dbgp_printk("debug interface enabled\n"); - - /* Perform a small write to get the even/odd data state in sync - */ - ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, dbgp_endpoint_out, " ", 1); - if (ret < 0) { - dbgp_printk("dbgp_bulk_write failed: %d\n", ret); - goto err; - } - dbgp_printk("small write doned\n"); - - return 0; -err: - /* Things didn't work so remove my claim */ - ctrl = readl(&ehci_debug->control); - ctrl &= ~(DBGP_CLAIM | DBGP_OUT); - writel(ctrl, &ehci_debug->control); - return -1; - -next_debug_port: - port_map_tried |= (1<<(debug_port - 1)); - new_debug_port = ((debug_port-1+1)%n_ports) + 1; - if (port_map_tried != ((1<<n_ports) - 1)) { - set_debug_port(new_debug_port); - goto try_next_port; - } - if (--playtimes) { - set_debug_port(new_debug_port); - goto try_next_time; - } - - return -1; -} - -static int __init early_dbgp_init(char *s) -{ - u32 debug_port, bar, offset; - u32 bus, slot, func, cap; - void __iomem *ehci_bar; - u32 dbgp_num; - u32 bar_val; - char *e; - int ret; - u8 byte; - - if (!early_pci_allowed()) - return -1; - - dbgp_num = 0; - if (*s) - dbgp_num = simple_strtoul(s, &e, 10); - dbgp_printk("dbgp_num: %d\n", dbgp_num); - - cap = find_dbgp(dbgp_num, &bus, &slot, &func); - if (!cap) - return -1; - - dbgp_printk("Found EHCI debug port on %02x:%02x.%1x\n", bus, slot, - func); - - debug_port = read_pci_config(bus, slot, func, cap); - bar = (debug_port >> 29) & 0x7; - bar = (bar * 4) + 0xc; - offset = (debug_port >> 16) & 0xfff; - dbgp_printk("bar: %02x offset: %03x\n", bar, offset); - if (bar != PCI_BASE_ADDRESS_0) { - dbgp_printk("only debug ports on bar 1 handled.\n"); - - return -1; - } - - bar_val = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0); - dbgp_printk("bar_val: %02x offset: %03x\n", bar_val, offset); - if (bar_val & ~PCI_BASE_ADDRESS_MEM_MASK) { - dbgp_printk("only simple 32bit mmio bars supported\n"); - - return -1; - } - - /* double check if the mem space is enabled */ - byte = read_pci_config_byte(bus, slot, func, 0x04); - if (!(byte & 0x2)) { - byte |= 0x02; - write_pci_config_byte(bus, slot, func, 0x04, byte); - dbgp_printk("mmio for ehci enabled\n"); - } - - /* - * FIXME I don't have the bar size so just guess PAGE_SIZE is more - * than enough. 1K is the biggest I have seen. - */ - set_fixmap_nocache(FIX_DBGP_BASE, bar_val & PAGE_MASK); - ehci_bar = (void __iomem *)__fix_to_virt(FIX_DBGP_BASE); - ehci_bar += bar_val & ~PAGE_MASK; - dbgp_printk("ehci_bar: %p\n", ehci_bar); - - ehci_caps = ehci_bar; - ehci_regs = ehci_bar + HC_LENGTH(readl(&ehci_caps->hc_capbase)); - ehci_debug = ehci_bar + offset; - ehci_dev.bus = bus; - ehci_dev.slot = slot; - ehci_dev.func = func; - - detect_set_debug_port(); - - ret = ehci_setup(); - if (ret < 0) { - dbgp_printk("ehci_setup failed\n"); - ehci_debug = NULL; - - return -1; - } - - return 0; -} - -static void early_dbgp_write(struct console *con, const char *str, u32 n) -{ - int chunk, ret; - - if (!ehci_debug) - return; - while (n > 0) { - chunk = n; - if (chunk > DBGP_MAX_PACKET) - chunk = DBGP_MAX_PACKET; - ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, - dbgp_endpoint_out, str, chunk); - str += chunk; - n -= chunk; - } -} - -static struct console early_dbgp_console = { - .name = "earlydbg", - .write = early_dbgp_write, - .flags = CON_PRINTBUFFER, - .index = -1, -}; -#endif - /* Direct interface for emergencies */ static struct console *early_console = &early_vga_console; static int __initdata early_console_initialized; @@ -891,10 +176,24 @@ asmlinkage void early_printk(const char *fmt, ...) va_end(ap); } +static inline void early_console_register(struct console *con, int keep_early) +{ + if (early_console->index != -1) { + printk(KERN_CRIT "ERROR: earlyprintk= %s already used\n", + con->name); + return; + } + early_console = con; + if (keep_early) + early_console->flags &= ~CON_BOOT; + else + early_console->flags |= CON_BOOT; + register_console(early_console); +} static int __init setup_early_printk(char *buf) { - int keep_early; + int keep; if (!buf) return 0; @@ -903,42 +202,37 @@ static int __init setup_early_printk(char *buf) return 0; early_console_initialized = 1; - keep_early = (strstr(buf, "keep") != NULL); - - if (!strncmp(buf, "serial", 6)) { - early_serial_init(buf + 6); - early_console = &early_serial_console; - } else if (!strncmp(buf, "ttyS", 4)) { - early_serial_init(buf); - early_console = &early_serial_console; - } else if (!strncmp(buf, "vga", 3) - && boot_params.screen_info.orig_video_isVGA == 1) { - max_xpos = boot_params.screen_info.orig_video_cols; - max_ypos = boot_params.screen_info.orig_video_lines; - current_ypos = boot_params.screen_info.orig_y; - early_console = &early_vga_console; + keep = (strstr(buf, "keep") != NULL); + + while (*buf != '\0') { + if (!strncmp(buf, "serial", 6)) { + buf += 6; + early_serial_init(buf); + early_console_register(&early_serial_console, keep); + if (!strncmp(buf, ",ttyS", 5)) + buf += 5; + } + if (!strncmp(buf, "ttyS", 4)) { + early_serial_init(buf + 4); + early_console_register(&early_serial_console, keep); + } + if (!strncmp(buf, "vga", 3) && + boot_params.screen_info.orig_video_isVGA == 1) { + max_xpos = boot_params.screen_info.orig_video_cols; + max_ypos = boot_params.screen_info.orig_video_lines; + current_ypos = boot_params.screen_info.orig_y; + early_console_register(&early_vga_console, keep); + } #ifdef CONFIG_EARLY_PRINTK_DBGP - } else if (!strncmp(buf, "dbgp", 4)) { - if (early_dbgp_init(buf+4) < 0) - return 0; - early_console = &early_dbgp_console; - /* - * usb subsys will reset ehci controller, so don't keep - * that early console - */ - keep_early = 0; + if (!strncmp(buf, "dbgp", 4) && !early_dbgp_init(buf + 4)) + early_console_register(&early_dbgp_console, keep); #endif #ifdef CONFIG_HVC_XEN - } else if (!strncmp(buf, "xen", 3)) { - early_console = &xenboot_console; + if (!strncmp(buf, "xen", 3)) + early_console_register(&xenboot_console, keep); #endif + buf++; } - - if (keep_early) - early_console->flags &= ~CON_BOOT; - else - early_console->flags |= CON_BOOT; - register_console(early_console); return 0; } diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index 96f7ac0bbf0..ad5bd988fb7 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -42,6 +42,7 @@ #include <asm/time.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> +#include <asm/x86_init.h> #define EFI_DEBUG 1 #define PFX "EFI: " @@ -354,7 +355,7 @@ void __init efi_init(void) */ c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2); if (c16) { - for (i = 0; i < sizeof(vendor) && *c16; ++i) + for (i = 0; i < sizeof(vendor) - 1 && *c16; ++i) vendor[i] = *c16++; vendor[i] = '\0'; } else @@ -453,6 +454,9 @@ void __init efi_init(void) if (add_efi_memmap) do_add_efi_memmap(); + x86_platform.get_wallclock = efi_get_time; + x86_platform.set_wallclock = efi_set_rtc_mmss; + /* Setup for EFI runtime service */ reboot_type = BOOT_EFI; @@ -512,7 +516,7 @@ void __init efi_enter_virtual_mode(void) && end_pfn <= max_pfn_mapped)) va = __va(md->phys_addr); else - va = efi_ioremap(md->phys_addr, size); + va = efi_ioremap(md->phys_addr, size, md->type); md->virt_addr = (u64) (unsigned long) va; diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c index 22c3b7828c5..ac0621a7ac3 100644 --- a/arch/x86/kernel/efi_64.c +++ b/arch/x86/kernel/efi_64.c @@ -98,10 +98,14 @@ void __init efi_call_phys_epilog(void) early_runtime_code_mapping_set_exec(0); } -void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size) +void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size, + u32 type) { unsigned long last_map_pfn; + if (type == EFI_MEMORY_MAPPED_IO) + return ioremap(phys_addr, size); + last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size); if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) return NULL; diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index c251be74510..b5c061f8f35 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -146,7 +146,7 @@ ENTRY(ftrace_graph_caller) END(ftrace_graph_caller) GLOBAL(return_to_handler) - subq $80, %rsp + subq $24, %rsp /* Save the return values */ movq %rax, (%rsp) @@ -155,10 +155,10 @@ GLOBAL(return_to_handler) call ftrace_return_to_handler - movq %rax, 72(%rsp) + movq %rax, 16(%rsp) movq 8(%rsp), %rdx movq (%rsp), %rax - addq $72, %rsp + addq $16, %rsp retq #endif @@ -536,20 +536,13 @@ sysret_signal: bt $TIF_SYSCALL_AUDIT,%edx jc sysret_audit #endif - /* edx: work flags (arg3) */ - leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 - xorl %esi,%esi # oldset -> arg2 - SAVE_REST - FIXUP_TOP_OF_STACK %r11 - call do_notify_resume - RESTORE_TOP_OF_STACK %r11 - RESTORE_REST - movl $_TIF_WORK_MASK,%edi - /* Use IRET because user could have changed frame. This - works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - jmp int_with_check + /* + * We have a signal, or exit tracing or single-step. + * These all wind up with the iret return path anyway, + * so just join that path right now. + */ + FIXUP_TOP_OF_STACK %r11, -ARGOFFSET + jmp int_check_syscall_exit_work badsys: movq $-ENOSYS,RAX-ARGOFFSET(%rsp) @@ -654,6 +647,7 @@ int_careful: int_very_careful: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) +int_check_syscall_exit_work: SAVE_REST /* Check for syscall exit trace */ testl $_TIF_WORK_SYSCALL_EXIT,%edx @@ -1021,7 +1015,7 @@ apicinterrupt ERROR_APIC_VECTOR \ apicinterrupt SPURIOUS_APIC_VECTOR \ spurious_interrupt smp_spurious_interrupt -#ifdef CONFIG_PERF_COUNTERS +#ifdef CONFIG_PERF_EVENTS apicinterrupt LOCAL_PENDING_VECTOR \ perf_pending_interrupt smp_perf_pending_interrupt #endif diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index d94e1ea3b9f..9dbb527e165 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -417,10 +417,6 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, unsigned long return_hooker = (unsigned long) &return_to_handler; - /* Nmi's are currently unsupported */ - if (unlikely(in_nmi())) - return; - if (unlikely(atomic_read(¤t->tracing_graph_pause))) return; @@ -498,37 +494,56 @@ static struct syscall_metadata *find_syscall_meta(unsigned long *syscall) struct syscall_metadata *syscall_nr_to_meta(int nr) { - if (!syscalls_metadata || nr >= FTRACE_SYSCALL_MAX || nr < 0) + if (!syscalls_metadata || nr >= NR_syscalls || nr < 0) return NULL; return syscalls_metadata[nr]; } -void arch_init_ftrace_syscalls(void) +int syscall_name_to_nr(char *name) +{ + int i; + + if (!syscalls_metadata) + return -1; + + for (i = 0; i < NR_syscalls; i++) { + if (syscalls_metadata[i]) { + if (!strcmp(syscalls_metadata[i]->name, name)) + return i; + } + } + return -1; +} + +void set_syscall_enter_id(int num, int id) +{ + syscalls_metadata[num]->enter_id = id; +} + +void set_syscall_exit_id(int num, int id) +{ + syscalls_metadata[num]->exit_id = id; +} + +static int __init arch_init_ftrace_syscalls(void) { int i; struct syscall_metadata *meta; unsigned long **psys_syscall_table = &sys_call_table; - static atomic_t refs; - - if (atomic_inc_return(&refs) != 1) - goto end; syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * - FTRACE_SYSCALL_MAX, GFP_KERNEL); + NR_syscalls, GFP_KERNEL); if (!syscalls_metadata) { WARN_ON(1); - return; + return -ENOMEM; } - for (i = 0; i < FTRACE_SYSCALL_MAX; i++) { + for (i = 0; i < NR_syscalls; i++) { meta = find_syscall_meta(psys_syscall_table[i]); syscalls_metadata[i] = meta; } - return; - - /* Paranoid: avoid overflow */ -end: - atomic_dec(&refs); + return 0; } +arch_initcall(arch_init_ftrace_syscalls); #endif diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 3f8579f8d42..4f8e2507e8f 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -11,8 +11,21 @@ #include <asm/setup.h> #include <asm/sections.h> #include <asm/e820.h> -#include <asm/bios_ebda.h> +#include <asm/page.h> #include <asm/trampoline.h> +#include <asm/apic.h> +#include <asm/io_apic.h> +#include <asm/bios_ebda.h> + +static void __init i386_default_early_setup(void) +{ + /* Initilize 32bit specific setup functions */ + x86_init.resources.probe_roms = probe_roms; + x86_init.resources.reserve_resources = i386_reserve_resources; + x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; + + reserve_ebda_region(); +} void __init i386_start_kernel(void) { @@ -29,7 +42,16 @@ void __init i386_start_kernel(void) reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); } #endif - reserve_ebda_region(); + + /* Call the subarch specific early setup function */ + switch (boot_params.hdr.hardware_subarch) { + case X86_SUBARCH_MRST: + x86_mrst_early_setup(); + break; + default: + i386_default_early_setup(); + break; + } /* * At this point everything still needed from the boot loader diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 70eaa852c73..0b06cd778fd 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -23,8 +23,8 @@ #include <asm/sections.h> #include <asm/kdebug.h> #include <asm/e820.h> -#include <asm/bios_ebda.h> #include <asm/trampoline.h> +#include <asm/bios_ebda.h> static void __init zap_identity_mappings(void) { diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 8663afb5653..050c278481b 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -79,7 +79,7 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE) * any particular GDT layout, because we load our own as soon as we * can. */ -.section .text.head,"ax",@progbits +__HEAD ENTRY(startup_32) /* test KEEP_SEGMENTS flag to see if the bootloader is asking us to not reload segments */ @@ -157,6 +157,7 @@ subarch_entries: .long default_entry /* normal x86/PC */ .long lguest_entry /* lguest hypervisor */ .long xen_entry /* Xen hypervisor */ + .long default_entry /* Moorestown MID */ num_subarch_entries = (. - subarch_entries) / 4 .previous #endif /* CONFIG_PARAVIRT */ @@ -261,9 +262,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20); * which will be freed later */ -#ifndef CONFIG_HOTPLUG_CPU -.section .init.text,"ax",@progbits -#endif +__CPUINIT #ifdef CONFIG_SMP ENTRY(startup_32_smp) @@ -441,7 +440,6 @@ is386: movl $2,%ecx # set MP jne 1f movl $per_cpu__gdt_page,%eax movl $per_cpu__stack_canary,%ecx - subl $20, %ecx movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) shrl $16, %ecx movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) @@ -602,7 +600,7 @@ ignore_int: #endif iret -.section .cpuinit.data,"wa" + __REFDATA .align 4 ENTRY(initial_code) .long i386_start_kernel @@ -610,7 +608,7 @@ ENTRY(initial_code) /* * BSS section */ -.section ".bss.page_aligned","wa" +__PAGE_ALIGNED_BSS .align PAGE_SIZE_asm #ifdef CONFIG_X86_PAE swapper_pg_pmd: @@ -628,7 +626,7 @@ ENTRY(empty_zero_page) * This starts the data section. */ #ifdef CONFIG_X86_PAE -.section ".data.page_aligned","wa" +__PAGE_ALIGNED_DATA /* Page-aligned for the benefit of paravirt? */ .align PAGE_SIZE_asm ENTRY(swapper_pg_dir) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index fa54f78e2a0..780cd928fcd 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -40,7 +40,7 @@ L4_START_KERNEL = pgd_index(__START_KERNEL_map) L3_START_KERNEL = pud_index(__START_KERNEL_map) .text - .section .text.head + __HEAD .code64 .globl startup_64 startup_64: @@ -418,7 +418,7 @@ ENTRY(phys_base) ENTRY(idt_table) .skip IDT_ENTRIES * 16 - .section .bss.page_aligned, "aw", @nobits + __PAGE_ALIGNED_BSS .align PAGE_SIZE ENTRY(empty_zero_page) .skip PAGE_SIZE diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c index 43cec6bdda6..9c3bd4a2050 100644 --- a/arch/x86/kernel/i386_ksyms_32.c +++ b/arch/x86/kernel/i386_ksyms_32.c @@ -10,6 +10,16 @@ EXPORT_SYMBOL(mcount); #endif +/* + * Note, this is a prototype to get at the symbol for + * the export, but dont use it from C code, it is used + * by assembly code and is not using C calling convention! + */ +#ifndef CONFIG_X86_CMPXCHG64 +extern void cmpxchg8b_emu(void); +EXPORT_SYMBOL(cmpxchg8b_emu); +#endif + /* Networking helper routines. */ EXPORT_SYMBOL(csum_partial_copy_generic); diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index 5cf36c053ac..23c167925a5 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -19,12 +19,6 @@ DEFINE_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); -#ifdef CONFIG_X86_32 -static void pit_disable_clocksource(void); -#else -static inline void pit_disable_clocksource(void) { } -#endif - /* * HPET replaces the PIT, when enabled. So we need to know, which of * the two timers is used @@ -57,12 +51,10 @@ static void init_pit_timer(enum clock_event_mode mode, outb_pit(0, PIT_CH0); outb_pit(0, PIT_CH0); } - pit_disable_clocksource(); break; case CLOCK_EVT_MODE_ONESHOT: /* One shot setup */ - pit_disable_clocksource(); outb_pit(0x38, PIT_MODE); break; @@ -200,17 +192,6 @@ static struct clocksource pit_cs = { .shift = 20, }; -static void pit_disable_clocksource(void) -{ - /* - * Use mult to check whether it is registered or not - */ - if (pit_cs.mult) { - clocksource_unregister(&pit_cs); - pit_cs.mult = 0; - } -} - static int __init init_pit_clocksource(void) { /* diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index 270ff83efc1..3a54dcb9cd0 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c @@ -20,9 +20,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); * way process stacks are handled. This is done by having a special * "init_task" linker map entry.. */ -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = + { INIT_THREAD_INFO(init_task) }; /* * Initial task structure. diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index b0cdde6932f..04bbd527856 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -63,10 +63,10 @@ static int show_other_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); seq_printf(p, " Spurious interrupts\n"); - seq_printf(p, "%*s: ", prec, "CNT"); + seq_printf(p, "%*s: ", prec, "PMI"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); - seq_printf(p, " Performance counter interrupts\n"); + seq_printf(p, " Performance monitoring interrupts\n"); seq_printf(p, "%*s: ", prec, "PND"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); @@ -104,7 +104,7 @@ static int show_other_interrupts(struct seq_file *p, int prec) seq_printf(p, " Threshold APIC interrupts\n"); # endif #endif -#ifdef CONFIG_X86_NEW_MCE +#ifdef CONFIG_X86_MCE seq_printf(p, "%*s: ", prec, "MCE"); for_each_online_cpu(j) seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); @@ -200,7 +200,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->irq_threshold_count; # endif #endif -#ifdef CONFIG_X86_NEW_MCE +#ifdef CONFIG_X86_MCE sum += per_cpu(mce_exception_count, cpu); sum += per_cpu(mce_poll_count, cpu); #endif diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 3b09634a515..7d35d0fe232 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -218,7 +218,6 @@ bool handle_irq(unsigned irq, struct pt_regs *regs) void fixup_irqs(void) { unsigned int irq; - static int warned; struct irq_desc *desc; for_each_irq_desc(irq, desc) { @@ -236,8 +235,8 @@ void fixup_irqs(void) } if (desc->chip->set_affinity) desc->chip->set_affinity(irq, affinity); - else if (desc->action && !(warned++)) - printk("Cannot set affinity for irq %i\n", irq); + else if (desc->action) + printk_once("Cannot set affinity for irq %i\n", irq); } #if 0 diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 92b7703d3d5..40f30773fb2 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -116,7 +116,7 @@ int vector_used_by_percpu_irq(unsigned int vector) return 0; } -static void __init init_ISA_irqs(void) +void __init init_ISA_irqs(void) { int i; @@ -140,8 +140,10 @@ static void __init init_ISA_irqs(void) } } -/* Overridden in paravirt.c */ -void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); +void __init init_IRQ(void) +{ + x86_init.irqs.intr_init(); +} static void __init smp_intr_init(void) { @@ -190,7 +192,7 @@ static void __init apic_intr_init(void) #ifdef CONFIG_X86_MCE_THRESHOLD alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); #endif -#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC) +#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_LOCAL_APIC) alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt); #endif @@ -206,39 +208,19 @@ static void __init apic_intr_init(void) alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); /* Performance monitoring interrupts: */ -# ifdef CONFIG_PERF_COUNTERS +# ifdef CONFIG_PERF_EVENTS alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); # endif #endif } -/** - * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors - * - * Description: - * Perform any necessary interrupt initialisation prior to setting up - * the "ordinary" interrupt call gates. For legacy reasons, the ISA - * interrupts should be initialised here if the machine emulates a PC - * in any way. - **/ -static void __init x86_quirk_pre_intr_init(void) -{ -#ifdef CONFIG_X86_32 - if (x86_quirks->arch_pre_intr_init) { - if (x86_quirks->arch_pre_intr_init()) - return; - } -#endif - init_ISA_irqs(); -} - void __init native_init_IRQ(void) { int i; /* Execute any quirks before the call gates are initialised: */ - x86_quirk_pre_intr_init(); + x86_init.irqs.pre_vector_init(); apic_intr_init(); @@ -258,12 +240,6 @@ void __init native_init_IRQ(void) #ifdef CONFIG_X86_32 /* - * Call quirks after call gates are initialised (usually add in - * the architecture specific gates): - */ - x86_quirk_intr_init(); - - /* * External FPU? Set up irq13 if so, for * original braindamaged IBM FERR coupling. */ diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index c664d515f61..63b0ec8d3d4 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -34,7 +34,6 @@ struct kvm_para_state { u8 mmu_queue[MMU_QUEUE_SIZE]; int mmu_queue_len; - enum paravirt_lazy_mode mode; }; static DEFINE_PER_CPU(struct kvm_para_state, para_state); @@ -77,7 +76,7 @@ static void kvm_deferred_mmu_op(void *buffer, int len) { struct kvm_para_state *state = kvm_para_state(); - if (state->mode != PARAVIRT_LAZY_MMU) { + if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) { kvm_mmu_op(buffer, len); return; } @@ -185,10 +184,7 @@ static void kvm_release_pt(unsigned long pfn) static void kvm_enter_lazy_mmu(void) { - struct kvm_para_state *state = kvm_para_state(); - paravirt_enter_lazy_mmu(); - state->mode = paravirt_get_lazy_mode(); } static void kvm_leave_lazy_mmu(void) @@ -197,7 +193,6 @@ static void kvm_leave_lazy_mmu(void) mmu_queue_flush(state); paravirt_leave_lazy_mmu(); - state->mode = paravirt_get_lazy_mode(); } static void __init paravirt_ops_setup(void) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 223af43f152..feaeb0d3aa4 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -22,6 +22,8 @@ #include <asm/msr.h> #include <asm/apic.h> #include <linux/percpu.h> + +#include <asm/x86_init.h> #include <asm/reboot.h> #define KVM_SCALE 22 @@ -50,8 +52,8 @@ static unsigned long kvm_get_wallclock(void) struct timespec ts; int low, high; - low = (int)__pa(&wall_clock); - high = ((u64)__pa(&wall_clock) >> 32); + low = (int)__pa_symbol(&wall_clock); + high = ((u64)__pa_symbol(&wall_clock) >> 32); native_write_msr(MSR_KVM_WALL_CLOCK, low, high); vcpu_time = &get_cpu_var(hv_clock); @@ -182,12 +184,13 @@ void __init kvmclock_init(void) if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { if (kvm_register_clock("boot clock")) return; - pv_time_ops.get_wallclock = kvm_get_wallclock; - pv_time_ops.set_wallclock = kvm_set_wallclock; pv_time_ops.sched_clock = kvm_clock_read; - pv_time_ops.get_tsc_khz = kvm_get_tsc_khz; + x86_platform.calibrate_tsc = kvm_get_tsc_khz; + x86_platform.get_wallclock = kvm_get_wallclock; + x86_platform.set_wallclock = kvm_set_wallclock; #ifdef CONFIG_X86_LOCAL_APIC - pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; + x86_cpuinit.setup_percpu_clockev = + kvm_setup_secondary_clock; #endif #ifdef CONFIG_SMP smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 71f1d99a635..ec6ef60cbd1 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -67,8 +67,8 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) #ifdef CONFIG_SMP preempt_disable(); load_LDT(pc); - if (!cpus_equal(current->mm->cpu_vm_mask, - cpumask_of_cpu(smp_processor_id()))) + if (!cpumask_equal(mm_cpumask(current->mm), + cpumask_of(smp_processor_id()))) smp_call_function(flush_ldt, current->mm, 1); preempt_enable(); #else diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 9371448290a..378e9a8f1bf 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -210,8 +210,8 @@ static ssize_t microcode_write(struct file *file, const char __user *buf, { ssize_t ret = -EINVAL; - if ((len >> PAGE_SHIFT) > num_physpages) { - pr_err("microcode: too much data (max %ld pages)\n", num_physpages); + if ((len >> PAGE_SHIFT) > totalram_pages) { + pr_err("microcode: too much data (max %ld pages)\n", totalram_pages); return ret; } @@ -236,7 +236,7 @@ static const struct file_operations microcode_fops = { static struct miscdevice microcode_dev = { .minor = MICROCODE_MINOR, .name = "microcode", - .devnode = "cpu/microcode", + .nodename = "cpu/microcode", .fops = µcode_fops, }; diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 651c93b2886..5be95ef4ffe 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -45,6 +45,11 @@ static int __init mpf_checksum(unsigned char *mp, int len) return sum & 0xFF; } +int __init default_mpc_apic_id(struct mpc_cpu *m) +{ + return m->apicid; +} + static void __init MP_processor_info(struct mpc_cpu *m) { int apicid; @@ -55,10 +60,7 @@ static void __init MP_processor_info(struct mpc_cpu *m) return; } - if (x86_quirks->mpc_apic_id) - apicid = x86_quirks->mpc_apic_id(m); - else - apicid = m->apicid; + apicid = x86_init.mpparse.mpc_apic_id(m); if (m->cpuflag & CPU_BOOTPROCESSOR) { bootup_cpu = " (Bootup-CPU)"; @@ -70,16 +72,18 @@ static void __init MP_processor_info(struct mpc_cpu *m) } #ifdef CONFIG_X86_IO_APIC -static void __init MP_bus_info(struct mpc_bus *m) +void __init default_mpc_oem_bus_info(struct mpc_bus *m, char *str) { - char str[7]; memcpy(str, m->bustype, 6); str[6] = 0; + apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str); +} - if (x86_quirks->mpc_oem_bus_info) - x86_quirks->mpc_oem_bus_info(m, str); - else - apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str); +static void __init MP_bus_info(struct mpc_bus *m) +{ + char str[7]; + + x86_init.mpparse.mpc_oem_bus_info(m, str); #if MAX_MP_BUSSES < 256 if (m->busid >= MAX_MP_BUSSES) { @@ -96,8 +100,8 @@ static void __init MP_bus_info(struct mpc_bus *m) mp_bus_id_to_type[m->busid] = MP_BUS_ISA; #endif } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { - if (x86_quirks->mpc_oem_pci_bus) - x86_quirks->mpc_oem_pci_bus(m); + if (x86_init.mpparse.mpc_oem_pci_bus) + x86_init.mpparse.mpc_oem_pci_bus(m); clear_bit(m->busid, mp_bus_not_pci); #if defined(CONFIG_EISA) || defined(CONFIG_MCA) @@ -291,6 +295,8 @@ static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt) 1, mpc, mpc->length, 1); } +void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { } + static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) { char str[16]; @@ -312,16 +318,13 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) if (early) return 1; - if (mpc->oemptr && x86_quirks->smp_read_mpc_oem) { - struct mpc_oemtable *oem_table = (void *)(long)mpc->oemptr; - x86_quirks->smp_read_mpc_oem(oem_table, mpc->oemsize); - } + if (mpc->oemptr) + x86_init.mpparse.smp_read_mpc_oem(mpc); /* * Now process the configuration blocks. */ - if (x86_quirks->mpc_record) - *x86_quirks->mpc_record = 0; + x86_init.mpparse.mpc_record(0); while (count < mpc->length) { switch (*mpt) { @@ -353,8 +356,7 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) count = mpc->length; break; } - if (x86_quirks->mpc_record) - (*x86_quirks->mpc_record)++; + x86_init.mpparse.mpc_record(1); } #ifdef CONFIG_X86_BIGSMP @@ -482,11 +484,11 @@ static void __init construct_ioapic_table(int mpc_default_type) MP_bus_info(&bus); } - ioapic.type = MP_IOAPIC; - ioapic.apicid = 2; - ioapic.apicver = mpc_default_type > 4 ? 0x10 : 0x01; - ioapic.flags = MPC_APIC_USABLE; - ioapic.apicaddr = 0xFEC00000; + ioapic.type = MP_IOAPIC; + ioapic.apicid = 2; + ioapic.apicver = mpc_default_type > 4 ? 0x10 : 0x01; + ioapic.flags = MPC_APIC_USABLE; + ioapic.apicaddr = IO_APIC_DEFAULT_PHYS_BASE; MP_ioapic_info(&ioapic); /* @@ -608,7 +610,7 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early) /* * Scan the memory blocks for an SMP configuration block. */ -static void __init __get_smp_config(unsigned int early) +void __init default_get_smp_config(unsigned int early) { struct mpf_intel *mpf = mpf_found; @@ -625,11 +627,6 @@ static void __init __get_smp_config(unsigned int early) if (acpi_lapic && acpi_ioapic) return; - if (x86_quirks->mach_get_smp_config) { - if (x86_quirks->mach_get_smp_config(early)) - return; - } - printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->specification); #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) @@ -670,16 +667,6 @@ static void __init __get_smp_config(unsigned int early) */ } -void __init early_get_smp_config(void) -{ - __get_smp_config(1); -} - -void __init get_smp_config(void) -{ - __get_smp_config(0); -} - static void __init smp_reserve_bootmem(struct mpf_intel *mpf) { unsigned long size = get_mpc_size(mpf->physptr); @@ -745,14 +732,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, return 0; } -static void __init __find_smp_config(unsigned int reserve) +void __init default_find_smp_config(unsigned int reserve) { unsigned int address; - if (x86_quirks->mach_find_smp_config) { - if (x86_quirks->mach_find_smp_config(reserve)) - return; - } /* * FIXME: Linux assumes you have 640K of base ram.. * this continues the error... @@ -787,16 +770,6 @@ static void __init __find_smp_config(unsigned int reserve) smp_scan_config(address, 0x400, reserve); } -void __init early_find_smp_config(void) -{ - __find_smp_config(0); -} - -void __init find_smp_config(void) -{ - __find_smp_config(1); -} - #ifdef CONFIG_X86_IO_APIC static u8 __initdata irq_used[MAX_IRQ_SOURCES]; diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c new file mode 100644 index 00000000000..3b7078abc87 --- /dev/null +++ b/arch/x86/kernel/mrst.c @@ -0,0 +1,24 @@ +/* + * mrst.c: Intel Moorestown platform specific setup code + * + * (C) Copyright 2008 Intel Corporation + * Author: Jacob Pan (jacob.jun.pan@intel.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ +#include <linux/init.h> + +#include <asm/setup.h> + +/* + * Moorestown specific x86_init function overrides and early setup + * calls. + */ +void __init x86_mrst_early_setup(void) +{ + x86_init.resources.probe_roms = x86_init_noop; + x86_init.resources.reserve_resources = x86_init_noop; +} diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 98fd6cd4e3a..6a3cefc7dda 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -1,6 +1,7 @@ /* ----------------------------------------------------------------------- * * * Copyright 2000-2008 H. Peter Anvin - All Rights Reserved + * Copyright 2009 Intel Corporation; author: H. Peter Anvin * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -80,11 +81,8 @@ static ssize_t msr_read(struct file *file, char __user *buf, for (; count; count -= 8) { err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]); - if (err) { - if (err == -EFAULT) /* Fix idiotic error code */ - err = -EIO; + if (err) break; - } if (copy_to_user(tmp, &data, 8)) { err = -EFAULT; break; @@ -115,11 +113,8 @@ static ssize_t msr_write(struct file *file, const char __user *buf, break; } err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]); - if (err) { - if (err == -EFAULT) /* Fix idiotic error code */ - err = -EIO; + if (err) break; - } tmp += 2; bytes += 8; } @@ -127,6 +122,54 @@ static ssize_t msr_write(struct file *file, const char __user *buf, return bytes ? bytes : err; } +static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg) +{ + u32 __user *uregs = (u32 __user *)arg; + u32 regs[8]; + int cpu = iminor(file->f_path.dentry->d_inode); + int err; + + switch (ioc) { + case X86_IOC_RDMSR_REGS: + if (!(file->f_mode & FMODE_READ)) { + err = -EBADF; + break; + } + if (copy_from_user(®s, uregs, sizeof regs)) { + err = -EFAULT; + break; + } + err = rdmsr_safe_regs_on_cpu(cpu, regs); + if (err) + break; + if (copy_to_user(uregs, ®s, sizeof regs)) + err = -EFAULT; + break; + + case X86_IOC_WRMSR_REGS: + if (!(file->f_mode & FMODE_WRITE)) { + err = -EBADF; + break; + } + if (copy_from_user(®s, uregs, sizeof regs)) { + err = -EFAULT; + break; + } + err = wrmsr_safe_regs_on_cpu(cpu, regs); + if (err) + break; + if (copy_to_user(uregs, ®s, sizeof regs)) + err = -EFAULT; + break; + + default: + err = -ENOTTY; + break; + } + + return err; +} + static int msr_open(struct inode *inode, struct file *file) { unsigned int cpu = iminor(file->f_path.dentry->d_inode); @@ -157,6 +200,8 @@ static const struct file_operations msr_fops = { .read = msr_read, .write = msr_write, .open = msr_open, + .unlocked_ioctl = msr_ioctl, + .compat_ioctl = msr_ioctl, }; static int __cpuinit msr_device_create(int cpu) @@ -196,7 +241,7 @@ static struct notifier_block __refdata msr_class_cpu_notifier = { .notifier_call = msr_class_cpu_callback, }; -static char *msr_nodename(struct device *dev) +static char *msr_devnode(struct device *dev, mode_t *mode) { return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt)); } @@ -217,7 +262,7 @@ static int __init msr_init(void) err = PTR_ERR(msr_class); goto out_chrdev; } - msr_class->nodename = msr_nodename; + msr_class->devnode = msr_devnode; for_each_online_cpu(i) { err = msr_device_create(i); if (err != 0) diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 70ec9b951d7..1b1739d1631 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -54,17 +54,12 @@ u64 _paravirt_ident_64(u64 x) return x; } -static void __init default_banner(void) +void __init default_banner(void) { printk(KERN_INFO "Booting paravirtualized kernel on %s\n", pv_info.name); } -char *memory_setup(void) -{ - return pv_init_ops.memory_setup(); -} - /* Simple instruction patching code. */ #define DEF_NATIVE(ops, name, code) \ extern const char start_##ops##_##name[], end_##ops##_##name[]; \ @@ -188,11 +183,6 @@ unsigned paravirt_patch_insns(void *insnbuf, unsigned len, return insn_len; } -void init_IRQ(void) -{ - pv_irq_ops.init_IRQ(); -} - static void native_flush_tlb(void) { __native_flush_tlb(); @@ -218,13 +208,6 @@ extern void native_irq_enable_sysexit(void); extern void native_usergs_sysret32(void); extern void native_usergs_sysret64(void); -static int __init print_banner(void) -{ - pv_init_ops.banner(); - return 0; -} -core_initcall(print_banner); - static struct resource reserve_ioports = { .start = 0, .end = IO_SPACE_LIMIT, @@ -320,21 +303,13 @@ struct pv_info pv_info = { struct pv_init_ops pv_init_ops = { .patch = native_patch, - .banner = default_banner, - .arch_setup = paravirt_nop, - .memory_setup = machine_specific_memory_setup, }; struct pv_time_ops pv_time_ops = { - .time_init = hpet_time_init, - .get_wallclock = native_get_wallclock, - .set_wallclock = native_set_wallclock, .sched_clock = native_sched_clock, - .get_tsc_khz = native_calibrate_tsc, }; struct pv_irq_ops pv_irq_ops = { - .init_IRQ = native_init_IRQ, .save_fl = __PV_IS_CALLEE_SAVE(native_save_fl), .restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl), .irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable), @@ -362,8 +337,9 @@ struct pv_cpu_ops pv_cpu_ops = { #endif .wbinvd = native_wbinvd, .read_msr = native_read_msr_safe, - .read_msr_amd = native_read_msr_amd_safe, + .rdmsr_regs = native_rdmsr_safe_regs, .write_msr = native_write_msr_safe, + .wrmsr_regs = native_wrmsr_safe_regs, .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, .read_tscp = native_read_tscp, @@ -408,8 +384,6 @@ struct pv_cpu_ops pv_cpu_ops = { struct pv_apic_ops pv_apic_ops = { #ifdef CONFIG_X86_LOCAL_APIC - .setup_boot_clock = setup_boot_APIC_clock, - .setup_secondary_clock = setup_secondary_APIC_clock, .startup_ipi_hook = paravirt_nop, #endif }; @@ -423,13 +397,6 @@ struct pv_apic_ops pv_apic_ops = { #endif struct pv_mmu_ops pv_mmu_ops = { -#ifndef CONFIG_X86_64 - .pagetable_setup_start = native_pagetable_setup_start, - .pagetable_setup_done = native_pagetable_setup_done, -#else - .pagetable_setup_start = paravirt_nop, - .pagetable_setup_done = paravirt_nop, -#endif .read_cr2 = native_read_cr2, .write_cr2 = native_write_cr2, diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 3c945c0b350..b2a71dca564 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -3,6 +3,7 @@ #include <linux/dmar.h> #include <linux/bootmem.h> #include <linux/pci.h> +#include <linux/kmemleak.h> #include <asm/proto.h> #include <asm/dma.h> @@ -32,7 +33,14 @@ int no_iommu __read_mostly; /* Set this to 1 if there is a HW IOMMU in the system */ int iommu_detected __read_mostly = 0; -int iommu_pass_through; +/* + * This variable becomes 1 if iommu=pt is passed on the kernel command line. + * If this variable is 1, IOMMU implementations do no DMA translation for + * devices and allow every device to access to whole physical memory. This is + * useful if a user want to use an IOMMU only for KVM device assignment to + * guests and not for driver dma translation. + */ +int iommu_pass_through __read_mostly; dma_addr_t bad_dma_address __read_mostly = 0; EXPORT_SYMBOL(bad_dma_address); @@ -88,6 +96,11 @@ void __init dma32_reserve_bootmem(void) size = roundup(dma32_bootmem_size, align); dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align, 512ULL<<20); + /* + * Kmemleak should not scan this block as it may not be mapped via the + * kernel direct mapping. + */ + kmemleak_ignore(dma32_bootmem_ptr); if (dma32_bootmem_ptr) dma32_bootmem_size = size; else @@ -212,10 +225,8 @@ static __init int iommu_setup(char *p) if (!strncmp(p, "soft", 4)) swiotlb = 1; #endif - if (!strncmp(p, "pt", 2)) { + if (!strncmp(p, "pt", 2)) iommu_pass_through = 1; - return 1; - } gart_parse_options(p); @@ -300,7 +311,7 @@ void pci_iommu_shutdown(void) amd_iommu_shutdown(); } /* Must execute after PCI subsystem */ -fs_initcall(pci_iommu_init); +rootfs_initcall(pci_iommu_init); #ifdef CONFIG_PCI /* Many VIA bridges seem to corrupt data for DAC. Disable it here */ diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 98a827ee9ed..a7f1b64f86e 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -16,6 +16,7 @@ #include <linux/agp_backend.h> #include <linux/init.h> #include <linux/mm.h> +#include <linux/sched.h> #include <linux/string.h> #include <linux/spinlock.h> #include <linux/pci.h> diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index e8a35016115..aaa6b7839f1 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -46,9 +46,8 @@ void __init pci_swiotlb_init(void) { /* don't initialize swiotlb if iommu=off (no_iommu=1) */ #ifdef CONFIG_X86_64 - if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) || - iommu_pass_through) - swiotlb = 1; + if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN)) + swiotlb = 1; #endif if (swiotlb_force) swiotlb = 1; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 994dd6a4a2a..5284cd2b577 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -9,7 +9,7 @@ #include <linux/pm.h> #include <linux/clockchips.h> #include <linux/random.h> -#include <trace/power.h> +#include <trace/events/power.h> #include <asm/system.h> #include <asm/apic.h> #include <asm/syscalls.h> @@ -25,9 +25,6 @@ EXPORT_SYMBOL(idle_nomwait); struct kmem_cache *task_xstate_cachep; -DEFINE_TRACE(power_start); -DEFINE_TRACE(power_end); - int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { *dst = *src; @@ -299,9 +296,7 @@ static inline int hlt_use_halt(void) void default_idle(void) { if (hlt_use_halt()) { - struct power_trace it; - - trace_power_start(&it, POWER_CSTATE, 1); + trace_power_start(POWER_CSTATE, 1); current_thread_info()->status &= ~TS_POLLING; /* * TS_POLLING-cleared state must be visible before we @@ -314,7 +309,6 @@ void default_idle(void) else local_irq_enable(); current_thread_info()->status |= TS_POLLING; - trace_power_end(&it); } else { local_irq_enable(); /* loop is done by the caller */ @@ -372,9 +366,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); */ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { - struct power_trace it; - - trace_power_start(&it, POWER_CSTATE, (ax>>4)+1); + trace_power_start(POWER_CSTATE, (ax>>4)+1); if (!need_resched()) { if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)¤t_thread_info()->flags); @@ -384,15 +376,13 @@ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) if (!need_resched()) __mwait(ax, cx); } - trace_power_end(&it); } /* Default MONITOR/MWAIT with no hints, used for default C1 state */ static void mwait_idle(void) { - struct power_trace it; if (!need_resched()) { - trace_power_start(&it, POWER_CSTATE, 1); + trace_power_start(POWER_CSTATE, 1); if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)¤t_thread_info()->flags); @@ -402,7 +392,6 @@ static void mwait_idle(void) __sti_mwait(0, 0); else local_irq_enable(); - trace_power_end(&it); } else local_irq_enable(); } @@ -414,13 +403,11 @@ static void mwait_idle(void) */ static void poll_idle(void) { - struct power_trace it; - - trace_power_start(&it, POWER_CSTATE, 0); + trace_power_start(POWER_CSTATE, 0); local_irq_enable(); while (!need_resched()) cpu_relax(); - trace_power_end(&it); + trace_power_end(0); } /* @@ -519,16 +506,12 @@ static void c1e_idle(void) if (!cpumask_test_cpu(cpu, c1e_mask)) { cpumask_set_cpu(cpu, c1e_mask); /* - * Force broadcast so ACPI can not interfere. Needs - * to run with interrupts enabled as it uses - * smp_function_call. + * Force broadcast so ACPI can not interfere. */ - local_irq_enable(); clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, &cpu); printk(KERN_INFO "Switch to broadcast mode on CPU%d\n", cpu); - local_irq_disable(); } clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); @@ -572,10 +555,8 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) void __init init_c1e_mask(void) { /* If we're using c1e_idle, we need to allocate c1e_mask. */ - if (pm_idle == c1e_idle) { - alloc_cpumask_var(&c1e_mask, GFP_KERNEL); - cpumask_clear(c1e_mask); - } + if (pm_idle == c1e_idle) + zalloc_cpumask_var(&c1e_mask, GFP_KERNEL); } static int __init idle_setup(char *str) diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 59f4524984a..4cf79567cda 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -61,9 +61,6 @@ asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); -DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; -EXPORT_PER_CPU_SYMBOL(current_task); - /* * Return saved PC of a blocked thread. */ @@ -350,14 +347,21 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) *next = &next_p->thread; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); + bool preload_fpu; /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ - __unlazy_fpu(prev_p); + /* + * If the task has used fpu the last 5 timeslices, just do a full + * restore of the math state immediately to avoid the trap; the + * chances of needing FPU soon are obviously high now + */ + preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5; + __unlazy_fpu(prev_p); /* we're going to use this soon, after a few expensive things */ - if (next_p->fpu_counter > 5) + if (preload_fpu) prefetch(next->xstate); /* @@ -398,6 +402,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT)) __switch_to_xtra(prev_p, next_p, tss); + /* If we're going to preload the fpu context, make sure clts + is run while we're batching the cpu state updates. */ + if (preload_fpu) + clts(); + /* * Leave lazy mode, flushing any hypercalls made here. * This must be done before restoring TLS segments so @@ -407,15 +416,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ arch_end_context_switch(next_p); - /* If the task has used fpu the last 5 timeslices, just do a full - * restore of the math state immediately to avoid the trap; the - * chances of needing FPU soon are obviously high now - * - * tsk_used_math() checks prevent calling math_state_restore(), - * which can sleep in the case of !tsk_used_math() - */ - if (tsk_used_math(next_p) && next_p->fpu_counter > 5) - math_state_restore(); + if (preload_fpu) + __math_state_restore(); /* * Restore %gs if needed (which is common) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ebefb5407b9..ad535b68317 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -55,9 +55,6 @@ asmlinkage extern void ret_from_fork(void); -DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; -EXPORT_PER_CPU_SYMBOL(current_task); - DEFINE_PER_CPU(unsigned long, old_rsp); static DEFINE_PER_CPU(unsigned char, is_idle); @@ -386,9 +383,17 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); unsigned fsindex, gsindex; + bool preload_fpu; + + /* + * If the task has used fpu the last 5 timeslices, just do a full + * restore of the math state immediately to avoid the trap; the + * chances of needing FPU soon are obviously high now + */ + preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5; /* we're going to use this soon, after a few expensive things */ - if (next_p->fpu_counter > 5) + if (preload_fpu) prefetch(next->xstate); /* @@ -419,6 +424,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) load_TLS(next, cpu); + /* Must be after DS reload */ + unlazy_fpu(prev_p); + + /* Make sure cpu is ready for new context */ + if (preload_fpu) + clts(); + /* * Leave lazy mode, flushing any hypercalls made here. * This must be done before restoring TLS segments so @@ -459,9 +471,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) wrmsrl(MSR_KERNEL_GS_BASE, next->gs); prev->gsindex = gsindex; - /* Must be after DS reload */ - unlazy_fpu(prev_p); - /* * Switch the PDA and FPU contexts. */ @@ -480,15 +489,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) __switch_to_xtra(prev_p, next_p, tss); - /* If the task has used fpu the last 5 timeslices, just do a full - * restore of the math state immediately to avoid the trap; the - * chances of needing FPU soon are obviously high now - * - * tsk_used_math() checks prevent calling math_state_restore(), - * which can sleep in the case of !tsk_used_math() + /* + * Preload the FPU context, now that we've determined that the + * task is likely to be using it. */ - if (tsk_used_math(next_p) && next_p->fpu_counter > 5) - math_state_restore(); + if (preload_fpu) + __math_state_restore(); return prev_p; } diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 09ecbde91c1..7b058a2dc66 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -35,10 +35,11 @@ #include <asm/proto.h> #include <asm/ds.h> -#include <trace/syscall.h> - #include "tls.h" +#define CREATE_TRACE_POINTS +#include <trace/events/syscalls.h> + enum x86_regset { REGSET_GENERAL, REGSET_FP, @@ -324,16 +325,6 @@ static int putreg(struct task_struct *child, return set_flags(child, value); #ifdef CONFIG_X86_64 - /* - * Orig_ax is really just a flag with small positive and - * negative values, so make sure to always sign-extend it - * from 32 bits so that it works correctly regardless of - * whether we come from a 32-bit environment or not. - */ - case offsetof(struct user_regs_struct, orig_ax): - value = (long) (s32) value; - break; - case offsetof(struct user_regs_struct,fs_base): if (value >= TASK_SIZE_OF(child)) return -EIO; @@ -1125,10 +1116,15 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value) case offsetof(struct user32, regs.orig_eax): /* - * Sign-extend the value so that orig_eax = -1 - * causes (long)orig_ax < 0 tests to fire correctly. + * A 32-bit debugger setting orig_eax means to restore + * the state of the task restarting a 32-bit syscall. + * Make sure we interpret the -ERESTART* codes correctly + * in case the task is not actually still sitting at the + * exit from a 32-bit syscall with TS_COMPAT still set. */ - regs->orig_ax = (long) (s32) value; + regs->orig_ax = value; + if (syscall_get_nr(child, regs) >= 0) + task_thread_info(child)->status |= TS_COMPAT; break; case offsetof(struct user32, regs.eflags): @@ -1497,8 +1493,8 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs) tracehook_report_syscall_entry(regs)) ret = -1L; - if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) - ftrace_syscall_enter(regs); + if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) + trace_sys_enter(regs, regs->orig_ax); if (unlikely(current->audit_context)) { if (IS_IA32) @@ -1523,8 +1519,8 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs) if (unlikely(current->audit_context)) audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); - if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) - ftrace_syscall_exit(regs); + if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) + trace_sys_exit(regs, regs->ax); if (test_thread_flag(TIF_SYSCALL_TRACE)) tracehook_report_syscall_exit(regs, 0); diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index af71d06624b..6c3b2c6fd77 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -508,7 +508,7 @@ static void __init quirk_amd_nb_node(struct pci_dev *dev) pci_read_config_dword(nb_ht, 0x60, &val); set_dev_node(&dev->dev, val & 7); - pci_dev_put(dev); + pci_dev_put(nb_ht); } DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 508e982dd07..a1a3cdda06e 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -3,6 +3,9 @@ #include <linux/init.h> #include <linux/pm.h> #include <linux/efi.h> +#include <linux/dmi.h> +#include <linux/sched.h> +#include <linux/tboot.h> #include <acpi/reboot.h> #include <asm/io.h> #include <asm/apic.h> @@ -17,7 +20,6 @@ #include <asm/cpu.h> #ifdef CONFIG_X86_32 -# include <linux/dmi.h> # include <linux/ctype.h> # include <linux/mc146818rtc.h> #else @@ -404,6 +406,46 @@ EXPORT_SYMBOL(machine_real_restart); #endif /* CONFIG_X86_32 */ +/* + * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot + */ +static int __init set_pci_reboot(const struct dmi_system_id *d) +{ + if (reboot_type != BOOT_CF9) { + reboot_type = BOOT_CF9; + printk(KERN_INFO "%s series board detected. " + "Selecting PCI-method for reboots.\n", d->ident); + } + return 0; +} + +static struct dmi_system_id __initdata pci_reboot_dmi_table[] = { + { /* Handle problems with rebooting on Apple MacBook5 */ + .callback = set_pci_reboot, + .ident = "Apple MacBook5", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"), + }, + }, + { /* Handle problems with rebooting on Apple MacBookPro5 */ + .callback = set_pci_reboot, + .ident = "Apple MacBookPro5", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"), + }, + }, + { } +}; + +static int __init pci_reboot_init(void) +{ + dmi_check_system(pci_reboot_dmi_table); + return 0; +} +core_initcall(pci_reboot_init); + static inline void kb_wait(void) { int i; @@ -468,6 +510,8 @@ static void native_machine_emergency_restart(void) if (reboot_emergency) emergency_vmx_disable_all(); + tboot_shutdown(TB_SHUTDOWN_REBOOT); + /* Tell the BIOS if we want cold or warm reboot */ *((unsigned short *)__va(0x472)) = reboot_mode; @@ -594,6 +638,8 @@ static void native_machine_halt(void) /* stop other cpus and apics */ machine_shutdown(); + tboot_shutdown(TB_SHUTDOWN_HALT); + /* stop this cpu */ stop_this_cpu(NULL); } @@ -605,6 +651,8 @@ static void native_machine_power_off(void) machine_shutdown(); pm_power_off(); } + /* a fallback in case there is no PM info available */ + tboot_shutdown(TB_SHUTDOWN_HALT); } struct machine_ops machine_ops = { diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 5d465b207e7..1cfbbfc3ae2 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -8,6 +8,7 @@ #include <linux/pnp.h> #include <asm/vsyscall.h> +#include <asm/x86_init.h> #include <asm/time.h> #ifdef CONFIG_X86_32 @@ -165,33 +166,29 @@ void rtc_cmos_write(unsigned char val, unsigned char addr) } EXPORT_SYMBOL(rtc_cmos_write); -static int set_rtc_mmss(unsigned long nowtime) +int update_persistent_clock(struct timespec now) { unsigned long flags; int retval; spin_lock_irqsave(&rtc_lock, flags); - retval = set_wallclock(nowtime); + retval = x86_platform.set_wallclock(now.tv_sec); spin_unlock_irqrestore(&rtc_lock, flags); return retval; } /* not static: needed by APM */ -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { unsigned long retval, flags; spin_lock_irqsave(&rtc_lock, flags); - retval = get_wallclock(); + retval = x86_platform.get_wallclock(); spin_unlock_irqrestore(&rtc_lock, flags); - return retval; -} - -int update_persistent_clock(struct timespec now) -{ - return set_rtc_mmss(now.tv_sec); + ts->tv_sec = retval; + ts->tv_nsec = 0; } unsigned long long native_read_tsc(void) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 63f32d220ef..e09f0e2c14b 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -27,6 +27,7 @@ #include <linux/screen_info.h> #include <linux/ioport.h> #include <linux/acpi.h> +#include <linux/sfi.h> #include <linux/apm_bios.h> #include <linux/initrd.h> #include <linux/bootmem.h> @@ -66,6 +67,7 @@ #include <linux/percpu.h> #include <linux/crash_dump.h> +#include <linux/tboot.h> #include <video/edid.h> @@ -108,10 +110,6 @@ #include <asm/numa_64.h> #endif -#ifndef ARCH_SETUP -#define ARCH_SETUP -#endif - /* * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. * The direct mapping extends to max_pfn_mapped, so that we can directly access @@ -133,9 +131,9 @@ int default_cpu_present_to_apicid(int mps_cpu) return __default_cpu_present_to_apicid(mps_cpu); } -int default_check_phys_apicid_present(int boot_cpu_physical_apicid) +int default_check_phys_apicid_present(int phys_apicid) { - return __default_check_phys_apicid_present(boot_cpu_physical_apicid); + return __default_check_phys_apicid_present(phys_apicid); } #endif @@ -171,13 +169,6 @@ static struct resource bss_resource = { #ifdef CONFIG_X86_32 -static struct resource video_ram_resource = { - .name = "Video RAM area", - .start = 0xa0000, - .end = 0xbffff, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; - /* cpu data as detected by the assembly code in head.S */ struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1}; /* common cpu data for all cpus */ @@ -605,7 +596,7 @@ static struct resource standard_io_resources[] = { .flags = IORESOURCE_BUSY | IORESOURCE_IO } }; -static void __init reserve_standard_io_resources(void) +void __init reserve_standard_io_resources(void) { int i; @@ -637,10 +628,6 @@ static int __init setup_elfcorehdr(char *arg) early_param("elfcorehdr", setup_elfcorehdr); #endif -static struct x86_quirks default_x86_quirks __initdata; - -struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; - #ifdef CONFIG_X86_RESERVE_LOW_64K static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) { @@ -757,7 +744,7 @@ void __init setup_arch(char **cmdline_p) } #endif - ARCH_SETUP + x86_init.oem.arch_setup(); setup_memory_map(); parse_setup_data(); @@ -796,6 +783,16 @@ void __init setup_arch(char **cmdline_p) strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; +#ifdef CONFIG_X86_64 + /* + * Must call this twice: Once just to detect whether hardware doesn't + * support NX (so that the early EHCI debug console setup can safely + * call set_fixmap(), and then again after parsing early parameters to + * honor the respective command line option. + */ + check_efer(); +#endif + parse_early_param(); #ifdef CONFIG_X86_64 @@ -833,11 +830,9 @@ void __init setup_arch(char **cmdline_p) * VMware detection requires dmi to be available, so this * needs to be done after dmi_scan_machine, for the BP. */ - init_hypervisor(&boot_cpu_data); + init_hypervisor_platform(); -#ifdef CONFIG_X86_32 - probe_roms(); -#endif + x86_init.resources.probe_roms(); /* after parse_early_param, so could debug it */ insert_resource(&iomem_resource, &code_resource); @@ -972,10 +967,11 @@ void __init setup_arch(char **cmdline_p) kvmclock_init(); #endif - paravirt_pagetable_setup_start(swapper_pg_dir); + x86_init.paging.pagetable_setup_start(swapper_pg_dir); paging_init(); - paravirt_pagetable_setup_done(swapper_pg_dir); - paravirt_post_allocator_init(); + x86_init.paging.pagetable_setup_done(swapper_pg_dir); + + tboot_probe(); #ifdef CONFIG_X86_64 map_vsyscall(); @@ -990,13 +986,13 @@ void __init setup_arch(char **cmdline_p) */ acpi_boot_init(); -#if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS) + sfi_init(); + /* * get boot-time SMP configuration: */ if (smp_found_config) get_smp_config(); -#endif prefill_possible_map(); @@ -1015,10 +1011,7 @@ void __init setup_arch(char **cmdline_p) e820_reserve_resources(); e820_mark_nosave_regions(max_low_pfn); -#ifdef CONFIG_X86_32 - request_resource(&iomem_resource, &video_ram_resource); -#endif - reserve_standard_io_resources(); + x86_init.resources.reserve_resources(); e820_setup_gap(); @@ -1030,78 +1023,22 @@ void __init setup_arch(char **cmdline_p) conswitchp = &dummy_con; #endif #endif + x86_init.oem.banner(); } #ifdef CONFIG_X86_32 -/** - * x86_quirk_intr_init - post gate setup interrupt initialisation - * - * Description: - * Fill in any interrupts that may have been left out by the general - * init_IRQ() routine. interrupts having to do with the machine rather - * than the devices on the I/O bus (like APIC interrupts in intel MP - * systems) are started here. - **/ -void __init x86_quirk_intr_init(void) -{ - if (x86_quirks->arch_intr_init) { - if (x86_quirks->arch_intr_init()) - return; - } -} - -/** - * x86_quirk_trap_init - initialise system specific traps - * - * Description: - * Called as the final act of trap_init(). Used in VISWS to initialise - * the various board specific APIC traps. - **/ -void __init x86_quirk_trap_init(void) -{ - if (x86_quirks->arch_trap_init) { - if (x86_quirks->arch_trap_init()) - return; - } -} - -static struct irqaction irq0 = { - .handler = timer_interrupt, - .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, - .name = "timer" +static struct resource video_ram_resource = { + .name = "Video RAM area", + .start = 0xa0000, + .end = 0xbffff, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM }; -/** - * x86_quirk_pre_time_init - do any specific initialisations before. - * - **/ -void __init x86_quirk_pre_time_init(void) +void __init i386_reserve_resources(void) { - if (x86_quirks->arch_pre_time_init) - x86_quirks->arch_pre_time_init(); + request_resource(&iomem_resource, &video_ram_resource); + reserve_standard_io_resources(); } -/** - * x86_quirk_time_init - do any specific initialisations for the system timer. - * - * Description: - * Must plug the system timer interrupt source at HZ into the IRQ listed - * in irq_vectors.h:TIMER_IRQ - **/ -void __init x86_quirk_time_init(void) -{ - if (x86_quirks->arch_time_init) { - /* - * A nonzero return code does not mean failure, it means - * that the architecture quirk does not want any - * generic (timer) setup to be performed after this: - */ - if (x86_quirks->arch_time_init()) - return; - } - - irq0.mask = cpumask_of_cpu(0); - setup_irq(0, &irq0); -} #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 29a3eef7cf4..d559af913e1 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -55,6 +55,7 @@ EXPORT_SYMBOL(__per_cpu_offset); #define PERCPU_FIRST_CHUNK_RESERVE 0 #endif +#ifdef CONFIG_X86_32 /** * pcpu_need_numa - determine percpu allocation needs to consider NUMA * @@ -83,6 +84,7 @@ static bool __init pcpu_need_numa(void) #endif return false; } +#endif /** * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu @@ -124,308 +126,35 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, } /* - * Large page remap allocator - * - * This allocator uses PMD page as unit. A PMD page is allocated for - * each cpu and each is remapped into vmalloc area using PMD mapping. - * As PMD page is quite large, only part of it is used for the first - * chunk. Unused part is returned to the bootmem allocator. - * - * So, the PMD pages are mapped twice - once to the physical mapping - * and to the vmalloc area for the first percpu chunk. The double - * mapping does add one more PMD TLB entry pressure but still is much - * better than only using 4k mappings while still being NUMA friendly. + * Helpers for first chunk memory allocation */ -#ifdef CONFIG_NEED_MULTIPLE_NODES -struct pcpul_ent { - unsigned int cpu; - void *ptr; -}; - -static size_t pcpul_size; -static struct pcpul_ent *pcpul_map; -static struct vm_struct pcpul_vm; - -static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) +static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align) { - size_t off = (size_t)pageno << PAGE_SHIFT; - - if (off >= pcpul_size) - return NULL; - - return virt_to_page(pcpul_map[cpu].ptr + off); + return pcpu_alloc_bootmem(cpu, size, align); } -static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) +static void __init pcpu_fc_free(void *ptr, size_t size) { - size_t map_size, dyn_size; - unsigned int cpu; - int i, j; - ssize_t ret; - - if (!chosen) { - size_t vm_size = VMALLOC_END - VMALLOC_START; - size_t tot_size = num_possible_cpus() * PMD_SIZE; - - /* on non-NUMA, embedding is better */ - if (!pcpu_need_numa()) - return -EINVAL; - - /* don't consume more than 20% of vmalloc area */ - if (tot_size > vm_size / 5) { - pr_info("PERCPU: too large chunk size %zuMB for " - "large page remap\n", tot_size >> 20); - return -EINVAL; - } - } - - /* need PSE */ - if (!cpu_has_pse) { - pr_warning("PERCPU: lpage allocator requires PSE\n"); - return -EINVAL; - } - - /* - * Currently supports only single page. Supporting multiple - * pages won't be too difficult if it ever becomes necessary. - */ - pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + - PERCPU_DYNAMIC_RESERVE); - if (pcpul_size > PMD_SIZE) { - pr_warning("PERCPU: static data is larger than large page, " - "can't use large page\n"); - return -EINVAL; - } - dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; - - /* allocate pointer array and alloc large pages */ - map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0])); - pcpul_map = alloc_bootmem(map_size); - - for_each_possible_cpu(cpu) { - pcpul_map[cpu].cpu = cpu; - pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE, - PMD_SIZE); - if (!pcpul_map[cpu].ptr) { - pr_warning("PERCPU: failed to allocate large page " - "for cpu%u\n", cpu); - goto enomem; - } - - /* - * Only use pcpul_size bytes and give back the rest. - * - * Ingo: The 2MB up-rounding bootmem is needed to make - * sure the partial 2MB page is still fully RAM - it's - * not well-specified to have a PAT-incompatible area - * (unmapped RAM, device memory, etc.) in that hole. - */ - free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size), - PMD_SIZE - pcpul_size); - - memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size); - } - - /* allocate address and map */ - pcpul_vm.flags = VM_ALLOC; - pcpul_vm.size = num_possible_cpus() * PMD_SIZE; - vm_area_register_early(&pcpul_vm, PMD_SIZE); - - for_each_possible_cpu(cpu) { - pmd_t *pmd, pmd_v; - - pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr + - cpu * PMD_SIZE); - pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)), - PAGE_KERNEL_LARGE); - set_pmd(pmd, pmd_v); - } - - /* we're ready, commit */ - pr_info("PERCPU: Remapped at %p with large pages, static data " - "%zu bytes\n", pcpul_vm.addr, static_size); - - ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, - PERCPU_FIRST_CHUNK_RESERVE, dyn_size, - PMD_SIZE, pcpul_vm.addr, NULL); - - /* sort pcpul_map array for pcpu_lpage_remapped() */ - for (i = 0; i < num_possible_cpus() - 1; i++) - for (j = i + 1; j < num_possible_cpus(); j++) - if (pcpul_map[i].ptr > pcpul_map[j].ptr) { - struct pcpul_ent tmp = pcpul_map[i]; - pcpul_map[i] = pcpul_map[j]; - pcpul_map[j] = tmp; - } - - return ret; - -enomem: - for_each_possible_cpu(cpu) - if (pcpul_map[cpu].ptr) - free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size); - free_bootmem(__pa(pcpul_map), map_size); - return -ENOMEM; + free_bootmem(__pa(ptr), size); } -/** - * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area - * @kaddr: the kernel address in question - * - * Determine whether @kaddr falls in the pcpul recycled area. This is - * used by pageattr to detect VM aliases and break up the pcpu PMD - * mapping such that the same physical page is not mapped under - * different attributes. - * - * The recycled area is always at the tail of a partially used PMD - * page. - * - * RETURNS: - * Address of corresponding remapped pcpu address if match is found; - * otherwise, NULL. - */ -void *pcpu_lpage_remapped(void *kaddr) +static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) { - void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK); - unsigned long offset = (unsigned long)kaddr & ~PMD_MASK; - int left = 0, right = num_possible_cpus() - 1; - int pos; - - /* pcpul in use at all? */ - if (!pcpul_map) - return NULL; - - /* okay, perform binary search */ - while (left <= right) { - pos = (left + right) / 2; - - if (pcpul_map[pos].ptr < pmd_addr) - left = pos + 1; - else if (pcpul_map[pos].ptr > pmd_addr) - right = pos - 1; - else { - /* it shouldn't be in the area for the first chunk */ - WARN_ON(offset < pcpul_size); - - return pcpul_vm.addr + - pcpul_map[pos].cpu * PMD_SIZE + offset; - } - } - - return NULL; -} +#ifdef CONFIG_NEED_MULTIPLE_NODES + if (early_cpu_to_node(from) == early_cpu_to_node(to)) + return LOCAL_DISTANCE; + else + return REMOTE_DISTANCE; #else -static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) -{ - return -EINVAL; -} + return LOCAL_DISTANCE; #endif - -/* - * Embedding allocator - * - * The first chunk is sized to just contain the static area plus - * module and dynamic reserves and embedded into linear physical - * mapping so that it can use PMD mapping without additional TLB - * pressure. - */ -static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen) -{ - size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; - - /* - * If large page isn't supported, there's no benefit in doing - * this. Also, embedding allocation doesn't play well with - * NUMA. - */ - if (!chosen && (!cpu_has_pse || pcpu_need_numa())) - return -EINVAL; - - return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, - reserve - PERCPU_FIRST_CHUNK_RESERVE, -1); } -/* - * 4k page allocator - * - * This is the basic allocator. Static percpu area is allocated - * page-by-page and most of initialization is done by the generic - * setup function. - */ -static struct page **pcpu4k_pages __initdata; -static int pcpu4k_nr_static_pages __initdata; - -static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno) -{ - if (pageno < pcpu4k_nr_static_pages) - return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno]; - return NULL; -} - -static void __init pcpu4k_populate_pte(unsigned long addr) +static void __init pcpup_populate_pte(unsigned long addr) { populate_extra_pte(addr); } -static ssize_t __init setup_pcpu_4k(size_t static_size) -{ - size_t pages_size; - unsigned int cpu; - int i, j; - ssize_t ret; - - pcpu4k_nr_static_pages = PFN_UP(static_size); - - /* unaligned allocations can't be freed, round up to page size */ - pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus() - * sizeof(pcpu4k_pages[0])); - pcpu4k_pages = alloc_bootmem(pages_size); - - /* allocate and copy */ - j = 0; - for_each_possible_cpu(cpu) - for (i = 0; i < pcpu4k_nr_static_pages; i++) { - void *ptr; - - ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE); - if (!ptr) { - pr_warning("PERCPU: failed to allocate " - "4k page for cpu%u\n", cpu); - goto enomem; - } - - memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); - pcpu4k_pages[j++] = virt_to_page(ptr); - } - - /* we're ready, commit */ - pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", - pcpu4k_nr_static_pages, static_size); - - ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, - PERCPU_FIRST_CHUNK_RESERVE, -1, - -1, NULL, pcpu4k_populate_pte); - goto out_free_ar; - -enomem: - while (--j >= 0) - free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE); - ret = -ENOMEM; -out_free_ar: - free_bootmem(__pa(pcpu4k_pages), pages_size); - return ret; -} - -/* for explicit first chunk allocator selection */ -static char pcpu_chosen_alloc[16] __initdata; - -static int __init percpu_alloc_setup(char *str) -{ - strncpy(pcpu_chosen_alloc, str, sizeof(pcpu_chosen_alloc) - 1); - return 0; -} -early_param("percpu_alloc", percpu_alloc_setup); - static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 @@ -441,52 +170,49 @@ static inline void setup_percpu_segment(int cpu) void __init setup_per_cpu_areas(void) { - size_t static_size = __per_cpu_end - __per_cpu_start; unsigned int cpu; unsigned long delta; - size_t pcpu_unit_size; - ssize_t ret; + int rc; pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); /* - * Allocate percpu area. If PSE is supported, try to make use - * of large page mappings. Please read comments on top of - * each allocator for details. + * Allocate percpu area. Embedding allocator is our favorite; + * however, on NUMA configurations, it can result in very + * sparse unit mapping and vmalloc area isn't spacious enough + * on 32bit. Use page in that case. */ - ret = -EINVAL; - if (strlen(pcpu_chosen_alloc)) { - if (strcmp(pcpu_chosen_alloc, "4k")) { - if (!strcmp(pcpu_chosen_alloc, "lpage")) - ret = setup_pcpu_lpage(static_size, true); - else if (!strcmp(pcpu_chosen_alloc, "embed")) - ret = setup_pcpu_embed(static_size, true); - else - pr_warning("PERCPU: unknown allocator %s " - "specified\n", pcpu_chosen_alloc); - if (ret < 0) - pr_warning("PERCPU: %s allocator failed (%zd), " - "falling back to 4k\n", - pcpu_chosen_alloc, ret); - } - } else { - ret = setup_pcpu_lpage(static_size, false); - if (ret < 0) - ret = setup_pcpu_embed(static_size, false); +#ifdef CONFIG_X86_32 + if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa()) + pcpu_chosen_fc = PCPU_FC_PAGE; +#endif + rc = -EINVAL; + if (pcpu_chosen_fc != PCPU_FC_PAGE) { + const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE; + const size_t dyn_size = PERCPU_MODULE_RESERVE + + PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE; + + rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, + dyn_size, atom_size, + pcpu_cpu_distance, + pcpu_fc_alloc, pcpu_fc_free); + if (rc < 0) + pr_warning("PERCPU: %s allocator failed (%d), " + "falling back to page size\n", + pcpu_fc_names[pcpu_chosen_fc], rc); } - if (ret < 0) - ret = setup_pcpu_4k(static_size); - if (ret < 0) - panic("cannot allocate static percpu area (%zu bytes, err=%zd)", - static_size, ret); - - pcpu_unit_size = ret; + if (rc < 0) + rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, + pcpu_fc_alloc, pcpu_fc_free, + pcpup_populate_pte); + if (rc < 0) + panic("cannot initialize percpu area (err=%d)", rc); /* alrighty, percpu areas up and running */ delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) { - per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; + per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu]; per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); per_cpu(cpu_number, cpu) = cpu; setup_percpu_segment(cpu); diff --git a/arch/x86/kernel/sfi.c b/arch/x86/kernel/sfi.c new file mode 100644 index 00000000000..34e09938265 --- /dev/null +++ b/arch/x86/kernel/sfi.c @@ -0,0 +1,122 @@ +/* + * sfi.c - x86 architecture SFI support. + * + * Copyright (c) 2009, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#define KMSG_COMPONENT "SFI" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/acpi.h> +#include <linux/init.h> +#include <linux/sfi.h> +#include <linux/io.h> + +#include <asm/io_apic.h> +#include <asm/mpspec.h> +#include <asm/setup.h> +#include <asm/apic.h> + +#ifdef CONFIG_X86_LOCAL_APIC +static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; + +void __init mp_sfi_register_lapic_address(unsigned long address) +{ + mp_lapic_addr = address; + + set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); + if (boot_cpu_physical_apicid == -1U) + boot_cpu_physical_apicid = read_apic_id(); + + pr_info("Boot CPU = %d\n", boot_cpu_physical_apicid); +} + +/* All CPUs enumerated by SFI must be present and enabled */ +void __cpuinit mp_sfi_register_lapic(u8 id) +{ + if (MAX_APICS - id <= 0) { + pr_warning("Processor #%d invalid (max %d)\n", + id, MAX_APICS); + return; + } + + pr_info("registering lapic[%d]\n", id); + + generic_processor_info(id, GET_APIC_VERSION(apic_read(APIC_LVR))); +} + +static int __init sfi_parse_cpus(struct sfi_table_header *table) +{ + struct sfi_table_simple *sb; + struct sfi_cpu_table_entry *pentry; + int i; + int cpu_num; + + sb = (struct sfi_table_simple *)table; + cpu_num = SFI_GET_NUM_ENTRIES(sb, struct sfi_cpu_table_entry); + pentry = (struct sfi_cpu_table_entry *)sb->pentry; + + for (i = 0; i < cpu_num; i++) { + mp_sfi_register_lapic(pentry->apic_id); + pentry++; + } + + smp_found_config = 1; + return 0; +} +#endif /* CONFIG_X86_LOCAL_APIC */ + +#ifdef CONFIG_X86_IO_APIC +static u32 gsi_base; + +static int __init sfi_parse_ioapic(struct sfi_table_header *table) +{ + struct sfi_table_simple *sb; + struct sfi_apic_table_entry *pentry; + int i, num; + + sb = (struct sfi_table_simple *)table; + num = SFI_GET_NUM_ENTRIES(sb, struct sfi_apic_table_entry); + pentry = (struct sfi_apic_table_entry *)sb->pentry; + + for (i = 0; i < num; i++) { + mp_register_ioapic(i, pentry->phys_addr, gsi_base); + gsi_base += io_apic_get_redir_entries(i); + pentry++; + } + + WARN(pic_mode, KERN_WARNING + "SFI: pic_mod shouldn't be 1 when IOAPIC table is present\n"); + pic_mode = 0; + return 0; +} +#endif /* CONFIG_X86_IO_APIC */ + +/* + * sfi_platform_init(): register lapics & io-apics + */ +int __init sfi_platform_init(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + mp_sfi_register_lapic_address(sfi_lapic_addr); + sfi_table_parse(SFI_SIG_CPUS, NULL, NULL, sfi_parse_cpus); +#endif +#ifdef CONFIG_X86_IO_APIC + sfi_table_parse(SFI_SIG_APIC, NULL, NULL, sfi_parse_ioapic); +#endif + return 0; +} diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 4c578751e94..6a44a76055a 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -856,7 +856,7 @@ static void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { -#ifdef CONFIG_X86_NEW_MCE +#ifdef CONFIG_X86_MCE /* notify userspace of pending MCEs */ if (thread_info_flags & _TIF_MCE_NOTIFY) mce_notify_process(); @@ -869,6 +869,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); + if (current->replacement_session_keyring) + key_replace_session_keyring(); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 2fecda69ee6..565ebc65920 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -47,6 +47,7 @@ #include <linux/bootmem.h> #include <linux/err.h> #include <linux/nmi.h> +#include <linux/tboot.h> #include <asm/acpi.h> #include <asm/desc.h> @@ -323,7 +324,7 @@ notrace static void __cpuinit start_secondary(void *unused) /* enable local interrupts */ local_irq_enable(); - setup_secondary_clock(); + x86_cpuinit.setup_percpu_clockev(); wmb(); cpu_idle(); @@ -434,7 +435,8 @@ const struct cpumask *cpu_coregroup_mask(int cpu) * For perf, we return last level cache shared map. * And for power savings, we return cpu_core_map */ - if (sched_mc_power_savings || sched_smt_power_savings) + if ((sched_mc_power_savings || sched_smt_power_savings) && + !(cpu_has(c, X86_FEATURE_AMD_DCM))) return cpu_core_mask(cpu); else return c->llc_shared_map; @@ -1057,12 +1059,9 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) #endif current_thread_info()->cpu = 0; /* needed? */ for_each_possible_cpu(i) { - alloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); - alloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); - alloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL); - cpumask_clear(per_cpu(cpu_core_map, i)); - cpumask_clear(per_cpu(cpu_sibling_map, i)); - cpumask_clear(cpu_data(i).llc_shared_map); + zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); + zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); + zalloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL); } set_cpu_sibling_map(0); @@ -1112,13 +1111,26 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) printk(KERN_INFO "CPU%d: ", 0); print_cpu_info(&cpu_data(0)); - setup_boot_clock(); + x86_init.timers.setup_percpu_clockev(); if (is_uv_system()) uv_system_init(); + + set_mtrr_aps_delayed_init(); out: preempt_enable(); } + +void arch_enable_nonboot_cpus_begin(void) +{ + set_mtrr_aps_delayed_init(); +} + +void arch_enable_nonboot_cpus_end(void) +{ + mtrr_aps_init(); +} + /* * Early setup to make printk work. */ @@ -1140,6 +1152,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus) setup_ioapic_dest(); #endif check_nmi_watchdog(); + mtrr_aps_init(); } static int __initdata setup_possible_cpus = -1; @@ -1317,6 +1330,7 @@ void play_dead_common(void) void native_play_dead(void) { play_dead_common(); + tboot_shutdown(TB_SHUTDOWN_WFS); wbinvd_halt(); } diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index e8b9863ef8c..3149032ff10 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -4,6 +4,7 @@ #include <linux/sched.h> #include <linux/mm.h> #include <linux/ptrace.h> +#include <asm/desc.h> unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs) { @@ -23,7 +24,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re * and APM bios ones we just ignore here. */ if ((seg & SEGMENT_TI_MASK) == SEGMENT_LDT) { - u32 *desc; + struct desc_struct *desc; unsigned long base; seg &= ~7UL; @@ -33,12 +34,10 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re addr = -1L; /* bogus selector, access would fault */ else { desc = child->mm->context.ldt + seg; - base = ((desc[0] >> 16) | - ((desc[1] & 0xff) << 16) | - (desc[1] & 0xff000000)); + base = get_desc_base(desc); /* 16-bit code segment? */ - if (!((desc[1] >> 22) & 1)) + if (!desc->d) addr &= 0xffff; addr += base; } diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 6bc211accf0..45e00eb09c3 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -18,9 +18,9 @@ #include <asm/ia32.h> #include <asm/syscalls.h> -asmlinkage long sys_mmap(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long off) +SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, + unsigned long, prot, unsigned long, flags, + unsigned long, fd, unsigned long, off) { long error; struct file *file; @@ -226,7 +226,7 @@ bottomup: } -asmlinkage long sys_uname(struct new_utsname __user *name) +SYSCALL_DEFINE1(uname, struct new_utsname __user *, name) { int err; down_read(&uts_sem); diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index d51321ddafd..0157cd26d7c 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -335,4 +335,4 @@ ENTRY(sys_call_table) .long sys_preadv .long sys_pwritev .long sys_rt_tgsigqueueinfo /* 335 */ - .long sys_perf_counter_open + .long sys_perf_event_open diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c new file mode 100644 index 00000000000..86c9f91b48a --- /dev/null +++ b/arch/x86/kernel/tboot.c @@ -0,0 +1,447 @@ +/* + * tboot.c: main implementation of helper functions used by kernel for + * runtime support of Intel(R) Trusted Execution Technology + * + * Copyright (c) 2006-2009, Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#include <linux/dma_remapping.h> +#include <linux/init_task.h> +#include <linux/spinlock.h> +#include <linux/delay.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/dmar.h> +#include <linux/cpu.h> +#include <linux/pfn.h> +#include <linux/mm.h> +#include <linux/tboot.h> + +#include <asm/trampoline.h> +#include <asm/processor.h> +#include <asm/bootparam.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/fixmap.h> +#include <asm/proto.h> +#include <asm/setup.h> +#include <asm/e820.h> +#include <asm/io.h> + +#include "acpi/realmode/wakeup.h" + +/* Global pointer to shared data; NULL means no measured launch. */ +struct tboot *tboot __read_mostly; + +/* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */ +#define AP_WAIT_TIMEOUT 1 + +#undef pr_fmt +#define pr_fmt(fmt) "tboot: " fmt + +static u8 tboot_uuid[16] __initdata = TBOOT_UUID; + +void __init tboot_probe(void) +{ + /* Look for valid page-aligned address for shared page. */ + if (!boot_params.tboot_addr) + return; + /* + * also verify that it is mapped as we expect it before calling + * set_fixmap(), to reduce chance of garbage value causing crash + */ + if (!e820_any_mapped(boot_params.tboot_addr, + boot_params.tboot_addr, E820_RESERVED)) { + pr_warning("non-0 tboot_addr but it is not of type E820_RESERVED\n"); + return; + } + + /* only a natively booted kernel should be using TXT */ + if (paravirt_enabled()) { + pr_warning("non-0 tboot_addr but pv_ops is enabled\n"); + return; + } + + /* Map and check for tboot UUID. */ + set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr); + tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE); + if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) { + pr_warning("tboot at 0x%llx is invalid\n", + boot_params.tboot_addr); + tboot = NULL; + return; + } + if (tboot->version < 5) { + pr_warning("tboot version is invalid: %u\n", tboot->version); + tboot = NULL; + return; + } + + pr_info("found shared page at phys addr 0x%llx:\n", + boot_params.tboot_addr); + pr_debug("version: %d\n", tboot->version); + pr_debug("log_addr: 0x%08x\n", tboot->log_addr); + pr_debug("shutdown_entry: 0x%x\n", tboot->shutdown_entry); + pr_debug("tboot_base: 0x%08x\n", tboot->tboot_base); + pr_debug("tboot_size: 0x%x\n", tboot->tboot_size); +} + +static pgd_t *tboot_pg_dir; +static struct mm_struct tboot_mm = { + .mm_rb = RB_ROOT, + .pgd = swapper_pg_dir, + .mm_users = ATOMIC_INIT(2), + .mm_count = ATOMIC_INIT(1), + .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), + .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), + .mmlist = LIST_HEAD_INIT(init_mm.mmlist), + .cpu_vm_mask = CPU_MASK_ALL, +}; + +static inline void switch_to_tboot_pt(void) +{ + write_cr3(virt_to_phys(tboot_pg_dir)); +} + +static int map_tboot_page(unsigned long vaddr, unsigned long pfn, + pgprot_t prot) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset(&tboot_mm, vaddr); + pud = pud_alloc(&tboot_mm, pgd, vaddr); + if (!pud) + return -1; + pmd = pmd_alloc(&tboot_mm, pud, vaddr); + if (!pmd) + return -1; + pte = pte_alloc_map(&tboot_mm, pmd, vaddr); + if (!pte) + return -1; + set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot)); + pte_unmap(pte); + return 0; +} + +static int map_tboot_pages(unsigned long vaddr, unsigned long start_pfn, + unsigned long nr) +{ + /* Reuse the original kernel mapping */ + tboot_pg_dir = pgd_alloc(&tboot_mm); + if (!tboot_pg_dir) + return -1; + + for (; nr > 0; nr--, vaddr += PAGE_SIZE, start_pfn++) { + if (map_tboot_page(vaddr, start_pfn, PAGE_KERNEL_EXEC)) + return -1; + } + + return 0; +} + +static void tboot_create_trampoline(void) +{ + u32 map_base, map_size; + + /* Create identity map for tboot shutdown code. */ + map_base = PFN_DOWN(tboot->tboot_base); + map_size = PFN_UP(tboot->tboot_size); + if (map_tboot_pages(map_base << PAGE_SHIFT, map_base, map_size)) + panic("tboot: Error mapping tboot pages (mfns) @ 0x%x, 0x%x\n", + map_base, map_size); +} + +#ifdef CONFIG_ACPI_SLEEP + +static void add_mac_region(phys_addr_t start, unsigned long size) +{ + struct tboot_mac_region *mr; + phys_addr_t end = start + size; + + if (start && size) { + mr = &tboot->mac_regions[tboot->num_mac_regions++]; + mr->start = round_down(start, PAGE_SIZE); + mr->size = round_up(end, PAGE_SIZE) - mr->start; + } +} + +static int tboot_setup_sleep(void) +{ + tboot->num_mac_regions = 0; + + /* S3 resume code */ + add_mac_region(acpi_wakeup_address, WAKEUP_SIZE); + +#ifdef CONFIG_X86_TRAMPOLINE + /* AP trampoline code */ + add_mac_region(virt_to_phys(trampoline_base), TRAMPOLINE_SIZE); +#endif + + /* kernel code + data + bss */ + add_mac_region(virt_to_phys(_text), _end - _text); + + tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address; + + return 0; +} + +#else /* no CONFIG_ACPI_SLEEP */ + +static int tboot_setup_sleep(void) +{ + /* S3 shutdown requested, but S3 not supported by the kernel... */ + BUG(); + return -1; +} + +#endif + +void tboot_shutdown(u32 shutdown_type) +{ + void (*shutdown)(void); + + if (!tboot_enabled()) + return; + + /* + * if we're being called before the 1:1 mapping is set up then just + * return and let the normal shutdown happen; this should only be + * due to very early panic() + */ + if (!tboot_pg_dir) + return; + + /* if this is S3 then set regions to MAC */ + if (shutdown_type == TB_SHUTDOWN_S3) + if (tboot_setup_sleep()) + return; + + tboot->shutdown_type = shutdown_type; + + switch_to_tboot_pt(); + + shutdown = (void(*)(void))(unsigned long)tboot->shutdown_entry; + shutdown(); + + /* should not reach here */ + while (1) + halt(); +} + +static void tboot_copy_fadt(const struct acpi_table_fadt *fadt) +{ +#define TB_COPY_GAS(tbg, g) \ + tbg.space_id = g.space_id; \ + tbg.bit_width = g.bit_width; \ + tbg.bit_offset = g.bit_offset; \ + tbg.access_width = g.access_width; \ + tbg.address = g.address; + + TB_COPY_GAS(tboot->acpi_sinfo.pm1a_cnt_blk, fadt->xpm1a_control_block); + TB_COPY_GAS(tboot->acpi_sinfo.pm1b_cnt_blk, fadt->xpm1b_control_block); + TB_COPY_GAS(tboot->acpi_sinfo.pm1a_evt_blk, fadt->xpm1a_event_block); + TB_COPY_GAS(tboot->acpi_sinfo.pm1b_evt_blk, fadt->xpm1b_event_block); + + /* + * We need phys addr of waking vector, but can't use virt_to_phys() on + * &acpi_gbl_FACS because it is ioremap'ed, so calc from FACS phys + * addr. + */ + tboot->acpi_sinfo.wakeup_vector = fadt->facs + + offsetof(struct acpi_table_facs, firmware_waking_vector); +} + +void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control) +{ + static u32 acpi_shutdown_map[ACPI_S_STATE_COUNT] = { + /* S0,1,2: */ -1, -1, -1, + /* S3: */ TB_SHUTDOWN_S3, + /* S4: */ TB_SHUTDOWN_S4, + /* S5: */ TB_SHUTDOWN_S5 }; + + if (!tboot_enabled()) + return; + + tboot_copy_fadt(&acpi_gbl_FADT); + tboot->acpi_sinfo.pm1a_cnt_val = pm1a_control; + tboot->acpi_sinfo.pm1b_cnt_val = pm1b_control; + /* we always use the 32b wakeup vector */ + tboot->acpi_sinfo.vector_width = 32; + + if (sleep_state >= ACPI_S_STATE_COUNT || + acpi_shutdown_map[sleep_state] == -1) { + pr_warning("unsupported sleep state 0x%x\n", sleep_state); + return; + } + + tboot_shutdown(acpi_shutdown_map[sleep_state]); +} + +static atomic_t ap_wfs_count; + +static int tboot_wait_for_aps(int num_aps) +{ + unsigned long timeout; + + timeout = AP_WAIT_TIMEOUT*HZ; + while (atomic_read((atomic_t *)&tboot->num_in_wfs) != num_aps && + timeout) { + mdelay(1); + timeout--; + } + + if (timeout) + pr_warning("tboot wait for APs timeout\n"); + + return !(atomic_read((atomic_t *)&tboot->num_in_wfs) == num_aps); +} + +static int __cpuinit tboot_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + switch (action) { + case CPU_DYING: + atomic_inc(&ap_wfs_count); + if (num_online_cpus() == 1) + if (tboot_wait_for_aps(atomic_read(&ap_wfs_count))) + return NOTIFY_BAD; + break; + } + return NOTIFY_OK; +} + +static struct notifier_block tboot_cpu_notifier __cpuinitdata = +{ + .notifier_call = tboot_cpu_callback, +}; + +static __init int tboot_late_init(void) +{ + if (!tboot_enabled()) + return 0; + + tboot_create_trampoline(); + + atomic_set(&ap_wfs_count, 0); + register_hotcpu_notifier(&tboot_cpu_notifier); + return 0; +} + +late_initcall(tboot_late_init); + +/* + * TXT configuration registers (offsets from TXT_{PUB, PRIV}_CONFIG_REGS_BASE) + */ + +#define TXT_PUB_CONFIG_REGS_BASE 0xfed30000 +#define TXT_PRIV_CONFIG_REGS_BASE 0xfed20000 + +/* # pages for each config regs space - used by fixmap */ +#define NR_TXT_CONFIG_PAGES ((TXT_PUB_CONFIG_REGS_BASE - \ + TXT_PRIV_CONFIG_REGS_BASE) >> PAGE_SHIFT) + +/* offsets from pub/priv config space */ +#define TXTCR_HEAP_BASE 0x0300 +#define TXTCR_HEAP_SIZE 0x0308 + +#define SHA1_SIZE 20 + +struct sha1_hash { + u8 hash[SHA1_SIZE]; +}; + +struct sinit_mle_data { + u32 version; /* currently 6 */ + struct sha1_hash bios_acm_id; + u32 edx_senter_flags; + u64 mseg_valid; + struct sha1_hash sinit_hash; + struct sha1_hash mle_hash; + struct sha1_hash stm_hash; + struct sha1_hash lcp_policy_hash; + u32 lcp_policy_control; + u32 rlp_wakeup_addr; + u32 reserved; + u32 num_mdrs; + u32 mdrs_off; + u32 num_vtd_dmars; + u32 vtd_dmars_off; +} __packed; + +struct acpi_table_header *tboot_get_dmar_table(struct acpi_table_header *dmar_tbl) +{ + void *heap_base, *heap_ptr, *config; + + if (!tboot_enabled()) + return dmar_tbl; + + /* + * ACPI tables may not be DMA protected by tboot, so use DMAR copy + * SINIT saved in SinitMleData in TXT heap (which is DMA protected) + */ + + /* map config space in order to get heap addr */ + config = ioremap(TXT_PUB_CONFIG_REGS_BASE, NR_TXT_CONFIG_PAGES * + PAGE_SIZE); + if (!config) + return NULL; + + /* now map TXT heap */ + heap_base = ioremap(*(u64 *)(config + TXTCR_HEAP_BASE), + *(u64 *)(config + TXTCR_HEAP_SIZE)); + iounmap(config); + if (!heap_base) + return NULL; + + /* walk heap to SinitMleData */ + /* skip BiosData */ + heap_ptr = heap_base + *(u64 *)heap_base; + /* skip OsMleData */ + heap_ptr += *(u64 *)heap_ptr; + /* skip OsSinitData */ + heap_ptr += *(u64 *)heap_ptr; + /* now points to SinitMleDataSize; set to SinitMleData */ + heap_ptr += sizeof(u64); + /* get addr of DMAR table */ + dmar_tbl = (struct acpi_table_header *)(heap_ptr + + ((struct sinit_mle_data *)heap_ptr)->vtd_dmars_off - + sizeof(u64)); + + /* don't unmap heap because dmar.c needs access to this */ + + return dmar_tbl; +} + +int tboot_force_iommu(void) +{ + if (!tboot_enabled()) + return 0; + + if (no_iommu || swiotlb || dmar_disabled) + pr_warning("Forcing Intel-IOMMU to enabled\n"); + + dmar_disabled = 0; +#ifdef CONFIG_SWIOTLB + swiotlb = 0; +#endif + no_iommu = 0; + + return 1; +} diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c new file mode 100644 index 00000000000..be2573448ed --- /dev/null +++ b/arch/x86/kernel/time.c @@ -0,0 +1,121 @@ +/* + * Copyright (c) 1991,1992,1995 Linus Torvalds + * Copyright (c) 1994 Alan Modra + * Copyright (c) 1995 Markus Kuhn + * Copyright (c) 1996 Ingo Molnar + * Copyright (c) 1998 Andrea Arcangeli + * Copyright (c) 2002,2006 Vojtech Pavlik + * Copyright (c) 2003 Andi Kleen + * + */ + +#include <linux/clockchips.h> +#include <linux/interrupt.h> +#include <linux/time.h> +#include <linux/mca.h> + +#include <asm/vsyscall.h> +#include <asm/x86_init.h> +#include <asm/i8259.h> +#include <asm/i8253.h> +#include <asm/timer.h> +#include <asm/hpet.h> +#include <asm/time.h> + +#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) +int timer_ack; +#endif + +#ifdef CONFIG_X86_64 +volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; +#endif + +unsigned long profile_pc(struct pt_regs *regs) +{ + unsigned long pc = instruction_pointer(regs); + + if (!user_mode_vm(regs) && in_lock_functions(pc)) { +#ifdef CONFIG_FRAME_POINTER + return *(unsigned long *)(regs->bp + sizeof(long)); +#else + unsigned long *sp = + (unsigned long *)kernel_stack_pointer(regs); + /* + * Return address is either directly at stack pointer + * or above a saved flags. Eflags has bits 22-31 zero, + * kernel addresses don't. + */ + if (sp[0] >> 22) + return sp[0]; + if (sp[1] >> 22) + return sp[1]; +#endif + } + return pc; +} +EXPORT_SYMBOL(profile_pc); + +/* + * Default timer interrupt handler for PIT/HPET + */ +static irqreturn_t timer_interrupt(int irq, void *dev_id) +{ + /* Keep nmi watchdog up to date */ + inc_irq_stat(irq0_irqs); + + /* Optimized out for !IO_APIC and x86_64 */ + if (timer_ack) { + /* + * Subtle, when I/O APICs are used we have to ack timer IRQ + * manually to deassert NMI lines for the watchdog if run + * on an 82489DX-based system. + */ + spin_lock(&i8259A_lock); + outb(0x0c, PIC_MASTER_OCW3); + /* Ack the IRQ; AEOI will end it automatically. */ + inb(PIC_MASTER_POLL); + spin_unlock(&i8259A_lock); + } + + global_clock_event->event_handler(global_clock_event); + + /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */ + if (MCA_bus) + outb_p(inb_p(0x61)| 0x80, 0x61); + + return IRQ_HANDLED; +} + +static struct irqaction irq0 = { + .handler = timer_interrupt, + .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, + .name = "timer" +}; + +void __init setup_default_timer_irq(void) +{ + setup_irq(0, &irq0); +} + +/* Default timer init function */ +void __init hpet_time_init(void) +{ + if (!hpet_enable()) + setup_pit_timer(); + setup_default_timer_irq(); +} + +static __init void x86_late_time_init(void) +{ + x86_init.timers.timer_init(); + tsc_init(); +} + +/* + * Initialize TSC and delay the periodic timer init to + * late x86_late_time_init() so ioremap works. + */ +void __init time_init(void) +{ + late_time_init = x86_late_time_init; +} diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c deleted file mode 100644 index 5c5d87f0b2e..00000000000 --- a/arch/x86/kernel/time_32.c +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright (C) 1991, 1992, 1995 Linus Torvalds - * - * This file contains the PC-specific time handling details: - * reading the RTC at bootup, etc.. - * 1994-07-02 Alan Modra - * fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime - * 1995-03-26 Markus Kuhn - * fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887 - * precision CMOS clock update - * 1996-05-03 Ingo Molnar - * fixed time warps in do_[slow|fast]_gettimeoffset() - * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 - * "A Kernel Model for Precision Timekeeping" by Dave Mills - * 1998-09-05 (Various) - * More robust do_fast_gettimeoffset() algorithm implemented - * (works with APM, Cyrix 6x86MX and Centaur C6), - * monotonic gettimeofday() with fast_get_timeoffset(), - * drift-proof precision TSC calibration on boot - * (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D. - * Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>; - * ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>). - * 1998-12-16 Andrea Arcangeli - * Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy - * because was not accounting lost_ticks. - * 1998-12-24 Copyright (C) 1998 Andrea Arcangeli - * Fixed a xtime SMP race (we need the xtime_lock rw spinlock to - * serialize accesses to xtime/lost_ticks). - */ - -#include <linux/init.h> -#include <linux/interrupt.h> -#include <linux/time.h> -#include <linux/mca.h> - -#include <asm/setup.h> -#include <asm/hpet.h> -#include <asm/time.h> -#include <asm/timer.h> - -#include <asm/do_timer.h> - -int timer_ack; - -unsigned long profile_pc(struct pt_regs *regs) -{ - unsigned long pc = instruction_pointer(regs); - -#ifdef CONFIG_SMP - if (!user_mode_vm(regs) && in_lock_functions(pc)) { -#ifdef CONFIG_FRAME_POINTER - return *(unsigned long *)(regs->bp + sizeof(long)); -#else - unsigned long *sp = (unsigned long *)®s->sp; - - /* Return address is either directly at stack pointer - or above a saved flags. Eflags has bits 22-31 zero, - kernel addresses don't. */ - if (sp[0] >> 22) - return sp[0]; - if (sp[1] >> 22) - return sp[1]; -#endif - } -#endif - return pc; -} -EXPORT_SYMBOL(profile_pc); - -/* - * This is the same as the above, except we _also_ save the current - * Time Stamp Counter value at the time of the timer interrupt, so that - * we later on can estimate the time of day more exactly. - */ -irqreturn_t timer_interrupt(int irq, void *dev_id) -{ - /* Keep nmi watchdog up to date */ - inc_irq_stat(irq0_irqs); - -#ifdef CONFIG_X86_IO_APIC - if (timer_ack) { - /* - * Subtle, when I/O APICs are used we have to ack timer IRQ - * manually to deassert NMI lines for the watchdog if run - * on an 82489DX-based system. - */ - spin_lock(&i8259A_lock); - outb(0x0c, PIC_MASTER_OCW3); - /* Ack the IRQ; AEOI will end it automatically. */ - inb(PIC_MASTER_POLL); - spin_unlock(&i8259A_lock); - } -#endif - - do_timer_interrupt_hook(); - -#ifdef CONFIG_MCA - if (MCA_bus) { - /* The PS/2 uses level-triggered interrupts. You can't - turn them off, nor would you want to (any attempt to - enable edge-triggered interrupts usually gets intercepted by a - special hardware circuit). Hence we have to acknowledge - the timer interrupt. Through some incredibly stupid - design idea, the reset for IRQ 0 is done by setting the - high bit of the PPI port B (0x61). Note that some PS/2s, - notably the 55SX, work fine if this is removed. */ - - u8 irq_v = inb_p(0x61); /* read the current state */ - outb_p(irq_v | 0x80, 0x61); /* reset the IRQ */ - } -#endif - - return IRQ_HANDLED; -} - -/* Duplicate of time_init() below, with hpet_enable part added */ -void __init hpet_time_init(void) -{ - if (!hpet_enable()) - setup_pit_timer(); - x86_quirk_time_init(); -} - -/* - * This is called directly from init code; we must delay timer setup in the - * HPET case as we can't make the decision to turn on HPET this early in the - * boot process. - * - * The chosen time_init function will usually be hpet_time_init, above, but - * in the case of virtual hardware, an alternative function may be substituted. - */ -void __init time_init(void) -{ - x86_quirk_pre_time_init(); - tsc_init(); - late_time_init = choose_time_init(); -} diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c deleted file mode 100644 index 5ba343e6184..00000000000 --- a/arch/x86/kernel/time_64.c +++ /dev/null @@ -1,135 +0,0 @@ -/* - * "High Precision Event Timer" based timekeeping. - * - * Copyright (c) 1991,1992,1995 Linus Torvalds - * Copyright (c) 1994 Alan Modra - * Copyright (c) 1995 Markus Kuhn - * Copyright (c) 1996 Ingo Molnar - * Copyright (c) 1998 Andrea Arcangeli - * Copyright (c) 2002,2006 Vojtech Pavlik - * Copyright (c) 2003 Andi Kleen - * RTC support code taken from arch/i386/kernel/timers/time_hpet.c - */ - -#include <linux/clockchips.h> -#include <linux/init.h> -#include <linux/interrupt.h> -#include <linux/module.h> -#include <linux/time.h> -#include <linux/mca.h> -#include <linux/nmi.h> - -#include <asm/i8253.h> -#include <asm/hpet.h> -#include <asm/vgtod.h> -#include <asm/time.h> -#include <asm/timer.h> - -volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; - -unsigned long profile_pc(struct pt_regs *regs) -{ - unsigned long pc = instruction_pointer(regs); - - /* Assume the lock function has either no stack frame or a copy - of flags from PUSHF - Eflags always has bits 22 and up cleared unlike kernel addresses. */ - if (!user_mode_vm(regs) && in_lock_functions(pc)) { -#ifdef CONFIG_FRAME_POINTER - return *(unsigned long *)(regs->bp + sizeof(long)); -#else - unsigned long *sp = (unsigned long *)regs->sp; - if (sp[0] >> 22) - return sp[0]; - if (sp[1] >> 22) - return sp[1]; -#endif - } - return pc; -} -EXPORT_SYMBOL(profile_pc); - -static irqreturn_t timer_interrupt(int irq, void *dev_id) -{ - inc_irq_stat(irq0_irqs); - - global_clock_event->event_handler(global_clock_event); - -#ifdef CONFIG_MCA - if (MCA_bus) { - u8 irq_v = inb_p(0x61); /* read the current state */ - outb_p(irq_v|0x80, 0x61); /* reset the IRQ */ - } -#endif - - return IRQ_HANDLED; -} - -/* calibrate_cpu is used on systems with fixed rate TSCs to determine - * processor frequency */ -#define TICK_COUNT 100000000 -unsigned long __init calibrate_cpu(void) -{ - int tsc_start, tsc_now; - int i, no_ctr_free; - unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0; - unsigned long flags; - - for (i = 0; i < 4; i++) - if (avail_to_resrv_perfctr_nmi_bit(i)) - break; - no_ctr_free = (i == 4); - if (no_ctr_free) { - WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... " - "cpu_khz value may be incorrect.\n"); - i = 3; - rdmsrl(MSR_K7_EVNTSEL3, evntsel3); - wrmsrl(MSR_K7_EVNTSEL3, 0); - rdmsrl(MSR_K7_PERFCTR3, pmc3); - } else { - reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i); - reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i); - } - local_irq_save(flags); - /* start measuring cycles, incrementing from 0 */ - wrmsrl(MSR_K7_PERFCTR0 + i, 0); - wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76); - rdtscl(tsc_start); - do { - rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); - tsc_now = get_cycles(); - } while ((tsc_now - tsc_start) < TICK_COUNT); - - local_irq_restore(flags); - if (no_ctr_free) { - wrmsrl(MSR_K7_EVNTSEL3, 0); - wrmsrl(MSR_K7_PERFCTR3, pmc3); - wrmsrl(MSR_K7_EVNTSEL3, evntsel3); - } else { - release_perfctr_nmi(MSR_K7_PERFCTR0 + i); - release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); - } - - return pmc_now * tsc_khz / (tsc_now - tsc_start); -} - -static struct irqaction irq0 = { - .handler = timer_interrupt, - .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING | IRQF_TIMER, - .name = "timer" -}; - -void __init hpet_time_init(void) -{ - if (!hpet_enable()) - setup_pit_timer(); - - setup_irq(0, &irq0); -} - -void __init time_init(void) -{ - tsc_init(); - - late_time_init = choose_time_init(); -} diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 8ccabb8a2f6..503c1f2e883 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -640,13 +640,13 @@ static int __init uv_ptc_init(void) if (!is_uv_system()) return 0; - proc_uv_ptc = create_proc_entry(UV_PTC_BASENAME, 0444, NULL); + proc_uv_ptc = proc_create(UV_PTC_BASENAME, 0444, NULL, + &proc_uv_ptc_operations); if (!proc_uv_ptc) { printk(KERN_ERR "unable to create %s proc entry\n", UV_PTC_BASENAME); return -EINVAL; } - proc_uv_ptc->proc_fops = &proc_uv_ptc_operations; return 0; } @@ -744,6 +744,7 @@ uv_activation_descriptor_init(int node, int pnode) * note that base_dest_nodeid is actually a nasid. */ ad2->header.base_dest_nodeid = uv_partition_base_pnode << 1; + ad2->header.dest_subnodeid = 0x10; /* the LB */ ad2->header.command = UV_NET_ENDPOINT_INTD; ad2->header.int_both = 1; /* diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c index 808031a5ba1..cd022121cab 100644 --- a/arch/x86/kernel/trampoline.c +++ b/arch/x86/kernel/trampoline.c @@ -3,8 +3,16 @@ #include <asm/trampoline.h> #include <asm/e820.h> +#if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP) +#define __trampinit +#define __trampinitdata +#else +#define __trampinit __cpuinit +#define __trampinitdata __cpuinitdata +#endif + /* ready for x86_64 and x86 */ -unsigned char *trampoline_base = __va(TRAMPOLINE_BASE); +unsigned char *__trampinitdata trampoline_base = __va(TRAMPOLINE_BASE); void __init reserve_trampoline_memory(void) { @@ -26,7 +34,7 @@ void __init reserve_trampoline_memory(void) * bootstrap into the page concerned. The caller * has made sure it's suitably aligned. */ -unsigned long setup_trampoline(void) +unsigned long __trampinit setup_trampoline(void) { memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); return virt_to_phys(trampoline_base); diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S index 66d874e5404..8508237e8e4 100644 --- a/arch/x86/kernel/trampoline_32.S +++ b/arch/x86/kernel/trampoline_32.S @@ -28,16 +28,12 @@ */ #include <linux/linkage.h> +#include <linux/init.h> #include <asm/segment.h> #include <asm/page_types.h> /* We can free up trampoline after bootup if cpu hotplug is not supported. */ -#ifndef CONFIG_HOTPLUG_CPU -.section ".cpuinit.data","aw",@progbits -#else -.section .rodata,"a",@progbits -#endif - +__CPUINITRODATA .code16 ENTRY(trampoline_data) diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S index cddfb8d386b..3af2dff58b2 100644 --- a/arch/x86/kernel/trampoline_64.S +++ b/arch/x86/kernel/trampoline_64.S @@ -25,14 +25,19 @@ */ #include <linux/linkage.h> +#include <linux/init.h> #include <asm/pgtable_types.h> #include <asm/page_types.h> #include <asm/msr.h> #include <asm/segment.h> #include <asm/processor-flags.h> +#ifdef CONFIG_ACPI_SLEEP .section .rodata, "a", @progbits - +#else +/* We can free up the trampoline after bootup if cpu hotplug is not supported. */ +__CPUINITRODATA +#endif .code16 ENTRY(trampoline_data) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 5204332f475..7e37dcee0cc 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -14,7 +14,6 @@ #include <linux/spinlock.h> #include <linux/kprobes.h> #include <linux/uaccess.h> -#include <linux/utsname.h> #include <linux/kdebug.h> #include <linux/kernel.h> #include <linux/module.h> @@ -59,12 +58,12 @@ #include <asm/mach_traps.h> #ifdef CONFIG_X86_64 +#include <asm/x86_init.h> #include <asm/pgalloc.h> #include <asm/proto.h> #else #include <asm/processor-flags.h> #include <asm/setup.h> -#include <asm/traps.h> asmlinkage int system_call(void); @@ -73,11 +72,9 @@ char ignore_fpu_irq; /* * The IDT has to be page-aligned to simplify the Pentium - * F0 0F bug workaround.. We have a special link segment - * for this. + * F0 0F bug workaround. */ -gate_desc idt_table[256] - __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; +gate_desc idt_table[NR_VECTORS] __page_aligned_data = { { { { 0, 0 } } }, }; #endif DECLARE_BITMAP(used_vectors, NR_VECTORS); @@ -786,33 +783,34 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) #endif } -#ifdef CONFIG_X86_32 -unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) +asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) { - struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id()); - unsigned long base = (kesp - uesp) & -THREAD_SIZE; - unsigned long new_kesp = kesp - base; - unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT; - __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS]; - - /* Set up base for espfix segment */ - desc &= 0x00f0ff0000000000ULL; - desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) | - ((((__u64)base) << 32) & 0xff00000000000000ULL) | - ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) | - (lim_pages & 0xffff); - *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc; - - return new_kesp; } -#endif -asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) +asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void) { } -asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void) +/* + * __math_state_restore assumes that cr0.TS is already clear and the + * fpu state is all ready for use. Used during context switch. + */ +void __math_state_restore(void) { + struct thread_info *thread = current_thread_info(); + struct task_struct *tsk = thread->task; + + /* + * Paranoid restore. send a SIGSEGV if we fail to restore the state. + */ + if (unlikely(restore_fpu_checking(tsk))) { + stts(); + force_sig(SIGSEGV, tsk); + return; + } + + thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ + tsk->fpu_counter++; } /* @@ -846,17 +844,8 @@ asmlinkage void math_state_restore(void) } clts(); /* Allow maths ops (or we recurse) */ - /* - * Paranoid restore. send a SIGSEGV if we fail to restore the state. - */ - if (unlikely(restore_fpu_checking(tsk))) { - stts(); - force_sig(SIGSEGV, tsk); - return; - } - thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ - tsk->fpu_counter++; + __math_state_restore(); } EXPORT_SYMBOL_GPL(math_state_restore); @@ -980,7 +969,5 @@ void __init trap_init(void) */ cpu_init(); -#ifdef CONFIG_X86_32 - x86_quirk_trap_init(); -#endif + x86_init.irqs.trap_init(); } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 6e1a368d21d..cd982f48e23 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -17,6 +17,8 @@ #include <asm/time.h> #include <asm/delay.h> #include <asm/hypervisor.h> +#include <asm/nmi.h> +#include <asm/x86_init.h> unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); @@ -275,15 +277,20 @@ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin) * use the TSC value at the transitions to calculate a pretty * good value for the TSC frequencty. */ +static inline int pit_verify_msb(unsigned char val) +{ + /* Ignore LSB */ + inb(0x42); + return inb(0x42) == val; +} + static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap) { int count; u64 tsc = 0; for (count = 0; count < 50000; count++) { - /* Ignore LSB */ - inb(0x42); - if (inb(0x42) != val) + if (!pit_verify_msb(val)) break; tsc = get_cycles(); } @@ -336,8 +343,7 @@ static unsigned long quick_pit_calibrate(void) * to do that is to just read back the 16-bit counter * once from the PIT. */ - inb(0x42); - inb(0x42); + pit_verify_msb(0); if (pit_expect_msb(0xff, &tsc, &d1)) { for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) { @@ -348,8 +354,19 @@ static unsigned long quick_pit_calibrate(void) * Iterate until the error is less than 500 ppm */ delta -= tsc; - if (d1+d2 < delta >> 11) - goto success; + if (d1+d2 >= delta >> 11) + continue; + + /* + * Check the PIT one more time to verify that + * all TSC reads were stable wrt the PIT. + * + * This also guarantees serialization of the + * last cycle read ('d2') in pit_expect_msb. + */ + if (!pit_verify_msb(0xfe - i)) + break; + goto success; } } printk("Fast TSC calibration failed\n"); @@ -385,15 +402,9 @@ unsigned long native_calibrate_tsc(void) { u64 tsc1, tsc2, delta, ref1, ref2; unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; - unsigned long flags, latch, ms, fast_calibrate, hv_tsc_khz; + unsigned long flags, latch, ms, fast_calibrate; int hpet = is_hpet_enabled(), i, loopmin; - hv_tsc_khz = get_hypervisor_tsc_freq(); - if (hv_tsc_khz) { - printk(KERN_INFO "TSC: Frequency read from the hypervisor\n"); - return hv_tsc_khz; - } - local_irq_save(flags); fast_calibrate = quick_pit_calibrate(); local_irq_restore(flags); @@ -551,7 +562,7 @@ int recalibrate_cpu_khz(void) unsigned long cpu_khz_old = cpu_khz; if (cpu_has_tsc) { - tsc_khz = calibrate_tsc(); + tsc_khz = x86_platform.calibrate_tsc(); cpu_khz = tsc_khz; cpu_data(0).loops_per_jiffy = cpufreq_scale(cpu_data(0).loops_per_jiffy, @@ -655,7 +666,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || (val == CPUFREQ_RESUMECHANGE)) { - *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); + *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); if (!(freq->flags & CPUFREQ_CONST_LOOPS)) @@ -729,10 +740,16 @@ static cycle_t __vsyscall_fn vread_tsc(void) } #endif +static void resume_tsc(void) +{ + clocksource_tsc.cycle_last = 0; +} + static struct clocksource clocksource_tsc = { .name = "tsc", .rating = 300, .read = read_tsc, + .resume = resume_tsc, .mask = CLOCKSOURCE_MASK(64), .shift = 22, .flags = CLOCK_SOURCE_IS_CONTINUOUS | @@ -746,12 +763,14 @@ void mark_tsc_unstable(char *reason) { if (!tsc_unstable) { tsc_unstable = 1; - printk("Marking TSC unstable due to %s\n", reason); + printk(KERN_INFO "Marking TSC unstable due to %s\n", reason); /* Change only the rating, when not registered */ if (clocksource_tsc.mult) - clocksource_change_rating(&clocksource_tsc, 0); - else + clocksource_mark_unstable(&clocksource_tsc); + else { + clocksource_tsc.flags |= CLOCK_SOURCE_UNSTABLE; clocksource_tsc.rating = 0; + } } } @@ -837,15 +856,71 @@ static void __init init_tsc_clocksource(void) clocksource_register(&clocksource_tsc); } +#ifdef CONFIG_X86_64 +/* + * calibrate_cpu is used on systems with fixed rate TSCs to determine + * processor frequency + */ +#define TICK_COUNT 100000000 +static unsigned long __init calibrate_cpu(void) +{ + int tsc_start, tsc_now; + int i, no_ctr_free; + unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0; + unsigned long flags; + + for (i = 0; i < 4; i++) + if (avail_to_resrv_perfctr_nmi_bit(i)) + break; + no_ctr_free = (i == 4); + if (no_ctr_free) { + WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... " + "cpu_khz value may be incorrect.\n"); + i = 3; + rdmsrl(MSR_K7_EVNTSEL3, evntsel3); + wrmsrl(MSR_K7_EVNTSEL3, 0); + rdmsrl(MSR_K7_PERFCTR3, pmc3); + } else { + reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i); + reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i); + } + local_irq_save(flags); + /* start measuring cycles, incrementing from 0 */ + wrmsrl(MSR_K7_PERFCTR0 + i, 0); + wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76); + rdtscl(tsc_start); + do { + rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); + tsc_now = get_cycles(); + } while ((tsc_now - tsc_start) < TICK_COUNT); + + local_irq_restore(flags); + if (no_ctr_free) { + wrmsrl(MSR_K7_EVNTSEL3, 0); + wrmsrl(MSR_K7_PERFCTR3, pmc3); + wrmsrl(MSR_K7_EVNTSEL3, evntsel3); + } else { + release_perfctr_nmi(MSR_K7_PERFCTR0 + i); + release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); + } + + return pmc_now * tsc_khz / (tsc_now - tsc_start); +} +#else +static inline unsigned long calibrate_cpu(void) { return cpu_khz; } +#endif + void __init tsc_init(void) { u64 lpj; int cpu; + x86_init.timers.tsc_pre_init(); + if (!cpu_has_tsc) return; - tsc_khz = calibrate_tsc(); + tsc_khz = x86_platform.calibrate_tsc(); cpu_khz = tsc_khz; if (!tsc_khz) { @@ -853,11 +928,9 @@ void __init tsc_init(void) return; } -#ifdef CONFIG_X86_64 if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) && (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)) cpu_khz = calibrate_cpu(); -#endif printk("Detected %lu.%03lu MHz processor.\n", (unsigned long)cpu_khz / 1000, diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 027b5b49899..f37930954d1 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -114,7 +114,7 @@ void __cpuinit check_tsc_sync_source(int cpu) return; if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { - pr_info("Skipping synchronization checks as TSC is reliable.\n"); + printk_once(KERN_INFO "Skipping synchronization checks as TSC is reliable.\n"); return; } diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 31ffc24eec4..f068553a1b1 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -30,6 +30,7 @@ #include <asm/setup.h> #include <asm/apic.h> #include <asm/e820.h> +#include <asm/time.h> #include <asm/io.h> #include <linux/kernel_stat.h> @@ -53,7 +54,7 @@ int is_visws_box(void) return visws_board_type >= 0; } -static int __init visws_time_init(void) +static void __init visws_time_init(void) { printk(KERN_INFO "Starting Cobalt Timer system clock\n"); @@ -66,21 +67,13 @@ static int __init visws_time_init(void) /* Enable (unmask) the timer interrupt */ co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK); - /* - * Zero return means the generic timer setup code will set up - * the standard vector: - */ - return 0; + setup_default_timer_irq(); } -static int __init visws_pre_intr_init(void) +/* Replaces the default init_ISA_irqs in the generic setup */ +static void __init visws_pre_intr_init(void) { init_VISWS_APIC_irqs(); - - /* - * We dont want ISA irqs to be set up by the generic code: - */ - return 1; } /* Quirk for machine specific memory setup. */ @@ -156,12 +149,8 @@ static void visws_machine_power_off(void) outl(PIIX_SPECIAL_STOP, 0xCFC); } -static int __init visws_get_smp_config(unsigned int early) +static void __init visws_get_smp_config(unsigned int early) { - /* - * Prevent MP-table parsing by the generic code: - */ - return 1; } /* @@ -208,7 +197,7 @@ static void __init MP_processor_info(struct mpc_cpu *m) apic_version[m->apicid] = ver; } -static int __init visws_find_smp_config(unsigned int reserve) +static void __init visws_find_smp_config(unsigned int reserve) { struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS); unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); @@ -230,21 +219,9 @@ static int __init visws_find_smp_config(unsigned int reserve) MP_processor_info(mp++); mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - - return 1; } -static int visws_trap_init(void); - -static struct x86_quirks visws_x86_quirks __initdata = { - .arch_time_init = visws_time_init, - .arch_pre_intr_init = visws_pre_intr_init, - .arch_memory_setup = visws_memory_setup, - .arch_intr_init = NULL, - .arch_trap_init = visws_trap_init, - .mach_get_smp_config = visws_get_smp_config, - .mach_find_smp_config = visws_find_smp_config, -}; +static void visws_trap_init(void); void __init visws_early_detect(void) { @@ -257,11 +234,14 @@ void __init visws_early_detect(void) return; /* - * Install special quirks for timer, interrupt and memory setup: - * Fall back to generic behavior for traps: - * Override generic MP-table parsing: + * Override the default platform setup functions */ - x86_quirks = &visws_x86_quirks; + x86_init.resources.memory_setup = visws_memory_setup; + x86_init.mpparse.get_smp_config = visws_get_smp_config; + x86_init.mpparse.find_smp_config = visws_find_smp_config; + x86_init.irqs.pre_vector_init = visws_pre_intr_init; + x86_init.irqs.trap_init = visws_trap_init; + x86_init.timers.timer_init = visws_time_init; /* * Install reboot quirks: @@ -400,12 +380,10 @@ static __init void cobalt_init(void) co_apic_read(CO_APIC_ID)); } -static int __init visws_trap_init(void) +static void __init visws_trap_init(void) { lithium_init(); cobalt_init(); - - return 1; } /* diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index b263423fbe2..d430e4c3019 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -441,7 +441,7 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip, ap.ds = __USER_DS; ap.es = __USER_DS; ap.fs = __KERNEL_PERCPU; - ap.gs = 0; + ap.gs = __KERNEL_STACK_CANARY; ap.eflags = 0; @@ -648,7 +648,7 @@ static inline int __init activate_vmi(void) pv_info.paravirt_enabled = 1; pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK; - pv_info.name = "vmi"; + pv_info.name = "vmi [deprecated]"; pv_init_ops.patch = vmi_patch; @@ -817,15 +817,15 @@ static inline int __init activate_vmi(void) vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm); vmi_timer_ops.cancel_alarm = vmi_get_function(VMI_CALL_CancelAlarm); - pv_time_ops.time_init = vmi_time_init; - pv_time_ops.get_wallclock = vmi_get_wallclock; - pv_time_ops.set_wallclock = vmi_set_wallclock; + x86_init.timers.timer_init = vmi_time_init; #ifdef CONFIG_X86_LOCAL_APIC - pv_apic_ops.setup_boot_clock = vmi_time_bsp_init; - pv_apic_ops.setup_secondary_clock = vmi_time_ap_init; + x86_init.timers.setup_percpu_clockev = vmi_time_bsp_init; + x86_cpuinit.setup_percpu_clockev = vmi_time_ap_init; #endif pv_time_ops.sched_clock = vmi_sched_clock; - pv_time_ops.get_tsc_khz = vmi_tsc_khz; + x86_platform.calibrate_tsc = vmi_tsc_khz; + x86_platform.get_wallclock = vmi_get_wallclock; + x86_platform.set_wallclock = vmi_set_wallclock; /* We have true wallclock functions; disable CMOS clock sync */ no_sync_cmos_clock = 1; diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index 2b3eb82efee..611b9e2360d 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c @@ -68,7 +68,7 @@ unsigned long long vmi_sched_clock(void) return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE)); } -/* paravirt_ops.get_tsc_khz = vmi_tsc_khz */ +/* x86_platform.calibrate_tsc = vmi_tsc_khz */ unsigned long vmi_tsc_khz(void) { unsigned long long khz; diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 59f31d2dd43..92929fb3f9f 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -45,12 +45,11 @@ PHDRS { text PT_LOAD FLAGS(5); /* R_E */ data PT_LOAD FLAGS(7); /* RWE */ #ifdef CONFIG_X86_64 - user PT_LOAD FLAGS(7); /* RWE */ - data.init PT_LOAD FLAGS(7); /* RWE */ + user PT_LOAD FLAGS(5); /* R_E */ #ifdef CONFIG_SMP - percpu PT_LOAD FLAGS(7); /* RWE */ + percpu PT_LOAD FLAGS(6); /* RW_ */ #endif - data.init2 PT_LOAD FLAGS(7); /* RWE */ + init PT_LOAD FLAGS(7); /* RWE */ #endif note PT_NOTE FLAGS(0); /* ___ */ } @@ -66,17 +65,11 @@ SECTIONS #endif /* Text and read-only data */ - - /* bootstrapping code */ - .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) { - _text = .; - *(.text.head) - } :text = 0x9090 - - /* The rest of the text */ .text : AT(ADDR(.text) - LOAD_OFFSET) { + _text = .; + /* bootstrapping code */ + HEAD_TEXT #ifdef CONFIG_X86_32 - /* not really needed, already page aligned */ . = ALIGN(PAGE_SIZE); *(.text.page_aligned) #endif @@ -95,87 +88,55 @@ SECTIONS NOTES :text :note - /* Exception table */ - . = ALIGN(16); - __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { - __start___ex_table = .; - *(__ex_table) - __stop___ex_table = .; - } :text = 0x9090 + EXCEPTION_TABLE(16) :text = 0x9090 - RODATA + RO_DATA(PAGE_SIZE) /* Data */ - . = ALIGN(PAGE_SIZE); .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Start of data section */ _sdata = .; - DATA_DATA - CONSTRUCTORS - } :data + + /* init_task */ + INIT_TASK_DATA(THREAD_SIZE) #ifdef CONFIG_X86_32 - /* 32 bit has nosave before _edata */ - . = ALIGN(PAGE_SIZE); - .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { - __nosave_begin = .; - *(.data.nosave) - . = ALIGN(PAGE_SIZE); - __nosave_end = .; - } + /* 32 bit has nosave before _edata */ + NOSAVE_DATA #endif - . = ALIGN(PAGE_SIZE); - .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { - *(.data.page_aligned) - *(.data.idt) - } + PAGE_ALIGNED_DATA(PAGE_SIZE) -#ifdef CONFIG_X86_32 - . = ALIGN(32); -#else - . = ALIGN(PAGE_SIZE); - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); -#endif - .data.cacheline_aligned : - AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { - *(.data.cacheline_aligned) - } + CACHELINE_ALIGNED_DATA(CONFIG_X86_L1_CACHE_BYTES) - /* rarely changed data like cpu maps */ -#ifdef CONFIG_X86_32 - . = ALIGN(32); -#else - . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); -#endif - .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { - *(.data.read_mostly) + DATA_DATA + CONSTRUCTORS + + /* rarely changed data like cpu maps */ + READ_MOSTLY_DATA(CONFIG_X86_INTERNODE_CACHE_BYTES) /* End of data section */ _edata = .; - } + } :data #ifdef CONFIG_X86_64 #define VSYSCALL_ADDR (-10*1024*1024) -#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \ - SIZEOF(.data.read_mostly) + 4095) & ~(4095)) -#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \ - SIZEOF(.data.read_mostly) + 4095) & ~(4095)) -#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) +#define VLOAD_OFFSET (VSYSCALL_ADDR - __vsyscall_0 + LOAD_OFFSET) #define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) -#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR) +#define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0) #define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) + . = ALIGN(4096); + __vsyscall_0 = .; + . = VSYSCALL_ADDR; - .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { + .vsyscall_0 : AT(VLOAD(.vsyscall_0)) { *(.vsyscall_0) } :user - __vsyscall_0 = VSYSCALL_VIRT_ADDR; - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) @@ -215,11 +176,9 @@ SECTIONS *(.vsyscall_3) } - . = VSYSCALL_VIRT_ADDR + PAGE_SIZE; + . = __vsyscall_0 + PAGE_SIZE; #undef VSYSCALL_ADDR -#undef VSYSCALL_PHYS_ADDR -#undef VSYSCALL_VIRT_ADDR #undef VLOAD_OFFSET #undef VLOAD #undef VVIRT_OFFSET @@ -227,57 +186,27 @@ SECTIONS #endif /* CONFIG_X86_64 */ - /* init_task */ - . = ALIGN(THREAD_SIZE); - .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { - *(.data.init_task) - } -#ifdef CONFIG_X86_64 - :data.init -#endif - - /* - * smp_locks might be freed after init - * start/end must be page aligned - */ - . = ALIGN(PAGE_SIZE); - .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { - __smp_locks = .; - *(.smp_locks) - __smp_locks_end = .; - . = ALIGN(PAGE_SIZE); - } - /* Init code and data - will be freed after init */ . = ALIGN(PAGE_SIZE); - .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { + .init.begin : AT(ADDR(.init.begin) - LOAD_OFFSET) { __init_begin = .; /* paired with __init_end */ - _sinittext = .; - INIT_TEXT - _einittext = .; } - .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { - INIT_DATA - } +#if defined(CONFIG_X86_64) && defined(CONFIG_SMP) + /* + * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the + * output PHDR, so the next output section - .init.text - should + * start another segment - init. + */ + PERCPU_VADDR(0, :percpu) +#endif - . = ALIGN(16); - .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { - __setup_start = .; - *(.init.setup) - __setup_end = .; - } - .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { - __initcall_start = .; - INITCALLS - __initcall_end = .; - } + INIT_TEXT_SECTION(PAGE_SIZE) +#ifdef CONFIG_X86_64 + :init +#endif - .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { - __con_initcall_start = .; - *(.con_initcall.init) - __con_initcall_end = .; - } + INIT_DATA_SECTION(16) .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { __x86_cpu_dev_start = .; @@ -285,8 +214,6 @@ SECTIONS __x86_cpu_dev_end = .; } - SECURITY_INIT - . = ALIGN(8); .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { __parainstructions = .; @@ -317,26 +244,7 @@ SECTIONS EXIT_DATA } -#ifdef CONFIG_BLK_DEV_INITRD - . = ALIGN(PAGE_SIZE); - .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { - __initramfs_start = .; - *(.init.ramfs) - __initramfs_end = .; - } -#endif - -#if defined(CONFIG_X86_64) && defined(CONFIG_SMP) - /* - * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the - * output PHDR, so the next output section - __data_nosave - should - * start another section data.init2. Also, pda should be at the head of - * percpu area. Preallocate it and define the percpu offset symbol - * so that it can be accessed as a percpu variable. - */ - . = ALIGN(PAGE_SIZE); - PERCPU_VADDR(0, :percpu) -#else +#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) PERCPU(PAGE_SIZE) #endif @@ -347,15 +255,22 @@ SECTIONS __init_end = .; } + /* + * smp_locks might be freed after init + * start/end must be page aligned + */ + . = ALIGN(PAGE_SIZE); + .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { + __smp_locks = .; + *(.smp_locks) + __smp_locks_end = .; + . = ALIGN(PAGE_SIZE); + } + #ifdef CONFIG_X86_64 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - __nosave_begin = .; - *(.data.nosave) - . = ALIGN(PAGE_SIZE); - __nosave_end = .; - } :data.init2 - /* use another section data.init2, see PERCPU_VADDR() above */ + NOSAVE_DATA + } #endif /* BSS */ @@ -380,21 +295,18 @@ SECTIONS _end = .; } - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) - *(.eh_frame) - *(.discard) - } - STABS_DEBUG DWARF_DEBUG + + /* Sections to be discarded */ + DISCARDS + /DISCARD/ : { *(.eh_frame) } } #ifdef CONFIG_X86_32 -ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), - "kernel image bigger than KERNEL_IMAGE_SIZE") +. = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), + "kernel image bigger than KERNEL_IMAGE_SIZE"); #else /* * Per-cpu symbols which need to be offset from __per_cpu_load @@ -407,12 +319,12 @@ INIT_PER_CPU(irq_stack_union); /* * Build-time check on the image size: */ -ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), - "kernel image bigger than KERNEL_IMAGE_SIZE") +. = ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), + "kernel image bigger than KERNEL_IMAGE_SIZE"); #ifdef CONFIG_SMP -ASSERT((per_cpu__irq_stack_union == 0), - "irq_stack_union is not at start of per-cpu area"); +. = ASSERT((per_cpu__irq_stack_union == 0), + "irq_stack_union is not at start of per-cpu area"); #endif #endif /* CONFIG_X86_32 */ @@ -420,7 +332,7 @@ ASSERT((per_cpu__irq_stack_union == 0), #ifdef CONFIG_KEXEC #include <asm/kexec.h> -ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, - "kexec control code size is too big") +. = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, + "kexec control code size is too big"); #endif diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 25ee06a80aa..8cb4974ff59 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -87,6 +87,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; + vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } @@ -227,19 +228,11 @@ static long __vsyscall(3) venosys_1(void) } #ifdef CONFIG_SYSCTL - -static int -vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return proc_dointvec(ctl, write, filp, buffer, lenp, ppos); -} - static ctl_table kernel_table2[] = { { .procname = "vsyscall64", .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = vsyscall_sysctl_change }, + .proc_handler = proc_dointvec }, {} }; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c new file mode 100644 index 00000000000..4449a4a2c2e --- /dev/null +++ b/arch/x86/kernel/x86_init.c @@ -0,0 +1,75 @@ +/* + * Copyright (C) 2009 Thomas Gleixner <tglx@linutronix.de> + * + * For licencing details see kernel-base/COPYING + */ +#include <linux/init.h> + +#include <asm/bios_ebda.h> +#include <asm/paravirt.h> +#include <asm/mpspec.h> +#include <asm/setup.h> +#include <asm/apic.h> +#include <asm/e820.h> +#include <asm/time.h> +#include <asm/irq.h> +#include <asm/tsc.h> + +void __cpuinit x86_init_noop(void) { } +void __init x86_init_uint_noop(unsigned int unused) { } +void __init x86_init_pgd_noop(pgd_t *unused) { } + +/* + * The platform setup functions are preset with the default functions + * for standard PC hardware. + */ +struct x86_init_ops x86_init __initdata = { + + .resources = { + .probe_roms = x86_init_noop, + .reserve_resources = reserve_standard_io_resources, + .memory_setup = default_machine_specific_memory_setup, + }, + + .mpparse = { + .mpc_record = x86_init_uint_noop, + .setup_ioapic_ids = x86_init_noop, + .mpc_apic_id = default_mpc_apic_id, + .smp_read_mpc_oem = default_smp_read_mpc_oem, + .mpc_oem_bus_info = default_mpc_oem_bus_info, + .find_smp_config = default_find_smp_config, + .get_smp_config = default_get_smp_config, + }, + + .irqs = { + .pre_vector_init = init_ISA_irqs, + .intr_init = native_init_IRQ, + .trap_init = x86_init_noop, + }, + + .oem = { + .arch_setup = x86_init_noop, + .banner = default_banner, + }, + + .paging = { + .pagetable_setup_start = native_pagetable_setup_start, + .pagetable_setup_done = native_pagetable_setup_done, + }, + + .timers = { + .setup_percpu_clockev = setup_boot_APIC_clock, + .tsc_pre_init = x86_init_noop, + .timer_init = hpet_time_init, + }, +}; + +struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { + .setup_percpu_clockev = setup_secondary_APIC_clock, +}; + +struct x86_platform_ops x86_platform = { + .calibrate_tsc = native_calibrate_tsc, + .get_wallclock = mach_get_cmos_time, + .set_wallclock = mach_set_rtc_mmss, +}; |