Diffstat (limited to 'arch/x86')
79 files changed, 1054 insertions, 965 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7109037bdf7..c95482b6b6d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -18,6 +18,8 @@ config X86_64 ### Arch settings config X86 def_bool y + select HAVE_OPROFILE + select HAVE_KPROBES config GENERIC_LOCKBREAK def_bool n @@ -103,13 +105,12 @@ config GENERIC_TIME_VSYSCALL bool default X86_64 +config ARCH_HAS_CPU_RELAX + def_bool y + config HAVE_SETUP_PER_CPU_AREA def_bool X86_64 -config ARCH_SUPPORTS_OPROFILE - bool - default y - select HAVE_KVM config ARCH_HIBERNATION_POSSIBLE @@ -204,8 +205,7 @@ config SMP Y to "Enhanced Real Time Clock Support", below. The "Advanced Power Management" code will be disabled if you say Y here. - See also the <file:Documentation/smp.txt>, - <file:Documentation/i386/IO-APIC.txt>, + See also <file:Documentation/i386/IO-APIC.txt>, <file:Documentation/nmi_watchdog.txt> and the SMP-HOWTO available at <http://www.tldp.org/docs.html#howto>. @@ -309,6 +309,7 @@ config X86_RDC321X select M486 select X86_REBOOTFIXUPS select GENERIC_GPIO + select LEDS_CLASS select LEDS_GPIO help This option is needed for RDC R-321x system-on-chip, also known @@ -417,7 +418,7 @@ config HPET_TIMER config HPET_EMULATE_RTC def_bool y - depends on HPET_TIMER && (RTC=y || RTC=m) + depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y) # Mark as embedded because too many people got it wrong. # The code disables itself when not needed. @@ -467,6 +468,9 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT Calgary anyway, pass 'iommu=calgary' on the kernel command line. If unsure, say Y. +config IOMMU_HELPER + def_bool (CALGARY_IOMMU || GART_IOMMU) + # need this always selected by IOMMU for the VIA workaround config SWIOTLB bool @@ -630,7 +634,6 @@ config TOSHIBA config I8K tristate "Dell laptop support" - depends on X86_32 ---help--- This adds a driver to safely access the System Management Mode of the CPU on the Dell Inspiron 8000. The System Management Mode @@ -1597,8 +1600,6 @@ source "drivers/firmware/Kconfig" source "fs/Kconfig" -source "kernel/Kconfig.instrumentation" - source "arch/x86/Kconfig.debug" source "security/Kconfig" diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 2e1e3af28c3..fa555148823 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -220,9 +220,9 @@ config DEBUG_BOOT_PARAMS This option will cause struct boot_params to be exported via debugfs. config CPA_DEBUG - bool "CPA self test code" + bool "CPA self-test code" depends on DEBUG_KERNEL help - Do change_page_attr self tests at boot. + Do change_page_attr() self-tests every 30 seconds. 
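The new IOMMU_HELPER symbol above gates the generic IOMMU bitmap allocator that the Calgary and GART code is converted to further down in this diff (iommu_area_alloc()/iommu_area_free()). A minimal user-space sketch of the boundary_size arithmetic those call sites now perform, assuming 4 KiB pages, a 64-bit unsigned long, and the default 4 GiB DMA segment boundary mask 0xffffffff returned by dma_get_seg_boundary(); the macros are local stand-ins, not the kernel's:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))	/* a: power of two */

int main(void)
{
	/* dma_get_seg_boundary() returns an inclusive mask; +1 gives the
	 * boundary itself, which is then converted to a page count. */
	unsigned long seg_boundary = 0xffffffffUL;
	unsigned long boundary_size =
		ALIGN(seg_boundary + 1, PAGE_SIZE) >> PAGE_SHIFT;

	/* Prints 0x100000: no allocation may straddle a 4 GiB line. */
	printf("boundary_size = %#lx pages\n", boundary_size);
	return 0;
}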
endmenu diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 8978e98bed5..364865b1b08 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -92,7 +92,6 @@ KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) LDFLAGS := -m elf_$(UTS_MACHINE) -OBJCOPYFLAGS := -O binary -R .note -R .comment -S # Speed up the build KBUILD_CFLAGS += -pipe diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 349b81a39c4..f88458e83ef 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -26,7 +26,7 @@ SVGA_MODE := -DSVGA_MODE=NORMAL_VGA #RAMDISK := -DRAMDISK=512 targets := vmlinux.bin setup.bin setup.elf zImage bzImage -subdir- := compressed +subdir- := compressed setup-y += a20.o cmdline.o copy.o cpu.o cpucheck.o edd.o setup-y += header.o main.o mca.o memory.o pm.o pmjump.o @@ -43,9 +43,17 @@ setup-y += video-vesa.o setup-y += video-bios.o targets += $(setup-y) -hostprogs-y := tools/build +hostprogs-y := mkcpustr tools/build -HOSTCFLAGS_build.o := $(LINUXINCLUDE) +HOST_EXTRACFLAGS += $(LINUXINCLUDE) + +$(obj)/cpu.o: $(obj)/cpustr.h + +quiet_cmd_cpustr = CPUSTR $@ + cmd_cpustr = $(obj)/mkcpustr > $@ +targets += cpustr.h +$(obj)/cpustr.h: $(obj)/mkcpustr FORCE + $(call if_changed,cpustr) # --------------------------------------------------------------------------- @@ -80,6 +88,7 @@ $(obj)/zImage $(obj)/bzImage: $(obj)/setup.bin \ $(call if_changed,image) @echo 'Kernel: $@ is ready' ' (#'`cat .version`')' +OBJCOPYFLAGS_vmlinux.bin := -O binary -R .note -R .comment -S $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE $(call if_changed,objcopy) @@ -90,7 +99,6 @@ $(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE $(call if_changed,ld) OBJCOPYFLAGS_setup.bin := -O binary - $(obj)/setup.bin: $(obj)/setup.elf FORCE $(call if_changed,objcopy) @@ -98,7 +106,7 @@ $(obj)/compressed/vmlinux: FORCE $(Q)$(MAKE) $(build)=$(obj)/compressed IMAGE_OFFSET=$(IMAGE_OFFSET) $@ # Set this if you want to pass append arguments to the zdisk/fdimage/isoimage kernel -FDARGS = +FDARGS = # Set this if you want an initrd included with the zdisk/fdimage/isoimage kernel FDINITRD = diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index fe24ceabd90..d2b9f3bb87c 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -22,6 +22,7 @@ $(obj)/vmlinux: $(src)/vmlinux_$(BITS).lds $(obj)/head_$(BITS).o $(obj)/misc.o $ $(call if_changed,ld) @: +OBJCOPYFLAGS_vmlinux.bin := -O binary -R .note -R .comment -S $(obj)/vmlinux.bin: vmlinux FORCE $(call if_changed,objcopy) diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 1ccb38a7f0d..e8657b98c90 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -80,8 +80,8 @@ startup_32: #ifdef CONFIG_RELOCATABLE movl %ebp, %ebx - addl $(LARGE_PAGE_SIZE -1), %ebx - andl $LARGE_PAGE_MASK, %ebx + addl $(PMD_PAGE_SIZE -1), %ebx + andl $PMD_PAGE_MASK, %ebx #else movl $CONFIG_PHYSICAL_START, %ebx #endif @@ -220,8 +220,8 @@ ENTRY(startup_64) /* Start with the delta to where the kernel will run at. 
*/ #ifdef CONFIG_RELOCATABLE leaq startup_32(%rip) /* - $startup_32 */, %rbp - addq $(LARGE_PAGE_SIZE - 1), %rbp - andq $LARGE_PAGE_MASK, %rbp + addq $(PMD_PAGE_SIZE - 1), %rbp + andq $PMD_PAGE_MASK, %rbp movq %rbp, %rbx #else movq $CONFIG_PHYSICAL_START, %rbp diff --git a/arch/x86/boot/cpu.c b/arch/x86/boot/cpu.c index 2a5c32da585..00e19edd852 100644 --- a/arch/x86/boot/cpu.c +++ b/arch/x86/boot/cpu.c @@ -1,7 +1,7 @@ /* -*- linux-c -*- ------------------------------------------------------- * * * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2007-2008 rPath, Inc. - All Rights Reserved * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. @@ -9,7 +9,7 @@ * ----------------------------------------------------------------------- */ /* - * arch/i386/boot/cpu.c + * arch/x86/boot/cpu.c * * Check for obligatory CPU features and abort if the features are not * present. @@ -19,6 +19,8 @@ #include "bitops.h" #include <asm/cpufeature.h> +#include "cpustr.h" + static char *cpu_name(int level) { static char buf[6]; @@ -35,6 +37,7 @@ int validate_cpu(void) { u32 *err_flags; int cpu_level, req_level; + const unsigned char *msg_strs; check_cpu(&cpu_level, &req_level, &err_flags); @@ -51,13 +54,26 @@ int validate_cpu(void) puts("This kernel requires the following features " "not present on the CPU:\n"); + msg_strs = (const unsigned char *)x86_cap_strs; + for (i = 0; i < NCAPINTS; i++) { u32 e = err_flags[i]; for (j = 0; j < 32; j++) { - if (e & 1) - printf("%d:%d ", i, j); - + int n = (i << 5)+j; + if (*msg_strs < n) { + /* Skip to the next string */ + do { + msg_strs++; + } while (*msg_strs); + msg_strs++; + } + if (e & 1) { + if (*msg_strs == n && msg_strs[1]) + printf("%s ", msg_strs+1); + else + printf("%d:%d ", i, j); + } e >>= 1; } } diff --git a/arch/x86/boot/mkcpustr.c b/arch/x86/boot/mkcpustr.c new file mode 100644 index 00000000000..bbe76953bae --- /dev/null +++ b/arch/x86/boot/mkcpustr.c @@ -0,0 +1,49 @@ +/* ----------------------------------------------------------------------- * + * + * Copyright 2008 rPath, Inc. - All Rights Reserved + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2 or (at your + * option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * This is a host program to preprocess the CPU strings into a + * compact format suitable for the setup code. 
+ */ + +#include <stdio.h> + +#include "../kernel/cpu/feature_names.c" + +#if NCAPFLAGS > 8 +# error "Need to adjust the boot code handling of CPUID strings" +#endif + +int main(void) +{ + int i; + const char *str; + + printf("static const char x86_cap_strs[] = \n"); + + for (i = 0; i < NCAPINTS*32; i++) { + str = x86_cap_flags[i]; + + if (i == NCAPINTS*32-1) { + /* The last entry must be unconditional; this + also consumes the compiler-added null character */ + if (!str) + str = ""; + printf("\t\"\\x%02x\"\"%s\"\n", i, str); + } else if (str) { + printf("#if REQUIRED_MASK%d & (1 << %d)\n" + "\t\"\\x%02x\"\"%s\\0\"\n" + "#endif\n", + i >> 5, i & 31, i, str); + } + } + printf("\t;\n"); + return 0; +} diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index e4c12079171..58cccb6483b 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -172,8 +172,7 @@ static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, has_dumped = 1; current->flags |= PF_DUMPCORE; strncpy(dump.u_comm, current->comm, sizeof(current->comm)); - dump.u_ar0 = (u32)(((unsigned long)(&dump.regs)) - - ((unsigned long)(&dump))); + dump.u_ar0 = offsetof(struct user32, regs); dump.signal = signr; dump_thread32(regs, &dump); diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 0db0a6291bb..8022d3c695c 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -722,7 +722,9 @@ ia32_sys_call_table: .quad sys_epoll_pwait .quad compat_sys_utimensat /* 320 */ .quad compat_sys_signalfd - .quad compat_sys_timerfd + .quad sys_timerfd_create .quad sys_eventfd .quad sys32_fallocate + .quad compat_sys_timerfd_settime /* 325 */ + .quad compat_sys_timerfd_gettime ia32_syscall_end: diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 6f813009d44..21dc1a061bf 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -37,7 +37,8 @@ obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_CPUID) += cpuid.o obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_PCI) += early-quirks.o -obj-$(CONFIG_APM) += apm_32.o +apm-y := apm_32.o +obj-$(CONFIG_APM) += apm.o obj-$(CONFIG_X86_SMP) += smp_$(BITS).o smpboot_$(BITS).o tsc_sync.o obj-$(CONFIG_X86_32_SMP) += smpcommon_32.o obj-$(CONFIG_X86_64_SMP) += smp_64.o smpboot_64.o tsc_sync.o @@ -74,7 +75,8 @@ ifdef CONFIG_INPUT_PCSPKR obj-y += pcspeaker.o endif -obj-$(CONFIG_SCx200) += scx200_32.o +obj-$(CONFIG_SCx200) += scx200.o +scx200-y += scx200_32.o ### # 64 bit specific files diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index d2a58431a07..680b7300a48 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -78,7 +78,6 @@ int acpi_ht __initdata = 1; /* enable HT */ int acpi_lapic; int acpi_ioapic; int acpi_strict; -EXPORT_SYMBOL(acpi_strict); u8 acpi_sci_flags __initdata; int acpi_sci_override_gsi __initdata; @@ -106,7 +105,7 @@ enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC; #ifdef CONFIG_X86_64 /* rely on all ACPI tables being in the direct mapping */ -char *__acpi_map_table(unsigned long phys_addr, unsigned long size) +char *__init __acpi_map_table(unsigned long phys_addr, unsigned long size) { if (!phys_addr || !size) return NULL; @@ -131,7 +130,7 @@ char *__acpi_map_table(unsigned long phys_addr, unsigned long size) * from the fixed base. That's why we start at FIX_IO_APIC_BASE_END and * count idx down while incrementing the phys address. 
*/ -char *__acpi_map_table(unsigned long phys, unsigned long size) +char *__init __acpi_map_table(unsigned long phys, unsigned long size) { unsigned long base, offset, mapped_size; int idx; @@ -490,8 +489,6 @@ int acpi_register_gsi(u32 gsi, int triggering, int polarity) return irq; } -EXPORT_SYMBOL(acpi_register_gsi); - /* * ACPI based hotplug support for CPU */ @@ -587,25 +584,6 @@ int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base) EXPORT_SYMBOL(acpi_unregister_ioapic); -static unsigned long __init -acpi_scan_rsdp(unsigned long start, unsigned long length) -{ - unsigned long offset = 0; - unsigned long sig_len = sizeof("RSD PTR ") - 1; - - /* - * Scan all 16-byte boundaries of the physical memory region for the - * RSDP signature. - */ - for (offset = 0; offset < length; offset += 16) { - if (strncmp((char *)(phys_to_virt(start) + offset), "RSD PTR ", sig_len)) - continue; - return (start + offset); - } - - return 0; -} - static int __init acpi_parse_sbf(struct acpi_table_header *table) { struct acpi_table_boot *sb; @@ -748,27 +726,6 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table) return 0; } -unsigned long __init acpi_find_rsdp(void) -{ - unsigned long rsdp_phys = 0; - - if (efi_enabled) { - if (efi.acpi20 != EFI_INVALID_TABLE_ADDR) - return efi.acpi20; - else if (efi.acpi != EFI_INVALID_TABLE_ADDR) - return efi.acpi; - } - /* - * Scan memory looking for the RSDP signature. First search EBDA (low - * memory) paragraphs and then search upper memory (E0000-FFFFF). - */ - rsdp_phys = acpi_scan_rsdp(0, 0x400); - if (!rsdp_phys) - rsdp_phys = acpi_scan_rsdp(0xE0000, 0x20000); - - return rsdp_phys; -} - #ifdef CONFIG_X86_LOCAL_APIC /* * Parse LAPIC entries in MADT diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c index a25db514c71..324eb0cab19 100644 --- a/arch/x86/kernel/acpi/processor.c +++ b/arch/x86/kernel/acpi/processor.c @@ -46,6 +46,12 @@ static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c) buf[1] = 1; buf[2] = ACPI_PDC_C_CAPABILITY_SMP; + /* + * The default of PDC_SMP_T_SWCOORD bit is set for intel x86 cpu so + * that OSPM is capable of native ACPI throttling software + * coordination using BIOS supplied _TSD info. + */ + buf[2] |= ACPI_PDC_SMP_T_SWCOORD; if (cpu_has(c, X86_FEATURE_EST)) buf[2] |= ACPI_PDC_EST_CAPABILITY_SWSMP; diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index cfdb2f3bd76..a0c4d7c5dbd 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -3,6 +3,7 @@ # obj-y := intel_cacheinfo.o addon_cpuid_features.o +obj-y += feature_names.o obj-$(CONFIG_X86_32) += common.o proc.o bugs.o obj-$(CONFIG_X86_32) += amd.o diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b7b2142b58e..f86a3c4a266 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -623,16 +623,6 @@ cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; * They will insert themselves into the cpu_devs structure. * Then, when cpu_init() is called, we can just iterate over that array. 
*/ - -extern int intel_cpu_init(void); -extern int cyrix_init_cpu(void); -extern int nsc_init_cpu(void); -extern int amd_init_cpu(void); -extern int centaur_init_cpu(void); -extern int transmeta_init_cpu(void); -extern int nexgen_init_cpu(void); -extern int umc_init_cpu(void); - void __init early_cpu_init(void) { intel_cpu_init(); @@ -647,7 +637,7 @@ void __init early_cpu_init(void) } /* Make sure %fs is initialized properly in idle threads */ -struct pt_regs * __devinit idle_regs(struct pt_regs *regs) +struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) { memset(regs, 0, sizeof(struct pt_regs)); regs->fs = __KERNEL_PERCPU; diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index ad6527a5beb..e0b38c33d84 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -27,3 +27,12 @@ extern void display_cacheinfo(struct cpuinfo_x86 *c); extern void early_init_intel(struct cpuinfo_x86 *c); extern void early_init_amd(struct cpuinfo_x86 *c); +/* Specific CPU type init functions */ +int intel_cpu_init(void); +int amd_init_cpu(void); +int cyrix_init_cpu(void); +int nsc_init_cpu(void); +int centaur_init_cpu(void); +int transmeta_init_cpu(void); +int nexgen_init_cpu(void); +int umc_init_cpu(void); diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig index 151eda0a23f..cb7a5715596 100644 --- a/arch/x86/kernel/cpu/cpufreq/Kconfig +++ b/arch/x86/kernel/cpu/cpufreq/Kconfig @@ -29,7 +29,7 @@ config X86_ACPI_CPUFREQ config ELAN_CPUFREQ tristate "AMD Elan SC400 and SC410" select CPU_FREQ_TABLE - depends on X86_32 && X86_ELAN + depends on X86_ELAN ---help--- This adds the CPUFreq driver for AMD Elan SC400 and SC410 processors. @@ -45,7 +45,7 @@ config ELAN_CPUFREQ config SC520_CPUFREQ tristate "AMD Elan SC520" select CPU_FREQ_TABLE - depends on X86_32 && X86_ELAN + depends on X86_ELAN ---help--- This adds the CPUFreq driver for AMD Elan SC520 processor. 
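To make the mkcpustr/validate_cpu changes above concrete: the generated cpustr.h packs each feature name as a one-byte bit number (32*leaf + bit, ascending) followed by a NUL-terminated string, with an unconditional final entry consuming the literal's closing NUL so the scan always terminates. Below is a standalone sketch of that format with three hand-written entries (the real table is generated from x86_cap_flags[]); the lookup helper mirrors the skip/match walk in validate_cpu():

#include <stdio.h>

static const char x86_cap_strs[] =
	"\x00" "fpu\0"		/* bit 0  */
	"\x0b" "sep\0"		/* bit 11 */
	"\x19" "sse\0"		/* bit 25 */
	"\xff" "";		/* sentinel: last, unconditional entry */

/* Look up bit n the same way validate_cpu() walks the table. */
static const char *cap_name(int n)
{
	const unsigned char *p = (const unsigned char *)x86_cap_strs;

	while (*p < n) {	/* skip entries for lower bit numbers */
		do
			p++;
		while (*p);	/* step over the name ... */
		p++;		/* ... and its NUL */
	}
	return (*p == n && p[1]) ? (const char *)(p + 1) : NULL;
}

int main(void)
{
	printf("%s %s\n", cap_name(11), cap_name(25));	/* "sep sse" */
	return cap_name(7) == NULL ? 0 : 1;	/* bit 7 has no name here */
}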
diff --git a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c index 326a4c81f68..39f8cb18296 100644 --- a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c +++ b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c @@ -23,6 +23,7 @@ #define EPS_BRAND_C7 1 #define EPS_BRAND_EDEN 2 #define EPS_BRAND_C3 3 +#define EPS_BRAND_C7D 4 struct eps_cpu_data { u32 fsb; @@ -54,6 +55,7 @@ static int eps_set_state(struct eps_cpu_data *centaur, { struct cpufreq_freqs freqs; u32 lo, hi; + u8 current_multiplier, current_voltage; int err = 0; int i; @@ -93,6 +95,15 @@ postchange: rdmsr(MSR_IA32_PERF_STATUS, lo, hi); freqs.new = centaur->fsb * ((lo >> 8) & 0xff); + /* Print voltage and multiplier */ + rdmsr(MSR_IA32_PERF_STATUS, lo, hi); + current_voltage = lo & 0xff; + printk(KERN_INFO "eps: Current voltage = %dmV\n", + current_voltage * 16 + 700); + current_multiplier = (lo >> 8) & 0xff; + printk(KERN_INFO "eps: Current multiplier = %d\n", + current_multiplier); + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); return err; } @@ -141,9 +152,10 @@ static int eps_cpu_init(struct cpufreq_policy *policy) u8 current_multiplier, current_voltage; u8 max_multiplier, max_voltage; u8 min_multiplier, min_voltage; - u8 brand; + u8 brand = 0; u32 fsb; struct eps_cpu_data *centaur; + struct cpuinfo_x86 *c = &cpu_data(0); struct cpufreq_frequency_table *f_table; int k, step, voltage; int ret; @@ -153,21 +165,36 @@ static int eps_cpu_init(struct cpufreq_policy *policy) return -ENODEV; /* Check brand */ - printk("eps: Detected VIA "); - rdmsr(0x1153, lo, hi); - brand = (((lo >> 2) ^ lo) >> 18) & 3; + printk(KERN_INFO "eps: Detected VIA "); + + switch (c->x86_model) { + case 10: + rdmsr(0x1153, lo, hi); + brand = (((lo >> 2) ^ lo) >> 18) & 3; + printk(KERN_CONT "Model A "); + break; + case 13: + rdmsr(0x1154, lo, hi); + brand = (((lo >> 4) ^ (lo >> 2))) & 0x000000ff; + printk(KERN_CONT "Model D "); + break; + } + switch(brand) { case EPS_BRAND_C7M: - printk("C7-M\n"); + printk(KERN_CONT "C7-M\n"); break; case EPS_BRAND_C7: - printk("C7\n"); + printk(KERN_CONT "C7\n"); break; case EPS_BRAND_EDEN: - printk("Eden\n"); + printk(KERN_CONT "Eden\n"); + break; + case EPS_BRAND_C7D: + printk(KERN_CONT "C7-D\n"); break; case EPS_BRAND_C3: - printk("C3\n"); + printk(KERN_CONT "C3\n"); return -ENODEV; break; } @@ -179,7 +206,7 @@ static int eps_cpu_init(struct cpufreq_policy *policy) /* Can be locked at 0 */ rdmsrl(MSR_IA32_MISC_ENABLE, val); if (!(val & 1 << 16)) { - printk("eps: Can't enable Enhanced PowerSaver\n"); + printk(KERN_INFO "eps: Can't enable Enhanced PowerSaver\n"); return -ENODEV; } } @@ -187,19 +214,19 @@ static int eps_cpu_init(struct cpufreq_policy *policy) /* Print voltage and multiplier */ rdmsr(MSR_IA32_PERF_STATUS, lo, hi); current_voltage = lo & 0xff; - printk("eps: Current voltage = %dmV\n", current_voltage * 16 + 700); + printk(KERN_INFO "eps: Current voltage = %dmV\n", current_voltage * 16 + 700); current_multiplier = (lo >> 8) & 0xff; - printk("eps: Current multiplier = %d\n", current_multiplier); + printk(KERN_INFO "eps: Current multiplier = %d\n", current_multiplier); /* Print limits */ max_voltage = hi & 0xff; - printk("eps: Highest voltage = %dmV\n", max_voltage * 16 + 700); + printk(KERN_INFO "eps: Highest voltage = %dmV\n", max_voltage * 16 + 700); max_multiplier = (hi >> 8) & 0xff; - printk("eps: Highest multiplier = %d\n", max_multiplier); + printk(KERN_INFO "eps: Highest multiplier = %d\n", max_multiplier); min_voltage = (hi >> 16) & 0xff; - printk("eps: Lowest voltage = 
%dmV\n", min_voltage * 16 + 700); + printk(KERN_INFO "eps: Lowest voltage = %dmV\n", min_voltage * 16 + 700); min_multiplier = (hi >> 24) & 0xff; - printk("eps: Lowest multiplier = %d\n", min_multiplier); + printk(KERN_INFO "eps: Lowest multiplier = %d\n", min_multiplier); /* Sanity checks */ if (current_multiplier == 0 || max_multiplier == 0 @@ -208,7 +235,7 @@ static int eps_cpu_init(struct cpufreq_policy *policy) if (current_multiplier > max_multiplier || max_multiplier <= min_multiplier) return -EINVAL; - if (current_voltage > 0x1c || max_voltage > 0x1c) + if (current_voltage > 0x1f || max_voltage > 0x1f) return -EINVAL; if (max_voltage < min_voltage) return -EINVAL; @@ -310,7 +337,7 @@ static int __init eps_init(void) /* This driver will work only on Centaur C7 processors with * Enhanced SpeedStep/PowerSaver registers */ if (c->x86_vendor != X86_VENDOR_CENTAUR - || c->x86 != 6 || c->x86_model != 10) + || c->x86 != 6 || c->x86_model < 10) return -ENODEV; if (!cpu_has(c, X86_FEATURE_EST)) return -ENODEV; diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c index 2ed7db2fd25..9d9eae82e60 100644 --- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c +++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c @@ -181,8 +181,8 @@ static __init struct pci_dev *gx_detect_chipset(void) struct pci_dev *gx_pci = NULL; /* check if CPU is a MediaGX or a Geode. */ - if ((current_cpu_data.x86_vendor != X86_VENDOR_NSC) && - (current_cpu_data.x86_vendor != X86_VENDOR_CYRIX)) { + if ((boot_cpu_data.x86_vendor != X86_VENDOR_NSC) && + (boot_cpu_data.x86_vendor != X86_VENDOR_CYRIX)) { dprintk("error: no MediaGX/Geode processor found!\n"); return NULL; } diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c index b5a9863d6cd..0a61159d7b7 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c @@ -460,7 +460,7 @@ static int powernow_decode_bios (int maxfid, int startvid) latency = psb->settlingtime; if (latency < 100) { - printk (KERN_INFO PFX "BIOS set settling time to %d microseconds." + printk(KERN_INFO PFX "BIOS set settling time to %d microseconds. " "Should be at least 100. 
Correcting.\n", latency); latency = 100; } diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index a0522735dd9..c99d59d8ef2 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -578,10 +578,9 @@ static void print_basics(struct powernow_k8_data *data) for (j = 0; j < data->numps; j++) { if (data->powernow_table[j].frequency != CPUFREQ_ENTRY_INVALID) { if (cpu_family == CPU_HW_PSTATE) { - printk(KERN_INFO PFX " %d : fid 0x%x did 0x%x (%d MHz)\n", + printk(KERN_INFO PFX " %d : pstate %d (%d MHz)\n", j, - (data->powernow_table[j].index & 0xff00) >> 8, - (data->powernow_table[j].index & 0xff0000) >> 16, + data->powernow_table[j].index, data->powernow_table[j].frequency/1000); } else { printk(KERN_INFO PFX " %d : fid 0x%x (%d MHz), vid 0x%x\n", @@ -827,7 +826,6 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpuf for (i = 0; i < data->acpi_data.state_count; i++) { u32 index; - u32 hi = 0, lo = 0; index = data->acpi_data.states[i].control & HW_PSTATE_MASK; if (index > data->max_hw_pstate) { @@ -1236,8 +1234,10 @@ static unsigned int powernowk8_get (unsigned int cpu) struct powernow_k8_data *data; cpumask_t oldmask = current->cpus_allowed; unsigned int khz = 0; + unsigned int first; - data = per_cpu(powernow_data, first_cpu(per_cpu(cpu_core_map, cpu))); + first = first_cpu(per_cpu(cpu_core_map, cpu)); + data = per_cpu(powernow_data, first); if (!data) return -EINVAL; diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h index afd2b520d35..ab48cfed4d9 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h @@ -47,7 +47,7 @@ struct powernow_k8_data { #define CPUID_XFAM 0x0ff00000 /* extended family */ #define CPUID_XFAM_K8 0 #define CPUID_XMOD 0x000f0000 /* extended model */ -#define CPUID_XMOD_REV_MASK 0x00080000 +#define CPUID_XMOD_REV_MASK 0x000c0000 #define CPUID_XFAM_10H 0x00100000 /* family 0x10 */ #define CPUID_USE_XFAM_XMOD 0x00000f00 #define CPUID_GET_MAX_CAPABILITIES 0x80000000 diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c index 76c3ab0da46..98d4fdb7dc0 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c @@ -189,10 +189,7 @@ static unsigned int pentium4_get_frequency(void) printk(KERN_DEBUG "speedstep-lib: couldn't detect FSB speed. Please send an e-mail to <linux@brodo.de>\n"); /* Multiplier. */ - if (c->x86_model < 2) - mult = msr_lo >> 27; - else - mult = msr_lo >> 24; + mult = msr_lo >> 24; dprintk("P4 - FSB %u kHz; Multiplier %u; Speed %u kHz\n", fsb, mult, (fsb * mult)); diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 404a6a2d401..7139b026270 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -83,8 +83,6 @@ static char cyrix_model_mult2[] __cpuinitdata = "12233445"; * FIXME: our newer udelay uses the tsc. We don't need to frob with SLOP */ -extern void calibrate_delay(void) __init; - static void __cpuinit check_cx686_slop(struct cpuinfo_x86 *c) { unsigned long flags; diff --git a/arch/x86/kernel/cpu/feature_names.c b/arch/x86/kernel/cpu/feature_names.c new file mode 100644 index 00000000000..ee975ac6bbc --- /dev/null +++ b/arch/x86/kernel/cpu/feature_names.c @@ -0,0 +1,83 @@ +/* + * Strings for the various x86 capability flags. + * + * This file must not contain any executable code. 
+ */ + +#include "asm/cpufeature.h" + +/* + * These flag bits must match the definitions in <asm/cpufeature.h>. + * NULL means this bit is undefined or reserved; either way it doesn't + * have meaning as far as Linux is concerned. Note that it's important + * to realize there is a difference between this table and CPUID -- if + * applications want to get the raw CPUID data, they should access + * /dev/cpu/<cpu_nr>/cpuid instead. + */ +const char * const x86_cap_flags[NCAPINTS*32] = { + /* Intel-defined */ + "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", + "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov", + "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx", + "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe", + + /* AMD-defined */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL, + NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm", + "3dnowext", "3dnow", + + /* Transmeta-defined */ + "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + /* Other (Linux-defined) */ + "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr", + NULL, NULL, NULL, NULL, + "constant_tsc", "up", NULL, "arch_perfmon", + "pebs", "bts", NULL, NULL, + "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + /* Intel-defined (#2) */ + "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est", + "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL, + NULL, NULL, "dca", "sse4_1", "sse4_2", NULL, NULL, "popcnt", + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + /* VIA/Cyrix/Centaur-defined */ + NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en", + "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + /* AMD-defined (#2) */ + "lahf_lm", "cmp_legacy", "svm", "extapic", + "cr8_legacy", "abm", "sse4a", "misalignsse", + "3dnowprefetch", "osvw", "ibs", "sse5", + "skinit", "wdt", NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + /* Auxiliary (Linux-defined) */ + "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +const char *const x86_power_flags[32] = { + "ts", /* temperature sensor */ + "fid", /* frequency id control */ + "vid", /* voltage id control */ + "ttp", /* thermal trip */ + "tm", + "stc", + "100mhzsteps", + "hwpstate", + "", /* tsc invariant mapped to constant_tsc */ + /* nothing */ +}; diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index d1c372b018d..fae31ce747b 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -13,6 +13,7 @@ #include <asm/uaccess.h> #include <asm/ptrace.h> #include <asm/ds.h> +#include <asm/bugs.h> #include "cpu.h" diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c index 8e139c70f88..ff14c320040 100644 --- a/arch/x86/kernel/cpu/mtrr/cyrix.c +++ b/arch/x86/kernel/cpu/mtrr/cyrix.c @@ -7,8 +7,6 @@ #include <asm/processor-flags.h> #include "mtrr.h" -int arr3_protected; - static void cyrix_get_arr(unsigned int reg, unsigned long *base, unsigned 
long *size, mtrr_type * type) @@ -99,8 +97,6 @@ cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg) case 4: return replace_reg; case 3: - if (arr3_protected) - break; case 2: case 1: case 0: @@ -115,8 +111,6 @@ cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg) } else { for (i = 0; i < 7; i++) { cyrix_get_arr(i, &lbase, &lsize, &ltype); - if ((i == 3) && arr3_protected) - continue; if (lsize == 0) return i; } @@ -260,107 +254,6 @@ static void cyrix_set_all(void) post_set(); } -#if 0 -/* - * On Cyrix 6x86(MX) and M II the ARR3 is special: it has connection - * with the SMM (System Management Mode) mode. So we need the following: - * Check whether SMI_LOCK (CCR3 bit 0) is set - * if it is set, write a warning message: ARR3 cannot be changed! - * (it cannot be changed until the next processor reset) - * if it is reset, then we can change it, set all the needed bits: - * - disable access to SMM memory through ARR3 range (CCR1 bit 7 reset) - * - disable access to SMM memory (CCR1 bit 2 reset) - * - disable SMM mode (CCR1 bit 1 reset) - * - disable write protection of ARR3 (CCR6 bit 1 reset) - * - (maybe) disable ARR3 - * Just to be sure, we enable ARR usage by the processor (CCR5 bit 5 set) - */ -static void __init -cyrix_arr_init(void) -{ - struct set_mtrr_context ctxt; - unsigned char ccr[7]; - int ccrc[7] = { 0, 0, 0, 0, 0, 0, 0 }; -#ifdef CONFIG_SMP - int i; -#endif - - /* flush cache and enable MAPEN */ - set_mtrr_prepare_save(&ctxt); - set_mtrr_cache_disable(&ctxt); - - /* Save all CCRs locally */ - ccr[0] = getCx86(CX86_CCR0); - ccr[1] = getCx86(CX86_CCR1); - ccr[2] = getCx86(CX86_CCR2); - ccr[3] = ctxt.ccr3; - ccr[4] = getCx86(CX86_CCR4); - ccr[5] = getCx86(CX86_CCR5); - ccr[6] = getCx86(CX86_CCR6); - - if (ccr[3] & 1) { - ccrc[3] = 1; - arr3_protected = 1; - } else { - /* Disable SMM mode (bit 1), access to SMM memory (bit 2) and - * access to SMM memory through ARR3 (bit 7). - */ - if (ccr[1] & 0x80) { - ccr[1] &= 0x7f; - ccrc[1] |= 0x80; - } - if (ccr[1] & 0x04) { - ccr[1] &= 0xfb; - ccrc[1] |= 0x04; - } - if (ccr[1] & 0x02) { - ccr[1] &= 0xfd; - ccrc[1] |= 0x02; - } - arr3_protected = 0; - if (ccr[6] & 0x02) { - ccr[6] &= 0xfd; - ccrc[6] = 1; /* Disable write protection of ARR3 */ - setCx86(CX86_CCR6, ccr[6]); - } - /* Disable ARR3. This is safe now that we disabled SMM. */ - /* cyrix_set_arr_up (3, 0, 0, 0, FALSE); */ - } - /* If we changed CCR1 in memory, change it in the processor, too.
*/ - if (ccrc[1]) - setCx86(CX86_CCR1, ccr[1]); - - /* Enable ARR usage by the processor */ - if (!(ccr[5] & 0x20)) { - ccr[5] |= 0x20; - ccrc[5] = 1; - setCx86(CX86_CCR5, ccr[5]); - } -#ifdef CONFIG_SMP - for (i = 0; i < 7; i++) - ccr_state[i] = ccr[i]; - for (i = 0; i < 8; i++) - cyrix_get_arr(i, - &arr_state[i].base, &arr_state[i].size, - &arr_state[i].type); -#endif - - set_mtrr_done(&ctxt); /* flush cache and disable MAPEN */ - - if (ccrc[5]) - printk(KERN_INFO "mtrr: ARR usage was not enabled, enabled manually\n"); - if (ccrc[3]) - printk(KERN_INFO "mtrr: ARR3 cannot be changed\n"); -/* - if ( ccrc[1] & 0x80) printk ("mtrr: SMM memory access through ARR3 disabled\n"); - if ( ccrc[1] & 0x04) printk ("mtrr: SMM memory access disabled\n"); - if ( ccrc[1] & 0x02) printk ("mtrr: SMM mode disabled\n"); -*/ - if (ccrc[6]) - printk(KERN_INFO "mtrr: ARR3 was write protected, unprotected\n"); -} -#endif - static struct mtrr_ops cyrix_mtrr_ops = { .vendor = X86_VENDOR_CYRIX, // .init = cyrix_arr_init, diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 71591958265..b6e136f23d3 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -59,12 +59,6 @@ struct mtrr_ops * mtrr_if = NULL; static void set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type); -#ifndef CONFIG_X86_64 -extern int arr3_protected; -#else -#define arr3_protected 0 -#endif - void set_mtrr_ops(struct mtrr_ops * ops) { if (ops->vendor && ops->vendor < X86_VENDOR_NUM) @@ -513,12 +507,6 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) printk(KERN_WARNING "mtrr: register: %d too big\n", reg); goto out; } - if (is_cpu(CYRIX) && !use_intel()) { - if ((reg == 3) && arr3_protected) { - printk(KERN_WARNING "mtrr: ARR3 cannot be changed\n"); - goto out; - } - } mtrr_if->get(reg, &lbase, &lsize, &ltype); if (lsize < 1) { printk(KERN_WARNING "mtrr: MTRR %d not used\n", reg); @@ -566,10 +554,6 @@ EXPORT_SYMBOL(mtrr_del); * These should be called implicitly, but we can't yet until all the initcall * stuff is done...
*/ -extern void amd_init_mtrr(void); -extern void cyrix_init_mtrr(void); -extern void centaur_init_mtrr(void); - static void __init init_ifs(void) { #ifndef CONFIG_X86_64 @@ -675,7 +659,7 @@ static __init int amd_special_default_mtrr(void) */ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) { - unsigned long i, base, size, highest_addr = 0, def, dummy; + unsigned long i, base, size, highest_pfn = 0, def, dummy; mtrr_type type; u64 trim_start, trim_size; @@ -698,28 +682,27 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) mtrr_if->get(i, &base, &size, &type); if (type != MTRR_TYPE_WRBACK) continue; - base <<= PAGE_SHIFT; - size <<= PAGE_SHIFT; - if (highest_addr < base + size) - highest_addr = base + size; + if (highest_pfn < base + size) + highest_pfn = base + size; } /* kvm/qemu doesn't have mtrr set right, don't trim them all */ - if (!highest_addr) { + if (!highest_pfn) { printk(KERN_WARNING "WARNING: strange, CPU MTRRs all blank?\n"); WARN_ON(1); return 0; } - if ((highest_addr >> PAGE_SHIFT) < end_pfn) { + if (highest_pfn < end_pfn) { printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover" - " all of memory, losing %LdMB of RAM.\n", - (((u64)end_pfn << PAGE_SHIFT) - highest_addr) >> 20); + " all of memory, losing %luMB of RAM.\n", + (end_pfn - highest_pfn) >> (20 - PAGE_SHIFT)); WARN_ON(1); printk(KERN_INFO "update e820 for mtrr\n"); - trim_start = highest_addr; + trim_start = highest_pfn; + trim_start <<= PAGE_SHIFT; trim_size = end_pfn; trim_size <<= PAGE_SHIFT; trim_size -= trim_start; diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index fb74a2c2081..2cc77eb6fea 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -97,3 +97,7 @@ void mtrr_state_warn(void); const char *mtrr_attrib_to_str(int x); void mtrr_wrmsr(unsigned, unsigned, unsigned); +/* CPU specific mtrr init functions */ +int amd_init_mtrr(void); +int cyrix_init_mtrr(void); +int centaur_init_mtrr(void); diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 02821326014..af11d31dce0 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -10,80 +10,6 @@ */ static int show_cpuinfo(struct seq_file *m, void *v) { - /* - * These flag bits must match the definitions in <asm/cpufeature.h>. - * NULL means this bit is undefined or reserved; either way it doesn't - * have meaning as far as Linux is concerned. Note that it's important - * to realize there is a difference between this table and CPUID -- if - * applications want to get the raw CPUID data, they should access - * /dev/cpu/<cpu_nr>/cpuid instead. 
- */ - static const char * const x86_cap_flags[] = { - /* Intel-defined */ - "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", - "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov", - "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx", - "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe", - - /* AMD-defined */ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, - NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL, - NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm", - "3dnowext", "3dnow", - - /* Transmeta-defined */ - "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* Other (Linux-defined) */ - "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr", - NULL, NULL, NULL, NULL, - "constant_tsc", "up", NULL, "arch_perfmon", - "pebs", "bts", NULL, "sync_rdtsc", - "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* Intel-defined (#2) */ - "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est", - "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL, - NULL, NULL, "dca", "sse4_1", "sse4_2", NULL, NULL, "popcnt", - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* VIA/Cyrix/Centaur-defined */ - NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en", - "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* AMD-defined (#2) */ - "lahf_lm", "cmp_legacy", "svm", "extapic", - "cr8_legacy", "abm", "sse4a", "misalignsse", - "3dnowprefetch", "osvw", "ibs", "sse5", - "skinit", "wdt", NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* Auxiliary (Linux-defined) */ - "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - }; - static const char * const x86_power_flags[] = { - "ts", /* temperature sensor */ - "fid", /* frequency id control */ - "vid", /* voltage id control */ - "ttp", /* thermal trip */ - "tm", - "stc", - "100mhzsteps", - "hwpstate", - "", /* constant_tsc - moved to flags */ - /* nothing */ - }; struct cpuinfo_x86 *c = v; int i, n = 0; int fpu_exception; diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index a63432d800f..288e7a6598a 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -1,6 +1,6 @@ /* ----------------------------------------------------------------------- * - * - * Copyright 2000 H. Peter Anvin - All Rights Reserved + * + * Copyright 2000-2008 H. Peter Anvin - All Rights Reserved * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -17,6 +17,10 @@ * and then read in chunks of 16 bytes. A larger size means multiple * reads of consecutive levels. * + * The lower 32 bits of the file position is used as the incoming %eax, + * and the upper 32 bits of the file position as the incoming %ecx, + * the latter intended for "counting" eax levels like eax=4. + * * This driver uses /dev/cpu/%d/cpuid where %d is the minor number, and on * an SMP box will direct the access to CPU %d. 
*/ @@ -43,35 +47,24 @@ static struct class *cpuid_class; -struct cpuid_command { - u32 reg; - u32 *data; +struct cpuid_regs { + u32 eax, ebx, ecx, edx; }; static void cpuid_smp_cpuid(void *cmd_block) { - struct cpuid_command *cmd = cmd_block; - - cpuid(cmd->reg, &cmd->data[0], &cmd->data[1], &cmd->data[2], - &cmd->data[3]); -} - -static inline void do_cpuid(int cpu, u32 reg, u32 * data) -{ - struct cpuid_command cmd; - - cmd.reg = reg; - cmd.data = data; + struct cpuid_regs *cmd = (struct cpuid_regs *)cmd_block; - smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1, 1); + cpuid_count(cmd->eax, cmd->ecx, + &cmd->eax, &cmd->ebx, &cmd->ecx, &cmd->edx); } static loff_t cpuid_seek(struct file *file, loff_t offset, int orig) { loff_t ret; + struct inode *inode = file->f_mapping->host; - lock_kernel(); - + mutex_lock(&inode->i_mutex); switch (orig) { case 0: file->f_pos = offset; @@ -84,8 +77,7 @@ static loff_t cpuid_seek(struct file *file, loff_t offset, int orig) default: ret = -EINVAL; } - - unlock_kernel(); + mutex_unlock(&inode->i_mutex); return ret; } @@ -93,19 +85,21 @@ static ssize_t cpuid_read(struct file *file, char __user *buf, size_t count, loff_t * ppos) { char __user *tmp = buf; - u32 data[4]; - u32 reg = *ppos; + struct cpuid_regs cmd; int cpu = iminor(file->f_path.dentry->d_inode); + u64 pos = *ppos; if (count % 16) return -EINVAL; /* Invalid chunk size */ for (; count; count -= 16) { - do_cpuid(cpu, reg, data); - if (copy_to_user(tmp, &data, 16)) + cmd.eax = pos; + cmd.ecx = pos >> 32; + smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1, 1); + if (copy_to_user(tmp, &cmd, 16)) return -EFAULT; tmp += 16; - *ppos = reg++; + *ppos = ++pos; } return tmp - buf; @@ -193,7 +187,7 @@ static int __init cpuid_init(void) } for_each_online_cpu(i) { err = cpuid_device_create(i); - if (err != 0) + if (err != 0) goto out_class; } register_hotcpu_notifier(&cpuid_class_cpu_notifier); @@ -208,7 +202,7 @@ out_class: } class_destroy(cpuid_class); out_chrdev: - unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); + unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); out: return err; } diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index 1411324a625..32dd62b36ff 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -379,11 +379,9 @@ void __init efi_init(void) #endif } -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) static void __init runtime_code_page_mkexec(void) { efi_memory_desc_t *md; - unsigned long end; void *p; if (!(__supported_pte_mask & _PAGE_NX)) @@ -392,18 +390,13 @@ static void __init runtime_code_page_mkexec(void) /* Make EFI runtime service code area executable */ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { md = p; - end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT); - if (md->type == EFI_RUNTIME_SERVICES_CODE && - (end >> PAGE_SHIFT) <= max_pfn_mapped) { - set_memory_x(md->virt_addr, md->num_pages); - set_memory_uc(md->virt_addr, md->num_pages); - } + + if (md->type != EFI_RUNTIME_SERVICES_CODE) + continue; + + set_memory_x(md->virt_addr, md->num_pages << EFI_PAGE_SHIFT); } - __flush_tlb_all(); } -#else -static inline void __init runtime_code_page_mkexec(void) { } -#endif /* * This function will switch the EFI runtime services to virtual mode. 
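Returning to the cpuid driver change above: the new comment documents that the low 32 bits of the file position select the incoming %eax (the leaf) and the high 32 bits the incoming %ecx (the subleaf), with each 16-byte read returning eax, ebx, ecx, edx. A hedged user-space sketch of that interface, assuming a 64-bit off_t; the leaf/subleaf values are arbitrary examples and error handling is minimal:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	uint32_t r[4];				/* eax, ebx, ecx, edx */
	uint64_t leaf = 4, subleaf = 1;		/* arbitrary example */
	int fd = open("/dev/cpu/0/cpuid", O_RDONLY);

	if (fd < 0)
		return 1;
	/* low 32 bits of the offset -> %eax, high 32 bits -> %ecx */
	if (pread(fd, r, sizeof(r), (off_t)(leaf | (subleaf << 32))) != sizeof(r))
		return 1;
	printf("eax=%08x ebx=%08x ecx=%08x edx=%08x\n", r[0], r[1], r[2], r[3]);
	close(fd);
	return 0;
}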
@@ -417,30 +410,40 @@ void __init efi_enter_virtual_mode(void) { efi_memory_desc_t *md; efi_status_t status; - unsigned long end; - void *p; + unsigned long size; + u64 end, systab; + void *p, *va; efi.systab = NULL; for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { md = p; if (!(md->attribute & EFI_MEMORY_RUNTIME)) continue; - end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT); - if ((md->attribute & EFI_MEMORY_WB) && - ((end >> PAGE_SHIFT) <= max_pfn_mapped)) - md->virt_addr = (unsigned long)__va(md->phys_addr); + + size = md->num_pages << EFI_PAGE_SHIFT; + end = md->phys_addr + size; + + if ((end >> PAGE_SHIFT) <= max_pfn_mapped) + va = __va(md->phys_addr); else - md->virt_addr = (unsigned long) - efi_ioremap(md->phys_addr, - md->num_pages << EFI_PAGE_SHIFT); - if (!md->virt_addr) + va = efi_ioremap(md->phys_addr, size); + + if (md->attribute & EFI_MEMORY_WB) + set_memory_uc(md->virt_addr, size); + + md->virt_addr = (u64) (unsigned long) va; + + if (!va) { printk(KERN_ERR PFX "ioremap of 0x%llX failed!\n", (unsigned long long)md->phys_addr); - if ((md->phys_addr <= (unsigned long)efi_phys.systab) && - ((unsigned long)efi_phys.systab < end)) - efi.systab = (efi_system_table_t *)(unsigned long) - (md->virt_addr - md->phys_addr + - (unsigned long)efi_phys.systab); + continue; + } + + systab = (u64) (unsigned long) efi_phys.systab; + if (md->phys_addr <= systab && systab < end) { + systab += md->virt_addr - md->phys_addr; + efi.systab = (efi_system_table_t *) (unsigned long) systab; + } } BUG_ON(!efi.systab); diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c index 674f2379480..09d5c233093 100644 --- a/arch/x86/kernel/efi_64.c +++ b/arch/x86/kernel/efi_64.c @@ -54,10 +54,10 @@ static void __init early_mapping_set_exec(unsigned long start, else set_pte(kpte, __pte((pte_val(*kpte) | _PAGE_NX) & \ __supported_pte_mask)); - if (level == 4) - start = (start + PMD_SIZE) & PMD_MASK; - else + if (level == PG_LEVEL_4K) start = (start + PAGE_SIZE) & PAGE_MASK; + else + start = (start + PMD_SIZE) & PMD_MASK; } } @@ -109,23 +109,23 @@ void __init efi_reserve_bootmem(void) memmap.nr_map * memmap.desc_size); } -void __iomem * __init efi_ioremap(unsigned long offset, - unsigned long size) +void __iomem * __init efi_ioremap(unsigned long phys_addr, unsigned long size) { static unsigned pages_mapped; - unsigned long last_addr; unsigned i, pages; - last_addr = offset + size - 1; - offset &= PAGE_MASK; - pages = (PAGE_ALIGN(last_addr) - offset) >> PAGE_SHIFT; + /* phys_addr and size must be page aligned */ + if ((phys_addr & ~PAGE_MASK) || (size & ~PAGE_MASK)) + return NULL; + + pages = size >> PAGE_SHIFT; if (pages_mapped + pages > MAX_EFI_IO_PAGES) return NULL; for (i = 0; i < pages; i++) { __set_fixmap(FIX_EFI_IO_MAP_FIRST_PAGE - pages_mapped, - offset, PAGE_KERNEL_EXEC_NOCACHE); - offset += PAGE_SIZE; + phys_addr, PAGE_KERNEL); + phys_addr += PAGE_SIZE; pages_mapped++; } diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index bea8474744f..c7341e81941 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -582,7 +582,6 @@ retint_restore_args: /* return to kernel space */ TRACE_IRQS_IRETQ restore_args: RESTORE_ARGS 0,8,0 -iret_label: #ifdef CONFIG_PARAVIRT INTERRUPT_RETURN #endif @@ -593,13 +592,22 @@ ENTRY(native_iret) .quad native_iret, bad_iret .previous .section .fixup,"ax" - /* force a signal here? 
this matches i386 behaviour */ - /* running with kernel gs */ bad_iret: - movq $11,%rdi /* SIGSEGV */ - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI)) - jmp do_exit + /* + * The iret traps when the %cs or %ss being restored is bogus. + * We've lost the original trap vector and error code. + * #GPF is the most likely one to get for an invalid selector. + * So pretend we completed the iret and took the #GPF in user mode. + * + * We are now running with the kernel GS after exception recovery. + * But error_entry expects us to have user GS to match the user %cs, + * so swap back. + */ + pushq $0 + + SWAPGS + jmp general_protection + .previous /* edi: workmask, edx: work */ @@ -911,7 +919,7 @@ error_kernelspace: iret run with kernel gs again, so don't set the user space flag. B stepping K8s sometimes report an truncated RIP for IRET exceptions returning to compat mode. Check for these here too. */ - leaq iret_label(%rip),%rbp + leaq native_iret(%rip),%rbp cmpq %rbp,RIP(%rsp) je error_swapgs movl %ebp,%ebp /* zero extend */ diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 1d5a7a36120..09b38d539b0 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -63,7 +63,7 @@ startup_64: /* Is the address not 2M aligned? */ movq %rbp, %rax - andl $~LARGE_PAGE_MASK, %eax + andl $~PMD_PAGE_MASK, %eax testl %eax, %eax jnz bad_address @@ -88,7 +88,7 @@ startup_64: /* Add an Identity mapping if I am above 1G */ leaq _text(%rip), %rdi - andq $LARGE_PAGE_MASK, %rdi + andq $PMD_PAGE_MASK, %rdi movq %rdi, %rax shrq $PUD_SHIFT, %rax @@ -250,18 +250,13 @@ ENTRY(secondary_startup_64) lretq /* SMP bootup changes these two */ -#ifndef CONFIG_HOTPLUG_CPU - .pushsection .init.data -#endif + __CPUINITDATA .align 8 - .globl initial_code -initial_code: + ENTRY(initial_code) .quad x86_64_start_kernel -#ifndef CONFIG_HOTPLUG_CPU - .popsection -#endif - .globl init_rsp -init_rsp: + __FINITDATA + + ENTRY(init_rsp) .quad init_thread_union+THREAD_SIZE-8 bad_address: diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 8a7660c8394..0224c3637c7 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -35,7 +35,8 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) if (mincount <= pc->size) return 0; oldsize = pc->size; - mincount = (mincount + 511) & (~511); + mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) & + (~(PAGE_SIZE / LDT_ENTRY_SIZE - 1)); if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE) newldt = vmalloc(mincount * LDT_ENTRY_SIZE); else diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index c1cfd60639d..d0b234c9fc3 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -151,7 +151,7 @@ NORET_TYPE void machine_kexec(struct kimage *image) void arch_crash_save_vmcoreinfo(void) { -#ifdef CONFIG_ARCH_DISCONTIGMEM_ENABLE +#ifdef CONFIG_NUMA VMCOREINFO_SYMBOL(node_data); VMCOREINFO_LENGTH(node_data, MAX_NUMNODES); #endif diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index a1fef42f8cd..236d2f8f7dd 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -234,5 +234,10 @@ NORET_TYPE void machine_kexec(struct kimage *image) void arch_crash_save_vmcoreinfo(void) { VMCOREINFO_SYMBOL(init_level4_pgt); + +#ifdef CONFIG_NUMA + VMCOREINFO_SYMBOL(node_data); + VMCOREINFO_LENGTH(node_data, MAX_NUMNODES); +#endif } diff --git a/arch/x86/kernel/mpparse_32.c b/arch/x86/kernel/mpparse_32.c index 
67009cdd5ec..f349e68e45a 100644 --- a/arch/x86/kernel/mpparse_32.c +++ b/arch/x86/kernel/mpparse_32.c @@ -736,7 +736,8 @@ static int __init smp_scan_config (unsigned long base, unsigned long length) smp_found_config = 1; printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n", mpf, virt_to_phys(mpf)); - reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE); + reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE, + BOOTMEM_DEFAULT); if (mpf->mpf_physptr) { /* * We cannot access to MPC table to compute @@ -751,7 +752,8 @@ static int __init smp_scan_config (unsigned long base, unsigned long length) unsigned long end = max_low_pfn * PAGE_SIZE; if (mpf->mpf_physptr + size > end) size = end - mpf->mpf_physptr; - reserve_bootmem(mpf->mpf_physptr, size); + reserve_bootmem(mpf->mpf_physptr, size, + BOOTMEM_DEFAULT); } mpf_found = mpf; diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index bd82850e651..af51ea8400b 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -1,6 +1,6 @@ /* ----------------------------------------------------------------------- * - * - * Copyright 2000 H. Peter Anvin - All Rights Reserved + * + * Copyright 2000-2008 H. Peter Anvin - All Rights Reserved * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -45,9 +45,10 @@ static struct class *msr_class; static loff_t msr_seek(struct file *file, loff_t offset, int orig) { - loff_t ret = -EINVAL; + loff_t ret; + struct inode *inode = file->f_mapping->host; - lock_kernel(); + mutex_lock(&inode->i_mutex); switch (orig) { case 0: file->f_pos = offset; @@ -56,8 +57,11 @@ static loff_t msr_seek(struct file *file, loff_t offset, int orig) case 1: file->f_pos += offset; ret = file->f_pos; + break; + default: + ret = -EINVAL; } - unlock_kernel(); + mutex_unlock(&inode->i_mutex); return ret; } diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 1fe7f043ebd..1b5464c2434 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -35,6 +35,7 @@ #include <linux/pci.h> #include <linux/delay.h> #include <linux/scatterlist.h> +#include <linux/iommu-helper.h> #include <asm/gart.h> #include <asm/calgary.h> #include <asm/tce.h> @@ -260,22 +261,28 @@ static void iommu_range_reserve(struct iommu_table *tbl, spin_unlock_irqrestore(&tbl->it_lock, flags); } -static unsigned long iommu_range_alloc(struct iommu_table *tbl, - unsigned int npages) +static unsigned long iommu_range_alloc(struct device *dev, + struct iommu_table *tbl, + unsigned int npages) { unsigned long flags; unsigned long offset; + unsigned long boundary_size; + + boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, + PAGE_SIZE) >> PAGE_SHIFT; BUG_ON(npages == 0); spin_lock_irqsave(&tbl->it_lock, flags); - offset = find_next_zero_string(tbl->it_map, tbl->it_hint, - tbl->it_size, npages); + offset = iommu_area_alloc(tbl->it_map, tbl->it_size, tbl->it_hint, + npages, 0, boundary_size, 0); if (offset == ~0UL) { tbl->chip_ops->tce_cache_blast(tbl); - offset = find_next_zero_string(tbl->it_map, 0, - tbl->it_size, npages); + + offset = iommu_area_alloc(tbl->it_map, tbl->it_size, 0, + npages, 0, boundary_size, 0); if (offset == ~0UL) { printk(KERN_WARNING "Calgary: IOMMU full.\n"); spin_unlock_irqrestore(&tbl->it_lock, flags); @@ -286,7 +293,6 @@ static unsigned long iommu_range_alloc(struct iommu_table *tbl, } } - set_bit_string(tbl->it_map, offset, npages); tbl->it_hint = offset + npages; BUG_ON(tbl->it_hint > tbl->it_size); @@ 
-295,13 +301,13 @@ static unsigned long iommu_range_alloc(struct iommu_table *tbl, return offset; } -static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *vaddr, - unsigned int npages, int direction) +static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, + void *vaddr, unsigned int npages, int direction) { unsigned long entry; dma_addr_t ret = bad_dma_address; - entry = iommu_range_alloc(tbl, npages); + entry = iommu_range_alloc(dev, tbl, npages); if (unlikely(entry == bad_dma_address)) goto error; @@ -354,7 +360,7 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, badbit, tbl, dma_addr, entry, npages); } - __clear_bit_string(tbl->it_map, entry, npages); + iommu_area_free(tbl->it_map, entry, npages); spin_unlock_irqrestore(&tbl->it_lock, flags); } @@ -438,7 +444,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, vaddr = (unsigned long) sg_virt(s); npages = num_dma_pages(vaddr, s->length); - entry = iommu_range_alloc(tbl, npages); + entry = iommu_range_alloc(dev, tbl, npages); if (entry == bad_dma_address) { /* makes sure unmap knows to stop */ s->dma_length = 0; @@ -476,7 +482,7 @@ static dma_addr_t calgary_map_single(struct device *dev, void *vaddr, npages = num_dma_pages(uaddr, size); if (translation_enabled(tbl)) - dma_handle = iommu_alloc(tbl, vaddr, npages, direction); + dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction); else dma_handle = virt_to_bus(vaddr); @@ -516,7 +522,7 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size, if (translation_enabled(tbl)) { /* set up tces to cover the allocated range */ - mapping = iommu_alloc(tbl, ret, npages, DMA_BIDIRECTIONAL); + mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); if (mapping == bad_dma_address) goto free; diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 4d5cc718198..65f6acb025c 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -25,6 +25,7 @@ #include <linux/bitops.h> #include <linux/kdebug.h> #include <linux/scatterlist.h> +#include <linux/iommu-helper.h> #include <asm/atomic.h> #include <asm/io.h> #include <asm/mtrr.h> @@ -82,17 +83,24 @@ AGPEXTERN __u32 *agp_gatt_table; static unsigned long next_bit; /* protected by iommu_bitmap_lock */ static int need_flush; /* global flush state. 
set for each gart wrap */ -static unsigned long alloc_iommu(int size) +static unsigned long alloc_iommu(struct device *dev, int size) { unsigned long offset, flags; + unsigned long boundary_size; + unsigned long base_index; + + base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev), + PAGE_SIZE) >> PAGE_SHIFT; + boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, + PAGE_SIZE) >> PAGE_SHIFT; spin_lock_irqsave(&iommu_bitmap_lock, flags); - offset = find_next_zero_string(iommu_gart_bitmap, next_bit, - iommu_pages, size); + offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit, + size, base_index, boundary_size, 0); if (offset == -1) { need_flush = 1; - offset = find_next_zero_string(iommu_gart_bitmap, 0, - iommu_pages, size); + offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0, + size, base_index, boundary_size, 0); } if (offset != -1) { set_bit_string(iommu_gart_bitmap, offset, size); @@ -114,7 +122,7 @@ static void free_iommu(unsigned long offset, int size) unsigned long flags; spin_lock_irqsave(&iommu_bitmap_lock, flags); - __clear_bit_string(iommu_gart_bitmap, offset, size); + iommu_area_free(iommu_gart_bitmap, offset, size); spin_unlock_irqrestore(&iommu_bitmap_lock, flags); } @@ -235,7 +243,7 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, size_t size, int dir) { unsigned long npages = to_pages(phys_mem, size); - unsigned long iommu_page = alloc_iommu(npages); + unsigned long iommu_page = alloc_iommu(dev, npages); int i; if (iommu_page == -1) { @@ -355,10 +363,11 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, } /* Map multiple scatterlist entries continuous into the first. */ -static int __dma_map_cont(struct scatterlist *start, int nelems, - struct scatterlist *sout, unsigned long pages) +static int __dma_map_cont(struct device *dev, struct scatterlist *start, + int nelems, struct scatterlist *sout, + unsigned long pages) { - unsigned long iommu_start = alloc_iommu(pages); + unsigned long iommu_start = alloc_iommu(dev, pages); unsigned long iommu_page = iommu_start; struct scatterlist *s; int i; @@ -394,8 +403,8 @@ static int __dma_map_cont(struct scatterlist *start, int nelems, } static inline int -dma_map_cont(struct scatterlist *start, int nelems, struct scatterlist *sout, - unsigned long pages, int need) +dma_map_cont(struct device *dev, struct scatterlist *start, int nelems, + struct scatterlist *sout, unsigned long pages, int need) { if (!need) { BUG_ON(nelems != 1); @@ -403,7 +412,7 @@ dma_map_cont(struct scatterlist *start, int nelems, struct scatterlist *sout, sout->dma_length = start->length; return 0; } - return __dma_map_cont(start, nelems, sout, pages); + return __dma_map_cont(dev, start, nelems, sout, pages); } /* @@ -416,6 +425,8 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) struct scatterlist *s, *ps, *start_sg, *sgmap; int need = 0, nextneed, i, out, start; unsigned long pages = 0; + unsigned int seg_size; + unsigned int max_seg_size; if (nents == 0) return 0; @@ -426,6 +437,8 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) out = 0; start = 0; start_sg = sgmap = sg; + seg_size = 0; + max_seg_size = dma_get_max_seg_size(dev); ps = NULL; /* shut up gcc */ for_each_sg(sg, s, nents, i) { dma_addr_t addr = sg_phys(s); @@ -443,11 +456,13 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) * offset. 
*/ if (!iommu_merge || !nextneed || !need || s->offset || + (s->length + seg_size > max_seg_size) || (ps->offset + ps->length) % PAGE_SIZE) { - if (dma_map_cont(start_sg, i - start, sgmap, - pages, need) < 0) + if (dma_map_cont(dev, start_sg, i - start, + sgmap, pages, need) < 0) goto error; out++; + seg_size = 0; sgmap = sg_next(sgmap); pages = 0; start = i; @@ -455,11 +470,12 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) } } + seg_size += s->length; need = nextneed; pages += to_pages(s->offset, s->length); ps = s; } - if (dma_map_cont(start_sg, i - start, sgmap, pages, need) < 0) + if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0) goto error; out++; flush_gart(); @@ -501,7 +517,7 @@ static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) } a = aper + iommu_size; - iommu_size -= round_up(a, LARGE_PAGE_SIZE) - a; + iommu_size -= round_up(a, PMD_PAGE_SIZE) - a; if (iommu_size < 64*1024*1024) { printk(KERN_WARNING @@ -731,7 +747,8 @@ void __init gart_iommu_init(void) * the backing memory. The GART address is only used by PCI * devices. */ - clear_kernel_mapping((unsigned long)__va(iommu_bus_base), iommu_size); + set_memory_np((unsigned long)__va(iommu_bus_base), + iommu_size >> PAGE_SHIFT); /* * Try to workaround a bug (thanks to BenH) diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 968371ab223..dabdbeff1f7 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -251,7 +251,7 @@ void cpu_idle_wait(void) * because it has nothing to do. * Give all the remaining CPUS a kick. */ - smp_call_function_mask(map, do_nothing, 0, 0); + smp_call_function_mask(map, do_nothing, NULL, 0); } while (!cpus_empty(map)); set_cpus_allowed(current, tmp); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 96286df1bb8..702c33efea8 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -103,9 +103,26 @@ static int set_segment_reg(struct task_struct *task, if (invalid_selector(value)) return -EIO; - if (offset != offsetof(struct user_regs_struct, gs)) + /* + * For %cs and %ss we cannot permit a null selector. + * We can permit a bogus selector as long as it has USER_RPL. + * Null selectors are fine for other segment registers, but + * we will never get back to user mode with invalid %cs or %ss + * and will take the trap in iret instead. Much code relies + * on user_mode() to distinguish a user trap frame (which can + * safely use invalid selectors) from a kernel trap frame. + */ + switch (offset) { + case offsetof(struct user_regs_struct, cs): + case offsetof(struct user_regs_struct, ss): + if (unlikely(value == 0)) + return -EIO; + + default: *pt_regs_access(task_pt_regs(task), offset) = value; - else { + break; + + case offsetof(struct user_regs_struct, gs): task->thread.gs = value; if (task == current) /* @@ -227,12 +244,16 @@ static int set_segment_reg(struct task_struct *task, * Can't actually change these in 64-bit mode. 
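 * (Illustrative note, not from this patch: much of the kernel tests
 *  for a user frame via the RPL of the saved %cs, roughly
 *
 *	return (regs->cs & SEGMENT_RPL_MASK) == USER_RPL;
 *
 *  so letting ptrace plant a null %cs or %ss would make a kernel
 *  frame masquerade as a user one, hence the explicit rejection
 *  above.)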
*/ case offsetof(struct user_regs_struct,cs): + if (unlikely(value == 0)) + return -EIO; #ifdef CONFIG_IA32_EMULATION if (test_tsk_thread_flag(task, TIF_IA32)) task_pt_regs(task)->cs = value; #endif break; case offsetof(struct user_regs_struct,ss): + if (unlikely(value == 0)) + return -EIO; #ifdef CONFIG_IA32_EMULATION if (test_tsk_thread_flag(task, TIF_IA32)) task_pt_regs(task)->ss = value; diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 3cd7a2dcd4f..6ba33ca8715 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -380,19 +380,19 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0367, void force_hpet_resume(void) { switch (force_hpet_resume_type) { - case ICH_FORCE_HPET_RESUME: - return ich_force_hpet_resume(); - - case OLD_ICH_FORCE_HPET_RESUME: - return old_ich_force_hpet_resume(); - - case VT8237_FORCE_HPET_RESUME: - return vt8237_force_hpet_resume(); - - case NVIDIA_FORCE_HPET_RESUME: - return nvidia_force_hpet_resume(); - - default: + case ICH_FORCE_HPET_RESUME: + ich_force_hpet_resume(); + return; + case OLD_ICH_FORCE_HPET_RESUME: + old_ich_force_hpet_resume(); + return; + case VT8237_FORCE_HPET_RESUME: + vt8237_force_hpet_resume(); + return; + case NVIDIA_FORCE_HPET_RESUME: + nvidia_force_hpet_resume(); + return; + default: break; } } diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 62adc5f20be..d1d8c347cc0 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -390,7 +390,7 @@ static void __init reserve_ebda_region(void) unsigned int addr; addr = get_bios_ebda(); if (addr) - reserve_bootmem(addr, PAGE_SIZE); + reserve_bootmem(addr, PAGE_SIZE, BOOTMEM_DEFAULT); } #ifndef CONFIG_NEED_MULTIPLE_NODES @@ -484,7 +484,8 @@ static void __init reserve_crashkernel(void) (unsigned long)(total_mem >> 20)); crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; - reserve_bootmem(crash_base, crash_size); + reserve_bootmem(crash_base, crash_size, + BOOTMEM_DEFAULT); } else printk(KERN_INFO "crashkernel reservation failed - " "you have to specify a base address\n"); @@ -525,7 +526,7 @@ static void __init reserve_initrd(void) } if (ramdisk_end <= end_of_lowmem) { /* All in lowmem, easy case */ - reserve_bootmem(ramdisk_image, ramdisk_size); + reserve_bootmem(ramdisk_image, ramdisk_size, BOOTMEM_DEFAULT); initrd_start = ramdisk_image + PAGE_OFFSET; initrd_end = initrd_start+ramdisk_size; return; @@ -536,7 +537,7 @@ static void __init reserve_initrd(void) /* Note: this includes all the lowmem currently occupied by the initrd, we rely on that fact to keep the data intact. */ - reserve_bootmem(ramdisk_here, ramdisk_size); + reserve_bootmem(ramdisk_here, ramdisk_size, BOOTMEM_DEFAULT); initrd_start = ramdisk_here + PAGE_OFFSET; initrd_end = initrd_start + ramdisk_size; @@ -606,13 +607,14 @@ void __init setup_bootmem_allocator(void) * bootmem allocator with an invalid RAM area. */ reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) + - bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text)); + bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text), + BOOTMEM_DEFAULT); /* * reserve physical page 0 - it's a special BIOS page on many boxes, * enabling clean reboots, SMP operation, laptop functions. */ - reserve_bootmem(0, PAGE_SIZE); + reserve_bootmem(0, PAGE_SIZE, BOOTMEM_DEFAULT); /* reserve EBDA region, it's a 4K region */ reserve_ebda_region(); @@ -622,7 +624,7 @@ void __init setup_bootmem_allocator(void) unless you have no PS/2 mouse plugged in. 
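 * (Illustrative note on the new third argument: BOOTMEM_DEFAULT keeps
 *  the old fire-and-forget behaviour, while BOOTMEM_EXCLUSIVE makes
 *  the call fail on an already-reserved range so the caller can back
 *  out, e.g.
 *
 *	if (reserve_bootmem(base, size, BOOTMEM_EXCLUSIVE) < 0)
 *		return;		<- region already in use
 *
 *  which is exactly how the 64-bit crashkernel reservation in this
 *  series uses it.)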
*/ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && boot_cpu_data.x86 == 6) - reserve_bootmem(0xa0000 - 4096, 4096); + reserve_bootmem(0xa0000 - 4096, 4096, BOOTMEM_DEFAULT); #ifdef CONFIG_SMP /* @@ -630,7 +632,7 @@ void __init setup_bootmem_allocator(void) * FIXME: Don't need the extra page at 4K, but need to fix * trampoline before removing it. (see the GDT stuff) */ - reserve_bootmem(PAGE_SIZE, PAGE_SIZE); + reserve_bootmem(PAGE_SIZE, PAGE_SIZE, BOOTMEM_DEFAULT); #endif #ifdef CONFIG_ACPI_SLEEP /* diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 18df70c534b..a49f5f734a5 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -189,7 +189,7 @@ contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn) bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn); e820_register_active_regions(0, start_pfn, end_pfn); free_bootmem_with_active_regions(0, end_pfn); - reserve_bootmem(bootmap, bootmap_size); + reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT); } #endif @@ -220,28 +220,35 @@ static inline void copy_edd(void) #ifdef CONFIG_KEXEC static void __init reserve_crashkernel(void) { - unsigned long long free_mem; + unsigned long long total_mem; unsigned long long crash_size, crash_base; int ret; - free_mem = - ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT; + total_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT; - ret = parse_crashkernel(boot_command_line, free_mem, + ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base); if (ret == 0 && crash_size) { - if (crash_base > 0) { - printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " - "for crashkernel (System RAM: %ldMB)\n", - (unsigned long)(crash_size >> 20), - (unsigned long)(crash_base >> 20), - (unsigned long)(free_mem >> 20)); - crashk_res.start = crash_base; - crashk_res.end = crash_base + crash_size - 1; - reserve_bootmem(crash_base, crash_size); - } else + if (crash_base <= 0) { printk(KERN_INFO "crashkernel reservation failed - " "you have to specify a base address\n"); + return; + } + + if (reserve_bootmem(crash_base, crash_size, + BOOTMEM_EXCLUSIVE) < 0) { + printk(KERN_INFO "crashkernel reservation failed - " + "memory is in use\n"); + return; + } + + printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " + "for crashkernel (System RAM: %ldMB)\n", + (unsigned long)(crash_size >> 20), + (unsigned long)(crash_base >> 20), + (unsigned long)(total_mem >> 20)); + crashk_res.start = crash_base; + crashk_res.end = crash_base + crash_size - 1; } } #else @@ -1068,82 +1075,6 @@ static int show_cpuinfo(struct seq_file *m, void *v) struct cpuinfo_x86 *c = v; int cpu = 0, i; - /* - * These flag bits must match the definitions in <asm/cpufeature.h>. - * NULL means this bit is undefined or reserved; either way it doesn't - * have meaning as far as Linux is concerned. Note that it's important - * to realize there is a difference between this table and CPUID -- if - * applications want to get the raw CPUID data, they should access - * /dev/cpu/<cpu_nr>/cpuid instead. 
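 * (Illustrative userspace sketch, not from this patch: the cpuid
 *  character device selects the leaf via the file offset and returns
 *  the four registers, so raw access looks roughly like
 *
 *	uint32_t regs[4];
 *	int fd = open("/dev/cpu/0/cpuid", O_RDONLY);
 *	pread(fd, regs, sizeof(regs), 1);	<- leaf 1, feature bits
 *
 *  with regs[0..3] holding EAX, EBX, ECX and EDX.)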
- */ - static const char *const x86_cap_flags[] = { - /* Intel-defined */ - "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", - "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov", - "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx", - "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe", - - /* AMD-defined */ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL, - NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm", - "3dnowext", "3dnow", - - /* Transmeta-defined */ - "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* Other (Linux-defined) */ - "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr", - NULL, NULL, NULL, NULL, - "constant_tsc", "up", NULL, "arch_perfmon", - "pebs", "bts", NULL, "sync_rdtsc", - "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* Intel-defined (#2) */ - "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est", - "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL, - NULL, NULL, "dca", "sse4_1", "sse4_2", NULL, NULL, "popcnt", - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* VIA/Cyrix/Centaur-defined */ - NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en", - "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* AMD-defined (#2) */ - "lahf_lm", "cmp_legacy", "svm", "extapic", - "cr8_legacy", "abm", "sse4a", "misalignsse", - "3dnowprefetch", "osvw", "ibs", "sse5", - "skinit", "wdt", NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* Auxiliary (Linux-defined) */ - "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - }; - static const char *const x86_power_flags[] = { - "ts", /* temperature sensor */ - "fid", /* frequency id control */ - "vid", /* voltage id control */ - "ttp", /* thermal trip */ - "tm", - "stc", - "100mhzsteps", - "hwpstate", - "", /* tsc invariant mapped to constant_tsc */ - /* nothing */ - }; - - #ifdef CONFIG_SMP cpu = c->cpu_index; #endif diff --git a/arch/x86/kernel/smpboot_32.c b/arch/x86/kernel/smpboot_32.c index 5787a0c3e29..579b9b740c7 100644 --- a/arch/x86/kernel/smpboot_32.c +++ b/arch/x86/kernel/smpboot_32.c @@ -202,8 +202,6 @@ valid_k7: ; } -extern void calibrate_delay(void); - static atomic_t init_deasserted; static void __cpuinit smp_callin(void) diff --git a/arch/x86/kernel/srat_32.c b/arch/x86/kernel/srat_32.c index 2bf6903cb44..b72e61359c3 100644 --- a/arch/x86/kernel/srat_32.c +++ b/arch/x86/kernel/srat_32.c @@ -274,7 +274,7 @@ int __init get_memcfg_from_srat(void) int tables = 0; int i = 0; - rsdp_address = acpi_find_rsdp(); + rsdp_address = acpi_os_get_root_pointer(); if (!rsdp_address) { printk("%s: System description tables not found\n", __FUNCTION__); diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 8344c70adf6..adff5562f5f 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -321,6 +321,8 @@ ENTRY(sys_call_table) .long sys_epoll_pwait .long sys_utimensat /* 
320 */ .long sys_signalfd - .long sys_timerfd + .long sys_timerfd_create .long sys_eventfd .long sys_fallocate + .long sys_timerfd_settime /* 325 */ + .long sys_timerfd_gettime diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c index ae0ef2e304c..10b8a6f69f8 100644 --- a/arch/x86/kernel/test_nx.c +++ b/arch/x86/kernel/test_nx.c @@ -12,6 +12,7 @@ #include <linux/module.h> #include <linux/sort.h> #include <asm/uaccess.h> +#include <asm/asm.h> extern int rodata_test_data; @@ -89,16 +90,7 @@ static noinline int test_address(void *address) "2: mov %[zero], %[rslt]\n" " ret\n" ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 8\n" -#ifdef CONFIG_X86_32 - " .long 0b\n" - " .long 2b\n" -#else - " .quad 0b\n" - " .quad 2b\n" -#endif - ".previous\n" + _ASM_EXTABLE(0b,2b) : [rslt] "=r" (result) : [fake_code] "r" (address), [zero] "r" (0UL), "0" (result) ); @@ -147,7 +139,6 @@ static int test_NX(void) * Until then, don't run them to avoid too many people getting scared * by the error message */ -#if 0 #ifdef CONFIG_DEBUG_RODATA /* Test 3: Check if the .rodata section is executable */ @@ -160,6 +151,7 @@ static int test_NX(void) } #endif +#if 0 /* Test 4: Check if the .data section of a module is executable */ if (test_address(&test_data)) { printk(KERN_ERR "test_nx: .data section is executable\n"); diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S index 9bcc1c6aca3..64580679861 100644 --- a/arch/x86/kernel/trampoline_32.S +++ b/arch/x86/kernel/trampoline_32.S @@ -11,12 +11,7 @@ * trampoline page to make our stack and everything else * is a mystery. * - * In fact we don't actually need a stack so we don't - * set one up. - * - * We jump into the boot/compressed/head.S code. So you'd - * better be running a compressed kernel image or you - * won't get very far. + * We jump into arch/x86/kernel/head_32.S. * * On entry to trampoline_data, the processor is in real mode * with 16-bit addressing and 16-bit data. CS has some value diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S index e30b67c6a9f..4aedd0bcee4 100644 --- a/arch/x86/kernel/trampoline_64.S +++ b/arch/x86/kernel/trampoline_64.S @@ -10,9 +10,6 @@ * trampoline page to make our stack and everything else * is a mystery. * - * In fact we don't actually need a stack so we don't - * set one up. - * * On entry to trampoline_data, the processor is in real mode * with 16-bit addressing and 16-bit data. CS has some value * and IP is zero. Thus, data addresses need to be absolute diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 3cf72977d01..b22c01e05a1 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -1176,17 +1176,12 @@ void __init trap_init(void) #endif set_trap_gate(19,&simd_coprocessor_error); + /* + * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned. + * Generate a build-time error if the alignment is wrong. + */ + BUILD_BUG_ON(offsetof(struct task_struct, thread.i387.fxsave) & 15); if (cpu_has_fxsr) { - /* - * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned. - * Generates a compile-time "error: zero width for bit-field" if - * the alignment is wrong. - */ - struct fxsrAlignAssert { - int _:!(offsetof(struct task_struct, - thread.i387.fxsave) & 15); - }; - printk(KERN_INFO "Enabling fast FPU save and restore... 
"); set_in_cr4(X86_CR4_OSFXSR); printk("done.\n"); diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 4525bc2c2e1..12affe1f9bc 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -220,21 +220,21 @@ static void vmi_set_tr(void) static void vmi_write_idt_entry(gate_desc *dt, int entry, const gate_desc *g) { u32 *idt_entry = (u32 *)g; - vmi_ops.write_idt_entry(dt, entry, idt_entry[0], idt_entry[2]); + vmi_ops.write_idt_entry(dt, entry, idt_entry[0], idt_entry[1]); } static void vmi_write_gdt_entry(struct desc_struct *dt, int entry, const void *desc, int type) { u32 *gdt_entry = (u32 *)desc; - vmi_ops.write_gdt_entry(dt, entry, gdt_entry[0], gdt_entry[2]); + vmi_ops.write_gdt_entry(dt, entry, gdt_entry[0], gdt_entry[1]); } static void vmi_write_ldt_entry(struct desc_struct *dt, int entry, const void *desc) { u32 *ldt_entry = (u32 *)desc; - vmi_ops.write_idt_entry(dt, entry, ldt_entry[0], ldt_entry[2]); + vmi_ops.write_idt_entry(dt, entry, ldt_entry[0], ldt_entry[1]); } static void vmi_load_sp0(struct tss_struct *tss, diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index c83e1c9b512..41962e793c0 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -53,5 +53,6 @@ config KVM_AMD # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. source drivers/lguest/Kconfig +source drivers/virtio/Kconfig endif # VIRTUALIZATION diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8f94a0b89df..cf530814868 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1739,7 +1739,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, if (bytes == 8) { gpa_t gpa; struct page *page; - char *addr; + char *kaddr; u64 val; down_read(¤t->mm->mmap_sem); @@ -1754,9 +1754,9 @@ static int emulator_cmpxchg_emulated(unsigned long addr, val = *(u64 *)new; page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); - addr = kmap_atomic(page, KM_USER0); - set_64bit((u64 *)(addr + offset_in_page(gpa)), val); - kunmap_atomic(addr, KM_USER0); + kaddr = kmap_atomic(page, KM_USER0); + set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); + kunmap_atomic(kaddr, KM_USER0); kvm_release_page_dirty(page); emul_write: up_read(¤t->mm->mmap_sem); diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 4876182daf8..25df1c1989f 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -21,7 +21,7 @@ else lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o lib-y += thunk_64.o clear_page_64.o copy_page_64.o - lib-y += bitstr_64.o bitops_64.o + lib-y += bitops_64.o lib-y += memmove_64.o memset_64.o lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o endif diff --git a/arch/x86/lib/bitops_32.c b/arch/x86/lib/bitops_32.c index afd0045595d..b6544045985 100644 --- a/arch/x86/lib/bitops_32.c +++ b/arch/x86/lib/bitops_32.c @@ -2,7 +2,7 @@ #include <linux/module.h> /** - * find_next_bit - find the first set bit in a memory region + * find_next_bit - find the next set bit in a memory region * @addr: The address to base the search on * @offset: The bitnumber to start searching at * @size: The maximum size to search diff --git a/arch/x86/lib/bitops_64.c b/arch/x86/lib/bitops_64.c index 95b6d9639fb..0e8f491e6cc 100644 --- a/arch/x86/lib/bitops_64.c +++ b/arch/x86/lib/bitops_64.c @@ -58,7 +58,7 @@ long find_first_zero_bit(const unsigned long * addr, unsigned long size) } /** - * find_next_zero_bit - find the first zero bit in a memory region + * find_next_zero_bit - find the next zero bit in a memory region * 
@addr: The address to base the search on * @offset: The bitnumber to start searching at * @size: The maximum size to search diff --git a/arch/x86/lib/bitstr_64.c b/arch/x86/lib/bitstr_64.c deleted file mode 100644 index 7445caf1b5d..00000000000 --- a/arch/x86/lib/bitstr_64.c +++ /dev/null @@ -1,28 +0,0 @@ -#include <linux/module.h> -#include <linux/bitops.h> - -/* Find string of zero bits in a bitmap */ -unsigned long -find_next_zero_string(unsigned long *bitmap, long start, long nbits, int len) -{ - unsigned long n, end, i; - - again: - n = find_next_zero_bit(bitmap, nbits, start); - if (n == -1) - return -1; - - /* could test bitsliced, but it's hardly worth it */ - end = n+len; - if (end > nbits) - return -1; - for (i = n+1; i < end; i++) { - if (test_bit(i, bitmap)) { - start = i+1; - goto again; - } - } - return n; -} - -EXPORT_SYMBOL(find_next_zero_string); diff --git a/arch/x86/lib/delay_32.c b/arch/x86/lib/delay_32.c index aad9d95469d..4535e6d147a 100644 --- a/arch/x86/lib/delay_32.c +++ b/arch/x86/lib/delay_32.c @@ -12,8 +12,10 @@ #include <linux/module.h> #include <linux/sched.h> +#include <linux/timex.h> #include <linux/preempt.h> #include <linux/delay.h> +#include <linux/init.h> #include <asm/processor.h> #include <asm/delay.h> @@ -63,7 +65,7 @@ void use_tsc_delay(void) delay_fn = delay_tsc; } -int read_current_timer(unsigned long *timer_val) +int __devinit read_current_timer(unsigned long *timer_val) { if (delay_fn == delay_tsc) { rdtscl(*timer_val); diff --git a/arch/x86/lib/delay_64.c b/arch/x86/lib/delay_64.c index 45cdd3fbd91..bbc61051851 100644 --- a/arch/x86/lib/delay_64.c +++ b/arch/x86/lib/delay_64.c @@ -10,8 +10,10 @@ #include <linux/module.h> #include <linux/sched.h> +#include <linux/timex.h> #include <linux/preempt.h> #include <linux/delay.h> +#include <linux/init.h> #include <asm/delay.h> #include <asm/msr.h> @@ -20,7 +22,7 @@ #include <asm/smp.h> #endif -int read_current_timer(unsigned long *timer_value) +int __devinit read_current_timer(unsigned long *timer_value) { rdtscll(*timer_value); return 0; diff --git a/arch/x86/lib/mmx_32.c b/arch/x86/lib/mmx_32.c index 28084d2e8dd..cc9b4a4450f 100644 --- a/arch/x86/lib/mmx_32.c +++ b/arch/x86/lib/mmx_32.c @@ -4,6 +4,7 @@ #include <linux/hardirq.h> #include <linux/module.h> +#include <asm/asm.h> #include <asm/i387.h> @@ -50,10 +51,7 @@ void *_mmx_memcpy(void *to, const void *from, size_t len) "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ " jmp 2b\n" ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 4\n" - " .long 1b, 3b\n" - ".previous" + _ASM_EXTABLE(1b,3b) : : "r" (from) ); @@ -81,10 +79,7 @@ void *_mmx_memcpy(void *to, const void *from, size_t len) "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ " jmp 2b\n" ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 4\n" - " .long 1b, 3b\n" - ".previous" + _ASM_EXTABLE(1b,3b) : : "r" (from), "r" (to) : "memory"); from+=64; to+=64; @@ -181,10 +176,7 @@ static void fast_copy_page(void *to, void *from) "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ " jmp 2b\n" ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 4\n" - " .long 1b, 3b\n" - ".previous" + _ASM_EXTABLE(1b,3b) : : "r" (from) ); for(i=0; i<(4096-320)/64; i++) @@ -211,10 +203,7 @@ static void fast_copy_page(void *to, void *from) "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ " jmp 2b\n" ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 4\n" - " .long 1b, 3b\n" - ".previous" + _ASM_EXTABLE(1b,3b) : : "r" (from), "r" (to) : "memory"); from+=64; to+=64; @@ -311,10 +300,7 @@ static void fast_copy_page(void 
*to, void *from) "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ " jmp 2b\n" ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 4\n" - " .long 1b, 3b\n" - ".previous" + _ASM_EXTABLE(1b,3b) : : "r" (from) ); for(i=0; i<4096/64; i++) @@ -341,10 +327,7 @@ static void fast_copy_page(void *to, void *from) "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ " jmp 2b\n" ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 4\n" - " .long 1b, 3b\n" - ".previous" + _ASM_EXTABLE(1b,3b) : : "r" (from), "r" (to) : "memory"); from+=64; to+=64; diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 9c4ffd5bedb..e849b9998b0 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -48,10 +48,7 @@ do { \ "3: movl %5,%0\n" \ " jmp 2b\n" \ ".previous\n" \ - ".section __ex_table,\"a\"\n" \ - " .align 4\n" \ - " .long 0b,3b\n" \ - ".previous" \ + _ASM_EXTABLE(0b,3b) \ : "=d"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1), \ "=&D" (__d2) \ : "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \ @@ -132,11 +129,8 @@ do { \ "3: lea 0(%2,%0,4),%0\n" \ " jmp 2b\n" \ ".previous\n" \ - ".section __ex_table,\"a\"\n" \ - " .align 4\n" \ - " .long 0b,3b\n" \ - " .long 1b,2b\n" \ - ".previous" \ + _ASM_EXTABLE(0b,3b) \ + _ASM_EXTABLE(1b,2b) \ : "=&c"(size), "=&D" (__d0) \ : "r"(size & 3), "0"(size / 4), "1"(addr), "a"(0)); \ } while (0) diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 893d43f838c..0c89d1bb028 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -31,10 +31,7 @@ do { \ "3: movq %5,%0\n" \ " jmp 2b\n" \ ".previous\n" \ - ".section __ex_table,\"a\"\n" \ - " .align 8\n" \ - " .quad 0b,3b\n" \ - ".previous" \ + _ASM_EXTABLE(0b,3b) \ : "=r"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1), \ "=&D" (__d2) \ : "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \ @@ -87,11 +84,8 @@ unsigned long __clear_user(void __user *addr, unsigned long size) "3: lea 0(%[size1],%[size8],8),%[size8]\n" " jmp 2b\n" ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 8\n" - " .quad 0b,3b\n" - " .quad 1b,2b\n" - ".previous" + _ASM_EXTABLE(0b,3b) + _ASM_EXTABLE(1b,2b) : [size8] "=c"(size), [dst] "=&D" (__d0) : [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr), [zero] "r" (0UL), [eight] "r" (8UL)); diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c index dffa786f61f..3cc8eb2f36a 100644 --- a/arch/x86/mach-voyager/voyager_smp.c +++ b/arch/x86/mach-voyager/voyager_smp.c @@ -444,8 +444,6 @@ static __u32 __init setup_trampoline(void) static void __init start_secondary(void *unused) { __u8 cpuid = hard_smp_processor_id(); - /* external functions not defined in the headers */ - extern void calibrate_delay(void); cpu_init(); diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 04b1d20e261..c394ca0720b 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -391,7 +391,8 @@ unsigned long __init setup_memory(void) void __init numa_kva_reserve(void) { if (kva_pages) - reserve_bootmem(PFN_PHYS(kva_start_pfn), PFN_PHYS(kva_pages)); + reserve_bootmem(PFN_PHYS(kva_start_pfn), PFN_PHYS(kva_pages), + BOOTMEM_DEFAULT); } void __init zone_sizes_init(void) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index e4440d0abf8..621afb6343d 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -240,7 +240,8 @@ void dump_pagetable(unsigned long address) pud = pud_offset(pgd, address); if (bad_address(pud)) goto bad; printk("PUD %lx ", pud_val(*pud)); - if 
(!pud_present(*pud)) goto ret; + if (!pud_present(*pud) || pud_large(*pud)) + goto ret; pmd = pmd_offset(pud, address); if (bad_address(pmd)) goto bad; @@ -427,6 +428,16 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, } #endif +static int spurious_fault_check(unsigned long error_code, pte_t *pte) +{ + if ((error_code & PF_WRITE) && !pte_write(*pte)) + return 0; + if ((error_code & PF_INSTR) && !pte_exec(*pte)) + return 0; + + return 1; +} + /* * Handle a spurious fault caused by a stale TLB entry. This allows * us to lazily refresh the TLB when increasing the permissions of a @@ -456,20 +467,21 @@ static int spurious_fault(unsigned long address, if (!pud_present(*pud)) return 0; + if (pud_large(*pud)) + return spurious_fault_check(error_code, (pte_t *) pud); + pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) return 0; + if (pmd_large(*pmd)) + return spurious_fault_check(error_code, (pte_t *) pmd); + pte = pte_offset_kernel(pmd, address); if (!pte_present(*pte)) return 0; - if ((error_code & PF_WRITE) && !pte_write(*pte)) - return 0; - if ((error_code & PF_INSTR) && !pte_exec(*pte)) - return 0; - - return 1; + return spurious_fault_check(error_code, pte); } /* @@ -508,6 +520,10 @@ static int vmalloc_fault(unsigned long address) pmd_t *pmd, *pmd_ref; pte_t *pte, *pte_ref; + /* Make sure we are in vmalloc area */ + if (!(address >= VMALLOC_START && address < VMALLOC_END)) + return -1; + /* Copy kernel mappings over when needed. This can also happen within a race in page table update. In the later case just flush. */ @@ -603,6 +619,9 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) */ #ifdef CONFIG_X86_32 if (unlikely(address >= TASK_SIZE)) { +#else + if (unlikely(address >= TASK_SIZE64)) { +#endif if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && vmalloc_fault(address) >= 0) return; @@ -618,6 +637,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) goto bad_area_nosemaphore; } + +#ifdef CONFIG_X86_32 /* It's safe to allow irq's after cr2 has been saved and the vmalloc fault has been handled. */ if (regs->flags & (X86_EFLAGS_IF|VM_MASK)) @@ -630,28 +651,6 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) if (in_atomic() || !mm) goto bad_area_nosemaphore; #else /* CONFIG_X86_64 */ - if (unlikely(address >= TASK_SIZE64)) { - /* - * Don't check for the module range here: its PML4 - * is always initialized because it's shared with the main - * kernel text. Only vmalloc may need PML4 syncups. - */ - if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && - ((address >= VMALLOC_START && address < VMALLOC_END))) { - if (vmalloc_fault(address) >= 0) - return; - } - - /* Can handle a stale RO->RW TLB */ - if (spurious_fault(address, error_code)) - return; - - /* - * Don't take the mm semaphore here. If we fixup a prefetch - * fault we could otherwise deadlock. 
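 * (Illustrative note on the consolidation above: the new
 *  spurious_fault_check() can be applied to large pud and pmd entries
 *  by a simple cast,
 *
 *	return spurious_fault_check(error_code, (pte_t *) pmd);
 *
 *  because _PAGE_RW and _PAGE_NX occupy the same bit positions at
 *  every page-table level.)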
- */ - goto bad_area_nosemaphore; - } if (likely(regs->flags & X86_EFLAGS_IF)) local_irq_enable(); @@ -959,11 +958,12 @@ void vmalloc_sync_all(void) for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) { if (!test_bit(pgd_index(address), insync)) { const pgd_t *pgd_ref = pgd_offset_k(address); + unsigned long flags; struct page *page; if (pgd_none(*pgd_ref)) continue; - spin_lock(&pgd_lock); + spin_lock_irqsave(&pgd_lock, flags); list_for_each_entry(page, &pgd_list, lru) { pgd_t *pgd; pgd = (pgd_t *)page_address(page) + pgd_index(address); @@ -972,7 +972,7 @@ void vmalloc_sync_all(void) else BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); } - spin_unlock(&pgd_lock); + spin_unlock_irqrestore(&pgd_lock, flags); set_bit(pgd_index(address), insync); } if (address == start) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index f2f36f8dae5..d1bc04006d1 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -31,6 +31,7 @@ #include <linux/initrd.h> #include <linux/cpumask.h> +#include <asm/asm.h> #include <asm/processor.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -718,10 +719,7 @@ static noinline int do_test_wp_bit(void) "1: movb %1, %0 \n" " xorl %2, %2 \n" "2: \n" - ".section __ex_table, \"a\"\n" - " .align 4 \n" - " .long 1b, 2b \n" - ".previous \n" + _ASM_EXTABLE(1b,2b) :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)), "=q" (tmp_reg), "=r" (flag) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index eabcaed76c2..5fe880fc305 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -273,7 +273,6 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) int i = pmd_index(address); for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) { - unsigned long entry; pmd_t *pmd = pmd_page + pmd_index(address); if (address >= end) { @@ -287,9 +286,8 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) if (pmd_val(*pmd)) continue; - entry = __PAGE_KERNEL_LARGE|_PAGE_GLOBAL|address; - entry &= __supported_pte_mask; - set_pmd(pmd, __pmd(entry)); + set_pte((pte_t *)pmd, + pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); } } @@ -435,49 +433,6 @@ void __init paging_init(void) #endif /* - * Unmap a kernel mapping if it exists. This is useful to avoid - * prefetches from the CPU leading to inconsistent cache lines. - * address and size must be aligned to 2MB boundaries. - * Does nothing when the mapping doesn't exist. - */ -void __init clear_kernel_mapping(unsigned long address, unsigned long size) -{ - unsigned long end = address + size; - - BUG_ON(address & ~LARGE_PAGE_MASK); - BUG_ON(size & ~LARGE_PAGE_MASK); - - for (; address < end; address += LARGE_PAGE_SIZE) { - pgd_t *pgd = pgd_offset_k(address); - pud_t *pud; - pmd_t *pmd; - - if (pgd_none(*pgd)) - continue; - - pud = pud_offset(pgd, address); - if (pud_none(*pud)) - continue; - - pmd = pmd_offset(pud, address); - if (!pmd || pmd_none(*pmd)) - continue; - - if (!(pmd_val(*pmd) & _PAGE_PSE)) { - /* - * Could handle this, but it should not happen - * currently: - */ - printk(KERN_ERR "clear_kernel_mapping: " - "mapping has been split. 
will leak memory\n"); - pmd_ERROR(*pmd); - } - set_pmd(pmd, __pmd(0)); - } - __flush_tlb_all(); -} - -/* * Memory hotplug specific functions */ void online_page(struct page *page) @@ -636,10 +591,17 @@ void mark_rodata_ro(void) if (end <= start) return; - set_memory_ro(start, (end - start) >> PAGE_SHIFT); printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", (end - start) >> 10); + set_memory_ro(start, (end - start) >> PAGE_SHIFT); + + /* + * The rodata section (but not the kernel text!) should also be + * not-executable. + */ + start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; + set_memory_nx(start, (end - start) >> PAGE_SHIFT); rodata_test(); @@ -682,9 +644,9 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len) /* Should check here against the e820 map to avoid double free */ #ifdef CONFIG_NUMA - reserve_bootmem_node(NODE_DATA(nid), phys, len); + reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT); #else - reserve_bootmem(phys, len); + reserve_bootmem(phys, len, BOOTMEM_DEFAULT); #endif if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) { dma_reserve += len / PAGE_SIZE; diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index c004d94608f..ee6648fe6b1 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -70,25 +70,12 @@ int page_is_ram(unsigned long pagenr) * Fix up the linear direct mapping of the kernel to avoid cache attribute * conflicts. */ -static int ioremap_change_attr(unsigned long paddr, unsigned long size, +static int ioremap_change_attr(unsigned long vaddr, unsigned long size, enum ioremap_mode mode) { - unsigned long vaddr = (unsigned long)__va(paddr); unsigned long nrpages = size >> PAGE_SHIFT; - unsigned int level; int err; - /* No change for pages after the last mapping */ - if ((paddr + size - 1) >= (max_pfn_mapped << PAGE_SHIFT)) - return 0; - - /* - * If there is no identity map for this address, - * change_page_attr_addr is unnecessary - */ - if (!lookup_address(vaddr, &level)) - return 0; - switch (mode) { case IOR_MODE_UNCACHED: default: @@ -114,9 +101,8 @@ static int ioremap_change_attr(unsigned long paddr, unsigned long size, static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size, enum ioremap_mode mode) { - void __iomem *addr; + unsigned long pfn, offset, last_addr, vaddr; struct vm_struct *area; - unsigned long offset, last_addr; pgprot_t prot; /* Don't allow wraparound or zero size */ @@ -133,9 +119,10 @@ static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size, /* * Don't allow anybody to remap normal RAM that we're using.. 
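 * (Illustrative note: the set_memory_*() helpers used throughout this
 *  series are thin wrappers over the pageattr machinery, roughly
 *
 *	set_memory_ro(addr, npages);	clears _PAGE_RW
 *	set_memory_nx(addr, npages);	sets _PAGE_NX
 *	set_memory_np(addr, npages);	clears _PAGE_PRESENT
 *
 *  which is what allows the GART code earlier in this patch to drop
 *  the ad-hoc clear_kernel_mapping() in favour of set_memory_np().)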
*/ - for (offset = phys_addr >> PAGE_SHIFT; offset < max_pfn_mapped && - (offset << PAGE_SHIFT) < last_addr; offset++) { - if (page_is_ram(offset)) + for (pfn = phys_addr >> PAGE_SHIFT; pfn < max_pfn_mapped && + (pfn << PAGE_SHIFT) < last_addr; pfn++) { + if (page_is_ram(pfn) && pfn_valid(pfn) && + !PageReserved(pfn_to_page(pfn))) return NULL; } @@ -163,19 +150,18 @@ static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size, if (!area) return NULL; area->phys_addr = phys_addr; - addr = (void __iomem *) area->addr; - if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size, - phys_addr, prot)) { - remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr)); + vaddr = (unsigned long) area->addr; + if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) { + remove_vm_area((void *)(vaddr & PAGE_MASK)); return NULL; } - if (ioremap_change_attr(phys_addr, size, mode) < 0) { - vunmap(addr); + if (ioremap_change_attr(vaddr, size, mode) < 0) { + vunmap(area->addr); return NULL; } - return (void __iomem *) (offset + (char __iomem *)addr); + return (void __iomem *) (vaddr + offset); } /** @@ -254,9 +240,6 @@ void iounmap(volatile void __iomem *addr) return; } - /* Reset the direct mapping. Can block */ - ioremap_change_attr(p->phys_addr, p->size, IOR_MODE_CACHED); - /* Finally remove it */ o = remove_vm_area((void *)addr); BUG_ON(p != o || o == NULL); diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index a920d09b919..1aecc658cd7 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -202,6 +202,8 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, if (node_data[nodeid] == NULL) return; nodedata_phys = __pa(node_data[nodeid]); + printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys, + nodedata_phys + pgdat_size - 1); memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; @@ -225,17 +227,21 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, return; } bootmap_start = __pa(bootmap); - Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages); bootmap_size = init_bootmem_node(NODE_DATA(nodeid), bootmap_start >> PAGE_SHIFT, start_pfn, end_pfn); + printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n", + bootmap_start, bootmap_start + bootmap_size - 1, + bootmap_pages); + free_bootmem_with_active_regions(nodeid, end); - reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); + reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size, + BOOTMEM_DEFAULT); reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, - bootmap_pages<<PAGE_SHIFT); + bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT); #ifdef CONFIG_ACPI_NUMA srat_reserve_add_area(nodeid); #endif diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index 7573e786d2f..ed820160035 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c @@ -5,6 +5,7 @@ * and compares page tables forwards and afterwards. 
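 * (Illustrative sketch of the reworked __ioremap() above, error
 *  handling elided:
 *
 *	area  = get_vm_area(size, VM_IOREMAP);
 *	vaddr = (unsigned long) area->addr;
 *	ioremap_page_range(vaddr, vaddr + size, phys_addr, prot);
 *	ioremap_change_attr(vaddr, size, mode);
 *	return (void __iomem *) (vaddr + offset);
 *
 *  note that ioremap_change_attr() now takes the virtual address
 *  directly instead of rederiving it from the physical one.)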
*/ #include <linux/bootmem.h> +#include <linux/kthread.h> #include <linux/random.h> #include <linux/kernel.h> #include <linux/init.h> @@ -14,8 +15,13 @@ #include <asm/pgtable.h> #include <asm/kdebug.h> +/* + * Only print the results of the first pass: + */ +static __read_mostly int print = 1; + enum { - NTEST = 4000, + NTEST = 400, #ifdef CONFIG_X86_64 LPS = (1 << PMD_SHIFT), #elif defined(CONFIG_X86_PAE) @@ -31,7 +37,7 @@ struct split_state { long min_exec, max_exec; }; -static __init int print_split(struct split_state *s) +static int print_split(struct split_state *s) { long i, expected, missed = 0; int printed = 0; @@ -82,10 +88,13 @@ static __init int print_split(struct split_state *s) s->max_exec = addr; } } - printk(KERN_INFO - "CPA mapping 4k %lu large %lu gb %lu x %lu[%lx-%lx] miss %lu\n", - s->spg, s->lpg, s->gpg, s->exec, - s->min_exec != ~0UL ? s->min_exec : 0, s->max_exec, missed); + if (print) { + printk(KERN_INFO + " 4k %lu large %lu gb %lu x %lu[%lx-%lx] miss %lu\n", + s->spg, s->lpg, s->gpg, s->exec, + s->min_exec != ~0UL ? s->min_exec : 0, + s->max_exec, missed); + } expected = (s->gpg*GPS + s->lpg*LPS)/PAGE_SIZE + s->spg + missed; if (expected != i) { @@ -96,11 +105,11 @@ static __init int print_split(struct split_state *s) return err; } -static unsigned long __initdata addr[NTEST]; -static unsigned int __initdata len[NTEST]; +static unsigned long addr[NTEST]; +static unsigned int len[NTEST]; /* Change the global bit on random pages in the direct mapping */ -static __init int exercise_pageattr(void) +static int pageattr_test(void) { struct split_state sa, sb, sc; unsigned long *bm; @@ -110,7 +119,8 @@ static __init int exercise_pageattr(void) int i, k; int err; - printk(KERN_INFO "CPA exercising pageattr\n"); + if (print) + printk(KERN_INFO "CPA self-test:\n"); bm = vmalloc((max_pfn_mapped + 7) / 8); if (!bm) { @@ -137,7 +147,8 @@ static __init int exercise_pageattr(void) for (k = 0; k < len[i]; k++) { pte = lookup_address(addr[i] + k*PAGE_SIZE, &level); - if (!pte || pgprot_val(pte_pgprot(*pte)) == 0) { + if (!pte || pgprot_val(pte_pgprot(*pte)) == 0 || + !(pte_val(*pte) & _PAGE_PRESENT)) { addr[i] = 0; break; } @@ -185,7 +196,6 @@ static __init int exercise_pageattr(void) failed += print_split(&sb); - printk(KERN_INFO "CPA reverting everything\n"); for (i = 0; i < NTEST; i++) { if (!addr[i]) continue; @@ -213,12 +223,40 @@ static __init int exercise_pageattr(void) failed += print_split(&sc); if (failed) { - printk(KERN_ERR "CPA selftests NOT PASSED. Please report.\n"); + printk(KERN_ERR "NOT PASSED. 
Please report.\n"); WARN_ON(1); + return -EINVAL; } else { - printk(KERN_INFO "CPA selftests PASSED\n"); + if (print) + printk(KERN_INFO "ok.\n"); } return 0; } -module_init(exercise_pageattr); + +static int do_pageattr_test(void *__unused) +{ + while (!kthread_should_stop()) { + schedule_timeout_interruptible(HZ*30); + if (pageattr_test() < 0) + break; + if (print) + print--; + } + return 0; +} + +static int start_pageattr_test(void) +{ + struct task_struct *p; + + p = kthread_create(do_pageattr_test, NULL, "pageattr-test"); + if (!IS_ERR(p)) + wake_up_process(p); + else + WARN_ON(1); + + return 0; +} + +module_init(start_pageattr_test); diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index e297bd65e51..8493c855582 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -16,6 +16,17 @@ #include <asm/uaccess.h> #include <asm/pgalloc.h> +/* + * The current flushing context - we pass it instead of 5 arguments: + */ +struct cpa_data { + unsigned long vaddr; + pgprot_t mask_set; + pgprot_t mask_clr; + int numpages; + int flushtlb; +}; + static inline int within(unsigned long addr, unsigned long start, unsigned long end) { @@ -52,21 +63,23 @@ void clflush_cache_range(void *vaddr, unsigned int size) static void __cpa_flush_all(void *arg) { + unsigned long cache = (unsigned long)arg; + /* * Flush all to work around Errata in early athlons regarding * large page flushing. */ __flush_tlb_all(); - if (boot_cpu_data.x86_model >= 4) + if (cache && boot_cpu_data.x86_model >= 4) wbinvd(); } -static void cpa_flush_all(void) +static void cpa_flush_all(unsigned long cache) { BUG_ON(irqs_disabled()); - on_each_cpu(__cpa_flush_all, NULL, 1, 1); + on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1); } static void __cpa_flush_range(void *arg) @@ -79,7 +92,7 @@ static void __cpa_flush_range(void *arg) __flush_tlb_all(); } -static void cpa_flush_range(unsigned long start, int numpages) +static void cpa_flush_range(unsigned long start, int numpages, int cache) { unsigned int i, level; unsigned long addr; @@ -89,6 +102,9 @@ static void cpa_flush_range(unsigned long start, int numpages) on_each_cpu(__cpa_flush_range, NULL, 1, 1); + if (!cache) + return; + /* * We only need to flush on one CPU, * clflush is a MESI-coherent instruction that @@ -101,11 +117,27 @@ static void cpa_flush_range(unsigned long start, int numpages) /* * Only flush present addresses: */ - if (pte && pte_present(*pte)) + if (pte && (pte_val(*pte) & _PAGE_PRESENT)) clflush_cache_range((void *) addr, PAGE_SIZE); } } +#define HIGH_MAP_START __START_KERNEL_map +#define HIGH_MAP_END (__START_KERNEL_map + KERNEL_TEXT_SIZE) + + +/* + * Converts a virtual address to a X86-64 highmap address + */ +static unsigned long virt_to_highmap(void *address) +{ +#ifdef CONFIG_X86_64 + return __pa((unsigned long)address) + HIGH_MAP_START - phys_base; +#else + return (unsigned long)address; +#endif +} + /* * Certain areas of memory on x86 require very specific protection flags, * for example the BIOS area or kernel text. 
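 * (Illustrative sketch: with struct cpa_data above, a caller that
 *  wants a range read-only fills in the two masks and lets
 *  __change_page_attr_set_clr() iterate, roughly
 *
 *	struct cpa_data cpa = {
 *		.vaddr    = addr,
 *		.numpages = npages,
 *		.mask_set = __pgprot(0),
 *		.mask_clr = __pgprot(_PAGE_RW),
 *	};
 *	ret = __change_page_attr_set_clr(&cpa);
 *
 *  the same pattern the DEBUG_PAGEALLOC helpers adopt later in this
 *  patch.)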
Callers don't always get this @@ -129,19 +161,36 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address) */ if (within(address, (unsigned long)_text, (unsigned long)_etext)) pgprot_val(forbidden) |= _PAGE_NX; + /* + * Do the same for the x86-64 high kernel mapping + */ + if (within(address, virt_to_highmap(_text), virt_to_highmap(_etext))) + pgprot_val(forbidden) |= _PAGE_NX; -#ifdef CONFIG_DEBUG_RODATA /* The .rodata section needs to be read-only */ if (within(address, (unsigned long)__start_rodata, (unsigned long)__end_rodata)) pgprot_val(forbidden) |= _PAGE_RW; -#endif + /* + * Do the same for the x86-64 high kernel mapping + */ + if (within(address, virt_to_highmap(__start_rodata), + virt_to_highmap(__end_rodata))) + pgprot_val(forbidden) |= _PAGE_RW; prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); return prot; } +/* + * Lookup the page table entry for a virtual address. Return a pointer + * to the entry and the level of the mapping. + * + * Note: We return pud and pmd either when the entry is marked large + * or when the present bit is not set. Otherwise we would return a + * pointer to a nonexisting mapping. + */ pte_t *lookup_address(unsigned long address, int *level) { pgd_t *pgd = pgd_offset_k(address); @@ -152,21 +201,31 @@ pte_t *lookup_address(unsigned long address, int *level) if (pgd_none(*pgd)) return NULL; + pud = pud_offset(pgd, address); if (pud_none(*pud)) return NULL; + + *level = PG_LEVEL_1G; + if (pud_large(*pud) || !pud_present(*pud)) + return (pte_t *)pud; + pmd = pmd_offset(pud, address); if (pmd_none(*pmd)) return NULL; *level = PG_LEVEL_2M; - if (pmd_large(*pmd)) + if (pmd_large(*pmd) || !pmd_present(*pmd)) return (pte_t *)pmd; *level = PG_LEVEL_4K; + return pte_offset_kernel(pmd, address); } +/* + * Set the new pmd in all the pgds we know about: + */ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) { /* change init_mm */ @@ -189,18 +248,103 @@ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) #endif } +static int +try_preserve_large_page(pte_t *kpte, unsigned long address, + struct cpa_data *cpa) +{ + unsigned long nextpage_addr, numpages, pmask, psize, flags; + pte_t new_pte, old_pte, *tmp; + pgprot_t old_prot, new_prot; + int level, do_split = 1; + + spin_lock_irqsave(&pgd_lock, flags); + /* + * Check for races, another CPU might have split this page + * up already: + */ + tmp = lookup_address(address, &level); + if (tmp != kpte) + goto out_unlock; + + switch (level) { + case PG_LEVEL_2M: + psize = PMD_PAGE_SIZE; + pmask = PMD_PAGE_MASK; + break; +#ifdef CONFIG_X86_64 + case PG_LEVEL_1G: + psize = PMD_PAGE_SIZE; + pmask = PMD_PAGE_MASK; + break; +#endif + default: + do_split = -EINVAL; + goto out_unlock; + } + + /* + * Calculate the number of pages, which fit into this large + * page starting at address: + */ + nextpage_addr = (address + psize) & pmask; + numpages = (nextpage_addr - address) >> PAGE_SHIFT; + if (numpages < cpa->numpages) + cpa->numpages = numpages; + + /* + * We are safe now. Check whether the new pgprot is the same: + */ + old_pte = *kpte; + old_prot = new_prot = pte_pgprot(old_pte); + + pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); + pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); + new_prot = static_protections(new_prot, address); + + /* + * If there are no changes, return. 
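 * (Worked example: on x86-64 psize is 2 MB here, so a request for 512
 *  pages starting on a 2 MB boundary satisfies
 *
 *	address == (nextpage_addr - psize) && cpa->numpages == numpages
 *
 *  and the whole large page is rewritten in place; anything smaller
 *  or unaligned falls through to split_large_page().)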
maxpages has been updated + * above: + */ + if (pgprot_val(new_prot) == pgprot_val(old_prot)) { + do_split = 0; + goto out_unlock; + } + + /* + * We need to change the attributes. Check, whether we can + * change the large page in one go. We request a split, when + * the address is not aligned and the number of pages is + * smaller than the number of pages in the large page. Note + * that we limited the number of possible pages already to + * the number of pages in the large page. + */ + if (address == (nextpage_addr - psize) && cpa->numpages == numpages) { + /* + * The address is aligned and the number of pages + * covers the full page. + */ + new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot)); + __set_pmd_pte(kpte, address, new_pte); + cpa->flushtlb = 1; + do_split = 0; + } + +out_unlock: + spin_unlock_irqrestore(&pgd_lock, flags); + + return do_split; +} + static int split_large_page(pte_t *kpte, unsigned long address) { - pgprot_t ref_prot = pte_pgprot(pte_clrhuge(*kpte)); + unsigned long flags, pfn, pfninc = 1; gfp_t gfp_flags = GFP_KERNEL; - unsigned long flags; - unsigned long addr; + unsigned int i, level; pte_t *pbase, *tmp; + pgprot_t ref_prot; struct page *base; - unsigned int i, level; #ifdef CONFIG_DEBUG_PAGEALLOC - gfp_flags = __GFP_HIGH | __GFP_NOFAIL | __GFP_NOWARN; gfp_flags = GFP_ATOMIC | __GFP_NOWARN; #endif base = alloc_pages(gfp_flags, 0); @@ -213,30 +357,41 @@ static int split_large_page(pte_t *kpte, unsigned long address) * up for us already: */ tmp = lookup_address(address, &level); - if (tmp != kpte) { - WARN_ON_ONCE(1); + if (tmp != kpte) goto out_unlock; - } - address = __pa(address); - addr = address & LARGE_PAGE_MASK; pbase = (pte_t *)page_address(base); #ifdef CONFIG_X86_32 paravirt_alloc_pt(&init_mm, page_to_pfn(base)); #endif + ref_prot = pte_pgprot(pte_clrhuge(*kpte)); - pgprot_val(ref_prot) &= ~_PAGE_NX; - for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) - set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT, ref_prot)); +#ifdef CONFIG_X86_64 + if (level == PG_LEVEL_1G) { + pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT; + pgprot_val(ref_prot) |= _PAGE_PSE; + } +#endif + + /* + * Get the target pfn from the original entry: + */ + pfn = pte_pfn(*kpte); + for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) + set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); /* - * Install the new, split up pagetable. Important detail here: + * Install the new, split up pagetable. Important details here: * * On Intel the NX bit of all levels must be cleared to make a * page executable. See section 4.13.2 of Intel 64 and IA-32 * Architectures Software Developer's Manual). + * + * Mark the entry present. The current mapping might be + * set to not present, which we preserved above. 
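 * (Worked example of the pfn stepping above: for a 1 GB split,
 *  pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT = 512, so each of the 512 new
 *  entries advances the target pfn by one 2 MB large page; for the
 *  2 MB case pfninc stays 1 and ordinary 4 KB ptes result.)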
*/ ref_prot = pte_pgprot(pte_mkexec(pte_clrhuge(*kpte))); + pgprot_val(ref_prot) |= _PAGE_PRESENT; __set_pmd_pte(kpte, address, mk_pte(base, ref_prot)); base = NULL; @@ -249,18 +404,12 @@ out_unlock: return 0; } -static int -__change_page_attr(unsigned long address, unsigned long pfn, - pgprot_t mask_set, pgprot_t mask_clr) +static int __change_page_attr(unsigned long address, struct cpa_data *cpa) { + int level, do_split, err; struct page *kpte_page; - int level, err = 0; pte_t *kpte; -#ifdef CONFIG_X86_32 - BUG_ON(pfn > max_low_pfn); -#endif - repeat: kpte = lookup_address(address, &level); if (!kpte) @@ -271,23 +420,62 @@ repeat: BUG_ON(PageCompound(kpte_page)); if (level == PG_LEVEL_4K) { - pgprot_t new_prot = pte_pgprot(*kpte); pte_t new_pte, old_pte = *kpte; + pgprot_t new_prot = pte_pgprot(old_pte); + + if(!pte_val(old_pte)) { + printk(KERN_WARNING "CPA: called for zero pte. " + "vaddr = %lx cpa->vaddr = %lx\n", address, + cpa->vaddr); + WARN_ON(1); + return -EINVAL; + } - pgprot_val(new_prot) &= ~pgprot_val(mask_clr); - pgprot_val(new_prot) |= pgprot_val(mask_set); + pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); + pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); new_prot = static_protections(new_prot, address); - new_pte = pfn_pte(pfn, canon_pgprot(new_prot)); - BUG_ON(pte_pfn(new_pte) != pte_pfn(old_pte)); + /* + * We need to keep the pfn from the existing PTE, + * after all we're only going to change it's attributes + * not the memory it points to + */ + new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot)); - set_pte_atomic(kpte, new_pte); - } else { - err = split_large_page(kpte, address); - if (!err) - goto repeat; + /* + * Do we really change anything ? + */ + if (pte_val(old_pte) != pte_val(new_pte)) { + set_pte_atomic(kpte, new_pte); + cpa->flushtlb = 1; + } + cpa->numpages = 1; + return 0; } + + /* + * Check, whether we can keep the large page intact + * and just change the pte: + */ + do_split = try_preserve_large_page(kpte, address, cpa); + /* + * When the range fits into the existing large page, + * return. cp->numpages and cpa->tlbflush have been updated in + * try_large_page: + */ + if (do_split <= 0) + return do_split; + + /* + * We have to split the large page: + */ + err = split_large_page(kpte, address); + if (!err) { + cpa->flushtlb = 1; + goto repeat; + } + return err; } @@ -304,19 +492,14 @@ repeat: * * Modules and drivers should use the set_memory_* APIs instead. */ - -#define HIGH_MAP_START __START_KERNEL_map -#define HIGH_MAP_END (__START_KERNEL_map + KERNEL_TEXT_SIZE) - -static int -change_page_attr_addr(unsigned long address, pgprot_t mask_set, - pgprot_t mask_clr) +static int change_page_attr_addr(struct cpa_data *cpa) { - unsigned long phys_addr = __pa(address); - unsigned long pfn = phys_addr >> PAGE_SHIFT; int err; + unsigned long address = cpa->vaddr; #ifdef CONFIG_X86_64 + unsigned long phys_addr = __pa(address); + /* * If we are inside the high mapped kernel range, then we * fixup the low mapping first. __va() returns the virtual @@ -326,7 +509,7 @@ change_page_attr_addr(unsigned long address, pgprot_t mask_set, address = (unsigned long) __va(phys_addr); #endif - err = __change_page_attr(address, pfn, mask_set, mask_clr); + err = __change_page_attr(address, cpa); if (err) return err; @@ -339,42 +522,89 @@ change_page_attr_addr(unsigned long address, pgprot_t mask_set, /* * Calc the high mapping address. See __phys_addr() * for the non obvious details. 
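 * (Worked example of the alias fixup: for a kernel-text address the
 *  high alias is recovered as
 *
 *	highmap = __pa(address) + __START_KERNEL_map - phys_base
 *
 *  exactly what virt_to_highmap() computes earlier in this patch, so
 *  the direct mapping and the high mapping end up with consistent
 *  attributes.)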
+ * + * Note that NX and other required permissions are + * checked in static_protections(). */ address = phys_addr + HIGH_MAP_START - phys_base; - /* Make sure the kernel mappings stay executable */ - pgprot_val(mask_clr) |= _PAGE_NX; /* * Our high aliases are imprecise, because we check * everything between 0 and KERNEL_TEXT_SIZE, so do * not propagate lookup failures back to users: */ - __change_page_attr(address, pfn, mask_set, mask_clr); + __change_page_attr(address, cpa); } #endif return err; } -static int __change_page_attr_set_clr(unsigned long addr, int numpages, - pgprot_t mask_set, pgprot_t mask_clr) +static int __change_page_attr_set_clr(struct cpa_data *cpa) { - unsigned int i; - int ret; + int ret, numpages = cpa->numpages; - for (i = 0; i < numpages ; i++, addr += PAGE_SIZE) { - ret = change_page_attr_addr(addr, mask_set, mask_clr); + while (numpages) { + /* + * Store the remaining nr of pages for the large page + * preservation check. + */ + cpa->numpages = numpages; + ret = change_page_attr_addr(cpa); if (ret) return ret; - } + /* + * Adjust the number of pages with the result of the + * CPA operation. Either a large page has been + * preserved or a single page update happened. + */ + BUG_ON(cpa->numpages > numpages); + numpages -= cpa->numpages; + cpa->vaddr += cpa->numpages * PAGE_SIZE; + } return 0; } +static inline int cache_attr(pgprot_t attr) +{ + return pgprot_val(attr) & + (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD); +} + static int change_page_attr_set_clr(unsigned long addr, int numpages, pgprot_t mask_set, pgprot_t mask_clr) { - int ret = __change_page_attr_set_clr(addr, numpages, mask_set, - mask_clr); + struct cpa_data cpa; + int ret, cache; + + /* + * Check, if we are requested to change a not supported + * feature: + */ + mask_set = canon_pgprot(mask_set); + mask_clr = canon_pgprot(mask_clr); + if (!pgprot_val(mask_set) && !pgprot_val(mask_clr)) + return 0; + + cpa.vaddr = addr; + cpa.numpages = numpages; + cpa.mask_set = mask_set; + cpa.mask_clr = mask_clr; + cpa.flushtlb = 0; + + ret = __change_page_attr_set_clr(&cpa); + + /* + * Check whether we really changed something: + */ + if (!cpa.flushtlb) + return ret; + + /* + * No need to flush, when we did not set any of the caching + * attributes: + */ + cache = cache_attr(mask_set); /* * On success we use clflush, when the CPU supports it to @@ -383,9 +613,9 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, * wbindv): */ if (!ret && cpu_has_clflush) - cpa_flush_range(addr, numpages); + cpa_flush_range(addr, numpages, cache); else - cpa_flush_all(); + cpa_flush_all(cache); return ret; } @@ -489,37 +719,26 @@ int set_pages_rw(struct page *page, int numpages) return set_memory_rw(addr, numpages); } - -#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_CPA_DEBUG) -static inline int __change_page_attr_set(unsigned long addr, int numpages, - pgprot_t mask) -{ - return __change_page_attr_set_clr(addr, numpages, mask, __pgprot(0)); -} - -static inline int __change_page_attr_clear(unsigned long addr, int numpages, - pgprot_t mask) -{ - return __change_page_attr_set_clr(addr, numpages, __pgprot(0), mask); -} -#endif - #ifdef CONFIG_DEBUG_PAGEALLOC static int __set_pages_p(struct page *page, int numpages) { - unsigned long addr = (unsigned long)page_address(page); + struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page), + .numpages = numpages, + .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), + .mask_clr = __pgprot(0)}; - return __change_page_attr_set(addr, numpages, - 
@@ -489,37 +719,26 @@ int set_pages_rw(struct page *page, int numpages)
 
 	return set_memory_rw(addr, numpages);
 }
-
-#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_CPA_DEBUG)
-static inline int __change_page_attr_set(unsigned long addr, int numpages,
-					 pgprot_t mask)
-{
-	return __change_page_attr_set_clr(addr, numpages, mask, __pgprot(0));
-}
-
-static inline int __change_page_attr_clear(unsigned long addr, int numpages,
-					   pgprot_t mask)
-{
-	return __change_page_attr_set_clr(addr, numpages, __pgprot(0), mask);
-}
-#endif
-
 #ifdef CONFIG_DEBUG_PAGEALLOC
 
 static int __set_pages_p(struct page *page, int numpages)
 {
-	unsigned long addr = (unsigned long)page_address(page);
+	struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
+				.numpages = numpages,
+				.mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
+				.mask_clr = __pgprot(0)};
 
-	return __change_page_attr_set(addr, numpages,
-				      __pgprot(_PAGE_PRESENT | _PAGE_RW));
+	return __change_page_attr_set_clr(&cpa);
 }
 
 static int __set_pages_np(struct page *page, int numpages)
 {
-	unsigned long addr = (unsigned long)page_address(page);
+	struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
+				.numpages = numpages,
+				.mask_set = __pgprot(0),
+				.mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)};
 
-	return __change_page_attr_clear(addr, numpages,
-					__pgprot(_PAGE_PRESENT));
+	return __change_page_attr_set_clr(&cpa);
 }
 
 void kernel_map_pages(struct page *page, int numpages, int enable)
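With the private helpers gone, every page-attribute operation funnels through one cpa_data, initialized the same way __set_pages_p()/__set_pages_np() do above. For a hypothetical caller that wants a single page made read-only, the pattern would look like this (a sketch only; addr is assumed to hold a valid kernel virtual address):

	int ret;
	struct cpa_data cpa = { .vaddr	  = addr,
				.numpages = 1,
				.mask_set = __pgprot(0),
				.mask_clr = __pgprot(_PAGE_RW) };  /* clear RW: read-only */

	ret = __change_page_attr_set_clr(&cpa);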
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index cb3aa470249..6c1914622a8 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c
@@ -219,50 +219,39 @@ static inline void pgd_list_del(pgd_t *pgd)
 	list_del(&page->lru);
 }
 
+#define UNSHARED_PTRS_PER_PGD				\
+	(SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
 
-
-#if (PTRS_PER_PMD == 1)
-/* Non-PAE pgd constructor */
-static void pgd_ctor(void *pgd)
+static void pgd_ctor(void *p)
 {
+	pgd_t *pgd = p;
 	unsigned long flags;
 
-	/* !PAE, no pagetable sharing */
+	/* Clear usermode parts of PGD */
 	memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
 
 	spin_lock_irqsave(&pgd_lock, flags);
 
-	/* must happen under lock */
-	clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
-			swapper_pg_dir + USER_PTRS_PER_PGD,
-			KERNEL_PGD_PTRS);
-	paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
-				__pa(swapper_pg_dir) >> PAGE_SHIFT,
-				USER_PTRS_PER_PGD,
-				KERNEL_PGD_PTRS);
-	pgd_list_add(pgd);
-	spin_unlock_irqrestore(&pgd_lock, flags);
-}
-#else  /* PTRS_PER_PMD > 1 */
-/* PAE pgd constructor */
-static void pgd_ctor(void *pgd)
-{
-	/* PAE, kernel PMD may be shared */
-
-	if (SHARED_KERNEL_PMD) {
-		clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
+	/* If the pgd points to a shared pagetable level (either the
+	   ptes in non-PAE, or shared PMD in PAE), then just copy the
+	   references from swapper_pg_dir. */
+	if (PAGETABLE_LEVELS == 2 ||
+	    (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) {
+		clone_pgd_range(pgd + USER_PTRS_PER_PGD,
 				swapper_pg_dir + USER_PTRS_PER_PGD,
 				KERNEL_PGD_PTRS);
-	} else {
-		unsigned long flags;
+		paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
+					__pa(swapper_pg_dir) >> PAGE_SHIFT,
+					USER_PTRS_PER_PGD,
+					KERNEL_PGD_PTRS);
+	}
 
-		memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
-		spin_lock_irqsave(&pgd_lock, flags);
+	/* list required to sync kernel mapping updates */
+	if (!SHARED_KERNEL_PMD)
 		pgd_list_add(pgd);
-		spin_unlock_irqrestore(&pgd_lock, flags);
-	}
+
+	spin_unlock_irqrestore(&pgd_lock, flags);
 }
-#endif	/* PTRS_PER_PMD */
 
 static void pgd_dtor(void *pgd)
 {
@@ -276,9 +265,6 @@ static void pgd_dtor(void *pgd)
 	spin_unlock_irqrestore(&pgd_lock, flags);
 }
 
-#define UNSHARED_PTRS_PER_PGD \
-	(SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
-
 #ifdef CONFIG_X86_PAE
 /*
  * Mop up any pmd pages which may still be attached to the pgd.
 * Normally they will be freed by munmap/exit_mmap, but any pmd we
 * preallocate which never got a corresponding vma will need to be
 * freed manually.
 */
-static void pgd_mop_up_pmds(pgd_t *pgdp)
+static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
 {
 	int i;
 
@@ -299,7 +285,7 @@ static void pgd_mop_up_pmds(pgd_t *pgdp)
 			pgdp[i] = native_make_pgd(0);
 
 			paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT);
-			pmd_free(pmd);
+			pmd_free(mm, pmd);
 		}
 	}
 }
@@ -327,7 +313,7 @@ static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
 		pmd_t *pmd = pmd_alloc_one(mm, addr);
 
 		if (!pmd) {
-			pgd_mop_up_pmds(pgd);
+			pgd_mop_up_pmds(mm, pgd);
 			return 0;
 		}
 
@@ -347,7 +333,7 @@ static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
 	return 1;
 }
 
-static void pgd_mop_up_pmds(pgd_t *pgd)
+static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
 {
 }
 #endif	/* CONFIG_X86_PAE */
@@ -366,9 +352,9 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 	return pgd;
 }
 
-void pgd_free(pgd_t *pgd)
+void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
-	pgd_mop_up_pmds(pgd);
+	pgd_mop_up_pmds(mm, pgd);
 	quicklist_free(0, pgd_dtor, pgd);
 }
 
@@ -387,13 +373,6 @@ void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
 
 void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
 {
-	/* This is called just after the pmd has been detached from
-	   the pgd, which requires a full tlb flush to be recognized
-	   by the CPU. Rather than incurring multiple tlb flushes
-	   while the address space is being pulled down, make the tlb
-	   gathering machinery do a full flush when we're done. */
-	tlb->fullmm = 1;
-
 	paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
 	tlb_remove_page(tlb, virt_to_page(pmd));
 }
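The unified pgd_ctor() above folds the old PAE/non-PAE constructor pair into one sharing test. Restated compactly (pgd_shares_kernel() is a made-up name for illustration, not a kernel helper):

	/*
	 * 2 levels (non-PAE): kernel mappings live in shared pte pages,
	 * so a new pgd can simply clone the kernel pgd entries.
	 * 3 levels (PAE) with SHARED_KERNEL_PMD: likewise, via the shared pmd.
	 * 3 levels without SHARED_KERNEL_PMD: nothing is shared at this
	 * level, so the pgd goes on pgd_list and kernel mapping updates
	 * have to be propagated to it by hand.
	 */
	static int pgd_shares_kernel(int levels, int shared_kernel_pmd)
	{
		return levels == 2 || (levels == 3 && shared_kernel_pmd);
	}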
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 65416f843e5..ecd91ea8a8a 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c
@@ -488,7 +488,8 @@ void __init srat_reserve_add_area(int nodeid)
 		printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
 		       "pre-allocated memory.\n", (unsigned long long)total_mb);
 		reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
-			       nodes_add[nodeid].end - nodes_add[nodeid].start);
+			       nodes_add[nodeid].end - nodes_add[nodeid].start,
+			       BOOTMEM_DEFAULT);
 	}
 }
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 42ba0e2da1a..103b9dff121 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c
@@ -72,7 +72,7 @@ pcibios_align_resource(void *data, struct resource *res,
 		}
 	}
 }
-
+EXPORT_SYMBOL(pcibios_align_resource);
 
 /*
  * Handle resources of PCI devices. If the world were perfect, we could
diff --git a/arch/x86/pci/numa.c b/arch/x86/pci/numa.c index f5f165f69e0..55270c26237 100644 --- a/arch/x86/pci/numa.c +++ b/arch/x86/pci/numa.c
@@ -5,36 +5,62 @@
 #include <linux/pci.h>
 #include <linux/init.h>
 #include <linux/nodemask.h>
+#include <mach_apic.h>
 #include "pci.h"
 
+#define XQUAD_PORTIO_BASE 0xfe400000
+#define XQUAD_PORTIO_QUAD 0x40000	/* 256k per quad. */
+
 #define BUS2QUAD(global) (mp_bus_id_to_node[global])
 #define BUS2LOCAL(global) (mp_bus_id_to_local[global])
 #define QUADLOCAL2BUS(quad,local) (quad_local_to_mp_bus_id[quad][local])
 
+extern void *xquad_portio;	/* Where the IO area was mapped */
+#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port)
+
 #define PCI_CONF1_MQ_ADDRESS(bus, devfn, reg) \
 	(0x80000000 | (BUS2LOCAL(bus) << 16) | (devfn << 8) | (reg & ~3))
 
+static void write_cf8(unsigned bus, unsigned devfn, unsigned reg)
+{
+	unsigned val = PCI_CONF1_MQ_ADDRESS(bus, devfn, reg);
+	if (xquad_portio)
+		writel(val, XQUAD_PORT_ADDR(0xcf8, BUS2QUAD(bus)));
+	else
+		outl(val, 0xCF8);
+}
+
 static int pci_conf1_mq_read(unsigned int seg, unsigned int bus,
 			     unsigned int devfn, int reg, int len, u32 *value)
 {
 	unsigned long flags;
+	void *adr __iomem = XQUAD_PORT_ADDR(0xcfc, BUS2QUAD(bus));
 
 	if (!value || (bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255))
 		return -EINVAL;
 
 	spin_lock_irqsave(&pci_config_lock, flags);
 
-	outl_quad(PCI_CONF1_MQ_ADDRESS(bus, devfn, reg), 0xCF8, BUS2QUAD(bus));
+	write_cf8(bus, devfn, reg);
 
 	switch (len) {
 	case 1:
-		*value = inb_quad(0xCFC + (reg & 3), BUS2QUAD(bus));
+		if (xquad_portio)
+			*value = readb(adr + (reg & 3));
+		else
+			*value = inb(0xCFC + (reg & 3));
 		break;
 	case 2:
-		*value = inw_quad(0xCFC + (reg & 2), BUS2QUAD(bus));
+		if (xquad_portio)
+			*value = readw(adr + (reg & 2));
+		else
+			*value = inw(0xCFC + (reg & 2));
 		break;
 	case 4:
-		*value = inl_quad(0xCFC, BUS2QUAD(bus));
+		if (xquad_portio)
+			*value = readl(adr);
+		else
+			*value = inl(0xCFC);
 		break;
 	}
 
@@ -47,23 +73,33 @@ static int pci_conf1_mq_write(unsigned int seg, unsigned int bus,
 			      unsigned int devfn, int reg, int len, u32 value)
 {
 	unsigned long flags;
+	void *adr __iomem = XQUAD_PORT_ADDR(0xcfc, BUS2QUAD(bus));
 
 	if ((bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255))
 		return -EINVAL;
 
 	spin_lock_irqsave(&pci_config_lock, flags);
 
-	outl_quad(PCI_CONF1_MQ_ADDRESS(bus, devfn, reg), 0xCF8, BUS2QUAD(bus));
+	write_cf8(bus, devfn, reg);
 
 	switch (len) {
 	case 1:
-		outb_quad((u8)value, 0xCFC + (reg & 3), BUS2QUAD(bus));
+		if (xquad_portio)
+			writeb(value, adr + (reg & 3));
+		else
+			outb((u8)value, 0xCFC + (reg & 3));
 		break;
 	case 2:
-		outw_quad((u16)value, 0xCFC + (reg & 2), BUS2QUAD(bus));
+		if (xquad_portio)
+			writew(value, adr + (reg & 2));
+		else
+			outw((u16)value, 0xCFC + (reg & 2));
 		break;
 	case 4:
-		outl_quad((u32)value, 0xCFC, BUS2QUAD(bus));
+		if (xquad_portio)
+			writel(value, adr + reg);
+		else
+			outl((u32)value, 0xCFC);
 		break;
 	}
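All three NUMA-Q config-space accessors above follow one shape: if the per-quad xquad I/O window was mapped (xquad_portio non-NULL), the 0xCF8/0xCFC cycles are issued as MMIO accesses into that window; otherwise they fall back to ordinary port I/O. Condensed into a single helper (an illustrative restatement, not part of the patch):

	/* Illustrative only: the pattern behind the read/write paths above. */
	static u32 conf_read_dword(void __iomem *adr)
	{
		if (xquad_portio)		/* quad window mapped: MMIO */
			return readl(adr);
		return inl(0xCFC);		/* fallback: legacy port I/O */
	}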