diff options
Diffstat (limited to 'arch')
33 files changed, 573 insertions, 190 deletions
diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c index 033066176b3..8dab3527bc9 100644 --- a/arch/i386/kernel/acpi/boot.c +++ b/arch/i386/kernel/acpi/boot.c @@ -215,7 +215,7 @@ static int __init acpi_parse_madt(unsigned long phys_addr, unsigned long size) { struct acpi_table_madt *madt = NULL; - if (!phys_addr || !size) + if (!phys_addr || !size || !cpu_has_apic) return -EINVAL; madt = (struct acpi_table_madt *)__acpi_map_table(phys_addr, size); @@ -751,6 +751,9 @@ static int __init acpi_parse_madt_ioapic_entries(void) return -ENODEV; } + if (!cpu_has_apic) + return -ENODEV; + /* * if "noapic" boot option, don't look for IO-APICs */ diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c index 6273bf74c20..254cee9f0b7 100644 --- a/arch/i386/kernel/apic.c +++ b/arch/i386/kernel/apic.c @@ -62,6 +62,18 @@ int apic_verbosity; static void apic_pm_activate(void); +int modern_apic(void) +{ + unsigned int lvr, version; + /* AMD systems use old APIC versions, so check the CPU */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && + boot_cpu_data.x86 >= 0xf) + return 1; + lvr = apic_read(APIC_LVR); + version = GET_APIC_VERSION(lvr); + return version >= 0x14; +} + /* * 'what should we do if we get a hw irq event on an illegal vector'. * each architecture has to answer this themselves. @@ -119,10 +131,7 @@ void enable_NMI_through_LVT0 (void * dummy) int get_physical_broadcast(void) { - unsigned int lvr, version; - lvr = apic_read(APIC_LVR); - version = GET_APIC_VERSION(lvr); - if (!APIC_INTEGRATED(version) || version >= 0x14) + if (modern_apic()) return 0xff; else return 0xf; @@ -349,9 +358,9 @@ int __init verify_local_APIC(void) void __init sync_Arb_IDs(void) { - /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */ - unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR)); - if (ver >= 0x14) /* P4 or higher */ + /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 + And not needed on AMD */ + if (modern_apic()) return; /* * Wait for idle. diff --git a/arch/i386/kernel/cpu/cpufreq/powernow-k8.c b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c index 712a26bd445..7c0e160a214 100644 --- a/arch/i386/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c @@ -46,7 +46,7 @@ #define PFX "powernow-k8: " #define BFX PFX "BIOS error: " -#define VERSION "version 1.60.1" +#define VERSION "version 1.60.2" #include "powernow-k8.h" /* serialize freq changes */ @@ -55,7 +55,7 @@ static DEFINE_MUTEX(fidvid_mutex); static struct powernow_k8_data *powernow_data[NR_CPUS]; #ifndef CONFIG_SMP -static cpumask_t cpu_core_map[1] = { CPU_MASK_ALL }; +static cpumask_t cpu_core_map[1]; #endif /* Return a frequency in MHz, given an input fid */ @@ -910,6 +910,9 @@ static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsi unsigned int newstate; int ret = -EIO; + if (!data) + return -EINVAL; + /* only run on specific CPU from here on */ oldmask = current->cpus_allowed; set_cpus_allowed(current, cpumask_of_cpu(pol->cpu)); @@ -969,6 +972,9 @@ static int powernowk8_verify(struct cpufreq_policy *pol) { struct powernow_k8_data *data = powernow_data[pol->cpu]; + if (!data) + return -EINVAL; + return cpufreq_frequency_table_verify(pol, data->powernow_table); } @@ -977,7 +983,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) { struct powernow_k8_data *data; cpumask_t oldmask = CPU_MASK_ALL; - int rc, i; + int rc; if (!cpu_online(pol->cpu)) return -ENODEV; @@ -1063,8 +1069,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) printk("cpu_init done, current fid 0x%x, vid 0x%x\n", data->currfid, data->currvid); - for_each_cpu_mask(i, cpu_core_map[pol->cpu]) - powernow_data[i] = data; + powernow_data[pol->cpu] = data; return 0; @@ -1104,6 +1109,9 @@ static unsigned int powernowk8_get (unsigned int cpu) if (!data) return -EINVAL; + if (!data) + return -EINVAL; + set_cpus_allowed(current, cpumask_of_cpu(cpu)); if (smp_processor_id() != cpu) { printk(KERN_ERR PFX "limiting to CPU %d failed in powernowk8_get\n", cpu); diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c index 8d8aa9d1796..db120174aa7 100644 --- a/arch/i386/kernel/mpparse.c +++ b/arch/i386/kernel/mpparse.c @@ -110,21 +110,6 @@ static int __init mpf_checksum(unsigned char *mp, int len) static int mpc_record; static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __initdata; -#ifdef CONFIG_X86_NUMAQ -static int MP_valid_apicid(int apicid, int version) -{ - return hweight_long(apicid & 0xf) == 1 && (apicid >> 4) != 0xf; -} -#else -static int MP_valid_apicid(int apicid, int version) -{ - if (version >= 0x14) - return apicid < 0xff; - else - return apicid < 0xf; -} -#endif - static void __devinit MP_processor_info (struct mpc_config_processor *m) { int ver, apicid; @@ -190,12 +175,6 @@ static void __devinit MP_processor_info (struct mpc_config_processor *m) ver = m->mpc_apicver; - if (!MP_valid_apicid(apicid, ver)) { - printk(KERN_WARNING "Processor #%d INVALID. (Max ID: %d).\n", - m->mpc_apicid, MAX_APICS); - return; - } - /* * Validate version */ diff --git a/arch/i386/kernel/reboot_fixups.c b/arch/i386/kernel/reboot_fixups.c index 10e21a4773d..99aab41a05b 100644 --- a/arch/i386/kernel/reboot_fixups.c +++ b/arch/i386/kernel/reboot_fixups.c @@ -51,7 +51,5 @@ void mach_reboot_fixups(void) cur->reboot_fixup(dev); } - - printk(KERN_WARNING "No reboot fixup found for your hardware\n"); } diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index eacc3f0a2ea..80cb3b2d099 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -963,6 +963,36 @@ efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg) return 0; } + /* + * This function checks if the entire range <start,end> is mapped with type. + * + * Note: this function only works correct if the e820 table is sorted and + * not-overlapping, which is the case + */ +int __init +e820_all_mapped(unsigned long start, unsigned long end, unsigned type) +{ + int i; + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + if (type && ei->type != type) + continue; + /* is the region (part) in overlap with the current region ?*/ + if (ei->addr >= end || ei->addr + ei->size <= start) + continue; + /* if the region is at the beginning of <start,end> we move + * start to the end of the region since it's ok until there + */ + if (ei->addr <= start) + start = ei->addr + ei->size; + /* if start is now at or beyond end, we're done, full + * coverage */ + if (start >= end) + return 1; /* we're done */ + } + return 0; +} + /* * Find the highest page frame number we have available */ @@ -1317,8 +1347,8 @@ legacy_init_iomem_resources(struct resource *code_resource, struct resource *dat /* * Request address space for all standard resources * - * This is called just before pcibios_assign_resources(), which is also - * an fs_initcall, but is linked in later (in arch/i386/pci/i386.c). + * This is called just before pcibios_init(), which is also a + * subsys_initcall, but is linked in later (in arch/i386/pci/common.c). */ static int __init request_standard_resources(void) { @@ -1339,7 +1369,7 @@ static int __init request_standard_resources(void) return 0; } -fs_initcall(request_standard_resources); +subsys_initcall(request_standard_resources); static void __init register_memory(void) { diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index 9f66ac582a8..ae6534ad816 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -651,6 +651,7 @@ void __init mem_init(void) * Specifically, in the case of x86, we will always add * memory to the highmem for now. */ +#ifdef CONFIG_HOTPLUG_MEMORY #ifndef CONFIG_NEED_MULTIPLE_NODES int add_memory(u64 start, u64 size) { @@ -667,6 +668,7 @@ int remove_memory(u64 start, u64 size) return -EINVAL; } #endif +#endif kmem_cache_t *pgd_cache; kmem_cache_t *pmd_cache; diff --git a/arch/i386/pci/direct.c b/arch/i386/pci/direct.c index 99012b93bd1..0659ced0118 100644 --- a/arch/i386/pci/direct.c +++ b/arch/i386/pci/direct.c @@ -4,6 +4,7 @@ #include <linux/pci.h> #include <linux/init.h> +#include <linux/dmi.h> #include "pci.h" /* @@ -18,8 +19,10 @@ int pci_conf1_read(unsigned int seg, unsigned int bus, { unsigned long flags; - if (!value || (bus > 255) || (devfn > 255) || (reg > 255)) + if (!value || (bus > 255) || (devfn > 255) || (reg > 255)) { + *value = -1; return -EINVAL; + } spin_lock_irqsave(&pci_config_lock, flags); @@ -188,6 +191,10 @@ static int __init pci_sanity_check(struct pci_raw_ops *o) if (pci_probe & PCI_NO_CHECKS) return 1; + /* Assume Type 1 works for newer systems. + This handles machines that don't have anything on PCI Bus 0. */ + if (dmi_get_year(DMI_BIOS_DATE) >= 2001) + return 1; for (devfn = 0; devfn < 0x100; devfn++) { if (o->read(0, 0, devfn, PCI_CLASS_DEVICE, 2, &x)) diff --git a/arch/i386/pci/mmconfig.c b/arch/i386/pci/mmconfig.c index 613789071f3..f77d7f8b9bf 100644 --- a/arch/i386/pci/mmconfig.c +++ b/arch/i386/pci/mmconfig.c @@ -12,14 +12,20 @@ #include <linux/pci.h> #include <linux/init.h> #include <linux/acpi.h> +#include <asm/e820.h> #include "pci.h" +#define MMCONFIG_APER_SIZE (256*1024*1024) + +/* Assume systems with more busses have correct MCFG */ +#define MAX_CHECK_BUS 16 + #define mmcfg_virt_addr ((void __iomem *) fix_to_virt(FIX_PCIE_MCFG)) /* The base address of the last MMCONFIG device accessed */ static u32 mmcfg_last_accessed_device; -static DECLARE_BITMAP(fallback_slots, 32); +static DECLARE_BITMAP(fallback_slots, MAX_CHECK_BUS*32); /* * Functions for accessing PCI configuration space with MMCONFIG accesses @@ -29,8 +35,8 @@ static u32 get_base_addr(unsigned int seg, int bus, unsigned devfn) int cfg_num = -1; struct acpi_table_mcfg_config *cfg; - if (seg == 0 && bus == 0 && - test_bit(PCI_SLOT(devfn), fallback_slots)) + if (seg == 0 && bus < MAX_CHECK_BUS && + test_bit(PCI_SLOT(devfn) + 32*bus, fallback_slots)) return 0; while (1) { @@ -74,8 +80,10 @@ static int pci_mmcfg_read(unsigned int seg, unsigned int bus, unsigned long flags; u32 base; - if (!value || (bus > 255) || (devfn > 255) || (reg > 4095)) + if (!value || (bus > 255) || (devfn > 255) || (reg > 4095)) { + *value = -1; return -EINVAL; + } base = get_base_addr(seg, bus, devfn); if (!base) @@ -146,29 +154,34 @@ static struct pci_raw_ops pci_mmcfg = { Normally this can be expressed in the MCFG by not listing them and assigning suitable _SEGs, but this isn't implemented in some BIOS. Instead try to discover all devices on bus 0 that are unreachable using MM - and fallback for them. - We only do this for bus 0/seg 0 */ + and fallback for them. */ static __init void unreachable_devices(void) { - int i; + int i, k; unsigned long flags; - for (i = 0; i < 32; i++) { - u32 val1; - u32 addr; - - pci_conf1_read(0, 0, PCI_DEVFN(i, 0), 0, 4, &val1); - if (val1 == 0xffffffff) - continue; - - /* Locking probably not needed, but safer */ - spin_lock_irqsave(&pci_config_lock, flags); - addr = get_base_addr(0, 0, PCI_DEVFN(i, 0)); - if (addr != 0) - pci_exp_set_dev_base(addr, 0, PCI_DEVFN(i, 0)); - if (addr == 0 || readl((u32 __iomem *)mmcfg_virt_addr) != val1) - set_bit(i, fallback_slots); - spin_unlock_irqrestore(&pci_config_lock, flags); + for (k = 0; k < MAX_CHECK_BUS; k++) { + for (i = 0; i < 32; i++) { + u32 val1; + u32 addr; + + pci_conf1_read(0, k, PCI_DEVFN(i, 0), 0, 4, &val1); + if (val1 == 0xffffffff) + continue; + + /* Locking probably not needed, but safer */ + spin_lock_irqsave(&pci_config_lock, flags); + addr = get_base_addr(0, k, PCI_DEVFN(i, 0)); + if (addr != 0) + pci_exp_set_dev_base(addr, k, PCI_DEVFN(i, 0)); + if (addr == 0 || + readl((u32 __iomem *)mmcfg_virt_addr) != val1) { + set_bit(i, fallback_slots); + printk(KERN_NOTICE + "PCI: No mmconfig possible on %x:%x\n", k, i); + } + spin_unlock_irqrestore(&pci_config_lock, flags); + } } } @@ -183,6 +196,14 @@ void __init pci_mmcfg_init(void) (pci_mmcfg_config[0].base_address == 0)) return; + if (!e820_all_mapped(pci_mmcfg_config[0].base_address, + pci_mmcfg_config[0].base_address + MMCONFIG_APER_SIZE, + E820_RESERVED)) { + printk(KERN_ERR "PCI: BIOS Bug: MCFG area is not E820-reserved\n"); + printk(KERN_ERR "PCI: Not using MMCONFIG.\n"); + return; + } + printk(KERN_INFO "PCI: Using MMCONFIG\n"); raw_pci_ops = &pci_mmcfg; pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig index d1e2fc56648..648047a0bce 100644 --- a/arch/sparc64/Kconfig +++ b/arch/sparc64/Kconfig @@ -194,6 +194,9 @@ endchoice endmenu +config ARCH_SELECT_MEMORY_MODEL + def_bool y + config ARCH_SPARSEMEM_ENABLE def_bool y diff --git a/arch/sparc64/defconfig b/arch/sparc64/defconfig index 30389085a35..1317380fa93 100644 --- a/arch/sparc64/defconfig +++ b/arch/sparc64/defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit # Linux kernel version: 2.6.16 -# Fri Mar 31 01:40:57 2006 +# Sun Apr 2 19:31:04 2006 # CONFIG_SPARC=y CONFIG_SPARC64=y @@ -838,7 +838,6 @@ CONFIG_FB_TILEBLITTING=y # CONFIG_FB_NVIDIA is not set # CONFIG_FB_RIVA is not set # CONFIG_FB_MATROX is not set -# CONFIG_FB_RADEON_OLD is not set CONFIG_FB_RADEON=y CONFIG_FB_RADEON_I2C=y # CONFIG_FB_RADEON_DEBUG is not set @@ -924,6 +923,7 @@ CONFIG_SND_MTPAV=m # PCI devices # # CONFIG_SND_AD1889 is not set +# CONFIG_SND_ALS300 is not set CONFIG_SND_ALI5451=m # CONFIG_SND_ATIIXP is not set # CONFIG_SND_ATIIXP_MODEM is not set @@ -955,6 +955,7 @@ CONFIG_SND_ALI5451=m # CONFIG_SND_MIXART is not set # CONFIG_SND_NM256 is not set # CONFIG_SND_PCXHR is not set +# CONFIG_SND_RIPTIDE is not set # CONFIG_SND_RME32 is not set # CONFIG_SND_RME96 is not set # CONFIG_SND_RME9652 is not set @@ -1109,6 +1110,11 @@ CONFIG_USB_HIDDEV=y # CONFIG_MMC is not set # +# LED devices +# +# CONFIG_NEW_LEDS is not set + +# # InfiniBand support # # CONFIG_INFINIBAND is not set diff --git a/arch/sparc64/kernel/ptrace.c b/arch/sparc64/kernel/ptrace.c index eb93e9c5284..49e6dedd027 100644 --- a/arch/sparc64/kernel/ptrace.c +++ b/arch/sparc64/kernel/ptrace.c @@ -244,6 +244,13 @@ asmlinkage void do_ptrace(struct pt_regs *regs) } switch(request) { + case PTRACE_PEEKUSR: + if (addr != 0) + pt_error_return(regs, EIO); + else + pt_succ_return(regs, 0); + goto out_tsk; + case PTRACE_PEEKTEXT: /* read word at location addr. */ case PTRACE_PEEKDATA: { unsigned long tmp64; @@ -602,6 +609,22 @@ asmlinkage void do_ptrace(struct pt_regs *regs) /* PTRACE_DUMPCORE unsupported... */ + case PTRACE_GETEVENTMSG: { + int err; + + if (test_thread_flag(TIF_32BIT)) + err = put_user(child->ptrace_message, + (unsigned int __user *) data); + else + err = put_user(child->ptrace_message, + (unsigned long __user *) data); + if (err) + pt_error_return(regs, -err); + else + pt_succ_return(regs, 0); + break; + } + default: { int err = ptrace_request(child, request, addr, data); if (err) diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c index 8175a6968c6..eb36f7988ff 100644 --- a/arch/sparc64/kernel/smp.c +++ b/arch/sparc64/kernel/smp.c @@ -745,12 +745,21 @@ struct call_data_struct { int wait; }; -static DEFINE_SPINLOCK(call_lock); +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(call_lock); static struct call_data_struct *call_data; extern unsigned long xcall_call_function; -/* +/** + * smp_call_function(): Run a function on all other CPUs. + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @nonatomic: currently unused. + * @wait: If true, wait (atomically) until function has completed on other CPUs. + * + * Returns 0 on success, else a negative status code. Does not return until + * remote CPUs are nearly ready to execute <<func>> or are or have executed. + * * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. */ @@ -759,7 +768,6 @@ static int smp_call_function_mask(void (*func)(void *info), void *info, { struct call_data_struct data; int cpus; - long timeout; /* Can deadlock when called with interrupts disabled */ WARN_ON(irqs_disabled()); @@ -777,31 +785,18 @@ static int smp_call_function_mask(void (*func)(void *info), void *info, goto out_unlock; call_data = &data; + mb(); smp_cross_call_masked(&xcall_call_function, 0, 0, 0, mask); - /* - * Wait for other cpus to complete function or at - * least snap the call data. - */ - timeout = 1000000; - while (atomic_read(&data.finished) != cpus) { - if (--timeout <= 0) - goto out_timeout; - barrier(); - udelay(1); - } + /* Wait for response */ + while (atomic_read(&data.finished) != cpus) + cpu_relax(); out_unlock: spin_unlock(&call_lock); return 0; - -out_timeout: - spin_unlock(&call_lock); - printk("XCALL: Remote cpus not responding, ncpus=%d finished=%d\n", - cpus, atomic_read(&data.finished)); - return 0; } int smp_call_function(void (*func)(void *info), void *info, diff --git a/arch/sparc64/kernel/traps.c b/arch/sparc64/kernel/traps.c index ff090bb9734..2793a5d8238 100644 --- a/arch/sparc64/kernel/traps.c +++ b/arch/sparc64/kernel/traps.c @@ -1130,9 +1130,9 @@ static void cheetah_log_errors(struct pt_regs *regs, struct cheetah_err_info *in (recoverable ? KERN_WARNING : KERN_CRIT), smp_processor_id(), afsr, afar, (afsr & CHAFSR_TL1) ? 1 : 0); - printk("%s" "ERROR(%d): TPC[%016lx] TNPC[%016lx] TSTATE[%016lx]\n", + printk("%s" "ERROR(%d): TPC[%lx] TNPC[%lx] O7[%lx] TSTATE[%lx]\n", (recoverable ? KERN_WARNING : KERN_CRIT), smp_processor_id(), - regs->tpc, regs->tnpc, regs->tstate); + regs->tpc, regs->tnpc, regs->u_regs[UREG_I7], regs->tstate); printk("%s" "ERROR(%d): M_SYND(%lx), E_SYND(%lx)%s%s\n", (recoverable ? KERN_WARNING : KERN_CRIT), smp_processor_id(), (afsr & CHAFSR_M_SYNDROME) >> CHAFSR_M_SYNDROME_SHIFT, diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index 4310b4a311a..7df2fe1844b 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -136,6 +136,11 @@ config X86_L1_CACHE_SHIFT default "7" if GENERIC_CPU || MPSC default "6" if MK8 +config X86_INTERNODE_CACHE_BYTES + int + default "4096" if X86_VSMP + default X86_L1_CACHE_BYTES if !X86_VSMP + config X86_TSC bool default y diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile index 585fd4a559c..e573e2ab551 100644 --- a/arch/x86_64/Makefile +++ b/arch/x86_64/Makefile @@ -24,37 +24,37 @@ LDFLAGS := -m elf_x86_64 OBJCOPYFLAGS := -O binary -R .note -R .comment -S LDFLAGS_vmlinux := - CHECKFLAGS += -D__x86_64__ -m64 +cflags-y := cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8) cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona) cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic) -CFLAGS += $(cflags-y) -CFLAGS += -m64 -CFLAGS += -mno-red-zone -CFLAGS += -mcmodel=kernel -CFLAGS += -pipe +cflags-y += -m64 +cflags-y += -mno-red-zone +cflags-y += -mcmodel=kernel +cflags-y += -pipe cflags-$(CONFIG_REORDER) += -ffunction-sections # this makes reading assembly source easier, but produces worse code # actually it makes the kernel smaller too. -CFLAGS += -fno-reorder-blocks -CFLAGS += -Wno-sign-compare +cflags-y += -fno-reorder-blocks +cflags-y += -Wno-sign-compare ifneq ($(CONFIG_UNWIND_INFO),y) -CFLAGS += -fno-asynchronous-unwind-tables +cflags-y += -fno-asynchronous-unwind-tables endif ifneq ($(CONFIG_DEBUG_INFO),y) # -fweb shrinks the kernel a bit, but the difference is very small # it also messes up debugging, so don't use it for now. -#CFLAGS += $(call cc-option,-fweb) +#cflags-y += $(call cc-option,-fweb) endif # -funit-at-a-time shrinks the kernel .text considerably # unfortunately it makes reading oopses harder. -CFLAGS += $(call cc-option,-funit-at-a-time) +cflags-y += $(call cc-option,-funit-at-a-time) # prevent gcc from generating any FP code by mistake -CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,) +cflags-y += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,) +CFLAGS += $(cflags-y) AFLAGS += -m64 head-y := arch/x86_64/kernel/head.o arch/x86_64/kernel/head64.o arch/x86_64/kernel/init_task.o diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig index 566ecc97ee5..3c45ec22b3f 100644 --- a/arch/x86_64/defconfig +++ b/arch/x86_64/defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.16-git9 -# Sat Mar 25 15:18:40 2006 +# Linux kernel version: 2.6.17-rc1 +# Mon Apr 3 16:11:14 2006 # CONFIG_X86_64=y CONFIG_64BIT=y @@ -9,6 +9,7 @@ CONFIG_X86=y CONFIG_SEMAPHORE_SLEEPERS=y CONFIG_MMU=y CONFIG_RWSEM_GENERIC_SPINLOCK=y +CONFIG_GENERIC_HWEIGHT=y CONFIG_GENERIC_CALIBRATE_DELAY=y CONFIG_X86_CMPXCHG=y CONFIG_EARLY_PRINTK=y @@ -55,10 +56,6 @@ CONFIG_BASE_FULL=y CONFIG_FUTEX=y CONFIG_EPOLL=y CONFIG_SHMEM=y -CONFIG_CC_ALIGN_FUNCTIONS=0 -CONFIG_CC_ALIGN_LABELS=0 -CONFIG_CC_ALIGN_LOOPS=0 -CONFIG_CC_ALIGN_JUMPS=0 CONFIG_SLAB=y # CONFIG_TINY_SHMEM is not set CONFIG_BASE_SMALL=0 @@ -70,7 +67,6 @@ CONFIG_BASE_SMALL=0 CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y -CONFIG_OBSOLETE_MODPARM=y # CONFIG_MODVERSIONS is not set # CONFIG_MODULE_SRCVERSION_ALL is not set # CONFIG_KMOD is not set @@ -81,6 +77,7 @@ CONFIG_STOP_MACHINE=y # CONFIG_LBD=y # CONFIG_BLK_DEV_IO_TRACE is not set +# CONFIG_LSF is not set # # IO Schedulers @@ -105,6 +102,7 @@ CONFIG_X86_PC=y CONFIG_GENERIC_CPU=y CONFIG_X86_L1_CACHE_BYTES=128 CONFIG_X86_L1_CACHE_SHIFT=7 +CONFIG_X86_INTERNODE_CACHE_BYTES=128 CONFIG_X86_TSC=y CONFIG_X86_GOOD_APIC=y # CONFIG_MICROCODE is not set @@ -116,6 +114,7 @@ CONFIG_X86_LOCAL_APIC=y CONFIG_MTRR=y CONFIG_SMP=y CONFIG_SCHED_SMT=y +CONFIG_SCHED_MC=y # CONFIG_PREEMPT_NONE is not set CONFIG_PREEMPT_VOLUNTARY=y # CONFIG_PREEMPT is not set @@ -138,6 +137,7 @@ CONFIG_NEED_MULTIPLE_NODES=y CONFIG_SPLIT_PTLOCK_CPUS=4 CONFIG_MIGRATION=y CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y +CONFIG_OUT_OF_LINE_PFN_TO_PAGE=y CONFIG_NR_CPUS=32 CONFIG_HOTPLUG_CPU=y CONFIG_HPET_TIMER=y @@ -289,6 +289,7 @@ CONFIG_IP_PNP_DHCP=y # CONFIG_INET_AH is not set # CONFIG_INET_ESP is not set # CONFIG_INET_IPCOMP is not set +# CONFIG_INET_XFRM_TUNNEL is not set # CONFIG_INET_TUNNEL is not set CONFIG_INET_DIAG=y CONFIG_INET_TCP_DIAG=y @@ -300,6 +301,7 @@ CONFIG_IPV6=y # CONFIG_INET6_AH is not set # CONFIG_INET6_ESP is not set # CONFIG_INET6_IPCOMP is not set +# CONFIG_INET6_XFRM_TUNNEL is not set # CONFIG_INET6_TUNNEL is not set # CONFIG_IPV6_TUNNEL is not set # CONFIG_NETFILTER is not set @@ -704,7 +706,6 @@ CONFIG_S2IO=m # Wireless LAN (non-hamradio) # # CONFIG_NET_RADIO is not set -# CONFIG_NET_WIRELESS_RTNETLINK is not set # # Wan interfaces @@ -791,7 +792,7 @@ CONFIG_HW_CONSOLE=y # CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y -# CONFIG_SERIAL_8250_ACPI is not set +CONFIG_SERIAL_8250_PCI=y CONFIG_SERIAL_8250_NR_UARTS=4 CONFIG_SERIAL_8250_RUNTIME_UARTS=4 # CONFIG_SERIAL_8250_EXTENDED is not set @@ -921,6 +922,7 @@ CONFIG_HWMON=y # Digital Video Broadcasting Devices # # CONFIG_DVB is not set +# CONFIG_USB_DABUSB is not set # # Graphics support @@ -932,6 +934,8 @@ CONFIG_VIDEO_SELECT=y # Console display driver support # CONFIG_VGA_CONSOLE=y +CONFIG_VGACON_SOFT_SCROLLBACK=y +CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=256 CONFIG_DUMMY_CONSOLE=y # @@ -1058,15 +1062,6 @@ CONFIG_USB_HIDINPUT=y # CONFIG_USB_MICROTEK is not set # -# USB Multimedia devices -# -# CONFIG_USB_DABUSB is not set - -# -# Video4Linux support is needed for USB Multimedia device support -# - -# # USB Network Adapters # # CONFIG_USB_CATC is not set @@ -1118,9 +1113,15 @@ CONFIG_USB_MON=y # CONFIG_MMC is not set # +# LED devices +# +# CONFIG_NEW_LEDS is not set + +# # InfiniBand support # # CONFIG_INFINIBAND is not set +# CONFIG_IPATH_CORE is not set # # EDAC - error detection and reporting (RAS) (EXPERIMENTAL) @@ -1128,6 +1129,11 @@ CONFIG_USB_MON=y # CONFIG_EDAC is not set # +# Real Time Clock +# +# CONFIG_RTC_CLASS is not set + +# # Firmware Drivers # # CONFIG_EDD is not set diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index 35b2faccdc6..5a980267668 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -15,6 +15,8 @@ #include <asm/vsyscall32.h> #include <linux/linkage.h> +#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8) + .macro IA32_ARG_FIXUP noebp=0 movl %edi,%r8d .if \noebp @@ -109,8 +111,8 @@ ENTRY(ia32_sysenter_target) CFI_REMEMBER_STATE jnz sysenter_tracesys sysenter_do_call: - cmpl $(IA32_NR_syscalls),%eax - jae ia32_badsys + cmpl $(IA32_NR_syscalls-1),%eax + ja ia32_badsys IA32_ARG_FIXUP 1 call *ia32_sys_call_table(,%rax,8) movq %rax,RAX-ARGOFFSET(%rsp) @@ -210,8 +212,8 @@ ENTRY(ia32_cstar_target) CFI_REMEMBER_STATE jnz cstar_tracesys cstar_do_call: - cmpl $IA32_NR_syscalls,%eax - jae ia32_badsys + cmpl $IA32_NR_syscalls-1,%eax + ja ia32_badsys IA32_ARG_FIXUP 1 call *ia32_sys_call_table(,%rax,8) movq %rax,RAX-ARGOFFSET(%rsp) @@ -296,8 +298,8 @@ ENTRY(ia32_syscall) testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) jnz ia32_tracesys ia32_do_syscall: - cmpl $(IA32_NR_syscalls),%eax - jae ia32_badsys + cmpl $(IA32_NR_syscalls-1),%eax + ja ia32_badsys IA32_ARG_FIXUP call *ia32_sys_call_table(,%rax,8) # xxx: rip relative ia32_sysret: @@ -685,12 +687,11 @@ ia32_sys_call_table: .quad sys_readlinkat /* 305 */ .quad sys_fchmodat .quad sys_faccessat - .quad sys_ni_syscall /* pselect6 for now */ - .quad sys_ni_syscall /* ppoll for now */ + .quad quiet_ni_syscall /* pselect6 for now */ + .quad quiet_ni_syscall /* ppoll for now */ .quad sys_unshare /* 310 */ .quad compat_sys_set_robust_list .quad compat_sys_get_robust_list + .quad sys_splice + .quad sys_sync_file_range ia32_syscall_end: - .rept IA32_NR_syscalls-(ia32_syscall_end-ia32_sys_call_table)/8 - .quad ni_syscall - .endr diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c index fffd6b0a2fa..70b9d21ed67 100644 --- a/arch/x86_64/kernel/aperture.c +++ b/arch/x86_64/kernel/aperture.c @@ -80,7 +80,7 @@ static int __init aperture_valid(char *name, u64 aper_base, u32 aper_size) printk("Aperture from %s beyond 4GB. Ignoring.\n",name); return 0; } - if (e820_mapped(aper_base, aper_base + aper_size, E820_RAM)) { + if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) { printk("Aperture from %s pointing to e820 RAM. Ignoring.\n",name); return 0; } diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index 293cd71a266..62776c07cff 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -80,7 +80,12 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size) return 0; } -int __init e820_mapped(unsigned long start, unsigned long end, unsigned type) +/* + * This function checks if any part of the range <start,end> is mapped + * with type. + */ +int __meminit +e820_any_mapped(unsigned long start, unsigned long end, unsigned type) { int i; for (i = 0; i < e820.nr_map; i++) { @@ -94,6 +99,35 @@ int __init e820_mapped(unsigned long start, unsigned long end, unsigned type) return 0; } +/* + * This function checks if the entire range <start,end> is mapped with type. + * + * Note: this function only works correct if the e820 table is sorted and + * not-overlapping, which is the case + */ +int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type) +{ + int i; + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + if (type && ei->type != type) + continue; + /* is the region (part) in overlap with the current region ?*/ + if (ei->addr >= end || ei->addr + ei->size <= start) + continue; + + /* if the region is at the beginning of <start,end> we move + * start to the end of the region since it's ok until there + */ + if (ei->addr <= start) + start = ei->addr + ei->size; + /* if start is now at or beyond end, we're done, full coverage */ + if (start >= end) + return 1; /* we're done */ + } + return 0; +} + /* * Find a free area in a specific range. */ diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index 8538bfea30e..c946e4fe67a 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -180,6 +180,10 @@ rff_trace: * * XXX if we had a free scratch register we could save the RSP into the stack frame * and report it properly in ps. Unfortunately we haven't. + * + * When user can change the frames always force IRET. That is because + * it deals with uncanonical addresses better. SYSRET has trouble + * with them due to bugs in both AMD and Intel CPUs. */ ENTRY(system_call) @@ -254,7 +258,10 @@ sysret_signal: xorl %esi,%esi # oldset -> arg2 call ptregscall_common 1: movl $_TIF_NEED_RESCHED,%edi - jmp sysret_check + /* Use IRET because user could have changed frame. This + works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ + cli + jmp int_with_check badsys: movq $-ENOSYS,RAX-ARGOFFSET(%rsp) @@ -280,7 +287,8 @@ tracesys: call syscall_trace_leave RESTORE_TOP_OF_STACK %rbx RESTORE_REST - jmp ret_from_sys_call + /* Use IRET because user could have changed frame */ + jmp int_ret_from_sys_call CFI_ENDPROC /* @@ -408,25 +416,9 @@ ENTRY(stub_execve) CFI_ADJUST_CFA_OFFSET -8 CFI_REGISTER rip, r11 SAVE_REST - movq %r11, %r15 - CFI_REGISTER rip, r15 FIXUP_TOP_OF_STACK %r11 call sys_execve - GET_THREAD_INFO(%rcx) - bt $TIF_IA32,threadinfo_flags(%rcx) - CFI_REMEMBER_STATE - jc exec_32bit RESTORE_TOP_OF_STACK %r11 - movq %r15, %r11 - CFI_REGISTER rip, r11 - RESTORE_REST - pushq %r11 - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rip, 0 - ret - -exec_32bit: - CFI_RESTORE_STATE movq %rax,RAX(%rsp) RESTORE_REST jmp int_ret_from_sys_call diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index 10b3e348fc9..6f0790e8b6d 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -29,6 +29,8 @@ #define MISC_MCELOG_MINOR 227 #define NR_BANKS 6 +atomic_t mce_entry; + static int mce_dont_init; /* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic, @@ -172,10 +174,12 @@ void do_machine_check(struct pt_regs * regs, long error_code) int i; int panicm_found = 0; + atomic_inc(&mce_entry); + if (regs) notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL); if (!banks) - return; + goto out2; memset(&m, 0, sizeof(struct mce)); m.cpu = safe_smp_processor_id(); @@ -266,6 +270,8 @@ void do_machine_check(struct pt_regs * regs, long error_code) out: /* Last thing done in the machine check exception to clear state. */ wrmsrl(MSR_IA32_MCG_STATUS, 0); + out2: + atomic_dec(&mce_entry); } /* diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index d9e4067faf0..4e6357fe0ec 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -34,6 +34,7 @@ #include <asm/proto.h> #include <asm/kdebug.h> #include <asm/local.h> +#include <asm/mce.h> /* * lapic_nmi_owner tracks the ownership of the lapic NMI hardware: @@ -480,6 +481,12 @@ void __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) __get_cpu_var(nmi_touch) = 0; touched = 1; } +#ifdef CONFIG_X86_MCE + /* Could check oops_in_progress here too, but it's safer + not too */ + if (atomic_read(&mce_entry) > 0) + touched = 1; +#endif if (!touched && __get_cpu_var(last_irq_sum) == sum) { /* * Ayiee, looks like this CPU is stuck ... diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c index 03c9eeedb0f..af035ede70c 100644 --- a/arch/x86_64/kernel/pci-dma.c +++ b/arch/x86_64/kernel/pci-dma.c @@ -48,9 +48,11 @@ dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order) { struct page *page; int node; +#ifdef CONFIG_PCI if (dev->bus == &pci_bus_type) node = pcibus_to_node(to_pci_dev(dev)->bus); else +#endif node = numa_node_id(); page = alloc_pages_node(node, gfp, order); return page ? page_address(page) : NULL; diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index 70dd8e5c688..1c44b53cb15 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -781,10 +781,16 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) } case ARCH_GET_GS: { unsigned long base; + unsigned gsindex; if (task->thread.gsindex == GS_TLS_SEL) base = read_32bit_tls(task, GS_TLS); - else if (doit) - rdmsrl(MSR_KERNEL_GS_BASE, base); + else if (doit) { + asm("movl %%gs,%0" : "=r" (gsindex)); + if (gsindex) + rdmsrl(MSR_KERNEL_GS_BASE, base); + else + base = task->thread.gs; + } else base = task->thread.gs; ret = put_user(base, (unsigned long __user *)addr); diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 0856ad444f9..c50b06765a8 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -353,8 +353,10 @@ static __init void parse_cmdline_early (char ** cmdline_p) if (fullarg(from, "enable_timer_pin_1")) disable_timer_pin_1 = -1; - if (fullarg(from, "nolapic") || fullarg(from, "disableapic")) + if (fullarg(from, "nolapic") || fullarg(from, "disableapic")) { + clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); disable_apic = 1; + } if (fullarg(from, "noapic")) skip_ioapic_setup = 1; diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index ef8bc46dc14..7392570f975 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -726,7 +726,7 @@ static __init int late_hpet_init(void) unsigned int ntimer; if (!vxtime.hpet_address) - return -1; + return 0; memset(&hd, 0, sizeof (hd)); @@ -917,6 +917,8 @@ void __init time_init(void) vxtime.hpet_address = 0; if (hpet_use_timer) { + /* set tick_nsec to use the proper rate for HPET */ + tick_nsec = TICK_NSEC_HPET; cpu_khz = hpet_calibrate_tsc(); timename = "HPET"; #ifdef CONFIG_X86_PM_TIMER diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index 39ff0708f80..b81f473c4a1 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -65,7 +65,7 @@ SECTIONS .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { *(.data.cacheline_aligned) } - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { *(.data.read_mostly) } diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c index d78f46056bd..fec4e521c01 100644 --- a/arch/x86_64/kernel/x8664_ksyms.c +++ b/arch/x86_64/kernel/x8664_ksyms.c @@ -112,7 +112,6 @@ EXPORT_SYMBOL_GPL(unset_nmi_callback); #undef memcpy #undef memset #undef memmove -#undef strlen extern void * memset(void *,int,__kernel_size_t); extern size_t strlen(const char *); @@ -121,7 +120,6 @@ extern void * memcpy(void *,const void *,__kernel_size_t); extern void * __memcpy(void *,const void *,__kernel_size_t); EXPORT_SYMBOL(memset); -EXPORT_SYMBOL(strlen); EXPORT_SYMBOL(strpbrk); EXPORT_SYMBOL(memmove); EXPORT_SYMBOL(memcpy); diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index e5f7f1c3446..4ba34e95d83 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -305,7 +305,7 @@ static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned if (paddr >= end) break; - if (!after_bootmem && !e820_mapped(paddr, paddr+PUD_SIZE, 0)) { + if (!after_bootmem && !e820_any_mapped(paddr, paddr+PUD_SIZE, 0)) { set_pud(pud, __pud(0)); continue; } @@ -507,9 +507,8 @@ void __init clear_kernel_mapping(unsigned long address, unsigned long size) /* * Memory hotplug specific functions - * These are only for non-NUMA machines right now. */ -#ifdef CONFIG_MEMORY_HOTPLUG +#if defined(CONFIG_ACPI_HOTPLUG_MEMORY) || defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE) void online_page(struct page *page) { @@ -520,6 +519,38 @@ void online_page(struct page *page) num_physpages++; } +#ifndef CONFIG_MEMORY_HOTPLUG +/* + * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance, + * just online the pages. + */ +int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages) +{ + int err = -EIO; + unsigned long pfn; + unsigned long total = 0, mem = 0; + for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) { + if (pfn_valid(pfn)) { + online_page(pfn_to_page(pfn)); + err = 0; + mem++; + } + total++; + } + if (!err) { + z->spanned_pages += total; + z->present_pages += mem; + z->zone_pgdat->node_spanned_pages += total; + z->zone_pgdat->node_present_pages += mem; + } + return err; +} +#endif + +/* + * Memory is added always to NORMAL zone. This means you will never get + * additional DMA/DMA32 memory. + */ int add_memory(u64 start, u64 size) { struct pglist_data *pgdat = NODE_DATA(0); diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index 4be82d6e2b4..cc02573a327 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -100,11 +100,30 @@ int early_pfn_to_nid(unsigned long pfn) } #endif +static void * __init +early_node_mem(int nodeid, unsigned long start, unsigned long end, + unsigned long size) +{ + unsigned long mem = find_e820_area(start, end, size); + void *ptr; + if (mem != -1L) + return __va(mem); + ptr = __alloc_bootmem_nopanic(size, + SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)); + if (ptr == 0) { + printk(KERN_ERR "Cannot find %lu bytes in node %d\n", + size, nodeid); + return NULL; + } + return ptr; +} + /* Initialize bootmem allocator for a node */ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) { unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start; unsigned long nodedata_phys; + void *bootmap; const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); start = round_up(start, ZONE_ALIGN); @@ -114,13 +133,11 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en start_pfn = start >> PAGE_SHIFT; end_pfn = end >> PAGE_SHIFT; - nodedata_phys = find_e820_area(start, end, pgdat_size); - if (nodedata_phys == -1L) - panic("Cannot find memory pgdat in node %d\n", nodeid); - - Dprintk("nodedata_phys %lx\n", nodedata_phys); + node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size); + if (node_data[nodeid] == NULL) + return; + nodedata_phys = __pa(node_data[nodeid]); - node_data[nodeid] = phys_to_virt(nodedata_phys); memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; NODE_DATA(nodeid)->node_start_pfn = start_pfn; @@ -129,9 +146,15 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en /* Find a place for the bootmem map */ bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); - bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT); - if (bootmap_start == -1L) - panic("Not enough continuous space for bootmap on node %d", nodeid); + bootmap = early_node_mem(nodeid, bootmap_start, end, + bootmap_pages<<PAGE_SHIFT); + if (bootmap == NULL) { + if (nodedata_phys < start || nodedata_phys >= end) + free_bootmem((unsigned long)node_data[nodeid],pgdat_size); + node_data[nodeid] = NULL; + return; + } + bootmap_start = __pa(bootmap); Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages); bootmap_size = init_bootmem_node(NODE_DATA(nodeid), @@ -142,6 +165,9 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT); +#ifdef CONFIG_ACPI_NUMA + srat_reserve_add_area(nodeid); +#endif node_set_online(nodeid); } @@ -335,6 +361,8 @@ __init int numa_setup(char *opt) #ifdef CONFIG_ACPI_NUMA if (!strncmp(opt,"noacpi",6)) acpi_numa = -1; + if (!strncmp(opt,"hotadd=", 7)) + hotadd_percent = simple_strtoul(opt+7, NULL, 10); #endif return 1; } diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c index 2eb879590dc..15ae9fcd65a 100644 --- a/arch/x86_64/mm/srat.c +++ b/arch/x86_64/mm/srat.c @@ -15,15 +15,26 @@ #include <linux/bitmap.h> #include <linux/module.h> #include <linux/topology.h> +#include <linux/bootmem.h> +#include <linux/mm.h> #include <asm/proto.h> #include <asm/numa.h> #include <asm/e820.h> +#if (defined(CONFIG_ACPI_HOTPLUG_MEMORY) || \ + defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)) \ + && !defined(CONFIG_MEMORY_HOTPLUG) +#define RESERVE_HOTADD 1 +#endif + static struct acpi_table_slit *acpi_slit; static nodemask_t nodes_parsed __initdata; static nodemask_t nodes_found __initdata; static struct bootnode nodes[MAX_NUMNODES] __initdata; +static struct bootnode nodes_add[MAX_NUMNODES] __initdata; +static int found_add_area __initdata; +int hotadd_percent __initdata = 10; static u8 pxm2node[256] = { [0 ... 255] = 0xff }; /* Too small nodes confuse the VM badly. Usually they result @@ -71,6 +82,10 @@ static __init int conflicting_nodes(unsigned long start, unsigned long end) static __init void cutoff_node(int i, unsigned long start, unsigned long end) { struct bootnode *nd = &nodes[i]; + + if (found_add_area) + return; + if (nd->start < start) { nd->start = start; if (nd->end < nd->start) @@ -90,6 +105,8 @@ static __init void bad_srat(void) acpi_numa = -1; for (i = 0; i < MAX_LOCAL_APIC; i++) apicid_to_node[i] = NUMA_NO_NODE; + for (i = 0; i < MAX_NUMNODES; i++) + nodes_add[i].start = nodes[i].end = 0; } static __init inline int srat_disabled(void) @@ -155,11 +172,114 @@ acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa) pxm, pa->apic_id, node); } +#ifdef RESERVE_HOTADD +/* + * Protect against too large hotadd areas that would fill up memory. + */ +static int hotadd_enough_memory(struct bootnode *nd) +{ + static unsigned long allocated; + static unsigned long last_area_end; + unsigned long pages = (nd->end - nd->start) >> PAGE_SHIFT; + long mem = pages * sizeof(struct page); + unsigned long addr; + unsigned long allowed; + unsigned long oldpages = pages; + + if (mem < 0) + return 0; + allowed = (end_pfn - e820_hole_size(0, end_pfn)) * PAGE_SIZE; + allowed = (allowed / 100) * hotadd_percent; + if (allocated + mem > allowed) { + /* Give them at least part of their hotadd memory upto hotadd_percent + It would be better to spread the limit out + over multiple hotplug areas, but that is too complicated + right now */ + if (allocated >= allowed) + return 0; + pages = (allowed - allocated + mem) / sizeof(struct page); + mem = pages * sizeof(struct page); + nd->end = nd->start + pages*PAGE_SIZE; + } + /* Not completely fool proof, but a good sanity check */ + addr = find_e820_area(last_area_end, end_pfn<<PAGE_SHIFT, mem); + if (addr == -1UL) + return 0; + if (pages != oldpages) + printk(KERN_NOTICE "SRAT: Hotadd area limited to %lu bytes\n", + pages << PAGE_SHIFT); + last_area_end = addr + mem; + allocated += mem; + return 1; +} + +/* + * It is fine to add this area to the nodes data it will be used later + * This code supports one contigious hot add area per node. + */ +static int reserve_hotadd(int node, unsigned long start, unsigned long end) +{ + unsigned long s_pfn = start >> PAGE_SHIFT; + unsigned long e_pfn = end >> PAGE_SHIFT; + int changed = 0; + struct bootnode *nd = &nodes_add[node]; + + /* I had some trouble with strange memory hotadd regions breaking + the boot. Be very strict here and reject anything unexpected. + If you want working memory hotadd write correct SRATs. + + The node size check is a basic sanity check to guard against + mistakes */ + if ((signed long)(end - start) < NODE_MIN_SIZE) { + printk(KERN_ERR "SRAT: Hotplug area too small\n"); + return -1; + } + + /* This check might be a bit too strict, but I'm keeping it for now. */ + if (e820_hole_size(s_pfn, e_pfn) != e_pfn - s_pfn) { + printk(KERN_ERR "SRAT: Hotplug area has existing memory\n"); + return -1; + } + + if (!hotadd_enough_memory(&nodes_add[node])) { + printk(KERN_ERR "SRAT: Hotplug area too large\n"); + return -1; + } + + /* Looks good */ + + found_add_area = 1; + if (nd->start == nd->end) { + nd->start = start; + nd->end = end; + changed = 1; + } else { + if (nd->start == end) { + nd->start = start; + changed = 1; + } + if (nd->end == start) { + nd->end = end; + changed = 1; + } + if (!changed) + printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n"); + } + + if ((nd->end >> PAGE_SHIFT) > end_pfn) + end_pfn = nd->end >> PAGE_SHIFT; + + if (changed) + printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end); + return 0; +} +#endif + /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ void __init acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) { - struct bootnode *nd; + struct bootnode *nd, oldnode; unsigned long start, end; int node, pxm; int i; @@ -172,6 +292,8 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) } if (ma->flags.enabled == 0) return; + if (ma->flags.hot_pluggable && hotadd_percent == 0) + return; start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32); end = start + (ma->length_lo | ((u64)ma->length_hi << 32)); pxm = ma->proximity_domain; @@ -181,10 +303,6 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) bad_srat(); return; } - /* It is fine to add this area to the nodes data it will be used later*/ - if (ma->flags.hot_pluggable == 1) - printk(KERN_INFO "SRAT: hot plug zone found %lx - %lx \n", - start, end); i = conflicting_nodes(start, end); if (i == node) { printk(KERN_WARNING @@ -199,6 +317,7 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) return; } nd = &nodes[node]; + oldnode = *nd; if (!node_test_and_set(node, nodes_parsed)) { nd->start = start; nd->end = end; @@ -208,8 +327,19 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) if (nd->end < end) nd->end = end; } + printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm, nd->start, nd->end); + +#ifdef RESERVE_HOTADD + if (ma->flags.hot_pluggable && reserve_hotadd(node, start, end) < 0) { + /* Ignore hotadd region. Undo damage */ + printk(KERN_NOTICE "SRAT: Hotplug region ignored\n"); + *nd = oldnode; + if ((nd->start | nd->end) == 0) + node_clear(node, nodes_parsed); + } +#endif } /* Sanity check to catch more bad SRATs (they are amazingly common). @@ -225,6 +355,9 @@ static int nodes_cover_memory(void) unsigned long e = nodes[i].end >> PAGE_SHIFT; pxmram += e - s; pxmram -= e820_hole_size(s, e); + pxmram -= nodes_add[i].end - nodes_add[i].start; + if ((long)pxmram < 0) + pxmram = 0; } e820ram = end_pfn - e820_hole_size(0, end_pfn); @@ -258,7 +391,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) /* First clean up the node list */ for (i = 0; i < MAX_NUMNODES; i++) { - cutoff_node(i, start, end); + cutoff_node(i, start, end); if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) unparse_node(i); } @@ -282,6 +415,12 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) /* Finally register nodes */ for_each_node_mask(i, nodes_parsed) setup_node_bootmem(i, nodes[i].start, nodes[i].end); + /* Try again in case setup_node_bootmem missed one due + to missing bootmem */ + for_each_node_mask(i, nodes_parsed) + if (!node_online(i)) + setup_node_bootmem(i, nodes[i].start, nodes[i].end); + for (i = 0; i < NR_CPUS; i++) { if (cpu_to_node[i] == NUMA_NO_NODE) continue; @@ -303,6 +442,25 @@ static int node_to_pxm(int n) return 0; } +void __init srat_reserve_add_area(int nodeid) +{ + if (found_add_area && nodes_add[nodeid].end) { + u64 total_mb; + + printk(KERN_INFO "SRAT: Reserving hot-add memory space " + "for node %d at %Lx-%Lx\n", + nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end); + total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start) + >> PAGE_SHIFT; + total_mb *= sizeof(struct page); + total_mb >>= 20; + printk(KERN_INFO "SRAT: This will cost you %Lu MB of " + "pre-allocated memory.\n", (unsigned long long)total_mb); + reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start, + nodes_add[nodeid].end - nodes_add[nodeid].start); + } +} + int __node_distance(int a, int b) { int index; diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c index e616500207e..b493ed977e7 100644 --- a/arch/x86_64/pci/mmconfig.c +++ b/arch/x86_64/pci/mmconfig.c @@ -9,11 +9,16 @@ #include <linux/init.h> #include <linux/acpi.h> #include <linux/bitmap.h> +#include <asm/e820.h> + #include "pci.h" #define MMCONFIG_APER_SIZE (256*1024*1024) +/* Verify the first 16 busses. We assume that systems with more busses + get MCFG right. */ +#define MAX_CHECK_BUS 16 -static DECLARE_BITMAP(fallback_slots, 32); +static DECLARE_BITMAP(fallback_slots, 32*MAX_CHECK_BUS); /* Static virtual mapping of the MMCONFIG aperture */ struct mmcfg_virt { @@ -55,7 +60,8 @@ static char __iomem *get_virt(unsigned int seg, unsigned bus) static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn) { char __iomem *addr; - if (seg == 0 && bus == 0 && test_bit(PCI_SLOT(devfn), fallback_slots)) + if (seg == 0 && bus < MAX_CHECK_BUS && + test_bit(32*bus + PCI_SLOT(devfn), fallback_slots)) return NULL; addr = get_virt(seg, bus); if (!addr) @@ -69,8 +75,10 @@ static int pci_mmcfg_read(unsigned int seg, unsigned int bus, char __iomem *addr; /* Why do we have this when nobody checks it. How about a BUG()!? -AK */ - if (unlikely(!value || (bus > 255) || (devfn > 255) || (reg > 4095))) + if (unlikely(!value || (bus > 255) || (devfn > 255) || (reg > 4095))) { + *value = -1; return -EINVAL; + } addr = pci_dev_base(seg, bus, devfn); if (!addr) @@ -129,21 +137,26 @@ static struct pci_raw_ops pci_mmcfg = { Normally this can be expressed in the MCFG by not listing them and assigning suitable _SEGs, but this isn't implemented in some BIOS. Instead try to discover all devices on bus 0 that are unreachable using MM - and fallback for them. - We only do this for bus 0/seg 0 */ + and fallback for them. */ static __init void unreachable_devices(void) { - int i; - for (i = 0; i < 32; i++) { - u32 val1; - char __iomem *addr; - - pci_conf1_read(0, 0, PCI_DEVFN(i,0), 0, 4, &val1); - if (val1 == 0xffffffff) - continue; - addr = pci_dev_base(0, 0, PCI_DEVFN(i, 0)); - if (addr == NULL|| readl(addr) != val1) { - set_bit(i, fallback_slots); + int i, k; + /* Use the max bus number from ACPI here? */ + for (k = 0; i < MAX_CHECK_BUS; k++) { + for (i = 0; i < 32; i++) { + u32 val1; + char __iomem *addr; + + pci_conf1_read(0, k, PCI_DEVFN(i,0), 0, 4, &val1); + if (val1 == 0xffffffff) + continue; + addr = pci_dev_base(0, k, PCI_DEVFN(i, 0)); + if (addr == NULL|| readl(addr) != val1) { + set_bit(i + 32*k, fallback_slots); + printk(KERN_NOTICE + "PCI: No mmconfig possible on device %x:%x\n", + k, i); + } } } } @@ -161,6 +174,14 @@ void __init pci_mmcfg_init(void) (pci_mmcfg_config[0].base_address == 0)) return; + if (!e820_all_mapped(pci_mmcfg_config[0].base_address, + pci_mmcfg_config[0].base_address + MMCONFIG_APER_SIZE, + E820_RESERVED)) { + printk(KERN_ERR "PCI: BIOS Bug: MCFG area is not E820-reserved\n"); + printk(KERN_ERR "PCI: Not using MMCONFIG.\n"); + return; + } + /* RED-PEN i386 doesn't do _nocache right now */ pci_mmcfg_virt = kmalloc(sizeof(*pci_mmcfg_virt) * pci_mmcfg_config_num, GFP_KERNEL); if (pci_mmcfg_virt == NULL) { |