diff options
Diffstat (limited to 'arch/i386')
79 files changed, 2247 insertions, 847 deletions
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index d4ae5f9ceae..a801d9d4860 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -510,28 +510,7 @@ config SCHED_SMT cost of slightly increased overhead in some places. If unsure say N here. -config PREEMPT - bool "Preemptible Kernel" - help - This option reduces the latency of the kernel when reacting to - real-time or interactive events by allowing a low priority process to - be preempted even if it is in kernel mode executing a system call. - This allows applications to run more reliably even when the system is - under load. - - Say Y here if you are building a kernel for a desktop, embedded - or real-time system. Say N if you are unsure. - -config PREEMPT_BKL - bool "Preempt The Big Kernel Lock" - depends on PREEMPT - default y - help - This option reduces the latency of the kernel by making the - big kernel lock preemptible. - - Say Y here if you are building a kernel for a desktop system. - Say N if you are unsure. +source "kernel/Kconfig.preempt" config X86_UP_APIC bool "Local APIC support on uniprocessors" @@ -963,6 +942,41 @@ config SECCOMP source kernel/Kconfig.hz +config PHYSICAL_START + hex "Physical address where the kernel is loaded" if EMBEDDED + default "0x100000" + help + This gives the physical address where the kernel is loaded. + Primarily used in the case of kexec on panic where the + fail safe kernel needs to run at a different address than + the panic-ed kernel. + + Don't change this unless you know what you are doing. + +config KEXEC + bool "kexec system call (EXPERIMENTAL)" + depends on EXPERIMENTAL + help + kexec is a system call that implements the ability to shutdown your + current kernel, and to start another kernel. It is like a reboot + but it is indepedent of the system firmware. And like a reboot + you can start any kernel with it, not just Linux. + + The name comes from the similiarity to the exec system call. + + It is an ongoing process to be certain the hardware in a machine + is properly shutdown, so do not be surprised if this code does not + initially work for you. It may help to enable device hotplugging + support. As of this writing the exact hardware interface is + strongly in flux, so no good recommendation can be made. + +config CRASH_DUMP + bool "kernel crash dumps (EXPERIMENTAL)" + depends on EMBEDDED + depends on EXPERIMENTAL + depends on HIGHMEM + help + Generate crash dump after being started by kexec. endmenu @@ -1250,6 +1264,15 @@ config SCx200 This support is also available as a module. If compiled as a module, it will be called scx200. +config HOTPLUG_CPU + bool "Support for hot-pluggable CPUs (EXPERIMENTAL)" + depends on SMP && HOTPLUG && EXPERIMENTAL + ---help--- + Say Y here to experiment with turning CPUs off and on. CPUs + can be controlled through /sys/devices/system/cpu. + + Say N. + source "drivers/pcmcia/Kconfig" source "drivers/pci/hotplug/Kconfig" @@ -1262,6 +1285,8 @@ source "fs/Kconfig.binfmt" endmenu +source "net/Kconfig" + source "drivers/Kconfig" source "fs/Kconfig" diff --git a/arch/i386/boot/Makefile b/arch/i386/boot/Makefile index 43cd6220ee4..1e71382d413 100644 --- a/arch/i386/boot/Makefile +++ b/arch/i386/boot/Makefile @@ -25,8 +25,8 @@ SVGA_MODE := -DSVGA_MODE=NORMAL_VGA #RAMDISK := -DRAMDISK=512 -targets := vmlinux.bin bootsect bootsect.o setup setup.o \ - zImage bzImage +targets := vmlinux.bin bootsect bootsect.o \ + setup setup.o zImage bzImage subdir- := compressed hostprogs-y := tools/build diff --git a/arch/i386/boot/compressed/head.S b/arch/i386/boot/compressed/head.S index c5e80b69e7d..b5893e4ecd3 100644 --- a/arch/i386/boot/compressed/head.S +++ b/arch/i386/boot/compressed/head.S @@ -25,6 +25,7 @@ #include <linux/linkage.h> #include <asm/segment.h> +#include <asm/page.h> .globl startup_32 @@ -74,7 +75,7 @@ startup_32: popl %esi # discard address popl %esi # real mode pointer xorl %ebx,%ebx - ljmp $(__BOOT_CS), $0x100000 + ljmp $(__BOOT_CS), $__PHYSICAL_START /* * We come here, if we were loaded high. @@ -99,7 +100,7 @@ startup_32: popl %ecx # lcount popl %edx # high_buffer_start popl %eax # hcount - movl $0x100000,%edi + movl $__PHYSICAL_START,%edi cli # make sure we don't get interrupted ljmp $(__BOOT_CS), $0x1000 # and jump to the move routine @@ -124,5 +125,5 @@ move_routine_start: movsl movl %ebx,%esi # Restore setup pointer xorl %ebx,%ebx - ljmp $(__BOOT_CS), $0x100000 + ljmp $(__BOOT_CS), $__PHYSICAL_START move_routine_end: diff --git a/arch/i386/boot/compressed/misc.c b/arch/i386/boot/compressed/misc.c index cedc55cc47d..82a807f9f5e 100644 --- a/arch/i386/boot/compressed/misc.c +++ b/arch/i386/boot/compressed/misc.c @@ -13,6 +13,7 @@ #include <linux/vmalloc.h> #include <linux/tty.h> #include <asm/io.h> +#include <asm/page.h> /* * gzip declarations @@ -308,7 +309,7 @@ static void setup_normal_output_buffer(void) #else if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < 1024) error("Less than 2MB of memory"); #endif - output_data = (char *)0x100000; /* Points to 1M */ + output_data = (char *)__PHYSICAL_START; /* Normally Points to 1M */ free_mem_end_ptr = (long)real_mode; } @@ -333,8 +334,8 @@ static void setup_output_buffer_if_we_run_high(struct moveparams *mv) low_buffer_size = low_buffer_end - LOW_BUFFER_START; high_loaded = 1; free_mem_end_ptr = (long)high_buffer_start; - if ( (0x100000 + low_buffer_size) > ((ulg)high_buffer_start)) { - high_buffer_start = (uch *)(0x100000 + low_buffer_size); + if ( (__PHYSICAL_START + low_buffer_size) > ((ulg)high_buffer_start)) { + high_buffer_start = (uch *)(__PHYSICAL_START + low_buffer_size); mv->hcount = 0; /* say: we need not to move high_buffer */ } else mv->hcount = -1; @@ -353,7 +354,6 @@ static void close_output_buffer_if_we_run_high(struct moveparams *mv) } } - asmlinkage int decompress_kernel(struct moveparams *mv, void *rmode) { real_mode = rmode; diff --git a/arch/i386/boot/edd.S b/arch/i386/boot/edd.S index 027d6b354ff..d8d69f2b911 100644 --- a/arch/i386/boot/edd.S +++ b/arch/i386/boot/edd.S @@ -6,7 +6,7 @@ * projects 1572D, 1484D, 1386D, 1226DT * disk signature read by Matt Domsch <Matt_Domsch@dell.com> * and Andrew Wilks <Andrew_Wilks@dell.com> September 2003, June 2004 - * legacy CHS retreival by Patrick J. LoPresti <patl@users.sourceforge.net> + * legacy CHS retrieval by Patrick J. LoPresti <patl@users.sourceforge.net> * March 2004 * Command line option parsing, Matt Domsch, November 2004 */ diff --git a/arch/i386/boot/setup.S b/arch/i386/boot/setup.S index caa1fde6904..8cb420f40c5 100644 --- a/arch/i386/boot/setup.S +++ b/arch/i386/boot/setup.S @@ -33,7 +33,7 @@ * Transcribed from Intel (as86) -> AT&T (gas) by Chris Noe, May 1999. * <stiker@northlink.com> * - * Fix to work around buggy BIOSes which dont use carry bit correctly + * Fix to work around buggy BIOSes which don't use carry bit correctly * and/or report extended memory in CX/DX for e801h memory size detection * call. As a result the kernel got wrong figures. The int15/e801h docs * from Ralf Brown interrupt list seem to indicate AX/BX should be used @@ -357,7 +357,7 @@ bail820: meme801: stc # fix to work around buggy - xorw %cx,%cx # BIOSes which dont clear/set + xorw %cx,%cx # BIOSes which don't clear/set xorw %dx,%dx # carry on pass/error of # e801h memory size call # or merely pass cx,dx though @@ -847,7 +847,7 @@ flush_instr: # # but we yet haven't reloaded the CS register, so the default size # of the target offset still is 16 bit. -# However, using an operand prefix (0x66), the CPU will properly +# However, using an operand prefix (0x66), the CPU will properly # take our 48 bit far pointer. (INTeL 80386 Programmer's Reference # Manual, Mixing 16-bit and 32-bit code, page 16-6) diff --git a/arch/i386/boot/tools/build.c b/arch/i386/boot/tools/build.c index 26509b826ae..6835f6d47c3 100644 --- a/arch/i386/boot/tools/build.c +++ b/arch/i386/boot/tools/build.c @@ -1,6 +1,4 @@ /* - * $Id: build.c,v 1.5 1997/05/19 12:29:58 mj Exp $ - * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 1997 Martin Mares */ @@ -8,7 +6,8 @@ /* * This file builds a disk-image from three different files: * - * - bootsect: exactly 512 bytes of 8086 machine code, loads the rest + * - bootsect: compatibility mbr which prints an error message if + * someone tries to boot the kernel directly. * - setup: 8086 machine code, sets up system parm * - system: 80386 code for actual system * @@ -71,7 +70,8 @@ void usage(void) int main(int argc, char ** argv) { - unsigned int i, c, sz, setup_sectors; + unsigned int i, sz, setup_sectors; + int c; u32 sys_size; byte major_root, minor_root; struct stat sb; diff --git a/arch/i386/crypto/aes.c b/arch/i386/crypto/aes.c index 1019430fc1f..88ee85c3b43 100644 --- a/arch/i386/crypto/aes.c +++ b/arch/i386/crypto/aes.c @@ -59,7 +59,7 @@ struct aes_ctx { }; #define WPOLY 0x011b -#define u32_in(x) le32_to_cpu(*(const u32 *)(x)) +#define u32_in(x) le32_to_cpup((const __le32 *)(x)) #define bytes2word(b0, b1, b2, b3) \ (((u32)(b3) << 24) | ((u32)(b2) << 16) | ((u32)(b1) << 8) | (b0)) diff --git a/arch/i386/defconfig b/arch/i386/defconfig index 28e62038379..ca07b95c06b 100644 --- a/arch/i386/defconfig +++ b/arch/i386/defconfig @@ -126,7 +126,6 @@ CONFIG_HAVE_DEC_LOCK=y # CONFIG_PM=y CONFIG_SOFTWARE_SUSPEND=y -# CONFIG_PM_DISK is not set # # ACPI (Advanced Configuration and Power Interface) Support diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index 51ecd512603..4cc83b322b3 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -24,6 +24,7 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups.o +obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o obj-$(CONFIG_X86_NUMAQ) += numaq.o obj-$(CONFIG_X86_SUMMIT_NUMA) += summit.o obj-$(CONFIG_KPROBES) += kprobes.o diff --git a/arch/i386/kernel/acpi/Makefile b/arch/i386/kernel/acpi/Makefile index ee75cb286cf..5e291a20c03 100644 --- a/arch/i386/kernel/acpi/Makefile +++ b/arch/i386/kernel/acpi/Makefile @@ -2,3 +2,7 @@ obj-$(CONFIG_ACPI_BOOT) := boot.o obj-$(CONFIG_X86_IO_APIC) += earlyquirk.o obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup.o +ifneq ($(CONFIG_ACPI_PROCESSOR),) +obj-y += cstate.o +endif + diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c index 848bb97af7c..b7808a89d94 100644 --- a/arch/i386/kernel/acpi/boot.c +++ b/arch/i386/kernel/acpi/boot.c @@ -29,6 +29,7 @@ #include <linux/efi.h> #include <linux/irq.h> #include <linux/module.h> +#include <linux/dmi.h> #include <asm/pgtable.h> #include <asm/io_apic.h> @@ -158,9 +159,15 @@ char *__acpi_map_table(unsigned long phys, unsigned long size) #endif #ifdef CONFIG_PCI_MMCONFIG -static int __init acpi_parse_mcfg(unsigned long phys_addr, unsigned long size) +/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */ +struct acpi_table_mcfg_config *pci_mmcfg_config; +int pci_mmcfg_config_num; + +int __init acpi_parse_mcfg(unsigned long phys_addr, unsigned long size) { struct acpi_table_mcfg *mcfg; + unsigned long i; + int config_size; if (!phys_addr || !size) return -EINVAL; @@ -171,18 +178,38 @@ static int __init acpi_parse_mcfg(unsigned long phys_addr, unsigned long size) return -ENODEV; } - if (mcfg->base_reserved) { - printk(KERN_ERR PREFIX "MMCONFIG not in low 4GB of memory\n"); + /* how many config structures do we have */ + pci_mmcfg_config_num = 0; + i = size - sizeof(struct acpi_table_mcfg); + while (i >= sizeof(struct acpi_table_mcfg_config)) { + ++pci_mmcfg_config_num; + i -= sizeof(struct acpi_table_mcfg_config); + }; + if (pci_mmcfg_config_num == 0) { + printk(KERN_ERR PREFIX "MMCONFIG has no entries\n"); return -ENODEV; } - pci_mmcfg_base_addr = mcfg->base_address; + config_size = pci_mmcfg_config_num * sizeof(*pci_mmcfg_config); + pci_mmcfg_config = kmalloc(config_size, GFP_KERNEL); + if (!pci_mmcfg_config) { + printk(KERN_WARNING PREFIX + "No memory for MCFG config tables\n"); + return -ENOMEM; + } + + memcpy(pci_mmcfg_config, &mcfg->config, config_size); + for (i = 0; i < pci_mmcfg_config_num; ++i) { + if (mcfg->config[i].base_reserved) { + printk(KERN_ERR PREFIX + "MMCONFIG not in low 4GB of memory\n"); + return -ENODEV; + } + } return 0; } -#else -#define acpi_parse_mcfg NULL -#endif /* !CONFIG_PCI_MMCONFIG */ +#endif /* CONFIG_PCI_MMCONFIG */ #ifdef CONFIG_X86_LOCAL_APIC static int __init @@ -506,6 +533,22 @@ acpi_unmap_lsapic(int cpu) EXPORT_SYMBOL(acpi_unmap_lsapic); #endif /* CONFIG_ACPI_HOTPLUG_CPU */ +int +acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base) +{ + /* TBD */ + return -EINVAL; +} +EXPORT_SYMBOL(acpi_register_ioapic); + +int +acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base) +{ + /* TBD */ + return -EINVAL; +} +EXPORT_SYMBOL(acpi_unregister_ioapic); + static unsigned long __init acpi_scan_rsdp ( unsigned long start, @@ -815,6 +858,219 @@ acpi_process_madt(void) return; } +extern int acpi_force; + +#ifdef __i386__ + +#ifdef CONFIG_ACPI_PCI +static int __init disable_acpi_irq(struct dmi_system_id *d) +{ + if (!acpi_force) { + printk(KERN_NOTICE "%s detected: force use of acpi=noirq\n", + d->ident); + acpi_noirq_set(); + } + return 0; +} + +static int __init disable_acpi_pci(struct dmi_system_id *d) +{ + if (!acpi_force) { + printk(KERN_NOTICE "%s detected: force use of pci=noacpi\n", + d->ident); + acpi_disable_pci(); + } + return 0; +} +#endif + +static int __init dmi_disable_acpi(struct dmi_system_id *d) +{ + if (!acpi_force) { + printk(KERN_NOTICE "%s detected: acpi off\n",d->ident); + disable_acpi(); + } else { + printk(KERN_NOTICE + "Warning: DMI blacklist says broken, but acpi forced\n"); + } + return 0; +} + +/* + * Limit ACPI to CPU enumeration for HT + */ +static int __init force_acpi_ht(struct dmi_system_id *d) +{ + if (!acpi_force) { + printk(KERN_NOTICE "%s detected: force use of acpi=ht\n", d->ident); + disable_acpi(); + acpi_ht = 1; + } else { + printk(KERN_NOTICE + "Warning: acpi=force overrules DMI blacklist: acpi=ht\n"); + } + return 0; +} + +/* + * If your system is blacklisted here, but you find that acpi=force + * works for you, please contact acpi-devel@sourceforge.net + */ +static struct dmi_system_id __initdata acpi_dmi_table[] = { + /* + * Boxes that need ACPI disabled + */ + { + .callback = dmi_disable_acpi, + .ident = "IBM Thinkpad", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), + DMI_MATCH(DMI_BOARD_NAME, "2629H1G"), + }, + }, + + /* + * Boxes that need acpi=ht + */ + { + .callback = force_acpi_ht, + .ident = "FSC Primergy T850", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"), + DMI_MATCH(DMI_PRODUCT_NAME, "PRIMERGY T850"), + }, + }, + { + .callback = force_acpi_ht, + .ident = "DELL GX240", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_BOARD_NAME, "OptiPlex GX240"), + }, + }, + { + .callback = force_acpi_ht, + .ident = "HP VISUALIZE NT Workstation", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP VISUALIZE NT Workstation"), + }, + }, + { + .callback = force_acpi_ht, + .ident = "Compaq Workstation W8000", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Compaq"), + DMI_MATCH(DMI_PRODUCT_NAME, "Workstation W8000"), + }, + }, + { + .callback = force_acpi_ht, + .ident = "ASUS P4B266", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), + DMI_MATCH(DMI_BOARD_NAME, "P4B266"), + }, + }, + { + .callback = force_acpi_ht, + .ident = "ASUS P2B-DS", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), + DMI_MATCH(DMI_BOARD_NAME, "P2B-DS"), + }, + }, + { + .callback = force_acpi_ht, + .ident = "ASUS CUR-DLS", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), + DMI_MATCH(DMI_BOARD_NAME, "CUR-DLS"), + }, + }, + { + .callback = force_acpi_ht, + .ident = "ABIT i440BX-W83977", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ABIT <http://www.abit.com>"), + DMI_MATCH(DMI_BOARD_NAME, "i440BX-W83977 (BP6)"), + }, + }, + { + .callback = force_acpi_ht, + .ident = "IBM Bladecenter", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), + DMI_MATCH(DMI_BOARD_NAME, "IBM eServer BladeCenter HS20"), + }, + }, + { + .callback = force_acpi_ht, + .ident = "IBM eServer xSeries 360", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), + DMI_MATCH(DMI_BOARD_NAME, "eServer xSeries 360"), + }, + }, + { + .callback = force_acpi_ht, + .ident = "IBM eserver xSeries 330", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), + DMI_MATCH(DMI_BOARD_NAME, "eserver xSeries 330"), + }, + }, + { + .callback = force_acpi_ht, + .ident = "IBM eserver xSeries 440", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), + DMI_MATCH(DMI_PRODUCT_NAME, "eserver xSeries 440"), + }, + }, + +#ifdef CONFIG_ACPI_PCI + /* + * Boxes that need ACPI PCI IRQ routing disabled + */ + { + .callback = disable_acpi_irq, + .ident = "ASUS A7V", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC"), + DMI_MATCH(DMI_BOARD_NAME, "<A7V>"), + /* newer BIOS, Revision 1011, does work */ + DMI_MATCH(DMI_BIOS_VERSION, "ASUS A7V ACPI BIOS Revision 1007"), + }, + }, + + /* + * Boxes that need ACPI PCI IRQ routing and PCI scan disabled + */ + { /* _BBN 0 bug */ + .callback = disable_acpi_pci, + .ident = "ASUS PR-DLS", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), + DMI_MATCH(DMI_BOARD_NAME, "PR-DLS"), + DMI_MATCH(DMI_BIOS_VERSION, "ASUS PR-DLS ACPI BIOS Revision 1010"), + DMI_MATCH(DMI_BIOS_DATE, "03/21/2003") + }, + }, + { + .callback = disable_acpi_pci, + .ident = "Acer TravelMate 36x Laptop", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Acer"), + DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"), + }, + }, +#endif + { } +}; + +#endif /* __i386__ */ + /* * acpi_boot_table_init() and acpi_boot_init() * called from setup_arch(), always. @@ -843,6 +1099,10 @@ acpi_boot_table_init(void) { int error; +#ifdef __i386__ + dmi_check_system(acpi_dmi_table); +#endif + /* * If acpi_disabled, bail out * One exception: acpi=ht continues far enough to enumerate LAPICs @@ -870,8 +1130,6 @@ acpi_boot_table_init(void) */ error = acpi_blacklisted(); if (error) { - extern int acpi_force; - if (acpi_force) { printk(KERN_WARNING PREFIX "acpi=force override\n"); } else { @@ -907,7 +1165,6 @@ int __init acpi_boot_init(void) acpi_process_madt(); acpi_table_parse(ACPI_HPET, acpi_parse_hpet); - acpi_table_parse(ACPI_MCFG, acpi_parse_mcfg); return 0; } diff --git a/arch/i386/kernel/acpi/cstate.c b/arch/i386/kernel/acpi/cstate.c new file mode 100644 index 00000000000..4c3036ba65d --- /dev/null +++ b/arch/i386/kernel/acpi/cstate.c @@ -0,0 +1,103 @@ +/* + * arch/i386/kernel/acpi/cstate.c + * + * Copyright (C) 2005 Intel Corporation + * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> + * - Added _PDC for SMP C-states on Intel CPUs + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/acpi.h> + +#include <acpi/processor.h> +#include <asm/acpi.h> + +static void acpi_processor_power_init_intel_pdc(struct acpi_processor_power + *pow) +{ + struct acpi_object_list *obj_list; + union acpi_object *obj; + u32 *buf; + + /* allocate and initialize pdc. It will be used later. */ + obj_list = kmalloc(sizeof(struct acpi_object_list), GFP_KERNEL); + if (!obj_list) { + printk(KERN_ERR "Memory allocation error\n"); + return; + } + + obj = kmalloc(sizeof(union acpi_object), GFP_KERNEL); + if (!obj) { + printk(KERN_ERR "Memory allocation error\n"); + kfree(obj_list); + return; + } + + buf = kmalloc(12, GFP_KERNEL); + if (!buf) { + printk(KERN_ERR "Memory allocation error\n"); + kfree(obj); + kfree(obj_list); + return; + } + + buf[0] = ACPI_PDC_REVISION_ID; + buf[1] = 1; + buf[2] = ACPI_PDC_C_CAPABILITY_SMP; + + obj->type = ACPI_TYPE_BUFFER; + obj->buffer.length = 12; + obj->buffer.pointer = (u8 *) buf; + obj_list->count = 1; + obj_list->pointer = obj; + pow->pdc = obj_list; + + return; +} + +/* Initialize _PDC data based on the CPU vendor */ +void acpi_processor_power_init_pdc(struct acpi_processor_power *pow, + unsigned int cpu) +{ + struct cpuinfo_x86 *c = cpu_data + cpu; + + pow->pdc = NULL; + if (c->x86_vendor == X86_VENDOR_INTEL) + acpi_processor_power_init_intel_pdc(pow); + + return; +} + +EXPORT_SYMBOL(acpi_processor_power_init_pdc); + +/* + * Initialize bm_flags based on the CPU cache properties + * On SMP it depends on cache configuration + * - When cache is not shared among all CPUs, we flush cache + * before entering C3. + * - When cache is shared among all CPUs, we use bm_check + * mechanism as in UP case + * + * This routine is called only after all the CPUs are online + */ +void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags, + unsigned int cpu) +{ + struct cpuinfo_x86 *c = cpu_data + cpu; + + flags->bm_check = 0; + if (num_online_cpus() == 1) + flags->bm_check = 1; + else if (c->x86_vendor == X86_VENDOR_INTEL) { + /* + * Today all CPUs that support C3 share cache. + * TBD: This needs to look at cache shared map, once + * multi-core detection patch makes to the base. + */ + flags->bm_check = 1; + } +} + +EXPORT_SYMBOL(acpi_processor_power_init_bm_check); diff --git a/arch/i386/kernel/acpi/sleep.c b/arch/i386/kernel/acpi/sleep.c index 28bb0514bb6..c1af93032ff 100644 --- a/arch/i386/kernel/acpi/sleep.c +++ b/arch/i386/kernel/acpi/sleep.c @@ -7,6 +7,7 @@ #include <linux/acpi.h> #include <linux/bootmem.h> +#include <linux/dmi.h> #include <asm/smp.h> #include <asm/tlbflush.h> @@ -91,3 +92,29 @@ static int __init acpi_sleep_setup(char *str) __setup("acpi_sleep=", acpi_sleep_setup); + + +static __init int reset_videomode_after_s3(struct dmi_system_id *d) +{ + acpi_video_flags |= 2; + return 0; +} + +static __initdata struct dmi_system_id acpisleep_dmi_table[] = { + { /* Reset video mode after returning from ACPI S3 sleep */ + .callback = reset_videomode_after_s3, + .ident = "Toshiba Satellite 4030cdt", + .matches = { + DMI_MATCH(DMI_PRODUCT_NAME, "S4030CDT/4.3"), + }, + }, + { } +}; + +static int __init acpisleep_dmi_init(void) +{ + dmi_check_system(acpisleep_dmi_table); + return 0; +} + +core_initcall(acpisleep_dmi_init); diff --git a/arch/i386/kernel/acpi/wakeup.S b/arch/i386/kernel/acpi/wakeup.S index 39d32484f6f..44d886c745e 100644 --- a/arch/i386/kernel/acpi/wakeup.S +++ b/arch/i386/kernel/acpi/wakeup.S @@ -74,8 +74,9 @@ wakeup_code: movw %ax,%fs movw $0x0e00 + 'i', %fs:(0x12) - # need a gdt - lgdt real_save_gdt - wakeup_code + # need a gdt -- use lgdtl to force 32-bit operands, in case + # the GDT is located past 16 megabytes. + lgdtl real_save_gdt - wakeup_code movl real_save_cr0 - wakeup_code, %eax movl %eax, %cr0 diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c index 8d993fa7175..bd1dbf3bd22 100644 --- a/arch/i386/kernel/apic.c +++ b/arch/i386/kernel/apic.c @@ -26,6 +26,7 @@ #include <linux/mc146818rtc.h> #include <linux/kernel_stat.h> #include <linux/sysdev.h> +#include <linux/cpu.h> #include <asm/atomic.h> #include <asm/smp.h> @@ -34,12 +35,18 @@ #include <asm/desc.h> #include <asm/arch_hooks.h> #include <asm/hpet.h> +#include <asm/i8253.h> #include <mach_apic.h> #include "io_ports.h" /* + * Knob to control our willingness to enable the local APIC. + */ +int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */ + +/* * Debug level */ int apic_verbosity; @@ -205,7 +212,7 @@ void __init connect_bsp_APIC(void) enable_apic_mode(); } -void disconnect_bsp_APIC(void) +void disconnect_bsp_APIC(int virt_wire_setup) { if (pic_mode) { /* @@ -219,6 +226,42 @@ void disconnect_bsp_APIC(void) outb(0x70, 0x22); outb(0x00, 0x23); } + else { + /* Go back to Virtual Wire compatibility mode */ + unsigned long value; + + /* For the spurious interrupt use vector F, and enable it */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + value |= 0xf; + apic_write_around(APIC_SPIV, value); + + if (!virt_wire_setup) { + /* For LVT0 make it edge triggered, active high, external and enabled */ + value = apic_read(APIC_LVT0); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); + apic_write_around(APIC_LVT0, value); + } + else { + /* Disable LVT0 */ + apic_write_around(APIC_LVT0, APIC_LVT_MASKED); + } + + /* For LVT1 make it edge triggered, active high, nmi and enabled */ + value = apic_read(APIC_LVT1); + value &= ~( + APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); + apic_write_around(APIC_LVT1, value); + } } void disable_local_APIC(void) @@ -363,7 +406,7 @@ void __init init_bsp_APIC(void) apic_write_around(APIC_LVT1, value); } -void __init setup_local_APIC (void) +void __devinit setup_local_APIC(void) { unsigned long oldvalue, value, ver, maxlvt; @@ -634,7 +677,7 @@ static struct sys_device device_lapic = { .cls = &lapic_sysclass, }; -static void __init apic_pm_activate(void) +static void __devinit apic_pm_activate(void) { apic_pm_state.active = 1; } @@ -665,26 +708,6 @@ static void apic_pm_activate(void) { } * Original code written by Keir Fraser. */ -/* - * Knob to control our willingness to enable the local APIC. - */ -int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */ - -static int __init lapic_disable(char *str) -{ - enable_local_apic = -1; - clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); - return 0; -} -__setup("nolapic", lapic_disable); - -static int __init lapic_enable(char *str) -{ - enable_local_apic = 1; - return 0; -} -__setup("lapic", lapic_enable); - static int __init apic_set_verbosity(char *str) { if (strcmp("debug", str) == 0) @@ -855,9 +878,8 @@ fake_ioapic_page: * but we do not accept timer interrupts yet. We only allow the BP * to calibrate. */ -static unsigned int __init get_8254_timer_count(void) +static unsigned int __devinit get_8254_timer_count(void) { - extern spinlock_t i8253_lock; unsigned long flags; unsigned int count; @@ -874,7 +896,7 @@ static unsigned int __init get_8254_timer_count(void) } /* next tick in 8254 can be caught by catching timer wraparound */ -static void __init wait_8254_wraparound(void) +static void __devinit wait_8254_wraparound(void) { unsigned int curr_count, prev_count; @@ -894,7 +916,7 @@ static void __init wait_8254_wraparound(void) * Default initialization for 8254 timers. If we use other timers like HPET, * we override this later */ -void (*wait_timer_tick)(void) __initdata = wait_8254_wraparound; +void (*wait_timer_tick)(void) __devinitdata = wait_8254_wraparound; /* * This function sets up the local APIC timer, with a timeout of @@ -930,7 +952,7 @@ static void __setup_APIC_LVTT(unsigned int clocks) apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); } -static void __init setup_APIC_timer(unsigned int clocks) +static void __devinit setup_APIC_timer(unsigned int clocks) { unsigned long flags; @@ -1043,12 +1065,12 @@ void __init setup_boot_APIC_clock(void) local_irq_enable(); } -void __init setup_secondary_APIC_clock(void) +void __devinit setup_secondary_APIC_clock(void) { setup_APIC_timer(calibration_result); } -void __init disable_APIC_timer(void) +void __devinit disable_APIC_timer(void) { if (using_apic_timer) { unsigned long v; diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c index 0ff65abcd56..064211d5f41 100644 --- a/arch/i386/kernel/apm.c +++ b/arch/i386/kernel/apm.c @@ -228,10 +228,10 @@ #include <asm/system.h> #include <asm/uaccess.h> #include <asm/desc.h> +#include <asm/i8253.h> #include "io_ports.h" -extern spinlock_t i8253_lock; extern unsigned long get_cmos_time(void); extern void machine_real_restart(unsigned char *, int); @@ -346,10 +346,10 @@ extern int (*console_blank_hook)(int); struct apm_user { int magic; struct apm_user * next; - int suser: 1; - int writer: 1; - int reader: 1; - int suspend_wait: 1; + unsigned int suser: 1; + unsigned int writer: 1; + unsigned int reader: 1; + unsigned int suspend_wait: 1; int suspend_result; int suspends_pending; int standbys_pending; @@ -1168,8 +1168,7 @@ static void get_time_diff(void) static void reinit_timer(void) { #ifdef INIT_TIMER_AFTER_SUSPEND - unsigned long flags; - extern spinlock_t i8253_lock; + unsigned long flags; spin_lock_irqsave(&i8253_lock, flags); /* set the clock to 100 Hz */ diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index b9954248d0a..4553ffd94b1 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -24,9 +24,9 @@ EXPORT_PER_CPU_SYMBOL(cpu_gdt_table); DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]); EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack); -static int cachesize_override __initdata = -1; -static int disable_x86_fxsr __initdata = 0; -static int disable_x86_serial_nr __initdata = 1; +static int cachesize_override __devinitdata = -1; +static int disable_x86_fxsr __devinitdata = 0; +static int disable_x86_serial_nr __devinitdata = 1; struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {}; @@ -59,7 +59,7 @@ static int __init cachesize_setup(char *str) } __setup("cachesize=", cachesize_setup); -int __init get_model_name(struct cpuinfo_x86 *c) +int __devinit get_model_name(struct cpuinfo_x86 *c) { unsigned int *v; char *p, *q; @@ -89,7 +89,7 @@ int __init get_model_name(struct cpuinfo_x86 *c) } -void __init display_cacheinfo(struct cpuinfo_x86 *c) +void __devinit display_cacheinfo(struct cpuinfo_x86 *c) { unsigned int n, dummy, ecx, edx, l2size; @@ -130,7 +130,7 @@ void __init display_cacheinfo(struct cpuinfo_x86 *c) /* in particular, if CPUID levels 0x80000002..4 are supported, this isn't used */ /* Look up CPU names by table lookup. */ -static char __init *table_lookup_model(struct cpuinfo_x86 *c) +static char __devinit *table_lookup_model(struct cpuinfo_x86 *c) { struct cpu_model_info *info; @@ -151,7 +151,7 @@ static char __init *table_lookup_model(struct cpuinfo_x86 *c) } -void __init get_cpu_vendor(struct cpuinfo_x86 *c, int early) +void __devinit get_cpu_vendor(struct cpuinfo_x86 *c, int early) { char *v = c->x86_vendor_id; int i; @@ -202,7 +202,7 @@ static inline int flag_is_changeable_p(u32 flag) /* Probe for the CPUID instruction */ -static int __init have_cpuid_p(void) +static int __devinit have_cpuid_p(void) { return flag_is_changeable_p(X86_EFLAGS_ID); } @@ -249,7 +249,7 @@ static void __init early_cpu_detect(void) #endif } -void __init generic_identify(struct cpuinfo_x86 * c) +void __devinit generic_identify(struct cpuinfo_x86 * c) { u32 tfms, xlvl; int junk; @@ -296,7 +296,7 @@ void __init generic_identify(struct cpuinfo_x86 * c) } } -static void __init squash_the_stupid_serial_number(struct cpuinfo_x86 *c) +static void __devinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) { if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr ) { /* Disable processor serial number */ @@ -324,7 +324,7 @@ __setup("serialnumber", x86_serial_nr_setup); /* * This does the hard work of actually picking apart the CPU stuff... */ -void __init identify_cpu(struct cpuinfo_x86 *c) +void __devinit identify_cpu(struct cpuinfo_x86 *c) { int i; @@ -432,10 +432,18 @@ void __init identify_cpu(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_MCE mcheck_init(c); #endif + if (c == &boot_cpu_data) + sysenter_setup(); + enable_sep_cpu(); + + if (c == &boot_cpu_data) + mtrr_bp_init(); + else + mtrr_ap_init(); } #ifdef CONFIG_X86_HT -void __init detect_ht(struct cpuinfo_x86 *c) +void __devinit detect_ht(struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; int index_msb, tmp; @@ -490,7 +498,7 @@ void __init detect_ht(struct cpuinfo_x86 *c) } #endif -void __init print_cpu_info(struct cpuinfo_x86 *c) +void __devinit print_cpu_info(struct cpuinfo_x86 *c) { char *vendor = NULL; @@ -513,7 +521,7 @@ void __init print_cpu_info(struct cpuinfo_x86 *c) printk("\n"); } -cpumask_t cpu_initialized __initdata = CPU_MASK_NONE; +cpumask_t cpu_initialized __devinitdata = CPU_MASK_NONE; /* This is hacky. :) * We're emulating future behavior. @@ -560,7 +568,7 @@ void __init early_cpu_init(void) * and IDT. We reload them nevertheless, this function acts as a * 'CPU state barrier', nothing should get across. */ -void __init cpu_init (void) +void __devinit cpu_init(void) { int cpu = smp_processor_id(); struct tss_struct * t = &per_cpu(init_tss, cpu); @@ -648,3 +656,15 @@ void __init cpu_init (void) clear_used_math(); mxcsr_feature_mask_init(); } + +#ifdef CONFIG_HOTPLUG_CPU +void __devinit cpu_uninit(void) +{ + int cpu = raw_smp_processor_id(); + cpu_clear(cpu, cpu_initialized); + + /* lazy TLB state */ + per_cpu(cpu_tlbstate, cpu).state = 0; + per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm; +} +#endif diff --git a/arch/i386/kernel/cpu/cpufreq/gx-suspmod.c b/arch/i386/kernel/cpu/cpufreq/gx-suspmod.c index 1a49adb1f4a..e86ea486c31 100644 --- a/arch/i386/kernel/cpu/cpufreq/gx-suspmod.c +++ b/arch/i386/kernel/cpu/cpufreq/gx-suspmod.c @@ -190,7 +190,7 @@ static __init struct pci_dev *gx_detect_chipset(void) /* detect which companion chip is used */ while ((gx_pci = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, gx_pci)) != NULL) { - if ((pci_match_device (gx_chipset_tbl, gx_pci)) != NULL) { + if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL) { return gx_pci; } } diff --git a/arch/i386/kernel/cpu/cpufreq/powernow-k7.c b/arch/i386/kernel/cpu/cpufreq/powernow-k7.c index 5c530064eb7..73a5dc5b26b 100644 --- a/arch/i386/kernel/cpu/cpufreq/powernow-k7.c +++ b/arch/i386/kernel/cpu/cpufreq/powernow-k7.c @@ -648,9 +648,7 @@ static int powernow_cpu_exit (struct cpufreq_policy *policy) { } #endif - if (powernow_table) - kfree(powernow_table); - + kfree(powernow_table); return 0; } diff --git a/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c index 7dcbf70fc16..327a55d4d1c 100644 --- a/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c @@ -375,7 +375,7 @@ static int centrino_cpu_init_acpi(struct cpufreq_policy *policy) arg0.buffer.pointer = (u8 *) arg0_buf; arg0_buf[0] = ACPI_PDC_REVISION_ID; arg0_buf[1] = 1; - arg0_buf[2] = ACPI_PDC_EST_CAPABILITY_SMP | ACPI_PDC_EST_CAPABILITY_MSR; + arg0_buf[2] = ACPI_PDC_EST_CAPABILITY_SMP_MSR; p.pdc = &arg_list; diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c index 121aa2176e6..a2c33c1a46c 100644 --- a/arch/i386/kernel/cpu/intel.c +++ b/arch/i386/kernel/cpu/intel.c @@ -25,10 +25,10 @@ extern int trap_init_f00f_bug(void); /* * Alignment at which movsl is preferred for bulk memory copies. */ -struct movsl_mask movsl_mask; +struct movsl_mask movsl_mask __read_mostly; #endif -void __init early_intel_workaround(struct cpuinfo_x86 *c) +void __devinit early_intel_workaround(struct cpuinfo_x86 *c) { if (c->x86_vendor != X86_VENDOR_INTEL) return; @@ -43,7 +43,7 @@ void __init early_intel_workaround(struct cpuinfo_x86 *c) * This is called before we do cpu ident work */ -int __init ppro_with_ram_bug(void) +int __devinit ppro_with_ram_bug(void) { /* Uses data from early_cpu_detect now */ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && @@ -61,7 +61,7 @@ int __init ppro_with_ram_bug(void) * P4 Xeon errata 037 workaround. * Hardware prefetcher may cause stale data to be loaded into the cache. */ -static void __init Intel_errata_workarounds(struct cpuinfo_x86 *c) +static void __devinit Intel_errata_workarounds(struct cpuinfo_x86 *c) { unsigned long lo, hi; @@ -80,7 +80,7 @@ static void __init Intel_errata_workarounds(struct cpuinfo_x86 *c) /* * find out the number of processor cores on the die */ -static int __init num_cpu_cores(struct cpuinfo_x86 *c) +static int __devinit num_cpu_cores(struct cpuinfo_x86 *c) { unsigned int eax; @@ -98,7 +98,7 @@ static int __init num_cpu_cores(struct cpuinfo_x86 *c) return 1; } -static void __init init_intel(struct cpuinfo_x86 *c) +static void __devinit init_intel(struct cpuinfo_x86 *c) { unsigned int l2 = 0; char *p = NULL; @@ -204,7 +204,7 @@ static unsigned int intel_size_cache(struct cpuinfo_x86 * c, unsigned int size) return size; } -static struct cpu_dev intel_cpu_dev __initdata = { +static struct cpu_dev intel_cpu_dev __devinitdata = { .c_vendor = "Intel", .c_ident = { "GenuineIntel" }, .c_models = { diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c index a710dc4eb20..1d768b26326 100644 --- a/arch/i386/kernel/cpu/intel_cacheinfo.c +++ b/arch/i386/kernel/cpu/intel_cacheinfo.c @@ -28,7 +28,7 @@ struct _cache_table }; /* all the cache descriptor types we care about (no TLB or trace cache entries) */ -static struct _cache_table cache_table[] __initdata = +static struct _cache_table cache_table[] __devinitdata = { { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */ { 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */ @@ -160,7 +160,7 @@ static int __init find_num_cache_leaves(void) return retval; } -unsigned int __init init_intel_cacheinfo(struct cpuinfo_x86 *c) +unsigned int __devinit init_intel_cacheinfo(struct cpuinfo_x86 *c) { unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; /* Cache sizes */ unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ diff --git a/arch/i386/kernel/cpu/mcheck/k7.c b/arch/i386/kernel/cpu/mcheck/k7.c index 8df52e86c4d..c4abe765739 100644 --- a/arch/i386/kernel/cpu/mcheck/k7.c +++ b/arch/i386/kernel/cpu/mcheck/k7.c @@ -69,7 +69,7 @@ static fastcall void k7_machine_check(struct pt_regs * regs, long error_code) /* AMD K7 machine check is Intel like */ -void __init amd_mcheck_init(struct cpuinfo_x86 *c) +void __devinit amd_mcheck_init(struct cpuinfo_x86 *c) { u32 l, h; int i; diff --git a/arch/i386/kernel/cpu/mcheck/mce.c b/arch/i386/kernel/cpu/mcheck/mce.c index bf6d1aefafc..2cf25d2ba0f 100644 --- a/arch/i386/kernel/cpu/mcheck/mce.c +++ b/arch/i386/kernel/cpu/mcheck/mce.c @@ -16,7 +16,7 @@ #include "mce.h" -int mce_disabled __initdata = 0; +int mce_disabled __devinitdata = 0; int nr_mce_banks; EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ @@ -31,7 +31,7 @@ static fastcall void unexpected_machine_check(struct pt_regs * regs, long error_ void fastcall (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check; /* This has to be run for each processor */ -void __init mcheck_init(struct cpuinfo_x86 *c) +void __devinit mcheck_init(struct cpuinfo_x86 *c) { if (mce_disabled==1) return; diff --git a/arch/i386/kernel/cpu/mcheck/p4.c b/arch/i386/kernel/cpu/mcheck/p4.c index 8b16ceb929b..0abccb6fdf9 100644 --- a/arch/i386/kernel/cpu/mcheck/p4.c +++ b/arch/i386/kernel/cpu/mcheck/p4.c @@ -78,7 +78,7 @@ fastcall void smp_thermal_interrupt(struct pt_regs *regs) } /* P4/Xeon Thermal regulation detect and init */ -static void __init intel_init_thermal(struct cpuinfo_x86 *c) +static void __devinit intel_init_thermal(struct cpuinfo_x86 *c) { u32 l, h; unsigned int cpu = smp_processor_id(); @@ -232,7 +232,7 @@ static fastcall void intel_machine_check(struct pt_regs * regs, long error_code) } -void __init intel_p4_mcheck_init(struct cpuinfo_x86 *c) +void __devinit intel_p4_mcheck_init(struct cpuinfo_x86 *c) { u32 l, h; int i; diff --git a/arch/i386/kernel/cpu/mcheck/p5.c b/arch/i386/kernel/cpu/mcheck/p5.c index c45a1b485c8..ec0614cd292 100644 --- a/arch/i386/kernel/cpu/mcheck/p5.c +++ b/arch/i386/kernel/cpu/mcheck/p5.c @@ -29,7 +29,7 @@ static fastcall void pentium_machine_check(struct pt_regs * regs, long error_cod } /* Set up machine check reporting for processors with Intel style MCE */ -void __init intel_p5_mcheck_init(struct cpuinfo_x86 *c) +void __devinit intel_p5_mcheck_init(struct cpuinfo_x86 *c) { u32 l, h; diff --git a/arch/i386/kernel/cpu/mcheck/p6.c b/arch/i386/kernel/cpu/mcheck/p6.c index 46640f8c249..f01b73f947e 100644 --- a/arch/i386/kernel/cpu/mcheck/p6.c +++ b/arch/i386/kernel/cpu/mcheck/p6.c @@ -80,7 +80,7 @@ static fastcall void intel_machine_check(struct pt_regs * regs, long error_code) } /* Set up machine check reporting for processors with Intel style MCE */ -void __init intel_p6_mcheck_init(struct cpuinfo_x86 *c) +void __devinit intel_p6_mcheck_init(struct cpuinfo_x86 *c) { u32 l, h; int i; diff --git a/arch/i386/kernel/cpu/mcheck/winchip.c b/arch/i386/kernel/cpu/mcheck/winchip.c index 753fa7acb98..7bae68fa168 100644 --- a/arch/i386/kernel/cpu/mcheck/winchip.c +++ b/arch/i386/kernel/cpu/mcheck/winchip.c @@ -23,7 +23,7 @@ static fastcall void winchip_machine_check(struct pt_regs * regs, long error_cod } /* Set up machine check reporting on the Winchip C6 series */ -void __init winchip_mcheck_init(struct cpuinfo_x86 *c) +void __devinit winchip_mcheck_init(struct cpuinfo_x86 *c) { u32 lo, hi; machine_check_vector = winchip_machine_check; diff --git a/arch/i386/kernel/cpu/mtrr/generic.c b/arch/i386/kernel/cpu/mtrr/generic.c index f468a979e9a..169ac8e0db6 100644 --- a/arch/i386/kernel/cpu/mtrr/generic.c +++ b/arch/i386/kernel/cpu/mtrr/generic.c @@ -67,14 +67,6 @@ void __init get_mtrr_state(void) mtrr_state.enabled = (lo & 0xc00) >> 10; } -/* Free resources associated with a struct mtrr_state */ -void __init finalize_mtrr_state(void) -{ - if (mtrr_state.var_ranges) - kfree(mtrr_state.var_ranges); - mtrr_state.var_ranges = NULL; -} - /* Some BIOS's are fucked and don't set all MTRRs the same! */ void __init mtrr_state_warn(void) { @@ -335,6 +327,9 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base, */ { unsigned long flags; + struct mtrr_var_range *vr; + + vr = &mtrr_state.var_ranges[reg]; local_irq_save(flags); prepare_set(); @@ -343,11 +338,15 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base, /* The invalid bit is kept in the mask, so we simply clear the relevant mask register to disable a range. */ mtrr_wrmsr(MTRRphysMask_MSR(reg), 0, 0); + memset(vr, 0, sizeof(struct mtrr_var_range)); } else { - mtrr_wrmsr(MTRRphysBase_MSR(reg), base << PAGE_SHIFT | type, - (base & size_and_mask) >> (32 - PAGE_SHIFT)); - mtrr_wrmsr(MTRRphysMask_MSR(reg), -size << PAGE_SHIFT | 0x800, - (-size & size_and_mask) >> (32 - PAGE_SHIFT)); + vr->base_lo = base << PAGE_SHIFT | type; + vr->base_hi = (base & size_and_mask) >> (32 - PAGE_SHIFT); + vr->mask_lo = -size << PAGE_SHIFT | 0x800; + vr->mask_hi = (-size & size_and_mask) >> (32 - PAGE_SHIFT); + + mtrr_wrmsr(MTRRphysBase_MSR(reg), vr->base_lo, vr->base_hi); + mtrr_wrmsr(MTRRphysMask_MSR(reg), vr->mask_lo, vr->mask_hi); } post_set(); diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c index d66b09e0c82..764cac64e21 100644 --- a/arch/i386/kernel/cpu/mtrr/main.c +++ b/arch/i386/kernel/cpu/mtrr/main.c @@ -332,6 +332,8 @@ int mtrr_add_page(unsigned long base, unsigned long size, error = -EINVAL; + /* No CPU hotplug when we change MTRR entries */ + lock_cpu_hotplug(); /* Search for existing MTRR */ down(&main_lock); for (i = 0; i < num_var_ranges; ++i) { @@ -372,6 +374,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, error = i; out: up(&main_lock); + unlock_cpu_hotplug(); return error; } @@ -461,6 +464,8 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) return -ENXIO; max = num_var_ranges; + /* No CPU hotplug when we change MTRR entries */ + lock_cpu_hotplug(); down(&main_lock); if (reg < 0) { /* Search for existing MTRR */ @@ -501,6 +506,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) error = reg; out: up(&main_lock); + unlock_cpu_hotplug(); return error; } /** @@ -544,21 +550,9 @@ static void __init init_ifs(void) centaur_init_mtrr(); } -static void __init init_other_cpus(void) -{ - if (use_intel()) - get_mtrr_state(); - - /* bring up the other processors */ - set_mtrr(~0U,0,0,0); - - if (use_intel()) { - finalize_mtrr_state(); - mtrr_state_warn(); - } -} - - +/* The suspend/resume methods are only for CPU without MTRR. CPU using generic + * MTRR driver doesn't require this + */ struct mtrr_value { mtrr_type ltype; unsigned long lbase; @@ -611,13 +605,13 @@ static struct sysdev_driver mtrr_sysdev_driver = { /** - * mtrr_init - initialize mtrrs on the boot CPU + * mtrr_bp_init - initialize mtrrs on the boot CPU * * This needs to be called early; before any of the other CPUs are * initialized (i.e. before smp_init()). * */ -static int __init mtrr_init(void) +void __init mtrr_bp_init(void) { init_ifs(); @@ -674,12 +668,48 @@ static int __init mtrr_init(void) if (mtrr_if) { set_num_var_ranges(); init_table(); - init_other_cpus(); - - return sysdev_driver_register(&cpu_sysdev_class, - &mtrr_sysdev_driver); + if (use_intel()) + get_mtrr_state(); } - return -ENXIO; } -subsys_initcall(mtrr_init); +void mtrr_ap_init(void) +{ + unsigned long flags; + + if (!mtrr_if || !use_intel()) + return; + /* + * Ideally we should hold main_lock here to avoid mtrr entries changed, + * but this routine will be called in cpu boot time, holding the lock + * breaks it. This routine is called in two cases: 1.very earily time + * of software resume, when there absolutely isn't mtrr entry changes; + * 2.cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug lock to + * prevent mtrr entry changes + */ + local_irq_save(flags); + + mtrr_if->set_all(); + + local_irq_restore(flags); +} + +static int __init mtrr_init_finialize(void) +{ + if (!mtrr_if) + return 0; + if (use_intel()) + mtrr_state_warn(); + else { + /* The CPUs haven't MTRR and seemes not support SMP. They have + * specific drivers, we use a tricky method to support + * suspend/resume for them. + * TBD: is there any system with such CPU which supports + * suspend/resume? if no, we should remove the code. + */ + sysdev_driver_register(&cpu_sysdev_class, + &mtrr_sysdev_driver); + } + return 0; +} +subsys_initcall(mtrr_init_finialize); diff --git a/arch/i386/kernel/cpu/mtrr/mtrr.h b/arch/i386/kernel/cpu/mtrr/mtrr.h index de135124559..99c9f268204 100644 --- a/arch/i386/kernel/cpu/mtrr/mtrr.h +++ b/arch/i386/kernel/cpu/mtrr/mtrr.h @@ -91,7 +91,6 @@ extern struct mtrr_ops * mtrr_if; extern unsigned int num_var_ranges; -void finalize_mtrr_state(void); void mtrr_state_warn(void); char *mtrr_attrib_to_str(int x); void mtrr_wrmsr(unsigned, unsigned, unsigned); diff --git a/arch/i386/kernel/crash.c b/arch/i386/kernel/crash.c new file mode 100644 index 00000000000..e5fab12f792 --- /dev/null +++ b/arch/i386/kernel/crash.c @@ -0,0 +1,223 @@ +/* + * Architecture specific (i386) functions for kexec based crash dumps. + * + * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) + * + * Copyright (C) IBM Corporation, 2004. All rights reserved. + * + */ + +#include <linux/init.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/smp.h> +#include <linux/irq.h> +#include <linux/reboot.h> +#include <linux/kexec.h> +#include <linux/irq.h> +#include <linux/delay.h> +#include <linux/elf.h> +#include <linux/elfcore.h> + +#include <asm/processor.h> +#include <asm/hardirq.h> +#include <asm/nmi.h> +#include <asm/hw_irq.h> +#include <asm/apic.h> +#include <mach_ipi.h> + + +note_buf_t crash_notes[NR_CPUS]; +/* This keeps a track of which one is crashing cpu. */ +static int crashing_cpu; + +static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, + size_t data_len) +{ + struct elf_note note; + + note.n_namesz = strlen(name) + 1; + note.n_descsz = data_len; + note.n_type = type; + memcpy(buf, ¬e, sizeof(note)); + buf += (sizeof(note) +3)/4; + memcpy(buf, name, note.n_namesz); + buf += (note.n_namesz + 3)/4; + memcpy(buf, data, note.n_descsz); + buf += (note.n_descsz + 3)/4; + + return buf; +} + +static void final_note(u32 *buf) +{ + struct elf_note note; + + note.n_namesz = 0; + note.n_descsz = 0; + note.n_type = 0; + memcpy(buf, ¬e, sizeof(note)); +} + +static void crash_save_this_cpu(struct pt_regs *regs, int cpu) +{ + struct elf_prstatus prstatus; + u32 *buf; + + if ((cpu < 0) || (cpu >= NR_CPUS)) + return; + + /* Using ELF notes here is opportunistic. + * I need a well defined structure format + * for the data I pass, and I need tags + * on the data to indicate what information I have + * squirrelled away. ELF notes happen to provide + * all of that that no need to invent something new. + */ + buf = &crash_notes[cpu][0]; + memset(&prstatus, 0, sizeof(prstatus)); + prstatus.pr_pid = current->pid; + elf_core_copy_regs(&prstatus.pr_reg, regs); + buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus, + sizeof(prstatus)); + final_note(buf); +} + +static void crash_get_current_regs(struct pt_regs *regs) +{ + __asm__ __volatile__("movl %%ebx,%0" : "=m"(regs->ebx)); + __asm__ __volatile__("movl %%ecx,%0" : "=m"(regs->ecx)); + __asm__ __volatile__("movl %%edx,%0" : "=m"(regs->edx)); + __asm__ __volatile__("movl %%esi,%0" : "=m"(regs->esi)); + __asm__ __volatile__("movl %%edi,%0" : "=m"(regs->edi)); + __asm__ __volatile__("movl %%ebp,%0" : "=m"(regs->ebp)); + __asm__ __volatile__("movl %%eax,%0" : "=m"(regs->eax)); + __asm__ __volatile__("movl %%esp,%0" : "=m"(regs->esp)); + __asm__ __volatile__("movw %%ss, %%ax;" :"=a"(regs->xss)); + __asm__ __volatile__("movw %%cs, %%ax;" :"=a"(regs->xcs)); + __asm__ __volatile__("movw %%ds, %%ax;" :"=a"(regs->xds)); + __asm__ __volatile__("movw %%es, %%ax;" :"=a"(regs->xes)); + __asm__ __volatile__("pushfl; popl %0" :"=m"(regs->eflags)); + + regs->eip = (unsigned long)current_text_addr(); +} + +/* CPU does not save ss and esp on stack if execution is already + * running in kernel mode at the time of NMI occurrence. This code + * fixes it. + */ +static void crash_setup_regs(struct pt_regs *newregs, struct pt_regs *oldregs) +{ + memcpy(newregs, oldregs, sizeof(*newregs)); + newregs->esp = (unsigned long)&(oldregs->esp); + __asm__ __volatile__("xorl %eax, %eax;"); + __asm__ __volatile__ ("movw %%ss, %%ax;" :"=a"(newregs->xss)); +} + +/* We may have saved_regs from where the error came from + * or it is NULL if via a direct panic(). + */ +static void crash_save_self(struct pt_regs *saved_regs) +{ + struct pt_regs regs; + int cpu; + + cpu = smp_processor_id(); + if (saved_regs) + crash_setup_regs(®s, saved_regs); + else + crash_get_current_regs(®s); + crash_save_this_cpu(®s, cpu); +} + +#ifdef CONFIG_SMP +static atomic_t waiting_for_crash_ipi; + +static int crash_nmi_callback(struct pt_regs *regs, int cpu) +{ + struct pt_regs fixed_regs; + + /* Don't do anything if this handler is invoked on crashing cpu. + * Otherwise, system will completely hang. Crashing cpu can get + * an NMI if system was initially booted with nmi_watchdog parameter. + */ + if (cpu == crashing_cpu) + return 1; + local_irq_disable(); + + if (!user_mode(regs)) { + crash_setup_regs(&fixed_regs, regs); + regs = &fixed_regs; + } + crash_save_this_cpu(regs, cpu); + disable_local_APIC(); + atomic_dec(&waiting_for_crash_ipi); + /* Assume hlt works */ + __asm__("hlt"); + for(;;); + + return 1; +} + +/* + * By using the NMI code instead of a vector we just sneak thru the + * word generator coming out with just what we want. AND it does + * not matter if clustered_apic_mode is set or not. + */ +static void smp_send_nmi_allbutself(void) +{ + send_IPI_allbutself(APIC_DM_NMI); +} + +static void nmi_shootdown_cpus(void) +{ + unsigned long msecs; + + atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); + /* Would it be better to replace the trap vector here? */ + set_nmi_callback(crash_nmi_callback); + /* Ensure the new callback function is set before sending + * out the NMI + */ + wmb(); + + smp_send_nmi_allbutself(); + + msecs = 1000; /* Wait at most a second for the other cpus to stop */ + while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) { + mdelay(1); + msecs--; + } + + /* Leave the nmi callback set */ + disable_local_APIC(); +} +#else +static void nmi_shootdown_cpus(void) +{ + /* There are no cpus to shootdown */ +} +#endif + +void machine_crash_shutdown(struct pt_regs *regs) +{ + /* This function is only called after the system + * has paniced or is otherwise in a critical state. + * The minimum amount of code to allow a kexec'd kernel + * to run successfully needs to happen here. + * + * In practice this means shooting down the other cpus in + * an SMP system. + */ + /* The kernel is broken so disable interrupts */ + local_irq_disable(); + + /* Make a note of crashing cpu. Will be used in NMI callback.*/ + crashing_cpu = smp_processor_id(); + nmi_shootdown_cpus(); + lapic_shutdown(); +#if defined(CONFIG_X86_IO_APIC) + disable_IO_APIC(); +#endif + crash_save_self(regs); +} diff --git a/arch/i386/kernel/dmi_scan.c b/arch/i386/kernel/dmi_scan.c index 6ed7e28f306..a3cdf894302 100644 --- a/arch/i386/kernel/dmi_scan.c +++ b/arch/i386/kernel/dmi_scan.c @@ -1,22 +1,15 @@ #include <linux/types.h> -#include <linux/kernel.h> #include <linux/string.h> #include <linux/init.h> #include <linux/module.h> -#include <linux/slab.h> -#include <linux/acpi.h> -#include <asm/io.h> -#include <linux/pm.h> -#include <asm/system.h> #include <linux/dmi.h> #include <linux/bootmem.h> -struct dmi_header -{ - u8 type; - u8 length; - u16 handle; +struct dmi_header { + u8 type; + u8 length; + u16 handle; }; #undef DMI_DEBUG @@ -29,15 +22,13 @@ struct dmi_header static char * __init dmi_string(struct dmi_header *dm, u8 s) { - u8 *bp=(u8 *)dm; - bp+=dm->length; - if(!s) + u8 *bp = ((u8 *) dm) + dm->length; + + if (!s) return ""; s--; - while(s>0 && *bp) - { - bp+=strlen(bp); - bp++; + while (s > 0 && *bp) { + bp += strlen(bp) + 1; s--; } return bp; @@ -47,16 +38,14 @@ static char * __init dmi_string(struct dmi_header *dm, u8 s) * We have to be cautious here. We have seen BIOSes with DMI pointers * pointing to completely the wrong place for example */ - -static int __init dmi_table(u32 base, int len, int num, void (*decode)(struct dmi_header *)) +static int __init dmi_table(u32 base, int len, int num, + void (*decode)(struct dmi_header *)) { - u8 *buf; - struct dmi_header *dm; - u8 *data; - int i=0; + u8 *buf, *data; + int i = 0; buf = bt_ioremap(base, len); - if(buf==NULL) + if (buf == NULL) return -1; data = buf; @@ -65,36 +54,34 @@ static int __init dmi_table(u32 base, int len, int num, void (*decode)(struct dm * Stop when we see all the items the table claimed to have * OR we run off the end of the table (also happens) */ - - while(i<num && data-buf+sizeof(struct dmi_header)<=len) - { - dm=(struct dmi_header *)data; + while ((i < num) && (data - buf + sizeof(struct dmi_header)) <= len) { + struct dmi_header *dm = (struct dmi_header *)data; /* * We want to know the total length (formated area and strings) * before decoding to make sure we won't run off the table in * dmi_decode or dmi_string */ - data+=dm->length; - while(data-buf<len-1 && (data[0] || data[1])) + data += dm->length; + while ((data - buf < len - 1) && (data[0] || data[1])) data++; - if(data-buf<len-1) + if (data - buf < len - 1) decode(dm); - data+=2; + data += 2; i++; } bt_iounmap(buf, len); return 0; } - -inline static int __init dmi_checksum(u8 *buf) +static int __init dmi_checksum(u8 *buf) { - u8 sum=0; + u8 sum = 0; int a; - for(a=0; a<15; a++) - sum+=buf[a]; - return (sum==0); + for (a = 0; a < 15; a++) + sum += buf[a]; + + return sum == 0; } static int __init dmi_iterate(void (*decode)(struct dmi_header *)) @@ -110,28 +97,30 @@ static int __init dmi_iterate(void (*decode)(struct dmi_header *)) p = ioremap(0xF0000, 0x10000); if (p == NULL) return -1; + for (q = p; q < p + 0x10000; q += 16) { memcpy_fromio(buf, q, 15); - if(memcmp(buf, "_DMI_", 5)==0 && dmi_checksum(buf)) - { - u16 num=buf[13]<<8|buf[12]; - u16 len=buf[7]<<8|buf[6]; - u32 base=buf[11]<<24|buf[10]<<16|buf[9]<<8|buf[8]; + if ((memcmp(buf, "_DMI_", 5) == 0) && dmi_checksum(buf)) { + u16 num = (buf[13] << 8) | buf[12]; + u16 len = (buf[7] << 8) | buf[6]; + u32 base = (buf[11] << 24) | (buf[10] << 16) | + (buf[9] << 8) | buf[8]; /* * DMI version 0.0 means that the real version is taken from * the SMBIOS version, which we don't know at this point. */ - if(buf[14]!=0) + if (buf[14] != 0) printk(KERN_INFO "DMI %d.%d present.\n", - buf[14]>>4, buf[14]&0x0F); + buf[14] >> 4, buf[14] & 0xF); else printk(KERN_INFO "DMI present.\n"); + dmi_printk((KERN_INFO "%d structures occupying %d bytes.\n", num, len)); - dmi_printk((KERN_INFO "DMI table at 0x%08X.\n", - base)); - if(dmi_table(base,len, num, decode)==0) + dmi_printk((KERN_INFO "DMI table at 0x%08X.\n", base)); + + if (dmi_table(base,len, num, decode) == 0) return 0; } } @@ -143,16 +132,17 @@ static char *dmi_ident[DMI_STRING_MAX]; /* * Save a DMI string */ - static void __init dmi_save_ident(struct dmi_header *dm, int slot, int string) { char *d = (char*)dm; char *p = dmi_string(dm, d[string]); - if(p==NULL || *p == 0) + + if (p == NULL || *p == 0) return; if (dmi_ident[slot]) return; - dmi_ident[slot] = alloc_bootmem(strlen(p)+1); + + dmi_ident[slot] = alloc_bootmem(strlen(p) + 1); if(dmi_ident[slot]) strcpy(dmi_ident[slot], p); else @@ -160,281 +150,47 @@ static void __init dmi_save_ident(struct dmi_header *dm, int slot, int string) } /* - * Ugly compatibility crap. - */ -#define dmi_blacklist dmi_system_id -#define NO_MATCH { DMI_NONE, NULL} -#define MATCH DMI_MATCH - -/* - * Toshiba keyboard likes to repeat keys when they are not repeated. - */ - -static __init int broken_toshiba_keyboard(struct dmi_blacklist *d) -{ - printk(KERN_WARNING "Toshiba with broken keyboard detected. If your keyboard sometimes generates 3 keypresses instead of one, see http://davyd.ucc.asn.au/projects/toshiba/README\n"); - return 0; -} - - -#ifdef CONFIG_ACPI_SLEEP -static __init int reset_videomode_after_s3(struct dmi_blacklist *d) -{ - /* See acpi_wakeup.S */ - extern long acpi_video_flags; - acpi_video_flags |= 2; - return 0; -} -#endif - - -#ifdef CONFIG_ACPI_BOOT -extern int acpi_force; - -static __init __attribute__((unused)) int dmi_disable_acpi(struct dmi_blacklist *d) -{ - if (!acpi_force) { - printk(KERN_NOTICE "%s detected: acpi off\n",d->ident); - disable_acpi(); - } else { - printk(KERN_NOTICE - "Warning: DMI blacklist says broken, but acpi forced\n"); - } - return 0; -} - -/* - * Limit ACPI to CPU enumeration for HT - */ -static __init __attribute__((unused)) int force_acpi_ht(struct dmi_blacklist *d) -{ - if (!acpi_force) { - printk(KERN_NOTICE "%s detected: force use of acpi=ht\n", d->ident); - disable_acpi(); - acpi_ht = 1; - } else { - printk(KERN_NOTICE - "Warning: acpi=force overrules DMI blacklist: acpi=ht\n"); - } - return 0; -} -#endif - -#ifdef CONFIG_ACPI_PCI -static __init int disable_acpi_irq(struct dmi_blacklist *d) -{ - if (!acpi_force) { - printk(KERN_NOTICE "%s detected: force use of acpi=noirq\n", - d->ident); - acpi_noirq_set(); - } - return 0; -} -static __init int disable_acpi_pci(struct dmi_blacklist *d) -{ - if (!acpi_force) { - printk(KERN_NOTICE "%s detected: force use of pci=noacpi\n", - d->ident); - acpi_disable_pci(); - } - return 0; -} -#endif - -/* - * Process the DMI blacklists - */ - - -/* - * This will be expanded over time to force things like the APM - * interrupt mask settings according to the laptop - */ - -static __initdata struct dmi_blacklist dmi_blacklist[]={ - - { broken_toshiba_keyboard, "Toshiba Satellite 4030cdt", { /* Keyboard generates spurious repeats */ - MATCH(DMI_PRODUCT_NAME, "S4030CDT/4.3"), - NO_MATCH, NO_MATCH, NO_MATCH - } }, -#ifdef CONFIG_ACPI_SLEEP - { reset_videomode_after_s3, "Toshiba Satellite 4030cdt", { /* Reset video mode after returning from ACPI S3 sleep */ - MATCH(DMI_PRODUCT_NAME, "S4030CDT/4.3"), - NO_MATCH, NO_MATCH, NO_MATCH - } }, -#endif - -#ifdef CONFIG_ACPI_BOOT - /* - * If your system is blacklisted here, but you find that acpi=force - * works for you, please contact acpi-devel@sourceforge.net - */ - - /* - * Boxes that need ACPI disabled - */ - - { dmi_disable_acpi, "IBM Thinkpad", { - MATCH(DMI_BOARD_VENDOR, "IBM"), - MATCH(DMI_BOARD_NAME, "2629H1G"), - NO_MATCH, NO_MATCH }}, - - /* - * Boxes that need acpi=ht - */ - - { force_acpi_ht, "FSC Primergy T850", { - MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"), - MATCH(DMI_PRODUCT_NAME, "PRIMERGY T850"), - NO_MATCH, NO_MATCH }}, - - { force_acpi_ht, "DELL GX240", { - MATCH(DMI_BOARD_VENDOR, "Dell Computer Corporation"), - MATCH(DMI_BOARD_NAME, "OptiPlex GX240"), - NO_MATCH, NO_MATCH }}, - - { force_acpi_ht, "HP VISUALIZE NT Workstation", { - MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"), - MATCH(DMI_PRODUCT_NAME, "HP VISUALIZE NT Workstation"), - NO_MATCH, NO_MATCH }}, - - { force_acpi_ht, "Compaq Workstation W8000", { - MATCH(DMI_SYS_VENDOR, "Compaq"), - MATCH(DMI_PRODUCT_NAME, "Workstation W8000"), - NO_MATCH, NO_MATCH }}, - - { force_acpi_ht, "ASUS P4B266", { - MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), - MATCH(DMI_BOARD_NAME, "P4B266"), - NO_MATCH, NO_MATCH }}, - - { force_acpi_ht, "ASUS P2B-DS", { - MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), - MATCH(DMI_BOARD_NAME, "P2B-DS"), - NO_MATCH, NO_MATCH }}, - - { force_acpi_ht, "ASUS CUR-DLS", { - MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), - MATCH(DMI_BOARD_NAME, "CUR-DLS"), - NO_MATCH, NO_MATCH }}, - - { force_acpi_ht, "ABIT i440BX-W83977", { - MATCH(DMI_BOARD_VENDOR, "ABIT <http://www.abit.com>"), - MATCH(DMI_BOARD_NAME, "i440BX-W83977 (BP6)"), - NO_MATCH, NO_MATCH }}, - - { force_acpi_ht, "IBM Bladecenter", { - MATCH(DMI_BOARD_VENDOR, "IBM"), - MATCH(DMI_BOARD_NAME, "IBM eServer BladeCenter HS20"), - NO_MATCH, NO_MATCH }}, - - { force_acpi_ht, "IBM eServer xSeries 360", { - MATCH(DMI_BOARD_VENDOR, "IBM"), - MATCH(DMI_BOARD_NAME, "eServer xSeries 360"), - NO_MATCH, NO_MATCH }}, - - { force_acpi_ht, "IBM eserver xSeries 330", { - MATCH(DMI_BOARD_VENDOR, "IBM"), - MATCH(DMI_BOARD_NAME, "eserver xSeries 330"), - NO_MATCH, NO_MATCH }}, - - { force_acpi_ht, "IBM eserver xSeries 440", { - MATCH(DMI_BOARD_VENDOR, "IBM"), - MATCH(DMI_PRODUCT_NAME, "eserver xSeries 440"), - NO_MATCH, NO_MATCH }}, - -#endif // CONFIG_ACPI_BOOT - -#ifdef CONFIG_ACPI_PCI - /* - * Boxes that need ACPI PCI IRQ routing disabled - */ - - { disable_acpi_irq, "ASUS A7V", { - MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC"), - MATCH(DMI_BOARD_NAME, "<A7V>"), - /* newer BIOS, Revision 1011, does work */ - MATCH(DMI_BIOS_VERSION, "ASUS A7V ACPI BIOS Revision 1007"), - NO_MATCH }}, - - /* - * Boxes that need ACPI PCI IRQ routing and PCI scan disabled - */ - { disable_acpi_pci, "ASUS PR-DLS", { /* _BBN 0 bug */ - MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), - MATCH(DMI_BOARD_NAME, "PR-DLS"), - MATCH(DMI_BIOS_VERSION, "ASUS PR-DLS ACPI BIOS Revision 1010"), - MATCH(DMI_BIOS_DATE, "03/21/2003") }}, - - { disable_acpi_pci, "Acer TravelMate 36x Laptop", { - MATCH(DMI_SYS_VENDOR, "Acer"), - MATCH(DMI_PRODUCT_NAME, "TravelMate 360"), - NO_MATCH, NO_MATCH - } }, - -#endif - - { NULL, } -}; - -/* * Process a DMI table entry. Right now all we care about are the BIOS * and machine entries. For 2.5 we should pull the smbus controller info * out of here. */ - static void __init dmi_decode(struct dmi_header *dm) { -#ifdef DMI_DEBUG - u8 *data = (u8 *)dm; -#endif + u8 *data __attribute__((__unused__)) = (u8 *)dm; - switch(dm->type) - { - case 0: - dmi_printk(("BIOS Vendor: %s\n", - dmi_string(dm, data[4]))); - dmi_save_ident(dm, DMI_BIOS_VENDOR, 4); - dmi_printk(("BIOS Version: %s\n", - dmi_string(dm, data[5]))); - dmi_save_ident(dm, DMI_BIOS_VERSION, 5); - dmi_printk(("BIOS Release: %s\n", - dmi_string(dm, data[8]))); - dmi_save_ident(dm, DMI_BIOS_DATE, 8); - break; - case 1: - dmi_printk(("System Vendor: %s\n", - dmi_string(dm, data[4]))); - dmi_save_ident(dm, DMI_SYS_VENDOR, 4); - dmi_printk(("Product Name: %s\n", - dmi_string(dm, data[5]))); - dmi_save_ident(dm, DMI_PRODUCT_NAME, 5); - dmi_printk(("Version: %s\n", - dmi_string(dm, data[6]))); - dmi_save_ident(dm, DMI_PRODUCT_VERSION, 6); - dmi_printk(("Serial Number: %s\n", - dmi_string(dm, data[7]))); - break; - case 2: - dmi_printk(("Board Vendor: %s\n", - dmi_string(dm, data[4]))); - dmi_save_ident(dm, DMI_BOARD_VENDOR, 4); - dmi_printk(("Board Name: %s\n", - dmi_string(dm, data[5]))); - dmi_save_ident(dm, DMI_BOARD_NAME, 5); - dmi_printk(("Board Version: %s\n", - dmi_string(dm, data[6]))); - dmi_save_ident(dm, DMI_BOARD_VERSION, 6); - break; + switch(dm->type) { + case 0: + dmi_printk(("BIOS Vendor: %s\n", dmi_string(dm, data[4]))); + dmi_save_ident(dm, DMI_BIOS_VENDOR, 4); + dmi_printk(("BIOS Version: %s\n", dmi_string(dm, data[5]))); + dmi_save_ident(dm, DMI_BIOS_VERSION, 5); + dmi_printk(("BIOS Release: %s\n", dmi_string(dm, data[8]))); + dmi_save_ident(dm, DMI_BIOS_DATE, 8); + break; + case 1: + dmi_printk(("System Vendor: %s\n", dmi_string(dm, data[4]))); + dmi_save_ident(dm, DMI_SYS_VENDOR, 4); + dmi_printk(("Product Name: %s\n", dmi_string(dm, data[5]))); + dmi_save_ident(dm, DMI_PRODUCT_NAME, 5); + dmi_printk(("Version: %s\n", dmi_string(dm, data[6]))); + dmi_save_ident(dm, DMI_PRODUCT_VERSION, 6); + dmi_printk(("Serial Number: %s\n", dmi_string(dm, data[7]))); + dmi_save_ident(dm, DMI_PRODUCT_SERIAL, 7); + break; + case 2: + dmi_printk(("Board Vendor: %s\n", dmi_string(dm, data[4]))); + dmi_save_ident(dm, DMI_BOARD_VENDOR, 4); + dmi_printk(("Board Name: %s\n", dmi_string(dm, data[5]))); + dmi_save_ident(dm, DMI_BOARD_NAME, 5); + dmi_printk(("Board Version: %s\n", dmi_string(dm, data[6]))); + dmi_save_ident(dm, DMI_BOARD_VERSION, 6); + break; } } void __init dmi_scan_machine(void) { - int err = dmi_iterate(dmi_decode); - if(err == 0) - dmi_check_system(dmi_blacklist); - else + if (dmi_iterate(dmi_decode)) printk(KERN_INFO "DMI not present.\n"); } @@ -470,7 +226,6 @@ fail: d++; return count; } - EXPORT_SYMBOL(dmi_check_system); /** @@ -480,8 +235,8 @@ EXPORT_SYMBOL(dmi_check_system); * Returns one DMI data value, can be used to perform * complex DMI data checks. */ -char * dmi_get_system_info(int field) +char *dmi_get_system_info(int field) { return dmi_ident[field]; } - +EXPORT_SYMBOL(dmi_get_system_info); diff --git a/arch/i386/kernel/efi.c b/arch/i386/kernel/efi.c index f732f427b41..385883ea8c1 100644 --- a/arch/i386/kernel/efi.c +++ b/arch/i386/kernel/efi.c @@ -30,6 +30,7 @@ #include <linux/ioport.h> #include <linux/module.h> #include <linux/efi.h> +#include <linux/kexec.h> #include <asm/setup.h> #include <asm/io.h> @@ -598,6 +599,9 @@ efi_initialize_iomem_resources(struct resource *code_resource, if (md->type == EFI_CONVENTIONAL_MEMORY) { request_resource(res, code_resource); request_resource(res, data_resource); +#ifdef CONFIG_KEXEC + request_resource(res, &crashk_res); +#endif } } } diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S index e966fc8c44c..4477bb10709 100644 --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -299,7 +299,6 @@ is386: movl $2,%ecx # set MP movl %eax,%cr0 call check_x87 - incb ready lgdt cpu_gdt_descr lidt idt_descr ljmp $(__KERNEL_CS),$1f @@ -316,8 +315,9 @@ is386: movl $2,%ecx # set MP lldt %ax cld # gcc2 wants the direction flag cleared at all times #ifdef CONFIG_SMP - movb ready, %cl - cmpb $1,%cl + movb ready, %cl + movb $1, ready + cmpb $0,%cl je 1f # the first CPU calls start_kernel # all other CPUs call initialize_secondary call initialize_secondary diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c index 2c4813b47e5..178f4e9bac9 100644 --- a/arch/i386/kernel/i8259.c +++ b/arch/i386/kernel/i8259.c @@ -268,10 +268,22 @@ static int i8259A_suspend(struct sys_device *dev, pm_message_t state) return 0; } +static int i8259A_shutdown(struct sys_device *dev) +{ + /* Put the i8259A into a quiescent state that + * the kernel initialization code can get it + * out of. + */ + outb(0xff, 0x21); /* mask all of 8259A-1 */ + outb(0xff, 0xA1); /* mask all of 8259A-1 */ + return 0; +} + static struct sysdev_class i8259_sysdev_class = { set_kset_name("i8259"), .suspend = i8259A_suspend, .resume = i8259A_resume, + .shutdown = i8259A_shutdown, }; static struct sys_device device_i8259A = { diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index 08540bc4ba3..6578f40bd50 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -37,6 +37,7 @@ #include <asm/smp.h> #include <asm/desc.h> #include <asm/timer.h> +#include <asm/i8259.h> #include <mach_apic.h> @@ -573,12 +574,14 @@ static int balanced_irq(void *unused) for ( ; ; ) { set_current_state(TASK_INTERRUPTIBLE); time_remaining = schedule_timeout(time_remaining); - try_to_freeze(PF_FREEZE); + try_to_freeze(); if (time_after(jiffies, prev_balance_time+balanced_irq_interval)) { + preempt_disable(); do_irq_balance(); prev_balance_time = jiffies; time_remaining = balanced_irq_interval; + preempt_enable(); } } return 0; @@ -630,10 +633,8 @@ static int __init balanced_irq_init(void) printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq"); failed: for (i = 0; i < NR_CPUS; i++) { - if(irq_cpu_data[i].irq_delta) - kfree(irq_cpu_data[i].irq_delta); - if(irq_cpu_data[i].last_irq) - kfree(irq_cpu_data[i].last_irq); + kfree(irq_cpu_data[i].irq_delta); + kfree(irq_cpu_data[i].last_irq); } return 0; } @@ -1566,7 +1567,6 @@ void print_all_local_APICs (void) void /*__init*/ print_PIC(void) { - extern spinlock_t i8259A_lock; unsigned int v; unsigned long flags; @@ -1634,12 +1634,43 @@ static void __init enable_IO_APIC(void) */ void disable_IO_APIC(void) { + int pin; /* * Clear the IO-APIC before rebooting: */ clear_IO_APIC(); - disconnect_bsp_APIC(); + /* + * If the i82559 is routed through an IOAPIC + * Put that IOAPIC in virtual wire mode + * so legacy interrups can be delivered. + */ + pin = find_isa_irq_pin(0, mp_ExtINT); + if (pin != -1) { + struct IO_APIC_route_entry entry; + unsigned long flags; + + memset(&entry, 0, sizeof(entry)); + entry.mask = 0; /* Enabled */ + entry.trigger = 0; /* Edge */ + entry.irr = 0; + entry.polarity = 0; /* High */ + entry.delivery_status = 0; + entry.dest_mode = 0; /* Physical */ + entry.delivery_mode = 7; /* ExtInt */ + entry.vector = 0; + entry.dest.physical.physical_dest = 0; + + + /* + * Add it to the IO-APIC irq-routing table: + */ + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1)); + io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0)); + spin_unlock_irqrestore(&ioapic_lock, flags); + } + disconnect_bsp_APIC(pin != -1); } /* diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c index 73945a3c53c..ce66dcc26d9 100644 --- a/arch/i386/kernel/irq.c +++ b/arch/i386/kernel/irq.c @@ -15,6 +15,9 @@ #include <linux/seq_file.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> +#include <linux/notifier.h> +#include <linux/cpu.h> +#include <linux/delay.h> DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_maxaligned_in_smp; EXPORT_PER_CPU_SYMBOL(irq_stat); @@ -153,6 +156,11 @@ void irq_ctx_init(int cpu) cpu,hardirq_ctx[cpu],softirq_ctx[cpu]); } +void irq_ctx_exit(int cpu) +{ + hardirq_ctx[cpu] = NULL; +} + extern asmlinkage void __do_softirq(void); asmlinkage void do_softirq(void) @@ -210,9 +218,8 @@ int show_interrupts(struct seq_file *p, void *v) if (i == 0) { seq_printf(p, " "); - for (j=0; j<NR_CPUS; j++) - if (cpu_online(j)) - seq_printf(p, "CPU%d ",j); + for_each_cpu(j) + seq_printf(p, "CPU%d ",j); seq_putc(p, '\n'); } @@ -225,9 +232,8 @@ int show_interrupts(struct seq_file *p, void *v) #ifndef CONFIG_SMP seq_printf(p, "%10u ", kstat_irqs(i)); #else - for (j = 0; j < NR_CPUS; j++) - if (cpu_online(j)) - seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); + for_each_cpu(j) + seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); #endif seq_printf(p, " %14s", irq_desc[i].handler->typename); seq_printf(p, " %s", action->name); @@ -240,16 +246,14 @@ skip: spin_unlock_irqrestore(&irq_desc[i].lock, flags); } else if (i == NR_IRQS) { seq_printf(p, "NMI: "); - for (j = 0; j < NR_CPUS; j++) - if (cpu_online(j)) - seq_printf(p, "%10u ", nmi_count(j)); + for_each_cpu(j) + seq_printf(p, "%10u ", nmi_count(j)); seq_putc(p, '\n'); #ifdef CONFIG_X86_LOCAL_APIC seq_printf(p, "LOC: "); - for (j = 0; j < NR_CPUS; j++) - if (cpu_online(j)) - seq_printf(p, "%10u ", - per_cpu(irq_stat,j).apic_timer_irqs); + for_each_cpu(j) + seq_printf(p, "%10u ", + per_cpu(irq_stat,j).apic_timer_irqs); seq_putc(p, '\n'); #endif seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); @@ -259,3 +263,45 @@ skip: } return 0; } + +#ifdef CONFIG_HOTPLUG_CPU +#include <mach_apic.h> + +void fixup_irqs(cpumask_t map) +{ + unsigned int irq; + static int warned; + + for (irq = 0; irq < NR_IRQS; irq++) { + cpumask_t mask; + if (irq == 2) + continue; + + cpus_and(mask, irq_affinity[irq], map); + if (any_online_cpu(mask) == NR_CPUS) { + printk("Breaking affinity for irq %i\n", irq); + mask = map; + } + if (irq_desc[irq].handler->set_affinity) + irq_desc[irq].handler->set_affinity(irq, mask); + else if (irq_desc[irq].action && !(warned++)) + printk("Cannot set affinity for irq %i\n", irq); + } + +#if 0 + barrier(); + /* Ingo Molnar says: "after the IO-APIC masks have been redirected + [note the nop - the interrupt-enable boundary on x86 is two + instructions from sti] - to flush out pending hardirqs and + IPIs. After this point nothing is supposed to reach this CPU." */ + __asm__ __volatile__("sti; nop; cli"); + barrier(); +#else + /* That doesn't seem sufficient. Give it 1ms. */ + local_irq_enable(); + mdelay(1); + local_irq_disable(); +#endif +} +#endif + diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c index 3762f6b35ab..a6d8c45961d 100644 --- a/arch/i386/kernel/kprobes.c +++ b/arch/i386/kernel/kprobes.c @@ -127,48 +127,23 @@ static inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs) regs->eip = (unsigned long)&p->ainsn.insn; } -struct task_struct *arch_get_kprobe_task(void *ptr) -{ - return ((struct thread_info *) (((unsigned long) ptr) & - (~(THREAD_SIZE -1))))->task; -} - void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs) { unsigned long *sara = (unsigned long *)®s->esp; - struct kretprobe_instance *ri; - static void *orig_ret_addr; + struct kretprobe_instance *ri; + + if ((ri = get_free_rp_inst(rp)) != NULL) { + ri->rp = rp; + ri->task = current; + ri->ret_addr = (kprobe_opcode_t *) *sara; - /* - * Save the return address when the return probe hits - * the first time, and use it to populate the (krprobe - * instance)->ret_addr for subsequent return probes at - * the same addrress since stack address would have - * the kretprobe_trampoline by then. - */ - if (((void*) *sara) != kretprobe_trampoline) - orig_ret_addr = (void*) *sara; - - if ((ri = get_free_rp_inst(rp)) != NULL) { - ri->rp = rp; - ri->stack_addr = sara; - ri->ret_addr = orig_ret_addr; - add_rp_inst(ri); /* Replace the return addr with trampoline addr */ *sara = (unsigned long) &kretprobe_trampoline; - } else { - rp->nmissed++; - } -} -void arch_kprobe_flush_task(struct task_struct *tk) -{ - struct kretprobe_instance *ri; - while ((ri = get_rp_inst_tsk(tk)) != NULL) { - *((unsigned long *)(ri->stack_addr)) = - (unsigned long) ri->ret_addr; - recycle_rp_inst(ri); - } + add_rp_inst(ri); + } else { + rp->nmissed++; + } } /* @@ -286,36 +261,59 @@ no_kprobe: */ int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) { - struct task_struct *tsk; - struct kretprobe_instance *ri; - struct hlist_head *head; - struct hlist_node *node; - unsigned long *sara = ((unsigned long *) ®s->esp) - 1; - - tsk = arch_get_kprobe_task(sara); - head = kretprobe_inst_table_head(tsk); - - hlist_for_each_entry(ri, node, head, hlist) { - if (ri->stack_addr == sara && ri->rp) { - if (ri->rp->handler) - ri->rp->handler(ri, regs); - } - } - return 0; -} + struct kretprobe_instance *ri = NULL; + struct hlist_head *head; + struct hlist_node *node, *tmp; + unsigned long orig_ret_address = 0; + unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline; -void trampoline_post_handler(struct kprobe *p, struct pt_regs *regs, - unsigned long flags) -{ - struct kretprobe_instance *ri; - /* RA already popped */ - unsigned long *sara = ((unsigned long *)®s->esp) - 1; + head = kretprobe_inst_table_head(current); - while ((ri = get_rp_inst(sara))) { - regs->eip = (unsigned long)ri->ret_addr; + /* + * It is possible to have multiple instances associated with a given + * task either because an multiple functions in the call path + * have a return probe installed on them, and/or more then one return + * return probe was registered for a target function. + * + * We can handle this because: + * - instances are always inserted at the head of the list + * - when multiple return probes are registered for the same + * function, the first instance's ret_addr will point to the + * real return address, and all the rest will point to + * kretprobe_trampoline + */ + hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { + if (ri->task != current) + /* another task is sharing our hash bucket */ + continue; + + if (ri->rp && ri->rp->handler) + ri->rp->handler(ri, regs); + + orig_ret_address = (unsigned long)ri->ret_addr; recycle_rp_inst(ri); + + if (orig_ret_address != trampoline_address) + /* + * This is the real return address. Any other + * instances associated with this task are for + * other calls deeper on the call stack + */ + break; } - regs->eflags &= ~TF_MASK; + + BUG_ON(!orig_ret_address || (orig_ret_address == trampoline_address)); + regs->eip = orig_ret_address; + + unlock_kprobes(); + preempt_enable_no_resched(); + + /* + * By returning a non-zero value, we are telling + * kprobe_handler() that we have handled unlocking + * and re-enabling preemption. + */ + return 1; } /* @@ -403,8 +401,7 @@ static inline int post_kprobe_handler(struct pt_regs *regs) current_kprobe->post_handler(current_kprobe, regs, 0); } - if (current_kprobe->post_handler != trampoline_post_handler) - resume_execution(current_kprobe, regs); + resume_execution(current_kprobe, regs); regs->eflags |= kprobe_saved_eflags; /*Restore back the original saved kprobes variables and continue. */ @@ -534,3 +531,13 @@ int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) } return 0; } + +static struct kprobe trampoline_p = { + .addr = (kprobe_opcode_t *) &kretprobe_trampoline, + .pre_handler = trampoline_probe_handler +}; + +int __init arch_init_kprobes(void) +{ + return register_kprobe(&trampoline_p); +} diff --git a/arch/i386/kernel/machine_kexec.c b/arch/i386/kernel/machine_kexec.c new file mode 100644 index 00000000000..52ed18d8b51 --- /dev/null +++ b/arch/i386/kernel/machine_kexec.c @@ -0,0 +1,226 @@ +/* + * machine_kexec.c - handle transition of Linux booting another kernel + * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include <linux/mm.h> +#include <linux/kexec.h> +#include <linux/delay.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/tlbflush.h> +#include <asm/mmu_context.h> +#include <asm/io.h> +#include <asm/apic.h> +#include <asm/cpufeature.h> + +static inline unsigned long read_cr3(void) +{ + unsigned long cr3; + asm volatile("movl %%cr3,%0": "=r"(cr3)); + return cr3; +} + +#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) + +#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) +#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) +#define L2_ATTR (_PAGE_PRESENT) + +#define LEVEL0_SIZE (1UL << 12UL) + +#ifndef CONFIG_X86_PAE +#define LEVEL1_SIZE (1UL << 22UL) +static u32 pgtable_level1[1024] PAGE_ALIGNED; + +static void identity_map_page(unsigned long address) +{ + unsigned long level1_index, level2_index; + u32 *pgtable_level2; + + /* Find the current page table */ + pgtable_level2 = __va(read_cr3()); + + /* Find the indexes of the physical address to identity map */ + level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE; + level2_index = address / LEVEL1_SIZE; + + /* Identity map the page table entry */ + pgtable_level1[level1_index] = address | L0_ATTR; + pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR; + + /* Flush the tlb so the new mapping takes effect. + * Global tlb entries are not flushed but that is not an issue. + */ + load_cr3(pgtable_level2); +} + +#else +#define LEVEL1_SIZE (1UL << 21UL) +#define LEVEL2_SIZE (1UL << 30UL) +static u64 pgtable_level1[512] PAGE_ALIGNED; +static u64 pgtable_level2[512] PAGE_ALIGNED; + +static void identity_map_page(unsigned long address) +{ + unsigned long level1_index, level2_index, level3_index; + u64 *pgtable_level3; + + /* Find the current page table */ + pgtable_level3 = __va(read_cr3()); + + /* Find the indexes of the physical address to identity map */ + level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE; + level2_index = (address % LEVEL2_SIZE)/LEVEL1_SIZE; + level3_index = address / LEVEL2_SIZE; + + /* Identity map the page table entry */ + pgtable_level1[level1_index] = address | L0_ATTR; + pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR; + set_64bit(&pgtable_level3[level3_index], + __pa(pgtable_level2) | L2_ATTR); + + /* Flush the tlb so the new mapping takes effect. + * Global tlb entries are not flushed but that is not an issue. + */ + load_cr3(pgtable_level3); +} +#endif + + +static void set_idt(void *newidt, __u16 limit) +{ + unsigned char curidt[6]; + + /* ia32 supports unaliged loads & stores */ + (*(__u16 *)(curidt)) = limit; + (*(__u32 *)(curidt +2)) = (unsigned long)(newidt); + + __asm__ __volatile__ ( + "lidt %0\n" + : "=m" (curidt) + ); +}; + + +static void set_gdt(void *newgdt, __u16 limit) +{ + unsigned char curgdt[6]; + + /* ia32 supports unaligned loads & stores */ + (*(__u16 *)(curgdt)) = limit; + (*(__u32 *)(curgdt +2)) = (unsigned long)(newgdt); + + __asm__ __volatile__ ( + "lgdt %0\n" + : "=m" (curgdt) + ); +}; + +static void load_segments(void) +{ +#define __STR(X) #X +#define STR(X) __STR(X) + + __asm__ __volatile__ ( + "\tljmp $"STR(__KERNEL_CS)",$1f\n" + "\t1:\n" + "\tmovl $"STR(__KERNEL_DS)",%eax\n" + "\tmovl %eax,%ds\n" + "\tmovl %eax,%es\n" + "\tmovl %eax,%fs\n" + "\tmovl %eax,%gs\n" + "\tmovl %eax,%ss\n" + ); +#undef STR +#undef __STR +} + +typedef asmlinkage NORET_TYPE void (*relocate_new_kernel_t)( + unsigned long indirection_page, + unsigned long reboot_code_buffer, + unsigned long start_address, + unsigned int has_pae) ATTRIB_NORET; + +const extern unsigned char relocate_new_kernel[]; +extern void relocate_new_kernel_end(void); +const extern unsigned int relocate_new_kernel_size; + +/* + * A architecture hook called to validate the + * proposed image and prepare the control pages + * as needed. The pages for KEXEC_CONTROL_CODE_SIZE + * have been allocated, but the segments have yet + * been copied into the kernel. + * + * Do what every setup is needed on image and the + * reboot code buffer to allow us to avoid allocations + * later. + * + * Currently nothing. + */ +int machine_kexec_prepare(struct kimage *image) +{ + return 0; +} + +/* + * Undo anything leftover by machine_kexec_prepare + * when an image is freed. + */ +void machine_kexec_cleanup(struct kimage *image) +{ +} + +/* + * Do not allocate memory (or fail in any way) in machine_kexec(). + * We are past the point of no return, committed to rebooting now. + */ +NORET_TYPE void machine_kexec(struct kimage *image) +{ + unsigned long page_list; + unsigned long reboot_code_buffer; + + relocate_new_kernel_t rnk; + + /* Interrupts aren't acceptable while we reboot */ + local_irq_disable(); + + /* Compute some offsets */ + reboot_code_buffer = page_to_pfn(image->control_code_page) + << PAGE_SHIFT; + page_list = image->head; + + /* Set up an identity mapping for the reboot_code_buffer */ + identity_map_page(reboot_code_buffer); + + /* copy it out */ + memcpy((void *)reboot_code_buffer, relocate_new_kernel, + relocate_new_kernel_size); + + /* The segment registers are funny things, they are + * automatically loaded from a table, in memory wherever you + * set them to a specific selector, but this table is never + * accessed again you set the segment to a different selector. + * + * The more common model is are caches where the behide + * the scenes work is done, but is also dropped at arbitrary + * times. + * + * I take advantage of this here by force loading the + * segments, before I zap the gdt with an invalid value. + */ + load_segments(); + /* The gdt & idt are now invalid. + * If you want to load them you must set up your own idt & gdt. + */ + set_gdt(phys_to_virt(0),0); + set_idt(phys_to_virt(0),0); + + /* now call it */ + rnk = (relocate_new_kernel_t) reboot_code_buffer; + (*rnk)(page_list, reboot_code_buffer, image->start, cpu_has_pae); +} diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c index 383a11600d2..af917f609c7 100644 --- a/arch/i386/kernel/mpparse.c +++ b/arch/i386/kernel/mpparse.c @@ -67,7 +67,6 @@ unsigned long mp_lapic_addr; /* Processor that is doing the boot up */ unsigned int boot_cpu_physical_apicid = -1U; -unsigned int boot_cpu_logical_apicid = -1U; /* Internal processor count */ static unsigned int __initdata num_processors; @@ -180,7 +179,6 @@ static void __init MP_processor_info (struct mpc_config_processor *m) if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { Dprintk(" Bootup CPU\n"); boot_cpu_physical_apicid = m->mpc_apicid; - boot_cpu_logical_apicid = apicid; } if (num_processors >= NR_CPUS) { diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index aea2ce1145d..ba243a4cc11 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -13,6 +13,7 @@ #include <stdarg.h> +#include <linux/cpu.h> #include <linux/errno.h> #include <linux/sched.h> #include <linux/fs.h> @@ -55,6 +56,9 @@ #include <linux/irq.h> #include <linux/err.h> +#include <asm/tlbflush.h> +#include <asm/cpu.h> + asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); static int hlt_counter; @@ -143,14 +147,42 @@ static void poll_idle (void) } } +#ifdef CONFIG_HOTPLUG_CPU +#include <asm/nmi.h> +/* We don't actually take CPU down, just spin without interrupts. */ +static inline void play_dead(void) +{ + /* This must be done before dead CPU ack */ + cpu_exit_clear(); + wbinvd(); + mb(); + /* Ack it */ + __get_cpu_var(cpu_state) = CPU_DEAD; + + /* + * With physical CPU hotplug, we should halt the cpu + */ + local_irq_disable(); + while (1) + __asm__ __volatile__("hlt":::"memory"); +} +#else +static inline void play_dead(void) +{ + BUG(); +} +#endif /* CONFIG_HOTPLUG_CPU */ + /* * The idle thread. There's no useful work to be * done, so just try to conserve power and have a * low exit latency (ie sit in a loop waiting for * somebody to say that they'd like to reschedule) */ -void cpu_idle (void) +void cpu_idle(void) { + int cpu = raw_smp_processor_id(); + /* endless idle loop with no priority at all */ while (1) { while (!need_resched()) { @@ -165,6 +197,9 @@ void cpu_idle (void) if (!idle) idle = default_idle; + if (cpu_is_offline(cpu)) + play_dead(); + __get_cpu_var(irq_stat).idle_timestamp = jiffies; idle(); } @@ -223,7 +258,7 @@ static void mwait_idle(void) } } -void __init select_idle_routine(const struct cpuinfo_x86 *c) +void __devinit select_idle_routine(const struct cpuinfo_x86 *c) { if (cpu_has(c, X86_FEATURE_MWAIT)) { printk("monitor/mwait feature present.\n"); @@ -582,6 +617,33 @@ handle_io_bitmap(struct thread_struct *next, struct tss_struct *tss) } /* + * This function selects if the context switch from prev to next + * has to tweak the TSC disable bit in the cr4. + */ +static inline void disable_tsc(struct task_struct *prev_p, + struct task_struct *next_p) +{ + struct thread_info *prev, *next; + + /* + * gcc should eliminate the ->thread_info dereference if + * has_secure_computing returns 0 at compile time (SECCOMP=n). + */ + prev = prev_p->thread_info; + next = next_p->thread_info; + + if (has_secure_computing(prev) || has_secure_computing(next)) { + /* slow path here */ + if (has_secure_computing(prev) && + !has_secure_computing(next)) { + write_cr4(read_cr4() & ~X86_CR4_TSD); + } else if (!has_secure_computing(prev) && + has_secure_computing(next)) + write_cr4(read_cr4() | X86_CR4_TSD); + } +} + +/* * switch_to(x,yn) should switch tasks from x to y. * * We fsave/fwait so that an exception goes off at the right time @@ -660,6 +722,8 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) handle_io_bitmap(next, tss); + disable_tsc(prev_p, next_p); + return prev_p; } diff --git a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c index db912209a8d..b3e58484996 100644 --- a/arch/i386/kernel/reboot.c +++ b/arch/i386/kernel/reboot.c @@ -26,7 +26,6 @@ static int reboot_mode; static int reboot_thru_bios; #ifdef CONFIG_SMP -int reboot_smp = 0; static int reboot_cpu = -1; /* shamelessly grabbed from lib/vsprintf.c for readability */ #define is_digit(c) ((c) >= '0' && (c) <= '9') @@ -49,7 +48,6 @@ static int __init reboot_setup(char *str) break; #ifdef CONFIG_SMP case 's': /* "smp" reboot by executing reset on BSP or other CPU*/ - reboot_smp = 1; if (is_digit(*(str+1))) { reboot_cpu = (int) (*(str+1) - '0'); if (is_digit(*(str+2))) @@ -88,33 +86,9 @@ static int __init set_bios_reboot(struct dmi_system_id *d) return 0; } -/* - * Some machines require the "reboot=s" commandline option, this quirk makes that automatic. - */ -static int __init set_smp_reboot(struct dmi_system_id *d) -{ -#ifdef CONFIG_SMP - if (!reboot_smp) { - reboot_smp = 1; - printk(KERN_INFO "%s series board detected. Selecting SMP-method for reboots.\n", d->ident); - } -#endif - return 0; -} - -/* - * Some machines require the "reboot=b,s" commandline option, this quirk makes that automatic. - */ -static int __init set_smp_bios_reboot(struct dmi_system_id *d) -{ - set_smp_reboot(d); - set_bios_reboot(d); - return 0; -} - static struct dmi_system_id __initdata reboot_dmi_table[] = { { /* Handle problems with rebooting on Dell 1300's */ - .callback = set_smp_bios_reboot, + .callback = set_bios_reboot, .ident = "Dell PowerEdge 1300", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), @@ -301,41 +275,32 @@ void machine_real_restart(unsigned char *code, int length) EXPORT_SYMBOL(machine_real_restart); #endif -void machine_restart(char * __unused) +void machine_shutdown(void) { #ifdef CONFIG_SMP - int cpuid; - - cpuid = GET_APIC_ID(apic_read(APIC_ID)); - - if (reboot_smp) { - - /* check to see if reboot_cpu is valid - if its not, default to the BSP */ - if ((reboot_cpu == -1) || - (reboot_cpu > (NR_CPUS -1)) || - !physid_isset(cpuid, phys_cpu_present_map)) - reboot_cpu = boot_cpu_physical_apicid; - - reboot_smp = 0; /* use this as a flag to only go through this once*/ - /* re-run this function on the other CPUs - it will fall though this section since we have - cleared reboot_smp, and do the reboot if it is the - correct CPU, otherwise it halts. */ - if (reboot_cpu != cpuid) - smp_call_function((void *)machine_restart , NULL, 1, 0); + int reboot_cpu_id; + + /* The boot cpu is always logical cpu 0 */ + reboot_cpu_id = 0; + + /* See if there has been given a command line override */ + if ((reboot_cpu_id != -1) && (reboot_cpu < NR_CPUS) && + cpu_isset(reboot_cpu, cpu_online_map)) { + reboot_cpu_id = reboot_cpu; } - /* if reboot_cpu is still -1, then we want a tradional reboot, - and if we are not running on the reboot_cpu,, halt */ - if ((reboot_cpu != -1) && (cpuid != reboot_cpu)) { - for (;;) - __asm__ __volatile__ ("hlt"); + /* Make certain the cpu I'm rebooting on is online */ + if (!cpu_isset(reboot_cpu_id, cpu_online_map)) { + reboot_cpu_id = smp_processor_id(); } - /* - * Stop all CPUs and turn off local APICs and the IO-APIC, so - * other OSs see a clean IRQ state. + + /* Make certain I only run on the appropriate processor */ + set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id)); + + /* O.K. Now that I'm on the appropriate processor, stop + * all of the others, and disable their local APICs. */ + smp_send_stop(); #endif /* CONFIG_SMP */ @@ -344,6 +309,11 @@ void machine_restart(char * __unused) #ifdef CONFIG_X86_IO_APIC disable_IO_APIC(); #endif +} + +void machine_restart(char * __unused) +{ + machine_shutdown(); if (!reboot_thru_bios) { if (efi_enabled) { diff --git a/arch/i386/kernel/relocate_kernel.S b/arch/i386/kernel/relocate_kernel.S new file mode 100644 index 00000000000..d312616effa --- /dev/null +++ b/arch/i386/kernel/relocate_kernel.S @@ -0,0 +1,120 @@ +/* + * relocate_kernel.S - put the kernel image in place to boot + * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include <linux/linkage.h> + + /* + * Must be relocatable PIC code callable as a C function, that once + * it starts can not use the previous processes stack. + */ + .globl relocate_new_kernel +relocate_new_kernel: + /* read the arguments and say goodbye to the stack */ + movl 4(%esp), %ebx /* page_list */ + movl 8(%esp), %ebp /* reboot_code_buffer */ + movl 12(%esp), %edx /* start address */ + movl 16(%esp), %ecx /* cpu_has_pae */ + + /* zero out flags, and disable interrupts */ + pushl $0 + popfl + + /* set a new stack at the bottom of our page... */ + lea 4096(%ebp), %esp + + /* store the parameters back on the stack */ + pushl %edx /* store the start address */ + + /* Set cr0 to a known state: + * 31 0 == Paging disabled + * 18 0 == Alignment check disabled + * 16 0 == Write protect disabled + * 3 0 == No task switch + * 2 0 == Don't do FP software emulation. + * 0 1 == Proctected mode enabled + */ + movl %cr0, %eax + andl $~((1<<31)|(1<<18)|(1<<16)|(1<<3)|(1<<2)), %eax + orl $(1<<0), %eax + movl %eax, %cr0 + + /* clear cr4 if applicable */ + testl %ecx, %ecx + jz 1f + /* Set cr4 to a known state: + * Setting everything to zero seems safe. + */ + movl %cr4, %eax + andl $0, %eax + movl %eax, %cr4 + + jmp 1f +1: + + /* Flush the TLB (needed?) */ + xorl %eax, %eax + movl %eax, %cr3 + + /* Do the copies */ + movl %ebx, %ecx + jmp 1f + +0: /* top, read another word from the indirection page */ + movl (%ebx), %ecx + addl $4, %ebx +1: + testl $0x1, %ecx /* is it a destination page */ + jz 2f + movl %ecx, %edi + andl $0xfffff000, %edi + jmp 0b +2: + testl $0x2, %ecx /* is it an indirection page */ + jz 2f + movl %ecx, %ebx + andl $0xfffff000, %ebx + jmp 0b +2: + testl $0x4, %ecx /* is it the done indicator */ + jz 2f + jmp 3f +2: + testl $0x8, %ecx /* is it the source indicator */ + jz 0b /* Ignore it otherwise */ + movl %ecx, %esi /* For every source page do a copy */ + andl $0xfffff000, %esi + + movl $1024, %ecx + rep ; movsl + jmp 0b + +3: + + /* To be certain of avoiding problems with self-modifying code + * I need to execute a serializing instruction here. + * So I flush the TLB, it's handy, and not processor dependent. + */ + xorl %eax, %eax + movl %eax, %cr3 + + /* set all of the registers to known values */ + /* leave %esp alone */ + + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %esi, %esi + xorl %edi, %edi + xorl %ebp, %ebp + ret +relocate_new_kernel_end: + + .globl relocate_new_kernel_size +relocate_new_kernel_size: + .long relocate_new_kernel_end - relocate_new_kernel diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index 30406fd0b64..7306353c520 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -43,7 +43,12 @@ #include <linux/init.h> #include <linux/edd.h> #include <linux/nodemask.h> +#include <linux/kexec.h> +#include <linux/crash_dump.h> + #include <video/edid.h> + +#include <asm/apic.h> #include <asm/e820.h> #include <asm/mpspec.h> #include <asm/setup.h> @@ -55,12 +60,15 @@ #include "setup_arch_pre.h" #include <bios_ebda.h> +/* Forward Declaration. */ +void __init find_max_pfn(void); + /* This value is set up by the early boot code to point to the value immediately after the boot time page tables. It contains a *physical* address, and must not be in the .bss segment! */ unsigned long init_pg_tables_end __initdata = ~0UL; -int disable_pse __initdata = 0; +int disable_pse __devinitdata = 0; /* * Machine setup.. @@ -732,6 +740,15 @@ static void __init parse_cmdline_early (char ** cmdline_p) if (to != command_line) to--; if (!memcmp(from+7, "exactmap", 8)) { +#ifdef CONFIG_CRASH_DUMP + /* If we are doing a crash dump, we + * still need to know the real mem + * size before original memory map is + * reset. + */ + find_max_pfn(); + saved_max_pfn = max_pfn; +#endif from += 8+7; e820.nr_map = 0; userdef = 1; @@ -835,6 +852,44 @@ static void __init parse_cmdline_early (char ** cmdline_p) #endif /* CONFIG_X86_LOCAL_APIC */ #endif /* CONFIG_ACPI_BOOT */ +#ifdef CONFIG_X86_LOCAL_APIC + /* enable local APIC */ + else if (!memcmp(from, "lapic", 5)) + lapic_enable(); + + /* disable local APIC */ + else if (!memcmp(from, "nolapic", 6)) + lapic_disable(); +#endif /* CONFIG_X86_LOCAL_APIC */ + +#ifdef CONFIG_KEXEC + /* crashkernel=size@addr specifies the location to reserve for + * a crash kernel. By reserving this memory we guarantee + * that linux never set's it up as a DMA target. + * Useful for holding code to do something appropriate + * after a kernel panic. + */ + else if (!memcmp(from, "crashkernel=", 12)) { + unsigned long size, base; + size = memparse(from+12, &from); + if (*from == '@') { + base = memparse(from+1, &from); + /* FIXME: Do I want a sanity check + * to validate the memory range? + */ + crashk_res.start = base; + crashk_res.end = base + size - 1; + } + } +#endif +#ifdef CONFIG_CRASH_DUMP + /* elfcorehdr= specifies the location of elf core header + * stored by the crashed kernel. + */ + else if (!memcmp(from, "elfcorehdr=", 11)) + elfcorehdr_addr = memparse(from+11, &from); +#endif + /* * highmem=size forces highmem to be exactly 'size' bytes. * This works even on boxes that have no highmem otherwise. @@ -1113,8 +1168,8 @@ void __init setup_bootmem_allocator(void) * the (very unlikely) case of us accidentally initializing the * bootmem allocator with an invalid RAM area. */ - reserve_bootmem(HIGH_MEMORY, (PFN_PHYS(min_low_pfn) + - bootmap_size + PAGE_SIZE-1) - (HIGH_MEMORY)); + reserve_bootmem(__PHYSICAL_START, (PFN_PHYS(min_low_pfn) + + bootmap_size + PAGE_SIZE-1) - (__PHYSICAL_START)); /* * reserve physical page 0 - it's a special BIOS page on many boxes, @@ -1170,6 +1225,11 @@ void __init setup_bootmem_allocator(void) } } #endif +#ifdef CONFIG_KEXEC + if (crashk_res.start != crashk_res.end) + reserve_bootmem(crashk_res.start, + crashk_res.end - crashk_res.start + 1); +#endif } /* @@ -1223,6 +1283,9 @@ legacy_init_iomem_resources(struct resource *code_resource, struct resource *dat */ request_resource(res, code_resource); request_resource(res, data_resource); +#ifdef CONFIG_KEXEC + request_resource(res, &crashk_res); +#endif } } } diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c index b9b8f4e20fa..89ef7adc63a 100644 --- a/arch/i386/kernel/signal.c +++ b/arch/i386/kernel/signal.c @@ -608,10 +608,8 @@ int fastcall do_signal(struct pt_regs *regs, sigset_t *oldset) if (!user_mode(regs)) return 1; - if (current->flags & PF_FREEZE) { - refrigerator(0); + if (try_to_freeze()) goto no_signal; - } if (!oldset) oldset = ¤t->blocked; diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index 68be7d0c723..cec4bde6716 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -19,6 +19,7 @@ #include <linux/mc146818rtc.h> #include <linux/cache.h> #include <linux/interrupt.h> +#include <linux/cpu.h> #include <linux/module.h> #include <asm/mtrr.h> @@ -164,7 +165,7 @@ void send_IPI_mask_bitmask(cpumask_t cpumask, int vector) unsigned long flags; local_irq_save(flags); - + WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]); /* * Wait for idle. */ @@ -346,21 +347,21 @@ out: static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, unsigned long va) { - cpumask_t tmp; /* * A couple of (to be removed) sanity checks: * - * - we do not send IPIs to not-yet booted CPUs. * - current CPU must not be in mask * - mask must exist :) */ BUG_ON(cpus_empty(cpumask)); - - cpus_and(tmp, cpumask, cpu_online_map); - BUG_ON(!cpus_equal(cpumask, tmp)); BUG_ON(cpu_isset(smp_processor_id(), cpumask)); BUG_ON(!mm); + /* If a CPU which we ran on has gone down, OK. */ + cpus_and(cpumask, cpumask, cpu_online_map); + if (cpus_empty(cpumask)) + return; + /* * i'm not happy about this global shared spinlock in the * MM hot path, but we'll see how contended it is. @@ -476,6 +477,7 @@ void flush_tlb_all(void) */ void smp_send_reschedule(int cpu) { + WARN_ON(cpu_is_offline(cpu)); send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); } @@ -493,6 +495,16 @@ struct call_data_struct { int wait; }; +void lock_ipi_call_lock(void) +{ + spin_lock_irq(&call_lock); +} + +void unlock_ipi_call_lock(void) +{ + spin_unlock_irq(&call_lock); +} + static struct call_data_struct * call_data; /* @@ -516,10 +528,15 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic, */ { struct call_data_struct data; - int cpus = num_online_cpus()-1; + int cpus; - if (!cpus) + /* Holding any lock stops cpus from going down. */ + spin_lock(&call_lock); + cpus = num_online_cpus() - 1; + if (!cpus) { + spin_unlock(&call_lock); return 0; + } /* Can deadlock when called with interrupts disabled */ WARN_ON(irqs_disabled()); @@ -531,7 +548,6 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic, if (wait) atomic_set(&data.finished, 0); - spin_lock(&call_lock); call_data = &data; mb(); diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index c20d96d5c15..8ac8e9fd561 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -44,6 +44,9 @@ #include <linux/smp_lock.h> #include <linux/irq.h> #include <linux/bootmem.h> +#include <linux/notifier.h> +#include <linux/cpu.h> +#include <linux/percpu.h> #include <linux/delay.h> #include <linux/mc146818rtc.h> @@ -56,20 +59,30 @@ #include <smpboot_hooks.h> /* Set if we find a B stepping CPU */ -static int __initdata smp_b_stepping; +static int __devinitdata smp_b_stepping; /* Number of siblings per CPU package */ int smp_num_siblings = 1; #ifdef CONFIG_X86_HT EXPORT_SYMBOL(smp_num_siblings); #endif -int phys_proc_id[NR_CPUS]; /* Package ID of each logical CPU */ + +/* Package ID of each logical CPU */ +int phys_proc_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID}; EXPORT_SYMBOL(phys_proc_id); -int cpu_core_id[NR_CPUS]; /* Core ID of each logical CPU */ + +/* Core ID of each logical CPU */ +int cpu_core_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID}; EXPORT_SYMBOL(cpu_core_id); +cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(cpu_sibling_map); + +cpumask_t cpu_core_map[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(cpu_core_map); + /* bitmap of online cpus */ -cpumask_t cpu_online_map; +cpumask_t cpu_online_map __read_mostly; EXPORT_SYMBOL(cpu_online_map); cpumask_t cpu_callin_map; @@ -77,11 +90,17 @@ cpumask_t cpu_callout_map; EXPORT_SYMBOL(cpu_callout_map); static cpumask_t smp_commenced_mask; +/* TSC's upper 32 bits can't be written in eariler CPU (before prescott), there + * is no way to resync one AP against BP. TBD: for prescott and above, we + * should use IA64's algorithm + */ +static int __devinitdata tsc_sync_disabled; + /* Per CPU bogomips and other parameters */ struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; EXPORT_SYMBOL(cpu_data); -u8 x86_cpu_to_apicid[NR_CPUS] = +u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0xff }; EXPORT_SYMBOL(x86_cpu_to_apicid); @@ -96,13 +115,16 @@ static int trampoline_exec; static void map_cpu_to_logical_apicid(void); +/* State of each CPU. */ +DEFINE_PER_CPU(int, cpu_state) = { 0 }; + /* * Currently trivial. Write the real->protected mode * bootstrap into the page concerned. The caller * has made sure it's suitably aligned. */ -static unsigned long __init setup_trampoline(void) +static unsigned long __devinit setup_trampoline(void) { memcpy(trampoline_base, trampoline_data, trampoline_end - trampoline_data); return virt_to_phys(trampoline_base); @@ -132,7 +154,7 @@ void __init smp_alloc_memory(void) * a given CPU */ -static void __init smp_store_cpu_info(int id) +static void __devinit smp_store_cpu_info(int id) { struct cpuinfo_x86 *c = cpu_data + id; @@ -326,7 +348,7 @@ extern void calibrate_delay(void); static atomic_t init_deasserted; -static void __init smp_callin(void) +static void __devinit smp_callin(void) { int cpuid, phys_id; unsigned long timeout; @@ -411,16 +433,48 @@ static void __init smp_callin(void) /* * Synchronize the TSC with the BP */ - if (cpu_has_tsc && cpu_khz) + if (cpu_has_tsc && cpu_khz && !tsc_sync_disabled) synchronize_tsc_ap(); } static int cpucount; +static inline void +set_cpu_sibling_map(int cpu) +{ + int i; + + if (smp_num_siblings > 1) { + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_isset(i, cpu_callout_map)) + continue; + if (cpu_core_id[cpu] == cpu_core_id[i]) { + cpu_set(i, cpu_sibling_map[cpu]); + cpu_set(cpu, cpu_sibling_map[i]); + } + } + } else { + cpu_set(cpu, cpu_sibling_map[cpu]); + } + + if (current_cpu_data.x86_num_cores > 1) { + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_isset(i, cpu_callout_map)) + continue; + if (phys_proc_id[cpu] == phys_proc_id[i]) { + cpu_set(i, cpu_core_map[cpu]); + cpu_set(cpu, cpu_core_map[i]); + } + } + } else { + cpu_core_map[cpu] = cpu_sibling_map[cpu]; + } +} + /* * Activate a secondary processor. */ -static void __init start_secondary(void *unused) +static void __devinit start_secondary(void *unused) { /* * Dont put anything before smp_callin(), SMP @@ -443,7 +497,23 @@ static void __init start_secondary(void *unused) * the local TLBs too. */ local_flush_tlb(); + + /* This must be done before setting cpu_online_map */ + set_cpu_sibling_map(raw_smp_processor_id()); + wmb(); + + /* + * We need to hold call_lock, so there is no inconsistency + * between the time smp_call_function() determines number of + * IPI receipients, and the time when the determination is made + * for which cpus receive the IPI. Holding this + * lock helps us to not include this cpu in a currently in progress + * smp_call_function(). + */ + lock_ipi_call_lock(); cpu_set(smp_processor_id(), cpu_online_map); + unlock_ipi_call_lock(); + per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; /* We can take interrupts now: we're officially "up". */ local_irq_enable(); @@ -458,7 +528,7 @@ static void __init start_secondary(void *unused) * from the task structure * This function must not return. */ -void __init initialize_secondary(void) +void __devinit initialize_secondary(void) { /* * We don't actually need to load the full TSS, @@ -480,10 +550,10 @@ extern struct { #ifdef CONFIG_NUMA /* which logical CPUs are on which nodes */ -cpumask_t node_2_cpu_mask[MAX_NUMNODES] = +cpumask_t node_2_cpu_mask[MAX_NUMNODES] __read_mostly = { [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE }; /* which node each logical CPU is on */ -int cpu_2_node[NR_CPUS] = { [0 ... NR_CPUS-1] = 0 }; +int cpu_2_node[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 }; EXPORT_SYMBOL(cpu_2_node); /* set up a mapping between cpu and node. */ @@ -511,7 +581,7 @@ static inline void unmap_cpu_to_node(int cpu) #endif /* CONFIG_NUMA */ -u8 cpu_2_logical_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; +u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; static void map_cpu_to_logical_apicid(void) { @@ -572,7 +642,7 @@ static inline void __inquire_remote_apic(int apicid) * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this * won't ... remember to clear down the APIC, etc later. */ -static int __init +static int __devinit wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) { unsigned long send_status = 0, accept_status = 0; @@ -618,7 +688,7 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) #endif /* WAKE_SECONDARY_VIA_NMI */ #ifdef WAKE_SECONDARY_VIA_INIT -static int __init +static int __devinit wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) { unsigned long send_status = 0, accept_status = 0; @@ -753,8 +823,43 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) #endif /* WAKE_SECONDARY_VIA_INIT */ extern cpumask_t cpu_initialized; +static inline int alloc_cpu_id(void) +{ + cpumask_t tmp_map; + int cpu; + cpus_complement(tmp_map, cpu_present_map); + cpu = first_cpu(tmp_map); + if (cpu >= NR_CPUS) + return -ENODEV; + return cpu; +} + +#ifdef CONFIG_HOTPLUG_CPU +static struct task_struct * __devinitdata cpu_idle_tasks[NR_CPUS]; +static inline struct task_struct * alloc_idle_task(int cpu) +{ + struct task_struct *idle; + + if ((idle = cpu_idle_tasks[cpu]) != NULL) { + /* initialize thread_struct. we really want to avoid destroy + * idle tread + */ + idle->thread.esp = (unsigned long)(((struct pt_regs *) + (THREAD_SIZE + (unsigned long) idle->thread_info)) - 1); + init_idle(idle, cpu); + return idle; + } + idle = fork_idle(cpu); + + if (!IS_ERR(idle)) + cpu_idle_tasks[cpu] = idle; + return idle; +} +#else +#define alloc_idle_task(cpu) fork_idle(cpu) +#endif -static int __init do_boot_cpu(int apicid) +static int __devinit do_boot_cpu(int apicid, int cpu) /* * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad * (ie clustered apic addressing mode), this is a LOGICAL apic ID. @@ -763,16 +868,17 @@ static int __init do_boot_cpu(int apicid) { struct task_struct *idle; unsigned long boot_error; - int timeout, cpu; + int timeout; unsigned long start_eip; unsigned short nmi_high = 0, nmi_low = 0; - cpu = ++cpucount; + ++cpucount; + /* * We can't use kernel_thread since we must avoid to * reschedule the child. */ - idle = fork_idle(cpu); + idle = alloc_idle_task(cpu); if (IS_ERR(idle)) panic("failed fork for CPU %d", cpu); idle->thread.eip = (unsigned long) start_secondary; @@ -839,13 +945,16 @@ static int __init do_boot_cpu(int apicid) inquire_remote_apic(apicid); } } - x86_cpu_to_apicid[cpu] = apicid; + if (boot_error) { /* Try to put things back the way they were before ... */ unmap_cpu_to_logical_apicid(cpu); cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */ cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */ cpucount--; + } else { + x86_cpu_to_apicid[cpu] = apicid; + cpu_set(cpu, cpu_present_map); } /* mark "stuck" area as not stuck */ @@ -854,6 +963,75 @@ static int __init do_boot_cpu(int apicid) return boot_error; } +#ifdef CONFIG_HOTPLUG_CPU +void cpu_exit_clear(void) +{ + int cpu = raw_smp_processor_id(); + + idle_task_exit(); + + cpucount --; + cpu_uninit(); + irq_ctx_exit(cpu); + + cpu_clear(cpu, cpu_callout_map); + cpu_clear(cpu, cpu_callin_map); + cpu_clear(cpu, cpu_present_map); + + cpu_clear(cpu, smp_commenced_mask); + unmap_cpu_to_logical_apicid(cpu); +} + +struct warm_boot_cpu_info { + struct completion *complete; + int apicid; + int cpu; +}; + +static void __devinit do_warm_boot_cpu(void *p) +{ + struct warm_boot_cpu_info *info = p; + do_boot_cpu(info->apicid, info->cpu); + complete(info->complete); +} + +int __devinit smp_prepare_cpu(int cpu) +{ + DECLARE_COMPLETION(done); + struct warm_boot_cpu_info info; + struct work_struct task; + int apicid, ret; + + lock_cpu_hotplug(); + apicid = x86_cpu_to_apicid[cpu]; + if (apicid == BAD_APICID) { + ret = -ENODEV; + goto exit; + } + + info.complete = &done; + info.apicid = apicid; + info.cpu = cpu; + INIT_WORK(&task, do_warm_boot_cpu, &info); + + tsc_sync_disabled = 1; + + /* init low mem mapping */ + memcpy(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, + sizeof(swapper_pg_dir[0]) * KERNEL_PGD_PTRS); + flush_tlb_all(); + schedule_work(&task); + wait_for_completion(&done); + + tsc_sync_disabled = 0; + zap_low_mappings(); + ret = 0; +exit: + unlock_cpu_hotplug(); + return ret; +} +#endif + static void smp_tune_scheduling (void) { unsigned long cachesize; /* kB */ @@ -895,13 +1073,6 @@ void *xquad_portio; EXPORT_SYMBOL(xquad_portio); #endif -cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned; -#ifdef CONFIG_X86_HT -EXPORT_SYMBOL(cpu_sibling_map); -#endif -cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned; -EXPORT_SYMBOL(cpu_core_map); - static void __init smp_boot_cpus(unsigned int max_cpus) { int apicid, cpu, bit, kicked; @@ -1013,7 +1184,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus) if (max_cpus <= cpucount+1) continue; - if (do_boot_cpu(apicid)) + if (((cpu = alloc_cpu_id()) <= 0) || do_boot_cpu(apicid, cpu)) printk("CPU #%d not responding - cannot use it.\n", apicid); else @@ -1065,44 +1236,8 @@ static void __init smp_boot_cpus(unsigned int max_cpus) cpus_clear(cpu_core_map[cpu]); } - for (cpu = 0; cpu < NR_CPUS; cpu++) { - struct cpuinfo_x86 *c = cpu_data + cpu; - int siblings = 0; - int i; - if (!cpu_isset(cpu, cpu_callout_map)) - continue; - - if (smp_num_siblings > 1) { - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_isset(i, cpu_callout_map)) - continue; - if (cpu_core_id[cpu] == cpu_core_id[i]) { - siblings++; - cpu_set(i, cpu_sibling_map[cpu]); - } - } - } else { - siblings++; - cpu_set(cpu, cpu_sibling_map[cpu]); - } - - if (siblings != smp_num_siblings) { - printk(KERN_WARNING "WARNING: %d siblings found for CPU%d, should be %d\n", siblings, cpu, smp_num_siblings); - smp_num_siblings = siblings; - } - - if (c->x86_num_cores > 1) { - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_isset(i, cpu_callout_map)) - continue; - if (phys_proc_id[cpu] == phys_proc_id[i]) { - cpu_set(i, cpu_core_map[cpu]); - } - } - } else { - cpu_core_map[cpu] = cpu_sibling_map[cpu]; - } - } + cpu_set(0, cpu_sibling_map[0]); + cpu_set(0, cpu_core_map[0]); smpboot_setup_io_apic(); @@ -1119,6 +1254,9 @@ static void __init smp_boot_cpus(unsigned int max_cpus) who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */ void __init smp_prepare_cpus(unsigned int max_cpus) { + smp_commenced_mask = cpumask_of_cpu(0); + cpu_callin_map = cpumask_of_cpu(0); + mb(); smp_boot_cpus(max_cpus); } @@ -1126,23 +1264,98 @@ void __devinit smp_prepare_boot_cpu(void) { cpu_set(smp_processor_id(), cpu_online_map); cpu_set(smp_processor_id(), cpu_callout_map); + cpu_set(smp_processor_id(), cpu_present_map); + per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; } -int __devinit __cpu_up(unsigned int cpu) +#ifdef CONFIG_HOTPLUG_CPU +static void +remove_siblinginfo(int cpu) { - /* This only works at boot for x86. See "rewrite" above. */ - if (cpu_isset(cpu, smp_commenced_mask)) { - local_irq_enable(); - return -ENOSYS; + int sibling; + + for_each_cpu_mask(sibling, cpu_sibling_map[cpu]) + cpu_clear(cpu, cpu_sibling_map[sibling]); + for_each_cpu_mask(sibling, cpu_core_map[cpu]) + cpu_clear(cpu, cpu_core_map[sibling]); + cpus_clear(cpu_sibling_map[cpu]); + cpus_clear(cpu_core_map[cpu]); + phys_proc_id[cpu] = BAD_APICID; + cpu_core_id[cpu] = BAD_APICID; +} + +int __cpu_disable(void) +{ + cpumask_t map = cpu_online_map; + int cpu = smp_processor_id(); + + /* + * Perhaps use cpufreq to drop frequency, but that could go + * into generic code. + * + * We won't take down the boot processor on i386 due to some + * interrupts only being able to be serviced by the BSP. + * Especially so if we're not using an IOAPIC -zwane + */ + if (cpu == 0) + return -EBUSY; + + /* We enable the timer again on the exit path of the death loop */ + disable_APIC_timer(); + /* Allow any queued timer interrupts to get serviced */ + local_irq_enable(); + mdelay(1); + local_irq_disable(); + + remove_siblinginfo(cpu); + + cpu_clear(cpu, map); + fixup_irqs(map); + /* It's now safe to remove this processor from the online map */ + cpu_clear(cpu, cpu_online_map); + return 0; +} + +void __cpu_die(unsigned int cpu) +{ + /* We don't do anything here: idle task is faking death itself. */ + unsigned int i; + + for (i = 0; i < 10; i++) { + /* They ack this in play_dead by setting CPU_DEAD */ + if (per_cpu(cpu_state, cpu) == CPU_DEAD) { + printk ("CPU %d is now offline\n", cpu); + return; + } + current->state = TASK_UNINTERRUPTIBLE; + schedule_timeout(HZ/10); } + printk(KERN_ERR "CPU %u didn't die...\n", cpu); +} +#else /* ... !CONFIG_HOTPLUG_CPU */ +int __cpu_disable(void) +{ + return -ENOSYS; +} + +void __cpu_die(unsigned int cpu) +{ + /* We said "no" in __cpu_disable */ + BUG(); +} +#endif /* CONFIG_HOTPLUG_CPU */ +int __devinit __cpu_up(unsigned int cpu) +{ /* In case one didn't come up */ if (!cpu_isset(cpu, cpu_callin_map)) { + printk(KERN_DEBUG "skipping cpu%d, didn't come online\n", cpu); local_irq_enable(); return -EIO; } local_irq_enable(); + per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; /* Unleash the CPU! */ cpu_set(cpu, smp_commenced_mask); while (!cpu_isset(cpu, cpu_online_map)) @@ -1156,10 +1369,12 @@ void __init smp_cpus_done(unsigned int max_cpus) setup_ioapic_dest(); #endif zap_low_mappings(); +#ifndef CONFIG_HOTPLUG_CPU /* * Disable executability of the SMP trampoline: */ set_kernel_exec((unsigned long)trampoline_base, trampoline_exec); +#endif } void __init smp_intr_init(void) diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S index d408afaf649..468500a7e89 100644 --- a/arch/i386/kernel/syscall_table.S +++ b/arch/i386/kernel/syscall_table.S @@ -283,9 +283,14 @@ ENTRY(sys_call_table) .long sys_mq_timedreceive /* 280 */ .long sys_mq_notify .long sys_mq_getsetattr - .long sys_ni_syscall /* reserved for kexec */ + .long sys_kexec_load .long sys_waitid .long sys_ni_syscall /* 285 */ /* available */ .long sys_add_key .long sys_request_key .long sys_keyctl + .long sys_ioprio_set + .long sys_ioprio_get /* 290 */ + .long sys_inotify_init + .long sys_inotify_add_watch + .long sys_inotify_rm_watch diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c index 960d8bd137d..0bada1870bd 100644 --- a/arch/i386/kernel/sysenter.c +++ b/arch/i386/kernel/sysenter.c @@ -21,11 +21,16 @@ extern asmlinkage void sysenter_entry(void); -void enable_sep_cpu(void *info) +void enable_sep_cpu(void) { int cpu = get_cpu(); struct tss_struct *tss = &per_cpu(init_tss, cpu); + if (!boot_cpu_has(X86_FEATURE_SEP)) { + put_cpu(); + return; + } + tss->ss1 = __KERNEL_CS; tss->esp1 = sizeof(struct tss_struct) + (unsigned long) tss; wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); @@ -41,7 +46,7 @@ void enable_sep_cpu(void *info) extern const char vsyscall_int80_start, vsyscall_int80_end; extern const char vsyscall_sysenter_start, vsyscall_sysenter_end; -static int __init sysenter_setup(void) +int __init sysenter_setup(void) { void *page = (void *)get_zeroed_page(GFP_ATOMIC); @@ -58,8 +63,5 @@ static int __init sysenter_setup(void) &vsyscall_sysenter_start, &vsyscall_sysenter_end - &vsyscall_sysenter_start); - on_each_cpu(enable_sep_cpu, NULL, 1, 1); return 0; } - -__initcall(sysenter_setup); diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c index e68d9fdb075..0ee9dee8af0 100644 --- a/arch/i386/kernel/time.c +++ b/arch/i386/kernel/time.c @@ -68,7 +68,8 @@ #include "io_ports.h" -extern spinlock_t i8259A_lock; +#include <asm/i8259.h> + int pit_latch_buggy; /* extern */ #include "do_timer.h" @@ -85,10 +86,12 @@ extern unsigned long wall_jiffies; DEFINE_SPINLOCK(rtc_lock); EXPORT_SYMBOL(rtc_lock); +#include <asm/i8253.h> + DEFINE_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); -struct timer_opts *cur_timer = &timer_none; +struct timer_opts *cur_timer __read_mostly = &timer_none; /* * This is a special lock that is owned by the CPU and holds the index diff --git a/arch/i386/kernel/time_hpet.c b/arch/i386/kernel/time_hpet.c index 10a0cbb88e7..658c0629ba6 100644 --- a/arch/i386/kernel/time_hpet.c +++ b/arch/i386/kernel/time_hpet.c @@ -50,7 +50,7 @@ static void hpet_writel(unsigned long d, unsigned long a) * comparator value and continue. Next tick can be caught by checking * for a change in the comparator value. Used in apic.c. */ -static void __init wait_hpet_tick(void) +static void __devinit wait_hpet_tick(void) { unsigned int start_cmp_val, end_cmp_val; diff --git a/arch/i386/kernel/timers/common.c b/arch/i386/kernel/timers/common.c index 37353bd3180..8163fe0cf1f 100644 --- a/arch/i386/kernel/timers/common.c +++ b/arch/i386/kernel/timers/common.c @@ -86,7 +86,7 @@ bad_ctc: #define CALIBRATE_CNT_HPET (5 * hpet_tick) #define CALIBRATE_TIME_HPET (5 * KERNEL_TICK_USEC) -unsigned long __init calibrate_tsc_hpet(unsigned long *tsc_hpet_quotient_ptr) +unsigned long __devinit calibrate_tsc_hpet(unsigned long *tsc_hpet_quotient_ptr) { unsigned long tsc_startlow, tsc_starthigh; unsigned long tsc_endlow, tsc_endhigh; diff --git a/arch/i386/kernel/timers/timer_cyclone.c b/arch/i386/kernel/timers/timer_cyclone.c index f6f1206a11b..13892a65c94 100644 --- a/arch/i386/kernel/timers/timer_cyclone.c +++ b/arch/i386/kernel/timers/timer_cyclone.c @@ -17,9 +17,9 @@ #include <asm/io.h> #include <asm/pgtable.h> #include <asm/fixmap.h> -#include "io_ports.h" +#include <asm/i8253.h> -extern spinlock_t i8253_lock; +#include "io_ports.h" /* Number of usecs that the last interrupt was delayed */ static int delay_at_last_interrupt; diff --git a/arch/i386/kernel/timers/timer_hpet.c b/arch/i386/kernel/timers/timer_hpet.c index d766e0963ac..ef8dac5dd33 100644 --- a/arch/i386/kernel/timers/timer_hpet.c +++ b/arch/i386/kernel/timers/timer_hpet.c @@ -18,7 +18,7 @@ #include "mach_timer.h" #include <asm/hpet.h> -static unsigned long hpet_usec_quotient; /* convert hpet clks to usec */ +static unsigned long __read_mostly hpet_usec_quotient; /* convert hpet clks to usec */ static unsigned long tsc_hpet_quotient; /* convert tsc to hpet clks */ static unsigned long hpet_last; /* hpet counter value at last tick*/ static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */ @@ -180,7 +180,7 @@ static int __init init_hpet(char* override) /************************************************************/ /* tsc timer_opts struct */ -static struct timer_opts timer_hpet = { +static struct timer_opts timer_hpet __read_mostly = { .name = "hpet", .mark_offset = mark_offset_hpet, .get_offset = get_offset_hpet, diff --git a/arch/i386/kernel/timers/timer_pit.c b/arch/i386/kernel/timers/timer_pit.c index 967d5453cd0..06de036a820 100644 --- a/arch/i386/kernel/timers/timer_pit.c +++ b/arch/i386/kernel/timers/timer_pit.c @@ -15,9 +15,8 @@ #include <asm/smp.h> #include <asm/io.h> #include <asm/arch_hooks.h> +#include <asm/i8253.h> -extern spinlock_t i8259A_lock; -extern spinlock_t i8253_lock; #include "do_timer.h" #include "io_ports.h" @@ -166,7 +165,6 @@ struct init_timer_opts __initdata timer_pit_init = { void setup_pit_timer(void) { - extern spinlock_t i8253_lock; unsigned long flags; spin_lock_irqsave(&i8253_lock, flags); diff --git a/arch/i386/kernel/timers/timer_tsc.c b/arch/i386/kernel/timers/timer_tsc.c index 54c36b18202..8f4e4d5bc56 100644 --- a/arch/i386/kernel/timers/timer_tsc.c +++ b/arch/i386/kernel/timers/timer_tsc.c @@ -24,6 +24,7 @@ #include "mach_timer.h" #include <asm/hpet.h> +#include <asm/i8253.h> #ifdef CONFIG_HPET_TIMER static unsigned long hpet_usec_quotient; @@ -33,9 +34,7 @@ static struct timer_opts timer_tsc; static inline void cpufreq_delayed_get(void); -int tsc_disable __initdata = 0; - -extern spinlock_t i8253_lock; +int tsc_disable __devinitdata = 0; static int use_tsc; /* Number of usecs that the last interrupt was delayed */ diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index e4d4e2162c7..a61f33d06ea 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -27,6 +27,7 @@ #include <linux/ptrace.h> #include <linux/utsname.h> #include <linux/kprobes.h> +#include <linux/kexec.h> #ifdef CONFIG_EISA #include <linux/ioport.h> @@ -234,22 +235,22 @@ void show_registers(struct pt_regs *regs) * time of the fault.. */ if (in_kernel) { - u8 *eip; + u8 __user *eip; printk("\nStack: "); show_stack(NULL, (unsigned long*)esp); printk("Code: "); - eip = (u8 *)regs->eip - 43; + eip = (u8 __user *)regs->eip - 43; for (i = 0; i < 64; i++, eip++) { unsigned char c; - if (eip < (u8 *)PAGE_OFFSET || __get_user(c, eip)) { + if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) { printk(" Bad EIP value."); break; } - if (eip == (u8 *)regs->eip) + if (eip == (u8 __user *)regs->eip) printk("<%02x> ", c); else printk("%02x ", c); @@ -273,13 +274,13 @@ static void handle_BUG(struct pt_regs *regs) if (eip < PAGE_OFFSET) goto no_bug; - if (__get_user(ud2, (unsigned short *)eip)) + if (__get_user(ud2, (unsigned short __user *)eip)) goto no_bug; if (ud2 != 0x0b0f) goto no_bug; - if (__get_user(line, (unsigned short *)(eip + 2))) + if (__get_user(line, (unsigned short __user *)(eip + 2))) goto bug; - if (__get_user(file, (char **)(eip + 4)) || + if (__get_user(file, (char * __user *)(eip + 4)) || (unsigned long)file < PAGE_OFFSET || __get_user(c, file)) file = "<bad filename>"; @@ -294,6 +295,9 @@ bug: printk("Kernel BUG\n"); } +/* This is gone through when something in the kernel + * has done something bad and is about to be terminated. +*/ void die(const char * str, struct pt_regs * regs, long err) { static struct { @@ -341,6 +345,10 @@ void die(const char * str, struct pt_regs * regs, long err) bust_spinlocks(0); die.lock_owner = -1; spin_unlock_irq(&die.lock); + + if (kexec_should_crash(current)) + crash_kexec(regs); + if (in_interrupt()) panic("Fatal exception in interrupt"); @@ -361,6 +369,10 @@ static inline void die_if_kernel(const char * str, struct pt_regs * regs, long e static void do_trap(int trapnr, int signr, char *str, int vm86, struct pt_regs * regs, long error_code, siginfo_t *info) { + struct task_struct *tsk = current; + tsk->thread.error_code = error_code; + tsk->thread.trap_no = trapnr; + if (regs->eflags & VM_MASK) { if (vm86) goto vm86_trap; @@ -371,9 +383,6 @@ static void do_trap(int trapnr, int signr, char *str, int vm86, goto kernel_trap; trap_signal: { - struct task_struct *tsk = current; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; if (info) force_sig_info(signr, info, tsk); else @@ -486,6 +495,9 @@ fastcall void do_general_protection(struct pt_regs * regs, long error_code) } put_cpu(); + current->thread.error_code = error_code; + current->thread.trap_no = 13; + if (regs->eflags & VM_MASK) goto gp_in_vm86; @@ -570,6 +582,15 @@ void die_nmi (struct pt_regs *regs, const char *msg) console_silent(); spin_unlock(&nmi_print_lock); bust_spinlocks(0); + + /* If we are in kernel we are probably nested up pretty bad + * and might aswell get out now while we still can. + */ + if (!user_mode(regs)) { + current->thread.trap_no = 2; + crash_kexec(regs); + } + do_exit(SIGSEGV); } @@ -625,6 +646,14 @@ fastcall void do_nmi(struct pt_regs * regs, long error_code) nmi_enter(); cpu = smp_processor_id(); + +#ifdef CONFIG_HOTPLUG_CPU + if (!cpu_online(cpu)) { + nmi_exit(); + return; + } +#endif + ++nmi_count(cpu); if (!nmi_callback(regs, cpu)) @@ -872,9 +901,9 @@ fastcall void do_simd_coprocessor_error(struct pt_regs * regs, error_code); return; } - die_if_kernel("cache flush denied", regs, error_code); current->thread.trap_no = 19; current->thread.error_code = error_code; + die_if_kernel("cache flush denied", regs, error_code); force_sig(SIGSEGV, current); } } diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index e0512cc8bea..761972f8cb6 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -2,20 +2,23 @@ * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>; */ +#define LOAD_OFFSET __PAGE_OFFSET + #include <asm-generic/vmlinux.lds.h> #include <asm/thread_info.h> #include <asm/page.h> OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") OUTPUT_ARCH(i386) -ENTRY(startup_32) +ENTRY(phys_startup_32) jiffies = jiffies_64; SECTIONS { - . = __PAGE_OFFSET + 0x100000; + . = __KERNEL_START; + phys_startup_32 = startup_32 - LOAD_OFFSET; /* read-only */ _text = .; /* Text and read-only data */ - .text : { + .text : AT(ADDR(.text) - LOAD_OFFSET) { *(.text) SCHED_TEXT LOCK_TEXT @@ -27,49 +30,58 @@ SECTIONS . = ALIGN(16); /* Exception table */ __start___ex_table = .; - __ex_table : { *(__ex_table) } + __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) } __stop___ex_table = .; RODATA /* writeable */ - .data : { /* Data */ + .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */ *(.data) CONSTRUCTORS } . = ALIGN(4096); __nosave_begin = .; - .data_nosave : { *(.data.nosave) } + .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) } . = ALIGN(4096); __nosave_end = .; . = ALIGN(4096); - .data.page_aligned : { *(.data.idt) } + .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { + *(.data.idt) + } . = ALIGN(32); - .data.cacheline_aligned : { *(.data.cacheline_aligned) } + .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { + *(.data.cacheline_aligned) + } + /* rarely changed data like cpu maps */ + . = ALIGN(32); + .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { *(.data.read_mostly) } _edata = .; /* End of data section */ . = ALIGN(THREAD_SIZE); /* init_task */ - .data.init_task : { *(.data.init_task) } + .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { + *(.data.init_task) + } /* will be freed after init */ . = ALIGN(4096); /* Init code and data */ __init_begin = .; - .init.text : { + .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { _sinittext = .; *(.init.text) _einittext = .; } - .init.data : { *(.init.data) } + .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) } . = ALIGN(16); __setup_start = .; - .init.setup : { *(.init.setup) } + .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) } __setup_end = .; __initcall_start = .; - .initcall.init : { + .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { *(.initcall1.init) *(.initcall2.init) *(.initcall3.init) @@ -80,33 +92,41 @@ SECTIONS } __initcall_end = .; __con_initcall_start = .; - .con_initcall.init : { *(.con_initcall.init) } + .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { + *(.con_initcall.init) + } __con_initcall_end = .; SECURITY_INIT . = ALIGN(4); __alt_instructions = .; - .altinstructions : { *(.altinstructions) } + .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { + *(.altinstructions) + } __alt_instructions_end = .; - .altinstr_replacement : { *(.altinstr_replacement) } + .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { + *(.altinstr_replacement) + } /* .exit.text is discard at runtime, not link time, to deal with references from .altinstructions and .eh_frame */ - .exit.text : { *(.exit.text) } - .exit.data : { *(.exit.data) } + .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) } + .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) } . = ALIGN(4096); __initramfs_start = .; - .init.ramfs : { *(.init.ramfs) } + .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) } __initramfs_end = .; . = ALIGN(32); __per_cpu_start = .; - .data.percpu : { *(.data.percpu) } + .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) } __per_cpu_end = .; . = ALIGN(4096); __init_end = .; /* freed after init ends here */ __bss_start = .; /* BSS */ - .bss : { + .bss.page_aligned : AT(ADDR(.bss.page_aligned) - LOAD_OFFSET) { *(.bss.page_aligned) + } + .bss : AT(ADDR(.bss) - LOAD_OFFSET) { *(.bss) } . = ALIGN(4); diff --git a/arch/i386/mach-default/setup.c b/arch/i386/mach-default/setup.c index 0aa08eaa893..e5a1a83d09e 100644 --- a/arch/i386/mach-default/setup.c +++ b/arch/i386/mach-default/setup.c @@ -10,6 +10,14 @@ #include <asm/acpi.h> #include <asm/arch_hooks.h> +#ifdef CONFIG_HOTPLUG_CPU +#define DEFAULT_SEND_IPI (1) +#else +#define DEFAULT_SEND_IPI (0) +#endif + +int no_broadcast=DEFAULT_SEND_IPI; + /** * pre_intr_init_hook - initialisation prior to setting up interrupt vectors * @@ -104,3 +112,22 @@ void __init mca_nmi_hook(void) printk("NMI generated from unknown source!\n"); } #endif + +static __init int no_ipi_broadcast(char *str) +{ + get_option(&str, &no_broadcast); + printk ("Using %s mode\n", no_broadcast ? "No IPI Broadcast" : + "IPI Broadcast"); + return 1; +} + +__setup("no_ipi_broadcast", no_ipi_broadcast); + +static int __init print_ipi_mode(void) +{ + printk ("Using IPI %s mode\n", no_broadcast ? "No-Shortcut" : + "Shortcut"); + return 0; +} + +late_initcall(print_ipi_mode); diff --git a/arch/i386/mach-default/topology.c b/arch/i386/mach-default/topology.c index 5b3e8817dae..23395fff35d 100644 --- a/arch/i386/mach-default/topology.c +++ b/arch/i386/mach-default/topology.c @@ -73,12 +73,11 @@ static int __init topology_init(void) { int i; - for (i = 0; i < MAX_NUMNODES; i++) { - if (node_online(i)) - arch_register_node(i); - } - for (i = 0; i < NR_CPUS; i++) - if (cpu_possible(i)) arch_register_cpu(i); + for_each_online_node(i) + arch_register_node(i); + + for_each_cpu(i) + arch_register_cpu(i); return 0; } @@ -88,8 +87,8 @@ static int __init topology_init(void) { int i; - for (i = 0; i < NR_CPUS; i++) - if (cpu_possible(i)) arch_register_cpu(i); + for_each_cpu(i) + arch_register_cpu(i); return 0; } diff --git a/arch/i386/mach-visws/mpparse.c b/arch/i386/mach-visws/mpparse.c index 5a22082147f..5f3d7e6de37 100644 --- a/arch/i386/mach-visws/mpparse.c +++ b/arch/i386/mach-visws/mpparse.c @@ -23,7 +23,6 @@ unsigned long mp_lapic_addr; /* Processor that is doing the boot up */ unsigned int boot_cpu_physical_apicid = -1U; -unsigned int boot_cpu_logical_apicid = -1U; /* Bitmask of physically existing CPUs */ physid_mask_t phys_cpu_present_map; @@ -52,10 +51,8 @@ static void __init MP_processor_info (struct mpc_config_processor *m) (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4, m->mpc_apicver); - if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { + if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) boot_cpu_physical_apicid = m->mpc_apicid; - boot_cpu_logical_apicid = logical_apicid; - } ver = m->mpc_apicver; if ((ver >= 0x14 && m->mpc_apicid >= 0xff) || m->mpc_apicid >= 0xf) { diff --git a/arch/i386/mach-voyager/voyager_basic.c b/arch/i386/mach-voyager/voyager_basic.c index 602aea240e9..3e439ce5e1b 100644 --- a/arch/i386/mach-voyager/voyager_basic.c +++ b/arch/i386/mach-voyager/voyager_basic.c @@ -30,6 +30,7 @@ #include <linux/irq.h> #include <asm/tlbflush.h> #include <asm/arch_hooks.h> +#include <asm/i8253.h> /* * Power off function, if any @@ -182,7 +183,6 @@ voyager_timer_interrupt(struct pt_regs *regs) * and swiftly introduce it to something sharp and * pointy. */ __u16 val; - extern spinlock_t i8253_lock; spin_lock(&i8253_lock); diff --git a/arch/i386/mm/discontig.c b/arch/i386/mm/discontig.c index f429c871e84..b358f0702a4 100644 --- a/arch/i386/mm/discontig.c +++ b/arch/i386/mm/discontig.c @@ -30,6 +30,8 @@ #include <linux/initrd.h> #include <linux/nodemask.h> #include <linux/module.h> +#include <linux/kexec.h> + #include <asm/e820.h> #include <asm/setup.h> #include <asm/mmzone.h> diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c index a509237c481..8e90339d6ea 100644 --- a/arch/i386/mm/fault.c +++ b/arch/i386/mm/fault.c @@ -146,7 +146,7 @@ static int __is_prefetch(struct pt_regs *regs, unsigned long addr) if (instr > limit) break; - if (__get_user(opcode, (unsigned char *) instr)) + if (__get_user(opcode, (unsigned char __user *) instr)) break; instr_hi = opcode & 0xf0; @@ -173,7 +173,7 @@ static int __is_prefetch(struct pt_regs *regs, unsigned long addr) scan_more = 0; if (instr > limit) break; - if (__get_user(opcode, (unsigned char *) instr)) + if (__get_user(opcode, (unsigned char __user *) instr)) break; prefetch = (instr_lo == 0xF) && (opcode == 0x0D || opcode == 0x18); @@ -463,6 +463,9 @@ no_context: printk(KERN_ALERT "*pte = %08lx\n", page); } #endif + tsk->thread.cr2 = address; + tsk->thread.trap_no = 14; + tsk->thread.error_code = error_code; die("Oops", regs, error_code); bust_spinlocks(0); do_exit(SIGKILL); diff --git a/arch/i386/mm/highmem.c b/arch/i386/mm/highmem.c index 4b7aaf99d7e..b6eb4dcb877 100644 --- a/arch/i386/mm/highmem.c +++ b/arch/i386/mm/highmem.c @@ -75,6 +75,24 @@ void kunmap_atomic(void *kvaddr, enum km_type type) preempt_check_resched(); } +/* This is the same as kmap_atomic() but can map memory that doesn't + * have a struct page associated with it. + */ +void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) +{ + enum fixed_addresses idx; + unsigned long vaddr; + + inc_preempt_count(); + + idx = type + KM_TYPE_NR*smp_processor_id(); + vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot)); + __flush_tlb_one(vaddr); + + return (void*) vaddr; +} + struct page *kmap_atomic_to_page(void *ptr) { unsigned long idx, vaddr = (unsigned long)ptr; diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index 3672e2ef51a..12216b52e28 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -352,7 +352,7 @@ static void __init pagetable_init (void) #endif } -#if defined(CONFIG_PM_DISK) || defined(CONFIG_SOFTWARE_SUSPEND) +#ifdef CONFIG_SOFTWARE_SUSPEND /* * Swap suspend & friends need this for resume because things like the intel-agp * driver might have split up a kernel 4MB mapping. diff --git a/arch/i386/mm/ioremap.c b/arch/i386/mm/ioremap.c index d393eefc705..f379b8d6755 100644 --- a/arch/i386/mm/ioremap.c +++ b/arch/i386/mm/ioremap.c @@ -228,7 +228,8 @@ EXPORT_SYMBOL(ioremap_nocache); void iounmap(volatile void __iomem *addr) { struct vm_struct *p; - if ((void __force *) addr <= high_memory) + + if ((void __force *)addr <= high_memory) return; /* @@ -241,9 +242,10 @@ void iounmap(volatile void __iomem *addr) return; write_lock(&vmlist_lock); - p = __remove_vm_area((void *) (PAGE_MASK & (unsigned long __force) addr)); + p = __remove_vm_area((void *)(PAGE_MASK & (unsigned long __force)addr)); if (!p) { - printk("iounmap: bad address %p\n", addr); + printk(KERN_WARNING "iounmap: bad address %p\n", addr); + dump_stack(); goto out_unlock; } diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c index 270c59f099a..bd2f7afc7a2 100644 --- a/arch/i386/mm/pgtable.c +++ b/arch/i386/mm/pgtable.c @@ -32,9 +32,9 @@ void show_mem(void) unsigned long i; struct page_state ps; - printk("Mem-info:\n"); + printk(KERN_INFO "Mem-info:\n"); show_free_areas(); - printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); + printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) { for (i = 0; i < pgdat->node_spanned_pages; ++i) { page = pgdat_page_nr(pgdat, i); @@ -49,18 +49,18 @@ void show_mem(void) shared += page_count(page) - 1; } } - printk("%d pages of RAM\n", total); - printk("%d pages of HIGHMEM\n",highmem); - printk("%d reserved pages\n",reserved); - printk("%d pages shared\n",shared); - printk("%d pages swap cached\n",cached); + printk(KERN_INFO "%d pages of RAM\n", total); + printk(KERN_INFO "%d pages of HIGHMEM\n", highmem); + printk(KERN_INFO "%d reserved pages\n", reserved); + printk(KERN_INFO "%d pages shared\n", shared); + printk(KERN_INFO "%d pages swap cached\n", cached); get_page_state(&ps); - printk("%lu pages dirty\n", ps.nr_dirty); - printk("%lu pages writeback\n", ps.nr_writeback); - printk("%lu pages mapped\n", ps.nr_mapped); - printk("%lu pages slab\n", ps.nr_slab); - printk("%lu pages pagetables\n", ps.nr_page_table_pages); + printk(KERN_INFO "%lu pages dirty\n", ps.nr_dirty); + printk(KERN_INFO "%lu pages writeback\n", ps.nr_writeback); + printk(KERN_INFO "%lu pages mapped\n", ps.nr_mapped); + printk(KERN_INFO "%lu pages slab\n", ps.nr_slab); + printk(KERN_INFO "%lu pages pagetables\n", ps.nr_page_table_pages); } /* @@ -113,16 +113,16 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) pmd_t *pmd; if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */ - printk ("set_pmd_pfn: vaddr misaligned\n"); + printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n"); return; /* BUG(); */ } if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */ - printk ("set_pmd_pfn: pfn misaligned\n"); + printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n"); return; /* BUG(); */ } pgd = swapper_pg_dir + pgd_index(vaddr); if (pgd_none(*pgd)) { - printk ("set_pmd_pfn: pgd_none\n"); + printk(KERN_WARNING "set_pmd_pfn: pgd_none\n"); return; /* BUG(); */ } pud = pud_offset(pgd, vaddr); diff --git a/arch/i386/pci/common.c b/arch/i386/pci/common.c index 720975e1af5..70bcd53451f 100644 --- a/arch/i386/pci/common.c +++ b/arch/i386/pci/common.c @@ -25,7 +25,8 @@ unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 | int pci_routeirq; int pcibios_last_bus = -1; -struct pci_bus *pci_root_bus = NULL; +unsigned long pirq_table_addr; +struct pci_bus *pci_root_bus; struct pci_raw_ops *raw_pci_ops; static int pci_read(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *value) @@ -133,7 +134,7 @@ struct pci_bus * __devinit pcibios_scan_root(int busnum) printk("PCI: Probing PCI hardware (bus %02x)\n", busnum); - return pci_scan_bus(busnum, &pci_root_ops, NULL); + return pci_scan_bus_parented(NULL, busnum, &pci_root_ops, NULL); } extern u8 pci_cache_line_size; @@ -164,6 +165,7 @@ static int __init pcibios_init(void) if ((pci_probe & PCI_BIOS_SORT) && !(pci_probe & PCI_NO_SORT)) pcibios_sort(); #endif + pci_assign_unassigned_resources(); return 0; } @@ -188,6 +190,9 @@ char * __devinit pcibios_setup(char *str) } else if (!strcmp(str, "biosirq")) { pci_probe |= PCI_BIOS_IRQ_SCAN; return NULL; + } else if (!strncmp(str, "pirqaddr=", 9)) { + pirq_table_addr = simple_strtoul(str+9, NULL, 0); + return NULL; } #endif #ifdef CONFIG_PCI_DIRECT diff --git a/arch/i386/pci/i386.c b/arch/i386/pci/i386.c index c205ea7e233..93a364c8215 100644 --- a/arch/i386/pci/i386.c +++ b/arch/i386/pci/i386.c @@ -106,11 +106,16 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list) if ((dev = bus->self)) { for (idx = PCI_BRIDGE_RESOURCES; idx < PCI_NUM_RESOURCES; idx++) { r = &dev->resource[idx]; - if (!r->start) + if (!r->flags) continue; pr = pci_find_parent_resource(dev, r); - if (!pr || request_resource(pr, r) < 0) + if (!r->start || !pr || request_resource(pr, r) < 0) { printk(KERN_ERR "PCI: Cannot allocate resource region %d of bridge %s\n", idx, pci_name(dev)); + /* Something is wrong with the region. + Invalidate the resource to prevent child + resource allocations in this range. */ + r->flags = 0; + } } } pcibios_allocate_bus_resources(&bus->children); @@ -227,7 +232,7 @@ int pcibios_enable_resources(struct pci_dev *dev, int mask) pci_read_config_word(dev, PCI_COMMAND, &cmd); old_cmd = cmd; - for(idx=0; idx<6; idx++) { + for(idx = 0; idx < PCI_NUM_RESOURCES; idx++) { /* Only set up the requested stuff */ if (!(mask & (1<<idx))) continue; diff --git a/arch/i386/pci/irq.c b/arch/i386/pci/irq.c index 83458f81e66..766b104ac1a 100644 --- a/arch/i386/pci/irq.c +++ b/arch/i386/pci/irq.c @@ -58,6 +58,35 @@ struct irq_router_handler { int (*pcibios_enable_irq)(struct pci_dev *dev) = NULL; /* + * Check passed address for the PCI IRQ Routing Table signature + * and perform checksum verification. + */ + +static inline struct irq_routing_table * pirq_check_routing_table(u8 *addr) +{ + struct irq_routing_table *rt; + int i; + u8 sum; + + rt = (struct irq_routing_table *) addr; + if (rt->signature != PIRQ_SIGNATURE || + rt->version != PIRQ_VERSION || + rt->size % 16 || + rt->size < sizeof(struct irq_routing_table)) + return NULL; + sum = 0; + for (i=0; i < rt->size; i++) + sum += addr[i]; + if (!sum) { + DBG("PCI: Interrupt Routing Table found at 0x%p\n", rt); + return rt; + } + return NULL; +} + + + +/* * Search 0xf0000 -- 0xfffff for the PCI IRQ Routing Table. */ @@ -65,23 +94,17 @@ static struct irq_routing_table * __init pirq_find_routing_table(void) { u8 *addr; struct irq_routing_table *rt; - int i; - u8 sum; + if (pirq_table_addr) { + rt = pirq_check_routing_table((u8 *) __va(pirq_table_addr)); + if (rt) + return rt; + printk(KERN_WARNING "PCI: PIRQ table NOT found at pirqaddr\n"); + } for(addr = (u8 *) __va(0xf0000); addr < (u8 *) __va(0x100000); addr += 16) { - rt = (struct irq_routing_table *) addr; - if (rt->signature != PIRQ_SIGNATURE || - rt->version != PIRQ_VERSION || - rt->size % 16 || - rt->size < sizeof(struct irq_routing_table)) - continue; - sum = 0; - for(i=0; i<rt->size; i++) - sum += addr[i]; - if (!sum) { - DBG("PCI: Interrupt Routing Table found at 0x%p\n", rt); + rt = pirq_check_routing_table(addr); + if (rt) return rt; - } } return NULL; } @@ -1028,24 +1051,28 @@ static int __init pcibios_irq_init(void) subsys_initcall(pcibios_irq_init); -static void pirq_penalize_isa_irq(int irq) +static void pirq_penalize_isa_irq(int irq, int active) { /* * If any ISAPnP device reports an IRQ in its list of possible * IRQ's, we try to avoid assigning it to PCI devices. */ - if (irq < 16) - pirq_penalty[irq] += 100; + if (irq < 16) { + if (active) + pirq_penalty[irq] += 1000; + else + pirq_penalty[irq] += 100; + } } -void pcibios_penalize_isa_irq(int irq) +void pcibios_penalize_isa_irq(int irq, int active) { #ifdef CONFIG_ACPI_PCI if (!acpi_noirq) - acpi_penalize_isa_irq(irq); + acpi_penalize_isa_irq(irq, active); else #endif - pirq_penalize_isa_irq(irq); + pirq_penalize_isa_irq(irq, active); } static int pirq_enable_irq(struct pci_dev *dev) diff --git a/arch/i386/pci/legacy.c b/arch/i386/pci/legacy.c index 1492e375386..149a9588c25 100644 --- a/arch/i386/pci/legacy.c +++ b/arch/i386/pci/legacy.c @@ -45,6 +45,8 @@ static int __init pci_legacy_init(void) printk("PCI: Probing PCI hardware\n"); pci_root_bus = pcibios_scan_root(0); + if (pci_root_bus) + pci_bus_add_devices(pci_root_bus); pcibios_fixup_peer_bridges(); diff --git a/arch/i386/pci/mmconfig.c b/arch/i386/pci/mmconfig.c index 021a50aa51f..60f0e7a1162 100644 --- a/arch/i386/pci/mmconfig.c +++ b/arch/i386/pci/mmconfig.c @@ -11,11 +11,9 @@ #include <linux/pci.h> #include <linux/init.h> +#include <linux/acpi.h> #include "pci.h" -/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */ -u32 pci_mmcfg_base_addr; - #define mmcfg_virt_addr ((void __iomem *) fix_to_virt(FIX_PCIE_MCFG)) /* The base address of the last MMCONFIG device accessed */ @@ -24,10 +22,31 @@ static u32 mmcfg_last_accessed_device; /* * Functions for accessing PCI configuration space with MMCONFIG accesses */ +static u32 get_base_addr(unsigned int seg, int bus) +{ + int cfg_num = -1; + struct acpi_table_mcfg_config *cfg; + + while (1) { + ++cfg_num; + if (cfg_num >= pci_mmcfg_config_num) { + /* something bad is going on, no cfg table is found. */ + /* so we fall back to the old way we used to do this */ + /* and just rely on the first entry to be correct. */ + return pci_mmcfg_config[0].base_address; + } + cfg = &pci_mmcfg_config[cfg_num]; + if (cfg->pci_segment_group_number != seg) + continue; + if ((cfg->start_bus_number <= bus) && + (cfg->end_bus_number >= bus)) + return cfg->base_address; + } +} -static inline void pci_exp_set_dev_base(int bus, int devfn) +static inline void pci_exp_set_dev_base(unsigned int seg, int bus, int devfn) { - u32 dev_base = pci_mmcfg_base_addr | (bus << 20) | (devfn << 12); + u32 dev_base = get_base_addr(seg, bus) | (bus << 20) | (devfn << 12); if (dev_base != mmcfg_last_accessed_device) { mmcfg_last_accessed_device = dev_base; set_fixmap_nocache(FIX_PCIE_MCFG, dev_base); @@ -44,7 +63,7 @@ static int pci_mmcfg_read(unsigned int seg, unsigned int bus, spin_lock_irqsave(&pci_config_lock, flags); - pci_exp_set_dev_base(bus, devfn); + pci_exp_set_dev_base(seg, bus, devfn); switch (len) { case 1: @@ -73,7 +92,7 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus, spin_lock_irqsave(&pci_config_lock, flags); - pci_exp_set_dev_base(bus, devfn); + pci_exp_set_dev_base(seg, bus, devfn); switch (len) { case 1: @@ -101,7 +120,11 @@ static int __init pci_mmcfg_init(void) { if ((pci_probe & PCI_PROBE_MMCONF) == 0) goto out; - if (!pci_mmcfg_base_addr) + + acpi_table_parse(ACPI_MCFG, acpi_parse_mcfg); + if ((pci_mmcfg_config_num == 0) || + (pci_mmcfg_config == NULL) || + (pci_mmcfg_config[0].base_address == 0)) goto out; /* Kludge for now. Don't use mmconfig on AMD systems because diff --git a/arch/i386/pci/numa.c b/arch/i386/pci/numa.c index 9e369546189..adbe17a38f6 100644 --- a/arch/i386/pci/numa.c +++ b/arch/i386/pci/numa.c @@ -115,6 +115,8 @@ static int __init pci_numa_init(void) return 0; pci_root_bus = pcibios_scan_root(0); + if (pci_root_bus) + pci_bus_add_devices(pci_root_bus); if (num_online_nodes() > 1) for_each_online_node(quad) { if (quad == 0) diff --git a/arch/i386/pci/pci.h b/arch/i386/pci/pci.h index a8fc80ca69f..a80f0f55ff5 100644 --- a/arch/i386/pci/pci.h +++ b/arch/i386/pci/pci.h @@ -27,6 +27,7 @@ #define PCI_ASSIGN_ALL_BUSSES 0x4000 extern unsigned int pci_probe; +extern unsigned long pirq_table_addr; /* pci-i386.c */ diff --git a/arch/i386/pci/visws.c b/arch/i386/pci/visws.c index 6a924878443..314c933b6b8 100644 --- a/arch/i386/pci/visws.c +++ b/arch/i386/pci/visws.c @@ -21,7 +21,7 @@ static int pci_visws_enable_irq(struct pci_dev *dev) { return 0; } int (*pcibios_enable_irq)(struct pci_dev *dev) = &pci_visws_enable_irq; -void __init pcibios_penalize_isa_irq(int irq) {} +void __init pcibios_penalize_isa_irq(int irq, int active) {} unsigned int pci_bus0, pci_bus1; diff --git a/arch/i386/power/cpu.c b/arch/i386/power/cpu.c index 6f521cf19a1..c547c1af6fa 100644 --- a/arch/i386/power/cpu.c +++ b/arch/i386/power/cpu.c @@ -22,9 +22,11 @@ #include <linux/device.h> #include <linux/suspend.h> #include <linux/acpi.h> + #include <asm/uaccess.h> #include <asm/acpi.h> #include <asm/tlbflush.h> +#include <asm/processor.h> static struct saved_context saved_context; @@ -33,8 +35,6 @@ unsigned long saved_context_esp, saved_context_ebp; unsigned long saved_context_esi, saved_context_edi; unsigned long saved_context_eflags; -extern void enable_sep_cpu(void *); - void __save_processor_state(struct saved_context *ctxt) { kernel_fpu_begin(); @@ -44,7 +44,6 @@ void __save_processor_state(struct saved_context *ctxt) */ asm volatile ("sgdt %0" : "=m" (ctxt->gdt_limit)); asm volatile ("sidt %0" : "=m" (ctxt->idt_limit)); - asm volatile ("sldt %0" : "=m" (ctxt->ldt)); asm volatile ("str %0" : "=m" (ctxt->tr)); /* @@ -107,7 +106,6 @@ static void fix_processor_context(void) void __restore_processor_state(struct saved_context *ctxt) { - /* * control registers */ @@ -117,6 +115,13 @@ void __restore_processor_state(struct saved_context *ctxt) asm volatile ("movl %0, %%cr0" :: "r" (ctxt->cr0)); /* + * now restore the descriptor tables to their proper values + * ltr is done i fix_processor_context(). + */ + asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit)); + asm volatile ("lidt %0" :: "m" (ctxt->idt_limit)); + + /* * segment registers */ asm volatile ("movw %0, %%es" :: "r" (ctxt->es)); @@ -125,21 +130,14 @@ void __restore_processor_state(struct saved_context *ctxt) asm volatile ("movw %0, %%ss" :: "r" (ctxt->ss)); /* - * now restore the descriptor tables to their proper values - * ltr is done i fix_processor_context(). - */ - asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit)); - asm volatile ("lidt %0" :: "m" (ctxt->idt_limit)); - asm volatile ("lldt %0" :: "m" (ctxt->ldt)); - - /* * sysenter MSRs */ if (boot_cpu_has(X86_FEATURE_SEP)) - enable_sep_cpu(NULL); + enable_sep_cpu(); fix_processor_context(); do_fpu_end(); + mtrr_ap_init(); } void restore_processor_state(void) |