diff options
Diffstat (limited to 'arch/x86_64')
43 files changed, 714 insertions, 749 deletions
diff --git a/arch/x86_64/boot/Makefile b/arch/x86_64/boot/Makefile index f4399c701b7..18c6e915d69 100644 --- a/arch/x86_64/boot/Makefile +++ b/arch/x86_64/boot/Makefile @@ -46,7 +46,7 @@ cmd_image = $(obj)/tools/build $(BUILDFLAGS) $(obj)/bootsect $(obj)/setup \ $(obj)/bzImage: $(obj)/bootsect $(obj)/setup \ $(obj)/vmlinux.bin $(obj)/tools/build FORCE $(call if_changed,image) - @echo 'Kernel: $@ is ready' + @echo 'Kernel: $@ is ready' ' (#'`cat .version`')' $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE $(call if_changed,objcopy) diff --git a/arch/x86_64/boot/compressed/misc.c b/arch/x86_64/boot/compressed/misc.c index b38d5b8b5fb..0e10fd84c7c 100644 --- a/arch/x86_64/boot/compressed/misc.c +++ b/arch/x86_64/boot/compressed/misc.c @@ -83,7 +83,7 @@ static unsigned char *real_mode; /* Pointer to real-mode data */ #endif #define SCREEN_INFO (*(struct screen_info *)(real_mode+0)) -extern char input_data[]; +extern unsigned char input_data[]; extern int input_len; static long bytes_out = 0; @@ -288,7 +288,7 @@ void setup_normal_output_buffer(void) #else if ((ALT_MEM_K > EXT_MEM_K ? ALT_MEM_K : EXT_MEM_K) < 1024) error("Less than 2MB of memory"); #endif - output_data = (char *)__PHYSICAL_START; /* Normally Points to 1M */ + output_data = (unsigned char *)__PHYSICAL_START; /* Normally Points to 1M */ free_mem_end_ptr = (long)real_mode; } @@ -305,7 +305,7 @@ void setup_output_buffer_if_we_run_high(struct moveparams *mv) #else if ((ALT_MEM_K > EXT_MEM_K ? ALT_MEM_K : EXT_MEM_K) < (3*1024)) error("Less than 4MB of memory"); #endif - mv->low_buffer_start = output_data = (char *)LOW_BUFFER_START; + mv->low_buffer_start = output_data = (unsigned char *)LOW_BUFFER_START; low_buffer_end = ((unsigned int)real_mode > LOW_BUFFER_MAX ? LOW_BUFFER_MAX : (unsigned int)real_mode) & ~0xfff; low_buffer_size = low_buffer_end - LOW_BUFFER_START; diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig index bf57e2362bf..f8db7e500fb 100644 --- a/arch/x86_64/defconfig +++ b/arch/x86_64/defconfig @@ -1,11 +1,12 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.13-rc6-git3 -# Fri Aug 12 16:40:34 2005 +# Linux kernel version: 2.6.13-git11 +# Mon Sep 12 16:16:16 2005 # CONFIG_X86_64=y CONFIG_64BIT=y CONFIG_X86=y +CONFIG_SEMAPHORE_SLEEPERS=y CONFIG_MMU=y CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_CALIBRATE_DELAY=y @@ -13,6 +14,7 @@ CONFIG_X86_CMPXCHG=y CONFIG_EARLY_PRINTK=y CONFIG_GENERIC_ISA_DMA=y CONFIG_GENERIC_IOMAP=y +CONFIG_ARCH_MAY_HAVE_PC_FDC=y # # Code maturity level options @@ -26,6 +28,7 @@ CONFIG_INIT_ENV_ARG_LIMIT=32 # General setup # CONFIG_LOCALVERSION="" +CONFIG_LOCALVERSION_AUTO=y CONFIG_SWAP=y CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y @@ -37,6 +40,7 @@ CONFIG_KOBJECT_UEVENT=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y # CONFIG_CPUSETS is not set +CONFIG_INITRAMFS_SOURCE="" # CONFIG_EMBEDDED is not set CONFIG_KALLSYMS=y CONFIG_KALLSYMS_ALL=y @@ -102,6 +106,7 @@ CONFIG_DISCONTIGMEM_MANUAL=y CONFIG_DISCONTIGMEM=y CONFIG_FLAT_NODE_MEM_MAP=y CONFIG_NEED_MULTIPLE_NODES=y +# CONFIG_SPARSEMEM_STATIC is not set CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y CONFIG_HAVE_DEC_LOCK=y CONFIG_NR_CPUS=32 @@ -122,6 +127,7 @@ CONFIG_HZ=250 CONFIG_GENERIC_HARDIRQS=y CONFIG_GENERIC_IRQ_PROBE=y CONFIG_ISA_DMA_API=y +CONFIG_GENERIC_PENDING_IRQ=y # # Power management options @@ -194,7 +200,6 @@ CONFIG_UNORDERED_IO=y # CONFIG_PCIEPORTBUS is not set CONFIG_PCI_MSI=y # CONFIG_PCI_LEGACY_PROC is not set -# CONFIG_PCI_NAMES is not set # CONFIG_PCI_DEBUG is not set # @@ -234,7 +239,10 @@ CONFIG_INET=y CONFIG_IP_MULTICAST=y # CONFIG_IP_ADVANCED_ROUTER is not set CONFIG_IP_FIB_HASH=y -# CONFIG_IP_PNP is not set +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +# CONFIG_IP_PNP_BOOTP is not set +# CONFIG_IP_PNP_RARP is not set # CONFIG_NET_IPIP is not set # CONFIG_NET_IPGRE is not set # CONFIG_IP_MROUTE is not set @@ -244,8 +252,8 @@ CONFIG_IP_FIB_HASH=y # CONFIG_INET_ESP is not set # CONFIG_INET_IPCOMP is not set # CONFIG_INET_TUNNEL is not set -CONFIG_IP_TCPDIAG=y -CONFIG_IP_TCPDIAG_IPV6=y +CONFIG_INET_DIAG=y +CONFIG_INET_TCP_DIAG=y # CONFIG_TCP_CONG_ADVANCED is not set CONFIG_TCP_CONG_BIC=y CONFIG_IPV6=y @@ -258,6 +266,11 @@ CONFIG_IPV6=y # CONFIG_NETFILTER is not set # +# DCCP Configuration (EXPERIMENTAL) +# +# CONFIG_IP_DCCP is not set + +# # SCTP Configuration (EXPERIMENTAL) # # CONFIG_IP_SCTP is not set @@ -280,9 +293,11 @@ CONFIG_IPV6=y # Network testing # # CONFIG_NET_PKTGEN is not set +# CONFIG_NETFILTER_NETLINK is not set # CONFIG_HAMRADIO is not set # CONFIG_IRDA is not set # CONFIG_BT is not set +# CONFIG_IEEE80211 is not set # # Device Drivers @@ -329,7 +344,6 @@ CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_COUNT=16 CONFIG_BLK_DEV_RAM_SIZE=4096 CONFIG_BLK_DEV_INITRD=y -CONFIG_INITRAMFS_SOURCE="" CONFIG_LBD=y # CONFIG_CDROM_PKTCDVD is not set @@ -409,6 +423,7 @@ CONFIG_IDEDMA_AUTO=y # # SCSI device support # +# CONFIG_RAID_ATTRS is not set CONFIG_SCSI=y # CONFIG_SCSI_PROC_FS is not set @@ -432,7 +447,7 @@ CONFIG_BLK_DEV_SD=y # # SCSI Transport Attributes # -# CONFIG_SCSI_SPI_ATTRS is not set +CONFIG_SCSI_SPI_ATTRS=y # CONFIG_SCSI_FC_ATTRS is not set # CONFIG_SCSI_ISCSI_ATTRS is not set @@ -458,6 +473,7 @@ CONFIG_SCSI_SATA=y # CONFIG_SCSI_SATA_AHCI is not set # CONFIG_SCSI_SATA_SVW is not set CONFIG_SCSI_ATA_PIIX=y +# CONFIG_SCSI_SATA_MV is not set # CONFIG_SCSI_SATA_NV is not set # CONFIG_SCSI_SATA_PROMISE is not set # CONFIG_SCSI_SATA_QSTOR is not set @@ -537,6 +553,11 @@ CONFIG_TUN=y # CONFIG_ARCNET is not set # +# PHY device support +# +# CONFIG_PHYLIB is not set + +# # Ethernet (10 or 100Mbit) # CONFIG_NET_ETHERNET=y @@ -586,6 +607,7 @@ CONFIG_E1000=y # CONFIG_HAMACHI is not set # CONFIG_YELLOWFIN is not set # CONFIG_R8169 is not set +# CONFIG_SIS190 is not set # CONFIG_SKGE is not set # CONFIG_SK98LIN is not set # CONFIG_VIA_VELOCITY is not set @@ -595,6 +617,7 @@ CONFIG_TIGON3=y # # Ethernet (10000 Mbit) # +# CONFIG_CHELSIO_T1 is not set # CONFIG_IXGB is not set CONFIG_S2IO=m # CONFIG_S2IO_NAPI is not set @@ -749,7 +772,6 @@ CONFIG_MAX_RAW_DEVS=256 # I2C support # # CONFIG_I2C is not set -# CONFIG_I2C_SENSOR is not set # # Dallas's 1-wire bus @@ -760,6 +782,7 @@ CONFIG_MAX_RAW_DEVS=256 # Hardware Monitoring support # CONFIG_HWMON=y +# CONFIG_HWMON_VID is not set # CONFIG_HWMON_DEBUG_CHIP is not set # @@ -768,6 +791,10 @@ CONFIG_HWMON=y # CONFIG_IBM_ASM is not set # +# Multimedia Capabilities Port drivers +# + +# # Multimedia devices # # CONFIG_VIDEO_DEV is not set @@ -858,9 +885,8 @@ CONFIG_USB_UHCI_HCD=y # # USB Device Class drivers # -# CONFIG_USB_AUDIO is not set +# CONFIG_OBSOLETE_OSS_USB_DRIVER is not set # CONFIG_USB_BLUETOOTH_TTY is not set -# CONFIG_USB_MIDI is not set # CONFIG_USB_ACM is not set CONFIG_USB_PRINTER=y @@ -877,6 +903,7 @@ CONFIG_USB_STORAGE=y # CONFIG_USB_STORAGE_SDDR09 is not set # CONFIG_USB_STORAGE_SDDR55 is not set # CONFIG_USB_STORAGE_JUMPSHOT is not set +# CONFIG_USB_STORAGE_ONETOUCH is not set # # USB Input Devices @@ -893,6 +920,7 @@ CONFIG_USB_HIDINPUT=y # CONFIG_USB_MTOUCH is not set # CONFIG_USB_ITMTOUCH is not set # CONFIG_USB_EGALAX is not set +# CONFIG_USB_YEALINK is not set # CONFIG_USB_XPAD is not set # CONFIG_USB_ATI_REMOTE is not set # CONFIG_USB_KEYSPAN_REMOTE is not set @@ -976,6 +1004,8 @@ CONFIG_USB_MON=y # Firmware Drivers # # CONFIG_EDD is not set +# CONFIG_DELL_RBU is not set +CONFIG_DCDBAS=m # # File systems @@ -1000,10 +1030,6 @@ CONFIG_REISERFS_FS_POSIX_ACL=y # CONFIG_REISERFS_FS_SECURITY is not set # CONFIG_JFS_FS is not set CONFIG_FS_POSIX_ACL=y - -# -# XFS support -# # CONFIG_XFS_FS is not set # CONFIG_MINIX_FS is not set # CONFIG_ROMFS_FS is not set @@ -1012,6 +1038,7 @@ CONFIG_INOTIFY=y CONFIG_DNOTIFY=y CONFIG_AUTOFS_FS=y # CONFIG_AUTOFS4_FS is not set +# CONFIG_FUSE_FS is not set # # CD-ROM/DVD Filesystems @@ -1037,12 +1064,11 @@ CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1" CONFIG_PROC_FS=y CONFIG_PROC_KCORE=y CONFIG_SYSFS=y -# CONFIG_DEVPTS_FS_XATTR is not set CONFIG_TMPFS=y -# CONFIG_TMPFS_XATTR is not set CONFIG_HUGETLBFS=y CONFIG_HUGETLB_PAGE=y CONFIG_RAMFS=y +# CONFIG_RELAYFS_FS is not set # # Miscellaneous filesystems @@ -1074,6 +1100,7 @@ CONFIG_NFSD_V3=y # CONFIG_NFSD_V3_ACL is not set # CONFIG_NFSD_V4 is not set CONFIG_NFSD_TCP=y +CONFIG_ROOT_NFS=y CONFIG_LOCKD=y CONFIG_LOCKD_V4=y CONFIG_EXPORTFS=y @@ -1086,6 +1113,7 @@ CONFIG_SUNRPC=y # CONFIG_NCP_FS is not set # CONFIG_CODA_FS is not set # CONFIG_AFS_FS is not set +# CONFIG_9P_FS is not set # # Partition Types @@ -1150,6 +1178,7 @@ CONFIG_OPROFILE=y CONFIG_DEBUG_KERNEL=y CONFIG_MAGIC_SYSRQ=y CONFIG_LOG_BUF_SHIFT=18 +CONFIG_DETECT_SOFTLOCKUP=y # CONFIG_SCHEDSTATS is not set # CONFIG_DEBUG_SLAB is not set # CONFIG_DEBUG_SPINLOCK is not set @@ -1157,6 +1186,7 @@ CONFIG_LOG_BUF_SHIFT=18 # CONFIG_DEBUG_KOBJECT is not set # CONFIG_DEBUG_INFO is not set CONFIG_DEBUG_FS=y +# CONFIG_FRAME_POINTER is not set CONFIG_INIT_DEBUG=y # CONFIG_IOMMU_DEBUG is not set CONFIG_KPROBES=y @@ -1180,5 +1210,6 @@ CONFIG_KPROBES=y # Library routines # # CONFIG_CRC_CCITT is not set +# CONFIG_CRC16 is not set CONFIG_CRC32=y # CONFIG_LIBCRC32C is not set diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index 5244f803203..e0eb0c712fe 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -55,20 +55,34 @@ * with the int 0x80 path. */ ENTRY(ia32_sysenter_target) - CFI_STARTPROC + CFI_STARTPROC simple + CFI_DEF_CFA rsp,0 + CFI_REGISTER rsp,rbp swapgs movq %gs:pda_kernelstack, %rsp addq $(PDA_STACKOFFSET),%rsp sti movl %ebp,%ebp /* zero extension */ pushq $__USER32_DS + CFI_ADJUST_CFA_OFFSET 8 + /*CFI_REL_OFFSET ss,0*/ pushq %rbp + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rsp,0 pushfq + CFI_ADJUST_CFA_OFFSET 8 + /*CFI_REL_OFFSET rflags,0*/ movl $VSYSCALL32_SYSEXIT, %r10d + CFI_REGISTER rip,r10 pushq $__USER32_CS + CFI_ADJUST_CFA_OFFSET 8 + /*CFI_REL_OFFSET cs,0*/ movl %eax, %eax pushq %r10 + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rip,0 pushq %rax + CFI_ADJUST_CFA_OFFSET 8 cld SAVE_ARGS 0,0,1 /* no need to do an access_ok check here because rbp has been @@ -79,6 +93,7 @@ ENTRY(ia32_sysenter_target) .previous GET_THREAD_INFO(%r10) testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) + CFI_REMEMBER_STATE jnz sysenter_tracesys sysenter_do_call: cmpl $(IA32_NR_syscalls),%eax @@ -94,14 +109,20 @@ sysenter_do_call: andl $~0x200,EFLAGS-R11(%rsp) RESTORE_ARGS 1,24,1,1,1,1 popfq + CFI_ADJUST_CFA_OFFSET -8 + /*CFI_RESTORE rflags*/ popq %rcx /* User %esp */ + CFI_ADJUST_CFA_OFFSET -8 + CFI_REGISTER rsp,rcx movl $VSYSCALL32_SYSEXIT,%edx /* User %eip */ + CFI_REGISTER rip,rdx swapgs sti /* sti only takes effect after the next instruction */ /* sysexit */ .byte 0xf, 0x35 sysenter_tracesys: + CFI_RESTORE_STATE SAVE_REST CLEAR_RREGS movq $-ENOSYS,RAX(%rsp) /* really needed? */ @@ -140,21 +161,28 @@ sysenter_tracesys: * with the int 0x80 path. */ ENTRY(ia32_cstar_target) - CFI_STARTPROC + CFI_STARTPROC simple + CFI_DEF_CFA rsp,0 + CFI_REGISTER rip,rcx + /*CFI_REGISTER rflags,r11*/ swapgs movl %esp,%r8d + CFI_REGISTER rsp,r8 movq %gs:pda_kernelstack,%rsp sti SAVE_ARGS 8,1,1 movl %eax,%eax /* zero extension */ movq %rax,ORIG_RAX-ARGOFFSET(%rsp) movq %rcx,RIP-ARGOFFSET(%rsp) + CFI_REL_OFFSET rip,RIP-ARGOFFSET movq %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */ movl %ebp,%ecx movq $__USER32_CS,CS-ARGOFFSET(%rsp) movq $__USER32_DS,SS-ARGOFFSET(%rsp) movq %r11,EFLAGS-ARGOFFSET(%rsp) + /*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/ movq %r8,RSP-ARGOFFSET(%rsp) + CFI_REL_OFFSET rsp,RSP-ARGOFFSET /* no need to do an access_ok check here because r8 has been 32bit zero extended */ /* hardware stack frame is complete now */ @@ -164,6 +192,7 @@ ENTRY(ia32_cstar_target) .previous GET_THREAD_INFO(%r10) testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) + CFI_REMEMBER_STATE jnz cstar_tracesys cstar_do_call: cmpl $IA32_NR_syscalls,%eax @@ -177,12 +206,16 @@ cstar_do_call: jnz int_ret_from_sys_call RESTORE_ARGS 1,-ARG_SKIP,1,1,1 movl RIP-ARGOFFSET(%rsp),%ecx + CFI_REGISTER rip,rcx movl EFLAGS-ARGOFFSET(%rsp),%r11d + /*CFI_REGISTER rflags,r11*/ movl RSP-ARGOFFSET(%rsp),%esp + CFI_RESTORE rsp swapgs sysretl cstar_tracesys: + CFI_RESTORE_STATE SAVE_REST CLEAR_RREGS movq $-ENOSYS,RAX(%rsp) /* really needed? */ @@ -226,11 +259,18 @@ ia32_badarg: */ ENTRY(ia32_syscall) - CFI_STARTPROC + CFI_STARTPROC simple + CFI_DEF_CFA rsp,SS+8-RIP + /*CFI_REL_OFFSET ss,SS-RIP*/ + CFI_REL_OFFSET rsp,RSP-RIP + /*CFI_REL_OFFSET rflags,EFLAGS-RIP*/ + /*CFI_REL_OFFSET cs,CS-RIP*/ + CFI_REL_OFFSET rip,RIP-RIP swapgs sti movl %eax,%eax pushq %rax + CFI_ADJUST_CFA_OFFSET 8 cld /* note the registers are not zero extended to the sf. this could be a problem. */ @@ -278,6 +318,8 @@ quiet_ni_syscall: jmp ia32_ptregs_common .endm + CFI_STARTPROC + PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx @@ -290,8 +332,9 @@ quiet_ni_syscall: PTREGSCALL stub32_rt_sigsuspend, sys_rt_sigsuspend, %rdx ENTRY(ia32_ptregs_common) - CFI_STARTPROC popq %r11 + CFI_ADJUST_CFA_OFFSET -8 + CFI_REGISTER rip, r11 SAVE_REST call *%rax RESTORE_REST diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c index 04d80406ce4..5389df610e7 100644 --- a/arch/x86_64/ia32/sys_ia32.c +++ b/arch/x86_64/ia32/sys_ia32.c @@ -751,7 +751,7 @@ sys32_sendfile(int out_fd, int in_fd, compat_off_t __user *offset, s32 count) ret = sys_sendfile(out_fd, in_fd, offset ? &of : NULL, count); set_fs(old_fs); - if (!ret && offset && put_user(of, offset)) + if (offset && put_user(of, offset)) return -EFAULT; return ret; diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index 1579bdd0adc..bcdd0a805fe 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -46,3 +46,4 @@ microcode-$(subst m,y,$(CONFIG_MICROCODE)) += ../../i386/kernel/microcode.o intel_cacheinfo-y += ../../i386/kernel/cpu/intel_cacheinfo.o quirks-y += ../../i386/kernel/quirks.o i8237-y += ../../i386/kernel/i8237.o +msr-$(subst m,y,$(CONFIG_X86_MSR)) += ../../i386/kernel/msr.o diff --git a/arch/x86_64/kernel/acpi/sleep.c b/arch/x86_64/kernel/acpi/sleep.c index 148f6f7ea31..867a0ebee17 100644 --- a/arch/x86_64/kernel/acpi/sleep.c +++ b/arch/x86_64/kernel/acpi/sleep.c @@ -34,7 +34,6 @@ #include <linux/slab.h> #include <linux/pci.h> #include <linux/bootmem.h> -#include <linux/irq.h> #include <linux/acpi.h> #include <asm/mpspec.h> #include <asm/io.h> diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c index c9a6b812e92..962ad4823b6 100644 --- a/arch/x86_64/kernel/aperture.c +++ b/arch/x86_64/kernel/aperture.c @@ -245,6 +245,8 @@ void __init iommu_hole_init(void) if (aper_alloc) { /* Got the aperture from the AGP bridge */ + } else if (swiotlb && !valid_agp) { + /* Do nothing */ } else if ((!no_iommu && end_pfn >= 0xffffffff>>PAGE_SHIFT) || force_iommu || valid_agp || diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 375d369570c..b6e7715d877 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -18,7 +18,6 @@ #include <linux/init.h> #include <linux/mm.h> -#include <linux/irq.h> #include <linux/delay.h> #include <linux/bootmem.h> #include <linux/smp_lock.h> @@ -109,11 +108,8 @@ void clear_local_APIC(void) if (maxlvt >= 4) apic_write_around(APIC_LVTPC, APIC_LVT_MASKED); v = GET_APIC_VERSION(apic_read(APIC_LVR)); - if (APIC_INTEGRATED(v)) { /* !82489DX */ - if (maxlvt > 3) /* Due to Pentium errata 3AP and 11AP. */ - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - } + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); } void __init connect_bsp_APIC(void) @@ -316,8 +312,6 @@ void __init init_bsp_APIC(void) */ apic_write_around(APIC_LVT0, APIC_DM_EXTINT); value = APIC_DM_NMI; - if (!APIC_INTEGRATED(ver)) /* 82489DX */ - value |= APIC_LVT_LEVEL_TRIGGER; apic_write_around(APIC_LVT1, value); } @@ -325,14 +319,6 @@ void __cpuinit setup_local_APIC (void) { unsigned int value, ver, maxlvt; - /* Pound the ESR really hard over the head with a big hammer - mbligh */ - if (esr_disable) { - apic_write(APIC_ESR, 0); - apic_write(APIC_ESR, 0); - apic_write(APIC_ESR, 0); - apic_write(APIC_ESR, 0); - } - value = apic_read(APIC_LVR); ver = GET_APIC_VERSION(value); @@ -430,15 +416,11 @@ void __cpuinit setup_local_APIC (void) value = APIC_DM_NMI; else value = APIC_DM_NMI | APIC_LVT_MASKED; - if (!APIC_INTEGRATED(ver)) /* 82489DX */ - value |= APIC_LVT_LEVEL_TRIGGER; apic_write_around(APIC_LVT1, value); - if (APIC_INTEGRATED(ver) && !esr_disable) { /* !82489DX */ + { unsigned oldvalue; maxlvt = get_maxlvt(); - if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ - apic_write(APIC_ESR, 0); oldvalue = apic_read(APIC_ESR); value = ERROR_APIC_VECTOR; // enables sending errors apic_write_around(APIC_LVTERR, value); @@ -452,17 +434,6 @@ void __cpuinit setup_local_APIC (void) apic_printk(APIC_VERBOSE, "ESR value after enabling vector: %08x, after %08x\n", oldvalue, value); - } else { - if (esr_disable) - /* - * Something untraceble is creating bad interrupts on - * secondary quads ... for the moment, just leave the - * ESR disabled - we can't do anything useful with the - * errors anyway - mbligh - */ - apic_printk(APIC_DEBUG, "Leaving ESR disabled.\n"); - else - apic_printk(APIC_DEBUG, "No ESR for 82489DX.\n"); } nmi_watchdog_default(); @@ -650,8 +621,7 @@ void __init init_apic_mappings(void) * Fetch the APIC ID of the BSP in case we have a * default configuration (or the MP table is broken). */ - if (boot_cpu_id == -1U) - boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); + boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); #ifdef CONFIG_X86_IO_APIC { @@ -693,8 +663,6 @@ static void __setup_APIC_LVTT(unsigned int clocks) ver = GET_APIC_VERSION(apic_read(APIC_LVR)); lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR; - if (!APIC_INTEGRATED(ver)) - lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); apic_write_around(APIC_LVTT, lvtt_value); /* @@ -1081,7 +1049,7 @@ int __init APIC_init_uniprocessor (void) connect_bsp_APIC(); - phys_cpu_present_map = physid_mask_of_physid(0); + phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id); apic_write_around(APIC_ID, boot_cpu_id); setup_local_APIC(); diff --git a/arch/x86_64/kernel/asm-offsets.c b/arch/x86_64/kernel/asm-offsets.c index 35b4c3fcbb3..aaa6d383351 100644 --- a/arch/x86_64/kernel/asm-offsets.c +++ b/arch/x86_64/kernel/asm-offsets.c @@ -39,7 +39,6 @@ int main(void) ENTRY(kernelstack); ENTRY(oldrsp); ENTRY(pcurrent); - ENTRY(irqrsp); ENTRY(irqcount); ENTRY(cpunumber); ENTRY(irqstackptr); diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c index d7fa4248501..535e0446607 100644 --- a/arch/x86_64/kernel/crash.c +++ b/arch/x86_64/kernel/crash.c @@ -11,7 +11,6 @@ #include <linux/types.h> #include <linux/kernel.h> #include <linux/smp.h> -#include <linux/irq.h> #include <linux/reboot.h> #include <linux/kexec.h> diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index bb0ae18ec02..eb7929eea7b 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -131,7 +131,7 @@ void __init e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned lon if (ei->type != E820_RAM || ei->addr+ei->size <= start || - ei->addr > end) + ei->addr >= end) continue; addr = round_up(ei->addr, PAGE_SIZE); diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c index 9631c747c5e..9cd968dd0f5 100644 --- a/arch/x86_64/kernel/early_printk.c +++ b/arch/x86_64/kernel/early_printk.c @@ -5,6 +5,7 @@ #include <linux/tty.h> #include <asm/io.h> #include <asm/processor.h> +#include <asm/fcntl.h> /* Simple VGA output */ @@ -158,6 +159,47 @@ static struct console early_serial_console = { .index = -1, }; +/* Console interface to a host file on AMD's SimNow! */ + +static int simnow_fd; + +enum { + MAGIC1 = 0xBACCD00A, + MAGIC2 = 0xCA110000, + XOPEN = 5, + XWRITE = 4, +}; + +static noinline long simnow(long cmd, long a, long b, long c) +{ + long ret; + asm volatile("cpuid" : + "=a" (ret) : + "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2)); + return ret; +} + +void __init simnow_init(char *str) +{ + char *fn = "klog"; + if (*str == '=') + fn = ++str; + /* error ignored */ + simnow_fd = simnow(XOPEN, (unsigned long)fn, O_WRONLY|O_APPEND|O_CREAT, 0644); +} + +static void simnow_write(struct console *con, const char *s, unsigned n) +{ + simnow(XWRITE, simnow_fd, (unsigned long)s, n); +} + +static struct console simnow_console = { + .name = "simnow", + .write = simnow_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + /* Direct interface for emergencies */ struct console *early_console = &early_vga_console; static int early_console_initialized = 0; @@ -205,6 +247,10 @@ int __init setup_early_printk(char *opt) max_xpos = SCREEN_INFO.orig_video_cols; max_ypos = SCREEN_INFO.orig_video_lines; early_console = &early_vga_console; + } else if (!strncmp(buf, "simnow", 6)) { + simnow_init(buf + 6); + early_console = &simnow_console; + keep_early = 1; } early_console_initialized = 1; register_console(early_console); diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index 3620508c8bd..7937971d185 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -79,16 +79,19 @@ xorl %eax, %eax pushq %rax /* ss */ CFI_ADJUST_CFA_OFFSET 8 + /*CFI_REL_OFFSET ss,0*/ pushq %rax /* rsp */ CFI_ADJUST_CFA_OFFSET 8 - CFI_OFFSET rip,0 + CFI_REL_OFFSET rsp,0 pushq $(1<<9) /* eflags - interrupts on */ CFI_ADJUST_CFA_OFFSET 8 + /*CFI_REL_OFFSET rflags,0*/ pushq $__KERNEL_CS /* cs */ CFI_ADJUST_CFA_OFFSET 8 + /*CFI_REL_OFFSET cs,0*/ pushq \child_rip /* rip */ CFI_ADJUST_CFA_OFFSET 8 - CFI_OFFSET rip,0 + CFI_REL_OFFSET rip,0 pushq %rax /* orig rax */ CFI_ADJUST_CFA_OFFSET 8 .endm @@ -98,32 +101,39 @@ CFI_ADJUST_CFA_OFFSET -(6*8) .endm - .macro CFI_DEFAULT_STACK - CFI_ADJUST_CFA_OFFSET (SS) - CFI_OFFSET r15,R15-SS - CFI_OFFSET r14,R14-SS - CFI_OFFSET r13,R13-SS - CFI_OFFSET r12,R12-SS - CFI_OFFSET rbp,RBP-SS - CFI_OFFSET rbx,RBX-SS - CFI_OFFSET r11,R11-SS - CFI_OFFSET r10,R10-SS - CFI_OFFSET r9,R9-SS - CFI_OFFSET r8,R8-SS - CFI_OFFSET rax,RAX-SS - CFI_OFFSET rcx,RCX-SS - CFI_OFFSET rdx,RDX-SS - CFI_OFFSET rsi,RSI-SS - CFI_OFFSET rdi,RDI-SS - CFI_OFFSET rsp,RSP-SS - CFI_OFFSET rip,RIP-SS + .macro CFI_DEFAULT_STACK start=1 + .if \start + CFI_STARTPROC simple + CFI_DEF_CFA rsp,SS+8 + .else + CFI_DEF_CFA_OFFSET SS+8 + .endif + CFI_REL_OFFSET r15,R15 + CFI_REL_OFFSET r14,R14 + CFI_REL_OFFSET r13,R13 + CFI_REL_OFFSET r12,R12 + CFI_REL_OFFSET rbp,RBP + CFI_REL_OFFSET rbx,RBX + CFI_REL_OFFSET r11,R11 + CFI_REL_OFFSET r10,R10 + CFI_REL_OFFSET r9,R9 + CFI_REL_OFFSET r8,R8 + CFI_REL_OFFSET rax,RAX + CFI_REL_OFFSET rcx,RCX + CFI_REL_OFFSET rdx,RDX + CFI_REL_OFFSET rsi,RSI + CFI_REL_OFFSET rdi,RDI + CFI_REL_OFFSET rip,RIP + /*CFI_REL_OFFSET cs,CS*/ + /*CFI_REL_OFFSET rflags,EFLAGS*/ + CFI_REL_OFFSET rsp,RSP + /*CFI_REL_OFFSET ss,SS*/ .endm /* * A newly forked process directly context switches into this. */ /* rdi: prev */ ENTRY(ret_from_fork) - CFI_STARTPROC CFI_DEFAULT_STACK call schedule_tail GET_THREAD_INFO(%rcx) @@ -172,16 +182,21 @@ rff_trace: */ ENTRY(system_call) - CFI_STARTPROC + CFI_STARTPROC simple + CFI_DEF_CFA rsp,0 + CFI_REGISTER rip,rcx + /*CFI_REGISTER rflags,r11*/ swapgs movq %rsp,%gs:pda_oldrsp movq %gs:pda_kernelstack,%rsp sti SAVE_ARGS 8,1 movq %rax,ORIG_RAX-ARGOFFSET(%rsp) - movq %rcx,RIP-ARGOFFSET(%rsp) + movq %rcx,RIP-ARGOFFSET(%rsp) + CFI_REL_OFFSET rip,RIP-ARGOFFSET GET_THREAD_INFO(%rcx) testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx) + CFI_REMEMBER_STATE jnz tracesys cmpq $__NR_syscall_max,%rax ja badsys @@ -201,9 +216,12 @@ sysret_check: cli movl threadinfo_flags(%rcx),%edx andl %edi,%edx + CFI_REMEMBER_STATE jnz sysret_careful movq RIP-ARGOFFSET(%rsp),%rcx + CFI_REGISTER rip,rcx RESTORE_ARGS 0,-ARG_SKIP,1 + /*CFI_REGISTER rflags,r11*/ movq %gs:pda_oldrsp,%rsp swapgs sysretq @@ -211,12 +229,15 @@ sysret_check: /* Handle reschedules */ /* edx: work, edi: workmask */ sysret_careful: + CFI_RESTORE_STATE bt $TIF_NEED_RESCHED,%edx jnc sysret_signal sti pushq %rdi + CFI_ADJUST_CFA_OFFSET 8 call schedule popq %rdi + CFI_ADJUST_CFA_OFFSET -8 jmp sysret_check /* Handle a signal */ @@ -234,8 +255,13 @@ sysret_signal: 1: movl $_TIF_NEED_RESCHED,%edi jmp sysret_check +badsys: + movq $-ENOSYS,RAX-ARGOFFSET(%rsp) + jmp ret_from_sys_call + /* Do syscall tracing */ tracesys: + CFI_RESTORE_STATE SAVE_REST movq $-ENOSYS,RAX(%rsp) FIXUP_TOP_OF_STACK %rdi @@ -254,16 +280,29 @@ tracesys: RESTORE_TOP_OF_STACK %rbx RESTORE_REST jmp ret_from_sys_call + CFI_ENDPROC -badsys: - movq $-ENOSYS,RAX-ARGOFFSET(%rsp) - jmp ret_from_sys_call - /* * Syscall return path ending with IRET. * Has correct top of stack, but partial stack frame. */ -ENTRY(int_ret_from_sys_call) +ENTRY(int_ret_from_sys_call) + CFI_STARTPROC simple + CFI_DEF_CFA rsp,SS+8-ARGOFFSET + /*CFI_REL_OFFSET ss,SS-ARGOFFSET*/ + CFI_REL_OFFSET rsp,RSP-ARGOFFSET + /*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/ + /*CFI_REL_OFFSET cs,CS-ARGOFFSET*/ + CFI_REL_OFFSET rip,RIP-ARGOFFSET + CFI_REL_OFFSET rdx,RDX-ARGOFFSET + CFI_REL_OFFSET rcx,RCX-ARGOFFSET + CFI_REL_OFFSET rax,RAX-ARGOFFSET + CFI_REL_OFFSET rdi,RDI-ARGOFFSET + CFI_REL_OFFSET rsi,RSI-ARGOFFSET + CFI_REL_OFFSET r8,R8-ARGOFFSET + CFI_REL_OFFSET r9,R9-ARGOFFSET + CFI_REL_OFFSET r10,R10-ARGOFFSET + CFI_REL_OFFSET r11,R11-ARGOFFSET cli testl $3,CS-ARGOFFSET(%rsp) je retint_restore_args @@ -284,8 +323,10 @@ int_careful: jnc int_very_careful sti pushq %rdi + CFI_ADJUST_CFA_OFFSET 8 call schedule popq %rdi + CFI_ADJUST_CFA_OFFSET -8 cli jmp int_with_check @@ -297,9 +338,11 @@ int_very_careful: testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx jz int_signal pushq %rdi + CFI_ADJUST_CFA_OFFSET 8 leaq 8(%rsp),%rdi # &ptregs -> arg1 call syscall_trace_leave popq %rdi + CFI_ADJUST_CFA_OFFSET -8 andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi cli jmp int_restore_rest @@ -329,6 +372,8 @@ int_restore_rest: jmp ptregscall_common .endm + CFI_STARTPROC + PTREGSCALL stub_clone, sys_clone, %r8 PTREGSCALL stub_fork, sys_fork, %rdi PTREGSCALL stub_vfork, sys_vfork, %rdi @@ -337,40 +382,49 @@ int_restore_rest: PTREGSCALL stub_iopl, sys_iopl, %rsi ENTRY(ptregscall_common) - CFI_STARTPROC popq %r11 - CFI_ADJUST_CFA_OFFSET -8 + CFI_ADJUST_CFA_OFFSET -8 + CFI_REGISTER rip, r11 SAVE_REST movq %r11, %r15 + CFI_REGISTER rip, r15 FIXUP_TOP_OF_STACK %r11 call *%rax RESTORE_TOP_OF_STACK %r11 movq %r15, %r11 + CFI_REGISTER rip, r11 RESTORE_REST pushq %r11 - CFI_ADJUST_CFA_OFFSET 8 + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rip, 0 ret CFI_ENDPROC ENTRY(stub_execve) CFI_STARTPROC popq %r11 - CFI_ADJUST_CFA_OFFSET -8 + CFI_ADJUST_CFA_OFFSET -8 + CFI_REGISTER rip, r11 SAVE_REST movq %r11, %r15 + CFI_REGISTER rip, r15 FIXUP_TOP_OF_STACK %r11 call sys_execve GET_THREAD_INFO(%rcx) bt $TIF_IA32,threadinfo_flags(%rcx) + CFI_REMEMBER_STATE jc exec_32bit RESTORE_TOP_OF_STACK %r11 movq %r15, %r11 + CFI_REGISTER rip, r11 RESTORE_REST - push %r11 + pushq %r11 + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rip, 0 ret exec_32bit: - CFI_ADJUST_CFA_OFFSET REST_SKIP + CFI_RESTORE_STATE movq %rax,RAX(%rsp) RESTORE_REST jmp int_ret_from_sys_call @@ -382,7 +436,8 @@ exec_32bit: */ ENTRY(stub_rt_sigreturn) CFI_STARTPROC - addq $8, %rsp + addq $8, %rsp + CFI_ADJUST_CFA_OFFSET -8 SAVE_REST movq %rsp,%rdi FIXUP_TOP_OF_STACK %r11 @@ -392,6 +447,25 @@ ENTRY(stub_rt_sigreturn) jmp int_ret_from_sys_call CFI_ENDPROC +/* + * initial frame state for interrupts and exceptions + */ + .macro _frame ref + CFI_STARTPROC simple + CFI_DEF_CFA rsp,SS+8-\ref + /*CFI_REL_OFFSET ss,SS-\ref*/ + CFI_REL_OFFSET rsp,RSP-\ref + /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/ + /*CFI_REL_OFFSET cs,CS-\ref*/ + CFI_REL_OFFSET rip,RIP-\ref + .endm + +/* initial frame state for interrupts (and exceptions without error code) */ +#define INTR_FRAME _frame RIP +/* initial frame state for exceptions with error code (and interrupts with + vector already pushed) */ +#define XCPT_FRAME _frame ORIG_RAX + /* * Interrupt entry/exit. * @@ -402,10 +476,6 @@ ENTRY(stub_rt_sigreturn) /* 0(%rsp): interrupt number */ .macro interrupt func - CFI_STARTPROC simple - CFI_DEF_CFA rsp,(SS-RDI) - CFI_REL_OFFSET rsp,(RSP-ORIG_RAX) - CFI_REL_OFFSET rip,(RIP-ORIG_RAX) cld #ifdef CONFIG_DEBUG_INFO SAVE_ALL @@ -425,23 +495,27 @@ ENTRY(stub_rt_sigreturn) swapgs 1: incl %gs:pda_irqcount # RED-PEN should check preempt count movq %gs:pda_irqstackptr,%rax - cmoveq %rax,%rsp + cmoveq %rax,%rsp /*todo This needs CFI annotation! */ pushq %rdi # save old stack + CFI_ADJUST_CFA_OFFSET 8 call \func .endm ENTRY(common_interrupt) + XCPT_FRAME interrupt do_IRQ /* 0(%rsp): oldrsp-ARGOFFSET */ -ret_from_intr: +ret_from_intr: popq %rdi + CFI_ADJUST_CFA_OFFSET -8 cli decl %gs:pda_irqcount #ifdef CONFIG_DEBUG_INFO movq RBP(%rdi),%rbp + CFI_DEF_CFA_REGISTER rsp #endif - leaq ARGOFFSET(%rdi),%rsp -exit_intr: + leaq ARGOFFSET(%rdi),%rsp /*todo This needs CFI annotation! */ +exit_intr: GET_THREAD_INFO(%rcx) testl $3,CS-ARGOFFSET(%rsp) je retint_kernel @@ -453,9 +527,10 @@ exit_intr: */ retint_with_reschedule: movl $_TIF_WORK_MASK,%edi -retint_check: +retint_check: movl threadinfo_flags(%rcx),%edx andl %edi,%edx + CFI_REMEMBER_STATE jnz retint_careful retint_swapgs: swapgs @@ -476,14 +551,17 @@ bad_iret: jmp do_exit .previous - /* edi: workmask, edx: work */ + /* edi: workmask, edx: work */ retint_careful: + CFI_RESTORE_STATE bt $TIF_NEED_RESCHED,%edx jnc retint_signal sti pushq %rdi + CFI_ADJUST_CFA_OFFSET 8 call schedule popq %rdi + CFI_ADJUST_CFA_OFFSET -8 GET_THREAD_INFO(%rcx) cli jmp retint_check @@ -523,7 +601,9 @@ retint_kernel: * APIC interrupts. */ .macro apicinterrupt num,func + INTR_FRAME pushq $\num-256 + CFI_ADJUST_CFA_OFFSET 8 interrupt \func jmp ret_from_intr CFI_ENDPROC @@ -536,8 +616,19 @@ ENTRY(thermal_interrupt) ENTRY(reschedule_interrupt) apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt -ENTRY(invalidate_interrupt) - apicinterrupt INVALIDATE_TLB_VECTOR,smp_invalidate_interrupt + .macro INVALIDATE_ENTRY num +ENTRY(invalidate_interrupt\num) + apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt + .endm + + INVALIDATE_ENTRY 0 + INVALIDATE_ENTRY 1 + INVALIDATE_ENTRY 2 + INVALIDATE_ENTRY 3 + INVALIDATE_ENTRY 4 + INVALIDATE_ENTRY 5 + INVALIDATE_ENTRY 6 + INVALIDATE_ENTRY 7 ENTRY(call_function_interrupt) apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt @@ -558,16 +649,23 @@ ENTRY(spurious_interrupt) * Exception entry points. */ .macro zeroentry sym + INTR_FRAME pushq $0 /* push error code/oldrax */ + CFI_ADJUST_CFA_OFFSET 8 pushq %rax /* push real oldrax to the rdi slot */ + CFI_ADJUST_CFA_OFFSET 8 leaq \sym(%rip),%rax jmp error_entry + CFI_ENDPROC .endm .macro errorentry sym + XCPT_FRAME pushq %rax + CFI_ADJUST_CFA_OFFSET 8 leaq \sym(%rip),%rax jmp error_entry + CFI_ENDPROC .endm /* error code is on the stack already */ @@ -594,10 +692,7 @@ ENTRY(spurious_interrupt) * and the exception handler in %rax. */ ENTRY(error_entry) - CFI_STARTPROC simple - CFI_DEF_CFA rsp,(SS-RDI) - CFI_REL_OFFSET rsp,(RSP-RDI) - CFI_REL_OFFSET rip,(RIP-RDI) + _frame RDI /* rdi slot contains rax, oldrax contains error code */ cld subq $14*8,%rsp @@ -679,7 +774,9 @@ error_kernelspace: /* Reload gs selector with exception handling */ /* edi: new selector */ ENTRY(load_gs_index) + CFI_STARTPROC pushf + CFI_ADJUST_CFA_OFFSET 8 cli swapgs gs_change: @@ -687,7 +784,9 @@ gs_change: 2: mfence /* workaround */ swapgs popf + CFI_ADJUST_CFA_OFFSET -8 ret + CFI_ENDPROC .section __ex_table,"a" .align 8 @@ -799,7 +898,7 @@ ENTRY(device_not_available) /* runs on exception stack */ KPROBE_ENTRY(debug) - CFI_STARTPROC + INTR_FRAME pushq $0 CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_debug @@ -809,9 +908,9 @@ KPROBE_ENTRY(debug) /* runs on exception stack */ ENTRY(nmi) - CFI_STARTPROC + INTR_FRAME pushq $-1 - CFI_ADJUST_CFA_OFFSET 8 + CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_nmi /* * "Paranoid" exit path from exception stack. @@ -877,7 +976,7 @@ ENTRY(reserved) /* runs on exception stack */ ENTRY(double_fault) - CFI_STARTPROC + XCPT_FRAME paranoidentry do_double_fault jmp paranoid_exit CFI_ENDPROC @@ -890,7 +989,7 @@ ENTRY(segment_not_present) /* runs on exception stack */ ENTRY(stack_segment) - CFI_STARTPROC + XCPT_FRAME paranoidentry do_stack_segment jmp paranoid_exit CFI_ENDPROC @@ -911,7 +1010,7 @@ ENTRY(spurious_interrupt_bug) #ifdef CONFIG_X86_MCE /* runs on exception stack */ ENTRY(machine_check) - CFI_STARTPROC + INTR_FRAME pushq $0 CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_machine_check @@ -923,14 +1022,19 @@ ENTRY(call_debug) zeroentry do_call_debug ENTRY(call_softirq) + CFI_STARTPROC movq %gs:pda_irqstackptr,%rax pushq %r15 + CFI_ADJUST_CFA_OFFSET 8 movq %rsp,%r15 + CFI_DEF_CFA_REGISTER r15 incl %gs:pda_irqcount cmove %rax,%rsp call __do_softirq movq %r15,%rsp + CFI_DEF_CFA_REGISTER rsp decl %gs:pda_irqcount popq %r15 + CFI_ADJUST_CFA_OFFSET -8 ret - + CFI_ENDPROC diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c index b1c144f7314..7a64ea18178 100644 --- a/arch/x86_64/kernel/genapic.c +++ b/arch/x86_64/kernel/genapic.c @@ -45,7 +45,7 @@ void __init clustered_apic_check(void) u8 clusters, max_cluster; u8 id; u8 cluster_cnt[NUM_APIC_CLUSTERS]; - int num_cpus = 0; + int max_apic = 0; #if defined(CONFIG_ACPI) /* @@ -64,14 +64,15 @@ void __init clustered_apic_check(void) id = bios_cpu_apicid[i]; if (id == BAD_APICID) continue; - num_cpus++; + if (id > max_apic) + max_apic = id; cluster_cnt[APIC_CLUSTERID(id)]++; } /* Don't use clustered mode on AMD platforms. */ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { genapic = &apic_physflat; -#ifndef CONFIG_CPU_HOTPLUG +#ifndef CONFIG_HOTPLUG_CPU /* In the CPU hotplug case we cannot use broadcast mode because that opens a race when a CPU is removed. Stay at physflat mode in this case. @@ -79,7 +80,7 @@ void __init clustered_apic_check(void) we have ACPI platform support for CPU hotplug we should detect hotplug capablity from ACPI tables and only do this when really needed. -AK */ - if (num_cpus <= 8) + if (max_apic <= 8) genapic = &apic_flat; #endif goto print; @@ -103,9 +104,14 @@ void __init clustered_apic_check(void) * (We don't use lowest priority delivery + HW APIC IRQ steering, so * can ignore the clustered logical case and go straight to physical.) */ - if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster) + if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster) { +#ifdef CONFIG_HOTPLUG_CPU + /* Don't use APIC shortcuts in CPU hotplug to avoid races */ + genapic = &apic_physflat; +#else genapic = &apic_flat; - else +#endif + } else genapic = &apic_cluster; print: diff --git a/arch/x86_64/kernel/genapic_cluster.c b/arch/x86_64/kernel/genapic_cluster.c index f6523dd1bc0..a472d62f899 100644 --- a/arch/x86_64/kernel/genapic_cluster.c +++ b/arch/x86_64/kernel/genapic_cluster.c @@ -51,10 +51,10 @@ static void cluster_init_apic_ldr(void) count = 3; id = my_cluster | (1UL << count); x86_cpu_to_log_apicid[smp_processor_id()] = id; - apic_write_around(APIC_DFR, APIC_DFR_CLUSTER); + apic_write(APIC_DFR, APIC_DFR_CLUSTER); val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; val |= SET_APIC_LOGICAL_ID(id); - apic_write_around(APIC_LDR, val); + apic_write(APIC_LDR, val); } /* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c index 6d57da96bf8..9da3edb799e 100644 --- a/arch/x86_64/kernel/genapic_flat.c +++ b/arch/x86_64/kernel/genapic_flat.c @@ -38,10 +38,10 @@ static void flat_init_apic_ldr(void) num = smp_processor_id(); id = 1UL << num; x86_cpu_to_log_apicid[num] = id; - apic_write_around(APIC_DFR, APIC_DFR_FLAT); + apic_write(APIC_DFR, APIC_DFR_FLAT); val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; val |= SET_APIC_LOGICAL_ID(id); - apic_write_around(APIC_LDR, val); + apic_write(APIC_LDR, val); } static void flat_send_IPI_mask(cpumask_t cpumask, int vector) @@ -62,7 +62,7 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector) * prepare target chip field */ cfg = __prepare_ICR2(mask); - apic_write_around(APIC_ICR2, cfg); + apic_write(APIC_ICR2, cfg); /* * program the ICR @@ -72,7 +72,7 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector) /* * Send the IPI. The write to APIC_ICR fires this off. */ - apic_write_around(APIC_ICR, cfg); + apic_write(APIC_ICR, cfg); local_irq_restore(flags); } @@ -177,9 +177,9 @@ static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask) struct genapic apic_physflat = { .name = "physical flat", - .int_delivery_mode = dest_LowestPrio, + .int_delivery_mode = dest_Fixed, .int_dest_mode = (APIC_DEST_PHYSICAL != 0), - .int_delivery_dest = APIC_DEST_PHYSICAL | APIC_DM_LOWEST, + .int_delivery_dest = APIC_DEST_PHYSICAL | APIC_DM_FIXED, .target_cpus = physflat_target_cpus, .apic_id_registered = flat_apic_id_registered, .init_apic_ldr = flat_init_apic_ldr,/*not needed, but shouldn't hurt*/ diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 98ff5eb32b9..4592bf21fca 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -270,7 +270,7 @@ ENTRY(level3_kernel_pgt) .org 0x4000 ENTRY(level2_ident_pgt) /* 40MB for bootup. */ - .quad 0x0000000000000283 + .quad 0x0000000000000183 .quad 0x0000000000200183 .quad 0x0000000000400183 .quad 0x0000000000600183 diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c index a8916909512..b2a238b5a17 100644 --- a/arch/x86_64/kernel/i8259.c +++ b/arch/x86_64/kernel/i8259.c @@ -18,15 +18,12 @@ #include <asm/atomic.h> #include <asm/system.h> #include <asm/io.h> -#include <asm/irq.h> #include <asm/hw_irq.h> #include <asm/pgtable.h> #include <asm/delay.h> #include <asm/desc.h> #include <asm/apic.h> -#include <linux/irq.h> - /* * Common place to define all x86 IRQ vectors * @@ -486,7 +483,14 @@ void spurious_interrupt(void); void error_interrupt(void); void reschedule_interrupt(void); void call_function_interrupt(void); -void invalidate_interrupt(void); +void invalidate_interrupt0(void); +void invalidate_interrupt1(void); +void invalidate_interrupt2(void); +void invalidate_interrupt3(void); +void invalidate_interrupt4(void); +void invalidate_interrupt5(void); +void invalidate_interrupt6(void); +void invalidate_interrupt7(void); void thermal_interrupt(void); void i8254_timer_resume(void); @@ -562,8 +566,15 @@ void __init init_IRQ(void) */ set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); - /* IPI for invalidation */ - set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt); + /* IPIs for invalidation */ + set_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); + set_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); + set_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); + set_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); + set_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); + set_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); + set_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); + set_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); /* IPI for generic function call */ set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); diff --git a/arch/x86_64/kernel/init_task.c b/arch/x86_64/kernel/init_task.c index c4dc9149115..e0ba5c1043f 100644 --- a/arch/x86_64/kernel/init_task.c +++ b/arch/x86_64/kernel/init_task.c @@ -44,6 +44,6 @@ EXPORT_SYMBOL(init_task); * section. Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ -DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_maxaligned_in_smp; +DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_maxaligned_in_smp = INIT_TSS; #define ALIGN_TO_4K __attribute__((section(".data.init_task"))) diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 5f1529be123..522944a000a 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -21,7 +21,6 @@ */ #include <linux/mm.h> -#include <linux/irq.h> #include <linux/interrupt.h> #include <linux/init.h> #include <linux/delay.h> @@ -45,6 +44,8 @@ int sis_apic_bug; /* not actually supported, dummy for compile */ static int no_timer_check; +int disable_timer_pin_1 __initdata; + static DEFINE_SPINLOCK(ioapic_lock); /* @@ -298,6 +299,15 @@ void __init check_ioapic(void) #endif /* RED-PEN skip them on mptables too? */ return; + case PCI_VENDOR_ID_ATI: + /* All timer interrupts on atiixp + are doubled. Disable one. */ + if (disable_timer_pin_1 == 0) { + disable_timer_pin_1 = 1; + printk(KERN_INFO + "ATI board detected. Disabling timer pin 1.\n"); + } + return; } /* No multi-function device? */ @@ -1022,13 +1032,11 @@ void __apicdebuginit print_local_APIC(void * dummy) v = apic_read(APIC_TASKPRI); printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); - if (APIC_INTEGRATED(ver)) { /* !82489DX */ - v = apic_read(APIC_ARBPRI); - printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, - v & APIC_ARBPRI_MASK); - v = apic_read(APIC_PROCPRI); - printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v); - } + v = apic_read(APIC_ARBPRI); + printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, + v & APIC_ARBPRI_MASK); + v = apic_read(APIC_PROCPRI); + printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v); v = apic_read(APIC_EOI); printk(KERN_DEBUG "... APIC EOI: %08x\n", v); @@ -1048,12 +1056,8 @@ void __apicdebuginit print_local_APIC(void * dummy) printk(KERN_DEBUG "... APIC IRR field:\n"); print_APIC_bitfield(APIC_IRR); - if (APIC_INTEGRATED(ver)) { /* !82489DX */ - if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ - apic_write(APIC_ESR, 0); - v = apic_read(APIC_ESR); - printk(KERN_DEBUG "... APIC ESR: %08x\n", v); - } + v = apic_read(APIC_ESR); + printk(KERN_DEBUG "... APIC ESR: %08x\n", v); v = apic_read(APIC_ICR); printk(KERN_DEBUG "... APIC ICR: %08x\n", v); @@ -1665,6 +1669,8 @@ static inline void check_timer(void) setup_nmi(); enable_8259A_irq(0); } + if (disable_timer_pin_1 > 0) + clear_IO_APIC_pin(0, pin1); return; } clear_IO_APIC_pin(0, pin1); diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c index 849a20aec7c..d6a04a8320a 100644 --- a/arch/x86_64/kernel/irq.c +++ b/arch/x86_64/kernel/irq.c @@ -99,7 +99,6 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs) unsigned irq = regs->orig_rax & 0xff; irq_enter(); - BUG_ON(irq > 256); __do_IRQ(irq, regs); irq_exit(); diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index 8aa56736cde..969365c0771 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -17,6 +17,7 @@ #include <linux/fs.h> #include <linux/cpu.h> #include <linux/percpu.h> +#include <linux/ctype.h> #include <asm/processor.h> #include <asm/msr.h> #include <asm/mce.h> @@ -56,15 +57,19 @@ void mce_log(struct mce *mce) smp_wmb(); for (;;) { entry = rcu_dereference(mcelog.next); - /* When the buffer fills up discard new entries. Assume - that the earlier errors are the more interesting. */ - if (entry >= MCE_LOG_LEN) { - set_bit(MCE_OVERFLOW, &mcelog.flags); - return; + for (;;) { + /* When the buffer fills up discard new entries. Assume + that the earlier errors are the more interesting. */ + if (entry >= MCE_LOG_LEN) { + set_bit(MCE_OVERFLOW, &mcelog.flags); + return; + } + /* Old left over entry. Skip. */ + if (mcelog.entry[entry].finished) { + entry++; + continue; + } } - /* Old left over entry. Skip. */ - if (mcelog.entry[entry].finished) - continue; smp_rmb(); next = entry + 1; if (cmpxchg(&mcelog.next, entry, next) == entry) @@ -404,9 +409,15 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff } err = 0; - for (i = 0; i < next; i++) { - if (!mcelog.entry[i].finished) - continue; + for (i = 0; i < next; i++) { + unsigned long start = jiffies; + while (!mcelog.entry[i].finished) { + if (!time_before(jiffies, start + 2)) { + memset(mcelog.entry + i,0, sizeof(struct mce)); + continue; + } + cpu_relax(); + } smp_rmb(); err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce)); buf += sizeof(struct mce); @@ -479,6 +490,7 @@ static int __init mcheck_disable(char *str) /* mce=off disables machine check. Note you can reenable it later using sysfs. + mce=TOLERANCELEVEL (number, see above) mce=bootlog Log MCEs from before booting. Disabled by default to work around buggy BIOS that leave bogus MCEs. */ static int __init mcheck_enable(char *str) @@ -489,6 +501,8 @@ static int __init mcheck_enable(char *str) mce_dont_init = 1; else if (!strcmp(str, "bootlog")) mce_bootlog = 1; + else if (isdigit(str[0])) + get_option(&str, &tolerant); else printk("mce= argument %s ignored. Please use /sys", str); return 0; @@ -501,10 +515,12 @@ __setup("mce", mcheck_enable); * Sysfs support */ -/* On resume clear all MCE state. Don't want to see leftovers from the BIOS. */ +/* On resume clear all MCE state. Don't want to see leftovers from the BIOS. + Only one CPU is active at this time, the others get readded later using + CPU hotplug. */ static int mce_resume(struct sys_device *dev) { - on_each_cpu(mce_init, NULL, 1, 1); + mce_init(NULL); return 0; } diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index 8d8ed6ae1d0..f16d38d09da 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -14,7 +14,6 @@ */ #include <linux/mm.h> -#include <linux/irq.h> #include <linux/init.h> #include <linux/delay.h> #include <linux/config.h> @@ -46,8 +45,6 @@ int acpi_found_madt; int apic_version [MAX_APICS]; unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; -unsigned char pci_bus_to_node [256]; -EXPORT_SYMBOL(pci_bus_to_node); static int mp_current_pci_id = 0; /* I/O APIC entries */ @@ -705,7 +702,7 @@ void __init mp_register_lapic ( processor.mpc_type = MP_PROCESSOR; processor.mpc_apicid = id; - processor.mpc_apicver = 0x10; /* TBD: lapic version */ + processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR)); processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0); processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0); processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | diff --git a/arch/x86_64/kernel/msr.c b/arch/x86_64/kernel/msr.c deleted file mode 100644 index 598953ab015..00000000000 --- a/arch/x86_64/kernel/msr.c +++ /dev/null @@ -1,279 +0,0 @@ -/* ----------------------------------------------------------------------- * - * - * Copyright 2000 H. Peter Anvin - All Rights Reserved - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, - * USA; either version 2 of the License, or (at your option) any later - * version; incorporated herein by reference. - * - * ----------------------------------------------------------------------- */ - -/* - * msr.c - * - * x86 MSR access device - * - * This device is accessed by lseek() to the appropriate register number - * and then read/write in chunks of 8 bytes. A larger size means multiple - * reads or writes of the same register. - * - * This driver uses /dev/cpu/%d/msr where %d is the minor number, and on - * an SMP box will direct the access to CPU %d. - */ - -#include <linux/module.h> -#include <linux/config.h> - -#include <linux/types.h> -#include <linux/errno.h> -#include <linux/fcntl.h> -#include <linux/init.h> -#include <linux/poll.h> -#include <linux/smp.h> -#include <linux/smp_lock.h> -#include <linux/major.h> -#include <linux/fs.h> - -#include <asm/processor.h> -#include <asm/msr.h> -#include <asm/uaccess.h> -#include <asm/system.h> - -/* Note: "err" is handled in a funny way below. Otherwise one version - of gcc or another breaks. */ - -static inline int wrmsr_eio(u32 reg, u32 eax, u32 edx) -{ - int err; - - asm volatile ("1: wrmsr\n" - "2:\n" - ".section .fixup,\"ax\"\n" - "3: movl %4,%0\n" - " jmp 2b\n" - ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 8\n" " .quad 1b,3b\n" ".previous":"=&bDS" (err) - :"a"(eax), "d"(edx), "c"(reg), "i"(-EIO), "0"(0)); - - return err; -} - -static inline int rdmsr_eio(u32 reg, u32 *eax, u32 *edx) -{ - int err; - - asm volatile ("1: rdmsr\n" - "2:\n" - ".section .fixup,\"ax\"\n" - "3: movl %4,%0\n" - " jmp 2b\n" - ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 8\n" - " .quad 1b,3b\n" - ".previous":"=&bDS" (err), "=a"(*eax), "=d"(*edx) - :"c"(reg), "i"(-EIO), "0"(0)); - - return err; -} - -#ifdef CONFIG_SMP - -struct msr_command { - int cpu; - int err; - u32 reg; - u32 data[2]; -}; - -static void msr_smp_wrmsr(void *cmd_block) -{ - struct msr_command *cmd = (struct msr_command *)cmd_block; - - if (cmd->cpu == smp_processor_id()) - cmd->err = wrmsr_eio(cmd->reg, cmd->data[0], cmd->data[1]); -} - -static void msr_smp_rdmsr(void *cmd_block) -{ - struct msr_command *cmd = (struct msr_command *)cmd_block; - - if (cmd->cpu == smp_processor_id()) - cmd->err = rdmsr_eio(cmd->reg, &cmd->data[0], &cmd->data[1]); -} - -static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx) -{ - struct msr_command cmd; - int ret; - - preempt_disable(); - if (cpu == smp_processor_id()) { - ret = wrmsr_eio(reg, eax, edx); - } else { - cmd.cpu = cpu; - cmd.reg = reg; - cmd.data[0] = eax; - cmd.data[1] = edx; - - smp_call_function(msr_smp_wrmsr, &cmd, 1, 1); - ret = cmd.err; - } - preempt_enable(); - return ret; -} - -static inline int do_rdmsr(int cpu, u32 reg, u32 * eax, u32 * edx) -{ - struct msr_command cmd; - int ret; - - preempt_disable(); - if (cpu == smp_processor_id()) { - ret = rdmsr_eio(reg, eax, edx); - } else { - cmd.cpu = cpu; - cmd.reg = reg; - - smp_call_function(msr_smp_rdmsr, &cmd, 1, 1); - - *eax = cmd.data[0]; - *edx = cmd.data[1]; - - ret = cmd.err; - } - preempt_enable(); - return ret; -} - -#else /* ! CONFIG_SMP */ - -static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx) -{ - return wrmsr_eio(reg, eax, edx); -} - -static inline int do_rdmsr(int cpu, u32 reg, u32 *eax, u32 *edx) -{ - return rdmsr_eio(reg, eax, edx); -} - -#endif /* ! CONFIG_SMP */ - -static loff_t msr_seek(struct file *file, loff_t offset, int orig) -{ - loff_t ret = -EINVAL; - - lock_kernel(); - switch (orig) { - case 0: - file->f_pos = offset; - ret = file->f_pos; - break; - case 1: - file->f_pos += offset; - ret = file->f_pos; - } - unlock_kernel(); - return ret; -} - -static ssize_t msr_read(struct file *file, char __user * buf, - size_t count, loff_t * ppos) -{ - u32 __user *tmp = (u32 __user *) buf; - u32 data[2]; - size_t rv; - u32 reg = *ppos; - int cpu = iminor(file->f_dentry->d_inode); - int err; - - if (count % 8) - return -EINVAL; /* Invalid chunk size */ - - for (rv = 0; count; count -= 8) { - err = do_rdmsr(cpu, reg, &data[0], &data[1]); - if (err) - return err; - if (copy_to_user(tmp, &data, 8)) - return -EFAULT; - tmp += 2; - } - - return ((char __user *)tmp) - buf; -} - -static ssize_t msr_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ - const u32 __user *tmp = (const u32 __user *)buf; - u32 data[2]; - size_t rv; - u32 reg = *ppos; - int cpu = iminor(file->f_dentry->d_inode); - int err; - - if (count % 8) - return -EINVAL; /* Invalid chunk size */ - - for (rv = 0; count; count -= 8) { - if (copy_from_user(&data, tmp, 8)) - return -EFAULT; - err = do_wrmsr(cpu, reg, data[0], data[1]); - if (err) - return err; - tmp += 2; - } - - return ((char __user *)tmp) - buf; -} - -static int msr_open(struct inode *inode, struct file *file) -{ - unsigned int cpu = iminor(file->f_dentry->d_inode); - struct cpuinfo_x86 *c = &(cpu_data)[cpu]; - - if (cpu >= NR_CPUS || !cpu_online(cpu)) - return -ENXIO; /* No such CPU */ - if (!cpu_has(c, X86_FEATURE_MSR)) - return -EIO; /* MSR not supported */ - - return 0; -} - -/* - * File operations we support - */ -static struct file_operations msr_fops = { - .owner = THIS_MODULE, - .llseek = msr_seek, - .read = msr_read, - .write = msr_write, - .open = msr_open, -}; - -static int __init msr_init(void) -{ - if (register_chrdev(MSR_MAJOR, "cpu/msr", &msr_fops)) { - printk(KERN_ERR "msr: unable to get major %d for msr\n", - MSR_MAJOR); - return -EBUSY; - } - - return 0; -} - -static void __exit msr_exit(void) -{ - unregister_chrdev(MSR_MAJOR, "cpu/msr"); -} - -module_init(msr_init); -module_exit(msr_exit) - -MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>"); -MODULE_DESCRIPTION("x86 generic MSR driver"); -MODULE_LICENSE("GPL"); diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index caf164959e1..4388b8a5bae 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -14,7 +14,6 @@ #include <linux/config.h> #include <linux/mm.h> -#include <linux/irq.h> #include <linux/delay.h> #include <linux/bootmem.h> #include <linux/smp_lock.h> @@ -488,8 +487,8 @@ void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason) == NOTIFY_STOP) { local_set(&__get_cpu_var(alert_counter), 0); return; - } - die_nmi("NMI Watchdog detected LOCKUP on CPU%d", regs); + } + die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs); } } else { __get_cpu_var(last_irq_sum) = sum; diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c index 57f35c68aa3..cf0a0315d58 100644 --- a/arch/x86_64/kernel/pci-gart.c +++ b/arch/x86_64/kernel/pci-gart.c @@ -191,11 +191,9 @@ static void *dma_alloc_pages(struct device *dev, unsigned gfp, unsigned order) { struct page *page; int node; - if (dev->bus == &pci_bus_type) { - cpumask_t mask; - mask = pcibus_to_cpumask(to_pci_dev(dev)->bus); - node = cpu_to_node(first_cpu(mask)); - } else + if (dev->bus == &pci_bus_type) + node = pcibus_to_node(to_pci_dev(dev)->bus); + else node = numa_node_id(); page = alloc_pages_node(node, gfp, order); return page ? page_address(page) : NULL; diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index 8661f82ac70..b5a89c0bdf5 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -32,7 +32,6 @@ #include <linux/a.out.h> #include <linux/interrupt.h> #include <linux/delay.h> -#include <linux/irq.h> #include <linux/ptrace.h> #include <linux/utsname.h> #include <linux/random.h> @@ -123,6 +122,7 @@ static void poll_idle (void) : : "i" (_TIF_NEED_RESCHED), "m" (current_thread_info()->flags)); + clear_thread_flag(TIF_POLLING_NRFLAG); } else { set_need_resched(); } @@ -271,8 +271,11 @@ void __show_regs(struct pt_regs * regs) printk("\n"); print_modules(); - printk("Pid: %d, comm: %.20s %s %s\n", - current->pid, current->comm, print_tainted(), system_utsname.release); + printk("Pid: %d, comm: %.20s %s %s %.*s\n", + current->pid, current->comm, print_tainted(), + system_utsname.release, + (int)strcspn(system_utsname.version, " "), + system_utsname.version); printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip); printk_address(regs->rip); printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, regs->eflags); @@ -483,33 +486,6 @@ out: } /* - * This function selects if the context switch from prev to next - * has to tweak the TSC disable bit in the cr4. - */ -static inline void disable_tsc(struct task_struct *prev_p, - struct task_struct *next_p) -{ - struct thread_info *prev, *next; - - /* - * gcc should eliminate the ->thread_info dereference if - * has_secure_computing returns 0 at compile time (SECCOMP=n). - */ - prev = prev_p->thread_info; - next = next_p->thread_info; - - if (has_secure_computing(prev) || has_secure_computing(next)) { - /* slow path here */ - if (has_secure_computing(prev) && - !has_secure_computing(next)) { - write_cr4(read_cr4() & ~X86_CR4_TSD); - } else if (!has_secure_computing(prev) && - has_secure_computing(next)) - write_cr4(read_cr4() | X86_CR4_TSD); - } -} - -/* * This special macro can be used to load a debugging register */ #define loaddebug(thread,r) set_debug(thread->debugreg ## r, r) @@ -627,8 +603,6 @@ struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct * } } - disable_tsc(prev_p, next_p); - return prev_p; } diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 9aec524be3e..351d8d64c2f 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -336,6 +336,11 @@ static __init void parse_cmdline_early (char ** cmdline_p) #endif #endif + if (!memcmp(from, "disable_timer_pin_1", 19)) + disable_timer_pin_1 = 1; + if (!memcmp(from, "enable_timer_pin_1", 18)) + disable_timer_pin_1 = -1; + if (!memcmp(from, "nolapic", 7) || !memcmp(from, "disableapic", 11)) disable_apic = 1; @@ -755,6 +760,24 @@ static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) } } +#ifdef CONFIG_NUMA +static int nearby_node(int apicid) +{ + int i; + for (i = apicid - 1; i >= 0; i--) { + int node = apicid_to_node[i]; + if (node != NUMA_NO_NODE && node_online(node)) + return node; + } + for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { + int node = apicid_to_node[i]; + if (node != NUMA_NO_NODE && node_online(node)) + return node; + } + return first_node(node_online_map); /* Shouldn't happen */ +} +#endif + /* * On a AMD dual core setup the lower bits of the APIC id distingush the cores. * Assumes number of cores is a power of two. @@ -763,8 +786,11 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP int cpu = smp_processor_id(); - int node = 0; unsigned bits; +#ifdef CONFIG_NUMA + int node = 0; + unsigned apicid = phys_proc_id[cpu]; +#endif bits = 0; while ((1 << bits) < c->x86_num_cores) @@ -776,20 +802,32 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c) phys_proc_id[cpu] >>= bits; #ifdef CONFIG_NUMA - /* When an ACPI SRAT table is available use the mappings from SRAT - instead. */ - if (acpi_numa <= 0) { - node = phys_proc_id[cpu]; - if (!node_online(node)) - node = first_node(node_online_map); - cpu_to_node[cpu] = node; - } else { - node = cpu_to_node[cpu]; - } + node = phys_proc_id[cpu]; + if (apicid_to_node[apicid] != NUMA_NO_NODE) + node = apicid_to_node[apicid]; + if (!node_online(node)) { + /* Two possibilities here: + - The CPU is missing memory and no node was created. + In that case try picking one from a nearby CPU + - The APIC IDs differ from the HyperTransport node IDs + which the K8 northbridge parsing fills in. + Assume they are all increased by a constant offset, + but in the same order as the HT nodeids. + If that doesn't result in a usable node fall back to the + path for the previous case. */ + int ht_nodeid = apicid - (phys_proc_id[0] << bits); + if (ht_nodeid >= 0 && + apicid_to_node[ht_nodeid] != NUMA_NO_NODE) + node = apicid_to_node[ht_nodeid]; + /* Pick a nearby node */ + if (!node_online(node)) + node = nearby_node(apicid); + } + cpu_to_node[cpu] = node; + + printk(KERN_INFO "CPU %d(%d) -> Node %d -> Core %d\n", + cpu, c->x86_num_cores, node, cpu_core_id[cpu]); #endif - - printk(KERN_INFO "CPU %d(%d) -> Node %d -> Core %d\n", - cpu, c->x86_num_cores, node, cpu_core_id[cpu]); #endif } @@ -909,6 +947,25 @@ static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) return 1; } +static void srat_detect_node(void) +{ +#ifdef CONFIG_NUMA + unsigned apicid, node; + int cpu = smp_processor_id(); + + /* Don't do the funky fallback heuristics the AMD version employs + for now. */ + apicid = phys_proc_id[cpu]; + node = apicid_to_node[apicid]; + if (node == NUMA_NO_NODE) + node = 0; + cpu_to_node[cpu] = node; + + if (acpi_numa > 0) + printk(KERN_INFO "CPU %d -> Node %d\n", cpu, node); +#endif +} + static void __cpuinit init_intel(struct cpuinfo_x86 *c) { /* Cache sizes */ @@ -927,6 +984,8 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) if (c->x86 >= 15) set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); c->x86_num_cores = intel_num_cpu_cores(c); + + srat_detect_node(); } static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index e3ffcacc8c9..bd33be24a38 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -94,7 +94,7 @@ void __init setup_per_cpu_areas(void) size = PERCPU_ENOUGH_ROOM; #endif - for (i = 0; i < NR_CPUS; i++) { + for_each_cpu_mask (i, cpu_possible_map) { char *ptr; if (!NODE_DATA(cpu_to_node(i))) { @@ -119,7 +119,6 @@ void pda_init(int cpu) asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); wrmsrl(MSR_GS_BASE, cpu_pda + cpu); - pda->me = pda; pda->cpunumber = cpu; pda->irqcount = -1; pda->kernelstack = diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c index e5958220d6b..9db9dda161b 100644 --- a/arch/x86_64/kernel/smp.c +++ b/arch/x86_64/kernel/smp.c @@ -12,7 +12,6 @@ #include <linux/init.h> #include <linux/mm.h> -#include <linux/irq.h> #include <linux/delay.h> #include <linux/spinlock.h> #include <linux/smp_lock.h> @@ -29,6 +28,8 @@ #include <asm/proto.h> #include <asm/apicdef.h> +#define __cpuinit __init + /* * Smarter SMP flushing macros. * c/o Linus Torvalds. @@ -37,19 +38,41 @@ * writing to user space from interrupts. (Its not allowed anyway). * * Optimizations Manfred Spraul <manfred@colorfullife.com> + * + * More scalable flush, from Andi Kleen + * + * To avoid global state use 8 different call vectors. + * Each CPU uses a specific vector to trigger flushes on other + * CPUs. Depending on the received vector the target CPUs look into + * the right per cpu variable for the flush data. + * + * With more than 8 CPUs they are hashed to the 8 available + * vectors. The limited global vector space forces us to this right now. + * In future when interrupts are split into per CPU domains this could be + * fixed, at the cost of triggering multiple IPIs in some cases. */ -static cpumask_t flush_cpumask; -static struct mm_struct * flush_mm; -static unsigned long flush_va; -static DEFINE_SPINLOCK(tlbstate_lock); +union smp_flush_state { + struct { + cpumask_t flush_cpumask; + struct mm_struct *flush_mm; + unsigned long flush_va; #define FLUSH_ALL -1ULL + spinlock_t tlbstate_lock; + }; + char pad[SMP_CACHE_BYTES]; +} ____cacheline_aligned; + +/* State is put into the per CPU data section, but padded + to a full cache line because other CPUs can access it and we don't + want false sharing in the per cpu data segment. */ +static DEFINE_PER_CPU(union smp_flush_state, flush_state); /* * We cannot call mmdrop() because we are in interrupt context, * instead update mm->cpu_vm_mask. */ -static inline void leave_mm (unsigned long cpu) +static inline void leave_mm(int cpu) { if (read_pda(mmu_state) == TLBSTATE_OK) BUG(); @@ -101,15 +124,25 @@ static inline void leave_mm (unsigned long cpu) * * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. * 2) Leave the mm if we are in the lazy tlb mode. + * + * Interrupts are disabled. */ -asmlinkage void smp_invalidate_interrupt (void) +asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs) { - unsigned long cpu; + int cpu; + int sender; + union smp_flush_state *f; - cpu = get_cpu(); + cpu = smp_processor_id(); + /* + * orig_rax contains the interrupt vector - 256. + * Use that to determine where the sender put the data. + */ + sender = regs->orig_rax + 256 - INVALIDATE_TLB_VECTOR_START; + f = &per_cpu(flush_state, sender); - if (!cpu_isset(cpu, flush_cpumask)) + if (!cpu_isset(cpu, f->flush_cpumask)) goto out; /* * This was a BUG() but until someone can quote me the @@ -120,64 +153,63 @@ asmlinkage void smp_invalidate_interrupt (void) * BUG(); */ - if (flush_mm == read_pda(active_mm)) { + if (f->flush_mm == read_pda(active_mm)) { if (read_pda(mmu_state) == TLBSTATE_OK) { - if (flush_va == FLUSH_ALL) + if (f->flush_va == FLUSH_ALL) local_flush_tlb(); else - __flush_tlb_one(flush_va); + __flush_tlb_one(f->flush_va); } else leave_mm(cpu); } out: ack_APIC_irq(); - cpu_clear(cpu, flush_cpumask); - put_cpu_no_resched(); + cpu_clear(cpu, f->flush_cpumask); } static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, unsigned long va) { - cpumask_t tmp; - /* - * A couple of (to be removed) sanity checks: - * - * - we do not send IPIs to not-yet booted CPUs. - * - current CPU must not be in mask - * - mask must exist :) - */ - BUG_ON(cpus_empty(cpumask)); - cpus_and(tmp, cpumask, cpu_online_map); - BUG_ON(!cpus_equal(tmp, cpumask)); - BUG_ON(cpu_isset(smp_processor_id(), cpumask)); - if (!mm) - BUG(); + int sender; + union smp_flush_state *f; - /* - * I'm not happy about this global shared spinlock in the - * MM hot path, but we'll see how contended it is. - * Temporarily this turns IRQs off, so that lockups are - * detected by the NMI watchdog. - */ - spin_lock(&tlbstate_lock); - - flush_mm = mm; - flush_va = va; - cpus_or(flush_cpumask, cpumask, flush_cpumask); + /* Caller has disabled preemption */ + sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; + f = &per_cpu(flush_state, sender); + + /* Could avoid this lock when + num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is + probably not worth checking this for a cache-hot lock. */ + spin_lock(&f->tlbstate_lock); + + f->flush_mm = mm; + f->flush_va = va; + cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask); /* * We have to send the IPI only to * CPUs affected. */ - send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR); + send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender); + + while (!cpus_empty(f->flush_cpumask)) + cpu_relax(); - while (!cpus_empty(flush_cpumask)) - mb(); /* nothing. lockup detection does not belong here */; + f->flush_mm = NULL; + f->flush_va = 0; + spin_unlock(&f->tlbstate_lock); +} - flush_mm = NULL; - flush_va = 0; - spin_unlock(&tlbstate_lock); +int __cpuinit init_smp_flush(void) +{ + int i; + for_each_cpu_mask(i, cpu_possible_map) { + spin_lock_init(&per_cpu(flush_state.tlbstate_lock, i)); + } + return 0; } + +core_initcall(init_smp_flush); void flush_tlb_current_task(void) { @@ -295,8 +327,11 @@ void unlock_ipi_call_lock(void) /* * this function sends a 'generic call function' IPI to one other CPU * in the system. + * + * cpu is a standard Linux logical CPU number. */ -static void __smp_call_function_single (int cpu, void (*func) (void *info), void *info, +static void +__smp_call_function_single(int cpu, void (*func) (void *info), void *info, int nonatomic, int wait) { struct call_data_struct data; diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index 4fb34b5cb1f..e12d7baeb33 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -44,7 +44,6 @@ #include <linux/mm.h> #include <linux/kernel_stat.h> #include <linux/smp_lock.h> -#include <linux/irq.h> #include <linux/bootmem.h> #include <linux/thread_info.h> #include <linux/module.h> @@ -58,6 +57,8 @@ #include <asm/tlbflush.h> #include <asm/proto.h> #include <asm/nmi.h> +#include <asm/irq.h> +#include <asm/hw_irq.h> /* Number of siblings per CPU package */ int smp_num_siblings = 1; @@ -413,8 +414,13 @@ void __cpuinit smp_callin(void) /* * Get our bogomips. + * + * Need to enable IRQs because it can take longer and then + * the NMI watchdog might kill us. */ + local_irq_enable(); calibrate_delay(); + local_irq_disable(); Dprintk("Stack at about %p\n",&cpuid); disable_APIC_timer(); @@ -540,8 +546,8 @@ static void inquire_remote_apic(int apicid) */ apic_wait_icr_idle(); - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); - apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]); + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); + apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]); timeout = 0; do { @@ -574,12 +580,12 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta /* * Turn INIT on target chip */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); /* * Send IPI */ - apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT + apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT); Dprintk("Waiting for send to finish...\n"); @@ -595,10 +601,10 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta Dprintk("Deasserting INIT.\n"); /* Target chip */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); /* Send IPI */ - apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); + apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); Dprintk("Waiting for send to finish...\n"); timeout = 0; @@ -610,16 +616,7 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta atomic_set(&init_deasserted, 1); - /* - * Should we send STARTUP IPIs ? - * - * Determine this based on the APIC version. - * If we don't have an integrated APIC, don't send the STARTUP IPIs. - */ - if (APIC_INTEGRATED(apic_version[phys_apicid])) - num_starts = 2; - else - num_starts = 0; + num_starts = 2; /* * Run STARTUP IPI loop. @@ -640,12 +637,11 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta */ /* Target chip */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); /* Boot on the stack */ /* Kick the second */ - apic_write_around(APIC_ICR, APIC_DM_STARTUP - | (start_rip >> 12)); + apic_write(APIC_ICR, APIC_DM_STARTUP | (start_rip >> 12)); /* * Give the other CPU some time to accept the IPI. @@ -864,17 +860,6 @@ static __cpuinit void smp_cleanup_boot(void) * Reset trampoline flag */ *((volatile int *) phys_to_virt(0x467)) = 0; - -#ifndef CONFIG_HOTPLUG_CPU - /* - * Free pages reserved for SMP bootup. - * When you add hotplug CPU support later remove this - * Note there is more work to be done for later CPU bootup. - */ - - free_page((unsigned long) __va(PAGE_SIZE)); - free_page((unsigned long) __va(SMP_TRAMPOLINE_BASE)); -#endif } /* diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c index 0612640d91b..ebb9abf3ce6 100644 --- a/arch/x86_64/kernel/suspend.c +++ b/arch/x86_64/kernel/suspend.c @@ -8,23 +8,8 @@ */ #include <linux/config.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/types.h> -#include <linux/spinlock.h> -#include <linux/poll.h> -#include <linux/delay.h> -#include <linux/sysrq.h> -#include <linux/proc_fs.h> -#include <linux/irq.h> -#include <linux/pm.h> -#include <linux/device.h> +#include <linux/smp.h> #include <linux/suspend.h> -#include <asm/uaccess.h> -#include <asm/acpi.h> -#include <asm/tlbflush.h> -#include <asm/io.h> #include <asm/proto.h> struct saved_context saved_context; diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 7b6abe05825..2373cb8b862 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -18,7 +18,6 @@ #include <linux/interrupt.h> #include <linux/init.h> #include <linux/mc146818rtc.h> -#include <linux/irq.h> #include <linux/time.h> #include <linux/ioport.h> #include <linux/module.h> @@ -937,7 +936,6 @@ void __init time_init(void) vxtime.mode = VXTIME_TSC; vxtime.quot = (1000000L << 32) / vxtime_hz; vxtime.tsc_quot = (1000L << 32) / cpu_khz; - vxtime.hz = vxtime_hz; rdtscll_sync(&vxtime.last_tsc); setup_irq(0, &irq0); diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index f238d6078a5..b5e09e6b553 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -47,9 +47,6 @@ #include <asm/proto.h> #include <asm/nmi.h> -#include <linux/irq.h> - - extern struct gate_struct idt_table[256]; asmlinkage void divide_error(void); @@ -324,13 +321,13 @@ void handle_BUG(struct pt_regs *regs) if (__copy_from_user(&f, (struct bug_frame *) regs->rip, sizeof(struct bug_frame))) return; - if ((unsigned long)f.filename < __PAGE_OFFSET || + if (f.filename >= 0 || f.ud2[0] != 0x0f || f.ud2[1] != 0x0b) return; - if (__get_user(tmp, f.filename)) - f.filename = "unmapped filename"; + if (__get_user(tmp, (char *)(long)f.filename)) + f.filename = (int)(long)"unmapped filename"; printk("----------- [cut here ] --------- [please bite here ] ---------\n"); - printk(KERN_ALERT "Kernel BUG at %.50s:%d\n", f.filename, f.line); + printk(KERN_ALERT "Kernel BUG at %.50s:%d\n", (char *)(long)f.filename, f.line); } #ifdef CONFIG_BUG @@ -343,30 +340,33 @@ void out_of_line_bug(void) static DEFINE_SPINLOCK(die_lock); static int die_owner = -1; -void oops_begin(void) +unsigned long oops_begin(void) { - int cpu = safe_smp_processor_id(); - /* racy, but better than risking deadlock. */ - local_irq_disable(); + int cpu = safe_smp_processor_id(); + unsigned long flags; + + /* racy, but better than risking deadlock. */ + local_irq_save(flags); if (!spin_trylock(&die_lock)) { if (cpu == die_owner) /* nested oops. should stop eventually */; else - spin_lock(&die_lock); + spin_lock(&die_lock); } - die_owner = cpu; + die_owner = cpu; console_verbose(); - bust_spinlocks(1); + bust_spinlocks(1); + return flags; } -void oops_end(void) +void oops_end(unsigned long flags) { die_owner = -1; - bust_spinlocks(0); - spin_unlock(&die_lock); + bust_spinlocks(0); + spin_unlock_irqrestore(&die_lock, flags); if (panic_on_oops) - panic("Oops"); -} + panic("Oops"); +} void __die(const char * str, struct pt_regs * regs, long err) { @@ -392,10 +392,11 @@ void __die(const char * str, struct pt_regs * regs, long err) void die(const char * str, struct pt_regs * regs, long err) { - oops_begin(); + unsigned long flags = oops_begin(); + handle_BUG(regs); __die(str, regs, err); - oops_end(); + oops_end(flags); do_exit(SIGSEGV); } static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err) @@ -406,7 +407,8 @@ static inline void die_if_kernel(const char * str, struct pt_regs * regs, long e void die_nmi(char *str, struct pt_regs *regs) { - oops_begin(); + unsigned long flags = oops_begin(); + /* * We are in trouble anyway, lets at least try * to get a message out. @@ -416,7 +418,7 @@ void die_nmi(char *str, struct pt_regs *regs) if (panic_on_timeout || panic_on_oops) panic("nmi watchdog"); printk("console shuts up ...\n"); - oops_end(); + oops_end(flags); do_exit(SIGSEGV); } @@ -790,13 +792,16 @@ asmlinkage void do_coprocessor_error(struct pt_regs *regs) */ cwd = get_fpu_cwd(task); swd = get_fpu_swd(task); - switch (((~cwd) & swd & 0x3f) | (swd & 0x240)) { + switch (swd & ~cwd & 0x3f) { case 0x000: default: break; case 0x001: /* Invalid Op */ - case 0x041: /* Stack Fault */ - case 0x241: /* Stack Fault | Direction */ + /* + * swd & 0x240 == 0x040: Stack Underflow + * swd & 0x240 == 0x240: Stack Overflow + * User must clear the SF bit (0x40) if set + */ info.si_code = FPE_FLTINV; break; case 0x002: /* Denormalize */ diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c index 2e573442594..70a0bd16085 100644 --- a/arch/x86_64/kernel/vsyscall.c +++ b/arch/x86_64/kernel/vsyscall.c @@ -107,7 +107,7 @@ static force_inline long time_syscall(long *t) return secs; } -static int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) +int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) { if (unlikely(!__sysctl_vsyscall)) return gettimeofday(tv,tz); @@ -120,7 +120,7 @@ static int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz /* This will break when the xtime seconds get inaccurate, but that is * unlikely */ -static time_t __vsyscall(1) vtime(time_t *t) +time_t __vsyscall(1) vtime(time_t *t) { if (unlikely(!__sysctl_vsyscall)) return time_syscall(t); @@ -129,12 +129,12 @@ static time_t __vsyscall(1) vtime(time_t *t) return __xtime.tv_sec; } -static long __vsyscall(2) venosys_0(void) +long __vsyscall(2) venosys_0(void) { return -ENOSYS; } -static long __vsyscall(3) venosys_1(void) +long __vsyscall(3) venosys_1(void) { return -ENOSYS; } diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c index 816732d8858..b75b872ec15 100644 --- a/arch/x86_64/mm/fault.c +++ b/arch/x86_64/mm/fault.c @@ -221,12 +221,13 @@ int unhandled_signal(struct task_struct *tsk, int sig) static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, unsigned long error_code) { - oops_begin(); + unsigned long flags = oops_begin(); + printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", current->comm, address); dump_pagetable(address); __die("Bad pagetable", regs, error_code); - oops_end(); + oops_end(flags); do_exit(SIGKILL); } @@ -304,6 +305,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, unsigned long address; const struct exception_table_entry *fixup; int write; + unsigned long flags; siginfo_t info; #ifdef CONFIG_CHECKING @@ -521,7 +523,7 @@ no_context: * terminate things with extreme prejudice. */ - oops_begin(); + flags = oops_begin(); if (address < PAGE_SIZE) printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference"); @@ -534,7 +536,7 @@ no_context: __die("Oops", regs, error_code); /* Executive summary in case the body of the oops scrolled away */ printk(KERN_EMERG "CR2: %016lx\n", address); - oops_end(); + oops_end(flags); do_exit(SIGKILL); /* diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index aa4a5189ece..e60a1a848de 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -57,31 +57,31 @@ DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); void show_mem(void) { - int i, total = 0, reserved = 0; - int shared = 0, cached = 0; + long i, total = 0, reserved = 0; + long shared = 0, cached = 0; pg_data_t *pgdat; struct page *page; - printk("Mem-info:\n"); + printk(KERN_INFO "Mem-info:\n"); show_free_areas(); - printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); + printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) { for (i = 0; i < pgdat->node_spanned_pages; ++i) { page = pfn_to_page(pgdat->node_start_pfn + i); total++; - if (PageReserved(page)) - reserved++; - else if (PageSwapCache(page)) - cached++; - else if (page_count(page)) - shared += page_count(page) - 1; + if (PageReserved(page)) + reserved++; + else if (PageSwapCache(page)) + cached++; + else if (page_count(page)) + shared += page_count(page) - 1; } } - printk("%d pages of RAM\n", total); - printk("%d reserved pages\n",reserved); - printk("%d pages shared\n",shared); - printk("%d pages swap cached\n",cached); + printk(KERN_INFO "%lu pages of RAM\n", total); + printk(KERN_INFO "%lu reserved pages\n",reserved); + printk(KERN_INFO "%lu pages shared\n",shared); + printk(KERN_INFO "%lu pages swap cached\n",cached); } /* References to section boundaries */ @@ -381,41 +381,14 @@ void __init clear_kernel_mapping(unsigned long address, unsigned long size) __flush_tlb_all(); } -static inline int page_is_ram (unsigned long pagenr) -{ - int i; - - for (i = 0; i < e820.nr_map; i++) { - unsigned long addr, end; - - if (e820.map[i].type != E820_RAM) /* not usable memory */ - continue; - /* - * !!!FIXME!!! Some BIOSen report areas as RAM that - * are not. Notably the 640->1Mb area. We need a sanity - * check here. - */ - addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT; - end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT; - if ((pagenr >= addr) && (pagenr < end)) - return 1; - } - return 0; -} - -extern int swiotlb_force; - static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules, kcore_vsyscall; void __init mem_init(void) { - int codesize, reservedpages, datasize, initsize; - int tmp; + long codesize, reservedpages, datasize, initsize; #ifdef CONFIG_SWIOTLB - if (swiotlb_force) - swiotlb = 1; if (!iommu_aperture && (end_pfn >= 0xffffffff>>PAGE_SHIFT || force_iommu)) swiotlb = 1; @@ -436,25 +409,11 @@ void __init mem_init(void) /* this will put all low memory onto the freelists */ #ifdef CONFIG_NUMA - totalram_pages += numa_free_all_bootmem(); - tmp = 0; - /* should count reserved pages here for all nodes */ + totalram_pages = numa_free_all_bootmem(); #else - -#ifdef CONFIG_FLATMEM - max_mapnr = end_pfn; - if (!mem_map) BUG(); -#endif - - totalram_pages += free_all_bootmem(); - - for (tmp = 0; tmp < end_pfn; tmp++) - /* - * Only count reserved RAM pages - */ - if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) - reservedpages++; + totalram_pages = free_all_bootmem(); #endif + reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn); after_bootmem = 1; @@ -471,7 +430,7 @@ void __init mem_init(void) kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, VSYSCALL_END - VSYSCALL_START); - printk("Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init)\n", + printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n", (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), end_pfn << (PAGE_SHIFT-10), codesize >> 10, diff --git a/arch/x86_64/mm/k8topology.c b/arch/x86_64/mm/k8topology.c index ec35747aacd..65417b040c1 100644 --- a/arch/x86_64/mm/k8topology.c +++ b/arch/x86_64/mm/k8topology.c @@ -45,10 +45,12 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) unsigned long prevbase; struct node nodes[8]; int nodeid, i, nb; + unsigned char nodeids[8]; int found = 0; u32 reg; unsigned numnodes; nodemask_t nodes_parsed; + unsigned dualcore = 0; nodes_clear(nodes_parsed); @@ -67,11 +69,15 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) prevbase = 0; for (i = 0; i < 8; i++) { unsigned long base,limit; - + u32 nodeid; + + /* Undefined before E stepping, but hopefully 0 */ + dualcore |= ((read_pci_config(0, nb, 3, 0xe8) >> 12) & 3) == 1; base = read_pci_config(0, nb, 1, 0x40 + i*8); limit = read_pci_config(0, nb, 1, 0x44 + i*8); nodeid = limit & 7; + nodeids[i] = nodeid; if ((base & 3) == 0) { if (i < numnodes) printk("Skipping disabled node %d\n", i); @@ -157,8 +163,9 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) for (i = 0; i < 8; i++) { if (nodes[i].start != nodes[i].end) { - /* assume 1:1 NODE:CPU */ - cpu_to_node[i] = i; + nodeid = nodeids[i]; + apicid_to_node[nodeid << dualcore] = i; + apicid_to_node[(nodeid << dualcore) + dualcore] = i; setup_node_bootmem(i, nodes[i].start, nodes[i].end); } } diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index 04f7a33e144..80a49d9bd8a 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -28,8 +28,13 @@ bootmem_data_t plat_node_bdata[MAX_NUMNODES]; int memnode_shift; u8 memnodemap[NODEMAPSIZE]; -unsigned char cpu_to_node[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = NUMA_NO_NODE }; -cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; +unsigned char cpu_to_node[NR_CPUS] __read_mostly = { + [0 ... NR_CPUS-1] = NUMA_NO_NODE +}; +unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { + [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE +}; +cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; int numa_off __initdata; diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c index 8e3d097a9dd..4b2e844c15a 100644 --- a/arch/x86_64/mm/srat.c +++ b/arch/x86_64/mm/srat.c @@ -20,14 +20,20 @@ static struct acpi_table_slit *acpi_slit; -/* Internal processor count */ -static unsigned int __initdata num_processors = 0; - static nodemask_t nodes_parsed __initdata; static nodemask_t nodes_found __initdata; static struct node nodes[MAX_NUMNODES] __initdata; static __u8 pxm2node[256] = { [0 ... 255] = 0xff }; +static int node_to_pxm(int n); + +int pxm_to_node(int pxm) +{ + if ((unsigned)pxm >= 256) + return 0; + return pxm2node[pxm]; +} + static __init int setup_node(int pxm) { unsigned node = pxm2node[pxm]; @@ -44,14 +50,14 @@ static __init int setup_node(int pxm) static __init int conflicting_nodes(unsigned long start, unsigned long end) { int i; - for_each_online_node(i) { + for_each_node_mask(i, nodes_parsed) { struct node *nd = &nodes[i]; if (nd->start == nd->end) continue; if (nd->end > start && nd->start < end) - return 1; + return i; if (nd->end == end && nd->start == start) - return 1; + return i; } return -1; } @@ -75,8 +81,11 @@ static __init void cutoff_node(int i, unsigned long start, unsigned long end) static __init void bad_srat(void) { + int i; printk(KERN_ERR "SRAT: SRAT not used.\n"); acpi_numa = -1; + for (i = 0; i < MAX_LOCAL_APIC; i++) + apicid_to_node[i] = NUMA_NO_NODE; } static __init inline int srat_disabled(void) @@ -104,18 +113,10 @@ acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa) bad_srat(); return; } - if (num_processors >= NR_CPUS) { - printk(KERN_ERR "SRAT: Processor #%d (lapic %u) INVALID. (Max ID: %d).\n", - num_processors, pa->apic_id, NR_CPUS); - bad_srat(); - return; - } - cpu_to_node[num_processors] = node; + apicid_to_node[pa->apic_id] = node; acpi_numa = 1; - printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> CPU %u -> Node %u\n", - pxm, pa->apic_id, num_processors, node); - - num_processors++; + printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n", + pxm, pa->apic_id, node); } /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ @@ -143,10 +144,15 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) printk(KERN_INFO "SRAT: hot plug zone found %lx - %lx \n", start, end); i = conflicting_nodes(start, end); - if (i >= 0) { + if (i == node) { + printk(KERN_WARNING + "SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n", + pxm, start, end, nodes[i].start, nodes[i].end); + } else if (i >= 0) { printk(KERN_ERR - "SRAT: pxm %d overlap %lx-%lx with node %d(%Lx-%Lx)\n", - pxm, start, end, i, nodes[i].start, nodes[i].end); + "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n", + pxm, start, end, node_to_pxm(i), + nodes[i].start, nodes[i].end); bad_srat(); return; } @@ -174,6 +180,14 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) int i; if (acpi_numa <= 0) return -1; + + /* First clean up the node list */ + for_each_node_mask(i, nodes_parsed) { + cutoff_node(i, start, end); + if (nodes[i].start == nodes[i].end) + node_clear(i, nodes_parsed); + } + memnode_shift = compute_hash_shift(nodes, nodes_weight(nodes_parsed)); if (memnode_shift < 0) { printk(KERN_ERR @@ -181,16 +195,10 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) bad_srat(); return -1; } - for (i = 0; i < MAX_NUMNODES; i++) { - if (!node_isset(i, nodes_parsed)) - continue; - cutoff_node(i, start, end); - if (nodes[i].start == nodes[i].end) { - node_clear(i, nodes_parsed); - continue; - } + + /* Finally register nodes */ + for_each_node_mask(i, nodes_parsed) setup_node_bootmem(i, nodes[i].start, nodes[i].end); - } for (i = 0; i < NR_CPUS; i++) { if (cpu_to_node[i] == NUMA_NO_NODE) continue; @@ -201,7 +209,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) return 0; } -int node_to_pxm(int n) +static int node_to_pxm(int n) { int i; if (pxm2node[n] == n) diff --git a/arch/x86_64/pci/k8-bus.c b/arch/x86_64/pci/k8-bus.c index d80c323669e..3acf60ded2a 100644 --- a/arch/x86_64/pci/k8-bus.c +++ b/arch/x86_64/pci/k8-bus.c @@ -58,10 +58,16 @@ fill_mp_bus_to_cpumask(void) for (j = SECONDARY_LDT_BUS_NUMBER(ldtbus); j <= SUBORDINATE_LDT_BUS_NUMBER(ldtbus); j++) { - int node = NODE_ID(nid); + struct pci_bus *bus; + long node = NODE_ID(nid); + /* Algorithm a bit dumb, but + it shouldn't matter here */ + bus = pci_find_bus(0, j); + if (!bus) + continue; if (!node_online(node)) node = 0; - pci_bus_to_node[j] = node; + bus->sysdata = (void *)node; } } } diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c index 657e88aa090..a0838c4a94e 100644 --- a/arch/x86_64/pci/mmconfig.c +++ b/arch/x86_64/pci/mmconfig.c @@ -111,13 +111,6 @@ static int __init pci_mmcfg_init(void) (pci_mmcfg_config[0].base_address == 0)) return 0; - /* Kludge for now. Don't use mmconfig on AMD systems because - those have some busses where mmconfig doesn't work, - and we don't parse ACPI MCFG well enough to handle that. - Remove when proper handling is added. */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) - return 0; - /* RED-PEN i386 doesn't do _nocache right now */ pci_mmcfg_virt = kmalloc(sizeof(*pci_mmcfg_virt) * pci_mmcfg_config_num, GFP_KERNEL); if (pci_mmcfg_virt == NULL) { |