Diffstat (limited to 'arch/x86_64')
28 files changed, 602 insertions, 433 deletions
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index 4b8326177c5..660a03a89e6 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -329,12 +329,15 @@ config HPET_EMULATE_RTC config GART_IOMMU bool "IOMMU support" + default y depends on PCI help - Support the K8 IOMMU. Needed to run systems with more than 4GB of memory + Support the IOMMU. Needed to run systems with more than 3GB of memory properly with 32-bit PCI devices that do not support DAC (Double Address Cycle). The IOMMU can be turned off at runtime with the iommu=off parameter. Normally the kernel will take the right choice by itself. + This option includes a driver for the AMD Opteron/Athlon64 IOMMU + and a software emulation used on some other systems. If unsure, say Y. # need this always enabled with GART_IOMMU for the VIA workaround diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile index 42891569767..4c6ed96d5f7 100644 --- a/arch/x86_64/Makefile +++ b/arch/x86_64/Makefile @@ -21,18 +21,6 @@ # # $Id: Makefile,v 1.31 2002/03/22 15:56:07 ak Exp $ -# -# early bootup linking needs 32bit. You can either use real 32bit tools -# here or 64bit tools in 32bit mode. -# -IA32_CC := $(CC) $(CPPFLAGS) -m32 -O2 -fomit-frame-pointer -IA32_LD := $(LD) -m elf_i386 -IA32_AS := $(CC) $(AFLAGS) -m32 -Wa,--32 -traditional -c -IA32_OBJCOPY := $(CROSS_COMPILE)objcopy -IA32_CPP := $(CROSS_COMPILE)gcc -m32 -E -export IA32_CC IA32_LD IA32_AS IA32_OBJCOPY IA32_CPP - - LDFLAGS := -m elf_x86_64 OBJCOPYFLAGS := -O binary -R .note -R .comment -S LDFLAGS_vmlinux := diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig index 569595b74c7..776f3c866b7 100644 --- a/arch/x86_64/defconfig +++ b/arch/x86_64/defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.12-rc4 -# Fri May 13 06:39:11 2005 +# Linux kernel version: 2.6.13-rc3 +# Fri Jul 22 16:47:31 2005 # CONFIG_X86_64=y CONFIG_64BIT=y @@ -84,14 +84,27 @@ CONFIG_X86_IO_APIC=y CONFIG_X86_LOCAL_APIC=y CONFIG_MTRR=y CONFIG_SMP=y -# CONFIG_PREEMPT is not set CONFIG_SCHED_SMT=y +CONFIG_PREEMPT_NONE=y +# CONFIG_PREEMPT_VOLUNTARY is not set +# CONFIG_PREEMPT is not set +CONFIG_PREEMPT_BKL=y CONFIG_K8_NUMA=y # CONFIG_NUMA_EMU is not set -CONFIG_DISCONTIGMEM=y +CONFIG_ARCH_DISCONTIGMEM_ENABLE=y CONFIG_NUMA=y +CONFIG_ARCH_DISCONTIGMEM_DEFAULT=y +CONFIG_ARCH_SPARSEMEM_ENABLE=y +CONFIG_SELECT_MEMORY_MODEL=y +# CONFIG_FLATMEM_MANUAL is not set +CONFIG_DISCONTIGMEM_MANUAL=y +# CONFIG_SPARSEMEM_MANUAL is not set +CONFIG_DISCONTIGMEM=y +CONFIG_FLAT_NODE_MEM_MAP=y +CONFIG_NEED_MULTIPLE_NODES=y +CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y CONFIG_HAVE_DEC_LOCK=y -CONFIG_NR_CPUS=8 +CONFIG_NR_CPUS=32 CONFIG_HPET_TIMER=y CONFIG_X86_PM_TIMER=y CONFIG_HPET_EMULATE_RTC=y @@ -99,7 +112,13 @@ CONFIG_GART_IOMMU=y CONFIG_SWIOTLB=y CONFIG_X86_MCE=y CONFIG_X86_MCE_INTEL=y +CONFIG_PHYSICAL_START=0x100000 +# CONFIG_KEXEC is not set CONFIG_SECCOMP=y +# CONFIG_HZ_100 is not set +CONFIG_HZ_250=y +# CONFIG_HZ_1000 is not set +CONFIG_HZ=250 CONFIG_GENERIC_HARDIRQS=y CONFIG_GENERIC_IRQ_PROBE=y CONFIG_ISA_DMA_API=y @@ -118,12 +137,11 @@ CONFIG_PM_STD_PARTITION="" CONFIG_ACPI=y CONFIG_ACPI_BOOT=y CONFIG_ACPI_INTERPRETER=y -CONFIG_ACPI_SLEEP=y -CONFIG_ACPI_SLEEP_PROC_FS=y CONFIG_ACPI_AC=y CONFIG_ACPI_BATTERY=y CONFIG_ACPI_BUTTON=y # CONFIG_ACPI_VIDEO is not set +CONFIG_ACPI_HOTKEY=m CONFIG_ACPI_FAN=y CONFIG_ACPI_PROCESSOR=y CONFIG_ACPI_THERMAL=y @@ -154,6 +172,7 @@ CONFIG_CPU_FREQ_GOV_PERFORMANCE=y # CONFIG_CPU_FREQ_GOV_POWERSAVE is not set CONFIG_CPU_FREQ_GOV_USERSPACE=y 
CONFIG_CPU_FREQ_GOV_ONDEMAND=y +# CONFIG_CPU_FREQ_GOV_CONSERVATIVE is not set # # CPUFreq processor drivers @@ -204,6 +223,76 @@ CONFIG_SYSVIPC_COMPAT=y CONFIG_UID16=y # +# Networking +# +CONFIG_NET=y + +# +# Networking options +# +CONFIG_PACKET=y +# CONFIG_PACKET_MMAP is not set +CONFIG_UNIX=y +# CONFIG_NET_KEY is not set +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +# CONFIG_IP_ADVANCED_ROUTER is not set +CONFIG_IP_FIB_HASH=y +# CONFIG_IP_PNP is not set +# CONFIG_NET_IPIP is not set +# CONFIG_NET_IPGRE is not set +# CONFIG_IP_MROUTE is not set +# CONFIG_ARPD is not set +# CONFIG_SYN_COOKIES is not set +# CONFIG_INET_AH is not set +# CONFIG_INET_ESP is not set +# CONFIG_INET_IPCOMP is not set +# CONFIG_INET_TUNNEL is not set +CONFIG_IP_TCPDIAG=y +CONFIG_IP_TCPDIAG_IPV6=y +# CONFIG_TCP_CONG_ADVANCED is not set +CONFIG_TCP_CONG_BIC=y +CONFIG_IPV6=y +# CONFIG_IPV6_PRIVACY is not set +# CONFIG_INET6_AH is not set +# CONFIG_INET6_ESP is not set +# CONFIG_INET6_IPCOMP is not set +# CONFIG_INET6_TUNNEL is not set +# CONFIG_IPV6_TUNNEL is not set +# CONFIG_NETFILTER is not set + +# +# SCTP Configuration (EXPERIMENTAL) +# +# CONFIG_IP_SCTP is not set +# CONFIG_ATM is not set +# CONFIG_BRIDGE is not set +# CONFIG_VLAN_8021Q is not set +# CONFIG_DECNET is not set +# CONFIG_LLC2 is not set +# CONFIG_IPX is not set +# CONFIG_ATALK is not set +# CONFIG_X25 is not set +# CONFIG_LAPB is not set +# CONFIG_NET_DIVERT is not set +# CONFIG_ECONET is not set +# CONFIG_WAN_ROUTER is not set +# CONFIG_NET_SCHED is not set +# CONFIG_NET_CLS_ROUTE is not set + +# +# Network testing +# +# CONFIG_NET_PKTGEN is not set +CONFIG_NETPOLL=y +# CONFIG_NETPOLL_RX is not set +# CONFIG_NETPOLL_TRAP is not set +CONFIG_NET_POLL_CONTROLLER=y +# CONFIG_HAMRADIO is not set +# CONFIG_IRDA is not set +# CONFIG_BT is not set + +# # Device Drivers # @@ -308,6 +397,7 @@ CONFIG_BLK_DEV_AMD74XX=y # CONFIG_BLK_DEV_HPT366 is not set # CONFIG_BLK_DEV_SC1200 is not set CONFIG_BLK_DEV_PIIX=y +# CONFIG_BLK_DEV_IT821X is not set # CONFIG_BLK_DEV_NS87415 is not set # CONFIG_BLK_DEV_PDC202XX_OLD is not set CONFIG_BLK_DEV_PDC202XX_NEW=y @@ -338,6 +428,7 @@ CONFIG_BLK_DEV_SD=y # CONFIG_CHR_DEV_OSST is not set # CONFIG_BLK_DEV_SR is not set # CONFIG_CHR_DEV_SG is not set +# CONFIG_CHR_DEV_SCH is not set # # Some SCSI devices (e.g. 
CD jukebox) support multiple LUNs @@ -372,7 +463,6 @@ CONFIG_AIC79XX_DEBUG_MASK=0 # CONFIG_MEGARAID_NEWGEN is not set # CONFIG_MEGARAID_LEGACY is not set CONFIG_SCSI_SATA=y -# CONFIG_SCSI_SATA_AHCI is not set # CONFIG_SCSI_SATA_SVW is not set CONFIG_SCSI_ATA_PIIX=y # CONFIG_SCSI_SATA_NV is not set @@ -410,14 +500,21 @@ CONFIG_SCSI_QLA2XXX=y # # Multi-device support (RAID and LVM) # -# CONFIG_MD is not set +CONFIG_MD=y +# CONFIG_BLK_DEV_MD is not set +CONFIG_BLK_DEV_DM=y +# CONFIG_DM_CRYPT is not set +# CONFIG_DM_SNAPSHOT is not set +# CONFIG_DM_MIRROR is not set +# CONFIG_DM_ZERO is not set +# CONFIG_DM_MULTIPATH is not set # # Fusion MPT device support # -CONFIG_FUSION=y -CONFIG_FUSION_MAX_SGE=40 -# CONFIG_FUSION_CTL is not set +# CONFIG_FUSION is not set +# CONFIG_FUSION_SPI is not set +# CONFIG_FUSION_FC is not set # # IEEE 1394 (FireWire) support @@ -430,75 +527,8 @@ CONFIG_FUSION_MAX_SGE=40 # CONFIG_I2O is not set # -# Networking support -# -CONFIG_NET=y - -# -# Networking options -# -CONFIG_PACKET=y -# CONFIG_PACKET_MMAP is not set -CONFIG_UNIX=y -# CONFIG_NET_KEY is not set -CONFIG_INET=y -CONFIG_IP_MULTICAST=y -# CONFIG_IP_ADVANCED_ROUTER is not set -# CONFIG_IP_PNP is not set -# CONFIG_NET_IPIP is not set -# CONFIG_NET_IPGRE is not set -# CONFIG_IP_MROUTE is not set -# CONFIG_ARPD is not set -# CONFIG_SYN_COOKIES is not set -# CONFIG_INET_AH is not set -# CONFIG_INET_ESP is not set -# CONFIG_INET_IPCOMP is not set -# CONFIG_INET_TUNNEL is not set -CONFIG_IP_TCPDIAG=y -CONFIG_IP_TCPDIAG_IPV6=y -CONFIG_IPV6=y -# CONFIG_IPV6_PRIVACY is not set -# CONFIG_INET6_AH is not set -# CONFIG_INET6_ESP is not set -# CONFIG_INET6_IPCOMP is not set -# CONFIG_INET6_TUNNEL is not set -# CONFIG_IPV6_TUNNEL is not set -# CONFIG_NETFILTER is not set - -# -# SCTP Configuration (EXPERIMENTAL) +# Network device support # -# CONFIG_IP_SCTP is not set -# CONFIG_ATM is not set -# CONFIG_BRIDGE is not set -# CONFIG_VLAN_8021Q is not set -# CONFIG_DECNET is not set -# CONFIG_LLC2 is not set -# CONFIG_IPX is not set -# CONFIG_ATALK is not set -# CONFIG_X25 is not set -# CONFIG_LAPB is not set -# CONFIG_NET_DIVERT is not set -# CONFIG_ECONET is not set -# CONFIG_WAN_ROUTER is not set - -# -# QoS and/or fair queueing -# -# CONFIG_NET_SCHED is not set -# CONFIG_NET_CLS_ROUTE is not set - -# -# Network testing -# -# CONFIG_NET_PKTGEN is not set -CONFIG_NETPOLL=y -# CONFIG_NETPOLL_RX is not set -# CONFIG_NETPOLL_TRAP is not set -CONFIG_NET_POLL_CONTROLLER=y -# CONFIG_HAMRADIO is not set -# CONFIG_IRDA is not set -# CONFIG_BT is not set CONFIG_NETDEVICES=y # CONFIG_DUMMY is not set # CONFIG_BONDING is not set @@ -517,7 +547,9 @@ CONFIG_NET_ETHERNET=y CONFIG_MII=y # CONFIG_HAPPYMEAL is not set # CONFIG_SUNGEM is not set -# CONFIG_NET_VENDOR_3COM is not set +CONFIG_NET_VENDOR_3COM=y +CONFIG_VORTEX=y +# CONFIG_TYPHOON is not set # # Tulip family network device support @@ -532,7 +564,7 @@ CONFIG_NET_PCI=y CONFIG_FORCEDETH=y # CONFIG_DGRS is not set # CONFIG_EEPRO100 is not set -# CONFIG_E100 is not set +CONFIG_E100=y # CONFIG_FEALNX is not set # CONFIG_NATSEMI is not set # CONFIG_NE2K_PCI is not set @@ -553,14 +585,15 @@ CONFIG_8139TOO=y # CONFIG_ACENIC is not set # CONFIG_DL2K is not set CONFIG_E1000=y -# CONFIG_E1000_NAPI is not set # CONFIG_NS83820 is not set # CONFIG_HAMACHI is not set # CONFIG_YELLOWFIN is not set # CONFIG_R8169 is not set +# CONFIG_SKGE is not set # CONFIG_SK98LIN is not set # CONFIG_VIA_VELOCITY is not set CONFIG_TIGON3=y +# CONFIG_BNX2 is not set # # Ethernet (10000 Mbit) @@ -647,7 +680,6 @@ 
CONFIG_SERIO_I8042=y CONFIG_SERIO_LIBPS2=y # CONFIG_SERIO_RAW is not set # CONFIG_GAMEPORT is not set -CONFIG_SOUND_GAMEPORT=y # # Character devices @@ -716,6 +748,7 @@ CONFIG_MAX_RAW_DEVS=256 # I2C support # # CONFIG_I2C is not set +# CONFIG_I2C_SENSOR is not set # # Dallas's 1-wire bus @@ -723,6 +756,12 @@ CONFIG_MAX_RAW_DEVS=256 # CONFIG_W1 is not set # +# Hardware Monitoring support +# +CONFIG_HWMON=y +# CONFIG_HWMON_DEBUG_CHIP is not set + +# # Misc devices # # CONFIG_IBM_ASM is not set @@ -808,6 +847,7 @@ CONFIG_USB_DEVICEFS=y CONFIG_USB_EHCI_HCD=y # CONFIG_USB_EHCI_SPLIT_ISO is not set # CONFIG_USB_EHCI_ROOT_HUB_TT is not set +# CONFIG_USB_ISP116X_HCD is not set CONFIG_USB_OHCI_HCD=y # CONFIG_USB_OHCI_BIG_ENDIAN is not set CONFIG_USB_OHCI_LITTLE_ENDIAN=y @@ -846,12 +886,15 @@ CONFIG_USB_HIDINPUT=y # CONFIG_USB_HIDDEV is not set # CONFIG_USB_AIPTEK is not set # CONFIG_USB_WACOM is not set +# CONFIG_USB_ACECAD is not set # CONFIG_USB_KBTAB is not set # CONFIG_USB_POWERMATE is not set # CONFIG_USB_MTOUCH is not set +# CONFIG_USB_ITMTOUCH is not set # CONFIG_USB_EGALAX is not set # CONFIG_USB_XPAD is not set # CONFIG_USB_ATI_REMOTE is not set +# CONFIG_USB_KEYSPAN_REMOTE is not set # # USB Imaging devices @@ -902,10 +945,11 @@ CONFIG_USB_MON=y # CONFIG_USB_PHIDGETSERVO is not set # CONFIG_USB_IDMOUSE is not set # CONFIG_USB_SISUSBVGA is not set +# CONFIG_USB_LD is not set # CONFIG_USB_TEST is not set # -# USB ATM/DSL drivers +# USB DSL modem support # # @@ -924,6 +968,10 @@ CONFIG_USB_MON=y # CONFIG_INFINIBAND is not set # +# SN Devices +# + +# # Firmware Drivers # # CONFIG_EDD is not set @@ -935,6 +983,7 @@ CONFIG_EXT2_FS=y CONFIG_EXT2_FS_XATTR=y CONFIG_EXT2_FS_POSIX_ACL=y # CONFIG_EXT2_FS_SECURITY is not set +# CONFIG_EXT2_FS_XIP is not set CONFIG_EXT3_FS=y CONFIG_EXT3_FS_XATTR=y CONFIG_EXT3_FS_POSIX_ACL=y @@ -957,6 +1006,7 @@ CONFIG_FS_POSIX_ACL=y # CONFIG_XFS_FS is not set # CONFIG_MINIX_FS is not set # CONFIG_ROMFS_FS is not set +CONFIG_INOTIFY=y # CONFIG_QUOTA is not set CONFIG_DNOTIFY=y CONFIG_AUTOFS_FS=y @@ -986,7 +1036,6 @@ CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1" CONFIG_PROC_FS=y CONFIG_PROC_KCORE=y CONFIG_SYSFS=y -# CONFIG_DEVFS_FS is not set # CONFIG_DEVPTS_FS_XATTR is not set CONFIG_TMPFS=y # CONFIG_TMPFS_XATTR is not set @@ -1016,15 +1065,18 @@ CONFIG_RAMFS=y # CONFIG_NFS_FS=y CONFIG_NFS_V3=y +# CONFIG_NFS_V3_ACL is not set # CONFIG_NFS_V4 is not set # CONFIG_NFS_DIRECTIO is not set CONFIG_NFSD=y CONFIG_NFSD_V3=y +# CONFIG_NFSD_V3_ACL is not set # CONFIG_NFSD_V4 is not set CONFIG_NFSD_TCP=y CONFIG_LOCKD=y CONFIG_LOCKD_V4=y CONFIG_EXPORTFS=y +CONFIG_NFS_COMMON=y CONFIG_SUNRPC=y # CONFIG_RPCSEC_GSS_KRB5 is not set # CONFIG_RPCSEC_GSS_SPKM3 is not set diff --git a/arch/x86_64/ia32/Makefile b/arch/x86_64/ia32/Makefile index a12b19da4b5..f76217d8f57 100644 --- a/arch/x86_64/ia32/Makefile +++ b/arch/x86_64/ia32/Makefile @@ -4,14 +4,14 @@ obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_ioctl.o \ ia32_signal.o tls32.o \ - ia32_binfmt.o fpu32.o ptrace32.o syscall32.o + ia32_binfmt.o fpu32.o ptrace32.o syscall32.o syscall32_syscall.o sysv-$(CONFIG_SYSVIPC) := ipc32.o obj-$(CONFIG_IA32_EMULATION) += $(sysv-y) obj-$(CONFIG_IA32_AOUT) += ia32_aout.o -$(obj)/syscall32.o: $(src)/syscall32.c \ +$(obj)/syscall32_syscall.o: \ $(foreach F,sysenter syscall,$(obj)/vsyscall-$F.so) # Teach kbuild about targets diff --git a/arch/x86_64/ia32/ia32_aout.c b/arch/x86_64/ia32/ia32_aout.c index c12edf5d97f..3e6780fa018 100644 --- a/arch/x86_64/ia32/ia32_aout.c +++ 
b/arch/x86_64/ia32/ia32_aout.c @@ -42,7 +42,7 @@ extern int ia32_setup_arg_pages(struct linux_binprm *bprm, static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs); static int load_aout_library(struct file*); -#if CORE_DUMP +#ifdef CORE_DUMP static int aout_core_dump(long signr, struct pt_regs * regs, struct file *file); /* @@ -103,7 +103,7 @@ static struct linux_binfmt aout_format = { .module = THIS_MODULE, .load_binary = load_aout_binary, .load_shlib = load_aout_library, -#if CORE_DUMP +#ifdef CORE_DUMP .core_dump = aout_core_dump, #endif .min_coredump = PAGE_SIZE @@ -120,7 +120,7 @@ static void set_brk(unsigned long start, unsigned long end) up_write(&current->mm->mmap_sem); } -#if CORE_DUMP +#ifdef CORE_DUMP /* * These are the only things you should do on a core-file: use only these * macros to write out all the necessary info. diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index cc935427d53..c45d6a05b98 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -591,11 +591,15 @@ ia32_sys_call_table: .quad compat_sys_mq_getsetattr .quad compat_sys_kexec_load /* reserved for kexec */ .quad compat_sys_waitid - .quad quiet_ni_syscall /* sys_altroot */ + .quad quiet_ni_syscall /* 285: sys_altroot */ .quad sys_add_key .quad sys_request_key .quad sys_keyctl - /* don't forget to change IA32_NR_syscalls */ + .quad sys_ioprio_set + .quad sys_ioprio_get /* 290 */ + .quad sys_inotify_init + .quad sys_inotify_add_watch + .quad sys_inotify_rm_watch ia32_syscall_end: .rept IA32_NR_syscalls-(ia32_syscall_end-ia32_sys_call_table)/8 .quad ni_syscall diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c index 68a9ab06ee7..be996d1b691 100644 --- a/arch/x86_64/ia32/sys_ia32.c +++ b/arch/x86_64/ia32/sys_ia32.c @@ -61,6 +61,7 @@ #include <linux/ptrace.h> #include <linux/highuid.h> #include <linux/vmalloc.h> +#include <linux/fsnotify.h> #include <asm/mman.h> #include <asm/types.h> #include <asm/uaccess.h> @@ -984,8 +985,10 @@ asmlinkage long sys32_open(const char __user * filename, int flags, int mode) if (IS_ERR(f)) { put_unused_fd(fd); fd = error; - } else + } else { + fsnotify_open(f->f_dentry); fd_install(fd, f); + } } putname(tmp); } diff --git a/arch/x86_64/ia32/syscall32.c b/arch/x86_64/ia32/syscall32.c index 01d8db1a1c0..adbc5f8089e 100644 --- a/arch/x86_64/ia32/syscall32.c +++ b/arch/x86_64/ia32/syscall32.c @@ -14,16 +14,6 @@ #include <asm/tlbflush.h> #include <asm/ia32_unistd.h> -/* 32bit VDSOs mapped into user space. 
*/ -asm(".section \".init.data\",\"aw\"\n" - "syscall32_syscall:\n" - ".incbin \"arch/x86_64/ia32/vsyscall-syscall.so\"\n" - "syscall32_syscall_end:\n" - "syscall32_sysenter:\n" - ".incbin \"arch/x86_64/ia32/vsyscall-sysenter.so\"\n" - "syscall32_sysenter_end:\n" - ".previous"); - extern unsigned char syscall32_syscall[], syscall32_syscall_end[]; extern unsigned char syscall32_sysenter[], syscall32_sysenter_end[]; extern int sysctl_vsyscall32; @@ -57,6 +47,7 @@ int syscall32_setup_pages(struct linux_binprm *bprm, int exstack) int npages = (VSYSCALL32_END - VSYSCALL32_BASE) >> PAGE_SHIFT; struct vm_area_struct *vma; struct mm_struct *mm = current->mm; + int ret; vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); if (!vma) @@ -78,7 +69,11 @@ int syscall32_setup_pages(struct linux_binprm *bprm, int exstack) vma->vm_mm = mm; down_write(&mm->mmap_sem); - insert_vm_struct(mm, vma); + if ((ret = insert_vm_struct(mm, vma))) { + up_write(&mm->mmap_sem); + kmem_cache_free(vm_area_cachep, vma); + return ret; + } mm->total_vm += npages; up_write(&mm->mmap_sem); return 0; diff --git a/arch/x86_64/ia32/syscall32_syscall.S b/arch/x86_64/ia32/syscall32_syscall.S new file mode 100644 index 00000000000..8f8271bdf13 --- /dev/null +++ b/arch/x86_64/ia32/syscall32_syscall.S @@ -0,0 +1,17 @@ +/* 32bit VDSOs mapped into user space. */ + + .section ".init.data","aw" + + .globl syscall32_syscall + .globl syscall32_syscall_end + +syscall32_syscall: + .incbin "arch/x86_64/ia32/vsyscall-syscall.so" +syscall32_syscall_end: + + .globl syscall32_sysenter + .globl syscall32_sysenter_end + +syscall32_sysenter: + .incbin "arch/x86_64/ia32/vsyscall-sysenter.so" +syscall32_sysenter_end: diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index 28817490fdc..096d470e280 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -76,7 +76,7 @@ .macro FAKE_STACK_FRAME child_rip /* push in order ss, rsp, eflags, cs, rip */ - xorq %rax, %rax + xorl %eax, %eax pushq %rax /* ss */ CFI_ADJUST_CFA_OFFSET 8 pushq %rax /* rsp */ @@ -423,7 +423,7 @@ ENTRY(stub_rt_sigreturn) testl $3,CS(%rdi) je 1f swapgs -1: addl $1,%gs:pda_irqcount # RED-PEN should check preempt count +1: incl %gs:pda_irqcount # RED-PEN should check preempt count movq %gs:pda_irqstackptr,%rax cmoveq %rax,%rsp pushq %rdi # save old stack @@ -436,7 +436,7 @@ ENTRY(common_interrupt) ret_from_intr: popq %rdi cli - subl $1,%gs:pda_irqcount + decl %gs:pda_irqcount #ifdef CONFIG_DEBUG_INFO movq RBP(%rdi),%rbp #endif @@ -494,7 +494,7 @@ retint_signal: sti SAVE_REST movq $-1,ORIG_RAX(%rsp) - xorq %rsi,%rsi # oldset + xorl %esi,%esi # oldset movq %rsp,%rdi # &pt_regs call do_notify_resume RESTORE_REST @@ -752,7 +752,7 @@ child_rip: movq %rsi, %rdi call *%rax # exit - xorq %rdi, %rdi + xorl %edi, %edi call do_exit /* @@ -918,3 +918,15 @@ ENTRY(machine_check) ENTRY(call_debug) zeroentry do_call_debug +ENTRY(call_softirq) + movq %gs:pda_irqstackptr,%rax + pushq %r15 + movq %rsp,%r15 + incl %gs:pda_irqcount + cmove %rax,%rsp + call __do_softirq + movq %r15,%rsp + decl %gs:pda_irqcount + popq %r15 + ret + diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c index 69b9c25a8fc..30c843a5efd 100644 --- a/arch/x86_64/kernel/genapic.c +++ b/arch/x86_64/kernel/genapic.c @@ -31,6 +31,7 @@ u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... 
NR_CPUS-1] = BAD_APICID }; extern struct genapic apic_cluster; extern struct genapic apic_flat; +extern struct genapic apic_physflat; struct genapic *genapic = &apic_flat; @@ -44,12 +45,7 @@ void __init clustered_apic_check(void) u8 clusters, max_cluster; u8 id; u8 cluster_cnt[NUM_APIC_CLUSTERS]; - - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { - /* AMD always uses flat mode right now */ - genapic = &apic_flat; - goto print; - } + int num_cpus = 0; #if defined(CONFIG_ACPI_BUS) /* @@ -64,15 +60,34 @@ void __init clustered_apic_check(void) #endif memset(cluster_cnt, 0, sizeof(cluster_cnt)); - for (i = 0; i < NR_CPUS; i++) { id = bios_cpu_apicid[i]; - if (id != BAD_APICID) - cluster_cnt[APIC_CLUSTERID(id)]++; + if (id == BAD_APICID) + continue; + num_cpus++; + cluster_cnt[APIC_CLUSTERID(id)]++; } + /* Don't use clustered mode on AMD platforms. */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + genapic = &apic_physflat; +#ifndef CONFIG_CPU_HOTPLUG + /* In the CPU hotplug case we cannot use broadcast mode + because that opens a race when a CPU is removed. + Stay at physflat mode in this case. + It is bad to do this unconditionally though. Once + we have ACPI platform support for CPU hotplug + we should detect hotplug capablity from ACPI tables and + only do this when really needed. -AK */ + if (num_cpus <= 8) + genapic = &apic_flat; +#endif + goto print; + } + clusters = 0; max_cluster = 0; + for (i = 0; i < NUM_APIC_CLUSTERS; i++) { if (cluster_cnt[i] > 0) { ++clusters; diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c index 28284696508..adc96282a9e 100644 --- a/arch/x86_64/kernel/genapic_flat.c +++ b/arch/x86_64/kernel/genapic_flat.c @@ -2,13 +2,11 @@ * Copyright 2004 James Cleverdon, IBM. * Subject to the GNU Public License, v.2 * - * Flat APIC subarch code. Maximum 8 CPUs, logical delivery. + * Flat APIC subarch code. * * Hacked for x86-64 by James Cleverdon from i386 architecture code by * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and * James Cleverdon. - * Ashok Raj <ashok.raj@intel.com> - * Removed IPI broadcast shortcut to support CPU hotplug */ #include <linux/config.h> #include <linux/threads.h> @@ -20,47 +18,6 @@ #include <asm/smp.h> #include <asm/ipi.h> -/* - * The following permit choosing broadcast IPI shortcut v.s sending IPI only - * to online cpus via the send_IPI_mask varient. - * The mask version is my preferred option, since it eliminates a lot of - * other extra code that would need to be written to cleanup intrs sent - * to a CPU while offline. - * - * Sending broadcast introduces lots of trouble in CPU hotplug situations. - * These IPI's are delivered to cpu's irrespective of their offline status - * and could pickup stale intr data when these CPUS are turned online. - * - * Not using broadcast is a cleaner approach IMO, but Andi Kleen disagrees with - * the idea of not using broadcast IPI's anymore. Hence the run time check - * is introduced, on his request so we can choose an alternate mechanism. - * - * Initial wacky performance tests that collect cycle counts show - * no increase in using mask v.s broadcast version. In fact they seem - * identical in terms of cycle counts. - * - * if we need to use broadcast, we need to do the following. 
- * - * cli; - * hold call_lock; - * clear any pending IPI, just ack and clear all pending intr - * set cpu_online_map; - * release call_lock; - * sti; - * - * The complicated dummy irq processing shown above is not required if - * we didnt sent IPI's to wrong CPU's in the first place. - * - * - Ashok Raj <ashok.raj@intel.com> - */ -#ifdef CONFIG_HOTPLUG_CPU -#define DEFAULT_SEND_IPI (1) -#else -#define DEFAULT_SEND_IPI (0) -#endif - -static int no_broadcast=DEFAULT_SEND_IPI; - static cpumask_t flat_target_cpus(void) { return cpu_online_map; @@ -119,37 +76,15 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector) local_irq_restore(flags); } -static inline void __local_flat_send_IPI_allbutself(int vector) -{ - if (no_broadcast) { - cpumask_t mask = cpu_online_map; - int this_cpu = get_cpu(); - - cpu_clear(this_cpu, mask); - flat_send_IPI_mask(mask, vector); - put_cpu(); - } - else - __send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL); -} - -static inline void __local_flat_send_IPI_all(int vector) -{ - if (no_broadcast) - flat_send_IPI_mask(cpu_online_map, vector); - else - __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); -} - static void flat_send_IPI_allbutself(int vector) { if (((num_online_cpus()) - 1) >= 1) - __local_flat_send_IPI_allbutself(vector); + __send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL); } static void flat_send_IPI_all(int vector) { - __local_flat_send_IPI_all(vector); + __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); } static int flat_apic_id_registered(void) @@ -170,16 +105,6 @@ static unsigned int phys_pkg_id(int index_msb) return ((ebx >> 24) & 0xFF) >> index_msb; } -static __init int no_ipi_broadcast(char *str) -{ - get_option(&str, &no_broadcast); - printk ("Using %s mode\n", no_broadcast ? "No IPI Broadcast" : - "IPI Broadcast"); - return 1; -} - -__setup("no_ipi_broadcast", no_ipi_broadcast); - struct genapic apic_flat = { .name = "flat", .int_delivery_mode = dest_LowestPrio, @@ -195,11 +120,62 @@ struct genapic apic_flat = { .phys_pkg_id = phys_pkg_id, }; -static int __init print_ipi_mode(void) +/* + * Physflat mode is used when there are more than 8 CPUs on a AMD system. + * We cannot use logical delivery in this case because the mask + * overflows, so use physical mode. + */ + +static cpumask_t physflat_target_cpus(void) +{ + return cpumask_of_cpu(0); +} + +static void physflat_send_IPI_mask(cpumask_t cpumask, int vector) +{ + send_IPI_mask_sequence(cpumask, vector); +} + +static void physflat_send_IPI_allbutself(int vector) +{ + cpumask_t allbutme = cpu_online_map; + int me = get_cpu(); + cpu_clear(me, allbutme); + physflat_send_IPI_mask(allbutme, vector); + put_cpu(); +} + +static void physflat_send_IPI_all(int vector) { - printk ("Using IPI %s mode\n", no_broadcast ? "No-Shortcut" : - "Shortcut"); - return 0; + physflat_send_IPI_mask(cpu_online_map, vector); } -late_initcall(print_ipi_mode); +static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask) +{ + int cpu; + + /* + * We're using fixed IRQ delivery, can only return one phys APIC ID. + * May as well be the first. 
+ */ + cpu = first_cpu(cpumask); + if ((unsigned)cpu < NR_CPUS) + return x86_cpu_to_apicid[cpu]; + else + return BAD_APICID; +} + +struct genapic apic_physflat = { + .name = "physical flat", + .int_delivery_mode = dest_LowestPrio, + .int_dest_mode = (APIC_DEST_PHYSICAL != 0), + .int_delivery_dest = APIC_DEST_PHYSICAL | APIC_DM_LOWEST, + .target_cpus = physflat_target_cpus, + .apic_id_registered = flat_apic_id_registered, + .init_apic_ldr = flat_init_apic_ldr,/*not needed, but shouldn't hurt*/ + .send_IPI_all = physflat_send_IPI_all, + .send_IPI_allbutself = physflat_send_IPI_allbutself, + .send_IPI_mask = physflat_send_IPI_mask, + .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid, + .phys_pkg_id = phys_pkg_id, +}; diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 8d765aa77a2..98ff5eb32b9 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -137,14 +137,14 @@ startup_64: wrmsr /* Setup cr0 */ - xorq %rax, %rax - btsq $31, %rax /* Enable paging */ - btsq $0, %rax /* Enable protected mode */ - btsq $1, %rax /* Enable MP */ - btsq $4, %rax /* Enable ET */ - btsq $5, %rax /* Enable NE */ - btsq $16, %rax /* Enable WP */ - btsq $18, %rax /* Enable AM */ +#define CR0_PM 1 /* protected mode */ +#define CR0_MP (1<<1) +#define CR0_ET (1<<4) +#define CR0_NE (1<<5) +#define CR0_WP (1<<16) +#define CR0_AM (1<<18) +#define CR0_PAGING (1<<31) + movl $CR0_PM|CR0_MP|CR0_ET|CR0_NE|CR0_WP|CR0_AM|CR0_PAGING,%eax /* Make changes effective */ movq %rax, %cr0 diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c index cc3fb85f514..849a20aec7c 100644 --- a/arch/x86_64/kernel/irq.c +++ b/arch/x86_64/kernel/irq.c @@ -135,3 +135,22 @@ void fixup_irqs(cpumask_t map) local_irq_disable(); } #endif + +extern void call_softirq(void); + +asmlinkage void do_softirq(void) +{ + __u32 pending; + unsigned long flags; + + if (in_interrupt()) + return; + + local_irq_save(flags); + pending = local_softirq_pending(); + /* Switch to interrupt stack */ + if (pending) + call_softirq(); + local_irq_restore(flags); +} +EXPORT_SYMBOL(do_softirq); diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c index 60d1eff4156..89fab51e20f 100644 --- a/arch/x86_64/kernel/machine_kexec.c +++ b/arch/x86_64/kernel/machine_kexec.c @@ -8,43 +8,26 @@ #include <linux/mm.h> #include <linux/kexec.h> -#include <linux/delay.h> #include <linux/string.h> #include <linux/reboot.h> -#include <asm/pda.h> #include <asm/pgtable.h> -#include <asm/pgalloc.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> #include <asm/io.h> -#include <asm/apic.h> -#include <asm/cpufeature.h> -#include <asm/hw_irq.h> - -#define LEVEL0_SIZE (1UL << 12UL) -#define LEVEL1_SIZE (1UL << 21UL) -#define LEVEL2_SIZE (1UL << 30UL) -#define LEVEL3_SIZE (1UL << 39UL) -#define LEVEL4_SIZE (1UL << 48UL) - -#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) -#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE) -#define L2_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) -#define L3_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) - -static void init_level2_page(u64 *level2p, unsigned long addr) + +static void init_level2_page(pmd_t *level2p, unsigned long addr) { unsigned long end_addr; addr &= PAGE_MASK; - end_addr = addr + LEVEL2_SIZE; + end_addr = addr + PUD_SIZE; while (addr < end_addr) { - *(level2p++) = addr | L1_ATTR; - addr += LEVEL1_SIZE; + set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); + addr += 
PMD_SIZE; } } -static int init_level3_page(struct kimage *image, u64 *level3p, +static int init_level3_page(struct kimage *image, pud_t *level3p, unsigned long addr, unsigned long last_addr) { unsigned long end_addr; @@ -52,32 +35,32 @@ static int init_level3_page(struct kimage *image, u64 *level3p, result = 0; addr &= PAGE_MASK; - end_addr = addr + LEVEL3_SIZE; + end_addr = addr + PGDIR_SIZE; while ((addr < last_addr) && (addr < end_addr)) { struct page *page; - u64 *level2p; + pmd_t *level2p; page = kimage_alloc_control_pages(image, 0); if (!page) { result = -ENOMEM; goto out; } - level2p = (u64 *)page_address(page); + level2p = (pmd_t *)page_address(page); init_level2_page(level2p, addr); - *(level3p++) = __pa(level2p) | L2_ATTR; - addr += LEVEL2_SIZE; + set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE)); + addr += PUD_SIZE; } /* clear the unused entries */ while (addr < end_addr) { - *(level3p++) = 0; - addr += LEVEL2_SIZE; + pud_clear(level3p++); + addr += PUD_SIZE; } out: return result; } -static int init_level4_page(struct kimage *image, u64 *level4p, +static int init_level4_page(struct kimage *image, pgd_t *level4p, unsigned long addr, unsigned long last_addr) { unsigned long end_addr; @@ -85,28 +68,28 @@ static int init_level4_page(struct kimage *image, u64 *level4p, result = 0; addr &= PAGE_MASK; - end_addr = addr + LEVEL4_SIZE; + end_addr = addr + (PTRS_PER_PGD * PGDIR_SIZE); while ((addr < last_addr) && (addr < end_addr)) { struct page *page; - u64 *level3p; + pud_t *level3p; page = kimage_alloc_control_pages(image, 0); if (!page) { result = -ENOMEM; goto out; } - level3p = (u64 *)page_address(page); + level3p = (pud_t *)page_address(page); result = init_level3_page(image, level3p, addr, last_addr); if (result) { goto out; } - *(level4p++) = __pa(level3p) | L3_ATTR; - addr += LEVEL3_SIZE; + set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE)); + addr += PGDIR_SIZE; } /* clear the unused entries */ while (addr < end_addr) { - *(level4p++) = 0; - addr += LEVEL3_SIZE; + pgd_clear(level4p++); + addr += PGDIR_SIZE; } out: return result; @@ -115,52 +98,50 @@ out: static int init_pgtable(struct kimage *image, unsigned long start_pgtable) { - u64 *level4p; - level4p = (u64 *)__va(start_pgtable); + pgd_t *level4p; + level4p = (pgd_t *)__va(start_pgtable); return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT); } static void set_idt(void *newidt, u16 limit) { - unsigned char curidt[10]; + struct desc_ptr curidt; /* x86-64 supports unaliged loads & stores */ - (*(u16 *)(curidt)) = limit; - (*(u64 *)(curidt +2)) = (unsigned long)(newidt); + curidt.size = limit; + curidt.address = (unsigned long)newidt; __asm__ __volatile__ ( - "lidt %0\n" - : "=m" (curidt) + "lidtq %0\n" + : : "m" (curidt) ); }; static void set_gdt(void *newgdt, u16 limit) { - unsigned char curgdt[10]; + struct desc_ptr curgdt; /* x86-64 supports unaligned loads & stores */ - (*(u16 *)(curgdt)) = limit; - (*(u64 *)(curgdt +2)) = (unsigned long)(newgdt); + curgdt.size = limit; + curgdt.address = (unsigned long)newgdt; __asm__ __volatile__ ( - "lgdt %0\n" - : "=m" (curgdt) + "lgdtq %0\n" + : : "m" (curgdt) ); }; static void load_segments(void) { __asm__ __volatile__ ( - "\tmovl $"STR(__KERNEL_DS)",%eax\n" - "\tmovl %eax,%ds\n" - "\tmovl %eax,%es\n" - "\tmovl %eax,%ss\n" - "\tmovl %eax,%fs\n" - "\tmovl %eax,%gs\n" + "\tmovl %0,%%ds\n" + "\tmovl %0,%%es\n" + "\tmovl %0,%%ss\n" + "\tmovl %0,%%fs\n" + "\tmovl %0,%%gs\n" + : : "a" (__KERNEL_DS) ); -#undef STR -#undef __STR } typedef NORET_TYPE void 
(*relocate_new_kernel_t)(unsigned long indirection_page, @@ -178,7 +159,7 @@ int machine_kexec_prepare(struct kimage *image) /* Calculate the offsets */ start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; - control_code_buffer = start_pgtable + 4096UL; + control_code_buffer = start_pgtable + PAGE_SIZE; /* Setup the identity mapped 64bit page table */ result = init_pgtable(image, start_pgtable); @@ -214,7 +195,7 @@ NORET_TYPE void machine_kexec(struct kimage *image) /* Calculate the offsets */ page_list = image->head; start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; - control_code_buffer = start_pgtable + 4096UL; + control_code_buffer = start_pgtable + PAGE_SIZE; /* Set the low half of the page table to my identity mapped * page table for kexec. Leave the high half pointing at the diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index 21e70625a49..3b267c91bb0 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -15,6 +15,8 @@ #include <linux/sysdev.h> #include <linux/miscdevice.h> #include <linux/fs.h> +#include <linux/cpu.h> +#include <linux/percpu.h> #include <asm/processor.h> #include <asm/msr.h> #include <asm/mce.h> @@ -514,10 +516,7 @@ static struct sysdev_class mce_sysclass = { set_kset_name("machinecheck"), }; -static struct sys_device device_mce = { - .id = 0, - .cls = &mce_sysclass, -}; +static DEFINE_PER_CPU(struct sys_device, device_mce); /* Why are there no generic functions for this? */ #define ACCESSOR(name, var, start) \ @@ -542,27 +541,83 @@ ACCESSOR(bank4ctl,bank[4],mce_restart()) ACCESSOR(tolerant,tolerant,) ACCESSOR(check_interval,check_interval,mce_restart()) -static __cpuinit int mce_init_device(void) +/* Per cpu sysdev init. All of the cpus still share the same ctl bank */ +static __cpuinit int mce_create_device(unsigned int cpu) { int err; + if (!mce_available(&cpu_data[cpu])) + return -EIO; + + per_cpu(device_mce,cpu).id = cpu; + per_cpu(device_mce,cpu).cls = &mce_sysclass; + + err = sysdev_register(&per_cpu(device_mce,cpu)); + + if (!err) { + sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank0ctl); + sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank1ctl); + sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank2ctl); + sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank3ctl); + sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank4ctl); + sysdev_create_file(&per_cpu(device_mce,cpu), &attr_tolerant); + sysdev_create_file(&per_cpu(device_mce,cpu), &attr_check_interval); + } + return err; +} + +#ifdef CONFIG_HOTPLUG_CPU +static __cpuinit void mce_remove_device(unsigned int cpu) +{ + sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank0ctl); + sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank1ctl); + sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank2ctl); + sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank3ctl); + sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank4ctl); + sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant); + sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval); + sysdev_unregister(&per_cpu(device_mce,cpu)); +} +#endif + +/* Get notified when a cpu comes on/off. Be hotplug friendly. 
*/ +static __cpuinit int +mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + + switch (action) { + case CPU_ONLINE: + mce_create_device(cpu); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DEAD: + mce_remove_device(cpu); + break; +#endif + } + return NOTIFY_OK; +} + +static struct notifier_block mce_cpu_notifier = { + .notifier_call = mce_cpu_callback, +}; + +static __init int mce_init_device(void) +{ + int err; + int i = 0; + if (!mce_available(&boot_cpu_data)) return -EIO; err = sysdev_class_register(&mce_sysclass); - if (!err) - err = sysdev_register(&device_mce); - if (!err) { - /* could create per CPU objects, but it is not worth it. */ - sysdev_create_file(&device_mce, &attr_bank0ctl); - sysdev_create_file(&device_mce, &attr_bank1ctl); - sysdev_create_file(&device_mce, &attr_bank2ctl); - sysdev_create_file(&device_mce, &attr_bank3ctl); - sysdev_create_file(&device_mce, &attr_bank4ctl); - sysdev_create_file(&device_mce, &attr_tolerant); - sysdev_create_file(&device_mce, &attr_check_interval); - } - + + for_each_online_cpu(i) { + mce_create_device(i); + } + + register_cpu_notifier(&mce_cpu_notifier); misc_register(&mce_log_device); return err; - } + device_initcall(mce_init_device); diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index 9c5aa2a790c..08abf9f5b15 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -109,7 +109,7 @@ static int __init mpf_checksum(unsigned char *mp, int len) static void __init MP_processor_info (struct mpc_config_processor *m) { - int ver; + int ver, cpu; static int found_bsp=0; if (!(m->mpc_cpuflag & CPU_ENABLED)) @@ -131,7 +131,7 @@ static void __init MP_processor_info (struct mpc_config_processor *m) return; } - num_processors++; + cpu = num_processors++; if (m->mpc_apicid > MAX_APICS) { printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n", @@ -155,13 +155,18 @@ static void __init MP_processor_info (struct mpc_config_processor *m) * in same order as logical cpu numbers. Hence the first * entry is BSP, and so on. 
*/ + cpu = 0; + bios_cpu_apicid[0] = m->mpc_apicid; x86_cpu_to_apicid[0] = m->mpc_apicid; found_bsp = 1; - } else { - bios_cpu_apicid[num_processors - found_bsp] = m->mpc_apicid; - x86_cpu_to_apicid[num_processors - found_bsp] = m->mpc_apicid; - } + } else + cpu = num_processors - found_bsp; + bios_cpu_apicid[cpu] = m->mpc_apicid; + x86_cpu_to_apicid[cpu] = m->mpc_apicid; + + cpu_set(cpu, cpu_possible_map); + cpu_set(cpu, cpu_present_map); } static void __init MP_bus_info (struct mpc_config_bus *m) diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c index 57e71dbdfd6..47f95687905 100644 --- a/arch/x86_64/kernel/reboot.c +++ b/arch/x86_64/kernel/reboot.c @@ -109,23 +109,10 @@ void machine_shutdown(void) local_irq_enable(); } -void machine_restart(char * __unused) +void machine_emergency_restart(void) { int i; - printk("machine restart\n"); - - machine_shutdown(); - - if (!reboot_force) { - local_irq_disable(); -#ifndef CONFIG_SMP - disable_local_APIC(); -#endif - disable_IO_APIC(); - local_irq_enable(); - } - /* Tell the BIOS if we want cold or warm reboot */ *((unsigned short *)__va(0x472)) = reboot_mode; @@ -150,18 +137,26 @@ void machine_restart(char * __unused) } } -EXPORT_SYMBOL(machine_restart); +void machine_restart(char * __unused) +{ + printk("machine restart\n"); + + if (!reboot_force) { + machine_shutdown(); + } + machine_emergency_restart(); +} void machine_halt(void) { } -EXPORT_SYMBOL(machine_halt); - void machine_power_off(void) { + if (!reboot_force) { + machine_shutdown(); + } if (pm_power_off) pm_power_off(); } -EXPORT_SYMBOL(machine_power_off); diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 5fd03225058..0aa526298e9 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -765,8 +765,6 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c) int cpu = smp_processor_id(); int node = 0; unsigned bits; - if (c->x86_num_cores == 1) - return; bits = 0; while ((1 << bits) < c->x86_num_cores) @@ -1081,8 +1079,7 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c) else mtrr_ap_init(); #ifdef CONFIG_NUMA - if (c != &boot_cpu_data) - numa_add_cpu(c - cpu_data); + numa_add_cpu(smp_processor_id()); #endif } diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index f2b9c6bc999..34082c1cc41 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -12,6 +12,7 @@ #include <linux/string.h> #include <linux/bootmem.h> #include <linux/bitops.h> +#include <linux/module.h> #include <asm/bootsetup.h> #include <asm/pda.h> #include <asm/pgtable.h> @@ -22,10 +23,8 @@ #include <asm/smp.h> #include <asm/i387.h> #include <asm/percpu.h> -#include <asm/mtrr.h> #include <asm/proto.h> -#include <asm/mman.h> -#include <asm/numa.h> +#include <asm/sections.h> char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,}; @@ -33,11 +32,6 @@ cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; struct x8664_pda cpu_pda[NR_CPUS] __cacheline_aligned; -extern struct task_struct init_task; - -extern unsigned char __per_cpu_start[], __per_cpu_end[]; - -extern struct desc_ptr cpu_gdt_descr[]; struct desc_ptr idt_descr = { 256 * 16, (unsigned long) idt_table }; char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned"))); @@ -101,7 +95,7 @@ void __init setup_per_cpu_areas(void) #endif for (i = 0; i < NR_CPUS; i++) { - unsigned char *ptr; + char *ptr; if (!NODE_DATA(cpu_to_node(i))) { printk("cpu with no node %d, num_online_nodes %d\n", @@ -190,11 +184,7 @@ void __cpuinit check_efer(void) 
*/ void __cpuinit cpu_init (void) { -#ifdef CONFIG_SMP int cpu = stack_smp_processor_id(); -#else - int cpu = smp_processor_id(); -#endif struct tss_struct *t = &per_cpu(init_tss, cpu); unsigned long v; char *estacks = NULL; @@ -214,7 +204,7 @@ void __cpuinit cpu_init (void) printk("Initializing CPU#%d\n", cpu); - clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); /* * Initialize the per-CPU GDT with the boot GDT, diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c index ccae392886a..e5958220d6b 100644 --- a/arch/x86_64/kernel/smp.c +++ b/arch/x86_64/kernel/smp.c @@ -129,10 +129,9 @@ asmlinkage void smp_invalidate_interrupt (void) } else leave_mm(cpu); } +out: ack_APIC_irq(); cpu_clear(cpu, flush_cpumask); - -out: put_cpu_no_resched(); } @@ -294,6 +293,69 @@ void unlock_ipi_call_lock(void) } /* + * this function sends a 'generic call function' IPI to one other CPU + * in the system. + */ +static void __smp_call_function_single (int cpu, void (*func) (void *info), void *info, + int nonatomic, int wait) +{ + struct call_data_struct data; + int cpus = 1; + + data.func = func; + data.info = info; + atomic_set(&data.started, 0); + data.wait = wait; + if (wait) + atomic_set(&data.finished, 0); + + call_data = &data; + wmb(); + /* Send a message to all other CPUs and wait for them to respond */ + send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR); + + /* Wait for response */ + while (atomic_read(&data.started) != cpus) + cpu_relax(); + + if (!wait) + return; + + while (atomic_read(&data.finished) != cpus) + cpu_relax(); +} + +/* + * smp_call_function_single - Run a function on another CPU + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @nonatomic: Currently unused. + * @wait: If true, wait until function has completed on other CPUs. + * + * Retrurns 0 on success, else a negative status code. + * + * Does not return until the remote CPU is nearly ready to execute <func> + * or is or has executed. + */ + +int smp_call_function_single (int cpu, void (*func) (void *info), void *info, + int nonatomic, int wait) +{ + /* prevent preemption and reschedule on another processor */ + int me = get_cpu(); + if (cpu == me) { + WARN_ON(1); + put_cpu(); + return -EBUSY; + } + spin_lock_bh(&call_lock); + __smp_call_function_single(cpu, func, info, nonatomic, wait); + spin_unlock_bh(&call_lock); + put_cpu(); + return 0; +} + +/* * this function sends a 'generic call function' IPI to all other CPUs * in the system. */ diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index b969ee12872..6e4807d64d4 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -113,24 +113,6 @@ struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; #define set_idle_for_cpu(x,p) (idle_thread_array[(x)] = (p)) /* - * cpu_possible_map should be static, it cannot change as cpu's - * are onlined, or offlined. The reason is per-cpu data-structures - * are allocated by some modules at init time, and dont expect to - * do this dynamically on cpu arrival/departure. - * cpu_present_map on the other hand can change dynamically. - * In case when cpu_hotplug is not compiled, then we resort to current - * behaviour, which is cpu_possible == cpu_present. - * If cpu-hotplug is supported, then we need to preallocate for all - * those NR_CPUS, hence cpu_possible_map represents entire NR_CPUS range. 
- * - Ashok Raj - */ -#ifdef CONFIG_HOTPLUG_CPU -#define fixup_cpu_possible_map(x) cpu_set((x), cpu_possible_map) -#else -#define fixup_cpu_possible_map(x) -#endif - -/* * Currently trivial. Write the real->protected mode * bootstrap into the page concerned. The caller * has made sure it's suitably aligned. @@ -229,9 +211,6 @@ static __cpuinit void sync_master(void *arg) { unsigned long flags, i; - if (smp_processor_id() != boot_cpu_id) - return; - go[MASTER] = 0; local_irq_save(flags); @@ -280,12 +259,12 @@ get_delta(long *rt, long *master) return tcenter - best_tm; } -static __cpuinit void sync_tsc(void) +static __cpuinit void sync_tsc(unsigned int master) { int i, done = 0; long delta, adj, adjust_latency = 0; unsigned long flags, rt, master_time_stamp, bound; -#if DEBUG_TSC_SYNC +#ifdef DEBUG_TSC_SYNC static struct syncdebug { long rt; /* roundtrip time */ long master; /* master's timestamp */ @@ -294,9 +273,17 @@ static __cpuinit void sync_tsc(void) } t[NUM_ROUNDS] __cpuinitdata; #endif + printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n", + smp_processor_id(), master); + go[MASTER] = 1; - smp_call_function(sync_master, NULL, 1, 0); + /* It is dangerous to broadcast IPI as cpus are coming up, + * as they may not be ready to accept them. So since + * we only need to send the ipi to the boot cpu direct + * the message, and avoid the race. + */ + smp_call_function_single(master, sync_master, NULL, 1, 0); while (go[MASTER]) /* wait for master to be ready */ no_cpu_relax(); @@ -321,7 +308,7 @@ static __cpuinit void sync_tsc(void) rdtscll(t); wrmsrl(MSR_IA32_TSC, t + adj); } -#if DEBUG_TSC_SYNC +#ifdef DEBUG_TSC_SYNC t[i].rt = rt; t[i].master = master_time_stamp; t[i].diff = delta; @@ -331,7 +318,7 @@ static __cpuinit void sync_tsc(void) } spin_unlock_irqrestore(&tsc_sync_lock, flags); -#if DEBUG_TSC_SYNC +#ifdef DEBUG_TSC_SYNC for (i = 0; i < NUM_ROUNDS; ++i) printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n", t[i].rt, t[i].master, t[i].diff, t[i].lat); @@ -340,16 +327,14 @@ static __cpuinit void sync_tsc(void) printk(KERN_INFO "CPU %d: synchronized TSC with CPU %u (last diff %ld cycles, " "maxerr %lu cycles)\n", - smp_processor_id(), boot_cpu_id, delta, rt); + smp_processor_id(), master, delta, rt); } static void __cpuinit tsc_sync_wait(void) { if (notscsync || !cpu_has_tsc) return; - printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n", smp_processor_id(), - boot_cpu_id); - sync_tsc(); + sync_tsc(boot_cpu_id); } static __init int notscsync_setup(char *s) @@ -537,7 +522,7 @@ void __cpuinit start_secondary(void) extern volatile unsigned long init_rsp; extern void (*initial_code)(void); -#if APIC_DEBUG +#ifdef APIC_DEBUG static void inquire_remote_apic(int apicid) { unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; @@ -773,8 +758,9 @@ do_rest: initial_code = start_secondary; clear_ti_thread_flag(c_idle.idle->thread_info, TIF_FORK); - printk(KERN_INFO "Booting processor %d/%d rip %lx rsp %lx\n", cpu, apicid, - start_rip, init_rsp); + printk(KERN_INFO "Booting processor %d/%d APIC 0x%x\n", cpu, + cpus_weight(cpu_present_map), + apicid); /* * This grunge runs the startup process for @@ -841,7 +827,7 @@ do_rest: else /* trampoline code not run */ printk("Not responding.\n"); -#if APIC_DEBUG +#ifdef APIC_DEBUG inquire_remote_apic(apicid); #endif } @@ -924,6 +910,27 @@ static __init void enforce_max_cpus(unsigned max_cpus) } } +#ifdef CONFIG_HOTPLUG_CPU +/* + * cpu_possible_map should be static, it cannot change as cpu's + * are onlined, or offlined. 
The reason is per-cpu data-structures + * are allocated by some modules at init time, and dont expect to + * do this dynamically on cpu arrival/departure. + * cpu_present_map on the other hand can change dynamically. + * In case when cpu_hotplug is not compiled, then we resort to current + * behaviour, which is cpu_possible == cpu_present. + * If cpu-hotplug is supported, then we need to preallocate for all + * those NR_CPUS, hence cpu_possible_map represents entire NR_CPUS range. + * - Ashok Raj + */ +static void prefill_possible_map(void) +{ + int i; + for (i = 0; i < NR_CPUS; i++) + cpu_set(i, cpu_possible_map); +} +#endif + /* * Various sanity checks. */ @@ -987,25 +994,15 @@ static int __init smp_sanity_check(unsigned max_cpus) */ void __init smp_prepare_cpus(unsigned int max_cpus) { - int i; - nmi_watchdog_default(); current_cpu_data = boot_cpu_data; current_thread_info()->cpu = 0; /* needed? */ enforce_max_cpus(max_cpus); - /* - * Fill in cpu_present_mask - */ - for (i = 0; i < NR_CPUS; i++) { - int apicid = cpu_present_to_apicid(i); - if (physid_isset(apicid, phys_cpu_present_map)) { - cpu_set(i, cpu_present_map); - cpu_set(i, cpu_possible_map); - } - fixup_cpu_possible_map(i); - } +#ifdef CONFIG_HOTPLUG_CPU + prefill_possible_map(); +#endif if (smp_sanity_check(max_cpus) < 0) { printk(KERN_INFO "SMP disabled\n"); @@ -1189,8 +1186,7 @@ void __cpu_die(unsigned int cpu) printk ("CPU %d is now offline\n", cpu); return; } - current->state = TASK_UNINTERRUPTIBLE; - schedule_timeout(HZ/10); + msleep(100); } printk(KERN_ERR "CPU %u didn't die...\n", cpu); } diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 10273663000..6ead433a388 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -594,9 +594,6 @@ asmlinkage void default_do_nmi(struct pt_regs *regs) if (!cpu) reason = get_nmi_reason(); - if (!cpu_online(cpu)) - return; - if (!(reason & 0xc0)) { if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT) == NOTIFY_STOP) diff --git a/arch/x86_64/lib/delay.c b/arch/x86_64/lib/delay.c index 33a873a3c22..841bd738a18 100644 --- a/arch/x86_64/lib/delay.c +++ b/arch/x86_64/lib/delay.c @@ -18,8 +18,6 @@ #include <asm/smp.h> #endif -int x86_udelay_tsc = 0; /* Delay via TSC */ - int read_current_timer(unsigned long *timer_value) { rdtscll(*timer_value); diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c index 2f187986f94..13792721037 100644 --- a/arch/x86_64/mm/fault.c +++ b/arch/x86_64/mm/fault.c @@ -23,7 +23,6 @@ #include <linux/vt_kern.h> /* For unblank_screen() */ #include <linux/compiler.h> #include <linux/module.h> -#include <linux/kprobes.h> #include <asm/system.h> #include <asm/uaccess.h> diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index ac61c186eb0..70cb2904a90 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -36,34 +36,36 @@ int numa_off __initdata; int __init compute_hash_shift(struct node *nodes, int numnodes) { int i; - int shift = 24; - u64 addr; + int shift = 20; + unsigned long addr,maxend=0; - /* When in doubt use brute force. 
*/ - while (shift < 48) { - memset(memnodemap,0xff,sizeof(*memnodemap) * NODEMAPSIZE); - for (i = 0; i < numnodes; i++) { - if (nodes[i].start == nodes[i].end) - continue; - for (addr = nodes[i].start; - addr < nodes[i].end; - addr += (1UL << shift)) { - if (memnodemap[addr >> shift] != 0xff && - memnodemap[addr >> shift] != i) { - printk(KERN_INFO - "node %d shift %d addr %Lx conflict %d\n", - i, shift, addr, memnodemap[addr>>shift]); - goto next; - } - memnodemap[addr >> shift] = i; + for (i = 0; i < numnodes; i++) + if ((nodes[i].start != nodes[i].end) && (nodes[i].end > maxend)) + maxend = nodes[i].end; + + while ((1UL << shift) < (maxend / NODEMAPSIZE)) + shift++; + + printk (KERN_DEBUG"Using %d for the hash shift. Max adder is %lx \n", + shift,maxend); + memset(memnodemap,0xff,sizeof(*memnodemap) * NODEMAPSIZE); + for (i = 0; i < numnodes; i++) { + if (nodes[i].start == nodes[i].end) + continue; + for (addr = nodes[i].start; + addr < nodes[i].end; + addr += (1UL << shift)) { + if (memnodemap[addr >> shift] != 0xff) { + printk(KERN_INFO + "Your memory is not aligned you need to rebuild your kernel " + "with a bigger NODEMAPSIZE shift=%d adder=%lu\n", + shift,addr); + return -1; } + memnodemap[addr >> shift] = i; } - return shift; - next: - shift++; } - memset(memnodemap,0,sizeof(*memnodemap) * NODEMAPSIZE); - return -1; + return shift; } #ifdef CONFIG_SPARSEMEM diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c index 5d01b31472e..8e3d097a9dd 100644 --- a/arch/x86_64/mm/srat.c +++ b/arch/x86_64/mm/srat.c @@ -20,6 +20,9 @@ static struct acpi_table_slit *acpi_slit; +/* Internal processor count */ +static unsigned int __initdata num_processors = 0; + static nodemask_t nodes_parsed __initdata; static nodemask_t nodes_found __initdata; static struct node nodes[MAX_NUMNODES] __initdata; @@ -101,16 +104,18 @@ acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa) bad_srat(); return; } - if (pa->apic_id >= NR_CPUS) { - printk(KERN_ERR "SRAT: lapic %u too large.\n", - pa->apic_id); + if (num_processors >= NR_CPUS) { + printk(KERN_ERR "SRAT: Processor #%d (lapic %u) INVALID. 
(Max ID: %d).\n", + num_processors, pa->apic_id, NR_CPUS); bad_srat(); return; } - cpu_to_node[pa->apic_id] = node; + cpu_to_node[num_processors] = node; acpi_numa = 1; - printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n", - pxm, pa->apic_id, node); + printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> CPU %u -> Node %u\n", + pxm, pa->apic_id, num_processors, node); + + num_processors++; } /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ @@ -124,7 +129,6 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) if (srat_disabled() || ma->flags.enabled == 0) return; - /* hotplug bit is ignored for now */ pxm = ma->proximity_domain; node = setup_node(pxm); if (node < 0) { @@ -134,6 +138,10 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) } start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32); end = start + (ma->length_lo | ((u64)ma->length_hi << 32)); + /* It is fine to add this area to the nodes data it will be used later*/ + if (ma->flags.hot_pluggable == 1) + printk(KERN_INFO "SRAT: hot plug zone found %lx - %lx \n", + start, end); i = conflicting_nodes(start, end); if (i >= 0) { printk(KERN_ERR diff --git a/arch/x86_64/pci/k8-bus.c b/arch/x86_64/pci/k8-bus.c index 7e7d0c2a002..c2c38b57993 100644 --- a/arch/x86_64/pci/k8-bus.c +++ b/arch/x86_64/pci/k8-bus.c @@ -29,7 +29,7 @@ __init static int fill_mp_bus_to_cpumask(void) { struct pci_dev *nb_dev = NULL; - int i, j, printed; + int i, j; u32 ldtbus, nid; static int lbnr[3] = { LDT_BUS_NUMBER_REGISTER_0,