From 86c0baf123e474b6eb404798926ecf62b426bf3a Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Wed, 2 May 2007 19:27:05 +0200 Subject: [PATCH] i386: Change sysenter_setup to __cpuinit & improve __INIT, __INITDATA Change sysenter_setup to __cpuinit. Change __INIT & __INITDATA to be cpu hotplug aware. Resolve MODPOST warnings similar to: WARNING: vmlinux - Section mismatch: reference to .init.text:sysenter_setup from .text between 'identify_cpu' (at offset 0xc040a380) and 'detect_ht' and WARNING: vmlinux - Section mismatch: reference to .init.data:vsyscall_int80_end from .text between 'sysenter_setup' (at offset 0xc041a269) and 'enable_sep_cpu' WARNING: vmlinux - Section mismatch: reference to .init.data:vsyscall_int80_start from .text between 'sysenter_setup' (at offset 0xc041a26e) and 'enable_sep_cpu' WARNING: vmlinux - Section mismatch: reference to .init.data:vsyscall_sysenter_end from .text between 'sysenter_setup' (at offset 0xc041a275) and 'enable_sep_cpu' WARNING: vmlinux - Section mismatch: reference to .init.data:vsyscall_sysenter_start from .text between 'sysenter_setup' (at offset 0xc041a27a) and 'enable_sep_cpu' Signed-off-by: Prarit Bhargava Signed-off-by: Andi Kleen --- arch/i386/kernel/sysenter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/i386/kernel/sysenter.c') diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c index 13ca54a85a1..168f8147d3b 100644 --- a/arch/i386/kernel/sysenter.c +++ b/arch/i386/kernel/sysenter.c @@ -72,7 +72,7 @@ extern const char vsyscall_int80_start, vsyscall_int80_end; extern const char vsyscall_sysenter_start, vsyscall_sysenter_end; static struct page *syscall_pages[1]; -int __init sysenter_setup(void) +int __cpuinit sysenter_setup(void) { void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC); syscall_pages[0] = virt_to_page(syscall_page); -- cgit v1.2.3 From a6c4e076ee4c1ea670e4faa55814e63dd08e3f29 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:12 +0200 Subject: [PATCH] i386: clean up identify_cpu identify_cpu() is used to identify both the boot CPU and secondary CPUs, but it performs some actions which only apply to the boot CPU. Those functions are therefore really __init functions, but because they're called by identify_cpu(), they must be marked __cpuinit. This patch splits identify_cpu() into identify_boot_cpu() and identify_secondary_cpu(), and calls the appropriate init functions from each. Also, identify_boot_cpu() and all the functions it dominates are marked __init. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen --- arch/i386/kernel/sysenter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/i386/kernel/sysenter.c') diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c index 168f8147d3b..13ca54a85a1 100644 --- a/arch/i386/kernel/sysenter.c +++ b/arch/i386/kernel/sysenter.c @@ -72,7 +72,7 @@ extern const char vsyscall_int80_start, vsyscall_int80_end; extern const char vsyscall_sysenter_start, vsyscall_sysenter_end; static struct page *syscall_pages[1]; -int __cpuinit sysenter_setup(void) +int __init sysenter_setup(void) { void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC); syscall_pages[0] = virt_to_page(syscall_page); -- cgit v1.2.3 From d4f7a2c18e59e0304a1c733589ce14fc02fec1bd Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:12 +0200 Subject: [PATCH] i386: Relocate VDSO ELF headers to match mapped location with COMPAT_VDSO Some versions of libc can't deal with a VDSO which doesn't have its ELF headers matching its mapped address. COMPAT_VDSO maps the VDSO at a specific system-wide fixed address. Previously this was all done at build time, on the grounds that the fixed VDSO address is always at the top of the address space. However, a hypervisor may reserve some of that address space, pushing the fixmap address down. This patch does the adjustment dynamically at runtime, depending on the runtime location of the VDSO fixmap. [ Patch has been through several hands: Jan Beulich wrote the orignal version; Zach reworked it, and Jeremy converted it to relocate phdrs as well as sections. ] Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Zachary Amsden Cc: "Jan Beulich" Cc: Eric W. Biederman Cc: Andi Kleen Cc: Ingo Molnar Cc: Roland McGrath --- arch/i386/kernel/sysenter.c | 158 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 149 insertions(+), 9 deletions(-) (limited to 'arch/i386/kernel/sysenter.c') diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c index 13ca54a85a1..e5a958379ac 100644 --- a/arch/i386/kernel/sysenter.c +++ b/arch/i386/kernel/sysenter.c @@ -22,6 +22,7 @@ #include #include #include +#include /* * Should the kernel map a VDSO page into processes and pass its @@ -46,6 +47,129 @@ __setup("vdso=", vdso_setup); extern asmlinkage void sysenter_entry(void); +#ifdef CONFIG_COMPAT_VDSO +static __init void reloc_symtab(Elf32_Ehdr *ehdr, + unsigned offset, unsigned size) +{ + Elf32_Sym *sym = (void *)ehdr + offset; + unsigned nsym = size / sizeof(*sym); + unsigned i; + + for(i = 0; i < nsym; i++, sym++) { + if (sym->st_shndx == SHN_UNDEF || + sym->st_shndx == SHN_ABS) + continue; /* skip */ + + if (sym->st_shndx > SHN_LORESERVE) { + printk(KERN_INFO "VDSO: unexpected st_shndx %x\n", + sym->st_shndx); + continue; + } + + switch(ELF_ST_TYPE(sym->st_info)) { + case STT_OBJECT: + case STT_FUNC: + case STT_SECTION: + case STT_FILE: + sym->st_value += VDSO_HIGH_BASE; + } + } +} + +static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset) +{ + Elf32_Dyn *dyn = (void *)ehdr + offset; + + for(; dyn->d_tag != DT_NULL; dyn++) + switch(dyn->d_tag) { + case DT_PLTGOT: + case DT_HASH: + case DT_STRTAB: + case DT_SYMTAB: + case DT_RELA: + case DT_INIT: + case DT_FINI: + case DT_REL: + case DT_DEBUG: + case DT_JMPREL: + case DT_VERSYM: + case DT_VERDEF: + case DT_VERNEED: + case DT_ADDRRNGLO ... DT_ADDRRNGHI: + /* definitely pointers needing relocation */ + dyn->d_un.d_ptr += VDSO_HIGH_BASE; + break; + + case DT_ENCODING ... OLD_DT_LOOS-1: + case DT_LOOS ... DT_HIOS-1: + /* Tags above DT_ENCODING are pointers if + they're even */ + if (dyn->d_tag >= DT_ENCODING && + (dyn->d_tag & 1) == 0) + dyn->d_un.d_ptr += VDSO_HIGH_BASE; + break; + + case DT_VERDEFNUM: + case DT_VERNEEDNUM: + case DT_FLAGS_1: + case DT_RELACOUNT: + case DT_RELCOUNT: + case DT_VALRNGLO ... DT_VALRNGHI: + /* definitely not pointers */ + break; + + case OLD_DT_LOOS ... DT_LOOS-1: + case DT_HIOS ... DT_VALRNGLO-1: + default: + if (dyn->d_tag > DT_ENCODING) + printk(KERN_INFO "VDSO: unexpected DT_tag %x\n", + dyn->d_tag); + break; + } +} + +static __init void relocate_vdso(Elf32_Ehdr *ehdr) +{ + Elf32_Phdr *phdr; + Elf32_Shdr *shdr; + int i; + + BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 || + !elf_check_arch(ehdr) || + ehdr->e_type != ET_DYN); + + ehdr->e_entry += VDSO_HIGH_BASE; + + /* rebase phdrs */ + phdr = (void *)ehdr + ehdr->e_phoff; + for (i = 0; i < ehdr->e_phnum; i++) { + phdr[i].p_vaddr += VDSO_HIGH_BASE; + + /* relocate dynamic stuff */ + if (phdr[i].p_type == PT_DYNAMIC) + reloc_dyn(ehdr, phdr[i].p_offset); + } + + /* rebase sections */ + shdr = (void *)ehdr + ehdr->e_shoff; + for(i = 0; i < ehdr->e_shnum; i++) { + if (!(shdr[i].sh_flags & SHF_ALLOC)) + continue; + + shdr[i].sh_addr += VDSO_HIGH_BASE; + + if (shdr[i].sh_type == SHT_SYMTAB || + shdr[i].sh_type == SHT_DYNSYM) + reloc_symtab(ehdr, shdr[i].sh_offset, + shdr[i].sh_size); + } +} +#else +static inline void relocate_vdso(Elf32_Ehdr *ehdr) +{ +} +#endif /* COMPAT_VDSO */ + void enable_sep_cpu(void) { int cpu = get_cpu(); @@ -75,6 +199,9 @@ static struct page *syscall_pages[1]; int __init sysenter_setup(void) { void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC); + const void *vsyscall; + size_t vsyscall_len; + syscall_pages[0] = virt_to_page(syscall_page); #ifdef CONFIG_COMPAT_VDSO @@ -83,23 +210,23 @@ int __init sysenter_setup(void) #endif if (!boot_cpu_has(X86_FEATURE_SEP)) { - memcpy(syscall_page, - &vsyscall_int80_start, - &vsyscall_int80_end - &vsyscall_int80_start); - return 0; + vsyscall = &vsyscall_int80_start; + vsyscall_len = &vsyscall_int80_end - &vsyscall_int80_start; + } else { + vsyscall = &vsyscall_sysenter_start; + vsyscall_len = &vsyscall_sysenter_end - &vsyscall_sysenter_start; } - memcpy(syscall_page, - &vsyscall_sysenter_start, - &vsyscall_sysenter_end - &vsyscall_sysenter_start); + memcpy(syscall_page, vsyscall, vsyscall_len); + relocate_vdso(syscall_page); return 0; } -#ifndef CONFIG_COMPAT_VDSO /* Defined in vsyscall-sysenter.S */ extern void SYSENTER_RETURN; +#ifdef __HAVE_ARCH_GATE_AREA /* Setup a VMA at program startup for the vsyscall page */ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) { @@ -159,4 +286,17 @@ int in_gate_area_no_task(unsigned long addr) { return 0; } -#endif +#else /* !__HAVE_ARCH_GATE_AREA */ +int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) +{ + /* + * If not creating userspace VMA, simply set vdso to point to + * fixmap page. + */ + current->mm->context.vdso = (void *)VDSO_HIGH_BASE; + current_thread_info()->sysenter_return = + (void *)VDSO_SYM(&SYSENTER_RETURN); + + return 0; +} +#endif /* __HAVE_ARCH_GATE_AREA */ -- cgit v1.2.3 From 1dbf527c51c6c20c19869c8125cb5b87c3d09506 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:12 +0200 Subject: [PATCH] i386: Make COMPAT_VDSO runtime selectable. Now that relocation of the VDSO for COMPAT_VDSO users is done at runtime rather than compile time, it is possible to enable/disable compat mode at runtime. This patch allows you to enable COMPAT_VDSO mode with "vdso=2" on the kernel command line, or via sysctl. (Switching on a running system shouldn't be done lightly; any process which was relying on the compat VDSO will be upset if it goes away.) The COMPAT_VDSO config option still exists, but if enabled it just makes vdso_enabled default to VDSO_COMPAT. +From: Hugh Dickins Fix oops from i386-make-compat_vdso-runtime-selectable.patch. Even mingetty at system startup finds it easy to trigger an oops while reading /proc/PID/maps: though it has a good hold on the mm itself, that cannot stop exit_mm() from resetting tsk->mm to NULL. (It is usually show_map()'s call to get_gate_vma() which oopses, and I expect we could change that to check priv->tail_vma instead; but no matter, even m_start()'s call just after get_task_mm() is racy.) Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Zachary Amsden Cc: "Jan Beulich" Cc: Eric W. Biederman Cc: Andi Kleen Cc: Ingo Molnar Cc: Roland McGrath --- arch/i386/kernel/sysenter.c | 145 ++++++++++++++++++++++++++++---------------- 1 file changed, 94 insertions(+), 51 deletions(-) (limited to 'arch/i386/kernel/sysenter.c') diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c index e5a958379ac..0b9768ee1e8 100644 --- a/arch/i386/kernel/sysenter.c +++ b/arch/i386/kernel/sysenter.c @@ -23,16 +23,25 @@ #include #include #include +#include + +enum { + VDSO_DISABLED = 0, + VDSO_ENABLED = 1, + VDSO_COMPAT = 2, +}; + +#ifdef CONFIG_COMPAT_VDSO +#define VDSO_DEFAULT VDSO_COMPAT +#else +#define VDSO_DEFAULT VDSO_ENABLED +#endif /* * Should the kernel map a VDSO page into processes and pass its * address down to glibc upon exec()? */ -#ifdef CONFIG_PARAVIRT -unsigned int __read_mostly vdso_enabled = 0; -#else -unsigned int __read_mostly vdso_enabled = 1; -#endif +unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT; EXPORT_SYMBOL_GPL(vdso_enabled); @@ -47,7 +56,6 @@ __setup("vdso=", vdso_setup); extern asmlinkage void sysenter_entry(void); -#ifdef CONFIG_COMPAT_VDSO static __init void reloc_symtab(Elf32_Ehdr *ehdr, unsigned offset, unsigned size) { @@ -164,11 +172,6 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr) shdr[i].sh_size); } } -#else -static inline void relocate_vdso(Elf32_Ehdr *ehdr) -{ -} -#endif /* COMPAT_VDSO */ void enable_sep_cpu(void) { @@ -188,6 +191,25 @@ void enable_sep_cpu(void) put_cpu(); } +static struct vm_area_struct gate_vma; + +static int __init gate_vma_init(void) +{ + gate_vma.vm_mm = NULL; + gate_vma.vm_start = FIXADDR_USER_START; + gate_vma.vm_end = FIXADDR_USER_END; + gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; + gate_vma.vm_page_prot = __P101; + /* + * Make sure the vDSO gets into every core dump. + * Dumping its contents makes post-mortem fully interpretable later + * without matching up the same kernel and hardware config to see + * what PC values meant. + */ + gate_vma.vm_flags |= VM_ALWAYSDUMP; + return 0; +} + /* * These symbols are defined by vsyscall.o to mark the bounds * of the ELF DSO images included therein. @@ -196,6 +218,22 @@ extern const char vsyscall_int80_start, vsyscall_int80_end; extern const char vsyscall_sysenter_start, vsyscall_sysenter_end; static struct page *syscall_pages[1]; +static void map_compat_vdso(int map) +{ + static int vdso_mapped; + + if (map == vdso_mapped) + return; + + vdso_mapped = map; + + __set_fixmap(FIX_VDSO, page_to_pfn(syscall_pages[0]) << PAGE_SHIFT, + map ? PAGE_READONLY_EXEC : PAGE_NONE); + + /* flush stray tlbs */ + flush_tlb_all(); +} + int __init sysenter_setup(void) { void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC); @@ -204,10 +242,9 @@ int __init sysenter_setup(void) syscall_pages[0] = virt_to_page(syscall_page); -#ifdef CONFIG_COMPAT_VDSO - __set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_READONLY_EXEC); + gate_vma_init(); + printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO)); -#endif if (!boot_cpu_has(X86_FEATURE_SEP)) { vsyscall = &vsyscall_int80_start; @@ -226,42 +263,57 @@ int __init sysenter_setup(void) /* Defined in vsyscall-sysenter.S */ extern void SYSENTER_RETURN; -#ifdef __HAVE_ARCH_GATE_AREA /* Setup a VMA at program startup for the vsyscall page */ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) { struct mm_struct *mm = current->mm; unsigned long addr; int ret; + bool compat; down_write(&mm->mmap_sem); - addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0); - if (IS_ERR_VALUE(addr)) { - ret = addr; - goto up_fail; - } - /* - * MAYWRITE to allow gdb to COW and set breakpoints - * - * Make sure the vDSO gets into every core dump. - * Dumping its contents makes post-mortem fully interpretable later - * without matching up the same kernel and hardware config to see - * what PC values meant. - */ - ret = install_special_mapping(mm, addr, PAGE_SIZE, - VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| - VM_ALWAYSDUMP, - syscall_pages); - if (ret) - goto up_fail; + /* Test compat mode once here, in case someone + changes it via sysctl */ + compat = (vdso_enabled == VDSO_COMPAT); + + map_compat_vdso(compat); + + if (compat) + addr = VDSO_HIGH_BASE; + else { + addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0); + if (IS_ERR_VALUE(addr)) { + ret = addr; + goto up_fail; + } + + /* + * MAYWRITE to allow gdb to COW and set breakpoints + * + * Make sure the vDSO gets into every core dump. + * Dumping its contents makes post-mortem fully + * interpretable later without matching up the same + * kernel and hardware config to see what PC values + * meant. + */ + ret = install_special_mapping(mm, addr, PAGE_SIZE, + VM_READ|VM_EXEC| + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| + VM_ALWAYSDUMP, + syscall_pages); + + if (ret) + goto up_fail; + } current->mm->context.vdso = (void *)addr; current_thread_info()->sysenter_return = - (void *)VDSO_SYM(&SYSENTER_RETURN); -up_fail: + (void *)VDSO_SYM(&SYSENTER_RETURN); + + up_fail: up_write(&mm->mmap_sem); + return ret; } @@ -274,6 +326,11 @@ const char *arch_vma_name(struct vm_area_struct *vma) struct vm_area_struct *get_gate_vma(struct task_struct *tsk) { + struct mm_struct *mm = tsk->mm; + + /* Check to see if this task was created in compat vdso mode */ + if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE) + return &gate_vma; return NULL; } @@ -286,17 +343,3 @@ int in_gate_area_no_task(unsigned long addr) { return 0; } -#else /* !__HAVE_ARCH_GATE_AREA */ -int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) -{ - /* - * If not creating userspace VMA, simply set vdso to point to - * fixmap page. - */ - current->mm->context.vdso = (void *)VDSO_HIGH_BASE; - current_thread_info()->sysenter_return = - (void *)VDSO_SYM(&SYSENTER_RETURN); - - return 0; -} -#endif /* __HAVE_ARCH_GATE_AREA */ -- cgit v1.2.3 From a75c54f933bd8db9f4a609bd128663c179b3e6a1 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 2 May 2007 19:27:13 +0200 Subject: [PATCH] i386: i386 separate hardware-defined TSS from Linux additions On Thu, 2007-03-29 at 13:16 +0200, Andi Kleen wrote: > Please clean it up properly with two structs. Not sure about this, now I've done it. Running it here. If you like it, I can do x86-64 as well. == lguest defines its own TSS struct because the "struct tss_struct" contains linux-specific additions. Andi asked me to split the struct in processor.h. Unfortunately it makes usage a little awkward. Signed-off-by: Rusty Russell Signed-off-by: Andi Kleen --- arch/i386/kernel/sysenter.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/i386/kernel/sysenter.c') diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c index 0b9768ee1e8..94defac6fc3 100644 --- a/arch/i386/kernel/sysenter.c +++ b/arch/i386/kernel/sysenter.c @@ -183,10 +183,10 @@ void enable_sep_cpu(void) return; } - tss->ss1 = __KERNEL_CS; - tss->esp1 = sizeof(struct tss_struct) + (unsigned long) tss; + tss->x86_tss.ss1 = __KERNEL_CS; + tss->x86_tss.esp1 = sizeof(struct tss_struct) + (unsigned long) tss; wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); - wrmsr(MSR_IA32_SYSENTER_ESP, tss->esp1, 0); + wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.esp1, 0); wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) sysenter_entry, 0); put_cpu(); } -- cgit v1.2.3 From 752783c050f1729452a89b2baea45b0124ac91c7 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Wed, 2 May 2007 19:27:16 +0200 Subject: [PATCH] i386: In compat mode, the return value here was uninitialized. Signed-off-by: Zachary Amsden Signed-off-by: Andi Kleen --- arch/i386/kernel/sysenter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/i386/kernel/sysenter.c') diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c index 94defac6fc3..ff4ee6f3326 100644 --- a/arch/i386/kernel/sysenter.c +++ b/arch/i386/kernel/sysenter.c @@ -268,7 +268,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) { struct mm_struct *mm = current->mm; unsigned long addr; - int ret; + int ret = 0; bool compat; down_write(&mm->mmap_sem); -- cgit v1.2.3