author     Vivek Goyal <vgoyal@in.ibm.com>        2007-05-02 19:27:07 +0200
committer  Andi Kleen <andi@basil.nowhere.org>    2007-05-02 19:27:07 +0200
commit     1ab60e0f72f71ec54831e525a3e1154f1c092408 (patch)
tree       bd7dd8bbff43e3e2e3597f2b7780e82a856bb9d7 /arch/x86_64/kernel
parent     0dbf7028c0c1f266c9631139450a1502d3cd457e (diff)
[PATCH] x86-64: Relocatable Kernel Support
This patch modifies the x86_64 kernel so that it can be loaded and run
at any 2M-aligned address below 512G. The technique used is to
compile the decompressor with -fPIC and modify it so that the decompressor
is fully relocatable. For the main kernel the page tables are
modified so the kernel remains at the same virtual address. In
addition, a variable phys_base is kept that holds the physical address
the kernel is loaded at. __pa_symbol is modified to add phys_base when
we take the address of a kernel symbol.
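
As a rough, hedged illustration of that last point (the actual __pa_symbol change lands outside arch/x86_64/kernel, so it is not part of the diff shown below; the constant and helper names here are made up for this sketch), the translation for a kernel symbol effectively becomes "virtual address minus the fixed kernel map base, plus phys_base":

    /*
     * Illustrative sketch only -- not the literal kernel macro.
     * phys_base holds the physical address the kernel was loaded at;
     * startup_64 adds the load delta to it before any C code runs.
     */
    extern unsigned long phys_base;

    /* value of __START_KERNEL_map on x86_64; named differently here on purpose */
    #define KERNEL_MAP_BASE 0xffffffff80000000UL

    /* hypothetical helper, equivalent in spirit to __pa_symbol after this patch */
    static inline unsigned long pa_of_kernel_symbol(unsigned long vaddr)
    {
            return vaddr - KERNEL_MAP_BASE + phys_base;
    }
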
When loaded with a normal bootloader the decompressor will decompress
the kernel to 2M and it will run there. This both ensures the
relocation code is always exercised, and makes it easier to use 2M
pages for the kernel and the CPU.
AK: changed to not make RELOCATABLE default in Kconfig
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Diffstat (limited to 'arch/x86_64/kernel')
-rw-r--r-- | arch/x86_64/kernel/head.S        | 233 |
-rw-r--r-- | arch/x86_64/kernel/suspend_asm.S |   7 |
2 files changed, 134 insertions(+), 106 deletions(-)
diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
index c211e52f133..36aa98a6d15 100644
--- a/arch/x86_64/kernel/head.S
+++ b/arch/x86_64/kernel/head.S
@@ -5,6 +5,7 @@
  * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
  * Copyright (C) 2000 Karsten Keil <kkeil@suse.de>
  * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de>
+ * Copyright (C) 2005 Eric Biederman <ebiederm@xmission.com>
  */
@@ -17,95 +18,127 @@
 #include <asm/page.h>
 #include <asm/msr.h>
 #include <asm/cache.h>
-
+
 /* we are not able to switch in one step to the final KERNEL ADRESS SPACE
- * because we need identity-mapped pages on setup so define __START_KERNEL to
- * 0x100000 for this stage
- *
+ * because we need identity-mapped pages.
+ *
  */
 
 	.text
 	.section .bootstrap.text
-	.code32
-	.globl startup_32
-/* %bx: 1 if coming from smp trampoline on secondary cpu */
-startup_32:
-
+	.code64
+	.globl startup_64
+startup_64:
+
 	/*
-	 * At this point the CPU runs in 32bit protected mode (CS.D = 1) with
-	 * paging disabled and the point of this file is to switch to 64bit
-	 * long mode with a kernel mapping for kerneland to jump into the
-	 * kernel virtual addresses.
-	 * There is no stack until we set one up.
+	 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
+	 * and someone has loaded an identity mapped page table
+	 * for us. These identity mapped page tables map all of the
+	 * kernel pages and possibly all of memory.
+	 *
+	 * %esi holds a physical pointer to real_mode_data.
+	 *
+	 * We come here either directly from a 64bit bootloader, or from
+	 * arch/x86_64/boot/compressed/head.S.
+	 *
+	 * We only come here initially at boot nothing else comes here.
+	 *
+	 * Since we may be loaded at an address different from what we were
+	 * compiled to run at we first fixup the physical addresses in our page
+	 * tables and then reload them.
 	 */
 
-	/* Initialize the %ds segment register */
-	movl	$__KERNEL_DS,%eax
-	movl	%eax,%ds
-
-	/* Load new GDT with the 64bit segments using 32bit descriptor */
-	lgdt	pGDT32 - __START_KERNEL_map
-
-	/* If the CPU doesn't support CPUID this will double fault.
-	 * Unfortunately it is hard to check for CPUID without a stack.
+	/* Compute the delta between the address I am compiled to run at and the
+	 * address I am actually running at.
 	 */
-
-	/* Check if extended functions are implemented */
-	movl	$0x80000000, %eax
-	cpuid
-	cmpl	$0x80000000, %eax
-	jbe	no_long_mode
-	/* Check if long mode is implemented */
-	mov	$0x80000001, %eax
-	cpuid
-	btl	$29, %edx
-	jnc	no_long_mode
-
-	/*
-	 * Prepare for entering 64bits mode
+	leaq	_text(%rip), %rbp
+	subq	$_text - __START_KERNEL_map, %rbp
+
+	/* Is the address not 2M aligned? */
+	movq	%rbp, %rax
+	andl	$~LARGE_PAGE_MASK, %eax
+	testl	%eax, %eax
+	jnz	bad_address
+
+	/* Is the address too large? */
+	leaq	_text(%rip), %rdx
+	movq	$PGDIR_SIZE, %rax
+	cmpq	%rax, %rdx
+	jae	bad_address
+
+	/* Fixup the physical addresses in the page table */
+	addq	%rbp, init_level4_pgt + 0(%rip)
+	addq	%rbp, init_level4_pgt + (258*8)(%rip)
+	addq	%rbp, init_level4_pgt + (511*8)(%rip)
+
+	addq	%rbp, level3_ident_pgt + 0(%rip)
+	addq	%rbp, level3_kernel_pgt + (510*8)(%rip)
+
+	/* Add an Identity mapping if I am above 1G */
+	leaq	_text(%rip), %rdi
+	andq	$LARGE_PAGE_MASK, %rdi
+
+	movq	%rdi, %rax
+	shrq	$PUD_SHIFT, %rax
+	andq	$(PTRS_PER_PUD - 1), %rax
+	jz	ident_complete
+
+	leaq	(level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
+	leaq	level3_ident_pgt(%rip), %rbx
+	movq	%rdx, 0(%rbx, %rax, 8)
+
+	movq	%rdi, %rax
+	shrq	$PMD_SHIFT, %rax
+	andq	$(PTRS_PER_PMD - 1), %rax
+	leaq	__PAGE_KERNEL_LARGE_EXEC(%rdi), %rdx
+	leaq	level2_spare_pgt(%rip), %rbx
+	movq	%rdx, 0(%rbx, %rax, 8)
+ident_complete:
+
+	/* Fixup the kernel text+data virtual addresses
+	 */
+	leaq	level2_kernel_pgt(%rip), %rdi
+	leaq	4096(%rdi), %r8
+	/* See if it is a valid page table entry */
+1:	testq	$1, 0(%rdi)
+	jz	2f
+	addq	%rbp, 0(%rdi)
+	/* Go to the next page */
+2:	addq	$8, %rdi
+	cmp	%r8, %rdi
+	jne	1b
+
+	/* Fixup phys_base */
+	addq	%rbp, phys_base(%rip)
 
-	/* Enable PAE mode */
-	xorl	%eax, %eax
-	btsl	$5, %eax
-	movl	%eax, %cr4
-
-	/* Setup early boot stage 4 level pagetables */
-	movl	$(init_level4_pgt - __START_KERNEL_map), %eax
-	movl	%eax, %cr3
-
-	/* Setup EFER (Extended Feature Enable Register) */
-	movl	$MSR_EFER, %ecx
-	rdmsr
-
-	/* Enable Long Mode */
-	btsl	$_EFER_LME, %eax
-
-	/* Make changes effective */
-	wrmsr
+#ifdef CONFIG_SMP
+	addq	%rbp, trampoline_level4_pgt + 0(%rip)
+	addq	%rbp, trampoline_level4_pgt + (511*8)(%rip)
+#endif
+#ifdef CONFIG_ACPI_SLEEP
+	addq	%rbp, wakeup_level4_pgt + 0(%rip)
+	addq	%rbp, wakeup_level4_pgt + (511*8)(%rip)
+#endif
 
-	xorl	%eax, %eax
-	btsl	$31, %eax		/* Enable paging and in turn activate Long Mode */
-	btsl	$0, %eax		/* Enable protected mode */
-	/* Make changes effective */
-	movl	%eax, %cr0
-	/*
-	 * At this point we're in long mode but in 32bit compatibility mode
-	 * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn
-	 * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use
-	 * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
+	/* Due to ENTRY(), sometimes the empty space gets filled with
+	 * zeros. Better take a jmp than relying on empty space being
+	 * filled with 0x90 (nop)
 	 */
-	ljmp	$__KERNEL_CS, $(startup_64 - __START_KERNEL_map)
-
-	.code64
-	.org 0x100
-	.globl startup_64
-startup_64:
+	jmp secondary_startup_64
 ENTRY(secondary_startup_64)
-	/* We come here either from startup_32
-	 * or directly from a 64bit bootloader.
-	 * Since we may have come directly from a bootloader we
-	 * reload the page tables here.
+	/*
+	 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
+	 * and someone has loaded a mapped page table.
+	 *
+	 * %esi holds a physical pointer to real_mode_data.
+	 *
+	 * We come here either from startup_64 (using physical addresses)
+	 * or from trampoline.S (using virtual addresses).
+	 *
+	 * Using virtual addresses from trampoline.S removes the need
+	 * to have any identity mapped pages in the kernel page table
+	 * after the boot processor executes this code.
 	 */
 
 	/* Enable PAE mode and PGE */
@@ -116,8 +149,14 @@ ENTRY(secondary_startup_64)
 
 	/* Setup early boot stage 4 level pagetables. */
 	movq	$(init_level4_pgt - __START_KERNEL_map), %rax
+	addq	phys_base(%rip), %rax
 	movq	%rax, %cr3
 
+	/* Ensure I am executing from virtual addresses */
+	movq	$1f, %rax
+	jmp	*%rax
+1:
+
 	/* Check if nx is implemented */
 	movl	$0x80000001, %eax
 	cpuid
@@ -126,17 +165,11 @@ ENTRY(secondary_startup_64)
 	/* Setup EFER (Extended Feature Enable Register) */
 	movl	$MSR_EFER, %ecx
 	rdmsr
-
-	/* Enable System Call */
-	btsl	$_EFER_SCE, %eax
-
-	/* No Execute supported? */
-	btl	$20,%edi
+	btsl	$_EFER_SCE, %eax	/* Enable System Call */
+	btl	$20,%edi		/* No Execute supported? */
 	jnc	1f
 	btsl	$_EFER_NX, %eax
-1:
-	/* Make changes effective */
-	wrmsr
+1:	wrmsr				/* Make changes effective */
 
 	/* Setup cr0 */
 #define CR0_PM			1	/* protected mode */
@@ -163,7 +196,7 @@ ENTRY(secondary_startup_64)
 	 * addresses where we're currently running on. We have to do that here
 	 * because in 32bit we couldn't load a 64bit linear address.
 	 */
-	lgdt	cpu_gdt_descr
+	lgdt	cpu_gdt_descr(%rip)
 
 	/* set up data segments. actually 0 would do too */
 	movl	$__KERNEL_DS,%eax
@@ -214,6 +247,9 @@ initial_code:
 init_rsp:
 	.quad	init_thread_union+THREAD_SIZE-8
 
+bad_address:
+	jmp bad_address
+
 ENTRY(early_idt_handler)
 	cmpl $2,early_recursion_flag(%rip)
 	jz 1f
@@ -242,23 +278,7 @@ early_idt_msg:
 early_idt_ripmsg:
 	.asciz "RIP %s\n"
 
-.code32
-ENTRY(no_long_mode)
-	/* This isn't an x86-64 CPU so hang */
-1:
-	jmp 1b
-
-.org 0xf00
-	.globl pGDT32
-pGDT32:
-	.word	gdt_end-cpu_gdt_table-1
-	.long	cpu_gdt_table-__START_KERNEL_map
-
-.org 0xf10
-ljumpvector:
-	.long	startup_64-__START_KERNEL_map
-	.word	__KERNEL_CS
-
+.balign PAGE_SIZE
 ENTRY(stext)
 ENTRY(_stext)
@@ -303,7 +323,7 @@ NEXT_PAGE(level2_ident_pgt)
 	 * Don't set NX because code runs from these pages.
 	 */
 	PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD)
-
+
 NEXT_PAGE(level2_kernel_pgt)
 	/* 40MB kernel mapping. The kernel code cannot be bigger than that.
 	   When you change this change KERNEL_TEXT_SIZE in page.h too. */
@@ -313,6 +333,9 @@ NEXT_PAGE(level2_kernel_pgt)
 	/* Module mapping starts here */
 	.fill	(PTRS_PER_PMD - (KERNEL_TEXT_SIZE/PMD_SIZE)),8,0
 
+NEXT_PAGE(level2_spare_pgt)
+	.fill	512,8,0
+
 #undef PMDS
 
 #undef NEXT_PAGE
@@ -330,6 +353,10 @@ gdt:
 	.endr
 #endif
 
+ENTRY(phys_base)
+	/* This must match the first entry in level2_kernel_pgt */
+	.quad	0x0000000000000000
+
 /* We need valid kernel segments for data and code in long mode too
  * IRET will check the segment types kkeil 2000/10/28
  * Also sysret mandates a special GDT layout
diff --git a/arch/x86_64/kernel/suspend_asm.S b/arch/x86_64/kernel/suspend_asm.S
index bfbe00763c6..16d183f67bc 100644
--- a/arch/x86_64/kernel/suspend_asm.S
+++ b/arch/x86_64/kernel/suspend_asm.S
@@ -71,9 +71,10 @@ loop:
 	jmp loop
 done:
 	/* go back to the original page tables */
-	leaq	init_level4_pgt(%rip), %rax
-	subq	$__START_KERNEL_map, %rax
-	movq	%rax, %cr3
+	movq	$(init_level4_pgt - __START_KERNEL_map), %rax
+	addq	phys_base(%rip), %rax
+	movq	%rax, %cr3
+
 	/* Flush TLB, including "global" things (vmalloc) */
 	movq	mmu_cr4_features(%rip), %rax
 	movq	%rax, %rdx
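
For readers skimming the assembly above, here is a loose C rendering of what the new startup_64 prologue does before reloading the page tables: compute the delta between the compile-time and runtime load address, reject loads that are not 2M aligned or that sit at or above the 512G limit, and fold the delta into phys_base. The constants and the function below are illustrative stand-ins, not kernel interfaces:

    #include <stdbool.h>
    #include <stdint.h>

    #define TWO_MB      (2ULL << 20)    /* the kernel must be loaded 2M aligned */
    #define PML4_SPAN   (512ULL << 30)  /* one top-level page table entry covers 512G */

    /* Sketch of the startup_64 checks; %rbp in the assembly corresponds to delta. */
    static bool fixup_load_address(uint64_t runtime_text, uint64_t compiled_text,
                                   uint64_t *phys_base)
    {
            uint64_t delta = runtime_text - compiled_text;

            if (delta & (TWO_MB - 1))        /* "Is the address not 2M aligned?" */
                    return false;            /* -> bad_address, which spins forever */
            if (runtime_text >= PML4_SPAN)   /* "Is the address too large?" */
                    return false;

            /* The assembly also adds delta to every affected page table entry. */
            *phys_base += delta;
            return true;
    }
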