From 79bf6d66abb5a20813a19dd365dfc49104f0bb88 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 17 Mar 2008 16:36:54 -0700 Subject: x86: convert pgalloc_64.h from macros to inlines Convert asm-x86/pgalloc_64.h from macros into functions (#include hell prevents __*_free_tlb from being inline, but they're probably a bit big to inline anyway). Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/init_64.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'arch') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 1ff7906a9a4..2d89b7abbc5 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -662,6 +662,22 @@ int memory_add_physaddr_to_nid(u64 start) EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif +void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) +{ + pgtable_page_dtor(pte); + tlb_remove_page(tlb, pte); +} + +void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) +{ + tlb_remove_page(tlb, virt_to_page(pmd)); +} + +void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) +{ + tlb_remove_page(tlb, virt_to_page(pud)); +} + #endif /* CONFIG_MEMORY_HOTPLUG */ static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, -- cgit v1.2.3 From 4f76cd382213b29dd3658e3e1ea47c0c2be06f3c Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 17 Mar 2008 16:36:55 -0700 Subject: x86: add common mm/pgtable.c Add a common arch/x86/mm/pgtable.c file for common pagetable functions. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/Makefile | 2 +- arch/x86/mm/pgtable.c | 239 +++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/mm/pgtable_32.c | 187 ------------------------------------ 3 files changed, 240 insertions(+), 188 deletions(-) create mode 100644 arch/x86/mm/pgtable.c (limited to 'arch') diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 20941d2954e..b7b3e4c7cfc 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -1,5 +1,5 @@ obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ - pat.o + pat.o pgtable.o obj-$(CONFIG_X86_32) += pgtable_32.o diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c new file mode 100644 index 00000000000..d526b46ae18 --- /dev/null +++ b/arch/x86/mm/pgtable.c @@ -0,0 +1,239 @@ +#include +#include +#include + +pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) +{ + return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); +} + +pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) +{ + struct page *pte; + +#ifdef CONFIG_HIGHPTE + pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); +#else + pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); +#endif + if (pte) + pgtable_page_ctor(pte); + return pte; +} + +#ifdef CONFIG_X86_64 +static inline void pgd_list_add(pgd_t *pgd) +{ + struct page *page = virt_to_page(pgd); + unsigned long flags; + + spin_lock_irqsave(&pgd_lock, flags); + list_add(&page->lru, &pgd_list); + spin_unlock_irqrestore(&pgd_lock, flags); +} + +static inline void pgd_list_del(pgd_t *pgd) +{ + struct page *page = virt_to_page(pgd); + unsigned long flags; + + spin_lock_irqsave(&pgd_lock, flags); + list_del(&page->lru); + spin_unlock_irqrestore(&pgd_lock, flags); +} + +pgd_t *pgd_alloc(struct mm_struct *mm) +{ + unsigned boundary; + pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); + if (!pgd) + return NULL; + pgd_list_add(pgd); + /* + * Copy kernel pointers in from init. + * Could keep a freelist or slab cache of those because the kernel + * part never changes. + */ + boundary = pgd_index(__PAGE_OFFSET); + memset(pgd, 0, boundary * sizeof(pgd_t)); + memcpy(pgd + boundary, + init_level4_pgt + boundary, + (PTRS_PER_PGD - boundary) * sizeof(pgd_t)); + return pgd; +} + +void pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ + BUG_ON((unsigned long)pgd & (PAGE_SIZE-1)); + pgd_list_del(pgd); + free_page((unsigned long)pgd); +} +#else +/* + * List of all pgd's needed for non-PAE so it can invalidate entries + * in both cached and uncached pgd's; not needed for PAE since the + * kernel pmd is shared. If PAE were not to share the pmd a similar + * tactic would be needed. This is essentially codepath-based locking + * against pageattr.c; it is the unique case in which a valid change + * of kernel pagetables can't be lazily synchronized by vmalloc faults. + * vmalloc faults work because attached pagetables are never freed. + * -- wli + */ +static inline void pgd_list_add(pgd_t *pgd) +{ + struct page *page = virt_to_page(pgd); + + list_add(&page->lru, &pgd_list); +} + +static inline void pgd_list_del(pgd_t *pgd) +{ + struct page *page = virt_to_page(pgd); + + list_del(&page->lru); +} + +#define UNSHARED_PTRS_PER_PGD \ + (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD) + +static void pgd_ctor(void *p) +{ + pgd_t *pgd = p; + unsigned long flags; + + /* Clear usermode parts of PGD */ + memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); + + spin_lock_irqsave(&pgd_lock, flags); + + /* If the pgd points to a shared pagetable level (either the + ptes in non-PAE, or shared PMD in PAE), then just copy the + references from swapper_pg_dir. */ + if (PAGETABLE_LEVELS == 2 || + (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) { + clone_pgd_range(pgd + USER_PTRS_PER_PGD, + swapper_pg_dir + USER_PTRS_PER_PGD, + KERNEL_PGD_PTRS); + paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT, + __pa(swapper_pg_dir) >> PAGE_SHIFT, + USER_PTRS_PER_PGD, + KERNEL_PGD_PTRS); + } + + /* list required to sync kernel mapping updates */ + if (!SHARED_KERNEL_PMD) + pgd_list_add(pgd); + + spin_unlock_irqrestore(&pgd_lock, flags); +} + +static void pgd_dtor(void *pgd) +{ + unsigned long flags; /* can be called from interrupt context */ + + if (SHARED_KERNEL_PMD) + return; + + spin_lock_irqsave(&pgd_lock, flags); + pgd_list_del(pgd); + spin_unlock_irqrestore(&pgd_lock, flags); +} + +#ifdef CONFIG_X86_PAE +/* + * Mop up any pmd pages which may still be attached to the pgd. + * Normally they will be freed by munmap/exit_mmap, but any pmd we + * preallocate which never got a corresponding vma will need to be + * freed manually. + */ +static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) +{ + int i; + + for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) { + pgd_t pgd = pgdp[i]; + + if (pgd_val(pgd) != 0) { + pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); + + pgdp[i] = native_make_pgd(0); + + paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT); + pmd_free(mm, pmd); + } + } +} + +/* + * In PAE mode, we need to do a cr3 reload (=tlb flush) when + * updating the top-level pagetable entries to guarantee the + * processor notices the update. Since this is expensive, and + * all 4 top-level entries are used almost immediately in a + * new process's life, we just pre-populate them here. + * + * Also, if we're in a paravirt environment where the kernel pmd is + * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate + * and initialize the kernel pmds here. + */ +static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) +{ + pud_t *pud; + unsigned long addr; + int i; + + pud = pud_offset(pgd, 0); + for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; + i++, pud++, addr += PUD_SIZE) { + pmd_t *pmd = pmd_alloc_one(mm, addr); + + if (!pmd) { + pgd_mop_up_pmds(mm, pgd); + return 0; + } + + if (i >= USER_PTRS_PER_PGD) + memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), + sizeof(pmd_t) * PTRS_PER_PMD); + + pud_populate(mm, pud, pmd); + } + + return 1; +} +#else /* !CONFIG_X86_PAE */ +/* No need to prepopulate any pagetable entries in non-PAE modes. */ +static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) +{ + return 1; +} + +static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd) +{ +} +#endif /* CONFIG_X86_PAE */ + +pgd_t *pgd_alloc(struct mm_struct *mm) +{ + pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + + /* so that alloc_pd can use it */ + mm->pgd = pgd; + if (pgd) + pgd_ctor(pgd); + + if (pgd && !pgd_prepopulate_pmd(mm, pgd)) { + pgd_dtor(pgd); + free_page((unsigned long)pgd); + pgd = NULL; + } + + return pgd; +} + +void pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ + pgd_mop_up_pmds(mm, pgd); + pgd_dtor(pgd); + free_page((unsigned long)pgd); +} +#endif diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 6fb9e7c6893..b46893e45d0 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -173,193 +173,6 @@ void reserve_top_address(unsigned long reserve) __VMALLOC_RESERVE += reserve; } -pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) -{ - return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); -} - -pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) -{ - struct page *pte; - -#ifdef CONFIG_HIGHPTE - pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); -#else - pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); -#endif - if (pte) - pgtable_page_ctor(pte); - return pte; -} - -/* - * List of all pgd's needed for non-PAE so it can invalidate entries - * in both cached and uncached pgd's; not needed for PAE since the - * kernel pmd is shared. If PAE were not to share the pmd a similar - * tactic would be needed. This is essentially codepath-based locking - * against pageattr.c; it is the unique case in which a valid change - * of kernel pagetables can't be lazily synchronized by vmalloc faults. - * vmalloc faults work because attached pagetables are never freed. - * -- wli - */ -static inline void pgd_list_add(pgd_t *pgd) -{ - struct page *page = virt_to_page(pgd); - - list_add(&page->lru, &pgd_list); -} - -static inline void pgd_list_del(pgd_t *pgd) -{ - struct page *page = virt_to_page(pgd); - - list_del(&page->lru); -} - -#define UNSHARED_PTRS_PER_PGD \ - (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD) - -static void pgd_ctor(void *p) -{ - pgd_t *pgd = p; - unsigned long flags; - - /* Clear usermode parts of PGD */ - memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); - - spin_lock_irqsave(&pgd_lock, flags); - - /* If the pgd points to a shared pagetable level (either the - ptes in non-PAE, or shared PMD in PAE), then just copy the - references from swapper_pg_dir. */ - if (PAGETABLE_LEVELS == 2 || - (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) { - clone_pgd_range(pgd + USER_PTRS_PER_PGD, - swapper_pg_dir + USER_PTRS_PER_PGD, - KERNEL_PGD_PTRS); - paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT, - __pa(swapper_pg_dir) >> PAGE_SHIFT, - USER_PTRS_PER_PGD, - KERNEL_PGD_PTRS); - } - - /* list required to sync kernel mapping updates */ - if (!SHARED_KERNEL_PMD) - pgd_list_add(pgd); - - spin_unlock_irqrestore(&pgd_lock, flags); -} - -static void pgd_dtor(void *pgd) -{ - unsigned long flags; /* can be called from interrupt context */ - - if (SHARED_KERNEL_PMD) - return; - - spin_lock_irqsave(&pgd_lock, flags); - pgd_list_del(pgd); - spin_unlock_irqrestore(&pgd_lock, flags); -} - -#ifdef CONFIG_X86_PAE -/* - * Mop up any pmd pages which may still be attached to the pgd. - * Normally they will be freed by munmap/exit_mmap, but any pmd we - * preallocate which never got a corresponding vma will need to be - * freed manually. - */ -static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) -{ - int i; - - for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) { - pgd_t pgd = pgdp[i]; - - if (pgd_val(pgd) != 0) { - pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); - - pgdp[i] = native_make_pgd(0); - - paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT); - pmd_free(mm, pmd); - } - } -} - -/* - * In PAE mode, we need to do a cr3 reload (=tlb flush) when - * updating the top-level pagetable entries to guarantee the - * processor notices the update. Since this is expensive, and - * all 4 top-level entries are used almost immediately in a - * new process's life, we just pre-populate them here. - * - * Also, if we're in a paravirt environment where the kernel pmd is - * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate - * and initialize the kernel pmds here. - */ -static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) -{ - pud_t *pud; - unsigned long addr; - int i; - - pud = pud_offset(pgd, 0); - for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; - i++, pud++, addr += PUD_SIZE) { - pmd_t *pmd = pmd_alloc_one(mm, addr); - - if (!pmd) { - pgd_mop_up_pmds(mm, pgd); - return 0; - } - - if (i >= USER_PTRS_PER_PGD) - memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), - sizeof(pmd_t) * PTRS_PER_PMD); - - pud_populate(mm, pud, pmd); - } - - return 1; -} -#else /* !CONFIG_X86_PAE */ -/* No need to prepopulate any pagetable entries in non-PAE modes. */ -static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) -{ - return 1; -} - -static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) -{ -} -#endif /* CONFIG_X86_PAE */ - -pgd_t *pgd_alloc(struct mm_struct *mm) -{ - pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); - - /* so that alloc_pd can use it */ - mm->pgd = pgd; - if (pgd) - pgd_ctor(pgd); - - if (pgd && !pgd_prepopulate_pmd(mm, pgd)) { - pgd_dtor(pgd); - free_page((unsigned long)pgd); - pgd = NULL; - } - - return pgd; -} - -void pgd_free(struct mm_struct *mm, pgd_t *pgd) -{ - pgd_mop_up_pmds(mm, pgd); - pgd_dtor(pgd); - free_page((unsigned long)pgd); -} - void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { pgtable_page_dtor(pte); -- cgit v1.2.3 From 1ec1fe73dfb711f9ea5a0ef8a7e3af5b6ac8b653 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 19 Mar 2008 20:30:40 +0100 Subject: x86: xen unify x86 add common mm pgtable c fix Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/pgtable.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'arch') diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index d526b46ae18..ed16b7704a3 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -200,6 +200,24 @@ static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) return 1; } + +void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) +{ + paravirt_alloc_pd(mm, __pa(pmd) >> PAGE_SHIFT); + + /* Note: almost everything apart from _PAGE_PRESENT is + reserved at the pmd (PDPT) level. */ + set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT)); + + /* + * According to Intel App note "TLBs, Paging-Structure Caches, + * and Their Invalidation", April 2007, document 317080-001, + * section 8.1: in PAE mode we explicitly have to flush the + * TLB via cr3 if the top-level pgd is changed... + */ + if (mm == current->active_mm) + write_cr3(read_cr3()); +} #else /* !CONFIG_X86_PAE */ /* No need to prepopulate any pagetable entries in non-PAE modes. */ static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) -- cgit v1.2.3 From 1d262d3a4932b5ae7222c8d9900696650ee95188 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 17 Mar 2008 16:36:56 -0700 Subject: x86: put paravirt stubs into common asm/pgalloc.h Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/pageattr.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index f7823a17286..938130d49b7 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -483,9 +483,7 @@ static int split_large_page(pte_t *kpte, unsigned long address) goto out_unlock; pbase = (pte_t *)page_address(base); -#ifdef CONFIG_X86_32 paravirt_alloc_pt(&init_mm, page_to_pfn(base)); -#endif ref_prot = pte_pgprot(pte_clrhuge(*kpte)); #ifdef CONFIG_X86_64 -- cgit v1.2.3 From 397f687ab7f840dbe50353c4b60108672b653d0c Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 17 Mar 2008 16:36:57 -0700 Subject: x86: move pte functions into common asm/pgalloc.h Common definitions for 2-level pagetable functions. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/init_64.c | 6 ------ arch/x86/mm/pgtable.c | 7 +++++++ arch/x86/mm/pgtable_32.c | 7 ------- 3 files changed, 7 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 2d89b7abbc5..9c09893e54f 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -662,12 +662,6 @@ int memory_add_physaddr_to_nid(u64 start) EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif -void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) -{ - pgtable_page_dtor(pte); - tlb_remove_page(tlb, pte); -} - void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) { tlb_remove_page(tlb, virt_to_page(pmd)); diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index ed16b7704a3..83765328e26 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -21,6 +21,13 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) return pte; } +void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) +{ + pgtable_page_dtor(pte); + paravirt_release_pt(page_to_pfn(pte)); + tlb_remove_page(tlb, pte); +} + #ifdef CONFIG_X86_64 static inline void pgd_list_add(pgd_t *pgd) { diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index b46893e45d0..b9e3dac3239 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -173,13 +173,6 @@ void reserve_top_address(unsigned long reserve) __VMALLOC_RESERVE += reserve; } -void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) -{ - pgtable_page_dtor(pte); - paravirt_release_pt(page_to_pfn(pte)); - tlb_remove_page(tlb, pte); -} - #ifdef CONFIG_X86_PAE void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) -- cgit v1.2.3 From 170fdff7057d4247e3f28cca96d0db1fbc854e3b Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 17 Mar 2008 16:36:58 -0700 Subject: x86: move pmd functions into common asm/pgalloc.h Common definitions for 3-level pagetable functions. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/init_64.c | 5 ----- arch/x86/mm/pgtable.c | 8 ++++++++ arch/x86/mm/pgtable_32.c | 10 ---------- 3 files changed, 8 insertions(+), 15 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 9c09893e54f..4465104f551 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -662,11 +662,6 @@ int memory_add_physaddr_to_nid(u64 start) EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif -void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) -{ - tlb_remove_page(tlb, virt_to_page(pmd)); -} - void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) { tlb_remove_page(tlb, virt_to_page(pud)); diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 83765328e26..1c41efedf6d 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -28,6 +28,14 @@ void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) tlb_remove_page(tlb, pte); } +#if PAGETABLE_LEVELS > 2 +void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) +{ + paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); + tlb_remove_page(tlb, virt_to_page(pmd)); +} +#endif /* PAGETABLE_LEVELS > 2 */ + #ifdef CONFIG_X86_64 static inline void pgd_list_add(pgd_t *pgd) { diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index b9e3dac3239..9ee007be914 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -173,16 +173,6 @@ void reserve_top_address(unsigned long reserve) __VMALLOC_RESERVE += reserve; } -#ifdef CONFIG_X86_PAE - -void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) -{ - paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); - tlb_remove_page(tlb, virt_to_page(pmd)); -} - -#endif - int pmd_bad(pmd_t pmd) { WARN_ON_ONCE(pmd_bad_v1(pmd) != pmd_bad_v2(pmd)); -- cgit v1.2.3 From 5a5f8f42241cf09caec5530a7639cfa8dccc3a7b Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 17 Mar 2008 16:36:59 -0700 Subject: x86: move pgalloc pud and pgd operations into common place Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/init_64.c | 5 ----- arch/x86/mm/pgtable.c | 7 +++++++ 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 4465104f551..1ff7906a9a4 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -662,11 +662,6 @@ int memory_add_physaddr_to_nid(u64 start) EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif -void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) -{ - tlb_remove_page(tlb, virt_to_page(pud)); -} - #endif /* CONFIG_MEMORY_HOTPLUG */ static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 1c41efedf6d..c67966e10a9 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -34,6 +34,13 @@ void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); tlb_remove_page(tlb, virt_to_page(pmd)); } + +#if PAGETABLE_LEVELS > 3 +void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) +{ + tlb_remove_page(tlb, virt_to_page(pud)); +} +#endif /* PAGETABLE_LEVELS > 3 */ #endif /* PAGETABLE_LEVELS > 2 */ #ifdef CONFIG_X86_64 -- cgit v1.2.3 From 394158559d4c912cc58c311b6346cdea0ed2b1de Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 17 Mar 2008 16:37:00 -0700 Subject: x86: move all the pgd_list handling to one place Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/pgtable.c | 28 +++++++--------------------- 1 file changed, 7 insertions(+), 21 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index c67966e10a9..0d2866b8f42 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -43,34 +43,31 @@ void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) #endif /* PAGETABLE_LEVELS > 3 */ #endif /* PAGETABLE_LEVELS > 2 */ -#ifdef CONFIG_X86_64 static inline void pgd_list_add(pgd_t *pgd) { struct page *page = virt_to_page(pgd); - unsigned long flags; - spin_lock_irqsave(&pgd_lock, flags); list_add(&page->lru, &pgd_list); - spin_unlock_irqrestore(&pgd_lock, flags); } static inline void pgd_list_del(pgd_t *pgd) { struct page *page = virt_to_page(pgd); - unsigned long flags; - spin_lock_irqsave(&pgd_lock, flags); list_del(&page->lru); - spin_unlock_irqrestore(&pgd_lock, flags); } +#ifdef CONFIG_X86_64 pgd_t *pgd_alloc(struct mm_struct *mm) { unsigned boundary; pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); + unsigned long flags; if (!pgd) return NULL; + spin_lock_irqsave(&pgd_lock, flags); pgd_list_add(pgd); + spin_unlock_irqrestore(&pgd_lock, flags); /* * Copy kernel pointers in from init. * Could keep a freelist or slab cache of those because the kernel @@ -86,8 +83,11 @@ pgd_t *pgd_alloc(struct mm_struct *mm) void pgd_free(struct mm_struct *mm, pgd_t *pgd) { + unsigned long flags; BUG_ON((unsigned long)pgd & (PAGE_SIZE-1)); + spin_lock_irqsave(&pgd_lock, flags); pgd_list_del(pgd); + spin_unlock_irqrestore(&pgd_lock, flags); free_page((unsigned long)pgd); } #else @@ -101,20 +101,6 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) * vmalloc faults work because attached pagetables are never freed. * -- wli */ -static inline void pgd_list_add(pgd_t *pgd) -{ - struct page *page = virt_to_page(pgd); - - list_add(&page->lru, &pgd_list); -} - -static inline void pgd_list_del(pgd_t *pgd) -{ - struct page *page = virt_to_page(pgd); - - list_del(&page->lru); -} - #define UNSHARED_PTRS_PER_PGD \ (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD) -- cgit v1.2.3 From 6944a9c8945212a0cc1de3589736d59ec542c539 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 17 Mar 2008 16:37:01 -0700 Subject: x86: rename paravirt_alloc_pt etc after the pagetable structure Rename (alloc|release)_(pt|pd) to pte/pmd to explicitly match the name of the appropriate pagetable level structure. [ x86.git merge work by Mark McLoughlin ] Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Mark McLoughlin Signed-off-by: Thomas Gleixner --- arch/x86/kernel/paravirt.c | 10 +++++----- arch/x86/kernel/vmi_32.c | 20 ++++++++++---------- arch/x86/mm/init_32.c | 6 +++--- arch/x86/mm/ioremap.c | 2 +- arch/x86/mm/pageattr.c | 2 +- arch/x86/mm/pgtable.c | 18 +++++++++--------- arch/x86/xen/enlighten.c | 32 ++++++++++++++++---------------- 7 files changed, 45 insertions(+), 45 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 3733412d135..362653da003 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -366,11 +366,11 @@ struct pv_mmu_ops pv_mmu_ops = { .flush_tlb_single = native_flush_tlb_single, .flush_tlb_others = native_flush_tlb_others, - .alloc_pt = paravirt_nop, - .alloc_pd = paravirt_nop, - .alloc_pd_clone = paravirt_nop, - .release_pt = paravirt_nop, - .release_pd = paravirt_nop, + .alloc_pte = paravirt_nop, + .alloc_pmd = paravirt_nop, + .alloc_pmd_clone = paravirt_nop, + .release_pte = paravirt_nop, + .release_pmd = paravirt_nop, .set_pte = native_set_pte, .set_pte_at = native_set_pte_at, diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 12affe1f9bc..44f7ca153b7 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -392,13 +392,13 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type) } #endif -static void vmi_allocate_pt(struct mm_struct *mm, u32 pfn) +static void vmi_allocate_pte(struct mm_struct *mm, u32 pfn) { vmi_set_page_type(pfn, VMI_PAGE_L1); vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); } -static void vmi_allocate_pd(struct mm_struct *mm, u32 pfn) +static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn) { /* * This call comes in very early, before mem_map is setup. @@ -409,20 +409,20 @@ static void vmi_allocate_pd(struct mm_struct *mm, u32 pfn) vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0); } -static void vmi_allocate_pd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count) +static void vmi_allocate_pmd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count) { vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE); vmi_check_page_type(clonepfn, VMI_PAGE_L2); vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count); } -static void vmi_release_pt(u32 pfn) +static void vmi_release_pte(u32 pfn) { vmi_ops.release_page(pfn, VMI_PAGE_L1); vmi_set_page_type(pfn, VMI_PAGE_NORMAL); } -static void vmi_release_pd(u32 pfn) +static void vmi_release_pmd(u32 pfn) { vmi_ops.release_page(pfn, VMI_PAGE_L2); vmi_set_page_type(pfn, VMI_PAGE_NORMAL); @@ -871,15 +871,15 @@ static inline int __init activate_vmi(void) vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage); if (vmi_ops.allocate_page) { - pv_mmu_ops.alloc_pt = vmi_allocate_pt; - pv_mmu_ops.alloc_pd = vmi_allocate_pd; - pv_mmu_ops.alloc_pd_clone = vmi_allocate_pd_clone; + pv_mmu_ops.alloc_pte = vmi_allocate_pte; + pv_mmu_ops.alloc_pmd = vmi_allocate_pmd; + pv_mmu_ops.alloc_pmd_clone = vmi_allocate_pmd_clone; } vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage); if (vmi_ops.release_page) { - pv_mmu_ops.release_pt = vmi_release_pt; - pv_mmu_ops.release_pd = vmi_release_pd; + pv_mmu_ops.release_pte = vmi_release_pte; + pv_mmu_ops.release_pmd = vmi_release_pmd; } /* Set linear is needed in all cases */ diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 9ec62da85fd..df490905f37 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -71,7 +71,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); - paravirt_alloc_pd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); + paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); pud = pud_offset(pgd, 0); BUG_ON(pmd_table != pmd_offset(pud, 0)); @@ -100,7 +100,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); } - paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT); + paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT); set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); BUG_ON(page_table != pte_offset_kernel(pmd, 0)); } @@ -365,7 +365,7 @@ void __init native_pagetable_setup_start(pgd_t *base) pte_clear(NULL, va, pte); } - paravirt_alloc_pd(&init_mm, __pa(base) >> PAGE_SHIFT); + paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT); } void __init native_pagetable_setup_done(pgd_t *base) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 3a4baf95e24..36a3f7ded62 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -407,7 +407,7 @@ void __init early_ioremap_clear(void) pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); pmd_clear(pmd); - paravirt_release_pt(__pa(bm_pte) >> PAGE_SHIFT); + paravirt_release_pte(__pa(bm_pte) >> PAGE_SHIFT); __flush_tlb_all(); } diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 938130d49b7..57e762c141f 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -483,7 +483,7 @@ static int split_large_page(pte_t *kpte, unsigned long address) goto out_unlock; pbase = (pte_t *)page_address(base); - paravirt_alloc_pt(&init_mm, page_to_pfn(base)); + paravirt_alloc_pte(&init_mm, page_to_pfn(base)); ref_prot = pte_pgprot(pte_clrhuge(*kpte)); #ifdef CONFIG_X86_64 diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 0d2866b8f42..1d44d6dd4c9 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -24,14 +24,14 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { pgtable_page_dtor(pte); - paravirt_release_pt(page_to_pfn(pte)); + paravirt_release_pte(page_to_pfn(pte)); tlb_remove_page(tlb, pte); } #if PAGETABLE_LEVELS > 2 void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) { - paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); + paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); tlb_remove_page(tlb, virt_to_page(pmd)); } @@ -122,10 +122,10 @@ static void pgd_ctor(void *p) clone_pgd_range(pgd + USER_PTRS_PER_PGD, swapper_pg_dir + USER_PTRS_PER_PGD, KERNEL_PGD_PTRS); - paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT, - __pa(swapper_pg_dir) >> PAGE_SHIFT, - USER_PTRS_PER_PGD, - KERNEL_PGD_PTRS); + paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT, + __pa(swapper_pg_dir) >> PAGE_SHIFT, + USER_PTRS_PER_PGD, + KERNEL_PGD_PTRS); } /* list required to sync kernel mapping updates */ @@ -166,7 +166,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) pgdp[i] = native_make_pgd(0); - paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT); + paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); pmd_free(mm, pmd); } } @@ -211,7 +211,7 @@ static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) { - paravirt_alloc_pd(mm, __pa(pmd) >> PAGE_SHIFT); + paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); /* Note: almost everything apart from _PAGE_PRESENT is reserved at the pmd (PDPT) level. */ @@ -242,7 +242,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); - /* so that alloc_pd can use it */ + /* so that alloc_pmd can use it */ mm->pgd = pgd; if (pgd) pgd_ctor(pgd); diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index c0388220cf9..36f36e6b087 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -655,15 +655,15 @@ static void xen_write_cr3(unsigned long cr3) /* Early in boot, while setting up the initial pagetable, assume everything is pinned. */ -static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn) +static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn) { BUG_ON(mem_map); /* should only be used early */ make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); } -/* Early release_pt assumes that all pts are pinned, since there's +/* Early release_pte assumes that all pts are pinned, since there's only init_mm and anything attached to that is pinned. */ -static void xen_release_pt_init(u32 pfn) +static void xen_release_pte_init(u32 pfn) { make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); } @@ -697,12 +697,12 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level) } } -static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) +static void xen_alloc_pte(struct mm_struct *mm, u32 pfn) { xen_alloc_ptpage(mm, pfn, PT_PTE); } -static void xen_alloc_pd(struct mm_struct *mm, u32 pfn) +static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn) { xen_alloc_ptpage(mm, pfn, PT_PMD); } @@ -722,12 +722,12 @@ static void xen_release_ptpage(u32 pfn, unsigned level) } } -static void xen_release_pt(u32 pfn) +static void xen_release_pte(u32 pfn) { xen_release_ptpage(pfn, PT_PTE); } -static void xen_release_pd(u32 pfn) +static void xen_release_pmd(u32 pfn) { xen_release_ptpage(pfn, PT_PMD); } @@ -849,10 +849,10 @@ static __init void xen_pagetable_setup_done(pgd_t *base) { /* This will work as long as patching hasn't happened yet (which it hasn't) */ - pv_mmu_ops.alloc_pt = xen_alloc_pt; - pv_mmu_ops.alloc_pd = xen_alloc_pd; - pv_mmu_ops.release_pt = xen_release_pt; - pv_mmu_ops.release_pd = xen_release_pd; + pv_mmu_ops.alloc_pte = xen_alloc_pte; + pv_mmu_ops.alloc_pmd = xen_alloc_pmd; + pv_mmu_ops.release_pte = xen_release_pte; + pv_mmu_ops.release_pmd = xen_release_pmd; pv_mmu_ops.set_pte = xen_set_pte; setup_shared_info(); @@ -1059,11 +1059,11 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .pte_update = paravirt_nop, .pte_update_defer = paravirt_nop, - .alloc_pt = xen_alloc_pt_init, - .release_pt = xen_release_pt_init, - .alloc_pd = xen_alloc_pt_init, - .alloc_pd_clone = paravirt_nop, - .release_pd = xen_release_pt_init, + .alloc_pte = xen_alloc_pte_init, + .release_pte = xen_release_pte_init, + .alloc_pmd = xen_alloc_pte_init, + .alloc_pmd_clone = paravirt_nop, + .release_pmd = xen_release_pte_init, #ifdef CONFIG_HIGHPTE .kmap_atomic_pte = xen_kmap_atomic_pte, -- cgit v1.2.3 From 2761fa0920756dc471d297843646a4a9bca6656f Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 17 Mar 2008 16:37:02 -0700 Subject: x86: add pud_alloc for 4-level pagetables Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/paravirt.c | 2 ++ arch/x86/mm/pgtable.c | 1 + 2 files changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 362653da003..74f0c5ea2a0 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -369,8 +369,10 @@ struct pv_mmu_ops pv_mmu_ops = { .alloc_pte = paravirt_nop, .alloc_pmd = paravirt_nop, .alloc_pmd_clone = paravirt_nop, + .alloc_pud = paravirt_nop, .release_pte = paravirt_nop, .release_pmd = paravirt_nop, + .release_pud = paravirt_nop, .set_pte = native_set_pte, .set_pte_at = native_set_pte_at, diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 1d44d6dd4c9..5accc08683c 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -38,6 +38,7 @@ void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) #if PAGETABLE_LEVELS > 3 void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) { + paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); tlb_remove_page(tlb, virt_to_page(pud)); } #endif /* PAGETABLE_LEVELS > 3 */ -- cgit v1.2.3 From ee5aa8d3ba65d76157f22b7afedd089d8acfe524 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 17 Mar 2008 16:37:03 -0700 Subject: x86/pgtable.h: demacro ptep_set_access_flags Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/pgtable.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'arch') diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 5accc08683c..e7cda2057e1 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -1,5 +1,6 @@ #include #include +#include #include pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) @@ -264,3 +265,18 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) free_page((unsigned long)pgd); } #endif + +int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty) +{ + int changed = !pte_same(*ptep, entry); + + if (changed && dirty) { + *ptep = entry; + pte_update_defer(vma->vm_mm, address, ptep); + flush_tlb_page(vma, address); + } + + return changed; +} -- cgit v1.2.3 From f9fbf1a36a6bb6a639459802bccee01185ee3220 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 17 Mar 2008 16:37:04 -0700 Subject: x86/pgtable.h: demacro ptep_test_and_clear_young Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/pgtable.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'arch') diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index e7cda2057e1..54bd77a7eee 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -280,3 +280,18 @@ int ptep_set_access_flags(struct vm_area_struct *vma, return changed; } + +int ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + int ret = 0; + + if (pte_young(*ptep)) + ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, + &ptep->pte); + + if (ret) + pte_update(vma->vm_mm, addr, ptep); + + return ret; +} -- cgit v1.2.3 From c20311e165eb94f5ef12b15e452cc6ec24bd7813 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 17 Mar 2008 16:37:05 -0700 Subject: x86/pgtable.h: demacro ptep_clear_flush_young Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/pgtable.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch') diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 54bd77a7eee..af0c50161d9 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -295,3 +295,15 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma, return ret; } + +int ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) +{ + int young; + + young = ptep_test_and_clear_young(vma, address, ptep); + if (young) + flush_tlb_page(vma, address); + + return young; +} -- cgit v1.2.3 From abf33038ffa65097939d86d2a90f93adc6115aa0 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 17 Mar 2008 16:37:07 -0700 Subject: xen: use appropriate pte types Convert Xen pagetable handling to use appropriate *val_t types. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/xen/mmu.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'arch') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 2a054ef2a3d..363f7a3b67f 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -214,35 +214,35 @@ void xen_pmd_clear(pmd_t *pmdp) xen_set_pmd(pmdp, __pmd(0)); } -unsigned long long xen_pte_val(pte_t pte) +pteval_t xen_pte_val(pte_t pte) { - unsigned long long ret = 0; + pteval_t ret = 0; if (pte.pte_low) { - ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low; + ret = ((pteval_t)pte.pte_high << 32) | pte.pte_low; ret = machine_to_phys(XMADDR(ret)).paddr | 1; } return ret; } -unsigned long long xen_pmd_val(pmd_t pmd) +pmdval_t xen_pmd_val(pmd_t pmd) { - unsigned long long ret = pmd.pmd; + pmdval_t ret = pmd.pmd; if (ret) ret = machine_to_phys(XMADDR(ret)).paddr | 1; return ret; } -unsigned long long xen_pgd_val(pgd_t pgd) +pgdval_t xen_pgd_val(pgd_t pgd) { - unsigned long long ret = pgd.pgd; + pgdval_t ret = pgd.pgd; if (ret) ret = machine_to_phys(XMADDR(ret)).paddr | 1; return ret; } -pte_t xen_make_pte(unsigned long long pte) +pte_t xen_make_pte(pteval_t pte) { if (pte & _PAGE_PRESENT) { pte = phys_to_machine(XPADDR(pte)).maddr; @@ -252,7 +252,7 @@ pte_t xen_make_pte(unsigned long long pte) return (pte_t){ .pte = pte }; } -pmd_t xen_make_pmd(unsigned long long pmd) +pmd_t xen_make_pmd(pmdval_t pmd) { if (pmd & 1) pmd = phys_to_machine(XPADDR(pmd)).maddr; @@ -260,7 +260,7 @@ pmd_t xen_make_pmd(unsigned long long pmd) return (pmd_t){ pmd }; } -pgd_t xen_make_pgd(unsigned long long pgd) +pgd_t xen_make_pgd(pgdval_t pgd) { if (pgd & _PAGE_PRESENT) pgd = phys_to_machine(XPADDR(pgd)).maddr; @@ -273,9 +273,9 @@ void xen_set_pte(pte_t *ptep, pte_t pte) *ptep = pte; } -unsigned long xen_pte_val(pte_t pte) +pteval_t xen_pte_val(pte_t pte) { - unsigned long ret = pte.pte_low; + pteval_t ret = pte.pte_low; if (ret & _PAGE_PRESENT) ret = machine_to_phys(XMADDR(ret)).paddr; @@ -283,15 +283,15 @@ unsigned long xen_pte_val(pte_t pte) return ret; } -unsigned long xen_pgd_val(pgd_t pgd) +pgdval_t xen_pgd_val(pgd_t pgd) { - unsigned long ret = pgd.pgd; + pteval_t ret = pgd.pgd; if (ret) ret = machine_to_phys(XMADDR(ret)).paddr | 1; return ret; } -pte_t xen_make_pte(unsigned long pte) +pte_t xen_make_pte(pteval_t pte) { if (pte & _PAGE_PRESENT) { pte = phys_to_machine(XPADDR(pte)).maddr; @@ -301,7 +301,7 @@ pte_t xen_make_pte(unsigned long pte) return (pte_t){ pte }; } -pgd_t xen_make_pgd(unsigned long pgd) +pgd_t xen_make_pgd(pgdval_t pgd) { if (pgd & _PAGE_PRESENT) pgd = phys_to_machine(XPADDR(pgd)).maddr; -- cgit v1.2.3 From 430442e38e7f049841c5838f8c7027bd9e170045 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 17 Mar 2008 16:37:08 -0700 Subject: xen: make use of pte_t union pte_t always contains a "pte" field for the whole pte value, so make use of it. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/xen/mmu.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 363f7a3b67f..3148db8e794 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -216,12 +216,10 @@ void xen_pmd_clear(pmd_t *pmdp) pteval_t xen_pte_val(pte_t pte) { - pteval_t ret = 0; + pteval_t ret = pte.pte; - if (pte.pte_low) { - ret = ((pteval_t)pte.pte_high << 32) | pte.pte_low; - ret = machine_to_phys(XMADDR(ret)).paddr | 1; - } + if (ret & _PAGE_PRESENT) + ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; return ret; } @@ -229,16 +227,16 @@ pteval_t xen_pte_val(pte_t pte) pmdval_t xen_pmd_val(pmd_t pmd) { pmdval_t ret = pmd.pmd; - if (ret) - ret = machine_to_phys(XMADDR(ret)).paddr | 1; + if (ret & _PAGE_PRESENT) + ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; return ret; } pgdval_t xen_pgd_val(pgd_t pgd) { pgdval_t ret = pgd.pgd; - if (ret) - ret = machine_to_phys(XMADDR(ret)).paddr | 1; + if (ret & _PAGE_PRESENT) + ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; return ret; } @@ -254,7 +252,7 @@ pte_t xen_make_pte(pteval_t pte) pmd_t xen_make_pmd(pmdval_t pmd) { - if (pmd & 1) + if (pmd & _PAGE_PRESENT) pmd = phys_to_machine(XPADDR(pmd)).maddr; return (pmd_t){ pmd }; @@ -275,7 +273,7 @@ void xen_set_pte(pte_t *ptep, pte_t pte) pteval_t xen_pte_val(pte_t pte) { - pteval_t ret = pte.pte_low; + pteval_t ret = pte.pte; if (ret & _PAGE_PRESENT) ret = machine_to_phys(XMADDR(ret)).paddr; @@ -286,8 +284,8 @@ pteval_t xen_pte_val(pte_t pte) pgdval_t xen_pgd_val(pgd_t pgd) { pteval_t ret = pgd.pgd; - if (ret) - ret = machine_to_phys(XMADDR(ret)).paddr | 1; + if (ret & _PAGE_PRESENT) + ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; return ret; } -- cgit v1.2.3 From 947a69c90c0d07ac7f214e46dabbe49f2a230e00 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 17 Mar 2008 16:37:09 -0700 Subject: xen: unify pte operations We can fold the essentially common pte functions together now. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/xen/mmu.c | 125 +++++++++++++++++++---------------------------------- 1 file changed, 44 insertions(+), 81 deletions(-) (limited to 'arch') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3148db8e794..272635aaef8 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -171,6 +171,49 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, xen_set_pte(ptep, pteval); } +pteval_t xen_pte_val(pte_t pte) +{ + pteval_t ret = pte.pte; + + if (ret & _PAGE_PRESENT) + ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; + + return ret; +} + +pgdval_t xen_pgd_val(pgd_t pgd) +{ + pgdval_t ret = pgd.pgd; + if (ret & _PAGE_PRESENT) + ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; + return ret; +} + +pte_t xen_make_pte(pteval_t pte) +{ + if (pte & _PAGE_PRESENT) { + pte = phys_to_machine(XPADDR(pte)).maddr; + pte &= ~(_PAGE_PCD | _PAGE_PWT); + } + + return (pte_t){ .pte = pte }; +} + +pgd_t xen_make_pgd(pgdval_t pgd) +{ + if (pgd & _PAGE_PRESENT) + pgd = phys_to_machine(XPADDR(pgd)).maddr; + + return (pgd_t){ pgd }; +} + +pmdval_t xen_pmd_val(pmd_t pmd) +{ + pmdval_t ret = native_pmd_val(pmd); + if (ret & _PAGE_PRESENT) + ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; + return ret; +} #ifdef CONFIG_X86_PAE void xen_set_pud(pud_t *ptr, pud_t val) { @@ -214,98 +257,18 @@ void xen_pmd_clear(pmd_t *pmdp) xen_set_pmd(pmdp, __pmd(0)); } -pteval_t xen_pte_val(pte_t pte) -{ - pteval_t ret = pte.pte; - - if (ret & _PAGE_PRESENT) - ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; - - return ret; -} - -pmdval_t xen_pmd_val(pmd_t pmd) -{ - pmdval_t ret = pmd.pmd; - if (ret & _PAGE_PRESENT) - ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; - return ret; -} - -pgdval_t xen_pgd_val(pgd_t pgd) -{ - pgdval_t ret = pgd.pgd; - if (ret & _PAGE_PRESENT) - ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; - return ret; -} - -pte_t xen_make_pte(pteval_t pte) -{ - if (pte & _PAGE_PRESENT) { - pte = phys_to_machine(XPADDR(pte)).maddr; - pte &= ~(_PAGE_PCD | _PAGE_PWT); - } - - return (pte_t){ .pte = pte }; -} - pmd_t xen_make_pmd(pmdval_t pmd) { if (pmd & _PAGE_PRESENT) pmd = phys_to_machine(XPADDR(pmd)).maddr; - return (pmd_t){ pmd }; -} - -pgd_t xen_make_pgd(pgdval_t pgd) -{ - if (pgd & _PAGE_PRESENT) - pgd = phys_to_machine(XPADDR(pgd)).maddr; - - return (pgd_t){ pgd }; + return native_make_pmd(pmd); } #else /* !PAE */ void xen_set_pte(pte_t *ptep, pte_t pte) { *ptep = pte; } - -pteval_t xen_pte_val(pte_t pte) -{ - pteval_t ret = pte.pte; - - if (ret & _PAGE_PRESENT) - ret = machine_to_phys(XMADDR(ret)).paddr; - - return ret; -} - -pgdval_t xen_pgd_val(pgd_t pgd) -{ - pteval_t ret = pgd.pgd; - if (ret & _PAGE_PRESENT) - ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; - return ret; -} - -pte_t xen_make_pte(pteval_t pte) -{ - if (pte & _PAGE_PRESENT) { - pte = phys_to_machine(XPADDR(pte)).maddr; - pte &= ~(_PAGE_PCD | _PAGE_PWT); - } - - return (pte_t){ pte }; -} - -pgd_t xen_make_pgd(pgdval_t pgd) -{ - if (pgd & _PAGE_PRESENT) - pgd = phys_to_machine(XPADDR(pgd)).maddr; - - return (pgd_t){ pgd }; -} #endif /* CONFIG_X86_PAE */ /* -- cgit v1.2.3 From 90e9f53662826db3cdd6d99bd394d727b05160c1 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 17 Mar 2008 16:37:12 -0700 Subject: xen: make sure iret faults are trapped Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/entry_32.S | 2 +- arch/x86/xen/xen-asm.S | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index f0f8934fc30..568c6ccd7ae 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -409,7 +409,7 @@ restore_nocheck_notrace: irq_return: INTERRUPT_RETURN .section .fixup,"ax" -iret_exc: +ENTRY(iret_exc) pushl $0 # no error code pushl $do_iret_error jmp error_code diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index fe161ed4b01..99223cc323b 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -184,8 +184,12 @@ iret_restore_end: region is OK. */ je xen_hypervisor_callback - iret +1: iret xen_iret_end_crit: +.section __ex_table,"a" + .align 4 + .long 1b,iret_exc +.previous hyper_iret: /* put this out of line since its very rarely used */ -- cgit v1.2.3 From 68db065c845bd9d0eb96946ab104b4c82d0ae9da Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 17 Mar 2008 16:37:13 -0700 Subject: x86: unify KERNEL_PGD_PTRS Make KERNEL_PGD_PTRS common, as previously it was only being defined for 32-bit. There are a couple of follow-on changes from this: - KERNEL_PGD_PTRS was being defined in terms of USER_PGD_PTRS. The definition of USER_PGD_PTRS doesn't really make much sense on x86-64, since it can have two different user address-space configurations. I renamed USER_PGD_PTRS to KERNEL_PGD_BOUNDARY, which is meaningful for all of 32/32, 32/64 and 64/64 process configurations. - USER_PTRS_PER_PGD was also defined and was being used for similar purposes. Converting its users to KERNEL_PGD_BOUNDARY left it completely unused, and so I removed it. Signed-off-by: Jeremy Fitzhardinge Cc: Andi Kleen Cc: Zach Amsden Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/reboot.c | 4 ++-- arch/x86/kernel/smpboot.c | 4 ++-- arch/x86/kernel/vmi_32.c | 2 +- arch/x86/mach-voyager/voyager_smp.c | 4 ++-- arch/x86/mm/init_32.c | 2 +- arch/x86/mm/pgtable.c | 12 ++++++------ 6 files changed, 14 insertions(+), 14 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 19c9386ac11..1791a751a77 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -15,7 +16,6 @@ # include # include # include -# include #else # include #endif @@ -275,7 +275,7 @@ void machine_real_restart(unsigned char *code, int length) /* Remap the kernel at virtual address zero, as well as offset zero from the kernel segment. This assumes the kernel segment starts at virtual address PAGE_OFFSET. */ - memcpy(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, + memcpy(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY, sizeof(swapper_pg_dir [0]) * KERNEL_PGD_PTRS); /* diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 6a925394bc7..2de2f7a2ed5 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1039,8 +1039,8 @@ int __cpuinit native_cpu_up(unsigned int cpu) #ifdef CONFIG_X86_32 /* init low mem mapping */ - clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, - min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS)); + clone_pgd_range(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY, + min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); flush_tlb_all(); #endif diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 44f7ca153b7..956f38927aa 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -320,7 +320,7 @@ static void check_zeroed_page(u32 pfn, int type, struct page *page) * pdes need to be zeroed. */ if (type & VMI_PAGE_CLONE) - limit = USER_PTRS_PER_PGD; + limit = KERNEL_PGD_BOUNDARY; for (i = 0; i < limit; i++) BUG_ON(ptr[i]); } diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c index 96f60c7cd12..394046effd7 100644 --- a/arch/x86/mach-voyager/voyager_smp.c +++ b/arch/x86/mach-voyager/voyager_smp.c @@ -560,8 +560,8 @@ static void __init do_boot_cpu(__u8 cpu) hijack_source.idt.Offset, stack_start.sp)); /* init lowmem identity mapping */ - clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, - min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS)); + clone_pgd_range(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY, + min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); flush_tlb_all(); if (quad_boot) { diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index df490905f37..08aa1878fad 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -457,7 +457,7 @@ void zap_low_mappings(void) * Note that "pgd_clear()" doesn't do it for * us, because pgd_clear() is a no-op on i386. */ - for (i = 0; i < USER_PTRS_PER_PGD; i++) { + for (i = 0; i < KERNEL_PGD_BOUNDARY; i++) { #ifdef CONFIG_X86_PAE set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page))); #else diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index af0c50161d9..e2ac320e615 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -104,7 +104,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) * -- wli */ #define UNSHARED_PTRS_PER_PGD \ - (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD) + (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) static void pgd_ctor(void *p) { @@ -112,7 +112,7 @@ static void pgd_ctor(void *p) unsigned long flags; /* Clear usermode parts of PGD */ - memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); + memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t)); spin_lock_irqsave(&pgd_lock, flags); @@ -121,12 +121,12 @@ static void pgd_ctor(void *p) references from swapper_pg_dir. */ if (PAGETABLE_LEVELS == 2 || (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) { - clone_pgd_range(pgd + USER_PTRS_PER_PGD, - swapper_pg_dir + USER_PTRS_PER_PGD, + clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, KERNEL_PGD_PTRS); paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT, __pa(swapper_pg_dir) >> PAGE_SHIFT, - USER_PTRS_PER_PGD, + KERNEL_PGD_BOUNDARY, KERNEL_PGD_PTRS); } @@ -201,7 +201,7 @@ static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) return 0; } - if (i >= USER_PTRS_PER_PGD) + if (i >= KERNEL_PGD_BOUNDARY) memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), sizeof(pmd_t) * PTRS_PER_PMD); -- cgit v1.2.3 From 85958b465c2e0de315575b1d3d7e7c2ce7126880 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 17 Mar 2008 16:37:14 -0700 Subject: x86: unify pgd ctor/dtor All pagetables need fundamentally the same setup and destruction, so just use the same code for everything. Signed-off-by: Jeremy Fitzhardinge Cc: Andi Kleen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/pgtable.c | 59 ++++++++++++--------------------------------------- 1 file changed, 13 insertions(+), 46 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index e2ac320e615..50159764f69 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -59,50 +59,6 @@ static inline void pgd_list_del(pgd_t *pgd) list_del(&page->lru); } -#ifdef CONFIG_X86_64 -pgd_t *pgd_alloc(struct mm_struct *mm) -{ - unsigned boundary; - pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); - unsigned long flags; - if (!pgd) - return NULL; - spin_lock_irqsave(&pgd_lock, flags); - pgd_list_add(pgd); - spin_unlock_irqrestore(&pgd_lock, flags); - /* - * Copy kernel pointers in from init. - * Could keep a freelist or slab cache of those because the kernel - * part never changes. - */ - boundary = pgd_index(__PAGE_OFFSET); - memset(pgd, 0, boundary * sizeof(pgd_t)); - memcpy(pgd + boundary, - init_level4_pgt + boundary, - (PTRS_PER_PGD - boundary) * sizeof(pgd_t)); - return pgd; -} - -void pgd_free(struct mm_struct *mm, pgd_t *pgd) -{ - unsigned long flags; - BUG_ON((unsigned long)pgd & (PAGE_SIZE-1)); - spin_lock_irqsave(&pgd_lock, flags); - pgd_list_del(pgd); - spin_unlock_irqrestore(&pgd_lock, flags); - free_page((unsigned long)pgd); -} -#else -/* - * List of all pgd's needed for non-PAE so it can invalidate entries - * in both cached and uncached pgd's; not needed for PAE since the - * kernel pmd is shared. If PAE were not to share the pmd a similar - * tactic would be needed. This is essentially codepath-based locking - * against pageattr.c; it is the unique case in which a valid change - * of kernel pagetables can't be lazily synchronized by vmalloc faults. - * vmalloc faults work because attached pagetables are never freed. - * -- wli - */ #define UNSHARED_PTRS_PER_PGD \ (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) @@ -120,7 +76,8 @@ static void pgd_ctor(void *p) ptes in non-PAE, or shared PMD in PAE), then just copy the references from swapper_pg_dir. */ if (PAGETABLE_LEVELS == 2 || - (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) { + (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) || + PAGETABLE_LEVELS == 4) { clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, swapper_pg_dir + KERNEL_PGD_BOUNDARY, KERNEL_PGD_PTRS); @@ -149,6 +106,17 @@ static void pgd_dtor(void *pgd) spin_unlock_irqrestore(&pgd_lock, flags); } +/* + * List of all pgd's needed for non-PAE so it can invalidate entries + * in both cached and uncached pgd's; not needed for PAE since the + * kernel pmd is shared. If PAE were not to share the pmd a similar + * tactic would be needed. This is essentially codepath-based locking + * against pageattr.c; it is the unique case in which a valid change + * of kernel pagetables can't be lazily synchronized by vmalloc faults. + * vmalloc faults work because attached pagetables are never freed. + * -- wli + */ + #ifdef CONFIG_X86_PAE /* * Mop up any pmd pages which may still be attached to the pgd. @@ -264,7 +232,6 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) pgd_dtor(pgd); free_page((unsigned long)pgd); } -#endif int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, -- cgit v1.2.3 From e2a81baf6604a2e08e10c7405b0349106f77c8af Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 17 Mar 2008 16:37:17 -0700 Subject: xen: support sysenter/sysexit if hypervisor does 64-bit Xen supports sysenter for 32-bit guests, so support its use. (sysenter is faster than int $0x80 in 32-on-64.) sysexit is still not supported, so we fake it up using iret. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/entry_32.S | 18 ++++++++++++++- arch/x86/xen/enlighten.c | 3 +-- arch/x86/xen/setup.c | 21 +++++++++++++++++ arch/x86/xen/smp.c | 1 + arch/x86/xen/xen-asm.S | 56 ++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/xen/xen-ops.h | 3 +++ 6 files changed, 99 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 568c6ccd7ae..5d80d53eaff 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1017,6 +1017,13 @@ ENTRY(kernel_thread_helper) ENDPROC(kernel_thread_helper) #ifdef CONFIG_XEN +/* Xen doesn't set %esp to be precisely what the normal sysenter + entrypoint expects, so fix it up before using the normal path. */ +ENTRY(xen_sysenter_target) + RING0_INT_FRAME + addl $5*4, %esp /* remove xen-provided frame */ + jmp sysenter_past_esp + ENTRY(xen_hypervisor_callback) CFI_STARTPROC pushl $0 @@ -1036,8 +1043,17 @@ ENTRY(xen_hypervisor_callback) jae 1f call xen_iret_crit_fixup + jmp 2f + +1: cmpl $xen_sysexit_start_crit,%eax + jb 2f + cmpl $xen_sysexit_end_crit,%eax + jae 2f + + jmp xen_sysexit_crit_fixup -1: mov %esp, %eax +ENTRY(xen_do_upcall) +2: mov %esp, %eax call xen_evtchn_do_upcall jmp ret_from_intr CFI_ENDPROC diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 36f36e6b087..943684566eb 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -155,7 +155,6 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx, if (*ax == 1) maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */ (1 << X86_FEATURE_ACPI) | /* disable ACPI */ - (1 << X86_FEATURE_SEP) | /* disable SEP */ (1 << X86_FEATURE_ACC)); /* thermal monitoring */ asm(XEN_EMULATE_PREFIX "cpuid" @@ -994,7 +993,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { .read_pmc = native_read_pmc, .iret = xen_iret, - .irq_enable_syscall_ret = NULL, /* never called */ + .irq_enable_syscall_ret = xen_sysexit, .load_tr_desc = paravirt_nop, .set_ldt = xen_set_ldt, diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 2341492bf7a..82517e4a752 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -68,6 +69,24 @@ static void __init fiddle_vdso(void) *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; } +void xen_enable_sysenter(void) +{ + int cpu = smp_processor_id(); + extern void xen_sysenter_target(void); + /* Mask events on entry, even though they get enabled immediately */ + static struct callback_register sysenter = { + .type = CALLBACKTYPE_sysenter, + .address = { __KERNEL_CS, (unsigned long)xen_sysenter_target }, + .flags = CALLBACKF_mask_events, + }; + + if (!boot_cpu_has(X86_FEATURE_SEP) || + HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) != 0) { + clear_cpu_cap(&cpu_data(cpu), X86_FEATURE_SEP); + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP); + } +} + void __init xen_arch_setup(void) { struct physdev_set_iopl set_iopl; @@ -82,6 +101,8 @@ void __init xen_arch_setup(void) HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback, __KERNEL_CS, (unsigned long)xen_failsafe_callback); + xen_enable_sysenter(); + set_iopl.iopl = 1; rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); if (rc != 0) diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index e340ff92f6b..d61e4f8b07c 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -72,6 +72,7 @@ static __cpuinit void cpu_bringup_and_idle(void) int cpu = smp_processor_id(); cpu_init(); + xen_enable_sysenter(); preempt_disable(); per_cpu(cpu_state, cpu) = CPU_ONLINE; diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index 99223cc323b..1ac08082a4b 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -280,6 +280,62 @@ ENTRY(xen_iret_crit_fixup) 2: ret +ENTRY(xen_sysexit) + /* Store vcpu_info pointer for easy access. Do it this + way to avoid having to reload %fs */ +#ifdef CONFIG_SMP + GET_THREAD_INFO(%eax) + movl TI_cpu(%eax),%eax + movl __per_cpu_offset(,%eax,4),%eax + mov per_cpu__xen_vcpu(%eax),%eax +#else + movl per_cpu__xen_vcpu, %eax +#endif + + /* We can't actually use sysexit in a pv guest, + so fake it up with iret */ + pushl $__USER_DS /* user stack segment */ + pushl %ecx /* user esp */ + pushl PT_EFLAGS+2*4(%esp) /* user eflags */ + pushl $__USER_CS /* user code segment */ + pushl %edx /* user eip */ + +xen_sysexit_start_crit: + /* Unmask events... */ + movb $0, XEN_vcpu_info_mask(%eax) + /* ...and test for pending. + There's a preempt window here, but it doesn't + matter because we're within the critical section. */ + testb $0xff, XEN_vcpu_info_pending(%eax) + + /* If there's something pending, mask events again so we + can directly inject it back into the kernel. */ + jnz 1f + + movl PT_EAX+5*4(%esp),%eax +2: iret +1: movb $1, XEN_vcpu_info_mask(%eax) +xen_sysexit_end_crit: + addl $5*4, %esp /* remove iret frame */ + /* no need to re-save regs, but need to restore kernel %fs */ + mov $__KERNEL_PERCPU, %eax + mov %eax, %fs + jmp xen_do_upcall +.section __ex_table,"a" + .align 4 + .long 2b,iret_exc +.previous + + .globl xen_sysexit_start_crit, xen_sysexit_end_crit +/* + sysexit fixup is easy, since the old frame is still sitting there + on the stack. We just need to remove the new recursive + interrupt and return. + */ +ENTRY(xen_sysexit_crit_fixup) + addl $PT_OLDESP+5*4, %esp /* remove frame+iret */ + jmp xen_do_upcall + /* Force an event check by making a hypercall, but preserve regs before making the call. diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 956a491ea99..01d4ff2ce40 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -19,6 +19,7 @@ extern struct shared_info *HYPERVISOR_shared_info; char * __init xen_memory_setup(void); void __init xen_arch_setup(void); void __init xen_init_IRQ(void); +void xen_enable_sysenter(void); void xen_setup_timer(int cpu); void xen_setup_cpu_clockevents(void); @@ -64,4 +65,6 @@ DECL_ASM(unsigned long, xen_save_fl_direct, void); DECL_ASM(void, xen_restore_fl_direct, unsigned long); void xen_iret(void); +void xen_sysexit(void); + #endif /* XEN_OPS_H */ -- cgit v1.2.3 From ee523ca1e456d754d66be6deab910131e4e1dbf8 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 17 Mar 2008 16:37:18 -0700 Subject: xen: implement a debug-interrupt handler Xen supports the notion of a debug interrupt which can be triggered from the console. For now this is implemented to show pending events, masks and each CPU's pending event set. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/xen/events.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/xen/smp.c | 19 ++++++++++++++----- arch/x86/xen/xen-ops.h | 3 +++ 3 files changed, 64 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/xen/events.c b/arch/x86/xen/events.c index dcf613e1758..0140981e93c 100644 --- a/arch/x86/xen/events.c +++ b/arch/x86/xen/events.c @@ -455,6 +455,53 @@ void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector) notify_remote_via_irq(irq); } +irqreturn_t xen_debug_interrupt(int irq, void *dev_id) +{ + struct shared_info *sh = HYPERVISOR_shared_info; + int cpu = smp_processor_id(); + int i; + unsigned long flags; + static DEFINE_SPINLOCK(debug_lock); + + spin_lock_irqsave(&debug_lock, flags); + + printk("vcpu %d\n ", cpu); + + for_each_online_cpu(i) { + struct vcpu_info *v = per_cpu(xen_vcpu, i); + printk("%d: masked=%d pending=%d event_sel %08lx\n ", i, + (get_irq_regs() && i == cpu) ? !(get_irq_regs()->flags & X86_EFLAGS_IF) : v->evtchn_upcall_mask, + v->evtchn_upcall_pending, + v->evtchn_pending_sel); + } + printk("pending:\n "); + for(i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--) + printk("%08lx%s", sh->evtchn_pending[i], + i % 8 == 0 ? "\n " : " "); + printk("\nmasks:\n "); + for(i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) + printk("%08lx%s", sh->evtchn_mask[i], + i % 8 == 0 ? "\n " : " "); + + printk("\nunmasked:\n "); + for(i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) + printk("%08lx%s", sh->evtchn_pending[i] & ~sh->evtchn_mask[i], + i % 8 == 0 ? "\n " : " "); + + printk("\npending list:\n"); + for(i = 0; i < NR_EVENT_CHANNELS; i++) { + if (sync_test_bit(i, sh->evtchn_pending)) { + printk(" %d: event %d -> irq %d\n", + cpu_evtchn[i], i, + evtchn_to_irq[i]); + } + } + + spin_unlock_irqrestore(&debug_lock, flags); + + return IRQ_HANDLED; +} + /* * Search the CPUs pending events bitmasks. For each one found, map diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index d61e4f8b07c..92dd3dbf3ff 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -36,8 +36,9 @@ #include "mmu.h" static cpumask_t xen_cpu_initialized_map; -static DEFINE_PER_CPU(int, resched_irq); -static DEFINE_PER_CPU(int, callfunc_irq); +static DEFINE_PER_CPU(int, resched_irq) = -1; +static DEFINE_PER_CPU(int, callfunc_irq) = -1; +static DEFINE_PER_CPU(int, debug_irq) = -1; /* * Structure and data for smp_call_function(). This is designed to minimise @@ -89,9 +90,7 @@ static __cpuinit void cpu_bringup_and_idle(void) static int xen_smp_intr_init(unsigned int cpu) { int rc; - const char *resched_name, *callfunc_name; - - per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1; + const char *resched_name, *callfunc_name, *debug_name; resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu); rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR, @@ -115,6 +114,14 @@ static int xen_smp_intr_init(unsigned int cpu) goto fail; per_cpu(callfunc_irq, cpu) = rc; + debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu); + rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu, xen_debug_interrupt, + IRQF_DISABLED | IRQF_PERCPU | IRQF_NOBALANCING, + debug_name, NULL); + if (rc < 0) + goto fail; + per_cpu(debug_irq, cpu) = rc; + return 0; fail: @@ -122,6 +129,8 @@ static int xen_smp_intr_init(unsigned int cpu) unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL); if (per_cpu(callfunc_irq, cpu) >= 0) unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); + if (per_cpu(debug_irq, cpu) >= 0) + unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL); return rc; } diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 01d4ff2ce40..22395d20dd6 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -2,6 +2,7 @@ #define XEN_OPS_H #include +#include /* These are code, but not functions. Defined in entry.S */ extern const char xen_hypervisor_callback[]; @@ -29,6 +30,8 @@ unsigned long xen_get_wallclock(void); int xen_set_wallclock(unsigned long time); unsigned long long xen_sched_clock(void); +irqreturn_t xen_debug_interrupt(int irq, void *dev_id); + bool xen_vcpu_stolen(int vcpu); void xen_mark_init_mm_pinned(void); -- cgit v1.2.3 From ee8fa1c67f0b873a324960f0ca9fa1d7e49aa86b Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 17 Mar 2008 16:37:19 -0700 Subject: xen: make sure retriggered events are set pending retrigger_dynirq() was incomplete, and didn't properly set the event to be pending again. It doesn't seem to actually get used. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/xen/events.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/xen/events.c b/arch/x86/xen/events.c index 0140981e93c..f73b53bd65b 100644 --- a/arch/x86/xen/events.c +++ b/arch/x86/xen/events.c @@ -601,10 +601,16 @@ static void ack_dynirq(unsigned int irq) static int retrigger_dynirq(unsigned int irq) { int evtchn = evtchn_from_irq(irq); + struct shared_info *sh = HYPERVISOR_shared_info; int ret = 0; if (VALID_EVTCHN(evtchn)) { - set_evtchn(evtchn); + int masked; + + masked = sync_test_and_set_bit(evtchn, sh->evtchn_mask); + sync_set_bit(evtchn, sh->evtchn_pending); + if (!masked) + unmask_evtchn(evtchn); ret = 1; } -- cgit v1.2.3 From 229664bee6126e01f8662976a5fe2e79813b77c8 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 17 Mar 2008 16:37:20 -0700 Subject: xen: short-cut for recursive event handling If an event comes in while events are currently being processed, then just increment the counter and have the outer event loop reprocess the pending events. This prevents unbounded recursion on heavy event loads (of course massive event storms will cause infinite loops). Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/xen/events.c | 46 ++++++++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 16 deletions(-) (limited to 'arch') diff --git a/arch/x86/xen/events.c b/arch/x86/xen/events.c index f73b53bd65b..85bac298b3c 100644 --- a/arch/x86/xen/events.c +++ b/arch/x86/xen/events.c @@ -517,29 +517,43 @@ void xen_evtchn_do_upcall(struct pt_regs *regs) int cpu = get_cpu(); struct shared_info *s = HYPERVISOR_shared_info; struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); - unsigned long pending_words; + static DEFINE_PER_CPU(unsigned, nesting_count); + unsigned count; - vcpu_info->evtchn_upcall_pending = 0; + do { + unsigned long pending_words; - /* NB. No need for a barrier here -- XCHG is a barrier on x86. */ - pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0); - while (pending_words != 0) { - unsigned long pending_bits; - int word_idx = __ffs(pending_words); - pending_words &= ~(1UL << word_idx); + vcpu_info->evtchn_upcall_pending = 0; - while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) { - int bit_idx = __ffs(pending_bits); - int port = (word_idx * BITS_PER_LONG) + bit_idx; - int irq = evtchn_to_irq[port]; + if (__get_cpu_var(nesting_count)++) + goto out; - if (irq != -1) { - regs->orig_ax = ~irq; - do_IRQ(regs); + /* NB. No need for a barrier here -- XCHG is a barrier on x86. */ + pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0); + while (pending_words != 0) { + unsigned long pending_bits; + int word_idx = __ffs(pending_words); + pending_words &= ~(1UL << word_idx); + + while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) { + int bit_idx = __ffs(pending_bits); + int port = (word_idx * BITS_PER_LONG) + bit_idx; + int irq = evtchn_to_irq[port]; + + if (irq != -1) { + regs->orig_ax = ~irq; + do_IRQ(regs); + } } } - } + BUG_ON(!irqs_disabled()); + + count = __get_cpu_var(nesting_count); + __get_cpu_var(nesting_count) = 0; + } while(count != 1); + +out: put_cpu(); } -- cgit v1.2.3 From dbe9e994c99ac9ac12d2b66ea42f44558f54fa52 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 17 Mar 2008 16:37:21 -0700 Subject: xen: no need for domU to worry about MCE/MCA Mask MCE/MCA out of cpu caps. Its harmless to leave them there, but it does prevent the kernel from starting an unnecessary thread. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/xen/enlighten.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 943684566eb..8c5ff24a492 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -155,6 +155,8 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx, if (*ax == 1) maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */ (1 << X86_FEATURE_ACPI) | /* disable ACPI */ + (1 << X86_FEATURE_MCE) | /* disable MCE */ + (1 << X86_FEATURE_MCA) | /* disable MCA */ (1 << X86_FEATURE_ACC)); /* thermal monitoring */ asm(XEN_EMULATE_PREFIX "cpuid" -- cgit v1.2.3 From 0f2c87695219b1129ccf93e0f58acdcdd49724b9 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 17 Mar 2008 16:37:22 -0700 Subject: xen: jump to iret fixup Use jmp rather than call for the iret fixup, so its consistent with the sysexit fixup, and it simplifies the stack (which is already complex). Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/entry_32.S | 3 +-- arch/x86/xen/xen-asm.S | 22 +++++++++------------- 2 files changed, 10 insertions(+), 15 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 5d80d53eaff..209c334bb92 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1042,8 +1042,7 @@ ENTRY(xen_hypervisor_callback) cmpl $xen_iret_end_crit,%eax jae 1f - call xen_iret_crit_fixup - jmp 2f + jmp xen_iret_crit_fixup 1: cmpl $xen_sysexit_start_crit,%eax jb 2f diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index 1ac08082a4b..53cae923e14 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -223,9 +223,7 @@ hyper_iret: ds } SAVE_ALL state eax } : : - ebx } - ---------------- - return addr <- esp + ebx }<- esp ---------------- In order to deliver the nested exception properly, we need to shift @@ -240,10 +238,8 @@ hyper_iret: it's usermode state which we eventually need to restore. */ ENTRY(xen_iret_crit_fixup) - /* offsets +4 for return address */ - /* - Paranoia: Make sure we're really coming from userspace. + Paranoia: Make sure we're really coming from kernel space. One could imagine a case where userspace jumps into the critical range address, but just before the CPU delivers a GP, it decides to deliver an interrupt instead. Unlikely? @@ -252,32 +248,32 @@ ENTRY(xen_iret_crit_fixup) jump instruction itself, not the destination, but some virtual environments get this wrong. */ - movl PT_CS+4(%esp), %ecx + movl PT_CS(%esp), %ecx andl $SEGMENT_RPL_MASK, %ecx cmpl $USER_RPL, %ecx je 2f - lea PT_ORIG_EAX+4(%esp), %esi - lea PT_EFLAGS+4(%esp), %edi + lea PT_ORIG_EAX(%esp), %esi + lea PT_EFLAGS(%esp), %edi /* If eip is before iret_restore_end then stack hasn't been restored yet. */ cmp $iret_restore_end, %eax jae 1f - movl 0+4(%edi),%eax /* copy EAX */ - movl %eax, PT_EAX+4(%esp) + movl 0+4(%edi),%eax /* copy EAX (just above top of frame) */ + movl %eax, PT_EAX(%esp) lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */ /* set up the copy */ 1: std - mov $(PT_EIP+4) / 4, %ecx /* copy ret+saved regs up to orig_eax */ + mov $PT_EIP / 4, %ecx /* saved regs up to orig_eax */ rep movsl cld lea 4(%edi),%esp /* point esp to new frame */ -2: ret +2: jmp xen_do_upcall ENTRY(xen_sysexit) -- cgit v1.2.3 From af711cda4f94b5fddcdc5eb4134387ae026e3171 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Wed, 2 Apr 2008 10:53:54 -0700 Subject: xen: move features.c from arch/x86/xen/features.c to drivers/xen ia64/xen also uses it too. Move it into common place so that ia64/xen can share the code. Signed-off-by: Isaku Yamahata Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/xen/Makefile | 2 +- arch/x86/xen/features.c | 29 ----------------------------- 2 files changed, 1 insertion(+), 30 deletions(-) delete mode 100644 arch/x86/xen/features.c (limited to 'arch') diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 343df246bd3..c5e9aa48990 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -1,4 +1,4 @@ -obj-y := enlighten.o setup.o features.o multicalls.o mmu.o \ +obj-y := enlighten.o setup.o multicalls.o mmu.o \ events.o time.o manage.o xen-asm.o obj-$(CONFIG_SMP) += smp.o diff --git a/arch/x86/xen/features.c b/arch/x86/xen/features.c deleted file mode 100644 index 0707714e40d..00000000000 --- a/arch/x86/xen/features.c +++ /dev/null @@ -1,29 +0,0 @@ -/****************************************************************************** - * features.c - * - * Xen feature flags. - * - * Copyright (c) 2006, Ian Campbell, XenSource Inc. - */ -#include -#include -#include -#include -#include - -u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly; -EXPORT_SYMBOL_GPL(xen_features); - -void xen_setup_features(void) -{ - struct xen_feature_info fi; - int i, j; - - for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) { - fi.submap_idx = i; - if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0) - break; - for (j = 0; j < 32; j++) - xen_features[i * 32 + j] = !!(fi.submap & 1< Date: Wed, 2 Apr 2008 10:53:55 -0700 Subject: xen: move events.c to drivers/xen for IA64/Xen support move arch/x86/xen/events.c undedr drivers/xen to share codes with x86 and ia64. And minor adjustment to compile. ia64/xen also uses events.c Signed-off-by: Yaozu (Eddie) Dong Signed-off-by: Isaku Yamahata Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/xen/Makefile | 2 +- arch/x86/xen/events.c | 658 ------------------------------------------------- arch/x86/xen/xen-ops.h | 2 +- 3 files changed, 2 insertions(+), 660 deletions(-) delete mode 100644 arch/x86/xen/events.c (limited to 'arch') diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index c5e9aa48990..95c59260dcc 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -1,4 +1,4 @@ obj-y := enlighten.o setup.o multicalls.o mmu.o \ - events.o time.o manage.o xen-asm.o + time.o manage.o xen-asm.o obj-$(CONFIG_SMP) += smp.o diff --git a/arch/x86/xen/events.c b/arch/x86/xen/events.c deleted file mode 100644 index 85bac298b3c..00000000000 --- a/arch/x86/xen/events.c +++ /dev/null @@ -1,658 +0,0 @@ -/* - * Xen event channels - * - * Xen models interrupts with abstract event channels. Because each - * domain gets 1024 event channels, but NR_IRQ is not that large, we - * must dynamically map irqs<->event channels. The event channels - * interface with the rest of the kernel by defining a xen interrupt - * chip. When an event is recieved, it is mapped to an irq and sent - * through the normal interrupt processing path. - * - * There are four kinds of events which can be mapped to an event - * channel: - * - * 1. Inter-domain notifications. This includes all the virtual - * device events, since they're driven by front-ends in another domain - * (typically dom0). - * 2. VIRQs, typically used for timers. These are per-cpu events. - * 3. IPIs. - * 4. Hardware interrupts. Not supported at present. - * - * Jeremy Fitzhardinge , XenSource Inc, 2007 - */ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "xen-ops.h" - -/* - * This lock protects updates to the following mapping and reference-count - * arrays. The lock does not need to be acquired to read the mapping tables. - */ -static DEFINE_SPINLOCK(irq_mapping_update_lock); - -/* IRQ <-> VIRQ mapping. */ -static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1}; - -/* IRQ <-> IPI mapping */ -static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1}; - -/* Packed IRQ information: binding type, sub-type index, and event channel. */ -struct packed_irq -{ - unsigned short evtchn; - unsigned char index; - unsigned char type; -}; - -static struct packed_irq irq_info[NR_IRQS]; - -/* Binding types. */ -enum { - IRQT_UNBOUND, - IRQT_PIRQ, - IRQT_VIRQ, - IRQT_IPI, - IRQT_EVTCHN -}; - -/* Convenient shorthand for packed representation of an unbound IRQ. */ -#define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0) - -static int evtchn_to_irq[NR_EVENT_CHANNELS] = { - [0 ... NR_EVENT_CHANNELS-1] = -1 -}; -static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG]; -static u8 cpu_evtchn[NR_EVENT_CHANNELS]; - -/* Reference counts for bindings to IRQs. */ -static int irq_bindcount[NR_IRQS]; - -/* Xen will never allocate port zero for any purpose. */ -#define VALID_EVTCHN(chn) ((chn) != 0) - -/* - * Force a proper event-channel callback from Xen after clearing the - * callback mask. We do this in a very simple manner, by making a call - * down into Xen. The pending flag will be checked by Xen on return. - */ -void force_evtchn_callback(void) -{ - (void)HYPERVISOR_xen_version(0, NULL); -} -EXPORT_SYMBOL_GPL(force_evtchn_callback); - -static struct irq_chip xen_dynamic_chip; - -/* Constructor for packed IRQ information. */ -static inline struct packed_irq mk_irq_info(u32 type, u32 index, u32 evtchn) -{ - return (struct packed_irq) { evtchn, index, type }; -} - -/* - * Accessors for packed IRQ information. - */ -static inline unsigned int evtchn_from_irq(int irq) -{ - return irq_info[irq].evtchn; -} - -static inline unsigned int index_from_irq(int irq) -{ - return irq_info[irq].index; -} - -static inline unsigned int type_from_irq(int irq) -{ - return irq_info[irq].type; -} - -static inline unsigned long active_evtchns(unsigned int cpu, - struct shared_info *sh, - unsigned int idx) -{ - return (sh->evtchn_pending[idx] & - cpu_evtchn_mask[cpu][idx] & - ~sh->evtchn_mask[idx]); -} - -static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) -{ - int irq = evtchn_to_irq[chn]; - - BUG_ON(irq == -1); -#ifdef CONFIG_SMP - irq_desc[irq].affinity = cpumask_of_cpu(cpu); -#endif - - __clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]); - __set_bit(chn, cpu_evtchn_mask[cpu]); - - cpu_evtchn[chn] = cpu; -} - -static void init_evtchn_cpu_bindings(void) -{ -#ifdef CONFIG_SMP - int i; - /* By default all event channels notify CPU#0. */ - for (i = 0; i < NR_IRQS; i++) - irq_desc[i].affinity = cpumask_of_cpu(0); -#endif - - memset(cpu_evtchn, 0, sizeof(cpu_evtchn)); - memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0])); -} - -static inline unsigned int cpu_from_evtchn(unsigned int evtchn) -{ - return cpu_evtchn[evtchn]; -} - -static inline void clear_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - sync_clear_bit(port, &s->evtchn_pending[0]); -} - -static inline void set_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - sync_set_bit(port, &s->evtchn_pending[0]); -} - - -/** - * notify_remote_via_irq - send event to remote end of event channel via irq - * @irq: irq of event channel to send event to - * - * Unlike notify_remote_via_evtchn(), this is safe to use across - * save/restore. Notifications on a broken connection are silently - * dropped. - */ -void notify_remote_via_irq(int irq) -{ - int evtchn = evtchn_from_irq(irq); - - if (VALID_EVTCHN(evtchn)) - notify_remote_via_evtchn(evtchn); -} -EXPORT_SYMBOL_GPL(notify_remote_via_irq); - -static void mask_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - sync_set_bit(port, &s->evtchn_mask[0]); -} - -static void unmask_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - unsigned int cpu = get_cpu(); - - BUG_ON(!irqs_disabled()); - - /* Slow path (hypercall) if this is a non-local port. */ - if (unlikely(cpu != cpu_from_evtchn(port))) { - struct evtchn_unmask unmask = { .port = port }; - (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); - } else { - struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); - - sync_clear_bit(port, &s->evtchn_mask[0]); - - /* - * The following is basically the equivalent of - * 'hw_resend_irq'. Just like a real IO-APIC we 'lose - * the interrupt edge' if the channel is masked. - */ - if (sync_test_bit(port, &s->evtchn_pending[0]) && - !sync_test_and_set_bit(port / BITS_PER_LONG, - &vcpu_info->evtchn_pending_sel)) - vcpu_info->evtchn_upcall_pending = 1; - } - - put_cpu(); -} - -static int find_unbound_irq(void) -{ - int irq; - - /* Only allocate from dynirq range */ - for (irq = 0; irq < NR_IRQS; irq++) - if (irq_bindcount[irq] == 0) - break; - - if (irq == NR_IRQS) - panic("No available IRQ to bind to: increase NR_IRQS!\n"); - - return irq; -} - -int bind_evtchn_to_irq(unsigned int evtchn) -{ - int irq; - - spin_lock(&irq_mapping_update_lock); - - irq = evtchn_to_irq[evtchn]; - - if (irq == -1) { - irq = find_unbound_irq(); - - dynamic_irq_init(irq); - set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, - handle_level_irq, "event"); - - evtchn_to_irq[evtchn] = irq; - irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn); - } - - irq_bindcount[irq]++; - - spin_unlock(&irq_mapping_update_lock); - - return irq; -} -EXPORT_SYMBOL_GPL(bind_evtchn_to_irq); - -static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) -{ - struct evtchn_bind_ipi bind_ipi; - int evtchn, irq; - - spin_lock(&irq_mapping_update_lock); - - irq = per_cpu(ipi_to_irq, cpu)[ipi]; - if (irq == -1) { - irq = find_unbound_irq(); - if (irq < 0) - goto out; - - dynamic_irq_init(irq); - set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, - handle_level_irq, "ipi"); - - bind_ipi.vcpu = cpu; - if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, - &bind_ipi) != 0) - BUG(); - evtchn = bind_ipi.port; - - evtchn_to_irq[evtchn] = irq; - irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn); - - per_cpu(ipi_to_irq, cpu)[ipi] = irq; - - bind_evtchn_to_cpu(evtchn, cpu); - } - - irq_bindcount[irq]++; - - out: - spin_unlock(&irq_mapping_update_lock); - return irq; -} - - -static int bind_virq_to_irq(unsigned int virq, unsigned int cpu) -{ - struct evtchn_bind_virq bind_virq; - int evtchn, irq; - - spin_lock(&irq_mapping_update_lock); - - irq = per_cpu(virq_to_irq, cpu)[virq]; - - if (irq == -1) { - bind_virq.virq = virq; - bind_virq.vcpu = cpu; - if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, - &bind_virq) != 0) - BUG(); - evtchn = bind_virq.port; - - irq = find_unbound_irq(); - - dynamic_irq_init(irq); - set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, - handle_level_irq, "virq"); - - evtchn_to_irq[evtchn] = irq; - irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn); - - per_cpu(virq_to_irq, cpu)[virq] = irq; - - bind_evtchn_to_cpu(evtchn, cpu); - } - - irq_bindcount[irq]++; - - spin_unlock(&irq_mapping_update_lock); - - return irq; -} - -static void unbind_from_irq(unsigned int irq) -{ - struct evtchn_close close; - int evtchn = evtchn_from_irq(irq); - - spin_lock(&irq_mapping_update_lock); - - if (VALID_EVTCHN(evtchn) && (--irq_bindcount[irq] == 0)) { - close.port = evtchn; - if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) - BUG(); - - switch (type_from_irq(irq)) { - case IRQT_VIRQ: - per_cpu(virq_to_irq, cpu_from_evtchn(evtchn)) - [index_from_irq(irq)] = -1; - break; - default: - break; - } - - /* Closed ports are implicitly re-bound to VCPU0. */ - bind_evtchn_to_cpu(evtchn, 0); - - evtchn_to_irq[evtchn] = -1; - irq_info[irq] = IRQ_UNBOUND; - - dynamic_irq_init(irq); - } - - spin_unlock(&irq_mapping_update_lock); -} - -int bind_evtchn_to_irqhandler(unsigned int evtchn, - irq_handler_t handler, - unsigned long irqflags, - const char *devname, void *dev_id) -{ - unsigned int irq; - int retval; - - irq = bind_evtchn_to_irq(evtchn); - retval = request_irq(irq, handler, irqflags, devname, dev_id); - if (retval != 0) { - unbind_from_irq(irq); - return retval; - } - - return irq; -} -EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler); - -int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, - irq_handler_t handler, - unsigned long irqflags, const char *devname, void *dev_id) -{ - unsigned int irq; - int retval; - - irq = bind_virq_to_irq(virq, cpu); - retval = request_irq(irq, handler, irqflags, devname, dev_id); - if (retval != 0) { - unbind_from_irq(irq); - return retval; - } - - return irq; -} -EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler); - -int bind_ipi_to_irqhandler(enum ipi_vector ipi, - unsigned int cpu, - irq_handler_t handler, - unsigned long irqflags, - const char *devname, - void *dev_id) -{ - int irq, retval; - - irq = bind_ipi_to_irq(ipi, cpu); - if (irq < 0) - return irq; - - retval = request_irq(irq, handler, irqflags, devname, dev_id); - if (retval != 0) { - unbind_from_irq(irq); - return retval; - } - - return irq; -} - -void unbind_from_irqhandler(unsigned int irq, void *dev_id) -{ - free_irq(irq, dev_id); - unbind_from_irq(irq); -} -EXPORT_SYMBOL_GPL(unbind_from_irqhandler); - -void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector) -{ - int irq = per_cpu(ipi_to_irq, cpu)[vector]; - BUG_ON(irq < 0); - notify_remote_via_irq(irq); -} - -irqreturn_t xen_debug_interrupt(int irq, void *dev_id) -{ - struct shared_info *sh = HYPERVISOR_shared_info; - int cpu = smp_processor_id(); - int i; - unsigned long flags; - static DEFINE_SPINLOCK(debug_lock); - - spin_lock_irqsave(&debug_lock, flags); - - printk("vcpu %d\n ", cpu); - - for_each_online_cpu(i) { - struct vcpu_info *v = per_cpu(xen_vcpu, i); - printk("%d: masked=%d pending=%d event_sel %08lx\n ", i, - (get_irq_regs() && i == cpu) ? !(get_irq_regs()->flags & X86_EFLAGS_IF) : v->evtchn_upcall_mask, - v->evtchn_upcall_pending, - v->evtchn_pending_sel); - } - printk("pending:\n "); - for(i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--) - printk("%08lx%s", sh->evtchn_pending[i], - i % 8 == 0 ? "\n " : " "); - printk("\nmasks:\n "); - for(i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) - printk("%08lx%s", sh->evtchn_mask[i], - i % 8 == 0 ? "\n " : " "); - - printk("\nunmasked:\n "); - for(i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) - printk("%08lx%s", sh->evtchn_pending[i] & ~sh->evtchn_mask[i], - i % 8 == 0 ? "\n " : " "); - - printk("\npending list:\n"); - for(i = 0; i < NR_EVENT_CHANNELS; i++) { - if (sync_test_bit(i, sh->evtchn_pending)) { - printk(" %d: event %d -> irq %d\n", - cpu_evtchn[i], i, - evtchn_to_irq[i]); - } - } - - spin_unlock_irqrestore(&debug_lock, flags); - - return IRQ_HANDLED; -} - - -/* - * Search the CPUs pending events bitmasks. For each one found, map - * the event number to an irq, and feed it into do_IRQ() for - * handling. - * - * Xen uses a two-level bitmap to speed searching. The first level is - * a bitset of words which contain pending event bits. The second - * level is a bitset of pending events themselves. - */ -void xen_evtchn_do_upcall(struct pt_regs *regs) -{ - int cpu = get_cpu(); - struct shared_info *s = HYPERVISOR_shared_info; - struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); - static DEFINE_PER_CPU(unsigned, nesting_count); - unsigned count; - - do { - unsigned long pending_words; - - vcpu_info->evtchn_upcall_pending = 0; - - if (__get_cpu_var(nesting_count)++) - goto out; - - /* NB. No need for a barrier here -- XCHG is a barrier on x86. */ - pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0); - while (pending_words != 0) { - unsigned long pending_bits; - int word_idx = __ffs(pending_words); - pending_words &= ~(1UL << word_idx); - - while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) { - int bit_idx = __ffs(pending_bits); - int port = (word_idx * BITS_PER_LONG) + bit_idx; - int irq = evtchn_to_irq[port]; - - if (irq != -1) { - regs->orig_ax = ~irq; - do_IRQ(regs); - } - } - } - - BUG_ON(!irqs_disabled()); - - count = __get_cpu_var(nesting_count); - __get_cpu_var(nesting_count) = 0; - } while(count != 1); - -out: - put_cpu(); -} - -/* Rebind an evtchn so that it gets delivered to a specific cpu */ -static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu) -{ - struct evtchn_bind_vcpu bind_vcpu; - int evtchn = evtchn_from_irq(irq); - - if (!VALID_EVTCHN(evtchn)) - return; - - /* Send future instances of this interrupt to other vcpu. */ - bind_vcpu.port = evtchn; - bind_vcpu.vcpu = tcpu; - - /* - * If this fails, it usually just indicates that we're dealing with a - * virq or IPI channel, which don't actually need to be rebound. Ignore - * it, but don't do the xenlinux-level rebind in that case. - */ - if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0) - bind_evtchn_to_cpu(evtchn, tcpu); -} - - -static void set_affinity_irq(unsigned irq, cpumask_t dest) -{ - unsigned tcpu = first_cpu(dest); - rebind_irq_to_cpu(irq, tcpu); -} - -static void enable_dynirq(unsigned int irq) -{ - int evtchn = evtchn_from_irq(irq); - - if (VALID_EVTCHN(evtchn)) - unmask_evtchn(evtchn); -} - -static void disable_dynirq(unsigned int irq) -{ - int evtchn = evtchn_from_irq(irq); - - if (VALID_EVTCHN(evtchn)) - mask_evtchn(evtchn); -} - -static void ack_dynirq(unsigned int irq) -{ - int evtchn = evtchn_from_irq(irq); - - move_native_irq(irq); - - if (VALID_EVTCHN(evtchn)) - clear_evtchn(evtchn); -} - -static int retrigger_dynirq(unsigned int irq) -{ - int evtchn = evtchn_from_irq(irq); - struct shared_info *sh = HYPERVISOR_shared_info; - int ret = 0; - - if (VALID_EVTCHN(evtchn)) { - int masked; - - masked = sync_test_and_set_bit(evtchn, sh->evtchn_mask); - sync_set_bit(evtchn, sh->evtchn_pending); - if (!masked) - unmask_evtchn(evtchn); - ret = 1; - } - - return ret; -} - -static struct irq_chip xen_dynamic_chip __read_mostly = { - .name = "xen-dyn", - .mask = disable_dynirq, - .unmask = enable_dynirq, - .ack = ack_dynirq, - .set_affinity = set_affinity_irq, - .retrigger = retrigger_dynirq, -}; - -void __init xen_init_IRQ(void) -{ - int i; - - init_evtchn_cpu_bindings(); - - /* No event channels are 'live' right now. */ - for (i = 0; i < NR_EVENT_CHANNELS; i++) - mask_evtchn(i); - - /* Dynamic IRQ space is currently unbound. Zero the refcnts. */ - for (i = 0; i < NR_IRQS; i++) - irq_bindcount[i] = 0; - - irq_ctx_init(smp_processor_id()); -} diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 22395d20dd6..f1063ae0803 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -3,6 +3,7 @@ #include #include +#include /* These are code, but not functions. Defined in entry.S */ extern const char xen_hypervisor_callback[]; @@ -10,7 +11,6 @@ extern const char xen_failsafe_callback[]; void xen_copy_trap_info(struct trap_info *traps); -DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu); DECLARE_PER_CPU(unsigned long, xen_cr3); DECLARE_PER_CPU(unsigned long, xen_current_cr3); -- cgit v1.2.3 From 8d3d2106c19f4e69f208f59fe484ca113fbb48b3 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Wed, 2 Apr 2008 10:54:00 -0700 Subject: xen: make grant table arch portable split out x86 specific part from grant-table.c and allow ia64/xen specific initialization. ia64/xen grant table is based on pseudo physical address (guest physical address) unlike x86/xen. On ia64 init_mm doesn't map identity straight mapped area. ia64/xen specific grant table initialization is necessary. Signed-off-by: Isaku Yamahata Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/xen/Makefile | 2 +- arch/x86/xen/grant-table.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 92 insertions(+), 1 deletion(-) create mode 100644 arch/x86/xen/grant-table.c (limited to 'arch') diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 95c59260dcc..3d8df981d5f 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -1,4 +1,4 @@ obj-y := enlighten.o setup.o multicalls.o mmu.o \ - time.o manage.o xen-asm.o + time.o manage.o xen-asm.o grant-table.o obj-$(CONFIG_SMP) += smp.o diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c new file mode 100644 index 00000000000..49ba9b5224d --- /dev/null +++ b/arch/x86/xen/grant-table.c @@ -0,0 +1,91 @@ +/****************************************************************************** + * grant_table.c + * x86 specific part + * + * Granting foreign access to our memory reservation. + * + * Copyright (c) 2005-2006, Christopher Clark + * Copyright (c) 2004-2005, K A Fraser + * Copyright (c) 2008 Isaku Yamahata + * VA Linux Systems Japan. Split out x86 specific part. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include + +#include +#include +#include + +#include + +static int map_pte_fn(pte_t *pte, struct page *pmd_page, + unsigned long addr, void *data) +{ + unsigned long **frames = (unsigned long **)data; + + set_pte_at(&init_mm, addr, pte, mfn_pte((*frames)[0], PAGE_KERNEL)); + (*frames)++; + return 0; +} + +static int unmap_pte_fn(pte_t *pte, struct page *pmd_page, + unsigned long addr, void *data) +{ + + set_pte_at(&init_mm, addr, pte, __pte(0)); + return 0; +} + +int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, + unsigned long max_nr_gframes, + struct grant_entry **__shared) +{ + int rc; + struct grant_entry *shared = *__shared; + + if (shared == NULL) { + struct vm_struct *area = + xen_alloc_vm_area(PAGE_SIZE * max_nr_gframes); + BUG_ON(area == NULL); + shared = area->addr; + *__shared = shared; + } + + rc = apply_to_page_range(&init_mm, (unsigned long)shared, + PAGE_SIZE * nr_gframes, + map_pte_fn, &frames); + return rc; +} + +void arch_gnttab_unmap_shared(struct grant_entry *shared, + unsigned long nr_gframes) +{ + apply_to_page_range(&init_mm, (unsigned long)shared, + PAGE_SIZE * nr_gframes, unmap_pte_fn, NULL); +} -- cgit v1.2.3 From 41e332b2a2dfe514cd441ed0ce1096ed1863e378 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 Apr 2008 10:54:09 -0700 Subject: xen: disable preemption during tlb flush Various places in the kernel flush the tlb even though preemption doens't guarantee the tlb flush is happening on any particular CPU. In many cases this doesn't seem to matter, so don't make a fuss about it. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/xen/enlighten.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 8c5ff24a492..bc129146f99 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -532,26 +532,37 @@ static void xen_apic_write(unsigned long reg, u32 val) static void xen_flush_tlb(void) { struct mmuext_op *op; - struct multicall_space mcs = xen_mc_entry(sizeof(*op)); + struct multicall_space mcs; + + preempt_disable(); + + mcs = xen_mc_entry(sizeof(*op)); op = mcs.args; op->cmd = MMUEXT_TLB_FLUSH_LOCAL; MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); xen_mc_issue(PARAVIRT_LAZY_MMU); + + preempt_enable(); } static void xen_flush_tlb_single(unsigned long addr) { struct mmuext_op *op; - struct multicall_space mcs = xen_mc_entry(sizeof(*op)); + struct multicall_space mcs; + + preempt_disable(); + mcs = xen_mc_entry(sizeof(*op)); op = mcs.args; op->cmd = MMUEXT_INVLPG_LOCAL; op->arg1.linear_addr = addr & PAGE_MASK; MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); xen_mc_issue(PARAVIRT_LAZY_MMU); + + preempt_enable(); } static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm, -- cgit v1.2.3 From 2bd50036b5dfc929390ddc48be7f6314447b2be3 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 Apr 2008 10:54:10 -0700 Subject: xen: allow set_pte_at on init_mm to be lockless The usual pagetable locking protocol doesn't seem to apply to updates to init_mm, so don't rely on preemption being disabled in xen_set_pte_at on init_mm. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/xen/mmu.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 272635aaef8..6cbcf65609a 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -156,6 +156,10 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { + /* updates to init_mm may be done without lock */ + if (mm == &init_mm) + preempt_disable(); + if (mm == current->mm || mm == &init_mm) { if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { struct multicall_space mcs; @@ -163,12 +167,16 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, MULTI_update_va_mapping(mcs.mc, addr, pteval, 0); xen_mc_issue(PARAVIRT_LAZY_MMU); - return; + goto out; } else if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0) - return; + goto out; } xen_set_pte(ptep, pteval); + +out: + if (mm == &init_mm) + preempt_enable(); } pteval_t xen_pte_val(pte_t pte) -- cgit v1.2.3 From b77797fb2bf31bf076e6b69736119bc6a077525b Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 Apr 2008 10:54:11 -0700 Subject: xen: fold xen_sysexit into xen_iret xen_sysexit and xen_iret were doing essentially the same thing. Rather than having a separate implementation for xen_sysexit, we can just strip the stack back to an iret frame and jump into xen_iret. This removes a lot of code and complexity - specifically, another critical region. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/entry_32.S | 9 +----- arch/x86/xen/xen-asm.S | 70 ++++++++++------------------------------------ 2 files changed, 15 insertions(+), 64 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 209c334bb92..2a609dc3271 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1044,15 +1044,8 @@ ENTRY(xen_hypervisor_callback) jmp xen_iret_crit_fixup -1: cmpl $xen_sysexit_start_crit,%eax - jb 2f - cmpl $xen_sysexit_end_crit,%eax - jae 2f - - jmp xen_sysexit_crit_fixup - ENTRY(xen_do_upcall) -2: mov %esp, %eax +1: mov %esp, %eax call xen_evtchn_do_upcall jmp ret_from_intr CFI_ENDPROC diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index 53cae923e14..2497a30f41d 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -107,6 +107,20 @@ ENDPATCH(xen_restore_fl_direct) ENDPROC(xen_restore_fl_direct) RELOC(xen_restore_fl_direct, 2b+1) +/* + We can't use sysexit directly, because we're not running in ring0. + But we can easily fake it up using iret. Assuming xen_sysexit + is jumped to with a standard stack frame, we can just strip it + back to a standard iret frame and use iret. + */ +ENTRY(xen_sysexit) + movl PT_EAX(%esp), %eax /* Shouldn't be necessary? */ + orl $X86_EFLAGS_IF, PT_EFLAGS(%esp) + lea PT_EIP(%esp), %esp + + jmp xen_iret +ENDPROC(xen_sysexit) + /* This is run where a normal iret would be run, with the same stack setup: 8: eflags @@ -276,62 +290,6 @@ ENTRY(xen_iret_crit_fixup) 2: jmp xen_do_upcall -ENTRY(xen_sysexit) - /* Store vcpu_info pointer for easy access. Do it this - way to avoid having to reload %fs */ -#ifdef CONFIG_SMP - GET_THREAD_INFO(%eax) - movl TI_cpu(%eax),%eax - movl __per_cpu_offset(,%eax,4),%eax - mov per_cpu__xen_vcpu(%eax),%eax -#else - movl per_cpu__xen_vcpu, %eax -#endif - - /* We can't actually use sysexit in a pv guest, - so fake it up with iret */ - pushl $__USER_DS /* user stack segment */ - pushl %ecx /* user esp */ - pushl PT_EFLAGS+2*4(%esp) /* user eflags */ - pushl $__USER_CS /* user code segment */ - pushl %edx /* user eip */ - -xen_sysexit_start_crit: - /* Unmask events... */ - movb $0, XEN_vcpu_info_mask(%eax) - /* ...and test for pending. - There's a preempt window here, but it doesn't - matter because we're within the critical section. */ - testb $0xff, XEN_vcpu_info_pending(%eax) - - /* If there's something pending, mask events again so we - can directly inject it back into the kernel. */ - jnz 1f - - movl PT_EAX+5*4(%esp),%eax -2: iret -1: movb $1, XEN_vcpu_info_mask(%eax) -xen_sysexit_end_crit: - addl $5*4, %esp /* remove iret frame */ - /* no need to re-save regs, but need to restore kernel %fs */ - mov $__KERNEL_PERCPU, %eax - mov %eax, %fs - jmp xen_do_upcall -.section __ex_table,"a" - .align 4 - .long 2b,iret_exc -.previous - - .globl xen_sysexit_start_crit, xen_sysexit_end_crit -/* - sysexit fixup is easy, since the old frame is still sitting there - on the stack. We just need to remove the new recursive - interrupt and return. - */ -ENTRY(xen_sysexit_crit_fixup) - addl $PT_OLDESP+5*4, %esp /* remove frame+iret */ - jmp xen_do_upcall - /* Force an event check by making a hypercall, but preserve regs before making the call. -- cgit v1.2.3 From af7ae3b9c4a4c1337903f31131d58e3c0d2b6d55 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 Apr 2008 10:54:12 -0700 Subject: xen: allow compilation with non-flat memory There's no real reason we can't support sparsemem/discontigmem, so do so. This is mostly useful to support hotplug memory. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/xen/Kconfig | 2 +- arch/x86/xen/enlighten.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 4d5f2649bee..2e641be2737 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -6,7 +6,7 @@ config XEN bool "Xen guest support" select PARAVIRT depends on X86_32 - depends on X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES && !(X86_VISWS || X86_VOYAGER) + depends on X86_CMPXCHG && X86_TSC && !(X86_VISWS || X86_VOYAGER) help This is the Linux Xen port. Enabling this will allow the kernel to boot in a paravirtualized environment under the diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index bc129146f99..c8a56e457d6 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -669,7 +669,9 @@ static void xen_write_cr3(unsigned long cr3) everything is pinned. */ static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn) { +#ifdef CONFIG_FLATMEM BUG_ON(mem_map); /* should only be used early */ +#endif make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); } -- cgit v1.2.3