From 012f09e7942716d5f4339f1fd9a831a485bb1d4a Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Wed, 6 Aug 2008 16:23:08 +0200 Subject: x86: compile pat debugfs interface only if CONFIG_X86_PAT is set Recently I've run a kernel w/o PAT support and wondered why there was a file "x86/pat_memtype_list" in debugfs. Of course it's empty if PAT is disabled ... so just get rid of this interface if PAT is disabled. Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 2fe30916d4b..647b1c4de71 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -492,7 +492,7 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) free_memtype(addr, addr + size); } -#if defined(CONFIG_DEBUG_FS) +#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT) /* get Nth element of the linked list */ static struct memtype *memtype_get_idx(loff_t pos) @@ -576,4 +576,4 @@ static int __init pat_memtype_list_init(void) late_initcall(pat_memtype_list_init); -#endif /* CONFIG_DEBUG_FS */ +#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */ -- cgit v1.2.3 From 1ac2f7d55b7ee1613c90631e87fea22ec06781e5 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 4 Aug 2008 14:51:24 +0800 Subject: introduce two APIs for page attribute Introduce two APIs for page attribute. flushing tlb/cache in every page attribute is expensive. AGP gart usually will do a lot of operations to change a page to uc, new APIs can reduce flush. Signed-off-by: Shaohua Li Cc: airlied@linux.ie Cc: Andrew Morton Cc: Arjan van de Ven Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 58 +++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 50 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 65c6e46bf05..2c5c18c2464 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -752,12 +752,12 @@ static inline int cache_attr(pgprot_t attr) (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD); } -static int change_page_attr_set_clr(unsigned long addr, int numpages, +static int do_change_page_attr_set_clr(unsigned long addr, int numpages, pgprot_t mask_set, pgprot_t mask_clr, - int force_split) + int force_split, int *tlb_flush) { struct cpa_data cpa; - int ret, cache, checkalias; + int ret, checkalias; /* * Check, if we are requested to change a not supported @@ -792,9 +792,22 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, /* * Check whether we really changed something: */ - if (!cpa.flushtlb) - goto out; + *tlb_flush = cpa.flushtlb; + cpa_fill_pool(NULL); + + return ret; +} + +static int change_page_attr_set_clr(unsigned long addr, int numpages, + pgprot_t mask_set, pgprot_t mask_clr, + int force_split) +{ + int cache, flush_cache = 0, ret; + ret = do_change_page_attr_set_clr(addr, numpages, mask_set, mask_clr, + force_split, &flush_cache); + if (!flush_cache) + goto out; /* * No need to flush, when we did not set any of the caching * attributes: @@ -811,10 +824,7 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, cpa_flush_range(addr, numpages, cache); else cpa_flush_all(cache); - out: - cpa_fill_pool(NULL); - return ret; } @@ -852,6 +862,30 @@ int set_memory_uc(unsigned long addr, int numpages) } EXPORT_SYMBOL(set_memory_uc); +int set_memory_uc_noflush(unsigned long addr, int numpages) +{ + int flush; + /* + * for now UC MINUS. 
see comments in ioremap_nocache() + */ + if (reserve_memtype(addr, addr + numpages * PAGE_SIZE, + _PAGE_CACHE_UC_MINUS, NULL)) + return -EINVAL; + /* + * for now UC MINUS. see comments in ioremap_nocache() + */ + return do_change_page_attr_set_clr(addr, numpages, + __pgprot(_PAGE_CACHE_UC_MINUS), + __pgprot(0), 0, &flush); +} +EXPORT_SYMBOL(set_memory_uc_noflush); + +void set_memory_flush_all(void) +{ + cpa_flush_all(1); +} +EXPORT_SYMBOL(set_memory_flush_all); + int _set_memory_wc(unsigned long addr, int numpages) { return change_page_attr_set(addr, numpages, @@ -926,6 +960,14 @@ int set_pages_uc(struct page *page, int numpages) } EXPORT_SYMBOL(set_pages_uc); +int set_pages_uc_noflush(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_uc_noflush(addr, numpages); +} +EXPORT_SYMBOL(set_pages_uc_noflush); + int set_pages_wb(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); -- cgit v1.2.3 From 5843d9a4d0ba89719916c8f07fc9c57b7126be6d Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 1 Aug 2008 03:15:21 +0200 Subject: x86, pat: avoid highmem cache attribute aliasing Highmem code can leave ptes and tlb entries around for a given page even after kunmap, and after it has been freed. >From what I can gather, the PAT code may change the cache attributes of arbitrary physical addresses (ie. including highmem pages), which would result in aliases in the case that it operates on one of these lazy tlb highmem pages. Flushing kmaps should solve the problem. I've also just added code for conditional flushing if we haven't got any dangling highmem aliases -- this should help performance if we change page attributes frequently or systems that aren't using much highmem pages (eg. if < 4G RAM). Should be turned into 2 patches, but just for RFC... Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 2c5c18c2464..4adb33628de 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -777,6 +777,9 @@ static int do_change_page_attr_set_clr(unsigned long addr, int numpages, WARN_ON_ONCE(1); } + /* Must avoid aliasing mappings in the highmem code */ + kmap_flush_unused(); + cpa.vaddr = addr; cpa.numpages = numpages; cpa.mask_set = mask_set; -- cgit v1.2.3 From cacf890694a36124ceddce44ff4c7b02d372ce7c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 21 Aug 2008 13:46:33 +0200 Subject: Revert "introduce two APIs for page attribute" This reverts commit 1ac2f7d55b7ee1613c90631e87fea22ec06781e5. 
--- arch/x86/mm/pageattr.c | 58 +++++++------------------------------------------- 1 file changed, 8 insertions(+), 50 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 4adb33628de..5c06469a065 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -752,12 +752,12 @@ static inline int cache_attr(pgprot_t attr) (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD); } -static int do_change_page_attr_set_clr(unsigned long addr, int numpages, +static int change_page_attr_set_clr(unsigned long addr, int numpages, pgprot_t mask_set, pgprot_t mask_clr, - int force_split, int *tlb_flush) + int force_split) { struct cpa_data cpa; - int ret, checkalias; + int ret, cache, checkalias; /* * Check, if we are requested to change a not supported @@ -795,22 +795,9 @@ static int do_change_page_attr_set_clr(unsigned long addr, int numpages, /* * Check whether we really changed something: */ - *tlb_flush = cpa.flushtlb; - cpa_fill_pool(NULL); - - return ret; -} - -static int change_page_attr_set_clr(unsigned long addr, int numpages, - pgprot_t mask_set, pgprot_t mask_clr, - int force_split) -{ - int cache, flush_cache = 0, ret; - - ret = do_change_page_attr_set_clr(addr, numpages, mask_set, mask_clr, - force_split, &flush_cache); - if (!flush_cache) + if (!cpa.flushtlb) goto out; + /* * No need to flush, when we did not set any of the caching * attributes: @@ -827,7 +814,10 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, cpa_flush_range(addr, numpages, cache); else cpa_flush_all(cache); + out: + cpa_fill_pool(NULL); + return ret; } @@ -865,30 +855,6 @@ int set_memory_uc(unsigned long addr, int numpages) } EXPORT_SYMBOL(set_memory_uc); -int set_memory_uc_noflush(unsigned long addr, int numpages) -{ - int flush; - /* - * for now UC MINUS. see comments in ioremap_nocache() - */ - if (reserve_memtype(addr, addr + numpages * PAGE_SIZE, - _PAGE_CACHE_UC_MINUS, NULL)) - return -EINVAL; - /* - * for now UC MINUS. see comments in ioremap_nocache() - */ - return do_change_page_attr_set_clr(addr, numpages, - __pgprot(_PAGE_CACHE_UC_MINUS), - __pgprot(0), 0, &flush); -} -EXPORT_SYMBOL(set_memory_uc_noflush); - -void set_memory_flush_all(void) -{ - cpa_flush_all(1); -} -EXPORT_SYMBOL(set_memory_flush_all); - int _set_memory_wc(unsigned long addr, int numpages) { return change_page_attr_set(addr, numpages, @@ -963,14 +929,6 @@ int set_pages_uc(struct page *page, int numpages) } EXPORT_SYMBOL(set_pages_uc); -int set_pages_uc_noflush(struct page *page, int numpages) -{ - unsigned long addr = (unsigned long)page_address(page); - - return set_memory_uc_noflush(addr, numpages); -} -EXPORT_SYMBOL(set_pages_uc_noflush); - int set_pages_wb(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); -- cgit v1.2.3 From d75586ad01e6c5a30e7337fb87d61e03556a1ecb Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 21 Aug 2008 10:46:06 +0800 Subject: x86, pageattr: introduce APIs to change pageattr of a page array Add array interface APIs of pageattr. page based cache flush is quite slow for a lot of pages. If pages are more than 1024 (4M), the patch will use a wbinvd(). We have a simple test here (run a 3d game - open arena), nearly all agp memory allocation are small (< 1M), so suppose this will not impact runtime performance. 
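For reference, a minimal caller-side sketch of the new array interface (the surrounding driver code, the page count and the helper names are hypothetical; only set_memory_array_uc()/set_memory_array_wb() are introduced by this patch):

	/* hypothetical driver snippet -- not part of this patch */
	#include <linux/mm.h>		/* page_address() */
	#include <asm/cacheflush.h>	/* set_memory_array_uc()/_wb() */

	#define NR_GART_PAGES 256

	static unsigned long gart_addrs[NR_GART_PAGES];

	static int gart_map_uc(struct page **pages)
	{
		int i;

		for (i = 0; i < NR_GART_PAGES; i++)
			gart_addrs[i] = (unsigned long)page_address(pages[i]);

		/* one call covers all pages, so the TLB/cache flush happens once */
		return set_memory_array_uc(gart_addrs, NR_GART_PAGES);
	}

	static void gart_unmap_uc(void)
	{
		/* restore write-back caching, again with a single flush */
		set_memory_array_wb(gart_addrs, NR_GART_PAGES);
	}

With the previous per-page set_memory_uc()/set_memory_wb() interface the same sequence would have triggered one flush per page.
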
Signed-off-by: Dave Airlie Signed-off-by: Shaohua Li Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 216 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 166 insertions(+), 50 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 5c06469a065..041e81ef673 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -25,15 +25,19 @@ * The current flushing context - we pass it instead of 5 arguments: */ struct cpa_data { - unsigned long vaddr; + unsigned long *vaddr; pgprot_t mask_set; pgprot_t mask_clr; int numpages; - int flushtlb; + int flags; unsigned long pfn; unsigned force_split : 1; + int curpage; }; +#define CPA_FLUSHTLB 1 +#define CPA_ARRAY 2 + #ifdef CONFIG_PROC_FS static unsigned long direct_pages_count[PG_LEVEL_NUM]; @@ -184,6 +188,41 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache) } } +static void cpa_flush_array(unsigned long *start, int numpages, int cache) +{ + unsigned int i, level; + unsigned long *addr; + + BUG_ON(irqs_disabled()); + + on_each_cpu(__cpa_flush_range, NULL, 1); + + if (!cache) + return; + + /* 4M threshold */ + if (numpages >= 1024) { + if (boot_cpu_data.x86_model >= 4) + wbinvd(); + return; + } + /* + * We only need to flush on one CPU, + * clflush is a MESI-coherent instruction that + * will cause all other CPUs to flush the same + * cachelines: + */ + for (i = 0, addr = start; i < numpages; i++, addr++) { + pte_t *pte = lookup_address(*addr, &level); + + /* + * Only flush present addresses: + */ + if (pte && (pte_val(*pte) & _PAGE_PRESENT)) + clflush_cache_range((void *) *addr, PAGE_SIZE); + } +} + /* * Certain areas of memory on x86 require very specific protection flags, * for example the BIOS area or kernel text. Callers don't always get this @@ -392,7 +431,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, */ new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot)); __set_pmd_pte(kpte, address, new_pte); - cpa->flushtlb = 1; + cpa->flags |= CPA_FLUSHTLB; do_split = 0; } @@ -578,11 +617,16 @@ out_unlock: static int __change_page_attr(struct cpa_data *cpa, int primary) { - unsigned long address = cpa->vaddr; + unsigned long address; int do_split, err; unsigned int level; pte_t *kpte, old_pte; + if (cpa->flags & CPA_ARRAY) + address = cpa->vaddr[cpa->curpage]; + else + address = *cpa->vaddr; + repeat: kpte = lookup_address(address, &level); if (!kpte) @@ -594,8 +638,8 @@ repeat: return 0; printk(KERN_WARNING "CPA: called for zero pte. 
" "vaddr = %lx cpa->vaddr = %lx\n", address, - cpa->vaddr); WARN_ON(1); + *cpa->vaddr); return -EINVAL; } @@ -621,7 +665,7 @@ repeat: */ if (pte_val(old_pte) != pte_val(new_pte)) { set_pte_atomic(kpte, new_pte); - cpa->flushtlb = 1; + cpa->flags |= CPA_FLUSHTLB; } cpa->numpages = 1; return 0; @@ -645,7 +689,7 @@ repeat: */ err = split_large_page(kpte, address); if (!err) { - cpa->flushtlb = 1; + cpa->flags |= CPA_FLUSHTLB; goto repeat; } @@ -658,6 +702,7 @@ static int cpa_process_alias(struct cpa_data *cpa) { struct cpa_data alias_cpa; int ret = 0; + unsigned long temp_cpa_vaddr, vaddr; if (cpa->pfn >= max_pfn_mapped) return 0; @@ -670,16 +715,24 @@ static int cpa_process_alias(struct cpa_data *cpa) * No need to redo, when the primary call touched the direct * mapping already: */ - if (!(within(cpa->vaddr, PAGE_OFFSET, + if (cpa->flags & CPA_ARRAY) + vaddr = cpa->vaddr[cpa->curpage]; + else + vaddr = *cpa->vaddr; + + if (!(within(vaddr, PAGE_OFFSET, PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT)) #ifdef CONFIG_X86_64 - || within(cpa->vaddr, PAGE_OFFSET + (1UL<<32), + || within(vaddr, PAGE_OFFSET + (1UL<<32), PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)) #endif )) { alias_cpa = *cpa; - alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); + temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); + alias_cpa.vaddr = &temp_cpa_vaddr; + alias_cpa.flags &= ~CPA_ARRAY; + ret = __change_page_attr_set_clr(&alias_cpa, 0); } @@ -691,7 +744,7 @@ static int cpa_process_alias(struct cpa_data *cpa) * No need to redo, when the primary call touched the high * mapping already: */ - if (within(cpa->vaddr, (unsigned long) _text, (unsigned long) _end)) + if (within(vaddr, (unsigned long) _text, (unsigned long) _end)) return 0; /* @@ -702,8 +755,9 @@ static int cpa_process_alias(struct cpa_data *cpa) return 0; alias_cpa = *cpa; - alias_cpa.vaddr = - (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; + temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; + alias_cpa.vaddr = &temp_cpa_vaddr; + alias_cpa.flags &= ~CPA_ARRAY; /* * The high mapping range is imprecise, so ignore the return value. @@ -723,6 +777,9 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) * preservation check. 
*/ cpa->numpages = numpages; + /* for array changes, we can't use large page */ + if (cpa->flags & CPA_ARRAY) + cpa->numpages = 1; ret = __change_page_attr(cpa, checkalias); if (ret) @@ -741,7 +798,11 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) */ BUG_ON(cpa->numpages > numpages); numpages -= cpa->numpages; - cpa->vaddr += cpa->numpages * PAGE_SIZE; + if (cpa->flags & CPA_ARRAY) + cpa->curpage++; + else + *cpa->vaddr += cpa->numpages * PAGE_SIZE; + } return 0; } @@ -752,9 +813,9 @@ static inline int cache_attr(pgprot_t attr) (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD); } -static int change_page_attr_set_clr(unsigned long addr, int numpages, +static int change_page_attr_set_clr(unsigned long *addr, int numpages, pgprot_t mask_set, pgprot_t mask_clr, - int force_split) + int force_split, int array) { struct cpa_data cpa; int ret, cache, checkalias; @@ -769,12 +830,22 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, return 0; /* Ensure we are PAGE_SIZE aligned */ - if (addr & ~PAGE_MASK) { - addr &= PAGE_MASK; - /* - * People should not be passing in unaligned addresses: - */ - WARN_ON_ONCE(1); + if (!array) { + if (*addr & ~PAGE_MASK) { + *addr &= PAGE_MASK; + /* + * People should not be passing in unaligned addresses: + */ + WARN_ON_ONCE(1); + } + } else { + int i; + for (i = 0; i < numpages; i++) { + if (addr[i] & ~PAGE_MASK) { + addr[i] &= PAGE_MASK; + WARN_ON_ONCE(1); + } + } } /* Must avoid aliasing mappings in the highmem code */ @@ -784,9 +855,13 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, cpa.numpages = numpages; cpa.mask_set = mask_set; cpa.mask_clr = mask_clr; - cpa.flushtlb = 0; + cpa.flags = 0; + cpa.curpage = 0; cpa.force_split = force_split; + if (array) + cpa.flags |= CPA_ARRAY; + /* No alias checking for _NX bit modifications */ checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; @@ -795,7 +870,7 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, /* * Check whether we really changed something: */ - if (!cpa.flushtlb) + if (!(cpa.flags & CPA_FLUSHTLB)) goto out; /* @@ -810,9 +885,12 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, * error case we fall back to cpa_flush_all (which uses * wbindv): */ - if (!ret && cpu_has_clflush) - cpa_flush_range(addr, numpages, cache); - else + if (!ret && cpu_has_clflush) { + if (cpa.flags & CPA_ARRAY) + cpa_flush_array(addr, numpages, cache); + else + cpa_flush_range(*addr, numpages, cache); + } else cpa_flush_all(cache); out: @@ -821,16 +899,18 @@ out: return ret; } -static inline int change_page_attr_set(unsigned long addr, int numpages, - pgprot_t mask) +static inline int change_page_attr_set(unsigned long *addr, int numpages, + pgprot_t mask, int array) { - return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0); + return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0, + array); } -static inline int change_page_attr_clear(unsigned long addr, int numpages, - pgprot_t mask) +static inline int change_page_attr_clear(unsigned long *addr, int numpages, + pgprot_t mask, int array) { - return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0); + return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0, + array); } int _set_memory_uc(unsigned long addr, int numpages) @@ -838,8 +918,8 @@ int _set_memory_uc(unsigned long addr, int numpages) /* * for now UC MINUS. 
see comments in ioremap_nocache() */ - return change_page_attr_set(addr, numpages, - __pgprot(_PAGE_CACHE_UC_MINUS)); + return change_page_attr_set(&addr, numpages, + __pgprot(_PAGE_CACHE_UC_MINUS), 0); } int set_memory_uc(unsigned long addr, int numpages) @@ -855,10 +935,31 @@ int set_memory_uc(unsigned long addr, int numpages) } EXPORT_SYMBOL(set_memory_uc); +int set_memory_array_uc(unsigned long *addr, int addrinarray) +{ + int i; + /* + * for now UC MINUS. see comments in ioremap_nocache() + */ + for (i = 0; i < addrinarray; i++) { + if (reserve_memtype(addr[i], addr[i] + PAGE_SIZE, + _PAGE_CACHE_UC_MINUS, NULL)) + goto out; + } + + return change_page_attr_set(addr, addrinarray, + __pgprot(_PAGE_CACHE_UC_MINUS), 1); +out: + while (--i >= 0) + free_memtype(addr[i], addr[i] + PAGE_SIZE); + return -EINVAL; +} +EXPORT_SYMBOL(set_memory_array_uc); + int _set_memory_wc(unsigned long addr, int numpages) { - return change_page_attr_set(addr, numpages, - __pgprot(_PAGE_CACHE_WC)); + return change_page_attr_set(&addr, numpages, + __pgprot(_PAGE_CACHE_WC), 0); } int set_memory_wc(unsigned long addr, int numpages) @@ -876,8 +977,8 @@ EXPORT_SYMBOL(set_memory_wc); int _set_memory_wb(unsigned long addr, int numpages) { - return change_page_attr_clear(addr, numpages, - __pgprot(_PAGE_CACHE_MASK)); + return change_page_attr_clear(&addr, numpages, + __pgprot(_PAGE_CACHE_MASK), 0); } int set_memory_wb(unsigned long addr, int numpages) @@ -888,37 +989,48 @@ int set_memory_wb(unsigned long addr, int numpages) } EXPORT_SYMBOL(set_memory_wb); +int set_memory_array_wb(unsigned long *addr, int addrinarray) +{ + int i; + for (i = 0; i < addrinarray; i++) + free_memtype(addr[i], addr[i] + PAGE_SIZE); + + return change_page_attr_clear(addr, addrinarray, + __pgprot(_PAGE_CACHE_MASK), 1); +} +EXPORT_SYMBOL(set_memory_array_wb); + int set_memory_x(unsigned long addr, int numpages) { - return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX)); + return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0); } EXPORT_SYMBOL(set_memory_x); int set_memory_nx(unsigned long addr, int numpages) { - return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX)); + return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0); } EXPORT_SYMBOL(set_memory_nx); int set_memory_ro(unsigned long addr, int numpages) { - return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW)); + return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0); } int set_memory_rw(unsigned long addr, int numpages) { - return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW)); + return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0); } int set_memory_np(unsigned long addr, int numpages) { - return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT)); + return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0); } int set_memory_4k(unsigned long addr, int numpages) { - return change_page_attr_set_clr(addr, numpages, __pgprot(0), - __pgprot(0), 1); + return change_page_attr_set_clr(&addr, numpages, __pgprot(0), + __pgprot(0), 1, 0); } int set_pages_uc(struct page *page, int numpages) @@ -971,20 +1083,24 @@ int set_pages_rw(struct page *page, int numpages) static int __set_pages_p(struct page *page, int numpages) { - struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page), + unsigned long tempaddr = (unsigned long) page_address(page); + struct cpa_data cpa = { .vaddr = &tempaddr, .numpages = numpages, .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), - 
.mask_clr = __pgprot(0)}; + .mask_clr = __pgprot(0), + .flags = 0}; return __change_page_attr_set_clr(&cpa, 1); } static int __set_pages_np(struct page *page, int numpages) { - struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page), + unsigned long tempaddr = (unsigned long) page_address(page); + struct cpa_data cpa = { .vaddr = &tempaddr, .numpages = numpages, .mask_set = __pgprot(0), - .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)}; + .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), + .flags = 0}; return __change_page_attr_set_clr(&cpa, 1); } -- cgit v1.2.3 From ab7e79243746e2a9c5f00243e60108189c44c9eb Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 21 Aug 2008 10:46:21 +0800 Subject: x86: fix pageattr-test We changed the interface of some pageattr, this patch makes pageattr-test compile. Signed-off-by: Dave Airlie Signed-off-by: Shaohua Li Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr-test.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index 0dcd42eb94e..6ae1f28a7ff 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c @@ -118,6 +118,7 @@ static int pageattr_test(void) unsigned int level; int i, k; int err; + unsigned long test_addr; if (print) printk(KERN_INFO "CPA self-test:\n"); @@ -172,7 +173,8 @@ static int pageattr_test(void) continue; } - err = change_page_attr_set(addr[i], len[i], PAGE_TESTBIT); + test_addr = addr[i]; + err = change_page_attr_set(&test_addr, len[i], PAGE_TESTBIT, 0); if (err < 0) { printk(KERN_ERR "CPA %d failed %d\n", i, err); failed++; @@ -204,7 +206,8 @@ static int pageattr_test(void) failed++; continue; } - err = change_page_attr_clear(addr[i], len[i], PAGE_TESTBIT); + test_addr = addr[i]; + err = change_page_attr_clear(&test_addr, len[i], PAGE_TESTBIT, 0); if (err < 0) { printk(KERN_ERR "CPA reverting failed: %d\n", err); failed++; -- cgit v1.2.3 From 9a79f4f491f92bc713e1f28f96516b141b752600 Mon Sep 17 00:00:00 2001 From: Rene Herman Date: Fri, 22 Aug 2008 00:10:13 +0200 Subject: x86: {reverve,free}_memtype() take a physical address The new set_memory_array_{uc,wb}() pass virtual addresses to {reserve,free}_memtype() it seems. Signed-off-by: Rene Herman Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 1785591808b..fed6ba2a8e7 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -947,7 +947,7 @@ int set_memory_array_uc(unsigned long *addr, int addrinarray) * for now UC MINUS. 
see comments in ioremap_nocache() */ for (i = 0; i < addrinarray; i++) { - if (reserve_memtype(addr[i], addr[i] + PAGE_SIZE, + if (reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE, _PAGE_CACHE_UC_MINUS, NULL)) goto out; } @@ -956,7 +956,7 @@ int set_memory_array_uc(unsigned long *addr, int addrinarray) __pgprot(_PAGE_CACHE_UC_MINUS), 1); out: while (--i >= 0) - free_memtype(addr[i], addr[i] + PAGE_SIZE); + free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE); return -EINVAL; } EXPORT_SYMBOL(set_memory_array_uc); @@ -998,7 +998,7 @@ int set_memory_array_wb(unsigned long *addr, int addrinarray) { int i; for (i = 0; i < addrinarray; i++) - free_memtype(addr[i], addr[i] + PAGE_SIZE); + free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE); return change_page_attr_clear(addr, addrinarray, __pgprot(_PAGE_CACHE_MASK), 1); -- cgit v1.2.3 From c5e147cf5aeb31aa1a9030be9727914855fc4133 Mon Sep 17 00:00:00 2001 From: Rene Herman Date: Fri, 22 Aug 2008 01:02:20 +0200 Subject: x86: have set_memory_array_{uc,wb} coalesce memtypes. Actually, might as well simply reconstruct the memtype list at free time I guess. How is this for a coalescing version of the array functions? Compiles, boots and provides me with: root@7ixe4:~# wc -l /debug/x86/pat_memtype_list 53 /debug/x86/pat_memtype_list otherwise (down from 16384+). Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 38 ++++++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index fed6ba2a8e7..497108825da 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -942,21 +942,38 @@ EXPORT_SYMBOL(set_memory_uc); int set_memory_array_uc(unsigned long *addr, int addrinarray) { + unsigned long start; + unsigned long end; int i; /* * for now UC MINUS. see comments in ioremap_nocache() */ for (i = 0; i < addrinarray; i++) { - if (reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE, - _PAGE_CACHE_UC_MINUS, NULL)) + start = __pa(addr[i]); + for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) { + if (end != __pa(addr[i + 1])) + break; + i++; + } + if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL)) goto out; } return change_page_attr_set(addr, addrinarray, __pgprot(_PAGE_CACHE_UC_MINUS), 1); out: - while (--i >= 0) - free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE); + for (i = 0; i < addrinarray; i++) { + unsigned long tmp = __pa(addr[i]); + + if (tmp == start) + break; + for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) { + if (end != __pa(addr[i + 1])) + break; + i++; + } + free_memtype(tmp, end); + } return -EINVAL; } EXPORT_SYMBOL(set_memory_array_uc); @@ -997,9 +1014,18 @@ EXPORT_SYMBOL(set_memory_wb); int set_memory_array_wb(unsigned long *addr, int addrinarray) { int i; - for (i = 0; i < addrinarray; i++) - free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE); + for (i = 0; i < addrinarray; i++) { + unsigned long start = __pa(addr[i]); + unsigned long end; + + for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) { + if (end != __pa(addr[i + 1])) + break; + i++; + } + free_memtype(start, end); + } return change_page_attr_clear(addr, addrinarray, __pgprot(_PAGE_CACHE_MASK), 1); } -- cgit v1.2.3 From 01de05af94db5d5214b0a5e191068d19c82059a8 Mon Sep 17 00:00:00 2001 From: Venki Pallipadi Date: Fri, 22 Aug 2008 12:08:17 -0700 Subject: x86: have set_memory_array_{uc,wb} coalesce memtypes, fix Fix the start addr for free_memtype calls in the error path. 
Signed-off-by: Venkatesh Pallipadi Acked-by: Rene Herman Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 497108825da..4b6968ba086 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -967,7 +967,7 @@ out: if (tmp == start) break; - for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) { + for (end = tmp + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) { if (end != __pa(addr[i + 1])) break; i++; -- cgit v1.2.3 From 110e0358e7dfd9cc56d47077068f3680dae10b56 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 28 Aug 2008 13:58:39 -0700 Subject: x86: make sure the CPA test code's use of _PAGE_UNUSED1 is obvious The CPA test code uses _PAGE_UNUSED1, so make sure its obvious. Signed-off-by: Jeremy Fitzhardinge Cc: Nick Piggin Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr-test.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index 7c301728711..e1d10690921 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c @@ -32,7 +32,7 @@ enum { GPS = (1<<30) }; -#define PAGE_TESTBIT __pgprot(_PAGE_UNUSED1) +#define PAGE_CPA_TEST __pgprot(_PAGE_CPA_TEST) static int pte_testbit(pte_t pte) { @@ -174,7 +174,7 @@ static int pageattr_test(void) } test_addr = addr[i]; - err = change_page_attr_set(&test_addr, len[i], PAGE_TESTBIT, 0); + err = change_page_attr_set(&test_addr, len[i], PAGE_CPA_TEST, 0); if (err < 0) { printk(KERN_ERR "CPA %d failed %d\n", i, err); failed++; @@ -207,7 +207,7 @@ static int pageattr_test(void) continue; } test_addr = addr[i]; - err = change_page_attr_clear(&test_addr, len[i], PAGE_TESTBIT, 0); + err = change_page_attr_clear(&test_addr, len[i], PAGE_CPA_TEST, 0); if (err < 0) { printk(KERN_ERR "CPA reverting failed: %d\n", err); failed++; -- cgit v1.2.3 From b2bc27314664c4d1a2f02e6f4cd0c32e4681d61e Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 23 Sep 2008 14:00:36 -0700 Subject: x86, cpa: rename PTE attribute macros for kernel direct mapping in early boot Signed-off-by: Suresh Siddha Cc: Suresh Siddha Cc: arjan@linux.intel.com Cc: venkatesh.pallipadi@intel.com Cc: jeremy@goop.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_32.S | 34 +++++++++++++++------------------- arch/x86/kernel/head_64.S | 4 ++-- 2 files changed, 17 insertions(+), 21 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index a7010c3a377..e835b4eea70 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -172,10 +172,6 @@ num_subarch_entries = (. - subarch_entries) / 4 * * Note that the stack is not yet set up! */ -#define PTE_ATTR 0x007 /* PRESENT+RW+USER */ -#define PDE_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */ -#define PGD_ATTR 0x001 /* PRESENT (no other attributes) */ - default_entry: #ifdef CONFIG_X86_PAE @@ -196,9 +192,9 @@ default_entry: movl $pa(pg0), %edi movl %edi, pa(init_pg_tables_start) movl $pa(swapper_pg_pmd), %edx - movl $PTE_ATTR, %eax + movl $PTE_IDENT_ATTR, %eax 10: - leal PDE_ATTR(%edi),%ecx /* Create PMD entry */ + leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */ movl %ecx,(%edx) /* Store PMD entry */ /* Upper half already zero */ addl $8,%edx @@ -215,7 +211,7 @@ default_entry: * End condition: we must map up to and including INIT_MAP_BEYOND_END * bytes beyond the end of our own page tables. 
*/ - leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp + leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp cmpl %ebp,%eax jb 10b 1: @@ -224,7 +220,7 @@ default_entry: movl %eax, pa(max_pfn_mapped) /* Do early initialization of the fixmap area */ - movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax + movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8) #else /* Not PAE */ @@ -233,9 +229,9 @@ page_pde_offset = (__PAGE_OFFSET >> 20); movl $pa(pg0), %edi movl %edi, pa(init_pg_tables_start) movl $pa(swapper_pg_dir), %edx - movl $PTE_ATTR, %eax + movl $PTE_IDENT_ATTR, %eax 10: - leal PDE_ATTR(%edi),%ecx /* Create PDE entry */ + leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */ movl %ecx,(%edx) /* Store identity PDE entry */ movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */ addl $4,%edx @@ -249,7 +245,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20); * bytes beyond the end of our own page tables; the +0x007 is * the attribute bits */ - leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp + leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp cmpl %ebp,%eax jb 10b movl %edi,pa(init_pg_tables_end) @@ -257,7 +253,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20); movl %eax, pa(max_pfn_mapped) /* Do early initialization of the fixmap area */ - movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax + movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax movl %eax,pa(swapper_pg_dir+0xffc) #endif jmp 3f @@ -634,19 +630,19 @@ ENTRY(empty_zero_page) /* Page-aligned for the benefit of paravirt? */ .align PAGE_SIZE_asm ENTRY(swapper_pg_dir) - .long pa(swapper_pg_pmd+PGD_ATTR),0 /* low identity map */ + .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */ # if KPMDS == 3 - .long pa(swapper_pg_pmd+PGD_ATTR),0 - .long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0 - .long pa(swapper_pg_pmd+PGD_ATTR+0x2000),0 + .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 + .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0 + .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x2000),0 # elif KPMDS == 2 .long 0,0 - .long pa(swapper_pg_pmd+PGD_ATTR),0 - .long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0 + .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 + .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0 # elif KPMDS == 1 .long 0,0 .long 0,0 - .long pa(swapper_pg_pmd+PGD_ATTR),0 + .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 # else # error "Kernel PMDs should be 1, 2 or 3" # endif diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index db3280afe88..26cfdc1d7c7 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -110,7 +110,7 @@ startup_64: movq %rdi, %rax shrq $PMD_SHIFT, %rax andq $(PTRS_PER_PMD - 1), %rax - leaq __PAGE_KERNEL_LARGE_EXEC(%rdi), %rdx + leaq __PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx leaq level2_spare_pgt(%rip), %rbx movq %rdx, 0(%rbx, %rax, 8) ident_complete: @@ -374,7 +374,7 @@ NEXT_PAGE(level2_ident_pgt) /* Since I easily can, map the first 1G. * Don't set NX because code runs from these pages. 
*/ - PMDS(0, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD) + PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) NEXT_PAGE(level2_kernel_pgt) /* -- cgit v1.2.3 From a2699e477b8e6b17d4da64916f766dd5a2576c9c Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 23 Sep 2008 14:00:38 -0700 Subject: x86, cpa: make the kernel physical mapping initialization a two pass sequence In the first pass, kernel physical mapping will be setup using large or small pages but uses the same PTE attributes as that of the early PTE attributes setup by early boot code in head_[32|64].S After flushing TLB's, we go through the second pass, which setups the direct mapped PTE's with the appropriate attributes (like NX, GLOBAL etc) which are runtime detectable. This two pass mechanism conforms to the TLB app note which says: "Software should not write to a paging-structure entry in a way that would change, for any linear address, both the page size and either the page frame or attributes." Signed-off-by: Suresh Siddha Cc: Suresh Siddha Cc: arjan@linux.intel.com Cc: venkatesh.pallipadi@intel.com Cc: jeremy@goop.org Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 65 ++++++++++++++++++++++++++++++--- arch/x86/mm/init_64.c | 99 +++++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 144 insertions(+), 20 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index d37f29376b0..9b5f7d7049d 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -194,11 +194,30 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base, pgd_t *pgd; pmd_t *pmd; pte_t *pte; - unsigned pages_2m = 0, pages_4k = 0; + unsigned pages_2m, pages_4k; + int mapping_iter; + + /* + * First iteration will setup identity mapping using large/small pages + * based on use_pse, with other attributes same as set by + * the early code in head_32.S + * + * Second iteration will setup the appropriate attributes (NX, GLOBAL..) + * as desired for the kernel identity mapping. + * + * This two pass mechanism conforms to the TLB app note which says: + * + * "Software should not write to a paging-structure entry in a way + * that would change, for any linear address, both the page size + * and either the page frame or attributes." 
+ */ + mapping_iter = 1; if (!cpu_has_pse) use_pse = 0; +repeat: + pages_2m = pages_4k = 0; pfn = start_pfn; pgd_idx = pgd_index((pfn<> PAGE_SHIFT, PAGE_KERNEL).pte); + pages++; +repeat_set_pte: set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL)); last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE; - pages++; } - update_page_count(PG_LEVEL_4K, pages); + + if (physical_mapping_iter == 1) + update_page_count(PG_LEVEL_4K, pages); return last_map_addr; } @@ -318,7 +323,6 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, { unsigned long pages = 0; unsigned long last_map_addr = end; - unsigned long start = address; int i = pmd_index(address); @@ -341,15 +345,14 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, last_map_addr = phys_pte_update(pmd, address, end); spin_unlock(&init_mm.page_table_lock); + continue; } - /* Count entries we're using from level2_ident_pgt */ - if (start == 0) - pages++; - continue; + goto repeat_set_pte; } if (page_size_mask & (1<> PAGE_SHIFT, PAGE_KERNEL_LARGE)); @@ -366,7 +369,8 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, pmd_populate_kernel(&init_mm, pmd, __va(pte_phys)); spin_unlock(&init_mm.page_table_lock); } - update_page_count(PG_LEVEL_2M, pages); + if (physical_mapping_iter == 1) + update_page_count(PG_LEVEL_2M, pages); return last_map_addr; } @@ -405,14 +409,18 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, } if (pud_val(*pud)) { - if (!pud_large(*pud)) + if (!pud_large(*pud)) { last_map_addr = phys_pmd_update(pud, addr, end, page_size_mask); - continue; + continue; + } + + goto repeat_set_pte; } if (page_size_mask & (1<> PAGE_SHIFT, PAGE_KERNEL_LARGE)); @@ -430,7 +438,9 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, spin_unlock(&init_mm.page_table_lock); } __flush_tlb_all(); - update_page_count(PG_LEVEL_1G, pages); + + if (physical_mapping_iter == 1) + update_page_count(PG_LEVEL_1G, pages); return last_map_addr; } @@ -494,15 +504,54 @@ static void __init init_gbpages(void) direct_gbpages = 0; } +static int is_kernel(unsigned long pfn) +{ + unsigned long pg_addresss = pfn << PAGE_SHIFT; + + if (pg_addresss >= (unsigned long) __pa(_text) && + pg_addresss <= (unsigned long) __pa(_end)) + return 1; + + return 0; +} + static unsigned long __init kernel_physical_mapping_init(unsigned long start, unsigned long end, unsigned long page_size_mask) { - unsigned long next, last_map_addr = end; + unsigned long next, last_map_addr; + u64 cached_supported_pte_mask = __supported_pte_mask; + unsigned long cache_start = start; + unsigned long cache_end = end; + + /* + * First iteration will setup identity mapping using large/small pages + * based on page_size_mask, with other attributes same as set by + * the early code in head_64.S + * + * Second iteration will setup the appropriate attributes + * as desired for the kernel identity mapping. + * + * This two pass mechanism conforms to the TLB app note which says: + * + * "Software should not write to a paging-structure entry in a way + * that would change, for any linear address, both the page size + * and either the page frame or attributes." + * + * For now, only difference between very early PTE attributes used in + * head_64.S and here is _PAGE_NX. 
+ */ + BUILD_BUG_ON((__PAGE_KERNEL_LARGE & ~__PAGE_KERNEL_IDENT_LARGE_EXEC) + != _PAGE_NX); + __supported_pte_mask &= ~(_PAGE_NX); + physical_mapping_iter = 1; - start = (unsigned long)__va(start); - end = (unsigned long)__va(end); +repeat: + last_map_addr = cache_end; + + start = (unsigned long)__va(cache_start); + end = (unsigned long)__va(cache_end); for (; start < end; start = next) { pgd_t *pgd = pgd_offset_k(start); @@ -514,11 +563,21 @@ static unsigned long __init kernel_physical_mapping_init(unsigned long start, next = end; if (pgd_val(*pgd)) { + /* + * Static identity mappings will be overwritten + * with run-time mappings. For example, this allows + * the static 0-1GB identity mapping to be mapped + * non-executable with this. + */ + if (is_kernel(pte_pfn(*((pte_t *) pgd)))) + goto realloc; + last_map_addr = phys_pud_update(pgd, __pa(start), __pa(end), page_size_mask); continue; } +realloc: pud = alloc_low_page(&pud_phys); last_map_addr = phys_pud_init(pud, __pa(start), __pa(next), page_size_mask); @@ -528,6 +587,16 @@ static unsigned long __init kernel_physical_mapping_init(unsigned long start, pgd_populate(&init_mm, pgd, __va(pud_phys)); spin_unlock(&init_mm.page_table_lock); } + __flush_tlb_all(); + + if (physical_mapping_iter == 1) { + physical_mapping_iter = 2; + /* + * Second iteration will set the actual desired PTE attributes. + */ + __supported_pte_mask = cached_supported_pte_mask; + goto repeat; + } return last_map_addr; } -- cgit v1.2.3 From 0b8fdcbcd287a1fbe66817491e6149841ae25705 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 23 Sep 2008 14:00:39 -0700 Subject: x86, cpa: dont use large pages for kernel identity mapping with DEBUG_PAGEALLOC Don't use large pages for kernel identity mapping with DEBUG_PAGEALLOC. This will remove the need to split the large page for the allocated kernel page in the interrupt context. This will simplify cpa code(as we don't do the split any more from the interrupt context). cpa code simplication in the subsequent patches. Signed-off-by: Suresh Siddha Cc: Suresh Siddha Cc: arjan@linux.intel.com Cc: venkatesh.pallipadi@intel.com Cc: jeremy@goop.org Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 18 ++++++++++++++---- arch/x86/mm/init_64.c | 26 ++++++++++++++++++++------ 2 files changed, 34 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 9b5f7d7049d..44ccb028c35 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -777,7 +777,7 @@ void __init setup_bootmem_allocator(void) after_init_bootmem = 1; } -static void __init find_early_table_space(unsigned long end) +static void __init find_early_table_space(unsigned long end, int use_pse) { unsigned long puds, pmds, ptes, tables, start; @@ -787,7 +787,7 @@ static void __init find_early_table_space(unsigned long end) pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; tables += PAGE_ALIGN(pmds * sizeof(pmd_t)); - if (cpu_has_pse) { + if (use_pse) { unsigned long extra; extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); @@ -827,12 +827,22 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, pgd_t *pgd_base = swapper_pg_dir; unsigned long start_pfn, end_pfn; unsigned long big_page_start; +#ifdef CONFIG_DEBUG_PAGEALLOC + /* + * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. + * This will simplify cpa(), which otherwise needs to support splitting + * large pages into small in interrupt context, etc. 
+ */ + int use_pse = 0; +#else + int use_pse = cpu_has_pse; +#endif /* * Find space for the kernel direct mapping tables. */ if (!after_init_bootmem) - find_early_table_space(end); + find_early_table_space(end, use_pse); #ifdef CONFIG_X86_PAE set_nx(); @@ -878,7 +888,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); if (start_pfn < end_pfn) kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, - cpu_has_pse); + use_pse); /* tail is not big page alignment ? */ start_pfn = end_pfn; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 1ba945eb628..9d7587ac1eb 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -456,13 +456,14 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end, return phys_pud_init(pud, addr, end, page_size_mask); } -static void __init find_early_table_space(unsigned long end) +static void __init find_early_table_space(unsigned long end, int use_pse, + int use_gbpages) { unsigned long puds, pmds, ptes, tables, start; puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; tables = round_up(puds * sizeof(pud_t), PAGE_SIZE); - if (direct_gbpages) { + if (use_gbpages) { unsigned long extra; extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; @@ -470,7 +471,7 @@ static void __init find_early_table_space(unsigned long end) pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE); - if (cpu_has_pse) { + if (use_pse) { unsigned long extra; extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; @@ -640,6 +641,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, struct map_range mr[NR_RANGE_MR]; int nr_range, i; + int use_pse, use_gbpages; printk(KERN_INFO "init_memory_mapping\n"); @@ -653,9 +655,21 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, if (!after_bootmem) init_gbpages(); - if (direct_gbpages) +#ifdef CONFIG_DEBUG_PAGEALLOC + /* + * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. + * This will simplify cpa(), which otherwise needs to support splitting + * large pages into small in interrupt context, etc. + */ + use_pse = use_gbpages = 0; +#else + use_pse = cpu_has_pse; + use_gbpages = direct_gbpages; +#endif + + if (use_gbpages) page_size_mask |= 1 << PG_LEVEL_1G; - if (cpu_has_pse) + if (use_pse) page_size_mask |= 1 << PG_LEVEL_2M; memset(mr, 0, sizeof(mr)); @@ -716,7 +730,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, (mr[i].page_size_mask & (1< Date: Tue, 23 Sep 2008 14:00:40 -0700 Subject: x86, cpa: no need to check alias for __set_pages_p/__set_pages_np No alias checking needed for setting present/not-present mapping. Otherwise, we may need to break large pages for 64-bit kernel text mappings (this adds to complexity if we want to do this from atomic context especially, for ex: with CONFIG_DEBUG_PAGEALLOC). Let's keep it simple! 
Signed-off-by: Suresh Siddha Cc: Suresh Siddha Cc: arjan@linux.intel.com Cc: venkatesh.pallipadi@intel.com Cc: jeremy@goop.org Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 4b6968ba086..162812b05d2 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1121,7 +1121,13 @@ static int __set_pages_p(struct page *page, int numpages) .mask_clr = __pgprot(0), .flags = 0}; - return __change_page_attr_set_clr(&cpa, 1); + /* + * No alias checking needed for setting present flag. otherwise, + * we may need to break large pages for 64-bit kernel text + * mappings (this adds to complexity if we want to do this from + * atomic context especially). Let's keep it simple! + */ + return __change_page_attr_set_clr(&cpa, 0); } static int __set_pages_np(struct page *page, int numpages) @@ -1133,7 +1139,13 @@ static int __set_pages_np(struct page *page, int numpages) .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), .flags = 0}; - return __change_page_attr_set_clr(&cpa, 1); + /* + * No alias checking needed for setting not present flag. otherwise, + * we may need to break large pages for 64-bit kernel text + * mappings (this adds to complexity if we want to do this from + * atomic context especially). Let's keep it simple! + */ + return __change_page_attr_set_clr(&cpa, 0); } void kernel_map_pages(struct page *page, int numpages, int enable) @@ -1153,11 +1165,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable) /* * The return value is ignored as the calls cannot fail. - * Large pages are kept enabled at boot time, and are - * split up quickly with DEBUG_PAGEALLOC. If a splitup - * fails here (due to temporary memory shortage) no damage - * is done because we just keep the largepage intact up - * to the next attempt when it will likely be split up: + * Large pages for identity mappings are not used at boot time + * and hence no memory allocations during large page split. */ if (enable) __set_pages_p(page, numpages); -- cgit v1.2.3 From 8311eb84bf842d345f543f4c62ca2b6ea26f638c Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 23 Sep 2008 14:00:41 -0700 Subject: x86, cpa: remove cpa pool code Interrupt context no longer splits large page in cpa(). So we can do away with cpa memory pool code. 
Signed-off-by: Suresh Siddha Cc: Suresh Siddha Cc: arjan@linux.intel.com Cc: venkatesh.pallipadi@intel.com Cc: jeremy@goop.org Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 1 - arch/x86/mm/init_64.c | 2 - arch/x86/mm/pageattr.c | 157 ++----------------------------------------------- 3 files changed, 5 insertions(+), 155 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 44ccb028c35..74780800e7e 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -1051,7 +1051,6 @@ void __init mem_init(void) if (boot_cpu_data.wp_works_ok < 0) test_wp_bit(); - cpa_init(); save_pg_dir(); zap_low_mappings(); } diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 9d7587ac1eb..f54a4d97530 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -889,8 +889,6 @@ void __init mem_init(void) reservedpages << (PAGE_SHIFT-10), datasize >> 10, initsize >> 10); - - cpa_init(); } void free_init_pages(char *what, unsigned long begin, unsigned long end) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 162812b05d2..f5e8663c0f7 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -447,114 +447,17 @@ out_unlock: return do_split; } -static LIST_HEAD(page_pool); -static unsigned long pool_size, pool_pages, pool_low; -static unsigned long pool_used, pool_failed; - -static void cpa_fill_pool(struct page **ret) -{ - gfp_t gfp = GFP_KERNEL; - unsigned long flags; - struct page *p; - - /* - * Avoid recursion (on debug-pagealloc) and also signal - * our priority to get to these pagetables: - */ - if (current->flags & PF_MEMALLOC) - return; - current->flags |= PF_MEMALLOC; - - /* - * Allocate atomically from atomic contexts: - */ - if (in_atomic() || irqs_disabled() || debug_pagealloc) - gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN; - - while (pool_pages < pool_size || (ret && !*ret)) { - p = alloc_pages(gfp, 0); - if (!p) { - pool_failed++; - break; - } - /* - * If the call site needs a page right now, provide it: - */ - if (ret && !*ret) { - *ret = p; - continue; - } - spin_lock_irqsave(&pgd_lock, flags); - list_add(&p->lru, &page_pool); - pool_pages++; - spin_unlock_irqrestore(&pgd_lock, flags); - } - - current->flags &= ~PF_MEMALLOC; -} - -#define SHIFT_MB (20 - PAGE_SHIFT) -#define ROUND_MB_GB ((1 << 10) - 1) -#define SHIFT_MB_GB 10 -#define POOL_PAGES_PER_GB 16 - -void __init cpa_init(void) -{ - struct sysinfo si; - unsigned long gb; - - si_meminfo(&si); - /* - * Calculate the number of pool pages: - * - * Convert totalram (nr of pages) to MiB and round to the next - * GiB. Shift MiB to Gib and multiply the result by - * POOL_PAGES_PER_GB: - */ - if (debug_pagealloc) { - gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB; - pool_size = POOL_PAGES_PER_GB * gb; - } else { - pool_size = 1; - } - pool_low = pool_size; - - cpa_fill_pool(NULL); - printk(KERN_DEBUG - "CPA: page pool initialized %lu of %lu pages preallocated\n", - pool_pages, pool_size); -} - static int split_large_page(pte_t *kpte, unsigned long address) { unsigned long flags, pfn, pfninc = 1; unsigned int i, level; pte_t *pbase, *tmp; pgprot_t ref_prot; - struct page *base; + struct page *base = alloc_pages(GFP_KERNEL, 0); + if (!base) + return -ENOMEM; - /* - * Get a page from the pool. 
The pool list is protected by the - * pgd_lock, which we have to take anyway for the split - * operation: - */ spin_lock_irqsave(&pgd_lock, flags); - if (list_empty(&page_pool)) { - spin_unlock_irqrestore(&pgd_lock, flags); - base = NULL; - cpa_fill_pool(&base); - if (!base) - return -ENOMEM; - spin_lock_irqsave(&pgd_lock, flags); - } else { - base = list_first_entry(&page_pool, struct page, lru); - list_del(&base->lru); - pool_pages--; - - if (pool_pages < pool_low) - pool_low = pool_pages; - } - /* * Check for races, another CPU might have split this page * up for us already: @@ -611,11 +514,8 @@ out_unlock: * If we dropped out via the lookup_address check under * pgd_lock then stick the page back into the pool: */ - if (base) { - list_add(&base->lru, &page_pool); - pool_pages++; - } else - pool_used++; + if (base) + __free_page(base); spin_unlock_irqrestore(&pgd_lock, flags); return 0; @@ -899,8 +799,6 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, cpa_flush_all(cache); out: - cpa_fill_pool(NULL); - return ret; } @@ -1178,53 +1076,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable) * but that can deadlock->flush only current cpu: */ __flush_tlb_all(); - - /* - * Try to refill the page pool here. We can do this only after - * the tlb flush. - */ - cpa_fill_pool(NULL); -} - -#ifdef CONFIG_DEBUG_FS -static int dpa_show(struct seq_file *m, void *v) -{ - seq_puts(m, "DEBUG_PAGEALLOC\n"); - seq_printf(m, "pool_size : %lu\n", pool_size); - seq_printf(m, "pool_pages : %lu\n", pool_pages); - seq_printf(m, "pool_low : %lu\n", pool_low); - seq_printf(m, "pool_used : %lu\n", pool_used); - seq_printf(m, "pool_failed : %lu\n", pool_failed); - - return 0; -} - -static int dpa_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, dpa_show, NULL); } -static const struct file_operations dpa_fops = { - .open = dpa_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int __init debug_pagealloc_proc_init(void) -{ - struct dentry *de; - - de = debugfs_create_file("debug_pagealloc", 0600, NULL, NULL, - &dpa_fops); - if (!de) - return -ENOMEM; - - return 0; -} -__initcall(debug_pagealloc_proc_init); -#endif - #ifdef CONFIG_HIBERNATION bool kernel_page_present(struct page *page) -- cgit v1.2.3 From ad5ca55f6bdb47c957b681c7358bb3719ba4ee82 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 23 Sep 2008 14:00:42 -0700 Subject: x86, cpa: srlz cpa(), global flush tlb after splitting big page and before doing cpa Do a global flush tlb after splitting the large page and before we do the actual change page attribute in the PTE. With out this, we violate the TLB application note, which says "The TLBs may contain both ordinary and large-page translations for a 4-KByte range of linear addresses. This may occur if software modifies the paging structures so that the page size used for the address range changes. If the two translations differ with respect to page frame or attributes (e.g., permissions), processor behavior is undefined and may be implementation-specific." And also serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings) using cpa_lock. So that we don't allow any other cpu, with stale large tlb entries change the page attribute in parallel to some other cpu splitting a large page entry along with changing the attribute. 
Signed-off-by: Suresh Siddha Cc: Suresh Siddha Cc: arjan@linux.intel.com Cc: venkatesh.pallipadi@intel.com Cc: jeremy@goop.org Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 40 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index f5e8663c0f7..b6374d653d0 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -35,6 +35,14 @@ struct cpa_data { int curpage; }; +/* + * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings) + * using cpa_lock. So that we don't allow any other cpu, with stale large tlb + * entries change the page attribute in parallel to some other cpu + * splitting a large page entry along with changing the attribute. + */ +static DEFINE_SPINLOCK(cpa_lock); + #define CPA_FLUSHTLB 1 #define CPA_ARRAY 2 @@ -453,7 +461,13 @@ static int split_large_page(pte_t *kpte, unsigned long address) unsigned int i, level; pte_t *pbase, *tmp; pgprot_t ref_prot; - struct page *base = alloc_pages(GFP_KERNEL, 0); + struct page *base; + + if (!debug_pagealloc) + spin_unlock(&cpa_lock); + base = alloc_pages(GFP_KERNEL, 0); + if (!debug_pagealloc) + spin_lock(&cpa_lock); if (!base) return -ENOMEM; @@ -594,7 +608,25 @@ repeat: */ err = split_large_page(kpte, address); if (!err) { - cpa->flags |= CPA_FLUSHTLB; + /* + * Do a global flush tlb after splitting the large page + * and before we do the actual change page attribute in the PTE. + * + * With out this, we violate the TLB application note, that says + * "The TLBs may contain both ordinary and large-page + * translations for a 4-KByte range of linear addresses. This + * may occur if software modifies the paging structures so that + * the page size used for the address range changes. If the two + * translations differ with respect to page frame or attributes + * (e.g., permissions), processor behavior is undefined and may + * be implementation-specific." + * + * We do this global tlb flush inside the cpa_lock, so that we + * don't allow any other cpu, with stale tlb entries change the + * page attribute in parallel, that also falls into the + * just split large page entry. + */ + flush_tlb_all(); goto repeat; } @@ -686,7 +718,11 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) if (cpa->flags & CPA_ARRAY) cpa->numpages = 1; + if (!debug_pagealloc) + spin_lock(&cpa_lock); ret = __change_page_attr(cpa, checkalias); + if (!debug_pagealloc) + spin_unlock(&cpa_lock); if (ret) return ret; -- cgit v1.2.3 From 9542ada803198e6eba29d3289abb39ea82047b92 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 24 Sep 2008 08:53:33 -0700 Subject: x86: track memtype for RAM in page struct Track the memtype for RAM pages in page struct instead of using the memtype list. This avoids the explosion in the number of entries in memtype list (of the order of 20,000 with AGP) and makes the PAT tracking simpler. We are using PG_arch_1 bit in page->flags. We still use the memtype list for non RAM pages. 
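To illustrate the usage rule described above, a hypothetical driver-side sequence for a single RAM page (the buffer allocation is made up; the one-UC-at-a-time rule and the PG_arch_1 tracking are taken from this patch's description):

	/* hypothetical illustration -- not part of this patch */
	struct page *page = alloc_page(GFP_KERNEL);
	unsigned long vaddr = (unsigned long)page_address(page);

	set_memory_uc(vaddr, 1);	/* RAM page now tracked as non-WB (PG_arch_1) */
	/* ... hand the buffer to the device, use it uncached ... */
	set_memory_wb(vaddr, 1);	/* back to write-back, tracking bit cleared */

A second set_memory_uc()/set_memory_wc() request against the same page before the set_memory_wb() above is expected to fail, since only one driver should own the page and change its type at a time.
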
Signed-off-by: Suresh Siddha Signed-off-by: Venkatesh Pallipadi Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 19 ++++++++++++ arch/x86/mm/pat.c | 83 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+) (limited to 'arch') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index d4b6e6a29ae..d03c461e045 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -83,6 +83,25 @@ int page_is_ram(unsigned long pagenr) return 0; } +int pagerange_is_ram(unsigned long start, unsigned long end) +{ + int ram_page = 0, not_rampage = 0; + unsigned long page_nr; + + for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT); + ++page_nr) { + if (page_is_ram(page_nr)) + ram_page = 1; + else + not_rampage = 1; + + if (ram_page == not_rampage) + return -1; + } + + return ram_page; +} + /* * Fix up the linear direct mapping of the kernel to avoid cache attribute * conflicts. diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index f049b1d6ebd..aceb6c7c6db 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -210,6 +210,75 @@ static int chk_conflict(struct memtype *new, struct memtype *entry, static struct memtype *cached_entry; static u64 cached_start; +/* + * RED-PEN: TODO: Add PageReserved() check as well here, + * once we add SetPageReserved() to all the drivers using + * set_memory_* or set_pages_*. + * + * This will help prevent accidentally freeing pages + * before setting the attribute back to WB. + */ + +/* + * For RAM pages, mark the pages as non WB memory type using + * PageNonWB (PG_arch_1). We allow only one set_memory_uc() or + * set_memory_wc() on a RAM page at a time before marking it as WB again. + * This is ok, because only one driver will be owning the page and + * doing set_memory_*() calls. + * + * For now, we use PageNonWB to track that the RAM page is being mapped + * as non WB. In future, we will have to use one more flag + * (or some other mechanism in page_struct) to distinguish between + * UC and WC mapping. 
+ */ +static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type, + unsigned long *new_type) +{ + struct page *page; + u64 pfn, end_pfn; + + for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { + page = pfn_to_page(pfn); + if (page_mapped(page) || PageNonWB(page)) + goto out; + + SetPageNonWB(page); + } + return 0; + +out: + end_pfn = pfn; + for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) { + page = pfn_to_page(pfn); + ClearPageNonWB(page); + } + + return -EINVAL; +} + +static int free_ram_pages_type(u64 start, u64 end) +{ + struct page *page; + u64 pfn, end_pfn; + + for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { + page = pfn_to_page(pfn); + if (page_mapped(page) || !PageNonWB(page)) + goto out; + + ClearPageNonWB(page); + } + return 0; + +out: + end_pfn = pfn; + for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) { + page = pfn_to_page(pfn); + SetPageNonWB(page); + } + return -EINVAL; +} + /* * req_type typically has one of the: * - _PAGE_CACHE_WB @@ -232,6 +301,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, unsigned long actual_type; struct list_head *where; int err = 0; + int is_range_ram; BUG_ON(start >= end); /* end is exclusive */ @@ -270,6 +340,12 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, actual_type = pat_x_mtrr_type(start, end, req_type & _PAGE_CACHE_MASK); + is_range_ram = pagerange_is_ram(start, end); + if (is_range_ram == 1) + return reserve_ram_pages_type(start, end, req_type, new_type); + else if (is_range_ram < 0) + return -EINVAL; + new = kmalloc(sizeof(struct memtype), GFP_KERNEL); if (!new) return -ENOMEM; @@ -358,6 +434,7 @@ int free_memtype(u64 start, u64 end) { struct memtype *entry; int err = -EINVAL; + int is_range_ram; if (!pat_enabled) return 0; @@ -366,6 +443,12 @@ int free_memtype(u64 start, u64 end) if (is_ISA_range(start, end - 1)) return 0; + is_range_ram = pagerange_is_ram(start, end); + if (is_range_ram == 1) + return free_ram_pages_type(start, end); + else if (is_range_ram < 0) + return -EINVAL; + spin_lock(&memtype_lock); list_for_each_entry(entry, &memtype_list, nd) { if (entry->start == start && entry->end == end) { -- cgit v1.2.3 From 28dd033f43ca957cd751e02652b36c6fa364ca18 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 29 Sep 2008 12:13:26 -0700 Subject: x86: fix pagetable init 64-bit breakage Fix _end alignment check - can trigger a crash if _end happens to be on a page boundary. Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index f54a4d97530..6116ff0d741 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -510,7 +510,7 @@ static int is_kernel(unsigned long pfn) unsigned long pg_addresss = pfn << PAGE_SHIFT; if (pg_addresss >= (unsigned long) __pa(_text) && - pg_addresss <= (unsigned long) __pa(_end)) + pg_addresss < (unsigned long) __pa(_end)) return 1; return 0; -- cgit v1.2.3 From ad2cde16a21985cdc4302e4a4b0fc373d666fdf7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 30 Sep 2008 13:20:45 +0200 Subject: x86, pat: cleanups clean up recently added code to be more consistent with other x86 code. 
Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 67 ++++++++++++++++++++++++++----------------------------- 1 file changed, 32 insertions(+), 35 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index aceb6c7c6db..738fd0f2495 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -7,24 +7,24 @@ * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen. */ -#include +#include +#include +#include #include #include +#include #include -#include -#include -#include -#include -#include +#include #include -#include +#include #include -#include -#include -#include #include +#include #include +#include +#include +#include #include #ifdef CONFIG_X86_PAT @@ -46,6 +46,7 @@ early_param("nopat", nopat); static int debug_enable; + static int __init pat_debug_setup(char *str) { debug_enable = 1; @@ -145,14 +146,14 @@ static char *cattr_name(unsigned long flags) */ struct memtype { - u64 start; - u64 end; - unsigned long type; - struct list_head nd; + u64 start; + u64 end; + unsigned long type; + struct list_head nd; }; static LIST_HEAD(memtype_list); -static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ +static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ /* * Does intersection of PAT memory type and MTRR memory type and returns @@ -180,8 +181,8 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type) return req_type; } -static int chk_conflict(struct memtype *new, struct memtype *entry, - unsigned long *type) +static int +chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type) { if (new->type != entry->type) { if (type) { @@ -210,15 +211,6 @@ static int chk_conflict(struct memtype *new, struct memtype *entry, static struct memtype *cached_entry; static u64 cached_start; -/* - * RED-PEN: TODO: Add PageReserved() check as well here, - * once we add SetPageReserved() to all the drivers using - * set_memory_* or set_pages_*. - * - * This will help prevent accidentally freeing pages - * before setting the attribute back to WB. - */ - /* * For RAM pages, mark the pages as non WB memory type using * PageNonWB (PG_arch_1). We allow only one set_memory_uc() or @@ -232,7 +224,7 @@ static u64 cached_start; * UC and WC mapping. */ static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type, - unsigned long *new_type) + unsigned long *new_type) { struct page *page; u64 pfn, end_pfn; @@ -295,15 +287,15 @@ out: * it will return a negative return value. 
*/ int reserve_memtype(u64 start, u64 end, unsigned long req_type, - unsigned long *new_type) + unsigned long *new_type) { struct memtype *new, *entry; unsigned long actual_type; struct list_head *where; - int err = 0; int is_range_ram; + int err = 0; - BUG_ON(start >= end); /* end is exclusive */ + BUG_ON(start >= end); /* end is exclusive */ if (!pat_enabled) { /* This is identical to page table setting without PAT */ @@ -336,9 +328,10 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, actual_type = _PAGE_CACHE_WB; else actual_type = _PAGE_CACHE_UC_MINUS; - } else + } else { actual_type = pat_x_mtrr_type(start, end, req_type & _PAGE_CACHE_MASK); + } is_range_ram = pagerange_is_ram(start, end); if (is_range_ram == 1) @@ -350,9 +343,9 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, if (!new) return -ENOMEM; - new->start = start; - new->end = end; - new->type = actual_type; + new->start = start; + new->end = end; + new->type = actual_type; if (new_type) *new_type = actual_type; @@ -411,6 +404,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, start, end, cattr_name(new->type), cattr_name(req_type)); kfree(new); spin_unlock(&memtype_lock); + return err; } @@ -469,6 +463,7 @@ int free_memtype(u64 start, u64 end) } dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end); + return err; } @@ -575,9 +570,9 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) { + unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK); u64 addr = (u64)pfn << PAGE_SHIFT; unsigned long flags; - unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK); reserve_memtype(addr, addr + size, want_flags, &flags); if (flags != want_flags) { @@ -620,6 +615,7 @@ static struct memtype *memtype_get_idx(loff_t pos) } spin_unlock(&memtype_lock); kfree(print_entry); + return NULL; } @@ -650,6 +646,7 @@ static int memtype_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type), print_entry->start, print_entry->end); kfree(print_entry); + return 0; } -- cgit v1.2.3 From b27a43c1e90582facad44de67d02bc9e9f900289 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 7 Oct 2008 13:58:46 -0700 Subject: x86, cpa: make the kernel physical mapping initialization a two pass sequence, fix Jeremy Fitzhardinge wrote: > I'd noticed that current tip/master hasn't been booting under Xen, and I > just got around to bisecting it down to this change. > > commit 065ae73c5462d42e9761afb76f2b52965ff45bd6 > Author: Suresh Siddha > > x86, cpa: make the kernel physical mapping initialization a two pass sequence > > This patch is causing Xen to fail various pagetable updates because it > ends up remapping pagetables to RW, which Xen explicitly prohibits (as > that would allow guests to make arbitrary changes to pagetables, rather > than have them mediated by the hypervisor). Instead of making init a two pass sequence, to satisfy the Intel's TLB Application note (developer.intel.com/design/processor/applnots/317080.pdf Section 6 page 26), we preserve the original page permissions when fragmenting the large mappings and don't touch the existing memory mapping (which satisfies Xen's requirements). Only open issue is: on a native linux kernel, we will go back to mapping the first 0-1GB kernel identity mapping as executable (because of the static mapping setup in head_64.S). We can fix this in a different patch if needed. 
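The key mechanism in the diff below is that a large entry which must be fragmented hands its own protection bits, minus the PSE bit, down to the smaller entries, so old and new translations never differ in frame or attributes. Roughly, inside phys_pmd_init()'s per-pmd loop (locking, bookkeeping and the already-4K case elided; this is a condensed reading aid, not the patch itself):

        pgprot_t new_prot = prot;       /* prot handed down from the pud level */

        if (pmd_val(*pmd) && pmd_large(*pmd)) {
                if (page_size_mask & (1 << PG_LEVEL_2M))
                        continue;       /* a 2M mapping is wanted anyway: keep the existing entry */

                /* must fragment: inherit the old attributes, only without _PAGE_PSE */
                new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
        }

        pte = alloc_low_page(&pte_phys);
        last_map_addr = phys_pte_init(pte, address, end, new_prot);
        pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));

The pud level does the same dance one step up, defaulting to PAGE_KERNEL only for entries that did not exist before.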
Signed-off-by: Suresh Siddha Acked-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 149 +++++++++++++++++++++----------------------------- 1 file changed, 61 insertions(+), 88 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 6116ff0d741..8c7eae490a2 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -270,10 +270,9 @@ static __ref void unmap_low_page(void *adr) early_iounmap(adr, PAGE_SIZE); } -static int physical_mapping_iter; - static unsigned long __meminit -phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end) +phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end, + pgprot_t prot) { unsigned pages = 0; unsigned long last_map_addr = end; @@ -291,35 +290,40 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end) break; } + /* + * We will re-use the existing mapping. + * Xen for example has some special requirements, like mapping + * pagetable pages as RO. So assume someone who pre-setup + * these mappings are more intelligent. + */ if (pte_val(*pte)) - goto repeat_set_pte; + continue; if (0) printk(" pte=%p addr=%lx pte=%016lx\n", pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte); pages++; -repeat_set_pte: - set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL)); + set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot)); last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE; } - if (physical_mapping_iter == 1) - update_page_count(PG_LEVEL_4K, pages); + update_page_count(PG_LEVEL_4K, pages); return last_map_addr; } static unsigned long __meminit -phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end) +phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end, + pgprot_t prot) { pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd); - return phys_pte_init(pte, address, end); + return phys_pte_init(pte, address, end, prot); } static unsigned long __meminit phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, - unsigned long page_size_mask) + unsigned long page_size_mask, pgprot_t prot) { unsigned long pages = 0; unsigned long last_map_addr = end; @@ -330,6 +334,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, unsigned long pte_phys; pmd_t *pmd = pmd_page + pmd_index(address); pte_t *pte; + pgprot_t new_prot = prot; if (address >= end) { if (!after_bootmem) { @@ -343,45 +348,58 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, if (!pmd_large(*pmd)) { spin_lock(&init_mm.page_table_lock); last_map_addr = phys_pte_update(pmd, address, - end); + end, prot); spin_unlock(&init_mm.page_table_lock); continue; } - goto repeat_set_pte; + /* + * If we are ok with PG_LEVEL_2M mapping, then we will + * use the existing mapping, + * + * Otherwise, we will split the large page mapping but + * use the same existing protection bits except for + * large page, so that we don't violate Intel's TLB + * Application note (317080) which says, while changing + * the page sizes, new and old translations should + * not differ with respect to page frame and + * attributes. 
+ */ + if (page_size_mask & (1 << PG_LEVEL_2M)) + continue; + new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd)); } if (page_size_mask & (1<> PAGE_SHIFT, PAGE_KERNEL_LARGE)); + pfn_pte(address >> PAGE_SHIFT, + __pgprot(pgprot_val(prot) | _PAGE_PSE))); spin_unlock(&init_mm.page_table_lock); last_map_addr = (address & PMD_MASK) + PMD_SIZE; continue; } pte = alloc_low_page(&pte_phys); - last_map_addr = phys_pte_init(pte, address, end); + last_map_addr = phys_pte_init(pte, address, end, new_prot); unmap_low_page(pte); spin_lock(&init_mm.page_table_lock); pmd_populate_kernel(&init_mm, pmd, __va(pte_phys)); spin_unlock(&init_mm.page_table_lock); } - if (physical_mapping_iter == 1) - update_page_count(PG_LEVEL_2M, pages); + update_page_count(PG_LEVEL_2M, pages); return last_map_addr; } static unsigned long __meminit phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end, - unsigned long page_size_mask) + unsigned long page_size_mask, pgprot_t prot) { pmd_t *pmd = pmd_offset(pud, 0); unsigned long last_map_addr; - last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask); + last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot); __flush_tlb_all(); return last_map_addr; } @@ -398,6 +416,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, unsigned long pmd_phys; pud_t *pud = pud_page + pud_index(addr); pmd_t *pmd; + pgprot_t prot = PAGE_KERNEL; if (addr >= end) break; @@ -411,16 +430,28 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, if (pud_val(*pud)) { if (!pud_large(*pud)) { last_map_addr = phys_pmd_update(pud, addr, end, - page_size_mask); + page_size_mask, prot); continue; } - - goto repeat_set_pte; + /* + * If we are ok with PG_LEVEL_1G mapping, then we will + * use the existing mapping. + * + * Otherwise, we will split the gbpage mapping but use + * the same existing protection bits except for large + * page, so that we don't violate Intel's TLB + * Application note (317080) which says, while changing + * the page sizes, new and old translations should + * not differ with respect to page frame and + * attributes. 
+ */ + if (page_size_mask & (1 << PG_LEVEL_1G)) + continue; + prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud)); } if (page_size_mask & (1<> PAGE_SHIFT, PAGE_KERNEL_LARGE)); @@ -430,7 +461,8 @@ repeat_set_pte: } pmd = alloc_low_page(&pmd_phys); - last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask); + last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask, + prot); unmap_low_page(pmd); spin_lock(&init_mm.page_table_lock); @@ -439,8 +471,7 @@ repeat_set_pte: } __flush_tlb_all(); - if (physical_mapping_iter == 1) - update_page_count(PG_LEVEL_1G, pages); + update_page_count(PG_LEVEL_1G, pages); return last_map_addr; } @@ -505,54 +536,15 @@ static void __init init_gbpages(void) direct_gbpages = 0; } -static int is_kernel(unsigned long pfn) -{ - unsigned long pg_addresss = pfn << PAGE_SHIFT; - - if (pg_addresss >= (unsigned long) __pa(_text) && - pg_addresss < (unsigned long) __pa(_end)) - return 1; - - return 0; -} - static unsigned long __init kernel_physical_mapping_init(unsigned long start, unsigned long end, unsigned long page_size_mask) { - unsigned long next, last_map_addr; - u64 cached_supported_pte_mask = __supported_pte_mask; - unsigned long cache_start = start; - unsigned long cache_end = end; - - /* - * First iteration will setup identity mapping using large/small pages - * based on page_size_mask, with other attributes same as set by - * the early code in head_64.S - * - * Second iteration will setup the appropriate attributes - * as desired for the kernel identity mapping. - * - * This two pass mechanism conforms to the TLB app note which says: - * - * "Software should not write to a paging-structure entry in a way - * that would change, for any linear address, both the page size - * and either the page frame or attributes." - * - * For now, only difference between very early PTE attributes used in - * head_64.S and here is _PAGE_NX. - */ - BUILD_BUG_ON((__PAGE_KERNEL_LARGE & ~__PAGE_KERNEL_IDENT_LARGE_EXEC) - != _PAGE_NX); - __supported_pte_mask &= ~(_PAGE_NX); - physical_mapping_iter = 1; + unsigned long next, last_map_addr = end; -repeat: - last_map_addr = cache_end; - - start = (unsigned long)__va(cache_start); - end = (unsigned long)__va(cache_end); + start = (unsigned long)__va(start); + end = (unsigned long)__va(end); for (; start < end; start = next) { pgd_t *pgd = pgd_offset_k(start); @@ -564,21 +556,11 @@ repeat: next = end; if (pgd_val(*pgd)) { - /* - * Static identity mappings will be overwritten - * with run-time mappings. For example, this allows - * the static 0-1GB identity mapping to be mapped - * non-executable with this. - */ - if (is_kernel(pte_pfn(*((pte_t *) pgd)))) - goto realloc; - last_map_addr = phys_pud_update(pgd, __pa(start), __pa(end), page_size_mask); continue; } -realloc: pud = alloc_low_page(&pud_phys); last_map_addr = phys_pud_init(pud, __pa(start), __pa(next), page_size_mask); @@ -590,15 +572,6 @@ realloc: } __flush_tlb_all(); - if (physical_mapping_iter == 1) { - physical_mapping_iter = 2; - /* - * Second iteration will set the actual desired PTE attributes. - */ - __supported_pte_mask = cached_supported_pte_mask; - goto repeat; - } - return last_map_addr; } -- cgit v1.2.3
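At the leaf level the patch above adds the other half of the rule: a PTE that is already populated is never rewritten, which is what keeps Xen's read-only pagetable mappings (and the early head_64.S entries) intact, while fresh entries take whatever prot was handed down from above. A condensed form of the loop, with page counting, the early-boot zeroing of the tail, and debug output dropped (phys_pte_init_sketch is just an illustrative name):

static unsigned long phys_pte_init_sketch(pte_t *pte_page, unsigned long addr,
                                          unsigned long end, pgprot_t prot)
{
        unsigned long last_map_addr = end;
        int i = pte_index(addr);
        pte_t *pte = pte_page + i;

        for (; i < PTRS_PER_PTE && addr < end; i++, addr += PAGE_SIZE, pte++) {
                if (pte_val(*pte))
                        continue;       /* pre-set mapping: assume whoever set it up knew better */

                set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot));
                last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
        }

        return last_map_addr;
}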