From 99bd0c0fc4b04da54cb311953ef9489931c19c63 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Fri, 19 Jun 2009 10:59:09 +0200 Subject: x86: Set cpu_llc_id on AMD CPUs This counts when building sched domains in case NUMA information is not available. ( See cpu_coregroup_mask() which uses llc_shared_map which in turn is created based on cpu_llc_id. ) Currently Linux builds domains as follows: (example from a dual socket quad-core system) CPU0 attaching sched-domain: domain 0: span 0-7 level CPU groups: 0 1 2 3 4 5 6 7 ... CPU7 attaching sched-domain: domain 0: span 0-7 level CPU groups: 7 0 1 2 3 4 5 6 Ever since, this has been borked for multi-core AMD CPU systems. This patch fixes that and now we get a proper: CPU0 attaching sched-domain: domain 0: span 0-3 level MC groups: 0 1 2 3 domain 1: span 0-7 level CPU groups: 0-3 4-7 ... CPU7 attaching sched-domain: domain 0: span 4-7 level MC groups: 7 4 5 6 domain 1: span 0-7 level CPU groups: 4-7 0-3 This allows the scheduler to assign tasks to cores on different sockets (i.e. that don't share last level cache) for performance reasons. Signed-off-by: Andreas Herrmann LKML-Reference: <20090619085909.GJ5218@alberich.amd.com> Cc: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index e5b27d8f1b4..28e5f595604 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -258,13 +258,15 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_HT unsigned bits; + int cpu = smp_processor_id(); bits = c->x86_coreid_bits; - /* Low order bits define the core id (index of core in socket) */ c->cpu_core_id = c->initial_apicid & ((1 << bits)-1); /* Convert the initial APIC ID into the socket ID */ c->phys_proc_id = c->initial_apicid >> bits; + /* use socket ID also for last level cache */ + per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; #endif } -- cgit v1.2.3 From d9f2a5ecb2846d0fd368fb4c45182e43f38e4471 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 20 Jun 2009 13:19:25 +0530 Subject: perf_counter, x86: Fix L1-data-Cache-Store-Referencees for AMD Fix AMD's Data Cache Refills from System event.
After this patch : ./tools/perf/perf stat -e l1d -e l1d-misses -e l1d-write -e l1d-prefetch -e l1d-prefetch-miss -e l1i -e l1i-misses -e l1i-prefetch -e l2 -e l2-misses -e l2-write -e dtlb -e dtlb-misses -e itlb -e itlb-misses -e bpu -e bpu-misses ls /dev/ > /dev/null Performance counter stats for 'ls /dev/': 2499484 L1-data-Cache-Load-Referencees (scaled from 3.97%) 70347 L1-data-Cache-Load-Misses (scaled from 7.30%) 9360 L1-data-Cache-Store-Referencees (scaled from 8.64%) 32804 L1-data-Cache-Prefetch-Referencees (scaled from 17.72%) 7693 L1-data-Cache-Prefetch-Misses (scaled from 22.97%) 2180945 L1-instruction-Cache-Load-Referencees (scaled from 28.48%) 14518 L1-instruction-Cache-Load-Misses (scaled from 35.00%) 2405 L1-instruction-Cache-Prefetch-Referencees (scaled from 34.89%) 71387 L2-Cache-Load-Referencees (scaled from 34.94%) 18732 L2-Cache-Load-Misses (scaled from 34.92%) 79918 L2-Cache-Store-Referencees (scaled from 36.02%) 1295294 Data-TLB-Cache-Load-Referencees (scaled from 35.99%) 30896 Data-TLB-Cache-Load-Misses (scaled from 33.36%) 1222030 Instruction-TLB-Cache-Load-Referencees (scaled from 29.46%) 357 Instruction-TLB-Cache-Load-Misses (scaled from 20.46%) 530888 Branch-Cache-Load-Referencees (scaled from 11.48%) 8638 Branch-Cache-Load-Misses (scaled from 5.09%) 0.011295149 seconds time elapsed. Earlier it always shows value 0. Signed-off-by: Jaswinder Singh Rajput LKML-Reference: <1245484165.3102.6.camel@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 76dfef23f78..22eb3a1d4f9 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -401,7 +401,7 @@ static const u64 amd_hw_cache_event_ids [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */ }, [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0042, /* Data Cache Refills from L2 */ + [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */ [ C(RESULT_MISS) ] = 0, }, [ C(OP_PREFETCH) ] = { -- cgit v1.2.3 From c5806df9232d2a7f554b4839b57cac2e664fc256 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Jun 2009 11:56:24 +0900 Subject: x86: fix duplicate free in setup_pcpu_remap() failure path In the failure path, setup_pcpu_remap() tries to free the area which has already been freed to make holes in the large page. Fix it. 
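( The per-cpu allocation loop already returned the PMD_SIZE - pcpur_size tail of each 2MB page to bootmem when the hole was punched, so the error path must free only the pcpur_size bytes that are still allocated rather than the full PMD_SIZE. )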
[ Impact: fix duplicate free in failure path ] Signed-off-by: Tejun Heo Cc: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 9c3f0823e6a..dfbc7e6c64d 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -228,7 +228,7 @@ static ssize_t __init setup_pcpu_remap(size_t static_size) enomem: for_each_possible_cpu(cpu) if (pcpur_ptrs[cpu]) - free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE); + free_bootmem(__pa(pcpur_ptrs[cpu]), pcpur_size); ret = -ENOMEM; out_free_ar: free_bootmem(__pa(pcpur_ptrs), ptrs_size); -- cgit v1.2.3 From 97c9bf0618cd40b05b4859c1f8a90d8ad97fefb2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Jun 2009 11:56:24 +0900 Subject: x86: rename remap percpu first chunk allocator to lpage The "remap" allocator remaps large pages to build the first chunk; however, the name isn't very good because 4k allocator remaps too and the whole point of the remap allocator is using large page mapping. The allocator will be generalized and exported outside of x86, rename it to lpage before that happens. percpu_alloc kernel parameter is updated to accept both "remap" and "lpage" for lpage allocator. [ Impact: code cleanup, kernel parameter argument updated ] Signed-off-by: Tejun Heo Cc: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 50 +++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index dfbc7e6c64d..8794c0c94d2 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -124,7 +124,7 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, } /* - * Remap allocator + * Large page remap allocator * * This allocator uses PMD page as unit. A PMD page is allocated for * each cpu and each is remapped into vmalloc area using PMD mapping. @@ -137,20 +137,20 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, * better than only using 4k mappings while still being NUMA friendly. */ #ifdef CONFIG_NEED_MULTIPLE_NODES -static size_t pcpur_size __initdata; -static void **pcpur_ptrs __initdata; +static size_t pcpul_size __initdata; +static void **pcpul_ptrs __initdata; -static struct page * __init pcpur_get_page(unsigned int cpu, int pageno) +static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) { size_t off = (size_t)pageno << PAGE_SHIFT; - if (off >= pcpur_size) + if (off >= pcpul_size) return NULL; - return virt_to_page(pcpur_ptrs[cpu] + off); + return virt_to_page(pcpul_ptrs[cpu] + off); } -static ssize_t __init setup_pcpu_remap(size_t static_size) +static ssize_t __init setup_pcpu_lpage(size_t static_size) { static struct vm_struct vm; size_t ptrs_size, dyn_size; @@ -170,36 +170,36 @@ static ssize_t __init setup_pcpu_remap(size_t static_size) * Currently supports only single page. Supporting multiple * pages won't be too difficult if it ever becomes necessary. 
*/ - pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + + pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE); - if (pcpur_size > PMD_SIZE) { + if (pcpul_size > PMD_SIZE) { pr_warning("PERCPU: static data is larger than large page, " "can't use large page\n"); return -EINVAL; } - dyn_size = pcpur_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; + dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; /* allocate pointer array and alloc large pages */ - ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0])); - pcpur_ptrs = alloc_bootmem(ptrs_size); + ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_ptrs[0])); + pcpul_ptrs = alloc_bootmem(ptrs_size); for_each_possible_cpu(cpu) { - pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE); - if (!pcpur_ptrs[cpu]) + pcpul_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE); + if (!pcpul_ptrs[cpu]) goto enomem; /* - * Only use pcpur_size bytes and give back the rest. + * Only use pcpul_size bytes and give back the rest. * * Ingo: The 2MB up-rounding bootmem is needed to make * sure the partial 2MB page is still fully RAM - it's * not well-specified to have a PAT-incompatible area * (unmapped RAM, device memory, etc.) in that hole. */ - free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size), - PMD_SIZE - pcpur_size); + free_bootmem(__pa(pcpul_ptrs[cpu] + pcpul_size), + PMD_SIZE - pcpul_size); - memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size); + memcpy(pcpul_ptrs[cpu], __per_cpu_load, static_size); } /* allocate address and map */ @@ -212,7 +212,7 @@ static ssize_t __init setup_pcpu_remap(size_t static_size) pmd = populate_extra_pmd((unsigned long)vm.addr + cpu * PMD_SIZE); - set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])), + set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpul_ptrs[cpu])), PAGE_KERNEL_LARGE)); } @@ -220,22 +220,22 @@ static ssize_t __init setup_pcpu_remap(size_t static_size) pr_info("PERCPU: Remapped at %p with large pages, static data " "%zu bytes\n", vm.addr, static_size); - ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, + ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, PERCPU_FIRST_CHUNK_RESERVE, dyn_size, PMD_SIZE, vm.addr, NULL); goto out_free_ar; enomem: for_each_possible_cpu(cpu) - if (pcpur_ptrs[cpu]) - free_bootmem(__pa(pcpur_ptrs[cpu]), pcpur_size); + if (pcpul_ptrs[cpu]) + free_bootmem(__pa(pcpul_ptrs[cpu]), pcpul_size); ret = -ENOMEM; out_free_ar: - free_bootmem(__pa(pcpur_ptrs), ptrs_size); + free_bootmem(__pa(pcpul_ptrs), ptrs_size); return ret; } #else -static ssize_t __init setup_pcpu_remap(size_t static_size) +static ssize_t __init setup_pcpu_lpage(size_t static_size) { return -EINVAL; } @@ -367,7 +367,7 @@ void __init setup_per_cpu_areas(void) * of large page mappings. Please read comments on top of * each allocator for details. */ - ret = setup_pcpu_remap(static_size); + ret = setup_pcpu_lpage(static_size); if (ret < 0) ret = setup_pcpu_embed(static_size); if (ret < 0) -- cgit v1.2.3 From 0ff2587fd54bd6f66bc6914ada4eb77a7e819a5b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Jun 2009 11:56:24 +0900 Subject: x86: prepare setup_pcpu_lpage() for pageattr fix Make the following changes in preparation of coming pageattr updates. * Define and use array of struct pcpul_ent instead of array of pointers. The only difference is ->cpu field which is set but unused yet. * Rename variables according to the above change. * Rename local variable vm to pcpul_vm and move it out of the function. 
[ Impact: no functional difference ] Signed-off-by: Tejun Heo Cc: Jan Beulich Cc: Andi Kleen Cc: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 58 ++++++++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 8794c0c94d2..7d38941e2b8 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -137,8 +137,14 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, * better than only using 4k mappings while still being NUMA friendly. */ #ifdef CONFIG_NEED_MULTIPLE_NODES +struct pcpul_ent { + unsigned int cpu; + void *ptr; +}; + static size_t pcpul_size __initdata; -static void **pcpul_ptrs __initdata; +static struct pcpul_ent *pcpul_map __initdata; +static struct vm_struct pcpul_vm; static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) { @@ -147,13 +153,12 @@ static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) if (off >= pcpul_size) return NULL; - return virt_to_page(pcpul_ptrs[cpu] + off); + return virt_to_page(pcpul_map[cpu].ptr + off); } static ssize_t __init setup_pcpu_lpage(size_t static_size) { - static struct vm_struct vm; - size_t ptrs_size, dyn_size; + size_t map_size, dyn_size; unsigned int cpu; ssize_t ret; @@ -180,12 +185,14 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size) dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; /* allocate pointer array and alloc large pages */ - ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_ptrs[0])); - pcpul_ptrs = alloc_bootmem(ptrs_size); + map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0])); + pcpul_map = alloc_bootmem(map_size); for_each_possible_cpu(cpu) { - pcpul_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE); - if (!pcpul_ptrs[cpu]) + pcpul_map[cpu].cpu = cpu; + pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE, + PMD_SIZE); + if (!pcpul_map[cpu].ptr) goto enomem; /* @@ -196,42 +203,43 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size) * not well-specified to have a PAT-incompatible area * (unmapped RAM, device memory, etc.) in that hole. 
*/ - free_bootmem(__pa(pcpul_ptrs[cpu] + pcpul_size), + free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size), PMD_SIZE - pcpul_size); - memcpy(pcpul_ptrs[cpu], __per_cpu_load, static_size); + memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size); } /* allocate address and map */ - vm.flags = VM_ALLOC; - vm.size = num_possible_cpus() * PMD_SIZE; - vm_area_register_early(&vm, PMD_SIZE); + pcpul_vm.flags = VM_ALLOC; + pcpul_vm.size = num_possible_cpus() * PMD_SIZE; + vm_area_register_early(&pcpul_vm, PMD_SIZE); for_each_possible_cpu(cpu) { - pmd_t *pmd; + pmd_t *pmd, pmd_v; - pmd = populate_extra_pmd((unsigned long)vm.addr - + cpu * PMD_SIZE); - set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpul_ptrs[cpu])), - PAGE_KERNEL_LARGE)); + pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr + + cpu * PMD_SIZE); + pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)), + PAGE_KERNEL_LARGE); + set_pmd(pmd, pmd_v); } /* we're ready, commit */ pr_info("PERCPU: Remapped at %p with large pages, static data " - "%zu bytes\n", vm.addr, static_size); + "%zu bytes\n", pcpul_vm.addr, static_size); ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, PERCPU_FIRST_CHUNK_RESERVE, dyn_size, - PMD_SIZE, vm.addr, NULL); - goto out_free_ar; + PMD_SIZE, pcpul_vm.addr, NULL); + goto out_free_map; enomem: for_each_possible_cpu(cpu) - if (pcpul_ptrs[cpu]) - free_bootmem(__pa(pcpul_ptrs[cpu]), pcpul_size); + if (pcpul_map[cpu].ptr) + free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size); ret = -ENOMEM; -out_free_ar: - free_bootmem(__pa(pcpul_ptrs), ptrs_size); +out_free_map: + free_bootmem(__pa(pcpul_map), map_size); return ret; } #else -- cgit v1.2.3 From 992f4c1c2c1583cef3296ec4bf5205843a9a5f3d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Jun 2009 11:56:24 +0900 Subject: x86: reorganize cpa_process_alias() Reorganize cpa_process_alias() so that new alias condition can be added easily. Jan Beulich spotted problem in the original cleanup thread which incorrectly assumed the two existing conditions were mutially exclusive. 
[ Impact: code reorganization ] Signed-off-by: Tejun Heo Cc: Jan Beulich Cc: Andi Kleen Cc: Ingo Molnar --- arch/x86/mm/pageattr.c | 50 +++++++++++++++++++++++--------------------------- 1 file changed, 23 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 3cfe9ced8a4..2ab058b0947 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -681,8 +681,9 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias); static int cpa_process_alias(struct cpa_data *cpa) { struct cpa_data alias_cpa; - int ret = 0; - unsigned long temp_cpa_vaddr, vaddr; + unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT); + unsigned long vaddr; + int ret; if (cpa->pfn >= max_pfn_mapped) return 0; @@ -706,42 +707,37 @@ static int cpa_process_alias(struct cpa_data *cpa) PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) { alias_cpa = *cpa; - temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); - alias_cpa.vaddr = &temp_cpa_vaddr; + alias_cpa.vaddr = &laddr; alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); - ret = __change_page_attr_set_clr(&alias_cpa, 0); + if (ret) + return ret; } #ifdef CONFIG_X86_64 - if (ret) - return ret; /* - * No need to redo, when the primary call touched the high - * mapping already: - */ - if (within(vaddr, (unsigned long) _text, _brk_end)) - return 0; - - /* - * If the physical address is inside the kernel map, we need + * If the primary call didn't touch the high mapping already + * and the physical address is inside the kernel map, we need * to touch the high mapped kernel as well: */ - if (!within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) - return 0; - - alias_cpa = *cpa; - temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; - alias_cpa.vaddr = &temp_cpa_vaddr; - alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); + if (!within(vaddr, (unsigned long)_text, _brk_end) && + within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) { + unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + + __START_KERNEL_map - phys_base; + alias_cpa = *cpa; + alias_cpa.vaddr = &temp_cpa_vaddr; + alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); - /* - * The high mapping range is imprecise, so ignore the return value. - */ - __change_page_attr_set_clr(&alias_cpa, 0); + /* + * The high mapping range is imprecise, so ignore the + * return value. + */ + __change_page_attr_set_clr(&alias_cpa, 0); + } #endif - return ret; + + return 0; } static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) -- cgit v1.2.3 From e59a1bb2fdfb745c685f5b40ffbed126331d3223 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Jun 2009 11:56:24 +0900 Subject: x86: fix pageattr handling for lpage percpu allocator and re-enable it lpage allocator aliases a PMD page for each cpu and returns whatever is unused to the page allocator. When the pageattr of the recycled pages are changed, this makes the two aliases point to the overlapping regions with different attributes which isn't allowed and known to cause subtle data corruption in certain cases. This can be handled in simliar manner to the x86_64 highmap alias. pageattr code should detect if the target pages have PMD alias and split the PMD alias and synchronize the attributes. 
pcpur allocator is updated to keep the allocated PMD pages map sorted in ascending address order and provide pcpu_lpage_remapped() function which binary searches the array to determine whether the given address is aliased and if so to which address. pageattr is updated to use pcpu_lpage_remapped() to detect the PMD alias and split it up as necessary from cpa_process_alias(). Jan Beulich spotted the original problem and incorrect usage of vaddr instead of laddr for lookup. With this, lpage percpu allocator should work correctly. Re-enable it. [ Impact: fix subtle lpage pageattr bug and re-enable lpage ] Signed-off-by: Tejun Heo Reported-by: Jan Beulich Cc: Andi Kleen Cc: Ingo Molnar --- arch/x86/include/asm/percpu.h | 10 ++++++ arch/x86/kernel/setup_percpu.c | 72 ++++++++++++++++++++++++++++++++++++------ arch/x86/mm/pageattr.c | 21 +++++++++++- 3 files changed, 93 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 02ecb30982a..103f1ddb0d8 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -42,6 +42,7 @@ #else /* ...!ASSEMBLY */ +#include #include #ifdef CONFIG_SMP @@ -155,6 +156,15 @@ do { \ /* We can use this directly for local CPU (faster). */ DECLARE_PER_CPU(unsigned long, this_cpu_off); +#ifdef CONFIG_NEED_MULTIPLE_NODES +void *pcpu_lpage_remapped(void *kaddr); +#else +static inline void *pcpu_lpage_remapped(void *kaddr) +{ + return NULL; +} +#endif + #endif /* !__ASSEMBLY__ */ #ifdef CONFIG_SMP diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 7d38941e2b8..bad2fd22311 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -142,8 +142,8 @@ struct pcpul_ent { void *ptr; }; -static size_t pcpul_size __initdata; -static struct pcpul_ent *pcpul_map __initdata; +static size_t pcpul_size; +static struct pcpul_ent *pcpul_map; static struct vm_struct pcpul_vm; static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) @@ -160,15 +160,14 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size) { size_t map_size, dyn_size; unsigned int cpu; + int i, j; ssize_t ret; /* * If large page isn't supported, there's no benefit in doing * this. Also, on non-NUMA, embedding is better. - * - * NOTE: disabled for now. */ - if (true || !cpu_has_pse || !pcpu_need_numa()) + if (!cpu_has_pse || !pcpu_need_numa()) return -EINVAL; /* @@ -231,16 +230,71 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size) ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, PERCPU_FIRST_CHUNK_RESERVE, dyn_size, PMD_SIZE, pcpul_vm.addr, NULL); - goto out_free_map; + + /* sort pcpul_map array for pcpu_lpage_remapped() */ + for (i = 0; i < num_possible_cpus() - 1; i++) + for (j = i + 1; j < num_possible_cpus(); j++) + if (pcpul_map[i].ptr > pcpul_map[j].ptr) { + struct pcpul_ent tmp = pcpul_map[i]; + pcpul_map[i] = pcpul_map[j]; + pcpul_map[j] = tmp; + } + + return ret; enomem: for_each_possible_cpu(cpu) if (pcpul_map[cpu].ptr) free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size); - ret = -ENOMEM; -out_free_map: free_bootmem(__pa(pcpul_map), map_size); - return ret; + return -ENOMEM; +} + +/** + * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area + * @kaddr: the kernel address in question + * + * Determine whether @kaddr falls in the pcpul recycled area. 
This is + * used by pageattr to detect VM aliases and break up the pcpu PMD + * mapping such that the same physical page is not mapped under + * different attributes. + * + * The recycled area is always at the tail of a partially used PMD + * page. + * + * RETURNS: + * Address of corresponding remapped pcpu address if match is found; + * otherwise, NULL. + */ +void *pcpu_lpage_remapped(void *kaddr) +{ + void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK); + unsigned long offset = (unsigned long)kaddr & ~PMD_MASK; + int left = 0, right = num_possible_cpus() - 1; + int pos; + + /* pcpul in use at all? */ + if (!pcpul_map) + return NULL; + + /* okay, perform binary search */ + while (left <= right) { + pos = (left + right) / 2; + + if (pcpul_map[pos].ptr < pmd_addr) + left = pos + 1; + else if (pcpul_map[pos].ptr > pmd_addr) + right = pos - 1; + else { + /* it shouldn't be in the area for the first chunk */ + WARN_ON(offset < pcpul_size); + + return pcpul_vm.addr + + pcpul_map[pos].cpu * PMD_SIZE + offset; + } + } + + return NULL; } #else static ssize_t __init setup_pcpu_lpage(size_t static_size) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 2ab058b0947..1b734d7a896 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -682,7 +683,7 @@ static int cpa_process_alias(struct cpa_data *cpa) { struct cpa_data alias_cpa; unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT); - unsigned long vaddr; + unsigned long vaddr, remapped; int ret; if (cpa->pfn >= max_pfn_mapped) @@ -737,6 +738,24 @@ static int cpa_process_alias(struct cpa_data *cpa) } #endif + /* + * If the PMD page was partially used for per-cpu remapping, + * the recycled area needs to be split and modified. Because + * the area is always proper subset of a PMD page + * cpa->numpages is guaranteed to be 1 for these areas, so + * there's no need to loop over and check for further remaps. + */ + remapped = (unsigned long)pcpu_lpage_remapped((void *)laddr); + if (remapped) { + WARN_ON(cpa->numpages > 1); + alias_cpa = *cpa; + alias_cpa.vaddr = &remapped; + alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); + ret = __change_page_attr_set_clr(&alias_cpa, 0); + if (ret) + return ret; + } + return 0; } -- cgit v1.2.3 From fa8a7094ba1679b4b9b443e0ac9f5e046c79ee8d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Jun 2009 11:56:24 +0900 Subject: x86: implement percpu_alloc kernel parameter According to Andi, it isn't clear whether lpage allocator is worth the trouble as there are many processors where PMD TLB is far scarcer than PTE TLB. The advantage or disadvantage probably depends on the actual size of percpu area and specific processor. As performance degradation due to TLB pressure tends to be highly workload specific and subtle, it is difficult to decide which way to go without more data. This patch implements percpu_alloc kernel parameter to allow selecting which first chunk allocator to use to ease debugging and testing. While at it, make sure all the failure paths report why something failed to help determining why certain allocator isn't working. Also, kill the "Great future plan" comment which had already been realized quite some time ago. 
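For example, booting with:

    percpu_alloc=embed

forces the embedding allocator, and "percpu_alloc=lpage" or "percpu_alloc=4k" select the large page and 4k allocators the same way. If the explicitly chosen allocator fails, setup prints a warning and falls back to the 4k allocator.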
[ Impact: allow explicit percpu first chunk allocator selection ] Signed-off-by: Tejun Heo Reported-by: Jan Beulich Cc: Andi Kleen Cc: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 69 ++++++++++++++++++++++++++++++------------ 1 file changed, 50 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index bad2fd22311..165ebd5ba83 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -156,20 +156,23 @@ static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) return virt_to_page(pcpul_map[cpu].ptr + off); } -static ssize_t __init setup_pcpu_lpage(size_t static_size) +static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) { size_t map_size, dyn_size; unsigned int cpu; int i, j; ssize_t ret; - /* - * If large page isn't supported, there's no benefit in doing - * this. Also, on non-NUMA, embedding is better. - */ - if (!cpu_has_pse || !pcpu_need_numa()) + /* on non-NUMA, embedding is better */ + if (!chosen && !pcpu_need_numa()) return -EINVAL; + /* need PSE */ + if (!cpu_has_pse) { + pr_warning("PERCPU: lpage allocator requires PSE\n"); + return -EINVAL; + } + /* * Currently supports only single page. Supporting multiple * pages won't be too difficult if it ever becomes necessary. @@ -191,8 +194,11 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size) pcpul_map[cpu].cpu = cpu; pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE); - if (!pcpul_map[cpu].ptr) + if (!pcpul_map[cpu].ptr) { + pr_warning("PERCPU: failed to allocate large page " + "for cpu%u\n", cpu); goto enomem; + } /* * Only use pcpul_size bytes and give back the rest. @@ -297,7 +303,7 @@ void *pcpu_lpage_remapped(void *kaddr) return NULL; } #else -static ssize_t __init setup_pcpu_lpage(size_t static_size) +static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) { return -EINVAL; } @@ -311,7 +317,7 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size) * mapping so that it can use PMD mapping without additional TLB * pressure. */ -static ssize_t __init setup_pcpu_embed(size_t static_size) +static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen) { size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; @@ -320,7 +326,7 @@ static ssize_t __init setup_pcpu_embed(size_t static_size) * this. Also, embedding allocation doesn't play well with * NUMA. 
*/ - if (!cpu_has_pse || pcpu_need_numa()) + if (!chosen && (!cpu_has_pse || pcpu_need_numa())) return -EINVAL; return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, @@ -370,8 +376,11 @@ static ssize_t __init setup_pcpu_4k(size_t static_size) void *ptr; ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE); - if (!ptr) + if (!ptr) { + pr_warning("PERCPU: failed to allocate " + "4k page for cpu%u\n", cpu); goto enomem; + } memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); pcpu4k_pages[j++] = virt_to_page(ptr); @@ -395,6 +404,16 @@ out_free_ar: return ret; } +/* for explicit first chunk allocator selection */ +static char pcpu_chosen_alloc[16] __initdata; + +static int __init percpu_alloc_setup(char *str) +{ + strncpy(pcpu_chosen_alloc, str, sizeof(pcpu_chosen_alloc) - 1); + return 0; +} +early_param("percpu_alloc", percpu_alloc_setup); + static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 @@ -408,11 +427,6 @@ static inline void setup_percpu_segment(int cpu) #endif } -/* - * Great future plan: - * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. - * Always point %gs to its beginning - */ void __init setup_per_cpu_areas(void) { size_t static_size = __per_cpu_end - __per_cpu_start; @@ -429,9 +443,26 @@ void __init setup_per_cpu_areas(void) * of large page mappings. Please read comments on top of * each allocator for details. */ - ret = setup_pcpu_lpage(static_size); - if (ret < 0) - ret = setup_pcpu_embed(static_size); + ret = -EINVAL; + if (strlen(pcpu_chosen_alloc)) { + if (strcmp(pcpu_chosen_alloc, "4k")) { + if (!strcmp(pcpu_chosen_alloc, "lpage")) + ret = setup_pcpu_lpage(static_size, true); + else if (!strcmp(pcpu_chosen_alloc, "embed")) + ret = setup_pcpu_embed(static_size, true); + else + pr_warning("PERCPU: unknown allocator %s " + "specified\n", pcpu_chosen_alloc); + if (ret < 0) + pr_warning("PERCPU: %s allocator failed (%zd), " + "falling back to 4k\n", + pcpu_chosen_alloc, ret); + } + } else { + ret = setup_pcpu_lpage(static_size, false); + if (ret < 0) + ret = setup_pcpu_embed(static_size, false); + } if (ret < 0) ret = setup_pcpu_4k(static_size); if (ret < 0) -- cgit v1.2.3 From 0017c869ddcb73069905d09f9e98e68627466237 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Jun 2009 11:56:24 +0900 Subject: x86: ensure percpu lpage doesn't consume too much vmalloc space On extreme configuration (e.g. 32bit 32-way NUMA machine), lpage percpu first chunk allocator can consume too much of vmalloc space. Make it fall back to 4k allocator if the consumption goes over 20%. 
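As a rough worked example (assuming 2 MB large pages and the typical ~128 MB vmalloc area of a 32-bit kernel): 32 possible CPUs * 2 MB per CPU = 64 MB of remapped PMD pages, which is well above 20% of 128 MB (about 25 MB), so such a machine now falls back to the 4k allocator instead of eating half of its vmalloc space.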
[ Impact: add sanity check for lpage percpu first chunk allocator ] Signed-off-by: Tejun Heo Reported-by: Jan Beulich Cc: Andi Kleen Cc: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 165ebd5ba83..29a3eef7cf4 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -163,9 +163,21 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) int i, j; ssize_t ret; - /* on non-NUMA, embedding is better */ - if (!chosen && !pcpu_need_numa()) - return -EINVAL; + if (!chosen) { + size_t vm_size = VMALLOC_END - VMALLOC_START; + size_t tot_size = num_possible_cpus() * PMD_SIZE; + + /* on non-NUMA, embedding is better */ + if (!pcpu_need_numa()) + return -EINVAL; + + /* don't consume more than 20% of vmalloc area */ + if (tot_size > vm_size / 5) { + pr_info("PERCPU: too large chunk size %zuMB for " + "large page remap\n", tot_size >> 20); + return -EINVAL; + } + } /* need PSE */ if (!cpu_has_pse) { -- cgit v1.2.3 From 854c879f5abf309ebd378bea1ee41acf4ddf7194 Mon Sep 17 00:00:00 2001 From: Pekka J Enberg Date: Mon, 22 Jun 2009 17:39:41 +0300 Subject: x86: Move init_gbpages() to setup_arch() The init_gbpages() function is conditionally called from init_memory_mapping() function. There are two call-sites where this 'after_bootmem' condition can be true: setup_arch() and mem_init() via pci_iommu_alloc(). Therefore, it's safe to move the call to init_gbpages() to setup_arch() as it's always called before mem_init(). This removes an after_bootmem use - paving the way to remove all uses of that state variable. Signed-off-by: Pekka Enberg Acked-by: Yinghai Lu LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 16 ++++++++++++++++ arch/x86/mm/init.c | 17 ----------------- 2 files changed, 16 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index be5ae80f897..de2cab13284 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -289,6 +289,20 @@ void * __init extend_brk(size_t size, size_t align) return ret; } +#ifdef CONFIG_X86_64 +static void __init init_gbpages(void) +{ + if (direct_gbpages && cpu_has_gbpages) + printk(KERN_INFO "Using GB pages for direct mapping\n"); + else + direct_gbpages = 0; +} +#else +static inline void init_gbpages(void) +{ +} +#endif + static void __init reserve_brk(void) { if (_brk_end > _brk_start) @@ -871,6 +885,8 @@ void __init setup_arch(char **cmdline_p) reserve_brk(); + init_gbpages(); + /* max_pfn_mapped is updated here */ max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn< Date: Tue, 23 Jun 2009 12:40:54 +0900 Subject: x86, mce: Fix mce resume on 32bit Calling mcheck_init() on resume is required only with CONFIG_X86_OLD_MCE=y. Signed-off-by: Hidetoshi Seto Acked-by: Andi Kleen Signed-off-by: H. 
Peter Anvin --- arch/x86/power/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index d277ef1eea5..b3d20b9cac6 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -244,7 +244,7 @@ static void __restore_processor_state(struct saved_context *ctxt) do_fpu_end(); mtrr_ap_init(); -#ifdef CONFIG_X86_32 +#ifdef CONFIG_X86_OLD_MCE mcheck_init(&boot_cpu_data); #endif } -- cgit v1.2.3 From c14dab5c0782ef632742963a66276a195418a63c Mon Sep 17 00:00:00 2001 From: Yong Wang Date: Wed, 24 Jun 2009 10:13:24 +0800 Subject: perf_counter, x86: Set global control MSR correctly Previous code made an assumption that the power on value of global control MSR has enabled all fixed and general purpose counters properly. However, this is not the case for certain Intel processors, such as Atom - and it might also be firmware dependent. Each enable bit in IA32_PERF_GLOBAL_CTRL is AND'ed with the enable bits for all privilege levels in the respective IA32_PERFEVTSELx or IA32_PERF_FIXED_CTR_CTRL MSRs to start/stop the counting of respective counters. Counting is enabled if the AND'ed results is true; counting is disabled when the result is false. The end result is that all fixed counters are always disabled on Atom processors because the assumption is just invalid. Fix this by not initializing the ctrl-mask out of the global MSR, but setting it to perf_counter_mask. Reported-by: Stephane Eranian Signed-off-by: Yong Wang Cc: Arjan van de Ven Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras LKML-Reference: <20090624021324.GA2788@ywang-moblin2.bj.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 22eb3a1d4f9..a310d19faca 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -969,13 +969,6 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) if (!x86_pmu.num_counters_fixed) return -1; - /* - * Quirk, IA32_FIXED_CTRs do not work on current Atom processors: - */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && - boot_cpu_data.x86_model == 28) - return -1; - event = hwc->config & ARCH_PERFMON_EVENT_MASK; if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) @@ -1428,8 +1421,6 @@ static int intel_pmu_init(void) */ x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); - /* * Install the hw-cache-events table: */ @@ -1514,6 +1505,7 @@ void __init init_hw_perf_counters(void) perf_counter_mask |= ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; + x86_pmu.intel_ctrl = perf_counter_mask; perf_counters_lapic_init(); register_die_notifier(&perf_counter_nmi_notifier); -- cgit v1.2.3 From 9c26f52b900f7207135bafc8789e1a4f5d43e096 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 24 Jun 2009 09:41:59 -0500 Subject: x86: Fix uv bau sending buffer initialization The initialization of the UV Broadcast Assist Unit's sending buffers was making an invalid assumption about the initialization of an MMR that defines its address. The BIOS will not be providing that MMR. So uv_activation_descriptor_init() should unconditionally set it. Tested on UV simulator. 
Signed-off-by: Cliff Wickman Cc: # for v2.6.30.x LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 124d40c575d..8ccabb8a2f6 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -711,7 +711,6 @@ uv_activation_descriptor_init(int node, int pnode) unsigned long pa; unsigned long m; unsigned long n; - unsigned long mmr_image; struct bau_desc *adp; struct bau_desc *ad2; @@ -727,12 +726,8 @@ uv_activation_descriptor_init(int node, int pnode) n = pa >> uv_nshift; m = pa & uv_mmask; - mmr_image = uv_read_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE); - if (mmr_image) { - uv_write_global_mmr64(pnode, (unsigned long) - UVH_LB_BAU_SB_DESCRIPTOR_BASE, - (n << UV_DESC_BASE_PNODE_SHIFT | m)); - } + uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, + (n << UV_DESC_BASE_PNODE_SHIFT | m)); /* * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each -- cgit v1.2.3 From 194002b274e9169a04beb1b23dcc132159bb566c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Jun 2009 16:35:24 +0200 Subject: perf_counter, x86: Add mmap counter read support Update the mmap control page with the needed information to use the userspace RDPMC instruction for self monitoring. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_counter.h | 3 +++ arch/x86/kernel/cpu/perf_counter.c | 6 ++++++ 2 files changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h index 5fb33e160ea..fa64e401589 100644 --- a/arch/x86/include/asm/perf_counter.h +++ b/arch/x86/include/asm/perf_counter.h @@ -87,6 +87,9 @@ union cpuid10_edx { #ifdef CONFIG_PERF_COUNTERS extern void init_hw_perf_counters(void); extern void perf_counters_lapic_init(void); + +#define PERF_COUNTER_INDEX_OFFSET 0 + #else static inline void init_hw_perf_counters(void) { } static inline void perf_counters_lapic_init(void) { } diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a310d19faca..b83474b6021 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -912,6 +912,8 @@ x86_perf_counter_set_period(struct perf_counter *counter, err = checking_wrmsrl(hwc->counter_base + idx, (u64)(-left) & x86_pmu.counter_mask); + perf_counter_update_userpage(counter); + return ret; } @@ -1034,6 +1036,8 @@ try_generic: x86_perf_counter_set_period(counter, hwc, idx); x86_pmu.enable(hwc, idx); + perf_counter_update_userpage(counter); + return 0; } @@ -1126,6 +1130,8 @@ static void x86_pmu_disable(struct perf_counter *counter) x86_perf_counter_update(counter, hwc, idx); cpuc->counters[idx] = NULL; clear_bit(idx, cpuc->used_mask); + + perf_counter_update_userpage(counter); } /* -- cgit v1.2.3 From 5211a242d0cbdded372aee59da18f80552b0a80a Mon Sep 17 00:00:00 2001 From: Kurt Garloff Date: Wed, 24 Jun 2009 14:32:11 -0700 Subject: x86: Add sysctl to allow panic on IOCK NMI error This patch introduces a new sysctl: /proc/sys/kernel/panic_on_io_nmi which defaults to 0 (off). When enabled, the kernel panics when the kernel receives an NMI caused by an IO error. The IO error triggered NMI indicates a serious system condition, which could result in IO data corruption. 
Rather than continuing, panicking and dumping might be a better choice, so one can figure out what's causing the IO error. This could be especially important to companies running IO intensive applications where corruption must be avoided, e.g. a bank's databases. [ SuSE has been shipping it for a while, it was done at the request of a large database vendor, for their users. ] Signed-off-by: Kurt Garloff Signed-off-by: Roberto Angelino Signed-off-by: Greg Kroah-Hartman Cc: "Eric W. Biederman" LKML-Reference: <20090624213211.GA11291@kroah.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.c | 1 + arch/x86/kernel/traps.c | 3 +++ 2 files changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 95ea5fa7d44..c8405718a4c 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -22,6 +22,7 @@ #include "dumpstack.h" int panic_on_unrecovered_nmi; +int panic_on_io_nmi; unsigned int code_bytes = 64; int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; static int die_counter; diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a0f48f5671c..5204332f475 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -346,6 +346,9 @@ io_check_error(unsigned char reason, struct pt_regs *regs) printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n"); show_registers(regs); + if (panic_on_io_nmi) + panic("NMI IOCK error: Not continuing"); + /* Re-enable the IOCK line, wait for a few seconds */ reason = (reason & 0xf) | 8; outb(reason, 0x61); -- cgit v1.2.3 From 5be6066a7f8d917db347d94f1b359b9b70dcb572 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Wed, 24 Jun 2009 09:21:10 +0900 Subject: x86, mce: percpu mcheck_timer should be pinned If CONFIG_NO_HZ + CONFIG_SMP, a timer added via add_timer() might be migrated to another cpu. Use add_timer_on() instead. Avoids the following failure: Maciej Rutecki wrote: > > After normal boot I try: > > > > echo 1 > /sys/devices/system/machinecheck/machinecheck0/check_interval > > > > I found this in dmesg: > > > > [ 141.704025] ------------[ cut here ]------------ > > [ 141.704039] WARNING: at arch/x86/kernel/cpu/mcheck/mce.c:1102 > > mcheck_timer+0xf5/0x100() Reported-by: Maciej Rutecki Signed-off-by: Hidetoshi Seto Tested-by: Maciej Rutecki Acked-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 284d1de968b..af425b83202 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1117,7 +1117,7 @@ static void mcheck_timer(unsigned long data) *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); t->expires = jiffies + *n; - add_timer(t); + add_timer_on(t, smp_processor_id()); } static void mce_do_trigger(struct work_struct *work) @@ -1321,7 +1321,7 @@ static void mce_init_timer(void) return; setup_timer(t, mcheck_timer, smp_processor_id()); t->expires = round_jiffies(jiffies + *n); - add_timer(t); + add_timer_on(t, smp_processor_id()); } /* -- cgit v1.2.3 From 22f4319d6bc0155e6c0ae560729baa6c09dc09e7 Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Thu, 25 Jun 2009 16:20:48 -0400 Subject: x86, setup: Fix typo "CONFIG_x86_64" in <asm/boot.h> CONFIG_X86_64 was misspelled (wrong case), which caused the x86-64 kernel to advertise itself as more relocatable than it really is.
This could in theory cause boot failures once bootloaders start support the new relocation fields. Signed-off-by: Robert P. J. Day Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/boot.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index 418e632d4a8..6f02b9a5384 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -16,7 +16,7 @@ & ~(CONFIG_PHYSICAL_ALIGN - 1)) /* Minimum kernel alignment, as a power of two */ -#ifdef CONFIG_x86_64 +#ifdef CONFIG_X86_64 #define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT #else #define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT+1) -- cgit v1.2.3 From 658dbfeb5e7ab35d440c665d643a6285e43fddcd Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 25 Jun 2009 15:16:06 -0700 Subject: x86, setup: correct include file in needs , not in order to resolve PMD_SHIFT. Also, correct a +1 which really should be + THREAD_ORDER. This is a build error which was masked by a typoed #ifdef. Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/boot.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index 6f02b9a5384..7a1065958ba 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -8,7 +8,7 @@ #ifdef __KERNEL__ -#include +#include /* Physical address where kernel should be loaded. */ #define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \ @@ -19,7 +19,7 @@ #ifdef CONFIG_X86_64 #define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT #else -#define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT+1) +#define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT + THREAD_ORDER) #endif #define MIN_KERNEL_ALIGN (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2) -- cgit v1.2.3 From e888d7facd1f1460a638151036d15b6cfb3ccc74 Mon Sep 17 00:00:00 2001 From: "Pallipadi, Venkatesh" Date: Thu, 25 Jun 2009 16:44:31 -0700 Subject: x86, delay: tsc based udelay should have rdtsc_barrier delay_tsc needs rdtsc_barrier to provide proper delay. Output from a test driver using hpet to cross check delay provided by udelay(). Before: [ 86.794363] Expected delay 5us actual 4679ns [ 87.154362] Expected delay 5us actual 698ns [ 87.514162] Expected delay 5us actual 4539ns [ 88.653716] Expected delay 5us actual 4539ns [ 94.664106] Expected delay 10us actual 9638ns [ 95.049351] Expected delay 10us actual 10126ns [ 95.416110] Expected delay 10us actual 9568ns [ 95.799216] Expected delay 10us actual 9638ns [ 103.624104] Expected delay 10us actual 9707ns [ 104.020619] Expected delay 10us actual 768ns [ 104.419951] Expected delay 10us actual 9707ns After: [ 50.983320] Expected delay 5us actual 5587ns [ 51.261807] Expected delay 5us actual 5587ns [ 51.565715] Expected delay 5us actual 5657ns [ 51.861171] Expected delay 5us actual 5587ns [ 52.164704] Expected delay 5us actual 5726ns [ 52.487457] Expected delay 5us actual 5657ns [ 52.789338] Expected delay 5us actual 5726ns [ 57.119680] Expected delay 10us actual 10755ns [ 57.893997] Expected delay 10us actual 10615ns [ 58.261287] Expected delay 10us actual 10755ns [ 58.620505] Expected delay 10us actual 10825ns [ 58.941035] Expected delay 10us actual 10755ns [ 59.320903] Expected delay 10us actual 10615ns [ 61.306311] Expected delay 10us actual 10755ns [ 61.520542] Expected delay 10us actual 10615ns Signed-off-by: Venkatesh Pallipadi Signed-off-by: H. 
Peter Anvin --- arch/x86/lib/delay.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index f4568605d7d..ff485d36118 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -55,8 +55,10 @@ static void delay_tsc(unsigned long loops) preempt_disable(); cpu = smp_processor_id(); + rdtsc_barrier(); rdtscl(bclock); for (;;) { + rdtsc_barrier(); rdtscl(now); if ((now - bclock) >= loops) break; @@ -78,6 +80,7 @@ static void delay_tsc(unsigned long loops) if (unlikely(cpu != smp_processor_id())) { loops -= (now - bclock); cpu = smp_processor_id(); + rdtsc_barrier(); rdtscl(bclock); } } -- cgit v1.2.3 From ff8a4bae459a9b6455504127fcb78fdbc8e50e4c Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sat, 27 Jun 2009 12:22:27 -0700 Subject: Revert "x86: cap iomem_resource to addressable physical memory" This reverts commit 95ee14e4379c5e19c0897c872350570402014742. Mikael Petterson reported that at least one of his systems will not boot as a result. We have ruled out the detection algorithm malfunctioning, so it is not a matter of producing the incorrect bitmasks; rather, something in the application of them fails. Revert the commit until we can root cause and correct this problem. -stable team: this means the underlying commit should be rejected. Reported-and-isolated-by: Mikael Petterson Signed-off-by: H. Peter Anvin LKML-Reference: <200906261559.n5QFxJH8027336@pilspetsen.it.uu.se> Cc: stable@kernel.org Cc: Grant Grundler --- arch/x86/kernel/cpu/common.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 6b26d4deada..f1961c07af9 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -848,9 +848,6 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) numa_add_cpu(smp_processor_id()); #endif - - /* Cap the iomem address space to what is addressable on all CPUs */ - iomem_resource.end &= (1ULL << c->x86_phys_bits) - 1; } #ifdef CONFIG_X86_64 -- cgit v1.2.3 From 29a4b9333bf9ffef12b3dd7cbf2e3dbe01152968 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 19 May 2009 13:29:27 +0300 Subject: KVM: MMU: Allow 4K ptes with bit 7 (PAT) set Bit 7 is perfectly legal in the 4K page level; it is used for the PAT.
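( The reserved-bit masks are indexed by bit 7 of the pte being checked and by the page table level; at the last level bit 7 is the PAT bit, not a page size bit, so the "bit 7 set" mask for level 1 has to equal the ordinary 4K pte mask instead of marking every bit reserved. )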
Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5c3d6e81a7d..7030b5f911b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2157,7 +2157,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) else /* 32 bits PSE 4MB page */ context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); - context->rsvd_bits_mask[1][0] = ~0ull; + context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; break; case PT32E_ROOT_LEVEL: context->rsvd_bits_mask[0][2] = @@ -2170,7 +2170,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) context->rsvd_bits_mask[1][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 62) | rsvd_bits(13, 20); /* large page */ - context->rsvd_bits_mask[1][0] = ~0ull; + context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; break; case PT64_ROOT_LEVEL: context->rsvd_bits_mask[0][3] = exb_bit_rsvd | @@ -2186,7 +2186,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) context->rsvd_bits_mask[1][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51) | rsvd_bits(13, 20); /* large page */ - context->rsvd_bits_mask[1][0] = ~0ull; + context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; break; } } -- cgit v1.2.3 From a3f9d3981cd82d65232b733eb792382237d686bd Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Thu, 18 Jun 2009 16:53:25 +0530 Subject: KVM: kvm/x86_emulate.c toggle_interruptibility() should be static toggle_interruptibility() is used only by the same file, it should be static. Fixed the following sparse warning: arch/x86/kvm/x86_emulate.c:1364:6: warning: symbol 'toggle_interruptibility' was not declared. Should it be static? Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index c1b6c232e02..616de4628d6 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -1361,7 +1361,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, return 0; } -void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask) +static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask) { u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask); /* -- cgit v1.2.3 From e3c7cb6ad7191e92ba89d00a7ae5f5dd1ca0c214 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 16 Jun 2009 14:19:52 +0300 Subject: KVM: VMX: Handle vmx instruction vmexits If a guest tries to use vmx instructions, inject a #UD to let it know the instruction is not implemented, rather than crashing. This prevents guest userspace from crashing the guest kernel.
Cc: stable@kernel.org Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e770bf349ec..356a0ce85c6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3012,6 +3012,12 @@ static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } +static int handle_vmx_insn(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; +} + static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); @@ -3198,6 +3204,15 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, [EXIT_REASON_HLT] = handle_halt, [EXIT_REASON_INVLPG] = handle_invlpg, [EXIT_REASON_VMCALL] = handle_vmcall, + [EXIT_REASON_VMCLEAR] = handle_vmx_insn, + [EXIT_REASON_VMLAUNCH] = handle_vmx_insn, + [EXIT_REASON_VMPTRLD] = handle_vmx_insn, + [EXIT_REASON_VMPTRST] = handle_vmx_insn, + [EXIT_REASON_VMREAD] = handle_vmx_insn, + [EXIT_REASON_VMRESUME] = handle_vmx_insn, + [EXIT_REASON_VMWRITE] = handle_vmx_insn, + [EXIT_REASON_VMOFF] = handle_vmx_insn, + [EXIT_REASON_VMON] = handle_vmx_insn, [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, [EXIT_REASON_APIC_ACCESS] = handle_apic_access, [EXIT_REASON_WBINVD] = handle_wbinvd, -- cgit v1.2.3 From 9e6996240afcbe61682eab8eeaeb65c34333164d Mon Sep 17 00:00:00 2001 From: Amit Shah Date: Mon, 15 Jun 2009 13:25:34 +0530 Subject: KVM: Ignore reads to K7 EVNTSEL MSRs In commit 7fe29e0faacb650d31b9e9f538203a157bec821d we ignored the reads to the P6 EVNTSEL MSRs. That fixed crashes on Intel machines. Ignore the reads to K7 EVNTSEL MSRs as well to fix this on AMD hosts. This fixes Kaspersky antivirus crashing Windows guests on AMD hosts. Signed-off-by: Amit Shah Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 249540f9851..fe5474aec41 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -898,6 +898,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_VM_HSAVE_PA: case MSR_P6_EVNTSEL0: case MSR_P6_EVNTSEL1: + case MSR_K7_EVNTSEL0: data = 0; break; case MSR_MTRRcap: -- cgit v1.2.3 From bde892232532ed522bb56b04576d07f91e59b3c7 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Wed, 20 May 2009 09:59:35 +0530 Subject: KVM: shut up uninit compiler warning in paging_tmpl.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes compilation warning: CC arch/x86/kernel/io_delay.o arch/x86/kvm/paging_tmpl.h: In function ‘paging64_fetch’: arch/x86/kvm/paging_tmpl.h:279: warning: ‘sptep’ may be used uninitialized in this function arch/x86/kvm/paging_tmpl.h: In function ‘paging32_fetch’: arch/x86/kvm/paging_tmpl.h:279: warning: ‘sptep’ may be used uninitialized in this function warning is bogus (we always have at least one level), but need to shut the compiler up.
Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 258e4591e1c..67785f63539 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -281,7 +281,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, { unsigned access = gw->pt_access; struct kvm_mmu_page *shadow_page; - u64 spte, *sptep; + u64 spte, *sptep = NULL; int direct; gfn_t table_gfn; int r; -- cgit v1.2.3 From 4078c444cf667f018c3fc7ebf141131a2b7c9480 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 29 Jun 2009 00:41:11 -0700 Subject: perf_counter, x86: Update x86_pmu after WARN() The print out should read the value before changing the value. Signed-off-by: Yinghai Lu Cc: Peter Zijlstra LKML-Reference: <4A487017.4090007@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index b83474b6021..d4cf4ce19aa 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1496,17 +1496,17 @@ void __init init_hw_perf_counters(void) pr_cont("%s PMU driver.\n", x86_pmu.name); if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { - x86_pmu.num_counters = X86_PMC_MAX_GENERIC; WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", x86_pmu.num_counters, X86_PMC_MAX_GENERIC); + x86_pmu.num_counters = X86_PMC_MAX_GENERIC; } perf_counter_mask = (1 << x86_pmu.num_counters) - 1; perf_max_counters = x86_pmu.num_counters; if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { - x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); + x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; } perf_counter_mask |= -- cgit v1.2.3 From c7ab48d2acaf959e4d59c3f55d12fdb7ca9afd7c Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 26 Jun 2009 19:10:36 +0100 Subject: intel-iommu: Clean up identity mapping code, remove CONFIG_DMAR_GFX_WA There's no need for the GFX workaround now we have 'iommu=pt' for the cases where people really care about performance. There's no need to have a special case for just one type of device. This also speeds up the iommu=pt path and reduces memory usage by setting up the si_domain _once_ and then using it for all devices, rather than giving each device its own private page tables. Signed-off-by: David Woodhouse --- arch/x86/Kconfig | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d1430ef6b4f..c07f7220590 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1913,25 +1913,14 @@ config DMAR_DEFAULT_ON recommended you say N here while the DMAR code remains experimental. -config DMAR_GFX_WA - def_bool y - prompt "Support for Graphics workaround" - depends on DMAR - ---help--- - Current Graphics drivers tend to use physical address - for DMA and avoid using DMA APIs. Setting this config - option permits the IOMMU driver to set a unity map for - all the OS-visible memory. Hence the driver can continue - to use physical addresses for DMA. 
- config DMAR_FLOPPY_WA def_bool y depends on DMAR ---help--- - Floppy disk drivers are know to bypass DMA API calls + Floppy disk drivers are known to bypass DMA API calls thereby failing to work when IOMMU is enabled. This workaround will setup a 1:1 mapping for the first - 16M to make floppy (an ISA device) work. + 16MiB to make floppy (an ISA device) work. config INTR_REMAP bool "Support for Interrupt Remapping (EXPERIMENTAL)" -- cgit v1.2.3 From 2cdb3f1d834aab27a927be7555fbf4f9e43e9261 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 24 Jun 2009 19:01:19 -0700 Subject: x86/PCI: fix boundary checking when using root CRS Don't touch info->res_num if we are out of space. Acked-by: Gary Hade Tested-by: Gary Hade Signed-off-by: Yinghai Lu Signed-off-by: Jesse Barnes --- arch/x86/pci/acpi.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index b26626dc517..8bf152910eb 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -68,6 +68,10 @@ setup_resource(struct acpi_resource *acpi_res, void *data) unsigned long flags; struct resource *root; int max_root_bus_resources = PCI_BUS_NUM_RESOURCES; + u64 start, end; + + if (bus_has_transparent_bridge(info->bus)) + max_root_bus_resources -= 3; status = resource_to_addr(acpi_res, &addr); if (!ACPI_SUCCESS(status)) @@ -84,25 +88,24 @@ setup_resource(struct acpi_resource *acpi_res, void *data) } else return AE_OK; - res = &info->res[info->res_num]; - res->name = info->name; - res->flags = flags; - res->start = addr.minimum + addr.translation_offset; - res->end = res->start + addr.address_length - 1; - res->child = NULL; - - if (bus_has_transparent_bridge(info->bus)) - max_root_bus_resources -= 3; + start = addr.minimum + addr.translation_offset; + end = start + addr.address_length - 1; if (info->res_num >= max_root_bus_resources) { printk(KERN_WARNING "PCI: Failed to allocate 0x%lx-0x%lx " "from %s for %s due to _CRS returning more than " - "%d resource descriptors\n", (unsigned long) res->start, - (unsigned long) res->end, root->name, info->name, + "%d resource descriptors\n", (unsigned long) start, + (unsigned long) end, root->name, info->name, max_root_bus_resources); - info->res_num++; return AE_OK; } + res = &info->res[info->res_num]; + res->name = info->name; + res->flags = flags; + res->start = start; + res->end = end; + res->child = NULL; + if (insert_resource(root, res)) { printk(KERN_ERR "PCI: Failed to allocate 0x%lx-0x%lx " "from %s for %s\n", (unsigned long) res->start, -- cgit v1.2.3 From 626fdfec1588ac1341a37805809d03a719d977e0 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 24 Jun 2009 20:00:12 -0700 Subject: x86/PCI: get root CRS before scanning children This allows us to remove adjust_transparent_bridge_resources and give x86_pci_root_bus_res_quirks a chance when _CRS is not used or not there. 
Acked-by: Gary Hade Tested-by: Gary Hade Signed-off-by: Yinghai Lu Signed-off-by: Jesse Barnes --- arch/x86/pci/acpi.c | 32 +++++++++----------------------- arch/x86/pci/amd_bus.c | 8 ++++++-- 2 files changed, 15 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 8bf152910eb..1014eb4bfc3 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -117,23 +117,6 @@ setup_resource(struct acpi_resource *acpi_res, void *data) return AE_OK; } -static void -adjust_transparent_bridge_resources(struct pci_bus *bus) -{ - struct pci_dev *dev; - - list_for_each_entry(dev, &bus->devices, bus_list) { - int i; - u16 class = dev->class >> 8; - - if (class == PCI_CLASS_BRIDGE_PCI && dev->transparent) { - for(i = 3; i < PCI_BUS_NUM_RESOURCES; i++) - dev->subordinate->resource[i] = - dev->bus->resource[i - 3]; - } - } -} - static void get_current_resources(struct acpi_device *device, int busnum, int domain, struct pci_bus *bus) @@ -161,8 +144,6 @@ get_current_resources(struct acpi_device *device, int busnum, info.res_num = 0; acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource, &info); - if (info.res_num) - adjust_transparent_bridge_resources(bus); return; @@ -225,8 +206,15 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do */ memcpy(bus->sysdata, sd, sizeof(*sd)); kfree(sd); - } else - bus = pci_scan_bus_parented(NULL, busnum, &pci_root_ops, sd); + } else { + bus = pci_create_bus(NULL, busnum, &pci_root_ops, sd); + if (bus) { + if (pci_probe & PCI_USE__CRS) + get_current_resources(device, busnum, domain, + bus); + bus->subordinate = pci_scan_child_bus(bus); + } + } if (!bus) kfree(sd); @@ -241,8 +229,6 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do #endif } - if (bus && (pci_probe & PCI_USE__CRS)) - get_current_resources(device, busnum, domain, bus); return bus; } diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index f893d6a6e80..3ffa10df20b 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -100,8 +100,9 @@ void x86_pci_root_bus_res_quirks(struct pci_bus *b) int j; struct pci_root_info *info; - /* don't go for it if _CRS is used */ - if (pci_probe & PCI_USE__CRS) + /* don't go for it if _CRS is used already */ + if (b->resource[0] != &ioport_resource || + b->resource[1] != &iomem_resource) return; /* if only one root bus, don't need to anything */ @@ -116,6 +117,9 @@ void x86_pci_root_bus_res_quirks(struct pci_bus *b) if (i == pci_root_num) return; + printk(KERN_DEBUG "PCI: peer root bus %02x res updated from pci conf\n", + b->number); + info = &pci_root_info[i]; for (j = 0; j < info->res_num; j++) { struct resource *res; -- cgit v1.2.3 From 9e314996e3dc5189b9b36dce67088e882e989897 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 30 Jun 2009 15:00:20 +0200 Subject: x86: Fix symbol annotation for arch/x86/lib/clear_page_64.S::clear_page_c Noticed the zero-sized function symbol while looking at 'perf' profiles, it causes the profiler to display those addresses in hexa. Turns out that this was wrong/bogus for an eternity. 
Signed-off-by: Mike Galbraith Acked-by: Alexander van Heukelum Acked-by: Cyrill Gorcunov LKML-Reference: <1246366820.6538.1.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- arch/x86/lib/clear_page_64.S | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index 9a10a78bb4a..ebeafcce04a 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -5,15 +5,14 @@ * Zero a page. * rdi page */ - ALIGN -clear_page_c: +ENTRY(clear_page_c) CFI_STARTPROC movl $4096/8,%ecx xorl %eax,%eax rep stosq ret CFI_ENDPROC -ENDPROC(clear_page) +ENDPROC(clear_page_c) ENTRY(clear_page) CFI_STARTPROC -- cgit v1.2.3 From 789d03f584484af85dbdc64935270c8e45f36ef7 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 30 Jun 2009 11:52:23 +0100 Subject: x86: Fix fixmap ordering The merge of the 32- and 64-bit fixmap headers made a latent bug on x86-64 a real one: with the right config settings it is possible for FIX_OHCI1394_BASE to overlap the FIX_BTMAP_* range. Signed-off-by: Jan Beulich Cc: # for 2.6.30.x LKML-Reference: <4A4A0A8702000078000082E8@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fixmap.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 2d81af3974a..3eb0f79a532 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -114,9 +114,6 @@ enum fixed_addresses { FIX_TEXT_POKE0, /* reserve 2 pages for text_poke() */ FIX_TEXT_POKE1, __end_of_permanent_fixed_addresses, -#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT - FIX_OHCI1394_BASE, -#endif /* * 256 temporary boot-time mappings, used by early_ioremap(), * before ioremap() is functional. @@ -129,6 +126,9 @@ enum fixed_addresses { FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 - (__end_of_permanent_fixed_addresses & 255), FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1, +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT + FIX_OHCI1394_BASE, +#endif #ifdef CONFIG_X86_32 FIX_WP_TEST, #endif -- cgit v1.2.3 From 66918dcdf91ad101194c749c18099e836ba3de2b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 30 Jun 2009 11:41:37 -0700 Subject: x86: only clear node_states for 64bit Nathan reported that | commit 73d60b7f747176dbdff826c4127d22e1fd3f9f74 | Author: Yinghai Lu | Date: Tue Jun 16 15:33:00 2009 -0700 | | page-allocator: clear N_HIGH_MEMORY map before we set it again | | SRAT tables may contains nodes of very small size. The arch code may | decide to not activate such a node. However, currently the early boot | code sets N_HIGH_MEMORY for such nodes. These nodes therefore seem to be | active although these nodes have no present pages. | | For 64bit N_HIGH_MEMORY == N_NORMAL_MEMORY, so that works for 64 bit too unintentionally and incorrectly clears the cpuset.mems cgroup attribute on an i386 kvm guest, meaning that cpuset.mems can not be used. Fix this by only clearing node_states[N_NORMAL_MEMORY] for 64bit only. 
and need to do save/restore for that in find_zone_movable_pfn Reported-by: Nathan Lynch Tested-by: Nathan Lynch Signed-off-by: Yinghai Lu Cc: Christoph Lameter Cc: Ingo Molnar , Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/init_64.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index c4378f4fd4a..b177652251a 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -598,6 +598,8 @@ void __init paging_init(void) sparse_memory_present_with_active_regions(MAX_NUMNODES); sparse_init(); + /* clear the default setting with node 0 */ + nodes_clear(node_states[N_NORMAL_MEMORY]); free_area_init_nodes(max_zone_pfns); } -- cgit v1.2.3 From 44973998a111dfda09b952aa0f27cad326a97793 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Wed, 1 Jul 2009 17:49:38 +0530 Subject: x86: Remove double declaration of MSR_P6_EVNTSEL0 and MSR_P6_EVNTSEL1 MSR_P6_EVNTSEL0 and MSR_P6_EVNTSEL1 is already declared in msr-index.h. Signed-off-by: Jaswinder Singh Rajput LKML-Reference: <1246450778.6940.8.camel@hpdv5.satnam> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/msr-index.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 1692fb5050e..6be7fc254b5 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -246,10 +246,6 @@ #define MSR_IA32_MISC_ENABLE_TURBO_DISABLE (1ULL << 38) #define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE (1ULL << 39) -/* Intel Model 6 */ -#define MSR_P6_EVNTSEL0 0x00000186 -#define MSR_P6_EVNTSEL1 0x00000187 - /* P4/Xeon+ specific */ #define MSR_IA32_MCG_EAX 0x00000180 #define MSR_IA32_MCG_EBX 0x00000181 -- cgit v1.2.3 From b25ae679f613ed04aaf6ccbfdb9122fce668e4bb Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Wed, 1 Jul 2009 19:53:14 +0530 Subject: x86: Mark device_nb as static and fix NULL noise This sparse warning: arch/x86/kernel/amd_iommu.c:1195:23: warning: symbol 'device_nb' was not declared. Should it be static? triggers because device_nb is global but is only used in a single .c file. change device_nb to static to fix that - this also addresses the sparse warning. This sparse warning: arch/x86/kernel/amd_iommu.c:1766:10: warning: Using plain integer as NULL pointer triggers because plain integer 0 is used in place of a NULL pointer. change 0 to NULL to fix that - this also address the sparse warning. 
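For reference, a generic illustration of the two sparse rules involved (the symbol names below are made up, this is not the amd_iommu.c code):

	#include <linux/notifier.h>

	static int demo_notify(struct notifier_block *nb, unsigned long action, void *data)
	{
		return NOTIFY_DONE;
	}

	/* file-local objects should be static; without it sparse asks:
	 *   symbol 'demo_nb' was not declared. Should it be static? */
	static struct notifier_block demo_nb = {
		.notifier_call = demo_notify,
	};

	/* pointer values should use NULL, not plain 0; otherwise sparse warns:
	 *   Using plain integer as NULL pointer */
	static void *demo_get_buffer(void)
	{
		return NULL;
	}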
Signed-off-by: Jaswinder Singh Rajput Cc: Joerg Roedel LKML-Reference: <1246458194.6940.20.camel@hpdv5.satnam> Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 9372f0406ad..6c99f503780 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1192,7 +1192,7 @@ out: return 0; } -struct notifier_block device_nb = { +static struct notifier_block device_nb = { .notifier_call = device_change_notifier, }; @@ -1763,7 +1763,7 @@ static void *alloc_coherent(struct device *dev, size_t size, flag |= __GFP_ZERO; virt_addr = (void *)__get_free_pages(flag, get_order(size)); if (!virt_addr) - return 0; + return NULL; paddr = virt_to_phys(virt_addr); -- cgit v1.2.3 From 76c06927f2a78143763dcff9b4c362d15eb29cc2 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Wed, 1 Jul 2009 19:54:23 +0530 Subject: x86: Declare check_efer() before it gets used This sparse warning: arch/x86/mm/init.c:83:16: warning: symbol 'check_efer' was not declared. Should it be static? triggers because check_efer() is not decalared before using it. asm/proto.h includes the declaration of check_efer(), so including asm/proto.h to fix that - this also addresses the sparse warning. Signed-off-by: Jaswinder Singh Rajput Cc: Andrew Morton LKML-Reference: <1246458263.6940.22.camel@hpdv5.satnam> Signed-off-by: Ingo Molnar --- arch/x86/mm/init.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 47ce9a2ce5e..0607119cef9 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -12,6 +12,7 @@ #include #include #include +#include DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); -- cgit v1.2.3 From 3238c0c4d68d9a9022b411a11a4b933fbdb53a14 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 1 Jul 2009 18:56:16 +0100 Subject: intel-iommu: Make iommu=pt work on i386 too Signed-off-by: David Woodhouse --- arch/x86/kernel/pci-dma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 47630479b06..1a041bcf506 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -211,11 +211,11 @@ static __init int iommu_setup(char *p) #ifdef CONFIG_SWIOTLB if (!strncmp(p, "soft", 4)) swiotlb = 1; +#endif if (!strncmp(p, "pt", 2)) { iommu_pass_through = 1; return 1; } -#endif gart_parse_options(p); -- cgit v1.2.3 From 788d84bba47ea3eb377f7a3ae4fd1ee84b84877b Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 1 Jul 2009 18:34:52 +0100 Subject: Fix pci_unmap_addr() et al on i386. We can run a 32-bit kernel on boxes with an IOMMU, so we need pci_unmap_addr() etc. to work -- without it, drivers will leak mappings. To be honest, this whole thing looks like it's more pain than it's worth; I'm half inclined to remove the no-op #else case altogether. But this is the minimal fix, which just does the right thing if CONFIG_DMAR is set. 
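To show what actually leaks, here is a hedged sketch of how a driver typically uses these helpers (the structure and function names are invented for illustration). When the declarations compile away to nothing, the DMA handle stored at map time is silently dropped, so the unmap call later does nothing and the IOMMU mapping is leaked:

	#include <linux/pci.h>

	struct rx_buf {
		void	*data;
		DECLARE_PCI_UNMAP_ADDR(mapping)
		DECLARE_PCI_UNMAP_LEN(len)
	};

	static int rx_buf_map(struct pci_dev *pdev, struct rx_buf *buf, size_t size)
	{
		dma_addr_t dma;

		dma = pci_map_single(pdev, buf->data, size, PCI_DMA_FROMDEVICE);
		if (pci_dma_mapping_error(pdev, dma))
			return -ENOMEM;

		/* these two become no-ops when the fields are compiled out */
		pci_unmap_addr_set(buf, mapping, dma);
		pci_unmap_len_set(buf, len, size);

		return 0;
	}

	static void rx_buf_unmap(struct pci_dev *pdev, struct rx_buf *buf)
	{
		pci_unmap_single(pdev, pci_unmap_addr(buf, mapping),
				 pci_unmap_len(buf, len), PCI_DMA_FROMDEVICE);
	}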
Signed-off-by: David Woodhouse Cc: stable@kernel.org [ for 2.6.30 ] Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 927958d13c1..1ff685ca221 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -91,7 +91,7 @@ extern void pci_iommu_alloc(void); #define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys) -#if defined(CONFIG_X86_64) || defined(CONFIG_DMA_API_DEBUG) +#if defined(CONFIG_X86_64) || defined(CONFIG_DMAR) || defined(CONFIG_DMA_API_DEBUG) #define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \ dma_addr_t ADDR_NAME; -- cgit v1.2.3 From 0406ca6d8e849d9dd027c8cb6791448e81411aef Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 1 Jul 2009 21:02:09 +0200 Subject: perf_counter: Ignore the nmi call frames in the x86-64 backtraces About every callchains recorded with perf record are filled up including the internal perfcounter nmi frame: perf_callchain perf_counter_overflow intel_pmu_handle_irq perf_counter_nmi_handler notifier_call_chain atomic_notifier_call_chain notify_die do_nmi nmi We want ignore this frame as it's not interesting for instrumentation. To solve this, we simply ignore every frames from nmi context. New example of "perf report -s sym -c" after this patch: 9.59% [k] search_by_key 4.88% search_by_key reiserfs_read_locked_inode reiserfs_iget reiserfs_lookup do_lookup __link_path_walk path_walk do_path_lookup user_path_at vfs_fstatat vfs_lstat sys_newlstat system_call_fastpath __lxstat 0x406fb1 3.19% search_by_key search_by_entry_key reiserfs_find_entry reiserfs_lookup do_lookup __link_path_walk path_walk do_path_lookup user_path_at vfs_fstatat vfs_lstat sys_newlstat system_call_fastpath __lxstat 0x406fb1 [...] For now this patch only solves the problem in x86-64. 
Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Anton Blanchard Cc: Arnaldo Carvalho de Melo LKML-Reference: <1246474930-6088-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/stacktrace.h | 2 ++ arch/x86/kernel/cpu/perf_counter.c | 8 +++++++- arch/x86/kernel/dumpstack_32.c | 6 ++++++ arch/x86/kernel/dumpstack_64.c | 22 +++++++++++++++------- 4 files changed, 30 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index f517944b2b1..cf86a5e7381 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -3,6 +3,8 @@ extern int kstack_depth_to_print; +int x86_is_stack_id(int id, char *name); + /* Generic stack tracer with callbacks */ struct stacktrace_ops { diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index d4cf4ce19aa..36c3dc7b899 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1561,6 +1561,7 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip) static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry); static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry); +static DEFINE_PER_CPU(int, in_nmi_frame); static void @@ -1576,7 +1577,9 @@ static void backtrace_warning(void *data, char *msg) static int backtrace_stack(void *data, char *name) { - /* Process all stacks: */ + per_cpu(in_nmi_frame, smp_processor_id()) = + x86_is_stack_id(NMI_STACK, name); + return 0; } @@ -1584,6 +1587,9 @@ static void backtrace_address(void *data, unsigned long addr, int reliable) { struct perf_callchain_entry *entry = data; + if (per_cpu(in_nmi_frame, smp_processor_id())) + return; + if (reliable) callchain_store(entry, addr); } diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index d593cd1f58d..bca5fba91c9 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -19,6 +19,12 @@ #include "dumpstack.h" +/* Just a stub for now */ +int x86_is_stack_id(int id, char *name) +{ + return 0; +} + void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index d35db5993fd..54b0a327676 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -19,10 +19,8 @@ #include "dumpstack.h" -static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, - unsigned *usedp, char **idp) -{ - static char ids[][8] = { + +static char x86_stack_ids[][8] = { [DEBUG_STACK - 1] = "#DB", [NMI_STACK - 1] = "NMI", [DOUBLEFAULT_STACK - 1] = "#DF", @@ -33,6 +31,15 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" #endif }; + +int x86_is_stack_id(int id, char *name) +{ + return x86_stack_ids[id - 1] == name; +} + +static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, + unsigned *usedp, char **idp) +{ unsigned k; /* @@ -61,7 +68,7 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, if (*usedp & (1U << k)) break; *usedp |= 1U << k; - *idp = ids[k]; + *idp = x86_stack_ids[k]; return (unsigned long *)end; } /* @@ -81,12 +88,13 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, do { ++j; end -= EXCEPTION_STKSZ; - 
ids[j][4] = '1' + (j - N_EXCEPTION_STACKS); + x86_stack_ids[j][4] = '1' + + (j - N_EXCEPTION_STACKS); } while (stack < end - EXCEPTION_STKSZ); if (*usedp & (1U << j)) break; *usedp |= 1U << j; - *idp = ids[j]; + *idp = x86_stack_ids[j]; return (unsigned long *)end; } #endif -- cgit v1.2.3 From 251e1e44b97852aa5e53e71c4b47e55b2dfd054e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 2 Jul 2009 08:54:01 +0200 Subject: x86: Fix printk call in print_local_apic() Instead of this: [ 75.690022] <7>printing local APIC contents on CPU#0/0: [ 75.704406] ... APIC ID: 00000000 (0) [ 75.707905] ... APIC VERSION: 00060015 [ 75.722551] ... APIC TASKPRI: 00000000 (00) [ 75.725473] ... APIC PROCPRI: 00000000 [ 75.728592] ... APIC LDR: 00000001 [ 75.742137] ... APIC SPIV: 000001ff [ 75.744101] ... APIC ISR field: [ 75.746648] 0123456789abcdef0123456789abcdef [ 75.746649] <7>00000000000000000000000000000000 Improve the code to be saner and simpler and just print out the bitfield in a single line using hexa values - not as a (rather pointless) binary bitfield. Partially reused Linus's initial fix for this. Reported-and-Tested-by: Yinghai Lu Signed-off-by: Yinghai Lu Cc: Linus Torvalds Cc: Andrew Morton LKML-Reference: <4A4C43BC.90506@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 4d0216fcb36..8fd1efb5a0b 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1716,25 +1716,19 @@ __apicdebuginit(void) print_IO_APIC(void) return; } -__apicdebuginit(void) print_APIC_bitfield(int base) +__apicdebuginit(void) print_APIC_field(int base) { - unsigned int v; - int i, j; + int i; if (apic_verbosity == APIC_QUIET) return; - printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); - for (i = 0; i < 8; i++) { - v = apic_read(base + i*0x10); - for (j = 0; j < 32; j++) { - if (v & (1< 3) /* Due to the Pentium erratum 3AP. */ -- cgit v1.2.3 From 7a6a3a086fe60bb3b739f66e47dc5bbc8f530d32 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 2 Jul 2009 12:23:23 +0200 Subject: amd-iommu: handle alias entries correctly in init code An alias entry in the ACPI table means that the device can send requests to the IOMMU with both device ids, its own and the alias. This is not handled properly in the ACPI init code. This patch fixes the issue. 
Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 10b2accd12e..ec72c7779d6 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -691,6 +691,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, devid = e->devid; devid_to = e->ext >> 8; + set_dev_entry_from_acpi(iommu, devid , e->flags, 0); set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0); amd_iommu_alias_table[devid] = devid_to; break; @@ -749,11 +750,13 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, devid = e->devid; for (dev_i = devid_start; dev_i <= devid; ++dev_i) { - if (alias) + if (alias) { amd_iommu_alias_table[dev_i] = devid_to; - set_dev_entry_from_acpi(iommu, - amd_iommu_alias_table[dev_i], - flags, ext_flags); + set_dev_entry_from_acpi(iommu, + devid_to, flags, ext_flags); + } + set_dev_entry_from_acpi(iommu, dev_i, + flags, ext_flags); } break; default: -- cgit v1.2.3 From 1bc6f83813441d15a74dfa97966fb68fa1bdec76 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 2 Jul 2009 18:32:05 +0200 Subject: amd-iommu: set evt_buf_size correctly The setting of this variable got lost during the suspend/resume implementation. But keeping this variable zero causes a divide-by-zero error in the interrupt handler. This patch fixes this. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index ec72c7779d6..c1b17e97252 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -472,6 +472,8 @@ static u8 * __init alloc_event_buffer(struct amd_iommu *iommu) if (iommu->evt_buf == NULL) return NULL; + iommu->evt_buf_size = EVT_BUFFER_SIZE; + return iommu->evt_buf; } -- cgit v1.2.3 From 43644679a1e80f53e6e0155ab75b1093ba3c0365 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 2 Jul 2009 12:05:10 -0700 Subject: x86: fix power-of-2 round_up/round_down macros These macros had two bugs: - the type of the mask was not correctly expanded to the full size of the argument being expanded, resulting in possible loss of high bits when mixing types. - the alignment argument was evaluated twice, despite the macro looking like a fancy function (but it really does need to be a macro, since it works on arbitrary integer types) Noticed by Peter Anvin, and with a fix that is a modification of his suggestion (bug noticed by Yinghai Lu). Cc: Peter Anvin Cc: Yinghai Lu Signed-off-by: Linus Torvalds --- arch/x86/include/asm/proto.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 49fb3ecf3bb..621f56d7312 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -22,7 +22,14 @@ extern int reboot_force; long do_arch_prctl(struct task_struct *task, int code, unsigned long addr); -#define round_up(x, y) (((x) + (y) - 1) & ~((y) - 1)) -#define round_down(x, y) ((x) & ~((y) - 1)) +/* + * This looks more complex than it should be. But we need to + * get the type for the ~ right in round_down (it needs to be + * as wide as the result!), and we want to evaluate the macro + * arguments just once each. 
+ */ +#define __round_mask(x,y) ((__typeof__(x))((y)-1)) +#define round_up(x,y) ((((x)-1) | __round_mask(x,y))+1) +#define round_down(x,y) ((x) & ~__round_mask(x,y)) #endif /* _ASM_X86_PROTO_H */ -- cgit v1.2.3 From 7c5371c403abb29f01bc6cff6c5096abdf2dc524 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 1 Jul 2009 12:32:18 -0700 Subject: x86: add boundary check for 32bit res before expand e820 resource to alignment fix hang with HIGHMEM_64G and 32bit resource. According to hpa and Linus, use (resource_size_t)-1 to fend off big ranges. Analyzed by hpa Reported-and-tested-by: Mikael Pettersson Signed-off-by: Yinghai Lu Signed-off-by: Linus Torvalds --- arch/x86/kernel/e820.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 7271fa33d79..c4ca89d9aaf 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1383,6 +1383,8 @@ static unsigned long ram_alignment(resource_size_t pos) return 32*1024*1024; } +#define MAX_RESOURCE_SIZE ((resource_size_t)-1) + void __init e820_reserve_resources_late(void) { int i; @@ -1400,17 +1402,19 @@ void __init e820_reserve_resources_late(void) * avoid stolen RAM: */ for (i = 0; i < e820.nr_map; i++) { - struct e820entry *entry = &e820_saved.map[i]; - resource_size_t start, end; + struct e820entry *entry = &e820.map[i]; + u64 start, end; if (entry->type != E820_RAM) continue; start = entry->addr + entry->size; - end = round_up(start, ram_alignment(start)); - if (start == end) + end = round_up(start, ram_alignment(start)) - 1; + if (end > MAX_RESOURCE_SIZE) + end = MAX_RESOURCE_SIZE; + if (start >= end) continue; - reserve_region_with_split(&iomem_resource, start, - end - 1, "RAM buffer"); + reserve_region_with_split(&iomem_resource, start, end, + "RAM buffer"); } } -- cgit v1.2.3 From bbf2a330d92c5afccfd17592ba9ccd50f41cf748 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 3 Jul 2009 00:08:26 +0200 Subject: x86: atomic64: The atomic64_t data type should be 8 bytes aligned on 32-bit too Locked instructions on two cache lines at once are painful. If atomic64_t uses two cache lines, my test program is 10x slower. The chance for that is significant: 4/32 or 12.5%. Make sure an atomic64_t is 8 bytes aligned. Signed-off-by: Eric Dumazet Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: David Howells Cc: Andrew Morton Cc: Arnd Bergmann LKML-Reference: [ changed it to __aligned(8) as per Andrew's suggestion ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/atomic_32.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h index 2503d4e64c2..ae0fbb5b057 100644 --- a/arch/x86/include/asm/atomic_32.h +++ b/arch/x86/include/asm/atomic_32.h @@ -250,7 +250,7 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u) /* An 64bit atomic type */ typedef struct { - unsigned long long counter; + unsigned long long __aligned(8) counter; } atomic64_t; #define ATOMIC64_INIT(val) { (val) } -- cgit v1.2.3 From b7882b7c65abb00194bdb3d4a22d27d70fcc59ba Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 13:26:39 +0200 Subject: x86: atomic64: Move the 32-bit atomic64_t implementation to a .c file Linus noted that the atomic64_t primitives are all inlines currently which is crazy because these functions have a large register footprint anyway. 
Move them to a separate file: arch/x86/lib/atomic64_32.c Also, while at it, rename all uses of 'unsigned long long' to the much shorter u64. This makes the appearance of the prototypes a lot nicer - and it also uncovered a few bugs where (yet unused) API variants had 'long' as their return type instead of u64. [ More intrusive changes are not yet done in this patch. ] Reported-by: Linus Torvalds Cc: Eric Dumazet Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: David Howells Cc: Andrew Morton Cc: Arnd Bergmann LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/atomic_32.h | 137 ++++--------------------- arch/x86/lib/Makefile | 1 + arch/x86/lib/atomic64_32.c | 216 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 237 insertions(+), 117 deletions(-) create mode 100644 arch/x86/lib/atomic64_32.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h index ae0fbb5b057..311a43e47c0 100644 --- a/arch/x86/include/asm/atomic_32.h +++ b/arch/x86/include/asm/atomic_32.h @@ -250,7 +250,7 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u) /* An 64bit atomic type */ typedef struct { - unsigned long long __aligned(8) counter; + u64 __aligned(8) counter; } atomic64_t; #define ATOMIC64_INIT(val) { (val) } @@ -264,31 +264,7 @@ typedef struct { */ #define __atomic64_read(ptr) ((ptr)->counter) -static inline unsigned long long -cmpxchg8b(unsigned long long *ptr, unsigned long long old, unsigned long long new) -{ - asm volatile( - - LOCK_PREFIX "cmpxchg8b (%[ptr])\n" - - : "=A" (old) - - : [ptr] "D" (ptr), - "A" (old), - "b" (ll_low(new)), - "c" (ll_high(new)) - - : "memory"); - - return old; -} - -static inline unsigned long long -atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val, - unsigned long long new_val) -{ - return cmpxchg8b(&ptr->counter, old_val, new_val); -} +extern u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val); /** * atomic64_xchg - xchg atomic64 variable @@ -298,18 +274,7 @@ atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val, * Atomically xchgs the value of @ptr to @new_val and returns * the old value. */ - -static inline unsigned long long -atomic64_xchg(atomic64_t *ptr, unsigned long long new_val) -{ - unsigned long long old_val; - - do { - old_val = atomic_read(ptr); - } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val); - - return old_val; -} +extern u64 atomic64_xchg(atomic64_t *ptr, u64 new_val); /** * atomic64_set - set atomic64 variable @@ -318,10 +283,7 @@ atomic64_xchg(atomic64_t *ptr, unsigned long long new_val) * * Atomically sets the value of @ptr to @new_val. */ -static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val) -{ - atomic64_xchg(ptr, new_val); -} +extern void atomic64_set(atomic64_t *ptr, u64 new_val); /** * atomic64_read - read atomic64 variable @@ -329,16 +291,7 @@ static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val) * * Atomically reads the value of @ptr and returns it. 
*/ -static inline unsigned long long atomic64_read(atomic64_t *ptr) -{ - unsigned long long curr_val; - - do { - curr_val = __atomic64_read(ptr); - } while (atomic64_cmpxchg(ptr, curr_val, curr_val) != curr_val); - - return curr_val; -} +extern u64 atomic64_read(atomic64_t *ptr); /** * atomic64_add_return - add and return @@ -347,34 +300,14 @@ static inline unsigned long long atomic64_read(atomic64_t *ptr) * * Atomically adds @delta to @ptr and returns @delta + *@ptr */ -static inline unsigned long long -atomic64_add_return(unsigned long long delta, atomic64_t *ptr) -{ - unsigned long long old_val, new_val; - - do { - old_val = atomic_read(ptr); - new_val = old_val + delta; - - } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val); - - return new_val; -} - -static inline long atomic64_sub_return(unsigned long long delta, atomic64_t *ptr) -{ - return atomic64_add_return(-delta, ptr); -} +extern u64 atomic64_add_return(u64 delta, atomic64_t *ptr); -static inline long atomic64_inc_return(atomic64_t *ptr) -{ - return atomic64_add_return(1, ptr); -} - -static inline long atomic64_dec_return(atomic64_t *ptr) -{ - return atomic64_sub_return(1, ptr); -} +/* + * Other variants with different arithmetic operators: + */ +extern u64 atomic64_sub_return(u64 delta, atomic64_t *ptr); +extern u64 atomic64_inc_return(atomic64_t *ptr); +extern u64 atomic64_dec_return(atomic64_t *ptr); /** * atomic64_add - add integer to atomic64 variable @@ -383,10 +316,7 @@ static inline long atomic64_dec_return(atomic64_t *ptr) * * Atomically adds @delta to @ptr. */ -static inline void atomic64_add(unsigned long long delta, atomic64_t *ptr) -{ - atomic64_add_return(delta, ptr); -} +extern void atomic64_add(u64 delta, atomic64_t *ptr); /** * atomic64_sub - subtract the atomic64 variable @@ -395,10 +325,7 @@ static inline void atomic64_add(unsigned long long delta, atomic64_t *ptr) * * Atomically subtracts @delta from @ptr. */ -static inline void atomic64_sub(unsigned long long delta, atomic64_t *ptr) -{ - atomic64_add(-delta, ptr); -} +extern void atomic64_sub(u64 delta, atomic64_t *ptr); /** * atomic64_sub_and_test - subtract value from variable and test result @@ -409,13 +336,7 @@ static inline void atomic64_sub(unsigned long long delta, atomic64_t *ptr) * true if the result is zero, or false for all * other cases. */ -static inline int -atomic64_sub_and_test(unsigned long long delta, atomic64_t *ptr) -{ - unsigned long long old_val = atomic64_sub_return(delta, ptr); - - return old_val == 0; -} +extern int atomic64_sub_and_test(u64 delta, atomic64_t *ptr); /** * atomic64_inc - increment atomic64 variable @@ -423,10 +344,7 @@ atomic64_sub_and_test(unsigned long long delta, atomic64_t *ptr) * * Atomically increments @ptr by 1. */ -static inline void atomic64_inc(atomic64_t *ptr) -{ - atomic64_add(1, ptr); -} +extern void atomic64_inc(atomic64_t *ptr); /** * atomic64_dec - decrement atomic64 variable @@ -434,10 +352,7 @@ static inline void atomic64_inc(atomic64_t *ptr) * * Atomically decrements @ptr by 1. */ -static inline void atomic64_dec(atomic64_t *ptr) -{ - atomic64_sub(1, ptr); -} +extern void atomic64_dec(atomic64_t *ptr); /** * atomic64_dec_and_test - decrement and test @@ -447,10 +362,7 @@ static inline void atomic64_dec(atomic64_t *ptr) * returns true if the result is 0, or false for all other * cases. 
*/ -static inline int atomic64_dec_and_test(atomic64_t *ptr) -{ - return atomic64_sub_and_test(1, ptr); -} +extern int atomic64_dec_and_test(atomic64_t *ptr); /** * atomic64_inc_and_test - increment and test @@ -460,10 +372,7 @@ static inline int atomic64_dec_and_test(atomic64_t *ptr) * and returns true if the result is zero, or false for all * other cases. */ -static inline int atomic64_inc_and_test(atomic64_t *ptr) -{ - return atomic64_sub_and_test(-1, ptr); -} +extern int atomic64_inc_and_test(atomic64_t *ptr); /** * atomic64_add_negative - add and test if negative @@ -474,13 +383,7 @@ static inline int atomic64_inc_and_test(atomic64_t *ptr) * if the result is negative, or false when * result is greater than or equal to zero. */ -static inline int -atomic64_add_negative(unsigned long long delta, atomic64_t *ptr) -{ - long long old_val = atomic64_add_return(delta, ptr); - - return old_val < 0; -} +extern int atomic64_add_negative(u64 delta, atomic64_t *ptr); #include #endif /* _ASM_X86_ATOMIC_32_H */ diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index f9d35632666..c3c657c8bb8 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -10,6 +10,7 @@ lib-y += usercopy_$(BITS).o getuser.o putuser.o lib-y += memcpy_$(BITS).o ifeq ($(CONFIG_X86_32),y) + lib-y += atomic64_32.o lib-y += checksum_32.o lib-y += strstr_32.o lib-y += semaphore_32.o string_32.o diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c new file mode 100644 index 00000000000..d21e725d3d8 --- /dev/null +++ b/arch/x86/lib/atomic64_32.c @@ -0,0 +1,216 @@ +#include +#include +#include +#include +#include + +static inline u64 cmpxchg8b(u64 *ptr, u64 old, u64 new) +{ + asm volatile( + + LOCK_PREFIX "cmpxchg8b (%[ptr])\n" + + : "=A" (old) + + : [ptr] "D" (ptr), + "A" (old), + "b" (ll_low(new)), + "c" (ll_high(new)) + + : "memory"); + + return old; +} + +u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val) +{ + return cmpxchg8b(&ptr->counter, old_val, new_val); +} + +/** + * atomic64_xchg - xchg atomic64 variable + * @ptr: pointer to type atomic64_t + * @new_val: value to assign + * + * Atomically xchgs the value of @ptr to @new_val and returns + * the old value. + */ + +u64 atomic64_xchg(atomic64_t *ptr, u64 new_val) +{ + u64 old_val; + + do { + old_val = atomic_read(ptr); + } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val); + + return old_val; +} + +/** + * atomic64_set - set atomic64 variable + * @ptr: pointer to type atomic64_t + * @new_val: value to assign + * + * Atomically sets the value of @ptr to @new_val. + */ +void atomic64_set(atomic64_t *ptr, u64 new_val) +{ + atomic64_xchg(ptr, new_val); +} + +/** + * atomic64_read - read atomic64 variable + * @ptr: pointer to type atomic64_t + * + * Atomically reads the value of @ptr and returns it. 
+ */ +u64 atomic64_read(atomic64_t *ptr) +{ + u64 curr_val; + + do { + curr_val = __atomic64_read(ptr); + } while (atomic64_cmpxchg(ptr, curr_val, curr_val) != curr_val); + + return curr_val; +} + +/** + * atomic64_add_return - add and return + * @delta: integer value to add + * @ptr: pointer to type atomic64_t + * + * Atomically adds @delta to @ptr and returns @delta + *@ptr + */ +u64 atomic64_add_return(u64 delta, atomic64_t *ptr) +{ + u64 old_val, new_val; + + do { + old_val = atomic_read(ptr); + new_val = old_val + delta; + + } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val); + + return new_val; +} + +u64 atomic64_sub_return(u64 delta, atomic64_t *ptr) +{ + return atomic64_add_return(-delta, ptr); +} + +u64 atomic64_inc_return(atomic64_t *ptr) +{ + return atomic64_add_return(1, ptr); +} + +u64 atomic64_dec_return(atomic64_t *ptr) +{ + return atomic64_sub_return(1, ptr); +} + +/** + * atomic64_add - add integer to atomic64 variable + * @delta: integer value to add + * @ptr: pointer to type atomic64_t + * + * Atomically adds @delta to @ptr. + */ +void atomic64_add(u64 delta, atomic64_t *ptr) +{ + atomic64_add_return(delta, ptr); +} + +/** + * atomic64_sub - subtract the atomic64 variable + * @delta: integer value to subtract + * @ptr: pointer to type atomic64_t + * + * Atomically subtracts @delta from @ptr. + */ +void atomic64_sub(u64 delta, atomic64_t *ptr) +{ + atomic64_add(-delta, ptr); +} + +/** + * atomic64_sub_and_test - subtract value from variable and test result + * @delta: integer value to subtract + * @ptr: pointer to type atomic64_t + * + * Atomically subtracts @delta from @ptr and returns + * true if the result is zero, or false for all + * other cases. + */ +int atomic64_sub_and_test(u64 delta, atomic64_t *ptr) +{ + u64 old_val = atomic64_sub_return(delta, ptr); + + return old_val == 0; +} + +/** + * atomic64_inc - increment atomic64 variable + * @ptr: pointer to type atomic64_t + * + * Atomically increments @ptr by 1. + */ +void atomic64_inc(atomic64_t *ptr) +{ + atomic64_add(1, ptr); +} + +/** + * atomic64_dec - decrement atomic64 variable + * @ptr: pointer to type atomic64_t + * + * Atomically decrements @ptr by 1. + */ +void atomic64_dec(atomic64_t *ptr) +{ + atomic64_sub(1, ptr); +} + +/** + * atomic64_dec_and_test - decrement and test + * @ptr: pointer to type atomic64_t + * + * Atomically decrements @ptr by 1 and + * returns true if the result is 0, or false for all other + * cases. + */ +int atomic64_dec_and_test(atomic64_t *ptr) +{ + return atomic64_sub_and_test(1, ptr); +} + +/** + * atomic64_inc_and_test - increment and test + * @ptr: pointer to type atomic64_t + * + * Atomically increments @ptr by 1 + * and returns true if the result is zero, or false for all + * other cases. + */ +int atomic64_inc_and_test(atomic64_t *ptr) +{ + return atomic64_sub_and_test(-1, ptr); +} + +/** + * atomic64_add_negative - add and test if negative + * @delta: integer value to add + * @ptr: pointer to type atomic64_t + * + * Atomically adds @delta to @ptr and returns true + * if the result is negative, or false when + * result is greater than or equal to zero. 
+ */ +int atomic64_add_negative(u64 delta, atomic64_t *ptr) +{ + long long old_val = atomic64_add_return(delta, ptr); + + return old_val < 0; +} -- cgit v1.2.3 From aacf682fd8c66b57383c407eecd9d4a28264ee91 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 3 Jul 2009 12:14:27 +0200 Subject: x86: atomic64: Improve atomic64_read() Linus noticed that the 32-bit version of atomic64_read() was being overly complex with re-reading the value and doing a retry loop over that. Instead we can just rely on cmpxchg8b returning either the new value or returning the current value. We can use any 'old' value, which will be faster as it can be loaded via immediates. Using some value that is not equal to the real value in memory the instruction gets faster. This also has the advantage that the CPU could avoid dirtying the cacheline. Reported-by: Linus Torvalds Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: David Howells Cc: Andrew Morton Cc: Arnd Bergmann LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/lib/atomic64_32.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c index d21e725d3d8..afa5d444918 100644 --- a/arch/x86/lib/atomic64_32.c +++ b/arch/x86/lib/atomic64_32.c @@ -67,13 +67,9 @@ void atomic64_set(atomic64_t *ptr, u64 new_val) */ u64 atomic64_read(atomic64_t *ptr) { - u64 curr_val; + u64 old = 1LL << 32; - do { - curr_val = __atomic64_read(ptr); - } while (atomic64_cmpxchg(ptr, curr_val, curr_val) != curr_val); - - return curr_val; + return cmpxchg8b(&ptr->counter, old, old); } /** -- cgit v1.2.3 From 69237f94e65d3d7f539f1adb98ef68685c595004 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 3 Jul 2009 13:26:41 +0200 Subject: x86: atomic64: Improve cmpxchg8b() Rewrite cmpxchg8b() to not use %edi register but a generic "+m" constraint, to increase compiler freedom in code generation and possibly better code. Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: David Howells Cc: Andrew Morton Cc: Arnd Bergmann LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/lib/atomic64_32.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c index afa5d444918..5fc1e2caa54 100644 --- a/arch/x86/lib/atomic64_32.c +++ b/arch/x86/lib/atomic64_32.c @@ -6,19 +6,14 @@ static inline u64 cmpxchg8b(u64 *ptr, u64 old, u64 new) { - asm volatile( - - LOCK_PREFIX "cmpxchg8b (%[ptr])\n" - - : "=A" (old) - - : [ptr] "D" (ptr), - "A" (old), - "b" (ll_low(new)), - "c" (ll_high(new)) - - : "memory"); + u32 low = new; + u32 high = new >> 32; + asm volatile( + LOCK_PREFIX "cmpxchg8b %1\n" + : "+A" (old), "+m" (*ptr) + : "b" (low), "c" (high) + ); return old; } -- cgit v1.2.3 From 824975ef190e7dcb77718d1cc2cb53769b16d918 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 12:39:07 +0200 Subject: x86: atomic64: Improve atomic64_add_return() Linus noted (based on Eric Dumazet's numbers) that we would probably be better off not trying an atomic_read() in atomic64_add_return() but intead intentionally let the first cmpxchg8b fail - to get a cache-friendly 'give me ownership of this cacheline' transaction. That can then be followed by the real cmpxchg8b which sets the value local to the CPU. 
Reported-by: Linus Torvalds Cc: Eric Dumazet Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: David Howells Cc: Andrew Morton Cc: Arnd Bergmann LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/lib/atomic64_32.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c index 5fc1e2caa54..61959627e1e 100644 --- a/arch/x86/lib/atomic64_32.c +++ b/arch/x86/lib/atomic64_32.c @@ -76,13 +76,22 @@ u64 atomic64_read(atomic64_t *ptr) */ u64 atomic64_add_return(u64 delta, atomic64_t *ptr) { - u64 old_val, new_val; + /* + * Try first with a (probably incorrect) assumption about + * what we have there. We'll do two loops most likely, + * but we'll get an ownership MESI transaction straight away + * instead of a read transaction followed by a + * flush-for-ownership transaction: + */ + u64 old_val, new_val, real_val = 1ULL << 32; do { - old_val = atomic_read(ptr); + old_val = real_val; new_val = old_val + delta; - } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val); + real_val = atomic64_cmpxchg(ptr, old_val, new_val); + + } while (real_val != old_val); return new_val; } -- cgit v1.2.3 From 3ac805d2afd3fa4a07cb5bcf352fd7fa83f28935 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 12:51:19 +0200 Subject: x86: atomic64: Reduce size of functions cmpxchg8b is a huge instruction in terms of register footprint, we almost never want to inline it, not even within the same code module. GCC 4.3 still messes up for two functions, under-judging the true cost of this instruction - so annotate two key functions to reduce the bloat: arch/x86/lib/atomic64_32.o: text data bss dec hex filename 1763 0 0 1763 6e3 atomic64_32.o.before 435 0 0 435 1b3 atomic64_32.o.after Cc: Linus Torvalds Cc: Eric Dumazet Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: David Howells Cc: Andrew Morton Cc: Arnd Bergmann LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/lib/atomic64_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c index 61959627e1e..a910238a776 100644 --- a/arch/x86/lib/atomic64_32.c +++ b/arch/x86/lib/atomic64_32.c @@ -4,7 +4,7 @@ #include #include -static inline u64 cmpxchg8b(u64 *ptr, u64 old, u64 new) +static noinline u64 cmpxchg8b(u64 *ptr, u64 old, u64 new) { u32 low = new; u32 high = new >> 32; @@ -74,7 +74,7 @@ u64 atomic64_read(atomic64_t *ptr) * * Atomically adds @delta to @ptr and returns @delta + *@ptr */ -u64 atomic64_add_return(u64 delta, atomic64_t *ptr) +noinline u64 atomic64_add_return(u64 delta, atomic64_t *ptr) { /* * Try first with a (probably incorrect) assumption about -- cgit v1.2.3 From 3217120873598533234b6dedda9c371ce30001d0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 13:06:01 +0200 Subject: x86: atomic64: Make atomic_read() type-safe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Linus noticed that atomic64_xchg() uses atomic_read(), which happens to work because atomic_read() is a macro so the .counter value gets u64-read on 32-bit too - but this is really bogus and serious bugs are waiting to happen. 
Change atomic_read() to be a type-safe inline, and this exposes the atomic64 bogosity as well: arch/x86/lib/atomic64_32.c: In function ‘atomic64_xchg’: arch/x86/lib/atomic64_32.c:39: warning: passing argument 1 of ‘atomic_read’ from incompatible pointer type Reported-by: Linus Torvalds Cc: Eric Dumazet Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: David Howells Cc: Andrew Morton Cc: Arnd Bergmann LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/atomic_32.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h index 311a43e47c0..b551bb1ae3c 100644 --- a/arch/x86/include/asm/atomic_32.h +++ b/arch/x86/include/asm/atomic_32.h @@ -19,7 +19,10 @@ * * Atomically reads the value of @v. */ -#define atomic_read(v) ((v)->counter) +static inline int atomic_read(const atomic_t *v) +{ + return v->counter; +} /** * atomic_set - set atomic variable -- cgit v1.2.3 From 199e23780a7e75c63a9e3d1108804e3af450ea3e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 13:02:39 +0200 Subject: x86: atomic64: Fix unclean type use in atomic64_xchg() Linus noticed that atomic64_xchg() uses atomic_read(), which happens to work because atomic_read() is a macro so the .counter value gets u64-read on 32-bit too - but this is really bogus and serious bugs are waiting to happen. Fix atomic64_xchg() to use __atomic64_read() instead. No code changed: arch/x86/lib/atomic64_32.o: text data bss dec hex filename 435 0 0 435 1b3 atomic64_32.o.before 435 0 0 435 1b3 atomic64_32.o.after md5: bd8ab95e69c93518578bfaf0ea3be4d9 atomic64_32.o.before.asm bd8ab95e69c93518578bfaf0ea3be4d9 atomic64_32.o.after.asm Reported-by: Linus Torvalds Cc: Eric Dumazet Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: David Howells Cc: Andrew Morton Cc: Arnd Bergmann LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/lib/atomic64_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c index a910238a776..fd28fd3fb74 100644 --- a/arch/x86/lib/atomic64_32.c +++ b/arch/x86/lib/atomic64_32.c @@ -36,7 +36,7 @@ u64 atomic64_xchg(atomic64_t *ptr, u64 new_val) u64 old_val; do { - old_val = atomic_read(ptr); + old_val = __atomic64_read(ptr); } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val); return old_val; -- cgit v1.2.3 From 12b9d7ccb841805e347fec8f733f368f43ddba40 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Wed, 1 Jul 2009 17:37:22 -0400 Subject: x86: Fix fixmap page order for FIX_TEXT_POKE0,1 Masami reported: > Since the fixmap pages are assigned higher address to lower, > text_poke() has to use it with inverted order (FIX_TEXT_POKE1 > to FIX_TEXT_POKE0). I prefer to just invert the order of the fixmap declaration. It's simpler and more straightforward. Backward fixmaps seems to be used by both x86 32 and 64. It's really rare but a nasty bug, because it only hurts when instructions to patch are crossing a page boundary. If this happens, the fixmap write accesses will spill on the following fixmap, which may very well crash the system. And this does not crash the system, it could leave illegal instructions in place. Thanks Masami for finding this. It seems to have crept into the 2.6.30-rc series, so this calls for a -stable inclusion. 
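To make the ordering argument concrete: fixmap virtual addresses grow downward as the enum index grows, i.e. __fix_to_virt(idx) == FIXADDR_TOP - (idx << PAGE_SHIFT), so the FIX_TEXT_POKE1 slot only ends up one page above FIX_TEXT_POKE0 if it is declared first. A hedged, illustrative check (the function below is made up, it is not part of the patch):

	#include <linux/init.h>
	#include <linux/bug.h>
	#include <asm/fixmap.h>

	static void __init check_text_poke_fixmap(void)
	{
		unsigned long poke0 = fix_to_virt(FIX_TEXT_POKE0);
		unsigned long poke1 = fix_to_virt(FIX_TEXT_POKE1);

		/*
		 * text_poke() writes starting at the FIX_TEXT_POKE0 mapping;
		 * any bytes that cross the page boundary must land on the
		 * FIX_TEXT_POKE1 mapping, i.e. the very next page up.
		 */
		BUG_ON(poke1 != poke0 + PAGE_SIZE);
	}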
Signed-off-by: Mathieu Desnoyers Acked-by: Masami Hiramatsu Cc: LKML-Reference: <20090701213722.GH19926@Krystal> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fixmap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 3eb0f79a532..7b2d71df39a 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -111,8 +111,8 @@ enum fixed_addresses { #ifdef CONFIG_PARAVIRT FIX_PARAVIRT_BOOTMAP, #endif - FIX_TEXT_POKE0, /* reserve 2 pages for text_poke() */ - FIX_TEXT_POKE1, + FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */ + FIX_TEXT_POKE0, /* first page is last, because allocation is backward */ __end_of_permanent_fixed_addresses, /* * 256 temporary boot-time mappings, used by early_ioremap(), -- cgit v1.2.3 From 3fd382cedfb4fb742a8cc17ba15288ec03131db1 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 2 Jul 2009 07:27:20 +0200 Subject: x86: Add missing annotation to arch/x86/lib/copy_user_64.S::copy_to_user While examining symbol generation in perf_counter tools, I noticed that copy_to_user() had no size in vmlinux's symtab. Signed-off-by: Mike Galbraith Acked-by: Alexander van Heukelum Acked-by: Cyrill Gorcunov LKML-Reference: <1246512440.13293.3.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- arch/x86/lib/copy_user_64.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index f118c110af3..6ba0f7bb85e 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -75,6 +75,7 @@ ENTRY(copy_to_user) jae bad_to_user ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string CFI_ENDPROC +ENDPROC(copy_to_user) /* Standard copy_from_user with segment limit checking */ ENTRY(copy_from_user) -- cgit v1.2.3 From d3ac88157c1e9153f37cd2b9313117ae11735063 Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Thu, 2 Jul 2009 11:40:36 +0600 Subject: x86, kvm: Fix section mismatches in kvm.c The function paravirt_ops_setup() has been refering the variable no_timer_check, which is a __initdata. Thus generates the following warning. paravirt_ops_setup() function is called from kvm_guest_init() which is a __init function. So to fix this we mark paravirt_ops_setup as __init. The sections-check output that warned us about this was: LD arch/x86/built-in.o WARNING: arch/x86/built-in.o(.text+0x166ce): Section mismatch in reference from the function paravirt_ops_setup() to the variable .init.data:no_timer_check The function paravirt_ops_setup() references the variable __initdata no_timer_check. This is often because paravirt_ops_setup lacks a __initdata annotation or the annotation of no_timer_check is wrong. 
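As a generic, self-contained illustration of the rule being applied here (the names are made up, this is not the kvm.c code): __initdata objects are discarded after boot, so only __init code may reference them, and annotating the referencing function is exactly what silences modpost:

	#include <linux/init.h>
	#include <linux/kernel.h>

	static int boot_only_flag __initdata;	/* lives in .init.data, freed after boot */

	/*
	 * Without __init on the function, modpost warns:
	 *   Section mismatch in reference from the function uses_boot_data()
	 *   to the variable .init.data:boot_only_flag
	 */
	static void __init uses_boot_data(void)
	{
		if (boot_only_flag)
			pr_info("boot-only flag was set\n");
	}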
Signed-off-by: Rakib Mullick Acked-by: Avi Kivity Cc: Andrew Morton LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/kvm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index a78ecad0c90..c664d515f61 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -200,7 +200,7 @@ static void kvm_leave_lazy_mmu(void) state->mode = paravirt_get_lazy_mode(); } -static void paravirt_ops_setup(void) +static void __init paravirt_ops_setup(void) { pv_info.name = "KVM"; pv_info.paravirt_enabled = 1; -- cgit v1.2.3 From 23d0cd8e718723f1ddda37637bc6b7c34caec64a Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Thu, 2 Jul 2009 22:33:59 +0530 Subject: x86: Remove unused variable disable_x2apic setup_nox2apic() is writing 1 to disable_x2apic but no one is reading it. Signed-off-by: Jaswinder Singh Rajput Cc: Yinghai Lu Cc: Cyrill Gorcunov LKML-Reference: <1246554239.2242.27.camel@jaswinder.satnam> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 8c7c042ecad..0a1c2830ec6 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -140,7 +140,6 @@ int x2apic_mode; #ifdef CONFIG_X86_X2APIC /* x2apic enabled before OS handover */ static int x2apic_preenabled; -static int disable_x2apic; static __init int setup_nox2apic(char *str) { if (x2apic_enabled()) { @@ -149,7 +148,6 @@ static __init int setup_nox2apic(char *str) return 0; } - disable_x2apic = 1; setup_clear_cpu_cap(X86_FEATURE_X2APIC); return 0; } -- cgit v1.2.3 From c7210e1ff8a95a0a6dba0d04a95b3ddd9d165fab Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Thu, 2 Jul 2009 22:35:35 +0530 Subject: x86: Remove unused function lapic_watchdog_ok() lapic_watchdog_ok() is a global function but no one is using it. Signed-off-by: Jaswinder Singh Rajput Cc: Andi Kleen Cc: Yinghai Lu LKML-Reference: <1246554335.2242.29.camel@jaswinder.satnam> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/nmi.h | 1 - arch/x86/kernel/cpu/perfctr-watchdog.c | 5 ----- 2 files changed, 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index c9726440993..c86e5ed4af5 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -72,7 +72,6 @@ void lapic_watchdog_stop(void); int lapic_watchdog_init(unsigned nmi_hz); int lapic_wd_event(unsigned nmi_hz); unsigned lapic_adjust_nmi_hz(unsigned hz); -int lapic_watchdog_ok(void); void disable_lapic_nmi_watchdog(void); void enable_lapic_nmi_watchdog(void); void stop_nmi(void); diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 5c481f6205b..e60ed740d2b 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -803,8 +803,3 @@ int __kprobes lapic_wd_event(unsigned nmi_hz) wd_ops->rearm(wd, nmi_hz); return 1; } - -int lapic_watchdog_ok(void) -{ - return wd_ops != NULL; -} -- cgit v1.2.3 From 8e049ef054f1cc765f05f13e1396bb9a17c19e66 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 2 Jul 2009 08:57:12 +1000 Subject: x86: atomic64: Code atomic(64)_read and atomic(64)_set in C not CPP Occasionally we get bugs where atomic_read or atomic_set are used on atomic64_t variables or vice versa. 
These bugs don't generate warnings on x86 because atomic_read and atomic_set are coded as macros rather than C functions, so we don't get any type-checking on their arguments; similarly for atomic64_read and atomic64_set in 64-bit kernels. This converts them to C functions so that the arguments are type-checked and bugs like this will get caught more easily. It also converts atomic_cmpxchg and atomic_xchg, and atomic64_cmpxchg and atomic64_xchg on 64-bit, so we get type-checking on their arguments too. Compiling a typical 64-bit x86 config, this generates no new warnings, and the vmlinux text is 86 bytes smaller. Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: David Howells Cc: Andrew Morton Cc: Arnd Bergmann LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/atomic_32.h | 16 ++++++++++++--- arch/x86/include/asm/atomic_64.h | 42 ++++++++++++++++++++++++++++++++-------- 2 files changed, 47 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h index b551bb1ae3c..aa045deb2e7 100644 --- a/arch/x86/include/asm/atomic_32.h +++ b/arch/x86/include/asm/atomic_32.h @@ -31,7 +31,10 @@ static inline int atomic_read(const atomic_t *v) * * Atomically sets the value of @v to @i. */ -#define atomic_set(v, i) (((v)->counter) = (i)) +static inline void atomic_set(atomic_t *v, int i) +{ + v->counter = i; +} /** * atomic_add - add integer to atomic variable @@ -203,8 +206,15 @@ static inline int atomic_sub_return(int i, atomic_t *v) return atomic_add_return(-i, v); } -#define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new))) -#define atomic_xchg(v, new) (xchg(&((v)->counter), (new))) +static inline int atomic_cmpxchg(atomic_t *v, int old, int new) +{ + return cmpxchg(&v->counter, old, new); +} + +static inline int atomic_xchg(atomic_t *v, int new) +{ + return xchg(&v->counter, new); +} /** * atomic_add_unless - add unless the number is already a given value diff --git a/arch/x86/include/asm/atomic_64.h b/arch/x86/include/asm/atomic_64.h index 0d636022000..d605dc268e7 100644 --- a/arch/x86/include/asm/atomic_64.h +++ b/arch/x86/include/asm/atomic_64.h @@ -18,7 +18,10 @@ * * Atomically reads the value of @v. */ -#define atomic_read(v) ((v)->counter) +static inline int atomic_read(const atomic_t *v) +{ + return v->counter; +} /** * atomic_set - set atomic variable @@ -27,7 +30,10 @@ * * Atomically sets the value of @v to @i. */ -#define atomic_set(v, i) (((v)->counter) = (i)) +static inline void atomic_set(atomic_t *v, int i) +{ + v->counter = i; +} /** * atomic_add - add integer to atomic variable @@ -192,7 +198,10 @@ static inline int atomic_sub_return(int i, atomic_t *v) * Atomically reads the value of @v. * Doesn't imply a read memory barrier. */ -#define atomic64_read(v) ((v)->counter) +static inline long atomic64_read(const atomic64_t *v) +{ + return v->counter; +} /** * atomic64_set - set atomic64 variable @@ -201,7 +210,10 @@ static inline int atomic_sub_return(int i, atomic_t *v) * * Atomically sets the value of @v to @i. 
*/ -#define atomic64_set(v, i) (((v)->counter) = (i)) +static inline void atomic64_set(atomic64_t *v, long i) +{ + v->counter = i; +} /** * atomic64_add - add integer to atomic64 variable @@ -355,11 +367,25 @@ static inline long atomic64_sub_return(long i, atomic64_t *v) #define atomic64_inc_return(v) (atomic64_add_return(1, (v))) #define atomic64_dec_return(v) (atomic64_sub_return(1, (v))) -#define atomic64_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new))) -#define atomic64_xchg(v, new) (xchg(&((v)->counter), new)) +static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new) +{ + return cmpxchg(&v->counter, old, new); +} + +static inline long atomic64_xchg(atomic64_t *v, long new) +{ + return xchg(&v->counter, new); +} -#define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new))) -#define atomic_xchg(v, new) (xchg(&((v)->counter), (new))) +static inline long atomic_cmpxchg(atomic_t *v, int old, int new) +{ + return cmpxchg(&v->counter, old, new); +} + +static inline long atomic_xchg(atomic_t *v, int new) +{ + return xchg(&v->counter, new); +} /** * atomic_add_unless - add unless the number is a given value -- cgit v1.2.3 From 67d7178f8fc64b7f68d7dd8a1b21dfa0d42c220c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 3 Jul 2009 13:23:02 +0200 Subject: x86: atomic64: Improve atomic64_read() Optimize atomic64_read() as a special open-coded cmpxchg8b variant. This generates nicer code: arch/x86/lib/atomic64_32.o: text data bss dec hex filename 435 0 0 435 1b3 atomic64_32.o.before 431 0 0 431 1af atomic64_32.o.after md5: bd8ab95e69c93518578bfaf0ea3be4d9 atomic64_32.o.before.asm 2bdfd4bd1f6b7b61b7fc127aef90ce3b atomic64_32.o.after.asm Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: David Howells Cc: Andrew Morton Cc: Arnd Bergmann LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/lib/atomic64_32.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c index fd28fd3fb74..cd11803f944 100644 --- a/arch/x86/lib/atomic64_32.c +++ b/arch/x86/lib/atomic64_32.c @@ -62,9 +62,17 @@ void atomic64_set(atomic64_t *ptr, u64 new_val) */ u64 atomic64_read(atomic64_t *ptr) { - u64 old = 1LL << 32; + u64 res; - return cmpxchg8b(&ptr->counter, old, old); + asm volatile( + "mov %%ebx, %%eax\n\t" + "mov %%ecx, %%edx\n\t" + LOCK_PREFIX "cmpxchg8b %1\n" + : "+A" (res) + : "m" (*ptr) + ); + + return res; } /** -- cgit v1.2.3 From 1fde902d52ee13ab9fab155bbae757fdf7daf0c1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 17:28:57 +0200 Subject: x86: atomic64: Export APIs to modules atomic64_t primitives are used by a handful of drivers, so export the APIs consistently. These were inlined before. Also mark atomic64_32.o a core object, so that the symbols are available even if not linked to core kernel pieces. 
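As a hedged illustration of why both changes matter on 32-bit x86 (this is a hypothetical module, not part of the patch): a driver built as a module can only resolve the out-of-line atomic64_* functions if the symbols are exported, and building atomic64_32.o as obj-y keeps the implementation in the kernel image even when no built-in code references it, so the exports are always available to modules.

  #include <linux/module.h>
  #include <linux/kernel.h>
  #include <linux/init.h>
  #include <asm/atomic.h>

  static atomic64_t demo_bytes = ATOMIC64_INIT(0);	/* 64-bit counter on a 32-bit kernel */

  static int __init atomic64_demo_init(void)
  {
  	atomic64_add(1500, &demo_bytes);	/* resolves against the exported atomic64_add() */
  	pr_info("demo_bytes = %llu\n",
  		(unsigned long long)atomic64_read(&demo_bytes));
  	return 0;
  }

  static void __exit atomic64_demo_exit(void)
  {
  }

  module_init(atomic64_demo_init);
  module_exit(atomic64_demo_exit);
  MODULE_LICENSE("GPL");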
Cc: Eric Dumazet Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: David Howells Cc: Andrew Morton Cc: Arnd Bergmann LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/lib/Makefile | 2 +- arch/x86/lib/atomic64_32.c | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index c3c657c8bb8..07c31899c9c 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -10,7 +10,7 @@ lib-y += usercopy_$(BITS).o getuser.o putuser.o lib-y += memcpy_$(BITS).o ifeq ($(CONFIG_X86_32),y) - lib-y += atomic64_32.o + obj-y += atomic64_32.o lib-y += checksum_32.o lib-y += strstr_32.o lib-y += semaphore_32.o string_32.o diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c index cd11803f944..6722a092e40 100644 --- a/arch/x86/lib/atomic64_32.c +++ b/arch/x86/lib/atomic64_32.c @@ -1,5 +1,7 @@ #include +#include #include + #include #include #include @@ -21,6 +23,7 @@ u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val) { return cmpxchg8b(&ptr->counter, old_val, new_val); } +EXPORT_SYMBOL(atomic64_cmpxchg); /** * atomic64_xchg - xchg atomic64 variable @@ -41,6 +44,7 @@ u64 atomic64_xchg(atomic64_t *ptr, u64 new_val) return old_val; } +EXPORT_SYMBOL(atomic64_xchg); /** * atomic64_set - set atomic64 variable @@ -53,6 +57,7 @@ void atomic64_set(atomic64_t *ptr, u64 new_val) { atomic64_xchg(ptr, new_val); } +EXPORT_SYMBOL(atomic64_read); /** * atomic64_read - read atomic64 variable @@ -74,6 +79,7 @@ u64 atomic64_read(atomic64_t *ptr) return res; } +EXPORT_SYMBOL(atomic64_read); /** * atomic64_add_return - add and return @@ -103,21 +109,25 @@ noinline u64 atomic64_add_return(u64 delta, atomic64_t *ptr) return new_val; } +EXPORT_SYMBOL(atomic64_add_return); u64 atomic64_sub_return(u64 delta, atomic64_t *ptr) { return atomic64_add_return(-delta, ptr); } +EXPORT_SYMBOL(atomic64_sub_return); u64 atomic64_inc_return(atomic64_t *ptr) { return atomic64_add_return(1, ptr); } +EXPORT_SYMBOL(atomic64_inc_return); u64 atomic64_dec_return(atomic64_t *ptr) { return atomic64_sub_return(1, ptr); } +EXPORT_SYMBOL(atomic64_dec_return); /** * atomic64_add - add integer to atomic64 variable @@ -130,6 +140,7 @@ void atomic64_add(u64 delta, atomic64_t *ptr) { atomic64_add_return(delta, ptr); } +EXPORT_SYMBOL(atomic64_add); /** * atomic64_sub - subtract the atomic64 variable @@ -142,6 +153,7 @@ void atomic64_sub(u64 delta, atomic64_t *ptr) { atomic64_add(-delta, ptr); } +EXPORT_SYMBOL(atomic64_sub); /** * atomic64_sub_and_test - subtract value from variable and test result @@ -158,6 +170,7 @@ int atomic64_sub_and_test(u64 delta, atomic64_t *ptr) return old_val == 0; } +EXPORT_SYMBOL(atomic64_sub_and_test); /** * atomic64_inc - increment atomic64 variable @@ -169,6 +182,7 @@ void atomic64_inc(atomic64_t *ptr) { atomic64_add(1, ptr); } +EXPORT_SYMBOL(atomic64_inc); /** * atomic64_dec - decrement atomic64 variable @@ -180,6 +194,7 @@ void atomic64_dec(atomic64_t *ptr) { atomic64_sub(1, ptr); } +EXPORT_SYMBOL(atomic64_dec); /** * atomic64_dec_and_test - decrement and test @@ -193,6 +208,7 @@ int atomic64_dec_and_test(atomic64_t *ptr) { return atomic64_sub_and_test(1, ptr); } +EXPORT_SYMBOL(atomic64_dec_and_test); /** * atomic64_inc_and_test - increment and test @@ -206,6 +222,7 @@ int atomic64_inc_and_test(atomic64_t *ptr) { return atomic64_sub_and_test(-1, ptr); } +EXPORT_SYMBOL(atomic64_inc_and_test); /** * 
atomic64_add_negative - add and test if negative @@ -222,3 +239,4 @@ int atomic64_add_negative(u64 delta, atomic64_t *ptr) return old_val < 0; } +EXPORT_SYMBOL(atomic64_add_negative); -- cgit v1.2.3 From 3a8d1788b37435baf6c296f4ea8beb4fa4955f44 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 19:56:36 +0200 Subject: x86: atomic64: Improve atomic64_xchg() Remove the read-first logic from atomic64_xchg() and simplify the loop. This function was the last user of __atomic64_read() - remove it. Also, change the 'real_val' assumption from the somewhat quirky 1ULL << 32 value to the (just as arbitrary, but simpler) value of 0. Reported-by: Linus Torvalds Cc: Eric Dumazet Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: David Howells Cc: Andrew Morton Cc: Arnd Bergmann LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/atomic_32.h | 9 --------- arch/x86/lib/atomic64_32.c | 21 +++++++++++++++------ 2 files changed, 15 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h index aa045deb2e7..d7c8849b8c6 100644 --- a/arch/x86/include/asm/atomic_32.h +++ b/arch/x86/include/asm/atomic_32.h @@ -268,15 +268,6 @@ typedef struct { #define ATOMIC64_INIT(val) { (val) } -/** - * atomic64_read - read atomic64 variable - * @ptr: pointer of type atomic64_t - * - * Atomically reads the value of @v. - * Doesn't imply a read memory barrier. - */ -#define __atomic64_read(ptr) ((ptr)->counter) - extern u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val); /** diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c index 6722a092e40..a804f96e90e 100644 --- a/arch/x86/lib/atomic64_32.c +++ b/arch/x86/lib/atomic64_32.c @@ -33,14 +33,23 @@ EXPORT_SYMBOL(atomic64_cmpxchg); * Atomically xchgs the value of @ptr to @new_val and returns * the old value. */ - u64 atomic64_xchg(atomic64_t *ptr, u64 new_val) { - u64 old_val; + /* + * Try first with a (possibly incorrect) assumption about + * what we have there. We'll do two loops most likely, + * but we'll get an ownership MESI transaction straight away + * instead of a read transaction followed by a + * flush-for-ownership transaction: + */ + u64 old_val, real_val = 0; do { - old_val = __atomic64_read(ptr); - } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val); + old_val = real_val; + + real_val = atomic64_cmpxchg(ptr, old_val, new_val); + + } while (real_val != old_val); return old_val; } @@ -91,13 +100,13 @@ EXPORT_SYMBOL(atomic64_read); noinline u64 atomic64_add_return(u64 delta, atomic64_t *ptr) { /* - * Try first with a (probably incorrect) assumption about + * Try first with a (possibly incorrect) assumption about * what we have there. We'll do two loops most likely, * but we'll get an ownership MESI transaction straight away * instead of a read transaction followed by a * flush-for-ownership transaction: */ - u64 old_val, new_val, real_val = 1ULL << 32; + u64 old_val, new_val, real_val = 0; do { old_val = real_val; -- cgit v1.2.3 From ddf9a003d32f720805ac30bcc15755e9289073de Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2009 20:11:30 +0200 Subject: x86: atomic64: Clean up atomic64_sub_and_test() and atomic64_add_negative() Linus noticed that the variable name 'old_val' is confusingly named in these functions - the correct naming is 'new_val'. 
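In other words (a two-line sketch of the reasoning, mirroring the diff below): atomic64_sub_return() hands back the value of the counter after the subtraction, so the local holds the new contents, not the old ones.

  u64 new_val = atomic64_sub_return(delta, ptr);	/* value after the subtract */
  return new_val == 0;				/* true when the counter just reached zero */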
Reported-by: Linus Torvalds Cc: Eric Dumazet Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: David Howells Cc: Andrew Morton Cc: Arnd Bergmann LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/lib/atomic64_32.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c index a804f96e90e..1d98c9eb6ea 100644 --- a/arch/x86/lib/atomic64_32.c +++ b/arch/x86/lib/atomic64_32.c @@ -175,9 +175,9 @@ EXPORT_SYMBOL(atomic64_sub); */ int atomic64_sub_and_test(u64 delta, atomic64_t *ptr) { - u64 old_val = atomic64_sub_return(delta, ptr); + u64 new_val = atomic64_sub_return(delta, ptr); - return old_val == 0; + return new_val == 0; } EXPORT_SYMBOL(atomic64_sub_and_test); @@ -244,8 +244,8 @@ EXPORT_SYMBOL(atomic64_inc_and_test); */ int atomic64_add_negative(u64 delta, atomic64_t *ptr) { - long long old_val = atomic64_add_return(delta, ptr); + s64 new_val = atomic64_add_return(delta, ptr); - return old_val < 0; + return new_val < 0; } EXPORT_SYMBOL(atomic64_add_negative); -- cgit v1.2.3 From a79f0da80a508448434476b77f9d3d1a469eab67 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 3 Jul 2009 16:50:10 +0200 Subject: x86: atomic64: Inline atomic64_read() again Now atomic64_read() is light weight (no register pressure and small icache), we can inline it again. Also use "=&A" constraint instead of "+A" to avoid warning about unitialized 'res' variable. (gcc had to force 0 in eax/edx) $ size vmlinux.prev vmlinux.after text data bss dec hex filename 4908667 451676 1684868 7045211 6b805b vmlinux.prev 4908651 451676 1684868 7045195 6b804b vmlinux.after Signed-off-by: Eric Dumazet Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: David Howells Cc: Andrew Morton Cc: Arnd Bergmann LKML-Reference: <4A4E1AA2.30002@gmail.com> [ Also fix typo in atomic64_set() export ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/atomic_32.h | 22 ++++++++++++++++++++++ arch/x86/lib/atomic64_32.c | 23 +---------------------- 2 files changed, 23 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h index d7c8849b8c6..dc5a667ff79 100644 --- a/arch/x86/include/asm/atomic_32.h +++ b/arch/x86/include/asm/atomic_32.h @@ -295,6 +295,28 @@ extern void atomic64_set(atomic64_t *ptr, u64 new_val); * * Atomically reads the value of @ptr and returns it. */ +static inline u64 atomic64_read(atomic64_t *ptr) +{ + u64 res; + + /* + * Note, we inline this atomic64_t primitive because + * it only clobbers EAX/EDX and leaves the others + * untouched. 
We also (somewhat subtly) rely on the + * fact that cmpxchg8b returns the current 64-bit value + * of the memory location we are touching: + */ + asm volatile( + "mov %%ebx, %%eax\n\t" + "mov %%ecx, %%edx\n\t" + LOCK_PREFIX "cmpxchg8b %1\n" + : "=&A" (res) + : "m" (*ptr) + ); + + return res; +} + extern u64 atomic64_read(atomic64_t *ptr); /** diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c index 1d98c9eb6ea..824fa0be55a 100644 --- a/arch/x86/lib/atomic64_32.c +++ b/arch/x86/lib/atomic64_32.c @@ -66,31 +66,10 @@ void atomic64_set(atomic64_t *ptr, u64 new_val) { atomic64_xchg(ptr, new_val); } -EXPORT_SYMBOL(atomic64_read); +EXPORT_SYMBOL(atomic64_set); /** - * atomic64_read - read atomic64 variable - * @ptr: pointer to type atomic64_t - * - * Atomically reads the value of @ptr and returns it. - */ -u64 atomic64_read(atomic64_t *ptr) -{ - u64 res; - - asm volatile( - "mov %%ebx, %%eax\n\t" - "mov %%ecx, %%edx\n\t" - LOCK_PREFIX "cmpxchg8b %1\n" - : "+A" (res) - : "m" (*ptr) - ); - - return res; -} EXPORT_SYMBOL(atomic64_read); - -/** * atomic64_add_return - add and return * @delta: integer value to add * @ptr: pointer to type atomic64_t -- cgit v1.2.3 From 62edf5dc4a524e4cb525e6020b955a1ad593d9ba Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 4 Jul 2009 10:59:46 +0100 Subject: intel-iommu: Restore DMAR_BROKEN_GFX_WA option for broken graphics drivers We need to give people a little more time to fix the broken drivers. Re-introduce this, but tied in properly with the 'iommu=pt' support this time. Change the config option name and make it default to 'no' too. Signed-off-by: David Woodhouse --- arch/x86/Kconfig | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c07f7220590..738bdc6b0f8 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1913,6 +1913,18 @@ config DMAR_DEFAULT_ON recommended you say N here while the DMAR code remains experimental. +config DMAR_BROKEN_GFX_WA + def_bool n + prompt "Workaround broken graphics drivers (going away soon)" + depends on DMAR + ---help--- + Current Graphics drivers tend to use physical address + for DMA and avoid using DMA APIs. Setting this config + option permits the IOMMU driver to set a unity map for + all the OS-visible memory. Hence the driver can continue + to use physical addresses for DMA, at least until this + option is removed in the 2.6.32 kernel. + config DMAR_FLOPPY_WA def_bool y depends on DMAR -- cgit v1.2.3 From febe04de3be4bf66f9339d8847db2806d99fd164 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 1 Jul 2009 11:13:07 +0900 Subject: x86: fix usage of bios intcall() Some intcall() misuses the input biosregs as output in cf06de7b9cdd3efee7a59dced1977b3c21d43732 This fixes the problem vga=ask boot option doesn't show enough modes. Signed-off-by: Akinobu Mita LKML-Reference: <20090701021307.GA3127@localhost.localdomain> Signed-off-by: H. 
Peter Anvin --- arch/x86/boot/video-bios.c | 3 +-- arch/x86/boot/video-vesa.c | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/video-bios.c b/arch/x86/boot/video-bios.c index d660be49236..49e0c18833e 100644 --- a/arch/x86/boot/video-bios.c +++ b/arch/x86/boot/video-bios.c @@ -37,14 +37,13 @@ static int set_bios_mode(u8 mode) ireg.al = mode; /* AH=0x00 Set Video Mode */ intcall(0x10, &ireg, NULL); - ireg.ah = 0x0f; /* Get Current Video Mode */ intcall(0x10, &ireg, &oreg); do_restore = 1; /* Assume video contents were lost */ /* Not all BIOSes are clean with the top bit */ - new_mode = ireg.al & 0x7f; + new_mode = oreg.al & 0x7f; if (new_mode == mode) return 0; /* Mode change OK */ diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c index c700147d6ff..275dd177f19 100644 --- a/arch/x86/boot/video-vesa.c +++ b/arch/x86/boot/video-vesa.c @@ -45,7 +45,7 @@ static int vesa_probe(void) ireg.di = (size_t)&vginfo; intcall(0x10, &ireg, &oreg); - if (ireg.ax != 0x004f || + if (oreg.ax != 0x004f || vginfo.signature != VESA_MAGIC || vginfo.version < 0x0102) return 0; /* Not present */ @@ -70,7 +70,7 @@ static int vesa_probe(void) ireg.di = (size_t)&vminfo; intcall(0x10, &ireg, &oreg); - if (ireg.ax != 0x004f) + if (oreg.ax != 0x004f) continue; if ((vminfo.mode_attr & 0x15) == 0x05) { -- cgit v1.2.3 From f386c61fe1a1f36f0e434f1b577e6b112698caf7 Mon Sep 17 00:00:00 2001 From: Peter Oberparleiter Date: Sun, 5 Jul 2009 12:08:06 -0700 Subject: gcov: exclude code operating in userspace from profiling Fix for this issue on x86_64: rostedt@goodmis.org wrote: > On bootup of the latest kernel my init segfaults. Debugging it, > I found that vread_tsc (a vsyscall) increments some strange > kernel memory: > > 0000000000000000 : > 0: 55 push %rbp > 1: 48 ff 05 00 00 00 00 incq 0(%rip) > # 8 > 4: R_X86_64_PC32 .bss+0x3c > 8: 48 89 e5 mov %rsp,%rbp > b: 66 66 90 xchg %ax,%ax > e: 48 ff 05 00 00 00 00 incq 0(%rip) > # 15 > 11: R_X86_64_PC32 .bss+0x44 > 15: 66 66 90 xchg %ax,%ax > 18: 48 ff 05 00 00 00 00 incq 0(%rip) > # 1f > 1b: R_X86_64_PC32 .bss+0x4c > 1f: 0f 31 rdtsc > > > Those "incq" is very bad to happen in vsyscall memory, since > userspace can not modify it. You need to make something prevent > profiling of vsyscall memory (like I do with ftrace). Signed-off-by: Peter Oberparleiter Cc: Ingo Molnar Reported-by: Steven Rostedt Tested-by: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/Makefile | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 6c327b852e2..430d5b24af7 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -26,6 +26,8 @@ CFLAGS_tsc.o := $(nostackp) CFLAGS_paravirt.o := $(nostackp) GCOV_PROFILE_vsyscall_64.o := n GCOV_PROFILE_hpet.o := n +GCOV_PROFILE_tsc.o := n +GCOV_PROFILE_paravirt.o := n obj-y := process_$(BITS).o signal.o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o -- cgit v1.2.3 From a2e1b4c31257c07f148a89eb7eea7ca959fd0642 Mon Sep 17 00:00:00 2001 From: Mark Langsdorf Date: Sun, 26 Jul 2009 10:55:25 -0500 Subject: [CPUFREQ] Powernow-k8: support family 0xf with 2 low p-states Provide support for family 0xf processors with 2 P-states below the elevator voltage. Remove the checks that prevent this configuration from being supported and increase the transition voltage to prevent errors during the transition. 
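A condensed sketch of the new phase-1 (voltage ramp) logic, simplified from the diff below rather than taken verbatim: when both the current and the requested FID sit in the low FID table, the ramp-voltage-offset (rvo) is applied twice, raising the core voltage further before the frequency step so the lo-lo transition no longer has to be rejected.

  u32 rvomult = 1;
  u32 rvosteps;

  if ((savefid < LO_FID_TABLE_TOP) && (reqfid < LO_FID_TABLE_TOP))
  	rvomult = 2;			/* two low P-states: double the ramp */
  rvosteps = rvomult * data->rvo;

  while ((rvosteps > 0) &&
         ((rvomult * data->rvo + data->currvid) > reqvid)) {
  	/* take one VID step (raising the core voltage), consume one rvostep */
  }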
Signed-off-by: Mark Langsdorf Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 30 +++++++++++------------------- arch/x86/kernel/cpu/cpufreq/powernow-k8.h | 3 ++- 2 files changed, 13 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 81cbe64ed6b..ae068f59603 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -299,7 +299,7 @@ static int transition_pstate(struct powernow_k8_data *data, u32 pstate) static int transition_fid_vid(struct powernow_k8_data *data, u32 reqfid, u32 reqvid) { - if (core_voltage_pre_transition(data, reqvid)) + if (core_voltage_pre_transition(data, reqvid, reqfid)) return 1; if (core_frequency_transition(data, reqfid)) @@ -327,17 +327,20 @@ static int transition_fid_vid(struct powernow_k8_data *data, /* Phase 1 - core voltage transition ... setup voltage */ static int core_voltage_pre_transition(struct powernow_k8_data *data, - u32 reqvid) + u32 reqvid, u32 reqfid) { u32 rvosteps = data->rvo; u32 savefid = data->currfid; - u32 maxvid, lo; + u32 maxvid, lo, rvomult = 1; dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, " "reqvid 0x%x, rvo 0x%x\n", smp_processor_id(), data->currfid, data->currvid, reqvid, data->rvo); + if ((savefid < LO_FID_TABLE_TOP) && (reqfid < LO_FID_TABLE_TOP)) + rvomult = 2; + rvosteps *= rvomult; rdmsr(MSR_FIDVID_STATUS, lo, maxvid); maxvid = 0x1f & (maxvid >> 16); dprintk("ph1 maxvid=0x%x\n", maxvid); @@ -351,7 +354,8 @@ static int core_voltage_pre_transition(struct powernow_k8_data *data, return 1; } - while ((rvosteps > 0) && ((data->rvo + data->currvid) > reqvid)) { + while ((rvosteps > 0) && + ((rvomult * data->rvo + data->currvid) > reqvid)) { if (data->currvid == maxvid) { rvosteps = 0; } else { @@ -384,13 +388,6 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid) u32 vcoreqfid, vcocurrfid, vcofiddiff; u32 fid_interval, savevid = data->currvid; - if ((reqfid < HI_FID_TABLE_BOTTOM) && - (data->currfid < HI_FID_TABLE_BOTTOM)) { - printk(KERN_ERR PFX "ph2: illegal lo-lo transition " - "0x%x 0x%x\n", reqfid, data->currfid); - return 1; - } - if (data->currfid == reqfid) { printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n", data->currfid); @@ -407,6 +404,9 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid) vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid : vcoreqfid - vcocurrfid; + if ((reqfid <= LO_FID_TABLE_TOP) && (data->currfid <= LO_FID_TABLE_TOP)) + vcofiddiff = 0; + while (vcofiddiff > 2) { (data->currfid & 1) ? (fid_interval = 1) : (fid_interval = 2); @@ -1081,14 +1081,6 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, return 0; } - if ((fid < HI_FID_TABLE_BOTTOM) && - (data->currfid < HI_FID_TABLE_BOTTOM)) { - printk(KERN_ERR PFX - "ignoring illegal change in lo freq table-%x to 0x%x\n", - data->currfid, fid); - return 1; - } - dprintk("cpu %d, changing to fid 0x%x, vid 0x%x\n", smp_processor_id(), fid, vid); freqs.old = find_khz_freq_from_fid(data->currfid); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h index c9c1190b5e1..02ce824073c 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h @@ -215,7 +215,8 @@ struct pst_s { #define dprintk(msg...) 
cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k8", msg) -static int core_voltage_pre_transition(struct powernow_k8_data *data, u32 reqvid); +static int core_voltage_pre_transition(struct powernow_k8_data *data, + u32 reqvid, u32 regfid); static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid); static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid); -- cgit v1.2.3 From 00024be9689963c55cf2ab10ce3fb4502016c09a Mon Sep 17 00:00:00 2001 From: Peter Chubb Date: Wed, 8 Jul 2009 13:20:13 +0200 Subject: x86: Fix resume from suspend when CONFIG_CC_STACKPROTECTOR Patch 08687aec71bc9134fe336e561f6did877bacf74fc0a (x86: unify power/cpu_(32|64).c) renamed cpu_32.c to cpu.c, but did not update the special compilation flags for the file for the new name. This patch fixes the compilation flags, and therefore fixes resume from suspend on my Acer Aspire One. [rjw: The regression from 2.6.30 fixed by this patch is tracked as http://bugzilla.kernel.org/show_bug.cgi?id=13661] Signed-off-by: Peter Chubb Signed-off-by: Rafael J. Wysocki --- arch/x86/power/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile index de2abbd0754..a6a198c3362 100644 --- a/arch/x86/power/Makefile +++ b/arch/x86/power/Makefile @@ -1,7 +1,7 @@ # __restore_processor_state() restores %gs after S3 resume and so should not # itself be stack-protected nostackp := $(call cc-option, -fno-stack-protector) -CFLAGS_cpu_$(BITS).o := $(nostackp) +CFLAGS_cpu.o := $(nostackp) obj-$(CONFIG_PM_SLEEP) += cpu.o obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o -- cgit v1.2.3 From ad361c9884e809340f6daca80d56a9e9c871690a Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 6 Jul 2009 13:05:40 -0700 Subject: Remove multiple KERN_ prefixes from printk formats Commit 5fd29d6ccbc98884569d6f3105aeca70858b3e0f ("printk: clean up handling of log-levels and newlines") changed printk semantics. printk lines with multiple KERN_ prefixes are no longer emitted as before the patch. is now included in the output on each additional use. Remove all uses of multiple KERN_s in formats. 
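A short before/after sketch, with strings borrowed from the e820.c and mce.c hunks below and trimmed for brevity: under the new printk semantics an embedded second prefix is no longer consumed as a log level but printed as part of the message, so multi-line messages should carry a single prefix, and genuine continuations of an already started line should use KERN_CONT.

  /* Before: the second KERN_ERR leaks into the output mid-message. */
  printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n"
         KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n");

  /* After: one prefix for the whole multi-line message ... */
  printk(KERN_ERR
         "PCI: Warning: Cannot find a gap in the 32bit address range\n"
         "PCI: Unassigned devices with 32bit resource registers may break!\n");

  /* ... and KERN_CONT when finishing a line begun by an earlier printk. */
  printk(KERN_EMERG "TSC %llx ", m->tsc);
  printk(KERN_CONT "\n");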
Signed-off-by: Joe Perches Signed-off-by: Linus Torvalds --- arch/x86/kernel/apic/io_apic.c | 1 - arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 2 +- arch/x86/kernel/cpu/mcheck/mce.c | 12 ++++++------ arch/x86/kernel/e820.c | 7 +++---- arch/x86/kernel/pci-gart_64.c | 2 +- arch/x86/mm/fault.c | 9 +++++---- 6 files changed, 16 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 8fd1efb5a0b..90b5e6efa93 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1739,7 +1739,6 @@ __apicdebuginit(void) print_local_APIC(void *dummy) if (apic_verbosity == APIC_QUIET) return; - printk(KERN_DEBUG "\n"); printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", smp_processor_id(), hard_smp_processor_id()); v = apic_read(APIC_ID); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index ae068f59603..2a50ef89100 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1259,7 +1259,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) { static const char ACPI_PSS_BIOS_BUG_MSG[] = KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n" - KERN_ERR FW_BUG PFX "Try again with latest BIOS.\n"; + FW_BUG PFX "Try again with latest BIOS.\n"; struct powernow_k8_data *data; struct init_on_cpu init_on_cpu; int rc; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index af425b83202..484c1e5f658 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -194,14 +194,14 @@ static void print_mce(struct mce *m) m->cs, m->ip); if (m->cs == __KERNEL_CS) print_symbol("{%s}", m->ip); - printk("\n"); + printk(KERN_CONT "\n"); } printk(KERN_EMERG "TSC %llx ", m->tsc); if (m->addr) - printk("ADDR %llx ", m->addr); + printk(KERN_CONT "ADDR %llx ", m->addr); if (m->misc) - printk("MISC %llx ", m->misc); - printk("\n"); + printk(KERN_CONT "MISC %llx ", m->misc); + printk(KERN_CONT "\n"); printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); @@ -209,13 +209,13 @@ static void print_mce(struct mce *m) static void print_mce_head(void) { - printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n"); + printk(KERN_EMERG "\nHARDWARE ERROR\n"); } static void print_mce_tail(void) { printk(KERN_EMERG "This is not a software problem!\n" - KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n"); + "Run through mcelog --ascii to decode and contact your hardware vendor\n"); } #define PANIC_TIMEOUT 5 /* 5 seconds */ diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index c4ca89d9aaf..5cb5725b2ba 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -627,10 +627,9 @@ __init void e820_setup_gap(void) #ifdef CONFIG_X86_64 if (!found) { gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024; - printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit " - "address range\n" - KERN_ERR "PCI: Unassigned devices with 32bit resource " - "registers may break!\n"); + printk(KERN_ERR + "PCI: Warning: Cannot find a gap in the 32bit address range\n" + "PCI: Unassigned devices with 32bit resource registers may break!\n"); } #endif diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index cfd9f906389..d2e56b8f48e 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -675,7 +675,7 @@ static __init 
int init_k8_gatt(struct agp_kern_info *info) nommu: /* Should not happen anymore */ printk(KERN_WARNING "PCI-DMA: More than 4GB of RAM and no IOMMU\n" - KERN_WARNING "falling back to iommu=soft.\n"); + "falling back to iommu=soft.\n"); return -1; } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 78a5fff857b..85307cc6e45 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -426,10 +426,11 @@ static noinline int vmalloc_fault(unsigned long address) } static const char errata93_warning[] = -KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" -KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" -KERN_ERR "******* Please consider a BIOS update.\n" -KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; +KERN_ERR +"******* Your BIOS seems to not contain a fix for K8 errata #93\n" +"******* Working around it, but it may cause SEGVs or burn power.\n" +"******* Please consider a BIOS update.\n" +"******* Disabling USB legacy in the BIOS may also help.\n"; /* * No vm86 mode in 64-bit mode: -- cgit v1.2.3 From 44b572809581d5a10dbe35aa6bf689f32b9c5ad6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 8 Jul 2009 09:50:19 -0700 Subject: x86: don't clear nodes_states[N_NORMAL_MEMORY] when numa is not compiled in Alex found that specjbb2005 still can not run with hugepages on an x86-64 machine. This only happens when numa is not compiled in. The root cause: node_set_state will not set it back for us in that case, so don't clear that when numa is not select in config [ v2: use node_clear_state instead ] Reported-and-Tested-by: Alex Shi Signed-off-by: Yinghai Lu Reviewed-by: Christoph Lameter Signed-off-by: Linus Torvalds --- arch/x86/mm/init_64.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index b177652251a..6176fe8f29e 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -598,8 +598,15 @@ void __init paging_init(void) sparse_memory_present_with_active_regions(MAX_NUMNODES); sparse_init(); - /* clear the default setting with node 0 */ - nodes_clear(node_states[N_NORMAL_MEMORY]); + + /* + * clear the default setting with node 0 + * note: don't use nodes_clear here, that is really clearing when + * numa support is not compiled in, and later node_set_state + * will not set it back. + */ + node_clear_state(0, N_NORMAL_MEMORY); + free_area_init_nodes(max_zone_pfns); } -- cgit v1.2.3 From ad46276952f1af34cd91d46d49ba13d347d56367 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 8 Jul 2009 12:10:31 +0000 Subject: memory barrier: adding smp_mb__after_lock Adding smp_mb__after_lock define to be used as a smp_mb call after a lock. Making it nop for x86, since {read|write|spin}_lock() on x86 are full memory barriers. Signed-off-by: Jiri Olsa Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- arch/x86/include/asm/spinlock.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index b7e5db87639..4e77853321d 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -302,4 +302,8 @@ static inline void __raw_write_unlock(raw_rwlock_t *rw) #define _raw_read_relax(lock) cpu_relax() #define _raw_write_relax(lock) cpu_relax() +/* The {read|write|spin}_lock() on x86 are full memory barriers. 
*/ +static inline void smp_mb__after_lock(void) { } +#define ARCH_HAS_SMP_MB_AFTER_LOCK + #endif /* _ASM_X86_SPINLOCK_H */ -- cgit v1.2.3 From 8d7ff4f2a0b22b7d6d7bc3982257d1dadea22824 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 23 Jun 2009 11:48:14 +0200 Subject: x86/oprofile: rename kernel parameter for architectural perfmon to arch_perfmon The short name of the achitecture is 'arch_perfmon'. This patch changes the kernel parameter to use this name. Cc: Andi Kleen Signed-off-by: Robert Richter Signed-off-by: Ingo Molnar --- arch/x86/oprofile/nmi_int.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index b07dd8d0b32..89b9a5cd63d 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -390,7 +390,7 @@ static int __init p4_init(char **cpu_type) static int force_arch_perfmon; static int force_cpu_type(const char *str, struct kernel_param *kp) { - if (!strcmp(str, "archperfmon")) { + if (!strcmp(str, "arch_perfmon")) { force_arch_perfmon = 1; printk(KERN_INFO "oprofile: forcing architectural perfmon\n"); } -- cgit v1.2.3 From 8aa7e847d834ed937a9ad37a0f2ad5b8584c1ab0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 9 Jul 2009 14:52:32 +0200 Subject: Fix congestion_wait() sync/async vs read/write confusion Commit 1faa16d22877f4839bd433547d770c676d1d964c accidentally broke the bdi congestion wait queue logic, causing us to wait on congestion for WRITE (== 1) when we really wanted BLK_RW_ASYNC (== 0) instead. Signed-off-by: Jens Axboe --- arch/x86/lib/usercopy_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 7c8ca91bb9e..1f118d462ac 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -751,7 +751,7 @@ survive: if (retval == -ENOMEM && is_global_init(current)) { up_read(¤t->mm->mmap_sem); - congestion_wait(WRITE, HZ/50); + congestion_wait(BLK_RW_ASYNC, HZ/50); goto survive; } -- cgit v1.2.3 From c99e6efe1ba04561e7d93a81f0be07e37427e835 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 10 Jul 2009 14:57:56 +0200 Subject: sched: INIT_PREEMPT_COUNT Pull the initial preempt_count value into a single definition site. Maintainers for: alpha, ia64 and m68k, please have a look, your arch code is funny. The header magic is a bit odd, but similar to the KERNEL_DS one, CPP waits with expanding these macros until the INIT_THREAD_INFO macro itself is expanded, which is in arch/*/kernel/init_task.c where we've already included sched.h so we're good. Cc: tony.luck@intel.com Cc: rth@twiddle.net Cc: geert@linux-m68k.org Signed-off-by: Peter Zijlstra Acked-by: Matt Mackall Signed-off-by: Linus Torvalds --- arch/x86/include/asm/thread_info.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index b0783520988..fad7d40b75f 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -49,7 +49,7 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .flags = 0, \ .cpu = 0, \ - .preempt_count = 1, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ .restart_block = { \ .fn = do_no_restart_syscall, \ -- cgit v1.2.3