diff options
Diffstat (limited to 'arch/x86/mm')
-rw-r--r-- | arch/x86/mm/Makefile | 16 | ||||
-rw-r--r-- | arch/x86/mm/Makefile_32 | 9 | ||||
-rw-r--r-- | arch/x86/mm/Makefile_64 | 9 | ||||
-rw-r--r-- | arch/x86/mm/discontig_32.c | 2 | ||||
-rw-r--r-- | arch/x86/mm/dump_pagetables.c | 354 | ||||
-rw-r--r-- | arch/x86/mm/fault.c | 6 | ||||
-rw-r--r-- | arch/x86/mm/init_32.c | 35 | ||||
-rw-r--r-- | arch/x86/mm/init_64.c | 222 | ||||
-rw-r--r-- | arch/x86/mm/ioremap.c | 152 | ||||
-rw-r--r-- | arch/x86/mm/numa_64.c | 6 | ||||
-rw-r--r-- | arch/x86/mm/pageattr.c | 107 | ||||
-rw-r--r-- | arch/x86/mm/pat.c | 421 | ||||
-rw-r--r-- | arch/x86/mm/pgtable_32.c | 8 | ||||
-rw-r--r-- | arch/x86/mm/srat_64.c | 7 |
14 files changed, 1221 insertions, 133 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 98329109684..20941d2954e 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -1,5 +1,17 @@ +obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ + pat.o + +obj-$(CONFIG_X86_32) += pgtable_32.o + +obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o +obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o + +obj-$(CONFIG_HIGHMEM) += highmem_32.o + ifeq ($(CONFIG_X86_32),y) -include ${srctree}/arch/x86/mm/Makefile_32 +obj-$(CONFIG_NUMA) += discontig_32.o else -include ${srctree}/arch/x86/mm/Makefile_64 +obj-$(CONFIG_NUMA) += numa_64.o +obj-$(CONFIG_K8_NUMA) += k8topology_64.o +obj-$(CONFIG_ACPI_NUMA) += srat_64.o endif diff --git a/arch/x86/mm/Makefile_32 b/arch/x86/mm/Makefile_32 deleted file mode 100644 index c36ae88bb54..00000000000 --- a/arch/x86/mm/Makefile_32 +++ /dev/null @@ -1,9 +0,0 @@ -# -# Makefile for the linux i386-specific parts of the memory manager. -# - -obj-y := init_32.o pgtable_32.o fault.o ioremap.o extable.o pageattr.o mmap.o - -obj-$(CONFIG_NUMA) += discontig_32.o -obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -obj-$(CONFIG_HIGHMEM) += highmem_32.o diff --git a/arch/x86/mm/Makefile_64 b/arch/x86/mm/Makefile_64 deleted file mode 100644 index 688c8c28ac8..00000000000 --- a/arch/x86/mm/Makefile_64 +++ /dev/null @@ -1,9 +0,0 @@ -# -# Makefile for the linux x86_64-specific parts of the memory manager. -# - -obj-y := init_64.o fault.o ioremap.o extable.o pageattr.o mmap.o -obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -obj-$(CONFIG_NUMA) += numa_64.o -obj-$(CONFIG_K8_NUMA) += k8topology_64.o -obj-$(CONFIG_ACPI_NUMA) += srat_64.o diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 8e25e06ff73..eba0bbede7a 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -37,7 +37,7 @@ #include <asm/e820.h> #include <asm/setup.h> #include <asm/mmzone.h> -#include <bios_ebda.h> +#include <asm/bios_ebda.h> struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c new file mode 100644 index 00000000000..6791b8334bc --- /dev/null +++ b/arch/x86/mm/dump_pagetables.c @@ -0,0 +1,354 @@ +/* + * Debug helper to dump the current kernel pagetables of the system + * so that we can see what the various memory ranges are set to. + * + * (C) Copyright 2008 Intel Corporation + * + * Author: Arjan van de Ven <arjan@linux.intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include <linux/debugfs.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/seq_file.h> + +#include <asm/pgtable.h> + +/* + * The dumper groups pagetable entries of the same type into one, and for + * that it needs to keep some state when walking, and flush this state + * when a "break" in the continuity is found. + */ +struct pg_state { + int level; + pgprot_t current_prot; + unsigned long start_address; + unsigned long current_address; + const struct addr_marker *marker; +}; + +struct addr_marker { + unsigned long start_address; + const char *name; +}; + +/* Address space markers hints */ +static struct addr_marker address_markers[] = { + { 0, "User Space" }, +#ifdef CONFIG_X86_64 + { 0x8000000000000000UL, "Kernel Space" }, + { 0xffff810000000000UL, "Low Kernel Mapping" }, + { VMALLOC_START, "vmalloc() Area" }, + { VMEMMAP_START, "Vmemmap" }, + { __START_KERNEL_map, "High Kernel Mapping" }, + { MODULES_VADDR, "Modules" }, + { MODULES_END, "End Modules" }, +#else + { PAGE_OFFSET, "Kernel Mapping" }, + { 0/* VMALLOC_START */, "vmalloc() Area" }, + { 0/*VMALLOC_END*/, "vmalloc() End" }, +# ifdef CONFIG_HIGHMEM + { 0/*PKMAP_BASE*/, "Persisent kmap() Area" }, +# endif + { 0/*FIXADDR_START*/, "Fixmap Area" }, +#endif + { -1, NULL } /* End of list */ +}; + +/* Multipliers for offsets within the PTEs */ +#define PTE_LEVEL_MULT (PAGE_SIZE) +#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT) +#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT) +#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT) + +/* + * Print a readable form of a pgprot_t to the seq_file + */ +static void printk_prot(struct seq_file *m, pgprot_t prot, int level) +{ + pgprotval_t pr = pgprot_val(prot); + static const char * const level_name[] = + { "cr3", "pgd", "pud", "pmd", "pte" }; + + if (!pgprot_val(prot)) { + /* Not present */ + seq_printf(m, " "); + } else { + if (pr & _PAGE_USER) + seq_printf(m, "USR "); + else + seq_printf(m, " "); + if (pr & _PAGE_RW) + seq_printf(m, "RW "); + else + seq_printf(m, "ro "); + if (pr & _PAGE_PWT) + seq_printf(m, "PWT "); + else + seq_printf(m, " "); + if (pr & _PAGE_PCD) + seq_printf(m, "PCD "); + else + seq_printf(m, " "); + + /* Bit 9 has a different meaning on level 3 vs 4 */ + if (level <= 3) { + if (pr & _PAGE_PSE) + seq_printf(m, "PSE "); + else + seq_printf(m, " "); + } else { + if (pr & _PAGE_PAT) + seq_printf(m, "pat "); + else + seq_printf(m, " "); + } + if (pr & _PAGE_GLOBAL) + seq_printf(m, "GLB "); + else + seq_printf(m, " "); + if (pr & _PAGE_NX) + seq_printf(m, "NX "); + else + seq_printf(m, "x "); + } + seq_printf(m, "%s\n", level_name[level]); +} + +/* + * On 64 bits, sign-extend the 48 bit address to 64 bit + */ +static unsigned long normalize_addr(unsigned long u) +{ +#ifdef CONFIG_X86_64 + return (signed long)(u << 16) >> 16; +#else + return u; +#endif +} + +/* + * This function gets called on a break in a continuous series + * of PTE entries; the next one is different so we need to + * print what we collected so far. + */ +static void note_page(struct seq_file *m, struct pg_state *st, + pgprot_t new_prot, int level) +{ + pgprotval_t prot, cur; + static const char units[] = "KMGTPE"; + + /* + * If we have a "break" in the series, we need to flush the state that + * we have now. "break" is either changing perms, levels or + * address space marker. + */ + prot = pgprot_val(new_prot) & ~(PTE_MASK); + cur = pgprot_val(st->current_prot) & ~(PTE_MASK); + + if (!st->level) { + /* First entry */ + st->current_prot = new_prot; + st->level = level; + st->marker = address_markers; + seq_printf(m, "---[ %s ]---\n", st->marker->name); + } else if (prot != cur || level != st->level || + st->current_address >= st->marker[1].start_address) { + const char *unit = units; + unsigned long delta; + + /* + * Now print the actual finished series + */ + seq_printf(m, "0x%p-0x%p ", + (void *)st->start_address, + (void *)st->current_address); + + delta = (st->current_address - st->start_address) >> 10; + while (!(delta & 1023) && unit[1]) { + delta >>= 10; + unit++; + } + seq_printf(m, "%9lu%c ", delta, *unit); + printk_prot(m, st->current_prot, st->level); + + /* + * We print markers for special areas of address space, + * such as the start of vmalloc space etc. + * This helps in the interpretation. + */ + if (st->current_address >= st->marker[1].start_address) { + st->marker++; + seq_printf(m, "---[ %s ]---\n", st->marker->name); + } + + st->start_address = st->current_address; + st->current_prot = new_prot; + st->level = level; + } +} + +static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, + unsigned long P) +{ + int i; + pte_t *start; + + start = (pte_t *) pmd_page_vaddr(addr); + for (i = 0; i < PTRS_PER_PTE; i++) { + pgprot_t prot = pte_pgprot(*start); + + st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT); + note_page(m, st, prot, 4); + start++; + } +} + +#if PTRS_PER_PMD > 1 + +static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, + unsigned long P) +{ + int i; + pmd_t *start; + + start = (pmd_t *) pud_page_vaddr(addr); + for (i = 0; i < PTRS_PER_PMD; i++) { + st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT); + if (!pmd_none(*start)) { + pgprotval_t prot = pmd_val(*start) & ~PTE_MASK; + + if (pmd_large(*start) || !pmd_present(*start)) + note_page(m, st, __pgprot(prot), 3); + else + walk_pte_level(m, st, *start, + P + i * PMD_LEVEL_MULT); + } else + note_page(m, st, __pgprot(0), 3); + start++; + } +} + +#else +#define walk_pmd_level(m,s,a,p) walk_pte_level(m,s,__pmd(pud_val(a)),p) +#define pud_large(a) pmd_large(__pmd(pud_val(a))) +#define pud_none(a) pmd_none(__pmd(pud_val(a))) +#endif + +#if PTRS_PER_PUD > 1 + +static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr, + unsigned long P) +{ + int i; + pud_t *start; + + start = (pud_t *) pgd_page_vaddr(addr); + + for (i = 0; i < PTRS_PER_PUD; i++) { + st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); + if (!pud_none(*start)) { + pgprotval_t prot = pud_val(*start) & ~PTE_MASK; + + if (pud_large(*start) || !pud_present(*start)) + note_page(m, st, __pgprot(prot), 2); + else + walk_pmd_level(m, st, *start, + P + i * PUD_LEVEL_MULT); + } else + note_page(m, st, __pgprot(0), 2); + + start++; + } +} + +#else +#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(pgd_val(a)),p) +#define pgd_large(a) pud_large(__pud(pgd_val(a))) +#define pgd_none(a) pud_none(__pud(pgd_val(a))) +#endif + +static void walk_pgd_level(struct seq_file *m) +{ +#ifdef CONFIG_X86_64 + pgd_t *start = (pgd_t *) &init_level4_pgt; +#else + pgd_t *start = swapper_pg_dir; +#endif + int i; + struct pg_state st; + + memset(&st, 0, sizeof(st)); + + for (i = 0; i < PTRS_PER_PGD; i++) { + st.current_address = normalize_addr(i * PGD_LEVEL_MULT); + if (!pgd_none(*start)) { + pgprotval_t prot = pgd_val(*start) & ~PTE_MASK; + + if (pgd_large(*start) || !pgd_present(*start)) + note_page(m, &st, __pgprot(prot), 1); + else + walk_pud_level(m, &st, *start, + i * PGD_LEVEL_MULT); + } else + note_page(m, &st, __pgprot(0), 1); + + start++; + } + + /* Flush out the last page */ + st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT); + note_page(m, &st, __pgprot(0), 0); +} + +static int ptdump_show(struct seq_file *m, void *v) +{ + walk_pgd_level(m); + return 0; +} + +static int ptdump_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, ptdump_show, NULL); +} + +static const struct file_operations ptdump_fops = { + .open = ptdump_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +int pt_dump_init(void) +{ + struct dentry *pe; + +#ifdef CONFIG_X86_32 + /* Not a compile-time constant on x86-32 */ + address_markers[2].start_address = VMALLOC_START; + address_markers[3].start_address = VMALLOC_END; +# ifdef CONFIG_HIGHMEM + address_markers[4].start_address = PKMAP_BASE; + address_markers[5].start_address = FIXADDR_START; +# else + address_markers[4].start_address = FIXADDR_START; +# endif +#endif + + pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL, + &ptdump_fops); + if (!pe) + return -ENOMEM; + + return 0; +} + +__initcall(pt_dump_init); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>"); +MODULE_DESCRIPTION("Kernel debugging helper that dumps pagetables"); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index ec08d838985..fd7e1798c75 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -639,7 +639,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) #ifdef CONFIG_X86_32 /* It's safe to allow irq's after cr2 has been saved and the vmalloc fault has been handled. */ - if (regs->flags & (X86_EFLAGS_IF|VM_MASK)) + if (regs->flags & (X86_EFLAGS_IF | X86_VM_MASK)) local_irq_enable(); /* @@ -976,9 +976,5 @@ void vmalloc_sync_all(void) if (address == start) start = address + PGDIR_SIZE; } - /* Check that there is no need to do the same for the modules area. */ - BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); - BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == - (__START_KERNEL & PGDIR_MASK))); #endif } diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index ee1091a4696..1500dc8d63e 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -51,6 +51,8 @@ unsigned int __VMALLOC_RESERVE = 128 << 20; +unsigned long max_pfn_mapped; + DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); unsigned long highstart_pfn, highend_pfn; @@ -179,8 +181,13 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base) /* * Map with big pages if possible, otherwise * create normal page tables: + * + * Don't use a large page for the first 2/4MB of memory + * because there are often fixed size MTRRs in there + * and overlapping MTRRs into large pages can cause + * slowdowns. */ - if (cpu_has_pse) { + if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0)) { unsigned int addr2; pgprot_t prot = PAGE_KERNEL_LARGE; @@ -194,6 +201,7 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base) set_pmd(pmd, pfn_pmd(pfn, prot)); pfn += PTRS_PER_PTE; + max_pfn_mapped = pfn; continue; } pte = one_page_table_init(pmd); @@ -208,6 +216,7 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base) set_pte(pte, pfn_pte(pfn, prot)); } + max_pfn_mapped = pfn; } } } @@ -723,25 +732,17 @@ void mark_rodata_ro(void) unsigned long start = PFN_ALIGN(_text); unsigned long size = PFN_ALIGN(_etext) - start; -#ifndef CONFIG_KPROBES -#ifdef CONFIG_HOTPLUG_CPU - /* It must still be possible to apply SMP alternatives. */ - if (num_possible_cpus() <= 1) -#endif - { - set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); - printk(KERN_INFO "Write protecting the kernel text: %luk\n", - size >> 10); + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); + printk(KERN_INFO "Write protecting the kernel text: %luk\n", + size >> 10); #ifdef CONFIG_CPA_DEBUG - printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n", - start, start+size); - set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT); + printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n", + start, start+size); + set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT); - printk(KERN_INFO "Testing CPA: write protecting again\n"); - set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT); -#endif - } + printk(KERN_INFO "Testing CPA: write protecting again\n"); + set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT); #endif start += size; size = (unsigned long)__end_rodata - start; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index a02a14f0f32..1076097dcab 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -54,6 +54,26 @@ static unsigned long dma_reserve __initdata; DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +int direct_gbpages __meminitdata +#ifdef CONFIG_DIRECT_GBPAGES + = 1 +#endif +; + +static int __init parse_direct_gbpages_off(char *arg) +{ + direct_gbpages = 0; + return 0; +} +early_param("nogbpages", parse_direct_gbpages_off); + +static int __init parse_direct_gbpages_on(char *arg) +{ + direct_gbpages = 1; + return 0; +} +early_param("gbpages", parse_direct_gbpages_on); + /* * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the * physical space so we can cache the place of the first one and move @@ -69,9 +89,6 @@ void show_mem(void) printk(KERN_INFO "Mem-info:\n"); show_free_areas(); - printk(KERN_INFO "Free swap: %6ldkB\n", - nr_swap_pages << (PAGE_SHIFT-10)); - for_each_online_pgdat(pgdat) { for (i = 0; i < pgdat->node_spanned_pages; ++i) { /* @@ -296,7 +313,7 @@ __meminit void early_iounmap(void *addr, unsigned long size) __flush_tlb_all(); } -static void __meminit +static unsigned long __meminit phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) { int i = pmd_index(address); @@ -318,21 +335,26 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) set_pte((pte_t *)pmd, pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); } + return address; } -static void __meminit +static unsigned long __meminit phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) { pmd_t *pmd = pmd_offset(pud, 0); + unsigned long last_map_addr; + spin_lock(&init_mm.page_table_lock); - phys_pmd_init(pmd, address, end); + last_map_addr = phys_pmd_init(pmd, address, end); spin_unlock(&init_mm.page_table_lock); __flush_tlb_all(); + return last_map_addr; } -static void __meminit +static unsigned long __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) { + unsigned long last_map_addr = end; int i = pud_index(addr); for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) { @@ -350,7 +372,15 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) } if (pud_val(*pud)) { - phys_pmd_update(pud, addr, end); + if (!pud_large(*pud)) + last_map_addr = phys_pmd_update(pud, addr, end); + continue; + } + + if (direct_gbpages) { + set_pte((pte_t *)pud, + pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); + last_map_addr = (addr & PUD_MASK) + PUD_SIZE; continue; } @@ -358,12 +388,14 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) spin_lock(&init_mm.page_table_lock); set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); - phys_pmd_init(pmd, addr, end); + last_map_addr = phys_pmd_init(pmd, addr, end); spin_unlock(&init_mm.page_table_lock); unmap_low_page(pmd); } __flush_tlb_all(); + + return last_map_addr >> PAGE_SHIFT; } static void __init find_early_table_space(unsigned long end) @@ -371,9 +403,11 @@ static void __init find_early_table_space(unsigned long end) unsigned long puds, pmds, tables, start; puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; - pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; - tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) + - round_up(pmds * sizeof(pmd_t), PAGE_SIZE); + tables = round_up(puds * sizeof(pud_t), PAGE_SIZE); + if (!direct_gbpages) { + pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; + tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE); + } /* * RED-PEN putting page tables only on node 0 could @@ -393,16 +427,135 @@ static void __init find_early_table_space(unsigned long end) (table_start << PAGE_SHIFT) + tables); } +static void __init init_gbpages(void) +{ + if (direct_gbpages && cpu_has_gbpages) + printk(KERN_INFO "Using GB pages for direct mapping\n"); + else + direct_gbpages = 0; +} + +#ifdef CONFIG_MEMTEST_BOOTPARAM + +static void __init memtest(unsigned long start_phys, unsigned long size, + unsigned pattern) +{ + unsigned long i; + unsigned long *start; + unsigned long start_bad; + unsigned long last_bad; + unsigned long val; + unsigned long start_phys_aligned; + unsigned long count; + unsigned long incr; + + switch (pattern) { + case 0: + val = 0UL; + break; + case 1: + val = -1UL; + break; + case 2: + val = 0x5555555555555555UL; + break; + case 3: + val = 0xaaaaaaaaaaaaaaaaUL; + break; + default: + return; + } + + incr = sizeof(unsigned long); + start_phys_aligned = ALIGN(start_phys, incr); + count = (size - (start_phys_aligned - start_phys))/incr; + start = __va(start_phys_aligned); + start_bad = 0; + last_bad = 0; + + for (i = 0; i < count; i++) + start[i] = val; + for (i = 0; i < count; i++, start++, start_phys_aligned += incr) { + if (*start != val) { + if (start_phys_aligned == last_bad + incr) { + last_bad += incr; + } else { + if (start_bad) { + printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved", + val, start_bad, last_bad + incr); + reserve_early(start_bad, last_bad - start_bad, "BAD RAM"); + } + start_bad = last_bad = start_phys_aligned; + } + } + } + if (start_bad) { + printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved", + val, start_bad, last_bad + incr); + reserve_early(start_bad, last_bad - start_bad, "BAD RAM"); + } + +} + +static int memtest_pattern __initdata = CONFIG_MEMTEST_BOOTPARAM_VALUE; + +static int __init parse_memtest(char *arg) +{ + if (arg) + memtest_pattern = simple_strtoul(arg, NULL, 0); + return 0; +} + +early_param("memtest", parse_memtest); + +static void __init early_memtest(unsigned long start, unsigned long end) +{ + unsigned long t_start, t_size; + unsigned pattern; + + if (!memtest_pattern) + return; + + printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern); + for (pattern = 0; pattern < memtest_pattern; pattern++) { + t_start = start; + t_size = 0; + while (t_start < end) { + t_start = find_e820_area_size(t_start, &t_size, 1); + + /* done ? */ + if (t_start >= end) + break; + if (t_start + t_size > end) + t_size = end - t_start; + + printk(KERN_CONT "\n %016lx - %016lx pattern %d", + t_start, t_start + t_size, pattern); + + memtest(t_start, t_size, pattern); + + t_start += t_size; + } + } + printk(KERN_CONT "\n"); +} +#else +static void __init early_memtest(unsigned long start, unsigned long end) +{ +} +#endif + /* * Setup the direct mapping of the physical memory at PAGE_OFFSET. * This runs before bootmem is initialized and gets pages directly from * the physical memory. To access them they are temporarily mapped. */ -void __init_refok init_memory_mapping(unsigned long start, unsigned long end) +unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end) { - unsigned long next; + unsigned long next, last_map_addr = end; + unsigned long start_phys = start, end_phys = end; - pr_debug("init_memory_mapping\n"); + printk(KERN_INFO "init_memory_mapping\n"); /* * Find space for the kernel direct mapping tables. @@ -411,8 +564,10 @@ void __init_refok init_memory_mapping(unsigned long start, unsigned long end) * memory mapped. Unfortunately this is done currently before the * nodes are discovered. */ - if (!after_bootmem) + if (!after_bootmem) { + init_gbpages(); find_early_table_space(end); + } start = (unsigned long)__va(start); end = (unsigned long)__va(end); @@ -430,7 +585,7 @@ void __init_refok init_memory_mapping(unsigned long start, unsigned long end) next = start + PGDIR_SIZE; if (next > end) next = end; - phys_pud_init(pud, __pa(start), __pa(next)); + last_map_addr = phys_pud_init(pud, __pa(start), __pa(next)); if (!after_bootmem) set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); unmap_low_page(pud); @@ -443,6 +598,11 @@ void __init_refok init_memory_mapping(unsigned long start, unsigned long end) if (!after_bootmem) reserve_early(table_start << PAGE_SHIFT, table_end << PAGE_SHIFT, "PGTABLE"); + + if (!after_bootmem) + early_memtest(start_phys, end_phys); + + return last_map_addr; } #ifndef CONFIG_NUMA @@ -482,11 +642,13 @@ int arch_add_memory(int nid, u64 start, u64 size) { struct pglist_data *pgdat = NODE_DATA(nid); struct zone *zone = pgdat->node_zones + ZONE_NORMAL; - unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - init_memory_mapping(start, start + size-1); + last_mapped_pfn = init_memory_mapping(start, start + size-1); + if (last_mapped_pfn > max_pfn_mapped) + max_pfn_mapped = last_mapped_pfn; ret = __add_pages(zone, start_pfn, nr_pages); WARN_ON(1); @@ -596,24 +758,7 @@ EXPORT_SYMBOL_GPL(rodata_test_data); void mark_rodata_ro(void) { - unsigned long start = (unsigned long)_stext, end; - -#ifdef CONFIG_HOTPLUG_CPU - /* It must still be possible to apply SMP alternatives. */ - if (num_possible_cpus() > 1) - start = (unsigned long)_etext; -#endif - -#ifdef CONFIG_KPROBES - start = (unsigned long)__start_rodata; -#endif - - end = (unsigned long)__end_rodata; - start = (start + PAGE_SIZE - 1) & PAGE_MASK; - end &= PAGE_MASK; - if (end <= start) - return; - + unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata); printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", (end - start) >> 10); @@ -636,6 +781,7 @@ void mark_rodata_ro(void) set_memory_ro(start, (end-start) >> PAGE_SHIFT); #endif } + #endif #ifdef CONFIG_BLK_DEV_INITRD @@ -657,7 +803,7 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len) * This can happen with kdump kernels when accessing * firmware tables: */ - if (pfn < end_pfn_map) + if (pfn < max_pfn_mapped) return; printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n", diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 794895c6dcc..c590fd200e2 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -19,11 +19,7 @@ #include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/pgalloc.h> - -enum ioremap_mode { - IOR_MODE_UNCACHED, - IOR_MODE_CACHED, -}; +#include <asm/pat.h> #ifdef CONFIG_X86_64 @@ -35,11 +31,23 @@ unsigned long __phys_addr(unsigned long x) } EXPORT_SYMBOL(__phys_addr); +static inline int phys_addr_valid(unsigned long addr) +{ + return addr < (1UL << boot_cpu_data.x86_phys_bits); +} + +#else + +static inline int phys_addr_valid(unsigned long addr) +{ + return 1; +} + #endif int page_is_ram(unsigned long pagenr) { - unsigned long addr, end; + resource_size_t addr, end; int i; /* @@ -78,19 +86,22 @@ int page_is_ram(unsigned long pagenr) * Fix up the linear direct mapping of the kernel to avoid cache attribute * conflicts. */ -static int ioremap_change_attr(unsigned long vaddr, unsigned long size, - enum ioremap_mode mode) +int ioremap_change_attr(unsigned long vaddr, unsigned long size, + unsigned long prot_val) { unsigned long nrpages = size >> PAGE_SHIFT; int err; - switch (mode) { - case IOR_MODE_UNCACHED: + switch (prot_val) { + case _PAGE_CACHE_UC: default: - err = set_memory_uc(vaddr, nrpages); + err = _set_memory_uc(vaddr, nrpages); + break; + case _PAGE_CACHE_WC: + err = _set_memory_wc(vaddr, nrpages); break; - case IOR_MODE_CACHED: - err = set_memory_wb(vaddr, nrpages); + case _PAGE_CACHE_WB: + err = _set_memory_wb(vaddr, nrpages); break; } @@ -107,17 +118,27 @@ static int ioremap_change_attr(unsigned long vaddr, unsigned long size, * caller shouldn't need to know that small detail. */ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, - enum ioremap_mode mode) + unsigned long prot_val) { - unsigned long pfn, offset, last_addr, vaddr; + unsigned long pfn, offset, vaddr; + resource_size_t last_addr; struct vm_struct *area; + unsigned long new_prot_val; pgprot_t prot; + int retval; /* Don't allow wraparound or zero size */ last_addr = phys_addr + size - 1; if (!size || last_addr < phys_addr) return NULL; + if (!phys_addr_valid(phys_addr)) { + printk(KERN_WARNING "ioremap: invalid physical address %llx\n", + phys_addr); + WARN_ON_ONCE(1); + return NULL; + } + /* * Don't remap the low PCI/ISA area, it's always mapped.. */ @@ -127,25 +148,14 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, /* * Don't allow anybody to remap normal RAM that we're using.. */ - for (pfn = phys_addr >> PAGE_SHIFT; pfn < max_pfn_mapped && - (pfn << PAGE_SHIFT) < last_addr; pfn++) { - if (page_is_ram(pfn) && pfn_valid(pfn) && - !PageReserved(pfn_to_page(pfn))) - return NULL; - } + for (pfn = phys_addr >> PAGE_SHIFT; + (pfn << PAGE_SHIFT) < last_addr; pfn++) { - switch (mode) { - case IOR_MODE_UNCACHED: - default: - /* - * FIXME: we will use UC MINUS for now, as video fb drivers - * depend on it. Upcoming ioremap_wc() will fix this behavior. - */ - prot = PAGE_KERNEL_UC_MINUS; - break; - case IOR_MODE_CACHED: - prot = PAGE_KERNEL; - break; + int is_ram = page_is_ram(pfn); + + if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn))) + return NULL; + WARN_ON_ONCE(is_ram); } /* @@ -155,6 +165,49 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, phys_addr &= PAGE_MASK; size = PAGE_ALIGN(last_addr+1) - phys_addr; + retval = reserve_memtype(phys_addr, phys_addr + size, + prot_val, &new_prot_val); + if (retval) { + pr_debug("Warning: reserve_memtype returned %d\n", retval); + return NULL; + } + + if (prot_val != new_prot_val) { + /* + * Do not fallback to certain memory types with certain + * requested type: + * - request is uncached, return cannot be write-back + * - request is uncached, return cannot be write-combine + * - request is write-combine, return cannot be write-back + */ + if ((prot_val == _PAGE_CACHE_UC && + (new_prot_val == _PAGE_CACHE_WB || + new_prot_val == _PAGE_CACHE_WC)) || + (prot_val == _PAGE_CACHE_WC && + new_prot_val == _PAGE_CACHE_WB)) { + pr_debug( + "ioremap error for 0x%llx-0x%llx, requested 0x%lx, got 0x%lx\n", + phys_addr, phys_addr + size, + prot_val, new_prot_val); + free_memtype(phys_addr, phys_addr + size); + return NULL; + } + prot_val = new_prot_val; + } + + switch (prot_val) { + case _PAGE_CACHE_UC: + default: + prot = PAGE_KERNEL_NOCACHE; + break; + case _PAGE_CACHE_WC: + prot = PAGE_KERNEL_WC; + break; + case _PAGE_CACHE_WB: + prot = PAGE_KERNEL; + break; + } + /* * Ok, go for it.. */ @@ -164,11 +217,13 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, area->phys_addr = phys_addr; vaddr = (unsigned long) area->addr; if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) { + free_memtype(phys_addr, phys_addr + size); free_vm_area(area); return NULL; } - if (ioremap_change_attr(vaddr, size, mode) < 0) { + if (ioremap_change_attr(vaddr, size, prot_val) < 0) { + free_memtype(phys_addr, phys_addr + size); vunmap(area->addr); return NULL; } @@ -199,13 +254,32 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, */ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size) { - return __ioremap(phys_addr, size, IOR_MODE_UNCACHED); + return __ioremap(phys_addr, size, _PAGE_CACHE_UC); } EXPORT_SYMBOL(ioremap_nocache); +/** + * ioremap_wc - map memory into CPU space write combined + * @offset: bus address of the memory + * @size: size of the resource to map + * + * This version of ioremap ensures that the memory is marked write combining. + * Write combining allows faster writes to some hardware devices. + * + * Must be freed with iounmap. + */ +void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size) +{ + if (pat_wc_enabled) + return __ioremap(phys_addr, size, _PAGE_CACHE_WC); + else + return ioremap_nocache(phys_addr, size); +} +EXPORT_SYMBOL(ioremap_wc); + void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size) { - return __ioremap(phys_addr, size, IOR_MODE_CACHED); + return __ioremap(phys_addr, size, _PAGE_CACHE_WB); } EXPORT_SYMBOL(ioremap_cache); @@ -252,6 +326,8 @@ void iounmap(volatile void __iomem *addr) return; } + free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p)); + /* Finally remove it */ o = remove_vm_area((void *)addr); BUG_ON(p != o || o == NULL); @@ -272,8 +348,8 @@ static int __init early_ioremap_debug_setup(char *str) early_param("early_ioremap_debug", early_ioremap_debug_setup); static __initdata int after_paging_init; -static __initdata pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] - __attribute__((aligned(PAGE_SIZE))); +static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] + __section(.bss.page_aligned); static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) { diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 16b82ad34b9..2ea56f48f29 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -31,13 +31,15 @@ bootmem_data_t plat_node_bdata[MAX_NUMNODES]; struct memnode memnode; +#ifdef CONFIG_SMP int x86_cpu_to_node_map_init[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE }; void *x86_cpu_to_node_map_early_ptr; +EXPORT_SYMBOL(x86_cpu_to_node_map_early_ptr); +#endif DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE; EXPORT_PER_CPU_SYMBOL(x86_cpu_to_node_map); -EXPORT_SYMBOL(x86_cpu_to_node_map_early_ptr); s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE @@ -548,8 +550,6 @@ void __cpuinit numa_set_node(int cpu, int node) { int *cpu_to_node_map = x86_cpu_to_node_map_early_ptr; - cpu_pda(cpu)->nodenumber = node; - if(cpu_to_node_map) cpu_to_node_map[cpu] = node; else if(per_cpu_offset(cpu)) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 7b79f6be4e7..f7823a17286 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -9,6 +9,8 @@ #include <linux/slab.h> #include <linux/mm.h> #include <linux/interrupt.h> +#include <linux/seq_file.h> +#include <linux/debugfs.h> #include <asm/e820.h> #include <asm/processor.h> @@ -17,6 +19,7 @@ #include <asm/uaccess.h> #include <asm/pgalloc.h> #include <asm/proto.h> +#include <asm/pat.h> /* * The current flushing context - we pass it instead of 5 arguments: @@ -28,6 +31,7 @@ struct cpa_data { int numpages; int flushtlb; unsigned long pfn; + unsigned force_split : 1; }; #ifdef CONFIG_X86_64 @@ -259,6 +263,9 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, int i, do_split = 1; unsigned int level; + if (cpa->force_split) + return 1; + spin_lock_irqsave(&pgd_lock, flags); /* * Check for races, another CPU might have split this page @@ -535,7 +542,7 @@ static int __change_page_attr(struct cpa_data *cpa, int primary) repeat: kpte = lookup_address(address, &level); if (!kpte) - return primary ? -EINVAL : 0; + return 0; old_pte = *kpte; if (!pte_val(old_pte)) { @@ -693,7 +700,8 @@ static inline int cache_attr(pgprot_t attr) } static int change_page_attr_set_clr(unsigned long addr, int numpages, - pgprot_t mask_set, pgprot_t mask_clr) + pgprot_t mask_set, pgprot_t mask_clr, + int force_split) { struct cpa_data cpa; int ret, cache, checkalias; @@ -704,7 +712,7 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, */ mask_set = canon_pgprot(mask_set); mask_clr = canon_pgprot(mask_clr); - if (!pgprot_val(mask_set) && !pgprot_val(mask_clr)) + if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split) return 0; /* Ensure we are PAGE_SIZE aligned */ @@ -721,6 +729,7 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, cpa.mask_set = mask_set; cpa.mask_clr = mask_clr; cpa.flushtlb = 0; + cpa.force_split = force_split; /* No alias checking for _NX bit modifications */ checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; @@ -759,26 +768,61 @@ out: static inline int change_page_attr_set(unsigned long addr, int numpages, pgprot_t mask) { - return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0)); + return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0); } static inline int change_page_attr_clear(unsigned long addr, int numpages, pgprot_t mask) { - return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask); + return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0); } -int set_memory_uc(unsigned long addr, int numpages) +int _set_memory_uc(unsigned long addr, int numpages) { return change_page_attr_set(addr, numpages, - __pgprot(_PAGE_PCD)); + __pgprot(_PAGE_CACHE_UC)); +} + +int set_memory_uc(unsigned long addr, int numpages) +{ + if (reserve_memtype(addr, addr + numpages * PAGE_SIZE, + _PAGE_CACHE_UC, NULL)) + return -EINVAL; + + return _set_memory_uc(addr, numpages); } EXPORT_SYMBOL(set_memory_uc); -int set_memory_wb(unsigned long addr, int numpages) +int _set_memory_wc(unsigned long addr, int numpages) +{ + return change_page_attr_set(addr, numpages, + __pgprot(_PAGE_CACHE_WC)); +} + +int set_memory_wc(unsigned long addr, int numpages) +{ + if (!pat_wc_enabled) + return set_memory_uc(addr, numpages); + + if (reserve_memtype(addr, addr + numpages * PAGE_SIZE, + _PAGE_CACHE_WC, NULL)) + return -EINVAL; + + return _set_memory_wc(addr, numpages); +} +EXPORT_SYMBOL(set_memory_wc); + +int _set_memory_wb(unsigned long addr, int numpages) { return change_page_attr_clear(addr, numpages, - __pgprot(_PAGE_PCD | _PAGE_PWT)); + __pgprot(_PAGE_CACHE_MASK)); +} + +int set_memory_wb(unsigned long addr, int numpages) +{ + free_memtype(addr, addr + numpages * PAGE_SIZE); + + return _set_memory_wb(addr, numpages); } EXPORT_SYMBOL(set_memory_wb); @@ -809,6 +853,12 @@ int set_memory_np(unsigned long addr, int numpages) return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT)); } +int set_memory_4k(unsigned long addr, int numpages) +{ + return change_page_attr_set_clr(addr, numpages, __pgprot(0), + __pgprot(0), 1); +} + int set_pages_uc(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); @@ -918,6 +968,45 @@ void kernel_map_pages(struct page *page, int numpages, int enable) cpa_fill_pool(NULL); } +#ifdef CONFIG_DEBUG_FS +static int dpa_show(struct seq_file *m, void *v) +{ + seq_puts(m, "DEBUG_PAGEALLOC\n"); + seq_printf(m, "pool_size : %lu\n", pool_size); + seq_printf(m, "pool_pages : %lu\n", pool_pages); + seq_printf(m, "pool_low : %lu\n", pool_low); + seq_printf(m, "pool_used : %lu\n", pool_used); + seq_printf(m, "pool_failed : %lu\n", pool_failed); + + return 0; +} + +static int dpa_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, dpa_show, NULL); +} + +static const struct file_operations dpa_fops = { + .open = dpa_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +int __init debug_pagealloc_proc_init(void) +{ + struct dentry *de; + + de = debugfs_create_file("debug_pagealloc", 0600, NULL, NULL, + &dpa_fops); + if (!de) + return -ENOMEM; + + return 0; +} +__initcall(debug_pagealloc_proc_init); +#endif + #ifdef CONFIG_HIBERNATION bool kernel_page_present(struct page *page) diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c new file mode 100644 index 00000000000..72c0f609740 --- /dev/null +++ b/arch/x86/mm/pat.c @@ -0,0 +1,421 @@ +/* + * Handle caching attributes in page tables (PAT) + * + * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> + * Suresh B Siddha <suresh.b.siddha@intel.com> + * + * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen. + */ + +#include <linux/mm.h> +#include <linux/kernel.h> +#include <linux/gfp.h> +#include <linux/fs.h> + +#include <asm/msr.h> +#include <asm/tlbflush.h> +#include <asm/processor.h> +#include <asm/pgtable.h> +#include <asm/pat.h> +#include <asm/e820.h> +#include <asm/cacheflush.h> +#include <asm/fcntl.h> +#include <asm/mtrr.h> + +int pat_wc_enabled = 1; + +static u64 __read_mostly boot_pat_state; + +static int nopat(char *str) +{ + pat_wc_enabled = 0; + printk(KERN_INFO "x86: PAT support disabled.\n"); + + return 0; +} +early_param("nopat", nopat); + +static int pat_known_cpu(void) +{ + if (!pat_wc_enabled) + return 0; + + if (cpu_has_pat) + return 1; + + pat_wc_enabled = 0; + printk(KERN_INFO "CPU and/or kernel does not support PAT.\n"); + return 0; +} + +enum { + PAT_UC = 0, /* uncached */ + PAT_WC = 1, /* Write combining */ + PAT_WT = 4, /* Write Through */ + PAT_WP = 5, /* Write Protected */ + PAT_WB = 6, /* Write Back (default) */ + PAT_UC_MINUS = 7, /* UC, but can be overriden by MTRR */ +}; + +#define PAT(x,y) ((u64)PAT_ ## y << ((x)*8)) + +void pat_init(void) +{ + u64 pat; + +#ifndef CONFIG_X86_PAT + nopat(NULL); +#endif + + /* Boot CPU enables PAT based on CPU feature */ + if (!smp_processor_id() && !pat_known_cpu()) + return; + + /* APs enable PAT iff boot CPU has enabled it before */ + if (smp_processor_id() && !pat_wc_enabled) + return; + + /* Set PWT to Write-Combining. All other bits stay the same */ + /* + * PTE encoding used in Linux: + * PAT + * |PCD + * ||PWT + * ||| + * 000 WB _PAGE_CACHE_WB + * 001 WC _PAGE_CACHE_WC + * 010 UC- _PAGE_CACHE_UC_MINUS + * 011 UC _PAGE_CACHE_UC + * PAT bit unused + */ + pat = PAT(0,WB) | PAT(1,WC) | PAT(2,UC_MINUS) | PAT(3,UC) | + PAT(4,WB) | PAT(5,WC) | PAT(6,UC_MINUS) | PAT(7,UC); + + /* Boot CPU check */ + if (!smp_processor_id()) { + rdmsrl(MSR_IA32_CR_PAT, boot_pat_state); + } + + wrmsrl(MSR_IA32_CR_PAT, pat); + printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n", + smp_processor_id(), boot_pat_state, pat); +} + +#undef PAT + +static char *cattr_name(unsigned long flags) +{ + switch (flags & _PAGE_CACHE_MASK) { + case _PAGE_CACHE_UC: return "uncached"; + case _PAGE_CACHE_UC_MINUS: return "uncached-minus"; + case _PAGE_CACHE_WB: return "write-back"; + case _PAGE_CACHE_WC: return "write-combining"; + default: return "broken"; + } +} + +/* + * The global memtype list keeps track of memory type for specific + * physical memory areas. Conflicting memory types in different + * mappings can cause CPU cache corruption. To avoid this we keep track. + * + * The list is sorted based on starting address and can contain multiple + * entries for each address (this allows reference counting for overlapping + * areas). All the aliases have the same cache attributes of course. + * Zero attributes are represented as holes. + * + * Currently the data structure is a list because the number of mappings + * are expected to be relatively small. If this should be a problem + * it could be changed to a rbtree or similar. + * + * memtype_lock protects the whole list. + */ + +struct memtype { + u64 start; + u64 end; + unsigned long type; + struct list_head nd; +}; + +static LIST_HEAD(memtype_list); +static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ + +/* + * Does intersection of PAT memory type and MTRR memory type and returns + * the resulting memory type as PAT understands it. + * (Type in pat and mtrr will not have same value) + * The intersection is based on "Effective Memory Type" tables in IA-32 + * SDM vol 3a + */ +static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot, + unsigned long *ret_prot) +{ + unsigned long pat_type; + u8 mtrr_type; + + mtrr_type = mtrr_type_lookup(start, end); + if (mtrr_type == 0xFF) { /* MTRR not enabled */ + *ret_prot = prot; + return 0; + } + if (mtrr_type == 0xFE) { /* MTRR match error */ + *ret_prot = _PAGE_CACHE_UC; + return -1; + } + if (mtrr_type != MTRR_TYPE_UNCACHABLE && + mtrr_type != MTRR_TYPE_WRBACK && + mtrr_type != MTRR_TYPE_WRCOMB) { /* MTRR type unhandled */ + *ret_prot = _PAGE_CACHE_UC; + return -1; + } + + pat_type = prot & _PAGE_CACHE_MASK; + prot &= (~_PAGE_CACHE_MASK); + + /* Currently doing intersection by hand. Optimize it later. */ + if (pat_type == _PAGE_CACHE_WC) { + *ret_prot = prot | _PAGE_CACHE_WC; + } else if (pat_type == _PAGE_CACHE_UC_MINUS) { + *ret_prot = prot | _PAGE_CACHE_UC_MINUS; + } else if (pat_type == _PAGE_CACHE_UC || + mtrr_type == MTRR_TYPE_UNCACHABLE) { + *ret_prot = prot | _PAGE_CACHE_UC; + } else if (mtrr_type == MTRR_TYPE_WRCOMB) { + *ret_prot = prot | _PAGE_CACHE_WC; + } else { + *ret_prot = prot | _PAGE_CACHE_WB; + } + + return 0; +} + +int reserve_memtype(u64 start, u64 end, unsigned long req_type, + unsigned long *ret_type) +{ + struct memtype *new_entry = NULL; + struct memtype *parse; + unsigned long actual_type; + int err = 0; + + /* Only track when pat_wc_enabled */ + if (!pat_wc_enabled) { + if (ret_type) + *ret_type = req_type; + + return 0; + } + + /* Low ISA region is always mapped WB in page table. No need to track */ + if (start >= ISA_START_ADDRESS && (end - 1) <= ISA_END_ADDRESS) { + if (ret_type) + *ret_type = _PAGE_CACHE_WB; + + return 0; + } + + req_type &= _PAGE_CACHE_MASK; + err = pat_x_mtrr_type(start, end, req_type, &actual_type); + if (err) { + if (ret_type) + *ret_type = actual_type; + + return -EINVAL; + } + + new_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL); + if (!new_entry) + return -ENOMEM; + + new_entry->start = start; + new_entry->end = end; + new_entry->type = actual_type; + + if (ret_type) + *ret_type = actual_type; + + spin_lock(&memtype_lock); + + /* Search for existing mapping that overlaps the current range */ + list_for_each_entry(parse, &memtype_list, nd) { + struct memtype *saved_ptr; + + if (parse->start >= end) { + printk("New Entry\n"); + list_add(&new_entry->nd, parse->nd.prev); + new_entry = NULL; + break; + } + + if (start <= parse->start && end >= parse->start) { + if (actual_type != parse->type && ret_type) { + actual_type = parse->type; + *ret_type = actual_type; + new_entry->type = actual_type; + } + + if (actual_type != parse->type) { + printk( + KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n", + current->comm, current->pid, + start, end, + cattr_name(actual_type), + cattr_name(parse->type)); + err = -EBUSY; + break; + } + + saved_ptr = parse; + /* + * Check to see whether the request overlaps more + * than one entry in the list + */ + list_for_each_entry_continue(parse, &memtype_list, nd) { + if (end <= parse->start) { + break; + } + + if (actual_type != parse->type) { + printk( + KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n", + current->comm, current->pid, + start, end, + cattr_name(actual_type), + cattr_name(parse->type)); + err = -EBUSY; + break; + } + } + + if (err) { + break; + } + + printk("Overlap at 0x%Lx-0x%Lx\n", + saved_ptr->start, saved_ptr->end); + /* No conflict. Go ahead and add this new entry */ + list_add(&new_entry->nd, saved_ptr->nd.prev); + new_entry = NULL; + break; + } + + if (start < parse->end) { + if (actual_type != parse->type && ret_type) { + actual_type = parse->type; + *ret_type = actual_type; + new_entry->type = actual_type; + } + + if (actual_type != parse->type) { + printk( + KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n", + current->comm, current->pid, + start, end, + cattr_name(actual_type), + cattr_name(parse->type)); + err = -EBUSY; + break; + } + + saved_ptr = parse; + /* + * Check to see whether the request overlaps more + * than one entry in the list + */ + list_for_each_entry_continue(parse, &memtype_list, nd) { + if (end <= parse->start) { + break; + } + + if (actual_type != parse->type) { + printk( + KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n", + current->comm, current->pid, + start, end, + cattr_name(actual_type), + cattr_name(parse->type)); + err = -EBUSY; + break; + } + } + + if (err) { + break; + } + + printk("Overlap at 0x%Lx-0x%Lx\n", + saved_ptr->start, saved_ptr->end); + /* No conflict. Go ahead and add this new entry */ + list_add(&new_entry->nd, &saved_ptr->nd); + new_entry = NULL; + break; + } + } + + if (err) { + printk( + "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n", + start, end, cattr_name(new_entry->type), + cattr_name(req_type)); + kfree(new_entry); + spin_unlock(&memtype_lock); + return err; + } + + if (new_entry) { + /* No conflict. Not yet added to the list. Add to the tail */ + list_add_tail(&new_entry->nd, &memtype_list); + printk("New Entry\n"); + } + + if (ret_type) { + printk( + "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", + start, end, cattr_name(actual_type), + cattr_name(req_type), cattr_name(*ret_type)); + } else { + printk( + "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n", + start, end, cattr_name(actual_type), + cattr_name(req_type)); + } + + spin_unlock(&memtype_lock); + return err; +} + +int free_memtype(u64 start, u64 end) +{ + struct memtype *ml; + int err = -EINVAL; + + /* Only track when pat_wc_enabled */ + if (!pat_wc_enabled) { + return 0; + } + + /* Low ISA region is always mapped WB. No need to track */ + if (start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS) { + return 0; + } + + spin_lock(&memtype_lock); + list_for_each_entry(ml, &memtype_list, nd) { + if (ml->start == start && ml->end == end) { + list_del(&ml->nd); + kfree(ml); + err = 0; + break; + } + } + spin_unlock(&memtype_lock); + + if (err) { + printk(KERN_DEBUG "%s:%d freeing invalid memtype %Lx-%Lx\n", + current->comm, current->pid, start, end); + } + + printk( "free_memtype request 0x%Lx-0x%Lx\n", start, end); + return err; +} + diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 2f9e9afcb9f..3165ec0672b 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -36,7 +36,6 @@ void show_mem(void) printk(KERN_INFO "Mem-info:\n"); show_free_areas(); - printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); for_each_online_pgdat(pgdat) { pgdat_resize_lock(pgdat, &flags); for (i = 0; i < pgdat->node_spanned_pages; ++i) { @@ -381,3 +380,10 @@ void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) } #endif + +int pmd_bad(pmd_t pmd) +{ + WARN_ON_ONCE(pmd_bad_v1(pmd) != pmd_bad_v2(pmd)); + + return pmd_bad_v1(pmd); +} diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 845001c617c..1bae9c855ce 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -20,6 +20,7 @@ #include <asm/proto.h> #include <asm/numa.h> #include <asm/e820.h> +#include <asm/genapic.h> int acpi_numa __initdata; @@ -132,7 +133,6 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) int pxm, node; int apic_id; - apic_id = pa->apic_id; if (srat_disabled()) return; if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) { @@ -148,6 +148,11 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) bad_srat(); return; } + + if (is_uv_system()) + apic_id = (pa->apic_id << 8) | pa->local_sapic_eid; + else + apic_id = pa->apic_id; apicid_to_node[apic_id] = node; acpi_numa = 1; printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n", |