aboutsummaryrefslogtreecommitdiff
path: root/arch/x86/mm
diff options
context:
space:
mode:
author: Frederic Weisbecker <fweisbec@gmail.com>, 2009-10-18 01:09:09 +0200
committer: Frederic Weisbecker <fweisbec@gmail.com>, 2009-10-18 01:12:33 +0200
commit: 0f8f86c7bdd1c954fbe153af437a0d91a6c5721a (patch)
tree: 94a8d419a470a4f9852ca397bb9bbe48db92ff5c /arch/x86/mm
parent: dca2d6ac09d9ef59ff46820d4f0c94b08a671202 (diff)
parent: f39cdf25bf77219676ec5360980ac40b1a7e144a (diff)
Merge commit 'perf/core' into perf/hw-breakpoint

Conflicts:
	kernel/Makefile
	kernel/trace/Makefile
	kernel/trace/trace.h
	samples/Makefile

Merge reason: We need to be uptodate with the perf events development
branch because we plan to rewrite the breakpoints API on top of
perf events.

Diffstat (limited to 'arch/x86/mm')
-rw-r--r-- arch/x86/mm/Makefile 3
-rw-r--r-- arch/x86/mm/fault.c 27
-rw-r--r-- arch/x86/mm/init.c 63
-rw-r--r-- arch/x86/mm/init_32.c 12
-rw-r--r-- arch/x86/mm/init_64.c 12
-rw-r--r-- arch/x86/mm/iomap_32.c 27
-rw-r--r-- arch/x86/mm/ioremap.c 18
-rw-r--r-- arch/x86/mm/kmemcheck/kmemcheck.c 3
-rw-r--r-- arch/x86/mm/kmemcheck/shadow.c 1
-rw-r--r-- arch/x86/mm/mmap.c 17
-rw-r--r-- arch/x86/mm/pageattr.c 30
-rw-r--r-- arch/x86/mm/pat.c 360
-rw-r--r-- arch/x86/mm/setup_nx.c 69
-rw-r--r-- arch/x86/mm/testmmiotrace.c 29
-rw-r--r-- arch/x86/mm/tlb.c 15
15 files changed, 437 insertions, 249 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 9b5a9f59a47..06630d26e56 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,9 +1,10 @@
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
- pat.o pgtable.o physaddr.o gup.o
+ pat.o pgtable.o physaddr.o gup.o setup_nx.o
# Make sure __phys_addr has no stackprotector
nostackp := $(call cc-option, -fno-stack-protector)
CFLAGS_physaddr.o := $(nostackp)
+CFLAGS_setup_nx.o := $(nostackp)
obj-$(CONFIG_SMP) += tlb.o
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 775a020990a..f4cee9028cf 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -10,7 +10,7 @@
#include <linux/bootmem.h> /* max_low_pfn */
#include <linux/kprobes.h> /* __kprobes, ... */
#include <linux/mmiotrace.h> /* kmmio_handler, ... */
-#include <linux/perf_counter.h> /* perf_swcounter_event */
+#include <linux/perf_event.h> /* perf_sw_event */
#include <asm/traps.h> /* dotraplinkage, ... */
#include <asm/pgalloc.h> /* pgd_*(), ... */
@@ -167,6 +167,7 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address,
info.si_errno = 0;
info.si_code = si_code;
info.si_addr = (void __user *)address;
+ info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0;
force_sig_info(si_signo, &info, tsk);
}
@@ -790,10 +791,12 @@ out_of_memory(struct pt_regs *regs, unsigned long error_code,
}
static void
-do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address)
+do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
+ unsigned int fault)
{
struct task_struct *tsk = current;
struct mm_struct *mm = tsk->mm;
+ int code = BUS_ADRERR;
up_read(&mm->mmap_sem);
@@ -809,7 +812,15 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address)
tsk->thread.error_code = error_code;
tsk->thread.trap_no = 14;
- force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
+#ifdef CONFIG_MEMORY_FAILURE
+ if (fault & VM_FAULT_HWPOISON) {
+ printk(KERN_ERR
+ "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
+ tsk->comm, tsk->pid, address);
+ code = BUS_MCEERR_AR;
+ }
+#endif
+ force_sig_info_fault(SIGBUS, code, address, tsk);
}
static noinline void
@@ -819,8 +830,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
if (fault & VM_FAULT_OOM) {
out_of_memory(regs, error_code, address);
} else {
- if (fault & VM_FAULT_SIGBUS)
- do_sigbus(regs, error_code, address);
+ if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON))
+ do_sigbus(regs, error_code, address, fault);
else
BUG();
}
@@ -1017,7 +1028,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
if (unlikely(error_code & PF_RSVD))
pgtable_bad(regs, error_code, address);
- perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
/*
* If we're in an interrupt, have no user context or are running
@@ -1114,11 +1125,11 @@ good_area:
if (fault & VM_FAULT_MAJOR) {
tsk->maj_flt++;
- perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
regs, address);
} else {
tsk->min_flt++;
- perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
regs, address);
}
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 0607119cef9..73ffd5536f6 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -28,69 +28,6 @@ int direct_gbpages
#endif
;
-int nx_enabled;
-
-#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
-static int disable_nx __cpuinitdata;
-
-/*
- * noexec = on|off
- *
- * Control non-executable mappings for processes.
- *
- * on Enable
- * off Disable
- */
-static int __init noexec_setup(char *str)
-{
- if (!str)
- return -EINVAL;
- if (!strncmp(str, "on", 2)) {
- __supported_pte_mask |= _PAGE_NX;
- disable_nx = 0;
- } else if (!strncmp(str, "off", 3)) {
- disable_nx = 1;
- __supported_pte_mask &= ~_PAGE_NX;
- }
- return 0;
-}
-early_param("noexec", noexec_setup);
-#endif
-
-#ifdef CONFIG_X86_PAE
-static void __init set_nx(void)
-{
- unsigned int v[4], l, h;
-
- if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
- cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
-
- if ((v[3] & (1 << 20)) && !disable_nx) {
- rdmsr(MSR_EFER, l, h);
- l |= EFER_NX;
- wrmsr(MSR_EFER, l, h);
- nx_enabled = 1;
- __supported_pte_mask |= _PAGE_NX;
- }
- }
-}
-#else
-static inline void set_nx(void)
-{
-}
-#endif
-
-#ifdef CONFIG_X86_64
-void __cpuinit check_efer(void)
-{
- unsigned long efer;
-
- rdmsrl(MSR_EFER, efer);
- if (!(efer & EFER_NX) || disable_nx)
- __supported_pte_mask &= ~_PAGE_NX;
-}
-#endif
-
static void __init find_early_table_space(unsigned long end, int use_pse,
int use_gbpages)
{
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 3cd7711bb94..30938c1d8d5 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -84,7 +84,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
#ifdef CONFIG_X86_PAE
if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
if (after_bootmem)
- pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
+ pmd_table = (pmd_t *)alloc_bootmem_pages(PAGE_SIZE);
else
pmd_table = (pmd_t *)alloc_low_page();
paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
@@ -116,7 +116,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
#endif
if (!page_table)
page_table =
- (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
+ (pte_t *)alloc_bootmem_pages(PAGE_SIZE);
} else
page_table = (pte_t *)alloc_low_page();
@@ -857,8 +857,6 @@ static void __init test_wp_bit(void)
}
}
-static struct kcore_list kcore_mem, kcore_vmalloc;
-
void __init mem_init(void)
{
int codesize, reservedpages, datasize, initsize;
@@ -886,13 +884,9 @@ void __init mem_init(void)
datasize = (unsigned long) &_edata - (unsigned long) &_etext;
initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
- kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
- kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
- VMALLOC_END-VMALLOC_START);
-
printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, "
"%dk reserved, %dk data, %dk init, %ldk highmem)\n",
- (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
+ nr_free_pages() << (PAGE_SHIFT-10),
num_physpages << (PAGE_SHIFT-10),
codesize >> 10,
reservedpages << (PAGE_SHIFT-10),
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index ea56b8cbb6a..5a4398a6006 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -647,8 +647,7 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif /* CONFIG_MEMORY_HOTPLUG */
-static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
- kcore_modules, kcore_vsyscall;
+static struct kcore_list kcore_vsyscall;
void __init mem_init(void)
{
@@ -677,17 +676,12 @@ void __init mem_init(void)
initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
/* Register memory areas for /proc/kcore */
- kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
- kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
- VMALLOC_END-VMALLOC_START);
- kclist_add(&kcore_kernel, &_stext, _end - _stext);
- kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
- VSYSCALL_END - VSYSCALL_START);
+ VSYSCALL_END - VSYSCALL_START, KCORE_OTHER);
printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
"%ldk absent, %ldk reserved, %ldk data, %ldk init)\n",
- (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
+ nr_free_pages() << (PAGE_SHIFT-10),
max_pfn << (PAGE_SHIFT-10),
codesize >> 10,
absent_pages << (PAGE_SHIFT-10),
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index fe6f84ca121..84e236ce76b 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -21,7 +21,7 @@
#include <linux/module.h>
#include <linux/highmem.h>
-int is_io_mapping_possible(resource_size_t base, unsigned long size)
+static int is_io_mapping_possible(resource_size_t base, unsigned long size)
{
#if !defined(CONFIG_X86_PAE) && defined(CONFIG_PHYS_ADDR_T_64BIT)
/* There is no way to map greater than 1 << 32 address without PAE */
@@ -30,7 +30,30 @@ int is_io_mapping_possible(resource_size_t base, unsigned long size)
#endif
return 1;
}
-EXPORT_SYMBOL_GPL(is_io_mapping_possible);
+
+int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot)
+{
+ unsigned long flag = _PAGE_CACHE_WC;
+ int ret;
+
+ if (!is_io_mapping_possible(base, size))
+ return -EINVAL;
+
+ ret = io_reserve_memtype(base, base + size, &flag);
+ if (ret)
+ return ret;
+
+ *prot = __pgprot(__PAGE_KERNEL | flag);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_create_wc);
+
+void
+iomap_free(resource_size_t base, unsigned long size)
+{
+ io_free_memtype(base, base + size);
+}
+EXPORT_SYMBOL_GPL(iomap_free);
void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
{
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 04e1ad60c63..334e63ca7b2 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -158,24 +158,14 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
retval = reserve_memtype(phys_addr, (u64)phys_addr + size,
prot_val, &new_prot_val);
if (retval) {
- pr_debug("Warning: reserve_memtype returned %d\n", retval);
+ printk(KERN_ERR "ioremap reserve_memtype failed %d\n", retval);
return NULL;
}
if (prot_val != new_prot_val) {
- /*
- * Do not fallback to certain memory types with certain
- * requested type:
- * - request is uc-, return cannot be write-back
- * - request is uc-, return cannot be write-combine
- * - request is write-combine, return cannot be write-back
- */
- if ((prot_val == _PAGE_CACHE_UC_MINUS &&
- (new_prot_val == _PAGE_CACHE_WB ||
- new_prot_val == _PAGE_CACHE_WC)) ||
- (prot_val == _PAGE_CACHE_WC &&
- new_prot_val == _PAGE_CACHE_WB)) {
- pr_debug(
+ if (!is_new_memtype_allowed(phys_addr, size,
+ prot_val, new_prot_val)) {
+ printk(KERN_ERR
"ioremap error for 0x%llx-0x%llx, requested 0x%lx, got 0x%lx\n",
(unsigned long long)phys_addr,
(unsigned long long)(phys_addr + size),
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c
index 528bf954eb7..8cc18334414 100644
--- a/arch/x86/mm/kmemcheck/kmemcheck.c
+++ b/arch/x86/mm/kmemcheck/kmemcheck.c
@@ -225,9 +225,6 @@ void kmemcheck_hide(struct pt_regs *regs)
BUG_ON(!irqs_disabled());
- if (data->balance == 0)
- return;
-
if (unlikely(data->balance != 1)) {
kmemcheck_show_all();
kmemcheck_error_save_bug(regs);
diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c
index e773b6bd007..3f66b82076a 100644
--- a/arch/x86/mm/kmemcheck/shadow.c
+++ b/arch/x86/mm/kmemcheck/shadow.c
@@ -1,7 +1,6 @@
#include <linux/kmemcheck.h>
#include <linux/module.h>
#include <linux/mm.h>
-#include <linux/module.h>
#include <asm/page.h>
#include <asm/pgtable.h>
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 16582960056..c8191defc38 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -29,13 +29,26 @@
#include <linux/random.h>
#include <linux/limits.h>
#include <linux/sched.h>
+#include <asm/elf.h>
+
+static unsigned int stack_maxrandom_size(void)
+{
+ unsigned int max = 0;
+ if ((current->flags & PF_RANDOMIZE) &&
+ !(current->personality & ADDR_NO_RANDOMIZE)) {
+ max = ((-1U) & STACK_RND_MASK) << PAGE_SHIFT;
+ }
+
+ return max;
+}
+
/*
* Top of mmap area (just below the process stack).
*
- * Leave an at least ~128 MB hole.
+ * Leave an at least ~128 MB hole with possible stack randomization.
*/
-#define MIN_GAP (128*1024*1024)
+#define MIN_GAP (128*1024*1024UL + stack_maxrandom_size())
#define MAX_GAP (TASK_SIZE/6*5)
/*
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 7e600c1962d..dd38bfbefd1 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -12,6 +12,7 @@
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/pfn.h>
+#include <linux/percpu.h>
#include <asm/e820.h>
#include <asm/processor.h>
@@ -143,6 +144,7 @@ void clflush_cache_range(void *vaddr, unsigned int size)
mb();
}
+EXPORT_SYMBOL_GPL(clflush_cache_range);
static void __cpa_flush_all(void *arg)
{
@@ -686,7 +688,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
{
struct cpa_data alias_cpa;
unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT);
- unsigned long vaddr, remapped;
+ unsigned long vaddr;
int ret;
if (cpa->pfn >= max_pfn_mapped)
@@ -744,24 +746,6 @@ static int cpa_process_alias(struct cpa_data *cpa)
}
#endif
- /*
- * If the PMD page was partially used for per-cpu remapping,
- * the recycled area needs to be split and modified. Because
- * the area is always proper subset of a PMD page
- * cpa->numpages is guaranteed to be 1 for these areas, so
- * there's no need to loop over and check for further remaps.
- */
- remapped = (unsigned long)pcpu_lpage_remapped((void *)laddr);
- if (remapped) {
- WARN_ON(cpa->numpages > 1);
- alias_cpa = *cpa;
- alias_cpa.vaddr = &remapped;
- alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
- ret = __change_page_attr_set_clr(&alias_cpa, 0);
- if (ret)
- return ret;
- }
-
return 0;
}
@@ -822,6 +806,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
{
struct cpa_data cpa;
int ret, cache, checkalias;
+ unsigned long baddr = 0;
/*
* Check, if we are requested to change a not supported
@@ -853,6 +838,11 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
*/
WARN_ON_ONCE(1);
}
+ /*
+ * Save address for cache flush. *addr is modified in the call
+ * to __change_page_attr_set_clr() below.
+ */
+ baddr = *addr;
}
/* Must avoid aliasing mappings in the highmem code */
@@ -900,7 +890,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
cpa_flush_array(addr, numpages, cache,
cpa.flags, pages);
} else
- cpa_flush_range(*addr, numpages, cache);
+ cpa_flush_range(baddr, numpages, cache);
} else
cpa_flush_all(cache);
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index b2f7d3e59b8..e78cd0ec2bc 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -15,6 +15,7 @@
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/fs.h>
+#include <linux/rbtree.h>
#include <asm/cacheflush.h>
#include <asm/processor.h>
@@ -80,6 +81,7 @@ enum {
void pat_init(void)
{
u64 pat;
+ bool boot_cpu = !boot_pat_state;
if (!pat_enabled)
return;
@@ -121,8 +123,10 @@ void pat_init(void)
rdmsrl(MSR_IA32_CR_PAT, boot_pat_state);
wrmsrl(MSR_IA32_CR_PAT, pat);
- printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n",
- smp_processor_id(), boot_pat_state, pat);
+
+ if (boot_cpu)
+ printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n",
+ smp_processor_id(), boot_pat_state, pat);
}
#undef PAT
@@ -148,11 +152,10 @@ static char *cattr_name(unsigned long flags)
* areas). All the aliases have the same cache attributes of course.
* Zero attributes are represented as holes.
*
- * Currently the data structure is a list because the number of mappings
- * are expected to be relatively small. If this should be a problem
- * it could be changed to a rbtree or similar.
+ * The data structure is a list that is also organized as an rbtree
+ * sorted on the start address of memtype range.
*
- * memtype_lock protects the whole list.
+ * memtype_lock protects both the linear list and rbtree.
*/
struct memtype {
@@ -160,11 +163,53 @@ struct memtype {
u64 end;
unsigned long type;
struct list_head nd;
+ struct rb_node rb;
};
+static struct rb_root memtype_rbroot = RB_ROOT;
static LIST_HEAD(memtype_list);
static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */
+static struct memtype *memtype_rb_search(struct rb_root *root, u64 start)
+{
+ struct rb_node *node = root->rb_node;
+ struct memtype *last_lower = NULL;
+
+ while (node) {
+ struct memtype *data = container_of(node, struct memtype, rb);
+
+ if (data->start < start) {
+ last_lower = data;
+ node = node->rb_right;
+ } else if (data->start > start) {
+ node = node->rb_left;
+ } else
+ return data;
+ }
+
+ /* Will return NULL if there is no entry with its start <= start */
+ return last_lower;
+}
+
+static void memtype_rb_insert(struct rb_root *root, struct memtype *data)
+{
+ struct rb_node **new = &(root->rb_node);
+ struct rb_node *parent = NULL;
+
+ while (*new) {
+ struct memtype *this = container_of(*new, struct memtype, rb);
+
+ parent = *new;
+ if (data->start <= this->start)
+ new = &((*new)->rb_left);
+ else if (data->start > this->start)
+ new = &((*new)->rb_right);
+ }
+
+ rb_link_node(&data->rb, parent, new);
+ rb_insert_color(&data->rb, root);
+}
+
/*
* Does intersection of PAT memory type and MTRR memory type and returns
* the resulting memory type as PAT understands it.
@@ -218,9 +263,6 @@ chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type)
return -EBUSY;
}
-static struct memtype *cached_entry;
-static u64 cached_start;
-
static int pat_pagerange_is_ram(unsigned long start, unsigned long end)
{
int ram_page = 0, not_rampage = 0;
@@ -249,63 +291,61 @@ static int pat_pagerange_is_ram(unsigned long start, unsigned long end)
}
/*
- * For RAM pages, mark the pages as non WB memory type using
- * PageNonWB (PG_arch_1). We allow only one set_memory_uc() or
- * set_memory_wc() on a RAM page at a time before marking it as WB again.
- * This is ok, because only one driver will be owning the page and
- * doing set_memory_*() calls.
+ * For RAM pages, we use page flags to mark the pages with appropriate type.
+ * Here we do two pass:
+ * - Find the memtype of all the pages in the range, look for any conflicts
+ * - In case of no conflicts, set the new memtype for pages in the range
*
- * For now, we use PageNonWB to track that the RAM page is being mapped
- * as non WB. In future, we will have to use one more flag
- * (or some other mechanism in page_struct) to distinguish between
- * UC and WC mapping.
+ * Caller must hold memtype_lock for atomicity.
*/
static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type,
unsigned long *new_type)
{
struct page *page;
- u64 pfn, end_pfn;
+ u64 pfn;
+
+ if (req_type == _PAGE_CACHE_UC) {
+ /* We do not support strong UC */
+ WARN_ON_ONCE(1);
+ req_type = _PAGE_CACHE_UC_MINUS;
+ }
for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
- page = pfn_to_page(pfn);
- if (page_mapped(page) || PageNonWB(page))
- goto out;
+ unsigned long type;
- SetPageNonWB(page);
+ page = pfn_to_page(pfn);
+ type = get_page_memtype(page);
+ if (type != -1) {
+ printk(KERN_INFO "reserve_ram_pages_type failed "
+ "0x%Lx-0x%Lx, track 0x%lx, req 0x%lx\n",
+ start, end, type, req_type);
+ if (new_type)
+ *new_type = type;
+
+ return -EBUSY;
+ }
}
- return 0;
-out:
- end_pfn = pfn;
- for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) {
+ if (new_type)
+ *new_type = req_type;
+
+ for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
page = pfn_to_page(pfn);
- ClearPageNonWB(page);
+ set_page_memtype(page, req_type);
}
-
- return -EINVAL;
+ return 0;
}
static int free_ram_pages_type(u64 start, u64 end)
{
struct page *page;
- u64 pfn, end_pfn;
+ u64 pfn;
for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
page = pfn_to_page(pfn);
- if (page_mapped(page) || !PageNonWB(page))
- goto out;
-
- ClearPageNonWB(page);
+ set_page_memtype(page, -1);
}
return 0;
-
-out:
- end_pfn = pfn;
- for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) {
- page = pfn_to_page(pfn);
- SetPageNonWB(page);
- }
- return -EINVAL;
}
/*
@@ -339,6 +379,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
if (new_type) {
if (req_type == -1)
*new_type = _PAGE_CACHE_WB;
+ else if (req_type == _PAGE_CACHE_WC)
+ *new_type = _PAGE_CACHE_UC_MINUS;
else
*new_type = req_type & _PAGE_CACHE_MASK;
}
@@ -364,11 +406,16 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
*new_type = actual_type;
is_range_ram = pat_pagerange_is_ram(start, end);
- if (is_range_ram == 1)
- return reserve_ram_pages_type(start, end, req_type,
- new_type);
- else if (is_range_ram < 0)
+ if (is_range_ram == 1) {
+
+ spin_lock(&memtype_lock);
+ err = reserve_ram_pages_type(start, end, req_type, new_type);
+ spin_unlock(&memtype_lock);
+
+ return err;
+ } else if (is_range_ram < 0) {
return -EINVAL;
+ }
new = kmalloc(sizeof(struct memtype), GFP_KERNEL);
if (!new)
@@ -380,17 +427,11 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
spin_lock(&memtype_lock);
- if (cached_entry && start >= cached_start)
- entry = cached_entry;
- else
- entry = list_entry(&memtype_list, struct memtype, nd);
-
/* Search for existing mapping that overlaps the current range */
where = NULL;
- list_for_each_entry_continue(entry, &memtype_list, nd) {
+ list_for_each_entry(entry, &memtype_list, nd) {
if (end <= entry->start) {
where = entry->nd.prev;
- cached_entry = list_entry(where, struct memtype, nd);
break;
} else if (start <= entry->start) { /* end > entry->start */
err = chk_conflict(new, entry, new_type);
@@ -398,8 +439,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
dprintk("Overlap at 0x%Lx-0x%Lx\n",
entry->start, entry->end);
where = entry->nd.prev;
- cached_entry = list_entry(where,
- struct memtype, nd);
}
break;
} else if (start < entry->end) { /* start > entry->start */
@@ -407,8 +446,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
if (!err) {
dprintk("Overlap at 0x%Lx-0x%Lx\n",
entry->start, entry->end);
- cached_entry = list_entry(entry->nd.prev,
- struct memtype, nd);
/*
* Move to right position in the linked
@@ -436,13 +473,13 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
return err;
}
- cached_start = start;
-
if (where)
list_add(&new->nd, where);
else
list_add_tail(&new->nd, &memtype_list);
+ memtype_rb_insert(&memtype_rbroot, new);
+
spin_unlock(&memtype_lock);
dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
@@ -454,7 +491,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
int free_memtype(u64 start, u64 end)
{
- struct memtype *entry;
+ struct memtype *entry, *saved_entry;
int err = -EINVAL;
int is_range_ram;
@@ -466,23 +503,58 @@ int free_memtype(u64 start, u64 end)
return 0;
is_range_ram = pat_pagerange_is_ram(start, end);
- if (is_range_ram == 1)
- return free_ram_pages_type(start, end);
- else if (is_range_ram < 0)
+ if (is_range_ram == 1) {
+
+ spin_lock(&memtype_lock);
+ err = free_ram_pages_type(start, end);
+ spin_unlock(&memtype_lock);
+
+ return err;
+ } else if (is_range_ram < 0) {
return -EINVAL;
+ }
spin_lock(&memtype_lock);
- list_for_each_entry(entry, &memtype_list, nd) {
+
+ entry = memtype_rb_search(&memtype_rbroot, start);
+ if (unlikely(entry == NULL))
+ goto unlock_ret;
+
+ /*
+ * Saved entry points to an entry with start same or less than what
+ * we searched for. Now go through the list in both directions to look
+ * for the entry that matches with both start and end, with list stored
+ * in sorted start address
+ */
+ saved_entry = entry;
+ list_for_each_entry_from(entry, &memtype_list, nd) {
if (entry->start == start && entry->end == end) {
- if (cached_entry == entry || cached_start == start)
- cached_entry = NULL;
+ rb_erase(&entry->rb, &memtype_rbroot);
+ list_del(&entry->nd);
+ kfree(entry);
+ err = 0;
+ break;
+ } else if (entry->start > start) {
+ break;
+ }
+ }
+ if (!err)
+ goto unlock_ret;
+
+ entry = saved_entry;
+ list_for_each_entry_reverse(entry, &memtype_list, nd) {
+ if (entry->start == start && entry->end == end) {
+ rb_erase(&entry->rb, &memtype_rbroot);
list_del(&entry->nd);
kfree(entry);
err = 0;
break;
+ } else if (entry->start < start) {
+ break;
}
}
+unlock_ret:
spin_unlock(&memtype_lock);
if (err) {
@@ -496,6 +568,101 @@ int free_memtype(u64 start, u64 end)
}
+/**
+ * lookup_memtype - Looksup the memory type for a physical address
+ * @paddr: physical address of which memory type needs to be looked up
+ *
+ * Only to be called when PAT is enabled
+ *
+ * Returns _PAGE_CACHE_WB, _PAGE_CACHE_WC, _PAGE_CACHE_UC_MINUS or
+ * _PAGE_CACHE_UC
+ */
+static unsigned long lookup_memtype(u64 paddr)
+{
+ int rettype = _PAGE_CACHE_WB;
+ struct memtype *entry;
+
+ if (is_ISA_range(paddr, paddr + PAGE_SIZE - 1))
+ return rettype;
+
+ if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
+ struct page *page;
+ spin_lock(&memtype_lock);
+ page = pfn_to_page(paddr >> PAGE_SHIFT);
+ rettype = get_page_memtype(page);
+ spin_unlock(&memtype_lock);
+ /*
+ * -1 from get_page_memtype() implies RAM page is in its
+ * default state and not reserved, and hence of type WB
+ */
+ if (rettype == -1)
+ rettype = _PAGE_CACHE_WB;
+
+ return rettype;
+ }
+
+ spin_lock(&memtype_lock);
+
+ entry = memtype_rb_search(&memtype_rbroot, paddr);
+ if (entry != NULL)
+ rettype = entry->type;
+ else
+ rettype = _PAGE_CACHE_UC_MINUS;
+
+ spin_unlock(&memtype_lock);
+ return rettype;
+}
+
+/**
+ * io_reserve_memtype - Request a memory type mapping for a region of memory
+ * @start: start (physical address) of the region
+ * @end: end (physical address) of the region
+ * @type: A pointer to memtype, with requested type. On success, requested
+ * or any other compatible type that was available for the region is returned
+ *
+ * On success, returns 0
+ * On failure, returns non-zero
+ */
+int io_reserve_memtype(resource_size_t start, resource_size_t end,
+ unsigned long *type)
+{
+ resource_size_t size = end - start;
+ unsigned long req_type = *type;
+ unsigned long new_type;
+ int ret;
+
+ WARN_ON_ONCE(iomem_map_sanity_check(start, size));
+
+ ret = reserve_memtype(start, end, req_type, &new_type);
+ if (ret)
+ goto out_err;
+
+ if (!is_new_memtype_allowed(start, size, req_type, new_type))
+ goto out_free;
+
+ if (kernel_map_sync_memtype(start, size, new_type) < 0)
+ goto out_free;
+
+ *type = new_type;
+ return 0;
+
+out_free:
+ free_memtype(start, end);
+ ret = -EBUSY;
+out_err:
+ return ret;
+}
+
+/**
+ * io_free_memtype - Release a memory type mapping for a region of memory
+ * @start: start (physical address) of the region
+ * @end: end (physical address) of the region
+ */
+void io_free_memtype(resource_size_t start, resource_size_t end)
+{
+ free_memtype(start, end);
+}
+
pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
unsigned long size, pgprot_t vma_prot)
{
@@ -577,7 +744,7 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags)
{
unsigned long id_sz;
- if (!pat_enabled || base >= __pa(high_memory))
+ if (base >= __pa(high_memory))
return 0;
id_sz = (__pa(high_memory) < base + size) ?
@@ -612,11 +779,29 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
is_ram = pat_pagerange_is_ram(paddr, paddr + size);
/*
- * reserve_pfn_range() doesn't support RAM pages. Maintain the current
- * behavior with RAM pages by returning success.
+ * reserve_pfn_range() for RAM pages. We do not refcount to keep
+ * track of number of mappings of RAM pages. We can assert that
+ * the type requested matches the type of first page in the range.
*/
- if (is_ram != 0)
+ if (is_ram) {
+ if (!pat_enabled)
+ return 0;
+
+ flags = lookup_memtype(paddr);
+ if (want_flags != flags) {
+ printk(KERN_WARNING
+ "%s:%d map pfn RAM range req %s for %Lx-%Lx, got %s\n",
+ current->comm, current->pid,
+ cattr_name(want_flags),
+ (unsigned long long)paddr,
+ (unsigned long long)(paddr + size),
+ cattr_name(flags));
+ *vma_prot = __pgprot((pgprot_val(*vma_prot) &
+ (~_PAGE_CACHE_MASK)) |
+ flags);
+ }
return 0;
+ }
ret = reserve_memtype(paddr, paddr + size, want_flags, &flags);
if (ret)
@@ -678,14 +863,6 @@ int track_pfn_vma_copy(struct vm_area_struct *vma)
unsigned long vma_size = vma->vm_end - vma->vm_start;
pgprot_t pgprot;
- if (!pat_enabled)
- return 0;
-
- /*
- * For now, only handle remap_pfn_range() vmas where
- * is_linear_pfn_mapping() == TRUE. Handling of
- * vm_insert_pfn() is TBD.
- */
if (is_linear_pfn_mapping(vma)) {
/*
* reserve the whole chunk covered by vma. We need the
@@ -713,23 +890,24 @@ int track_pfn_vma_copy(struct vm_area_struct *vma)
int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot,
unsigned long pfn, unsigned long size)
{
+ unsigned long flags;
resource_size_t paddr;
unsigned long vma_size = vma->vm_end - vma->vm_start;
- if (!pat_enabled)
- return 0;
-
- /*
- * For now, only handle remap_pfn_range() vmas where
- * is_linear_pfn_mapping() == TRUE. Handling of
- * vm_insert_pfn() is TBD.
- */
if (is_linear_pfn_mapping(vma)) {
/* reserve the whole chunk starting from vm_pgoff */
paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
return reserve_pfn_range(paddr, vma_size, prot, 0);
}
+ if (!pat_enabled)
+ return 0;
+
+ /* for vm_insert_pfn and friends, we set prot based on lookup */
+ flags = lookup_memtype(pfn << PAGE_SHIFT);
+ *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
+ flags);
+
return 0;
}
@@ -744,14 +922,6 @@ void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
resource_size_t paddr;
unsigned long vma_size = vma->vm_end - vma->vm_start;
- if (!pat_enabled)
- return;
-
- /*
- * For now, only handle remap_pfn_range() vmas where
- * is_linear_pfn_mapping() == TRUE. Handling of
- * vm_insert_pfn() is TBD.
- */
if (is_linear_pfn_mapping(vma)) {
/* free the whole chunk starting from vm_pgoff */
paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c
new file mode 100644
index 00000000000..513d8ed5d2e
--- /dev/null
+++ b/arch/x86/mm/setup_nx.c
@@ -0,0 +1,69 @@
+#include <linux/spinlock.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+
+#include <asm/pgtable.h>
+
+int nx_enabled;
+
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+static int disable_nx __cpuinitdata;
+
+/*
+ * noexec = on|off
+ *
+ * Control non-executable mappings for processes.
+ *
+ * on Enable
+ * off Disable
+ */
+static int __init noexec_setup(char *str)
+{
+ if (!str)
+ return -EINVAL;
+ if (!strncmp(str, "on", 2)) {
+ __supported_pte_mask |= _PAGE_NX;
+ disable_nx = 0;
+ } else if (!strncmp(str, "off", 3)) {
+ disable_nx = 1;
+ __supported_pte_mask &= ~_PAGE_NX;
+ }
+ return 0;
+}
+early_param("noexec", noexec_setup);
+#endif
+
+#ifdef CONFIG_X86_PAE
+void __init set_nx(void)
+{
+ unsigned int v[4], l, h;
+
+ if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
+ cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
+
+ if ((v[3] & (1 << 20)) && !disable_nx) {
+ rdmsr(MSR_EFER, l, h);
+ l |= EFER_NX;
+ wrmsr(MSR_EFER, l, h);
+ nx_enabled = 1;
+ __supported_pte_mask |= _PAGE_NX;
+ }
+ }
+}
+#else
+void set_nx(void)
+{
+}
+#endif
+
+#ifdef CONFIG_X86_64
+void __cpuinit check_efer(void)
+{
+ unsigned long efer;
+
+ rdmsrl(MSR_EFER, efer);
+ if (!(efer & EFER_NX) || disable_nx)
+ __supported_pte_mask &= ~_PAGE_NX;
+}
+#endif
+
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
index 427fd1b56df..8565d944f7c 100644
--- a/arch/x86/mm/testmmiotrace.c
+++ b/arch/x86/mm/testmmiotrace.c
@@ -1,12 +1,13 @@
/*
* Written by Pekka Paalanen, 2008-2009 <pq@iki.fi>
*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/io.h>
#include <linux/mmiotrace.h>
-#define MODULE_NAME "testmmiotrace"
-
static unsigned long mmio_address;
module_param(mmio_address, ulong, 0);
MODULE_PARM_DESC(mmio_address, " Start address of the mapping of 16 kB "
@@ -30,7 +31,7 @@ static unsigned v32(unsigned i)
static void do_write_test(void __iomem *p)
{
unsigned int i;
- pr_info(MODULE_NAME ": write test.\n");
+ pr_info("write test.\n");
mmiotrace_printk("Write test.\n");
for (i = 0; i < 256; i++)
@@ -47,7 +48,7 @@ static void do_read_test(void __iomem *p)
{
unsigned int i;
unsigned errs[3] = { 0 };
- pr_info(MODULE_NAME ": read test.\n");
+ pr_info("read test.\n");
mmiotrace_printk("Read test.\n");
for (i = 0; i < 256; i++)
@@ -68,7 +69,7 @@ static void do_read_test(void __iomem *p)
static void do_read_far_test(void __iomem *p)
{
- pr_info(MODULE_NAME ": read far test.\n");
+ pr_info("read far test.\n");
mmiotrace_printk("Read far test.\n");
ioread32(p + read_far);
@@ -78,7 +79,7 @@ static void do_test(unsigned long size)
{
void __iomem *p = ioremap_nocache(mmio_address, size);
if (!p) {
- pr_err(MODULE_NAME ": could not ioremap, aborting.\n");
+ pr_err("could not ioremap, aborting.\n");
return;
}
mmiotrace_printk("ioremap returned %p.\n", p);
@@ -94,24 +95,22 @@ static int __init init(void)
unsigned long size = (read_far) ? (8 << 20) : (16 << 10);
if (mmio_address == 0) {
- pr_err(MODULE_NAME ": you have to use the module argument "
- "mmio_address.\n");
- pr_err(MODULE_NAME ": DO NOT LOAD THIS MODULE UNLESS"
- " YOU REALLY KNOW WHAT YOU ARE DOING!\n");
+ pr_err("you have to use the module argument mmio_address.\n");
+ pr_err("DO NOT LOAD THIS MODULE UNLESS YOU REALLY KNOW WHAT YOU ARE DOING!\n");
return -ENXIO;
}
- pr_warning(MODULE_NAME ": WARNING: mapping %lu kB @ 0x%08lx in PCI "
- "address space, and writing 16 kB of rubbish in there.\n",
- size >> 10, mmio_address);
+ pr_warning("WARNING: mapping %lu kB @ 0x%08lx in PCI address space, "
+ "and writing 16 kB of rubbish in there.\n",
+ size >> 10, mmio_address);
do_test(size);
- pr_info(MODULE_NAME ": All done.\n");
+ pr_info("All done.\n");
return 0;
}
static void __exit cleanup(void)
{
- pr_debug(MODULE_NAME ": unloaded.\n");
+ pr_debug("unloaded.\n");
}
module_init(init);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index c814e144a3f..36fe08eeb5c 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -59,7 +59,8 @@ void leave_mm(int cpu)
{
if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
BUG();
- cpu_clear(cpu, percpu_read(cpu_tlbstate.active_mm)->cpu_vm_mask);
+ cpumask_clear_cpu(cpu,
+ mm_cpumask(percpu_read(cpu_tlbstate.active_mm)));
load_cr3(swapper_pg_dir);
}
EXPORT_SYMBOL_GPL(leave_mm);
@@ -234,8 +235,8 @@ void flush_tlb_current_task(void)
preempt_disable();
local_flush_tlb();
- if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
- flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL);
+ if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
+ flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL);
preempt_enable();
}
@@ -249,8 +250,8 @@ void flush_tlb_mm(struct mm_struct *mm)
else
leave_mm(smp_processor_id());
}
- if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
- flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL);
+ if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
+ flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL);
preempt_enable();
}
@@ -268,8 +269,8 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
leave_mm(smp_processor_id());
}
- if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
- flush_tlb_others(&mm->cpu_vm_mask, mm, va);
+ if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
+ flush_tlb_others(mm_cpumask(mm), mm, va);
preempt_enable();
}