diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/allocpercpu.c | 2 | ||||
-rw-r--r-- | mm/bootmem.c | 6 | ||||
-rw-r--r-- | mm/hugetlb.c | 2 | ||||
-rw-r--r-- | mm/memory.c | 83 | ||||
-rw-r--r-- | mm/mempolicy.c | 6 | ||||
-rw-r--r-- | mm/migrate.c | 12 | ||||
-rw-r--r-- | mm/mmap.c | 8 | ||||
-rw-r--r-- | mm/nommu.c | 17 | ||||
-rw-r--r-- | mm/page_alloc.c | 35 | ||||
-rw-r--r-- | mm/pagewalk.c | 42 | ||||
-rw-r--r-- | mm/slab.c | 5 | ||||
-rw-r--r-- | mm/slub.c | 14 | ||||
-rw-r--r-- | mm/sparse-vmemmap.c | 2 | ||||
-rw-r--r-- | mm/vmscan.c | 2 |
14 files changed, 157 insertions, 79 deletions
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c index f4026bae6ee..05f2b4009cc 100644 --- a/mm/allocpercpu.c +++ b/mm/allocpercpu.c @@ -1,7 +1,7 @@ /* * linux/mm/allocpercpu.c * - * Separated from slab.c August 11, 2006 Christoph Lameter <clameter@sgi.com> + * Separated from slab.c August 11, 2006 Christoph Lameter */ #include <linux/mm.h> #include <linux/module.h> diff --git a/mm/bootmem.c b/mm/bootmem.c index e8fb927392b..8d9f60e06f6 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -442,15 +442,17 @@ unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn, return init_bootmem_core(pgdat, freepfn, startpfn, endpfn); } -void __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, +int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size, int flags) { int ret; ret = can_reserve_bootmem_core(pgdat->bdata, physaddr, size, flags); if (ret < 0) - return; + return -ENOMEM; reserve_bootmem_core(pgdat->bdata, physaddr, size, flags); + + return 0; } void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bbf953eeb58..ab171274ef2 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -785,7 +785,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, continue; spin_lock(&dst->page_table_lock); - spin_lock(&src->page_table_lock); + spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING); if (!huge_pte_none(huge_ptep_get(src_pte))) { if (cow) huge_ptep_set_wrprotect(src, addr, src_pte); diff --git a/mm/memory.c b/mm/memory.c index 19e0ae9beec..2302d228fe0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -999,17 +999,15 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, goto no_page_table; ptep = pte_offset_map_lock(mm, pmd, address, &ptl); - if (!ptep) - goto out; pte = *ptep; if (!pte_present(pte)) - goto unlock; + goto no_page; if ((flags & FOLL_WRITE) && !pte_write(pte)) goto unlock; page = vm_normal_page(vma, address, pte); if (unlikely(!page)) - goto unlock; + goto bad_page; if (flags & FOLL_GET) get_page(page); @@ -1024,6 +1022,15 @@ unlock: out: return page; +bad_page: + pte_unmap_unlock(ptep, ptl); + return ERR_PTR(-EFAULT); + +no_page: + pte_unmap_unlock(ptep, ptl); + if (!pte_none(pte)) + return page; + /* Fall through to ZERO_PAGE handling */ no_page_table: /* * When core dumping an enormous anonymous area that nobody @@ -1038,6 +1045,26 @@ no_page_table: return page; } +/* Can we do the FOLL_ANON optimization? */ +static inline int use_zero_page(struct vm_area_struct *vma) +{ + /* + * We don't want to optimize FOLL_ANON for make_pages_present() + * when it tries to page in a VM_LOCKED region. As to VM_SHARED, + * we want to get the page from the page tables to make sure + * that we serialize and update with any other user of that + * mapping. + */ + if (vma->vm_flags & (VM_LOCKED | VM_SHARED)) + return 0; + /* + * And if we have a fault or a nopfn routine, it's not an + * anonymous region. + */ + return !vma->vm_ops || + (!vma->vm_ops->fault && !vma->vm_ops->nopfn); +} + int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas) @@ -1112,8 +1139,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, foll_flags = FOLL_TOUCH; if (pages) foll_flags |= FOLL_GET; - if (!write && !(vma->vm_flags & VM_LOCKED) && - (!vma->vm_ops || !vma->vm_ops->fault)) + if (!write && use_zero_page(vma)) foll_flags |= FOLL_ANON; do { @@ -1125,7 +1151,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, * be processed until returning to user space. */ if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE))) - return -ENOMEM; + return i ? i : -ENOMEM; if (write) foll_flags |= FOLL_WRITE; @@ -1159,6 +1185,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, cond_resched(); } + if (IS_ERR(page)) + return i ? i : PTR_ERR(page); if (pages) { pages[i] = page; @@ -1669,8 +1697,19 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *dirty_page = NULL; old_page = vm_normal_page(vma, address, orig_pte); - if (!old_page) + if (!old_page) { + /* + * VM_MIXEDMAP !pfn_valid() case + * + * We should not cow pages in a shared writeable mapping. + * Just mark the pages writable as we can't do any dirty + * accounting on raw pfn maps. + */ + if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == + (VM_WRITE|VM_SHARED)) + goto reuse; goto gotten; + } /* * Take out anonymous pages first, anonymous shared vmas are @@ -1723,6 +1762,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, } if (reuse) { +reuse: flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = pte_mkyoung(orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); @@ -1757,7 +1797,6 @@ gotten: page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (likely(pte_same(*page_table, orig_pte))) { if (old_page) { - page_remove_rmap(old_page, vma); if (!PageAnon(old_page)) { dec_mm_counter(mm, file_rss); inc_mm_counter(mm, anon_rss); @@ -1779,6 +1818,32 @@ gotten: lru_cache_add_active(new_page); page_add_new_anon_rmap(new_page, vma, address); + if (old_page) { + /* + * Only after switching the pte to the new page may + * we remove the mapcount here. Otherwise another + * process may come and find the rmap count decremented + * before the pte is switched to the new page, and + * "reuse" the old page writing into it while our pte + * here still points into it and can be read by other + * threads. + * + * The critical issue is to order this + * page_remove_rmap with the ptp_clear_flush above. + * Those stores are ordered by (if nothing else,) + * the barrier present in the atomic_add_negative + * in page_remove_rmap. + * + * Then the TLB flush in ptep_clear_flush ensures that + * no process can access the old page before the + * decremented mapcount is visible. And the old page + * cannot be reused until after the decremented + * mapcount is visible. So transitively, TLBs to + * old page will be flushed before it can be reused. + */ + page_remove_rmap(old_page, vma); + } + /* Free the old page.. */ new_page = old_page; ret |= VM_FAULT_WRITE; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a37a5034f63..c94e58b192c 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -729,7 +729,11 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, } else { *policy = pol == &default_policy ? MPOL_DEFAULT : pol->mode; - *policy |= pol->flags; + /* + * Internal mempolicy flags must be masked off before exposing + * the policy to userspace. + */ + *policy |= (pol->flags & MPOL_MODE_FLAGS); } if (vma) { diff --git a/mm/migrate.c b/mm/migrate.c index 449d77d409f..55bd355d170 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -9,7 +9,7 @@ * IWAMOTO Toshihiro <iwamoto@valinux.co.jp> * Hirokazu Takahashi <taka@valinux.co.jp> * Dave Hansen <haveblue@us.ibm.com> - * Christoph Lameter <clameter@sgi.com> + * Christoph Lameter */ #include <linux/migrate.h> @@ -865,6 +865,11 @@ static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm, goto set_status; page = follow_page(vma, pp->addr, FOLL_GET); + + err = PTR_ERR(page); + if (IS_ERR(page)) + goto set_status; + err = -ENOENT; if (!page) goto set_status; @@ -928,6 +933,11 @@ static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm) goto set_status; page = follow_page(vma, pm->addr, 0); + + err = PTR_ERR(page); + if (IS_ERR(page)) + goto set_status; + err = -ENOENT; /* Use PageReserved to check for zero page */ if (!page || PageReserved(page)) diff --git a/mm/mmap.c b/mm/mmap.c index 669499e7c2f..3354fdd83d4 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -245,10 +245,16 @@ asmlinkage unsigned long sys_brk(unsigned long brk) unsigned long rlim, retval; unsigned long newbrk, oldbrk; struct mm_struct *mm = current->mm; + unsigned long min_brk; down_write(&mm->mmap_sem); - if (brk < mm->start_brk) +#ifdef CONFIG_COMPAT_BRK + min_brk = mm->end_code; +#else + min_brk = mm->start_brk; +#endif + if (brk < min_brk) goto out; /* diff --git a/mm/nommu.c b/mm/nommu.c index dca93fcb8b7..4462b6a3fcb 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -109,16 +109,23 @@ unsigned int kobjsize(const void *objp) * If the object we have should not have ksize performed on it, * return size of 0 */ - if (!objp || (unsigned long)objp >= memory_end || !((page = virt_to_page(objp)))) + if (!objp || !virt_addr_valid(objp)) return 0; + page = virt_to_head_page(objp); + + /* + * If the allocator sets PageSlab, we know the pointer came from + * kmalloc(). + */ if (PageSlab(page)) return ksize(objp); - BUG_ON(page->index < 0); - BUG_ON(page->index >= MAX_ORDER); - - return (PAGE_SIZE << page->index); + /* + * The ksize() function is only guaranteed to work for pointers + * returned by kmalloc(). So handle arbitrary pointers here. + */ + return PAGE_SIZE << compound_order(page); } /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8e83f02cd2d..f32fae3121f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -237,16 +237,7 @@ static void bad_page(struct page *page) printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n" KERN_EMERG "Backtrace:\n"); dump_stack(); - page->flags &= ~(1 << PG_lru | - 1 << PG_private | - 1 << PG_locked | - 1 << PG_active | - 1 << PG_dirty | - 1 << PG_reclaim | - 1 << PG_slab | - 1 << PG_swapcache | - 1 << PG_writeback | - 1 << PG_buddy ); + page->flags &= ~PAGE_FLAGS_CLEAR_WHEN_BAD; set_page_count(page, 0); reset_page_mapcount(page); page->mapping = NULL; @@ -463,16 +454,7 @@ static inline int free_pages_check(struct page *page) (page->mapping != NULL) | (page_get_page_cgroup(page) != NULL) | (page_count(page) != 0) | - (page->flags & ( - 1 << PG_lru | - 1 << PG_private | - 1 << PG_locked | - 1 << PG_active | - 1 << PG_slab | - 1 << PG_swapcache | - 1 << PG_writeback | - 1 << PG_reserved | - 1 << PG_buddy )))) + (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) bad_page(page); if (PageDirty(page)) __ClearPageDirty(page); @@ -616,17 +598,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) (page->mapping != NULL) | (page_get_page_cgroup(page) != NULL) | (page_count(page) != 0) | - (page->flags & ( - 1 << PG_lru | - 1 << PG_private | - 1 << PG_locked | - 1 << PG_active | - 1 << PG_dirty | - 1 << PG_slab | - 1 << PG_swapcache | - 1 << PG_writeback | - 1 << PG_reserved | - 1 << PG_buddy )))) + (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) bad_page(page); /* @@ -2356,7 +2328,6 @@ static void build_zonelists(pg_data_t *pgdat) static void build_zonelist_cache(pg_data_t *pgdat) { pgdat->node_zonelists[0].zlcache_ptr = NULL; - pgdat->node_zonelists[1].zlcache_ptr = NULL; } #endif /* CONFIG_NUMA */ diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 0afd2387e50..d5878bed784 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -3,14 +3,14 @@ #include <linux/sched.h> static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, - const struct mm_walk *walk, void *private) + struct mm_walk *walk) { pte_t *pte; int err = 0; pte = pte_offset_map(pmd, addr); for (;;) { - err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, private); + err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, walk); if (err) break; addr += PAGE_SIZE; @@ -24,7 +24,7 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, } static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, - const struct mm_walk *walk, void *private) + struct mm_walk *walk) { pmd_t *pmd; unsigned long next; @@ -35,15 +35,15 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) { if (walk->pte_hole) - err = walk->pte_hole(addr, next, private); + err = walk->pte_hole(addr, next, walk); if (err) break; continue; } if (walk->pmd_entry) - err = walk->pmd_entry(pmd, addr, next, private); + err = walk->pmd_entry(pmd, addr, next, walk); if (!err && walk->pte_entry) - err = walk_pte_range(pmd, addr, next, walk, private); + err = walk_pte_range(pmd, addr, next, walk); if (err) break; } while (pmd++, addr = next, addr != end); @@ -52,7 +52,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, } static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, - const struct mm_walk *walk, void *private) + struct mm_walk *walk) { pud_t *pud; unsigned long next; @@ -63,15 +63,15 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) { if (walk->pte_hole) - err = walk->pte_hole(addr, next, private); + err = walk->pte_hole(addr, next, walk); if (err) break; continue; } if (walk->pud_entry) - err = walk->pud_entry(pud, addr, next, private); + err = walk->pud_entry(pud, addr, next, walk); if (!err && (walk->pmd_entry || walk->pte_entry)) - err = walk_pmd_range(pud, addr, next, walk, private); + err = walk_pmd_range(pud, addr, next, walk); if (err) break; } while (pud++, addr = next, addr != end); @@ -85,15 +85,15 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, * @addr: starting address * @end: ending address * @walk: set of callbacks to invoke for each level of the tree - * @private: private data passed to the callback function * * Recursively walk the page table for the memory area in a VMA, * calling supplied callbacks. Callbacks are called in-order (first * PGD, first PUD, first PMD, first PTE, second PTE... second PMD, * etc.). If lower-level callbacks are omitted, walking depth is reduced. * - * Each callback receives an entry pointer, the start and end of the - * associated range, and a caller-supplied private data pointer. + * Each callback receives an entry pointer and the start and end of the + * associated range, and a copy of the original mm_walk for access to + * the ->private or ->mm fields. * * No locks are taken, but the bottom level iterator will map PTE * directories from highmem if necessary. @@ -101,9 +101,8 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, * If any callback returns a non-zero value, the walk is aborted and * the return value is propagated back to the caller. Otherwise 0 is returned. */ -int walk_page_range(const struct mm_struct *mm, - unsigned long addr, unsigned long end, - const struct mm_walk *walk, void *private) +int walk_page_range(unsigned long addr, unsigned long end, + struct mm_walk *walk) { pgd_t *pgd; unsigned long next; @@ -112,21 +111,24 @@ int walk_page_range(const struct mm_struct *mm, if (addr >= end) return err; - pgd = pgd_offset(mm, addr); + if (!walk->mm) + return -EINVAL; + + pgd = pgd_offset(walk->mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) { if (walk->pte_hole) - err = walk->pte_hole(addr, next, private); + err = walk->pte_hole(addr, next, walk); if (err) break; continue; } if (walk->pgd_entry) - err = walk->pgd_entry(pgd, addr, next, private); + err = walk->pgd_entry(pgd, addr, next, walk); if (!err && (walk->pud_entry || walk->pmd_entry || walk->pte_entry)) - err = walk_pud_range(pgd, addr, next, walk, private); + err = walk_pud_range(pgd, addr, next, walk); if (err) break; } while (pgd++, addr = next, addr != end); diff --git a/mm/slab.c b/mm/slab.c index 06236e4ddc1..046607f05f3 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3263,9 +3263,12 @@ retry: if (cpuset_zone_allowed_hardwall(zone, flags) && cache->nodelists[nid] && - cache->nodelists[nid]->free_objects) + cache->nodelists[nid]->free_objects) { obj = ____cache_alloc_node(cache, flags | GFP_THISNODE, nid); + if (obj) + break; + } } if (!obj) { diff --git a/mm/slub.c b/mm/slub.c index 0987d1cd943..1a427c0ae83 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5,7 +5,7 @@ * The allocator synchronizes using per slab locks and only * uses a centralized lock to manage a pool of partial slabs. * - * (C) 2007 SGI, Christoph Lameter <clameter@sgi.com> + * (C) 2007 SGI, Christoph Lameter */ #include <linux/mm.h> @@ -2995,8 +2995,6 @@ void __init kmem_cache_init(void) create_kmalloc_cache(&kmalloc_caches[1], "kmalloc-96", 96, GFP_KERNEL); caches++; - } - if (KMALLOC_MIN_SIZE <= 128) { create_kmalloc_cache(&kmalloc_caches[2], "kmalloc-192", 192, GFP_KERNEL); caches++; @@ -3026,6 +3024,16 @@ void __init kmem_cache_init(void) for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW; + if (KMALLOC_MIN_SIZE == 128) { + /* + * The 192 byte sized cache is not used if the alignment + * is 128 byte. Redirect kmalloc to use the 256 byte cache + * instead. + */ + for (i = 128 + 8; i <= 192; i += 8) + size_index[(i - 1) / 8] = 8; + } + slab_state = UP; /* Provide the correct kmalloc names now that the caches are up */ diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 99c4f36eb8a..a91b5f8fcaf 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -1,7 +1,7 @@ /* * Virtual Memory Map support * - * (C) 2007 sgi. Christoph Lameter <clameter@sgi.com>. + * (C) 2007 sgi. Christoph Lameter. * * Virtual memory maps allow VM primitives pfn_to_page, page_to_pfn, * virt_to_page, page_address() to be implemented as a base offset diff --git a/mm/vmscan.c b/mm/vmscan.c index 9a29901ad3b..967d30ccd92 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1307,7 +1307,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct scan_control *sc) { int priority; - int ret = 0; + unsigned long ret = 0; unsigned long total_scanned = 0; unsigned long nr_reclaimed = 0; struct reclaim_state *reclaim_state = current->reclaim_state; |