14 files changed, 157 insertions, 79 deletions
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index f4026bae6ee..05f2b4009cc 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -1,7 +1,7 @@
 /*
  * linux/mm/allocpercpu.c
  *
- * Separated from slab.c August 11, 2006 Christoph Lameter <clameter@sgi.com>
+ * Separated from slab.c August 11, 2006 Christoph Lameter
  */
 #include <linux/mm.h>
 #include <linux/module.h>
diff --git a/mm/bootmem.c b/mm/bootmem.c
index e8fb927392b..8d9f60e06f6 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -442,15 +442,17 @@ unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
 	return init_bootmem_core(pgdat, freepfn, startpfn, endpfn);
 }
 
-void __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
+int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
 				 unsigned long size, int flags)
 {
 	int ret;
 
 	ret = can_reserve_bootmem_core(pgdat->bdata, physaddr, size, flags);
 	if (ret < 0)
-		return;
+		return -ENOMEM;
 	reserve_bootmem_core(pgdat->bdata, physaddr, size, flags);
+
+	return 0;
 }
 
 void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bbf953eeb58..ab171274ef2 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -785,7 +785,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			continue;
 
 		spin_lock(&dst->page_table_lock);
-		spin_lock(&src->page_table_lock);
+		spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING);
 		if (!huge_pte_none(huge_ptep_get(src_pte))) {
 			if (cow)
 				huge_ptep_set_wrprotect(src, addr, src_pte);
diff --git a/mm/memory.c b/mm/memory.c
index 19e0ae9beec..2302d228fe0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -999,17 +999,15 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 		goto no_page_table;
 
 	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
-	if (!ptep)
-		goto out;
 
 	pte = *ptep;
 	if (!pte_present(pte))
-		goto unlock;
+		goto no_page;
 	if ((flags & FOLL_WRITE) && !pte_write(pte))
 		goto unlock;
 	page = vm_normal_page(vma, address, pte);
 	if (unlikely(!page))
-		goto unlock;
+		goto bad_page;
 
 	if (flags & FOLL_GET)
 		get_page(page);
@@ -1024,6 +1022,15 @@ unlock:
 out:
 	return page;
 
+bad_page:
+	pte_unmap_unlock(ptep, ptl);
+	return ERR_PTR(-EFAULT);
+
+no_page:
+	pte_unmap_unlock(ptep, ptl);
+	if (!pte_none(pte))
+		return page;
+	/* Fall through to ZERO_PAGE handling */
 no_page_table:
 	/*
 	 * When core dumping an enormous anonymous area that nobody
@@ -1038,6 +1045,26 @@ no_page_table:
 	return page;
 }
 
+/* Can we do the FOLL_ANON optimization? */
+static inline int use_zero_page(struct vm_area_struct *vma)
+{
+	/*
+	 * We don't want to optimize FOLL_ANON for make_pages_present()
+	 * when it tries to page in a VM_LOCKED region. As to VM_SHARED,
+	 * we want to get the page from the page tables to make sure
+	 * that we serialize and update with any other user of that
+	 * mapping.
+	 */
+	if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
+		return 0;
+	/*
+	 * And if we have a fault or a nopfn routine, it's not an
+	 * anonymous region.
+	 */
+	return !vma->vm_ops ||
+		(!vma->vm_ops->fault && !vma->vm_ops->nopfn);
+}
+
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		unsigned long start, int len, int write, int force,
 		struct page **pages, struct vm_area_struct **vmas)
@@ -1112,8 +1139,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		foll_flags = FOLL_TOUCH;
 		if (pages)
 			foll_flags |= FOLL_GET;
-		if (!write && !(vma->vm_flags & VM_LOCKED) &&
-		    (!vma->vm_ops || !vma->vm_ops->fault))
+		if (!write && use_zero_page(vma))
 			foll_flags |= FOLL_ANON;
 
 		do {
@@ -1125,7 +1151,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			 * be processed until returning to user space.
 			 */
 			if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE)))
-				return -ENOMEM;
+				return i ? i : -ENOMEM;
 
 			if (write)
 				foll_flags |= FOLL_WRITE;
@@ -1159,6 +1185,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 
 				cond_resched();
 			}
+			if (IS_ERR(page))
+				return i ? i : PTR_ERR(page);
 			if (pages) {
 				pages[i] = page;
 
@@ -1669,8 +1697,19 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	struct page *dirty_page = NULL;
 
 	old_page = vm_normal_page(vma, address, orig_pte);
-	if (!old_page)
+	if (!old_page) {
+		/*
+		 * VM_MIXEDMAP !pfn_valid() case
+		 *
+		 * We should not cow pages in a shared writeable mapping.
+		 * Just mark the pages writable as we can't do any dirty
+		 * accounting on raw pfn maps.
+		 */
+		if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+				     (VM_WRITE|VM_SHARED))
+			goto reuse;
 		goto gotten;
+	}
 
 	/*
 	 * Take out anonymous pages first, anonymous shared vmas are
@@ -1723,6 +1762,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 
 	if (reuse) {
+reuse:
 		flush_cache_page(vma, address, pte_pfn(orig_pte));
 		entry = pte_mkyoung(orig_pte);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -1757,7 +1797,6 @@ gotten:
 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 	if (likely(pte_same(*page_table, orig_pte))) {
 		if (old_page) {
-			page_remove_rmap(old_page, vma);
 			if (!PageAnon(old_page)) {
 				dec_mm_counter(mm, file_rss);
 				inc_mm_counter(mm, anon_rss);
@@ -1779,6 +1818,32 @@ gotten:
 		lru_cache_add_active(new_page);
 		page_add_new_anon_rmap(new_page, vma, address);
 
+		if (old_page) {
+			/*
+			 * Only after switching the pte to the new page may
+			 * we remove the mapcount here. Otherwise another
+			 * process may come and find the rmap count decremented
+			 * before the pte is switched to the new page, and
+			 * "reuse" the old page writing into it while our pte
+			 * here still points into it and can be read by other
+			 * threads.
+			 *
+			 * The critical issue is to order this
+			 * page_remove_rmap with the ptp_clear_flush above.
+			 * Those stores are ordered by (if nothing else,)
+			 * the barrier present in the atomic_add_negative
+			 * in page_remove_rmap.
+			 *
+			 * Then the TLB flush in ptep_clear_flush ensures that
+			 * no process can access the old page before the
+			 * decremented mapcount is visible. And the old page
+			 * cannot be reused until after the decremented
+			 * mapcount is visible. So transitively, TLBs to
+			 * old page will be flushed before it can be reused.
+			 */
+			page_remove_rmap(old_page, vma);
+		}
+
 		/* Free the old page.. */
 		new_page = old_page;
 		ret |= VM_FAULT_WRITE;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index a37a5034f63..c94e58b192c 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -729,7 +729,11 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 	} else {
 		*policy = pol == &default_policy ? MPOL_DEFAULT :
 						pol->mode;
-		*policy |= pol->flags;
+		/*
+		 * Internal mempolicy flags must be masked off before exposing
+		 * the policy to userspace.
+		 */
+		*policy |= (pol->flags & MPOL_MODE_FLAGS);
 	}
 
 	if (vma) {
diff --git a/mm/migrate.c b/mm/migrate.c
index 449d77d409f..55bd355d170 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -9,7 +9,7 @@
  * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
  * Hirokazu Takahashi <taka@valinux.co.jp>
  * Dave Hansen <haveblue@us.ibm.com>
- * Christoph Lameter <clameter@sgi.com>
+ * Christoph Lameter
  */
 
 #include <linux/migrate.h>
@@ -865,6 +865,11 @@ static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm,
 			goto set_status;
 
 		page = follow_page(vma, pp->addr, FOLL_GET);
+
+		err = PTR_ERR(page);
+		if (IS_ERR(page))
+			goto set_status;
+
 		err = -ENOENT;
 		if (!page)
 			goto set_status;
@@ -928,6 +933,11 @@ static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm)
 			goto set_status;
 
 		page = follow_page(vma, pm->addr, 0);
+
+		err = PTR_ERR(page);
+		if (IS_ERR(page))
+			goto set_status;
+
 		err = -ENOENT;
 		/* Use PageReserved to check for zero page */
 		if (!page || PageReserved(page))
diff --git a/mm/mmap.c b/mm/mmap.c
index 669499e7c2f..3354fdd83d4 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -245,10 +245,16 @@ asmlinkage unsigned long sys_brk(unsigned long brk)
 	unsigned long rlim, retval;
 	unsigned long newbrk, oldbrk;
 	struct mm_struct *mm = current->mm;
+	unsigned long min_brk;
 
 	down_write(&mm->mmap_sem);
 
-	if (brk < mm->start_brk)
+#ifdef CONFIG_COMPAT_BRK
+	min_brk = mm->end_code;
+#else
+	min_brk = mm->start_brk;
+#endif
+	if (brk < min_brk)
 		goto out;
 
 	/*
diff --git a/mm/nommu.c b/mm/nommu.c
index dca93fcb8b7..4462b6a3fcb 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -109,16 +109,23 @@ unsigned int kobjsize(const void *objp)
 	 * If the object we have should not have ksize performed on it,
 	 * return size of 0
 	 */
-	if (!objp || (unsigned long)objp >= memory_end || !((page = virt_to_page(objp))))
+	if (!objp || !virt_addr_valid(objp))
 		return 0;
 
+	page = virt_to_head_page(objp);
+
+	/*
+	 * If the allocator sets PageSlab, we know the pointer came from
+	 * kmalloc().
+	 */
 	if (PageSlab(page))
 		return ksize(objp);
 
-	BUG_ON(page->index < 0);
-	BUG_ON(page->index >= MAX_ORDER);
-
-	return (PAGE_SIZE << page->index);
+	/*
+	 * The ksize() function is only guaranteed to work for pointers
+	 * returned by kmalloc(). So handle arbitrary pointers here.
+	 */
+	return PAGE_SIZE << compound_order(page);
 }
 
 /*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8e83f02cd2d..f32fae3121f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -237,16 +237,7 @@ static void bad_page(struct page *page)
 	printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
 		KERN_EMERG "Backtrace:\n");
 	dump_stack();
-	page->flags &= ~(1 << PG_lru	|
-			1 << PG_private |
-			1 << PG_locked	|
-			1 << PG_active	|
-			1 << PG_dirty	|
-			1 << PG_reclaim |
-			1 << PG_slab    |
-			1 << PG_swapcache |
-			1 << PG_writeback |
-			1 << PG_buddy );
+	page->flags &= ~PAGE_FLAGS_CLEAR_WHEN_BAD;
 	set_page_count(page, 0);
 	reset_page_mapcount(page);
 	page->mapping = NULL;
@@ -463,16 +454,7 @@ static inline int free_pages_check(struct page *page)
 		(page->mapping != NULL)  |
 		(page_get_page_cgroup(page) != NULL) |
 		(page_count(page) != 0)  |
-		(page->flags & (
-			1 << PG_lru	|
-			1 << PG_private |
-			1 << PG_locked	|
-			1 << PG_active	|
-			1 << PG_slab	|
-			1 << PG_swapcache |
-			1 << PG_writeback |
-			1 << PG_reserved |
-			1 << PG_buddy ))))
+		(page->flags & PAGE_FLAGS_CHECK_AT_FREE)))
 		bad_page(page);
 	if (PageDirty(page))
 		__ClearPageDirty(page);
@@ -616,17 +598,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
 		(page->mapping != NULL)  |
 		(page_get_page_cgroup(page) != NULL) |
 		(page_count(page) != 0)  |
-		(page->flags & (
-			1 << PG_lru	|
-			1 << PG_private	|
-			1 << PG_locked	|
-			1 << PG_active	|
-			1 << PG_dirty	|
-			1 << PG_slab    |
-			1 << PG_swapcache |
-			1 << PG_writeback |
-			1 << PG_reserved |
-			1 << PG_buddy ))))
+		(page->flags & PAGE_FLAGS_CHECK_AT_PREP)))
 		bad_page(page);
 
 	/*
@@ -2356,7 +2328,6 @@ static void build_zonelists(pg_data_t *pgdat)
 static void build_zonelist_cache(pg_data_t *pgdat)
 {
 	pgdat->node_zonelists[0].zlcache_ptr = NULL;
-	pgdat->node_zonelists[1].zlcache_ptr = NULL;
 }
 
 #endif	/* CONFIG_NUMA */
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 0afd2387e50..d5878bed784 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -3,14 +3,14 @@
 #include <linux/sched.h>
 
 static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
-			  const struct mm_walk *walk, void *private)
+			  struct mm_walk *walk)
 {
 	pte_t *pte;
 	int err = 0;
 
 	pte = pte_offset_map(pmd, addr);
 	for (;;) {
-		err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, private);
+		err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
 		if (err)
 		       break;
 		addr += PAGE_SIZE;
@@ -24,7 +24,7 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 }
 
 static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
-			  const struct mm_walk *walk, void *private)
+			  struct mm_walk *walk)
 {
 	pmd_t *pmd;
 	unsigned long next;
@@ -35,15 +35,15 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd)) {
 			if (walk->pte_hole)
-				err = walk->pte_hole(addr, next, private);
+				err = walk->pte_hole(addr, next, walk);
 			if (err)
 				break;
 			continue;
 		}
 		if (walk->pmd_entry)
-			err = walk->pmd_entry(pmd, addr, next, private);
+			err = walk->pmd_entry(pmd, addr, next, walk);
 		if (!err && walk->pte_entry)
-			err = walk_pte_range(pmd, addr, next, walk, private);
+			err = walk_pte_range(pmd, addr, next, walk);
 		if (err)
 			break;
 	} while (pmd++, addr = next, addr != end);
@@ -52,7 +52,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
 }
 
 static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
-			  const struct mm_walk *walk, void *private)
+			  struct mm_walk *walk)
 {
 	pud_t *pud;
 	unsigned long next;
@@ -63,15 +63,15 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud)) {
 			if (walk->pte_hole)
-				err = walk->pte_hole(addr, next, private);
+				err = walk->pte_hole(addr, next, walk);
 			if (err)
 				break;
 			continue;
 		}
 		if (walk->pud_entry)
-			err = walk->pud_entry(pud, addr, next, private);
+			err = walk->pud_entry(pud, addr, next, walk);
 		if (!err && (walk->pmd_entry || walk->pte_entry))
-			err = walk_pmd_range(pud, addr, next, walk, private);
+			err = walk_pmd_range(pud, addr, next, walk);
 		if (err)
 			break;
 	} while (pud++, addr = next, addr != end);
@@ -85,15 +85,15 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
  * @addr: starting address
  * @end: ending address
  * @walk: set of callbacks to invoke for each level of the tree
- * @private: private data passed to the callback function
  *
  * Recursively walk the page table for the memory area in a VMA,
  * calling supplied callbacks. Callbacks are called in-order (first
  * PGD, first PUD, first PMD, first PTE, second PTE... second PMD,
  * etc.). If lower-level callbacks are omitted, walking depth is reduced.
  *
- * Each callback receives an entry pointer, the start and end of the
- * associated range, and a caller-supplied private data pointer.
+ * Each callback receives an entry pointer and the start and end of the
+ * associated range, and a copy of the original mm_walk for access to
+ * the ->private or ->mm fields.
  *
  * No locks are taken, but the bottom level iterator will map PTE
  * directories from highmem if necessary.
@@ -101,9 +101,8 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
  * If any callback returns a non-zero value, the walk is aborted and
  * the return value is propagated back to the caller. Otherwise 0 is returned.
  */
-int walk_page_range(const struct mm_struct *mm,
-		    unsigned long addr, unsigned long end,
-		    const struct mm_walk *walk, void *private)
+int walk_page_range(unsigned long addr, unsigned long end,
+		    struct mm_walk *walk)
 {
 	pgd_t *pgd;
 	unsigned long next;
@@ -112,21 +111,24 @@ int walk_page_range(const struct mm_struct *mm,
 	if (addr >= end)
 		return err;
 
-	pgd = pgd_offset(mm, addr);
+	if (!walk->mm)
+		return -EINVAL;
+
+	pgd = pgd_offset(walk->mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd)) {
 			if (walk->pte_hole)
-				err = walk->pte_hole(addr, next, private);
+				err = walk->pte_hole(addr, next, walk);
 			if (err)
 				break;
 			continue;
 		}
 		if (walk->pgd_entry)
-			err = walk->pgd_entry(pgd, addr, next, private);
+			err = walk->pgd_entry(pgd, addr, next, walk);
 		if (!err &&
 		    (walk->pud_entry || walk->pmd_entry || walk->pte_entry))
-			err = walk_pud_range(pgd, addr, next, walk, private);
+			err = walk_pud_range(pgd, addr, next, walk);
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
diff --git a/mm/slab.c b/mm/slab.c
index 06236e4ddc1..046607f05f3 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3263,9 +3263,12 @@ retry:
 
 		if (cpuset_zone_allowed_hardwall(zone, flags) &&
 			cache->nodelists[nid] &&
-			cache->nodelists[nid]->free_objects)
+			cache->nodelists[nid]->free_objects) {
 				obj = ____cache_alloc_node(cache,
 					flags | GFP_THISNODE, nid);
+				if (obj)
+					break;
+		}
 	}
 
 	if (!obj) {
diff --git a/mm/slub.c b/mm/slub.c
index 0987d1cd943..1a427c0ae83 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -5,7 +5,7 @@
  * The allocator synchronizes using per slab locks and only
  * uses a centralized lock to manage a pool of partial slabs.
  *
- * (C) 2007 SGI, Christoph Lameter <clameter@sgi.com>
+ * (C) 2007 SGI, Christoph Lameter
  */
 
 #include <linux/mm.h>
@@ -2995,8 +2995,6 @@ void __init kmem_cache_init(void)
 		create_kmalloc_cache(&kmalloc_caches[1],
 				"kmalloc-96", 96, GFP_KERNEL);
 		caches++;
-	}
-	if (KMALLOC_MIN_SIZE <= 128) {
 		create_kmalloc_cache(&kmalloc_caches[2],
 				"kmalloc-192", 192, GFP_KERNEL);
 		caches++;
@@ -3026,6 +3024,16 @@ void __init kmem_cache_init(void)
 	for (i = 8; i < KMALLOC_MIN_SIZE; i += 8)
 		size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW;
 
+	if (KMALLOC_MIN_SIZE == 128) {
+		/*
+		 * The 192 byte sized cache is not used if the alignment
+		 * is 128 byte. Redirect kmalloc to use the 256 byte cache
+		 * instead.
+		 */
+		for (i = 128 + 8; i <= 192; i += 8)
+			size_index[(i - 1) / 8] = 8;
+	}
+
 	slab_state = UP;
 
 	/* Provide the correct kmalloc names now that the caches are up */
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 99c4f36eb8a..a91b5f8fcaf 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -1,7 +1,7 @@
 /*
  * Virtual Memory Map support
  *
- * (C) 2007 sgi. Christoph Lameter <clameter@sgi.com>.
+ * (C) 2007 sgi. Christoph Lameter.
  *
  * Virtual memory maps allow VM primitives pfn_to_page, page_to_pfn,
  * virt_to_page, page_address() to be implemented as a base offset
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9a29901ad3b..967d30ccd92 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1307,7 +1307,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 					struct scan_control *sc)
 {
 	int priority;
-	int ret = 0;
+	unsigned long ret = 0;
 	unsigned long total_scanned = 0;
 	unsigned long nr_reclaimed = 0;
 	struct reclaim_state *reclaim_state = current->reclaim_state;