8 files changed, 129 insertions, 64 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 67f29516662..508707704d2 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -85,7 +85,7 @@ void free_huge_page(struct page *page)
 	BUG_ON(page_count(page));
 
 	INIT_LIST_HEAD(&page->lru);
-	page[1].mapping = NULL;
+	page[1].lru.next = NULL;			/* reset dtor */
 
 	spin_lock(&hugetlb_lock);
 	enqueue_huge_page(page);
@@ -105,7 +105,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
 	}
 	spin_unlock(&hugetlb_lock);
 	set_page_count(page, 1);
-	page[1].mapping = (void *)free_huge_page;
+	page[1].lru.next = (void *)free_huge_page;	/* set dtor */
 	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
 		clear_user_highpage(&page[i], addr);
 	return page;
diff --git a/mm/madvise.c b/mm/madvise.c
index ae0ae3ea299..af3d573b014 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -22,16 +22,23 @@ static long madvise_behavior(struct vm_area_struct * vma,
 	struct mm_struct * mm = vma->vm_mm;
 	int error = 0;
 	pgoff_t pgoff;
-	int new_flags = vma->vm_flags & ~VM_READHINTMASK;
+	int new_flags = vma->vm_flags;
 
 	switch (behavior) {
+	case MADV_NORMAL:
+		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
+		break;
 	case MADV_SEQUENTIAL:
-		new_flags |= VM_SEQ_READ;
+		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
 		break;
 	case MADV_RANDOM:
-		new_flags |= VM_RAND_READ;
+		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
 		break;
-	default:
+	case MADV_DONTFORK:
+		new_flags |= VM_DONTCOPY;
+		break;
+	case MADV_DOFORK:
+		new_flags &= ~VM_DONTCOPY;
 		break;
 	}
 
@@ -177,6 +184,12 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
 	long error;
 
 	switch (behavior) {
+	case MADV_DOFORK:
+		if (vma->vm_flags & VM_IO) {
+			error = -EINVAL;
+			break;
+		}
+	case MADV_DONTFORK:
 	case MADV_NORMAL:
 	case MADV_SEQUENTIAL:
 	case MADV_RANDOM:
diff --git a/mm/memory.c b/mm/memory.c
index 2bee1f21aa8..9abc6008544 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -82,6 +82,16 @@ EXPORT_SYMBOL(num_physpages);
 EXPORT_SYMBOL(high_memory);
 EXPORT_SYMBOL(vmalloc_earlyreserve);
 
+int randomize_va_space __read_mostly = 1;
+
+/* "norandmaps" boot option: clear randomize_va_space. */
+static int __init disable_randmaps(char *s)
+{
+	randomize_va_space = 0;
+	return 1;	/* 1 = handled; 0 would pass it on to init */
+}
+__setup("norandmaps", disable_randmaps);
+
 /*
  * If a p?d_bad entry is found while walking page tables, report
  * the error, before resetting entry to p?d_none.  Usually (but
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3bd7fb7e4b7..323fdcf128c 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -132,19 +132,29 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
 	}
 	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 }
+
 /* Generate a custom zonelist for the BIND policy. */
 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 {
 	struct zonelist *zl;
-	int num, max, nd;
+	int num, max, nd, k;
 
 	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
-	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
+	zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
 	if (!zl)
 		return NULL;
 	num = 0;
-	for_each_node_mask(nd, *nodes)
-		zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
+	/* Fill highest zone first: for each zone index from policy_zone
+	   down to 0, append that zone of every node in the mask.  Zones
+	   with no present pages are skipped because the allocator does
+	   not cope with empty zones; node hot removal must revisit this. */
+	for (k = policy_zone; k >= 0; k--) { 
+		for_each_node_mask(nd, *nodes) { 
+			struct zone *z = &NODE_DATA(nd)->node_zones[k];
+			if (z->present_pages > 0) 
+				zl->zones[num++] = z;
+		}
+	}
 	zl->zones[num] = NULL;
 	return zl;
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dde04ff4be3..62c12252858 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -56,6 +56,7 @@ long nr_swap_pages;
 int percpu_pagelist_fraction;
 
 static void fastcall free_hot_cold_page(struct page *page, int cold);
+static void __free_pages_ok(struct page *page, unsigned int order);
 
 /*
  * results with 256, 32 in the lowmem_reserve sysctl:
@@ -169,20 +170,23 @@ static void bad_page(struct page *page)
  * All pages have PG_compound set.  All pages have their ->private pointing at
  * the head page (even the head page has this).
  *
- * The first tail page's ->mapping, if non-zero, holds the address of the
- * compound page's put_page() function.
- *
- * The order of the allocation is stored in the first tail page's ->index
- * This is only for debug at present.  This usage means that zero-order pages
- * may not be compound.
+ * The first tail page's ->lru.next holds the address of the compound page's
+ * put_page() function.  Its ->lru.prev holds the order of allocation.
+ * This usage means that zero-order pages may not be compound.
  */
+
+static void free_compound_page(struct page *page)
+{
+	__free_pages_ok(page, (unsigned long)page[1].lru.prev); /* order */
+}
+
 static void prep_compound_page(struct page *page, unsigned long order)
 {
 	int i;
 	int nr_pages = 1 << order;
 
-	page[1].mapping = NULL;
-	page[1].index = order;
+	page[1].lru.next = (void *)free_compound_page;	/* set dtor */
+	page[1].lru.prev = (void *)order;
 	for (i = 0; i < nr_pages; i++) {
 		struct page *p = page + i;
 
@@ -196,7 +200,7 @@ static void destroy_compound_page(struct page *page, unsigned long order)
 	int i;
 	int nr_pages = 1 << order;
 
-	if (unlikely(page[1].index != order))
+	if (unlikely((unsigned long)page[1].lru.prev != order))
 		bad_page(page);
 
 	for (i = 0; i < nr_pages; i++) {
diff --git a/mm/slab.c b/mm/slab.c
index d66c2b0d971..add05d808a4 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1717,6 +1717,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 		BUG();
 	}
 
+	/*
+	 * Prevent CPUs from coming and going.
+	 * lock_cpu_hotplug() nests outside cache_chain_mutex
+	 */
+	lock_cpu_hotplug();
+
 	mutex_lock(&cache_chain_mutex);
 
 	list_for_each(p, &cache_chain) {
@@ -1918,8 +1924,6 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	cachep->dtor = dtor;
 	cachep->name = name;
 
-	/* Don't let CPUs to come and go */
-	lock_cpu_hotplug();
 
 	if (g_cpucache_up == FULL) {
 		enable_cpucache(cachep);
@@ -1978,12 +1982,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 
 	/* cache setup completed, link it into the list */
 	list_add(&cachep->next, &cache_chain);
-	unlock_cpu_hotplug();
       oops:
 	if (!cachep && (flags & SLAB_PANIC))
 		panic("kmem_cache_create(): failed to create slab `%s'\n",
 		      name);
 	mutex_unlock(&cache_chain_mutex);
+	unlock_cpu_hotplug();
 	return cachep;
 }
 EXPORT_SYMBOL(kmem_cache_create);
diff --git a/mm/swap.c b/mm/swap.c
index 76247424dea..cce3dda59c5 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -40,7 +40,7 @@ static void put_compound_page(struct page *page)
 	if (put_page_testzero(page)) {
 		void (*dtor)(struct page *page);
 
-		dtor = (void (*)(struct page *))page[1].mapping;
+		dtor = (void (*)(struct page *))page[1].lru.next;
 		(*dtor)(page);
 	}
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5a610804cd0..1838c15ca4f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -443,6 +443,10 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
 		BUG_ON(PageActive(page));
 
 		sc->nr_scanned++;
+
+		if (!sc->may_swap && page_mapped(page))
+			goto keep_locked;
+
 		/* Double the slab pressure for mapped and swapcache pages */
 		if (page_mapped(page) || PageSwapCache(page))
 			sc->nr_scanned++;
@@ -632,7 +636,7 @@ static int swap_page(struct page *page)
 	struct address_space *mapping = page_mapping(page);
 
 	if (page_mapped(page) && mapping)
-		if (try_to_unmap(page, 0) != SWAP_SUCCESS)
+		if (try_to_unmap(page, 1) != SWAP_SUCCESS)
 			goto unlock_retry;
 
 	if (PageDirty(page)) {
@@ -839,7 +843,7 @@ EXPORT_SYMBOL(migrate_page);
  * pages are swapped out.
  *
  * The function returns after 10 attempts or if no pages
- * are movable anymore because t has become empty
+ * are movable anymore because "to" has become empty
  * or no retryable pages exist anymore.
  *
  * Return: Number of pages not migrated when "to" ran empty.
@@ -928,12 +932,21 @@ redo:
 			goto unlock_both;
 
 		if (mapping->a_ops->migratepage) {
+			/*
+			 * Most pages have a mapping and most filesystems
+			 * should provide a migration function. Anonymous
+			 * pages are part of swap space which also has its
+			 * own migration function. This is the most common
+			 * path for page migration.
+			 */
 			rc = mapping->a_ops->migratepage(newpage, page);
 			goto unlock_both;
                 }
 
 		/*
-		 * Trigger writeout if page is dirty
+		 * Default handling if a filesystem does not provide
+		 * a migration function. We can only migrate clean
+		 * pages so try to write out any dirty pages first.
 		 */
 		if (PageDirty(page)) {
 			switch (pageout(page, mapping)) {
@@ -949,9 +962,10 @@ redo:
 				; /* try to migrate the page below */
 			}
                 }
+
 		/*
-		 * If we have no buffer or can release the buffer
-		 * then do a simple migration.
+		 * Buffers are managed in a filesystem specific way.
+		 * We must have no buffers or drop them.
 		 */
 		if (!page_has_buffers(page) ||
 		    try_to_release_page(page, GFP_KERNEL)) {
@@ -966,6 +980,11 @@ redo:
 		 * swap them out.
 		 */
 		if (pass > 4) {
+			/*
+			 * Persistently unable to drop buffers..... As a
+			 * measure of last resort we fall back to
+			 * swap_page().
+			 */
 			unlock_page(newpage);
 			newpage = NULL;
 			rc = swap_page(page);
@@ -1176,9 +1195,47 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
 	struct page *page;
 	struct pagevec pvec;
 	int reclaim_mapped = 0;
-	long mapped_ratio;
-	long distress;
-	long swap_tendency;
+
+	if (unlikely(sc->may_swap)) {
+		long mapped_ratio;
+		long distress;
+		long swap_tendency;
+
+		/*
+		 * `distress' is a measure of how much trouble we're having
+		 * reclaiming pages.  0 -> no problems.  100 -> great trouble.
+		 */
+		distress = 100 >> zone->prev_priority;
+
+		/*
+		 * The point of this algorithm is to decide when to start
+		 * reclaiming mapped memory instead of just pagecache.  Work
+		 * out how much memory is mapped, as a percentage of total
+		 * memory.
+		 */
+		mapped_ratio = (sc->nr_mapped * 100) / total_memory;
+
+		/*
+		 * Now decide how much we really want to unmap some pages.  The
+		 * mapped ratio is downgraded - just because there's a lot of
+		 * mapped memory doesn't necessarily mean that page reclaim
+		 * isn't succeeding.
+		 *
+		 * The distress ratio is important - we don't want to start
+		 * going oom.
+		 *
+		 * A 100% value of vm_swappiness overrides this algorithm
+		 * altogether.
+		 */
+		swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
+
+		/*
+		 * Now use this metric to decide whether to start moving mapped
+		 * memory onto the inactive list.
+		 */
+		if (swap_tendency >= 100)
+			reclaim_mapped = 1;
+	}
 
 	lru_add_drain();
 	spin_lock_irq(&zone->lru_lock);
@@ -1188,37 +1245,6 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
 	zone->nr_active -= pgmoved;
 	spin_unlock_irq(&zone->lru_lock);
 
-	/*
-	 * `distress' is a measure of how much trouble we're having reclaiming
-	 * pages.  0 -> no problems.  100 -> great trouble.
-	 */
-	distress = 100 >> zone->prev_priority;
-
-	/*
-	 * The point of this algorithm is to decide when to start reclaiming
-	 * mapped memory instead of just pagecache.  Work out how much memory
-	 * is mapped.
-	 */
-	mapped_ratio = (sc->nr_mapped * 100) / total_memory;
-
-	/*
-	 * Now decide how much we really want to unmap some pages.  The mapped
-	 * ratio is downgraded - just because there's a lot of mapped memory
-	 * doesn't necessarily mean that page reclaim isn't succeeding.
-	 *
-	 * The distress ratio is important - we don't want to start going oom.
-	 *
-	 * A 100% value of vm_swappiness overrides this algorithm altogether.
-	 */
-	swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
-
-	/*
-	 * Now use this metric to decide whether to start moving mapped memory
-	 * onto the inactive list.
-	 */
-	if (swap_tendency >= 100)
-		reclaim_mapped = 1;
-
 	while (!list_empty(&l_hold)) {
 		cond_resched();
 		page = lru_to_page(&l_hold);
@@ -1595,9 +1621,7 @@ scan:
 			sc.nr_reclaimed = 0;
 			sc.priority = priority;
 			sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX;
-			atomic_inc(&zone->reclaim_in_progress);
 			shrink_zone(zone, &sc);
-			atomic_dec(&zone->reclaim_in_progress);
 			reclaim_state->reclaimed_slab = 0;
 			nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
 						lru_pages);