From a8062231d80239cf3405982858c02aea21a6066a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 7 Apr 2006 19:49:21 +0200 Subject: [PATCH] x86_64: Handle empty PXMs that only contain hotplug memory The node setup code would try to allocate the node metadata in the node itself, but that fails if there is no memory in there. This can happen with memory hotplug when the hotplug area defines a so-far empty node. Now use bootmem to try to allocate the mem_map in other nodes. And if it fails don't panic, but just ignore the node. To make this work I added a new __alloc_bootmem_nopanic function that does what its name implies. TBD: should try to use nearby nodes here. Currently we just use any. It's hard to do it better because bootmem doesn't have proper fallback lists yet. Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- mm/bootmem.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index d3e3bd2ffce..d213feded10 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -401,7 +401,7 @@ unsigned long __init free_all_bootmem (void) return(free_all_bootmem_core(NODE_DATA(0))); } -void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal) +void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal) { bootmem_data_t *bdata; void *ptr; @@ -409,7 +409,14 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned list_for_each_entry(bdata, &bdata_list, list) if ((ptr = __alloc_bootmem_core(bdata, size, align, goal, 0))) return(ptr); + return NULL; +} +void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal) +{ + void *mem = __alloc_bootmem_nopanic(size,align,goal); + if (mem) + return mem; /* * Whoops, we cannot satisfy the allocation request. */ -- cgit v1.2.3 From 676165a8af7167f488abdcce6851a9bc36e83254 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 10 Apr 2006 11:21:48 +1000 Subject: [PATCH] Fix buddy list race that could lead to page lru list corruptions Rohit found an obscure bug causing buddy list corruption. page_is_buddy is using a non-atomic test (PagePrivate && page_count == 0) to determine whether or not a free page's buddy is itself free and in the buddy lists. Each of the conjuncts may be true at different times due to unrelated conditions, so the non-atomic page_is_buddy test may find each conjunct to be true even if they were not both true at the same time (i.e. the page was not on the buddy lists).
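A minimal user-space sketch of that failure mode (illustrative only, not from the patch; the names are invented stand-ins): a writer toggles two flags that are never set at the same instant, yet a reader that tests them one after the other without the writer's lock can still observe both as set. The fix below closes the same window by folding the test into a single PG_buddy bit that is only set, cleared and tested under zone->lock.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int cond_a;   /* stands in for PagePrivate(page)       */
static atomic_int cond_b;   /* stands in for page_count(page) == 0   */

static void *writer(void *unused)
{
    for (int i = 0; i < 10000000; i++) {
        /* The two conditions are never true at the same instant. */
        atomic_store(&cond_a, 1);
        atomic_store(&cond_a, 0);
        atomic_store(&cond_b, 1);
        atomic_store(&cond_b, 0);
    }
    return NULL;
}

int main(void)
{
    pthread_t t;
    long bogus = 0;

    pthread_create(&t, NULL, writer, NULL);
    for (int i = 0; i < 10000000; i++) {
        /* Lock-free two-part test, like the old page_is_buddy(). */
        if (atomic_load(&cond_a) && atomic_load(&cond_b))
            bogus++;        /* observed a state that never existed */
    }
    pthread_join(t, NULL);
    printf("two-part test spuriously true %ld times\n", bogus);
    return 0;
}

Build with gcc -O2 -pthread; the count is probabilistic, but any nonzero result demonstrates the race the commit describes.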
Signed-off-by: Martin Bligh Signed-off-by: Rohit Seth Signed-off-by: Nick Piggin Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dc523a1f270..b8165e037de 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -151,7 +151,8 @@ static void bad_page(struct page *page) 1 << PG_reclaim | 1 << PG_slab | 1 << PG_swapcache | - 1 << PG_writeback ); + 1 << PG_writeback | + 1 << PG_buddy ); set_page_count(page, 0); reset_page_mapcount(page); page->mapping = NULL; @@ -236,12 +237,12 @@ static inline unsigned long page_order(struct page *page) { static inline void set_page_order(struct page *page, int order) { set_page_private(page, order); - __SetPagePrivate(page); + __SetPageBuddy(page); } static inline void rmv_page_order(struct page *page) { - __ClearPagePrivate(page); + __ClearPageBuddy(page); set_page_private(page, 0); } @@ -280,11 +281,13 @@ __find_combined_index(unsigned long page_idx, unsigned int order) * This function checks whether a page is free && is the buddy * we can do coalesce a page and its buddy if * (a) the buddy is not in a hole && - * (b) the buddy is free && - * (c) the buddy is on the buddy system && - * (d) a page and its buddy have the same order. - * for recording page's order, we use page_private(page) and PG_private. + * (b) the buddy is in the buddy system && + * (c) a page and its buddy have the same order. + * + * For recording whether a page is in the buddy system, we use PG_buddy. + * Setting, clearing, and testing PG_buddy is serialized by zone->lock. * + * For recording page's order, we use page_private(page). */ static inline int page_is_buddy(struct page *page, int order) { @@ -293,10 +296,10 @@ static inline int page_is_buddy(struct page *page, int order) return 0; #endif - if (PagePrivate(page) && - (page_order(page) == order) && - page_count(page) == 0) + if (PageBuddy(page) && page_order(page) == order) { + BUG_ON(page_count(page) != 0); return 1; + } return 0; } @@ -313,7 +316,7 @@ static inline int page_is_buddy(struct page *page, int order) * as necessary, plus some accounting needed to play nicely with other * parts of the VM system. * At each level, we keep a list of pages, which are heads of continuous - * free pages of length of (1 << order) and marked with PG_Private.Page's + * free pages of length of (1 << order) and marked with PG_buddy. Page's * order is recorded in page_private(page) field. * So when we are allocating or freeing one, we can derive the state of the * other. That is, if we allocate a small block, and both were @@ -376,7 +379,8 @@ static inline int free_pages_check(struct page *page) 1 << PG_slab | 1 << PG_swapcache | 1 << PG_writeback | - 1 << PG_reserved )))) + 1 << PG_reserved | + 1 << PG_buddy )))) bad_page(page); if (PageDirty(page)) __ClearPageDirty(page); @@ -524,7 +528,8 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) 1 << PG_slab | 1 << PG_swapcache | 1 << PG_writeback | - 1 << PG_reserved )))) + 1 << PG_reserved | + 1 << PG_buddy )))) bad_page(page); /* -- cgit v1.2.3 From 5b74ada7eea1b0064d2b72384827853f349d803a Mon Sep 17 00:00:00 2001 From: Ravikiran G Thirumalai Date: Mon, 10 Apr 2006 22:52:53 -0700 Subject: [PATCH] slab: allocate node local memory for off-slab slabmanagement Allocate off-slab slab descriptors from node local memory. 
Signed-off-by: Alok N Kataria Signed-off-by: Ravikiran Thirumalai Signed-off-by: Shai Fultheim Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index f055c142021..afabad54c4c 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2318,13 +2318,15 @@ EXPORT_SYMBOL(kmem_cache_destroy); /* Get the memory for a slab management obj. */ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, - int colour_off, gfp_t local_flags) + int colour_off, gfp_t local_flags, + int nodeid) { struct slab *slabp; if (OFF_SLAB(cachep)) { /* Slab management obj is off-slab. */ - slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags); + slabp = kmem_cache_alloc_node(cachep->slabp_cache, + local_flags, nodeid); if (!slabp) return NULL; } else { @@ -2334,6 +2336,7 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, slabp->inuse = 0; slabp->colouroff = colour_off; slabp->s_mem = objp + colour_off; + slabp->nodeid = nodeid; return slabp; } @@ -2519,7 +2522,7 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) goto failed; /* Get slab management. */ - slabp = alloc_slabmgmt(cachep, objp, offset, local_flags); + slabp = alloc_slabmgmt(cachep, objp, offset, local_flags, nodeid); if (!slabp) goto opps1; -- cgit v1.2.3 From fb7faf3313d527bf68ba2e7ff3a2b6ebf201af73 Mon Sep 17 00:00:00 2001 From: Ravikiran G Thirumalai Date: Mon, 10 Apr 2006 22:52:54 -0700 Subject: [PATCH] slab: add statistics for alien cache overflows Add a statistics counter which is incremented every time the alien cache overflows. The alien_cache limit is hardcoded to 12 right now. We can use these statistics to tune the alien cache if needed in the future.
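To make the counter's role concrete, here is a small user-space sketch (illustrative only; the struct and function names are simplified stand-ins, not the kernel's): a bounded per-node array buffers objects freed on a remote node, and when a free arrives while the array is already at its limit (hard-coded to 12 in the kernel today) the array is drained and the overflow counter is bumped. A persistently high count would suggest the limit is too small for the workload.

#include <stdio.h>

#define ALIEN_LIMIT 12

struct alien_cache {
    int avail;                      /* entries currently buffered        */
    unsigned long overflows;        /* times a free found the cache full */
    void *entry[ALIEN_LIMIT];
};

static void drain(struct alien_cache *ac)
{
    /* Stand-in for __drain_alien_cache(): return entries to their node. */
    ac->avail = 0;
}

static void alien_free(struct alien_cache *ac, void *obj)
{
    if (ac->avail == ALIEN_LIMIT) { /* cache full: count it, then drain */
        ac->overflows++;
        drain(ac);
    }
    ac->entry[ac->avail++] = obj;
}

int main(void)
{
    struct alien_cache ac = { 0 };
    int objs[40];

    for (int i = 0; i < 40; i++)
        alien_free(&ac, &objs[i]);
    printf("overflows after 40 frees with limit %d: %lu\n",
           ALIEN_LIMIT, ac.overflows);
    return 0;
}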
Signed-off-by: Alok N Kataria Signed-off-by: Ravikiran Thirumalai Signed-off-by: Shai Fultheim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index afabad54c4c..752c5570f2f 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -420,6 +420,7 @@ struct kmem_cache { unsigned long max_freeable; unsigned long node_allocs; unsigned long node_frees; + unsigned long node_overflow; atomic_t allochit; atomic_t allocmiss; atomic_t freehit; @@ -465,6 +466,7 @@ struct kmem_cache { #define STATS_INC_ERR(x) ((x)->errors++) #define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++) #define STATS_INC_NODEFREES(x) ((x)->node_frees++) +#define STATS_INC_ACOVERFLOW(x) ((x)->node_overflow++) #define STATS_SET_FREEABLE(x, i) \ do { \ if ((x)->max_freeable < i) \ @@ -484,6 +486,7 @@ struct kmem_cache { #define STATS_INC_ERR(x) do { } while (0) #define STATS_INC_NODEALLOCS(x) do { } while (0) #define STATS_INC_NODEFREES(x) do { } while (0) +#define STATS_INC_ACOVERFLOW(x) do { } while (0) #define STATS_SET_FREEABLE(x, i) do { } while (0) #define STATS_INC_ALLOCHIT(x) do { } while (0) #define STATS_INC_ALLOCMISS(x) do { } while (0) @@ -3083,9 +3086,11 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) if (l3->alien && l3->alien[nodeid]) { alien = l3->alien[nodeid]; spin_lock(&alien->lock); - if (unlikely(alien->avail == alien->limit)) + if (unlikely(alien->avail == alien->limit)) { + STATS_INC_ACOVERFLOW(cachep); __drain_alien_cache(cachep, alien, nodeid); + } alien->entry[alien->avail++] = objp; spin_unlock(&alien->lock); } else { @@ -3763,7 +3768,7 @@ static void print_slabinfo_header(struct seq_file *m) seq_puts(m, " : slabdata "); #if STATS seq_puts(m, " : globalstat " - " "); + " "); seq_puts(m, " : cpustat "); #endif seq_putc(m, '\n'); @@ -3877,11 +3882,12 @@ static int s_show(struct seq_file *m, void *p) unsigned long max_freeable = cachep->max_freeable; unsigned long node_allocs = cachep->node_allocs; unsigned long node_frees = cachep->node_frees; + unsigned long overflows = cachep->node_overflow; seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ - %4lu %4lu %4lu %4lu", allocs, high, grown, + %4lu %4lu %4lu %4lu %4lu", allocs, high, grown, reaped, errors, max_freeable, node_allocs, - node_frees); + node_frees, overflows); } /* cpu stats */ { -- cgit v1.2.3 From d6fef9da19b7acd46e04b7dbbba726b3febeca94 Mon Sep 17 00:00:00 2001 From: Luke Yang Date: Mon, 10 Apr 2006 22:52:56 -0700 Subject: [PATCH] nommu: use compound page in slab allocator The earlier patch to consolidate mmu and nommu page allocation and refcounting by using compound pages for nommu allocations had a bug: kmalloc slabs whose pages were initially allocated by a non-__GFP_COMP allocator could be passed into mm/nommu.c kmalloc allocations which really wanted __GFP_COMP underlying pages. Fix that by having nommu pass __GFP_COMP to all higher order slab allocations.
Signed-off-by: Luke Yang Acked-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 752c5570f2f..e6ef9bd5233 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1456,7 +1456,14 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) int i; flags |= cachep->gfpflags; +#ifndef CONFIG_MMU + /* nommu uses slab's for process anonymous memory allocations, so + * requires __GFP_COMP to properly refcount higher order allocations" + */ + page = alloc_pages_node(nodeid, (flags | __GFP_COMP), cachep->gfporder); +#else page = alloc_pages_node(nodeid, flags, cachep->gfporder); +#endif if (!page) return NULL; addr = page_address(page); -- cgit v1.2.3 From 1e624196f43c3a62122959e15c5f03572cdadb5d Mon Sep 17 00:00:00 2001 From: Ram Gupta Date: Mon, 10 Apr 2006 22:52:57 -0700 Subject: [PATCH] mm: fix bug in brk() The code compared the page-aligned newbrk and oldbrk before checking the new break against the data segment's memory limit (RLIMIT_DATA). If that limit is not page aligned, an allocation that stays within the current page could therefore bypass the limit check; move the rlimit check ahead of the page-alignment shortcut. Signed-off-by: Ram Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index e780d19aa21..eab6fcb65e1 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -220,6 +220,17 @@ asmlinkage unsigned long sys_brk(unsigned long brk) if (brk < mm->end_code) goto out; + + /* + * Check against rlimit here. If this check is done later after the test + * of oldbrk with newbrk then it can escape the test and let the data + * segment grow beyond its set limit the in case where the limit is + * not page aligned -Ram Gupta + */ + rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; + if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim) + goto out; + newbrk = PAGE_ALIGN(brk); oldbrk = PAGE_ALIGN(mm->brk); if (oldbrk == newbrk) @@ -232,11 +243,6 @@ asmlinkage unsigned long sys_brk(unsigned long brk) goto out; } - /* Check against rlimit.. */ - rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; - if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim) - goto out; - /* Check against existing mmap mappings. */ if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) goto out; -- cgit v1.2.3 From e23ca00bf1b1c6c0f04702cb4d29e275ab8dc330 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 10 Apr 2006 22:52:57 -0700 Subject: [PATCH] Some page migration fixups - Remove sparse comment - Remove duplicated include - Return the correct error condition in migrate_page_remove_references().
Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 09f6e4aa87f..20b95db63da 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -16,8 +16,7 @@ #include #include #include -#include /* for try_to_release_page(), buffer_heads_over_limit */ +#include #include #include #include @@ -28,8 +27,6 @@ #include "internal.h" -#include "internal.h" - /* The maximum number of pages to take off the LRU for migration */ #define MIGRATE_CHUNK_SIZE 256 @@ -234,7 +231,7 @@ int migrate_page_remove_references(struct page *newpage, if (!page_mapping(page) || page_count(page) != nr_refs || *radix_pointer != page) { write_unlock_irq(&mapping->tree_lock); - return 1; + return -EAGAIN; } /* -- cgit v1.2.3 From cb45b0e966cbe747b6189c15b108901cc7d6c97c Mon Sep 17 00:00:00 2001 From: Hideo AOKI Date: Mon, 10 Apr 2006 22:52:59 -0700 Subject: [PATCH] overcommit: add calculate_totalreserve_pages() These patches are an enhancement of the OVERCOMMIT_GUESS algorithm in __vm_enough_memory(). - why the kernel needed patching When the kernel can't allocate anonymous pages in practice, the current OVERCOMMIT_GUESS could return success. This behaviour can be the cause of oom kills in memory pressure situations. If Linux runs with page reservation features like /proc/sys/vm/lowmem_reserve_ratio and without a swap region, I think the oom kill occurs easily. - the overall design approach in the patch When the OVERCOMMIT_GUESS algorithm calculates the number of free pages, the reserved free pages are regarded as non-free pages. This change helps to avoid the pitfall that the number of free pages becomes less than the number which the kernel tries to keep free. - testing results I tested the patches using my test kernel module. If the patches aren't applied to the kernel, __vm_enough_memory() returns success in the situation but the actual page allocation fails. On the other hand, if the patches are applied to the kernel, memory allocation failure is avoided since __vm_enough_memory() returns failure in the situation. I checked that on an i386 SMP 16GB memory machine. I haven't tested on a nommu environment currently. This patch adds totalreserve_pages for __vm_enough_memory(). calculate_totalreserve_pages() checks the maximum lowmem_reserve pages and pages_high in each zone. Finally, the function stores the sum over all zones in totalreserve_pages. totalreserve_pages is calculated when the VM is initialized, and the variable is updated when /proc/sys/vm/lowmem_reserve_ratio or /proc/sys/vm/min_free_kbytes are changed. Signed-off-by: Hideo Aoki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b8165e037de..97d6827c7d6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -51,6 +51,7 @@ nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; EXPORT_SYMBOL(node_possible_map); unsigned long totalram_pages __read_mostly; unsigned long totalhigh_pages __read_mostly; +unsigned long totalreserve_pages __read_mostly; long nr_swap_pages; int percpu_pagelist_fraction; @@ -2476,6 +2477,38 @@ void __init page_alloc_init(void) hotcpu_notifier(page_alloc_cpu_notify, 0); } +/* + * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio + * or min_free_kbytes changes.
+ */ +static void calculate_totalreserve_pages(void) +{ + struct pglist_data *pgdat; + unsigned long reserve_pages = 0; + int i, j; + + for_each_online_pgdat(pgdat) { + for (i = 0; i < MAX_NR_ZONES; i++) { + struct zone *zone = pgdat->node_zones + i; + unsigned long max = 0; + + /* Find valid and maximum lowmem_reserve in the zone */ + for (j = i; j < MAX_NR_ZONES; j++) { + if (zone->lowmem_reserve[j] > max) + max = zone->lowmem_reserve[j]; + } + + /* we treat pages_high as reserved pages. */ + max += zone->pages_high; + + if (max > zone->present_pages) + max = zone->present_pages; + reserve_pages += max; + } + } + totalreserve_pages = reserve_pages; +} + /* * setup_per_zone_lowmem_reserve - called whenever * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone @@ -2507,6 +2540,9 @@ static void setup_per_zone_lowmem_reserve(void) } } } + + /* update totalreserve_pages */ + calculate_totalreserve_pages(); } /* @@ -2561,6 +2597,9 @@ void setup_per_zone_pages_min(void) zone->pages_high = zone->pages_min + tmp / 2; spin_unlock_irqrestore(&zone->lru_lock, flags); } + + /* update totalreserve_pages */ + calculate_totalreserve_pages(); } /* -- cgit v1.2.3 From 6d9f78396583244258080f3369889644c06c37c8 Mon Sep 17 00:00:00 2001 From: Hideo AOKI Date: Mon, 10 Apr 2006 22:53:00 -0700 Subject: [PATCH] overcommit: use totalreserve_pages This patch is an enhancement of OVERCOMMIT_GUESS algorithm in __vm_enough_memory() in mm/mmap.c. When the OVERCOMMIT_GUESS algorithm calculates the number of free pages, the algorithm subtracts the number of reserved pages from the result nr_free_pages(). Signed-off-by: Hideo Aoki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index eab6fcb65e1..e6ee12344b1 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -121,14 +121,26 @@ int __vm_enough_memory(long pages, int cap_sys_admin) * only call if we're about to fail. */ n = nr_free_pages(); + + /* + * Leave reserved pages. The pages are not for anonymous pages. + */ + if (n <= totalreserve_pages) + goto error; + else + n -= totalreserve_pages; + + /* + * Leave the last 3% for root + */ if (!cap_sys_admin) n -= n / 32; free += n; if (free > pages) return 0; - vm_unacct_memory(pages); - return -ENOMEM; + + goto error; } allowed = (totalram_pages - hugetlb_total_pages()) @@ -150,7 +162,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin) */ if (atomic_read(&vm_committed_space) < (long)allowed) return 0; - +error: vm_unacct_memory(pages); return -ENOMEM; -- cgit v1.2.3 From d5ddc79bcaab6975e7671805c3578407dc33b764 Mon Sep 17 00:00:00 2001 From: Hideo AOKI Date: Mon, 10 Apr 2006 22:53:01 -0700 Subject: [PATCH] overcommit: use totalreserve_pages for nommu This patch is an enhancement of OVERCOMMIT_GUESS algorithm in __vm_enough_memory() in mm/nommu.c. When the OVERCOMMIT_GUESS algorithm calculates the number of free pages, the algorithm subtracts the number of reserved pages from the result nr_free_pages(). Signed-off-by: Hideo Aoki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nommu.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/nommu.c b/mm/nommu.c index db45efac17c..029fadac0fb 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1147,14 +1147,26 @@ int __vm_enough_memory(long pages, int cap_sys_admin) * only call if we're about to fail. */ n = nr_free_pages(); + + /* + * Leave reserved pages. 
The pages are not for anonymous pages. + */ + if (n <= totalreserve_pages) + goto error; + else + n -= totalreserve_pages; + + /* + * Leave the last 3% for root + */ if (!cap_sys_admin) n -= n / 32; free += n; if (free > pages) return 0; - vm_unacct_memory(pages); - return -ENOMEM; + + goto error; } allowed = totalram_pages * sysctl_overcommit_ratio / 100; @@ -1175,7 +1187,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin) */ if (atomic_read(&vm_committed_space) < (long)allowed) return 0; - +error: vm_unacct_memory(pages); return -ENOMEM; -- cgit v1.2.3 From 64a3ca5f7ec2606b03be4a65736164a5373732ed Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 10 Apr 2006 22:53:03 -0700 Subject: [PATCH] mm/migrate.c: don't export a static function EXPORT_SYMBOL'ing of a static function is not a good idea. Signed-off-by: Adrian Bunk Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 20b95db63da..d444229f259 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -173,7 +173,6 @@ unlock_retry: retry: return -EAGAIN; } -EXPORT_SYMBOL(swap_page); /* * Remove references for a page and establish the new page with the correct -- cgit v1.2.3 From fd5403c79bc21819f6e0c40ba098cea8b6a418bd Mon Sep 17 00:00:00 2001 From: Coywolf Qi Hunt Date: Mon, 10 Apr 2006 22:54:35 -0700 Subject: [PATCH] page-writeback comment fixes Signed-off-by: Coywolf Qi Hunt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 6dcce3a4bbd..75d7f48b79b 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -72,13 +72,12 @@ int dirty_background_ratio = 10; int vm_dirty_ratio = 40; /* - * The interval between `kupdate'-style writebacks, in centiseconds - * (hundredths of a second) + * The interval between `kupdate'-style writebacks, in jiffies */ int dirty_writeback_interval = 5 * HZ; /* - * The longest number of centiseconds for which data is allowed to remain dirty + * The longest number of jiffies for which data is allowed to remain dirty */ int dirty_expire_interval = 30 * HZ; -- cgit v1.2.3 From 69cf0fac6052c5bd3fb3469a41d4216e926028f8 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 17 Apr 2006 22:46:32 +0100 Subject: [PATCH] Fix MADV_REMOVE protection checking madvise_remove needs to respect file and mmap protections. Signed-off-by: Hugh Dickins [ Will the real CVE-2006-1524 stand up, please.. ] Signed-off-by: Linus Torvalds --- mm/madvise.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/madvise.c b/mm/madvise.c index af3d573b014..4e196155a0c 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -168,6 +168,9 @@ static long madvise_remove(struct vm_area_struct *vma, return -EINVAL; } + if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE)) + return -EACCES; + mapping = vma->vm_file->f_mapping; offset = (loff_t)(start - vma->vm_start) -- cgit v1.2.3 From 75129e297e861e6c61038aa4cdbf604b022de4ff Mon Sep 17 00:00:00 2001 From: John Hawkes Date: Tue, 18 Apr 2006 22:20:33 -0700 Subject: [PATCH] mm/slob.c: for_each_possible_cpu(), not NR_CPUS Convert for-loops that explicitly reference "NR_CPUS" into the potentially more efficient for_each_possible_cpu() construct. 
Signed-off-by: John Hawkes Cc: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slob.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/slob.c b/mm/slob.c index 9bcc7e2cabf..a68255ba455 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -354,9 +354,7 @@ void *__alloc_percpu(size_t size) if (!pdata) return NULL; - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_possible(i)) - continue; + for_each_possible_cpu(i) { pdata->ptrs[i] = kmalloc(size, GFP_KERNEL); if (!pdata->ptrs[i]) goto unwind_oom; @@ -383,11 +381,9 @@ free_percpu(const void *objp) int i; struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp); - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_possible(i)) - continue; + for_each_possible_cpu(i) kfree(p->ptrs[i]); - } + kfree(p); } EXPORT_SYMBOL(free_percpu); -- cgit v1.2.3 From 97c2c9b84d0c1edf4926b13661d5af3f0edccbce Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 18 Apr 2006 22:20:38 -0700 Subject: [PATCH] oom-kill: mm locking fix Dave Peterson points out that badness() is playing with mm_structs without taking a reference on them. mmput() can sleep, so taking a reference here (inside tasklist_lock) is hard. Fix it up via task_lock() instead. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 78747afad6b..9a643c4bf77 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -46,15 +46,25 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) { unsigned long points, cpu_time, run_time, s; - struct list_head *tsk; + struct mm_struct *mm; + struct task_struct *child; - if (!p->mm) + task_lock(p); + mm = p->mm; + if (!mm) { + task_unlock(p); return 0; + } /* * The memory size of the process is the basis for the badness. */ - points = p->mm->total_vm; + points = mm->total_vm; + + /* + * After this unlock we can no longer dereference local variable `mm' + */ + task_unlock(p); /* * Processes which fork a lot of child processes are likely @@ -64,11 +74,11 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) * child is eating the vast majority of memory, adding only half * to the parents will make the child our kill candidate of choice. */ - list_for_each(tsk, &p->children) { - struct task_struct *chld; - chld = list_entry(tsk, struct task_struct, sibling); - if (chld->mm != p->mm && chld->mm) - points += chld->mm->total_vm/2 + 1; + list_for_each_entry(child, &p->children, sibling) { + task_lock(child); + if (child->mm != mm && child->mm) + points += child->mm->total_vm/2 + 1; + task_unlock(child); } /* -- cgit v1.2.3 From 013159227b840dfd441bd2e4c8b4d77ffb3cc42e Mon Sep 17 00:00:00 2001 From: Dave Peterson Date: Tue, 18 Apr 2006 22:20:44 -0700 Subject: [PATCH] mm: fix mm_struct reference counting bugs in mm/oom_kill.c Fix oom_kill_task() so it doesn't call mmput() (which may sleep) while holding tasklist_lock. Signed-off-by: David S. 
Peterson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 9a643c4bf77..042e6436c3e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -254,17 +254,24 @@ static void __oom_kill_task(task_t *p, const char *message) force_sig(SIGKILL, p); } -static struct mm_struct *oom_kill_task(task_t *p, const char *message) +static int oom_kill_task(task_t *p, const char *message) { - struct mm_struct *mm = get_task_mm(p); + struct mm_struct *mm; task_t * g, * q; - if (!mm) - return NULL; - if (mm == &init_mm) { - mmput(mm); - return NULL; - } + mm = p->mm; + + /* WARNING: mm may not be dereferenced since we did not obtain its + * value from get_task_mm(p). This is OK since all we need to do is + * compare mm to q->mm below. + * + * Furthermore, even if mm contains a non-NULL value, p->mm may + * change to NULL at any time since we do not hold task_lock(p). + * However, this is of no concern to us. + */ + + if (mm == NULL || mm == &init_mm) + return 1; __oom_kill_task(p, message); /* @@ -276,13 +283,12 @@ static struct mm_struct *oom_kill_task(task_t *p, const char *message) __oom_kill_task(q, message); while_each_thread(g, q); - return mm; + return 0; } -static struct mm_struct *oom_kill_process(struct task_struct *p, - unsigned long points, const char *message) +static int oom_kill_process(struct task_struct *p, unsigned long points, + const char *message) { - struct mm_struct *mm; struct task_struct *c; struct list_head *tsk; @@ -293,9 +299,8 @@ static struct mm_struct *oom_kill_process(struct task_struct *p, c = list_entry(tsk, struct task_struct, sibling); if (c->mm == p->mm) continue; - mm = oom_kill_task(c, message); - if (mm) - return mm; + if (!oom_kill_task(c, message)) + return 0; } return oom_kill_task(p, message); } @@ -310,7 +315,6 @@ static struct mm_struct *oom_kill_process(struct task_struct *p, */ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) { - struct mm_struct *mm = NULL; task_t *p; unsigned long points = 0; @@ -330,12 +334,12 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) */ switch (constrained_alloc(zonelist, gfp_mask)) { case CONSTRAINT_MEMORY_POLICY: - mm = oom_kill_process(current, points, + oom_kill_process(current, points, "No available memory (MPOL_BIND)"); break; case CONSTRAINT_CPUSET: - mm = oom_kill_process(current, points, + oom_kill_process(current, points, "No available memory in cpuset"); break; @@ -357,8 +361,7 @@ retry: panic("Out of memory and no killable processes...\n"); } - mm = oom_kill_process(p, points, "Out of memory"); - if (!mm) + if (oom_kill_process(p, points, "Out of memory")) goto retry; break; @@ -367,8 +370,6 @@ retry: out: read_unlock(&tasklist_lock); cpuset_unlock(); - if (mm) - mmput(mm); /* * Give "p" a good chance of killing itself before we -- cgit v1.2.3 From 6aa3001b239b387d98a7f945e4a51edeb59e4f2d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 18 Apr 2006 22:20:52 -0700 Subject: [PATCH] page_alloc.c: buddy handling cleanup Fix up some whitespace damage. 
Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 97d6827c7d6..123c6058674 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -232,11 +232,13 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) * zone->lock is already acquired when we use these. * So, we don't need atomic page->flags operations here. */ -static inline unsigned long page_order(struct page *page) { +static inline unsigned long page_order(struct page *page) +{ return page_private(page); } -static inline void set_page_order(struct page *page, int order) { +static inline void set_page_order(struct page *page, int order) +{ set_page_private(page, order); __SetPageBuddy(page); } @@ -299,9 +301,9 @@ static inline int page_is_buddy(struct page *page, int order) if (PageBuddy(page) && page_order(page) == order) { BUG_ON(page_count(page) != 0); - return 1; + return 1; } - return 0; + return 0; } /* -- cgit v1.2.3 From 6d472be37896b1c41b50f3da124f8b7718ba7797 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 20 Apr 2006 02:43:12 -0700 Subject: [PATCH] Remove cond_resched in gather_stats() gather_stats() is called with a spinlock held from check_pte_range. We cannot reschedule with a lock held. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index dec8249e972..8778f58880c 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1761,7 +1761,6 @@ static void gather_stats(struct page *page, void *private, int pte_dirty) md->mapcount_max = count; md->node[page_to_nid(page)]++; - cond_resched(); } #ifdef CONFIG_HUGETLB_PAGE -- cgit v1.2.3