From b5606c2d4447e80b1d72406af4e78af1eda611d4 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Wed, 13 Feb 2008 15:03:16 -0800
Subject: remove final fastcall users

fastcall always expands to empty, remove it.

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/filemap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')
diff --git a/mm/filemap.c b/mm/filemap.c
index b7b1be6dbd8..5c74b68935a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -604,7 +604,7 @@ void __lock_page(struct page *page)
 }
 EXPORT_SYMBOL(__lock_page);
 
-int fastcall __lock_page_killable(struct page *page)
+int __lock_page_killable(struct page *page)
 {
 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 
-- 
cgit v1.2.3


From 064d9efe947542097be669581f82d6b097e81d1a Mon Sep 17 00:00:00 2001
From: Nishanth Aravamudan <nacc@us.ibm.com>
Date: Wed, 13 Feb 2008 15:03:19 -0800
Subject: hugetlb: fix overcommit locking

proc_doulongvec_minmax() calls copy_to_user()/copy_from_user(), so we can't
hold hugetlb_lock over the call.  Use a dummy variable to store the sysctl
result, like in hugetlb_sysctl_handler(), then grab the lock to update
nr_overcommit_huge_pages.

Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
Reported-by: Miles Lane <miles.lane@gmail.com>
Cc: Adam Litke <agl@us.ibm.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/hugetlb.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d9a38031246..cb1b3a7ecdf 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -24,14 +24,15 @@
 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
 static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
 static unsigned long surplus_huge_pages;
+static unsigned long nr_overcommit_huge_pages;
 unsigned long max_huge_pages;
+unsigned long sysctl_overcommit_huge_pages;
 static struct list_head hugepage_freelists[MAX_NUMNODES];
 static unsigned int nr_huge_pages_node[MAX_NUMNODES];
 static unsigned int free_huge_pages_node[MAX_NUMNODES];
 static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
 static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
 unsigned long hugepages_treat_as_movable;
-unsigned long nr_overcommit_huge_pages;
 static int hugetlb_next_nid;
 
 /*
@@ -609,8 +610,9 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
 			struct file *file, void __user *buffer,
 			size_t *length, loff_t *ppos)
 {
-	spin_lock(&hugetlb_lock);
 	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
+	spin_lock(&hugetlb_lock);
+	nr_overcommit_huge_pages = sysctl_overcommit_huge_pages;
 	spin_unlock(&hugetlb_lock);
 	return 0;
 }
-- 
cgit v1.2.3


From e8bff74afbdb4ad72bf6135c84289c47cf557892 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 13 Feb 2008 20:21:06 +0100
Subject: x86: fix "BUG: sleeping function called from invalid context" in
 print_vma_addr()

Jiri Kosina reported the following deadlock scenario with
show_unhandled_signals enabled:

 [   68.379022] gnome-settings-[2941] trap int3 ip:3d2c840f34
 sp:7fff36f5d100 error:0<3>BUG: sleeping function called from invalid
 context at kernel/rwsem.c:21
 [   68.379039] in_atomic():1, irqs_disabled():0
 [   68.379044] no locks held by gnome-settings-/2941.
 [   68.379050] Pid: 2941, comm: gnome-settings- Not tainted 2.6.25-rc1 #30
 [   68.379054]
 [   68.379056] Call Trace:
 [   68.379061]  <#DB>  [<ffffffff81064883>] ? __debug_show_held_locks+0x13/0x30
 [   68.379109]  [<ffffffff81036765>] __might_sleep+0xe5/0x110
 [   68.379123]  [<ffffffff812f2240>] down_read+0x20/0x70
 [   68.379137]  [<ffffffff8109cdca>] print_vma_addr+0x3a/0x110
 [   68.379152]  [<ffffffff8100f435>] do_trap+0xf5/0x170
 [   68.379168]  [<ffffffff8100f52b>] do_int3+0x7b/0xe0
 [   68.379180]  [<ffffffff812f4a6f>] int3+0x9f/0xd0
 [   68.379203]  <<EOE>>
 [   68.379229]  in libglib-2.0.so.0.1505.0[3d2c800000+dc000]

and tracked it down to:

  commit 03252919b79891063cf99145612360efbdf9500b
  Author: Andi Kleen <ak@suse.de>
  Date:   Wed Jan 30 13:33:18 2008 +0100

      x86: print which shared library/executable faulted in segfault etc. messages

the problem is that we call down_read() from an atomic context.

Solve this by returning from print_vma_addr() if the preempt count is
elevated. Update preempt_conditional_sti / preempt_conditional_cli to
unconditionally lift the preempt count even on !CONFIG_PREEMPT.

Reported-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 mm/memory.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'mm')

diff --git a/mm/memory.c b/mm/memory.c
index 717aa0e3be2..55b97ef6de1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2711,6 +2711,13 @@ void print_vma_addr(char *prefix, unsigned long ip)
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
 
+	/*
+	 * Do not print if we are in atomic
+	 * contexts (in exception stacks, etc.):
+	 */
+	if (preempt_count())
+		return;
+
 	down_read(&mm->mmap_sem);
 	vma = find_vma(mm, ip);
 	if (vma && vma->vm_file) {
-- 
cgit v1.2.3


From e51bfd0ad10600a9fe4c8ede5ac2272e80075008 Mon Sep 17 00:00:00 2001
From: Marcin Slusarz <marcin.slusarz@gmail.com>
Date: Sun, 10 Feb 2008 11:21:54 +0100
Subject: slab: avoid double initialization & do initialization in 1 place

- alloc_slabmgmt: initialize all slab fields in 1 place
- slab->nodeid was initialized twice: in alloc_slabmgmt
  and immediately after it in cache_grow

Signed-off-by: Marcin Slusarz <marcin.slusarz@gmail.com>
CC: Christoph Lameter <clameter@sgi.com>
Reviewed-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
---
 mm/slab.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 40c00dacbe4..473e6c2eaef 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2630,6 +2630,7 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
 	slabp->colouroff = colour_off;
 	slabp->s_mem = objp + colour_off;
 	slabp->nodeid = nodeid;
+	slabp->free = 0;
 	return slabp;
 }
 
@@ -2683,7 +2684,6 @@ static void cache_init_objs(struct kmem_cache *cachep,
 		slab_bufctl(slabp)[i] = i + 1;
 	}
 	slab_bufctl(slabp)[i - 1] = BUFCTL_END;
-	slabp->free = 0;
 }
 
 static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
@@ -2816,7 +2816,6 @@ static int cache_grow(struct kmem_cache *cachep,
 	if (!slabp)
 		goto opps1;
 
-	slabp->nodeid = nodeid;
 	slab_map_pages(cachep, slabp, objp);
 
 	cache_init_objs(cachep, slabp);
-- 
cgit v1.2.3


From eada35efcb2773cf49aa26277e056122e1a3405c Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Mon, 11 Feb 2008 22:47:46 +0200
Subject: slub: kmalloc page allocator pass-through cleanup

This adds a proper function for kmalloc page allocator pass-through. While it
simplifies any code that does slab tracing code a lot, I think it's a
worthwhile cleanup in itself.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
---
 mm/slub.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index e2989ae243b..7870ef9d863 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2671,8 +2671,7 @@ void *__kmalloc(size_t size, gfp_t flags)
 	struct kmem_cache *s;
 
 	if (unlikely(size > PAGE_SIZE / 2))
-		return (void *)__get_free_pages(flags | __GFP_COMP,
-							get_order(size));
+		return kmalloc_large(size, flags);
 
 	s = get_slab(size, flags);
 
@@ -2689,8 +2688,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
 	struct kmem_cache *s;
 
 	if (unlikely(size > PAGE_SIZE / 2))
-		return (void *)__get_free_pages(flags | __GFP_COMP,
-							get_order(size));
+		return kmalloc_large(size, flags);
 
 	s = get_slab(size, flags);
 
@@ -3219,8 +3217,8 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller)
 	struct kmem_cache *s;
 
 	if (unlikely(size > PAGE_SIZE / 2))
-		return (void *)__get_free_pages(gfpflags | __GFP_COMP,
-							get_order(size));
+		return kmalloc_large(size, gfpflags);
+
 	s = get_slab(size, gfpflags);
 
 	if (unlikely(ZERO_OR_NULL_PTR(s)))
@@ -3235,8 +3233,8 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
 	struct kmem_cache *s;
 
 	if (unlikely(size > PAGE_SIZE / 2))
-		return (void *)__get_free_pages(gfpflags | __GFP_COMP,
-							get_order(size));
+		return kmalloc_large(size, gfpflags);
+
 	s = get_slab(size, gfpflags);
 
 	if (unlikely(ZERO_OR_NULL_PTR(s)))
-- 
cgit v1.2.3


From dada123d99c241d1a45798a7c77bcf99c4968704 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Wed, 13 Feb 2008 23:30:32 +0200
Subject: make slub.c:slab_address() static

slab_address() can become static.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
---
 mm/slub.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index 7870ef9d863..1af7f2f1942 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -308,7 +308,7 @@ static inline int is_end(void *addr)
 	return (unsigned long)addr & PAGE_MAPPING_ANON;
 }
 
-void *slab_address(struct page *page)
+static void *slab_address(struct page *page)
 {
 	return page->end - PAGE_MAPPING_ANON;
 }
-- 
cgit v1.2.3


From b7a49f0d4c34166ae84089d9f145cfaae1b0eec5 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Thu, 14 Feb 2008 14:21:32 -0800
Subject: slub: Determine gfpflags once and not every time a slab is allocated

Currently we determine the gfp flags to pass to the page allocator
each time a slab is being allocated.

Determine the bits to be set at the time the slab is created. Store
in a new allocflags field and add the flags in allocate_slab().

Acked-by: Mel Gorman <mel@csn.ul.ie>
Reviewed-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
---
 mm/slub.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index 1af7f2f1942..ccfd41141b6 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1078,14 +1078,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
 	struct page *page;
 	int pages = 1 << s->order;
 
-	if (s->order)
-		flags |= __GFP_COMP;
-
-	if (s->flags & SLAB_CACHE_DMA)
-		flags |= SLUB_DMA;
-
-	if (s->flags & SLAB_RECLAIM_ACCOUNT)
-		flags |= __GFP_RECLAIMABLE;
+	flags |= s->allocflags;
 
 	if (node == -1)
 		page = alloc_pages(flags, s->order);
@@ -2333,6 +2326,16 @@ static int calculate_sizes(struct kmem_cache *s)
 	if (s->order < 0)
 		return 0;
 
+	s->allocflags = 0;
+	if (s->order)
+		s->allocflags |= __GFP_COMP;
+
+	if (s->flags & SLAB_CACHE_DMA)
+		s->allocflags |= SLUB_DMA;
+
+	if (s->flags & SLAB_RECLAIM_ACCOUNT)
+		s->allocflags |= __GFP_RECLAIMABLE;
+
 	/*
 	 * Determine the number of objects per slab
 	 */
-- 
cgit v1.2.3


From 71c7a06ff0a2ba0434ace4d7aa679537c4211d9d Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Thu, 14 Feb 2008 14:28:01 -0800
Subject: slub: Fallback to kmalloc_large for failing higher order allocs

Slub already has two ways of allocating an object. One is via its own
logic and the other is via the call to kmalloc_large to hand off object
allocation to the page allocator. kmalloc_large is typically used
for objects >= PAGE_SIZE.

We can use that handoff to avoid failing if a higher order kmalloc slab
allocation cannot be satisfied by the page allocator. If we reach the
out of memory path then simply try a kmalloc_large(). kfree() can
already handle the case of an object that was allocated via the page
allocator and so this will work just fine (apart from object
accounting...).

For any kmalloc slab that already requires higher order allocs (which
makes it impossible to use the page allocator fastpath!)
we just use PAGE_ALLOC_COSTLY_ORDER to get the largest number of
objects in one go from the page allocator slowpath.

On a 4k platform this patch will lead to the following use of higher
order pages for the following kmalloc slabs:

8 ... 1024	order 0
2048 .. 4096	order 3 (4k slab only after the next patch)

We may waste some space if fallback occurs on a 2k slab but we
are always able to fallback to an order 0 alloc.

Reviewed-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
---
 mm/slub.c | 43 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 38 insertions(+), 5 deletions(-)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index ccfd41141b6..644fd0aaeaf 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -211,6 +211,8 @@ static inline void ClearSlabDebug(struct page *page)
 /* Internal SLUB flags */
 #define __OBJECT_POISON		0x80000000 /* Poison object */
 #define __SYSFS_ADD_DEFERRED	0x40000000 /* Not yet visible via sysfs */
+#define __KMALLOC_CACHE		0x20000000 /* objects freed using kfree */
+#define __PAGE_ALLOC_FALLBACK	0x10000000 /* Allow fallback to page alloc */
 
 /* Not all arches define cache_line_size */
 #ifndef cache_line_size
@@ -1539,7 +1541,6 @@ load_freelist:
 unlock_out:
 	slab_unlock(c->page);
 	stat(c, ALLOC_SLOWPATH);
-out:
 #ifdef SLUB_FASTPATH
 	local_irq_restore(flags);
 #endif
@@ -1574,8 +1575,24 @@ new_slab:
 		c->page = new;
 		goto load_freelist;
 	}
-	object = NULL;
-	goto out;
+#ifdef SLUB_FASTPATH
+	local_irq_restore(flags);
+#endif
+	/*
+	 * No memory available.
+	 *
+	 * If the slab uses higher order allocs but the object is
+	 * smaller than a page size then we can fallback in emergencies
+	 * to the page allocator via kmalloc_large. The page allocator may
+	 * have failed to obtain a higher order page and we can try to
+	 * allocate a single page if the object fits into a single page.
+	 * That is only possible if certain conditions are met that are being
+	 * checked when a slab is created.
+	 */
+	if (!(gfpflags & __GFP_NORETRY) && (s->flags & __PAGE_ALLOC_FALLBACK))
+		return kmalloc_large(s->objsize, gfpflags);
+
+	return NULL;
 debug:
 	object = c->page->freelist;
 	if (!alloc_debug_processing(s, c->page, object, addr))
@@ -2322,7 +2339,20 @@ static int calculate_sizes(struct kmem_cache *s)
 	size = ALIGN(size, align);
 	s->size = size;
 
-	s->order = calculate_order(size);
+	if ((flags & __KMALLOC_CACHE) &&
+			PAGE_SIZE / size < slub_min_objects) {
+		/*
+		 * Kmalloc cache that would not have enough objects in
+		 * an order 0 page. Kmalloc slabs can fallback to
+		 * page allocator order 0 allocs so take a reasonably large
+		 * order that will allows us a good number of objects.
+		 */
+		s->order = max(slub_max_order, PAGE_ALLOC_COSTLY_ORDER);
+		s->flags |= __PAGE_ALLOC_FALLBACK;
+		s->allocflags |= __GFP_NOWARN;
+	} else
+		s->order = calculate_order(size);
+
 	if (s->order < 0)
 		return 0;
 
@@ -2539,7 +2569,7 @@ static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s,
 
 	down_write(&slub_lock);
 	if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN,
-			flags, NULL))
+			flags | __KMALLOC_CACHE, NULL))
 		goto panic;
 
 	list_add(&s->list, &slab_caches);
@@ -3058,6 +3088,9 @@ static int slab_unmergeable(struct kmem_cache *s)
 	if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE))
 		return 1;
 
+	if ((s->flags & __PAGE_ALLOC_FALLBACK)
+		return 1;
+
 	if (s->ctor)
 		return 1;
 
-- 
cgit v1.2.3


From 331dc558fa020451ff773973cee855fd721aa88e Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Thu, 14 Feb 2008 14:28:09 -0800
Subject: slub: Support 4k kmallocs again to compensate for page allocator
 slowness

Currently we hand off PAGE_SIZEd kmallocs to the page allocator in the
mistaken belief that the page allocator can handle these allocations
effectively. However, measurements indicate a minimum slowdown by the
factor of 8 (and that is only SMP, NUMA is much worse) vs the slub fastpath
which causes regressions in tbench.

Increase the number of kmalloc caches by one so that we again handle 4k
kmallocs directly from slub. 4k page buffering for the page allocator
will be performed by slub like done by slab.

At some point the page allocator fastpath should be fixed. A lot of the kernel
would benefit from a faster ability to allocate a single page. If that is
done then the 4k allocs may again be forwarded to the page allocator and this
patch could be reverted.

Reviewed-by: Pekka Enberg <penberg@cs.helsinki.fi>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
---
 mm/slub.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index 644fd0aaeaf..4b3895cb90e 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2517,11 +2517,11 @@ EXPORT_SYMBOL(kmem_cache_destroy);
  *		Kmalloc subsystem
  *******************************************************************/
 
-struct kmem_cache kmalloc_caches[PAGE_SHIFT] __cacheline_aligned;
+struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1] __cacheline_aligned;
 EXPORT_SYMBOL(kmalloc_caches);
 
 #ifdef CONFIG_ZONE_DMA
-static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT];
+static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT + 1];
 #endif
 
 static int __init setup_slub_min_order(char *str)
@@ -2703,7 +2703,7 @@ void *__kmalloc(size_t size, gfp_t flags)
 {
 	struct kmem_cache *s;
 
-	if (unlikely(size > PAGE_SIZE / 2))
+	if (unlikely(size > PAGE_SIZE))
 		return kmalloc_large(size, flags);
 
 	s = get_slab(size, flags);
@@ -2720,7 +2720,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
 {
 	struct kmem_cache *s;
 
-	if (unlikely(size > PAGE_SIZE / 2))
+	if (unlikely(size > PAGE_SIZE))
 		return kmalloc_large(size, flags);
 
 	s = get_slab(size, flags);
@@ -3032,7 +3032,7 @@ void __init kmem_cache_init(void)
 		caches++;
 	}
 
-	for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++) {
+	for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) {
 		create_kmalloc_cache(&kmalloc_caches[i],
 			"kmalloc", 1 << i, GFP_KERNEL);
 		caches++;
@@ -3059,7 +3059,7 @@ void __init kmem_cache_init(void)
 	slab_state = UP;
 
 	/* Provide the correct kmalloc names now that the caches are up */
-	for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++)
+	for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++)
 		kmalloc_caches[i]. name =
 			kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
 
@@ -3088,7 +3088,7 @@ static int slab_unmergeable(struct kmem_cache *s)
 	if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE))
 		return 1;
 
-	if ((s->flags & __PAGE_ALLOC_FALLBACK)
+	if ((s->flags & __PAGE_ALLOC_FALLBACK))
 		return 1;
 
 	if (s->ctor)
@@ -3252,7 +3252,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller)
 {
 	struct kmem_cache *s;
 
-	if (unlikely(size > PAGE_SIZE / 2))
+	if (unlikely(size > PAGE_SIZE))
 		return kmalloc_large(size, gfpflags);
 
 	s = get_slab(size, gfpflags);
@@ -3268,7 +3268,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
 {
 	struct kmem_cache *s;
 
-	if (unlikely(size > PAGE_SIZE / 2))
+	if (unlikely(size > PAGE_SIZE))
 		return kmalloc_large(size, gfpflags);
 
 	s = get_slab(size, gfpflags);
-- 
cgit v1.2.3


From c32c2f63a9d6c953aaf168c0b2551da9734f76d2 Mon Sep 17 00:00:00 2001
From: Jan Blunck <jblunck@suse.de>
Date: Thu, 14 Feb 2008 19:38:43 -0800
Subject: d_path: Make seq_path() use a struct path argument

seq_path() is always called with a dentry and a vfsmount from a struct path.
Make seq_path() take it directly as an argument.

Signed-off-by: Jan Blunck <jblunck@suse.de>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Cc: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mempolicy.c | 2 +-
 mm/swapfile.c  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 8d246c3b340..6c7ba1a63d2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1996,7 +1996,7 @@ int show_numa_map(struct seq_file *m, void *v)
 
 	if (file) {
 		seq_printf(m, " file=");
-		seq_path(m, file->f_path.mnt, file->f_path.dentry, "\n\t= ");
+		seq_path(m, &file->f_path, "\n\t= ");
 	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
 		seq_printf(m, " heap");
 	} else if (vma->vm_start <= mm->start_stack &&
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 02ccab5ad9d..2da149cfc9a 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1394,7 +1394,7 @@ static int swap_show(struct seq_file *swap, void *v)
 	}
 
 	file = ptr->swap_file;
-	len = seq_path(swap, file->f_path.mnt, file->f_path.dentry, " \t\n\\");
+	len = seq_path(swap, &file->f_path, " \t\n\\");
 	seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
 		       len < 40 ? 40 - len : 1, " ",
 		       S_ISBLK(file->f_path.dentry->d_inode->i_mode) ?
-- 
cgit v1.2.3


From cf28b4863f9ee8f122e8ff3ac0d403e07ba9c6d9 Mon Sep 17 00:00:00 2001
From: Jan Blunck <jblunck@suse.de>
Date: Thu, 14 Feb 2008 19:38:44 -0800
Subject: d_path: Make d_path() use a struct path

d_path() is used on a <dentry,vfsmount> pair.  Lets use a struct path to
reflect this.

[akpm@linux-foundation.org: fix build in mm/memory.c]
Signed-off-by: Jan Blunck <jblunck@suse.de>
Acked-by: Bryan Wu <bryan.wu@analog.com>
Acked-by: Christoph Hellwig <hch@infradead.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Cc: Neil Brown <neilb@suse.de>
Cc: Michael Halcrow <mhalcrow@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/memory.c b/mm/memory.c
index 717aa0e3be2..e7a6dcacefc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2719,7 +2719,7 @@ void print_vma_addr(char *prefix, unsigned long ip)
 		if (buf) {
 			char *p, *s;
 
-			p = d_path(f->f_dentry, f->f_vfsmnt, buf, PAGE_SIZE);
+			p = d_path(&f->f_path, buf, PAGE_SIZE);
 			if (IS_ERR(p))
 				p = "?";
 			s = strrchr(p, '/');
-- 
cgit v1.2.3


From 00e962c5408b9f2d0bebd2308673fe982cb9a5fe Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@woody.linux-foundation.org>
Date: Tue, 19 Feb 2008 09:08:49 -0800
Subject: Revert "SLUB: Alternate fast paths using cmpxchg_local"

This reverts commit 1f84260c8ce3b1ce26d4c1d6dedc2f33a3a29c0c, which is
suspected to be the reason for some very occasional and hard-to-trigger
crashes that usually look related to memory allocation (mostly reported
in networking, but since that's generally the most common source of
shortlived allocations - and allocations in interrupt contexts - that in
itself is not a big clue).

See for example
	http://bugzilla.kernel.org/show_bug.cgi?id=9973
	http://lkml.org/lkml/2008/2/19/278
etc.

One promising suspicion for what the root cause of bug is (which also
explains why it's so hard to trigger in practice) came from Eric
Dumazet:

   "I wonder how SLUB_FASTPATH is supposed to work, since it is affected
    by a classical ABA problem of lockless algo.

    cmpxchg_local(&c->freelist, object, object[c->offset]) can succeed,
    while an interrupt came (on this cpu), and several allocations were
    done, and one free was performed at the end of this interruption, so
    'object' was recycled.

    c->freelist can then contain the previous value (object), but
    object[c->offset] was changed by IRQ.

    We then put back in freelist an already allocated object."

but another reason for the revert is simply that everybody agrees that
this code was the main suspect just by virtue of the pattern of oopses.

Cc: Torsten Kaiser <just.for.lkml@googlemail.com>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/slub.c | 87 +--------------------------------------------------------------
 1 file changed, 1 insertion(+), 86 deletions(-)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index 4b3895cb90e..74c65af0a54 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -149,13 +149,6 @@ static inline void ClearSlabDebug(struct page *page)
 /* Enable to test recovery from slab corruption on boot */
 #undef SLUB_RESILIENCY_TEST
 
-/*
- * Currently fastpath is not supported if preemption is enabled.
- */
-#if defined(CONFIG_FAST_CMPXCHG_LOCAL) && !defined(CONFIG_PREEMPT)
-#define SLUB_FASTPATH
-#endif
-
 #if PAGE_SHIFT <= 12
 
 /*
@@ -1514,11 +1507,7 @@ static void *__slab_alloc(struct kmem_cache *s,
 {
 	void **object;
 	struct page *new;
-#ifdef SLUB_FASTPATH
-	unsigned long flags;
 
-	local_irq_save(flags);
-#endif
 	if (!c->page)
 		goto new_slab;
 
@@ -1541,9 +1530,6 @@ load_freelist:
 unlock_out:
 	slab_unlock(c->page);
 	stat(c, ALLOC_SLOWPATH);
-#ifdef SLUB_FASTPATH
-	local_irq_restore(flags);
-#endif
 	return object;
 
 another_slab:
@@ -1575,9 +1561,7 @@ new_slab:
 		c->page = new;
 		goto load_freelist;
 	}
-#ifdef SLUB_FASTPATH
-	local_irq_restore(flags);
-#endif
+
 	/*
 	 * No memory available.
 	 *
@@ -1619,34 +1603,6 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
 {
 	void **object;
 	struct kmem_cache_cpu *c;
-
-/*
- * The SLUB_FASTPATH path is provisional and is currently disabled if the
- * kernel is compiled with preemption or if the arch does not support
- * fast cmpxchg operations. There are a couple of coming changes that will
- * simplify matters and allow preemption. Ultimately we may end up making
- * SLUB_FASTPATH the default.
- *
- * 1. The introduction of the per cpu allocator will avoid array lookups
- *    through get_cpu_slab(). A special register can be used instead.
- *
- * 2. The introduction of per cpu atomic operations (cpu_ops) means that
- *    we can realize the logic here entirely with per cpu atomics. The
- *    per cpu atomic ops will take care of the preemption issues.
- */
-
-#ifdef SLUB_FASTPATH
-	c = get_cpu_slab(s, raw_smp_processor_id());
-	do {
-		object = c->freelist;
-		if (unlikely(is_end(object) || !node_match(c, node))) {
-			object = __slab_alloc(s, gfpflags, node, addr, c);
-			break;
-		}
-		stat(c, ALLOC_FASTPATH);
-	} while (cmpxchg_local(&c->freelist, object, object[c->offset])
-								!= object);
-#else
 	unsigned long flags;
 
 	local_irq_save(flags);
@@ -1661,7 +1617,6 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
 		stat(c, ALLOC_FASTPATH);
 	}
 	local_irq_restore(flags);
-#endif
 
 	if (unlikely((gfpflags & __GFP_ZERO) && object))
 		memset(object, 0, c->objsize);
@@ -1698,11 +1653,6 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
 	void **object = (void *)x;
 	struct kmem_cache_cpu *c;
 
-#ifdef SLUB_FASTPATH
-	unsigned long flags;
-
-	local_irq_save(flags);
-#endif
 	c = get_cpu_slab(s, raw_smp_processor_id());
 	stat(c, FREE_SLOWPATH);
 	slab_lock(page);
@@ -1734,9 +1684,6 @@ checks_ok:
 
 out_unlock:
 	slab_unlock(page);
-#ifdef SLUB_FASTPATH
-	local_irq_restore(flags);
-#endif
 	return;
 
 slab_empty:
@@ -1749,9 +1696,6 @@ slab_empty:
 	}
 	slab_unlock(page);
 	stat(c, FREE_SLAB);
-#ifdef SLUB_FASTPATH
-	local_irq_restore(flags);
-#endif
 	discard_slab(s, page);
 	return;
 
@@ -1777,34 +1721,6 @@ static __always_inline void slab_free(struct kmem_cache *s,
 {
 	void **object = (void *)x;
 	struct kmem_cache_cpu *c;
-
-#ifdef SLUB_FASTPATH
-	void **freelist;
-
-	c = get_cpu_slab(s, raw_smp_processor_id());
-	debug_check_no_locks_freed(object, s->objsize);
-	do {
-		freelist = c->freelist;
-		barrier();
-		/*
-		 * If the compiler would reorder the retrieval of c->page to
-		 * come before c->freelist then an interrupt could
-		 * change the cpu slab before we retrieve c->freelist. We
-		 * could be matching on a page no longer active and put the
-		 * object onto the freelist of the wrong slab.
-		 *
-		 * On the other hand: If we already have the freelist pointer
-		 * then any change of cpu_slab will cause the cmpxchg to fail
-		 * since the freelist pointers are unique per slab.
-		 */
-		if (unlikely(page != c->page || c->node < 0)) {
-			__slab_free(s, page, x, addr, c->offset);
-			break;
-		}
-		object[c->offset] = freelist;
-		stat(c, FREE_FASTPATH);
-	} while (cmpxchg_local(&c->freelist, freelist, object) != freelist);
-#else
 	unsigned long flags;
 
 	local_irq_save(flags);
@@ -1818,7 +1734,6 @@ static __always_inline void slab_free(struct kmem_cache *s,
 		__slab_free(s, page, x, addr, c->offset);
 
 	local_irq_restore(flags);
-#endif
 }
 
 void kmem_cache_free(struct kmem_cache *s, void *x)
-- 
cgit v1.2.3


From e5df70ab194543522397fa3da8c8f80564a0f7d3 Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@shadowen.org>
Date: Sat, 23 Feb 2008 15:23:32 -0800
Subject: hugetlb: ensure we do not reference a surplus page after handing it
 to buddy

When we free a page via free_huge_page and we detect that we are in surplus
the page will be returned to the buddy.  After this we no longer own the page.

However at the end free_huge_page we clear out our mapping pointer from
page private.  Even where the page is not a surplus we free the page to
the hugepage pool, drop the pool locks and then clear page private.  In
either case the page may have been reallocated.  BAD.

Make sure we clear out page private before we free the page.

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Acked-by: Adam Litke <agl@us.ibm.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/hugetlb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index cb1b3a7ecdf..89e6286a7f5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -120,6 +120,7 @@ static void free_huge_page(struct page *page)
 	struct address_space *mapping;
 
 	mapping = (struct address_space *) page_private(page);
+	set_page_private(page, 0);
 	BUG_ON(page_count(page));
 	INIT_LIST_HEAD(&page->lru);
 
@@ -134,7 +135,6 @@ static void free_huge_page(struct page *page)
 	spin_unlock(&hugetlb_lock);
 	if (mapping)
 		hugetlb_put_quota(mapping, 1);
-	set_page_private(page, 0);
 }
 
 /*
-- 
cgit v1.2.3


From b5a0e011329431b90d315eaf6ca5fdb41df7a117 Mon Sep 17 00:00:00 2001
From: Alexander van Heukelum <heukelum@mailshack.com>
Date: Sat, 23 Feb 2008 15:24:06 -0800
Subject: Solve section mismatch for free_area_init_core.

WARNING: vmlinux.o(.meminit.text+0x649):
Section mismatch in reference from the
function free_area_init_core() to the function .init.text:setup_usemap()
The function __meminit free_area_init_core() references
a function __init setup_usemap().
If free_area_init_core is only used by setup_usemap then
annotate free_area_init_core with a matching annotation.

The warning is covers this stack of functions in mm/page_alloc.c:

alloc_bootmem_node must be marked __init.
alloc_bootmem_node is used by setup_usemap, if !SPARSEMEM.
(usemap_size is only used by setup_usemap, if !SPARSEMEM.)
setup_usemap is only used by free_area_init_core.
free_area_init_core is only used by free_area_init_node.

free_area_init_node is used by:
arch/alpha/mm/numa.c: __init paging_init()
arch/arm/mm/init.c: __init bootmem_init_node()
arch/avr32/mm/init.c: __init paging_init()
arch/cris/arch-v10/mm/init.c: __init paging_init()
arch/cris/arch-v32/mm/init.c: __init paging_init()
arch/m32r/mm/discontig.c: __init zone_sizes_init()
arch/m32r/mm/init.c: __init zone_sizes_init()
arch/m68k/mm/motorola.c: __init paging_init()
arch/m68k/mm/sun3mmu.c: __init paging_init()
arch/mips/sgi-ip27/ip27-memory.c: __init paging_init()
arch/parisc/mm/init.c: __init paging_init()
arch/sparc/mm/srmmu.c: __init srmmu_paging_init()
arch/sparc/mm/sun4c.c: __init sun4c_paging_init()
arch/sparc64/mm/init.c: __init paging_init()
mm/page_alloc.c: __init free_area_init_nodes()
mm/page_alloc.c: __init free_area_init()
and
mm/memory_hotplug.c: hotadd_new_pgdat()

hotadd_new_pgdat can not be an __init function, but:

It is compiled for MEMORY_HOTPLUG configurations only
MEMORY_HOTPLUG depends on SPARSEMEM || X86_64_ACPI_NUMA
X86_64_ACPI_NUMA depends on X86_64
ARCH_FLATMEM_ENABLE depends on X86_32
ARCH_DISCONTIGMEM_ENABLE depends on X86_32
So X86_64_ACPI_NUMA implies SPARSEMEM, right?

So we can mark the stack of functions __init for !SPARSEMEM, but we must mark
them __meminit for SPARSEMEM configurations.  This is ok, because then the
calls to alloc_bootmem_node are also avoided.

Compile-tested on:
silly minimal config
defconfig x86_32
defconfig x86_64
defconfig x86_64 -HIBERNATION +MEMORY_HOTPLUG

Signed-off-by: Alexander van Heukelum <heukelum@fastmail.fm>
Reviewed-by: Sam Ravnborg <sam@ravnborg.org>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/internal.h   | 13 +++++++++++++
 mm/page_alloc.c |  4 ++--
 2 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/internal.h b/mm/internal.h
index 5a9a6200e03..789727309f4 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -47,4 +47,17 @@ static inline unsigned long page_order(struct page *page)
 	VM_BUG_ON(!PageBuddy(page));
 	return page_private(page);
 }
+
+/*
+ * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node,
+ * so all functions starting at paging_init should be marked __init
+ * in those cases. SPARSEMEM, however, allows for memory hotplug,
+ * and alloc_bootmem_node is not used.
+ */
+#ifdef CONFIG_SPARSEMEM
+#define __paginginit __meminit
+#else
+#define __paginginit __init
+#endif
+
 #endif
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 75b97931334..8896e874a67 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3314,7 +3314,7 @@ static inline int pageblock_default_order(unsigned int order)
  *   - mark all memory queues empty
  *   - clear the memory bitmaps
  */
-static void __meminit free_area_init_core(struct pglist_data *pgdat,
+static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 		unsigned long *zones_size, unsigned long *zholes_size)
 {
 	enum zone_type j;
@@ -3438,7 +3438,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
 }
 
-void __meminit free_area_init_node(int nid, struct pglist_data *pgdat,
+void __paginginit free_area_init_node(int nid, struct pglist_data *pgdat,
 		unsigned long *zones_size, unsigned long node_start_pfn,
 		unsigned long *zholes_size)
 {
-- 
cgit v1.2.3


From 7fde4c3eb7ee68828d76a2148ed6d70b6a794add Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Sat, 23 Feb 2008 15:24:13 -0800
Subject: memcgroup: remove a useless VM_BUG_ON()

Remove this VM_BUG_ON(), as Balbir stated:

We used to have a for loop with !list_empty() as a termination condition
and VM_BUG_ON(!pc) is a spill over.  With the new loop, VM_BUG_ON(!pc) does
not make sense.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Balbir Singh <balbir@in.ibm.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memcontrol.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'mm')

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6bded84c20c..bb4d7acfbca 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -534,7 +534,6 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 		if (scan >= nr_to_scan)
 			break;
 		page = pc->page;
-		VM_BUG_ON(!pc);
 
 		if (unlikely(!PageLRU(page)))
 			continue;
-- 
cgit v1.2.3


From 2dda81ca31dc73e695ff8b83351f7aaefbef192a Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Sat, 23 Feb 2008 15:24:14 -0800
Subject: memcgroup: return negative error code in mem_cgroup_create()

Cgroup requires the subsystem to return negative error code on error in the
create method.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memcontrol.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bb4d7acfbca..631002d085d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1100,7 +1100,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 		mem = kzalloc(sizeof(struct mem_cgroup), GFP_KERNEL);
 
 	if (mem == NULL)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 
 	res_counter_init(&mem->res);
 
@@ -1116,7 +1116,7 @@ free_out:
 		free_mem_cgroup_per_zone_info(mem, node);
 	if (cont->parent != NULL)
 		kfree(mem);
-	return NULL;
+	return ERR_PTR(-ENOMEM);
 }
 
 static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
-- 
cgit v1.2.3