From b5606c2d4447e80b1d72406af4e78af1eda611d4 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Wed, 13 Feb 2008 15:03:16 -0800 Subject: remove final fastcall users fastcall always expands to empty, remove it. Signed-off-by: Harvey Harrison Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index b7b1be6dbd8..5c74b68935a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -604,7 +604,7 @@ void __lock_page(struct page *page) } EXPORT_SYMBOL(__lock_page); -int fastcall __lock_page_killable(struct page *page) +int __lock_page_killable(struct page *page) { DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); -- cgit v1.2.3 From 064d9efe947542097be669581f82d6b097e81d1a Mon Sep 17 00:00:00 2001 From: Nishanth Aravamudan Date: Wed, 13 Feb 2008 15:03:19 -0800 Subject: hugetlb: fix overcommit locking proc_doulongvec_minmax() calls copy_to_user()/copy_from_user(), so we can't hold hugetlb_lock over the call. Use a dummy variable to store the sysctl result, like in hugetlb_sysctl_handler(), then grab the lock to update nr_overcommit_huge_pages. Signed-off-by: Nishanth Aravamudan Reported-by: Miles Lane Cc: Adam Litke Cc: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d9a38031246..cb1b3a7ecdf 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -24,14 +24,15 @@ const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages; static unsigned long surplus_huge_pages; +static unsigned long nr_overcommit_huge_pages; unsigned long max_huge_pages; +unsigned long sysctl_overcommit_huge_pages; static struct list_head hugepage_freelists[MAX_NUMNODES]; static unsigned int nr_huge_pages_node[MAX_NUMNODES]; static unsigned int free_huge_pages_node[MAX_NUMNODES]; static unsigned int surplus_huge_pages_node[MAX_NUMNODES]; static gfp_t htlb_alloc_mask = GFP_HIGHUSER; unsigned long hugepages_treat_as_movable; -unsigned long nr_overcommit_huge_pages; static int hugetlb_next_nid; /* @@ -609,8 +610,9 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { - spin_lock(&hugetlb_lock); proc_doulongvec_minmax(table, write, file, buffer, length, ppos); + spin_lock(&hugetlb_lock); + nr_overcommit_huge_pages = sysctl_overcommit_huge_pages; spin_unlock(&hugetlb_lock); return 0; } -- cgit v1.2.3 From e8bff74afbdb4ad72bf6135c84289c47cf557892 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 13 Feb 2008 20:21:06 +0100 Subject: x86: fix "BUG: sleeping function called from invalid context" in print_vma_addr() Jiri Kosina reported the following deadlock scenario with show_unhandled_signals enabled: [ 68.379022] gnome-settings-[2941] trap int3 ip:3d2c840f34 sp:7fff36f5d100 error:0<3>BUG: sleeping function called from invalid context at kernel/rwsem.c:21 [ 68.379039] in_atomic():1, irqs_disabled():0 [ 68.379044] no locks held by gnome-settings-/2941. [ 68.379050] Pid: 2941, comm: gnome-settings- Not tainted 2.6.25-rc1 #30 [ 68.379054] [ 68.379056] Call Trace: [ 68.379061] <#DB> [] ? __debug_show_held_locks+0x13/0x30 [ 68.379109] [] __might_sleep+0xe5/0x110 [ 68.379123] [] down_read+0x20/0x70 [ 68.379137] [] print_vma_addr+0x3a/0x110 [ 68.379152] [] do_trap+0xf5/0x170 [ 68.379168] [] do_int3+0x7b/0xe0 [ 68.379180] [] int3+0x9f/0xd0 [ 68.379203] <> [ 68.379229] in libglib-2.0.so.0.1505.0[3d2c800000+dc000] and tracked it down to: commit 03252919b79891063cf99145612360efbdf9500b Author: Andi Kleen Date: Wed Jan 30 13:33:18 2008 +0100 x86: print which shared library/executable faulted in segfault etc. messages the problem is that we call down_read() from an atomic context. Solve this by returning from print_vma_addr() if the preempt count is elevated. Update preempt_conditional_sti / preempt_conditional_cli to unconditionally lift the preempt count even on !CONFIG_PREEMPT. Reported-by: Jiri Kosina Signed-off-by: Ingo Molnar --- mm/memory.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 717aa0e3be2..55b97ef6de1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2711,6 +2711,13 @@ void print_vma_addr(char *prefix, unsigned long ip) struct mm_struct *mm = current->mm; struct vm_area_struct *vma; + /* + * Do not print if we are in atomic + * contexts (in exception stacks, etc.): + */ + if (preempt_count()) + return; + down_read(&mm->mmap_sem); vma = find_vma(mm, ip); if (vma && vma->vm_file) { -- cgit v1.2.3 From e51bfd0ad10600a9fe4c8ede5ac2272e80075008 Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Sun, 10 Feb 2008 11:21:54 +0100 Subject: slab: avoid double initialization & do initialization in 1 place - alloc_slabmgmt: initialize all slab fields in 1 place - slab->nodeid was initialized twice: in alloc_slabmgmt and immediately after it in cache_grow Signed-off-by: Marcin Slusarz CC: Christoph Lameter Reviewed-by: Pekka Enberg Signed-off-by: Christoph Lameter --- mm/slab.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 40c00dacbe4..473e6c2eaef 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2630,6 +2630,7 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, slabp->colouroff = colour_off; slabp->s_mem = objp + colour_off; slabp->nodeid = nodeid; + slabp->free = 0; return slabp; } @@ -2683,7 +2684,6 @@ static void cache_init_objs(struct kmem_cache *cachep, slab_bufctl(slabp)[i] = i + 1; } slab_bufctl(slabp)[i - 1] = BUFCTL_END; - slabp->free = 0; } static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) @@ -2816,7 +2816,6 @@ static int cache_grow(struct kmem_cache *cachep, if (!slabp) goto opps1; - slabp->nodeid = nodeid; slab_map_pages(cachep, slabp, objp); cache_init_objs(cachep, slabp); -- cgit v1.2.3 From eada35efcb2773cf49aa26277e056122e1a3405c Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Mon, 11 Feb 2008 22:47:46 +0200 Subject: slub: kmalloc page allocator pass-through cleanup This adds a proper function for kmalloc page allocator pass-through. While it simplifies any code that does slab tracing code a lot, I think it's a worthwhile cleanup in itself. Signed-off-by: Pekka Enberg Signed-off-by: Christoph Lameter --- mm/slub.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index e2989ae243b..7870ef9d863 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2671,8 +2671,7 @@ void *__kmalloc(size_t size, gfp_t flags) struct kmem_cache *s; if (unlikely(size > PAGE_SIZE / 2)) - return (void *)__get_free_pages(flags | __GFP_COMP, - get_order(size)); + return kmalloc_large(size, flags); s = get_slab(size, flags); @@ -2689,8 +2688,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) struct kmem_cache *s; if (unlikely(size > PAGE_SIZE / 2)) - return (void *)__get_free_pages(flags | __GFP_COMP, - get_order(size)); + return kmalloc_large(size, flags); s = get_slab(size, flags); @@ -3219,8 +3217,8 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) struct kmem_cache *s; if (unlikely(size > PAGE_SIZE / 2)) - return (void *)__get_free_pages(gfpflags | __GFP_COMP, - get_order(size)); + return kmalloc_large(size, gfpflags); + s = get_slab(size, gfpflags); if (unlikely(ZERO_OR_NULL_PTR(s))) @@ -3235,8 +3233,8 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, struct kmem_cache *s; if (unlikely(size > PAGE_SIZE / 2)) - return (void *)__get_free_pages(gfpflags | __GFP_COMP, - get_order(size)); + return kmalloc_large(size, gfpflags); + s = get_slab(size, gfpflags); if (unlikely(ZERO_OR_NULL_PTR(s))) -- cgit v1.2.3 From dada123d99c241d1a45798a7c77bcf99c4968704 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 13 Feb 2008 23:30:32 +0200 Subject: make slub.c:slab_address() static slab_address() can become static. Signed-off-by: Adrian Bunk Signed-off-by: Christoph Lameter --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 7870ef9d863..1af7f2f1942 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -308,7 +308,7 @@ static inline int is_end(void *addr) return (unsigned long)addr & PAGE_MAPPING_ANON; } -void *slab_address(struct page *page) +static void *slab_address(struct page *page) { return page->end - PAGE_MAPPING_ANON; } -- cgit v1.2.3 From b7a49f0d4c34166ae84089d9f145cfaae1b0eec5 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 14 Feb 2008 14:21:32 -0800 Subject: slub: Determine gfpflags once and not every time a slab is allocated Currently we determine the gfp flags to pass to the page allocator each time a slab is being allocated. Determine the bits to be set at the time the slab is created. Store in a new allocflags field and add the flags in allocate_slab(). Acked-by: Mel Gorman Reviewed-by: Pekka Enberg Signed-off-by: Christoph Lameter --- mm/slub.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 1af7f2f1942..ccfd41141b6 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1078,14 +1078,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) struct page *page; int pages = 1 << s->order; - if (s->order) - flags |= __GFP_COMP; - - if (s->flags & SLAB_CACHE_DMA) - flags |= SLUB_DMA; - - if (s->flags & SLAB_RECLAIM_ACCOUNT) - flags |= __GFP_RECLAIMABLE; + flags |= s->allocflags; if (node == -1) page = alloc_pages(flags, s->order); @@ -2333,6 +2326,16 @@ static int calculate_sizes(struct kmem_cache *s) if (s->order < 0) return 0; + s->allocflags = 0; + if (s->order) + s->allocflags |= __GFP_COMP; + + if (s->flags & SLAB_CACHE_DMA) + s->allocflags |= SLUB_DMA; + + if (s->flags & SLAB_RECLAIM_ACCOUNT) + s->allocflags |= __GFP_RECLAIMABLE; + /* * Determine the number of objects per slab */ -- cgit v1.2.3 From 71c7a06ff0a2ba0434ace4d7aa679537c4211d9d Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 14 Feb 2008 14:28:01 -0800 Subject: slub: Fallback to kmalloc_large for failing higher order allocs Slub already has two ways of allocating an object. One is via its own logic and the other is via the call to kmalloc_large to hand off object allocation to the page allocator. kmalloc_large is typically used for objects >= PAGE_SIZE. We can use that handoff to avoid failing if a higher order kmalloc slab allocation cannot be satisfied by the page allocator. If we reach the out of memory path then simply try a kmalloc_large(). kfree() can already handle the case of an object that was allocated via the page allocator and so this will work just fine (apart from object accounting...). For any kmalloc slab that already requires higher order allocs (which makes it impossible to use the page allocator fastpath!) we just use PAGE_ALLOC_COSTLY_ORDER to get the largest number of objects in one go from the page allocator slowpath. On a 4k platform this patch will lead to the following use of higher order pages for the following kmalloc slabs: 8 ... 1024 order 0 2048 .. 4096 order 3 (4k slab only after the next patch) We may waste some space if fallback occurs on a 2k slab but we are always able to fallback to an order 0 alloc. Reviewed-by: Pekka Enberg Signed-off-by: Christoph Lameter --- mm/slub.c | 43 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index ccfd41141b6..644fd0aaeaf 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -211,6 +211,8 @@ static inline void ClearSlabDebug(struct page *page) /* Internal SLUB flags */ #define __OBJECT_POISON 0x80000000 /* Poison object */ #define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */ +#define __KMALLOC_CACHE 0x20000000 /* objects freed using kfree */ +#define __PAGE_ALLOC_FALLBACK 0x10000000 /* Allow fallback to page alloc */ /* Not all arches define cache_line_size */ #ifndef cache_line_size @@ -1539,7 +1541,6 @@ load_freelist: unlock_out: slab_unlock(c->page); stat(c, ALLOC_SLOWPATH); -out: #ifdef SLUB_FASTPATH local_irq_restore(flags); #endif @@ -1574,8 +1575,24 @@ new_slab: c->page = new; goto load_freelist; } - object = NULL; - goto out; +#ifdef SLUB_FASTPATH + local_irq_restore(flags); +#endif + /* + * No memory available. + * + * If the slab uses higher order allocs but the object is + * smaller than a page size then we can fallback in emergencies + * to the page allocator via kmalloc_large. The page allocator may + * have failed to obtain a higher order page and we can try to + * allocate a single page if the object fits into a single page. + * That is only possible if certain conditions are met that are being + * checked when a slab is created. + */ + if (!(gfpflags & __GFP_NORETRY) && (s->flags & __PAGE_ALLOC_FALLBACK)) + return kmalloc_large(s->objsize, gfpflags); + + return NULL; debug: object = c->page->freelist; if (!alloc_debug_processing(s, c->page, object, addr)) @@ -2322,7 +2339,20 @@ static int calculate_sizes(struct kmem_cache *s) size = ALIGN(size, align); s->size = size; - s->order = calculate_order(size); + if ((flags & __KMALLOC_CACHE) && + PAGE_SIZE / size < slub_min_objects) { + /* + * Kmalloc cache that would not have enough objects in + * an order 0 page. Kmalloc slabs can fallback to + * page allocator order 0 allocs so take a reasonably large + * order that will allows us a good number of objects. + */ + s->order = max(slub_max_order, PAGE_ALLOC_COSTLY_ORDER); + s->flags |= __PAGE_ALLOC_FALLBACK; + s->allocflags |= __GFP_NOWARN; + } else + s->order = calculate_order(size); + if (s->order < 0) return 0; @@ -2539,7 +2569,7 @@ static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, down_write(&slub_lock); if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, - flags, NULL)) + flags | __KMALLOC_CACHE, NULL)) goto panic; list_add(&s->list, &slab_caches); @@ -3058,6 +3088,9 @@ static int slab_unmergeable(struct kmem_cache *s) if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) return 1; + if ((s->flags & __PAGE_ALLOC_FALLBACK) + return 1; + if (s->ctor) return 1; -- cgit v1.2.3 From 331dc558fa020451ff773973cee855fd721aa88e Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 14 Feb 2008 14:28:09 -0800 Subject: slub: Support 4k kmallocs again to compensate for page allocator slowness Currently we hand off PAGE_SIZEd kmallocs to the page allocator in the mistaken belief that the page allocator can handle these allocations effectively. However, measurements indicate a minimum slowdown by the factor of 8 (and that is only SMP, NUMA is much worse) vs the slub fastpath which causes regressions in tbench. Increase the number of kmalloc caches by one so that we again handle 4k kmallocs directly from slub. 4k page buffering for the page allocator will be performed by slub like done by slab. At some point the page allocator fastpath should be fixed. A lot of the kernel would benefit from a faster ability to allocate a single page. If that is done then the 4k allocs may again be forwarded to the page allocator and this patch could be reverted. Reviewed-by: Pekka Enberg Acked-by: Mel Gorman Signed-off-by: Christoph Lameter --- mm/slub.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 644fd0aaeaf..4b3895cb90e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2517,11 +2517,11 @@ EXPORT_SYMBOL(kmem_cache_destroy); * Kmalloc subsystem *******************************************************************/ -struct kmem_cache kmalloc_caches[PAGE_SHIFT] __cacheline_aligned; +struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1] __cacheline_aligned; EXPORT_SYMBOL(kmalloc_caches); #ifdef CONFIG_ZONE_DMA -static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT]; +static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT + 1]; #endif static int __init setup_slub_min_order(char *str) @@ -2703,7 +2703,7 @@ void *__kmalloc(size_t size, gfp_t flags) { struct kmem_cache *s; - if (unlikely(size > PAGE_SIZE / 2)) + if (unlikely(size > PAGE_SIZE)) return kmalloc_large(size, flags); s = get_slab(size, flags); @@ -2720,7 +2720,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) { struct kmem_cache *s; - if (unlikely(size > PAGE_SIZE / 2)) + if (unlikely(size > PAGE_SIZE)) return kmalloc_large(size, flags); s = get_slab(size, flags); @@ -3032,7 +3032,7 @@ void __init kmem_cache_init(void) caches++; } - for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++) { + for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) { create_kmalloc_cache(&kmalloc_caches[i], "kmalloc", 1 << i, GFP_KERNEL); caches++; @@ -3059,7 +3059,7 @@ void __init kmem_cache_init(void) slab_state = UP; /* Provide the correct kmalloc names now that the caches are up */ - for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++) + for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) kmalloc_caches[i]. name = kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); @@ -3088,7 +3088,7 @@ static int slab_unmergeable(struct kmem_cache *s) if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) return 1; - if ((s->flags & __PAGE_ALLOC_FALLBACK) + if ((s->flags & __PAGE_ALLOC_FALLBACK)) return 1; if (s->ctor) @@ -3252,7 +3252,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) { struct kmem_cache *s; - if (unlikely(size > PAGE_SIZE / 2)) + if (unlikely(size > PAGE_SIZE)) return kmalloc_large(size, gfpflags); s = get_slab(size, gfpflags); @@ -3268,7 +3268,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, { struct kmem_cache *s; - if (unlikely(size > PAGE_SIZE / 2)) + if (unlikely(size > PAGE_SIZE)) return kmalloc_large(size, gfpflags); s = get_slab(size, gfpflags); -- cgit v1.2.3 From c32c2f63a9d6c953aaf168c0b2551da9734f76d2 Mon Sep 17 00:00:00 2001 From: Jan Blunck Date: Thu, 14 Feb 2008 19:38:43 -0800 Subject: d_path: Make seq_path() use a struct path argument seq_path() is always called with a dentry and a vfsmount from a struct path. Make seq_path() take it directly as an argument. Signed-off-by: Jan Blunck Cc: Christoph Hellwig Cc: Al Viro Cc: "J. Bruce Fields" Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 2 +- mm/swapfile.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 8d246c3b340..6c7ba1a63d2 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1996,7 +1996,7 @@ int show_numa_map(struct seq_file *m, void *v) if (file) { seq_printf(m, " file="); - seq_path(m, file->f_path.mnt, file->f_path.dentry, "\n\t= "); + seq_path(m, &file->f_path, "\n\t= "); } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { seq_printf(m, " heap"); } else if (vma->vm_start <= mm->start_stack && diff --git a/mm/swapfile.c b/mm/swapfile.c index 02ccab5ad9d..2da149cfc9a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1394,7 +1394,7 @@ static int swap_show(struct seq_file *swap, void *v) } file = ptr->swap_file; - len = seq_path(swap, file->f_path.mnt, file->f_path.dentry, " \t\n\\"); + len = seq_path(swap, &file->f_path, " \t\n\\"); seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", len < 40 ? 40 - len : 1, " ", S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? -- cgit v1.2.3 From cf28b4863f9ee8f122e8ff3ac0d403e07ba9c6d9 Mon Sep 17 00:00:00 2001 From: Jan Blunck Date: Thu, 14 Feb 2008 19:38:44 -0800 Subject: d_path: Make d_path() use a struct path d_path() is used on a pair. Lets use a struct path to reflect this. [akpm@linux-foundation.org: fix build in mm/memory.c] Signed-off-by: Jan Blunck Acked-by: Bryan Wu Acked-by: Christoph Hellwig Cc: Al Viro Cc: "J. Bruce Fields" Cc: Neil Brown Cc: Michael Halcrow Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 717aa0e3be2..e7a6dcacefc 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2719,7 +2719,7 @@ void print_vma_addr(char *prefix, unsigned long ip) if (buf) { char *p, *s; - p = d_path(f->f_dentry, f->f_vfsmnt, buf, PAGE_SIZE); + p = d_path(&f->f_path, buf, PAGE_SIZE); if (IS_ERR(p)) p = "?"; s = strrchr(p, '/'); -- cgit v1.2.3 From 00e962c5408b9f2d0bebd2308673fe982cb9a5fe Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 19 Feb 2008 09:08:49 -0800 Subject: Revert "SLUB: Alternate fast paths using cmpxchg_local" This reverts commit 1f84260c8ce3b1ce26d4c1d6dedc2f33a3a29c0c, which is suspected to be the reason for some very occasional and hard-to-trigger crashes that usually look related to memory allocation (mostly reported in networking, but since that's generally the most common source of shortlived allocations - and allocations in interrupt contexts - that in itself is not a big clue). See for example http://bugzilla.kernel.org/show_bug.cgi?id=9973 http://lkml.org/lkml/2008/2/19/278 etc. One promising suspicion for what the root cause of bug is (which also explains why it's so hard to trigger in practice) came from Eric Dumazet: "I wonder how SLUB_FASTPATH is supposed to work, since it is affected by a classical ABA problem of lockless algo. cmpxchg_local(&c->freelist, object, object[c->offset]) can succeed, while an interrupt came (on this cpu), and several allocations were done, and one free was performed at the end of this interruption, so 'object' was recycled. c->freelist can then contain the previous value (object), but object[c->offset] was changed by IRQ. We then put back in freelist an already allocated object." but another reason for the revert is simply that everybody agrees that this code was the main suspect just by virtue of the pattern of oopses. Cc: Torsten Kaiser Cc: Christoph Lameter Cc: Mathieu Desnoyers Cc: Pekka Enberg Cc: Ingo Molnar Cc: Eric Dumazet Signed-off-by: Linus Torvalds --- mm/slub.c | 87 +-------------------------------------------------------------- 1 file changed, 1 insertion(+), 86 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 4b3895cb90e..74c65af0a54 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -149,13 +149,6 @@ static inline void ClearSlabDebug(struct page *page) /* Enable to test recovery from slab corruption on boot */ #undef SLUB_RESILIENCY_TEST -/* - * Currently fastpath is not supported if preemption is enabled. - */ -#if defined(CONFIG_FAST_CMPXCHG_LOCAL) && !defined(CONFIG_PREEMPT) -#define SLUB_FASTPATH -#endif - #if PAGE_SHIFT <= 12 /* @@ -1514,11 +1507,7 @@ static void *__slab_alloc(struct kmem_cache *s, { void **object; struct page *new; -#ifdef SLUB_FASTPATH - unsigned long flags; - local_irq_save(flags); -#endif if (!c->page) goto new_slab; @@ -1541,9 +1530,6 @@ load_freelist: unlock_out: slab_unlock(c->page); stat(c, ALLOC_SLOWPATH); -#ifdef SLUB_FASTPATH - local_irq_restore(flags); -#endif return object; another_slab: @@ -1575,9 +1561,7 @@ new_slab: c->page = new; goto load_freelist; } -#ifdef SLUB_FASTPATH - local_irq_restore(flags); -#endif + /* * No memory available. * @@ -1619,34 +1603,6 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, { void **object; struct kmem_cache_cpu *c; - -/* - * The SLUB_FASTPATH path is provisional and is currently disabled if the - * kernel is compiled with preemption or if the arch does not support - * fast cmpxchg operations. There are a couple of coming changes that will - * simplify matters and allow preemption. Ultimately we may end up making - * SLUB_FASTPATH the default. - * - * 1. The introduction of the per cpu allocator will avoid array lookups - * through get_cpu_slab(). A special register can be used instead. - * - * 2. The introduction of per cpu atomic operations (cpu_ops) means that - * we can realize the logic here entirely with per cpu atomics. The - * per cpu atomic ops will take care of the preemption issues. - */ - -#ifdef SLUB_FASTPATH - c = get_cpu_slab(s, raw_smp_processor_id()); - do { - object = c->freelist; - if (unlikely(is_end(object) || !node_match(c, node))) { - object = __slab_alloc(s, gfpflags, node, addr, c); - break; - } - stat(c, ALLOC_FASTPATH); - } while (cmpxchg_local(&c->freelist, object, object[c->offset]) - != object); -#else unsigned long flags; local_irq_save(flags); @@ -1661,7 +1617,6 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, stat(c, ALLOC_FASTPATH); } local_irq_restore(flags); -#endif if (unlikely((gfpflags & __GFP_ZERO) && object)) memset(object, 0, c->objsize); @@ -1698,11 +1653,6 @@ static void __slab_free(struct kmem_cache *s, struct page *page, void **object = (void *)x; struct kmem_cache_cpu *c; -#ifdef SLUB_FASTPATH - unsigned long flags; - - local_irq_save(flags); -#endif c = get_cpu_slab(s, raw_smp_processor_id()); stat(c, FREE_SLOWPATH); slab_lock(page); @@ -1734,9 +1684,6 @@ checks_ok: out_unlock: slab_unlock(page); -#ifdef SLUB_FASTPATH - local_irq_restore(flags); -#endif return; slab_empty: @@ -1749,9 +1696,6 @@ slab_empty: } slab_unlock(page); stat(c, FREE_SLAB); -#ifdef SLUB_FASTPATH - local_irq_restore(flags); -#endif discard_slab(s, page); return; @@ -1777,34 +1721,6 @@ static __always_inline void slab_free(struct kmem_cache *s, { void **object = (void *)x; struct kmem_cache_cpu *c; - -#ifdef SLUB_FASTPATH - void **freelist; - - c = get_cpu_slab(s, raw_smp_processor_id()); - debug_check_no_locks_freed(object, s->objsize); - do { - freelist = c->freelist; - barrier(); - /* - * If the compiler would reorder the retrieval of c->page to - * come before c->freelist then an interrupt could - * change the cpu slab before we retrieve c->freelist. We - * could be matching on a page no longer active and put the - * object onto the freelist of the wrong slab. - * - * On the other hand: If we already have the freelist pointer - * then any change of cpu_slab will cause the cmpxchg to fail - * since the freelist pointers are unique per slab. - */ - if (unlikely(page != c->page || c->node < 0)) { - __slab_free(s, page, x, addr, c->offset); - break; - } - object[c->offset] = freelist; - stat(c, FREE_FASTPATH); - } while (cmpxchg_local(&c->freelist, freelist, object) != freelist); -#else unsigned long flags; local_irq_save(flags); @@ -1818,7 +1734,6 @@ static __always_inline void slab_free(struct kmem_cache *s, __slab_free(s, page, x, addr, c->offset); local_irq_restore(flags); -#endif } void kmem_cache_free(struct kmem_cache *s, void *x) -- cgit v1.2.3 From e5df70ab194543522397fa3da8c8f80564a0f7d3 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Sat, 23 Feb 2008 15:23:32 -0800 Subject: hugetlb: ensure we do not reference a surplus page after handing it to buddy When we free a page via free_huge_page and we detect that we are in surplus the page will be returned to the buddy. After this we no longer own the page. However at the end free_huge_page we clear out our mapping pointer from page private. Even where the page is not a surplus we free the page to the hugepage pool, drop the pool locks and then clear page private. In either case the page may have been reallocated. BAD. Make sure we clear out page private before we free the page. Signed-off-by: Andy Whitcroft Acked-by: Adam Litke Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cb1b3a7ecdf..89e6286a7f5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -120,6 +120,7 @@ static void free_huge_page(struct page *page) struct address_space *mapping; mapping = (struct address_space *) page_private(page); + set_page_private(page, 0); BUG_ON(page_count(page)); INIT_LIST_HEAD(&page->lru); @@ -134,7 +135,6 @@ static void free_huge_page(struct page *page) spin_unlock(&hugetlb_lock); if (mapping) hugetlb_put_quota(mapping, 1); - set_page_private(page, 0); } /* -- cgit v1.2.3 From b5a0e011329431b90d315eaf6ca5fdb41df7a117 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Sat, 23 Feb 2008 15:24:06 -0800 Subject: Solve section mismatch for free_area_init_core. WARNING: vmlinux.o(.meminit.text+0x649): Section mismatch in reference from the function free_area_init_core() to the function .init.text:setup_usemap() The function __meminit free_area_init_core() references a function __init setup_usemap(). If free_area_init_core is only used by setup_usemap then annotate free_area_init_core with a matching annotation. The warning is covers this stack of functions in mm/page_alloc.c: alloc_bootmem_node must be marked __init. alloc_bootmem_node is used by setup_usemap, if !SPARSEMEM. (usemap_size is only used by setup_usemap, if !SPARSEMEM.) setup_usemap is only used by free_area_init_core. free_area_init_core is only used by free_area_init_node. free_area_init_node is used by: arch/alpha/mm/numa.c: __init paging_init() arch/arm/mm/init.c: __init bootmem_init_node() arch/avr32/mm/init.c: __init paging_init() arch/cris/arch-v10/mm/init.c: __init paging_init() arch/cris/arch-v32/mm/init.c: __init paging_init() arch/m32r/mm/discontig.c: __init zone_sizes_init() arch/m32r/mm/init.c: __init zone_sizes_init() arch/m68k/mm/motorola.c: __init paging_init() arch/m68k/mm/sun3mmu.c: __init paging_init() arch/mips/sgi-ip27/ip27-memory.c: __init paging_init() arch/parisc/mm/init.c: __init paging_init() arch/sparc/mm/srmmu.c: __init srmmu_paging_init() arch/sparc/mm/sun4c.c: __init sun4c_paging_init() arch/sparc64/mm/init.c: __init paging_init() mm/page_alloc.c: __init free_area_init_nodes() mm/page_alloc.c: __init free_area_init() and mm/memory_hotplug.c: hotadd_new_pgdat() hotadd_new_pgdat can not be an __init function, but: It is compiled for MEMORY_HOTPLUG configurations only MEMORY_HOTPLUG depends on SPARSEMEM || X86_64_ACPI_NUMA X86_64_ACPI_NUMA depends on X86_64 ARCH_FLATMEM_ENABLE depends on X86_32 ARCH_DISCONTIGMEM_ENABLE depends on X86_32 So X86_64_ACPI_NUMA implies SPARSEMEM, right? So we can mark the stack of functions __init for !SPARSEMEM, but we must mark them __meminit for SPARSEMEM configurations. This is ok, because then the calls to alloc_bootmem_node are also avoided. Compile-tested on: silly minimal config defconfig x86_32 defconfig x86_64 defconfig x86_64 -HIBERNATION +MEMORY_HOTPLUG Signed-off-by: Alexander van Heukelum Reviewed-by: Sam Ravnborg Acked-by: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 13 +++++++++++++ mm/page_alloc.c | 4 ++-- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index 5a9a6200e03..789727309f4 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -47,4 +47,17 @@ static inline unsigned long page_order(struct page *page) VM_BUG_ON(!PageBuddy(page)); return page_private(page); } + +/* + * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node, + * so all functions starting at paging_init should be marked __init + * in those cases. SPARSEMEM, however, allows for memory hotplug, + * and alloc_bootmem_node is not used. + */ +#ifdef CONFIG_SPARSEMEM +#define __paginginit __meminit +#else +#define __paginginit __init +#endif + #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 75b97931334..8896e874a67 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3314,7 +3314,7 @@ static inline int pageblock_default_order(unsigned int order) * - mark all memory queues empty * - clear the memory bitmaps */ -static void __meminit free_area_init_core(struct pglist_data *pgdat, +static void __paginginit free_area_init_core(struct pglist_data *pgdat, unsigned long *zones_size, unsigned long *zholes_size) { enum zone_type j; @@ -3438,7 +3438,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) #endif /* CONFIG_FLAT_NODE_MEM_MAP */ } -void __meminit free_area_init_node(int nid, struct pglist_data *pgdat, +void __paginginit free_area_init_node(int nid, struct pglist_data *pgdat, unsigned long *zones_size, unsigned long node_start_pfn, unsigned long *zholes_size) { -- cgit v1.2.3 From 7fde4c3eb7ee68828d76a2148ed6d70b6a794add Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sat, 23 Feb 2008 15:24:13 -0800 Subject: memcgroup: remove a useless VM_BUG_ON() Remove this VM_BUG_ON(), as Balbir stated: We used to have a for loop with !list_empty() as a termination condition and VM_BUG_ON(!pc) is a spill over. With the new loop, VM_BUG_ON(!pc) does not make sense. Signed-off-by: Li Zefan Acked-by: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6bded84c20c..bb4d7acfbca 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -534,7 +534,6 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, if (scan >= nr_to_scan) break; page = pc->page; - VM_BUG_ON(!pc); if (unlikely(!PageLRU(page))) continue; -- cgit v1.2.3 From 2dda81ca31dc73e695ff8b83351f7aaefbef192a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sat, 23 Feb 2008 15:24:14 -0800 Subject: memcgroup: return negative error code in mem_cgroup_create() Cgroup requires the subsystem to return negative error code on error in the create method. Signed-off-by: Li Zefan Acked-by: KAMEZAWA Hiroyuki Acked-by: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bb4d7acfbca..631002d085d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1100,7 +1100,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) mem = kzalloc(sizeof(struct mem_cgroup), GFP_KERNEL); if (mem == NULL) - return NULL; + return ERR_PTR(-ENOMEM); res_counter_init(&mem->res); @@ -1116,7 +1116,7 @@ free_out: free_mem_cgroup_per_zone_info(mem, node); if (cont->parent != NULL) kfree(mem); - return NULL; + return ERR_PTR(-ENOMEM); } static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss, -- cgit v1.2.3