Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig          |  6
-rw-r--r--  mm/bootmem.c        | 37
-rw-r--r--  mm/bounce.c         |  2
-rw-r--r--  mm/filemap.c        | 26
-rw-r--r--  mm/filemap_xip.c    | 65
-rw-r--r--  mm/highmem.c        |  5
-rw-r--r--  mm/hugetlb.c        | 62
-rw-r--r--  mm/memcontrol.c     | 20
-rw-r--r--  mm/mempolicy.c      |  1
-rw-r--r--  mm/mm_init.c        |  2
-rw-r--r--  mm/mmap.c           | 24
-rw-r--r--  mm/mmzone.c         |  2
-rw-r--r--  mm/oom_kill.c       |  6
-rw-r--r--  mm/page_alloc.c     | 24
-rw-r--r--  mm/page_isolation.c | 13
-rw-r--r--  mm/quicklist.c      |  9
-rw-r--r--  mm/rmap.c           | 39
-rw-r--r--  mm/shmem.c          |  4
-rw-r--r--  mm/slab.c           |  1
-rw-r--r--  mm/slob.c           |  9
-rw-r--r--  mm/slub.c           | 32
-rw-r--r--  mm/sparse.c         |  1
-rw-r--r--  mm/swap_state.c     |  2
-rw-r--r--  mm/tiny-shmem.c     | 26
-rw-r--r--  mm/truncate.c       |  4
-rw-r--r--  mm/util.c           | 15
-rw-r--r--  mm/vmalloc.c        |  7
-rw-r--r--  mm/vmstat.c         | 19
28 files changed, 340 insertions, 123 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 446c6588c75..91ee3922510 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -77,9 +77,6 @@ config FLAT_NODE_MEM_MAP def_bool y depends on !SPARSEMEM -config HAVE_GET_USER_PAGES_FAST - bool - # # Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's # to represent different areas of memory. This variable allows @@ -190,6 +187,9 @@ config RESOURCES_64BIT help This option allows memory and IO resources to be 64 bit. +config PHYS_ADDR_T_64BIT + def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT + config ZONE_DMA_FLAG int default "0" if !ZONE_DMA diff --git a/mm/bootmem.c b/mm/bootmem.c index 4af15d0340a..ad8eec6e44a 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -405,6 +405,29 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size, } #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ +static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx, + unsigned long step) +{ + unsigned long base = bdata->node_min_pfn; + + /* + * Align the index with respect to the node start so that the + * combination of both satisfies the requested alignment. + */ + + return ALIGN(base + idx, step) - base; +} + +static unsigned long align_off(struct bootmem_data *bdata, unsigned long off, + unsigned long align) +{ + unsigned long base = PFN_PHYS(bdata->node_min_pfn); + + /* Same as align_idx for byte offsets */ + + return ALIGN(base + off, align) - base; +} + static void * __init alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) @@ -441,7 +464,7 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata, else start = ALIGN(min, step); - sidx = start - bdata->node_min_pfn;; + sidx = start - bdata->node_min_pfn; midx = max - bdata->node_min_pfn; if (bdata->hint_idx > sidx) { @@ -450,7 +473,7 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata, * catch the fallback below. 
*/ fallback = sidx + 1; - sidx = ALIGN(bdata->hint_idx, step); + sidx = align_idx(bdata, bdata->hint_idx, step); } while (1) { @@ -459,7 +482,7 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata, unsigned long eidx, i, start_off, end_off; find_block: sidx = find_next_zero_bit(bdata->node_bootmem_map, midx, sidx); - sidx = ALIGN(sidx, step); + sidx = align_idx(bdata, sidx, step); eidx = sidx + PFN_UP(size); if (sidx >= midx || eidx > midx) @@ -467,15 +490,15 @@ find_block: for (i = sidx; i < eidx; i++) if (test_bit(i, bdata->node_bootmem_map)) { - sidx = ALIGN(i, step); + sidx = align_idx(bdata, i, step); if (sidx == i) sidx += step; goto find_block; } - if (bdata->last_end_off && + if (bdata->last_end_off & (PAGE_SIZE - 1) && PFN_DOWN(bdata->last_end_off) + 1 == sidx) - start_off = ALIGN(bdata->last_end_off, align); + start_off = align_off(bdata, bdata->last_end_off, align); else start_off = PFN_PHYS(sidx); @@ -499,7 +522,7 @@ find_block: } if (fallback) { - sidx = ALIGN(fallback - 1, step); + sidx = align_idx(bdata, fallback - 1, step); fallback = 0; goto find_block; } diff --git a/mm/bounce.c b/mm/bounce.c index b6d2d0f1019..06722c40305 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -267,7 +267,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) /* * Data-less bio, nothing to bounce */ - if (bio_empty_barrier(*bio_orig)) + if (!bio_has_data(*bio_orig)) return; /* diff --git a/mm/filemap.c b/mm/filemap.c index 54e96865085..494ff20b6cf 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1100,8 +1100,9 @@ page_ok: page_not_up_to_date: /* Get exclusive access to the page ... */ - if (lock_page_killable(page)) - goto readpage_eio; + error = lock_page_killable(page); + if (unlikely(error)) + goto readpage_error; page_not_up_to_date_locked: /* Did it get truncated before we got the lock? */ @@ -1130,8 +1131,9 @@ readpage: } if (!PageUptodate(page)) { - if (lock_page_killable(page)) - goto readpage_eio; + error = lock_page_killable(page); + if (unlikely(error)) + goto readpage_error; if (!PageUptodate(page)) { if (page->mapping == NULL) { /* @@ -1143,15 +1145,14 @@ readpage: } unlock_page(page); shrink_readahead_size_eio(filp, ra); - goto readpage_eio; + error = -EIO; + goto readpage_error; } unlock_page(page); } goto page_ok; -readpage_eio: - error = -EIO; readpage_error: /* UHHUH! A synchronous read error occurred. Report it */ desc->error = error; @@ -2129,13 +2130,20 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, * After a write we want buffered reads to be sure to go to disk to get * the new data. We invalidate clean cached page from the region we're * about to write. We do this *before* the write so that we can return - * -EIO without clobbering -EIOCBQUEUED from ->direct_IO(). + * without clobbering -EIOCBQUEUED from ->direct_IO(). */ if (mapping->nrpages) { written = invalidate_inode_pages2_range(mapping, pos >> PAGE_CACHE_SHIFT, end); - if (written) + /* + * If a page can not be invalidated, return 0 to fall back + * to buffered write. 
+ */ + if (written) { + if (written == -EBUSY) + return 0; goto out; + } } written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs); diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 380ab402d71..b5167dfb2f2 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -15,6 +15,8 @@ #include <linux/rmap.h> #include <linux/mmu_notifier.h> #include <linux/sched.h> +#include <linux/seqlock.h> +#include <linux/mutex.h> #include <asm/tlbflush.h> #include <asm/io.h> @@ -22,22 +24,18 @@ * We do use our own empty page to avoid interference with other users * of ZERO_PAGE(), such as /dev/zero */ +static DEFINE_MUTEX(xip_sparse_mutex); +static seqcount_t xip_sparse_seq = SEQCNT_ZERO; static struct page *__xip_sparse_page; +/* called under xip_sparse_mutex */ static struct page *xip_sparse_page(void) { if (!__xip_sparse_page) { struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO); - if (page) { - static DEFINE_SPINLOCK(xip_alloc_lock); - spin_lock(&xip_alloc_lock); - if (!__xip_sparse_page) - __xip_sparse_page = page; - else - __free_page(page); - spin_unlock(&xip_alloc_lock); - } + if (page) + __xip_sparse_page = page; } return __xip_sparse_page; } @@ -174,18 +172,23 @@ __xip_unmap (struct address_space * mapping, pte_t pteval; spinlock_t *ptl; struct page *page; + unsigned count; + int locked = 0; + + count = read_seqcount_begin(&xip_sparse_seq); page = __xip_sparse_page; if (!page) return; +retry: spin_lock(&mapping->i_mmap_lock); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { mm = vma->vm_mm; address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); BUG_ON(address < vma->vm_start || address >= vma->vm_end); - pte = page_check_address(page, mm, address, &ptl); + pte = page_check_address(page, mm, address, &ptl, 1); if (pte) { /* Nuke the page table entry. */ flush_cache_page(vma, address, pte_pfn(*pte)); @@ -198,6 +201,14 @@ __xip_unmap (struct address_space * mapping, } } spin_unlock(&mapping->i_mmap_lock); + + if (locked) { + mutex_unlock(&xip_sparse_mutex); + } else if (read_seqcount_retry(&xip_sparse_seq, count)) { + mutex_lock(&xip_sparse_mutex); + locked = 1; + goto retry; + } } /* @@ -218,7 +229,7 @@ static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf) int error; /* XXX: are VM_FAULT_ codes OK? 
*/ - +again: size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; if (vmf->pgoff >= size) return VM_FAULT_SIGBUS; @@ -237,8 +248,10 @@ static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf) int err; /* maybe shared writable, allocate new block */ + mutex_lock(&xip_sparse_mutex); error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1, &xip_mem, &xip_pfn); + mutex_unlock(&xip_sparse_mutex); if (error) return VM_FAULT_SIGBUS; /* unmap sparse mappings at pgoff from all other vmas */ @@ -252,14 +265,34 @@ found: BUG_ON(err); return VM_FAULT_NOPAGE; } else { + int err, ret = VM_FAULT_OOM; + + mutex_lock(&xip_sparse_mutex); + write_seqcount_begin(&xip_sparse_seq); + error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0, + &xip_mem, &xip_pfn); + if (unlikely(!error)) { + write_seqcount_end(&xip_sparse_seq); + mutex_unlock(&xip_sparse_mutex); + goto again; + } + if (error != -ENODATA) + goto out; /* not shared and writable, use xip_sparse_page() */ page = xip_sparse_page(); if (!page) - return VM_FAULT_OOM; + goto out; + err = vm_insert_page(vma, (unsigned long)vmf->virtual_address, + page); + if (err == -ENOMEM) + goto out; - page_cache_get(page); - vmf->page = page; - return 0; + ret = VM_FAULT_NOPAGE; +out: + write_seqcount_end(&xip_sparse_seq); + mutex_unlock(&xip_sparse_mutex); + + return ret; } } @@ -308,8 +341,10 @@ __xip_file_write(struct file *filp, const char __user *buf, &xip_mem, &xip_pfn); if (status == -ENODATA) { /* we allocate a new page unmap it */ + mutex_lock(&xip_sparse_mutex); status = a_ops->get_xip_mem(mapping, index, 1, &xip_mem, &xip_pfn); + mutex_unlock(&xip_sparse_mutex); if (!status) /* unmap page at pgoff from all other vmas */ __xip_unmap(mapping, index); diff --git a/mm/highmem.c b/mm/highmem.c index e16e1523b68..b36b83b920f 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -70,6 +70,7 @@ static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); static void flush_all_zero_pkmaps(void) { int i; + int need_flush = 0; flush_cache_kmaps(); @@ -101,8 +102,10 @@ static void flush_all_zero_pkmaps(void) &pkmap_page_table[i]); set_page_address(page, NULL); + need_flush = 1; } - flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP)); + if (need_flush) + flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP)); } /** diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 757ca983fd9..67a71191136 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -565,7 +565,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) huge_page_order(h)); if (page) { if (arch_prepare_hugepage(page)) { - __free_pages(page, HUGETLB_PAGE_ORDER); + __free_pages(page, huge_page_order(h)); return NULL; } prep_new_huge_page(h, page, nid); @@ -665,6 +665,11 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); + if (page && arch_prepare_hugepage(page)) { + __free_pages(page, huge_page_order(h)); + return NULL; + } + spin_lock(&hugetlb_lock); if (page) { /* @@ -1937,6 +1942,18 @@ retry: lock_page(page); } + /* + * If we are going to COW a private mapping later, we examine the + * pending reservations for this page now. This will ensure that + * any allocations necessary to record that reservation occur outside + * the spinlock. 
+ */ + if (write_access && !(vma->vm_flags & VM_SHARED)) + if (vma_needs_reservation(h, vma, address) < 0) { + ret = VM_FAULT_OOM; + goto backout_unlocked; + } + spin_lock(&mm->page_table_lock); size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) @@ -1962,6 +1979,7 @@ out: backout: spin_unlock(&mm->page_table_lock); +backout_unlocked: unlock_page(page); put_page(page); goto out; @@ -1973,6 +1991,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_t *ptep; pte_t entry; int ret; + struct page *pagecache_page = NULL; static DEFINE_MUTEX(hugetlb_instantiation_mutex); struct hstate *h = hstate_vma(vma); @@ -1989,25 +2008,44 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, entry = huge_ptep_get(ptep); if (huge_pte_none(entry)) { ret = hugetlb_no_page(mm, vma, address, ptep, write_access); - mutex_unlock(&hugetlb_instantiation_mutex); - return ret; + goto out_unlock; } ret = 0; + /* + * If we are going to COW the mapping later, we examine the pending + * reservations for this page now. This will ensure that any + * allocations necessary to record that reservation occur outside the + * spinlock. For private mappings, we also lookup the pagecache + * page now as it is used to determine if a reservation has been + * consumed. + */ + if (write_access && !pte_write(entry)) { + if (vma_needs_reservation(h, vma, address) < 0) { + ret = VM_FAULT_OOM; + goto out_unlock; + } + + if (!(vma->vm_flags & VM_SHARED)) + pagecache_page = hugetlbfs_pagecache_page(h, + vma, address); + } + spin_lock(&mm->page_table_lock); /* Check for a racing update before calling hugetlb_cow */ if (likely(pte_same(entry, huge_ptep_get(ptep)))) - if (write_access && !pte_write(entry)) { - struct page *page; - page = hugetlbfs_pagecache_page(h, vma, address); - ret = hugetlb_cow(mm, vma, address, ptep, entry, page); - if (page) { - unlock_page(page); - put_page(page); - } - } + if (write_access && !pte_write(entry)) + ret = hugetlb_cow(mm, vma, address, ptep, entry, + pagecache_page); spin_unlock(&mm->page_table_lock); + + if (pagecache_page) { + unlock_page(pagecache_page); + put_page(pagecache_page); + } + +out_unlock: mutex_unlock(&hugetlb_instantiation_mutex); return ret; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7056c3bdb47..36896f3eb7f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -250,6 +250,14 @@ static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) { + /* + * mm_update_next_owner() may clear mm->owner to NULL + * if it races with swapoff, page migration, etc. + * So this can be called with p == NULL. 
+ */ + if (unlikely(!p)) + return NULL; + return container_of(task_subsys_state(p, mem_cgroup_subsys_id), struct mem_cgroup, css); } @@ -549,6 +557,11 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, if (likely(!memcg)) { rcu_read_lock(); mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); + if (unlikely(!mem)) { + rcu_read_unlock(); + kmem_cache_free(page_cgroup_cache, pc); + return 0; + } /* * For every charge from the cgroup, increment reference count */ @@ -796,14 +809,21 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) if (mem_cgroup_subsys.disabled) return 0; + if (!mm) + return 0; rcu_read_lock(); mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); + if (unlikely(!mem)) { + rcu_read_unlock(); + return 0; + } css_get(&mem->css); rcu_read_unlock(); do { progress = try_to_free_mem_cgroup_pages(mem, gfp_mask); + progress += res_counter_check_under_limit(&mem->res); } while (!progress && --retry); css_put(&mem->css); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e550bec2058..83369058ec1 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -803,7 +803,6 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) { - LIST_HEAD(pagelist); int busy = 0; int err = 0; nodemask_t tmp; diff --git a/mm/mm_init.c b/mm/mm_init.c index 936ef2efd89..4e0e26591df 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -12,7 +12,7 @@ #include "internal.h" #ifdef CONFIG_DEBUG_MEMORY_INIT -int __meminitdata mminit_loglevel; +int mminit_loglevel; #ifndef SECTIONS_SHIFT #define SECTIONS_SHIFT 0 diff --git a/mm/mmap.c b/mm/mmap.c index 971d0eda754..e7a5a68a9c2 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1030,6 +1030,10 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, } else { switch (flags & MAP_TYPE) { case MAP_SHARED: + /* + * Ignore pgoff. + */ + pgoff = 0; vm_flags |= VM_SHARED | VM_MAYSHARE; break; case MAP_PRIVATE: @@ -2273,14 +2277,14 @@ int install_special_mapping(struct mm_struct *mm, static DEFINE_MUTEX(mm_all_locks_mutex); -static void vm_lock_anon_vma(struct anon_vma *anon_vma) +static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) { if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) { /* * The LSB of head.next can't change from under us * because we hold the mm_all_locks_mutex. */ - spin_lock(&anon_vma->lock); + spin_lock_nest_lock(&anon_vma->lock, &mm->mmap_sem); /* * We can safely modify head.next after taking the * anon_vma->lock. 
If some other vma in this mm shares @@ -2296,7 +2300,7 @@ static void vm_lock_anon_vma(struct anon_vma *anon_vma) } } -static void vm_lock_mapping(struct address_space *mapping) +static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) { if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { /* @@ -2310,7 +2314,7 @@ static void vm_lock_mapping(struct address_space *mapping) */ if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) BUG(); - spin_lock(&mapping->i_mmap_lock); + spin_lock_nest_lock(&mapping->i_mmap_lock, &mm->mmap_sem); } } @@ -2358,11 +2362,17 @@ int mm_take_all_locks(struct mm_struct *mm) for (vma = mm->mmap; vma; vma = vma->vm_next) { if (signal_pending(current)) goto out_unlock; - if (vma->anon_vma) - vm_lock_anon_vma(vma->anon_vma); if (vma->vm_file && vma->vm_file->f_mapping) - vm_lock_mapping(vma->vm_file->f_mapping); + vm_lock_mapping(mm, vma->vm_file->f_mapping); + } + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (signal_pending(current)) + goto out_unlock; + if (vma->anon_vma) + vm_lock_anon_vma(mm, vma->anon_vma); } + ret = 0; out_unlock: diff --git a/mm/mmzone.c b/mm/mmzone.c index 486ed595ee6..16ce8b955dc 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -69,6 +69,6 @@ struct zoneref *next_zones_zonelist(struct zoneref *z, (z->zone && !zref_in_nodemask(z, nodes))) z++; - *zone = zonelist_zone(z++); + *zone = zonelist_zone(z); return z; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 8a5467ee626..64e5b4bcd96 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -26,6 +26,7 @@ #include <linux/module.h> #include <linux/notifier.h> #include <linux/memcontrol.h> +#include <linux/security.h> int sysctl_panic_on_oom; int sysctl_oom_kill_allocating_task; @@ -128,7 +129,8 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) * Superuser processes are usually more important, so we make it * less likely that we kill those. */ - if (__capable(p, CAP_SYS_ADMIN) || __capable(p, CAP_SYS_RESOURCE)) + if (has_capability(p, CAP_SYS_ADMIN) || + has_capability(p, CAP_SYS_RESOURCE)) points /= 4; /* @@ -137,7 +139,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) * tend to only have this flag set on applications they think * of as important. 
*/ - if (__capable(p, CAP_SYS_RAWIO)) + if (has_capability(p, CAP_SYS_RAWIO)) points /= 4; /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 401d104d2bb..27b8681139f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -268,13 +268,14 @@ void prep_compound_page(struct page *page, unsigned long order) { int i; int nr_pages = 1 << order; + struct page *p = page + 1; set_compound_page_dtor(page, free_compound_page); set_compound_order(page, order); __SetPageHead(page); - for (i = 1; i < nr_pages; i++) { - struct page *p = page + i; - + for (i = 1; i < nr_pages; i++, p++) { + if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0)) + p = pfn_to_page(page_to_pfn(page) + i); __SetPageTail(p); p->first_page = page; } @@ -284,6 +285,7 @@ static void destroy_compound_page(struct page *page, unsigned long order) { int i; int nr_pages = 1 << order; + struct page *p = page + 1; if (unlikely(compound_order(page) != order)) bad_page(page); @@ -291,8 +293,9 @@ static void destroy_compound_page(struct page *page, unsigned long order) if (unlikely(!PageHead(page))) bad_page(page); __ClearPageHead(page); - for (i = 1; i < nr_pages; i++) { - struct page *p = page + i; + for (i = 1; i < nr_pages; i++, p++) { + if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0)) + p = pfn_to_page(page_to_pfn(page) + i); if (unlikely(!PageTail(p) | (p->first_page != page))) @@ -694,6 +697,9 @@ static int move_freepages(struct zone *zone, #endif for (page = start_page; page <= end_page;) { + /* Make sure we are not inadvertently changing nodes */ + VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone)); + if (!pfn_valid_within(page_to_pfn(page))) { page++; continue; @@ -2516,6 +2522,10 @@ static void setup_zone_migrate_reserve(struct zone *zone) continue; page = pfn_to_page(pfn); + /* Watch out for overlapping nodes */ + if (page_to_nid(page) != zone_to_nid(zone)) + continue; + /* Blocks with reserved pages will never free, skip them. 
*/ if (PageReserved(page)) continue; @@ -4064,7 +4074,7 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) } #ifndef CONFIG_NEED_MULTIPLE_NODES -struct pglist_data contig_page_data = { .bdata = &bootmem_node_data[0] }; +struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] }; EXPORT_SYMBOL(contig_page_data); #endif @@ -4437,7 +4447,7 @@ void *__init alloc_large_system_hash(const char *tablename, do { size = bucketsize << log2qty; if (flags & HASH_EARLY) - table = alloc_bootmem(size); + table = alloc_bootmem_nopanic(size); else if (hashdist) table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); else { diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 3444b58033c..b70a7fec1ff 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -2,7 +2,6 @@ * linux/mm/page_isolation.c */ -#include <stddef.h> #include <linux/mm.h> #include <linux/page-isolation.h> #include <linux/pageblock-flags.h> @@ -115,8 +114,10 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) { - unsigned long pfn; + unsigned long pfn, flags; struct page *page; + struct zone *zone; + int ret; pfn = start_pfn; /* @@ -132,7 +133,9 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) if (pfn < end_pfn) return -EBUSY; /* Check all pages are free or Marked as ISOLATED */ - if (__test_page_isolated_in_pageblock(start_pfn, end_pfn)) - return 0; - return -EBUSY; + zone = page_zone(pfn_to_page(pfn)); + spin_lock_irqsave(&zone->lock, flags); + ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn); + spin_unlock_irqrestore(&zone->lock, flags); + return ret ? 0 : -EBUSY; } diff --git a/mm/quicklist.c b/mm/quicklist.c index 3f703f7cb39..8dbb6805ef3 100644 --- a/mm/quicklist.c +++ b/mm/quicklist.c @@ -26,7 +26,10 @@ DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK]; static unsigned long max_pages(unsigned long min_pages) { unsigned long node_free_pages, max; - struct zone *zones = NODE_DATA(numa_node_id())->node_zones; + int node = numa_node_id(); + struct zone *zones = NODE_DATA(node)->node_zones; + int num_cpus_on_node; + node_to_cpumask_ptr(cpumask_on_node, node); node_free_pages = #ifdef CONFIG_ZONE_DMA @@ -38,6 +41,10 @@ static unsigned long max_pages(unsigned long min_pages) zone_page_state(&zones[ZONE_NORMAL], NR_FREE_PAGES); max = node_free_pages / FRACTION_OF_NODE_MEM; + + num_cpus_on_node = cpus_weight_nr(*cpumask_on_node); + max /= num_cpus_on_node; + return max(max, min_pages); } diff --git a/mm/rmap.c b/mm/rmap.c index 1ea4e6fcee7..0383acfcb06 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -224,10 +224,14 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) /* * Check that @page is mapped at @address into @mm. * + * If @sync is false, page_check_address may perform a racy check to avoid + * the page table lock when the pte is not present (helpful when reclaiming + * highly shared pages). + * * On success returns with pte mapped and locked. 
*/ pte_t *page_check_address(struct page *page, struct mm_struct *mm, - unsigned long address, spinlock_t **ptlp) + unsigned long address, spinlock_t **ptlp, int sync) { pgd_t *pgd; pud_t *pud; @@ -249,7 +253,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm, pte = pte_offset_map(pmd, address); /* Make a quick check before getting the lock */ - if (!pte_present(*pte)) { + if (!sync && !pte_present(*pte)) { pte_unmap(pte); return NULL; } @@ -281,7 +285,7 @@ static int page_referenced_one(struct page *page, if (address == -EFAULT) goto out; - pte = page_check_address(page, mm, address, &ptl); + pte = page_check_address(page, mm, address, &ptl, 0); if (!pte) goto out; @@ -450,7 +454,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma) if (address == -EFAULT) goto out; - pte = page_check_address(page, mm, address, &ptl); + pte = page_check_address(page, mm, address, &ptl, 1); if (!pte) goto out; @@ -659,23 +663,30 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma) } /* - * It would be tidy to reset the PageAnon mapping here, - * but that might overwrite a racing page_add_anon_rmap - * which increments mapcount after us but sets mapping - * before us: so leave the reset to free_hot_cold_page, - * and remember that it's only reliable while mapped. - * Leaving it set also helps swapoff to reinstate ptes - * faster for those pages still in swapcache. + * Now that the last pte has gone, s390 must transfer dirty + * flag from storage key to struct page. We can usually skip + * this if the page is anon, so about to be freed; but perhaps + * not if it's in swapcache - there might be another pte slot + * containing the swap entry, but page not yet written to swap. */ if ((!PageAnon(page) || PageSwapCache(page)) && page_test_dirty(page)) { page_clear_dirty(page); set_page_dirty(page); } - mem_cgroup_uncharge_page(page); + mem_cgroup_uncharge_page(page); __dec_zone_page_state(page, - PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); + PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); + /* + * It would be tidy to reset the PageAnon mapping here, + * but that might overwrite a racing page_add_anon_rmap + * which increments mapcount after us but sets mapping + * before us: so leave the reset to free_hot_cold_page, + * and remember that it's only reliable while mapped. + * Leaving it set also helps swapoff to reinstate ptes + * faster for those pages still in swapcache. 
+ */ } } @@ -697,7 +708,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (address == -EFAULT) goto out; - pte = page_check_address(page, mm, address, &ptl); + pte = page_check_address(page, mm, address, &ptl, 0); if (!pte) goto out; diff --git a/mm/shmem.c b/mm/shmem.c index 04fb4f1ab88..bf66d0191ba 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -50,14 +50,12 @@ #include <linux/migrate.h> #include <linux/highmem.h> #include <linux/seq_file.h> +#include <linux/magic.h> #include <asm/uaccess.h> #include <asm/div64.h> #include <asm/pgtable.h> -/* This magic number is used in glibc for posix shared memory */ -#define TMPFS_MAGIC 0x01021994 - #define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long)) #define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE) #define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) diff --git a/mm/slab.c b/mm/slab.c index 918f04f7fef..e76eee46688 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -4472,4 +4472,3 @@ size_t ksize(const void *objp) return obj_size(virt_to_cache(objp)); } -EXPORT_SYMBOL(ksize); diff --git a/mm/slob.c b/mm/slob.c index d8fbd4d1bfa..cb675d12679 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -514,12 +514,13 @@ size_t ksize(const void *block) return 0; sp = (struct slob_page *)virt_to_page(block); - if (slob_page(sp)) - return ((slob_t *)block - 1)->units + SLOB_UNIT; - else + if (slob_page(sp)) { + int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + unsigned int *m = (unsigned int *)(block - align); + return SLOB_UNITS(*m) * SLOB_UNIT; + } else return sp->page.private; } -EXPORT_SYMBOL(ksize); struct kmem_cache { unsigned int size, align; diff --git a/mm/slub.c b/mm/slub.c index b7e2cd5d82d..0c83e6afe7b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1329,7 +1329,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) n = get_node(s, zone_to_nid(zone)); if (n && cpuset_zone_allowed_hardwall(zone, flags) && - n->nr_partial > MIN_PARTIAL) { + n->nr_partial > n->min_partial) { page = get_partial_node(n); if (page) return page; @@ -1381,7 +1381,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) slab_unlock(page); } else { stat(c, DEACTIVATE_EMPTY); - if (n->nr_partial < MIN_PARTIAL) { + if (n->nr_partial < n->min_partial) { /* * Adding an empty slab to the partial slabs in order * to avoid page allocator overhead. This slab needs @@ -1913,13 +1913,26 @@ static void init_kmem_cache_cpu(struct kmem_cache *s, #endif } -static void init_kmem_cache_node(struct kmem_cache_node *n) +static void +init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) { n->nr_partial = 0; + + /* + * The larger the object size is, the more pages we want on the partial + * list to avoid pounding the page allocator excessively. 
+ */ + n->min_partial = ilog2(s->size); + if (n->min_partial < MIN_PARTIAL) + n->min_partial = MIN_PARTIAL; + else if (n->min_partial > MAX_PARTIAL) + n->min_partial = MAX_PARTIAL; + spin_lock_init(&n->list_lock); INIT_LIST_HEAD(&n->partial); #ifdef CONFIG_SLUB_DEBUG atomic_long_set(&n->nr_slabs, 0); + atomic_long_set(&n->total_objects, 0); INIT_LIST_HEAD(&n->full); #endif } @@ -2087,7 +2100,7 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags, init_object(kmalloc_caches, n, 1); init_tracking(kmalloc_caches, n); #endif - init_kmem_cache_node(n); + init_kmem_cache_node(n, kmalloc_caches); inc_slabs_node(kmalloc_caches, node, page->objects); /* @@ -2144,7 +2157,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) } s->node[node] = n; - init_kmem_cache_node(n); + init_kmem_cache_node(n, s); } return 1; } @@ -2155,7 +2168,7 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) { - init_kmem_cache_node(&s->local_node); + init_kmem_cache_node(&s->local_node, s); return 1; } #endif @@ -2300,7 +2313,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, s->refcount = 1; #ifdef CONFIG_NUMA - s->remote_node_defrag_ratio = 100; + s->remote_node_defrag_ratio = 1000; #endif if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) goto error; @@ -2715,7 +2728,6 @@ size_t ksize(const void *object) */ return s->size; } -EXPORT_SYMBOL(ksize); void kfree(const void *x) { @@ -2890,7 +2902,7 @@ static int slab_mem_going_online_callback(void *arg) ret = -ENOMEM; goto out; } - init_kmem_cache_node(n); + init_kmem_cache_node(n, s); s->node[nid] = n; } out: @@ -4047,7 +4059,7 @@ static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s, if (err) return err; - if (ratio < 100) + if (ratio <= 100) s->remote_node_defrag_ratio = ratio * 10; return length; diff --git a/mm/sparse.c b/mm/sparse.c index 5d9dbbb9d39..39db301b920 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -12,7 +12,6 @@ #include <asm/dma.h> #include <asm/pgalloc.h> #include <asm/pgtable.h> -#include "internal.h" /* * Permanent SPARSEMEM data: diff --git a/mm/swap_state.c b/mm/swap_state.c index 167cf2dc8a0..797c3831cbe 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -60,7 +60,7 @@ void show_swap_cache_info(void) printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", swap_cache_info.add_total, swap_cache_info.del_total, swap_cache_info.find_success, swap_cache_info.find_total); - printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10)); + printk("Free swap = %ldkB\n", nr_swap_pages << (PAGE_SHIFT - 10)); printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); } diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c index ae532f50194..8d7a27a6335 100644 --- a/mm/tiny-shmem.c +++ b/mm/tiny-shmem.c @@ -65,31 +65,31 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) if (!dentry) goto put_memory; + error = -ENFILE; + file = get_empty_filp(); + if (!file) + goto put_dentry; + error = -ENOSPC; inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0); if (!inode) - goto put_dentry; + goto close_file; d_instantiate(dentry, inode); - error = -ENFILE; - file = alloc_file(shm_mnt, dentry, FMODE_WRITE | FMODE_READ, - &ramfs_file_operations); - if (!file) - goto put_dentry; - + inode->i_size = size; inode->i_nlink = 0; /* It is unlinked */ + init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, + &ramfs_file_operations); - /* notify everyone as to the change 
of file size */ - error = do_truncate(dentry, size, 0, file); - if (error < 0) +#ifndef CONFIG_MMU + error = ramfs_nommu_expand_for_mapping(inode, size); + if (error) goto close_file; - +#endif return file; close_file: put_filp(file); - return ERR_PTR(error); - put_dentry: dput(dentry); put_memory: diff --git a/mm/truncate.c b/mm/truncate.c index 250505091d3..6650c1d878b 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -380,7 +380,7 @@ static int do_launder_page(struct address_space *mapping, struct page *page) * Any pages which are found to be mapped into pagetables are unmapped prior to * invalidation. * - * Returns -EIO if any pages could not be invalidated. + * Returns -EBUSY if any pages could not be invalidated. */ int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end) @@ -440,7 +440,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, ret2 = do_launder_page(mapping, page); if (ret2 == 0) { if (!invalidate_complete_page2(mapping, page)) - ret2 = -EIO; + ret2 = -EBUSY; } if (ret2 < 0) ret = ret2; diff --git a/mm/util.c b/mm/util.c index 9341ca77bd8..cb00b748ce4 100644 --- a/mm/util.c +++ b/mm/util.c @@ -171,3 +171,18 @@ void arch_pick_mmap_layout(struct mm_struct *mm) mm->unmap_area = arch_unmap_area; } #endif + +int __attribute__((weak)) get_user_pages_fast(unsigned long start, + int nr_pages, int write, struct page **pages) +{ + struct mm_struct *mm = current->mm; + int ret; + + down_read(&mm->mmap_sem); + ret = get_user_pages(current, mm, start, nr_pages, + write, 0, pages, NULL); + up_read(&mm->mmap_sem); + + return ret; +} +EXPORT_SYMBOL_GPL(get_user_pages_fast); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 85b9a0d2c87..bba06c41fc5 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -180,6 +180,13 @@ struct page *vmalloc_to_page(const void *vmalloc_addr) pmd_t *pmd; pte_t *ptep, pte; + /* + * XXX we might need to change this if we add VIRTUAL_BUG_ON for + * architectures that do not vmalloc module space + */ + VIRTUAL_BUG_ON(!is_vmalloc_addr(vmalloc_addr) && + !is_module_address(addr)); + if (!pgd_none(*pgd)) { pud = pud_offset(pgd, addr); if (!pud_none(*pud)) { diff --git a/mm/vmstat.c b/mm/vmstat.c index b0d08e667ec..d7826af2fb0 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -516,9 +516,26 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m, continue; page = pfn_to_page(pfn); +#ifdef CONFIG_ARCH_FLATMEM_HAS_HOLES + /* + * Ordinarily, memory holes in flatmem still have a valid + * memmap for the PFN range. However, an architecture for + * embedded systems (e.g. ARM) can free up the memmap backing + * holes to save memory on the assumption the memmap is + * never used. The page_zone linkages are then broken even + * though pfn_valid() returns true. Skip the page if the + * linkages are broken. Even if this test passed, the impact + * is that the counters for the movable type are off but + * fragmentation monitoring is likely meaningless on small + * systems. + */ + if (page_zone(page) != zone) + continue; +#endif mtype = get_pageblock_migratetype(page); - count[mtype]++; + if (mtype < MIGRATE_TYPES) + count[mtype]++; } /* Print counts */ |
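Note on the bootmem change above: the new align_idx()/align_off() helpers exist because aligning a node-relative index on its own does not align the resulting absolute PFN when the node does not start on an aligned boundary; the helpers align (node base + index) and translate back. The following is a minimal userspace sketch of that arithmetic only — the ALIGN macro and the example values are illustrative assumptions, not kernel code.

/*
 * Illustrative userspace sketch (not kernel code): why the bootmem fix
 * aligns indexes relative to the node's first PFN rather than to zero.
 */
#include <stdio.h>

#define ALIGN(x, a) (((x) + (a) - 1) & ~((unsigned long)(a) - 1))

/* align_idx as introduced by the patch: align (base + idx), then
 * translate back into a node-relative index. */
static unsigned long align_idx(unsigned long base, unsigned long idx,
                               unsigned long step)
{
        return ALIGN(base + idx, step) - base;
}

int main(void)
{
        unsigned long base = 0x305;     /* hypothetical node_min_pfn */
        unsigned long idx  = 10;        /* node-relative index */
        unsigned long step = 4;         /* requested alignment in pages */

        /* Old behaviour: aligning the bare index ignores the node offset,
         * so the resulting absolute PFN (base + idx) is not step-aligned. */
        printf("ALIGN(idx):     abs pfn = 0x%lx\n", base + ALIGN(idx, step));

        /* New behaviour: the absolute PFN is aligned as requested. */
        printf("align_idx(idx): abs pfn = 0x%lx\n",
               base + align_idx(base, idx, step));
        return 0;
}

Run as written, the first line prints an absolute PFN that is not a multiple of step, while the second one is.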
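The mm/util.c hunk adds a generic get_user_pages_fast() defined with __attribute__((weak)), so an architecture that supplies its own fast implementation overrides it at link time and everyone else falls back to get_user_pages() under mmap_sem. Below is a small userspace sketch of that weak-symbol override pattern only; fast_path() is a hypothetical stand-in, not a kernel interface.

#include <stdio.h>

/* Weak default: used only when no strong definition of the same symbol
 * is linked in, mirroring the generic fallback added in mm/util.c. */
int __attribute__((weak)) fast_path(int x)
{
        printf("generic fallback\n");
        return x;
}

int main(void)
{
        /* Linking another object file that defines a non-weak fast_path()
         * would silently replace the weak definition above. */
        return fast_path(0);
}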