about summary refs log tree commit diff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r-- mm/Kconfig 30
-rw-r--r-- mm/filemap.c 11
-rw-r--r-- mm/hugetlb.c 26
-rw-r--r-- mm/memcontrol.c 52
-rw-r--r-- mm/memory.c 112
-rw-r--r-- mm/mmap.c 14
-rw-r--r-- mm/mmzone.c 15
-rw-r--r-- mm/nommu.c 17
-rw-r--r-- mm/oom_kill.c 68
-rw-r--r-- mm/page-writeback.c 6
-rw-r--r-- mm/page_alloc.c 89
-rw-r--r-- mm/pdflush.c 31
-rw-r--r-- mm/rmap.c 2
-rw-r--r-- mm/shmem.c 35
-rw-r--r-- mm/slob.c 5
-rw-r--r-- mm/slub.c 6
-rw-r--r-- mm/swap.c 46
-rw-r--r-- mm/swap_state.c 4
-rw-r--r-- mm/truncate.c 1
-rw-r--r-- mm/util.c 16
-rw-r--r-- mm/vmalloc.c 1
-rw-r--r-- mm/vmscan.c 21
-rw-r--r-- mm/vmstat.c 19
23 files changed, 319 insertions, 308 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index b53427ad30a..c2b57d81e15 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -213,6 +213,8 @@ config UNEVICTABLE_LRU
will use one page flag and increase the code size a little,
say Y unless you know what you are doing.
+ See Documentation/vm/unevictable-lru.txt for more information.
+
config HAVE_MLOCK
bool
default y if MMU=y
@@ -223,3 +225,31 @@ config HAVE_MLOCKED_PAGE_BIT
config MMU_NOTIFIER
bool
+
+config NOMMU_INITIAL_TRIM_EXCESS
+ int "Turn on mmap() excess space trimming before booting"
+ depends on !MMU
+ default 1
+ help
+ The NOMMU mmap() frequently needs to allocate large contiguous chunks
+ of memory on which to store mappings, but it can only ask the system
+ allocator for chunks in 2^N*PAGE_SIZE amounts - which is frequently
+ more than it requires. To deal with this, mmap() is able to trim off
+ the excess and return it to the allocator.
+
+ If trimming is enabled, the excess is trimmed off and returned to the
+ system allocator, which can cause extra fragmentation, particularly
+ if there are a lot of transient processes.
+
+ If trimming is disabled, the excess is kept, but not used, which for
+ long-term mappings means that the space is wasted.
+
+ Trimming can be dynamically controlled through a sysctl option
+ (/proc/sys/vm/nr_trim_pages) which specifies the minimum number of
+ excess pages there must be before trimming should occur, or zero if
+ no trimming is to occur.
+
+ This option specifies the initial value of that sysctl. The default
+ of 1 says that all excess pages should be trimmed.
+
+ See Documentation/nommu-mmap.txt for more information.
diff --git a/mm/filemap.c b/mm/filemap.c
index 2e2d38ebda4..1b60f30cebf 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -121,7 +121,6 @@ void __remove_from_page_cache(struct page *page)
mapping->nrpages--;
__dec_zone_page_state(page, NR_FILE_PAGES);
BUG_ON(page_mapped(page));
- mem_cgroup_uncharge_cache_page(page);
/*
* Some filesystems seem to re-dirty the page even after
@@ -145,6 +144,7 @@ void remove_from_page_cache(struct page *page)
spin_lock_irq(&mapping->tree_lock);
__remove_from_page_cache(page);
spin_unlock_irq(&mapping->tree_lock);
+ mem_cgroup_uncharge_cache_page(page);
}
static int sync_page(void *word)
@@ -441,6 +441,7 @@ int filemap_write_and_wait_range(struct address_space *mapping,
}
return err;
}
+EXPORT_SYMBOL(filemap_write_and_wait_range);
/**
* add_to_page_cache_locked - add a locked page to the pagecache
@@ -475,13 +476,13 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
if (likely(!error)) {
mapping->nrpages++;
__inc_zone_page_state(page, NR_FILE_PAGES);
+ spin_unlock_irq(&mapping->tree_lock);
} else {
page->mapping = NULL;
+ spin_unlock_irq(&mapping->tree_lock);
mem_cgroup_uncharge_cache_page(page);
page_cache_release(page);
}
-
- spin_unlock_irq(&mapping->tree_lock);
radix_tree_preload_end();
} else
mem_cgroup_uncharge_cache_page(page);
@@ -567,8 +568,8 @@ EXPORT_SYMBOL(wait_on_page_bit);
/**
* add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
- * @page - Page defining the wait queue of interest
- * @waiter - Waiter to add to the queue
+ * @page: Page defining the wait queue of interest
+ * @waiter: Waiter to add to the queue
*
* Add an arbitrary @waiter to the wait queue for the nominated @page.
*/
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 28c655ba935..e83ad2c9228 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -316,7 +316,7 @@ static void resv_map_release(struct kref *ref)
static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
{
VM_BUG_ON(!is_vm_hugetlb_page(vma));
- if (!(vma->vm_flags & VM_SHARED))
+ if (!(vma->vm_flags & VM_MAYSHARE))
return (struct resv_map *)(get_vma_private_data(vma) &
~HPAGE_RESV_MASK);
return NULL;
@@ -325,7 +325,7 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
{
VM_BUG_ON(!is_vm_hugetlb_page(vma));
- VM_BUG_ON(vma->vm_flags & VM_SHARED);
+ VM_BUG_ON(vma->vm_flags & VM_MAYSHARE);
set_vma_private_data(vma, (get_vma_private_data(vma) &
HPAGE_RESV_MASK) | (unsigned long)map);
@@ -334,7 +334,7 @@ static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
{
VM_BUG_ON(!is_vm_hugetlb_page(vma));
- VM_BUG_ON(vma->vm_flags & VM_SHARED);
+ VM_BUG_ON(vma->vm_flags & VM_MAYSHARE);
set_vma_private_data(vma, get_vma_private_data(vma) | flags);
}
@@ -353,7 +353,7 @@ static void decrement_hugepage_resv_vma(struct hstate *h,
if (vma->vm_flags & VM_NORESERVE)
return;
- if (vma->vm_flags & VM_SHARED) {
+ if (vma->vm_flags & VM_MAYSHARE) {
/* Shared mappings always use reserves */
h->resv_huge_pages--;
} else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
@@ -369,14 +369,14 @@ static void decrement_hugepage_resv_vma(struct hstate *h,
void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
{
VM_BUG_ON(!is_vm_hugetlb_page(vma));
- if (!(vma->vm_flags & VM_SHARED))
+ if (!(vma->vm_flags & VM_MAYSHARE))
vma->vm_private_data = (void *)0;
}
/* Returns true if the VMA has associated reserve pages */
static int vma_has_reserves(struct vm_area_struct *vma)
{
- if (vma->vm_flags & VM_SHARED)
+ if (vma->vm_flags & VM_MAYSHARE)
return 1;
if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
return 1;
@@ -924,7 +924,7 @@ static long vma_needs_reservation(struct hstate *h,
struct address_space *mapping = vma->vm_file->f_mapping;
struct inode *inode = mapping->host;
- if (vma->vm_flags & VM_SHARED) {
+ if (vma->vm_flags & VM_MAYSHARE) {
pgoff_t idx = vma_hugecache_offset(h, vma, addr);
return region_chg(&inode->i_mapping->private_list,
idx, idx + 1);
@@ -949,7 +949,7 @@ static void vma_commit_reservation(struct hstate *h,
struct address_space *mapping = vma->vm_file->f_mapping;
struct inode *inode = mapping->host;
- if (vma->vm_flags & VM_SHARED) {
+ if (vma->vm_flags & VM_MAYSHARE) {
pgoff_t idx = vma_hugecache_offset(h, vma, addr);
region_add(&inode->i_mapping->private_list, idx, idx + 1);
@@ -1893,7 +1893,7 @@ retry_avoidcopy:
* at the time of fork() could consume its reserves on COW instead
* of the full address range.
*/
- if (!(vma->vm_flags & VM_SHARED) &&
+ if (!(vma->vm_flags & VM_MAYSHARE) &&
is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
old_page != pagecache_page)
outside_reserve = 1;
@@ -2000,7 +2000,7 @@ retry:
clear_huge_page(page, address, huge_page_size(h));
__SetPageUptodate(page);
- if (vma->vm_flags & VM_SHARED) {
+ if (vma->vm_flags & VM_MAYSHARE) {
int err;
struct inode *inode = mapping->host;
@@ -2104,7 +2104,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
goto out_mutex;
}
- if (!(vma->vm_flags & VM_SHARED))
+ if (!(vma->vm_flags & VM_MAYSHARE))
pagecache_page = hugetlbfs_pagecache_page(h,
vma, address);
}
@@ -2289,7 +2289,7 @@ int hugetlb_reserve_pages(struct inode *inode,
* to reserve the full area even if read-only as mprotect() may be
* called to make the mapping read-write. Assume !vma is a shm mapping
*/
- if (!vma || vma->vm_flags & VM_SHARED)
+ if (!vma || vma->vm_flags & VM_MAYSHARE)
chg = region_chg(&inode->i_mapping->private_list, from, to);
else {
struct resv_map *resv_map = resv_map_alloc();
@@ -2330,7 +2330,7 @@ int hugetlb_reserve_pages(struct inode *inode,
* consumed reservations are stored in the map. Hence, nothing
* else has to be done for private mappings here
*/
- if (!vma || vma->vm_flags & VM_SHARED)
+ if (!vma || vma->vm_flags & VM_MAYSHARE)
region_add(&inode->i_mapping->private_list, from, to);
return 0;
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2fc6d6c4823..78eb8552818 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -314,14 +314,6 @@ static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
return mem;
}
-static bool mem_cgroup_is_obsolete(struct mem_cgroup *mem)
-{
- if (!mem)
- return true;
- return css_is_removed(&mem->css);
-}
-
-
/*
* Call callback function against all cgroup under hierarchy tree.
*/
@@ -932,7 +924,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
if (unlikely(!mem))
return 0;
- VM_BUG_ON(mem_cgroup_is_obsolete(mem));
+ VM_BUG_ON(css_is_removed(&mem->css));
while (1) {
int ret;
@@ -1024,9 +1016,7 @@ static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page)
return NULL;
pc = lookup_page_cgroup(page);
- /*
- * Used bit of swapcache is solid under page lock.
- */
+ lock_page_cgroup(pc);
if (PageCgroupUsed(pc)) {
mem = pc->mem_cgroup;
if (mem && !css_tryget(&mem->css))
@@ -1040,6 +1030,7 @@ static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page)
mem = NULL;
rcu_read_unlock();
}
+ unlock_page_cgroup(pc);
return mem;
}
@@ -1489,8 +1480,9 @@ void mem_cgroup_uncharge_cache_page(struct page *page)
__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
}
+#ifdef CONFIG_SWAP
/*
- * called from __delete_from_swap_cache() and drop "page" account.
+ * Called after __delete_from_swap_cache(); drops the accounting for "page".
* memcg information is recorded to swap_cgroup of "ent"
*/
void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
@@ -1507,6 +1499,7 @@ void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
if (memcg)
css_put(&memcg->css);
}
+#endif
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/*
@@ -1618,37 +1611,28 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
}
/*
- * A call to try to shrink memory usage under specified resource controller.
- * This is typically used for page reclaiming for shmem for reducing side
- * effect of page allocation from shmem, which is used by some mem_cgroup.
+ * A call to try to shrink memory usage on charge failure at shmem's swapin.
+ * Calling hierarchical_reclaim is not enough because we should update
+ * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM.
+ * Moreover considering hierarchy, we should reclaim from the mem_over_limit,
+ * not from the memcg which this page would be charged to.
+ * mem_cgroup_try_charge_swapin() does all of this work properly.
*/
-int mem_cgroup_shrink_usage(struct page *page,
+int mem_cgroup_shmem_charge_fallback(struct page *page,
struct mm_struct *mm,
gfp_t gfp_mask)
{
struct mem_cgroup *mem = NULL;
- int progress = 0;
- int retry = MEM_CGROUP_RECLAIM_RETRIES;
+ int ret;
if (mem_cgroup_disabled())
return 0;
- if (page)
- mem = try_get_mem_cgroup_from_swapcache(page);
- if (!mem && mm)
- mem = try_get_mem_cgroup_from_mm(mm);
- if (unlikely(!mem))
- return 0;
- do {
- progress = mem_cgroup_hierarchical_reclaim(mem,
- gfp_mask, true, false);
- progress += mem_cgroup_check_under_limit(mem);
- } while (!progress && --retry);
+ ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
+ if (!ret)
+ mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */
- css_put(&mem->css);
- if (!retry)
- return -ENOMEM;
- return 0;
+ return ret;
}
static DEFINE_MUTEX(set_limit_mutex);
diff --git a/mm/memory.c b/mm/memory.c
index cf6873e91c6..4126dd16778 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1971,6 +1971,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
ret = tmp;
goto unwritable_page;
}
+ if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
+ lock_page(old_page);
+ if (!old_page->mapping) {
+ ret = 0; /* retry the fault */
+ unlock_page(old_page);
+ goto unwritable_page;
+ }
+ } else
+ VM_BUG_ON(!PageLocked(old_page));
/*
* Since we dropped the lock we need to revalidate
@@ -1980,9 +1989,11 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
*/
page_table = pte_offset_map_lock(mm, pmd, address,
&ptl);
- page_cache_release(old_page);
- if (!pte_same(*page_table, orig_pte))
+ if (!pte_same(*page_table, orig_pte)) {
+ unlock_page(old_page);
+ page_cache_release(old_page);
goto unlock;
+ }
page_mkwrite = 1;
}
@@ -2094,9 +2105,6 @@ gotten:
unlock:
pte_unmap_unlock(page_table, ptl);
if (dirty_page) {
- if (vma->vm_file)
- file_update_time(vma->vm_file);
-
/*
* Yes, Virginia, this is actually required to prevent a race
* with clear_page_dirty_for_io() from clearing the page dirty
@@ -2105,16 +2113,41 @@ unlock:
*
* do_no_page is protected similarly.
*/
- wait_on_page_locked(dirty_page);
- set_page_dirty_balance(dirty_page, page_mkwrite);
+ if (!page_mkwrite) {
+ wait_on_page_locked(dirty_page);
+ set_page_dirty_balance(dirty_page, page_mkwrite);
+ }
put_page(dirty_page);
+ if (page_mkwrite) {
+ struct address_space *mapping = dirty_page->mapping;
+
+ set_page_dirty(dirty_page);
+ unlock_page(dirty_page);
+ page_cache_release(dirty_page);
+ if (mapping) {
+ /*
+ * Some device drivers do not set page.mapping
+ * but still dirty their pages
+ */
+ balance_dirty_pages_ratelimited(mapping);
+ }
+ }
+
+ /* file_update_time outside page_lock */
+ if (vma->vm_file)
+ file_update_time(vma->vm_file);
}
return ret;
oom_free_new:
page_cache_release(new_page);
oom:
- if (old_page)
+ if (old_page) {
+ if (page_mkwrite) {
+ unlock_page(old_page);
+ page_cache_release(old_page);
+ }
page_cache_release(old_page);
+ }
return VM_FAULT_OOM;
unwritable_page:
@@ -2458,8 +2491,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
ret = VM_FAULT_OOM;
- unlock_page(page);
- goto out;
+ goto out_page;
}
/*
@@ -2521,6 +2553,7 @@ out:
out_nomap:
mem_cgroup_cancel_charge_swapin(ptr);
pte_unmap_unlock(page_table, ptl);
+out_page:
unlock_page(page);
page_cache_release(page);
return ret;
@@ -2664,27 +2697,22 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
int tmp;
unlock_page(page);
- vmf.flags |= FAULT_FLAG_MKWRITE;
+ vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
if (unlikely(tmp &
(VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
ret = tmp;
- anon = 1; /* no anon but release vmf.page */
- goto out_unlocked;
- }
- lock_page(page);
- /*
- * XXX: this is not quite right (racy vs
- * invalidate) to unlock and relock the page
- * like this, however a better fix requires
- * reworking page_mkwrite locking API, which
- * is better done later.
- */
- if (!page->mapping) {
- ret = 0;
- anon = 1; /* no anon but release vmf.page */
- goto out;
+ goto unwritable_page;
}
+ if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
+ lock_page(page);
+ if (!page->mapping) {
+ ret = 0; /* retry the fault */
+ unlock_page(page);
+ goto unwritable_page;
+ }
+ } else
+ VM_BUG_ON(!PageLocked(page));
page_mkwrite = 1;
}
}
@@ -2736,19 +2764,35 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pte_unmap_unlock(page_table, ptl);
out:
- unlock_page(vmf.page);
-out_unlocked:
- if (anon)
- page_cache_release(vmf.page);
- else if (dirty_page) {
- if (vma->vm_file)
- file_update_time(vma->vm_file);
+ if (dirty_page) {
+ struct address_space *mapping = page->mapping;
- set_page_dirty_balance(dirty_page, page_mkwrite);
+ if (set_page_dirty(dirty_page))
+ page_mkwrite = 1;
+ unlock_page(dirty_page);
put_page(dirty_page);
+ if (page_mkwrite && mapping) {
+ /*
+ * Some device drivers do not set page.mapping but still
+ * dirty their pages
+ */
+ balance_dirty_pages_ratelimited(mapping);
+ }
+
+ /* file_update_time outside page_lock */
+ if (vma->vm_file)
+ file_update_time(vma->vm_file);
+ } else {
+ unlock_page(vmf.page);
+ if (anon)
+ page_cache_release(vmf.page);
}
return ret;
+
+unwritable_page:
+ page_cache_release(page);
+ return ret;
}
static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
diff --git a/mm/mmap.c b/mm/mmap.c
index 4a3841186c1..6b7b1a95944 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -85,7 +85,7 @@ EXPORT_SYMBOL(vm_get_page_prot);
int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
int sysctl_overcommit_ratio = 50; /* default is 50% */
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
-atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0);
+struct percpu_counter vm_committed_as;
/*
* Check that a process has enough memory to allocate a new virtual
@@ -179,11 +179,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
if (mm)
allowed -= mm->total_vm / 32;
- /*
- * cast `allowed' as a signed long because vm_committed_space
- * sometimes has a negative value
- */
- if (atomic_long_read(&vm_committed_space) < (long)allowed)
+ if (percpu_counter_read_positive(&vm_committed_as) < allowed)
return 0;
error:
vm_unacct_memory(pages);
@@ -1575,7 +1571,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
* Overcommit.. This must be the final test, as it will
* update security statistics.
*/
- if (security_vm_enough_memory(grow))
+ if (security_vm_enough_memory_mm(mm, grow))
return -ENOMEM;
/* Ok, everything looks good - let it rip */
@@ -2481,4 +2477,8 @@ void mm_drop_all_locks(struct mm_struct *mm)
*/
void __init mmap_init(void)
{
+ int ret;
+
+ ret = percpu_counter_init(&vm_committed_as, 0);
+ VM_BUG_ON(ret);
}
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 16ce8b955dc..f5b7d176021 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -6,6 +6,7 @@
#include <linux/stddef.h>
+#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/module.h>
@@ -72,3 +73,17 @@ struct zoneref *next_zones_zonelist(struct zoneref *z,
*zone = zonelist_zone(z);
return z;
}
+
+#ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL
+int memmap_valid_within(unsigned long pfn,
+ struct page *page, struct zone *zone)
+{
+ if (page_to_pfn(page) != pfn)
+ return 0;
+
+ if (page_zone(page) != zone)
+ return 0;
+
+ return 1;
+}
+#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
diff --git a/mm/nommu.c b/mm/nommu.c
index 72eda4aee2c..b571ef70742 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -62,11 +62,11 @@ void *high_memory;
struct page *mem_map;
unsigned long max_mapnr;
unsigned long num_physpages;
-atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0);
+struct percpu_counter vm_committed_as;
int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
int sysctl_overcommit_ratio = 50; /* default is 50% */
int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
-int sysctl_nr_trim_pages = 1; /* page trimming behaviour */
+int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
int heap_stack_gap = 0;
atomic_long_t mmap_pages_allocated;
@@ -463,6 +463,10 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
*/
void __init mmap_init(void)
{
+ int ret;
+
+ ret = percpu_counter_init(&vm_committed_as, 0);
+ VM_BUG_ON(ret);
vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC);
}
@@ -511,8 +515,6 @@ static void add_nommu_region(struct vm_region *region)
validate_nommu_regions();
- BUG_ON(region->vm_start & ~PAGE_MASK);
-
parent = NULL;
p = &nommu_region_tree.rb_node;
while (*p) {
@@ -1847,12 +1849,9 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
if (mm)
allowed -= mm->total_vm / 32;
- /*
- * cast `allowed' as a signed long because vm_committed_space
- * sometimes has a negative value
- */
- if (atomic_long_read(&vm_committed_space) < (long)allowed)
+ if (percpu_counter_read_positive(&vm_committed_as) < allowed)
return 0;
+
error:
vm_unacct_memory(pages);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 2f3166e308d..a7b2460e922 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -284,22 +284,28 @@ static void dump_tasks(const struct mem_cgroup *mem)
printk(KERN_INFO "[ pid ] uid tgid total_vm rss cpu oom_adj "
"name\n");
do_each_thread(g, p) {
- /*
- * total_vm and rss sizes do not exist for tasks with a
- * detached mm so there's no need to report them.
- */
- if (!p->mm)
- continue;
+ struct mm_struct *mm;
+
if (mem && !task_in_mem_cgroup(p, mem))
continue;
if (!thread_group_leader(p))
continue;
task_lock(p);
+ mm = p->mm;
+ if (!mm) {
+ /*
+ * total_vm and rss sizes do not exist for tasks with no
+ * mm so there's no need to report them; they can't be
+ * oom killed anyway.
+ */
+ task_unlock(p);
+ continue;
+ }
printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n",
- p->pid, __task_cred(p)->uid, p->tgid,
- p->mm->total_vm, get_mm_rss(p->mm), (int)task_cpu(p),
- p->oomkilladj, p->comm);
+ p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm,
+ get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj,
+ p->comm);
task_unlock(p);
} while_each_thread(g, p);
}
@@ -514,34 +520,32 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
*/
static void __out_of_memory(gfp_t gfp_mask, int order)
{
- if (sysctl_oom_kill_allocating_task) {
- oom_kill_process(current, gfp_mask, order, 0, NULL,
- "Out of memory (oom_kill_allocating_task)");
-
- } else {
- unsigned long points;
- struct task_struct *p;
-
-retry:
- /*
- * Rambo mode: Shoot down a process and hope it solves whatever
- * issues we may have.
- */
- p = select_bad_process(&points, NULL);
+ struct task_struct *p;
+ unsigned long points;
- if (PTR_ERR(p) == -1UL)
+ if (sysctl_oom_kill_allocating_task)
+ if (!oom_kill_process(current, gfp_mask, order, 0, NULL,
+ "Out of memory (oom_kill_allocating_task)"))
return;
+retry:
+ /*
+ * Rambo mode: Shoot down a process and hope it solves whatever
+ * issues we may have.
+ */
+ p = select_bad_process(&points, NULL);
- /* Found nothing?!?! Either we hang forever, or we panic. */
- if (!p) {
- read_unlock(&tasklist_lock);
- panic("Out of memory and no killable processes...\n");
- }
+ if (PTR_ERR(p) == -1UL)
+ return;
- if (oom_kill_process(p, gfp_mask, order, points, NULL,
- "Out of memory"))
- goto retry;
+ /* Found nothing?!?! Either we hang forever, or we panic. */
+ if (!p) {
+ read_unlock(&tasklist_lock);
+ panic("Out of memory and no killable processes...\n");
}
+
+ if (oom_kill_process(p, gfp_mask, order, points, NULL,
+ "Out of memory"))
+ goto retry;
}
/*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 30351f0063a..bb553c3e955 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -94,12 +94,12 @@ unsigned long vm_dirty_bytes;
/*
* The interval between `kupdate'-style writebacks
*/
-unsigned int dirty_writeback_interval = 5 * 100; /* sentiseconds */
+unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */
/*
* The longest time for which data is allowed to remain dirty
*/
-unsigned int dirty_expire_interval = 30 * 100; /* sentiseconds */
+unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */
/*
* Flag that makes the machine dump writes/reads and block dirtyings.
@@ -770,7 +770,7 @@ static void wb_kupdate(unsigned long arg)
sync_supers();
- oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval);
+ oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10);
start_jif = jiffies;
next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10);
nr_to_write = global_page_state(NR_FILE_DIRTY) +
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e2f26991fff..474c7e9dd51 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -149,10 +149,6 @@ static unsigned long __meminitdata dma_reserve;
static int __meminitdata nr_nodemap_entries;
static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
-#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
- static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES];
- static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES];
-#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
static unsigned long __initdata required_kernelcore;
static unsigned long __initdata required_movablecore;
static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
@@ -2681,6 +2677,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
static int zone_batchsize(struct zone *zone)
{
+#ifdef CONFIG_MMU
int batch;
/*
@@ -2706,9 +2703,26 @@ static int zone_batchsize(struct zone *zone)
* of pages of one half of the possible page colors
* and the other with pages of the other colors.
*/
- batch = (1 << (fls(batch + batch/2)-1)) - 1;
+ batch = rounddown_pow_of_two(batch + batch/2) - 1;
return batch;
+
+#else
+ /* The deferral and batching of frees should be suppressed under NOMMU
+ * conditions.
+ *
+ * The problem is that NOMMU needs to be able to allocate large chunks
+ * of contiguous memory as there's no hardware page translation to
+ * assemble apparent contiguous memory from discontiguous pages.
+ *
+ * Queueing large contiguous runs of pages for batching, however,
+ * causes the pages to actually be freed in smaller chunks. As there
+ * can be a significant delay between the individual batches being
+ * recycled, this leads to the once large chunks of space being
+ * fragmented and becoming unavailable for high-order allocations.
+ */
+ return 0;
+#endif
}
static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
@@ -3085,64 +3099,6 @@ void __init sparse_memory_present_with_active_regions(int nid)
}
/**
- * push_node_boundaries - Push node boundaries to at least the requested boundary
- * @nid: The nid of the node to push the boundary for
- * @start_pfn: The start pfn of the node
- * @end_pfn: The end pfn of the node
- *
- * In reserve-based hot-add, mem_map is allocated that is unused until hotadd
- * time. Specifically, on x86_64, SRAT will report ranges that can potentially
- * be hotplugged even though no physical memory exists. This function allows
- * an arch to push out the node boundaries so mem_map is allocated that can
- * be used later.
- */
-#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
-void __init push_node_boundaries(unsigned int nid,
- unsigned long start_pfn, unsigned long end_pfn)
-{
- mminit_dprintk(MMINIT_TRACE, "zoneboundary",
- "Entering push_node_boundaries(%u, %lu, %lu)\n",
- nid, start_pfn, end_pfn);
-
- /* Initialise the boundary for this node if necessary */
- if (node_boundary_end_pfn[nid] == 0)
- node_boundary_start_pfn[nid] = -1UL;
-
- /* Update the boundaries */
- if (node_boundary_start_pfn[nid] > start_pfn)
- node_boundary_start_pfn[nid] = start_pfn;
- if (node_boundary_end_pfn[nid] < end_pfn)
- node_boundary_end_pfn[nid] = end_pfn;
-}
-
-/* If necessary, push the node boundary out for reserve hotadd */
-static void __meminit account_node_boundary(unsigned int nid,
- unsigned long *start_pfn, unsigned long *end_pfn)
-{
- mminit_dprintk(MMINIT_TRACE, "zoneboundary",
- "Entering account_node_boundary(%u, %lu, %lu)\n",
- nid, *start_pfn, *end_pfn);
-
- /* Return if boundary information has not been provided */
- if (node_boundary_end_pfn[nid] == 0)
- return;
-
- /* Check the boundaries and update if necessary */
- if (node_boundary_start_pfn[nid] < *start_pfn)
- *start_pfn = node_boundary_start_pfn[nid];
- if (node_boundary_end_pfn[nid] > *end_pfn)
- *end_pfn = node_boundary_end_pfn[nid];
-}
-#else
-void __init push_node_boundaries(unsigned int nid,
- unsigned long start_pfn, unsigned long end_pfn) {}
-
-static void __meminit account_node_boundary(unsigned int nid,
- unsigned long *start_pfn, unsigned long *end_pfn) {}
-#endif
-
-
-/**
* get_pfn_range_for_nid - Return the start and end page frames for a node
* @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
* @start_pfn: Passed by reference. On return, it will have the node start_pfn.
@@ -3167,9 +3123,6 @@ void __meminit get_pfn_range_for_nid(unsigned int nid,
if (*start_pfn == -1UL)
*start_pfn = 0;
-
- /* Push the node boundaries out if requested */
- account_node_boundary(nid, start_pfn, end_pfn);
}
/*
@@ -3775,10 +3728,6 @@ void __init remove_all_active_ranges(void)
{
memset(early_node_map, 0, sizeof(early_node_map));
nr_nodemap_entries = 0;
-#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
- memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn));
- memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn));
-#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
}
/* Compare two active node_active_regions */
diff --git a/mm/pdflush.c b/mm/pdflush.c
index f2caf96993f..235ac440c44 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -58,14 +58,6 @@ static DEFINE_SPINLOCK(pdflush_lock);
int nr_pdflush_threads = 0;
/*
- * The max/min number of pdflush threads. R/W by sysctl at
- * /proc/sys/vm/nr_pdflush_threads_max/min
- */
-int nr_pdflush_threads_max __read_mostly = MAX_PDFLUSH_THREADS;
-int nr_pdflush_threads_min __read_mostly = MIN_PDFLUSH_THREADS;
-
-
-/*
* The time at which the pdflush thread pool last went empty
*/
static unsigned long last_empty_jifs;
@@ -76,7 +68,7 @@ static unsigned long last_empty_jifs;
* Thread pool management algorithm:
*
* - The minimum and maximum number of pdflush instances are bound
- * by nr_pdflush_threads_min and nr_pdflush_threads_max.
+ * by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS.
*
* - If there have been no idle pdflush instances for 1 second, create
* a new one.
@@ -142,13 +134,14 @@ static int __pdflush(struct pdflush_work *my_work)
* To throttle creation, we reset last_empty_jifs.
*/
if (time_after(jiffies, last_empty_jifs + 1 * HZ)) {
- if (list_empty(&pdflush_list) &&
- nr_pdflush_threads < nr_pdflush_threads_max) {
- last_empty_jifs = jiffies;
- nr_pdflush_threads++;
- spin_unlock_irq(&pdflush_lock);
- start_one_pdflush_thread();
- spin_lock_irq(&pdflush_lock);
+ if (list_empty(&pdflush_list)) {
+ if (nr_pdflush_threads < MAX_PDFLUSH_THREADS) {
+ last_empty_jifs = jiffies;
+ nr_pdflush_threads++;
+ spin_unlock_irq(&pdflush_lock);
+ start_one_pdflush_thread();
+ spin_lock_irq(&pdflush_lock);
+ }
}
}
@@ -160,7 +153,7 @@ static int __pdflush(struct pdflush_work *my_work)
*/
if (list_empty(&pdflush_list))
continue;
- if (nr_pdflush_threads <= nr_pdflush_threads_min)
+ if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS)
continue;
pdf = list_entry(pdflush_list.prev, struct pdflush_work, list);
if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) {
@@ -266,9 +259,9 @@ static int __init pdflush_init(void)
* Pre-set nr_pdflush_threads... If we fail to create,
* the count will be decremented.
*/
- nr_pdflush_threads = nr_pdflush_threads_min;
+ nr_pdflush_threads = MIN_PDFLUSH_THREADS;
- for (i = 0; i < nr_pdflush_threads_min; i++)
+ for (i = 0; i < MIN_PDFLUSH_THREADS; i++)
start_one_pdflush_thread();
return 0;
}
diff --git a/mm/rmap.c b/mm/rmap.c
index 16521664010..23122af3261 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -14,7 +14,7 @@
* Original design by Rik van Riel <riel@conectiva.com.br> 2001
* File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
* Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
- * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004
+ * Contributions by Hugh Dickins 2003, 2004
*/
/*
diff --git a/mm/shmem.c b/mm/shmem.c
index d94d2e9146b..b25f95ce3db 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -24,6 +24,7 @@
#include <linux/init.h>
#include <linux/vfs.h>
#include <linux/mount.h>
+#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/module.h>
@@ -43,7 +44,6 @@ static struct vfsmount *shm_mnt;
#include <linux/exportfs.h>
#include <linux/generic_acl.h>
#include <linux/mman.h>
-#include <linux/pagemap.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
@@ -65,13 +65,28 @@ static struct vfsmount *shm_mnt;
#include <asm/div64.h>
#include <asm/pgtable.h>
+/*
+ * The maximum size of a shmem/tmpfs file is limited by the maximum size of
+ * its triple-indirect swap vector - see illustration at shmem_swp_entry().
+ *
+ * With 4kB page size, maximum file size is just over 2TB on a 32-bit kernel,
+ * but one eighth of that on a 64-bit kernel. With 8kB page size, maximum
+ * file size is just over 4TB on a 64-bit kernel, but 16TB on a 32-bit kernel,
+ * MAX_LFS_FILESIZE being then more restrictive than swap vector layout.
+ *
+ * We use / and * instead of shifts in the definitions below, so that the swap
+ * vector can be tested with small even values (e.g. 20) for ENTRIES_PER_PAGE.
+ */
#define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long))
-#define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)
-#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512)
+#define ENTRIES_PER_PAGEPAGE ((unsigned long long)ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)
-#define SHMEM_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1))
-#define SHMEM_MAX_BYTES ((unsigned long long)SHMEM_MAX_INDEX << PAGE_CACHE_SHIFT)
+#define SHMSWP_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1))
+#define SHMSWP_MAX_BYTES (SHMSWP_MAX_INDEX << PAGE_CACHE_SHIFT)
+#define SHMEM_MAX_BYTES min_t(unsigned long long, SHMSWP_MAX_BYTES, MAX_LFS_FILESIZE)
+#define SHMEM_MAX_INDEX ((unsigned long)((SHMEM_MAX_BYTES+1) >> PAGE_CACHE_SHIFT))
+
+#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512)
#define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)
/* info->flags needs VM_flags to handle pagein/truncate races efficiently */
@@ -1325,8 +1340,12 @@ repeat:
shmem_swp_unmap(entry);
spin_unlock(&info->lock);
if (error == -ENOMEM) {
- /* allow reclaim from this memory cgroup */
- error = mem_cgroup_shrink_usage(swappage,
+ /*
+ * Reclaim from the proper memory cgroup and
+ * invoke memcg's OOM handling if needed.
+ */
+ error = mem_cgroup_shmem_charge_fallback(
+ swappage,
current->mm,
gfp);
if (error) {
@@ -2581,7 +2600,7 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
#define shmem_get_inode(sb, mode, dev, flags) ramfs_get_inode(sb, mode, dev)
#define shmem_acct_size(flags, size) 0
#define shmem_unacct_size(flags, size) do {} while (0)
-#define SHMEM_MAX_BYTES LLONG_MAX
+#define SHMEM_MAX_BYTES MAX_LFS_FILESIZE
#endif /* CONFIG_SHMEM */
diff --git a/mm/slob.c b/mm/slob.c
index a2d4ab32198..f92e66d558b 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -60,6 +60,7 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/mm.h>
+#include <linux/swap.h> /* struct reclaim_state */
#include <linux/cache.h>
#include <linux/init.h>
#include <linux/module.h>
@@ -255,6 +256,8 @@ static void *slob_new_pages(gfp_t gfp, int order, int node)
static void slob_free_pages(void *b, int order)
{
+ if (current->reclaim_state)
+ current->reclaim_state->reclaimed_slab += 1 << order;
free_pages((unsigned long)b, order);
}
@@ -407,7 +410,7 @@ static void slob_free(void *block, int size)
spin_unlock_irqrestore(&slob_lock, flags);
clear_slob_page(sp);
free_slob_page(sp);
- free_page((unsigned long)b);
+ slob_free_pages(b, 0);
return;
}
diff --git a/mm/slub.c b/mm/slub.c
index 7ab54ecbd3f..65ffda5934b 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -9,6 +9,7 @@
*/
#include <linux/mm.h>
+#include <linux/swap.h> /* struct reclaim_state */
#include <linux/module.h>
#include <linux/bit_spinlock.h>
#include <linux/interrupt.h>
@@ -1170,6 +1171,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
__ClearPageSlab(page);
reset_page_mapcount(page);
+ if (current->reclaim_state)
+ current->reclaim_state->reclaimed_slab += pages;
__free_pages(page, order);
}
@@ -1909,7 +1912,7 @@ static inline int calculate_order(int size)
* Doh this slab cannot be placed using slub_max_order.
*/
order = slab_order(size, 1, MAX_ORDER, 1);
- if (order <= MAX_ORDER)
+ if (order < MAX_ORDER)
return order;
return -ENOSYS;
}
@@ -2522,6 +2525,7 @@ __setup("slub_min_order=", setup_slub_min_order);
static int __init setup_slub_max_order(char *str)
{
get_option(&str, &slub_max_order);
+ slub_max_order = min(slub_max_order, MAX_ORDER - 1);
return 1;
}
diff --git a/mm/swap.c b/mm/swap.c
index bede23ce64e..cb29ae5d33a 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -491,49 +491,6 @@ unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
EXPORT_SYMBOL(pagevec_lookup_tag);
-#ifdef CONFIG_SMP
-/*
- * We tolerate a little inaccuracy to avoid ping-ponging the counter between
- * CPUs
- */
-#define ACCT_THRESHOLD max(16, NR_CPUS * 2)
-
-static DEFINE_PER_CPU(long, committed_space);
-
-void vm_acct_memory(long pages)
-{
- long *local;
-
- preempt_disable();
- local = &__get_cpu_var(committed_space);
- *local += pages;
- if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) {
- atomic_long_add(*local, &vm_committed_space);
- *local = 0;
- }
- preempt_enable();
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-
-/* Drop the CPU's cached committed space back into the central pool. */
-static int cpu_swap_callback(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
-{
- long *committed;
-
- committed = &per_cpu(committed_space, (long)hcpu);
- if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
- atomic_long_add(*committed, &vm_committed_space);
- *committed = 0;
- drain_cpu_pagevecs((long)hcpu);
- }
- return NOTIFY_OK;
-}
-#endif /* CONFIG_HOTPLUG_CPU */
-#endif /* CONFIG_SMP */
-
/*
* Perform any setup for the swap system
*/
@@ -554,7 +511,4 @@ void __init swap_setup(void)
* Right now other parts of the system means that we
* _really_ don't want to cluster much more
*/
-#ifdef CONFIG_HOTPLUG_CPU
- hotcpu_notifier(cpu_swap_callback, 0);
-#endif
}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 3ecea98ecb4..1416e7e9e02 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -109,8 +109,6 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
*/
void __delete_from_swap_cache(struct page *page)
{
- swp_entry_t ent = {.val = page_private(page)};
-
VM_BUG_ON(!PageLocked(page));
VM_BUG_ON(!PageSwapCache(page));
VM_BUG_ON(PageWriteback(page));
@@ -121,7 +119,6 @@ void __delete_from_swap_cache(struct page *page)
total_swapcache_pages--;
__dec_zone_page_state(page, NR_FILE_PAGES);
INC_CACHE_INFO(del_total);
- mem_cgroup_uncharge_swapcache(page, ent);
}
/**
@@ -191,6 +188,7 @@ void delete_from_swap_cache(struct page *page)
__delete_from_swap_cache(page);
spin_unlock_irq(&swapper_space.tree_lock);
+ mem_cgroup_uncharge_swapcache(page, entry);
swap_free(entry);
page_cache_release(page);
}
diff --git a/mm/truncate.c b/mm/truncate.c
index 55206fab7b9..12e1579f916 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -359,6 +359,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
BUG_ON(page_has_private(page));
__remove_from_page_cache(page);
spin_unlock_irq(&mapping->tree_lock);
+ mem_cgroup_uncharge_cache_page(page);
page_cache_release(page); /* pagecache ref */
return 1;
failed:
diff --git a/mm/util.c b/mm/util.c
index 2599e83eea1..55bef160b9f 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -223,6 +223,22 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
}
#endif
+/**
+ * get_user_pages_fast() - pin user pages in memory
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @write: whether pages will be written to
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long.
+ *
+ * Attempt to pin user pages in memory without taking mm->mmap_sem.
+ * If not successful, it will fall back to taking the lock and
+ * calling get_user_pages().
+ *
+ * Returns number of pages pinned. This may be fewer than the number
+ * requested. If nr_pages is 0 or negative, returns 0. If no pages
+ * were pinned, returns -errno.
+ */
int __attribute__((weak)) get_user_pages_fast(unsigned long start,
int nr_pages, int write, struct page **pages)
{
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index fab19876b4d..083716ea38c 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -402,6 +402,7 @@ overflow:
printk(KERN_WARNING
"vmap allocation for size %lu failed: "
"use vmalloc=<size> to increase size.\n", size);
+ kfree(va);
return ERR_PTR(-EBUSY);
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 39fdfb14eea..d254306562c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -63,6 +63,9 @@ struct scan_control {
/* Can mapped pages be reclaimed? */
int may_unmap;
+ /* Can pages be swapped as part of reclaim? */
+ int may_swap;
+
/* This context's SWAP_CLUSTER_MAX. If freeing memory for
* suspend, we effectively ignore SWAP_CLUSTER_MAX.
* In this context, it doesn't matter that we scan the
@@ -467,10 +470,12 @@ static int __remove_mapping(struct address_space *mapping, struct page *page)
swp_entry_t swap = { .val = page_private(page) };
__delete_from_swap_cache(page);
spin_unlock_irq(&mapping->tree_lock);
+ mem_cgroup_uncharge_swapcache(page, swap);
swap_free(swap);
} else {
__remove_from_page_cache(page);
spin_unlock_irq(&mapping->tree_lock);
+ mem_cgroup_uncharge_cache_page(page);
}
return 1;
@@ -1380,7 +1385,7 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
/* If we have no swap space, do not bother scanning anon pages. */
- if (nr_swap_pages <= 0) {
+ if (!sc->may_swap || (nr_swap_pages <= 0)) {
percent[0] = 0;
percent[1] = 100;
return;
@@ -1468,7 +1473,7 @@ static void shrink_zone(int priority, struct zone *zone,
for_each_evictable_lru(l) {
int file = is_file_lru(l);
- int scan;
+ unsigned long scan;
scan = zone_nr_pages(zone, sc, l);
if (priority) {
@@ -1697,6 +1702,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
.may_writepage = !laptop_mode,
.swap_cluster_max = SWAP_CLUSTER_MAX,
.may_unmap = 1,
+ .may_swap = 1,
.swappiness = vm_swappiness,
.order = order,
.mem_cgroup = NULL,
@@ -1717,6 +1723,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
struct scan_control sc = {
.may_writepage = !laptop_mode,
.may_unmap = 1,
+ .may_swap = !noswap,
.swap_cluster_max = SWAP_CLUSTER_MAX,
.swappiness = swappiness,
.order = 0,
@@ -1726,9 +1733,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
};
struct zonelist *zonelist;
- if (noswap)
- sc.may_unmap = 0;
-
sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
zonelist = NODE_DATA(numa_node_id())->node_zonelists;
@@ -1767,6 +1771,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
.may_unmap = 1,
+ .may_swap = 1,
.swap_cluster_max = SWAP_CLUSTER_MAX,
.swappiness = vm_swappiness,
.order = order,
@@ -2088,13 +2093,13 @@ static void shrink_all_zones(unsigned long nr_pages, int prio,
nr_reclaimed += shrink_list(l, nr_to_scan, zone,
sc, prio);
if (nr_reclaimed >= nr_pages) {
- sc->nr_reclaimed = nr_reclaimed;
+ sc->nr_reclaimed += nr_reclaimed;
return;
}
}
}
}
- sc->nr_reclaimed = nr_reclaimed;
+ sc->nr_reclaimed += nr_reclaimed;
}
/*
@@ -2115,6 +2120,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
.may_unmap = 0,
.may_writepage = 1,
.isolate_pages = isolate_pages_global,
+ .nr_reclaimed = 0,
};
current->reclaim_state = &reclaim_state;
@@ -2297,6 +2303,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
struct scan_control sc = {
.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
.may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
+ .may_swap = 1,
.swap_cluster_max = max_t(unsigned long, nr_pages,
SWAP_CLUSTER_MAX),
.gfp_mask = gfp_mask,
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 66f6130976c..74d66dba0cb 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -509,22 +509,11 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m,
continue;
page = pfn_to_page(pfn);
-#ifdef CONFIG_ARCH_FLATMEM_HAS_HOLES
- /*
- * Ordinarily, memory holes in flatmem still have a valid
- * memmap for the PFN range. However, an architecture for
- * embedded systems (e.g. ARM) can free up the memmap backing
- * holes to save memory on the assumption the memmap is
- * never used. The page_zone linkages are then broken even
- * though pfn_valid() returns true. Skip the page if the
- * linkages are broken. Even if this test passed, the impact
- * is that the counters for the movable type are off but
- * fragmentation monitoring is likely meaningless on small
- * systems.
- */
- if (page_zone(page) != zone)
+
+ /* Watch for unexpected holes punched in the memmap */
+ if (!memmap_valid_within(pfn, page, zone))
continue;
-#endif
+
mtype = get_pageblock_migratetype(page);
if (mtype < MIGRATE_TYPES)