aboutsummaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/hugetlb.c4
-rw-r--r--mm/madvise.c21
-rw-r--r--mm/memory.c10
-rw-r--r--mm/mempolicy.c44
-rw-r--r--mm/nommu.c2
-rw-r--r--mm/oom_kill.c123
-rw-r--r--mm/page_alloc.c46
-rw-r--r--mm/shmem.c81
-rw-r--r--mm/slab.c10
-rw-r--r--mm/swap.c2
-rw-r--r--mm/vmscan.c116
11 files changed, 331 insertions, 128 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 67f29516662..508707704d2 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -85,7 +85,7 @@ void free_huge_page(struct page *page)
BUG_ON(page_count(page));
INIT_LIST_HEAD(&page->lru);
- page[1].mapping = NULL;
+ page[1].lru.next = NULL; /* reset dtor */
spin_lock(&hugetlb_lock);
enqueue_huge_page(page);
@@ -105,7 +105,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
}
spin_unlock(&hugetlb_lock);
set_page_count(page, 1);
- page[1].mapping = (void *)free_huge_page;
+ page[1].lru.next = (void *)free_huge_page; /* set dtor */
for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
clear_user_highpage(&page[i], addr);
return page;
diff --git a/mm/madvise.c b/mm/madvise.c
index ae0ae3ea299..af3d573b014 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -22,16 +22,23 @@ static long madvise_behavior(struct vm_area_struct * vma,
struct mm_struct * mm = vma->vm_mm;
int error = 0;
pgoff_t pgoff;
- int new_flags = vma->vm_flags & ~VM_READHINTMASK;
+ int new_flags = vma->vm_flags;
switch (behavior) {
+ case MADV_NORMAL:
+ new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
+ break;
case MADV_SEQUENTIAL:
- new_flags |= VM_SEQ_READ;
+ new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
break;
case MADV_RANDOM:
- new_flags |= VM_RAND_READ;
+ new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
break;
- default:
+ case MADV_DONTFORK:
+ new_flags |= VM_DONTCOPY;
+ break;
+ case MADV_DOFORK:
+ new_flags &= ~VM_DONTCOPY;
break;
}
@@ -177,6 +184,12 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
long error;
switch (behavior) {
+ case MADV_DOFORK:
+ if (vma->vm_flags & VM_IO) {
+ error = -EINVAL;
+ break;
+ }
+ case MADV_DONTFORK:
case MADV_NORMAL:
case MADV_SEQUENTIAL:
case MADV_RANDOM:
diff --git a/mm/memory.c b/mm/memory.c
index 2bee1f21aa8..9abc6008544 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -82,6 +82,16 @@ EXPORT_SYMBOL(num_physpages);
EXPORT_SYMBOL(high_memory);
EXPORT_SYMBOL(vmalloc_earlyreserve);
+int randomize_va_space __read_mostly = 1;
+
+static int __init disable_randmaps(char *s)
+{
+ randomize_va_space = 0;
+ return 0;
+}
+__setup("norandmaps", disable_randmaps);
+
+
/*
* If a p?d_bad entry is found while walking page tables, report
* the error, before resetting entry to p?d_none. Usually (but
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3bd7fb7e4b7..67af4cea1e2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -132,19 +132,29 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
}
return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
}
+
/* Generate a custom zonelist for the BIND policy. */
static struct zonelist *bind_zonelist(nodemask_t *nodes)
{
struct zonelist *zl;
- int num, max, nd;
+ int num, max, nd, k;
max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
- zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
+ zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
if (!zl)
return NULL;
num = 0;
- for_each_node_mask(nd, *nodes)
- zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
+ /* First put in the highest zones from all nodes, then all the next
+ lower zones etc. Avoid empty zones because the memory allocator
+ doesn't like them. If you implement node hot removal you
+ have to fix that. */
+ for (k = policy_zone; k >= 0; k--) {
+ for_each_node_mask(nd, *nodes) {
+ struct zone *z = &NODE_DATA(nd)->node_zones[k];
+ if (z->present_pages > 0)
+ zl->zones[num++] = z;
+ }
+ }
zl->zones[num] = NULL;
return zl;
}
@@ -542,7 +552,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
*/
if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
if (isolate_lru_page(page))
- list_add(&page->lru, pagelist);
+ list_add_tail(&page->lru, pagelist);
}
}
@@ -559,6 +569,7 @@ static int migrate_pages_to(struct list_head *pagelist,
LIST_HEAD(moved);
LIST_HEAD(failed);
int err = 0;
+ unsigned long offset = 0;
int nr_pages;
struct page *page;
struct list_head *p;
@@ -566,8 +577,21 @@ static int migrate_pages_to(struct list_head *pagelist,
redo:
nr_pages = 0;
list_for_each(p, pagelist) {
- if (vma)
- page = alloc_page_vma(GFP_HIGHUSER, vma, vma->vm_start);
+ if (vma) {
+ /*
+ * The address passed to alloc_page_vma is used to
+ * generate the proper interleave behavior. We fake
+ * the address here by an increasing offset in order
+ * to get the proper distribution of pages.
+ *
+ * No decision has been made as to which page
+ * a certain old page is moved to so we cannot
+ * specify the correct address.
+ */
+ page = alloc_page_vma(GFP_HIGHUSER, vma,
+ offset + vma->vm_start);
+ offset += PAGE_SIZE;
+ }
else
page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
@@ -575,9 +599,9 @@ redo:
err = -ENOMEM;
goto out;
}
- list_add(&page->lru, &newlist);
+ list_add_tail(&page->lru, &newlist);
nr_pages++;
- if (nr_pages > MIGRATE_CHUNK_SIZE);
+ if (nr_pages > MIGRATE_CHUNK_SIZE)
break;
}
err = migrate_pages(pagelist, &newlist, &moved, &failed);
@@ -798,6 +822,8 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
nodes_clear(*nodes);
if (maxnode == 0 || !nmask)
return 0;
+ if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
+ return -EINVAL;
nlongs = BITS_TO_LONGS(maxnode);
if ((maxnode % BITS_PER_LONG) == 0)
diff --git a/mm/nommu.c b/mm/nommu.c
index c10262d6823..99d21020ec9 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -57,6 +57,8 @@ EXPORT_SYMBOL(vmalloc);
EXPORT_SYMBOL(vfree);
EXPORT_SYMBOL(vmalloc_to_page);
EXPORT_SYMBOL(vmalloc_32);
+EXPORT_SYMBOL(vmap);
+EXPORT_SYMBOL(vunmap);
/*
* Handle all mappings that got truncated by a "truncate()"
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index b05ab8f2a56..8123fad5a48 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -58,15 +58,17 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
/*
* Processes which fork a lot of child processes are likely
- * a good choice. We add the vmsize of the children if they
+ * a good choice. We add half the vmsize of the children if they
* have an own mm. This prevents forking servers to flood the
- * machine with an endless amount of children
+ * machine with an endless amount of children. In case a single
+ * child is eating the vast majority of memory, adding only half
+ * to the parents will make the child our kill candidate of choice.
*/
list_for_each(tsk, &p->children) {
struct task_struct *chld;
chld = list_entry(tsk, struct task_struct, sibling);
if (chld->mm != p->mm && chld->mm)
- points += chld->mm->total_vm;
+ points += chld->mm->total_vm/2 + 1;
}
/*
@@ -131,17 +133,47 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
}
/*
+ * Types of limitations to the nodes from which allocations may occur
+ */
+#define CONSTRAINT_NONE 1
+#define CONSTRAINT_MEMORY_POLICY 2
+#define CONSTRAINT_CPUSET 3
+
+/*
+ * Determine the type of allocation constraint.
+ */
+static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask)
+{
+#ifdef CONFIG_NUMA
+ struct zone **z;
+ nodemask_t nodes = node_online_map;
+
+ for (z = zonelist->zones; *z; z++)
+ if (cpuset_zone_allowed(*z, gfp_mask))
+ node_clear((*z)->zone_pgdat->node_id,
+ nodes);
+ else
+ return CONSTRAINT_CPUSET;
+
+ if (!nodes_empty(nodes))
+ return CONSTRAINT_MEMORY_POLICY;
+#endif
+
+ return CONSTRAINT_NONE;
+}
+
+/*
* Simple selection loop. We chose the process with the highest
* number of 'points'. We expect the caller will lock the tasklist.
*
* (not docbooked, we don't want this one cluttering up the manual)
*/
-static struct task_struct * select_bad_process(void)
+static struct task_struct *select_bad_process(unsigned long *ppoints)
{
- unsigned long maxpoints = 0;
struct task_struct *g, *p;
struct task_struct *chosen = NULL;
struct timespec uptime;
+ *ppoints = 0;
do_posix_clock_monotonic_gettime(&uptime);
do_each_thread(g, p) {
@@ -169,9 +201,9 @@ static struct task_struct * select_bad_process(void)
return p;
points = badness(p, uptime.tv_sec);
- if (points > maxpoints || !chosen) {
+ if (points > *ppoints || !chosen) {
chosen = p;
- maxpoints = points;
+ *ppoints = points;
}
} while_each_thread(g, p);
return chosen;
@@ -182,7 +214,7 @@ static struct task_struct * select_bad_process(void)
* CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that
* we select a process with CAP_SYS_RAW_IO set).
*/
-static void __oom_kill_task(task_t *p)
+static void __oom_kill_task(task_t *p, const char *message)
{
if (p->pid == 1) {
WARN_ON(1);
@@ -198,8 +230,8 @@ static void __oom_kill_task(task_t *p)
return;
}
task_unlock(p);
- printk(KERN_ERR "Out of Memory: Killed process %d (%s).\n",
- p->pid, p->comm);
+ printk(KERN_ERR "%s: Killed process %d (%s).\n",
+ message, p->pid, p->comm);
/*
* We give our sacrificial lamb high priority and access to
@@ -212,7 +244,7 @@ static void __oom_kill_task(task_t *p)
force_sig(SIGKILL, p);
}
-static struct mm_struct *oom_kill_task(task_t *p)
+static struct mm_struct *oom_kill_task(task_t *p, const char *message)
{
struct mm_struct *mm = get_task_mm(p);
task_t * g, * q;
@@ -224,35 +256,38 @@ static struct mm_struct *oom_kill_task(task_t *p)
return NULL;
}
- __oom_kill_task(p);
+ __oom_kill_task(p, message);
/*
* kill all processes that share the ->mm (i.e. all threads),
* but are in a different thread group
*/
do_each_thread(g, q)
if (q->mm == mm && q->tgid != p->tgid)
- __oom_kill_task(q);
+ __oom_kill_task(q, message);
while_each_thread(g, q);
return mm;
}
-static struct mm_struct *oom_kill_process(struct task_struct *p)
+static struct mm_struct *oom_kill_process(struct task_struct *p,
+ unsigned long points, const char *message)
{
struct mm_struct *mm;
struct task_struct *c;
struct list_head *tsk;
+ printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li and "
+ "children.\n", p->pid, p->comm, points);
/* Try to kill a child first */
list_for_each(tsk, &p->children) {
c = list_entry(tsk, struct task_struct, sibling);
if (c->mm == p->mm)
continue;
- mm = oom_kill_task(c);
+ mm = oom_kill_task(c, message);
if (mm)
return mm;
}
- return oom_kill_task(p);
+ return oom_kill_task(p, message);
}
/**
@@ -263,10 +298,11 @@ static struct mm_struct *oom_kill_process(struct task_struct *p)
* OR try to be smart about which process to kill. Note that we
* don't have to be perfect here, we just have to be good.
*/
-void out_of_memory(gfp_t gfp_mask, int order)
+void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
{
struct mm_struct *mm = NULL;
- task_t * p;
+ task_t *p;
+ unsigned long points;
if (printk_ratelimit()) {
printk("oom-killer: gfp_mask=0x%x, order=%d\n",
@@ -277,25 +313,48 @@ void out_of_memory(gfp_t gfp_mask, int order)
cpuset_lock();
read_lock(&tasklist_lock);
+
+ /*
+ * Check if there were limitations on the allocation (only relevant for
+ * NUMA) that may require different handling.
+ */
+ switch (constrained_alloc(zonelist, gfp_mask)) {
+ case CONSTRAINT_MEMORY_POLICY:
+ mm = oom_kill_process(current, points,
+ "No available memory (MPOL_BIND)");
+ break;
+
+ case CONSTRAINT_CPUSET:
+ mm = oom_kill_process(current, points,
+ "No available memory in cpuset");
+ break;
+
+ case CONSTRAINT_NONE:
retry:
- p = select_bad_process();
+ /*
+ * Rambo mode: Shoot down a process and hope it solves whatever
+ * issues we may have.
+ */
+ p = select_bad_process(&points);
- if (PTR_ERR(p) == -1UL)
- goto out;
+ if (PTR_ERR(p) == -1UL)
+ goto out;
- /* Found nothing?!?! Either we hang forever, or we panic. */
- if (!p) {
- read_unlock(&tasklist_lock);
- cpuset_unlock();
- panic("Out of memory and no killable processes...\n");
- }
+ /* Found nothing?!?! Either we hang forever, or we panic. */
+ if (!p) {
+ read_unlock(&tasklist_lock);
+ cpuset_unlock();
+ panic("Out of memory and no killable processes...\n");
+ }
- mm = oom_kill_process(p);
- if (!mm)
- goto retry;
+ mm = oom_kill_process(p, points, "Out of memory");
+ if (!mm)
+ goto retry;
+
+ break;
+ }
- out:
- read_unlock(&tasklist_lock);
+out:
cpuset_unlock();
if (mm)
mmput(mm);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dde04ff4be3..791690d7d3f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -56,6 +56,7 @@ long nr_swap_pages;
int percpu_pagelist_fraction;
static void fastcall free_hot_cold_page(struct page *page, int cold);
+static void __free_pages_ok(struct page *page, unsigned int order);
/*
* results with 256, 32 in the lowmem_reserve sysctl:
@@ -169,20 +170,23 @@ static void bad_page(struct page *page)
* All pages have PG_compound set. All pages have their ->private pointing at
* the head page (even the head page has this).
*
- * The first tail page's ->mapping, if non-zero, holds the address of the
- * compound page's put_page() function.
- *
- * The order of the allocation is stored in the first tail page's ->index
- * This is only for debug at present. This usage means that zero-order pages
- * may not be compound.
+ * The first tail page's ->lru.next holds the address of the compound page's
+ * put_page() function. Its ->lru.prev holds the order of allocation.
+ * This usage means that zero-order pages may not be compound.
*/
+
+static void free_compound_page(struct page *page)
+{
+ __free_pages_ok(page, (unsigned long)page[1].lru.prev);
+}
+
static void prep_compound_page(struct page *page, unsigned long order)
{
int i;
int nr_pages = 1 << order;
- page[1].mapping = NULL;
- page[1].index = order;
+ page[1].lru.next = (void *)free_compound_page; /* set dtor */
+ page[1].lru.prev = (void *)order;
for (i = 0; i < nr_pages; i++) {
struct page *p = page + i;
@@ -196,7 +200,7 @@ static void destroy_compound_page(struct page *page, unsigned long order)
int i;
int nr_pages = 1 << order;
- if (unlikely(page[1].index != order))
+ if (unlikely((unsigned long)page[1].lru.prev != order))
bad_page(page);
for (i = 0; i < nr_pages; i++) {
@@ -1011,7 +1015,7 @@ rebalance:
if (page)
goto got_pg;
- out_of_memory(gfp_mask, order);
+ out_of_memory(zonelist, gfp_mask, order);
goto restart;
}
@@ -1537,29 +1541,29 @@ static int __initdata node_load[MAX_NUMNODES];
*/
static int __init find_next_best_node(int node, nodemask_t *used_node_mask)
{
- int i, n, val;
+ int n, val;
int min_val = INT_MAX;
int best_node = -1;
- for_each_online_node(i) {
- cpumask_t tmp;
+ /* Use the local node if we haven't already */
+ if (!node_isset(node, *used_node_mask)) {
+ node_set(node, *used_node_mask);
+ return node;
+ }
- /* Start from local node */
- n = (node+i) % num_online_nodes();
+ for_each_online_node(n) {
+ cpumask_t tmp;
/* Don't want a node to appear more than once */
if (node_isset(n, *used_node_mask))
continue;
- /* Use the local node if we haven't already */
- if (!node_isset(node, *used_node_mask)) {
- best_node = node;
- break;
- }
-
/* Use the distance array to find the distance */
val = node_distance(node, n);
+ /* Penalize nodes under us ("prefer the next node") */
+ val += (n < node);
+
/* Give preference to headless and unused nodes */
tmp = node_to_cpumask(n);
if (!cpus_empty(tmp))
diff --git a/mm/shmem.c b/mm/shmem.c
index f7ac7b812f9..7c455fbaff7 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -45,6 +45,7 @@
#include <linux/swapops.h>
#include <linux/mempolicy.h>
#include <linux/namei.h>
+#include <linux/ctype.h>
#include <asm/uaccess.h>
#include <asm/div64.h>
#include <asm/pgtable.h>
@@ -874,6 +875,51 @@ redirty:
}
#ifdef CONFIG_NUMA
+static int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
+{
+ char *nodelist = strchr(value, ':');
+ int err = 1;
+
+ if (nodelist) {
+ /* NUL-terminate policy string */
+ *nodelist++ = '\0';
+ if (nodelist_parse(nodelist, *policy_nodes))
+ goto out;
+ }
+ if (!strcmp(value, "default")) {
+ *policy = MPOL_DEFAULT;
+ /* Don't allow a nodelist */
+ if (!nodelist)
+ err = 0;
+ } else if (!strcmp(value, "prefer")) {
+ *policy = MPOL_PREFERRED;
+ /* Insist on a nodelist of one node only */
+ if (nodelist) {
+ char *rest = nodelist;
+ while (isdigit(*rest))
+ rest++;
+ if (!*rest)
+ err = 0;
+ }
+ } else if (!strcmp(value, "bind")) {
+ *policy = MPOL_BIND;
+ /* Insist on a nodelist */
+ if (nodelist)
+ err = 0;
+ } else if (!strcmp(value, "interleave")) {
+ *policy = MPOL_INTERLEAVE;
+ /* Default to nodes online if no nodelist */
+ if (!nodelist)
+ *policy_nodes = node_online_map;
+ err = 0;
+ }
+out:
+ /* Restore string for error message */
+ if (nodelist)
+ *--nodelist = ':';
+ return err;
+}
+
static struct page *shmem_swapin_async(struct shared_policy *p,
swp_entry_t entry, unsigned long idx)
{
@@ -926,6 +972,11 @@ shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info,
return page;
}
#else
+static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
+{
+ return 1;
+}
+
static inline struct page *
shmem_swapin(struct shmem_inode_info *info,swp_entry_t entry,unsigned long idx)
{
@@ -1859,7 +1910,23 @@ static int shmem_parse_options(char *options, int *mode, uid_t *uid,
{
char *this_char, *value, *rest;
- while ((this_char = strsep(&options, ",")) != NULL) {
+ while (options != NULL) {
+ this_char = options;
+ for (;;) {
+ /*
+ * NUL-terminate this option: unfortunately,
+ * mount options form a comma-separated list,
+ * but mpol's nodelist may also contain commas.
+ */
+ options = strchr(options, ',');
+ if (options == NULL)
+ break;
+ options++;
+ if (!isdigit(*options)) {
+ options[-1] = '\0';
+ break;
+ }
+ }
if (!*this_char)
continue;
if ((value = strchr(this_char,'=')) != NULL) {
@@ -1910,18 +1977,8 @@ static int shmem_parse_options(char *options, int *mode, uid_t *uid,
if (*rest)
goto bad_val;
} else if (!strcmp(this_char,"mpol")) {
- if (!strcmp(value,"default"))
- *policy = MPOL_DEFAULT;
- else if (!strcmp(value,"preferred"))
- *policy = MPOL_PREFERRED;
- else if (!strcmp(value,"bind"))
- *policy = MPOL_BIND;
- else if (!strcmp(value,"interleave"))
- *policy = MPOL_INTERLEAVE;
- else
+ if (shmem_parse_mpol(value,policy,policy_nodes))
goto bad_val;
- } else if (!strcmp(this_char,"mpol_nodelist")) {
- nodelist_parse(value, *policy_nodes);
} else {
printk(KERN_ERR "tmpfs: Bad mount option %s\n",
this_char);
diff --git a/mm/slab.c b/mm/slab.c
index d66c2b0d971..add05d808a4 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1717,6 +1717,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
BUG();
}
+ /*
+ * Prevent CPUs from coming and going.
+ * lock_cpu_hotplug() nests outside cache_chain_mutex
+ */
+ lock_cpu_hotplug();
+
mutex_lock(&cache_chain_mutex);
list_for_each(p, &cache_chain) {
@@ -1918,8 +1924,6 @@ kmem_cache_create (const char *name, size_t size, size_t align,
cachep->dtor = dtor;
cachep->name = name;
- /* Don't let CPUs to come and go */
- lock_cpu_hotplug();
if (g_cpucache_up == FULL) {
enable_cpucache(cachep);
@@ -1978,12 +1982,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
/* cache setup completed, link it into the list */
list_add(&cachep->next, &cache_chain);
- unlock_cpu_hotplug();
oops:
if (!cachep && (flags & SLAB_PANIC))
panic("kmem_cache_create(): failed to create slab `%s'\n",
name);
mutex_unlock(&cache_chain_mutex);
+ unlock_cpu_hotplug();
return cachep;
}
EXPORT_SYMBOL(kmem_cache_create);
diff --git a/mm/swap.c b/mm/swap.c
index 76247424dea..cce3dda59c5 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -40,7 +40,7 @@ static void put_compound_page(struct page *page)
if (put_page_testzero(page)) {
void (*dtor)(struct page *page);
- dtor = (void (*)(struct page *))page[1].mapping;
+ dtor = (void (*)(struct page *))page[1].lru.next;
(*dtor)(page);
}
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5a610804cd0..b0af7593d01 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -443,6 +443,10 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
BUG_ON(PageActive(page));
sc->nr_scanned++;
+
+ if (!sc->may_swap && page_mapped(page))
+ goto keep_locked;
+
/* Double the slab pressure for mapped and swapcache pages */
if (page_mapped(page) || PageSwapCache(page))
sc->nr_scanned++;
@@ -632,7 +636,7 @@ static int swap_page(struct page *page)
struct address_space *mapping = page_mapping(page);
if (page_mapped(page) && mapping)
- if (try_to_unmap(page, 0) != SWAP_SUCCESS)
+ if (try_to_unmap(page, 1) != SWAP_SUCCESS)
goto unlock_retry;
if (PageDirty(page)) {
@@ -839,7 +843,7 @@ EXPORT_SYMBOL(migrate_page);
* pages are swapped out.
*
* The function returns after 10 attempts or if no pages
- * are movable anymore because t has become empty
+ * are movable anymore because to has become empty
* or no retryable pages exist anymore.
*
* Return: Number of pages not migrated when "to" ran empty.
@@ -928,12 +932,21 @@ redo:
goto unlock_both;
if (mapping->a_ops->migratepage) {
+ /*
+ * Most pages have a mapping and most filesystems
+ * should provide a migration function. Anonymous
+ * pages are part of swap space which also has its
+ * own migration function. This is the most common
+ * path for page migration.
+ */
rc = mapping->a_ops->migratepage(newpage, page);
goto unlock_both;
}
/*
- * Trigger writeout if page is dirty
+ * Default handling if a filesystem does not provide
+ * a migration function. We can only migrate clean
+ * pages so try to write out any dirty pages first.
*/
if (PageDirty(page)) {
switch (pageout(page, mapping)) {
@@ -949,9 +962,10 @@ redo:
; /* try to migrate the page below */
}
}
+
/*
- * If we have no buffer or can release the buffer
- * then do a simple migration.
+ * Buffers are managed in a filesystem specific way.
+ * We must have no buffers or drop them.
*/
if (!page_has_buffers(page) ||
try_to_release_page(page, GFP_KERNEL)) {
@@ -966,6 +980,11 @@ redo:
* swap them out.
*/
if (pass > 4) {
+ /*
+ * Persistently unable to drop buffers..... As a
+ * measure of last resort we fall back to
+ * swap_page().
+ */
unlock_page(newpage);
newpage = NULL;
rc = swap_page(page);
@@ -1176,9 +1195,47 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
struct page *page;
struct pagevec pvec;
int reclaim_mapped = 0;
- long mapped_ratio;
- long distress;
- long swap_tendency;
+
+ if (unlikely(sc->may_swap)) {
+ long mapped_ratio;
+ long distress;
+ long swap_tendency;
+
+ /*
+ * `distress' is a measure of how much trouble we're having
+ * reclaiming pages. 0 -> no problems. 100 -> great trouble.
+ */
+ distress = 100 >> zone->prev_priority;
+
+ /*
+ * The point of this algorithm is to decide when to start
+ * reclaiming mapped memory instead of just pagecache. Work out
+ * how much memory
+ * is mapped.
+ */
+ mapped_ratio = (sc->nr_mapped * 100) / total_memory;
+
+ /*
+ * Now decide how much we really want to unmap some pages. The
+ * mapped ratio is downgraded - just because there's a lot of
+ * mapped memory doesn't necessarily mean that page reclaim
+ * isn't succeeding.
+ *
+ * The distress ratio is important - we don't want to start
+ * going oom.
+ *
+ * A 100% value of vm_swappiness overrides this algorithm
+ * altogether.
+ */
+ swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
+
+ /*
+ * Now use this metric to decide whether to start moving mapped
+ * memory onto the inactive list.
+ */
+ if (swap_tendency >= 100)
+ reclaim_mapped = 1;
+ }
lru_add_drain();
spin_lock_irq(&zone->lru_lock);
@@ -1188,37 +1245,6 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
zone->nr_active -= pgmoved;
spin_unlock_irq(&zone->lru_lock);
- /*
- * `distress' is a measure of how much trouble we're having reclaiming
- * pages. 0 -> no problems. 100 -> great trouble.
- */
- distress = 100 >> zone->prev_priority;
-
- /*
- * The point of this algorithm is to decide when to start reclaiming
- * mapped memory instead of just pagecache. Work out how much memory
- * is mapped.
- */
- mapped_ratio = (sc->nr_mapped * 100) / total_memory;
-
- /*
- * Now decide how much we really want to unmap some pages. The mapped
- * ratio is downgraded - just because there's a lot of mapped memory
- * doesn't necessarily mean that page reclaim isn't succeeding.
- *
- * The distress ratio is important - we don't want to start going oom.
- *
- * A 100% value of vm_swappiness overrides this algorithm altogether.
- */
- swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
-
- /*
- * Now use this metric to decide whether to start moving mapped memory
- * onto the inactive list.
- */
- if (swap_tendency >= 100)
- reclaim_mapped = 1;
-
while (!list_empty(&l_hold)) {
cond_resched();
page = lru_to_page(&l_hold);
@@ -1595,9 +1621,7 @@ scan:
sc.nr_reclaimed = 0;
sc.priority = priority;
sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX;
- atomic_inc(&zone->reclaim_in_progress);
shrink_zone(zone, &sc);
- atomic_dec(&zone->reclaim_in_progress);
reclaim_state->reclaimed_slab = 0;
nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
lru_pages);
@@ -1884,7 +1908,12 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
sc.swap_cluster_max = SWAP_CLUSTER_MAX;
cond_resched();
- p->flags |= PF_MEMALLOC;
+ /*
+ * We need to be able to allocate from the reserves for RECLAIM_SWAP
+ * and we also need to be able to write out pages for RECLAIM_WRITE
+ * and RECLAIM_SWAP.
+ */
+ p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
@@ -1908,11 +1937,10 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
* a long time.
*/
shrink_slab(sc.nr_scanned, gfp_mask, order);
- sc.nr_reclaimed = 1; /* Avoid getting the off node timeout */
}
p->reclaim_state = NULL;
- current->flags &= ~PF_MEMALLOC;
+ current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
if (sc.nr_reclaimed == 0)
zone->last_unsuccessful_zone_reclaim = jiffies;