aboutsummaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig12
-rw-r--r--mm/Makefile2
-rw-r--r--mm/allocpercpu.c3
-rw-r--r--mm/backing-dev.c219
-rw-r--r--mm/bootmem.c196
-rw-r--r--mm/dmapool.c12
-rw-r--r--mm/fadvise.c2
-rw-r--r--mm/filemap.c10
-rw-r--r--mm/filemap_xip.c200
-rw-r--r--mm/hugetlb.c84
-rw-r--r--mm/internal.h3
-rw-r--r--mm/maccess.c55
-rw-r--r--mm/madvise.c2
-rw-r--r--mm/memcontrol.c190
-rw-r--r--mm/memory.c228
-rw-r--r--mm/memory_hotplug.c188
-rw-r--r--mm/mempolicy.c1051
-rw-r--r--mm/migrate.c9
-rw-r--r--mm/mincore.c2
-rw-r--r--mm/mmap.c57
-rw-r--r--mm/mmzone.c30
-rw-r--r--mm/nommu.c29
-rw-r--r--mm/oom_kill.c58
-rw-r--r--mm/page-writeback.c77
-rw-r--r--mm/page_alloc.c327
-rw-r--r--mm/pagewalk.c8
-rw-r--r--mm/pdflush.c8
-rw-r--r--mm/readahead.c8
-rw-r--r--mm/rmap.c8
-rw-r--r--mm/shmem.c146
-rw-r--r--mm/slab.c36
-rw-r--r--mm/slob.c3
-rw-r--r--mm/slub.c592
-rw-r--r--mm/sparse.c134
-rw-r--r--mm/swap.c37
-rw-r--r--mm/swap_state.c2
-rw-r--r--mm/swapfile.c14
-rw-r--r--mm/truncate.c11
-rw-r--r--mm/vmalloc.c144
-rw-r--r--mm/vmscan.c75
-rw-r--r--mm/vmstat.c16
41 files changed, 2844 insertions, 1444 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 0016ebd4dcb..3aa819d628c 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -143,6 +143,18 @@ config MEMORY_HOTREMOVE
depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE
depends on MIGRATION
+#
+# If we have space for more page flags then we can enable additional
+# optimizations and functionality.
+#
+# Regular Sparsemem takes page flag bits for the sectionid if it does not
+# use a virtual memmap. Disable extended page flags for 32 bit platforms
+# that require the use of a sectionid in the page flags.
+#
+config PAGEFLAGS_EXTENDED
+ def_bool y
+ depends on 64BIT || SPARSEMEM_VMEMMAP || !NUMA || !SPARSEMEM
+
# Heavily threaded applications may benefit from splitting the mm-wide
# page_table_lock, so that faults on different parts of the user address
# space can be handled with less contention: split it at this NR_CPUS.
diff --git a/mm/Makefile b/mm/Makefile
index a5b0dd93427..18c143b3c46 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -8,7 +8,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
vmalloc.o
obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
- page_alloc.o page-writeback.o pdflush.o \
+ maccess.o page_alloc.o page-writeback.o pdflush.o \
readahead.o swap.o truncate.o vmscan.o \
prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
page_isolation.o $(mmu-y)
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index b0012e27fea..f4026bae6ee 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -82,9 +82,10 @@ EXPORT_SYMBOL_GPL(percpu_populate);
int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
cpumask_t *mask)
{
- cpumask_t populated = CPU_MASK_NONE;
+ cpumask_t populated;
int cpu;
+ cpus_clear(populated);
for_each_cpu_mask(cpu, *mask)
if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) {
__percpu_depopulate_mask(__pdata, &populated);
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index e8644b1e552..7c4f9e09709 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -4,12 +4,229 @@
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/module.h>
+#include <linux/writeback.h>
+#include <linux/device.h>
+
+
+static struct class *bdi_class;
+
+#ifdef CONFIG_DEBUG_FS
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+static struct dentry *bdi_debug_root;
+
+static void bdi_debug_init(void)
+{
+ bdi_debug_root = debugfs_create_dir("bdi", NULL);
+}
+
+static int bdi_debug_stats_show(struct seq_file *m, void *v)
+{
+ struct backing_dev_info *bdi = m->private;
+ long background_thresh;
+ long dirty_thresh;
+ long bdi_thresh;
+
+ get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi);
+
+#define K(x) ((x) << (PAGE_SHIFT - 10))
+ seq_printf(m,
+ "BdiWriteback: %8lu kB\n"
+ "BdiReclaimable: %8lu kB\n"
+ "BdiDirtyThresh: %8lu kB\n"
+ "DirtyThresh: %8lu kB\n"
+ "BackgroundThresh: %8lu kB\n",
+ (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
+ (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
+ K(bdi_thresh),
+ K(dirty_thresh),
+ K(background_thresh));
+#undef K
+
+ return 0;
+}
+
+static int bdi_debug_stats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, bdi_debug_stats_show, inode->i_private);
+}
+
+static const struct file_operations bdi_debug_stats_fops = {
+ .open = bdi_debug_stats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
+{
+ bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);
+ bdi->debug_stats = debugfs_create_file("stats", 0444, bdi->debug_dir,
+ bdi, &bdi_debug_stats_fops);
+}
+
+static void bdi_debug_unregister(struct backing_dev_info *bdi)
+{
+ debugfs_remove(bdi->debug_stats);
+ debugfs_remove(bdi->debug_dir);
+}
+#else
+static inline void bdi_debug_init(void)
+{
+}
+static inline void bdi_debug_register(struct backing_dev_info *bdi,
+ const char *name)
+{
+}
+static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
+{
+}
+#endif
+
+static ssize_t read_ahead_kb_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+ char *end;
+ unsigned long read_ahead_kb;
+ ssize_t ret = -EINVAL;
+
+ read_ahead_kb = simple_strtoul(buf, &end, 10);
+ if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
+ bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10);
+ ret = count;
+ }
+ return ret;
+}
+
+#define K(pages) ((pages) << (PAGE_SHIFT - 10))
+
+#define BDI_SHOW(name, expr) \
+static ssize_t name##_show(struct device *dev, \
+ struct device_attribute *attr, char *page) \
+{ \
+ struct backing_dev_info *bdi = dev_get_drvdata(dev); \
+ \
+ return snprintf(page, PAGE_SIZE-1, "%lld\n", (long long)expr); \
+}
+
+BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))
+
+static ssize_t min_ratio_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+ char *end;
+ unsigned int ratio;
+ ssize_t ret = -EINVAL;
+
+ ratio = simple_strtoul(buf, &end, 10);
+ if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
+ ret = bdi_set_min_ratio(bdi, ratio);
+ if (!ret)
+ ret = count;
+ }
+ return ret;
+}
+BDI_SHOW(min_ratio, bdi->min_ratio)
+
+static ssize_t max_ratio_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+ char *end;
+ unsigned int ratio;
+ ssize_t ret = -EINVAL;
+
+ ratio = simple_strtoul(buf, &end, 10);
+ if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
+ ret = bdi_set_max_ratio(bdi, ratio);
+ if (!ret)
+ ret = count;
+ }
+ return ret;
+}
+BDI_SHOW(max_ratio, bdi->max_ratio)
+
+#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
+
+static struct device_attribute bdi_dev_attrs[] = {
+ __ATTR_RW(read_ahead_kb),
+ __ATTR_RW(min_ratio),
+ __ATTR_RW(max_ratio),
+ __ATTR_NULL,
+};
+
+static __init int bdi_class_init(void)
+{
+ bdi_class = class_create(THIS_MODULE, "bdi");
+ bdi_class->dev_attrs = bdi_dev_attrs;
+ bdi_debug_init();
+ return 0;
+}
+
+postcore_initcall(bdi_class_init);
+
+int bdi_register(struct backing_dev_info *bdi, struct device *parent,
+ const char *fmt, ...)
+{
+ char *name;
+ va_list args;
+ int ret = 0;
+ struct device *dev;
+
+ va_start(args, fmt);
+ name = kvasprintf(GFP_KERNEL, fmt, args);
+ va_end(args);
+
+ if (!name)
+ return -ENOMEM;
+
+ dev = device_create(bdi_class, parent, MKDEV(0, 0), name);
+ if (IS_ERR(dev)) {
+ ret = PTR_ERR(dev);
+ goto exit;
+ }
+
+ bdi->dev = dev;
+ dev_set_drvdata(bdi->dev, bdi);
+ bdi_debug_register(bdi, name);
+
+exit:
+ kfree(name);
+ return ret;
+}
+EXPORT_SYMBOL(bdi_register);
+
+int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
+{
+ return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev));
+}
+EXPORT_SYMBOL(bdi_register_dev);
+
+void bdi_unregister(struct backing_dev_info *bdi)
+{
+ if (bdi->dev) {
+ bdi_debug_unregister(bdi);
+ device_unregister(bdi->dev);
+ bdi->dev = NULL;
+ }
+}
+EXPORT_SYMBOL(bdi_unregister);
int bdi_init(struct backing_dev_info *bdi)
{
int i;
int err;
+ bdi->dev = NULL;
+
+ bdi->min_ratio = 0;
+ bdi->max_ratio = 100;
+ bdi->max_prop_frac = PROP_FRAC_BASE;
+
for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
err = percpu_counter_init_irq(&bdi->bdi_stat[i], 0);
if (err)
@@ -33,6 +250,8 @@ void bdi_destroy(struct backing_dev_info *bdi)
{
int i;
+ bdi_unregister(bdi);
+
for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
percpu_counter_destroy(&bdi->bdi_stat[i]);
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 2ccea700968..e8fb927392b 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -111,44 +111,74 @@ static unsigned long __init init_bootmem_core(pg_data_t *pgdat,
* might be used for boot-time allocations - or it might get added
* to the free page pool later on.
*/
-static int __init reserve_bootmem_core(bootmem_data_t *bdata,
+static int __init can_reserve_bootmem_core(bootmem_data_t *bdata,
unsigned long addr, unsigned long size, int flags)
{
unsigned long sidx, eidx;
unsigned long i;
- int ret;
+
+ BUG_ON(!size);
+
+ /* out of range, don't hold other */
+ if (addr + size < bdata->node_boot_start ||
+ PFN_DOWN(addr) > bdata->node_low_pfn)
+ return 0;
/*
- * round up, partially reserved pages are considered
- * fully reserved.
+ * Round up to index to the range.
*/
+ if (addr > bdata->node_boot_start)
+ sidx= PFN_DOWN(addr - bdata->node_boot_start);
+ else
+ sidx = 0;
+
+ eidx = PFN_UP(addr + size - bdata->node_boot_start);
+ if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start))
+ eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start);
+
+ for (i = sidx; i < eidx; i++) {
+ if (test_bit(i, bdata->node_bootmem_map)) {
+ if (flags & BOOTMEM_EXCLUSIVE)
+ return -EBUSY;
+ }
+ }
+
+ return 0;
+
+}
+
+static void __init reserve_bootmem_core(bootmem_data_t *bdata,
+ unsigned long addr, unsigned long size, int flags)
+{
+ unsigned long sidx, eidx;
+ unsigned long i;
+
BUG_ON(!size);
- BUG_ON(PFN_DOWN(addr) >= bdata->node_low_pfn);
- BUG_ON(PFN_UP(addr + size) > bdata->node_low_pfn);
- BUG_ON(addr < bdata->node_boot_start);
- sidx = PFN_DOWN(addr - bdata->node_boot_start);
+ /* out of range */
+ if (addr + size < bdata->node_boot_start ||
+ PFN_DOWN(addr) > bdata->node_low_pfn)
+ return;
+
+ /*
+ * Round up to index to the range.
+ */
+ if (addr > bdata->node_boot_start)
+ sidx= PFN_DOWN(addr - bdata->node_boot_start);
+ else
+ sidx = 0;
+
eidx = PFN_UP(addr + size - bdata->node_boot_start);
+ if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start))
+ eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start);
- for (i = sidx; i < eidx; i++)
+ for (i = sidx; i < eidx; i++) {
if (test_and_set_bit(i, bdata->node_bootmem_map)) {
#ifdef CONFIG_DEBUG_BOOTMEM
printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE);
#endif
- if (flags & BOOTMEM_EXCLUSIVE) {
- ret = -EBUSY;
- goto err;
- }
}
-
- return 0;
-
-err:
- /* unreserve memory we accidentally reserved */
- for (i--; i >= sidx; i--)
- clear_bit(i, bdata->node_bootmem_map);
-
- return ret;
+ }
}
static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
@@ -206,9 +236,11 @@ void * __init
__alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
unsigned long align, unsigned long goal, unsigned long limit)
{
- unsigned long offset, remaining_size, areasize, preferred;
+ unsigned long areasize, preferred;
unsigned long i, start = 0, incr, eidx, end_pfn;
void *ret;
+ unsigned long node_boot_start;
+ void *node_bootmem_map;
if (!size) {
printk("__alloc_bootmem_core(): zero-sized request\n");
@@ -216,70 +248,83 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
}
BUG_ON(align & (align-1));
- if (limit && bdata->node_boot_start >= limit)
- return NULL;
-
/* on nodes without memory - bootmem_map is NULL */
if (!bdata->node_bootmem_map)
return NULL;
+ /* bdata->node_boot_start is supposed to be (12+6)bits alignment on x86_64 ? */
+ node_boot_start = bdata->node_boot_start;
+ node_bootmem_map = bdata->node_bootmem_map;
+ if (align) {
+ node_boot_start = ALIGN(bdata->node_boot_start, align);
+ if (node_boot_start > bdata->node_boot_start)
+ node_bootmem_map = (unsigned long *)bdata->node_bootmem_map +
+ PFN_DOWN(node_boot_start - bdata->node_boot_start)/BITS_PER_LONG;
+ }
+
+ if (limit && node_boot_start >= limit)
+ return NULL;
+
end_pfn = bdata->node_low_pfn;
limit = PFN_DOWN(limit);
if (limit && end_pfn > limit)
end_pfn = limit;
- eidx = end_pfn - PFN_DOWN(bdata->node_boot_start);
- offset = 0;
- if (align && (bdata->node_boot_start & (align - 1UL)) != 0)
- offset = align - (bdata->node_boot_start & (align - 1UL));
- offset = PFN_DOWN(offset);
+ eidx = end_pfn - PFN_DOWN(node_boot_start);
/*
* We try to allocate bootmem pages above 'goal'
* first, then we try to allocate lower pages.
*/
- if (goal && goal >= bdata->node_boot_start && PFN_DOWN(goal) < end_pfn) {
- preferred = goal - bdata->node_boot_start;
+ preferred = 0;
+ if (goal && PFN_DOWN(goal) < end_pfn) {
+ if (goal > node_boot_start)
+ preferred = goal - node_boot_start;
- if (bdata->last_success >= preferred)
+ if (bdata->last_success > node_boot_start &&
+ bdata->last_success - node_boot_start >= preferred)
if (!limit || (limit && limit > bdata->last_success))
- preferred = bdata->last_success;
- } else
- preferred = 0;
+ preferred = bdata->last_success - node_boot_start;
+ }
- preferred = PFN_DOWN(ALIGN(preferred, align)) + offset;
+ preferred = PFN_DOWN(ALIGN(preferred, align));
areasize = (size + PAGE_SIZE-1) / PAGE_SIZE;
incr = align >> PAGE_SHIFT ? : 1;
restart_scan:
- for (i = preferred; i < eidx; i += incr) {
+ for (i = preferred; i < eidx;) {
unsigned long j;
- i = find_next_zero_bit(bdata->node_bootmem_map, eidx, i);
+
+ i = find_next_zero_bit(node_bootmem_map, eidx, i);
i = ALIGN(i, incr);
if (i >= eidx)
break;
- if (test_bit(i, bdata->node_bootmem_map))
+ if (test_bit(i, node_bootmem_map)) {
+ i += incr;
continue;
+ }
for (j = i + 1; j < i + areasize; ++j) {
if (j >= eidx)
goto fail_block;
- if (test_bit(j, bdata->node_bootmem_map))
+ if (test_bit(j, node_bootmem_map))
goto fail_block;
}
start = i;
goto found;
fail_block:
i = ALIGN(j, incr);
+ if (i == j)
+ i += incr;
}
- if (preferred > offset) {
- preferred = offset;
+ if (preferred > 0) {
+ preferred = 0;
goto restart_scan;
}
return NULL;
found:
- bdata->last_success = PFN_PHYS(start);
+ bdata->last_success = PFN_PHYS(start) + node_boot_start;
BUG_ON(start >= eidx);
/*
@@ -289,6 +334,7 @@ found:
*/
if (align < PAGE_SIZE &&
bdata->last_offset && bdata->last_pos+1 == start) {
+ unsigned long offset, remaining_size;
offset = ALIGN(bdata->last_offset, align);
BUG_ON(offset > PAGE_SIZE);
remaining_size = PAGE_SIZE - offset;
@@ -297,14 +343,12 @@ found:
/* last_pos unchanged */
bdata->last_offset = offset + size;
ret = phys_to_virt(bdata->last_pos * PAGE_SIZE +
- offset +
- bdata->node_boot_start);
+ offset + node_boot_start);
} else {
remaining_size = size - remaining_size;
areasize = (remaining_size + PAGE_SIZE-1) / PAGE_SIZE;
ret = phys_to_virt(bdata->last_pos * PAGE_SIZE +
- offset +
- bdata->node_boot_start);
+ offset + node_boot_start);
bdata->last_pos = start + areasize - 1;
bdata->last_offset = remaining_size;
}
@@ -312,14 +356,14 @@ found:
} else {
bdata->last_pos = start + areasize - 1;
bdata->last_offset = size & ~PAGE_MASK;
- ret = phys_to_virt(start * PAGE_SIZE + bdata->node_boot_start);
+ ret = phys_to_virt(start * PAGE_SIZE + node_boot_start);
}
/*
* Reserve the area now:
*/
for (i = start; i < start + areasize; i++)
- if (unlikely(test_and_set_bit(i, bdata->node_bootmem_map)))
+ if (unlikely(test_and_set_bit(i, node_bootmem_map)))
BUG();
memset(ret, 0, size);
return ret;
@@ -401,6 +445,11 @@ unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
void __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
unsigned long size, int flags)
{
+ int ret;
+
+ ret = can_reserve_bootmem_core(pgdat->bdata, physaddr, size, flags);
+ if (ret < 0)
+ return;
reserve_bootmem_core(pgdat->bdata, physaddr, size, flags);
}
@@ -412,6 +461,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
{
+ register_page_bootmem_info_node(pgdat);
return free_all_bootmem_core(pgdat);
}
@@ -426,7 +476,18 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
int __init reserve_bootmem(unsigned long addr, unsigned long size,
int flags)
{
- return reserve_bootmem_core(NODE_DATA(0)->bdata, addr, size, flags);
+ bootmem_data_t *bdata;
+ int ret;
+
+ list_for_each_entry(bdata, &bdata_list, list) {
+ ret = can_reserve_bootmem_core(bdata, addr, size, flags);
+ if (ret < 0)
+ return ret;
+ }
+ list_for_each_entry(bdata, &bdata_list, list)
+ reserve_bootmem_core(bdata, addr, size, flags);
+
+ return 0;
}
#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
@@ -484,6 +545,37 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
return __alloc_bootmem(size, align, goal);
}
+#ifdef CONFIG_SPARSEMEM
+void * __init alloc_bootmem_section(unsigned long size,
+ unsigned long section_nr)
+{
+ void *ptr;
+ unsigned long limit, goal, start_nr, end_nr, pfn;
+ struct pglist_data *pgdat;
+
+ pfn = section_nr_to_pfn(section_nr);
+ goal = PFN_PHYS(pfn);
+ limit = PFN_PHYS(section_nr_to_pfn(section_nr + 1)) - 1;
+ pgdat = NODE_DATA(early_pfn_to_nid(pfn));
+ ptr = __alloc_bootmem_core(pgdat->bdata, size, SMP_CACHE_BYTES, goal,
+ limit);
+
+ if (!ptr)
+ return NULL;
+
+ start_nr = pfn_to_section_nr(PFN_DOWN(__pa(ptr)));
+ end_nr = pfn_to_section_nr(PFN_DOWN(__pa(ptr) + size));
+ if (start_nr != section_nr || end_nr != section_nr) {
+ printk(KERN_WARNING "alloc_bootmem failed on section %ld.\n",
+ section_nr);
+ free_bootmem_core(pgdat->bdata, __pa(ptr), size);
+ ptr = NULL;
+ }
+
+ return ptr;
+}
+#endif
+
#ifndef ARCH_LOW_ADDRESS_LIMIT
#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
#endif
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 34aaac451a9..b1f0885dda2 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -37,6 +37,10 @@
#include <linux/types.h>
#include <linux/wait.h>
+#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON)
+#define DMAPOOL_DEBUG 1
+#endif
+
struct dma_pool { /* the pool */
struct list_head page_list;
spinlock_t lock;
@@ -216,7 +220,7 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
page->vaddr = dma_alloc_coherent(pool->dev, pool->allocation,
&page->dma, mem_flags);
if (page->vaddr) {
-#ifdef CONFIG_DEBUG_SLAB
+#ifdef DMAPOOL_DEBUG
memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
#endif
pool_initialise_page(pool, page);
@@ -239,7 +243,7 @@ static void pool_free_page(struct dma_pool *pool, struct dma_page *page)
{
dma_addr_t dma = page->dma;
-#ifdef CONFIG_DEBUG_SLAB
+#ifdef DMAPOOL_DEBUG
memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
#endif
dma_free_coherent(pool->dev, pool->allocation, page->vaddr, dma);
@@ -336,7 +340,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
page->offset = *(int *)(page->vaddr + offset);
retval = offset + page->vaddr;
*handle = offset + page->dma;
-#ifdef CONFIG_DEBUG_SLAB
+#ifdef DMAPOOL_DEBUG
memset(retval, POOL_POISON_ALLOCATED, pool->size);
#endif
done:
@@ -391,7 +395,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
}
offset = vaddr - page->vaddr;
-#ifdef CONFIG_DEBUG_SLAB
+#ifdef DMAPOOL_DEBUG
if ((dma - page->dma) != offset) {
if (pool->dev)
dev_err(pool->dev,
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 3c0f1e99f5e..343cfdfebd9 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -49,7 +49,7 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
goto out;
}
- if (mapping->a_ops->get_xip_page) {
+ if (mapping->a_ops->get_xip_mem) {
switch (advice) {
case POSIX_FADV_NORMAL:
case POSIX_FADV_RANDOM:
diff --git a/mm/filemap.c b/mm/filemap.c
index 07e9d9258b4..239d36163bb 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -576,10 +576,12 @@ EXPORT_SYMBOL(unlock_page);
*/
void end_page_writeback(struct page *page)
{
- if (!TestClearPageReclaim(page) || rotate_reclaimable_page(page)) {
- if (!test_clear_page_writeback(page))
- BUG();
- }
+ if (TestClearPageReclaim(page))
+ rotate_reclaimable_page(page);
+
+ if (!test_clear_page_writeback(page))
+ BUG();
+
smp_mb__after_clear_bit();
wake_up_page(page, PG_writeback);
}
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 5e598c42afd..3e744abcce9 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -15,6 +15,7 @@
#include <linux/rmap.h>
#include <linux/sched.h>
#include <asm/tlbflush.h>
+#include <asm/io.h>
/*
* We do use our own empty page to avoid interference with other users
@@ -42,37 +43,41 @@ static struct page *xip_sparse_page(void)
/*
* This is a file read routine for execute in place files, and uses
- * the mapping->a_ops->get_xip_page() function for the actual low-level
+ * the mapping->a_ops->get_xip_mem() function for the actual low-level
* stuff.
*
* Note the struct file* is not used at all. It may be NULL.
*/
-static void
+static ssize_t
do_xip_mapping_read(struct address_space *mapping,
struct file_ra_state *_ra,
struct file *filp,
- loff_t *ppos,
- read_descriptor_t *desc,
- read_actor_t actor)
+ char __user *buf,
+ size_t len,
+ loff_t *ppos)
{
struct inode *inode = mapping->host;
pgoff_t index, end_index;
unsigned long offset;
- loff_t isize;
+ loff_t isize, pos;
+ size_t copied = 0, error = 0;
- BUG_ON(!mapping->a_ops->get_xip_page);
+ BUG_ON(!mapping->a_ops->get_xip_mem);
- index = *ppos >> PAGE_CACHE_SHIFT;
- offset = *ppos & ~PAGE_CACHE_MASK;
+ pos = *ppos;
+ index = pos >> PAGE_CACHE_SHIFT;
+ offset = pos & ~PAGE_CACHE_MASK;
isize = i_size_read(inode);
if (!isize)
goto out;
end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
- for (;;) {
- struct page *page;
- unsigned long nr, ret;
+ do {
+ unsigned long nr, left;
+ void *xip_mem;
+ unsigned long xip_pfn;
+ int zero = 0;
/* nr is the maximum number of bytes to copy from this page */
nr = PAGE_CACHE_SIZE;
@@ -85,19 +90,17 @@ do_xip_mapping_read(struct address_space *mapping,
}
}
nr = nr - offset;
+ if (nr > len)
+ nr = len;
- page = mapping->a_ops->get_xip_page(mapping,
- index*(PAGE_SIZE/512), 0);
- if (!page)
- goto no_xip_page;
- if (unlikely(IS_ERR(page))) {
- if (PTR_ERR(page) == -ENODATA) {
+ error = mapping->a_ops->get_xip_mem(mapping, index, 0,
+ &xip_mem, &xip_pfn);
+ if (unlikely(error)) {
+ if (error == -ENODATA) {
/* sparse */
- page = ZERO_PAGE(0);
- } else {
- desc->error = PTR_ERR(page);
+ zero = 1;
+ } else
goto out;
- }
}
/* If users can be writing to this page using arbitrary
@@ -105,10 +108,10 @@ do_xip_mapping_read(struct address_space *mapping,
* before reading the page on the kernel side.
*/
if (mapping_writably_mapped(mapping))
- flush_dcache_page(page);
+ /* address based flush */ ;
/*
- * Ok, we have the page, so now we can copy it to user space...
+ * Ok, we have the mem, so now we can copy it to user space...
*
* The actor routine returns how many bytes were actually used..
* NOTE! This may not be the same as how much of a user buffer
@@ -116,47 +119,38 @@ do_xip_mapping_read(struct address_space *mapping,
* "pos" here (the actor routine has to update the user buffer
* pointers and the remaining count).
*/
- ret = actor(desc, page, offset, nr);
- offset += ret;
- index += offset >> PAGE_CACHE_SHIFT;
- offset &= ~PAGE_CACHE_MASK;
+ if (!zero)
+ left = __copy_to_user(buf+copied, xip_mem+offset, nr);
+ else
+ left = __clear_user(buf + copied, nr);
- if (ret == nr && desc->count)
- continue;
- goto out;
+ if (left) {
+ error = -EFAULT;
+ goto out;
+ }
-no_xip_page:
- /* Did not get the page. Report it */
- desc->error = -EIO;
- goto out;
- }
+ copied += (nr - left);
+ offset += (nr - left);
+ index += offset >> PAGE_CACHE_SHIFT;
+ offset &= ~PAGE_CACHE_MASK;
+ } while (copied < len);
out:
- *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
+ *ppos = pos + copied;
if (filp)
file_accessed(filp);
+
+ return (copied ? copied : error);
}
ssize_t
xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
- read_descriptor_t desc;
-
if (!access_ok(VERIFY_WRITE, buf, len))
return -EFAULT;
- desc.written = 0;
- desc.arg.buf = buf;
- desc.count = len;
- desc.error = 0;
-
- do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp,
- ppos, &desc, file_read_actor);
-
- if (desc.written)
- return desc.written;
- else
- return desc.error;
+ return do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp,
+ buf, len, ppos);
}
EXPORT_SYMBOL_GPL(xip_file_read);
@@ -211,13 +205,16 @@ __xip_unmap (struct address_space * mapping,
*
* This function is derived from filemap_fault, but used for execute in place
*/
-static int xip_file_fault(struct vm_area_struct *area, struct vm_fault *vmf)
+static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- struct file *file = area->vm_file;
+ struct file *file = vma->vm_file;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
- struct page *page;
pgoff_t size;
+ void *xip_mem;
+ unsigned long xip_pfn;
+ struct page *page;
+ int error;
/* XXX: are VM_FAULT_ codes OK? */
@@ -225,35 +222,44 @@ static int xip_file_fault(struct vm_area_struct *area, struct vm_fault *vmf)
if (vmf->pgoff >= size)
return VM_FAULT_SIGBUS;
- page = mapping->a_ops->get_xip_page(mapping,
- vmf->pgoff*(PAGE_SIZE/512), 0);
- if (!IS_ERR(page))
- goto out;
- if (PTR_ERR(page) != -ENODATA)
+ error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0,
+ &xip_mem, &xip_pfn);
+ if (likely(!error))
+ goto found;
+ if (error != -ENODATA)
return VM_FAULT_OOM;
/* sparse block */
- if ((area->vm_flags & (VM_WRITE | VM_MAYWRITE)) &&
- (area->vm_flags & (VM_SHARED| VM_MAYSHARE)) &&
+ if ((vma->vm_flags & (VM_WRITE | VM_MAYWRITE)) &&
+ (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) &&
(!(mapping->host->i_sb->s_flags & MS_RDONLY))) {
+ int err;
+
/* maybe shared writable, allocate new block */
- page = mapping->a_ops->get_xip_page(mapping,
- vmf->pgoff*(PAGE_SIZE/512), 1);
- if (IS_ERR(page))
+ error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1,
+ &xip_mem, &xip_pfn);
+ if (error)
return VM_FAULT_SIGBUS;
- /* unmap page at pgoff from all other vmas */
+ /* unmap sparse mappings at pgoff from all other vmas */
__xip_unmap(mapping, vmf->pgoff);
+
+found:
+ err = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address,
+ xip_pfn);
+ if (err == -ENOMEM)
+ return VM_FAULT_OOM;
+ BUG_ON(err);
+ return VM_FAULT_NOPAGE;
} else {
/* not shared and writable, use xip_sparse_page() */
page = xip_sparse_page();
if (!page)
return VM_FAULT_OOM;
- }
-out:
- page_cache_get(page);
- vmf->page = page;
- return 0;
+ page_cache_get(page);
+ vmf->page = page;
+ return 0;
+ }
}
static struct vm_operations_struct xip_file_vm_ops = {
@@ -262,11 +268,11 @@ static struct vm_operations_struct xip_file_vm_ops = {
int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
{
- BUG_ON(!file->f_mapping->a_ops->get_xip_page);
+ BUG_ON(!file->f_mapping->a_ops->get_xip_mem);
file_accessed(file);
vma->vm_ops = &xip_file_vm_ops;
- vma->vm_flags |= VM_CAN_NONLINEAR;
+ vma->vm_flags |= VM_CAN_NONLINEAR | VM_MIXEDMAP;
return 0;
}
EXPORT_SYMBOL_GPL(xip_file_mmap);
@@ -279,17 +285,17 @@ __xip_file_write(struct file *filp, const char __user *buf,
const struct address_space_operations *a_ops = mapping->a_ops;
struct inode *inode = mapping->host;
long status = 0;
- struct page *page;
size_t bytes;
ssize_t written = 0;
- BUG_ON(!mapping->a_ops->get_xip_page);
+ BUG_ON(!mapping->a_ops->get_xip_mem);
do {
unsigned long index;
unsigned long offset;
size_t copied;
- char *kaddr;
+ void *xip_mem;
+ unsigned long xip_pfn;
offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
index = pos >> PAGE_CACHE_SHIFT;
@@ -297,28 +303,22 @@ __xip_file_write(struct file *filp, const char __user *buf,
if (bytes > count)
bytes = count;
- page = a_ops->get_xip_page(mapping,
- index*(PAGE_SIZE/512), 0);
- if (IS_ERR(page) && (PTR_ERR(page) == -ENODATA)) {
+ status = a_ops->get_xip_mem(mapping, index, 0,
+ &xip_mem, &xip_pfn);
+ if (status == -ENODATA) {
/* we allocate a new page unmap it */
- page = a_ops->get_xip_page(mapping,
- index*(PAGE_SIZE/512), 1);
- if (!IS_ERR(page))
+ status = a_ops->get_xip_mem(mapping, index, 1,
+ &xip_mem, &xip_pfn);
+ if (!status)
/* unmap page at pgoff from all other vmas */
__xip_unmap(mapping, index);
}
- if (IS_ERR(page)) {
- status = PTR_ERR(page);
+ if (status)
break;
- }
- fault_in_pages_readable(buf, bytes);
- kaddr = kmap_atomic(page, KM_USER0);
copied = bytes -
- __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes);
- kunmap_atomic(kaddr, KM_USER0);
- flush_dcache_page(page);
+ __copy_from_user_nocache(xip_mem + offset, buf, bytes);
if (likely(copied > 0)) {
status = copied;
@@ -398,7 +398,7 @@ EXPORT_SYMBOL_GPL(xip_file_write);
/*
* truncate a page used for execute in place
- * functionality is analog to block_truncate_page but does use get_xip_page
+ * functionality is analog to block_truncate_page but does use get_xip_mem
* to get the page instead of page cache
*/
int
@@ -408,9 +408,11 @@ xip_truncate_page(struct address_space *mapping, loff_t from)
unsigned offset = from & (PAGE_CACHE_SIZE-1);
unsigned blocksize;
unsigned length;
- struct page *page;
+ void *xip_mem;
+ unsigned long xip_pfn;
+ int err;
- BUG_ON(!mapping->a_ops->get_xip_page);
+ BUG_ON(!mapping->a_ops->get_xip_mem);
blocksize = 1 << mapping->host->i_blkbits;
length = offset & (blocksize - 1);
@@ -421,18 +423,16 @@ xip_truncate_page(struct address_space *mapping, loff_t from)
length = blocksize - length;
- page = mapping->a_ops->get_xip_page(mapping,
- index*(PAGE_SIZE/512), 0);
- if (!page)
- return -ENOMEM;
- if (unlikely(IS_ERR(page))) {
- if (PTR_ERR(page) == -ENODATA)
+ err = mapping->a_ops->get_xip_mem(mapping, index, 0,
+ &xip_mem, &xip_pfn);
+ if (unlikely(err)) {
+ if (err == -ENODATA)
/* Hole? No need to truncate */
return 0;
else
- return PTR_ERR(page);
+ return err;
}
- zero_user(page, offset, length);
+ memset(xip_mem + offset, 0, length);
return 0;
}
EXPORT_SYMBOL_GPL(xip_truncate_page);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 51c9e2c0164..bbf953eeb58 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -95,13 +95,16 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
int nid;
struct page *page = NULL;
struct mempolicy *mpol;
+ nodemask_t *nodemask;
struct zonelist *zonelist = huge_zonelist(vma, address,
- htlb_alloc_mask, &mpol);
- struct zone **z;
-
- for (z = zonelist->zones; *z; z++) {
- nid = zone_to_nid(*z);
- if (cpuset_zone_allowed_softwall(*z, htlb_alloc_mask) &&
+ htlb_alloc_mask, &mpol, &nodemask);
+ struct zone *zone;
+ struct zoneref *z;
+
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ MAX_NR_ZONES - 1, nodemask) {
+ nid = zone_to_nid(zone);
+ if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
!list_empty(&hugepage_freelists[nid])) {
page = list_entry(hugepage_freelists[nid].next,
struct page, lru);
@@ -113,7 +116,7 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
break;
}
}
- mpol_free(mpol); /* unref if mpol !NULL */
+ mpol_cond_put(mpol);
return page;
}
@@ -129,6 +132,7 @@ static void update_and_free_page(struct page *page)
}
set_compound_page_dtor(page, NULL);
set_page_refcounted(page);
+ arch_release_hugepage(page);
__free_pages(page, HUGETLB_PAGE_ORDER);
}
@@ -195,9 +199,14 @@ static struct page *alloc_fresh_huge_page_node(int nid)
struct page *page;
page = alloc_pages_node(nid,
- htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|__GFP_NOWARN,
+ htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
+ __GFP_REPEAT|__GFP_NOWARN,
HUGETLB_PAGE_ORDER);
if (page) {
+ if (arch_prepare_hugepage(page)) {
+ __free_pages(page, HUGETLB_PAGE_ORDER);
+ return NULL;
+ }
set_compound_page_dtor(page, free_huge_page);
spin_lock(&hugetlb_lock);
nr_huge_pages++;
@@ -239,6 +248,11 @@ static int alloc_fresh_huge_page(void)
hugetlb_next_nid = next_nid;
} while (!page && hugetlb_next_nid != start_nid);
+ if (ret)
+ count_vm_event(HTLB_BUDDY_PGALLOC);
+ else
+ count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
+
return ret;
}
@@ -281,7 +295,8 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
}
spin_unlock(&hugetlb_lock);
- page = alloc_pages(htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
+ page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
+ __GFP_REPEAT|__GFP_NOWARN,
HUGETLB_PAGE_ORDER);
spin_lock(&hugetlb_lock);
@@ -299,9 +314,11 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
*/
nr_huge_pages_node[nid]++;
surplus_huge_pages_node[nid]++;
+ __count_vm_event(HTLB_BUDDY_PGALLOC);
} else {
nr_huge_pages--;
surplus_huge_pages--;
+ __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
}
spin_unlock(&hugetlb_lock);
@@ -369,11 +386,19 @@ retry:
resv_huge_pages += delta;
ret = 0;
free:
+ /* Free the needed pages to the hugetlb pool */
list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
+ if ((--needed) < 0)
+ break;
list_del(&page->lru);
- if ((--needed) >= 0)
- enqueue_huge_page(page);
- else {
+ enqueue_huge_page(page);
+ }
+
+ /* Free unnecessary surplus pages to the buddy allocator */
+ if (!list_empty(&surplus_list)) {
+ spin_unlock(&hugetlb_lock);
+ list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
+ list_del(&page->lru);
/*
* The page has a reference count of zero already, so
* call free_huge_page directly instead of using
@@ -381,10 +406,9 @@ free:
* unlocked which is safe because free_huge_page takes
* hugetlb_lock before deciding how to free the page.
*/
- spin_unlock(&hugetlb_lock);
free_huge_page(page);
- spin_lock(&hugetlb_lock);
}
+ spin_lock(&hugetlb_lock);
}
return ret;
@@ -718,7 +742,7 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
entry =
pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
} else {
- entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
+ entry = huge_pte_wrprotect(mk_pte(page, vma->vm_page_prot));
}
entry = pte_mkyoung(entry);
entry = pte_mkhuge(entry);
@@ -731,8 +755,8 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
{
pte_t entry;
- entry = pte_mkwrite(pte_mkdirty(*ptep));
- if (ptep_set_access_flags(vma, address, ptep, entry, 1)) {
+ entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep)));
+ if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) {
update_mmu_cache(vma, address, entry);
}
}
@@ -762,10 +786,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
spin_lock(&dst->page_table_lock);
spin_lock(&src->page_table_lock);
- if (!pte_none(*src_pte)) {
+ if (!huge_pte_none(huge_ptep_get(src_pte))) {
if (cow)
- ptep_set_wrprotect(src, addr, src_pte);
- entry = *src_pte;
+ huge_ptep_set_wrprotect(src, addr, src_pte);
+ entry = huge_ptep_get(src_pte);
ptepage = pte_page(entry);
get_page(ptepage);
set_huge_pte_at(dst, addr, dst_pte, entry);
@@ -809,7 +833,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
continue;
pte = huge_ptep_get_and_clear(mm, address, ptep);
- if (pte_none(pte))
+ if (huge_pte_none(pte))
continue;
page = pte_page(pte);
@@ -873,8 +897,9 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
spin_lock(&mm->page_table_lock);
ptep = huge_pte_offset(mm, address & HPAGE_MASK);
- if (likely(pte_same(*ptep, pte))) {
+ if (likely(pte_same(huge_ptep_get(ptep), pte))) {
/* Break COW */
+ huge_ptep_clear_flush(vma, address, ptep);
set_huge_pte_at(mm, address, ptep,
make_huge_pte(vma, new_page, 1));
/* Make the old page be freed below */
@@ -942,7 +967,7 @@ retry:
goto backout;
ret = 0;
- if (!pte_none(*ptep))
+ if (!huge_pte_none(huge_ptep_get(ptep)))
goto backout;
new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
@@ -984,8 +1009,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* the same page in the page cache.
*/
mutex_lock(&hugetlb_instantiation_mutex);
- entry = *ptep;
- if (pte_none(entry)) {
+ entry = huge_ptep_get(ptep);
+ if (huge_pte_none(entry)) {
ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
mutex_unlock(&hugetlb_instantiation_mutex);
return ret;
@@ -995,7 +1020,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
spin_lock(&mm->page_table_lock);
/* Check for a racing update before calling hugetlb_cow */
- if (likely(pte_same(entry, *ptep)))
+ if (likely(pte_same(entry, huge_ptep_get(ptep))))
if (write_access && !pte_write(entry))
ret = hugetlb_cow(mm, vma, address, ptep, entry);
spin_unlock(&mm->page_table_lock);
@@ -1025,7 +1050,8 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
*/
pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
- if (!pte || pte_none(*pte) || (write && !pte_write(*pte))) {
+ if (!pte || huge_pte_none(huge_ptep_get(pte)) ||
+ (write && !pte_write(huge_ptep_get(pte)))) {
int ret;
spin_unlock(&mm->page_table_lock);
@@ -1041,7 +1067,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
- page = pte_page(*pte);
+ page = pte_page(huge_ptep_get(pte));
same_page:
if (pages) {
get_page(page);
@@ -1090,7 +1116,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
continue;
if (huge_pmd_unshare(mm, &address, ptep))
continue;
- if (!pte_none(*ptep)) {
+ if (!huge_pte_none(huge_ptep_get(ptep))) {
pte = huge_ptep_get_and_clear(mm, address, ptep);
pte = pte_mkhuge(pte_modify(pte, newprot));
set_huge_pte_at(mm, address, ptep, pte);
diff --git a/mm/internal.h b/mm/internal.h
index 789727309f4..0034e947e4b 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -34,8 +34,7 @@ static inline void __put_page(struct page *page)
atomic_dec(&page->_count);
}
-extern void __init __free_pages_bootmem(struct page *page,
- unsigned int order);
+extern void __free_pages_bootmem(struct page *page, unsigned int order);
/*
* function for dealing with page's order in buddy system.
diff --git a/mm/maccess.c b/mm/maccess.c
new file mode 100644
index 00000000000..ac40796cfb1
--- /dev/null
+++ b/mm/maccess.c
@@ -0,0 +1,55 @@
+/*
+ * Access kernel memory without faulting.
+ */
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+
+/**
+ * probe_kernel_read(): safely attempt to read from a location
+ * @dst: pointer to the buffer that shall take the data
+ * @src: address to read from
+ * @size: size of the data chunk
+ *
+ * Safely read from address @src to the buffer at @dst. If a kernel fault
+ * happens, handle that and return -EFAULT.
+ */
+long probe_kernel_read(void *dst, void *src, size_t size)
+{
+ long ret;
+ mm_segment_t old_fs = get_fs();
+
+ set_fs(KERNEL_DS);
+ pagefault_disable();
+ ret = __copy_from_user_inatomic(dst,
+ (__force const void __user *)src, size);
+ pagefault_enable();
+ set_fs(old_fs);
+
+ return ret ? -EFAULT : 0;
+}
+EXPORT_SYMBOL_GPL(probe_kernel_read);
+
+/**
+ * probe_kernel_write(): safely attempt to write to a location
+ * @dst: address to write to
+ * @src: pointer to the data that shall be written
+ * @size: size of the data chunk
+ *
+ * Safely write to address @dst from the buffer at @src. If a kernel fault
+ * happens, handle that and return -EFAULT.
+ */
+long probe_kernel_write(void *dst, void *src, size_t size)
+{
+ long ret;
+ mm_segment_t old_fs = get_fs();
+
+ set_fs(KERNEL_DS);
+ pagefault_disable();
+ ret = __copy_to_user_inatomic((__force void __user *)dst, src, size);
+ pagefault_enable();
+ set_fs(old_fs);
+
+ return ret ? -EFAULT : 0;
+}
+EXPORT_SYMBOL_GPL(probe_kernel_write);
diff --git a/mm/madvise.c b/mm/madvise.c
index 93ee375b38e..23a0ec3e0ea 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -112,7 +112,7 @@ static long madvise_willneed(struct vm_area_struct * vma,
if (!file)
return -EBADF;
- if (file->f_mapping->a_ops->get_xip_page) {
+ if (file->f_mapping->a_ops->get_xip_mem) {
/* no bad return value, but ignore advice */
return 0;
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2e0bfc93484..e46451e1d9b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -26,15 +26,18 @@
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
+#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
+#include <linux/vmalloc.h>
#include <asm/uaccess.h>
struct cgroup_subsys mem_cgroup_subsys;
static const int MEM_CGROUP_RECLAIM_RETRIES = 5;
+static struct kmem_cache *page_cgroup_cache;
/*
* Statistics for memory cgroup.
@@ -45,6 +48,8 @@ enum mem_cgroup_stat_index {
*/
MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
MEM_CGROUP_STAT_RSS, /* # of pages charged as rss */
+ MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
+ MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
MEM_CGROUP_STAT_NSTATS,
};
@@ -196,6 +201,13 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags,
__mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_CACHE, val);
else
__mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val);
+
+ if (charge)
+ __mem_cgroup_stat_add_safe(stat,
+ MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
+ else
+ __mem_cgroup_stat_add_safe(stat,
+ MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
}
static struct mem_cgroup_per_zone *
@@ -236,26 +248,12 @@ static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
css);
}
-static struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
+struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
struct mem_cgroup, css);
}
-void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p)
-{
- struct mem_cgroup *mem;
-
- mem = mem_cgroup_from_task(p);
- css_get(&mem->css);
- mm->mem_cgroup = mem;
-}
-
-void mm_free_cgroup(struct mm_struct *mm)
-{
- css_put(&mm->mem_cgroup->css);
-}
-
static inline int page_cgroup_locked(struct page *page)
{
return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
@@ -287,10 +285,10 @@ static void unlock_page_cgroup(struct page *page)
bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
}
-static void __mem_cgroup_remove_list(struct page_cgroup *pc)
+static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
+ struct page_cgroup *pc)
{
int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
- struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
if (from)
MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1;
@@ -301,10 +299,10 @@ static void __mem_cgroup_remove_list(struct page_cgroup *pc)
list_del_init(&pc->lru);
}
-static void __mem_cgroup_add_list(struct page_cgroup *pc)
+static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
+ struct page_cgroup *pc)
{
int to = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
- struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
if (!to) {
MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1;
@@ -476,6 +474,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
int zid = zone_idx(z);
struct mem_cgroup_per_zone *mz;
+ BUG_ON(!mem_cont);
mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
if (active)
src = &mz->active_list;
@@ -560,7 +559,7 @@ retry:
}
unlock_page_cgroup(page);
- pc = kzalloc(sizeof(struct page_cgroup), gfp_mask);
+ pc = kmem_cache_zalloc(page_cgroup_cache, gfp_mask);
if (pc == NULL)
goto err;
@@ -574,7 +573,7 @@ retry:
mm = &init_mm;
rcu_read_lock();
- mem = rcu_dereference(mm->mem_cgroup);
+ mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
/*
* For every charge from the cgroup, increment reference count
*/
@@ -602,7 +601,6 @@ retry:
mem_cgroup_out_of_memory(mem, gfp_mask);
goto out;
}
- congestion_wait(WRITE, HZ/10);
}
pc->ref_cnt = 1;
@@ -610,7 +608,7 @@ retry:
pc->page = page;
pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
- pc->flags |= PAGE_CGROUP_FLAG_CACHE;
+ pc->flags = PAGE_CGROUP_FLAG_CACHE;
lock_page_cgroup(page);
if (page_get_page_cgroup(page)) {
@@ -622,14 +620,14 @@ retry:
*/
res_counter_uncharge(&mem->res, PAGE_SIZE);
css_put(&mem->css);
- kfree(pc);
+ kmem_cache_free(page_cgroup_cache, pc);
goto retry;
}
page_assign_page_cgroup(page, pc);
mz = page_cgroup_zoneinfo(pc);
spin_lock_irqsave(&mz->lru_lock, flags);
- __mem_cgroup_add_list(pc);
+ __mem_cgroup_add_list(mz, pc);
spin_unlock_irqrestore(&mz->lru_lock, flags);
unlock_page_cgroup(page);
@@ -637,7 +635,7 @@ done:
return 0;
out:
css_put(&mem->css);
- kfree(pc);
+ kmem_cache_free(page_cgroup_cache, pc);
err:
return -ENOMEM;
}
@@ -685,7 +683,7 @@ void mem_cgroup_uncharge_page(struct page *page)
if (--(pc->ref_cnt) == 0) {
mz = page_cgroup_zoneinfo(pc);
spin_lock_irqsave(&mz->lru_lock, flags);
- __mem_cgroup_remove_list(pc);
+ __mem_cgroup_remove_list(mz, pc);
spin_unlock_irqrestore(&mz->lru_lock, flags);
page_assign_page_cgroup(page, NULL);
@@ -695,7 +693,7 @@ void mem_cgroup_uncharge_page(struct page *page)
res_counter_uncharge(&mem->res, PAGE_SIZE);
css_put(&mem->css);
- kfree(pc);
+ kmem_cache_free(page_cgroup_cache, pc);
return;
}
@@ -747,7 +745,7 @@ void mem_cgroup_page_migration(struct page *page, struct page *newpage)
mz = page_cgroup_zoneinfo(pc);
spin_lock_irqsave(&mz->lru_lock, flags);
- __mem_cgroup_remove_list(pc);
+ __mem_cgroup_remove_list(mz, pc);
spin_unlock_irqrestore(&mz->lru_lock, flags);
page_assign_page_cgroup(page, NULL);
@@ -759,7 +757,7 @@ void mem_cgroup_page_migration(struct page *page, struct page *newpage)
mz = page_cgroup_zoneinfo(pc);
spin_lock_irqsave(&mz->lru_lock, flags);
- __mem_cgroup_add_list(pc);
+ __mem_cgroup_add_list(mz, pc);
spin_unlock_irqrestore(&mz->lru_lock, flags);
unlock_page_cgroup(newpage);
@@ -853,13 +851,10 @@ static int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp)
return 0;
}
-static ssize_t mem_cgroup_read(struct cgroup *cont,
- struct cftype *cft, struct file *file,
- char __user *userbuf, size_t nbytes, loff_t *ppos)
+static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
{
- return res_counter_read(&mem_cgroup_from_cont(cont)->res,
- cft->private, userbuf, nbytes, ppos,
- NULL);
+ return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res,
+ cft->private);
}
static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
@@ -871,27 +866,25 @@ static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
mem_cgroup_write_strategy);
}
-static ssize_t mem_force_empty_write(struct cgroup *cont,
- struct cftype *cft, struct file *file,
- const char __user *userbuf,
- size_t nbytes, loff_t *ppos)
+static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
{
- struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
- int ret = mem_cgroup_force_empty(mem);
- if (!ret)
- ret = nbytes;
- return ret;
+ struct mem_cgroup *mem;
+
+ mem = mem_cgroup_from_cont(cont);
+ switch (event) {
+ case RES_MAX_USAGE:
+ res_counter_reset_max(&mem->res);
+ break;
+ case RES_FAILCNT:
+ res_counter_reset_failcnt(&mem->res);
+ break;
+ }
+ return 0;
}
-/*
- * Note: This should be removed if cgroup supports write-only file.
- */
-static ssize_t mem_force_empty_read(struct cgroup *cont,
- struct cftype *cft,
- struct file *file, char __user *userbuf,
- size_t nbytes, loff_t *ppos)
+static int mem_force_empty_write(struct cgroup *cont, unsigned int event)
{
- return -EINVAL;
+ return mem_cgroup_force_empty(mem_cgroup_from_cont(cont));
}
static const struct mem_cgroup_stat_desc {
@@ -900,11 +893,13 @@ static const struct mem_cgroup_stat_desc {
} mem_cgroup_stat_desc[] = {
[MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, },
[MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, },
+ [MEM_CGROUP_STAT_PGPGIN_COUNT] = {"pgpgin", 1, },
+ [MEM_CGROUP_STAT_PGPGOUT_COUNT] = {"pgpgout", 1, },
};
-static int mem_control_stat_show(struct seq_file *m, void *arg)
+static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
+ struct cgroup_map_cb *cb)
{
- struct cgroup *cont = m->private;
struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
struct mem_cgroup_stat *stat = &mem_cont->stat;
int i;
@@ -914,8 +909,7 @@ static int mem_control_stat_show(struct seq_file *m, void *arg)
val = mem_cgroup_read_stat(stat, i);
val *= mem_cgroup_stat_desc[i].unit;
- seq_printf(m, "%s %lld\n", mem_cgroup_stat_desc[i].msg,
- (long long)val);
+ cb->fill(cb, mem_cgroup_stat_desc[i].msg, val);
}
/* showing # of active pages */
{
@@ -925,52 +919,43 @@ static int mem_control_stat_show(struct seq_file *m, void *arg)
MEM_CGROUP_ZSTAT_INACTIVE);
active = mem_cgroup_get_all_zonestat(mem_cont,
MEM_CGROUP_ZSTAT_ACTIVE);
- seq_printf(m, "active %ld\n", (active) * PAGE_SIZE);
- seq_printf(m, "inactive %ld\n", (inactive) * PAGE_SIZE);
+ cb->fill(cb, "active", (active) * PAGE_SIZE);
+ cb->fill(cb, "inactive", (inactive) * PAGE_SIZE);
}
return 0;
}
-static const struct file_operations mem_control_stat_file_operations = {
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-static int mem_control_stat_open(struct inode *unused, struct file *file)
-{
- /* XXX __d_cont */
- struct cgroup *cont = file->f_dentry->d_parent->d_fsdata;
-
- file->f_op = &mem_control_stat_file_operations;
- return single_open(file, mem_control_stat_show, cont);
-}
-
static struct cftype mem_cgroup_files[] = {
{
.name = "usage_in_bytes",
.private = RES_USAGE,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read,
+ },
+ {
+ .name = "max_usage_in_bytes",
+ .private = RES_MAX_USAGE,
+ .trigger = mem_cgroup_reset,
+ .read_u64 = mem_cgroup_read,
},
{
.name = "limit_in_bytes",
.private = RES_LIMIT,
.write = mem_cgroup_write,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read,
},
{
.name = "failcnt",
.private = RES_FAILCNT,
- .read = mem_cgroup_read,
+ .trigger = mem_cgroup_reset,
+ .read_u64 = mem_cgroup_read,
},
{
.name = "force_empty",
- .write = mem_force_empty_write,
- .read = mem_force_empty_read,
+ .trigger = mem_force_empty_write,
},
{
.name = "stat",
- .open = mem_control_stat_open,
+ .read_map = mem_control_stat_show,
},
};
@@ -1010,6 +995,29 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
kfree(mem->info.nodeinfo[node]);
}
+static struct mem_cgroup *mem_cgroup_alloc(void)
+{
+ struct mem_cgroup *mem;
+
+ if (sizeof(*mem) < PAGE_SIZE)
+ mem = kmalloc(sizeof(*mem), GFP_KERNEL);
+ else
+ mem = vmalloc(sizeof(*mem));
+
+ if (mem)
+ memset(mem, 0, sizeof(*mem));
+ return mem;
+}
+
+static void mem_cgroup_free(struct mem_cgroup *mem)
+{
+ if (sizeof(*mem) < PAGE_SIZE)
+ kfree(mem);
+ else
+ vfree(mem);
+}
+
+
static struct cgroup_subsys_state *
mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
{
@@ -1018,17 +1026,15 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
if (unlikely((cont->parent) == NULL)) {
mem = &init_mem_cgroup;
- init_mm.mem_cgroup = mem;
- } else
- mem = kzalloc(sizeof(struct mem_cgroup), GFP_KERNEL);
-
- if (mem == NULL)
- return ERR_PTR(-ENOMEM);
+ page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC);
+ } else {
+ mem = mem_cgroup_alloc();
+ if (!mem)
+ return ERR_PTR(-ENOMEM);
+ }
res_counter_init(&mem->res);
- memset(&mem->info, 0, sizeof(mem->info));
-
for_each_node_state(node, N_POSSIBLE)
if (alloc_mem_cgroup_per_zone_info(mem, node))
goto free_out;
@@ -1038,7 +1044,7 @@ free_out:
for_each_node_state(node, N_POSSIBLE)
free_mem_cgroup_per_zone_info(mem, node);
if (cont->parent != NULL)
- kfree(mem);
+ mem_cgroup_free(mem);
return ERR_PTR(-ENOMEM);
}
@@ -1058,7 +1064,7 @@ static void mem_cgroup_destroy(struct cgroup_subsys *ss,
for_each_node_state(node, N_POSSIBLE)
free_mem_cgroup_per_zone_info(mem, node);
- kfree(mem_cgroup_from_cont(cont));
+ mem_cgroup_free(mem_cgroup_from_cont(cont));
}
static int mem_cgroup_populate(struct cgroup_subsys *ss,
@@ -1098,10 +1104,6 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
if (!thread_group_leader(p))
goto out;
- css_get(&mem->css);
- rcu_assign_pointer(mm->mem_cgroup, mem);
- css_put(&old_mem->css);
-
out:
mmput(mm);
}
diff --git a/mm/memory.c b/mm/memory.c
index 0d14d1e58a5..bbab1e37055 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -371,57 +371,93 @@ static inline int is_cow_mapping(unsigned int flags)
}
/*
- * This function gets the "struct page" associated with a pte.
+ * vm_normal_page -- This function gets the "struct page" associated with a pte.
*
- * NOTE! Some mappings do not have "struct pages". A raw PFN mapping
- * will have each page table entry just pointing to a raw page frame
- * number, and as far as the VM layer is concerned, those do not have
- * pages associated with them - even if the PFN might point to memory
- * that otherwise is perfectly fine and has a "struct page".
+ * "Special" mappings do not wish to be associated with a "struct page" (either
+ * it doesn't exist, or it exists but they don't want to touch it). In this
+ * case, NULL is returned here. "Normal" mappings do have a struct page.
*
- * The way we recognize those mappings is through the rules set up
- * by "remap_pfn_range()": the vma will have the VM_PFNMAP bit set,
- * and the vm_pgoff will point to the first PFN mapped: thus every
- * page that is a raw mapping will always honor the rule
+ * There are 2 broad cases. Firstly, an architecture may define a pte_special()
+ * pte bit, in which case this function is trivial. Secondly, an architecture
+ * may not have a spare pte bit, which requires a more complicated scheme,
+ * described below.
+ *
+ * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
+ * special mapping (even if there are underlying and valid "struct pages").
+ * COWed pages of a VM_PFNMAP are always normal.
+ *
+ * The way we recognize COWed pages within VM_PFNMAP mappings is through the
+ * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
+ * set, and the vm_pgoff will point to the first PFN mapped: thus every special
+ * mapping will always honor the rule
*
* pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
*
- * and if that isn't true, the page has been COW'ed (in which case it
- * _does_ have a "struct page" associated with it even if it is in a
- * VM_PFNMAP range).
+ * And for normal mappings this is false.
+ *
+ * This restricts such mappings to be a linear translation from virtual address
+ * to pfn. To get around this restriction, we allow arbitrary mappings so long
+ * as the vma is not a COW mapping; in that case, we know that all ptes are
+ * special (because none can have been COWed).
+ *
+ *
+ * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
+ *
+ * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
+ * page" backing, however the difference is that _all_ pages with a struct
+ * page (that is, those where pfn_valid is true) are refcounted and considered
+ * normal pages by the VM. The disadvantage is that pages are refcounted
+ * (which can be slower and simply not an option for some PFNMAP users). The
+ * advantage is that we don't have to follow the strict linearity rule of
+ * PFNMAP mappings in order to support COWable mappings.
+ *
*/
-struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
+#ifdef __HAVE_ARCH_PTE_SPECIAL
+# define HAVE_PTE_SPECIAL 1
+#else
+# define HAVE_PTE_SPECIAL 0
+#endif
+struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+ pte_t pte)
{
- unsigned long pfn = pte_pfn(pte);
+ unsigned long pfn;
- if (unlikely(vma->vm_flags & VM_PFNMAP)) {
- unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
- if (pfn == vma->vm_pgoff + off)
- return NULL;
- if (!is_cow_mapping(vma->vm_flags))
- return NULL;
+ if (HAVE_PTE_SPECIAL) {
+ if (likely(!pte_special(pte))) {
+ VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+ return pte_page(pte);
+ }
+ VM_BUG_ON(!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)));
+ return NULL;
}
-#ifdef CONFIG_DEBUG_VM
- /*
- * Add some anal sanity checks for now. Eventually,
- * we should just do "return pfn_to_page(pfn)", but
- * in the meantime we check that we get a valid pfn,
- * and that the resulting page looks ok.
- */
- if (unlikely(!pfn_valid(pfn))) {
- print_bad_pte(vma, pte, addr);
- return NULL;
+ /* !HAVE_PTE_SPECIAL case follows: */
+
+ pfn = pte_pfn(pte);
+
+ if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
+ if (vma->vm_flags & VM_MIXEDMAP) {
+ if (!pfn_valid(pfn))
+ return NULL;
+ goto out;
+ } else {
+ unsigned long off;
+ off = (addr - vma->vm_start) >> PAGE_SHIFT;
+ if (pfn == vma->vm_pgoff + off)
+ return NULL;
+ if (!is_cow_mapping(vma->vm_flags))
+ return NULL;
+ }
}
-#endif
+
+ VM_BUG_ON(!pfn_valid(pfn));
/*
- * NOTE! We still have PageReserved() pages in the page
- * tables.
+ * NOTE! We still have PageReserved() pages in the page tables.
*
- * The PAGE_ZERO() pages and various VDSO mappings can
- * cause them to exist.
+ * eg. VDSO mappings can cause them to exist.
*/
+out:
return pfn_to_page(pfn);
}
@@ -1057,8 +1093,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
if (pages)
foll_flags |= FOLL_GET;
if (!write && !(vma->vm_flags & VM_LOCKED) &&
- (!vma->vm_ops || (!vma->vm_ops->nopage &&
- !vma->vm_ops->fault)))
+ (!vma->vm_ops || !vma->vm_ops->fault))
foll_flags |= FOLL_ANON;
do {
@@ -1141,8 +1176,10 @@ pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
* old drivers should use this, and they needed to mark their
* pages reserved for the old functions anyway.
*/
-static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *page, pgprot_t prot)
+static int insert_page(struct vm_area_struct *vma, unsigned long addr,
+ struct page *page, pgprot_t prot)
{
+ struct mm_struct *mm = vma->vm_mm;
int retval;
pte_t *pte;
spinlock_t *ptl;
@@ -1202,40 +1239,26 @@ out:
*
* The page does not need to be reserved.
*/
-int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page)
+int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
+ struct page *page)
{
if (addr < vma->vm_start || addr >= vma->vm_end)
return -EFAULT;
if (!page_count(page))
return -EINVAL;
vma->vm_flags |= VM_INSERTPAGE;
- return insert_page(vma->vm_mm, addr, page, vma->vm_page_prot);
+ return insert_page(vma, addr, page, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_insert_page);
-/**
- * vm_insert_pfn - insert single pfn into user vma
- * @vma: user vma to map to
- * @addr: target user address of this page
- * @pfn: source kernel pfn
- *
- * Similar to vm_inert_page, this allows drivers to insert individual pages
- * they've allocated into a user vma. Same comments apply.
- *
- * This function should only be called from a vm_ops->fault handler, and
- * in that case the handler should return NULL.
- */
-int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
- unsigned long pfn)
+static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, pgprot_t prot)
{
struct mm_struct *mm = vma->vm_mm;
int retval;
pte_t *pte, entry;
spinlock_t *ptl;
- BUG_ON(!(vma->vm_flags & VM_PFNMAP));
- BUG_ON(is_cow_mapping(vma->vm_flags));
-
retval = -ENOMEM;
pte = get_locked_pte(mm, addr, &ptl);
if (!pte)
@@ -1245,19 +1268,74 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
goto out_unlock;
/* Ok, finally just insert the thing.. */
- entry = pfn_pte(pfn, vma->vm_page_prot);
+ entry = pte_mkspecial(pfn_pte(pfn, prot));
set_pte_at(mm, addr, pte, entry);
- update_mmu_cache(vma, addr, entry);
+ update_mmu_cache(vma, addr, entry); /* XXX: why not for insert_page? */
retval = 0;
out_unlock:
pte_unmap_unlock(pte, ptl);
-
out:
return retval;
}
+
+/**
+ * vm_insert_pfn - insert single pfn into user vma
+ * @vma: user vma to map to
+ * @addr: target user address of this page
+ * @pfn: source kernel pfn
+ *
+ * Similar to vm_inert_page, this allows drivers to insert individual pages
+ * they've allocated into a user vma. Same comments apply.
+ *
+ * This function should only be called from a vm_ops->fault handler, and
+ * in that case the handler should return NULL.
+ */
+int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn)
+{
+ /*
+ * Technically, architectures with pte_special can avoid all these
+ * restrictions (same for remap_pfn_range). However we would like
+ * consistency in testing and feature parity among all, so we should
+ * try to keep these invariants in place for everybody.
+ */
+ BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
+ BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
+ (VM_PFNMAP|VM_MIXEDMAP));
+ BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
+ BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
+
+ if (addr < vma->vm_start || addr >= vma->vm_end)
+ return -EFAULT;
+ return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
+}
EXPORT_SYMBOL(vm_insert_pfn);
+int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn)
+{
+ BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
+
+ if (addr < vma->vm_start || addr >= vma->vm_end)
+ return -EFAULT;
+
+ /*
+ * If we don't have pte special, then we have to use the pfn_valid()
+ * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
+ * refcount the page if pfn_valid is true (hence insert_page rather
+ * than insert_pfn).
+ */
+ if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
+ struct page *page;
+
+ page = pfn_to_page(pfn);
+ return insert_page(vma, addr, page, vma->vm_page_prot);
+ }
+ return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
+}
+EXPORT_SYMBOL(vm_insert_mixed);
+
/*
* maps a range of physical memory into the requested pages. the old
* mappings are removed. any references to nonexistent pages results
@@ -1276,7 +1354,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
arch_enter_lazy_mmu_mode();
do {
BUG_ON(!pte_none(*pte));
- set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
+ set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
pfn++;
} while (pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
@@ -2199,20 +2277,9 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
BUG_ON(vma->vm_flags & VM_PFNMAP);
- if (likely(vma->vm_ops->fault)) {
- ret = vma->vm_ops->fault(vma, &vmf);
- if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
- return ret;
- } else {
- /* Legacy ->nopage path */
- ret = 0;
- vmf.page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
- /* no page was available -- either SIGBUS or OOM */
- if (unlikely(vmf.page == NOPAGE_SIGBUS))
- return VM_FAULT_SIGBUS;
- else if (unlikely(vmf.page == NOPAGE_OOM))
- return VM_FAULT_OOM;
- }
+ ret = vma->vm_ops->fault(vma, &vmf);
+ if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
+ return ret;
/*
* For consistency in subsequent calls, make the faulted page always
@@ -2377,10 +2444,13 @@ static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long pfn;
pte_unmap(page_table);
- BUG_ON(!(vma->vm_flags & VM_PFNMAP));
- BUG_ON(is_cow_mapping(vma->vm_flags));
+ BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
+ BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK);
+
+ BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
+
if (unlikely(pfn == NOPFN_OOM))
return VM_FAULT_OOM;
else if (unlikely(pfn == NOPFN_SIGBUS))
@@ -2458,7 +2528,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
if (!pte_present(entry)) {
if (pte_none(entry)) {
if (vma->vm_ops) {
- if (vma->vm_ops->fault || vma->vm_ops->nopage)
+ if (likely(vma->vm_ops->fault))
return do_linear_fault(mm, vma, address,
pte, pmd, write_access, entry);
if (unlikely(vma->vm_ops->nopfn))
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 7469c503580..b17dca7249f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -29,6 +29,8 @@
#include <asm/tlbflush.h>
+#include "internal.h"
+
/* add this memory to iomem resource */
static struct resource *register_memory_resource(u64 start, u64 size)
{
@@ -58,8 +60,105 @@ static void release_memory_resource(struct resource *res)
return;
}
-
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+#ifndef CONFIG_SPARSEMEM_VMEMMAP
+static void get_page_bootmem(unsigned long info, struct page *page, int magic)
+{
+ atomic_set(&page->_mapcount, magic);
+ SetPagePrivate(page);
+ set_page_private(page, info);
+ atomic_inc(&page->_count);
+}
+
+void put_page_bootmem(struct page *page)
+{
+ int magic;
+
+ magic = atomic_read(&page->_mapcount);
+ BUG_ON(magic >= -1);
+
+ if (atomic_dec_return(&page->_count) == 1) {
+ ClearPagePrivate(page);
+ set_page_private(page, 0);
+ reset_page_mapcount(page);
+ __free_pages_bootmem(page, 0);
+ }
+
+}
+
+void register_page_bootmem_info_section(unsigned long start_pfn)
+{
+ unsigned long *usemap, mapsize, section_nr, i;
+ struct mem_section *ms;
+ struct page *page, *memmap;
+
+ if (!pfn_valid(start_pfn))
+ return;
+
+ section_nr = pfn_to_section_nr(start_pfn);
+ ms = __nr_to_section(section_nr);
+
+ /* Get section's memmap address */
+ memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
+
+ /*
+ * Get page for the memmap's phys address
+ * XXX: need more consideration for sparse_vmemmap...
+ */
+ page = virt_to_page(memmap);
+ mapsize = sizeof(struct page) * PAGES_PER_SECTION;
+ mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;
+
+ /* remember memmap's page */
+ for (i = 0; i < mapsize; i++, page++)
+ get_page_bootmem(section_nr, page, SECTION_INFO);
+
+ usemap = __nr_to_section(section_nr)->pageblock_flags;
+ page = virt_to_page(usemap);
+
+ mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
+
+ for (i = 0; i < mapsize; i++, page++)
+ get_page_bootmem(section_nr, page, MIX_INFO);
+
+}
+
+void register_page_bootmem_info_node(struct pglist_data *pgdat)
+{
+ unsigned long i, pfn, end_pfn, nr_pages;
+ int node = pgdat->node_id;
+ struct page *page;
+ struct zone *zone;
+
+ nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
+ page = virt_to_page(pgdat);
+
+ for (i = 0; i < nr_pages; i++, page++)
+ get_page_bootmem(node, page, NODE_INFO);
+
+ zone = &pgdat->node_zones[0];
+ for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) {
+ if (zone->wait_table) {
+ nr_pages = zone->wait_table_hash_nr_entries
+ * sizeof(wait_queue_head_t);
+ nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT;
+ page = virt_to_page(zone->wait_table);
+
+ for (i = 0; i < nr_pages; i++, page++)
+ get_page_bootmem(node, page, NODE_INFO);
+ }
+ }
+
+ pfn = pgdat->node_start_pfn;
+ end_pfn = pfn + pgdat->node_spanned_pages;
+
+ /* register_section info */
+ for (; pfn < end_pfn; pfn += PAGES_PER_SECTION)
+ register_page_bootmem_info_section(pfn);
+
+}
+#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
+
static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
{
struct pglist_data *pgdat = zone->zone_pgdat;
@@ -101,6 +200,36 @@ static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
return register_new_memory(__pfn_to_section(phys_start_pfn));
}
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+static int __remove_section(struct zone *zone, struct mem_section *ms)
+{
+ /*
+ * XXX: Freeing memmap with vmemmap is not implement yet.
+ * This should be removed later.
+ */
+ return -EBUSY;
+}
+#else
+static int __remove_section(struct zone *zone, struct mem_section *ms)
+{
+ unsigned long flags;
+ struct pglist_data *pgdat = zone->zone_pgdat;
+ int ret = -EINVAL;
+
+ if (!valid_section(ms))
+ return ret;
+
+ ret = unregister_memory_section(ms);
+ if (ret)
+ return ret;
+
+ pgdat_resize_lock(pgdat, &flags);
+ sparse_remove_one_section(zone, ms);
+ pgdat_resize_unlock(pgdat, &flags);
+ return 0;
+}
+#endif
+
/*
* Reasonably generic function for adding memory. It is
* expected that archs that support memory hotplug will
@@ -134,6 +263,42 @@ int __add_pages(struct zone *zone, unsigned long phys_start_pfn,
}
EXPORT_SYMBOL_GPL(__add_pages);
+/**
+ * __remove_pages() - remove sections of pages from a zone
+ * @zone: zone from which pages need to be removed
+ * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
+ * @nr_pages: number of pages to remove (must be multiple of section size)
+ *
+ * Generic helper function to remove section mappings and sysfs entries
+ * for the section of the memory we are removing. Caller needs to make
+ * sure that pages are marked reserved and zones are adjust properly by
+ * calling offline_pages().
+ */
+int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
+ unsigned long nr_pages)
+{
+ unsigned long i, ret = 0;
+ int sections_to_remove;
+
+ /*
+ * We can only remove entire sections
+ */
+ BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
+ BUG_ON(nr_pages % PAGES_PER_SECTION);
+
+ release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE);
+
+ sections_to_remove = nr_pages / PAGES_PER_SECTION;
+ for (i = 0; i < sections_to_remove; i++) {
+ unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
+ ret = __remove_section(zone, __pfn_to_section(pfn));
+ if (ret)
+ break;
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__remove_pages);
+
static void grow_zone_span(struct zone *zone,
unsigned long start_pfn, unsigned long end_pfn)
{
@@ -164,6 +329,25 @@ static void grow_pgdat_span(struct pglist_data *pgdat,
pgdat->node_start_pfn;
}
+void online_page(struct page *page)
+{
+ totalram_pages++;
+ num_physpages++;
+
+#ifdef CONFIG_HIGHMEM
+ if (PageHighMem(page))
+ totalhigh_pages++;
+#endif
+
+#ifdef CONFIG_FLATMEM
+ max_mapnr = max(page_to_pfn(page), max_mapnr);
+#endif
+
+ ClearPageReserved(page);
+ init_page_count(page);
+ __free_page(page);
+}
+
static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
void *arg)
{
@@ -208,7 +392,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
/*
* This doesn't need a lock to do pfn_to_page().
* The section can't be removed here because of the
- * memory_block->state_sem.
+ * memory_block->state_mutex.
*/
zone = page_zone(pfn_to_page(pfn));
pgdat_resize_lock(zone->zone_pgdat, &flags);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3c360112150..a37a5034f63 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -63,7 +63,6 @@
grows down?
make bind policy root only? It can trigger oom much faster and the
kernel is not always grateful with that.
- could replace all the switch()es with a mempolicy_ops structure.
*/
#include <linux/mempolicy.h>
@@ -89,6 +88,7 @@
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
+#include <linux/ctype.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>
@@ -105,142 +105,264 @@ static struct kmem_cache *sn_cache;
policied. */
enum zone_type policy_zone = 0;
+/*
+ * run-time system-wide default policy => local allocation
+ */
struct mempolicy default_policy = {
.refcnt = ATOMIC_INIT(1), /* never free it */
- .policy = MPOL_DEFAULT,
+ .mode = MPOL_PREFERRED,
+ .flags = MPOL_F_LOCAL,
};
-static void mpol_rebind_policy(struct mempolicy *pol,
- const nodemask_t *newmask);
+static const struct mempolicy_operations {
+ int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
+ void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
+} mpol_ops[MPOL_MAX];
-/* Do sanity checking on a policy */
-static int mpol_check_policy(int mode, nodemask_t *nodes)
+/* Check that the nodemask contains at least one populated zone */
+static int is_valid_nodemask(const nodemask_t *nodemask)
{
- int was_empty, is_empty;
+ int nd, k;
- if (!nodes)
- return 0;
+ /* Check that there is something useful in this mask */
+ k = policy_zone;
- /*
- * "Contextualize" the in-coming nodemast for cpusets:
- * Remember whether in-coming nodemask was empty, If not,
- * restrict the nodes to the allowed nodes in the cpuset.
- * This is guaranteed to be a subset of nodes with memory.
- */
- cpuset_update_task_memory_state();
- is_empty = was_empty = nodes_empty(*nodes);
- if (!was_empty) {
- nodes_and(*nodes, *nodes, cpuset_current_mems_allowed);
- is_empty = nodes_empty(*nodes); /* after "contextualization" */
- }
+ for_each_node_mask(nd, *nodemask) {
+ struct zone *z;
- switch (mode) {
- case MPOL_DEFAULT:
- /*
- * require caller to specify an empty nodemask
- * before "contextualization"
- */
- if (!was_empty)
- return -EINVAL;
- break;
- case MPOL_BIND:
- case MPOL_INTERLEAVE:
- /*
- * require at least 1 valid node after "contextualization"
- */
- if (is_empty)
- return -EINVAL;
- break;
- case MPOL_PREFERRED:
- /*
- * Did caller specify invalid nodes?
- * Don't silently accept this as "local allocation".
- */
- if (!was_empty && is_empty)
- return -EINVAL;
- break;
+ for (k = 0; k <= policy_zone; k++) {
+ z = &NODE_DATA(nd)->node_zones[k];
+ if (z->present_pages > 0)
+ return 1;
+ }
}
+
return 0;
}
-/* Generate a custom zonelist for the BIND policy. */
-static struct zonelist *bind_zonelist(nodemask_t *nodes)
+static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
- struct zonelist *zl;
- int num, max, nd;
- enum zone_type k;
+ return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
+}
- max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
- max++; /* space for zlcache_ptr (see mmzone.h) */
- zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
- if (!zl)
- return ERR_PTR(-ENOMEM);
- zl->zlcache_ptr = NULL;
- num = 0;
- /* First put in the highest zones from all nodes, then all the next
- lower zones etc. Avoid empty zones because the memory allocator
- doesn't like them. If you implement node hot removal you
- have to fix that. */
- k = MAX_NR_ZONES - 1;
- while (1) {
- for_each_node_mask(nd, *nodes) {
- struct zone *z = &NODE_DATA(nd)->node_zones[k];
- if (z->present_pages > 0)
- zl->zones[num++] = z;
- }
- if (k == 0)
- break;
- k--;
- }
- if (num == 0) {
- kfree(zl);
- return ERR_PTR(-EINVAL);
- }
- zl->zones[num] = NULL;
- return zl;
+static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
+ const nodemask_t *rel)
+{
+ nodemask_t tmp;
+ nodes_fold(tmp, *orig, nodes_weight(*rel));
+ nodes_onto(*ret, tmp, *rel);
+}
+
+static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
+{
+ if (nodes_empty(*nodes))
+ return -EINVAL;
+ pol->v.nodes = *nodes;
+ return 0;
+}
+
+static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
+{
+ if (!nodes)
+ pol->flags |= MPOL_F_LOCAL; /* local allocation */
+ else if (nodes_empty(*nodes))
+ return -EINVAL; /* no allowed nodes */
+ else
+ pol->v.preferred_node = first_node(*nodes);
+ return 0;
+}
+
+static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
+{
+ if (!is_valid_nodemask(nodes))
+ return -EINVAL;
+ pol->v.nodes = *nodes;
+ return 0;
}
/* Create a new policy */
-static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
+static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
+ nodemask_t *nodes)
{
struct mempolicy *policy;
+ nodemask_t cpuset_context_nmask;
+ int ret;
- pr_debug("setting mode %d nodes[0] %lx\n",
- mode, nodes ? nodes_addr(*nodes)[0] : -1);
+ pr_debug("setting mode %d flags %d nodes[0] %lx\n",
+ mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
- if (mode == MPOL_DEFAULT)
- return NULL;
+ if (mode == MPOL_DEFAULT) {
+ if (nodes && !nodes_empty(*nodes))
+ return ERR_PTR(-EINVAL);
+ return NULL; /* simply delete any existing policy */
+ }
+ VM_BUG_ON(!nodes);
+
+ /*
+ * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
+ * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
+ * All other modes require a valid pointer to a non-empty nodemask.
+ */
+ if (mode == MPOL_PREFERRED) {
+ if (nodes_empty(*nodes)) {
+ if (((flags & MPOL_F_STATIC_NODES) ||
+ (flags & MPOL_F_RELATIVE_NODES)))
+ return ERR_PTR(-EINVAL);
+ nodes = NULL; /* flag local alloc */
+ }
+ } else if (nodes_empty(*nodes))
+ return ERR_PTR(-EINVAL);
policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
if (!policy)
return ERR_PTR(-ENOMEM);
atomic_set(&policy->refcnt, 1);
- switch (mode) {
- case MPOL_INTERLEAVE:
- policy->v.nodes = *nodes;
- if (nodes_weight(policy->v.nodes) == 0) {
- kmem_cache_free(policy_cache, policy);
- return ERR_PTR(-EINVAL);
- }
- break;
- case MPOL_PREFERRED:
- policy->v.preferred_node = first_node(*nodes);
- if (policy->v.preferred_node >= MAX_NUMNODES)
- policy->v.preferred_node = -1;
- break;
- case MPOL_BIND:
- policy->v.zonelist = bind_zonelist(nodes);
- if (IS_ERR(policy->v.zonelist)) {
- void *error_code = policy->v.zonelist;
- kmem_cache_free(policy_cache, policy);
- return error_code;
- }
- break;
+ policy->mode = mode;
+ policy->flags = flags;
+
+ if (nodes) {
+ /*
+ * cpuset related setup doesn't apply to local allocation
+ */
+ cpuset_update_task_memory_state();
+ if (flags & MPOL_F_RELATIVE_NODES)
+ mpol_relative_nodemask(&cpuset_context_nmask, nodes,
+ &cpuset_current_mems_allowed);
+ else
+ nodes_and(cpuset_context_nmask, *nodes,
+ cpuset_current_mems_allowed);
+ if (mpol_store_user_nodemask(policy))
+ policy->w.user_nodemask = *nodes;
+ else
+ policy->w.cpuset_mems_allowed =
+ cpuset_mems_allowed(current);
+ }
+
+ ret = mpol_ops[mode].create(policy,
+ nodes ? &cpuset_context_nmask : NULL);
+ if (ret < 0) {
+ kmem_cache_free(policy_cache, policy);
+ return ERR_PTR(ret);
}
- policy->policy = mode;
- policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
return policy;
}
+/* Slow path of a mpol destructor. */
+void __mpol_put(struct mempolicy *p)
+{
+ if (!atomic_dec_and_test(&p->refcnt))
+ return;
+ kmem_cache_free(policy_cache, p);
+}
+
+static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
+{
+}
+
+static void mpol_rebind_nodemask(struct mempolicy *pol,
+ const nodemask_t *nodes)
+{
+ nodemask_t tmp;
+
+ if (pol->flags & MPOL_F_STATIC_NODES)
+ nodes_and(tmp, pol->w.user_nodemask, *nodes);
+ else if (pol->flags & MPOL_F_RELATIVE_NODES)
+ mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
+ else {
+ nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
+ *nodes);
+ pol->w.cpuset_mems_allowed = *nodes;
+ }
+
+ pol->v.nodes = tmp;
+ if (!node_isset(current->il_next, tmp)) {
+ current->il_next = next_node(current->il_next, tmp);
+ if (current->il_next >= MAX_NUMNODES)
+ current->il_next = first_node(tmp);
+ if (current->il_next >= MAX_NUMNODES)
+ current->il_next = numa_node_id();
+ }
+}
+
+static void mpol_rebind_preferred(struct mempolicy *pol,
+ const nodemask_t *nodes)
+{
+ nodemask_t tmp;
+
+ if (pol->flags & MPOL_F_STATIC_NODES) {
+ int node = first_node(pol->w.user_nodemask);
+
+ if (node_isset(node, *nodes)) {
+ pol->v.preferred_node = node;
+ pol->flags &= ~MPOL_F_LOCAL;
+ } else
+ pol->flags |= MPOL_F_LOCAL;
+ } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
+ mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
+ pol->v.preferred_node = first_node(tmp);
+ } else if (!(pol->flags & MPOL_F_LOCAL)) {
+ pol->v.preferred_node = node_remap(pol->v.preferred_node,
+ pol->w.cpuset_mems_allowed,
+ *nodes);
+ pol->w.cpuset_mems_allowed = *nodes;
+ }
+}
+
+/* Migrate a policy to a different set of nodes */
+static void mpol_rebind_policy(struct mempolicy *pol,
+ const nodemask_t *newmask)
+{
+ if (!pol)
+ return;
+ if (!mpol_store_user_nodemask(pol) &&
+ nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
+ return;
+ mpol_ops[pol->mode].rebind(pol, newmask);
+}
+
+/*
+ * Wrapper for mpol_rebind_policy() that just requires task
+ * pointer, and updates task mempolicy.
+ */
+
+void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
+{
+ mpol_rebind_policy(tsk->mempolicy, new);
+}
+
+/*
+ * Rebind each vma in mm to new nodemask.
+ *
+ * Call holding a reference to mm. Takes mm->mmap_sem during call.
+ */
+
+void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
+{
+ struct vm_area_struct *vma;
+
+ down_write(&mm->mmap_sem);
+ for (vma = mm->mmap; vma; vma = vma->vm_next)
+ mpol_rebind_policy(vma->vm_policy, new);
+ up_write(&mm->mmap_sem);
+}
+
+static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
+ [MPOL_DEFAULT] = {
+ .rebind = mpol_rebind_default,
+ },
+ [MPOL_INTERLEAVE] = {
+ .create = mpol_new_interleave,
+ .rebind = mpol_rebind_nodemask,
+ },
+ [MPOL_PREFERRED] = {
+ .create = mpol_new_preferred,
+ .rebind = mpol_rebind_preferred,
+ },
+ [MPOL_BIND] = {
+ .create = mpol_new_bind,
+ .rebind = mpol_rebind_nodemask,
+ },
+};
+
static void gather_stats(struct page *, void *, int pte_dirty);
static void migrate_page_add(struct page *page, struct list_head *pagelist,
unsigned long flags);
@@ -421,7 +543,7 @@ static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
if (!err) {
mpol_get(new);
vma->vm_policy = new;
- mpol_free(old);
+ mpol_put(old);
}
return err;
}
@@ -479,46 +601,55 @@ static void mpol_set_task_struct_flag(void)
}
/* Set the process memory policy */
-static long do_set_mempolicy(int mode, nodemask_t *nodes)
+static long do_set_mempolicy(unsigned short mode, unsigned short flags,
+ nodemask_t *nodes)
{
struct mempolicy *new;
+ struct mm_struct *mm = current->mm;
- if (mpol_check_policy(mode, nodes))
- return -EINVAL;
- new = mpol_new(mode, nodes);
+ new = mpol_new(mode, flags, nodes);
if (IS_ERR(new))
return PTR_ERR(new);
- mpol_free(current->mempolicy);
+
+ /*
+ * prevent changing our mempolicy while show_numa_maps()
+ * is using it.
+ * Note: do_set_mempolicy() can be called at init time
+ * with no 'mm'.
+ */
+ if (mm)
+ down_write(&mm->mmap_sem);
+ mpol_put(current->mempolicy);
current->mempolicy = new;
mpol_set_task_struct_flag();
- if (new && new->policy == MPOL_INTERLEAVE)
+ if (new && new->mode == MPOL_INTERLEAVE &&
+ nodes_weight(new->v.nodes))
current->il_next = first_node(new->v.nodes);
+ if (mm)
+ up_write(&mm->mmap_sem);
+
return 0;
}
-/* Fill a zone bitmap for a policy */
-static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
+/*
+ * Return nodemask for policy for get_mempolicy() query
+ */
+static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
{
- int i;
-
nodes_clear(*nodes);
- switch (p->policy) {
+ if (p == &default_policy)
+ return;
+
+ switch (p->mode) {
case MPOL_BIND:
- for (i = 0; p->v.zonelist->zones[i]; i++)
- node_set(zone_to_nid(p->v.zonelist->zones[i]),
- *nodes);
- break;
- case MPOL_DEFAULT:
- break;
+ /* Fall through */
case MPOL_INTERLEAVE:
*nodes = p->v.nodes;
break;
case MPOL_PREFERRED:
- /* or use current node instead of memory_map? */
- if (p->v.preferred_node < 0)
- *nodes = node_states[N_HIGH_MEMORY];
- else
+ if (!(p->flags & MPOL_F_LOCAL))
node_set(p->v.preferred_node, *nodes);
+ /* else return empty node mask for local allocation */
break;
default:
BUG();
@@ -561,6 +692,11 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
}
if (flags & MPOL_F_ADDR) {
+ /*
+ * Do NOT fall back to task policy if the
+ * vma/shared policy at addr is NULL. We
+ * want to return MPOL_DEFAULT in this case.
+ */
down_read(&mm->mmap_sem);
vma = find_vma_intersection(mm, addr, addr+1);
if (!vma) {
@@ -575,7 +711,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
return -EINVAL;
if (!pol)
- pol = &default_policy;
+ pol = &default_policy; /* indicates default behavior */
if (flags & MPOL_F_NODE) {
if (flags & MPOL_F_ADDR) {
@@ -584,14 +720,17 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
goto out;
*policy = err;
} else if (pol == current->mempolicy &&
- pol->policy == MPOL_INTERLEAVE) {
+ pol->mode == MPOL_INTERLEAVE) {
*policy = current->il_next;
} else {
err = -EINVAL;
goto out;
}
- } else
- *policy = pol->policy;
+ } else {
+ *policy = pol == &default_policy ? MPOL_DEFAULT :
+ pol->mode;
+ *policy |= pol->flags;
+ }
if (vma) {
up_read(&current->mm->mmap_sem);
@@ -600,9 +739,10 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
err = 0;
if (nmask)
- get_zonemask(pol, nmask);
+ get_policy_nodemask(pol, nmask);
out:
+ mpol_cond_put(pol);
if (vma)
up_read(&current->mm->mmap_sem);
return err;
@@ -664,7 +804,7 @@ int do_migrate_pages(struct mm_struct *mm,
int err = 0;
nodemask_t tmp;
- down_read(&mm->mmap_sem);
+ down_read(&mm->mmap_sem);
err = migrate_vmas(mm, from_nodes, to_nodes, flags);
if (err)
@@ -781,8 +921,8 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int *
#endif
static long do_mbind(unsigned long start, unsigned long len,
- unsigned long mode, nodemask_t *nmask,
- unsigned long flags)
+ unsigned short mode, unsigned short mode_flags,
+ nodemask_t *nmask, unsigned long flags)
{
struct vm_area_struct *vma;
struct mm_struct *mm = current->mm;
@@ -791,9 +931,8 @@ static long do_mbind(unsigned long start, unsigned long len,
int err;
LIST_HEAD(pagelist);
- if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
- MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
- || mode > MPOL_MAX)
+ if (flags & ~(unsigned long)(MPOL_MF_STRICT |
+ MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
return -EINVAL;
if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
return -EPERM;
@@ -812,10 +951,7 @@ static long do_mbind(unsigned long start, unsigned long len,
if (end == start)
return 0;
- if (mpol_check_policy(mode, nmask))
- return -EINVAL;
-
- new = mpol_new(mode, nmask);
+ new = mpol_new(mode, mode_flags, nmask);
if (IS_ERR(new))
return PTR_ERR(new);
@@ -826,8 +962,9 @@ static long do_mbind(unsigned long start, unsigned long len,
if (!new)
flags |= MPOL_MF_DISCONTIG_OK;
- pr_debug("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
- mode, nmask ? nodes_addr(*nmask)[0] : -1);
+ pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
+ start, start + len, mode, mode_flags,
+ nmask ? nodes_addr(*nmask)[0] : -1);
down_write(&mm->mmap_sem);
vma = check_range(mm, start, end, nmask,
@@ -848,7 +985,7 @@ static long do_mbind(unsigned long start, unsigned long len,
}
up_write(&mm->mmap_sem);
- mpol_free(new);
+ mpol_put(new);
return err;
}
@@ -926,11 +1063,19 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
{
nodemask_t nodes;
int err;
+ unsigned short mode_flags;
+ mode_flags = mode & MPOL_MODE_FLAGS;
+ mode &= ~MPOL_MODE_FLAGS;
+ if (mode >= MPOL_MAX)
+ return -EINVAL;
+ if ((mode_flags & MPOL_F_STATIC_NODES) &&
+ (mode_flags & MPOL_F_RELATIVE_NODES))
+ return -EINVAL;
err = get_nodes(&nodes, nmask, maxnode);
if (err)
return err;
- return do_mbind(start, len, mode, &nodes, flags);
+ return do_mbind(start, len, mode, mode_flags, &nodes, flags);
}
/* Set the process memory policy */
@@ -939,13 +1084,18 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
{
int err;
nodemask_t nodes;
+ unsigned short flags;
- if (mode < 0 || mode > MPOL_MAX)
+ flags = mode & MPOL_MODE_FLAGS;
+ mode &= ~MPOL_MODE_FLAGS;
+ if ((unsigned int)mode >= MPOL_MAX)
+ return -EINVAL;
+ if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
return -EINVAL;
err = get_nodes(&nodes, nmask, maxnode);
if (err)
return err;
- return do_set_mempolicy(mode, &nodes);
+ return do_set_mempolicy(mode, flags, &nodes);
}
asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
@@ -1131,59 +1281,75 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
*
* Returns effective policy for a VMA at specified address.
* Falls back to @task or system default policy, as necessary.
- * Returned policy has extra reference count if shared, vma,
- * or some other task's policy [show_numa_maps() can pass
- * @task != current]. It is the caller's responsibility to
- * free the reference in these cases.
+ * Current or other task's task mempolicy and non-shared vma policies
+ * are protected by the task's mmap_sem, which must be held for read by
+ * the caller.
+ * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
+ * count--added by the get_policy() vm_op, as appropriate--to protect against
+ * freeing by another task. It is the caller's responsibility to free the
+ * extra reference for shared policies.
*/
-static struct mempolicy * get_vma_policy(struct task_struct *task,
+static struct mempolicy *get_vma_policy(struct task_struct *task,
struct vm_area_struct *vma, unsigned long addr)
{
struct mempolicy *pol = task->mempolicy;
- int shared_pol = 0;
if (vma) {
if (vma->vm_ops && vma->vm_ops->get_policy) {
- pol = vma->vm_ops->get_policy(vma, addr);
- shared_pol = 1; /* if pol non-NULL, add ref below */
- } else if (vma->vm_policy &&
- vma->vm_policy->policy != MPOL_DEFAULT)
+ struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
+ addr);
+ if (vpol)
+ pol = vpol;
+ } else if (vma->vm_policy)
pol = vma->vm_policy;
}
if (!pol)
pol = &default_policy;
- else if (!shared_pol && pol != current->mempolicy)
- mpol_get(pol); /* vma or other task's policy */
return pol;
}
-/* Return a zonelist representing a mempolicy */
-static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
+/*
+ * Return a nodemask representing a mempolicy for filtering nodes for
+ * page allocation
+ */
+static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
{
- int nd;
+ /* Lower zones don't get a nodemask applied for MPOL_BIND */
+ if (unlikely(policy->mode == MPOL_BIND) &&
+ gfp_zone(gfp) >= policy_zone &&
+ cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
+ return &policy->v.nodes;
- switch (policy->policy) {
+ return NULL;
+}
+
+/* Return a zonelist indicated by gfp for node representing a mempolicy */
+static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
+{
+ int nd = numa_node_id();
+
+ switch (policy->mode) {
case MPOL_PREFERRED:
- nd = policy->v.preferred_node;
- if (nd < 0)
- nd = numa_node_id();
+ if (!(policy->flags & MPOL_F_LOCAL))
+ nd = policy->v.preferred_node;
break;
case MPOL_BIND:
- /* Lower zones don't get a policy applied */
- /* Careful: current->mems_allowed might have moved */
- if (gfp_zone(gfp) >= policy_zone)
- if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
- return policy->v.zonelist;
- /*FALL THROUGH*/
+ /*
+ * Normally, MPOL_BIND allocations are node-local within the
+ * allowed nodemask. However, if __GFP_THISNODE is set and the
+ * current node is part of the mask, we use the zonelist for
+ * the first node in the mask instead.
+ */
+ if (unlikely(gfp & __GFP_THISNODE) &&
+ unlikely(!node_isset(nd, policy->v.nodes)))
+ nd = first_node(policy->v.nodes);
+ break;
case MPOL_INTERLEAVE: /* should not happen */
- case MPOL_DEFAULT:
- nd = numa_node_id();
break;
default:
- nd = 0;
BUG();
}
- return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
+ return node_zonelist(nd, gfp);
}
/* Do dynamic interleaving for a process */
@@ -1196,36 +1362,51 @@ static unsigned interleave_nodes(struct mempolicy *policy)
next = next_node(nid, policy->v.nodes);
if (next >= MAX_NUMNODES)
next = first_node(policy->v.nodes);
- me->il_next = next;
+ if (next < MAX_NUMNODES)
+ me->il_next = next;
return nid;
}
/*
* Depending on the memory policy provide a node from which to allocate the
* next slab entry.
+ * @policy must be protected by freeing by the caller. If @policy is
+ * the current task's mempolicy, this protection is implicit, as only the
+ * task can change it's policy. The system default policy requires no
+ * such protection.
*/
unsigned slab_node(struct mempolicy *policy)
{
- int pol = policy ? policy->policy : MPOL_DEFAULT;
+ if (!policy || policy->flags & MPOL_F_LOCAL)
+ return numa_node_id();
+
+ switch (policy->mode) {
+ case MPOL_PREFERRED:
+ /*
+ * handled MPOL_F_LOCAL above
+ */
+ return policy->v.preferred_node;
- switch (pol) {
case MPOL_INTERLEAVE:
return interleave_nodes(policy);
- case MPOL_BIND:
+ case MPOL_BIND: {
/*
* Follow bind policy behavior and start allocation at the
* first node.
*/
- return zone_to_nid(policy->v.zonelist->zones[0]);
-
- case MPOL_PREFERRED:
- if (policy->v.preferred_node >= 0)
- return policy->v.preferred_node;
- /* Fall through */
+ struct zonelist *zonelist;
+ struct zone *zone;
+ enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
+ zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
+ (void)first_zones_zonelist(zonelist, highest_zoneidx,
+ &policy->v.nodes,
+ &zone);
+ return zone->node;
+ }
default:
- return numa_node_id();
+ BUG();
}
}
@@ -1234,10 +1415,13 @@ static unsigned offset_il_node(struct mempolicy *pol,
struct vm_area_struct *vma, unsigned long off)
{
unsigned nnodes = nodes_weight(pol->v.nodes);
- unsigned target = (unsigned)off % nnodes;
+ unsigned target;
int c;
int nid = -1;
+ if (!nnodes)
+ return numa_node_id();
+ target = (unsigned int)off % nnodes;
c = 0;
do {
nid = next_node(nid, pol->v.nodes);
@@ -1274,40 +1458,30 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
* @vma = virtual memory area whose policy is sought
* @addr = address in @vma for shared policy lookup and interleave policy
* @gfp_flags = for requested zone
- * @mpol = pointer to mempolicy pointer for reference counted 'BIND policy
+ * @mpol = pointer to mempolicy pointer for reference counted mempolicy
+ * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
*
- * Returns a zonelist suitable for a huge page allocation.
- * If the effective policy is 'BIND, returns pointer to policy's zonelist.
- * If it is also a policy for which get_vma_policy() returns an extra
- * reference, we must hold that reference until after allocation.
- * In that case, return policy via @mpol so hugetlb allocation can drop
- * the reference. For non-'BIND referenced policies, we can/do drop the
- * reference here, so the caller doesn't need to know about the special case
- * for default and current task policy.
+ * Returns a zonelist suitable for a huge page allocation and a pointer
+ * to the struct mempolicy for conditional unref after allocation.
+ * If the effective policy is 'BIND, returns a pointer to the mempolicy's
+ * @nodemask for filtering the zonelist.
*/
struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
- gfp_t gfp_flags, struct mempolicy **mpol)
+ gfp_t gfp_flags, struct mempolicy **mpol,
+ nodemask_t **nodemask)
{
- struct mempolicy *pol = get_vma_policy(current, vma, addr);
struct zonelist *zl;
- *mpol = NULL; /* probably no unref needed */
- if (pol->policy == MPOL_INTERLEAVE) {
- unsigned nid;
-
- nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
- if (unlikely(pol != &default_policy &&
- pol != current->mempolicy))
- __mpol_free(pol); /* finished with pol */
- return NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_flags);
- }
+ *mpol = get_vma_policy(current, vma, addr);
+ *nodemask = NULL; /* assume !MPOL_BIND */
- zl = zonelist_policy(GFP_HIGHUSER, pol);
- if (unlikely(pol != &default_policy && pol != current->mempolicy)) {
- if (pol->policy != MPOL_BIND)
- __mpol_free(pol); /* finished with pol */
- else
- *mpol = pol; /* unref needed after allocation */
+ if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
+ zl = node_zonelist(interleave_nid(*mpol, vma, addr,
+ HPAGE_SHIFT), gfp_flags);
+ } else {
+ zl = policy_zonelist(gfp_flags, *mpol);
+ if ((*mpol)->mode == MPOL_BIND)
+ *nodemask = &(*mpol)->v.nodes;
}
return zl;
}
@@ -1321,9 +1495,9 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
struct zonelist *zl;
struct page *page;
- zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
+ zl = node_zonelist(nid, gfp);
page = __alloc_pages(gfp, order, zl);
- if (page && page_zone(page) == zl->zones[0])
+ if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
return page;
}
@@ -1358,28 +1532,27 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
cpuset_update_task_memory_state();
- if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
+ if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
unsigned nid;
nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
- if (unlikely(pol != &default_policy &&
- pol != current->mempolicy))
- __mpol_free(pol); /* finished with pol */
+ mpol_cond_put(pol);
return alloc_page_interleave(gfp, 0, nid);
}
- zl = zonelist_policy(gfp, pol);
- if (pol != &default_policy && pol != current->mempolicy) {
+ zl = policy_zonelist(gfp, pol);
+ if (unlikely(mpol_needs_cond_ref(pol))) {
/*
- * slow path: ref counted policy -- shared or vma
+ * slow path: ref counted shared policy
*/
- struct page *page = __alloc_pages(gfp, 0, zl);
- __mpol_free(pol);
+ struct page *page = __alloc_pages_nodemask(gfp, 0,
+ zl, policy_nodemask(gfp, pol));
+ __mpol_put(pol);
return page;
}
/*
* fast path: default or task policy
*/
- return __alloc_pages(gfp, 0, zl);
+ return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
}
/**
@@ -1409,22 +1582,28 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
cpuset_update_task_memory_state();
if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
pol = &default_policy;
- if (pol->policy == MPOL_INTERLEAVE)
+
+ /*
+ * No reference counting needed for current->mempolicy
+ * nor system default_policy
+ */
+ if (pol->mode == MPOL_INTERLEAVE)
return alloc_page_interleave(gfp, order, interleave_nodes(pol));
- return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
+ return __alloc_pages_nodemask(gfp, order,
+ policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
}
EXPORT_SYMBOL(alloc_pages_current);
/*
- * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
+ * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
* rebinds the mempolicy its copying by calling mpol_rebind_policy()
* with the mems_allowed returned by cpuset_mems_allowed(). This
* keeps mempolicies cpuset relative after its cpuset moves. See
* further kernel/cpuset.c update_nodemask().
*/
-/* Slow path of a mempolicy copy */
-struct mempolicy *__mpol_copy(struct mempolicy *old)
+/* Slow path of a mempolicy duplicate */
+struct mempolicy *__mpol_dup(struct mempolicy *old)
{
struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
@@ -1436,55 +1615,64 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
}
*new = *old;
atomic_set(&new->refcnt, 1);
- if (new->policy == MPOL_BIND) {
- int sz = ksize(old->v.zonelist);
- new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL);
- if (!new->v.zonelist) {
- kmem_cache_free(policy_cache, new);
- return ERR_PTR(-ENOMEM);
- }
- }
return new;
}
+/*
+ * If *frompol needs [has] an extra ref, copy *frompol to *tompol ,
+ * eliminate the * MPOL_F_* flags that require conditional ref and
+ * [NOTE!!!] drop the extra ref. Not safe to reference *frompol directly
+ * after return. Use the returned value.
+ *
+ * Allows use of a mempolicy for, e.g., multiple allocations with a single
+ * policy lookup, even if the policy needs/has extra ref on lookup.
+ * shmem_readahead needs this.
+ */
+struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
+ struct mempolicy *frompol)
+{
+ if (!mpol_needs_cond_ref(frompol))
+ return frompol;
+
+ *tompol = *frompol;
+ tompol->flags &= ~MPOL_F_SHARED; /* copy doesn't need unref */
+ __mpol_put(frompol);
+ return tompol;
+}
+
+static int mpol_match_intent(const struct mempolicy *a,
+ const struct mempolicy *b)
+{
+ if (a->flags != b->flags)
+ return 0;
+ if (!mpol_store_user_nodemask(a))
+ return 1;
+ return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
+}
+
/* Slow path of a mempolicy comparison */
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
if (!a || !b)
return 0;
- if (a->policy != b->policy)
+ if (a->mode != b->mode)
return 0;
- switch (a->policy) {
- case MPOL_DEFAULT:
- return 1;
+ if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b))
+ return 0;
+ switch (a->mode) {
+ case MPOL_BIND:
+ /* Fall through */
case MPOL_INTERLEAVE:
return nodes_equal(a->v.nodes, b->v.nodes);
case MPOL_PREFERRED:
- return a->v.preferred_node == b->v.preferred_node;
- case MPOL_BIND: {
- int i;
- for (i = 0; a->v.zonelist->zones[i]; i++)
- if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
- return 0;
- return b->v.zonelist->zones[i] == NULL;
- }
+ return a->v.preferred_node == b->v.preferred_node &&
+ a->flags == b->flags;
default:
BUG();
return 0;
}
}
-/* Slow path of a mpol destructor. */
-void __mpol_free(struct mempolicy *p)
-{
- if (!atomic_dec_and_test(&p->refcnt))
- return;
- if (p->policy == MPOL_BIND)
- kfree(p->v.zonelist);
- p->policy = MPOL_DEFAULT;
- kmem_cache_free(policy_cache, p);
-}
-
/*
* Shared memory backing store policy support.
*
@@ -1547,7 +1735,7 @@ static void sp_insert(struct shared_policy *sp, struct sp_node *new)
rb_link_node(&new->nd, parent, p);
rb_insert_color(&new->nd, &sp->root);
pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
- new->policy ? new->policy->policy : 0);
+ new->policy ? new->policy->mode : 0);
}
/* Find shared policy intersecting idx */
@@ -1573,7 +1761,7 @@ static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
pr_debug("deleting %lx-l%lx\n", n->start, n->end);
rb_erase(&n->nd, &sp->root);
- mpol_free(n->policy);
+ mpol_put(n->policy);
kmem_cache_free(sn_cache, n);
}
@@ -1587,6 +1775,7 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
n->start = start;
n->end = end;
mpol_get(pol);
+ pol->flags |= MPOL_F_SHARED; /* for unref */
n->policy = pol;
return n;
}
@@ -1633,33 +1822,41 @@ restart:
sp_insert(sp, new);
spin_unlock(&sp->lock);
if (new2) {
- mpol_free(new2->policy);
+ mpol_put(new2->policy);
kmem_cache_free(sn_cache, new2);
}
return 0;
}
-void mpol_shared_policy_init(struct shared_policy *info, int policy,
- nodemask_t *policy_nodes)
-{
- info->root = RB_ROOT;
- spin_lock_init(&info->lock);
-
- if (policy != MPOL_DEFAULT) {
- struct mempolicy *newpol;
-
- /* Falls back to MPOL_DEFAULT on any error */
- newpol = mpol_new(policy, policy_nodes);
- if (!IS_ERR(newpol)) {
- /* Create pseudo-vma that contains just the policy */
- struct vm_area_struct pvma;
-
- memset(&pvma, 0, sizeof(struct vm_area_struct));
- /* Policy covers entire file */
- pvma.vm_end = TASK_SIZE;
- mpol_set_shared_policy(info, &pvma, newpol);
- mpol_free(newpol);
- }
+/**
+ * mpol_shared_policy_init - initialize shared policy for inode
+ * @sp: pointer to inode shared policy
+ * @mpol: struct mempolicy to install
+ *
+ * Install non-NULL @mpol in inode's shared policy rb-tree.
+ * On entry, the current task has a reference on a non-NULL @mpol.
+ * This must be released on exit.
+ */
+void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
+{
+ sp->root = RB_ROOT; /* empty tree == default mempolicy */
+ spin_lock_init(&sp->lock);
+
+ if (mpol) {
+ struct vm_area_struct pvma;
+ struct mempolicy *new;
+
+ /* contextualize the tmpfs mount point mempolicy */
+ new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
+ mpol_put(mpol); /* drop our ref on sb mpol */
+ if (IS_ERR(new))
+ return; /* no valid nodemask intersection */
+
+ /* Create pseudo-vma that contains just the policy */
+ memset(&pvma, 0, sizeof(struct vm_area_struct));
+ pvma.vm_end = TASK_SIZE; /* policy covers entire file */
+ mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
+ mpol_put(new); /* drop initial ref */
}
}
@@ -1670,9 +1867,10 @@ int mpol_set_shared_policy(struct shared_policy *info,
struct sp_node *new = NULL;
unsigned long sz = vma_pages(vma);
- pr_debug("set_shared_policy %lx sz %lu %d %lx\n",
+ pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
vma->vm_pgoff,
- sz, npol? npol->policy : -1,
+ sz, npol ? npol->mode : -1,
+ npol ? npol->flags : -1,
npol ? nodes_addr(npol->v.nodes)[0] : -1);
if (npol) {
@@ -1700,7 +1898,7 @@ void mpol_free_shared_policy(struct shared_policy *p)
n = rb_entry(next, struct sp_node, nd);
next = rb_next(&n->nd);
rb_erase(&n->nd, &p->root);
- mpol_free(n->policy);
+ mpol_put(n->policy);
kmem_cache_free(sn_cache, n);
}
spin_unlock(&p->lock);
@@ -1745,120 +1943,177 @@ void __init numa_policy_init(void)
if (unlikely(nodes_empty(interleave_nodes)))
node_set(prefer, interleave_nodes);
- if (do_set_mempolicy(MPOL_INTERLEAVE, &interleave_nodes))
+ if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
printk("numa_policy_init: interleaving failed\n");
}
/* Reset policy of current process to default */
void numa_default_policy(void)
{
- do_set_mempolicy(MPOL_DEFAULT, NULL);
+ do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
}
-/* Migrate a policy to a different set of nodes */
-static void mpol_rebind_policy(struct mempolicy *pol,
- const nodemask_t *newmask)
-{
- nodemask_t *mpolmask;
- nodemask_t tmp;
+/*
+ * Parse and format mempolicy from/to strings
+ */
- if (!pol)
- return;
- mpolmask = &pol->cpuset_mems_allowed;
- if (nodes_equal(*mpolmask, *newmask))
- return;
+/*
+ * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag
+ * Used only for mpol_parse_str() and mpol_to_str()
+ */
+#define MPOL_LOCAL (MPOL_INTERLEAVE + 1)
+static const char * const policy_types[] =
+ { "default", "prefer", "bind", "interleave", "local" };
- switch (pol->policy) {
- case MPOL_DEFAULT:
- break;
- case MPOL_INTERLEAVE:
- nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
- pol->v.nodes = tmp;
- *mpolmask = *newmask;
- current->il_next = node_remap(current->il_next,
- *mpolmask, *newmask);
- break;
- case MPOL_PREFERRED:
- pol->v.preferred_node = node_remap(pol->v.preferred_node,
- *mpolmask, *newmask);
- *mpolmask = *newmask;
- break;
- case MPOL_BIND: {
- nodemask_t nodes;
- struct zone **z;
- struct zonelist *zonelist;
+#ifdef CONFIG_TMPFS
+/**
+ * mpol_parse_str - parse string to mempolicy
+ * @str: string containing mempolicy to parse
+ * @mpol: pointer to struct mempolicy pointer, returned on success.
+ * @no_context: flag whether to "contextualize" the mempolicy
+ *
+ * Format of input:
+ * <mode>[=<flags>][:<nodelist>]
+ *
+ * if @no_context is true, save the input nodemask in w.user_nodemask in
+ * the returned mempolicy. This will be used to "clone" the mempolicy in
+ * a specific context [cpuset] at a later time. Used to parse tmpfs mpol
+ * mount option. Note that if 'static' or 'relative' mode flags were
+ * specified, the input nodemask will already have been saved. Saving
+ * it again is redundant, but safe.
+ *
+ * On success, returns 0, else 1
+ */
+int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
+{
+ struct mempolicy *new = NULL;
+ unsigned short uninitialized_var(mode);
+ unsigned short uninitialized_var(mode_flags);
+ nodemask_t nodes;
+ char *nodelist = strchr(str, ':');
+ char *flags = strchr(str, '=');
+ int i;
+ int err = 1;
+
+ if (nodelist) {
+ /* NUL-terminate mode or flags string */
+ *nodelist++ = '\0';
+ if (nodelist_parse(nodelist, nodes))
+ goto out;
+ if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
+ goto out;
+ } else
nodes_clear(nodes);
- for (z = pol->v.zonelist->zones; *z; z++)
- node_set(zone_to_nid(*z), nodes);
- nodes_remap(tmp, nodes, *mpolmask, *newmask);
- nodes = tmp;
- zonelist = bind_zonelist(&nodes);
+ if (flags)
+ *flags++ = '\0'; /* terminate mode string */
- /* If no mem, then zonelist is NULL and we keep old zonelist.
- * If that old zonelist has no remaining mems_allowed nodes,
- * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
- */
+ for (i = 0; i <= MPOL_LOCAL; i++) {
+ if (!strcmp(str, policy_types[i])) {
+ mode = i;
+ break;
+ }
+ }
+ if (i > MPOL_LOCAL)
+ goto out;
- if (!IS_ERR(zonelist)) {
- /* Good - got mem - substitute new zonelist */
- kfree(pol->v.zonelist);
- pol->v.zonelist = zonelist;
+ switch (mode) {
+ case MPOL_PREFERRED:
+ /*
+ * Insist on a nodelist of one node only
+ */
+ if (nodelist) {
+ char *rest = nodelist;
+ while (isdigit(*rest))
+ rest++;
+ if (!*rest)
+ err = 0;
}
- *mpolmask = *newmask;
break;
- }
- default:
- BUG();
+ case MPOL_INTERLEAVE:
+ /*
+ * Default to online nodes with memory if no nodelist
+ */
+ if (!nodelist)
+ nodes = node_states[N_HIGH_MEMORY];
+ err = 0;
+ break;
+ case MPOL_LOCAL:
+ /*
+ * Don't allow a nodelist; mpol_new() checks flags
+ */
+ if (nodelist)
+ goto out;
+ mode = MPOL_PREFERRED;
break;
- }
-}
-
-/*
- * Wrapper for mpol_rebind_policy() that just requires task
- * pointer, and updates task mempolicy.
- */
-void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
-{
- mpol_rebind_policy(tsk->mempolicy, new);
-}
+ /*
+ * case MPOL_BIND: mpol_new() enforces non-empty nodemask.
+ * case MPOL_DEFAULT: mpol_new() enforces empty nodemask, ignores flags.
+ */
+ }
-/*
- * Rebind each vma in mm to new nodemask.
- *
- * Call holding a reference to mm. Takes mm->mmap_sem during call.
- */
+ mode_flags = 0;
+ if (flags) {
+ /*
+ * Currently, we only support two mutually exclusive
+ * mode flags.
+ */
+ if (!strcmp(flags, "static"))
+ mode_flags |= MPOL_F_STATIC_NODES;
+ else if (!strcmp(flags, "relative"))
+ mode_flags |= MPOL_F_RELATIVE_NODES;
+ else
+ err = 1;
+ }
-void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
-{
- struct vm_area_struct *vma;
+ new = mpol_new(mode, mode_flags, &nodes);
+ if (IS_ERR(new))
+ err = 1;
+ else if (no_context)
+ new->w.user_nodemask = nodes; /* save for contextualization */
- down_write(&mm->mmap_sem);
- for (vma = mm->mmap; vma; vma = vma->vm_next)
- mpol_rebind_policy(vma->vm_policy, new);
- up_write(&mm->mmap_sem);
+out:
+ /* Restore string for error message */
+ if (nodelist)
+ *--nodelist = ':';
+ if (flags)
+ *--flags = '=';
+ if (!err)
+ *mpol = new;
+ return err;
}
+#endif /* CONFIG_TMPFS */
-/*
- * Display pages allocated per node and memory policy via /proc.
- */
-
-static const char * const policy_types[] =
- { "default", "prefer", "bind", "interleave" };
-
-/*
+/**
+ * mpol_to_str - format a mempolicy structure for printing
+ * @buffer: to contain formatted mempolicy string
+ * @maxlen: length of @buffer
+ * @pol: pointer to mempolicy to be formatted
+ * @no_context: "context free" mempolicy - use nodemask in w.user_nodemask
+ *
* Convert a mempolicy into a string.
* Returns the number of characters in buffer (if positive)
* or an error (negative)
*/
-static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
+int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
{
char *p = buffer;
int l;
nodemask_t nodes;
- int mode = pol ? pol->policy : MPOL_DEFAULT;
+ unsigned short mode;
+ unsigned short flags = pol ? pol->flags : 0;
+
+ /*
+ * Sanity check: room for longest mode, flag and some nodes
+ */
+ VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
+
+ if (!pol || pol == &default_policy)
+ mode = MPOL_DEFAULT;
+ else
+ mode = pol->mode;
switch (mode) {
case MPOL_DEFAULT:
@@ -1867,33 +2122,50 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
case MPOL_PREFERRED:
nodes_clear(nodes);
- node_set(pol->v.preferred_node, nodes);
+ if (flags & MPOL_F_LOCAL)
+ mode = MPOL_LOCAL; /* pseudo-policy */
+ else
+ node_set(pol->v.preferred_node, nodes);
break;
case MPOL_BIND:
- get_zonemask(pol, &nodes);
- break;
-
+ /* Fall through */
case MPOL_INTERLEAVE:
- nodes = pol->v.nodes;
+ if (no_context)
+ nodes = pol->w.user_nodemask;
+ else
+ nodes = pol->v.nodes;
break;
default:
BUG();
- return -EFAULT;
}
l = strlen(policy_types[mode]);
- if (buffer + maxlen < p + l + 1)
- return -ENOSPC;
+ if (buffer + maxlen < p + l + 1)
+ return -ENOSPC;
strcpy(p, policy_types[mode]);
p += l;
- if (!nodes_empty(nodes)) {
+ if (flags & MPOL_MODE_FLAGS) {
if (buffer + maxlen < p + 2)
return -ENOSPC;
*p++ = '=';
+
+ /*
+ * Currently, the only defined flags are mutually exclusive
+ */
+ if (flags & MPOL_F_STATIC_NODES)
+ p += snprintf(p, buffer + maxlen - p, "static");
+ else if (flags & MPOL_F_RELATIVE_NODES)
+ p += snprintf(p, buffer + maxlen - p, "relative");
+ }
+
+ if (!nodes_empty(nodes)) {
+ if (buffer + maxlen < p + 2)
+ return -ENOSPC;
+ *p++ = ':';
p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
}
return p - buffer;
@@ -1971,6 +2243,9 @@ static inline void check_huge_range(struct vm_area_struct *vma,
}
#endif
+/*
+ * Display pages allocated per node and memory policy via /proc.
+ */
int show_numa_map(struct seq_file *m, void *v)
{
struct proc_maps_private *priv = m->private;
@@ -1990,12 +2265,8 @@ int show_numa_map(struct seq_file *m, void *v)
return 0;
pol = get_vma_policy(priv->task, vma, vma->vm_start);
- mpol_to_str(buffer, sizeof(buffer), pol);
- /*
- * unref shared or other task's mempolicy
- */
- if (pol != &default_policy && pol != current->mempolicy)
- __mpol_free(pol);
+ mpol_to_str(buffer, sizeof(buffer), pol, 0);
+ mpol_cond_put(pol);
seq_printf(m, "%08lx %s", vma->vm_start, buffer);
diff --git a/mm/migrate.c b/mm/migrate.c
index 4e0eccca5e2..449d77d409f 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -383,7 +383,14 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
if (PageDirty(page)) {
clear_page_dirty_for_io(page);
- set_page_dirty(newpage);
+ /*
+ * Want to mark the page and the radix tree as dirty, and
+ * redo the accounting that clear_page_dirty_for_io undid,
+ * but we can't use set_page_dirty because that function
+ * is actually a signal that all of the page has become dirty.
+ * Wheras only part of our page may be dirty.
+ */
+ __set_page_dirty_nobuffers(newpage);
}
#ifdef CONFIG_SWAP
diff --git a/mm/mincore.c b/mm/mincore.c
index 5efe0ded69b..5178800bc12 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -33,7 +33,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
* When tmpfs swaps out a page from a file, any process mapping that
* file will not get a swp_entry_t in its pte, but rather it is like
* any other file mapping (ie. marked !present and faulted in with
- * tmpfs's .nopage). So swapped out tmpfs mappings are tested here.
+ * tmpfs's .fault). So swapped out tmpfs mappings are tested here.
*
* However when tmpfs moves the page from pagecache and into swapcache,
* it is still in core, but the find_get_page below won't find it.
diff --git a/mm/mmap.c b/mm/mmap.c
index a32d28ce31c..fac66337da2 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -230,9 +230,12 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
might_sleep();
if (vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
- if (vma->vm_file)
+ if (vma->vm_file) {
fput(vma->vm_file);
- mpol_free(vma_policy(vma));
+ if (vma->vm_flags & VM_EXECUTABLE)
+ removed_exe_file_vma(vma->vm_mm);
+ }
+ mpol_put(vma_policy(vma));
kmem_cache_free(vm_area_cachep, vma);
return next;
}
@@ -623,10 +626,13 @@ again: remove_next = 1 + (end > next->vm_end);
spin_unlock(&mapping->i_mmap_lock);
if (remove_next) {
- if (file)
+ if (file) {
fput(file);
+ if (next->vm_flags & VM_EXECUTABLE)
+ removed_exe_file_vma(mm);
+ }
mm->map_count--;
- mpol_free(vma_policy(next));
+ mpol_put(vma_policy(next));
kmem_cache_free(vm_area_cachep, next);
/*
* In mprotect's case 6 (see comments on vma_merge),
@@ -1068,7 +1074,6 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
mapping_cap_account_dirty(vma->vm_file->f_mapping);
}
-
unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long len, unsigned long flags,
unsigned int vm_flags, unsigned long pgoff,
@@ -1155,6 +1160,8 @@ munmap_back:
error = file->f_op->mmap(file, vma);
if (error)
goto unmap_and_free_vma;
+ if (vm_flags & VM_EXECUTABLE)
+ added_exe_file_vma(mm);
} else if (vm_flags & VM_SHARED) {
error = shmem_zero_setup(vma);
if (error)
@@ -1181,22 +1188,22 @@ munmap_back:
if (vma_wants_writenotify(vma))
vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED);
- if (!file || !vma_merge(mm, prev, addr, vma->vm_end,
+ if (file && vma_merge(mm, prev, addr, vma->vm_end,
vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) {
- file = vma->vm_file;
- vma_link(mm, vma, prev, rb_link, rb_parent);
- if (correct_wcount)
- atomic_inc(&inode->i_writecount);
- } else {
- if (file) {
- if (correct_wcount)
- atomic_inc(&inode->i_writecount);
- fput(file);
- }
- mpol_free(vma_policy(vma));
+ mpol_put(vma_policy(vma));
kmem_cache_free(vm_area_cachep, vma);
+ fput(file);
+ if (vm_flags & VM_EXECUTABLE)
+ removed_exe_file_vma(mm);
+ } else {
+ vma_link(mm, vma, prev, rb_link, rb_parent);
+ file = vma->vm_file;
}
-out:
+
+ /* Once vma denies write, undo our temporary denial count */
+ if (correct_wcount)
+ atomic_inc(&inode->i_writecount);
+out:
mm->total_vm += len >> PAGE_SHIFT;
vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
if (vm_flags & VM_LOCKED) {
@@ -1813,15 +1820,18 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
}
- pol = mpol_copy(vma_policy(vma));
+ pol = mpol_dup(vma_policy(vma));
if (IS_ERR(pol)) {
kmem_cache_free(vm_area_cachep, new);
return PTR_ERR(pol);
}
vma_set_policy(new, pol);
- if (new->vm_file)
+ if (new->vm_file) {
get_file(new->vm_file);
+ if (vma->vm_flags & VM_EXECUTABLE)
+ added_exe_file_vma(mm);
+ }
if (new->vm_ops && new->vm_ops->open)
new->vm_ops->open(new);
@@ -2129,7 +2139,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
if (new_vma) {
*new_vma = *vma;
- pol = mpol_copy(vma_policy(vma));
+ pol = mpol_dup(vma_policy(vma));
if (IS_ERR(pol)) {
kmem_cache_free(vm_area_cachep, new_vma);
return NULL;
@@ -2138,8 +2148,11 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
new_vma->vm_start = addr;
new_vma->vm_end = addr + len;
new_vma->vm_pgoff = pgoff;
- if (new_vma->vm_file)
+ if (new_vma->vm_file) {
get_file(new_vma->vm_file);
+ if (vma->vm_flags & VM_EXECUTABLE)
+ added_exe_file_vma(mm);
+ }
if (new_vma->vm_ops && new_vma->vm_ops->open)
new_vma->vm_ops->open(new_vma);
vma_link(mm, new_vma, prev, rb_link, rb_parent);
diff --git a/mm/mmzone.c b/mm/mmzone.c
index eb5838634f1..486ed595ee6 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -42,3 +42,33 @@ struct zone *next_zone(struct zone *zone)
return zone;
}
+static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes)
+{
+#ifdef CONFIG_NUMA
+ return node_isset(zonelist_node_idx(zref), *nodes);
+#else
+ return 1;
+#endif /* CONFIG_NUMA */
+}
+
+/* Returns the next zone at or below highest_zoneidx in a zonelist */
+struct zoneref *next_zones_zonelist(struct zoneref *z,
+ enum zone_type highest_zoneidx,
+ nodemask_t *nodes,
+ struct zone **zone)
+{
+ /*
+ * Find the next suitable zone to use for the allocation.
+ * Only filter based on nodemask if it's set
+ */
+ if (likely(nodes == NULL))
+ while (zonelist_zone_idx(z) > highest_zoneidx)
+ z++;
+ else
+ while (zonelist_zone_idx(z) > highest_zoneidx ||
+ (z->zone && !zref_in_nodemask(z, nodes)))
+ z++;
+
+ *zone = zonelist_zone(z++);
+ return z;
+}
diff --git a/mm/nommu.c b/mm/nommu.c
index 5d8ae086f74..ef8c62cec69 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -105,7 +105,11 @@ unsigned int kobjsize(const void *objp)
{
struct page *page;
- if (!objp || !((page = virt_to_page(objp))))
+ /*
+ * If the object we have should not have ksize performed on it,
+ * return size of 0
+ */
+ if (!objp || (unsigned long)objp >= memory_end || !((page = virt_to_page(objp))))
return 0;
if (PageSlab(page))
@@ -962,8 +966,13 @@ unsigned long do_mmap_pgoff(struct file *file,
INIT_LIST_HEAD(&vma->anon_vma_node);
atomic_set(&vma->vm_usage, 1);
- if (file)
+ if (file) {
get_file(file);
+ if (vm_flags & VM_EXECUTABLE) {
+ added_exe_file_vma(current->mm);
+ vma->vm_mm = current->mm;
+ }
+ }
vma->vm_file = file;
vma->vm_flags = vm_flags;
vma->vm_start = addr;
@@ -1018,8 +1027,11 @@ unsigned long do_mmap_pgoff(struct file *file,
up_write(&nommu_vma_sem);
kfree(vml);
if (vma) {
- if (vma->vm_file)
+ if (vma->vm_file) {
fput(vma->vm_file);
+ if (vma->vm_flags & VM_EXECUTABLE)
+ removed_exe_file_vma(vma->vm_mm);
+ }
kfree(vma);
}
return ret;
@@ -1049,7 +1061,7 @@ EXPORT_SYMBOL(do_mmap_pgoff);
/*
* handle mapping disposal for uClinux
*/
-static void put_vma(struct vm_area_struct *vma)
+static void put_vma(struct mm_struct *mm, struct vm_area_struct *vma)
{
if (vma) {
down_write(&nommu_vma_sem);
@@ -1071,8 +1083,11 @@ static void put_vma(struct vm_area_struct *vma)
realalloc -= kobjsize(vma);
askedalloc -= sizeof(*vma);
- if (vma->vm_file)
+ if (vma->vm_file) {
fput(vma->vm_file);
+ if (vma->vm_flags & VM_EXECUTABLE)
+ removed_exe_file_vma(mm);
+ }
kfree(vma);
}
@@ -1109,7 +1124,7 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
found:
vml = *parent;
- put_vma(vml->vma);
+ put_vma(mm, vml->vma);
*parent = vml->next;
realalloc -= kobjsize(vml);
@@ -1154,7 +1169,7 @@ void exit_mmap(struct mm_struct * mm)
while ((tmp = mm->context.vmlist)) {
mm->context.vmlist = tmp->next;
- put_vma(tmp->vma);
+ put_vma(mm, tmp->vma);
realalloc -= kobjsize(tmp);
askedalloc -= sizeof(*tmp);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index beb592fe938..8a5467ee626 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -53,8 +53,7 @@ static DEFINE_SPINLOCK(zone_scan_mutex);
* of least surprise ... (be careful when you change it)
*/
-unsigned long badness(struct task_struct *p, unsigned long uptime,
- struct mem_cgroup *mem)
+unsigned long badness(struct task_struct *p, unsigned long uptime)
{
unsigned long points, cpu_time, run_time, s;
struct mm_struct *mm;
@@ -175,12 +174,14 @@ static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist,
gfp_t gfp_mask)
{
#ifdef CONFIG_NUMA
- struct zone **z;
+ struct zone *zone;
+ struct zoneref *z;
+ enum zone_type high_zoneidx = gfp_zone(gfp_mask);
nodemask_t nodes = node_states[N_HIGH_MEMORY];
- for (z = zonelist->zones; *z; z++)
- if (cpuset_zone_allowed_softwall(*z, gfp_mask))
- node_clear(zone_to_nid(*z), nodes);
+ for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
+ if (cpuset_zone_allowed_softwall(zone, gfp_mask))
+ node_clear(zone_to_nid(zone), nodes);
else
return CONSTRAINT_CPUSET;
@@ -254,7 +255,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
if (p->oomkilladj == OOM_DISABLE)
continue;
- points = badness(p, uptime.tv_sec, mem);
+ points = badness(p, uptime.tv_sec);
if (points > *ppoints || !chosen) {
chosen = p;
*ppoints = points;
@@ -460,29 +461,29 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier);
* if a parallel OOM killing is already taking place that includes a zone in
* the zonelist. Otherwise, locks all zones in the zonelist and returns 1.
*/
-int try_set_zone_oom(struct zonelist *zonelist)
+int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask)
{
- struct zone **z;
+ struct zoneref *z;
+ struct zone *zone;
int ret = 1;
- z = zonelist->zones;
-
spin_lock(&zone_scan_mutex);
- do {
- if (zone_is_oom_locked(*z)) {
+ for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
+ if (zone_is_oom_locked(zone)) {
ret = 0;
goto out;
}
- } while (*(++z) != NULL);
+ }
+
+ for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
+ /*
+ * Lock each zone in the zonelist under zone_scan_mutex so a
+ * parallel invocation of try_set_zone_oom() doesn't succeed
+ * when it shouldn't.
+ */
+ zone_set_flag(zone, ZONE_OOM_LOCKED);
+ }
- /*
- * Lock each zone in the zonelist under zone_scan_mutex so a parallel
- * invocation of try_set_zone_oom() doesn't succeed when it shouldn't.
- */
- z = zonelist->zones;
- do {
- zone_set_flag(*z, ZONE_OOM_LOCKED);
- } while (*(++z) != NULL);
out:
spin_unlock(&zone_scan_mutex);
return ret;
@@ -493,16 +494,15 @@ out:
* allocation attempts with zonelists containing them may now recall the OOM
* killer, if necessary.
*/
-void clear_zonelist_oom(struct zonelist *zonelist)
+void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
{
- struct zone **z;
-
- z = zonelist->zones;
+ struct zoneref *z;
+ struct zone *zone;
spin_lock(&zone_scan_mutex);
- do {
- zone_clear_flag(*z, ZONE_OOM_LOCKED);
- } while (*(++z) != NULL);
+ for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
+ zone_clear_flag(zone, ZONE_OOM_LOCKED);
+ }
spin_unlock(&zone_scan_mutex);
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 5e00f1772c2..789b6adbef3 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -164,9 +164,20 @@ int dirty_ratio_handler(struct ctl_table *table, int write,
*/
static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
{
- __prop_inc_percpu(&vm_completions, &bdi->completions);
+ __prop_inc_percpu_max(&vm_completions, &bdi->completions,
+ bdi->max_prop_frac);
}
+void bdi_writeout_inc(struct backing_dev_info *bdi)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __bdi_writeout_inc(bdi);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(bdi_writeout_inc);
+
static inline void task_dirty_inc(struct task_struct *tsk)
{
prop_inc_single(&vm_dirties, &tsk->dirties);
@@ -200,7 +211,8 @@ clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty)
avail_dirty = dirty -
(global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_WRITEBACK) +
- global_page_state(NR_UNSTABLE_NFS));
+ global_page_state(NR_UNSTABLE_NFS) +
+ global_page_state(NR_WRITEBACK_TEMP));
if (avail_dirty < 0)
avail_dirty = 0;
@@ -243,6 +255,55 @@ static void task_dirty_limit(struct task_struct *tsk, long *pdirty)
}
/*
+ *
+ */
+static DEFINE_SPINLOCK(bdi_lock);
+static unsigned int bdi_min_ratio;
+
+int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
+{
+ int ret = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&bdi_lock, flags);
+ if (min_ratio > bdi->max_ratio) {
+ ret = -EINVAL;
+ } else {
+ min_ratio -= bdi->min_ratio;
+ if (bdi_min_ratio + min_ratio < 100) {
+ bdi_min_ratio += min_ratio;
+ bdi->min_ratio += min_ratio;
+ } else {
+ ret = -EINVAL;
+ }
+ }
+ spin_unlock_irqrestore(&bdi_lock, flags);
+
+ return ret;
+}
+
+int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ if (max_ratio > 100)
+ return -EINVAL;
+
+ spin_lock_irqsave(&bdi_lock, flags);
+ if (bdi->min_ratio > max_ratio) {
+ ret = -EINVAL;
+ } else {
+ bdi->max_ratio = max_ratio;
+ bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
+ }
+ spin_unlock_irqrestore(&bdi_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(bdi_set_max_ratio);
+
+/*
* Work out the current dirty-memory clamping and background writeout
* thresholds.
*
@@ -300,7 +361,7 @@ static unsigned long determine_dirtyable_memory(void)
return x + 1; /* Ensure that we never return 0 */
}
-static void
+void
get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
struct backing_dev_info *bdi)
{
@@ -330,7 +391,7 @@ get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
*pdirty = dirty;
if (bdi) {
- u64 bdi_dirty = dirty;
+ u64 bdi_dirty;
long numerator, denominator;
/*
@@ -338,8 +399,12 @@ get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
*/
bdi_writeout_fraction(bdi, &numerator, &denominator);
+ bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
bdi_dirty *= numerator;
do_div(bdi_dirty, denominator);
+ bdi_dirty += (dirty * bdi->min_ratio) / 100;
+ if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
+ bdi_dirty = dirty * bdi->max_ratio / 100;
*pbdi_dirty = bdi_dirty;
clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty);
@@ -1192,7 +1257,7 @@ int test_clear_page_writeback(struct page *page)
radix_tree_tag_clear(&mapping->page_tree,
page_index(page),
PAGECACHE_TAG_WRITEBACK);
- if (bdi_cap_writeback_dirty(bdi)) {
+ if (bdi_cap_account_writeback(bdi)) {
__dec_bdi_stat(bdi, BDI_WRITEBACK);
__bdi_writeout_inc(bdi);
}
@@ -1221,7 +1286,7 @@ int test_set_page_writeback(struct page *page)
radix_tree_tag_set(&mapping->page_tree,
page_index(page),
PAGECACHE_TAG_WRITEBACK);
- if (bdi_cap_writeback_dirty(bdi))
+ if (bdi_cap_account_writeback(bdi))
__inc_bdi_stat(bdi, BDI_WRITEBACK);
}
if (!PageDirty(page))
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 402a504f122..bdd5c432c42 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -45,6 +45,7 @@
#include <linux/fault-inject.h>
#include <linux/page-isolation.h>
#include <linux/memcontrol.h>
+#include <linux/debugobjects.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -532,8 +533,11 @@ static void __free_pages_ok(struct page *page, unsigned int order)
if (reserved)
return;
- if (!PageHighMem(page))
+ if (!PageHighMem(page)) {
debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
+ debug_check_no_obj_freed(page_address(page),
+ PAGE_SIZE << order);
+ }
arch_free_page(page, order);
kernel_map_pages(page, 1 << order, 0);
@@ -546,7 +550,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
/*
* permit the bootmem allocator to evade page validation on high-order frees
*/
-void __init __free_pages_bootmem(struct page *page, unsigned int order)
+void __free_pages_bootmem(struct page *page, unsigned int order)
{
if (order == 0) {
__ClearPageReserved(page);
@@ -632,7 +636,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
if (PageReserved(page))
return 1;
- page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_readahead |
+ page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim |
1 << PG_referenced | 1 << PG_arch_1 |
1 << PG_owner_priv_1 | 1 << PG_mappedtodisk);
set_page_private(page, 0);
@@ -995,8 +999,10 @@ static void free_hot_cold_page(struct page *page, int cold)
if (free_pages_check(page))
return;
- if (!PageHighMem(page))
+ if (!PageHighMem(page)) {
debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
+ debug_check_no_obj_freed(page_address(page), PAGE_SIZE);
+ }
arch_free_page(page, 0);
kernel_map_pages(page, 1, 0);
@@ -1050,7 +1056,7 @@ void split_page(struct page *page, unsigned int order)
* we cheat by calling it from here, in the order > 0 path. Saves a branch
* or two.
*/
-static struct page *buffered_rmqueue(struct zonelist *zonelist,
+static struct page *buffered_rmqueue(struct zone *preferred_zone,
struct zone *zone, int order, gfp_t gfp_flags)
{
unsigned long flags;
@@ -1102,7 +1108,7 @@ again:
}
__count_zone_vm_events(PGALLOC, zone, 1 << order);
- zone_statistics(zonelist, zone);
+ zone_statistics(preferred_zone, zone);
local_irq_restore(flags);
put_cpu();
@@ -1284,7 +1290,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
if (!zlc)
return NULL;
- if (time_after(jiffies, zlc->last_full_zap + HZ)) {
+ if (time_after(jiffies, zlc->last_full_zap + HZ)) {
bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
zlc->last_full_zap = jiffies;
}
@@ -1317,7 +1323,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
* We are low on memory in the second scan, and should leave no stone
* unturned looking for a free page.
*/
-static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
+static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
nodemask_t *allowednodes)
{
struct zonelist_cache *zlc; /* cached zonelist speedup info */
@@ -1328,7 +1334,7 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
if (!zlc)
return 1;
- i = z - zonelist->zones;
+ i = z - zonelist->_zonerefs;
n = zlc->z_to_n[i];
/* This zone is worth trying if it is allowed but not full */
@@ -1340,7 +1346,7 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
* zlc->fullzones, so that subsequent attempts to allocate a page
* from that zone don't waste time re-examining it.
*/
-static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
+static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
{
struct zonelist_cache *zlc; /* cached zonelist speedup info */
int i; /* index of *z in zonelist zones */
@@ -1349,7 +1355,7 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
if (!zlc)
return;
- i = z - zonelist->zones;
+ i = z - zonelist->_zonerefs;
set_bit(i, zlc->fullzones);
}
@@ -1361,13 +1367,13 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
return NULL;
}
-static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
+static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
nodemask_t *allowednodes)
{
return 1;
}
-static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
+static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
{
}
#endif /* CONFIG_NUMA */
@@ -1377,42 +1383,31 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
* a page.
*/
static struct page *
-get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist, int alloc_flags)
+get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
+ struct zonelist *zonelist, int high_zoneidx, int alloc_flags)
{
- struct zone **z;
+ struct zoneref *z;
struct page *page = NULL;
- int classzone_idx = zone_idx(zonelist->zones[0]);
- struct zone *zone;
+ int classzone_idx;
+ struct zone *zone, *preferred_zone;
nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
int zlc_active = 0; /* set if using zonelist_cache */
int did_zlc_setup = 0; /* just call zlc_setup() one time */
- enum zone_type highest_zoneidx = -1; /* Gets set for policy zonelists */
+
+ (void)first_zones_zonelist(zonelist, high_zoneidx, nodemask,
+ &preferred_zone);
+ classzone_idx = zone_idx(preferred_zone);
zonelist_scan:
/*
* Scan zonelist, looking for a zone with enough free.
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
- z = zonelist->zones;
-
- do {
- /*
- * In NUMA, this could be a policy zonelist which contains
- * zones that may not be allowed by the current gfp_mask.
- * Check the zone is allowed by the current flags
- */
- if (unlikely(alloc_should_filter_zonelist(zonelist))) {
- if (highest_zoneidx == -1)
- highest_zoneidx = gfp_zone(gfp_mask);
- if (zone_idx(*z) > highest_zoneidx)
- continue;
- }
-
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ high_zoneidx, nodemask) {
if (NUMA_BUILD && zlc_active &&
!zlc_zone_worth_trying(zonelist, z, allowednodes))
continue;
- zone = *z;
if ((alloc_flags & ALLOC_CPUSET) &&
!cpuset_zone_allowed_softwall(zone, gfp_mask))
goto try_next_zone;
@@ -1433,7 +1428,7 @@ zonelist_scan:
}
}
- page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
+ page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask);
if (page)
break;
this_zone_full:
@@ -1446,7 +1441,7 @@ try_next_zone:
zlc_active = 1;
did_zlc_setup = 1;
}
- } while (*(++z) != NULL);
+ }
if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
/* Disable zlc cache for second zonelist scan */
@@ -1459,18 +1454,21 @@ try_next_zone:
/*
* This is the 'heart' of the zoned buddy allocator.
*/
-struct page *
-__alloc_pages(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist)
+static struct page *
+__alloc_pages_internal(gfp_t gfp_mask, unsigned int order,
+ struct zonelist *zonelist, nodemask_t *nodemask)
{
const gfp_t wait = gfp_mask & __GFP_WAIT;
- struct zone **z;
+ enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+ struct zoneref *z;
+ struct zone *zone;
struct page *page;
struct reclaim_state reclaim_state;
struct task_struct *p = current;
int do_retry;
int alloc_flags;
- int did_some_progress;
+ unsigned long did_some_progress;
+ unsigned long pages_reclaimed = 0;
might_sleep_if(wait);
@@ -1478,9 +1476,9 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
return NULL;
restart:
- z = zonelist->zones; /* the list of zones suitable for gfp_mask */
+ z = zonelist->_zonerefs; /* the list of zones suitable for gfp_mask */
- if (unlikely(*z == NULL)) {
+ if (unlikely(!z->zone)) {
/*
* Happens if we have an empty zonelist as a result of
* GFP_THISNODE being used on a memoryless node
@@ -1488,8 +1486,8 @@ restart:
return NULL;
}
- page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
- zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET);
+ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
+ zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
if (page)
goto got_pg;
@@ -1504,8 +1502,8 @@ restart:
if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
goto nopage;
- for (z = zonelist->zones; *z; z++)
- wakeup_kswapd(*z, order);
+ for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
+ wakeup_kswapd(zone, order);
/*
* OK, we're below the kswapd watermark and have kicked background
@@ -1533,7 +1531,8 @@ restart:
* Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
- page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
+ page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
+ high_zoneidx, alloc_flags);
if (page)
goto got_pg;
@@ -1545,8 +1544,8 @@ rebalance:
if (!(gfp_mask & __GFP_NOMEMALLOC)) {
nofail_alloc:
/* go through the zonelist yet again, ignoring mins */
- page = get_page_from_freelist(gfp_mask, order,
- zonelist, ALLOC_NO_WATERMARKS);
+ page = get_page_from_freelist(gfp_mask, nodemask, order,
+ zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
if (page)
goto got_pg;
if (gfp_mask & __GFP_NOFAIL) {
@@ -1569,7 +1568,7 @@ nofail_alloc:
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
- did_some_progress = try_to_free_pages(zonelist->zones, order, gfp_mask);
+ did_some_progress = try_to_free_pages(zonelist, order, gfp_mask);
p->reclaim_state = NULL;
p->flags &= ~PF_MEMALLOC;
@@ -1580,12 +1579,12 @@ nofail_alloc:
drain_all_pages();
if (likely(did_some_progress)) {
- page = get_page_from_freelist(gfp_mask, order,
- zonelist, alloc_flags);
+ page = get_page_from_freelist(gfp_mask, nodemask, order,
+ zonelist, high_zoneidx, alloc_flags);
if (page)
goto got_pg;
} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
- if (!try_set_zone_oom(zonelist)) {
+ if (!try_set_zone_oom(zonelist, gfp_mask)) {
schedule_timeout_uninterruptible(1);
goto restart;
}
@@ -1596,21 +1595,22 @@ nofail_alloc:
* a parallel oom killing, we must fail if we're still
* under heavy pressure.
*/
- page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
- zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
+ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
+ order, zonelist, high_zoneidx,
+ ALLOC_WMARK_HIGH|ALLOC_CPUSET);
if (page) {
- clear_zonelist_oom(zonelist);
+ clear_zonelist_oom(zonelist, gfp_mask);
goto got_pg;
}
/* The OOM killer will not help higher order allocs so fail */
if (order > PAGE_ALLOC_COSTLY_ORDER) {
- clear_zonelist_oom(zonelist);
+ clear_zonelist_oom(zonelist, gfp_mask);
goto nopage;
}
out_of_memory(zonelist, gfp_mask, order);
- clear_zonelist_oom(zonelist);
+ clear_zonelist_oom(zonelist, gfp_mask);
goto restart;
}
@@ -1618,14 +1618,26 @@ nofail_alloc:
* Don't let big-order allocations loop unless the caller explicitly
* requests that. Wait for some write requests to complete then retry.
*
- * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
- * <= 3, but that may not be true in other implementations.
+ * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
+ * means __GFP_NOFAIL, but that may not be true in other
+ * implementations.
+ *
+ * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
+ * specified, then we retry until we no longer reclaim any pages
+ * (above), or we've reclaimed an order of pages at least as
+ * large as the allocation's order. In both cases, if the
+ * allocation still fails, we stop retrying.
*/
+ pages_reclaimed += did_some_progress;
do_retry = 0;
if (!(gfp_mask & __GFP_NORETRY)) {
- if ((order <= PAGE_ALLOC_COSTLY_ORDER) ||
- (gfp_mask & __GFP_REPEAT))
+ if (order <= PAGE_ALLOC_COSTLY_ORDER) {
do_retry = 1;
+ } else {
+ if (gfp_mask & __GFP_REPEAT &&
+ pages_reclaimed < (1 << order))
+ do_retry = 1;
+ }
if (gfp_mask & __GFP_NOFAIL)
do_retry = 1;
}
@@ -1646,6 +1658,20 @@ got_pg:
return page;
}
+struct page *
+__alloc_pages(gfp_t gfp_mask, unsigned int order,
+ struct zonelist *zonelist)
+{
+ return __alloc_pages_internal(gfp_mask, order, zonelist, NULL);
+}
+
+struct page *
+__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
+ struct zonelist *zonelist, nodemask_t *nodemask)
+{
+ return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask);
+}
+
EXPORT_SYMBOL(__alloc_pages);
/*
@@ -1712,15 +1738,15 @@ EXPORT_SYMBOL(free_pages);
static unsigned int nr_free_zone_pages(int offset)
{
+ struct zoneref *z;
+ struct zone *zone;
+
/* Just pick one node, since fallback list is circular */
- pg_data_t *pgdat = NODE_DATA(numa_node_id());
unsigned int sum = 0;
- struct zonelist *zonelist = pgdat->node_zonelists + offset;
- struct zone **zonep = zonelist->zones;
- struct zone *zone;
+ struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
- for (zone = *zonep++; zone; zone = *zonep++) {
+ for_each_zone_zonelist(zone, z, zonelist, offset) {
unsigned long size = zone->present_pages;
unsigned long high = zone->pages_high;
if (size > high)
@@ -1889,6 +1915,12 @@ void show_free_areas(void)
show_swap_cache_info();
}
+static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
+{
+ zoneref->zone = zone;
+ zoneref->zone_idx = zone_idx(zone);
+}
+
/*
* Builds allocation fallback zone lists.
*
@@ -1906,7 +1938,8 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
zone_type--;
zone = pgdat->node_zones + zone_type;
if (populated_zone(zone)) {
- zonelist->zones[nr_zones++] = zone;
+ zoneref_set_zone(zone,
+ &zonelist->_zonerefs[nr_zones++]);
check_highest_zone(zone_type);
}
@@ -2029,6 +2062,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
int n, val;
int min_val = INT_MAX;
int best_node = -1;
+ node_to_cpumask_ptr(tmp, 0);
/* Use the local node if we haven't already */
if (!node_isset(node, *used_node_mask)) {
@@ -2037,7 +2071,6 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
}
for_each_node_state(n, N_HIGH_MEMORY) {
- cpumask_t tmp;
/* Don't want a node to appear more than once */
if (node_isset(n, *used_node_mask))
@@ -2050,8 +2083,8 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
val += (n < node);
/* Give preference to headless and unused nodes */
- tmp = node_to_cpumask(n);
- if (!cpus_empty(tmp))
+ node_to_cpumask_ptr_next(tmp, n);
+ if (!cpus_empty(*tmp))
val += PENALTY_FOR_NODE_WITH_CPUS;
/* Slight preference for less loaded node */
@@ -2078,17 +2111,16 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
*/
static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
{
- enum zone_type i;
int j;
struct zonelist *zonelist;
- for (i = 0; i < MAX_NR_ZONES; i++) {
- zonelist = pgdat->node_zonelists + i;
- for (j = 0; zonelist->zones[j] != NULL; j++)
- ;
- j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
- zonelist->zones[j] = NULL;
- }
+ zonelist = &pgdat->node_zonelists[0];
+ for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
+ ;
+ j = build_zonelists_node(NODE_DATA(node), zonelist, j,
+ MAX_NR_ZONES - 1);
+ zonelist->_zonerefs[j].zone = NULL;
+ zonelist->_zonerefs[j].zone_idx = 0;
}
/*
@@ -2096,15 +2128,13 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
*/
static void build_thisnode_zonelists(pg_data_t *pgdat)
{
- enum zone_type i;
int j;
struct zonelist *zonelist;
- for (i = 0; i < MAX_NR_ZONES; i++) {
- zonelist = pgdat->node_zonelists + MAX_NR_ZONES + i;
- j = build_zonelists_node(pgdat, zonelist, 0, i);
- zonelist->zones[j] = NULL;
- }
+ zonelist = &pgdat->node_zonelists[1];
+ j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
+ zonelist->_zonerefs[j].zone = NULL;
+ zonelist->_zonerefs[j].zone_idx = 0;
}
/*
@@ -2117,27 +2147,26 @@ static int node_order[MAX_NUMNODES];
static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
{
- enum zone_type i;
int pos, j, node;
int zone_type; /* needs to be signed */
struct zone *z;
struct zonelist *zonelist;
- for (i = 0; i < MAX_NR_ZONES; i++) {
- zonelist = pgdat->node_zonelists + i;
- pos = 0;
- for (zone_type = i; zone_type >= 0; zone_type--) {
- for (j = 0; j < nr_nodes; j++) {
- node = node_order[j];
- z = &NODE_DATA(node)->node_zones[zone_type];
- if (populated_zone(z)) {
- zonelist->zones[pos++] = z;
- check_highest_zone(zone_type);
- }
+ zonelist = &pgdat->node_zonelists[0];
+ pos = 0;
+ for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
+ for (j = 0; j < nr_nodes; j++) {
+ node = node_order[j];
+ z = &NODE_DATA(node)->node_zones[zone_type];
+ if (populated_zone(z)) {
+ zoneref_set_zone(z,
+ &zonelist->_zonerefs[pos++]);
+ check_highest_zone(zone_type);
}
}
- zonelist->zones[pos] = NULL;
}
+ zonelist->_zonerefs[pos].zone = NULL;
+ zonelist->_zonerefs[pos].zone_idx = 0;
}
static int default_zonelist_order(void)
@@ -2214,7 +2243,8 @@ static void build_zonelists(pg_data_t *pgdat)
/* initialize zonelists */
for (i = 0; i < MAX_ZONELISTS; i++) {
zonelist = pgdat->node_zonelists + i;
- zonelist->zones[0] = NULL;
+ zonelist->_zonerefs[0].zone = NULL;
+ zonelist->_zonerefs[0].zone_idx = 0;
}
/* NUMA-aware ordering of nodes */
@@ -2264,19 +2294,15 @@ static void build_zonelists(pg_data_t *pgdat)
/* Construct the zonelist performance cache - see further mmzone.h */
static void build_zonelist_cache(pg_data_t *pgdat)
{
- int i;
-
- for (i = 0; i < MAX_NR_ZONES; i++) {
- struct zonelist *zonelist;
- struct zonelist_cache *zlc;
- struct zone **z;
+ struct zonelist *zonelist;
+ struct zonelist_cache *zlc;
+ struct zoneref *z;
- zonelist = pgdat->node_zonelists + i;
- zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
- bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
- for (z = zonelist->zones; *z; z++)
- zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z);
- }
+ zonelist = &pgdat->node_zonelists[0];
+ zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
+ bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+ for (z = zonelist->_zonerefs; z->zone; z++)
+ zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
}
@@ -2290,45 +2316,44 @@ static void set_zonelist_order(void)
static void build_zonelists(pg_data_t *pgdat)
{
int node, local_node;
- enum zone_type i,j;
+ enum zone_type j;
+ struct zonelist *zonelist;
local_node = pgdat->node_id;
- for (i = 0; i < MAX_NR_ZONES; i++) {
- struct zonelist *zonelist;
-
- zonelist = pgdat->node_zonelists + i;
- j = build_zonelists_node(pgdat, zonelist, 0, i);
- /*
- * Now we build the zonelist so that it contains the zones
- * of all the other nodes.
- * We don't want to pressure a particular node, so when
- * building the zones for node N, we make sure that the
- * zones coming right after the local ones are those from
- * node N+1 (modulo N)
- */
- for (node = local_node + 1; node < MAX_NUMNODES; node++) {
- if (!node_online(node))
- continue;
- j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
- }
- for (node = 0; node < local_node; node++) {
- if (!node_online(node))
- continue;
- j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
- }
+ zonelist = &pgdat->node_zonelists[0];
+ j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
- zonelist->zones[j] = NULL;
+ /*
+ * Now we build the zonelist so that it contains the zones
+ * of all the other nodes.
+ * We don't want to pressure a particular node, so when
+ * building the zones for node N, we make sure that the
+ * zones coming right after the local ones are those from
+ * node N+1 (modulo N)
+ */
+ for (node = local_node + 1; node < MAX_NUMNODES; node++) {
+ if (!node_online(node))
+ continue;
+ j = build_zonelists_node(NODE_DATA(node), zonelist, j,
+ MAX_NR_ZONES - 1);
}
+ for (node = 0; node < local_node; node++) {
+ if (!node_online(node))
+ continue;
+ j = build_zonelists_node(NODE_DATA(node), zonelist, j,
+ MAX_NR_ZONES - 1);
+ }
+
+ zonelist->_zonerefs[j].zone = NULL;
+ zonelist->_zonerefs[j].zone_idx = 0;
}
/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
static void build_zonelist_cache(pg_data_t *pgdat)
{
- int i;
-
- for (i = 0; i < MAX_NR_ZONES; i++)
- pgdat->node_zonelists[i].zlcache_ptr = NULL;
+ pgdat->node_zonelists[0].zlcache_ptr = NULL;
+ pgdat->node_zonelists[1].zlcache_ptr = NULL;
}
#endif /* CONFIG_NUMA */
@@ -2518,7 +2543,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
struct page *page;
unsigned long end_pfn = start_pfn + size;
unsigned long pfn;
+ struct zone *z;
+ z = &NODE_DATA(nid)->node_zones[zone];
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
/*
* There can be holes in boot-time mem_map[]s
@@ -2536,7 +2563,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
init_page_count(page);
reset_page_mapcount(page);
SetPageReserved(page);
-
/*
* Mark the block movable so that blocks are reserved for
* movable at startup. This will force kernel allocations
@@ -2545,8 +2571,15 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
* kernel allocations are made. Later some blocks near
* the start are marked MIGRATE_RESERVE by
* setup_zone_migrate_reserve()
+ *
+ * bitmap is created for zone's valid pfn range. but memmap
+ * can be created for invalid pages (for alignment)
+ * check here not to call set_pageblock_migratetype() against
+ * pfn out of zone.
*/
- if ((pfn & (pageblock_nr_pages-1)))
+ if ((z->zone_start_pfn <= pfn)
+ && (pfn < z->zone_start_pfn + z->spanned_pages)
+ && !(pfn & (pageblock_nr_pages - 1)))
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
INIT_LIST_HEAD(&page->lru);
@@ -4339,9 +4372,7 @@ void *__init alloc_large_system_hash(const char *tablename,
else if (hashdist)
table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
else {
- unsigned long order;
- for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++)
- ;
+ unsigned long order = get_order(size);
table = (void*) __get_free_pages(GFP_ATOMIC, order);
/*
* If bucketsize is not a power-of-two, we may free
@@ -4460,6 +4491,8 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
pfn = page_to_pfn(page);
bitmap = get_pageblock_bitmap(zone, pfn);
bitidx = pfn_to_bitidx(zone, pfn);
+ VM_BUG_ON(pfn < zone->zone_start_pfn);
+ VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages);
for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
if (flags & value)
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 1cf1417ef8b..0afd2387e50 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -9,11 +9,15 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
int err = 0;
pte = pte_offset_map(pmd, addr);
- do {
+ for (;;) {
err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, private);
if (err)
break;
- } while (pte++, addr += PAGE_SIZE, addr != end);
+ addr += PAGE_SIZE;
+ if (addr == end)
+ break;
+ pte++;
+ }
pte_unmap(pte);
return err;
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 8f6ee073c0e..1c96cfc9e04 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -17,8 +17,8 @@
#include <linux/gfp.h>
#include <linux/init.h>
#include <linux/module.h>
-#include <linux/fs.h> // Needed by writeback.h
-#include <linux/writeback.h> // Prototypes pdflush_operation()
+#include <linux/fs.h> /* Needed by writeback.h */
+#include <linux/writeback.h> /* Prototypes pdflush_operation() */
#include <linux/kthread.h>
#include <linux/cpuset.h>
#include <linux/freezer.h>
@@ -187,8 +187,8 @@ static int pdflush(void *dummy)
* This is needed as pdflush's are dynamically created and destroyed.
* The boottime pdflush's are easily placed w/o these 2 lines.
*/
- cpus_allowed = cpuset_cpus_allowed(current);
- set_cpus_allowed(current, cpus_allowed);
+ cpuset_cpus_allowed(current, &cpus_allowed);
+ set_cpus_allowed_ptr(current, &cpus_allowed);
return __pdflush(&my_work);
}
diff --git a/mm/readahead.c b/mm/readahead.c
index 8762e898897..d8723a5f649 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -235,7 +235,13 @@ unsigned long max_sane_readahead(unsigned long nr)
static int __init readahead_init(void)
{
- return bdi_init(&default_backing_dev_info);
+ int err;
+
+ err = bdi_init(&default_backing_dev_info);
+ if (!err)
+ bdi_register(&default_backing_dev_info, NULL, "default");
+
+ return err;
}
subsys_initcall(readahead_init);
diff --git a/mm/rmap.c b/mm/rmap.c
index 997f06907b6..bf0a5b7cfb8 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -413,9 +413,6 @@ int page_referenced(struct page *page, int is_locked,
{
int referenced = 0;
- if (page_test_and_clear_young(page))
- referenced++;
-
if (TestClearPageReferenced(page))
referenced++;
@@ -433,6 +430,10 @@ int page_referenced(struct page *page, int is_locked,
unlock_page(page);
}
}
+
+ if (page_test_and_clear_young(page))
+ referenced++;
+
return referenced;
}
@@ -661,7 +662,6 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
printk (KERN_EMERG " page->mapping = %p\n", page->mapping);
print_symbol (KERN_EMERG " vma->vm_ops = %s\n", (unsigned long)vma->vm_ops);
if (vma->vm_ops) {
- print_symbol (KERN_EMERG " vma->vm_ops->nopage = %s\n", (unsigned long)vma->vm_ops->nopage);
print_symbol (KERN_EMERG " vma->vm_ops->fault = %s\n", (unsigned long)vma->vm_ops->fault);
}
if (vma->vm_file && vma->vm_file->f_op)
diff --git a/mm/shmem.c b/mm/shmem.c
index f514dd392cd..e2a6ae1a44e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -201,7 +201,7 @@ static struct vm_operations_struct shmem_vm_ops;
static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
.ra_pages = 0, /* No readahead */
- .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
+ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
.unplug_io_fn = default_unplug_io_fn,
};
@@ -1079,104 +1079,47 @@ redirty:
#ifdef CONFIG_NUMA
#ifdef CONFIG_TMPFS
-static int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
+static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
{
- char *nodelist = strchr(value, ':');
- int err = 1;
+ char buffer[64];
- if (nodelist) {
- /* NUL-terminate policy string */
- *nodelist++ = '\0';
- if (nodelist_parse(nodelist, *policy_nodes))
- goto out;
- if (!nodes_subset(*policy_nodes, node_states[N_HIGH_MEMORY]))
- goto out;
- }
- if (!strcmp(value, "default")) {
- *policy = MPOL_DEFAULT;
- /* Don't allow a nodelist */
- if (!nodelist)
- err = 0;
- } else if (!strcmp(value, "prefer")) {
- *policy = MPOL_PREFERRED;
- /* Insist on a nodelist of one node only */
- if (nodelist) {
- char *rest = nodelist;
- while (isdigit(*rest))
- rest++;
- if (!*rest)
- err = 0;
- }
- } else if (!strcmp(value, "bind")) {
- *policy = MPOL_BIND;
- /* Insist on a nodelist */
- if (nodelist)
- err = 0;
- } else if (!strcmp(value, "interleave")) {
- *policy = MPOL_INTERLEAVE;
- /*
- * Default to online nodes with memory if no nodelist
- */
- if (!nodelist)
- *policy_nodes = node_states[N_HIGH_MEMORY];
- err = 0;
- }
-out:
- /* Restore string for error message */
- if (nodelist)
- *--nodelist = ':';
- return err;
-}
-
-static void shmem_show_mpol(struct seq_file *seq, int policy,
- const nodemask_t policy_nodes)
-{
- char *policy_string;
+ if (!mpol || mpol->mode == MPOL_DEFAULT)
+ return; /* show nothing */
- switch (policy) {
- case MPOL_PREFERRED:
- policy_string = "prefer";
- break;
- case MPOL_BIND:
- policy_string = "bind";
- break;
- case MPOL_INTERLEAVE:
- policy_string = "interleave";
- break;
- default:
- /* MPOL_DEFAULT */
- return;
- }
+ mpol_to_str(buffer, sizeof(buffer), mpol, 1);
- seq_printf(seq, ",mpol=%s", policy_string);
-
- if (policy != MPOL_INTERLEAVE ||
- !nodes_equal(policy_nodes, node_states[N_HIGH_MEMORY])) {
- char buffer[64];
- int len;
+ seq_printf(seq, ",mpol=%s", buffer);
+}
- len = nodelist_scnprintf(buffer, sizeof(buffer), policy_nodes);
- if (len < sizeof(buffer))
- seq_printf(seq, ":%s", buffer);
- else
- seq_printf(seq, ":?");
+static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
+{
+ struct mempolicy *mpol = NULL;
+ if (sbinfo->mpol) {
+ spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */
+ mpol = sbinfo->mpol;
+ mpol_get(mpol);
+ spin_unlock(&sbinfo->stat_lock);
}
+ return mpol;
}
#endif /* CONFIG_TMPFS */
static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
struct shmem_inode_info *info, unsigned long idx)
{
+ struct mempolicy mpol, *spol;
struct vm_area_struct pvma;
struct page *page;
+ spol = mpol_cond_copy(&mpol,
+ mpol_shared_policy_lookup(&info->policy, idx));
+
/* Create a pseudo vma that just contains the policy */
pvma.vm_start = 0;
pvma.vm_pgoff = idx;
pvma.vm_ops = NULL;
- pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
+ pvma.vm_policy = spol;
page = swapin_readahead(entry, gfp, &pvma, 0);
- mpol_free(pvma.vm_policy);
return page;
}
@@ -1184,27 +1127,21 @@ static struct page *shmem_alloc_page(gfp_t gfp,
struct shmem_inode_info *info, unsigned long idx)
{
struct vm_area_struct pvma;
- struct page *page;
/* Create a pseudo vma that just contains the policy */
pvma.vm_start = 0;
pvma.vm_pgoff = idx;
pvma.vm_ops = NULL;
pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
- page = alloc_page_vma(gfp, &pvma, 0);
- mpol_free(pvma.vm_policy);
- return page;
+
+ /*
+ * alloc_page_vma() will drop the shared policy reference
+ */
+ return alloc_page_vma(gfp, &pvma, 0);
}
#else /* !CONFIG_NUMA */
#ifdef CONFIG_TMPFS
-static inline int shmem_parse_mpol(char *value, int *policy,
- nodemask_t *policy_nodes)
-{
- return 1;
-}
-
-static inline void shmem_show_mpol(struct seq_file *seq, int policy,
- const nodemask_t policy_nodes)
+static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *p)
{
}
#endif /* CONFIG_TMPFS */
@@ -1222,6 +1159,13 @@ static inline struct page *shmem_alloc_page(gfp_t gfp,
}
#endif /* CONFIG_NUMA */
+#if !defined(CONFIG_NUMA) || !defined(CONFIG_TMPFS)
+static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
+{
+ return NULL;
+}
+#endif
+
/*
* shmem_getpage - either get the page from swap or allocate a new one
*
@@ -1576,8 +1520,8 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
case S_IFREG:
inode->i_op = &shmem_inode_operations;
inode->i_fop = &shmem_file_operations;
- mpol_shared_policy_init(&info->policy, sbinfo->policy,
- &sbinfo->policy_nodes);
+ mpol_shared_policy_init(&info->policy,
+ shmem_get_sbmpol(sbinfo));
break;
case S_IFDIR:
inc_nlink(inode);
@@ -1591,8 +1535,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
* Must not load anything in the rbtree,
* mpol_free_shared_policy will not be called.
*/
- mpol_shared_policy_init(&info->policy, MPOL_DEFAULT,
- NULL);
+ mpol_shared_policy_init(&info->policy, NULL);
break;
}
} else
@@ -2207,8 +2150,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
if (*rest)
goto bad_val;
} else if (!strcmp(this_char,"mpol")) {
- if (shmem_parse_mpol(value, &sbinfo->policy,
- &sbinfo->policy_nodes))
+ if (mpol_parse_str(value, &sbinfo->mpol, 1))
goto bad_val;
} else {
printk(KERN_ERR "tmpfs: Bad mount option %s\n",
@@ -2259,8 +2201,9 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
sbinfo->free_blocks = config.max_blocks - blocks;
sbinfo->max_inodes = config.max_inodes;
sbinfo->free_inodes = config.max_inodes - inodes;
- sbinfo->policy = config.policy;
- sbinfo->policy_nodes = config.policy_nodes;
+
+ mpol_put(sbinfo->mpol);
+ sbinfo->mpol = config.mpol; /* transfers initial ref */
out:
spin_unlock(&sbinfo->stat_lock);
return error;
@@ -2281,7 +2224,7 @@ static int shmem_show_options(struct seq_file *seq, struct vfsmount *vfs)
seq_printf(seq, ",uid=%u", sbinfo->uid);
if (sbinfo->gid != 0)
seq_printf(seq, ",gid=%u", sbinfo->gid);
- shmem_show_mpol(seq, sbinfo->policy, sbinfo->policy_nodes);
+ shmem_show_mpol(seq, sbinfo->mpol);
return 0;
}
#endif /* CONFIG_TMPFS */
@@ -2311,8 +2254,7 @@ static int shmem_fill_super(struct super_block *sb,
sbinfo->mode = S_IRWXUGO | S_ISVTX;
sbinfo->uid = current->fsuid;
sbinfo->gid = current->fsgid;
- sbinfo->policy = MPOL_DEFAULT;
- sbinfo->policy_nodes = node_states[N_HIGH_MEMORY];
+ sbinfo->mpol = NULL;
sb->s_fs_info = sbinfo;
#ifdef CONFIG_TMPFS
diff --git a/mm/slab.c b/mm/slab.c
index 04b308c3bc5..06236e4ddc1 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -110,6 +110,7 @@
#include <linux/fault-inject.h>
#include <linux/rtmutex.h>
#include <linux/reciprocal_div.h>
+#include <linux/debugobjects.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
@@ -139,10 +140,6 @@
#define BYTES_PER_WORD sizeof(void *)
#define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long))
-#ifndef cache_line_size
-#define cache_line_size() L1_CACHE_BYTES
-#endif
-
#ifndef ARCH_KMALLOC_MINALIGN
/*
* Enforce a minimum alignment for the kmalloc caches.
@@ -178,12 +175,14 @@
SLAB_CACHE_DMA | \
SLAB_STORE_USER | \
SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
- SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
+ SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
+ SLAB_DEBUG_OBJECTS)
#else
# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \
SLAB_CACHE_DMA | \
SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
- SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
+ SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
+ SLAB_DEBUG_OBJECTS)
#endif
/*
@@ -862,7 +861,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
*left_over = slab_size - nr_objs*buffer_size - mgmt_size;
}
-#define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg)
+#define slab_error(cachep, msg) __slab_error(__func__, cachep, msg)
static void __slab_error(const char *function, struct kmem_cache *cachep,
char *msg)
@@ -1160,14 +1159,13 @@ static void __cpuinit cpuup_canceled(long cpu)
struct kmem_cache *cachep;
struct kmem_list3 *l3 = NULL;
int node = cpu_to_node(cpu);
+ node_to_cpumask_ptr(mask, node);
list_for_each_entry(cachep, &cache_chain, next) {
struct array_cache *nc;
struct array_cache *shared;
struct array_cache **alien;
- cpumask_t mask;
- mask = node_to_cpumask(node);
/* cpu is dead; no one can alloc from it. */
nc = cachep->array[cpu];
cachep->array[cpu] = NULL;
@@ -1183,7 +1181,7 @@ static void __cpuinit cpuup_canceled(long cpu)
if (nc)
free_block(cachep, nc->entry, nc->avail, node);
- if (!cpus_empty(mask)) {
+ if (!cpus_empty(*mask)) {
spin_unlock_irq(&l3->list_lock);
goto free_array_cache;
}
@@ -2158,7 +2156,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
*/
if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
size > KMALLOC_MAX_SIZE) {
- printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__,
+ printk(KERN_ERR "%s: Early error in slab %s\n", __func__,
name);
BUG();
}
@@ -3243,15 +3241,16 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
{
struct zonelist *zonelist;
gfp_t local_flags;
- struct zone **z;
+ struct zoneref *z;
+ struct zone *zone;
+ enum zone_type high_zoneidx = gfp_zone(flags);
void *obj = NULL;
int nid;
if (flags & __GFP_THISNODE)
return NULL;
- zonelist = &NODE_DATA(slab_node(current->mempolicy))
- ->node_zonelists[gfp_zone(flags)];
+ zonelist = node_zonelist(slab_node(current->mempolicy), flags);
local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
retry:
@@ -3259,10 +3258,10 @@ retry:
* Look through allowed nodes for objects available
* from existing per node queues.
*/
- for (z = zonelist->zones; *z && !obj; z++) {
- nid = zone_to_nid(*z);
+ for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+ nid = zone_to_nid(zone);
- if (cpuset_zone_allowed_hardwall(*z, flags) &&
+ if (cpuset_zone_allowed_hardwall(zone, flags) &&
cache->nodelists[nid] &&
cache->nodelists[nid]->free_objects)
obj = ____cache_alloc_node(cache,
@@ -3764,6 +3763,8 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
local_irq_save(flags);
debug_check_no_locks_freed(objp, obj_size(cachep));
+ if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
+ debug_check_no_obj_freed(objp, obj_size(cachep));
__cache_free(cachep, objp);
local_irq_restore(flags);
}
@@ -3789,6 +3790,7 @@ void kfree(const void *objp)
kfree_debugcheck(objp);
c = virt_to_cache(objp);
debug_check_no_locks_freed(objp, obj_size(c));
+ debug_check_no_obj_freed(objp, obj_size(c));
__cache_free(c, (void *)objp);
local_irq_restore(flags);
}
diff --git a/mm/slob.c b/mm/slob.c
index e2c3c0ec546..6038cbadf79 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -533,7 +533,8 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
{
struct kmem_cache *c;
- c = slob_alloc(sizeof(struct kmem_cache), flags, 0, -1);
+ c = slob_alloc(sizeof(struct kmem_cache),
+ flags, ARCH_KMALLOC_MINALIGN, -1);
if (c) {
c->name = name;
diff --git a/mm/slub.c b/mm/slub.c
index acc975fcc8c..32b62623846 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -19,8 +19,10 @@
#include <linux/cpuset.h>
#include <linux/mempolicy.h>
#include <linux/ctype.h>
+#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/memory.h>
+#include <linux/math64.h>
/*
* Lock order:
@@ -149,25 +151,6 @@ static inline void ClearSlabDebug(struct page *page)
/* Enable to test recovery from slab corruption on boot */
#undef SLUB_RESILIENCY_TEST
-#if PAGE_SHIFT <= 12
-
-/*
- * Small page size. Make sure that we do not fragment memory
- */
-#define DEFAULT_MAX_ORDER 1
-#define DEFAULT_MIN_OBJECTS 4
-
-#else
-
-/*
- * Large page machines are customarily able to handle larger
- * page orders.
- */
-#define DEFAULT_MAX_ORDER 2
-#define DEFAULT_MIN_OBJECTS 8
-
-#endif
-
/*
* Mininum number of partial slabs. These will be left on the partial
* lists even if they are empty. kmem_cache_shrink may reclaim them.
@@ -204,13 +187,6 @@ static inline void ClearSlabDebug(struct page *page)
/* Internal SLUB flags */
#define __OBJECT_POISON 0x80000000 /* Poison object */
#define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */
-#define __KMALLOC_CACHE 0x20000000 /* objects freed using kfree */
-#define __PAGE_ALLOC_FALLBACK 0x10000000 /* Allow fallback to page alloc */
-
-/* Not all arches define cache_line_size */
-#ifndef cache_line_size
-#define cache_line_size() L1_CACHE_BYTES
-#endif
static int kmem_size = sizeof(struct kmem_cache);
@@ -301,7 +277,7 @@ static inline int check_valid_pointer(struct kmem_cache *s,
return 1;
base = page_address(page);
- if (object < base || object >= base + s->objects * s->size ||
+ if (object < base || object >= base + page->objects * s->size ||
(object - base) % s->size) {
return 0;
}
@@ -327,8 +303,8 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
}
/* Loop over all objects in a slab */
-#define for_each_object(__p, __s, __addr) \
- for (__p = (__addr); __p < (__addr) + (__s)->objects * (__s)->size;\
+#define for_each_object(__p, __s, __addr, __objects) \
+ for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\
__p += (__s)->size)
/* Scan freelist */
@@ -341,6 +317,26 @@ static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
return (p - addr) / s->size;
}
+static inline struct kmem_cache_order_objects oo_make(int order,
+ unsigned long size)
+{
+ struct kmem_cache_order_objects x = {
+ (order << 16) + (PAGE_SIZE << order) / size
+ };
+
+ return x;
+}
+
+static inline int oo_order(struct kmem_cache_order_objects x)
+{
+ return x.x >> 16;
+}
+
+static inline int oo_objects(struct kmem_cache_order_objects x)
+{
+ return x.x & ((1 << 16) - 1);
+}
+
#ifdef CONFIG_SLUB_DEBUG
/*
* Debug settings:
@@ -451,8 +447,8 @@ static void print_tracking(struct kmem_cache *s, void *object)
static void print_page_info(struct page *page)
{
- printk(KERN_ERR "INFO: Slab 0x%p used=%u fp=0x%p flags=0x%04lx\n",
- page, page->inuse, page->freelist, page->flags);
+ printk(KERN_ERR "INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n",
+ page, page->objects, page->inuse, page->freelist, page->flags);
}
@@ -521,7 +517,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
static void object_err(struct kmem_cache *s, struct page *page,
u8 *object, char *reason)
{
- slab_bug(s, reason);
+ slab_bug(s, "%s", reason);
print_trailer(s, page, object);
}
@@ -533,7 +529,7 @@ static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...)
va_start(args, fmt);
vsnprintf(buf, sizeof(buf), fmt, args);
va_end(args);
- slab_bug(s, fmt);
+ slab_bug(s, "%s", buf);
print_page_info(page);
dump_stack();
}
@@ -652,6 +648,7 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
p + off, POISON_INUSE, s->size - off);
}
+/* Check the pad bytes at the end of a slab page */
static int slab_pad_check(struct kmem_cache *s, struct page *page)
{
u8 *start;
@@ -664,20 +661,20 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
return 1;
start = page_address(page);
- end = start + (PAGE_SIZE << s->order);
- length = s->objects * s->size;
- remainder = end - (start + length);
+ length = (PAGE_SIZE << compound_order(page));
+ end = start + length;
+ remainder = length % s->size;
if (!remainder)
return 1;
- fault = check_bytes(start + length, POISON_INUSE, remainder);
+ fault = check_bytes(end - remainder, POISON_INUSE, remainder);
if (!fault)
return 1;
while (end > fault && end[-1] == POISON_INUSE)
end--;
slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
- print_section("Padding", start, length);
+ print_section("Padding", end - remainder, remainder);
restore_bytes(s, "slab padding", POISON_INUSE, start, end);
return 0;
@@ -739,15 +736,24 @@ static int check_object(struct kmem_cache *s, struct page *page,
static int check_slab(struct kmem_cache *s, struct page *page)
{
+ int maxobj;
+
VM_BUG_ON(!irqs_disabled());
if (!PageSlab(page)) {
slab_err(s, page, "Not a valid slab page");
return 0;
}
- if (page->inuse > s->objects) {
+
+ maxobj = (PAGE_SIZE << compound_order(page)) / s->size;
+ if (page->objects > maxobj) {
+ slab_err(s, page, "objects %u > max %u",
+ s->name, page->objects, maxobj);
+ return 0;
+ }
+ if (page->inuse > page->objects) {
slab_err(s, page, "inuse %u > max %u",
- s->name, page->inuse, s->objects);
+ s->name, page->inuse, page->objects);
return 0;
}
/* Slab_pad_check fixes things up after itself */
@@ -764,8 +770,9 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
int nr = 0;
void *fp = page->freelist;
void *object = NULL;
+ unsigned long max_objects;
- while (fp && nr <= s->objects) {
+ while (fp && nr <= page->objects) {
if (fp == search)
return 1;
if (!check_valid_pointer(s, page, fp)) {
@@ -777,7 +784,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
} else {
slab_err(s, page, "Freepointer corrupt");
page->freelist = NULL;
- page->inuse = s->objects;
+ page->inuse = page->objects;
slab_fix(s, "Freelist cleared");
return 0;
}
@@ -788,10 +795,20 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
nr++;
}
- if (page->inuse != s->objects - nr) {
+ max_objects = (PAGE_SIZE << compound_order(page)) / s->size;
+ if (max_objects > 65535)
+ max_objects = 65535;
+
+ if (page->objects != max_objects) {
+ slab_err(s, page, "Wrong number of objects. Found %d but "
+ "should be %d", page->objects, max_objects);
+ page->objects = max_objects;
+ slab_fix(s, "Number of objects adjusted.");
+ }
+ if (page->inuse != page->objects - nr) {
slab_err(s, page, "Wrong object count. Counter is %d but "
- "counted were %d", page->inuse, s->objects - nr);
- page->inuse = s->objects - nr;
+ "counted were %d", page->inuse, page->objects - nr);
+ page->inuse = page->objects - nr;
slab_fix(s, "Object count adjusted.");
}
return search == NULL;
@@ -837,6 +854,38 @@ static void remove_full(struct kmem_cache *s, struct page *page)
spin_unlock(&n->list_lock);
}
+/* Tracking of the number of slabs for debugging purposes */
+static inline unsigned long slabs_node(struct kmem_cache *s, int node)
+{
+ struct kmem_cache_node *n = get_node(s, node);
+
+ return atomic_long_read(&n->nr_slabs);
+}
+
+static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
+{
+ struct kmem_cache_node *n = get_node(s, node);
+
+ /*
+ * May be called early in order to allocate a slab for the
+ * kmem_cache_node structure. Solve the chicken-egg
+ * dilemma by deferring the increment of the count during
+ * bootstrap (see early_kmem_cache_node_alloc).
+ */
+ if (!NUMA_BUILD || n) {
+ atomic_long_inc(&n->nr_slabs);
+ atomic_long_add(objects, &n->total_objects);
+ }
+}
+static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
+{
+ struct kmem_cache_node *n = get_node(s, node);
+
+ atomic_long_dec(&n->nr_slabs);
+ atomic_long_sub(objects, &n->total_objects);
+}
+
+/* Object debug checks for alloc/free paths */
static void setup_object_debug(struct kmem_cache *s, struct page *page,
void *object)
{
@@ -881,7 +930,7 @@ bad:
* as used avoids touching the remaining objects.
*/
slab_fix(s, "Marking all objects used");
- page->inuse = s->objects;
+ page->inuse = page->objects;
page->freelist = NULL;
}
return 0;
@@ -1028,29 +1077,55 @@ static inline unsigned long kmem_cache_flags(unsigned long objsize,
return flags;
}
#define slub_debug 0
+
+static inline unsigned long slabs_node(struct kmem_cache *s, int node)
+ { return 0; }
+static inline void inc_slabs_node(struct kmem_cache *s, int node,
+ int objects) {}
+static inline void dec_slabs_node(struct kmem_cache *s, int node,
+ int objects) {}
#endif
+
/*
* Slab allocation and freeing
*/
+static inline struct page *alloc_slab_page(gfp_t flags, int node,
+ struct kmem_cache_order_objects oo)
+{
+ int order = oo_order(oo);
+
+ if (node == -1)
+ return alloc_pages(flags, order);
+ else
+ return alloc_pages_node(node, flags, order);
+}
+
static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
{
struct page *page;
- int pages = 1 << s->order;
+ struct kmem_cache_order_objects oo = s->oo;
flags |= s->allocflags;
- if (node == -1)
- page = alloc_pages(flags, s->order);
- else
- page = alloc_pages_node(node, flags, s->order);
-
- if (!page)
- return NULL;
+ page = alloc_slab_page(flags | __GFP_NOWARN | __GFP_NORETRY, node,
+ oo);
+ if (unlikely(!page)) {
+ oo = s->min;
+ /*
+ * Allocation may have failed due to fragmentation.
+ * Try a lower order alloc if possible
+ */
+ page = alloc_slab_page(flags, node, oo);
+ if (!page)
+ return NULL;
+ stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK);
+ }
+ page->objects = oo_objects(oo);
mod_zone_page_state(page_zone(page),
(s->flags & SLAB_RECLAIM_ACCOUNT) ?
NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
- pages);
+ 1 << oo_order(oo));
return page;
}
@@ -1066,7 +1141,6 @@ static void setup_object(struct kmem_cache *s, struct page *page,
static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
{
struct page *page;
- struct kmem_cache_node *n;
void *start;
void *last;
void *p;
@@ -1078,9 +1152,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
if (!page)
goto out;
- n = get_node(s, page_to_nid(page));
- if (n)
- atomic_long_inc(&n->nr_slabs);
+ inc_slabs_node(s, page_to_nid(page), page->objects);
page->slab = s;
page->flags |= 1 << PG_slab;
if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
@@ -1090,10 +1162,10 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
start = page_address(page);
if (unlikely(s->flags & SLAB_POISON))
- memset(start, POISON_INUSE, PAGE_SIZE << s->order);
+ memset(start, POISON_INUSE, PAGE_SIZE << compound_order(page));
last = start;
- for_each_object(p, s, start) {
+ for_each_object(p, s, start, page->objects) {
setup_object(s, page, last);
set_freepointer(s, last, p);
last = p;
@@ -1109,13 +1181,15 @@ out:
static void __free_slab(struct kmem_cache *s, struct page *page)
{
- int pages = 1 << s->order;
+ int order = compound_order(page);
+ int pages = 1 << order;
if (unlikely(SlabDebug(page))) {
void *p;
slab_pad_check(s, page);
- for_each_object(p, s, page_address(page))
+ for_each_object(p, s, page_address(page),
+ page->objects)
check_object(s, page, p, 0);
ClearSlabDebug(page);
}
@@ -1125,7 +1199,9 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
-pages);
- __free_pages(page, s->order);
+ __ClearPageSlab(page);
+ reset_page_mapcount(page);
+ __free_pages(page, order);
}
static void rcu_free_slab(struct rcu_head *h)
@@ -1151,11 +1227,7 @@ static void free_slab(struct kmem_cache *s, struct page *page)
static void discard_slab(struct kmem_cache *s, struct page *page)
{
- struct kmem_cache_node *n = get_node(s, page_to_nid(page));
-
- atomic_long_dec(&n->nr_slabs);
- reset_page_mapcount(page);
- __ClearPageSlab(page);
+ dec_slabs_node(s, page_to_nid(page), page->objects);
free_slab(s, page);
}
@@ -1255,7 +1327,9 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
{
#ifdef CONFIG_NUMA
struct zonelist *zonelist;
- struct zone **z;
+ struct zoneref *z;
+ struct zone *zone;
+ enum zone_type high_zoneidx = gfp_zone(flags);
struct page *page;
/*
@@ -1280,14 +1354,13 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
get_cycles() % 1024 > s->remote_node_defrag_ratio)
return NULL;
- zonelist = &NODE_DATA(
- slab_node(current->mempolicy))->node_zonelists[gfp_zone(flags)];
- for (z = zonelist->zones; *z; z++) {
+ zonelist = node_zonelist(slab_node(current->mempolicy), flags);
+ for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
struct kmem_cache_node *n;
- n = get_node(s, zone_to_nid(*z));
+ n = get_node(s, zone_to_nid(zone));
- if (n && cpuset_zone_allowed_hardwall(*z, flags) &&
+ if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
n->nr_partial > MIN_PARTIAL) {
page = get_partial_node(n);
if (page)
@@ -1490,7 +1563,7 @@ load_freelist:
goto debug;
c->freelist = object[c->offset];
- c->page->inuse = s->objects;
+ c->page->inuse = c->page->objects;
c->page->freelist = NULL;
c->node = page_to_nid(c->page);
unlock_out:
@@ -1527,27 +1600,6 @@ new_slab:
c->page = new;
goto load_freelist;
}
-
- /*
- * No memory available.
- *
- * If the slab uses higher order allocs but the object is
- * smaller than a page size then we can fallback in emergencies
- * to the page allocator via kmalloc_large. The page allocator may
- * have failed to obtain a higher order page and we can try to
- * allocate a single page if the object fits into a single page.
- * That is only possible if certain conditions are met that are being
- * checked when a slab is created.
- */
- if (!(gfpflags & __GFP_NORETRY) &&
- (s->flags & __PAGE_ALLOC_FALLBACK)) {
- if (gfpflags & __GFP_WAIT)
- local_irq_enable();
- object = kmalloc_large(s->objsize, gfpflags);
- if (gfpflags & __GFP_WAIT)
- local_irq_disable();
- return object;
- }
return NULL;
debug:
if (!alloc_debug_processing(s, c->page, object, addr))
@@ -1697,6 +1749,8 @@ static __always_inline void slab_free(struct kmem_cache *s,
local_irq_save(flags);
c = get_cpu_slab(s, smp_processor_id());
debug_check_no_locks_freed(object, c->objsize);
+ if (!(s->flags & SLAB_DEBUG_OBJECTS))
+ debug_check_no_obj_freed(object, s->objsize);
if (likely(page == c->page && c->node >= 0)) {
object[c->offset] = c->freelist;
c->freelist = object;
@@ -1748,8 +1802,8 @@ static struct page *get_object_page(const void *x)
* take the list_lock.
*/
static int slub_min_order;
-static int slub_max_order = DEFAULT_MAX_ORDER;
-static int slub_min_objects = DEFAULT_MIN_OBJECTS;
+static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER;
+static int slub_min_objects;
/*
* Merge control. If this is set then no merging of slab caches will occur.
@@ -1764,7 +1818,7 @@ static int slub_nomerge;
* system components. Generally order 0 allocations should be preferred since
* order 0 does not cause fragmentation in the page allocator. Larger objects
* be problematic to put into order 0 slabs because there may be too much
- * unused space left. We go to a higher order if more than 1/8th of the slab
+ * unused space left. We go to a higher order if more than 1/16th of the slab
* would be wasted.
*
* In order to reach satisfactory performance we must ensure that a minimum
@@ -1789,6 +1843,9 @@ static inline int slab_order(int size, int min_objects,
int rem;
int min_order = slub_min_order;
+ if ((PAGE_SIZE << min_order) / size > 65535)
+ return get_order(size * 65535) - 1;
+
for (order = max(min_order,
fls(min_objects * size - 1) - PAGE_SHIFT);
order <= max_order; order++) {
@@ -1823,8 +1880,10 @@ static inline int calculate_order(int size)
* we reduce the minimum objects required in a slab.
*/
min_objects = slub_min_objects;
+ if (!min_objects)
+ min_objects = 4 * (fls(nr_cpu_ids) + 1);
while (min_objects > 1) {
- fraction = 8;
+ fraction = 16;
while (fraction >= 4) {
order = slab_order(size, min_objects,
slub_max_order, fraction);
@@ -1886,15 +1945,18 @@ static void init_kmem_cache_cpu(struct kmem_cache *s,
c->node = 0;
c->offset = s->offset / sizeof(void *);
c->objsize = s->objsize;
+#ifdef CONFIG_SLUB_STATS
+ memset(c->stat, 0, NR_SLUB_STAT_ITEMS * sizeof(unsigned));
+#endif
}
static void init_kmem_cache_node(struct kmem_cache_node *n)
{
n->nr_partial = 0;
- atomic_long_set(&n->nr_slabs, 0);
spin_lock_init(&n->list_lock);
INIT_LIST_HEAD(&n->partial);
#ifdef CONFIG_SLUB_DEBUG
+ atomic_long_set(&n->nr_slabs, 0);
INIT_LIST_HEAD(&n->full);
#endif
}
@@ -2063,7 +2125,7 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags,
init_tracking(kmalloc_caches, n);
#endif
init_kmem_cache_node(n);
- atomic_long_inc(&n->nr_slabs);
+ inc_slabs_node(kmalloc_caches, node, page->objects);
/*
* lockdep requires consistent irq usage for each lock
@@ -2139,11 +2201,12 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
* calculate_sizes() determines the order and the distribution of data within
* a slab object.
*/
-static int calculate_sizes(struct kmem_cache *s)
+static int calculate_sizes(struct kmem_cache *s, int forced_order)
{
unsigned long flags = s->flags;
unsigned long size = s->objsize;
unsigned long align = s->align;
+ int order;
/*
* Round up object size to the next word boundary. We can only
@@ -2227,26 +2290,16 @@ static int calculate_sizes(struct kmem_cache *s)
*/
size = ALIGN(size, align);
s->size = size;
+ if (forced_order >= 0)
+ order = forced_order;
+ else
+ order = calculate_order(size);
- if ((flags & __KMALLOC_CACHE) &&
- PAGE_SIZE / size < slub_min_objects) {
- /*
- * Kmalloc cache that would not have enough objects in
- * an order 0 page. Kmalloc slabs can fallback to
- * page allocator order 0 allocs so take a reasonably large
- * order that will allows us a good number of objects.
- */
- s->order = max(slub_max_order, PAGE_ALLOC_COSTLY_ORDER);
- s->flags |= __PAGE_ALLOC_FALLBACK;
- s->allocflags |= __GFP_NOWARN;
- } else
- s->order = calculate_order(size);
-
- if (s->order < 0)
+ if (order < 0)
return 0;
s->allocflags = 0;
- if (s->order)
+ if (order)
s->allocflags |= __GFP_COMP;
if (s->flags & SLAB_CACHE_DMA)
@@ -2258,9 +2311,12 @@ static int calculate_sizes(struct kmem_cache *s)
/*
* Determine the number of objects per slab
*/
- s->objects = (PAGE_SIZE << s->order) / size;
+ s->oo = oo_make(order, size);
+ s->min = oo_make(get_order(size), size);
+ if (oo_objects(s->oo) > oo_objects(s->max))
+ s->max = s->oo;
- return !!s->objects;
+ return !!oo_objects(s->oo);
}
@@ -2276,7 +2332,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
s->align = align;
s->flags = kmem_cache_flags(size, flags, name, ctor);
- if (!calculate_sizes(s))
+ if (!calculate_sizes(s, -1))
goto error;
s->refcount = 1;
@@ -2293,7 +2349,7 @@ error:
if (flags & SLAB_PANIC)
panic("Cannot create slab %s size=%lu realsize=%u "
"order=%u offset=%u flags=%lx\n",
- s->name, (unsigned long)size, s->size, s->order,
+ s->name, (unsigned long)size, s->size, oo_order(s->oo),
s->offset, flags);
return 0;
}
@@ -2339,26 +2395,52 @@ const char *kmem_cache_name(struct kmem_cache *s)
}
EXPORT_SYMBOL(kmem_cache_name);
+static void list_slab_objects(struct kmem_cache *s, struct page *page,
+ const char *text)
+{
+#ifdef CONFIG_SLUB_DEBUG
+ void *addr = page_address(page);
+ void *p;
+ DECLARE_BITMAP(map, page->objects);
+
+ bitmap_zero(map, page->objects);
+ slab_err(s, page, "%s", text);
+ slab_lock(page);
+ for_each_free_object(p, s, page->freelist)
+ set_bit(slab_index(p, s, addr), map);
+
+ for_each_object(p, s, addr, page->objects) {
+
+ if (!test_bit(slab_index(p, s, addr), map)) {
+ printk(KERN_ERR "INFO: Object 0x%p @offset=%tu\n",
+ p, p - addr);
+ print_tracking(s, p);
+ }
+ }
+ slab_unlock(page);
+#endif
+}
+
/*
- * Attempt to free all slabs on a node. Return the number of slabs we
- * were unable to free.
+ * Attempt to free all partial slabs on a node.
*/
-static int free_list(struct kmem_cache *s, struct kmem_cache_node *n,
- struct list_head *list)
+static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
{
- int slabs_inuse = 0;
unsigned long flags;
struct page *page, *h;
spin_lock_irqsave(&n->list_lock, flags);
- list_for_each_entry_safe(page, h, list, lru)
+ list_for_each_entry_safe(page, h, &n->partial, lru) {
if (!page->inuse) {
list_del(&page->lru);
discard_slab(s, page);
- } else
- slabs_inuse++;
+ n->nr_partial--;
+ } else {
+ list_slab_objects(s, page,
+ "Objects remaining on kmem_cache_close()");
+ }
+ }
spin_unlock_irqrestore(&n->list_lock, flags);
- return slabs_inuse;
}
/*
@@ -2375,8 +2457,8 @@ static inline int kmem_cache_close(struct kmem_cache *s)
for_each_node_state(node, N_NORMAL_MEMORY) {
struct kmem_cache_node *n = get_node(s, node);
- n->nr_partial -= free_list(s, n, &n->partial);
- if (atomic_long_read(&n->nr_slabs))
+ free_partial(s, n);
+ if (n->nr_partial || slabs_node(s, node))
return 1;
}
free_kmem_cache_nodes(s);
@@ -2394,8 +2476,11 @@ void kmem_cache_destroy(struct kmem_cache *s)
if (!s->refcount) {
list_del(&s->list);
up_write(&slub_lock);
- if (kmem_cache_close(s))
- WARN_ON(1);
+ if (kmem_cache_close(s)) {
+ printk(KERN_ERR "SLUB %s: %s called for cache that "
+ "still has objects.\n", s->name, __func__);
+ dump_stack();
+ }
sysfs_slab_remove(s);
} else
up_write(&slub_lock);
@@ -2409,10 +2494,6 @@ EXPORT_SYMBOL(kmem_cache_destroy);
struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1] __cacheline_aligned;
EXPORT_SYMBOL(kmalloc_caches);
-#ifdef CONFIG_ZONE_DMA
-static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT + 1];
-#endif
-
static int __init setup_slub_min_order(char *str)
{
get_option(&str, &slub_min_order);
@@ -2458,7 +2539,7 @@ static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s,
down_write(&slub_lock);
if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN,
- flags | __KMALLOC_CACHE, NULL))
+ flags, NULL))
goto panic;
list_add(&s->list, &slab_caches);
@@ -2472,6 +2553,7 @@ panic:
}
#ifdef CONFIG_ZONE_DMA
+static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT + 1];
static void sysfs_add_func(struct work_struct *w)
{
@@ -2688,21 +2770,6 @@ void kfree(const void *x)
}
EXPORT_SYMBOL(kfree);
-#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SLABINFO)
-static unsigned long count_partial(struct kmem_cache_node *n)
-{
- unsigned long flags;
- unsigned long x = 0;
- struct page *page;
-
- spin_lock_irqsave(&n->list_lock, flags);
- list_for_each_entry(page, &n->partial, lru)
- x += page->inuse;
- spin_unlock_irqrestore(&n->list_lock, flags);
- return x;
-}
-#endif
-
/*
* kmem_cache_shrink removes empty slabs from the partial lists and sorts
* the remaining slabs by the number of items in use. The slabs with the
@@ -2720,8 +2787,9 @@ int kmem_cache_shrink(struct kmem_cache *s)
struct kmem_cache_node *n;
struct page *page;
struct page *t;
+ int objects = oo_objects(s->max);
struct list_head *slabs_by_inuse =
- kmalloc(sizeof(struct list_head) * s->objects, GFP_KERNEL);
+ kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL);
unsigned long flags;
if (!slabs_by_inuse)
@@ -2734,7 +2802,7 @@ int kmem_cache_shrink(struct kmem_cache *s)
if (!n->nr_partial)
continue;
- for (i = 0; i < s->objects; i++)
+ for (i = 0; i < objects; i++)
INIT_LIST_HEAD(slabs_by_inuse + i);
spin_lock_irqsave(&n->list_lock, flags);
@@ -2766,7 +2834,7 @@ int kmem_cache_shrink(struct kmem_cache *s)
* Rebuild the partial list with the slabs filled up most
* first and the least used slabs at the end.
*/
- for (i = s->objects - 1; i >= 0; i--)
+ for (i = objects - 1; i >= 0; i--)
list_splice(slabs_by_inuse + i, n->partial.prev);
spin_unlock_irqrestore(&n->list_lock, flags);
@@ -2816,7 +2884,7 @@ static void slab_mem_offline_callback(void *arg)
* and offline_pages() function shoudn't call this
* callback. So, we must fail.
*/
- BUG_ON(atomic_long_read(&n->nr_slabs));
+ BUG_ON(slabs_node(s, offline_node));
s->node[offline_node] = NULL;
kmem_cache_free(kmalloc_caches, n);
@@ -2914,7 +2982,7 @@ void __init kmem_cache_init(void)
kmalloc_caches[0].refcount = -1;
caches++;
- hotplug_memory_notifier(slab_memory_callback, 1);
+ hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
#endif
/* Able to allocate the per node structures */
@@ -2987,9 +3055,6 @@ static int slab_unmergeable(struct kmem_cache *s)
if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE))
return 1;
- if ((s->flags & __PAGE_ALLOC_FALLBACK))
- return 1;
-
if (s->ctor)
return 1;
@@ -3181,6 +3246,37 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
return slab_alloc(s, gfpflags, node, caller);
}
+#if (defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG)) || defined(CONFIG_SLABINFO)
+static unsigned long count_partial(struct kmem_cache_node *n,
+ int (*get_count)(struct page *))
+{
+ unsigned long flags;
+ unsigned long x = 0;
+ struct page *page;
+
+ spin_lock_irqsave(&n->list_lock, flags);
+ list_for_each_entry(page, &n->partial, lru)
+ x += get_count(page);
+ spin_unlock_irqrestore(&n->list_lock, flags);
+ return x;
+}
+
+static int count_inuse(struct page *page)
+{
+ return page->inuse;
+}
+
+static int count_total(struct page *page)
+{
+ return page->objects;
+}
+
+static int count_free(struct page *page)
+{
+ return page->objects - page->inuse;
+}
+#endif
+
#if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG)
static int validate_slab(struct kmem_cache *s, struct page *page,
unsigned long *map)
@@ -3193,7 +3289,7 @@ static int validate_slab(struct kmem_cache *s, struct page *page,
return 0;
/* Now we know that a valid freelist exists */
- bitmap_zero(map, s->objects);
+ bitmap_zero(map, page->objects);
for_each_free_object(p, s, page->freelist) {
set_bit(slab_index(p, s, addr), map);
@@ -3201,7 +3297,7 @@ static int validate_slab(struct kmem_cache *s, struct page *page,
return 0;
}
- for_each_object(p, s, addr)
+ for_each_object(p, s, addr, page->objects)
if (!test_bit(slab_index(p, s, addr), map))
if (!check_object(s, page, p, 1))
return 0;
@@ -3267,7 +3363,7 @@ static long validate_slab_cache(struct kmem_cache *s)
{
int node;
unsigned long count = 0;
- unsigned long *map = kmalloc(BITS_TO_LONGS(s->objects) *
+ unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) *
sizeof(unsigned long), GFP_KERNEL);
if (!map)
@@ -3470,14 +3566,14 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s,
struct page *page, enum track_item alloc)
{
void *addr = page_address(page);
- DECLARE_BITMAP(map, s->objects);
+ DECLARE_BITMAP(map, page->objects);
void *p;
- bitmap_zero(map, s->objects);
+ bitmap_zero(map, page->objects);
for_each_free_object(p, s, page->freelist)
set_bit(slab_index(p, s, addr), map);
- for_each_object(p, s, addr)
+ for_each_object(p, s, addr, page->objects)
if (!test_bit(slab_index(p, s, addr), map))
add_location(t, s, get_track(s, p, alloc));
}
@@ -3526,12 +3622,10 @@ static int list_locations(struct kmem_cache *s, char *buf,
len += sprintf(buf + len, "<not-available>");
if (l->sum_time != l->min_time) {
- unsigned long remainder;
-
len += sprintf(buf + len, " age=%ld/%ld/%ld",
- l->min_time,
- div_long_long_rem(l->sum_time, l->count, &remainder),
- l->max_time);
+ l->min_time,
+ (long)div_u64(l->sum_time, l->count),
+ l->max_time);
} else
len += sprintf(buf + len, " age=%ld",
l->min_time);
@@ -3567,22 +3661,23 @@ static int list_locations(struct kmem_cache *s, char *buf,
}
enum slab_stat_type {
- SL_FULL,
- SL_PARTIAL,
- SL_CPU,
- SL_OBJECTS
+ SL_ALL, /* All slabs */
+ SL_PARTIAL, /* Only partially allocated slabs */
+ SL_CPU, /* Only slabs used for cpu caches */
+ SL_OBJECTS, /* Determine allocated objects not slabs */
+ SL_TOTAL /* Determine object capacity not slabs */
};
-#define SO_FULL (1 << SL_FULL)
+#define SO_ALL (1 << SL_ALL)
#define SO_PARTIAL (1 << SL_PARTIAL)
#define SO_CPU (1 << SL_CPU)
#define SO_OBJECTS (1 << SL_OBJECTS)
+#define SO_TOTAL (1 << SL_TOTAL)
static ssize_t show_slab_objects(struct kmem_cache *s,
char *buf, unsigned long flags)
{
unsigned long total = 0;
- int cpu;
int node;
int x;
unsigned long *nodes;
@@ -3593,56 +3688,60 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
return -ENOMEM;
per_cpu = nodes + nr_node_ids;
- for_each_possible_cpu(cpu) {
- struct page *page;
- struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
+ if (flags & SO_CPU) {
+ int cpu;
- if (!c)
- continue;
+ for_each_possible_cpu(cpu) {
+ struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
- page = c->page;
- node = c->node;
- if (node < 0)
- continue;
- if (page) {
- if (flags & SO_CPU) {
- if (flags & SO_OBJECTS)
- x = page->inuse;
+ if (!c || c->node < 0)
+ continue;
+
+ if (c->page) {
+ if (flags & SO_TOTAL)
+ x = c->page->objects;
+ else if (flags & SO_OBJECTS)
+ x = c->page->inuse;
else
x = 1;
+
total += x;
- nodes[node] += x;
+ nodes[c->node] += x;
}
- per_cpu[node]++;
+ per_cpu[c->node]++;
}
}
- for_each_node_state(node, N_NORMAL_MEMORY) {
- struct kmem_cache_node *n = get_node(s, node);
+ if (flags & SO_ALL) {
+ for_each_node_state(node, N_NORMAL_MEMORY) {
+ struct kmem_cache_node *n = get_node(s, node);
+
+ if (flags & SO_TOTAL)
+ x = atomic_long_read(&n->total_objects);
+ else if (flags & SO_OBJECTS)
+ x = atomic_long_read(&n->total_objects) -
+ count_partial(n, count_free);
- if (flags & SO_PARTIAL) {
- if (flags & SO_OBJECTS)
- x = count_partial(n);
else
- x = n->nr_partial;
+ x = atomic_long_read(&n->nr_slabs);
total += x;
nodes[node] += x;
}
- if (flags & SO_FULL) {
- int full_slabs = atomic_long_read(&n->nr_slabs)
- - per_cpu[node]
- - n->nr_partial;
+ } else if (flags & SO_PARTIAL) {
+ for_each_node_state(node, N_NORMAL_MEMORY) {
+ struct kmem_cache_node *n = get_node(s, node);
- if (flags & SO_OBJECTS)
- x = full_slabs * s->objects;
+ if (flags & SO_TOTAL)
+ x = count_partial(n, count_total);
+ else if (flags & SO_OBJECTS)
+ x = count_partial(n, count_inuse);
else
- x = full_slabs;
+ x = n->nr_partial;
total += x;
nodes[node] += x;
}
}
-
x = sprintf(buf, "%lu", total);
#ifdef CONFIG_NUMA
for_each_node_state(node, N_NORMAL_MEMORY)
@@ -3657,14 +3756,6 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
static int any_slab_objects(struct kmem_cache *s)
{
int node;
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
-
- if (c && c->page)
- return 1;
- }
for_each_online_node(node) {
struct kmem_cache_node *n = get_node(s, node);
@@ -3672,7 +3763,7 @@ static int any_slab_objects(struct kmem_cache *s)
if (!n)
continue;
- if (n->nr_partial || atomic_long_read(&n->nr_slabs))
+ if (atomic_read(&n->total_objects))
return 1;
}
return 0;
@@ -3714,15 +3805,27 @@ SLAB_ATTR_RO(object_size);
static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", s->objects);
+ return sprintf(buf, "%d\n", oo_objects(s->oo));
}
SLAB_ATTR_RO(objs_per_slab);
+static ssize_t order_store(struct kmem_cache *s,
+ const char *buf, size_t length)
+{
+ int order = simple_strtoul(buf, NULL, 10);
+
+ if (order > slub_max_order || order < slub_min_order)
+ return -EINVAL;
+
+ calculate_sizes(s, order);
+ return length;
+}
+
static ssize_t order_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", s->order);
+ return sprintf(buf, "%d\n", oo_order(s->oo));
}
-SLAB_ATTR_RO(order);
+SLAB_ATTR(order);
static ssize_t ctor_show(struct kmem_cache *s, char *buf)
{
@@ -3743,7 +3846,7 @@ SLAB_ATTR_RO(aliases);
static ssize_t slabs_show(struct kmem_cache *s, char *buf)
{
- return show_slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU);
+ return show_slab_objects(s, buf, SO_ALL);
}
SLAB_ATTR_RO(slabs);
@@ -3761,10 +3864,22 @@ SLAB_ATTR_RO(cpu_slabs);
static ssize_t objects_show(struct kmem_cache *s, char *buf)
{
- return show_slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU|SO_OBJECTS);
+ return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS);
}
SLAB_ATTR_RO(objects);
+static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
+{
+ return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS);
+}
+SLAB_ATTR_RO(objects_partial);
+
+static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
+{
+ return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
+}
+SLAB_ATTR_RO(total_objects);
+
static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
{
return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE));
@@ -3844,7 +3959,7 @@ static ssize_t red_zone_store(struct kmem_cache *s,
s->flags &= ~SLAB_RED_ZONE;
if (buf[0] == '1')
s->flags |= SLAB_RED_ZONE;
- calculate_sizes(s);
+ calculate_sizes(s, -1);
return length;
}
SLAB_ATTR(red_zone);
@@ -3863,7 +3978,7 @@ static ssize_t poison_store(struct kmem_cache *s,
s->flags &= ~SLAB_POISON;
if (buf[0] == '1')
s->flags |= SLAB_POISON;
- calculate_sizes(s);
+ calculate_sizes(s, -1);
return length;
}
SLAB_ATTR(poison);
@@ -3882,7 +3997,7 @@ static ssize_t store_user_store(struct kmem_cache *s,
s->flags &= ~SLAB_STORE_USER;
if (buf[0] == '1')
s->flags |= SLAB_STORE_USER;
- calculate_sizes(s);
+ calculate_sizes(s, -1);
return length;
}
SLAB_ATTR(store_user);
@@ -3979,10 +4094,12 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
len = sprintf(buf, "%lu", sum);
+#ifdef CONFIG_SMP
for_each_online_cpu(cpu) {
if (data[cpu] && len < PAGE_SIZE - 20)
- len += sprintf(buf + len, " c%d=%u", cpu, data[cpu]);
+ len += sprintf(buf + len, " C%d=%u", cpu, data[cpu]);
}
+#endif
kfree(data);
return len + sprintf(buf + len, "\n");
}
@@ -4011,7 +4128,7 @@ STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty);
STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
-
+STAT_ATTR(ORDER_FALLBACK, order_fallback);
#endif
static struct attribute *slab_attrs[] = {
@@ -4020,6 +4137,8 @@ static struct attribute *slab_attrs[] = {
&objs_per_slab_attr.attr,
&order_attr.attr,
&objects_attr.attr,
+ &objects_partial_attr.attr,
+ &total_objects_attr.attr,
&slabs_attr.attr,
&partial_attr.attr,
&cpu_slabs_attr.attr,
@@ -4062,6 +4181,7 @@ static struct attribute *slab_attrs[] = {
&deactivate_to_head_attr.attr,
&deactivate_to_tail_attr.attr,
&deactivate_remote_frees_attr.attr,
+ &order_fallback_attr.attr,
#endif
NULL
};
@@ -4348,7 +4468,8 @@ static int s_show(struct seq_file *m, void *p)
unsigned long nr_partials = 0;
unsigned long nr_slabs = 0;
unsigned long nr_inuse = 0;
- unsigned long nr_objs;
+ unsigned long nr_objs = 0;
+ unsigned long nr_free = 0;
struct kmem_cache *s;
int node;
@@ -4362,14 +4483,15 @@ static int s_show(struct seq_file *m, void *p)
nr_partials += n->nr_partial;
nr_slabs += atomic_long_read(&n->nr_slabs);
- nr_inuse += count_partial(n);
+ nr_objs += atomic_long_read(&n->total_objects);
+ nr_free += count_partial(n, count_free);
}
- nr_objs = nr_slabs * s->objects;
- nr_inuse += (nr_slabs - nr_partials) * s->objects;
+ nr_inuse = nr_objs - nr_free;
seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse,
- nr_objs, s->size, s->objects, (1 << s->order));
+ nr_objs, s->size, oo_objects(s->oo),
+ (1 << oo_order(s->oo)));
seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0);
seq_printf(m, " : slabdata %6lu %6lu %6lu", nr_slabs, nr_slabs,
0UL);
diff --git a/mm/sparse.c b/mm/sparse.c
index 98d6b39c347..36511c7b5e2 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -8,6 +8,7 @@
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
+#include "internal.h"
#include <asm/dma.h>
#include <asm/pgalloc.h>
#include <asm/pgtable.h>
@@ -208,12 +209,12 @@ static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long p
}
/*
- * We need this if we ever free the mem_maps. While not implemented yet,
- * this function is included for parity with its sibling.
+ * Decode mem_map from the coded memmap
*/
-static __attribute((unused))
struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum)
{
+ /* mask off the extra low bits of information */
+ coded_mem_map &= SECTION_MAP_MASK;
return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum);
}
@@ -232,7 +233,7 @@ static int __meminit sparse_init_one_section(struct mem_section *ms,
return 1;
}
-static unsigned long usemap_size(void)
+unsigned long usemap_size(void)
{
unsigned long size_bytes;
size_bytes = roundup(SECTION_BLOCKFLAGS_BITS, 8) / 8;
@@ -260,7 +261,7 @@ static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum)
/* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */
nid = 0;
- printk(KERN_WARNING "%s: allocation failed\n", __FUNCTION__);
+ printk(KERN_WARNING "%s: allocation failed\n", __func__);
return NULL;
}
@@ -273,8 +274,8 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
if (map)
return map;
- map = alloc_bootmem_node(NODE_DATA(nid),
- sizeof(struct page) * PAGES_PER_SECTION);
+ map = alloc_bootmem_pages_node(NODE_DATA(nid),
+ PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION));
return map;
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
@@ -290,11 +291,14 @@ struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
return map;
printk(KERN_ERR "%s: sparsemem memory map backing failed "
- "some memory will not be available.\n", __FUNCTION__);
+ "some memory will not be available.\n", __func__);
ms->section_mem_map = 0;
return NULL;
}
+void __attribute__((weak)) __meminit vmemmap_populate_print_last(void)
+{
+}
/*
* Allocate the accumulated non-linear sections, allocate a mem_map
* for each and record the physical to section mapping.
@@ -304,22 +308,50 @@ void __init sparse_init(void)
unsigned long pnum;
struct page *map;
unsigned long *usemap;
+ unsigned long **usemap_map;
+ int size;
+
+ /*
+ * map is using big page (aka 2M in x86 64 bit)
+ * usemap is less one page (aka 24 bytes)
+ * so alloc 2M (with 2M align) and 24 bytes in turn will
+ * make next 2M slip to one more 2M later.
+ * then in big system, the memory will have a lot of holes...
+ * here try to allocate 2M pages continously.
+ *
+ * powerpc need to call sparse_init_one_section right after each
+ * sparse_early_mem_map_alloc, so allocate usemap_map at first.
+ */
+ size = sizeof(unsigned long *) * NR_MEM_SECTIONS;
+ usemap_map = alloc_bootmem(size);
+ if (!usemap_map)
+ panic("can not allocate usemap_map\n");
for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
if (!present_section_nr(pnum))
continue;
+ usemap_map[pnum] = sparse_early_usemap_alloc(pnum);
+ }
- map = sparse_early_mem_map_alloc(pnum);
- if (!map)
+ for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
+ if (!present_section_nr(pnum))
continue;
- usemap = sparse_early_usemap_alloc(pnum);
+ usemap = usemap_map[pnum];
if (!usemap)
continue;
+ map = sparse_early_mem_map_alloc(pnum);
+ if (!map)
+ continue;
+
sparse_init_one_section(__nr_to_section(pnum), pnum, map,
usemap);
}
+
+ vmemmap_populate_print_last();
+
+ free_bootmem(__pa(usemap_map), size);
}
#ifdef CONFIG_MEMORY_HOTPLUG
@@ -334,6 +366,9 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
{
return; /* XXX: Not implemented yet */
}
+static void free_map_bootmem(struct page *page, unsigned long nr_pages)
+{
+}
#else
static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
{
@@ -371,8 +406,69 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
free_pages((unsigned long)memmap,
get_order(sizeof(struct page) * nr_pages));
}
+
+static void free_map_bootmem(struct page *page, unsigned long nr_pages)
+{
+ unsigned long maps_section_nr, removing_section_nr, i;
+ int magic;
+
+ for (i = 0; i < nr_pages; i++, page++) {
+ magic = atomic_read(&page->_mapcount);
+
+ BUG_ON(magic == NODE_INFO);
+
+ maps_section_nr = pfn_to_section_nr(page_to_pfn(page));
+ removing_section_nr = page->private;
+
+ /*
+ * When this function is called, the removing section is
+ * logical offlined state. This means all pages are isolated
+ * from page allocator. If removing section's memmap is placed
+ * on the same section, it must not be freed.
+ * If it is freed, page allocator may allocate it which will
+ * be removed physically soon.
+ */
+ if (maps_section_nr != removing_section_nr)
+ put_page_bootmem(page);
+ }
+}
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
+static void free_section_usemap(struct page *memmap, unsigned long *usemap)
+{
+ struct page *usemap_page;
+ unsigned long nr_pages;
+
+ if (!usemap)
+ return;
+
+ usemap_page = virt_to_page(usemap);
+ /*
+ * Check to see if allocation came from hot-plug-add
+ */
+ if (PageSlab(usemap_page)) {
+ kfree(usemap);
+ if (memmap)
+ __kfree_section_memmap(memmap, PAGES_PER_SECTION);
+ return;
+ }
+
+ /*
+ * The usemap came from bootmem. This is packed with other usemaps
+ * on the section which has pgdat at boot time. Just keep it as is now.
+ */
+
+ if (memmap) {
+ struct page *memmap_page;
+ memmap_page = virt_to_page(memmap);
+
+ nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
+ >> PAGE_SHIFT;
+
+ free_map_bootmem(memmap_page, nr_pages);
+ }
+}
+
/*
* returns the number of sections whose mem_maps were properly
* set. If this is <=0, then that means that the passed-in
@@ -425,4 +521,20 @@ out:
}
return ret;
}
+
+void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
+{
+ struct page *memmap = NULL;
+ unsigned long *usemap = NULL;
+
+ if (ms->section_mem_map) {
+ usemap = ms->pageblock_flags;
+ memmap = sparse_decode_mem_map(ms->section_mem_map,
+ __section_nr(ms));
+ ms->section_mem_map = 0;
+ ms->pageblock_flags = NULL;
+ }
+
+ free_section_usemap(memmap, usemap);
+}
#endif
diff --git a/mm/swap.c b/mm/swap.c
index aa1139ccf3a..91e194445a5 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -132,34 +132,21 @@ static void pagevec_move_tail(struct pagevec *pvec)
* Writeback is about to end against a page which has been marked for immediate
* reclaim. If it still appears to be reclaimable, move it to the tail of the
* inactive list.
- *
- * Returns zero if it cleared PG_writeback.
*/
-int rotate_reclaimable_page(struct page *page)
+void rotate_reclaimable_page(struct page *page)
{
- struct pagevec *pvec;
- unsigned long flags;
-
- if (PageLocked(page))
- return 1;
- if (PageDirty(page))
- return 1;
- if (PageActive(page))
- return 1;
- if (!PageLRU(page))
- return 1;
-
- page_cache_get(page);
- local_irq_save(flags);
- pvec = &__get_cpu_var(lru_rotate_pvecs);
- if (!pagevec_add(pvec, page))
- pagevec_move_tail(pvec);
- local_irq_restore(flags);
-
- if (!test_clear_page_writeback(page))
- BUG();
+ if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
+ PageLRU(page)) {
+ struct pagevec *pvec;
+ unsigned long flags;
- return 0;
+ page_cache_get(page);
+ local_irq_save(flags);
+ pvec = &__get_cpu_var(lru_rotate_pvecs);
+ if (!pagevec_add(pvec, page))
+ pagevec_move_tail(pvec);
+ local_irq_restore(flags);
+ }
}
/*
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 50757ee3f9f..d8aadaf2a0b 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -33,7 +33,7 @@ static const struct address_space_operations swap_aops = {
};
static struct backing_dev_info swap_backing_dev_info = {
- .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
+ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
.unplug_io_fn = swap_unplug_io_fn,
};
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 2da149cfc9a..bd1bb592030 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1426,11 +1426,7 @@ static const struct file_operations proc_swaps_operations = {
static int __init procswaps_init(void)
{
- struct proc_dir_entry *entry;
-
- entry = create_proc_entry("swaps", 0, NULL);
- if (entry)
- entry->proc_fops = &proc_swaps_operations;
+ proc_create("swaps", 0, NULL, &proc_swaps_operations);
return 0;
}
__initcall(procswaps_init);
@@ -1582,6 +1578,14 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
error = -EINVAL;
goto bad_swap;
case 2:
+ /* swap partition endianess hack... */
+ if (swab32(swap_header->info.version) == 1) {
+ swab32s(&swap_header->info.version);
+ swab32s(&swap_header->info.last_page);
+ swab32s(&swap_header->info.nr_badpages);
+ for (i = 0; i < swap_header->info.nr_badpages; i++)
+ swab32s(&swap_header->info.badpages[i]);
+ }
/* Check the swap header's sub-version and the size of
the swap file and bad block lists */
if (swap_header->info.version != 1) {
diff --git a/mm/truncate.c b/mm/truncate.c
index 7d20ce41ecf..b8961cb6341 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -391,6 +391,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
pgoff_t next;
int i;
int ret = 0;
+ int ret2 = 0;
int did_range_unmap = 0;
int wrapped = 0;
@@ -438,9 +439,13 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
}
}
BUG_ON(page_mapped(page));
- ret = do_launder_page(mapping, page);
- if (ret == 0 && !invalidate_complete_page2(mapping, page))
- ret = -EIO;
+ ret2 = do_launder_page(mapping, page);
+ if (ret2 == 0) {
+ if (!invalidate_complete_page2(mapping, page))
+ ret2 = -EIO;
+ }
+ if (ret2 < 0)
+ ret = ret2;
unlock_page(page);
}
pagevec_release(&pvec);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ecf91f8034b..6e45b0f3d12 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -14,8 +14,10 @@
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
-
+#include <linux/seq_file.h>
+#include <linux/debugobjects.h>
#include <linux/vmalloc.h>
+#include <linux/kallsyms.h>
#include <asm/uaccess.h>
#include <asm/tlbflush.h>
@@ -25,7 +27,7 @@ DEFINE_RWLOCK(vmlist_lock);
struct vm_struct *vmlist;
static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
- int node);
+ int node, void *caller);
static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
{
@@ -204,9 +206,9 @@ unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
}
EXPORT_SYMBOL(vmalloc_to_pfn);
-static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags,
- unsigned long start, unsigned long end,
- int node, gfp_t gfp_mask)
+static struct vm_struct *
+__get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start,
+ unsigned long end, int node, gfp_t gfp_mask, void *caller)
{
struct vm_struct **p, *tmp, *area;
unsigned long align = 1;
@@ -269,6 +271,7 @@ found:
area->pages = NULL;
area->nr_pages = 0;
area->phys_addr = 0;
+ area->caller = caller;
write_unlock(&vmlist_lock);
return area;
@@ -284,7 +287,8 @@ out:
struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
unsigned long start, unsigned long end)
{
- return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL);
+ return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL,
+ __builtin_return_address(0));
}
EXPORT_SYMBOL_GPL(__get_vm_area);
@@ -299,14 +303,22 @@ EXPORT_SYMBOL_GPL(__get_vm_area);
*/
struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
{
- return __get_vm_area(size, flags, VMALLOC_START, VMALLOC_END);
+ return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END,
+ -1, GFP_KERNEL, __builtin_return_address(0));
+}
+
+struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
+ void *caller)
+{
+ return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END,
+ -1, GFP_KERNEL, caller);
}
struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
int node, gfp_t gfp_mask)
{
return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node,
- gfp_mask);
+ gfp_mask, __builtin_return_address(0));
}
/* Caller must hold vmlist_lock */
@@ -383,6 +395,7 @@ static void __vunmap(const void *addr, int deallocate_pages)
}
debug_check_no_locks_freed(addr, area->size);
+ debug_check_no_obj_freed(addr, area->size);
if (deallocate_pages) {
int i;
@@ -455,9 +468,11 @@ void *vmap(struct page **pages, unsigned int count,
if (count > num_physpages)
return NULL;
- area = get_vm_area((count << PAGE_SHIFT), flags);
+ area = get_vm_area_caller((count << PAGE_SHIFT), flags,
+ __builtin_return_address(0));
if (!area)
return NULL;
+
if (map_vm_area(area, prot, &pages)) {
vunmap(area->addr);
return NULL;
@@ -468,7 +483,7 @@ void *vmap(struct page **pages, unsigned int count,
EXPORT_SYMBOL(vmap);
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
- pgprot_t prot, int node)
+ pgprot_t prot, int node, void *caller)
{
struct page **pages;
unsigned int nr_pages, array_size, i;
@@ -480,7 +495,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
/* Please note that the recursion is strictly bounded. */
if (array_size > PAGE_SIZE) {
pages = __vmalloc_node(array_size, gfp_mask | __GFP_ZERO,
- PAGE_KERNEL, node);
+ PAGE_KERNEL, node, caller);
area->flags |= VM_VPAGES;
} else {
pages = kmalloc_node(array_size,
@@ -488,6 +503,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
node);
}
area->pages = pages;
+ area->caller = caller;
if (!area->pages) {
remove_vm_area(area->addr);
kfree(area);
@@ -521,7 +537,8 @@ fail:
void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
{
- return __vmalloc_area_node(area, gfp_mask, prot, -1);
+ return __vmalloc_area_node(area, gfp_mask, prot, -1,
+ __builtin_return_address(0));
}
/**
@@ -530,13 +547,14 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
* @gfp_mask: flags for the page level allocator
* @prot: protection mask for the allocated pages
* @node: node to use for allocation or -1
+ * @caller: caller's return address
*
* Allocate enough pages to cover @size from the page level
* allocator with @gfp_mask flags. Map them into contiguous
* kernel virtual space, using a pagetable protection of @prot.
*/
static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
- int node)
+ int node, void *caller)
{
struct vm_struct *area;
@@ -544,16 +562,19 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
if (!size || (size >> PAGE_SHIFT) > num_physpages)
return NULL;
- area = get_vm_area_node(size, VM_ALLOC, node, gfp_mask);
+ area = __get_vm_area_node(size, VM_ALLOC, VMALLOC_START, VMALLOC_END,
+ node, gfp_mask, caller);
+
if (!area)
return NULL;
- return __vmalloc_area_node(area, gfp_mask, prot, node);
+ return __vmalloc_area_node(area, gfp_mask, prot, node, caller);
}
void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
{
- return __vmalloc_node(size, gfp_mask, prot, -1);
+ return __vmalloc_node(size, gfp_mask, prot, -1,
+ __builtin_return_address(0));
}
EXPORT_SYMBOL(__vmalloc);
@@ -568,7 +589,8 @@ EXPORT_SYMBOL(__vmalloc);
*/
void *vmalloc(unsigned long size)
{
- return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
+ return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
+ -1, __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc);
@@ -608,7 +630,8 @@ EXPORT_SYMBOL(vmalloc_user);
*/
void *vmalloc_node(unsigned long size, int node)
{
- return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node);
+ return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
+ node, __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_node);
@@ -843,7 +866,8 @@ struct vm_struct *alloc_vm_area(size_t size)
{
struct vm_struct *area;
- area = get_vm_area(size, VM_IOREMAP);
+ area = get_vm_area_caller(size, VM_IOREMAP,
+ __builtin_return_address(0));
if (area == NULL)
return NULL;
@@ -873,3 +897,85 @@ void free_vm_area(struct vm_struct *area)
kfree(area);
}
EXPORT_SYMBOL_GPL(free_vm_area);
+
+
+#ifdef CONFIG_PROC_FS
+static void *s_start(struct seq_file *m, loff_t *pos)
+{
+ loff_t n = *pos;
+ struct vm_struct *v;
+
+ read_lock(&vmlist_lock);
+ v = vmlist;
+ while (n > 0 && v) {
+ n--;
+ v = v->next;
+ }
+ if (!n)
+ return v;
+
+ return NULL;
+
+}
+
+static void *s_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ struct vm_struct *v = p;
+
+ ++*pos;
+ return v->next;
+}
+
+static void s_stop(struct seq_file *m, void *p)
+{
+ read_unlock(&vmlist_lock);
+}
+
+static int s_show(struct seq_file *m, void *p)
+{
+ struct vm_struct *v = p;
+
+ seq_printf(m, "0x%p-0x%p %7ld",
+ v->addr, v->addr + v->size, v->size);
+
+ if (v->caller) {
+ char buff[2 * KSYM_NAME_LEN];
+
+ seq_putc(m, ' ');
+ sprint_symbol(buff, (unsigned long)v->caller);
+ seq_puts(m, buff);
+ }
+
+ if (v->nr_pages)
+ seq_printf(m, " pages=%d", v->nr_pages);
+
+ if (v->phys_addr)
+ seq_printf(m, " phys=%lx", v->phys_addr);
+
+ if (v->flags & VM_IOREMAP)
+ seq_printf(m, " ioremap");
+
+ if (v->flags & VM_ALLOC)
+ seq_printf(m, " vmalloc");
+
+ if (v->flags & VM_MAP)
+ seq_printf(m, " vmap");
+
+ if (v->flags & VM_USERMAP)
+ seq_printf(m, " user");
+
+ if (v->flags & VM_VPAGES)
+ seq_printf(m, " vpages");
+
+ seq_putc(m, '\n');
+ return 0;
+}
+
+const struct seq_operations vmalloc_op = {
+ .start = s_start,
+ .next = s_next,
+ .stop = s_stop,
+ .show = s_show,
+};
+#endif
+
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4046434046e..9a29901ad3b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -191,7 +191,7 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
shrinker->nr += delta;
if (shrinker->nr < 0) {
printk(KERN_ERR "%s: nr=%ld\n",
- __FUNCTION__, shrinker->nr);
+ __func__, shrinker->nr);
shrinker->nr = max_pass;
}
@@ -339,7 +339,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
if (PagePrivate(page)) {
if (try_to_free_buffers(page)) {
ClearPageDirty(page);
- printk("%s: orphaned page\n", __FUNCTION__);
+ printk("%s: orphaned page\n", __func__);
return PAGE_CLEAN;
}
}
@@ -1246,17 +1246,16 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
* If a zone is deemed to be full of pinned pages then just give it a light
* scan then give up on it.
*/
-static unsigned long shrink_zones(int priority, struct zone **zones,
+static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
struct scan_control *sc)
{
+ enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
unsigned long nr_reclaimed = 0;
- int i;
-
+ struct zoneref *z;
+ struct zone *zone;
sc->all_unreclaimable = 1;
- for (i = 0; zones[i] != NULL; i++) {
- struct zone *zone = zones[i];
-
+ for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
if (!populated_zone(zone))
continue;
/*
@@ -1300,9 +1299,12 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
* hope that some of these pages can be written. But if the allocating task
* holds filesystem locks which prevent writeout this might not work, and the
* allocation attempt will fail.
+ *
+ * returns: 0, if no pages reclaimed
+ * else, the number of pages reclaimed
*/
-static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask,
- struct scan_control *sc)
+static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
+ struct scan_control *sc)
{
int priority;
int ret = 0;
@@ -1310,7 +1312,9 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask,
unsigned long nr_reclaimed = 0;
struct reclaim_state *reclaim_state = current->reclaim_state;
unsigned long lru_pages = 0;
- int i;
+ struct zoneref *z;
+ struct zone *zone;
+ enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
if (scan_global_lru(sc))
count_vm_event(ALLOCSTALL);
@@ -1318,8 +1322,7 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask,
* mem_cgroup will not do shrink_slab.
*/
if (scan_global_lru(sc)) {
- for (i = 0; zones[i] != NULL; i++) {
- struct zone *zone = zones[i];
+ for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
@@ -1333,13 +1336,13 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask,
sc->nr_scanned = 0;
if (!priority)
disable_swap_token();
- nr_reclaimed += shrink_zones(priority, zones, sc);
+ nr_reclaimed += shrink_zones(priority, zonelist, sc);
/*
* Don't shrink slabs when reclaiming memory from
* over limit cgroups
*/
if (scan_global_lru(sc)) {
- shrink_slab(sc->nr_scanned, gfp_mask, lru_pages);
+ shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
if (reclaim_state) {
nr_reclaimed += reclaim_state->reclaimed_slab;
reclaim_state->reclaimed_slab = 0;
@@ -1347,7 +1350,7 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask,
}
total_scanned += sc->nr_scanned;
if (nr_reclaimed >= sc->swap_cluster_max) {
- ret = 1;
+ ret = nr_reclaimed;
goto out;
}
@@ -1370,7 +1373,7 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask,
}
/* top priority shrink_caches still had more to do? don't OOM, then */
if (!sc->all_unreclaimable && scan_global_lru(sc))
- ret = 1;
+ ret = nr_reclaimed;
out:
/*
* Now that we've scanned all the zones at this priority level, note
@@ -1383,8 +1386,7 @@ out:
priority = 0;
if (scan_global_lru(sc)) {
- for (i = 0; zones[i] != NULL; i++) {
- struct zone *zone = zones[i];
+ for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
@@ -1397,7 +1399,8 @@ out:
return ret;
}
-unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
+unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
+ gfp_t gfp_mask)
{
struct scan_control sc = {
.gfp_mask = gfp_mask,
@@ -1410,7 +1413,7 @@ unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
.isolate_pages = isolate_pages_global,
};
- return do_try_to_free_pages(zones, gfp_mask, &sc);
+ return do_try_to_free_pages(zonelist, &sc);
}
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
@@ -1419,7 +1422,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
gfp_t gfp_mask)
{
struct scan_control sc = {
- .gfp_mask = gfp_mask,
.may_writepage = !laptop_mode,
.may_swap = 1,
.swap_cluster_max = SWAP_CLUSTER_MAX,
@@ -1428,13 +1430,12 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
.mem_cgroup = mem_cont,
.isolate_pages = mem_cgroup_isolate_pages,
};
- struct zone **zones;
- int target_zone = gfp_zone(GFP_HIGHUSER_MOVABLE);
+ struct zonelist *zonelist;
- zones = NODE_DATA(numa_node_id())->node_zonelists[target_zone].zones;
- if (do_try_to_free_pages(zones, sc.gfp_mask, &sc))
- return 1;
- return 0;
+ sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
+ (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
+ zonelist = NODE_DATA(numa_node_id())->node_zonelists;
+ return do_try_to_free_pages(zonelist, &sc);
}
#endif
@@ -1647,11 +1648,10 @@ static int kswapd(void *p)
struct reclaim_state reclaim_state = {
.reclaimed_slab = 0,
};
- cpumask_t cpumask;
+ node_to_cpumask_ptr(cpumask, pgdat->node_id);
- cpumask = node_to_cpumask(pgdat->node_id);
- if (!cpus_empty(cpumask))
- set_cpus_allowed(tsk, cpumask);
+ if (!cpus_empty(*cpumask))
+ set_cpus_allowed_ptr(tsk, cpumask);
current->reclaim_state = &reclaim_state;
/*
@@ -1880,17 +1880,16 @@ out:
static int __devinit cpu_callback(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
- pg_data_t *pgdat;
- cpumask_t mask;
int nid;
if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
for_each_node_state(nid, N_HIGH_MEMORY) {
- pgdat = NODE_DATA(nid);
- mask = node_to_cpumask(pgdat->node_id);
- if (any_online_cpu(mask) != NR_CPUS)
+ pg_data_t *pgdat = NODE_DATA(nid);
+ node_to_cpumask_ptr(mask, pgdat->node_id);
+
+ if (any_online_cpu(*mask) < nr_cpu_ids)
/* One of our CPUs online: restore mask */
- set_cpus_allowed(pgdat->kswapd, mask);
+ set_cpus_allowed_ptr(pgdat->kswapd, mask);
}
}
return NOTIFY_OK;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 7c7286e9506..1a32130b958 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -322,6 +322,7 @@ void refresh_cpu_vm_stats(int cpu)
p->expire = 3;
#endif
}
+ cond_resched();
#ifdef CONFIG_NUMA
/*
* Deal with draining the remote pageset of this
@@ -364,13 +365,13 @@ void refresh_cpu_vm_stats(int cpu)
*
* Must be called with interrupts disabled.
*/
-void zone_statistics(struct zonelist *zonelist, struct zone *z)
+void zone_statistics(struct zone *preferred_zone, struct zone *z)
{
- if (z->zone_pgdat == zonelist->zones[0]->zone_pgdat) {
+ if (z->zone_pgdat == preferred_zone->zone_pgdat) {
__inc_zone_state(z, NUMA_HIT);
} else {
__inc_zone_state(z, NUMA_MISS);
- __inc_zone_state(zonelist->zones[0], NUMA_FOREIGN);
+ __inc_zone_state(preferred_zone, NUMA_FOREIGN);
}
if (z->node == numa_node_id())
__inc_zone_state(z, NUMA_LOCAL);
@@ -547,6 +548,10 @@ static int pagetypeinfo_show(struct seq_file *m, void *arg)
{
pg_data_t *pgdat = (pg_data_t *)arg;
+ /* check memoryless node */
+ if (!node_state(pgdat->node_id, N_HIGH_MEMORY))
+ return 0;
+
seq_printf(m, "Page block order: %d\n", pageblock_order);
seq_printf(m, "Pages per block: %lu\n", pageblock_nr_pages);
seq_putc(m, '\n');
@@ -607,6 +612,7 @@ static const char * const vmstat_text[] = {
"nr_unstable",
"nr_bounce",
"nr_vmscan_write",
+ "nr_writeback_temp",
#ifdef CONFIG_NUMA
"numa_hit",
@@ -645,6 +651,10 @@ static const char * const vmstat_text[] = {
"allocstall",
"pgrotated",
+#ifdef CONFIG_HUGETLB_PAGE
+ "htlb_buddy_alloc_success",
+ "htlb_buddy_alloc_fail",
+#endif
#endif
};