Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |   11
-rw-r--r--  mm/Makefile           |    7
-rw-r--r--  mm/allocpercpu.c      |    9
-rw-r--r--  mm/backing-dev.c      |   69
-rw-r--r--  mm/bootmem.c          |    6
-rw-r--r--  mm/bounce.c           |  302
-rw-r--r--  mm/fadvise.c          |    2
-rw-r--r--  mm/filemap.c          |  318
-rw-r--r--  mm/filemap.h          |    1
-rw-r--r--  mm/filemap_xip.c      |    2
-rw-r--r--  mm/fremap.c           |    4
-rw-r--r--  mm/highmem.c          |  281
-rw-r--r--  mm/hugetlb.c          |   55
-rw-r--r--  mm/memory.c           |  183
-rw-r--r--  mm/memory_hotplug.c   |   72
-rw-r--r--  mm/mempolicy.c        |   19
-rw-r--r--  mm/migrate.c          |   26
-rw-r--r--  mm/mlock.c            |    2
-rw-r--r--  mm/mmap.c             |   38
-rw-r--r--  mm/mmzone.c           |    5
-rw-r--r--  mm/mprotect.c         |    2
-rw-r--r--  mm/mremap.c           |    2
-rw-r--r--  mm/nommu.c            |  280
-rw-r--r--  mm/oom_kill.c         |   95
-rw-r--r--  mm/page-writeback.c   |  268
-rw-r--r--  mm/page_alloc.c       | 1156
-rw-r--r--  mm/page_io.c          |   45
-rw-r--r--  mm/pdflush.c          |    1
-rw-r--r--  mm/readahead.c        |   15
-rw-r--r--  mm/rmap.c             |   41
-rw-r--r--  mm/shmem.c            |  231
-rw-r--r--  mm/shmem_acl.c        |  197
-rw-r--r--  mm/slab.c             |  546
-rw-r--r--  mm/slob.c             |    3
-rw-r--r--  mm/sparse.c           |   25
-rw-r--r--  mm/swap.c             |   10
-rw-r--r--  mm/swapfile.c         |  103
-rw-r--r--  mm/thrash.c           |  116
-rw-r--r--  mm/tiny-shmem.c       |    4
-rw-r--r--  mm/truncate.c         |   89
-rw-r--r--  mm/util.c             |   22
-rw-r--r--  mm/vmalloc.c          |   90
-rw-r--r--  mm/vmscan.c           |  118
-rw-r--r--  mm/vmstat.c           |   28
44 files changed, 3533 insertions, 1366 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 8f5b45615f7..db7c55de92c 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -92,7 +92,7 @@ config HAVE_MEMORY_PRESENT
#
# SPARSEMEM_EXTREME (which is the default) does some bootmem
-# allocations when memory_present() is called. If this can not
+# allocations when memory_present() is called. If this cannot
# be done on your architecture, select this option. However,
# statically allocating the mem_section[] array can potentially
# consume vast quantities of .bss, so be careful.
@@ -104,7 +104,7 @@ config SPARSEMEM_STATIC
def_bool n
#
-# Architectecture platforms which require a two level mem_section in SPARSEMEM
+# Architecture platforms which require a two level mem_section in SPARSEMEM
# must select this option. This is usually for architecture platforms with
# an extremely sparse physical address space.
#
@@ -115,12 +115,17 @@ config SPARSEMEM_EXTREME
# eventually, we can have this option just 'select SPARSEMEM'
config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
- depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND && ARCH_ENABLE_MEMORY_HOTPLUG
+ depends on SPARSEMEM || X86_64_ACPI_NUMA
+ depends on HOTPLUG && !SOFTWARE_SUSPEND && ARCH_ENABLE_MEMORY_HOTPLUG
depends on (IA64 || X86 || PPC64)
comment "Memory hotplug is currently incompatible with Software Suspend"
depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND
+config MEMORY_HOTPLUG_SPARSE
+ def_bool y
+ depends on SPARSEMEM && MEMORY_HOTPLUG
+
# Heavily threaded applications may benefit from splitting the mm-wide
# page_table_lock, so that faults on different parts of the user address
# space can be handled with less contention: split it at this NR_CPUS.
diff --git a/mm/Makefile b/mm/Makefile
index 60c56c0b5e1..f3c077eb0b8 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -10,13 +10,18 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
page_alloc.o page-writeback.o pdflush.o \
readahead.o swap.o truncate.o vmscan.o \
- prio_tree.o util.o mmzone.o vmstat.o $(mmu-y)
+ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
+ $(mmu-y)
+ifeq ($(CONFIG_MMU)$(CONFIG_BLOCK),yy)
+obj-y += bounce.o
+endif
obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
obj-$(CONFIG_NUMA) += mempolicy.o
obj-$(CONFIG_SPARSEMEM) += sparse.o
obj-$(CONFIG_SHMEM) += shmem.o
+obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
obj-$(CONFIG_SLOB) += slob.o
obj-$(CONFIG_SLAB) += slab.o
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index eaa9abeea53..b2486cf887a 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -17,10 +17,9 @@
void percpu_depopulate(void *__pdata, int cpu)
{
struct percpu_data *pdata = __percpu_disguise(__pdata);
- if (pdata->ptrs[cpu]) {
- kfree(pdata->ptrs[cpu]);
- pdata->ptrs[cpu] = NULL;
- }
+
+ kfree(pdata->ptrs[cpu]);
+ pdata->ptrs[cpu] = NULL;
}
EXPORT_SYMBOL_GPL(percpu_depopulate);
@@ -123,6 +122,8 @@ EXPORT_SYMBOL_GPL(__percpu_alloc_mask);
*/
void percpu_free(void *__pdata)
{
+ if (unlikely(!__pdata))
+ return;
__percpu_depopulate_mask(__pdata, &cpu_possible_map);
kfree(__percpu_disguise(__pdata));
}
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
new file mode 100644
index 00000000000..f50a2811f9d
--- /dev/null
+++ b/mm/backing-dev.c
@@ -0,0 +1,69 @@
+
+#include <linux/wait.h>
+#include <linux/backing-dev.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+
+static wait_queue_head_t congestion_wqh[2] = {
+ __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
+ __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
+ };
+
+
+void clear_bdi_congested(struct backing_dev_info *bdi, int rw)
+{
+ enum bdi_state bit;
+ wait_queue_head_t *wqh = &congestion_wqh[rw];
+
+ bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
+ clear_bit(bit, &bdi->state);
+ smp_mb__after_clear_bit();
+ if (waitqueue_active(wqh))
+ wake_up(wqh);
+}
+EXPORT_SYMBOL(clear_bdi_congested);
+
+void set_bdi_congested(struct backing_dev_info *bdi, int rw)
+{
+ enum bdi_state bit;
+
+ bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
+ set_bit(bit, &bdi->state);
+}
+EXPORT_SYMBOL(set_bdi_congested);
+
+/**
+ * congestion_wait - wait for a backing_dev to become uncongested
+ * @rw: READ or WRITE
+ * @timeout: timeout in jiffies
+ *
+ * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit
+ * write congestion. If no backing_devs are congested then just wait for the
+ * next write to be completed.
+ */
+long congestion_wait(int rw, long timeout)
+{
+ long ret;
+ DEFINE_WAIT(wait);
+ wait_queue_head_t *wqh = &congestion_wqh[rw];
+
+ prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+ ret = io_schedule_timeout(timeout);
+ finish_wait(wqh, &wait);
+ return ret;
+}
+EXPORT_SYMBOL(congestion_wait);
+
+/**
+ * congestion_end - wake up sleepers on a congested backing_dev_info
+ * @rw: READ or WRITE
+ */
+void congestion_end(int rw)
+{
+ wait_queue_head_t *wqh = &congestion_wqh[rw];
+
+ if (waitqueue_active(wqh))
+ wake_up(wqh);
+}
+EXPORT_SYMBOL(congestion_end);
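For context, a minimal caller sketch (not part of the patch itself) of the congestion_wait() interface introduced above: the throttle_writeout() wrapper name and the HZ/10 back-off interval are illustrative assumptions, and the declaration is assumed to be exported via <linux/backing-dev.h>.

#include <linux/backing-dev.h>

/* Sketch: back off while any backing device is write-congested. */
static void throttle_writeout(void)
{
	/* sleep up to HZ/10 jiffies, or until a write completes / congestion clears */
	congestion_wait(WRITE, HZ/10);
}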
diff --git a/mm/bootmem.c b/mm/bootmem.c
index d53112fcb40..00a96970b23 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -27,8 +27,6 @@ unsigned long max_low_pfn;
unsigned long min_low_pfn;
unsigned long max_pfn;
-EXPORT_UNUSED_SYMBOL(max_pfn); /* June 2006 */
-
static LIST_HEAD(bdata_list);
#ifdef CONFIG_CRASH_DUMP
/*
@@ -196,6 +194,10 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
if (limit && bdata->node_boot_start >= limit)
return NULL;
+ /* on nodes without memory - bootmem_map is NULL */
+ if (!bdata->node_bootmem_map)
+ return NULL;
+
end_pfn = bdata->node_low_pfn;
limit = PFN_DOWN(limit);
if (limit && end_pfn > limit)
diff --git a/mm/bounce.c b/mm/bounce.c
new file mode 100644
index 00000000000..e4b62d2a402
--- /dev/null
+++ b/mm/bounce.c
@@ -0,0 +1,302 @@
+/* bounce buffer handling for block devices
+ *
+ * - Split from highmem.c
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/swap.h>
+#include <linux/bio.h>
+#include <linux/pagemap.h>
+#include <linux/mempool.h>
+#include <linux/blkdev.h>
+#include <linux/init.h>
+#include <linux/hash.h>
+#include <linux/highmem.h>
+#include <linux/blktrace_api.h>
+#include <asm/tlbflush.h>
+
+#define POOL_SIZE 64
+#define ISA_POOL_SIZE 16
+
+static mempool_t *page_pool, *isa_page_pool;
+
+#ifdef CONFIG_HIGHMEM
+static __init int init_emergency_pool(void)
+{
+ struct sysinfo i;
+ si_meminfo(&i);
+ si_swapinfo(&i);
+
+ if (!i.totalhigh)
+ return 0;
+
+ page_pool = mempool_create_page_pool(POOL_SIZE, 0);
+ BUG_ON(!page_pool);
+ printk("highmem bounce pool size: %d pages\n", POOL_SIZE);
+
+ return 0;
+}
+
+__initcall(init_emergency_pool);
+
+/*
+ * highmem version, map in to vec
+ */
+static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
+{
+ unsigned long flags;
+ unsigned char *vto;
+
+ local_irq_save(flags);
+ vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ);
+ memcpy(vto + to->bv_offset, vfrom, to->bv_len);
+ kunmap_atomic(vto, KM_BOUNCE_READ);
+ local_irq_restore(flags);
+}
+
+#else /* CONFIG_HIGHMEM */
+
+#define bounce_copy_vec(to, vfrom) \
+ memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len)
+
+#endif /* CONFIG_HIGHMEM */
+
+/*
+ * allocate pages in the DMA region for the ISA pool
+ */
+static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data)
+{
+ return mempool_alloc_pages(gfp_mask | GFP_DMA, data);
+}
+
+/*
+ * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA
+ * as the max address, so check if the pool has already been created.
+ */
+int init_emergency_isa_pool(void)
+{
+ if (isa_page_pool)
+ return 0;
+
+ isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa,
+ mempool_free_pages, (void *) 0);
+ BUG_ON(!isa_page_pool);
+
+ printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE);
+ return 0;
+}
+
+/*
+ * Simple bounce buffer support for highmem pages. Depending on the
+ * queue gfp mask set, *to may or may not be a highmem page. kmap it
+ * always, it will do the Right Thing
+ */
+static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
+{
+ unsigned char *vfrom;
+ struct bio_vec *tovec, *fromvec;
+ int i;
+
+ __bio_for_each_segment(tovec, to, i, 0) {
+ fromvec = from->bi_io_vec + i;
+
+ /*
+ * not bounced
+ */
+ if (tovec->bv_page == fromvec->bv_page)
+ continue;
+
+ /*
+ * fromvec->bv_offset and fromvec->bv_len might have been
+ * modified by the block layer, so use the original copy,
+ * bounce_copy_vec already uses tovec->bv_len
+ */
+ vfrom = page_address(fromvec->bv_page) + tovec->bv_offset;
+
+ flush_dcache_page(tovec->bv_page);
+ bounce_copy_vec(tovec, vfrom);
+ }
+}
+
+static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
+{
+ struct bio *bio_orig = bio->bi_private;
+ struct bio_vec *bvec, *org_vec;
+ int i;
+
+ if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
+ set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags);
+
+ /*
+ * free up bounce indirect pages used
+ */
+ __bio_for_each_segment(bvec, bio, i, 0) {
+ org_vec = bio_orig->bi_io_vec + i;
+ if (bvec->bv_page == org_vec->bv_page)
+ continue;
+
+ dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
+ mempool_free(bvec->bv_page, pool);
+ }
+
+ bio_endio(bio_orig, bio_orig->bi_size, err);
+ bio_put(bio);
+}
+
+static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done, int err)
+{
+ if (bio->bi_size)
+ return 1;
+
+ bounce_end_io(bio, page_pool, err);
+ return 0;
+}
+
+static int bounce_end_io_write_isa(struct bio *bio, unsigned int bytes_done, int err)
+{
+ if (bio->bi_size)
+ return 1;
+
+ bounce_end_io(bio, isa_page_pool, err);
+ return 0;
+}
+
+static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err)
+{
+ struct bio *bio_orig = bio->bi_private;
+
+ if (test_bit(BIO_UPTODATE, &bio->bi_flags))
+ copy_to_high_bio_irq(bio_orig, bio);
+
+ bounce_end_io(bio, pool, err);
+}
+
+static int bounce_end_io_read(struct bio *bio, unsigned int bytes_done, int err)
+{
+ if (bio->bi_size)
+ return 1;
+
+ __bounce_end_io_read(bio, page_pool, err);
+ return 0;
+}
+
+static int bounce_end_io_read_isa(struct bio *bio, unsigned int bytes_done, int err)
+{
+ if (bio->bi_size)
+ return 1;
+
+ __bounce_end_io_read(bio, isa_page_pool, err);
+ return 0;
+}
+
+static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig,
+ mempool_t *pool)
+{
+ struct page *page;
+ struct bio *bio = NULL;
+ int i, rw = bio_data_dir(*bio_orig);
+ struct bio_vec *to, *from;
+
+ bio_for_each_segment(from, *bio_orig, i) {
+ page = from->bv_page;
+
+ /*
+ * is destination page below bounce pfn?
+ */
+ if (page_to_pfn(page) < q->bounce_pfn)
+ continue;
+
+ /*
+ * irk, bounce it
+ */
+ if (!bio)
+ bio = bio_alloc(GFP_NOIO, (*bio_orig)->bi_vcnt);
+
+ to = bio->bi_io_vec + i;
+
+ to->bv_page = mempool_alloc(pool, q->bounce_gfp);
+ to->bv_len = from->bv_len;
+ to->bv_offset = from->bv_offset;
+ inc_zone_page_state(to->bv_page, NR_BOUNCE);
+
+ if (rw == WRITE) {
+ char *vto, *vfrom;
+
+ flush_dcache_page(from->bv_page);
+ vto = page_address(to->bv_page) + to->bv_offset;
+ vfrom = kmap(from->bv_page) + from->bv_offset;
+ memcpy(vto, vfrom, to->bv_len);
+ kunmap(from->bv_page);
+ }
+ }
+
+ /*
+ * no pages bounced
+ */
+ if (!bio)
+ return;
+
+ /*
+ * at least one page was bounced, fill in possible non-highmem
+ * pages
+ */
+ __bio_for_each_segment(from, *bio_orig, i, 0) {
+ to = bio_iovec_idx(bio, i);
+ if (!to->bv_page) {
+ to->bv_page = from->bv_page;
+ to->bv_len = from->bv_len;
+ to->bv_offset = from->bv_offset;
+ }
+ }
+
+ bio->bi_bdev = (*bio_orig)->bi_bdev;
+ bio->bi_flags |= (1 << BIO_BOUNCED);
+ bio->bi_sector = (*bio_orig)->bi_sector;
+ bio->bi_rw = (*bio_orig)->bi_rw;
+
+ bio->bi_vcnt = (*bio_orig)->bi_vcnt;
+ bio->bi_idx = (*bio_orig)->bi_idx;
+ bio->bi_size = (*bio_orig)->bi_size;
+
+ if (pool == page_pool) {
+ bio->bi_end_io = bounce_end_io_write;
+ if (rw == READ)
+ bio->bi_end_io = bounce_end_io_read;
+ } else {
+ bio->bi_end_io = bounce_end_io_write_isa;
+ if (rw == READ)
+ bio->bi_end_io = bounce_end_io_read_isa;
+ }
+
+ bio->bi_private = *bio_orig;
+ *bio_orig = bio;
+}
+
+void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig)
+{
+ mempool_t *pool;
+
+ /*
+ * for non-isa bounce case, just check if the bounce pfn is equal
+ * to or bigger than the highest pfn in the system -- in that case,
+ * don't waste time iterating over bio segments
+ */
+ if (!(q->bounce_gfp & GFP_DMA)) {
+ if (q->bounce_pfn >= blk_max_pfn)
+ return;
+ pool = page_pool;
+ } else {
+ BUG_ON(!isa_page_pool);
+ pool = isa_page_pool;
+ }
+
+ blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
+
+ /*
+ * slow path
+ */
+ __blk_queue_bounce(q, bio_orig, pool);
+}
+
+EXPORT_SYMBOL(blk_queue_bounce);
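For context, a minimal sketch (not part of the patch itself) of how a block driver's make_request hook would use the relocated blk_queue_bounce(): example_make_request() is a hypothetical name, and the queue is assumed to have had its bounce limit configured beforehand (e.g. via blk_queue_bounce_limit()).

#include <linux/blkdev.h>
#include <linux/bio.h>

/* Sketch: bounce highmem pages before the driver touches bio data directly. */
static int example_make_request(request_queue_t *q, struct bio *bio)
{
	blk_queue_bounce(q, &bio);	/* may replace bio with a bounced clone */

	/* ... map and submit the (possibly bounced) segments here ... */
	return 0;
}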
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 168c78a121b..0df4c899e97 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -38,7 +38,7 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
if (!file)
return -EBADF;
- if (S_ISFIFO(file->f_dentry->d_inode->i_mode)) {
+ if (S_ISFIFO(file->f_path.dentry->d_inode->i_mode)) {
ret = -ESPIPE;
goto out;
}
diff --git a/mm/filemap.c b/mm/filemap.c
index afcdc72b5e9..8332c77b1bd 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -75,8 +75,8 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
* ->mmap_sem
* ->lock_page (access_process_vm)
*
- * ->mmap_sem
- * ->i_mutex (msync)
+ * ->i_mutex (generic_file_buffered_write)
+ * ->mmap_sem (fault_in_pages_readable->do_page_fault)
*
* ->i_mutex
* ->i_alloc_sem (various)
@@ -467,25 +467,15 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
}
#ifdef CONFIG_NUMA
-struct page *page_cache_alloc(struct address_space *x)
+struct page *__page_cache_alloc(gfp_t gfp)
{
if (cpuset_do_page_mem_spread()) {
int n = cpuset_mem_spread_node();
- return alloc_pages_node(n, mapping_gfp_mask(x), 0);
+ return alloc_pages_node(n, gfp, 0);
}
- return alloc_pages(mapping_gfp_mask(x), 0);
+ return alloc_pages(gfp, 0);
}
-EXPORT_SYMBOL(page_cache_alloc);
-
-struct page *page_cache_alloc_cold(struct address_space *x)
-{
- if (cpuset_do_page_mem_spread()) {
- int n = cpuset_mem_spread_node();
- return alloc_pages_node(n, mapping_gfp_mask(x)|__GFP_COLD, 0);
- }
- return alloc_pages(mapping_gfp_mask(x)|__GFP_COLD, 0);
-}
-EXPORT_SYMBOL(page_cache_alloc_cold);
+EXPORT_SYMBOL(__page_cache_alloc);
#endif
static int __sleep_on_page_lock(void *word)
@@ -826,7 +816,6 @@ struct page *
grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
{
struct page *page = find_get_page(mapping, index);
- gfp_t gfp_mask;
if (page) {
if (!TestSetPageLocked(page))
@@ -834,9 +823,8 @@ grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
page_cache_release(page);
return NULL;
}
- gfp_mask = mapping_gfp_mask(mapping) & ~__GFP_FS;
- page = alloc_pages(gfp_mask, 0);
- if (page && add_to_page_cache_lru(page, mapping, index, gfp_mask)) {
+ page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
+ if (page && add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) {
page_cache_release(page);
page = NULL;
}
@@ -1139,23 +1127,24 @@ success:
}
/**
- * __generic_file_aio_read - generic filesystem read routine
+ * generic_file_aio_read - generic filesystem read routine
* @iocb: kernel I/O control block
* @iov: io vector request
* @nr_segs: number of segments in the iovec
- * @ppos: current file position
+ * @pos: current file position
*
* This is the "read()" routine for all filesystems
* that can use the page cache directly.
*/
ssize_t
-__generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t *ppos)
+generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
{
struct file *filp = iocb->ki_filp;
ssize_t retval;
unsigned long seg;
size_t count;
+ loff_t *ppos = &iocb->ki_pos;
count = 0;
for (seg = 0; seg < nr_segs; seg++) {
@@ -1179,7 +1168,7 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
if (filp->f_flags & O_DIRECT) {
- loff_t pos = *ppos, size;
+ loff_t size;
struct address_space *mapping;
struct inode *inode;
@@ -1192,13 +1181,13 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
if (pos < size) {
retval = generic_file_direct_IO(READ, iocb,
iov, pos, nr_segs);
- if (retval > 0 && !is_sync_kiocb(iocb))
- retval = -EIOCBQUEUED;
if (retval > 0)
*ppos = pos + retval;
}
- file_accessed(filp);
- goto out;
+ if (likely(retval != 0)) {
+ file_accessed(filp);
+ goto out;
+ }
}
retval = 0;
@@ -1223,33 +1212,8 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
out:
return retval;
}
-EXPORT_SYMBOL(__generic_file_aio_read);
-
-ssize_t
-generic_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos)
-{
- struct iovec local_iov = { .iov_base = buf, .iov_len = count };
-
- BUG_ON(iocb->ki_pos != pos);
- return __generic_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos);
-}
EXPORT_SYMBOL(generic_file_aio_read);
-ssize_t
-generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
-{
- struct iovec local_iov = { .iov_base = buf, .iov_len = count };
- struct kiocb kiocb;
- ssize_t ret;
-
- init_sync_kiocb(&kiocb, filp);
- ret = __generic_file_aio_read(&kiocb, &local_iov, 1, ppos);
- if (-EIOCBQUEUED == ret)
- ret = wait_on_sync_kiocb(&kiocb);
- return ret;
-}
-EXPORT_SYMBOL(generic_file_read);
-
int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
{
ssize_t written;
@@ -1471,7 +1435,7 @@ outside_data_content:
* accessible..
*/
if (area->vm_mm == current->mm)
- return NULL;
+ return NOPAGE_SIGBUS;
/* Fall through to the non-read-ahead case */
no_cached_page:
/*
@@ -1479,7 +1443,6 @@ no_cached_page:
* effect.
*/
error = page_cache_read(file, pgoff);
- grab_swap_token();
/*
* The page we want has now been added to the page cache.
@@ -1496,7 +1459,7 @@ no_cached_page:
*/
if (error == -ENOMEM)
return NOPAGE_OOM;
- return NULL;
+ return NOPAGE_SIGBUS;
page_not_uptodate:
if (!did_readaround) {
@@ -1565,7 +1528,7 @@ page_not_uptodate:
*/
shrink_readahead_size_eio(file, ra);
page_cache_release(page);
- return NULL;
+ return NOPAGE_SIGBUS;
}
EXPORT_SYMBOL(filemap_nopage);
@@ -1906,11 +1869,10 @@ repeat:
* if suid or (sgid and xgrp)
* remove privs
*/
-int remove_suid(struct dentry *dentry)
+int should_remove_suid(struct dentry *dentry)
{
mode_t mode = dentry->d_inode->i_mode;
int kill = 0;
- int result = 0;
/* suid always must be killed */
if (unlikely(mode & S_ISUID))
@@ -1923,13 +1885,29 @@ int remove_suid(struct dentry *dentry)
if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
kill |= ATTR_KILL_SGID;
- if (unlikely(kill && !capable(CAP_FSETID))) {
- struct iattr newattrs;
+ if (unlikely(kill && !capable(CAP_FSETID)))
+ return kill;
- newattrs.ia_valid = ATTR_FORCE | kill;
- result = notify_change(dentry, &newattrs);
- }
- return result;
+ return 0;
+}
+EXPORT_SYMBOL(should_remove_suid);
+
+int __remove_suid(struct dentry *dentry, int kill)
+{
+ struct iattr newattrs;
+
+ newattrs.ia_valid = ATTR_FORCE | kill;
+ return notify_change(dentry, &newattrs);
+}
+
+int remove_suid(struct dentry *dentry)
+{
+ int kill = should_remove_suid(dentry);
+
+ if (unlikely(kill))
+ return __remove_suid(dentry, kill);
+
+ return 0;
}
EXPORT_SYMBOL(remove_suid);
@@ -2020,6 +1998,7 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
if (unlikely(*pos + *count > inode->i_sb->s_maxbytes))
*count = inode->i_sb->s_maxbytes - *pos;
} else {
+#ifdef CONFIG_BLOCK
loff_t isize;
if (bdev_read_only(I_BDEV(inode)))
return -EPERM;
@@ -2031,6 +2010,9 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
if (*pos + *count > isize)
*count = isize - *pos;
+#else
+ return -EPERM;
+#endif
}
return 0;
}
@@ -2063,15 +2045,14 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
* Sync the fs metadata but not the minor inode changes and
* of course not the data as we did direct DMA for the IO.
* i_mutex is held, which protects generic_osync_inode() from
- * livelocking.
+ * livelocking. AIO O_DIRECT ops attempt to sync metadata here.
*/
- if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+ if ((written >= 0 || written == -EIOCBQUEUED) &&
+ ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
if (err < 0)
written = err;
}
- if (written == count && !is_sync_kiocb(iocb))
- written = -EIOCBQUEUED;
return written;
}
EXPORT_SYMBOL(generic_file_direct_write);
@@ -2240,7 +2221,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t *ppos)
{
struct file *file = iocb->ki_filp;
- const struct address_space * mapping = file->f_mapping;
+ struct address_space * mapping = file->f_mapping;
size_t ocount; /* original count */
size_t count; /* after file limit checks */
struct inode *inode = mapping->host;
@@ -2285,7 +2266,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
if (count == 0)
goto out;
- err = remove_suid(file->f_dentry);
+ err = remove_suid(file->f_path.dentry);
if (err)
goto out;
@@ -2293,8 +2274,11 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
if (unlikely(file->f_flags & O_DIRECT)) {
- written = generic_file_direct_write(iocb, iov,
- &nr_segs, pos, ppos, count, ocount);
+ loff_t endbyte;
+ ssize_t written_buffered;
+
+ written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
+ ppos, count, ocount);
if (written < 0 || written == count)
goto out;
/*
@@ -2303,30 +2287,66 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
*/
pos += written;
count -= written;
- }
+ written_buffered = generic_file_buffered_write(iocb, iov,
+ nr_segs, pos, ppos, count,
+ written);
+ /*
+ * If generic_file_buffered_write() returned a synchronous error
+ * then we want to return the number of bytes which were
+ * direct-written, or the error code if that was zero. Note
+ * that this differs from normal direct-io semantics, which
+ * will return -EFOO even if some bytes were written.
+ */
+ if (written_buffered < 0) {
+ err = written_buffered;
+ goto out;
+ }
- written = generic_file_buffered_write(iocb, iov, nr_segs,
- pos, ppos, count, written);
+ /*
+ * We need to ensure that the page cache pages are written to
+ * disk and invalidated to preserve the expected O_DIRECT
+ * semantics.
+ */
+ endbyte = pos + written_buffered - written - 1;
+ err = do_sync_file_range(file, pos, endbyte,
+ SYNC_FILE_RANGE_WAIT_BEFORE|
+ SYNC_FILE_RANGE_WRITE|
+ SYNC_FILE_RANGE_WAIT_AFTER);
+ if (err == 0) {
+ written = written_buffered;
+ invalidate_mapping_pages(mapping,
+ pos >> PAGE_CACHE_SHIFT,
+ endbyte >> PAGE_CACHE_SHIFT);
+ } else {
+ /*
+ * We don't know how much we wrote, so just return
+ * the number of bytes which were direct-written
+ */
+ }
+ } else {
+ written = generic_file_buffered_write(iocb, iov, nr_segs,
+ pos, ppos, count, written);
+ }
out:
current->backing_dev_info = NULL;
return written ? written : err;
}
-EXPORT_SYMBOL(generic_file_aio_write_nolock);
-ssize_t
-generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t *ppos)
+ssize_t generic_file_aio_write_nolock(struct kiocb *iocb,
+ const struct iovec *iov, unsigned long nr_segs, loff_t pos)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
ssize_t ret;
- loff_t pos = *ppos;
- ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, ppos);
+ BUG_ON(iocb->ki_pos != pos);
+
+ ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs,
+ &iocb->ki_pos);
if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
- int err;
+ ssize_t err;
err = sync_page_range_nolock(inode, mapping, pos, ret);
if (err < 0)
@@ -2334,51 +2354,21 @@ generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
}
return ret;
}
+EXPORT_SYMBOL(generic_file_aio_write_nolock);
-static ssize_t
-__generic_file_write_nolock(struct file *file, const struct iovec *iov,
- unsigned long nr_segs, loff_t *ppos)
-{
- struct kiocb kiocb;
- ssize_t ret;
-
- init_sync_kiocb(&kiocb, file);
- ret = __generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
- if (ret == -EIOCBQUEUED)
- ret = wait_on_sync_kiocb(&kiocb);
- return ret;
-}
-
-ssize_t
-generic_file_write_nolock(struct file *file, const struct iovec *iov,
- unsigned long nr_segs, loff_t *ppos)
-{
- struct kiocb kiocb;
- ssize_t ret;
-
- init_sync_kiocb(&kiocb, file);
- ret = generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
- if (-EIOCBQUEUED == ret)
- ret = wait_on_sync_kiocb(&kiocb);
- return ret;
-}
-EXPORT_SYMBOL(generic_file_write_nolock);
-
-ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf,
- size_t count, loff_t pos)
+ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
ssize_t ret;
- struct iovec local_iov = { .iov_base = (void __user *)buf,
- .iov_len = count };
BUG_ON(iocb->ki_pos != pos);
mutex_lock(&inode->i_mutex);
- ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1,
- &iocb->ki_pos);
+ ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs,
+ &iocb->ki_pos);
mutex_unlock(&inode->i_mutex);
if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
@@ -2392,66 +2382,6 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf,
}
EXPORT_SYMBOL(generic_file_aio_write);
-ssize_t generic_file_write(struct file *file, const char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
- ssize_t ret;
- struct iovec local_iov = { .iov_base = (void __user *)buf,
- .iov_len = count };
-
- mutex_lock(&inode->i_mutex);
- ret = __generic_file_write_nolock(file, &local_iov, 1, ppos);
- mutex_unlock(&inode->i_mutex);
-
- if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
- ssize_t err;
-
- err = sync_page_range(inode, mapping, *ppos - ret, ret);
- if (err < 0)
- ret = err;
- }
- return ret;
-}
-EXPORT_SYMBOL(generic_file_write);
-
-ssize_t generic_file_readv(struct file *filp, const struct iovec *iov,
- unsigned long nr_segs, loff_t *ppos)
-{
- struct kiocb kiocb;
- ssize_t ret;
-
- init_sync_kiocb(&kiocb, filp);
- ret = __generic_file_aio_read(&kiocb, iov, nr_segs, ppos);
- if (-EIOCBQUEUED == ret)
- ret = wait_on_sync_kiocb(&kiocb);
- return ret;
-}
-EXPORT_SYMBOL(generic_file_readv);
-
-ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
- unsigned long nr_segs, loff_t *ppos)
-{
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
- ssize_t ret;
-
- mutex_lock(&inode->i_mutex);
- ret = __generic_file_write_nolock(file, iov, nr_segs, ppos);
- mutex_unlock(&inode->i_mutex);
-
- if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
- int err;
-
- err = sync_page_range(inode, mapping, *ppos - ret, ret);
- if (err < 0)
- ret = err;
- }
- return ret;
-}
-EXPORT_SYMBOL(generic_file_writev);
-
/*
* Called under i_mutex for writes to S_ISREG files. Returns -EIO if something
* went wrong during pagecache shootdown.
@@ -2491,3 +2421,33 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
}
return retval;
}
+
+/**
+ * try_to_release_page() - release old fs-specific metadata on a page
+ *
+ * @page: the page which the kernel is trying to free
+ * @gfp_mask: memory allocation flags (and I/O mode)
+ *
+ * The address_space is to try to release any data against the page
+ * (presumably at page->private). If the release was successful, return `1'.
+ * Otherwise return zero.
+ *
+ * The @gfp_mask argument specifies whether I/O may be performed to release
+ * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
+ *
+ * NOTE: @gfp_mask may go away, and this function may become non-blocking.
+ */
+int try_to_release_page(struct page *page, gfp_t gfp_mask)
+{
+ struct address_space * const mapping = page->mapping;
+
+ BUG_ON(!PageLocked(page));
+ if (PageWriteback(page))
+ return 0;
+
+ if (mapping && mapping->a_ops->releasepage)
+ return mapping->a_ops->releasepage(page, gfp_mask);
+ return try_to_free_buffers(page);
+}
+
+EXPORT_SYMBOL(try_to_release_page);
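For context, a minimal sketch of the calling convention documented above, mirroring the migrate.c hunk later in this diff; drop_fs_private() is a hypothetical helper name, and <linux/mm.h> is assumed to carry the declaration.

#include <linux/mm.h>
#include <linux/pagemap.h>

/* Sketch: ask the filesystem to drop page->private data from a locked page. */
static int drop_fs_private(struct page *page)
{
	if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL))
		return -EAGAIN;	/* filesystem refused; caller must back off */
	return 0;
}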
diff --git a/mm/filemap.h b/mm/filemap.h
index 3f2a343c601..c2bff04c84e 100644
--- a/mm/filemap.h
+++ b/mm/filemap.h
@@ -12,7 +12,6 @@
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/uio.h>
-#include <linux/config.h>
#include <linux/uaccess.h>
size_t
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index b4fd0d7c9bf..8d667617f55 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -379,7 +379,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
if (count == 0)
goto out_backing;
- ret = remove_suid(filp->f_dentry);
+ ret = remove_suid(filp->f_path.dentry);
if (ret)
goto out_backing;
diff --git a/mm/fremap.c b/mm/fremap.c
index aa30618ec6b..b77a002c335 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -39,7 +39,7 @@ static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
} else {
if (!pte_file(pte))
free_swap_and_cache(pte_to_swp_entry(pte));
- pte_clear(mm, addr, ptep);
+ pte_clear_not_present_full(mm, addr, ptep, 0);
}
return !!page;
}
@@ -101,7 +101,6 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
{
int err = -ENOMEM;
pte_t *pte;
- pte_t pte_val;
spinlock_t *ptl;
pte = get_locked_pte(mm, addr, &ptl);
@@ -114,7 +113,6 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
}
set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
- pte_val = *pte;
/*
* We don't need to run update_mmu_cache() here because the "file pte"
* being installed by install_file_pte() is not a real pte - it's a
diff --git a/mm/highmem.c b/mm/highmem.c
index ee5519b176e..0206e7e5018 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -29,13 +29,6 @@
#include <linux/blktrace_api.h>
#include <asm/tlbflush.h>
-static mempool_t *page_pool, *isa_page_pool;
-
-static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data)
-{
- return mempool_alloc_pages(gfp_mask | GFP_DMA, data);
-}
-
/*
* Virtual_count is not a pure "count".
* 0 means that it is not mapped, and has not been mapped
@@ -217,282 +210,8 @@ void fastcall kunmap_high(struct page *page)
}
EXPORT_SYMBOL(kunmap_high);
-
-#define POOL_SIZE 64
-
-static __init int init_emergency_pool(void)
-{
- struct sysinfo i;
- si_meminfo(&i);
- si_swapinfo(&i);
-
- if (!i.totalhigh)
- return 0;
-
- page_pool = mempool_create_page_pool(POOL_SIZE, 0);
- BUG_ON(!page_pool);
- printk("highmem bounce pool size: %d pages\n", POOL_SIZE);
-
- return 0;
-}
-
-__initcall(init_emergency_pool);
-
-/*
- * highmem version, map in to vec
- */
-static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
-{
- unsigned long flags;
- unsigned char *vto;
-
- local_irq_save(flags);
- vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ);
- memcpy(vto + to->bv_offset, vfrom, to->bv_len);
- kunmap_atomic(vto, KM_BOUNCE_READ);
- local_irq_restore(flags);
-}
-
-#else /* CONFIG_HIGHMEM */
-
-#define bounce_copy_vec(to, vfrom) \
- memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len)
-
#endif
-#define ISA_POOL_SIZE 16
-
-/*
- * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA
- * as the max address, so check if the pool has already been created.
- */
-int init_emergency_isa_pool(void)
-{
- if (isa_page_pool)
- return 0;
-
- isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa,
- mempool_free_pages, (void *) 0);
- BUG_ON(!isa_page_pool);
-
- printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE);
- return 0;
-}
-
-/*
- * Simple bounce buffer support for highmem pages. Depending on the
- * queue gfp mask set, *to may or may not be a highmem page. kmap it
- * always, it will do the Right Thing
- */
-static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
-{
- unsigned char *vfrom;
- struct bio_vec *tovec, *fromvec;
- int i;
-
- __bio_for_each_segment(tovec, to, i, 0) {
- fromvec = from->bi_io_vec + i;
-
- /*
- * not bounced
- */
- if (tovec->bv_page == fromvec->bv_page)
- continue;
-
- /*
- * fromvec->bv_offset and fromvec->bv_len might have been
- * modified by the block layer, so use the original copy,
- * bounce_copy_vec already uses tovec->bv_len
- */
- vfrom = page_address(fromvec->bv_page) + tovec->bv_offset;
-
- flush_dcache_page(tovec->bv_page);
- bounce_copy_vec(tovec, vfrom);
- }
-}
-
-static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
-{
- struct bio *bio_orig = bio->bi_private;
- struct bio_vec *bvec, *org_vec;
- int i;
-
- if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
- set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags);
-
- /*
- * free up bounce indirect pages used
- */
- __bio_for_each_segment(bvec, bio, i, 0) {
- org_vec = bio_orig->bi_io_vec + i;
- if (bvec->bv_page == org_vec->bv_page)
- continue;
-
- dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
- mempool_free(bvec->bv_page, pool);
- }
-
- bio_endio(bio_orig, bio_orig->bi_size, err);
- bio_put(bio);
-}
-
-static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done, int err)
-{
- if (bio->bi_size)
- return 1;
-
- bounce_end_io(bio, page_pool, err);
- return 0;
-}
-
-static int bounce_end_io_write_isa(struct bio *bio, unsigned int bytes_done, int err)
-{
- if (bio->bi_size)
- return 1;
-
- bounce_end_io(bio, isa_page_pool, err);
- return 0;
-}
-
-static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err)
-{
- struct bio *bio_orig = bio->bi_private;
-
- if (test_bit(BIO_UPTODATE, &bio->bi_flags))
- copy_to_high_bio_irq(bio_orig, bio);
-
- bounce_end_io(bio, pool, err);
-}
-
-static int bounce_end_io_read(struct bio *bio, unsigned int bytes_done, int err)
-{
- if (bio->bi_size)
- return 1;
-
- __bounce_end_io_read(bio, page_pool, err);
- return 0;
-}
-
-static int bounce_end_io_read_isa(struct bio *bio, unsigned int bytes_done, int err)
-{
- if (bio->bi_size)
- return 1;
-
- __bounce_end_io_read(bio, isa_page_pool, err);
- return 0;
-}
-
-static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig,
- mempool_t *pool)
-{
- struct page *page;
- struct bio *bio = NULL;
- int i, rw = bio_data_dir(*bio_orig);
- struct bio_vec *to, *from;
-
- bio_for_each_segment(from, *bio_orig, i) {
- page = from->bv_page;
-
- /*
- * is destination page below bounce pfn?
- */
- if (page_to_pfn(page) < q->bounce_pfn)
- continue;
-
- /*
- * irk, bounce it
- */
- if (!bio)
- bio = bio_alloc(GFP_NOIO, (*bio_orig)->bi_vcnt);
-
- to = bio->bi_io_vec + i;
-
- to->bv_page = mempool_alloc(pool, q->bounce_gfp);
- to->bv_len = from->bv_len;
- to->bv_offset = from->bv_offset;
- inc_zone_page_state(to->bv_page, NR_BOUNCE);
-
- if (rw == WRITE) {
- char *vto, *vfrom;
-
- flush_dcache_page(from->bv_page);
- vto = page_address(to->bv_page) + to->bv_offset;
- vfrom = kmap(from->bv_page) + from->bv_offset;
- memcpy(vto, vfrom, to->bv_len);
- kunmap(from->bv_page);
- }
- }
-
- /*
- * no pages bounced
- */
- if (!bio)
- return;
-
- /*
- * at least one page was bounced, fill in possible non-highmem
- * pages
- */
- __bio_for_each_segment(from, *bio_orig, i, 0) {
- to = bio_iovec_idx(bio, i);
- if (!to->bv_page) {
- to->bv_page = from->bv_page;
- to->bv_len = from->bv_len;
- to->bv_offset = from->bv_offset;
- }
- }
-
- bio->bi_bdev = (*bio_orig)->bi_bdev;
- bio->bi_flags |= (1 << BIO_BOUNCED);
- bio->bi_sector = (*bio_orig)->bi_sector;
- bio->bi_rw = (*bio_orig)->bi_rw;
-
- bio->bi_vcnt = (*bio_orig)->bi_vcnt;
- bio->bi_idx = (*bio_orig)->bi_idx;
- bio->bi_size = (*bio_orig)->bi_size;
-
- if (pool == page_pool) {
- bio->bi_end_io = bounce_end_io_write;
- if (rw == READ)
- bio->bi_end_io = bounce_end_io_read;
- } else {
- bio->bi_end_io = bounce_end_io_write_isa;
- if (rw == READ)
- bio->bi_end_io = bounce_end_io_read_isa;
- }
-
- bio->bi_private = *bio_orig;
- *bio_orig = bio;
-}
-
-void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig)
-{
- mempool_t *pool;
-
- /*
- * for non-isa bounce case, just check if the bounce pfn is equal
- * to or bigger than the highest pfn in the system -- in that case,
- * don't waste time iterating over bio segments
- */
- if (!(q->bounce_gfp & GFP_DMA)) {
- if (q->bounce_pfn >= blk_max_pfn)
- return;
- pool = page_pool;
- } else {
- BUG_ON(!isa_page_pool);
- pool = isa_page_pool;
- }
-
- blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
-
- /*
- * slow path
- */
- __blk_queue_bounce(q, bio_orig, pool);
-}
-
-EXPORT_SYMBOL(blk_queue_bounce);
-
#if defined(HASHED_PAGE_VIRTUAL)
#define PA_HASH_ORDER 7
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 7c7d03dbf73..0ccc7f23025 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -109,7 +109,7 @@ static int alloc_fresh_huge_page(void)
if (nid == MAX_NUMNODES)
nid = first_node(node_online_map);
if (page) {
- page[1].lru.next = (void *)free_huge_page; /* dtor */
+ set_compound_page_dtor(page, free_huge_page);
spin_lock(&hugetlb_lock);
nr_huge_pages++;
nr_huge_pages_node[page_to_nid(page)]++;
@@ -344,7 +344,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
entry = *src_pte;
ptepage = pte_page(entry);
get_page(ptepage);
- add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE);
set_huge_pte_at(dst, addr, dst_pte, entry);
}
spin_unlock(&src->page_table_lock);
@@ -356,40 +355,66 @@ nomem:
return -ENOMEM;
}
-void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
- unsigned long end)
+void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long address;
pte_t *ptep;
pte_t pte;
struct page *page;
+ struct page *tmp;
+ /*
+ * A page gathering list, protected by per file i_mmap_lock. The
+ * lock is used to avoid list corruption from multiple unmapping
+ * of the same page since we are using page->lru.
+ */
+ LIST_HEAD(page_list);
WARN_ON(!is_vm_hugetlb_page(vma));
BUG_ON(start & ~HPAGE_MASK);
BUG_ON(end & ~HPAGE_MASK);
spin_lock(&mm->page_table_lock);
-
- /* Update high watermark before we lower rss */
- update_hiwater_rss(mm);
-
for (address = start; address < end; address += HPAGE_SIZE) {
ptep = huge_pte_offset(mm, address);
if (!ptep)
continue;
+ if (huge_pmd_unshare(mm, &address, ptep))
+ continue;
+
pte = huge_ptep_get_and_clear(mm, address, ptep);
if (pte_none(pte))
continue;
page = pte_page(pte);
- put_page(page);
- add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
+ list_add(&page->lru, &page_list);
}
-
spin_unlock(&mm->page_table_lock);
flush_tlb_range(vma, start, end);
+ list_for_each_entry_safe(page, tmp, &page_list, lru) {
+ list_del(&page->lru);
+ put_page(page);
+ }
+}
+
+void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end)
+{
+ /*
+ * It is undesirable to test vma->vm_file as it should be non-null
+ * for valid hugetlb area. However, vm_file will be NULL in the error
+ * cleanup path of do_mmap_pgoff. When hugetlbfs ->mmap method fails,
+ * do_mmap_pgoff() nullifies vma->vm_file before calling this function
+ * to clean up. Since no pte has actually been setup, it is safe to
+ * do nothing in this case.
+ */
+ if (vma->vm_file) {
+ spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
+ __unmap_hugepage_range(vma, start, end);
+ spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
+ }
}
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -454,6 +479,9 @@ int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
retry:
page = find_lock_page(mapping, idx);
if (!page) {
+ size = i_size_read(mapping->host) >> HPAGE_SHIFT;
+ if (idx >= size)
+ goto out;
if (hugetlb_get_quota(mapping))
goto out;
page = alloc_huge_page(vma, address);
@@ -488,7 +516,6 @@ retry:
if (!pte_none(*ptep))
goto backout;
- add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
&& (vma->vm_flags & VM_SHARED)));
set_huge_pte_at(mm, address, ptep, new_pte);
@@ -626,11 +653,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
BUG_ON(address >= end);
flush_cache_range(vma, address, end);
+ spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
spin_lock(&mm->page_table_lock);
for (; address < end; address += HPAGE_SIZE) {
ptep = huge_pte_offset(mm, address);
if (!ptep)
continue;
+ if (huge_pmd_unshare(mm, &address, ptep))
+ continue;
if (!pte_none(*ptep)) {
pte = huge_ptep_get_and_clear(mm, address, ptep);
pte = pte_mkhuge(pte_modify(pte, newprot));
@@ -639,6 +669,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
}
}
spin_unlock(&mm->page_table_lock);
+ spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
flush_tlb_range(vma, start, end);
}
diff --git a/mm/memory.c b/mm/memory.c
index 92a3ebd8d79..bf6100236e6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -467,7 +467,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
*/
if (is_cow_mapping(vm_flags)) {
ptep_set_wrprotect(src_mm, addr, src_pte);
- pte = *src_pte;
+ pte = pte_wrprotect(pte);
}
/*
@@ -506,6 +506,7 @@ again:
src_pte = pte_offset_map_nested(src_pmd, addr);
src_ptl = pte_lockptr(src_mm, src_pmd);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+ arch_enter_lazy_mmu_mode();
do {
/*
@@ -527,6 +528,7 @@ again:
progress += 8;
} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
+ arch_leave_lazy_mmu_mode();
spin_unlock(src_ptl);
pte_unmap_nested(src_pte - 1);
add_mm_rss(dst_mm, rss[0], rss[1]);
@@ -628,6 +630,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
int anon_rss = 0;
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ arch_enter_lazy_mmu_mode();
do {
pte_t ptent = *pte;
if (pte_none(ptent)) {
@@ -690,10 +693,11 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
continue;
if (!pte_file(ptent))
free_swap_and_cache(pte_to_swp_entry(ptent));
- pte_clear_full(mm, addr, pte, tlb->fullmm);
+ pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
add_mm_rss(mm, file_rss, anon_rss);
+ arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);
return addr;
@@ -1082,6 +1086,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
default:
BUG();
}
+ cond_resched();
}
if (pages) {
pages[i] = page;
@@ -1105,21 +1110,29 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
{
pte_t *pte;
spinlock_t *ptl;
+ int err = 0;
pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (!pte)
- return -ENOMEM;
+ return -EAGAIN;
+ arch_enter_lazy_mmu_mode();
do {
struct page *page = ZERO_PAGE(addr);
pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
+
+ if (unlikely(!pte_none(*pte))) {
+ err = -EEXIST;
+ pte++;
+ break;
+ }
page_cache_get(page);
page_add_file_rmap(page);
inc_mm_counter(mm, file_rss);
- BUG_ON(!pte_none(*pte));
set_pte_at(mm, addr, pte, zero_pte);
} while (pte++, addr += PAGE_SIZE, addr != end);
+ arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);
- return 0;
+ return err;
}
static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud,
@@ -1127,16 +1140,18 @@ static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud,
{
pmd_t *pmd;
unsigned long next;
+ int err;
pmd = pmd_alloc(mm, pud, addr);
if (!pmd)
- return -ENOMEM;
+ return -EAGAIN;
do {
next = pmd_addr_end(addr, end);
- if (zeromap_pte_range(mm, pmd, addr, next, prot))
- return -ENOMEM;
+ err = zeromap_pte_range(mm, pmd, addr, next, prot);
+ if (err)
+ break;
} while (pmd++, addr = next, addr != end);
- return 0;
+ return err;
}
static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd,
@@ -1144,16 +1159,18 @@ static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd,
{
pud_t *pud;
unsigned long next;
+ int err;
pud = pud_alloc(mm, pgd, addr);
if (!pud)
- return -ENOMEM;
+ return -EAGAIN;
do {
next = pud_addr_end(addr, end);
- if (zeromap_pmd_range(mm, pud, addr, next, prot))
- return -ENOMEM;
+ err = zeromap_pmd_range(mm, pud, addr, next, prot);
+ if (err)
+ break;
} while (pud++, addr = next, addr != end);
- return 0;
+ return err;
}
int zeromap_page_range(struct vm_area_struct *vma,
@@ -1275,11 +1292,13 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (!pte)
return -ENOMEM;
+ arch_enter_lazy_mmu_mode();
do {
BUG_ON(!pte_none(*pte));
set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
pfn++;
} while (pte++, addr += PAGE_SIZE, addr != end);
+ arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);
return 0;
}
@@ -1443,6 +1462,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
memset(kaddr, 0, PAGE_SIZE);
kunmap_atomic(kaddr, KM_USER0);
+ flush_dcache_page(dst);
return;
}
@@ -1577,7 +1597,14 @@ gotten:
entry = mk_pte(new_page, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
lazy_mmu_prot_update(entry);
- ptep_establish(vma, address, page_table, entry);
+ /*
+ * Clear the pte entry and flush it first, before updating the
+ * pte with the new entry. This will avoid a race condition
+ * seen in the presence of one thread doing SMC and another
+ * thread doing COW.
+ */
+ ptep_clear_flush(vma, address, page_table);
+ set_pte_at(mm, address, page_table, entry);
update_mmu_cache(vma, address, entry);
lru_cache_add_active(new_page);
page_add_new_anon_rmap(new_page, vma, address);
@@ -1885,7 +1912,6 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
return 0;
}
-EXPORT_UNUSED_SYMBOL(vmtruncate_range); /* June 2006 */
/**
* swapin_readahead - swap in pages in hope we need them soon
@@ -1974,6 +2000,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
delayacct_set_flag(DELAYACCT_PF_SWAPIN);
page = lookup_swap_cache(entry);
if (!page) {
+ grab_swap_token(); /* Contend for token _before_ read-in */
swapin_readahead(entry, address, vma);
page = read_swap_cache_async(entry, vma, address);
if (!page) {
@@ -1991,7 +2018,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
/* Had to read the page from swap area: Major fault */
ret = VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT);
- grab_swap_token();
}
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
@@ -2154,11 +2180,13 @@ retry:
* after the next truncate_count read.
*/
- /* no page was available -- either SIGBUS or OOM */
- if (new_page == NOPAGE_SIGBUS)
+ /* no page was available -- either SIGBUS, OOM or REFAULT */
+ if (unlikely(new_page == NOPAGE_SIGBUS))
return VM_FAULT_SIGBUS;
- if (new_page == NOPAGE_OOM)
+ else if (unlikely(new_page == NOPAGE_OOM))
return VM_FAULT_OOM;
+ else if (unlikely(new_page == NOPAGE_REFAULT))
+ return VM_FAULT_MINOR;
/*
* Should we do an early C-O-W break?
@@ -2256,6 +2284,54 @@ oom:
}
/*
+ * do_no_pfn() tries to create a new page mapping for a page without
+ * a struct_page backing it
+ *
+ * As this is called only for pages that do not currently exist, we
+ * do not need to flush old virtual caches or the TLB.
+ *
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
+ *
+ * It is expected that the ->nopfn handler always returns the same pfn
+ * for a given virtual mapping.
+ *
+ * Mark this `noinline' to prevent it from bloating the main pagefault code.
+ */
+static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table, pmd_t *pmd,
+ int write_access)
+{
+ spinlock_t *ptl;
+ pte_t entry;
+ unsigned long pfn;
+ int ret = VM_FAULT_MINOR;
+
+ pte_unmap(page_table);
+ BUG_ON(!(vma->vm_flags & VM_PFNMAP));
+ BUG_ON(is_cow_mapping(vma->vm_flags));
+
+ pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK);
+ if (pfn == NOPFN_OOM)
+ return VM_FAULT_OOM;
+ if (pfn == NOPFN_SIGBUS)
+ return VM_FAULT_SIGBUS;
+
+ page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+
+ /* Only go through if we didn't race with anybody else... */
+ if (pte_none(*page_table)) {
+ entry = pfn_pte(pfn, vma->vm_page_prot);
+ if (write_access)
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ set_pte_at(mm, address, page_table, entry);
+ }
+ pte_unmap_unlock(page_table, ptl);
+ return ret;
+}
+
+/*
* Fault of a previously existing named mapping. Repopulate the pte
* from the encoded file_pte if possible. This enables swappable
* nonlinear vmas.
@@ -2317,11 +2393,17 @@ static inline int handle_pte_fault(struct mm_struct *mm,
old_entry = entry = *pte;
if (!pte_present(entry)) {
if (pte_none(entry)) {
- if (!vma->vm_ops || !vma->vm_ops->nopage)
- return do_anonymous_page(mm, vma, address,
- pte, pmd, write_access);
- return do_no_page(mm, vma, address,
- pte, pmd, write_access);
+ if (vma->vm_ops) {
+ if (vma->vm_ops->nopage)
+ return do_no_page(mm, vma, address,
+ pte, pmd,
+ write_access);
+ if (unlikely(vma->vm_ops->nopfn))
+ return do_no_pfn(mm, vma, address, pte,
+ pmd, write_access);
+ }
+ return do_anonymous_page(mm, vma, address,
+ pte, pmd, write_access);
}
if (pte_file(entry))
return do_file_page(mm, vma, address,
@@ -2550,3 +2632,56 @@ int in_gate_area_no_task(unsigned long addr)
}
#endif /* __HAVE_ARCH_GATE_AREA */
+
+/*
+ * Access another process' address space.
+ * Source/target buffer must be kernel space,
+ * Do not walk the page table directly, use get_user_pages
+ */
+int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
+{
+ struct mm_struct *mm;
+ struct vm_area_struct *vma;
+ struct page *page;
+ void *old_buf = buf;
+
+ mm = get_task_mm(tsk);
+ if (!mm)
+ return 0;
+
+ down_read(&mm->mmap_sem);
+ /* ignore errors, just check how much was successfully transferred */
+ while (len) {
+ int bytes, ret, offset;
+ void *maddr;
+
+ ret = get_user_pages(tsk, mm, addr, 1,
+ write, 1, &page, &vma);
+ if (ret <= 0)
+ break;
+
+ bytes = len;
+ offset = addr & (PAGE_SIZE-1);
+ if (bytes > PAGE_SIZE-offset)
+ bytes = PAGE_SIZE-offset;
+
+ maddr = kmap(page);
+ if (write) {
+ copy_to_user_page(vma, page, addr,
+ maddr + offset, buf, bytes);
+ set_page_dirty_lock(page);
+ } else {
+ copy_from_user_page(vma, page, addr,
+ buf, maddr + offset, bytes);
+ }
+ kunmap(page);
+ page_cache_release(page);
+ len -= bytes;
+ buf += bytes;
+ addr += bytes;
+ }
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+
+ return buf - old_buf;
+}
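For context, a minimal sketch of using the access_process_vm() helper added above: peek_task_long() is a hypothetical wrapper, and the prototype is assumed to be visible through <linux/mm.h>.

#include <linux/mm.h>
#include <linux/sched.h>

/* Sketch: read one long from another task's address space; returns bytes copied. */
static int peek_task_long(struct task_struct *tsk, unsigned long addr,
			  unsigned long *val)
{
	/* write=0: copy from the target's memory into *val */
	return access_process_vm(tsk, addr, val, sizeof(*val), 0);
}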
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c37319542b7..0c055a090f4 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -13,6 +13,7 @@
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/pagevec.h>
+#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
@@ -21,11 +22,41 @@
#include <linux/highmem.h>
#include <linux/vmalloc.h>
#include <linux/ioport.h>
+#include <linux/cpuset.h>
#include <asm/tlbflush.h>
-extern void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
- unsigned long size);
+/* add this memory to iomem resource */
+static struct resource *register_memory_resource(u64 start, u64 size)
+{
+ struct resource *res;
+ res = kzalloc(sizeof(struct resource), GFP_KERNEL);
+ BUG_ON(!res);
+
+ res->name = "System RAM";
+ res->start = start;
+ res->end = start + size - 1;
+ res->flags = IORESOURCE_MEM;
+ if (request_resource(&iomem_resource, res) < 0) {
+ printk("System RAM resource %llx - %llx cannot be added\n",
+ (unsigned long long)res->start, (unsigned long long)res->end);
+ kfree(res);
+ res = NULL;
+ }
+ return res;
+}
+
+static void release_memory_resource(struct resource *res)
+{
+ if (!res)
+ return;
+ release_resource(res);
+ kfree(res);
+ return;
+}
+
+
+#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
{
struct pglist_data *pgdat = zone->zone_pgdat;
@@ -41,12 +72,9 @@ static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
return ret;
}
memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn);
- zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages);
return 0;
}
-extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
- int nr_pages);
static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
{
int nr_pages = PAGES_PER_SECTION;
@@ -191,8 +219,10 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
if (need_zonelists_rebuild)
build_all_zonelists();
vm_total_pages = nr_free_pagecache_pages();
+ writeback_set_ratelimit();
return 0;
}
+#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
static pg_data_t *hotadd_new_pgdat(int nid, u64 start)
{
@@ -222,36 +252,6 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
return;
}
-/* add this memory to iomem resource */
-static struct resource *register_memory_resource(u64 start, u64 size)
-{
- struct resource *res;
- res = kzalloc(sizeof(struct resource), GFP_KERNEL);
- BUG_ON(!res);
-
- res->name = "System RAM";
- res->start = start;
- res->end = start + size - 1;
- res->flags = IORESOURCE_MEM;
- if (request_resource(&iomem_resource, res) < 0) {
- printk("System RAM resource %llx - %llx cannot be added\n",
- (unsigned long long)res->start, (unsigned long long)res->end);
- kfree(res);
- res = NULL;
- }
- return res;
-}
-
-static void release_memory_resource(struct resource *res)
-{
- if (!res)
- return;
- release_resource(res);
- kfree(res);
- return;
-}
-
-
int add_memory(int nid, u64 start, u64 size)
{
@@ -283,6 +283,8 @@ int add_memory(int nid, u64 start, u64 size)
/* we online node here. we can't roll back from here. */
node_set_online(nid);
+ cpuset_track_online_nodes();
+
if (new_pgdat) {
ret = register_one_node(nid);
/*
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 38f89650bc8..da946394655 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -141,9 +141,11 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
enum zone_type k;
max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
+ max++; /* space for zlcache_ptr (see mmzone.h) */
zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
if (!zl)
return NULL;
+ zl->zlcache_ptr = NULL;
num = 0;
/* First put in the highest zones from all nodes, then all the next
lower zones etc. Avoid empty zones because the memory allocator
@@ -219,7 +221,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
do {
struct page *page;
- unsigned int nid;
+ int nid;
if (!pte_present(*pte))
continue;
@@ -727,7 +729,7 @@ int do_migrate_pages(struct mm_struct *mm,
return -ENOSYS;
}
-static struct page *new_vma_page(struct page *page, unsigned long private)
+static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
{
return NULL;
}
@@ -1136,7 +1138,9 @@ static unsigned interleave_nodes(struct mempolicy *policy)
*/
unsigned slab_node(struct mempolicy *policy)
{
- switch (policy->policy) {
+ int pol = policy ? policy->policy : MPOL_DEFAULT;
+
+ switch (pol) {
case MPOL_INTERLEAVE:
return interleave_nodes(policy);
@@ -1322,12 +1326,11 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
atomic_set(&new->refcnt, 1);
if (new->policy == MPOL_BIND) {
int sz = ksize(old->v.zonelist);
- new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
+ new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL);
if (!new->v.zonelist) {
kmem_cache_free(policy_cache, new);
return ERR_PTR(-ENOMEM);
}
- memcpy(new->v.zonelist, old->v.zonelist, sz);
}
return new;
}
@@ -1704,8 +1707,8 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
* Display pages allocated per node and memory policy via /proc.
*/
-static const char *policy_types[] = { "default", "prefer", "bind",
- "interleave" };
+static const char * const policy_types[] =
+ { "default", "prefer", "bind", "interleave" };
/*
* Convert a mempolicy into a string.
@@ -1854,7 +1857,7 @@ int show_numa_map(struct seq_file *m, void *v)
if (file) {
seq_printf(m, " file=");
- seq_path(m, file->f_vfsmnt, file->f_dentry, "\n\t= ");
+ seq_path(m, file->f_path.mnt, file->f_path.dentry, "\n\t= ");
} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
seq_printf(m, " heap");
} else if (vma->vm_start <= mm->start_stack &&
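The __mpol_copy() hunk above swaps an open-coded kmalloc() plus memcpy() for kmemdup(). As a rough illustration (not the kernel implementation), a userspace equivalent of that helper under an invented name is just:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Allocate and copy in one step, so the caller cannot forget the copy
 * or pass mismatched sizes to the two calls. */
static void *memdup_sketch(const void *src, size_t len)
{
        void *p = malloc(len);

        if (p)
                memcpy(p, src, len);
        return p;
}

int main(void)
{
        const char payload[] = "example payload";
        char *copy = memdup_sketch(payload, sizeof(payload));

        if (copy)
                printf("%s\n", copy);
        free(copy);
        return 0;
}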
diff --git a/mm/migrate.c b/mm/migrate.c
index 20a8c2687b1..e9b161bde95 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -294,7 +294,7 @@ out:
static int migrate_page_move_mapping(struct address_space *mapping,
struct page *newpage, struct page *page)
{
- struct page **radix_pointer;
+ void **pslot;
if (!mapping) {
/* Anonymous page */
@@ -305,12 +305,11 @@ static int migrate_page_move_mapping(struct address_space *mapping,
write_lock_irq(&mapping->tree_lock);
- radix_pointer = (struct page **)radix_tree_lookup_slot(
- &mapping->page_tree,
- page_index(page));
+ pslot = radix_tree_lookup_slot(&mapping->page_tree,
+ page_index(page));
if (page_count(page) != 2 + !!PagePrivate(page) ||
- *radix_pointer != page) {
+ (struct page *)radix_tree_deref_slot(pslot) != page) {
write_unlock_irq(&mapping->tree_lock);
return -EAGAIN;
}
@@ -318,7 +317,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
/*
* Now we know that no one else is looking at the page.
*/
- get_page(newpage);
+ get_page(newpage); /* add cache reference */
#ifdef CONFIG_SWAP
if (PageSwapCache(page)) {
SetPageSwapCache(newpage);
@@ -326,8 +325,14 @@ static int migrate_page_move_mapping(struct address_space *mapping,
}
#endif
- *radix_pointer = newpage;
+ radix_tree_replace_slot(pslot, newpage);
+
+ /*
+ * Drop cache reference from old page.
+ * We know this isn't the last reference.
+ */
__put_page(page);
+
write_unlock_irq(&mapping->tree_lock);
return 0;
@@ -409,6 +414,7 @@ int migrate_page(struct address_space *mapping,
}
EXPORT_SYMBOL(migrate_page);
+#ifdef CONFIG_BLOCK
/*
* Migration function for pages with buffers. This function can only be used
* if the underlying filesystem guarantees that no other references to "page"
@@ -466,6 +472,7 @@ int buffer_migrate_page(struct address_space *mapping,
return 0;
}
EXPORT_SYMBOL(buffer_migrate_page);
+#endif
/*
* Writeback a page to clean the dirty state
@@ -525,7 +532,7 @@ static int fallback_migrate_page(struct address_space *mapping,
* Buffers may be managed in a filesystem specific way.
* We must have no buffers or drop them.
*/
- if (page_has_buffers(page) &&
+ if (PagePrivate(page) &&
!try_to_release_page(page, GFP_KERNEL))
return -EAGAIN;
@@ -950,7 +957,8 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
goto out;
pm[i].node = node;
- }
+ } else
+ pm[i].node = 0; /* anything to not match MAX_NUMNODES */
}
/* End marker */
pm[nr_pages].node = MAX_NUMNODES;
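The migrate_page_move_mapping() hunks stop writing through a casted struct page ** and instead look up a generic slot, dereference it to revalidate the entry, and replace it through a helper. The toy below mimics that lookup/deref/replace shape with a plain array; the names are illustrative stand-ins, not the radix-tree API:

#include <stdio.h>

static void *slots[16];                 /* toy stand-in for the page-cache tree */

static void **lookup_slot(unsigned long index)
{
        return &slots[index % 16];
}

static void *deref_slot(void **pslot)
{
        return *pslot;
}

static void replace_slot(void **pslot, void *newentry)
{
        *pslot = newentry;
}

int main(void)
{
        int oldpage = 1, newpage = 2;
        void **pslot;

        slots[3] = &oldpage;
        pslot = lookup_slot(3);
        if (deref_slot(pslot) != &oldpage)      /* entry changed under us: bail out */
                return 1;
        replace_slot(pslot, &newpage);
        printf("slot now holds %s\n",
               deref_slot(pslot) == &newpage ? "newpage" : "oldpage");
        return 0;
}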
diff --git a/mm/mlock.c b/mm/mlock.c
index b90c59573ab..3446b7ef731 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -65,7 +65,7 @@ success:
ret = make_pages_present(start, end);
}
- vma->vm_mm->locked_vm -= pages;
+ mm->locked_vm -= pages;
out:
if (ret == -ENOMEM)
ret = -EAGAIN;
diff --git a/mm/mmap.c b/mm/mmap.c
index eea8eefd51a..9717337293c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -188,7 +188,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
struct file *file, struct address_space *mapping)
{
if (vma->vm_flags & VM_DENYWRITE)
- atomic_inc(&file->f_dentry->d_inode->i_writecount);
+ atomic_inc(&file->f_path.dentry->d_inode->i_writecount);
if (vma->vm_flags & VM_SHARED)
mapping->i_mmap_writable--;
@@ -399,7 +399,7 @@ static inline void __vma_link_file(struct vm_area_struct *vma)
struct address_space *mapping = file->f_mapping;
if (vma->vm_flags & VM_DENYWRITE)
- atomic_dec(&file->f_dentry->d_inode->i_writecount);
+ atomic_dec(&file->f_path.dentry->d_inode->i_writecount);
if (vma->vm_flags & VM_SHARED)
mapping->i_mmap_writable++;
@@ -900,17 +900,6 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
int accountable = 1;
unsigned long charged = 0, reqprot = prot;
- if (file) {
- if (is_file_hugepages(file))
- accountable = 0;
-
- if (!file->f_op || !file->f_op->mmap)
- return -ENODEV;
-
- if ((prot & PROT_EXEC) &&
- (file->f_vfsmnt->mnt_flags & MNT_NOEXEC))
- return -EPERM;
- }
/*
* Does the application expect PROT_READ to imply PROT_EXEC?
*
@@ -918,7 +907,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
* mounted, in which case we don't add PROT_EXEC.)

*/
if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
- if (!(file && (file->f_vfsmnt->mnt_flags & MNT_NOEXEC)))
+ if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
prot |= PROT_EXEC;
if (!len)
@@ -971,7 +960,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
return -EAGAIN;
}
- inode = file ? file->f_dentry->d_inode : NULL;
+ inode = file ? file->f_path.dentry->d_inode : NULL;
if (file) {
switch (flags & MAP_TYPE) {
@@ -1000,6 +989,16 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
case MAP_PRIVATE:
if (!(file->f_mode & FMODE_READ))
return -EACCES;
+ if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
+ if (vm_flags & VM_EXEC)
+ return -EPERM;
+ vm_flags &= ~VM_MAYEXEC;
+ }
+ if (is_file_hugepages(file))
+ accountable = 0;
+
+ if (!file->f_op || !file->f_op->mmap)
+ return -ENODEV;
break;
default:
@@ -1380,7 +1379,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
* Check if the given range is hugepage aligned, and
* can be made suitable for hugepages.
*/
- ret = prepare_hugepage_range(addr, len);
+ ret = prepare_hugepage_range(addr, len, pgoff);
} else {
/*
* Ensure that a normal request is not falling in a
@@ -1737,7 +1736,7 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
if (mm->map_count >= sysctl_max_map_count)
return -ENOMEM;
- new = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+ new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
if (!new)
return -ENOMEM;
@@ -1881,6 +1880,9 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
if ((addr + len) > TASK_SIZE || (addr + len) < addr)
return -EINVAL;
+ if (is_hugepage_only_range(mm, addr, len))
+ return -EINVAL;
+
flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
error = arch_mmap_check(addr, len, flags);
@@ -2055,7 +2057,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
vma_start < new_vma->vm_end)
*vmap = new_vma;
} else {
- new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+ new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
if (new_vma) {
*new_vma = *vma;
pol = mpol_copy(vma_policy(vma));
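To illustrate the MNT_NOEXEC handling that do_mmap_pgoff() gains in its MAP_PRIVATE branch above: an already-executable private file mapping on a noexec mount is refused, and an otherwise acceptable one loses the right to become executable later. A hedged sketch, with the SK_* flags as local stand-ins for VM_EXEC/VM_MAYEXEC rather than kernel definitions:

#include <stdio.h>

#define SK_EXEC         0x1     /* mapping is executable now */
#define SK_MAYEXEC      0x2     /* mprotect() may make it executable later */

static int check_private_file_map(int mnt_noexec, unsigned long *vm_flags)
{
        if (mnt_noexec) {
                if (*vm_flags & SK_EXEC)
                        return -1;              /* -EPERM in the kernel */
                *vm_flags &= ~SK_MAYEXEC;       /* forbid a later switch to exec */
        }
        return 0;
}

int main(void)
{
        unsigned long flags = SK_MAYEXEC;

        if (check_private_file_map(1, &flags) == 0)
                printf("mapping allowed, MAYEXEC now %s\n",
                       (flags & SK_MAYEXEC) ? "set" : "clear");
        return 0;
}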
diff --git a/mm/mmzone.c b/mm/mmzone.c
index febea1c9816..eb5838634f1 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -14,8 +14,6 @@ struct pglist_data *first_online_pgdat(void)
return NODE_DATA(first_online_node);
}
-EXPORT_UNUSED_SYMBOL(first_online_pgdat); /* June 2006 */
-
struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
{
int nid = next_online_node(pgdat->node_id);
@@ -24,8 +22,6 @@ struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
return NULL;
return NODE_DATA(nid);
}
-EXPORT_UNUSED_SYMBOL(next_online_pgdat); /* June 2006 */
-
/*
* next_zone - helper magic for for_each_zone()
@@ -45,5 +41,4 @@ struct zone *next_zone(struct zone *zone)
}
return zone;
}
-EXPORT_UNUSED_SYMBOL(next_zone); /* June 2006 */
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 955f9d0e38a..3b8f3c0c63f 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -34,6 +34,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
spinlock_t *ptl;
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ arch_enter_lazy_mmu_mode();
do {
oldpte = *pte;
if (pte_present(oldpte)) {
@@ -70,6 +71,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
}
} while (pte++, addr += PAGE_SIZE, addr != end);
+ arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);
}
diff --git a/mm/mremap.c b/mm/mremap.c
index 7c15cf3373a..9c769fa29f3 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -98,6 +98,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
new_ptl = pte_lockptr(mm, new_pmd);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+ arch_enter_lazy_mmu_mode();
for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
new_pte++, new_addr += PAGE_SIZE) {
@@ -109,6 +110,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
set_pte_at(mm, new_addr, new_pte, pte);
}
+ arch_leave_lazy_mmu_mode();
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
pte_unmap_nested(new_pte - 1);
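Both the mprotect and mremap hunks above bracket their PTE loops with arch_enter_lazy_mmu_mode()/arch_leave_lazy_mmu_mode(), letting an architecture queue the per-entry updates and flush them once at the end. The stubbed sketch below shows only that begin/queue/flush shape; it is not the arch interface itself:

#include <stdio.h>

static void enter_lazy_mode(void) { printf("start batching updates\n"); }
static void leave_lazy_mode(void) { printf("flush batched updates\n"); }
static void update_entry(int i)   { printf("  queue update for entry %d\n", i); }

int main(void)
{
        enter_lazy_mode();
        for (int i = 0; i < 4; i++)
                update_entry(i);
        leave_lazy_mode();
        return 0;
}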
diff --git a/mm/nommu.c b/mm/nommu.c
index d99dea31e44..23fb033e596 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -122,26 +122,50 @@ unsigned int kobjsize(const void *objp)
}
/*
- * The nommu dodgy version :-)
+ * get a list of pages in an address range belonging to the specified process
+ * and indicate the VMA that covers each page
+ * - this is potentially dodgy as we may end up incrementing the page count of a
+ * slab page or a secondary page from a compound page
+ * - don't permit access to VMAs that don't support it, such as I/O mappings
*/
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int len, int write, int force,
struct page **pages, struct vm_area_struct **vmas)
{
+ struct vm_area_struct *vma;
+ unsigned long vm_flags;
int i;
- static struct vm_area_struct dummy_vma;
+
+ /* calculate required read or write permissions.
+ * - if 'force' is set, we only require the "MAY" flags.
+ */
+ vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
+ vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
for (i = 0; i < len; i++) {
+ vma = find_vma(mm, start);
+ if (!vma)
+ goto finish_or_fault;
+
+ /* protect what we can, including chardevs */
+ if (vma->vm_flags & (VM_IO | VM_PFNMAP) ||
+ !(vm_flags & vma->vm_flags))
+ goto finish_or_fault;
+
if (pages) {
pages[i] = virt_to_page(start);
if (pages[i])
page_cache_get(pages[i]);
}
if (vmas)
- vmas[i] = &dummy_vma;
+ vmas[i] = vma;
start += PAGE_SIZE;
}
- return(i);
+
+ return i;
+
+finish_or_fault:
+ return i ? : -EFAULT;
}
EXPORT_SYMBOL(get_user_pages);
@@ -197,7 +221,7 @@ long vwrite(char *buf, char *addr, unsigned long count)
* Allocate enough pages to cover @size from the page level
* allocator and map them into contiguous kernel virtual space.
*
- * For tight cotrol over page level allocator and protection flags
+ * For tight control over page level allocator and protection flags
* use __vmalloc() instead.
*/
void *vmalloc(unsigned long size)
@@ -286,6 +310,77 @@ static void show_process_blocks(void)
}
#endif /* DEBUG */
+/*
+ * add a VMA into a process's mm_struct in the appropriate place in the list
+ * - should be called with mm->mmap_sem held writelocked
+ */
+static void add_vma_to_mm(struct mm_struct *mm, struct vm_list_struct *vml)
+{
+ struct vm_list_struct **ppv;
+
+ for (ppv = &current->mm->context.vmlist; *ppv; ppv = &(*ppv)->next)
+ if ((*ppv)->vma->vm_start > vml->vma->vm_start)
+ break;
+
+ vml->next = *ppv;
+ *ppv = vml;
+}
+
+/*
+ * look up the first VMA in which addr resides, NULL if none
+ * - should be called with mm->mmap_sem at least held readlocked
+ */
+struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+{
+ struct vm_list_struct *loop, *vml;
+
+ /* search the vm_start ordered list */
+ vml = NULL;
+ for (loop = mm->context.vmlist; loop; loop = loop->next) {
+ if (loop->vma->vm_start > addr)
+ break;
+ vml = loop;
+ }
+
+ if (vml && vml->vma->vm_end > addr)
+ return vml->vma;
+
+ return NULL;
+}
+EXPORT_SYMBOL(find_vma);
+
+/*
+ * find a VMA
+ * - we don't extend stack VMAs under NOMMU conditions
+ */
+struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
+{
+ return find_vma(mm, addr);
+}
+
+/*
+ * look up the first VMA that exactly matches addr
+ * - should be called with mm->mmap_sem at least held readlocked
+ */
+static inline struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
+ unsigned long addr)
+{
+ struct vm_list_struct *vml;
+
+ /* search the vm_start ordered list */
+ for (vml = mm->context.vmlist; vml; vml = vml->next) {
+ if (vml->vma->vm_start == addr)
+ return vml->vma;
+ if (vml->vma->vm_start > addr)
+ break;
+ }
+
+ return NULL;
+}
+
+/*
+ * find a VMA in the global tree
+ */
static inline struct vm_area_struct *find_nommu_vma(unsigned long start)
{
struct vm_area_struct *vma;
@@ -305,6 +400,9 @@ static inline struct vm_area_struct *find_nommu_vma(unsigned long start)
return NULL;
}
+/*
+ * add a VMA in the global tree
+ */
static void add_nommu_vma(struct vm_area_struct *vma)
{
struct vm_area_struct *pvma;
@@ -351,6 +449,9 @@ static void add_nommu_vma(struct vm_area_struct *vma)
rb_insert_color(&vma->vm_rb, &nommu_vma_tree);
}
+/*
+ * delete a VMA from the global list
+ */
static void delete_nommu_vma(struct vm_area_struct *vma)
{
struct address_space *mapping;
@@ -396,15 +497,17 @@ static int validate_mmap_request(struct file *file,
(flags & MAP_TYPE) != MAP_SHARED)
return -EINVAL;
- if (PAGE_ALIGN(len) == 0)
- return addr;
-
- if (len > TASK_SIZE)
+ if (!len)
return -EINVAL;
+ /* Careful about overflows.. */
+ len = PAGE_ALIGN(len);
+ if (!len || len > TASK_SIZE)
+ return -ENOMEM;
+
/* offset overflow? */
if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
- return -EINVAL;
+ return -EOVERFLOW;
if (file) {
/* validate file mapping requests */
@@ -420,7 +523,7 @@ static int validate_mmap_request(struct file *file,
*/
mapping = file->f_mapping;
if (!mapping)
- mapping = file->f_dentry->d_inode->i_mapping;
+ mapping = file->f_path.dentry->d_inode->i_mapping;
capabilities = 0;
if (mapping && mapping->backing_dev_info)
@@ -429,7 +532,7 @@ static int validate_mmap_request(struct file *file,
if (!capabilities) {
/* no explicit capabilities set, so assume some
* defaults */
- switch (file->f_dentry->d_inode->i_mode & S_IFMT) {
+ switch (file->f_path.dentry->d_inode->i_mode & S_IFMT) {
case S_IFREG:
case S_IFBLK:
capabilities = BDI_CAP_MAP_COPY;
@@ -460,11 +563,11 @@ static int validate_mmap_request(struct file *file,
!(file->f_mode & FMODE_WRITE))
return -EACCES;
- if (IS_APPEND(file->f_dentry->d_inode) &&
+ if (IS_APPEND(file->f_path.dentry->d_inode) &&
(file->f_mode & FMODE_WRITE))
return -EACCES;
- if (locks_verify_locked(file->f_dentry->d_inode))
+ if (locks_verify_locked(file->f_path.dentry->d_inode))
return -EAGAIN;
if (!(capabilities & BDI_CAP_MAP_DIRECT))
@@ -495,7 +598,7 @@ static int validate_mmap_request(struct file *file,
/* handle executable mappings and implied executable
* mappings */
- if (file->f_vfsmnt->mnt_flags & MNT_NOEXEC) {
+ if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
if (prot & PROT_EXEC)
return -EPERM;
}
@@ -705,10 +808,9 @@ unsigned long do_mmap_pgoff(struct file *file,
vm_flags = determine_vm_flags(file, prot, flags, capabilities);
/* we're going to need to record the mapping if it works */
- vml = kmalloc(sizeof(struct vm_list_struct), GFP_KERNEL);
+ vml = kzalloc(sizeof(struct vm_list_struct), GFP_KERNEL);
if (!vml)
goto error_getting_vml;
- memset(vml, 0, sizeof(*vml));
down_write(&nommu_vma_sem);
@@ -731,7 +833,7 @@ unsigned long do_mmap_pgoff(struct file *file,
continue;
/* search for overlapping mappings on the same file */
- if (vma->vm_file->f_dentry->d_inode != file->f_dentry->d_inode)
+ if (vma->vm_file->f_path.dentry->d_inode != file->f_path.dentry->d_inode)
continue;
if (vma->vm_pgoff >= pgoff + pglen)
@@ -784,11 +886,10 @@ unsigned long do_mmap_pgoff(struct file *file,
}
/* we're going to need a VMA struct as well */
- vma = kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
+ vma = kzalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
if (!vma)
goto error_getting_vma;
- memset(vma, 0, sizeof(*vma));
INIT_LIST_HEAD(&vma->anon_vma_node);
atomic_set(&vma->vm_usage, 1);
if (file)
@@ -828,8 +929,7 @@ unsigned long do_mmap_pgoff(struct file *file,
realalloc += kobjsize(vml);
askedalloc += sizeof(*vml);
- vml->next = current->mm->context.vmlist;
- current->mm->context.vmlist = vml;
+ add_vma_to_mm(current->mm, vml);
up_write(&nommu_vma_sem);
@@ -848,7 +948,8 @@ unsigned long do_mmap_pgoff(struct file *file,
up_write(&nommu_vma_sem);
kfree(vml);
if (vma) {
- fput(vma->vm_file);
+ if (vma->vm_file)
+ fput(vma->vm_file);
kfree(vma);
}
return ret;
@@ -908,6 +1009,11 @@ static void put_vma(struct vm_area_struct *vma)
}
}
+/*
+ * release a mapping
+ * - under NOMMU conditions the parameters must exactly match the mapping to
+ * be removed
+ */
int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
{
struct vm_list_struct *vml, **parent;
@@ -917,10 +1023,13 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
printk("do_munmap:\n");
#endif
- for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next)
+ for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next) {
+ if ((*parent)->vma->vm_start > addr)
+ break;
if ((*parent)->vma->vm_start == addr &&
((len == 0) || ((*parent)->vma->vm_end == end)))
goto found;
+ }
printk("munmap of non-mmaped memory by process %d (%s): %p\n",
current->pid, current->comm, (void *) addr);
@@ -946,7 +1055,20 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
return 0;
}
-/* Release all mmaps. */
+asmlinkage long sys_munmap(unsigned long addr, size_t len)
+{
+ int ret;
+ struct mm_struct *mm = current->mm;
+
+ down_write(&mm->mmap_sem);
+ ret = do_munmap(mm, addr, len);
+ up_write(&mm->mmap_sem);
+ return ret;
+}
+
+/*
+ * Release all mappings
+ */
void exit_mmap(struct mm_struct * mm)
{
struct vm_list_struct *tmp;
@@ -973,37 +1095,26 @@ void exit_mmap(struct mm_struct * mm)
}
}
-asmlinkage long sys_munmap(unsigned long addr, size_t len)
-{
- int ret;
- struct mm_struct *mm = current->mm;
-
- down_write(&mm->mmap_sem);
- ret = do_munmap(mm, addr, len);
- up_write(&mm->mmap_sem);
- return ret;
-}
-
unsigned long do_brk(unsigned long addr, unsigned long len)
{
return -ENOMEM;
}
/*
- * Expand (or shrink) an existing mapping, potentially moving it at the
- * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
+ * expand (or shrink) an existing mapping, potentially moving it at the same
+ * time (controlled by the MREMAP_MAYMOVE flag and available VM space)
*
- * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
- * This option implies MREMAP_MAYMOVE.
+ * under NOMMU conditions, we only permit changing a mapping's size, and only
+ * as long as it stays within the hole allocated by the kmalloc() call in
+ * do_mmap_pgoff() and the block is not shareable
*
- * on uClinux, we only permit changing a mapping's size, and only as long as it stays within the
- * hole allocated by the kmalloc() call in do_mmap_pgoff() and the block is not shareable
+ * MREMAP_FIXED is not supported under NOMMU conditions
*/
unsigned long do_mremap(unsigned long addr,
unsigned long old_len, unsigned long new_len,
unsigned long flags, unsigned long new_addr)
{
- struct vm_list_struct *vml = NULL;
+ struct vm_area_struct *vma;
/* insanity checks first */
if (new_len == 0)
@@ -1012,58 +1123,46 @@ unsigned long do_mremap(unsigned long addr,
if (flags & MREMAP_FIXED && new_addr != addr)
return (unsigned long) -EINVAL;
- for (vml = current->mm->context.vmlist; vml; vml = vml->next)
- if (vml->vma->vm_start == addr)
- goto found;
-
- return (unsigned long) -EINVAL;
+ vma = find_vma_exact(current->mm, addr);
+ if (!vma)
+ return (unsigned long) -EINVAL;
- found:
- if (vml->vma->vm_end != vml->vma->vm_start + old_len)
+ if (vma->vm_end != vma->vm_start + old_len)
return (unsigned long) -EFAULT;
- if (vml->vma->vm_flags & VM_MAYSHARE)
+ if (vma->vm_flags & VM_MAYSHARE)
return (unsigned long) -EPERM;
if (new_len > kobjsize((void *) addr))
return (unsigned long) -ENOMEM;
/* all checks complete - do it */
- vml->vma->vm_end = vml->vma->vm_start + new_len;
+ vma->vm_end = vma->vm_start + new_len;
askedalloc -= old_len;
askedalloc += new_len;
- return vml->vma->vm_start;
+ return vma->vm_start;
}
-/*
- * Look up the first VMA which satisfies addr < vm_end, NULL if none
- */
-struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+asmlinkage unsigned long sys_mremap(unsigned long addr,
+ unsigned long old_len, unsigned long new_len,
+ unsigned long flags, unsigned long new_addr)
{
- struct vm_list_struct *vml;
-
- for (vml = mm->context.vmlist; vml; vml = vml->next)
- if (addr >= vml->vma->vm_start && addr < vml->vma->vm_end)
- return vml->vma;
+ unsigned long ret;
- return NULL;
+ down_write(&current->mm->mmap_sem);
+ ret = do_mremap(addr, old_len, new_len, flags, new_addr);
+ up_write(&current->mm->mmap_sem);
+ return ret;
}
-EXPORT_SYMBOL(find_vma);
-
struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
unsigned int foll_flags)
{
return NULL;
}
-struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
-{
- return NULL;
-}
-
int remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
unsigned long to, unsigned long size, pgprot_t prot)
{
@@ -1206,3 +1305,44 @@ struct page *filemap_nopage(struct vm_area_struct *area,
BUG();
return NULL;
}
+
+/*
+ * Access another process' address space.
+ * - source/target buffer must be kernel space
+ */
+int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
+{
+ struct vm_area_struct *vma;
+ struct mm_struct *mm;
+
+ if (addr + len < addr)
+ return 0;
+
+ mm = get_task_mm(tsk);
+ if (!mm)
+ return 0;
+
+ down_read(&mm->mmap_sem);
+
+ /* the access must start within one of the target process's mappings */
+ vma = find_vma(mm, addr);
+ if (vma) {
+ /* don't overrun this mapping */
+ if (addr + len >= vma->vm_end)
+ len = vma->vm_end - addr;
+
+ /* only read or write mappings where it is permitted */
+ if (write && vma->vm_flags & VM_MAYWRITE)
+ len -= copy_to_user((void *) addr, buf, len);
+ else if (!write && vma->vm_flags & VM_MAYREAD)
+ len -= copy_from_user(buf, (void *) addr, len);
+ else
+ len = 0;
+ } else {
+ len = 0;
+ }
+
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+ return len;
+}
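The new nommu add_vma_to_mm()/find_vma()/find_vma_exact() above all rely on a singly linked list kept sorted by vm_start. A self-contained sketch of that insert-in-order and walk-until-past-the-address pattern, with simplified stand-in types:

#include <stdio.h>

struct region {
        unsigned long start, end;       /* covers [start, end) */
        struct region *next;
};

static struct region *head;

/* insert in start-address order, as add_vma_to_mm() does */
static void add_region(struct region *r)
{
        struct region **pp;

        for (pp = &head; *pp; pp = &(*pp)->next)
                if ((*pp)->start > r->start)
                        break;
        r->next = *pp;
        *pp = r;
}

/* walk until we pass addr, then check the last region seen covers it */
static struct region *find_region(unsigned long addr)
{
        struct region *r, *last = NULL;

        for (r = head; r; r = r->next) {
                if (r->start > addr)
                        break;
                last = r;
        }
        return (last && last->end > addr) ? last : NULL;
}

int main(void)
{
        struct region a = { 0x1000, 0x2000, NULL };
        struct region b = { 0x4000, 0x6000, NULL };

        add_region(&b);
        add_region(&a);
        printf("0x1800 -> %s\n", find_region(0x1800) ? "found" : "not found");
        printf("0x3000 -> %s\n", find_region(0x3000) ? "found" : "not found");
        return 0;
}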
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index bada3d03119..223d9ccb7d6 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -15,6 +15,7 @@
* kernel subsystems and hints as to where to find out what things do.
*/
+#include <linux/oom.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/swap.h>
@@ -204,16 +205,30 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
do_posix_clock_monotonic_gettime(&uptime);
do_each_thread(g, p) {
unsigned long points;
- int releasing;
- /* skip kernel threads */
+ /*
+ * skip kernel threads and tasks which have already released
+ * their mm.
+ */
if (!p->mm)
continue;
- /* skip the init task with pid == 1 */
- if (p->pid == 1)
+ /* skip the init task */
+ if (is_init(p))
continue;
/*
+ * This task already has access to memory reserves and is
+ * being killed. Don't allow any other task access to the
+ * memory reserve.
+ *
+ * Note: this may have a chance of deadlock if it gets
+ * blocked waiting for another task which itself is waiting
+ * for memory. Is there a better alternative?
+ */
+ if (test_tsk_thread_flag(p, TIF_MEMDIE))
+ return ERR_PTR(-1UL);
+
+ /*
* This is in the process of releasing memory so wait for it
* to finish before killing some other task by mistake.
*
@@ -221,21 +236,16 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
* go ahead if it is exiting: this will simply set TIF_MEMDIE,
* which will allow it to gain access to memory reserves in
* the process of exiting and releasing its resources.
- * Otherwise we could get an OOM deadlock.
+ * Otherwise we could get an easy OOM deadlock.
*/
- releasing = test_tsk_thread_flag(p, TIF_MEMDIE) ||
- p->flags & PF_EXITING;
- if (releasing) {
- /* PF_DEAD tasks have already released their mm */
- if (p->flags & PF_DEAD)
- continue;
- if (p->flags & PF_EXITING && p == current) {
- chosen = p;
- *ppoints = ULONG_MAX;
- break;
- }
- return ERR_PTR(-1UL);
+ if (p->flags & PF_EXITING) {
+ if (p != current)
+ return ERR_PTR(-1UL);
+
+ chosen = p;
+ *ppoints = ULONG_MAX;
}
+
if (p->oomkilladj == OOM_DISABLE)
continue;
@@ -245,6 +255,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
*ppoints = points;
}
} while_each_thread(g, p);
+
return chosen;
}
@@ -253,27 +264,22 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
* flag though it's unlikely that we select a process with CAP_SYS_RAW_IO
* set.
*/
-static void __oom_kill_task(struct task_struct *p, const char *message)
+static void __oom_kill_task(struct task_struct *p, int verbose)
{
- if (p->pid == 1) {
+ if (is_init(p)) {
WARN_ON(1);
printk(KERN_WARNING "tried to kill init!\n");
return;
}
- task_lock(p);
- if (!p->mm || p->mm == &init_mm) {
+ if (!p->mm) {
WARN_ON(1);
printk(KERN_WARNING "tried to kill an mm-less task!\n");
- task_unlock(p);
return;
}
- task_unlock(p);
- if (message) {
- printk(KERN_ERR "%s: Killed process %d (%s).\n",
- message, p->pid, p->comm);
- }
+ if (verbose)
+ printk(KERN_ERR "Killed process %d (%s)\n", p->pid, p->comm);
/*
* We give our sacrificial lamb high priority and access to
@@ -286,7 +292,7 @@ static void __oom_kill_task(struct task_struct *p, const char *message)
force_sig(SIGKILL, p);
}
-static int oom_kill_task(struct task_struct *p, const char *message)
+static int oom_kill_task(struct task_struct *p)
{
struct mm_struct *mm;
struct task_struct *g, *q;
@@ -302,18 +308,28 @@ static int oom_kill_task(struct task_struct *p, const char *message)
* However, this is of no concern to us.
*/
- if (mm == NULL || mm == &init_mm)
+ if (mm == NULL)
return 1;
- __oom_kill_task(p, message);
+ /*
+ * Don't kill the process if any threads are set to OOM_DISABLE
+ */
+ do_each_thread(g, q) {
+ if (q->mm == mm && p->oomkilladj == OOM_DISABLE)
+ return 1;
+ } while_each_thread(g, q);
+
+ __oom_kill_task(p, 1);
+
/*
* kill all processes that share the ->mm (i.e. all threads),
- * but are in a different thread group
+ * but are in a different thread group. Don't let them have access
+ * to memory reserves though, otherwise we might deplete all memory.
*/
- do_each_thread(g, q)
+ do_each_thread(g, q) {
if (q->mm == mm && q->tgid != p->tgid)
- __oom_kill_task(q, message);
- while_each_thread(g, q);
+ force_sig(SIGKILL, p);
+ } while_each_thread(g, q);
return 0;
}
@@ -329,21 +345,22 @@ static int oom_kill_process(struct task_struct *p, unsigned long points,
* its children or threads, just set TIF_MEMDIE so it can die quickly
*/
if (p->flags & PF_EXITING) {
- __oom_kill_task(p, NULL);
+ __oom_kill_task(p, 0);
return 0;
}
- printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li"
- " and children.\n", p->pid, p->comm, points);
+ printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n",
+ message, p->pid, p->comm, points);
+
/* Try to kill a child first */
list_for_each(tsk, &p->children) {
c = list_entry(tsk, struct task_struct, sibling);
if (c->mm == p->mm)
continue;
- if (!oom_kill_task(c, message))
+ if (!oom_kill_task(c))
return 0;
}
- return oom_kill_task(p, message);
+ return oom_kill_task(p);
}
static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
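A rough sketch of the filtering order the reworked select_bad_process() applies above: skip mm-less tasks and init, abort the whole selection if some task already holds TIF_MEMDIE, treat an exiting task specially, and only then fall through to badness() scoring. The struct fields and enum here are invented and deliberately simplify the real loop:

#include <stdio.h>

struct task {
        int has_mm, is_init, memdie, exiting, is_current;
        long points;
};

enum verdict { SKIP, ABORT, PICK_NOW, SCORE };

static enum verdict classify(const struct task *p)
{
        if (!p->has_mm || p->is_init)
                return SKIP;
        if (p->memdie)
                return ABORT;           /* someone is already being killed */
        if (p->exiting)
                return p->is_current ? PICK_NOW : ABORT;
        return SCORE;                   /* candidate for badness() scoring */
}

int main(void)
{
        struct task dying  = { 1, 0, 1, 0, 0, 0 };
        struct task normal = { 1, 0, 0, 0, 0, 42 };

        printf("dying  -> %d\n", classify(&dying));
        printf("normal -> %d\n", classify(&normal));
        return 0;
}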
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 555752907dc..237107c1b08 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1,5 +1,5 @@
/*
- * mm/page-writeback.c.
+ * mm/page-writeback.c
*
* Copyright (C) 2002, Linus Torvalds.
*
@@ -21,6 +21,7 @@
#include <linux/writeback.h>
#include <linux/init.h>
#include <linux/backing-dev.h>
+#include <linux/task_io_accounting_ops.h>
#include <linux/blkdev.h>
#include <linux/mpage.h>
#include <linux/rmap.h>
@@ -30,6 +31,8 @@
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/syscalls.h>
+#include <linux/buffer_head.h>
+#include <linux/pagevec.h>
/*
* The maximum number of pages to writeout in a single bdflush/kupdate
@@ -46,7 +49,6 @@
*/
static long ratelimit_pages = 32;
-static long total_pages; /* The total number of pages in the machine. */
static int dirty_exceeded __cacheline_aligned_in_smp; /* Dirty mem may be over limit */
/*
@@ -126,7 +128,7 @@ get_dirty_limits(long *pbackground, long *pdirty,
int unmapped_ratio;
long background;
long dirty;
- unsigned long available_memory = total_pages;
+ unsigned long available_memory = vm_total_pages;
struct task_struct *tsk;
#ifdef CONFIG_HIGHMEM
@@ -141,7 +143,7 @@ get_dirty_limits(long *pbackground, long *pdirty,
unmapped_ratio = 100 - ((global_page_state(NR_FILE_MAPPED) +
global_page_state(NR_ANON_PAGES)) * 100) /
- total_pages;
+ vm_total_pages;
dirty_ratio = vm_dirty_ratio;
if (dirty_ratio > unmapped_ratio / 2)
@@ -221,7 +223,7 @@ static void balance_dirty_pages(struct address_space *mapping)
if (pages_written >= write_chunk)
break; /* We've done our duty */
}
- blk_congestion_wait(WRITE, HZ/10);
+ congestion_wait(WRITE, HZ/10);
}
if (nr_reclaimable + global_page_state(NR_WRITEBACK)
@@ -313,7 +315,7 @@ void throttle_vm_writeout(void)
if (global_page_state(NR_UNSTABLE_NFS) +
global_page_state(NR_WRITEBACK) <= dirty_thresh)
break;
- blk_congestion_wait(WRITE, HZ/10);
+ congestion_wait(WRITE, HZ/10);
}
}
@@ -350,7 +352,7 @@ static void background_writeout(unsigned long _min_pages)
min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
/* Wrote less than expected */
- blk_congestion_wait(WRITE, HZ/10);
+ congestion_wait(WRITE, HZ/10);
if (!wbc.encountered_congestion)
break;
}
@@ -421,7 +423,7 @@ static void wb_kupdate(unsigned long arg)
writeback_inodes(&wbc);
if (wbc.nr_to_write > 0) {
if (wbc.encountered_congestion)
- blk_congestion_wait(WRITE, HZ/10);
+ congestion_wait(WRITE, HZ/10);
else
break; /* All the old data is written */
}
@@ -502,9 +504,9 @@ void laptop_sync_completion(void)
* will write six megabyte chunks, max.
*/
-static void set_ratelimit(void)
+void writeback_set_ratelimit(void)
{
- ratelimit_pages = total_pages / (num_online_cpus() * 32);
+ ratelimit_pages = vm_total_pages / (num_online_cpus() * 32);
if (ratelimit_pages < 16)
ratelimit_pages = 16;
if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024)
@@ -514,7 +516,7 @@ static void set_ratelimit(void)
static int __cpuinit
ratelimit_handler(struct notifier_block *self, unsigned long u, void *v)
{
- set_ratelimit();
+ writeback_set_ratelimit();
return 0;
}
@@ -533,9 +535,7 @@ void __init page_writeback_init(void)
long buffer_pages = nr_free_buffer_pages();
long correction;
- total_pages = nr_free_pagecache_pages();
-
- correction = (100 * 4 * buffer_pages) / total_pages;
+ correction = (100 * 4 * buffer_pages) / vm_total_pages;
if (correction < 100) {
dirty_background_ratio *= correction;
@@ -549,10 +549,143 @@ void __init page_writeback_init(void)
vm_dirty_ratio = 1;
}
mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
- set_ratelimit();
+ writeback_set_ratelimit();
register_cpu_notifier(&ratelimit_nb);
}
+/**
+ * generic_writepages - walk the list of dirty pages of the given
+ * address space and writepage() all of them.
+ *
+ * @mapping: address space structure to write
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ *
+ * This is a library function, which implements the writepages()
+ * address_space_operation.
+ *
+ * If a page is already under I/O, generic_writepages() skips it, even
+ * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
+ * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
+ * and msync() need to guarantee that all the data which was dirty at the time
+ * the call was made get new I/O started against them. If wbc->sync_mode is
+ * WB_SYNC_ALL then we were called for data integrity and we must wait for
+ * existing IO to complete.
+ *
+ * Derived from mpage_writepages() - if you fix this you should check that
+ * also!
+ */
+int generic_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct backing_dev_info *bdi = mapping->backing_dev_info;
+ int ret = 0;
+ int done = 0;
+ int (*writepage)(struct page *page, struct writeback_control *wbc);
+ struct pagevec pvec;
+ int nr_pages;
+ pgoff_t index;
+ pgoff_t end; /* Inclusive */
+ int scanned = 0;
+ int range_whole = 0;
+
+ if (wbc->nonblocking && bdi_write_congested(bdi)) {
+ wbc->encountered_congestion = 1;
+ return 0;
+ }
+
+ writepage = mapping->a_ops->writepage;
+
+ /* deal with chardevs and other special file */
+ if (!writepage)
+ return 0;
+
+ pagevec_init(&pvec, 0);
+ if (wbc->range_cyclic) {
+ index = mapping->writeback_index; /* Start from prev offset */
+ end = -1;
+ } else {
+ index = wbc->range_start >> PAGE_CACHE_SHIFT;
+ end = wbc->range_end >> PAGE_CACHE_SHIFT;
+ if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+ range_whole = 1;
+ scanned = 1;
+ }
+retry:
+ while (!done && (index <= end) &&
+ (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_DIRTY,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+ unsigned i;
+
+ scanned = 1;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ /*
+ * At this point we hold neither mapping->tree_lock nor
+ * lock on the page itself: the page may be truncated or
+ * invalidated (changing page->mapping to NULL), or even
+ * swizzled back from swapper_space to tmpfs file
+ * mapping
+ */
+ lock_page(page);
+
+ if (unlikely(page->mapping != mapping)) {
+ unlock_page(page);
+ continue;
+ }
+
+ if (!wbc->range_cyclic && page->index > end) {
+ done = 1;
+ unlock_page(page);
+ continue;
+ }
+
+ if (wbc->sync_mode != WB_SYNC_NONE)
+ wait_on_page_writeback(page);
+
+ if (PageWriteback(page) ||
+ !clear_page_dirty_for_io(page)) {
+ unlock_page(page);
+ continue;
+ }
+
+ ret = (*writepage)(page, wbc);
+ if (ret) {
+ if (ret == -ENOSPC)
+ set_bit(AS_ENOSPC, &mapping->flags);
+ else
+ set_bit(AS_EIO, &mapping->flags);
+ }
+
+ if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE))
+ unlock_page(page);
+ if (ret || (--(wbc->nr_to_write) <= 0))
+ done = 1;
+ if (wbc->nonblocking && bdi_write_congested(bdi)) {
+ wbc->encountered_congestion = 1;
+ done = 1;
+ }
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+ if (!scanned && !done) {
+ /*
+ * We hit the last page and there is more work to be done: wrap
+ * back to the start of the file
+ */
+ scanned = 1;
+ index = 0;
+ goto retry;
+ }
+ if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+ mapping->writeback_index = index;
+ return ret;
+}
+
+EXPORT_SYMBOL(generic_writepages);
+
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
int ret;
@@ -629,23 +762,24 @@ int __set_page_dirty_nobuffers(struct page *page)
struct address_space *mapping = page_mapping(page);
struct address_space *mapping2;
- if (mapping) {
- write_lock_irq(&mapping->tree_lock);
- mapping2 = page_mapping(page);
- if (mapping2) { /* Race with truncate? */
- BUG_ON(mapping2 != mapping);
- if (mapping_cap_account_dirty(mapping))
- __inc_zone_page_state(page,
- NR_FILE_DIRTY);
- radix_tree_tag_set(&mapping->page_tree,
- page_index(page), PAGECACHE_TAG_DIRTY);
- }
- write_unlock_irq(&mapping->tree_lock);
- if (mapping->host) {
- /* !PageAnon && !swapper_space */
- __mark_inode_dirty(mapping->host,
- I_DIRTY_PAGES);
+ if (!mapping)
+ return 1;
+
+ write_lock_irq(&mapping->tree_lock);
+ mapping2 = page_mapping(page);
+ if (mapping2) { /* Race with truncate? */
+ BUG_ON(mapping2 != mapping);
+ if (mapping_cap_account_dirty(mapping)) {
+ __inc_zone_page_state(page, NR_FILE_DIRTY);
+ task_io_account_write(PAGE_CACHE_SIZE);
}
+ radix_tree_tag_set(&mapping->page_tree,
+ page_index(page), PAGECACHE_TAG_DIRTY);
+ }
+ write_unlock_irq(&mapping->tree_lock);
+ if (mapping->host) {
+ /* !PageAnon && !swapper_space */
+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}
return 1;
}
@@ -675,9 +809,11 @@ int fastcall set_page_dirty(struct page *page)
if (likely(mapping)) {
int (*spd)(struct page *) = mapping->a_ops->set_page_dirty;
- if (spd)
- return (*spd)(page);
- return __set_page_dirty_buffers(page);
+#ifdef CONFIG_BLOCK
+ if (!spd)
+ spd = __set_page_dirty_buffers;
+#endif
+ return (*spd)(page);
}
if (!PageDirty(page)) {
if (!TestSetPageDirty(page))
@@ -717,27 +853,26 @@ int test_clear_page_dirty(struct page *page)
struct address_space *mapping = page_mapping(page);
unsigned long flags;
- if (mapping) {
- write_lock_irqsave(&mapping->tree_lock, flags);
- if (TestClearPageDirty(page)) {
- radix_tree_tag_clear(&mapping->page_tree,
- page_index(page),
- PAGECACHE_TAG_DIRTY);
- write_unlock_irqrestore(&mapping->tree_lock, flags);
- /*
- * We can continue to use `mapping' here because the
- * page is locked, which pins the address_space
- */
- if (mapping_cap_account_dirty(mapping)) {
- page_mkclean(page);
- dec_zone_page_state(page, NR_FILE_DIRTY);
- }
- return 1;
- }
+ if (!mapping)
+ return TestClearPageDirty(page);
+
+ write_lock_irqsave(&mapping->tree_lock, flags);
+ if (TestClearPageDirty(page)) {
+ radix_tree_tag_clear(&mapping->page_tree,
+ page_index(page), PAGECACHE_TAG_DIRTY);
write_unlock_irqrestore(&mapping->tree_lock, flags);
- return 0;
+ /*
+ * We can continue to use `mapping' here because the
+ * page is locked, which pins the address_space
+ */
+ if (mapping_cap_account_dirty(mapping)) {
+ page_mkclean(page);
+ dec_zone_page_state(page, NR_FILE_DIRTY);
+ }
+ return 1;
}
- return TestClearPageDirty(page);
+ write_unlock_irqrestore(&mapping->tree_lock, flags);
+ return 0;
}
EXPORT_SYMBOL(test_clear_page_dirty);
@@ -759,17 +894,17 @@ int clear_page_dirty_for_io(struct page *page)
{
struct address_space *mapping = page_mapping(page);
- if (mapping) {
- if (TestClearPageDirty(page)) {
- if (mapping_cap_account_dirty(mapping)) {
- page_mkclean(page);
- dec_zone_page_state(page, NR_FILE_DIRTY);
- }
- return 1;
+ if (!mapping)
+ return TestClearPageDirty(page);
+
+ if (TestClearPageDirty(page)) {
+ if (mapping_cap_account_dirty(mapping)) {
+ page_mkclean(page);
+ dec_zone_page_state(page, NR_FILE_DIRTY);
}
- return 0;
+ return 1;
}
- return TestClearPageDirty(page);
+ return 0;
}
EXPORT_SYMBOL(clear_page_dirty_for_io);
@@ -822,15 +957,6 @@ int test_set_page_writeback(struct page *page)
EXPORT_SYMBOL(test_set_page_writeback);
/*
- * Wakes up tasks that are being throttled due to writeback congestion
- */
-void writeback_congestion_end(void)
-{
- blk_congestion_end(WRITE);
-}
-EXPORT_SYMBOL(writeback_congestion_end);
-
-/*
* Return true if any of the pages in the mapping are marked with the
* passed tag.
*/
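A worked example of the writeback_set_ratelimit() calculation this file now exports (previously the static set_ratelimit()): total pages divided by 32 per online CPU, clamped to at least 16 pages and at most 4MB worth. PAGE_CACHE_SIZE is assumed to be 4096 bytes in this sketch:

#include <stdio.h>

#define SK_PAGE_SIZE 4096UL

static unsigned long set_ratelimit(unsigned long total_pages, unsigned int ncpus)
{
        unsigned long ratelimit = total_pages / (ncpus * 32);

        if (ratelimit < 16)
                ratelimit = 16;
        if (ratelimit * SK_PAGE_SIZE > 4096 * 1024)
                ratelimit = (4096 * 1024) / SK_PAGE_SIZE;
        return ratelimit;
}

int main(void)
{
        /* 1GB of RAM (262144 pages) on 4 CPUs: 2048 pages, clamped to 1024 (4MB) */
        printf("%lu pages\n", set_ratelimit(262144, 4));
        return 0;
}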
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9810f0a60db..e6b17b2989e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -37,6 +37,10 @@
#include <linux/vmalloc.h>
#include <linux/mempolicy.h>
#include <linux/stop_machine.h>
+#include <linux/sort.h>
+#include <linux/pfn.h>
+#include <linux/backing-dev.h>
+#include <linux/fault-inject.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -80,14 +84,7 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
EXPORT_SYMBOL(totalram_pages);
-/*
- * Used by page_zone() to look up the address of the struct zone whose
- * id is encoded in the upper bits of page->flags
- */
-struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
-EXPORT_SYMBOL(zone_table);
-
-static char *zone_names[MAX_NR_ZONES] = {
+static char * const zone_names[MAX_NR_ZONES] = {
"DMA",
#ifdef CONFIG_ZONE_DMA32
"DMA32",
@@ -102,6 +99,38 @@ int min_free_kbytes = 1024;
unsigned long __meminitdata nr_kernel_pages;
unsigned long __meminitdata nr_all_pages;
+static unsigned long __initdata dma_reserve;
+
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+ /*
+ * MAX_ACTIVE_REGIONS determines the maximum number of distinct
+ * ranges of memory (RAM) that may be registered with add_active_range().
+ * Ranges passed to add_active_range() will be merged if possible
+ * so the number of times add_active_range() can be called is
+ * related to the number of nodes and the number of holes
+ */
+ #ifdef CONFIG_MAX_ACTIVE_REGIONS
+ /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */
+ #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
+ #else
+ #if MAX_NUMNODES >= 32
+ /* If there can be many nodes, allow up to 50 holes per node */
+ #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50)
+ #else
+ /* By default, allow up to 256 distinct regions */
+ #define MAX_ACTIVE_REGIONS 256
+ #endif
+ #endif
+
+ struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS];
+ int __initdata nr_nodemap_entries;
+ unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
+ unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
+#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
+ unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES];
+ unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES];
+#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
+#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
#ifdef CONFIG_DEBUG_VM
static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
@@ -202,7 +231,7 @@ static void prep_compound_page(struct page *page, unsigned long order)
int i;
int nr_pages = 1 << order;
- page[1].lru.next = (void *)free_compound_page; /* set dtor */
+ set_compound_page_dtor(page, free_compound_page);
page[1].lru.prev = (void *)order;
for (i = 0; i < nr_pages; i++) {
struct page *p = page + i;
@@ -451,7 +480,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order)
spin_lock(&zone->lock);
zone->all_unreclaimable = 0;
zone->pages_scanned = 0;
- __free_one_page(page, zone ,order);
+ __free_one_page(page, zone, order);
spin_unlock(&zone->lock);
}
@@ -461,17 +490,16 @@ static void __free_pages_ok(struct page *page, unsigned int order)
int i;
int reserved = 0;
- arch_free_page(page, order);
- if (!PageHighMem(page))
- debug_check_no_locks_freed(page_address(page),
- PAGE_SIZE<<order);
-
for (i = 0 ; i < (1 << order) ; ++i)
reserved += free_pages_check(page + i);
if (reserved)
return;
+ if (!PageHighMem(page))
+ debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
+ arch_free_page(page, order);
kernel_map_pages(page, 1 << order, 0);
+
local_irq_save(flags);
__count_vm_events(PGFREE, 1 << order);
free_one_page(page_zone(page), page, order);
@@ -571,6 +599,8 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
1 << PG_checked | 1 << PG_mappedtodisk);
set_page_private(page, 0);
set_page_refcounted(page);
+
+ arch_alloc_page(page, order);
kernel_map_pages(page, 1 << order, 1);
if (gfp_flags & __GFP_ZERO)
@@ -656,9 +686,15 @@ void drain_node_pages(int nodeid)
pcp = &pset->pcp[i];
if (pcp->count) {
+ int to_drain;
+
local_irq_save(flags);
- free_pages_bulk(zone, pcp->count, &pcp->list, 0);
- pcp->count = 0;
+ if (pcp->count >= pcp->batch)
+ to_drain = pcp->batch;
+ else
+ to_drain = pcp->count;
+ free_pages_bulk(zone, to_drain, &pcp->list, 0);
+ pcp->count -= to_drain;
local_irq_restore(flags);
}
}
@@ -666,7 +702,6 @@ void drain_node_pages(int nodeid)
}
#endif
-#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
static void __drain_pages(unsigned int cpu)
{
unsigned long flags;
@@ -688,7 +723,6 @@ static void __drain_pages(unsigned int cpu)
}
}
}
-#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */
#ifdef CONFIG_PM
@@ -747,13 +781,14 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
struct per_cpu_pages *pcp;
unsigned long flags;
- arch_free_page(page, 0);
-
if (PageAnon(page))
page->mapping = NULL;
if (free_pages_check(page))
return;
+ if (!PageHighMem(page))
+ debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
+ arch_free_page(page, 0);
kernel_map_pages(page, 1, 0);
pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
@@ -818,7 +853,7 @@ again:
pcp = &zone_pcp(zone, cpu)->pcp[cold];
local_irq_save(flags);
if (!pcp->count) {
- pcp->count += rmqueue_bulk(zone, 0,
+ pcp->count = rmqueue_bulk(zone, 0,
pcp->batch, &pcp->list);
if (unlikely(!pcp->count))
goto failed;
@@ -858,6 +893,91 @@ failed:
#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
+#ifdef CONFIG_FAIL_PAGE_ALLOC
+
+static struct fail_page_alloc_attr {
+ struct fault_attr attr;
+
+ u32 ignore_gfp_highmem;
+ u32 ignore_gfp_wait;
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+ struct dentry *ignore_gfp_highmem_file;
+ struct dentry *ignore_gfp_wait_file;
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
+
+} fail_page_alloc = {
+ .attr = FAULT_ATTR_INITIALIZER,
+ .ignore_gfp_wait = 1,
+ .ignore_gfp_highmem = 1,
+};
+
+static int __init setup_fail_page_alloc(char *str)
+{
+ return setup_fault_attr(&fail_page_alloc.attr, str);
+}
+__setup("fail_page_alloc=", setup_fail_page_alloc);
+
+static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+{
+ if (gfp_mask & __GFP_NOFAIL)
+ return 0;
+ if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
+ return 0;
+ if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
+ return 0;
+
+ return should_fail(&fail_page_alloc.attr, 1 << order);
+}
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+static int __init fail_page_alloc_debugfs(void)
+{
+ mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+ struct dentry *dir;
+ int err;
+
+ err = init_fault_attr_dentries(&fail_page_alloc.attr,
+ "fail_page_alloc");
+ if (err)
+ return err;
+ dir = fail_page_alloc.attr.dentries.dir;
+
+ fail_page_alloc.ignore_gfp_wait_file =
+ debugfs_create_bool("ignore-gfp-wait", mode, dir,
+ &fail_page_alloc.ignore_gfp_wait);
+
+ fail_page_alloc.ignore_gfp_highmem_file =
+ debugfs_create_bool("ignore-gfp-highmem", mode, dir,
+ &fail_page_alloc.ignore_gfp_highmem);
+
+ if (!fail_page_alloc.ignore_gfp_wait_file ||
+ !fail_page_alloc.ignore_gfp_highmem_file) {
+ err = -ENOMEM;
+ debugfs_remove(fail_page_alloc.ignore_gfp_wait_file);
+ debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file);
+ cleanup_fault_attr_dentries(&fail_page_alloc.attr);
+ }
+
+ return err;
+}
+
+late_initcall(fail_page_alloc_debugfs);
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
+
+#else /* CONFIG_FAIL_PAGE_ALLOC */
+
+static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+{
+ return 0;
+}
+
+#endif /* CONFIG_FAIL_PAGE_ALLOC */
+
/*
* Return 1 if free pages are above 'mark'. This takes into account the order
* of the allocation.
@@ -866,7 +986,8 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
int classzone_idx, int alloc_flags)
{
/* free_pages may go negative - that's OK */
- long min = mark, free_pages = z->free_pages - (1 << order) + 1;
+ unsigned long min = mark;
+ long free_pages = z->free_pages - (1 << order) + 1;
int o;
if (alloc_flags & ALLOC_HIGH)
@@ -889,31 +1010,160 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
return 1;
}
+#ifdef CONFIG_NUMA
/*
- * get_page_from_freeliest goes through the zonelist trying to allocate
+ * zlc_setup - Setup for "zonelist cache". Uses cached zone data to
+ * skip over zones that are not allowed by the cpuset, or that have
+ * been recently (in last second) found to be nearly full. See further
+ * comments in mmzone.h. Reduces cache footprint of zonelist scans
+ * that have to skip over a lot of full or unallowed zones.
+ *
+ * If the zonelist cache is present in the passed in zonelist, then
+ * returns a pointer to the allowed node mask (either the current
+ * task's mems_allowed, or node_online_map.)
+ *
+ * If the zonelist cache is not available for this zonelist, does
+ * nothing and returns NULL.
+ *
+ * If the fullzones BITMAP in the zonelist cache is stale (more than
+ * a second since last zap'd) then we zap it out (clear its bits.)
+ *
+ * We hold off even calling zlc_setup, until after we've checked the
+ * first zone in the zonelist, on the theory that most allocations will
+ * be satisfied from that first zone, so best to examine that zone as
+ * quickly as we can.
+ */
+static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
+{
+ struct zonelist_cache *zlc; /* cached zonelist speedup info */
+ nodemask_t *allowednodes; /* zonelist_cache approximation */
+
+ zlc = zonelist->zlcache_ptr;
+ if (!zlc)
+ return NULL;
+
+ if (jiffies - zlc->last_full_zap > 1 * HZ) {
+ bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+ zlc->last_full_zap = jiffies;
+ }
+
+ allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
+ &cpuset_current_mems_allowed :
+ &node_online_map;
+ return allowednodes;
+}
+
+/*
+ * Given 'z' scanning a zonelist, run a couple of quick checks to see
+ * if it is worth looking at further for free memory:
+ * 1) Check that the zone isn't thought to be full (doesn't have its
+ * bit set in the zonelist_cache fullzones BITMAP).
+ * 2) Check that the zones node (obtained from the zonelist_cache
+ * z_to_n[] mapping) is allowed in the passed in allowednodes mask.
+ * Return true (non-zero) if zone is worth looking at further, or
+ * else return false (zero) if it is not.
+ *
+ * This check -ignores- the distinction between various watermarks,
+ * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is
+ * found to be full for any variation of these watermarks, it will
+ * be considered full for up to one second by all requests, unless
+ * we are so low on memory on all allowed nodes that we are forced
+ * into the second scan of the zonelist.
+ *
+ * In the second scan we ignore this zonelist cache and exactly
+ * apply the watermarks to all zones, even if it is slower to do so.
+ * We are low on memory in the second scan, and should leave no stone
+ * unturned looking for a free page.
+ */
+static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
+ nodemask_t *allowednodes)
+{
+ struct zonelist_cache *zlc; /* cached zonelist speedup info */
+ int i; /* index of *z in zonelist zones */
+ int n; /* node that zone *z is on */
+
+ zlc = zonelist->zlcache_ptr;
+ if (!zlc)
+ return 1;
+
+ i = z - zonelist->zones;
+ n = zlc->z_to_n[i];
+
+ /* This zone is worth trying if it is allowed but not full */
+ return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
+}
+
+/*
+ * Given 'z' scanning a zonelist, set the corresponding bit in
+ * zlc->fullzones, so that subsequent attempts to allocate a page
+ * from that zone don't waste time re-examining it.
+ */
+static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
+{
+ struct zonelist_cache *zlc; /* cached zonelist speedup info */
+ int i; /* index of *z in zonelist zones */
+
+ zlc = zonelist->zlcache_ptr;
+ if (!zlc)
+ return;
+
+ i = z - zonelist->zones;
+
+ set_bit(i, zlc->fullzones);
+}
+
+#else /* CONFIG_NUMA */
+
+static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
+{
+ return NULL;
+}
+
+static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
+ nodemask_t *allowednodes)
+{
+ return 1;
+}
+
+static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
+{
+}
+#endif /* CONFIG_NUMA */
+
+/*
+ * get_page_from_freelist goes through the zonelist trying to allocate
* a page.
*/
static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, int alloc_flags)
{
- struct zone **z = zonelist->zones;
+ struct zone **z;
struct page *page = NULL;
- int classzone_idx = zone_idx(*z);
+ int classzone_idx = zone_idx(zonelist->zones[0]);
struct zone *zone;
+ nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
+ int zlc_active = 0; /* set if using zonelist_cache */
+ int did_zlc_setup = 0; /* just call zlc_setup() one time */
+zonelist_scan:
/*
- * Go through the zonelist once, looking for a zone with enough free.
+ * Scan zonelist, looking for a zone with enough free.
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
+ z = zonelist->zones;
+
do {
+ if (NUMA_BUILD && zlc_active &&
+ !zlc_zone_worth_trying(zonelist, z, allowednodes))
+ continue;
zone = *z;
- if (unlikely((gfp_mask & __GFP_THISNODE) &&
+ if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
break;
if ((alloc_flags & ALLOC_CPUSET) &&
- !cpuset_zone_allowed(zone, gfp_mask))
- continue;
+ !cpuset_zone_allowed(zone, gfp_mask))
+ goto try_next_zone;
if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
unsigned long mark;
@@ -923,18 +1173,34 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
mark = zone->pages_low;
else
mark = zone->pages_high;
- if (!zone_watermark_ok(zone , order, mark,
- classzone_idx, alloc_flags))
+ if (!zone_watermark_ok(zone, order, mark,
+ classzone_idx, alloc_flags)) {
if (!zone_reclaim_mode ||
!zone_reclaim(zone, gfp_mask, order))
- continue;
+ goto this_zone_full;
+ }
}
page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
- if (page) {
+ if (page)
break;
+this_zone_full:
+ if (NUMA_BUILD)
+ zlc_mark_zone_full(zonelist, z);
+try_next_zone:
+ if (NUMA_BUILD && !did_zlc_setup) {
+ /* we do zlc_setup after the first zone is tried */
+ allowednodes = zlc_setup(zonelist, alloc_flags);
+ zlc_active = 1;
+ did_zlc_setup = 1;
}
} while (*(++z) != NULL);
+
+ if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
+ /* Disable zlc cache for second zonelist scan */
+ zlc_active = 0;
+ goto zonelist_scan;
+ }
return page;
}
@@ -956,6 +1222,9 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
might_sleep_if(wait);
+ if (should_fail_alloc_page(gfp_mask, order))
+ return NULL;
+
restart:
z = zonelist->zones; /* the list of zones suitable for gfp_mask */
@@ -969,9 +1238,19 @@ restart:
if (page)
goto got_pg;
- do {
+ /*
+ * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
+ * __GFP_NOWARN set) should not cause reclaim since the subsystem
+ * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
+ * using a larger set of nodes after it has established that the
+ * allowed per node queues are empty and that nodes are
+ * over allocated.
+ */
+ if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
+ goto nopage;
+
+ for (z = zonelist->zones; *z; z++)
wakeup_kswapd(*z, order);
- } while (*(++z));
/*
* OK, we're below the kswapd watermark and have kicked background
@@ -1005,6 +1284,7 @@ restart:
/* This allocation should allow future memory freeing. */
+rebalance:
if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
&& !in_interrupt()) {
if (!(gfp_mask & __GFP_NOMEMALLOC)) {
@@ -1015,7 +1295,7 @@ nofail_alloc:
if (page)
goto got_pg;
if (gfp_mask & __GFP_NOFAIL) {
- blk_congestion_wait(WRITE, HZ/50);
+ congestion_wait(WRITE, HZ/50);
goto nofail_alloc;
}
}
@@ -1026,7 +1306,6 @@ nofail_alloc:
if (!wait)
goto nopage;
-rebalance:
cond_resched();
/* We now go into synchronous reclaim */
@@ -1078,7 +1357,7 @@ rebalance:
do_retry = 1;
}
if (do_retry) {
- blk_congestion_wait(WRITE, HZ/50);
+ congestion_wait(WRITE, HZ/50);
goto rebalance;
}
@@ -1222,14 +1501,12 @@ unsigned int nr_free_pagecache_pages(void)
{
return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER));
}
-#ifdef CONFIG_NUMA
-static void show_node(struct zone *zone)
+
+static inline void show_node(struct zone *zone)
{
- printk("Node %ld ", zone_to_nid(zone));
+ if (NUMA_BUILD)
+ printk("Node %d ", zone_to_nid(zone));
}
-#else
-#define show_node(zone) do { } while (0)
-#endif
void si_meminfo(struct sysinfo *val)
{
@@ -1271,34 +1548,30 @@ void si_meminfo_node(struct sysinfo *val, int nid)
*/
void show_free_areas(void)
{
- int cpu, temperature;
+ int cpu;
unsigned long active;
unsigned long inactive;
unsigned long free;
struct zone *zone;
for_each_zone(zone) {
- show_node(zone);
- printk("%s per-cpu:", zone->name);
-
- if (!populated_zone(zone)) {
- printk(" empty\n");
+ if (!populated_zone(zone))
continue;
- } else
- printk("\n");
+
+ show_node(zone);
+ printk("%s per-cpu:\n", zone->name);
for_each_online_cpu(cpu) {
struct per_cpu_pageset *pageset;
pageset = zone_pcp(zone, cpu);
- for (temperature = 0; temperature < 2; temperature++)
- printk("cpu %d %s: high %d, batch %d used:%d\n",
- cpu,
- temperature ? "cold" : "hot",
- pageset->pcp[temperature].high,
- pageset->pcp[temperature].batch,
- pageset->pcp[temperature].count);
+ printk("CPU %4d: Hot: hi:%5d, btch:%4d usd:%4d "
+ "Cold: hi:%5d, btch:%4d usd:%4d\n",
+ cpu, pageset->pcp[0].high,
+ pageset->pcp[0].batch, pageset->pcp[0].count,
+ pageset->pcp[1].high, pageset->pcp[1].batch,
+ pageset->pcp[1].count);
}
}
@@ -1320,6 +1593,9 @@ void show_free_areas(void)
for_each_zone(zone) {
int i;
+ if (!populated_zone(zone))
+ continue;
+
show_node(zone);
printk("%s"
" free:%lukB"
@@ -1352,12 +1628,11 @@ void show_free_areas(void)
for_each_zone(zone) {
unsigned long nr[MAX_ORDER], flags, order, total = 0;
+ if (!populated_zone(zone))
+ continue;
+
show_node(zone);
printk("%s: ", zone->name);
- if (!populated_zone(zone)) {
- printk("empty\n");
- continue;
- }
spin_lock_irqsave(&zone->lock, flags);
for (order = 0; order < MAX_ORDER; order++) {
@@ -1510,6 +1785,24 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
}
}
+/* Construct the zonelist performance cache - see further mmzone.h */
+static void __meminit build_zonelist_cache(pg_data_t *pgdat)
+{
+ int i;
+
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ struct zonelist *zonelist;
+ struct zonelist_cache *zlc;
+ struct zone **z;
+
+ zonelist = pgdat->node_zonelists + i;
+ zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
+ bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+ for (z = zonelist->zones; *z; z++)
+ zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z);
+ }
+}
+
#else /* CONFIG_NUMA */
static void __meminit build_zonelists(pg_data_t *pgdat)
@@ -1547,21 +1840,33 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
}
}
+/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
+static void __meminit build_zonelist_cache(pg_data_t *pgdat)
+{
+ int i;
+
+ for (i = 0; i < MAX_NR_ZONES; i++)
+ pgdat->node_zonelists[i].zlcache_ptr = NULL;
+}
+
#endif /* CONFIG_NUMA */
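
The cache built by build_zonelist_cache() records, for each zonelist slot, which node the zone sits on (z_to_n[]) and whether it was recently found full (fullzones). A simplified standalone model of how an allocation pass might consult it; the field names follow the diff but the sizes and helpers are invented:

#include <stdbool.h>
#include <stdio.h>

#define MAX_ENTRIES 8

struct zonelist_cache_sketch {
	int  z_to_n[MAX_ENTRIES];	/* zonelist index -> node id */
	bool fullzones[MAX_ENTRIES];	/* set when a zone fails its watermark */
};

static bool worth_trying(const struct zonelist_cache_sketch *zlc, int i,
			 const bool *allowed_nodes)
{
	if (zlc->fullzones[i])
		return false;			/* recently seen full */
	return allowed_nodes[zlc->z_to_n[i]];	/* node allowed (e.g. by cpuset)? */
}

int main(void)
{
	struct zonelist_cache_sketch zlc = {
		.z_to_n    = { 0, 0, 1, 1 },
		.fullzones = { false, true, false, false },
	};
	bool allowed[2] = { true, false };	/* pretend only node 0 is allowed */

	for (int i = 0; i < 4; i++)
		printf("entry %d: %s\n", i,
		       worth_trying(&zlc, i, allowed) ? "try" : "skip");
	return 0;
}
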
/* return values int ....just for stop_machine_run() */
static int __meminit __build_all_zonelists(void *dummy)
{
int nid;
- for_each_online_node(nid)
+
+ for_each_online_node(nid) {
build_zonelists(NODE_DATA(nid));
+ build_zonelist_cache(NODE_DATA(nid));
+ }
return 0;
}
void __meminit build_all_zonelists(void)
{
if (system_state == SYSTEM_BOOTING) {
- __build_all_zonelists(0);
+ __build_all_zonelists(NULL);
cpuset_init_current_mems_allowed();
} else {
		/* we have to stop all cpus to guarantee there is no user
@@ -1642,25 +1947,6 @@ static inline unsigned long wait_table_bits(unsigned long size)
#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
-static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
- unsigned long *zones_size, unsigned long *zholes_size)
-{
- unsigned long realtotalpages, totalpages = 0;
- enum zone_type i;
-
- for (i = 0; i < MAX_NR_ZONES; i++)
- totalpages += zones_size[i];
- pgdat->node_spanned_pages = totalpages;
-
- realtotalpages = totalpages;
- if (zholes_size)
- for (i = 0; i < MAX_NR_ZONES; i++)
- realtotalpages -= zholes_size[i];
- pgdat->node_present_pages = realtotalpages;
- printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
-}
-
-
/*
* Initially all pages are reserved - free ones are freed
* up by free_all_bootmem() once the early boot process is
@@ -1676,6 +1962,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
if (!early_pfn_valid(pfn))
continue;
+ if (!early_pfn_in_nid(pfn, nid))
+ continue;
page = pfn_to_page(pfn);
set_page_links(page, zone, nid, pfn);
init_page_count(page);
@@ -1700,20 +1988,6 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
}
}
-#define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr)
-void zonetable_add(struct zone *zone, int nid, enum zone_type zid,
- unsigned long pfn, unsigned long size)
-{
- unsigned long snum = pfn_to_section_nr(pfn);
- unsigned long end = pfn_to_section_nr(pfn + size);
-
- if (FLAGS_HAS_NODE)
- zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
- else
- for (; snum <= end; snum++)
- zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
-}
-
#ifndef __HAVE_ARCH_MEMMAP_INIT
#define memmap_init(size, nid, zone, start_pfn) \
memmap_init_zone((size), (nid), (zone), (start_pfn))
@@ -1818,6 +2092,9 @@ static int __cpuinit process_zones(int cpu)
for_each_zone(zone) {
+ if (!populated_zone(zone))
+ continue;
+
zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
GFP_KERNEL, cpu_to_node(cpu));
if (!zone_pcp(zone, cpu))
@@ -1863,16 +2140,16 @@ static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
int ret = NOTIFY_OK;
switch (action) {
- case CPU_UP_PREPARE:
- if (process_zones(cpu))
- ret = NOTIFY_BAD;
- break;
- case CPU_UP_CANCELED:
- case CPU_DEAD:
- free_zone_pagesets(cpu);
- break;
- default:
- break;
+ case CPU_UP_PREPARE:
+ if (process_zones(cpu))
+ ret = NOTIFY_BAD;
+ break;
+ case CPU_UP_CANCELED:
+ case CPU_DEAD:
+ free_zone_pagesets(cpu);
+ break;
+ default:
+ break;
}
return ret;
}
@@ -1977,6 +2254,349 @@ __meminit int init_currently_empty_zone(struct zone *zone,
return 0;
}
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+/*
+ * Basic iterator support. Return the first range of PFNs for a node
+ * Note: nid == MAX_NUMNODES returns first region regardless of node
+ */
+static int __init first_active_region_index_in_nid(int nid)
+{
+ int i;
+
+ for (i = 0; i < nr_nodemap_entries; i++)
+ if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
+ return i;
+
+ return -1;
+}
+
+/*
+ * Basic iterator support. Return the next active range of PFNs for a node
+ * Note: nid == MAX_NUMNODES returns next region regardless of node
+ */
+static int __init next_active_region_index_in_nid(int index, int nid)
+{
+ for (index = index + 1; index < nr_nodemap_entries; index++)
+ if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
+ return index;
+
+ return -1;
+}
+
+#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
+/*
+ * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
+ * Architectures may implement their own version but if add_active_range()
+ * was used and there are no special requirements, this is a convenient
+ * alternative.
+ */
+int __init early_pfn_to_nid(unsigned long pfn)
+{
+ int i;
+
+ for (i = 0; i < nr_nodemap_entries; i++) {
+ unsigned long start_pfn = early_node_map[i].start_pfn;
+ unsigned long end_pfn = early_node_map[i].end_pfn;
+
+ if (start_pfn <= pfn && pfn < end_pfn)
+ return early_node_map[i].nid;
+ }
+
+ return 0;
+}
+#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
+
+/* Basic iterator support to walk early_node_map[] */
+#define for_each_active_range_index_in_nid(i, nid) \
+ for (i = first_active_region_index_in_nid(nid); i != -1; \
+ i = next_active_region_index_in_nid(i, nid))
+
+/**
+ * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
+ * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
+ * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
+ *
+ * If an architecture guarantees that all ranges registered with
+ * add_active_range() contain no holes and may be freed, this
+ * function may be used instead of calling free_bootmem() manually.
+ */
+void __init free_bootmem_with_active_regions(int nid,
+ unsigned long max_low_pfn)
+{
+ int i;
+
+ for_each_active_range_index_in_nid(i, nid) {
+ unsigned long size_pages = 0;
+ unsigned long end_pfn = early_node_map[i].end_pfn;
+
+ if (early_node_map[i].start_pfn >= max_low_pfn)
+ continue;
+
+ if (end_pfn > max_low_pfn)
+ end_pfn = max_low_pfn;
+
+ size_pages = end_pfn - early_node_map[i].start_pfn;
+ free_bootmem_node(NODE_DATA(early_node_map[i].nid),
+ PFN_PHYS(early_node_map[i].start_pfn),
+ size_pages << PAGE_SHIFT);
+ }
+}
+
+/**
+ * sparse_memory_present_with_active_regions - Call memory_present for each active range
+ * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
+ *
+ * If an architecture guarantees that all ranges registered with
+ * add_active_range() contain no holes and may be freed, this
+ * function may be used instead of calling memory_present() manually.
+ */
+void __init sparse_memory_present_with_active_regions(int nid)
+{
+ int i;
+
+ for_each_active_range_index_in_nid(i, nid)
+ memory_present(early_node_map[i].nid,
+ early_node_map[i].start_pfn,
+ early_node_map[i].end_pfn);
+}
+
+/**
+ * push_node_boundaries - Push node boundaries to at least the requested boundary
+ * @nid: The nid of the node to push the boundary for
+ * @start_pfn: The start pfn of the node
+ * @end_pfn: The end pfn of the node
+ *
+ * In reserve-based hot-add, mem_map is allocated at boot but remains unused
+ * until hot-add time. Specifically, on x86_64, SRAT will report ranges that
+ * can potentially be hotplugged even though no physical memory exists yet.
+ * This function allows an arch to push out the node boundaries so that
+ * enough mem_map is allocated for later use.
+ */
+#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
+void __init push_node_boundaries(unsigned int nid,
+ unsigned long start_pfn, unsigned long end_pfn)
+{
+ printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n",
+ nid, start_pfn, end_pfn);
+
+ /* Initialise the boundary for this node if necessary */
+ if (node_boundary_end_pfn[nid] == 0)
+ node_boundary_start_pfn[nid] = -1UL;
+
+ /* Update the boundaries */
+ if (node_boundary_start_pfn[nid] > start_pfn)
+ node_boundary_start_pfn[nid] = start_pfn;
+ if (node_boundary_end_pfn[nid] < end_pfn)
+ node_boundary_end_pfn[nid] = end_pfn;
+}
+
+/* If necessary, push the node boundary out for reserve hotadd */
+static void __init account_node_boundary(unsigned int nid,
+ unsigned long *start_pfn, unsigned long *end_pfn)
+{
+ printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n",
+ nid, *start_pfn, *end_pfn);
+
+ /* Return if boundary information has not been provided */
+ if (node_boundary_end_pfn[nid] == 0)
+ return;
+
+ /* Check the boundaries and update if necessary */
+ if (node_boundary_start_pfn[nid] < *start_pfn)
+ *start_pfn = node_boundary_start_pfn[nid];
+ if (node_boundary_end_pfn[nid] > *end_pfn)
+ *end_pfn = node_boundary_end_pfn[nid];
+}
+#else
+void __init push_node_boundaries(unsigned int nid,
+ unsigned long start_pfn, unsigned long end_pfn) {}
+
+static void __init account_node_boundary(unsigned int nid,
+ unsigned long *start_pfn, unsigned long *end_pfn) {}
+#endif
+
+
+/**
+ * get_pfn_range_for_nid - Return the start and end page frames for a node
+ * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
+ * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
+ * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
+ *
+ * It returns the start and end page frame of a node based on information
+ * provided by an arch calling add_active_range(). If called for a node
+ * with no available memory, a warning is printed and the start and end
+ * PFNs will be 0.
+ */
+void __init get_pfn_range_for_nid(unsigned int nid,
+ unsigned long *start_pfn, unsigned long *end_pfn)
+{
+ int i;
+ *start_pfn = -1UL;
+ *end_pfn = 0;
+
+ for_each_active_range_index_in_nid(i, nid) {
+ *start_pfn = min(*start_pfn, early_node_map[i].start_pfn);
+ *end_pfn = max(*end_pfn, early_node_map[i].end_pfn);
+ }
+
+ if (*start_pfn == -1UL) {
+ printk(KERN_WARNING "Node %u active with no memory\n", nid);
+ *start_pfn = 0;
+ }
+
+ /* Push the node boundaries out if requested */
+ account_node_boundary(nid, start_pfn, end_pfn);
+}
+
+/*
+ * Return the number of pages a zone spans in a node, including holes
+ * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
+ */
+unsigned long __init zone_spanned_pages_in_node(int nid,
+ unsigned long zone_type,
+ unsigned long *ignored)
+{
+ unsigned long node_start_pfn, node_end_pfn;
+ unsigned long zone_start_pfn, zone_end_pfn;
+
+ /* Get the start and end of the node and zone */
+ get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
+ zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
+ zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
+
+ /* Check that this node has pages within the zone's required range */
+ if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
+ return 0;
+
+ /* Move the zone boundaries inside the node if necessary */
+ zone_end_pfn = min(zone_end_pfn, node_end_pfn);
+ zone_start_pfn = max(zone_start_pfn, node_start_pfn);
+
+ /* Return the spanned pages */
+ return zone_end_pfn - zone_start_pfn;
+}
+
+/*
+ * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
+ * then all holes in the requested range will be accounted for.
+ */
+unsigned long __init __absent_pages_in_range(int nid,
+ unsigned long range_start_pfn,
+ unsigned long range_end_pfn)
+{
+ int i = 0;
+ unsigned long prev_end_pfn = 0, hole_pages = 0;
+ unsigned long start_pfn;
+
+ /* Find the end_pfn of the first active range of pfns in the node */
+ i = first_active_region_index_in_nid(nid);
+ if (i == -1)
+ return 0;
+
+ /* Account for ranges before physical memory on this node */
+ if (early_node_map[i].start_pfn > range_start_pfn)
+ hole_pages = early_node_map[i].start_pfn - range_start_pfn;
+
+ prev_end_pfn = early_node_map[i].start_pfn;
+
+ /* Find all holes for the zone within the node */
+ for (; i != -1; i = next_active_region_index_in_nid(i, nid)) {
+
+ /* No need to continue if prev_end_pfn is outside the zone */
+ if (prev_end_pfn >= range_end_pfn)
+ break;
+
+ /* Make sure the end of the zone is not within the hole */
+ start_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
+ prev_end_pfn = max(prev_end_pfn, range_start_pfn);
+
+		/* Update the hole size count and move on */
+ if (start_pfn > range_start_pfn) {
+ BUG_ON(prev_end_pfn > start_pfn);
+ hole_pages += start_pfn - prev_end_pfn;
+ }
+ prev_end_pfn = early_node_map[i].end_pfn;
+ }
+
+ /* Account for ranges past physical memory on this node */
+ if (range_end_pfn > prev_end_pfn)
+ hole_pages += range_end_pfn -
+ max(range_start_pfn, prev_end_pfn);
+
+ return hole_pages;
+}
+
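
A quick worked example of the hole accounting above: if one node registers the active ranges [0, 100) and [200, 300), the only hole inside [0, 300) is [100, 200), i.e. 100 pages. The standalone model below (assuming a map already sorted by start_pfn; it is not the kernel function itself) reproduces that number:

#include <stdio.h>

struct range { unsigned long start, end; };

static unsigned long absent_pages(const struct range *map, int n,
				  unsigned long lo, unsigned long hi)
{
	unsigned long holes = 0, prev_end = lo;

	for (int i = 0; i < n; i++) {
		unsigned long s = map[i].start < lo ? lo : map[i].start;
		unsigned long e = map[i].end   > hi ? hi : map[i].end;

		if (e <= prev_end)
			continue;
		if (s > prev_end)
			holes += s - prev_end;	/* gap before this range */
		prev_end = e;
	}
	if (hi > prev_end)
		holes += hi - prev_end;		/* gap after the last range */
	return holes;
}

int main(void)
{
	struct range map[] = { { 0, 100 }, { 200, 300 } };

	printf("%lu\n", absent_pages(map, 2, 0, 300));	/* prints 100 */
	return 0;
}
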
+/**
+ * absent_pages_in_range - Return number of page frames in holes within a range
+ * @start_pfn: The start PFN to start searching for holes
+ * @end_pfn: The end PFN to stop searching for holes
+ *
+ * It returns the number of page frames in memory holes within a range.
+ */
+unsigned long __init absent_pages_in_range(unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
+}
+
+/* Return the number of page frames in holes in a zone on a node */
+unsigned long __init zone_absent_pages_in_node(int nid,
+ unsigned long zone_type,
+ unsigned long *ignored)
+{
+ unsigned long node_start_pfn, node_end_pfn;
+ unsigned long zone_start_pfn, zone_end_pfn;
+
+ get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
+ zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type],
+ node_start_pfn);
+ zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type],
+ node_end_pfn);
+
+ return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
+}
+
+#else
+static inline unsigned long zone_spanned_pages_in_node(int nid,
+ unsigned long zone_type,
+ unsigned long *zones_size)
+{
+ return zones_size[zone_type];
+}
+
+static inline unsigned long zone_absent_pages_in_node(int nid,
+ unsigned long zone_type,
+ unsigned long *zholes_size)
+{
+ if (!zholes_size)
+ return 0;
+
+ return zholes_size[zone_type];
+}
+
+#endif
+
+static void __init calculate_node_totalpages(struct pglist_data *pgdat,
+ unsigned long *zones_size, unsigned long *zholes_size)
+{
+ unsigned long realtotalpages, totalpages = 0;
+ enum zone_type i;
+
+ for (i = 0; i < MAX_NR_ZONES; i++)
+ totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
+ zones_size);
+ pgdat->node_spanned_pages = totalpages;
+
+ realtotalpages = totalpages;
+ for (i = 0; i < MAX_NR_ZONES; i++)
+ realtotalpages -=
+ zone_absent_pages_in_node(pgdat->node_id, i,
+ zholes_size);
+ pgdat->node_present_pages = realtotalpages;
+ printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
+ realtotalpages);
+}
+
/*
* Set up the zone data structures:
* - mark all pages reserved
@@ -1998,11 +2618,34 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
- unsigned long size, realsize;
+ unsigned long size, realsize, memmap_pages;
- realsize = size = zones_size[j];
- if (zholes_size)
- realsize -= zholes_size[j];
+ size = zone_spanned_pages_in_node(nid, j, zones_size);
+ realsize = size - zone_absent_pages_in_node(nid, j,
+ zholes_size);
+
+ /*
+ * Adjust realsize so that it accounts for how much memory
+ * is used by this zone for memmap. This affects the watermark
+ * and per-cpu initialisations
+ */
+ memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT;
+ if (realsize >= memmap_pages) {
+ realsize -= memmap_pages;
+ printk(KERN_DEBUG
+ " %s zone: %lu pages used for memmap\n",
+ zone_names[j], memmap_pages);
+ } else
+ printk(KERN_WARNING
+ " %s zone: %lu pages exceeds realsize %lu\n",
+ zone_names[j], memmap_pages, realsize);
+
+ /* Account for reserved DMA pages */
+ if (j == ZONE_DMA && realsize > dma_reserve) {
+ realsize -= dma_reserve;
+ printk(KERN_DEBUG " DMA zone: %lu pages reserved\n",
+ dma_reserve);
+ }
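
To make the memmap adjustment concrete: with 4 KiB pages and an assumed 32-byte struct page, a zone spanning 262144 pages (1 GiB) gives memmap_pages = 262144 * 32 >> 12 = 2048, so realsize drops by 2048 pages before watermarks are derived. The struct size is configuration-dependent; the arithmetic below is only illustrative:

#include <stdio.h>

int main(void)
{
	unsigned long spanned   = 262144;	/* pages in the zone (1 GiB / 4 KiB) */
	unsigned long page_desc = 32;		/* assumed sizeof(struct page) */
	unsigned long page_size = 4096;

	unsigned long memmap_pages = spanned * page_desc / page_size;

	printf("memmap consumes %lu pages\n", memmap_pages);	/* 2048 */
	return 0;
}
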
if (!is_highmem_idx(j))
nr_kernel_pages += realsize;
@@ -2011,6 +2654,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
zone->spanned_pages = size;
zone->present_pages = realsize;
#ifdef CONFIG_NUMA
+ zone->node = nid;
zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
/ 100;
zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
@@ -2022,7 +2666,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
zone->zone_pgdat = pgdat;
zone->free_pages = 0;
- zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
+ zone->prev_priority = DEF_PRIORITY;
zone_pcp_init(zone);
INIT_LIST_HEAD(&zone->active_list);
@@ -2036,7 +2680,6 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
if (!size)
continue;
- zonetable_add(zone, nid, j, zone_start_pfn, size);
ret = init_currently_empty_zone(zone, zone_start_pfn, size);
BUG_ON(ret);
zone_start_pfn += size;
@@ -2073,8 +2716,13 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat)
/*
* With no DISCONTIG, the global mem_map is just set as node 0's
*/
- if (pgdat == NODE_DATA(0))
+ if (pgdat == NODE_DATA(0)) {
mem_map = NODE_DATA(0)->node_mem_map;
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+ if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
+ mem_map -= pgdat->node_start_pfn;
+#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+ }
#endif
#endif /* CONFIG_FLAT_NODE_MEM_MAP */
}
@@ -2085,13 +2733,254 @@ void __meminit free_area_init_node(int nid, struct pglist_data *pgdat,
{
pgdat->node_id = nid;
pgdat->node_start_pfn = node_start_pfn;
- calculate_zone_totalpages(pgdat, zones_size, zholes_size);
+ calculate_node_totalpages(pgdat, zones_size, zholes_size);
alloc_node_mem_map(pgdat);
free_area_init_core(pgdat, zones_size, zholes_size);
}
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+/**
+ * add_active_range - Register a range of PFNs backed by physical memory
+ * @nid: The node ID the range resides on
+ * @start_pfn: The start PFN of the available physical memory
+ * @end_pfn: The end PFN of the available physical memory
+ *
+ * These ranges are stored in an early_node_map[] and later used by
+ * free_area_init_nodes() to calculate zone sizes and holes. If the
+ * range spans a memory hole, it is up to the architecture to ensure
+ * the memory is not freed by the bootmem allocator. If possible
+ * the range being registered will be merged with existing ranges.
+ */
+void __init add_active_range(unsigned int nid, unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ int i;
+
+ printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) "
+ "%d entries of %d used\n",
+ nid, start_pfn, end_pfn,
+ nr_nodemap_entries, MAX_ACTIVE_REGIONS);
+
+ /* Merge with existing active regions if possible */
+ for (i = 0; i < nr_nodemap_entries; i++) {
+ if (early_node_map[i].nid != nid)
+ continue;
+
+ /* Skip if an existing region covers this new one */
+ if (start_pfn >= early_node_map[i].start_pfn &&
+ end_pfn <= early_node_map[i].end_pfn)
+ return;
+
+ /* Merge forward if suitable */
+ if (start_pfn <= early_node_map[i].end_pfn &&
+ end_pfn > early_node_map[i].end_pfn) {
+ early_node_map[i].end_pfn = end_pfn;
+ return;
+ }
+
+ /* Merge backward if suitable */
+ if (start_pfn < early_node_map[i].end_pfn &&
+ end_pfn >= early_node_map[i].start_pfn) {
+ early_node_map[i].start_pfn = start_pfn;
+ return;
+ }
+ }
+
+ /* Check that early_node_map is large enough */
+ if (i >= MAX_ACTIVE_REGIONS) {
+ printk(KERN_CRIT "More than %d memory regions, truncating\n",
+ MAX_ACTIVE_REGIONS);
+ return;
+ }
+
+ early_node_map[i].nid = nid;
+ early_node_map[i].start_pfn = start_pfn;
+ early_node_map[i].end_pfn = end_pfn;
+ nr_nodemap_entries = i + 1;
+}
+
+/**
+ * shrink_active_range - Shrink an existing registered range of PFNs
+ * @nid: The node id of the range to be shrunk
+ * @old_end_pfn: The old end PFN of the range
+ * @new_end_pfn: The new end PFN of the range
+ *
+ * i386 with NUMA uses alloc_remap() to store a node_mem_map on a local node.
+ * The map is kept at the end of the physical page range that has already been
+ * registered with add_active_range(). This function allows an arch to shrink
+ * an existing registered range.
+ */
+void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn,
+ unsigned long new_end_pfn)
+{
+ int i;
+
+ /* Find the old active region end and shrink */
+ for_each_active_range_index_in_nid(i, nid)
+ if (early_node_map[i].end_pfn == old_end_pfn) {
+ early_node_map[i].end_pfn = new_end_pfn;
+ break;
+ }
+}
+
+/**
+ * remove_all_active_ranges - Remove all currently registered regions
+ *
+ * During discovery, it may be found that a table like SRAT is invalid
+ * and an alternative discovery method must be used. This function removes
+ * all currently registered regions.
+ */
+void __init remove_all_active_ranges(void)
+{
+ memset(early_node_map, 0, sizeof(early_node_map));
+ nr_nodemap_entries = 0;
+#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
+ memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn));
+ memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn));
+#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
+}
+
+/* Compare two active node_active_regions */
+static int __init cmp_node_active_region(const void *a, const void *b)
+{
+ struct node_active_region *arange = (struct node_active_region *)a;
+ struct node_active_region *brange = (struct node_active_region *)b;
+
+ /* Done this way to avoid overflows */
+ if (arange->start_pfn > brange->start_pfn)
+ return 1;
+ if (arange->start_pfn < brange->start_pfn)
+ return -1;
+
+ return 0;
+}
+
+/* sort the node_map by start_pfn */
+static void __init sort_node_map(void)
+{
+ sort(early_node_map, (size_t)nr_nodemap_entries,
+ sizeof(struct node_active_region),
+ cmp_node_active_region, NULL);
+}
+
+/* Find the lowest pfn for a node. This depends on a sorted early_node_map */
+unsigned long __init find_min_pfn_for_node(unsigned long nid)
+{
+ int i;
+
+ /* Regions in the early_node_map can be in any order */
+ sort_node_map();
+
+ /* Assuming a sorted map, the first range found has the starting pfn */
+ for_each_active_range_index_in_nid(i, nid)
+ return early_node_map[i].start_pfn;
+
+ printk(KERN_WARNING "Could not find start_pfn for node %lu\n", nid);
+ return 0;
+}
+
+/**
+ * find_min_pfn_with_active_regions - Find the minimum PFN registered
+ *
+ * It returns the minimum PFN based on information provided via
+ * add_active_range().
+ */
+unsigned long __init find_min_pfn_with_active_regions(void)
+{
+ return find_min_pfn_for_node(MAX_NUMNODES);
+}
+
+/**
+ * find_max_pfn_with_active_regions - Find the maximum PFN registered
+ *
+ * It returns the maximum PFN based on information provided via
+ * add_active_range().
+ */
+unsigned long __init find_max_pfn_with_active_regions(void)
+{
+ int i;
+ unsigned long max_pfn = 0;
+
+ for (i = 0; i < nr_nodemap_entries; i++)
+ max_pfn = max(max_pfn, early_node_map[i].end_pfn);
+
+ return max_pfn;
+}
+
+/**
+ * free_area_init_nodes - Initialise all pg_data_t and zone data
+ * @max_zone_pfn: an array of max PFNs for each zone
+ *
+ * This will call free_area_init_node() for each active node in the system.
+ * Using the page ranges provided by add_active_range(), the size of each
+ * zone in each node, and the holes within them, are calculated. If the
+ * maximum PFNs of two adjacent zones match, the higher zone is assumed to
+ * be empty. For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is
+ * assumed that ZONE_DMA32 has no pages. It is also assumed that a zone
+ * starts where the previous one ended. For example, ZONE_DMA32 starts
+ * at arch_max_dma_pfn.
+ */
+void __init free_area_init_nodes(unsigned long *max_zone_pfn)
+{
+ unsigned long nid;
+ enum zone_type i;
+
+ /* Record where the zone boundaries are */
+ memset(arch_zone_lowest_possible_pfn, 0,
+ sizeof(arch_zone_lowest_possible_pfn));
+ memset(arch_zone_highest_possible_pfn, 0,
+ sizeof(arch_zone_highest_possible_pfn));
+ arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
+ arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
+ for (i = 1; i < MAX_NR_ZONES; i++) {
+ arch_zone_lowest_possible_pfn[i] =
+ arch_zone_highest_possible_pfn[i-1];
+ arch_zone_highest_possible_pfn[i] =
+ max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
+ }
+
+ /* Print out the zone ranges */
+ printk("Zone PFN ranges:\n");
+ for (i = 0; i < MAX_NR_ZONES; i++)
+ printk(" %-8s %8lu -> %8lu\n",
+ zone_names[i],
+ arch_zone_lowest_possible_pfn[i],
+ arch_zone_highest_possible_pfn[i]);
+
+ /* Print out the early_node_map[] */
+ printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
+ for (i = 0; i < nr_nodemap_entries; i++)
+ printk(" %3d: %8lu -> %8lu\n", early_node_map[i].nid,
+ early_node_map[i].start_pfn,
+ early_node_map[i].end_pfn);
+
+ /* Initialise every node */
+ for_each_online_node(nid) {
+ pg_data_t *pgdat = NODE_DATA(nid);
+ free_area_init_node(nid, pgdat, NULL,
+ find_min_pfn_for_node(nid), NULL);
+ }
+}
+#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
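
Putting the new interfaces together, an architecture selecting CONFIG_ARCH_POPULATES_NODE_MAP registers its memory and then lets the code above size the zones. The sketch below is hypothetical (the helper name and every PFN value are invented) and is not part of this patch:

/* Hypothetical arch boot path, shown only to illustrate the calling sequence. */
static void __init example_arch_register_memory(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES] = { 0 };

	/* One node with a hole: [0, 0x40000) and [0x50000, 0x80000) */
	add_active_range(0, 0, 0x40000);
	add_active_range(0, 0x50000, 0x80000);

	max_zone_pfns[ZONE_DMA]    = 0x1000;	/* first 16 MiB */
	max_zone_pfns[ZONE_NORMAL] = 0x80000;

	/* Sizes zones and holes from early_node_map[] and initialises each node */
	free_area_init_nodes(max_zone_pfns);
}
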
+
+/**
+ * set_dma_reserve - set the specified number of pages reserved in the first zone
+ * @new_dma_reserve: The number of pages to mark reserved
+ *
+ * The per-cpu batchsize and zone watermarks are determined by present_pages.
+ * In the DMA zone, a significant percentage may be consumed by kernel image
+ * and other unfreeable allocations which can skew the watermarks badly. This
+ * function may optionally be used to account for unfreeable pages in the
+ * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
+ * smaller per-cpu batchsize.
+ */
+void __init set_dma_reserve(unsigned long new_dma_reserve)
+{
+ dma_reserve = new_dma_reserve;
+}
+
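
A hypothetical caller, shown only to illustrate the intent: the architecture passes in however many low pages it knows can never be freed (the helper name and the value here are invented):

static void __init example_account_unfreeable_low_pages(void)
{
	/* e.g. 4 MiB of kernel image and firmware tables pinned in ZONE_DMA */
	unsigned long reserved = 1024;

	set_dma_reserve(reserved);	/* lowers ZONE_DMA watermarks accordingly */
}
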
#ifndef CONFIG_NEED_MULTIPLE_NODES
static bootmem_data_t contig_bootmem_data;
struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
@@ -2105,7 +2994,6 @@ void __init free_area_init(unsigned long *zones_size)
__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
}
-#ifdef CONFIG_HOTPLUG_CPU
static int page_alloc_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
@@ -2120,7 +3008,6 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
}
return NOTIFY_OK;
}
-#endif /* CONFIG_HOTPLUG_CPU */
void __init page_alloc_init(void)
{
@@ -2198,10 +3085,11 @@ static void setup_per_zone_lowmem_reserve(void)
calculate_totalreserve_pages();
}
-/*
- * setup_per_zone_pages_min - called when min_free_kbytes changes. Ensures
- * that the pages_{min,low,high} values for each zone are set correctly
- * with respect to min_free_kbytes.
+/**
+ * setup_per_zone_pages_min - called when min_free_kbytes changes.
+ *
+ * Ensures that the pages_{min,low,high} values for each zone are set correctly
+ * with respect to min_free_kbytes.
*/
void setup_per_zone_pages_min(void)
{
@@ -2423,7 +3311,7 @@ void *__init alloc_large_system_hash(const char *tablename,
/* allow the kernel cmdline to have a say */
if (!numentries) {
/* round applicable memory size up to nearest megabyte */
- numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages;
+ numentries = nr_kernel_pages;
numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
numentries >>= 20 - PAGE_SHIFT;
numentries <<= 20 - PAGE_SHIFT;
@@ -2445,7 +3333,7 @@ void *__init alloc_large_system_hash(const char *tablename,
if (numentries > max)
numentries = max;
- log2qty = long_log2(numentries);
+ log2qty = ilog2(numentries);
do {
size = bucketsize << log2qty;
@@ -2467,7 +3355,7 @@ void *__init alloc_large_system_hash(const char *tablename,
printk("%s hash table entries: %d (order: %d, %lu bytes)\n",
tablename,
(1U << log2qty),
- long_log2(size) - PAGE_SHIFT,
+ ilog2(size) - PAGE_SHIFT,
size);
if (_hash_shift)
@@ -2490,3 +3378,19 @@ unsigned long page_to_pfn(struct page *page)
EXPORT_SYMBOL(pfn_to_page);
EXPORT_SYMBOL(page_to_pfn);
#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
+
+#if MAX_NUMNODES > 1
+/*
+ * Find the highest possible node id.
+ */
+int highest_possible_node_id(void)
+{
+ unsigned int node;
+ unsigned int highest = 0;
+
+ for_each_node_mask(node, node_possible_map)
+ highest = node;
+ return highest;
+}
+EXPORT_SYMBOL(highest_possible_node_id);
+#endif
diff --git a/mm/page_io.c b/mm/page_io.c
index d4840ecbf8f..dbffec0d78c 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -147,48 +147,3 @@ int swap_readpage(struct file *file, struct page *page)
out:
return ret;
}
-
-#ifdef CONFIG_SOFTWARE_SUSPEND
-/*
- * A scruffy utility function to read or write an arbitrary swap page
- * and wait on the I/O. The caller must have a ref on the page.
- *
- * We use end_swap_bio_read() even for writes, because it happens to do what
- * we want.
- */
-int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page,
- struct bio **bio_chain)
-{
- struct bio *bio;
- int ret = 0;
- int bio_rw;
-
- lock_page(page);
-
- bio = get_swap_bio(GFP_KERNEL, entry.val, page, end_swap_bio_read);
- if (bio == NULL) {
- unlock_page(page);
- ret = -ENOMEM;
- goto out;
- }
-
- bio_rw = rw;
- if (!bio_chain)
- bio_rw |= (1 << BIO_RW_SYNC);
- if (bio_chain)
- bio_get(bio);
- submit_bio(bio_rw, bio);
- if (bio_chain == NULL) {
- wait_on_page_locked(page);
-
- if (!PageUptodate(page) || PageError(page))
- ret = -EIO;
- }
- if (bio_chain) {
- bio->bi_private = *bio_chain;
- *bio_chain = bio;
- }
-out:
- return ret;
-}
-#endif
diff --git a/mm/pdflush.c b/mm/pdflush.c
index b02102feeb4..8ce0900dc95 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -21,6 +21,7 @@
#include <linux/writeback.h> // Prototypes pdflush_operation()
#include <linux/kthread.h>
#include <linux/cpuset.h>
+#include <linux/freezer.h>
/*
diff --git a/mm/readahead.c b/mm/readahead.c
index aa7ec424656..0f539e8e827 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -13,6 +13,7 @@
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
+#include <linux/task_io_accounting_ops.h>
#include <linux/pagevec.h>
void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
@@ -38,6 +39,7 @@ file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
ra->ra_pages = mapping->backing_dev_info->ra_pages;
ra->prev_page = -1;
}
+EXPORT_SYMBOL_GPL(file_ra_state_init);
/*
* Return max readahead size for this inode in number-of-pages.
@@ -147,15 +149,10 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
if (!pagevec_add(&lru_pvec, page))
__pagevec_lru_add(&lru_pvec);
if (ret) {
- while (!list_empty(pages)) {
- struct page *victim;
-
- victim = list_to_page(pages);
- list_del(&victim->lru);
- page_cache_release(victim);
- }
+ put_pages_list(pages);
break;
}
+ task_io_account_read(PAGE_CACHE_SIZE);
}
pagevec_lru_add(&lru_pvec);
return ret;
@@ -172,6 +169,8 @@ static int read_pages(struct address_space *mapping, struct file *filp,
if (mapping->a_ops->readpages) {
ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
+ /* Clean up the remaining pages */
+ put_pages_list(pages);
goto out;
}
@@ -453,7 +452,7 @@ static int make_ahead_window(struct address_space *mapping, struct file *filp,
*
* Note that @filp is purely used for passing on to the ->readpage[s]()
* handler: it may refer to a different file from @mapping (so we may not use
- * @filp->f_mapping or @filp->f_dentry->d_inode here).
+ * @filp->f_mapping or @filp->f_path.dentry->d_inode here).
* Also, @ra may not be equal to &@filp->f_ra.
*
*/
diff --git a/mm/rmap.c b/mm/rmap.c
index e2155d791d9..d8a842a586d 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -21,27 +21,21 @@
* Lock ordering in mm:
*
* inode->i_mutex (while writing or truncating, not reading or faulting)
- * inode->i_alloc_sem
- *
- * When a page fault occurs in writing from user to file, down_read
- * of mmap_sem nests within i_mutex; in sys_msync, i_mutex nests within
- * down_read of mmap_sem; i_mutex and down_write of mmap_sem are never
- * taken together; in truncation, i_mutex is taken outermost.
- *
- * mm->mmap_sem
- * page->flags PG_locked (lock_page)
- * mapping->i_mmap_lock
- * anon_vma->lock
- * mm->page_table_lock or pte_lock
- * zone->lru_lock (in mark_page_accessed, isolate_lru_page)
- * swap_lock (in swap_duplicate, swap_info_get)
- * mmlist_lock (in mmput, drain_mmlist and others)
- * mapping->private_lock (in __set_page_dirty_buffers)
- * inode_lock (in set_page_dirty's __mark_inode_dirty)
- * sb_lock (within inode_lock in fs/fs-writeback.c)
- * mapping->tree_lock (widely used, in set_page_dirty,
- * in arch-dependent flush_dcache_mmap_lock,
- * within inode_lock in __sync_single_inode)
+ * inode->i_alloc_sem (vmtruncate_range)
+ * mm->mmap_sem
+ * page->flags PG_locked (lock_page)
+ * mapping->i_mmap_lock
+ * anon_vma->lock
+ * mm->page_table_lock or pte_lock
+ * zone->lru_lock (in mark_page_accessed, isolate_lru_page)
+ * swap_lock (in swap_duplicate, swap_info_get)
+ * mmlist_lock (in mmput, drain_mmlist and others)
+ * mapping->private_lock (in __set_page_dirty_buffers)
+ * inode_lock (in set_page_dirty's __mark_inode_dirty)
+ * sb_lock (within inode_lock in fs/fs-writeback.c)
+ * mapping->tree_lock (widely used, in set_page_dirty,
+ * in arch-dependent flush_dcache_mmap_lock,
+ * within inode_lock in __sync_single_inode)
*/
#include <linux/mm.h>
@@ -576,15 +570,14 @@ void page_add_file_rmap(struct page *page)
void page_remove_rmap(struct page *page)
{
if (atomic_add_negative(-1, &page->_mapcount)) {
-#ifdef CONFIG_DEBUG_VM
if (unlikely(page_mapcount(page) < 0)) {
printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
printk (KERN_EMERG " page->flags = %lx\n", page->flags);
printk (KERN_EMERG " page->count = %x\n", page_count(page));
printk (KERN_EMERG " page->mapping = %p\n", page->mapping);
+ BUG();
}
-#endif
- BUG_ON(page_mapcount(page) < 0);
+
/*
* It would be tidy to reset the PageAnon mapping here,
* but that might overwrite a racing page_add_anon_rmap
diff --git a/mm/shmem.c b/mm/shmem.c
index 8631be45b40..4bb28d218eb 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -26,6 +26,8 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
+#include <linux/xattr.h>
+#include <linux/generic_acl.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/file.h>
@@ -46,6 +48,7 @@
#include <linux/ctype.h>
#include <linux/migrate.h>
#include <linux/highmem.h>
+#include <linux/backing-dev.h>
#include <asm/uaccess.h>
#include <asm/div64.h>
@@ -174,9 +177,10 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages)
static struct super_operations shmem_ops;
static const struct address_space_operations shmem_aops;
-static struct file_operations shmem_file_operations;
+static const struct file_operations shmem_file_operations;
static struct inode_operations shmem_inode_operations;
static struct inode_operations shmem_dir_inode_operations;
+static struct inode_operations shmem_special_inode_operations;
static struct vm_operations_struct shmem_vm_ops;
static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
@@ -637,7 +641,7 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
struct page *page = NULL;
int error;
- if (attr->ia_valid & ATTR_SIZE) {
+ if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
if (attr->ia_size < inode->i_size) {
/*
* If truncating down to a partial page, then
@@ -670,6 +674,10 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
error = inode_change_ok(inode, attr);
if (!error)
error = inode_setattr(inode, attr);
+#ifdef CONFIG_TMPFS_POSIX_ACL
+ if (!error && (attr->ia_valid & ATTR_MODE))
+ error = generic_acl_chmod(inode, &shmem_acl_ops);
+#endif
if (page)
page_cache_release(page);
return error;
@@ -1124,7 +1132,7 @@ repeat:
page_cache_release(swappage);
if (error == -ENOMEM) {
/* let kswapd refresh zone for GFP_ATOMICs */
- blk_congestion_wait(WRITE, HZ/50);
+ congestion_wait(WRITE, HZ/50);
}
goto repeat;
}
@@ -1217,7 +1225,7 @@ failed:
struct page *shmem_nopage(struct vm_area_struct *vma, unsigned long address, int *type)
{
- struct inode *inode = vma->vm_file->f_dentry->d_inode;
+ struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
struct page *page = NULL;
unsigned long idx;
int error;
@@ -1240,7 +1248,7 @@ static int shmem_populate(struct vm_area_struct *vma,
unsigned long addr, unsigned long len,
pgprot_t prot, unsigned long pgoff, int nonblock)
{
- struct inode *inode = vma->vm_file->f_dentry->d_inode;
+ struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
struct mm_struct *mm = vma->vm_mm;
enum sgp_type sgp = nonblock? SGP_QUICK: SGP_CACHE;
unsigned long size;
@@ -1285,14 +1293,14 @@ static int shmem_populate(struct vm_area_struct *vma,
#ifdef CONFIG_NUMA
int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
{
- struct inode *i = vma->vm_file->f_dentry->d_inode;
+ struct inode *i = vma->vm_file->f_path.dentry->d_inode;
return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new);
}
struct mempolicy *
shmem_get_policy(struct vm_area_struct *vma, unsigned long addr)
{
- struct inode *i = vma->vm_file->f_dentry->d_inode;
+ struct inode *i = vma->vm_file->f_path.dentry->d_inode;
unsigned long idx;
idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
@@ -1302,7 +1310,7 @@ shmem_get_policy(struct vm_area_struct *vma, unsigned long addr)
int shmem_lock(struct file *file, int lock, struct user_struct *user)
{
- struct inode *inode = file->f_dentry->d_inode;
+ struct inode *inode = file->f_path.dentry->d_inode;
struct shmem_inode_info *info = SHMEM_I(inode);
int retval = -ENOMEM;
@@ -1351,11 +1359,11 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
inode->i_mode = mode;
inode->i_uid = current->fsuid;
inode->i_gid = current->fsgid;
- inode->i_blksize = PAGE_CACHE_SIZE;
inode->i_blocks = 0;
inode->i_mapping->a_ops = &shmem_aops;
inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_generation = get_seconds();
info = SHMEM_I(inode);
memset(info, 0, (char *)inode - (char *)info);
spin_lock_init(&info->lock);
@@ -1363,6 +1371,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
switch (mode & S_IFMT) {
default:
+ inode->i_op = &shmem_special_inode_operations;
init_special_inode(inode, mode, dev);
break;
case S_IFREG:
@@ -1372,7 +1381,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
&sbinfo->policy_nodes);
break;
case S_IFDIR:
- inode->i_nlink++;
+ inc_nlink(inode);
/* Some things misbehave if size == 0 on a directory */
inode->i_size = 2 * BOGO_DIRENT_SIZE;
inode->i_op = &shmem_dir_inode_operations;
@@ -1413,7 +1422,7 @@ shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsig
static ssize_t
shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
{
- struct inode *inode = file->f_dentry->d_inode;
+ struct inode *inode = file->f_path.dentry->d_inode;
loff_t pos;
unsigned long written;
ssize_t err;
@@ -1433,7 +1442,7 @@ shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t
if (err || !count)
goto out;
- err = remove_suid(file->f_dentry);
+ err = remove_suid(file->f_path.dentry);
if (err)
goto out;
@@ -1515,7 +1524,7 @@ out:
static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor)
{
- struct inode *inode = filp->f_dentry->d_inode;
+ struct inode *inode = filp->f_path.dentry->d_inode;
struct address_space *mapping = inode->i_mapping;
unsigned long index, offset;
@@ -1683,7 +1692,11 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
iput(inode);
return error;
}
- error = 0;
+ }
+ error = shmem_acl_init(inode, dir);
+ if (error) {
+ iput(inode);
+ return error;
}
if (dir->i_mode & S_ISGID) {
inode->i_gid = dir->i_gid;
@@ -1704,7 +1717,7 @@ static int shmem_mkdir(struct inode *dir, struct dentry *dentry, int mode)
if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0)))
return error;
- dir->i_nlink++;
+ inc_nlink(dir);
return 0;
}
@@ -1739,7 +1752,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
dir->i_size += BOGO_DIRENT_SIZE;
inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
- inode->i_nlink++;
+ inc_nlink(inode);
atomic_inc(&inode->i_count); /* New dentry reference */
dget(dentry); /* Extra pinning count for the created dentry */
d_instantiate(dentry, inode);
@@ -1761,7 +1774,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry)
dir->i_size -= BOGO_DIRENT_SIZE;
inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
- inode->i_nlink--;
+ drop_nlink(inode);
dput(dentry); /* Undo the count from "create" - this does all the work */
return 0;
}
@@ -1771,8 +1784,8 @@ static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
if (!simple_empty(dentry))
return -ENOTEMPTY;
- dentry->d_inode->i_nlink--;
- dir->i_nlink--;
+ drop_nlink(dentry->d_inode);
+ drop_nlink(dir);
return shmem_unlink(dir, dentry);
}
@@ -1793,10 +1806,10 @@ static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct
if (new_dentry->d_inode) {
(void) shmem_unlink(new_dir, new_dentry);
if (they_are_dirs)
- old_dir->i_nlink--;
+ drop_nlink(old_dir);
} else if (they_are_dirs) {
- old_dir->i_nlink--;
- new_dir->i_nlink++;
+ drop_nlink(old_dir);
+ inc_nlink(new_dir);
}
old_dir->i_size -= BOGO_DIRENT_SIZE;
@@ -1898,6 +1911,132 @@ static struct inode_operations shmem_symlink_inode_operations = {
.put_link = shmem_put_link,
};
+#ifdef CONFIG_TMPFS_POSIX_ACL
+/**
+ * Superblocks without xattr inode operations will get security.* xattr
+ * support from the VFS "for free". As soon as we have any other xattrs
+ * like ACLs, we also need to implement the security.* handlers at
+ * filesystem level, though.
+ */
+
+static size_t shmem_xattr_security_list(struct inode *inode, char *list,
+ size_t list_len, const char *name,
+ size_t name_len)
+{
+ return security_inode_listsecurity(inode, list, list_len);
+}
+
+static int shmem_xattr_security_get(struct inode *inode, const char *name,
+ void *buffer, size_t size)
+{
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+ return security_inode_getsecurity(inode, name, buffer, size,
+ -EOPNOTSUPP);
+}
+
+static int shmem_xattr_security_set(struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
+{
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+ return security_inode_setsecurity(inode, name, value, size, flags);
+}
+
+static struct xattr_handler shmem_xattr_security_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .list = shmem_xattr_security_list,
+ .get = shmem_xattr_security_get,
+ .set = shmem_xattr_security_set,
+};
+
+static struct xattr_handler *shmem_xattr_handlers[] = {
+ &shmem_xattr_acl_access_handler,
+ &shmem_xattr_acl_default_handler,
+ &shmem_xattr_security_handler,
+ NULL
+};
+#endif
+
+static struct dentry *shmem_get_parent(struct dentry *child)
+{
+ return ERR_PTR(-ESTALE);
+}
+
+static int shmem_match(struct inode *ino, void *vfh)
+{
+ __u32 *fh = vfh;
+ __u64 inum = fh[2];
+ inum = (inum << 32) | fh[1];
+ return ino->i_ino == inum && fh[0] == ino->i_generation;
+}
+
+static struct dentry *shmem_get_dentry(struct super_block *sb, void *vfh)
+{
+ struct dentry *de = NULL;
+ struct inode *inode;
+ __u32 *fh = vfh;
+ __u64 inum = fh[2];
+ inum = (inum << 32) | fh[1];
+
+ inode = ilookup5(sb, (unsigned long)(inum+fh[0]), shmem_match, vfh);
+ if (inode) {
+ de = d_find_alias(inode);
+ iput(inode);
+ }
+
+ return de? de: ERR_PTR(-ESTALE);
+}
+
+static struct dentry *shmem_decode_fh(struct super_block *sb, __u32 *fh,
+ int len, int type,
+ int (*acceptable)(void *context, struct dentry *de),
+ void *context)
+{
+ if (len < 3)
+ return ERR_PTR(-ESTALE);
+
+ return sb->s_export_op->find_exported_dentry(sb, fh, NULL, acceptable,
+ context);
+}
+
+static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
+ int connectable)
+{
+ struct inode *inode = dentry->d_inode;
+
+ if (*len < 3)
+ return 255;
+
+ if (hlist_unhashed(&inode->i_hash)) {
+ /* Unfortunately insert_inode_hash is not idempotent,
+ * so as we hash inodes here rather than at creation
+ * time, we need a lock to ensure we only try
+ * to do it once
+ */
+ static DEFINE_SPINLOCK(lock);
+ spin_lock(&lock);
+ if (hlist_unhashed(&inode->i_hash))
+ __insert_inode_hash(inode,
+ inode->i_ino + inode->i_generation);
+ spin_unlock(&lock);
+ }
+
+ fh[0] = inode->i_generation;
+ fh[1] = inode->i_ino;
+ fh[2] = ((__u64)inode->i_ino) >> 32;
+
+ *len = 3;
+ return 1;
+}
+
+static struct export_operations shmem_export_ops = {
+ .get_parent = shmem_get_parent,
+ .get_dentry = shmem_get_dentry,
+ .encode_fh = shmem_encode_fh,
+ .decode_fh = shmem_decode_fh,
+};
+
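
The handle layout used by these export operations packs the 64-bit inode number into fh[1] (low half) and fh[2] (high half), with the generation in fh[0]. A standalone round-trip of that packing, outside any kernel types:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t ino = 0x123456789abcdef0ULL;	/* made-up inode number */
	uint32_t gen = 42;			/* made-up i_generation */
	uint32_t fh[3];

	fh[0] = gen;				/* matches shmem_encode_fh() */
	fh[1] = (uint32_t)ino;			/* low 32 bits */
	fh[2] = (uint32_t)(ino >> 32);		/* high 32 bits */

	uint64_t inum = ((uint64_t)fh[2] << 32) | fh[1];	/* as in shmem_match() */
	assert(inum == ino);
	printf("recovered inode %#llx, generation %u\n",
	       (unsigned long long)inum, fh[0]);
	return 0;
}
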
static int shmem_parse_options(char *options, int *mode, uid_t *uid,
gid_t *gid, unsigned long *blocks, unsigned long *inodes,
int *policy, nodemask_t *policy_nodes)
@@ -2070,6 +2209,7 @@ static int shmem_fill_super(struct super_block *sb,
&inodes, &policy, &policy_nodes))
return -EINVAL;
}
+ sb->s_export_op = &shmem_export_ops;
#else
sb->s_flags |= MS_NOUSER;
#endif
@@ -2095,6 +2235,10 @@ static int shmem_fill_super(struct super_block *sb,
sb->s_magic = TMPFS_MAGIC;
sb->s_op = &shmem_ops;
sb->s_time_gran = 1;
+#ifdef CONFIG_TMPFS_POSIX_ACL
+ sb->s_xattr = shmem_xattr_handlers;
+ sb->s_flags |= MS_POSIXACL;
+#endif
inode = shmem_get_inode(sb, S_IFDIR | mode, 0);
if (!inode)
@@ -2119,7 +2263,7 @@ static struct kmem_cache *shmem_inode_cachep;
static struct inode *shmem_alloc_inode(struct super_block *sb)
{
struct shmem_inode_info *p;
- p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, SLAB_KERNEL);
+ p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
if (!p)
return NULL;
return &p->vfs_inode;
@@ -2131,6 +2275,7 @@ static void shmem_destroy_inode(struct inode *inode)
/* only struct inode is valid if it's an inline symlink */
mpol_free_shared_policy(&SHMEM_I(inode)->policy);
}
+ shmem_acl_destroy_inode(inode);
kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
}
@@ -2142,6 +2287,10 @@ static void init_once(void *foo, struct kmem_cache *cachep,
if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
SLAB_CTOR_CONSTRUCTOR) {
inode_init_once(&p->vfs_inode);
+#ifdef CONFIG_TMPFS_POSIX_ACL
+ p->i_acl = NULL;
+ p->i_default_acl = NULL;
+#endif
}
}
@@ -2157,8 +2306,7 @@ static int init_inodecache(void)
static void destroy_inodecache(void)
{
- if (kmem_cache_destroy(shmem_inode_cachep))
- printk(KERN_INFO "shmem_inode_cache: not all structures were freed\n");
+ kmem_cache_destroy(shmem_inode_cachep);
}
static const struct address_space_operations shmem_aops = {
@@ -2171,7 +2319,7 @@ static const struct address_space_operations shmem_aops = {
.migratepage = migrate_page,
};
-static struct file_operations shmem_file_operations = {
+static const struct file_operations shmem_file_operations = {
.mmap = shmem_mmap,
#ifdef CONFIG_TMPFS
.llseek = generic_file_llseek,
@@ -2186,6 +2334,14 @@ static struct inode_operations shmem_inode_operations = {
.truncate = shmem_truncate,
.setattr = shmem_notify_change,
.truncate_range = shmem_truncate_range,
+#ifdef CONFIG_TMPFS_POSIX_ACL
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = generic_listxattr,
+ .removexattr = generic_removexattr,
+ .permission = shmem_permission,
+#endif
+
};
static struct inode_operations shmem_dir_inode_operations = {
@@ -2200,6 +2356,25 @@ static struct inode_operations shmem_dir_inode_operations = {
.mknod = shmem_mknod,
.rename = shmem_rename,
#endif
+#ifdef CONFIG_TMPFS_POSIX_ACL
+ .setattr = shmem_notify_change,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = generic_listxattr,
+ .removexattr = generic_removexattr,
+ .permission = shmem_permission,
+#endif
+};
+
+static struct inode_operations shmem_special_inode_operations = {
+#ifdef CONFIG_TMPFS_POSIX_ACL
+ .setattr = shmem_notify_change,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = generic_listxattr,
+ .removexattr = generic_removexattr,
+ .permission = shmem_permission,
+#endif
};
static struct super_operations shmem_ops = {
@@ -2318,8 +2493,8 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
d_instantiate(dentry, inode);
inode->i_size = size;
inode->i_nlink = 0; /* It is unlinked */
- file->f_vfsmnt = mntget(shm_mnt);
- file->f_dentry = dentry;
+ file->f_path.mnt = mntget(shm_mnt);
+ file->f_path.dentry = dentry;
file->f_mapping = inode->i_mapping;
file->f_op = &shmem_file_operations;
file->f_mode = FMODE_WRITE | FMODE_READ;
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c
new file mode 100644
index 00000000000..f5664c5b9eb
--- /dev/null
+++ b/mm/shmem_acl.c
@@ -0,0 +1,197 @@
+/*
+ * mm/shmem_acl.c
+ *
+ * (C) 2005 Andreas Gruenbacher <agruen@suse.de>
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/fs.h>
+#include <linux/shmem_fs.h>
+#include <linux/xattr.h>
+#include <linux/generic_acl.h>
+
+/**
+ * shmem_get_acl - generic_acl_operations->getacl() operation
+ */
+static struct posix_acl *
+shmem_get_acl(struct inode *inode, int type)
+{
+ struct posix_acl *acl = NULL;
+
+ spin_lock(&inode->i_lock);
+ switch(type) {
+ case ACL_TYPE_ACCESS:
+ acl = posix_acl_dup(SHMEM_I(inode)->i_acl);
+ break;
+
+ case ACL_TYPE_DEFAULT:
+ acl = posix_acl_dup(SHMEM_I(inode)->i_default_acl);
+ break;
+ }
+ spin_unlock(&inode->i_lock);
+
+ return acl;
+}
+
+/**
+ * shmem_set_acl - generic_acl_operations->setacl() operation
+ */
+static void
+shmem_set_acl(struct inode *inode, int type, struct posix_acl *acl)
+{
+ struct posix_acl *free = NULL;
+
+ spin_lock(&inode->i_lock);
+ switch(type) {
+ case ACL_TYPE_ACCESS:
+ free = SHMEM_I(inode)->i_acl;
+ SHMEM_I(inode)->i_acl = posix_acl_dup(acl);
+ break;
+
+ case ACL_TYPE_DEFAULT:
+ free = SHMEM_I(inode)->i_default_acl;
+ SHMEM_I(inode)->i_default_acl = posix_acl_dup(acl);
+ break;
+ }
+ spin_unlock(&inode->i_lock);
+ posix_acl_release(free);
+}
+
+struct generic_acl_operations shmem_acl_ops = {
+ .getacl = shmem_get_acl,
+ .setacl = shmem_set_acl,
+};
+
+/**
+ * shmem_list_acl_access, shmem_get_acl_access, shmem_set_acl_access,
+ * shmem_xattr_acl_access_handler - plumbing code to implement the
+ * system.posix_acl_access xattr using the generic acl functions.
+ */
+
+static size_t
+shmem_list_acl_access(struct inode *inode, char *list, size_t list_size,
+ const char *name, size_t name_len)
+{
+ return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_ACCESS,
+ list, list_size);
+}
+
+static int
+shmem_get_acl_access(struct inode *inode, const char *name, void *buffer,
+ size_t size)
+{
+ if (strcmp(name, "") != 0)
+ return -EINVAL;
+ return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, buffer,
+ size);
+}
+
+static int
+shmem_set_acl_access(struct inode *inode, const char *name, const void *value,
+ size_t size, int flags)
+{
+ if (strcmp(name, "") != 0)
+ return -EINVAL;
+ return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, value,
+ size);
+}
+
+struct xattr_handler shmem_xattr_acl_access_handler = {
+ .prefix = POSIX_ACL_XATTR_ACCESS,
+ .list = shmem_list_acl_access,
+ .get = shmem_get_acl_access,
+ .set = shmem_set_acl_access,
+};
+
+/**
+ * shmem_list_acl_default, shmem_get_acl_default, shmem_set_acl_default,
+ * shmem_xattr_acl_default_handler - plumbing code to implement the
+ * system.posix_acl_default xattr using the generic acl functions.
+ */
+
+static size_t
+shmem_list_acl_default(struct inode *inode, char *list, size_t list_size,
+ const char *name, size_t name_len)
+{
+ return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT,
+ list, list_size);
+}
+
+static int
+shmem_get_acl_default(struct inode *inode, const char *name, void *buffer,
+ size_t size)
+{
+ if (strcmp(name, "") != 0)
+ return -EINVAL;
+ return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, buffer,
+ size);
+}
+
+static int
+shmem_set_acl_default(struct inode *inode, const char *name, const void *value,
+ size_t size, int flags)
+{
+ if (strcmp(name, "") != 0)
+ return -EINVAL;
+ return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, value,
+ size);
+}
+
+struct xattr_handler shmem_xattr_acl_default_handler = {
+ .prefix = POSIX_ACL_XATTR_DEFAULT,
+ .list = shmem_list_acl_default,
+ .get = shmem_get_acl_default,
+ .set = shmem_set_acl_default,
+};
+
+/**
+ * shmem_acl_init - Initialize the acl(s) of a new inode
+ */
+int
+shmem_acl_init(struct inode *inode, struct inode *dir)
+{
+ return generic_acl_init(inode, dir, &shmem_acl_ops);
+}
+
+/**
+ * shmem_acl_destroy_inode - destroy acls hanging off the in-memory inode
+ *
+ * This is done before destroying the actual inode.
+ */
+
+void
+shmem_acl_destroy_inode(struct inode *inode)
+{
+ if (SHMEM_I(inode)->i_acl)
+ posix_acl_release(SHMEM_I(inode)->i_acl);
+ SHMEM_I(inode)->i_acl = NULL;
+ if (SHMEM_I(inode)->i_default_acl)
+ posix_acl_release(SHMEM_I(inode)->i_default_acl);
+ SHMEM_I(inode)->i_default_acl = NULL;
+}
+
+/**
+ * shmem_check_acl - check_acl() callback for generic_permission()
+ */
+static int
+shmem_check_acl(struct inode *inode, int mask)
+{
+ struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS);
+
+ if (acl) {
+ int error = posix_acl_permission(inode, acl, mask);
+ posix_acl_release(acl);
+ return error;
+ }
+ return -EAGAIN;
+}
+
+/**
+ * shmem_permission - permission() inode operation
+ */
+int
+shmem_permission(struct inode *inode, int mask, struct nameidata *nd)
+{
+ return generic_permission(inode, mask, shmem_check_acl);
+}
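
With the ACL and security xattr handlers wired into the tmpfs superblock, the effect is visible from userspace. The sketch below merely lists xattr names on a tmpfs path; which names appear (e.g. system.posix_acl_access or security.*) depends on what has actually been set on the file:

#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "/dev/shm";
	char buf[1024];
	ssize_t len = listxattr(path, buf, sizeof(buf));

	if (len < 0) {
		perror("listxattr");
		return 1;
	}
	for (ssize_t off = 0; off < len; off += strlen(buf + off) + 1)
		printf("%s\n", buf + off);	/* one attribute name per line */
	return 0;
}
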
diff --git a/mm/slab.c b/mm/slab.c
index 7a48eb1a60c..2c655532f5e 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -86,7 +86,6 @@
* All object allocations for a node occur from node specific slab lists.
*/
-#include <linux/config.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/poison.h>
@@ -104,12 +103,13 @@
#include <linux/module.h>
#include <linux/rcupdate.h>
#include <linux/string.h>
+#include <linux/uaccess.h>
#include <linux/nodemask.h>
#include <linux/mempolicy.h>
#include <linux/mutex.h>
+#include <linux/fault-inject.h>
#include <linux/rtmutex.h>
-#include <asm/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/page.h>
@@ -314,7 +314,7 @@ static int drain_freelist(struct kmem_cache *cache,
static void free_block(struct kmem_cache *cachep, void **objpp, int len,
int node);
static int enable_cpucache(struct kmem_cache *cachep);
-static void cache_reap(void *unused);
+static void cache_reap(struct work_struct *unused);
/*
* This function must be completely optimized away if a constant is passed to
@@ -731,7 +731,10 @@ static inline void init_lock_keys(void)
}
#endif
-/* Guard access to the cache-chain. */
+/*
+ * 1. Guard access to the cache-chain.
+ * 2. Protect sanity of cpu_online_map against cpu hotplug events
+ */
static DEFINE_MUTEX(cache_chain_mutex);
static struct list_head cache_chain;
@@ -754,7 +757,7 @@ int slab_is_available(void)
return g_cpucache_up == FULL;
}
-static DEFINE_PER_CPU(struct work_struct, reap_work);
+static DEFINE_PER_CPU(struct delayed_work, reap_work);
static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
{
@@ -867,6 +870,22 @@ static void __slab_error(const char *function, struct kmem_cache *cachep,
dump_stack();
}
+/*
+ * By default on NUMA we use alien caches to stage the freeing of
+ * objects allocated from other nodes. This causes massive memory
+ * inefficiencies when a fake NUMA setup is used to split memory into a
+ * large number of small nodes, so it can be disabled on the command
+ * line with the "noaliencache" boot parameter.
+ */
+
+static int use_alien_caches __read_mostly = 1;
+static int __init noaliencache_setup(char *s)
+{
+ use_alien_caches = 0;
+ return 1;
+}
+__setup("noaliencache", noaliencache_setup);
+
#ifdef CONFIG_NUMA
/*
* Special reaping functions for NUMA systems called from cache_reap().
@@ -884,7 +903,7 @@ static void init_reap_node(int cpu)
if (node == MAX_NUMNODES)
node = first_node(node_online_map);
- __get_cpu_var(reap_node) = node;
+ per_cpu(reap_node, cpu) = node;
}
static void next_reap_node(void)
@@ -917,17 +936,18 @@ static void next_reap_node(void)
*/
static void __devinit start_cpu_timer(int cpu)
{
- struct work_struct *reap_work = &per_cpu(reap_work, cpu);
+ struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
/*
* When this gets called from do_initcalls via cpucache_init(),
* init_workqueues() has already run, so keventd will be setup
* at that time.
*/
- if (keventd_up() && reap_work->func == NULL) {
+ if (keventd_up() && reap_work->work.func == NULL) {
init_reap_node(cpu);
- INIT_WORK(reap_work, cache_reap, NULL);
- schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
+ INIT_DELAYED_WORK(reap_work, cache_reap);
+ schedule_delayed_work_on(cpu, reap_work,
+ __round_jiffies_relative(HZ, cpu));
}
}
@@ -972,8 +992,40 @@ static int transfer_objects(struct array_cache *to,
return nr;
}
-#ifdef CONFIG_NUMA
-static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int);
+#ifndef CONFIG_NUMA
+
+#define drain_alien_cache(cachep, alien) do { } while (0)
+#define reap_alien(cachep, l3) do { } while (0)
+
+static inline struct array_cache **alloc_alien_cache(int node, int limit)
+{
+ return (struct array_cache **)BAD_ALIEN_MAGIC;
+}
+
+static inline void free_alien_cache(struct array_cache **ac_ptr)
+{
+}
+
+static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
+{
+ return 0;
+}
+
+static inline void *alternate_node_alloc(struct kmem_cache *cachep,
+ gfp_t flags)
+{
+ return NULL;
+}
+
+static inline void *____cache_alloc_node(struct kmem_cache *cachep,
+ gfp_t flags, int nodeid)
+{
+ return NULL;
+}
+
+#else /* CONFIG_NUMA */
+
+static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
static struct array_cache **alloc_alien_cache(int node, int limit)
@@ -1075,15 +1127,18 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
int nodeid = slabp->nodeid;
struct kmem_list3 *l3;
struct array_cache *alien = NULL;
+ int node;
+
+ node = numa_node_id();
/*
* Make sure we are not freeing a object from another node to the array
* cache on this cpu.
*/
- if (likely(slabp->nodeid == numa_node_id()))
+ if (likely(slabp->nodeid == node) || unlikely(!use_alien_caches))
return 0;
- l3 = cachep->nodelists[numa_node_id()];
+ l3 = cachep->nodelists[node];
STATS_INC_NODEFREES(cachep);
if (l3->alien && l3->alien[nodeid]) {
alien = l3->alien[nodeid];
@@ -1101,26 +1156,6 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
}
return 1;
}
-
-#else
-
-#define drain_alien_cache(cachep, alien) do { } while (0)
-#define reap_alien(cachep, l3) do { } while (0)
-
-static inline struct array_cache **alloc_alien_cache(int node, int limit)
-{
- return (struct array_cache **)BAD_ALIEN_MAGIC;
-}
-
-static inline void free_alien_cache(struct array_cache **ac_ptr)
-{
-}
-
-static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
-{
- return 0;
-}
-
#endif
static int __cpuinit cpuup_callback(struct notifier_block *nfb,
@@ -1178,7 +1213,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
list_for_each_entry(cachep, &cache_chain, next) {
struct array_cache *nc;
struct array_cache *shared;
- struct array_cache **alien;
+ struct array_cache **alien = NULL;
nc = alloc_arraycache(node, cachep->limit,
cachep->batchcount);
@@ -1190,9 +1225,11 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
if (!shared)
goto bad;
- alien = alloc_alien_cache(node, cachep->limit);
- if (!alien)
- goto bad;
+ if (use_alien_caches) {
+ alien = alloc_alien_cache(node, cachep->limit);
+ if (!alien)
+ goto bad;
+ }
cachep->array[cpu] = nc;
l3 = cachep->nodelists[node];
BUG_ON(!l3);
@@ -1216,12 +1253,18 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
kfree(shared);
free_alien_cache(alien);
}
- mutex_unlock(&cache_chain_mutex);
break;
case CPU_ONLINE:
+ mutex_unlock(&cache_chain_mutex);
start_cpu_timer(cpu);
break;
#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_DOWN_PREPARE:
+ mutex_lock(&cache_chain_mutex);
+ break;
+ case CPU_DOWN_FAILED:
+ mutex_unlock(&cache_chain_mutex);
+ break;
case CPU_DEAD:
/*
* Even if all the cpus of a node are down, we don't free the
@@ -1232,8 +1275,8 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
* gets destroyed at kmem_cache_destroy().
*/
/* fall thru */
+#endif
case CPU_UP_CANCELED:
- mutex_lock(&cache_chain_mutex);
list_for_each_entry(cachep, &cache_chain, next) {
struct array_cache *nc;
struct array_cache *shared;
@@ -1294,11 +1337,9 @@ free_array_cache:
}
mutex_unlock(&cache_chain_mutex);
break;
-#endif
}
return NOTIFY_OK;
bad:
- mutex_unlock(&cache_chain_mutex);
return NOTIFY_BAD;
}
@@ -1314,7 +1355,6 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
{
struct kmem_list3 *ptr;
- BUG_ON(cachep->nodelists[nodeid] != list);
ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid);
BUG_ON(!ptr);
@@ -1341,6 +1381,7 @@ void __init kmem_cache_init(void)
struct cache_names *names;
int i;
int order;
+ int node;
for (i = 0; i < NUM_INIT_LISTS; i++) {
kmem_list3_init(&initkmem_list3[i]);
@@ -1375,12 +1416,14 @@ void __init kmem_cache_init(void)
* 6) Resize the head arrays of the kmalloc caches to their final sizes.
*/
+ node = numa_node_id();
+
/* 1) create the cache_cache */
INIT_LIST_HEAD(&cache_chain);
list_add(&cache_cache.next, &cache_chain);
cache_cache.colour_off = cache_line_size();
cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
- cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE];
+ cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE];
cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
cache_line_size());
@@ -1485,19 +1528,18 @@ void __init kmem_cache_init(void)
}
/* 5) Replace the bootstrap kmem_list3's */
{
- int node;
+ int nid;
+
/* Replace the static kmem_list3 structures for the boot cpu */
- init_list(&cache_cache, &initkmem_list3[CACHE_CACHE],
- numa_node_id());
+ init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], node);
- for_each_online_node(node) {
+ for_each_online_node(nid) {
init_list(malloc_sizes[INDEX_AC].cs_cachep,
- &initkmem_list3[SIZE_AC + node], node);
+ &initkmem_list3[SIZE_AC + nid], nid);
if (INDEX_AC != INDEX_L3) {
init_list(malloc_sizes[INDEX_L3].cs_cachep,
- &initkmem_list3[SIZE_L3 + node],
- node);
+ &initkmem_list3[SIZE_L3 + nid], nid);
}
}
}
@@ -1564,6 +1606,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
*/
flags |= __GFP_COMP;
#endif
+
flags |= cachep->gfpflags;
page = alloc_pages_node(nodeid, flags, cachep->gfporder);
@@ -1665,10 +1708,32 @@ static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
static void dump_line(char *data, int offset, int limit)
{
int i;
+ unsigned char error = 0;
+ int bad_count = 0;
+
printk(KERN_ERR "%03x:", offset);
- for (i = 0; i < limit; i++)
+ for (i = 0; i < limit; i++) {
+ if (data[offset + i] != POISON_FREE) {
+ error = data[offset + i];
+ bad_count++;
+ }
printk(" %02x", (unsigned char)data[offset + i]);
+ }
printk("\n");
+
+ if (bad_count == 1) {
+ error ^= POISON_FREE;
+ if (!(error & (error - 1))) {
+ printk(KERN_ERR "Single bit error detected. Probably "
+ "bad RAM.\n");
+#ifdef CONFIG_X86
+ printk(KERN_ERR "Run memtest86+ or a similar memory "
+ "test tool.\n");
+#else
+ printk(KERN_ERR "Run a memory test tool.\n");
+#endif
+ }
+ }
}
#endif
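The single-bit check added to dump_line() above leans on two bit tricks: when exactly one byte differs from the slab poison pattern, XORing that byte with POISON_FREE leaves only the flipped bits set, and a non-zero value x has exactly one bit set precisely when (x & (x - 1)) is zero. A minimal user-space sketch of the same test, assuming POISON_FREE is the kernel's usual 0x6b free-object poison byte (the bad_count == 1 guard in the patch already guarantees a non-zero difference, so the explicit check below is only for the standalone sketch):

#include <stdio.h>

#define POISON_FREE 0x6b        /* assumed: the kernel's object-free poison byte */

/* Return 1 if byte differs from the poison pattern in exactly one bit. */
static int single_bit_error(unsigned char byte)
{
        unsigned char diff = byte ^ POISON_FREE;

        /* power-of-two test: exactly one bit set */
        return diff && !(diff & (diff - 1));
}

int main(void)
{
        printf("%d\n", single_bit_error(0x6b ^ 0x08)); /* 1: one bit flipped */
        printf("%d\n", single_bit_error(0x00));        /* 0: several bits differ */
        return 0;
}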
@@ -2055,15 +2120,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
}
/*
- * Prevent CPUs from coming and going.
- * lock_cpu_hotplug() nests outside cache_chain_mutex
+ * We use cache_chain_mutex to ensure a consistent view of
+ * cpu_online_map as well. Please see cpuup_callback
*/
- lock_cpu_hotplug();
-
mutex_lock(&cache_chain_mutex);
list_for_each_entry(pc, &cache_chain, next) {
- mm_segment_t old_fs = get_fs();
char tmp;
int res;
@@ -2072,9 +2134,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
* destroy its slab cache and no-one else reuses the vmalloc
* area of the module. Print a warning.
*/
- set_fs(KERNEL_DS);
- res = __get_user(tmp, pc->name);
- set_fs(old_fs);
+ res = probe_kernel_address(pc->name, tmp);
if (res) {
printk("SLAB: cache with size %d has lost its name\n",
pc->buffer_size);
@@ -2154,25 +2214,24 @@ kmem_cache_create (const char *name, size_t size, size_t align,
if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER)
ralign = BYTES_PER_WORD;
- /* 2) arch mandated alignment: disables debug if necessary */
+ /* 2) arch mandated alignment */
if (ralign < ARCH_SLAB_MINALIGN) {
ralign = ARCH_SLAB_MINALIGN;
- if (ralign > BYTES_PER_WORD)
- flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
}
- /* 3) caller mandated alignment: disables debug if necessary */
+ /* 3) caller mandated alignment */
if (ralign < align) {
ralign = align;
- if (ralign > BYTES_PER_WORD)
- flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
}
+ /* disable debug if necessary */
+ if (ralign > BYTES_PER_WORD)
+ flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
/*
* 4) Store it.
*/
align = ralign;
/* Get cache's description obj. */
- cachep = kmem_cache_zalloc(&cache_cache, SLAB_KERNEL);
+ cachep = kmem_cache_zalloc(&cache_cache, GFP_KERNEL);
if (!cachep)
goto oops;
@@ -2283,7 +2342,6 @@ oops:
panic("kmem_cache_create(): failed to create slab `%s'\n",
name);
mutex_unlock(&cache_chain_mutex);
- unlock_cpu_hotplug();
return cachep;
}
EXPORT_SYMBOL(kmem_cache_create);
@@ -2401,6 +2459,7 @@ out:
return nr_freed;
}
+/* Called with cache_chain_mutex held to protect against cpu hotplug */
static int __cache_shrink(struct kmem_cache *cachep)
{
int ret = 0, i = 0;
@@ -2431,9 +2490,13 @@ static int __cache_shrink(struct kmem_cache *cachep)
*/
int kmem_cache_shrink(struct kmem_cache *cachep)
{
+ int ret;
BUG_ON(!cachep || in_interrupt());
- return __cache_shrink(cachep);
+ mutex_lock(&cache_chain_mutex);
+ ret = __cache_shrink(cachep);
+ mutex_unlock(&cache_chain_mutex);
+ return ret;
}
EXPORT_SYMBOL(kmem_cache_shrink);
@@ -2442,7 +2505,6 @@ EXPORT_SYMBOL(kmem_cache_shrink);
* @cachep: the cache to destroy
*
* Remove a struct kmem_cache object from the slab cache.
- * Returns 0 on success.
*
* It is expected this function will be called by a module when it is
* unloaded. This will remove the cache completely, and avoid a duplicate
@@ -2454,36 +2516,28 @@ EXPORT_SYMBOL(kmem_cache_shrink);
 * The caller must guarantee that no one will allocate memory from the cache
* during the kmem_cache_destroy().
*/
-int kmem_cache_destroy(struct kmem_cache *cachep)
+void kmem_cache_destroy(struct kmem_cache *cachep)
{
BUG_ON(!cachep || in_interrupt());
- /* Don't let CPUs to come and go */
- lock_cpu_hotplug();
-
/* Find the cache in the chain of caches. */
mutex_lock(&cache_chain_mutex);
/*
* the chain is never empty, cache_cache is never destroyed
*/
list_del(&cachep->next);
- mutex_unlock(&cache_chain_mutex);
-
if (__cache_shrink(cachep)) {
slab_error(cachep, "Can't free all objects");
- mutex_lock(&cache_chain_mutex);
list_add(&cachep->next, &cache_chain);
mutex_unlock(&cache_chain_mutex);
- unlock_cpu_hotplug();
- return 1;
+ return;
}
if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
synchronize_rcu();
__kmem_cache_destroy(cachep);
- unlock_cpu_hotplug();
- return 0;
+ mutex_unlock(&cache_chain_mutex);
}
EXPORT_SYMBOL(kmem_cache_destroy);
@@ -2507,7 +2561,7 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
if (OFF_SLAB(cachep)) {
/* Slab management obj is off-slab. */
slabp = kmem_cache_alloc_node(cachep->slabp_cache,
- local_flags, nodeid);
+ local_flags & ~GFP_THISNODE, nodeid);
if (!slabp)
return NULL;
} else {
@@ -2577,7 +2631,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
{
- if (flags & SLAB_DMA)
+ if (flags & GFP_DMA)
BUG_ON(!(cachep->gfpflags & GFP_DMA));
else
BUG_ON(cachep->gfpflags & GFP_DMA);
@@ -2648,10 +2702,10 @@ static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
* Grow (by 1) the number of slabs within a cache. This is called by
* kmem_cache_alloc() when there are no active objs left in a cache.
*/
-static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
+static int cache_grow(struct kmem_cache *cachep,
+ gfp_t flags, int nodeid, void *objp)
{
struct slab *slabp;
- void *objp;
size_t offset;
gfp_t local_flags;
unsigned long ctor_flags;
@@ -2661,12 +2715,12 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
* Be lazy and only check for valid flags here, keeping it out of the
* critical path in kmem_cache_alloc().
*/
- BUG_ON(flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW));
- if (flags & SLAB_NO_GROW)
+ BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK | __GFP_NO_GROW));
+ if (flags & __GFP_NO_GROW)
return 0;
ctor_flags = SLAB_CTOR_CONSTRUCTOR;
- local_flags = (flags & SLAB_LEVEL_MASK);
+ local_flags = (flags & GFP_LEVEL_MASK);
if (!(local_flags & __GFP_WAIT))
/*
* Not allowed to sleep. Need to tell a constructor about
@@ -2703,12 +2757,14 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
* Get mem for the objs. Attempt to allocate a physical page from
* 'nodeid'.
*/
- objp = kmem_getpages(cachep, flags, nodeid);
+ if (!objp)
+ objp = kmem_getpages(cachep, flags, nodeid);
if (!objp)
goto failed;
/* Get slab management. */
- slabp = alloc_slabmgmt(cachep, objp, offset, local_flags, nodeid);
+ slabp = alloc_slabmgmt(cachep, objp, offset,
+ local_flags & ~GFP_THISNODE, nodeid);
if (!slabp)
goto opps1;
@@ -2881,6 +2937,9 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
int batchcount;
struct kmem_list3 *l3;
struct array_cache *ac;
+ int node;
+
+ node = numa_node_id();
check_irq_off();
ac = cpu_cache_get(cachep);
@@ -2894,7 +2953,7 @@ retry:
*/
batchcount = BATCHREFILL_LIMIT;
}
- l3 = cachep->nodelists[numa_node_id()];
+ l3 = cachep->nodelists[node];
BUG_ON(ac->avail > 0 || !l3);
spin_lock(&l3->list_lock);
@@ -2924,7 +2983,7 @@ retry:
STATS_SET_HIGH(cachep);
ac->entry[ac->avail++] = slab_get_obj(cachep, slabp,
- numa_node_id());
+ node);
}
check_slabp(cachep, slabp);
@@ -2943,7 +3002,7 @@ alloc_done:
if (unlikely(!ac->avail)) {
int x;
- x = cache_grow(cachep, flags, numa_node_id());
+ x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
/* cache_grow can reenable interrupts, then ac could change. */
ac = cpu_cache_get(cachep);
@@ -3019,26 +3078,101 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
cachep->ctor(objp, cachep, ctor_flags);
}
+#if ARCH_SLAB_MINALIGN
+ if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) {
+ printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
+ objp, ARCH_SLAB_MINALIGN);
+ }
+#endif
return objp;
}
#else
#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
#endif
+#ifdef CONFIG_FAILSLAB
+
+static struct failslab_attr {
+
+ struct fault_attr attr;
+
+ u32 ignore_gfp_wait;
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+ struct dentry *ignore_gfp_wait_file;
+#endif
+
+} failslab = {
+ .attr = FAULT_ATTR_INITIALIZER,
+ .ignore_gfp_wait = 1,
+};
+
+static int __init setup_failslab(char *str)
+{
+ return setup_fault_attr(&failslab.attr, str);
+}
+__setup("failslab=", setup_failslab);
+
+static int should_failslab(struct kmem_cache *cachep, gfp_t flags)
+{
+ if (cachep == &cache_cache)
+ return 0;
+ if (flags & __GFP_NOFAIL)
+ return 0;
+ if (failslab.ignore_gfp_wait && (flags & __GFP_WAIT))
+ return 0;
+
+ return should_fail(&failslab.attr, obj_size(cachep));
+}
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+static int __init failslab_debugfs(void)
+{
+ mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+ struct dentry *dir;
+ int err;
+
+ err = init_fault_attr_dentries(&failslab.attr, "failslab");
+ if (err)
+ return err;
+ dir = failslab.attr.dentries.dir;
+
+ failslab.ignore_gfp_wait_file =
+ debugfs_create_bool("ignore-gfp-wait", mode, dir,
+ &failslab.ignore_gfp_wait);
+
+ if (!failslab.ignore_gfp_wait_file) {
+ err = -ENOMEM;
+ debugfs_remove(failslab.ignore_gfp_wait_file);
+ cleanup_fault_attr_dentries(&failslab.attr);
+ }
+
+ return err;
+}
+
+late_initcall(failslab_debugfs);
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
+
+#else /* CONFIG_FAILSLAB */
+
+static inline int should_failslab(struct kmem_cache *cachep, gfp_t flags)
+{
+ return 0;
+}
+
+#endif /* CONFIG_FAILSLAB */
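The failslab hooks above plug into the kernel's generic fault-injection framework, so how the knob is driven is inherited from that framework rather than defined in this hunk. Going by its conventions (an assumption, not something this patch spells out), the boot parameter is parsed as interval,probability,space,times, e.g.

        failslab=1,10,0,-1

to inject failures into eligible slab allocations with roughly 10% probability and no cap on how many are injected; with CONFIG_FAULT_INJECTION_DEBUG_FS the ignore-gfp-wait boolean created above should then appear under the failslab directory in debugfs, so __GFP_WAIT allocations can be made eligible at runtime.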
+
static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
void *objp;
struct array_cache *ac;
-#ifdef CONFIG_NUMA
- if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
- objp = alternate_node_alloc(cachep, flags);
- if (objp != NULL)
- return objp;
- }
-#endif
-
check_irq_off();
+
+ if (should_failslab(cachep, flags))
+ return NULL;
+
ac = cpu_cache_get(cachep);
if (likely(ac->avail)) {
STATS_INC_ALLOCHIT(cachep);
@@ -3055,12 +3189,24 @@ static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
gfp_t flags, void *caller)
{
unsigned long save_flags;
- void *objp;
+ void *objp = NULL;
cache_alloc_debugcheck_before(cachep, flags);
local_irq_save(save_flags);
- objp = ____cache_alloc(cachep, flags);
+
+ if (unlikely(NUMA_BUILD &&
+ current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY)))
+ objp = alternate_node_alloc(cachep, flags);
+
+ if (!objp)
+ objp = ____cache_alloc(cachep, flags);
+ /*
+ * We may just have run out of memory on the local node.
+ * ____cache_alloc_node() knows how to locate memory on other nodes
+ */
+ if (NUMA_BUILD && !objp)
+ objp = ____cache_alloc_node(cachep, flags, numa_node_id());
local_irq_restore(save_flags);
objp = cache_alloc_debugcheck_after(cachep, flags, objp,
caller);
@@ -3079,7 +3225,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
{
int nid_alloc, nid_here;
- if (in_interrupt())
+ if (in_interrupt() || (flags & __GFP_THISNODE))
return NULL;
nid_alloc = nid_here = numa_node_id();
if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
@@ -3087,14 +3233,77 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
else if (current->mempolicy)
nid_alloc = slab_node(current->mempolicy);
if (nid_alloc != nid_here)
- return __cache_alloc_node(cachep, flags, nid_alloc);
+ return ____cache_alloc_node(cachep, flags, nid_alloc);
return NULL;
}
/*
+ * Fallback function if there was no memory available and no objects on a
+ * certain node and falling back is permitted. First we scan all the
+ * available nodelists for available objects. If that fails then we
+ * perform an allocation without specifying a node. This allows the page
+ * allocator to do its reclaim / fallback magic. We then insert the
+ * slab into the proper nodelist and then allocate from it.
+ */
+void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
+{
+ struct zonelist *zonelist = &NODE_DATA(slab_node(current->mempolicy))
+ ->node_zonelists[gfp_zone(flags)];
+ struct zone **z;
+ void *obj = NULL;
+ int nid;
+
+retry:
+ /*
+ * Look through allowed nodes for objects available
+ * from existing per node queues.
+ */
+ for (z = zonelist->zones; *z && !obj; z++) {
+ nid = zone_to_nid(*z);
+
+ if (cpuset_zone_allowed(*z, flags | __GFP_HARDWALL) &&
+ cache->nodelists[nid] &&
+ cache->nodelists[nid]->free_objects)
+ obj = ____cache_alloc_node(cache,
+ flags | GFP_THISNODE, nid);
+ }
+
+ if (!obj) {
+ /*
+ * This allocation will be performed within the constraints
+ * of the current cpuset / memory policy requirements.
+ * We may trigger various forms of reclaim on the allowed
+ * set and go into memory reserves if necessary.
+ */
+ obj = kmem_getpages(cache, flags, -1);
+ if (obj) {
+ /*
+ * Insert into the appropriate per node queues
+ */
+ nid = page_to_nid(virt_to_page(obj));
+ if (cache_grow(cache, flags, nid, obj)) {
+ obj = ____cache_alloc_node(cache,
+ flags | GFP_THISNODE, nid);
+ if (!obj)
+ /*
+ * Another processor may allocate the
+ * objects in the slab since we are
+ * not holding any locks.
+ */
+ goto retry;
+ } else {
+ kmem_freepages(cache, obj);
+ obj = NULL;
+ }
+ }
+ }
+ return obj;
+}
+
+/*
 * An interface to enable slab creation on nodeid
*/
-static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
+static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
int nodeid)
{
struct list_head *entry;
@@ -3143,12 +3352,16 @@ retry:
must_grow:
spin_unlock(&l3->list_lock);
- x = cache_grow(cachep, flags, nodeid);
+ x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
+ if (x)
+ goto retry;
- if (!x)
- return NULL;
+ if (!(flags & __GFP_THISNODE))
+ /* Unable to grow the cache. Fall back to other nodes. */
+ return fallback_alloc(cachep, flags);
+
+ return NULL;
- goto retry;
done:
return obj;
}
@@ -3357,35 +3570,59 @@ out:
* @flags: See kmalloc().
* @nodeid: node number of the target node.
*
- * Identical to kmem_cache_alloc, except that this function is slow
- * and can sleep. And it will allocate memory on the given node, which
- * can improve the performance for cpu bound structures.
- * New and improved: it will now make sure that the object gets
- * put on the correct node list so that there is no false sharing.
+ * Identical to kmem_cache_alloc but it will allocate memory on the given
+ * node, which can improve the performance for cpu bound structures.
+ *
+ * Fallback to other node is possible if __GFP_THISNODE is not set.
*/
-void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
+static __always_inline void *
+__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
+ int nodeid, void *caller)
{
unsigned long save_flags;
- void *ptr;
+ void *ptr = NULL;
cache_alloc_debugcheck_before(cachep, flags);
local_irq_save(save_flags);
- if (nodeid == -1 || nodeid == numa_node_id() ||
- !cachep->nodelists[nodeid])
- ptr = ____cache_alloc(cachep, flags);
- else
- ptr = __cache_alloc_node(cachep, flags, nodeid);
- local_irq_restore(save_flags);
+ if (unlikely(nodeid == -1))
+ nodeid = numa_node_id();
+
+ if (likely(cachep->nodelists[nodeid])) {
+ if (nodeid == numa_node_id()) {
+ /*
+ * Use the locally cached objects if possible.
+ * However ____cache_alloc does not allow fallback
+ * to other nodes. It may fail while we still have
+ * objects on other nodes available.
+ */
+ ptr = ____cache_alloc(cachep, flags);
+ }
+ if (!ptr) {
+ /* ____cache_alloc_node can fall back to other nodes */
+ ptr = ____cache_alloc_node(cachep, flags, nodeid);
+ }
+ } else {
+ /* Node not bootstrapped yet */
+ if (!(flags & __GFP_THISNODE))
+ ptr = fallback_alloc(cachep, flags);
+ }
- ptr = cache_alloc_debugcheck_after(cachep, flags, ptr,
- __builtin_return_address(0));
+ local_irq_restore(save_flags);
+ ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
return ptr;
}
+
+void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
+{
+ return __cache_alloc_node(cachep, flags, nodeid,
+ __builtin_return_address(0));
+}
EXPORT_SYMBOL(kmem_cache_alloc_node);
-void *__kmalloc_node(size_t size, gfp_t flags, int node)
+static __always_inline void *
+__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
{
struct kmem_cache *cachep;
@@ -3394,8 +3631,29 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
return NULL;
return kmem_cache_alloc_node(cachep, flags, node);
}
+
+#ifdef CONFIG_DEBUG_SLAB
+void *__kmalloc_node(size_t size, gfp_t flags, int node)
+{
+ return __do_kmalloc_node(size, flags, node,
+ __builtin_return_address(0));
+}
EXPORT_SYMBOL(__kmalloc_node);
-#endif
+
+void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
+ int node, void *caller)
+{
+ return __do_kmalloc_node(size, flags, node, caller);
+}
+EXPORT_SYMBOL(__kmalloc_node_track_caller);
+#else
+void *__kmalloc_node(size_t size, gfp_t flags, int node)
+{
+ return __do_kmalloc_node(size, flags, node, NULL);
+}
+EXPORT_SYMBOL(__kmalloc_node);
+#endif /* CONFIG_DEBUG_SLAB */
+#endif /* CONFIG_NUMA */
/**
* __do_kmalloc - allocate memory
@@ -3420,22 +3678,25 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
}
+#ifdef CONFIG_DEBUG_SLAB
void *__kmalloc(size_t size, gfp_t flags)
{
-#ifndef CONFIG_DEBUG_SLAB
- return __do_kmalloc(size, flags, NULL);
-#else
return __do_kmalloc(size, flags, __builtin_return_address(0));
-#endif
}
EXPORT_SYMBOL(__kmalloc);
-#ifdef CONFIG_DEBUG_SLAB
void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller)
{
return __do_kmalloc(size, flags, caller);
}
EXPORT_SYMBOL(__kmalloc_track_caller);
+
+#else
+void *__kmalloc(size_t size, gfp_t flags)
+{
+ return __do_kmalloc(size, flags, NULL);
+}
+EXPORT_SYMBOL(__kmalloc);
#endif
/**
@@ -3503,13 +3764,15 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
int node;
struct kmem_list3 *l3;
struct array_cache *new_shared;
- struct array_cache **new_alien;
+ struct array_cache **new_alien = NULL;
for_each_online_node(node) {
- new_alien = alloc_alien_cache(node, cachep->limit);
- if (!new_alien)
- goto fail;
+ if (use_alien_caches) {
+ new_alien = alloc_alien_cache(node, cachep->limit);
+ if (!new_alien)
+ goto fail;
+ }
new_shared = alloc_arraycache(node,
cachep->shared*cachep->batchcount,
@@ -3735,7 +3998,7 @@ void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
* If we cannot acquire the cache chain mutex then just give up - we'll try
* again on the next iteration.
*/
-static void cache_reap(void *unused)
+static void cache_reap(struct work_struct *unused)
{
struct kmem_cache *searchp;
struct kmem_list3 *l3;
@@ -3744,7 +4007,7 @@ static void cache_reap(void *unused)
if (!mutex_trylock(&cache_chain_mutex)) {
/* Give up. Setup the next iteration. */
schedule_delayed_work(&__get_cpu_var(reap_work),
- REAPTIMEOUT_CPUC);
+ round_jiffies_relative(REAPTIMEOUT_CPUC));
return;
}
@@ -3790,7 +4053,8 @@ next:
next_reap_node();
refresh_cpu_vm_stats(smp_processor_id());
/* Set up the next iteration */
- schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
+ schedule_delayed_work(&__get_cpu_var(reap_work),
+ round_jiffies_relative(REAPTIMEOUT_CPUC));
}
#ifdef CONFIG_PROC_FS
@@ -3958,7 +4222,7 @@ static int s_show(struct seq_file *m, void *p)
* + further values on SMP and with statistics enabled
*/
-struct seq_operations slabinfo_op = {
+const struct seq_operations slabinfo_op = {
.start = s_start,
.next = s_next,
.stop = s_stop,
@@ -4156,7 +4420,7 @@ static int leaks_show(struct seq_file *m, void *p)
return 0;
}
-struct seq_operations slabstats_op = {
+const struct seq_operations slabstats_op = {
.start = leaks_start,
.next = s_next,
.stop = s_stop,
diff --git a/mm/slob.c b/mm/slob.c
index 20188627347..542394184a5 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -270,10 +270,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
}
EXPORT_SYMBOL(kmem_cache_create);
-int kmem_cache_destroy(struct kmem_cache *c)
+void kmem_cache_destroy(struct kmem_cache *c)
{
slob_free(c, sizeof(struct kmem_cache));
- return 0;
}
EXPORT_SYMBOL(kmem_cache_destroy);
diff --git a/mm/sparse.c b/mm/sparse.c
index 86c52ab8087..ac26eb0d73c 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -24,6 +24,25 @@ struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
#endif
EXPORT_SYMBOL(mem_section);
+#ifdef NODE_NOT_IN_PAGE_FLAGS
+/*
+ * If we did not store the node number in the page then we have to
+ * do a lookup in the section_to_node_table in order to find which
+ * node the page belongs to.
+ */
+#if MAX_NUMNODES <= 256
+static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
+#else
+static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
+#endif
+
+int page_to_nid(struct page *page)
+{
+ return section_to_node_table[page_to_section(page)];
+}
+EXPORT_SYMBOL(page_to_nid);
+#endif
+
#ifdef CONFIG_SPARSEMEM_EXTREME
static struct mem_section *sparse_index_alloc(int nid)
{
@@ -49,6 +68,10 @@ static int sparse_index_init(unsigned long section_nr, int nid)
struct mem_section *section;
int ret = 0;
+#ifdef NODE_NOT_IN_PAGE_FLAGS
+ section_to_node_table[section_nr] = nid;
+#endif
+
if (mem_section[root])
return -EEXIST;
@@ -211,7 +234,7 @@ static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
struct page *page, *ret;
unsigned long memmap_size = sizeof(struct page) * nr_pages;
- page = alloc_pages(GFP_KERNEL, get_order(memmap_size));
+ page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size));
if (page)
goto got_map_page;
diff --git a/mm/swap.c b/mm/swap.c
index 2e0e871f542..2ed7be39795 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -57,9 +57,9 @@ static void put_compound_page(struct page *page)
{
page = (struct page *)page_private(page);
if (put_page_testzero(page)) {
- void (*dtor)(struct page *page);
+ compound_page_dtor *dtor;
- dtor = (void (*)(struct page *))page[1].lru.next;
+ dtor = get_compound_page_dtor(page);
(*dtor)(page);
}
}
@@ -216,7 +216,7 @@ void lru_add_drain(void)
}
#ifdef CONFIG_NUMA
-static void lru_add_drain_per_cpu(void *dummy)
+static void lru_add_drain_per_cpu(struct work_struct *dummy)
{
lru_add_drain();
}
@@ -226,7 +226,7 @@ static void lru_add_drain_per_cpu(void *dummy)
*/
int lru_add_drain_all(void)
{
- return schedule_on_each_cpu(lru_add_drain_per_cpu, NULL);
+ return schedule_on_each_cpu(lru_add_drain_per_cpu);
}
#else
@@ -514,5 +514,7 @@ void __init swap_setup(void)
 * Right now other parts of the system mean that we
* _really_ don't want to cluster much more
*/
+#ifdef CONFIG_HOTPLUG_CPU
hotcpu_notifier(cpu_swap_callback, 0);
+#endif
}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f1f5ec78378..b9fc0e5de6d 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -427,34 +427,48 @@ void free_swap_and_cache(swp_entry_t entry)
#ifdef CONFIG_SOFTWARE_SUSPEND
/*
- * Find the swap type that corresponds to given device (if any)
+ * Find the swap type that corresponds to given device (if any).
*
- * This is needed for software suspend and is done in such a way that inode
- * aliasing is allowed.
+ * @offset - number of the PAGE_SIZE-sized block of the device, starting
+ * from 0, in which the swap header is expected to be located.
+ *
+ * This is needed for the suspend to disk (aka swsusp).
*/
-int swap_type_of(dev_t device)
+int swap_type_of(dev_t device, sector_t offset)
{
+ struct block_device *bdev = NULL;
int i;
+ if (device)
+ bdev = bdget(device);
+
spin_lock(&swap_lock);
for (i = 0; i < nr_swapfiles; i++) {
- struct inode *inode;
+ struct swap_info_struct *sis = swap_info + i;
- if (!(swap_info[i].flags & SWP_WRITEOK))
+ if (!(sis->flags & SWP_WRITEOK))
continue;
- if (!device) {
+ if (!bdev) {
spin_unlock(&swap_lock);
return i;
}
- inode = swap_info[i].swap_file->f_dentry->d_inode;
- if (S_ISBLK(inode->i_mode) &&
- device == MKDEV(imajor(inode), iminor(inode))) {
- spin_unlock(&swap_lock);
- return i;
+ if (bdev == sis->bdev) {
+ struct swap_extent *se;
+
+ se = list_entry(sis->extent_list.next,
+ struct swap_extent, list);
+ if (se->start_block == offset) {
+ spin_unlock(&swap_lock);
+ bdput(bdev);
+ return i;
+ }
}
}
spin_unlock(&swap_lock);
+ if (bdev)
+ bdput(bdev);
+
return -ENODEV;
}
@@ -931,6 +945,23 @@ sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset)
}
}
+#ifdef CONFIG_SOFTWARE_SUSPEND
+/*
+ * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
+ * corresponding to given index in swap_info (swap type).
+ */
+sector_t swapdev_block(int swap_type, pgoff_t offset)
+{
+ struct swap_info_struct *sis;
+
+ if (swap_type >= nr_swapfiles)
+ return 0;
+
+ sis = swap_info + swap_type;
+ return (sis->flags & SWP_WRITEOK) ? map_swap_page(sis, offset) : 0;
+}
+#endif /* CONFIG_SOFTWARE_SUSPEND */
+
/*
* Free all of a swapdev's extent information
*/
@@ -1274,10 +1305,13 @@ static void *swap_start(struct seq_file *swap, loff_t *pos)
mutex_lock(&swapon_mutex);
+ if (!l)
+ return SEQ_START_TOKEN;
+
for (i = 0; i < nr_swapfiles; i++, ptr++) {
if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
continue;
- if (!l--)
+ if (!--l)
return ptr;
}
@@ -1286,10 +1320,17 @@ static void *swap_start(struct seq_file *swap, loff_t *pos)
static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
{
- struct swap_info_struct *ptr = v;
+ struct swap_info_struct *ptr;
struct swap_info_struct *endptr = swap_info + nr_swapfiles;
- for (++ptr; ptr < endptr; ptr++) {
+ if (v == SEQ_START_TOKEN)
+ ptr = swap_info;
+ else {
+ ptr = v;
+ ptr++;
+ }
+
+ for (; ptr < endptr; ptr++) {
if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
continue;
++*pos;
@@ -1310,14 +1351,16 @@ static int swap_show(struct seq_file *swap, void *v)
struct file *file;
int len;
- if (v == swap_info)
- seq_puts(swap, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
+ if (ptr == SEQ_START_TOKEN) {
+ seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
+ return 0;
+ }
file = ptr->swap_file;
- len = seq_path(swap, file->f_vfsmnt, file->f_dentry, " \t\n\\");
+ len = seq_path(swap, file->f_path.mnt, file->f_path.dentry, " \t\n\\");
seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
len < 40 ? 40 - len : 1, " ",
- S_ISBLK(file->f_dentry->d_inode->i_mode) ?
+ S_ISBLK(file->f_path.dentry->d_inode->i_mode) ?
"partition" : "file\t",
ptr->pages << (PAGE_SHIFT - 10),
ptr->inuse_pages << (PAGE_SHIFT - 10),
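The swap_start()/swap_next()/swap_show() rework above moves /proc/swaps onto the seq_file SEQ_START_TOKEN convention: ->start() hands back the token for position 0, ->show() prints the header line when it sees the token, and the real records begin with the first element after it. A hedged, stripped-down sketch of that convention (the item table and demo_* names are illustrative, not taken from the patch):

#include <linux/seq_file.h>
#include <linux/kernel.h>

static const char *const items[] = { "alpha", "beta", "gamma" };

static void *demo_start(struct seq_file *m, loff_t *pos)
{
        if (*pos == 0)
                return SEQ_START_TOKEN;         /* position 0 stands for the header */
        return *pos <= ARRAY_SIZE(items) ? (void *)&items[*pos - 1] : NULL;
}

static void *demo_next(struct seq_file *m, void *v, loff_t *pos)
{
        ++*pos;
        return *pos <= ARRAY_SIZE(items) ? (void *)&items[*pos - 1] : NULL;
}

static void demo_stop(struct seq_file *m, void *v)
{
}

static int demo_show(struct seq_file *m, void *v)
{
        if (v == SEQ_START_TOKEN) {
                seq_puts(m, "Name\n");          /* header row, emitted once */
                return 0;
        }
        seq_printf(m, "%s\n", *(const char *const *)v);
        return 0;
}

static const struct seq_operations demo_op = {
        .start  = demo_start,
        .next   = demo_next,
        .stop   = demo_stop,
        .show   = demo_show,
};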
@@ -1325,7 +1368,7 @@ static int swap_show(struct seq_file *swap, void *v)
return 0;
}
-static struct seq_operations swaps_op = {
+static const struct seq_operations swaps_op = {
.start = swap_start,
.next = swap_next,
.stop = swap_stop,
@@ -1337,7 +1380,7 @@ static int swaps_open(struct inode *inode, struct file *file)
return seq_open(file, &swaps_op);
}
-static struct file_operations proc_swaps_operations = {
+static const struct file_operations proc_swaps_operations = {
.open = swaps_open,
.read = seq_read,
.llseek = seq_lseek,
@@ -1540,6 +1583,11 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
error = -EINVAL;
if (!maxpages)
goto bad_swap;
+ if (swapfilesize && maxpages > swapfilesize) {
+ printk(KERN_WARNING
+ "Swap area shorter than signature indicates\n");
+ goto bad_swap;
+ }
if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
goto bad_swap;
if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
@@ -1567,12 +1615,6 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
goto bad_swap;
}
- if (swapfilesize && maxpages > swapfilesize) {
- printk(KERN_WARNING
- "Swap area shorter than signature indicates\n");
- error = -EINVAL;
- goto bad_swap;
- }
if (nr_good_pages) {
p->swap_map[0] = SWAP_MAP_BAD;
p->max = maxpages;
@@ -1723,13 +1765,14 @@ get_swap_info_struct(unsigned type)
*/
int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
{
- int ret = 0, i = 1 << page_cluster;
+ int our_page_cluster = page_cluster;
+ int ret = 0, i = 1 << our_page_cluster;
unsigned long toff;
struct swap_info_struct *swapdev = swp_type(entry) + swap_info;
- if (!page_cluster) /* no readahead */
+ if (!our_page_cluster) /* no readahead */
return 0;
- toff = (swp_offset(entry) >> page_cluster) << page_cluster;
+ toff = (swp_offset(entry) >> our_page_cluster) << our_page_cluster;
if (!toff) /* first page is swap header */
toff++, i--;
*offset = toff;
diff --git a/mm/thrash.c b/mm/thrash.c
index f4c560b4a2b..9ef9071f99b 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -7,100 +7,74 @@
*
* Simple token based thrashing protection, using the algorithm
* described in: http://www.cs.wm.edu/~sjiang/token.pdf
+ *
+ * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com>
+ * Improved algorithm to pass token:
+ * Each task has a priority which is incremented if it contended
+ * for the token in an interval less than its previous attempt.
+ * If the token is acquired, that task's priority is boosted to prevent
+ * the token from bouncing around too often and to let the task make
+ * some progress in its execution.
*/
+
#include <linux/jiffies.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/swap.h>
static DEFINE_SPINLOCK(swap_token_lock);
-static unsigned long swap_token_timeout;
-static unsigned long swap_token_check;
-struct mm_struct * swap_token_mm = &init_mm;
-
-#define SWAP_TOKEN_CHECK_INTERVAL (HZ * 2)
-#define SWAP_TOKEN_TIMEOUT (300 * HZ)
-/*
- * Currently disabled; Needs further code to work at HZ * 300.
- */
-unsigned long swap_token_default_timeout = SWAP_TOKEN_TIMEOUT;
-
-/*
- * Take the token away if the process had no page faults
- * in the last interval, or if it has held the token for
- * too long.
- */
-#define SWAP_TOKEN_ENOUGH_RSS 1
-#define SWAP_TOKEN_TIMED_OUT 2
-static int should_release_swap_token(struct mm_struct *mm)
-{
- int ret = 0;
- if (!mm->recent_pagein)
- ret = SWAP_TOKEN_ENOUGH_RSS;
- else if (time_after(jiffies, swap_token_timeout))
- ret = SWAP_TOKEN_TIMED_OUT;
- mm->recent_pagein = 0;
- return ret;
-}
+struct mm_struct *swap_token_mm;
+static unsigned int global_faults;
-/*
- * Try to grab the swapout protection token. We only try to
- * grab it once every TOKEN_CHECK_INTERVAL, both to prevent
- * SMP lock contention and to check that the process that held
- * the token before is no longer thrashing.
- */
void grab_swap_token(void)
{
- struct mm_struct *mm;
- int reason;
+ int current_interval;
- /* We have the token. Let others know we still need it. */
- if (has_swap_token(current->mm)) {
- current->mm->recent_pagein = 1;
- if (unlikely(!swap_token_default_timeout))
- disable_swap_token();
- return;
- }
-
- if (time_after(jiffies, swap_token_check)) {
+ global_faults++;
- if (!swap_token_default_timeout) {
- swap_token_check = jiffies + SWAP_TOKEN_CHECK_INTERVAL;
- return;
- }
-
- /* ... or if we recently held the token. */
- if (time_before(jiffies, current->mm->swap_token_time))
- return;
+ current_interval = global_faults - current->mm->faultstamp;
- if (!spin_trylock(&swap_token_lock))
- return;
+ if (!spin_trylock(&swap_token_lock))
+ return;
- swap_token_check = jiffies + SWAP_TOKEN_CHECK_INTERVAL;
+ /* First come first served */
+ if (swap_token_mm == NULL) {
+ current->mm->token_priority = current->mm->token_priority + 2;
+ swap_token_mm = current->mm;
+ goto out;
+ }
- mm = swap_token_mm;
- if ((reason = should_release_swap_token(mm))) {
- unsigned long eligible = jiffies;
- if (reason == SWAP_TOKEN_TIMED_OUT) {
- eligible += swap_token_default_timeout;
- }
- mm->swap_token_time = eligible;
- swap_token_timeout = jiffies + swap_token_default_timeout;
+ if (current->mm != swap_token_mm) {
+ if (current_interval < current->mm->last_interval)
+ current->mm->token_priority++;
+ else {
+ current->mm->token_priority--;
+ if (unlikely(current->mm->token_priority < 0))
+ current->mm->token_priority = 0;
+ }
+ /* Check if we deserve the token */
+ if (current->mm->token_priority >
+ swap_token_mm->token_priority) {
+ current->mm->token_priority += 2;
swap_token_mm = current->mm;
}
- spin_unlock(&swap_token_lock);
+ } else {
+ /* Token holder came in again! */
+ current->mm->token_priority += 2;
}
- return;
+
+out:
+ current->mm->faultstamp = global_faults;
+ current->mm->last_interval = current_interval;
+ spin_unlock(&swap_token_lock);
+ return;
}
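As a concrete reading of the rule implemented above: a task that faults again sooner than its previous inter-fault interval gains priority, otherwise it loses a point (never going below zero), and the token only changes hands when a contender outranks the current holder, who gets a +2 boost to damp ping-ponging. A hedged single-threaded model of that decision (locking dropped, field names borrowed from the patch, everything else illustrative):

struct mm_model {
        unsigned int token_priority;
        unsigned int faultstamp;
        unsigned int last_interval;
};

static struct mm_model *token_holder;
static unsigned int global_faults;

static void model_grab_token(struct mm_model *mm)
{
        unsigned int interval = ++global_faults - mm->faultstamp;

        if (!token_holder) {
                mm->token_priority += 2;        /* first come, first served */
                token_holder = mm;
        } else if (mm != token_holder) {
                if (interval < mm->last_interval)
                        mm->token_priority++;   /* thrashing harder than before */
                else if (mm->token_priority)
                        mm->token_priority--;
                if (mm->token_priority > token_holder->token_priority) {
                        mm->token_priority += 2;
                        token_holder = mm;      /* contender outranks the holder */
                }
        } else {
                mm->token_priority += 2;        /* holder faulted again: boost it */
        }

        mm->faultstamp = global_faults;
        mm->last_interval = interval;
}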
/* Called on process exit. */
void __put_swap_token(struct mm_struct *mm)
{
spin_lock(&swap_token_lock);
- if (likely(mm == swap_token_mm)) {
- mm->swap_token_time = jiffies + SWAP_TOKEN_CHECK_INTERVAL;
- swap_token_mm = &init_mm;
- swap_token_check = jiffies;
- }
+ if (likely(mm == swap_token_mm))
+ swap_token_mm = NULL;
spin_unlock(&swap_token_lock);
}
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index 5f2cbf0f153..c7f6e1914bc 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -79,8 +79,8 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
d_instantiate(dentry, inode);
inode->i_nlink = 0; /* It is unlinked */
- file->f_vfsmnt = mntget(shm_mnt);
- file->f_dentry = dentry;
+ file->f_path.mnt = mntget(shm_mnt);
+ file->f_path.dentry = dentry;
file->f_mapping = inode->i_mapping;
file->f_op = &ramfs_file_operations;
file->f_mode = FMODE_WRITE | FMODE_READ;
diff --git a/mm/truncate.c b/mm/truncate.c
index c6ab55ec688..9bfb8e85386 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -9,13 +9,41 @@
#include <linux/kernel.h>
#include <linux/mm.h>
+#include <linux/swap.h>
#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
+#include <linux/task_io_accounting_ops.h>
#include <linux/buffer_head.h> /* grr. try_to_release_page,
do_invalidatepage */
+/**
+ * do_invalidatepage - invalidate part or all of a page
+ * @page: the page which is affected
+ * @offset: the index of the truncation point
+ *
+ * do_invalidatepage() is called when all or part of the page has become
+ * invalidated by a truncate operation.
+ *
+ * do_invalidatepage() does not have to release all buffers, but it must
+ * ensure that no dirty buffer is left outside @offset and that no I/O
+ * is underway against any of the blocks which are outside the truncation
+ * point, because the caller is about to free (and possibly reuse) those
+ * blocks on-disk.
+ */
+void do_invalidatepage(struct page *page, unsigned long offset)
+{
+ void (*invalidatepage)(struct page *, unsigned long);
+ invalidatepage = page->mapping->a_ops->invalidatepage;
+#ifdef CONFIG_BLOCK
+ if (!invalidatepage)
+ invalidatepage = block_invalidatepage;
+#endif
+ if (invalidatepage)
+ (*invalidatepage)(page, offset);
+}
+
static inline void truncate_partial_page(struct page *page, unsigned partial)
{
memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
@@ -42,7 +70,8 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
if (PagePrivate(page))
do_invalidatepage(page, 0);
- clear_page_dirty(page);
+ if (test_clear_page_dirty(page))
+ task_io_account_cancelled_write(PAGE_CACHE_SIZE);
ClearPageUptodate(page);
ClearPageMappedToDisk(page);
remove_from_page_cache(page);
@@ -52,36 +81,25 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
/*
* This is for invalidate_inode_pages(). That function can be called at
* any time, and is not supposed to throw away dirty pages. But pages can
- * be marked dirty at any time too. So we re-check the dirtiness inside
- * ->tree_lock. That provides exclusion against the __set_page_dirty
- * functions.
+ * be marked dirty at any time too, so use remove_mapping which safely
+ * discards clean, unused pages.
*
* Returns non-zero if the page was successfully invalidated.
*/
static int
invalidate_complete_page(struct address_space *mapping, struct page *page)
{
+ int ret;
+
if (page->mapping != mapping)
return 0;
if (PagePrivate(page) && !try_to_release_page(page, 0))
return 0;
- write_lock_irq(&mapping->tree_lock);
- if (PageDirty(page))
- goto failed;
- if (page_count(page) != 2) /* caller's ref + pagecache ref */
- goto failed;
+ ret = remove_mapping(mapping, page);
- BUG_ON(PagePrivate(page));
- __remove_from_page_cache(page);
- write_unlock_irq(&mapping->tree_lock);
- ClearPageUptodate(page);
- page_cache_release(page); /* pagecache ref */
- return 1;
-failed:
- write_unlock_irq(&mapping->tree_lock);
- return 0;
+ return ret;
}
/**
@@ -270,9 +288,39 @@ unsigned long invalidate_inode_pages(struct address_space *mapping)
{
return invalidate_mapping_pages(mapping, 0, ~0UL);
}
-
EXPORT_SYMBOL(invalidate_inode_pages);
+/*
+ * This is like invalidate_complete_page(), except it ignores the page's
+ * refcount. We do this because invalidate_inode_pages2() needs stronger
+ * invalidation guarantees, and cannot afford to leave pages behind because
+ * shrink_list() has a temp ref on them, or because they're transiently sitting
+ * in the lru_cache_add() pagevecs.
+ */
+static int
+invalidate_complete_page2(struct address_space *mapping, struct page *page)
+{
+ if (page->mapping != mapping)
+ return 0;
+
+ if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL))
+ return 0;
+
+ write_lock_irq(&mapping->tree_lock);
+ if (PageDirty(page))
+ goto failed;
+
+ BUG_ON(PagePrivate(page));
+ __remove_from_page_cache(page);
+ write_unlock_irq(&mapping->tree_lock);
+ ClearPageUptodate(page);
+ page_cache_release(page); /* pagecache ref */
+ return 1;
+failed:
+ write_unlock_irq(&mapping->tree_lock);
+ return 0;
+}
+
/**
* invalidate_inode_pages2_range - remove range of pages from an address_space
* @mapping: the address_space
@@ -339,7 +387,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
}
}
was_dirty = test_clear_page_dirty(page);
- if (!invalidate_complete_page(mapping, page)) {
+ if (!invalidate_complete_page2(mapping, page)) {
if (was_dirty)
set_page_dirty(page);
ret = -EIO;
@@ -349,6 +397,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
pagevec_release(&pvec);
cond_resched();
}
+ WARN_ON_ONCE(ret);
return ret;
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
diff --git a/mm/util.c b/mm/util.c
index 7368479220b..ace2aea69f1 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -11,7 +11,7 @@
*/
void *__kzalloc(size_t size, gfp_t flags)
{
- void *ret = ____kmalloc(size, flags);
+ void *ret = kmalloc_track_caller(size, flags);
if (ret)
memset(ret, 0, size);
return ret;
@@ -33,13 +33,31 @@ char *kstrdup(const char *s, gfp_t gfp)
return NULL;
len = strlen(s) + 1;
- buf = ____kmalloc(len, gfp);
+ buf = kmalloc_track_caller(len, gfp);
if (buf)
memcpy(buf, s, len);
return buf;
}
EXPORT_SYMBOL(kstrdup);
+/**
+ * kmemdup - duplicate region of memory
+ *
+ * @src: memory region to duplicate
+ * @len: memory region length
+ * @gfp: GFP mask to use
+ */
+void *kmemdup(const void *src, size_t len, gfp_t gfp)
+{
+ void *p;
+
+ p = kmalloc_track_caller(len, gfp);
+ if (p)
+ memcpy(p, src, len);
+ return p;
+}
+EXPORT_SYMBOL(kmemdup);
+
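kmemdup() is to arbitrary buffers what kstrdup() is to strings: allocate with the caller's gfp mask and copy len bytes. A hedged caller-side sketch (the blob structure and blob_set() are illustrative, not part of the patch; header locations are assumed):

#include <linux/slab.h>
#include <linux/string.h>
#include <linux/errno.h>

struct blob {
        void *data;
        size_t len;
};

/* Replace the blob's payload with a private copy of an external buffer. */
static int blob_set(struct blob *b, const void *src, size_t len, gfp_t gfp)
{
        void *copy = kmemdup(src, len, gfp);

        if (!copy)
                return -ENOMEM;
        kfree(b->data);
        b->data = copy;
        b->len = len;
        return 0;
}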
/*
* strndup_user - duplicate an existing string from user space
*
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 9aad8b0cc6e..86897ee792d 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -160,13 +160,15 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
return err;
}
-struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags,
- unsigned long start, unsigned long end, int node)
+static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags,
+ unsigned long start, unsigned long end,
+ int node, gfp_t gfp_mask)
{
struct vm_struct **p, *tmp, *area;
unsigned long align = 1;
unsigned long addr;
+ BUG_ON(in_interrupt());
if (flags & VM_IOREMAP) {
int bit = fls(size);
@@ -179,15 +181,12 @@ struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags,
}
addr = ALIGN(start, align);
size = PAGE_ALIGN(size);
-
- area = kmalloc_node(sizeof(*area), GFP_KERNEL, node);
- if (unlikely(!area))
+ if (unlikely(!size))
return NULL;
- if (unlikely(!size)) {
- kfree (area);
+ area = kmalloc_node(sizeof(*area), gfp_mask & GFP_LEVEL_MASK, node);
+ if (unlikely(!area))
return NULL;
- }
/*
* We always allocate a guard page.
@@ -236,12 +235,11 @@ out:
struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
unsigned long start, unsigned long end)
{
- return __get_vm_area_node(size, flags, start, end, -1);
+ return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL);
}
/**
 * get_vm_area - reserve a contiguous kernel virtual area
- *
* @size: size of the area
* @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC
*
@@ -254,9 +252,11 @@ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
return __get_vm_area(size, flags, VMALLOC_START, VMALLOC_END);
}
-struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int node)
+struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
+ int node, gfp_t gfp_mask)
{
- return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node);
+ return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node,
+ gfp_mask);
}
/* Caller must hold vmlist_lock */
@@ -273,7 +273,7 @@ static struct vm_struct *__find_vm_area(void *addr)
}
/* Caller must hold vmlist_lock */
-struct vm_struct *__remove_vm_area(void *addr)
+static struct vm_struct *__remove_vm_area(void *addr)
{
struct vm_struct **p, *tmp;
@@ -296,7 +296,6 @@ found:
/**
 * remove_vm_area - find and remove a contiguous kernel virtual area
- *
* @addr: base address
*
* Search for the kernel VM area starting at @addr, and remove it.
@@ -355,7 +354,6 @@ void __vunmap(void *addr, int deallocate_pages)
/**
* vfree - release memory allocated by vmalloc()
- *
* @addr: memory base address
*
* Free the virtually contiguous memory area starting at @addr, as
@@ -373,7 +371,6 @@ EXPORT_SYMBOL(vfree);
/**
* vunmap - release virtual mapping obtained by vmap()
- *
* @addr: memory base address
*
* Free the virtually contiguous memory area starting at @addr,
@@ -390,7 +387,6 @@ EXPORT_SYMBOL(vunmap);
/**
* vmap - map an array of pages into virtually contiguous space
- *
* @pages: array of page pointers
* @count: number of pages to map
* @flags: vm_area->flags
@@ -433,8 +429,11 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
if (array_size > PAGE_SIZE) {
pages = __vmalloc_node(array_size, gfp_mask, PAGE_KERNEL, node);
area->flags |= VM_VPAGES;
- } else
- pages = kmalloc_node(array_size, (gfp_mask & ~__GFP_HIGHMEM), node);
+ } else {
+ pages = kmalloc_node(array_size,
+ (gfp_mask & ~(__GFP_HIGHMEM | __GFP_ZERO)),
+ node);
+ }
area->pages = pages;
if (!area->pages) {
remove_vm_area(area->addr);
@@ -471,7 +470,6 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
/**
* __vmalloc_node - allocate virtually contiguous memory
- *
* @size: allocation size
* @gfp_mask: flags for the page level allocator
* @prot: protection mask for the allocated pages
@@ -490,7 +488,7 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
if (!size || (size >> PAGE_SHIFT) > num_physpages)
return NULL;
- area = get_vm_area_node(size, VM_ALLOC, node);
+ area = get_vm_area_node(size, VM_ALLOC, node, gfp_mask);
if (!area)
return NULL;
@@ -505,13 +503,11 @@ EXPORT_SYMBOL(__vmalloc);
/**
* vmalloc - allocate virtually contiguous memory
- *
* @size: allocation size
- *
* Allocate enough pages to cover @size from the page level
* allocator and map them into contiguous kernel virtual space.
*
- * For tight cotrol over page level allocator and protection flags
+ * For tight control over page level allocator and protection flags
* use __vmalloc() instead.
*/
void *vmalloc(unsigned long size)
@@ -521,11 +517,11 @@ void *vmalloc(unsigned long size)
EXPORT_SYMBOL(vmalloc);
/**
- * vmalloc_user - allocate virtually contiguous memory which has
- * been zeroed so it can be mapped to userspace without
- * leaking data.
+ * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
+ * @size: allocation size
*
- * @size: allocation size
+ * The resulting memory area is zeroed so it can be mapped to userspace
+ * without leaking data.
*/
void *vmalloc_user(unsigned long size)
{
@@ -533,25 +529,25 @@ void *vmalloc_user(unsigned long size)
void *ret;
ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
- write_lock(&vmlist_lock);
- area = __find_vm_area(ret);
- area->flags |= VM_USERMAP;
- write_unlock(&vmlist_lock);
-
+ if (ret) {
+ write_lock(&vmlist_lock);
+ area = __find_vm_area(ret);
+ area->flags |= VM_USERMAP;
+ write_unlock(&vmlist_lock);
+ }
return ret;
}
EXPORT_SYMBOL(vmalloc_user);
/**
* vmalloc_node - allocate memory on a specific node
- *
* @size: allocation size
* @node: numa node
*
* Allocate enough pages to cover @size from the page level
* allocator and map them into contiguous kernel virtual space.
*
- * For tight cotrol over page level allocator and protection flags
+ * For tight control over page level allocator and protection flags
* use __vmalloc() instead.
*/
void *vmalloc_node(unsigned long size, int node)
@@ -566,14 +562,13 @@ EXPORT_SYMBOL(vmalloc_node);
/**
* vmalloc_exec - allocate virtually contiguous, executable memory
- *
* @size: allocation size
*
* Kernel-internal function to allocate enough pages to cover @size
* the page level allocator and map them into contiguous and
* executable kernel virtual space.
*
- * For tight cotrol over page level allocator and protection flags
+ * For tight control over page level allocator and protection flags
* use __vmalloc() instead.
*/
@@ -584,7 +579,6 @@ void *vmalloc_exec(unsigned long size)
/**
* vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
- *
* @size: allocation size
*
* Allocate enough 32bit PA addressable pages to cover @size from the
@@ -597,11 +591,11 @@ void *vmalloc_32(unsigned long size)
EXPORT_SYMBOL(vmalloc_32);
/**
- * vmalloc_32_user - allocate virtually contiguous memory (32bit
- * addressable) which is zeroed so it can be
- * mapped to userspace without leaking data.
- *
+ * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
* @size: allocation size
+ *
+ * The resulting memory area is 32bit addressable and zeroed so it can be
+ * mapped to userspace without leaking data.
*/
void *vmalloc_32_user(unsigned long size)
{
@@ -609,11 +603,12 @@ void *vmalloc_32_user(unsigned long size)
void *ret;
ret = __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
- write_lock(&vmlist_lock);
- area = __find_vm_area(ret);
- area->flags |= VM_USERMAP;
- write_unlock(&vmlist_lock);
-
+ if (ret) {
+ write_lock(&vmlist_lock);
+ area = __find_vm_area(ret);
+ area->flags |= VM_USERMAP;
+ write_unlock(&vmlist_lock);
+ }
return ret;
}
EXPORT_SYMBOL(vmalloc_32_user);
@@ -695,7 +690,6 @@ finished:
/**
* remap_vmalloc_range - map vmalloc pages to userspace
- *
* @vma: vma to cover (map full range of vma)
* @addr: vmalloc memory
* @pgoff: number of pages into addr before first page to map
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 87779dda4ec..093f5fe6dd7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -19,6 +19,7 @@
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
+#include <linux/vmstat.h>
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
@@ -35,6 +36,7 @@
#include <linux/rwsem.h>
#include <linux/delay.h>
#include <linux/kthread.h>
+#include <linux/freezer.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -370,24 +372,49 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
/* synchronous write or broken a_ops? */
ClearPageReclaim(page);
}
-
+ inc_zone_page_state(page, NR_VMSCAN_WRITE);
return PAGE_SUCCESS;
}
return PAGE_CLEAN;
}
+/*
+ * Attempt to detach a locked page from its ->mapping. If it is dirty or if
+ * someone else has a ref on the page, abort and return 0. If it was
+ * successfully detached, return 1. Assumes the caller has a single ref on
+ * this page.
+ */
int remove_mapping(struct address_space *mapping, struct page *page)
{
BUG_ON(!PageLocked(page));
BUG_ON(mapping != page_mapping(page));
write_lock_irq(&mapping->tree_lock);
-
/*
- * The non-racy check for busy page. It is critical to check
- * PageDirty _after_ making sure that the page is freeable and
- * not in use by anybody. (pagecache + us == 2)
+ * The non-racy check for a busy page.
+ *
+ * Must be careful with the order of the tests. When someone has
+ * a ref to the page, it may be possible that they dirty it then
+ * drop the reference. So if PageDirty is tested before page_count
+ * here, then the following race may occur:
+ *
+ * get_user_pages(&page);
+ * [user mapping goes away]
+ * write_to(page);
+ * !PageDirty(page) [good]
+ * SetPageDirty(page);
+ * put_page(page);
+ * !page_count(page) [good, discard it]
+ *
+ * [oops, our write_to data is lost]
+ *
+ * Reversing the order of the tests ensures such a situation cannot
+ * escape unnoticed. The smp_rmb is needed to ensure the page->flags
+ * load is not satisfied before that of page->_count.
+ *
+ * Note that if SetPageDirty is always performed via set_page_dirty,
+ * and thus under tree_lock, then this ordering is not required.
*/
if (unlikely(page_count(page) != 2))
goto cannot_free;
@@ -697,6 +724,20 @@ done:
return nr_reclaimed;
}
+/*
+ * We are about to scan this zone at a certain priority level. If that priority
+ * level is smaller (ie: more urgent) than the previous priority, then note
+ * that priority level within the zone. This is done so that when the next
+ * process comes in to scan this zone, it will immediately start out at this
+ * priority level rather than having to build up its own scanning priority.
+ * Here, this priority affects only the reclaim-mapped threshold.
+ */
+static inline void note_zone_scanning_priority(struct zone *zone, int priority)
+{
+ if (priority < zone->prev_priority)
+ zone->prev_priority = priority;
+}
+
static inline int zone_is_near_oom(struct zone *zone)
{
return zone->pages_scanned >= (zone->nr_active + zone->nr_inactive)*3;
@@ -720,7 +761,7 @@ static inline int zone_is_near_oom(struct zone *zone)
* But we had to alter page->flags anyway.
*/
static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
- struct scan_control *sc)
+ struct scan_control *sc, int priority)
{
unsigned long pgmoved;
int pgdeactivate = 0;
@@ -744,7 +785,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
* `distress' is a measure of how much trouble we're having
* reclaiming pages. 0 -> no problems. 100 -> great trouble.
*/
- distress = 100 >> zone->prev_priority;
+ distress = 100 >> min(zone->prev_priority, priority);
/*
* The point of this algorithm is to decide when to start
@@ -896,7 +937,7 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
nr_to_scan = min(nr_active,
(unsigned long)sc->swap_cluster_max);
nr_active -= nr_to_scan;
- shrink_active_list(nr_to_scan, zone, sc);
+ shrink_active_list(nr_to_scan, zone, sc, priority);
}
if (nr_inactive) {
@@ -946,9 +987,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
continue;
- zone->temp_priority = priority;
- if (zone->prev_priority > priority)
- zone->prev_priority = priority;
+ note_zone_scanning_priority(zone, priority);
if (zone->all_unreclaimable && priority != DEF_PRIORITY)
continue; /* Let kswapd poll it */
@@ -998,7 +1037,6 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
continue;
- zone->temp_priority = DEF_PRIORITY;
lru_pages += zone->nr_active + zone->nr_inactive;
}
@@ -1033,19 +1071,28 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
/* Take a nap, wait for some writeback to complete */
if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
- blk_congestion_wait(WRITE, HZ/10);
+ congestion_wait(WRITE, HZ/10);
}
/* top priority shrink_caches still had more to do? don't OOM, then */
if (!sc.all_unreclaimable)
ret = 1;
out:
+ /*
+ * Now that we've scanned all the zones at this priority level, note
+ * that level within the zone so that the next thread which performs
+ * scanning of this zone will immediately start out at this priority
+ * level. This affects only the decision whether or not to bring
+ * mapped pages onto the inactive list.
+ */
+ if (priority < 0)
+ priority = 0;
for (i = 0; zones[i] != 0; i++) {
struct zone *zone = zones[i];
if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
continue;
- zone->prev_priority = zone->temp_priority;
+ zone->prev_priority = priority;
}
return ret;
}
@@ -1085,6 +1132,11 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
.swap_cluster_max = SWAP_CLUSTER_MAX,
.swappiness = vm_swappiness,
};
+ /*
+ * temp_priority is used to remember the scanning priority at which
+ * each zone was successfully refilled to free_pages == pages_high.
+ */
+ int temp_priority[MAX_NR_ZONES];
loop_again:
total_scanned = 0;
@@ -1092,11 +1144,8 @@ loop_again:
sc.may_writepage = !laptop_mode;
count_vm_event(PAGEOUTRUN);
- for (i = 0; i < pgdat->nr_zones; i++) {
- struct zone *zone = pgdat->node_zones + i;
-
- zone->temp_priority = DEF_PRIORITY;
- }
+ for (i = 0; i < pgdat->nr_zones; i++)
+ temp_priority[i] = DEF_PRIORITY;
for (priority = DEF_PRIORITY; priority >= 0; priority--) {
int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
@@ -1124,11 +1173,12 @@ loop_again:
if (!zone_watermark_ok(zone, order, zone->pages_high,
0, 0)) {
end_zone = i;
- goto scan;
+ break;
}
}
- goto out;
-scan:
+ if (i < 0)
+ goto out;
+
for (i = 0; i <= end_zone; i++) {
struct zone *zone = pgdat->node_zones + i;
@@ -1157,10 +1207,9 @@ scan:
if (!zone_watermark_ok(zone, order, zone->pages_high,
end_zone, 0))
all_zones_ok = 0;
- zone->temp_priority = priority;
- if (zone->prev_priority > priority)
- zone->prev_priority = priority;
+ temp_priority[i] = priority;
sc.nr_scanned = 0;
+ note_zone_scanning_priority(zone, priority);
nr_reclaimed += shrink_zone(priority, zone, &sc);
reclaim_state->reclaimed_slab = 0;
nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
@@ -1188,7 +1237,7 @@ scan:
* another pass across the zones.
*/
if (total_scanned && priority < DEF_PRIORITY - 2)
- blk_congestion_wait(WRITE, HZ/10);
+ congestion_wait(WRITE, HZ/10);
/*
* We do this so kswapd doesn't build up large priorities for
@@ -1200,13 +1249,21 @@ scan:
break;
}
out:
+ /*
+ * Note within each zone the priority level at which this zone was
+ * brought into a happy state, so that the next thread which scans this
+ * zone will start out at that priority level.
+ */
for (i = 0; i < pgdat->nr_zones; i++) {
struct zone *zone = pgdat->node_zones + i;
- zone->prev_priority = zone->temp_priority;
+ zone->prev_priority = temp_priority[i];
}
if (!all_zones_ok) {
cond_resched();
+
+ try_to_freeze();
+
goto loop_again;
}
@@ -1332,7 +1389,7 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int pass,
if (zone->nr_scan_active >= nr_pages || pass > 3) {
zone->nr_scan_active = 0;
nr_to_scan = min(nr_pages, zone->nr_active);
- shrink_active_list(nr_to_scan, zone, sc);
+ shrink_active_list(nr_to_scan, zone, sc, prio);
}
}
@@ -1432,7 +1489,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
goto out;
if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
- blk_congestion_wait(WRITE, HZ / 10);
+ congestion_wait(WRITE, HZ / 10);
}
lru_pages = 0;
@@ -1456,7 +1513,6 @@ out:
}
#endif
-#ifdef CONFIG_HOTPLUG_CPU
/* It's optimal to keep kswapds on the same CPUs as their memory, but
not required for correctness. So if the last cpu in a node goes
away, we get changed to run anywhere: as the first one comes back,
@@ -1477,7 +1533,6 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
}
return NOTIFY_OK;
}
-#endif /* CONFIG_HOTPLUG_CPU */
/*
* This kswapd start function will be called by init and node-hot-add.
@@ -1588,6 +1643,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
*/
priority = ZONE_RECLAIM_PRIORITY;
do {
+ note_zone_scanning_priority(zone, priority);
nr_reclaimed += shrink_zone(priority, zone, &sc);
priority--;
} while (priority >= 0 && nr_reclaimed < nr_pages);
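The blk_congestion_wait() to congestion_wait() conversions above track the move of the congestion throttling helpers out of the block layer and into mm/backing-dev.c in this series; the calling convention is unchanged. A minimal, illustrative caller, assuming the declaration lives in linux/backing-dev.h:

#include <linux/backing-dev.h>
#include <linux/fs.h>	/* for WRITE */

/* Illustration only: back off for up to 100ms while WRITE queues are
 * congested, as the converted call sites in vmscan.c do. */
static void example_backoff(void)
{
	congestion_wait(WRITE, HZ / 10);
}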
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 490d8c1a0de..dc005a0c96a 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -9,7 +9,6 @@
* Christoph Lameter <christoph@lameter.com>
*/
-#include <linux/config.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/cpu.h>
@@ -371,7 +370,7 @@ void zone_statistics(struct zonelist *zonelist, struct zone *z)
__inc_zone_state(z, NUMA_MISS);
__inc_zone_state(zonelist->zones[0], NUMA_FOREIGN);
}
- if (z->zone_pgdat == NODE_DATA(numa_node_id()))
+ if (z->node == numa_node_id())
__inc_zone_state(z, NUMA_LOCAL);
else
__inc_zone_state(z, NUMA_OTHER);
@@ -431,7 +430,7 @@ static int frag_show(struct seq_file *m, void *arg)
return 0;
}
-struct seq_operations fragmentation_op = {
+const struct seq_operations fragmentation_op = {
.start = frag_start,
.next = frag_next,
.stop = frag_stop,
@@ -453,7 +452,7 @@ struct seq_operations fragmentation_op = {
#define TEXTS_FOR_ZONES(xx) xx "_dma", TEXT_FOR_DMA32(xx) xx "_normal", \
TEXT_FOR_HIGHMEM(xx)
-static char *vmstat_text[] = {
+static const char * const vmstat_text[] = {
/* Zoned VM counters */
"nr_anon_pages",
"nr_mapped",
@@ -465,6 +464,7 @@ static char *vmstat_text[] = {
"nr_writeback",
"nr_unstable",
"nr_bounce",
+ "nr_vmscan_write",
#ifdef CONFIG_NUMA
"numa_hit",
@@ -587,11 +587,9 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
seq_printf(m,
"\n all_unreclaimable: %u"
"\n prev_priority: %i"
- "\n temp_priority: %i"
"\n start_pfn: %lu",
zone->all_unreclaimable,
zone->prev_priority,
- zone->temp_priority,
zone->zone_start_pfn);
spin_unlock_irqrestore(&zone->lock, flags);
seq_putc(m, '\n');
@@ -599,7 +597,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
return 0;
}
-struct seq_operations zoneinfo_op = {
+const struct seq_operations zoneinfo_op = {
.start = frag_start, /* iterate over all zones. The same as in
* fragmentation. */
.next = frag_next,
@@ -662,7 +660,7 @@ static void vmstat_stop(struct seq_file *m, void *arg)
m->private = NULL;
}
-struct seq_operations vmstat_op = {
+const struct seq_operations vmstat_op = {
.start = vmstat_start,
.next = vmstat_next,
.stop = vmstat_stop,
@@ -681,13 +679,13 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
void *hcpu)
{
switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_CANCELED:
- case CPU_DEAD:
- refresh_zone_stat_thresholds();
- break;
- default:
- break;
+ case CPU_UP_PREPARE:
+ case CPU_UP_CANCELED:
+ case CPU_DEAD:
+ refresh_zone_stat_thresholds();
+ break;
+ default:
+ break;
}
return NOTIFY_OK;
}
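Constifying fragmentation_op, zoneinfo_op and vmstat_op does not change how they are consumed: the /proc open routines simply pass them to seq_open(). The hookup below is a sketch with hypothetical names, and it assumes seq_open() accepts a const table in this tree:

#include <linux/fs.h>
#include <linux/seq_file.h>

/* Sketch only: how a const seq_operations table such as vmstat_op is
 * wired up to a /proc file; both names here are hypothetical. */
static int vmstat_open_example(struct inode *inode, struct file *file)
{
	return seq_open(file, &vmstat_op);
}

static const struct file_operations vmstat_example_fops = {
	.open		= vmstat_open_example,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};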