diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 12 | ||||
-rw-r--r-- | mm/backing-dev.c | 219 | ||||
-rw-r--r-- | mm/bootmem.c | 32 | ||||
-rw-r--r-- | mm/dmapool.c | 12 | ||||
-rw-r--r-- | mm/fadvise.c | 2 | ||||
-rw-r--r-- | mm/filemap.c | 10 | ||||
-rw-r--r-- | mm/filemap_xip.c | 200 | ||||
-rw-r--r-- | mm/hugetlb.c | 84 | ||||
-rw-r--r-- | mm/internal.h | 3 | ||||
-rw-r--r-- | mm/madvise.c | 2 | ||||
-rw-r--r-- | mm/memcontrol.c | 190 | ||||
-rw-r--r-- | mm/memory.c | 228 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 186 | ||||
-rw-r--r-- | mm/mempolicy.c | 1051 | ||||
-rw-r--r-- | mm/migrate.c | 9 | ||||
-rw-r--r-- | mm/mincore.c | 2 | ||||
-rw-r--r-- | mm/mmap.c | 57 | ||||
-rw-r--r-- | mm/mmzone.c | 30 | ||||
-rw-r--r-- | mm/nommu.c | 29 | ||||
-rw-r--r-- | mm/oom_kill.c | 58 | ||||
-rw-r--r-- | mm/page-writeback.c | 77 | ||||
-rw-r--r-- | mm/page_alloc.c | 321 | ||||
-rw-r--r-- | mm/pagewalk.c | 8 | ||||
-rw-r--r-- | mm/readahead.c | 8 | ||||
-rw-r--r-- | mm/rmap.c | 1 | ||||
-rw-r--r-- | mm/shmem.c | 146 | ||||
-rw-r--r-- | mm/slab.c | 31 | ||||
-rw-r--r-- | mm/slob.c | 3 | ||||
-rw-r--r-- | mm/slub.c | 513 | ||||
-rw-r--r-- | mm/sparse.c | 97 | ||||
-rw-r--r-- | mm/swap.c | 37 | ||||
-rw-r--r-- | mm/swap_state.c | 2 | ||||
-rw-r--r-- | mm/swapfile.c | 14 | ||||
-rw-r--r-- | mm/truncate.c | 11 | ||||
-rw-r--r-- | mm/vmalloc.c | 144 | ||||
-rw-r--r-- | mm/vmscan.c | 57 | ||||
-rw-r--r-- | mm/vmstat.c | 16 |
37 files changed, 2563 insertions, 1339 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 0016ebd4dcb..3aa819d628c 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -143,6 +143,18 @@ config MEMORY_HOTREMOVE depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE depends on MIGRATION +# +# If we have space for more page flags then we can enable additional +# optimizations and functionality. +# +# Regular Sparsemem takes page flag bits for the sectionid if it does not +# use a virtual memmap. Disable extended page flags for 32 bit platforms +# that require the use of a sectionid in the page flags. +# +config PAGEFLAGS_EXTENDED + def_bool y + depends on 64BIT || SPARSEMEM_VMEMMAP || !NUMA || !SPARSEMEM + # Heavily threaded applications may benefit from splitting the mm-wide # page_table_lock, so that faults on different parts of the user address # space can be handled with less contention: split it at this NR_CPUS. diff --git a/mm/backing-dev.c b/mm/backing-dev.c index e8644b1e552..7c4f9e09709 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -4,12 +4,229 @@ #include <linux/fs.h> #include <linux/sched.h> #include <linux/module.h> +#include <linux/writeback.h> +#include <linux/device.h> + + +static struct class *bdi_class; + +#ifdef CONFIG_DEBUG_FS +#include <linux/debugfs.h> +#include <linux/seq_file.h> + +static struct dentry *bdi_debug_root; + +static void bdi_debug_init(void) +{ + bdi_debug_root = debugfs_create_dir("bdi", NULL); +} + +static int bdi_debug_stats_show(struct seq_file *m, void *v) +{ + struct backing_dev_info *bdi = m->private; + long background_thresh; + long dirty_thresh; + long bdi_thresh; + + get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); + +#define K(x) ((x) << (PAGE_SHIFT - 10)) + seq_printf(m, + "BdiWriteback: %8lu kB\n" + "BdiReclaimable: %8lu kB\n" + "BdiDirtyThresh: %8lu kB\n" + "DirtyThresh: %8lu kB\n" + "BackgroundThresh: %8lu kB\n", + (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), + (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), + K(bdi_thresh), + K(dirty_thresh), + K(background_thresh)); +#undef K + + return 0; +} + +static int bdi_debug_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, bdi_debug_stats_show, inode->i_private); +} + +static const struct file_operations bdi_debug_stats_fops = { + .open = bdi_debug_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void bdi_debug_register(struct backing_dev_info *bdi, const char *name) +{ + bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root); + bdi->debug_stats = debugfs_create_file("stats", 0444, bdi->debug_dir, + bdi, &bdi_debug_stats_fops); +} + +static void bdi_debug_unregister(struct backing_dev_info *bdi) +{ + debugfs_remove(bdi->debug_stats); + debugfs_remove(bdi->debug_dir); +} +#else +static inline void bdi_debug_init(void) +{ +} +static inline void bdi_debug_register(struct backing_dev_info *bdi, + const char *name) +{ +} +static inline void bdi_debug_unregister(struct backing_dev_info *bdi) +{ +} +#endif + +static ssize_t read_ahead_kb_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + char *end; + unsigned long read_ahead_kb; + ssize_t ret = -EINVAL; + + read_ahead_kb = simple_strtoul(buf, &end, 10); + if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) { + bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10); + ret = count; + } + return ret; +} + +#define K(pages) ((pages) << (PAGE_SHIFT - 10)) + +#define BDI_SHOW(name, expr) \ +static ssize_t name##_show(struct device *dev, \ + struct device_attribute *attr, char *page) \ +{ \ + struct backing_dev_info *bdi = dev_get_drvdata(dev); \ + \ + return snprintf(page, PAGE_SIZE-1, "%lld\n", (long long)expr); \ +} + +BDI_SHOW(read_ahead_kb, K(bdi->ra_pages)) + +static ssize_t min_ratio_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + char *end; + unsigned int ratio; + ssize_t ret = -EINVAL; + + ratio = simple_strtoul(buf, &end, 10); + if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) { + ret = bdi_set_min_ratio(bdi, ratio); + if (!ret) + ret = count; + } + return ret; +} +BDI_SHOW(min_ratio, bdi->min_ratio) + +static ssize_t max_ratio_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + char *end; + unsigned int ratio; + ssize_t ret = -EINVAL; + + ratio = simple_strtoul(buf, &end, 10); + if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) { + ret = bdi_set_max_ratio(bdi, ratio); + if (!ret) + ret = count; + } + return ret; +} +BDI_SHOW(max_ratio, bdi->max_ratio) + +#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store) + +static struct device_attribute bdi_dev_attrs[] = { + __ATTR_RW(read_ahead_kb), + __ATTR_RW(min_ratio), + __ATTR_RW(max_ratio), + __ATTR_NULL, +}; + +static __init int bdi_class_init(void) +{ + bdi_class = class_create(THIS_MODULE, "bdi"); + bdi_class->dev_attrs = bdi_dev_attrs; + bdi_debug_init(); + return 0; +} + +postcore_initcall(bdi_class_init); + +int bdi_register(struct backing_dev_info *bdi, struct device *parent, + const char *fmt, ...) +{ + char *name; + va_list args; + int ret = 0; + struct device *dev; + + va_start(args, fmt); + name = kvasprintf(GFP_KERNEL, fmt, args); + va_end(args); + + if (!name) + return -ENOMEM; + + dev = device_create(bdi_class, parent, MKDEV(0, 0), name); + if (IS_ERR(dev)) { + ret = PTR_ERR(dev); + goto exit; + } + + bdi->dev = dev; + dev_set_drvdata(bdi->dev, bdi); + bdi_debug_register(bdi, name); + +exit: + kfree(name); + return ret; +} +EXPORT_SYMBOL(bdi_register); + +int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev) +{ + return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev)); +} +EXPORT_SYMBOL(bdi_register_dev); + +void bdi_unregister(struct backing_dev_info *bdi) +{ + if (bdi->dev) { + bdi_debug_unregister(bdi); + device_unregister(bdi->dev); + bdi->dev = NULL; + } +} +EXPORT_SYMBOL(bdi_unregister); int bdi_init(struct backing_dev_info *bdi) { int i; int err; + bdi->dev = NULL; + + bdi->min_ratio = 0; + bdi->max_ratio = 100; + bdi->max_prop_frac = PROP_FRAC_BASE; + for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { err = percpu_counter_init_irq(&bdi->bdi_stat[i], 0); if (err) @@ -33,6 +250,8 @@ void bdi_destroy(struct backing_dev_info *bdi) { int i; + bdi_unregister(bdi); + for (i = 0; i < NR_BDI_STAT_ITEMS; i++) percpu_counter_destroy(&bdi->bdi_stat[i]); diff --git a/mm/bootmem.c b/mm/bootmem.c index b6791646143..e8fb927392b 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -461,6 +461,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) { + register_page_bootmem_info_node(pgdat); return free_all_bootmem_core(pgdat); } @@ -544,6 +545,37 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, return __alloc_bootmem(size, align, goal); } +#ifdef CONFIG_SPARSEMEM +void * __init alloc_bootmem_section(unsigned long size, + unsigned long section_nr) +{ + void *ptr; + unsigned long limit, goal, start_nr, end_nr, pfn; + struct pglist_data *pgdat; + + pfn = section_nr_to_pfn(section_nr); + goal = PFN_PHYS(pfn); + limit = PFN_PHYS(section_nr_to_pfn(section_nr + 1)) - 1; + pgdat = NODE_DATA(early_pfn_to_nid(pfn)); + ptr = __alloc_bootmem_core(pgdat->bdata, size, SMP_CACHE_BYTES, goal, + limit); + + if (!ptr) + return NULL; + + start_nr = pfn_to_section_nr(PFN_DOWN(__pa(ptr))); + end_nr = pfn_to_section_nr(PFN_DOWN(__pa(ptr) + size)); + if (start_nr != section_nr || end_nr != section_nr) { + printk(KERN_WARNING "alloc_bootmem failed on section %ld.\n", + section_nr); + free_bootmem_core(pgdat->bdata, __pa(ptr), size); + ptr = NULL; + } + + return ptr; +} +#endif + #ifndef ARCH_LOW_ADDRESS_LIMIT #define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL #endif diff --git a/mm/dmapool.c b/mm/dmapool.c index 34aaac451a9..b1f0885dda2 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -37,6 +37,10 @@ #include <linux/types.h> #include <linux/wait.h> +#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON) +#define DMAPOOL_DEBUG 1 +#endif + struct dma_pool { /* the pool */ struct list_head page_list; spinlock_t lock; @@ -216,7 +220,7 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags) page->vaddr = dma_alloc_coherent(pool->dev, pool->allocation, &page->dma, mem_flags); if (page->vaddr) { -#ifdef CONFIG_DEBUG_SLAB +#ifdef DMAPOOL_DEBUG memset(page->vaddr, POOL_POISON_FREED, pool->allocation); #endif pool_initialise_page(pool, page); @@ -239,7 +243,7 @@ static void pool_free_page(struct dma_pool *pool, struct dma_page *page) { dma_addr_t dma = page->dma; -#ifdef CONFIG_DEBUG_SLAB +#ifdef DMAPOOL_DEBUG memset(page->vaddr, POOL_POISON_FREED, pool->allocation); #endif dma_free_coherent(pool->dev, pool->allocation, page->vaddr, dma); @@ -336,7 +340,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, page->offset = *(int *)(page->vaddr + offset); retval = offset + page->vaddr; *handle = offset + page->dma; -#ifdef CONFIG_DEBUG_SLAB +#ifdef DMAPOOL_DEBUG memset(retval, POOL_POISON_ALLOCATED, pool->size); #endif done: @@ -391,7 +395,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) } offset = vaddr - page->vaddr; -#ifdef CONFIG_DEBUG_SLAB +#ifdef DMAPOOL_DEBUG if ((dma - page->dma) != offset) { if (pool->dev) dev_err(pool->dev, diff --git a/mm/fadvise.c b/mm/fadvise.c index 3c0f1e99f5e..343cfdfebd9 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -49,7 +49,7 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) goto out; } - if (mapping->a_ops->get_xip_page) { + if (mapping->a_ops->get_xip_mem) { switch (advice) { case POSIX_FADV_NORMAL: case POSIX_FADV_RANDOM: diff --git a/mm/filemap.c b/mm/filemap.c index 07e9d9258b4..239d36163bb 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -576,10 +576,12 @@ EXPORT_SYMBOL(unlock_page); */ void end_page_writeback(struct page *page) { - if (!TestClearPageReclaim(page) || rotate_reclaimable_page(page)) { - if (!test_clear_page_writeback(page)) - BUG(); - } + if (TestClearPageReclaim(page)) + rotate_reclaimable_page(page); + + if (!test_clear_page_writeback(page)) + BUG(); + smp_mb__after_clear_bit(); wake_up_page(page, PG_writeback); } diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 5e598c42afd..3e744abcce9 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -15,6 +15,7 @@ #include <linux/rmap.h> #include <linux/sched.h> #include <asm/tlbflush.h> +#include <asm/io.h> /* * We do use our own empty page to avoid interference with other users @@ -42,37 +43,41 @@ static struct page *xip_sparse_page(void) /* * This is a file read routine for execute in place files, and uses - * the mapping->a_ops->get_xip_page() function for the actual low-level + * the mapping->a_ops->get_xip_mem() function for the actual low-level * stuff. * * Note the struct file* is not used at all. It may be NULL. */ -static void +static ssize_t do_xip_mapping_read(struct address_space *mapping, struct file_ra_state *_ra, struct file *filp, - loff_t *ppos, - read_descriptor_t *desc, - read_actor_t actor) + char __user *buf, + size_t len, + loff_t *ppos) { struct inode *inode = mapping->host; pgoff_t index, end_index; unsigned long offset; - loff_t isize; + loff_t isize, pos; + size_t copied = 0, error = 0; - BUG_ON(!mapping->a_ops->get_xip_page); + BUG_ON(!mapping->a_ops->get_xip_mem); - index = *ppos >> PAGE_CACHE_SHIFT; - offset = *ppos & ~PAGE_CACHE_MASK; + pos = *ppos; + index = pos >> PAGE_CACHE_SHIFT; + offset = pos & ~PAGE_CACHE_MASK; isize = i_size_read(inode); if (!isize) goto out; end_index = (isize - 1) >> PAGE_CACHE_SHIFT; - for (;;) { - struct page *page; - unsigned long nr, ret; + do { + unsigned long nr, left; + void *xip_mem; + unsigned long xip_pfn; + int zero = 0; /* nr is the maximum number of bytes to copy from this page */ nr = PAGE_CACHE_SIZE; @@ -85,19 +90,17 @@ do_xip_mapping_read(struct address_space *mapping, } } nr = nr - offset; + if (nr > len) + nr = len; - page = mapping->a_ops->get_xip_page(mapping, - index*(PAGE_SIZE/512), 0); - if (!page) - goto no_xip_page; - if (unlikely(IS_ERR(page))) { - if (PTR_ERR(page) == -ENODATA) { + error = mapping->a_ops->get_xip_mem(mapping, index, 0, + &xip_mem, &xip_pfn); + if (unlikely(error)) { + if (error == -ENODATA) { /* sparse */ - page = ZERO_PAGE(0); - } else { - desc->error = PTR_ERR(page); + zero = 1; + } else goto out; - } } /* If users can be writing to this page using arbitrary @@ -105,10 +108,10 @@ do_xip_mapping_read(struct address_space *mapping, * before reading the page on the kernel side. */ if (mapping_writably_mapped(mapping)) - flush_dcache_page(page); + /* address based flush */ ; /* - * Ok, we have the page, so now we can copy it to user space... + * Ok, we have the mem, so now we can copy it to user space... * * The actor routine returns how many bytes were actually used.. * NOTE! This may not be the same as how much of a user buffer @@ -116,47 +119,38 @@ do_xip_mapping_read(struct address_space *mapping, * "pos" here (the actor routine has to update the user buffer * pointers and the remaining count). */ - ret = actor(desc, page, offset, nr); - offset += ret; - index += offset >> PAGE_CACHE_SHIFT; - offset &= ~PAGE_CACHE_MASK; + if (!zero) + left = __copy_to_user(buf+copied, xip_mem+offset, nr); + else + left = __clear_user(buf + copied, nr); - if (ret == nr && desc->count) - continue; - goto out; + if (left) { + error = -EFAULT; + goto out; + } -no_xip_page: - /* Did not get the page. Report it */ - desc->error = -EIO; - goto out; - } + copied += (nr - left); + offset += (nr - left); + index += offset >> PAGE_CACHE_SHIFT; + offset &= ~PAGE_CACHE_MASK; + } while (copied < len); out: - *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; + *ppos = pos + copied; if (filp) file_accessed(filp); + + return (copied ? copied : error); } ssize_t xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) { - read_descriptor_t desc; - if (!access_ok(VERIFY_WRITE, buf, len)) return -EFAULT; - desc.written = 0; - desc.arg.buf = buf; - desc.count = len; - desc.error = 0; - - do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp, - ppos, &desc, file_read_actor); - - if (desc.written) - return desc.written; - else - return desc.error; + return do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp, + buf, len, ppos); } EXPORT_SYMBOL_GPL(xip_file_read); @@ -211,13 +205,16 @@ __xip_unmap (struct address_space * mapping, * * This function is derived from filemap_fault, but used for execute in place */ -static int xip_file_fault(struct vm_area_struct *area, struct vm_fault *vmf) +static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { - struct file *file = area->vm_file; + struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - struct page *page; pgoff_t size; + void *xip_mem; + unsigned long xip_pfn; + struct page *page; + int error; /* XXX: are VM_FAULT_ codes OK? */ @@ -225,35 +222,44 @@ static int xip_file_fault(struct vm_area_struct *area, struct vm_fault *vmf) if (vmf->pgoff >= size) return VM_FAULT_SIGBUS; - page = mapping->a_ops->get_xip_page(mapping, - vmf->pgoff*(PAGE_SIZE/512), 0); - if (!IS_ERR(page)) - goto out; - if (PTR_ERR(page) != -ENODATA) + error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0, + &xip_mem, &xip_pfn); + if (likely(!error)) + goto found; + if (error != -ENODATA) return VM_FAULT_OOM; /* sparse block */ - if ((area->vm_flags & (VM_WRITE | VM_MAYWRITE)) && - (area->vm_flags & (VM_SHARED| VM_MAYSHARE)) && + if ((vma->vm_flags & (VM_WRITE | VM_MAYWRITE)) && + (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) && (!(mapping->host->i_sb->s_flags & MS_RDONLY))) { + int err; + /* maybe shared writable, allocate new block */ - page = mapping->a_ops->get_xip_page(mapping, - vmf->pgoff*(PAGE_SIZE/512), 1); - if (IS_ERR(page)) + error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1, + &xip_mem, &xip_pfn); + if (error) return VM_FAULT_SIGBUS; - /* unmap page at pgoff from all other vmas */ + /* unmap sparse mappings at pgoff from all other vmas */ __xip_unmap(mapping, vmf->pgoff); + +found: + err = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, + xip_pfn); + if (err == -ENOMEM) + return VM_FAULT_OOM; + BUG_ON(err); + return VM_FAULT_NOPAGE; } else { /* not shared and writable, use xip_sparse_page() */ page = xip_sparse_page(); if (!page) return VM_FAULT_OOM; - } -out: - page_cache_get(page); - vmf->page = page; - return 0; + page_cache_get(page); + vmf->page = page; + return 0; + } } static struct vm_operations_struct xip_file_vm_ops = { @@ -262,11 +268,11 @@ static struct vm_operations_struct xip_file_vm_ops = { int xip_file_mmap(struct file * file, struct vm_area_struct * vma) { - BUG_ON(!file->f_mapping->a_ops->get_xip_page); + BUG_ON(!file->f_mapping->a_ops->get_xip_mem); file_accessed(file); vma->vm_ops = &xip_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; + vma->vm_flags |= VM_CAN_NONLINEAR | VM_MIXEDMAP; return 0; } EXPORT_SYMBOL_GPL(xip_file_mmap); @@ -279,17 +285,17 @@ __xip_file_write(struct file *filp, const char __user *buf, const struct address_space_operations *a_ops = mapping->a_ops; struct inode *inode = mapping->host; long status = 0; - struct page *page; size_t bytes; ssize_t written = 0; - BUG_ON(!mapping->a_ops->get_xip_page); + BUG_ON(!mapping->a_ops->get_xip_mem); do { unsigned long index; unsigned long offset; size_t copied; - char *kaddr; + void *xip_mem; + unsigned long xip_pfn; offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ index = pos >> PAGE_CACHE_SHIFT; @@ -297,28 +303,22 @@ __xip_file_write(struct file *filp, const char __user *buf, if (bytes > count) bytes = count; - page = a_ops->get_xip_page(mapping, - index*(PAGE_SIZE/512), 0); - if (IS_ERR(page) && (PTR_ERR(page) == -ENODATA)) { + status = a_ops->get_xip_mem(mapping, index, 0, + &xip_mem, &xip_pfn); + if (status == -ENODATA) { /* we allocate a new page unmap it */ - page = a_ops->get_xip_page(mapping, - index*(PAGE_SIZE/512), 1); - if (!IS_ERR(page)) + status = a_ops->get_xip_mem(mapping, index, 1, + &xip_mem, &xip_pfn); + if (!status) /* unmap page at pgoff from all other vmas */ __xip_unmap(mapping, index); } - if (IS_ERR(page)) { - status = PTR_ERR(page); + if (status) break; - } - fault_in_pages_readable(buf, bytes); - kaddr = kmap_atomic(page, KM_USER0); copied = bytes - - __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes); - kunmap_atomic(kaddr, KM_USER0); - flush_dcache_page(page); + __copy_from_user_nocache(xip_mem + offset, buf, bytes); if (likely(copied > 0)) { status = copied; @@ -398,7 +398,7 @@ EXPORT_SYMBOL_GPL(xip_file_write); /* * truncate a page used for execute in place - * functionality is analog to block_truncate_page but does use get_xip_page + * functionality is analog to block_truncate_page but does use get_xip_mem * to get the page instead of page cache */ int @@ -408,9 +408,11 @@ xip_truncate_page(struct address_space *mapping, loff_t from) unsigned offset = from & (PAGE_CACHE_SIZE-1); unsigned blocksize; unsigned length; - struct page *page; + void *xip_mem; + unsigned long xip_pfn; + int err; - BUG_ON(!mapping->a_ops->get_xip_page); + BUG_ON(!mapping->a_ops->get_xip_mem); blocksize = 1 << mapping->host->i_blkbits; length = offset & (blocksize - 1); @@ -421,18 +423,16 @@ xip_truncate_page(struct address_space *mapping, loff_t from) length = blocksize - length; - page = mapping->a_ops->get_xip_page(mapping, - index*(PAGE_SIZE/512), 0); - if (!page) - return -ENOMEM; - if (unlikely(IS_ERR(page))) { - if (PTR_ERR(page) == -ENODATA) + err = mapping->a_ops->get_xip_mem(mapping, index, 0, + &xip_mem, &xip_pfn); + if (unlikely(err)) { + if (err == -ENODATA) /* Hole? No need to truncate */ return 0; else - return PTR_ERR(page); + return err; } - zero_user(page, offset, length); + memset(xip_mem + offset, 0, length); return 0; } EXPORT_SYMBOL_GPL(xip_truncate_page); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 51c9e2c0164..bbf953eeb58 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -95,13 +95,16 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, int nid; struct page *page = NULL; struct mempolicy *mpol; + nodemask_t *nodemask; struct zonelist *zonelist = huge_zonelist(vma, address, - htlb_alloc_mask, &mpol); - struct zone **z; - - for (z = zonelist->zones; *z; z++) { - nid = zone_to_nid(*z); - if (cpuset_zone_allowed_softwall(*z, htlb_alloc_mask) && + htlb_alloc_mask, &mpol, &nodemask); + struct zone *zone; + struct zoneref *z; + + for_each_zone_zonelist_nodemask(zone, z, zonelist, + MAX_NR_ZONES - 1, nodemask) { + nid = zone_to_nid(zone); + if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && !list_empty(&hugepage_freelists[nid])) { page = list_entry(hugepage_freelists[nid].next, struct page, lru); @@ -113,7 +116,7 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, break; } } - mpol_free(mpol); /* unref if mpol !NULL */ + mpol_cond_put(mpol); return page; } @@ -129,6 +132,7 @@ static void update_and_free_page(struct page *page) } set_compound_page_dtor(page, NULL); set_page_refcounted(page); + arch_release_hugepage(page); __free_pages(page, HUGETLB_PAGE_ORDER); } @@ -195,9 +199,14 @@ static struct page *alloc_fresh_huge_page_node(int nid) struct page *page; page = alloc_pages_node(nid, - htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|__GFP_NOWARN, + htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| + __GFP_REPEAT|__GFP_NOWARN, HUGETLB_PAGE_ORDER); if (page) { + if (arch_prepare_hugepage(page)) { + __free_pages(page, HUGETLB_PAGE_ORDER); + return NULL; + } set_compound_page_dtor(page, free_huge_page); spin_lock(&hugetlb_lock); nr_huge_pages++; @@ -239,6 +248,11 @@ static int alloc_fresh_huge_page(void) hugetlb_next_nid = next_nid; } while (!page && hugetlb_next_nid != start_nid); + if (ret) + count_vm_event(HTLB_BUDDY_PGALLOC); + else + count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); + return ret; } @@ -281,7 +295,8 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, } spin_unlock(&hugetlb_lock); - page = alloc_pages(htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN, + page = alloc_pages(htlb_alloc_mask|__GFP_COMP| + __GFP_REPEAT|__GFP_NOWARN, HUGETLB_PAGE_ORDER); spin_lock(&hugetlb_lock); @@ -299,9 +314,11 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, */ nr_huge_pages_node[nid]++; surplus_huge_pages_node[nid]++; + __count_vm_event(HTLB_BUDDY_PGALLOC); } else { nr_huge_pages--; surplus_huge_pages--; + __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); } spin_unlock(&hugetlb_lock); @@ -369,11 +386,19 @@ retry: resv_huge_pages += delta; ret = 0; free: + /* Free the needed pages to the hugetlb pool */ list_for_each_entry_safe(page, tmp, &surplus_list, lru) { + if ((--needed) < 0) + break; list_del(&page->lru); - if ((--needed) >= 0) - enqueue_huge_page(page); - else { + enqueue_huge_page(page); + } + + /* Free unnecessary surplus pages to the buddy allocator */ + if (!list_empty(&surplus_list)) { + spin_unlock(&hugetlb_lock); + list_for_each_entry_safe(page, tmp, &surplus_list, lru) { + list_del(&page->lru); /* * The page has a reference count of zero already, so * call free_huge_page directly instead of using @@ -381,10 +406,9 @@ free: * unlocked which is safe because free_huge_page takes * hugetlb_lock before deciding how to free the page. */ - spin_unlock(&hugetlb_lock); free_huge_page(page); - spin_lock(&hugetlb_lock); } + spin_lock(&hugetlb_lock); } return ret; @@ -718,7 +742,7 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); } else { - entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot)); + entry = huge_pte_wrprotect(mk_pte(page, vma->vm_page_prot)); } entry = pte_mkyoung(entry); entry = pte_mkhuge(entry); @@ -731,8 +755,8 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, { pte_t entry; - entry = pte_mkwrite(pte_mkdirty(*ptep)); - if (ptep_set_access_flags(vma, address, ptep, entry, 1)) { + entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); + if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) { update_mmu_cache(vma, address, entry); } } @@ -762,10 +786,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, spin_lock(&dst->page_table_lock); spin_lock(&src->page_table_lock); - if (!pte_none(*src_pte)) { + if (!huge_pte_none(huge_ptep_get(src_pte))) { if (cow) - ptep_set_wrprotect(src, addr, src_pte); - entry = *src_pte; + huge_ptep_set_wrprotect(src, addr, src_pte); + entry = huge_ptep_get(src_pte); ptepage = pte_page(entry); get_page(ptepage); set_huge_pte_at(dst, addr, dst_pte, entry); @@ -809,7 +833,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, continue; pte = huge_ptep_get_and_clear(mm, address, ptep); - if (pte_none(pte)) + if (huge_pte_none(pte)) continue; page = pte_page(pte); @@ -873,8 +897,9 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, spin_lock(&mm->page_table_lock); ptep = huge_pte_offset(mm, address & HPAGE_MASK); - if (likely(pte_same(*ptep, pte))) { + if (likely(pte_same(huge_ptep_get(ptep), pte))) { /* Break COW */ + huge_ptep_clear_flush(vma, address, ptep); set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, new_page, 1)); /* Make the old page be freed below */ @@ -942,7 +967,7 @@ retry: goto backout; ret = 0; - if (!pte_none(*ptep)) + if (!huge_pte_none(huge_ptep_get(ptep))) goto backout; new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) @@ -984,8 +1009,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * the same page in the page cache. */ mutex_lock(&hugetlb_instantiation_mutex); - entry = *ptep; - if (pte_none(entry)) { + entry = huge_ptep_get(ptep); + if (huge_pte_none(entry)) { ret = hugetlb_no_page(mm, vma, address, ptep, write_access); mutex_unlock(&hugetlb_instantiation_mutex); return ret; @@ -995,7 +1020,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, spin_lock(&mm->page_table_lock); /* Check for a racing update before calling hugetlb_cow */ - if (likely(pte_same(entry, *ptep))) + if (likely(pte_same(entry, huge_ptep_get(ptep)))) if (write_access && !pte_write(entry)) ret = hugetlb_cow(mm, vma, address, ptep, entry); spin_unlock(&mm->page_table_lock); @@ -1025,7 +1050,8 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, */ pte = huge_pte_offset(mm, vaddr & HPAGE_MASK); - if (!pte || pte_none(*pte) || (write && !pte_write(*pte))) { + if (!pte || huge_pte_none(huge_ptep_get(pte)) || + (write && !pte_write(huge_ptep_get(pte)))) { int ret; spin_unlock(&mm->page_table_lock); @@ -1041,7 +1067,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, } pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT; - page = pte_page(*pte); + page = pte_page(huge_ptep_get(pte)); same_page: if (pages) { get_page(page); @@ -1090,7 +1116,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma, continue; if (huge_pmd_unshare(mm, &address, ptep)) continue; - if (!pte_none(*ptep)) { + if (!huge_pte_none(huge_ptep_get(ptep))) { pte = huge_ptep_get_and_clear(mm, address, ptep); pte = pte_mkhuge(pte_modify(pte, newprot)); set_huge_pte_at(mm, address, ptep, pte); diff --git a/mm/internal.h b/mm/internal.h index 789727309f4..0034e947e4b 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -34,8 +34,7 @@ static inline void __put_page(struct page *page) atomic_dec(&page->_count); } -extern void __init __free_pages_bootmem(struct page *page, - unsigned int order); +extern void __free_pages_bootmem(struct page *page, unsigned int order); /* * function for dealing with page's order in buddy system. diff --git a/mm/madvise.c b/mm/madvise.c index 93ee375b38e..23a0ec3e0ea 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -112,7 +112,7 @@ static long madvise_willneed(struct vm_area_struct * vma, if (!file) return -EBADF; - if (file->f_mapping->a_ops->get_xip_page) { + if (file->f_mapping->a_ops->get_xip_mem) { /* no bad return value, but ignore advice */ return 0; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2e0bfc93484..e46451e1d9b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -26,15 +26,18 @@ #include <linux/backing-dev.h> #include <linux/bit_spinlock.h> #include <linux/rcupdate.h> +#include <linux/slab.h> #include <linux/swap.h> #include <linux/spinlock.h> #include <linux/fs.h> #include <linux/seq_file.h> +#include <linux/vmalloc.h> #include <asm/uaccess.h> struct cgroup_subsys mem_cgroup_subsys; static const int MEM_CGROUP_RECLAIM_RETRIES = 5; +static struct kmem_cache *page_cgroup_cache; /* * Statistics for memory cgroup. @@ -45,6 +48,8 @@ enum mem_cgroup_stat_index { */ MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ MEM_CGROUP_STAT_RSS, /* # of pages charged as rss */ + MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ + MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ MEM_CGROUP_STAT_NSTATS, }; @@ -196,6 +201,13 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags, __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_CACHE, val); else __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val); + + if (charge) + __mem_cgroup_stat_add_safe(stat, + MEM_CGROUP_STAT_PGPGIN_COUNT, 1); + else + __mem_cgroup_stat_add_safe(stat, + MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); } static struct mem_cgroup_per_zone * @@ -236,26 +248,12 @@ static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) css); } -static struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) +struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) { return container_of(task_subsys_state(p, mem_cgroup_subsys_id), struct mem_cgroup, css); } -void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p) -{ - struct mem_cgroup *mem; - - mem = mem_cgroup_from_task(p); - css_get(&mem->css); - mm->mem_cgroup = mem; -} - -void mm_free_cgroup(struct mm_struct *mm) -{ - css_put(&mm->mem_cgroup->css); -} - static inline int page_cgroup_locked(struct page *page) { return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); @@ -287,10 +285,10 @@ static void unlock_page_cgroup(struct page *page) bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); } -static void __mem_cgroup_remove_list(struct page_cgroup *pc) +static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz, + struct page_cgroup *pc) { int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; - struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); if (from) MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1; @@ -301,10 +299,10 @@ static void __mem_cgroup_remove_list(struct page_cgroup *pc) list_del_init(&pc->lru); } -static void __mem_cgroup_add_list(struct page_cgroup *pc) +static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, + struct page_cgroup *pc) { int to = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; - struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); if (!to) { MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1; @@ -476,6 +474,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, int zid = zone_idx(z); struct mem_cgroup_per_zone *mz; + BUG_ON(!mem_cont); mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); if (active) src = &mz->active_list; @@ -560,7 +559,7 @@ retry: } unlock_page_cgroup(page); - pc = kzalloc(sizeof(struct page_cgroup), gfp_mask); + pc = kmem_cache_zalloc(page_cgroup_cache, gfp_mask); if (pc == NULL) goto err; @@ -574,7 +573,7 @@ retry: mm = &init_mm; rcu_read_lock(); - mem = rcu_dereference(mm->mem_cgroup); + mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); /* * For every charge from the cgroup, increment reference count */ @@ -602,7 +601,6 @@ retry: mem_cgroup_out_of_memory(mem, gfp_mask); goto out; } - congestion_wait(WRITE, HZ/10); } pc->ref_cnt = 1; @@ -610,7 +608,7 @@ retry: pc->page = page; pc->flags = PAGE_CGROUP_FLAG_ACTIVE; if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) - pc->flags |= PAGE_CGROUP_FLAG_CACHE; + pc->flags = PAGE_CGROUP_FLAG_CACHE; lock_page_cgroup(page); if (page_get_page_cgroup(page)) { @@ -622,14 +620,14 @@ retry: */ res_counter_uncharge(&mem->res, PAGE_SIZE); css_put(&mem->css); - kfree(pc); + kmem_cache_free(page_cgroup_cache, pc); goto retry; } page_assign_page_cgroup(page, pc); mz = page_cgroup_zoneinfo(pc); spin_lock_irqsave(&mz->lru_lock, flags); - __mem_cgroup_add_list(pc); + __mem_cgroup_add_list(mz, pc); spin_unlock_irqrestore(&mz->lru_lock, flags); unlock_page_cgroup(page); @@ -637,7 +635,7 @@ done: return 0; out: css_put(&mem->css); - kfree(pc); + kmem_cache_free(page_cgroup_cache, pc); err: return -ENOMEM; } @@ -685,7 +683,7 @@ void mem_cgroup_uncharge_page(struct page *page) if (--(pc->ref_cnt) == 0) { mz = page_cgroup_zoneinfo(pc); spin_lock_irqsave(&mz->lru_lock, flags); - __mem_cgroup_remove_list(pc); + __mem_cgroup_remove_list(mz, pc); spin_unlock_irqrestore(&mz->lru_lock, flags); page_assign_page_cgroup(page, NULL); @@ -695,7 +693,7 @@ void mem_cgroup_uncharge_page(struct page *page) res_counter_uncharge(&mem->res, PAGE_SIZE); css_put(&mem->css); - kfree(pc); + kmem_cache_free(page_cgroup_cache, pc); return; } @@ -747,7 +745,7 @@ void mem_cgroup_page_migration(struct page *page, struct page *newpage) mz = page_cgroup_zoneinfo(pc); spin_lock_irqsave(&mz->lru_lock, flags); - __mem_cgroup_remove_list(pc); + __mem_cgroup_remove_list(mz, pc); spin_unlock_irqrestore(&mz->lru_lock, flags); page_assign_page_cgroup(page, NULL); @@ -759,7 +757,7 @@ void mem_cgroup_page_migration(struct page *page, struct page *newpage) mz = page_cgroup_zoneinfo(pc); spin_lock_irqsave(&mz->lru_lock, flags); - __mem_cgroup_add_list(pc); + __mem_cgroup_add_list(mz, pc); spin_unlock_irqrestore(&mz->lru_lock, flags); unlock_page_cgroup(newpage); @@ -853,13 +851,10 @@ static int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp) return 0; } -static ssize_t mem_cgroup_read(struct cgroup *cont, - struct cftype *cft, struct file *file, - char __user *userbuf, size_t nbytes, loff_t *ppos) +static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) { - return res_counter_read(&mem_cgroup_from_cont(cont)->res, - cft->private, userbuf, nbytes, ppos, - NULL); + return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res, + cft->private); } static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft, @@ -871,27 +866,25 @@ static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft, mem_cgroup_write_strategy); } -static ssize_t mem_force_empty_write(struct cgroup *cont, - struct cftype *cft, struct file *file, - const char __user *userbuf, - size_t nbytes, loff_t *ppos) +static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) { - struct mem_cgroup *mem = mem_cgroup_from_cont(cont); - int ret = mem_cgroup_force_empty(mem); - if (!ret) - ret = nbytes; - return ret; + struct mem_cgroup *mem; + + mem = mem_cgroup_from_cont(cont); + switch (event) { + case RES_MAX_USAGE: + res_counter_reset_max(&mem->res); + break; + case RES_FAILCNT: + res_counter_reset_failcnt(&mem->res); + break; + } + return 0; } -/* - * Note: This should be removed if cgroup supports write-only file. - */ -static ssize_t mem_force_empty_read(struct cgroup *cont, - struct cftype *cft, - struct file *file, char __user *userbuf, - size_t nbytes, loff_t *ppos) +static int mem_force_empty_write(struct cgroup *cont, unsigned int event) { - return -EINVAL; + return mem_cgroup_force_empty(mem_cgroup_from_cont(cont)); } static const struct mem_cgroup_stat_desc { @@ -900,11 +893,13 @@ static const struct mem_cgroup_stat_desc { } mem_cgroup_stat_desc[] = { [MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, }, [MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, }, + [MEM_CGROUP_STAT_PGPGIN_COUNT] = {"pgpgin", 1, }, + [MEM_CGROUP_STAT_PGPGOUT_COUNT] = {"pgpgout", 1, }, }; -static int mem_control_stat_show(struct seq_file *m, void *arg) +static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, + struct cgroup_map_cb *cb) { - struct cgroup *cont = m->private; struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); struct mem_cgroup_stat *stat = &mem_cont->stat; int i; @@ -914,8 +909,7 @@ static int mem_control_stat_show(struct seq_file *m, void *arg) val = mem_cgroup_read_stat(stat, i); val *= mem_cgroup_stat_desc[i].unit; - seq_printf(m, "%s %lld\n", mem_cgroup_stat_desc[i].msg, - (long long)val); + cb->fill(cb, mem_cgroup_stat_desc[i].msg, val); } /* showing # of active pages */ { @@ -925,52 +919,43 @@ static int mem_control_stat_show(struct seq_file *m, void *arg) MEM_CGROUP_ZSTAT_INACTIVE); active = mem_cgroup_get_all_zonestat(mem_cont, MEM_CGROUP_ZSTAT_ACTIVE); - seq_printf(m, "active %ld\n", (active) * PAGE_SIZE); - seq_printf(m, "inactive %ld\n", (inactive) * PAGE_SIZE); + cb->fill(cb, "active", (active) * PAGE_SIZE); + cb->fill(cb, "inactive", (inactive) * PAGE_SIZE); } return 0; } -static const struct file_operations mem_control_stat_file_operations = { - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int mem_control_stat_open(struct inode *unused, struct file *file) -{ - /* XXX __d_cont */ - struct cgroup *cont = file->f_dentry->d_parent->d_fsdata; - - file->f_op = &mem_control_stat_file_operations; - return single_open(file, mem_control_stat_show, cont); -} - static struct cftype mem_cgroup_files[] = { { .name = "usage_in_bytes", .private = RES_USAGE, - .read = mem_cgroup_read, + .read_u64 = mem_cgroup_read, + }, + { + .name = "max_usage_in_bytes", + .private = RES_MAX_USAGE, + .trigger = mem_cgroup_reset, + .read_u64 = mem_cgroup_read, }, { .name = "limit_in_bytes", .private = RES_LIMIT, .write = mem_cgroup_write, - .read = mem_cgroup_read, + .read_u64 = mem_cgroup_read, }, { .name = "failcnt", .private = RES_FAILCNT, - .read = mem_cgroup_read, + .trigger = mem_cgroup_reset, + .read_u64 = mem_cgroup_read, }, { .name = "force_empty", - .write = mem_force_empty_write, - .read = mem_force_empty_read, + .trigger = mem_force_empty_write, }, { .name = "stat", - .open = mem_control_stat_open, + .read_map = mem_control_stat_show, }, }; @@ -1010,6 +995,29 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) kfree(mem->info.nodeinfo[node]); } +static struct mem_cgroup *mem_cgroup_alloc(void) +{ + struct mem_cgroup *mem; + + if (sizeof(*mem) < PAGE_SIZE) + mem = kmalloc(sizeof(*mem), GFP_KERNEL); + else + mem = vmalloc(sizeof(*mem)); + + if (mem) + memset(mem, 0, sizeof(*mem)); + return mem; +} + +static void mem_cgroup_free(struct mem_cgroup *mem) +{ + if (sizeof(*mem) < PAGE_SIZE) + kfree(mem); + else + vfree(mem); +} + + static struct cgroup_subsys_state * mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) { @@ -1018,17 +1026,15 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) if (unlikely((cont->parent) == NULL)) { mem = &init_mem_cgroup; - init_mm.mem_cgroup = mem; - } else - mem = kzalloc(sizeof(struct mem_cgroup), GFP_KERNEL); - - if (mem == NULL) - return ERR_PTR(-ENOMEM); + page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC); + } else { + mem = mem_cgroup_alloc(); + if (!mem) + return ERR_PTR(-ENOMEM); + } res_counter_init(&mem->res); - memset(&mem->info, 0, sizeof(mem->info)); - for_each_node_state(node, N_POSSIBLE) if (alloc_mem_cgroup_per_zone_info(mem, node)) goto free_out; @@ -1038,7 +1044,7 @@ free_out: for_each_node_state(node, N_POSSIBLE) free_mem_cgroup_per_zone_info(mem, node); if (cont->parent != NULL) - kfree(mem); + mem_cgroup_free(mem); return ERR_PTR(-ENOMEM); } @@ -1058,7 +1064,7 @@ static void mem_cgroup_destroy(struct cgroup_subsys *ss, for_each_node_state(node, N_POSSIBLE) free_mem_cgroup_per_zone_info(mem, node); - kfree(mem_cgroup_from_cont(cont)); + mem_cgroup_free(mem_cgroup_from_cont(cont)); } static int mem_cgroup_populate(struct cgroup_subsys *ss, @@ -1098,10 +1104,6 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, if (!thread_group_leader(p)) goto out; - css_get(&mem->css); - rcu_assign_pointer(mm->mem_cgroup, mem); - css_put(&old_mem->css); - out: mmput(mm); } diff --git a/mm/memory.c b/mm/memory.c index 0d14d1e58a5..bbab1e37055 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -371,57 +371,93 @@ static inline int is_cow_mapping(unsigned int flags) } /* - * This function gets the "struct page" associated with a pte. + * vm_normal_page -- This function gets the "struct page" associated with a pte. * - * NOTE! Some mappings do not have "struct pages". A raw PFN mapping - * will have each page table entry just pointing to a raw page frame - * number, and as far as the VM layer is concerned, those do not have - * pages associated with them - even if the PFN might point to memory - * that otherwise is perfectly fine and has a "struct page". + * "Special" mappings do not wish to be associated with a "struct page" (either + * it doesn't exist, or it exists but they don't want to touch it). In this + * case, NULL is returned here. "Normal" mappings do have a struct page. * - * The way we recognize those mappings is through the rules set up - * by "remap_pfn_range()": the vma will have the VM_PFNMAP bit set, - * and the vm_pgoff will point to the first PFN mapped: thus every - * page that is a raw mapping will always honor the rule + * There are 2 broad cases. Firstly, an architecture may define a pte_special() + * pte bit, in which case this function is trivial. Secondly, an architecture + * may not have a spare pte bit, which requires a more complicated scheme, + * described below. + * + * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a + * special mapping (even if there are underlying and valid "struct pages"). + * COWed pages of a VM_PFNMAP are always normal. + * + * The way we recognize COWed pages within VM_PFNMAP mappings is through the + * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit + * set, and the vm_pgoff will point to the first PFN mapped: thus every special + * mapping will always honor the rule * * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) * - * and if that isn't true, the page has been COW'ed (in which case it - * _does_ have a "struct page" associated with it even if it is in a - * VM_PFNMAP range). + * And for normal mappings this is false. + * + * This restricts such mappings to be a linear translation from virtual address + * to pfn. To get around this restriction, we allow arbitrary mappings so long + * as the vma is not a COW mapping; in that case, we know that all ptes are + * special (because none can have been COWed). + * + * + * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP. + * + * VM_MIXEDMAP mappings can likewise contain memory with or without "struct + * page" backing, however the difference is that _all_ pages with a struct + * page (that is, those where pfn_valid is true) are refcounted and considered + * normal pages by the VM. The disadvantage is that pages are refcounted + * (which can be slower and simply not an option for some PFNMAP users). The + * advantage is that we don't have to follow the strict linearity rule of + * PFNMAP mappings in order to support COWable mappings. + * */ -struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte) +#ifdef __HAVE_ARCH_PTE_SPECIAL +# define HAVE_PTE_SPECIAL 1 +#else +# define HAVE_PTE_SPECIAL 0 +#endif +struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, + pte_t pte) { - unsigned long pfn = pte_pfn(pte); + unsigned long pfn; - if (unlikely(vma->vm_flags & VM_PFNMAP)) { - unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; - if (pfn == vma->vm_pgoff + off) - return NULL; - if (!is_cow_mapping(vma->vm_flags)) - return NULL; + if (HAVE_PTE_SPECIAL) { + if (likely(!pte_special(pte))) { + VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + return pte_page(pte); + } + VM_BUG_ON(!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))); + return NULL; } -#ifdef CONFIG_DEBUG_VM - /* - * Add some anal sanity checks for now. Eventually, - * we should just do "return pfn_to_page(pfn)", but - * in the meantime we check that we get a valid pfn, - * and that the resulting page looks ok. - */ - if (unlikely(!pfn_valid(pfn))) { - print_bad_pte(vma, pte, addr); - return NULL; + /* !HAVE_PTE_SPECIAL case follows: */ + + pfn = pte_pfn(pte); + + if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { + if (vma->vm_flags & VM_MIXEDMAP) { + if (!pfn_valid(pfn)) + return NULL; + goto out; + } else { + unsigned long off; + off = (addr - vma->vm_start) >> PAGE_SHIFT; + if (pfn == vma->vm_pgoff + off) + return NULL; + if (!is_cow_mapping(vma->vm_flags)) + return NULL; + } } -#endif + + VM_BUG_ON(!pfn_valid(pfn)); /* - * NOTE! We still have PageReserved() pages in the page - * tables. + * NOTE! We still have PageReserved() pages in the page tables. * - * The PAGE_ZERO() pages and various VDSO mappings can - * cause them to exist. + * eg. VDSO mappings can cause them to exist. */ +out: return pfn_to_page(pfn); } @@ -1057,8 +1093,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (pages) foll_flags |= FOLL_GET; if (!write && !(vma->vm_flags & VM_LOCKED) && - (!vma->vm_ops || (!vma->vm_ops->nopage && - !vma->vm_ops->fault))) + (!vma->vm_ops || !vma->vm_ops->fault)) foll_flags |= FOLL_ANON; do { @@ -1141,8 +1176,10 @@ pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, * old drivers should use this, and they needed to mark their * pages reserved for the old functions anyway. */ -static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *page, pgprot_t prot) +static int insert_page(struct vm_area_struct *vma, unsigned long addr, + struct page *page, pgprot_t prot) { + struct mm_struct *mm = vma->vm_mm; int retval; pte_t *pte; spinlock_t *ptl; @@ -1202,40 +1239,26 @@ out: * * The page does not need to be reserved. */ -int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page) +int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, + struct page *page) { if (addr < vma->vm_start || addr >= vma->vm_end) return -EFAULT; if (!page_count(page)) return -EINVAL; vma->vm_flags |= VM_INSERTPAGE; - return insert_page(vma->vm_mm, addr, page, vma->vm_page_prot); + return insert_page(vma, addr, page, vma->vm_page_prot); } EXPORT_SYMBOL(vm_insert_page); -/** - * vm_insert_pfn - insert single pfn into user vma - * @vma: user vma to map to - * @addr: target user address of this page - * @pfn: source kernel pfn - * - * Similar to vm_inert_page, this allows drivers to insert individual pages - * they've allocated into a user vma. Same comments apply. - * - * This function should only be called from a vm_ops->fault handler, and - * in that case the handler should return NULL. - */ -int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn) +static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, pgprot_t prot) { struct mm_struct *mm = vma->vm_mm; int retval; pte_t *pte, entry; spinlock_t *ptl; - BUG_ON(!(vma->vm_flags & VM_PFNMAP)); - BUG_ON(is_cow_mapping(vma->vm_flags)); - retval = -ENOMEM; pte = get_locked_pte(mm, addr, &ptl); if (!pte) @@ -1245,19 +1268,74 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, goto out_unlock; /* Ok, finally just insert the thing.. */ - entry = pfn_pte(pfn, vma->vm_page_prot); + entry = pte_mkspecial(pfn_pte(pfn, prot)); set_pte_at(mm, addr, pte, entry); - update_mmu_cache(vma, addr, entry); + update_mmu_cache(vma, addr, entry); /* XXX: why not for insert_page? */ retval = 0; out_unlock: pte_unmap_unlock(pte, ptl); - out: return retval; } + +/** + * vm_insert_pfn - insert single pfn into user vma + * @vma: user vma to map to + * @addr: target user address of this page + * @pfn: source kernel pfn + * + * Similar to vm_inert_page, this allows drivers to insert individual pages + * they've allocated into a user vma. Same comments apply. + * + * This function should only be called from a vm_ops->fault handler, and + * in that case the handler should return NULL. + */ +int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn) +{ + /* + * Technically, architectures with pte_special can avoid all these + * restrictions (same for remap_pfn_range). However we would like + * consistency in testing and feature parity among all, so we should + * try to keep these invariants in place for everybody. + */ + BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); + BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == + (VM_PFNMAP|VM_MIXEDMAP)); + BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); + BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); + + if (addr < vma->vm_start || addr >= vma->vm_end) + return -EFAULT; + return insert_pfn(vma, addr, pfn, vma->vm_page_prot); +} EXPORT_SYMBOL(vm_insert_pfn); +int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn) +{ + BUG_ON(!(vma->vm_flags & VM_MIXEDMAP)); + + if (addr < vma->vm_start || addr >= vma->vm_end) + return -EFAULT; + + /* + * If we don't have pte special, then we have to use the pfn_valid() + * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must* + * refcount the page if pfn_valid is true (hence insert_page rather + * than insert_pfn). + */ + if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) { + struct page *page; + + page = pfn_to_page(pfn); + return insert_page(vma, addr, page, vma->vm_page_prot); + } + return insert_pfn(vma, addr, pfn, vma->vm_page_prot); +} +EXPORT_SYMBOL(vm_insert_mixed); + /* * maps a range of physical memory into the requested pages. the old * mappings are removed. any references to nonexistent pages results @@ -1276,7 +1354,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, arch_enter_lazy_mmu_mode(); do { BUG_ON(!pte_none(*pte)); - set_pte_at(mm, addr, pte, pfn_pte(pfn, prot)); + set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); @@ -2199,20 +2277,9 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, BUG_ON(vma->vm_flags & VM_PFNMAP); - if (likely(vma->vm_ops->fault)) { - ret = vma->vm_ops->fault(vma, &vmf); - if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) - return ret; - } else { - /* Legacy ->nopage path */ - ret = 0; - vmf.page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret); - /* no page was available -- either SIGBUS or OOM */ - if (unlikely(vmf.page == NOPAGE_SIGBUS)) - return VM_FAULT_SIGBUS; - else if (unlikely(vmf.page == NOPAGE_OOM)) - return VM_FAULT_OOM; - } + ret = vma->vm_ops->fault(vma, &vmf); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) + return ret; /* * For consistency in subsequent calls, make the faulted page always @@ -2377,10 +2444,13 @@ static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long pfn; pte_unmap(page_table); - BUG_ON(!(vma->vm_flags & VM_PFNMAP)); - BUG_ON(is_cow_mapping(vma->vm_flags)); + BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); + BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK); + + BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); + if (unlikely(pfn == NOPFN_OOM)) return VM_FAULT_OOM; else if (unlikely(pfn == NOPFN_SIGBUS)) @@ -2458,7 +2528,7 @@ static inline int handle_pte_fault(struct mm_struct *mm, if (!pte_present(entry)) { if (pte_none(entry)) { if (vma->vm_ops) { - if (vma->vm_ops->fault || vma->vm_ops->nopage) + if (likely(vma->vm_ops->fault)) return do_linear_fault(mm, vma, address, pte, pmd, write_access, entry); if (unlikely(vma->vm_ops->nopfn)) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 0fb33027127..b17dca7249f 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -29,6 +29,8 @@ #include <asm/tlbflush.h> +#include "internal.h" + /* add this memory to iomem resource */ static struct resource *register_memory_resource(u64 start, u64 size) { @@ -58,8 +60,105 @@ static void release_memory_resource(struct resource *res) return; } - #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE +#ifndef CONFIG_SPARSEMEM_VMEMMAP +static void get_page_bootmem(unsigned long info, struct page *page, int magic) +{ + atomic_set(&page->_mapcount, magic); + SetPagePrivate(page); + set_page_private(page, info); + atomic_inc(&page->_count); +} + +void put_page_bootmem(struct page *page) +{ + int magic; + + magic = atomic_read(&page->_mapcount); + BUG_ON(magic >= -1); + + if (atomic_dec_return(&page->_count) == 1) { + ClearPagePrivate(page); + set_page_private(page, 0); + reset_page_mapcount(page); + __free_pages_bootmem(page, 0); + } + +} + +void register_page_bootmem_info_section(unsigned long start_pfn) +{ + unsigned long *usemap, mapsize, section_nr, i; + struct mem_section *ms; + struct page *page, *memmap; + + if (!pfn_valid(start_pfn)) + return; + + section_nr = pfn_to_section_nr(start_pfn); + ms = __nr_to_section(section_nr); + + /* Get section's memmap address */ + memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); + + /* + * Get page for the memmap's phys address + * XXX: need more consideration for sparse_vmemmap... + */ + page = virt_to_page(memmap); + mapsize = sizeof(struct page) * PAGES_PER_SECTION; + mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT; + + /* remember memmap's page */ + for (i = 0; i < mapsize; i++, page++) + get_page_bootmem(section_nr, page, SECTION_INFO); + + usemap = __nr_to_section(section_nr)->pageblock_flags; + page = virt_to_page(usemap); + + mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; + + for (i = 0; i < mapsize; i++, page++) + get_page_bootmem(section_nr, page, MIX_INFO); + +} + +void register_page_bootmem_info_node(struct pglist_data *pgdat) +{ + unsigned long i, pfn, end_pfn, nr_pages; + int node = pgdat->node_id; + struct page *page; + struct zone *zone; + + nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT; + page = virt_to_page(pgdat); + + for (i = 0; i < nr_pages; i++, page++) + get_page_bootmem(node, page, NODE_INFO); + + zone = &pgdat->node_zones[0]; + for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) { + if (zone->wait_table) { + nr_pages = zone->wait_table_hash_nr_entries + * sizeof(wait_queue_head_t); + nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT; + page = virt_to_page(zone->wait_table); + + for (i = 0; i < nr_pages; i++, page++) + get_page_bootmem(node, page, NODE_INFO); + } + } + + pfn = pgdat->node_start_pfn; + end_pfn = pfn + pgdat->node_spanned_pages; + + /* register_section info */ + for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) + register_page_bootmem_info_section(pfn); + +} +#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ + static int __add_zone(struct zone *zone, unsigned long phys_start_pfn) { struct pglist_data *pgdat = zone->zone_pgdat; @@ -101,6 +200,36 @@ static int __add_section(struct zone *zone, unsigned long phys_start_pfn) return register_new_memory(__pfn_to_section(phys_start_pfn)); } +#ifdef CONFIG_SPARSEMEM_VMEMMAP +static int __remove_section(struct zone *zone, struct mem_section *ms) +{ + /* + * XXX: Freeing memmap with vmemmap is not implement yet. + * This should be removed later. + */ + return -EBUSY; +} +#else +static int __remove_section(struct zone *zone, struct mem_section *ms) +{ + unsigned long flags; + struct pglist_data *pgdat = zone->zone_pgdat; + int ret = -EINVAL; + + if (!valid_section(ms)) + return ret; + + ret = unregister_memory_section(ms); + if (ret) + return ret; + + pgdat_resize_lock(pgdat, &flags); + sparse_remove_one_section(zone, ms); + pgdat_resize_unlock(pgdat, &flags); + return 0; +} +#endif + /* * Reasonably generic function for adding memory. It is * expected that archs that support memory hotplug will @@ -134,6 +263,42 @@ int __add_pages(struct zone *zone, unsigned long phys_start_pfn, } EXPORT_SYMBOL_GPL(__add_pages); +/** + * __remove_pages() - remove sections of pages from a zone + * @zone: zone from which pages need to be removed + * @phys_start_pfn: starting pageframe (must be aligned to start of a section) + * @nr_pages: number of pages to remove (must be multiple of section size) + * + * Generic helper function to remove section mappings and sysfs entries + * for the section of the memory we are removing. Caller needs to make + * sure that pages are marked reserved and zones are adjust properly by + * calling offline_pages(). + */ +int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, + unsigned long nr_pages) +{ + unsigned long i, ret = 0; + int sections_to_remove; + + /* + * We can only remove entire sections + */ + BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); + BUG_ON(nr_pages % PAGES_PER_SECTION); + + release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE); + + sections_to_remove = nr_pages / PAGES_PER_SECTION; + for (i = 0; i < sections_to_remove; i++) { + unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; + ret = __remove_section(zone, __pfn_to_section(pfn)); + if (ret) + break; + } + return ret; +} +EXPORT_SYMBOL_GPL(__remove_pages); + static void grow_zone_span(struct zone *zone, unsigned long start_pfn, unsigned long end_pfn) { @@ -164,6 +329,25 @@ static void grow_pgdat_span(struct pglist_data *pgdat, pgdat->node_start_pfn; } +void online_page(struct page *page) +{ + totalram_pages++; + num_physpages++; + +#ifdef CONFIG_HIGHMEM + if (PageHighMem(page)) + totalhigh_pages++; +#endif + +#ifdef CONFIG_FLATMEM + max_mapnr = max(page_to_pfn(page), max_mapnr); +#endif + + ClearPageReserved(page); + init_page_count(page); + __free_page(page); +} + static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, void *arg) { diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3c360112150..a37a5034f63 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -63,7 +63,6 @@ grows down? make bind policy root only? It can trigger oom much faster and the kernel is not always grateful with that. - could replace all the switch()es with a mempolicy_ops structure. */ #include <linux/mempolicy.h> @@ -89,6 +88,7 @@ #include <linux/rmap.h> #include <linux/security.h> #include <linux/syscalls.h> +#include <linux/ctype.h> #include <asm/tlbflush.h> #include <asm/uaccess.h> @@ -105,142 +105,264 @@ static struct kmem_cache *sn_cache; policied. */ enum zone_type policy_zone = 0; +/* + * run-time system-wide default policy => local allocation + */ struct mempolicy default_policy = { .refcnt = ATOMIC_INIT(1), /* never free it */ - .policy = MPOL_DEFAULT, + .mode = MPOL_PREFERRED, + .flags = MPOL_F_LOCAL, }; -static void mpol_rebind_policy(struct mempolicy *pol, - const nodemask_t *newmask); +static const struct mempolicy_operations { + int (*create)(struct mempolicy *pol, const nodemask_t *nodes); + void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes); +} mpol_ops[MPOL_MAX]; -/* Do sanity checking on a policy */ -static int mpol_check_policy(int mode, nodemask_t *nodes) +/* Check that the nodemask contains at least one populated zone */ +static int is_valid_nodemask(const nodemask_t *nodemask) { - int was_empty, is_empty; + int nd, k; - if (!nodes) - return 0; + /* Check that there is something useful in this mask */ + k = policy_zone; - /* - * "Contextualize" the in-coming nodemast for cpusets: - * Remember whether in-coming nodemask was empty, If not, - * restrict the nodes to the allowed nodes in the cpuset. - * This is guaranteed to be a subset of nodes with memory. - */ - cpuset_update_task_memory_state(); - is_empty = was_empty = nodes_empty(*nodes); - if (!was_empty) { - nodes_and(*nodes, *nodes, cpuset_current_mems_allowed); - is_empty = nodes_empty(*nodes); /* after "contextualization" */ - } + for_each_node_mask(nd, *nodemask) { + struct zone *z; - switch (mode) { - case MPOL_DEFAULT: - /* - * require caller to specify an empty nodemask - * before "contextualization" - */ - if (!was_empty) - return -EINVAL; - break; - case MPOL_BIND: - case MPOL_INTERLEAVE: - /* - * require at least 1 valid node after "contextualization" - */ - if (is_empty) - return -EINVAL; - break; - case MPOL_PREFERRED: - /* - * Did caller specify invalid nodes? - * Don't silently accept this as "local allocation". - */ - if (!was_empty && is_empty) - return -EINVAL; - break; + for (k = 0; k <= policy_zone; k++) { + z = &NODE_DATA(nd)->node_zones[k]; + if (z->present_pages > 0) + return 1; + } } + return 0; } -/* Generate a custom zonelist for the BIND policy. */ -static struct zonelist *bind_zonelist(nodemask_t *nodes) +static inline int mpol_store_user_nodemask(const struct mempolicy *pol) { - struct zonelist *zl; - int num, max, nd; - enum zone_type k; + return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES); +} - max = 1 + MAX_NR_ZONES * nodes_weight(*nodes); - max++; /* space for zlcache_ptr (see mmzone.h) */ - zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL); - if (!zl) - return ERR_PTR(-ENOMEM); - zl->zlcache_ptr = NULL; - num = 0; - /* First put in the highest zones from all nodes, then all the next - lower zones etc. Avoid empty zones because the memory allocator - doesn't like them. If you implement node hot removal you - have to fix that. */ - k = MAX_NR_ZONES - 1; - while (1) { - for_each_node_mask(nd, *nodes) { - struct zone *z = &NODE_DATA(nd)->node_zones[k]; - if (z->present_pages > 0) - zl->zones[num++] = z; - } - if (k == 0) - break; - k--; - } - if (num == 0) { - kfree(zl); - return ERR_PTR(-EINVAL); - } - zl->zones[num] = NULL; - return zl; +static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, + const nodemask_t *rel) +{ + nodemask_t tmp; + nodes_fold(tmp, *orig, nodes_weight(*rel)); + nodes_onto(*ret, tmp, *rel); +} + +static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes) +{ + if (nodes_empty(*nodes)) + return -EINVAL; + pol->v.nodes = *nodes; + return 0; +} + +static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) +{ + if (!nodes) + pol->flags |= MPOL_F_LOCAL; /* local allocation */ + else if (nodes_empty(*nodes)) + return -EINVAL; /* no allowed nodes */ + else + pol->v.preferred_node = first_node(*nodes); + return 0; +} + +static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) +{ + if (!is_valid_nodemask(nodes)) + return -EINVAL; + pol->v.nodes = *nodes; + return 0; } /* Create a new policy */ -static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) +static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, + nodemask_t *nodes) { struct mempolicy *policy; + nodemask_t cpuset_context_nmask; + int ret; - pr_debug("setting mode %d nodes[0] %lx\n", - mode, nodes ? nodes_addr(*nodes)[0] : -1); + pr_debug("setting mode %d flags %d nodes[0] %lx\n", + mode, flags, nodes ? nodes_addr(*nodes)[0] : -1); - if (mode == MPOL_DEFAULT) - return NULL; + if (mode == MPOL_DEFAULT) { + if (nodes && !nodes_empty(*nodes)) + return ERR_PTR(-EINVAL); + return NULL; /* simply delete any existing policy */ + } + VM_BUG_ON(!nodes); + + /* + * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or + * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation). + * All other modes require a valid pointer to a non-empty nodemask. + */ + if (mode == MPOL_PREFERRED) { + if (nodes_empty(*nodes)) { + if (((flags & MPOL_F_STATIC_NODES) || + (flags & MPOL_F_RELATIVE_NODES))) + return ERR_PTR(-EINVAL); + nodes = NULL; /* flag local alloc */ + } + } else if (nodes_empty(*nodes)) + return ERR_PTR(-EINVAL); policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!policy) return ERR_PTR(-ENOMEM); atomic_set(&policy->refcnt, 1); - switch (mode) { - case MPOL_INTERLEAVE: - policy->v.nodes = *nodes; - if (nodes_weight(policy->v.nodes) == 0) { - kmem_cache_free(policy_cache, policy); - return ERR_PTR(-EINVAL); - } - break; - case MPOL_PREFERRED: - policy->v.preferred_node = first_node(*nodes); - if (policy->v.preferred_node >= MAX_NUMNODES) - policy->v.preferred_node = -1; - break; - case MPOL_BIND: - policy->v.zonelist = bind_zonelist(nodes); - if (IS_ERR(policy->v.zonelist)) { - void *error_code = policy->v.zonelist; - kmem_cache_free(policy_cache, policy); - return error_code; - } - break; + policy->mode = mode; + policy->flags = flags; + + if (nodes) { + /* + * cpuset related setup doesn't apply to local allocation + */ + cpuset_update_task_memory_state(); + if (flags & MPOL_F_RELATIVE_NODES) + mpol_relative_nodemask(&cpuset_context_nmask, nodes, + &cpuset_current_mems_allowed); + else + nodes_and(cpuset_context_nmask, *nodes, + cpuset_current_mems_allowed); + if (mpol_store_user_nodemask(policy)) + policy->w.user_nodemask = *nodes; + else + policy->w.cpuset_mems_allowed = + cpuset_mems_allowed(current); + } + + ret = mpol_ops[mode].create(policy, + nodes ? &cpuset_context_nmask : NULL); + if (ret < 0) { + kmem_cache_free(policy_cache, policy); + return ERR_PTR(ret); } - policy->policy = mode; - policy->cpuset_mems_allowed = cpuset_mems_allowed(current); return policy; } +/* Slow path of a mpol destructor. */ +void __mpol_put(struct mempolicy *p) +{ + if (!atomic_dec_and_test(&p->refcnt)) + return; + kmem_cache_free(policy_cache, p); +} + +static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) +{ +} + +static void mpol_rebind_nodemask(struct mempolicy *pol, + const nodemask_t *nodes) +{ + nodemask_t tmp; + + if (pol->flags & MPOL_F_STATIC_NODES) + nodes_and(tmp, pol->w.user_nodemask, *nodes); + else if (pol->flags & MPOL_F_RELATIVE_NODES) + mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); + else { + nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed, + *nodes); + pol->w.cpuset_mems_allowed = *nodes; + } + + pol->v.nodes = tmp; + if (!node_isset(current->il_next, tmp)) { + current->il_next = next_node(current->il_next, tmp); + if (current->il_next >= MAX_NUMNODES) + current->il_next = first_node(tmp); + if (current->il_next >= MAX_NUMNODES) + current->il_next = numa_node_id(); + } +} + +static void mpol_rebind_preferred(struct mempolicy *pol, + const nodemask_t *nodes) +{ + nodemask_t tmp; + + if (pol->flags & MPOL_F_STATIC_NODES) { + int node = first_node(pol->w.user_nodemask); + + if (node_isset(node, *nodes)) { + pol->v.preferred_node = node; + pol->flags &= ~MPOL_F_LOCAL; + } else + pol->flags |= MPOL_F_LOCAL; + } else if (pol->flags & MPOL_F_RELATIVE_NODES) { + mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); + pol->v.preferred_node = first_node(tmp); + } else if (!(pol->flags & MPOL_F_LOCAL)) { + pol->v.preferred_node = node_remap(pol->v.preferred_node, + pol->w.cpuset_mems_allowed, + *nodes); + pol->w.cpuset_mems_allowed = *nodes; + } +} + +/* Migrate a policy to a different set of nodes */ +static void mpol_rebind_policy(struct mempolicy *pol, + const nodemask_t *newmask) +{ + if (!pol) + return; + if (!mpol_store_user_nodemask(pol) && + nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) + return; + mpol_ops[pol->mode].rebind(pol, newmask); +} + +/* + * Wrapper for mpol_rebind_policy() that just requires task + * pointer, and updates task mempolicy. + */ + +void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) +{ + mpol_rebind_policy(tsk->mempolicy, new); +} + +/* + * Rebind each vma in mm to new nodemask. + * + * Call holding a reference to mm. Takes mm->mmap_sem during call. + */ + +void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) +{ + struct vm_area_struct *vma; + + down_write(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) + mpol_rebind_policy(vma->vm_policy, new); + up_write(&mm->mmap_sem); +} + +static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { + [MPOL_DEFAULT] = { + .rebind = mpol_rebind_default, + }, + [MPOL_INTERLEAVE] = { + .create = mpol_new_interleave, + .rebind = mpol_rebind_nodemask, + }, + [MPOL_PREFERRED] = { + .create = mpol_new_preferred, + .rebind = mpol_rebind_preferred, + }, + [MPOL_BIND] = { + .create = mpol_new_bind, + .rebind = mpol_rebind_nodemask, + }, +}; + static void gather_stats(struct page *, void *, int pte_dirty); static void migrate_page_add(struct page *page, struct list_head *pagelist, unsigned long flags); @@ -421,7 +543,7 @@ static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new) if (!err) { mpol_get(new); vma->vm_policy = new; - mpol_free(old); + mpol_put(old); } return err; } @@ -479,46 +601,55 @@ static void mpol_set_task_struct_flag(void) } /* Set the process memory policy */ -static long do_set_mempolicy(int mode, nodemask_t *nodes) +static long do_set_mempolicy(unsigned short mode, unsigned short flags, + nodemask_t *nodes) { struct mempolicy *new; + struct mm_struct *mm = current->mm; - if (mpol_check_policy(mode, nodes)) - return -EINVAL; - new = mpol_new(mode, nodes); + new = mpol_new(mode, flags, nodes); if (IS_ERR(new)) return PTR_ERR(new); - mpol_free(current->mempolicy); + + /* + * prevent changing our mempolicy while show_numa_maps() + * is using it. + * Note: do_set_mempolicy() can be called at init time + * with no 'mm'. + */ + if (mm) + down_write(&mm->mmap_sem); + mpol_put(current->mempolicy); current->mempolicy = new; mpol_set_task_struct_flag(); - if (new && new->policy == MPOL_INTERLEAVE) + if (new && new->mode == MPOL_INTERLEAVE && + nodes_weight(new->v.nodes)) current->il_next = first_node(new->v.nodes); + if (mm) + up_write(&mm->mmap_sem); + return 0; } -/* Fill a zone bitmap for a policy */ -static void get_zonemask(struct mempolicy *p, nodemask_t *nodes) +/* + * Return nodemask for policy for get_mempolicy() query + */ +static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) { - int i; - nodes_clear(*nodes); - switch (p->policy) { + if (p == &default_policy) + return; + + switch (p->mode) { case MPOL_BIND: - for (i = 0; p->v.zonelist->zones[i]; i++) - node_set(zone_to_nid(p->v.zonelist->zones[i]), - *nodes); - break; - case MPOL_DEFAULT: - break; + /* Fall through */ case MPOL_INTERLEAVE: *nodes = p->v.nodes; break; case MPOL_PREFERRED: - /* or use current node instead of memory_map? */ - if (p->v.preferred_node < 0) - *nodes = node_states[N_HIGH_MEMORY]; - else + if (!(p->flags & MPOL_F_LOCAL)) node_set(p->v.preferred_node, *nodes); + /* else return empty node mask for local allocation */ break; default: BUG(); @@ -561,6 +692,11 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, } if (flags & MPOL_F_ADDR) { + /* + * Do NOT fall back to task policy if the + * vma/shared policy at addr is NULL. We + * want to return MPOL_DEFAULT in this case. + */ down_read(&mm->mmap_sem); vma = find_vma_intersection(mm, addr, addr+1); if (!vma) { @@ -575,7 +711,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, return -EINVAL; if (!pol) - pol = &default_policy; + pol = &default_policy; /* indicates default behavior */ if (flags & MPOL_F_NODE) { if (flags & MPOL_F_ADDR) { @@ -584,14 +720,17 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, goto out; *policy = err; } else if (pol == current->mempolicy && - pol->policy == MPOL_INTERLEAVE) { + pol->mode == MPOL_INTERLEAVE) { *policy = current->il_next; } else { err = -EINVAL; goto out; } - } else - *policy = pol->policy; + } else { + *policy = pol == &default_policy ? MPOL_DEFAULT : + pol->mode; + *policy |= pol->flags; + } if (vma) { up_read(¤t->mm->mmap_sem); @@ -600,9 +739,10 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, err = 0; if (nmask) - get_zonemask(pol, nmask); + get_policy_nodemask(pol, nmask); out: + mpol_cond_put(pol); if (vma) up_read(¤t->mm->mmap_sem); return err; @@ -664,7 +804,7 @@ int do_migrate_pages(struct mm_struct *mm, int err = 0; nodemask_t tmp; - down_read(&mm->mmap_sem); + down_read(&mm->mmap_sem); err = migrate_vmas(mm, from_nodes, to_nodes, flags); if (err) @@ -781,8 +921,8 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int * #endif static long do_mbind(unsigned long start, unsigned long len, - unsigned long mode, nodemask_t *nmask, - unsigned long flags) + unsigned short mode, unsigned short mode_flags, + nodemask_t *nmask, unsigned long flags) { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; @@ -791,9 +931,8 @@ static long do_mbind(unsigned long start, unsigned long len, int err; LIST_HEAD(pagelist); - if ((flags & ~(unsigned long)(MPOL_MF_STRICT | - MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) - || mode > MPOL_MAX) + if (flags & ~(unsigned long)(MPOL_MF_STRICT | + MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) return -EINVAL; if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) return -EPERM; @@ -812,10 +951,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (end == start) return 0; - if (mpol_check_policy(mode, nmask)) - return -EINVAL; - - new = mpol_new(mode, nmask); + new = mpol_new(mode, mode_flags, nmask); if (IS_ERR(new)) return PTR_ERR(new); @@ -826,8 +962,9 @@ static long do_mbind(unsigned long start, unsigned long len, if (!new) flags |= MPOL_MF_DISCONTIG_OK; - pr_debug("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, - mode, nmask ? nodes_addr(*nmask)[0] : -1); + pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n", + start, start + len, mode, mode_flags, + nmask ? nodes_addr(*nmask)[0] : -1); down_write(&mm->mmap_sem); vma = check_range(mm, start, end, nmask, @@ -848,7 +985,7 @@ static long do_mbind(unsigned long start, unsigned long len, } up_write(&mm->mmap_sem); - mpol_free(new); + mpol_put(new); return err; } @@ -926,11 +1063,19 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len, { nodemask_t nodes; int err; + unsigned short mode_flags; + mode_flags = mode & MPOL_MODE_FLAGS; + mode &= ~MPOL_MODE_FLAGS; + if (mode >= MPOL_MAX) + return -EINVAL; + if ((mode_flags & MPOL_F_STATIC_NODES) && + (mode_flags & MPOL_F_RELATIVE_NODES)) + return -EINVAL; err = get_nodes(&nodes, nmask, maxnode); if (err) return err; - return do_mbind(start, len, mode, &nodes, flags); + return do_mbind(start, len, mode, mode_flags, &nodes, flags); } /* Set the process memory policy */ @@ -939,13 +1084,18 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, { int err; nodemask_t nodes; + unsigned short flags; - if (mode < 0 || mode > MPOL_MAX) + flags = mode & MPOL_MODE_FLAGS; + mode &= ~MPOL_MODE_FLAGS; + if ((unsigned int)mode >= MPOL_MAX) + return -EINVAL; + if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES)) return -EINVAL; err = get_nodes(&nodes, nmask, maxnode); if (err) return err; - return do_set_mempolicy(mode, &nodes); + return do_set_mempolicy(mode, flags, &nodes); } asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, @@ -1131,59 +1281,75 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, * * Returns effective policy for a VMA at specified address. * Falls back to @task or system default policy, as necessary. - * Returned policy has extra reference count if shared, vma, - * or some other task's policy [show_numa_maps() can pass - * @task != current]. It is the caller's responsibility to - * free the reference in these cases. + * Current or other task's task mempolicy and non-shared vma policies + * are protected by the task's mmap_sem, which must be held for read by + * the caller. + * Shared policies [those marked as MPOL_F_SHARED] require an extra reference + * count--added by the get_policy() vm_op, as appropriate--to protect against + * freeing by another task. It is the caller's responsibility to free the + * extra reference for shared policies. */ -static struct mempolicy * get_vma_policy(struct task_struct *task, +static struct mempolicy *get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = task->mempolicy; - int shared_pol = 0; if (vma) { if (vma->vm_ops && vma->vm_ops->get_policy) { - pol = vma->vm_ops->get_policy(vma, addr); - shared_pol = 1; /* if pol non-NULL, add ref below */ - } else if (vma->vm_policy && - vma->vm_policy->policy != MPOL_DEFAULT) + struct mempolicy *vpol = vma->vm_ops->get_policy(vma, + addr); + if (vpol) + pol = vpol; + } else if (vma->vm_policy) pol = vma->vm_policy; } if (!pol) pol = &default_policy; - else if (!shared_pol && pol != current->mempolicy) - mpol_get(pol); /* vma or other task's policy */ return pol; } -/* Return a zonelist representing a mempolicy */ -static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy) +/* + * Return a nodemask representing a mempolicy for filtering nodes for + * page allocation + */ +static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) { - int nd; + /* Lower zones don't get a nodemask applied for MPOL_BIND */ + if (unlikely(policy->mode == MPOL_BIND) && + gfp_zone(gfp) >= policy_zone && + cpuset_nodemask_valid_mems_allowed(&policy->v.nodes)) + return &policy->v.nodes; - switch (policy->policy) { + return NULL; +} + +/* Return a zonelist indicated by gfp for node representing a mempolicy */ +static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy) +{ + int nd = numa_node_id(); + + switch (policy->mode) { case MPOL_PREFERRED: - nd = policy->v.preferred_node; - if (nd < 0) - nd = numa_node_id(); + if (!(policy->flags & MPOL_F_LOCAL)) + nd = policy->v.preferred_node; break; case MPOL_BIND: - /* Lower zones don't get a policy applied */ - /* Careful: current->mems_allowed might have moved */ - if (gfp_zone(gfp) >= policy_zone) - if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist)) - return policy->v.zonelist; - /*FALL THROUGH*/ + /* + * Normally, MPOL_BIND allocations are node-local within the + * allowed nodemask. However, if __GFP_THISNODE is set and the + * current node is part of the mask, we use the zonelist for + * the first node in the mask instead. + */ + if (unlikely(gfp & __GFP_THISNODE) && + unlikely(!node_isset(nd, policy->v.nodes))) + nd = first_node(policy->v.nodes); + break; case MPOL_INTERLEAVE: /* should not happen */ - case MPOL_DEFAULT: - nd = numa_node_id(); break; default: - nd = 0; BUG(); } - return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp); + return node_zonelist(nd, gfp); } /* Do dynamic interleaving for a process */ @@ -1196,36 +1362,51 @@ static unsigned interleave_nodes(struct mempolicy *policy) next = next_node(nid, policy->v.nodes); if (next >= MAX_NUMNODES) next = first_node(policy->v.nodes); - me->il_next = next; + if (next < MAX_NUMNODES) + me->il_next = next; return nid; } /* * Depending on the memory policy provide a node from which to allocate the * next slab entry. + * @policy must be protected by freeing by the caller. If @policy is + * the current task's mempolicy, this protection is implicit, as only the + * task can change it's policy. The system default policy requires no + * such protection. */ unsigned slab_node(struct mempolicy *policy) { - int pol = policy ? policy->policy : MPOL_DEFAULT; + if (!policy || policy->flags & MPOL_F_LOCAL) + return numa_node_id(); + + switch (policy->mode) { + case MPOL_PREFERRED: + /* + * handled MPOL_F_LOCAL above + */ + return policy->v.preferred_node; - switch (pol) { case MPOL_INTERLEAVE: return interleave_nodes(policy); - case MPOL_BIND: + case MPOL_BIND: { /* * Follow bind policy behavior and start allocation at the * first node. */ - return zone_to_nid(policy->v.zonelist->zones[0]); - - case MPOL_PREFERRED: - if (policy->v.preferred_node >= 0) - return policy->v.preferred_node; - /* Fall through */ + struct zonelist *zonelist; + struct zone *zone; + enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); + zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0]; + (void)first_zones_zonelist(zonelist, highest_zoneidx, + &policy->v.nodes, + &zone); + return zone->node; + } default: - return numa_node_id(); + BUG(); } } @@ -1234,10 +1415,13 @@ static unsigned offset_il_node(struct mempolicy *pol, struct vm_area_struct *vma, unsigned long off) { unsigned nnodes = nodes_weight(pol->v.nodes); - unsigned target = (unsigned)off % nnodes; + unsigned target; int c; int nid = -1; + if (!nnodes) + return numa_node_id(); + target = (unsigned int)off % nnodes; c = 0; do { nid = next_node(nid, pol->v.nodes); @@ -1274,40 +1458,30 @@ static inline unsigned interleave_nid(struct mempolicy *pol, * @vma = virtual memory area whose policy is sought * @addr = address in @vma for shared policy lookup and interleave policy * @gfp_flags = for requested zone - * @mpol = pointer to mempolicy pointer for reference counted 'BIND policy + * @mpol = pointer to mempolicy pointer for reference counted mempolicy + * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask * - * Returns a zonelist suitable for a huge page allocation. - * If the effective policy is 'BIND, returns pointer to policy's zonelist. - * If it is also a policy for which get_vma_policy() returns an extra - * reference, we must hold that reference until after allocation. - * In that case, return policy via @mpol so hugetlb allocation can drop - * the reference. For non-'BIND referenced policies, we can/do drop the - * reference here, so the caller doesn't need to know about the special case - * for default and current task policy. + * Returns a zonelist suitable for a huge page allocation and a pointer + * to the struct mempolicy for conditional unref after allocation. + * If the effective policy is 'BIND, returns a pointer to the mempolicy's + * @nodemask for filtering the zonelist. */ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, - gfp_t gfp_flags, struct mempolicy **mpol) + gfp_t gfp_flags, struct mempolicy **mpol, + nodemask_t **nodemask) { - struct mempolicy *pol = get_vma_policy(current, vma, addr); struct zonelist *zl; - *mpol = NULL; /* probably no unref needed */ - if (pol->policy == MPOL_INTERLEAVE) { - unsigned nid; - - nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT); - if (unlikely(pol != &default_policy && - pol != current->mempolicy)) - __mpol_free(pol); /* finished with pol */ - return NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_flags); - } + *mpol = get_vma_policy(current, vma, addr); + *nodemask = NULL; /* assume !MPOL_BIND */ - zl = zonelist_policy(GFP_HIGHUSER, pol); - if (unlikely(pol != &default_policy && pol != current->mempolicy)) { - if (pol->policy != MPOL_BIND) - __mpol_free(pol); /* finished with pol */ - else - *mpol = pol; /* unref needed after allocation */ + if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { + zl = node_zonelist(interleave_nid(*mpol, vma, addr, + HPAGE_SHIFT), gfp_flags); + } else { + zl = policy_zonelist(gfp_flags, *mpol); + if ((*mpol)->mode == MPOL_BIND) + *nodemask = &(*mpol)->v.nodes; } return zl; } @@ -1321,9 +1495,9 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, struct zonelist *zl; struct page *page; - zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp); + zl = node_zonelist(nid, gfp); page = __alloc_pages(gfp, order, zl); - if (page && page_zone(page) == zl->zones[0]) + if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0])) inc_zone_page_state(page, NUMA_INTERLEAVE_HIT); return page; } @@ -1358,28 +1532,27 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) cpuset_update_task_memory_state(); - if (unlikely(pol->policy == MPOL_INTERLEAVE)) { + if (unlikely(pol->mode == MPOL_INTERLEAVE)) { unsigned nid; nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); - if (unlikely(pol != &default_policy && - pol != current->mempolicy)) - __mpol_free(pol); /* finished with pol */ + mpol_cond_put(pol); return alloc_page_interleave(gfp, 0, nid); } - zl = zonelist_policy(gfp, pol); - if (pol != &default_policy && pol != current->mempolicy) { + zl = policy_zonelist(gfp, pol); + if (unlikely(mpol_needs_cond_ref(pol))) { /* - * slow path: ref counted policy -- shared or vma + * slow path: ref counted shared policy */ - struct page *page = __alloc_pages(gfp, 0, zl); - __mpol_free(pol); + struct page *page = __alloc_pages_nodemask(gfp, 0, + zl, policy_nodemask(gfp, pol)); + __mpol_put(pol); return page; } /* * fast path: default or task policy */ - return __alloc_pages(gfp, 0, zl); + return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); } /** @@ -1409,22 +1582,28 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) cpuset_update_task_memory_state(); if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) pol = &default_policy; - if (pol->policy == MPOL_INTERLEAVE) + + /* + * No reference counting needed for current->mempolicy + * nor system default_policy + */ + if (pol->mode == MPOL_INTERLEAVE) return alloc_page_interleave(gfp, order, interleave_nodes(pol)); - return __alloc_pages(gfp, order, zonelist_policy(gfp, pol)); + return __alloc_pages_nodemask(gfp, order, + policy_zonelist(gfp, pol), policy_nodemask(gfp, pol)); } EXPORT_SYMBOL(alloc_pages_current); /* - * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it + * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it * rebinds the mempolicy its copying by calling mpol_rebind_policy() * with the mems_allowed returned by cpuset_mems_allowed(). This * keeps mempolicies cpuset relative after its cpuset moves. See * further kernel/cpuset.c update_nodemask(). */ -/* Slow path of a mempolicy copy */ -struct mempolicy *__mpol_copy(struct mempolicy *old) +/* Slow path of a mempolicy duplicate */ +struct mempolicy *__mpol_dup(struct mempolicy *old) { struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL); @@ -1436,55 +1615,64 @@ struct mempolicy *__mpol_copy(struct mempolicy *old) } *new = *old; atomic_set(&new->refcnt, 1); - if (new->policy == MPOL_BIND) { - int sz = ksize(old->v.zonelist); - new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL); - if (!new->v.zonelist) { - kmem_cache_free(policy_cache, new); - return ERR_PTR(-ENOMEM); - } - } return new; } +/* + * If *frompol needs [has] an extra ref, copy *frompol to *tompol , + * eliminate the * MPOL_F_* flags that require conditional ref and + * [NOTE!!!] drop the extra ref. Not safe to reference *frompol directly + * after return. Use the returned value. + * + * Allows use of a mempolicy for, e.g., multiple allocations with a single + * policy lookup, even if the policy needs/has extra ref on lookup. + * shmem_readahead needs this. + */ +struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol, + struct mempolicy *frompol) +{ + if (!mpol_needs_cond_ref(frompol)) + return frompol; + + *tompol = *frompol; + tompol->flags &= ~MPOL_F_SHARED; /* copy doesn't need unref */ + __mpol_put(frompol); + return tompol; +} + +static int mpol_match_intent(const struct mempolicy *a, + const struct mempolicy *b) +{ + if (a->flags != b->flags) + return 0; + if (!mpol_store_user_nodemask(a)) + return 1; + return nodes_equal(a->w.user_nodemask, b->w.user_nodemask); +} + /* Slow path of a mempolicy comparison */ int __mpol_equal(struct mempolicy *a, struct mempolicy *b) { if (!a || !b) return 0; - if (a->policy != b->policy) + if (a->mode != b->mode) return 0; - switch (a->policy) { - case MPOL_DEFAULT: - return 1; + if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b)) + return 0; + switch (a->mode) { + case MPOL_BIND: + /* Fall through */ case MPOL_INTERLEAVE: return nodes_equal(a->v.nodes, b->v.nodes); case MPOL_PREFERRED: - return a->v.preferred_node == b->v.preferred_node; - case MPOL_BIND: { - int i; - for (i = 0; a->v.zonelist->zones[i]; i++) - if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i]) - return 0; - return b->v.zonelist->zones[i] == NULL; - } + return a->v.preferred_node == b->v.preferred_node && + a->flags == b->flags; default: BUG(); return 0; } } -/* Slow path of a mpol destructor. */ -void __mpol_free(struct mempolicy *p) -{ - if (!atomic_dec_and_test(&p->refcnt)) - return; - if (p->policy == MPOL_BIND) - kfree(p->v.zonelist); - p->policy = MPOL_DEFAULT; - kmem_cache_free(policy_cache, p); -} - /* * Shared memory backing store policy support. * @@ -1547,7 +1735,7 @@ static void sp_insert(struct shared_policy *sp, struct sp_node *new) rb_link_node(&new->nd, parent, p); rb_insert_color(&new->nd, &sp->root); pr_debug("inserting %lx-%lx: %d\n", new->start, new->end, - new->policy ? new->policy->policy : 0); + new->policy ? new->policy->mode : 0); } /* Find shared policy intersecting idx */ @@ -1573,7 +1761,7 @@ static void sp_delete(struct shared_policy *sp, struct sp_node *n) { pr_debug("deleting %lx-l%lx\n", n->start, n->end); rb_erase(&n->nd, &sp->root); - mpol_free(n->policy); + mpol_put(n->policy); kmem_cache_free(sn_cache, n); } @@ -1587,6 +1775,7 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end, n->start = start; n->end = end; mpol_get(pol); + pol->flags |= MPOL_F_SHARED; /* for unref */ n->policy = pol; return n; } @@ -1633,33 +1822,41 @@ restart: sp_insert(sp, new); spin_unlock(&sp->lock); if (new2) { - mpol_free(new2->policy); + mpol_put(new2->policy); kmem_cache_free(sn_cache, new2); } return 0; } -void mpol_shared_policy_init(struct shared_policy *info, int policy, - nodemask_t *policy_nodes) -{ - info->root = RB_ROOT; - spin_lock_init(&info->lock); - - if (policy != MPOL_DEFAULT) { - struct mempolicy *newpol; - - /* Falls back to MPOL_DEFAULT on any error */ - newpol = mpol_new(policy, policy_nodes); - if (!IS_ERR(newpol)) { - /* Create pseudo-vma that contains just the policy */ - struct vm_area_struct pvma; - - memset(&pvma, 0, sizeof(struct vm_area_struct)); - /* Policy covers entire file */ - pvma.vm_end = TASK_SIZE; - mpol_set_shared_policy(info, &pvma, newpol); - mpol_free(newpol); - } +/** + * mpol_shared_policy_init - initialize shared policy for inode + * @sp: pointer to inode shared policy + * @mpol: struct mempolicy to install + * + * Install non-NULL @mpol in inode's shared policy rb-tree. + * On entry, the current task has a reference on a non-NULL @mpol. + * This must be released on exit. + */ +void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) +{ + sp->root = RB_ROOT; /* empty tree == default mempolicy */ + spin_lock_init(&sp->lock); + + if (mpol) { + struct vm_area_struct pvma; + struct mempolicy *new; + + /* contextualize the tmpfs mount point mempolicy */ + new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); + mpol_put(mpol); /* drop our ref on sb mpol */ + if (IS_ERR(new)) + return; /* no valid nodemask intersection */ + + /* Create pseudo-vma that contains just the policy */ + memset(&pvma, 0, sizeof(struct vm_area_struct)); + pvma.vm_end = TASK_SIZE; /* policy covers entire file */ + mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ + mpol_put(new); /* drop initial ref */ } } @@ -1670,9 +1867,10 @@ int mpol_set_shared_policy(struct shared_policy *info, struct sp_node *new = NULL; unsigned long sz = vma_pages(vma); - pr_debug("set_shared_policy %lx sz %lu %d %lx\n", + pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n", vma->vm_pgoff, - sz, npol? npol->policy : -1, + sz, npol ? npol->mode : -1, + npol ? npol->flags : -1, npol ? nodes_addr(npol->v.nodes)[0] : -1); if (npol) { @@ -1700,7 +1898,7 @@ void mpol_free_shared_policy(struct shared_policy *p) n = rb_entry(next, struct sp_node, nd); next = rb_next(&n->nd); rb_erase(&n->nd, &p->root); - mpol_free(n->policy); + mpol_put(n->policy); kmem_cache_free(sn_cache, n); } spin_unlock(&p->lock); @@ -1745,120 +1943,177 @@ void __init numa_policy_init(void) if (unlikely(nodes_empty(interleave_nodes))) node_set(prefer, interleave_nodes); - if (do_set_mempolicy(MPOL_INTERLEAVE, &interleave_nodes)) + if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) printk("numa_policy_init: interleaving failed\n"); } /* Reset policy of current process to default */ void numa_default_policy(void) { - do_set_mempolicy(MPOL_DEFAULT, NULL); + do_set_mempolicy(MPOL_DEFAULT, 0, NULL); } -/* Migrate a policy to a different set of nodes */ -static void mpol_rebind_policy(struct mempolicy *pol, - const nodemask_t *newmask) -{ - nodemask_t *mpolmask; - nodemask_t tmp; +/* + * Parse and format mempolicy from/to strings + */ - if (!pol) - return; - mpolmask = &pol->cpuset_mems_allowed; - if (nodes_equal(*mpolmask, *newmask)) - return; +/* + * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag + * Used only for mpol_parse_str() and mpol_to_str() + */ +#define MPOL_LOCAL (MPOL_INTERLEAVE + 1) +static const char * const policy_types[] = + { "default", "prefer", "bind", "interleave", "local" }; - switch (pol->policy) { - case MPOL_DEFAULT: - break; - case MPOL_INTERLEAVE: - nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask); - pol->v.nodes = tmp; - *mpolmask = *newmask; - current->il_next = node_remap(current->il_next, - *mpolmask, *newmask); - break; - case MPOL_PREFERRED: - pol->v.preferred_node = node_remap(pol->v.preferred_node, - *mpolmask, *newmask); - *mpolmask = *newmask; - break; - case MPOL_BIND: { - nodemask_t nodes; - struct zone **z; - struct zonelist *zonelist; +#ifdef CONFIG_TMPFS +/** + * mpol_parse_str - parse string to mempolicy + * @str: string containing mempolicy to parse + * @mpol: pointer to struct mempolicy pointer, returned on success. + * @no_context: flag whether to "contextualize" the mempolicy + * + * Format of input: + * <mode>[=<flags>][:<nodelist>] + * + * if @no_context is true, save the input nodemask in w.user_nodemask in + * the returned mempolicy. This will be used to "clone" the mempolicy in + * a specific context [cpuset] at a later time. Used to parse tmpfs mpol + * mount option. Note that if 'static' or 'relative' mode flags were + * specified, the input nodemask will already have been saved. Saving + * it again is redundant, but safe. + * + * On success, returns 0, else 1 + */ +int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) +{ + struct mempolicy *new = NULL; + unsigned short uninitialized_var(mode); + unsigned short uninitialized_var(mode_flags); + nodemask_t nodes; + char *nodelist = strchr(str, ':'); + char *flags = strchr(str, '='); + int i; + int err = 1; + + if (nodelist) { + /* NUL-terminate mode or flags string */ + *nodelist++ = '\0'; + if (nodelist_parse(nodelist, nodes)) + goto out; + if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY])) + goto out; + } else nodes_clear(nodes); - for (z = pol->v.zonelist->zones; *z; z++) - node_set(zone_to_nid(*z), nodes); - nodes_remap(tmp, nodes, *mpolmask, *newmask); - nodes = tmp; - zonelist = bind_zonelist(&nodes); + if (flags) + *flags++ = '\0'; /* terminate mode string */ - /* If no mem, then zonelist is NULL and we keep old zonelist. - * If that old zonelist has no remaining mems_allowed nodes, - * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT. - */ + for (i = 0; i <= MPOL_LOCAL; i++) { + if (!strcmp(str, policy_types[i])) { + mode = i; + break; + } + } + if (i > MPOL_LOCAL) + goto out; - if (!IS_ERR(zonelist)) { - /* Good - got mem - substitute new zonelist */ - kfree(pol->v.zonelist); - pol->v.zonelist = zonelist; + switch (mode) { + case MPOL_PREFERRED: + /* + * Insist on a nodelist of one node only + */ + if (nodelist) { + char *rest = nodelist; + while (isdigit(*rest)) + rest++; + if (!*rest) + err = 0; } - *mpolmask = *newmask; break; - } - default: - BUG(); + case MPOL_INTERLEAVE: + /* + * Default to online nodes with memory if no nodelist + */ + if (!nodelist) + nodes = node_states[N_HIGH_MEMORY]; + err = 0; + break; + case MPOL_LOCAL: + /* + * Don't allow a nodelist; mpol_new() checks flags + */ + if (nodelist) + goto out; + mode = MPOL_PREFERRED; break; - } -} - -/* - * Wrapper for mpol_rebind_policy() that just requires task - * pointer, and updates task mempolicy. - */ -void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) -{ - mpol_rebind_policy(tsk->mempolicy, new); -} + /* + * case MPOL_BIND: mpol_new() enforces non-empty nodemask. + * case MPOL_DEFAULT: mpol_new() enforces empty nodemask, ignores flags. + */ + } -/* - * Rebind each vma in mm to new nodemask. - * - * Call holding a reference to mm. Takes mm->mmap_sem during call. - */ + mode_flags = 0; + if (flags) { + /* + * Currently, we only support two mutually exclusive + * mode flags. + */ + if (!strcmp(flags, "static")) + mode_flags |= MPOL_F_STATIC_NODES; + else if (!strcmp(flags, "relative")) + mode_flags |= MPOL_F_RELATIVE_NODES; + else + err = 1; + } -void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) -{ - struct vm_area_struct *vma; + new = mpol_new(mode, mode_flags, &nodes); + if (IS_ERR(new)) + err = 1; + else if (no_context) + new->w.user_nodemask = nodes; /* save for contextualization */ - down_write(&mm->mmap_sem); - for (vma = mm->mmap; vma; vma = vma->vm_next) - mpol_rebind_policy(vma->vm_policy, new); - up_write(&mm->mmap_sem); +out: + /* Restore string for error message */ + if (nodelist) + *--nodelist = ':'; + if (flags) + *--flags = '='; + if (!err) + *mpol = new; + return err; } +#endif /* CONFIG_TMPFS */ -/* - * Display pages allocated per node and memory policy via /proc. - */ - -static const char * const policy_types[] = - { "default", "prefer", "bind", "interleave" }; - -/* +/** + * mpol_to_str - format a mempolicy structure for printing + * @buffer: to contain formatted mempolicy string + * @maxlen: length of @buffer + * @pol: pointer to mempolicy to be formatted + * @no_context: "context free" mempolicy - use nodemask in w.user_nodemask + * * Convert a mempolicy into a string. * Returns the number of characters in buffer (if positive) * or an error (negative) */ -static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) +int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context) { char *p = buffer; int l; nodemask_t nodes; - int mode = pol ? pol->policy : MPOL_DEFAULT; + unsigned short mode; + unsigned short flags = pol ? pol->flags : 0; + + /* + * Sanity check: room for longest mode, flag and some nodes + */ + VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16); + + if (!pol || pol == &default_policy) + mode = MPOL_DEFAULT; + else + mode = pol->mode; switch (mode) { case MPOL_DEFAULT: @@ -1867,33 +2122,50 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) case MPOL_PREFERRED: nodes_clear(nodes); - node_set(pol->v.preferred_node, nodes); + if (flags & MPOL_F_LOCAL) + mode = MPOL_LOCAL; /* pseudo-policy */ + else + node_set(pol->v.preferred_node, nodes); break; case MPOL_BIND: - get_zonemask(pol, &nodes); - break; - + /* Fall through */ case MPOL_INTERLEAVE: - nodes = pol->v.nodes; + if (no_context) + nodes = pol->w.user_nodemask; + else + nodes = pol->v.nodes; break; default: BUG(); - return -EFAULT; } l = strlen(policy_types[mode]); - if (buffer + maxlen < p + l + 1) - return -ENOSPC; + if (buffer + maxlen < p + l + 1) + return -ENOSPC; strcpy(p, policy_types[mode]); p += l; - if (!nodes_empty(nodes)) { + if (flags & MPOL_MODE_FLAGS) { if (buffer + maxlen < p + 2) return -ENOSPC; *p++ = '='; + + /* + * Currently, the only defined flags are mutually exclusive + */ + if (flags & MPOL_F_STATIC_NODES) + p += snprintf(p, buffer + maxlen - p, "static"); + else if (flags & MPOL_F_RELATIVE_NODES) + p += snprintf(p, buffer + maxlen - p, "relative"); + } + + if (!nodes_empty(nodes)) { + if (buffer + maxlen < p + 2) + return -ENOSPC; + *p++ = ':'; p += nodelist_scnprintf(p, buffer + maxlen - p, nodes); } return p - buffer; @@ -1971,6 +2243,9 @@ static inline void check_huge_range(struct vm_area_struct *vma, } #endif +/* + * Display pages allocated per node and memory policy via /proc. + */ int show_numa_map(struct seq_file *m, void *v) { struct proc_maps_private *priv = m->private; @@ -1990,12 +2265,8 @@ int show_numa_map(struct seq_file *m, void *v) return 0; pol = get_vma_policy(priv->task, vma, vma->vm_start); - mpol_to_str(buffer, sizeof(buffer), pol); - /* - * unref shared or other task's mempolicy - */ - if (pol != &default_policy && pol != current->mempolicy) - __mpol_free(pol); + mpol_to_str(buffer, sizeof(buffer), pol, 0); + mpol_cond_put(pol); seq_printf(m, "%08lx %s", vma->vm_start, buffer); diff --git a/mm/migrate.c b/mm/migrate.c index 4e0eccca5e2..449d77d409f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -383,7 +383,14 @@ static void migrate_page_copy(struct page *newpage, struct page *page) if (PageDirty(page)) { clear_page_dirty_for_io(page); - set_page_dirty(newpage); + /* + * Want to mark the page and the radix tree as dirty, and + * redo the accounting that clear_page_dirty_for_io undid, + * but we can't use set_page_dirty because that function + * is actually a signal that all of the page has become dirty. + * Wheras only part of our page may be dirty. + */ + __set_page_dirty_nobuffers(newpage); } #ifdef CONFIG_SWAP diff --git a/mm/mincore.c b/mm/mincore.c index 5efe0ded69b..5178800bc12 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -33,7 +33,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) * When tmpfs swaps out a page from a file, any process mapping that * file will not get a swp_entry_t in its pte, but rather it is like * any other file mapping (ie. marked !present and faulted in with - * tmpfs's .nopage). So swapped out tmpfs mappings are tested here. + * tmpfs's .fault). So swapped out tmpfs mappings are tested here. * * However when tmpfs moves the page from pagecache and into swapcache, * it is still in core, but the find_get_page below won't find it. diff --git a/mm/mmap.c b/mm/mmap.c index a32d28ce31c..fac66337da2 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -230,9 +230,12 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) might_sleep(); if (vma->vm_ops && vma->vm_ops->close) vma->vm_ops->close(vma); - if (vma->vm_file) + if (vma->vm_file) { fput(vma->vm_file); - mpol_free(vma_policy(vma)); + if (vma->vm_flags & VM_EXECUTABLE) + removed_exe_file_vma(vma->vm_mm); + } + mpol_put(vma_policy(vma)); kmem_cache_free(vm_area_cachep, vma); return next; } @@ -623,10 +626,13 @@ again: remove_next = 1 + (end > next->vm_end); spin_unlock(&mapping->i_mmap_lock); if (remove_next) { - if (file) + if (file) { fput(file); + if (next->vm_flags & VM_EXECUTABLE) + removed_exe_file_vma(mm); + } mm->map_count--; - mpol_free(vma_policy(next)); + mpol_put(vma_policy(next)); kmem_cache_free(vm_area_cachep, next); /* * In mprotect's case 6 (see comments on vma_merge), @@ -1068,7 +1074,6 @@ int vma_wants_writenotify(struct vm_area_struct *vma) mapping_cap_account_dirty(vma->vm_file->f_mapping); } - unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, unsigned long flags, unsigned int vm_flags, unsigned long pgoff, @@ -1155,6 +1160,8 @@ munmap_back: error = file->f_op->mmap(file, vma); if (error) goto unmap_and_free_vma; + if (vm_flags & VM_EXECUTABLE) + added_exe_file_vma(mm); } else if (vm_flags & VM_SHARED) { error = shmem_zero_setup(vma); if (error) @@ -1181,22 +1188,22 @@ munmap_back: if (vma_wants_writenotify(vma)) vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); - if (!file || !vma_merge(mm, prev, addr, vma->vm_end, + if (file && vma_merge(mm, prev, addr, vma->vm_end, vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) { - file = vma->vm_file; - vma_link(mm, vma, prev, rb_link, rb_parent); - if (correct_wcount) - atomic_inc(&inode->i_writecount); - } else { - if (file) { - if (correct_wcount) - atomic_inc(&inode->i_writecount); - fput(file); - } - mpol_free(vma_policy(vma)); + mpol_put(vma_policy(vma)); kmem_cache_free(vm_area_cachep, vma); + fput(file); + if (vm_flags & VM_EXECUTABLE) + removed_exe_file_vma(mm); + } else { + vma_link(mm, vma, prev, rb_link, rb_parent); + file = vma->vm_file; } -out: + + /* Once vma denies write, undo our temporary denial count */ + if (correct_wcount) + atomic_inc(&inode->i_writecount); +out: mm->total_vm += len >> PAGE_SHIFT; vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); if (vm_flags & VM_LOCKED) { @@ -1813,15 +1820,18 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); } - pol = mpol_copy(vma_policy(vma)); + pol = mpol_dup(vma_policy(vma)); if (IS_ERR(pol)) { kmem_cache_free(vm_area_cachep, new); return PTR_ERR(pol); } vma_set_policy(new, pol); - if (new->vm_file) + if (new->vm_file) { get_file(new->vm_file); + if (vma->vm_flags & VM_EXECUTABLE) + added_exe_file_vma(mm); + } if (new->vm_ops && new->vm_ops->open) new->vm_ops->open(new); @@ -2129,7 +2139,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (new_vma) { *new_vma = *vma; - pol = mpol_copy(vma_policy(vma)); + pol = mpol_dup(vma_policy(vma)); if (IS_ERR(pol)) { kmem_cache_free(vm_area_cachep, new_vma); return NULL; @@ -2138,8 +2148,11 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, new_vma->vm_start = addr; new_vma->vm_end = addr + len; new_vma->vm_pgoff = pgoff; - if (new_vma->vm_file) + if (new_vma->vm_file) { get_file(new_vma->vm_file); + if (vma->vm_flags & VM_EXECUTABLE) + added_exe_file_vma(mm); + } if (new_vma->vm_ops && new_vma->vm_ops->open) new_vma->vm_ops->open(new_vma); vma_link(mm, new_vma, prev, rb_link, rb_parent); diff --git a/mm/mmzone.c b/mm/mmzone.c index eb5838634f1..486ed595ee6 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -42,3 +42,33 @@ struct zone *next_zone(struct zone *zone) return zone; } +static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes) +{ +#ifdef CONFIG_NUMA + return node_isset(zonelist_node_idx(zref), *nodes); +#else + return 1; +#endif /* CONFIG_NUMA */ +} + +/* Returns the next zone at or below highest_zoneidx in a zonelist */ +struct zoneref *next_zones_zonelist(struct zoneref *z, + enum zone_type highest_zoneidx, + nodemask_t *nodes, + struct zone **zone) +{ + /* + * Find the next suitable zone to use for the allocation. + * Only filter based on nodemask if it's set + */ + if (likely(nodes == NULL)) + while (zonelist_zone_idx(z) > highest_zoneidx) + z++; + else + while (zonelist_zone_idx(z) > highest_zoneidx || + (z->zone && !zref_in_nodemask(z, nodes))) + z++; + + *zone = zonelist_zone(z++); + return z; +} diff --git a/mm/nommu.c b/mm/nommu.c index 5d8ae086f74..ef8c62cec69 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -105,7 +105,11 @@ unsigned int kobjsize(const void *objp) { struct page *page; - if (!objp || !((page = virt_to_page(objp)))) + /* + * If the object we have should not have ksize performed on it, + * return size of 0 + */ + if (!objp || (unsigned long)objp >= memory_end || !((page = virt_to_page(objp)))) return 0; if (PageSlab(page)) @@ -962,8 +966,13 @@ unsigned long do_mmap_pgoff(struct file *file, INIT_LIST_HEAD(&vma->anon_vma_node); atomic_set(&vma->vm_usage, 1); - if (file) + if (file) { get_file(file); + if (vm_flags & VM_EXECUTABLE) { + added_exe_file_vma(current->mm); + vma->vm_mm = current->mm; + } + } vma->vm_file = file; vma->vm_flags = vm_flags; vma->vm_start = addr; @@ -1018,8 +1027,11 @@ unsigned long do_mmap_pgoff(struct file *file, up_write(&nommu_vma_sem); kfree(vml); if (vma) { - if (vma->vm_file) + if (vma->vm_file) { fput(vma->vm_file); + if (vma->vm_flags & VM_EXECUTABLE) + removed_exe_file_vma(vma->vm_mm); + } kfree(vma); } return ret; @@ -1049,7 +1061,7 @@ EXPORT_SYMBOL(do_mmap_pgoff); /* * handle mapping disposal for uClinux */ -static void put_vma(struct vm_area_struct *vma) +static void put_vma(struct mm_struct *mm, struct vm_area_struct *vma) { if (vma) { down_write(&nommu_vma_sem); @@ -1071,8 +1083,11 @@ static void put_vma(struct vm_area_struct *vma) realalloc -= kobjsize(vma); askedalloc -= sizeof(*vma); - if (vma->vm_file) + if (vma->vm_file) { fput(vma->vm_file); + if (vma->vm_flags & VM_EXECUTABLE) + removed_exe_file_vma(mm); + } kfree(vma); } @@ -1109,7 +1124,7 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) found: vml = *parent; - put_vma(vml->vma); + put_vma(mm, vml->vma); *parent = vml->next; realalloc -= kobjsize(vml); @@ -1154,7 +1169,7 @@ void exit_mmap(struct mm_struct * mm) while ((tmp = mm->context.vmlist)) { mm->context.vmlist = tmp->next; - put_vma(tmp->vma); + put_vma(mm, tmp->vma); realalloc -= kobjsize(tmp); askedalloc -= sizeof(*tmp); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index beb592fe938..8a5467ee626 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -53,8 +53,7 @@ static DEFINE_SPINLOCK(zone_scan_mutex); * of least surprise ... (be careful when you change it) */ -unsigned long badness(struct task_struct *p, unsigned long uptime, - struct mem_cgroup *mem) +unsigned long badness(struct task_struct *p, unsigned long uptime) { unsigned long points, cpu_time, run_time, s; struct mm_struct *mm; @@ -175,12 +174,14 @@ static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask) { #ifdef CONFIG_NUMA - struct zone **z; + struct zone *zone; + struct zoneref *z; + enum zone_type high_zoneidx = gfp_zone(gfp_mask); nodemask_t nodes = node_states[N_HIGH_MEMORY]; - for (z = zonelist->zones; *z; z++) - if (cpuset_zone_allowed_softwall(*z, gfp_mask)) - node_clear(zone_to_nid(*z), nodes); + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) + if (cpuset_zone_allowed_softwall(zone, gfp_mask)) + node_clear(zone_to_nid(zone), nodes); else return CONSTRAINT_CPUSET; @@ -254,7 +255,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, if (p->oomkilladj == OOM_DISABLE) continue; - points = badness(p, uptime.tv_sec, mem); + points = badness(p, uptime.tv_sec); if (points > *ppoints || !chosen) { chosen = p; *ppoints = points; @@ -460,29 +461,29 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier); * if a parallel OOM killing is already taking place that includes a zone in * the zonelist. Otherwise, locks all zones in the zonelist and returns 1. */ -int try_set_zone_oom(struct zonelist *zonelist) +int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask) { - struct zone **z; + struct zoneref *z; + struct zone *zone; int ret = 1; - z = zonelist->zones; - spin_lock(&zone_scan_mutex); - do { - if (zone_is_oom_locked(*z)) { + for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { + if (zone_is_oom_locked(zone)) { ret = 0; goto out; } - } while (*(++z) != NULL); + } + + for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { + /* + * Lock each zone in the zonelist under zone_scan_mutex so a + * parallel invocation of try_set_zone_oom() doesn't succeed + * when it shouldn't. + */ + zone_set_flag(zone, ZONE_OOM_LOCKED); + } - /* - * Lock each zone in the zonelist under zone_scan_mutex so a parallel - * invocation of try_set_zone_oom() doesn't succeed when it shouldn't. - */ - z = zonelist->zones; - do { - zone_set_flag(*z, ZONE_OOM_LOCKED); - } while (*(++z) != NULL); out: spin_unlock(&zone_scan_mutex); return ret; @@ -493,16 +494,15 @@ out: * allocation attempts with zonelists containing them may now recall the OOM * killer, if necessary. */ -void clear_zonelist_oom(struct zonelist *zonelist) +void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) { - struct zone **z; - - z = zonelist->zones; + struct zoneref *z; + struct zone *zone; spin_lock(&zone_scan_mutex); - do { - zone_clear_flag(*z, ZONE_OOM_LOCKED); - } while (*(++z) != NULL); + for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { + zone_clear_flag(zone, ZONE_OOM_LOCKED); + } spin_unlock(&zone_scan_mutex); } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 5e00f1772c2..789b6adbef3 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -164,9 +164,20 @@ int dirty_ratio_handler(struct ctl_table *table, int write, */ static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) { - __prop_inc_percpu(&vm_completions, &bdi->completions); + __prop_inc_percpu_max(&vm_completions, &bdi->completions, + bdi->max_prop_frac); } +void bdi_writeout_inc(struct backing_dev_info *bdi) +{ + unsigned long flags; + + local_irq_save(flags); + __bdi_writeout_inc(bdi); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(bdi_writeout_inc); + static inline void task_dirty_inc(struct task_struct *tsk) { prop_inc_single(&vm_dirties, &tsk->dirties); @@ -200,7 +211,8 @@ clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty) avail_dirty = dirty - (global_page_state(NR_FILE_DIRTY) + global_page_state(NR_WRITEBACK) + - global_page_state(NR_UNSTABLE_NFS)); + global_page_state(NR_UNSTABLE_NFS) + + global_page_state(NR_WRITEBACK_TEMP)); if (avail_dirty < 0) avail_dirty = 0; @@ -243,6 +255,55 @@ static void task_dirty_limit(struct task_struct *tsk, long *pdirty) } /* + * + */ +static DEFINE_SPINLOCK(bdi_lock); +static unsigned int bdi_min_ratio; + +int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) +{ + int ret = 0; + unsigned long flags; + + spin_lock_irqsave(&bdi_lock, flags); + if (min_ratio > bdi->max_ratio) { + ret = -EINVAL; + } else { + min_ratio -= bdi->min_ratio; + if (bdi_min_ratio + min_ratio < 100) { + bdi_min_ratio += min_ratio; + bdi->min_ratio += min_ratio; + } else { + ret = -EINVAL; + } + } + spin_unlock_irqrestore(&bdi_lock, flags); + + return ret; +} + +int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) +{ + unsigned long flags; + int ret = 0; + + if (max_ratio > 100) + return -EINVAL; + + spin_lock_irqsave(&bdi_lock, flags); + if (bdi->min_ratio > max_ratio) { + ret = -EINVAL; + } else { + bdi->max_ratio = max_ratio; + bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; + } + spin_unlock_irqrestore(&bdi_lock, flags); + + return ret; +} +EXPORT_SYMBOL(bdi_set_max_ratio); + +/* * Work out the current dirty-memory clamping and background writeout * thresholds. * @@ -300,7 +361,7 @@ static unsigned long determine_dirtyable_memory(void) return x + 1; /* Ensure that we never return 0 */ } -static void +void get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty, struct backing_dev_info *bdi) { @@ -330,7 +391,7 @@ get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty, *pdirty = dirty; if (bdi) { - u64 bdi_dirty = dirty; + u64 bdi_dirty; long numerator, denominator; /* @@ -338,8 +399,12 @@ get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty, */ bdi_writeout_fraction(bdi, &numerator, &denominator); + bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100; bdi_dirty *= numerator; do_div(bdi_dirty, denominator); + bdi_dirty += (dirty * bdi->min_ratio) / 100; + if (bdi_dirty > (dirty * bdi->max_ratio) / 100) + bdi_dirty = dirty * bdi->max_ratio / 100; *pbdi_dirty = bdi_dirty; clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty); @@ -1192,7 +1257,7 @@ int test_clear_page_writeback(struct page *page) radix_tree_tag_clear(&mapping->page_tree, page_index(page), PAGECACHE_TAG_WRITEBACK); - if (bdi_cap_writeback_dirty(bdi)) { + if (bdi_cap_account_writeback(bdi)) { __dec_bdi_stat(bdi, BDI_WRITEBACK); __bdi_writeout_inc(bdi); } @@ -1221,7 +1286,7 @@ int test_set_page_writeback(struct page *page) radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_WRITEBACK); - if (bdi_cap_writeback_dirty(bdi)) + if (bdi_cap_account_writeback(bdi)) __inc_bdi_stat(bdi, BDI_WRITEBACK); } if (!PageDirty(page)) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 32e796af12a..bdd5c432c42 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -45,6 +45,7 @@ #include <linux/fault-inject.h> #include <linux/page-isolation.h> #include <linux/memcontrol.h> +#include <linux/debugobjects.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -532,8 +533,11 @@ static void __free_pages_ok(struct page *page, unsigned int order) if (reserved) return; - if (!PageHighMem(page)) + if (!PageHighMem(page)) { debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order); + debug_check_no_obj_freed(page_address(page), + PAGE_SIZE << order); + } arch_free_page(page, order); kernel_map_pages(page, 1 << order, 0); @@ -546,7 +550,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) /* * permit the bootmem allocator to evade page validation on high-order frees */ -void __init __free_pages_bootmem(struct page *page, unsigned int order) +void __free_pages_bootmem(struct page *page, unsigned int order) { if (order == 0) { __ClearPageReserved(page); @@ -632,7 +636,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) if (PageReserved(page)) return 1; - page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_readahead | + page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim | 1 << PG_referenced | 1 << PG_arch_1 | 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk); set_page_private(page, 0); @@ -995,8 +999,10 @@ static void free_hot_cold_page(struct page *page, int cold) if (free_pages_check(page)) return; - if (!PageHighMem(page)) + if (!PageHighMem(page)) { debug_check_no_locks_freed(page_address(page), PAGE_SIZE); + debug_check_no_obj_freed(page_address(page), PAGE_SIZE); + } arch_free_page(page, 0); kernel_map_pages(page, 1, 0); @@ -1050,7 +1056,7 @@ void split_page(struct page *page, unsigned int order) * we cheat by calling it from here, in the order > 0 path. Saves a branch * or two. */ -static struct page *buffered_rmqueue(struct zonelist *zonelist, +static struct page *buffered_rmqueue(struct zone *preferred_zone, struct zone *zone, int order, gfp_t gfp_flags) { unsigned long flags; @@ -1102,7 +1108,7 @@ again: } __count_zone_vm_events(PGALLOC, zone, 1 << order); - zone_statistics(zonelist, zone); + zone_statistics(preferred_zone, zone); local_irq_restore(flags); put_cpu(); @@ -1284,7 +1290,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) if (!zlc) return NULL; - if (time_after(jiffies, zlc->last_full_zap + HZ)) { + if (time_after(jiffies, zlc->last_full_zap + HZ)) { bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); zlc->last_full_zap = jiffies; } @@ -1317,7 +1323,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) * We are low on memory in the second scan, and should leave no stone * unturned looking for a free page. */ -static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, +static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, nodemask_t *allowednodes) { struct zonelist_cache *zlc; /* cached zonelist speedup info */ @@ -1328,7 +1334,7 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, if (!zlc) return 1; - i = z - zonelist->zones; + i = z - zonelist->_zonerefs; n = zlc->z_to_n[i]; /* This zone is worth trying if it is allowed but not full */ @@ -1340,7 +1346,7 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, * zlc->fullzones, so that subsequent attempts to allocate a page * from that zone don't waste time re-examining it. */ -static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) +static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) { struct zonelist_cache *zlc; /* cached zonelist speedup info */ int i; /* index of *z in zonelist zones */ @@ -1349,7 +1355,7 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) if (!zlc) return; - i = z - zonelist->zones; + i = z - zonelist->_zonerefs; set_bit(i, zlc->fullzones); } @@ -1361,13 +1367,13 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) return NULL; } -static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, +static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, nodemask_t *allowednodes) { return 1; } -static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) +static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) { } #endif /* CONFIG_NUMA */ @@ -1377,42 +1383,31 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) * a page. */ static struct page * -get_page_from_freelist(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, int alloc_flags) +get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, + struct zonelist *zonelist, int high_zoneidx, int alloc_flags) { - struct zone **z; + struct zoneref *z; struct page *page = NULL; - int classzone_idx = zone_idx(zonelist->zones[0]); - struct zone *zone; + int classzone_idx; + struct zone *zone, *preferred_zone; nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ int zlc_active = 0; /* set if using zonelist_cache */ int did_zlc_setup = 0; /* just call zlc_setup() one time */ - enum zone_type highest_zoneidx = -1; /* Gets set for policy zonelists */ + + (void)first_zones_zonelist(zonelist, high_zoneidx, nodemask, + &preferred_zone); + classzone_idx = zone_idx(preferred_zone); zonelist_scan: /* * Scan zonelist, looking for a zone with enough free. * See also cpuset_zone_allowed() comment in kernel/cpuset.c. */ - z = zonelist->zones; - - do { - /* - * In NUMA, this could be a policy zonelist which contains - * zones that may not be allowed by the current gfp_mask. - * Check the zone is allowed by the current flags - */ - if (unlikely(alloc_should_filter_zonelist(zonelist))) { - if (highest_zoneidx == -1) - highest_zoneidx = gfp_zone(gfp_mask); - if (zone_idx(*z) > highest_zoneidx) - continue; - } - + for_each_zone_zonelist_nodemask(zone, z, zonelist, + high_zoneidx, nodemask) { if (NUMA_BUILD && zlc_active && !zlc_zone_worth_trying(zonelist, z, allowednodes)) continue; - zone = *z; if ((alloc_flags & ALLOC_CPUSET) && !cpuset_zone_allowed_softwall(zone, gfp_mask)) goto try_next_zone; @@ -1433,7 +1428,7 @@ zonelist_scan: } } - page = buffered_rmqueue(zonelist, zone, order, gfp_mask); + page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask); if (page) break; this_zone_full: @@ -1446,7 +1441,7 @@ try_next_zone: zlc_active = 1; did_zlc_setup = 1; } - } while (*(++z) != NULL); + } if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { /* Disable zlc cache for second zonelist scan */ @@ -1459,18 +1454,21 @@ try_next_zone: /* * This is the 'heart' of the zoned buddy allocator. */ -struct page * -__alloc_pages(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist) +static struct page * +__alloc_pages_internal(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, nodemask_t *nodemask) { const gfp_t wait = gfp_mask & __GFP_WAIT; - struct zone **z; + enum zone_type high_zoneidx = gfp_zone(gfp_mask); + struct zoneref *z; + struct zone *zone; struct page *page; struct reclaim_state reclaim_state; struct task_struct *p = current; int do_retry; int alloc_flags; - int did_some_progress; + unsigned long did_some_progress; + unsigned long pages_reclaimed = 0; might_sleep_if(wait); @@ -1478,9 +1476,9 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order, return NULL; restart: - z = zonelist->zones; /* the list of zones suitable for gfp_mask */ + z = zonelist->_zonerefs; /* the list of zones suitable for gfp_mask */ - if (unlikely(*z == NULL)) { + if (unlikely(!z->zone)) { /* * Happens if we have an empty zonelist as a result of * GFP_THISNODE being used on a memoryless node @@ -1488,8 +1486,8 @@ restart: return NULL; } - page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, - zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET); + page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, + zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); if (page) goto got_pg; @@ -1504,8 +1502,8 @@ restart: if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) goto nopage; - for (z = zonelist->zones; *z; z++) - wakeup_kswapd(*z, order); + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) + wakeup_kswapd(zone, order); /* * OK, we're below the kswapd watermark and have kicked background @@ -1533,7 +1531,8 @@ restart: * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. * See also cpuset_zone_allowed() comment in kernel/cpuset.c. */ - page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags); + page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, + high_zoneidx, alloc_flags); if (page) goto got_pg; @@ -1545,8 +1544,8 @@ rebalance: if (!(gfp_mask & __GFP_NOMEMALLOC)) { nofail_alloc: /* go through the zonelist yet again, ignoring mins */ - page = get_page_from_freelist(gfp_mask, order, - zonelist, ALLOC_NO_WATERMARKS); + page = get_page_from_freelist(gfp_mask, nodemask, order, + zonelist, high_zoneidx, ALLOC_NO_WATERMARKS); if (page) goto got_pg; if (gfp_mask & __GFP_NOFAIL) { @@ -1569,7 +1568,7 @@ nofail_alloc: reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - did_some_progress = try_to_free_pages(zonelist->zones, order, gfp_mask); + did_some_progress = try_to_free_pages(zonelist, order, gfp_mask); p->reclaim_state = NULL; p->flags &= ~PF_MEMALLOC; @@ -1580,12 +1579,12 @@ nofail_alloc: drain_all_pages(); if (likely(did_some_progress)) { - page = get_page_from_freelist(gfp_mask, order, - zonelist, alloc_flags); + page = get_page_from_freelist(gfp_mask, nodemask, order, + zonelist, high_zoneidx, alloc_flags); if (page) goto got_pg; } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { - if (!try_set_zone_oom(zonelist)) { + if (!try_set_zone_oom(zonelist, gfp_mask)) { schedule_timeout_uninterruptible(1); goto restart; } @@ -1596,21 +1595,22 @@ nofail_alloc: * a parallel oom killing, we must fail if we're still * under heavy pressure. */ - page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, - zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET); + page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, + order, zonelist, high_zoneidx, + ALLOC_WMARK_HIGH|ALLOC_CPUSET); if (page) { - clear_zonelist_oom(zonelist); + clear_zonelist_oom(zonelist, gfp_mask); goto got_pg; } /* The OOM killer will not help higher order allocs so fail */ if (order > PAGE_ALLOC_COSTLY_ORDER) { - clear_zonelist_oom(zonelist); + clear_zonelist_oom(zonelist, gfp_mask); goto nopage; } out_of_memory(zonelist, gfp_mask, order); - clear_zonelist_oom(zonelist); + clear_zonelist_oom(zonelist, gfp_mask); goto restart; } @@ -1618,14 +1618,26 @@ nofail_alloc: * Don't let big-order allocations loop unless the caller explicitly * requests that. Wait for some write requests to complete then retry. * - * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order - * <= 3, but that may not be true in other implementations. + * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER + * means __GFP_NOFAIL, but that may not be true in other + * implementations. + * + * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is + * specified, then we retry until we no longer reclaim any pages + * (above), or we've reclaimed an order of pages at least as + * large as the allocation's order. In both cases, if the + * allocation still fails, we stop retrying. */ + pages_reclaimed += did_some_progress; do_retry = 0; if (!(gfp_mask & __GFP_NORETRY)) { - if ((order <= PAGE_ALLOC_COSTLY_ORDER) || - (gfp_mask & __GFP_REPEAT)) + if (order <= PAGE_ALLOC_COSTLY_ORDER) { do_retry = 1; + } else { + if (gfp_mask & __GFP_REPEAT && + pages_reclaimed < (1 << order)) + do_retry = 1; + } if (gfp_mask & __GFP_NOFAIL) do_retry = 1; } @@ -1646,6 +1658,20 @@ got_pg: return page; } +struct page * +__alloc_pages(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist) +{ + return __alloc_pages_internal(gfp_mask, order, zonelist, NULL); +} + +struct page * +__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, nodemask_t *nodemask) +{ + return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask); +} + EXPORT_SYMBOL(__alloc_pages); /* @@ -1712,15 +1738,15 @@ EXPORT_SYMBOL(free_pages); static unsigned int nr_free_zone_pages(int offset) { + struct zoneref *z; + struct zone *zone; + /* Just pick one node, since fallback list is circular */ - pg_data_t *pgdat = NODE_DATA(numa_node_id()); unsigned int sum = 0; - struct zonelist *zonelist = pgdat->node_zonelists + offset; - struct zone **zonep = zonelist->zones; - struct zone *zone; + struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); - for (zone = *zonep++; zone; zone = *zonep++) { + for_each_zone_zonelist(zone, z, zonelist, offset) { unsigned long size = zone->present_pages; unsigned long high = zone->pages_high; if (size > high) @@ -1889,6 +1915,12 @@ void show_free_areas(void) show_swap_cache_info(); } +static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) +{ + zoneref->zone = zone; + zoneref->zone_idx = zone_idx(zone); +} + /* * Builds allocation fallback zone lists. * @@ -1906,7 +1938,8 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, zone_type--; zone = pgdat->node_zones + zone_type; if (populated_zone(zone)) { - zonelist->zones[nr_zones++] = zone; + zoneref_set_zone(zone, + &zonelist->_zonerefs[nr_zones++]); check_highest_zone(zone_type); } @@ -2078,17 +2111,16 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) */ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) { - enum zone_type i; int j; struct zonelist *zonelist; - for (i = 0; i < MAX_NR_ZONES; i++) { - zonelist = pgdat->node_zonelists + i; - for (j = 0; zonelist->zones[j] != NULL; j++) - ; - j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); - zonelist->zones[j] = NULL; - } + zonelist = &pgdat->node_zonelists[0]; + for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) + ; + j = build_zonelists_node(NODE_DATA(node), zonelist, j, + MAX_NR_ZONES - 1); + zonelist->_zonerefs[j].zone = NULL; + zonelist->_zonerefs[j].zone_idx = 0; } /* @@ -2096,15 +2128,13 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) */ static void build_thisnode_zonelists(pg_data_t *pgdat) { - enum zone_type i; int j; struct zonelist *zonelist; - for (i = 0; i < MAX_NR_ZONES; i++) { - zonelist = pgdat->node_zonelists + MAX_NR_ZONES + i; - j = build_zonelists_node(pgdat, zonelist, 0, i); - zonelist->zones[j] = NULL; - } + zonelist = &pgdat->node_zonelists[1]; + j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); + zonelist->_zonerefs[j].zone = NULL; + zonelist->_zonerefs[j].zone_idx = 0; } /* @@ -2117,27 +2147,26 @@ static int node_order[MAX_NUMNODES]; static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) { - enum zone_type i; int pos, j, node; int zone_type; /* needs to be signed */ struct zone *z; struct zonelist *zonelist; - for (i = 0; i < MAX_NR_ZONES; i++) { - zonelist = pgdat->node_zonelists + i; - pos = 0; - for (zone_type = i; zone_type >= 0; zone_type--) { - for (j = 0; j < nr_nodes; j++) { - node = node_order[j]; - z = &NODE_DATA(node)->node_zones[zone_type]; - if (populated_zone(z)) { - zonelist->zones[pos++] = z; - check_highest_zone(zone_type); - } + zonelist = &pgdat->node_zonelists[0]; + pos = 0; + for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { + for (j = 0; j < nr_nodes; j++) { + node = node_order[j]; + z = &NODE_DATA(node)->node_zones[zone_type]; + if (populated_zone(z)) { + zoneref_set_zone(z, + &zonelist->_zonerefs[pos++]); + check_highest_zone(zone_type); } } - zonelist->zones[pos] = NULL; } + zonelist->_zonerefs[pos].zone = NULL; + zonelist->_zonerefs[pos].zone_idx = 0; } static int default_zonelist_order(void) @@ -2214,7 +2243,8 @@ static void build_zonelists(pg_data_t *pgdat) /* initialize zonelists */ for (i = 0; i < MAX_ZONELISTS; i++) { zonelist = pgdat->node_zonelists + i; - zonelist->zones[0] = NULL; + zonelist->_zonerefs[0].zone = NULL; + zonelist->_zonerefs[0].zone_idx = 0; } /* NUMA-aware ordering of nodes */ @@ -2264,19 +2294,15 @@ static void build_zonelists(pg_data_t *pgdat) /* Construct the zonelist performance cache - see further mmzone.h */ static void build_zonelist_cache(pg_data_t *pgdat) { - int i; - - for (i = 0; i < MAX_NR_ZONES; i++) { - struct zonelist *zonelist; - struct zonelist_cache *zlc; - struct zone **z; + struct zonelist *zonelist; + struct zonelist_cache *zlc; + struct zoneref *z; - zonelist = pgdat->node_zonelists + i; - zonelist->zlcache_ptr = zlc = &zonelist->zlcache; - bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); - for (z = zonelist->zones; *z; z++) - zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z); - } + zonelist = &pgdat->node_zonelists[0]; + zonelist->zlcache_ptr = zlc = &zonelist->zlcache; + bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); + for (z = zonelist->_zonerefs; z->zone; z++) + zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); } @@ -2290,45 +2316,44 @@ static void set_zonelist_order(void) static void build_zonelists(pg_data_t *pgdat) { int node, local_node; - enum zone_type i,j; + enum zone_type j; + struct zonelist *zonelist; local_node = pgdat->node_id; - for (i = 0; i < MAX_NR_ZONES; i++) { - struct zonelist *zonelist; - - zonelist = pgdat->node_zonelists + i; - j = build_zonelists_node(pgdat, zonelist, 0, i); - /* - * Now we build the zonelist so that it contains the zones - * of all the other nodes. - * We don't want to pressure a particular node, so when - * building the zones for node N, we make sure that the - * zones coming right after the local ones are those from - * node N+1 (modulo N) - */ - for (node = local_node + 1; node < MAX_NUMNODES; node++) { - if (!node_online(node)) - continue; - j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); - } - for (node = 0; node < local_node; node++) { - if (!node_online(node)) - continue; - j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); - } + zonelist = &pgdat->node_zonelists[0]; + j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); - zonelist->zones[j] = NULL; + /* + * Now we build the zonelist so that it contains the zones + * of all the other nodes. + * We don't want to pressure a particular node, so when + * building the zones for node N, we make sure that the + * zones coming right after the local ones are those from + * node N+1 (modulo N) + */ + for (node = local_node + 1; node < MAX_NUMNODES; node++) { + if (!node_online(node)) + continue; + j = build_zonelists_node(NODE_DATA(node), zonelist, j, + MAX_NR_ZONES - 1); } + for (node = 0; node < local_node; node++) { + if (!node_online(node)) + continue; + j = build_zonelists_node(NODE_DATA(node), zonelist, j, + MAX_NR_ZONES - 1); + } + + zonelist->_zonerefs[j].zone = NULL; + zonelist->_zonerefs[j].zone_idx = 0; } /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ static void build_zonelist_cache(pg_data_t *pgdat) { - int i; - - for (i = 0; i < MAX_NR_ZONES; i++) - pgdat->node_zonelists[i].zlcache_ptr = NULL; + pgdat->node_zonelists[0].zlcache_ptr = NULL; + pgdat->node_zonelists[1].zlcache_ptr = NULL; } #endif /* CONFIG_NUMA */ @@ -2518,7 +2543,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, struct page *page; unsigned long end_pfn = start_pfn + size; unsigned long pfn; + struct zone *z; + z = &NODE_DATA(nid)->node_zones[zone]; for (pfn = start_pfn; pfn < end_pfn; pfn++) { /* * There can be holes in boot-time mem_map[]s @@ -2536,7 +2563,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, init_page_count(page); reset_page_mapcount(page); SetPageReserved(page); - /* * Mark the block movable so that blocks are reserved for * movable at startup. This will force kernel allocations @@ -2545,8 +2571,15 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, * kernel allocations are made. Later some blocks near * the start are marked MIGRATE_RESERVE by * setup_zone_migrate_reserve() + * + * bitmap is created for zone's valid pfn range. but memmap + * can be created for invalid pages (for alignment) + * check here not to call set_pageblock_migratetype() against + * pfn out of zone. */ - if ((pfn & (pageblock_nr_pages-1))) + if ((z->zone_start_pfn <= pfn) + && (pfn < z->zone_start_pfn + z->spanned_pages) + && !(pfn & (pageblock_nr_pages - 1))) set_pageblock_migratetype(page, MIGRATE_MOVABLE); INIT_LIST_HEAD(&page->lru); @@ -4339,9 +4372,7 @@ void *__init alloc_large_system_hash(const char *tablename, else if (hashdist) table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); else { - unsigned long order; - for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++) - ; + unsigned long order = get_order(size); table = (void*) __get_free_pages(GFP_ATOMIC, order); /* * If bucketsize is not a power-of-two, we may free @@ -4460,6 +4491,8 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, pfn = page_to_pfn(page); bitmap = get_pageblock_bitmap(zone, pfn); bitidx = pfn_to_bitidx(zone, pfn); + VM_BUG_ON(pfn < zone->zone_start_pfn); + VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages); for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) if (flags & value) diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 1cf1417ef8b..0afd2387e50 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -9,11 +9,15 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, int err = 0; pte = pte_offset_map(pmd, addr); - do { + for (;;) { err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, private); if (err) break; - } while (pte++, addr += PAGE_SIZE, addr != end); + addr += PAGE_SIZE; + if (addr == end) + break; + pte++; + } pte_unmap(pte); return err; diff --git a/mm/readahead.c b/mm/readahead.c index 8762e898897..d8723a5f649 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -235,7 +235,13 @@ unsigned long max_sane_readahead(unsigned long nr) static int __init readahead_init(void) { - return bdi_init(&default_backing_dev_info); + int err; + + err = bdi_init(&default_backing_dev_info); + if (!err) + bdi_register(&default_backing_dev_info, NULL, "default"); + + return err; } subsys_initcall(readahead_init); diff --git a/mm/rmap.c b/mm/rmap.c index e9bb6b1093f..bf0a5b7cfb8 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -662,7 +662,6 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma) printk (KERN_EMERG " page->mapping = %p\n", page->mapping); print_symbol (KERN_EMERG " vma->vm_ops = %s\n", (unsigned long)vma->vm_ops); if (vma->vm_ops) { - print_symbol (KERN_EMERG " vma->vm_ops->nopage = %s\n", (unsigned long)vma->vm_ops->nopage); print_symbol (KERN_EMERG " vma->vm_ops->fault = %s\n", (unsigned long)vma->vm_ops->fault); } if (vma->vm_file && vma->vm_file->f_op) diff --git a/mm/shmem.c b/mm/shmem.c index f514dd392cd..e2a6ae1a44e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -201,7 +201,7 @@ static struct vm_operations_struct shmem_vm_ops; static struct backing_dev_info shmem_backing_dev_info __read_mostly = { .ra_pages = 0, /* No readahead */ - .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, .unplug_io_fn = default_unplug_io_fn, }; @@ -1079,104 +1079,47 @@ redirty: #ifdef CONFIG_NUMA #ifdef CONFIG_TMPFS -static int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes) +static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) { - char *nodelist = strchr(value, ':'); - int err = 1; + char buffer[64]; - if (nodelist) { - /* NUL-terminate policy string */ - *nodelist++ = '\0'; - if (nodelist_parse(nodelist, *policy_nodes)) - goto out; - if (!nodes_subset(*policy_nodes, node_states[N_HIGH_MEMORY])) - goto out; - } - if (!strcmp(value, "default")) { - *policy = MPOL_DEFAULT; - /* Don't allow a nodelist */ - if (!nodelist) - err = 0; - } else if (!strcmp(value, "prefer")) { - *policy = MPOL_PREFERRED; - /* Insist on a nodelist of one node only */ - if (nodelist) { - char *rest = nodelist; - while (isdigit(*rest)) - rest++; - if (!*rest) - err = 0; - } - } else if (!strcmp(value, "bind")) { - *policy = MPOL_BIND; - /* Insist on a nodelist */ - if (nodelist) - err = 0; - } else if (!strcmp(value, "interleave")) { - *policy = MPOL_INTERLEAVE; - /* - * Default to online nodes with memory if no nodelist - */ - if (!nodelist) - *policy_nodes = node_states[N_HIGH_MEMORY]; - err = 0; - } -out: - /* Restore string for error message */ - if (nodelist) - *--nodelist = ':'; - return err; -} - -static void shmem_show_mpol(struct seq_file *seq, int policy, - const nodemask_t policy_nodes) -{ - char *policy_string; + if (!mpol || mpol->mode == MPOL_DEFAULT) + return; /* show nothing */ - switch (policy) { - case MPOL_PREFERRED: - policy_string = "prefer"; - break; - case MPOL_BIND: - policy_string = "bind"; - break; - case MPOL_INTERLEAVE: - policy_string = "interleave"; - break; - default: - /* MPOL_DEFAULT */ - return; - } + mpol_to_str(buffer, sizeof(buffer), mpol, 1); - seq_printf(seq, ",mpol=%s", policy_string); - - if (policy != MPOL_INTERLEAVE || - !nodes_equal(policy_nodes, node_states[N_HIGH_MEMORY])) { - char buffer[64]; - int len; + seq_printf(seq, ",mpol=%s", buffer); +} - len = nodelist_scnprintf(buffer, sizeof(buffer), policy_nodes); - if (len < sizeof(buffer)) - seq_printf(seq, ":%s", buffer); - else - seq_printf(seq, ":?"); +static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) +{ + struct mempolicy *mpol = NULL; + if (sbinfo->mpol) { + spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */ + mpol = sbinfo->mpol; + mpol_get(mpol); + spin_unlock(&sbinfo->stat_lock); } + return mpol; } #endif /* CONFIG_TMPFS */ static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, struct shmem_inode_info *info, unsigned long idx) { + struct mempolicy mpol, *spol; struct vm_area_struct pvma; struct page *page; + spol = mpol_cond_copy(&mpol, + mpol_shared_policy_lookup(&info->policy, idx)); + /* Create a pseudo vma that just contains the policy */ pvma.vm_start = 0; pvma.vm_pgoff = idx; pvma.vm_ops = NULL; - pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); + pvma.vm_policy = spol; page = swapin_readahead(entry, gfp, &pvma, 0); - mpol_free(pvma.vm_policy); return page; } @@ -1184,27 +1127,21 @@ static struct page *shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info, unsigned long idx) { struct vm_area_struct pvma; - struct page *page; /* Create a pseudo vma that just contains the policy */ pvma.vm_start = 0; pvma.vm_pgoff = idx; pvma.vm_ops = NULL; pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); - page = alloc_page_vma(gfp, &pvma, 0); - mpol_free(pvma.vm_policy); - return page; + + /* + * alloc_page_vma() will drop the shared policy reference + */ + return alloc_page_vma(gfp, &pvma, 0); } #else /* !CONFIG_NUMA */ #ifdef CONFIG_TMPFS -static inline int shmem_parse_mpol(char *value, int *policy, - nodemask_t *policy_nodes) -{ - return 1; -} - -static inline void shmem_show_mpol(struct seq_file *seq, int policy, - const nodemask_t policy_nodes) +static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *p) { } #endif /* CONFIG_TMPFS */ @@ -1222,6 +1159,13 @@ static inline struct page *shmem_alloc_page(gfp_t gfp, } #endif /* CONFIG_NUMA */ +#if !defined(CONFIG_NUMA) || !defined(CONFIG_TMPFS) +static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) +{ + return NULL; +} +#endif + /* * shmem_getpage - either get the page from swap or allocate a new one * @@ -1576,8 +1520,8 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) case S_IFREG: inode->i_op = &shmem_inode_operations; inode->i_fop = &shmem_file_operations; - mpol_shared_policy_init(&info->policy, sbinfo->policy, - &sbinfo->policy_nodes); + mpol_shared_policy_init(&info->policy, + shmem_get_sbmpol(sbinfo)); break; case S_IFDIR: inc_nlink(inode); @@ -1591,8 +1535,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) * Must not load anything in the rbtree, * mpol_free_shared_policy will not be called. */ - mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, - NULL); + mpol_shared_policy_init(&info->policy, NULL); break; } } else @@ -2207,8 +2150,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, if (*rest) goto bad_val; } else if (!strcmp(this_char,"mpol")) { - if (shmem_parse_mpol(value, &sbinfo->policy, - &sbinfo->policy_nodes)) + if (mpol_parse_str(value, &sbinfo->mpol, 1)) goto bad_val; } else { printk(KERN_ERR "tmpfs: Bad mount option %s\n", @@ -2259,8 +2201,9 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) sbinfo->free_blocks = config.max_blocks - blocks; sbinfo->max_inodes = config.max_inodes; sbinfo->free_inodes = config.max_inodes - inodes; - sbinfo->policy = config.policy; - sbinfo->policy_nodes = config.policy_nodes; + + mpol_put(sbinfo->mpol); + sbinfo->mpol = config.mpol; /* transfers initial ref */ out: spin_unlock(&sbinfo->stat_lock); return error; @@ -2281,7 +2224,7 @@ static int shmem_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_printf(seq, ",uid=%u", sbinfo->uid); if (sbinfo->gid != 0) seq_printf(seq, ",gid=%u", sbinfo->gid); - shmem_show_mpol(seq, sbinfo->policy, sbinfo->policy_nodes); + shmem_show_mpol(seq, sbinfo->mpol); return 0; } #endif /* CONFIG_TMPFS */ @@ -2311,8 +2254,7 @@ static int shmem_fill_super(struct super_block *sb, sbinfo->mode = S_IRWXUGO | S_ISVTX; sbinfo->uid = current->fsuid; sbinfo->gid = current->fsgid; - sbinfo->policy = MPOL_DEFAULT; - sbinfo->policy_nodes = node_states[N_HIGH_MEMORY]; + sbinfo->mpol = NULL; sb->s_fs_info = sbinfo; #ifdef CONFIG_TMPFS diff --git a/mm/slab.c b/mm/slab.c index 03927cb5ec9..06236e4ddc1 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -110,6 +110,7 @@ #include <linux/fault-inject.h> #include <linux/rtmutex.h> #include <linux/reciprocal_div.h> +#include <linux/debugobjects.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> @@ -139,10 +140,6 @@ #define BYTES_PER_WORD sizeof(void *) #define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long)) -#ifndef cache_line_size -#define cache_line_size() L1_CACHE_BYTES -#endif - #ifndef ARCH_KMALLOC_MINALIGN /* * Enforce a minimum alignment for the kmalloc caches. @@ -178,12 +175,14 @@ SLAB_CACHE_DMA | \ SLAB_STORE_USER | \ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ - SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD) + SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ + SLAB_DEBUG_OBJECTS) #else # define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ SLAB_CACHE_DMA | \ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ - SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD) + SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ + SLAB_DEBUG_OBJECTS) #endif /* @@ -862,7 +861,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, *left_over = slab_size - nr_objs*buffer_size - mgmt_size; } -#define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg) +#define slab_error(cachep, msg) __slab_error(__func__, cachep, msg) static void __slab_error(const char *function, struct kmem_cache *cachep, char *msg) @@ -2157,7 +2156,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, */ if (!name || in_interrupt() || (size < BYTES_PER_WORD) || size > KMALLOC_MAX_SIZE) { - printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__, + printk(KERN_ERR "%s: Early error in slab %s\n", __func__, name); BUG(); } @@ -3242,15 +3241,16 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) { struct zonelist *zonelist; gfp_t local_flags; - struct zone **z; + struct zoneref *z; + struct zone *zone; + enum zone_type high_zoneidx = gfp_zone(flags); void *obj = NULL; int nid; if (flags & __GFP_THISNODE) return NULL; - zonelist = &NODE_DATA(slab_node(current->mempolicy)) - ->node_zonelists[gfp_zone(flags)]; + zonelist = node_zonelist(slab_node(current->mempolicy), flags); local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); retry: @@ -3258,10 +3258,10 @@ retry: * Look through allowed nodes for objects available * from existing per node queues. */ - for (z = zonelist->zones; *z && !obj; z++) { - nid = zone_to_nid(*z); + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + nid = zone_to_nid(zone); - if (cpuset_zone_allowed_hardwall(*z, flags) && + if (cpuset_zone_allowed_hardwall(zone, flags) && cache->nodelists[nid] && cache->nodelists[nid]->free_objects) obj = ____cache_alloc_node(cache, @@ -3763,6 +3763,8 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) local_irq_save(flags); debug_check_no_locks_freed(objp, obj_size(cachep)); + if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(objp, obj_size(cachep)); __cache_free(cachep, objp); local_irq_restore(flags); } @@ -3788,6 +3790,7 @@ void kfree(const void *objp) kfree_debugcheck(objp); c = virt_to_cache(objp); debug_check_no_locks_freed(objp, obj_size(c)); + debug_check_no_obj_freed(objp, obj_size(c)); __cache_free(c, (void *)objp); local_irq_restore(flags); } diff --git a/mm/slob.c b/mm/slob.c index e2c3c0ec546..6038cbadf79 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -533,7 +533,8 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, { struct kmem_cache *c; - c = slob_alloc(sizeof(struct kmem_cache), flags, 0, -1); + c = slob_alloc(sizeof(struct kmem_cache), + flags, ARCH_KMALLOC_MINALIGN, -1); if (c) { c->name = name; diff --git a/mm/slub.c b/mm/slub.c index 39592b5ce68..32b62623846 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -19,8 +19,10 @@ #include <linux/cpuset.h> #include <linux/mempolicy.h> #include <linux/ctype.h> +#include <linux/debugobjects.h> #include <linux/kallsyms.h> #include <linux/memory.h> +#include <linux/math64.h> /* * Lock order: @@ -149,25 +151,6 @@ static inline void ClearSlabDebug(struct page *page) /* Enable to test recovery from slab corruption on boot */ #undef SLUB_RESILIENCY_TEST -#if PAGE_SHIFT <= 12 - -/* - * Small page size. Make sure that we do not fragment memory - */ -#define DEFAULT_MAX_ORDER 1 -#define DEFAULT_MIN_OBJECTS 4 - -#else - -/* - * Large page machines are customarily able to handle larger - * page orders. - */ -#define DEFAULT_MAX_ORDER 2 -#define DEFAULT_MIN_OBJECTS 8 - -#endif - /* * Mininum number of partial slabs. These will be left on the partial * lists even if they are empty. kmem_cache_shrink may reclaim them. @@ -204,13 +187,6 @@ static inline void ClearSlabDebug(struct page *page) /* Internal SLUB flags */ #define __OBJECT_POISON 0x80000000 /* Poison object */ #define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */ -#define __KMALLOC_CACHE 0x20000000 /* objects freed using kfree */ -#define __PAGE_ALLOC_FALLBACK 0x10000000 /* Allow fallback to page alloc */ - -/* Not all arches define cache_line_size */ -#ifndef cache_line_size -#define cache_line_size() L1_CACHE_BYTES -#endif static int kmem_size = sizeof(struct kmem_cache); @@ -301,7 +277,7 @@ static inline int check_valid_pointer(struct kmem_cache *s, return 1; base = page_address(page); - if (object < base || object >= base + s->objects * s->size || + if (object < base || object >= base + page->objects * s->size || (object - base) % s->size) { return 0; } @@ -327,8 +303,8 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) } /* Loop over all objects in a slab */ -#define for_each_object(__p, __s, __addr) \ - for (__p = (__addr); __p < (__addr) + (__s)->objects * (__s)->size;\ +#define for_each_object(__p, __s, __addr, __objects) \ + for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ __p += (__s)->size) /* Scan freelist */ @@ -341,6 +317,26 @@ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) return (p - addr) / s->size; } +static inline struct kmem_cache_order_objects oo_make(int order, + unsigned long size) +{ + struct kmem_cache_order_objects x = { + (order << 16) + (PAGE_SIZE << order) / size + }; + + return x; +} + +static inline int oo_order(struct kmem_cache_order_objects x) +{ + return x.x >> 16; +} + +static inline int oo_objects(struct kmem_cache_order_objects x) +{ + return x.x & ((1 << 16) - 1); +} + #ifdef CONFIG_SLUB_DEBUG /* * Debug settings: @@ -451,8 +447,8 @@ static void print_tracking(struct kmem_cache *s, void *object) static void print_page_info(struct page *page) { - printk(KERN_ERR "INFO: Slab 0x%p used=%u fp=0x%p flags=0x%04lx\n", - page, page->inuse, page->freelist, page->flags); + printk(KERN_ERR "INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", + page, page->objects, page->inuse, page->freelist, page->flags); } @@ -652,6 +648,7 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) p + off, POISON_INUSE, s->size - off); } +/* Check the pad bytes at the end of a slab page */ static int slab_pad_check(struct kmem_cache *s, struct page *page) { u8 *start; @@ -664,20 +661,20 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) return 1; start = page_address(page); - end = start + (PAGE_SIZE << s->order); - length = s->objects * s->size; - remainder = end - (start + length); + length = (PAGE_SIZE << compound_order(page)); + end = start + length; + remainder = length % s->size; if (!remainder) return 1; - fault = check_bytes(start + length, POISON_INUSE, remainder); + fault = check_bytes(end - remainder, POISON_INUSE, remainder); if (!fault) return 1; while (end > fault && end[-1] == POISON_INUSE) end--; slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); - print_section("Padding", start, length); + print_section("Padding", end - remainder, remainder); restore_bytes(s, "slab padding", POISON_INUSE, start, end); return 0; @@ -739,15 +736,24 @@ static int check_object(struct kmem_cache *s, struct page *page, static int check_slab(struct kmem_cache *s, struct page *page) { + int maxobj; + VM_BUG_ON(!irqs_disabled()); if (!PageSlab(page)) { slab_err(s, page, "Not a valid slab page"); return 0; } - if (page->inuse > s->objects) { + + maxobj = (PAGE_SIZE << compound_order(page)) / s->size; + if (page->objects > maxobj) { + slab_err(s, page, "objects %u > max %u", + s->name, page->objects, maxobj); + return 0; + } + if (page->inuse > page->objects) { slab_err(s, page, "inuse %u > max %u", - s->name, page->inuse, s->objects); + s->name, page->inuse, page->objects); return 0; } /* Slab_pad_check fixes things up after itself */ @@ -764,8 +770,9 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) int nr = 0; void *fp = page->freelist; void *object = NULL; + unsigned long max_objects; - while (fp && nr <= s->objects) { + while (fp && nr <= page->objects) { if (fp == search) return 1; if (!check_valid_pointer(s, page, fp)) { @@ -777,7 +784,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) } else { slab_err(s, page, "Freepointer corrupt"); page->freelist = NULL; - page->inuse = s->objects; + page->inuse = page->objects; slab_fix(s, "Freelist cleared"); return 0; } @@ -788,10 +795,20 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) nr++; } - if (page->inuse != s->objects - nr) { + max_objects = (PAGE_SIZE << compound_order(page)) / s->size; + if (max_objects > 65535) + max_objects = 65535; + + if (page->objects != max_objects) { + slab_err(s, page, "Wrong number of objects. Found %d but " + "should be %d", page->objects, max_objects); + page->objects = max_objects; + slab_fix(s, "Number of objects adjusted."); + } + if (page->inuse != page->objects - nr) { slab_err(s, page, "Wrong object count. Counter is %d but " - "counted were %d", page->inuse, s->objects - nr); - page->inuse = s->objects - nr; + "counted were %d", page->inuse, page->objects - nr); + page->inuse = page->objects - nr; slab_fix(s, "Object count adjusted."); } return search == NULL; @@ -845,7 +862,7 @@ static inline unsigned long slabs_node(struct kmem_cache *s, int node) return atomic_long_read(&n->nr_slabs); } -static inline void inc_slabs_node(struct kmem_cache *s, int node) +static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) { struct kmem_cache_node *n = get_node(s, node); @@ -855,14 +872,17 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node) * dilemma by deferring the increment of the count during * bootstrap (see early_kmem_cache_node_alloc). */ - if (!NUMA_BUILD || n) + if (!NUMA_BUILD || n) { atomic_long_inc(&n->nr_slabs); + atomic_long_add(objects, &n->total_objects); + } } -static inline void dec_slabs_node(struct kmem_cache *s, int node) +static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) { struct kmem_cache_node *n = get_node(s, node); atomic_long_dec(&n->nr_slabs); + atomic_long_sub(objects, &n->total_objects); } /* Object debug checks for alloc/free paths */ @@ -910,7 +930,7 @@ bad: * as used avoids touching the remaining objects. */ slab_fix(s, "Marking all objects used"); - page->inuse = s->objects; + page->inuse = page->objects; page->freelist = NULL; } return 0; @@ -1060,31 +1080,52 @@ static inline unsigned long kmem_cache_flags(unsigned long objsize, static inline unsigned long slabs_node(struct kmem_cache *s, int node) { return 0; } -static inline void inc_slabs_node(struct kmem_cache *s, int node) {} -static inline void dec_slabs_node(struct kmem_cache *s, int node) {} +static inline void inc_slabs_node(struct kmem_cache *s, int node, + int objects) {} +static inline void dec_slabs_node(struct kmem_cache *s, int node, + int objects) {} #endif + /* * Slab allocation and freeing */ +static inline struct page *alloc_slab_page(gfp_t flags, int node, + struct kmem_cache_order_objects oo) +{ + int order = oo_order(oo); + + if (node == -1) + return alloc_pages(flags, order); + else + return alloc_pages_node(node, flags, order); +} + static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) { struct page *page; - int pages = 1 << s->order; + struct kmem_cache_order_objects oo = s->oo; flags |= s->allocflags; - if (node == -1) - page = alloc_pages(flags, s->order); - else - page = alloc_pages_node(node, flags, s->order); - - if (!page) - return NULL; + page = alloc_slab_page(flags | __GFP_NOWARN | __GFP_NORETRY, node, + oo); + if (unlikely(!page)) { + oo = s->min; + /* + * Allocation may have failed due to fragmentation. + * Try a lower order alloc if possible + */ + page = alloc_slab_page(flags, node, oo); + if (!page) + return NULL; + stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK); + } + page->objects = oo_objects(oo); mod_zone_page_state(page_zone(page), (s->flags & SLAB_RECLAIM_ACCOUNT) ? NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, - pages); + 1 << oo_order(oo)); return page; } @@ -1111,7 +1152,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) if (!page) goto out; - inc_slabs_node(s, page_to_nid(page)); + inc_slabs_node(s, page_to_nid(page), page->objects); page->slab = s; page->flags |= 1 << PG_slab; if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | @@ -1121,10 +1162,10 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) start = page_address(page); if (unlikely(s->flags & SLAB_POISON)) - memset(start, POISON_INUSE, PAGE_SIZE << s->order); + memset(start, POISON_INUSE, PAGE_SIZE << compound_order(page)); last = start; - for_each_object(p, s, start) { + for_each_object(p, s, start, page->objects) { setup_object(s, page, last); set_freepointer(s, last, p); last = p; @@ -1140,13 +1181,15 @@ out: static void __free_slab(struct kmem_cache *s, struct page *page) { - int pages = 1 << s->order; + int order = compound_order(page); + int pages = 1 << order; if (unlikely(SlabDebug(page))) { void *p; slab_pad_check(s, page); - for_each_object(p, s, page_address(page)) + for_each_object(p, s, page_address(page), + page->objects) check_object(s, page, p, 0); ClearSlabDebug(page); } @@ -1158,7 +1201,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __ClearPageSlab(page); reset_page_mapcount(page); - __free_pages(page, s->order); + __free_pages(page, order); } static void rcu_free_slab(struct rcu_head *h) @@ -1184,7 +1227,7 @@ static void free_slab(struct kmem_cache *s, struct page *page) static void discard_slab(struct kmem_cache *s, struct page *page) { - dec_slabs_node(s, page_to_nid(page)); + dec_slabs_node(s, page_to_nid(page), page->objects); free_slab(s, page); } @@ -1284,7 +1327,9 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) { #ifdef CONFIG_NUMA struct zonelist *zonelist; - struct zone **z; + struct zoneref *z; + struct zone *zone; + enum zone_type high_zoneidx = gfp_zone(flags); struct page *page; /* @@ -1309,14 +1354,13 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) get_cycles() % 1024 > s->remote_node_defrag_ratio) return NULL; - zonelist = &NODE_DATA( - slab_node(current->mempolicy))->node_zonelists[gfp_zone(flags)]; - for (z = zonelist->zones; *z; z++) { + zonelist = node_zonelist(slab_node(current->mempolicy), flags); + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { struct kmem_cache_node *n; - n = get_node(s, zone_to_nid(*z)); + n = get_node(s, zone_to_nid(zone)); - if (n && cpuset_zone_allowed_hardwall(*z, flags) && + if (n && cpuset_zone_allowed_hardwall(zone, flags) && n->nr_partial > MIN_PARTIAL) { page = get_partial_node(n); if (page) @@ -1519,7 +1563,7 @@ load_freelist: goto debug; c->freelist = object[c->offset]; - c->page->inuse = s->objects; + c->page->inuse = c->page->objects; c->page->freelist = NULL; c->node = page_to_nid(c->page); unlock_out: @@ -1556,27 +1600,6 @@ new_slab: c->page = new; goto load_freelist; } - - /* - * No memory available. - * - * If the slab uses higher order allocs but the object is - * smaller than a page size then we can fallback in emergencies - * to the page allocator via kmalloc_large. The page allocator may - * have failed to obtain a higher order page and we can try to - * allocate a single page if the object fits into a single page. - * That is only possible if certain conditions are met that are being - * checked when a slab is created. - */ - if (!(gfpflags & __GFP_NORETRY) && - (s->flags & __PAGE_ALLOC_FALLBACK)) { - if (gfpflags & __GFP_WAIT) - local_irq_enable(); - object = kmalloc_large(s->objsize, gfpflags); - if (gfpflags & __GFP_WAIT) - local_irq_disable(); - return object; - } return NULL; debug: if (!alloc_debug_processing(s, c->page, object, addr)) @@ -1726,6 +1749,8 @@ static __always_inline void slab_free(struct kmem_cache *s, local_irq_save(flags); c = get_cpu_slab(s, smp_processor_id()); debug_check_no_locks_freed(object, c->objsize); + if (!(s->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(object, s->objsize); if (likely(page == c->page && c->node >= 0)) { object[c->offset] = c->freelist; c->freelist = object; @@ -1777,8 +1802,8 @@ static struct page *get_object_page(const void *x) * take the list_lock. */ static int slub_min_order; -static int slub_max_order = DEFAULT_MAX_ORDER; -static int slub_min_objects = DEFAULT_MIN_OBJECTS; +static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; +static int slub_min_objects; /* * Merge control. If this is set then no merging of slab caches will occur. @@ -1793,7 +1818,7 @@ static int slub_nomerge; * system components. Generally order 0 allocations should be preferred since * order 0 does not cause fragmentation in the page allocator. Larger objects * be problematic to put into order 0 slabs because there may be too much - * unused space left. We go to a higher order if more than 1/8th of the slab + * unused space left. We go to a higher order if more than 1/16th of the slab * would be wasted. * * In order to reach satisfactory performance we must ensure that a minimum @@ -1818,6 +1843,9 @@ static inline int slab_order(int size, int min_objects, int rem; int min_order = slub_min_order; + if ((PAGE_SIZE << min_order) / size > 65535) + return get_order(size * 65535) - 1; + for (order = max(min_order, fls(min_objects * size - 1) - PAGE_SHIFT); order <= max_order; order++) { @@ -1852,8 +1880,10 @@ static inline int calculate_order(int size) * we reduce the minimum objects required in a slab. */ min_objects = slub_min_objects; + if (!min_objects) + min_objects = 4 * (fls(nr_cpu_ids) + 1); while (min_objects > 1) { - fraction = 8; + fraction = 16; while (fraction >= 4) { order = slab_order(size, min_objects, slub_max_order, fraction); @@ -2095,7 +2125,7 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags, init_tracking(kmalloc_caches, n); #endif init_kmem_cache_node(n); - inc_slabs_node(kmalloc_caches, node); + inc_slabs_node(kmalloc_caches, node, page->objects); /* * lockdep requires consistent irq usage for each lock @@ -2171,11 +2201,12 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) * calculate_sizes() determines the order and the distribution of data within * a slab object. */ -static int calculate_sizes(struct kmem_cache *s) +static int calculate_sizes(struct kmem_cache *s, int forced_order) { unsigned long flags = s->flags; unsigned long size = s->objsize; unsigned long align = s->align; + int order; /* * Round up object size to the next word boundary. We can only @@ -2259,26 +2290,16 @@ static int calculate_sizes(struct kmem_cache *s) */ size = ALIGN(size, align); s->size = size; + if (forced_order >= 0) + order = forced_order; + else + order = calculate_order(size); - if ((flags & __KMALLOC_CACHE) && - PAGE_SIZE / size < slub_min_objects) { - /* - * Kmalloc cache that would not have enough objects in - * an order 0 page. Kmalloc slabs can fallback to - * page allocator order 0 allocs so take a reasonably large - * order that will allows us a good number of objects. - */ - s->order = max(slub_max_order, PAGE_ALLOC_COSTLY_ORDER); - s->flags |= __PAGE_ALLOC_FALLBACK; - s->allocflags |= __GFP_NOWARN; - } else - s->order = calculate_order(size); - - if (s->order < 0) + if (order < 0) return 0; s->allocflags = 0; - if (s->order) + if (order) s->allocflags |= __GFP_COMP; if (s->flags & SLAB_CACHE_DMA) @@ -2290,9 +2311,12 @@ static int calculate_sizes(struct kmem_cache *s) /* * Determine the number of objects per slab */ - s->objects = (PAGE_SIZE << s->order) / size; + s->oo = oo_make(order, size); + s->min = oo_make(get_order(size), size); + if (oo_objects(s->oo) > oo_objects(s->max)) + s->max = s->oo; - return !!s->objects; + return !!oo_objects(s->oo); } @@ -2308,7 +2332,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, s->align = align; s->flags = kmem_cache_flags(size, flags, name, ctor); - if (!calculate_sizes(s)) + if (!calculate_sizes(s, -1)) goto error; s->refcount = 1; @@ -2325,7 +2349,7 @@ error: if (flags & SLAB_PANIC) panic("Cannot create slab %s size=%lu realsize=%u " "order=%u offset=%u flags=%lx\n", - s->name, (unsigned long)size, s->size, s->order, + s->name, (unsigned long)size, s->size, oo_order(s->oo), s->offset, flags); return 0; } @@ -2371,26 +2395,52 @@ const char *kmem_cache_name(struct kmem_cache *s) } EXPORT_SYMBOL(kmem_cache_name); +static void list_slab_objects(struct kmem_cache *s, struct page *page, + const char *text) +{ +#ifdef CONFIG_SLUB_DEBUG + void *addr = page_address(page); + void *p; + DECLARE_BITMAP(map, page->objects); + + bitmap_zero(map, page->objects); + slab_err(s, page, "%s", text); + slab_lock(page); + for_each_free_object(p, s, page->freelist) + set_bit(slab_index(p, s, addr), map); + + for_each_object(p, s, addr, page->objects) { + + if (!test_bit(slab_index(p, s, addr), map)) { + printk(KERN_ERR "INFO: Object 0x%p @offset=%tu\n", + p, p - addr); + print_tracking(s, p); + } + } + slab_unlock(page); +#endif +} + /* - * Attempt to free all slabs on a node. Return the number of slabs we - * were unable to free. + * Attempt to free all partial slabs on a node. */ -static int free_list(struct kmem_cache *s, struct kmem_cache_node *n, - struct list_head *list) +static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) { - int slabs_inuse = 0; unsigned long flags; struct page *page, *h; spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry_safe(page, h, list, lru) + list_for_each_entry_safe(page, h, &n->partial, lru) { if (!page->inuse) { list_del(&page->lru); discard_slab(s, page); - } else - slabs_inuse++; + n->nr_partial--; + } else { + list_slab_objects(s, page, + "Objects remaining on kmem_cache_close()"); + } + } spin_unlock_irqrestore(&n->list_lock, flags); - return slabs_inuse; } /* @@ -2407,8 +2457,8 @@ static inline int kmem_cache_close(struct kmem_cache *s) for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = get_node(s, node); - n->nr_partial -= free_list(s, n, &n->partial); - if (slabs_node(s, node)) + free_partial(s, n); + if (n->nr_partial || slabs_node(s, node)) return 1; } free_kmem_cache_nodes(s); @@ -2426,8 +2476,11 @@ void kmem_cache_destroy(struct kmem_cache *s) if (!s->refcount) { list_del(&s->list); up_write(&slub_lock); - if (kmem_cache_close(s)) - WARN_ON(1); + if (kmem_cache_close(s)) { + printk(KERN_ERR "SLUB %s: %s called for cache that " + "still has objects.\n", s->name, __func__); + dump_stack(); + } sysfs_slab_remove(s); } else up_write(&slub_lock); @@ -2486,7 +2539,7 @@ static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, down_write(&slub_lock); if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, - flags | __KMALLOC_CACHE, NULL)) + flags, NULL)) goto panic; list_add(&s->list, &slab_caches); @@ -2734,8 +2787,9 @@ int kmem_cache_shrink(struct kmem_cache *s) struct kmem_cache_node *n; struct page *page; struct page *t; + int objects = oo_objects(s->max); struct list_head *slabs_by_inuse = - kmalloc(sizeof(struct list_head) * s->objects, GFP_KERNEL); + kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL); unsigned long flags; if (!slabs_by_inuse) @@ -2748,7 +2802,7 @@ int kmem_cache_shrink(struct kmem_cache *s) if (!n->nr_partial) continue; - for (i = 0; i < s->objects; i++) + for (i = 0; i < objects; i++) INIT_LIST_HEAD(slabs_by_inuse + i); spin_lock_irqsave(&n->list_lock, flags); @@ -2780,7 +2834,7 @@ int kmem_cache_shrink(struct kmem_cache *s) * Rebuild the partial list with the slabs filled up most * first and the least used slabs at the end. */ - for (i = s->objects - 1; i >= 0; i--) + for (i = objects - 1; i >= 0; i--) list_splice(slabs_by_inuse + i, n->partial.prev); spin_unlock_irqrestore(&n->list_lock, flags); @@ -2928,7 +2982,7 @@ void __init kmem_cache_init(void) kmalloc_caches[0].refcount = -1; caches++; - hotplug_memory_notifier(slab_memory_callback, 1); + hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); #endif /* Able to allocate the per node structures */ @@ -3001,9 +3055,6 @@ static int slab_unmergeable(struct kmem_cache *s) if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) return 1; - if ((s->flags & __PAGE_ALLOC_FALLBACK)) - return 1; - if (s->ctor) return 1; @@ -3196,7 +3247,8 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, } #if (defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG)) || defined(CONFIG_SLABINFO) -static unsigned long count_partial(struct kmem_cache_node *n) +static unsigned long count_partial(struct kmem_cache_node *n, + int (*get_count)(struct page *)) { unsigned long flags; unsigned long x = 0; @@ -3204,10 +3256,25 @@ static unsigned long count_partial(struct kmem_cache_node *n) spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry(page, &n->partial, lru) - x += page->inuse; + x += get_count(page); spin_unlock_irqrestore(&n->list_lock, flags); return x; } + +static int count_inuse(struct page *page) +{ + return page->inuse; +} + +static int count_total(struct page *page) +{ + return page->objects; +} + +static int count_free(struct page *page) +{ + return page->objects - page->inuse; +} #endif #if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG) @@ -3222,7 +3289,7 @@ static int validate_slab(struct kmem_cache *s, struct page *page, return 0; /* Now we know that a valid freelist exists */ - bitmap_zero(map, s->objects); + bitmap_zero(map, page->objects); for_each_free_object(p, s, page->freelist) { set_bit(slab_index(p, s, addr), map); @@ -3230,7 +3297,7 @@ static int validate_slab(struct kmem_cache *s, struct page *page, return 0; } - for_each_object(p, s, addr) + for_each_object(p, s, addr, page->objects) if (!test_bit(slab_index(p, s, addr), map)) if (!check_object(s, page, p, 1)) return 0; @@ -3296,7 +3363,7 @@ static long validate_slab_cache(struct kmem_cache *s) { int node; unsigned long count = 0; - unsigned long *map = kmalloc(BITS_TO_LONGS(s->objects) * + unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * sizeof(unsigned long), GFP_KERNEL); if (!map) @@ -3499,14 +3566,14 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s, struct page *page, enum track_item alloc) { void *addr = page_address(page); - DECLARE_BITMAP(map, s->objects); + DECLARE_BITMAP(map, page->objects); void *p; - bitmap_zero(map, s->objects); + bitmap_zero(map, page->objects); for_each_free_object(p, s, page->freelist) set_bit(slab_index(p, s, addr), map); - for_each_object(p, s, addr) + for_each_object(p, s, addr, page->objects) if (!test_bit(slab_index(p, s, addr), map)) add_location(t, s, get_track(s, p, alloc)); } @@ -3555,12 +3622,10 @@ static int list_locations(struct kmem_cache *s, char *buf, len += sprintf(buf + len, "<not-available>"); if (l->sum_time != l->min_time) { - unsigned long remainder; - len += sprintf(buf + len, " age=%ld/%ld/%ld", - l->min_time, - div_long_long_rem(l->sum_time, l->count, &remainder), - l->max_time); + l->min_time, + (long)div_u64(l->sum_time, l->count), + l->max_time); } else len += sprintf(buf + len, " age=%ld", l->min_time); @@ -3596,22 +3661,23 @@ static int list_locations(struct kmem_cache *s, char *buf, } enum slab_stat_type { - SL_FULL, - SL_PARTIAL, - SL_CPU, - SL_OBJECTS + SL_ALL, /* All slabs */ + SL_PARTIAL, /* Only partially allocated slabs */ + SL_CPU, /* Only slabs used for cpu caches */ + SL_OBJECTS, /* Determine allocated objects not slabs */ + SL_TOTAL /* Determine object capacity not slabs */ }; -#define SO_FULL (1 << SL_FULL) +#define SO_ALL (1 << SL_ALL) #define SO_PARTIAL (1 << SL_PARTIAL) #define SO_CPU (1 << SL_CPU) #define SO_OBJECTS (1 << SL_OBJECTS) +#define SO_TOTAL (1 << SL_TOTAL) static ssize_t show_slab_objects(struct kmem_cache *s, char *buf, unsigned long flags) { unsigned long total = 0; - int cpu; int node; int x; unsigned long *nodes; @@ -3622,56 +3688,60 @@ static ssize_t show_slab_objects(struct kmem_cache *s, return -ENOMEM; per_cpu = nodes + nr_node_ids; - for_each_possible_cpu(cpu) { - struct page *page; - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); + if (flags & SO_CPU) { + int cpu; - if (!c) - continue; + for_each_possible_cpu(cpu) { + struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); - page = c->page; - node = c->node; - if (node < 0) - continue; - if (page) { - if (flags & SO_CPU) { - if (flags & SO_OBJECTS) - x = page->inuse; + if (!c || c->node < 0) + continue; + + if (c->page) { + if (flags & SO_TOTAL) + x = c->page->objects; + else if (flags & SO_OBJECTS) + x = c->page->inuse; else x = 1; + total += x; - nodes[node] += x; + nodes[c->node] += x; } - per_cpu[node]++; + per_cpu[c->node]++; } } - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); + if (flags & SO_ALL) { + for_each_node_state(node, N_NORMAL_MEMORY) { + struct kmem_cache_node *n = get_node(s, node); + + if (flags & SO_TOTAL) + x = atomic_long_read(&n->total_objects); + else if (flags & SO_OBJECTS) + x = atomic_long_read(&n->total_objects) - + count_partial(n, count_free); - if (flags & SO_PARTIAL) { - if (flags & SO_OBJECTS) - x = count_partial(n); else - x = n->nr_partial; + x = atomic_long_read(&n->nr_slabs); total += x; nodes[node] += x; } - if (flags & SO_FULL) { - int full_slabs = atomic_long_read(&n->nr_slabs) - - per_cpu[node] - - n->nr_partial; + } else if (flags & SO_PARTIAL) { + for_each_node_state(node, N_NORMAL_MEMORY) { + struct kmem_cache_node *n = get_node(s, node); - if (flags & SO_OBJECTS) - x = full_slabs * s->objects; + if (flags & SO_TOTAL) + x = count_partial(n, count_total); + else if (flags & SO_OBJECTS) + x = count_partial(n, count_inuse); else - x = full_slabs; + x = n->nr_partial; total += x; nodes[node] += x; } } - x = sprintf(buf, "%lu", total); #ifdef CONFIG_NUMA for_each_node_state(node, N_NORMAL_MEMORY) @@ -3686,14 +3756,6 @@ static ssize_t show_slab_objects(struct kmem_cache *s, static int any_slab_objects(struct kmem_cache *s) { int node; - int cpu; - - for_each_possible_cpu(cpu) { - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); - - if (c && c->page) - return 1; - } for_each_online_node(node) { struct kmem_cache_node *n = get_node(s, node); @@ -3701,7 +3763,7 @@ static int any_slab_objects(struct kmem_cache *s) if (!n) continue; - if (n->nr_partial || atomic_long_read(&n->nr_slabs)) + if (atomic_read(&n->total_objects)) return 1; } return 0; @@ -3743,15 +3805,27 @@ SLAB_ATTR_RO(object_size); static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->objects); + return sprintf(buf, "%d\n", oo_objects(s->oo)); } SLAB_ATTR_RO(objs_per_slab); +static ssize_t order_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + int order = simple_strtoul(buf, NULL, 10); + + if (order > slub_max_order || order < slub_min_order) + return -EINVAL; + + calculate_sizes(s, order); + return length; +} + static ssize_t order_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->order); + return sprintf(buf, "%d\n", oo_order(s->oo)); } -SLAB_ATTR_RO(order); +SLAB_ATTR(order); static ssize_t ctor_show(struct kmem_cache *s, char *buf) { @@ -3772,7 +3846,7 @@ SLAB_ATTR_RO(aliases); static ssize_t slabs_show(struct kmem_cache *s, char *buf) { - return show_slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU); + return show_slab_objects(s, buf, SO_ALL); } SLAB_ATTR_RO(slabs); @@ -3790,10 +3864,22 @@ SLAB_ATTR_RO(cpu_slabs); static ssize_t objects_show(struct kmem_cache *s, char *buf) { - return show_slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU|SO_OBJECTS); + return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS); } SLAB_ATTR_RO(objects); +static ssize_t objects_partial_show(struct kmem_cache *s, char *buf) +{ + return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS); +} +SLAB_ATTR_RO(objects_partial); + +static ssize_t total_objects_show(struct kmem_cache *s, char *buf) +{ + return show_slab_objects(s, buf, SO_ALL|SO_TOTAL); +} +SLAB_ATTR_RO(total_objects); + static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) { return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); @@ -3873,7 +3959,7 @@ static ssize_t red_zone_store(struct kmem_cache *s, s->flags &= ~SLAB_RED_ZONE; if (buf[0] == '1') s->flags |= SLAB_RED_ZONE; - calculate_sizes(s); + calculate_sizes(s, -1); return length; } SLAB_ATTR(red_zone); @@ -3892,7 +3978,7 @@ static ssize_t poison_store(struct kmem_cache *s, s->flags &= ~SLAB_POISON; if (buf[0] == '1') s->flags |= SLAB_POISON; - calculate_sizes(s); + calculate_sizes(s, -1); return length; } SLAB_ATTR(poison); @@ -3911,7 +3997,7 @@ static ssize_t store_user_store(struct kmem_cache *s, s->flags &= ~SLAB_STORE_USER; if (buf[0] == '1') s->flags |= SLAB_STORE_USER; - calculate_sizes(s); + calculate_sizes(s, -1); return length; } SLAB_ATTR(store_user); @@ -4042,7 +4128,7 @@ STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty); STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); - +STAT_ATTR(ORDER_FALLBACK, order_fallback); #endif static struct attribute *slab_attrs[] = { @@ -4051,6 +4137,8 @@ static struct attribute *slab_attrs[] = { &objs_per_slab_attr.attr, &order_attr.attr, &objects_attr.attr, + &objects_partial_attr.attr, + &total_objects_attr.attr, &slabs_attr.attr, &partial_attr.attr, &cpu_slabs_attr.attr, @@ -4093,6 +4181,7 @@ static struct attribute *slab_attrs[] = { &deactivate_to_head_attr.attr, &deactivate_to_tail_attr.attr, &deactivate_remote_frees_attr.attr, + &order_fallback_attr.attr, #endif NULL }; @@ -4379,7 +4468,8 @@ static int s_show(struct seq_file *m, void *p) unsigned long nr_partials = 0; unsigned long nr_slabs = 0; unsigned long nr_inuse = 0; - unsigned long nr_objs; + unsigned long nr_objs = 0; + unsigned long nr_free = 0; struct kmem_cache *s; int node; @@ -4393,14 +4483,15 @@ static int s_show(struct seq_file *m, void *p) nr_partials += n->nr_partial; nr_slabs += atomic_long_read(&n->nr_slabs); - nr_inuse += count_partial(n); + nr_objs += atomic_long_read(&n->total_objects); + nr_free += count_partial(n, count_free); } - nr_objs = nr_slabs * s->objects; - nr_inuse += (nr_slabs - nr_partials) * s->objects; + nr_inuse = nr_objs - nr_free; seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse, - nr_objs, s->size, s->objects, (1 << s->order)); + nr_objs, s->size, oo_objects(s->oo), + (1 << oo_order(s->oo))); seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0); seq_printf(m, " : slabdata %6lu %6lu %6lu", nr_slabs, nr_slabs, 0UL); diff --git a/mm/sparse.c b/mm/sparse.c index 7e9191381f8..36511c7b5e2 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -8,6 +8,7 @@ #include <linux/module.h> #include <linux/spinlock.h> #include <linux/vmalloc.h> +#include "internal.h" #include <asm/dma.h> #include <asm/pgalloc.h> #include <asm/pgtable.h> @@ -208,12 +209,12 @@ static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long p } /* - * We need this if we ever free the mem_maps. While not implemented yet, - * this function is included for parity with its sibling. + * Decode mem_map from the coded memmap */ -static __attribute((unused)) struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum) { + /* mask off the extra low bits of information */ + coded_mem_map &= SECTION_MAP_MASK; return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum); } @@ -232,7 +233,7 @@ static int __meminit sparse_init_one_section(struct mem_section *ms, return 1; } -static unsigned long usemap_size(void) +unsigned long usemap_size(void) { unsigned long size_bytes; size_bytes = roundup(SECTION_BLOCKFLAGS_BITS, 8) / 8; @@ -260,7 +261,7 @@ static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum) /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */ nid = 0; - printk(KERN_WARNING "%s: allocation failed\n", __FUNCTION__); + printk(KERN_WARNING "%s: allocation failed\n", __func__); return NULL; } @@ -273,8 +274,8 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid) if (map) return map; - map = alloc_bootmem_node(NODE_DATA(nid), - sizeof(struct page) * PAGES_PER_SECTION); + map = alloc_bootmem_pages_node(NODE_DATA(nid), + PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION)); return map; } #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ @@ -290,7 +291,7 @@ struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) return map; printk(KERN_ERR "%s: sparsemem memory map backing failed " - "some memory will not be available.\n", __FUNCTION__); + "some memory will not be available.\n", __func__); ms->section_mem_map = 0; return NULL; } @@ -365,6 +366,9 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) { return; /* XXX: Not implemented yet */ } +static void free_map_bootmem(struct page *page, unsigned long nr_pages) +{ +} #else static struct page *__kmalloc_section_memmap(unsigned long nr_pages) { @@ -402,8 +406,69 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) free_pages((unsigned long)memmap, get_order(sizeof(struct page) * nr_pages)); } + +static void free_map_bootmem(struct page *page, unsigned long nr_pages) +{ + unsigned long maps_section_nr, removing_section_nr, i; + int magic; + + for (i = 0; i < nr_pages; i++, page++) { + magic = atomic_read(&page->_mapcount); + + BUG_ON(magic == NODE_INFO); + + maps_section_nr = pfn_to_section_nr(page_to_pfn(page)); + removing_section_nr = page->private; + + /* + * When this function is called, the removing section is + * logical offlined state. This means all pages are isolated + * from page allocator. If removing section's memmap is placed + * on the same section, it must not be freed. + * If it is freed, page allocator may allocate it which will + * be removed physically soon. + */ + if (maps_section_nr != removing_section_nr) + put_page_bootmem(page); + } +} #endif /* CONFIG_SPARSEMEM_VMEMMAP */ +static void free_section_usemap(struct page *memmap, unsigned long *usemap) +{ + struct page *usemap_page; + unsigned long nr_pages; + + if (!usemap) + return; + + usemap_page = virt_to_page(usemap); + /* + * Check to see if allocation came from hot-plug-add + */ + if (PageSlab(usemap_page)) { + kfree(usemap); + if (memmap) + __kfree_section_memmap(memmap, PAGES_PER_SECTION); + return; + } + + /* + * The usemap came from bootmem. This is packed with other usemaps + * on the section which has pgdat at boot time. Just keep it as is now. + */ + + if (memmap) { + struct page *memmap_page; + memmap_page = virt_to_page(memmap); + + nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page)) + >> PAGE_SHIFT; + + free_map_bootmem(memmap_page, nr_pages); + } +} + /* * returns the number of sections whose mem_maps were properly * set. If this is <=0, then that means that the passed-in @@ -456,4 +521,20 @@ out: } return ret; } + +void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) +{ + struct page *memmap = NULL; + unsigned long *usemap = NULL; + + if (ms->section_mem_map) { + usemap = ms->pageblock_flags; + memmap = sparse_decode_mem_map(ms->section_mem_map, + __section_nr(ms)); + ms->section_mem_map = 0; + ms->pageblock_flags = NULL; + } + + free_section_usemap(memmap, usemap); +} #endif diff --git a/mm/swap.c b/mm/swap.c index aa1139ccf3a..91e194445a5 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -132,34 +132,21 @@ static void pagevec_move_tail(struct pagevec *pvec) * Writeback is about to end against a page which has been marked for immediate * reclaim. If it still appears to be reclaimable, move it to the tail of the * inactive list. - * - * Returns zero if it cleared PG_writeback. */ -int rotate_reclaimable_page(struct page *page) +void rotate_reclaimable_page(struct page *page) { - struct pagevec *pvec; - unsigned long flags; - - if (PageLocked(page)) - return 1; - if (PageDirty(page)) - return 1; - if (PageActive(page)) - return 1; - if (!PageLRU(page)) - return 1; - - page_cache_get(page); - local_irq_save(flags); - pvec = &__get_cpu_var(lru_rotate_pvecs); - if (!pagevec_add(pvec, page)) - pagevec_move_tail(pvec); - local_irq_restore(flags); - - if (!test_clear_page_writeback(page)) - BUG(); + if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && + PageLRU(page)) { + struct pagevec *pvec; + unsigned long flags; - return 0; + page_cache_get(page); + local_irq_save(flags); + pvec = &__get_cpu_var(lru_rotate_pvecs); + if (!pagevec_add(pvec, page)) + pagevec_move_tail(pvec); + local_irq_restore(flags); + } } /* diff --git a/mm/swap_state.c b/mm/swap_state.c index 50757ee3f9f..d8aadaf2a0b 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -33,7 +33,7 @@ static const struct address_space_operations swap_aops = { }; static struct backing_dev_info swap_backing_dev_info = { - .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, .unplug_io_fn = swap_unplug_io_fn, }; diff --git a/mm/swapfile.c b/mm/swapfile.c index 2da149cfc9a..bd1bb592030 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1426,11 +1426,7 @@ static const struct file_operations proc_swaps_operations = { static int __init procswaps_init(void) { - struct proc_dir_entry *entry; - - entry = create_proc_entry("swaps", 0, NULL); - if (entry) - entry->proc_fops = &proc_swaps_operations; + proc_create("swaps", 0, NULL, &proc_swaps_operations); return 0; } __initcall(procswaps_init); @@ -1582,6 +1578,14 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) error = -EINVAL; goto bad_swap; case 2: + /* swap partition endianess hack... */ + if (swab32(swap_header->info.version) == 1) { + swab32s(&swap_header->info.version); + swab32s(&swap_header->info.last_page); + swab32s(&swap_header->info.nr_badpages); + for (i = 0; i < swap_header->info.nr_badpages; i++) + swab32s(&swap_header->info.badpages[i]); + } /* Check the swap header's sub-version and the size of the swap file and bad block lists */ if (swap_header->info.version != 1) { diff --git a/mm/truncate.c b/mm/truncate.c index 7d20ce41ecf..b8961cb6341 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -391,6 +391,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t next; int i; int ret = 0; + int ret2 = 0; int did_range_unmap = 0; int wrapped = 0; @@ -438,9 +439,13 @@ int invalidate_inode_pages2_range(struct address_space *mapping, } } BUG_ON(page_mapped(page)); - ret = do_launder_page(mapping, page); - if (ret == 0 && !invalidate_complete_page2(mapping, page)) - ret = -EIO; + ret2 = do_launder_page(mapping, page); + if (ret2 == 0) { + if (!invalidate_complete_page2(mapping, page)) + ret2 = -EIO; + } + if (ret2 < 0) + ret = ret2; unlock_page(page); } pagevec_release(&pvec); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ecf91f8034b..6e45b0f3d12 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -14,8 +14,10 @@ #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/interrupt.h> - +#include <linux/seq_file.h> +#include <linux/debugobjects.h> #include <linux/vmalloc.h> +#include <linux/kallsyms.h> #include <asm/uaccess.h> #include <asm/tlbflush.h> @@ -25,7 +27,7 @@ DEFINE_RWLOCK(vmlist_lock); struct vm_struct *vmlist; static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, - int node); + int node, void *caller); static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) { @@ -204,9 +206,9 @@ unsigned long vmalloc_to_pfn(const void *vmalloc_addr) } EXPORT_SYMBOL(vmalloc_to_pfn); -static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags, - unsigned long start, unsigned long end, - int node, gfp_t gfp_mask) +static struct vm_struct * +__get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start, + unsigned long end, int node, gfp_t gfp_mask, void *caller) { struct vm_struct **p, *tmp, *area; unsigned long align = 1; @@ -269,6 +271,7 @@ found: area->pages = NULL; area->nr_pages = 0; area->phys_addr = 0; + area->caller = caller; write_unlock(&vmlist_lock); return area; @@ -284,7 +287,8 @@ out: struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, unsigned long start, unsigned long end) { - return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL); + return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL, + __builtin_return_address(0)); } EXPORT_SYMBOL_GPL(__get_vm_area); @@ -299,14 +303,22 @@ EXPORT_SYMBOL_GPL(__get_vm_area); */ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) { - return __get_vm_area(size, flags, VMALLOC_START, VMALLOC_END); + return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, + -1, GFP_KERNEL, __builtin_return_address(0)); +} + +struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, + void *caller) +{ + return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, + -1, GFP_KERNEL, caller); } struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int node, gfp_t gfp_mask) { return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node, - gfp_mask); + gfp_mask, __builtin_return_address(0)); } /* Caller must hold vmlist_lock */ @@ -383,6 +395,7 @@ static void __vunmap(const void *addr, int deallocate_pages) } debug_check_no_locks_freed(addr, area->size); + debug_check_no_obj_freed(addr, area->size); if (deallocate_pages) { int i; @@ -455,9 +468,11 @@ void *vmap(struct page **pages, unsigned int count, if (count > num_physpages) return NULL; - area = get_vm_area((count << PAGE_SHIFT), flags); + area = get_vm_area_caller((count << PAGE_SHIFT), flags, + __builtin_return_address(0)); if (!area) return NULL; + if (map_vm_area(area, prot, &pages)) { vunmap(area->addr); return NULL; @@ -468,7 +483,7 @@ void *vmap(struct page **pages, unsigned int count, EXPORT_SYMBOL(vmap); static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, - pgprot_t prot, int node) + pgprot_t prot, int node, void *caller) { struct page **pages; unsigned int nr_pages, array_size, i; @@ -480,7 +495,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, /* Please note that the recursion is strictly bounded. */ if (array_size > PAGE_SIZE) { pages = __vmalloc_node(array_size, gfp_mask | __GFP_ZERO, - PAGE_KERNEL, node); + PAGE_KERNEL, node, caller); area->flags |= VM_VPAGES; } else { pages = kmalloc_node(array_size, @@ -488,6 +503,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, node); } area->pages = pages; + area->caller = caller; if (!area->pages) { remove_vm_area(area->addr); kfree(area); @@ -521,7 +537,8 @@ fail: void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) { - return __vmalloc_area_node(area, gfp_mask, prot, -1); + return __vmalloc_area_node(area, gfp_mask, prot, -1, + __builtin_return_address(0)); } /** @@ -530,13 +547,14 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) * @gfp_mask: flags for the page level allocator * @prot: protection mask for the allocated pages * @node: node to use for allocation or -1 + * @caller: caller's return address * * Allocate enough pages to cover @size from the page level * allocator with @gfp_mask flags. Map them into contiguous * kernel virtual space, using a pagetable protection of @prot. */ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, - int node) + int node, void *caller) { struct vm_struct *area; @@ -544,16 +562,19 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, if (!size || (size >> PAGE_SHIFT) > num_physpages) return NULL; - area = get_vm_area_node(size, VM_ALLOC, node, gfp_mask); + area = __get_vm_area_node(size, VM_ALLOC, VMALLOC_START, VMALLOC_END, + node, gfp_mask, caller); + if (!area) return NULL; - return __vmalloc_area_node(area, gfp_mask, prot, node); + return __vmalloc_area_node(area, gfp_mask, prot, node, caller); } void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) { - return __vmalloc_node(size, gfp_mask, prot, -1); + return __vmalloc_node(size, gfp_mask, prot, -1, + __builtin_return_address(0)); } EXPORT_SYMBOL(__vmalloc); @@ -568,7 +589,8 @@ EXPORT_SYMBOL(__vmalloc); */ void *vmalloc(unsigned long size) { - return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); + return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, + -1, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc); @@ -608,7 +630,8 @@ EXPORT_SYMBOL(vmalloc_user); */ void *vmalloc_node(unsigned long size, int node) { - return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node); + return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, + node, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_node); @@ -843,7 +866,8 @@ struct vm_struct *alloc_vm_area(size_t size) { struct vm_struct *area; - area = get_vm_area(size, VM_IOREMAP); + area = get_vm_area_caller(size, VM_IOREMAP, + __builtin_return_address(0)); if (area == NULL) return NULL; @@ -873,3 +897,85 @@ void free_vm_area(struct vm_struct *area) kfree(area); } EXPORT_SYMBOL_GPL(free_vm_area); + + +#ifdef CONFIG_PROC_FS +static void *s_start(struct seq_file *m, loff_t *pos) +{ + loff_t n = *pos; + struct vm_struct *v; + + read_lock(&vmlist_lock); + v = vmlist; + while (n > 0 && v) { + n--; + v = v->next; + } + if (!n) + return v; + + return NULL; + +} + +static void *s_next(struct seq_file *m, void *p, loff_t *pos) +{ + struct vm_struct *v = p; + + ++*pos; + return v->next; +} + +static void s_stop(struct seq_file *m, void *p) +{ + read_unlock(&vmlist_lock); +} + +static int s_show(struct seq_file *m, void *p) +{ + struct vm_struct *v = p; + + seq_printf(m, "0x%p-0x%p %7ld", + v->addr, v->addr + v->size, v->size); + + if (v->caller) { + char buff[2 * KSYM_NAME_LEN]; + + seq_putc(m, ' '); + sprint_symbol(buff, (unsigned long)v->caller); + seq_puts(m, buff); + } + + if (v->nr_pages) + seq_printf(m, " pages=%d", v->nr_pages); + + if (v->phys_addr) + seq_printf(m, " phys=%lx", v->phys_addr); + + if (v->flags & VM_IOREMAP) + seq_printf(m, " ioremap"); + + if (v->flags & VM_ALLOC) + seq_printf(m, " vmalloc"); + + if (v->flags & VM_MAP) + seq_printf(m, " vmap"); + + if (v->flags & VM_USERMAP) + seq_printf(m, " user"); + + if (v->flags & VM_VPAGES) + seq_printf(m, " vpages"); + + seq_putc(m, '\n'); + return 0; +} + +const struct seq_operations vmalloc_op = { + .start = s_start, + .next = s_next, + .stop = s_stop, + .show = s_show, +}; +#endif + diff --git a/mm/vmscan.c b/mm/vmscan.c index f80a5b7c057..9a29901ad3b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -191,7 +191,7 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, shrinker->nr += delta; if (shrinker->nr < 0) { printk(KERN_ERR "%s: nr=%ld\n", - __FUNCTION__, shrinker->nr); + __func__, shrinker->nr); shrinker->nr = max_pass; } @@ -339,7 +339,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, if (PagePrivate(page)) { if (try_to_free_buffers(page)) { ClearPageDirty(page); - printk("%s: orphaned page\n", __FUNCTION__); + printk("%s: orphaned page\n", __func__); return PAGE_CLEAN; } } @@ -1246,17 +1246,16 @@ static unsigned long shrink_zone(int priority, struct zone *zone, * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. */ -static unsigned long shrink_zones(int priority, struct zone **zones, +static unsigned long shrink_zones(int priority, struct zonelist *zonelist, struct scan_control *sc) { + enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); unsigned long nr_reclaimed = 0; - int i; - + struct zoneref *z; + struct zone *zone; sc->all_unreclaimable = 1; - for (i = 0; zones[i] != NULL; i++) { - struct zone *zone = zones[i]; - + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { if (!populated_zone(zone)) continue; /* @@ -1300,9 +1299,12 @@ static unsigned long shrink_zones(int priority, struct zone **zones, * hope that some of these pages can be written. But if the allocating task * holds filesystem locks which prevent writeout this might not work, and the * allocation attempt will fail. + * + * returns: 0, if no pages reclaimed + * else, the number of pages reclaimed */ -static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask, - struct scan_control *sc) +static unsigned long do_try_to_free_pages(struct zonelist *zonelist, + struct scan_control *sc) { int priority; int ret = 0; @@ -1310,7 +1312,9 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask, unsigned long nr_reclaimed = 0; struct reclaim_state *reclaim_state = current->reclaim_state; unsigned long lru_pages = 0; - int i; + struct zoneref *z; + struct zone *zone; + enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); if (scan_global_lru(sc)) count_vm_event(ALLOCSTALL); @@ -1318,8 +1322,7 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask, * mem_cgroup will not do shrink_slab. */ if (scan_global_lru(sc)) { - for (i = 0; zones[i] != NULL; i++) { - struct zone *zone = zones[i]; + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; @@ -1333,13 +1336,13 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask, sc->nr_scanned = 0; if (!priority) disable_swap_token(); - nr_reclaimed += shrink_zones(priority, zones, sc); + nr_reclaimed += shrink_zones(priority, zonelist, sc); /* * Don't shrink slabs when reclaiming memory from * over limit cgroups */ if (scan_global_lru(sc)) { - shrink_slab(sc->nr_scanned, gfp_mask, lru_pages); + shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); if (reclaim_state) { nr_reclaimed += reclaim_state->reclaimed_slab; reclaim_state->reclaimed_slab = 0; @@ -1347,7 +1350,7 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask, } total_scanned += sc->nr_scanned; if (nr_reclaimed >= sc->swap_cluster_max) { - ret = 1; + ret = nr_reclaimed; goto out; } @@ -1370,7 +1373,7 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask, } /* top priority shrink_caches still had more to do? don't OOM, then */ if (!sc->all_unreclaimable && scan_global_lru(sc)) - ret = 1; + ret = nr_reclaimed; out: /* * Now that we've scanned all the zones at this priority level, note @@ -1383,8 +1386,7 @@ out: priority = 0; if (scan_global_lru(sc)) { - for (i = 0; zones[i] != NULL; i++) { - struct zone *zone = zones[i]; + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; @@ -1397,7 +1399,8 @@ out: return ret; } -unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask) +unsigned long try_to_free_pages(struct zonelist *zonelist, int order, + gfp_t gfp_mask) { struct scan_control sc = { .gfp_mask = gfp_mask, @@ -1410,7 +1413,7 @@ unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask) .isolate_pages = isolate_pages_global, }; - return do_try_to_free_pages(zones, gfp_mask, &sc); + return do_try_to_free_pages(zonelist, &sc); } #ifdef CONFIG_CGROUP_MEM_RES_CTLR @@ -1419,7 +1422,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, gfp_t gfp_mask) { struct scan_control sc = { - .gfp_mask = gfp_mask, .may_writepage = !laptop_mode, .may_swap = 1, .swap_cluster_max = SWAP_CLUSTER_MAX, @@ -1428,13 +1430,12 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, .mem_cgroup = mem_cont, .isolate_pages = mem_cgroup_isolate_pages, }; - struct zone **zones; - int target_zone = gfp_zone(GFP_HIGHUSER_MOVABLE); + struct zonelist *zonelist; - zones = NODE_DATA(numa_node_id())->node_zonelists[target_zone].zones; - if (do_try_to_free_pages(zones, sc.gfp_mask, &sc)) - return 1; - return 0; + sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | + (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); + zonelist = NODE_DATA(numa_node_id())->node_zonelists; + return do_try_to_free_pages(zonelist, &sc); } #endif diff --git a/mm/vmstat.c b/mm/vmstat.c index 7c7286e9506..1a32130b958 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -322,6 +322,7 @@ void refresh_cpu_vm_stats(int cpu) p->expire = 3; #endif } + cond_resched(); #ifdef CONFIG_NUMA /* * Deal with draining the remote pageset of this @@ -364,13 +365,13 @@ void refresh_cpu_vm_stats(int cpu) * * Must be called with interrupts disabled. */ -void zone_statistics(struct zonelist *zonelist, struct zone *z) +void zone_statistics(struct zone *preferred_zone, struct zone *z) { - if (z->zone_pgdat == zonelist->zones[0]->zone_pgdat) { + if (z->zone_pgdat == preferred_zone->zone_pgdat) { __inc_zone_state(z, NUMA_HIT); } else { __inc_zone_state(z, NUMA_MISS); - __inc_zone_state(zonelist->zones[0], NUMA_FOREIGN); + __inc_zone_state(preferred_zone, NUMA_FOREIGN); } if (z->node == numa_node_id()) __inc_zone_state(z, NUMA_LOCAL); @@ -547,6 +548,10 @@ static int pagetypeinfo_show(struct seq_file *m, void *arg) { pg_data_t *pgdat = (pg_data_t *)arg; + /* check memoryless node */ + if (!node_state(pgdat->node_id, N_HIGH_MEMORY)) + return 0; + seq_printf(m, "Page block order: %d\n", pageblock_order); seq_printf(m, "Pages per block: %lu\n", pageblock_nr_pages); seq_putc(m, '\n'); @@ -607,6 +612,7 @@ static const char * const vmstat_text[] = { "nr_unstable", "nr_bounce", "nr_vmscan_write", + "nr_writeback_temp", #ifdef CONFIG_NUMA "numa_hit", @@ -645,6 +651,10 @@ static const char * const vmstat_text[] = { "allocstall", "pgrotated", +#ifdef CONFIG_HUGETLB_PAGE + "htlb_buddy_alloc_success", + "htlb_buddy_alloc_fail", +#endif #endif }; |