aboutsummaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig6
-rw-r--r--mm/backing-dev.c16
-rw-r--r--mm/filemap.c5
-rw-r--r--mm/hugetlb.c17
-rw-r--r--mm/madvise.c6
-rw-r--r--mm/memory.c12
-rw-r--r--mm/mempolicy.c51
-rw-r--r--mm/mempool.c3
-rw-r--r--mm/mlock.c5
-rw-r--r--mm/mmap.c29
-rw-r--r--mm/mremap.c13
-rw-r--r--mm/nommu.c7
-rw-r--r--mm/page-writeback.c10
-rw-r--r--mm/page_alloc.c330
-rw-r--r--mm/slab.c32
-rw-r--r--mm/slob.c538
-rw-r--r--mm/slub.c79
-rw-r--r--mm/swap_state.c2
-rw-r--r--mm/swapfile.c2
-rw-r--r--mm/truncate.c42
20 files changed, 882 insertions, 323 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 8ac412b45f1..086af703da4 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -117,7 +117,7 @@ config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
depends on SPARSEMEM || X86_64_ACPI_NUMA
depends on HOTPLUG && !SOFTWARE_SUSPEND && ARCH_ENABLE_MEMORY_HOTPLUG
- depends on (IA64 || X86 || PPC64)
+ depends on (IA64 || X86 || PPC64 || SUPERH)
comment "Memory hotplug is currently incompatible with Software Suspend"
depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND
@@ -168,3 +168,7 @@ config NR_QUICK
depends on QUICKLIST
default "2" if (SUPERH && !SUPERH64)
default "1"
+
+config VIRT_TO_BUS
+ def_bool y
+ depends on !ARCH_NO_VIRT_TO_BUS
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index e5de3781d3f..f50a2811f9d 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -55,22 +55,6 @@ long congestion_wait(int rw, long timeout)
}
EXPORT_SYMBOL(congestion_wait);
-long congestion_wait_interruptible(int rw, long timeout)
-{
- long ret;
- DEFINE_WAIT(wait);
- wait_queue_head_t *wqh = &congestion_wqh[rw];
-
- prepare_to_wait(wqh, &wait, TASK_INTERRUPTIBLE);
- if (signal_pending(current))
- ret = -ERESTARTSYS;
- else
- ret = io_schedule_timeout(timeout);
- finish_wait(wqh, &wait);
- return ret;
-}
-EXPORT_SYMBOL(congestion_wait_interruptible);
-
/**
* congestion_end - wake up sleepers on a congested backing_dev_info
* @rw: READ or WRITE
diff --git a/mm/filemap.c b/mm/filemap.c
index c6ebd9f912a..100b99c2d50 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -120,6 +120,7 @@ void __remove_from_page_cache(struct page *page)
page->mapping = NULL;
mapping->nrpages--;
__dec_zone_page_state(page, NR_FILE_PAGES);
+ BUG_ON(page_mapped(page));
}
void remove_from_page_cache(struct page *page)
@@ -1218,6 +1219,8 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
retval = retval ?: desc.error;
break;
}
+ if (desc.count > 0)
+ break;
}
}
out:
@@ -1964,7 +1967,6 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
if (unlikely(*pos + *count > MAX_NON_LFS &&
!(file->f_flags & O_LARGEFILE))) {
if (*pos >= MAX_NON_LFS) {
- send_sig(SIGXFSZ, current, 0);
return -EFBIG;
}
if (*count > MAX_NON_LFS - (unsigned long)*pos) {
@@ -1982,7 +1984,6 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
if (likely(!isblk)) {
if (unlikely(*pos >= inode->i_sb->s_maxbytes)) {
if (*count || *pos > inode->i_sb->s_maxbytes) {
- send_sig(SIGXFSZ, current, 0);
return -EFBIG;
}
/* zero-length writes at ->s_maxbytes are OK */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a45d1f0691c..acc0fb3cf06 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -66,7 +66,7 @@ static void enqueue_huge_page(struct page *page)
static struct page *dequeue_huge_page(struct vm_area_struct *vma,
unsigned long address)
{
- int nid = numa_node_id();
+ int nid;
struct page *page = NULL;
struct zonelist *zonelist = huge_zonelist(vma, address);
struct zone **z;
@@ -101,13 +101,20 @@ static void free_huge_page(struct page *page)
static int alloc_fresh_huge_page(void)
{
- static int nid = 0;
+ static int prev_nid;
struct page *page;
- page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
- HUGETLB_PAGE_ORDER);
- nid = next_node(nid, node_online_map);
+ static DEFINE_SPINLOCK(nid_lock);
+ int nid;
+
+ spin_lock(&nid_lock);
+ nid = next_node(prev_nid, node_online_map);
if (nid == MAX_NUMNODES)
nid = first_node(node_online_map);
+ prev_nid = nid;
+ spin_unlock(&nid_lock);
+
+ page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
+ HUGETLB_PAGE_ORDER);
if (page) {
set_compound_page_dtor(page, free_huge_page);
spin_lock(&hugetlb_lock);
diff --git a/mm/madvise.c b/mm/madvise.c
index 60542d006ec..93ee375b38e 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -287,9 +287,11 @@ asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior)
struct vm_area_struct * vma, *prev;
int unmapped_error = 0;
int error = -EINVAL;
+ int write;
size_t len;
- if (madvise_need_mmap_write(behavior))
+ write = madvise_need_mmap_write(behavior);
+ if (write)
down_write(&current->mm->mmap_sem);
else
down_read(&current->mm->mmap_sem);
@@ -354,7 +356,7 @@ asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior)
vma = find_vma(current->mm, start);
}
out:
- if (madvise_need_mmap_write(behavior))
+ if (write)
up_write(&current->mm->mmap_sem);
else
up_read(&current->mm->mmap_sem);
diff --git a/mm/memory.c b/mm/memory.c
index f64cbf9baa3..b3d73bb1f68 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -78,11 +78,9 @@ unsigned long num_physpages;
* and ZONE_HIGHMEM.
*/
void * high_memory;
-unsigned long vmalloc_earlyreserve;
EXPORT_SYMBOL(num_physpages);
EXPORT_SYMBOL(high_memory);
-EXPORT_SYMBOL(vmalloc_earlyreserve);
int randomize_va_space __read_mostly = 1;
@@ -1055,6 +1053,14 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
do {
struct page *page;
+ /*
+ * If tsk is ooming, cut off its access to large memory
+ * allocations. It has a pending SIGKILL, but it can't
+ * be processed until returning to user space.
+ */
+ if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE)))
+ return -ENOMEM;
+
if (write)
foll_flags |= FOLL_WRITE;
@@ -2673,7 +2679,7 @@ int make_pages_present(unsigned long addr, unsigned long end)
write = (vma->vm_flags & VM_WRITE) != 0;
BUG_ON(addr >= end);
BUG_ON(end > vma->vm_end);
- len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE;
+ len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
ret = get_user_pages(current, current->mm, addr,
len, write, 0, NULL, NULL);
if (ret < 0)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d76e8eb342d..188f8d9c4ae 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -101,8 +101,6 @@
static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;
-#define PDprintk(fmt...)
-
/* Highest zone. An specific allocation for a zone below that is not
policied. */
enum zone_type policy_zone = 0;
@@ -175,7 +173,9 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
{
struct mempolicy *policy;
- PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
+ pr_debug("setting mode %d nodes[0] %lx\n",
+ mode, nodes ? nodes_addr(*nodes)[0] : -1);
+
if (mode == MPOL_DEFAULT)
return NULL;
policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
@@ -379,7 +379,7 @@ static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
int err = 0;
struct mempolicy *old = vma->vm_policy;
- PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
+ pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
vma->vm_start, vma->vm_end, vma->vm_pgoff,
vma->vm_ops, vma->vm_file,
vma->vm_ops ? vma->vm_ops->set_policy : NULL);
@@ -776,8 +776,8 @@ long do_mbind(unsigned long start, unsigned long len,
if (!new)
flags |= MPOL_MF_DISCONTIG_OK;
- PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
- mode,nodes_addr(nodes)[0]);
+ pr_debug("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
+ mode, nmask ? nodes_addr(*nmask)[0] : -1);
down_write(&mm->mmap_sem);
vma = check_range(mm, start, end, nmask,
@@ -1434,7 +1434,7 @@ static void sp_insert(struct shared_policy *sp, struct sp_node *new)
}
rb_link_node(&new->nd, parent, p);
rb_insert_color(&new->nd, &sp->root);
- PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
+ pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
new->policy ? new->policy->policy : 0);
}
@@ -1459,7 +1459,7 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
- PDprintk("deleting %lx-l%x\n", n->start, n->end);
+ pr_debug("deleting %lx-l%lx\n", n->start, n->end);
rb_erase(&n->nd, &sp->root);
mpol_free(n->policy);
kmem_cache_free(sn_cache, n);
@@ -1558,10 +1558,10 @@ int mpol_set_shared_policy(struct shared_policy *info,
struct sp_node *new = NULL;
unsigned long sz = vma_pages(vma);
- PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
+ pr_debug("set_shared_policy %lx sz %lu %d %lx\n",
vma->vm_pgoff,
sz, npol? npol->policy : -1,
- npol ? nodes_addr(npol->v.nodes)[0] : -1);
+ npol ? nodes_addr(npol->v.nodes)[0] : -1);
if (npol) {
new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
@@ -1597,6 +1597,10 @@ void mpol_free_shared_policy(struct shared_policy *p)
/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
+ nodemask_t interleave_nodes;
+ unsigned long largest = 0;
+ int nid, prefer = 0;
+
policy_cache = kmem_cache_create("numa_policy",
sizeof(struct mempolicy),
0, SLAB_PANIC, NULL, NULL);
@@ -1605,10 +1609,31 @@ void __init numa_policy_init(void)
sizeof(struct sp_node),
0, SLAB_PANIC, NULL, NULL);
- /* Set interleaving policy for system init. This way not all
- the data structures allocated at system boot end up in node zero. */
+ /*
+ * Set interleaving policy for system init. Interleaving is only
+ * enabled across suitably sized nodes (default is >= 16MB), or
+ * fall back to the largest node if they're all smaller.
+ */
+ nodes_clear(interleave_nodes);
+ for_each_online_node(nid) {
+ unsigned long total_pages = node_present_pages(nid);
+
+ /* Preserve the largest node */
+ if (largest < total_pages) {
+ largest = total_pages;
+ prefer = nid;
+ }
+
+ /* Interleave this node? */
+ if ((total_pages << PAGE_SHIFT) >= (16 << 20))
+ node_set(nid, interleave_nodes);
+ }
+
+ /* All too small, use the largest */
+ if (unlikely(nodes_empty(interleave_nodes)))
+ node_set(prefer, interleave_nodes);
- if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
+ if (do_set_mempolicy(MPOL_INTERLEAVE, &interleave_nodes))
printk("numa_policy_init: interleaving failed\n");
}
diff --git a/mm/mempool.c b/mm/mempool.c
index cc1ca86dfc2..3e8f1fed0e1 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -263,6 +263,9 @@ void mempool_free(void *element, mempool_t *pool)
{
unsigned long flags;
+ if (unlikely(element == NULL))
+ return;
+
smp_mb();
if (pool->curr_nr < pool->min_nr) {
spin_lock_irqsave(&pool->lock, flags);
diff --git a/mm/mlock.c b/mm/mlock.c
index 4d3fea267e0..7b2656055d6 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -244,9 +244,12 @@ int user_shm_lock(size_t size, struct user_struct *user)
locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+ if (lock_limit == RLIM_INFINITY)
+ allowed = 1;
lock_limit >>= PAGE_SHIFT;
spin_lock(&shmlock_user_lock);
- if (locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK))
+ if (!allowed &&
+ locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK))
goto out;
get_uid(user);
user->locked_shm += locked;
diff --git a/mm/mmap.c b/mm/mmap.c
index 906ed402f7c..144b4a290f2 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -894,14 +894,11 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
unsigned long flags, unsigned long pgoff)
{
struct mm_struct * mm = current->mm;
- struct vm_area_struct * vma, * prev;
struct inode *inode;
unsigned int vm_flags;
- int correct_wcount = 0;
int error;
- struct rb_node ** rb_link, * rb_parent;
int accountable = 1;
- unsigned long charged = 0, reqprot = prot;
+ unsigned long reqprot = prot;
/*
* Does the application expect PROT_READ to imply PROT_EXEC?
@@ -1023,10 +1020,28 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
}
}
- error = security_file_mmap(file, reqprot, prot, flags);
+ error = security_file_mmap(file, reqprot, prot, flags, addr, 0);
if (error)
return error;
-
+
+ return mmap_region(file, addr, len, flags, vm_flags, pgoff,
+ accountable);
+}
+EXPORT_SYMBOL(do_mmap_pgoff);
+
+unsigned long mmap_region(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long flags,
+ unsigned int vm_flags, unsigned long pgoff,
+ int accountable)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma, *prev;
+ int correct_wcount = 0;
+ int error;
+ struct rb_node **rb_link, *rb_parent;
+ unsigned long charged = 0;
+ struct inode *inode = file ? file->f_path.dentry->d_inode : NULL;
+
/* Clear old maps */
error = -ENOMEM;
munmap_back:
@@ -1175,8 +1190,6 @@ unacct_error:
return error;
}
-EXPORT_SYMBOL(do_mmap_pgoff);
-
/* Get an address range which is currently unmapped.
* For shmat() with addr=0.
*
diff --git a/mm/mremap.c b/mm/mremap.c
index 5d4bd4f95b8..bc7c52efc71 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -291,6 +291,10 @@ unsigned long do_mremap(unsigned long addr,
if ((addr <= new_addr) && (addr+old_len) > new_addr)
goto out;
+ ret = security_file_mmap(0, 0, 0, 0, new_addr, 1);
+ if (ret)
+ goto out;
+
ret = do_munmap(mm, new_addr, new_len);
if (ret)
goto out;
@@ -390,8 +394,13 @@ unsigned long do_mremap(unsigned long addr,
new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
vma->vm_pgoff, map_flags);
- ret = new_addr;
- if (new_addr & ~PAGE_MASK)
+ if (new_addr & ~PAGE_MASK) {
+ ret = new_addr;
+ goto out;
+ }
+
+ ret = security_file_mmap(0, 0, 0, 0, new_addr, 1);
+ if (ret)
goto out;
}
ret = move_vma(vma, addr, old_len, new_len, new_addr);
diff --git a/mm/nommu.c b/mm/nommu.c
index 2b16b00a5b1..8bbbf147a79 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -367,6 +367,11 @@ struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
return find_vma(mm, addr);
}
+int expand_stack(struct vm_area_struct *vma, unsigned long address)
+{
+ return -ENOMEM;
+}
+
/*
* look up the first VMA exactly that exactly matches addr
* - should be called with mm->mmap_sem at least held readlocked
@@ -639,7 +644,7 @@ static int validate_mmap_request(struct file *file,
}
/* allow the security API to have its say */
- ret = security_file_mmap(file, reqprot, prot, flags);
+ ret = security_file_mmap(file, reqprot, prot, flags, addr, 0);
if (ret < 0)
return ret;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index eec1481ba44..ea9da3bed3e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -476,15 +476,13 @@ static void wb_kupdate(unsigned long arg)
* sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
*/
int dirty_writeback_centisecs_handler(ctl_table *table, int write,
- struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+ struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
{
proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos);
- if (dirty_writeback_interval) {
- mod_timer(&wb_timer,
- jiffies + dirty_writeback_interval);
- } else {
+ if (dirty_writeback_interval)
+ mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
+ else
del_timer(&wb_timer);
- }
return 0;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 05ace44852e..f9e4e647d7e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -126,13 +126,13 @@ static unsigned long __meminitdata dma_reserve;
#endif
#endif
- struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS];
- int __meminitdata nr_nodemap_entries;
- unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
- unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
+ static struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS];
+ static int __meminitdata nr_nodemap_entries;
+ static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
+ static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
- unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES];
- unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES];
+ static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES];
+ static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES];
#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
@@ -900,11 +900,13 @@ static struct fail_page_alloc_attr {
u32 ignore_gfp_highmem;
u32 ignore_gfp_wait;
+ u32 min_order;
#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
struct dentry *ignore_gfp_highmem_file;
struct dentry *ignore_gfp_wait_file;
+ struct dentry *min_order_file;
#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
@@ -912,6 +914,7 @@ static struct fail_page_alloc_attr {
.attr = FAULT_ATTR_INITIALIZER,
.ignore_gfp_wait = 1,
.ignore_gfp_highmem = 1,
+ .min_order = 1,
};
static int __init setup_fail_page_alloc(char *str)
@@ -922,6 +925,8 @@ __setup("fail_page_alloc=", setup_fail_page_alloc);
static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
+ if (order < fail_page_alloc.min_order)
+ return 0;
if (gfp_mask & __GFP_NOFAIL)
return 0;
if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
@@ -953,12 +958,17 @@ static int __init fail_page_alloc_debugfs(void)
fail_page_alloc.ignore_gfp_highmem_file =
debugfs_create_bool("ignore-gfp-highmem", mode, dir,
&fail_page_alloc.ignore_gfp_highmem);
+ fail_page_alloc.min_order_file =
+ debugfs_create_u32("min-order", mode, dir,
+ &fail_page_alloc.min_order);
if (!fail_page_alloc.ignore_gfp_wait_file ||
- !fail_page_alloc.ignore_gfp_highmem_file) {
+ !fail_page_alloc.ignore_gfp_highmem_file ||
+ !fail_page_alloc.min_order_file) {
err = -ENOMEM;
debugfs_remove(fail_page_alloc.ignore_gfp_wait_file);
debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file);
+ debugfs_remove(fail_page_alloc.min_order_file);
cleanup_fault_attr_dentries(&fail_page_alloc.attr);
}
@@ -1621,8 +1631,8 @@ void show_free_areas(void)
*
* Add all populated zones of a node to the zonelist.
*/
-static int __meminit build_zonelists_node(pg_data_t *pgdat,
- struct zonelist *zonelist, int nr_zones, enum zone_type zone_type)
+static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
+ int nr_zones, enum zone_type zone_type)
{
struct zone *zone;
@@ -1641,9 +1651,102 @@ static int __meminit build_zonelists_node(pg_data_t *pgdat,
return nr_zones;
}
+
+/*
+ * zonelist_order:
+ * 0 = automatic detection of better ordering.
+ * 1 = order by ([node] distance, -zonetype)
+ * 2 = order by (-zonetype, [node] distance)
+ *
+ * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
+ * the same zonelist. So only NUMA can configure this param.
+ */
+#define ZONELIST_ORDER_DEFAULT 0
+#define ZONELIST_ORDER_NODE 1
+#define ZONELIST_ORDER_ZONE 2
+
+/* zonelist order in the kernel.
+ * set_zonelist_order() will set this to NODE or ZONE.
+ */
+static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
+static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
+
+
#ifdef CONFIG_NUMA
+/* The value user specified ....changed by config */
+static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
+/* string for sysctl */
+#define NUMA_ZONELIST_ORDER_LEN 16
+char numa_zonelist_order[16] = "default";
+
+/*
+ * interface for configure zonelist ordering.
+ * command line option "numa_zonelist_order"
+ * = "[dD]efault - default, automatic configuration.
+ * = "[nN]ode - order by node locality, then by zone within node
+ * = "[zZ]one - order by zone, then by locality within zone
+ */
+
+static int __parse_numa_zonelist_order(char *s)
+{
+ if (*s == 'd' || *s == 'D') {
+ user_zonelist_order = ZONELIST_ORDER_DEFAULT;
+ } else if (*s == 'n' || *s == 'N') {
+ user_zonelist_order = ZONELIST_ORDER_NODE;
+ } else if (*s == 'z' || *s == 'Z') {
+ user_zonelist_order = ZONELIST_ORDER_ZONE;
+ } else {
+ printk(KERN_WARNING
+ "Ignoring invalid numa_zonelist_order value: "
+ "%s\n", s);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static __init int setup_numa_zonelist_order(char *s)
+{
+ if (s)
+ return __parse_numa_zonelist_order(s);
+ return 0;
+}
+early_param("numa_zonelist_order", setup_numa_zonelist_order);
+
+/*
+ * sysctl handler for numa_zonelist_order
+ */
+int numa_zonelist_order_handler(ctl_table *table, int write,
+ struct file *file, void __user *buffer, size_t *length,
+ loff_t *ppos)
+{
+ char saved_string[NUMA_ZONELIST_ORDER_LEN];
+ int ret;
+
+ if (write)
+ strncpy(saved_string, (char*)table->data,
+ NUMA_ZONELIST_ORDER_LEN);
+ ret = proc_dostring(table, write, file, buffer, length, ppos);
+ if (ret)
+ return ret;
+ if (write) {
+ int oldval = user_zonelist_order;
+ if (__parse_numa_zonelist_order((char*)table->data)) {
+ /*
+ * bogus value. restore saved string
+ */
+ strncpy((char*)table->data, saved_string,
+ NUMA_ZONELIST_ORDER_LEN);
+ user_zonelist_order = oldval;
+ } else if (oldval != user_zonelist_order)
+ build_all_zonelists();
+ }
+ return 0;
+}
+
+
#define MAX_NODE_LOAD (num_online_nodes())
-static int __meminitdata node_load[MAX_NUMNODES];
+static int node_load[MAX_NUMNODES];
+
/**
* find_next_best_node - find the next node that should appear in a given node's fallback list
* @node: node whose fallback list we're appending
@@ -1658,7 +1761,7 @@ static int __meminitdata node_load[MAX_NUMNODES];
* on them otherwise.
* It returns -1 if no node is found.
*/
-static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
+static int find_next_best_node(int node, nodemask_t *used_node_mask)
{
int n, val;
int min_val = INT_MAX;
@@ -1704,13 +1807,129 @@ static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
return best_node;
}
-static void __meminit build_zonelists(pg_data_t *pgdat)
+
+/*
+ * Build zonelists ordered by node and zones within node.
+ * This results in maximum locality--normal zone overflows into local
+ * DMA zone, if any--but risks exhausting DMA zone.
+ */
+static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
{
- int j, node, local_node;
enum zone_type i;
- int prev_node, load;
+ int j;
struct zonelist *zonelist;
+
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ zonelist = pgdat->node_zonelists + i;
+ for (j = 0; zonelist->zones[j] != NULL; j++)
+ ;
+ j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
+ zonelist->zones[j] = NULL;
+ }
+}
+
+/*
+ * Build zonelists ordered by zone and nodes within zones.
+ * This results in conserving DMA zone[s] until all Normal memory is
+ * exhausted, but results in overflowing to remote node while memory
+ * may still exist in local DMA zone.
+ */
+static int node_order[MAX_NUMNODES];
+
+static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
+{
+ enum zone_type i;
+ int pos, j, node;
+ int zone_type; /* needs to be signed */
+ struct zone *z;
+ struct zonelist *zonelist;
+
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ zonelist = pgdat->node_zonelists + i;
+ pos = 0;
+ for (zone_type = i; zone_type >= 0; zone_type--) {
+ for (j = 0; j < nr_nodes; j++) {
+ node = node_order[j];
+ z = &NODE_DATA(node)->node_zones[zone_type];
+ if (populated_zone(z)) {
+ zonelist->zones[pos++] = z;
+ check_highest_zone(zone_type);
+ }
+ }
+ }
+ zonelist->zones[pos] = NULL;
+ }
+}
+
+static int default_zonelist_order(void)
+{
+ int nid, zone_type;
+ unsigned long low_kmem_size,total_size;
+ struct zone *z;
+ int average_size;
+ /*
+ * ZONE_DMA and ZONE_DMA32 can be very small area in the sytem.
+ * If they are really small and used heavily, the system can fall
+ * into OOM very easily.
+ * This function detect ZONE_DMA/DMA32 size and confgigures zone order.
+ */
+ /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
+ low_kmem_size = 0;
+ total_size = 0;
+ for_each_online_node(nid) {
+ for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
+ z = &NODE_DATA(nid)->node_zones[zone_type];
+ if (populated_zone(z)) {
+ if (zone_type < ZONE_NORMAL)
+ low_kmem_size += z->present_pages;
+ total_size += z->present_pages;
+ }
+ }
+ }
+ if (!low_kmem_size || /* there are no DMA area. */
+ low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
+ return ZONELIST_ORDER_NODE;
+ /*
+ * look into each node's config.
+ * If there is a node whose DMA/DMA32 memory is very big area on
+ * local memory, NODE_ORDER may be suitable.
+ */
+ average_size = total_size / (num_online_nodes() + 1);
+ for_each_online_node(nid) {
+ low_kmem_size = 0;
+ total_size = 0;
+ for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
+ z = &NODE_DATA(nid)->node_zones[zone_type];
+ if (populated_zone(z)) {
+ if (zone_type < ZONE_NORMAL)
+ low_kmem_size += z->present_pages;
+ total_size += z->present_pages;
+ }
+ }
+ if (low_kmem_size &&
+ total_size > average_size && /* ignore small node */
+ low_kmem_size > total_size * 70/100)
+ return ZONELIST_ORDER_NODE;
+ }
+ return ZONELIST_ORDER_ZONE;
+}
+
+static void set_zonelist_order(void)
+{
+ if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
+ current_zonelist_order = default_zonelist_order();
+ else
+ current_zonelist_order = user_zonelist_order;
+}
+
+static void build_zonelists(pg_data_t *pgdat)
+{
+ int j, node, load;
+ enum zone_type i;
nodemask_t used_mask;
+ int local_node, prev_node;
+ struct zonelist *zonelist;
+ int order = current_zonelist_order;
/* initialize zonelists */
for (i = 0; i < MAX_NR_ZONES; i++) {
@@ -1723,6 +1942,11 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
load = num_online_nodes();
prev_node = local_node;
nodes_clear(used_mask);
+
+ memset(node_load, 0, sizeof(node_load));
+ memset(node_order, 0, sizeof(node_order));
+ j = 0;
+
while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
int distance = node_distance(local_node, node);
@@ -1738,23 +1962,25 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
* So adding penalty to the first node in same
* distance group to make it round-robin.
*/
-
if (distance != node_distance(local_node, prev_node))
- node_load[node] += load;
+ node_load[node] = load;
+
prev_node = node;
load--;
- for (i = 0; i < MAX_NR_ZONES; i++) {
- zonelist = pgdat->node_zonelists + i;
- for (j = 0; zonelist->zones[j] != NULL; j++);
+ if (order == ZONELIST_ORDER_NODE)
+ build_zonelists_in_node_order(pgdat, node);
+ else
+ node_order[j++] = node; /* remember order */
+ }
- j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
- zonelist->zones[j] = NULL;
- }
+ if (order == ZONELIST_ORDER_ZONE) {
+ /* calculate node order -- i.e., DMA last! */
+ build_zonelists_in_zone_order(pgdat, j);
}
}
/* Construct the zonelist performance cache - see further mmzone.h */
-static void __meminit build_zonelist_cache(pg_data_t *pgdat)
+static void build_zonelist_cache(pg_data_t *pgdat)
{
int i;
@@ -1771,9 +1997,15 @@ static void __meminit build_zonelist_cache(pg_data_t *pgdat)
}
}
+
#else /* CONFIG_NUMA */
-static void __meminit build_zonelists(pg_data_t *pgdat)
+static void set_zonelist_order(void)
+{
+ current_zonelist_order = ZONELIST_ORDER_ZONE;
+}
+
+static void build_zonelists(pg_data_t *pgdat)
{
int node, local_node;
enum zone_type i,j;
@@ -1809,7 +2041,7 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
}
/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
-static void __meminit build_zonelist_cache(pg_data_t *pgdat)
+static void build_zonelist_cache(pg_data_t *pgdat)
{
int i;
@@ -1820,7 +2052,7 @@ static void __meminit build_zonelist_cache(pg_data_t *pgdat)
#endif /* CONFIG_NUMA */
/* return values int ....just for stop_machine_run() */
-static int __meminit __build_all_zonelists(void *dummy)
+static int __build_all_zonelists(void *dummy)
{
int nid;
@@ -1831,8 +2063,10 @@ static int __meminit __build_all_zonelists(void *dummy)
return 0;
}
-void __meminit build_all_zonelists(void)
+void build_all_zonelists(void)
{
+ set_zonelist_order();
+
if (system_state == SYSTEM_BOOTING) {
__build_all_zonelists(NULL);
cpuset_init_current_mems_allowed();
@@ -1843,8 +2077,13 @@ void __meminit build_all_zonelists(void)
/* cpuset refresh routine should be here */
}
vm_total_pages = nr_free_pagecache_pages();
- printk("Built %i zonelists. Total pages: %ld\n",
- num_online_nodes(), vm_total_pages);
+ printk("Built %i zonelists in %s order. Total pages: %ld\n",
+ num_online_nodes(),
+ zonelist_order_name[current_zonelist_order],
+ vm_total_pages);
+#ifdef CONFIG_NUMA
+ printk("Policy zone: %s\n", zone_names[policy_zone]);
+#endif
}
/*
@@ -1953,8 +2192,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
}
}
-void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
- unsigned long size)
+static void __meminit zone_init_free_lists(struct pglist_data *pgdat,
+ struct zone *zone, unsigned long size)
{
int order;
for (order = 0; order < MAX_ORDER ; order++) {
@@ -2370,7 +2609,7 @@ void __init push_node_boundaries(unsigned int nid,
}
/* If necessary, push the node boundary out for reserve hotadd */
-static void __init account_node_boundary(unsigned int nid,
+static void __meminit account_node_boundary(unsigned int nid,
unsigned long *start_pfn, unsigned long *end_pfn)
{
printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n",
@@ -2390,7 +2629,7 @@ static void __init account_node_boundary(unsigned int nid,
void __init push_node_boundaries(unsigned int nid,
unsigned long start_pfn, unsigned long end_pfn) {}
-static void __init account_node_boundary(unsigned int nid,
+static void __meminit account_node_boundary(unsigned int nid,
unsigned long *start_pfn, unsigned long *end_pfn) {}
#endif
@@ -2431,7 +2670,7 @@ void __meminit get_pfn_range_for_nid(unsigned int nid,
* Return the number of pages a zone spans in a node, including holes
* present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
*/
-unsigned long __meminit zone_spanned_pages_in_node(int nid,
+static unsigned long __meminit zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
unsigned long *ignored)
{
@@ -2519,7 +2758,7 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn,
}
/* Return the number of page frames in holes in a zone on a node */
-unsigned long __meminit zone_absent_pages_in_node(int nid,
+static unsigned long __meminit zone_absent_pages_in_node(int nid,
unsigned long zone_type,
unsigned long *ignored)
{
@@ -2536,14 +2775,14 @@ unsigned long __meminit zone_absent_pages_in_node(int nid,
}
#else
-static inline unsigned long zone_spanned_pages_in_node(int nid,
+static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
unsigned long *zones_size)
{
return zones_size[zone_type];
}
-static inline unsigned long zone_absent_pages_in_node(int nid,
+static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
unsigned long zone_type,
unsigned long *zholes_size)
{
@@ -3355,13 +3594,28 @@ void *__init alloc_large_system_hash(const char *tablename,
for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++)
;
table = (void*) __get_free_pages(GFP_ATOMIC, order);
+ /*
+ * If bucketsize is not a power-of-two, we may free
+ * some pages at the end of hash table.
+ */
+ if (table) {
+ unsigned long alloc_end = (unsigned long)table +
+ (PAGE_SIZE << order);
+ unsigned long used = (unsigned long)table +
+ PAGE_ALIGN(size);
+ split_page(virt_to_page(table), order);
+ while (used < alloc_end) {
+ free_page(used);
+ used += PAGE_SIZE;
+ }
+ }
}
} while (!table && size > PAGE_SIZE && --log2qty);
if (!table)
panic("Failed to allocate %s hash table\n", tablename);
- printk("%s hash table entries: %d (order: %d, %lu bytes)\n",
+ printk(KERN_INFO "%s hash table entries: %d (order: %d, %lu bytes)\n",
tablename,
(1U << log2qty),
ilog2(size) - PAGE_SHIFT,
diff --git a/mm/slab.c b/mm/slab.c
index b344e670712..a453383333f 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -929,7 +929,7 @@ static void next_reap_node(void)
* the CPUs getting into lockstep and contending for the global cache chain
* lock.
*/
-static void __devinit start_cpu_timer(int cpu)
+static void __cpuinit start_cpu_timer(int cpu)
{
struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
@@ -4157,26 +4157,17 @@ static void print_slabinfo_header(struct seq_file *m)
static void *s_start(struct seq_file *m, loff_t *pos)
{
loff_t n = *pos;
- struct list_head *p;
mutex_lock(&cache_chain_mutex);
if (!n)
print_slabinfo_header(m);
- p = cache_chain.next;
- while (n--) {
- p = p->next;
- if (p == &cache_chain)
- return NULL;
- }
- return list_entry(p, struct kmem_cache, next);
+
+ return seq_list_start(&cache_chain, *pos);
}
static void *s_next(struct seq_file *m, void *p, loff_t *pos)
{
- struct kmem_cache *cachep = p;
- ++*pos;
- return cachep->next.next == &cache_chain ?
- NULL : list_entry(cachep->next.next, struct kmem_cache, next);
+ return seq_list_next(p, &cache_chain, pos);
}
static void s_stop(struct seq_file *m, void *p)
@@ -4186,7 +4177,7 @@ static void s_stop(struct seq_file *m, void *p)
static int s_show(struct seq_file *m, void *p)
{
- struct kmem_cache *cachep = p;
+ struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next);
struct slab *slabp;
unsigned long active_objs;
unsigned long num_objs;
@@ -4355,17 +4346,8 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
static void *leaks_start(struct seq_file *m, loff_t *pos)
{
- loff_t n = *pos;
- struct list_head *p;
-
mutex_lock(&cache_chain_mutex);
- p = cache_chain.next;
- while (n--) {
- p = p->next;
- if (p == &cache_chain)
- return NULL;
- }
- return list_entry(p, struct kmem_cache, next);
+ return seq_list_start(&cache_chain, *pos);
}
static inline int add_caller(unsigned long *n, unsigned long v)
@@ -4430,7 +4412,7 @@ static void show_symbol(struct seq_file *m, unsigned long address)
static int leaks_show(struct seq_file *m, void *p)
{
- struct kmem_cache *cachep = p;
+ struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next);
struct slab *slabp;
struct kmem_list3 *l3;
const char *name;
diff --git a/mm/slob.c b/mm/slob.c
index 71976c5d40d..b4899079d8b 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -3,57 +3,159 @@
*
* Matt Mackall <mpm@selenic.com> 12/30/03
*
+ * NUMA support by Paul Mundt, 2007.
+ *
* How SLOB works:
*
* The core of SLOB is a traditional K&R style heap allocator, with
* support for returning aligned objects. The granularity of this
- * allocator is 8 bytes on x86, though it's perhaps possible to reduce
- * this to 4 if it's deemed worth the effort. The slob heap is a
- * singly-linked list of pages from __get_free_page, grown on demand
- * and allocation from the heap is currently first-fit.
+ * allocator is as little as 2 bytes, however typically most architectures
+ * will require 4 bytes on 32-bit and 8 bytes on 64-bit.
+ *
+ * The slob heap is a linked list of pages from alloc_pages(), and
+ * within each page, there is a singly-linked list of free blocks (slob_t).
+ * The heap is grown on demand and allocation from the heap is currently
+ * first-fit.
*
* Above this is an implementation of kmalloc/kfree. Blocks returned
- * from kmalloc are 8-byte aligned and prepended with a 8-byte header.
+ * from kmalloc are prepended with a 4-byte header with the kmalloc size.
* If kmalloc is asked for objects of PAGE_SIZE or larger, it calls
- * __get_free_pages directly so that it can return page-aligned blocks
- * and keeps a linked list of such pages and their orders. These
- * objects are detected in kfree() by their page alignment.
+ * alloc_pages() directly, allocating compound pages so the page order
+ * does not have to be separately tracked, and also stores the exact
+ * allocation size in page->private so that it can be used to accurately
+ * provide ksize(). These objects are detected in kfree() because slob_page()
+ * is false for them.
*
* SLAB is emulated on top of SLOB by simply calling constructors and
- * destructors for every SLAB allocation. Objects are returned with
- * the 8-byte alignment unless the SLAB_HWCACHE_ALIGN flag is
- * set, in which case the low-level allocator will fragment blocks to
- * create the proper alignment. Again, objects of page-size or greater
- * are allocated by calling __get_free_pages. As SLAB objects know
- * their size, no separate size bookkeeping is necessary and there is
- * essentially no allocation space overhead.
+ * destructors for every SLAB allocation. Objects are returned with the
+ * 4-byte alignment unless the SLAB_HWCACHE_ALIGN flag is set, in which
+ * case the low-level allocator will fragment blocks to create the proper
+ * alignment. Again, objects of page-size or greater are allocated by
+ * calling alloc_pages(). As SLAB objects know their size, no separate
+ * size bookkeeping is necessary and there is essentially no allocation
+ * space overhead, and compound pages aren't needed for multi-page
+ * allocations.
+ *
+ * NUMA support in SLOB is fairly simplistic, pushing most of the real
+ * logic down to the page allocator, and simply doing the node accounting
+ * on the upper levels. In the event that a node id is explicitly
+ * provided, alloc_pages_node() with the specified node id is used
+ * instead. The common case (or when the node id isn't explicitly provided)
+ * will default to the current node, as per numa_node_id().
+ *
+ * Node aware pages are still inserted in to the global freelist, and
+ * these are scanned for by matching against the node id encoded in the
+ * page flags. As a result, block allocations that can be satisfied from
+ * the freelist will only be done so on pages residing on the same node,
+ * in order to prevent random node placement.
*/
+#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/cache.h>
#include <linux/init.h>
#include <linux/module.h>
-#include <linux/timer.h>
#include <linux/rcupdate.h>
+#include <linux/list.h>
+#include <asm/atomic.h>
+
+/*
+ * slob_block has a field 'units', which indicates size of block if +ve,
+ * or offset of next block if -ve (in SLOB_UNITs).
+ *
+ * Free blocks of size 1 unit simply contain the offset of the next block.
+ * Those with larger size contain their size in the first SLOB_UNIT of
+ * memory, and the offset of the next free block in the second SLOB_UNIT.
+ */
+#if PAGE_SIZE <= (32767 * 2)
+typedef s16 slobidx_t;
+#else
+typedef s32 slobidx_t;
+#endif
struct slob_block {
- int units;
- struct slob_block *next;
+ slobidx_t units;
};
typedef struct slob_block slob_t;
+/*
+ * We use struct page fields to manage some slob allocation aspects,
+ * however to avoid the horrible mess in include/linux/mm_types.h, we'll
+ * just define our own struct page type variant here.
+ */
+struct slob_page {
+ union {
+ struct {
+ unsigned long flags; /* mandatory */
+ atomic_t _count; /* mandatory */
+ slobidx_t units; /* free units left in page */
+ unsigned long pad[2];
+ slob_t *free; /* first free slob_t in page */
+ struct list_head list; /* linked list of free pages */
+ };
+ struct page page;
+ };
+};
+static inline void struct_slob_page_wrong_size(void)
+{ BUILD_BUG_ON(sizeof(struct slob_page) != sizeof(struct page)); }
+
+/*
+ * free_slob_page: call before a slob_page is returned to the page allocator.
+ */
+static inline void free_slob_page(struct slob_page *sp)
+{
+ reset_page_mapcount(&sp->page);
+ sp->page.mapping = NULL;
+}
+
+/*
+ * All (partially) free slob pages go on this list.
+ */
+static LIST_HEAD(free_slob_pages);
+
+/*
+ * slob_page: True for all slob pages (false for bigblock pages)
+ */
+static inline int slob_page(struct slob_page *sp)
+{
+ return test_bit(PG_active, &sp->flags);
+}
+
+static inline void set_slob_page(struct slob_page *sp)
+{
+ __set_bit(PG_active, &sp->flags);
+}
+
+static inline void clear_slob_page(struct slob_page *sp)
+{
+ __clear_bit(PG_active, &sp->flags);
+}
+
+/*
+ * slob_page_free: true for pages on free_slob_pages list.
+ */
+static inline int slob_page_free(struct slob_page *sp)
+{
+ return test_bit(PG_private, &sp->flags);
+}
+
+static inline void set_slob_page_free(struct slob_page *sp)
+{
+ list_add(&sp->list, &free_slob_pages);
+ __set_bit(PG_private, &sp->flags);
+}
+
+static inline void clear_slob_page_free(struct slob_page *sp)
+{
+ list_del(&sp->list);
+ __clear_bit(PG_private, &sp->flags);
+}
+
#define SLOB_UNIT sizeof(slob_t)
#define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT)
#define SLOB_ALIGN L1_CACHE_BYTES
-struct bigblock {
- int order;
- void *pages;
- struct bigblock *next;
-};
-typedef struct bigblock bigblock_t;
-
/*
* struct slob_rcu is inserted at the tail of allocated slob blocks, which
* were created with a SLAB_DESTROY_BY_RCU slab. slob_rcu is used to free
@@ -64,133 +166,285 @@ struct slob_rcu {
int size;
};
-static slob_t arena = { .next = &arena, .units = 1 };
-static slob_t *slobfree = &arena;
-static bigblock_t *bigblocks;
+/*
+ * slob_lock protects all slob allocator structures.
+ */
static DEFINE_SPINLOCK(slob_lock);
-static DEFINE_SPINLOCK(block_lock);
-static void slob_free(void *b, int size);
-static void slob_timer_cbk(void);
+/*
+ * Encode the given size and next info into a free slob block s.
+ */
+static void set_slob(slob_t *s, slobidx_t size, slob_t *next)
+{
+ slob_t *base = (slob_t *)((unsigned long)s & PAGE_MASK);
+ slobidx_t offset = next - base;
+ if (size > 1) {
+ s[0].units = size;
+ s[1].units = offset;
+ } else
+ s[0].units = -offset;
+}
-static void *slob_alloc(size_t size, gfp_t gfp, int align)
+/*
+ * Return the size of a slob block.
+ */
+static slobidx_t slob_units(slob_t *s)
+{
+ if (s->units > 0)
+ return s->units;
+ return 1;
+}
+
+/*
+ * Return the next free slob block pointer after this one.
+ */
+static slob_t *slob_next(slob_t *s)
+{
+ slob_t *base = (slob_t *)((unsigned long)s & PAGE_MASK);
+ slobidx_t next;
+
+ if (s[0].units < 0)
+ next = -s[0].units;
+ else
+ next = s[1].units;
+ return base+next;
+}
+
+/*
+ * Returns true if s is the last free block in its page.
+ */
+static int slob_last(slob_t *s)
+{
+ return !((unsigned long)slob_next(s) & ~PAGE_MASK);
+}
+
+static void *slob_new_page(gfp_t gfp, int order, int node)
+{
+ void *page;
+
+#ifdef CONFIG_NUMA
+ if (node != -1)
+ page = alloc_pages_node(node, gfp, order);
+ else
+#endif
+ page = alloc_pages(gfp, order);
+
+ if (!page)
+ return NULL;
+
+ return page_address(page);
+}
+
+/*
+ * Allocate a slob block within a given slob_page sp.
+ */
+static void *slob_page_alloc(struct slob_page *sp, size_t size, int align)
{
slob_t *prev, *cur, *aligned = 0;
int delta = 0, units = SLOB_UNITS(size);
- unsigned long flags;
- spin_lock_irqsave(&slob_lock, flags);
- prev = slobfree;
- for (cur = prev->next; ; prev = cur, cur = cur->next) {
+ for (prev = NULL, cur = sp->free; ; prev = cur, cur = slob_next(cur)) {
+ slobidx_t avail = slob_units(cur);
+
if (align) {
aligned = (slob_t *)ALIGN((unsigned long)cur, align);
delta = aligned - cur;
}
- if (cur->units >= units + delta) { /* room enough? */
+ if (avail >= units + delta) { /* room enough? */
+ slob_t *next;
+
if (delta) { /* need to fragment head to align? */
- aligned->units = cur->units - delta;
- aligned->next = cur->next;
- cur->next = aligned;
- cur->units = delta;
+ next = slob_next(cur);
+ set_slob(aligned, avail - delta, next);
+ set_slob(cur, delta, aligned);
prev = cur;
cur = aligned;
+ avail = slob_units(cur);
}
- if (cur->units == units) /* exact fit? */
- prev->next = cur->next; /* unlink */
- else { /* fragment */
- prev->next = cur + units;
- prev->next->units = cur->units - units;
- prev->next->next = cur->next;
- cur->units = units;
+ next = slob_next(cur);
+ if (avail == units) { /* exact fit? unlink. */
+ if (prev)
+ set_slob(prev, slob_units(prev), next);
+ else
+ sp->free = next;
+ } else { /* fragment */
+ if (prev)
+ set_slob(prev, slob_units(prev), cur + units);
+ else
+ sp->free = cur + units;
+ set_slob(cur + units, avail - units, next);
}
- slobfree = prev;
- spin_unlock_irqrestore(&slob_lock, flags);
+ sp->units -= units;
+ if (!sp->units)
+ clear_slob_page_free(sp);
return cur;
}
- if (cur == slobfree) {
- spin_unlock_irqrestore(&slob_lock, flags);
-
- if (size == PAGE_SIZE) /* trying to shrink arena? */
- return 0;
+ if (slob_last(cur))
+ return NULL;
+ }
+}
- cur = (slob_t *)__get_free_page(gfp);
- if (!cur)
- return 0;
+/*
+ * slob_alloc: entry point into the slob allocator.
+ */
+static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
+{
+ struct slob_page *sp;
+ slob_t *b = NULL;
+ unsigned long flags;
- slob_free(cur, PAGE_SIZE);
- spin_lock_irqsave(&slob_lock, flags);
- cur = slobfree;
+ spin_lock_irqsave(&slob_lock, flags);
+ /* Iterate through each partially free page, try to find room */
+ list_for_each_entry(sp, &free_slob_pages, list) {
+#ifdef CONFIG_NUMA
+ /*
+ * If there's a node specification, search for a partial
+ * page with a matching node id in the freelist.
+ */
+ if (node != -1 && page_to_nid(&sp->page) != node)
+ continue;
+#endif
+
+ if (sp->units >= SLOB_UNITS(size)) {
+ b = slob_page_alloc(sp, size, align);
+ if (b)
+ break;
}
}
+ spin_unlock_irqrestore(&slob_lock, flags);
+
+ /* Not enough space: must allocate a new page */
+ if (!b) {
+ b = slob_new_page(gfp, 0, node);
+ if (!b)
+ return 0;
+ sp = (struct slob_page *)virt_to_page(b);
+ set_slob_page(sp);
+
+ spin_lock_irqsave(&slob_lock, flags);
+ sp->units = SLOB_UNITS(PAGE_SIZE);
+ sp->free = b;
+ INIT_LIST_HEAD(&sp->list);
+ set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE));
+ set_slob_page_free(sp);
+ b = slob_page_alloc(sp, size, align);
+ BUG_ON(!b);
+ spin_unlock_irqrestore(&slob_lock, flags);
+ }
+ return b;
}
+/*
+ * slob_free: entry point into the slob allocator.
+ */
static void slob_free(void *block, int size)
{
- slob_t *cur, *b = (slob_t *)block;
+ struct slob_page *sp;
+ slob_t *prev, *next, *b = (slob_t *)block;
+ slobidx_t units;
unsigned long flags;
if (!block)
return;
+ BUG_ON(!size);
- if (size)
- b->units = SLOB_UNITS(size);
+ sp = (struct slob_page *)virt_to_page(block);
+ units = SLOB_UNITS(size);
- /* Find reinsertion point */
spin_lock_irqsave(&slob_lock, flags);
- for (cur = slobfree; !(b > cur && b < cur->next); cur = cur->next)
- if (cur >= cur->next && (b > cur || b < cur->next))
- break;
- if (b + b->units == cur->next) {
- b->units += cur->next->units;
- b->next = cur->next->next;
- } else
- b->next = cur->next;
+ if (sp->units + units == SLOB_UNITS(PAGE_SIZE)) {
+ /* Go directly to page allocator. Do not pass slob allocator */
+ if (slob_page_free(sp))
+ clear_slob_page_free(sp);
+ clear_slob_page(sp);
+ free_slob_page(sp);
+ free_page((unsigned long)b);
+ goto out;
+ }
- if (cur + cur->units == b) {
- cur->units += b->units;
- cur->next = b->next;
- } else
- cur->next = b;
+ if (!slob_page_free(sp)) {
+ /* This slob page is about to become partially free. Easy! */
+ sp->units = units;
+ sp->free = b;
+ set_slob(b, units,
+ (void *)((unsigned long)(b +
+ SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK));
+ set_slob_page_free(sp);
+ goto out;
+ }
- slobfree = cur;
+ /*
+ * Otherwise the page is already partially free, so find reinsertion
+ * point.
+ */
+ sp->units += units;
+ if (b < sp->free) {
+ set_slob(b, units, sp->free);
+ sp->free = b;
+ } else {
+ prev = sp->free;
+ next = slob_next(prev);
+ while (b > next) {
+ prev = next;
+ next = slob_next(prev);
+ }
+
+ if (!slob_last(prev) && b + units == next) {
+ units += slob_units(next);
+ set_slob(b, units, slob_next(next));
+ } else
+ set_slob(b, units, next);
+
+ if (prev + slob_units(prev) == b) {
+ units = slob_units(b) + slob_units(prev);
+ set_slob(prev, units, slob_next(b));
+ } else
+ set_slob(prev, slob_units(prev), b);
+ }
+out:
spin_unlock_irqrestore(&slob_lock, flags);
}
-void *__kmalloc(size_t size, gfp_t gfp)
-{
- slob_t *m;
- bigblock_t *bb;
- unsigned long flags;
+/*
+ * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend.
+ */
- if (size < PAGE_SIZE - SLOB_UNIT) {
- m = slob_alloc(size + SLOB_UNIT, gfp, 0);
- return m ? (void *)(m + 1) : 0;
- }
+#ifndef ARCH_KMALLOC_MINALIGN
+#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long)
+#endif
- bb = slob_alloc(sizeof(bigblock_t), gfp, 0);
- if (!bb)
- return 0;
+#ifndef ARCH_SLAB_MINALIGN
+#define ARCH_SLAB_MINALIGN __alignof__(unsigned long)
+#endif
- bb->order = get_order(size);
- bb->pages = (void *)__get_free_pages(gfp, bb->order);
+void *__kmalloc_node(size_t size, gfp_t gfp, int node)
+{
+ int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
+
+ if (size < PAGE_SIZE - align) {
+ unsigned int *m;
+ m = slob_alloc(size + align, gfp, align, node);
+ if (m)
+ *m = size;
+ return (void *)m + align;
+ } else {
+ void *ret;
- if (bb->pages) {
- spin_lock_irqsave(&block_lock, flags);
- bb->next = bigblocks;
- bigblocks = bb;
- spin_unlock_irqrestore(&block_lock, flags);
- return bb->pages;
+ ret = slob_new_page(gfp | __GFP_COMP, get_order(size), node);
+ if (ret) {
+ struct page *page;
+ page = virt_to_page(ret);
+ page->private = size;
+ }
+ return ret;
}
-
- slob_free(bb, sizeof(bigblock_t));
- return 0;
}
-EXPORT_SYMBOL(__kmalloc);
+EXPORT_SYMBOL(__kmalloc_node);
/**
* krealloc - reallocate memory. The contents will remain unchanged.
@@ -227,52 +481,34 @@ EXPORT_SYMBOL(krealloc);
void kfree(const void *block)
{
- bigblock_t *bb, **last = &bigblocks;
- unsigned long flags;
+ struct slob_page *sp;
if (!block)
return;
- if (!((unsigned long)block & (PAGE_SIZE-1))) {
- /* might be on the big block list */
- spin_lock_irqsave(&block_lock, flags);
- for (bb = bigblocks; bb; last = &bb->next, bb = bb->next) {
- if (bb->pages == block) {
- *last = bb->next;
- spin_unlock_irqrestore(&block_lock, flags);
- free_pages((unsigned long)block, bb->order);
- slob_free(bb, sizeof(bigblock_t));
- return;
- }
- }
- spin_unlock_irqrestore(&block_lock, flags);
- }
-
- slob_free((slob_t *)block - 1, 0);
- return;
+ sp = (struct slob_page *)virt_to_page(block);
+ if (slob_page(sp)) {
+ int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
+ unsigned int *m = (unsigned int *)(block - align);
+ slob_free(m, *m + align);
+ } else
+ put_page(&sp->page);
}
-
EXPORT_SYMBOL(kfree);
+/* can't use ksize for kmem_cache_alloc memory, only kmalloc */
size_t ksize(const void *block)
{
- bigblock_t *bb;
- unsigned long flags;
+ struct slob_page *sp;
if (!block)
return 0;
- if (!((unsigned long)block & (PAGE_SIZE-1))) {
- spin_lock_irqsave(&block_lock, flags);
- for (bb = bigblocks; bb; bb = bb->next)
- if (bb->pages == block) {
- spin_unlock_irqrestore(&slob_lock, flags);
- return PAGE_SIZE << bb->order;
- }
- spin_unlock_irqrestore(&block_lock, flags);
- }
-
- return ((slob_t *)block - 1)->units * SLOB_UNIT;
+ sp = (struct slob_page *)virt_to_page(block);
+ if (slob_page(sp))
+ return ((slob_t *)block - 1)->units + SLOB_UNIT;
+ else
+ return sp->page.private;
}
struct kmem_cache {
@@ -289,7 +525,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
{
struct kmem_cache *c;
- c = slob_alloc(sizeof(struct kmem_cache), flags, 0);
+ c = slob_alloc(sizeof(struct kmem_cache), flags, 0, -1);
if (c) {
c->name = name;
@@ -302,6 +538,8 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
c->ctor = ctor;
/* ignore alignment unless it's forced */
c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0;
+ if (c->align < ARCH_SLAB_MINALIGN)
+ c->align = ARCH_SLAB_MINALIGN;
if (c->align < align)
c->align = align;
} else if (flags & SLAB_PANIC)
@@ -317,21 +555,21 @@ void kmem_cache_destroy(struct kmem_cache *c)
}
EXPORT_SYMBOL(kmem_cache_destroy);
-void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags)
+void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
{
void *b;
if (c->size < PAGE_SIZE)
- b = slob_alloc(c->size, flags, c->align);
+ b = slob_alloc(c->size, flags, c->align, node);
else
- b = (void *)__get_free_pages(flags, get_order(c->size));
+ b = slob_new_page(flags, get_order(c->size), node);
if (c->ctor)
c->ctor(b, c, 0);
return b;
}
-EXPORT_SYMBOL(kmem_cache_alloc);
+EXPORT_SYMBOL(kmem_cache_alloc_node);
void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t flags)
{
@@ -385,9 +623,6 @@ const char *kmem_cache_name(struct kmem_cache *c)
}
EXPORT_SYMBOL(kmem_cache_name);
-static struct timer_list slob_timer = TIMER_INITIALIZER(
- (void (*)(unsigned long))slob_timer_cbk, 0, 0);
-
int kmem_cache_shrink(struct kmem_cache *d)
{
return 0;
@@ -399,17 +634,14 @@ int kmem_ptr_validate(struct kmem_cache *a, const void *b)
return 0;
}
-void __init kmem_cache_init(void)
+static unsigned int slob_ready __read_mostly;
+
+int slab_is_available(void)
{
- slob_timer_cbk();
+ return slob_ready;
}
-static void slob_timer_cbk(void)
+void __init kmem_cache_init(void)
{
- void *p = slob_alloc(PAGE_SIZE, 0, PAGE_SIZE-1);
-
- if (p)
- free_page((unsigned long)p);
-
- mod_timer(&slob_timer, jiffies + HZ);
+ slob_ready = 1;
}
diff --git a/mm/slub.c b/mm/slub.c
index e0cf6213abc..6aea48942c2 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -323,7 +323,11 @@ static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
/*
* Debug settings:
*/
+#ifdef CONFIG_SLUB_DEBUG_ON
+static int slub_debug = DEBUG_DEFAULT_FLAGS;
+#else
static int slub_debug;
+#endif
static char *slub_debug_slabs;
@@ -888,38 +892,57 @@ fail:
static int __init setup_slub_debug(char *str)
{
- if (!str || *str != '=')
- slub_debug = DEBUG_DEFAULT_FLAGS;
- else {
- str++;
- if (*str == 0 || *str == ',')
- slub_debug = DEBUG_DEFAULT_FLAGS;
- else
- for( ;*str && *str != ','; str++)
- switch (*str) {
- case 'f' : case 'F' :
- slub_debug |= SLAB_DEBUG_FREE;
- break;
- case 'z' : case 'Z' :
- slub_debug |= SLAB_RED_ZONE;
- break;
- case 'p' : case 'P' :
- slub_debug |= SLAB_POISON;
- break;
- case 'u' : case 'U' :
- slub_debug |= SLAB_STORE_USER;
- break;
- case 't' : case 'T' :
- slub_debug |= SLAB_TRACE;
- break;
- default:
- printk(KERN_ERR "slub_debug option '%c' "
- "unknown. skipped\n",*str);
- }
+ slub_debug = DEBUG_DEFAULT_FLAGS;
+ if (*str++ != '=' || !*str)
+ /*
+ * No options specified. Switch on full debugging.
+ */
+ goto out;
+
+ if (*str == ',')
+ /*
+ * No options but restriction on slabs. This means full
+ * debugging for slabs matching a pattern.
+ */
+ goto check_slabs;
+
+ slub_debug = 0;
+ if (*str == '-')
+ /*
+ * Switch off all debugging measures.
+ */
+ goto out;
+
+ /*
+ * Determine which debug features should be switched on
+ */
+ for ( ;*str && *str != ','; str++) {
+ switch (tolower(*str)) {
+ case 'f':
+ slub_debug |= SLAB_DEBUG_FREE;
+ break;
+ case 'z':
+ slub_debug |= SLAB_RED_ZONE;
+ break;
+ case 'p':
+ slub_debug |= SLAB_POISON;
+ break;
+ case 'u':
+ slub_debug |= SLAB_STORE_USER;
+ break;
+ case 't':
+ slub_debug |= SLAB_TRACE;
+ break;
+ default:
+ printk(KERN_ERR "slub_debug option '%c' "
+ "unknown. skipped\n",*str);
+ }
}
+check_slabs:
if (*str == ',')
slub_debug_slabs = str + 1;
+out:
return 1;
}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 5f7cf2a4cb5..925d5c50f18 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -21,7 +21,7 @@
/*
* swapper_space is a fiction, retained to simplify the path through
- * vmscan's shrink_list, to make sync_page look nicer, and to allow
+ * vmscan's shrink_page_list, to make sync_page look nicer, and to allow
* future use of radix_tree tags in the swap cache.
*/
static const struct address_space_operations swap_aops = {
diff --git a/mm/swapfile.c b/mm/swapfile.c
index acc172cbe3a..7ff0a81c7b0 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -885,7 +885,7 @@ static int try_to_unuse(unsigned int type)
/*
* So we could skip searching mms once swap count went
* to 1, we did not mark any present ptes as dirty: must
- * mark page dirty so shrink_list will preserve it.
+ * mark page dirty so shrink_page_list will preserve it.
*/
SetPageDirty(page);
unlock_page(page);
diff --git a/mm/truncate.c b/mm/truncate.c
index 4fbe1a2da5f..7c994f2d614 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -253,21 +253,8 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
}
EXPORT_SYMBOL(truncate_inode_pages);
-/**
- * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
- * @mapping: the address_space which holds the pages to invalidate
- * @start: the offset 'from' which to invalidate
- * @end: the offset 'to' which to invalidate (inclusive)
- *
- * This function only removes the unlocked pages, if you want to
- * remove all the pages of one inode, you must call truncate_inode_pages.
- *
- * invalidate_mapping_pages() will not block on IO activity. It will not
- * invalidate pages which are dirty, locked, under writeback or mapped into
- * pagetables.
- */
-unsigned long invalidate_mapping_pages(struct address_space *mapping,
- pgoff_t start, pgoff_t end)
+unsigned long __invalidate_mapping_pages(struct address_space *mapping,
+ pgoff_t start, pgoff_t end, bool be_atomic)
{
struct pagevec pvec;
pgoff_t next = start;
@@ -308,17 +295,38 @@ unlock:
break;
}
pagevec_release(&pvec);
+ if (likely(!be_atomic))
+ cond_resched();
}
return ret;
}
+
+/**
+ * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
+ * @mapping: the address_space which holds the pages to invalidate
+ * @start: the offset 'from' which to invalidate
+ * @end: the offset 'to' which to invalidate (inclusive)
+ *
+ * This function only removes the unlocked pages, if you want to
+ * remove all the pages of one inode, you must call truncate_inode_pages.
+ *
+ * invalidate_mapping_pages() will not block on IO activity. It will not
+ * invalidate pages which are dirty, locked, under writeback or mapped into
+ * pagetables.
+ */
+unsigned long invalidate_mapping_pages(struct address_space *mapping,
+ pgoff_t start, pgoff_t end)
+{
+ return __invalidate_mapping_pages(mapping, start, end, false);
+}
EXPORT_SYMBOL(invalidate_mapping_pages);
/*
* This is like invalidate_complete_page(), except it ignores the page's
* refcount. We do this because invalidate_inode_pages2() needs stronger
* invalidation guarantees, and cannot afford to leave pages behind because
- * shrink_list() has a temp ref on them, or because they're transiently sitting
- * in the lru_cache_add() pagevecs.
+ * shrink_page_list() has a temp ref on them, or because they're transiently
+ * sitting in the lru_cache_add() pagevecs.
*/
static int
invalidate_complete_page2(struct address_space *mapping, struct page *page)