diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/memory.c | 117 | ||||
-rw-r--r-- | mm/mempolicy.c | 4 | ||||
-rw-r--r-- | mm/nommu.c | 245 | ||||
-rw-r--r-- | mm/page_alloc.c | 749 | ||||
-rw-r--r-- | mm/shmem.c | 4 | ||||
-rw-r--r-- | mm/slab.c | 126 | ||||
-rw-r--r-- | mm/slob.c | 3 | ||||
-rw-r--r-- | mm/truncate.c | 25 | ||||
-rw-r--r-- | mm/vmalloc.c | 30 | ||||
-rw-r--r-- | mm/vmscan.c | 30 | ||||
-rw-r--r-- | mm/vmstat.c | 3 |
11 files changed, 1136 insertions, 200 deletions
diff --git a/mm/memory.c b/mm/memory.c index 92a3ebd8d79..601159a46ab 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2256,6 +2256,54 @@ oom: } /* + * do_no_pfn() tries to create a new page mapping for a page without + * a struct_page backing it + * + * As this is called only for pages that do not currently exist, we + * do not need to flush old virtual caches or the TLB. + * + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults), and pte mapped but not yet locked. + * We return with mmap_sem still held, but pte unmapped and unlocked. + * + * It is expected that the ->nopfn handler always returns the same pfn + * for a given virtual mapping. + * + * Mark this `noinline' to prevent it from bloating the main pagefault code. + */ +static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + int write_access) +{ + spinlock_t *ptl; + pte_t entry; + unsigned long pfn; + int ret = VM_FAULT_MINOR; + + pte_unmap(page_table); + BUG_ON(!(vma->vm_flags & VM_PFNMAP)); + BUG_ON(is_cow_mapping(vma->vm_flags)); + + pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK); + if (pfn == NOPFN_OOM) + return VM_FAULT_OOM; + if (pfn == NOPFN_SIGBUS) + return VM_FAULT_SIGBUS; + + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + + /* Only go through if we didn't race with anybody else... */ + if (pte_none(*page_table)) { + entry = pfn_pte(pfn, vma->vm_page_prot); + if (write_access) + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + set_pte_at(mm, address, page_table, entry); + } + pte_unmap_unlock(page_table, ptl); + return ret; +} + +/* * Fault of a previously existing named mapping. Repopulate the pte * from the encoded file_pte if possible. This enables swappable * nonlinear vmas. @@ -2317,11 +2365,17 @@ static inline int handle_pte_fault(struct mm_struct *mm, old_entry = entry = *pte; if (!pte_present(entry)) { if (pte_none(entry)) { - if (!vma->vm_ops || !vma->vm_ops->nopage) - return do_anonymous_page(mm, vma, address, - pte, pmd, write_access); - return do_no_page(mm, vma, address, - pte, pmd, write_access); + if (vma->vm_ops) { + if (vma->vm_ops->nopage) + return do_no_page(mm, vma, address, + pte, pmd, + write_access); + if (unlikely(vma->vm_ops->nopfn)) + return do_no_pfn(mm, vma, address, pte, + pmd, write_access); + } + return do_anonymous_page(mm, vma, address, + pte, pmd, write_access); } if (pte_file(entry)) return do_file_page(mm, vma, address, @@ -2550,3 +2604,56 @@ int in_gate_area_no_task(unsigned long addr) } #endif /* __HAVE_ARCH_GATE_AREA */ + +/* + * Access another process' address space. + * Source/target buffer must be kernel space, + * Do not walk the page table directly, use get_user_pages + */ +int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) +{ + struct mm_struct *mm; + struct vm_area_struct *vma; + struct page *page; + void *old_buf = buf; + + mm = get_task_mm(tsk); + if (!mm) + return 0; + + down_read(&mm->mmap_sem); + /* ignore errors, just check how much was sucessfully transfered */ + while (len) { + int bytes, ret, offset; + void *maddr; + + ret = get_user_pages(tsk, mm, addr, 1, + write, 1, &page, &vma); + if (ret <= 0) + break; + + bytes = len; + offset = addr & (PAGE_SIZE-1); + if (bytes > PAGE_SIZE-offset) + bytes = PAGE_SIZE-offset; + + maddr = kmap(page); + if (write) { + copy_to_user_page(vma, page, addr, + maddr + offset, buf, bytes); + set_page_dirty_lock(page); + } else { + copy_from_user_page(vma, page, addr, + buf, maddr + offset, bytes); + } + kunmap(page); + page_cache_release(page); + len -= bytes; + buf += bytes; + addr += bytes; + } + up_read(&mm->mmap_sem); + mmput(mm); + + return buf - old_buf; +} diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 38f89650bc8..cf18f094255 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1136,7 +1136,9 @@ static unsigned interleave_nodes(struct mempolicy *policy) */ unsigned slab_node(struct mempolicy *policy) { - switch (policy->policy) { + int pol = policy ? policy->policy : MPOL_DEFAULT; + + switch (pol) { case MPOL_INTERLEAVE: return interleave_nodes(policy); diff --git a/mm/nommu.c b/mm/nommu.c index d99dea31e44..56454066219 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -122,26 +122,50 @@ unsigned int kobjsize(const void *objp) } /* - * The nommu dodgy version :-) + * get a list of pages in an address range belonging to the specified process + * and indicate the VMA that covers each page + * - this is potentially dodgy as we may end incrementing the page count of a + * slab page or a secondary page from a compound page + * - don't permit access to VMAs that don't support it, such as I/O mappings */ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas) { + struct vm_area_struct *vma; + unsigned long vm_flags; int i; - static struct vm_area_struct dummy_vma; + + /* calculate required read or write permissions. + * - if 'force' is set, we only require the "MAY" flags. + */ + vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); + vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); for (i = 0; i < len; i++) { + vma = find_vma(mm, start); + if (!vma) + goto finish_or_fault; + + /* protect what we can, including chardevs */ + if (vma->vm_flags & (VM_IO | VM_PFNMAP) || + !(vm_flags & vma->vm_flags)) + goto finish_or_fault; + if (pages) { pages[i] = virt_to_page(start); if (pages[i]) page_cache_get(pages[i]); } if (vmas) - vmas[i] = &dummy_vma; + vmas[i] = vma; start += PAGE_SIZE; } - return(i); + + return i; + +finish_or_fault: + return i ? : -EFAULT; } EXPORT_SYMBOL(get_user_pages); @@ -286,6 +310,77 @@ static void show_process_blocks(void) } #endif /* DEBUG */ +/* + * add a VMA into a process's mm_struct in the appropriate place in the list + * - should be called with mm->mmap_sem held writelocked + */ +static void add_vma_to_mm(struct mm_struct *mm, struct vm_list_struct *vml) +{ + struct vm_list_struct **ppv; + + for (ppv = ¤t->mm->context.vmlist; *ppv; ppv = &(*ppv)->next) + if ((*ppv)->vma->vm_start > vml->vma->vm_start) + break; + + vml->next = *ppv; + *ppv = vml; +} + +/* + * look up the first VMA in which addr resides, NULL if none + * - should be called with mm->mmap_sem at least held readlocked + */ +struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) +{ + struct vm_list_struct *loop, *vml; + + /* search the vm_start ordered list */ + vml = NULL; + for (loop = mm->context.vmlist; loop; loop = loop->next) { + if (loop->vma->vm_start > addr) + break; + vml = loop; + } + + if (vml && vml->vma->vm_end > addr) + return vml->vma; + + return NULL; +} +EXPORT_SYMBOL(find_vma); + +/* + * find a VMA + * - we don't extend stack VMAs under NOMMU conditions + */ +struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) +{ + return find_vma(mm, addr); +} + +/* + * look up the first VMA exactly that exactly matches addr + * - should be called with mm->mmap_sem at least held readlocked + */ +static inline struct vm_area_struct *find_vma_exact(struct mm_struct *mm, + unsigned long addr) +{ + struct vm_list_struct *vml; + + /* search the vm_start ordered list */ + for (vml = mm->context.vmlist; vml; vml = vml->next) { + if (vml->vma->vm_start == addr) + return vml->vma; + if (vml->vma->vm_start > addr) + break; + } + + return NULL; +} + +/* + * find a VMA in the global tree + */ static inline struct vm_area_struct *find_nommu_vma(unsigned long start) { struct vm_area_struct *vma; @@ -305,6 +400,9 @@ static inline struct vm_area_struct *find_nommu_vma(unsigned long start) return NULL; } +/* + * add a VMA in the global tree + */ static void add_nommu_vma(struct vm_area_struct *vma) { struct vm_area_struct *pvma; @@ -351,6 +449,9 @@ static void add_nommu_vma(struct vm_area_struct *vma) rb_insert_color(&vma->vm_rb, &nommu_vma_tree); } +/* + * delete a VMA from the global list + */ static void delete_nommu_vma(struct vm_area_struct *vma) { struct address_space *mapping; @@ -828,8 +929,7 @@ unsigned long do_mmap_pgoff(struct file *file, realalloc += kobjsize(vml); askedalloc += sizeof(*vml); - vml->next = current->mm->context.vmlist; - current->mm->context.vmlist = vml; + add_vma_to_mm(current->mm, vml); up_write(&nommu_vma_sem); @@ -908,6 +1008,11 @@ static void put_vma(struct vm_area_struct *vma) } } +/* + * release a mapping + * - under NOMMU conditions the parameters must match exactly to the mapping to + * be removed + */ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) { struct vm_list_struct *vml, **parent; @@ -917,10 +1022,13 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) printk("do_munmap:\n"); #endif - for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next) + for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next) { + if ((*parent)->vma->vm_start > addr) + break; if ((*parent)->vma->vm_start == addr && ((len == 0) || ((*parent)->vma->vm_end == end))) goto found; + } printk("munmap of non-mmaped memory by process %d (%s): %p\n", current->pid, current->comm, (void *) addr); @@ -946,7 +1054,20 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) return 0; } -/* Release all mmaps. */ +asmlinkage long sys_munmap(unsigned long addr, size_t len) +{ + int ret; + struct mm_struct *mm = current->mm; + + down_write(&mm->mmap_sem); + ret = do_munmap(mm, addr, len); + up_write(&mm->mmap_sem); + return ret; +} + +/* + * Release all mappings + */ void exit_mmap(struct mm_struct * mm) { struct vm_list_struct *tmp; @@ -973,37 +1094,26 @@ void exit_mmap(struct mm_struct * mm) } } -asmlinkage long sys_munmap(unsigned long addr, size_t len) -{ - int ret; - struct mm_struct *mm = current->mm; - - down_write(&mm->mmap_sem); - ret = do_munmap(mm, addr, len); - up_write(&mm->mmap_sem); - return ret; -} - unsigned long do_brk(unsigned long addr, unsigned long len) { return -ENOMEM; } /* - * Expand (or shrink) an existing mapping, potentially moving it at the - * same time (controlled by the MREMAP_MAYMOVE flag and available VM space) + * expand (or shrink) an existing mapping, potentially moving it at the same + * time (controlled by the MREMAP_MAYMOVE flag and available VM space) * - * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise - * This option implies MREMAP_MAYMOVE. + * under NOMMU conditions, we only permit changing a mapping's size, and only + * as long as it stays within the hole allocated by the kmalloc() call in + * do_mmap_pgoff() and the block is not shareable * - * on uClinux, we only permit changing a mapping's size, and only as long as it stays within the - * hole allocated by the kmalloc() call in do_mmap_pgoff() and the block is not shareable + * MREMAP_FIXED is not supported under NOMMU conditions */ unsigned long do_mremap(unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) { - struct vm_list_struct *vml = NULL; + struct vm_area_struct *vma; /* insanity checks first */ if (new_len == 0) @@ -1012,58 +1122,46 @@ unsigned long do_mremap(unsigned long addr, if (flags & MREMAP_FIXED && new_addr != addr) return (unsigned long) -EINVAL; - for (vml = current->mm->context.vmlist; vml; vml = vml->next) - if (vml->vma->vm_start == addr) - goto found; - - return (unsigned long) -EINVAL; + vma = find_vma_exact(current->mm, addr); + if (!vma) + return (unsigned long) -EINVAL; - found: - if (vml->vma->vm_end != vml->vma->vm_start + old_len) + if (vma->vm_end != vma->vm_start + old_len) return (unsigned long) -EFAULT; - if (vml->vma->vm_flags & VM_MAYSHARE) + if (vma->vm_flags & VM_MAYSHARE) return (unsigned long) -EPERM; if (new_len > kobjsize((void *) addr)) return (unsigned long) -ENOMEM; /* all checks complete - do it */ - vml->vma->vm_end = vml->vma->vm_start + new_len; + vma->vm_end = vma->vm_start + new_len; askedalloc -= old_len; askedalloc += new_len; - return vml->vma->vm_start; + return vma->vm_start; } -/* - * Look up the first VMA which satisfies addr < vm_end, NULL if none - */ -struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) +asmlinkage unsigned long sys_mremap(unsigned long addr, + unsigned long old_len, unsigned long new_len, + unsigned long flags, unsigned long new_addr) { - struct vm_list_struct *vml; - - for (vml = mm->context.vmlist; vml; vml = vml->next) - if (addr >= vml->vma->vm_start && addr < vml->vma->vm_end) - return vml->vma; + unsigned long ret; - return NULL; + down_write(¤t->mm->mmap_sem); + ret = do_mremap(addr, old_len, new_len, flags, new_addr); + up_write(¤t->mm->mmap_sem); + return ret; } -EXPORT_SYMBOL(find_vma); - struct page *follow_page(struct vm_area_struct *vma, unsigned long address, unsigned int foll_flags) { return NULL; } -struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) -{ - return NULL; -} - int remap_pfn_range(struct vm_area_struct *vma, unsigned long from, unsigned long to, unsigned long size, pgprot_t prot) { @@ -1206,3 +1304,44 @@ struct page *filemap_nopage(struct vm_area_struct *area, BUG(); return NULL; } + +/* + * Access another process' address space. + * - source/target buffer must be kernel space + */ +int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) +{ + struct vm_area_struct *vma; + struct mm_struct *mm; + + if (addr + len < addr) + return 0; + + mm = get_task_mm(tsk); + if (!mm) + return 0; + + down_read(&mm->mmap_sem); + + /* the access must start within one of the target process's mappings */ + vma = find_vma(mm, addr); + if (vma) { + /* don't overrun this mapping */ + if (addr + len >= vma->vm_end) + len = vma->vm_end - addr; + + /* only read or write mappings where it is permitted */ + if (write && vma->vm_flags & VM_MAYWRITE) + len -= copy_to_user((void *) addr, buf, len); + else if (!write && vma->vm_flags & VM_MAYREAD) + len -= copy_from_user(buf, (void *) addr, len); + else + len = 0; + } else { + len = 0; + } + + up_read(&mm->mmap_sem); + mmput(mm); + return len; +} diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9810f0a60db..4f59d90b81e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -37,6 +37,8 @@ #include <linux/vmalloc.h> #include <linux/mempolicy.h> #include <linux/stop_machine.h> +#include <linux/sort.h> +#include <linux/pfn.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -102,6 +104,38 @@ int min_free_kbytes = 1024; unsigned long __meminitdata nr_kernel_pages; unsigned long __meminitdata nr_all_pages; +static unsigned long __initdata dma_reserve; + +#ifdef CONFIG_ARCH_POPULATES_NODE_MAP + /* + * MAX_ACTIVE_REGIONS determines the maxmimum number of distinct + * ranges of memory (RAM) that may be registered with add_active_range(). + * Ranges passed to add_active_range() will be merged if possible + * so the number of times add_active_range() can be called is + * related to the number of nodes and the number of holes + */ + #ifdef CONFIG_MAX_ACTIVE_REGIONS + /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */ + #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS + #else + #if MAX_NUMNODES >= 32 + /* If there can be many nodes, allow up to 50 holes per node */ + #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50) + #else + /* By default, allow up to 256 distinct regions */ + #define MAX_ACTIVE_REGIONS 256 + #endif + #endif + + struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS]; + int __initdata nr_nodemap_entries; + unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; + unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; +#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE + unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES]; + unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES]; +#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ +#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ #ifdef CONFIG_DEBUG_VM static int page_outside_zone_boundaries(struct zone *zone, struct page *page) @@ -908,7 +942,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, */ do { zone = *z; - if (unlikely((gfp_mask & __GFP_THISNODE) && + if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) && zone->zone_pgdat != zonelist->zones[0]->zone_pgdat)) break; if ((alloc_flags & ALLOC_CPUSET) && @@ -1222,14 +1256,12 @@ unsigned int nr_free_pagecache_pages(void) { return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER)); } -#ifdef CONFIG_NUMA -static void show_node(struct zone *zone) + +static inline void show_node(struct zone *zone) { - printk("Node %ld ", zone_to_nid(zone)); + if (NUMA_BUILD) + printk("Node %ld ", zone_to_nid(zone)); } -#else -#define show_node(zone) do { } while (0) -#endif void si_meminfo(struct sysinfo *val) { @@ -1271,34 +1303,30 @@ void si_meminfo_node(struct sysinfo *val, int nid) */ void show_free_areas(void) { - int cpu, temperature; + int cpu; unsigned long active; unsigned long inactive; unsigned long free; struct zone *zone; for_each_zone(zone) { - show_node(zone); - printk("%s per-cpu:", zone->name); - - if (!populated_zone(zone)) { - printk(" empty\n"); + if (!populated_zone(zone)) continue; - } else - printk("\n"); + + show_node(zone); + printk("%s per-cpu:\n", zone->name); for_each_online_cpu(cpu) { struct per_cpu_pageset *pageset; pageset = zone_pcp(zone, cpu); - for (temperature = 0; temperature < 2; temperature++) - printk("cpu %d %s: high %d, batch %d used:%d\n", - cpu, - temperature ? "cold" : "hot", - pageset->pcp[temperature].high, - pageset->pcp[temperature].batch, - pageset->pcp[temperature].count); + printk("CPU %4d: Hot: hi:%5d, btch:%4d usd:%4d " + "Cold: hi:%5d, btch:%4d usd:%4d\n", + cpu, pageset->pcp[0].high, + pageset->pcp[0].batch, pageset->pcp[0].count, + pageset->pcp[1].high, pageset->pcp[1].batch, + pageset->pcp[1].count); } } @@ -1320,6 +1348,9 @@ void show_free_areas(void) for_each_zone(zone) { int i; + if (!populated_zone(zone)) + continue; + show_node(zone); printk("%s" " free:%lukB" @@ -1352,12 +1383,11 @@ void show_free_areas(void) for_each_zone(zone) { unsigned long nr[MAX_ORDER], flags, order, total = 0; + if (!populated_zone(zone)) + continue; + show_node(zone); printk("%s: ", zone->name); - if (!populated_zone(zone)) { - printk("empty\n"); - continue; - } spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < MAX_ORDER; order++) { @@ -1561,7 +1591,7 @@ static int __meminit __build_all_zonelists(void *dummy) void __meminit build_all_zonelists(void) { if (system_state == SYSTEM_BOOTING) { - __build_all_zonelists(0); + __build_all_zonelists(NULL); cpuset_init_current_mems_allowed(); } else { /* we have to stop all cpus to guaranntee there is no user @@ -1642,25 +1672,6 @@ static inline unsigned long wait_table_bits(unsigned long size) #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) -static void __init calculate_zone_totalpages(struct pglist_data *pgdat, - unsigned long *zones_size, unsigned long *zholes_size) -{ - unsigned long realtotalpages, totalpages = 0; - enum zone_type i; - - for (i = 0; i < MAX_NR_ZONES; i++) - totalpages += zones_size[i]; - pgdat->node_spanned_pages = totalpages; - - realtotalpages = totalpages; - if (zholes_size) - for (i = 0; i < MAX_NR_ZONES; i++) - realtotalpages -= zholes_size[i]; - pgdat->node_present_pages = realtotalpages; - printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); -} - - /* * Initially all pages are reserved - free ones are freed * up by free_all_bootmem() once the early boot process is @@ -1818,6 +1829,9 @@ static int __cpuinit process_zones(int cpu) for_each_zone(zone) { + if (!populated_zone(zone)) + continue; + zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), GFP_KERNEL, cpu_to_node(cpu)); if (!zone_pcp(zone, cpu)) @@ -1977,6 +1991,366 @@ __meminit int init_currently_empty_zone(struct zone *zone, return 0; } +#ifdef CONFIG_ARCH_POPULATES_NODE_MAP +/* + * Basic iterator support. Return the first range of PFNs for a node + * Note: nid == MAX_NUMNODES returns first region regardless of node + */ +static int __init first_active_region_index_in_nid(int nid) +{ + int i; + + for (i = 0; i < nr_nodemap_entries; i++) + if (nid == MAX_NUMNODES || early_node_map[i].nid == nid) + return i; + + return -1; +} + +/* + * Basic iterator support. Return the next active range of PFNs for a node + * Note: nid == MAX_NUMNODES returns next region regardles of node + */ +static int __init next_active_region_index_in_nid(int index, int nid) +{ + for (index = index + 1; index < nr_nodemap_entries; index++) + if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) + return index; + + return -1; +} + +#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID +/* + * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. + * Architectures may implement their own version but if add_active_range() + * was used and there are no special requirements, this is a convenient + * alternative + */ +int __init early_pfn_to_nid(unsigned long pfn) +{ + int i; + + for (i = 0; i < nr_nodemap_entries; i++) { + unsigned long start_pfn = early_node_map[i].start_pfn; + unsigned long end_pfn = early_node_map[i].end_pfn; + + if (start_pfn <= pfn && pfn < end_pfn) + return early_node_map[i].nid; + } + + return 0; +} +#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ + +/* Basic iterator support to walk early_node_map[] */ +#define for_each_active_range_index_in_nid(i, nid) \ + for (i = first_active_region_index_in_nid(nid); i != -1; \ + i = next_active_region_index_in_nid(i, nid)) + +/** + * free_bootmem_with_active_regions - Call free_bootmem_node for each active range + * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed + * @max_low_pfn: The highest PFN that till be passed to free_bootmem_node + * + * If an architecture guarantees that all ranges registered with + * add_active_ranges() contain no holes and may be freed, this + * this function may be used instead of calling free_bootmem() manually. + */ +void __init free_bootmem_with_active_regions(int nid, + unsigned long max_low_pfn) +{ + int i; + + for_each_active_range_index_in_nid(i, nid) { + unsigned long size_pages = 0; + unsigned long end_pfn = early_node_map[i].end_pfn; + + if (early_node_map[i].start_pfn >= max_low_pfn) + continue; + + if (end_pfn > max_low_pfn) + end_pfn = max_low_pfn; + + size_pages = end_pfn - early_node_map[i].start_pfn; + free_bootmem_node(NODE_DATA(early_node_map[i].nid), + PFN_PHYS(early_node_map[i].start_pfn), + size_pages << PAGE_SHIFT); + } +} + +/** + * sparse_memory_present_with_active_regions - Call memory_present for each active range + * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used + * + * If an architecture guarantees that all ranges registered with + * add_active_ranges() contain no holes and may be freed, this + * this function may be used instead of calling memory_present() manually. + */ +void __init sparse_memory_present_with_active_regions(int nid) +{ + int i; + + for_each_active_range_index_in_nid(i, nid) + memory_present(early_node_map[i].nid, + early_node_map[i].start_pfn, + early_node_map[i].end_pfn); +} + +/** + * push_node_boundaries - Push node boundaries to at least the requested boundary + * @nid: The nid of the node to push the boundary for + * @start_pfn: The start pfn of the node + * @end_pfn: The end pfn of the node + * + * In reserve-based hot-add, mem_map is allocated that is unused until hotadd + * time. Specifically, on x86_64, SRAT will report ranges that can potentially + * be hotplugged even though no physical memory exists. This function allows + * an arch to push out the node boundaries so mem_map is allocated that can + * be used later. + */ +#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE +void __init push_node_boundaries(unsigned int nid, + unsigned long start_pfn, unsigned long end_pfn) +{ + printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n", + nid, start_pfn, end_pfn); + + /* Initialise the boundary for this node if necessary */ + if (node_boundary_end_pfn[nid] == 0) + node_boundary_start_pfn[nid] = -1UL; + + /* Update the boundaries */ + if (node_boundary_start_pfn[nid] > start_pfn) + node_boundary_start_pfn[nid] = start_pfn; + if (node_boundary_end_pfn[nid] < end_pfn) + node_boundary_end_pfn[nid] = end_pfn; +} + +/* If necessary, push the node boundary out for reserve hotadd */ +static void __init account_node_boundary(unsigned int nid, + unsigned long *start_pfn, unsigned long *end_pfn) +{ + printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n", + nid, *start_pfn, *end_pfn); + + /* Return if boundary information has not been provided */ + if (node_boundary_end_pfn[nid] == 0) + return; + + /* Check the boundaries and update if necessary */ + if (node_boundary_start_pfn[nid] < *start_pfn) + *start_pfn = node_boundary_start_pfn[nid]; + if (node_boundary_end_pfn[nid] > *end_pfn) + *end_pfn = node_boundary_end_pfn[nid]; +} +#else +void __init push_node_boundaries(unsigned int nid, + unsigned long start_pfn, unsigned long end_pfn) {} + +static void __init account_node_boundary(unsigned int nid, + unsigned long *start_pfn, unsigned long *end_pfn) {} +#endif + + +/** + * get_pfn_range_for_nid - Return the start and end page frames for a node + * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned + * @start_pfn: Passed by reference. On return, it will have the node start_pfn + * @end_pfn: Passed by reference. On return, it will have the node end_pfn + * + * It returns the start and end page frame of a node based on information + * provided by an arch calling add_active_range(). If called for a node + * with no available memory, a warning is printed and the start and end + * PFNs will be 0 + */ +void __init get_pfn_range_for_nid(unsigned int nid, + unsigned long *start_pfn, unsigned long *end_pfn) +{ + int i; + *start_pfn = -1UL; + *end_pfn = 0; + + for_each_active_range_index_in_nid(i, nid) { + *start_pfn = min(*start_pfn, early_node_map[i].start_pfn); + *end_pfn = max(*end_pfn, early_node_map[i].end_pfn); + } + + if (*start_pfn == -1UL) { + printk(KERN_WARNING "Node %u active with no memory\n", nid); + *start_pfn = 0; + } + + /* Push the node boundaries out if requested */ + account_node_boundary(nid, start_pfn, end_pfn); +} + +/* + * Return the number of pages a zone spans in a node, including holes + * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() + */ +unsigned long __init zone_spanned_pages_in_node(int nid, + unsigned long zone_type, + unsigned long *ignored) +{ + unsigned long node_start_pfn, node_end_pfn; + unsigned long zone_start_pfn, zone_end_pfn; + + /* Get the start and end of the node and zone */ + get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); + zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; + zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; + + /* Check that this node has pages within the zone's required range */ + if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) + return 0; + + /* Move the zone boundaries inside the node if necessary */ + zone_end_pfn = min(zone_end_pfn, node_end_pfn); + zone_start_pfn = max(zone_start_pfn, node_start_pfn); + + /* Return the spanned pages */ + return zone_end_pfn - zone_start_pfn; +} + +/* + * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, + * then all holes in the requested range will be accounted for + */ +unsigned long __init __absent_pages_in_range(int nid, + unsigned long range_start_pfn, + unsigned long range_end_pfn) +{ + int i = 0; + unsigned long prev_end_pfn = 0, hole_pages = 0; + unsigned long start_pfn; + + /* Find the end_pfn of the first active range of pfns in the node */ + i = first_active_region_index_in_nid(nid); + if (i == -1) + return 0; + + /* Account for ranges before physical memory on this node */ + if (early_node_map[i].start_pfn > range_start_pfn) + hole_pages = early_node_map[i].start_pfn - range_start_pfn; + + prev_end_pfn = early_node_map[i].start_pfn; + + /* Find all holes for the zone within the node */ + for (; i != -1; i = next_active_region_index_in_nid(i, nid)) { + + /* No need to continue if prev_end_pfn is outside the zone */ + if (prev_end_pfn >= range_end_pfn) + break; + + /* Make sure the end of the zone is not within the hole */ + start_pfn = min(early_node_map[i].start_pfn, range_end_pfn); + prev_end_pfn = max(prev_end_pfn, range_start_pfn); + + /* Update the hole size cound and move on */ + if (start_pfn > range_start_pfn) { + BUG_ON(prev_end_pfn > start_pfn); + hole_pages += start_pfn - prev_end_pfn; + } + prev_end_pfn = early_node_map[i].end_pfn; + } + + /* Account for ranges past physical memory on this node */ + if (range_end_pfn > prev_end_pfn) + hole_pages = range_end_pfn - + max(range_start_pfn, prev_end_pfn); + + return hole_pages; +} + +/** + * absent_pages_in_range - Return number of page frames in holes within a range + * @start_pfn: The start PFN to start searching for holes + * @end_pfn: The end PFN to stop searching for holes + * + * It returns the number of pages frames in memory holes within a range + */ +unsigned long __init absent_pages_in_range(unsigned long start_pfn, + unsigned long end_pfn) +{ + return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); +} + +/* Return the number of page frames in holes in a zone on a node */ +unsigned long __init zone_absent_pages_in_node(int nid, + unsigned long zone_type, + unsigned long *ignored) +{ + unsigned long node_start_pfn, node_end_pfn; + unsigned long zone_start_pfn, zone_end_pfn; + + get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); + zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type], + node_start_pfn); + zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type], + node_end_pfn); + + return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); +} + +/* Return the zone index a PFN is in */ +int memmap_zone_idx(struct page *lmem_map) +{ + int i; + unsigned long phys_addr = virt_to_phys(lmem_map); + unsigned long pfn = phys_addr >> PAGE_SHIFT; + + for (i = 0; i < MAX_NR_ZONES; i++) + if (pfn < arch_zone_highest_possible_pfn[i]) + break; + + return i; +} +#else +static inline unsigned long zone_spanned_pages_in_node(int nid, + unsigned long zone_type, + unsigned long *zones_size) +{ + return zones_size[zone_type]; +} + +static inline unsigned long zone_absent_pages_in_node(int nid, + unsigned long zone_type, + unsigned long *zholes_size) +{ + if (!zholes_size) + return 0; + + return zholes_size[zone_type]; +} + +static inline int memmap_zone_idx(struct page *lmem_map) +{ + return MAX_NR_ZONES; +} +#endif + +static void __init calculate_node_totalpages(struct pglist_data *pgdat, + unsigned long *zones_size, unsigned long *zholes_size) +{ + unsigned long realtotalpages, totalpages = 0; + enum zone_type i; + + for (i = 0; i < MAX_NR_ZONES; i++) + totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, + zones_size); + pgdat->node_spanned_pages = totalpages; + + realtotalpages = totalpages; + for (i = 0; i < MAX_NR_ZONES; i++) + realtotalpages -= + zone_absent_pages_in_node(pgdat->node_id, i, + zholes_size); + pgdat->node_present_pages = realtotalpages; + printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, + realtotalpages); +} + /* * Set up the zone data structures: * - mark all pages reserved @@ -1998,11 +2372,34 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; - unsigned long size, realsize; + unsigned long size, realsize, memmap_pages; - realsize = size = zones_size[j]; - if (zholes_size) - realsize -= zholes_size[j]; + size = zone_spanned_pages_in_node(nid, j, zones_size); + realsize = size - zone_absent_pages_in_node(nid, j, + zholes_size); + + /* + * Adjust realsize so that it accounts for how much memory + * is used by this zone for memmap. This affects the watermark + * and per-cpu initialisations + */ + memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT; + if (realsize >= memmap_pages) { + realsize -= memmap_pages; + printk(KERN_DEBUG + " %s zone: %lu pages used for memmap\n", + zone_names[j], memmap_pages); + } else + printk(KERN_WARNING + " %s zone: %lu pages exceeds realsize %lu\n", + zone_names[j], memmap_pages, realsize); + + /* Account for reserved DMA pages */ + if (j == ZONE_DMA && realsize > dma_reserve) { + realsize -= dma_reserve; + printk(KERN_DEBUG " DMA zone: %lu pages reserved\n", + dma_reserve); + } if (!is_highmem_idx(j)) nr_kernel_pages += realsize; @@ -2011,6 +2408,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, zone->spanned_pages = size; zone->present_pages = realsize; #ifdef CONFIG_NUMA + zone->node = nid; zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) / 100; zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; @@ -2073,8 +2471,13 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat) /* * With no DISCONTIG, the global mem_map is just set as node 0's */ - if (pgdat == NODE_DATA(0)) + if (pgdat == NODE_DATA(0)) { mem_map = NODE_DATA(0)->node_mem_map; +#ifdef CONFIG_ARCH_POPULATES_NODE_MAP + if (page_to_pfn(mem_map) != pgdat->node_start_pfn) + mem_map -= pgdat->node_start_pfn; +#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ + } #endif #endif /* CONFIG_FLAT_NODE_MEM_MAP */ } @@ -2085,13 +2488,255 @@ void __meminit free_area_init_node(int nid, struct pglist_data *pgdat, { pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; - calculate_zone_totalpages(pgdat, zones_size, zholes_size); + calculate_node_totalpages(pgdat, zones_size, zholes_size); alloc_node_mem_map(pgdat); free_area_init_core(pgdat, zones_size, zholes_size); } +#ifdef CONFIG_ARCH_POPULATES_NODE_MAP +/** + * add_active_range - Register a range of PFNs backed by physical memory + * @nid: The node ID the range resides on + * @start_pfn: The start PFN of the available physical memory + * @end_pfn: The end PFN of the available physical memory + * + * These ranges are stored in an early_node_map[] and later used by + * free_area_init_nodes() to calculate zone sizes and holes. If the + * range spans a memory hole, it is up to the architecture to ensure + * the memory is not freed by the bootmem allocator. If possible + * the range being registered will be merged with existing ranges. + */ +void __init add_active_range(unsigned int nid, unsigned long start_pfn, + unsigned long end_pfn) +{ + int i; + + printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) " + "%d entries of %d used\n", + nid, start_pfn, end_pfn, + nr_nodemap_entries, MAX_ACTIVE_REGIONS); + + /* Merge with existing active regions if possible */ + for (i = 0; i < nr_nodemap_entries; i++) { + if (early_node_map[i].nid != nid) + continue; + + /* Skip if an existing region covers this new one */ + if (start_pfn >= early_node_map[i].start_pfn && + end_pfn <= early_node_map[i].end_pfn) + return; + + /* Merge forward if suitable */ + if (start_pfn <= early_node_map[i].end_pfn && + end_pfn > early_node_map[i].end_pfn) { + early_node_map[i].end_pfn = end_pfn; + return; + } + + /* Merge backward if suitable */ + if (start_pfn < early_node_map[i].end_pfn && + end_pfn >= early_node_map[i].start_pfn) { + early_node_map[i].start_pfn = start_pfn; + return; + } + } + + /* Check that early_node_map is large enough */ + if (i >= MAX_ACTIVE_REGIONS) { + printk(KERN_CRIT "More than %d memory regions, truncating\n", + MAX_ACTIVE_REGIONS); + return; + } + + early_node_map[i].nid = nid; + early_node_map[i].start_pfn = start_pfn; + early_node_map[i].end_pfn = end_pfn; + nr_nodemap_entries = i + 1; +} + +/** + * shrink_active_range - Shrink an existing registered range of PFNs + * @nid: The node id the range is on that should be shrunk + * @old_end_pfn: The old end PFN of the range + * @new_end_pfn: The new PFN of the range + * + * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node. + * The map is kept at the end physical page range that has already been + * registered with add_active_range(). This function allows an arch to shrink + * an existing registered range. + */ +void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn, + unsigned long new_end_pfn) +{ + int i; + + /* Find the old active region end and shrink */ + for_each_active_range_index_in_nid(i, nid) + if (early_node_map[i].end_pfn == old_end_pfn) { + early_node_map[i].end_pfn = new_end_pfn; + break; + } +} + +/** + * remove_all_active_ranges - Remove all currently registered regions + * During discovery, it may be found that a table like SRAT is invalid + * and an alternative discovery method must be used. This function removes + * all currently registered regions. + */ +void __init remove_all_active_ranges() +{ + memset(early_node_map, 0, sizeof(early_node_map)); + nr_nodemap_entries = 0; +#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE + memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn)); + memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn)); +#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ +} + +/* Compare two active node_active_regions */ +static int __init cmp_node_active_region(const void *a, const void *b) +{ + struct node_active_region *arange = (struct node_active_region *)a; + struct node_active_region *brange = (struct node_active_region *)b; + + /* Done this way to avoid overflows */ + if (arange->start_pfn > brange->start_pfn) + return 1; + if (arange->start_pfn < brange->start_pfn) + return -1; + + return 0; +} + +/* sort the node_map by start_pfn */ +static void __init sort_node_map(void) +{ + sort(early_node_map, (size_t)nr_nodemap_entries, + sizeof(struct node_active_region), + cmp_node_active_region, NULL); +} + +/* Find the lowest pfn for a node. This depends on a sorted early_node_map */ +unsigned long __init find_min_pfn_for_node(unsigned long nid) +{ + int i; + + /* Assuming a sorted map, the first range found has the starting pfn */ + for_each_active_range_index_in_nid(i, nid) + return early_node_map[i].start_pfn; + + printk(KERN_WARNING "Could not find start_pfn for node %lu\n", nid); + return 0; +} + +/** + * find_min_pfn_with_active_regions - Find the minimum PFN registered + * + * It returns the minimum PFN based on information provided via + * add_active_range() + */ +unsigned long __init find_min_pfn_with_active_regions(void) +{ + return find_min_pfn_for_node(MAX_NUMNODES); +} + +/** + * find_max_pfn_with_active_regions - Find the maximum PFN registered + * + * It returns the maximum PFN based on information provided via + * add_active_range() + */ +unsigned long __init find_max_pfn_with_active_regions(void) +{ + int i; + unsigned long max_pfn = 0; + + for (i = 0; i < nr_nodemap_entries; i++) + max_pfn = max(max_pfn, early_node_map[i].end_pfn); + + return max_pfn; +} + +/** + * free_area_init_nodes - Initialise all pg_data_t and zone data + * @arch_max_dma_pfn: The maximum PFN usable for ZONE_DMA + * @arch_max_dma32_pfn: The maximum PFN usable for ZONE_DMA32 + * @arch_max_low_pfn: The maximum PFN usable for ZONE_NORMAL + * @arch_max_high_pfn: The maximum PFN usable for ZONE_HIGHMEM + * + * This will call free_area_init_node() for each active node in the system. + * Using the page ranges provided by add_active_range(), the size of each + * zone in each node and their holes is calculated. If the maximum PFN + * between two adjacent zones match, it is assumed that the zone is empty. + * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed + * that arch_max_dma32_pfn has no pages. It is also assumed that a zone + * starts where the previous one ended. For example, ZONE_DMA32 starts + * at arch_max_dma_pfn. + */ +void __init free_area_init_nodes(unsigned long *max_zone_pfn) +{ + unsigned long nid; + enum zone_type i; + + /* Record where the zone boundaries are */ + memset(arch_zone_lowest_possible_pfn, 0, + sizeof(arch_zone_lowest_possible_pfn)); + memset(arch_zone_highest_possible_pfn, 0, + sizeof(arch_zone_highest_possible_pfn)); + arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); + arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; + for (i = 1; i < MAX_NR_ZONES; i++) { + arch_zone_lowest_possible_pfn[i] = + arch_zone_highest_possible_pfn[i-1]; + arch_zone_highest_possible_pfn[i] = + max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); + } + + /* Regions in the early_node_map can be in any order */ + sort_node_map(); + + /* Print out the zone ranges */ + printk("Zone PFN ranges:\n"); + for (i = 0; i < MAX_NR_ZONES; i++) + printk(" %-8s %8lu -> %8lu\n", + zone_names[i], + arch_zone_lowest_possible_pfn[i], + arch_zone_highest_possible_pfn[i]); + + /* Print out the early_node_map[] */ + printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries); + for (i = 0; i < nr_nodemap_entries; i++) + printk(" %3d: %8lu -> %8lu\n", early_node_map[i].nid, + early_node_map[i].start_pfn, + early_node_map[i].end_pfn); + + /* Initialise every node */ + for_each_online_node(nid) { + pg_data_t *pgdat = NODE_DATA(nid); + free_area_init_node(nid, pgdat, NULL, + find_min_pfn_for_node(nid), NULL); + } +} +#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ + +/** + * set_dma_reserve - Account the specified number of pages reserved in ZONE_DMA + * @new_dma_reserve - The number of pages to mark reserved + * + * The per-cpu batchsize and zone watermarks are determined by present_pages. + * In the DMA zone, a significant percentage may be consumed by kernel image + * and other unfreeable allocations which can skew the watermarks badly. This + * function may optionally be used to account for unfreeable pages in + * ZONE_DMA. The effect will be lower watermarks and smaller per-cpu batchsize + */ +void __init set_dma_reserve(unsigned long new_dma_reserve) +{ + dma_reserve = new_dma_reserve; +} + #ifndef CONFIG_NEED_MULTIPLE_NODES static bootmem_data_t contig_bootmem_data; struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; diff --git a/mm/shmem.c b/mm/shmem.c index 8631be45b40..eda907c3a86 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1351,7 +1351,6 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) inode->i_mode = mode; inode->i_uid = current->fsuid; inode->i_gid = current->fsgid; - inode->i_blksize = PAGE_CACHE_SIZE; inode->i_blocks = 0; inode->i_mapping->a_ops = &shmem_aops; inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; @@ -2157,8 +2156,7 @@ static int init_inodecache(void) static void destroy_inodecache(void) { - if (kmem_cache_destroy(shmem_inode_cachep)) - printk(KERN_INFO "shmem_inode_cache: not all structures were freed\n"); + kmem_cache_destroy(shmem_inode_cachep); } static const struct address_space_operations shmem_aops = { diff --git a/mm/slab.c b/mm/slab.c index 7a48eb1a60c..792bfe320a8 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -972,7 +972,39 @@ static int transfer_objects(struct array_cache *to, return nr; } -#ifdef CONFIG_NUMA +#ifndef CONFIG_NUMA + +#define drain_alien_cache(cachep, alien) do { } while (0) +#define reap_alien(cachep, l3) do { } while (0) + +static inline struct array_cache **alloc_alien_cache(int node, int limit) +{ + return (struct array_cache **)BAD_ALIEN_MAGIC; +} + +static inline void free_alien_cache(struct array_cache **ac_ptr) +{ +} + +static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) +{ + return 0; +} + +static inline void *alternate_node_alloc(struct kmem_cache *cachep, + gfp_t flags) +{ + return NULL; +} + +static inline void *__cache_alloc_node(struct kmem_cache *cachep, + gfp_t flags, int nodeid) +{ + return NULL; +} + +#else /* CONFIG_NUMA */ + static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int); static void *alternate_node_alloc(struct kmem_cache *, gfp_t); @@ -1101,26 +1133,6 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) } return 1; } - -#else - -#define drain_alien_cache(cachep, alien) do { } while (0) -#define reap_alien(cachep, l3) do { } while (0) - -static inline struct array_cache **alloc_alien_cache(int node, int limit) -{ - return (struct array_cache **)BAD_ALIEN_MAGIC; -} - -static inline void free_alien_cache(struct array_cache **ac_ptr) -{ -} - -static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) -{ - return 0; -} - #endif static int __cpuinit cpuup_callback(struct notifier_block *nfb, @@ -1564,7 +1576,13 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) */ flags |= __GFP_COMP; #endif - flags |= cachep->gfpflags; + + /* + * Under NUMA we want memory on the indicated node. We will handle + * the needed fallback ourselves since we want to serve from our + * per node object lists first for other nodes. + */ + flags |= cachep->gfpflags | GFP_THISNODE; page = alloc_pages_node(nodeid, flags, cachep->gfporder); if (!page) @@ -2442,7 +2460,6 @@ EXPORT_SYMBOL(kmem_cache_shrink); * @cachep: the cache to destroy * * Remove a struct kmem_cache object from the slab cache. - * Returns 0 on success. * * It is expected this function will be called by a module when it is * unloaded. This will remove the cache completely, and avoid a duplicate @@ -2454,7 +2471,7 @@ EXPORT_SYMBOL(kmem_cache_shrink); * The caller must guarantee that noone will allocate memory from the cache * during the kmem_cache_destroy(). */ -int kmem_cache_destroy(struct kmem_cache *cachep) +void kmem_cache_destroy(struct kmem_cache *cachep) { BUG_ON(!cachep || in_interrupt()); @@ -2475,7 +2492,7 @@ int kmem_cache_destroy(struct kmem_cache *cachep) list_add(&cachep->next, &cache_chain); mutex_unlock(&cache_chain_mutex); unlock_cpu_hotplug(); - return 1; + return; } if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) @@ -2483,7 +2500,6 @@ int kmem_cache_destroy(struct kmem_cache *cachep) __kmem_cache_destroy(cachep); unlock_cpu_hotplug(); - return 0; } EXPORT_SYMBOL(kmem_cache_destroy); @@ -3030,14 +3046,6 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) void *objp; struct array_cache *ac; -#ifdef CONFIG_NUMA - if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) { - objp = alternate_node_alloc(cachep, flags); - if (objp != NULL) - return objp; - } -#endif - check_irq_off(); ac = cpu_cache_get(cachep); if (likely(ac->avail)) { @@ -3055,12 +3063,24 @@ static __always_inline void *__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) { unsigned long save_flags; - void *objp; + void *objp = NULL; cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); - objp = ____cache_alloc(cachep, flags); + + if (unlikely(NUMA_BUILD && + current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) + objp = alternate_node_alloc(cachep, flags); + + if (!objp) + objp = ____cache_alloc(cachep, flags); + /* + * We may just have run out of memory on the local node. + * __cache_alloc_node() knows how to locate memory on other nodes + */ + if (NUMA_BUILD && !objp) + objp = __cache_alloc_node(cachep, flags, numa_node_id()); local_irq_restore(save_flags); objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); @@ -3079,7 +3099,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) { int nid_alloc, nid_here; - if (in_interrupt()) + if (in_interrupt() || (flags & __GFP_THISNODE)) return NULL; nid_alloc = nid_here = numa_node_id(); if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) @@ -3092,6 +3112,28 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) } /* + * Fallback function if there was no memory available and no objects on a + * certain node and we are allowed to fall back. We mimick the behavior of + * the page allocator. We fall back according to a zonelist determined by + * the policy layer while obeying cpuset constraints. + */ +void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) +{ + struct zonelist *zonelist = &NODE_DATA(slab_node(current->mempolicy)) + ->node_zonelists[gfp_zone(flags)]; + struct zone **z; + void *obj = NULL; + + for (z = zonelist->zones; *z && !obj; z++) + if (zone_idx(*z) <= ZONE_NORMAL && + cpuset_zone_allowed(*z, flags)) + obj = __cache_alloc_node(cache, + flags | __GFP_THISNODE, + zone_to_nid(*z)); + return obj; +} + +/* * A interface to enable slab creation on nodeid */ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, @@ -3144,11 +3186,15 @@ retry: must_grow: spin_unlock(&l3->list_lock); x = cache_grow(cachep, flags, nodeid); + if (x) + goto retry; - if (!x) - return NULL; + if (!(flags & __GFP_THISNODE)) + /* Unable to grow the cache. Fall back to other nodes. */ + return fallback_alloc(cachep, flags); + + return NULL; - goto retry; done: return obj; } diff --git a/mm/slob.c b/mm/slob.c index 20188627347..542394184a5 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -270,10 +270,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, } EXPORT_SYMBOL(kmem_cache_create); -int kmem_cache_destroy(struct kmem_cache *c) +void kmem_cache_destroy(struct kmem_cache *c) { slob_free(c, sizeof(struct kmem_cache)); - return 0; } EXPORT_SYMBOL(kmem_cache_destroy); diff --git a/mm/truncate.c b/mm/truncate.c index c6ab55ec688..a654928323d 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -9,6 +9,7 @@ #include <linux/kernel.h> #include <linux/mm.h> +#include <linux/swap.h> #include <linux/module.h> #include <linux/pagemap.h> #include <linux/pagevec.h> @@ -52,36 +53,26 @@ truncate_complete_page(struct address_space *mapping, struct page *page) /* * This is for invalidate_inode_pages(). That function can be called at * any time, and is not supposed to throw away dirty pages. But pages can - * be marked dirty at any time too. So we re-check the dirtiness inside - * ->tree_lock. That provides exclusion against the __set_page_dirty - * functions. + * be marked dirty at any time too, so use remove_mapping which safely + * discards clean, unused pages. * * Returns non-zero if the page was successfully invalidated. */ static int invalidate_complete_page(struct address_space *mapping, struct page *page) { + int ret; + if (page->mapping != mapping) return 0; if (PagePrivate(page) && !try_to_release_page(page, 0)) return 0; - write_lock_irq(&mapping->tree_lock); - if (PageDirty(page)) - goto failed; - if (page_count(page) != 2) /* caller's ref + pagecache ref */ - goto failed; - - BUG_ON(PagePrivate(page)); - __remove_from_page_cache(page); - write_unlock_irq(&mapping->tree_lock); + ret = remove_mapping(mapping, page); ClearPageUptodate(page); - page_cache_release(page); /* pagecache ref */ - return 1; -failed: - write_unlock_irq(&mapping->tree_lock); - return 0; + + return ret; } /** diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 9aad8b0cc6e..1ac191ce564 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -241,7 +241,6 @@ struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, /** * get_vm_area - reserve a contingous kernel virtual area - * * @size: size of the area * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC * @@ -273,7 +272,7 @@ static struct vm_struct *__find_vm_area(void *addr) } /* Caller must hold vmlist_lock */ -struct vm_struct *__remove_vm_area(void *addr) +static struct vm_struct *__remove_vm_area(void *addr) { struct vm_struct **p, *tmp; @@ -296,7 +295,6 @@ found: /** * remove_vm_area - find and remove a contingous kernel virtual area - * * @addr: base address * * Search for the kernel VM area starting at @addr, and remove it. @@ -355,7 +353,6 @@ void __vunmap(void *addr, int deallocate_pages) /** * vfree - release memory allocated by vmalloc() - * * @addr: memory base address * * Free the virtually contiguous memory area starting at @addr, as @@ -373,7 +370,6 @@ EXPORT_SYMBOL(vfree); /** * vunmap - release virtual mapping obtained by vmap() - * * @addr: memory base address * * Free the virtually contiguous memory area starting at @addr, @@ -390,7 +386,6 @@ EXPORT_SYMBOL(vunmap); /** * vmap - map an array of pages into virtually contiguous space - * * @pages: array of page pointers * @count: number of pages to map * @flags: vm_area->flags @@ -471,7 +466,6 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) /** * __vmalloc_node - allocate virtually contiguous memory - * * @size: allocation size * @gfp_mask: flags for the page level allocator * @prot: protection mask for the allocated pages @@ -505,9 +499,7 @@ EXPORT_SYMBOL(__vmalloc); /** * vmalloc - allocate virtually contiguous memory - * * @size: allocation size - * * Allocate enough pages to cover @size from the page level * allocator and map them into contiguous kernel virtual space. * @@ -521,11 +513,11 @@ void *vmalloc(unsigned long size) EXPORT_SYMBOL(vmalloc); /** - * vmalloc_user - allocate virtually contiguous memory which has - * been zeroed so it can be mapped to userspace without - * leaking data. + * vmalloc_user - allocate zeroed virtually contiguous memory for userspace + * @size: allocation size * - * @size: allocation size + * The resulting memory area is zeroed so it can be mapped to userspace + * without leaking data. */ void *vmalloc_user(unsigned long size) { @@ -544,7 +536,6 @@ EXPORT_SYMBOL(vmalloc_user); /** * vmalloc_node - allocate memory on a specific node - * * @size: allocation size * @node: numa node * @@ -566,7 +557,6 @@ EXPORT_SYMBOL(vmalloc_node); /** * vmalloc_exec - allocate virtually contiguous, executable memory - * * @size: allocation size * * Kernel-internal function to allocate enough pages to cover @size @@ -584,7 +574,6 @@ void *vmalloc_exec(unsigned long size) /** * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) - * * @size: allocation size * * Allocate enough 32bit PA addressable pages to cover @size from the @@ -597,11 +586,11 @@ void *vmalloc_32(unsigned long size) EXPORT_SYMBOL(vmalloc_32); /** - * vmalloc_32_user - allocate virtually contiguous memory (32bit - * addressable) which is zeroed so it can be - * mapped to userspace without leaking data. - * + * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory * @size: allocation size + * + * The resulting memory area is 32bit addressable and zeroed so it can be + * mapped to userspace without leaking data. */ void *vmalloc_32_user(unsigned long size) { @@ -695,7 +684,6 @@ finished: /** * remap_vmalloc_range - map vmalloc pages to userspace - * * @vma: vma to cover (map full range of vma) * @addr: vmalloc memory * @pgoff: number of pages into addr before first page to map diff --git a/mm/vmscan.c b/mm/vmscan.c index 87779dda4ec..eca70310adb 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -19,6 +19,7 @@ #include <linux/pagemap.h> #include <linux/init.h> #include <linux/highmem.h> +#include <linux/vmstat.h> #include <linux/file.h> #include <linux/writeback.h> #include <linux/blkdev.h> @@ -370,7 +371,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) /* synchronous write or broken a_ops? */ ClearPageReclaim(page); } - + inc_zone_page_state(page, NR_VMSCAN_WRITE); return PAGE_SUCCESS; } @@ -383,11 +384,30 @@ int remove_mapping(struct address_space *mapping, struct page *page) BUG_ON(mapping != page_mapping(page)); write_lock_irq(&mapping->tree_lock); - /* - * The non-racy check for busy page. It is critical to check - * PageDirty _after_ making sure that the page is freeable and - * not in use by anybody. (pagecache + us == 2) + * The non racy check for a busy page. + * + * Must be careful with the order of the tests. When someone has + * a ref to the page, it may be possible that they dirty it then + * drop the reference. So if PageDirty is tested before page_count + * here, then the following race may occur: + * + * get_user_pages(&page); + * [user mapping goes away] + * write_to(page); + * !PageDirty(page) [good] + * SetPageDirty(page); + * put_page(page); + * !page_count(page) [good, discard it] + * + * [oops, our write_to data is lost] + * + * Reversing the order of the tests ensures such a situation cannot + * escape unnoticed. The smp_rmb is needed to ensure the page->flags + * load is not satisfied before that of page->_count. + * + * Note that if SetPageDirty is always performed via set_page_dirty, + * and thus under tree_lock, then this ordering is not required. */ if (unlikely(page_count(page) != 2)) goto cannot_free; diff --git a/mm/vmstat.c b/mm/vmstat.c index 490d8c1a0de..a2b6a9f96e5 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -371,7 +371,7 @@ void zone_statistics(struct zonelist *zonelist, struct zone *z) __inc_zone_state(z, NUMA_MISS); __inc_zone_state(zonelist->zones[0], NUMA_FOREIGN); } - if (z->zone_pgdat == NODE_DATA(numa_node_id())) + if (z->node == numa_node_id()) __inc_zone_state(z, NUMA_LOCAL); else __inc_zone_state(z, NUMA_OTHER); @@ -465,6 +465,7 @@ static char *vmstat_text[] = { "nr_writeback", "nr_unstable", "nr_bounce", + "nr_vmscan_write", #ifdef CONFIG_NUMA "numa_hit", |