diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 5 | ||||
-rw-r--r-- | mm/Makefile | 3 | ||||
-rw-r--r-- | mm/backing-dev.c | 16 | ||||
-rw-r--r-- | mm/bounce.c | 2 | ||||
-rw-r--r-- | mm/filemap.c | 227 | ||||
-rw-r--r-- | mm/filemap_xip.c | 55 | ||||
-rw-r--r-- | mm/highmem.c | 9 | ||||
-rw-r--r-- | mm/hugetlb.c | 33 | ||||
-rw-r--r-- | mm/internal.h | 2 | ||||
-rw-r--r-- | mm/madvise.c | 51 | ||||
-rw-r--r-- | mm/memory.c | 106 | ||||
-rw-r--r-- | mm/migrate.c | 15 | ||||
-rw-r--r-- | mm/mmap.c | 50 | ||||
-rw-r--r-- | mm/nommu.c | 37 | ||||
-rw-r--r-- | mm/oom_kill.c | 23 | ||||
-rw-r--r-- | mm/page-writeback.c | 61 | ||||
-rw-r--r-- | mm/page_alloc.c | 130 | ||||
-rw-r--r-- | mm/quicklist.c | 88 | ||||
-rw-r--r-- | mm/readahead.c | 29 | ||||
-rw-r--r-- | mm/rmap.c | 14 | ||||
-rw-r--r-- | mm/shmem.c | 135 | ||||
-rw-r--r-- | mm/slab.c | 295 | ||||
-rw-r--r-- | mm/slob.c | 57 | ||||
-rw-r--r-- | mm/slub.c | 3669 | ||||
-rw-r--r-- | mm/sparse.c | 14 | ||||
-rw-r--r-- | mm/swap.c | 4 | ||||
-rw-r--r-- | mm/swapfile.c | 3 | ||||
-rw-r--r-- | mm/truncate.c | 3 | ||||
-rw-r--r-- | mm/vmalloc.c | 21 | ||||
-rw-r--r-- | mm/vmscan.c | 23 | ||||
-rw-r--r-- | mm/vmstat.c | 95 |
31 files changed, 4800 insertions, 475 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 7942b333e46..a17da8bafe6 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -163,3 +163,8 @@ config ZONE_DMA_FLAG default "0" if !ZONE_DMA default "1" +config NR_QUICK + int + depends on QUICKLIST + default "2" if SUPERH + default "1" diff --git a/mm/Makefile b/mm/Makefile index f3c077eb0b8..a9148ea329a 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -25,7 +25,10 @@ obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o obj-$(CONFIG_SLOB) += slob.o obj-$(CONFIG_SLAB) += slab.o +obj-$(CONFIG_SLUB) += slub.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_SMP) += allocpercpu.o +obj-$(CONFIG_QUICKLIST) += quicklist.o + diff --git a/mm/backing-dev.c b/mm/backing-dev.c index f50a2811f9d..e5de3781d3f 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -55,6 +55,22 @@ long congestion_wait(int rw, long timeout) } EXPORT_SYMBOL(congestion_wait); +long congestion_wait_interruptible(int rw, long timeout) +{ + long ret; + DEFINE_WAIT(wait); + wait_queue_head_t *wqh = &congestion_wqh[rw]; + + prepare_to_wait(wqh, &wait, TASK_INTERRUPTIBLE); + if (signal_pending(current)) + ret = -ERESTARTSYS; + else + ret = io_schedule_timeout(timeout); + finish_wait(wqh, &wait); + return ret; +} +EXPORT_SYMBOL(congestion_wait_interruptible); + /** * congestion_end - wake up sleepers on a congested backing_dev_info * @rw: READ or WRITE diff --git a/mm/bounce.c b/mm/bounce.c index 643efbe8240..ad401fc5744 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -204,7 +204,7 @@ static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig, /* * is destination page below bounce pfn? 
*/ - if (page_to_pfn(page) < q->bounce_pfn) + if (page_to_pfn(page) <= q->bounce_pfn) continue; /* diff --git a/mm/filemap.c b/mm/filemap.c index d1060b8d3cd..7b48b2ad00e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -750,6 +750,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, read_unlock_irq(&mapping->tree_lock); return i; } +EXPORT_SYMBOL(find_get_pages_contig); /** * find_get_pages_tag - find and return pages that match @tag @@ -778,6 +779,7 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, read_unlock_irq(&mapping->tree_lock); return ret; } +EXPORT_SYMBOL(find_get_pages_tag); /** * grab_cache_page_nowait - returns locked page at given index in given cache @@ -868,6 +870,7 @@ void do_generic_mapping_read(struct address_space *mapping, unsigned long last_index; unsigned long next_index; unsigned long prev_index; + unsigned int prev_offset; loff_t isize; struct page *cached_page; int error; @@ -876,7 +879,8 @@ void do_generic_mapping_read(struct address_space *mapping, cached_page = NULL; index = *ppos >> PAGE_CACHE_SHIFT; next_index = index; - prev_index = ra.prev_page; + prev_index = ra.prev_index; + prev_offset = ra.prev_offset; last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; offset = *ppos & ~PAGE_CACHE_MASK; @@ -924,10 +928,10 @@ page_ok: flush_dcache_page(page); /* - * When (part of) the same page is read multiple times - * in succession, only mark it as accessed the first time. + * When a sequential read accesses a page several times, + * only mark it as accessed the first time. 
*/ - if (prev_index != index) + if (prev_index != index || offset != prev_offset) mark_page_accessed(page); prev_index = index; @@ -945,6 +949,8 @@ page_ok: offset += ret; index += offset >> PAGE_CACHE_SHIFT; offset &= ~PAGE_CACHE_MASK; + prev_offset = offset; + ra.prev_offset = offset; page_cache_release(page); if (ret == nr && desc->count) @@ -1106,6 +1112,45 @@ success: return size; } +/* + * Performs necessary checks before doing a write + * @iov: io vector request + * @nr_segs: number of segments in the iovec + * @count: number of bytes to write + * @access_flags: type of access: %VERIFY_READ or %VERIFY_WRITE + * + * Adjust number of segments and amount of bytes to write (nr_segs should be + * properly initialized first). Returns appropriate error code that caller + * should return or zero in case that write should be allowed. + */ +int generic_segment_checks(const struct iovec *iov, + unsigned long *nr_segs, size_t *count, int access_flags) +{ + unsigned long seg; + size_t cnt = 0; + for (seg = 0; seg < *nr_segs; seg++) { + const struct iovec *iv = &iov[seg]; + + /* + * If any segment has a negative length, or the cumulative + * length ever wraps negative then return -EINVAL. + */ + cnt += iv->iov_len; + if (unlikely((ssize_t)(cnt|iv->iov_len) < 0)) + return -EINVAL; + if (access_ok(access_flags, iv->iov_base, iv->iov_len)) + continue; + if (seg == 0) + return -EFAULT; + *nr_segs = seg; + cnt -= iv->iov_len; /* This segment is no good */ + break; + } + *count = cnt; + return 0; +} +EXPORT_SYMBOL(generic_segment_checks); + /** * generic_file_aio_read - generic filesystem read routine * @iocb: kernel I/O control block @@ -1127,24 +1172,9 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, loff_t *ppos = &iocb->ki_pos; count = 0; - for (seg = 0; seg < nr_segs; seg++) { - const struct iovec *iv = &iov[seg]; - - /* - * If any segment has a negative length, or the cumulative - * length ever wraps negative then return -EINVAL. 
- */ - count += iv->iov_len; - if (unlikely((ssize_t)(count|iv->iov_len) < 0)) - return -EINVAL; - if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len)) - continue; - if (seg == 0) - return -EFAULT; - nr_segs = seg; - count -= iv->iov_len; /* This segment is no good */ - break; - } + retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); + if (retval) + return retval; /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ if (filp->f_flags & O_DIRECT) { @@ -1446,30 +1476,6 @@ page_not_uptodate: majmin = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); } - lock_page(page); - - /* Did it get unhashed while we waited for it? */ - if (!page->mapping) { - unlock_page(page); - page_cache_release(page); - goto retry_all; - } - - /* Did somebody else get it up-to-date? */ - if (PageUptodate(page)) { - unlock_page(page); - goto success; - } - - error = mapping->a_ops->readpage(file, page); - if (!error) { - wait_on_page_locked(page); - if (PageUptodate(page)) - goto success; - } else if (error == AOP_TRUNCATED_PAGE) { - page_cache_release(page); - goto retry_find; - } /* * Umm, take care of errors if the page isn't up-to-date. @@ -1726,7 +1732,7 @@ int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma) EXPORT_SYMBOL(generic_file_mmap); EXPORT_SYMBOL(generic_file_readonly_mmap); -static inline struct page *__read_cache_page(struct address_space *mapping, +static struct page *__read_cache_page(struct address_space *mapping, unsigned long index, int (*filler)(void *,struct page*), void *data) @@ -1763,17 +1769,11 @@ repeat: return page; } -/** - * read_cache_page - read into page cache, fill it if needed - * @mapping: the page's address_space - * @index: the page index - * @filler: function to perform the read - * @data: destination for read data - * - * Read into the page cache. If a page already exists, - * and PageUptodate() is not set, try to fill the page. 
+/* + * Same as read_cache_page, but don't wait for page to become unlocked + * after submitting it to the filler. */ -struct page *read_cache_page(struct address_space *mapping, +struct page *read_cache_page_async(struct address_space *mapping, unsigned long index, int (*filler)(void *,struct page*), void *data) @@ -1784,7 +1784,7 @@ struct page *read_cache_page(struct address_space *mapping, retry: page = __read_cache_page(mapping, index, filler, data); if (IS_ERR(page)) - goto out; + return page; mark_page_accessed(page); if (PageUptodate(page)) goto out; @@ -1802,7 +1802,40 @@ retry: err = filler(data, page); if (err < 0) { page_cache_release(page); - page = ERR_PTR(err); + return ERR_PTR(err); + } +out: + mark_page_accessed(page); + return page; +} +EXPORT_SYMBOL(read_cache_page_async); + +/** + * read_cache_page - read into page cache, fill it if needed + * @mapping: the page's address_space + * @index: the page index + * @filler: function to perform the read + * @data: destination for read data + * + * Read into the page cache. If a page already exists, and PageUptodate() is + * not set, try to fill the page then wait for it to become unlocked. + * + * If the page does not get brought uptodate, return -EIO. 
+ */ +struct page *read_cache_page(struct address_space *mapping, + unsigned long index, + int (*filler)(void *,struct page*), + void *data) +{ + struct page *page; + + page = read_cache_page_async(mapping, index, filler, data); + if (IS_ERR(page)) + goto out; + wait_on_page_locked(page); + if (!PageUptodate(page)) { + page_cache_release(page); + page = ERR_PTR(-EIO); } out: return page; @@ -2211,30 +2244,14 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, size_t ocount; /* original count */ size_t count; /* after file limit checks */ struct inode *inode = mapping->host; - unsigned long seg; loff_t pos; ssize_t written; ssize_t err; ocount = 0; - for (seg = 0; seg < nr_segs; seg++) { - const struct iovec *iv = &iov[seg]; - - /* - * If any segment has a negative length, or the cumulative - * length ever wraps negative then return -EINVAL. - */ - ocount += iv->iov_len; - if (unlikely((ssize_t)(ocount|iv->iov_len) < 0)) - return -EINVAL; - if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len)) - continue; - if (seg == 0) - return -EFAULT; - nr_segs = seg; - ocount -= iv->iov_len; /* This segment is no good */ - break; - } + err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); + if (err) + return err; count = ocount; pos = *ppos; @@ -2294,10 +2311,10 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, * semantics. 
*/ endbyte = pos + written_buffered - written - 1; - err = do_sync_file_range(file, pos, endbyte, - SYNC_FILE_RANGE_WAIT_BEFORE| - SYNC_FILE_RANGE_WRITE| - SYNC_FILE_RANGE_WAIT_AFTER); + err = do_sync_mapping_range(file->f_mapping, pos, endbyte, + SYNC_FILE_RANGE_WAIT_BEFORE| + SYNC_FILE_RANGE_WRITE| + SYNC_FILE_RANGE_WAIT_AFTER); if (err == 0) { written = written_buffered; invalidate_mapping_pages(mapping, @@ -2379,7 +2396,8 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; ssize_t retval; - size_t write_len = 0; + size_t write_len; + pgoff_t end = 0; /* silence gcc */ /* * If it's a write, unmap all mmappings of the file up-front. This @@ -2388,23 +2406,46 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, */ if (rw == WRITE) { write_len = iov_length(iov, nr_segs); + end = (offset + write_len - 1) >> PAGE_CACHE_SHIFT; if (mapping_mapped(mapping)) unmap_mapping_range(mapping, offset, write_len, 0); } retval = filemap_write_and_wait(mapping); - if (retval == 0) { - retval = mapping->a_ops->direct_IO(rw, iocb, iov, - offset, nr_segs); - if (rw == WRITE && mapping->nrpages) { - pgoff_t end = (offset + write_len - 1) - >> PAGE_CACHE_SHIFT; - int err = invalidate_inode_pages2_range(mapping, + if (retval) + goto out; + + /* + * After a write we want buffered reads to be sure to go to disk to get + * the new data. We invalidate clean cached page from the region we're + * about to write. We do this *before* the write so that we can return + * -EIO without clobbering -EIOCBQUEUED from ->direct_IO(). 
+ */ + if (rw == WRITE && mapping->nrpages) { + retval = invalidate_inode_pages2_range(mapping, offset >> PAGE_CACHE_SHIFT, end); - if (err) - retval = err; - } + if (retval) + goto out; } + + retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs); + if (retval) + goto out; + + /* + * Finally, try again to invalidate clean pages which might have been + * faulted in by get_user_pages() if the source of the write was an + * mmap()ed region of the file we're writing. That's a pretty crazy + * thing to do, so we don't support it 100%. If this invalidation + * fails and we have -EIOCBQUEUED we ignore the failure. + */ + if (rw == WRITE && mapping->nrpages) { + int err = invalidate_inode_pages2_range(mapping, + offset >> PAGE_CACHE_SHIFT, end); + if (err && retval >= 0) + retval = err; + } +out: return retval; } diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 9dd9fbb7513..1b49dab9b25 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -17,6 +17,29 @@ #include "filemap.h" /* + * We do use our own empty page to avoid interference with other users + * of ZERO_PAGE(), such as /dev/zero + */ +static struct page *__xip_sparse_page; + +static struct page *xip_sparse_page(void) +{ + if (!__xip_sparse_page) { + unsigned long zeroes = get_zeroed_page(GFP_HIGHUSER); + if (zeroes) { + static DEFINE_SPINLOCK(xip_alloc_lock); + spin_lock(&xip_alloc_lock); + if (!__xip_sparse_page) + __xip_sparse_page = virt_to_page(zeroes); + else + free_page(zeroes); + spin_unlock(&xip_alloc_lock); + } + } + return __xip_sparse_page; +} + +/* * This is a file read routine for execute in place files, and uses * the mapping->a_ops->get_xip_page() function for the actual low-level * stuff. @@ -162,7 +185,7 @@ EXPORT_SYMBOL_GPL(xip_file_sendfile); * xip_write * * This function walks all vmas of the address_space and unmaps the - * ZERO_PAGE when found at pgoff. Should it go in rmap.c? + * __xip_sparse_page when found at pgoff. 
*/ static void __xip_unmap (struct address_space * mapping, @@ -177,13 +200,16 @@ __xip_unmap (struct address_space * mapping, spinlock_t *ptl; struct page *page; + page = __xip_sparse_page; + if (!page) + return; + spin_lock(&mapping->i_mmap_lock); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { mm = vma->vm_mm; address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); BUG_ON(address < vma->vm_start || address >= vma->vm_end); - page = ZERO_PAGE(0); pte = page_check_address(page, mm, address, &ptl); if (pte) { /* Nuke the page table entry. */ @@ -222,16 +248,14 @@ xip_file_nopage(struct vm_area_struct * area, + area->vm_pgoff; size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (pgoff >= size) { - return NULL; - } + if (pgoff >= size) + return NOPAGE_SIGBUS; page = mapping->a_ops->get_xip_page(mapping, pgoff*(PAGE_SIZE/512), 0); - if (!IS_ERR(page)) { + if (!IS_ERR(page)) goto out; - } if (PTR_ERR(page) != -ENODATA) - return NULL; + return NOPAGE_SIGBUS; /* sparse block */ if ((area->vm_flags & (VM_WRITE | VM_MAYWRITE)) && @@ -241,12 +265,14 @@ xip_file_nopage(struct vm_area_struct * area, page = mapping->a_ops->get_xip_page (mapping, pgoff*(PAGE_SIZE/512), 1); if (IS_ERR(page)) - return NULL; + return NOPAGE_SIGBUS; /* unmap page at pgoff from all other vmas */ __xip_unmap(mapping, pgoff); } else { - /* not shared and writable, use ZERO_PAGE() */ - page = ZERO_PAGE(0); + /* not shared and writable, use xip_sparse_page() */ + page = xip_sparse_page(); + if (!page) + return NOPAGE_OOM; } out: @@ -408,7 +434,6 @@ xip_truncate_page(struct address_space *mapping, loff_t from) unsigned blocksize; unsigned length; struct page *page; - void *kaddr; BUG_ON(!mapping->a_ops->get_xip_page); @@ -432,11 +457,7 @@ xip_truncate_page(struct address_space *mapping, loff_t from) else return PTR_ERR(page); } - kaddr = kmap_atomic(page, KM_USER0); - memset(kaddr + offset, 0, length); - kunmap_atomic(kaddr, KM_USER0); - - 
flush_dcache_page(page); + zero_user_page(page, offset, length, KM_USER0); return 0; } EXPORT_SYMBOL_GPL(xip_truncate_page); diff --git a/mm/highmem.c b/mm/highmem.c index 51e1c1995fe..be8f8d36a8b 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -99,6 +99,15 @@ static void flush_all_zero_pkmaps(void) flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP)); } +/* Flush all unused kmap mappings in order to remove stray + mappings. */ +void kmap_flush_unused(void) +{ + spin_lock(&kmap_lock); + flush_all_zero_pkmaps(); + spin_unlock(&kmap_lock); +} + static inline unsigned long map_new_virtual(struct page *page) { unsigned long vaddr; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 36db012b38d..eb7180db303 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -140,6 +140,8 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, return page; fail: + if (vma->vm_flags & VM_MAYSHARE) + resv_huge_pages++; spin_unlock(&hugetlb_lock); return NULL; } @@ -172,6 +174,17 @@ static int __init hugetlb_setup(char *s) } __setup("hugepages=", hugetlb_setup); +static unsigned int cpuset_mems_nr(unsigned int *array) +{ + int node; + unsigned int nr = 0; + + for_each_node_mask(node, cpuset_current_mems_allowed) + nr += array[node]; + + return nr; +} + #ifdef CONFIG_SYSCTL static void update_and_free_page(struct page *page) { @@ -817,6 +830,26 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to) chg = region_chg(&inode->i_mapping->private_list, from, to); if (chg < 0) return chg; + /* + * When cpuset is configured, it breaks the strict hugetlb page + * reservation as the accounting is done on a global variable. Such + * reservation is completely rubbish in the presence of cpuset because + * the reservation is not checked against page availability for the + * current cpuset. Application can still potentially OOM'ed by kernel + * with lack of free htlb page in cpuset that the task is in. 
+ * Attempt to enforce strict accounting with cpuset is almost + * impossible (or too ugly) because cpuset is too fluid that + * task or memory node can be dynamically moved between cpusets. + * + * The change of semantics for shared hugetlb mapping with cpuset is + * undesirable. However, in order to preserve some of the semantics, + * we fall back to check against current free page availability as + * a best attempt and hopefully to minimize the impact of changing + * semantics that cpuset has. + */ + if (chg > cpuset_mems_nr(free_huge_pages_node)) + return -ENOMEM; + ret = hugetlb_acct_memory(chg); if (ret < 0) return ret; diff --git a/mm/internal.h b/mm/internal.h index d527b80b292..a3110c02aea 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -24,7 +24,7 @@ static inline void set_page_count(struct page *page, int v) */ static inline void set_page_refcounted(struct page *page) { - VM_BUG_ON(PageCompound(page) && page_private(page) != (unsigned long)page); + VM_BUG_ON(PageCompound(page) && PageTail(page)); VM_BUG_ON(atomic_read(&page->_count)); set_page_count(page, 1); } diff --git a/mm/madvise.c b/mm/madvise.c index 4e196155a0c..e75096b5a6d 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -12,6 +12,24 @@ #include <linux/hugetlb.h> /* + * Any behaviour which results in changes to the vma->vm_flags needs to + * take mmap_sem for writing. Others, which simply traverse vmas, need + * to only take it for reading. + */ +static int madvise_need_mmap_write(int behavior) +{ + switch (behavior) { + case MADV_REMOVE: + case MADV_WILLNEED: + case MADV_DONTNEED: + return 0; + default: + /* be safe, default to 1. list exceptions explicitly */ + return 1; + } +} + +/* * We can potentially split a vm area into separate * areas, each area with its own behavior. */ @@ -155,10 +173,14 @@ static long madvise_dontneed(struct vm_area_struct * vma, * Other filesystems return -ENOSYS. 
*/ static long madvise_remove(struct vm_area_struct *vma, + struct vm_area_struct **prev, unsigned long start, unsigned long end) { struct address_space *mapping; - loff_t offset, endoff; + loff_t offset, endoff; + int error; + + *prev = NULL; /* tell sys_madvise we drop mmap_sem */ if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) return -EINVAL; @@ -177,7 +199,12 @@ static long madvise_remove(struct vm_area_struct *vma, + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); endoff = (loff_t)(end - vma->vm_start - 1) + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); - return vmtruncate_range(mapping->host, offset, endoff); + + /* vmtruncate_range needs to take i_mutex and i_alloc_sem */ + up_read(&current->mm->mmap_sem); + error = vmtruncate_range(mapping->host, offset, endoff); + down_read(&current->mm->mmap_sem); + return error; } static long @@ -199,7 +226,7 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, error = madvise_behavior(vma, prev, start, end, behavior); break; case MADV_REMOVE: - error = madvise_remove(vma, start, end); + error = madvise_remove(vma, prev, start, end); break; case MADV_WILLNEED: @@ -261,7 +288,10 @@ asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior) int error = -EINVAL; size_t len; - down_write(&current->mm->mmap_sem); + if (madvise_need_mmap_write(behavior)) + down_write(&current->mm->mmap_sem); + else + down_read(&current->mm->mmap_sem); if (start & ~PAGE_MASK) goto out; @@ -312,14 +342,21 @@ asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior) if (error) goto out; start = tmp; - if (start < prev->vm_end) + if (prev && start < prev->vm_end) start = prev->vm_end; error = unmapped_error; if (start >= end) goto out; - vma = prev->vm_next; + if (prev) + vma = prev->vm_next; + else /* madvise_remove dropped mmap_sem */ + vma = find_vma(current->mm, start); } out: - up_write(&current->mm->mmap_sem); + if (madvise_need_mmap_write(behavior)) + up_write(&current->mm->mmap_sem); + else + up_read(&current->mm->mmap_sem); + + return
error; } diff --git a/mm/memory.c b/mm/memory.c index e7066e71dfa..1d647ab0ee7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1448,6 +1448,100 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(remap_pfn_range); +static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, unsigned long end, + pte_fn_t fn, void *data) +{ + pte_t *pte; + int err; + struct page *pmd_page; + spinlock_t *uninitialized_var(ptl); + + pte = (mm == &init_mm) ? + pte_alloc_kernel(pmd, addr) : + pte_alloc_map_lock(mm, pmd, addr, &ptl); + if (!pte) + return -ENOMEM; + + BUG_ON(pmd_huge(*pmd)); + + pmd_page = pmd_page(*pmd); + + do { + err = fn(pte, pmd_page, addr, data); + if (err) + break; + } while (pte++, addr += PAGE_SIZE, addr != end); + + if (mm != &init_mm) + pte_unmap_unlock(pte-1, ptl); + return err; +} + +static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, + unsigned long addr, unsigned long end, + pte_fn_t fn, void *data) +{ + pmd_t *pmd; + unsigned long next; + int err; + + pmd = pmd_alloc(mm, pud, addr); + if (!pmd) + return -ENOMEM; + do { + next = pmd_addr_end(addr, end); + err = apply_to_pte_range(mm, pmd, addr, next, fn, data); + if (err) + break; + } while (pmd++, addr = next, addr != end); + return err; +} + +static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd, + unsigned long addr, unsigned long end, + pte_fn_t fn, void *data) +{ + pud_t *pud; + unsigned long next; + int err; + + pud = pud_alloc(mm, pgd, addr); + if (!pud) + return -ENOMEM; + do { + next = pud_addr_end(addr, end); + err = apply_to_pmd_range(mm, pud, addr, next, fn, data); + if (err) + break; + } while (pud++, addr = next, addr != end); + return err; +} + +/* + * Scan a region of virtual memory, filling in page tables as necessary + * and calling a provided function on each leaf page table. 
+ */ +int apply_to_page_range(struct mm_struct *mm, unsigned long addr, + unsigned long size, pte_fn_t fn, void *data) +{ + pgd_t *pgd; + unsigned long next; + unsigned long end = addr + size; + int err; + + BUG_ON(addr >= end); + pgd = pgd_offset(mm, addr); + do { + next = pgd_addr_end(addr, end); + err = apply_to_pud_range(mm, pgd, addr, next, fn, data); + if (err) + break; + } while (pgd++, addr = next, addr != end); + return err; +} +EXPORT_SYMBOL_GPL(apply_to_page_range); + /* * handle_pte_fault chooses page fault handler according to an entry * which was read non-atomically. Before making any commitment, on @@ -2539,12 +2633,6 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) spin_unlock(&mm->page_table_lock); return 0; } -#else -/* Workaround for gcc 2.96 */ -int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) -{ - return 0; -} #endif /* __PAGETABLE_PUD_FOLDED */ #ifndef __PAGETABLE_PMD_FOLDED @@ -2573,12 +2661,6 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) spin_unlock(&mm->page_table_lock); return 0; } -#else -/* Workaround for gcc 2.96 */ -int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) -{ - return 0; -} #endif /* __PAGETABLE_PMD_FOLDED */ int make_pages_present(unsigned long addr, unsigned long end) diff --git a/mm/migrate.c b/mm/migrate.c index 7a66ca25dc8..a91ca00abeb 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -297,7 +297,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, void **pslot; if (!mapping) { - /* Anonymous page */ + /* Anonymous page without mapping */ if (page_count(page) != 1) return -EAGAIN; return 0; @@ -333,6 +333,19 @@ static int migrate_page_move_mapping(struct address_space *mapping, */ __put_page(page); + /* + * If moved to a different zone then also account + * the page for that zone. 
Other VM counters will be + * taken care of when we establish references to the + * new page and drop references to the old page. + * + * Note that anonymous pages are accounted for + * via NR_FILE_PAGES and NR_ANON_PAGES if they + * are mapped to swap space. + */ + __dec_zone_page_state(page, NR_FILE_PAGES); + __inc_zone_page_state(newpage, NR_FILE_PAGES); + write_unlock_irq(&mapping->tree_lock); return 0; diff --git a/mm/mmap.c b/mm/mmap.c index 84f997da78d..68b9ad2ef1d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -29,6 +29,7 @@ #include <asm/uaccess.h> #include <asm/cacheflush.h> #include <asm/tlb.h> +#include <asm/mmu_context.h> #ifndef arch_mmap_check #define arch_mmap_check(addr, len, flags) (0) @@ -1199,6 +1200,9 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, if (len > TASK_SIZE) return -ENOMEM; + if (flags & MAP_FIXED) + return addr; + if (addr) { addr = PAGE_ALIGN(addr); vma = find_vma(mm, addr); @@ -1272,6 +1276,9 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, if (len > TASK_SIZE) return -ENOMEM; + if (flags & MAP_FIXED) + return addr; + /* requesting a specific address */ if (addr) { addr = PAGE_ALIGN(addr); @@ -1359,39 +1366,21 @@ unsigned long get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - unsigned long ret; - - if (!(flags & MAP_FIXED)) { - unsigned long (*get_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); - - get_area = current->mm->get_unmapped_area; - if (file && file->f_op && file->f_op->get_unmapped_area) - get_area = file->f_op->get_unmapped_area; - addr = get_area(file, addr, len, pgoff, flags); - if (IS_ERR_VALUE(addr)) - return addr; - } + unsigned long (*get_area)(struct file *, unsigned long, + unsigned long, unsigned long, unsigned long); + + get_area = current->mm->get_unmapped_area; + if (file && file->f_op && file->f_op->get_unmapped_area) + get_area = 
file->f_op->get_unmapped_area; + addr = get_area(file, addr, len, pgoff, flags); + if (IS_ERR_VALUE(addr)) + return addr; if (addr > TASK_SIZE - len) return -ENOMEM; if (addr & ~PAGE_MASK) return -EINVAL; - if (file && is_file_hugepages(file)) { - /* - * Check if the given range is hugepage aligned, and - * can be made suitable for hugepages. - */ - ret = prepare_hugepage_range(addr, len, pgoff); - } else { - /* - * Ensure that a normal request is not falling in a - * reserved hugepage range. For some archs like IA-64, - * there is a separate region for hugepages. - */ - ret = is_hugepage_only_range(current->mm, addr, len); - } - if (ret) - return -EINVAL; + return addr; } @@ -1731,7 +1720,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, /* * Split a vma into two pieces at address 'addr', a new vma is allocated - * either for the first part or the the tail. + * either for the first part or the tail. */ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long addr, int new_below) @@ -1979,6 +1968,9 @@ void exit_mmap(struct mm_struct *mm) unsigned long nr_accounted = 0; unsigned long end; + /* mm's last user has gone, and its about to be pulled down */ + arch_exit_mmap(mm); + lru_add_drain(); flush_cache_mm(mm); tlb = tlb_gather_mmu(mm, 1); diff --git a/mm/nommu.c b/mm/nommu.c index 23fb033e596..2b16b00a5b1 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -45,6 +45,7 @@ int heap_stack_gap = 0; EXPORT_SYMBOL(mem_map); EXPORT_SYMBOL(__vm_enough_memory); +EXPORT_SYMBOL(num_physpages); /* list of shareable VMAs */ struct rb_root nommu_vma_tree = RB_ROOT; @@ -261,6 +262,14 @@ void vunmap(void *addr) } /* + * Implement a stub for vmalloc_sync_all() if the architecture chose not to + * have one. 
+ */ +void __attribute__((weak)) vmalloc_sync_all(void) +{ +} + +/* * sys_brk() for the most part doesn't need the global kernel * lock, except when an application is doing something nasty * like trying to un-brk an area that has already been mapped @@ -826,6 +835,11 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; unsigned long vmpglen; + /* suppress VMA sharing for shared regions */ + if (vm_flags & VM_SHARED && + capabilities & BDI_CAP_MAP_DIRECT) + goto dont_share_VMAs; + for (rb = rb_first(&nommu_vma_tree); rb; rb = rb_next(rb)) { vma = rb_entry(rb, struct vm_area_struct, vm_rb); @@ -859,6 +873,7 @@ unsigned long do_mmap_pgoff(struct file *file, goto shared; } + dont_share_VMAs: vma = NULL; /* obtain the address at which to make a shared mapping @@ -1193,6 +1208,28 @@ void unmap_mapping_range(struct address_space *mapping, EXPORT_SYMBOL(unmap_mapping_range); /* + * ask for an unmapped area at which to create a mapping on a file + */ +unsigned long get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + unsigned long (*get_area)(struct file *, unsigned long, unsigned long, + unsigned long, unsigned long); + + get_area = current->mm->get_unmapped_area; + if (file && file->f_op && file->f_op->get_unmapped_area) + get_area = file->f_op->get_unmapped_area; + + if (!get_area) + return -ENOSYS; + + return get_area(file, addr, len, pgoff, flags); +} + +EXPORT_SYMBOL(get_unmapped_area); + +/* * Check that a process has enough memory to allocate a new virtual * mapping. 0 means there is enough memory for the allocation to * succeed and -ENOMEM implies there is not. diff --git a/mm/oom_kill.c b/mm/oom_kill.c index b278b8d60ee..a7001410ab1 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -147,9 +147,11 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) * Adjust the score by oomkilladj. 
*/ if (p->oomkilladj) { - if (p->oomkilladj > 0) + if (p->oomkilladj > 0) { + if (!points) + points = 1; points <<= p->oomkilladj; - else + } else points >>= -(p->oomkilladj); } @@ -176,6 +178,8 @@ static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask) struct zone **z; nodemask_t nodes; int node; + + nodes_clear(nodes); /* node has memory ? */ for_each_online_node(node) if (NODE_DATA(node)->node_present_pages) @@ -320,7 +324,7 @@ static int oom_kill_task(struct task_struct *p) * Don't kill the process if any threads are set to OOM_DISABLE */ do_each_thread(g, q) { - if (q->mm == mm && p->oomkilladj == OOM_DISABLE) + if (q->mm == mm && q->oomkilladj == OOM_DISABLE) return 1; } while_each_thread(g, q); @@ -333,7 +337,7 @@ static int oom_kill_task(struct task_struct *p) */ do_each_thread(g, q) { if (q->mm == mm && q->tgid != p->tgid) - force_sig(SIGKILL, p); + force_sig(SIGKILL, q); } while_each_thread(g, q); return 0; @@ -395,6 +399,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) struct task_struct *p; unsigned long points = 0; unsigned long freed = 0; + int constraint; blocking_notifier_call_chain(&oom_notify_list, 0, &freed); if (freed > 0) @@ -409,14 +414,18 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) show_mem(); } - cpuset_lock(); - read_lock(&tasklist_lock); + if (sysctl_panic_on_oom == 2) + panic("out of memory. Compulsory panic_on_oom is selected.\n"); /* * Check if there were limitations on the allocation (only relevant for * NUMA) that may require different handling. 
*/ - switch (constrained_alloc(zonelist, gfp_mask)) { + constraint = constrained_alloc(zonelist, gfp_mask); + cpuset_lock(); + read_lock(&tasklist_lock); + + switch (constraint) { case CONSTRAINT_MEMORY_POLICY: oom_kill_process(current, points, "No available memory (MPOL_BIND)"); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index f469e3cd08e..63cd88840eb 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -67,12 +67,12 @@ static inline long sync_writeback_pages(void) /* * Start background writeback (via pdflush) at this percentage */ -int dirty_background_ratio = 10; +int dirty_background_ratio = 5; /* * The generator of dirty data starts writeback at this percentage */ -int vm_dirty_ratio = 40; +int vm_dirty_ratio = 10; /* * The interval between `kupdate'-style writebacks, in jiffies @@ -119,6 +119,44 @@ static void background_writeout(unsigned long _min_pages); * We make sure that the background writeout level is below the adjusted * clamping level. */ + +static unsigned long highmem_dirtyable_memory(unsigned long total) +{ +#ifdef CONFIG_HIGHMEM + int node; + unsigned long x = 0; + + for_each_online_node(node) { + struct zone *z = + &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; + + x += zone_page_state(z, NR_FREE_PAGES) + + zone_page_state(z, NR_INACTIVE) + + zone_page_state(z, NR_ACTIVE); + } + /* + * Make sure that the number of highmem pages is never larger + * than the number of the total dirtyable memory. This can only + * occur in very strange VM situations but we want to make sure + * that this does not occur. 
+ */ + return min(x, total); +#else + return 0; +#endif +} + +static unsigned long determine_dirtyable_memory(void) +{ + unsigned long x; + + x = global_page_state(NR_FREE_PAGES) + + global_page_state(NR_INACTIVE) + + global_page_state(NR_ACTIVE); + x -= highmem_dirtyable_memory(x); + return x + 1; /* Ensure that we never return 0 */ +} + static void get_dirty_limits(long *pbackground, long *pdirty, struct address_space *mapping) @@ -128,20 +166,12 @@ get_dirty_limits(long *pbackground, long *pdirty, int unmapped_ratio; long background; long dirty; - unsigned long available_memory = vm_total_pages; + unsigned long available_memory = determine_dirtyable_memory(); struct task_struct *tsk; -#ifdef CONFIG_HIGHMEM - /* - * We always exclude high memory from our count. - */ - available_memory -= totalhigh_pages; -#endif - - unmapped_ratio = 100 - ((global_page_state(NR_FILE_MAPPED) + global_page_state(NR_ANON_PAGES)) * 100) / - vm_total_pages; + available_memory; dirty_ratio = vm_dirty_ratio; if (dirty_ratio > unmapped_ratio / 2) @@ -653,12 +683,7 @@ retry: } ret = (*writepage)(page, wbc); - if (ret) { - if (ret == -ENOSPC) - set_bit(AS_ENOSPC, &mapping->flags); - else - set_bit(AS_EIO, &mapping->flags); - } + mapping_set_error(mapping, ret); if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) unlock_page(page); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 353ce9039a8..ae96dd84443 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -103,7 +103,7 @@ int min_free_kbytes = 1024; unsigned long __meminitdata nr_kernel_pages; unsigned long __meminitdata nr_all_pages; -static unsigned long __initdata dma_reserve; +static unsigned long __meminitdata dma_reserve; #ifdef CONFIG_ARCH_POPULATES_NODE_MAP /* @@ -126,10 +126,10 @@ static unsigned long __initdata dma_reserve; #endif #endif - struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS]; - int __initdata nr_nodemap_entries; - unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; - unsigned 
long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; + struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS]; + int __meminitdata nr_nodemap_entries; + unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; + unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES]; unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES]; @@ -156,10 +156,8 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page) static int page_is_consistent(struct zone *zone, struct page *page) { -#ifdef CONFIG_HOLES_IN_ZONE - if (!pfn_valid(page_to_pfn(page))) + if (!pfn_valid_within(page_to_pfn(page))) return 0; -#endif if (zone != page_zone(page)) return 0; @@ -227,7 +225,7 @@ static void bad_page(struct page *page) static void free_compound_page(struct page *page) { - __free_pages_ok(page, (unsigned long)page[1].lru.prev); + __free_pages_ok(page, compound_order(page)); } static void prep_compound_page(struct page *page, unsigned long order) @@ -236,12 +234,13 @@ static void prep_compound_page(struct page *page, unsigned long order) int nr_pages = 1 << order; set_compound_page_dtor(page, free_compound_page); - page[1].lru.prev = (void *)order; - for (i = 0; i < nr_pages; i++) { + set_compound_order(page, order); + __SetPageHead(page); + for (i = 1; i < nr_pages; i++) { struct page *p = page + i; - __SetPageCompound(p); - set_page_private(p, (unsigned long)page); + __SetPageTail(p); + p->first_page = page; } } @@ -250,16 +249,19 @@ static void destroy_compound_page(struct page *page, unsigned long order) int i; int nr_pages = 1 << order; - if (unlikely((unsigned long)page[1].lru.prev != order)) + if (unlikely(compound_order(page) != order)) bad_page(page); - for (i = 0; i < nr_pages; i++) { + if (unlikely(!PageHead(page))) + bad_page(page); + __ClearPageHead(page); + for (i = 1; i < nr_pages; i++) { 
struct page *p = page + i; - if (unlikely(!PageCompound(p) | - (page_private(p) != (unsigned long)page))) + if (unlikely(!PageTail(p) | + (p->first_page != page))) bad_page(page); - __ClearPageCompound(p); + __ClearPageTail(p); } } @@ -346,10 +348,8 @@ __find_combined_index(unsigned long page_idx, unsigned int order) static inline int page_is_buddy(struct page *page, struct page *buddy, int order) { -#ifdef CONFIG_HOLES_IN_ZONE - if (!pfn_valid(page_to_pfn(buddy))) + if (!pfn_valid_within(page_to_pfn(buddy))) return 0; -#endif if (page_zone_id(page) != page_zone_id(buddy)) return 0; @@ -433,13 +433,18 @@ static inline int free_pages_check(struct page *page) 1 << PG_private | 1 << PG_locked | 1 << PG_active | - 1 << PG_reclaim | 1 << PG_slab | 1 << PG_swapcache | 1 << PG_writeback | 1 << PG_reserved | 1 << PG_buddy )))) bad_page(page); + /* + * PageReclaim == PageTail. It is only an error + * for PageReclaim to be set if PageCompound is clear. + */ + if (unlikely(!PageCompound(page) && PageReclaim(page))) + bad_page(page); if (PageDirty(page)) __ClearPageDirty(page); /* @@ -665,7 +670,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, } #if MAX_NUMNODES > 1 -int nr_node_ids __read_mostly; +int nr_node_ids __read_mostly = MAX_NUMNODES; EXPORT_SYMBOL(nr_node_ids); /* @@ -686,43 +691,26 @@ static void __init setup_nr_node_ids(void) {} #ifdef CONFIG_NUMA /* - * Called from the slab reaper to drain pagesets on a particular node that - * belongs to the currently executing processor. + * Called from the vmstat counter updater to drain pagesets of this + * currently executing processor on remote nodes after they have + * expired. + * * Note that this function must be called with the thread pinned to * a single processor. 
*/ -void drain_node_pages(int nodeid) +void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) { - int i; - enum zone_type z; unsigned long flags; + int to_drain; - for (z = 0; z < MAX_NR_ZONES; z++) { - struct zone *zone = NODE_DATA(nodeid)->node_zones + z; - struct per_cpu_pageset *pset; - - if (!populated_zone(zone)) - continue; - - pset = zone_pcp(zone, smp_processor_id()); - for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { - struct per_cpu_pages *pcp; - - pcp = &pset->pcp[i]; - if (pcp->count) { - int to_drain; - - local_irq_save(flags); - if (pcp->count >= pcp->batch) - to_drain = pcp->batch; - else - to_drain = pcp->count; - free_pages_bulk(zone, to_drain, &pcp->list, 0); - pcp->count -= to_drain; - local_irq_restore(flags); - } - } - } + local_irq_save(flags); + if (pcp->count >= pcp->batch) + to_drain = pcp->batch; + else + to_drain = pcp->count; + free_pages_bulk(zone, to_drain, &pcp->list, 0); + pcp->count -= to_drain; + local_irq_restore(flags); } #endif @@ -770,8 +758,8 @@ void mark_free_pages(struct zone *zone) if (pfn_valid(pfn)) { struct page *page = pfn_to_page(pfn); - if (!PageNosave(page)) - ClearPageNosaveFree(page); + if (!swsusp_page_is_forbidden(page)) + swsusp_unset_page_free(page); } for (order = MAX_ORDER - 1; order >= 0; --order) @@ -780,7 +768,7 @@ void mark_free_pages(struct zone *zone) pfn = page_to_pfn(list_entry(curr, struct page, lru)); for (i = 0; i < (1UL << order); i++) - SetPageNosaveFree(pfn_to_page(pfn + i)); + swsusp_set_page_free(pfn_to_page(pfn + i)); } spin_unlock_irqrestore(&zone->lock, flags); @@ -2143,11 +2131,14 @@ static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, switch (action) { case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: if (process_zones(cpu)) ret = NOTIFY_BAD; break; case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: + case CPU_DEAD_FROZEN: free_zone_pagesets(cpu); break; default: @@ -2174,7 +2165,7 @@ void __init setup_per_cpu_pageset(void) #endif -static 
__meminit +static __meminit noinline int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) { int i; @@ -2262,7 +2253,7 @@ __meminit int init_currently_empty_zone(struct zone *zone, * Basic iterator support. Return the first range of PFNs for a node * Note: nid == MAX_NUMNODES returns first region regardless of node */ -static int __init first_active_region_index_in_nid(int nid) +static int __meminit first_active_region_index_in_nid(int nid) { int i; @@ -2277,7 +2268,7 @@ static int __init first_active_region_index_in_nid(int nid) * Basic iterator support. Return the next active range of PFNs for a node * Note: nid == MAX_NUMNODES returns next region regardles of node */ -static int __init next_active_region_index_in_nid(int index, int nid) +static int __meminit next_active_region_index_in_nid(int index, int nid) { for (index = index + 1; index < nr_nodemap_entries; index++) if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) @@ -2293,7 +2284,7 @@ static int __init next_active_region_index_in_nid(int index, int nid) * was used and there are no special requirements, this is a convenient * alternative */ -int __init early_pfn_to_nid(unsigned long pfn) +int __meminit early_pfn_to_nid(unsigned long pfn) { int i; @@ -2430,7 +2421,7 @@ static void __init account_node_boundary(unsigned int nid, * with no available memory, a warning is printed and the start and end * PFNs will be 0. 
*/ -void __init get_pfn_range_for_nid(unsigned int nid, +void __meminit get_pfn_range_for_nid(unsigned int nid, unsigned long *start_pfn, unsigned long *end_pfn) { int i; @@ -2455,7 +2446,7 @@ void __init get_pfn_range_for_nid(unsigned int nid, * Return the number of pages a zone spans in a node, including holes * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() */ -unsigned long __init zone_spanned_pages_in_node(int nid, +unsigned long __meminit zone_spanned_pages_in_node(int nid, unsigned long zone_type, unsigned long *ignored) { @@ -2483,7 +2474,7 @@ unsigned long __init zone_spanned_pages_in_node(int nid, * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, * then all holes in the requested range will be accounted for. */ -unsigned long __init __absent_pages_in_range(int nid, +unsigned long __meminit __absent_pages_in_range(int nid, unsigned long range_start_pfn, unsigned long range_end_pfn) { @@ -2543,7 +2534,7 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn, } /* Return the number of page frames in holes in a zone on a node */ -unsigned long __init zone_absent_pages_in_node(int nid, +unsigned long __meminit zone_absent_pages_in_node(int nid, unsigned long zone_type, unsigned long *ignored) { @@ -2579,7 +2570,7 @@ static inline unsigned long zone_absent_pages_in_node(int nid, #endif -static void __init calculate_node_totalpages(struct pglist_data *pgdat, +static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, unsigned long *zones_size, unsigned long *zholes_size) { unsigned long realtotalpages, totalpages = 0; @@ -2687,7 +2678,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, } } -static void __init alloc_node_mem_map(struct pglist_data *pgdat) +static void __meminit alloc_node_mem_map(struct pglist_data *pgdat) { /* Skip empty nodes */ if (!pgdat->node_spanned_pages) @@ -3007,7 +2998,7 @@ static int page_alloc_cpu_notify(struct notifier_block 
*self, { int cpu = (unsigned long)hcpu; - if (action == CPU_DEAD) { + if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { local_irq_disable(); __drain_pages(cpu); vm_events_fold_cpu(cpu); @@ -3203,7 +3194,8 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { proc_dointvec(table, write, file, buffer, length, ppos); - setup_per_zone_pages_min(); + if (write) + setup_per_zone_pages_min(); return 0; } diff --git a/mm/quicklist.c b/mm/quicklist.c new file mode 100644 index 00000000000..ae8189c2799 --- /dev/null +++ b/mm/quicklist.c @@ -0,0 +1,88 @@ +/* + * Quicklist support. + * + * Quicklists are light weight lists of pages that have a defined state + * on alloc and free. Pages must be in the quicklist specific defined state + * (zero by default) when the page is freed. It seems that the initial idea + * for such lists first came from Dave Miller and then various other people + * improved on it. + * + * Copyright (C) 2007 SGI, + * Christoph Lameter <clameter@sgi.com> + * Generalized, added support for multiple lists and + * constructors / destructors. 
+ */ +#include <linux/kernel.h> + +#include <linux/mm.h> +#include <linux/mmzone.h> +#include <linux/module.h> +#include <linux/quicklist.h> + +DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK]; + +#define FRACTION_OF_NODE_MEM 16 + +static unsigned long max_pages(unsigned long min_pages) +{ + unsigned long node_free_pages, max; + + node_free_pages = node_page_state(numa_node_id(), + NR_FREE_PAGES); + max = node_free_pages / FRACTION_OF_NODE_MEM; + return max(max, min_pages); +} + +static long min_pages_to_free(struct quicklist *q, + unsigned long min_pages, long max_free) +{ + long pages_to_free; + + pages_to_free = q->nr_pages - max_pages(min_pages); + + return min(pages_to_free, max_free); +} + +/* + * Trim down the number of pages in the quicklist + */ +void quicklist_trim(int nr, void (*dtor)(void *), + unsigned long min_pages, unsigned long max_free) +{ + long pages_to_free; + struct quicklist *q; + + q = &get_cpu_var(quicklist)[nr]; + if (q->nr_pages > min_pages) { + pages_to_free = min_pages_to_free(q, min_pages, max_free); + + while (pages_to_free > 0) { + /* + * We pass a gfp_t of 0 to quicklist_alloc here + * because we will never call into the page allocator. 
+ */ + void *p = quicklist_alloc(nr, 0, NULL); + + if (dtor) + dtor(p); + free_page((unsigned long)p); + pages_to_free--; + } + } + put_cpu_var(quicklist); +} + +unsigned long quicklist_total_size(void) +{ + unsigned long count = 0; + int cpu; + struct quicklist *ql, *q; + + for_each_online_cpu(cpu) { + ql = per_cpu(quicklist, cpu); + for (q = ql; q < ql + CONFIG_NR_QUICK; q++) + count += q->nr_pages; + } + return count; +} + diff --git a/mm/readahead.c b/mm/readahead.c index 93d9ee692fd..9861e883fe5 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -37,7 +37,7 @@ void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) { ra->ra_pages = mapping->backing_dev_info->ra_pages; - ra->prev_page = -1; + ra->prev_index = -1; } EXPORT_SYMBOL_GPL(file_ra_state_init); @@ -202,17 +202,19 @@ out: * size: Number of pages in that read * Together, these form the "current window". * Together, start and size represent the `readahead window'. - * prev_page: The page which the readahead algorithm most-recently inspected. + * prev_index: The page which the readahead algorithm most-recently inspected. * It is mainly used to detect sequential file reading. * If page_cache_readahead sees that it is again being called for * a page which it just looked at, it can return immediately without * making any state changes. + * offset: Offset in the prev_index where the last read ended - used for + * detection of sequential file reading. * ahead_start, * ahead_size: Together, these form the "ahead window". * ra_pages: The externally controlled max readahead for this fd. * * When readahead is in the off state (size == 0), readahead is disabled. - * In this state, prev_page is used to detect the resumption of sequential I/O. + * In this state, prev_index is used to detect the resumption of sequential I/O. * * The readahead code manages two windows - the "current" and the "ahead" * windows. 
The intent is that while the application is walking the pages @@ -415,7 +417,7 @@ static int make_ahead_window(struct address_space *mapping, struct file *filp, ra->ahead_size = get_next_ra_size(ra); ra->ahead_start = ra->start + ra->size; - block = force || (ra->prev_page >= ra->ahead_start); + block = force || (ra->prev_index >= ra->ahead_start); ret = blockable_page_cache_readahead(mapping, filp, ra->ahead_start, ra->ahead_size, ra, block); @@ -467,12 +469,13 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra, * We avoid doing extra work and bogusly perturbing the readahead * window expansion logic. */ - if (offset == ra->prev_page && --req_size) + if (offset == ra->prev_index && --req_size) ++offset; - /* Note that prev_page == -1 if it is a first read */ - sequential = (offset == ra->prev_page + 1); - ra->prev_page = offset; + /* Note that prev_index == -1 if it is a first read */ + sequential = (offset == ra->prev_index + 1); + ra->prev_index = offset; + ra->prev_offset = 0; max = get_max_readahead(ra); newsize = min(req_size, max); @@ -481,7 +484,7 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra, if (newsize == 0 || (ra->flags & RA_FLAG_INCACHE)) goto out; - ra->prev_page += newsize - 1; + ra->prev_index += newsize - 1; /* * Special case - first read at start of file. We'll assume it's @@ -537,18 +540,18 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra, * we get called back on the first page of the ahead window which * will allow us to submit more IO. 
*/ - if (ra->prev_page >= ra->ahead_start) { + if (ra->prev_index >= ra->ahead_start) { ra->start = ra->ahead_start; ra->size = ra->ahead_size; make_ahead_window(mapping, filp, ra, 0); recheck: - /* prev_page shouldn't overrun the ahead window */ - ra->prev_page = min(ra->prev_page, + /* prev_index shouldn't overrun the ahead window */ + ra->prev_index = min(ra->prev_index, ra->ahead_start + ra->ahead_size - 1); } out: - return ra->prev_page + 1; + return ra->prev_index + 1; } EXPORT_SYMBOL_GPL(page_cache_readahead); diff --git a/mm/rmap.c b/mm/rmap.c index 22ed3f71a67..304f51985c7 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -162,8 +162,7 @@ void anon_vma_unlink(struct vm_area_struct *vma) static void anon_vma_ctor(void *data, struct kmem_cache *cachep, unsigned long flags) { - if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == - SLAB_CTOR_CONSTRUCTOR) { + if (flags & SLAB_CTOR_CONSTRUCTOR) { struct anon_vma *anon_vma = data; spin_lock_init(&anon_vma->lock); @@ -498,12 +497,15 @@ int page_mkclean(struct page *page) struct address_space *mapping = page_mapping(page); if (mapping) ret = page_mkclean_file(mapping, page); + if (page_test_dirty(page)) { + page_clear_dirty(page); + ret = 1; + } } - if (page_test_and_clear_dirty(page)) - ret = 1; return ret; } +EXPORT_SYMBOL_GPL(page_mkclean); /** * page_set_anon_rmap - setup new anonymous rmap @@ -605,8 +607,10 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma) * Leaving it set also helps swapoff to reinstate ptes * faster for those pages still in swapcache. */ - if (page_test_and_clear_dirty(page)) + if (page_test_dirty(page)) { + page_clear_dirty(page); set_page_dirty(page); + } __dec_zone_page_state(page, PageAnon(page) ? 
NR_ANON_PAGES : NR_FILE_MAPPED); } diff --git a/mm/shmem.c b/mm/shmem.c index b8c429a2d27..f01e8deed64 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -402,26 +402,38 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long /* * shmem_free_swp - free some swap entries in a directory * - * @dir: pointer to the directory - * @edir: pointer after last entry of the directory + * @dir: pointer to the directory + * @edir: pointer after last entry of the directory + * @punch_lock: pointer to spinlock when needed for the holepunch case */ -static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir) +static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir, + spinlock_t *punch_lock) { + spinlock_t *punch_unlock = NULL; swp_entry_t *ptr; int freed = 0; for (ptr = dir; ptr < edir; ptr++) { if (ptr->val) { + if (unlikely(punch_lock)) { + punch_unlock = punch_lock; + punch_lock = NULL; + spin_lock(punch_unlock); + if (!ptr->val) + continue; + } free_swap_and_cache(*ptr); *ptr = (swp_entry_t){0}; freed++; } } + if (punch_unlock) + spin_unlock(punch_unlock); return freed; } -static int shmem_map_and_free_swp(struct page *subdir, - int offset, int limit, struct page ***dir) +static int shmem_map_and_free_swp(struct page *subdir, int offset, + int limit, struct page ***dir, spinlock_t *punch_lock) { swp_entry_t *ptr; int freed = 0; @@ -431,7 +443,8 @@ static int shmem_map_and_free_swp(struct page *subdir, int size = limit - offset; if (size > LATENCY_LIMIT) size = LATENCY_LIMIT; - freed += shmem_free_swp(ptr+offset, ptr+offset+size); + freed += shmem_free_swp(ptr+offset, ptr+offset+size, + punch_lock); if (need_resched()) { shmem_swp_unmap(ptr); if (*dir) { @@ -481,7 +494,10 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) long nr_swaps_freed = 0; int offset; int freed; - int punch_hole = 0; + int punch_hole; + spinlock_t *needs_lock; + spinlock_t *punch_lock; + unsigned long upper_limit; inode->i_ctime = 
inode->i_mtime = CURRENT_TIME; idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; @@ -492,11 +508,20 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) info->flags |= SHMEM_TRUNCATE; if (likely(end == (loff_t) -1)) { limit = info->next_index; + upper_limit = SHMEM_MAX_INDEX; info->next_index = idx; + needs_lock = NULL; + punch_hole = 0; } else { - limit = (end + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (limit > info->next_index) - limit = info->next_index; + if (end + 1 >= inode->i_size) { /* we may free a little more */ + limit = (inode->i_size + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + upper_limit = SHMEM_MAX_INDEX; + } else { + limit = (end + 1) >> PAGE_CACHE_SHIFT; + upper_limit = limit; + } + needs_lock = &info->lock; punch_hole = 1; } @@ -513,17 +538,30 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) size = limit; if (size > SHMEM_NR_DIRECT) size = SHMEM_NR_DIRECT; - nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size); + nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size, needs_lock); } /* * If there are no indirect blocks or we are punching a hole * below indirect blocks, nothing to be done. */ - if (!topdir || (punch_hole && (limit <= SHMEM_NR_DIRECT))) + if (!topdir || limit <= SHMEM_NR_DIRECT) goto done2; - BUG_ON(limit <= SHMEM_NR_DIRECT); + /* + * The truncation case has already dropped info->lock, and we're safe + * because i_size and next_index have already been lowered, preventing + * access beyond. But in the punch_hole case, we still need to take + * the lock when updating the swap directory, because there might be + * racing accesses by shmem_getpage(SGP_CACHE), shmem_unuse_inode or + * shmem_writepage. 
However, whenever we find we can remove a whole + * directory page (not at the misaligned start or end of the range), + * we first NULLify its pointer in the level above, and then have no + * need to take the lock when updating its contents: needs_lock and + * punch_lock (either pointing to info->lock or NULL) manage this. + */ + + upper_limit -= SHMEM_NR_DIRECT; limit -= SHMEM_NR_DIRECT; idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0; offset = idx % ENTRIES_PER_PAGE; @@ -543,8 +581,14 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) if (*dir) { diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) % ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE; - if (!diroff && !offset) { - *dir = NULL; + if (!diroff && !offset && upper_limit >= stage) { + if (needs_lock) { + spin_lock(needs_lock); + *dir = NULL; + spin_unlock(needs_lock); + needs_lock = NULL; + } else + *dir = NULL; nr_pages_to_free++; list_add(&middir->lru, &pages_to_free); } @@ -570,39 +614,55 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) } stage = idx + ENTRIES_PER_PAGEPAGE; middir = *dir; - *dir = NULL; - nr_pages_to_free++; - list_add(&middir->lru, &pages_to_free); + if (punch_hole) + needs_lock = &info->lock; + if (upper_limit >= stage) { + if (needs_lock) { + spin_lock(needs_lock); + *dir = NULL; + spin_unlock(needs_lock); + needs_lock = NULL; + } else + *dir = NULL; + nr_pages_to_free++; + list_add(&middir->lru, &pages_to_free); + } shmem_dir_unmap(dir); cond_resched(); dir = shmem_dir_map(middir); diroff = 0; } + punch_lock = needs_lock; subdir = dir[diroff]; - if (subdir && page_private(subdir)) { + if (subdir && !offset && upper_limit-idx >= ENTRIES_PER_PAGE) { + if (needs_lock) { + spin_lock(needs_lock); + dir[diroff] = NULL; + spin_unlock(needs_lock); + punch_lock = NULL; + } else + dir[diroff] = NULL; + nr_pages_to_free++; + list_add(&subdir->lru, &pages_to_free); + } + if (subdir && page_private(subdir) /* has swap entries */) { size 
= limit - idx; if (size > ENTRIES_PER_PAGE) size = ENTRIES_PER_PAGE; freed = shmem_map_and_free_swp(subdir, - offset, size, &dir); + offset, size, &dir, punch_lock); if (!dir) dir = shmem_dir_map(middir); nr_swaps_freed += freed; - if (offset) + if (offset || punch_lock) { spin_lock(&info->lock); - set_page_private(subdir, page_private(subdir) - freed); - if (offset) + set_page_private(subdir, + page_private(subdir) - freed); spin_unlock(&info->lock); - if (!punch_hole) - BUG_ON(page_private(subdir) > offset); - } - if (offset) - offset = 0; - else if (subdir && !page_private(subdir)) { - dir[diroff] = NULL; - nr_pages_to_free++; - list_add(&subdir->lru, &pages_to_free); + } else + BUG_ON(page_private(subdir) != freed); } + offset = 0; } done1: shmem_dir_unmap(dir); @@ -614,8 +674,16 @@ done2: * generic_delete_inode did it, before we lowered next_index. * Also, though shmem_getpage checks i_size before adding to * cache, no recheck after: so fix the narrow window there too. + * + * Recalling truncate_inode_pages_range and unmap_mapping_range + * every time for punch_hole (which never got a chance to clear + * SHMEM_PAGEIN at the start of vmtruncate_range) is expensive, + * yet hardly ever necessary: try to optimize them out later. 
*/ truncate_inode_pages_range(inode->i_mapping, start, end); + if (punch_hole) + unmap_mapping_range(inode->i_mapping, start, + end - start, 1); } spin_lock(&info->lock); @@ -2290,8 +2358,7 @@ static void init_once(void *foo, struct kmem_cache *cachep, { struct shmem_inode_info *p = (struct shmem_inode_info *) foo; - if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == - SLAB_CTOR_CONSTRUCTOR) { + if (flags & SLAB_CTOR_CONSTRUCTOR) { inode_init_once(&p->vfs_inode); #ifdef CONFIG_TMPFS_POSIX_ACL p->i_acl = NULL; diff --git a/mm/slab.c b/mm/slab.c index 57f7aa42006..944b20581f8 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -116,8 +116,7 @@ #include <asm/page.h> /* - * DEBUG - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL, - * SLAB_RED_ZONE & SLAB_POISON. + * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. * 0 for faster, smaller code (especially in the critical paths). * * STATS - 1 to collect stats for /proc/slabinfo. @@ -149,10 +148,11 @@ * Usually, the kmalloc caches are cache_line_size() aligned, except when * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned. * Some archs want to perform DMA into kmalloc caches and need a guaranteed - * alignment larger than BYTES_PER_WORD. ARCH_KMALLOC_MINALIGN allows that. - * Note that this flag disables some debug features. + * alignment larger than the alignment of a 64-bit integer. + * ARCH_KMALLOC_MINALIGN allows that. + * Note that increasing this value may disable some debug features. */ -#define ARCH_KMALLOC_MINALIGN 0 +#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) #endif #ifndef ARCH_SLAB_MINALIGN @@ -172,15 +172,15 @@ /* Legal flag mask for kmem_cache_create(). 
*/ #if DEBUG -# define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \ +# define CREATE_MASK (SLAB_RED_ZONE | \ SLAB_POISON | SLAB_HWCACHE_ALIGN | \ SLAB_CACHE_DMA | \ - SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \ + SLAB_STORE_USER | \ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD) #else # define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ - SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \ + SLAB_CACHE_DMA | \ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD) #endif @@ -389,7 +389,6 @@ struct kmem_cache { unsigned int buffer_size; u32 reciprocal_buffer_size; /* 3) touched by every alloc & free from the backend */ - struct kmem_list3 *nodelists[MAX_NUMNODES]; unsigned int flags; /* constant flags */ unsigned int num; /* # of objs per slab */ @@ -444,6 +443,17 @@ struct kmem_cache { int obj_offset; int obj_size; #endif + /* + * We put nodelists[] at the end of kmem_cache, because we want to size + * this array to nr_node_ids slots instead of MAX_NUMNODES + * (see kmem_cache_init()) + * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache + * is statically defined, so we reserve the max number of nodes. 
+ */ + struct kmem_list3 *nodelists[MAX_NUMNODES]; + /* + * Do not add fields after nodelists[] + */ }; #define CFLGS_OFF_SLAB (0x80000000UL) @@ -527,19 +537,22 @@ static int obj_size(struct kmem_cache *cachep) return cachep->obj_size; } -static unsigned long *dbg_redzone1(struct kmem_cache *cachep, void *objp) +static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp) { BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); - return (unsigned long*) (objp+obj_offset(cachep)-BYTES_PER_WORD); + return (unsigned long long*) (objp + obj_offset(cachep) - + sizeof(unsigned long long)); } -static unsigned long *dbg_redzone2(struct kmem_cache *cachep, void *objp) +static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp) { BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); if (cachep->flags & SLAB_STORE_USER) - return (unsigned long *)(objp + cachep->buffer_size - - 2 * BYTES_PER_WORD); - return (unsigned long *)(objp + cachep->buffer_size - BYTES_PER_WORD); + return (unsigned long long *)(objp + cachep->buffer_size - + sizeof(unsigned long long) - + BYTES_PER_WORD); + return (unsigned long long *) (objp + cachep->buffer_size - + sizeof(unsigned long long)); } static void **dbg_userword(struct kmem_cache *cachep, void *objp) @@ -552,8 +565,8 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) #define obj_offset(x) 0 #define obj_size(cachep) (cachep->buffer_size) -#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long *)NULL;}) -#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long *)NULL;}) +#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) +#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) #define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) #endif @@ -592,8 +605,7 @@ static inline void page_set_cache(struct page *page, struct kmem_cache *cache) static inline struct kmem_cache *page_get_cache(struct page *page) { - if (unlikely(PageCompound(page))) - page = (struct page 
*)page_private(page); + page = compound_head(page); BUG_ON(!PageSlab(page)); return (struct kmem_cache *)page->lru.next; } @@ -605,21 +617,19 @@ static inline void page_set_slab(struct page *page, struct slab *slab) static inline struct slab *page_get_slab(struct page *page) { - if (unlikely(PageCompound(page))) - page = (struct page *)page_private(page); BUG_ON(!PageSlab(page)); return (struct slab *)page->lru.prev; } static inline struct kmem_cache *virt_to_cache(const void *obj) { - struct page *page = virt_to_page(obj); + struct page *page = virt_to_head_page(obj); return page_get_cache(page); } static inline struct slab *virt_to_slab(const void *obj) { - struct page *page = virt_to_page(obj); + struct page *page = virt_to_head_page(obj); return page_get_slab(page); } @@ -678,9 +688,6 @@ static struct kmem_cache cache_cache = { .shared = 1, .buffer_size = sizeof(struct kmem_cache), .name = "kmem_cache", -#if DEBUG - .obj_size = sizeof(struct kmem_cache), -#endif }; #define BAD_ALIEN_MAGIC 0x01020304ul @@ -921,12 +928,6 @@ static void next_reap_node(void) { int node = __get_cpu_var(reap_node); - /* - * Also drain per cpu pages on remote zones - */ - if (node != numa_node_id()) - drain_node_pages(node); - node = next_node(node, node_online_map); if (unlikely(node >= MAX_NUMNODES)) node = first_node(node_online_map); @@ -1146,7 +1147,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) * Make sure we are not freeing a object from another node to the array * cache on this cpu. 
*/ - if (likely(slabp->nodeid == node) || unlikely(!use_alien_caches)) + if (likely(slabp->nodeid == node)) return 0; l3 = cachep->nodelists[node]; @@ -1179,8 +1180,11 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, int memsize = sizeof(struct kmem_list3); switch (action) { - case CPU_UP_PREPARE: + case CPU_LOCK_ACQUIRE: mutex_lock(&cache_chain_mutex); + break; + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: /* * We need to do this right in the beginning since * alloc_arraycache's are going to use this list. @@ -1223,19 +1227,20 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, */ list_for_each_entry(cachep, &cache_chain, next) { struct array_cache *nc; - struct array_cache *shared; + struct array_cache *shared = NULL; struct array_cache **alien = NULL; nc = alloc_arraycache(node, cachep->limit, cachep->batchcount); if (!nc) goto bad; - shared = alloc_arraycache(node, + if (cachep->shared) { + shared = alloc_arraycache(node, cachep->shared * cachep->batchcount, 0xbaadf00d); - if (!shared) - goto bad; - + if (!shared) + goto bad; + } if (use_alien_caches) { alien = alloc_alien_cache(node, cachep->limit); if (!alien) @@ -1266,17 +1271,28 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, } break; case CPU_ONLINE: - mutex_unlock(&cache_chain_mutex); + case CPU_ONLINE_FROZEN: start_cpu_timer(cpu); break; #ifdef CONFIG_HOTPLUG_CPU - case CPU_DOWN_PREPARE: - mutex_lock(&cache_chain_mutex); - break; - case CPU_DOWN_FAILED: - mutex_unlock(&cache_chain_mutex); - break; + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + /* + * Shutdown cache reaper. Note that the cache_chain_mutex is + * held so that if cache_reap() is invoked it cannot do + * anything expensive but will only modify reap_work + * and reschedule the timer. + */ + cancel_rearming_delayed_work(&per_cpu(reap_work, cpu)); + /* Now the cache_reaper is guaranteed to be not running. 
*/ + per_cpu(reap_work, cpu).work.func = NULL; + break; + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + start_cpu_timer(cpu); + break; case CPU_DEAD: + case CPU_DEAD_FROZEN: /* * Even if all the cpus of a node are down, we don't free the * kmem_list3 of any cache. This to avoid a race between @@ -1288,6 +1304,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, /* fall thru */ #endif case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: list_for_each_entry(cachep, &cache_chain, next) { struct array_cache *nc; struct array_cache *shared; @@ -1317,8 +1334,8 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, shared = l3->shared; if (shared) { - free_block(cachep, l3->shared->entry, - l3->shared->avail, node); + free_block(cachep, shared->entry, + shared->avail, node); l3->shared = NULL; } @@ -1346,6 +1363,8 @@ free_array_cache: continue; drain_freelist(cachep, l3, l3->free_objects); } + break; + case CPU_LOCK_RELEASE: mutex_unlock(&cache_chain_mutex); break; } @@ -1394,6 +1413,9 @@ void __init kmem_cache_init(void) int order; int node; + if (num_possible_nodes() == 1) + use_alien_caches = 0; + for (i = 0; i < NUM_INIT_LISTS; i++) { kmem_list3_init(&initkmem_list3[i]); if (i < MAX_NUMNODES) @@ -1436,6 +1458,15 @@ void __init kmem_cache_init(void) cache_cache.array[smp_processor_id()] = &initarray_cache.cache; cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE]; + /* + * struct kmem_cache size depends on nr_node_ids, which + * can be less than MAX_NUMNODES. 
+ */ + cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) + + nr_node_ids * sizeof(struct kmem_list3 *); +#if DEBUG + cache_cache.obj_size = cache_cache.buffer_size; +#endif cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, cache_line_size()); cache_cache.reciprocal_buffer_size = @@ -1760,7 +1791,7 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) char *realobj; if (cachep->flags & SLAB_RED_ZONE) { - printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n", + printk(KERN_ERR "Redzone: 0x%llx/0x%llx.\n", *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); } @@ -1802,8 +1833,8 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) /* Print header */ if (lines == 0) { printk(KERN_ERR - "Slab corruption: start=%p, len=%d\n", - realobj, size); + "Slab corruption: %s start=%p, len=%d\n", + cachep->name, realobj, size); print_objinfo(cachep, objp, 0); } /* Hexdump the affected line */ @@ -1929,7 +1960,7 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) * For setting up all the kmem_list3s for cache whose buffer_size is same as * size of kmem_list3. 
*/ -static void set_up_list3s(struct kmem_cache *cachep, int index) +static void __init set_up_list3s(struct kmem_cache *cachep, int index) { int node; @@ -2151,13 +2182,15 @@ kmem_cache_create (const char *name, size_t size, size_t align, */ res = probe_kernel_address(pc->name, tmp); if (res) { - printk("SLAB: cache with size %d has lost its name\n", + printk(KERN_ERR + "SLAB: cache with size %d has lost its name\n", pc->buffer_size); continue; } if (!strcmp(pc->name, name)) { - printk("kmem_cache_create: duplicate cache %s\n", name); + printk(KERN_ERR + "kmem_cache_create: duplicate cache %s\n", name); dump_stack(); goto oops; } @@ -2165,12 +2198,6 @@ kmem_cache_create (const char *name, size_t size, size_t align, #if DEBUG WARN_ON(strchr(name, ' ')); /* It confuses parsers */ - if ((flags & SLAB_DEBUG_INITIAL) && !ctor) { - /* No constructor, but inital state check requested */ - printk(KERN_ERR "%s: No con, but init state check " - "requested - %s\n", __FUNCTION__, name); - flags &= ~SLAB_DEBUG_INITIAL; - } #if FORCED_DEBUG /* * Enable redzoning and last user accounting, except for caches with @@ -2227,7 +2254,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, * is greater than BYTES_PER_WORD. */ if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER) - ralign = BYTES_PER_WORD; + ralign = __alignof__(unsigned long long); /* 2) arch mandated alignment */ if (ralign < ARCH_SLAB_MINALIGN) { @@ -2238,7 +2265,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, ralign = align; } /* disable debug if necessary */ - if (ralign > BYTES_PER_WORD) + if (ralign > __alignof__(unsigned long long)) flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); /* * 4) Store it. 
@@ -2259,8 +2286,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, */ if (flags & SLAB_RED_ZONE) { /* add space for red zone words */ - cachep->obj_offset += BYTES_PER_WORD; - size += 2 * BYTES_PER_WORD; + cachep->obj_offset += sizeof(unsigned long long); + size += 2 * sizeof(unsigned long long); } if (flags & SLAB_STORE_USER) { /* user store requires one word storage behind the end of @@ -2294,7 +2321,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, left_over = calculate_slab_order(cachep, size, align, flags); if (!cachep->num) { - printk("kmem_cache_create: couldn't create cache %s.\n", name); + printk(KERN_ERR + "kmem_cache_create: couldn't create cache %s.\n", name); kmem_cache_free(&cache_cache, cachep); cachep = NULL; goto oops; @@ -2733,19 +2761,10 @@ static int cache_grow(struct kmem_cache *cachep, * Be lazy and only check for valid flags here, keeping it out of the * critical path in kmem_cache_alloc(). */ - BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK | __GFP_NO_GROW)); - if (flags & __GFP_NO_GROW) - return 0; + BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK)); ctor_flags = SLAB_CTOR_CONSTRUCTOR; local_flags = (flags & GFP_LEVEL_MASK); - if (!(local_flags & __GFP_WAIT)) - /* - * Not allowed to sleep. Need to tell a constructor about - * this - it might need to know... 
- */ - ctor_flags |= SLAB_CTOR_ATOMIC; - /* Take the l3 list lock to change the colour_next on this node */ check_irq_off(); l3 = cachep->nodelists[nodeid]; @@ -2829,7 +2848,7 @@ static void kfree_debugcheck(const void *objp) static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) { - unsigned long redzone1, redzone2; + unsigned long long redzone1, redzone2; redzone1 = *dbg_redzone1(cache, obj); redzone2 = *dbg_redzone2(cache, obj); @@ -2845,7 +2864,7 @@ static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) else slab_error(cache, "memory outside object was overwritten"); - printk(KERN_ERR "%p: redzone 1:0x%lx, redzone 2:0x%lx.\n", + printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx.\n", obj, redzone1, redzone2); } @@ -2858,7 +2877,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, objp -= obj_offset(cachep); kfree_debugcheck(objp); - page = virt_to_page(objp); + page = virt_to_head_page(objp); slabp = page_get_slab(page); @@ -2875,15 +2894,6 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, BUG_ON(objnr >= cachep->num); BUG_ON(objp != index_to_obj(cachep, slabp, objnr)); - if (cachep->flags & SLAB_DEBUG_INITIAL) { - /* - * Need to call the slab's constructor so the caller can - * perform a verify of its state (debugging). Called without - * the cache-lock held. - */ - cachep->ctor(objp + obj_offset(cachep), - cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY); - } if (cachep->flags & SLAB_POISON && cachep->dtor) { /* we want to cache poison the object, * call the destruction callback @@ -2987,6 +2997,14 @@ retry: slabp = list_entry(entry, struct slab, list); check_slabp(cachep, slabp); check_spinlock_acquired(cachep); + + /* + * The slab was either on partial or free list so + * there must be at least one object available for + * allocation. 
+ */ + BUG_ON(slabp->inuse < 0 || slabp->inuse >= cachep->num); + while (slabp->inuse < cachep->num && batchcount--) { STATS_INC_ALLOCED(cachep); STATS_INC_ACTIVE(cachep); @@ -3062,7 +3080,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, slab_error(cachep, "double free, or memory outside" " object was overwritten"); printk(KERN_ERR - "%p: redzone 1:0x%lx, redzone 2:0x%lx\n", + "%p: redzone 1:0x%llx, redzone 2:0x%llx\n", objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); } @@ -3074,20 +3092,14 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, struct slab *slabp; unsigned objnr; - slabp = page_get_slab(virt_to_page(objp)); + slabp = page_get_slab(virt_to_head_page(objp)); objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE; } #endif objp += obj_offset(cachep); - if (cachep->ctor && cachep->flags & SLAB_POISON) { - unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; - - if (!(flags & __GFP_WAIT)) - ctor_flags |= SLAB_CTOR_ATOMIC; - - cachep->ctor(objp, cachep, ctor_flags); - } + if (cachep->ctor && cachep->flags & SLAB_POISON) + cachep->ctor(objp, cachep, SLAB_CTOR_CONSTRUCTOR); #if ARCH_SLAB_MINALIGN if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", @@ -3142,7 +3154,7 @@ static int __init failslab_debugfs(void) struct dentry *dir; int err; - err = init_fault_attr_dentries(&failslab.attr, "failslab"); + err = init_fault_attr_dentries(&failslab.attr, "failslab"); if (err) return err; dir = failslab.attr.dentries.dir; @@ -3180,9 +3192,6 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) check_irq_off(); - if (should_failslab(cachep, flags)) - return NULL; - ac = cpu_cache_get(cachep); if (likely(ac->avail)) { STATS_INC_ALLOCHIT(cachep); @@ -3256,7 +3265,7 @@ retry: flags | GFP_THISNODE, nid); } - if (!obj && !(flags & __GFP_NO_GROW)) { + if (!obj) { /* * This allocation will be 
performed within the constraints * of the current cpuset / memory policy requirements. @@ -3374,6 +3383,9 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, unsigned long save_flags; void *ptr; + if (should_failslab(cachep, flags)) + return NULL; + cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); @@ -3444,6 +3456,9 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) unsigned long save_flags; void *objp; + if (should_failslab(cachep, flags)) + return NULL; + cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); objp = __do_cache_alloc(cachep, flags); @@ -3563,7 +3578,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) check_irq_off(); objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); - if (cache_free_alien(cachep, objp)) + if (use_alien_caches && cache_free_alien(cachep, objp)) return; if (likely(ac->avail < ac->limit)) { @@ -3737,6 +3752,52 @@ EXPORT_SYMBOL(__kmalloc); #endif /** + * krealloc - reallocate memory. The contents will remain unchanged. + * @p: object to reallocate memory for. + * @new_size: how many bytes of memory are required. + * @flags: the type of memory to allocate. + * + * The contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. If @p is %NULL, krealloc() + * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a + * %NULL pointer, the object pointed to is freed. + */ +void *krealloc(const void *p, size_t new_size, gfp_t flags) +{ + struct kmem_cache *cache, *new_cache; + void *ret; + + if (unlikely(!p)) + return kmalloc_track_caller(new_size, flags); + + if (unlikely(!new_size)) { + kfree(p); + return NULL; + } + + cache = virt_to_cache(p); + new_cache = __find_general_cachep(new_size, flags); + + /* + * If new size fits in the current cache, bail out.
+ */ + if (likely(cache == new_cache)) + return (void *)p; + + /* + * We are on the slow-path here so do not use __cache_alloc + * because it bloats kernel text. + */ + ret = kmalloc_track_caller(new_size, flags); + if (ret) { + memcpy(ret, p, min(new_size, ksize(p))); + kfree(p); + } + return ret; +} +EXPORT_SYMBOL(krealloc); + +/** * kmem_cache_free - Deallocate an object * @cachep: The cache the allocation was from. * @objp: The previously allocated object. @@ -3812,12 +3873,15 @@ static int alloc_kmemlist(struct kmem_cache *cachep) goto fail; } - new_shared = alloc_arraycache(node, + new_shared = NULL; + if (cachep->shared) { + new_shared = alloc_arraycache(node, cachep->shared*cachep->batchcount, 0xbaadf00d); - if (!new_shared) { - free_alien_cache(new_alien); - goto fail; + if (!new_shared) { + free_alien_cache(new_alien); + goto fail; + } } l3 = cachep->nodelists[node]; @@ -3975,10 +4039,8 @@ static int enable_cpucache(struct kmem_cache *cachep) * to a larger limit. Thus disabled by default. 
*/ shared = 0; -#ifdef CONFIG_SMP - if (cachep->buffer_size <= PAGE_SIZE) + if (cachep->buffer_size <= PAGE_SIZE && num_possible_cpus() > 1) shared = 8; -#endif #if DEBUG /* @@ -4088,7 +4150,6 @@ next: check_irq_on(); mutex_unlock(&cache_chain_mutex); next_reap_node(); - refresh_cpu_vm_stats(smp_processor_id()); out: /* Set up the next iteration */ schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC)); @@ -4380,16 +4441,12 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s) static void show_symbol(struct seq_file *m, unsigned long address) { #ifdef CONFIG_KALLSYMS - char *modname; - const char *name; unsigned long offset, size; - char namebuf[KSYM_NAME_LEN+1]; - - name = kallsyms_lookup(address, &size, &offset, &modname, namebuf); + char modname[MODULE_NAME_LEN + 1], name[KSYM_NAME_LEN + 1]; - if (name) { + if (lookup_symbol_attrs(address, &size, &offset, modname, name) == 0) { seq_printf(m, "%s+%#lx/%#lx", name, offset, size); - if (modname) + if (modname[0]) seq_printf(m, " [%s]", modname); return; } @@ -4478,7 +4535,7 @@ const struct seq_operations slabstats_op = { * allocated with either kmalloc() or kmem_cache_alloc(). The object * must not be freed during the duration of the call. */ -unsigned int ksize(const void *objp) +size_t ksize(const void *objp) { if (unlikely(objp == NULL)) return 0; diff --git a/mm/slob.c b/mm/slob.c index 5adc29cb58d..c6933bc19bc 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -21,7 +21,7 @@ * * SLAB is emulated on top of SLOB by simply calling constructors and * destructors for every SLAB allocation. Objects are returned with - * the 8-byte alignment unless the SLAB_MUST_HWCACHE_ALIGN flag is + * the 8-byte alignment unless the SLAB_HWCACHE_ALIGN flag is * set, in which case the low-level allocator will fragment blocks to * create the proper alignment. Again, objects of page-size or greater * are allocated by calling __get_free_pages. 
As SLAB objects know @@ -150,15 +150,6 @@ static void slob_free(void *block, int size) spin_unlock_irqrestore(&slob_lock, flags); } -static int FASTCALL(find_order(int size)); -static int fastcall find_order(int size) -{ - int order = 0; - for ( ; size > 4096 ; size >>=1) - order++; - return order; -} - void *__kmalloc(size_t size, gfp_t gfp) { slob_t *m; @@ -174,7 +165,7 @@ void *__kmalloc(size_t size, gfp_t gfp) if (!bb) return 0; - bb->order = find_order(size); + bb->order = get_order(size); bb->pages = (void *)__get_free_pages(gfp, bb->order); if (bb->pages) { @@ -190,6 +181,39 @@ void *__kmalloc(size_t size, gfp_t gfp) } EXPORT_SYMBOL(__kmalloc); +/** + * krealloc - reallocate memory. The contents will remain unchanged. + * + * @p: object to reallocate memory for. + * @new_size: how many bytes of memory are required. + * @flags: the type of memory to allocate. + * + * The contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. If @p is %NULL, krealloc() + * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a + * %NULL pointer, the object pointed to is freed. + */ +void *krealloc(const void *p, size_t new_size, gfp_t flags) +{ + void *ret; + + if (unlikely(!p)) + return kmalloc_track_caller(new_size, flags); + + if (unlikely(!new_size)) { + kfree(p); + return NULL; + } + + ret = kmalloc_track_caller(new_size, flags); + if (ret) { + memcpy(ret, p, min(new_size, ksize(p))); + kfree(p); + } + return ret; +} +EXPORT_SYMBOL(krealloc); + void kfree(const void *block) { bigblock_t *bb, **last = &bigblocks; @@ -219,7 +243,7 @@ void kfree(const void *block) EXPORT_SYMBOL(kfree); -unsigned int ksize(const void *block) +size_t ksize(const void *block) { bigblock_t *bb; unsigned long flags; @@ -262,10 +286,11 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, c->ctor = ctor; c->dtor = dtor; /* ignore alignment unless it's forced */ - c->align = (flags & SLAB_MUST_HWCACHE_ALIGN) ?
SLOB_ALIGN : 0; + c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0; if (c->align < align) c->align = align; - } + } else if (flags & SLAB_PANIC) + panic("Cannot create slab cache %s\n", name); return c; } @@ -284,7 +309,7 @@ void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags) if (c->size < PAGE_SIZE) b = slob_alloc(c->size, flags, c->align); else - b = (void *)__get_free_pages(flags, find_order(c->size)); + b = (void *)__get_free_pages(flags, get_order(c->size)); if (c->ctor) c->ctor(b, c, SLAB_CTOR_CONSTRUCTOR); @@ -311,7 +336,7 @@ void kmem_cache_free(struct kmem_cache *c, void *b) if (c->size < PAGE_SIZE) slob_free(b, c->size); else - free_pages((unsigned long)b, find_order(c->size)); + free_pages((unsigned long)b, get_order(c->size)); } EXPORT_SYMBOL(kmem_cache_free); diff --git a/mm/slub.c b/mm/slub.c new file mode 100644 index 00000000000..b39c8a69a4f --- /dev/null +++ b/mm/slub.c @@ -0,0 +1,3669 @@ +/* + * SLUB: A slab allocator that limits cache line use instead of queuing + * objects in per cpu and per node lists. + * + * The allocator synchronizes using per slab locks and only + * uses a centralized lock to manage a pool of partial slabs. + * + * (C) 2007 SGI, Christoph Lameter <clameter@sgi.com> + */ + +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/bit_spinlock.h> +#include <linux/interrupt.h> +#include <linux/bitops.h> +#include <linux/slab.h> +#include <linux/seq_file.h> +#include <linux/cpu.h> +#include <linux/cpuset.h> +#include <linux/mempolicy.h> +#include <linux/ctype.h> +#include <linux/kallsyms.h> + +/* + * Lock order: + * 1. slab_lock(page) + * 2. slab->list_lock + * + * The slab_lock protects operations on the object of a particular + * slab and its metadata in the page struct. 
If the slab lock + * has been taken then no allocations nor frees can be performed + * on the objects in the slab nor can the slab be added or removed + * from the partial or full lists since this would mean modifying + * the page_struct of the slab. + * + * The list_lock protects the partial and full list on each node and + * the partial slab counter. If taken then no new slabs may be added or + * removed from the lists nor may the number of partial slabs be modified. + * (Note that the total number of slabs is an atomic value that may be + * modified without taking the list lock). + * + * The list_lock is a centralized lock and thus we avoid taking it as + * much as possible. As long as SLUB does not have to handle partial + * slabs, operations can continue without any centralized lock. F.e. + * allocating a long series of objects that fill up slabs does not require + * the list lock. + * + * The lock order is sometimes inverted when we are trying to get a slab + * off a list. We take the list_lock and then look for a page on the list + * to use. While we do that objects in the slabs may be freed. We can + * only operate on the slab if we have also taken the slab_lock. So we use + * a slab_trylock() on the slab. If trylock was successful then no frees + * can occur anymore and we can use the slab for allocations etc. If the + * slab_trylock() does not succeed then frees are in progress in the slab and + * we must stay away from it for a while since we may cause a bouncing + * cacheline if we try to acquire the lock. So go onto the next slab. + * If all pages are busy then we may allocate a new slab instead of reusing + * a partial slab. A new slab has no one operating on it and thus there is + * no danger of cacheline contention. + * + * Interrupts are disabled during allocation and deallocation in order to + * make the slab allocator safe to use in the context of an irq.
In addition + * interrupts are disabled to ensure that the processor does not change + * while handling per_cpu slabs, due to kernel preemption. + * + * SLUB assigns one slab for allocation to each processor. + * Allocations only occur from these slabs called cpu slabs. + * + * Slabs with free elements are kept on a partial list and during regular + * operations no list for full slabs is used. If an object in a full slab is + * freed then the slab will show up again on the partial lists. + * We track full slabs for debugging purposes though because otherwise we + * cannot scan all objects. + * + * Slabs are freed when they become empty. Teardown and setup is + * minimal so we rely on the page allocators per cpu caches for + * fast frees and allocs. + * + * Overloading of page flags that are otherwise used for LRU management. + * + * PageActive The slab is used as a cpu cache. Allocations + * may be performed from the slab. The slab is not + * on any slab list and cannot be moved onto one. + * The cpu slab may be equipped with an additional + * lockless_freelist that allows lockless access to + * free objects in addition to the regular freelist + * that requires the slab lock. + * + * PageError Slab requires special handling due to debug + * options set. This moves slab handling out of + * the fast path and disables lockless freelists. + */ + +static inline int SlabDebug(struct page *page) +{ +#ifdef CONFIG_SLUB_DEBUG + return PageError(page); +#else + return 0; +#endif +} + +static inline void SetSlabDebug(struct page *page) +{ +#ifdef CONFIG_SLUB_DEBUG + SetPageError(page); +#endif +} + +static inline void ClearSlabDebug(struct page *page) +{ +#ifdef CONFIG_SLUB_DEBUG + ClearPageError(page); +#endif +} + +/* + * Issues still to be resolved: + * + * - The per cpu array is updated for each new slab and is a remote + * cacheline for most nodes. This could become a bouncing cacheline given + * enough frequent updates.
There are 16 pointers in a cacheline, so at + * max 16 cpus could compete for the cacheline which may be okay. + * + * - Support PAGE_ALLOC_DEBUG. Should be easy to do. + * + * - Variable sizing of the per node arrays + */ + +/* Enable to test recovery from slab corruption on boot */ +#undef SLUB_RESILIENCY_TEST + +#if PAGE_SHIFT <= 12 + +/* + * Small page size. Make sure that we do not fragment memory + */ +#define DEFAULT_MAX_ORDER 1 +#define DEFAULT_MIN_OBJECTS 4 + +#else + +/* + * Large page machines are customarily able to handle larger + * page orders. + */ +#define DEFAULT_MAX_ORDER 2 +#define DEFAULT_MIN_OBJECTS 8 + +#endif + +/* + * Minimum number of partial slabs. These will be left on the partial + * lists even if they are empty. kmem_cache_shrink may reclaim them. + */ +#define MIN_PARTIAL 2 + +/* + * Maximum number of desirable partial slabs. + * The existence of more partial slabs makes kmem_cache_shrink + * sort the partial list by the number of objects in use. + */ +#define MAX_PARTIAL 10 + +#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \ + SLAB_POISON | SLAB_STORE_USER) + +/* + * Set of flags that will prevent slab merging + */ +#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ + SLAB_TRACE | SLAB_DESTROY_BY_RCU) + +#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ + SLAB_CACHE_DMA) + +#ifndef ARCH_KMALLOC_MINALIGN +#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) +#endif + +#ifndef ARCH_SLAB_MINALIGN +#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) +#endif + +/* Internal SLUB flags */ +#define __OBJECT_POISON 0x80000000 /* Poison object */ + +/* Not all arches define cache_line_size */ +#ifndef cache_line_size +#define cache_line_size() L1_CACHE_BYTES +#endif + +static int kmem_size = sizeof(struct kmem_cache); + +#ifdef CONFIG_SMP +static struct notifier_block slab_notifier; +#endif + +static enum { + DOWN, /* No slab functionality available */ + PARTIAL, /*
kmem_cache_open() works but kmalloc does not */ + UP, /* Everything works but does not show up in sysfs */ + SYSFS /* Sysfs up */ +} slab_state = DOWN; + +/* A list of all slab caches on the system */ +static DECLARE_RWSEM(slub_lock); +LIST_HEAD(slab_caches); + +/* + * Tracking user of a slab. + */ +struct track { + void *addr; /* Called from address */ + int cpu; /* Was running on cpu */ + int pid; /* Pid context */ + unsigned long when; /* When did the operation occur */ +}; + +enum track_item { TRACK_ALLOC, TRACK_FREE }; + +#if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG) +static int sysfs_slab_add(struct kmem_cache *); +static int sysfs_slab_alias(struct kmem_cache *, const char *); +static void sysfs_slab_remove(struct kmem_cache *); +#else +static int sysfs_slab_add(struct kmem_cache *s) { return 0; } +static int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; } +static void sysfs_slab_remove(struct kmem_cache *s) {} +#endif + +/******************************************************************** + * Core slab cache functions + *******************************************************************/ + +int slab_is_available(void) +{ + return slab_state >= UP; +} + +static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) +{ +#ifdef CONFIG_NUMA + return s->node[node]; +#else + return &s->local_node; +#endif +} + +static inline int check_valid_pointer(struct kmem_cache *s, + struct page *page, const void *object) +{ + void *base; + + if (!object) + return 1; + + base = page_address(page); + if (object < base || object >= base + s->objects * s->size || + (object - base) % s->size) { + return 0; + } + + return 1; +} + +/* + * Slow version of get and set free pointer. + * + * This version requires touching the cache lines of kmem_cache which + * we avoid to do in the fast alloc free paths. There we obtain the offset + * from the page struct. 
+ */ +static inline void *get_freepointer(struct kmem_cache *s, void *object) +{ + return *(void **)(object + s->offset); +} + +static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) +{ + *(void **)(object + s->offset) = fp; +} + +/* Loop over all objects in a slab */ +#define for_each_object(__p, __s, __addr) \ + for (__p = (__addr); __p < (__addr) + (__s)->objects * (__s)->size;\ + __p += (__s)->size) + +/* Scan freelist */ +#define for_each_free_object(__p, __s, __free) \ + for (__p = (__free); __p; __p = get_freepointer((__s), __p)) + +/* Determine object index from a given position */ +static inline int slab_index(void *p, struct kmem_cache *s, void *addr) +{ + return (p - addr) / s->size; +} + +#ifdef CONFIG_SLUB_DEBUG +/* + * Debug settings: + */ +static int slub_debug; + +static char *slub_debug_slabs; + +/* + * Object debugging + */ +static void print_section(char *text, u8 *addr, unsigned int length) +{ + int i, offset; + int newline = 1; + char ascii[17]; + + ascii[16] = 0; + + for (i = 0; i < length; i++) { + if (newline) { + printk(KERN_ERR "%10s 0x%p: ", text, addr + i); + newline = 0; + } + printk(" %02x", addr[i]); + offset = i % 16; + ascii[offset] = isgraph(addr[i]) ? addr[i] : '.'; + if (offset == 15) { + printk(" %s\n",ascii); + newline = 1; + } + } + if (!newline) { + i %= 16; + while (i < 16) { + printk(" "); + ascii[i] = ' '; + i++; + } + printk(" %s\n", ascii); + } +} + +static struct track *get_track(struct kmem_cache *s, void *object, + enum track_item alloc) +{ + struct track *p; + + if (s->offset) + p = object + s->offset + sizeof(void *); + else + p = object + s->inuse; + + return p + alloc; +} + +static void set_track(struct kmem_cache *s, void *object, + enum track_item alloc, void *addr) +{ + struct track *p; + + if (s->offset) + p = object + s->offset + sizeof(void *); + else + p = object + s->inuse; + + p += alloc; + if (addr) { + p->addr = addr; + p->cpu = smp_processor_id(); + p->pid = current ? 
current->pid : -1; + p->when = jiffies; + } else + memset(p, 0, sizeof(struct track)); +} + +static void init_tracking(struct kmem_cache *s, void *object) +{ + if (s->flags & SLAB_STORE_USER) { + set_track(s, object, TRACK_FREE, NULL); + set_track(s, object, TRACK_ALLOC, NULL); + } +} + +static void print_track(const char *s, struct track *t) +{ + if (!t->addr) + return; + + printk(KERN_ERR "%s: ", s); + __print_symbol("%s", (unsigned long)t->addr); + printk(" jiffies_ago=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid); +} + +static void print_trailer(struct kmem_cache *s, u8 *p) +{ + unsigned int off; /* Offset of last byte */ + + if (s->flags & SLAB_RED_ZONE) + print_section("Redzone", p + s->objsize, + s->inuse - s->objsize); + + printk(KERN_ERR "FreePointer 0x%p -> 0x%p\n", + p + s->offset, + get_freepointer(s, p)); + + if (s->offset) + off = s->offset + sizeof(void *); + else + off = s->inuse; + + if (s->flags & SLAB_STORE_USER) { + print_track("Last alloc", get_track(s, p, TRACK_ALLOC)); + print_track("Last free ", get_track(s, p, TRACK_FREE)); + off += 2 * sizeof(struct track); + } + + if (off != s->size) + /* Beginning of the filler is the free pointer */ + print_section("Filler", p + off, s->size - off); +} + +static void object_err(struct kmem_cache *s, struct page *page, + u8 *object, char *reason) +{ + u8 *addr = page_address(page); + + printk(KERN_ERR "*** SLUB %s: %s@0x%p slab 0x%p\n", + s->name, reason, object, page); + printk(KERN_ERR " offset=%tu flags=0x%04lx inuse=%u freelist=0x%p\n", + object - addr, page->flags, page->inuse, page->freelist); + if (object > addr + 16) + print_section("Bytes b4", object - 16, 16); + print_section("Object", object, min(s->objsize, 128)); + print_trailer(s, object); + dump_stack(); +} + +static void slab_err(struct kmem_cache *s, struct page *page, char *reason, ...) 
+{ + va_list args; + char buf[100]; + + va_start(args, reason); + vsnprintf(buf, sizeof(buf), reason, args); + va_end(args); + printk(KERN_ERR "*** SLUB %s: %s in slab @0x%p\n", s->name, buf, + page); + dump_stack(); +} + +static void init_object(struct kmem_cache *s, void *object, int active) +{ + u8 *p = object; + + if (s->flags & __OBJECT_POISON) { + memset(p, POISON_FREE, s->objsize - 1); + p[s->objsize -1] = POISON_END; + } + + if (s->flags & SLAB_RED_ZONE) + memset(p + s->objsize, + active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE, + s->inuse - s->objsize); +} + +static int check_bytes(u8 *start, unsigned int value, unsigned int bytes) +{ + while (bytes) { + if (*start != (u8)value) + return 0; + start++; + bytes--; + } + return 1; +} + +/* + * Object layout: + * + * object address + * Bytes of the object to be managed. + * If the freepointer may overlay the object then the free + * pointer is the first word of the object. + * + * Poisoning uses 0x6b (POISON_FREE) and the last byte is + * 0xa5 (POISON_END) + * + * object + s->objsize + * Padding to reach word boundary. This is also used for Redzoning. + * Padding is extended by another word if Redzoning is enabled and + * objsize == inuse. + * + * We fill with 0xbb (RED_INACTIVE) for inactive objects and with + * 0xcc (RED_ACTIVE) for objects in use. + * + * object + s->inuse + * Meta data starts here. + * + * A. Free pointer (if we cannot overwrite object on free) + * B. Tracking data for SLAB_STORE_USER + * C. Padding to reach required alignment boundary or at mininum + * one word if debuggin is on to be able to detect writes + * before the word boundary. + * + * Padding is done using 0x5a (POISON_INUSE) + * + * object + s->size + * Nothing is used beyond s->size. + * + * If slabcaches are merged then the objsize and inuse boundaries are mostly + * ignored. And therefore no slab options that rely on these boundaries + * may be used with merged slabcaches. 
+ */ + +static void restore_bytes(struct kmem_cache *s, char *message, u8 data, + void *from, void *to) +{ + printk(KERN_ERR "@@@ SLUB %s: Restoring %s (0x%x) from 0x%p-0x%p\n", + s->name, message, data, from, to - 1); + memset(from, data, to - from); +} + +static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) +{ + unsigned long off = s->inuse; /* The end of info */ + + if (s->offset) + /* Freepointer is placed after the object. */ + off += sizeof(void *); + + if (s->flags & SLAB_STORE_USER) + /* We also have user information there */ + off += 2 * sizeof(struct track); + + if (s->size == off) + return 1; + + if (check_bytes(p + off, POISON_INUSE, s->size - off)) + return 1; + + object_err(s, page, p, "Object padding check fails"); + + /* + * Restore padding + */ + restore_bytes(s, "object padding", POISON_INUSE, p + off, p + s->size); + return 0; +} + +static int slab_pad_check(struct kmem_cache *s, struct page *page) +{ + u8 *p; + int length, remainder; + + if (!(s->flags & SLAB_POISON)) + return 1; + + p = page_address(page); + length = s->objects * s->size; + remainder = (PAGE_SIZE << s->order) - length; + if (!remainder) + return 1; + + if (!check_bytes(p + length, POISON_INUSE, remainder)) { + slab_err(s, page, "Padding check failed"); + restore_bytes(s, "slab padding", POISON_INUSE, p + length, + p + length + remainder); + return 0; + } + return 1; +} + +static int check_object(struct kmem_cache *s, struct page *page, + void *object, int active) +{ + u8 *p = object; + u8 *endobject = object + s->objsize; + + if (s->flags & SLAB_RED_ZONE) { + unsigned int red = + active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE; + + if (!check_bytes(endobject, red, s->inuse - s->objsize)) { + object_err(s, page, object, + active ? 
"Redzone Active" : "Redzone Inactive"); + restore_bytes(s, "redzone", red, + endobject, object + s->inuse); + return 0; + } + } else { + if ((s->flags & SLAB_POISON) && s->objsize < s->inuse && + !check_bytes(endobject, POISON_INUSE, + s->inuse - s->objsize)) { + object_err(s, page, p, "Alignment padding check fails"); + /* + * Fix it so that there will not be another report. + * + * Hmmm... We may be corrupting an object that now expects + * to be longer than allowed. + */ + restore_bytes(s, "alignment padding", POISON_INUSE, + endobject, object + s->inuse); + } + } + + if (s->flags & SLAB_POISON) { + if (!active && (s->flags & __OBJECT_POISON) && + (!check_bytes(p, POISON_FREE, s->objsize - 1) || + p[s->objsize - 1] != POISON_END)) { + + object_err(s, page, p, "Poison check failed"); + restore_bytes(s, "Poison", POISON_FREE, + p, p + s->objsize -1); + restore_bytes(s, "Poison", POISON_END, + p + s->objsize - 1, p + s->objsize); + return 0; + } + /* + * check_pad_bytes cleans up on its own. + */ + check_pad_bytes(s, page, p); + } + + if (!s->offset && active) + /* + * Object and freepointer overlap. Cannot check + * freepointer while object is allocated. + */ + return 1; + + /* Check free pointer validity */ + if (!check_valid_pointer(s, page, get_freepointer(s, p))) { + object_err(s, page, p, "Freepointer corrupt"); + /* + * No choice but to zap it and thus loose the remainder + * of the free objects in this slab. May cause + * another error because the object count is now wrong. 
+ */ + set_freepointer(s, p, NULL); + return 0; + } + return 1; +} + +static int check_slab(struct kmem_cache *s, struct page *page) +{ + VM_BUG_ON(!irqs_disabled()); + + if (!PageSlab(page)) { + slab_err(s, page, "Not a valid slab page flags=%lx " + "mapping=0x%p count=%d", page->flags, page->mapping, + page_count(page)); + return 0; + } + if (page->offset * sizeof(void *) != s->offset) { + slab_err(s, page, "Corrupted offset %lu flags=0x%lx " + "mapping=0x%p count=%d", + (unsigned long)(page->offset * sizeof(void *)), + page->flags, + page->mapping, + page_count(page)); + return 0; + } + if (page->inuse > s->objects) { + slab_err(s, page, "inuse %u > max %u @0x%p flags=%lx " + "mapping=0x%p count=%d", + s->name, page->inuse, s->objects, page->flags, + page->mapping, page_count(page)); + return 0; + } + /* Slab_pad_check fixes things up after itself */ + slab_pad_check(s, page); + return 1; +} + +/* + * Determine if a certain object on a page is on the freelist. Must hold the + * slab lock to guarantee that the chains are in a consistent state. + */ +static int on_freelist(struct kmem_cache *s, struct page *page, void *search) +{ + int nr = 0; + void *fp = page->freelist; + void *object = NULL; + + while (fp && nr <= s->objects) { + if (fp == search) + return 1; + if (!check_valid_pointer(s, page, fp)) { + if (object) { + object_err(s, page, object, + "Freechain corrupt"); + set_freepointer(s, object, NULL); + break; + } else { + slab_err(s, page, "Freepointer 0x%p corrupt", + fp); + page->freelist = NULL; + page->inuse = s->objects; + printk(KERN_ERR "@@@ SLUB %s: Freelist " + "cleared. Slab 0x%p\n", + s->name, page); + return 0; + } + break; + } + object = fp; + fp = get_freepointer(s, object); + nr++; + } + + if (page->inuse != s->objects - nr) { + slab_err(s, page, "Wrong object count. Counter is %d but " + "counted were %d", s, page, page->inuse, + s->objects - nr); + page->inuse = s->objects - nr; + printk(KERN_ERR "@@@ SLUB %s: Object count adjusted. 
" + "Slab @0x%p\n", s->name, page); + } + return search == NULL; +} + +/* + * Tracking of fully allocated slabs for debugging purposes. + */ +static void add_full(struct kmem_cache_node *n, struct page *page) +{ + spin_lock(&n->list_lock); + list_add(&page->lru, &n->full); + spin_unlock(&n->list_lock); +} + +static void remove_full(struct kmem_cache *s, struct page *page) +{ + struct kmem_cache_node *n; + + if (!(s->flags & SLAB_STORE_USER)) + return; + + n = get_node(s, page_to_nid(page)); + + spin_lock(&n->list_lock); + list_del(&page->lru); + spin_unlock(&n->list_lock); +} + +static int alloc_object_checks(struct kmem_cache *s, struct page *page, + void *object) +{ + if (!check_slab(s, page)) + goto bad; + + if (object && !on_freelist(s, page, object)) { + slab_err(s, page, "Object 0x%p already allocated", object); + goto bad; + } + + if (!check_valid_pointer(s, page, object)) { + object_err(s, page, object, "Freelist Pointer check fails"); + goto bad; + } + + if (!object) + return 1; + + if (!check_object(s, page, object, 0)) + goto bad; + + return 1; +bad: + if (PageSlab(page)) { + /* + * If this is a slab page then lets do the best we can + * to avoid issues in the future. Marking all objects + * as used avoids touching the remaining objects. + */ + printk(KERN_ERR "@@@ SLUB: %s slab 0x%p. 
Marking all objects used.\n", + s->name, page); + page->inuse = s->objects; + page->freelist = NULL; + /* Fix up fields that may be corrupted */ + page->offset = s->offset / sizeof(void *); + } + return 0; +} + +static int free_object_checks(struct kmem_cache *s, struct page *page, + void *object) +{ + if (!check_slab(s, page)) + goto fail; + + if (!check_valid_pointer(s, page, object)) { + slab_err(s, page, "Invalid object pointer 0x%p", object); + goto fail; + } + + if (on_freelist(s, page, object)) { + slab_err(s, page, "Object 0x%p already free", object); + goto fail; + } + + if (!check_object(s, page, object, 1)) + return 0; + + if (unlikely(s != page->slab)) { + if (!PageSlab(page)) + slab_err(s, page, "Attempt to free object(0x%p) " + "outside of slab", object); + else + if (!page->slab) { + printk(KERN_ERR + "SLUB <none>: no slab for object 0x%p.\n", + object); + dump_stack(); + } + else + slab_err(s, page, "object at 0x%p belongs " + "to slab %s", object, page->slab->name); + goto fail; + } + return 1; +fail: + printk(KERN_ERR "@@@ SLUB: %s slab 0x%p object at 0x%p not freed.\n", + s->name, page, object); + return 0; +} + +static void trace(struct kmem_cache *s, struct page *page, void *object, int alloc) +{ + if (s->flags & SLAB_TRACE) { + printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n", + s->name, + alloc ? 
"alloc" : "free", + object, page->inuse, + page->freelist); + + if (!alloc) + print_section("Object", (void *)object, s->objsize); + + dump_stack(); + } +} + +static int __init setup_slub_debug(char *str) +{ + if (!str || *str != '=') + slub_debug = DEBUG_DEFAULT_FLAGS; + else { + str++; + if (*str == 0 || *str == ',') + slub_debug = DEBUG_DEFAULT_FLAGS; + else + for( ;*str && *str != ','; str++) + switch (*str) { + case 'f' : case 'F' : + slub_debug |= SLAB_DEBUG_FREE; + break; + case 'z' : case 'Z' : + slub_debug |= SLAB_RED_ZONE; + break; + case 'p' : case 'P' : + slub_debug |= SLAB_POISON; + break; + case 'u' : case 'U' : + slub_debug |= SLAB_STORE_USER; + break; + case 't' : case 'T' : + slub_debug |= SLAB_TRACE; + break; + default: + printk(KERN_ERR "slub_debug option '%c' " + "unknown. skipped\n",*str); + } + } + + if (*str == ',') + slub_debug_slabs = str + 1; + return 1; +} + +__setup("slub_debug", setup_slub_debug); + +static void kmem_cache_open_debug_check(struct kmem_cache *s) +{ + /* + * The page->offset field is only 16 bit wide. This is an offset + * in units of words from the beginning of an object. If the slab + * size is bigger then we cannot move the free pointer behind the + * object anymore. + * + * On 32 bit platforms the limit is 256k. On 64bit platforms + * the limit is 512k. + * + * Debugging or ctor/dtors may create a need to move the free + * pointer. Fail if this happens. + */ + if (s->size >= 65535 * sizeof(void *)) { + BUG_ON(s->flags & (SLAB_RED_ZONE | SLAB_POISON | + SLAB_STORE_USER | SLAB_DESTROY_BY_RCU)); + BUG_ON(s->ctor || s->dtor); + } + else + /* + * Enable debugging if selected on the kernel commandline. 
+ */ + if (slub_debug && (!slub_debug_slabs || + strncmp(slub_debug_slabs, s->name, + strlen(slub_debug_slabs)) == 0)) + s->flags |= slub_debug; +} +#else + +static inline int alloc_object_checks(struct kmem_cache *s, + struct page *page, void *object) { return 0; } + +static inline int free_object_checks(struct kmem_cache *s, + struct page *page, void *object) { return 0; } + +static inline void add_full(struct kmem_cache_node *n, struct page *page) {} +static inline void remove_full(struct kmem_cache *s, struct page *page) {} +static inline void trace(struct kmem_cache *s, struct page *page, + void *object, int alloc) {} +static inline void init_object(struct kmem_cache *s, + void *object, int active) {} +static inline void init_tracking(struct kmem_cache *s, void *object) {} +static inline int slab_pad_check(struct kmem_cache *s, struct page *page) + { return 1; } +static inline int check_object(struct kmem_cache *s, struct page *page, + void *object, int active) { return 1; } +static inline void set_track(struct kmem_cache *s, void *object, + enum track_item alloc, void *addr) {} +static inline void kmem_cache_open_debug_check(struct kmem_cache *s) {} +#define slub_debug 0 +#endif +/* + * Slab allocation and freeing + */ +static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) +{ + struct page * page; + int pages = 1 << s->order; + + if (s->order) + flags |= __GFP_COMP; + + if (s->flags & SLAB_CACHE_DMA) + flags |= SLUB_DMA; + + if (node == -1) + page = alloc_pages(flags, s->order); + else + page = alloc_pages_node(node, flags, s->order); + + if (!page) + return NULL; + + mod_zone_page_state(page_zone(page), + (s->flags & SLAB_RECLAIM_ACCOUNT) ? 
+ NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, + pages); + + return page; +} + +static void setup_object(struct kmem_cache *s, struct page *page, + void *object) +{ + if (SlabDebug(page)) { + init_object(s, object, 0); + init_tracking(s, object); + } + + if (unlikely(s->ctor)) + s->ctor(object, s, SLAB_CTOR_CONSTRUCTOR); +} + +static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) +{ + struct page *page; + struct kmem_cache_node *n; + void *start; + void *end; + void *last; + void *p; + + BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK)); + + if (flags & __GFP_WAIT) + local_irq_enable(); + + page = allocate_slab(s, flags & GFP_LEVEL_MASK, node); + if (!page) + goto out; + + n = get_node(s, page_to_nid(page)); + if (n) + atomic_long_inc(&n->nr_slabs); + page->offset = s->offset / sizeof(void *); + page->slab = s; + page->flags |= 1 << PG_slab; + if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | + SLAB_STORE_USER | SLAB_TRACE)) + SetSlabDebug(page); + + start = page_address(page); + end = start + s->objects * s->size; + + if (unlikely(s->flags & SLAB_POISON)) + memset(start, POISON_INUSE, PAGE_SIZE << s->order); + + last = start; + for_each_object(p, s, start) { + setup_object(s, page, last); + set_freepointer(s, last, p); + last = p; + } + setup_object(s, page, last); + set_freepointer(s, last, NULL); + + page->freelist = start; + page->lockless_freelist = NULL; + page->inuse = 0; +out: + if (flags & __GFP_WAIT) + local_irq_disable(); + return page; +} + +static void __free_slab(struct kmem_cache *s, struct page *page) +{ + int pages = 1 << s->order; + + if (unlikely(SlabDebug(page) || s->dtor)) { + void *p; + + slab_pad_check(s, page); + for_each_object(p, s, page_address(page)) { + if (s->dtor) + s->dtor(p, s, 0); + check_object(s, page, p, 0); + } + } + + mod_zone_page_state(page_zone(page), + (s->flags & SLAB_RECLAIM_ACCOUNT) ? 
+ NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, + - pages); + + page->mapping = NULL; + __free_pages(page, s->order); +} + +static void rcu_free_slab(struct rcu_head *h) +{ + struct page *page; + + page = container_of((struct list_head *)h, struct page, lru); + __free_slab(page->slab, page); +} + +static void free_slab(struct kmem_cache *s, struct page *page) +{ + if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { + /* + * RCU free overloads the RCU head over the LRU + */ + struct rcu_head *head = (void *)&page->lru; + + call_rcu(head, rcu_free_slab); + } else + __free_slab(s, page); +} + +static void discard_slab(struct kmem_cache *s, struct page *page) +{ + struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + + atomic_long_dec(&n->nr_slabs); + reset_page_mapcount(page); + ClearSlabDebug(page); + __ClearPageSlab(page); + free_slab(s, page); +} + +/* + * Per slab locking using the pagelock + */ +static __always_inline void slab_lock(struct page *page) +{ + bit_spin_lock(PG_locked, &page->flags); +} + +static __always_inline void slab_unlock(struct page *page) +{ + bit_spin_unlock(PG_locked, &page->flags); +} + +static __always_inline int slab_trylock(struct page *page) +{ + int rc = 1; + + rc = bit_spin_trylock(PG_locked, &page->flags); + return rc; +} + +/* + * Management of partially allocated slabs + */ +static void add_partial_tail(struct kmem_cache_node *n, struct page *page) +{ + spin_lock(&n->list_lock); + n->nr_partial++; + list_add_tail(&page->lru, &n->partial); + spin_unlock(&n->list_lock); +} + +static void add_partial(struct kmem_cache_node *n, struct page *page) +{ + spin_lock(&n->list_lock); + n->nr_partial++; + list_add(&page->lru, &n->partial); + spin_unlock(&n->list_lock); +} + +static void remove_partial(struct kmem_cache *s, + struct page *page) +{ + struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + + spin_lock(&n->list_lock); + list_del(&page->lru); + n->nr_partial--; + spin_unlock(&n->list_lock); +} + +/* + * Lock slab and 
remove from the partial list. + * + * Must hold list_lock. + */ +static int lock_and_del_slab(struct kmem_cache_node *n, struct page *page) +{ + if (slab_trylock(page)) { + list_del(&page->lru); + n->nr_partial--; + return 1; + } + return 0; +} + +/* + * Try to allocate a partial slab from a specific node. + */ +static struct page *get_partial_node(struct kmem_cache_node *n) +{ + struct page *page; + + /* + * Racy check. If we mistakenly see no partial slabs then we + * just allocate an empty slab. If we mistakenly try to get a + * partial slab and there is none available then get_partials() + * will return NULL. + */ + if (!n || !n->nr_partial) + return NULL; + + spin_lock(&n->list_lock); + list_for_each_entry(page, &n->partial, lru) + if (lock_and_del_slab(n, page)) + goto out; + page = NULL; +out: + spin_unlock(&n->list_lock); + return page; +} + +/* + * Get a page from somewhere. Search in increasing NUMA distances. + */ +static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) +{ +#ifdef CONFIG_NUMA + struct zonelist *zonelist; + struct zone **z; + struct page *page; + + /* + * The defrag ratio allows a configuration of the tradeoffs between + * inter node defragmentation and node local allocations. A lower + * defrag_ratio increases the tendency to do local allocations + * instead of attempting to obtain partial slabs from other nodes. + * + * If the defrag_ratio is set to 0 then kmalloc() always + * returns node local objects. If the ratio is higher then kmalloc() + * may return off node objects because partial slabs are obtained + * from other nodes and filled up. + * + * If /sys/slab/xx/defrag_ratio is set to 100 (which makes + * defrag_ratio = 1000) then every (well almost) allocation will + * first attempt to defrag slab caches on other nodes. This means + * scanning over all nodes to look for partial slabs which may be + * expensive if we do it every time we are trying to find a slab + * with available objects. 
+ */ + if (!s->defrag_ratio || get_cycles() % 1024 > s->defrag_ratio) + return NULL; + + zonelist = &NODE_DATA(slab_node(current->mempolicy)) + ->node_zonelists[gfp_zone(flags)]; + for (z = zonelist->zones; *z; z++) { + struct kmem_cache_node *n; + + n = get_node(s, zone_to_nid(*z)); + + if (n && cpuset_zone_allowed_hardwall(*z, flags) && + n->nr_partial > MIN_PARTIAL) { + page = get_partial_node(n); + if (page) + return page; + } + } +#endif + return NULL; +} + +/* + * Get a partial page, lock it and return it. + */ +static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) +{ + struct page *page; + int searchnode = (node == -1) ? numa_node_id() : node; + + page = get_partial_node(get_node(s, searchnode)); + if (page || (flags & __GFP_THISNODE)) + return page; + + return get_any_partial(s, flags); +} + +/* + * Move a page back to the lists. + * + * Must be called with the slab lock held. + * + * On exit the slab lock will have been dropped. + */ +static void putback_slab(struct kmem_cache *s, struct page *page) +{ + struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + + if (page->inuse) { + + if (page->freelist) + add_partial(n, page); + else if (SlabDebug(page) && (s->flags & SLAB_STORE_USER)) + add_full(n, page); + slab_unlock(page); + + } else { + if (n->nr_partial < MIN_PARTIAL) { + /* + * Adding an empty slab to the partial slabs in order + * to avoid page allocator overhead. This slab needs + * to come after the other slabs with objects in + * order to fill them up. That way the size of the + * partial list stays small. kmem_cache_shrink can + * reclaim empty slabs from the partial list. + */ + add_partial_tail(n, page); + slab_unlock(page); + } else { + slab_unlock(page); + discard_slab(s, page); + } + } +} + +/* + * Remove the cpu slab + */ +static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu) +{ + /* + * Merge cpu freelist into freelist. Typically we get here + * because both freelists are empty. 
So this is unlikely + * to occur. + */ + while (unlikely(page->lockless_freelist)) { + void **object; + + /* Retrieve object from cpu_freelist */ + object = page->lockless_freelist; + page->lockless_freelist = page->lockless_freelist[page->offset]; + + /* And put onto the regular freelist */ + object[page->offset] = page->freelist; + page->freelist = object; + page->inuse--; + } + s->cpu_slab[cpu] = NULL; + ClearPageActive(page); + + putback_slab(s, page); +} + +static void flush_slab(struct kmem_cache *s, struct page *page, int cpu) +{ + slab_lock(page); + deactivate_slab(s, page, cpu); +} + +/* + * Flush cpu slab. + * Called from IPI handler with interrupts disabled. + */ +static void __flush_cpu_slab(struct kmem_cache *s, int cpu) +{ + struct page *page = s->cpu_slab[cpu]; + + if (likely(page)) + flush_slab(s, page, cpu); +} + +static void flush_cpu_slab(void *d) +{ + struct kmem_cache *s = d; + int cpu = smp_processor_id(); + + __flush_cpu_slab(s, cpu); +} + +static void flush_all(struct kmem_cache *s) +{ +#ifdef CONFIG_SMP + on_each_cpu(flush_cpu_slab, s, 1, 1); +#else + unsigned long flags; + + local_irq_save(flags); + flush_cpu_slab(s); + local_irq_restore(flags); +#endif +} + +/* + * Slow path. The lockless freelist is empty or we need to perform + * debugging duties. + * + * Interrupts are disabled. + * + * Processing is still very fast if new objects have been freed to the + * regular freelist. In that case we simply take over the regular freelist + * as the lockless freelist and zap the regular freelist. + * + * If that is not working then we fall back to the partial lists. We take the + * first element of the freelist as the object to allocate now and move the + * rest of the freelist to the lockless freelist. + * + * And if we were unable to get a new slab from the partial slab lists then + * we need to allocate a new slab. This is slowest path since we may sleep. 
+ */ +static void *__slab_alloc(struct kmem_cache *s, + gfp_t gfpflags, int node, void *addr, struct page *page) +{ + void **object; + int cpu = smp_processor_id(); + + if (!page) + goto new_slab; + + slab_lock(page); + if (unlikely(node != -1 && page_to_nid(page) != node)) + goto another_slab; +load_freelist: + object = page->freelist; + if (unlikely(!object)) + goto another_slab; + if (unlikely(SlabDebug(page))) + goto debug; + + object = page->freelist; + page->lockless_freelist = object[page->offset]; + page->inuse = s->objects; + page->freelist = NULL; + slab_unlock(page); + return object; + +another_slab: + deactivate_slab(s, page, cpu); + +new_slab: + page = get_partial(s, gfpflags, node); + if (page) { +have_slab: + s->cpu_slab[cpu] = page; + SetPageActive(page); + goto load_freelist; + } + + page = new_slab(s, gfpflags, node); + if (page) { + cpu = smp_processor_id(); + if (s->cpu_slab[cpu]) { + /* + * Someone else populated the cpu_slab while we + * enabled interrupts, or we have gotten scheduled + * on another cpu. The page may not be on the + * requested node even if __GFP_THISNODE was + * specified. So we need to recheck. 
+ */ + if (node == -1 || + page_to_nid(s->cpu_slab[cpu]) == node) { + /* + * Current cpuslab is acceptable and we + * want the current one since its cache hot + */ + discard_slab(s, page); + page = s->cpu_slab[cpu]; + slab_lock(page); + goto load_freelist; + } + /* New slab does not fit our expectations */ + flush_slab(s, s->cpu_slab[cpu], cpu); + } + slab_lock(page); + goto have_slab; + } + return NULL; +debug: + object = page->freelist; + if (!alloc_object_checks(s, page, object)) + goto another_slab; + if (s->flags & SLAB_STORE_USER) + set_track(s, object, TRACK_ALLOC, addr); + trace(s, page, object, 1); + init_object(s, object, 1); + + page->inuse++; + page->freelist = object[page->offset]; + slab_unlock(page); + return object; +} + +/* + * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) + * have the fastpath folded into their functions. So no function call + * overhead for requests that can be satisfied on the fastpath. + * + * The fastpath works by first checking if the lockless freelist can be used. + * If not then __slab_alloc is called for slow processing. + * + * Otherwise we can simply pick the next object from the lockless free list. 
+ */ +static void __always_inline *slab_alloc(struct kmem_cache *s, + gfp_t gfpflags, int node, void *addr) +{ + struct page *page; + void **object; + unsigned long flags; + + local_irq_save(flags); + page = s->cpu_slab[smp_processor_id()]; + if (unlikely(!page || !page->lockless_freelist || + (node != -1 && page_to_nid(page) != node))) + + object = __slab_alloc(s, gfpflags, node, addr, page); + + else { + object = page->lockless_freelist; + page->lockless_freelist = object[page->offset]; + } + local_irq_restore(flags); + return object; +} + +void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) +{ + return slab_alloc(s, gfpflags, -1, __builtin_return_address(0)); +} +EXPORT_SYMBOL(kmem_cache_alloc); + +#ifdef CONFIG_NUMA +void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) +{ + return slab_alloc(s, gfpflags, node, __builtin_return_address(0)); +} +EXPORT_SYMBOL(kmem_cache_alloc_node); +#endif + +/* + * Slow patch handling. This may still be called frequently since objects + * have a longer lifetime than the cpu slabs in most processing loads. + * + * So we still attempt to reduce cache line usage. Just take the slab + * lock and free the item. If there is no additional partial page + * handling required then we can return immediately. + */ +static void __slab_free(struct kmem_cache *s, struct page *page, + void *x, void *addr) +{ + void *prior; + void **object = (void *)x; + + slab_lock(page); + + if (unlikely(SlabDebug(page))) + goto debug; +checks_ok: + prior = object[page->offset] = page->freelist; + page->freelist = object; + page->inuse--; + + if (unlikely(PageActive(page))) + /* + * Cpu slabs are never on partial lists and are + * never freed. + */ + goto out_unlock; + + if (unlikely(!page->inuse)) + goto slab_empty; + + /* + * Objects left in the slab. If it + * was not on the partial list before + * then add it. 
+ */ + if (unlikely(!prior)) + add_partial(get_node(s, page_to_nid(page)), page); + +out_unlock: + slab_unlock(page); + return; + +slab_empty: + if (prior) + /* + * Slab still on the partial list. + */ + remove_partial(s, page); + + slab_unlock(page); + discard_slab(s, page); + return; + +debug: + if (!free_object_checks(s, page, x)) + goto out_unlock; + if (!PageActive(page) && !page->freelist) + remove_full(s, page); + if (s->flags & SLAB_STORE_USER) + set_track(s, x, TRACK_FREE, addr); + trace(s, page, object, 0); + init_object(s, object, 0); + goto checks_ok; +} + +/* + * Fastpath with forced inlining to produce a kfree and kmem_cache_free that + * can perform fastpath freeing without additional function calls. + * + * The fastpath is only possible if we are freeing to the current cpu slab + * of this processor. This typically the case if we have just allocated + * the item before. + * + * If fastpath is not possible then fall back to __slab_free where we deal + * with all sorts of special processing. + */ +static void __always_inline slab_free(struct kmem_cache *s, + struct page *page, void *x, void *addr) +{ + void **object = (void *)x; + unsigned long flags; + + local_irq_save(flags); + if (likely(page == s->cpu_slab[smp_processor_id()] && + !SlabDebug(page))) { + object[page->offset] = page->lockless_freelist; + page->lockless_freelist = object; + } else + __slab_free(s, page, x, addr); + + local_irq_restore(flags); +} + +void kmem_cache_free(struct kmem_cache *s, void *x) +{ + struct page *page; + + page = virt_to_head_page(x); + + slab_free(s, page, x, __builtin_return_address(0)); +} +EXPORT_SYMBOL(kmem_cache_free); + +/* Figure out on which slab object the object resides */ +static struct page *get_object_page(const void *x) +{ + struct page *page = virt_to_head_page(x); + + if (!PageSlab(page)) + return NULL; + + return page; +} + +/* + * Object placement in a slab is made very easy because we always start at + * offset 0. 
If we tune the size of the object to the alignment then we can + * get the required alignment by putting one properly sized object after + * another. + * + * Notice that the allocation order determines the sizes of the per cpu + * caches. Each processor has always one slab available for allocations. + * Increasing the allocation order reduces the number of times that slabs + * must be moved on and off the partial lists and is therefore a factor in + * locking overhead. + */ + +/* + * Mininum / Maximum order of slab pages. This influences locking overhead + * and slab fragmentation. A higher order reduces the number of partial slabs + * and increases the number of allocations possible without having to + * take the list_lock. + */ +static int slub_min_order; +static int slub_max_order = DEFAULT_MAX_ORDER; +static int slub_min_objects = DEFAULT_MIN_OBJECTS; + +/* + * Merge control. If this is set then no merging of slab caches will occur. + * (Could be removed. This was introduced to pacify the merge skeptics.) + */ +static int slub_nomerge; + +/* + * Calculate the order of allocation given an slab object size. + * + * The order of allocation has significant impact on performance and other + * system components. Generally order 0 allocations should be preferred since + * order 0 does not cause fragmentation in the page allocator. Larger objects + * be problematic to put into order 0 slabs because there may be too much + * unused space left. We go to a higher order if more than 1/8th of the slab + * would be wasted. + * + * In order to reach satisfactory performance we must ensure that a minimum + * number of objects is in one slab. Otherwise we may generate too much + * activity on the partial lists which requires taking the list_lock. This is + * less a concern for large slabs though which are rarely used. + * + * slub_max_order specifies the order where we begin to stop considering the + * number of objects in a slab as critical. 
If we reach slub_max_order then + * we try to keep the page order as low as possible. So we accept more waste + * of space in favor of a small page order. + * + * Higher order allocations also allow the placement of more objects in a + * slab and thereby reduce object handling overhead. If the user has + * requested a higher mininum order then we start with that one instead of + * the smallest order which will fit the object. + */ +static inline int slab_order(int size, int min_objects, + int max_order, int fract_leftover) +{ + int order; + int rem; + + for (order = max(slub_min_order, + fls(min_objects * size - 1) - PAGE_SHIFT); + order <= max_order; order++) { + + unsigned long slab_size = PAGE_SIZE << order; + + if (slab_size < min_objects * size) + continue; + + rem = slab_size % size; + + if (rem <= slab_size / fract_leftover) + break; + + } + + return order; +} + +static inline int calculate_order(int size) +{ + int order; + int min_objects; + int fraction; + + /* + * Attempt to find best configuration for a slab. This + * works by first attempting to generate a layout with + * the best configuration and backing off gradually. + * + * First we reduce the acceptable waste in a slab. Then + * we reduce the minimum objects required in a slab. + */ + min_objects = slub_min_objects; + while (min_objects > 1) { + fraction = 8; + while (fraction >= 4) { + order = slab_order(size, min_objects, + slub_max_order, fraction); + if (order <= slub_max_order) + return order; + fraction /= 2; + } + min_objects /= 2; + } + + /* + * We were unable to place multiple objects in a slab. Now + * lets see if we can place a single object there. + */ + order = slab_order(size, 1, slub_max_order, 1); + if (order <= slub_max_order) + return order; + + /* + * Doh this slab cannot be placed using slub_max_order. + */ + order = slab_order(size, 1, MAX_ORDER, 1); + if (order <= MAX_ORDER) + return order; + return -ENOSYS; +} + +/* + * Figure out what the alignment of the objects will be. 
+ */ +static unsigned long calculate_alignment(unsigned long flags, + unsigned long align, unsigned long size) +{ + /* + * If the user wants hardware cache aligned objects then + * follow that suggestion if the object is sufficiently + * large. + * + * The hardware cache alignment cannot override the + * specified alignment though. If that is greater + * then use it. + */ + if ((flags & SLAB_HWCACHE_ALIGN) && + size > cache_line_size() / 2) + return max_t(unsigned long, align, cache_line_size()); + + if (align < ARCH_SLAB_MINALIGN) + return ARCH_SLAB_MINALIGN; + + return ALIGN(align, sizeof(void *)); +} + +static void init_kmem_cache_node(struct kmem_cache_node *n) +{ + n->nr_partial = 0; + atomic_long_set(&n->nr_slabs, 0); + spin_lock_init(&n->list_lock); + INIT_LIST_HEAD(&n->partial); + INIT_LIST_HEAD(&n->full); +} + +#ifdef CONFIG_NUMA +/* + * No kmalloc_node yet so do it by hand. We know that this is the first + * slab on the node for this slabcache. There are no concurrent accesses + * possible. + * + * Note that this function only works on the kmalloc_node_cache + * when allocating for the kmalloc_node_cache. 
+ */ +static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflags, + int node) +{ + struct page *page; + struct kmem_cache_node *n; + + BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node)); + + page = new_slab(kmalloc_caches, gfpflags | GFP_THISNODE, node); + /* new_slab() disables interupts */ + local_irq_enable(); + + BUG_ON(!page); + n = page->freelist; + BUG_ON(!n); + page->freelist = get_freepointer(kmalloc_caches, n); + page->inuse++; + kmalloc_caches->node[node] = n; + init_object(kmalloc_caches, n, 1); + init_kmem_cache_node(n); + atomic_long_inc(&n->nr_slabs); + add_partial(n, page); + return n; +} + +static void free_kmem_cache_nodes(struct kmem_cache *s) +{ + int node; + + for_each_online_node(node) { + struct kmem_cache_node *n = s->node[node]; + if (n && n != &s->local_node) + kmem_cache_free(kmalloc_caches, n); + s->node[node] = NULL; + } +} + +static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) +{ + int node; + int local_node; + + if (slab_state >= UP) + local_node = page_to_nid(virt_to_page(s)); + else + local_node = 0; + + for_each_online_node(node) { + struct kmem_cache_node *n; + + if (local_node == node) + n = &s->local_node; + else { + if (slab_state == DOWN) { + n = early_kmem_cache_node_alloc(gfpflags, + node); + continue; + } + n = kmem_cache_alloc_node(kmalloc_caches, + gfpflags, node); + + if (!n) { + free_kmem_cache_nodes(s); + return 0; + } + + } + s->node[node] = n; + init_kmem_cache_node(n); + } + return 1; +} +#else +static void free_kmem_cache_nodes(struct kmem_cache *s) +{ +} + +static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) +{ + init_kmem_cache_node(&s->local_node); + return 1; +} +#endif + +/* + * calculate_sizes() determines the order and the distribution of data within + * a slab object. 
+ */ +static int calculate_sizes(struct kmem_cache *s) +{ + unsigned long flags = s->flags; + unsigned long size = s->objsize; + unsigned long align = s->align; + + /* + * Determine if we can poison the object itself. If the user of + * the slab may touch the object after free or before allocation + * then we should never poison the object itself. + */ + if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) && + !s->ctor && !s->dtor) + s->flags |= __OBJECT_POISON; + else + s->flags &= ~__OBJECT_POISON; + + /* + * Round up object size to the next word boundary. We can only + * place the free pointer at word boundaries and this determines + * the possible location of the free pointer. + */ + size = ALIGN(size, sizeof(void *)); + +#ifdef CONFIG_SLUB_DEBUG + /* + * If we are Redzoning then check if there is some space between the + * end of the object and the free pointer. If not then add an + * additional word to have some bytes to store Redzone information. + */ + if ((flags & SLAB_RED_ZONE) && size == s->objsize) + size += sizeof(void *); +#endif + + /* + * With that we have determined the number of bytes in actual use + * by the object. This is the potential offset to the free pointer. + */ + s->inuse = size; + +#ifdef CONFIG_SLUB_DEBUG + if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || + s->ctor || s->dtor)) { + /* + * Relocate free pointer after the object if it is not + * permitted to overwrite the first word of the object on + * kmem_cache_free. + * + * This is the case if we do RCU, have a constructor or + * destructor or are poisoning the objects. + */ + s->offset = size; + size += sizeof(void *); + } + + if (flags & SLAB_STORE_USER) + /* + * Need to store information about allocs and frees after + * the object. 
+ */ + size += 2 * sizeof(struct track); + + if (flags & SLAB_RED_ZONE) + /* + * Add some empty padding so that we can catch + * overwrites from earlier objects rather than let + * tracking information or the free pointer be + * corrupted if an user writes before the start + * of the object. + */ + size += sizeof(void *); +#endif + + /* + * Determine the alignment based on various parameters that the + * user specified and the dynamic determination of cache line size + * on bootup. + */ + align = calculate_alignment(flags, align, s->objsize); + + /* + * SLUB stores one object immediately after another beginning from + * offset 0. In order to align the objects we have to simply size + * each object to conform to the alignment. + */ + size = ALIGN(size, align); + s->size = size; + + s->order = calculate_order(size); + if (s->order < 0) + return 0; + + /* + * Determine the number of objects per slab + */ + s->objects = (PAGE_SIZE << s->order) / size; + + /* + * Verify that the number of objects is within permitted limits. + * The page->inuse field is only 16 bit wide! So we cannot have + * more than 64k objects per slab. 
+ */ + if (!s->objects || s->objects > 65535) + return 0; + return 1; + +} + +static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, + const char *name, size_t size, + size_t align, unsigned long flags, + void (*ctor)(void *, struct kmem_cache *, unsigned long), + void (*dtor)(void *, struct kmem_cache *, unsigned long)) +{ + memset(s, 0, kmem_size); + s->name = name; + s->ctor = ctor; + s->dtor = dtor; + s->objsize = size; + s->flags = flags; + s->align = align; + kmem_cache_open_debug_check(s); + + if (!calculate_sizes(s)) + goto error; + + s->refcount = 1; +#ifdef CONFIG_NUMA + s->defrag_ratio = 100; +#endif + + if (init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) + return 1; +error: + if (flags & SLAB_PANIC) + panic("Cannot create slab %s size=%lu realsize=%u " + "order=%u offset=%u flags=%lx\n", + s->name, (unsigned long)size, s->size, s->order, + s->offset, flags); + return 0; +} +EXPORT_SYMBOL(kmem_cache_open); + +/* + * Check if a given pointer is valid + */ +int kmem_ptr_validate(struct kmem_cache *s, const void *object) +{ + struct page * page; + + page = get_object_page(object); + + if (!page || s != page->slab) + /* No slab or wrong slab */ + return 0; + + if (!check_valid_pointer(s, page, object)) + return 0; + + /* + * We could also check if the object is on the slabs freelist. + * But this would be too expensive and it seems that the main + * purpose of kmem_ptr_valid is to check if the object belongs + * to a certain slab. + */ + return 1; +} +EXPORT_SYMBOL(kmem_ptr_validate); + +/* + * Determine the size of a slab object + */ +unsigned int kmem_cache_size(struct kmem_cache *s) +{ + return s->objsize; +} +EXPORT_SYMBOL(kmem_cache_size); + +const char *kmem_cache_name(struct kmem_cache *s) +{ + return s->name; +} +EXPORT_SYMBOL(kmem_cache_name); + +/* + * Attempt to free all slabs on a node. Return the number of slabs we + * were unable to free. 
+ */ +static int free_list(struct kmem_cache *s, struct kmem_cache_node *n, + struct list_head *list) +{ + int slabs_inuse = 0; + unsigned long flags; + struct page *page, *h; + + spin_lock_irqsave(&n->list_lock, flags); + list_for_each_entry_safe(page, h, list, lru) + if (!page->inuse) { + list_del(&page->lru); + discard_slab(s, page); + } else + slabs_inuse++; + spin_unlock_irqrestore(&n->list_lock, flags); + return slabs_inuse; +} + +/* + * Release all resources used by a slab cache. + */ +static int kmem_cache_close(struct kmem_cache *s) +{ + int node; + + flush_all(s); + + /* Attempt to free all objects */ + for_each_online_node(node) { + struct kmem_cache_node *n = get_node(s, node); + + n->nr_partial -= free_list(s, n, &n->partial); + if (atomic_long_read(&n->nr_slabs)) + return 1; + } + free_kmem_cache_nodes(s); + return 0; +} + +/* + * Close a cache and release the kmem_cache structure + * (must be used for caches created using kmem_cache_create) + */ +void kmem_cache_destroy(struct kmem_cache *s) +{ + down_write(&slub_lock); + s->refcount--; + if (!s->refcount) { + list_del(&s->list); + if (kmem_cache_close(s)) + WARN_ON(1); + sysfs_slab_remove(s); + kfree(s); + } + up_write(&slub_lock); +} +EXPORT_SYMBOL(kmem_cache_destroy); + +/******************************************************************** + * Kmalloc subsystem + *******************************************************************/ + +struct kmem_cache kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __cacheline_aligned; +EXPORT_SYMBOL(kmalloc_caches); + +#ifdef CONFIG_ZONE_DMA +static struct kmem_cache *kmalloc_caches_dma[KMALLOC_SHIFT_HIGH + 1]; +#endif + +static int __init setup_slub_min_order(char *str) +{ + get_option (&str, &slub_min_order); + + return 1; +} + +__setup("slub_min_order=", setup_slub_min_order); + +static int __init setup_slub_max_order(char *str) +{ + get_option (&str, &slub_max_order); + + return 1; +} + +__setup("slub_max_order=", setup_slub_max_order); + +static int __init 
setup_slub_min_objects(char *str) +{ + get_option (&str, &slub_min_objects); + + return 1; +} + +__setup("slub_min_objects=", setup_slub_min_objects); + +static int __init setup_slub_nomerge(char *str) +{ + slub_nomerge = 1; + return 1; +} + +__setup("slub_nomerge", setup_slub_nomerge); + +static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, + const char *name, int size, gfp_t gfp_flags) +{ + unsigned int flags = 0; + + if (gfp_flags & SLUB_DMA) + flags = SLAB_CACHE_DMA; + + down_write(&slub_lock); + if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, + flags, NULL, NULL)) + goto panic; + + list_add(&s->list, &slab_caches); + up_write(&slub_lock); + if (sysfs_slab_add(s)) + goto panic; + return s; + +panic: + panic("Creation of kmalloc slab %s size=%d failed.\n", name, size); +} + +static struct kmem_cache *get_slab(size_t size, gfp_t flags) +{ + int index = kmalloc_index(size); + + if (!index) + return NULL; + + /* Allocation too large? */ + BUG_ON(index < 0); + +#ifdef CONFIG_ZONE_DMA + if ((flags & SLUB_DMA)) { + struct kmem_cache *s; + struct kmem_cache *x; + char *text; + size_t realsize; + + s = kmalloc_caches_dma[index]; + if (s) + return s; + + /* Dynamically create dma cache */ + x = kmalloc(kmem_size, flags & ~SLUB_DMA); + if (!x) + panic("Unable to allocate memory for dma cache\n"); + + if (index <= KMALLOC_SHIFT_HIGH) + realsize = 1 << index; + else { + if (index == 1) + realsize = 96; + else + realsize = 192; + } + + text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", + (unsigned int)realsize); + s = create_kmalloc_cache(x, text, realsize, flags); + kmalloc_caches_dma[index] = s; + return s; + } +#endif + return &kmalloc_caches[index]; +} + +void *__kmalloc(size_t size, gfp_t flags) +{ + struct kmem_cache *s = get_slab(size, flags); + + if (s) + return slab_alloc(s, flags, -1, __builtin_return_address(0)); + return NULL; +} +EXPORT_SYMBOL(__kmalloc); + +#ifdef CONFIG_NUMA +void *__kmalloc_node(size_t size, gfp_t 
flags, int node) +{ + struct kmem_cache *s = get_slab(size, flags); + + if (s) + return slab_alloc(s, flags, node, __builtin_return_address(0)); + return NULL; +} +EXPORT_SYMBOL(__kmalloc_node); +#endif + +size_t ksize(const void *object) +{ + struct page *page = get_object_page(object); + struct kmem_cache *s; + + BUG_ON(!page); + s = page->slab; + BUG_ON(!s); + + /* + * Debugging requires use of the padding between object + * and whatever may come after it. + */ + if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) + return s->objsize; + + /* + * If we have the need to store the freelist pointer + * back there or track user information then we can + * only use the space before that information. + */ + if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) + return s->inuse; + + /* + * Else we can use all the padding etc for the allocation + */ + return s->size; +} +EXPORT_SYMBOL(ksize); + +void kfree(const void *x) +{ + struct kmem_cache *s; + struct page *page; + + if (!x) + return; + + page = virt_to_head_page(x); + s = page->slab; + + slab_free(s, page, (void *)x, __builtin_return_address(0)); +} +EXPORT_SYMBOL(kfree); + +/* + * kmem_cache_shrink removes empty slabs from the partial lists and sorts + * the remaining slabs by the number of items in use. The slabs with the + * most items in use come first. New allocations will then fill those up + * and thus they can be removed from the partial lists. + * + * The slabs with the least items are placed last. This results in them + * being allocated from last increasing the chance that the last objects + * are freed in them. 
+ */ +int kmem_cache_shrink(struct kmem_cache *s) +{ + int node; + int i; + struct kmem_cache_node *n; + struct page *page; + struct page *t; + struct list_head *slabs_by_inuse = + kmalloc(sizeof(struct list_head) * s->objects, GFP_KERNEL); + unsigned long flags; + + if (!slabs_by_inuse) + return -ENOMEM; + + flush_all(s); + for_each_online_node(node) { + n = get_node(s, node); + + if (!n->nr_partial) + continue; + + for (i = 0; i < s->objects; i++) + INIT_LIST_HEAD(slabs_by_inuse + i); + + spin_lock_irqsave(&n->list_lock, flags); + + /* + * Build lists indexed by the items in use in each slab. + * + * Note that concurrent frees may occur while we hold the + * list_lock. page->inuse here is the upper limit. + */ + list_for_each_entry_safe(page, t, &n->partial, lru) { + if (!page->inuse && slab_trylock(page)) { + /* + * Must hold slab lock here because slab_free + * may have freed the last object and be + * waiting to release the slab. + */ + list_del(&page->lru); + n->nr_partial--; + slab_unlock(page); + discard_slab(s, page); + } else { + if (n->nr_partial > MAX_PARTIAL) + list_move(&page->lru, + slabs_by_inuse + page->inuse); + } + } + + if (n->nr_partial <= MAX_PARTIAL) + goto out; + + /* + * Rebuild the partial list with the slabs filled up most + * first and the least used slabs at the end. + */ + for (i = s->objects - 1; i >= 0; i--) + list_splice(slabs_by_inuse + i, n->partial.prev); + + out: + spin_unlock_irqrestore(&n->list_lock, flags); + } + + kfree(slabs_by_inuse); + return 0; +} +EXPORT_SYMBOL(kmem_cache_shrink); + +/** + * krealloc - reallocate memory. The contents will remain unchanged. + * @p: object to reallocate memory for. + * @new_size: how many bytes of memory are required. + * @flags: the type of memory to allocate. + * + * The contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. If @p is %NULL, krealloc() + * behaves exactly like kmalloc(). 
If @size is 0 and @p is not a + * %NULL pointer, the object pointed to is freed. + */ +void *krealloc(const void *p, size_t new_size, gfp_t flags) +{ + void *ret; + size_t ks; + + if (unlikely(!p)) + return kmalloc(new_size, flags); + + if (unlikely(!new_size)) { + kfree(p); + return NULL; + } + + ks = ksize(p); + if (ks >= new_size) + return (void *)p; + + ret = kmalloc(new_size, flags); + if (ret) { + memcpy(ret, p, min(new_size, ks)); + kfree(p); + } + return ret; +} +EXPORT_SYMBOL(krealloc); + +/******************************************************************** + * Basic setup of slabs + *******************************************************************/ + +void __init kmem_cache_init(void) +{ + int i; + +#ifdef CONFIG_NUMA + /* + * Must first have the slab cache available for the allocations of the + * struct kmem_cache_node's. There is special bootstrap code in + * kmem_cache_open for slab_state == DOWN. + */ + create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", + sizeof(struct kmem_cache_node), GFP_KERNEL); +#endif + + /* Able to allocate the per node structures */ + slab_state = PARTIAL; + + /* Caches that are not of the two-to-the-power-of size */ + create_kmalloc_cache(&kmalloc_caches[1], + "kmalloc-96", 96, GFP_KERNEL); + create_kmalloc_cache(&kmalloc_caches[2], + "kmalloc-192", 192, GFP_KERNEL); + + for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) + create_kmalloc_cache(&kmalloc_caches[i], + "kmalloc", 1 << i, GFP_KERNEL); + + slab_state = UP; + + /* Provide the correct kmalloc names now that the caches are up */ + for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) + kmalloc_caches[i]. 
name = + kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); + +#ifdef CONFIG_SMP + register_cpu_notifier(&slab_notifier); +#endif + + kmem_size = offsetof(struct kmem_cache, cpu_slab) + + nr_cpu_ids * sizeof(struct page *); + + printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," + " Processors=%d, Nodes=%d\n", + KMALLOC_SHIFT_HIGH, cache_line_size(), + slub_min_order, slub_max_order, slub_min_objects, + nr_cpu_ids, nr_node_ids); +} + +/* + * Find a mergeable slab cache + */ +static int slab_unmergeable(struct kmem_cache *s) +{ + if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) + return 1; + + if (s->ctor || s->dtor) + return 1; + + return 0; +} + +static struct kmem_cache *find_mergeable(size_t size, + size_t align, unsigned long flags, + void (*ctor)(void *, struct kmem_cache *, unsigned long), + void (*dtor)(void *, struct kmem_cache *, unsigned long)) +{ + struct list_head *h; + + if (slub_nomerge || (flags & SLUB_NEVER_MERGE)) + return NULL; + + if (ctor || dtor) + return NULL; + + size = ALIGN(size, sizeof(void *)); + align = calculate_alignment(flags, align, size); + size = ALIGN(size, align); + + list_for_each(h, &slab_caches) { + struct kmem_cache *s = + container_of(h, struct kmem_cache, list); + + if (slab_unmergeable(s)) + continue; + + if (size > s->size) + continue; + + if (((flags | slub_debug) & SLUB_MERGE_SAME) != + (s->flags & SLUB_MERGE_SAME)) + continue; + /* + * Check if alignment is compatible. 
+ * Courtesy of Adrian Drzewiecki + */ + if ((s->size & ~(align -1)) != s->size) + continue; + + if (s->size - size >= sizeof(void *)) + continue; + + return s; + } + return NULL; +} + +struct kmem_cache *kmem_cache_create(const char *name, size_t size, + size_t align, unsigned long flags, + void (*ctor)(void *, struct kmem_cache *, unsigned long), + void (*dtor)(void *, struct kmem_cache *, unsigned long)) +{ + struct kmem_cache *s; + + down_write(&slub_lock); + s = find_mergeable(size, align, flags, dtor, ctor); + if (s) { + s->refcount++; + /* + * Adjust the object sizes so that we clear + * the complete object on kzalloc. + */ + s->objsize = max(s->objsize, (int)size); + s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); + if (sysfs_slab_alias(s, name)) + goto err; + } else { + s = kmalloc(kmem_size, GFP_KERNEL); + if (s && kmem_cache_open(s, GFP_KERNEL, name, + size, align, flags, ctor, dtor)) { + if (sysfs_slab_add(s)) { + kfree(s); + goto err; + } + list_add(&s->list, &slab_caches); + } else + kfree(s); + } + up_write(&slub_lock); + return s; + +err: + up_write(&slub_lock); + if (flags & SLAB_PANIC) + panic("Cannot create slabcache %s\n", name); + else + s = NULL; + return s; +} +EXPORT_SYMBOL(kmem_cache_create); + +void *kmem_cache_zalloc(struct kmem_cache *s, gfp_t flags) +{ + void *x; + + x = slab_alloc(s, flags, -1, __builtin_return_address(0)); + if (x) + memset(x, 0, s->objsize); + return x; +} +EXPORT_SYMBOL(kmem_cache_zalloc); + +#ifdef CONFIG_SMP +static void for_all_slabs(void (*func)(struct kmem_cache *, int), int cpu) +{ + struct list_head *h; + + down_read(&slub_lock); + list_for_each(h, &slab_caches) { + struct kmem_cache *s = + container_of(h, struct kmem_cache, list); + + func(s, cpu); + } + up_read(&slub_lock); +} + +/* + * Use the cpu notifier to insure that the cpu slabs are flushed when + * necessary. 
+ */ +static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + + switch (action) { + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + case CPU_DEAD: + case CPU_DEAD_FROZEN: + for_all_slabs(__flush_cpu_slab, cpu); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata slab_notifier = + { &slab_cpuup_callback, NULL, 0 }; + +#endif + +void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) +{ + struct kmem_cache *s = get_slab(size, gfpflags); + + if (!s) + return NULL; + + return slab_alloc(s, gfpflags, -1, caller); +} + +void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, + int node, void *caller) +{ + struct kmem_cache *s = get_slab(size, gfpflags); + + if (!s) + return NULL; + + return slab_alloc(s, gfpflags, node, caller); +} + +#if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG) +static int validate_slab(struct kmem_cache *s, struct page *page) +{ + void *p; + void *addr = page_address(page); + DECLARE_BITMAP(map, s->objects); + + if (!check_slab(s, page) || + !on_freelist(s, page, NULL)) + return 0; + + /* Now we know that a valid freelist exists */ + bitmap_zero(map, s->objects); + + for_each_free_object(p, s, page->freelist) { + set_bit(slab_index(p, s, addr), map); + if (!check_object(s, page, p, 0)) + return 0; + } + + for_each_object(p, s, addr) + if (!test_bit(slab_index(p, s, addr), map)) + if (!check_object(s, page, p, 1)) + return 0; + return 1; +} + +static void validate_slab_slab(struct kmem_cache *s, struct page *page) +{ + if (slab_trylock(page)) { + validate_slab(s, page); + slab_unlock(page); + } else + printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n", + s->name, page); + + if (s->flags & DEBUG_DEFAULT_FLAGS) { + if (!SlabDebug(page)) + printk(KERN_ERR "SLUB %s: SlabDebug not set " + "on slab 0x%p\n", s->name, page); + } else { + if (SlabDebug(page)) + printk(KERN_ERR "SLUB %s: 
SlabDebug set on " + "slab 0x%p\n", s->name, page); + } +} + +static int validate_slab_node(struct kmem_cache *s, struct kmem_cache_node *n) +{ + unsigned long count = 0; + struct page *page; + unsigned long flags; + + spin_lock_irqsave(&n->list_lock, flags); + + list_for_each_entry(page, &n->partial, lru) { + validate_slab_slab(s, page); + count++; + } + if (count != n->nr_partial) + printk(KERN_ERR "SLUB %s: %ld partial slabs counted but " + "counter=%ld\n", s->name, count, n->nr_partial); + + if (!(s->flags & SLAB_STORE_USER)) + goto out; + + list_for_each_entry(page, &n->full, lru) { + validate_slab_slab(s, page); + count++; + } + if (count != atomic_long_read(&n->nr_slabs)) + printk(KERN_ERR "SLUB: %s %ld slabs counted but " + "counter=%ld\n", s->name, count, + atomic_long_read(&n->nr_slabs)); + +out: + spin_unlock_irqrestore(&n->list_lock, flags); + return count; +} + +static unsigned long validate_slab_cache(struct kmem_cache *s) +{ + int node; + unsigned long count = 0; + + flush_all(s); + for_each_online_node(node) { + struct kmem_cache_node *n = get_node(s, node); + + count += validate_slab_node(s, n); + } + return count; +} + +#ifdef SLUB_RESILIENCY_TEST +static void resiliency_test(void) +{ + u8 *p; + + printk(KERN_ERR "SLUB resiliency testing\n"); + printk(KERN_ERR "-----------------------\n"); + printk(KERN_ERR "A. Corruption after allocation\n"); + + p = kzalloc(16, GFP_KERNEL); + p[16] = 0x12; + printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer" + " 0x12->0x%p\n\n", p + 16); + + validate_slab_cache(kmalloc_caches + 4); + + /* Hmmm... The next two are dangerous */ + p = kzalloc(32, GFP_KERNEL); + p[32 + sizeof(void *)] = 0x34; + printk(KERN_ERR "\n2. 
kmalloc-32: Clobber next pointer/next slab" + " 0x34 -> -0x%p\n", p); + printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n"); + + validate_slab_cache(kmalloc_caches + 5); + p = kzalloc(64, GFP_KERNEL); + p += 64 + (get_cycles() & 0xff) * sizeof(void *); + *p = 0x56; + printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", + p); + printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n"); + validate_slab_cache(kmalloc_caches + 6); + + printk(KERN_ERR "\nB. Corruption after free\n"); + p = kzalloc(128, GFP_KERNEL); + kfree(p); + *p = 0x78; + printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); + validate_slab_cache(kmalloc_caches + 7); + + p = kzalloc(256, GFP_KERNEL); + kfree(p); + p[50] = 0x9a; + printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p); + validate_slab_cache(kmalloc_caches + 8); + + p = kzalloc(512, GFP_KERNEL); + kfree(p); + p[512] = 0xab; + printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); + validate_slab_cache(kmalloc_caches + 9); +} +#else +static void resiliency_test(void) {}; +#endif + +/* + * Generate lists of code addresses where slabcache objects are allocated + * and freed. 
+ */ + +struct location { + unsigned long count; + void *addr; + long long sum_time; + long min_time; + long max_time; + long min_pid; + long max_pid; + cpumask_t cpus; + nodemask_t nodes; +}; + +struct loc_track { + unsigned long max; + unsigned long count; + struct location *loc; +}; + +static void free_loc_track(struct loc_track *t) +{ + if (t->max) + free_pages((unsigned long)t->loc, + get_order(sizeof(struct location) * t->max)); +} + +static int alloc_loc_track(struct loc_track *t, unsigned long max) +{ + struct location *l; + int order; + + if (!max) + max = PAGE_SIZE / sizeof(struct location); + + order = get_order(sizeof(struct location) * max); + + l = (void *)__get_free_pages(GFP_KERNEL, order); + + if (!l) + return 0; + + if (t->count) { + memcpy(l, t->loc, sizeof(struct location) * t->count); + free_loc_track(t); + } + t->max = max; + t->loc = l; + return 1; +} + +static int add_location(struct loc_track *t, struct kmem_cache *s, + const struct track *track) +{ + long start, end, pos; + struct location *l; + void *caddr; + unsigned long age = jiffies - track->when; + + start = -1; + end = t->count; + + for ( ; ; ) { + pos = start + (end - start + 1) / 2; + + /* + * There is nothing at "end". If we end up there + * we need to add something to before end. + */ + if (pos == end) + break; + + caddr = t->loc[pos].addr; + if (track->addr == caddr) { + + l = &t->loc[pos]; + l->count++; + if (track->when) { + l->sum_time += age; + if (age < l->min_time) + l->min_time = age; + if (age > l->max_time) + l->max_time = age; + + if (track->pid < l->min_pid) + l->min_pid = track->pid; + if (track->pid > l->max_pid) + l->max_pid = track->pid; + + cpu_set(track->cpu, l->cpus); + } + node_set(page_to_nid(virt_to_page(track)), l->nodes); + return 1; + } + + if (track->addr < caddr) + end = pos; + else + start = pos; + } + + /* + * Not found. Insert new tracking element. 
+ */ + if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max)) + return 0; + + l = t->loc + pos; + if (pos < t->count) + memmove(l + 1, l, + (t->count - pos) * sizeof(struct location)); + t->count++; + l->count = 1; + l->addr = track->addr; + l->sum_time = age; + l->min_time = age; + l->max_time = age; + l->min_pid = track->pid; + l->max_pid = track->pid; + cpus_clear(l->cpus); + cpu_set(track->cpu, l->cpus); + nodes_clear(l->nodes); + node_set(page_to_nid(virt_to_page(track)), l->nodes); + return 1; +} + +static void process_slab(struct loc_track *t, struct kmem_cache *s, + struct page *page, enum track_item alloc) +{ + void *addr = page_address(page); + DECLARE_BITMAP(map, s->objects); + void *p; + + bitmap_zero(map, s->objects); + for_each_free_object(p, s, page->freelist) + set_bit(slab_index(p, s, addr), map); + + for_each_object(p, s, addr) + if (!test_bit(slab_index(p, s, addr), map)) + add_location(t, s, get_track(s, p, alloc)); +} + +static int list_locations(struct kmem_cache *s, char *buf, + enum track_item alloc) +{ + int n = 0; + unsigned long i; + struct loc_track t; + int node; + + t.count = 0; + t.max = 0; + + /* Push back cpu slabs */ + flush_all(s); + + for_each_online_node(node) { + struct kmem_cache_node *n = get_node(s, node); + unsigned long flags; + struct page *page; + + if (!atomic_read(&n->nr_slabs)) + continue; + + spin_lock_irqsave(&n->list_lock, flags); + list_for_each_entry(page, &n->partial, lru) + process_slab(&t, s, page, alloc); + list_for_each_entry(page, &n->full, lru) + process_slab(&t, s, page, alloc); + spin_unlock_irqrestore(&n->list_lock, flags); + } + + for (i = 0; i < t.count; i++) { + struct location *l = &t.loc[i]; + + if (n > PAGE_SIZE - 100) + break; + n += sprintf(buf + n, "%7ld ", l->count); + + if (l->addr) + n += sprint_symbol(buf + n, (unsigned long)l->addr); + else + n += sprintf(buf + n, "<not-available>"); + + if (l->sum_time != l->min_time) { + unsigned long remainder; + + n += sprintf(buf + n, " 
age=%ld/%ld/%ld", + l->min_time, + div_long_long_rem(l->sum_time, l->count, &remainder), + l->max_time); + } else + n += sprintf(buf + n, " age=%ld", + l->min_time); + + if (l->min_pid != l->max_pid) + n += sprintf(buf + n, " pid=%ld-%ld", + l->min_pid, l->max_pid); + else + n += sprintf(buf + n, " pid=%ld", + l->min_pid); + + if (num_online_cpus() > 1 && !cpus_empty(l->cpus)) { + n += sprintf(buf + n, " cpus="); + n += cpulist_scnprintf(buf + n, PAGE_SIZE - n - 50, + l->cpus); + } + + if (num_online_nodes() > 1 && !nodes_empty(l->nodes)) { + n += sprintf(buf + n, " nodes="); + n += nodelist_scnprintf(buf + n, PAGE_SIZE - n - 50, + l->nodes); + } + + n += sprintf(buf + n, "\n"); + } + + free_loc_track(&t); + if (!t.count) + n += sprintf(buf, "No data\n"); + return n; +} + +static unsigned long count_partial(struct kmem_cache_node *n) +{ + unsigned long flags; + unsigned long x = 0; + struct page *page; + + spin_lock_irqsave(&n->list_lock, flags); + list_for_each_entry(page, &n->partial, lru) + x += page->inuse; + spin_unlock_irqrestore(&n->list_lock, flags); + return x; +} + +enum slab_stat_type { + SL_FULL, + SL_PARTIAL, + SL_CPU, + SL_OBJECTS +}; + +#define SO_FULL (1 << SL_FULL) +#define SO_PARTIAL (1 << SL_PARTIAL) +#define SO_CPU (1 << SL_CPU) +#define SO_OBJECTS (1 << SL_OBJECTS) + +static unsigned long slab_objects(struct kmem_cache *s, + char *buf, unsigned long flags) +{ + unsigned long total = 0; + int cpu; + int node; + int x; + unsigned long *nodes; + unsigned long *per_cpu; + + nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL); + per_cpu = nodes + nr_node_ids; + + for_each_possible_cpu(cpu) { + struct page *page = s->cpu_slab[cpu]; + int node; + + if (page) { + node = page_to_nid(page); + if (flags & SO_CPU) { + int x = 0; + + if (flags & SO_OBJECTS) + x = page->inuse; + else + x = 1; + total += x; + nodes[node] += x; + } + per_cpu[node]++; + } + } + + for_each_online_node(node) { + struct kmem_cache_node *n = get_node(s, node); + + 
if (flags & SO_PARTIAL) { + if (flags & SO_OBJECTS) + x = count_partial(n); + else + x = n->nr_partial; + total += x; + nodes[node] += x; + } + + if (flags & SO_FULL) { + int full_slabs = atomic_read(&n->nr_slabs) + - per_cpu[node] + - n->nr_partial; + + if (flags & SO_OBJECTS) + x = full_slabs * s->objects; + else + x = full_slabs; + total += x; + nodes[node] += x; + } + } + + x = sprintf(buf, "%lu", total); +#ifdef CONFIG_NUMA + for_each_online_node(node) + if (nodes[node]) + x += sprintf(buf + x, " N%d=%lu", + node, nodes[node]); +#endif + kfree(nodes); + return x + sprintf(buf + x, "\n"); +} + +static int any_slab_objects(struct kmem_cache *s) +{ + int node; + int cpu; + + for_each_possible_cpu(cpu) + if (s->cpu_slab[cpu]) + return 1; + + for_each_node(node) { + struct kmem_cache_node *n = get_node(s, node); + + if (n->nr_partial || atomic_read(&n->nr_slabs)) + return 1; + } + return 0; +} + +#define to_slab_attr(n) container_of(n, struct slab_attribute, attr) +#define to_slab(n) container_of(n, struct kmem_cache, kobj); + +struct slab_attribute { + struct attribute attr; + ssize_t (*show)(struct kmem_cache *s, char *buf); + ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count); +}; + +#define SLAB_ATTR_RO(_name) \ + static struct slab_attribute _name##_attr = __ATTR_RO(_name) + +#define SLAB_ATTR(_name) \ + static struct slab_attribute _name##_attr = \ + __ATTR(_name, 0644, _name##_show, _name##_store) + +static ssize_t slab_size_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", s->size); +} +SLAB_ATTR_RO(slab_size); + +static ssize_t align_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", s->align); +} +SLAB_ATTR_RO(align); + +static ssize_t object_size_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", s->objsize); +} +SLAB_ATTR_RO(object_size); + +static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", s->objects); +} 
+SLAB_ATTR_RO(objs_per_slab); + +static ssize_t order_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", s->order); +} +SLAB_ATTR_RO(order); + +static ssize_t ctor_show(struct kmem_cache *s, char *buf) +{ + if (s->ctor) { + int n = sprint_symbol(buf, (unsigned long)s->ctor); + + return n + sprintf(buf + n, "\n"); + } + return 0; +} +SLAB_ATTR_RO(ctor); + +static ssize_t dtor_show(struct kmem_cache *s, char *buf) +{ + if (s->dtor) { + int n = sprint_symbol(buf, (unsigned long)s->dtor); + + return n + sprintf(buf + n, "\n"); + } + return 0; +} +SLAB_ATTR_RO(dtor); + +static ssize_t aliases_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", s->refcount - 1); +} +SLAB_ATTR_RO(aliases); + +static ssize_t slabs_show(struct kmem_cache *s, char *buf) +{ + return slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU); +} +SLAB_ATTR_RO(slabs); + +static ssize_t partial_show(struct kmem_cache *s, char *buf) +{ + return slab_objects(s, buf, SO_PARTIAL); +} +SLAB_ATTR_RO(partial); + +static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf) +{ + return slab_objects(s, buf, SO_CPU); +} +SLAB_ATTR_RO(cpu_slabs); + +static ssize_t objects_show(struct kmem_cache *s, char *buf) +{ + return slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU|SO_OBJECTS); +} +SLAB_ATTR_RO(objects); + +static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); +} + +static ssize_t sanity_checks_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + s->flags &= ~SLAB_DEBUG_FREE; + if (buf[0] == '1') + s->flags |= SLAB_DEBUG_FREE; + return length; +} +SLAB_ATTR(sanity_checks); + +static ssize_t trace_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE)); +} + +static ssize_t trace_store(struct kmem_cache *s, const char *buf, + size_t length) +{ + s->flags &= ~SLAB_TRACE; + if (buf[0] == '1') + s->flags |= SLAB_TRACE; + return length; +} 
+SLAB_ATTR(trace); + +static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); +} + +static ssize_t reclaim_account_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + s->flags &= ~SLAB_RECLAIM_ACCOUNT; + if (buf[0] == '1') + s->flags |= SLAB_RECLAIM_ACCOUNT; + return length; +} +SLAB_ATTR(reclaim_account); + +static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN)); +} +SLAB_ATTR_RO(hwcache_align); + +#ifdef CONFIG_ZONE_DMA +static ssize_t cache_dma_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA)); +} +SLAB_ATTR_RO(cache_dma); +#endif + +static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU)); +} +SLAB_ATTR_RO(destroy_by_rcu); + +static ssize_t red_zone_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE)); +} + +static ssize_t red_zone_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + if (any_slab_objects(s)) + return -EBUSY; + + s->flags &= ~SLAB_RED_ZONE; + if (buf[0] == '1') + s->flags |= SLAB_RED_ZONE; + calculate_sizes(s); + return length; +} +SLAB_ATTR(red_zone); + +static ssize_t poison_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON)); +} + +static ssize_t poison_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + if (any_slab_objects(s)) + return -EBUSY; + + s->flags &= ~SLAB_POISON; + if (buf[0] == '1') + s->flags |= SLAB_POISON; + calculate_sizes(s); + return length; +} +SLAB_ATTR(poison); + +static ssize_t store_user_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER)); +} + +static ssize_t store_user_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + if 
(any_slab_objects(s)) + return -EBUSY; + + s->flags &= ~SLAB_STORE_USER; + if (buf[0] == '1') + s->flags |= SLAB_STORE_USER; + calculate_sizes(s); + return length; +} +SLAB_ATTR(store_user); + +static ssize_t validate_show(struct kmem_cache *s, char *buf) +{ + return 0; +} + +static ssize_t validate_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + if (buf[0] == '1') + validate_slab_cache(s); + else + return -EINVAL; + return length; +} +SLAB_ATTR(validate); + +static ssize_t shrink_show(struct kmem_cache *s, char *buf) +{ + return 0; +} + +static ssize_t shrink_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + if (buf[0] == '1') { + int rc = kmem_cache_shrink(s); + + if (rc) + return rc; + } else + return -EINVAL; + return length; +} +SLAB_ATTR(shrink); + +static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf) +{ + if (!(s->flags & SLAB_STORE_USER)) + return -ENOSYS; + return list_locations(s, buf, TRACK_ALLOC); +} +SLAB_ATTR_RO(alloc_calls); + +static ssize_t free_calls_show(struct kmem_cache *s, char *buf) +{ + if (!(s->flags & SLAB_STORE_USER)) + return -ENOSYS; + return list_locations(s, buf, TRACK_FREE); +} +SLAB_ATTR_RO(free_calls); + +#ifdef CONFIG_NUMA +static ssize_t defrag_ratio_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", s->defrag_ratio / 10); +} + +static ssize_t defrag_ratio_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + int n = simple_strtoul(buf, NULL, 10); + + if (n < 100) + s->defrag_ratio = n * 10; + return length; +} +SLAB_ATTR(defrag_ratio); +#endif + +static struct attribute * slab_attrs[] = { + &slab_size_attr.attr, + &object_size_attr.attr, + &objs_per_slab_attr.attr, + &order_attr.attr, + &objects_attr.attr, + &slabs_attr.attr, + &partial_attr.attr, + &cpu_slabs_attr.attr, + &ctor_attr.attr, + &dtor_attr.attr, + &aliases_attr.attr, + &align_attr.attr, + &sanity_checks_attr.attr, + &trace_attr.attr, + &hwcache_align_attr.attr, + 
&reclaim_account_attr.attr, + &destroy_by_rcu_attr.attr, + &red_zone_attr.attr, + &poison_attr.attr, + &store_user_attr.attr, + &validate_attr.attr, + &shrink_attr.attr, + &alloc_calls_attr.attr, + &free_calls_attr.attr, +#ifdef CONFIG_ZONE_DMA + &cache_dma_attr.attr, +#endif +#ifdef CONFIG_NUMA + &defrag_ratio_attr.attr, +#endif + NULL +}; + +static struct attribute_group slab_attr_group = { + .attrs = slab_attrs, +}; + +static ssize_t slab_attr_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct slab_attribute *attribute; + struct kmem_cache *s; + int err; + + attribute = to_slab_attr(attr); + s = to_slab(kobj); + + if (!attribute->show) + return -EIO; + + err = attribute->show(s, buf); + + return err; +} + +static ssize_t slab_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct slab_attribute *attribute; + struct kmem_cache *s; + int err; + + attribute = to_slab_attr(attr); + s = to_slab(kobj); + + if (!attribute->store) + return -EIO; + + err = attribute->store(s, buf, len); + + return err; +} + +static struct sysfs_ops slab_sysfs_ops = { + .show = slab_attr_show, + .store = slab_attr_store, +}; + +static struct kobj_type slab_ktype = { + .sysfs_ops = &slab_sysfs_ops, +}; + +static int uevent_filter(struct kset *kset, struct kobject *kobj) +{ + struct kobj_type *ktype = get_ktype(kobj); + + if (ktype == &slab_ktype) + return 1; + return 0; +} + +static struct kset_uevent_ops slab_uevent_ops = { + .filter = uevent_filter, +}; + +decl_subsys(slab, &slab_ktype, &slab_uevent_ops); + +#define ID_STR_LENGTH 64 + +/* Create a unique string id for a slab cache: + * format + * :[flags-]size:[memory address of kmemcache] + */ +static char *create_unique_id(struct kmem_cache *s) +{ + char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL); + char *p = name; + + BUG_ON(!name); + + *p++ = ':'; + /* + * First flags affecting slabcache operations. 
We will only + * get here for aliasable slabs so we do not need to support + * too many flags. The flags here must cover all flags that + * are matched during merging to guarantee that the id is + * unique. + */ + if (s->flags & SLAB_CACHE_DMA) + *p++ = 'd'; + if (s->flags & SLAB_RECLAIM_ACCOUNT) + *p++ = 'a'; + if (s->flags & SLAB_DEBUG_FREE) + *p++ = 'F'; + if (p != name + 1) + *p++ = '-'; + p += sprintf(p, "%07d", s->size); + BUG_ON(p > name + ID_STR_LENGTH - 1); + return name; +} + +static int sysfs_slab_add(struct kmem_cache *s) +{ + int err; + const char *name; + int unmergeable; + + if (slab_state < SYSFS) + /* Defer until later */ + return 0; + + unmergeable = slab_unmergeable(s); + if (unmergeable) { + /* + * Slabcache can never be merged so we can use the name proper. + * This is typically the case for debug situations. In that + * case we can catch duplicate names easily. + */ + sysfs_remove_link(&slab_subsys.kobj, s->name); + name = s->name; + } else { + /* + * Create a unique name for the slab as a target + * for the symlinks. + */ + name = create_unique_id(s); + } + + kobj_set_kset_s(s, slab_subsys); + kobject_set_name(&s->kobj, name); + kobject_init(&s->kobj); + err = kobject_add(&s->kobj); + if (err) + return err; + + err = sysfs_create_group(&s->kobj, &slab_attr_group); + if (err) + return err; + kobject_uevent(&s->kobj, KOBJ_ADD); + if (!unmergeable) { + /* Setup first alias */ + sysfs_slab_alias(s, s->name); + kfree(name); + } + return 0; +} + +static void sysfs_slab_remove(struct kmem_cache *s) +{ + kobject_uevent(&s->kobj, KOBJ_REMOVE); + kobject_del(&s->kobj); +} + +/* + * Need to buffer aliases during bootup until sysfs becomes + * available lest we loose that information. 
+ */ +struct saved_alias { + struct kmem_cache *s; + const char *name; + struct saved_alias *next; +}; + +struct saved_alias *alias_list; + +static int sysfs_slab_alias(struct kmem_cache *s, const char *name) +{ + struct saved_alias *al; + + if (slab_state == SYSFS) { + /* + * If we have a leftover link then remove it. + */ + sysfs_remove_link(&slab_subsys.kobj, name); + return sysfs_create_link(&slab_subsys.kobj, + &s->kobj, name); + } + + al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL); + if (!al) + return -ENOMEM; + + al->s = s; + al->name = name; + al->next = alias_list; + alias_list = al; + return 0; +} + +static int __init slab_sysfs_init(void) +{ + struct list_head *h; + int err; + + err = subsystem_register(&slab_subsys); + if (err) { + printk(KERN_ERR "Cannot register slab subsystem.\n"); + return -ENOSYS; + } + + slab_state = SYSFS; + + list_for_each(h, &slab_caches) { + struct kmem_cache *s = + container_of(h, struct kmem_cache, list); + + err = sysfs_slab_add(s); + BUG_ON(err); + } + + while (alias_list) { + struct saved_alias *al = alias_list; + + alias_list = alias_list->next; + err = sysfs_slab_alias(al->s, al->name); + BUG_ON(err); + kfree(al); + } + + resiliency_test(); + return 0; +} + +__initcall(slab_sysfs_init); +#endif diff --git a/mm/sparse.c b/mm/sparse.c index ac26eb0d73c..6f3fff907bc 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -44,7 +44,7 @@ EXPORT_SYMBOL(page_to_nid); #endif #ifdef CONFIG_SPARSEMEM_EXTREME -static struct mem_section *sparse_index_alloc(int nid) +static struct mem_section noinline *sparse_index_alloc(int nid) { struct mem_section *section = NULL; unsigned long array_size = SECTIONS_PER_ROOT * @@ -61,7 +61,7 @@ static struct mem_section *sparse_index_alloc(int nid) return section; } -static int sparse_index_init(unsigned long section_nr, int nid) +static int __meminit sparse_index_init(unsigned long section_nr, int nid) { static DEFINE_SPINLOCK(index_init_lock); unsigned long root = SECTION_NR_TO_ROOT(section_nr); @@ 
-138,7 +138,7 @@ static inline int sparse_early_nid(struct mem_section *section) } /* Record a memory area against a node. */ -void memory_present(int nid, unsigned long start, unsigned long end) +void __init memory_present(int nid, unsigned long start, unsigned long end) { unsigned long pfn; @@ -197,7 +197,7 @@ struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pn return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum); } -static int sparse_init_one_section(struct mem_section *ms, +static int __meminit sparse_init_one_section(struct mem_section *ms, unsigned long pnum, struct page *mem_map) { if (!valid_section(ms)) @@ -209,7 +209,7 @@ static int sparse_init_one_section(struct mem_section *ms, return 1; } -static struct page *sparse_early_mem_map_alloc(unsigned long pnum) +static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) { struct page *map; struct mem_section *ms = __nr_to_section(pnum); @@ -272,7 +272,7 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) * Allocate the accumulated non-linear sections, allocate a mem_map * for each and record the physical to section mapping. */ -void sparse_init(void) +void __init sparse_init(void) { unsigned long pnum; struct page *map; @@ -288,6 +288,7 @@ void sparse_init(void) } } +#ifdef CONFIG_MEMORY_HOTPLUG /* * returns the number of sections whose mem_maps were properly * set. 
If this is <=0, then that means that the passed-in @@ -327,3 +328,4 @@ out: __kfree_section_memmap(memmap, nr_pages); return ret; } +#endif diff --git a/mm/swap.c b/mm/swap.c index 2ed7be39795..d3cb966fe99 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -55,7 +55,7 @@ static void fastcall __page_cache_release(struct page *page) static void put_compound_page(struct page *page) { - page = (struct page *)page_private(page); + page = compound_head(page); if (put_page_testzero(page)) { compound_page_dtor *dtor; @@ -488,7 +488,7 @@ static int cpu_swap_callback(struct notifier_block *nfb, long *committed; committed = &per_cpu(committed_space, (long)hcpu); - if (action == CPU_DEAD) { + if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { atomic_add(*committed, &vm_committed_space); *committed = 0; __lru_add_drain((long)hcpu); diff --git a/mm/swapfile.c b/mm/swapfile.c index a2d9bb4e80d..acc172cbe3a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1531,9 +1531,6 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) error = PTR_ERR(page); goto bad_swap; } - wait_on_page_locked(page); - if (!PageUptodate(page)) - goto bad_swap; kmap(page); swap_header = page_address(page); diff --git a/mm/truncate.c b/mm/truncate.c index 0f4b6d18ab0..4fbe1a2da5f 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -12,6 +12,7 @@ #include <linux/swap.h> #include <linux/module.h> #include <linux/pagemap.h> +#include <linux/highmem.h> #include <linux/pagevec.h> #include <linux/task_io_accounting_ops.h> #include <linux/buffer_head.h> /* grr. 
try_to_release_page, @@ -46,7 +47,7 @@ void do_invalidatepage(struct page *page, unsigned long offset) static inline void truncate_partial_page(struct page *page, unsigned partial) { - memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial); + zero_user_page(page, partial, PAGE_CACHE_SIZE - partial, KM_USER0); if (PagePrivate(page)) do_invalidatepage(page, partial); } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 9eef486da90..faa2a521dea 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -431,7 +431,7 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, area->flags |= VM_VPAGES; } else { pages = kmalloc_node(array_size, - (gfp_mask & ~(__GFP_HIGHMEM | __GFP_ZERO)), + (gfp_mask & GFP_LEVEL_MASK), node); } area->pages = pages; @@ -577,6 +577,14 @@ void *vmalloc_exec(unsigned long size) return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC); } +#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) +#define GFP_VMALLOC32 GFP_DMA32 +#elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) +#define GFP_VMALLOC32 GFP_DMA +#else +#define GFP_VMALLOC32 GFP_KERNEL +#endif + /** * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) * @size: allocation size @@ -586,7 +594,7 @@ void *vmalloc_exec(unsigned long size) */ void *vmalloc_32(unsigned long size) { - return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL); + return __vmalloc(size, GFP_VMALLOC32, PAGE_KERNEL); } EXPORT_SYMBOL(vmalloc_32); @@ -602,7 +610,7 @@ void *vmalloc_32_user(unsigned long size) struct vm_struct *area; void *ret; - ret = __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); + ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL); if (ret) { write_lock(&vmlist_lock); area = __find_vm_area(ret); @@ -747,3 +755,10 @@ out_einval_locked: } EXPORT_SYMBOL(remap_vmalloc_range); +/* + * Implement a stub for vmalloc_sync_all() if the architecture chose not to + * have one. 
+ */ +void __attribute__((weak)) vmalloc_sync_all(void) +{ +} diff --git a/mm/vmscan.c b/mm/vmscan.c index db023e2ff38..1be5a6376ef 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -284,12 +284,8 @@ static void handle_write_error(struct address_space *mapping, struct page *page, int error) { lock_page(page); - if (page_mapping(page) == mapping) { - if (error == -ENOSPC) - set_bit(AS_ENOSPC, &mapping->flags); - else - set_bit(AS_EIO, &mapping->flags); - } + if (page_mapping(page) == mapping) + mapping_set_error(mapping, error); unlock_page(page); } @@ -1323,8 +1319,6 @@ static int kswapd(void *p) for ( ; ; ) { unsigned long new_order; - try_to_freeze(); - prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); new_order = pgdat->kswapd_max_order; pgdat->kswapd_max_order = 0; @@ -1335,12 +1329,19 @@ static int kswapd(void *p) */ order = new_order; } else { - schedule(); + if (!freezing(current)) + schedule(); + order = pgdat->kswapd_max_order; } finish_wait(&pgdat->kswapd_wait, &wait); - balance_pgdat(pgdat, order); + if (!try_to_freeze()) { + /* We can speed up thawing tasks if we don't call + * balance_pgdat after returning from the refrigerator + */ + balance_pgdat(pgdat, order); + } } return 0; } @@ -1527,7 +1528,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb, pg_data_t *pgdat; cpumask_t mask; - if (action == CPU_ONLINE) { + if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { for_each_online_pgdat(pgdat) { mask = node_to_cpumask(pgdat->node_id); if (any_online_cpu(mask) != NR_CPUS) diff --git a/mm/vmstat.c b/mm/vmstat.c index 6c488d6ac42..9832d9a41d8 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -281,6 +281,17 @@ EXPORT_SYMBOL(dec_zone_page_state); /* * Update the zone counters for one cpu. + * + * Note that refresh_cpu_vm_stats strives to only access + * node local memory. The per cpu pagesets on remote zones are placed + * in the memory local to the processor using that pageset. 
So the + * loop over all zones will access a series of cachelines local to + * the processor. + * + * The call to zone_page_state_add updates the cachelines with the + * statistics in the remote zone struct as well as the global cachelines + * with the global counters. These could cause remote node cache line + * bouncing and will have to be only done when necessary. */ void refresh_cpu_vm_stats(int cpu) { @@ -289,21 +300,54 @@ void refresh_cpu_vm_stats(int cpu) unsigned long flags; for_each_zone(zone) { - struct per_cpu_pageset *pcp; + struct per_cpu_pageset *p; if (!populated_zone(zone)) continue; - pcp = zone_pcp(zone, cpu); + p = zone_pcp(zone, cpu); for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - if (pcp->vm_stat_diff[i]) { + if (p->vm_stat_diff[i]) { local_irq_save(flags); - zone_page_state_add(pcp->vm_stat_diff[i], + zone_page_state_add(p->vm_stat_diff[i], zone, i); - pcp->vm_stat_diff[i] = 0; + p->vm_stat_diff[i] = 0; +#ifdef CONFIG_NUMA + /* 3 seconds idle till flush */ + p->expire = 3; +#endif local_irq_restore(flags); } +#ifdef CONFIG_NUMA + /* + * Deal with draining the remote pageset of this + * processor + * + * Check if there are pages remaining in this pageset + * if not then there is nothing to expire. + */ + if (!p->expire || (!p->pcp[0].count && !p->pcp[1].count)) + continue; + + /* + * We never drain zones local to this processor. 
+ */ + if (zone_to_nid(zone) == numa_node_id()) { + p->expire = 0; + continue; + } + + p->expire--; + if (p->expire) + continue; + + if (p->pcp[0].count) + drain_zone_pages(zone, p->pcp + 0); + + if (p->pcp[1].count) + drain_zone_pages(zone, p->pcp + 1); +#endif } } @@ -640,6 +684,24 @@ const struct seq_operations vmstat_op = { #endif /* CONFIG_PROC_FS */ #ifdef CONFIG_SMP +static DEFINE_PER_CPU(struct delayed_work, vmstat_work); +int sysctl_stat_interval __read_mostly = HZ; + +static void vmstat_update(struct work_struct *w) +{ + refresh_cpu_vm_stats(smp_processor_id()); + schedule_delayed_work(&__get_cpu_var(vmstat_work), + sysctl_stat_interval); +} + +static void __devinit start_cpu_timer(int cpu) +{ + struct delayed_work *vmstat_work = &per_cpu(vmstat_work, cpu); + + INIT_DELAYED_WORK(vmstat_work, vmstat_update); + schedule_delayed_work_on(cpu, vmstat_work, HZ + cpu); +} + /* * Use the cpu notifier to insure that the thresholds are recalculated * when necessary. @@ -648,10 +710,24 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { + long cpu = (long)hcpu; + switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_CANCELED: + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + start_cpu_timer(cpu); + break; + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + cancel_rearming_delayed_work(&per_cpu(vmstat_work, cpu)); + per_cpu(vmstat_work, cpu).work.func = NULL; + break; + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + start_cpu_timer(cpu); + break; case CPU_DEAD: + case CPU_DEAD_FROZEN: refresh_zone_stat_thresholds(); break; default: @@ -665,8 +741,13 @@ static struct notifier_block __cpuinitdata vmstat_notifier = int __init setup_vmstat(void) { + int cpu; + refresh_zone_stat_thresholds(); register_cpu_notifier(&vmstat_notifier); + + for_each_online_cpu(cpu) + start_cpu_timer(cpu); return 0; } module_init(setup_vmstat) |