From 994fc28c7b1e697ac56befe4aecabf23f0689f46 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Thu, 15 Dec 2005 14:28:17 -0800 Subject: [PATCH] add AOP_TRUNCATED_PAGE, prepend AOP_ to WRITEPAGE_ACTIVATE readpage(), prepare_write(), and commit_write() callers are updated to understand the special return code AOP_TRUNCATED_PAGE in the style of writepage() and WRITEPAGE_ACTIVATE. AOP_TRUNCATED_PAGE tells the caller that the callee has unlocked the page and that the operation should be tried again with a new page. OCFS2 uses this to detect and work around a lock inversion in its aop methods. There should be no change in behaviour for methods that don't return AOP_TRUNCATED_PAGE. WRITEPAGE_ACTIVATE is also prepended with AOP_ for consistency and they are made enums so that kerneldoc can be used to document their semantics. Signed-off-by: Zach Brown --- mm/filemap.c | 73 ++++++++++++++++++++++++++++++++++++++++------------------ mm/readahead.c | 15 +++++++----- mm/shmem.c | 2 +- mm/vmscan.c | 2 +- 4 files changed, 61 insertions(+), 31 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 33a28bfde15..6e1d08a2b8b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -831,8 +831,13 @@ readpage: /* Start the actual read. The read will unlock the page. */ error = mapping->a_ops->readpage(filp, page); - if (unlikely(error)) + if (unlikely(error)) { + if (error == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + goto find_page; + } goto readpage_error; + } if (!PageUptodate(page)) { lock_page(page); @@ -1152,26 +1157,24 @@ static int fastcall page_cache_read(struct file * file, unsigned long offset) { struct address_space *mapping = file->f_mapping; struct page *page; - int error; + int ret; - page = page_cache_alloc_cold(mapping); - if (!page) - return -ENOMEM; + do { + page = page_cache_alloc_cold(mapping); + if (!page) + return -ENOMEM; + + ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL); + if (ret == 0) + ret = mapping->a_ops->readpage(file, page); + else if (ret == -EEXIST) + ret = 0; /* losing race to add is OK */ - error = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL); - if (!error) { - error = mapping->a_ops->readpage(file, page); page_cache_release(page); - return error; - } - /* - * We arrive here in the unlikely event that someone - * raced with us and added our page to the cache first - * or we are out of memory for radix-tree nodes. - */ - page_cache_release(page); - return error == -EEXIST ? 
0 : error; + } while (ret == AOP_TRUNCATED_PAGE); + + return ret; } #define MMAP_LOTSAMISS (100) @@ -1331,10 +1334,14 @@ page_not_uptodate: goto success; } - if (!mapping->a_ops->readpage(file, page)) { + error = mapping->a_ops->readpage(file, page); + if (!error) { wait_on_page_locked(page); if (PageUptodate(page)) goto success; + } else if (error == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + goto retry_find; } /* @@ -1358,10 +1365,14 @@ page_not_uptodate: goto success; } ClearPageError(page); - if (!mapping->a_ops->readpage(file, page)) { + error = mapping->a_ops->readpage(file, page); + if (!error) { wait_on_page_locked(page); if (PageUptodate(page)) goto success; + } else if (error == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + goto retry_find; } /* @@ -1444,10 +1455,14 @@ page_not_uptodate: goto success; } - if (!mapping->a_ops->readpage(file, page)) { + error = mapping->a_ops->readpage(file, page); + if (!error) { wait_on_page_locked(page); if (PageUptodate(page)) goto success; + } else if (error == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + goto retry_find; } /* @@ -1470,10 +1485,14 @@ page_not_uptodate: } ClearPageError(page); - if (!mapping->a_ops->readpage(file, page)) { + error = mapping->a_ops->readpage(file, page); + if (!error) { wait_on_page_locked(page); if (PageUptodate(page)) goto success; + } else if (error == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + goto retry_find; } /* @@ -1934,12 +1953,16 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, status = a_ops->prepare_write(file, page, offset, offset+bytes); if (unlikely(status)) { loff_t isize = i_size_read(inode); + + if (status != AOP_TRUNCATED_PAGE) + unlock_page(page); + page_cache_release(page); + if (status == AOP_TRUNCATED_PAGE) + continue; /* * prepare_write() may have instantiated a few blocks * outside i_size. Trim these off again. 
*/ - unlock_page(page); - page_cache_release(page); if (pos + bytes > isize) vmtruncate(inode, isize); break; @@ -1952,6 +1975,10 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, cur_iov, iov_base, bytes); flush_dcache_page(page); status = a_ops->commit_write(file, page, offset, offset+bytes); + if (status == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + continue; + } if (likely(copied > 0)) { if (!status) status = copied; diff --git a/mm/readahead.c b/mm/readahead.c index 72e7adbb87c..8d6eeaaa629 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -158,7 +158,7 @@ static int read_pages(struct address_space *mapping, struct file *filp, { unsigned page_idx; struct pagevec lru_pvec; - int ret = 0; + int ret; if (mapping->a_ops->readpages) { ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); @@ -171,14 +171,17 @@ static int read_pages(struct address_space *mapping, struct file *filp, list_del(&page->lru); if (!add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) { - mapping->a_ops->readpage(filp, page); - if (!pagevec_add(&lru_pvec, page)) - __pagevec_lru_add(&lru_pvec); - } else { - page_cache_release(page); + ret = mapping->a_ops->readpage(filp, page); + if (ret != AOP_TRUNCATED_PAGE) { + if (!pagevec_add(&lru_pvec, page)) + __pagevec_lru_add(&lru_pvec); + continue; + } /* else fall through to release */ } + page_cache_release(page); } pagevec_lru_add(&lru_pvec); + ret = 0; out: return ret; } diff --git a/mm/shmem.c b/mm/shmem.c index dc25565a61e..d9fc277940d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -855,7 +855,7 @@ unlock: swap_free(swap); redirty: set_page_dirty(page); - return WRITEPAGE_ACTIVATE; /* Return with the page locked */ + return AOP_WRITEPAGE_ACTIVATE; /* Return with the page locked */ } #ifdef CONFIG_NUMA diff --git a/mm/vmscan.c b/mm/vmscan.c index b0cd81c32de..795a050fe47 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -367,7 +367,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) res = mapping->a_ops->writepage(page, &wbc); if (res < 0) handle_write_error(mapping, page, res); - if (res == WRITEPAGE_ACTIVATE) { + if (res == AOP_WRITEPAGE_ACTIVATE) { ClearPageReclaim(page); return PAGE_ACTIVATE; } -- cgit v1.2.3 From 47f3a867f6310d6abfa185ab12baaba7ed1d69af Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Fri, 6 Jan 2006 00:10:32 -0800 Subject: [PATCH] mm: fix __alloc_pages cpuset ALLOC_* flags Two changes to the setting of the ALLOC_CPUSET flag in mm/page_alloc.c:__alloc_pages() - A bug fix - the "ignoring mins" case should not be honoring ALLOC_CPUSET. This case of all cases, since it is handling a request that will free up more memory than is asked for (exiting tasks, e.g.) should be allowed to escape cpuset constraints when memory is tight. - A logic change to make it simpler. Honor cpusets even on GFP_ATOMIC (!wait) requests. With this, cpuset confinement applies to all requests except ALLOC_NO_WATERMARKS, so that in a subsequent cleanup patch, I can remove the ALLOC_CPUSET flag entirely. Since I don't know any real reason this logic has to be either way, I am choosing the path of the simplest code. 
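The net effect is that every request except the "ignoring mins" path now carries ALLOC_CPUSET. A minimal user-space model of the selection logic (the ALLOC_* values and the ALLOC_HARDER condition are simplified for illustration; this is not the kernel function itself):

#include <stdio.h>

#define ALLOC_HIGH              0x01    /* illustrative values only */
#define ALLOC_HARDER            0x02
#define ALLOC_CPUSET            0x04
#define ALLOC_NO_WATERMARKS     0x08

/* toy model of the flag selection in __alloc_pages() after this patch */
static int pick_alloc_flags(int wait, int gfp_high, int ignoring_mins)
{
        int flags = 0;

        /* "ignoring mins": e.g. an exiting task that will free more
         * memory than it asks for -- allowed to escape cpuset limits */
        if (ignoring_mins)
                return ALLOC_NO_WATERMARKS;

        if (!wait)
                flags |= ALLOC_HARDER;          /* condition simplified */
        if (gfp_high)
                flags |= ALLOC_HIGH;
        flags |= ALLOC_CPUSET;  /* now set even for GFP_ATOMIC (!wait) */
        return flags;
}

int main(void)
{
        printf("GFP_ATOMIC    -> %#x\n", pick_alloc_flags(0, 1, 0));
        printf("GFP_KERNEL    -> %#x\n", pick_alloc_flags(1, 0, 0));
        printf("ignoring mins -> %#x\n", pick_alloc_flags(0, 0, 1));
        return 0;
}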
Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fe14a8c87fc..1e49dc7cd61 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -903,8 +903,7 @@ restart: alloc_flags |= ALLOC_HARDER; if (gfp_mask & __GFP_HIGH) alloc_flags |= ALLOC_HIGH; - if (wait) - alloc_flags |= ALLOC_CPUSET; + alloc_flags |= ALLOC_CPUSET; /* * Go through the zonelist again. Let __GFP_HIGH and allocations @@ -926,7 +925,7 @@ restart: nofail_alloc: /* go through the zonelist yet again, ignoring mins */ page = get_page_from_freelist(gfp_mask, order, - zonelist, ALLOC_NO_WATERMARKS|ALLOC_CPUSET); + zonelist, ALLOC_NO_WATERMARKS); if (page) goto got_pg; if (gfp_mask & __GFP_NOFAIL) { -- cgit v1.2.3 From 5ac24eefd1d89bc6aa2817741c3bd5d4205b2efd Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Fri, 6 Jan 2006 00:10:33 -0800 Subject: [PATCH] memhotplug: __add_section remove unused pgdat definition __add_section defines an unused pointer to the zones pgdat. Remove this definition. This fixes a compile warning. Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index f6d4af8af8a..a918f77f02f 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -42,7 +42,6 @@ extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, int nr_pages); static int __add_section(struct zone *zone, unsigned long phys_start_pfn) { - struct pglist_data *pgdat = zone->zone_pgdat; int nr_pages = PAGES_PER_SECTION; int ret; -- cgit v1.2.3 From d7339071f6a8b50101d7ba327926b770f22d5d8b Mon Sep 17 00:00:00 2001 From: Hans Reiser Date: Fri, 6 Jan 2006 00:10:36 -0800 Subject: [PATCH] reiser4: vfs: add truncate_inode_pages_range() This patch makes truncate_inode_pages_range from truncate_inode_pages. truncate_inode_pages became a one-liner call to truncate_inode_pages_range. Reiser4 needs truncate_inode_pages_ranges because it tries to keep correspondence between existences of metadata pointing to data pages and pages to which those metadata point to. So, when metadata of certain part of file is removed from filesystem tree, only pages of corresponding range are to be truncated. (Needed by the madvise(MADV_REMOVE) patch) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/truncate.c | 44 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/truncate.c b/mm/truncate.c index 9173ab50060..7dee3274590 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -82,12 +82,15 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) } /** - * truncate_inode_pages - truncate *all* the pages from an offset + * truncate_inode_pages - truncate range of pages specified by start and + * end byte offsets * @mapping: mapping to truncate * @lstart: offset from which to truncate + * @lend: offset to which to truncate * - * Truncate the page cache at a set offset, removing the pages that are beyond - * that offset (and zeroing out partial pages). + * Truncate the page cache, removing the pages that are between + * specified offsets (and zeroing out partial page + * (if lstart is not page aligned)). * * Truncate takes two passes - the first pass is nonblocking. 
It will not * block on page locks and it will not block on writeback. The second pass @@ -101,12 +104,12 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) * We pass down the cache-hot hint to the page freeing code. Even if the * mapping is large, it is probably the case that the final pages are the most * recently touched, and freeing happens in ascending file offset order. - * - * Called under (and serialised by) inode->i_sem. */ -void truncate_inode_pages(struct address_space *mapping, loff_t lstart) +void truncate_inode_pages_range(struct address_space *mapping, + loff_t lstart, loff_t lend) { const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; + pgoff_t end; const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); struct pagevec pvec; pgoff_t next; @@ -115,13 +118,22 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) if (mapping->nrpages == 0) return; + BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1)); + end = (lend >> PAGE_CACHE_SHIFT); + pagevec_init(&pvec, 0); next = start; - while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + while (next <= end && + pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; pgoff_t page_index = page->index; + if (page_index > end) { + next = page_index; + break; + } + if (page_index > next) next = page_index; next++; @@ -157,9 +169,15 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) next = start; continue; } + if (pvec.pages[0]->index > end) { + pagevec_release(&pvec); + break; + } for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; + if (page->index > end) + break; lock_page(page); wait_on_page_writeback(page); if (page->index > next) @@ -171,7 +189,19 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) pagevec_release(&pvec); } } +EXPORT_SYMBOL(truncate_inode_pages_range); +/** + * truncate_inode_pages - truncate *all* the pages from an offset + * @mapping: mapping to truncate + * @lstart: offset from which to truncate + * + * Called under (and serialised by) inode->i_sem. + */ +void truncate_inode_pages(struct address_space *mapping, loff_t lstart) +{ + truncate_inode_pages_range(mapping, lstart, (loff_t)-1); +} EXPORT_SYMBOL(truncate_inode_pages); /** -- cgit v1.2.3 From f6b3ec238d12c8cc6cc71490c6e3127988460349 Mon Sep 17 00:00:00 2001 From: Badari Pulavarty Date: Fri, 6 Jan 2006 00:10:38 -0800 Subject: [PATCH] madvise(MADV_REMOVE): remove pages from tmpfs shm backing store Here is the patch to implement madvise(MADV_REMOVE) - which frees up a given range of pages & its associated backing store. Current implementation supports only shmfs/tmpfs and other filesystems return -ENOSYS. "Some app allocates large tmpfs files, then when some task quits and some client disconnect, some memory can be released. However the only way to release tmpfs-swap is to MADV_REMOVE". - Andrea Arcangeli Databases want to use this feature to drop a section of their bufferpool (shared memory segments) - without writing back to disk/swap space. This feature is also useful for supporting hot-plug memory on UML. Concerns raised by Andrew Morton: - "We have no plan for holepunching! If we _do_ have such a plan (or might in the future) then what would the API look like? I think sys_holepunch(fd, start, len), so we should start out with that." 
- Using madvise is very weird, because people will ask "why do I need to mmap my file before I can stick a hole in it?" - None of the other madvise operations call into the filesystem in this manner. A broad question is: is this capability an MM operation or a filesytem operation? truncate, for example, is a filesystem operation which sometimes has MM side-effects. madvise is an mm operation and with this patch, it gains FS side-effects, only they're really, really significant ones." Comments: - Andrea suggested the fs operation too but then it's more efficient to have it as a mm operation with fs side effects, because they don't immediatly know fd and physical offset of the range. It's possible to fixup in userland and to use the fs operation but it's more expensive, the vmas are already in the kernel and we can use them. Short term plan & Future Direction: - We seem to need this interface only for shmfs/tmpfs files in the short term. We have to add hooks into the filesystem for correctness and completeness. This is what this patch does. - In the future, plan is to support both fs and mmap apis also. This also involves (other) filesystem specific functions to be implemented. - Current patch doesn't support VM_NONLINEAR - which can be addressed in the future. Signed-off-by: Badari Pulavarty Cc: Hugh Dickins Cc: Andrea Arcangeli Cc: Michael Kerrisk Cc: Ulrich Drepper Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/madvise.c | 35 +++++++++++++++++++++++++++++++++++ mm/memory.c | 25 ++++++++++++++++++++++++- mm/shmem.c | 32 ++++++++++++++++++++++++-------- 3 files changed, 83 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/madvise.c b/mm/madvise.c index 2b7cf0400a2..ae0ae3ea299 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -140,6 +140,36 @@ static long madvise_dontneed(struct vm_area_struct * vma, return 0; } +/* + * Application wants to free up the pages and associated backing store. + * This is effectively punching a hole into the middle of a file. + * + * NOTE: Currently, only shmfs/tmpfs is supported for this operation. + * Other filesystems return -ENOSYS. + */ +static long madvise_remove(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + struct address_space *mapping; + loff_t offset, endoff; + + if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) + return -EINVAL; + + if (!vma->vm_file || !vma->vm_file->f_mapping + || !vma->vm_file->f_mapping->host) { + return -EINVAL; + } + + mapping = vma->vm_file->f_mapping; + + offset = (loff_t)(start - vma->vm_start) + + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + endoff = (loff_t)(end - vma->vm_start - 1) + + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + return vmtruncate_range(mapping->host, offset, endoff); +} + static long madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, int behavior) @@ -152,6 +182,9 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, case MADV_RANDOM: error = madvise_behavior(vma, prev, start, end, behavior); break; + case MADV_REMOVE: + error = madvise_remove(vma, start, end); + break; case MADV_WILLNEED: error = madvise_willneed(vma, prev, start, end); @@ -190,6 +223,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, * some pages ahead. * MADV_DONTNEED - the application is finished with the given range, * so the kernel can free resources associated with it. 
+ * MADV_REMOVE - the application wants to free up the given range of + * pages and associated backing store. * * return values: * zero - success diff --git a/mm/memory.c b/mm/memory.c index d8dde07a365..e249088908c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1770,9 +1770,32 @@ out_big: out_busy: return -ETXTBSY; } - EXPORT_SYMBOL(vmtruncate); +int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) +{ + struct address_space *mapping = inode->i_mapping; + + /* + * If the underlying filesystem is not going to provide + * a way to truncate a range of blocks (punch a hole) - + * we should return failure right now. + */ + if (!inode->i_op || !inode->i_op->truncate_range) + return -ENOSYS; + + down(&inode->i_sem); + down_write(&inode->i_alloc_sem); + unmap_mapping_range(mapping, offset, (end - offset), 1); + truncate_inode_pages_range(mapping, offset, end); + inode->i_op->truncate_range(inode, offset, end); + up_write(&inode->i_alloc_sem); + up(&inode->i_sem); + + return 0; +} +EXPORT_SYMBOL(vmtruncate_range); + /* * Primitive swap readahead code. We simply read an aligned block of * (1 << page_cluster) entries in the swap area. This method is chosen diff --git a/mm/shmem.c b/mm/shmem.c index d9fc277940d..65c148efa2e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -457,7 +457,7 @@ static void shmem_free_pages(struct list_head *next) } while (next); } -static void shmem_truncate(struct inode *inode) +static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) { struct shmem_inode_info *info = SHMEM_I(inode); unsigned long idx; @@ -475,18 +475,27 @@ static void shmem_truncate(struct inode *inode) long nr_swaps_freed = 0; int offset; int freed; + int punch_hole = 0; inode->i_ctime = inode->i_mtime = CURRENT_TIME; - idx = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; if (idx >= info->next_index) return; spin_lock(&info->lock); info->flags |= SHMEM_TRUNCATE; - limit = info->next_index; - info->next_index = idx; + if (likely(end == (loff_t) -1)) { + limit = info->next_index; + info->next_index = idx; + } else { + limit = (end + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + if (limit > info->next_index) + limit = info->next_index; + punch_hole = 1; + } + topdir = info->i_indirect; - if (topdir && idx <= SHMEM_NR_DIRECT) { + if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) { info->i_indirect = NULL; nr_pages_to_free++; list_add(&topdir->lru, &pages_to_free); @@ -573,11 +582,12 @@ static void shmem_truncate(struct inode *inode) set_page_private(subdir, page_private(subdir) - freed); if (offset) spin_unlock(&info->lock); - BUG_ON(page_private(subdir) > offset); + if (!punch_hole) + BUG_ON(page_private(subdir) > offset); } if (offset) offset = 0; - else if (subdir) { + else if (subdir && !page_private(subdir)) { dir[diroff] = NULL; nr_pages_to_free++; list_add(&subdir->lru, &pages_to_free); @@ -594,7 +604,7 @@ done2: * Also, though shmem_getpage checks i_size before adding to * cache, no recheck after: so fix the narrow window there too. 
*/ - truncate_inode_pages(inode->i_mapping, inode->i_size); + truncate_inode_pages_range(inode->i_mapping, start, end); } spin_lock(&info->lock); @@ -614,6 +624,11 @@ done2: } } +static void shmem_truncate(struct inode *inode) +{ + shmem_truncate_range(inode, inode->i_size, (loff_t)-1); +} + static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; @@ -2083,6 +2098,7 @@ static struct file_operations shmem_file_operations = { static struct inode_operations shmem_inode_operations = { .truncate = shmem_truncate, .setattr = shmem_notify_change, + .truncate_range = shmem_truncate_range, }; static struct inode_operations shmem_dir_inode_operations = { -- cgit v1.2.3 From f0916794f00be44154102dedaeafe68b743078a2 Mon Sep 17 00:00:00 2001 From: Adam Litke Date: Fri, 6 Jan 2006 00:10:40 -0800 Subject: [PATCH] Hugetlb: Remove duplicate i_size check cleanup Signed-off-by: David Gibson Signed-off-by: Adam Litke Cc: William Lee Irwin III Cc: "Seth, Rohit" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3e52df7c471..acb864130f8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -350,19 +350,12 @@ static struct page *find_lock_huge_page(struct address_space *mapping, { struct page *page; int err; - struct inode *inode = mapping->host; - unsigned long size; retry: page = find_lock_page(mapping, idx); if (page) goto out; - /* Check to make sure the mapping hasn't been truncated */ - size = i_size_read(inode) >> HPAGE_SHIFT; - if (idx >= size) - goto out; - if (hugetlb_get_quota(mapping)) goto out; page = alloc_huge_page(); -- cgit v1.2.3 From 85ef47f74afe96c8c23eaa605f28cc01443c905f Mon Sep 17 00:00:00 2001 From: Adam Litke Date: Fri, 6 Jan 2006 00:10:42 -0800 Subject: [PATCH] Hugetlb: Rename find_lock_page to find_or_alloc_huge_page find_lock_huge_page() isn't a great name, since it does extra things not analagous to find_lock_page(). Rename it find_or_alloc_huge_page() which is closer to the mark. Signed-off-by: David Gibson Signed-off-by: Adam Litke Cc: William Lee Irwin III Cc: "Seth, Rohit" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index acb864130f8..fdbbbb90caa 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -345,8 +345,8 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, flush_tlb_range(vma, start, end); } -static struct page *find_lock_huge_page(struct address_space *mapping, - unsigned long idx) +static struct page *find_or_alloc_huge_page(struct address_space *mapping, + unsigned long idx) { struct page *page; int err; @@ -398,7 +398,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * Use page lock to guard against racing truncation * before we get page_table_lock. */ - page = find_lock_huge_page(mapping, idx); + page = find_or_alloc_huge_page(mapping, idx); if (!page) goto out; -- cgit v1.2.3 From 86e5216f8d8aa258ba836caffe2613d79cc9aead Mon Sep 17 00:00:00 2001 From: Adam Litke Date: Fri, 6 Jan 2006 00:10:43 -0800 Subject: [PATCH] Hugetlb: Reorganize hugetlb_fault to prepare for COW This patch splits the "no_page()" type activity into its own function, hugetlb_no_page(). hugetlb_fault() becomes the entry point for hugetlb faults and delegates to the appropriate handler depending on the type of fault. 
Right now we still have only hugetlb_no_page() but a later patch introduces a COW fault. Signed-off-by: David Gibson Signed-off-by: Adam Litke Cc: William Lee Irwin III Cc: "Seth, Rohit" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index fdbbbb90caa..cf8225108b2 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -376,20 +376,15 @@ out: return page; } -int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, int write_access) +int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) { int ret = VM_FAULT_SIGBUS; unsigned long idx; unsigned long size; - pte_t *pte; struct page *page; struct address_space *mapping; - pte = huge_pte_alloc(mm, address); - if (!pte) - goto out; - mapping = vma->vm_file->f_mapping; idx = ((address - vma->vm_start) >> HPAGE_SHIFT) + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); @@ -408,11 +403,11 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, goto backout; ret = VM_FAULT_MINOR; - if (!pte_none(*pte)) + if (!pte_none(*ptep)) goto backout; add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE); - set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page)); + set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, page)); spin_unlock(&mm->page_table_lock); unlock_page(page); out: @@ -426,6 +421,27 @@ backout: goto out; } +int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, int write_access) +{ + pte_t *ptep; + pte_t entry; + + ptep = huge_pte_alloc(mm, address); + if (!ptep) + return VM_FAULT_OOM; + + entry = *ptep; + if (pte_none(entry)) + return hugetlb_no_page(mm, vma, address, ptep); + + /* + * We could get here if another thread instantiated the pte + * before the test above. + */ + return VM_FAULT_MINOR; +} + int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, int *length, int i) -- cgit v1.2.3 From 1e8f889b10d8d2223105719e36ce45688fedbd59 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Fri, 6 Jan 2006 00:10:44 -0800 Subject: [PATCH] Hugetlb: Copy on Write support Implement copy-on-write support for hugetlb mappings so MAP_PRIVATE can be supported. This helps us to safely use hugetlb pages in many more applications. The patch makes the following changes. If needed, I also have it broken out according to the following paragraphs. 1. Add a pair of functions to set/clear write access on huge ptes. The writable check in make_huge_pte is moved out to the caller for use by COW later. 2. Hugetlb copy-on-write requires special case handling in the following situations: - copy_hugetlb_page_range() - Copied pages must be write protected so a COW fault will be triggered (if necessary) if those pages are written to. - find_or_alloc_huge_page() - Only MAP_SHARED pages are added to the page cache. MAP_PRIVATE pages still need to be locked however. 3. Provide hugetlb_cow() and calls from hugetlb_fault() and hugetlb_no_page() which handles the COW fault by making the actual copy. 4. Remove the check in hugetlbfs_file_map() so that MAP_PRIVATE mmaps will be allowed. Make MAP_HUGETLB exempt from the depricated VM_RESERVED mapping check. 
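From user space the change looks like the hedged sketch below: a MAP_PRIVATE mapping of a hugetlbfs file can now be written to, with writes staying private to the process instead of the mapping being refused. The hugetlbfs mount point, file name and 2MB huge page size are assumptions for illustration, not part of the patch.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define HPAGE_SIZE (2UL * 1024 * 1024)  /* assumed huge page size */

int main(void)
{
        /* assumed hugetlbfs mount; adjust for the local system */
        int fd = open("/mnt/huge/cow-demo", O_CREAT | O_RDWR, 0600);
        char *p;

        if (fd < 0) {
                perror("open");
                return 1;
        }

        /* previously rejected; allowed once hugetlb COW is supported */
        p = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE,
                 MAP_PRIVATE, fd, 0);
        if (p == MAP_FAILED) {
                perror("mmap");
                close(fd);
                return 1;
        }

        /* writes stay private to this process: if the underlying huge
         * page is shared, the kernel copies it on first write
         * (hugetlb_cow) rather than writing through to the file */
        memset(p, 0xaa, HPAGE_SIZE);

        munmap(p, HPAGE_SIZE);
        close(fd);
        return 0;
}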
Signed-off-by: David Gibson Signed-off-by: Adam Litke Cc: William Lee Irwin III Cc: "Seth, Rohit" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 127 ++++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 108 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cf8225108b2..da8a211414c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -261,11 +261,12 @@ struct vm_operations_struct hugetlb_vm_ops = { .nopage = hugetlb_nopage, }; -static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page) +static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, + int writable) { pte_t entry; - if (vma->vm_flags & VM_WRITE) { + if (writable) { entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); } else { @@ -277,12 +278,27 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page) return entry; } +static void set_huge_ptep_writable(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) +{ + pte_t entry; + + entry = pte_mkwrite(pte_mkdirty(*ptep)); + ptep_set_access_flags(vma, address, ptep, entry, 1); + update_mmu_cache(vma, address, entry); + lazy_mmu_prot_update(entry); +} + + int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma) { pte_t *src_pte, *dst_pte, entry; struct page *ptepage; unsigned long addr; + int cow; + + cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { src_pte = huge_pte_offset(src, addr); @@ -294,6 +310,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, spin_lock(&dst->page_table_lock); spin_lock(&src->page_table_lock); if (!pte_none(*src_pte)) { + if (cow) + ptep_set_wrprotect(src, addr, src_pte); entry = *src_pte; ptepage = pte_page(entry); get_page(ptepage); @@ -346,7 +364,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, } static struct page *find_or_alloc_huge_page(struct address_space *mapping, - unsigned long idx) + unsigned long idx, int shared) { struct page *page; int err; @@ -364,26 +382,80 @@ retry: goto out; } - err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); - if (err) { - put_page(page); - hugetlb_put_quota(mapping); - if (err == -EEXIST) - goto retry; - page = NULL; + if (shared) { + err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); + if (err) { + put_page(page); + hugetlb_put_quota(mapping); + if (err == -EEXIST) + goto retry; + page = NULL; + } + } else { + /* Caller expects a locked page */ + lock_page(page); } out: return page; } +static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, pte_t pte) +{ + struct page *old_page, *new_page; + int i, avoidcopy; + + old_page = pte_page(pte); + + /* If no-one else is actually using this page, avoid the copy + * and just make the page writable */ + avoidcopy = (page_count(old_page) == 1); + if (avoidcopy) { + set_huge_ptep_writable(vma, address, ptep); + return VM_FAULT_MINOR; + } + + page_cache_get(old_page); + new_page = alloc_huge_page(); + + if (!new_page) { + page_cache_release(old_page); + + /* Logically this is OOM, not a SIGBUS, but an OOM + * could cause the kernel to go killing other + * processes which won't help the hugepage situation + * at all (?) 
*/ + return VM_FAULT_SIGBUS; + } + + spin_unlock(&mm->page_table_lock); + for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) + copy_user_highpage(new_page + i, old_page + i, + address + i*PAGE_SIZE); + spin_lock(&mm->page_table_lock); + + ptep = huge_pte_offset(mm, address & HPAGE_MASK); + if (likely(pte_same(*ptep, pte))) { + /* Break COW */ + set_huge_pte_at(mm, address, ptep, + make_huge_pte(vma, new_page, 1)); + /* Make the old page be freed below */ + new_page = old_page; + } + page_cache_release(new_page); + page_cache_release(old_page); + return VM_FAULT_MINOR; +} + int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *ptep) + unsigned long address, pte_t *ptep, int write_access) { int ret = VM_FAULT_SIGBUS; unsigned long idx; unsigned long size; struct page *page; struct address_space *mapping; + pte_t new_pte; mapping = vma->vm_file->f_mapping; idx = ((address - vma->vm_start) >> HPAGE_SHIFT) @@ -393,10 +465,13 @@ int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, * Use page lock to guard against racing truncation * before we get page_table_lock. */ - page = find_or_alloc_huge_page(mapping, idx); + page = find_or_alloc_huge_page(mapping, idx, + vma->vm_flags & VM_SHARED); if (!page) goto out; + BUG_ON(!PageLocked(page)); + spin_lock(&mm->page_table_lock); size = i_size_read(mapping->host) >> HPAGE_SHIFT; if (idx >= size) @@ -407,7 +482,15 @@ int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, goto backout; add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE); - set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, page)); + new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) + && (vma->vm_flags & VM_SHARED))); + set_huge_pte_at(mm, address, ptep, new_pte); + + if (write_access && !(vma->vm_flags & VM_SHARED)) { + /* Optimization, do the COW without a second fault */ + ret = hugetlb_cow(mm, vma, address, ptep, new_pte); + } + spin_unlock(&mm->page_table_lock); unlock_page(page); out: @@ -426,6 +509,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, { pte_t *ptep; pte_t entry; + int ret; ptep = huge_pte_alloc(mm, address); if (!ptep) @@ -433,13 +517,18 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, entry = *ptep; if (pte_none(entry)) - return hugetlb_no_page(mm, vma, address, ptep); + return hugetlb_no_page(mm, vma, address, ptep, write_access); - /* - * We could get here if another thread instantiated the pte - * before the test above. - */ - return VM_FAULT_MINOR; + ret = VM_FAULT_MINOR; + + spin_lock(&mm->page_table_lock); + /* Check for a racing update before calling hugetlb_cow */ + if (likely(pte_same(entry, *ptep))) + if (write_access && !pte_write(entry)) + ret = hugetlb_cow(mm, vma, address, ptep, entry); + spin_unlock(&mm->page_table_lock); + + return ret; } int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, -- cgit v1.2.3 From 96df9333c94d7d5aeceb21f6c5e7ae8ff34753cf Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 6 Jan 2006 00:10:45 -0800 Subject: [PATCH] mm: dequeue a huge page near to this node This was discussed at http://marc.theaimsgroup.com/?l=linux-kernel&m=113166526217117&w=2 This patch changes the dequeueing to select a huge page near the node executing instead of always beginning to check for free nodes from node 0. This will result in a placement of the huge pages near the executing processor improving performance. 
The existing implementation can place the huge pages far away from the executing processor causing significant degradation of performance. The search starting from zero also means that the lower zones quickly run out of memory. Selecting a huge page near the process distributed the huge pages better. Signed-off-by: Christoph Lameter Cc: William Lee Irwin III Cc: Adam Litke Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index da8a211414c..e93bd63462f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -40,14 +40,16 @@ static struct page *dequeue_huge_page(void) { int nid = numa_node_id(); struct page *page = NULL; + struct zonelist *zonelist = NODE_DATA(nid)->node_zonelists; + struct zone **z; - if (list_empty(&hugepage_freelists[nid])) { - for (nid = 0; nid < MAX_NUMNODES; ++nid) - if (!list_empty(&hugepage_freelists[nid])) - break; + for (z = zonelist->zones; *z; z++) { + nid = (*z)->zone_pgdat->node_id; + if (!list_empty(&hugepage_freelists[nid])) + break; } - if (nid >= 0 && nid < MAX_NUMNODES && - !list_empty(&hugepage_freelists[nid])) { + + if (*z) { page = list_entry(hugepage_freelists[nid].next, struct page, lru); list_del(&page->lru); -- cgit v1.2.3 From 5da7ca86078964cbfe6c83efc1205904587706fe Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 6 Jan 2006 00:10:46 -0800 Subject: [PATCH] Add NUMA policy support for huge pages. The huge_zonelist() function in the memory policy layer provides an list of zones ordered by NUMA distance. The hugetlb layer will walk that list looking for a zone that has available huge pages but is also in the nodeset of the current cpuset. This patch does not contain the folding of find_or_alloc_huge_page() that was controversial in the earlier discussion. 
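The walk described above can be illustrated with a toy user-space model (node IDs, ordering and free-page counts are invented, and the cpuset check is omitted): the zonelist for the faulting address is already ordered by NUMA distance, and the allocator dequeues from the first node in that order that still has a free huge page.

#include <stdio.h>

#define MAX_NODES 4

/* free huge pages per node, analogous to free_huge_pages_node[] */
static int free_huge_pages_node[MAX_NODES] = { 0, 0, 3, 1 };

/*
 * Toy stand-in for huge_zonelist(): node IDs ordered by increasing
 * distance from the node the task is running on (here node 1).
 */
static const int zonelist_for_node1[] = { 1, 0, 2, 3, -1 };

static int dequeue_huge_page_node(const int *zonelist)
{
        const int *z;

        for (z = zonelist; *z >= 0; z++) {
                if (free_huge_pages_node[*z] > 0) {
                        free_huge_pages_node[*z]--;
                        return *z;  /* nearest node with a free huge page */
                }
        }
        return -1;                  /* no huge pages left anywhere */
}

int main(void)
{
        printf("allocated from node %d\n",
               dequeue_huge_page_node(zonelist_for_node1));
        printf("allocated from node %d\n",
               dequeue_huge_page_node(zonelist_for_node1));
        return 0;
}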
Signed-off-by: Christoph Lameter Cc: Andi Kleen Acked-by: William Lee Irwin III Cc: Adam Litke Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 24 ++++++++++++++---------- mm/mempolicy.c | 39 ++++++++++++++++++++++++++++++--------- 2 files changed, 44 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e93bd63462f..eb405565949 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -11,6 +11,8 @@ #include #include #include +#include + #include #include @@ -36,11 +38,12 @@ static void enqueue_huge_page(struct page *page) free_huge_pages_node[nid]++; } -static struct page *dequeue_huge_page(void) +static struct page *dequeue_huge_page(struct vm_area_struct *vma, + unsigned long address) { int nid = numa_node_id(); struct page *page = NULL; - struct zonelist *zonelist = NODE_DATA(nid)->node_zonelists; + struct zonelist *zonelist = huge_zonelist(vma, address); struct zone **z; for (z = zonelist->zones; *z; z++) { @@ -87,13 +90,13 @@ void free_huge_page(struct page *page) spin_unlock(&hugetlb_lock); } -struct page *alloc_huge_page(void) +struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) { struct page *page; int i; spin_lock(&hugetlb_lock); - page = dequeue_huge_page(); + page = dequeue_huge_page(vma, addr); if (!page) { spin_unlock(&hugetlb_lock); return NULL; @@ -196,7 +199,7 @@ static unsigned long set_max_huge_pages(unsigned long count) spin_lock(&hugetlb_lock); try_to_free_low(count); while (count < nr_huge_pages) { - struct page *page = dequeue_huge_page(); + struct page *page = dequeue_huge_page(NULL, 0); if (!page) break; update_and_free_page(page); @@ -365,8 +368,9 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, flush_tlb_range(vma, start, end); } -static struct page *find_or_alloc_huge_page(struct address_space *mapping, - unsigned long idx, int shared) +static struct page *find_or_alloc_huge_page(struct vm_area_struct *vma, + unsigned long addr, struct address_space *mapping, + unsigned long idx, int shared) { struct page *page; int err; @@ -378,7 +382,7 @@ retry: if (hugetlb_get_quota(mapping)) goto out; - page = alloc_huge_page(); + page = alloc_huge_page(vma, addr); if (!page) { hugetlb_put_quota(mapping); goto out; @@ -418,7 +422,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, } page_cache_get(old_page); - new_page = alloc_huge_page(); + new_page = alloc_huge_page(vma, address); if (!new_page) { page_cache_release(old_page); @@ -467,7 +471,7 @@ int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, * Use page lock to guard against racing truncation * before we get page_table_lock. */ - page = find_or_alloc_huge_page(mapping, idx, + page = find_or_alloc_huge_page(vma, address, mapping, idx, vma->vm_flags & VM_SHARED); if (!page) goto out; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 72f402cc9c9..45c51ac6344 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -785,6 +785,34 @@ static unsigned offset_il_node(struct mempolicy *pol, return nid; } +/* Determine a node number for interleave */ +static inline unsigned interleave_nid(struct mempolicy *pol, + struct vm_area_struct *vma, unsigned long addr, int shift) +{ + if (vma) { + unsigned long off; + + off = vma->vm_pgoff; + off += (addr - vma->vm_start) >> shift; + return offset_il_node(pol, vma, off); + } else + return interleave_nodes(pol); +} + +/* Return a zonelist suitable for a huge page allocation. 
*/ +struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr) +{ + struct mempolicy *pol = get_vma_policy(current, vma, addr); + + if (pol->policy == MPOL_INTERLEAVE) { + unsigned nid; + + nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT); + return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER); + } + return zonelist_policy(GFP_HIGHUSER, pol); +} + /* Allocate a page in interleaved policy. Own path because it needs to do special accounting. */ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, @@ -833,15 +861,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) if (unlikely(pol->policy == MPOL_INTERLEAVE)) { unsigned nid; - if (vma) { - unsigned long off; - off = vma->vm_pgoff; - off += (addr - vma->vm_start) >> PAGE_SHIFT; - nid = offset_il_node(pol, vma, off); - } else { - /* fall back to process interleaving */ - nid = interleave_nodes(pol); - } + + nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); return alloc_page_interleave(gfp, 0, nid); } return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol)); -- cgit v1.2.3 From 21abb1478a87e26f5fa71dbcb7cf4264272c2248 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 6 Jan 2006 00:10:47 -0800 Subject: [PATCH] Remove old node based policy interface from mempolicy.c mempolicy.c contains provisional interface for huge page allocation based on node numbers. This is in use in SLES9 but was never used (AFAIK) in upstream versions of Linux. Huge page allocations now use zonelists to figure out where to allocate pages. The use of zonelists allows us to find the closest hugepage which was the consideration of the NUMA distance for huge page allocations. Remove the obsolete functions. Signed-off-by: Christoph Lameter Cc: Andi Kleen Acked-by: William Lee Irwin III Cc: Adam Litke Acked-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 48 ------------------------------------------------ 1 file changed, 48 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 45c51ac6344..96714e2646a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -960,54 +960,6 @@ void __mpol_free(struct mempolicy *p) kmem_cache_free(policy_cache, p); } -/* - * Hugetlb policy. Same as above, just works with node numbers instead of - * zonelists. - */ - -/* Find first node suitable for an allocation */ -int mpol_first_node(struct vm_area_struct *vma, unsigned long addr) -{ - struct mempolicy *pol = get_vma_policy(current, vma, addr); - - switch (pol->policy) { - case MPOL_DEFAULT: - return numa_node_id(); - case MPOL_BIND: - return pol->v.zonelist->zones[0]->zone_pgdat->node_id; - case MPOL_INTERLEAVE: - return interleave_nodes(pol); - case MPOL_PREFERRED: - return pol->v.preferred_node >= 0 ? - pol->v.preferred_node : numa_node_id(); - } - BUG(); - return 0; -} - -/* Find secondary valid nodes for an allocation */ -int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr) -{ - struct mempolicy *pol = get_vma_policy(current, vma, addr); - - switch (pol->policy) { - case MPOL_PREFERRED: - case MPOL_DEFAULT: - case MPOL_INTERLEAVE: - return 1; - case MPOL_BIND: { - struct zone **z; - for (z = pol->v.zonelist->zones; *z; z++) - if ((*z)->zone_pgdat->node_id == nid) - return 1; - return 0; - } - default: - BUG(); - return 0; - } -} - /* * Shared memory backing store policy support. 
* -- cgit v1.2.3 From 6bda666a03f063968833760c5bb5c13062ab9291 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 6 Jan 2006 00:10:49 -0800 Subject: [PATCH] hugepages: fold find_or_alloc_pages into huge_no_page() The number of parameters for find_or_alloc_page increases significantly after policy support is added to huge pages. Simplify the code by folding find_or_alloc_huge_page() into hugetlb_no_page(). Adam Litke objected to this piece in an earlier patch but I think this is a good simplification. Diffstat shows that we can get rid of almost half of the lines of find_or_alloc_page(). If we can find no consensus then lets simply drop this patch. Signed-off-by: Christoph Lameter Cc: Andi Kleen Acked-by: William Lee Irwin III Cc: Adam Litke Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 66 ++++++++++++++++++++++-------------------------------------- 1 file changed, 24 insertions(+), 42 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index eb405565949..f4c43d7980b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -368,43 +368,6 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, flush_tlb_range(vma, start, end); } -static struct page *find_or_alloc_huge_page(struct vm_area_struct *vma, - unsigned long addr, struct address_space *mapping, - unsigned long idx, int shared) -{ - struct page *page; - int err; - -retry: - page = find_lock_page(mapping, idx); - if (page) - goto out; - - if (hugetlb_get_quota(mapping)) - goto out; - page = alloc_huge_page(vma, addr); - if (!page) { - hugetlb_put_quota(mapping); - goto out; - } - - if (shared) { - err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); - if (err) { - put_page(page); - hugetlb_put_quota(mapping); - if (err == -EEXIST) - goto retry; - page = NULL; - } - } else { - /* Caller expects a locked page */ - lock_page(page); - } -out: - return page; -} - static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t pte) { @@ -471,12 +434,31 @@ int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, * Use page lock to guard against racing truncation * before we get page_table_lock. */ - page = find_or_alloc_huge_page(vma, address, mapping, idx, - vma->vm_flags & VM_SHARED); - if (!page) - goto out; +retry: + page = find_lock_page(mapping, idx); + if (!page) { + if (hugetlb_get_quota(mapping)) + goto out; + page = alloc_huge_page(vma, address); + if (!page) { + hugetlb_put_quota(mapping); + goto out; + } - BUG_ON(!PageLocked(page)); + if (vma->vm_flags & VM_SHARED) { + int err; + + err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); + if (err) { + put_page(page); + hugetlb_put_quota(mapping); + if (err == -EEXIST) + goto retry; + goto out; + } + } else + lock_page(page); + } spin_lock(&mm->page_table_lock); size = i_size_read(mapping->host) >> HPAGE_SHIFT; -- cgit v1.2.3 From a94b3ab7eab4edcc9b2cb474b188f774c331adf7 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Fri, 6 Jan 2006 00:10:51 -0800 Subject: [PATCH] mm: remove arch independent NODES_SPAN_OTHER_NODES The NODES_SPAN_OTHER_NODES config option was created so that DISCONTIGMEM could handle pSeries numa layouts. However, support for DISCONTIGMEM has been replaced by SPARSEMEM on powerpc. As a result, this config option and supporting code is no longer needed. I have already sent a patch to Paul that removes the option from powerpc specific code. This removes the arch independent piece. 
Doesn't really matter which is applied first. Signed-off-by: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1e49dc7cd61..07825c637a5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1708,8 +1708,6 @@ void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone, for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) { if (!early_pfn_valid(pfn)) continue; - if (!early_pfn_in_nid(pfn, nid)) - continue; page = pfn_to_page(pfn); set_page_links(page, zone, nid, pfn); set_page_count(page, 1); -- cgit v1.2.3 From c484d41042e6ccb88089ca41e3b3eed1bafdae21 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 6 Jan 2006 00:10:55 -0800 Subject: [PATCH] mm: free_pages_and_swap_cache opt Minor optimization (though it doesn't help in the PREEMPT case, severely constrained by small ZAP_BLOCK_SIZE). free_pages_and_swap_cache works in chunks of 16, calling release_pages which works in chunks of PAGEVEC_SIZE. But PAGEVEC_SIZE was dropped from 16 to 14 in 2.6.10, so we're now doing more spin_lock_irq'ing than necessary: use PAGEVEC_SIZE throughout. Signed-off-by: Hugh Dickins Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap_state.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/swap_state.c b/mm/swap_state.c index 0df9a57b1de..fc2aecb70a9 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -14,6 +14,7 @@ #include #include #include +#include #include @@ -272,12 +273,11 @@ void free_page_and_swap_cache(struct page *page) */ void free_pages_and_swap_cache(struct page **pages, int nr) { - int chunk = 16; struct page **pagep = pages; lru_add_drain(); while (nr) { - int todo = min(chunk, nr); + int todo = min(nr, PAGEVEC_SIZE); int i; for (i = 0; i < todo; i++) -- cgit v1.2.3 From c54ad30c784b84d0275152d0ca80985b21471811 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 6 Jan 2006 00:10:56 -0800 Subject: [PATCH] mm: pagealloc opt Slightly optimise some page allocation and freeing functions by taking advantage of knowing whether or not interrupts are disabled. 
Signed-off-by: Nick Piggin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 07825c637a5..680cbe5b6ba 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -375,11 +375,10 @@ static int free_pages_bulk(struct zone *zone, int count, struct list_head *list, unsigned int order) { - unsigned long flags; struct page *page = NULL; int ret = 0; - spin_lock_irqsave(&zone->lock, flags); + spin_lock(&zone->lock); zone->all_unreclaimable = 0; zone->pages_scanned = 0; while (!list_empty(list) && count--) { @@ -389,12 +388,13 @@ free_pages_bulk(struct zone *zone, int count, __free_pages_bulk(page, zone, order); ret++; } - spin_unlock_irqrestore(&zone->lock, flags); + spin_unlock(&zone->lock); return ret; } void __free_pages_ok(struct page *page, unsigned int order) { + unsigned long flags; LIST_HEAD(list); int i; int reserved = 0; @@ -415,7 +415,9 @@ void __free_pages_ok(struct page *page, unsigned int order) list_add(&page->lru, &list); mod_page_state(pgfree, 1 << order); kernel_map_pages(page, 1<lock, flags); + spin_lock(&zone->lock); for (i = 0; i < count; ++i) { page = __rmqueue(zone, order); if (page == NULL) @@ -552,7 +553,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, allocated++; list_add_tail(&page->lru, list); } - spin_unlock_irqrestore(&zone->lock, flags); + spin_unlock(&zone->lock); return allocated; } @@ -589,6 +590,7 @@ void drain_remote_pages(void) #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) static void __drain_pages(unsigned int cpu) { + unsigned long flags; struct zone *zone; int i; @@ -600,8 +602,10 @@ static void __drain_pages(unsigned int cpu) struct per_cpu_pages *pcp; pcp = &pset->pcp[i]; + local_irq_save(flags); pcp->count -= free_pages_bulk(zone, pcp->count, &pcp->list, 0); + local_irq_restore(flags); } } } @@ -744,7 +748,7 @@ again: if (pcp->count <= pcp->low) pcp->count += rmqueue_bulk(zone, 0, pcp->batch, &pcp->list); - if (pcp->count) { + if (likely(pcp->count)) { page = list_entry(pcp->list.next, struct page, lru); list_del(&page->lru); pcp->count--; -- cgit v1.2.3 From 77a8a78834561398fb4cb1480afa7b0e80b1dd53 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 6 Jan 2006 00:10:57 -0800 Subject: [PATCH] mm: set_page_refs opt Inline set_page_refs. Signed-off-by: Nick Piggin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 19 +++++++++++++++++-- mm/page_alloc.c | 17 ----------------- 2 files changed, 17 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index 6bf134e8fb3..85004f540e3 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -9,5 +9,20 @@ * 2 of the License, or (at your option) any later version. */ -/* page_alloc.c */ -extern void set_page_refs(struct page *page, int order); +static inline void set_page_refs(struct page *page, int order) +{ +#ifdef CONFIG_MMU + set_page_count(page, 1); +#else + int i; + + /* + * We need to reference all the pages for this order, otherwise if + * anyone accesses one of the pages with (get/put) it will be freed. 
+ * - eg: access_process_vm() + */ + for (i = 0; i < (1 << order); i++) + set_page_count(page + i, 1); +#endif /* CONFIG_MMU */ +} + diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 680cbe5b6ba..6d513faf050 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -453,23 +453,6 @@ expand(struct zone *zone, struct page *page, return page; } -void set_page_refs(struct page *page, int order) -{ -#ifdef CONFIG_MMU - set_page_count(page, 1); -#else - int i; - - /* - * We need to reference all the pages for this order, otherwise if - * anyone accesses one of the pages with (get/put) it will be freed. - * - eg: access_process_vm() - */ - for (i = 0; i < (1 << order); i++) - set_page_count(page + i, 1); -#endif /* CONFIG_MMU */ -} - /* * This page is about to be returned from the page allocator */ -- cgit v1.2.3 From 92be2e33b155ee76399f51f41fb061f850d02f08 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 6 Jan 2006 00:10:57 -0800 Subject: [PATCH] mm: microopt conditions Micro optimise some conditionals where we don't need lazy evaluation. Signed-off-by: Nick Piggin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6d513faf050..b0647b51527 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -336,9 +336,9 @@ static inline void __free_pages_bulk (struct page *page, static inline int free_pages_check(const char *function, struct page *page) { - if ( page_mapcount(page) || - page->mapping != NULL || - page_count(page) != 0 || + if (unlikely(page_mapcount(page) | + (page->mapping != NULL) | + (page_count(page) != 0) | (page->flags & ( 1 << PG_lru | 1 << PG_private | @@ -348,7 +348,7 @@ static inline int free_pages_check(const char *function, struct page *page) 1 << PG_slab | 1 << PG_swapcache | 1 << PG_writeback | - 1 << PG_reserved ))) + 1 << PG_reserved )))) bad_page(function, page); if (PageDirty(page)) __ClearPageDirty(page); @@ -458,9 +458,9 @@ expand(struct zone *zone, struct page *page, */ static int prep_new_page(struct page *page, int order) { - if ( page_mapcount(page) || - page->mapping != NULL || - page_count(page) != 0 || + if (unlikely(page_mapcount(page) | + (page->mapping != NULL) | + (page_count(page) != 0) | (page->flags & ( 1 << PG_lru | 1 << PG_private | @@ -471,7 +471,7 @@ static int prep_new_page(struct page *page, int order) 1 << PG_slab | 1 << PG_swapcache | 1 << PG_writeback | - 1 << PG_reserved ))) + 1 << PG_reserved )))) bad_page(__FUNCTION__, page); /* -- cgit v1.2.3 From 13e7444b0ec59f96d81a4e8c379d5f38fc5f2cc1 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 6 Jan 2006 00:10:58 -0800 Subject: [PATCH] mm: remove bad_range bad_range is supposed to be a temporary check. It would be a pity to throw it out. Make it depend on CONFIG_DEBUG_VM instead. CONFIG_HOLES_IN_ZONE systems were relying on this to check pfn_valid in the page allocator. Add that to page_is_buddy instead. 
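The coalescing loop in __free_pages_bulk() below rests on two index identities mirrored by __page_find_buddy() and __find_combined_index(): a block's buddy differs from it only in bit `order`, and the merged block starts at the index with that bit cleared. A small stand-alone illustration (not code from the patch):

#include <stdio.h>

/* buddy of the block starting at page_idx, at the given order */
static unsigned long buddy_idx(unsigned long page_idx, unsigned int order)
{
        return page_idx ^ (1UL << order);
}

/* start of the merged block once a page and its buddy coalesce */
static unsigned long combined_idx(unsigned long page_idx, unsigned int order)
{
        return page_idx & ~(1UL << order);
}

int main(void)
{
        unsigned long idx = 12;         /* arbitrary example block */
        unsigned int order;

        for (order = 0; order < 4; order++)
                printf("order %u: block %2lu, buddy %2lu, combined %2lu\n",
                       order, idx, buddy_idx(idx, order),
                       combined_idx(idx, order));
        return 0;
}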
Signed-off-by: Nick Piggin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b0647b51527..088712f2ac0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -81,6 +81,7 @@ int min_free_kbytes = 1024; unsigned long __initdata nr_kernel_pages; unsigned long __initdata nr_all_pages; +#ifdef CONFIG_DEBUG_VM static int page_outside_zone_boundaries(struct zone *zone, struct page *page) { int ret = 0; @@ -122,6 +123,13 @@ static int bad_range(struct zone *zone, struct page *page) return 0; } +#else +static inline int bad_range(struct zone *zone, struct page *page) +{ + return 0; +} +#endif + static void bad_page(const char *function, struct page *page) { printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n", @@ -255,14 +263,20 @@ __find_combined_index(unsigned long page_idx, unsigned int order) /* * This function checks whether a page is free && is the buddy * we can do coalesce a page and its buddy if - * (a) the buddy is free && - * (b) the buddy is on the buddy system && - * (c) a page and its buddy have the same order. + * (a) the buddy is not in a hole && + * (b) the buddy is free && + * (c) the buddy is on the buddy system && + * (d) a page and its buddy have the same order. * for recording page's order, we use page_private(page) and PG_private. * */ static inline int page_is_buddy(struct page *page, int order) { +#ifdef CONFIG_HOLES_IN_ZONE + if (!pfn_valid(page_to_pfn(page))) + return 0; +#endif + if (PagePrivate(page) && (page_order(page) == order) && page_count(page) == 0) @@ -314,17 +328,15 @@ static inline void __free_pages_bulk (struct page *page, struct free_area *area; struct page *buddy; - combined_idx = __find_combined_index(page_idx, order); buddy = __page_find_buddy(page, page_idx, order); - - if (bad_range(zone, buddy)) - break; if (!page_is_buddy(buddy, order)) break; /* Move the buddy up one level. */ + list_del(&buddy->lru); area = zone->free_area + order; area->nr_free--; rmv_page_order(buddy); + combined_idx = __find_combined_index(page_idx, order); page = page + (combined_idx - page_idx); page_idx = combined_idx; order++; -- cgit v1.2.3 From 2d92c5c9150a2a9ca3dc25da58d5042e17a96b6a Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 6 Jan 2006 00:10:59 -0800 Subject: [PATCH] mm: remove pcp low struct per_cpu_pages.low is useless. Remove it. Signed-off-by: Nick Piggin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 088712f2ac0..7cff958e781 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -740,7 +740,7 @@ again: page = NULL; pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; local_irq_save(flags); - if (pcp->count <= pcp->low) + if (!pcp->count) pcp->count += rmqueue_bulk(zone, 0, pcp->batch, &pcp->list); if (likely(pcp->count)) { @@ -1345,10 +1345,9 @@ void show_free_areas(void) pageset = zone_pcp(zone, cpu); for (temperature = 0; temperature < 2; temperature++) - printk("cpu %d %s: low %d, high %d, batch %d used:%d\n", + printk("cpu %d %s: high %d, batch %d used:%d\n", cpu, temperature ? 
"cold" : "hot", - pageset->pcp[temperature].low, pageset->pcp[temperature].high, pageset->pcp[temperature].batch, pageset->pcp[temperature].count); @@ -1790,14 +1789,12 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) pcp = &p->pcp[0]; /* hot */ pcp->count = 0; - pcp->low = 0; pcp->high = 6 * batch; pcp->batch = max(1UL, 1 * batch); INIT_LIST_HEAD(&pcp->list); pcp = &p->pcp[1]; /* cold*/ pcp->count = 0; - pcp->low = 0; pcp->high = 2 * batch; pcp->batch = max(1UL, batch/2); INIT_LIST_HEAD(&pcp->list); @@ -2193,12 +2190,10 @@ static int zoneinfo_show(struct seq_file *m, void *arg) seq_printf(m, "\n cpu: %i pcp: %i" "\n count: %i" - "\n low: %i" "\n high: %i" "\n batch: %i", i, j, pageset->pcp[j].count, - pageset->pcp[j].low, pageset->pcp[j].high, pageset->pcp[j].batch); } -- cgit v1.2.3 From a86b1f53166a260ced8f3c8c526945bf496f2e78 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 6 Jan 2006 00:11:00 -0800 Subject: [PATCH] mm: page_state fixes read_page_state and __get_page_state only traverse online CPUs, which will cause results to fluctuate when CPUs are plugged in or out. Signed-off-by: Nick Piggin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7cff958e781..379618747de 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1169,12 +1169,11 @@ EXPORT_SYMBOL(nr_pagecache); DEFINE_PER_CPU(long, nr_pagecache_local) = 0; #endif -void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) +static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) { int cpu = 0; memset(ret, 0, sizeof(*ret)); - cpus_and(*cpumask, *cpumask, cpu_online_map); cpu = first_cpu(*cpumask); while (cpu < NR_CPUS) { @@ -1227,7 +1226,7 @@ unsigned long __read_page_state(unsigned long offset) unsigned long ret = 0; int cpu; - for_each_online_cpu(cpu) { + for_each_cpu(cpu) { unsigned long in; in = (unsigned long)&per_cpu(page_states, cpu) + offset; -- cgit v1.2.3 From 085cc7d5de3cc662da7ea78296464a0d52f3f01f Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 6 Jan 2006 00:11:01 -0800 Subject: [PATCH] mm: page_alloc cleanups Small cleanups that does not change generated code with the gcc's I've tested with. 
Signed-off-by: Nick Piggin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 379618747de..925b0b985f7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -447,8 +447,7 @@ void __free_pages_ok(struct page *page, unsigned int order) * * -- wli */ -static inline struct page * -expand(struct zone *zone, struct page *page, +static inline void expand(struct zone *zone, struct page *page, int low, int high, struct free_area *area) { unsigned long size = 1 << high; @@ -462,7 +461,6 @@ expand(struct zone *zone, struct page *page, area->nr_free++; set_page_order(&page[size], high); } - return page; } /* @@ -522,7 +520,8 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order) rmv_page_order(page); area->nr_free--; zone->free_pages -= 1UL << order; - return expand(zone, page, order, current_order, area); + expand(zone, page, order, current_order, area); + return page; } return NULL; @@ -537,19 +536,16 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list) { int i; - int allocated = 0; - struct page *page; spin_lock(&zone->lock); for (i = 0; i < count; ++i) { - page = __rmqueue(zone, order); - if (page == NULL) + struct page *page = __rmqueue(zone, order); + if (unlikely(page == NULL)) break; - allocated++; list_add_tail(&page->lru, list); } spin_unlock(&zone->lock); - return allocated; + return i; } #ifdef CONFIG_NUMA -- cgit v1.2.3 From 008857c1a49ccffc31a54c3ea7e182833bd61304 Mon Sep 17 00:00:00 2001 From: Ravikiran G Thirumalai Date: Fri, 6 Jan 2006 00:11:01 -0800 Subject: [PATCH] Cleanup bootmem allocator and fix alloc_bootmem_low Patch cleans up the alloc_bootmem fix for swiotlb. Patch removes alloc_bootmem_*_limit api and fixes alloc_boot_*low api to do the right thing -- allocate from low32 memory. 
Signed-off-by: Ravikiran Thirumalai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 38 +++++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index 16b9465eb4e..cbb82ee14fb 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -393,15 +393,14 @@ unsigned long __init free_all_bootmem (void) return(free_all_bootmem_core(NODE_DATA(0))); } -void * __init __alloc_bootmem_limit (unsigned long size, unsigned long align, unsigned long goal, - unsigned long limit) +void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal) { pg_data_t *pgdat = pgdat_list; void *ptr; for_each_pgdat(pgdat) if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, - align, goal, limit))) + align, goal, 0))) return(ptr); /* @@ -413,15 +412,40 @@ void * __init __alloc_bootmem_limit (unsigned long size, unsigned long align, un } -void * __init __alloc_bootmem_node_limit (pg_data_t *pgdat, unsigned long size, unsigned long align, - unsigned long goal, unsigned long limit) +void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, + unsigned long goal) { void *ptr; - ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, limit); + ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); if (ptr) return (ptr); - return __alloc_bootmem_limit(size, align, goal, limit); + return __alloc_bootmem(size, align, goal); } +#define LOW32LIMIT 0xffffffff + +void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal) +{ + pg_data_t *pgdat = pgdat_list; + void *ptr; + + for_each_pgdat(pgdat) + if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, + align, goal, LOW32LIMIT))) + return(ptr); + + /* + * Whoops, we cannot satisfy the allocation request. + */ + printk(KERN_ALERT "low bootmem alloc of %lu bytes failed!\n", size); + panic("Out of low memory"); + return NULL; +} + +void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ + return __alloc_bootmem_core(pgdat->bdata, size, align, goal, LOW32LIMIT); +} -- cgit v1.2.3 From a226f6c899799fe2c4919daa0767ac579c88f7bd Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 6 Jan 2006 00:11:08 -0800 Subject: [PATCH] FRV: Clean up bootmem allocator's page freeing algorithm The attached patch cleans up the way the bootmem allocator frees pages. A new function, __free_pages_bootmem(), is provided in mm/page_alloc.c that is called from mm/bootmem.c to turn pages over to the main allocator. All the bits of code to initialise pages (clearing PG_reserved and setting the page count) are moved to here. The checks on page validity are removed, on the assumption that the struct page arrays will have been prepared correctly. 
Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 20 ++++---------------- mm/internal.h | 2 ++ mm/page_alloc.c | 36 +++++++++++++++++++++++++++++++++++- 3 files changed, 41 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index cbb82ee14fb..35c32290f71 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -296,20 +296,12 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) unsigned long v = ~map[i / BITS_PER_LONG]; if (gofast && v == ~0UL) { - int j, order; + int order; page = pfn_to_page(pfn); count += BITS_PER_LONG; - __ClearPageReserved(page); order = ffs(BITS_PER_LONG) - 1; - set_page_refs(page, order); - for (j = 1; j < BITS_PER_LONG; j++) { - if (j + 16 < BITS_PER_LONG) - prefetchw(page + j + 16); - __ClearPageReserved(page + j); - set_page_count(page + j, 0); - } - __free_pages(page, order); + __free_pages_bootmem(page, order); i += BITS_PER_LONG; page += BITS_PER_LONG; } else if (v) { @@ -319,9 +311,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) for (m = 1; m && i < idx; m<<=1, page++, i++) { if (v & m) { count++; - __ClearPageReserved(page); - set_page_refs(page, 0); - __free_page(page); + __free_pages_bootmem(page, 0); } } } else { @@ -339,9 +329,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) count = 0; for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) { count++; - __ClearPageReserved(page); - set_page_count(page, 1); - __free_page(page); + __free_pages_bootmem(page, 0); } total += count; bdata->node_bootmem_map = NULL; diff --git a/mm/internal.h b/mm/internal.h index 85004f540e3..17256bb2f4e 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -26,3 +26,5 @@ static inline void set_page_refs(struct page *page, int order) #endif /* CONFIG_MMU */ } +extern void fastcall __init __free_pages_bootmem(struct page *page, + unsigned int order); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 925b0b985f7..cdad3249cf7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -53,6 +53,8 @@ unsigned long totalram_pages __read_mostly; unsigned long totalhigh_pages __read_mostly; long nr_swap_pages; +static void fastcall free_hot_cold_page(struct page *page, int cold); + /* * results with 256, 32 in the lowmem_reserve sysctl: * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) @@ -432,6 +434,39 @@ void __free_pages_ok(struct page *page, unsigned int order) local_irq_restore(flags); } +/* + * permit the bootmem allocator to evade page validation on high-order frees + */ +void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order) +{ + if (order == 0) { + __ClearPageReserved(page); + set_page_count(page, 0); + + free_hot_cold_page(page, 0); + } else { + LIST_HEAD(list); + int loop; + + for (loop = 0; loop < BITS_PER_LONG; loop++) { + struct page *p = &page[loop]; + + if (loop + 16 < BITS_PER_LONG) + prefetchw(p + 16); + __ClearPageReserved(p); + set_page_count(p, 0); + } + + arch_free_page(page, order); + + mod_page_state(pgfree, 1 << order); + + list_add(&page->lru, &list); + kernel_map_pages(page, 1 << order, 0); + free_pages_bulk(page_zone(page), 1, &list, order); + } +} + /* * The order of subdivision here is critical for the IO subsystem. 
@@ -671,7 +706,6 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z) /* * Free a 0-order page */ -static void FASTCALL(free_hot_cold_page(struct page *page, int cold)); static void fastcall free_hot_cold_page(struct page *page, int cold) { struct zone *zone = page_zone(page); -- cgit v1.2.3 From bbfbb7cec9dd7266534b2b4b9c8be2fa425bbfc9 Mon Sep 17 00:00:00 2001 From: Nikita Danilov Date: Fri, 6 Jan 2006 00:11:08 -0800 Subject: [PATCH] find_lock_page(): call __lock_page() directly. As find_lock_page() already checks with TestSetPageLocked() that page is locked, there is no need to call lock_page() that will try-lock page again (chances of page being unlocked in between are small). Call __lock_page() directly, this saves one atomic operation. Also, mark truncate-while-slept path as unlikely while we are here. (akpm: ug. But this is actually a common path for normal old read()s against a page which is under readahead I/O so ho-hum.) Signed-off-by: Nikita Danilov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 6e1d08a2b8b..4ef24a39768 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -555,11 +555,12 @@ repeat: page_cache_get(page); if (TestSetPageLocked(page)) { read_unlock_irq(&mapping->tree_lock); - lock_page(page); + __lock_page(page); read_lock_irq(&mapping->tree_lock); /* Has the page been truncated while we slept? */ - if (page->mapping != mapping || page->index != offset) { + if (unlikely(page->mapping != mapping || + page->index != offset)) { unlock_page(page); page_cache_release(page); goto repeat; -- cgit v1.2.3 From 7756b9e4e321c3c83c7aa5b9532d3e7fd7ddeb4a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 6 Jan 2006 00:11:09 -0800 Subject: [PATCH] kill last zone_reclaim() bits Remove the last bits of Martin's ill-fated sys_set_zone_reclaim(). Cc: Martin Hicks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 80 ------------------------------------------------------------- 1 file changed, 80 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 795a050fe47..b2baca7645d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -74,9 +74,6 @@ struct scan_control { int may_writepage; - /* Can pages be swapped as part of reclaim? */ - int may_swap; - /* This context's SWAP_CLUSTER_MAX. If freeing memory for * suspend, we effectively ignore SWAP_CLUSTER_MAX. * In this context, it doesn't matter that we scan the @@ -430,8 +427,6 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) * Try to allocate it some swap space here. */ if (PageAnon(page) && !PageSwapCache(page)) { - if (!sc->may_swap) - goto keep_locked; if (!add_to_swap(page)) goto activate_locked; } @@ -952,7 +947,6 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) sc.gfp_mask = gfp_mask; sc.may_writepage = 0; - sc.may_swap = 1; inc_page_state(allocstall); @@ -1055,7 +1049,6 @@ loop_again: total_reclaimed = 0; sc.gfp_mask = GFP_KERNEL; sc.may_writepage = 0; - sc.may_swap = 1; sc.nr_mapped = read_page_state(nr_mapped); inc_page_state(pageoutrun); @@ -1353,76 +1346,3 @@ static int __init kswapd_init(void) } module_init(kswapd_init) - - -/* - * Try to free up some pages from this zone through reclaim. 
- */ -int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) -{ - struct scan_control sc; - int nr_pages = 1 << order; - int total_reclaimed = 0; - - /* The reclaim may sleep, so don't do it if sleep isn't allowed */ - if (!(gfp_mask & __GFP_WAIT)) - return 0; - if (zone->all_unreclaimable) - return 0; - - sc.gfp_mask = gfp_mask; - sc.may_writepage = 0; - sc.may_swap = 0; - sc.nr_mapped = read_page_state(nr_mapped); - sc.nr_scanned = 0; - sc.nr_reclaimed = 0; - /* scan at the highest priority */ - sc.priority = 0; - disable_swap_token(); - - if (nr_pages > SWAP_CLUSTER_MAX) - sc.swap_cluster_max = nr_pages; - else - sc.swap_cluster_max = SWAP_CLUSTER_MAX; - - /* Don't reclaim the zone if there are other reclaimers active */ - if (atomic_read(&zone->reclaim_in_progress) > 0) - goto out; - - shrink_zone(zone, &sc); - total_reclaimed = sc.nr_reclaimed; - - out: - return total_reclaimed; -} - -asmlinkage long sys_set_zone_reclaim(unsigned int node, unsigned int zone, - unsigned int state) -{ - struct zone *z; - int i; - - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - - if (node >= MAX_NUMNODES || !node_online(node)) - return -EINVAL; - - /* This will break if we ever add more zones */ - if (!(zone & (1<node_zones[i]; - - if (state) - z->reclaim_pages = 1; - else - z->reclaim_pages = 0; - } - - return 0; -} -- cgit v1.2.3 From 9328b8faae922e52073785ed6c1eaa8565648a0e Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 6 Jan 2006 00:11:10 -0800 Subject: [PATCH] mm: dma32 zone statistics Add dma32 to zone statistics. Also attempt to arrange struct page_state a bit better (visually). Signed-off-by: Nick Piggin Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cdad3249cf7..e12154d9c4e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2277,32 +2277,40 @@ static char *vmstat_text[] = { "pgpgout", "pswpin", "pswpout", - "pgalloc_high", + "pgalloc_high", "pgalloc_normal", + "pgalloc_dma32", "pgalloc_dma", + "pgfree", "pgactivate", "pgdeactivate", "pgfault", "pgmajfault", + "pgrefill_high", "pgrefill_normal", + "pgrefill_dma32", "pgrefill_dma", "pgsteal_high", "pgsteal_normal", + "pgsteal_dma32", "pgsteal_dma", + "pgscan_kswapd_high", "pgscan_kswapd_normal", - + "pgscan_kswapd_dma32", "pgscan_kswapd_dma", + "pgscan_direct_high", "pgscan_direct_normal", + "pgscan_direct_dma32", "pgscan_direct_dma", - "pginodesteal", + "pginodesteal", "slabs_scanned", "kswapd_steal", "kswapd_inodesteal", -- cgit v1.2.3 From 224abf92b2f439a9030f21d2926ec8047d1ffcdb Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 6 Jan 2006 00:11:11 -0800 Subject: [PATCH] mm: bad_page optimisation Cut down size slightly by not passing bad_page the function name (it should be able to be determined by dump_stack()). And cut down the number of printks in bad_page. Also, cut down some branching in the destroy_compound_page path. 
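
The destroy_compound_page() part uses the same trick as the earlier micro-optimisation patch: two cheap, side-effect-free tests are folded into one bitwise OR so only a single unlikely branch is emitted. A stand-alone sketch of the idiom, with check_a(), check_b(), report_bad_item() and struct item all invented for illustration:

#define unlikely(x) __builtin_expect(!!(x), 0)  /* as in the kernel */

/* Both tests are cheap and side-effect free, so '|' is safe here and
 * avoids the extra conditional branch that short-circuiting '||'
 * typically generates. */
static void validate(struct item *p)
{
        if (unlikely(!check_a(p) | (check_b(p) != 0)))
                report_bad_item(p);
}
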
Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 44 ++++++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 24 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e12154d9c4e..b9fd2c238f1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -132,16 +132,16 @@ static inline int bad_range(struct zone *zone, struct page *page) } #endif -static void bad_page(const char *function, struct page *page) -{ - printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n", - function, current->comm, page); - printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", - (int)(2*sizeof(unsigned long)), (unsigned long)page->flags, - page->mapping, page_mapcount(page), page_count(page)); - printk(KERN_EMERG "Backtrace:\n"); +static void bad_page(struct page *page) +{ + printk(KERN_EMERG "Bad page state in process '%s'\n" + "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n" + "Trying to fix it up, but a reboot is needed\n" + "Backtrace:\n", + current->comm, page, (int)(2*sizeof(unsigned long)), + (unsigned long)page->flags, page->mapping, + page_mapcount(page), page_count(page)); dump_stack(); - printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"); page->flags &= ~(1 << PG_lru | 1 << PG_private | 1 << PG_locked | @@ -194,19 +194,15 @@ static void destroy_compound_page(struct page *page, unsigned long order) int i; int nr_pages = 1 << order; - if (!PageCompound(page)) - return; - - if (page[1].index != order) - bad_page(__FUNCTION__, page); + if (unlikely(page[1].index != order)) + bad_page(page); for (i = 0; i < nr_pages; i++) { struct page *p = page + i; - if (!PageCompound(p)) - bad_page(__FUNCTION__, page); - if (page_private(p) != (unsigned long)page) - bad_page(__FUNCTION__, page); + if (unlikely(!PageCompound(p) | + (page_private(p) != (unsigned long)page))) + bad_page(page); ClearPageCompound(p); } } @@ -316,7 +312,7 @@ static inline void __free_pages_bulk (struct page *page, unsigned long page_idx; int order_size = 1 << order; - if (unlikely(order)) + if (unlikely(PageCompound(page))) destroy_compound_page(page, order); page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); @@ -348,7 +344,7 @@ static inline void __free_pages_bulk (struct page *page, zone->free_area[order].nr_free++; } -static inline int free_pages_check(const char *function, struct page *page) +static inline int free_pages_check(struct page *page) { if (unlikely(page_mapcount(page) | (page->mapping != NULL) | @@ -363,7 +359,7 @@ static inline int free_pages_check(const char *function, struct page *page) 1 << PG_swapcache | 1 << PG_writeback | 1 << PG_reserved )))) - bad_page(function, page); + bad_page(page); if (PageDirty(page)) __ClearPageDirty(page); /* @@ -422,7 +418,7 @@ void __free_pages_ok(struct page *page, unsigned int order) #endif for (i = 0 ; i < (1 << order) ; ++i) - reserved += free_pages_check(__FUNCTION__, page + i); + reserved += free_pages_check(page + i); if (reserved) return; @@ -517,7 +513,7 @@ static int prep_new_page(struct page *page, int order) 1 << PG_swapcache | 1 << PG_writeback | 1 << PG_reserved )))) - bad_page(__FUNCTION__, page); + bad_page(page); /* * For now, we report if PG_reserved was found set, but do not @@ -716,7 +712,7 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) if (PageAnon(page)) page->mapping = NULL; - if (free_pages_check(__FUNCTION__, page)) + if (free_pages_check(page)) return; inc_page_state(pgfree); -- 
cgit v1.2.3 From 9617d95e6e9ffd883cf90a89724fe60d7ab22f9a Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 6 Jan 2006 00:11:12 -0800 Subject: [PATCH] mm: rmap optimisation Optimise rmap functions by minimising atomic operations when we know there will be no concurrent modifications. Signed-off-by: Nick Piggin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 6 +++--- mm/rmap.c | 49 ++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 41 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index e249088908c..d7ca7de10f4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1498,7 +1498,7 @@ gotten: update_mmu_cache(vma, address, entry); lazy_mmu_prot_update(entry); lru_cache_add_active(new_page); - page_add_anon_rmap(new_page, vma, address); + page_add_new_anon_rmap(new_page, vma, address); /* Free the old page.. */ new_page = old_page; @@ -1978,7 +1978,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, inc_mm_counter(mm, anon_rss); lru_cache_add_active(page); SetPageReferenced(page); - page_add_anon_rmap(page, vma, address); + page_add_new_anon_rmap(page, vma, address); } else { /* Map the ZERO_PAGE - vm_page_prot is readonly */ page = ZERO_PAGE(address); @@ -2109,7 +2109,7 @@ retry: if (anon) { inc_mm_counter(mm, anon_rss); lru_cache_add_active(new_page); - page_add_anon_rmap(new_page, vma, address); + page_add_new_anon_rmap(new_page, vma, address); } else { inc_mm_counter(mm, file_rss); page_add_file_rmap(new_page); diff --git a/mm/rmap.c b/mm/rmap.c index f853c6def15..4107f64ff74 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -434,6 +434,26 @@ int page_referenced(struct page *page, int is_locked) return referenced; } +/** + * page_set_anon_rmap - setup new anonymous rmap + * @page: the page to add the mapping to + * @vma: the vm area in which the mapping is added + * @address: the user virtual address mapped + */ +static void __page_set_anon_rmap(struct page *page, + struct vm_area_struct *vma, unsigned long address) +{ + struct anon_vma *anon_vma = vma->anon_vma; + + BUG_ON(!anon_vma); + anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; + page->mapping = (struct address_space *) anon_vma; + + page->index = linear_page_index(vma, address); + + inc_page_state(nr_mapped); +} + /** * page_add_anon_rmap - add pte mapping to an anonymous page * @page: the page to add the mapping to @@ -445,20 +465,27 @@ int page_referenced(struct page *page, int is_locked) void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) { - if (atomic_inc_and_test(&page->_mapcount)) { - struct anon_vma *anon_vma = vma->anon_vma; - - BUG_ON(!anon_vma); - anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; - page->mapping = (struct address_space *) anon_vma; - - page->index = linear_page_index(vma, address); - - inc_page_state(nr_mapped); - } + if (atomic_inc_and_test(&page->_mapcount)) + __page_set_anon_rmap(page, vma, address); /* else checking page index and mapping is racy */ } +/* + * page_add_new_anon_rmap - add pte mapping to a new anonymous page + * @page: the page to add the mapping to + * @vma: the vm area in which the mapping is added + * @address: the user virtual address mapped + * + * Same as page_add_anon_rmap but must only be called on *new* pages. + * This means the inc-and-test can be bypassed. 
+ */ +void page_add_new_anon_rmap(struct page *page, + struct vm_area_struct *vma, unsigned long address) +{ + atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */ + __page_set_anon_rmap(page, vma, address); +} + /** * page_add_file_rmap - add pte mapping to a file page * @page: the page to add the mapping to -- cgit v1.2.3 From 41e9b63b35b52cf918a4ffdb8d77862ab824aa8b Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 6 Jan 2006 00:11:13 -0800 Subject: [PATCH] mm: pfault optimisation This atomic operation is superfluous: the pte will be added with the referenced bit set, and the page will be referenced through this mapping after the page fault handler returns anyway. Signed-off-by: Nick Piggin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index d7ca7de10f4..7197f9bcd38 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1977,7 +1977,6 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, goto release; inc_mm_counter(mm, anon_rss); lru_cache_add_active(page); - SetPageReferenced(page); page_add_new_anon_rmap(page, vma, address); } else { /* Map the ZERO_PAGE - vm_page_prot is readonly */ -- cgit v1.2.3 From 210fe530305ee50cd889fe9250168228b2994f32 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 6 Jan 2006 00:11:14 -0800 Subject: [PATCH] vmscan: balancing fix Revert a patch which went into 2.6.8-rc1. The changelog for that patch was: The shrink_zone() logic can, under some circumstances, cause far too many pages to be reclaimed. Say, we're scanning at high priority and suddenly hit a large number of reclaimable pages on the LRU. Change things so we bale out when SWAP_CLUSTER_MAX pages have been reclaimed. Problem is, this change caused significant imbalance in inter-zone scan balancing by truncating scans of larger zones. Suppose, for example, ZONE_HIGHMEM is 10x the size of ZONE_NORMAL. The zone balancing algorithm would require that if we're scanning 100 pages of ZONE_HIGHMEM, we should scan 10 pages of ZONE_NORMAL. But this logic will cause the scanning of ZONE_HIGHMEM to bale out after only 32 pages are reclaimed. Thus effectively causing smaller zones to be scanned relatively harder than large ones. Now I need to remember what the workload was which caused me to write this patch originally, then fix it up in a different way... 
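
For reference, a condensed view of the shrink_zone() loop after the revert (pieced together from the hunk below): only the zone's own active/inactive budget terminates it, so a zone ten times larger really does get ten times the scanning.

while (nr_active || nr_inactive) {
        if (nr_active) {
                sc->nr_to_scan = min(nr_active,
                                (unsigned long)sc->swap_cluster_max);
                nr_active -= sc->nr_to_scan;
                refill_inactive_zone(zone, sc);
        }
        if (nr_inactive) {
                sc->nr_to_scan = min(nr_inactive,
                                (unsigned long)sc->swap_cluster_max);
                nr_inactive -= sc->nr_to_scan;
                shrink_cache(zone, sc);
                /* no "reclaimed enough, bale out" check any more */
        }
}
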
Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index b2baca7645d..5c8a412b43f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -63,9 +63,6 @@ struct scan_control { unsigned long nr_mapped; /* From page_state */ - /* How many pages shrink_cache() should reclaim */ - int nr_to_reclaim; - /* Ask shrink_caches, or shrink_zone to scan at this priority */ unsigned int priority; @@ -656,7 +653,6 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) if (current_is_kswapd()) mod_page_state(kswapd_steal, nr_freed); mod_page_state_zone(zone, pgsteal, nr_freed); - sc->nr_to_reclaim -= nr_freed; spin_lock_irq(&zone->lru_lock); /* @@ -856,8 +852,6 @@ shrink_zone(struct zone *zone, struct scan_control *sc) else nr_inactive = 0; - sc->nr_to_reclaim = sc->swap_cluster_max; - while (nr_active || nr_inactive) { if (nr_active) { sc->nr_to_scan = min(nr_active, @@ -871,8 +865,6 @@ shrink_zone(struct zone *zone, struct scan_control *sc) (unsigned long)sc->swap_cluster_max); nr_inactive -= sc->nr_to_scan; shrink_cache(zone, sc); - if (sc->nr_to_reclaim <= 0) - break; } } -- cgit v1.2.3 From 80bfed904c690642db9d4178950735299160950b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 6 Jan 2006 00:11:14 -0800 Subject: [PATCH] consolidate lru_add_drain() and lru_drain_cache() Cc: Christoph Lameter Cc: Rajesh Shah Cc: Li Shaohua Cc: Srivatsa Vaddagiri Cc: Ashok Raj Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index 73d351439ef..ee6d71ccfa5 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -156,16 +156,22 @@ void fastcall lru_cache_add_active(struct page *page) put_cpu_var(lru_add_active_pvecs); } -void lru_add_drain(void) +static void __lru_add_drain(int cpu) { - struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); + struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu); + /* CPU is dead, so no locking needed. */ if (pagevec_count(pvec)) __pagevec_lru_add(pvec); - pvec = &__get_cpu_var(lru_add_active_pvecs); + pvec = &per_cpu(lru_add_active_pvecs, cpu); if (pagevec_count(pvec)) __pagevec_lru_add_active(pvec); - put_cpu_var(lru_add_pvecs); +} + +void lru_add_drain(void) +{ + __lru_add_drain(get_cpu()); + put_cpu(); } /* @@ -412,17 +418,6 @@ void vm_acct_memory(long pages) } #ifdef CONFIG_HOTPLUG_CPU -static void lru_drain_cache(unsigned int cpu) -{ - struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu); - - /* CPU is dead, so no locking needed. */ - if (pagevec_count(pvec)) - __pagevec_lru_add(pvec); - pvec = &per_cpu(lru_add_active_pvecs, cpu); - if (pagevec_count(pvec)) - __pagevec_lru_add_active(pvec); -} /* Drop the CPU's cached committed space back into the central pool. */ static int cpu_swap_callback(struct notifier_block *nfb, @@ -435,7 +430,7 @@ static int cpu_swap_callback(struct notifier_block *nfb, if (action == CPU_DEAD) { atomic_add(*committed, &vm_committed_space); *committed = 0; - lru_drain_cache((long)hcpu); + __lru_add_drain((long)hcpu); } return NOTIFY_OK; } -- cgit v1.2.3 From f3fe65122da05e1cd4c9140340d96ea2f95d0c49 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Fri, 6 Jan 2006 00:11:15 -0800 Subject: [PATCH] mm: add populated_zone() helper There are numerous places we check whether a zone is populated or not. 
Provide a helper function to check for populated zones and convert all checks for zone->present_pages. Signed-off-by: Con Kolivas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 8 ++++---- mm/vmscan.c | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b9fd2c238f1..8f3de5af92d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1358,7 +1358,7 @@ void show_free_areas(void) show_node(zone); printk("%s per-cpu:", zone->name); - if (!zone->present_pages) { + if (!populated_zone(zone)) { printk(" empty\n"); continue; } else @@ -1435,7 +1435,7 @@ void show_free_areas(void) show_node(zone); printk("%s: ", zone->name); - if (!zone->present_pages) { + if (!populated_zone(zone)) { printk("empty\n"); continue; } @@ -2134,7 +2134,7 @@ static int frag_show(struct seq_file *m, void *arg) int order; for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { - if (!zone->present_pages) + if (!populated_zone(zone)) continue; spin_lock_irqsave(&zone->lock, flags); @@ -2167,7 +2167,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg) for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { int i; - if (!zone->present_pages) + if (!populated_zone(zone)) continue; spin_lock_irqsave(&zone->lock, flags); diff --git a/mm/vmscan.c b/mm/vmscan.c index 5c8a412b43f..7681d8ee04f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -897,7 +897,7 @@ shrink_caches(struct zone **zones, struct scan_control *sc) for (i = 0; zones[i] != NULL; i++) { struct zone *zone = zones[i]; - if (zone->present_pages == 0) + if (!populated_zone(zone)) continue; if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) @@ -1069,7 +1069,7 @@ loop_again: for (i = pgdat->nr_zones - 1; i >= 0; i--) { struct zone *zone = pgdat->node_zones + i; - if (zone->present_pages == 0) + if (!populated_zone(zone)) continue; if (zone->all_unreclaimable && @@ -1106,7 +1106,7 @@ scan: struct zone *zone = pgdat->node_zones + i; int nr_slab; - if (zone->present_pages == 0) + if (!populated_zone(zone)) continue; if (zone->all_unreclaimable && priority != DEF_PRIORITY) @@ -1258,7 +1258,7 @@ void wakeup_kswapd(struct zone *zone, int order) { pg_data_t *pgdat; - if (zone->present_pages == 0) + if (!populated_zone(zone)) return; pgdat = zone->zone_pgdat; -- cgit v1.2.3 From 1a93205bdffd9d7278d4a66081cdb48452522a58 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 6 Jan 2006 00:11:16 -0800 Subject: [PATCH] mm: simplify build_zonelists_node by removing the case statement. Simplify build_zonelists_node by removing the case statement. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 34 +++++++++++----------------------- 1 file changed, 11 insertions(+), 23 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8f3de5af92d..7adc9526d32 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1455,35 +1455,23 @@ void show_free_areas(void) /* * Builds allocation fallback zone lists. + * + * Add all populated zones of a node to the zonelist. 
*/ -static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) -{ - switch (k) { - struct zone *zone; - default: - BUG(); - case ZONE_HIGHMEM: - zone = pgdat->node_zones + ZONE_HIGHMEM; - if (zone->present_pages) { +static int __init build_zonelists_node(pg_data_t *pgdat, + struct zonelist *zonelist, int j, int k) +{ + struct zone *zone; + + BUG_ON(k > ZONE_HIGHMEM); + for (zone = pgdat->node_zones + k; zone >= pgdat->node_zones; zone--) { + if (populated_zone(zone)) { #ifndef CONFIG_HIGHMEM - BUG(); + BUG_ON(zone - pgdat->node_zones > ZONE_NORMAL); #endif zonelist->zones[j++] = zone; } - case ZONE_NORMAL: - zone = pgdat->node_zones + ZONE_NORMAL; - if (zone->present_pages) - zonelist->zones[j++] = zone; - case ZONE_DMA32: - zone = pgdat->node_zones + ZONE_DMA32; - if (zone->present_pages) - zonelist->zones[j++] = zone; - case ZONE_DMA: - zone = pgdat->node_zones + ZONE_DMA; - if (zone->present_pages) - zonelist->zones[j++] = zone; } - return j; } -- cgit v1.2.3 From 4be38e351c5f455f6f490f5aff29053e33ab4f99 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 6 Jan 2006 00:11:17 -0800 Subject: [PATCH] mm: move determination of policy_zone into page allocator Currently the function to build a zonelist for a BIND policy has the side effect to set the policy_zone. This seems to be a bit strange. policy zone seems to not be initialized elsewhere and therefore 0. Do we police ZONE_DMA if no bind policy has been used yet? This patch moves the determination of the zone to apply policies to into the page allocator. We determine the zone while building the zonelist for nodes. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 15 +++------------ mm/page_alloc.c | 2 ++ 2 files changed, 5 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 96714e2646a..0f1d2b8a952 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -93,7 +93,7 @@ static kmem_cache_t *sn_cache; /* Highest zone. An specific allocation for a zone below that is not policied. */ -static int policy_zone; +int policy_zone = ZONE_DMA; struct mempolicy default_policy = { .refcnt = ATOMIC_INIT(1), /* never free it */ @@ -131,17 +131,8 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes) if (!zl) return NULL; num = 0; - for_each_node_mask(nd, *nodes) { - int k; - for (k = MAX_NR_ZONES-1; k >= 0; k--) { - struct zone *z = &NODE_DATA(nd)->node_zones[k]; - if (!z->present_pages) - continue; - zl->zones[num++] = z; - if (k > policy_zone) - policy_zone = k; - } - } + for_each_node_mask(nd, *nodes) + zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone]; zl->zones[num] = NULL; return zl; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7adc9526d32..512e3f4d496 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include "internal.h" @@ -1470,6 +1471,7 @@ static int __init build_zonelists_node(pg_data_t *pgdat, BUG_ON(zone - pgdat->node_zones > ZONE_NORMAL); #endif zonelist->zones[j++] = zone; + check_highest_zone(k); } } return j; -- cgit v1.2.3 From 02a68a5ebc7dd823da7496116f42290103e1e4a9 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 6 Jan 2006 00:11:18 -0800 Subject: [PATCH] Fix zone policy determination The use k in the inner loop means that the highest zone nr is always used if any zone of a node is populated. 
This means that the policy zone is not correctly determined on arches that do no use HIGHMEM like ia64. Change the loop to decrement k which also simplifies the BUG_ON. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 512e3f4d496..ca978992c89 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1465,15 +1465,19 @@ static int __init build_zonelists_node(pg_data_t *pgdat, struct zone *zone; BUG_ON(k > ZONE_HIGHMEM); - for (zone = pgdat->node_zones + k; zone >= pgdat->node_zones; zone--) { + + do { + zone = pgdat->node_zones + k; if (populated_zone(zone)) { #ifndef CONFIG_HIGHMEM - BUG_ON(zone - pgdat->node_zones > ZONE_NORMAL); + BUG_ON(k > ZONE_NORMAL); #endif zonelist->zones[j++] = zone; check_highest_zone(k); } - } + k--; + + } while (k >= 0); return j; } -- cgit v1.2.3 From 070f80326a215d8e6c4fd6f175e28eb446c492bc Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 6 Jan 2006 00:11:19 -0800 Subject: [PATCH] build_zonelists_node(): rename args Give j and r meaningful names. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ca978992c89..7f580779abd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1460,25 +1460,25 @@ void show_free_areas(void) * Add all populated zones of a node to the zonelist. */ static int __init build_zonelists_node(pg_data_t *pgdat, - struct zonelist *zonelist, int j, int k) + struct zonelist *zonelist, int nr_zones, int zone_type) { struct zone *zone; - BUG_ON(k > ZONE_HIGHMEM); + BUG_ON(zone_type > ZONE_HIGHMEM); do { - zone = pgdat->node_zones + k; + zone = pgdat->node_zones + zone_type; if (populated_zone(zone)) { #ifndef CONFIG_HIGHMEM - BUG_ON(k > ZONE_NORMAL); + BUG_ON(zone_type > ZONE_NORMAL); #endif - zonelist->zones[j++] = zone; - check_highest_zone(k); + zonelist->zones[nr_zones++] = zone; + check_highest_zone(zone_type); } - k--; + zone_type--; - } while (k >= 0); - return j; + } while (zone_type >= 0); + return nr_zones; } static inline int highest_zone(int zone_bits) -- cgit v1.2.3 From a74609fafa2e5cc31d558012abaaa55ec9ad9da4 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 6 Jan 2006 00:11:20 -0800 Subject: [PATCH] mm: page_state opt Optimise page_state manipulations by introducing interrupt unsafe accessors to page_state fields. Callers must provide their own locking (either disable interrupts or not update from interrupt context). Switch over the hot callsites that can easily be moved under interrupts off sections. 
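
The calling convention the double-underscore accessors assume, pulled out of the vmscan.c hunk below for clarity: the caller turns interrupts off once, applies the whole batch of counter updates, and only then takes whatever lock it needed anyway.

nr_freed = shrink_list(&page_list, sc);

local_irq_disable();                    /* one irq-off section for the batch */
if (current_is_kswapd()) {
        __mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
        __mod_page_state(kswapd_steal, nr_freed);
} else
        __mod_page_state_zone(zone, pgscan_direct, nr_scan);
__mod_page_state_zone(zone, pgsteal, nr_freed);

spin_lock(&zone->lru_lock);             /* irqs stay off across the lock */
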
Signed-off-by: Nick Piggin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 89 ++++++++++++++++++++++++++++++++------------------------- mm/rmap.c | 10 +++++-- mm/vmscan.c | 27 +++++++++-------- 3 files changed, 72 insertions(+), 54 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7f580779abd..fd47494cb98 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -424,9 +424,9 @@ void __free_pages_ok(struct page *page, unsigned int order) return; list_add(&page->lru, &list); - mod_page_state(pgfree, 1 << order); kernel_map_pages(page, 1<zone_pgdat; pg_data_t *orig = zonelist->zones[0]->zone_pgdat; struct per_cpu_pageset *p; - local_irq_save(flags); - cpu = smp_processor_id(); - p = zone_pcp(z,cpu); + p = zone_pcp(z, cpu); if (pg == orig) { p->numa_hit++; } else { @@ -696,7 +692,6 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z) p->local_node++; else p->other_node++; - local_irq_restore(flags); #endif } @@ -716,11 +711,11 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) if (free_pages_check(page)) return; - inc_page_state(pgfree); kernel_map_pages(page, 1, 0); pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; local_irq_save(flags); + __inc_page_state(pgfree); list_add(&page->lru, &pcp->list); pcp->count++; if (pcp->count >= pcp->high) @@ -753,49 +748,58 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) * we cheat by calling it from here, in the order > 0 path. Saves a branch * or two. */ -static struct page * -buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags) +static struct page *buffered_rmqueue(struct zonelist *zonelist, + struct zone *zone, int order, gfp_t gfp_flags) { unsigned long flags; struct page *page; int cold = !!(gfp_flags & __GFP_COLD); + int cpu; again: + cpu = get_cpu(); if (order == 0) { struct per_cpu_pages *pcp; - page = NULL; - pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; + pcp = &zone_pcp(zone, cpu)->pcp[cold]; local_irq_save(flags); - if (!pcp->count) + if (!pcp->count) { pcp->count += rmqueue_bulk(zone, 0, pcp->batch, &pcp->list); - if (likely(pcp->count)) { - page = list_entry(pcp->list.next, struct page, lru); - list_del(&page->lru); - pcp->count--; + if (unlikely(!pcp->count)) + goto failed; } - local_irq_restore(flags); - put_cpu(); + page = list_entry(pcp->list.next, struct page, lru); + list_del(&page->lru); + pcp->count--; } else { spin_lock_irqsave(&zone->lock, flags); page = __rmqueue(zone, order); - spin_unlock_irqrestore(&zone->lock, flags); + spin_unlock(&zone->lock); + if (!page) + goto failed; } - if (page != NULL) { - BUG_ON(bad_range(zone, page)); - mod_page_state_zone(zone, pgalloc, 1 << order); - if (prep_new_page(page, order)) - goto again; + __mod_page_state_zone(zone, pgalloc, 1 << order); + zone_statistics(zonelist, zone, cpu); + local_irq_restore(flags); + put_cpu(); - if (gfp_flags & __GFP_ZERO) - prep_zero_page(page, order, gfp_flags); + BUG_ON(bad_range(zone, page)); + if (prep_new_page(page, order)) + goto again; - if (order && (gfp_flags & __GFP_COMP)) - prep_compound_page(page, order); - } + if (gfp_flags & __GFP_ZERO) + prep_zero_page(page, order, gfp_flags); + + if (order && (gfp_flags & __GFP_COMP)) + prep_compound_page(page, order); return page; + +failed: + local_irq_restore(flags); + put_cpu(); + return NULL; } #define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ @@ -871,9 +875,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, continue; 
} - page = buffered_rmqueue(*z, order, gfp_mask); + page = buffered_rmqueue(zonelist, *z, order, gfp_mask); if (page) { - zone_statistics(zonelist, *z); break; } } while (*(++z) != NULL); @@ -1248,7 +1251,7 @@ void get_full_page_state(struct page_state *ret) __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask); } -unsigned long __read_page_state(unsigned long offset) +unsigned long read_page_state_offset(unsigned long offset) { unsigned long ret = 0; int cpu; @@ -1262,18 +1265,26 @@ unsigned long __read_page_state(unsigned long offset) return ret; } -void __mod_page_state(unsigned long offset, unsigned long delta) +void __mod_page_state_offset(unsigned long offset, unsigned long delta) +{ + void *ptr; + + ptr = &__get_cpu_var(page_states); + *(unsigned long *)(ptr + offset) += delta; +} +EXPORT_SYMBOL(__mod_page_state_offset); + +void mod_page_state_offset(unsigned long offset, unsigned long delta) { unsigned long flags; - void* ptr; + void *ptr; local_irq_save(flags); ptr = &__get_cpu_var(page_states); - *(unsigned long*)(ptr + offset) += delta; + *(unsigned long *)(ptr + offset) += delta; local_irq_restore(flags); } - -EXPORT_SYMBOL(__mod_page_state); +EXPORT_SYMBOL(mod_page_state_offset); void __get_zone_counts(unsigned long *active, unsigned long *inactive, unsigned long *free, struct pglist_data *pgdat) diff --git a/mm/rmap.c b/mm/rmap.c index 4107f64ff74..6f3f7db2712 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -451,7 +451,11 @@ static void __page_set_anon_rmap(struct page *page, page->index = linear_page_index(vma, address); - inc_page_state(nr_mapped); + /* + * nr_mapped state can be updated without turning off + * interrupts because it is not modified via interrupt. + */ + __inc_page_state(nr_mapped); } /** @@ -498,7 +502,7 @@ void page_add_file_rmap(struct page *page) BUG_ON(!pfn_valid(page_to_pfn(page))); if (atomic_inc_and_test(&page->_mapcount)) - inc_page_state(nr_mapped); + __inc_page_state(nr_mapped); } /** @@ -522,7 +526,7 @@ void page_remove_rmap(struct page *page) */ if (page_test_and_clear_dirty(page)) set_page_dirty(page); - dec_page_state(nr_mapped); + __dec_page_state(nr_mapped); } } diff --git a/mm/vmscan.c b/mm/vmscan.c index 7681d8ee04f..be8235fb193 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -645,16 +645,17 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) goto done; max_scan -= nr_scan; - if (current_is_kswapd()) - mod_page_state_zone(zone, pgscan_kswapd, nr_scan); - else - mod_page_state_zone(zone, pgscan_direct, nr_scan); nr_freed = shrink_list(&page_list, sc); - if (current_is_kswapd()) - mod_page_state(kswapd_steal, nr_freed); - mod_page_state_zone(zone, pgsteal, nr_freed); - spin_lock_irq(&zone->lru_lock); + local_irq_disable(); + if (current_is_kswapd()) { + __mod_page_state_zone(zone, pgscan_kswapd, nr_scan); + __mod_page_state(kswapd_steal, nr_freed); + } else + __mod_page_state_zone(zone, pgscan_direct, nr_scan); + __mod_page_state_zone(zone, pgsteal, nr_freed); + + spin_lock(&zone->lru_lock); /* * Put back any unfreeable pages. 
*/ @@ -816,11 +817,13 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) } } zone->nr_active += pgmoved; - spin_unlock_irq(&zone->lru_lock); - pagevec_release(&pvec); + spin_unlock(&zone->lru_lock); + + __mod_page_state_zone(zone, pgrefill, pgscanned); + __mod_page_state(pgdeactivate, pgdeactivate); + local_irq_enable(); - mod_page_state_zone(zone, pgrefill, pgscanned); - mod_page_state(pgdeactivate, pgdeactivate); + pagevec_release(&pvec); } /* -- cgit v1.2.3 From b0e15190ead07056ab0c3844a499ff35e66d27cc Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 6 Jan 2006 00:11:42 -0800 Subject: [PATCH] NOMMU: Make SYSV IPC SHM use ramfs facilities on NOMMU The attached patch makes the SYSV IPC shared memory facilities use the new ramfs facilities on a no-MMU kernel. The following changes are made: (1) There are now shmem_mmap() and shmem_get_unmapped_area() functions to allow the IPC SHM facilities to commune with the tiny-shmem and shmem code. (2) ramfs files now need resizing using do_truncate() rather than by modifying the inode size directly (see shmem_file_setup()). This causes ramfs to attempt to bind a block of pages of sufficient size to the inode. (3) CONFIG_SYSVIPC is no longer contingent on CONFIG_MMU. Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nommu.c | 7 +++++++ mm/shmem.c | 2 +- mm/tiny-shmem.c | 29 ++++++++++++++++++++++++++++- 3 files changed, 36 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/nommu.c b/mm/nommu.c index c1196812876..c10262d6823 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1177,3 +1177,10 @@ int in_gate_area_no_task(unsigned long addr) { return 0; } + +struct page *filemap_nopage(struct vm_area_struct *area, + unsigned long address, int *type) +{ + BUG(); + return NULL; +} diff --git a/mm/shmem.c b/mm/shmem.c index 65c148efa2e..a1f2f02af72 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1270,7 +1270,7 @@ out_nomem: return retval; } -static int shmem_mmap(struct file *file, struct vm_area_struct *vma) +int shmem_mmap(struct file *file, struct vm_area_struct *vma) { file_accessed(file); vma->vm_ops = &shmem_vm_ops; diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c index b58abcf44ed..cdc6d431972 100644 --- a/mm/tiny-shmem.c +++ b/mm/tiny-shmem.c @@ -81,13 +81,19 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) goto close_file; d_instantiate(dentry, inode); - inode->i_size = size; inode->i_nlink = 0; /* It is unlinked */ + file->f_vfsmnt = mntget(shm_mnt); file->f_dentry = dentry; file->f_mapping = inode->i_mapping; file->f_op = &ramfs_file_operations; file->f_mode = FMODE_WRITE | FMODE_READ; + + /* notify everyone as to the change of file size */ + error = do_truncate(dentry, size, file); + if (error < 0) + goto close_file; + return file; close_file: @@ -123,3 +129,24 @@ int shmem_unuse(swp_entry_t entry, struct page *page) { return 0; } + +int shmem_mmap(struct file *file, struct vm_area_struct *vma) +{ + file_accessed(file); +#ifndef CONFIG_MMU + return ramfs_nommu_mmap(file, vma); +#else + return 0; +#endif +} + +#ifndef CONFIG_MMU +unsigned long shmem_get_unmapped_area(struct file *file, + unsigned long addr, + unsigned long len, + unsigned long pgoff, + unsigned long flags) +{ + return ramfs_nommu_get_unmapped_area(file, addr, len, pgoff, flags); +} +#endif -- cgit v1.2.3 From c898ec16e83331abde39118e22e9e38335bbb950 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Fri, 6 Jan 2006 00:12:07 -0800 Subject: [PATCH] allow flatmem to be disabled when 
only sparsemem is implemented On architectures that implement sparsemem but not discontigmem we want to be able to hide the flatmem option in some cases. On ppc64 for example, when we select NUMA we must not select flatmem. Signed-off-by: Anton Blanchard Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 21eb51d4da8..b3db11f137e 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -11,7 +11,7 @@ choice config FLATMEM_MANUAL bool "Flat Memory" - depends on !ARCH_DISCONTIGMEM_ENABLE || ARCH_FLATMEM_ENABLE + depends on !(ARCH_DISCONTIGMEM_ENABLE || ARCH_SPARSEMEM_ENABLE) || ARCH_FLATMEM_ENABLE help This option allows you to change some of the ways that Linux manages its memory internally. Most users will -- cgit v1.2.3 From 3a291a20bd6fcfafb2109031f0760a0d3e92ecd7 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 6 Jan 2006 00:16:37 -0800 Subject: [PATCH] mm: add a new function (needed for swap suspend) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds the function get_swap_page_of_type() allowing us to specify an index in swap_info[] and select a swap_info_struct structure to be used for allocating a swap page. This function (or another one of similar functionality) will be necessary for implementing the image-writing part of swsusp in the user space.  It can also be used for simplifying the current in-kernel implementation of the image-writing part of swsusp. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index edafeace301..6da4b28b896 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -211,6 +211,26 @@ noswap: return (swp_entry_t) {0}; } +swp_entry_t get_swap_page_of_type(int type) +{ + struct swap_info_struct *si; + pgoff_t offset; + + spin_lock(&swap_lock); + si = swap_info + type; + if (si->flags & SWP_WRITEOK) { + nr_swap_pages--; + offset = scan_swap_map(si); + if (offset) { + spin_unlock(&swap_lock); + return swp_entry(type, offset); + } + nr_swap_pages++; + } + spin_unlock(&swap_lock); + return (swp_entry_t) {0}; +} + static struct swap_info_struct * swap_info_get(swp_entry_t entry) { struct swap_info_struct * p; -- cgit v1.2.3 From 22905f775dd6a8b73be99826dcad07ceec00244b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 16 Nov 2005 15:07:01 -0800 Subject: identify multipage ->writepages() calls NFS needs to be able to distinguish between single-page ->writepage() calls and multipage ->writepages() calls. For the single-page writepage calls NFS can kick off the I/O within the context of ->writepage(). For multipage ->writepages calls, nfs_writepage() will leave the I/O pending and nfs_writepages() will kick off the I/O when it all has been queued up within NFS. 
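
A hedged sketch of the filesystem side (function names invented; the matching NFS change is not part of this mm/ log): ->writepage() can now tell whether it was invoked on its own or from inside ->writepages(), and defer kicking off the I/O in the latter case.

static int myfs_writepage(struct page *page, struct writeback_control *wbc)
{
        int err;

        err = myfs_queue_page(page);            /* hypothetical: queue the write */
        if (err)
                return err;

        if (!wbc->for_writepages)
                myfs_flush_queued(page->mapping); /* single page: kick I/O now */
        /* for_writepages == 1: ->writepages() flushes once after the loop */
        return 0;
}
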
Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Trond Myklebust --- mm/page-writeback.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 0166ea15c9e..5240e426c1f 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -550,11 +550,17 @@ void __init page_writeback_init(void) int do_writepages(struct address_space *mapping, struct writeback_control *wbc) { + int ret; + if (wbc->nr_to_write <= 0) return 0; + wbc->for_writepages = 1; if (mapping->a_ops->writepages) - return mapping->a_ops->writepages(mapping, wbc); - return generic_writepages(mapping, wbc); + ret = mapping->a_ops->writepages(mapping, wbc); + else + ret = generic_writepages(mapping, wbc); + wbc->for_writepages = 0; + return ret; } /** -- cgit v1.2.3 From 67207b9664a8d603138ef1556141e6d0a102bea7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 15 Nov 2005 15:53:48 -0500 Subject: [PATCH] spufs: The SPU file system, base This is the current version of the spu file system, used for driving SPEs on the Cell Broadband Engine. This release is almost identical to the version for the 2.6.14 kernel posted earlier, which is available as part of the Cell BE Linux distribution from http://www.bsc.es/projects/deepcomputing/linuxoncell/. The first patch provides all the interfaces for running spu application, but does not have any support for debugging SPU tasks or for scheduling. Both these functionalities are added in the subsequent patches. See Documentation/filesystems/spufs.txt on how to use spufs. Signed-off-by: Arnd Bergmann Signed-off-by: Paul Mackerras --- mm/memory.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 7197f9bcd38..3944fec3801 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2267,6 +2267,8 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, return handle_pte_fault(mm, vma, address, pte, pmd, write_access); } +EXPORT_SYMBOL_GPL(__handle_mm_fault); + #ifndef __PAGETABLE_PUD_FOLDED /* * Allocate page upper directory. -- cgit v1.2.3 From 84c2008af01132c4ca257ed9b595693c611df15d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 8 Jan 2006 01:00:28 -0800 Subject: [PATCH] revert "mm: page_state fixes" Hugh says: page_alloc_cpu_notify() specifically contains code to /* Add dead cpu's page_states to our own. */ which handles this more efficiently. Cc: Hugh Dickins Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fd47494cb98..0b98f428b07 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1204,6 +1204,7 @@ static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) int cpu = 0; memset(ret, 0, sizeof(*ret)); + cpus_and(*cpumask, *cpumask, cpu_online_map); cpu = first_cpu(*cpumask); while (cpu < NR_CPUS) { @@ -1256,7 +1257,7 @@ unsigned long read_page_state_offset(unsigned long offset) unsigned long ret = 0; int cpu; - for_each_cpu(cpu) { + for_each_online_cpu(cpu) { unsigned long in; in = (unsigned long)&per_cpu(page_states, cpu) + offset; -- cgit v1.2.3 From f9f7500521b25dbf1aba476b81230489ad8e2c4b Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Sun, 8 Jan 2006 01:00:33 -0800 Subject: [PATCH] slab: remove unused align parameter from alloc_percpu __alloc_percpu and alloc_percpu both take an 'align' argument which is completely ignored. 
snmp6_mib_init() in net/ipv6/af_inet6.c attempts to use it, but it will be ignored. Therefore, remove the 'align' argument and fixup the lone caller. Signed-off-by: Matthew Dobson Acked-by: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index e5ec26e0c46..eb70fddf205 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2944,9 +2944,8 @@ EXPORT_SYMBOL(__kmalloc); * Objects should be dereferenced using the per_cpu_ptr macro only. * * @size: how many bytes of memory are required. - * @align: the alignment, which can't be greater than SMP_CACHE_BYTES. */ -void *__alloc_percpu(size_t size, size_t align) +void *__alloc_percpu(size_t size) { int i; struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL); -- cgit v1.2.3 From 85289f98ddc13f6cea82c59d6ff78f9d205dfccc Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Sun, 8 Jan 2006 01:00:36 -0800 Subject: [PATCH] slab: extract slabinfo header printing to separate function This patch extracts slabinfo header printing to a separate function print_slabinfo_header() to make s_start() more readable. Signed-off-by: Matthew Dobson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 45 +++++++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index eb70fddf205..3d3b5a46854 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3364,32 +3364,37 @@ next: #ifdef CONFIG_PROC_FS -static void *s_start(struct seq_file *m, loff_t *pos) +static void print_slabinfo_header(struct seq_file *m) { - loff_t n = *pos; - struct list_head *p; - - down(&cache_chain_sem); - if (!n) { - /* - * Output format version, so at least we can change it - * without _too_ many complaints. - */ + /* + * Output format version, so at least we can change it + * without _too_ many complaints. + */ #if STATS - seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); + seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); #else - seq_puts(m, "slabinfo - version: 2.1\n"); + seq_puts(m, "slabinfo - version: 2.1\n"); #endif - seq_puts(m, "# name "); - seq_puts(m, " : tunables "); - seq_puts(m, " : slabdata "); + seq_puts(m, "# name " + " "); + seq_puts(m, " : tunables "); + seq_puts(m, " : slabdata "); #if STATS - seq_puts(m, " : globalstat " - " "); - seq_puts(m, " : cpustat "); + seq_puts(m, " : globalstat " + " "); + seq_puts(m, " : cpustat "); #endif - seq_putc(m, '\n'); - } + seq_putc(m, '\n'); +} + +static void *s_start(struct seq_file *m, loff_t *pos) +{ + loff_t n = *pos; + struct list_head *p; + + down(&cache_chain_sem); + if (!n) + print_slabinfo_header(m); p = cache_chain.next; while (n--) { p = p->next; -- cgit v1.2.3 From 4d268eba1187ef66844a6a33b9431e5d0dadd4ad Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Sun, 8 Jan 2006 01:00:36 -0800 Subject: [PATCH] slab: extract slab order calculation to separate function This patch moves the ugly loop that determines the 'optimal' size (page order) of cache slabs from kmem_cache_create() to a separate function and cleans it up a bit. Thanks to Matthew Wilcox for the help with this patch. 
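
A deliberately simplified, self-contained model of the search (all SIM_* constants and pick_slab_order() are invented, and the slab-management overhead that the real cache_estimate() accounts for is ignored): grow the order until at least one object fits, then stop as soon as the order reaches the "break" threshold or internal fragmentation falls to an eighth of the slab.

#define SIM_PAGE_SIZE   4096UL
#define SIM_MAX_ORDER   10U
#define SIM_BREAK_ORDER 1U      /* prefer order-0/1 slabs */

static unsigned int pick_slab_order(unsigned long obj_size)
{
        unsigned int order;

        for (order = 0; order <= SIM_MAX_ORDER; order++) {
                unsigned long slab_bytes = SIM_PAGE_SIZE << order;
                unsigned long num  = slab_bytes / obj_size;
                unsigned long left = slab_bytes - num * obj_size;

                if (num == 0)
                        continue;       /* object does not fit yet */
                if (order >= SIM_BREAK_ORDER)
                        break;          /* high orders stress the page allocator */
                if (left * 8 <= slab_bytes)
                        break;          /* fragmentation acceptable */
        }
        /* e.g. returns 0 for a 100-byte object; a real implementation
         * would also report failure when nothing ever fits */
        return order;
}
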
Signed-off-by: Matthew Dobson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 89 +++++++++++++++++++++++++++++++++++---------------------------- 1 file changed, 49 insertions(+), 40 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 3d3b5a46854..2551b1eeadb 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1473,6 +1473,53 @@ static inline void set_up_list3s(kmem_cache_t *cachep, int index) } } +/** + * calculate_slab_order - calculate size (page order) of slabs and the number + * of objects per slab. + * + * This could be made much more intelligent. For now, try to avoid using + * high order pages for slabs. When the gfp() functions are more friendly + * towards high-order requests, this should be changed. + */ +static inline size_t calculate_slab_order(kmem_cache_t *cachep, size_t size, + size_t align, gfp_t flags) +{ + size_t left_over = 0; + + for ( ; ; cachep->gfporder++) { + unsigned int num; + size_t remainder; + + if (cachep->gfporder > MAX_GFP_ORDER) { + cachep->num = 0; + break; + } + + cache_estimate(cachep->gfporder, size, align, flags, + &remainder, &num); + if (!num) + continue; + /* More than offslab_limit objects will cause problems */ + if (flags & CFLGS_OFF_SLAB && cachep->num > offslab_limit) + break; + + cachep->num = num; + left_over = remainder; + + /* + * Large number of objects is good, but very large slabs are + * currently bad for the gfp()s. + */ + if (cachep->gfporder >= slab_break_gfp_order) + break; + + if ((left_over * 8) <= (PAGE_SIZE << cachep->gfporder)) + /* Acceptable internal fragmentation */ + break; + } + return left_over; +} + /** * kmem_cache_create - Create a cache. * @name: A string which is used in /proc/slabinfo to identify this cache. @@ -1682,46 +1729,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, cachep->gfporder = 0; cache_estimate(cachep->gfporder, size, align, flags, &left_over, &cachep->num); - } else { - /* - * Calculate size (in pages) of slabs, and the num of objs per - * slab. This could be made much more intelligent. For now, - * try to avoid using high page-orders for slabs. When the - * gfp() funcs are more friendly towards high-order requests, - * this should be changed. - */ - do { - unsigned int break_flag = 0; -cal_wastage: - cache_estimate(cachep->gfporder, size, align, flags, - &left_over, &cachep->num); - if (break_flag) - break; - if (cachep->gfporder >= MAX_GFP_ORDER) - break; - if (!cachep->num) - goto next; - if (flags & CFLGS_OFF_SLAB && - cachep->num > offslab_limit) { - /* This num of objs will cause problems. */ - cachep->gfporder--; - break_flag++; - goto cal_wastage; - } - - /* - * Large num of objs is good, but v. large slabs are - * currently bad for the gfp()s. - */ - if (cachep->gfporder >= slab_break_gfp_order) - break; - - if ((left_over*8) <= (PAGE_SIZE<gfporder)) - break; /* Acceptable internal fragmentation. */ -next: - cachep->gfporder++; - } while (1); - } + } else + left_over = calculate_slab_order(cachep, size, align, flags); if (!cachep->num) { printk("kmem_cache_create: couldn't create cache %s.\n", name); -- cgit v1.2.3 From b28a02de8c70d41d6b6ba8911e83ed3ccf2e13f8 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Sun, 8 Jan 2006 01:00:37 -0800 Subject: [PATCH] slab: fix code formatting The slab allocator code is inconsistent in coding style and messy. For this patch, I ran Lindent for mm/slab.c and fixed up goofs by hand. 
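[Editor's illustration, not part of the patch: the hunks below are almost entirely whitespace churn of this kind. The identifiers here are made up, but the pattern -- '*' binding to the name, single spaces around binary operators, tab indentation -- is the style Lindent enforces.]

#define DEMO_ENTRIES 4

struct demo_cache {
        /* before: void * entries [DEMO_ENTRIES];  unsigned int order ; */
        void *entries[DEMO_ENTRIES];
        unsigned int order;
};

static inline unsigned long demo_slab_bytes(const struct demo_cache *c)
{
        /* before: return 4096UL<<c->order; */
        return 4096UL << c->order;
}

[End of editor's illustration.]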
Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 964 ++++++++++++++++++++++++++++++++------------------------------ 1 file changed, 500 insertions(+), 464 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 2551b1eeadb..f71d8be2f4e 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -130,7 +130,6 @@ #define FORCED_DEBUG 0 #endif - /* Shouldn't this be in a header file somewhere? */ #define BYTES_PER_WORD sizeof(void *) @@ -217,12 +216,12 @@ static unsigned long offslab_limit; * Slabs are chained into three list: fully used, partial, fully free slabs. */ struct slab { - struct list_head list; - unsigned long colouroff; - void *s_mem; /* including colour offset */ - unsigned int inuse; /* num of objs active in slab */ - kmem_bufctl_t free; - unsigned short nodeid; + struct list_head list; + unsigned long colouroff; + void *s_mem; /* including colour offset */ + unsigned int inuse; /* num of objs active in slab */ + kmem_bufctl_t free; + unsigned short nodeid; }; /* @@ -242,9 +241,9 @@ struct slab { * We assume struct slab_rcu can overlay struct slab when destroying. */ struct slab_rcu { - struct rcu_head head; - kmem_cache_t *cachep; - void *addr; + struct rcu_head head; + kmem_cache_t *cachep; + void *addr; }; /* @@ -279,23 +278,23 @@ struct array_cache { #define BOOT_CPUCACHE_ENTRIES 1 struct arraycache_init { struct array_cache cache; - void * entries[BOOT_CPUCACHE_ENTRIES]; + void *entries[BOOT_CPUCACHE_ENTRIES]; }; /* * The slab lists for all objects. */ struct kmem_list3 { - struct list_head slabs_partial; /* partial list first, better asm code */ - struct list_head slabs_full; - struct list_head slabs_free; - unsigned long free_objects; - unsigned long next_reap; - int free_touched; - unsigned int free_limit; - spinlock_t list_lock; - struct array_cache *shared; /* shared per node */ - struct array_cache **alien; /* on other nodes */ + struct list_head slabs_partial; /* partial list first, better asm code */ + struct list_head slabs_full; + struct list_head slabs_free; + unsigned long free_objects; + unsigned long next_reap; + int free_touched; + unsigned int free_limit; + spinlock_t list_lock; + struct array_cache *shared; /* shared per node */ + struct array_cache **alien; /* on other nodes */ }; /* @@ -367,63 +366,63 @@ static inline void kmem_list3_init(struct kmem_list3 *parent) * * manages a cache. */ - + struct kmem_cache { /* 1) per-cpu data, touched during every alloc/free */ - struct array_cache *array[NR_CPUS]; - unsigned int batchcount; - unsigned int limit; - unsigned int shared; - unsigned int objsize; + struct array_cache *array[NR_CPUS]; + unsigned int batchcount; + unsigned int limit; + unsigned int shared; + unsigned int objsize; /* 2) touched by every alloc & free from the backend */ - struct kmem_list3 *nodelists[MAX_NUMNODES]; - unsigned int flags; /* constant flags */ - unsigned int num; /* # of objs per slab */ - spinlock_t spinlock; + struct kmem_list3 *nodelists[MAX_NUMNODES]; + unsigned int flags; /* constant flags */ + unsigned int num; /* # of objs per slab */ + spinlock_t spinlock; /* 3) cache_grow/shrink */ /* order of pgs per slab (2^n) */ - unsigned int gfporder; + unsigned int gfporder; /* force GFP flags, e.g. 
GFP_DMA */ - gfp_t gfpflags; + gfp_t gfpflags; - size_t colour; /* cache colouring range */ - unsigned int colour_off; /* colour offset */ - unsigned int colour_next; /* cache colouring */ - kmem_cache_t *slabp_cache; - unsigned int slab_size; - unsigned int dflags; /* dynamic flags */ + size_t colour; /* cache colouring range */ + unsigned int colour_off; /* colour offset */ + unsigned int colour_next; /* cache colouring */ + kmem_cache_t *slabp_cache; + unsigned int slab_size; + unsigned int dflags; /* dynamic flags */ /* constructor func */ - void (*ctor)(void *, kmem_cache_t *, unsigned long); + void (*ctor) (void *, kmem_cache_t *, unsigned long); /* de-constructor func */ - void (*dtor)(void *, kmem_cache_t *, unsigned long); + void (*dtor) (void *, kmem_cache_t *, unsigned long); /* 4) cache creation/removal */ - const char *name; - struct list_head next; + const char *name; + struct list_head next; /* 5) statistics */ #if STATS - unsigned long num_active; - unsigned long num_allocations; - unsigned long high_mark; - unsigned long grown; - unsigned long reaped; - unsigned long errors; - unsigned long max_freeable; - unsigned long node_allocs; - unsigned long node_frees; - atomic_t allochit; - atomic_t allocmiss; - atomic_t freehit; - atomic_t freemiss; + unsigned long num_active; + unsigned long num_allocations; + unsigned long high_mark; + unsigned long grown; + unsigned long reaped; + unsigned long errors; + unsigned long max_freeable; + unsigned long node_allocs; + unsigned long node_frees; + atomic_t allochit; + atomic_t allocmiss; + atomic_t freehit; + atomic_t freemiss; #endif #if DEBUG - int dbghead; - int reallen; + int dbghead; + int reallen; #endif }; @@ -523,14 +522,15 @@ static unsigned long *dbg_redzone2(kmem_cache_t *cachep, void *objp) { BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); if (cachep->flags & SLAB_STORE_USER) - return (unsigned long*) (objp+cachep->objsize-2*BYTES_PER_WORD); - return (unsigned long*) (objp+cachep->objsize-BYTES_PER_WORD); + return (unsigned long *)(objp + cachep->objsize - + 2 * BYTES_PER_WORD); + return (unsigned long *)(objp + cachep->objsize - BYTES_PER_WORD); } static void **dbg_userword(kmem_cache_t *cachep, void *objp) { BUG_ON(!(cachep->flags & SLAB_STORE_USER)); - return (void**)(objp+cachep->objsize-BYTES_PER_WORD); + return (void **)(objp + cachep->objsize - BYTES_PER_WORD); } #else @@ -607,31 +607,31 @@ struct cache_names { static struct cache_names __initdata cache_names[] = { #define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" }, #include - { NULL, } + {NULL,} #undef CACHE }; static struct arraycache_init initarray_cache __initdata = - { { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; + { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; static struct arraycache_init initarray_generic = - { { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; + { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; /* internal cache of cache description objs */ static kmem_cache_t cache_cache = { - .batchcount = 1, - .limit = BOOT_CPUCACHE_ENTRIES, - .shared = 1, - .objsize = sizeof(kmem_cache_t), - .flags = SLAB_NO_REAP, - .spinlock = SPIN_LOCK_UNLOCKED, - .name = "kmem_cache", + .batchcount = 1, + .limit = BOOT_CPUCACHE_ENTRIES, + .shared = 1, + .objsize = sizeof(kmem_cache_t), + .flags = SLAB_NO_REAP, + .spinlock = SPIN_LOCK_UNLOCKED, + .name = "kmem_cache", #if DEBUG - .reallen = sizeof(kmem_cache_t), + .reallen = sizeof(kmem_cache_t), #endif }; /* Guard access to the cache-chain. 
*/ -static struct semaphore cache_chain_sem; +static struct semaphore cache_chain_sem; static struct list_head cache_chain; /* @@ -655,9 +655,9 @@ static enum { static DEFINE_PER_CPU(struct work_struct, reap_work); -static void free_block(kmem_cache_t* cachep, void** objpp, int len, int node); -static void enable_cpucache (kmem_cache_t *cachep); -static void cache_reap (void *unused); +static void free_block(kmem_cache_t *cachep, void **objpp, int len, int node); +static void enable_cpucache(kmem_cache_t *cachep); +static void cache_reap(void *unused); static int __node_shrink(kmem_cache_t *cachep, int node); static inline struct array_cache *ac_data(kmem_cache_t *cachep) @@ -671,9 +671,9 @@ static inline kmem_cache_t *__find_general_cachep(size_t size, gfp_t gfpflags) #if DEBUG /* This happens if someone tries to call - * kmem_cache_create(), or __kmalloc(), before - * the generic caches are initialized. - */ + * kmem_cache_create(), or __kmalloc(), before + * the generic caches are initialized. + */ BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL); #endif while (size > csizep->cs_size) @@ -697,10 +697,10 @@ EXPORT_SYMBOL(kmem_find_general_cachep); /* Cal the num objs, wastage, and bytes left over for a given slab size. */ static void cache_estimate(unsigned long gfporder, size_t size, size_t align, - int flags, size_t *left_over, unsigned int *num) + int flags, size_t *left_over, unsigned int *num) { int i; - size_t wastage = PAGE_SIZE< 0) i--; @@ -718,8 +718,8 @@ static void cache_estimate(unsigned long gfporder, size_t size, size_t align, i = SLAB_LIMIT; *num = i; - wastage -= i*size; - wastage -= ALIGN(base+i*extra, align); + wastage -= i * size; + wastage -= ALIGN(base + i * extra, align); *left_over = wastage; } @@ -728,7 +728,7 @@ static void cache_estimate(unsigned long gfporder, size_t size, size_t align, static void __slab_error(const char *function, kmem_cache_t *cachep, char *msg) { printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", - function, cachep->name, msg); + function, cachep->name, msg); dump_stack(); } @@ -755,9 +755,9 @@ static void __devinit start_cpu_timer(int cpu) } static struct array_cache *alloc_arraycache(int node, int entries, - int batchcount) + int batchcount) { - int memsize = sizeof(void*)*entries+sizeof(struct array_cache); + int memsize = sizeof(void *) * entries + sizeof(struct array_cache); struct array_cache *nc = NULL; nc = kmalloc_node(memsize, GFP_KERNEL, node); @@ -775,7 +775,7 @@ static struct array_cache *alloc_arraycache(int node, int entries, static inline struct array_cache **alloc_alien_cache(int node, int limit) { struct array_cache **ac_ptr; - int memsize = sizeof(void*)*MAX_NUMNODES; + int memsize = sizeof(void *) * MAX_NUMNODES; int i; if (limit > 1) @@ -789,7 +789,7 @@ static inline struct array_cache **alloc_alien_cache(int node, int limit) } ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d); if (!ac_ptr[i]) { - for (i--; i <=0; i--) + for (i--; i <= 0; i--) kfree(ac_ptr[i]); kfree(ac_ptr); return NULL; @@ -807,12 +807,13 @@ static inline void free_alien_cache(struct array_cache **ac_ptr) return; for_each_node(i) - kfree(ac_ptr[i]); + kfree(ac_ptr[i]); kfree(ac_ptr); } -static inline void __drain_alien_cache(kmem_cache_t *cachep, struct array_cache *ac, int node) +static inline void __drain_alien_cache(kmem_cache_t *cachep, + struct array_cache *ac, int node) { struct kmem_list3 *rl3 = cachep->nodelists[node]; @@ -826,7 +827,7 @@ static inline void __drain_alien_cache(kmem_cache_t *cachep, struct array_cache static void 
drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3) { - int i=0; + int i = 0; struct array_cache *ac; unsigned long flags; @@ -846,10 +847,10 @@ static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3) #endif static int __devinit cpuup_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) + unsigned long action, void *hcpu) { long cpu = (long)hcpu; - kmem_cache_t* cachep; + kmem_cache_t *cachep; struct kmem_list3 *l3 = NULL; int node = cpu_to_node(cpu); int memsize = sizeof(struct kmem_list3); @@ -871,27 +872,27 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, */ if (!cachep->nodelists[node]) { if (!(l3 = kmalloc_node(memsize, - GFP_KERNEL, node))) + GFP_KERNEL, node))) goto bad; kmem_list3_init(l3); l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + - ((unsigned long)cachep)%REAPTIMEOUT_LIST3; + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; cachep->nodelists[node] = l3; } spin_lock_irq(&cachep->nodelists[node]->list_lock); cachep->nodelists[node]->free_limit = - (1 + nr_cpus_node(node)) * - cachep->batchcount + cachep->num; + (1 + nr_cpus_node(node)) * + cachep->batchcount + cachep->num; spin_unlock_irq(&cachep->nodelists[node]->list_lock); } /* Now we can go ahead with allocating the shared array's - & array cache's */ + & array cache's */ list_for_each_entry(cachep, &cache_chain, next) { nc = alloc_arraycache(node, cachep->limit, - cachep->batchcount); + cachep->batchcount); if (!nc) goto bad; cachep->array[cpu] = nc; @@ -900,12 +901,13 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, BUG_ON(!l3); if (!l3->shared) { if (!(nc = alloc_arraycache(node, - cachep->shared*cachep->batchcount, - 0xbaadf00d))) - goto bad; + cachep->shared * + cachep->batchcount, + 0xbaadf00d))) + goto bad; /* we are serialised from CPU_DEAD or - CPU_UP_CANCELLED by the cpucontrol lock */ + CPU_UP_CANCELLED by the cpucontrol lock */ l3->shared = nc; } } @@ -942,13 +944,13 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, free_block(cachep, nc->entry, nc->avail, node); if (!cpus_empty(mask)) { - spin_unlock(&l3->list_lock); - goto unlock_cache; - } + spin_unlock(&l3->list_lock); + goto unlock_cache; + } if (l3->shared) { free_block(cachep, l3->shared->entry, - l3->shared->avail, node); + l3->shared->avail, node); kfree(l3->shared); l3->shared = NULL; } @@ -966,7 +968,7 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, } else { spin_unlock(&l3->list_lock); } -unlock_cache: + unlock_cache: spin_unlock_irq(&cachep->spinlock); kfree(nc); } @@ -975,7 +977,7 @@ unlock_cache: #endif } return NOTIFY_OK; -bad: + bad: up(&cache_chain_sem); return NOTIFY_BAD; } @@ -985,8 +987,7 @@ static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 }; /* * swap the static kmem_list3 with kmalloced memory */ -static void init_list(kmem_cache_t *cachep, struct kmem_list3 *list, - int nodeid) +static void init_list(kmem_cache_t *cachep, struct kmem_list3 *list, int nodeid) { struct kmem_list3 *ptr; @@ -1055,14 +1056,14 @@ void __init kmem_cache_init(void) cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size()); cache_estimate(0, cache_cache.objsize, cache_line_size(), 0, - &left_over, &cache_cache.num); + &left_over, &cache_cache.num); if (!cache_cache.num) BUG(); - cache_cache.colour = left_over/cache_cache.colour_off; + cache_cache.colour = left_over / cache_cache.colour_off; cache_cache.colour_next = 0; - cache_cache.slab_size = ALIGN(cache_cache.num*sizeof(kmem_bufctl_t) + - sizeof(struct slab), 
cache_line_size()); + cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) + + sizeof(struct slab), cache_line_size()); /* 2+3) create the kmalloc caches */ sizes = malloc_sizes; @@ -1074,14 +1075,18 @@ void __init kmem_cache_init(void) */ sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, - sizes[INDEX_AC].cs_size, ARCH_KMALLOC_MINALIGN, - (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL); + sizes[INDEX_AC].cs_size, + ARCH_KMALLOC_MINALIGN, + (ARCH_KMALLOC_FLAGS | + SLAB_PANIC), NULL, NULL); if (INDEX_AC != INDEX_L3) sizes[INDEX_L3].cs_cachep = - kmem_cache_create(names[INDEX_L3].name, - sizes[INDEX_L3].cs_size, ARCH_KMALLOC_MINALIGN, - (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL); + kmem_cache_create(names[INDEX_L3].name, + sizes[INDEX_L3].cs_size, + ARCH_KMALLOC_MINALIGN, + (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, + NULL); while (sizes->cs_size != ULONG_MAX) { /* @@ -1091,35 +1096,41 @@ void __init kmem_cache_init(void) * Note for systems short on memory removing the alignment will * allow tighter packing of the smaller caches. */ - if(!sizes->cs_cachep) + if (!sizes->cs_cachep) sizes->cs_cachep = kmem_cache_create(names->name, - sizes->cs_size, ARCH_KMALLOC_MINALIGN, - (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL); + sizes->cs_size, + ARCH_KMALLOC_MINALIGN, + (ARCH_KMALLOC_FLAGS + | SLAB_PANIC), + NULL, NULL); /* Inc off-slab bufctl limit until the ceiling is hit. */ if (!(OFF_SLAB(sizes->cs_cachep))) { - offslab_limit = sizes->cs_size-sizeof(struct slab); + offslab_limit = sizes->cs_size - sizeof(struct slab); offslab_limit /= sizeof(kmem_bufctl_t); } sizes->cs_dmacachep = kmem_cache_create(names->name_dma, - sizes->cs_size, ARCH_KMALLOC_MINALIGN, - (ARCH_KMALLOC_FLAGS | SLAB_CACHE_DMA | SLAB_PANIC), - NULL, NULL); + sizes->cs_size, + ARCH_KMALLOC_MINALIGN, + (ARCH_KMALLOC_FLAGS | + SLAB_CACHE_DMA | + SLAB_PANIC), NULL, + NULL); sizes++; names++; } /* 4) Replace the bootstrap head arrays */ { - void * ptr; + void *ptr; ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); local_irq_disable(); BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache); memcpy(ptr, ac_data(&cache_cache), - sizeof(struct arraycache_init)); + sizeof(struct arraycache_init)); cache_cache.array[smp_processor_id()] = ptr; local_irq_enable(); @@ -1127,11 +1138,11 @@ void __init kmem_cache_init(void) local_irq_disable(); BUG_ON(ac_data(malloc_sizes[INDEX_AC].cs_cachep) - != &initarray_generic.cache); + != &initarray_generic.cache); memcpy(ptr, ac_data(malloc_sizes[INDEX_AC].cs_cachep), - sizeof(struct arraycache_init)); + sizeof(struct arraycache_init)); malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = - ptr; + ptr; local_irq_enable(); } /* 5) Replace the bootstrap kmem_list3's */ @@ -1139,16 +1150,16 @@ void __init kmem_cache_init(void) int node; /* Replace the static kmem_list3 structures for the boot cpu */ init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], - numa_node_id()); + numa_node_id()); for_each_online_node(node) { init_list(malloc_sizes[INDEX_AC].cs_cachep, - &initkmem_list3[SIZE_AC+node], node); + &initkmem_list3[SIZE_AC + node], node); if (INDEX_AC != INDEX_L3) { init_list(malloc_sizes[INDEX_L3].cs_cachep, - &initkmem_list3[SIZE_L3+node], - node); + &initkmem_list3[SIZE_L3 + node], + node); } } } @@ -1158,7 +1169,7 @@ void __init kmem_cache_init(void) kmem_cache_t *cachep; down(&cache_chain_sem); list_for_each_entry(cachep, &cache_chain, next) - enable_cpucache(cachep); + enable_cpucache(cachep); up(&cache_chain_sem); } @@ -1184,7 +1195,7 @@ 
static int __init cpucache_init(void) * pages to gfp. */ for_each_online_cpu(cpu) - start_cpu_timer(cpu); + start_cpu_timer(cpu); return 0; } @@ -1226,7 +1237,7 @@ static void *kmem_getpages(kmem_cache_t *cachep, gfp_t flags, int nodeid) */ static void kmem_freepages(kmem_cache_t *cachep, void *addr) { - unsigned long i = (1<gfporder); + unsigned long i = (1 << cachep->gfporder); struct page *page = virt_to_page(addr); const unsigned long nr_freed = i; @@ -1239,13 +1250,13 @@ static void kmem_freepages(kmem_cache_t *cachep, void *addr) if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; free_pages((unsigned long)addr, cachep->gfporder); - if (cachep->flags & SLAB_RECLAIM_ACCOUNT) - atomic_sub(1<gfporder, &slab_reclaim_pages); + if (cachep->flags & SLAB_RECLAIM_ACCOUNT) + atomic_sub(1 << cachep->gfporder, &slab_reclaim_pages); } static void kmem_rcu_free(struct rcu_head *head) { - struct slab_rcu *slab_rcu = (struct slab_rcu *) head; + struct slab_rcu *slab_rcu = (struct slab_rcu *)head; kmem_cache_t *cachep = slab_rcu->cachep; kmem_freepages(cachep, slab_rcu->addr); @@ -1257,19 +1268,19 @@ static void kmem_rcu_free(struct rcu_head *head) #ifdef CONFIG_DEBUG_PAGEALLOC static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr, - unsigned long caller) + unsigned long caller) { int size = obj_reallen(cachep); - addr = (unsigned long *)&((char*)addr)[obj_dbghead(cachep)]; + addr = (unsigned long *)&((char *)addr)[obj_dbghead(cachep)]; - if (size < 5*sizeof(unsigned long)) + if (size < 5 * sizeof(unsigned long)) return; - *addr++=0x12345678; - *addr++=caller; - *addr++=smp_processor_id(); - size -= 3*sizeof(unsigned long); + *addr++ = 0x12345678; + *addr++ = caller; + *addr++ = smp_processor_id(); + size -= 3 * sizeof(unsigned long); { unsigned long *sptr = &caller; unsigned long svalue; @@ -1277,7 +1288,7 @@ static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr, while (!kstack_end(sptr)) { svalue = *sptr++; if (kernel_text_address(svalue)) { - *addr++=svalue; + *addr++ = svalue; size -= sizeof(unsigned long); if (size <= sizeof(unsigned long)) break; @@ -1285,25 +1296,25 @@ static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr, } } - *addr++=0x87654321; + *addr++ = 0x87654321; } #endif static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val) { int size = obj_reallen(cachep); - addr = &((char*)addr)[obj_dbghead(cachep)]; + addr = &((char *)addr)[obj_dbghead(cachep)]; memset(addr, val, size); - *(unsigned char *)(addr+size-1) = POISON_END; + *(unsigned char *)(addr + size - 1) = POISON_END; } static void dump_line(char *data, int offset, int limit) { int i; printk(KERN_ERR "%03x:", offset); - for (i=0;iflags & SLAB_RED_ZONE) { printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n", - *dbg_redzone1(cachep, objp), - *dbg_redzone2(cachep, objp)); + *dbg_redzone1(cachep, objp), + *dbg_redzone2(cachep, objp)); } if (cachep->flags & SLAB_STORE_USER) { printk(KERN_ERR "Last user: [<%p>]", - *dbg_userword(cachep, objp)); + *dbg_userword(cachep, objp)); print_symbol("(%s)", - (unsigned long)*dbg_userword(cachep, objp)); + (unsigned long)*dbg_userword(cachep, objp)); printk("\n"); } - realobj = (char*)objp+obj_dbghead(cachep); + realobj = (char *)objp + obj_dbghead(cachep); size = obj_reallen(cachep); - for (i=0; i size) - limit = size-i; + if (i + limit > size) + limit = size - i; dump_line(realobj, i, limit); } } @@ -1346,27 +1357,28 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp) int size, i; int lines = 0; - 
realobj = (char*)objp+obj_dbghead(cachep); + realobj = (char *)objp + obj_dbghead(cachep); size = obj_reallen(cachep); - for (i=0;i size) - limit = size-i; + if (i + limit > size) + limit = size - i; dump_line(realobj, i, limit); i += 16; lines++; @@ -1382,19 +1394,19 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp) struct slab *slabp = page_get_slab(virt_to_page(objp)); int objnr; - objnr = (objp-slabp->s_mem)/cachep->objsize; + objnr = (objp - slabp->s_mem) / cachep->objsize; if (objnr) { - objp = slabp->s_mem+(objnr-1)*cachep->objsize; - realobj = (char*)objp+obj_dbghead(cachep); + objp = slabp->s_mem + (objnr - 1) * cachep->objsize; + realobj = (char *)objp + obj_dbghead(cachep); printk(KERN_ERR "Prev obj: start=%p, len=%d\n", - realobj, size); + realobj, size); print_objinfo(cachep, objp, 2); } - if (objnr+1 < cachep->num) { - objp = slabp->s_mem+(objnr+1)*cachep->objsize; - realobj = (char*)objp+obj_dbghead(cachep); + if (objnr + 1 < cachep->num) { + objp = slabp->s_mem + (objnr + 1) * cachep->objsize; + realobj = (char *)objp + obj_dbghead(cachep); printk(KERN_ERR "Next obj: start=%p, len=%d\n", - realobj, size); + realobj, size); print_objinfo(cachep, objp, 2); } } @@ -1405,7 +1417,7 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp) * Before calling the slab must have been unlinked from the cache. * The cache-lock is not held/needed. */ -static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp) +static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp) { void *addr = slabp->s_mem - slabp->colouroff; @@ -1416,8 +1428,11 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp) if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC - if ((cachep->objsize%PAGE_SIZE)==0 && OFF_SLAB(cachep)) - kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE,1); + if ((cachep->objsize % PAGE_SIZE) == 0 + && OFF_SLAB(cachep)) + kernel_map_pages(virt_to_page(objp), + cachep->objsize / PAGE_SIZE, + 1); else check_poison_obj(cachep, objp); #else @@ -1427,20 +1442,20 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp) if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) slab_error(cachep, "start of a freed object " - "was overwritten"); + "was overwritten"); if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) slab_error(cachep, "end of a freed object " - "was overwritten"); + "was overwritten"); } if (cachep->dtor && !(cachep->flags & SLAB_POISON)) - (cachep->dtor)(objp+obj_dbghead(cachep), cachep, 0); + (cachep->dtor) (objp + obj_dbghead(cachep), cachep, 0); } #else if (cachep->dtor) { int i; for (i = 0; i < cachep->num; i++) { - void* objp = slabp->s_mem+cachep->objsize*i; - (cachep->dtor)(objp, cachep, 0); + void *objp = slabp->s_mem + cachep->objsize * i; + (cachep->dtor) (objp, cachep, 0); } } #endif @@ -1448,7 +1463,7 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp) if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { struct slab_rcu *slab_rcu; - slab_rcu = (struct slab_rcu *) slabp; + slab_rcu = (struct slab_rcu *)slabp; slab_rcu->cachep = cachep; slab_rcu->addr = addr; call_rcu(&slab_rcu->head, kmem_rcu_free); @@ -1466,10 +1481,10 @@ static inline void set_up_list3s(kmem_cache_t *cachep, int index) int node; for_each_online_node(node) { - cachep->nodelists[node] = &initkmem_list3[index+node]; + cachep->nodelists[node] = &initkmem_list3[index + node]; cachep->nodelists[node]->next_reap = jiffies + - REAPTIMEOUT_LIST3 + - ((unsigned 
long)cachep)%REAPTIMEOUT_LIST3; + REAPTIMEOUT_LIST3 + + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; } } @@ -1486,7 +1501,7 @@ static inline size_t calculate_slab_order(kmem_cache_t *cachep, size_t size, { size_t left_over = 0; - for ( ; ; cachep->gfporder++) { + for (;; cachep->gfporder++) { unsigned int num; size_t remainder; @@ -1566,14 +1581,13 @@ kmem_cache_create (const char *name, size_t size, size_t align, * Sanity checks... these are all serious usage bugs. */ if ((!name) || - in_interrupt() || - (size < BYTES_PER_WORD) || - (size > (1< (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) { + printk(KERN_ERR "%s: Early error in slab %s\n", + __FUNCTION__, name); + BUG(); + } down(&cache_chain_sem); @@ -1593,11 +1607,11 @@ kmem_cache_create (const char *name, size_t size, size_t align, set_fs(old_fs); if (res) { printk("SLAB: cache with size %d has lost its name\n", - pc->objsize); + pc->objsize); continue; } - if (!strcmp(pc->name,name)) { + if (!strcmp(pc->name, name)) { printk("kmem_cache_create: duplicate cache %s\n", name); dump_stack(); goto oops; @@ -1609,10 +1623,9 @@ kmem_cache_create (const char *name, size_t size, size_t align, if ((flags & SLAB_DEBUG_INITIAL) && !ctor) { /* No constructor, but inital state check requested */ printk(KERN_ERR "%s: No con, but init state check " - "requested - %s\n", __FUNCTION__, name); + "requested - %s\n", __FUNCTION__, name); flags &= ~SLAB_DEBUG_INITIAL; } - #if FORCED_DEBUG /* * Enable redzoning and last user accounting, except for caches with @@ -1620,8 +1633,9 @@ kmem_cache_create (const char *name, size_t size, size_t align, * above the next power of two: caches with object sizes just above a * power of two have a significant amount of internal fragmentation. */ - if ((size < 4096 || fls(size-1) == fls(size-1+3*BYTES_PER_WORD))) - flags |= SLAB_RED_ZONE|SLAB_STORE_USER; + if ((size < 4096 + || fls(size - 1) == fls(size - 1 + 3 * BYTES_PER_WORD))) + flags |= SLAB_RED_ZONE | SLAB_STORE_USER; if (!(flags & SLAB_DESTROY_BY_RCU)) flags |= SLAB_POISON; #endif @@ -1642,9 +1656,9 @@ kmem_cache_create (const char *name, size_t size, size_t align, * unaligned accesses for some archs when redzoning is used, and makes * sure any on-slab bufctl's are also correctly aligned. */ - if (size & (BYTES_PER_WORD-1)) { - size += (BYTES_PER_WORD-1); - size &= ~(BYTES_PER_WORD-1); + if (size & (BYTES_PER_WORD - 1)) { + size += (BYTES_PER_WORD - 1); + size &= ~(BYTES_PER_WORD - 1); } /* calculate out the final buffer alignment: */ @@ -1655,7 +1669,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, * objects into one cacheline. */ ralign = cache_line_size(); - while (size <= ralign/2) + while (size <= ralign / 2) ralign /= 2; } else { ralign = BYTES_PER_WORD; @@ -1664,13 +1678,13 @@ kmem_cache_create (const char *name, size_t size, size_t align, if (ralign < ARCH_SLAB_MINALIGN) { ralign = ARCH_SLAB_MINALIGN; if (ralign > BYTES_PER_WORD) - flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER); + flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); } /* 3) caller mandated alignment: disables debug if necessary */ if (ralign < align) { ralign = align; if (ralign > BYTES_PER_WORD) - flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER); + flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); } /* 4) Store it. Note that the debug code below can reduce * the alignment to BYTES_PER_WORD. 
@@ -1692,7 +1706,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, /* add space for red zone words */ cachep->dbghead += BYTES_PER_WORD; - size += 2*BYTES_PER_WORD; + size += 2 * BYTES_PER_WORD; } if (flags & SLAB_STORE_USER) { /* user store requires word alignment and @@ -1703,7 +1717,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, size += BYTES_PER_WORD; } #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) - if (size >= malloc_sizes[INDEX_L3+1].cs_size && cachep->reallen > cache_line_size() && size < PAGE_SIZE) { + if (size >= malloc_sizes[INDEX_L3 + 1].cs_size + && cachep->reallen > cache_line_size() && size < PAGE_SIZE) { cachep->dbghead += PAGE_SIZE - size; size = PAGE_SIZE; } @@ -1711,7 +1726,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, #endif /* Determine if the slab management is 'on' or 'off' slab. */ - if (size >= (PAGE_SIZE>>3)) + if (size >= (PAGE_SIZE >> 3)) /* * Size is large, assume best to place the slab management obj * off-slab (should allow better packing of objs). @@ -1728,7 +1743,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, */ cachep->gfporder = 0; cache_estimate(cachep->gfporder, size, align, flags, - &left_over, &cachep->num); + &left_over, &cachep->num); } else left_over = calculate_slab_order(cachep, size, align, flags); @@ -1738,8 +1753,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, cachep = NULL; goto oops; } - slab_size = ALIGN(cachep->num*sizeof(kmem_bufctl_t) - + sizeof(struct slab), align); + slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) + + sizeof(struct slab), align); /* * If the slab has been placed off-slab, and we have enough space then @@ -1752,14 +1767,15 @@ kmem_cache_create (const char *name, size_t size, size_t align, if (flags & CFLGS_OFF_SLAB) { /* really off slab. No need for manual alignment */ - slab_size = cachep->num*sizeof(kmem_bufctl_t)+sizeof(struct slab); + slab_size = + cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab); } cachep->colour_off = cache_line_size(); /* Offset must be a multiple of the alignment. */ if (cachep->colour_off < align) cachep->colour_off = align; - cachep->colour = left_over/cachep->colour_off; + cachep->colour = left_over / cachep->colour_off; cachep->slab_size = slab_size; cachep->flags = flags; cachep->gfpflags = 0; @@ -1786,7 +1802,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, * the creation of further caches will BUG(). 
*/ cachep->array[smp_processor_id()] = - &initarray_generic.cache; + &initarray_generic.cache; /* If the cache that's used by * kmalloc(sizeof(kmem_list3)) is the first cache, @@ -1800,8 +1816,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, g_cpucache_up = PARTIAL_AC; } else { cachep->array[smp_processor_id()] = - kmalloc(sizeof(struct arraycache_init), - GFP_KERNEL); + kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); if (g_cpucache_up == PARTIAL_AC) { set_up_list3s(cachep, SIZE_L3); @@ -1811,16 +1826,18 @@ kmem_cache_create (const char *name, size_t size, size_t align, for_each_online_node(node) { cachep->nodelists[node] = - kmalloc_node(sizeof(struct kmem_list3), - GFP_KERNEL, node); + kmalloc_node(sizeof + (struct kmem_list3), + GFP_KERNEL, node); BUG_ON(!cachep->nodelists[node]); - kmem_list3_init(cachep->nodelists[node]); + kmem_list3_init(cachep-> + nodelists[node]); } } } cachep->nodelists[numa_node_id()]->next_reap = - jiffies + REAPTIMEOUT_LIST3 + - ((unsigned long)cachep)%REAPTIMEOUT_LIST3; + jiffies + REAPTIMEOUT_LIST3 + + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; BUG_ON(!ac_data(cachep)); ac_data(cachep)->avail = 0; @@ -1829,15 +1846,15 @@ kmem_cache_create (const char *name, size_t size, size_t align, ac_data(cachep)->touched = 0; cachep->batchcount = 1; cachep->limit = BOOT_CPUCACHE_ENTRIES; - } + } /* cache setup completed, link it into the list */ list_add(&cachep->next, &cache_chain); unlock_cpu_hotplug(); -oops: + oops: if (!cachep && (flags & SLAB_PANIC)) panic("kmem_cache_create(): failed to create slab `%s'\n", - name); + name); up(&cache_chain_sem); return cachep; } @@ -1880,7 +1897,7 @@ static inline void check_spinlock_acquired_node(kmem_cache_t *cachep, int node) /* * Waits for all CPUs to execute func(). */ -static void smp_call_function_all_cpus(void (*func) (void *arg), void *arg) +static void smp_call_function_all_cpus(void (*func)(void *arg), void *arg) { check_irq_on(); preempt_disable(); @@ -1895,12 +1912,12 @@ static void smp_call_function_all_cpus(void (*func) (void *arg), void *arg) preempt_enable(); } -static void drain_array_locked(kmem_cache_t* cachep, - struct array_cache *ac, int force, int node); +static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac, + int force, int node); static void do_drain(void *arg) { - kmem_cache_t *cachep = (kmem_cache_t*)arg; + kmem_cache_t *cachep = (kmem_cache_t *) arg; struct array_cache *ac; int node = numa_node_id(); @@ -1920,7 +1937,7 @@ static void drain_cpu_caches(kmem_cache_t *cachep) smp_call_function_all_cpus(do_drain, cachep); check_irq_on(); spin_lock_irq(&cachep->spinlock); - for_each_online_node(node) { + for_each_online_node(node) { l3 = cachep->nodelists[node]; if (l3) { spin_lock(&l3->list_lock); @@ -1958,8 +1975,7 @@ static int __node_shrink(kmem_cache_t *cachep, int node) slab_destroy(cachep, slabp); spin_lock_irq(&l3->list_lock); } - ret = !list_empty(&l3->slabs_full) || - !list_empty(&l3->slabs_partial); + ret = !list_empty(&l3->slabs_full) || !list_empty(&l3->slabs_partial); return ret; } @@ -2015,7 +2031,7 @@ EXPORT_SYMBOL(kmem_cache_shrink); * The caller must guarantee that noone will allocate memory from the cache * during the kmem_cache_destroy(). 
*/ -int kmem_cache_destroy(kmem_cache_t * cachep) +int kmem_cache_destroy(kmem_cache_t *cachep) { int i; struct kmem_list3 *l3; @@ -2037,7 +2053,7 @@ int kmem_cache_destroy(kmem_cache_t * cachep) if (__cache_shrink(cachep)) { slab_error(cachep, "Can't free all objects"); down(&cache_chain_sem); - list_add(&cachep->next,&cache_chain); + list_add(&cachep->next, &cache_chain); up(&cache_chain_sem); unlock_cpu_hotplug(); return 1; @@ -2047,7 +2063,7 @@ int kmem_cache_destroy(kmem_cache_t * cachep) synchronize_rcu(); for_each_online_cpu(i) - kfree(cachep->array[i]); + kfree(cachep->array[i]); /* NUMA: free the list3 structures */ for_each_online_node(i) { @@ -2066,39 +2082,39 @@ int kmem_cache_destroy(kmem_cache_t * cachep) EXPORT_SYMBOL(kmem_cache_destroy); /* Get the memory for a slab management obj. */ -static struct slab* alloc_slabmgmt(kmem_cache_t *cachep, void *objp, - int colour_off, gfp_t local_flags) +static struct slab *alloc_slabmgmt(kmem_cache_t *cachep, void *objp, + int colour_off, gfp_t local_flags) { struct slab *slabp; - + if (OFF_SLAB(cachep)) { /* Slab management obj is off-slab. */ slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags); if (!slabp) return NULL; } else { - slabp = objp+colour_off; + slabp = objp + colour_off; colour_off += cachep->slab_size; } slabp->inuse = 0; slabp->colouroff = colour_off; - slabp->s_mem = objp+colour_off; + slabp->s_mem = objp + colour_off; return slabp; } static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) { - return (kmem_bufctl_t *)(slabp+1); + return (kmem_bufctl_t *) (slabp + 1); } static void cache_init_objs(kmem_cache_t *cachep, - struct slab *slabp, unsigned long ctor_flags) + struct slab *slabp, unsigned long ctor_flags) { int i; for (i = 0; i < cachep->num; i++) { - void *objp = slabp->s_mem+cachep->objsize*i; + void *objp = slabp->s_mem + cachep->objsize * i; #if DEBUG /* need to poison the objs? */ if (cachep->flags & SLAB_POISON) @@ -2116,25 +2132,28 @@ static void cache_init_objs(kmem_cache_t *cachep, * Otherwise, deadlock. They must also be threaded. 
*/ if (cachep->ctor && !(cachep->flags & SLAB_POISON)) - cachep->ctor(objp+obj_dbghead(cachep), cachep, ctor_flags); + cachep->ctor(objp + obj_dbghead(cachep), cachep, + ctor_flags); if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) slab_error(cachep, "constructor overwrote the" - " end of an object"); + " end of an object"); if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) slab_error(cachep, "constructor overwrote the" - " start of an object"); + " start of an object"); } - if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) - kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0); + if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep) + && cachep->flags & SLAB_POISON) + kernel_map_pages(virt_to_page(objp), + cachep->objsize / PAGE_SIZE, 0); #else if (cachep->ctor) cachep->ctor(objp, cachep, ctor_flags); #endif - slab_bufctl(slabp)[i] = i+1; + slab_bufctl(slabp)[i] = i + 1; } - slab_bufctl(slabp)[i-1] = BUFCTL_END; + slab_bufctl(slabp)[i - 1] = BUFCTL_END; slabp->free = 0; } @@ -2170,17 +2189,17 @@ static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp) */ static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid) { - struct slab *slabp; - void *objp; - size_t offset; - gfp_t local_flags; - unsigned long ctor_flags; + struct slab *slabp; + void *objp; + size_t offset; + gfp_t local_flags; + unsigned long ctor_flags; struct kmem_list3 *l3; /* Be lazy and only check for valid flags here, - * keeping it out of the critical path in kmem_cache_alloc(). + * keeping it out of the critical path in kmem_cache_alloc(). */ - if (flags & ~(SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW)) + if (flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW)) BUG(); if (flags & SLAB_NO_GROW) return 0; @@ -2246,9 +2265,9 @@ static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid) l3->free_objects += cachep->num; spin_unlock(&l3->list_lock); return 1; -opps1: + opps1: kmem_freepages(cachep, objp); -failed: + failed: if (local_flags & __GFP_WAIT) local_irq_disable(); return 0; @@ -2268,18 +2287,19 @@ static void kfree_debugcheck(const void *objp) if (!virt_addr_valid(objp)) { printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n", - (unsigned long)objp); - BUG(); + (unsigned long)objp); + BUG(); } page = virt_to_page(objp); if (!PageSlab(page)) { - printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n", (unsigned long)objp); + printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n", + (unsigned long)objp); BUG(); } } static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp, - void *caller) + void *caller) { struct page *page; unsigned int objnr; @@ -2290,20 +2310,26 @@ static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp, page = virt_to_page(objp); if (page_get_cache(page) != cachep) { - printk(KERN_ERR "mismatch in kmem_cache_free: expected cache %p, got %p\n", - page_get_cache(page),cachep); + printk(KERN_ERR + "mismatch in kmem_cache_free: expected cache %p, got %p\n", + page_get_cache(page), cachep); printk(KERN_ERR "%p is %s.\n", cachep, cachep->name); - printk(KERN_ERR "%p is %s.\n", page_get_cache(page), page_get_cache(page)->name); + printk(KERN_ERR "%p is %s.\n", page_get_cache(page), + page_get_cache(page)->name); WARN_ON(1); } slabp = page_get_slab(page); if (cachep->flags & SLAB_RED_ZONE) { - if (*dbg_redzone1(cachep, objp) != RED_ACTIVE || *dbg_redzone2(cachep, objp) != RED_ACTIVE) { - slab_error(cachep, "double free, or memory outside" - " object was 
overwritten"); - printk(KERN_ERR "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", - objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); + if (*dbg_redzone1(cachep, objp) != RED_ACTIVE + || *dbg_redzone2(cachep, objp) != RED_ACTIVE) { + slab_error(cachep, + "double free, or memory outside" + " object was overwritten"); + printk(KERN_ERR + "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", + objp, *dbg_redzone1(cachep, objp), + *dbg_redzone2(cachep, objp)); } *dbg_redzone1(cachep, objp) = RED_INACTIVE; *dbg_redzone2(cachep, objp) = RED_INACTIVE; @@ -2311,30 +2337,31 @@ static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp, if (cachep->flags & SLAB_STORE_USER) *dbg_userword(cachep, objp) = caller; - objnr = (objp-slabp->s_mem)/cachep->objsize; + objnr = (objp - slabp->s_mem) / cachep->objsize; BUG_ON(objnr >= cachep->num); - BUG_ON(objp != slabp->s_mem + objnr*cachep->objsize); + BUG_ON(objp != slabp->s_mem + objnr * cachep->objsize); if (cachep->flags & SLAB_DEBUG_INITIAL) { /* Need to call the slab's constructor so the * caller can perform a verify of its state (debugging). * Called without the cache-lock held. */ - cachep->ctor(objp+obj_dbghead(cachep), - cachep, SLAB_CTOR_CONSTRUCTOR|SLAB_CTOR_VERIFY); + cachep->ctor(objp + obj_dbghead(cachep), + cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY); } if (cachep->flags & SLAB_POISON && cachep->dtor) { /* we want to cache poison the object, * call the destruction callback */ - cachep->dtor(objp+obj_dbghead(cachep), cachep, 0); + cachep->dtor(objp + obj_dbghead(cachep), cachep, 0); } if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) { store_stackinfo(cachep, objp, (unsigned long)caller); - kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0); + kernel_map_pages(virt_to_page(objp), + cachep->objsize / PAGE_SIZE, 0); } else { poison_obj(cachep, objp, POISON_FREE); } @@ -2349,7 +2376,7 @@ static void check_slabp(kmem_cache_t *cachep, struct slab *slabp) { kmem_bufctl_t i; int entries = 0; - + /* Check slab's freelist to see if this obj is there. */ for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) { entries++; @@ -2357,13 +2384,16 @@ static void check_slabp(kmem_cache_t *cachep, struct slab *slabp) goto bad; } if (entries != cachep->num - slabp->inuse) { -bad: - printk(KERN_ERR "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n", - cachep->name, cachep->num, slabp, slabp->inuse); - for (i=0;inum*sizeof(kmem_bufctl_t);i++) { - if ((i%16)==0) + bad: + printk(KERN_ERR + "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). 
Hexdump:\n", + cachep->name, cachep->num, slabp, slabp->inuse); + for (i = 0; + i < sizeof(slabp) + cachep->num * sizeof(kmem_bufctl_t); + i++) { + if ((i % 16) == 0) printk("\n%03x:", i); - printk(" %02x", ((unsigned char*)slabp)[i]); + printk(" %02x", ((unsigned char *)slabp)[i]); } printk("\n"); BUG(); @@ -2383,7 +2413,7 @@ static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags) check_irq_off(); ac = ac_data(cachep); -retry: + retry: batchcount = ac->batchcount; if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { /* if there was little recent activity on this @@ -2405,8 +2435,8 @@ retry: shared_array->avail -= batchcount; ac->avail = batchcount; memcpy(ac->entry, - &(shared_array->entry[shared_array->avail]), - sizeof(void*)*batchcount); + &(shared_array->entry[shared_array->avail]), + sizeof(void *) * batchcount); shared_array->touched = 1; goto alloc_done; } @@ -2434,7 +2464,7 @@ retry: /* get obj pointer */ ac->entry[ac->avail++] = slabp->s_mem + - slabp->free*cachep->objsize; + slabp->free * cachep->objsize; slabp->inuse++; next = slab_bufctl(slabp)[slabp->free]; @@ -2442,7 +2472,7 @@ retry: slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; WARN_ON(numa_node_id() != slabp->nodeid); #endif - slabp->free = next; + slabp->free = next; } check_slabp(cachep, slabp); @@ -2454,9 +2484,9 @@ retry: list_add(&slabp->list, &l3->slabs_partial); } -must_grow: + must_grow: l3->free_objects -= ac->avail; -alloc_done: + alloc_done: spin_unlock(&l3->list_lock); if (unlikely(!ac->avail)) { @@ -2468,7 +2498,7 @@ alloc_done: if (!x && ac->avail == 0) // no objects in sight? abort return NULL; - if (!ac->avail) // objects refilled by interrupt? + if (!ac->avail) // objects refilled by interrupt? goto retry; } ac->touched = 1; @@ -2485,16 +2515,16 @@ cache_alloc_debugcheck_before(kmem_cache_t *cachep, gfp_t flags) } #if DEBUG -static void * -cache_alloc_debugcheck_after(kmem_cache_t *cachep, - gfp_t flags, void *objp, void *caller) +static void *cache_alloc_debugcheck_after(kmem_cache_t *cachep, gfp_t flags, + void *objp, void *caller) { - if (!objp) + if (!objp) return objp; - if (cachep->flags & SLAB_POISON) { + if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) - kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 1); + kernel_map_pages(virt_to_page(objp), + cachep->objsize / PAGE_SIZE, 1); else check_poison_obj(cachep, objp); #else @@ -2506,24 +2536,28 @@ cache_alloc_debugcheck_after(kmem_cache_t *cachep, *dbg_userword(cachep, objp) = caller; if (cachep->flags & SLAB_RED_ZONE) { - if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { - slab_error(cachep, "double free, or memory outside" - " object was overwritten"); - printk(KERN_ERR "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", - objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); + if (*dbg_redzone1(cachep, objp) != RED_INACTIVE + || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { + slab_error(cachep, + "double free, or memory outside" + " object was overwritten"); + printk(KERN_ERR + "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", + objp, *dbg_redzone1(cachep, objp), + *dbg_redzone2(cachep, objp)); } *dbg_redzone1(cachep, objp) = RED_ACTIVE; *dbg_redzone2(cachep, objp) = RED_ACTIVE; } objp += obj_dbghead(cachep); if (cachep->ctor && cachep->flags & SLAB_POISON) { - unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; + unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; if (!(flags & __GFP_WAIT)) ctor_flags 
|= SLAB_CTOR_ATOMIC; cachep->ctor(objp, cachep, ctor_flags); - } + } return objp; } #else @@ -2532,7 +2566,7 @@ cache_alloc_debugcheck_after(kmem_cache_t *cachep, static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags) { - void* objp; + void *objp; struct array_cache *ac; check_irq_off(); @@ -2551,7 +2585,7 @@ static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags) static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags) { unsigned long save_flags; - void* objp; + void *objp; cache_alloc_debugcheck_before(cachep, flags); @@ -2559,7 +2593,7 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags) objp = ____cache_alloc(cachep, flags); local_irq_restore(save_flags); objp = cache_alloc_debugcheck_after(cachep, flags, objp, - __builtin_return_address(0)); + __builtin_return_address(0)); prefetchw(objp); return objp; } @@ -2571,74 +2605,75 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags) static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) { struct list_head *entry; - struct slab *slabp; - struct kmem_list3 *l3; - void *obj; - kmem_bufctl_t next; - int x; - - l3 = cachep->nodelists[nodeid]; - BUG_ON(!l3); - -retry: - spin_lock(&l3->list_lock); - entry = l3->slabs_partial.next; - if (entry == &l3->slabs_partial) { - l3->free_touched = 1; - entry = l3->slabs_free.next; - if (entry == &l3->slabs_free) - goto must_grow; - } - - slabp = list_entry(entry, struct slab, list); - check_spinlock_acquired_node(cachep, nodeid); - check_slabp(cachep, slabp); - - STATS_INC_NODEALLOCS(cachep); - STATS_INC_ACTIVE(cachep); - STATS_SET_HIGH(cachep); - - BUG_ON(slabp->inuse == cachep->num); - - /* get obj pointer */ - obj = slabp->s_mem + slabp->free*cachep->objsize; - slabp->inuse++; - next = slab_bufctl(slabp)[slabp->free]; + struct slab *slabp; + struct kmem_list3 *l3; + void *obj; + kmem_bufctl_t next; + int x; + + l3 = cachep->nodelists[nodeid]; + BUG_ON(!l3); + + retry: + spin_lock(&l3->list_lock); + entry = l3->slabs_partial.next; + if (entry == &l3->slabs_partial) { + l3->free_touched = 1; + entry = l3->slabs_free.next; + if (entry == &l3->slabs_free) + goto must_grow; + } + + slabp = list_entry(entry, struct slab, list); + check_spinlock_acquired_node(cachep, nodeid); + check_slabp(cachep, slabp); + + STATS_INC_NODEALLOCS(cachep); + STATS_INC_ACTIVE(cachep); + STATS_SET_HIGH(cachep); + + BUG_ON(slabp->inuse == cachep->num); + + /* get obj pointer */ + obj = slabp->s_mem + slabp->free * cachep->objsize; + slabp->inuse++; + next = slab_bufctl(slabp)[slabp->free]; #if DEBUG - slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; + slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; #endif - slabp->free = next; - check_slabp(cachep, slabp); - l3->free_objects--; - /* move slabp to correct slabp list: */ - list_del(&slabp->list); - - if (slabp->free == BUFCTL_END) { - list_add(&slabp->list, &l3->slabs_full); - } else { - list_add(&slabp->list, &l3->slabs_partial); - } + slabp->free = next; + check_slabp(cachep, slabp); + l3->free_objects--; + /* move slabp to correct slabp list: */ + list_del(&slabp->list); + + if (slabp->free == BUFCTL_END) { + list_add(&slabp->list, &l3->slabs_full); + } else { + list_add(&slabp->list, &l3->slabs_partial); + } - spin_unlock(&l3->list_lock); - goto done; + spin_unlock(&l3->list_lock); + goto done; -must_grow: - spin_unlock(&l3->list_lock); - x = cache_grow(cachep, flags, nodeid); + must_grow: + spin_unlock(&l3->list_lock); + x = cache_grow(cachep, flags, nodeid); - if (!x) - return 
NULL; + if (!x) + return NULL; - goto retry; -done: - return obj; + goto retry; + done: + return obj; } #endif /* * Caller needs to acquire correct kmem_list's list_lock */ -static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, int node) +static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, + int node) { int i; struct kmem_list3 *l3; @@ -2661,7 +2696,7 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, int n if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { printk(KERN_ERR "slab: double free detected in cache " - "'%s', objp %p\n", cachep->name, objp); + "'%s', objp %p\n", cachep->name, objp); BUG(); } #endif @@ -2705,20 +2740,19 @@ static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac) spin_lock(&l3->list_lock); if (l3->shared) { struct array_cache *shared_array = l3->shared; - int max = shared_array->limit-shared_array->avail; + int max = shared_array->limit - shared_array->avail; if (max) { if (batchcount > max) batchcount = max; memcpy(&(shared_array->entry[shared_array->avail]), - ac->entry, - sizeof(void*)*batchcount); + ac->entry, sizeof(void *) * batchcount); shared_array->avail += batchcount; goto free_done; } } free_block(cachep, ac->entry, batchcount, node); -free_done: + free_done: #if STATS { int i = 0; @@ -2740,10 +2774,9 @@ free_done: spin_unlock(&l3->list_lock); ac->avail -= batchcount; memmove(ac->entry, &(ac->entry[batchcount]), - sizeof(void*)*ac->avail); + sizeof(void *) * ac->avail); } - /* * __cache_free * Release an obj back to its cache. If the obj has a constructed @@ -2768,7 +2801,8 @@ static inline void __cache_free(kmem_cache_t *cachep, void *objp) if (unlikely(slabp->nodeid != numa_node_id())) { struct array_cache *alien = NULL; int nodeid = slabp->nodeid; - struct kmem_list3 *l3 = cachep->nodelists[numa_node_id()]; + struct kmem_list3 *l3 = + cachep->nodelists[numa_node_id()]; STATS_INC_NODEFREES(cachep); if (l3->alien && l3->alien[nodeid]) { @@ -2776,15 +2810,15 @@ static inline void __cache_free(kmem_cache_t *cachep, void *objp) spin_lock(&alien->lock); if (unlikely(alien->avail == alien->limit)) __drain_alien_cache(cachep, - alien, nodeid); + alien, nodeid); alien->entry[alien->avail++] = objp; spin_unlock(&alien->lock); } else { spin_lock(&(cachep->nodelists[nodeid])-> - list_lock); + list_lock); free_block(cachep, &objp, 1, nodeid); spin_unlock(&(cachep->nodelists[nodeid])-> - list_lock); + list_lock); } return; } @@ -2831,9 +2865,9 @@ EXPORT_SYMBOL(kmem_cache_alloc); */ int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr) { - unsigned long addr = (unsigned long) ptr; + unsigned long addr = (unsigned long)ptr; unsigned long min_addr = PAGE_OFFSET; - unsigned long align_mask = BYTES_PER_WORD-1; + unsigned long align_mask = BYTES_PER_WORD - 1; unsigned long size = cachep->objsize; struct page *page; @@ -2853,7 +2887,7 @@ int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr) if (unlikely(page_get_cache(page) != cachep)) goto out; return 1; -out: + out: return 0; } @@ -2880,8 +2914,10 @@ void *kmem_cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) if (unlikely(!cachep->nodelists[nodeid])) { /* Fall back to __cache_alloc if we run into trouble */ - printk(KERN_WARNING "slab: not allocating in inactive node %d for cache %s\n", nodeid, cachep->name); - return __cache_alloc(cachep,flags); + printk(KERN_WARNING + "slab: not allocating in inactive node %d for cache %s\n", + nodeid, cachep->name); + return __cache_alloc(cachep, flags); } 
cache_alloc_debugcheck_before(cachep, flags); @@ -2891,7 +2927,9 @@ void *kmem_cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) else ptr = __cache_alloc_node(cachep, flags, nodeid); local_irq_restore(save_flags); - ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, __builtin_return_address(0)); + ptr = + cache_alloc_debugcheck_after(cachep, flags, ptr, + __builtin_return_address(0)); return ptr; } @@ -2957,7 +2995,7 @@ EXPORT_SYMBOL(__kmalloc); void *__alloc_percpu(size_t size) { int i; - struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL); + struct percpu_data *pdata = kmalloc(sizeof(*pdata), GFP_KERNEL); if (!pdata) return NULL; @@ -2981,9 +3019,9 @@ void *__alloc_percpu(size_t size) } /* Catch derefs w/o wrappers */ - return (void *) (~(unsigned long) pdata); + return (void *)(~(unsigned long)pdata); -unwind_oom: + unwind_oom: while (--i >= 0) { if (!cpu_possible(i)) continue; @@ -3046,7 +3084,7 @@ void kfree(const void *objp) local_irq_save(flags); kfree_debugcheck(objp); c = page_get_cache(virt_to_page(objp)); - __cache_free(c, (void*)objp); + __cache_free(c, (void *)objp); local_irq_restore(flags); } EXPORT_SYMBOL(kfree); @@ -3059,17 +3097,16 @@ EXPORT_SYMBOL(kfree); * Don't free memory not originally allocated by alloc_percpu() * The complemented objp is to check for that. */ -void -free_percpu(const void *objp) +void free_percpu(const void *objp) { int i; - struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp); + struct percpu_data *p = (struct percpu_data *)(~(unsigned long)objp); /* * We allocate for all cpus so we cannot use for online cpu here. */ for_each_cpu(i) - kfree(p->ptrs[i]); + kfree(p->ptrs[i]); kfree(p); } EXPORT_SYMBOL(free_percpu); @@ -3103,44 +3140,44 @@ static int alloc_kmemlist(kmem_cache_t *cachep) if (!(new_alien = alloc_alien_cache(node, cachep->limit))) goto fail; #endif - if (!(new = alloc_arraycache(node, (cachep->shared* - cachep->batchcount), 0xbaadf00d))) + if (!(new = alloc_arraycache(node, (cachep->shared * + cachep->batchcount), + 0xbaadf00d))) goto fail; if ((l3 = cachep->nodelists[node])) { spin_lock_irq(&l3->list_lock); if ((nc = cachep->nodelists[node]->shared)) - free_block(cachep, nc->entry, - nc->avail, node); + free_block(cachep, nc->entry, nc->avail, node); l3->shared = new; if (!cachep->nodelists[node]->alien) { l3->alien = new_alien; new_alien = NULL; } - l3->free_limit = (1 + nr_cpus_node(node))* - cachep->batchcount + cachep->num; + l3->free_limit = (1 + nr_cpus_node(node)) * + cachep->batchcount + cachep->num; spin_unlock_irq(&l3->list_lock); kfree(nc); free_alien_cache(new_alien); continue; } if (!(l3 = kmalloc_node(sizeof(struct kmem_list3), - GFP_KERNEL, node))) + GFP_KERNEL, node))) goto fail; kmem_list3_init(l3); l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + - ((unsigned long)cachep)%REAPTIMEOUT_LIST3; + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; l3->shared = new; l3->alien = new_alien; - l3->free_limit = (1 + nr_cpus_node(node))* - cachep->batchcount + cachep->num; + l3->free_limit = (1 + nr_cpus_node(node)) * + cachep->batchcount + cachep->num; cachep->nodelists[node] = l3; } return err; -fail: + fail: err = -ENOMEM; return err; } @@ -3162,18 +3199,19 @@ static void do_ccupdate_local(void *info) new->new[smp_processor_id()] = old; } - static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount, - int shared) + int shared) { struct ccupdate_struct new; int i, err; - memset(&new.new,0,sizeof(new.new)); + memset(&new.new, 0, sizeof(new.new)); for_each_online_cpu(i) { 
- new.new[i] = alloc_arraycache(cpu_to_node(i), limit, batchcount); + new.new[i] = + alloc_arraycache(cpu_to_node(i), limit, batchcount); if (!new.new[i]) { - for (i--; i >= 0; i--) kfree(new.new[i]); + for (i--; i >= 0; i--) + kfree(new.new[i]); return -ENOMEM; } } @@ -3201,13 +3239,12 @@ static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount, err = alloc_kmemlist(cachep); if (err) { printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n", - cachep->name, -err); + cachep->name, -err); BUG(); } return 0; } - static void enable_cpucache(kmem_cache_t *cachep) { int err; @@ -3254,14 +3291,14 @@ static void enable_cpucache(kmem_cache_t *cachep) if (limit > 32) limit = 32; #endif - err = do_tune_cpucache(cachep, limit, (limit+1)/2, shared); + err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared); if (err) printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", - cachep->name, -err); + cachep->name, -err); } -static void drain_array_locked(kmem_cache_t *cachep, - struct array_cache *ac, int force, int node) +static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac, + int force, int node) { int tofree; @@ -3269,14 +3306,14 @@ static void drain_array_locked(kmem_cache_t *cachep, if (ac->touched && !force) { ac->touched = 0; } else if (ac->avail) { - tofree = force ? ac->avail : (ac->limit+4)/5; + tofree = force ? ac->avail : (ac->limit + 4) / 5; if (tofree > ac->avail) { - tofree = (ac->avail+1)/2; + tofree = (ac->avail + 1) / 2; } free_block(cachep, ac->entry, tofree, node); ac->avail -= tofree; memmove(ac->entry, &(ac->entry[tofree]), - sizeof(void*)*ac->avail); + sizeof(void *) * ac->avail); } } @@ -3299,13 +3336,14 @@ static void cache_reap(void *unused) if (down_trylock(&cache_chain_sem)) { /* Give up. Setup the next iteration. */ - schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); + schedule_delayed_work(&__get_cpu_var(reap_work), + REAPTIMEOUT_CPUC); return; } list_for_each(walk, &cache_chain) { kmem_cache_t *searchp; - struct list_head* p; + struct list_head *p; int tofree; struct slab *slabp; @@ -3322,7 +3360,7 @@ static void cache_reap(void *unused) spin_lock_irq(&l3->list_lock); drain_array_locked(searchp, ac_data(searchp), 0, - numa_node_id()); + numa_node_id()); if (time_after(l3->next_reap, jiffies)) goto next_unlock; @@ -3331,14 +3369,16 @@ static void cache_reap(void *unused) if (l3->shared) drain_array_locked(searchp, l3->shared, 0, - numa_node_id()); + numa_node_id()); if (l3->free_touched) { l3->free_touched = 0; goto next_unlock; } - tofree = (l3->free_limit+5*searchp->num-1)/(5*searchp->num); + tofree = + (l3->free_limit + 5 * searchp->num - + 1) / (5 * searchp->num); do { p = l3->slabs_free.next; if (p == &(l3->slabs_free)) @@ -3358,10 +3398,10 @@ static void cache_reap(void *unused) spin_unlock_irq(&l3->list_lock); slab_destroy(searchp, slabp); spin_lock_irq(&l3->list_lock); - } while(--tofree > 0); -next_unlock: + } while (--tofree > 0); + next_unlock: spin_unlock_irq(&l3->list_lock); -next: + next: cond_resched(); } check_irq_on(); @@ -3418,7 +3458,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos) kmem_cache_t *cachep = p; ++*pos; return cachep->next.next == &cache_chain ? 
NULL - : list_entry(cachep->next.next, kmem_cache_t, next); + : list_entry(cachep->next.next, kmem_cache_t, next); } static void s_stop(struct seq_file *m, void *p) @@ -3430,11 +3470,11 @@ static int s_show(struct seq_file *m, void *p) { kmem_cache_t *cachep = p; struct list_head *q; - struct slab *slabp; - unsigned long active_objs; - unsigned long num_objs; - unsigned long active_slabs = 0; - unsigned long num_slabs, free_objects = 0, shared_avail = 0; + struct slab *slabp; + unsigned long active_objs; + unsigned long num_objs; + unsigned long active_slabs = 0; + unsigned long num_slabs, free_objects = 0, shared_avail = 0; const char *name; char *error = NULL; int node; @@ -3451,14 +3491,14 @@ static int s_show(struct seq_file *m, void *p) spin_lock(&l3->list_lock); - list_for_each(q,&l3->slabs_full) { + list_for_each(q, &l3->slabs_full) { slabp = list_entry(q, struct slab, list); if (slabp->inuse != cachep->num && !error) error = "slabs_full accounting error"; active_objs += cachep->num; active_slabs++; } - list_for_each(q,&l3->slabs_partial) { + list_for_each(q, &l3->slabs_partial) { slabp = list_entry(q, struct slab, list); if (slabp->inuse == cachep->num && !error) error = "slabs_partial inuse accounting error"; @@ -3467,7 +3507,7 @@ static int s_show(struct seq_file *m, void *p) active_objs += slabp->inuse; active_slabs++; } - list_for_each(q,&l3->slabs_free) { + list_for_each(q, &l3->slabs_free) { slabp = list_entry(q, struct slab, list); if (slabp->inuse && !error) error = "slabs_free/inuse accounting error"; @@ -3478,25 +3518,24 @@ static int s_show(struct seq_file *m, void *p) spin_unlock(&l3->list_lock); } - num_slabs+=active_slabs; - num_objs = num_slabs*cachep->num; + num_slabs += active_slabs; + num_objs = num_slabs * cachep->num; if (num_objs - active_objs != free_objects && !error) error = "free_objects accounting error"; - name = cachep->name; + name = cachep->name; if (error) printk(KERN_ERR "slab: cache %s error: %s\n", name, error); seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", - name, active_objs, num_objs, cachep->objsize, - cachep->num, (1<gfporder)); + name, active_objs, num_objs, cachep->objsize, + cachep->num, (1 << cachep->gfporder)); seq_printf(m, " : tunables %4u %4u %4u", - cachep->limit, cachep->batchcount, - cachep->shared); + cachep->limit, cachep->batchcount, cachep->shared); seq_printf(m, " : slabdata %6lu %6lu %6lu", - active_slabs, num_slabs, shared_avail); + active_slabs, num_slabs, shared_avail); #if STATS - { /* list3 stats */ + { /* list3 stats */ unsigned long high = cachep->high_mark; unsigned long allocs = cachep->num_allocations; unsigned long grown = cachep->grown; @@ -3507,9 +3546,7 @@ static int s_show(struct seq_file *m, void *p) unsigned long node_frees = cachep->node_frees; seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ - %4lu %4lu %4lu %4lu", - allocs, high, grown, reaped, errors, - max_freeable, node_allocs, node_frees); + %4lu %4lu %4lu %4lu", allocs, high, grown, reaped, errors, max_freeable, node_allocs, node_frees); } /* cpu stats */ { @@ -3519,7 +3556,7 @@ static int s_show(struct seq_file *m, void *p) unsigned long freemiss = atomic_read(&cachep->freemiss); seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu", - allochit, allocmiss, freehit, freemiss); + allochit, allocmiss, freehit, freemiss); } #endif seq_putc(m, '\n'); @@ -3542,10 +3579,10 @@ static int s_show(struct seq_file *m, void *p) */ struct seq_operations slabinfo_op = { - .start = s_start, - .next = s_next, - .stop = s_stop, - .show = s_show, + .start = s_start, + .next = 
s_next, + .stop = s_stop, + .show = s_show, }; #define MAX_SLABINFO_WRITE 128 @@ -3556,18 +3593,18 @@ struct seq_operations slabinfo_op = { * @count: data length * @ppos: unused */ -ssize_t slabinfo_write(struct file *file, const char __user *buffer, - size_t count, loff_t *ppos) +ssize_t slabinfo_write(struct file *file, const char __user * buffer, + size_t count, loff_t *ppos) { - char kbuf[MAX_SLABINFO_WRITE+1], *tmp; + char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; int limit, batchcount, shared, res; struct list_head *p; - + if (count > MAX_SLABINFO_WRITE) return -EINVAL; if (copy_from_user(&kbuf, buffer, count)) return -EFAULT; - kbuf[MAX_SLABINFO_WRITE] = '\0'; + kbuf[MAX_SLABINFO_WRITE] = '\0'; tmp = strchr(kbuf, ' '); if (!tmp) @@ -3580,18 +3617,17 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, /* Find the cache in the chain of caches. */ down(&cache_chain_sem); res = -EINVAL; - list_for_each(p,&cache_chain) { + list_for_each(p, &cache_chain) { kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next); if (!strcmp(cachep->name, kbuf)) { if (limit < 1 || batchcount < 1 || - batchcount > limit || - shared < 0) { + batchcount > limit || shared < 0) { res = 0; } else { res = do_tune_cpucache(cachep, limit, - batchcount, shared); + batchcount, shared); } break; } -- cgit v1.2.3 From bec6b0c89b234090681a4516e20ac5debe3e7c59 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:00:38 -0800 Subject: [PATCH] slab: remove nested #ifdef CONFIG_NUMA For some reason there is an #ifdef CONFIG_NUMA within another #ifdef CONFIG_NUMA in the page allocator. Remove innermost #ifdef CONFIG_NUMA Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0b98f428b07..5eeeadd9f66 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1883,7 +1883,6 @@ bad: static inline void free_zone_pagesets(int cpu) { -#ifdef CONFIG_NUMA struct zone *zone; for_each_zone(zone) { @@ -1892,7 +1891,6 @@ static inline void free_zone_pagesets(int cpu) zone_pcp(zone, cpu) = NULL; kfree(pset); } -#endif } static int __devinit pageset_cpuup_callback(struct notifier_block *nfb, -- cgit v1.2.3 From 9d0243bca345d5ce25d3f4b74b7facb3a6df1232 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 8 Jan 2006 01:00:39 -0800 Subject: [PATCH] drop-pagecache Add /proc/sys/vm/drop_caches. When written to, this will cause the kernel to discard as much pagecache and/or reclaimable slab objects as it can. THis operation requires root permissions. It won't drop dirty data, so the user should run `sync' first. Caveats: a) Holds inode_lock for exorbitant amounts of time. b) Needs to be taught about NUMA nodes: propagate these all the way through so the discarding can be controlled on a per-node basis. This is a debugging feature: useful for getting consistent results between filesystem benchmarks. We could possibly put it under a config option, but it's less than 300 bytes. 
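As an aside (not part of this patch), here is a minimal userspace sketch of how the new knob could be exercised once merged; the numeric value written is an assumption, since the text above only says that pagecache and/or reclaimable slab may be discarded:

/* Hypothetical usage sketch; the value "1" is assumed to select pagecache. */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	FILE *f;

	sync();			/* drop_caches won't drop dirty data, so write it back first */
	f = fopen("/proc/sys/vm/drop_caches", "w");
	if (!f) {
		perror("/proc/sys/vm/drop_caches");	/* requires root */
		return 1;
	}
	fputs("1\n", f);	/* assumed: 1 = drop pagecache (see commit text above) */
	fclose(f);
	return 0;
}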
Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/truncate.c | 1 - mm/vmscan.c | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/truncate.c b/mm/truncate.c index 7dee3274590..b1a463d0fe7 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -249,7 +249,6 @@ unlock: break; } pagevec_release(&pvec); - cond_resched(); } return ret; } diff --git a/mm/vmscan.c b/mm/vmscan.c index be8235fb193..428c5801d4b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -180,8 +180,7 @@ EXPORT_SYMBOL(remove_shrinker); * * Returns the number of slab objects which we shrunk. */ -static int shrink_slab(unsigned long scanned, gfp_t gfp_mask, - unsigned long lru_pages) +int shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages) { struct shrinker *shrinker; int ret = 0; -- cgit v1.2.3 From 8ad4b1fb8205340dba16b63467bb23efc27264d6 Mon Sep 17 00:00:00 2001 From: Rohit Seth Date: Sun, 8 Jan 2006 01:00:40 -0800 Subject: [PATCH] Make high and batch sizes of per_cpu_pagelists configurable As recently there has been lot of traffic on the right values for batch and high water marks for per_cpu_pagelists. This patch makes these two variables configurable through /proc interface. A new tunable /proc/sys/vm/percpu_pagelist_fraction is added. This entry controls the fraction of pages at most in each zone that are allocated for each per cpu page list. The min value for this is 8. It means that we don't allow more than 1/8th of pages in each zone to be allocated in any single per_cpu_pagelist. The batch value of each per cpu pagelist is also updated as a result. It is set to pcp->high/4. The upper limit of batch is (PAGE_SHIFT * 8) Signed-off-by: Rohit Seth Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5eeeadd9f66..2c46f697e8f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -53,6 +53,7 @@ struct pglist_data *pgdat_list __read_mostly; unsigned long totalram_pages __read_mostly; unsigned long totalhigh_pages __read_mostly; long nr_swap_pages; +int percpu_pagelist_fraction; static void fastcall free_hot_cold_page(struct page *page, int cold); @@ -1831,6 +1832,24 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) INIT_LIST_HEAD(&pcp->list); } +/* + * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist + * to the value high for the pageset p. + */ + +static void setup_pagelist_highmark(struct per_cpu_pageset *p, + unsigned long high) +{ + struct per_cpu_pages *pcp; + + pcp = &p->pcp[0]; /* hot list */ + pcp->high = high; + pcp->batch = max(1UL, high/4); + if ((high/4) > (PAGE_SHIFT * 8)) + pcp->batch = PAGE_SHIFT * 8; +} + + #ifdef CONFIG_NUMA /* * Boot pageset table. One per cpu which is going to be used for all @@ -1868,6 +1887,10 @@ static int __devinit process_zones(int cpu) goto bad; setup_pageset(zone->pageset[cpu], zone_batchsize(zone)); + + if (percpu_pagelist_fraction) + setup_pagelist_highmark(zone_pcp(zone, cpu), + (zone->present_pages / percpu_pagelist_fraction)); } return 0; @@ -2567,6 +2590,32 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, return 0; } +/* + * percpu_pagelist_fraction - changes the pcp->high for each zone on each + * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist + * can have before it gets flushed back to buddy allocator. 
+ */ + +int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) +{ + struct zone *zone; + unsigned int cpu; + int ret; + + ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); + if (!write || (ret == -EINVAL)) + return ret; + for_each_zone(zone) { + for_each_online_cpu(cpu) { + unsigned long high; + high = zone->present_pages / percpu_pagelist_fraction; + setup_pagelist_highmark(zone_pcp(zone, cpu), high); + } + } + return 0; +} + __initdata int hashdist = HASHDIST_DEFAULT; #ifdef CONFIG_NUMA -- cgit v1.2.3 From 23316bc86fd31c5d644a71c398ec41d9fecacec4 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sun, 8 Jan 2006 01:00:41 -0800 Subject: [PATCH] mm: cleanup zone_pcp Use zone_pcp everywhere even though NUMA code "knows" the internal details of the zone. Stop other people trying to copy, and it looks nicer. Also, only print the pagesets of online cpus in zoneinfo. Signed-off-by: Nick Piggin Cc: "Seth, Rohit" Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2c46f697e8f..6b92a945ae6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -597,7 +597,7 @@ void drain_remote_pages(void) if (zone->zone_pgdat->node_id == numa_node_id()) continue; - pset = zone->pageset[smp_processor_id()]; + pset = zone_pcp(zone, smp_processor_id()); for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { struct per_cpu_pages *pcp; @@ -1881,12 +1881,12 @@ static int __devinit process_zones(int cpu) for_each_zone(zone) { - zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset), + zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), GFP_KERNEL, cpu_to_node(cpu)); - if (!zone->pageset[cpu]) + if (!zone_pcp(zone, cpu)) goto bad; - setup_pageset(zone->pageset[cpu], zone_batchsize(zone)); + setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); if (percpu_pagelist_fraction) setup_pagelist_highmark(zone_pcp(zone, cpu), @@ -1898,8 +1898,8 @@ bad: for_each_zone(dzone) { if (dzone == zone) break; - kfree(dzone->pageset[cpu]); - dzone->pageset[cpu] = NULL; + kfree(zone_pcp(dzone, cpu)); + zone_pcp(dzone, cpu) = NULL; } return -ENOMEM; } @@ -1984,7 +1984,7 @@ static __devinit void zone_pcp_init(struct zone *zone) for (cpu = 0; cpu < NR_CPUS; cpu++) { #ifdef CONFIG_NUMA /* Early boot. Slab allocator not functional yet */ - zone->pageset[cpu] = &boot_pageset[cpu]; + zone_pcp(zone, cpu) = &boot_pageset[cpu]; setup_pageset(&boot_pageset[cpu],0); #else setup_pageset(zone_pcp(zone,cpu), batch); @@ -2227,7 +2227,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg) seq_printf(m, ")" "\n pagesets"); - for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) { + for_each_online_cpu(i) { struct per_cpu_pageset *pageset; int j; -- cgit v1.2.3 From 48db57f8ff10eb09ab887ccb6150b0da0c7be24e Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sun, 8 Jan 2006 01:00:42 -0800 Subject: [PATCH] mm: free_pages opt Try to streamline free_pages_bulk by ensuring callers don't pass in a 'count' that exceeds the list size. Some cleanups: Rename __free_pages_bulk to __free_one_page. Put the page list manipulation from __free_pages_ok into free_one_page. Make __free_pages_ok static. 
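As an illustration (not part of this patch), a hedged sketch of the caller contract after the change, mirroring the per-cpu drain hunks below: free_pages_bulk() no longer returns a count, so the caller guarantees 'count' never exceeds the number of pages on the list and resets its own bookkeeping. The function name is illustrative only.

/* Sketch of a caller under the new contract; drain_pcp_sketch() is not in the patch. */
static void drain_pcp_sketch(struct zone *zone, struct per_cpu_pages *pcp)
{
	/* every page counted in pcp->count is known to sit on pcp->list */
	free_pages_bulk(zone, pcp->count, &pcp->list, 0);
	pcp->count = 0;		/* the caller, not the callee, resets the counter */
}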
Signed-off-by: Nick Piggin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 58 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 30 insertions(+), 28 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6b92a945ae6..ad3d0202cde 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -308,7 +308,7 @@ static inline int page_is_buddy(struct page *page, int order) * -- wli */ -static inline void __free_pages_bulk (struct page *page, +static inline void __free_one_page(struct page *page, struct zone *zone, unsigned int order) { unsigned long page_idx; @@ -383,40 +383,42 @@ static inline int free_pages_check(struct page *page) * And clear the zone's pages_scanned counter, to hold off the "all pages are * pinned" detection logic. */ -static int -free_pages_bulk(struct zone *zone, int count, - struct list_head *list, unsigned int order) +static void free_pages_bulk(struct zone *zone, int count, + struct list_head *list, int order) { - struct page *page = NULL; - int ret = 0; - spin_lock(&zone->lock); zone->all_unreclaimable = 0; zone->pages_scanned = 0; - while (!list_empty(list) && count--) { + while (count--) { + struct page *page; + + BUG_ON(list_empty(list)); page = list_entry(list->prev, struct page, lru); - /* have to delete it as __free_pages_bulk list manipulates */ + /* have to delete it as __free_one_page list manipulates */ list_del(&page->lru); - __free_pages_bulk(page, zone, order); - ret++; + __free_one_page(page, zone, order); } spin_unlock(&zone->lock); - return ret; } -void __free_pages_ok(struct page *page, unsigned int order) +static void free_one_page(struct zone *zone, struct page *page, int order) { - unsigned long flags; LIST_HEAD(list); + list_add(&page->lru, &list); + free_pages_bulk(zone, 1, &list, order); +} + +static void __free_pages_ok(struct page *page, unsigned int order) +{ + unsigned long flags; int i; int reserved = 0; arch_free_page(page, order); #ifndef CONFIG_MMU - if (order > 0) - for (i = 1 ; i < (1 << order) ; ++i) - __put_page(page + i); + for (i = 1 ; i < (1 << order) ; ++i) + __put_page(page + i); #endif for (i = 0 ; i < (1 << order) ; ++i) @@ -424,11 +426,10 @@ void __free_pages_ok(struct page *page, unsigned int order) if (reserved) return; - list_add(&page->lru, &list); - kernel_map_pages(page, 1<pcp[i]; - if (pcp->count) - pcp->count -= free_pages_bulk(zone, pcp->count, - &pcp->list, 0); + free_pages_bulk(zone, pcp->count, &pcp->list, 0); + pcp->count = 0; } } local_irq_restore(flags); @@ -627,8 +627,8 @@ static void __drain_pages(unsigned int cpu) pcp = &pset->pcp[i]; local_irq_save(flags); - pcp->count -= free_pages_bulk(zone, pcp->count, - &pcp->list, 0); + free_pages_bulk(zone, pcp->count, &pcp->list, 0); + pcp->count = 0; local_irq_restore(flags); } } @@ -719,8 +719,10 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) __inc_page_state(pgfree); list_add(&page->lru, &pcp->list); pcp->count++; - if (pcp->count >= pcp->high) - pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); + if (pcp->count >= pcp->high) { + free_pages_bulk(zone, pcp->batch, &pcp->list, 0); + pcp->count -= pcp->batch; + } local_irq_restore(flags); put_cpu(); } @@ -759,7 +761,7 @@ static struct page *buffered_rmqueue(struct zonelist *zonelist, again: cpu = get_cpu(); - if (order == 0) { + if (likely(order == 0)) { struct per_cpu_pages *pcp; pcp = &zone_pcp(zone, cpu)->pcp[cold]; -- cgit v1.2.3 From 21eac81f252fe31c3cf64b805a1e8652192f3a3b Mon Sep 
17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:00:45 -0800 Subject: [PATCH] Swap Migration V5: LRU operations

This is the start of the `swap migration' patch series. Swap migration allows the moving of the physical location of pages between nodes in a NUMA system while the process is running. This means that the virtual addresses that the process sees do not change. However, the system rearranges the physical location of those pages.

The main intent of the page migration patches here is to reduce the latency of memory access by moving pages near to the processor where the process accessing that memory is running.

The patchset allows a process to manually relocate the node on which its pages are located through the MF_MOVE and MF_MOVE_ALL options while setting a new memory policy. The pages of a process can also be relocated from another process using the sys_migrate_pages() function call. Requires CAP_SYS_ADMIN. The migrate_pages function call takes two sets of nodes and moves pages of a process that are located on the from nodes to the destination nodes.

Manual migration is very useful if, for example, the scheduler has relocated a process to a processor on a distant node. A batch scheduler or an administrator can detect the situation and move the pages of the process nearer to the new processor. sys_migrate_pages() could be used on non-NUMA machines as well, to force all of a particular process's pages out to swap, if someone thinks that's useful.

Larger installations usually partition the system using cpusets into sections of nodes. Paul has equipped cpusets with the ability to move pages when a task is moved to another cpuset. This allows automatic control over the locality of a process. If a task is moved to a new cpuset then all its pages are moved with it as well, so that the performance of the process does not sink dramatically (as is the case today).

Swap migration works by simply evicting the page. The pages must be faulted back in. The pages are then typically reallocated by the system near the node where the process is executing.

For swap migration the destination of the move is controlled by the allocation policy. Cpusets set the allocation policy before calling sys_migrate_pages() in order to move the pages as intended. No allocation policy changes are performed for sys_migrate_pages(). This means that the pages may not be faulted in to the specified nodes if no allocation policy was set by other means. The pages will just end up near the node where the fault occurred.

There's another patch series in the pipeline which implements "direct migration". The direct migration patchset extends the migration functionality to avoid going through swap. The destination node of the relocation is controllable during the actual moving of pages. The crutch of using the allocation policy to relocate is not necessary and the pages are moved directly to the target. It's also faster since swap is not used. And sys_migrate_pages() can then move pages directly to the specified node.

Implement functions to isolate pages from the LRU and put them back later.

This patch:

An earlier implementation was provided by Hirokazu Takahashi and IWAMOTO Toshihiro for the memory hotplug project.

From: Magnus

This breaks out isolate_lru_page() and putback_lru_pages(). Needed for swap migration.
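As an illustration (not part of this patch), a hedged sketch of how the two helpers are intended to be used together; consume_isolated_pages() is a placeholder for whatever processes the isolated pages (the later patches in this series use migrate_pages() here), and the pattern mirrors what mm/mempolicy.c does further down the series:

/* Hedged usage sketch; consume_isolated_pages() is a placeholder name. */
static void lru_isolation_sketch(struct page *page)
{
	LIST_HEAD(pagelist);

	if (isolate_lru_page(page) == 1)	/* 1: page removed from the LRU */
		list_add(&page->lru, &pagelist);

	consume_isolated_pages(&pagelist);	/* e.g. swap out or migrate */

	putback_lru_pages(&pagelist);		/* return what is left to the LRU */
}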
Signed-off-by: Magnus Damm Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 100 ++++++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 87 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 428c5801d4b..261a56ee11b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -593,20 +593,18 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src, page = lru_to_page(src); prefetchw_prev_lru_page(page, src, flags); - if (!TestClearPageLRU(page)) - BUG(); - list_del(&page->lru); - if (get_page_testone(page)) { - /* - * It is being freed elsewhere - */ - __put_page(page); - SetPageLRU(page); - list_add(&page->lru, src); - continue; - } else { - list_add(&page->lru, dst); + switch (__isolate_lru_page(page)) { + case 1: + /* Succeeded to isolate page */ + list_move(&page->lru, dst); nr_taken++; + break; + case -ENOENT: + /* Not possible to isolate */ + list_move(&page->lru, src); + break; + default: + BUG(); } } @@ -614,6 +612,48 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src, return nr_taken; } +static void lru_add_drain_per_cpu(void *dummy) +{ + lru_add_drain(); +} + +/* + * Isolate one page from the LRU lists and put it on the + * indicated list. Do necessary cache draining if the + * page is not on the LRU lists yet. + * + * Result: + * 0 = page not on LRU list + * 1 = page removed from LRU list and added to the specified list. + * -ENOENT = page is being freed elsewhere. + */ +int isolate_lru_page(struct page *page) +{ + int rc = 0; + struct zone *zone = page_zone(page); + +redo: + spin_lock_irq(&zone->lru_lock); + rc = __isolate_lru_page(page); + if (rc == 1) { + if (PageActive(page)) + del_page_from_active_list(zone, page); + else + del_page_from_inactive_list(zone, page); + } + spin_unlock_irq(&zone->lru_lock); + if (rc == 0) { + /* + * Maybe this page is still waiting for a cpu to drain it + * from one of the lru lists? + */ + rc = schedule_on_each_cpu(lru_add_drain_per_cpu, NULL); + if (rc == 0 && PageLRU(page)) + goto redo; + } + return rc; +} + /* * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed */ @@ -679,6 +719,40 @@ done: pagevec_release(&pvec); } +static inline void move_to_lru(struct page *page) +{ + list_del(&page->lru); + if (PageActive(page)) { + /* + * lru_cache_add_active checks that + * the PG_active bit is off. + */ + ClearPageActive(page); + lru_cache_add_active(page); + } else { + lru_cache_add(page); + } + put_page(page); +} + +/* + * Add isolated pages on the list back to the LRU + * + * returns the number of pages put back. + */ +int putback_lru_pages(struct list_head *l) +{ + struct page *page; + struct page *page2; + int count = 0; + + list_for_each_entry_safe(page, page2, l, lru) { + move_to_lru(page); + count++; + } + return count; +} + /* * This moves pages from the active list to the inactive list. * -- cgit v1.2.3 From 930d915252edda7042c944ed3c30194a2f9fe163 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:00:47 -0800 Subject: [PATCH] Swap Migration V5: PF_SWAPWRITE to allow writing to swap Add PF_SWAPWRITE to control a processes permission to write to swap. 
- Use PF_SWAPWRITE in may_write_to_queue() instead of checking for kswapd and pdflush - Set PF_SWAPWRITE flag for kswapd and pdflush Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/pdflush.c | 2 +- mm/vmscan.c | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/pdflush.c b/mm/pdflush.c index 52822c98c48..c4b6d0afd73 100644 --- a/mm/pdflush.c +++ b/mm/pdflush.c @@ -90,7 +90,7 @@ struct pdflush_work { static int __pdflush(struct pdflush_work *my_work) { - current->flags |= PF_FLUSHER; + current->flags |= PF_FLUSHER | PF_SWAPWRITE; my_work->fn = NULL; my_work->who = current; INIT_LIST_HEAD(&my_work->list); diff --git a/mm/vmscan.c b/mm/vmscan.c index 261a56ee11b..6c30a8c5979 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -268,9 +268,7 @@ static inline int is_page_cache_freeable(struct page *page) static int may_write_to_queue(struct backing_dev_info *bdi) { - if (current_is_kswapd()) - return 1; - if (current_is_pdflush()) /* This is unlikely, but why not... */ + if (current->flags & PF_SWAPWRITE) return 1; if (!bdi_write_congested(bdi)) return 1; @@ -1299,7 +1297,7 @@ static int kswapd(void *p) * us from recursively trying to free more memory as we're * trying to free the first piece of memory in the first place). */ - tsk->flags |= PF_MEMALLOC|PF_KSWAPD; + tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; order = 0; for ( ; ; ) { -- cgit v1.2.3 From 49d2e9cc4544369635cd6f4ef6d5bb0f757079a7 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:00:48 -0800 Subject: [PATCH] Swap Migration V5: migrate_pages() function This adds the basic page migration function with a minimal implementation that only allows the eviction of pages to swap space. Page eviction and migration may be useful to migrate pages, to suspend programs or for remapping single pages (useful for faulty pages or pages with soft ECC failures) The process is as follows: The function wanting to migrate pages must first build a list of pages to be migrated or evicted and take them off the lru lists via isolate_lru_page(). isolate_lru_page determines that a page is freeable based on the LRU bit set. Then the actual migration or swapout can happen by calling migrate_pages(). migrate_pages does its best to migrate or swapout the pages and does multiple passes over the list. Some pages may only be swappable if they are not dirty. migrate_pages may start writing out dirty pages in the initial passes over the pages. However, migrate_pages may not be able to migrate or evict all pages for a variety of reasons. The remaining pages may be returned to the LRU lists using putback_lru_pages(). Changelog V4->V5: - Use the lru caches to return pages to the LRU Changelog V3->V4: - Restructure code so that applying patches to support full migration does require minimal changes. Rename swapout_pages() to migrate_pages(). 
Changelog V2->V3: - Extract common code from shrink_list() and swapout_pages() Signed-off-by: Mike Kravetz Signed-off-by: Christoph Lameter Cc: "Michael Kerrisk" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 214 ++++++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 180 insertions(+), 34 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 6c30a8c5979..a537a7f1635 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -373,6 +373,43 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) return PAGE_CLEAN; } +static int remove_mapping(struct address_space *mapping, struct page *page) +{ + if (!mapping) + return 0; /* truncate got there first */ + + write_lock_irq(&mapping->tree_lock); + + /* + * The non-racy check for busy page. It is critical to check + * PageDirty _after_ making sure that the page is freeable and + * not in use by anybody. (pagecache + us == 2) + */ + if (unlikely(page_count(page) != 2)) + goto cannot_free; + smp_rmb(); + if (unlikely(PageDirty(page))) + goto cannot_free; + + if (PageSwapCache(page)) { + swp_entry_t swap = { .val = page_private(page) }; + __delete_from_swap_cache(page); + write_unlock_irq(&mapping->tree_lock); + swap_free(swap); + __put_page(page); /* The pagecache ref */ + return 1; + } + + __remove_from_page_cache(page); + write_unlock_irq(&mapping->tree_lock); + __put_page(page); + return 1; + +cannot_free: + write_unlock_irq(&mapping->tree_lock); + return 0; +} + /* * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed */ @@ -504,36 +541,8 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) goto free_it; } - if (!mapping) - goto keep_locked; /* truncate got there first */ - - write_lock_irq(&mapping->tree_lock); - - /* - * The non-racy check for busy page. It is critical to check - * PageDirty _after_ making sure that the page is freeable and - * not in use by anybody. 
(pagecache + us == 2) - */ - if (unlikely(page_count(page) != 2)) - goto cannot_free; - smp_rmb(); - if (unlikely(PageDirty(page))) - goto cannot_free; - -#ifdef CONFIG_SWAP - if (PageSwapCache(page)) { - swp_entry_t swap = { .val = page_private(page) }; - __delete_from_swap_cache(page); - write_unlock_irq(&mapping->tree_lock); - swap_free(swap); - __put_page(page); /* The pagecache ref */ - goto free_it; - } -#endif /* CONFIG_SWAP */ - - __remove_from_page_cache(page); - write_unlock_irq(&mapping->tree_lock); - __put_page(page); + if (!remove_mapping(mapping, page)) + goto keep_locked; free_it: unlock_page(page); @@ -542,10 +551,6 @@ free_it: __pagevec_release_nonlru(&freed_pvec); continue; -cannot_free: - write_unlock_irq(&mapping->tree_lock); - goto keep_locked; - activate_locked: SetPageActive(page); pgactivate++; @@ -563,6 +568,147 @@ keep: return reclaimed; } +/* + * swapout a single page + * page is locked upon entry, unlocked on exit + * + * return codes: + * 0 = complete + * 1 = retry + */ +static int swap_page(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + + if (page_mapped(page) && mapping) + if (try_to_unmap(page) != SWAP_SUCCESS) + goto unlock_retry; + + if (PageDirty(page)) { + /* Page is dirty, try to write it out here */ + switch(pageout(page, mapping)) { + case PAGE_KEEP: + case PAGE_ACTIVATE: + goto unlock_retry; + + case PAGE_SUCCESS: + goto retry; + + case PAGE_CLEAN: + ; /* try to free the page below */ + } + } + + if (PagePrivate(page)) { + if (!try_to_release_page(page, GFP_KERNEL) || + (!mapping && page_count(page) == 1)) + goto unlock_retry; + } + + if (remove_mapping(mapping, page)) { + /* Success */ + unlock_page(page); + return 0; + } + +unlock_retry: + unlock_page(page); + +retry: + return 1; +} +/* + * migrate_pages + * + * Two lists are passed to this function. The first list + * contains the pages isolated from the LRU to be migrated. + * The second list contains new pages that the pages isolated + * can be moved to. If the second list is NULL then all + * pages are swapped out. + * + * The function returns after 10 attempts or if no pages + * are movable anymore because t has become empty + * or no retryable pages exist anymore. + * + * SIMPLIFIED VERSION: This implementation of migrate_pages + * is only swapping out pages and never touches the second + * list. The direct migration patchset + * extends this function to avoid the use of swap. + */ +int migrate_pages(struct list_head *l, struct list_head *t) +{ + int retry; + LIST_HEAD(failed); + int nr_failed = 0; + int pass = 0; + struct page *page; + struct page *page2; + int swapwrite = current->flags & PF_SWAPWRITE; + + if (!swapwrite) + current->flags |= PF_SWAPWRITE; + +redo: + retry = 0; + + list_for_each_entry_safe(page, page2, l, lru) { + cond_resched(); + + /* + * Skip locked pages during the first two passes to give the + * functions holding the lock time to release the page. Later we use + * lock_page to have a higher chance of acquiring the lock. + */ + if (pass > 2) + lock_page(page); + else + if (TestSetPageLocked(page)) + goto retry_later; + + /* + * Only wait on writeback if we have already done a pass where + * we we may have triggered writeouts for lots of pages. 
+ */ + if (pass > 0) + wait_on_page_writeback(page); + else + if (PageWriteback(page)) { + unlock_page(page); + goto retry_later; + } + +#ifdef CONFIG_SWAP + if (PageAnon(page) && !PageSwapCache(page)) { + if (!add_to_swap(page)) { + unlock_page(page); + list_move(&page->lru, &failed); + nr_failed++; + continue; + } + } +#endif /* CONFIG_SWAP */ + + /* + * Page is properly locked and writeback is complete. + * Try to migrate the page. + */ + if (swap_page(page)) { +retry_later: + retry++; + } + } + if (retry && pass++ < 10) + goto redo; + + if (!swapwrite) + current->flags &= ~PF_SWAPWRITE; + + if (!list_empty(&failed)) + list_splice(&failed, l); + + return nr_failed + retry; +} + /* * zone->lru_lock is heavily contended. Some of the functions that * shrink the lists perform better by taking out a batch of pages -- cgit v1.2.3 From 7cbe34cf86c673503b177ff47cfa2c7030dabb50 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:00:49 -0800 Subject: [PATCH] Swap Migration V5: Add CONFIG_MIGRATION for page migration support Include page migration if the system is NUMA or having a memory model that allows distinct areas of memory (SPARSEMEM, DISCONTIGMEM). And: - Only include lru_add_drain_per_cpu if building for an SMP system. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 7 +++++++ mm/vmscan.c | 20 +++++++++++--------- 2 files changed, 18 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index b3db11f137e..a9cb80ae640 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -132,3 +132,10 @@ config SPLIT_PTLOCK_CPUS default "4096" if ARM && !CPU_CACHE_VIPT default "4096" if PARISC && !PA20 default "4" + +# +# support for page migration +# +config MIGRATION + def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM + depends on SWAP diff --git a/mm/vmscan.c b/mm/vmscan.c index a537a7f1635..58270aea669 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -568,6 +568,7 @@ keep: return reclaimed; } +#ifdef CONFIG_MIGRATION /* * swapout a single page * page is locked upon entry, unlocked on exit @@ -656,8 +657,9 @@ redo: /* * Skip locked pages during the first two passes to give the - * functions holding the lock time to release the page. Later we use - * lock_page to have a higher chance of acquiring the lock. + * functions holding the lock time to release the page. Later we + * use lock_page() to have a higher chance of acquiring the + * lock. */ if (pass > 2) lock_page(page); @@ -669,15 +671,15 @@ redo: * Only wait on writeback if we have already done a pass where * we we may have triggered writeouts for lots of pages. */ - if (pass > 0) + if (pass > 0) { wait_on_page_writeback(page); - else + } else { if (PageWriteback(page)) { unlock_page(page); goto retry_later; } + } -#ifdef CONFIG_SWAP if (PageAnon(page) && !PageSwapCache(page)) { if (!add_to_swap(page)) { unlock_page(page); @@ -686,16 +688,15 @@ redo: continue; } } -#endif /* CONFIG_SWAP */ /* * Page is properly locked and writeback is complete. * Try to migrate the page. */ - if (swap_page(page)) { + if (!swap_page(page)) + continue; retry_later: - retry++; - } + retry++; } if (retry && pass++ < 10) goto redo; @@ -708,6 +709,7 @@ retry_later: return nr_failed + retry; } +#endif /* * zone->lru_lock is heavily contended. 
Some of the functions that -- cgit v1.2.3 From dc9aa5b9d65fd11b1f5246b46ec610ee8b83c6dd Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:00:50 -0800 Subject: [PATCH] Swap Migration V5: MPOL_MF_MOVE interface Add page migration support via swap to the NUMA policy layer This patch adds page migration support to the NUMA policy layer. An additional flag MPOL_MF_MOVE is introduced for mbind. If MPOL_MF_MOVE is specified then pages that do not conform to the memory policy will be evicted from memory. When they get pages back in new pages will be allocated following the numa policy. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 155 +++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 135 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0f1d2b8a952..9cc6d962831 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -83,9 +83,14 @@ #include #include #include +#include + #include #include +/* Internal MPOL_MF_xxx flags */ +#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ + static kmem_cache_t *policy_cache; static kmem_cache_t *sn_cache; @@ -174,9 +179,59 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) return policy; } +/* Check if we are the only process mapping the page in question */ +static inline int single_mm_mapping(struct mm_struct *mm, + struct address_space *mapping) +{ + struct vm_area_struct *vma; + struct prio_tree_iter iter; + int rc = 1; + + spin_lock(&mapping->i_mmap_lock); + vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) + if (mm != vma->vm_mm) { + rc = 0; + goto out; + } + list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) + if (mm != vma->vm_mm) { + rc = 0; + goto out; + } +out: + spin_unlock(&mapping->i_mmap_lock); + return rc; +} + +/* + * Add a page to be migrated to the pagelist + */ +static void migrate_page_add(struct vm_area_struct *vma, + struct page *page, struct list_head *pagelist, unsigned long flags) +{ + /* + * Avoid migrating a page that is shared by others and not writable. + */ + if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) || + mapping_writably_mapped(page->mapping) || + single_mm_mapping(vma->vm_mm, page->mapping)) { + int rc = isolate_lru_page(page); + + if (rc == 1) + list_add(&page->lru, pagelist); + /* + * If the isolate attempt was not successful then we just + * encountered an unswappable page. Something must be wrong. + */ + WARN_ON(rc == 0); + } +} + /* Ensure all existing pages follow the policy. 
*/ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, unsigned long end, nodemask_t *nodes) + unsigned long addr, unsigned long end, + const nodemask_t *nodes, unsigned long flags, + struct list_head *pagelist) { pte_t *orig_pte; pte_t *pte; @@ -193,15 +248,21 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (!page) continue; nid = page_to_nid(page); - if (!node_isset(nid, *nodes)) - break; + if (!node_isset(nid, *nodes)) { + if (pagelist) + migrate_page_add(vma, page, pagelist, flags); + else + break; + } } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap_unlock(orig_pte, ptl); return addr != end; } static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, - unsigned long addr, unsigned long end, nodemask_t *nodes) + unsigned long addr, unsigned long end, + const nodemask_t *nodes, unsigned long flags, + struct list_head *pagelist) { pmd_t *pmd; unsigned long next; @@ -211,14 +272,17 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; - if (check_pte_range(vma, pmd, addr, next, nodes)) + if (check_pte_range(vma, pmd, addr, next, nodes, + flags, pagelist)) return -EIO; } while (pmd++, addr = next, addr != end); return 0; } static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, - unsigned long addr, unsigned long end, nodemask_t *nodes) + unsigned long addr, unsigned long end, + const nodemask_t *nodes, unsigned long flags, + struct list_head *pagelist) { pud_t *pud; unsigned long next; @@ -228,14 +292,17 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - if (check_pmd_range(vma, pud, addr, next, nodes)) + if (check_pmd_range(vma, pud, addr, next, nodes, + flags, pagelist)) return -EIO; } while (pud++, addr = next, addr != end); return 0; } static inline int check_pgd_range(struct vm_area_struct *vma, - unsigned long addr, unsigned long end, nodemask_t *nodes) + unsigned long addr, unsigned long end, + const nodemask_t *nodes, unsigned long flags, + struct list_head *pagelist) { pgd_t *pgd; unsigned long next; @@ -245,16 +312,31 @@ static inline int check_pgd_range(struct vm_area_struct *vma, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - if (check_pud_range(vma, pgd, addr, next, nodes)) + if (check_pud_range(vma, pgd, addr, next, nodes, + flags, pagelist)) return -EIO; } while (pgd++, addr = next, addr != end); return 0; } -/* Step 1: check the range */ +/* Check if a vma is migratable */ +static inline int vma_migratable(struct vm_area_struct *vma) +{ + if (vma->vm_flags & ( + VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP)) + return 0; + return 1; +} + +/* + * Check if all pages in a range are on a set of nodes. + * If pagelist != NULL then isolate pages from the LRU and + * put them on the pagelist. 
+ */ static struct vm_area_struct * check_range(struct mm_struct *mm, unsigned long start, unsigned long end, - nodemask_t *nodes, unsigned long flags) + const nodemask_t *nodes, unsigned long flags, + struct list_head *pagelist) { int err; struct vm_area_struct *first, *vma, *prev; @@ -264,17 +346,24 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, return ERR_PTR(-EFAULT); prev = NULL; for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { - if (!vma->vm_next && vma->vm_end < end) - return ERR_PTR(-EFAULT); - if (prev && prev->vm_end < vma->vm_start) - return ERR_PTR(-EFAULT); - if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) { + if (!(flags & MPOL_MF_DISCONTIG_OK)) { + if (!vma->vm_next && vma->vm_end < end) + return ERR_PTR(-EFAULT); + if (prev && prev->vm_end < vma->vm_start) + return ERR_PTR(-EFAULT); + } + if (!is_vm_hugetlb_page(vma) && + ((flags & MPOL_MF_STRICT) || + ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && + vma_migratable(vma)))) { unsigned long endvma = vma->vm_end; + if (endvma > end) endvma = end; if (vma->vm_start > start) start = vma->vm_start; - err = check_pgd_range(vma, start, endvma, nodes); + err = check_pgd_range(vma, start, endvma, nodes, + flags, pagelist); if (err) { first = ERR_PTR(err); break; @@ -348,33 +437,59 @@ long do_mbind(unsigned long start, unsigned long len, struct mempolicy *new; unsigned long end; int err; + LIST_HEAD(pagelist); - if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX) + if ((flags & ~(unsigned long)(MPOL_MF_STRICT|MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) + || mode > MPOL_MAX) return -EINVAL; + if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE)) + return -EPERM; + if (start & ~PAGE_MASK) return -EINVAL; + if (mode == MPOL_DEFAULT) flags &= ~MPOL_MF_STRICT; + len = (len + PAGE_SIZE - 1) & PAGE_MASK; end = start + len; + if (end < start) return -EINVAL; if (end == start) return 0; + if (mpol_check_policy(mode, nmask)) return -EINVAL; + new = mpol_new(mode, nmask); if (IS_ERR(new)) return PTR_ERR(new); + /* + * If we are using the default policy then operation + * on discontinuous address spaces is okay after all + */ + if (!new) + flags |= MPOL_MF_DISCONTIG_OK; + PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, mode,nodes_addr(nodes)[0]); down_write(&mm->mmap_sem); - vma = check_range(mm, start, end, nmask, flags); + vma = check_range(mm, start, end, nmask, flags, + (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ? &pagelist : NULL); err = PTR_ERR(vma); - if (!IS_ERR(vma)) + if (!IS_ERR(vma)) { err = mbind_range(vma, start, end, new); + if (!list_empty(&pagelist)) + migrate_pages(&pagelist, NULL); + if (!err && !list_empty(&pagelist) && (flags & MPOL_MF_STRICT)) + err = -EIO; + } + if (!list_empty(&pagelist)) + putback_lru_pages(&pagelist); + up_write(&mm->mmap_sem); mpol_free(new); return err; -- cgit v1.2.3 From 39743889aaf76725152f16aa90ca3c45f6d52da3 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:00:51 -0800 Subject: [PATCH] Swap Migration V5: sys_migrate_pages interface sys_migrate_pages implementation using swap based page migration This is the original API proposed by Ray Bryant in his posts during the first half of 2005 on linux-mm@kvack.org and linux-kernel@vger.kernel.org. The intent of sys_migrate is to migrate memory of a process. A process may have migrated to another node. Memory was allocated optimally for the prior context. sys_migrate_pages allows to shift the memory to the new node. 
sys_migrate_pages is also useful for manually moving a process's memory if the process's available memory nodes have changed through cpuset operations. Paul Jackson is working on an automated mechanism that will allow an automatic migration if the cpuset of a process is changed. However, a user may decide to manually control the migration.

This implementation is put into the policy layer since it uses concepts and functions that are also needed for mbind and friends. The patch also provides a do_migrate_pages function that may be useful for cpusets to automatically move memory. sys_migrate_pages does not modify policies, in contrast to Ray's implementation.

The current code here is based on the swap-based page migration capability and thus is not able to preserve the physical layout relative to its containing nodeset (which may be a cpuset). When direct page migration becomes available then the implementation needs to be changed to do an isomorphic move of pages between different nodesets. The current implementation simply evicts all pages in the source nodeset that are not in the target nodeset.

The patch supports ia64, i386 and x86_64.

Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 93 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 9cc6d962831..20d5ad39fa4 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -614,12 +614,42 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, return err; } +/* + * For now migrate_pages simply swaps out the pages from nodes that are in + * the source set but not in the target set. In the future, we would + * want a function that moves pages between the two nodesets in such + * a way as to preserve the physical layout as much as possible. + * + * Returns the number of page that could not be moved. + */ +int do_migrate_pages(struct mm_struct *mm, + const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) +{ + LIST_HEAD(pagelist); + int count = 0; + nodemask_t nodes; + + nodes_andnot(nodes, *from_nodes, *to_nodes); + nodes_complement(nodes, nodes); + + down_read(&mm->mmap_sem); + check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes, + flags | MPOL_MF_DISCONTIG_OK, &pagelist); + if (!list_empty(&pagelist)) { + migrate_pages(&pagelist, NULL); + if (!list_empty(&pagelist)) + count = putback_lru_pages(&pagelist); + } + up_read(&mm->mmap_sem); + return count; +} + /* * User space interface with variable sized bitmaps for nodelists. */ /* Copy a node mask from user space.
*/ -static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask, +static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, unsigned long maxnode) { unsigned long k; @@ -708,6 +738,68 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, return do_set_mempolicy(mode, &nodes); } +/* Macro needed until Paul implements this function in kernel/cpusets.c */ +#define cpuset_mems_allowed(task) node_online_map + +asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, + const unsigned long __user *old_nodes, + const unsigned long __user *new_nodes) +{ + struct mm_struct *mm; + struct task_struct *task; + nodemask_t old; + nodemask_t new; + nodemask_t task_nodes; + int err; + + err = get_nodes(&old, old_nodes, maxnode); + if (err) + return err; + + err = get_nodes(&new, new_nodes, maxnode); + if (err) + return err; + + /* Find the mm_struct */ + read_lock(&tasklist_lock); + task = pid ? find_task_by_pid(pid) : current; + if (!task) { + read_unlock(&tasklist_lock); + return -ESRCH; + } + mm = get_task_mm(task); + read_unlock(&tasklist_lock); + + if (!mm) + return -EINVAL; + + /* + * Check if this process has the right to modify the specified + * process. The right exists if the process has administrative + * capabilities, superuser priviledges or the same + * userid as the target process. + */ + if ((current->euid != task->suid) && (current->euid != task->uid) && + (current->uid != task->suid) && (current->uid != task->uid) && + !capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto out; + } + + task_nodes = cpuset_mems_allowed(task); + /* Is the user allowed to access the target nodes? */ + if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto out; + } + + err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE); +out: + mmput(mm); + return err; +} + + /* Retrieve NUMA policy */ asmlinkage long sys_get_mempolicy(int __user *policy, unsigned long __user *nmask, -- cgit v1.2.3 From 8419c3181086c86664e8246bc997afc2e4ffba4f Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:00:52 -0800 Subject: [PATCH] SwapMig: CONFIG_MIGRATION fixes Move move_to_lru, putback_lru_pages and isolate_lru in section surrounded by CONFIG_MIGRATION saving some codesize for single processor kernels. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 152 ++++++++++++++++++++++++++++++------------------------------ 1 file changed, 76 insertions(+), 76 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 58270aea669..daed4a73b76 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -569,6 +569,40 @@ keep: } #ifdef CONFIG_MIGRATION +static inline void move_to_lru(struct page *page) +{ + list_del(&page->lru); + if (PageActive(page)) { + /* + * lru_cache_add_active checks that + * the PG_active bit is off. + */ + ClearPageActive(page); + lru_cache_add_active(page); + } else { + lru_cache_add(page); + } + put_page(page); +} + +/* + * Add isolated pages on the list back to the LRU + * + * returns the number of pages put back. 
+ */ +int putback_lru_pages(struct list_head *l) +{ + struct page *page; + struct page *page2; + int count = 0; + + list_for_each_entry_safe(page, page2, l, lru) { + move_to_lru(page); + count++; + } + return count; +} + /* * swapout a single page * page is locked upon entry, unlocked on exit @@ -709,6 +743,48 @@ retry_later: return nr_failed + retry; } + +static void lru_add_drain_per_cpu(void *dummy) +{ + lru_add_drain(); +} + +/* + * Isolate one page from the LRU lists and put it on the + * indicated list. Do necessary cache draining if the + * page is not on the LRU lists yet. + * + * Result: + * 0 = page not on LRU list + * 1 = page removed from LRU list and added to the specified list. + * -ENOENT = page is being freed elsewhere. + */ +int isolate_lru_page(struct page *page) +{ + int rc = 0; + struct zone *zone = page_zone(page); + +redo: + spin_lock_irq(&zone->lru_lock); + rc = __isolate_lru_page(page); + if (rc == 1) { + if (PageActive(page)) + del_page_from_active_list(zone, page); + else + del_page_from_inactive_list(zone, page); + } + spin_unlock_irq(&zone->lru_lock); + if (rc == 0) { + /* + * Maybe this page is still waiting for a cpu to drain it + * from one of the lru lists? + */ + rc = schedule_on_each_cpu(lru_add_drain_per_cpu, NULL); + if (rc == 0 && PageLRU(page)) + goto redo; + } + return rc; +} #endif /* @@ -758,48 +834,6 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src, return nr_taken; } -static void lru_add_drain_per_cpu(void *dummy) -{ - lru_add_drain(); -} - -/* - * Isolate one page from the LRU lists and put it on the - * indicated list. Do necessary cache draining if the - * page is not on the LRU lists yet. - * - * Result: - * 0 = page not on LRU list - * 1 = page removed from LRU list and added to the specified list. - * -ENOENT = page is being freed elsewhere. - */ -int isolate_lru_page(struct page *page) -{ - int rc = 0; - struct zone *zone = page_zone(page); - -redo: - spin_lock_irq(&zone->lru_lock); - rc = __isolate_lru_page(page); - if (rc == 1) { - if (PageActive(page)) - del_page_from_active_list(zone, page); - else - del_page_from_inactive_list(zone, page); - } - spin_unlock_irq(&zone->lru_lock); - if (rc == 0) { - /* - * Maybe this page is still waiting for a cpu to drain it - * from one of the lru lists? - */ - rc = schedule_on_each_cpu(lru_add_drain_per_cpu, NULL); - if (rc == 0 && PageLRU(page)) - goto redo; - } - return rc; -} - /* * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed */ @@ -865,40 +899,6 @@ done: pagevec_release(&pvec); } -static inline void move_to_lru(struct page *page) -{ - list_del(&page->lru); - if (PageActive(page)) { - /* - * lru_cache_add_active checks that - * the PG_active bit is off. - */ - ClearPageActive(page); - lru_cache_add_active(page); - } else { - lru_cache_add(page); - } - put_page(page); -} - -/* - * Add isolated pages on the list back to the LRU - * - * returns the number of pages put back. - */ -int putback_lru_pages(struct list_head *l) -{ - struct page *page; - struct page *page2; - int count = 0; - - list_for_each_entry_safe(page, page2, l, lru) { - move_to_lru(page); - count++; - } - return count; -} - /* * This moves pages from the active list to the inactive list. 
* -- cgit v1.2.3 From 1480a540c98525640174a7eadd712378fcd6fd63 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:00:53 -0800 Subject: [PATCH] SwapMig: add_to_swap() avoid atomic allocations Add gfp_mask to add_to_swap add_to_swap does allocations with GFP_ATOMIC in order not to interfere with swapping. During migration we may have use add_to_swap extensively which may lead to out of memory errors. This patch makes add_to_swap take a parameter that specifies the gfp mask. The page migration code can then make add_to_swap use GFP_KERNEL. Signed-off-by: Hirokazu Takahashi Signed-off-by: Dave Hansen Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap_state.c | 4 ++-- mm/vmscan.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/swap_state.c b/mm/swap_state.c index fc2aecb70a9..7b09ac503fe 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -141,7 +141,7 @@ void __delete_from_swap_cache(struct page *page) * Allocate swap space for the page and add the page to the * swap cache. Caller needs to hold the page lock. */ -int add_to_swap(struct page * page) +int add_to_swap(struct page * page, gfp_t gfp_mask) { swp_entry_t entry; int err; @@ -166,7 +166,7 @@ int add_to_swap(struct page * page) * Add it to the swap cache and mark it dirty */ err = __add_to_swap_cache(page, entry, - GFP_ATOMIC|__GFP_NOMEMALLOC|__GFP_NOWARN); + gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN); switch (err) { case 0: /* Success */ diff --git a/mm/vmscan.c b/mm/vmscan.c index daed4a73b76..5393b093a87 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -458,7 +458,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) * Try to allocate it some swap space here. */ if (PageAnon(page) && !PageSwapCache(page)) { - if (!add_to_swap(page)) + if (!add_to_swap(page, GFP_ATOMIC)) goto activate_locked; } #endif /* CONFIG_SWAP */ @@ -715,7 +715,7 @@ redo: } if (PageAnon(page) && !PageSwapCache(page)) { - if (!add_to_swap(page)) { + if (!add_to_swap(page, GFP_KERNEL)) { unlock_page(page); list_move(&page->lru, &failed); nr_failed++; -- cgit v1.2.3 From ee27497df36823f2793212cad0997c044eb0e1eb Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:00:54 -0800 Subject: [PATCH] SwapMig: Drop unused pages immediately Drop unused pages immediately If a page is encountered that is only referenced by the migration code then there is no reason to swap or migrate the page. Release the page by calling move_to_lru(). Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 5393b093a87..73ba4046ed2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -689,6 +689,11 @@ redo: list_for_each_entry_safe(page, page2, l, lru) { cond_resched(); + if (page_count(page) == 1) { + /* page was freed from under us. So we are done. */ + move_to_lru(page); + continue; + } /* * Skip locked pages during the first two passes to give the * functions holding the lock time to release the page. Later we -- cgit v1.2.3 From d498471133ff1f9586a06820beaeebc575fe2814 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:00:55 -0800 Subject: [PATCH] SwapMig: Extend parameters for migrate_pages() Extend the parameters of migrate_pages() to allow the caller control over the fate of successfully migrated or impossible to migrate pages. 
Swap migration and direct migration will have the same interface after this patch so that patches can be independently applied to the policy layer and the core migration code. Signed-off-by: Christoph Lameter Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 27 ++++++++++++++++++++++----- mm/vmscan.c | 17 ++++++++--------- 2 files changed, 30 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 20d5ad39fa4..30bdafba52d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -429,6 +429,19 @@ static int contextualize_policy(int mode, nodemask_t *nodes) return mpol_check_policy(mode, nodes); } +static int swap_pages(struct list_head *pagelist) +{ + LIST_HEAD(moved); + LIST_HEAD(failed); + int n; + + n = migrate_pages(pagelist, NULL, &moved, &failed); + putback_lru_pages(&failed); + putback_lru_pages(&moved); + + return n; +} + long do_mbind(unsigned long start, unsigned long len, unsigned long mode, nodemask_t *nmask, unsigned long flags) { @@ -481,10 +494,13 @@ long do_mbind(unsigned long start, unsigned long len, (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ? &pagelist : NULL); err = PTR_ERR(vma); if (!IS_ERR(vma)) { + int nr_failed = 0; + err = mbind_range(vma, start, end, new); if (!list_empty(&pagelist)) - migrate_pages(&pagelist, NULL); - if (!err && !list_empty(&pagelist) && (flags & MPOL_MF_STRICT)) + nr_failed = swap_pages(&pagelist); + + if (!err && nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; } if (!list_empty(&pagelist)) @@ -635,11 +651,12 @@ int do_migrate_pages(struct mm_struct *mm, down_read(&mm->mmap_sem); check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes, flags | MPOL_MF_DISCONTIG_OK, &pagelist); + if (!list_empty(&pagelist)) { - migrate_pages(&pagelist, NULL); - if (!list_empty(&pagelist)) - count = putback_lru_pages(&pagelist); + count = swap_pages(&pagelist); + putback_lru_pages(&pagelist); } + up_read(&mm->mmap_sem); return count; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 73ba4046ed2..5eecb514cce 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -670,10 +670,10 @@ retry: * list. The direct migration patchset * extends this function to avoid the use of swap. */ -int migrate_pages(struct list_head *l, struct list_head *t) +int migrate_pages(struct list_head *from, struct list_head *to, + struct list_head *moved, struct list_head *failed) { int retry; - LIST_HEAD(failed); int nr_failed = 0; int pass = 0; struct page *page; @@ -686,12 +686,12 @@ int migrate_pages(struct list_head *l, struct list_head *t) redo: retry = 0; - list_for_each_entry_safe(page, page2, l, lru) { + list_for_each_entry_safe(page, page2, from, lru) { cond_resched(); if (page_count(page) == 1) { /* page was freed from under us. So we are done. */ - move_to_lru(page); + list_move(&page->lru, moved); continue; } /* @@ -722,7 +722,7 @@ redo: if (PageAnon(page) && !PageSwapCache(page)) { if (!add_to_swap(page, GFP_KERNEL)) { unlock_page(page); - list_move(&page->lru, &failed); + list_move(&page->lru, failed); nr_failed++; continue; } @@ -732,8 +732,10 @@ redo: * Page is properly locked and writeback is complete. * Try to migrate the page. 
*/ - if (!swap_page(page)) + if (!swap_page(page)) { + list_move(&page->lru, moved); continue; + } retry_later: retry++; } @@ -743,9 +745,6 @@ retry_later: if (!swapwrite) current->flags &= ~PF_SWAPWRITE; - if (!list_empty(&failed)) - list_splice(&failed, l); - return nr_failed + retry; } -- cgit v1.2.3 From d0d963281ccb22e6f339bfdd75c6b2e31351929f Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:00:55 -0800 Subject: [PATCH] SwapMig: Switch error handling in migrate_pages to use -Exx Use -Exxx instead of numeric return codes and cleanup the code in migrate_pages() using -Exx error codes. Consolidate successful migration handling Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 56 ++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 5eecb514cce..bf903b2d198 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -606,10 +606,6 @@ int putback_lru_pages(struct list_head *l) /* * swapout a single page * page is locked upon entry, unlocked on exit - * - * return codes: - * 0 = complete - * 1 = retry */ static int swap_page(struct page *page) { @@ -650,7 +646,7 @@ unlock_retry: unlock_page(page); retry: - return 1; + return -EAGAIN; } /* * migrate_pages @@ -669,6 +665,8 @@ retry: * is only swapping out pages and never touches the second * list. The direct migration patchset * extends this function to avoid the use of swap. + * + * Return: Number of pages not migrated when "to" ran empty. */ int migrate_pages(struct list_head *from, struct list_head *to, struct list_head *moved, struct list_head *failed) @@ -679,6 +677,7 @@ int migrate_pages(struct list_head *from, struct list_head *to, struct page *page; struct page *page2; int swapwrite = current->flags & PF_SWAPWRITE; + int rc; if (!swapwrite) current->flags |= PF_SWAPWRITE; @@ -689,22 +688,23 @@ redo: list_for_each_entry_safe(page, page2, from, lru) { cond_resched(); - if (page_count(page) == 1) { + rc = 0; + if (page_count(page) == 1) /* page was freed from under us. So we are done. */ - list_move(&page->lru, moved); - continue; - } + goto next; + /* * Skip locked pages during the first two passes to give the * functions holding the lock time to release the page. Later we * use lock_page() to have a higher chance of acquiring the * lock. */ + rc = -EAGAIN; if (pass > 2) lock_page(page); else if (TestSetPageLocked(page)) - goto retry_later; + goto next; /* * Only wait on writeback if we have already done a pass where @@ -713,18 +713,19 @@ redo: if (pass > 0) { wait_on_page_writeback(page); } else { - if (PageWriteback(page)) { - unlock_page(page); - goto retry_later; - } + if (PageWriteback(page)) + goto unlock_page; } + /* + * Anonymous pages must have swap cache references otherwise + * the information contained in the page maps cannot be + * preserved. + */ if (PageAnon(page) && !PageSwapCache(page)) { if (!add_to_swap(page, GFP_KERNEL)) { - unlock_page(page); - list_move(&page->lru, failed); - nr_failed++; - continue; + rc = -ENOMEM; + goto unlock_page; } } @@ -732,12 +733,23 @@ redo: * Page is properly locked and writeback is complete. * Try to migrate the page. 
*/ - if (!swap_page(page)) { + rc = swap_page(page); + goto next; + +unlock_page: + unlock_page(page); + +next: + if (rc == -EAGAIN) { + retry++; + } else if (rc) { + /* Permanent failure */ + list_move(&page->lru, failed); + nr_failed++; + } else { + /* Success */ list_move(&page->lru, moved); - continue; } -retry_later: - retry++; } if (retry && pass++ < 10) goto redo; -- cgit v1.2.3 From aea47ff363c15b0be5fc27ed991b1fdee338f0a7 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:00:57 -0800 Subject: [PATCH] mm: make hugepages obey cpusets. See http://marc.theaimsgroup.com/?l=linux-kernel&m=113167000201265&w=2 http://marc.theaimsgroup.com/?l=linux-mm&m=113167267527312&w=2 Make hugepages obey cpusets. Signed-off-by: Christoph Lameter Acked-by: William Irwin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f4c43d7980b..b21d78c941b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -48,7 +49,8 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma, for (z = zonelist->zones; *z; z++) { nid = (*z)->zone_pgdat->node_id; - if (!list_empty(&hugepage_freelists[nid])) + if (cpuset_zone_allowed(*z, GFP_HIGHUSER) && + !list_empty(&hugepage_freelists[nid])) break; } -- cgit v1.2.3 From cd105df4590c89837a1c300843238148cfef9b5f Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Sun, 8 Jan 2006 01:00:59 -0800 Subject: [PATCH] mm: clean up local variables Clean up a local variable with the same name as a variable in a larger block. Also move a variable into the block where it's actually used. Spotted by http://linuxicc.sourceforge.net/ Signed-off-by: Tobias Klauser Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 3 ++- mm/swapfile.c | 12 ++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index f71d8be2f4e..76b092bd0bf 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -854,7 +854,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, struct kmem_list3 *l3 = NULL; int node = cpu_to_node(cpu); int memsize = sizeof(struct kmem_list3); - struct array_cache *nc = NULL; switch (action) { case CPU_UP_PREPARE: @@ -891,6 +890,8 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, /* Now we can go ahead with allocating the shared array's & array cache's */ list_for_each_entry(cachep, &cache_chain, next) { + struct array_cache *nc; + nc = alloc_arraycache(node, cachep->limit, cachep->batchcount); if (!nc) diff --git a/mm/swapfile.c b/mm/swapfile.c index 6da4b28b896..80f948a2028 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1493,7 +1493,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) goto bad_swap; if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) goto bad_swap; - + /* OK, set up the swap map and apply the bad block list */ if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) { error = -ENOMEM; @@ -1502,17 +1502,17 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) error = 0; memset(p->swap_map, 0, maxpages * sizeof(short)); - for (i=0; iinfo.nr_badpages; i++) { - int page = swap_header->info.badpages[i]; - if (page <= 0 || page >= swap_header->info.last_page) + for (i = 0; i < swap_header->info.nr_badpages; i++) { + int page_nr = swap_header->info.badpages[i]; + if (page_nr <= 0 || 
page_nr >= swap_header->info.last_page) error = -EINVAL; else - p->swap_map[page] = SWAP_MAP_BAD; + p->swap_map[page_nr] = SWAP_MAP_BAD; } nr_good_pages = swap_header->info.last_page - swap_header->info.nr_badpages - 1 /* header page */; - if (error) + if (error) goto bad_swap; } -- cgit v1.2.3 From ef2bf0dc8675e14cf8cba3b7fb9f48d72640a70e Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Sun, 8 Jan 2006 01:01:00 -0800 Subject: [PATCH] rmap: additional diagnostics in page_remove_rmap() We seem to be hitting this assertion failure too often for it to be hardware bugs. Cc: Hugh Dickins Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 6f3f7db2712..66ec43053a4 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -514,6 +514,13 @@ void page_add_file_rmap(struct page *page) void page_remove_rmap(struct page *page) { if (atomic_add_negative(-1, &page->_mapcount)) { + if (page_mapcount(page) < 0) { + printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page)); + printk (KERN_EMERG " page->flags = %lx\n", page->flags); + printk (KERN_EMERG " page->count = %x\n", page_count(page)); + printk (KERN_EMERG " page->mapping = %p\n", page->mapping); + } + BUG_ON(page_mapcount(page) < 0); /* * It would be tidy to reset the PageAnon mapping here, -- cgit v1.2.3 From 38e35860dbe6197a4b42eb6e8b47da940b7695dd Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:01:01 -0800 Subject: [PATCH] mempolicies: private pointer in check_range and MPOL_MF_INVERT This was was first posted at http://marc.theaimsgroup.com/?l=linux-mm&m=113149240227584&w=2 (Part of this functionality is also contained in the direct migration pathset. The functionality here is more generic and independent of that patchset.) - Add internal flags MPOL_MF_INVERT to control check_range() behavior. - Replace the pagelist passed through by check_range by a general private pointer that may be used for other purposes. (The following patches will use that to merge numa_maps into mempolicy.c and to better group the page migration code in the policy layer) - Improve some comments. Signed-off-by: Christoph Lameter Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 46 ++++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 30bdafba52d..270e9a39ec1 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -88,8 +88,9 @@ #include #include -/* Internal MPOL_MF_xxx flags */ +/* Internal flags */ #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ +#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ static kmem_cache_t *policy_cache; static kmem_cache_t *sn_cache; @@ -227,11 +228,11 @@ static void migrate_page_add(struct vm_area_struct *vma, } } -/* Ensure all existing pages follow the policy. */ +/* Scan through pages checking if pages follow certain conditions. 
*/ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, const nodemask_t *nodes, unsigned long flags, - struct list_head *pagelist) + void *private) { pte_t *orig_pte; pte_t *pte; @@ -248,12 +249,13 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (!page) continue; nid = page_to_nid(page); - if (!node_isset(nid, *nodes)) { - if (pagelist) - migrate_page_add(vma, page, pagelist, flags); - else - break; - } + if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) + continue; + + if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + migrate_page_add(vma, page, private, flags); + else + break; } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap_unlock(orig_pte, ptl); return addr != end; @@ -262,7 +264,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, const nodemask_t *nodes, unsigned long flags, - struct list_head *pagelist) + void *private) { pmd_t *pmd; unsigned long next; @@ -273,7 +275,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, if (pmd_none_or_clear_bad(pmd)) continue; if (check_pte_range(vma, pmd, addr, next, nodes, - flags, pagelist)) + flags, private)) return -EIO; } while (pmd++, addr = next, addr != end); return 0; @@ -282,7 +284,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, const nodemask_t *nodes, unsigned long flags, - struct list_head *pagelist) + void *private) { pud_t *pud; unsigned long next; @@ -293,7 +295,7 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, if (pud_none_or_clear_bad(pud)) continue; if (check_pmd_range(vma, pud, addr, next, nodes, - flags, pagelist)) + flags, private)) return -EIO; } while (pud++, addr = next, addr != end); return 0; @@ -302,7 +304,7 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, static inline int check_pgd_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end, const nodemask_t *nodes, unsigned long flags, - struct list_head *pagelist) + void *private) { pgd_t *pgd; unsigned long next; @@ -313,7 +315,7 @@ static inline int check_pgd_range(struct vm_area_struct *vma, if (pgd_none_or_clear_bad(pgd)) continue; if (check_pud_range(vma, pgd, addr, next, nodes, - flags, pagelist)) + flags, private)) return -EIO; } while (pgd++, addr = next, addr != end); return 0; @@ -335,8 +337,7 @@ static inline int vma_migratable(struct vm_area_struct *vma) */ static struct vm_area_struct * check_range(struct mm_struct *mm, unsigned long start, unsigned long end, - const nodemask_t *nodes, unsigned long flags, - struct list_head *pagelist) + const nodemask_t *nodes, unsigned long flags, void *private) { int err; struct vm_area_struct *first, *vma, *prev; @@ -363,7 +364,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, if (vma->vm_start > start) start = vma->vm_start; err = check_pgd_range(vma, start, endvma, nodes, - flags, pagelist); + flags, private); if (err) { first = ERR_PTR(err); break; @@ -452,7 +453,8 @@ long do_mbind(unsigned long start, unsigned long len, int err; LIST_HEAD(pagelist); - if ((flags & ~(unsigned long)(MPOL_MF_STRICT|MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) + if ((flags & ~(unsigned long)(MPOL_MF_STRICT | + MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || mode > 
MPOL_MAX) return -EINVAL; if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE)) @@ -490,8 +492,9 @@ long do_mbind(unsigned long start, unsigned long len, mode,nodes_addr(nodes)[0]); down_write(&mm->mmap_sem); - vma = check_range(mm, start, end, nmask, flags, - (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ? &pagelist : NULL); + vma = check_range(mm, start, end, nmask, + flags | MPOL_MF_INVERT, &pagelist); + err = PTR_ERR(vma); if (!IS_ERR(vma)) { int nr_failed = 0; @@ -646,7 +649,6 @@ int do_migrate_pages(struct mm_struct *mm, nodemask_t nodes; nodes_andnot(nodes, *from_nodes, *to_nodes); - nodes_complement(nodes, nodes); down_read(&mm->mmap_sem); check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes, -- cgit v1.2.3 From 1a75a6c825c17249ca49f050a872a04ce0997ce3 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:01:02 -0800 Subject: [PATCH] Fold numa_maps into mempolicies.c First discussed at http://marc.theaimsgroup.com/?t=113149255100001&r=1&w=2 - Use the check_range() in mempolicy.c to gather statistics. - Improve the numa_maps code in general and fix some comments. Signed-off-by: Christoph Lameter Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 138 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 137 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 270e9a39ec1..44b9d69900b 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -84,6 +84,8 @@ #include #include #include +#include +#include #include #include @@ -91,6 +93,7 @@ /* Internal flags */ #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ +#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */ static kmem_cache_t *policy_cache; static kmem_cache_t *sn_cache; @@ -228,6 +231,8 @@ static void migrate_page_add(struct vm_area_struct *vma, } } +static void gather_stats(struct page *, void *); + /* Scan through pages checking if pages follow certain conditions. */ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, @@ -252,7 +257,9 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) continue; - if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + if (flags & MPOL_MF_STATS) + gather_stats(page, private); + else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) migrate_page_add(vma, page, private, flags); else break; @@ -1460,3 +1467,132 @@ void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new) { rebind_policy(current->mempolicy, old, new); } + +/* + * Display pages allocated per node and memory policy via /proc. + */ + +static const char *policy_types[] = { "default", "prefer", "bind", + "interleave" }; + +/* + * Convert a mempolicy into a string. + * Returns the number of characters in buffer (if positive) + * or an error (negative) + */ +static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) +{ + char *p = buffer; + int l; + nodemask_t nodes; + int mode = pol ? 
pol->policy : MPOL_DEFAULT; + + switch (mode) { + case MPOL_DEFAULT: + nodes_clear(nodes); + break; + + case MPOL_PREFERRED: + nodes_clear(nodes); + node_set(pol->v.preferred_node, nodes); + break; + + case MPOL_BIND: + get_zonemask(pol, &nodes); + break; + + case MPOL_INTERLEAVE: + nodes = pol->v.nodes; + break; + + default: + BUG(); + return -EFAULT; + } + + l = strlen(policy_types[mode]); + if (buffer + maxlen < p + l + 1) + return -ENOSPC; + + strcpy(p, policy_types[mode]); + p += l; + + if (!nodes_empty(nodes)) { + if (buffer + maxlen < p + 2) + return -ENOSPC; + *p++ = '='; + p += nodelist_scnprintf(p, buffer + maxlen - p, nodes); + } + return p - buffer; +} + +struct numa_maps { + unsigned long pages; + unsigned long anon; + unsigned long mapped; + unsigned long mapcount_max; + unsigned long node[MAX_NUMNODES]; +}; + +static void gather_stats(struct page *page, void *private) +{ + struct numa_maps *md = private; + int count = page_mapcount(page); + + if (count) + md->mapped++; + + if (count > md->mapcount_max) + md->mapcount_max = count; + + md->pages++; + + if (PageAnon(page)) + md->anon++; + + md->node[page_to_nid(page)]++; + cond_resched(); +} + +int show_numa_map(struct seq_file *m, void *v) +{ + struct task_struct *task = m->private; + struct vm_area_struct *vma = v; + struct numa_maps *md; + int n; + char buffer[50]; + + if (!vma->vm_mm) + return 0; + + md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL); + if (!md) + return 0; + + check_pgd_range(vma, vma->vm_start, vma->vm_end, + &node_online_map, MPOL_MF_STATS, md); + + if (md->pages) { + mpol_to_str(buffer, sizeof(buffer), + get_vma_policy(task, vma, vma->vm_start)); + + seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu", + vma->vm_start, buffer, md->pages, + md->mapped, md->mapcount_max); + + if (md->anon) + seq_printf(m," anon=%lu",md->anon); + + for_each_online_node(n) + if (md->node[n]) + seq_printf(m, " N%d=%lu", n, md->node[n]); + + seq_putc(m, '\n'); + } + kfree(md); + + if (m->count < m->size) + m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; + return 0; +} + -- cgit v1.2.3 From 132beacf9785d2e6e8aecb59aa078f3ca5668fa6 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:01:02 -0800 Subject: [PATCH] Drop page table lock before calling migrate_page_add() migrate_page_add cannot be called with a spinlock held (calls isolate_lru_page which calles schedule_on_each_cpu). Drop ptl lock in check_pte_range before calling migrate_page_add(). Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 44b9d69900b..4c0510e9e7f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -259,8 +259,11 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (flags & MPOL_MF_STATS) gather_stats(page, private); - else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { + spin_unlock(ptl); migrate_page_add(vma, page, private, flags); + spin_lock(ptl); + } else break; } while (pte++, addr += PAGE_SIZE, addr != end); -- cgit v1.2.3 From 48fce3429df84a94766fbbc845fa8450d0715b48 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:01:03 -0800 Subject: [PATCH] mempolicies: unexport get_vma_policy() Since the numa_maps functionality is now in mempolicy.c we no longer need to export get_vma_policy(). 
Signed-off-by: Christoph Lameter Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4c0510e9e7f..4b077ec6c00 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -935,8 +935,8 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, #endif /* Return effective policy for a VMA */ -struct mempolicy * -get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr) +static struct mempolicy * get_vma_policy(struct task_struct *task, + struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = task->mempolicy; -- cgit v1.2.3 From 6ce3c4c0ff62ca6391019b7832fb41a7f28b9e26 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:01:04 -0800 Subject: [PATCH] Move page migration related functions near do_migrate_pages() Group page migration functions in mempolicy.c Add a forward declaration for migrate_page_add (like gather_stats()) and use our new found mobility to group all page migration related function around do_migrate_pages(). Signed-off-by: Christoph Lameter Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 270 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 138 insertions(+), 132 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4b077ec6c00..7051fe450e9 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -183,55 +183,9 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) return policy; } -/* Check if we are the only process mapping the page in question */ -static inline int single_mm_mapping(struct mm_struct *mm, - struct address_space *mapping) -{ - struct vm_area_struct *vma; - struct prio_tree_iter iter; - int rc = 1; - - spin_lock(&mapping->i_mmap_lock); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) - if (mm != vma->vm_mm) { - rc = 0; - goto out; - } - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) - if (mm != vma->vm_mm) { - rc = 0; - goto out; - } -out: - spin_unlock(&mapping->i_mmap_lock); - return rc; -} - -/* - * Add a page to be migrated to the pagelist - */ -static void migrate_page_add(struct vm_area_struct *vma, - struct page *page, struct list_head *pagelist, unsigned long flags) -{ - /* - * Avoid migrating a page that is shared by others and not writable. - */ - if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) || - mapping_writably_mapped(page->mapping) || - single_mm_mapping(vma->vm_mm, page->mapping)) { - int rc = isolate_lru_page(page); - - if (rc == 1) - list_add(&page->lru, pagelist); - /* - * If the isolate attempt was not successful then we just - * encountered an unswappable page. Something must be wrong. - */ - WARN_ON(rc == 0); - } -} - static void gather_stats(struct page *, void *); +static void migrate_page_add(struct vm_area_struct *vma, + struct page *page, struct list_head *pagelist, unsigned long flags); /* Scan through pages checking if pages follow certain conditions. 
*/ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, @@ -440,90 +394,6 @@ static int contextualize_policy(int mode, nodemask_t *nodes) return mpol_check_policy(mode, nodes); } -static int swap_pages(struct list_head *pagelist) -{ - LIST_HEAD(moved); - LIST_HEAD(failed); - int n; - - n = migrate_pages(pagelist, NULL, &moved, &failed); - putback_lru_pages(&failed); - putback_lru_pages(&moved); - - return n; -} - -long do_mbind(unsigned long start, unsigned long len, - unsigned long mode, nodemask_t *nmask, unsigned long flags) -{ - struct vm_area_struct *vma; - struct mm_struct *mm = current->mm; - struct mempolicy *new; - unsigned long end; - int err; - LIST_HEAD(pagelist); - - if ((flags & ~(unsigned long)(MPOL_MF_STRICT | - MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) - || mode > MPOL_MAX) - return -EINVAL; - if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE)) - return -EPERM; - - if (start & ~PAGE_MASK) - return -EINVAL; - - if (mode == MPOL_DEFAULT) - flags &= ~MPOL_MF_STRICT; - - len = (len + PAGE_SIZE - 1) & PAGE_MASK; - end = start + len; - - if (end < start) - return -EINVAL; - if (end == start) - return 0; - - if (mpol_check_policy(mode, nmask)) - return -EINVAL; - - new = mpol_new(mode, nmask); - if (IS_ERR(new)) - return PTR_ERR(new); - - /* - * If we are using the default policy then operation - * on discontinuous address spaces is okay after all - */ - if (!new) - flags |= MPOL_MF_DISCONTIG_OK; - - PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, - mode,nodes_addr(nodes)[0]); - - down_write(&mm->mmap_sem); - vma = check_range(mm, start, end, nmask, - flags | MPOL_MF_INVERT, &pagelist); - - err = PTR_ERR(vma); - if (!IS_ERR(vma)) { - int nr_failed = 0; - - err = mbind_range(vma, start, end, new); - if (!list_empty(&pagelist)) - nr_failed = swap_pages(&pagelist); - - if (!err && nr_failed && (flags & MPOL_MF_STRICT)) - err = -EIO; - } - if (!list_empty(&pagelist)) - putback_lru_pages(&pagelist); - - up_write(&mm->mmap_sem); - mpol_free(new); - return err; -} - /* Set the process memory policy */ long do_set_mempolicy(int mode, nodemask_t *nodes) { @@ -643,6 +513,71 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, return err; } +/* + * page migration + */ + +/* Check if we are the only process mapping the page in question */ +static inline int single_mm_mapping(struct mm_struct *mm, + struct address_space *mapping) +{ + struct vm_area_struct *vma; + struct prio_tree_iter iter; + int rc = 1; + + spin_lock(&mapping->i_mmap_lock); + vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) + if (mm != vma->vm_mm) { + rc = 0; + goto out; + } + list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) + if (mm != vma->vm_mm) { + rc = 0; + goto out; + } +out: + spin_unlock(&mapping->i_mmap_lock); + return rc; +} + +/* + * Add a page to be migrated to the pagelist + */ +static void migrate_page_add(struct vm_area_struct *vma, + struct page *page, struct list_head *pagelist, unsigned long flags) +{ + /* + * Avoid migrating a page that is shared by others and not writable. + */ + if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) || + mapping_writably_mapped(page->mapping) || + single_mm_mapping(vma->vm_mm, page->mapping)) { + int rc = isolate_lru_page(page); + + if (rc == 1) + list_add(&page->lru, pagelist); + /* + * If the isolate attempt was not successful then we just + * encountered an unswappable page. Something must be wrong. 
+ */ + WARN_ON(rc == 0); + } +} + +static int swap_pages(struct list_head *pagelist) +{ + LIST_HEAD(moved); + LIST_HEAD(failed); + int n; + + n = migrate_pages(pagelist, NULL, &moved, &failed); + putback_lru_pages(&failed); + putback_lru_pages(&moved); + + return n; +} + /* * For now migrate_pages simply swaps out the pages from nodes that are in * the source set but not in the target set. In the future, we would @@ -673,6 +608,77 @@ int do_migrate_pages(struct mm_struct *mm, return count; } +long do_mbind(unsigned long start, unsigned long len, + unsigned long mode, nodemask_t *nmask, unsigned long flags) +{ + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; + struct mempolicy *new; + unsigned long end; + int err; + LIST_HEAD(pagelist); + + if ((flags & ~(unsigned long)(MPOL_MF_STRICT | + MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + || mode > MPOL_MAX) + return -EINVAL; + if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE)) + return -EPERM; + + if (start & ~PAGE_MASK) + return -EINVAL; + + if (mode == MPOL_DEFAULT) + flags &= ~MPOL_MF_STRICT; + + len = (len + PAGE_SIZE - 1) & PAGE_MASK; + end = start + len; + + if (end < start) + return -EINVAL; + if (end == start) + return 0; + + if (mpol_check_policy(mode, nmask)) + return -EINVAL; + + new = mpol_new(mode, nmask); + if (IS_ERR(new)) + return PTR_ERR(new); + + /* + * If we are using the default policy then operation + * on discontinuous address spaces is okay after all + */ + if (!new) + flags |= MPOL_MF_DISCONTIG_OK; + + PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, + mode,nodes_addr(nodes)[0]); + + down_write(&mm->mmap_sem); + vma = check_range(mm, start, end, nmask, + flags | MPOL_MF_INVERT, &pagelist); + + err = PTR_ERR(vma); + if (!IS_ERR(vma)) { + int nr_failed = 0; + + err = mbind_range(vma, start, end, new); + if (!list_empty(&pagelist)) + nr_failed = swap_pages(&pagelist); + + if (!err && nr_failed && (flags & MPOL_MF_STRICT)) + err = -EIO; + } + if (!list_empty(&pagelist)) + putback_lru_pages(&pagelist); + + up_write(&mm->mmap_sem); + mpol_free(new); + return err; +} + /* * User space interface with variable sized bitmaps for nodelists. */ -- cgit v1.2.3 From 2f659f462d2ab519068d0e2bb677d7a700decb8d Mon Sep 17 00:00:00 2001 From: Kirill Korotaev Date: Sun, 8 Jan 2006 01:01:05 -0800 Subject: [PATCH] Optimise oom kill of current task When oom_killer kills current there's no need to call schedule_timeout_interruptible() since task must die ASAP. Signed-Off-By: Pavel Emelianov Signed-Off-By: Kirill Korotaev Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d348b903595..4748b906aff 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -298,7 +298,8 @@ retry: /* * Give "p" a good chance of killing itself before we - * retry to allocate memory. + * retry to allocate memory unless "p" is current */ - schedule_timeout_interruptible(1); + if (!test_thread_flag(TIF_MEMDIE)) + schedule_timeout_interruptible(1); } -- cgit v1.2.3 From 22fc6eccbf4ce4eb6265e6ada7b50a7b9cc57d05 Mon Sep 17 00:00:00 2001 From: Ravikiran G Thirumalai Date: Sun, 8 Jan 2006 01:01:27 -0800 Subject: [PATCH] Change maxaligned_in_smp alignemnt macros to internodealigned_in_smp macros ____cacheline_maxaligned_in_smp is currently used to align critical structures and avoid false sharing. It uses per-arch L1_CACHE_SHIFT_MAX and people find L1_CACHE_SHIFT_MAX useless. 
However, we have been using ____cacheline_maxaligned_in_smp to align structures on the internode cacheline size. As per Andi's suggestion, following patch kills ____cacheline_maxaligned_in_smp and introduces INTERNODE_CACHE_SHIFT, which defaults to L1_CACHE_SHIFT for all arches. Arches needing L3/Internode cacheline alignment can define INTERNODE_CACHE_SHIFT in the arch asm/cache.h. Patch replaces ____cacheline_maxaligned_in_smp with ____cacheline_internodealigned_in_smp With this patch, L1_CACHE_SHIFT_MAX can be killed Signed-off-by: Ravikiran Thirumalai Signed-off-by: Shai Fultheim Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/sparse.c b/mm/sparse.c index 72079b538e2..0a51f36ba3a 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -18,10 +18,10 @@ */ #ifdef CONFIG_SPARSEMEM_EXTREME struct mem_section *mem_section[NR_SECTION_ROOTS] - ____cacheline_maxaligned_in_smp; + ____cacheline_internodealigned_in_smp; #else struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT] - ____cacheline_maxaligned_in_smp; + ____cacheline_internodealigned_in_smp; #endif EXPORT_SYMBOL(mem_section); -- cgit v1.2.3 From 30992c97ae9d01b17374fbfab76a869fb4bba500 Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Sun, 8 Jan 2006 01:01:43 -0800 Subject: [PATCH] slob: introduce mm/util.c for shared functions Add mm/util.c for functions common between SLAB and SLOB. Signed-off-by: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Makefile | 2 +- mm/slab.c | 37 ------------------------------------- mm/util.c | 39 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 38 deletions(-) create mode 100644 mm/util.c (limited to 'mm') diff --git a/mm/Makefile b/mm/Makefile index 2fa6d2ca9f2..74c85ddc917 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -10,7 +10,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ page_alloc.o page-writeback.o pdflush.o \ readahead.o slab.o swap.o truncate.o vmscan.o \ - prio_tree.o $(mmu-y) + prio_tree.o util.o $(mmu-y) obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o diff --git a/mm/slab.c b/mm/slab.c index 76b092bd0bf..1c46c638355 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3052,20 +3052,6 @@ void kmem_cache_free(kmem_cache_t *cachep, void *objp) } EXPORT_SYMBOL(kmem_cache_free); -/** - * kzalloc - allocate memory. The memory is set to zero. - * @size: how many bytes of memory are required. - * @flags: the type of memory to allocate. - */ -void *kzalloc(size_t size, gfp_t flags) -{ - void *ret = kmalloc(size, flags); - if (ret) - memset(ret, 0, size); - return ret; -} -EXPORT_SYMBOL(kzalloc); - /** * kfree - free previously allocated memory * @objp: pointer returned by kmalloc. 
@@ -3659,26 +3645,3 @@ unsigned int ksize(const void *objp) return obj_reallen(page_get_cache(virt_to_page(objp))); } - - -/* - * kstrdup - allocate space for and copy an existing string - * - * @s: the string to duplicate - * @gfp: the GFP mask used in the kmalloc() call when allocating memory - */ -char *kstrdup(const char *s, gfp_t gfp) -{ - size_t len; - char *buf; - - if (!s) - return NULL; - - len = strlen(s) + 1; - buf = kmalloc(len, gfp); - if (buf) - memcpy(buf, s, len); - return buf; -} -EXPORT_SYMBOL(kstrdup); diff --git a/mm/util.c b/mm/util.c new file mode 100644 index 00000000000..5f4bb59da63 --- /dev/null +++ b/mm/util.c @@ -0,0 +1,39 @@ +#include +#include +#include + +/** + * kzalloc - allocate memory. The memory is set to zero. + * @size: how many bytes of memory are required. + * @flags: the type of memory to allocate. + */ +void *kzalloc(size_t size, gfp_t flags) +{ + void *ret = kmalloc(size, flags); + if (ret) + memset(ret, 0, size); + return ret; +} +EXPORT_SYMBOL(kzalloc); + +/* + * kstrdup - allocate space for and copy an existing string + * + * @s: the string to duplicate + * @gfp: the GFP mask used in the kmalloc() call when allocating memory + */ +char *kstrdup(const char *s, gfp_t gfp) +{ + size_t len; + char *buf; + + if (!s) + return NULL; + + len = strlen(s) + 1; + buf = kmalloc(len, gfp); + if (buf) + memcpy(buf, s, len); + return buf; +} +EXPORT_SYMBOL(kstrdup); -- cgit v1.2.3 From 10cef6029502915bdb3cf0821d425cf9dc30c817 Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Sun, 8 Jan 2006 01:01:45 -0800 Subject: [PATCH] slob: introduce the SLOB allocator configurable replacement for slab allocator This adds a CONFIG_SLAB option under CONFIG_EMBEDDED. When CONFIG_SLAB is disabled, the kernel falls back to using the 'SLOB' allocator. SLOB is a traditional K&R/UNIX allocator with a SLAB emulation layer, similar to the original Linux kmalloc allocator that SLAB replaced. It's signicantly smaller code and is more memory efficient. But like all similar allocators, it scales poorly and suffers from fragmentation more than SLAB, so it's only appropriate for small systems. It's been tested extensively in the Linux-tiny tree. I've also stress-tested it with make -j 8 compiles on a 3G SMP+PREEMPT box (not recommended). Here's a comparison for otherwise identical builds, showing SLOB saving nearly half a megabyte of RAM: $ size vmlinux* text data bss dec hex filename 3336372 529360 190812 4056544 3de5e0 vmlinux-slab 3323208 527948 190684 4041840 3dac70 vmlinux-slob $ size mm/{slab,slob}.o text data bss dec hex filename 13221 752 48 14021 36c5 mm/slab.o 1896 52 8 1956 7a4 mm/slob.o /proc/meminfo: SLAB SLOB delta MemTotal: 27964 kB 27980 kB +16 kB MemFree: 24596 kB 25092 kB +496 kB Buffers: 36 kB 36 kB 0 kB Cached: 1188 kB 1188 kB 0 kB SwapCached: 0 kB 0 kB 0 kB Active: 608 kB 600 kB -8 kB Inactive: 808 kB 812 kB +4 kB HighTotal: 0 kB 0 kB 0 kB HighFree: 0 kB 0 kB 0 kB LowTotal: 27964 kB 27980 kB +16 kB LowFree: 24596 kB 25092 kB +496 kB SwapTotal: 0 kB 0 kB 0 kB SwapFree: 0 kB 0 kB 0 kB Dirty: 4 kB 12 kB +8 kB Writeback: 0 kB 0 kB 0 kB Mapped: 560 kB 556 kB -4 kB Slab: 1756 kB 0 kB -1756 kB CommitLimit: 13980 kB 13988 kB +8 kB Committed_AS: 4208 kB 4208 kB 0 kB PageTables: 28 kB 28 kB 0 kB VmallocTotal: 1007312 kB 1007312 kB 0 kB VmallocUsed: 48 kB 48 kB 0 kB VmallocChunk: 1007264 kB 1007264 kB 0 kB (this work has been sponsored in part by CELF) From: Ingo Molnar Fix 32-bitness bugs in mm/slob.c. 
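From the caller's side nothing changes: code written against the SLAB interface runs unchanged on top of the SLOB emulation layer added below. A rough caller-side sketch, using the entry points this patch defines (the "foo" cache and struct here are hypothetical, for illustration only):

	/*
	 * Illustrative only: a hypothetical cache of struct foo objects,
	 * exercising the kmem_cache_* API exactly as a SLAB user would.
	 */
	struct foo {
		int a, b;
	};

	static struct kmem_cache *foo_cache;

	static int foo_setup(void)
	{
		struct foo *p;

		foo_cache = kmem_cache_create("foo", sizeof(struct foo),
					      0, 0, NULL, NULL);
		if (!foo_cache)
			return -ENOMEM;

		p = kmem_cache_alloc(foo_cache, GFP_KERNEL);
		if (!p) {
			kmem_cache_destroy(foo_cache);
			return -ENOMEM;
		}
		p->a = p->b = 0;

		kmem_cache_free(foo_cache, p);
		kmem_cache_destroy(foo_cache);
		return 0;
	}

Whether the allocations above are served by SLAB or by the first-fit SLOB heap is purely a configuration choice; the object lifetime rules are identical.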
Signed-off-by: Matt Mackall Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Makefile | 4 +- mm/slob.c | 385 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 388 insertions(+), 1 deletion(-) create mode 100644 mm/slob.c (limited to 'mm') diff --git a/mm/Makefile b/mm/Makefile index 74c85ddc917..9aa03fa1dcc 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -9,7 +9,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ page_alloc.o page-writeback.o pdflush.o \ - readahead.o slab.o swap.o truncate.o vmscan.o \ + readahead.o swap.o truncate.o vmscan.o \ prio_tree.o util.o $(mmu-y) obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o @@ -18,5 +18,7 @@ obj-$(CONFIG_NUMA) += mempolicy.o obj-$(CONFIG_SPARSEMEM) += sparse.o obj-$(CONFIG_SHMEM) += shmem.o obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o +obj-$(CONFIG_SLOB) += slob.o +obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o diff --git a/mm/slob.c b/mm/slob.c new file mode 100644 index 00000000000..1c240c4b71d --- /dev/null +++ b/mm/slob.c @@ -0,0 +1,385 @@ +/* + * SLOB Allocator: Simple List Of Blocks + * + * Matt Mackall 12/30/03 + * + * How SLOB works: + * + * The core of SLOB is a traditional K&R style heap allocator, with + * support for returning aligned objects. The granularity of this + * allocator is 8 bytes on x86, though it's perhaps possible to reduce + * this to 4 if it's deemed worth the effort. The slob heap is a + * singly-linked list of pages from __get_free_page, grown on demand + * and allocation from the heap is currently first-fit. + * + * Above this is an implementation of kmalloc/kfree. Blocks returned + * from kmalloc are 8-byte aligned and prepended with a 8-byte header. + * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls + * __get_free_pages directly so that it can return page-aligned blocks + * and keeps a linked list of such pages and their orders. These + * objects are detected in kfree() by their page alignment. + * + * SLAB is emulated on top of SLOB by simply calling constructors and + * destructors for every SLAB allocation. Objects are returned with + * the 8-byte alignment unless the SLAB_MUST_HWCACHE_ALIGN flag is + * set, in which case the low-level allocator will fragment blocks to + * create the proper alignment. Again, objects of page-size or greater + * are allocated by calling __get_free_pages. As SLAB objects know + * their size, no separate size bookkeeping is necessary and there is + * essentially no allocation space overhead. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +struct slob_block { + int units; + struct slob_block *next; +}; +typedef struct slob_block slob_t; + +#define SLOB_UNIT sizeof(slob_t) +#define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT) +#define SLOB_ALIGN L1_CACHE_BYTES + +struct bigblock { + int order; + void *pages; + struct bigblock *next; +}; +typedef struct bigblock bigblock_t; + +static slob_t arena = { .next = &arena, .units = 1 }; +static slob_t *slobfree = &arena; +static bigblock_t *bigblocks; +static DEFINE_SPINLOCK(slob_lock); +static DEFINE_SPINLOCK(block_lock); + +static void slob_free(void *b, int size); + +static void *slob_alloc(size_t size, gfp_t gfp, int align) +{ + slob_t *prev, *cur, *aligned = 0; + int delta = 0, units = SLOB_UNITS(size); + unsigned long flags; + + spin_lock_irqsave(&slob_lock, flags); + prev = slobfree; + for (cur = prev->next; ; prev = cur, cur = cur->next) { + if (align) { + aligned = (slob_t *)ALIGN((unsigned long)cur, align); + delta = aligned - cur; + } + if (cur->units >= units + delta) { /* room enough? */ + if (delta) { /* need to fragment head to align? */ + aligned->units = cur->units - delta; + aligned->next = cur->next; + cur->next = aligned; + cur->units = delta; + prev = cur; + cur = aligned; + } + + if (cur->units == units) /* exact fit? */ + prev->next = cur->next; /* unlink */ + else { /* fragment */ + prev->next = cur + units; + prev->next->units = cur->units - units; + prev->next->next = cur->next; + cur->units = units; + } + + slobfree = prev; + spin_unlock_irqrestore(&slob_lock, flags); + return cur; + } + if (cur == slobfree) { + spin_unlock_irqrestore(&slob_lock, flags); + + if (size == PAGE_SIZE) /* trying to shrink arena? */ + return 0; + + cur = (slob_t *)__get_free_page(gfp); + if (!cur) + return 0; + + slob_free(cur, PAGE_SIZE); + spin_lock_irqsave(&slob_lock, flags); + cur = slobfree; + } + } +} + +static void slob_free(void *block, int size) +{ + slob_t *cur, *b = (slob_t *)block; + unsigned long flags; + + if (!block) + return; + + if (size) + b->units = SLOB_UNITS(size); + + /* Find reinsertion point */ + spin_lock_irqsave(&slob_lock, flags); + for (cur = slobfree; !(b > cur && b < cur->next); cur = cur->next) + if (cur >= cur->next && (b > cur || b < cur->next)) + break; + + if (b + b->units == cur->next) { + b->units += cur->next->units; + b->next = cur->next->next; + } else + b->next = cur->next; + + if (cur + cur->units == b) { + cur->units += b->units; + cur->next = b->next; + } else + cur->next = b; + + slobfree = cur; + + spin_unlock_irqrestore(&slob_lock, flags); +} + +static int FASTCALL(find_order(int size)); +static int fastcall find_order(int size) +{ + int order = 0; + for ( ; size > 4096 ; size >>=1) + order++; + return order; +} + +void *kmalloc(size_t size, gfp_t gfp) +{ + slob_t *m; + bigblock_t *bb; + unsigned long flags; + + if (size < PAGE_SIZE - SLOB_UNIT) { + m = slob_alloc(size + SLOB_UNIT, gfp, 0); + return m ? 
(void *)(m + 1) : 0; + } + + bb = slob_alloc(sizeof(bigblock_t), gfp, 0); + if (!bb) + return 0; + + bb->order = find_order(size); + bb->pages = (void *)__get_free_pages(gfp, bb->order); + + if (bb->pages) { + spin_lock_irqsave(&block_lock, flags); + bb->next = bigblocks; + bigblocks = bb; + spin_unlock_irqrestore(&block_lock, flags); + return bb->pages; + } + + slob_free(bb, sizeof(bigblock_t)); + return 0; +} + +EXPORT_SYMBOL(kmalloc); + +void kfree(const void *block) +{ + bigblock_t *bb, **last = &bigblocks; + unsigned long flags; + + if (!block) + return; + + if (!((unsigned long)block & (PAGE_SIZE-1))) { + /* might be on the big block list */ + spin_lock_irqsave(&block_lock, flags); + for (bb = bigblocks; bb; last = &bb->next, bb = bb->next) { + if (bb->pages == block) { + *last = bb->next; + spin_unlock_irqrestore(&block_lock, flags); + free_pages((unsigned long)block, bb->order); + slob_free(bb, sizeof(bigblock_t)); + return; + } + } + spin_unlock_irqrestore(&block_lock, flags); + } + + slob_free((slob_t *)block - 1, 0); + return; +} + +EXPORT_SYMBOL(kfree); + +unsigned int ksize(const void *block) +{ + bigblock_t *bb; + unsigned long flags; + + if (!block) + return 0; + + if (!((unsigned long)block & (PAGE_SIZE-1))) { + spin_lock_irqsave(&block_lock, flags); + for (bb = bigblocks; bb; bb = bb->next) + if (bb->pages == block) { + spin_unlock_irqrestore(&slob_lock, flags); + return PAGE_SIZE << bb->order; + } + spin_unlock_irqrestore(&block_lock, flags); + } + + return ((slob_t *)block - 1)->units * SLOB_UNIT; +} + +struct kmem_cache { + unsigned int size, align; + const char *name; + void (*ctor)(void *, struct kmem_cache *, unsigned long); + void (*dtor)(void *, struct kmem_cache *, unsigned long); +}; + +struct kmem_cache *kmem_cache_create(const char *name, size_t size, + size_t align, unsigned long flags, + void (*ctor)(void*, struct kmem_cache *, unsigned long), + void (*dtor)(void*, struct kmem_cache *, unsigned long)) +{ + struct kmem_cache *c; + + c = slob_alloc(sizeof(struct kmem_cache), flags, 0); + + if (c) { + c->name = name; + c->size = size; + c->ctor = ctor; + c->dtor = dtor; + /* ignore alignment unless it's forced */ + c->align = (flags & SLAB_MUST_HWCACHE_ALIGN) ? 
SLOB_ALIGN : 0; + if (c->align < align) + c->align = align; + } + + return c; +} +EXPORT_SYMBOL(kmem_cache_create); + +int kmem_cache_destroy(struct kmem_cache *c) +{ + slob_free(c, sizeof(struct kmem_cache)); + return 0; +} +EXPORT_SYMBOL(kmem_cache_destroy); + +void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags) +{ + void *b; + + if (c->size < PAGE_SIZE) + b = slob_alloc(c->size, flags, c->align); + else + b = (void *)__get_free_pages(flags, find_order(c->size)); + + if (c->ctor) + c->ctor(b, c, SLAB_CTOR_CONSTRUCTOR); + + return b; +} +EXPORT_SYMBOL(kmem_cache_alloc); + +void kmem_cache_free(struct kmem_cache *c, void *b) +{ + if (c->dtor) + c->dtor(b, c, 0); + + if (c->size < PAGE_SIZE) + slob_free(b, c->size); + else + free_pages((unsigned long)b, find_order(c->size)); +} +EXPORT_SYMBOL(kmem_cache_free); + +unsigned int kmem_cache_size(struct kmem_cache *c) +{ + return c->size; +} +EXPORT_SYMBOL(kmem_cache_size); + +const char *kmem_cache_name(struct kmem_cache *c) +{ + return c->name; +} +EXPORT_SYMBOL(kmem_cache_name); + +static struct timer_list slob_timer = TIMER_INITIALIZER( + (void (*)(unsigned long))kmem_cache_init, 0, 0); + +void kmem_cache_init(void) +{ + void *p = slob_alloc(PAGE_SIZE, 0, PAGE_SIZE-1); + + if (p) + free_page((unsigned long)p); + + mod_timer(&slob_timer, jiffies + HZ); +} + +atomic_t slab_reclaim_pages = ATOMIC_INIT(0); +EXPORT_SYMBOL(slab_reclaim_pages); + +#ifdef CONFIG_SMP + +void *__alloc_percpu(size_t size, size_t align) +{ + int i; + struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL); + + if (!pdata) + return NULL; + + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_possible(i)) + continue; + pdata->ptrs[i] = kmalloc(size, GFP_KERNEL); + if (!pdata->ptrs[i]) + goto unwind_oom; + memset(pdata->ptrs[i], 0, size); + } + + /* Catch derefs w/o wrappers */ + return (void *) (~(unsigned long) pdata); + +unwind_oom: + while (--i >= 0) { + if (!cpu_possible(i)) + continue; + kfree(pdata->ptrs[i]); + } + kfree(pdata); + return NULL; +} +EXPORT_SYMBOL(__alloc_percpu); + +void +free_percpu(const void *objp) +{ + int i; + struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp); + + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_possible(i)) + continue; + kfree(p->ptrs[i]); + } + kfree(p); +} +EXPORT_SYMBOL(free_percpu); + +#endif -- cgit v1.2.3 From 5966514db662fb24c9bb43226a80106bcffd51f8 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:01:47 -0800 Subject: [PATCH] cpuset: mempolicy one more nodemask conversion Finish converting mm/mempolicy.c from bitmaps to nodemasks. The previous conversion had left one routine using bitmaps, since it involved a corresponding change to kernel/cpuset.c Fix that interface by replacing with a simple macro that calls nodes_subset(), or if !CONFIG_CPUSET, returns (1). 
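The replacement interface described above reduces to a nodes_subset() test when cpusets are configured. The macro itself lives in the cpuset header, outside the mm/ diff shown here, so treat the exact spelling below as an assumption; it is only a sketch of the shape the commit message describes:

	/* Assumed spelling -- the real definition is not part of this diff. */
	#ifdef CONFIG_CPUSETS
	#define cpuset_nodes_subset_current_mems_allowed(nodes) \
			nodes_subset((nodes), current->mems_allowed)
	#else
	#define cpuset_nodes_subset_current_mems_allowed(nodes) (1)
	#endif

Its one caller in this diff, contextualize_policy(), then simply rejects any policy whose nodes are not a subset of current->mems_allowed.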
Signed-off-by: Paul Jackson Cc: Christoph Lameter Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 7051fe450e9..9dea2b8a7d4 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -387,10 +387,9 @@ static int contextualize_policy(int mode, nodemask_t *nodes) if (!nodes) return 0; - /* Update current mems_allowed */ cpuset_update_current_mems_allowed(); - /* Ignore nodes not set in current->mems_allowed */ - cpuset_restrict_to_mems_allowed(nodes->bits); + if (!cpuset_nodes_subset_current_mems_allowed(*nodes)) + return -EINVAL; return mpol_check_policy(mode, nodes); } -- cgit v1.2.3 From 3e0d98b9f1eb757fc98efc84e74e54a08308aa73 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:01:49 -0800 Subject: [PATCH] cpuset: memory pressure meter Provide a simple per-cpuset metric of memory pressure, tracking the -rate- that the tasks in a cpuset call try_to_free_pages(), the synchronous (direct) memory reclaim code. This enables batch managers monitoring jobs running in dedicated cpusets to efficiently detect what level of memory pressure that job is causing. This is useful both on tightly managed systems running a wide mix of submitted jobs, which may choose to terminate or reprioritize jobs that are trying to use more memory than allowed on the nodes assigned them, and with tightly coupled, long running, massively parallel scientific computing jobs that will dramatically fail to meet required performance goals if they start to use more memory than allowed to them. This patch just provides a very economical way for the batch manager to monitor a cpuset for signs of memory pressure. It's up to the batch manager or other user code to decide what to do about it and take action. ==> Unless this feature is enabled by writing "1" to the special file /dev/cpuset/memory_pressure_enabled, the hook in the rebalance code of __alloc_pages() for this metric reduces to simply noticing that the cpuset_memory_pressure_enabled flag is zero. So only systems that enable this feature will compute the metric. Why a per-cpuset, running average: Because this meter is per-cpuset, rather than per-task or mm, the system load imposed by a batch scheduler monitoring this metric is sharply reduced on large systems, because a scan of the tasklist can be avoided on each set of queries. Because this meter is a running average, instead of an accumulating counter, a batch scheduler can detect memory pressure with a single read, instead of having to read and accumulate results for a period of time. Because this meter is per-cpuset rather than per-task or mm, the batch scheduler can obtain the key information, memory pressure in a cpuset, with a single read, rather than having to query and accumulate results over all the (dynamically changing) set of tasks in the cpuset. A per-cpuset simple digital filter (requires a spinlock and 3 words of data per-cpuset) is kept, and updated by any task attached to that cpuset, if it enters the synchronous (direct) page reclaim code. A per-cpuset file provides an integer number representing the recent (half-life of 10 seconds) rate of direct page reclaims caused by the tasks in the cpuset, in units of reclaims attempted per second, times 1000. 
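The filter itself lives in kernel/cpuset.c and is not part of the mm/ hook below. As a rough model of the behaviour described above (a rate that decays with a 10 second half-life, reported as reclaims per second times 1000), here is a small standalone sketch; the names and the exact decay arithmetic are assumptions for illustration, not the kernel code:

	/* Build with: cc -o fmeter fmeter.c -lm  (illustrative model only) */
	#include <math.h>
	#include <stdio.h>
	#include <time.h>

	#define HALF_LIFE_SECS 10.0

	struct rate_meter {
		double value;	/* decayed event count, scaled by 1000 */
		time_t last;	/* time of the last update */
	};

	static void meter_decay(struct rate_meter *m, time_t now)
	{
		double elapsed = difftime(now, m->last);

		m->value *= pow(0.5, elapsed / HALF_LIFE_SECS);
		m->last = now;
	}

	/* Called once per synchronous (direct) reclaim event. */
	static void meter_mark(struct rate_meter *m, time_t now)
	{
		meter_decay(m, now);
		m->value += 1000.0;
	}

	/* Recent rate: events per second, times 1000, smoothed over the half-life. */
	static long meter_read(struct rate_meter *m, time_t now)
	{
		meter_decay(m, now);
		return (long)(m->value / HALF_LIFE_SECS);
	}

	int main(void)
	{
		struct rate_meter m = { .value = 0.0, .last = time(NULL) };
		time_t t = m.last;
		int i;

		for (i = 0; i < 50; i++)
			meter_mark(&m, t);	/* a burst of 50 direct reclaims */

		printf("rate now      : %ld\n", meter_read(&m, t));
		printf("rate after 10s: %ld\n", meter_read(&m, t + 10));
		printf("rate after 60s: %ld\n", meter_read(&m, t + 60));
		return 0;
	}

A batch manager polling the per-cpuset file sees the same property the sketch prints: a burst of reclaims produces a high reading that halves every ten seconds once the pressure stops, so a single read is enough to judge recent behaviour.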
Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ad3d0202cde..e0e84924171 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -976,6 +976,7 @@ rebalance: cond_resched(); /* We now go into synchronous reclaim */ + cpuset_memory_pressure_bump(); p->flags |= PF_MEMALLOC; reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; -- cgit v1.2.3 From cf2a473c4089aa41c26f653200673f5a4cc25047 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:01:54 -0800 Subject: [PATCH] cpuset: combine refresh_mems and update_mems The important code paths through alloc_pages_current() and alloc_page_vma(), by which most kernel page allocations go, both called cpuset_update_current_mems_allowed(), which in turn called refresh_mems(). -Both- of these latter two routines did a tasklock, got the tasks cpuset pointer, and checked for out of date cpuset->mems_generation. That was a silly duplication of code and waste of CPU cycles on an important code path. Consolidated those two routines into a single routine, called cpuset_update_task_memory_state(), since it updates more than just mems_allowed. Changed all callers of either routine to call the new consolidated routine. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 9dea2b8a7d4..515bfeee027 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -387,7 +387,7 @@ static int contextualize_policy(int mode, nodemask_t *nodes) if (!nodes) return 0; - cpuset_update_current_mems_allowed(); + cpuset_update_task_memory_state(); if (!cpuset_nodes_subset_current_mems_allowed(*nodes)) return -EINVAL; return mpol_check_policy(mode, nodes); @@ -461,7 +461,7 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, struct vm_area_struct *vma = NULL; struct mempolicy *pol = current->mempolicy; - cpuset_update_current_mems_allowed(); + cpuset_update_task_memory_state(); if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) return -EINVAL; if (flags & MPOL_F_ADDR) { @@ -1089,7 +1089,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = get_vma_policy(current, vma, addr); - cpuset_update_current_mems_allowed(); + cpuset_update_task_memory_state(); if (unlikely(pol->policy == MPOL_INTERLEAVE)) { unsigned nid; @@ -1115,7 +1115,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) * interrupt context and apply the current process NUMA policy. * Returns NULL when no page can be allocated. * - * Don't call cpuset_update_current_mems_allowed() unless + * Don't call cpuset_update_task_memory_state() unless * 1) it's ok to take cpuset_sem (can WAIT), and * 2) allocating for current task (not interrupt). 
*/ @@ -1124,7 +1124,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) struct mempolicy *pol = current->mempolicy; if ((gfp & __GFP_WAIT) && !in_interrupt()) - cpuset_update_current_mems_allowed(); + cpuset_update_task_memory_state(); if (!pol || in_interrupt()) pol = &default_policy; if (pol->policy == MPOL_INTERLEAVE) -- cgit v1.2.3 From 909d75a3b77bdd8baa9429bad3b69a654d2954ce Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:01:55 -0800 Subject: [PATCH] cpuset: implement cpuset_mems_allowed Provide a cpuset_mems_allowed() method, which the sys_migrate_pages() code needed, to obtain the mems_allowed vector of a cpuset, and replaced the workaround in sys_migrate_pages() to call this new method. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 515bfeee027..34d566ac147 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -772,9 +772,6 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, return do_set_mempolicy(mode, &nodes); } -/* Macro needed until Paul implements this function in kernel/cpusets.c */ -#define cpuset_mems_allowed(task) node_online_map - asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, const unsigned long __user *old_nodes, const unsigned long __user *new_nodes) -- cgit v1.2.3 From 74cb21553f4bf244185b9bec4c26e4e3169ad55e Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:01:56 -0800 Subject: [PATCH] cpuset: numa_policy_rebind cleanup Cleanup, reorganize and make more robust the mempolicy.c code to rebind mempolicies relative to the containing cpuset after a tasks memory placement changes. The real motivator for this cleanup patch is to lay more groundwork for the upcoming patch to correctly rebind NUMA mempolicies that are attached to vma's after the containing cpuset memory placement changes. NUMA mempolicies are constrained by the cpuset their task is a member of. When either (1) a task is moved to a different cpuset, or (2) the 'mems' mems_allowed of a cpuset is changed, then the NUMA mempolicies have embedded node numbers (for MPOL_BIND, MPOL_INTERLEAVE and MPOL_PREFERRED) that need to be recalculated, relative to their new cpuset placement. The old code used an unreliable method of determining what was the old mems_allowed constraining the mempolicy. It just looked at the tasks mems_allowed value. This sort of worked with the present code, that just rebinds the -task- mempolicy, and leaves any -vma- mempolicies broken, referring to the old nodes. But in an upcoming patch, the vma mempolicies will be rebound as well. Then the order in which the various task and vma mempolicies are updated will no longer be deterministic, and one can no longer count on the task->mems_allowed holding the old value for as long as needed. It's not even clear if the current code was guaranteed to work reliably for task mempolicies. So I added a mems_allowed field to each mempolicy, stating exactly what mems_allowed the policy is relative to, and updated synchronously and reliably anytime that the mempolicy is rebound. Also removed a useless wrapper routine, numa_policy_rebind(), and had its caller, cpuset_update_task_memory_state(), call directly to the rewritten policy_rebind() routine, and made that rebind routine extern instead of static, and added a "mpol_" prefix to its name, making it mpol_rebind_policy(). 
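Since rebinding is entirely a matter of remapping node numbers from the mems_allowed the policy was created under to the new mems_allowed, a small worked example may help. This sketch only exercises the nodemask helpers that mpol_rebind_policy() uses; the node numbers are made up.

	#include <linux/nodemask.h>

	static void rebind_example(void)
	{
		nodemask_t oldmask, newmask, il, tmp;

		nodes_clear(oldmask);		/* policy was built while the cpuset */
		node_set(0, oldmask);		/* allowed nodes 0-1 ...             */
		node_set(1, oldmask);

		nodes_clear(newmask);		/* ... and the cpuset now allows 4-5 */
		node_set(4, newmask);
		node_set(5, newmask);

		nodes_clear(il);		/* MPOL_INTERLEAVE over node 1 only */
		node_set(1, il);

		/*
		 * Node 1 is the second node of the old mask, so it maps to the
		 * second node of the new mask: the policy now interleaves over
		 * node 5.
		 */
		nodes_remap(tmp, il, oldmask, newmask);
		il = tmp;
	}

Storing cpuset_mems_allowed in the policy means the 'oldmask' above is always known exactly, no matter when or how often task->mems_allowed has changed in the meantime.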
Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 34d566ac147..c39bd86f4ea 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -180,6 +180,7 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) break; } policy->policy = mode; + policy->cpuset_mems_allowed = cpuset_mems_allowed(current); return policy; } @@ -1411,25 +1412,31 @@ void numa_default_policy(void) } /* Migrate a policy to a different set of nodes */ -static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, - const nodemask_t *new) +void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) { + nodemask_t *mpolmask; nodemask_t tmp; if (!pol) return; + mpolmask = &pol->cpuset_mems_allowed; + if (nodes_equal(*mpolmask, *newmask)) + return; switch (pol->policy) { case MPOL_DEFAULT: break; case MPOL_INTERLEAVE: - nodes_remap(tmp, pol->v.nodes, *old, *new); + nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask); pol->v.nodes = tmp; - current->il_next = node_remap(current->il_next, *old, *new); + *mpolmask = *newmask; + current->il_next = node_remap(current->il_next, + *mpolmask, *newmask); break; case MPOL_PREFERRED: pol->v.preferred_node = node_remap(pol->v.preferred_node, - *old, *new); + *mpolmask, *newmask); + *mpolmask = *newmask; break; case MPOL_BIND: { nodemask_t nodes; @@ -1439,7 +1446,7 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, nodes_clear(nodes); for (z = pol->v.zonelist->zones; *z; z++) node_set((*z)->zone_pgdat->node_id, nodes); - nodes_remap(tmp, nodes, *old, *new); + nodes_remap(tmp, nodes, *mpolmask, *newmask); nodes = tmp; zonelist = bind_zonelist(&nodes); @@ -1454,6 +1461,7 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, kfree(pol->v.zonelist); pol->v.zonelist = zonelist; } + *mpolmask = *newmask; break; } default: @@ -1463,14 +1471,13 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, } /* - * Someone moved this task to different nodes. Fixup mempolicies. - * - * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well, - * once we have a cpuset mechanism to mark which cpuset subtree is migrating. + * Wrapper for mpol_rebind_policy() that just requires task + * pointer, and updates task mempolicy. */ -void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new) + +void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) { - rebind_policy(current->mempolicy, old, new); + mpol_rebind_policy(tsk->mempolicy, new); } /* -- cgit v1.2.3 From 4225399a66b315d4d1fb1cb61b75dda201c832e3 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:01:59 -0800 Subject: [PATCH] cpuset: rebind vma mempolicies fix Fix more of longstanding bug in cpuset/mempolicy interaction. NUMA mempolicies (mm/mempolicy.c) are constrained by the current tasks cpuset to just the Memory Nodes allowed by that cpuset. The kernel maintains internal state for each mempolicy, tracking what nodes are used for the MPOL_INTERLEAVE, MPOL_BIND or MPOL_PREFERRED policies. When a tasks cpuset memory placement changes, whether because the cpuset changed, or because the task was attached to a different cpuset, then the tasks mempolicies have to be rebound to the new cpuset placement, so as to preserve the cpuset-relative numbering of the nodes in that policy. 
An earlier fix handled such mempolicy rebinding for mempolicies attached to a task. This fix rebinds mempolicies attached to vma's (address ranges in a tasks address space.) Due to the need to hold the task->mm->mmap_sem semaphore while updating vma's, the rebinding of vma mempolicies has to be done when the cpuset memory placement is changed, at which time mmap_sem can be safely acquired. The tasks mempolicy is rebound later, when the task next attempts to allocate memory and notices that its task->cpuset_mems_generation is out-of-date with its cpusets mems_generation. Because walking the tasklist to find all tasks attached to a changing cpuset requires holding tasklist_lock, a spinlock, one cannot update the vma's of the affected tasks while doing the tasklist scan. In general, one cannot acquire a semaphore (which can sleep) while already holding a spinlock (such as tasklist_lock). So a list of mm references has to be built up during the tasklist scan, then the tasklist lock dropped, then for each mm, its mmap_sem acquired, and the vma's in that mm rebound. Once the tasklist lock is dropped, affected tasks may fork new tasks, before their mm's are rebound. A kernel global 'cpuset_being_rebound' is set to point to the cpuset being rebound (there can only be one; cpuset modifications are done under a global 'manage_sem' semaphore), and the mpol_copy code that is used to copy a tasks mempolicies during fork catches such forking tasks, and ensures their children are also rebound. When a task is moved to a different cpuset, it is easier, as there is only one task involved. It's mm->vma's are scanned, using the same mpol_rebind_policy() as used above. It may happen that both the mpol_copy hook and the update done via the tasklist scan update the same mm twice. This is ok, as the mempolicies of each vma in an mm keep track of what mems_allowed they are relative to, and safely no-op a second request to rebind to the same nodes. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index c39bd86f4ea..1850d0aef4a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1131,6 +1131,15 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) } EXPORT_SYMBOL(alloc_pages_current); +/* + * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it + * rebinds the mempolicy its copying by calling mpol_rebind_policy() + * with the mems_allowed returned by cpuset_mems_allowed(). This + * keeps mempolicies cpuset relative after its cpuset moves. See + * further kernel/cpuset.c update_nodemask(). + */ +void *cpuset_being_rebound; + /* Slow path of a mempolicy copy */ struct mempolicy *__mpol_copy(struct mempolicy *old) { @@ -1138,6 +1147,10 @@ struct mempolicy *__mpol_copy(struct mempolicy *old) if (!new) return ERR_PTR(-ENOMEM); + if (current_cpuset_is_being_rebound()) { + nodemask_t mems = cpuset_mems_allowed(current); + mpol_rebind_policy(old, &mems); + } *new = *old; atomic_set(&new->refcnt, 1); if (new->policy == MPOL_BIND) { @@ -1480,6 +1493,22 @@ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) mpol_rebind_policy(tsk->mempolicy, new); } +/* + * Rebind each vma in mm to new nodemask. + * + * Call holding a reference to mm. Takes mm->mmap_sem during call. 
+ */ + +void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) +{ + struct vm_area_struct *vma; + + down_write(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) + mpol_rebind_policy(vma->vm_policy, new); + up_write(&mm->mmap_sem); +} + /* * Display pages allocated per node and memory policy via /proc. */ -- cgit v1.2.3 From 268fc16e343b4f8e249468747db2e658da46a814 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Sun, 8 Jan 2006 01:02:12 -0800 Subject: [PATCH] export/change sync_page_range/_nolock() This exports/changes the sync_page_range/_nolock(). The fatfs needs sync_page_range/_nolock() for expanding truncate, and changes "size_t count" to "loff_t count". Signed-off-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 4ef24a39768..8fdf3650802 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -280,7 +280,7 @@ static int wait_on_page_writeback_range(struct address_space *mapping, * it is otherwise livelockable. */ int sync_page_range(struct inode *inode, struct address_space *mapping, - loff_t pos, size_t count) + loff_t pos, loff_t count) { pgoff_t start = pos >> PAGE_CACHE_SHIFT; pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; @@ -305,9 +305,8 @@ EXPORT_SYMBOL(sync_page_range); * as it forces O_SYNC writers to different parts of the same file * to be serialised right until io completion. */ -static int sync_page_range_nolock(struct inode *inode, - struct address_space *mapping, - loff_t pos, size_t count) +int sync_page_range_nolock(struct inode *inode, struct address_space *mapping, + loff_t pos, loff_t count) { pgoff_t start = pos >> PAGE_CACHE_SHIFT; pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; @@ -322,6 +321,7 @@ static int sync_page_range_nolock(struct inode *inode, ret = wait_on_page_writeback_range(mapping, start, end); return ret; } +EXPORT_SYMBOL(sync_page_range_nolock); /** * filemap_fdatawait - walk the list of under-writeback pages of the given -- cgit v1.2.3 From 28fd129827b00e12829d48a5290f46277600619b Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Sun, 8 Jan 2006 01:02:14 -0800 Subject: [PATCH] Fix and add EXPORT_SYMBOL(filemap_write_and_wait) This patch add EXPORT_SYMBOL(filemap_write_and_wait) and use it. See mm/filemap.c: And changes the filemap_write_and_wait() and filemap_write_and_wait_range(). Current filemap_write_and_wait() doesn't wait if filemap_fdatawrite() returns error. However, even if filemap_fdatawrite() returned an error, it may have submitted the partially data pages to the device. (e.g. in the case of -ENOSPC) Andrew Morton writes, If filemap_fdatawrite() returns an error, this might be due to some I/O problem: dead disk, unplugged cable, etc. Given the generally crappy quality of the kernel's handling of such exceptions, there's a good chance that the filemap_fdatawait() will get stuck in D state forever. So, this patch doesn't wait if filemap_fdatawrite() returns the -EIO. Trond, could you please review the nfs part? Especially I'm not sure, nfs must use the "filemap_fdatawrite(inode->i_mapping) == 0", or not. 
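Condensed, the rule the hunks below implement looks like this; it is only a restatement for readability, not a replacement for the actual diff:

	#include <linux/fs.h>

	static int write_and_wait_sketch(struct address_space *mapping)
	{
		int err = filemap_fdatawrite(mapping);

		/*
		 * Even on error the write may have queued some pages
		 * (e.g. -ENOSPC), so still wait for them -- except after
		 * -EIO, where waiting could hang forever.
		 */
		if (err != -EIO) {
			int err2 = filemap_fdatawait(mapping);

			if (!err)
				err = err2;	/* report the first failure seen */
		}
		return err;
	}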
Acked-by: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 40 +++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 8fdf3650802..478f4c74cc3 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -343,30 +343,44 @@ EXPORT_SYMBOL(filemap_fdatawait); int filemap_write_and_wait(struct address_space *mapping) { - int retval = 0; + int err = 0; if (mapping->nrpages) { - retval = filemap_fdatawrite(mapping); - if (retval == 0) - retval = filemap_fdatawait(mapping); + err = filemap_fdatawrite(mapping); + /* + * Even if the above returned error, the pages may be + * written partially (e.g. -ENOSPC), so we wait for it. + * But the -EIO is special case, it may indicate the worst + * thing (e.g. bug) happened, so we avoid waiting for it. + */ + if (err != -EIO) { + int err2 = filemap_fdatawait(mapping); + if (!err) + err = err2; + } } - return retval; + return err; } +EXPORT_SYMBOL(filemap_write_and_wait); int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend) { - int retval = 0; + int err = 0; if (mapping->nrpages) { - retval = __filemap_fdatawrite_range(mapping, lstart, lend, - WB_SYNC_ALL); - if (retval == 0) - retval = wait_on_page_writeback_range(mapping, - lstart >> PAGE_CACHE_SHIFT, - lend >> PAGE_CACHE_SHIFT); + err = __filemap_fdatawrite_range(mapping, lstart, lend, + WB_SYNC_ALL); + /* See comment of filemap_write_and_wait() */ + if (err != -EIO) { + int err2 = wait_on_page_writeback_range(mapping, + lstart >> PAGE_CACHE_SHIFT, + lend >> PAGE_CACHE_SHIFT); + if (!err) + err = err2; + } } - return retval; + return err; } /* -- cgit v1.2.3 From 87ba81dba431232548ce29d5d224115d0c2355ac Mon Sep 17 00:00:00 2001 From: Valentine Barshak Date: Sun, 8 Jan 2006 01:03:44 -0800 Subject: [PATCH] fadvise: return ESPIPE on FIFO/pipe The patch makes posix_fadvise return ESPIPE on FIFO/pipe in order to be fully POSIX-compliant. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/fadvise.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mm') diff --git a/mm/fadvise.c b/mm/fadvise.c index 5f19e87bc5a..d257c89e770 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -37,6 +37,11 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) if (!file) return -EBADF; + if (S_ISFIFO(file->f_dentry->d_inode->i_mode)) { + ret = -ESPIPE; + goto out; + } + mapping = file->f_mapping; if (!mapping || len < 0) { ret = -EINVAL; -- cgit v1.2.3 From de5097c2e73f826302cd8957c225b3725e0c7553 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 9 Jan 2006 15:59:21 -0800 Subject: [PATCH] mutex subsystem, more debugging code more mutex debugging: check for held locks during memory freeing, task exit, enable sysrq printouts, etc. Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven --- mm/page_alloc.c | 3 +++ mm/slab.c | 1 + 2 files changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e0e84924171..a5e6891f7bb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -415,6 +415,9 @@ static void __free_pages_ok(struct page *page, unsigned int order) int reserved = 0; arch_free_page(page, order); + if (!PageHighMem(page)) + mutex_debug_check_no_locks_freed(page_address(page), + page_address(page+(1< Date: Mon, 9 Jan 2006 15:59:24 -0800 Subject: [PATCH] mutex subsystem, semaphore to mutex: VFS, ->i_sem This patch converts the inode semaphore to a mutex. 
I have tested it on XFS and compiled as much as one can consider on an ia64. Anyway your luck with it might be different. Modified-by: Ingo Molnar (finished the conversion) Signed-off-by: Jes Sorensen Signed-off-by: Ingo Molnar --- mm/filemap.c | 30 +++++++++++++++--------------- mm/filemap_xip.c | 6 +++--- mm/memory.c | 4 ++-- mm/msync.c | 2 +- mm/rmap.c | 8 ++++---- mm/shmem.c | 6 +++--- mm/swapfile.c | 8 ++++---- mm/truncate.c | 2 +- 8 files changed, 33 insertions(+), 33 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 478f4c74cc3..5fca2737c97 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -61,7 +61,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, * ->swap_lock (exclusive_swap_page, others) * ->mapping->tree_lock * - * ->i_sem + * ->i_mutex * ->i_mmap_lock (truncate->unmap_mapping_range) * * ->mmap_sem @@ -73,9 +73,9 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, * ->lock_page (access_process_vm) * * ->mmap_sem - * ->i_sem (msync) + * ->i_mutex (msync) * - * ->i_sem + * ->i_mutex * ->i_alloc_sem (various) * * ->inode_lock @@ -276,7 +276,7 @@ static int wait_on_page_writeback_range(struct address_space *mapping, * integrity" operation. It waits upon in-flight writeout before starting and * waiting upon new writeout. If there was an IO error, return it. * - * We need to re-take i_sem during the generic_osync_inode list walk because + * We need to re-take i_mutex during the generic_osync_inode list walk because * it is otherwise livelockable. */ int sync_page_range(struct inode *inode, struct address_space *mapping, @@ -290,9 +290,9 @@ int sync_page_range(struct inode *inode, struct address_space *mapping, return 0; ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1); if (ret == 0) { - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); ret = generic_osync_inode(inode, mapping, OSYNC_METADATA); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); } if (ret == 0) ret = wait_on_page_writeback_range(mapping, start, end); @@ -301,7 +301,7 @@ int sync_page_range(struct inode *inode, struct address_space *mapping, EXPORT_SYMBOL(sync_page_range); /* - * Note: Holding i_sem across sync_page_range_nolock is not a good idea + * Note: Holding i_mutex across sync_page_range_nolock is not a good idea * as it forces O_SYNC writers to different parts of the same file * to be serialised right until io completion. */ @@ -1892,7 +1892,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, /* * Sync the fs metadata but not the minor inode changes and * of course not the data as we did direct DMA for the IO. - * i_sem is held, which protects generic_osync_inode() from + * i_mutex is held, which protects generic_osync_inode() from * livelocking. 
*/ if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { @@ -2195,10 +2195,10 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf, BUG_ON(iocb->ki_pos != pos); - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { ssize_t err; @@ -2220,9 +2220,9 @@ ssize_t generic_file_write(struct file *file, const char __user *buf, struct iovec local_iov = { .iov_base = (void __user *)buf, .iov_len = count }; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); ret = __generic_file_write_nolock(file, &local_iov, 1, ppos); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { ssize_t err; @@ -2256,9 +2256,9 @@ ssize_t generic_file_writev(struct file *file, const struct iovec *iov, struct inode *inode = mapping->host; ssize_t ret; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); ret = __generic_file_write_nolock(file, iov, nr_segs, ppos); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { int err; @@ -2272,7 +2272,7 @@ ssize_t generic_file_writev(struct file *file, const struct iovec *iov, EXPORT_SYMBOL(generic_file_writev); /* - * Called under i_sem for writes to S_ISREG files. Returns -EIO if something + * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something * went wrong during pagecache shootdown. */ static ssize_t diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 9cf687e4a29..e2b34e95913 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -338,7 +338,7 @@ __xip_file_write(struct file *filp, const char __user *buf, *ppos = pos; /* * No need to use i_size_read() here, the i_size - * cannot change under us because we hold i_sem. + * cannot change under us because we hold i_mutex. */ if (pos > inode->i_size) { i_size_write(inode, pos); @@ -358,7 +358,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len, loff_t pos; ssize_t ret; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); if (!access_ok(VERIFY_READ, buf, len)) { ret=-EFAULT; @@ -390,7 +390,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len, out_backing: current->backing_dev_info = NULL; out_up: - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); return ret; } EXPORT_SYMBOL_GPL(xip_file_write); diff --git a/mm/memory.c b/mm/memory.c index 3944fec3801..7a11ddd5060 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1784,13 +1784,13 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) if (!inode->i_op || !inode->i_op->truncate_range) return -ENOSYS; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); down_write(&inode->i_alloc_sem); unmap_mapping_range(mapping, offset, (end - offset), 1); truncate_inode_pages_range(mapping, offset, end); inode->i_op->truncate_range(inode, offset, end); up_write(&inode->i_alloc_sem); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); return 0; } diff --git a/mm/msync.c b/mm/msync.c index 1b5b6f662dc..3563a56e1a5 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -137,7 +137,7 @@ static int msync_interval(struct vm_area_struct *vma, ret = filemap_fdatawrite(mapping); if (file->f_op && file->f_op->fsync) { /* - * We don't take i_sem here because mmap_sem + * We don't take i_mutex here because mmap_sem * is already held. 
*/ err = file->f_op->fsync(file,file->f_dentry,1); diff --git a/mm/rmap.c b/mm/rmap.c index 66ec43053a4..dfbb89f99a1 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -20,13 +20,13 @@ /* * Lock ordering in mm: * - * inode->i_sem (while writing or truncating, not reading or faulting) + * inode->i_mutex (while writing or truncating, not reading or faulting) * inode->i_alloc_sem * * When a page fault occurs in writing from user to file, down_read - * of mmap_sem nests within i_sem; in sys_msync, i_sem nests within - * down_read of mmap_sem; i_sem and down_write of mmap_sem are never - * taken together; in truncation, i_sem is taken outermost. + * of mmap_sem nests within i_mutex; in sys_msync, i_mutex nests within + * down_read of mmap_sem; i_mutex and down_write of mmap_sem are never + * taken together; in truncation, i_mutex is taken outermost. * * mm->mmap_sem * page->flags PG_locked (lock_page) diff --git a/mm/shmem.c b/mm/shmem.c index a1f2f02af72..343b3c0937e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1370,7 +1370,7 @@ shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t if (!access_ok(VERIFY_READ, buf, count)) return -EFAULT; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); pos = *ppos; written = 0; @@ -1455,7 +1455,7 @@ shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t if (written) err = written; out: - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); return err; } @@ -1491,7 +1491,7 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_ /* * We must evaluate after, since reads (unlike writes) - * are called without i_sem protection against truncate + * are called without i_mutex protection against truncate */ nr = PAGE_CACHE_SIZE; i_size = i_size_read(inode); diff --git a/mm/swapfile.c b/mm/swapfile.c index 80f948a2028..6544565a7c0 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1187,9 +1187,9 @@ asmlinkage long sys_swapoff(const char __user * specialfile) set_blocksize(bdev, p->old_block_size); bd_release(bdev); } else { - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); inode->i_flags &= ~S_SWAPFILE; - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); } filp_close(swap_file, NULL); err = 0; @@ -1406,7 +1406,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) p->bdev = bdev; } else if (S_ISREG(inode->i_mode)) { p->bdev = inode->i_sb->s_bdev; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); did_down = 1; if (IS_SWAPFILE(inode)) { error = -EBUSY; @@ -1596,7 +1596,7 @@ out: if (did_down) { if (!error) inode->i_flags |= S_SWAPFILE; - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); } return error; } diff --git a/mm/truncate.c b/mm/truncate.c index b1a463d0fe7..6cb3fff25f6 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -196,7 +196,7 @@ EXPORT_SYMBOL(truncate_inode_pages_range); * @mapping: mapping to truncate * @lstart: offset from which to truncate * - * Called under (and serialised by) inode->i_sem. + * Called under (and serialised by) inode->i_mutex. */ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) { -- cgit v1.2.3 From 870f481793b585323fbda3e87c54efc116f46351 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Jan 2006 20:52:01 -0800 Subject: [PATCH] replace inode_update_time with file_update_time To allow various options to work per-mount instead of per-sb we need a struct vfsmount when updating ctime and mtime. 
This preparation patch replaces the inode_update_time routine with a file_update_atime routine so we can easily get at the vfsmount. (and the file makes more sense in this context anyway). Also get rid of the unused second argument - we always want to update the ctime when calling this routine. Signed-off-by: Christoph Hellwig Cc: Al Viro Cc: Anton Altaparmakov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 +- mm/filemap_xip.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 5fca2737c97..96de772be48 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2108,7 +2108,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, if (err) goto out; - inode_update_time(inode, 1); + file_update_time(file); /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ if (unlikely(file->f_flags & O_DIRECT)) { diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index e2b34e95913..b960ac8e591 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -383,7 +383,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len, if (ret) goto out_backing; - inode_update_time(inode, 1); + file_update_time(filp); ret = __xip_file_write (filp, buf, count, pos, ppos); -- cgit v1.2.3 From e97a31117c0f96be6637f68b4029609bb1f2cc7c Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Wed, 11 Jan 2006 01:50:28 +0100 Subject: add missing printk loglevel in mm/swapfile.c in mm/swapfile.c a printk() is missing a loglevel. I believe the proper loglevel for this situation is KERN_ERR, so that's what the patch below sets -if you agree, please apply. Signed-off-by: Jesper Juhl Signed-off-by: Adrian Bunk --- mm/swapfile.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 6544565a7c0..d8a5afc8b2a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1442,7 +1442,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10)) swap_header_version = 2; else { - printk("Unable to find swap-space signature\n"); + printk(KERN_ERR "Unable to find swap-space signature\n"); error = -EINVAL; goto bad_swap; } -- cgit v1.2.3 From 78539fdfa4c21308e90c596f060df8114483862a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Jan 2006 20:47:41 +1100 Subject: [XFS] Export pagevec_lookup for use on the XFS page writeout path, for dealing with delayed allocate and unwritten extents (as well). Signed-off-by: Christoph Hellwig Signed-off-by: Nathan Scott --- mm/swap.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index ee6d71ccfa5..cbb48e721ab 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -384,6 +384,8 @@ unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, return pagevec_count(pvec); } +EXPORT_SYMBOL(pagevec_lookup); + unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, int tag, unsigned nr_pages) { -- cgit v1.2.3 From a4fc7ab1d065a9dd89ed0e74439ef87d4a16e980 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 11 Jan 2006 14:41:26 +0000 Subject: [PATCH] fix/simplify mutex debugging code Let's switch mutex_debug_check_no_locks_freed() to take (addr, len) as arguments instead, since all its callers were just calculating the 'to' address for themselves anyway... (and sometimes doing so badly). 
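For the mm/ callers this is essentially a switch from computing an end address to passing a length. A sketch of the page_alloc.c call before and after, illustrative of the change made in the hunk below:

	/* before: the caller computes the 'to' address itself */
	mutex_debug_check_no_locks_freed(page_address(page),
					 page_address(page + (1 << order)));

	/* after: the caller passes (addr, len) and the helper does the math */
	mutex_debug_check_no_locks_freed(page_address(page),
					 PAGE_SIZE << order);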
Signed-off-by: David Woodhouse Acked-by: Ingo Molnar Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- mm/slab.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a5e6891f7bb..8e363536e2d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -417,7 +417,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) arch_free_page(page, order); if (!PageHighMem(page)) mutex_debug_check_no_locks_freed(page_address(page), - page_address(page+(1< Date: Wed, 11 Jan 2006 12:17:18 -0800 Subject: [PATCH] Restore KERN_EMERG to each line printed by bad_page Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8e363536e2d..ce991b173aa 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -137,9 +137,9 @@ static inline int bad_range(struct zone *zone, struct page *page) static void bad_page(struct page *page) { printk(KERN_EMERG "Bad page state in process '%s'\n" - "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n" - "Trying to fix it up, but a reboot is needed\n" - "Backtrace:\n", + KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n" + KERN_EMERG "Trying to fix it up, but a reboot is needed\n" + KERN_EMERG "Backtrace:\n", current->comm, page, (int)(2*sizeof(unsigned long)), (unsigned long)page->flags, page->mapping, page_mapcount(page), page_count(page)); -- cgit v1.2.3 From 4eac915d02453e81a32595cd7423492c81337a26 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Wed, 11 Jan 2006 12:17:19 -0800 Subject: [PATCH] mm: gfp_atomic comments Clarify in comments that GFP_ATOMIC means both "don't sleep" and "use emergency pools", hence both ALLOC_HARDER and ALLOC_HIGH. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ce991b173aa..d41a0662d4d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -931,7 +931,8 @@ restart: * * The caller may dip into page reserves a bit more if the caller * cannot run direct reclaim, or if the caller has realtime scheduling - * policy. + * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will + * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). 
*/ alloc_flags = ALLOC_WMARK_MIN; if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) -- cgit v1.2.3 From c59ede7b78db329949d9cdcd7064e22d357560ef Mon Sep 17 00:00:00 2001 From: "Randy.Dunlap" Date: Wed, 11 Jan 2006 12:17:46 -0800 Subject: [PATCH] move capable() to capability.h - Move capable() from sched.h to capability.h; - Use where capable() is used (in include/, block/, ipc/, kernel/, a few drivers/, mm/, security/, & sound/; many more drivers/ to go) Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 1 + mm/mlock.c | 1 + mm/mmap.c | 1 + mm/mremap.c | 1 + mm/swapfile.c | 1 + 5 files changed, 5 insertions(+) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 96de772be48..a965b6b35f2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/mlock.c b/mm/mlock.c index 4ae3a46ff76..b90c59573ab 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -5,6 +5,7 @@ * (C) Copyright 2002 Christoph Hellwig */ +#include #include #include #include diff --git a/mm/mmap.c b/mm/mmap.c index 64ba4dbcb7d..47556d2b3e9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/mremap.c b/mm/mremap.c index ddaeee9a0b6..1903bdf65e4 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/swapfile.c b/mm/swapfile.c index d8a5afc8b2a..957fef43fa6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From f4598c8b3678abd65be3be00ed3d046375a4777e Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 12 Jan 2006 01:05:20 -0800 Subject: [PATCH] migration: make sure there is no attempt to migrate reserved pages. This ensures that reserved pages are not migrated. Reserved pages currently cause the WARN_ON to trigger in migrate_page_add() Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 1850d0aef4a..b62cab575a8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -208,6 +208,8 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, page = vm_normal_page(vma, addr, *pte); if (!page) continue; + if (PageReserved(page)) + continue; nid = page_to_nid(page); if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) continue; @@ -290,7 +292,7 @@ static inline int check_pgd_range(struct vm_area_struct *vma, static inline int vma_migratable(struct vm_area_struct *vma) { if (vma->vm_flags & ( - VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP)) + VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED)) return 0; return 1; } -- cgit v1.2.3 From 2a7e2f7dcb81279e73aefb691ea55ab3540e408a Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 12 Jan 2006 01:05:21 -0800 Subject: [PATCH] do_truncate() call fix in tiny-shmem.c Adapt tiny-shmem.c to the new do_truncate() prototype. 
Signed-off-by: Catalin Marinas Acked-by: Matt Mackall Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/tiny-shmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c index cdc6d431972..f9d6a9cc91c 100644 --- a/mm/tiny-shmem.c +++ b/mm/tiny-shmem.c @@ -90,7 +90,7 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) file->f_mode = FMODE_WRITE | FMODE_READ; /* notify everyone as to the change of file size */ - error = do_truncate(dentry, size, file); + error = do_truncate(dentry, size, 0, file); if (error < 0) goto close_file; -- cgit v1.2.3 From cbe8dd4af2967ee1c2d54ec9d4db35cf3ecc98d3 Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Thu, 12 Jan 2006 01:05:24 -0800 Subject: [PATCH] memmap_init_zone(): remove uneccesary page++ Remove unecessary page++ from memmap_init_zone loop. Signed-off-by: Greg Ungerer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d41a0662d4d..8c960b46959 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1742,7 +1742,7 @@ void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long end_pfn = start_pfn + size; unsigned long pfn; - for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) { + for (pfn = start_pfn; pfn < end_pfn; pfn++) { if (!early_pfn_valid(pfn)) continue; page = pfn_to_page(pfn); -- cgit v1.2.3 From 7339ff8302fd70aabf5f1ae26e0c4905fa74a495 Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Sat, 14 Jan 2006 13:20:48 -0800 Subject: [PATCH] Add tmpfs options for memory placement policies Anything that writes into a tmpfs filesystem is liable to disproportionately decrease the available memory on a particular node. Since there's no telling what sort of application (e.g. dd/cp/cat) might be dropping large files there, this lets the admin choose the appropriate default behavior for their site's situation. Introduce a tmpfs mount option which allows specifying a memory policy and a second option to specify the nodelist for that policy. With the default policy, tmpfs will behave as it does today. This patch adds support for preferred, bind, and interleave policies. The default policy will cause pages to be added to tmpfs files on the node which is doing the writing. Some jobs expect a single process to create and manage the tmpfs files. This results in a node which has a significantly reduced number of free pages. With this patch, the administrator can specify the policy and nodes for that policy where they would prefer allocations. This patch was originally written by Brent Casavant and Hugh Dickins. I added support for the bind and preferred policies and the mpol_nodelist mount option. 
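As a usage illustration of the two new options parsed below (mount points and node numbers are made up):

	# Interleave new tmpfs pages across nodes 0-3:
	mount -t tmpfs -o mpol=interleave,mpol_nodelist=0-3 tmpfs /mnt/scratch

	# Prefer node 1, falling back to other allowed nodes when it runs short:
	mount -t tmpfs -o mpol=preferred,mpol_nodelist=1 tmpfs /mnt/node1

Without mpol= the parser leaves the policy at MPOL_DEFAULT over node_online_map, so existing setups keep today's node-local behaviour.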
Signed-off-by: Brent Casavant Signed-off-by: Hugh Dickins Signed-off-by: Robin Holt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 24 ++++++++++++++++++++++++ mm/shmem.c | 39 ++++++++++++++++++++++++++++++++------- 2 files changed, 56 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b62cab575a8..3171f884d24 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1359,6 +1359,30 @@ restart: return 0; } +void mpol_shared_policy_init(struct shared_policy *info, int policy, + nodemask_t *policy_nodes) +{ + info->root = RB_ROOT; + spin_lock_init(&info->lock); + + if (policy != MPOL_DEFAULT) { + struct mempolicy *newpol; + + /* Falls back to MPOL_DEFAULT on any error */ + newpol = mpol_new(policy, policy_nodes); + if (!IS_ERR(newpol)) { + /* Create pseudo-vma that contains just the policy */ + struct vm_area_struct pvma; + + memset(&pvma, 0, sizeof(struct vm_area_struct)); + /* Policy covers entire file */ + pvma.vm_end = TASK_SIZE; + mpol_set_shared_policy(info, &pvma, newpol); + mpol_free(newpol); + } + } +} + int mpol_set_shared_policy(struct shared_policy *info, struct vm_area_struct *vma, struct mempolicy *npol) { diff --git a/mm/shmem.c b/mm/shmem.c index 343b3c0937e..ce501bce1c2 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1316,7 +1316,8 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) case S_IFREG: inode->i_op = &shmem_inode_operations; inode->i_fop = &shmem_file_operations; - mpol_shared_policy_init(&info->policy); + mpol_shared_policy_init(&info->policy, sbinfo->policy, + &sbinfo->policy_nodes); break; case S_IFDIR: inode->i_nlink++; @@ -1330,7 +1331,8 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) * Must not load anything in the rbtree, * mpol_free_shared_policy will not be called. 
*/ - mpol_shared_policy_init(&info->policy); + mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, + NULL); break; } } else if (sbinfo->max_inodes) { @@ -1843,7 +1845,9 @@ static struct inode_operations shmem_symlink_inode_operations = { .put_link = shmem_put_link, }; -static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid, unsigned long *blocks, unsigned long *inodes) +static int shmem_parse_options(char *options, int *mode, uid_t *uid, + gid_t *gid, unsigned long *blocks, unsigned long *inodes, + int *policy, nodemask_t *policy_nodes) { char *this_char, *value, *rest; @@ -1897,6 +1901,19 @@ static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid, *gid = simple_strtoul(value,&rest,0); if (*rest) goto bad_val; + } else if (!strcmp(this_char,"mpol")) { + if (!strcmp(value,"default")) + *policy = MPOL_DEFAULT; + else if (!strcmp(value,"preferred")) + *policy = MPOL_PREFERRED; + else if (!strcmp(value,"bind")) + *policy = MPOL_BIND; + else if (!strcmp(value,"interleave")) + *policy = MPOL_INTERLEAVE; + else + goto bad_val; + } else if (!strcmp(this_char,"mpol_nodelist")) { + nodelist_parse(value, *policy_nodes); } else { printk(KERN_ERR "tmpfs: Bad mount option %s\n", this_char); @@ -1917,12 +1934,14 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) struct shmem_sb_info *sbinfo = SHMEM_SB(sb); unsigned long max_blocks = sbinfo->max_blocks; unsigned long max_inodes = sbinfo->max_inodes; + int policy = sbinfo->policy; + nodemask_t policy_nodes = sbinfo->policy_nodes; unsigned long blocks; unsigned long inodes; int error = -EINVAL; - if (shmem_parse_options(data, NULL, NULL, NULL, - &max_blocks, &max_inodes)) + if (shmem_parse_options(data, NULL, NULL, NULL, &max_blocks, + &max_inodes, &policy, &policy_nodes)) return error; spin_lock(&sbinfo->stat_lock); @@ -1948,6 +1967,8 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) sbinfo->free_blocks = max_blocks - blocks; sbinfo->max_inodes = max_inodes; sbinfo->free_inodes = max_inodes - inodes; + sbinfo->policy = policy; + sbinfo->policy_nodes = policy_nodes; out: spin_unlock(&sbinfo->stat_lock); return error; @@ -1972,6 +1993,8 @@ static int shmem_fill_super(struct super_block *sb, struct shmem_sb_info *sbinfo; unsigned long blocks = 0; unsigned long inodes = 0; + int policy = MPOL_DEFAULT; + nodemask_t policy_nodes = node_online_map; #ifdef CONFIG_TMPFS /* @@ -1984,8 +2007,8 @@ static int shmem_fill_super(struct super_block *sb, inodes = totalram_pages - totalhigh_pages; if (inodes > blocks) inodes = blocks; - if (shmem_parse_options(data, &mode, &uid, &gid, - &blocks, &inodes)) + if (shmem_parse_options(data, &mode, &uid, &gid, &blocks, + &inodes, &policy, &policy_nodes)) return -EINVAL; } #else @@ -2003,6 +2026,8 @@ static int shmem_fill_super(struct super_block *sb, sbinfo->free_blocks = blocks; sbinfo->max_inodes = inodes; sbinfo->free_inodes = inodes; + sbinfo->policy = policy; + sbinfo->policy_nodes = policy_nodes; sb->s_fs_info = sbinfo; sb->s_maxbytes = SHMEM_MAX_BYTES; -- cgit v1.2.3 From 505970b96e3b7d22177c38e03435a68376628e7a Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sat, 14 Jan 2006 13:21:06 -0800 Subject: [PATCH] cpuset oom lock fix The problem, reported in: http://bugzilla.kernel.org/show_bug.cgi?id=5859 and by various other email messages and lkml posts is that the cpuset hook in the oom (out of memory) code can try to take a cpuset semaphore while holding the tasklist_lock (a spinlock). 
One must not sleep while holding a spinlock. The fix seems easy enough - move the cpuset semaphore region outside the tasklist_lock region. This required a few lines of mechanism to implement. The oom code where the locking needs to be changed does not have access to the cpuset locks, which are internal to kernel/cpuset.c only. So I provided a couple more cpuset interface routines, available to the rest of the kernel, which simple take and drop the lock needed here (cpusets callback_sem). Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 4748b906aff..14bd4ec7959 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -274,6 +274,7 @@ void out_of_memory(gfp_t gfp_mask, int order) show_mem(); } + cpuset_lock(); read_lock(&tasklist_lock); retry: p = select_bad_process(); @@ -284,6 +285,7 @@ retry: /* Found nothing?!?! Either we hang forever, or we panic. */ if (!p) { read_unlock(&tasklist_lock); + cpuset_unlock(); panic("Out of memory and no killable processes...\n"); } @@ -293,6 +295,7 @@ retry: out: read_unlock(&tasklist_lock); + cpuset_unlock(); if (mm) mmput(mm); -- cgit v1.2.3 From c09b42404d29c8a9266f8186632330dc8474bf2e Mon Sep 17 00:00:00 2001 From: Matt Tolentino Date: Tue, 17 Jan 2006 07:03:44 +0100 Subject: [PATCH] x86_64: add __meminit for memory hotplug Add __meminit to the __init lineup to ensure functions default to __init when memory hotplug is not enabled. Replace __devinit with __meminit on functions that were changed when the memory hotplug code was introduced. Signed-off-by: Matt Tolentino Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8c960b46959..c2e29743a8d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1735,7 +1735,7 @@ static void __init calculate_zone_totalpages(struct pglist_data *pgdat, * up by free_all_bootmem() once the early boot process is * done. Non-atomic initialization, single-pass. */ -void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone, +void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long start_pfn) { struct page *page; @@ -1788,7 +1788,7 @@ void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, memmap_init_zone((size), (nid), (zone), (start_pfn)) #endif -static int __devinit zone_batchsize(struct zone *zone) +static int __meminit zone_batchsize(struct zone *zone) { int batch; @@ -1882,7 +1882,7 @@ static struct per_cpu_pageset * Dynamically allocate memory for the * per cpu pageset array in struct zone. 
*/ -static int __devinit process_zones(int cpu) +static int __meminit process_zones(int cpu) { struct zone *zone, *dzone; @@ -1923,7 +1923,7 @@ static inline void free_zone_pagesets(int cpu) } } -static int __devinit pageset_cpuup_callback(struct notifier_block *nfb, +static int __meminit pageset_cpuup_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -1963,7 +1963,7 @@ void __init setup_per_cpu_pageset(void) #endif -static __devinit +static __meminit void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) { int i; @@ -1983,7 +1983,7 @@ void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) init_waitqueue_head(zone->wait_table + i); } -static __devinit void zone_pcp_init(struct zone *zone) +static __meminit void zone_pcp_init(struct zone *zone) { int cpu; unsigned long batch = zone_batchsize(zone); @@ -2001,7 +2001,7 @@ static __devinit void zone_pcp_init(struct zone *zone) zone->name, zone->present_pages, batch); } -static __devinit void init_currently_empty_zone(struct zone *zone, +static __meminit void init_currently_empty_zone(struct zone *zone, unsigned long zone_start_pfn, unsigned long size) { struct pglist_data *pgdat = zone->zone_pgdat; -- cgit v1.2.3 From e236a166b2bc437769a9b8b5d19186a3761bde48 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 18 Jan 2006 17:42:26 -0800 Subject: [PATCH] mm: dirty_exceeded speedup Ravikiran reports that this variable is bouncing all around nodes on NUMA machines, causing measurable performance problems. Fix that up by only writing to it when it actually changed. And put it in a new cacheline to prevent it sharing with other things (this happened). Signed-off-by: Ravikiran Thirumalai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 5240e426c1f..945559fb63d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -46,7 +46,7 @@ static long ratelimit_pages = 32; static long total_pages; /* The total number of pages in the machine. */ -static int dirty_exceeded; /* Dirty mem may be over limit */ +static int dirty_exceeded __cacheline_aligned_in_smp; /* Dirty mem may be over limit */ /* * When balance_dirty_pages decides that the caller needs to perform some @@ -212,7 +212,8 @@ static void balance_dirty_pages(struct address_space *mapping) if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh) break; - dirty_exceeded = 1; + if (!dirty_exceeded) + dirty_exceeded = 1; /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. * Unstable writes are a feature of certain networked @@ -234,7 +235,7 @@ static void balance_dirty_pages(struct address_space *mapping) blk_congestion_wait(WRITE, HZ/10); } - if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh) + if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh && dirty_exceeded) dirty_exceeded = 0; if (writeback_in_progress(bdi)) -- cgit v1.2.3 From 053837fce7aa79025ed57656855df09f80175527 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 18 Jan 2006 17:42:27 -0800 Subject: [PATCH] mm: migration page refcounting fix Migration code currently does not take a reference to target page properly, so between unlocking the pte and trying to take a new reference to the page with isolate_lru_page, anything could happen to it. Fix this by holding the pte lock until we get a chance to elevate the refcount. Other small cleanups while we're here. 
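The reworked helper is easier to read outside diff form; this is the new isolate_lru_page() from the mm/vmscan.c hunk below, condensed, with the key point being that the reference is taken while zone->lru_lock is still held:

	int isolate_lru_page(struct page *page)
	{
		int ret = 0;

		if (PageLRU(page)) {
			struct zone *zone = page_zone(page);

			spin_lock_irq(&zone->lru_lock);
			if (TestClearPageLRU(page)) {
				ret = 1;
				get_page(page);	/* refcount elevated under lru_lock */
				if (PageActive(page))
					del_page_from_active_list(zone, page);
				else
					del_page_from_inactive_list(zone, page);
			}
			spin_unlock_irq(&zone->lru_lock);
		}

		return ret;
	}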
Signed-off-by: Nick Piggin Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 1 + mm/mempolicy.c | 29 ++++++++++++++---------- mm/rmap.c | 2 +- mm/swap.c | 26 +++++++++++++++++++++ mm/vmscan.c | 71 +++++++++++++++++++++++++--------------------------------- 5 files changed, 75 insertions(+), 54 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index a965b6b35f2..44da3d47699 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -94,6 +94,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, * ->private_lock (try_to_unmap_one) * ->tree_lock (try_to_unmap_one) * ->zone.lru_lock (follow_page->mark_page_accessed) + * ->zone.lru_lock (check_pte_range->isolate_lru_page) * ->private_lock (page_remove_rmap->set_page_dirty) * ->tree_lock (page_remove_rmap->set_page_dirty) * ->inode_lock (page_remove_rmap->set_page_dirty) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3171f884d24..551cde40520 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -208,6 +208,17 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, page = vm_normal_page(vma, addr, *pte); if (!page) continue; + /* + * The check for PageReserved here is important to avoid + * handling zero pages and other pages that may have been + * marked special by the system. + * + * If the PageReserved would not be checked here then f.e. + * the location of the zero page could have an influence + * on MPOL_MF_STRICT, zero pages would be counted for + * the per node stats, and there would be useless attempts + * to put zero pages on the migration list. + */ if (PageReserved(page)) continue; nid = page_to_nid(page); @@ -216,11 +227,8 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (flags & MPOL_MF_STATS) gather_stats(page, private); - else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { - spin_unlock(ptl); + else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) migrate_page_add(vma, page, private, flags); - spin_lock(ptl); - } else break; } while (pte++, addr += PAGE_SIZE, addr != end); @@ -309,6 +317,10 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, int err; struct vm_area_struct *first, *vma, *prev; + /* Clear the LRU lists so pages can be isolated */ + if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + lru_add_drain_all(); + first = find_vma(mm, start); if (!first) return ERR_PTR(-EFAULT); @@ -555,15 +567,8 @@ static void migrate_page_add(struct vm_area_struct *vma, if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) || mapping_writably_mapped(page->mapping) || single_mm_mapping(vma->vm_mm, page->mapping)) { - int rc = isolate_lru_page(page); - - if (rc == 1) + if (isolate_lru_page(page)) list_add(&page->lru, pagelist); - /* - * If the isolate attempt was not successful then we just - * encountered an unswappable page. Something must be wrong. 
- */ - WARN_ON(rc == 0); } } diff --git a/mm/rmap.c b/mm/rmap.c index dfbb89f99a1..d85a99d28c0 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -33,7 +33,7 @@ * mapping->i_mmap_lock * anon_vma->lock * mm->page_table_lock or pte_lock - * zone->lru_lock (in mark_page_accessed) + * zone->lru_lock (in mark_page_accessed, isolate_lru_page) * swap_lock (in swap_duplicate, swap_info_get) * mmlist_lock (in mmput, drain_mmlist and others) * mapping->private_lock (in __set_page_dirty_buffers) diff --git a/mm/swap.c b/mm/swap.c index cbb48e721ab..bc2442a7b0e 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -174,6 +174,32 @@ void lru_add_drain(void) put_cpu(); } +#ifdef CONFIG_NUMA +static void lru_add_drain_per_cpu(void *dummy) +{ + lru_add_drain(); +} + +/* + * Returns 0 for success + */ +int lru_add_drain_all(void) +{ + return schedule_on_each_cpu(lru_add_drain_per_cpu, NULL); +} + +#else + +/* + * Returns 0 for success + */ +int lru_add_drain_all(void) +{ + lru_add_drain(); + return 0; +} +#endif + /* * This path almost never happens for VM activity - pages are normally * freed via pagevecs. But it gets used by networking. diff --git a/mm/vmscan.c b/mm/vmscan.c index bf903b2d198..827bf674577 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -586,7 +586,7 @@ static inline void move_to_lru(struct page *page) } /* - * Add isolated pages on the list back to the LRU + * Add isolated pages on the list back to the LRU. * * returns the number of pages put back. */ @@ -760,46 +760,33 @@ next: return nr_failed + retry; } -static void lru_add_drain_per_cpu(void *dummy) -{ - lru_add_drain(); -} - /* * Isolate one page from the LRU lists and put it on the - * indicated list. Do necessary cache draining if the - * page is not on the LRU lists yet. + * indicated list with elevated refcount. * * Result: * 0 = page not on LRU list * 1 = page removed from LRU list and added to the specified list. - * -ENOENT = page is being freed elsewhere. */ int isolate_lru_page(struct page *page) { - int rc = 0; - struct zone *zone = page_zone(page); + int ret = 0; -redo: - spin_lock_irq(&zone->lru_lock); - rc = __isolate_lru_page(page); - if (rc == 1) { - if (PageActive(page)) - del_page_from_active_list(zone, page); - else - del_page_from_inactive_list(zone, page); - } - spin_unlock_irq(&zone->lru_lock); - if (rc == 0) { - /* - * Maybe this page is still waiting for a cpu to drain it - * from one of the lru lists? 
- */ - rc = schedule_on_each_cpu(lru_add_drain_per_cpu, NULL); - if (rc == 0 && PageLRU(page)) - goto redo; + if (PageLRU(page)) { + struct zone *zone = page_zone(page); + spin_lock_irq(&zone->lru_lock); + if (TestClearPageLRU(page)) { + ret = 1; + get_page(page); + if (PageActive(page)) + del_page_from_active_list(zone, page); + else + del_page_from_inactive_list(zone, page); + } + spin_unlock_irq(&zone->lru_lock); } - return rc; + + return ret; } #endif @@ -831,18 +818,20 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src, page = lru_to_page(src); prefetchw_prev_lru_page(page, src, flags); - switch (__isolate_lru_page(page)) { - case 1: - /* Succeeded to isolate page */ - list_move(&page->lru, dst); - nr_taken++; - break; - case -ENOENT: - /* Not possible to isolate */ - list_move(&page->lru, src); - break; - default: + if (!TestClearPageLRU(page)) BUG(); + list_del(&page->lru); + if (get_page_testone(page)) { + /* + * It is being freed elsewhere + */ + __put_page(page); + SetPageLRU(page); + list_add(&page->lru, src); + continue; + } else { + list_add(&page->lru, dst); + nr_taken++; } } -- cgit v1.2.3 From fc3012896337c83a056c496d7cfb0072e1591181 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 18 Jan 2006 17:42:29 -0800 Subject: [PATCH] Simplify migrate_page_add Simplify migrate_page_add after feedback from Hugh. This also allows us to drop one parameter from migrate_page_add. Signed-off-by: Christoph Lameter Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 43 +++++++------------------------------------ 1 file changed, 7 insertions(+), 36 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 551cde40520..a683a66599b 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -185,8 +185,8 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) } static void gather_stats(struct page *, void *); -static void migrate_page_add(struct vm_area_struct *vma, - struct page *page, struct list_head *pagelist, unsigned long flags); +static void migrate_page_add(struct page *page, struct list_head *pagelist, + unsigned long flags); /* Scan through pages checking if pages follow certain conditions. 
*/ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, @@ -228,7 +228,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (flags & MPOL_MF_STATS) gather_stats(page, private); else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) - migrate_page_add(vma, page, private, flags); + migrate_page_add(page, private, flags); else break; } while (pte++, addr += PAGE_SIZE, addr != end); @@ -531,42 +531,13 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, * page migration */ -/* Check if we are the only process mapping the page in question */ -static inline int single_mm_mapping(struct mm_struct *mm, - struct address_space *mapping) -{ - struct vm_area_struct *vma; - struct prio_tree_iter iter; - int rc = 1; - - spin_lock(&mapping->i_mmap_lock); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) - if (mm != vma->vm_mm) { - rc = 0; - goto out; - } - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) - if (mm != vma->vm_mm) { - rc = 0; - goto out; - } -out: - spin_unlock(&mapping->i_mmap_lock); - return rc; -} - -/* - * Add a page to be migrated to the pagelist - */ -static void migrate_page_add(struct vm_area_struct *vma, - struct page *page, struct list_head *pagelist, unsigned long flags) +static void migrate_page_add(struct page *page, struct list_head *pagelist, + unsigned long flags) { /* - * Avoid migrating a page that is shared by others and not writable. + * Avoid migrating a page that is shared with others. */ - if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) || - mapping_writably_mapped(page->mapping) || - single_mm_mapping(vma->vm_mm, page->mapping)) { + if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { if (isolate_lru_page(page)) list_add(&page->lru, pagelist); } -- cgit v1.2.3 From f1fd1067ece574ab56e4a70878b9a5a1ed4c3c42 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 18 Jan 2006 17:42:30 -0800 Subject: [PATCH] Zone reclaim: resurrect may_swap Zone reclaim has a huge impact on NUMA performance (f.e. our maximum throughput with XFS is raised from 4GB to 6GB/sec / page cache contamination of numa nodes destroys locality if one just does a large copy operation which results in performance dropping for good until reboot). This patch: Resurrect may_swap in struct scan_control Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 827bf674577..e5117b6897a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -71,6 +71,9 @@ struct scan_control { int may_writepage; + /* Can pages be swapped as part of reclaim? */ + int may_swap; + /* This context's SWAP_CLUSTER_MAX. If freeing memory for * suspend, we effectively ignore SWAP_CLUSTER_MAX. * In this context, it doesn't matter that we scan the @@ -458,6 +461,8 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) * Try to allocate it some swap space here. 
*/ if (PageAnon(page) && !PageSwapCache(page)) { + if (!sc->may_swap) + goto keep_locked; if (!add_to_swap(page, GFP_ATOMIC)) goto activate_locked; } @@ -1166,6 +1171,7 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) sc.gfp_mask = gfp_mask; sc.may_writepage = 0; + sc.may_swap = 1; inc_page_state(allocstall); @@ -1268,6 +1274,7 @@ loop_again: total_reclaimed = 0; sc.gfp_mask = GFP_KERNEL; sc.may_writepage = 0; + sc.may_swap = 1; sc.nr_mapped = read_page_state(nr_mapped); inc_page_state(pageoutrun); -- cgit v1.2.3 From 9eeff2395e3cfd05c9b2e6074ff943a34b0c5c21 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 18 Jan 2006 17:42:31 -0800 Subject: [PATCH] Zone reclaim: Reclaim logic Some bits for zone reclaim exists in 2.6.15 but they are not usable. This patch fixes them up, removes unused code and makes zone reclaim usable. Zone reclaim allows the reclaiming of pages from a zone if the number of free pages falls below the watermarks even if other zones still have enough pages available. Zone reclaim is of particular importance for NUMA machines. It can be more beneficial to reclaim a page than taking the performance penalties that come with allocating a page on a remote zone. Zone reclaim is enabled if the maximum distance to another node is higher than RECLAIM_DISTANCE, which may be defined by an arch. By default RECLAIM_DISTANCE is 20. 20 is the distance to another node in the same component (enclosure or motherboard) on IA64. The meaning of the NUMA distance information seems to vary by arch. If zone reclaim is not successful then no further reclaim attempts will occur for a certain time period (ZONE_RECLAIM_INTERVAL). This patch was discussed before. See http://marc.theaimsgroup.com/?l=linux-kernel&m=113519961504207&w=2 http://marc.theaimsgroup.com/?l=linux-kernel&m=113408418232531&w=2 http://marc.theaimsgroup.com/?l=linux-kernel&m=113389027420032&w=2 http://marc.theaimsgroup.com/?l=linux-kernel&m=113380938612205&w=2 Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 17 ++++++++++++--- mm/vmscan.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c2e29743a8d..df54e2fc8ee 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -878,7 +878,9 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, mark = (*z)->pages_high; if (!zone_watermark_ok(*z, order, mark, classzone_idx, alloc_flags)) - continue; + if (!zone_reclaim_mode || + !zone_reclaim(*z, gfp_mask, order)) + continue; } page = buffered_rmqueue(zonelist, *z, order, gfp_mask); @@ -1595,13 +1597,22 @@ static void __init build_zonelists(pg_data_t *pgdat) prev_node = local_node; nodes_clear(used_mask); while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { + int distance = node_distance(local_node, node); + + /* + * If another node is sufficiently far away then it is better + * to reclaim pages in a zone before going off node. + */ + if (distance > RECLAIM_DISTANCE) + zone_reclaim_mode = 1; + /* * We don't want to pressure a particular node. * So adding penalty to the first node in same * distance group to make it round-robin. 
*/ - if (node_distance(local_node, node) != - node_distance(local_node, prev_node)) + + if (distance != node_distance(local_node, prev_node)) node_load[node] += load; prev_node = node; load--; diff --git a/mm/vmscan.c b/mm/vmscan.c index e5117b6897a..2e34b61a70c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1572,3 +1572,71 @@ static int __init kswapd_init(void) } module_init(kswapd_init) + +#ifdef CONFIG_NUMA +/* + * Zone reclaim mode + * + * If non-zero call zone_reclaim when the number of free pages falls below + * the watermarks. + * + * In the future we may add flags to the mode. However, the page allocator + * should only have to check that zone_reclaim_mode != 0 before calling + * zone_reclaim(). + */ +int zone_reclaim_mode __read_mostly; + +/* + * Mininum time between zone reclaim scans + */ +#define ZONE_RECLAIM_INTERVAL HZ/2 +/* + * Try to free up some pages from this zone through reclaim. + */ +int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) +{ + int nr_pages = 1 << order; + struct task_struct *p = current; + struct reclaim_state reclaim_state; + struct scan_control sc = { + .gfp_mask = gfp_mask, + .may_writepage = 0, + .may_swap = 0, + .nr_mapped = read_page_state(nr_mapped), + .nr_scanned = 0, + .nr_reclaimed = 0, + .priority = 0 + }; + + if (!(gfp_mask & __GFP_WAIT) || + zone->zone_pgdat->node_id != numa_node_id() || + zone->all_unreclaimable || + atomic_read(&zone->reclaim_in_progress) > 0) + return 0; + + if (time_before(jiffies, + zone->last_unsuccessful_zone_reclaim + ZONE_RECLAIM_INTERVAL)) + return 0; + + disable_swap_token(); + + if (nr_pages > SWAP_CLUSTER_MAX) + sc.swap_cluster_max = nr_pages; + else + sc.swap_cluster_max = SWAP_CLUSTER_MAX; + + cond_resched(); + p->flags |= PF_MEMALLOC; + reclaim_state.reclaimed_slab = 0; + p->reclaim_state = &reclaim_state; + shrink_zone(zone, &sc); + p->reclaim_state = NULL; + current->flags &= ~PF_MEMALLOC; + + if (sc.nr_reclaimed == 0) + zone->last_unsuccessful_zone_reclaim = jiffies; + + return sc.nr_reclaimed > nr_pages; +} +#endif + -- cgit v1.2.3 From fc0abb1451c64c79ac80665d5ba74450ce274e4d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 18 Jan 2006 17:42:33 -0800 Subject: [PATCH] sem2mutex: mm/slab.c Convert mm/swapfile.c's swapon_sem to swapon_mutex. Signed-off-by: Ingo Molnar Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 46 +++++++++++++++++++++++----------------------- mm/swapfile.c | 17 +++++++++-------- 2 files changed, 32 insertions(+), 31 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 9374293a301..bd0317f1e06 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -68,7 +68,7 @@ * Further notes from the original documentation: * * 11 April '97. Started multi-threading - markhe - * The global cache-chain is protected by the semaphore 'cache_chain_sem'. + * The global cache-chain is protected by the mutex 'cache_chain_mutex'. * The sem is only needed when accessing/extending the cache-chain, which * can never happen inside an interrupt (kmem_cache_create(), * kmem_cache_shrink() and kmem_cache_reap()). @@ -103,6 +103,7 @@ #include #include #include +#include #include #include @@ -631,7 +632,7 @@ static kmem_cache_t cache_cache = { }; /* Guard access to the cache-chain. 
*/ -static struct semaphore cache_chain_sem; +static DEFINE_MUTEX(cache_chain_mutex); static struct list_head cache_chain; /* @@ -857,7 +858,7 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, switch (action) { case CPU_UP_PREPARE: - down(&cache_chain_sem); + mutex_lock(&cache_chain_mutex); /* we need to do this right in the beginning since * alloc_arraycache's are going to use this list. * kmalloc_node allows us to add the slab to the right @@ -912,7 +913,7 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, l3->shared = nc; } } - up(&cache_chain_sem); + mutex_unlock(&cache_chain_mutex); break; case CPU_ONLINE: start_cpu_timer(cpu); @@ -921,7 +922,7 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, case CPU_DEAD: /* fall thru */ case CPU_UP_CANCELED: - down(&cache_chain_sem); + mutex_lock(&cache_chain_mutex); list_for_each_entry(cachep, &cache_chain, next) { struct array_cache *nc; @@ -973,13 +974,13 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, spin_unlock_irq(&cachep->spinlock); kfree(nc); } - up(&cache_chain_sem); + mutex_unlock(&cache_chain_mutex); break; #endif } return NOTIFY_OK; bad: - up(&cache_chain_sem); + mutex_unlock(&cache_chain_mutex); return NOTIFY_BAD; } @@ -1047,7 +1048,6 @@ void __init kmem_cache_init(void) */ /* 1) create the cache_cache */ - init_MUTEX(&cache_chain_sem); INIT_LIST_HEAD(&cache_chain); list_add(&cache_cache.next, &cache_chain); cache_cache.colour_off = cache_line_size(); @@ -1168,10 +1168,10 @@ void __init kmem_cache_init(void) /* 6) resize the head arrays to their final sizes */ { kmem_cache_t *cachep; - down(&cache_chain_sem); + mutex_lock(&cache_chain_mutex); list_for_each_entry(cachep, &cache_chain, next) enable_cpucache(cachep); - up(&cache_chain_sem); + mutex_unlock(&cache_chain_mutex); } /* Done! */ @@ -1590,7 +1590,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, BUG(); } - down(&cache_chain_sem); + mutex_lock(&cache_chain_mutex); list_for_each(p, &cache_chain) { kmem_cache_t *pc = list_entry(p, kmem_cache_t, next); @@ -1856,7 +1856,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, if (!cachep && (flags & SLAB_PANIC)) panic("kmem_cache_create(): failed to create slab `%s'\n", name); - up(&cache_chain_sem); + mutex_unlock(&cache_chain_mutex); return cachep; } EXPORT_SYMBOL(kmem_cache_create); @@ -2044,18 +2044,18 @@ int kmem_cache_destroy(kmem_cache_t *cachep) lock_cpu_hotplug(); /* Find the cache in the chain of caches. */ - down(&cache_chain_sem); + mutex_lock(&cache_chain_mutex); /* * the chain is never empty, cache_cache is never destroyed */ list_del(&cachep->next); - up(&cache_chain_sem); + mutex_unlock(&cache_chain_mutex); if (__cache_shrink(cachep)) { slab_error(cachep, "Can't free all objects"); - down(&cache_chain_sem); + mutex_lock(&cache_chain_mutex); list_add(&cachep->next, &cache_chain); - up(&cache_chain_sem); + mutex_unlock(&cache_chain_mutex); unlock_cpu_hotplug(); return 1; } @@ -3314,7 +3314,7 @@ static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac, * - clear the per-cpu caches for this CPU. * - return freeable pages to the main free memory pool. * - * If we cannot acquire the cache chain semaphore then just give up - we'll + * If we cannot acquire the cache chain mutex then just give up - we'll * try again on the next iteration. 
*/ static void cache_reap(void *unused) @@ -3322,7 +3322,7 @@ static void cache_reap(void *unused) struct list_head *walk; struct kmem_list3 *l3; - if (down_trylock(&cache_chain_sem)) { + if (!mutex_trylock(&cache_chain_mutex)) { /* Give up. Setup the next iteration. */ schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); @@ -3393,7 +3393,7 @@ static void cache_reap(void *unused) cond_resched(); } check_irq_on(); - up(&cache_chain_sem); + mutex_unlock(&cache_chain_mutex); drain_remote_pages(); /* Setup the next iteration */ schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); @@ -3429,7 +3429,7 @@ static void *s_start(struct seq_file *m, loff_t *pos) loff_t n = *pos; struct list_head *p; - down(&cache_chain_sem); + mutex_lock(&cache_chain_mutex); if (!n) print_slabinfo_header(m); p = cache_chain.next; @@ -3451,7 +3451,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos) static void s_stop(struct seq_file *m, void *p) { - up(&cache_chain_sem); + mutex_unlock(&cache_chain_mutex); } static int s_show(struct seq_file *m, void *p) @@ -3603,7 +3603,7 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer, return -EINVAL; /* Find the cache in the chain of caches. */ - down(&cache_chain_sem); + mutex_lock(&cache_chain_mutex); res = -EINVAL; list_for_each(p, &cache_chain) { kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next); @@ -3620,7 +3620,7 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer, break; } } - up(&cache_chain_sem); + mutex_unlock(&cache_chain_mutex); if (res >= 0) res = count; return res; diff --git a/mm/swapfile.c b/mm/swapfile.c index 957fef43fa6..f1e69c30d20 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -46,12 +47,12 @@ struct swap_list_t swap_list = {-1, -1}; struct swap_info_struct swap_info[MAX_SWAPFILES]; -static DECLARE_MUTEX(swapon_sem); +static DEFINE_MUTEX(swapon_mutex); /* * We need this because the bdev->unplug_fn can sleep and we cannot * hold swap_lock while calling the unplug_fn. And swap_lock - * cannot be turned into a semaphore. + * cannot be turned into a mutex. 
*/ static DECLARE_RWSEM(swap_unplug_sem); @@ -1161,7 +1162,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile) up_write(&swap_unplug_sem); destroy_swap_extents(p); - down(&swapon_sem); + mutex_lock(&swapon_mutex); spin_lock(&swap_lock); drain_mmlist(); @@ -1180,7 +1181,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile) p->swap_map = NULL; p->flags = 0; spin_unlock(&swap_lock); - up(&swapon_sem); + mutex_unlock(&swapon_mutex); vfree(swap_map); inode = mapping->host; if (S_ISBLK(inode->i_mode)) { @@ -1209,7 +1210,7 @@ static void *swap_start(struct seq_file *swap, loff_t *pos) int i; loff_t l = *pos; - down(&swapon_sem); + mutex_lock(&swapon_mutex); for (i = 0; i < nr_swapfiles; i++, ptr++) { if (!(ptr->flags & SWP_USED) || !ptr->swap_map) @@ -1238,7 +1239,7 @@ static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) static void swap_stop(struct seq_file *swap, void *v) { - up(&swapon_sem); + mutex_unlock(&swapon_mutex); } static int swap_show(struct seq_file *swap, void *v) @@ -1540,7 +1541,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) goto bad_swap; } - down(&swapon_sem); + mutex_lock(&swapon_mutex); spin_lock(&swap_lock); p->flags = SWP_ACTIVE; nr_swap_pages += nr_good_pages; @@ -1566,7 +1567,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) swap_info[prev].next = p - swap_info; } spin_unlock(&swap_lock); - up(&swapon_sem); + mutex_unlock(&swapon_mutex); error = 0; goto out; bad_swap: -- cgit v1.2.3 From dc85da15d42b0efc792b0f5eab774dc5dbc1ceec Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 18 Jan 2006 17:42:36 -0800 Subject: [PATCH] NUMA policies in the slab allocator V2 This patch fixes a regression in 2.6.14 against 2.6.13 that causes an imbalance in memory allocation during bootup. The slab allocator in 2.6.13 is not numa aware and simply calls alloc_pages(). This means that memory policies may control the behavior of alloc_pages(). During bootup the memory policy is set to MPOL_INTERLEAVE resulting in the spreading out of allocations during bootup over all available nodes. The slab allocator in 2.6.13 has only a single list of slab pages. As a result the per cpu slab cache and the spinlock controlled page lists may contain slab entries from off node memory. The slab allocator in 2.6.13 makes no effort to discern the locality of an entry on its lists. The NUMA aware slab allocator in 2.6.14 controls locality of the slab pages explicitly by calling alloc_pages_node(). The NUMA slab allocator manages slab entries by having lists of available slab pages for each node. The per cpu slab cache can only contain slab entries associated with the node local to the processor. This guarantees that the default allocation mode of the slab allocator always assigns local memory if available. Setting MPOL_INTERLEAVE as a default policy during bootup has no effect anymore. In 2.6.14 all node unspecific slab allocations are performed on the boot processor. This means that most of key data structures are allocated on one node. Most processors will have to refer to these structures making the boot node a potential bottleneck. This may reduce performance and cause unnecessary memory pressure on the boot node. This patch implements NUMA policies in the slab layer. There is the need of explicit application of NUMA memory policies by the slab allcator itself since the NUMA slab allocator does no longer let the page_allocator control locality. 
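[Editor's note: before the commit message continues with the implementation details, the following is a minimal, self-contained user-space C sketch of the node-selection idea described above. It is not the kernel code (that is in the diff further down); all names here (mempolicy_model, pick_slab_node, MODEL_*, NR_NODES) are illustrative assumptions used only to show how an interleave policy round-robins requests across nodes before the per-CPU cache is consulted.]

/*
 * Simplified model of policy-driven slab node selection.
 * Illustrative only; the real logic is slab_node() / ____cache_alloc()
 * in the patch below.
 */
#include <stdio.h>

#define NR_NODES 4

enum model_policy { MODEL_DEFAULT, MODEL_INTERLEAVE, MODEL_PREFERRED };

struct mempolicy_model {
	enum model_policy policy;
	int preferred_node;
	int iter;                 /* round-robin cursor for interleave */
};

static int local_node = 0;        /* node local to the "current CPU" */

/* Decide which node a slab allocation should come from. */
static int pick_slab_node(struct mempolicy_model *pol)
{
	switch (pol->policy) {
	case MODEL_INTERLEAVE:
		/* Spread successive requests over all nodes in turn. */
		return pol->iter++ % NR_NODES;
	case MODEL_PREFERRED:
		return pol->preferred_node >= 0 ? pol->preferred_node
						: local_node;
	default:
		return local_node;
	}
}

int main(void)
{
	struct mempolicy_model pol = { MODEL_INTERLEAVE, -1, 0 };

	/*
	 * With interleave in effect only the requests that happen to pick
	 * the local node can be served from the per-CPU cache; the rest
	 * must go to a remote node's lists, as the commit message notes.
	 */
	for (int i = 0; i < 8; i++) {
		int nid = pick_slab_node(&pol);
		printf("request %d -> node %d (%s)\n", i, nid,
		       nid == local_node ? "per-CPU cache possible"
					 : "remote list, lock needed");
	}
	return 0;
}

[The commit message resumes below with where this check is placed.]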
The check for policies is made directly at the beginning of __cache_alloc using current->mempolicy. The memory policy is already frequently checked by the page allocator (alloc_page_vma() and alloc_page_current()). So it is highly likely that the cacheline is present. For MPOL_INTERLEAVE kmalloc() will spread out each request to one node after another so that an equal distribution of allocations can be obtained during bootup. It is not possible to push the policy check to lower layers of the NUMA slab allocator since the per cpu caches are now only containing slab entries from the current node. If the policy says that the local node is not to be preferred or forbidden then there is no point in checking the slab cache or local list of slab pages. The allocation better be directed immediately to the lists containing slab entries for the allowed set of nodes. This way of applying policy also fixes another strange behavior in 2.6.13. alloc_pages() is controlled by the memory allocation policy of the current process. It could therefore be that one process is running with MPOL_INTERLEAVE and would f.e. obtain a new page following that policy since no slab entries are in the lists anymore. A page can typically be used for multiple slab entries but lets say that the current process is only using one. The other entries are then added to the slab lists. These are now non local entries in the slab lists despite of the possible availability of local pages that would provide faster access and increase the performance of the application. Another process without MPOL_INTERLEAVE may now run and expect a local slab entry from kmalloc(). However, there are still these free slab entries from the off node page obtained from the other process via MPOL_INTERLEAVE in the cache. The process will then get an off node slab entry although other slab entries may be available that are local to that process. This means that the policy if one process may contaminate the locality of the slab caches for other processes. This patch in effect insures that a per process policy is followed for the allocation of slab entries and that there cannot be a memory policy influence from one process to another. A process with default policy will always get a local slab entry if one is available. And the process using memory policies will get its memory arranged as requested. Off-node slab allocation will require the use of spinlocks and will make the use of per cpu caches not possible. A process using memory policies to redirect allocations offnode will have to cope with additional lock overhead in addition to the latency added by the need to access a remote slab entry. Changes V1->V2 - Remove #ifdef CONFIG_NUMA by moving forward declaration into prior #ifdef CONFIG_NUMA section. - Give the function determining the node number to use a saner name. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 30 ++++++++++++++++++++++++++++++ mm/slab.c | 12 ++++++++++++ 2 files changed, 42 insertions(+) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a683a66599b..71430d44082 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -976,6 +976,36 @@ static unsigned interleave_nodes(struct mempolicy *policy) return nid; } +/* + * Depending on the memory policy provide a node from which to allocate the + * next slab entry. 
+ */ +unsigned slab_node(struct mempolicy *policy) +{ + if (in_interrupt()) + return numa_node_id(); + + switch (policy->policy) { + case MPOL_INTERLEAVE: + return interleave_nodes(policy); + + case MPOL_BIND: + /* + * Follow bind policy behavior and start allocation at the + * first node. + */ + return policy->v.zonelist->zones[0]->zone_pgdat->node_id; + + case MPOL_PREFERRED: + if (policy->v.preferred_node >= 0) + return policy->v.preferred_node; + /* Fall through */ + + default: + return numa_node_id(); + } +} + /* Do static interleaving for a VMA with known offset. */ static unsigned offset_il_node(struct mempolicy *pol, struct vm_area_struct *vma, unsigned long off) diff --git a/mm/slab.c b/mm/slab.c index bd0317f1e06..9025608696e 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -103,6 +103,7 @@ #include #include #include +#include #include #include @@ -773,6 +774,8 @@ static struct array_cache *alloc_arraycache(int node, int entries, } #ifdef CONFIG_NUMA +static void *__cache_alloc_node(kmem_cache_t *, gfp_t, int); + static inline struct array_cache **alloc_alien_cache(int node, int limit) { struct array_cache **ac_ptr; @@ -2570,6 +2573,15 @@ static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags) void *objp; struct array_cache *ac; +#ifdef CONFIG_NUMA + if (current->mempolicy) { + int nid = slab_node(current->mempolicy); + + if (nid != numa_node_id()) + return __cache_alloc_node(cachep, flags, nid); + } +#endif + check_irq_off(); ac = ac_data(cachep); if (likely(ac->avail)) { -- cgit v1.2.3 From 86c562a9d6683063e071692fe14e0a18e64ee1be Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 18 Jan 2006 17:42:37 -0800 Subject: [PATCH] mm: optimize numa policy handling in slab allocator Move the interrupt check from slab_node into ___cache_alloc and adds an "unlikely()" to avoid pipeline stalls on some architectures. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 3 --- mm/slab.c | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 71430d44082..73790188b0e 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -982,9 +982,6 @@ static unsigned interleave_nodes(struct mempolicy *policy) */ unsigned slab_node(struct mempolicy *policy) { - if (in_interrupt()) - return numa_node_id(); - switch (policy->policy) { case MPOL_INTERLEAVE: return interleave_nodes(policy); diff --git a/mm/slab.c b/mm/slab.c index 9025608696e..6f8495e2185 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2574,7 +2574,7 @@ static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags) struct array_cache *ac; #ifdef CONFIG_NUMA - if (current->mempolicy) { + if (unlikely(current->mempolicy && !in_interrupt())) { int nid = slab_node(current->mempolicy); if (nid != numa_node_id()) -- cgit v1.2.3
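[Editor's note: the final patch is purely a fast-path optimisation — the policy check is folded into a single unlikely() branch so the common case (no mempolicy, or running in interrupt context) falls straight through to the per-CPU cache. The stand-alone C sketch below illustrates that pattern only; fake_cache_alloc(), have_mempolicy and in_irq_context are illustrative names, not taken from the patch, and the likely/unlikely macros are the usual __builtin_expect wrappers.]

/*
 * Illustration of the fast-path pattern in the last patch: hint the
 * compiler that the policy/interrupt check almost never fires, so the
 * hot path reaches the per-CPU cache with no extra work.
 */
#include <stdbool.h>
#include <stdio.h>

#define likely(x)   __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)

static bool have_mempolicy;   /* stands in for current->mempolicy */
static bool in_irq_context;   /* stands in for in_interrupt()     */

static void *alloc_from_percpu_cache(void)
{
	static char obj[64];
	return obj;           /* hot path: no locks, no policy lookup */
}

static void *alloc_from_policy_node(void)
{
	static char obj[64];
	return obj;           /* slow path: pick a node, take locks */
}

static void *fake_cache_alloc(void)
{
	/* Rare case first, marked unlikely so the hot path stays linear. */
	if (unlikely(have_mempolicy && !in_irq_context))
		return alloc_from_policy_node();

	return alloc_from_percpu_cache();
}

int main(void)
{
	printf("default alloc: %p\n", fake_cache_alloc());
	have_mempolicy = true;
	printf("policy alloc:  %p\n", fake_cache_alloc());
	return 0;
}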