aboutsummaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig4
-rw-r--r--mm/Makefile4
-rw-r--r--mm/allocpercpu.c9
-rw-r--r--mm/filemap.c399
-rw-r--r--mm/filemap_xip.c41
-rw-r--r--mm/fremap.c179
-rw-r--r--mm/highmem.c7
-rw-r--r--mm/hugetlb.c71
-rw-r--r--mm/memory.c329
-rw-r--r--mm/mempolicy.c10
-rw-r--r--mm/mempool.c3
-rw-r--r--mm/migrate.c3
-rw-r--r--mm/mmap.c69
-rw-r--r--mm/mprotect.c2
-rw-r--r--mm/mremap.c2
-rw-r--r--mm/nommu.c5
-rw-r--r--mm/page-writeback.c23
-rw-r--r--mm/page_alloc.c301
-rw-r--r--mm/pdflush.c1
-rw-r--r--mm/readahead.c516
-rw-r--r--mm/rmap.c4
-rw-r--r--mm/shmem.c92
-rw-r--r--mm/slab.c92
-rw-r--r--mm/slob.c56
-rw-r--r--mm/slub.c670
-rw-r--r--mm/swap_state.c3
-rw-r--r--mm/truncate.c17
-rw-r--r--mm/util.c74
-rw-r--r--mm/vmalloc.c78
-rw-r--r--mm/vmscan.c212
-rw-r--r--mm/vmstat.c2
31 files changed, 1742 insertions, 1536 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 086af703da4..86187221e78 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -163,6 +163,10 @@ config ZONE_DMA_FLAG
default "0" if !ZONE_DMA
default "1"
+config BOUNCE
+ def_bool y
+ depends on BLOCK && MMU && (ZONE_DMA || HIGHMEM)
+
config NR_QUICK
int
depends on QUICKLIST
diff --git a/mm/Makefile b/mm/Makefile
index a9148ea329a..245e33ab00c 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -13,9 +13,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
$(mmu-y)
-ifeq ($(CONFIG_MMU)$(CONFIG_BLOCK),yy)
-obj-y += bounce.o
-endif
+obj-$(CONFIG_BOUNCE) += bounce.o
obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
obj-$(CONFIG_NUMA) += mempolicy.o
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index b2486cf887a..00b02623f00 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -53,12 +53,9 @@ void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
int node = cpu_to_node(cpu);
BUG_ON(pdata->ptrs[cpu]);
- if (node_online(node)) {
- /* FIXME: kzalloc_node(size, gfp, node) */
- pdata->ptrs[cpu] = kmalloc_node(size, gfp, node);
- if (pdata->ptrs[cpu])
- memset(pdata->ptrs[cpu], 0, size);
- } else
+ if (node_online(node))
+ pdata->ptrs[cpu] = kmalloc_node(size, gfp|__GFP_ZERO, node);
+ else
pdata->ptrs[cpu] = kzalloc(size, gfp);
return pdata->ptrs[cpu];
}
diff --git a/mm/filemap.c b/mm/filemap.c
index 100b99c2d50..49a6fe375d0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -867,13 +867,11 @@ void do_generic_mapping_read(struct address_space *mapping,
{
struct inode *inode = mapping->host;
unsigned long index;
- unsigned long end_index;
unsigned long offset;
unsigned long last_index;
unsigned long next_index;
unsigned long prev_index;
unsigned int prev_offset;
- loff_t isize;
struct page *cached_page;
int error;
struct file_ra_state ra = *_ra;
@@ -886,42 +884,58 @@ void do_generic_mapping_read(struct address_space *mapping,
last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
offset = *ppos & ~PAGE_CACHE_MASK;
- isize = i_size_read(inode);
- if (!isize)
- goto out;
-
- end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
for (;;) {
struct page *page;
+ unsigned long end_index;
+ loff_t isize;
unsigned long nr, ret;
+ cond_resched();
+find_page:
+ page = find_get_page(mapping, index);
+ if (!page) {
+ page_cache_sync_readahead(mapping,
+ &ra, filp,
+ index, last_index - index);
+ page = find_get_page(mapping, index);
+ if (unlikely(page == NULL))
+ goto no_cached_page;
+ }
+ if (PageReadahead(page)) {
+ page_cache_async_readahead(mapping,
+ &ra, filp, page,
+ index, last_index - index);
+ }
+ if (!PageUptodate(page))
+ goto page_not_up_to_date;
+page_ok:
+ /*
+ * i_size must be checked after we know the page is Uptodate.
+ *
+ * Checking i_size after the check allows us to calculate
+ * the correct value for "nr", which means the zero-filled
+ * part of the page is not copied back to userspace (unless
+ * another truncate extends the file - this is desired though).
+ */
+
+ isize = i_size_read(inode);
+ end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
+ if (unlikely(!isize || index > end_index)) {
+ page_cache_release(page);
+ goto out;
+ }
+
/* nr is the maximum number of bytes to copy from this page */
nr = PAGE_CACHE_SIZE;
- if (index >= end_index) {
- if (index > end_index)
- goto out;
+ if (index == end_index) {
nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
if (nr <= offset) {
+ page_cache_release(page);
goto out;
}
}
nr = nr - offset;
- cond_resched();
- if (index == next_index)
- next_index = page_cache_readahead(mapping, &ra, filp,
- index, last_index - index);
-
-find_page:
- page = find_get_page(mapping, index);
- if (unlikely(page == NULL)) {
- handle_ra_miss(mapping, &ra, index);
- goto no_cached_page;
- }
- if (!PageUptodate(page))
- goto page_not_up_to_date;
-page_ok:
-
/* If users can be writing to this page using arbitrary
* virtual addresses, take care about potential aliasing
* before reading the page on the kernel side.
@@ -1007,31 +1021,6 @@ readpage:
unlock_page(page);
}
- /*
- * i_size must be checked after we have done ->readpage.
- *
- * Checking i_size after the readpage allows us to calculate
- * the correct value for "nr", which means the zero-filled
- * part of the page is not copied back to userspace (unless
- * another truncate extends the file - this is desired though).
- */
- isize = i_size_read(inode);
- end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
- if (unlikely(!isize || index > end_index)) {
- page_cache_release(page);
- goto out;
- }
-
- /* nr is the maximum number of bytes to copy from this page */
- nr = PAGE_CACHE_SIZE;
- if (index == end_index) {
- nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
- if (nr <= offset) {
- page_cache_release(page);
- goto out;
- }
- }
- nr = nr - offset;
goto page_ok;
readpage_error:
@@ -1067,6 +1056,7 @@ no_cached_page:
out:
*_ra = ra;
+ _ra->prev_index = prev_index;
*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
if (cached_page)
@@ -1317,62 +1307,62 @@ static int fastcall page_cache_read(struct file * file, unsigned long offset)
#define MMAP_LOTSAMISS (100)
/**
- * filemap_nopage - read in file data for page fault handling
- * @area: the applicable vm_area
- * @address: target address to read in
- * @type: returned with VM_FAULT_{MINOR,MAJOR} if not %NULL
+ * filemap_fault - read in file data for page fault handling
+ * @vma: vma in which the fault was taken
+ * @vmf: struct vm_fault containing details of the fault
*
- * filemap_nopage() is invoked via the vma operations vector for a
+ * filemap_fault() is invoked via the vma operations vector for a
* mapped memory region to read in file data during a page fault.
*
* The goto's are kind of ugly, but this streamlines the normal case of having
* it in the page cache, and handles the special cases reasonably without
* having a lot of duplicated code.
*/
-struct page *filemap_nopage(struct vm_area_struct *area,
- unsigned long address, int *type)
+int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
int error;
- struct file *file = area->vm_file;
+ struct file *file = vma->vm_file;
struct address_space *mapping = file->f_mapping;
struct file_ra_state *ra = &file->f_ra;
struct inode *inode = mapping->host;
struct page *page;
- unsigned long size, pgoff;
- int did_readaround = 0, majmin = VM_FAULT_MINOR;
-
- pgoff = ((address-area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
+ unsigned long size;
+ int did_readaround = 0;
+ int ret = 0;
-retry_all:
size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- if (pgoff >= size)
+ if (vmf->pgoff >= size)
goto outside_data_content;
/* If we don't want any read-ahead, don't bother */
- if (VM_RandomReadHint(area))
+ if (VM_RandomReadHint(vma))
goto no_cached_page;
/*
- * The readahead code wants to be told about each and every page
- * so it can build and shrink its windows appropriately
- *
- * For sequential accesses, we use the generic readahead logic.
- */
- if (VM_SequentialReadHint(area))
- page_cache_readahead(mapping, ra, file, pgoff, 1);
-
- /*
* Do we have something in the page cache already?
*/
retry_find:
- page = find_get_page(mapping, pgoff);
+ page = find_lock_page(mapping, vmf->pgoff);
+ /*
+ * For sequential accesses, we use the generic readahead logic.
+ */
+ if (VM_SequentialReadHint(vma)) {
+ if (!page) {
+ page_cache_sync_readahead(mapping, ra, file,
+ vmf->pgoff, 1);
+ page = find_lock_page(mapping, vmf->pgoff);
+ if (!page)
+ goto no_cached_page;
+ }
+ if (PageReadahead(page)) {
+ page_cache_async_readahead(mapping, ra, file, page,
+ vmf->pgoff, 1);
+ }
+ }
+
if (!page) {
unsigned long ra_pages;
- if (VM_SequentialReadHint(area)) {
- handle_ra_miss(mapping, ra, pgoff);
- goto no_cached_page;
- }
ra->mmap_miss++;
/*
@@ -1387,7 +1377,7 @@ retry_find:
* check did_readaround, as this is an inner loop.
*/
if (!did_readaround) {
- majmin = VM_FAULT_MAJOR;
+ ret = VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT);
}
did_readaround = 1;
@@ -1395,11 +1385,11 @@ retry_find:
if (ra_pages) {
pgoff_t start = 0;
- if (pgoff > ra_pages / 2)
- start = pgoff - ra_pages / 2;
+ if (vmf->pgoff > ra_pages / 2)
+ start = vmf->pgoff - ra_pages / 2;
do_page_cache_readahead(mapping, file, start, ra_pages);
}
- page = find_get_page(mapping, pgoff);
+ page = find_lock_page(mapping, vmf->pgoff);
if (!page)
goto no_cached_page;
}
@@ -1408,35 +1398,42 @@ retry_find:
ra->mmap_hit++;
/*
- * Ok, found a page in the page cache, now we need to check
- * that it's up-to-date.
+ * We have a locked page in the page cache, now we need to check
+ * that it's up-to-date. If not, it is going to be due to an error.
*/
- if (!PageUptodate(page))
+ if (unlikely(!PageUptodate(page)))
goto page_not_uptodate;
-success:
+ /*
+ * Must recheck i_size under page lock: it may have changed since
+ * the unlocked check at the top of this function.
+ */
+ size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ if (unlikely(vmf->pgoff >= size)) {
+ /*
+ * Drop the reference taken by find_lock_page() as well as
+ * the lock: the outside_data_content path does not release
+ * this page, so it would otherwise be leaked.
+ */
+ unlock_page(page);
+ page_cache_release(page);
+ goto outside_data_content;
+ }
+
/*
* Found the page and have a reference on it.
*/
mark_page_accessed(page);
- if (type)
- *type = majmin;
- return page;
+ ra->prev_index = page->index;
+ vmf->page = page;
+ return ret | VM_FAULT_LOCKED;
outside_data_content:
/*
* An external ptracer can access pages that normally aren't
* accessible..
*/
- if (area->vm_mm == current->mm)
- return NOPAGE_SIGBUS;
+ if (vma->vm_mm == current->mm)
+ return VM_FAULT_SIGBUS;
+
/* Fall through to the non-read-ahead case */
no_cached_page:
/*
* We're only likely to ever get here if MADV_RANDOM is in
* effect.
*/
- error = page_cache_read(file, pgoff);
+ error = page_cache_read(file, vmf->pgoff);
/*
* The page we want has now been added to the page cache.
@@ -1452,12 +1449,13 @@ no_cached_page:
* to schedule I/O.
*/
if (error == -ENOMEM)
- return NOPAGE_OOM;
- return NOPAGE_SIGBUS;
+ return VM_FAULT_OOM;
+ return VM_FAULT_SIGBUS;
page_not_uptodate:
+ /* IO error path */
if (!did_readaround) {
- majmin = VM_FAULT_MAJOR;
+ ret = VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT);
}
@@ -1467,217 +1465,21 @@ page_not_uptodate:
* because there really aren't any performance issues here
* and we need to check for errors.
*/
- lock_page(page);
-
- /* Somebody truncated the page on us? */
- if (!page->mapping) {
- unlock_page(page);
- page_cache_release(page);
- goto retry_all;
- }
-
- /* Somebody else successfully read it in? */
- if (PageUptodate(page)) {
- unlock_page(page);
- goto success;
- }
ClearPageError(page);
error = mapping->a_ops->readpage(file, page);
- if (!error) {
- wait_on_page_locked(page);
- if (PageUptodate(page))
- goto success;
- } else if (error == AOP_TRUNCATED_PAGE) {
- page_cache_release(page);
- goto retry_find;
- }
-
- /*
- * Things didn't work out. Return zero to tell the
- * mm layer so, possibly freeing the page cache page first.
- */
- shrink_readahead_size_eio(file, ra);
page_cache_release(page);
- return NOPAGE_SIGBUS;
-}
-EXPORT_SYMBOL(filemap_nopage);
-static struct page * filemap_getpage(struct file *file, unsigned long pgoff,
- int nonblock)
-{
- struct address_space *mapping = file->f_mapping;
- struct page *page;
- int error;
-
- /*
- * Do we have something in the page cache already?
- */
-retry_find:
- page = find_get_page(mapping, pgoff);
- if (!page) {
- if (nonblock)
- return NULL;
- goto no_cached_page;
- }
-
- /*
- * Ok, found a page in the page cache, now we need to check
- * that it's up-to-date.
- */
- if (!PageUptodate(page)) {
- if (nonblock) {
- page_cache_release(page);
- return NULL;
- }
- goto page_not_uptodate;
- }
-
-success:
- /*
- * Found the page and have a reference on it.
- */
- mark_page_accessed(page);
- return page;
-
-no_cached_page:
- error = page_cache_read(file, pgoff);
-
- /*
- * The page we want has now been added to the page cache.
- * In the unlikely event that someone removed it in the
- * meantime, we'll just come back here and read it again.
- */
- if (error >= 0)
- goto retry_find;
-
- /*
- * An error return from page_cache_read can result if the
- * system is low on memory, or a problem occurs while trying
- * to schedule I/O.
- */
- return NULL;
-
-page_not_uptodate:
- lock_page(page);
-
- /* Did it get truncated while we waited for it? */
- if (!page->mapping) {
- unlock_page(page);
- goto err;
- }
-
- /* Did somebody else get it up-to-date? */
- if (PageUptodate(page)) {
- unlock_page(page);
- goto success;
- }
-
- error = mapping->a_ops->readpage(file, page);
- if (!error) {
- wait_on_page_locked(page);
- if (PageUptodate(page))
- goto success;
- } else if (error == AOP_TRUNCATED_PAGE) {
- page_cache_release(page);
- goto retry_find;
- }
-
- /*
- * Umm, take care of errors if the page isn't up-to-date.
- * Try to re-read it _once_. We do this synchronously,
- * because there really aren't any performance issues here
- * and we need to check for errors.
- */
- lock_page(page);
-
- /* Somebody truncated the page on us? */
- if (!page->mapping) {
- unlock_page(page);
- goto err;
- }
- /* Somebody else successfully read it in? */
- if (PageUptodate(page)) {
- unlock_page(page);
- goto success;
- }
-
- ClearPageError(page);
- error = mapping->a_ops->readpage(file, page);
- if (!error) {
- wait_on_page_locked(page);
- if (PageUptodate(page))
- goto success;
- } else if (error == AOP_TRUNCATED_PAGE) {
- page_cache_release(page);
+ if (!error || error == AOP_TRUNCATED_PAGE)
goto retry_find;
- }
- /*
- * Things didn't work out. Return zero to tell the
- * mm layer so, possibly freeing the page cache page first.
- */
-err:
- page_cache_release(page);
-
- return NULL;
-}
-
-int filemap_populate(struct vm_area_struct *vma, unsigned long addr,
- unsigned long len, pgprot_t prot, unsigned long pgoff,
- int nonblock)
-{
- struct file *file = vma->vm_file;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
- unsigned long size;
- struct mm_struct *mm = vma->vm_mm;
- struct page *page;
- int err;
-
- if (!nonblock)
- force_page_cache_readahead(mapping, vma->vm_file,
- pgoff, len >> PAGE_CACHE_SHIFT);
-
-repeat:
- size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- if (pgoff + (len >> PAGE_CACHE_SHIFT) > size)
- return -EINVAL;
-
- page = filemap_getpage(file, pgoff, nonblock);
-
- /* XXX: This is wrong, a filesystem I/O error may have happened. Fix that as
- * done in shmem_populate calling shmem_getpage */
- if (!page && !nonblock)
- return -ENOMEM;
-
- if (page) {
- err = install_page(mm, vma, addr, page, prot);
- if (err) {
- page_cache_release(page);
- return err;
- }
- } else if (vma->vm_flags & VM_NONLINEAR) {
- /* No page was found just because we can't read it in now (being
- * here implies nonblock != 0), but the page may exist, so set
- * the PTE to fault it in later. */
- err = install_file_pte(mm, vma, addr, pgoff, prot);
- if (err)
- return err;
- }
-
- len -= PAGE_SIZE;
- addr += PAGE_SIZE;
- pgoff++;
- if (len)
- goto repeat;
-
- return 0;
+ /* Things didn't work out. Return VM_FAULT_SIGBUS to tell the mm layer so. */
+ shrink_readahead_size_eio(file, ra);
+ return VM_FAULT_SIGBUS;
}
-EXPORT_SYMBOL(filemap_populate);
+EXPORT_SYMBOL(filemap_fault);
struct vm_operations_struct generic_file_vm_ops = {
- .nopage = filemap_nopage,
- .populate = filemap_populate,
+ .fault = filemap_fault,
};
/* This is used for a general mmap of a disk file */
@@ -1690,6 +1492,7 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
return -ENOEXEC;
file_accessed(file);
vma->vm_ops = &generic_file_vm_ops;
+ vma->vm_flags |= VM_CAN_NONLINEAR;
return 0;
}
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 65ffc321f0c..53ee6a29963 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -205,62 +205,58 @@ __xip_unmap (struct address_space * mapping,
}
/*
- * xip_nopage() is invoked via the vma operations vector for a
+ * xip_fault() is invoked via the vma operations vector for a
* mapped memory region to read in file data during a page fault.
*
- * This function is derived from filemap_nopage, but used for execute in place
+ * This function is derived from filemap_fault, but used for execute in place
*/
-static struct page *
-xip_file_nopage(struct vm_area_struct * area,
- unsigned long address,
- int *type)
+static int xip_file_fault(struct vm_area_struct *area, struct vm_fault *vmf)
{
struct file *file = area->vm_file;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
struct page *page;
- unsigned long size, pgoff, endoff;
+ pgoff_t size;
- pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT)
- + area->vm_pgoff;
- endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT)
- + area->vm_pgoff;
+ /* Faulting at or beyond the last page of the file is a SIGBUS */
size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- if (pgoff >= size)
- return NOPAGE_SIGBUS;
+ if (vmf->pgoff >= size)
+ return VM_FAULT_SIGBUS;
- page = mapping->a_ops->get_xip_page(mapping, pgoff*(PAGE_SIZE/512), 0);
+ page = mapping->a_ops->get_xip_page(mapping,
+ vmf->pgoff*(PAGE_SIZE/512), 0);
if (!IS_ERR(page))
goto out;
+ /*
+ * -ENODATA means a sparse block and is handled below. Any other
+ * error is a failed lookup: only report OOM when the error really
+ * is -ENOMEM, everything else stays a SIGBUS as it was before the
+ * conversion from ->nopage.
+ */
if (PTR_ERR(page) != -ENODATA)
- return NOPAGE_SIGBUS;
+ return PTR_ERR(page) == -ENOMEM ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
/* sparse block */
if ((area->vm_flags & (VM_WRITE | VM_MAYWRITE)) &&
(area->vm_flags & (VM_SHARED| VM_MAYSHARE)) &&
(!(mapping->host->i_sb->s_flags & MS_RDONLY))) {
/* maybe shared writable, allocate new block */
- page = mapping->a_ops->get_xip_page (mapping,
- pgoff*(PAGE_SIZE/512), 1);
+ page = mapping->a_ops->get_xip_page(mapping,
+ vmf->pgoff*(PAGE_SIZE/512), 1);
if (IS_ERR(page))
- return NOPAGE_SIGBUS;
+ return VM_FAULT_SIGBUS;
/* unmap page at pgoff from all other vmas */
- __xip_unmap(mapping, pgoff);
+ __xip_unmap(mapping, vmf->pgoff);
} else {
/* not shared and writable, use xip_sparse_page() */
page = xip_sparse_page();
if (!page)
- return NOPAGE_OOM;
+ return VM_FAULT_OOM;
}
out:
+ /* Hand the page back through vmf with an extra reference held */
page_cache_get(page);
- return page;
+ vmf->page = page;
+ return 0;
}
static struct vm_operations_struct xip_file_vm_ops = {
- .nopage = xip_file_nopage,
+ .fault = xip_file_fault,
};
int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
@@ -269,6 +265,7 @@ int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
file_accessed(file);
vma->vm_ops = &xip_file_vm_ops;
+ vma->vm_flags |= VM_CAN_NONLINEAR;
return 0;
}
EXPORT_SYMBOL_GPL(xip_file_mmap);
diff --git a/mm/fremap.c b/mm/fremap.c
index 4e3f53dd5fd..c395b1abf08 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -20,13 +20,14 @@
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
-static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
+static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
pte_t pte = *ptep;
- struct page *page = NULL;
if (pte_present(pte)) {
+ struct page *page;
+
flush_cache_page(vma, addr, pte_pfn(pte));
pte = ptep_clear_flush(vma, addr, ptep);
page = vm_normal_page(vma, addr, pte);
@@ -35,68 +36,21 @@ static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
set_page_dirty(page);
page_remove_rmap(page, vma);
page_cache_release(page);
+ update_hiwater_rss(mm);
+ dec_mm_counter(mm, file_rss);
}
} else {
if (!pte_file(pte))
free_swap_and_cache(pte_to_swp_entry(pte));
pte_clear_not_present_full(mm, addr, ptep, 0);
}
- return !!page;
}
/*
- * Install a file page to a given virtual memory address, release any
- * previously existing mapping.
- */
-int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, struct page *page, pgprot_t prot)
-{
- struct inode *inode;
- pgoff_t size;
- int err = -ENOMEM;
- pte_t *pte;
- pte_t pte_val;
- spinlock_t *ptl;
-
- pte = get_locked_pte(mm, addr, &ptl);
- if (!pte)
- goto out;
-
- /*
- * This page may have been truncated. Tell the
- * caller about it.
- */
- err = -EINVAL;
- inode = vma->vm_file->f_mapping->host;
- size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- if (!page->mapping || page->index >= size)
- goto unlock;
- err = -ENOMEM;
- if (page_mapcount(page) > INT_MAX/2)
- goto unlock;
-
- if (pte_none(*pte) || !zap_pte(mm, vma, addr, pte))
- inc_mm_counter(mm, file_rss);
-
- flush_icache_page(vma, page);
- pte_val = mk_pte(page, prot);
- set_pte_at(mm, addr, pte, pte_val);
- page_add_file_rmap(page);
- update_mmu_cache(vma, addr, pte_val);
- lazy_mmu_prot_update(pte_val);
- err = 0;
-unlock:
- pte_unmap_unlock(pte, ptl);
-out:
- return err;
-}
-EXPORT_SYMBOL(install_page);
-
-/*
* Install a file pte to a given virtual memory address, release any
* previously existing mapping.
*/
-int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
+static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, unsigned long pgoff, pgprot_t prot)
{
int err = -ENOMEM;
@@ -107,10 +61,8 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
if (!pte)
goto out;
- if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) {
- update_hiwater_rss(mm);
- dec_mm_counter(mm, file_rss);
- }
+ if (!pte_none(*pte))
+ zap_pte(mm, vma, addr, pte);
set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
/*
@@ -126,6 +78,25 @@ out:
return err;
}
+/*
+ * Install a file pte for each page of the range starting at addr,
+ * mapping consecutive file offsets starting at pgoff. Returns the
+ * first install_file_pte() error, or 0 once size has been consumed.
+ */
+static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, unsigned long size, pgoff_t pgoff)
+{
+ for (;;) {
+ int err = install_file_pte(mm, vma, addr, pgoff,
+ vma->vm_page_prot);
+
+ if (err)
+ return err;
+
+ /* At least one page is always installed before size is tested */
+ size -= PAGE_SIZE;
+ if (!size)
+ return 0;
+
+ addr += PAGE_SIZE;
+ pgoff++;
+ }
+}
+
/***
* sys_remap_file_pages - remap arbitrary pages of a shared backing store
* file within an existing vma.
@@ -183,41 +154,77 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
* the single existing vma. vm_private_data is used as a
* swapout cursor in a VM_NONLINEAR vma.
*/
- if (vma && (vma->vm_flags & VM_SHARED) &&
- (!vma->vm_private_data || (vma->vm_flags & VM_NONLINEAR)) &&
- vma->vm_ops && vma->vm_ops->populate &&
- end > start && start >= vma->vm_start &&
- end <= vma->vm_end) {
-
- /* Must set VM_NONLINEAR before any pages are populated. */
- if (pgoff != linear_page_index(vma, start) &&
- !(vma->vm_flags & VM_NONLINEAR)) {
- if (!has_write_lock) {
- up_read(&mm->mmap_sem);
- down_write(&mm->mmap_sem);
- has_write_lock = 1;
- goto retry;
- }
- mapping = vma->vm_file->f_mapping;
- spin_lock(&mapping->i_mmap_lock);
- flush_dcache_mmap_lock(mapping);
- vma->vm_flags |= VM_NONLINEAR;
- vma_prio_tree_remove(vma, &mapping->i_mmap);
- vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
- flush_dcache_mmap_unlock(mapping);
- spin_unlock(&mapping->i_mmap_lock);
- }
+ if (!vma || !(vma->vm_flags & VM_SHARED))
+ goto out;
- err = vma->vm_ops->populate(vma, start, size,
- vma->vm_page_prot,
- pgoff, flags & MAP_NONBLOCK);
+ /* vm_private_data already in use on a linear vma: cannot remap it */
+ if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR))
+ goto out;
+
+ /*
+ * The mapping must have set VM_CAN_NONLINEAR in its mmap handler.
+ * The parentheses matter: "!vma->vm_flags & VM_CAN_NONLINEAR"
+ * negates vm_flags first and so never rejects anything.
+ */
+ if (!(vma->vm_flags & VM_CAN_NONLINEAR))
+ goto out;
+
+ /* The requested range must be non-empty and lie inside the vma */
+ if (end <= start || start < vma->vm_start || end > vma->vm_end)
+ goto out;
+
+ /* Must set VM_NONLINEAR before any pages are populated. */
+ if (!(vma->vm_flags & VM_NONLINEAR)) {
+ /* Don't need a nonlinear mapping, exit success */
+ if (pgoff == linear_page_index(vma, start)) {
+ err = 0;
+ goto out;
+ }
+
+ if (!has_write_lock) {
+ up_read(&mm->mmap_sem);
+ down_write(&mm->mmap_sem);
+ has_write_lock = 1;
+ goto retry;
+ }
+ mapping = vma->vm_file->f_mapping;
/*
- * We can't clear VM_NONLINEAR because we'd have to do
- * it after ->populate completes, and that would prevent
- * downgrading the lock. (Locks can't be upgraded).
+ * page_mkclean doesn't work on nonlinear vmas, so if
+ * dirty pages need to be accounted, emulate with linear
+ * vmas.
*/
+ if (mapping_cap_account_dirty(mapping)) {
+ unsigned long addr;
+
+ flags &= MAP_NONBLOCK;
+ addr = mmap_region(vma->vm_file, start, size,
+ flags, vma->vm_flags, pgoff, 1);
+ if (IS_ERR_VALUE(addr)) {
+ err = addr;
+ } else {
+ BUG_ON(addr != start);
+ err = 0;
+ }
+ goto out;
+ }
+ spin_lock(&mapping->i_mmap_lock);
+ flush_dcache_mmap_lock(mapping);
+ vma->vm_flags |= VM_NONLINEAR;
+ vma_prio_tree_remove(vma, &mapping->i_mmap);
+ vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
+ flush_dcache_mmap_unlock(mapping);
+ spin_unlock(&mapping->i_mmap_lock);
+ }
+
+ err = populate_range(mm, vma, start, size, pgoff);
+ if (!err && !(flags & MAP_NONBLOCK)) {
+ if (unlikely(has_write_lock)) {
+ downgrade_write(&mm->mmap_sem);
+ has_write_lock = 0;
+ }
+ make_pages_present(start, start+size);
}
+
+ /*
+ * We can't clear VM_NONLINEAR because we'd have to do
+ * it after the range has been populated, and that would prevent
+ * downgrading the lock. (Locks can't be upgraded).
+ */
+
+out:
if (likely(!has_write_lock))
up_read(&mm->mmap_sem);
else
diff --git a/mm/highmem.c b/mm/highmem.c
index be8f8d36a8b..7a967bc3515 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -46,9 +46,14 @@ unsigned int nr_free_highpages (void)
pg_data_t *pgdat;
unsigned int pages = 0;
- for_each_online_pgdat(pgdat)
+ for_each_online_pgdat(pgdat) {
pages += zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
NR_FREE_PAGES);
+ if (zone_movable_is_highmem())
+ pages += zone_page_state(
+ &pgdat->node_zones[ZONE_MOVABLE],
+ NR_FREE_PAGES);
+ }
return pages;
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index acc0fb3cf06..f127940ec24 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -27,6 +27,9 @@ unsigned long max_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];
+static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
+unsigned long hugepages_treat_as_movable;
+
/*
* Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
*/
@@ -68,22 +71,20 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
{
int nid;
struct page *page = NULL;
- struct zonelist *zonelist = huge_zonelist(vma, address);
+ struct zonelist *zonelist = huge_zonelist(vma, address,
+ htlb_alloc_mask);
struct zone **z;
for (z = zonelist->zones; *z; z++) {
nid = zone_to_nid(*z);
- if (cpuset_zone_allowed_softwall(*z, GFP_HIGHUSER) &&
- !list_empty(&hugepage_freelists[nid]))
- break;
- }
-
- if (*z) {
- page = list_entry(hugepage_freelists[nid].next,
- struct page, lru);
- list_del(&page->lru);
- free_huge_pages--;
- free_huge_pages_node[nid]--;
+ if (cpuset_zone_allowed_softwall(*z, htlb_alloc_mask) &&
+ !list_empty(&hugepage_freelists[nid])) {
+ page = list_entry(hugepage_freelists[nid].next,
+ struct page, lru);
+ list_del(&page->lru);
+ free_huge_pages--;
+ free_huge_pages_node[nid]--;
+ /*
+ * Exactly one page is wanted. Without this break the
+ * loop would keep unlinking pages from later zones and
+ * return only the last one, losing the others.
+ */
+ break;
+ }
}
return page;
}
@@ -103,17 +104,21 @@ static int alloc_fresh_huge_page(void)
{
static int prev_nid;
struct page *page;
- static DEFINE_SPINLOCK(nid_lock);
int nid;
- spin_lock(&nid_lock);
+ /*
+ * Copy static prev_nid to local nid, work on that, then copy it
+ * back to prev_nid afterwards: otherwise there's a window in which
+ * a racer might pass invalid nid MAX_NUMNODES to alloc_pages_node.
+ * But we don't need to use a spin_lock here: it really doesn't
+ * matter if occasionally a racer chooses the same nid as we do.
+ */
nid = next_node(prev_nid, node_online_map);
if (nid == MAX_NUMNODES)
nid = first_node(node_online_map);
prev_nid = nid;
- spin_unlock(&nid_lock);
- page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
+ page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
HUGETLB_PAGE_ORDER);
if (page) {
set_compound_page_dtor(page, free_huge_page);
@@ -203,7 +208,7 @@ static void update_and_free_page(struct page *page)
1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
1 << PG_private | 1<< PG_writeback);
}
- page[1].lru.next = NULL;
+ set_compound_page_dtor(page, NULL);
set_page_refcounted(page);
__free_pages(page, HUGETLB_PAGE_ORDER);
}
@@ -263,6 +268,19 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write,
max_huge_pages = set_max_huge_pages(max_huge_pages);
return 0;
}
+
+/*
+ * Sysctl handler: after proc_dointvec() has processed the request,
+ * re-derive htlb_alloc_mask from hugepages_treat_as_movable
+ * (GFP_HIGHUSER_MOVABLE when non-zero, GFP_HIGHUSER otherwise).
+ * Returns proc_dointvec()'s result so a failed read/write is reported
+ * to the caller instead of being silently turned into success.
+ */
+int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
+ struct file *file, void __user *buffer,
+ size_t *length, loff_t *ppos)
+{
+ int ret = proc_dointvec(table, write, file, buffer, length, ppos);
+
+ /* Recompute unconditionally; harmless if the value did not change */
+ if (hugepages_treat_as_movable)
+ htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
+ else
+ htlb_alloc_mask = GFP_HIGHUSER;
+ return ret;
+}
+
#endif /* CONFIG_SYSCTL */
int hugetlb_report_meminfo(char *buf)
@@ -299,15 +317,14 @@ unsigned long hugetlb_total_pages(void)
* hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get
* this far.
*/
-static struct page *hugetlb_nopage(struct vm_area_struct *vma,
- unsigned long address, int *unused)
+static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
BUG();
- return NULL;
+ return 0;
}
struct vm_operations_struct hugetlb_vm_ops = {
- .nopage = hugetlb_nopage,
+ .fault = hugetlb_vm_op_fault,
};
static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
@@ -453,7 +470,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
avoidcopy = (page_count(old_page) == 1);
if (avoidcopy) {
set_huge_ptep_writable(vma, address, ptep);
- return VM_FAULT_MINOR;
+ return 0;
}
page_cache_get(old_page);
@@ -478,10 +495,10 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
}
page_cache_release(new_page);
page_cache_release(old_page);
- return VM_FAULT_MINOR;
+ return 0;
}
-int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
+static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *ptep, int write_access)
{
int ret = VM_FAULT_SIGBUS;
@@ -535,7 +552,7 @@ retry:
if (idx >= size)
goto backout;
- ret = VM_FAULT_MINOR;
+ ret = 0;
if (!pte_none(*ptep))
goto backout;
@@ -586,7 +603,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
return ret;
}
- ret = VM_FAULT_MINOR;
+ ret = 0;
spin_lock(&mm->page_table_lock);
/* Check for a racing update before calling hugetlb_cow */
@@ -625,7 +642,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
spin_unlock(&mm->page_table_lock);
ret = hugetlb_fault(mm, vma, vaddr, 0);
spin_lock(&mm->page_table_lock);
- if (ret == VM_FAULT_MINOR)
+ /*
+ * Retry the lookup only if the fault succeeded. Testing
+ * VM_FAULT_MAJOR here would also "continue" on OOM/SIGBUS
+ * and spin forever on a failing fault.
+ */
+ if (!(ret & VM_FAULT_ERROR))
continue;
remainder = 0;
diff --git a/mm/memory.c b/mm/memory.c
index b3d73bb1f68..8aace3db3a5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1047,7 +1047,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
if (pages)
foll_flags |= FOLL_GET;
if (!write && !(vma->vm_flags & VM_LOCKED) &&
- (!vma->vm_ops || !vma->vm_ops->nopage))
+ (!vma->vm_ops || (!vma->vm_ops->nopage &&
+ !vma->vm_ops->fault)))
foll_flags |= FOLL_ANON;
do {
@@ -1067,31 +1068,30 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
cond_resched();
while (!(page = follow_page(vma, start, foll_flags))) {
int ret;
- ret = __handle_mm_fault(mm, vma, start,
+ ret = handle_mm_fault(mm, vma, start,
foll_flags & FOLL_WRITE);
+ if (ret & VM_FAULT_ERROR) {
+ if (ret & VM_FAULT_OOM)
+ return i ? i : -ENOMEM;
+ else if (ret & VM_FAULT_SIGBUS)
+ return i ? i : -EFAULT;
+ BUG();
+ }
+ if (ret & VM_FAULT_MAJOR)
+ tsk->maj_flt++;
+ else
+ tsk->min_flt++;
+
/*
- * The VM_FAULT_WRITE bit tells us that do_wp_page has
- * broken COW when necessary, even if maybe_mkwrite
- * decided not to set pte_write. We can thus safely do
- * subsequent page lookups as if they were reads.
+ * The VM_FAULT_WRITE bit tells us that
+ * do_wp_page has broken COW when necessary,
+ * even if maybe_mkwrite decided not to set
+ * pte_write. We can thus safely do subsequent
+ * page lookups as if they were reads.
*/
if (ret & VM_FAULT_WRITE)
foll_flags &= ~FOLL_WRITE;
-
- switch (ret & ~VM_FAULT_WRITE) {
- case VM_FAULT_MINOR:
- tsk->min_flt++;
- break;
- case VM_FAULT_MAJOR:
- tsk->maj_flt++;
- break;
- case VM_FAULT_SIGBUS:
- return i ? i : -EFAULT;
- case VM_FAULT_OOM:
- return i ? i : -ENOMEM;
- default:
- BUG();
- }
+
cond_resched();
}
if (pages) {
@@ -1638,7 +1638,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
{
struct page *old_page, *new_page;
pte_t entry;
- int reuse = 0, ret = VM_FAULT_MINOR;
+ int reuse = 0, ret = 0;
struct page *dirty_page = NULL;
old_page = vm_normal_page(vma, address, orig_pte);
@@ -1715,11 +1715,11 @@ gotten:
if (unlikely(anon_vma_prepare(vma)))
goto oom;
if (old_page == ZERO_PAGE(address)) {
- new_page = alloc_zeroed_user_highpage(vma, address);
+ new_page = alloc_zeroed_user_highpage_movable(vma, address);
if (!new_page)
goto oom;
} else {
- new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
+ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
if (!new_page)
goto oom;
cow_user_page(new_page, old_page, address, vma);
@@ -1765,6 +1765,15 @@ gotten:
unlock:
pte_unmap_unlock(page_table, ptl);
if (dirty_page) {
+ /*
+ * Yes, Virginia, this is actually required to prevent a race
+ * with clear_page_dirty_for_io() from clearing the page dirty
+ * bit after it clears all dirty ptes, but before a racing
+ * do_wp_page installs a dirty pte.
+ *
+ * do_no_page is protected similarly.
+ */
+ wait_on_page_locked(dirty_page);
set_page_dirty_balance(dirty_page);
put_page(dirty_page);
}
@@ -1831,6 +1840,13 @@ static int unmap_mapping_range_vma(struct vm_area_struct *vma,
unsigned long restart_addr;
int need_break;
+ /*
+ * files that support invalidating or truncating portions of the
+ * file from under mmaped areas must have their ->fault function
+ * return a locked page (and set VM_FAULT_LOCKED in the return).
+ * This provides synchronisation against concurrent unmapping here.
+ */
+
again:
restart_addr = vma->vm_truncate_count;
if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
@@ -1959,17 +1975,8 @@ void unmap_mapping_range(struct address_space *mapping,
spin_lock(&mapping->i_mmap_lock);
- /* serialize i_size write against truncate_count write */
- smp_wmb();
- /* Protect against page faults, and endless unmapping loops */
+ /* Protect against endless unmapping loops */
mapping->truncate_count++;
- /*
- * For archs where spin_lock has inclusive semantics like ia64
- * this smp_mb() will prevent to read pagetable contents
- * before the truncate_count increment is visible to
- * other cpus.
- */
- smp_mb();
if (unlikely(is_restart_addr(mapping->truncate_count))) {
if (mapping->truncate_count == 0)
reset_vma_truncate_counts(mapping);
@@ -2008,8 +2015,18 @@ int vmtruncate(struct inode * inode, loff_t offset)
if (IS_SWAPFILE(inode))
goto out_busy;
i_size_write(inode, offset);
+
+ /*
+ * unmap_mapping_range is called twice, first simply for efficiency
+ * so that truncate_inode_pages does fewer single-page unmaps. However
+ * after this first call, and before truncate_inode_pages finishes,
+ * it is possible for private pages to be COWed, which remain after
+ * truncate_inode_pages finishes, hence the second unmap_mapping_range
+ * call must be made for correctness.
+ */
unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
truncate_inode_pages(mapping, offset);
+ unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
goto out_truncate;
do_expand:
@@ -2049,6 +2066,7 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
down_write(&inode->i_alloc_sem);
unmap_mapping_range(mapping, offset, (end - offset), 1);
truncate_inode_pages_range(mapping, offset, end);
+ unmap_mapping_range(mapping, offset, (end - offset), 1);
inode->i_op->truncate_range(inode, offset, end);
up_write(&inode->i_alloc_sem);
mutex_unlock(&inode->i_mutex);
@@ -2130,7 +2148,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page *page;
swp_entry_t entry;
pte_t pte;
- int ret = VM_FAULT_MINOR;
+ int ret = 0;
if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
goto out;
@@ -2198,15 +2216,15 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
unlock_page(page);
if (write_access) {
+ /* XXX: We could OR the do_wp_page code with this one? */
if (do_wp_page(mm, vma, address,
- page_table, pmd, ptl, pte) == VM_FAULT_OOM)
+ page_table, pmd, ptl, pte) & VM_FAULT_OOM)
ret = VM_FAULT_OOM;
goto out;
}
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, address, pte);
- lazy_mmu_prot_update(pte);
unlock:
pte_unmap_unlock(page_table, ptl);
out:
@@ -2237,7 +2255,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (unlikely(anon_vma_prepare(vma)))
goto oom;
- page = alloc_zeroed_user_highpage(vma, address);
+ page = alloc_zeroed_user_highpage_movable(vma, address);
if (!page)
goto oom;
@@ -2271,7 +2289,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
lazy_mmu_prot_update(entry);
unlock:
pte_unmap_unlock(page_table, ptl);
- return VM_FAULT_MINOR;
+ return 0;
release:
page_cache_release(page);
goto unlock;
@@ -2280,10 +2298,10 @@ oom:
}
/*
- * do_no_page() tries to create a new page mapping. It aggressively
+ * __do_fault() tries to create a new page mapping. It aggressively
* tries to share with existing pages, but makes a separate copy if
- * the "write_access" parameter is true in order to avoid the next
- * page fault.
+ * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid
+ * the next page fault.
*
* As this is called only for pages that do not currently exist, we
* do not need to flush old virtual caches or the TLB.
@@ -2292,89 +2310,100 @@ oom:
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
*/
-static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
+static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
- int write_access)
+ pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
spinlock_t *ptl;
- struct page *new_page;
- struct address_space *mapping = NULL;
+ struct page *page;
pte_t entry;
- unsigned int sequence = 0;
- int ret = VM_FAULT_MINOR;
int anon = 0;
struct page *dirty_page = NULL;
+ struct vm_fault vmf;
+ int ret;
+
+ vmf.virtual_address = (void __user *)(address & PAGE_MASK);
+ vmf.pgoff = pgoff;
+ vmf.flags = flags;
+ vmf.page = NULL;
pte_unmap(page_table);
BUG_ON(vma->vm_flags & VM_PFNMAP);
- if (vma->vm_file) {
- mapping = vma->vm_file->f_mapping;
- sequence = mapping->truncate_count;
- smp_rmb(); /* serializes i_size against truncate_count */
+ if (likely(vma->vm_ops->fault)) {
+ ret = vma->vm_ops->fault(vma, &vmf);
+ if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
+ return ret;
+ } else {
+ /* Legacy ->nopage path */
+ ret = 0;
+ vmf.page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
+ /* no page was available -- either SIGBUS or OOM */
+ if (unlikely(vmf.page == NOPAGE_SIGBUS))
+ return VM_FAULT_SIGBUS;
+ else if (unlikely(vmf.page == NOPAGE_OOM))
+ return VM_FAULT_OOM;
}
-retry:
- new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
+
/*
- * No smp_rmb is needed here as long as there's a full
- * spin_lock/unlock sequence inside the ->nopage callback
- * (for the pagecache lookup) that acts as an implicit
- * smp_mb() and prevents the i_size read to happen
- * after the next truncate_count read.
+ * For consistency in subsequent calls, make the faulted page always
+ * locked.
*/
-
- /* no page was available -- either SIGBUS, OOM or REFAULT */
- if (unlikely(new_page == NOPAGE_SIGBUS))
- return VM_FAULT_SIGBUS;
- else if (unlikely(new_page == NOPAGE_OOM))
- return VM_FAULT_OOM;
- else if (unlikely(new_page == NOPAGE_REFAULT))
- return VM_FAULT_MINOR;
+ if (unlikely(!(ret & VM_FAULT_LOCKED)))
+ lock_page(vmf.page);
+ else
+ VM_BUG_ON(!PageLocked(vmf.page));
/*
* Should we do an early C-O-W break?
*/
- if (write_access) {
+ page = vmf.page;
+ if (flags & FAULT_FLAG_WRITE) {
if (!(vma->vm_flags & VM_SHARED)) {
- struct page *page;
-
- if (unlikely(anon_vma_prepare(vma)))
- goto oom;
- page = alloc_page_vma(GFP_HIGHUSER, vma, address);
- if (!page)
- goto oom;
- copy_user_highpage(page, new_page, address, vma);
- page_cache_release(new_page);
- new_page = page;
anon = 1;
-
+ if (unlikely(anon_vma_prepare(vma))) {
+ ret = VM_FAULT_OOM;
+ goto out;
+ }
+ page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
+ vma, address);
+ if (!page) {
+ ret = VM_FAULT_OOM;
+ goto out;
+ }
+ copy_user_highpage(page, vmf.page, address, vma);
} else {
- /* if the page will be shareable, see if the backing
+ /*
+ * If the page will be shareable, see if the backing
* address space wants to know that the page is about
- * to become writable */
- if (vma->vm_ops->page_mkwrite &&
- vma->vm_ops->page_mkwrite(vma, new_page) < 0
- ) {
- page_cache_release(new_page);
- return VM_FAULT_SIGBUS;
+ * to become writable
+ */
+ if (vma->vm_ops->page_mkwrite) {
+ unlock_page(page);
+ if (vma->vm_ops->page_mkwrite(vma, page) < 0) {
+ ret = VM_FAULT_SIGBUS;
+ anon = 1; /* no anon but release vmf.page */
+ goto out_unlocked;
+ }
+ lock_page(page);
+ /*
+ * XXX: this is not quite right (racy vs
+ * invalidate) to unlock and relock the page
+ * like this, however a better fix requires
+ * reworking page_mkwrite locking API, which
+ * is better done later.
+ */
+ if (!page->mapping) {
+ ret = 0;
+ anon = 1; /* no anon but release vmf.page */
+ goto out;
+ }
}
}
+
}
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
- /*
- * For a file-backed vma, someone could have truncated or otherwise
- * invalidated this page. If unmap_mapping_range got called,
- * retry getting the page.
- */
- if (mapping && unlikely(sequence != mapping->truncate_count)) {
- pte_unmap_unlock(page_table, ptl);
- page_cache_release(new_page);
- cond_resched();
- sequence = mapping->truncate_count;
- smp_rmb();
- goto retry;
- }
/*
* This silly early PAGE_DIRTY setting removes a race
@@ -2387,45 +2416,63 @@ retry:
* handle that later.
*/
/* Only go through if we didn't race with anybody else... */
- if (pte_none(*page_table)) {
- flush_icache_page(vma, new_page);
- entry = mk_pte(new_page, vma->vm_page_prot);
- if (write_access)
+ if (likely(pte_same(*page_table, orig_pte))) {
+ flush_icache_page(vma, page);
+ entry = mk_pte(page, vma->vm_page_prot);
+ if (flags & FAULT_FLAG_WRITE)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
set_pte_at(mm, address, page_table, entry);
if (anon) {
- inc_mm_counter(mm, anon_rss);
- lru_cache_add_active(new_page);
- page_add_new_anon_rmap(new_page, vma, address);
+ inc_mm_counter(mm, anon_rss);
+ lru_cache_add_active(page);
+ page_add_new_anon_rmap(page, vma, address);
} else {
inc_mm_counter(mm, file_rss);
- page_add_file_rmap(new_page);
- if (write_access) {
- dirty_page = new_page;
+ page_add_file_rmap(page);
+ if (flags & FAULT_FLAG_WRITE) {
+ dirty_page = page;
get_page(dirty_page);
}
}
+
+ /* no need to invalidate: a not-present page won't be cached */
+ update_mmu_cache(vma, address, entry);
+ lazy_mmu_prot_update(entry);
} else {
- /* One of our sibling threads was faster, back out. */
- page_cache_release(new_page);
- goto unlock;
+ if (anon)
+ page_cache_release(page);
+ else
+ anon = 1; /* no anon but release faulted_page */
}
- /* no need to invalidate: a not-present page shouldn't be cached */
- update_mmu_cache(vma, address, entry);
- lazy_mmu_prot_update(entry);
-unlock:
pte_unmap_unlock(page_table, ptl);
- if (dirty_page) {
+
+out:
+ unlock_page(vmf.page);
+out_unlocked:
+ if (anon)
+ page_cache_release(vmf.page);
+ else if (dirty_page) {
set_page_dirty_balance(dirty_page);
put_page(dirty_page);
}
+
return ret;
-oom:
- page_cache_release(new_page);
- return VM_FAULT_OOM;
}
+static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table, pmd_t *pmd,
+ int write_access, pte_t orig_pte)
+{
+ pgoff_t pgoff = (((address & PAGE_MASK)
+ - vma->vm_start) >> PAGE_CACHE_SHIFT) + vma->vm_pgoff;
+ unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
+
+ return __do_fault(mm, vma, address, page_table, pmd, pgoff,
+ flags, orig_pte);
+}
+
+
/*
* do_no_pfn() tries to create a new page mapping for a page without
* a struct_page backing it
@@ -2449,7 +2496,6 @@ static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
spinlock_t *ptl;
pte_t entry;
unsigned long pfn;
- int ret = VM_FAULT_MINOR;
pte_unmap(page_table);
BUG_ON(!(vma->vm_flags & VM_PFNMAP));
@@ -2461,7 +2507,7 @@ static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
else if (unlikely(pfn == NOPFN_SIGBUS))
return VM_FAULT_SIGBUS;
else if (unlikely(pfn == NOPFN_REFAULT))
- return VM_FAULT_MINOR;
+ return 0;
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
@@ -2473,7 +2519,7 @@ static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
set_pte_at(mm, address, page_table, entry);
}
pte_unmap_unlock(page_table, ptl);
- return ret;
+ return 0;
}
/*
@@ -2485,33 +2531,30 @@ static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
*/
-static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
+static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
int write_access, pte_t orig_pte)
{
+ unsigned int flags = FAULT_FLAG_NONLINEAR |
+ (write_access ? FAULT_FLAG_WRITE : 0);
pgoff_t pgoff;
- int err;
if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
- return VM_FAULT_MINOR;
+ return 0;
- if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
+ if (unlikely(!(vma->vm_flags & VM_NONLINEAR) ||
+ !(vma->vm_flags & VM_CAN_NONLINEAR))) {
/*
* Page table corrupted: show pte and kill process.
*/
print_bad_pte(vma, orig_pte, address);
return VM_FAULT_OOM;
}
- /* We can then assume vm->vm_ops && vma->vm_ops->populate */
pgoff = pte_to_pgoff(orig_pte);
- err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE,
- vma->vm_page_prot, pgoff, 0);
- if (err == -ENOMEM)
- return VM_FAULT_OOM;
- if (err)
- return VM_FAULT_SIGBUS;
- return VM_FAULT_MAJOR;
+
+ return __do_fault(mm, vma, address, page_table, pmd, pgoff,
+ flags, orig_pte);
}
/*
@@ -2538,10 +2581,9 @@ static inline int handle_pte_fault(struct mm_struct *mm,
if (!pte_present(entry)) {
if (pte_none(entry)) {
if (vma->vm_ops) {
- if (vma->vm_ops->nopage)
- return do_no_page(mm, vma, address,
- pte, pmd,
- write_access);
+ if (vma->vm_ops->fault || vma->vm_ops->nopage)
+ return do_linear_fault(mm, vma, address,
+ pte, pmd, write_access, entry);
if (unlikely(vma->vm_ops->nopfn))
return do_no_pfn(mm, vma, address, pte,
pmd, write_access);
@@ -2550,7 +2592,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
pte, pmd, write_access);
}
if (pte_file(entry))
- return do_file_page(mm, vma, address,
+ return do_nonlinear_fault(mm, vma, address,
pte, pmd, write_access, entry);
return do_swap_page(mm, vma, address,
pte, pmd, write_access, entry);
@@ -2582,13 +2624,13 @@ static inline int handle_pte_fault(struct mm_struct *mm,
}
unlock:
pte_unmap_unlock(pte, ptl);
- return VM_FAULT_MINOR;
+ return 0;
}
/*
* By the time we get here, we already hold the mm semaphore
*/
-int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, int write_access)
{
pgd_t *pgd;
@@ -2617,7 +2659,7 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
}
-EXPORT_SYMBOL_GPL(__handle_mm_fault);
+EXPORT_SYMBOL_GPL(handle_mm_fault);
#ifndef __PAGETABLE_PUD_FOLDED
/*
@@ -2823,3 +2865,4 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
return buf - old_buf;
}
+EXPORT_SYMBOL_GPL(access_process_vm);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 188f8d9c4ae..9f4e9b95e8f 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -594,7 +594,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
static struct page *new_node_page(struct page *page, unsigned long node, int **x)
{
- return alloc_pages_node(node, GFP_HIGHUSER, 0);
+ return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
}
/*
@@ -710,7 +710,8 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int *
{
struct vm_area_struct *vma = (struct vm_area_struct *)private;
- return alloc_page_vma(GFP_HIGHUSER, vma, page_address_in_vma(page, vma));
+ return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
+ page_address_in_vma(page, vma));
}
#else
@@ -1202,7 +1203,8 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
#ifdef CONFIG_HUGETLBFS
/* Return a zonelist suitable for a huge page allocation. */
-struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
+struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
+ gfp_t gfp_flags)
{
struct mempolicy *pol = get_vma_policy(current, vma, addr);
@@ -1210,7 +1212,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
unsigned nid;
nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
- return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
+ return NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_flags);
}
return zonelist_policy(GFP_HIGHUSER, pol);
}
diff --git a/mm/mempool.c b/mm/mempool.c
index 3e8f1fed0e1..02d5ec3feab 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -62,10 +62,9 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
mempool_free_t *free_fn, void *pool_data, int node_id)
{
mempool_t *pool;
- pool = kmalloc_node(sizeof(*pool), GFP_KERNEL, node_id);
+ pool = kmalloc_node(sizeof(*pool), GFP_KERNEL | __GFP_ZERO, node_id);
if (!pool)
return NULL;
- memset(pool, 0, sizeof(*pool));
pool->elements = kmalloc_node(min_nr * sizeof(void *),
GFP_KERNEL, node_id);
if (!pool->elements) {
diff --git a/mm/migrate.c b/mm/migrate.c
index a91ca00abeb..34d8ada053e 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -761,7 +761,8 @@ static struct page *new_page_node(struct page *p, unsigned long private,
*result = &pm->status;
- return alloc_pages_node(pm->node, GFP_HIGHUSER | GFP_THISNODE, 0);
+ return alloc_pages_node(pm->node,
+ GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
}
/*
diff --git a/mm/mmap.c b/mm/mmap.c
index 144b4a290f2..7afc7a7cec6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1165,12 +1165,8 @@ out:
mm->locked_vm += len >> PAGE_SHIFT;
make_pages_present(addr, addr + len);
}
- if (flags & MAP_POPULATE) {
- up_write(&mm->mmap_sem);
- sys_remap_file_pages(addr, len, 0,
- pgoff, flags & MAP_NONBLOCK);
- down_write(&mm->mmap_sem);
- }
+ if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
+ make_pages_present(addr, addr + len);
return addr;
unmap_and_free_vma:
@@ -1575,33 +1571,11 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
}
#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
-#ifdef CONFIG_STACK_GROWSUP
-int expand_stack(struct vm_area_struct *vma, unsigned long address)
-{
- return expand_upwards(vma, address);
-}
-
-struct vm_area_struct *
-find_extend_vma(struct mm_struct *mm, unsigned long addr)
-{
- struct vm_area_struct *vma, *prev;
-
- addr &= PAGE_MASK;
- vma = find_vma_prev(mm, addr, &prev);
- if (vma && (vma->vm_start <= addr))
- return vma;
- if (!prev || expand_stack(prev, addr))
- return NULL;
- if (prev->vm_flags & VM_LOCKED) {
- make_pages_present(addr, prev->vm_end);
- }
- return prev;
-}
-#else
/*
* vma is the first one with address < vma->vm_start. Have to extend vma.
*/
-int expand_stack(struct vm_area_struct *vma, unsigned long address)
+static inline int expand_downwards(struct vm_area_struct *vma,
+ unsigned long address)
{
int error;
@@ -1638,6 +1612,38 @@ int expand_stack(struct vm_area_struct *vma, unsigned long address)
return error;
}
+int expand_stack_downwards(struct vm_area_struct *vma, unsigned long address)
+{
+ return expand_downwards(vma, address);
+}
+
+#ifdef CONFIG_STACK_GROWSUP
+int expand_stack(struct vm_area_struct *vma, unsigned long address)
+{
+ return expand_upwards(vma, address);
+}
+
+struct vm_area_struct *
+find_extend_vma(struct mm_struct *mm, unsigned long addr)
+{
+ struct vm_area_struct *vma, *prev;
+
+ addr &= PAGE_MASK;
+ vma = find_vma_prev(mm, addr, &prev);
+ if (vma && (vma->vm_start <= addr))
+ return vma;
+ if (!prev || expand_stack(prev, addr))
+ return NULL;
+ if (prev->vm_flags & VM_LOCKED)
+ make_pages_present(addr, prev->vm_end);
+ return prev;
+}
+#else
+int expand_stack(struct vm_area_struct *vma, unsigned long address)
+{
+ return expand_downwards(vma, address);
+}
+
struct vm_area_struct *
find_extend_vma(struct mm_struct * mm, unsigned long addr)
{
@@ -1655,9 +1661,8 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
start = vma->vm_start;
if (expand_stack(vma, addr))
return NULL;
- if (vma->vm_flags & VM_LOCKED) {
+ if (vma->vm_flags & VM_LOCKED)
make_pages_present(addr, start);
- }
return vma;
}
#endif
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 3b8f3c0c63f..e8346c30abe 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -128,7 +128,7 @@ static void change_protection(struct vm_area_struct *vma,
flush_tlb_range(vma, start, end);
}
-static int
+int
mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
unsigned long start, unsigned long end, unsigned long newflags)
{
diff --git a/mm/mremap.c b/mm/mremap.c
index bc7c52efc71..8ea5c2412c6 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -120,7 +120,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
#define LATENCY_LIMIT (64 * PAGE_SIZE)
-static unsigned long move_page_tables(struct vm_area_struct *vma,
+unsigned long move_page_tables(struct vm_area_struct *vma,
unsigned long old_addr, struct vm_area_struct *new_vma,
unsigned long new_addr, unsigned long len)
{
diff --git a/mm/nommu.c b/mm/nommu.c
index 8bbbf147a79..1b105d28949 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1341,11 +1341,10 @@ int in_gate_area_no_task(unsigned long addr)
return 0;
}
-struct page *filemap_nopage(struct vm_area_struct *area,
- unsigned long address, int *type)
+int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
BUG();
- return NULL;
+ return 0;
}
/*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index ea9da3bed3e..63512a9ed57 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -824,6 +824,7 @@ int __set_page_dirty_nobuffers(struct page *page)
mapping2 = page_mapping(page);
if (mapping2) { /* Race with truncate? */
BUG_ON(mapping2 != mapping);
+ WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
if (mapping_cap_account_dirty(mapping)) {
__inc_zone_page_state(page, NR_FILE_DIRTY);
task_io_account_write(PAGE_CACHE_SIZE);
@@ -917,6 +918,9 @@ int clear_page_dirty_for_io(struct page *page)
{
struct address_space *mapping = page_mapping(page);
+ BUG_ON(!PageLocked(page));
+
+ ClearPageReclaim(page);
if (mapping && mapping_cap_account_dirty(mapping)) {
/*
* Yes, Virginia, this is indeed insane.
@@ -942,14 +946,19 @@ int clear_page_dirty_for_io(struct page *page)
* We basically use the page "master dirty bit"
* as a serialization point for all the different
* threads doing their things.
- *
- * FIXME! We still have a race here: if somebody
- * adds the page back to the page tables in
- * between the "page_mkclean()" and the "TestClearPageDirty()",
- * we might have it mapped without the dirty bit set.
*/
if (page_mkclean(page))
set_page_dirty(page);
+ /*
+ * We carefully synchronise fault handlers against
+ * installing a dirty pte and marking the page dirty
+ * at this point. We do this by having them hold the
+ * page lock at some point after installing their
+ * pte, but before marking the page dirty.
+ * Pages are always locked coming in here, so we get
+ * the desired exclusion. See mm/memory.c:do_wp_page()
+ * for more comments.
+ */
if (TestClearPageDirty(page)) {
dec_zone_page_state(page, NR_FILE_DIRTY);
return 1;
@@ -978,6 +987,8 @@ int test_clear_page_writeback(struct page *page)
} else {
ret = TestClearPageWriteback(page);
}
+ if (ret)
+ dec_zone_page_state(page, NR_WRITEBACK);
return ret;
}
@@ -1003,6 +1014,8 @@ int test_set_page_writeback(struct page *page)
} else {
ret = TestSetPageWriteback(page);
}
+ if (!ret)
+ inc_zone_page_state(page, NR_WRITEBACK);
return ret;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f9e4e647d7e..43cb3b3e167 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -80,8 +80,9 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
256,
#endif
#ifdef CONFIG_HIGHMEM
- 32
+ 32,
#endif
+ 32,
};
EXPORT_SYMBOL(totalram_pages);
@@ -95,8 +96,9 @@ static char * const zone_names[MAX_NR_ZONES] = {
#endif
"Normal",
#ifdef CONFIG_HIGHMEM
- "HighMem"
+ "HighMem",
#endif
+ "Movable",
};
int min_free_kbytes = 1024;
@@ -134,6 +136,13 @@ static unsigned long __meminitdata dma_reserve;
static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES];
static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES];
#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
+ unsigned long __initdata required_kernelcore;
+ unsigned long __initdata required_movablecore;
+ unsigned long __initdata zone_movable_pfn[MAX_NUMNODES];
+
+ /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
+ int movable_zone;
+ EXPORT_SYMBOL(movable_zone);
#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
#if MAX_NUMNODES > 1
@@ -444,12 +453,6 @@ static inline int free_pages_check(struct page *page)
1 << PG_reserved |
1 << PG_buddy ))))
bad_page(page);
- /*
- * PageReclaim == PageTail. It is only an error
- * for PageReclaim to be set if PageCompound is clear.
- */
- if (unlikely(!PageCompound(page) && PageReclaim(page)))
- bad_page(page);
if (PageDirty(page))
__ClearPageDirty(page);
/*
@@ -593,7 +596,6 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
1 << PG_locked |
1 << PG_active |
1 << PG_dirty |
- 1 << PG_reclaim |
1 << PG_slab |
1 << PG_swapcache |
1 << PG_writeback |
@@ -608,7 +610,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
if (PageReserved(page))
return 1;
- page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
+ page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_readahead |
1 << PG_referenced | 1 << PG_arch_1 |
1 << PG_owner_priv_1 | 1 << PG_mappedtodisk);
set_page_private(page, 0);
@@ -1324,7 +1326,7 @@ nofail_alloc:
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
- did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask);
+ did_some_progress = try_to_free_pages(zonelist->zones, order, gfp_mask);
p->reclaim_state = NULL;
p->flags &= ~PF_MEMALLOC;
@@ -1361,7 +1363,8 @@ nofail_alloc:
*/
do_retry = 0;
if (!(gfp_mask & __GFP_NORETRY)) {
- if ((order <= 3) || (gfp_mask & __GFP_REPEAT))
+ if ((order <= PAGE_ALLOC_COSTLY_ORDER) ||
+ (gfp_mask & __GFP_REPEAT))
do_retry = 1;
if (gfp_mask & __GFP_NOFAIL)
do_retry = 1;
@@ -1474,13 +1477,14 @@ unsigned int nr_free_buffer_pages(void)
{
return nr_free_zone_pages(gfp_zone(GFP_USER));
}
+EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
/*
* Amount of free RAM allocatable within all zones
*/
unsigned int nr_free_pagecache_pages(void)
{
- return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER));
+ return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
}
static inline void show_node(struct zone *zone)
@@ -2667,6 +2671,63 @@ void __meminit get_pfn_range_for_nid(unsigned int nid,
}
/*
+ * This finds a zone that can be used for ZONE_MOVABLE pages. The
+ * assumption is made that zones within a node are ordered by monotonically
+ * increasing memory addresses so that the "highest" populated zone is used
+ */
+void __init find_usable_zone_for_movable(void)
+{
+ int zone_index;
+ for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
+ if (zone_index == ZONE_MOVABLE)
+ continue;
+
+ if (arch_zone_highest_possible_pfn[zone_index] >
+ arch_zone_lowest_possible_pfn[zone_index])
+ break;
+ }
+
+ VM_BUG_ON(zone_index == -1);
+ movable_zone = zone_index;
+}
+
+/*
+ * The zone ranges provided by the architecture do not include ZONE_MOVABLE
+ * because it is sized independently of architecture. Unlike the other zones,
+ * the starting point for ZONE_MOVABLE is not fixed. It may be different
+ * in each node depending on the size of each node and how evenly kernelcore
+ * is distributed. This helper function adjusts the zone ranges
+ * provided by the architecture for a given node by using the end of the
+ * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
+ * zones within a node are in order of monotonically increasing memory addresses
+ */
+void __meminit adjust_zone_range_for_zone_movable(int nid,
+ unsigned long zone_type,
+ unsigned long node_start_pfn,
+ unsigned long node_end_pfn,
+ unsigned long *zone_start_pfn,
+ unsigned long *zone_end_pfn)
+{
+ /* Only adjust if ZONE_MOVABLE is on this node */
+ if (zone_movable_pfn[nid]) {
+ /* Size ZONE_MOVABLE */
+ if (zone_type == ZONE_MOVABLE) {
+ *zone_start_pfn = zone_movable_pfn[nid];
+ *zone_end_pfn = min(node_end_pfn,
+ arch_zone_highest_possible_pfn[movable_zone]);
+
+ /* Adjust for ZONE_MOVABLE starting within this range */
+ } else if (*zone_start_pfn < zone_movable_pfn[nid] &&
+ *zone_end_pfn > zone_movable_pfn[nid]) {
+ *zone_end_pfn = zone_movable_pfn[nid];
+
+ /* Check if this whole range is within ZONE_MOVABLE */
+ } else if (*zone_start_pfn >= zone_movable_pfn[nid])
+ *zone_start_pfn = *zone_end_pfn;
+ }
+}
+
+/*
* Return the number of pages a zone spans in a node, including holes
* present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
*/
@@ -2681,6 +2742,9 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
+ adjust_zone_range_for_zone_movable(nid, zone_type,
+ node_start_pfn, node_end_pfn,
+ &zone_start_pfn, &zone_end_pfn);
/* Check that this node has pages within the zone's required range */
if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
@@ -2771,6 +2835,9 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type],
node_end_pfn);
+ adjust_zone_range_for_zone_movable(nid, zone_type,
+ node_start_pfn, node_end_pfn,
+ &zone_start_pfn, &zone_end_pfn);
return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
}
@@ -3148,6 +3215,157 @@ unsigned long __init find_max_pfn_with_active_regions(void)
return max_pfn;
}
+unsigned long __init early_calculate_totalpages(void)
+{
+ int i;
+ unsigned long totalpages = 0;
+
+ for (i = 0; i < nr_nodemap_entries; i++)
+ totalpages += early_node_map[i].end_pfn -
+ early_node_map[i].start_pfn;
+
+ return totalpages;
+}
+
+/*
+ * Find the PFN the Movable zone begins in each node. Kernel memory
+ * is spread evenly between nodes as long as the nodes have enough
+ * memory. When they don't, some nodes will have more kernelcore than
+ * others
+ */
+void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
+{
+ int i, nid;
+ unsigned long usable_startpfn;
+ unsigned long kernelcore_node, kernelcore_remaining;
+ int usable_nodes = num_online_nodes();
+
+ /*
+ * If movablecore was specified, calculate what size of
+ * kernelcore that corresponds so that memory usable for
+ * any allocation type is evenly spread. If both kernelcore
+ * and movablecore are specified, then the value of kernelcore
+ * will be used for required_kernelcore if it's greater than
+ * what movablecore would have allowed.
+ */
+ if (required_movablecore) {
+ unsigned long totalpages = early_calculate_totalpages();
+ unsigned long corepages;
+
+ /*
+ * Round-up so that ZONE_MOVABLE is at least as large as what
+ * was requested by the user
+ */
+ required_movablecore =
+ roundup(required_movablecore, MAX_ORDER_NR_PAGES);
+ corepages = totalpages - required_movablecore;
+
+ required_kernelcore = max(required_kernelcore, corepages);
+ }
+
+ /* If kernelcore was not specified, there is no ZONE_MOVABLE */
+ if (!required_kernelcore)
+ return;
+
+ /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
+ find_usable_zone_for_movable();
+ usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
+
+restart:
+ /* Spread kernelcore memory as evenly as possible throughout nodes */
+ kernelcore_node = required_kernelcore / usable_nodes;
+ for_each_online_node(nid) {
+ /*
+ * Recalculate kernelcore_node if the division per node
+ * now exceeds what is necessary to satisfy the requested
+ * amount of memory for the kernel
+ */
+ if (required_kernelcore < kernelcore_node)
+ kernelcore_node = required_kernelcore / usable_nodes;
+
+ /*
+ * As the map is walked, we track how much memory is usable
+ * by the kernel using kernelcore_remaining. When it is
+ * 0, the rest of the node is usable by ZONE_MOVABLE
+ */
+ kernelcore_remaining = kernelcore_node;
+
+ /* Go through each range of PFNs within this node */
+ for_each_active_range_index_in_nid(i, nid) {
+ unsigned long start_pfn, end_pfn;
+ unsigned long size_pages;
+
+ start_pfn = max(early_node_map[i].start_pfn,
+ zone_movable_pfn[nid]);
+ end_pfn = early_node_map[i].end_pfn;
+ if (start_pfn >= end_pfn)
+ continue;
+
+ /* Account for what is only usable for kernelcore */
+ if (start_pfn < usable_startpfn) {
+ unsigned long kernel_pages;
+ kernel_pages = min(end_pfn, usable_startpfn)
+ - start_pfn;
+
+ kernelcore_remaining -= min(kernel_pages,
+ kernelcore_remaining);
+ required_kernelcore -= min(kernel_pages,
+ required_kernelcore);
+
+ /* Continue if range is now fully accounted */
+ if (end_pfn <= usable_startpfn) {
+
+ /*
+ * Push zone_movable_pfn to the end so
+ * that if we have to rebalance
+ * kernelcore across nodes, we will
+ * not double account here
+ */
+ zone_movable_pfn[nid] = end_pfn;
+ continue;
+ }
+ start_pfn = usable_startpfn;
+ }
+
+ /*
+ * The usable PFN range for ZONE_MOVABLE is from
+ * start_pfn->end_pfn. Calculate size_pages as the
+ * number of pages used as kernelcore
+ */
+ size_pages = end_pfn - start_pfn;
+ if (size_pages > kernelcore_remaining)
+ size_pages = kernelcore_remaining;
+ zone_movable_pfn[nid] = start_pfn + size_pages;
+
+ /*
+ * Some kernelcore has been met, update counts and
+ * break if the kernelcore for this node has been
+ * satisfied
+ */
+ required_kernelcore -= min(required_kernelcore,
+ size_pages);
+ kernelcore_remaining -= size_pages;
+ if (!kernelcore_remaining)
+ break;
+ }
+ }
+
+ /*
+ * If there is still required_kernelcore, we do another pass with one
+ * less node in the count. This will push zone_movable_pfn[nid] further
+ * along on the nodes that still have memory until kernelcore is
+ * satisfied
+ */
+ usable_nodes--;
+ if (usable_nodes && required_kernelcore > usable_nodes)
+ goto restart;
+
+ /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
+ for (nid = 0; nid < MAX_NUMNODES; nid++)
+ zone_movable_pfn[nid] =
+ roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
+}
+
/**
* free_area_init_nodes - Initialise all pg_data_t and zone data
* @max_zone_pfn: an array of max PFNs for each zone
@@ -3177,19 +3395,37 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
for (i = 1; i < MAX_NR_ZONES; i++) {
+ if (i == ZONE_MOVABLE)
+ continue;
arch_zone_lowest_possible_pfn[i] =
arch_zone_highest_possible_pfn[i-1];
arch_zone_highest_possible_pfn[i] =
max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
}
+ arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
+ arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
+
+ /* Find the PFNs that ZONE_MOVABLE begins at in each node */
+ memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
+ find_zone_movable_pfns_for_nodes(zone_movable_pfn);
/* Print out the zone ranges */
printk("Zone PFN ranges:\n");
- for (i = 0; i < MAX_NR_ZONES; i++)
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ if (i == ZONE_MOVABLE)
+ continue;
printk(" %-8s %8lu -> %8lu\n",
zone_names[i],
arch_zone_lowest_possible_pfn[i],
arch_zone_highest_possible_pfn[i]);
+ }
+
+ /* Print out the PFNs ZONE_MOVABLE begins at in each node */
+ printk("Movable zone start PFN for each node\n");
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ if (zone_movable_pfn[i])
+ printk(" Node %d: %lu\n", i, zone_movable_pfn[i]);
+ }
/* Print out the early_node_map[] */
printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
@@ -3206,6 +3442,43 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
find_min_pfn_for_node(nid), NULL);
}
}
+
+static int __init cmdline_parse_core(char *p, unsigned long *core)
+{
+ unsigned long long coremem;
+ if (!p)
+ return -EINVAL;
+
+ coremem = memparse(p, &p);
+ *core = coremem >> PAGE_SHIFT;
+
+ /* Paranoid check that UL is enough for the coremem value */
+ WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
+
+ return 0;
+}
+
+/*
+ * kernelcore=size sets the amount of memory for use for allocations that
+ * cannot be reclaimed or migrated.
+ */
+static int __init cmdline_parse_kernelcore(char *p)
+{
+ return cmdline_parse_core(p, &required_kernelcore);
+}
+
+/*
+ * movablecore=size sets the amount of memory for use for allocations that
+ * can be reclaimed or migrated.
+ */
+static int __init cmdline_parse_movablecore(char *p)
+{
+ return cmdline_parse_core(p, &required_movablecore);
+}
+
+early_param("kernelcore", cmdline_parse_kernelcore);
+early_param("movablecore", cmdline_parse_movablecore);
+
#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
/**
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 8ce0900dc95..8f6ee073c0e 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -92,6 +92,7 @@ struct pdflush_work {
static int __pdflush(struct pdflush_work *my_work)
{
current->flags |= PF_FLUSHER | PF_SWAPWRITE;
+ set_freezable();
my_work->fn = NULL;
my_work->who = current;
INIT_LIST_HEAD(&my_work->list);
diff --git a/mm/readahead.c b/mm/readahead.c
index 9861e883fe5..39bf45d4332 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -21,8 +21,16 @@ void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
}
EXPORT_SYMBOL(default_unplug_io_fn);
+/*
+ * Convenient macros for min/max read-ahead pages.
+ * Note that MAX_RA_PAGES is rounded down, while MIN_RA_PAGES is rounded up.
+ * The latter is necessary for systems with large page sizes (e.g. 64k).
+ */
+#define MAX_RA_PAGES (VM_MAX_READAHEAD*1024 / PAGE_CACHE_SIZE)
+#define MIN_RA_PAGES DIV_ROUND_UP(VM_MIN_READAHEAD*1024, PAGE_CACHE_SIZE)
+
struct backing_dev_info default_backing_dev_info = {
- .ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE,
+ .ra_pages = MAX_RA_PAGES,
.state = 0,
.capabilities = BDI_CAP_MAP_COPY,
.unplug_io_fn = default_unplug_io_fn,
@@ -41,82 +49,6 @@ file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
}
EXPORT_SYMBOL_GPL(file_ra_state_init);
-/*
- * Return max readahead size for this inode in number-of-pages.
- */
-static inline unsigned long get_max_readahead(struct file_ra_state *ra)
-{
- return ra->ra_pages;
-}
-
-static inline unsigned long get_min_readahead(struct file_ra_state *ra)
-{
- return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE;
-}
-
-static inline void reset_ahead_window(struct file_ra_state *ra)
-{
- /*
- * ... but preserve ahead_start + ahead_size value,
- * see 'recheck:' label in page_cache_readahead().
- * Note: We never use ->ahead_size as rvalue without
- * checking ->ahead_start != 0 first.
- */
- ra->ahead_size += ra->ahead_start;
- ra->ahead_start = 0;
-}
-
-static inline void ra_off(struct file_ra_state *ra)
-{
- ra->start = 0;
- ra->flags = 0;
- ra->size = 0;
- reset_ahead_window(ra);
- return;
-}
-
-/*
- * Set the initial window size, round to next power of 2 and square
- * for small size, x 4 for medium, and x 2 for large
- * for 128k (32 page) max ra
- * 1-8 page = 32k initial, > 8 page = 128k initial
- */
-static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
-{
- unsigned long newsize = roundup_pow_of_two(size);
-
- if (newsize <= max / 32)
- newsize = newsize * 4;
- else if (newsize <= max / 4)
- newsize = newsize * 2;
- else
- newsize = max;
- return newsize;
-}
-
-/*
- * Set the new window size, this is called only when I/O is to be submitted,
- * not for each call to readahead. If a cache miss occured, reduce next I/O
- * size, else increase depending on how close to max we are.
- */
-static inline unsigned long get_next_ra_size(struct file_ra_state *ra)
-{
- unsigned long max = get_max_readahead(ra);
- unsigned long min = get_min_readahead(ra);
- unsigned long cur = ra->size;
- unsigned long newsize;
-
- if (ra->flags & RA_FLAG_MISS) {
- ra->flags &= ~RA_FLAG_MISS;
- newsize = max((cur - 2), min);
- } else if (cur < max / 16) {
- newsize = 4 * cur;
- } else {
- newsize = 2 * cur;
- }
- return min(newsize, max);
-}
-
#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
/**
@@ -193,66 +125,6 @@ out:
}
/*
- * Readahead design.
- *
- * The fields in struct file_ra_state represent the most-recently-executed
- * readahead attempt:
- *
- * start: Page index at which we started the readahead
- * size: Number of pages in that read
- * Together, these form the "current window".
- * Together, start and size represent the `readahead window'.
- * prev_index: The page which the readahead algorithm most-recently inspected.
- * It is mainly used to detect sequential file reading.
- * If page_cache_readahead sees that it is again being called for
- * a page which it just looked at, it can return immediately without
- * making any state changes.
- * offset: Offset in the prev_index where the last read ended - used for
- * detection of sequential file reading.
- * ahead_start,
- * ahead_size: Together, these form the "ahead window".
- * ra_pages: The externally controlled max readahead for this fd.
- *
- * When readahead is in the off state (size == 0), readahead is disabled.
- * In this state, prev_index is used to detect the resumption of sequential I/O.
- *
- * The readahead code manages two windows - the "current" and the "ahead"
- * windows. The intent is that while the application is walking the pages
- * in the current window, I/O is underway on the ahead window. When the
- * current window is fully traversed, it is replaced by the ahead window
- * and the ahead window is invalidated. When this copying happens, the
- * new current window's pages are probably still locked. So
- * we submit a new batch of I/O immediately, creating a new ahead window.
- *
- * So:
- *
- * ----|----------------|----------------|-----
- * ^start ^start+size
- * ^ahead_start ^ahead_start+ahead_size
- *
- * ^ When this page is read, we submit I/O for the
- * ahead window.
- *
- * A `readahead hit' occurs when a read request is made against a page which is
- * the next sequential page. Ahead window calculations are done only when it
- * is time to submit a new IO. The code ramps up the size agressively at first,
- * but slow down as it approaches max_readhead.
- *
- * Any seek/ramdom IO will result in readahead being turned off. It will resume
- * at the first sequential access.
- *
- * There is a special-case: if the first page which the application tries to
- * read happens to be the first page of the file, it is assumed that a linear
- * read is about to happen and the window is immediately set to the initial size
- * based on I/O request size and the max_readahead.
- *
- * This function is to be called for every read request, rather than when
- * it is time to perform readahead. It is called only once for the entire I/O
- * regardless of size unless readahead is unable to start enough I/O to satisfy
- * the request (I/O request > max_readahead).
- */
-
-/*
* do_page_cache_readahead actually reads a chunk of disk. It allocates all
* the pages first, then submits them all for I/O. This avoids the very bad
* behaviour which would occur if page allocations are causing VM writeback.
@@ -265,7 +137,8 @@ out:
*/
static int
__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
- pgoff_t offset, unsigned long nr_to_read)
+ pgoff_t offset, unsigned long nr_to_read,
+ unsigned long lookahead_size)
{
struct inode *inode = mapping->host;
struct page *page;
@@ -278,7 +151,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
if (isize == 0)
goto out;
- end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);
+ end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);
/*
* Preallocate as many pages as we will need.
@@ -286,7 +159,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
read_lock_irq(&mapping->tree_lock);
for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
pgoff_t page_offset = offset + page_idx;
-
+
if (page_offset > end_index)
break;
@@ -301,6 +174,8 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
break;
page->index = page_offset;
list_add(&page->lru, &page_pool);
+ if (page_idx == nr_to_read - lookahead_size)
+ SetPageReadahead(page);
ret++;
}
read_unlock_irq(&mapping->tree_lock);
@@ -337,7 +212,7 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
if (this_chunk > nr_to_read)
this_chunk = nr_to_read;
err = __do_page_cache_readahead(mapping, filp,
- offset, this_chunk);
+ offset, this_chunk, 0);
if (err < 0) {
ret = err;
break;
@@ -350,28 +225,6 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
}
/*
- * Check how effective readahead is being. If the amount of started IO is
- * less than expected then the file is partly or fully in pagecache and
- * readahead isn't helping.
- *
- */
-static inline int check_ra_success(struct file_ra_state *ra,
- unsigned long nr_to_read, unsigned long actual)
-{
- if (actual == 0) {
- ra->cache_hit += nr_to_read;
- if (ra->cache_hit >= VM_MAX_CACHE_HIT) {
- ra_off(ra);
- ra->flags |= RA_FLAG_INCACHE;
- return 0;
- }
- } else {
- ra->cache_hit=0;
- }
- return 1;
-}
-
-/*
* This version skips the IO if the queue is read-congested, and will tell the
* block layer to abandon the readahead if request allocation would block.
*
@@ -384,200 +237,237 @@ int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
if (bdi_read_congested(mapping->backing_dev_info))
return -1;
- return __do_page_cache_readahead(mapping, filp, offset, nr_to_read);
+ return __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0);
}
/*
- * Read 'nr_to_read' pages starting at page 'offset'. If the flag 'block'
- * is set wait till the read completes. Otherwise attempt to read without
- * blocking.
- * Returns 1 meaning 'success' if read is successful without switching off
- * readahead mode. Otherwise return failure.
+ * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
+ * sensible upper limit.
*/
-static int
-blockable_page_cache_readahead(struct address_space *mapping, struct file *filp,
- pgoff_t offset, unsigned long nr_to_read,
- struct file_ra_state *ra, int block)
+unsigned long max_sane_readahead(unsigned long nr)
+{
+ return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE)
+ + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2);
+}
+
+/*
+ * Submit IO for the read-ahead request in file_ra_state.
+ */
+static unsigned long ra_submit(struct file_ra_state *ra,
+ struct address_space *mapping, struct file *filp)
{
int actual;
- if (!block && bdi_read_congested(mapping->backing_dev_info))
- return 0;
+ actual = __do_page_cache_readahead(mapping, filp,
+ ra->start, ra->size, ra->async_size);
+
+ return actual;
+}
- actual = __do_page_cache_readahead(mapping, filp, offset, nr_to_read);
+/*
+ * Set the initial window size: round up to the next power of 2, then x 4 for
+ * small sizes (<= max/32), x 2 for medium sizes (<= max/4), else use max.
+ * For 128k (32 page) max ra:
+ * 1-2 pages = 16k, 3-4 = 32k, 5-8 = 64k, > 8 pages = 128k initial (4k pages)
+ */
+static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
+{
+ unsigned long newsize = roundup_pow_of_two(size);
- return check_ra_success(ra, nr_to_read, actual);
+ if (newsize <= max / 32)
+ newsize = newsize * 4;
+ else if (newsize <= max / 4)
+ newsize = newsize * 2;
+ else
+ newsize = max;
+
+ return newsize;
}
-static int make_ahead_window(struct address_space *mapping, struct file *filp,
- struct file_ra_state *ra, int force)
+/*
+ * Get the previous window size, ramp it up, and
+ * return it as the new window size.
+ */
+static unsigned long get_next_ra_size(struct file_ra_state *ra,
+ unsigned long max)
{
- int block, ret;
-
- ra->ahead_size = get_next_ra_size(ra);
- ra->ahead_start = ra->start + ra->size;
-
- block = force || (ra->prev_index >= ra->ahead_start);
- ret = blockable_page_cache_readahead(mapping, filp,
- ra->ahead_start, ra->ahead_size, ra, block);
-
- if (!ret && !force) {
- /* A read failure in blocking mode, implies pages are
- * all cached. So we can safely assume we have taken
- * care of all the pages requested in this call.
- * A read failure in non-blocking mode, implies we are
- * reading more pages than requested in this call. So
- * we safely assume we have taken care of all the pages
- * requested in this call.
- *
- * Just reset the ahead window in case we failed due to
- * congestion. The ahead window will any way be closed
- * in case we failed due to excessive page cache hits.
- */
- reset_ahead_window(ra);
- }
+ unsigned long cur = ra->size;
+ unsigned long newsize;
- return ret;
+ if (cur < max / 16)
+ newsize = 4 * cur;
+ else
+ newsize = 2 * cur;
+
+ return min(newsize, max);
}
-/**
- * page_cache_readahead - generic adaptive readahead
- * @mapping: address_space which holds the pagecache and I/O vectors
- * @ra: file_ra_state which holds the readahead state
- * @filp: passed on to ->readpage() and ->readpages()
- * @offset: start offset into @mapping, in PAGE_CACHE_SIZE units
- * @req_size: hint: total size of the read which the caller is performing in
- * PAGE_CACHE_SIZE units
+/*
+ * On-demand readahead design.
+ *
+ * The fields in struct file_ra_state represent the most-recently-executed
+ * readahead attempt:
+ *
+ * |<----- async_size ---------|
+ * |------------------- size -------------------->|
+ * |==================#===========================|
+ * ^start ^page marked with PG_readahead
*
- * page_cache_readahead() is the main function. If performs the adaptive
- * readahead window size management and submits the readahead I/O.
+ * To overlap application thinking time and disk I/O time, we do
+ * `readahead pipelining': Do not wait until the application consumed all
+ * readahead pages and stalled on the missing page at readahead_index;
+ * Instead, submit an asynchronous readahead I/O as soon as there are
+ * only async_size pages left in the readahead window. Normally async_size
+ * will be equal to size, for maximum pipelining.
*
- * Note that @filp is purely used for passing on to the ->readpage[s]()
- * handler: it may refer to a different file from @mapping (so we may not use
- * @filp->f_mapping or @filp->f_path.dentry->d_inode here).
- * Also, @ra may not be equal to &@filp->f_ra.
+ * In interleaved sequential reads, concurrent streams on the same fd can
+ * be invalidating each other's readahead state. So we flag the new readahead
+ * page at (start+size-async_size) with PG_readahead, and use it as readahead
+ * indicator. The flag won't be set on already cached pages, to avoid the
+ * readahead-for-nothing fuss, saving pointless page cache lookups.
+ *
+ * prev_index tracks the last visited page in the _previous_ read request.
+ * It should be maintained by the caller, and will be used for detecting
+ * small random reads. Note that the readahead algorithm checks loosely
+ * for sequential patterns. Hence interleaved reads might be served as
+ * sequential ones.
+ *
+ * There is a special-case: if the first page which the application tries to
+ * read happens to be the first page of the file, it is assumed that a linear
+ * read is about to happen and the window is immediately set to the initial size
+ * based on I/O request size and the max_readahead.
*
+ * The code ramps up the readahead size aggressively at first, but slows down
+ * as it approaches max_readahead.
+ */
+
+/*
+ * A minimal readahead algorithm for trivial sequential/random reads.
*/
-unsigned long
-page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
- struct file *filp, pgoff_t offset, unsigned long req_size)
+static unsigned long
+ondemand_readahead(struct address_space *mapping,
+ struct file_ra_state *ra, struct file *filp,
+ bool hit_readahead_marker, pgoff_t offset,
+ unsigned long req_size)
{
- unsigned long max, newsize;
+ unsigned long max; /* max readahead pages */
int sequential;
- /*
- * We avoid doing extra work and bogusly perturbing the readahead
- * window expansion logic.
- */
- if (offset == ra->prev_index && --req_size)
- ++offset;
-
- /* Note that prev_index == -1 if it is a first read */
- sequential = (offset == ra->prev_index + 1);
- ra->prev_index = offset;
- ra->prev_offset = 0;
-
- max = get_max_readahead(ra);
- newsize = min(req_size, max);
-
- /* No readahead or sub-page sized read or file already in cache */
- if (newsize == 0 || (ra->flags & RA_FLAG_INCACHE))
- goto out;
-
- ra->prev_index += newsize - 1;
+ max = ra->ra_pages;
+ sequential = (offset - ra->prev_index <= 1UL) || (req_size > max);
/*
- * Special case - first read at start of file. We'll assume it's
- * a whole-file read and grow the window fast. Or detect first
- * sequential access
+ * It's the expected callback offset, assume sequential access.
+ * Ramp up sizes, and push forward the readahead window.
*/
- if (sequential && ra->size == 0) {
- ra->size = get_init_ra_size(newsize, max);
- ra->start = offset;
- if (!blockable_page_cache_readahead(mapping, filp, offset,
- ra->size, ra, 1))
- goto out;
-
- /*
- * If the request size is larger than our max readahead, we
- * at least want to be sure that we get 2 IOs in flight and
- * we know that we will definitly need the new I/O.
- * once we do this, subsequent calls should be able to overlap
- * IOs,* thus preventing stalls. so issue the ahead window
- * immediately.
- */
- if (req_size >= max)
- make_ahead_window(mapping, filp, ra, 1);
-
- goto out;
+ if (offset && (offset == (ra->start + ra->size - ra->async_size) ||
+ offset == (ra->start + ra->size))) {
+ ra->start += ra->size;
+ ra->size = get_next_ra_size(ra, max);
+ ra->async_size = ra->size;
+ goto readit;
}
/*
- * Now handle the random case:
- * partial page reads and first access were handled above,
- * so this must be the next page otherwise it is random
+ * Standalone, small read.
+ * Read as is, and do not pollute the readahead state.
*/
- if (!sequential) {
- ra_off(ra);
- blockable_page_cache_readahead(mapping, filp, offset,
- newsize, ra, 1);
- goto out;
+ if (!hit_readahead_marker && !sequential) {
+ return __do_page_cache_readahead(mapping, filp,
+ offset, req_size, 0);
}
/*
- * If we get here we are doing sequential IO and this was not the first
- * occurence (ie we have an existing window)
+ * It may be one of
+ * - first read on start of file
+ * - sequential cache miss
+ * - oversize random read
+ * Start readahead for it.
*/
- if (ra->ahead_start == 0) { /* no ahead window yet */
- if (!make_ahead_window(mapping, filp, ra, 0))
- goto recheck;
- }
+ ra->start = offset;
+ ra->size = get_init_ra_size(req_size, max);
+ ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
/*
- * Already have an ahead window, check if we crossed into it.
- * If so, shift windows and issue a new ahead window.
- * Only return the #pages that are in the current window, so that
- * we get called back on the first page of the ahead window which
- * will allow us to submit more IO.
+ * Hit on a marked page without valid readahead state.
+ * E.g. interleaved reads.
+ * Not knowing its readahead pos/size, bet on the minimal possible one.
*/
- if (ra->prev_index >= ra->ahead_start) {
- ra->start = ra->ahead_start;
- ra->size = ra->ahead_size;
- make_ahead_window(mapping, filp, ra, 0);
-recheck:
- /* prev_index shouldn't overrun the ahead window */
- ra->prev_index = min(ra->prev_index,
- ra->ahead_start + ra->ahead_size - 1);
+ if (hit_readahead_marker) {
+ ra->start++;
+ ra->size = get_next_ra_size(ra, max);
}
-out:
- return ra->prev_index + 1;
+readit:
+ return ra_submit(ra, mapping, filp);
}
-EXPORT_SYMBOL_GPL(page_cache_readahead);
-/*
- * handle_ra_miss() is called when it is known that a page which should have
- * been present in the pagecache (we just did some readahead there) was in fact
- * not found. This will happen if it was evicted by the VM (readahead
- * thrashing)
+/**
+ * page_cache_sync_readahead - generic file readahead
+ * @mapping: address_space which holds the pagecache and I/O vectors
+ * @ra: file_ra_state which holds the readahead state
+ * @filp: passed on to ->readpage() and ->readpages()
+ * @offset: start offset into @mapping, in pagecache page-sized units
+ * @req_size: hint: total size of the read which the caller is performing in
+ * pagecache pages
*
- * Turn on the cache miss flag in the RA struct, this will cause the RA code
- * to reduce the RA size on the next read.
+ * page_cache_sync_readahead() should be called when a cache miss happened:
+ * it will submit the read. The readahead logic may decide to piggyback more
+ * pages onto the read request if access patterns suggest it will improve
+ * performance.
*/
-void handle_ra_miss(struct address_space *mapping,
- struct file_ra_state *ra, pgoff_t offset)
+void page_cache_sync_readahead(struct address_space *mapping,
+ struct file_ra_state *ra, struct file *filp,
+ pgoff_t offset, unsigned long req_size)
{
- ra->flags |= RA_FLAG_MISS;
- ra->flags &= ~RA_FLAG_INCACHE;
- ra->cache_hit = 0;
+ /* no read-ahead */
+ if (!ra->ra_pages)
+ return;
+
+ /* do read-ahead */
+ ondemand_readahead(mapping, ra, filp, false, offset, req_size);
}
+EXPORT_SYMBOL_GPL(page_cache_sync_readahead);
-/*
- * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
- * sensible upper limit.
- */
-unsigned long max_sane_readahead(unsigned long nr)
+/**
+ * page_cache_async_readahead - file readahead for marked pages
+ * @mapping: address_space which holds the pagecache and I/O vectors
+ * @ra: file_ra_state which holds the readahead state
+ * @filp: passed on to ->readpage() and ->readpages()
+ * @page: the page at @offset which has the PG_readahead flag set
+ * @offset: start offset into @mapping, in pagecache page-sized units
+ * @req_size: hint: total size of the read which the caller is performing in
+ * pagecache pages
+ *
+ * page_cache_async_readahead() should be called when a page is used which
+ * has the PG_readahead flag: this is a marker to suggest that the application
+ * has used up enough of the readahead window that we should start pulling in
+ * more pages. */
+void
+page_cache_async_readahead(struct address_space *mapping,
+ struct file_ra_state *ra, struct file *filp,
+ struct page *page, pgoff_t offset,
+ unsigned long req_size)
{
- return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE)
- + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2);
+ /* no read-ahead */
+ if (!ra->ra_pages)
+ return;
+
+ /*
+ * Same bit is used for PG_readahead and PG_reclaim.
+ */
+ if (PageWriteback(page))
+ return;
+
+ ClearPageReadahead(page);
+
+ /*
+ * Defer asynchronous read-ahead on IO congestion.
+ */
+ if (bdi_read_congested(mapping->backing_dev_info))
+ return;
+
+ /* do read-ahead */
+ ondemand_readahead(mapping, ra, filp, true, offset, req_size);
}
+EXPORT_SYMBOL_GPL(page_cache_async_readahead);
diff --git a/mm/rmap.c b/mm/rmap.c
index 61e492597a0..fede5c7910b 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -621,8 +621,10 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
printk (KERN_EMERG " page->count = %x\n", page_count(page));
printk (KERN_EMERG " page->mapping = %p\n", page->mapping);
print_symbol (KERN_EMERG " vma->vm_ops = %s\n", (unsigned long)vma->vm_ops);
- if (vma->vm_ops)
+ if (vma->vm_ops) {
print_symbol (KERN_EMERG " vma->vm_ops->nopage = %s\n", (unsigned long)vma->vm_ops->nopage);
+ print_symbol (KERN_EMERG " vma->vm_ops->fault = %s\n", (unsigned long)vma->vm_ops->fault);
+ }
if (vma->vm_file && vma->vm_file->f_op)
print_symbol (KERN_EMERG " vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap);
BUG();
diff --git a/mm/shmem.c b/mm/shmem.c
index 0493e4d0bca..ad155c7745d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -27,6 +27,7 @@
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/xattr.h>
+#include <linux/exportfs.h>
#include <linux/generic_acl.h>
#include <linux/mm.h>
#include <linux/mman.h>
@@ -82,6 +83,7 @@ enum sgp_type {
SGP_READ, /* don't exceed i_size, don't allocate page */
SGP_CACHE, /* don't exceed i_size, may allocate page */
SGP_WRITE, /* may exceed i_size, may allocate page */
+ SGP_FAULT, /* same as SGP_CACHE, return with page locked */
};
static int shmem_getpage(struct inode *inode, unsigned long idx,
@@ -93,8 +95,11 @@ static inline struct page *shmem_dir_alloc(gfp_t gfp_mask)
* The above definition of ENTRIES_PER_PAGE, and the use of
* BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE:
* might be reconsidered if it ever diverges from PAGE_SIZE.
+ *
+ * __GFP_MOVABLE is masked out as swap vectors cannot move
*/
- return alloc_pages(gfp_mask, PAGE_CACHE_SHIFT-PAGE_SHIFT);
+ return alloc_pages((gfp_mask & ~__GFP_MOVABLE) | __GFP_ZERO,
+ PAGE_CACHE_SHIFT-PAGE_SHIFT);
}
static inline void shmem_dir_free(struct page *page)
@@ -372,7 +377,7 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
}
spin_unlock(&info->lock);
- page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO);
+ page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping));
if (page)
set_page_private(page, 0);
spin_lock(&info->lock);
@@ -1096,6 +1101,10 @@ static int shmem_getpage(struct inode *inode, unsigned long idx,
if (idx >= SHMEM_MAX_INDEX)
return -EFBIG;
+
+ if (type)
+ *type = 0;
+
/*
* Normally, filepage is NULL on entry, and either found
* uptodate immediately, or allocated and zeroed, or read
@@ -1129,9 +1138,9 @@ repeat:
if (!swappage) {
shmem_swp_unmap(entry);
/* here we actually do the io */
- if (type && *type == VM_FAULT_MINOR) {
+ if (type && !(*type & VM_FAULT_MAJOR)) {
__count_vm_event(PGMAJFAULT);
- *type = VM_FAULT_MAJOR;
+ *type |= VM_FAULT_MAJOR;
}
spin_unlock(&info->lock);
swappage = shmem_swapin(info, swap, idx);
@@ -1285,8 +1294,10 @@ repeat:
}
done:
if (*pagep != filepage) {
- unlock_page(filepage);
*pagep = filepage;
+ if (sgp != SGP_FAULT)
+ unlock_page(filepage);
+
}
return 0;
@@ -1298,72 +1309,21 @@ failed:
return error;
}
-static struct page *shmem_nopage(struct vm_area_struct *vma,
- unsigned long address, int *type)
+static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
- struct page *page = NULL;
- unsigned long idx;
int error;
+ int ret;
- idx = (address - vma->vm_start) >> PAGE_SHIFT;
- idx += vma->vm_pgoff;
- idx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
- if (((loff_t) idx << PAGE_CACHE_SHIFT) >= i_size_read(inode))
- return NOPAGE_SIGBUS;
+ if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
+ return VM_FAULT_SIGBUS;
- error = shmem_getpage(inode, idx, &page, SGP_CACHE, type);
+ error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_FAULT, &ret);
if (error)
- return (error == -ENOMEM)? NOPAGE_OOM: NOPAGE_SIGBUS;
-
- mark_page_accessed(page);
- return page;
-}
-
-static int shmem_populate(struct vm_area_struct *vma,
- unsigned long addr, unsigned long len,
- pgprot_t prot, unsigned long pgoff, int nonblock)
-{
- struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
- struct mm_struct *mm = vma->vm_mm;
- enum sgp_type sgp = nonblock? SGP_QUICK: SGP_CACHE;
- unsigned long size;
+ return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
- size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
- if (pgoff >= size || pgoff + (len >> PAGE_SHIFT) > size)
- return -EINVAL;
-
- while ((long) len > 0) {
- struct page *page = NULL;
- int err;
- /*
- * Will need changing if PAGE_CACHE_SIZE != PAGE_SIZE
- */
- err = shmem_getpage(inode, pgoff, &page, sgp, NULL);
- if (err)
- return err;
- /* Page may still be null, but only if nonblock was set. */
- if (page) {
- mark_page_accessed(page);
- err = install_page(mm, vma, addr, page, prot);
- if (err) {
- page_cache_release(page);
- return err;
- }
- } else if (vma->vm_flags & VM_NONLINEAR) {
- /* No page was found just because we can't read it in
- * now (being here implies nonblock != 0), but the page
- * may exist, so set the PTE to fault it in later. */
- err = install_file_pte(mm, vma, addr, pgoff, prot);
- if (err)
- return err;
- }
-
- len -= PAGE_SIZE;
- addr += PAGE_SIZE;
- pgoff++;
- }
- return 0;
+ mark_page_accessed(vmf->page);
+ return ret | VM_FAULT_LOCKED;
}
#ifdef CONFIG_NUMA
@@ -1410,6 +1370,7 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
{
file_accessed(file);
vma->vm_ops = &shmem_vm_ops;
+ vma->vm_flags |= VM_CAN_NONLINEAR;
return 0;
}
@@ -2455,8 +2416,7 @@ static const struct super_operations shmem_ops = {
};
static struct vm_operations_struct shmem_vm_ops = {
- .nopage = shmem_nopage,
- .populate = shmem_populate,
+ .fault = shmem_fault,
#ifdef CONFIG_NUMA
.set_policy = shmem_set_policy,
.get_policy = shmem_get_policy,
diff --git a/mm/slab.c b/mm/slab.c
index a453383333f..c3feeaab387 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -775,6 +775,9 @@ static inline struct kmem_cache *__find_general_cachep(size_t size,
*/
BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
#endif
+ if (!size)
+ return ZERO_SIZE_PTR;
+
while (size > csizep->cs_size)
csizep++;
@@ -1160,7 +1163,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
struct kmem_cache *cachep;
struct kmem_list3 *l3 = NULL;
int node = cpu_to_node(cpu);
- int memsize = sizeof(struct kmem_list3);
+ const int memsize = sizeof(struct kmem_list3);
switch (action) {
case CPU_LOCK_ACQUIRE:
@@ -2351,7 +2354,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
* this should not happen at all.
* But leave a BUG_ON for some lucky dude.
*/
- BUG_ON(!cachep->slabp_cache);
+ BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache));
}
cachep->ctor = ctor;
cachep->name = name;
@@ -2743,7 +2746,7 @@ static int cache_grow(struct kmem_cache *cachep,
* Be lazy and only check for valid flags here, keeping it out of the
* critical path in kmem_cache_alloc().
*/
- BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK));
+ BUG_ON(flags & ~(GFP_DMA | __GFP_ZERO | GFP_LEVEL_MASK));
local_flags = (flags & GFP_LEVEL_MASK);
/* Take the l3 list lock to change the colour_next on this node */
@@ -3389,6 +3392,9 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
local_irq_restore(save_flags);
ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
+ if (unlikely((flags & __GFP_ZERO) && ptr))
+ memset(ptr, 0, obj_size(cachep));
+
return ptr;
}
@@ -3440,6 +3446,9 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
prefetchw(objp);
+ if (unlikely((flags & __GFP_ZERO) && objp))
+ memset(objp, 0, obj_size(cachep));
+
return objp;
}
@@ -3581,23 +3590,6 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
EXPORT_SYMBOL(kmem_cache_alloc);
/**
- * kmem_cache_zalloc - Allocate an object. The memory is set to zero.
- * @cache: The cache to allocate from.
- * @flags: See kmalloc().
- *
- * Allocate an object from this cache and set the allocated memory to zero.
- * The flags are only relevant if the cache has no available objects.
- */
-void *kmem_cache_zalloc(struct kmem_cache *cache, gfp_t flags)
-{
- void *ret = __cache_alloc(cache, flags, __builtin_return_address(0));
- if (ret)
- memset(ret, 0, obj_size(cache));
- return ret;
-}
-EXPORT_SYMBOL(kmem_cache_zalloc);
-
-/**
* kmem_ptr_validate - check if an untrusted pointer might
* be a slab entry.
* @cachep: the cache we're checking against
@@ -3653,8 +3645,8 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
struct kmem_cache *cachep;
cachep = kmem_find_general_cachep(size, flags);
- if (unlikely(cachep == NULL))
- return NULL;
+ if (unlikely(ZERO_OR_NULL_PTR(cachep)))
+ return cachep;
return kmem_cache_alloc_node(cachep, flags, node);
}
@@ -3698,8 +3690,8 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
* functions.
*/
cachep = __find_general_cachep(size, flags);
- if (unlikely(cachep == NULL))
- return NULL;
+ if (unlikely(ZERO_OR_NULL_PTR(cachep)))
+ return cachep;
return __cache_alloc(cachep, flags, caller);
}
@@ -3726,52 +3718,6 @@ EXPORT_SYMBOL(__kmalloc);
#endif
/**
- * krealloc - reallocate memory. The contents will remain unchanged.
- * @p: object to reallocate memory for.
- * @new_size: how many bytes of memory are required.
- * @flags: the type of memory to allocate.
- *
- * The contents of the object pointed to are preserved up to the
- * lesser of the new and old sizes. If @p is %NULL, krealloc()
- * behaves exactly like kmalloc(). If @size is 0 and @p is not a
- * %NULL pointer, the object pointed to is freed.
- */
-void *krealloc(const void *p, size_t new_size, gfp_t flags)
-{
- struct kmem_cache *cache, *new_cache;
- void *ret;
-
- if (unlikely(!p))
- return kmalloc_track_caller(new_size, flags);
-
- if (unlikely(!new_size)) {
- kfree(p);
- return NULL;
- }
-
- cache = virt_to_cache(p);
- new_cache = __find_general_cachep(new_size, flags);
-
- /*
- * If new size fits in the current cache, bail out.
- */
- if (likely(cache == new_cache))
- return (void *)p;
-
- /*
- * We are on the slow-path here so do not use __cache_alloc
- * because it bloats kernel text.
- */
- ret = kmalloc_track_caller(new_size, flags);
- if (ret) {
- memcpy(ret, p, min(new_size, ksize(p)));
- kfree(p);
- }
- return ret;
-}
-EXPORT_SYMBOL(krealloc);
-
-/**
* kmem_cache_free - Deallocate an object
* @cachep: The cache the allocation was from.
* @objp: The previously allocated object.
@@ -3806,7 +3752,7 @@ void kfree(const void *objp)
struct kmem_cache *c;
unsigned long flags;
- if (unlikely(!objp))
+ if (unlikely(ZERO_OR_NULL_PTR(objp)))
return;
local_irq_save(flags);
kfree_debugcheck(objp);
@@ -4398,7 +4344,7 @@ static void show_symbol(struct seq_file *m, unsigned long address)
{
#ifdef CONFIG_KALLSYMS
unsigned long offset, size;
- char modname[MODULE_NAME_LEN + 1], name[KSYM_NAME_LEN + 1];
+ char modname[MODULE_NAME_LEN], name[KSYM_NAME_LEN];
if (lookup_symbol_attrs(address, &size, &offset, modname, name) == 0) {
seq_printf(m, "%s+%#lx/%#lx", name, offset, size);
@@ -4493,7 +4439,7 @@ const struct seq_operations slabstats_op = {
*/
size_t ksize(const void *objp)
{
- if (unlikely(objp == NULL))
+ if (unlikely(ZERO_OR_NULL_PTR(objp)))
return 0;
return obj_size(virt_to_cache(objp));
diff --git a/mm/slob.c b/mm/slob.c
index b4899079d8b..c89ef116d7a 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -334,6 +334,8 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
BUG_ON(!b);
spin_unlock_irqrestore(&slob_lock, flags);
}
+ if (unlikely((gfp & __GFP_ZERO) && b))
+ memset(b, 0, size);
return b;
}
@@ -347,7 +349,7 @@ static void slob_free(void *block, int size)
slobidx_t units;
unsigned long flags;
- if (!block)
+ if (ZERO_OR_NULL_PTR(block))
return;
BUG_ON(!size);
@@ -424,10 +426,13 @@ out:
void *__kmalloc_node(size_t size, gfp_t gfp, int node)
{
+ unsigned int *m;
int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
if (size < PAGE_SIZE - align) {
- unsigned int *m;
+ if (!size)
+ return ZERO_SIZE_PTR;
+
m = slob_alloc(size + align, gfp, align, node);
if (m)
*m = size;
@@ -446,44 +451,11 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
}
EXPORT_SYMBOL(__kmalloc_node);
-/**
- * krealloc - reallocate memory. The contents will remain unchanged.
- *
- * @p: object to reallocate memory for.
- * @new_size: how many bytes of memory are required.
- * @flags: the type of memory to allocate.
- *
- * The contents of the object pointed to are preserved up to the
- * lesser of the new and old sizes. If @p is %NULL, krealloc()
- * behaves exactly like kmalloc(). If @size is 0 and @p is not a
- * %NULL pointer, the object pointed to is freed.
- */
-void *krealloc(const void *p, size_t new_size, gfp_t flags)
-{
- void *ret;
-
- if (unlikely(!p))
- return kmalloc_track_caller(new_size, flags);
-
- if (unlikely(!new_size)) {
- kfree(p);
- return NULL;
- }
-
- ret = kmalloc_track_caller(new_size, flags);
- if (ret) {
- memcpy(ret, p, min(new_size, ksize(p)));
- kfree(p);
- }
- return ret;
-}
-EXPORT_SYMBOL(krealloc);
-
void kfree(const void *block)
{
struct slob_page *sp;
- if (!block)
+ if (ZERO_OR_NULL_PTR(block))
return;
sp = (struct slob_page *)virt_to_page(block);
@@ -501,7 +473,7 @@ size_t ksize(const void *block)
{
struct slob_page *sp;
- if (!block)
+ if (ZERO_OR_NULL_PTR(block))
return 0;
sp = (struct slob_page *)virt_to_page(block);
@@ -571,16 +543,6 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
}
EXPORT_SYMBOL(kmem_cache_alloc_node);
-void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t flags)
-{
- void *ret = kmem_cache_alloc(c, flags);
- if (ret)
- memset(ret, 0, c->size);
-
- return ret;
-}
-EXPORT_SYMBOL(kmem_cache_zalloc);
-
static void __kmem_cache_free(void *b, int size)
{
if (size < PAGE_SIZE)
diff --git a/mm/slub.c b/mm/slub.c
index 6aea48942c2..322f3a5d72c 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -205,6 +205,11 @@ static inline void ClearSlabDebug(struct page *page)
#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
#endif
+/*
+ * The page->inuse field is 16 bit thus we have this limitation
+ */
+#define MAX_OBJECTS_PER_SLAB 65535
+
/* Internal SLUB flags */
#define __OBJECT_POISON 0x80000000 /* Poison object */
@@ -228,7 +233,7 @@ static enum {
/* A list of all slab caches on the system */
static DECLARE_RWSEM(slub_lock);
-LIST_HEAD(slab_caches);
+static LIST_HEAD(slab_caches);
/*
* Tracking user of a slab.
@@ -247,9 +252,10 @@ static int sysfs_slab_add(struct kmem_cache *);
static int sysfs_slab_alias(struct kmem_cache *, const char *);
static void sysfs_slab_remove(struct kmem_cache *);
#else
-static int sysfs_slab_add(struct kmem_cache *s) { return 0; }
-static int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; }
-static void sysfs_slab_remove(struct kmem_cache *s) {}
+static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
+static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
+ { return 0; }
+static inline void sysfs_slab_remove(struct kmem_cache *s) {}
#endif
/********************************************************************
@@ -344,7 +350,7 @@ static void print_section(char *text, u8 *addr, unsigned int length)
for (i = 0; i < length; i++) {
if (newline) {
- printk(KERN_ERR "%10s 0x%p: ", text, addr + i);
+ printk(KERN_ERR "%8s 0x%p: ", text, addr + i);
newline = 0;
}
printk(" %02x", addr[i]);
@@ -401,10 +407,11 @@ static void set_track(struct kmem_cache *s, void *object,
static void init_tracking(struct kmem_cache *s, void *object)
{
- if (s->flags & SLAB_STORE_USER) {
- set_track(s, object, TRACK_FREE, NULL);
- set_track(s, object, TRACK_ALLOC, NULL);
- }
+ if (!(s->flags & SLAB_STORE_USER))
+ return;
+
+ set_track(s, object, TRACK_FREE, NULL);
+ set_track(s, object, TRACK_ALLOC, NULL);
}
static void print_track(const char *s, struct track *t)
@@ -412,65 +419,106 @@ static void print_track(const char *s, struct track *t)
if (!t->addr)
return;
- printk(KERN_ERR "%s: ", s);
+ printk(KERN_ERR "INFO: %s in ", s);
__print_symbol("%s", (unsigned long)t->addr);
- printk(" jiffies_ago=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid);
+ printk(" age=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid);
+}
+
+static void print_tracking(struct kmem_cache *s, void *object)
+{
+ if (!(s->flags & SLAB_STORE_USER))
+ return;
+
+ print_track("Allocated", get_track(s, object, TRACK_ALLOC));
+ print_track("Freed", get_track(s, object, TRACK_FREE));
+}
+
+static void print_page_info(struct page *page)
+{
+ printk(KERN_ERR "INFO: Slab 0x%p used=%u fp=0x%p flags=0x%04lx\n",
+ page, page->inuse, page->freelist, page->flags);
+
+}
+
+static void slab_bug(struct kmem_cache *s, char *fmt, ...)
+{
+ va_list args;
+ char buf[100];
+
+ va_start(args, fmt);
+ vsnprintf(buf, sizeof(buf), fmt, args);
+ va_end(args);
+ printk(KERN_ERR "========================================"
+ "=====================================\n");
+ printk(KERN_ERR "BUG %s: %s\n", s->name, buf);
+ printk(KERN_ERR "----------------------------------------"
+ "-------------------------------------\n\n");
}
-static void print_trailer(struct kmem_cache *s, u8 *p)
+static void slab_fix(struct kmem_cache *s, char *fmt, ...)
+{
+ va_list args;
+ char buf[100];
+
+ va_start(args, fmt);
+ vsnprintf(buf, sizeof(buf), fmt, args);
+ va_end(args);
+ printk(KERN_ERR "FIX %s: %s\n", s->name, buf);
+}
+
+static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
{
unsigned int off; /* Offset of last byte */
+ u8 *addr = page_address(page);
+
+ print_tracking(s, p);
+
+ print_page_info(page);
+
+ printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
+ p, p - addr, get_freepointer(s, p));
+
+ if (p > addr + 16)
+ print_section("Bytes b4", p - 16, 16);
+
+ print_section("Object", p, min(s->objsize, 128));
if (s->flags & SLAB_RED_ZONE)
print_section("Redzone", p + s->objsize,
s->inuse - s->objsize);
- printk(KERN_ERR "FreePointer 0x%p -> 0x%p\n",
- p + s->offset,
- get_freepointer(s, p));
-
if (s->offset)
off = s->offset + sizeof(void *);
else
off = s->inuse;
- if (s->flags & SLAB_STORE_USER) {
- print_track("Last alloc", get_track(s, p, TRACK_ALLOC));
- print_track("Last free ", get_track(s, p, TRACK_FREE));
+ if (s->flags & SLAB_STORE_USER)
off += 2 * sizeof(struct track);
- }
if (off != s->size)
/* Beginning of the filler is the free pointer */
- print_section("Filler", p + off, s->size - off);
+ print_section("Padding", p + off, s->size - off);
+
+ dump_stack();
}
static void object_err(struct kmem_cache *s, struct page *page,
u8 *object, char *reason)
{
- u8 *addr = page_address(page);
-
- printk(KERN_ERR "*** SLUB %s: %s@0x%p slab 0x%p\n",
- s->name, reason, object, page);
- printk(KERN_ERR " offset=%tu flags=0x%04lx inuse=%u freelist=0x%p\n",
- object - addr, page->flags, page->inuse, page->freelist);
- if (object > addr + 16)
- print_section("Bytes b4", object - 16, 16);
- print_section("Object", object, min(s->objsize, 128));
- print_trailer(s, object);
- dump_stack();
+ slab_bug(s, "%s", reason); /* reason is plain text, not a format */
+ print_trailer(s, page, object);
}
-static void slab_err(struct kmem_cache *s, struct page *page, char *reason, ...)
+static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...)
{
va_list args;
char buf[100];
- va_start(args, reason);
- vsnprintf(buf, sizeof(buf), reason, args);
+ va_start(args, fmt);
+ vsnprintf(buf, sizeof(buf), fmt, args);
va_end(args);
- printk(KERN_ERR "*** SLUB %s: %s in slab @0x%p\n", s->name, buf,
- page);
+ slab_bug(s, "%s", buf); /* buf is the expanded text, not a format */
+ print_page_info(page);
dump_stack();
}
@@ -489,15 +537,46 @@ static void init_object(struct kmem_cache *s, void *object, int active)
s->inuse - s->objsize);
}
-static int check_bytes(u8 *start, unsigned int value, unsigned int bytes)
+static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes)
{
while (bytes) {
if (*start != (u8)value)
- return 0;
+ return start;
start++;
bytes--;
}
- return 1;
+ return NULL;
+}
+
+static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
+ void *from, void *to)
+{
+ slab_fix(s, "Restoring %s 0x%p-0x%p=0x%x", message, from, to - 1, data);
+ memset(from, data, to - from);
+}
+
+static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
+ u8 *object, char *what,
+ u8* start, unsigned int value, unsigned int bytes)
+{
+ u8 *fault;
+ u8 *end;
+
+ fault = check_bytes(start, value, bytes);
+ if (!fault)
+ return 1;
+
+ end = start + bytes;
+ while (end > fault && end[-1] == value)
+ end--;
+
+ slab_bug(s, "%s overwritten", what);
+ printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n",
+ fault, end - 1, fault[0], value);
+ print_trailer(s, page, object);
+
+ restore_bytes(s, what, value, fault, end);
+ return 0;
}
/*
@@ -538,14 +617,6 @@ static int check_bytes(u8 *start, unsigned int value, unsigned int bytes)
* may be used with merged slabcaches.
*/
-static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
- void *from, void *to)
-{
- printk(KERN_ERR "@@@ SLUB %s: Restoring %s (0x%x) from 0x%p-0x%p\n",
- s->name, message, data, from, to - 1);
- memset(from, data, to - from);
-}
-
static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
{
unsigned long off = s->inuse; /* The end of info */
@@ -561,39 +632,39 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
if (s->size == off)
return 1;
- if (check_bytes(p + off, POISON_INUSE, s->size - off))
- return 1;
-
- object_err(s, page, p, "Object padding check fails");
-
- /*
- * Restore padding
- */
- restore_bytes(s, "object padding", POISON_INUSE, p + off, p + s->size);
- return 0;
+ return check_bytes_and_report(s, page, p, "Object padding",
+ p + off, POISON_INUSE, s->size - off);
}
static int slab_pad_check(struct kmem_cache *s, struct page *page)
{
- u8 *p;
- int length, remainder;
+ u8 *start;
+ u8 *fault;
+ u8 *end;
+ int length;
+ int remainder;
if (!(s->flags & SLAB_POISON))
return 1;
- p = page_address(page);
+ start = page_address(page);
+ end = start + (PAGE_SIZE << s->order);
length = s->objects * s->size;
- remainder = (PAGE_SIZE << s->order) - length;
+ remainder = end - (start + length);
if (!remainder)
return 1;
- if (!check_bytes(p + length, POISON_INUSE, remainder)) {
- slab_err(s, page, "Padding check failed");
- restore_bytes(s, "slab padding", POISON_INUSE, p + length,
- p + length + remainder);
- return 0;
- }
- return 1;
+ fault = check_bytes(start + length, POISON_INUSE, remainder);
+ if (!fault)
+ return 1;
+ while (end > fault && end[-1] == POISON_INUSE)
+ end--;
+ /* Intact poison was trimmed off the tail: [fault, end) is damaged */
+ slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
+ print_section("Padding", start + length, remainder);
+ /* Repair only the damaged span so the objects stay untouched */
+ restore_bytes(s, "slab padding", POISON_INUSE, fault, end);
+ return 0;
}
static int check_object(struct kmem_cache *s, struct page *page,
@@ -606,41 +677,22 @@ static int check_object(struct kmem_cache *s, struct page *page,
unsigned int red =
active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE;
- if (!check_bytes(endobject, red, s->inuse - s->objsize)) {
- object_err(s, page, object,
- active ? "Redzone Active" : "Redzone Inactive");
- restore_bytes(s, "redzone", red,
- endobject, object + s->inuse);
+ if (!check_bytes_and_report(s, page, object, "Redzone",
+ endobject, red, s->inuse - s->objsize))
return 0;
- }
} else {
- if ((s->flags & SLAB_POISON) && s->objsize < s->inuse &&
- !check_bytes(endobject, POISON_INUSE,
- s->inuse - s->objsize)) {
- object_err(s, page, p, "Alignment padding check fails");
- /*
- * Fix it so that there will not be another report.
- *
- * Hmmm... We may be corrupting an object that now expects
- * to be longer than allowed.
- */
- restore_bytes(s, "alignment padding", POISON_INUSE,
- endobject, object + s->inuse);
- }
+ if ((s->flags & SLAB_POISON) && s->objsize < s->inuse)
+ check_bytes_and_report(s, page, p, "Alignment padding", endobject,
+ POISON_INUSE, s->inuse - s->objsize);
}
if (s->flags & SLAB_POISON) {
if (!active && (s->flags & __OBJECT_POISON) &&
- (!check_bytes(p, POISON_FREE, s->objsize - 1) ||
- p[s->objsize - 1] != POISON_END)) {
-
- object_err(s, page, p, "Poison check failed");
- restore_bytes(s, "Poison", POISON_FREE,
- p, p + s->objsize -1);
- restore_bytes(s, "Poison", POISON_END,
- p + s->objsize - 1, p + s->objsize);
+ (!check_bytes_and_report(s, page, p, "Poison", p,
+ POISON_FREE, s->objsize - 1) ||
+ !check_bytes_and_report(s, page, p, "Poison",
+ p + s->objsize -1, POISON_END, 1)))
return 0;
- }
/*
* check_pad_bytes cleans up on its own.
*/
@@ -673,25 +725,17 @@ static int check_slab(struct kmem_cache *s, struct page *page)
VM_BUG_ON(!irqs_disabled());
if (!PageSlab(page)) {
- slab_err(s, page, "Not a valid slab page flags=%lx "
- "mapping=0x%p count=%d", page->flags, page->mapping,
- page_count(page));
+ slab_err(s, page, "Not a valid slab page");
return 0;
}
if (page->offset * sizeof(void *) != s->offset) {
- slab_err(s, page, "Corrupted offset %lu flags=0x%lx "
- "mapping=0x%p count=%d",
- (unsigned long)(page->offset * sizeof(void *)),
- page->flags,
- page->mapping,
- page_count(page));
+ slab_err(s, page, "Corrupted offset %lu",
+ (unsigned long)(page->offset * sizeof(void *)));
return 0;
}
if (page->inuse > s->objects) {
- slab_err(s, page, "inuse %u > max %u @0x%p flags=%lx "
- "mapping=0x%p count=%d",
- s->name, page->inuse, s->objects, page->flags,
- page->mapping, page_count(page));
+ slab_err(s, page, "inuse %u > max %u",
+ page->inuse, s->objects);
return 0;
}
/* Slab_pad_check fixes things up after itself */
@@ -719,13 +763,10 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
set_freepointer(s, object, NULL);
break;
} else {
- slab_err(s, page, "Freepointer 0x%p corrupt",
- fp);
+ slab_err(s, page, "Freepointer corrupt");
page->freelist = NULL;
page->inuse = s->objects;
- printk(KERN_ERR "@@@ SLUB %s: Freelist "
- "cleared. Slab 0x%p\n",
- s->name, page);
+ slab_fix(s, "Freelist cleared");
return 0;
}
break;
@@ -737,11 +778,9 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
if (page->inuse != s->objects - nr) {
slab_err(s, page, "Wrong object count. Counter is %d but "
- "counted were %d", s, page, page->inuse,
- s->objects - nr);
+ "counted were %d", page->inuse, s->objects - nr);
page->inuse = s->objects - nr;
- printk(KERN_ERR "@@@ SLUB %s: Object count adjusted. "
- "Slab @0x%p\n", s->name, page);
+ slab_fix(s, "Object count adjusted.");
}
return search == NULL;
}
@@ -803,7 +842,7 @@ static int alloc_debug_processing(struct kmem_cache *s, struct page *page,
goto bad;
if (object && !on_freelist(s, page, object)) {
- slab_err(s, page, "Object 0x%p already allocated", object);
+ object_err(s, page, object, "Object already allocated");
goto bad;
}
@@ -829,8 +868,7 @@ bad:
* to avoid issues in the future. Marking all objects
* as used avoids touching the remaining objects.
*/
- printk(KERN_ERR "@@@ SLUB: %s slab 0x%p. Marking all objects used.\n",
- s->name, page);
+ slab_fix(s, "Marking all objects used");
page->inuse = s->objects;
page->freelist = NULL;
/* Fix up fields that may be corrupted */
@@ -851,7 +889,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page,
}
if (on_freelist(s, page, object)) {
- slab_err(s, page, "Object 0x%p already free", object);
+ object_err(s, page, object, "Object already free");
goto fail;
}
@@ -870,8 +908,8 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page,
dump_stack();
}
else
- slab_err(s, page, "object at 0x%p belongs "
- "to slab %s", object, page->slab->name);
+ object_err(s, page, object,
+ "page slab pointer corrupt.");
goto fail;
}
@@ -885,8 +923,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page,
return 1;
fail:
- printk(KERN_ERR "@@@ SLUB: %s slab 0x%p object at 0x%p not freed.\n",
- s->name, page, object);
+ slab_fix(s, "Object at 0x%p not freed", object);
return 0;
}
@@ -1041,7 +1078,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
void *last;
void *p;
- BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK));
+ BUG_ON(flags & ~(GFP_DMA | __GFP_ZERO | GFP_LEVEL_MASK));
if (flags & __GFP_WAIT)
local_irq_enable();
@@ -1359,7 +1396,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu)
unfreeze_slab(s, page);
}
-static void flush_slab(struct kmem_cache *s, struct page *page, int cpu)
+static inline void flush_slab(struct kmem_cache *s, struct page *page, int cpu)
{
slab_lock(page);
deactivate_slab(s, page, cpu);
@@ -1369,7 +1406,7 @@ static void flush_slab(struct kmem_cache *s, struct page *page, int cpu)
* Flush cpu slab.
* Called from IPI handler with interrupts disabled.
*/
-static void __flush_cpu_slab(struct kmem_cache *s, int cpu)
+static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
{
struct page *page = s->cpu_slab[cpu];
@@ -1504,7 +1541,7 @@ debug:
* Otherwise we can simply pick the next object from the lockless free list.
*/
static void __always_inline *slab_alloc(struct kmem_cache *s,
- gfp_t gfpflags, int node, void *addr)
+ gfp_t gfpflags, int node, void *addr)
{
struct page *page;
void **object;
@@ -1522,6 +1559,10 @@ static void __always_inline *slab_alloc(struct kmem_cache *s,
page->lockless_freelist = object[page->offset];
}
local_irq_restore(flags);
+
+ if (unlikely((gfpflags & __GFP_ZERO) && object))
+ memset(object, 0, s->objsize);
+
return object;
}
@@ -1705,8 +1746,17 @@ static inline int slab_order(int size, int min_objects,
{
int order;
int rem;
+ int min_order = slub_min_order;
- for (order = max(slub_min_order,
+ /*
+ * If we would create too many objects per slab then reduce
+ * the slab order even if it goes below slub_min_order.
+ */
+ while (min_order > 0 &&
+ (PAGE_SIZE << min_order) >= MAX_OBJECTS_PER_SLAB * size)
+ min_order--;
+
+ for (order = max(min_order,
fls(min_objects * size - 1) - PAGE_SHIFT);
order <= max_order; order++) {
@@ -1720,6 +1770,9 @@ static inline int slab_order(int size, int min_objects,
if (rem <= slab_size / fract_leftover)
break;
+ /* If the next size is too high then exit now */
+ if (slab_size * 2 >= MAX_OBJECTS_PER_SLAB * size)
+ break;
}
return order;
@@ -1800,7 +1853,9 @@ static void init_kmem_cache_node(struct kmem_cache_node *n)
atomic_long_set(&n->nr_slabs, 0);
spin_lock_init(&n->list_lock);
INIT_LIST_HEAD(&n->partial);
+#ifdef CONFIG_SLUB_DEBUG
INIT_LIST_HEAD(&n->full);
+#endif
}
#ifdef CONFIG_NUMA
@@ -1828,7 +1883,10 @@ static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflag
page->freelist = get_freepointer(kmalloc_caches, n);
page->inuse++;
kmalloc_caches->node[node] = n;
- setup_object_debug(kmalloc_caches, page, n);
+#ifdef CONFIG_SLUB_DEBUG
+ init_object(kmalloc_caches, n, 1);
+ init_tracking(kmalloc_caches, n);
+#endif
init_kmem_cache_node(n);
atomic_long_inc(&n->nr_slabs);
add_partial(n, page);
@@ -2006,7 +2064,7 @@ static int calculate_sizes(struct kmem_cache *s)
* The page->inuse field is only 16 bit wide! So we cannot have
* more than 64k objects per slab.
*/
- if (!s->objects || s->objects > 65535)
+ if (!s->objects || s->objects > MAX_OBJECTS_PER_SLAB)
return 0;
return 1;
@@ -2110,7 +2168,7 @@ static int free_list(struct kmem_cache *s, struct kmem_cache_node *n,
/*
* Release all resources used by a slab cache.
*/
-static int kmem_cache_close(struct kmem_cache *s)
+static inline int kmem_cache_close(struct kmem_cache *s)
{
int node;
@@ -2138,12 +2196,13 @@ void kmem_cache_destroy(struct kmem_cache *s)
s->refcount--;
if (!s->refcount) {
list_del(&s->list);
+ up_write(&slub_lock);
if (kmem_cache_close(s))
WARN_ON(1);
sysfs_slab_remove(s);
kfree(s);
- }
- up_write(&slub_lock);
+ } else
+ up_write(&slub_lock);
}
EXPORT_SYMBOL(kmem_cache_destroy);
@@ -2216,47 +2275,92 @@ panic:
panic("Creation of kmalloc slab %s size=%d failed.\n", name, size);
}
-static struct kmem_cache *get_slab(size_t size, gfp_t flags)
+#ifdef CONFIG_ZONE_DMA
+static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
{
- int index = kmalloc_index(size);
+ struct kmem_cache *s;
+ struct kmem_cache *x;
+ char *text;
+ size_t realsize;
- if (!index)
- return NULL;
+ s = kmalloc_caches_dma[index];
+ if (s)
+ return s;
- /* Allocation too large? */
- BUG_ON(index < 0);
+ /* Dynamically create dma cache */
+ x = kmalloc(kmem_size, flags & ~SLUB_DMA);
+ if (!x)
+ panic("Unable to allocate memory for dma cache\n");
-#ifdef CONFIG_ZONE_DMA
- if ((flags & SLUB_DMA)) {
- struct kmem_cache *s;
- struct kmem_cache *x;
- char *text;
- size_t realsize;
-
- s = kmalloc_caches_dma[index];
- if (s)
- return s;
+ realsize = kmalloc_caches[index].objsize;
+ text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d",
+ (unsigned int)realsize);
+ s = create_kmalloc_cache(x, text, realsize, flags);
+ down_write(&slub_lock);
+ if (!kmalloc_caches_dma[index]) {
+ kmalloc_caches_dma[index] = s;
+ up_write(&slub_lock);
+ return s;
+ }
+ up_write(&slub_lock);
+ kmem_cache_destroy(s);
+ return kmalloc_caches_dma[index];
+}
+#endif
+
+/*
+ * Conversion table for small slabs sizes / 8 to the index in the
+ * kmalloc array. This is necessary for slabs < 192 since we have non power
+ * of two cache sizes there. The size of larger slabs can be determined using
+ * fls.
+ */
+static s8 size_index[24] = {
+ 3, /* 8 */
+ 4, /* 16 */
+ 5, /* 24 */
+ 5, /* 32 */
+ 6, /* 40 */
+ 6, /* 48 */
+ 6, /* 56 */
+ 6, /* 64 */
+ 1, /* 72 */
+ 1, /* 80 */
+ 1, /* 88 */
+ 1, /* 96 */
+ 7, /* 104 */
+ 7, /* 112 */
+ 7, /* 120 */
+ 7, /* 128 */
+ 2, /* 136 */
+ 2, /* 144 */
+ 2, /* 152 */
+ 2, /* 160 */
+ 2, /* 168 */
+ 2, /* 176 */
+ 2, /* 184 */
+ 2 /* 192 */
+};
- /* Dynamically create dma cache */
- x = kmalloc(kmem_size, flags & ~SLUB_DMA);
- if (!x)
- panic("Unable to allocate memory for dma cache\n");
+static struct kmem_cache *get_slab(size_t size, gfp_t flags)
+{
+ int index;
- if (index <= KMALLOC_SHIFT_HIGH)
- realsize = 1 << index;
- else {
- if (index == 1)
- realsize = 96;
- else
- realsize = 192;
- }
+ if (size <= 192) {
+ if (!size)
+ return ZERO_SIZE_PTR;
- text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d",
- (unsigned int)realsize);
- s = create_kmalloc_cache(x, text, realsize, flags);
- kmalloc_caches_dma[index] = s;
- return s;
+ index = size_index[(size - 1) / 8];
+ } else {
+ if (size > KMALLOC_MAX_SIZE)
+ return NULL;
+
+ index = fls(size - 1);
}
+
+#ifdef CONFIG_ZONE_DMA
+ if (unlikely((flags & SLUB_DMA)))
+ return dma_kmalloc_cache(index, flags);
+
#endif
return &kmalloc_caches[index];
}
@@ -2265,9 +2369,10 @@ void *__kmalloc(size_t size, gfp_t flags)
{
struct kmem_cache *s = get_slab(size, flags);
- if (s)
- return slab_alloc(s, flags, -1, __builtin_return_address(0));
- return ZERO_SIZE_PTR;
+ if (ZERO_OR_NULL_PTR(s))
+ return s;
+
+ return slab_alloc(s, flags, -1, __builtin_return_address(0));
}
EXPORT_SYMBOL(__kmalloc);
@@ -2276,9 +2381,10 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
{
struct kmem_cache *s = get_slab(size, flags);
- if (s)
- return slab_alloc(s, flags, node, __builtin_return_address(0));
- return ZERO_SIZE_PTR;
+ if (ZERO_OR_NULL_PTR(s))
+ return s;
+
+ return slab_alloc(s, flags, node, __builtin_return_address(0));
}
EXPORT_SYMBOL(__kmalloc_node);
#endif
@@ -2288,7 +2394,7 @@ size_t ksize(const void *object)
struct page *page;
struct kmem_cache *s;
- if (object == ZERO_SIZE_PTR)
+ if (ZERO_OR_NULL_PTR(object))
return 0;
page = get_object_page(object);
@@ -2329,7 +2435,7 @@ void kfree(const void *x)
* this comparison would be true for all "negative" pointers
* (which would cover the whole upper half of the address space).
*/
- if ((unsigned long)x <= (unsigned long)ZERO_SIZE_PTR)
+ if (ZERO_OR_NULL_PTR(x))
return;
page = virt_to_head_page(x);
@@ -2418,43 +2524,6 @@ int kmem_cache_shrink(struct kmem_cache *s)
}
EXPORT_SYMBOL(kmem_cache_shrink);
-/**
- * krealloc - reallocate memory. The contents will remain unchanged.
- * @p: object to reallocate memory for.
- * @new_size: how many bytes of memory are required.
- * @flags: the type of memory to allocate.
- *
- * The contents of the object pointed to are preserved up to the
- * lesser of the new and old sizes. If @p is %NULL, krealloc()
- * behaves exactly like kmalloc(). If @size is 0 and @p is not a
- * %NULL pointer, the object pointed to is freed.
- */
-void *krealloc(const void *p, size_t new_size, gfp_t flags)
-{
- void *ret;
- size_t ks;
-
- if (unlikely(!p || p == ZERO_SIZE_PTR))
- return kmalloc(new_size, flags);
-
- if (unlikely(!new_size)) {
- kfree(p);
- return ZERO_SIZE_PTR;
- }
-
- ks = ksize(p);
- if (ks >= new_size)
- return (void *)p;
-
- ret = kmalloc(new_size, flags);
- if (ret) {
- memcpy(ret, p, min(new_size, ks));
- kfree(p);
- }
- return ret;
-}
-EXPORT_SYMBOL(krealloc);
-
/********************************************************************
* Basic setup of slabs
*******************************************************************/
@@ -2497,6 +2566,24 @@ void __init kmem_cache_init(void)
caches++;
}
+
+ /*
+ * Patch up the size_index table if we have strange large alignment
+ * requirements for the kmalloc array. This is only the case for
+ * mips it seems. The standard arches will not generate any code here.
+ *
+ * Largest permitted alignment is 256 bytes due to the way we
+ * handle the index determination for the smaller caches.
+ *
+ * Make sure that nothing crazy happens if someone starts tinkering
+ * around with ARCH_KMALLOC_MINALIGN
+ */
+ BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
+ (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
+
+ for (i = 8; i < KMALLOC_MIN_SIZE; i += 8)
+ size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW;
+
slab_state = UP;
/* Provide the correct kmalloc names now that the caches are up */
@@ -2542,7 +2629,7 @@ static struct kmem_cache *find_mergeable(size_t size,
size_t align, unsigned long flags,
void (*ctor)(void *, struct kmem_cache *, unsigned long))
{
- struct list_head *h;
+ struct kmem_cache *s;
if (slub_nomerge || (flags & SLUB_NEVER_MERGE))
return NULL;
@@ -2554,10 +2641,7 @@ static struct kmem_cache *find_mergeable(size_t size,
align = calculate_alignment(flags, align, size);
size = ALIGN(size, align);
- list_for_each(h, &slab_caches) {
- struct kmem_cache *s =
- container_of(h, struct kmem_cache, list);
-
+ list_for_each_entry(s, &slab_caches, list) {
if (slab_unmergeable(s))
continue;
@@ -2600,25 +2684,26 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
*/
s->objsize = max(s->objsize, (int)size);
s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
+ up_write(&slub_lock);
if (sysfs_slab_alias(s, name))
goto err;
- } else {
- s = kmalloc(kmem_size, GFP_KERNEL);
- if (s && kmem_cache_open(s, GFP_KERNEL, name,
+ return s;
+ }
+ s = kmalloc(kmem_size, GFP_KERNEL);
+ if (s) {
+ if (kmem_cache_open(s, GFP_KERNEL, name,
size, align, flags, ctor)) {
- if (sysfs_slab_add(s)) {
- kfree(s);
- goto err;
- }
list_add(&s->list, &slab_caches);
- } else
- kfree(s);
+ up_write(&slub_lock);
+ if (sysfs_slab_add(s))
+ goto err;
+ return s;
+ }
+ kfree(s);
}
up_write(&slub_lock);
- return s;
err:
- up_write(&slub_lock);
if (flags & SLAB_PANIC)
panic("Cannot create slabcache %s\n", name);
else
@@ -2627,45 +2712,7 @@ err:
}
EXPORT_SYMBOL(kmem_cache_create);
-void *kmem_cache_zalloc(struct kmem_cache *s, gfp_t flags)
-{
- void *x;
-
- x = slab_alloc(s, flags, -1, __builtin_return_address(0));
- if (x)
- memset(x, 0, s->objsize);
- return x;
-}
-EXPORT_SYMBOL(kmem_cache_zalloc);
-
#ifdef CONFIG_SMP
-static void for_all_slabs(void (*func)(struct kmem_cache *, int), int cpu)
-{
- struct list_head *h;
-
- down_read(&slub_lock);
- list_for_each(h, &slab_caches) {
- struct kmem_cache *s =
- container_of(h, struct kmem_cache, list);
-
- func(s, cpu);
- }
- up_read(&slub_lock);
-}
-
-/*
- * Version of __flush_cpu_slab for the case that interrupts
- * are enabled.
- */
-static void cpu_slab_flush(struct kmem_cache *s, int cpu)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- __flush_cpu_slab(s, cpu);
- local_irq_restore(flags);
-}
-
/*
* Use the cpu notifier to insure that the cpu slabs are flushed when
* necessary.
@@ -2674,13 +2721,21 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
long cpu = (long)hcpu;
+ struct kmem_cache *s;
+ unsigned long flags;
switch (action) {
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
case CPU_DEAD:
case CPU_DEAD_FROZEN:
- for_all_slabs(cpu_slab_flush, cpu);
+ down_read(&slub_lock);
+ list_for_each_entry(s, &slab_caches, list) {
+ local_irq_save(flags);
+ __flush_cpu_slab(s, cpu);
+ local_irq_restore(flags);
+ }
+ up_read(&slub_lock);
break;
default:
break;
@@ -2697,8 +2752,8 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller)
{
struct kmem_cache *s = get_slab(size, gfpflags);
- if (!s)
- return ZERO_SIZE_PTR;
+ if (ZERO_OR_NULL_PTR(s))
+ return s;
return slab_alloc(s, gfpflags, -1, caller);
}
@@ -2708,18 +2763,18 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
{
struct kmem_cache *s = get_slab(size, gfpflags);
- if (!s)
- return ZERO_SIZE_PTR;
+ if (ZERO_OR_NULL_PTR(s))
+ return s;
return slab_alloc(s, gfpflags, node, caller);
}
#if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG)
-static int validate_slab(struct kmem_cache *s, struct page *page)
+static int validate_slab(struct kmem_cache *s, struct page *page,
+ unsigned long *map)
{
void *p;
void *addr = page_address(page);
- DECLARE_BITMAP(map, s->objects);
if (!check_slab(s, page) ||
!on_freelist(s, page, NULL))
@@ -2741,10 +2796,11 @@ static int validate_slab(struct kmem_cache *s, struct page *page)
return 1;
}
-static void validate_slab_slab(struct kmem_cache *s, struct page *page)
+static void validate_slab_slab(struct kmem_cache *s, struct page *page,
+ unsigned long *map)
{
if (slab_trylock(page)) {
- validate_slab(s, page);
+ validate_slab(s, page, map);
slab_unlock(page);
} else
printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n",
@@ -2761,7 +2817,8 @@ static void validate_slab_slab(struct kmem_cache *s, struct page *page)
}
}
-static int validate_slab_node(struct kmem_cache *s, struct kmem_cache_node *n)
+static int validate_slab_node(struct kmem_cache *s,
+ struct kmem_cache_node *n, unsigned long *map)
{
unsigned long count = 0;
struct page *page;
@@ -2770,7 +2827,7 @@ static int validate_slab_node(struct kmem_cache *s, struct kmem_cache_node *n)
spin_lock_irqsave(&n->list_lock, flags);
list_for_each_entry(page, &n->partial, lru) {
- validate_slab_slab(s, page);
+ validate_slab_slab(s, page, map);
count++;
}
if (count != n->nr_partial)
@@ -2781,7 +2838,7 @@ static int validate_slab_node(struct kmem_cache *s, struct kmem_cache_node *n)
goto out;
list_for_each_entry(page, &n->full, lru) {
- validate_slab_slab(s, page);
+ validate_slab_slab(s, page, map);
count++;
}
if (count != atomic_long_read(&n->nr_slabs))
@@ -2794,17 +2851,23 @@ out:
return count;
}
-static unsigned long validate_slab_cache(struct kmem_cache *s)
+static long validate_slab_cache(struct kmem_cache *s)
{
int node;
unsigned long count = 0;
+ unsigned long *map = kmalloc(BITS_TO_LONGS(s->objects) *
+ sizeof(unsigned long), GFP_KERNEL);
+
+ if (!map)
+ return -ENOMEM;
flush_all(s);
for_each_online_node(node) {
struct kmem_cache_node *n = get_node(s, node);
- count += validate_slab_node(s, n);
+ count += validate_slab_node(s, n, map);
}
+ kfree(map);
return count;
}
@@ -2893,18 +2956,14 @@ static void free_loc_track(struct loc_track *t)
get_order(sizeof(struct location) * t->max));
}
-static int alloc_loc_track(struct loc_track *t, unsigned long max)
+static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
{
struct location *l;
int order;
- if (!max)
- max = PAGE_SIZE / sizeof(struct location);
-
order = get_order(sizeof(struct location) * max);
- l = (void *)__get_free_pages(GFP_ATOMIC, order);
-
+ l = (void *)__get_free_pages(flags, order);
if (!l)
return 0;
@@ -2970,7 +3029,7 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
/*
* Not found. Insert new tracking element.
*/
- if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max))
+ if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC))
return 0;
l = t->loc + pos;
@@ -3013,11 +3072,12 @@ static int list_locations(struct kmem_cache *s, char *buf,
{
int n = 0;
unsigned long i;
- struct loc_track t;
+ struct loc_track t = { 0, 0, NULL };
int node;
- t.count = 0;
- t.max = 0;
+ if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
+ GFP_KERNEL))
+ return sprintf(buf, "Out of memory\n");
/* Push back cpu slabs */
flush_all(s);
@@ -3421,11 +3481,14 @@ static ssize_t validate_show(struct kmem_cache *s, char *buf)
static ssize_t validate_store(struct kmem_cache *s,
const char *buf, size_t length)
{
- if (buf[0] == '1')
- validate_slab_cache(s);
- else
- return -EINVAL;
- return length;
+ int ret = -EINVAL;
+
+ if (buf[0] == '1') {
+ ret = validate_slab_cache(s);
+ if (ret >= 0)
+ ret = length;
+ }
+ return ret;
}
SLAB_ATTR(validate);
@@ -3579,7 +3642,7 @@ static struct kset_uevent_ops slab_uevent_ops = {
.filter = uevent_filter,
};
-decl_subsys(slab, &slab_ktype, &slab_uevent_ops);
+static decl_subsys(slab, &slab_ktype, &slab_uevent_ops);
#define ID_STR_LENGTH 64
@@ -3677,7 +3740,7 @@ struct saved_alias {
struct saved_alias *next;
};
-struct saved_alias *alias_list;
+static struct saved_alias *alias_list;
static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
{
@@ -3705,7 +3768,7 @@ static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
static int __init slab_sysfs_init(void)
{
- struct list_head *h;
+ struct kmem_cache *s;
int err;
err = subsystem_register(&slab_subsys);
@@ -3716,10 +3779,7 @@ static int __init slab_sysfs_init(void)
slab_state = SYSFS;
- list_for_each(h, &slab_caches) {
- struct kmem_cache *s =
- container_of(h, struct kmem_cache, list);
-
+ list_for_each_entry(s, &slab_caches, list) {
err = sysfs_slab_add(s);
BUG_ON(err);
}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 925d5c50f18..67daecb6031 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -334,7 +334,8 @@ struct page *read_swap_cache_async(swp_entry_t entry,
* Get a new page to read into from swap.
*/
if (!new_page) {
- new_page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
+ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
+ vma, addr);
if (!new_page)
break; /* Out of memory */
}
diff --git a/mm/truncate.c b/mm/truncate.c
index 7c994f2d614..5cdfbc1a59f 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -82,7 +82,7 @@ EXPORT_SYMBOL(cancel_dirty_page);
/*
* If truncate cannot remove the fs-private metadata from the page, the page
* becomes anonymous. It will be left on the LRU and may even be mapped into
- * user pagetables if we're racing with filemap_nopage().
+ * user pagetables if we're racing with filemap_fault().
*
* We need to bale out if page->mapping is no longer equal to the original
* mapping. This happens a) when the VM reclaimed the page while we waited on
@@ -100,9 +100,9 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
if (PagePrivate(page))
do_invalidatepage(page, 0);
+ remove_from_page_cache(page);
ClearPageUptodate(page);
ClearPageMappedToDisk(page);
- remove_from_page_cache(page);
page_cache_release(page); /* pagecache ref */
}
@@ -192,6 +192,11 @@ void truncate_inode_pages_range(struct address_space *mapping,
unlock_page(page);
continue;
}
+ if (page_mapped(page)) {
+ unmap_mapping_range(mapping,
+ (loff_t)page_index<<PAGE_CACHE_SHIFT,
+ PAGE_CACHE_SIZE, 0);
+ }
truncate_complete_page(mapping, page);
unlock_page(page);
}
@@ -229,6 +234,11 @@ void truncate_inode_pages_range(struct address_space *mapping,
break;
lock_page(page);
wait_on_page_writeback(page);
+ if (page_mapped(page)) {
+ unmap_mapping_range(mapping,
+ (loff_t)page->index<<PAGE_CACHE_SHIFT,
+ PAGE_CACHE_SIZE, 0);
+ }
if (page->index > next)
next = page->index;
next++;
@@ -405,7 +415,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
break;
}
wait_on_page_writeback(page);
- while (page_mapped(page)) {
+ if (page_mapped(page)) {
if (!did_range_unmap) {
/*
* Zap the rest of the file in one hit.
@@ -425,6 +435,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
PAGE_CACHE_SIZE, 0);
}
}
+ BUG_ON(page_mapped(page));
ret = do_launder_page(mapping, page);
if (ret == 0 && !invalidate_complete_page2(mapping, page))
ret = -EIO;
diff --git a/mm/util.c b/mm/util.c
index ace2aea69f1..bf340d80686 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -5,22 +5,7 @@
#include <asm/uaccess.h>
/**
- * __kzalloc - allocate memory. The memory is set to zero.
- * @size: how many bytes of memory are required.
- * @flags: the type of memory to allocate.
- */
-void *__kzalloc(size_t size, gfp_t flags)
-{
- void *ret = kmalloc_track_caller(size, flags);
- if (ret)
- memset(ret, 0, size);
- return ret;
-}
-EXPORT_SYMBOL(__kzalloc);
-
-/*
* kstrdup - allocate space for and copy an existing string
- *
* @s: the string to duplicate
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
*/
@@ -41,6 +26,30 @@ char *kstrdup(const char *s, gfp_t gfp)
EXPORT_SYMBOL(kstrdup);
/**
+ * kstrndup - allocate space for and copy an existing string
+ * @s: the string to duplicate
+ * @max: read at most @max chars from @s
+ * @gfp: the GFP mask used in the kmalloc() call when allocating memory
+ */
+char *kstrndup(const char *s, size_t max, gfp_t gfp)
+{
+ size_t len;
+ char *buf;
+
+ if (!s)
+ return NULL;
+
+ len = strnlen(s, max);
+ buf = kmalloc_track_caller(len+1, gfp);
+ if (buf) {
+ memcpy(buf, s, len);
+ buf[len] = '\0';
+ }
+ return buf;
+}
+EXPORT_SYMBOL(kstrndup);
+
+/**
* kmemdup - duplicate region of memory
*
* @src: memory region to duplicate
@@ -58,9 +67,42 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp)
}
EXPORT_SYMBOL(kmemdup);
+/**
+ * krealloc - reallocate memory. The contents will remain unchanged.
+ * @p: object to reallocate memory for.
+ * @new_size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate.
+ *
+ * The contents of the object pointed to are preserved up to the
+ * lesser of the new and old sizes. If @p is %NULL, krealloc()
+ * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a
+ * %NULL pointer, the object pointed to is freed.
+ */
+void *krealloc(const void *p, size_t new_size, gfp_t flags)
+{
+ void *ret;
+ size_t ks;
+
+ if (unlikely(!new_size)) {
+ kfree(p);
+ return ZERO_SIZE_PTR;
+ }
+
+ ks = ksize(p);
+ if (ks >= new_size)
+ return (void *)p;
+
+ ret = kmalloc_track_caller(new_size, flags);
+ if (ret) {
+ memcpy(ret, p, min(new_size, ks));
+ kfree(p);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(krealloc);
+
/*
* strndup_user - duplicate an existing string from user space
- *
* @s: The string to duplicate
* @n: Maximum number of bytes to copy, including the trailing NUL.
*/
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d3a9c536825..3cee76a8c9f 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -68,12 +68,12 @@ static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr,
} while (pud++, addr = next, addr != end);
}
-void unmap_vm_area(struct vm_struct *area)
+void unmap_kernel_range(unsigned long addr, unsigned long size)
{
pgd_t *pgd;
unsigned long next;
- unsigned long addr = (unsigned long) area->addr;
- unsigned long end = addr + area->size;
+ unsigned long start = addr;
+ unsigned long end = addr + size;
BUG_ON(addr >= end);
pgd = pgd_offset_k(addr);
@@ -84,7 +84,12 @@ void unmap_vm_area(struct vm_struct *area)
continue;
vunmap_pud_range(pgd, addr, next);
} while (pgd++, addr = next, addr != end);
- flush_tlb_kernel_range((unsigned long) area->addr, end);
+ flush_tlb_kernel_range(start, end);
+}
+
+static void unmap_vm_area(struct vm_struct *area)
+{
+ unmap_kernel_range((unsigned long)area->addr, area->size);
}
static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
@@ -159,6 +164,7 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
flush_cache_vmap((unsigned long) area->addr, end);
return err;
}
+EXPORT_SYMBOL_GPL(map_vm_area);
static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags,
unsigned long start, unsigned long end,
@@ -237,6 +243,7 @@ struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
{
return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL);
}
+EXPORT_SYMBOL_GPL(__get_vm_area);
/**
* get_vm_area - reserve a contingous kernel virtual area
@@ -427,11 +434,12 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
area->nr_pages = nr_pages;
/* Please note that the recursion is strictly bounded. */
if (array_size > PAGE_SIZE) {
- pages = __vmalloc_node(array_size, gfp_mask, PAGE_KERNEL, node);
+ pages = __vmalloc_node(array_size, gfp_mask | __GFP_ZERO,
+ PAGE_KERNEL, node);
area->flags |= VM_VPAGES;
} else {
pages = kmalloc_node(array_size,
- (gfp_mask & GFP_LEVEL_MASK),
+ (gfp_mask & GFP_LEVEL_MASK) | __GFP_ZERO,
node);
}
area->pages = pages;
@@ -440,7 +448,6 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
kfree(area);
return NULL;
}
- memset(area->pages, 0, array_size);
for (i = 0; i < area->nr_pages; i++) {
if (node < 0)
@@ -578,9 +585,9 @@ void *vmalloc_exec(unsigned long size)
}
#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
-#define GFP_VMALLOC32 GFP_DMA32
+#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
#elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
-#define GFP_VMALLOC32 GFP_DMA
+#define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL)
#else
#define GFP_VMALLOC32 GFP_KERNEL
#endif
@@ -762,3 +769,56 @@ EXPORT_SYMBOL(remap_vmalloc_range);
void __attribute__((weak)) vmalloc_sync_all(void)
{
}
+
+
+static int f(pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
+{
+ /* apply_to_page_range() does all the hard work. */
+ return 0;
+}
+
+/**
+ * alloc_vm_area - allocate a range of kernel address space
+ * @size: size of the area
+ * @returns: NULL on failure, vm_struct on success
+ *
+ * This function reserves a range of kernel address space, and
+ * allocates pagetables to map that range. No actual mappings
+ * are created. If the kernel address space is not shared
+ * between processes, it syncs the pagetable across all
+ * processes.
+ */
+struct vm_struct *alloc_vm_area(size_t size)
+{
+ struct vm_struct *area;
+
+ area = get_vm_area(size, VM_IOREMAP);
+ if (area == NULL)
+ return NULL;
+
+ /*
+ * This ensures that page tables are constructed for this region
+ * of kernel virtual address space and mapped into init_mm.
+ */
+ if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
+ area->size, f, NULL)) {
+ free_vm_area(area);
+ return NULL;
+ }
+
+ /* Make sure the pagetables are constructed in process kernel
+ mappings */
+ vmalloc_sync_all();
+
+ return area;
+}
+EXPORT_SYMBOL_GPL(alloc_vm_area);
+
+void free_vm_area(struct vm_struct *area)
+{
+ struct vm_struct *ret;
+ ret = remove_vm_area(area->addr);
+ BUG_ON(ret != area);
+ kfree(area);
+}
+EXPORT_SYMBOL_GPL(free_vm_area);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1be5a6376ef..d419e10e3da 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -66,17 +66,8 @@ struct scan_control {
int swappiness;
int all_unreclaimable;
-};
-/*
- * The list of shrinker callbacks used by to apply pressure to
- * ageable caches.
- */
-struct shrinker {
- shrinker_t shrinker;
- struct list_head list;
- int seeks; /* seeks to recreate an obj */
- long nr; /* objs pending delete */
+ int order;
};
#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
@@ -121,34 +112,25 @@ static DECLARE_RWSEM(shrinker_rwsem);
/*
* Add a shrinker callback to be called from the vm
*/
-struct shrinker *set_shrinker(int seeks, shrinker_t theshrinker)
+void register_shrinker(struct shrinker *shrinker)
{
- struct shrinker *shrinker;
-
- shrinker = kmalloc(sizeof(*shrinker), GFP_KERNEL);
- if (shrinker) {
- shrinker->shrinker = theshrinker;
- shrinker->seeks = seeks;
- shrinker->nr = 0;
- down_write(&shrinker_rwsem);
- list_add_tail(&shrinker->list, &shrinker_list);
- up_write(&shrinker_rwsem);
- }
- return shrinker;
+ shrinker->nr = 0;
+ down_write(&shrinker_rwsem);
+ list_add_tail(&shrinker->list, &shrinker_list);
+ up_write(&shrinker_rwsem);
}
-EXPORT_SYMBOL(set_shrinker);
+EXPORT_SYMBOL(register_shrinker);
/*
* Remove one
*/
-void remove_shrinker(struct shrinker *shrinker)
+void unregister_shrinker(struct shrinker *shrinker)
{
down_write(&shrinker_rwsem);
list_del(&shrinker->list);
up_write(&shrinker_rwsem);
- kfree(shrinker);
}
-EXPORT_SYMBOL(remove_shrinker);
+EXPORT_SYMBOL(unregister_shrinker);
#define SHRINK_BATCH 128
/*
@@ -185,7 +167,7 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
list_for_each_entry(shrinker, &shrinker_list, list) {
unsigned long long delta;
unsigned long total_scan;
- unsigned long max_pass = (*shrinker->shrinker)(0, gfp_mask);
+ unsigned long max_pass = (*shrinker->shrink)(0, gfp_mask);
delta = (4 * scanned) / shrinker->seeks;
delta *= max_pass;
@@ -213,8 +195,8 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
int shrink_ret;
int nr_before;
- nr_before = (*shrinker->shrinker)(0, gfp_mask);
- shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask);
+ nr_before = (*shrinker->shrink)(0, gfp_mask);
+ shrink_ret = (*shrinker->shrink)(this_scan, gfp_mask);
if (shrink_ret == -1)
break;
if (shrink_ret < nr_before)
@@ -481,7 +463,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
referenced = page_referenced(page, 1);
/* In active use or really unfreeable? Activate it. */
- if (referenced && page_mapping_inuse(page))
+ if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
+ referenced && page_mapping_inuse(page))
goto activate_locked;
#ifdef CONFIG_SWAP
@@ -514,7 +497,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
}
if (PageDirty(page)) {
- if (referenced)
+ if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced)
goto keep_locked;
if (!may_enter_fs)
goto keep_locked;
@@ -598,6 +581,51 @@ keep:
return nr_reclaimed;
}
+/* LRU Isolation modes. */
+#define ISOLATE_INACTIVE 0 /* Isolate inactive pages. */
+#define ISOLATE_ACTIVE 1 /* Isolate active pages. */
+#define ISOLATE_BOTH 2 /* Isolate both active and inactive pages. */
+
+/*
+ * Attempt to remove the specified page from its LRU. Only take this page
+ * if it is of the appropriate PageActive status. Pages which are being
+ * freed elsewhere are also ignored.
+ *
+ * page: page to consider
+ * mode: one of the LRU isolation modes defined above
+ *
+ * returns 0 on success, -ve errno on failure.
+ */
+static int __isolate_lru_page(struct page *page, int mode)
+{
+ int ret = -EINVAL;
+
+ /* Only take pages on the LRU. */
+ if (!PageLRU(page))
+ return ret;
+
+ /*
+ * When checking the active state, we need to be sure we are
+ * dealing with comparable boolean values. Take the logical not
+ * of each.
+ */
+ if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
+ return ret;
+
+ ret = -EBUSY;
+ if (likely(get_page_unless_zero(page))) {
+ /*
+ * Be careful not to clear PageLRU until after we're
+ * sure the page is not being freed elsewhere -- the
+ * page release code relies on it.
+ */
+ ClearPageLRU(page);
+ ret = 0;
+ }
+
+ return ret;
+}
+
/*
* zone->lru_lock is heavily contended. Some of the functions that
* shrink the lists perform better by taking out a batch of pages
@@ -612,38 +640,90 @@ keep:
* @src: The LRU list to pull pages off.
* @dst: The temp list to put pages on to.
* @scanned: The number of pages that were scanned.
+ * @order: The caller's attempted allocation order
+ * @mode: One of the LRU isolation modes
*
* returns how many pages were moved onto *@dst.
*/
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
struct list_head *src, struct list_head *dst,
- unsigned long *scanned)
+ unsigned long *scanned, int order, int mode)
{
unsigned long nr_taken = 0;
- struct page *page;
unsigned long scan;
for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
- struct list_head *target;
+ struct page *page;
+ unsigned long pfn;
+ unsigned long end_pfn;
+ unsigned long page_pfn;
+ int zone_id;
+
page = lru_to_page(src);
prefetchw_prev_lru_page(page, src, flags);
VM_BUG_ON(!PageLRU(page));
- list_del(&page->lru);
- target = src;
- if (likely(get_page_unless_zero(page))) {
- /*
- * Be careful not to clear PageLRU until after we're
- * sure the page is not being freed elsewhere -- the
- * page release code relies on it.
- */
- ClearPageLRU(page);
- target = dst;
+ switch (__isolate_lru_page(page, mode)) {
+ case 0:
+ list_move(&page->lru, dst);
nr_taken++;
- } /* else it is being freed elsewhere */
+ break;
+
+ case -EBUSY:
+ /* else it is being freed elsewhere */
+ list_move(&page->lru, src);
+ continue;
+
+ default:
+ BUG();
+ }
+
+ if (!order)
+ continue;
- list_add(&page->lru, target);
+ /*
+ * Attempt to take all pages in the order aligned region
+ * surrounding the tag page. Only take those pages of
+ * the same active state as that tag page. We may safely
+ * round the target page pfn down to the requested order
+ * as the mem_map is guaranteed valid out to MAX_ORDER,
+ * where that page is in a different zone we will detect
+ * it from its zone id and abort this block scan.
+ */
+ zone_id = page_zone_id(page);
+ page_pfn = page_to_pfn(page);
+ pfn = page_pfn & ~((1 << order) - 1);
+ end_pfn = pfn + (1 << order);
+ for (; pfn < end_pfn; pfn++) {
+ struct page *cursor_page;
+
+ /* The target page is in the block, ignore it. */
+ if (unlikely(pfn == page_pfn))
+ continue;
+
+ /* Avoid holes within the zone. */
+ if (unlikely(!pfn_valid_within(pfn)))
+ break;
+
+ cursor_page = pfn_to_page(pfn);
+ /* Check that we have not crossed a zone boundary. */
+ if (unlikely(page_zone_id(cursor_page) != zone_id))
+ continue;
+ switch (__isolate_lru_page(cursor_page, mode)) {
+ case 0:
+ list_move(&cursor_page->lru, dst);
+ nr_taken++;
+ scan++;
+ break;
+
+ case -EBUSY:
+ /* else it is being freed elsewhere */
+ list_move(&cursor_page->lru, src);
+ default:
+ break;
+ }
+ }
}
*scanned = scan;
@@ -651,6 +731,24 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
}
/*
+ * clear_active_flags() is a helper for shrink_inactive_list(), clearing
+ * any active bits from the pages in the list and returning how many it cleared.
+ */
+static unsigned long clear_active_flags(struct list_head *page_list)
+{
+ int nr_active = 0;
+ struct page *page;
+
+ list_for_each_entry(page, page_list, lru)
+ if (PageActive(page)) {
+ ClearPageActive(page);
+ nr_active++;
+ }
+
+ return nr_active;
+}
+
+/*
* shrink_inactive_list() is a helper for shrink_zone(). It returns the number
* of reclaimed pages
*/
@@ -671,11 +769,18 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
unsigned long nr_taken;
unsigned long nr_scan;
unsigned long nr_freed;
+ unsigned long nr_active;
nr_taken = isolate_lru_pages(sc->swap_cluster_max,
- &zone->inactive_list,
- &page_list, &nr_scan);
- __mod_zone_page_state(zone, NR_INACTIVE, -nr_taken);
+ &zone->inactive_list,
+ &page_list, &nr_scan, sc->order,
+ (sc->order > PAGE_ALLOC_COSTLY_ORDER)?
+ ISOLATE_BOTH : ISOLATE_INACTIVE);
+ nr_active = clear_active_flags(&page_list);
+
+ __mod_zone_page_state(zone, NR_ACTIVE, -nr_active);
+ __mod_zone_page_state(zone, NR_INACTIVE,
+ -(nr_taken - nr_active));
zone->pages_scanned += nr_scan;
spin_unlock_irq(&zone->lru_lock);
@@ -820,7 +925,7 @@ force_reclaim_mapped:
lru_add_drain();
spin_lock_irq(&zone->lru_lock);
pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
- &l_hold, &pgscanned);
+ &l_hold, &pgscanned, sc->order, ISOLATE_ACTIVE);
zone->pages_scanned += pgscanned;
__mod_zone_page_state(zone, NR_ACTIVE, -pgmoved);
spin_unlock_irq(&zone->lru_lock);
@@ -1011,7 +1116,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
* holds filesystem locks which prevent writeout this might not work, and the
* allocation attempt will fail.
*/
-unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
+unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
{
int priority;
int ret = 0;
@@ -1026,6 +1131,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
.swap_cluster_max = SWAP_CLUSTER_MAX,
.may_swap = 1,
.swappiness = vm_swappiness,
+ .order = order,
};
count_vm_event(ALLOCSTALL);
@@ -1131,6 +1237,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
.may_swap = 1,
.swap_cluster_max = SWAP_CLUSTER_MAX,
.swappiness = vm_swappiness,
+ .order = order,
};
/*
* temp_priority is used to remember the scanning priority at which
@@ -1314,6 +1421,7 @@ static int kswapd(void *p)
* trying to free the first piece of memory in the first place).
*/
tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
+ set_freezable();
order = 0;
for ( ; ; ) {
diff --git a/mm/vmstat.c b/mm/vmstat.c
index eceaf496210..fadf791cd7e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -472,7 +472,7 @@ const struct seq_operations fragmentation_op = {
#endif
#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
- TEXT_FOR_HIGHMEM(xx)
+ TEXT_FOR_HIGHMEM(xx) xx "_movable",
static const char * const vmstat_text[] = {
/* Zoned VM counters */