From 4e02ed4b4a2fae34aae766a5bb93ae235f60adb8 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 29 Oct 2008 14:00:55 -0700 Subject: fs: remove prepare_write/commit_write Nothing uses prepare_write or commit_write. Remove them from the tree completely. [akpm@linux-foundation.org: schedule simple_prepare_write() for unexporting] Signed-off-by: Nick Piggin Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 242 +---------------------------------------------------------- 1 file changed, 4 insertions(+), 238 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index ab8553658af..f3e5f8944d1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2029,48 +2029,8 @@ int pagecache_write_begin(struct file *file, struct address_space *mapping, { const struct address_space_operations *aops = mapping->a_ops; - if (aops->write_begin) { - return aops->write_begin(file, mapping, pos, len, flags, + return aops->write_begin(file, mapping, pos, len, flags, pagep, fsdata); - } else { - int ret; - pgoff_t index = pos >> PAGE_CACHE_SHIFT; - unsigned offset = pos & (PAGE_CACHE_SIZE - 1); - struct inode *inode = mapping->host; - struct page *page; -again: - page = __grab_cache_page(mapping, index); - *pagep = page; - if (!page) - return -ENOMEM; - - if (flags & AOP_FLAG_UNINTERRUPTIBLE && !PageUptodate(page)) { - /* - * There is no way to resolve a short write situation - * for a !Uptodate page (except by double copying in - * the caller done by generic_perform_write_2copy). - * - * Instead, we have to bring it uptodate here. - */ - ret = aops->readpage(file, page); - page_cache_release(page); - if (ret) { - if (ret == AOP_TRUNCATED_PAGE) - goto again; - return ret; - } - goto again; - } - - ret = aops->prepare_write(file, page, offset, offset+len); - if (ret) { - unlock_page(page); - page_cache_release(page); - if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); - } - return ret; - } } EXPORT_SYMBOL(pagecache_write_begin); @@ -2079,32 +2039,9 @@ int pagecache_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { const struct address_space_operations *aops = mapping->a_ops; - int ret; - - if (aops->write_end) { - mark_page_accessed(page); - ret = aops->write_end(file, mapping, pos, len, copied, - page, fsdata); - } else { - unsigned offset = pos & (PAGE_CACHE_SIZE - 1); - struct inode *inode = mapping->host; - - flush_dcache_page(page); - ret = aops->commit_write(file, page, offset, offset+len); - unlock_page(page); - mark_page_accessed(page); - page_cache_release(page); - - if (ret < 0) { - if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); - } else if (ret > 0) - ret = min_t(size_t, copied, ret); - else - ret = copied; - } - return ret; + mark_page_accessed(page); + return aops->write_end(file, mapping, pos, len, copied, page, fsdata); } EXPORT_SYMBOL(pagecache_write_end); @@ -2226,174 +2163,6 @@ repeat: } EXPORT_SYMBOL(__grab_cache_page); -static ssize_t generic_perform_write_2copy(struct file *file, - struct iov_iter *i, loff_t pos) -{ - struct address_space *mapping = file->f_mapping; - const struct address_space_operations *a_ops = mapping->a_ops; - struct inode *inode = mapping->host; - long status = 0; - ssize_t written = 0; - - do { - struct page *src_page; - struct page *page; - pgoff_t index; /* Pagecache index for current page */ - unsigned long offset; /* Offset into pagecache page */ - unsigned long bytes; /* Bytes to write to page */ - size_t copied; /* Bytes 
copied from user */ - - offset = (pos & (PAGE_CACHE_SIZE - 1)); - index = pos >> PAGE_CACHE_SHIFT; - bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, - iov_iter_count(i)); - - /* - * a non-NULL src_page indicates that we're doing the - * copy via get_user_pages and kmap. - */ - src_page = NULL; - - /* - * Bring in the user page that we will copy from _first_. - * Otherwise there's a nasty deadlock on copying from the - * same page as we're writing to, without it being marked - * up-to-date. - * - * Not only is this an optimisation, but it is also required - * to check that the address is actually valid, when atomic - * usercopies are used, below. - */ - if (unlikely(iov_iter_fault_in_readable(i, bytes))) { - status = -EFAULT; - break; - } - - page = __grab_cache_page(mapping, index); - if (!page) { - status = -ENOMEM; - break; - } - - /* - * non-uptodate pages cannot cope with short copies, and we - * cannot take a pagefault with the destination page locked. - * So pin the source page to copy it. - */ - if (!PageUptodate(page) && !segment_eq(get_fs(), KERNEL_DS)) { - unlock_page(page); - - src_page = alloc_page(GFP_KERNEL); - if (!src_page) { - page_cache_release(page); - status = -ENOMEM; - break; - } - - /* - * Cannot get_user_pages with a page locked for the - * same reason as we can't take a page fault with a - * page locked (as explained below). - */ - copied = iov_iter_copy_from_user(src_page, i, - offset, bytes); - if (unlikely(copied == 0)) { - status = -EFAULT; - page_cache_release(page); - page_cache_release(src_page); - break; - } - bytes = copied; - - lock_page(page); - /* - * Can't handle the page going uptodate here, because - * that means we would use non-atomic usercopies, which - * zero out the tail of the page, which can cause - * zeroes to become transiently visible. We could just - * use a non-zeroing copy, but the APIs aren't too - * consistent. - */ - if (unlikely(!page->mapping || PageUptodate(page))) { - unlock_page(page); - page_cache_release(page); - page_cache_release(src_page); - continue; - } - } - - status = a_ops->prepare_write(file, page, offset, offset+bytes); - if (unlikely(status)) - goto fs_write_aop_error; - - if (!src_page) { - /* - * Must not enter the pagefault handler here, because - * we hold the page lock, so we might recursively - * deadlock on the same lock, or get an ABBA deadlock - * against a different lock, or against the mmap_sem - * (which nests outside the page lock). So increment - * preempt count, and use _atomic usercopies. - * - * The page is uptodate so we are OK to encounter a - * short copy: if unmodified parts of the page are - * marked dirty and written out to disk, it doesn't - * really matter. 
- */ - pagefault_disable(); - copied = iov_iter_copy_from_user_atomic(page, i, - offset, bytes); - pagefault_enable(); - } else { - void *src, *dst; - src = kmap_atomic(src_page, KM_USER0); - dst = kmap_atomic(page, KM_USER1); - memcpy(dst + offset, src + offset, bytes); - kunmap_atomic(dst, KM_USER1); - kunmap_atomic(src, KM_USER0); - copied = bytes; - } - flush_dcache_page(page); - - status = a_ops->commit_write(file, page, offset, offset+bytes); - if (unlikely(status < 0)) - goto fs_write_aop_error; - if (unlikely(status > 0)) /* filesystem did partial write */ - copied = min_t(size_t, copied, status); - - unlock_page(page); - mark_page_accessed(page); - page_cache_release(page); - if (src_page) - page_cache_release(src_page); - - iov_iter_advance(i, copied); - pos += copied; - written += copied; - - balance_dirty_pages_ratelimited(mapping); - cond_resched(); - continue; - -fs_write_aop_error: - unlock_page(page); - page_cache_release(page); - if (src_page) - page_cache_release(src_page); - - /* - * prepare_write() may have instantiated a few blocks - * outside i_size. Trim these off again. Don't need - * i_size_read because we hold i_mutex. - */ - if (pos + bytes > inode->i_size) - vmtruncate(inode, inode->i_size); - break; - } while (iov_iter_count(i)); - - return written ? written : status; -} - static ssize_t generic_perform_write(struct file *file, struct iov_iter *i, loff_t pos) { @@ -2494,10 +2263,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, struct iov_iter i; iov_iter_init(&i, iov, nr_segs, count, written); - if (a_ops->write_begin) - status = generic_perform_write(file, &i, pos); - else - status = generic_perform_write_2copy(file, &i, pos); + status = generic_perform_write(file, &i, pos); if (likely(status >= 0)) { written += status; -- cgit v1.2.3 From e99c97ade53fb6f5e665f2960eb86c624a532d7b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 29 Oct 2008 14:01:09 -0700 Subject: mm: fix kernel-doc function notation Delete excess kernel-doc notation in mm/ subdirectory. Actually this is a kernel-doc notation fix. Warning(/var/linsrc/linux-2.6.27-git10//mm/vmalloc.c:902): Excess function parameter or struct member 'returns' description in 'vm_map_ram' Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 036536945dd..f1cc03bbf6a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -897,7 +897,8 @@ EXPORT_SYMBOL(vm_unmap_ram); * @count: number of pages * @node: prefer to allocate data structures on this node * @prot: memory protection to use. PAGE_KERNEL for regular RAM - * @returns: a pointer to the address that has been mapped, or NULL on failure + * + * Returns: a pointer to the address that has been mapped, or %NULL on failure */ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) { -- cgit v1.2.3 From 731572d39fcd3498702eda4600db4c43d51e0b26 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Wed, 29 Oct 2008 14:01:20 -0700 Subject: nfsd: fix vm overcommit crash Junjiro R. Okajima reported a problem where knfsd crashes if you are using it to export shmemfs objects and run strict overcommit. In this situation the current->mm based modifier to the overcommit goes through a NULL pointer. 
We could simply check for NULL and skip the modifier but we've caught other real bugs in the past from mm being NULL here - cases where we did need a valid mm set up (eg the exec bug about a year ago). To preserve the checks and get the logic we want shuffle the checking around and add a new helper to the vm_ security wrappers Also fix a current->mm reference in nommu that should use the passed mm [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: fix build] Reported-by: Junjiro R. Okajima Acked-by: James Morris Signed-off-by: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 3 ++- mm/nommu.c | 3 ++- mm/shmem.c | 8 ++++---- 3 files changed, 8 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 74f4d158022..de14ac21e5b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -175,7 +175,8 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) /* Don't let a single process grow too big: leave 3% of the size of this process for other processes */ - allowed -= mm->total_vm / 32; + if (mm) + allowed -= mm->total_vm / 32; /* * cast `allowed' as a signed long because vm_committed_space diff --git a/mm/nommu.c b/mm/nommu.c index 2696b24f2bb..7695dc85078 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1454,7 +1454,8 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) /* Don't let a single process grow too big: leave 3% of the size of this process for other processes */ - allowed -= current->mm->total_vm / 32; + if (mm) + allowed -= mm->total_vm / 32; /* * cast `allowed' as a signed long because vm_committed_space diff --git a/mm/shmem.c b/mm/shmem.c index d38d7e61fcd..0ed075215e5 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -161,8 +161,8 @@ static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) */ static inline int shmem_acct_size(unsigned long flags, loff_t size) { - return (flags & VM_ACCOUNT)? - security_vm_enough_memory(VM_ACCT(size)): 0; + return (flags & VM_ACCOUNT) ? + security_vm_enough_memory_kern(VM_ACCT(size)) : 0; } static inline void shmem_unacct_size(unsigned long flags, loff_t size) @@ -179,8 +179,8 @@ static inline void shmem_unacct_size(unsigned long flags, loff_t size) */ static inline int shmem_acct_block(unsigned long flags) { - return (flags & VM_ACCOUNT)? - 0: security_vm_enough_memory(VM_ACCT(PAGE_CACHE_SIZE)); + return (flags & VM_ACCOUNT) ? + 0 : security_vm_enough_memory_kern(VM_ACCT(PAGE_CACHE_SIZE)); } static inline void shmem_unacct_blocks(unsigned long flags, long pages) -- cgit v1.2.3 From ab4f2ee130d5ffcf35616e1f5c6ab75af5b463b6 Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 6 Nov 2008 17:11:07 +0000 Subject: [ARM] fix naming of MODULE_START / MODULE_END As of 73bdf0a60e607f4b8ecc5aec597105976565a84f, the kernel needs to know where modules are located in the virtual address space. On ARM, we located this region between MODULE_START and MODULE_END. Unfortunately, everyone else calls it MODULES_VADDR and MODULES_END. Update ARM to use the same naming, so is_vmalloc_or_module_addr() can work properly. Also update the comment on mm/vmalloc.c to reflect that ARM also places modules in a separate region from the vmalloc space. 
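For readers following along, here is a hedged sketch of the check that the unified MODULES_VADDR/MODULES_END naming enables. It is close in spirit to the is_vmalloc_or_module_addr() helper discussed above, but it is illustrative only and assumes the architecture defines a dedicated module region (the _sketch suffix marks it as not the in-tree symbol):

static inline int is_vmalloc_or_module_addr_sketch(const void *x)
{
	/* Sketch: treat the module area like vmalloc space on architectures
	 * that place modules in their own region. Assumes MODULES_VADDR and
	 * MODULES_END exist, per the renaming described above. */
#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
	unsigned long addr = (unsigned long)x;

	if (addr >= MODULES_VADDR && addr < MODULES_END)
		return 1;
#endif
	return is_vmalloc_addr(x);	/* otherwise, plain vmalloc range check */
}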
Signed-off-by: Russell King --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f1cc03bbf6a..66fad3fc02b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -178,7 +178,7 @@ static int vmap_page_range(unsigned long addr, unsigned long end, static inline int is_vmalloc_or_module_addr(const void *x) { /* - * x86-64 and sparc64 put modules in a special place, + * ARM, x86-64 and sparc64 put modules in a special place, * and fall back on vmalloc() if that fails. Others * just put it in the vmalloc space. */ -- cgit v1.2.3 From 69d177c2fc702d402b17fdca2190d5a7e3ca55c5 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 6 Nov 2008 12:53:26 -0800 Subject: hugetlbfs: handle pages higher order than MAX_ORDER When working with hugepages, hugetlbfs assumes that those hugepages are smaller than MAX_ORDER. Specifically it assumes that the mem_map is contigious and uses that to optimise access to the elements of the mem_map that represent the hugepage. Gigantic pages (such as 16GB pages on powerpc) by definition are of greater order than MAX_ORDER (larger than MAX_ORDER_NR_PAGES in size). This means that we can no longer make use of the buddy alloctor guarentees for the contiguity of the mem_map, which ensures that the mem_map is at least contigious for maximmally aligned areas of MAX_ORDER_NR_PAGES pages. This patch adds new mem_map accessors and iterator helpers which handle any discontiguity at MAX_ORDER_NR_PAGES boundaries. It then uses these to implement gigantic page versions of copy_huge_page and clear_huge_page, and to allow follow_hugetlb_page handle gigantic pages. Signed-off-by: Andy Whitcroft Cc: Jon Tollefson Cc: Mel Gorman Cc: Nick Piggin Cc: Christoph Lameter Cc: [2.6.27.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 37 ++++++++++++++++++++++++++++++++++++- mm/internal.h | 28 ++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 421aee99b84..e6afe527bd0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -354,11 +354,26 @@ static int vma_has_reserves(struct vm_area_struct *vma) return 0; } +static void clear_gigantic_page(struct page *page, + unsigned long addr, unsigned long sz) +{ + int i; + struct page *p = page; + + might_sleep(); + for (i = 0; i < sz/PAGE_SIZE; i++, p = mem_map_next(p, page, i)) { + cond_resched(); + clear_user_highpage(p, addr + i * PAGE_SIZE); + } +} static void clear_huge_page(struct page *page, unsigned long addr, unsigned long sz) { int i; + if (unlikely(sz > MAX_ORDER_NR_PAGES)) + return clear_gigantic_page(page, addr, sz); + might_sleep(); for (i = 0; i < sz/PAGE_SIZE; i++) { cond_resched(); @@ -366,12 +381,32 @@ static void clear_huge_page(struct page *page, } } +static void copy_gigantic_page(struct page *dst, struct page *src, + unsigned long addr, struct vm_area_struct *vma) +{ + int i; + struct hstate *h = hstate_vma(vma); + struct page *dst_base = dst; + struct page *src_base = src; + might_sleep(); + for (i = 0; i < pages_per_huge_page(h); ) { + cond_resched(); + copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); + + i++; + dst = mem_map_next(dst, dst_base, i); + src = mem_map_next(src, src_base, i); + } +} static void copy_huge_page(struct page *dst, struct page *src, unsigned long addr, struct vm_area_struct *vma) { int i; struct hstate *h = hstate_vma(vma); + if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) + return 
copy_gigantic_page(dst, src, addr, vma); + might_sleep(); for (i = 0; i < pages_per_huge_page(h); i++) { cond_resched(); @@ -2130,7 +2165,7 @@ same_page: if (zeropage_ok) pages[i] = ZERO_PAGE(0); else - pages[i] = page + pfn_offset; + pages[i] = mem_map_offset(page, pfn_offset); get_page(pages[i]); } diff --git a/mm/internal.h b/mm/internal.h index e4e728bdf32..f482460de3e 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -175,6 +175,34 @@ static inline void free_page_mlock(struct page *page) { } #endif /* CONFIG_UNEVICTABLE_LRU */ +/* + * Return the mem_map entry representing the 'offset' subpage within + * the maximally aligned gigantic page 'base'. Handle any discontiguity + * in the mem_map at MAX_ORDER_NR_PAGES boundaries. + */ +static inline struct page *mem_map_offset(struct page *base, int offset) +{ + if (unlikely(offset >= MAX_ORDER_NR_PAGES)) + return pfn_to_page(page_to_pfn(base) + offset); + return base + offset; +} + +/* + * Iterator over all subpages withing the maximally aligned gigantic + * page 'base'. Handle any discontiguity in the mem_map. + */ +static inline struct page *mem_map_next(struct page *iter, + struct page *base, int offset) +{ + if (unlikely((offset & (MAX_ORDER_NR_PAGES - 1)) == 0)) { + unsigned long pfn = page_to_pfn(base) + offset; + if (!pfn_valid(pfn)) + return NULL; + return pfn_to_page(pfn); + } + return iter + 1; +} + /* * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node, * so all functions starting at paging_init should be marked __init -- cgit v1.2.3 From 18229df5b613ed0732a766fc37850de2e7988e43 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 6 Nov 2008 12:53:27 -0800 Subject: hugetlb: pull gigantic page initialisation out of the default path As we can determine exactly when a gigantic page is in use we can optimise the common regular page cases by pulling out gigantic page initialisation into its own function. As gigantic pages are never released to buddy we do not need a destructor. This effectivly reverts the previous change to the main buddy allocator. It also adds a paranoid check to ensure we never release gigantic pages from hugetlbfs to the main buddy. 
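Both hugetlb patches above rest on the same observation: the mem_map is only guaranteed contiguous inside a MAX_ORDER_NR_PAGES-aligned block, so gigantic pages must not be walked with plain pointer arithmetic. The following standalone userspace model of the mem_map_offset() idea is an illustration only; the chunk size, the chunks[] layout and all names are invented and do not correspond to kernel code:

#include <stdio.h>
#include <stdlib.h>

#define CHUNK	8		/* stands in for MAX_ORDER_NR_PAGES */
#define NCHUNKS	4

struct page { unsigned long pfn; };

/* Separately allocated chunks, so the "mem_map" is genuinely discontiguous. */
static struct page *chunks[NCHUNKS];

static struct page *pfn_to_page_model(unsigned long pfn)
{
	return &chunks[pfn / CHUNK][pfn % CHUNK];
}

/* Mirrors mem_map_offset(): pointer arithmetic is only valid while we stay
 * inside one contiguous chunk; past that, translate via the pfn instead. */
static struct page *mem_map_offset_model(struct page *base, int offset)
{
	if (offset >= CHUNK)
		return pfn_to_page_model(base->pfn + offset);
	return base + offset;
}

int main(void)
{
	for (int c = 0; c < NCHUNKS; c++) {
		chunks[c] = malloc(sizeof(struct page) * CHUNK);
		for (int i = 0; i < CHUNK; i++)
			chunks[c][i].pfn = (unsigned long)c * CHUNK + i;
	}

	struct page *base = pfn_to_page_model(0);
	/* Offset 10 crosses a chunk boundary: "base + 10" would be invalid
	 * here, while the chunk-aware lookup lands on the right subpage. */
	printf("subpage 10 has pfn %lu\n", mem_map_offset_model(base, 10)->pfn);
	return 0;
}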
Signed-off-by: Andy Whitcroft Cc: Jon Tollefson Cc: Mel Gorman Cc: Nick Piggin Cc: Christoph Lameter Cc: [2.6.27.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 12 +++++++++++- mm/internal.h | 1 + mm/page_alloc.c | 28 +++++++++++++++++++++------- 3 files changed, 33 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e6afe527bd0..d143ab67be4 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -491,6 +491,8 @@ static void update_and_free_page(struct hstate *h, struct page *page) { int i; + VM_BUG_ON(h->order >= MAX_ORDER); + h->nr_huge_pages--; h->nr_huge_pages_node[page_to_nid(page)]--; for (i = 0; i < pages_per_huge_page(h); i++) { @@ -1005,6 +1007,14 @@ found: return 1; } +static void prep_compound_huge_page(struct page *page, int order) +{ + if (unlikely(order > (MAX_ORDER - 1))) + prep_compound_gigantic_page(page, order); + else + prep_compound_page(page, order); +} + /* Put bootmem huge pages into the standard lists after mem_map is up */ static void __init gather_bootmem_prealloc(void) { @@ -1015,7 +1025,7 @@ static void __init gather_bootmem_prealloc(void) struct hstate *h = m->hstate; __ClearPageReserved(page); WARN_ON(page_count(page) != 1); - prep_compound_page(page, h->order); + prep_compound_huge_page(page, h->order); prep_new_huge_page(h, page, page_to_nid(page)); } } diff --git a/mm/internal.h b/mm/internal.h index f482460de3e..13333bc2eb6 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -17,6 +17,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); extern void prep_compound_page(struct page *page, unsigned long order); +extern void prep_compound_gigantic_page(struct page *page, unsigned long order); static inline void set_page_count(struct page *page, int v) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d0a240fbb8b..54069e64e3a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -260,6 +260,23 @@ static void free_compound_page(struct page *page) } void prep_compound_page(struct page *page, unsigned long order) +{ + int i; + int nr_pages = 1 << order; + + set_compound_page_dtor(page, free_compound_page); + set_compound_order(page, order); + __SetPageHead(page); + for (i = 1; i < nr_pages; i++) { + struct page *p = page + i; + + __SetPageTail(p); + p->first_page = page; + } +} + +#ifdef CONFIG_HUGETLBFS +void prep_compound_gigantic_page(struct page *page, unsigned long order) { int i; int nr_pages = 1 << order; @@ -268,19 +285,17 @@ void prep_compound_page(struct page *page, unsigned long order) set_compound_page_dtor(page, free_compound_page); set_compound_order(page, order); __SetPageHead(page); - for (i = 1; i < nr_pages; i++, p++) { - if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0)) - p = pfn_to_page(page_to_pfn(page) + i); + for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { __SetPageTail(p); p->first_page = page; } } +#endif static void destroy_compound_page(struct page *page, unsigned long order) { int i; int nr_pages = 1 << order; - struct page *p = page + 1; if (unlikely(compound_order(page) != order)) bad_page(page); @@ -288,9 +303,8 @@ static void destroy_compound_page(struct page *page, unsigned long order) if (unlikely(!PageHead(page))) bad_page(page); __ClearPageHead(page); - for (i = 1; i < nr_pages; i++, p++) { - if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0)) - p = pfn_to_page(page_to_pfn(page) + i); + for (i = 1; i < nr_pages; i++) { + struct page *p = page + i; if (unlikely(!PageTail(p) 
| (p->first_page != page))) -- cgit v1.2.3 From b4416d2bea007f07f2e74cdc4cb64042ec996c83 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 6 Nov 2008 12:53:29 -0800 Subject: oom: do not dump task state for non thread group leaders When /proc/sys/vm/oom_dump_tasks is enabled, it's only necessary to dump task state information for thread group leaders. The kernel log gets quickly overwhelmed on machines with a massive number of threads by dumping non-thread group leaders. Reviewed-by: Christoph Lameter Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 64e5b4bcd96..2846a58e5de 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -295,6 +295,8 @@ static void dump_tasks(const struct mem_cgroup *mem) continue; if (mem && !task_in_mem_cgroup(p, mem)) continue; + if (!thread_group_leader(p)) + continue; task_lock(p); printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", -- cgit v1.2.3 From 0aedadf91a70a11c4a3e7c7d99b21e5528af8d5d Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 6 Nov 2008 12:53:30 -0800 Subject: mm: move migrate_prep out from under mmap_sem Move the migrate_prep outside the mmap_sem for the following system calls 1. sys_move_pages 2. sys_migrate_pages 3. sys_mbind() It really does not matter when we flush the lru. The system is free to add pages onto the lru even during migration which will make the page migration either skip the page (mbind, migrate_pages) or return a busy state (move_pages). Fixes this lockdep warning (and potential deadlock): Some VM place has mmap_sem -> kevent_wq via lru_add_drain_all() net/core/dev.c::dev_ioctl() has rtnl_lock -> mmap_sem (*) the ioctl has copy_from_user() and it can do page fault. linkwatch_event has kevent_wq -> rtnl_lock Signed-off-by: Christoph Lameter Cc: KOSAKI Motohiro Reported-by: Heiko Carstens Cc: Nick Piggin Cc: Hugh Dickins Cc: Rik van Riel Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 18 +++++++++++------- mm/migrate.c | 2 +- 2 files changed, 12 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 36f42573a33..e9493b1c111 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -489,12 +489,6 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, int err; struct vm_area_struct *first, *vma, *prev; - if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { - - err = migrate_prep(); - if (err) - return ERR_PTR(err); - } first = find_vma(mm, start); if (!first) @@ -809,9 +803,13 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) { int busy = 0; - int err = 0; + int err; nodemask_t tmp; + err = migrate_prep(); + if (err) + return err; + down_read(&mm->mmap_sem); err = migrate_vmas(mm, from_nodes, to_nodes, flags); @@ -974,6 +972,12 @@ static long do_mbind(unsigned long start, unsigned long len, start, start + len, mode, mode_flags, nmask ? 
nodes_addr(*nmask)[0] : -1); + if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { + + err = migrate_prep(); + if (err) + return err; + } down_write(&mm->mmap_sem); vma = check_range(mm, start, end, nmask, flags | MPOL_MF_INVERT, &pagelist); diff --git a/mm/migrate.c b/mm/migrate.c index 6602941bfab..385db89f0c3 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -841,12 +841,12 @@ static int do_move_page_to_node_array(struct mm_struct *mm, struct page_to_node *pp; LIST_HEAD(pagelist); + migrate_prep(); down_read(&mm->mmap_sem); /* * Build a list of pages to migrate */ - migrate_prep(); for (pp = pm; pp->node != MAX_NUMNODES; pp++) { struct vm_area_struct *vma; struct page *page; -- cgit v1.2.3 From b41ad14c30acf023d09ac064096a4cf41248ce46 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 6 Nov 2008 12:53:31 -0800 Subject: vmemmap: warn about page_structs with remote distance It's insufficient to simply compare node ids when warning about offnode page_structs since it's possible to still have local affinity. Acked-by: Christoph Lameter Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse-vmemmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index a91b5f8fcaf..a13ea6401ae 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -64,7 +64,7 @@ void __meminit vmemmap_verify(pte_t *pte, int node, unsigned long pfn = pte_pfn(*pte); int actual_node = early_pfn_to_nid(pfn); - if (actual_node != node) + if (node_distance(actual_node, node) > LOCAL_DISTANCE) printk(KERN_WARNING "[%lx-%lx] potential offnode " "page_structs\n", start, end - 1); } -- cgit v1.2.3 From fbdd12676c83df77480f00ebd32fc98fbe3bf836 Mon Sep 17 00:00:00 2001 From: Qinghuang Feng Date: Thu, 6 Nov 2008 12:53:34 -0800 Subject: mm/oom_kill.c: fix badness() kerneldoc Paramter @mem has been removed since v2.6.26, now delete it's comment. Signed-off-by: Qinghuang Feng Acked-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 2846a58e5de..a0a01902f55 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -38,7 +38,6 @@ static DEFINE_SPINLOCK(zone_scan_mutex); * badness - calculate a numeric value for how bad this task has been * @p: task struct of which task we should calculate * @uptime: current uptime in seconds - * @mem: target memory controller * * The formula used is relatively simple and documented inline in the * function. The main rationale is that we want to select a good task -- cgit v1.2.3 From a70dcb969f64e2fa98c24f47854f20bf02ff0092 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Thu, 6 Nov 2008 12:53:36 -0800 Subject: memory hotplug: fix page_zone() calculation in test_pages_isolated() My last bugfix here (adding zone->lock) introduced a new problem: Using page_zone(pfn_to_page(pfn)) to get the zone after the for() loop is wrong. pfn will then be >= end_pfn, which may be in a different zone or not present at all. This may lead to an addressing exception in page_zone() or spin_lock_irqsave(). Now I use __first_valid_page() again after the loop to find a valid page for page_zone(). 
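A tiny userspace illustration of the pattern being fixed here, with invented names and data: after the scan loop the counter has already run past the range, so it must not be fed back into a lookup; a valid element has to be re-derived, which is what __first_valid_page() does in the patch below.

#include <stdio.h>
#include <stdbool.h>

/* Pretend every fourth pfn has no backing struct page (illustration only). */
static bool pfn_valid_model(unsigned long pfn)
{
	return pfn % 4 != 3;
}

static long first_valid_pfn(unsigned long start, unsigned long end)
{
	for (unsigned long pfn = start; pfn < end; pfn++)
		if (pfn_valid_model(pfn))
			return (long)pfn;
	return -1;
}

int main(void)
{
	unsigned long start_pfn = 0, end_pfn = 8, pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn++)
		;	/* the isolation scan; pfn may run up to end_pfn */

	/* The old code effectively did a lookup with pfn == end_pfn here,
	 * outside the scanned range; re-derive a valid pfn instead. */
	printf("zone lookup should use pfn %ld, not %lu\n",
	       first_valid_pfn(start_pfn, end_pfn), pfn);
	return 0;
}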
Signed-off-by: Gerald Schaefer Acked-by: Nathan Fontenot Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_isolation.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_isolation.c b/mm/page_isolation.c index b70a7fec1ff..5e0ffd96745 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -130,10 +130,11 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) if (page && get_pageblock_migratetype(page) != MIGRATE_ISOLATE) break; } - if (pfn < end_pfn) + page = __first_valid_page(start_pfn, end_pfn - start_pfn); + if ((pfn < end_pfn) || !page) return -EBUSY; /* Check all pages are free or Marked as ISOLATED */ - zone = page_zone(pfn_to_page(pfn)); + zone = page_zone(page); spin_lock_irqsave(&zone->lock, flags); ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn); spin_unlock_irqrestore(&zone->lock, flags); -- cgit v1.2.3 From 9b46333406b9cb3397ab538485a4d57c316af0ff Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 28 Oct 2008 19:22:34 +1100 Subject: vmap: cope with vm_unmap_aliases before vmalloc_init() Xen can end up calling vm_unmap_aliases() before vmalloc_init() has been called. In this case its safe to make it a simple no-op. Signed-off-by: Jeremy Fitzhardinge Cc: Linux Memory Management List Cc: Nick Piggin Signed-off-by: Ingo Molnar --- mm/vmalloc.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 66fad3fc02b..ba6b0f5f7fa 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -592,6 +592,8 @@ static void free_unmap_vmap_area_addr(unsigned long addr) #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) +static bool vmap_initialized __read_mostly = false; + struct vmap_block_queue { spinlock_t lock; struct list_head free; @@ -828,6 +830,9 @@ void vm_unmap_aliases(void) int cpu; int flush = 0; + if (unlikely(!vmap_initialized)) + return; + for_each_possible_cpu(cpu) { struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); struct vmap_block *vb; @@ -942,6 +947,8 @@ void __init vmalloc_init(void) INIT_LIST_HEAD(&vbq->dirty); vbq->nr_dirty = 0; } + + vmap_initialized = true; } void unmap_kernel_range(unsigned long addr, unsigned long size) -- cgit v1.2.3 From 1c1271850494f06b63ae6b485e2e1b9c27ffb2d1 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 12 Nov 2008 01:24:41 +0100 Subject: parisc: fix find_extend_vma() breakage The STACK_GROWSUP case of stack expansion was missing a test for 'prev', which got removed by commit cb8f488c33539f096580e202f5438a809195008f ("mmap.c: deinline a few functions") by mistake. I found my original email in "sent" folder. The patch in that mail does NOT remove !prev. That change had beed added by someone else. Ok, I think we are not much interested in who did it, let's fix it for good. [ "It looks like this was caused by me fixing rejects. That was the fancy include-lots-of-context-so-it-wont-apply patch." 
- akpm ] Reported-and-bisected-by: Helge Deller Signed-off-by: Denys Vlasenko Cc: Andrew Morton Cc: Jiri Kosina Signed-off-by: Linus Torvalds --- mm/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index de14ac21e5b..d4855a682ab 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1704,7 +1704,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) vma = find_vma_prev(mm, addr, &prev); if (vma && (vma->vm_start <= addr)) return vma; - if (expand_stack(prev, addr)) + if (!prev || expand_stack(prev, addr)) return NULL; if (prev->vm_flags & VM_LOCKED) { if (mlock_vma_pages_range(prev, addr, prev->vm_end) < 0) -- cgit v1.2.3 From 7526674de0c921e7f1e9b6f71a1f9d832557b554 Mon Sep 17 00:00:00 2001 From: Adam Litke Date: Wed, 12 Nov 2008 13:24:56 -0800 Subject: hugetlb: make unmap_ref_private multi-size-aware Oops. Part of the hugetlb private reservation code was not fully converted to use hstates. When a huge page must be unmapped from VMAs due to a failed COW, HPAGE_SIZE is used in the call to unmap_hugepage_range() regardless of the page size being used. This works if the VMA is using the default huge page size. Otherwise we might unmap too much, too little, or trigger a BUG_ON. Rare but serious -- fix it. Signed-off-by: Adam Litke Cc: Jon Tollefson Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d143ab67be4..6058b53dcb8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1796,6 +1796,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page, unsigned long address) { + struct hstate *h = hstate_vma(vma); struct vm_area_struct *iter_vma; struct address_space *mapping; struct prio_tree_iter iter; @@ -1805,7 +1806,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * vm_pgoff is in PAGE_SIZE units, hence the different calculation * from page cache lookup which is in HPAGE_SIZE units. */ - address = address & huge_page_mask(hstate_vma(vma)); + address = address & huge_page_mask(h); pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + (vma->vm_pgoff >> PAGE_SHIFT); mapping = (struct address_space *)page_private(page); @@ -1824,7 +1825,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, */ if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) unmap_hugepage_range(iter_vma, - address, address + HPAGE_SIZE, + address, address + huge_page_size(h), page); } -- cgit v1.2.3 From e33c3b5e172e2e45456f42fba47227d48745543f Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 12 Nov 2008 13:25:37 -0800 Subject: cpusets: update mems allowed in page allocator If all allowable memory is unreclaimable, it is possible to loop forever in the page allocator for ~__GFP_NORETRY allocations. During this time, it is also possible for a task's cpuset to expand its set of allowable nodes so that it now includes free memory. The cached copy of this set, current->mems_allowed, is stale, however, since there has not been a subsequent call to cpuset_update_task_memory_state(). The cached copy of the set of allowable nodes is now updated in the page allocator's slow path so the additional memory is available to get_page_from_freelist(). 
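The failure mode described above boils down to a cached snapshot going stale inside a retry loop. A minimal userspace model, with an invented bitmask standing in for mems_allowed, shows why the slow path has to refresh the cached copy before retrying:

#include <stdio.h>

static unsigned int authoritative_mems = 0x1;	/* only node 0 allowed at first */

/* Succeeds only if the allocator is permitted to use the node that has
 * free memory (pure illustration, not a kernel interface). */
static int try_alloc(unsigned int allowed, unsigned int node_with_free_mem)
{
	return (allowed & node_with_free_mem) != 0;
}

int main(void)
{
	unsigned int cached_mems = authoritative_mems;	/* task's snapshot */

	authoritative_mems |= 0x2;	/* cpuset expands to include node 1 */

	printf("stale cache:   %s\n",
	       try_alloc(cached_mems, 0x2) ? "allocation succeeds" : "loops forever");

	cached_mems = authoritative_mems;	/* the slow-path refresh */
	printf("after refresh: %s\n",
	       try_alloc(cached_mems, 0x2) ? "allocation succeeds" : "loops forever");
	return 0;
}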
[akpm@linux-foundation.org: add comment] Signed-off-by: David Rientjes Cc: Paul Menage Cc: Christoph Lameter Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 54069e64e3a..d8ac0147456 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1561,6 +1561,10 @@ nofail_alloc: /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); + /* + * The task's cpuset might have expanded its set of allowable nodes + */ + cpuset_update_task_memory_state(); p->flags |= PF_MEMALLOC; reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; -- cgit v1.2.3 From 8891d6da17db0f9bb507d3a017f130b9970c3087 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Wed, 12 Nov 2008 13:26:53 -0800 Subject: mm: remove lru_add_drain_all() from the munlock path lockdep warns about following message at boot time on one of my test machine. Then, schedule_on_each_cpu() sholdn't be called when the task have mmap_sem. Actually, lru_add_drain_all() exist to prevent the unevictalble pages stay on reclaimable lru list. but currenct unevictable code can rescue unevictable pages although it stay on reclaimable list. So removing is better. In addition, this patch add lru_add_drain_all() to sys_mlock() and sys_mlockall(). it isn't must. but it reduce the failure of moving to unevictable list. its failure can rescue in vmscan later. but reducing is better. Note, if above rescuing happend, the Mlocked and the Unevictable field mismatching happend in /proc/meminfo. but it doesn't cause any real trouble. ======================================================= [ INFO: possible circular locking dependency detected ] 2.6.28-rc2-mm1 #2 ------------------------------------------------------- lvm/1103 is trying to acquire lock: (&cpu_hotplug.lock){--..}, at: [] get_online_cpus+0x29/0x50 but task is already holding lock: (&mm->mmap_sem){----}, at: [] sys_mlockall+0x4e/0xb0 which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #3 (&mm->mmap_sem){----}: [] check_noncircular+0x82/0x110 [] might_fault+0x4a/0xa0 [] validate_chain+0xb11/0x1070 [] might_fault+0x4a/0xa0 [] __lock_acquire+0x263/0xa10 [] lock_acquire+0x7c/0xb0 (*) grab mmap_sem [] might_fault+0x4a/0xa0 [] might_fault+0x7b/0xa0 [] might_fault+0x4a/0xa0 [] copy_to_user+0x30/0x60 [] filldir+0x7c/0xd0 [] sysfs_readdir+0x11a/0x1f0 (*) grab sysfs_mutex [] filldir+0x0/0xd0 [] filldir+0x0/0xd0 [] vfs_readdir+0x86/0xa0 (*) grab i_mutex [] sys_getdents+0x6b/0xc0 [] syscall_call+0x7/0xb [] 0xffffffff -> #2 (sysfs_mutex){--..}: [] check_noncircular+0x82/0x110 [] sysfs_addrm_start+0x2c/0xc0 [] validate_chain+0xb11/0x1070 [] sysfs_addrm_start+0x2c/0xc0 [] __lock_acquire+0x263/0xa10 [] lock_acquire+0x7c/0xb0 (*) grab sysfs_mutex [] sysfs_addrm_start+0x2c/0xc0 [] mutex_lock_nested+0xa5/0x2f0 [] sysfs_addrm_start+0x2c/0xc0 [] sysfs_addrm_start+0x2c/0xc0 [] sysfs_addrm_start+0x2c/0xc0 [] create_dir+0x3f/0x90 [] sysfs_create_dir+0x29/0x50 [] _spin_unlock+0x25/0x40 [] kobject_add_internal+0xcd/0x1a0 [] kobject_set_name_vargs+0x3a/0x50 [] kobject_init_and_add+0x2d/0x40 [] sysfs_slab_add+0xd2/0x180 [] sysfs_add_func+0x0/0x70 [] sysfs_add_func+0x5c/0x70 (*) grab slub_lock [] run_workqueue+0x172/0x200 [] run_workqueue+0x10f/0x200 [] worker_thread+0x0/0xf0 [] worker_thread+0x9c/0xf0 [] autoremove_wake_function+0x0/0x50 [] worker_thread+0x0/0xf0 [] kthread+0x42/0x70 [] kthread+0x0/0x70 [] kernel_thread_helper+0x7/0x1c [] 0xffffffff -> #1 (slub_lock){----}: [] check_noncircular+0xd/0x110 [] slab_cpuup_callback+0x11f/0x1d0 [] validate_chain+0xb11/0x1070 [] slab_cpuup_callback+0x11f/0x1d0 [] mark_lock+0x35d/0xd00 [] __lock_acquire+0x263/0xa10 [] lock_acquire+0x7c/0xb0 [] slab_cpuup_callback+0x11f/0x1d0 [] down_read+0x43/0x80 [] slab_cpuup_callback+0x11f/0x1d0 (*) grab slub_lock [] slab_cpuup_callback+0x11f/0x1d0 [] notifier_call_chain+0x3c/0x70 [] _cpu_up+0x84/0x110 [] cpu_up+0x4b/0x70 (*) grab cpu_hotplug.lock [] kernel_init+0x0/0x170 [] kernel_init+0xb5/0x170 [] kernel_init+0x0/0x170 [] kernel_thread_helper+0x7/0x1c [] 0xffffffff -> #0 (&cpu_hotplug.lock){--..}: [] validate_chain+0x5af/0x1070 [] dev_status+0x0/0x50 [] __lock_acquire+0x263/0xa10 [] lock_acquire+0x7c/0xb0 [] get_online_cpus+0x29/0x50 [] mutex_lock_nested+0xa5/0x2f0 [] get_online_cpus+0x29/0x50 [] get_online_cpus+0x29/0x50 [] lru_add_drain_per_cpu+0x0/0x10 [] get_online_cpus+0x29/0x50 (*) grab cpu_hotplug.lock [] schedule_on_each_cpu+0x32/0xe0 [] __mlock_vma_pages_range+0x85/0x2c0 [] __lock_acquire+0x285/0xa10 [] vma_merge+0xa9/0x1d0 [] mlock_fixup+0x180/0x200 [] do_mlockall+0x78/0x90 (*) grab mmap_sem [] sys_mlockall+0x81/0xb0 [] syscall_call+0x7/0xb [] 0xffffffff other info that might help us debug this: 1 lock held by lvm/1103: #0: (&mm->mmap_sem){----}, at: [] sys_mlockall+0x4e/0xb0 stack backtrace: Pid: 1103, comm: lvm Not tainted 2.6.28-rc2-mm1 #2 Call Trace: [] print_circular_bug_tail+0x7c/0xd0 [] validate_chain+0x5af/0x1070 [] dev_status+0x0/0x50 [] __lock_acquire+0x263/0xa10 [] lock_acquire+0x7c/0xb0 [] get_online_cpus+0x29/0x50 [] mutex_lock_nested+0xa5/0x2f0 [] get_online_cpus+0x29/0x50 [] get_online_cpus+0x29/0x50 [] lru_add_drain_per_cpu+0x0/0x10 [] get_online_cpus+0x29/0x50 [] schedule_on_each_cpu+0x32/0xe0 [] __mlock_vma_pages_range+0x85/0x2c0 [] __lock_acquire+0x285/0xa10 [] vma_merge+0xa9/0x1d0 [] mlock_fixup+0x180/0x200 [] do_mlockall+0x78/0x90 [] sys_mlockall+0x81/0xb0 [] syscall_call+0x7/0xb Signed-off-by: KOSAKI Motohiro Tested-by: Kamalesh 
Babulal Cc: Lee Schermerhorn Cc: Christoph Lameter Cc: Heiko Carstens Cc: Nick Piggin Cc: Hugh Dickins Cc: Rik van Riel Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/mlock.c b/mm/mlock.c index 008ea70b7af..a6da2aee940 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -66,14 +66,10 @@ void __clear_page_mlock(struct page *page) putback_lru_page(page); } else { /* - * Page not on the LRU yet. Flush all pagevecs and retry. + * We lost the race. the page already moved to evictable list. */ - lru_add_drain_all(); - if (!isolate_lru_page(page)) - putback_lru_page(page); - else if (PageUnevictable(page)) + if (PageUnevictable(page)) count_vm_event(UNEVICTABLE_PGSTRANDED); - } } @@ -187,8 +183,6 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, if (vma->vm_flags & VM_WRITE) gup_flags |= GUP_FLAGS_WRITE; - lru_add_drain_all(); /* push cached pages to LRU */ - while (nr_pages > 0) { int i; @@ -251,8 +245,6 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, ret = 0; } - lru_add_drain_all(); /* to update stats */ - return ret; /* count entire vma as locked_vm */ } @@ -546,6 +538,8 @@ asmlinkage long sys_mlock(unsigned long start, size_t len) if (!can_do_mlock()) return -EPERM; + lru_add_drain_all(); /* flush pagevec */ + down_write(¤t->mm->mmap_sem); len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); start &= PAGE_MASK; @@ -612,6 +606,8 @@ asmlinkage long sys_mlockall(int flags) if (!can_do_mlock()) goto out; + lru_add_drain_all(); /* flush pagevec */ + down_write(¤t->mm->mmap_sem); lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; -- cgit v1.2.3 From 33c5d3d64589c5d379db5a5615735f6d08438369 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 12 Nov 2008 13:27:01 -0800 Subject: memcg: bugfix for memory hotplug The start pfn calculation in page_cgroup's memory hotplug notifier chain is wrong. Tested-by: Badari Pulavarty Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index f59d797dc5a..1223d927904 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -165,7 +165,7 @@ int online_page_cgroup(unsigned long start_pfn, unsigned long start, end, pfn; int fail = 0; - start = start_pfn & (PAGES_PER_SECTION - 1); + start = start_pfn & ~(PAGES_PER_SECTION - 1); end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) { @@ -188,7 +188,7 @@ int offline_page_cgroup(unsigned long start_pfn, { unsigned long start, end, pfn; - start = start_pfn & (PAGES_PER_SECTION - 1); + start = start_pfn & ~(PAGES_PER_SECTION - 1); end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) -- cgit v1.2.3 From 748f1a2ed7a68e15b28a1da3559afbebba121772 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Fri, 14 Nov 2008 16:25:01 +0900 Subject: mm: remove unevictable's show_page_path Hugh Dickins reported show_page_path() is buggy and unsafe because - lack dput() against d_find_alias() - don't concern vma->vm_mm->owner == NULL - lack lock_page() it was only for debugging, so rather than trying to fix it, just remove it now. 
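As an aside on the page_cgroup hotplug fix a little further up (the change from "start_pfn & (PAGES_PER_SECTION - 1)" to "start_pfn & ~(PAGES_PER_SECTION - 1)"), the difference between the two masks is easy to check with a standalone program; the section size and pfn below are arbitrary example values:

#include <stdio.h>

int main(void)
{
	unsigned long pages_per_section = 0x1000;	/* example value only */
	unsigned long start_pfn = 0x12345;

	printf("pfn & (N-1)  = %#lx  (offset within the section - wrong start)\n",
	       start_pfn & (pages_per_section - 1));
	printf("pfn & ~(N-1) = %#lx  (rounded down to the section start)\n",
	       start_pfn & ~(pages_per_section - 1));
	return 0;
}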
Reported-by: Hugh Dickins Signed-off-by: Hugh Dickins Signed-off-by: KOSAKI Motohiro CC: Lee Schermerhorn CC: Rik van Riel Signed-off-by: Linus Torvalds --- mm/vmscan.c | 35 ----------------------------------- 1 file changed, 35 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 3b5860294bb..c141b3e7807 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2368,39 +2368,6 @@ int page_evictable(struct page *page, struct vm_area_struct *vma) return 1; } -static void show_page_path(struct page *page) -{ - char buf[256]; - if (page_is_file_cache(page)) { - struct address_space *mapping = page->mapping; - struct dentry *dentry; - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - - spin_lock(&mapping->i_mmap_lock); - dentry = d_find_alias(mapping->host); - printk(KERN_INFO "rescued: %s %lu\n", - dentry_path(dentry, buf, 256), pgoff); - spin_unlock(&mapping->i_mmap_lock); - } else { -#if defined(CONFIG_MM_OWNER) && defined(CONFIG_MMU) - struct anon_vma *anon_vma; - struct vm_area_struct *vma; - - anon_vma = page_lock_anon_vma(page); - if (!anon_vma) - return; - - list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { - printk(KERN_INFO "rescued: anon %s\n", - vma->vm_mm->owner->comm); - break; - } - page_unlock_anon_vma(anon_vma); -#endif - } -} - - /** * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list * @page: page to check evictability and move to appropriate lru list @@ -2421,8 +2388,6 @@ retry: if (page_evictable(page, NULL)) { enum lru_list l = LRU_INACTIVE_ANON + page_is_file_cache(page); - show_page_path(page); - __dec_zone_state(zone, NR_UNEVICTABLE); list_move(&page->lru, &zone->lru[l].list); __inc_zone_state(zone, NR_INACTIVE_ANON + l); -- cgit v1.2.3 From 72eb8c6747b49e41fd2b042510f03ac7c13426fc Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Mon, 17 Nov 2008 00:30:57 +0100 Subject: unitialized return value in mm/mlock.c: __mlock_vma_pages_range() Fix an unitialized return value when compiling on parisc (with CONFIG_UNEVICTABLE_LRU=y): mm/mlock.c: In function `__mlock_vma_pages_range': mm/mlock.c:165: warning: `ret' might be used uninitialized in this function Signed-off-by: Helge Deller [ It isn't ever really used uninitialized, since no caller should ever call this function with an empty range. But the compiler is correct that from a local analysis standpoint that is impossible to see, and fixing the warning is appropriate. ] Signed-off-by: Linus Torvalds --- mm/mlock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mlock.c b/mm/mlock.c index a6da2aee940..1ada366570c 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -162,7 +162,7 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, unsigned long addr = start; struct page *pages[16]; /* 16 gives a reasonable batch */ int nr_pages = (end - start) / PAGE_SIZE; - int ret; + int ret = 0; int gup_flags = 0; VM_BUG_ON(start & ~PAGE_MASK); -- cgit v1.2.3 From f481891fdc49d3d1b8a9674a1825d183069a805f Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 19 Nov 2008 15:36:30 -0800 Subject: cpuset: update top cpuset's mems after adding a node After adding a node into the machine, top cpuset's mems isn't updated. By reviewing the code, we found that the update function cpuset_track_online_nodes() was invoked after node_states[N_ONLINE] changes. It is wrong because N_ONLINE just means node has pgdat, and if node has/added memory, we use N_HIGH_MEMORY. 
So, We should invoke the update function after node_states[N_HIGH_MEMORY] changes, just like its commit says. This patch fixes it. And we use notifier of memory hotplug instead of direct calling of cpuset_track_online_nodes(). Signed-off-by: Miao Xie Acked-by: Yasunori Goto Cc: David Rientjes Cc: Paul Menage Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6837a101437..b5b2b15085a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -498,8 +497,6 @@ int add_memory(int nid, u64 start, u64 size) /* we online node here. we can't roll back from here. */ node_set_online(nid); - cpuset_track_online_nodes(); - if (new_pgdat) { ret = register_one_node(nid); /* -- cgit v1.2.3 From f011c2dae6cffc50ef67d9bd937b488ba5db8913 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 19 Nov 2008 15:36:32 -0800 Subject: mm: vmalloc allocator off by one Fix off by one bug in the KVA allocator that can leave gaps in the address space. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ba6b0f5f7fa..46aab4dbf61 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -362,7 +362,7 @@ retry: goto found; } - while (addr + size >= first->va_start && addr + size <= vend) { + while (addr + size > first->va_start && addr + size <= vend) { addr = ALIGN(first->va_end + PAGE_SIZE, align); n = rb_next(&first->rb_node); -- cgit v1.2.3 From 496850e5f5a372029ceb2b35c811770a9bb073b6 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 19 Nov 2008 15:36:33 -0800 Subject: mm: vmalloc failure flush fix An initial vmalloc failure should start off a synchronous flush of lazy areas, in case someone is in progress flushing them already, which could cause us to return an allocation failure even if there is plenty of KVA free. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 46aab4dbf61..04f5e320e74 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -521,6 +521,17 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, spin_unlock(&purge_lock); } +/* + * Kick off a purge of the outstanding lazy areas. Don't bother if somebody + * is already purging. + */ +static void try_purge_vmap_area_lazy(void) +{ + unsigned long start = ULONG_MAX, end = 0; + + __purge_vmap_area_lazy(&start, &end, 0, 0); +} + /* * Kick off a purge of the outstanding lazy areas. 
*/ @@ -528,7 +539,7 @@ static void purge_vmap_area_lazy(void) { unsigned long start = ULONG_MAX, end = 0; - __purge_vmap_area_lazy(&start, &end, 0, 0); + __purge_vmap_area_lazy(&start, &end, 1, 0); } /* @@ -539,7 +550,7 @@ static void free_unmap_vmap_area(struct vmap_area *va) va->flags |= VM_LAZY_FREE; atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages())) - purge_vmap_area_lazy(); + try_purge_vmap_area_lazy(); } static struct vmap_area *find_vmap_area(unsigned long addr) -- cgit v1.2.3 From 0ae15132a4f5c758a6ffcde74495641dc3f62ba1 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Wed, 19 Nov 2008 15:36:33 -0800 Subject: mm: vmalloc search restart fix Current vmalloc restart search for a free area in case we can't find one. The reason is there are areas which are lazily freed, and could be possibly freed now. However, current implementation start searching the tree from the last failing address, which is pretty much by definition at the end of address space. So, we fail. The proposal of this patch is to restart the search from the beginning of the requested vstart address. This fixes the regression in running KVM virtual machines for me, described in http://lkml.org/lkml/2008/10/28/349, caused by commit db64fe02258f1507e13fe5212a989922323685ce. Signed-off-by: Glauber Costa Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 04f5e320e74..30f826d484f 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -324,14 +324,14 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, BUG_ON(size & ~PAGE_MASK); - addr = ALIGN(vstart, align); - va = kmalloc_node(sizeof(struct vmap_area), gfp_mask & GFP_RECLAIM_MASK, node); if (unlikely(!va)) return ERR_PTR(-ENOMEM); retry: + addr = ALIGN(vstart, align); + spin_lock(&vmap_area_lock); /* XXX: could have a last_hole cache */ n = vmap_area_root.rb_node; -- cgit v1.2.3 From bda8550deed96687f29992d711a88ea21cff4d26 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 19 Nov 2008 15:36:36 -0800 Subject: migration: fix writepage error Page migration's writeout() has got understandably confused by the nasty AOP_WRITEPAGE_ACTIVATE case: as in normal success, a writepage() error has unlocked the page, so writeout() then needs to relock it. Signed-off-by: Hugh Dickins Cc: KAMEZAWA Hiroyuki Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 385db89f0c3..1e0d6b237f4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -522,15 +522,12 @@ static int writeout(struct address_space *mapping, struct page *page) remove_migration_ptes(page, page); rc = mapping->a_ops->writepage(page, &wbc); - if (rc < 0) - /* I/O Error writing */ - return -EIO; if (rc != AOP_WRITEPAGE_ACTIVATE) /* unlocked. Relock */ lock_page(page); - return -EAGAIN; + return (rc < 0) ? -EIO : -EAGAIN; } /* -- cgit v1.2.3 From 63eb6b93ce725e4c5f38fc85dd703d49465b03cb Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 19 Nov 2008 15:36:37 -0800 Subject: vmscan: let GFP_NOFS go to swap again In the past, GFP_NOFS (but of course not GFP_NOIO) was allowed to reclaim by writing to swap. 
That got partially broken in 2.6.23, when may_enter_fs initialization was moved up before the allocation of swap, so its PageSwapCache test was failing the first time around, Fix it by setting may_enter_fs when add_to_swap() succeeds with __GFP_IO. In fact, check __GFP_IO before calling add_to_swap(): allocating swap we're not ready to use just increases disk seeking. Signed-off-by: Hugh Dickins Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index c141b3e7807..f83a7ed5c6c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -623,6 +623,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, * Try to allocate it some swap space here. */ if (PageAnon(page) && !PageSwapCache(page)) { + if (!(sc->gfp_mask & __GFP_IO)) + goto keep_locked; switch (try_to_munlock(page)) { case SWAP_FAIL: /* shouldn't happen */ case SWAP_AGAIN: @@ -634,6 +636,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, } if (!add_to_swap(page, GFP_ATOMIC)) goto activate_locked; + may_enter_fs = 1; } #endif /* CONFIG_SWAP */ -- cgit v1.2.3 From 00d8089c54867053a5aae062b765f257ca419e27 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 19 Nov 2008 15:36:44 -0800 Subject: vmscan: fix get_scan_ratio() comment Fix the old comment on the scan ratio calculations. Signed-off-by: Rik van Riel Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index f83a7ed5c6c..7ea1440b53d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1389,9 +1389,9 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, file_prio = 200 - sc->swappiness; /* - * anon recent_rotated[0] - * %anon = 100 * ----------- / ----------------- * IO cost - * anon + file rotate_sum + * The amount of pressure on anon vs file pages is inversely + * proportional to the fraction of recently scanned pages on + * each list that were recently referenced and in active use. 
*/ ap = (anon_prio + 1) * (zone->recent_scanned[0] + 1); ap /= zone->recent_rotated[0] + 1; -- cgit v1.2.3 From 31168481c32c8a485e1003af9433124dede57f8d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 22 Nov 2008 17:33:24 +0000 Subject: meminit section warnings Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 9 +++++---- mm/page_cgroup.c | 13 +++++++------ mm/sparse.c | 2 +- 3 files changed, 13 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b5b2b15085a..b1737118546 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -189,7 +189,7 @@ static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, pgdat->node_start_pfn; } -static int __add_zone(struct zone *zone, unsigned long phys_start_pfn) +static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) { struct pglist_data *pgdat = zone->zone_pgdat; int nr_pages = PAGES_PER_SECTION; @@ -216,7 +216,7 @@ static int __add_zone(struct zone *zone, unsigned long phys_start_pfn) return 0; } -static int __add_section(struct zone *zone, unsigned long phys_start_pfn) +static int __meminit __add_section(struct zone *zone, unsigned long phys_start_pfn) { int nr_pages = PAGES_PER_SECTION; int ret; @@ -273,7 +273,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms) * call this function after deciding the zone to which to * add the new pages. */ -int __add_pages(struct zone *zone, unsigned long phys_start_pfn, +int __ref __add_pages(struct zone *zone, unsigned long phys_start_pfn, unsigned long nr_pages) { unsigned long i; @@ -470,7 +470,8 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat) } -int add_memory(int nid, u64 start, u64 size) +/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ +int __ref add_memory(int nid, u64 start, u64 size) { pg_data_t *pgdat = NULL; int new_pgdat = 0; diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 1223d927904..436c00229e7 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -21,7 +21,7 @@ static unsigned long total_usage; #if !defined(CONFIG_SPARSEMEM) -void __init pgdat_page_cgroup_init(struct pglist_data *pgdat) +void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) { pgdat->node_page_cgroup = NULL; } @@ -97,7 +97,8 @@ struct page_cgroup *lookup_page_cgroup(struct page *page) return section->page_cgroup + pfn; } -int __meminit init_section_page_cgroup(unsigned long pfn) +/* __alloc_bootmem...() is protected by !slab_available() */ +int __init_refok init_section_page_cgroup(unsigned long pfn) { struct mem_section *section; struct page_cgroup *base, *pc; @@ -158,7 +159,7 @@ void __free_page_cgroup(unsigned long pfn) } } -int online_page_cgroup(unsigned long start_pfn, +int __meminit online_page_cgroup(unsigned long start_pfn, unsigned long nr_pages, int nid) { @@ -183,7 +184,7 @@ int online_page_cgroup(unsigned long start_pfn, return -ENOMEM; } -int offline_page_cgroup(unsigned long start_pfn, +int __meminit offline_page_cgroup(unsigned long start_pfn, unsigned long nr_pages, int nid) { unsigned long start, end, pfn; @@ -197,7 +198,7 @@ int offline_page_cgroup(unsigned long start_pfn, } -static int page_cgroup_callback(struct notifier_block *self, +static int __meminit page_cgroup_callback(struct notifier_block *self, unsigned long action, void *arg) { struct memory_notify *mn = arg; @@ -248,7 +249,7 @@ void __init page_cgroup_init(void) " want\n"); } -void __init pgdat_page_cgroup_init(struct 
pglist_data *pgdat) +void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) { return; } diff --git a/mm/sparse.c b/mm/sparse.c index 39db301b920..083f5b63e7a 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -570,7 +570,7 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap) * set. If this is <=0, then that means that the passed-in * map was not consumed and must be freed. */ -int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, +int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn, int nr_pages) { unsigned long section_nr = pfn_to_section_nr(start_pfn); -- cgit v1.2.3 From 2a1dc509747fdcfdf3a2df818a14908aed86c3d4 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 1 Dec 2008 03:00:35 +0100 Subject: vmscan: protect zone rotation stats by lru lock The zone's rotation statistics must not be accessed without the corresponding LRU lock held. Fix an unprotected write in shrink_active_list(). Acked-by: Rik van Riel Reviewed-by: KOSAKI Motohiro Signed-off-by: Johannes Weiner Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 7ea1440b53d..62e7f62fb55 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1248,6 +1248,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, list_add(&page->lru, &l_inactive); } + spin_lock_irq(&zone->lru_lock); /* * Count referenced pages from currently used mappings as * rotated, even though they are moved to the inactive list. @@ -1263,7 +1264,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, pgmoved = 0; lru = LRU_BASE + file * LRU_FILE; - spin_lock_irq(&zone->lru_lock); while (!list_empty(&l_inactive)) { page = lru_to_page(&l_inactive); prefetchw_prev_lru_page(page, &l_inactive, flags); -- cgit v1.2.3 From b29acbdcf877009af3f1fc0750bcac314c51e055 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 1 Dec 2008 13:13:47 -0800 Subject: mm: vmalloc fix lazy unmapping cache aliasing Jim Radford has reported that the vmap subsystem rewrite was sometimes causing his VIVT ARM system to behave strangely (seemed like going into infinite loops trying to fault in pages to userspace). We determined that the problem was most likely due to a cache aliasing issue. flush_cache_vunmap was only being called at the moment the page tables were to be taken down, however with lazy unmapping, this can happen after the page has subsequently been freed and allocated for something else. The dangling alias may still have dirty data attached to it. The fix for this problem is to do the cache flushing when the caller has called vunmap -- it would be a bug for them to write anything else to the mapping at that point. That appeared to solve Jim's problems. 
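To make the resulting contract concrete, here is a rough caller-side sketch (illustrative only, not part of the patch; vm_map_ram()/vm_unmap_ram() are the public helpers introduced by the vmap rewrite). Once vm_unmap_ram() returns, the caller must not touch the mapping again, which is what makes it safe to flush any dirty cache alias at unmap time rather than at lazy purge time:

	#include <linux/mm.h>
	#include <linux/string.h>
	#include <linux/vmalloc.h>

	/* Sketch: 'pages' is assumed to hold 'nr' valid struct page pointers. */
	static int copy_through_vmap(struct page **pages, unsigned int nr,
				     const void *src, size_t len)
	{
		void *va = vm_map_ram(pages, nr, -1, PAGE_KERNEL);

		if (!va)
			return -ENOMEM;
		memcpy(va, src, len);
		/*
		 * After vm_unmap_ram() returns, touching 'va' again would be a
		 * bug, so a possibly dirty cache alias can be flushed here, at
		 * unmap time, instead of later when the lazy purge finally runs.
		 */
		vm_unmap_ram(va, nr);
		return 0;
	}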
Reported-by: Jim Radford Signed-off-by: Nick Piggin Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 30f826d484f..f3f6e075856 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -77,7 +77,6 @@ static void vunmap_page_range(unsigned long addr, unsigned long end) BUG_ON(addr >= end); pgd = pgd_offset_k(addr); - flush_cache_vunmap(addr, end); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) @@ -543,9 +542,10 @@ static void purge_vmap_area_lazy(void) } /* - * Free and unmap a vmap area + * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been + * called for the correct range previously. */ -static void free_unmap_vmap_area(struct vmap_area *va) +static void free_unmap_vmap_area_noflush(struct vmap_area *va) { va->flags |= VM_LAZY_FREE; atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); @@ -553,6 +553,15 @@ static void free_unmap_vmap_area(struct vmap_area *va) try_purge_vmap_area_lazy(); } +/* + * Free and unmap a vmap area + */ +static void free_unmap_vmap_area(struct vmap_area *va) +{ + flush_cache_vunmap(va->va_start, va->va_end); + free_unmap_vmap_area_noflush(va); +} + static struct vmap_area *find_vmap_area(unsigned long addr) { struct vmap_area *va; @@ -734,7 +743,7 @@ static void free_vmap_block(struct vmap_block *vb) spin_unlock(&vmap_block_tree_lock); BUG_ON(tmp != vb); - free_unmap_vmap_area(vb->va); + free_unmap_vmap_area_noflush(vb->va); call_rcu(&vb->rcu_head, rcu_free_vb); } @@ -796,6 +805,9 @@ static void vb_free(const void *addr, unsigned long size) BUG_ON(size & ~PAGE_MASK); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); + + flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size); + order = get_order(size); offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1); -- cgit v1.2.3 From dc19f9db38295f811d9041bd89b113beccbd763a Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 1 Dec 2008 13:13:48 -0800 Subject: memcg: memory hotplug fix for notifier callback Fixes for memcg/memory hotplug. While memory hotplug allocates/frees the memmap, page_cgroup doesn't free its page_cgroup at OFFLINE when the page_cgroup was allocated via bootmem. (Because freeing bootmem requires special care.) Then, if page_cgroup is allocated by bootmem and the memmap is freed/allocated by memory hotplug, page_cgroup->page == page is no longer true. But the current MEM_ONLINE handler doesn't check for this, and doesn't update page_cgroup->page when it isn't necessary to allocate page_cgroup. (This was not found because the memmap is not freed if SPARSEMEM_VMEMMAP is y.) I also noticed that MEM_ONLINE can be called against "part of section". So, freeing page_cgroup at CANCEL_ONLINE would cause trouble (freeing page_cgroup that is still in use). Don't roll back at CANCEL. One more thing: the current memory hotplug notifier chain is stopped by slub because it sets NOTIFY_STOP_MASK in its return value, so page_cgroup's callback is never called (it now has lower priority than slub's). I think this slub behaviour is not intentional (a bug), and this fixes it. Another approach to page_cgroup allocation would be to free page_cgroup at OFFLINE even if it came from bootmem and remove the special handler. But that requires more changes.
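For illustration, the notifier convention this change settles on can be sketched for a hypothetical client callback (not the patched page_cgroup/slub code): return NOTIFY_OK on success and convert only real errors with notifier_from_errno(), because notifier_from_errno(0) in this kernel still sets NOTIFY_STOP_MASK and would silence lower-priority callbacks:

	#include <linux/memory.h>
	#include <linux/notifier.h>

	/* Hypothetical memory-hotplug notifier showing the return convention. */
	static int example_mem_callback(struct notifier_block *self,
					unsigned long action, void *arg)
	{
		int err = 0;

		switch (action) {
		case MEM_GOING_ONLINE:
			/* prepare per-section state here; may fail with -ENOMEM */
			break;
		default:
			break;
		}

		/* Only real errors go through notifier_from_errno(). */
		return err ? notifier_from_errno(err) : NOTIFY_OK;
	}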
Addresses http://bugzilla.kernel.org/show_bug.cgi?id=12041 Signed-off-by: KAMEZAWA Hiroyuki Cc: Li Zefan Cc: Balbir Singh Cc: Pavel Emelyanov Tested-by: Badari Pulavarty Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_cgroup.c | 43 +++++++++++++++++++++++++++++-------------- mm/slub.c | 6 ++++-- 2 files changed, 33 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 436c00229e7..0b3cbf090a6 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -107,19 +107,29 @@ int __init_refok init_section_page_cgroup(unsigned long pfn) section = __pfn_to_section(pfn); - if (section->page_cgroup) - return 0; - - nid = page_to_nid(pfn_to_page(pfn)); - - table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; - if (slab_is_available()) { - base = kmalloc_node(table_size, GFP_KERNEL, nid); - if (!base) - base = vmalloc_node(table_size, nid); - } else { - base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), table_size, + if (!section->page_cgroup) { + nid = page_to_nid(pfn_to_page(pfn)); + table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; + if (slab_is_available()) { + base = kmalloc_node(table_size, GFP_KERNEL, nid); + if (!base) + base = vmalloc_node(table_size, nid); + } else { + base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), + table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); + } + } else { + /* + * We don't have to allocate page_cgroup again, but + * address of memmap may be changed. So, we have to initialize + * again. + */ + base = section->page_cgroup + pfn; + table_size = 0; + /* check address of memmap is changed or not. */ + if (base->page == pfn_to_page(pfn)) + return 0; } if (!base) { @@ -208,18 +218,23 @@ static int __meminit page_cgroup_callback(struct notifier_block *self, ret = online_page_cgroup(mn->start_pfn, mn->nr_pages, mn->status_change_nid); break; - case MEM_CANCEL_ONLINE: case MEM_OFFLINE: offline_page_cgroup(mn->start_pfn, mn->nr_pages, mn->status_change_nid); break; + case MEM_CANCEL_ONLINE: case MEM_GOING_OFFLINE: break; case MEM_ONLINE: case MEM_CANCEL_OFFLINE: break; } - ret = notifier_from_errno(ret); + + if (ret) + ret = notifier_from_errno(ret); + else + ret = NOTIFY_OK; + return ret; } diff --git a/mm/slub.c b/mm/slub.c index 7ad489af956..749588a50a5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2931,8 +2931,10 @@ static int slab_memory_callback(struct notifier_block *self, case MEM_CANCEL_OFFLINE: break; } - - ret = notifier_from_errno(ret); + if (ret) + ret = notifier_from_errno(ret); + else + ret = NOTIFY_OK; return ret; } -- cgit v1.2.3 From f1d0b063d993527754f062c589b73f125024d216 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Tue, 2 Dec 2008 10:31:50 -0800 Subject: bdi: register sysfs bdi device only once per queue Devices which share the same queue, like floppies and mtd devices, get registered multiple times in the bdi interface, but bdi accounts only the last registered device of the devices sharing one queue. On remove, all earlier registered devices leak, stay around in sysfs, and cause "duplicate filename" errors if the devices are re-created. This prevents the creation of multiple bdi interfaces per queue; the bdi device carries the dev_t name of the first block device registered from the pool of devices sharing that queue.
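For illustration, this is roughly how such sharing arises (a hypothetical floppy-like driver using the standard block-layer calls of this era; not code from the patch, and error unwinding is omitted for brevity). Every unit's gendisk points at the same request_queue, so every add_disk() ends up registering the same q->backing_dev_info:

	#include <linux/blkdev.h>
	#include <linux/fs.h>
	#include <linux/genhd.h>
	#include <linux/module.h>

	#define EXAMPLE_MAJOR	240	/* hypothetical; a real driver registers its own */
	#define EXAMPLE_UNITS	4

	static struct block_device_operations example_fops = { .owner = THIS_MODULE };
	static DEFINE_SPINLOCK(example_lock);

	static void example_request(struct request_queue *q)
	{
		/* service requests for whichever unit they belong to */
	}

	static int __init example_init(void)
	{
		struct request_queue *q = blk_init_queue(example_request, &example_lock);
		int i;

		if (!q)
			return -ENOMEM;
		for (i = 0; i < EXAMPLE_UNITS; i++) {
			struct gendisk *disk = alloc_disk(1);

			if (!disk)
				return -ENOMEM;
			disk->major = EXAMPLE_MAJOR;
			disk->first_minor = i;
			disk->fops = &example_fops;
			disk->queue = q;	/* all units share q->backing_dev_info */
			sprintf(disk->disk_name, "exmpl%d", i);
			/*
			 * Each add_disk() registers q->backing_dev_info; with a
			 * shared queue only the first registration may create the
			 * sysfs bdi device, which is what this patch enforces.
			 */
			add_disk(disk);
		}
		return 0;
	}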
[akpm@linux-foundation.org: add a WARN_ON so we know which drivers are misbehaving] Tested-by: Peter Korsgaard Acked-by: Peter Zijlstra Signed-off-by: Kay Sievers Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/backing-dev.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index f2e574dbc30..2a56124dbc2 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -176,6 +176,9 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, int ret = 0; struct device *dev; + if (WARN_ON(bdi->dev)) + goto exit; + va_start(args, fmt); dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args); va_end(args); -- cgit v1.2.3 From 9ff473b9a72942c5ac0ad35607cae28d8d59ed7a Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 2 Dec 2008 10:31:52 -0800 Subject: vmscan: evict streaming IO first Count the insertion of new pages in the statistics used to drive the pageout scanning code. This should help the kernel quickly evict streaming file IO. We count on the fact that new file pages start on the inactive file LRU and new anonymous pages start on the active anon list. This means streaming file IO will increment the recent scanned file statistic, while leaving the recent rotated file statistic alone, driving pageout scanning to the file LRUs. Pageout activity does its own list manipulation. Signed-off-by: Rik van Riel Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Tested-by: Gene Heskett Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index 2152e48a7b8..2881987603e 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -445,6 +445,7 @@ void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; struct zone *pagezone = page_zone(page); + int file; if (pagezone != zone) { if (zone) @@ -456,8 +457,12 @@ void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) VM_BUG_ON(PageUnevictable(page)); VM_BUG_ON(PageLRU(page)); SetPageLRU(page); - if (is_active_lru(lru)) + file = is_file_lru(lru); + zone->recent_scanned[file]++; + if (is_active_lru(lru)) { SetPageActive(page); + zone->recent_rotated[file]++; + } add_page_to_lru_list(zone, page, lru); } if (zone) -- cgit v1.2.3 From 69fc208be5b7eb18d22d1eca185b201400fd5ffc Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 9 Dec 2008 13:14:06 -0800 Subject: mm/backing-dev.c: remove recently-added WARN_ON() On second thoughts, this is just going to disturb people while telling us things which we already knew. 
Cc: Peter Korsgaard Cc: Peter Zijlstra Cc: Kay Sievers Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/backing-dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 2a56124dbc2..801c08b046e 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -176,7 +176,7 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, int ret = 0; struct device *dev; - if (WARN_ON(bdi->dev)) + if (bdi->dev) /* The driver needs to use separate queues per device */ goto exit; va_start(args, fmt); -- cgit v1.2.3 From 6841c8e26357904ef462650273f5d5015f7bb370 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 9 Dec 2008 13:14:16 -0800 Subject: mm: remove UP version of lru_add_drain_all() Currently, lru_add_drain_all() has two versions: (1) one that uses schedule_on_each_cpu() and (2) one that does not. Gerald Schaefer reported that it doesn't work well on an SMP (non-NUMA) S390 machine. offline_pages() calls lru_add_drain_all() followed by drain_all_pages(). While drain_all_pages() works on each cpu, lru_add_drain_all() only runs on the current cpu for architectures w/o CONFIG_NUMA. This let us run into the BUG_ON(!PageBuddy(page)) in __offline_isolated_pages() during memory hotplug stress test on s390. The page in question was still on the pcp list, because of a race between lru_add_drain_all() and drain_all_pages() on different cpus. In practice, almost every machine has CONFIG_UNEVICTABLE_LRU=y and therefore uses version (1) of lru_add_drain_all() even when it is UP, so the ifdef has little value; simply removing it is better. Signed-off-by: KOSAKI Motohiro Cc: Christoph Lameter Cc: Lee Schermerhorn Acked-by: Gerald Schaefer Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index 2881987603e..b135ec90cde 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -299,7 +299,6 @@ void lru_add_drain(void) put_cpu(); } -#if defined(CONFIG_NUMA) || defined(CONFIG_UNEVICTABLE_LRU) static void lru_add_drain_per_cpu(struct work_struct *dummy) { lru_add_drain(); @@ -313,18 +312,6 @@ int lru_add_drain_all(void) return schedule_on_each_cpu(lru_add_drain_per_cpu); } -#else - -/* - * Returns 0 for success - */ -int lru_add_drain_all(void) -{ - lru_add_drain(); - return 0; -} -#endif - /* * Batched page_cache_release(). Decrement the reference count on all the * passed pages. If it fell to zero then remove the page from the LRU and -- cgit v1.2.3 From 653d22c0f5c41496c0e949ef5d141ab37c0b0580 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 9 Dec 2008 13:14:20 -0800 Subject: page_cgroup should ignore empty nodes Fix a total bootup freeze on ia64.
Signed-off-by: KAMEZAWA Hiroyuki Tested-by: Li Zefan Reported-by: Li Zefan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_cgroup.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 0b3cbf090a6..ab27ff75051 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -49,6 +49,9 @@ static int __init alloc_node_page_cgroup(int nid) start_pfn = NODE_DATA(nid)->node_start_pfn; nr_pages = NODE_DATA(nid)->node_spanned_pages; + if (!nr_pages) + return 0; + table_size = sizeof(struct page_cgroup) * nr_pages; base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), -- cgit v1.2.3 From 80bba1290ab5122c60cdb73332b26d288dc8aedd Mon Sep 17 00:00:00 2001 From: Brice Goglin Date: Tue, 9 Dec 2008 13:14:23 -0800 Subject: mm: no get_user/put_user while holding mmap_sem in do_pages_stat? Since commit 2f007e74bb85b9fc4eab28524052161703300f1a, do_pages_stat() gets the page address from user-space and puts the corresponding status back while holding the mmap_sem for read. There is no need to hold mmap_sem there while some page-faults may occur. This patch adds a temporary address and status buffer so as to only hold mmap_sem while working on these kernel buffers. This is implemented by extracting do_pages_stat_array() out of do_pages_stat(). Signed-off-by: Brice Goglin Cc: Christoph Lameter Cc: KAMEZAWA Hiroyuki Cc: Nick Piggin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 46 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 1e0d6b237f4..d8f07667fc8 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -987,25 +987,18 @@ out: /* * Determine the nodes of an array of pages and store it in an array of status. */ -static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages, - const void __user * __user *pages, - int __user *status) +static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, + const void __user **pages, int *status) { unsigned long i; - int err; down_read(&mm->mmap_sem); for (i = 0; i < nr_pages; i++) { - const void __user *p; - unsigned long addr; + unsigned long addr = (unsigned long)(*pages); struct vm_area_struct *vma; struct page *page; - - err = -EFAULT; - if (get_user(p, pages+i)) - goto out; - addr = (unsigned long) p; + int err; vma = find_vma(mm, addr); if (!vma) @@ -1024,12 +1017,52 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages, err = page_to_nid(page); set_status: - put_user(err, status+i); + *status = err; + + pages++; + status++; + } + + up_read(&mm->mmap_sem); +} + +/* + * Determine the nodes of a user array of pages and store it in + * a user array of status. 
+ */ +static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages, + const void __user * __user *pages, + int __user *status) +{ +#define DO_PAGES_STAT_CHUNK_NR 16 + const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR]; + int chunk_status[DO_PAGES_STAT_CHUNK_NR]; + unsigned long i, chunk_nr = DO_PAGES_STAT_CHUNK_NR; + int err; + + for (i = 0; i < nr_pages; i += chunk_nr) { + if (chunk_nr + i > nr_pages) + chunk_nr = nr_pages - i; + + err = copy_from_user(chunk_pages, &pages[i], + chunk_nr * sizeof(*chunk_pages)); + if (err) { + err = -EFAULT; + goto out; + } + + do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status); + + err = copy_to_user(&status[i], chunk_status, + chunk_nr * sizeof(*chunk_status)); + if (err) { + err = -EFAULT; + goto out; + } } err = 0; out: - up_read(&mm->mmap_sem); return err; } -- cgit v1.2.3 From 9c24624727f6d6c460e45762a408ca5f5b9b8ef2 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 9 Dec 2008 13:14:27 -0800 Subject: KSYM_SYMBOL_LEN fixes Miles Lane tailing /sys files hit a BUG which Pekka Enberg has tracked to my 966c8c12dc9e77f931e2281ba25d2f0244b06949 ("sprint_symbol(): use less stack") exposing a bug in slub's list_locations() - kallsyms_lookup() writes a 0 to namebuf[KSYM_NAME_LEN-1], but that was beyond the end of page provided. The 100 slop which list_locations() allows at end of page looks roughly enough for all the other stuff it might print after the symbol before it checks again: break out KSYM_SYMBOL_LEN earlier than before. Latencytop and ftrace are using KSYM_NAME_LEN buffers where they need KSYM_SYMBOL_LEN buffers, and vmallocinfo a 2*KSYM_NAME_LEN buffer where it wants a KSYM_SYMBOL_LEN buffer: fix those before anyone copies them. [akpm@linux-foundation.org: ftrace.h needs module.h] Signed-off-by: Hugh Dickins Cc: Christoph Lameter Cc: Miles Lane Acked-by: Pekka Enberg Acked-by: Steven Rostedt Acked-by: Frederic Weisbecker Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 2 +- mm/vmalloc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 749588a50a5..a2cd47d89e0 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3597,7 +3597,7 @@ static int list_locations(struct kmem_cache *s, char *buf, for (i = 0; i < t.count; i++) { struct location *l = &t.loc[i]; - if (len > PAGE_SIZE - 100) + if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100) break; len += sprintf(buf + len, "%7ld ", l->count); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f3f6e075856..1ddb77ba399 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1717,7 +1717,7 @@ static int s_show(struct seq_file *m, void *p) v->addr, v->addr + v->size, v->size); if (v->caller) { - char buff[2 * KSYM_NAME_LEN]; + char buff[KSYM_SYMBOL_LEN]; seq_putc(m, ' '); sprint_symbol(buff, (unsigned long)v->caller); -- cgit v1.2.3 From 5e18e2b8b3d453e68accc3e295643fe4b5bbc295 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 15 Dec 2008 13:54:16 -0800 Subject: slob: do not pass the SLAB flags as GFP in kmem_cache_create() The kmem_cache_create() function in the slob allocator passes the SLAB flags as GFP flags to the slob_alloc() function. The patch changes this call to pass GFP_KERNEL as the other allocators seem to do.
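To make the two flag namespaces explicit, a minimal illustrative caller (not from the patch): kmem_cache_create() takes SLAB_* cache-creation flags, while the allocator's internal allocation of the struct kmem_cache itself needs GFP_* flags such as GFP_KERNEL, which is exactly the confusion this patch removes from slob:

	#include <linux/errno.h>
	#include <linux/init.h>
	#include <linux/slab.h>

	static struct kmem_cache *example_cache;

	static int __init example_cache_init(void)
	{
		/* SLAB_* flags describe the cache; they are not allocation (GFP_*) flags. */
		example_cache = kmem_cache_create("example_cache", 64, 0,
						  SLAB_HWCACHE_ALIGN, NULL);
		return example_cache ? 0 : -ENOMEM;
	}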
Signed-off-by: Catalin Marinas Acked-by: Matt Mackall Cc: Cyrill Gorcunov Cc: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slob.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slob.c b/mm/slob.c index cb675d12679..bf7e8fc3aed 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -535,7 +535,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, struct kmem_cache *c; c = slob_alloc(sizeof(struct kmem_cache), - flags, ARCH_KMALLOC_MINALIGN, -1); + GFP_KERNEL, ARCH_KMALLOC_MINALIGN, -1); if (c) { c->name = name; -- cgit v1.2.3 From c095adbc211f9f4e990eac7d6cb440de35e4f05f Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 16 Dec 2008 16:06:43 +0900 Subject: mm: Don't touch uninitialized variable in do_pages_stat_array() Commit 80bba1290ab5122c60cdb73332b26d288dc8aedd removed one necessary variable initialization. As a result, the following warning appeared: CC mm/migrate.o mm/migrate.c: In function 'sys_move_pages': mm/migrate.c:1001: warning: 'err' may be used uninitialized in this function Worse, if find_vma() failed, the kernel read uninitialized memory. Signed-off-by: KOSAKI Motohiro CC: Brice Goglin Cc: Christoph Lameter Cc: KAMEZAWA Hiroyuki Cc: Nick Piggin Cc: Hugh Dickins Signed-off-by: Linus Torvalds --- mm/migrate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index d8f07667fc8..037b0967c1e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -998,7 +998,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, unsigned long addr = (unsigned long)(*pages); struct vm_area_struct *vma; struct page *page; - int err; + int err = -EFAULT; vma = find_vma(mm, addr); if (!vma) -- cgit v1.2.3
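For reference, a minimal userspace sketch (not from any patch above) of the status-query form of move_pages(2), which is the path served by do_pages_stat() and do_pages_stat_array(); it assumes the libnuma wrapper from <numaif.h> and linking with -lnuma:

	#include <numaif.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>

	#define NR_TEST_PAGES 4

	int main(void)
	{
		long page_size = sysconf(_SC_PAGESIZE);
		char *buf = malloc(NR_TEST_PAGES * page_size);
		void *pages[NR_TEST_PAGES];
		int status[NR_TEST_PAGES];
		int i;

		for (i = 0; i < NR_TEST_PAGES; i++) {
			pages[i] = buf + i * page_size;
			buf[i * page_size] = 0;	/* touch the page so it is populated */
		}

		/* nodes == NULL asks the kernel to report each page's node in status[] */
		if (move_pages(0, NR_TEST_PAGES, pages, NULL, status, 0) == 0)
			for (i = 0; i < NR_TEST_PAGES; i++)
				printf("page %d is on node %d\n", i, status[i]);

		free(buf);
		return 0;
	}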