From 802f192e4a600f7ef84ca25c8b818c8830acef5a Mon Sep 17 00:00:00 2001 From: Bob Picco Date: Sat, 3 Sep 2005 15:54:26 -0700 Subject: [PATCH] SPARSEMEM EXTREME A new option for SPARSEMEM is ARCH_SPARSEMEM_EXTREME. Architecture platforms with a very sparse physical address space would likely want to select this option. For those architecture platforms that don't select the option, the code generated is equivalent to SPARSEMEM currently in -mm. I'll be posting a patch on ia64 ml which uses this new SPARSEMEM feature. ARCH_SPARSEMEM_EXTREME makes mem_section a one dimensional array of pointers to mem_sections. This two level layout scheme is able to achieve smaller memory requirements for SPARSEMEM with the tradeoff of an additional shift and load when fetching the memory section. The current SPARSEMEM -mm implementation is a one dimensional array of mem_sections which is the default SPARSEMEM configuration. The patch attempts isolates the implementation details of the physical layout of the sparsemem section array. ARCH_SPARSEMEM_EXTREME depends on 64BIT and is by default boolean false. I've boot tested under aim load ia64 configured for ARCH_SPARSEMEM_EXTREME. I've also boot tested a 4 way Opteron machine with !ARCH_SPARSEMEM_EXTREME and tested with aim. Signed-off-by: Andy Whitcroft Signed-off-by: Bob Picco Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 9 +++++++++ mm/sparse.c | 38 ++++++++++++++++++++++++++++++++------ 2 files changed, 41 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index cd379936cac..fc644c5c065 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -89,3 +89,12 @@ config NEED_MULTIPLE_NODES config HAVE_MEMORY_PRESENT def_bool y depends on ARCH_HAVE_MEMORY_PRESENT || SPARSEMEM + +# +# Architectecture platforms which require a two level mem_section in SPARSEMEM +# must select this option. This is usually for architecture platforms with +# an extremely sparse physical address space. +# +config ARCH_SPARSEMEM_EXTREME + def_bool n + depends on SPARSEMEM && 64BIT diff --git a/mm/sparse.c b/mm/sparse.c index b54e304df4a..b2b456bf0a5 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -13,7 +13,26 @@ * * 1) mem_section - memory sections, mem_map's for valid memory */ -struct mem_section mem_section[NR_MEM_SECTIONS]; +#ifdef CONFIG_ARCH_SPARSEMEM_EXTREME +struct mem_section *mem_section[NR_SECTION_ROOTS] + ____cacheline_maxaligned_in_smp; + +static void sparse_index_init(unsigned long section, int nid) +{ + unsigned long root = SECTION_TO_ROOT(section); + + if (mem_section[root]) + return; + mem_section[root] = alloc_bootmem_node(NODE_DATA(nid), PAGE_SIZE); + if (mem_section[root]) + memset(mem_section[root], 0, PAGE_SIZE); + else + panic("memory_present: NO MEMORY\n"); +} +#else +struct mem_section mem_section[NR_MEM_SECTIONS] + ____cacheline_maxaligned_in_smp; +#endif EXPORT_SYMBOL(mem_section); /* Record a memory area against a node. 
*/ @@ -24,8 +43,13 @@ void memory_present(int nid, unsigned long start, unsigned long end) start &= PAGE_SECTION_MASK; for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) { unsigned long section = pfn_to_section_nr(pfn); - if (!mem_section[section].section_mem_map) - mem_section[section].section_mem_map = SECTION_MARKED_PRESENT; + struct mem_section *ms; + + sparse_index_init(section, nid); + + ms = __nr_to_section(section); + if (!ms->section_mem_map) + ms->section_mem_map = SECTION_MARKED_PRESENT; } } @@ -85,6 +109,7 @@ static struct page *sparse_early_mem_map_alloc(unsigned long pnum) { struct page *map; int nid = early_pfn_to_nid(section_nr_to_pfn(pnum)); + struct mem_section *ms = __nr_to_section(pnum); map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); if (map) @@ -96,7 +121,7 @@ static struct page *sparse_early_mem_map_alloc(unsigned long pnum) return map; printk(KERN_WARNING "%s: allocation failed\n", __FUNCTION__); - mem_section[pnum].section_mem_map = 0; + ms->section_mem_map = 0; return NULL; } @@ -114,8 +139,9 @@ void sparse_init(void) continue; map = sparse_early_mem_map_alloc(pnum); - if (map) - sparse_init_one_section(&mem_section[pnum], pnum, map); + if (!map) + continue; + sparse_init_one_section(__nr_to_section(pnum), pnum, map); } } -- cgit v1.2.3 From 3e347261a80b57df792ab9464b5f0ed59add53a8 Mon Sep 17 00:00:00 2001 From: Bob Picco Date: Sat, 3 Sep 2005 15:54:28 -0700 Subject: [PATCH] sparsemem extreme implementation With cleanups from Dave Hansen SPARSEMEM_EXTREME makes mem_section a one dimensional array of pointers to mem_sections. This two level layout scheme is able to achieve smaller memory requirements for SPARSEMEM with the tradeoff of an additional shift and load when fetching the memory section. The current SPARSEMEM implementation is a one dimensional array of mem_sections which is the default SPARSEMEM configuration. The patch attempts isolates the implementation details of the physical layout of the sparsemem section array. SPARSEMEM_EXTREME requires bootmem to be functioning at the time of memory_present() calls. This is not always feasible, so architectures which do not need it may allocate everything statically by using SPARSEMEM_STATIC. Signed-off-by: Andy Whitcroft Signed-off-by: Bob Picco Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 19 ++++++++++++++++--- mm/sparse.c | 26 +++++++++++++++++--------- 2 files changed, 33 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index fc644c5c065..4e9937ac352 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -90,11 +90,24 @@ config HAVE_MEMORY_PRESENT def_bool y depends on ARCH_HAVE_MEMORY_PRESENT || SPARSEMEM +# +# SPARSEMEM_EXTREME (which is the default) does some bootmem +# allocations when memory_present() is called. If this can not +# be done on your architecture, select this option. However, +# statically allocating the mem_section[] array can potentially +# consume vast quantities of .bss, so be careful. +# +# This option will also potentially produce smaller runtime code +# with gcc 3.4 and later. +# +config SPARSEMEM_STATIC + def_bool n + # # Architectecture platforms which require a two level mem_section in SPARSEMEM # must select this option. This is usually for architecture platforms with # an extremely sparse physical address space. 
# -config ARCH_SPARSEMEM_EXTREME - def_bool n - depends on SPARSEMEM && 64BIT +config SPARSEMEM_EXTREME + def_bool y + depends on SPARSEMEM && !SPARSEMEM_STATIC diff --git a/mm/sparse.c b/mm/sparse.c index b2b456bf0a5..fa01292157a 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -13,28 +13,36 @@ * * 1) mem_section - memory sections, mem_map's for valid memory */ -#ifdef CONFIG_ARCH_SPARSEMEM_EXTREME +#ifdef CONFIG_SPARSEMEM_EXTREME struct mem_section *mem_section[NR_SECTION_ROOTS] ____cacheline_maxaligned_in_smp; +#else +struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT] + ____cacheline_maxaligned_in_smp; +#endif +EXPORT_SYMBOL(mem_section); + +static void sparse_alloc_root(unsigned long root, int nid) +{ +#ifdef CONFIG_SPARSEMEM_EXTREME + mem_section[root] = alloc_bootmem_node(NODE_DATA(nid), PAGE_SIZE); +#endif +} static void sparse_index_init(unsigned long section, int nid) { - unsigned long root = SECTION_TO_ROOT(section); + unsigned long root = SECTION_NR_TO_ROOT(section); if (mem_section[root]) return; - mem_section[root] = alloc_bootmem_node(NODE_DATA(nid), PAGE_SIZE); + + sparse_alloc_root(root, nid); + if (mem_section[root]) memset(mem_section[root], 0, PAGE_SIZE); else panic("memory_present: NO MEMORY\n"); } -#else -struct mem_section mem_section[NR_MEM_SECTIONS] - ____cacheline_maxaligned_in_smp; -#endif -EXPORT_SYMBOL(mem_section); - /* Record a memory area against a node. */ void memory_present(int nid, unsigned long start, unsigned long end) { -- cgit v1.2.3 From 28ae55c98e4d16eac9a05a8a259d7763ef3aeb18 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Sat, 3 Sep 2005 15:54:29 -0700 Subject: [PATCH] sparsemem extreme: hotplug preparation This splits up sparse_index_alloc() into two pieces. This is needed because we'll allocate the memory for the second level in a different place from where we actually consume it to keep the allocation from happening underneath a lock Signed-off-by: Dave Hansen Signed-off-by: Bob Picco Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 53 +++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 41 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/sparse.c b/mm/sparse.c index fa01292157a..347249a4917 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -6,6 +6,7 @@ #include #include #include +#include #include /* @@ -22,27 +23,55 @@ struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT] #endif EXPORT_SYMBOL(mem_section); -static void sparse_alloc_root(unsigned long root, int nid) -{ #ifdef CONFIG_SPARSEMEM_EXTREME - mem_section[root] = alloc_bootmem_node(NODE_DATA(nid), PAGE_SIZE); -#endif +static struct mem_section *sparse_index_alloc(int nid) +{ + struct mem_section *section = NULL; + unsigned long array_size = SECTIONS_PER_ROOT * + sizeof(struct mem_section); + + section = alloc_bootmem_node(NODE_DATA(nid), array_size); + + if (section) + memset(section, 0, array_size); + + return section; } -static void sparse_index_init(unsigned long section, int nid) +static int sparse_index_init(unsigned long section_nr, int nid) { - unsigned long root = SECTION_NR_TO_ROOT(section); + static spinlock_t index_init_lock = SPIN_LOCK_UNLOCKED; + unsigned long root = SECTION_NR_TO_ROOT(section_nr); + struct mem_section *section; + int ret = 0; if (mem_section[root]) - return; + return -EEXIST; - sparse_alloc_root(root, nid); + section = sparse_index_alloc(nid); + /* + * This lock keeps two different sections from + * reallocating for the same index + */ + 
spin_lock(&index_init_lock); - if (mem_section[root]) - memset(mem_section[root], 0, PAGE_SIZE); - else - panic("memory_present: NO MEMORY\n"); + if (mem_section[root]) { + ret = -EEXIST; + goto out; + } + + mem_section[root] = section; +out: + spin_unlock(&index_init_lock); + return ret; } +#else /* !SPARSEMEM_EXTREME */ +static inline int sparse_index_init(unsigned long section_nr, int nid) +{ + return 0; +} +#endif + /* Record a memory area against a node. */ void memory_present(int nid, unsigned long start, unsigned long end) { -- cgit v1.2.3 From b0d9bcd4bb79a7834f8492f2ae5c2655a551f23d Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 3 Sep 2005 15:54:31 -0700 Subject: [PATCH] swap: update swapfile i_sem comment Update swap extents comment: nowadays we guard with S_SWAPFILE not i_sem. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 60cd24a5520..9f46d83b4ec 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -926,7 +926,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, * requirements, they are simply tossed out - we will never use those blocks * for swapping. * - * For S_ISREG swapfiles we hold i_sem across the life of the swapon. This + * For S_ISREG swapfiles we set S_SWAPFILE across the life of the swapon. This * prevents root from shooting her foot off by ftruncating an in-use swapfile, * which will scribble on the fs. * -- cgit v1.2.3 From e2244ec2efa4ee1edf391d0001d314933e2b2974 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 3 Sep 2005 15:54:32 -0700 Subject: [PATCH] swap: correct swapfile nr_good_pages If a regular swapfile lies on a filesystem whose blocksize is less than PAGE_SIZE, then setup_swap_extents may have to cut the number of usable swap pages; but sys_swapon's nr_good_pages was not expecting that. Also, setup_swap_extents takes no account of badpages listed in the swap header: not worth doing so, but ensure nr_badpages is 0 for a regular swapfile. 
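For illustration, assume a 4096-byte PAGE_SIZE over a 1k-block filesystem, so four backing blocks per swap page. A minimal sketch of the per-page test that causes the shortfall follows (the helper name and shape are illustrative, not the kernel function): pages whose blocks do not form one page-aligned, contiguous run on disk are tossed by setup_swap_extents(), so sis->pages can end up below the maxpages - 1 that the swap header advertised.

    /*
     * blocks[] holds the bmap() results for the blocks backing one swap page.
     * A page failing this test is skipped, which is what shrinks the number
     * of good pages below the header's last_page count.
     */
    static int page_has_aligned_run(sector_t *blocks, unsigned blocks_per_page)
    {
            unsigned i;

            if (blocks[0] & (blocks_per_page - 1))  /* run must start page-aligned */
                    return 0;
            for (i = 1; i < blocks_per_page; i++)
                    if (blocks[i] != blocks[0] + i)  /* and stay contiguous on disk */
                            return 0;
            return 1;
    }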
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 9f46d83b4ec..5ac5333f37a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1008,8 +1008,9 @@ reprobe: } ret = 0; if (page_no == 0) - ret = -EINVAL; + page_no = 1; /* force Empty message */ sis->max = page_no; + sis->pages = page_no - 1; sis->highest_bit = page_no - 1; done: sis->curr_swap_extent = list_entry(sis->extent_list.prev, @@ -1446,6 +1447,10 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) p->highest_bit = maxpages - 1; error = -EINVAL; + if (!maxpages) + goto bad_swap; + if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) + goto bad_swap; if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) goto bad_swap; @@ -1470,25 +1475,27 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) if (error) goto bad_swap; } - + if (swapfilesize && maxpages > swapfilesize) { printk(KERN_WARNING "Swap area shorter than signature indicates\n"); error = -EINVAL; goto bad_swap; } + if (nr_good_pages) { + p->swap_map[0] = SWAP_MAP_BAD; + p->max = maxpages; + p->pages = nr_good_pages; + error = setup_swap_extents(p); + if (error) + goto bad_swap; + nr_good_pages = p->pages; + } if (!nr_good_pages) { printk(KERN_WARNING "Empty swap-file\n"); error = -EINVAL; goto bad_swap; } - p->swap_map[0] = SWAP_MAP_BAD; - p->max = maxpages; - p->pages = nr_good_pages; - - error = setup_swap_extents(p); - if (error) - goto bad_swap; down(&swapon_sem); swap_list_lock(); -- cgit v1.2.3 From 4cd3bb10ff0b21b77b5a4cd13b4bd36694e054c4 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 3 Sep 2005 15:54:33 -0700 Subject: [PATCH] swap: move destroy_swap_extents calls sys_swapon's call to destroy_swap_extents on failure is made after the final swap_list_unlock, which is faintly unsafe: another sys_swapon might already be setting up that swap_info_struct. Calling it earlier, before taking swap_list_lock, is safe. sys_swapoff's call to destroy_swap_extents was safe, but likewise move it earlier, before taking the locks (once try_to_unuse has completed, nothing can be needing the swap extents). 
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 5ac5333f37a..4b39e9501d4 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1129,6 +1129,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile) swap_list_unlock(); goto out_dput; } + destroy_swap_extents(p); down(&swapon_sem); swap_list_lock(); drain_mmlist(); @@ -1139,7 +1140,6 @@ asmlinkage long sys_swapoff(const char __user * specialfile) swap_map = p->swap_map; p->swap_map = NULL; p->flags = 0; - destroy_swap_extents(p); swap_device_unlock(p); swap_list_unlock(); up(&swapon_sem); @@ -1531,6 +1531,7 @@ bad_swap: set_blocksize(bdev, p->old_block_size); bd_release(bdev); } + destroy_swap_extents(p); bad_swap_2: swap_list_lock(); swap_map = p->swap_map; @@ -1540,7 +1541,6 @@ bad_swap_2: if (!(swap_flags & SWAP_FLAG_PREFER)) ++least_priority; swap_list_unlock(); - destroy_swap_extents(p); vfree(swap_map); if (swap_file) filp_close(swap_file, NULL); -- cgit v1.2.3 From 11d31886dbcb61039ed3789e583d21c6e70960fd Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 3 Sep 2005 15:54:34 -0700 Subject: [PATCH] swap: swap extent list is ordered There are several comments that swap's extent_list.prev points to the lowest extent: that's not so, it's extent_list.next which points to it, as you'd expect. And a couple of loops in add_swap_extent which go all the way through the list, when they should just add to the other end. Fix those up, and let map_swap_page search the list forwards: profiles shows it to be twice as quick that way - because prefetch works better on how the structs are typically kmalloc'ed? or because usually more is written to than read from swap, and swap is allocated ascendingly? Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 4b39e9501d4..73521d39e98 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -832,9 +832,9 @@ sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset) offset < (se->start_page + se->nr_pages)) { return se->start_block + (offset - se->start_page); } - lh = se->list.prev; + lh = se->list.next; if (lh == &sis->extent_list) - lh = lh->prev; + lh = lh->next; se = list_entry(lh, struct swap_extent, list); sis->curr_swap_extent = se; BUG_ON(se == start_se); /* It *must* be present */ @@ -859,10 +859,9 @@ static void destroy_swap_extents(struct swap_info_struct *sis) /* * Add a block range (and the corresponding page range) into this swapdev's - * extent list. The extent list is kept sorted in block order. + * extent list. The extent list is kept sorted in page order. * - * This function rather assumes that it is called in ascending sector_t order. - * It doesn't look for extent coalescing opportunities. + * This function rather assumes that it is called in ascending page order. 
*/ static int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, @@ -872,16 +871,15 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, struct swap_extent *new_se; struct list_head *lh; - lh = sis->extent_list.next; /* The highest-addressed block */ - while (lh != &sis->extent_list) { + lh = sis->extent_list.prev; /* The highest page extent */ + if (lh != &sis->extent_list) { se = list_entry(lh, struct swap_extent, list); - if (se->start_block + se->nr_pages == start_block && - se->start_page + se->nr_pages == start_page) { + BUG_ON(se->start_page + se->nr_pages != start_page); + if (se->start_block + se->nr_pages == start_block) { /* Merge it */ se->nr_pages += nr_pages; return 0; } - lh = lh->next; } /* @@ -894,14 +892,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, new_se->nr_pages = nr_pages; new_se->start_block = start_block; - lh = sis->extent_list.prev; /* The lowest block */ - while (lh != &sis->extent_list) { - se = list_entry(lh, struct swap_extent, list); - if (se->start_block > start_block) - break; - lh = lh->prev; - } - list_add_tail(&new_se->list, lh); + list_add_tail(&new_se->list, &sis->extent_list); sis->nr_extents++; return 0; } -- cgit v1.2.3 From 53092a7402f227151a681b0c92ec8598c5618b1a Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 3 Sep 2005 15:54:34 -0700 Subject: [PATCH] swap: show span of swap extents The "Adding %dk swap" message shows the number of swap extents, as a guide to how fragmented the swapfile may be. But a useful further guide is what total extent they span across (sometimes scarily large). And there's no need to keep nr_extents in swap_info: it's unused after the initial message, so save a little space by keeping it on stack. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 44 ++++++++++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 73521d39e98..d4da84ee392 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -854,7 +854,6 @@ static void destroy_swap_extents(struct swap_info_struct *sis) list_del(&se->list); kfree(se); } - sis->nr_extents = 0; } /* @@ -893,8 +892,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, new_se->start_block = start_block; list_add_tail(&new_se->list, &sis->extent_list); - sis->nr_extents++; - return 0; + return 1; } /* @@ -928,7 +926,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, * This is extremely effective. The average number of iterations in * map_swap_page() has been measured at about 0.3 per page. - akpm. 
*/ -static int setup_swap_extents(struct swap_info_struct *sis) +static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) { struct inode *inode; unsigned blocks_per_page; @@ -936,11 +934,15 @@ static int setup_swap_extents(struct swap_info_struct *sis) unsigned blkbits; sector_t probe_block; sector_t last_block; + sector_t lowest_block = -1; + sector_t highest_block = 0; + int nr_extents = 0; int ret; inode = sis->swap_file->f_mapping->host; if (S_ISBLK(inode->i_mode)) { ret = add_swap_extent(sis, 0, sis->max, 0); + *span = sis->pages; goto done; } @@ -985,19 +987,28 @@ static int setup_swap_extents(struct swap_info_struct *sis) } } + first_block >>= (PAGE_SHIFT - blkbits); + if (page_no) { /* exclude the header page */ + if (first_block < lowest_block) + lowest_block = first_block; + if (first_block > highest_block) + highest_block = first_block; + } + /* * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks */ - ret = add_swap_extent(sis, page_no, 1, - first_block >> (PAGE_SHIFT - blkbits)); - if (ret) + ret = add_swap_extent(sis, page_no, 1, first_block); + if (ret < 0) goto out; + nr_extents += ret; page_no++; probe_block += blocks_per_page; reprobe: continue; } - ret = 0; + ret = nr_extents; + *span = 1 + highest_block - lowest_block; if (page_no == 0) page_no = 1; /* force Empty message */ sis->max = page_no; @@ -1265,6 +1276,8 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) union swap_header *swap_header = NULL; int swap_header_version; int nr_good_pages = 0; + int nr_extents; + sector_t span; unsigned long maxpages = 1; int swapfilesize; unsigned short *swap_map; @@ -1300,7 +1313,6 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) nr_swapfiles = type+1; INIT_LIST_HEAD(&p->extent_list); p->flags = SWP_USED; - p->nr_extents = 0; p->swap_file = NULL; p->old_block_size = 0; p->swap_map = NULL; @@ -1477,9 +1489,11 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) p->swap_map[0] = SWAP_MAP_BAD; p->max = maxpages; p->pages = nr_good_pages; - error = setup_swap_extents(p); - if (error) + nr_extents = setup_swap_extents(p, &span); + if (nr_extents < 0) { + error = nr_extents; goto bad_swap; + } nr_good_pages = p->pages; } if (!nr_good_pages) { @@ -1494,9 +1508,11 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) p->flags = SWP_ACTIVE; nr_swap_pages += nr_good_pages; total_swap_pages += nr_good_pages; - printk(KERN_INFO "Adding %dk swap on %s. Priority:%d extents:%d\n", - nr_good_pages<<(PAGE_SHIFT-10), name, - p->prio, p->nr_extents); + + printk(KERN_INFO "Adding %dk swap on %s. " + "Priority:%d extents:%d across:%lluk\n", + nr_good_pages<<(PAGE_SHIFT-10), name, p->prio, + nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10)); /* insert swap space into swap_list: */ prev = -1; -- cgit v1.2.3 From 6eb396dc4a9781c5e7951143ab56ce5710687ab3 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 3 Sep 2005 15:54:35 -0700 Subject: [PATCH] swap: swap unsigned int consistency The swap header's unsigned int last_page determines the range of swap pages, but swap_info has been using int or unsigned long in some cases: use unsigned int throughout (except, in several places a local unsigned long is useful to avoid overflows when adding). 
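A tiny illustration of the overflow point, with made-up values and assuming a 64-bit kernel where unsigned long is wider than unsigned int:

    unsigned int last_page = 0xfffffffeU;                    /* near the 32-bit limit */
    unsigned int wraps = last_page + 2;                      /* wraps around to 0 */
    unsigned long maxpages = (unsigned long)last_page + 2;   /* 0x100000000, exact */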
Signed-off-by: Hugh Dickins Signed-off-by: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index d4da84ee392..6cc6dfb4d27 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -84,7 +84,7 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) up_read(&swap_unplug_sem); } -static inline int scan_swap_map(struct swap_info_struct *si) +static inline unsigned long scan_swap_map(struct swap_info_struct *si) { unsigned long offset; /* @@ -531,10 +531,11 @@ static int unuse_mm(struct mm_struct *mm, * Scan swap_map from current position to next entry still in use. * Recycle to start on reaching the end, returning 0 when empty. */ -static int find_next_to_unuse(struct swap_info_struct *si, int prev) +static unsigned int find_next_to_unuse(struct swap_info_struct *si, + unsigned int prev) { - int max = si->max; - int i = prev; + unsigned int max = si->max; + unsigned int i = prev; int count; /* @@ -577,7 +578,7 @@ static int try_to_unuse(unsigned int type) unsigned short swcount; struct page *page; swp_entry_t entry; - int i = 0; + unsigned int i = 0; int retval = 0; int reset_overflow = 0; int shmem; @@ -1216,7 +1217,7 @@ static int swap_show(struct seq_file *swap, void *v) file = ptr->swap_file; len = seq_path(swap, file->f_vfsmnt, file->f_dentry, " \t\n\\"); - seq_printf(swap, "%*s%s\t%d\t%ld\t%d\n", + seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", len < 40 ? 40 - len : 1, " ", S_ISBLK(file->f_dentry->d_inode->i_mode) ? "partition" : "file\t", @@ -1275,8 +1276,8 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) static int least_priority; union swap_header *swap_header = NULL; int swap_header_version; - int nr_good_pages = 0; - int nr_extents; + unsigned int nr_good_pages = 0; + int nr_extents = 0; sector_t span; unsigned long maxpages = 1; int swapfilesize; @@ -1509,7 +1510,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) nr_swap_pages += nr_good_pages; total_swap_pages += nr_good_pages; - printk(KERN_INFO "Adding %dk swap on %s. " + printk(KERN_INFO "Adding %uk swap on %s. " "Priority:%d extents:%d across:%lluk\n", nr_good_pages<<(PAGE_SHIFT-10), name, p->prio, nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10)); -- cgit v1.2.3 From 89d09a2c80ea6baafb559b86d545fada05e14ab5 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 3 Sep 2005 15:54:36 -0700 Subject: [PATCH] swap: freeing update swap_list.next This makes negligible difference in practice: but swap_list.next should not be updated to a higher prio in the general helper swap_info_get, but rather in swap_entry_free; and then only in the case when entry is actually freed. 
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 6cc6dfb4d27..62e0da8f7e6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -215,8 +215,6 @@ static struct swap_info_struct * swap_info_get(swp_entry_t entry) if (!p->swap_map[offset]) goto bad_free; swap_list_lock(); - if (p->prio > swap_info[swap_list.next].prio) - swap_list.next = type; swap_device_lock(p); return p; @@ -253,6 +251,8 @@ static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) p->lowest_bit = offset; if (offset > p->highest_bit) p->highest_bit = offset; + if (p->prio > swap_info[swap_list.next].prio) + swap_list.next = p - swap_info; nr_swap_pages++; p->inuse_pages--; } -- cgit v1.2.3 From fb4f88dcabdc716c7c350e09cf4a38a419b007e1 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 3 Sep 2005 15:54:37 -0700 Subject: [PATCH] swap: get_swap_page drop swap_list_lock Rewrite get_swap_page to allocate in just the same sequence as before, but without holding swap_list_lock across its scan_swap_map. Decrement nr_swap_pages and update swap_list.next in advance, while still holding swap_list_lock. Skip full devices by testing highest_bit. Swapoff hold swap_device_lock as well as swap_list_lock to clear SWP_WRITEOK. Reduces lock contention when there are parallel swap devices of the same priority. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 75 ++++++++++++++++++++++++++++------------------------------- 1 file changed, 36 insertions(+), 39 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 62e0da8f7e6..e54d60af6b5 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -139,7 +139,6 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) } si->swap_map[offset] = 1; si->inuse_pages++; - nr_swap_pages--; si->cluster_next = offset+1; return offset; } @@ -150,50 +149,45 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) swp_entry_t get_swap_page(void) { - struct swap_info_struct * p; - unsigned long offset; - swp_entry_t entry; - int type, wrapped = 0; + struct swap_info_struct *si; + pgoff_t offset; + int type, next; + int wrapped = 0; - entry.val = 0; /* Out of memory */ swap_list_lock(); - type = swap_list.next; - if (type < 0) - goto out; if (nr_swap_pages <= 0) - goto out; - - while (1) { - p = &swap_info[type]; - if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) { - swap_device_lock(p); - offset = scan_swap_map(p); - swap_device_unlock(p); - if (offset) { - entry = swp_entry(type,offset); - type = swap_info[type].next; - if (type < 0 || - p->prio != swap_info[type].prio) { - swap_list.next = swap_list.head; - } else { - swap_list.next = type; - } - goto out; - } + goto noswap; + nr_swap_pages--; + + for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { + si = swap_info + type; + next = si->next; + if (next < 0 || + (!wrapped && si->prio != swap_info[next].prio)) { + next = swap_list.head; + wrapped++; } - type = p->next; - if (!wrapped) { - if (type < 0 || p->prio != swap_info[type].prio) { - type = swap_list.head; - wrapped = 1; - } - } else - if (type < 0) - goto out; /* out of swap space */ + + if (!si->highest_bit) + continue; + if (!(si->flags & SWP_WRITEOK)) + continue; + + swap_list.next = next; + swap_device_lock(si); + swap_list_unlock(); + offset = scan_swap_map(si); + swap_device_unlock(si); 
+ if (offset) + return swp_entry(type, offset); + swap_list_lock(); + next = swap_list.next; } -out: + + nr_swap_pages++; +noswap: swap_list_unlock(); - return entry; + return (swp_entry_t) {0}; } static struct swap_info_struct * swap_info_get(swp_entry_t entry) @@ -1105,8 +1099,11 @@ asmlinkage long sys_swapoff(const char __user * specialfile) } nr_swap_pages -= p->pages; total_swap_pages -= p->pages; + swap_device_lock(p); p->flags &= ~SWP_WRITEOK; + swap_device_unlock(p); swap_list_unlock(); + current->flags |= PF_SWAPOFF; err = try_to_unuse(type); current->flags &= ~PF_SWAPOFF; -- cgit v1.2.3 From 7dfad4183bf9cd92f977caa3c12cc74f0eefc0e6 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 3 Sep 2005 15:54:38 -0700 Subject: [PATCH] swap: scan_swap_map restyled Rewrite scan_swap_map to allocate in just the same way as before (taking the next free entry SWAPFILE_CLUSTER-1 times, then restarting at the lowest wholly empty cluster, falling back to lowest entry if none), but with a view towards dropping the lock in the next patch. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 93 ++++++++++++++++++++++++++++++----------------------------- 1 file changed, 48 insertions(+), 45 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index e54d60af6b5..c70248aab53 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -86,64 +86,67 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) static inline unsigned long scan_swap_map(struct swap_info_struct *si) { - unsigned long offset; + unsigned long offset, last_in_cluster; + /* - * We try to cluster swap pages by allocating them - * sequentially in swap. Once we've allocated - * SWAPFILE_CLUSTER pages this way, however, we resort to - * first-free allocation, starting a new cluster. This - * prevents us from scattering swap pages all over the entire - * swap partition, so that we reduce overall disk seek times - * between swap pages. -- sct */ - if (si->cluster_nr) { - while (si->cluster_next <= si->highest_bit) { - offset = si->cluster_next++; + * We try to cluster swap pages by allocating them sequentially + * in swap. Once we've allocated SWAPFILE_CLUSTER pages this + * way, however, we resort to first-free allocation, starting + * a new cluster. This prevents us from scattering swap pages + * all over the entire swap partition, so that we reduce + * overall disk seek times between swap pages. -- sct + * But we do now try to find an empty cluster. -Andrea + */ + + if (unlikely(!si->cluster_nr)) { + si->cluster_nr = SWAPFILE_CLUSTER - 1; + if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) + goto lowest; + + offset = si->lowest_bit; + last_in_cluster = offset + SWAPFILE_CLUSTER - 1; + + /* Locate the first empty (unaligned) cluster */ + for (; last_in_cluster <= si->highest_bit; offset++) { if (si->swap_map[offset]) - continue; - si->cluster_nr--; - goto got_page; - } - } - si->cluster_nr = SWAPFILE_CLUSTER; - - /* try to find an empty (even not aligned) cluster. */ - offset = si->lowest_bit; - check_next_cluster: - if (offset+SWAPFILE_CLUSTER-1 <= si->highest_bit) - { - unsigned long nr; - for (nr = offset; nr < offset+SWAPFILE_CLUSTER; nr++) - if (si->swap_map[nr]) - { - offset = nr+1; - goto check_next_cluster; + last_in_cluster = offset + SWAPFILE_CLUSTER; + else if (offset == last_in_cluster) { + si->cluster_next = offset-SWAPFILE_CLUSTER-1; + goto cluster; } - /* We found a completly empty cluster, so start - * using it. 
- */ - goto got_page; + } + goto lowest; } - /* No luck, so now go finegrined as usual. -Andrea */ - for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) { - if (si->swap_map[offset]) - continue; - si->lowest_bit = offset+1; - got_page: - if (offset == si->lowest_bit) + + si->cluster_nr--; +cluster: + offset = si->cluster_next; + if (offset > si->highest_bit) +lowest: offset = si->lowest_bit; + if (!si->highest_bit) + goto no_page; + if (!si->swap_map[offset]) { +got_page: if (offset == si->lowest_bit) si->lowest_bit++; if (offset == si->highest_bit) si->highest_bit--; - if (si->lowest_bit > si->highest_bit) { + si->inuse_pages++; + if (si->inuse_pages == si->pages) { si->lowest_bit = si->max; si->highest_bit = 0; } si->swap_map[offset] = 1; - si->inuse_pages++; - si->cluster_next = offset+1; + si->cluster_next = offset + 1; return offset; } - si->lowest_bit = si->max; - si->highest_bit = 0; + + while (++offset <= si->highest_bit) { + if (!si->swap_map[offset]) + goto got_page; + } + goto lowest; + +no_page: return 0; } -- cgit v1.2.3 From 52b7efdbe5f5696fc80338560a3fc51e0b0a993c Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 3 Sep 2005 15:54:39 -0700 Subject: [PATCH] swap: scan_swap_map drop swap_device_lock get_swap_page has often shown up on latency traces, doing lengthy scans while holding two spinlocks. swap_list_lock is already dropped, now scan_swap_map drop swap_device_lock before scanning the swap_map. While scanning for an empty cluster, don't worry that racing tasks may allocate what was free and free what was allocated; but when allocating an entry, check it's still free after retaking the lock. Avoid dropping the lock in the expected common path. No barriers beyond the locks, just let the cookie crumble; highest_bit limit is volatile, but benign. Guard against swapoff: must check SWP_WRITEOK before allocating, must raise SWP_SCANNING reference count while in scan_swap_map, swapoff wait for that to fall - just use schedule_timeout, we don't want to burden scan_swap_map itself, and it's very unlikely that anyone can really still be in scan_swap_map once swapoff gets this far. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 42 +++++++++++++++++++++++++++++++++++------- 1 file changed, 35 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index c70248aab53..fdee145afc6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -98,10 +98,12 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) * But we do now try to find an empty cluster. 
-Andrea */ + si->flags += SWP_SCANNING; if (unlikely(!si->cluster_nr)) { si->cluster_nr = SWAPFILE_CLUSTER - 1; if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) goto lowest; + swap_device_unlock(si); offset = si->lowest_bit; last_in_cluster = offset + SWAPFILE_CLUSTER - 1; @@ -111,10 +113,12 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) if (si->swap_map[offset]) last_in_cluster = offset + SWAPFILE_CLUSTER; else if (offset == last_in_cluster) { + swap_device_lock(si); si->cluster_next = offset-SWAPFILE_CLUSTER-1; goto cluster; } } + swap_device_lock(si); goto lowest; } @@ -123,10 +127,12 @@ cluster: offset = si->cluster_next; if (offset > si->highest_bit) lowest: offset = si->lowest_bit; +checks: if (!(si->flags & SWP_WRITEOK)) + goto no_page; if (!si->highest_bit) goto no_page; if (!si->swap_map[offset]) { -got_page: if (offset == si->lowest_bit) + if (offset == si->lowest_bit) si->lowest_bit++; if (offset == si->highest_bit) si->highest_bit--; @@ -137,16 +143,22 @@ got_page: if (offset == si->lowest_bit) } si->swap_map[offset] = 1; si->cluster_next = offset + 1; + si->flags -= SWP_SCANNING; return offset; } + swap_device_unlock(si); while (++offset <= si->highest_bit) { - if (!si->swap_map[offset]) - goto got_page; + if (!si->swap_map[offset]) { + swap_device_lock(si); + goto checks; + } } + swap_device_lock(si); goto lowest; no_page: + si->flags -= SWP_SCANNING; return 0; } @@ -1111,10 +1123,6 @@ asmlinkage long sys_swapoff(const char __user * specialfile) err = try_to_unuse(type); current->flags &= ~PF_SWAPOFF; - /* wait for any unplug function to finish */ - down_write(&swap_unplug_sem); - up_write(&swap_unplug_sem); - if (err) { /* re-insert swap space back into swap_list */ swap_list_lock(); @@ -1128,10 +1136,28 @@ asmlinkage long sys_swapoff(const char __user * specialfile) swap_info[prev].next = p - swap_info; nr_swap_pages += p->pages; total_swap_pages += p->pages; + swap_device_lock(p); p->flags |= SWP_WRITEOK; + swap_device_unlock(p); swap_list_unlock(); goto out_dput; } + + /* wait for any unplug function to finish */ + down_write(&swap_unplug_sem); + up_write(&swap_unplug_sem); + + /* wait for anyone still in scan_swap_map */ + swap_device_lock(p); + p->highest_bit = 0; /* cuts scans short */ + while (p->flags >= SWP_SCANNING) { + swap_device_unlock(p); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(1); + swap_device_lock(p); + } + swap_device_unlock(p); + destroy_swap_extents(p); down(&swapon_sem); swap_list_lock(); @@ -1431,6 +1457,8 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) } p->lowest_bit = 1; + p->cluster_next = 1; + /* * Find out how many pages are allowed for a single swap * device. There are two limiting factors: 1) the number of -- cgit v1.2.3 From 048c27fd72816b44e096997d1c6901c3abbfd45b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 3 Sep 2005 15:54:40 -0700 Subject: [PATCH] swap: scan_swap_map latency breaks The get_swap_page/scan_swap_map latency can be so bad that even those without preemption configured deserve relief: periodically cond_resched. 
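A minimal sketch of the pattern being added (not the kernel function itself, which scans si->swap_map with more surrounding state): the scanning loops keep a latency ration and call cond_resched() whenever it runs out.

    #define LATENCY_LIMIT 256

    /* Scan for a free slot, yielding the CPU every LATENCY_LIMIT iterations. */
    static unsigned long scan_with_breaks(unsigned short *swap_map,
                                          unsigned long lowest, unsigned long highest)
    {
            int latency_ration = LATENCY_LIMIT;
            unsigned long offset;

            for (offset = lowest; offset <= highest; offset++) {
                    if (!swap_map[offset])
                            return offset;          /* free slot found */
                    if (--latency_ration < 0) {
                            cond_resched();         /* voluntary reschedule point */
                            latency_ration = LATENCY_LIMIT;
                    }
            }
            return 0;
    }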
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index fdee145afc6..e675ae55f87 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -56,8 +56,6 @@ static DECLARE_MUTEX(swapon_sem); */ static DECLARE_RWSEM(swap_unplug_sem); -#define SWAPFILE_CLUSTER 256 - void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) { swp_entry_t entry; @@ -84,9 +82,13 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) up_read(&swap_unplug_sem); } +#define SWAPFILE_CLUSTER 256 +#define LATENCY_LIMIT 256 + static inline unsigned long scan_swap_map(struct swap_info_struct *si) { unsigned long offset, last_in_cluster; + int latency_ration = LATENCY_LIMIT; /* * We try to cluster swap pages by allocating them sequentially @@ -117,6 +119,10 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) si->cluster_next = offset-SWAPFILE_CLUSTER-1; goto cluster; } + if (unlikely(--latency_ration < 0)) { + cond_resched(); + latency_ration = LATENCY_LIMIT; + } } swap_device_lock(si); goto lowest; @@ -153,6 +159,10 @@ checks: if (!(si->flags & SWP_WRITEOK)) swap_device_lock(si); goto checks; } + if (unlikely(--latency_ration < 0)) { + cond_resched(); + latency_ration = LATENCY_LIMIT; + } } swap_device_lock(si); goto lowest; -- cgit v1.2.3 From 5d337b9194b1ce3b6fd5f3cb2799455ed2f9a3d1 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 3 Sep 2005 15:54:41 -0700 Subject: [PATCH] swap: swap_lock replace list+device The idea of a swap_device_lock per device, and a swap_list_lock over them all, is appealing; but in practice almost every holder of swap_device_lock must already hold swap_list_lock, which defeats the purpose of the split. The only exceptions have been swap_duplicate, valid_swaphandles and an untrodden path in try_to_unuse (plus a few places added in this series). valid_swaphandles doesn't show up high in profiles, but swap_duplicate does demand attention. However, with the hold time in get_swap_pages so much reduced, I've not yet found a load and set of swap device priorities to show even swap_duplicate benefitting from the split. Certainly the split is mere overhead in the common case of a single swap device. So, replace swap_list_lock and swap_device_lock by spinlock_t swap_lock (generally we seem to prefer an _ in the name, and not hide in a macro). If someone can show a regression in swap_duplicate, then probably we should add a hashlock for the swap_map entries alone (shorts being anatomic), so as to help the case of the single swap device too. 
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 7 ++-- mm/rmap.c | 3 +- mm/swapfile.c | 125 +++++++++++++++++++++++++--------------------------------- 3 files changed, 57 insertions(+), 78 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index c11418dd94e..edc54436fa9 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -54,9 +54,8 @@ * * ->i_mmap_lock (vmtruncate) * ->private_lock (__free_pte->__set_page_dirty_buffers) - * ->swap_list_lock - * ->swap_device_lock (exclusive_swap_page, others) - * ->mapping->tree_lock + * ->swap_lock (exclusive_swap_page, others) + * ->mapping->tree_lock * * ->i_sem * ->i_mmap_lock (truncate->unmap_mapping_range) @@ -86,7 +85,7 @@ * ->page_table_lock (anon_vma_prepare and various) * * ->page_table_lock - * ->swap_device_lock (try_to_unmap_one) + * ->swap_lock (try_to_unmap_one) * ->private_lock (try_to_unmap_one) * ->tree_lock (try_to_unmap_one) * ->zone.lru_lock (follow_page->mark_page_accessed) diff --git a/mm/rmap.c b/mm/rmap.c index 08ac5c7fa91..facb8cdca66 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -34,9 +34,8 @@ * anon_vma->lock * mm->page_table_lock * zone->lru_lock (in mark_page_accessed) - * swap_list_lock (in swap_free etc's swap_info_get) + * swap_lock (in swap_duplicate, swap_info_get) * mmlist_lock (in mmput, drain_mmlist and others) - * swap_device_lock (in swap_duplicate, swap_info_get) * mapping->private_lock (in __set_page_dirty_buffers) * inode_lock (in set_page_dirty's __mark_inode_dirty) * sb_lock (within inode_lock in fs/fs-writeback.c) diff --git a/mm/swapfile.c b/mm/swapfile.c index e675ae55f87..4b6e8bf986b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -31,7 +31,7 @@ #include #include -DEFINE_SPINLOCK(swaplock); +DEFINE_SPINLOCK(swap_lock); unsigned int nr_swapfiles; long total_swap_pages; static int swap_overflow; @@ -51,7 +51,7 @@ static DECLARE_MUTEX(swapon_sem); /* * We need this because the bdev->unplug_fn can sleep and we cannot - * hold swap_list_lock while calling the unplug_fn. And swap_list_lock + * hold swap_lock while calling the unplug_fn. And swap_lock * cannot be turned into a semaphore. 
*/ static DECLARE_RWSEM(swap_unplug_sem); @@ -105,7 +105,7 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) si->cluster_nr = SWAPFILE_CLUSTER - 1; if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) goto lowest; - swap_device_unlock(si); + spin_unlock(&swap_lock); offset = si->lowest_bit; last_in_cluster = offset + SWAPFILE_CLUSTER - 1; @@ -115,7 +115,7 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) if (si->swap_map[offset]) last_in_cluster = offset + SWAPFILE_CLUSTER; else if (offset == last_in_cluster) { - swap_device_lock(si); + spin_lock(&swap_lock); si->cluster_next = offset-SWAPFILE_CLUSTER-1; goto cluster; } @@ -124,7 +124,7 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) latency_ration = LATENCY_LIMIT; } } - swap_device_lock(si); + spin_lock(&swap_lock); goto lowest; } @@ -153,10 +153,10 @@ checks: if (!(si->flags & SWP_WRITEOK)) return offset; } - swap_device_unlock(si); + spin_unlock(&swap_lock); while (++offset <= si->highest_bit) { if (!si->swap_map[offset]) { - swap_device_lock(si); + spin_lock(&swap_lock); goto checks; } if (unlikely(--latency_ration < 0)) { @@ -164,7 +164,7 @@ checks: if (!(si->flags & SWP_WRITEOK)) latency_ration = LATENCY_LIMIT; } } - swap_device_lock(si); + spin_lock(&swap_lock); goto lowest; no_page: @@ -179,7 +179,7 @@ swp_entry_t get_swap_page(void) int type, next; int wrapped = 0; - swap_list_lock(); + spin_lock(&swap_lock); if (nr_swap_pages <= 0) goto noswap; nr_swap_pages--; @@ -199,19 +199,17 @@ swp_entry_t get_swap_page(void) continue; swap_list.next = next; - swap_device_lock(si); - swap_list_unlock(); offset = scan_swap_map(si); - swap_device_unlock(si); - if (offset) + if (offset) { + spin_unlock(&swap_lock); return swp_entry(type, offset); - swap_list_lock(); + } next = swap_list.next; } nr_swap_pages++; noswap: - swap_list_unlock(); + spin_unlock(&swap_lock); return (swp_entry_t) {0}; } @@ -233,8 +231,7 @@ static struct swap_info_struct * swap_info_get(swp_entry_t entry) goto bad_offset; if (!p->swap_map[offset]) goto bad_free; - swap_list_lock(); - swap_device_lock(p); + spin_lock(&swap_lock); return p; bad_free: @@ -252,12 +249,6 @@ out: return NULL; } -static void swap_info_put(struct swap_info_struct * p) -{ - swap_device_unlock(p); - swap_list_unlock(); -} - static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) { int count = p->swap_map[offset]; @@ -290,7 +281,7 @@ void swap_free(swp_entry_t entry) p = swap_info_get(entry); if (p) { swap_entry_free(p, swp_offset(entry)); - swap_info_put(p); + spin_unlock(&swap_lock); } } @@ -308,7 +299,7 @@ static inline int page_swapcount(struct page *page) if (p) { /* Subtract the 1 for the swap cache itself */ count = p->swap_map[swp_offset(entry)] - 1; - swap_info_put(p); + spin_unlock(&swap_lock); } return count; } @@ -365,7 +356,7 @@ int remove_exclusive_swap_page(struct page *page) } write_unlock_irq(&swapper_space.tree_lock); } - swap_info_put(p); + spin_unlock(&swap_lock); if (retval) { swap_free(entry); @@ -388,7 +379,7 @@ void free_swap_and_cache(swp_entry_t entry) if (p) { if (swap_entry_free(p, swp_offset(entry)) == 1) page = find_trylock_page(&swapper_space, entry.val); - swap_info_put(p); + spin_unlock(&swap_lock); } if (page) { int one_user; @@ -558,10 +549,10 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, int count; /* - * No need for swap_device_lock(si) here: we're just looking + * No need for swap_lock here: we're just looking * for whether an entry is in use, 
not modifying it; false * hits are okay, and sys_swapoff() has already prevented new - * allocations from this area (while holding swap_list_lock()). + * allocations from this area (while holding swap_lock). */ for (;;) { if (++i >= max) { @@ -751,9 +742,9 @@ static int try_to_unuse(unsigned int type) * report them; but do report if we reset SWAP_MAP_MAX. */ if (*swap_map == SWAP_MAP_MAX) { - swap_device_lock(si); + spin_lock(&swap_lock); *swap_map = 1; - swap_device_unlock(si); + spin_unlock(&swap_lock); reset_overflow = 1; } @@ -817,9 +808,9 @@ static int try_to_unuse(unsigned int type) } /* - * After a successful try_to_unuse, if no swap is now in use, we know we - * can empty the mmlist. swap_list_lock must be held on entry and exit. - * Note that mmlist_lock nests inside swap_list_lock, and an mm must be + * After a successful try_to_unuse, if no swap is now in use, we know + * we can empty the mmlist. swap_lock must be held on entry and exit. + * Note that mmlist_lock nests inside swap_lock, and an mm must be * added to the mmlist just after page_duplicate - before would be racy. */ static void drain_mmlist(void) @@ -1092,7 +1083,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile) mapping = victim->f_mapping; prev = -1; - swap_list_lock(); + spin_lock(&swap_lock); for (type = swap_list.head; type >= 0; type = swap_info[type].next) { p = swap_info + type; if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) { @@ -1103,14 +1094,14 @@ asmlinkage long sys_swapoff(const char __user * specialfile) } if (type < 0) { err = -EINVAL; - swap_list_unlock(); + spin_unlock(&swap_lock); goto out_dput; } if (!security_vm_enough_memory(p->pages)) vm_unacct_memory(p->pages); else { err = -ENOMEM; - swap_list_unlock(); + spin_unlock(&swap_lock); goto out_dput; } if (prev < 0) { @@ -1124,10 +1115,8 @@ asmlinkage long sys_swapoff(const char __user * specialfile) } nr_swap_pages -= p->pages; total_swap_pages -= p->pages; - swap_device_lock(p); p->flags &= ~SWP_WRITEOK; - swap_device_unlock(p); - swap_list_unlock(); + spin_unlock(&swap_lock); current->flags |= PF_SWAPOFF; err = try_to_unuse(type); @@ -1135,7 +1124,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile) if (err) { /* re-insert swap space back into swap_list */ - swap_list_lock(); + spin_lock(&swap_lock); for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next) if (p->prio >= swap_info[i].prio) break; @@ -1146,10 +1135,8 @@ asmlinkage long sys_swapoff(const char __user * specialfile) swap_info[prev].next = p - swap_info; nr_swap_pages += p->pages; total_swap_pages += p->pages; - swap_device_lock(p); p->flags |= SWP_WRITEOK; - swap_device_unlock(p); - swap_list_unlock(); + spin_unlock(&swap_lock); goto out_dput; } @@ -1157,30 +1144,27 @@ asmlinkage long sys_swapoff(const char __user * specialfile) down_write(&swap_unplug_sem); up_write(&swap_unplug_sem); + destroy_swap_extents(p); + down(&swapon_sem); + spin_lock(&swap_lock); + drain_mmlist(); + /* wait for anyone still in scan_swap_map */ - swap_device_lock(p); p->highest_bit = 0; /* cuts scans short */ while (p->flags >= SWP_SCANNING) { - swap_device_unlock(p); + spin_unlock(&swap_lock); set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(1); - swap_device_lock(p); + spin_lock(&swap_lock); } - swap_device_unlock(p); - destroy_swap_extents(p); - down(&swapon_sem); - swap_list_lock(); - drain_mmlist(); - swap_device_lock(p); swap_file = p->swap_file; p->swap_file = NULL; p->max = 0; swap_map = p->swap_map; p->swap_map = NULL; p->flags = 0; - 
swap_device_unlock(p); - swap_list_unlock(); + spin_unlock(&swap_lock); up(&swapon_sem); vfree(swap_map); inode = mapping->host; @@ -1324,7 +1308,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - swap_list_lock(); + spin_lock(&swap_lock); p = swap_info; for (type = 0 ; type < nr_swapfiles ; type++,p++) if (!(p->flags & SWP_USED)) @@ -1343,7 +1327,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) * swp_entry_t or the architecture definition of a swap pte. */ if (type > swp_type(pte_to_swp_entry(swp_entry_to_pte(swp_entry(~0UL,0))))) { - swap_list_unlock(); + spin_unlock(&swap_lock); goto out; } if (type >= nr_swapfiles) @@ -1357,7 +1341,6 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) p->highest_bit = 0; p->cluster_nr = 0; p->inuse_pages = 0; - spin_lock_init(&p->sdev_lock); p->next = -1; if (swap_flags & SWAP_FLAG_PREFER) { p->prio = @@ -1365,7 +1348,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) } else { p->prio = --least_priority; } - swap_list_unlock(); + spin_unlock(&swap_lock); name = getname(specialfile); error = PTR_ERR(name); if (IS_ERR(name)) { @@ -1542,8 +1525,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) } down(&swapon_sem); - swap_list_lock(); - swap_device_lock(p); + spin_lock(&swap_lock); p->flags = SWP_ACTIVE; nr_swap_pages += nr_good_pages; total_swap_pages += nr_good_pages; @@ -1567,8 +1549,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) } else { swap_info[prev].next = p - swap_info; } - swap_device_unlock(p); - swap_list_unlock(); + spin_unlock(&swap_lock); up(&swapon_sem); error = 0; goto out; @@ -1579,14 +1560,14 @@ bad_swap: } destroy_swap_extents(p); bad_swap_2: - swap_list_lock(); + spin_lock(&swap_lock); swap_map = p->swap_map; p->swap_file = NULL; p->swap_map = NULL; p->flags = 0; if (!(swap_flags & SWAP_FLAG_PREFER)) ++least_priority; - swap_list_unlock(); + spin_unlock(&swap_lock); vfree(swap_map); if (swap_file) filp_close(swap_file, NULL); @@ -1610,7 +1591,7 @@ void si_swapinfo(struct sysinfo *val) unsigned int i; unsigned long nr_to_be_unused = 0; - swap_list_lock(); + spin_lock(&swap_lock); for (i = 0; i < nr_swapfiles; i++) { if (!(swap_info[i].flags & SWP_USED) || (swap_info[i].flags & SWP_WRITEOK)) @@ -1619,7 +1600,7 @@ void si_swapinfo(struct sysinfo *val) } val->freeswap = nr_swap_pages + nr_to_be_unused; val->totalswap = total_swap_pages + nr_to_be_unused; - swap_list_unlock(); + spin_unlock(&swap_lock); } /* @@ -1640,7 +1621,7 @@ int swap_duplicate(swp_entry_t entry) p = type + swap_info; offset = swp_offset(entry); - swap_device_lock(p); + spin_lock(&swap_lock); if (offset < p->max && p->swap_map[offset]) { if (p->swap_map[offset] < SWAP_MAP_MAX - 1) { p->swap_map[offset]++; @@ -1652,7 +1633,7 @@ int swap_duplicate(swp_entry_t entry) result = 1; } } - swap_device_unlock(p); + spin_unlock(&swap_lock); out: return result; @@ -1668,7 +1649,7 @@ get_swap_info_struct(unsigned type) } /* - * swap_device_lock prevents swap_map being freed. Don't grab an extra + * swap_lock prevents swap_map being freed. Don't grab an extra * reference on the swaphandle, it doesn't matter if it becomes unused. 
*/ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) @@ -1684,7 +1665,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) toff++, i--; *offset = toff; - swap_device_lock(swapdev); + spin_lock(&swap_lock); do { /* Don't read-ahead past the end of the swap area */ if (toff >= swapdev->max) @@ -1697,6 +1678,6 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) toff++; ret++; } while (--i); - swap_device_unlock(swapdev); + spin_unlock(&swap_lock); return ret; } -- cgit v1.2.3 From 3279ffd97f1b3962e40d3c5f09495ef8320b180b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 3 Sep 2005 15:54:43 -0700 Subject: [PATCH] delete from_swap_cache BUG_ONs Three of the four BUG_ONs in delete_from_swap_cache are immediately repeated in __delete_from_swap_cache: delete those and add the one. But perhaps mm/ is altogether overprovisioned with historic BUGs? Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap_state.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/swap_state.c b/mm/swap_state.c index 4f251775ef9..029e56eb5e7 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -124,6 +124,7 @@ void __delete_from_swap_cache(struct page *page) BUG_ON(!PageLocked(page)); BUG_ON(!PageSwapCache(page)); BUG_ON(PageWriteback(page)); + BUG_ON(PagePrivate(page)); radix_tree_delete(&swapper_space.page_tree, page->private); page->private = 0; @@ -196,11 +197,6 @@ void delete_from_swap_cache(struct page *page) { swp_entry_t entry; - BUG_ON(!PageSwapCache(page)); - BUG_ON(!PageLocked(page)); - BUG_ON(PageWriteback(page)); - BUG_ON(PagePrivate(page)); - entry.val = page->private; write_lock_irq(&swapper_space.tree_lock); -- cgit v1.2.3 From 839b9685e80592809d6dfdd865986cd1b5ddc2fb Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 3 Sep 2005 15:54:43 -0700 Subject: [PATCH] rmap: don't test rss Remove the three get_mm_counter(mm, rss) tests from rmap.c: there was a time when testing rss was important to avoid a particular race between dup_mmap and the anonmm rmap; but now it's just a rather silly pseudo- optimization, made even more obscure by the get_mm_counter macro. 
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index facb8cdca66..28c6cf96d3c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -289,8 +289,6 @@ static int page_referenced_one(struct page *page, pte_t *pte; int referenced = 0; - if (!get_mm_counter(mm, rss)) - goto out; address = vma_address(page, vma); if (address == -EFAULT) goto out; @@ -517,8 +515,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) pte_t pteval; int ret = SWAP_AGAIN; - if (!get_mm_counter(mm, rss)) - goto out; address = vma_address(page, vma); if (address == -EFAULT) goto out; @@ -766,8 +762,7 @@ static int try_to_unmap_file(struct page *page) if (vma->vm_flags & (VM_LOCKED|VM_RESERVED)) continue; cursor = (unsigned long) vma->vm_private_data; - while (get_mm_counter(vma->vm_mm, rss) && - cursor < max_nl_cursor && + while ( cursor < max_nl_cursor && cursor < vma->vm_end - vma->vm_start) { try_to_unmap_cluster(cursor, &mapcount, vma); cursor += CLUSTER_SIZE; -- cgit v1.2.3 From 6e21c8f145f5052c1c2fb4a4b41bee01c848159b Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sat, 3 Sep 2005 15:54:45 -0700 Subject: [PATCH] /proc//numa_maps to show on which nodes pages reside This patch was recently discussed on linux-mm: http://marc.theaimsgroup.com/?t=112085728500002&r=1&w=2 I inherited a large code base from Ray for page migration. There was a small patch in there that I find to be very useful since it allows the display of the locality of the pages in use by a process. I reworked that patch and came up with a /proc//numa_maps that gives more information about the vma's of a process. numa_maps is indexes by the start address found in /proc//maps. F.e. 
with this patch you can see the page use of the "getty" process: margin:/proc/12008 # cat maps 00000000-00004000 r--p 00000000 00:00 0 2000000000000000-200000000002c000 r-xp 00000000 08:04 516 /lib/ld-2.3.3.so 2000000000038000-2000000000040000 rw-p 00028000 08:04 516 /lib/ld-2.3.3.so 2000000000040000-2000000000044000 rw-p 2000000000040000 00:00 0 2000000000058000-2000000000260000 r-xp 00000000 08:04 54707842 /lib/tls/libc.so.6.1 2000000000260000-2000000000268000 ---p 00208000 08:04 54707842 /lib/tls/libc.so.6.1 2000000000268000-2000000000274000 rw-p 00200000 08:04 54707842 /lib/tls/libc.so.6.1 2000000000274000-2000000000280000 rw-p 2000000000274000 00:00 0 2000000000280000-20000000002b4000 r--p 00000000 08:04 9126923 /usr/lib/locale/en_US.utf8/LC_CTYPE 2000000000300000-2000000000308000 r--s 00000000 08:04 60071467 /usr/lib/gconv/gconv-modules.cache 2000000000318000-2000000000328000 rw-p 2000000000318000 00:00 0 4000000000000000-4000000000008000 r-xp 00000000 08:04 29576399 /sbin/mingetty 6000000000004000-6000000000008000 rw-p 00004000 08:04 29576399 /sbin/mingetty 6000000000008000-600000000002c000 rw-p 6000000000008000 00:00 0 [heap] 60000fff7fffc000-60000fff80000000 rw-p 60000fff7fffc000 00:00 0 60000ffffff44000-60000ffffff98000 rw-p 60000ffffff44000 00:00 0 [stack] a000000000000000-a000000000020000 ---p 00000000 00:00 0 [vdso] cat numa_maps 2000000000000000 default MaxRef=43 Pages=11 Mapped=11 N0=4 N1=3 N2=2 N3=2 2000000000038000 default MaxRef=1 Pages=2 Mapped=2 Anon=2 N0=2 2000000000040000 default MaxRef=1 Pages=1 Mapped=1 Anon=1 N0=1 2000000000058000 default MaxRef=43 Pages=61 Mapped=61 N0=14 N1=15 N2=16 N3=16 2000000000268000 default MaxRef=1 Pages=2 Mapped=2 Anon=2 N0=2 2000000000274000 default MaxRef=1 Pages=3 Mapped=3 Anon=3 N0=3 2000000000280000 default MaxRef=8 Pages=3 Mapped=3 N0=3 2000000000300000 default MaxRef=8 Pages=2 Mapped=2 N0=2 2000000000318000 default MaxRef=1 Pages=1 Mapped=1 Anon=1 N2=1 4000000000000000 default MaxRef=6 Pages=2 Mapped=2 N1=2 6000000000004000 default MaxRef=1 Pages=1 Mapped=1 Anon=1 N0=1 6000000000008000 default MaxRef=1 Pages=1 Mapped=1 Anon=1 N0=1 60000fff7fffc000 default MaxRef=1 Pages=1 Mapped=1 Anon=1 N0=1 60000ffffff44000 default MaxRef=1 Pages=1 Mapped=1 Anon=1 N0=1 getty uses ld.so. The first vma is the code segment which is used by 43 other processes and the pages are evenly distributed over the 4 nodes. The second vma is the process specific data portion for ld.so. This is only one page. The display format is: Links to information in /proc//map This can be "default" "interleave={}", "prefer=" or "bind={}" MaxRef= Pages= Mapped= Anon= Nx= The content of the proc-file is self-evident. If this would be tied into the sparsemem system then the contents of this file would not be too useful. 
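For readers who want to poke at the per-node counts described above, a minimal userspace reader is enough (illustrative sketch only, not part of the patch; it assumes a NUMA-enabled kernel exposing the new file):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/self/numa_maps", "r");
	char line[1024];

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* each line looks like: <vma start> <policy> MaxRef=.. Pages=.. N0=.. N1=.. */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}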
Signed-off-by: Christoph Lameter Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b4eababc819..13492d66b7c 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -664,10 +664,10 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, #endif /* Return effective policy for a VMA */ -static struct mempolicy * -get_vma_policy(struct vm_area_struct *vma, unsigned long addr) +struct mempolicy * +get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr) { - struct mempolicy *pol = current->mempolicy; + struct mempolicy *pol = task->mempolicy; if (vma) { if (vma->vm_ops && vma->vm_ops->get_policy) @@ -786,7 +786,7 @@ static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned or struct page * alloc_page_vma(unsigned int __nocast gfp, struct vm_area_struct *vma, unsigned long addr) { - struct mempolicy *pol = get_vma_policy(vma, addr); + struct mempolicy *pol = get_vma_policy(current, vma, addr); cpuset_update_current_mems_allowed(); @@ -908,7 +908,7 @@ void __mpol_free(struct mempolicy *p) /* Find first node suitable for an allocation */ int mpol_first_node(struct vm_area_struct *vma, unsigned long addr) { - struct mempolicy *pol = get_vma_policy(vma, addr); + struct mempolicy *pol = get_vma_policy(current, vma, addr); switch (pol->policy) { case MPOL_DEFAULT: @@ -928,7 +928,7 @@ int mpol_first_node(struct vm_area_struct *vma, unsigned long addr) /* Find secondary valid nodes for an allocation */ int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr) { - struct mempolicy *pol = get_vma_policy(vma, addr); + struct mempolicy *pol = get_vma_policy(current, vma, addr); switch (pol->policy) { case MPOL_PREFERRED: -- cgit v1.2.3 From c3dce2d89c269d5373a120d4a22fc2426ec992b0 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 3 Sep 2005 15:54:46 -0700 Subject: [PATCH] mm: comment rmap Just be clear that VM_RESERVED pages here are a bug, and the test is not there because they are expected. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 28c6cf96d3c..f5a6966b7eb 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -527,6 +527,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) * If the page is mlock()d, we cannot swap it out. * If it's recently referenced (perhaps page_referenced * skipped over this mm) then we should reactivate it. + * + * Pages belonging to VM_RESERVED regions should not happen here. */ if ((vma->vm_flags & (VM_LOCKED|VM_RESERVED)) || ptep_clear_flush_young(vma, address, pte)) { -- cgit v1.2.3 From 2822c1aa574d277b9ba0130b1e71c1a5874bc04a Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 3 Sep 2005 15:54:47 -0700 Subject: [PATCH] mm: micro-optimise rmap Microoptimise page_add_anon_rmap. Although these expressions are used only in the taken branch of the if() statement, the compiler can't reorder them inside because atomic_inc_and_test is a barrier. 
Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index f5a6966b7eb..7e975ca24c7 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -439,22 +439,23 @@ int page_referenced(struct page *page, int is_locked, int ignore_token) void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) { - struct anon_vma *anon_vma = vma->anon_vma; - pgoff_t index; - BUG_ON(PageReserved(page)); - BUG_ON(!anon_vma); inc_mm_counter(vma->vm_mm, anon_rss); - anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; - index = (address - vma->vm_start) >> PAGE_SHIFT; - index += vma->vm_pgoff; - index >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; - if (atomic_inc_and_test(&page->_mapcount)) { - page->index = index; + struct anon_vma *anon_vma = vma->anon_vma; + pgoff_t index; + + BUG_ON(!anon_vma); + anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; page->mapping = (struct address_space *) anon_vma; + + index = (address - vma->vm_start) >> PAGE_SHIFT; + index += vma->vm_pgoff; + index >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; + page->index = index; + inc_page_state(nr_mapped); } /* else checking page index and mapping is racy */ -- cgit v1.2.3 From 4d7670e0f649f9e6e6ea6c8bb9f52441fa00f92b Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 3 Sep 2005 15:54:48 -0700 Subject: [PATCH] mm: cleanup rmap Thanks to Bill Irwin for pointing this out. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 7e975ca24c7..450f5241b5a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -445,16 +445,12 @@ void page_add_anon_rmap(struct page *page, if (atomic_inc_and_test(&page->_mapcount)) { struct anon_vma *anon_vma = vma->anon_vma; - pgoff_t index; BUG_ON(!anon_vma); anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; page->mapping = (struct address_space *) anon_vma; - index = (address - vma->vm_start) >> PAGE_SHIFT; - index += vma->vm_pgoff; - index >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; - page->index = index; + page->index = linear_page_index(vma, address); inc_page_state(nr_mapped); } -- cgit v1.2.3 From 9a61c349b28ec5aef7e929236571fd770fdef0bb Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 3 Sep 2005 15:54:49 -0700 Subject: [PATCH] mm: remap ZERO_PAGE mappings filemap_xip's nopage routine maps the ZERO_PAGE into readonly mappings, if it has no data page to map there: then if the hole in the file is later filled, __xip_unmap uses an rmap technique to replace the ZERO_PAGEs mapped for that offset by the newly allocated file page, so that established mappings will see the newly written data. However, on MIPS (alone) there's not one but as many as eight ZERO_PAGEs, chosen for coloring by user virtual address; and if mremap has meanwhile been used to move a mapping containing a ZERO_PAGE, it will generally not match the ZERO_PAGE(address) __xip_unmap is looking for. To maintain XIP's established mappings correctly on MIPS, we need Nick's fix to mremap's move_one_page (originally presented as an optimization), to replace the ZERO_PAGE appropriate to the old address by the ZERO_PAGE appropriate to the new address. (But when I first saw this, I was thinking the ZERO_PAGEs themselves would get corrupted, very bad. 
Now I think it's the other way round, that the established mappings will fail to see the newly written data: incorrect, but not corrupting everything else. Whether filemap_xip's technique is generally safe, I'd hesitate to say in a hurry: it's interesting, but we've never tried to do that in tmpfs.) Signed-off-by: Hugh Dickins Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mremap.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/mremap.c b/mm/mremap.c index fc45dc9a617..a32fed454bd 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -141,6 +141,10 @@ move_one_page(struct vm_area_struct *vma, unsigned long old_addr, if (dst) { pte_t pte; pte = ptep_clear_flush(vma, old_addr, src); + /* ZERO_PAGE can be dependant on virtual addr */ + if (pfn_valid(pte_pfn(pte)) && + pte_page(pte) == ZERO_PAGE(old_addr)) + pte = pte_wrprotect(mk_pte(ZERO_PAGE(new_addr), new_vma->vm_page_prot)); set_pte_at(mm, new_addr, dst, pte); } else error = -ENOMEM; -- cgit v1.2.3 From 242e54686257493f0b10ac557e730419d9af7d24 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 3 Sep 2005 15:54:50 -0700 Subject: [PATCH] mm: remove atomic This bitop does not need to be atomic because it is performed when there will be no references to the page (ie. the page is being freed). Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8d088371196..620aa11b24e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -329,7 +329,7 @@ static inline void free_pages_check(const char *function, struct page *page) 1 << PG_writeback ))) bad_page(function, page); if (PageDirty(page)) - ClearPageDirty(page); + __ClearPageDirty(page); } /* -- cgit v1.2.3 From bce5f6ba340b09d8b29902add204bb95a6d3d88b Mon Sep 17 00:00:00 2001 From: Martin Hicks Date: Sat, 3 Sep 2005 15:54:50 -0700 Subject: [PATCH] VM: add capabilites check to set_zone_reclaim Add a capability check to sys_set_zone_reclaim(). This syscall is not something that should be available to a user. Signed-off-by: Martin Hicks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index cfffe5098d5..ab631a3c62c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1375,6 +1375,9 @@ asmlinkage long sys_set_zone_reclaim(unsigned int node, unsigned int zone, struct zone *z; int i; + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (node >= MAX_NUMNODES || !node_online(node)) return -EINVAL; -- cgit v1.2.3 From 53e9a6159fdc6419874ce4d86d3577dbedc77b62 Mon Sep 17 00:00:00 2001 From: Martin Hicks Date: Sat, 3 Sep 2005 15:54:51 -0700 Subject: [PATCH] VM: zone reclaim atomic ops cleanup Christoph Lameter and Marcelo Tosatti asked to get rid of the atomic_inc_and_test() to cleanup the atomic ops in the zone reclaim code. 
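The pattern the cleanup moves to is simple enough to show in plain C (a userspace sketch with C11 atomics, not the kernel code itself): workers bracket the scan with an increment/decrement pair, and a would-be reclaimer just backs off if the counter is already non-zero.

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int reclaim_in_progress;

static void shrink_zone_like(void)
{
	atomic_fetch_add(&reclaim_in_progress, 1);
	/* ... scan the zone's lists here ... */
	atomic_fetch_sub(&reclaim_in_progress, 1);
}

static bool zone_reclaim_like(void)
{
	/* don't reclaim if another reclaimer is already active */
	if (atomic_load(&reclaim_in_progress) > 0)
		return false;
	shrink_zone_like();
	return true;
}

int main(void)
{
	return zone_reclaim_like() ? 0 : 1;
}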
Signed-off-by: Martin Hicks Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- mm/vmscan.c | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 620aa11b24e..d157dae8c9f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1909,7 +1909,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat, zone->nr_scan_inactive = 0; zone->nr_active = 0; zone->nr_inactive = 0; - atomic_set(&zone->reclaim_in_progress, -1); + atomic_set(&zone->reclaim_in_progress, 0); if (!size) continue; diff --git a/mm/vmscan.c b/mm/vmscan.c index ab631a3c62c..0095533cdde 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -822,6 +822,8 @@ shrink_zone(struct zone *zone, struct scan_control *sc) unsigned long nr_active; unsigned long nr_inactive; + atomic_inc(&zone->reclaim_in_progress); + /* * Add one to `nr_to_scan' just to make sure that the kernel will * slowly sift through the active list. @@ -861,6 +863,8 @@ shrink_zone(struct zone *zone, struct scan_control *sc) } throttle_vm_writeout(); + + atomic_dec(&zone->reclaim_in_progress); } /* @@ -900,9 +904,7 @@ shrink_caches(struct zone **zones, struct scan_control *sc) if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY) continue; /* Let kswapd poll it */ - atomic_inc(&zone->reclaim_in_progress); shrink_zone(zone, sc); - atomic_dec(&zone->reclaim_in_progress); } } @@ -1358,14 +1360,13 @@ int zone_reclaim(struct zone *zone, unsigned int gfp_mask, unsigned int order) sc.swap_cluster_max = SWAP_CLUSTER_MAX; /* Don't reclaim the zone if there are other reclaimers active */ - if (!atomic_inc_and_test(&zone->reclaim_in_progress)) + if (atomic_read(&zone->reclaim_in_progress) > 0) goto out; shrink_zone(zone, &sc); total_reclaimed = sc.nr_reclaimed; out: - atomic_dec(&zone->reclaim_in_progress); return total_reclaimed; } -- cgit v1.2.3 From 836d5ffd34550901ea024347693e689273ded8aa Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 3 Sep 2005 15:54:53 -0700 Subject: [PATCH] mm: fix madvise vma merging Better late than never, I've at last reviewed the madvise vma merging going into 2.6.13. Remove a pointless check and fix two little bugs - a simple test (with /proc//maps hacked to show ReadHints) showed both mismerges in practice: though being madvise, neither was disastrous. 1. Correct placement of the success label in madvise_behavior: as in mprotect_fixup and mlock_fixup, it is necessary to update vm_flags when vma_merge succeeds (to handle the exceptional Case 8 noted in the comments above vma_merge itself). 2. Correct initial value of prev when starting part way into a vma: as in sys_mprotect and do_mlock, it needs to be set to vma in this case (vma_merge handles only that minimum of cases shown in its comments). 3. If find_vma_prev sets prev, then the vma it returns is prev->vm_next, so it's pointless to make that same assignment again in sys_madvise. 
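For reference, the "starting part way into a vma" case in point (2) is easy to trigger from userspace; this sketch only demonstrates the call pattern and makes no claims about the resulting vma layout:

#define _DEFAULT_SOURCE
#include <sys/mman.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	long pg = sysconf(_SC_PAGESIZE);
	char *map = mmap(NULL, 8 * pg, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (map == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	/* apply a behavior hint starting two pages into the mapping: the
	 * kernel may split the vma and later merge it back, which is where
	 * the prev/vm_flags handling fixed above matters */
	if (madvise(map + 2 * pg, 4 * pg, MADV_SEQUENTIAL) != 0)
		perror("madvise");
	munmap(map, 8 * pg);
	return 0;
}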
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/madvise.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/madvise.c b/mm/madvise.c index c8c01a12fea..4454936f87d 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -37,7 +37,7 @@ static long madvise_behavior(struct vm_area_struct * vma, if (new_flags == vma->vm_flags) { *prev = vma; - goto success; + goto out; } pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); @@ -62,6 +62,7 @@ static long madvise_behavior(struct vm_area_struct * vma, goto out; } +success: /* * vm_flags is protected by the mmap_sem held in write mode. */ @@ -70,7 +71,6 @@ static long madvise_behavior(struct vm_area_struct * vma, out: if (error == -ENOMEM) error = -EAGAIN; -success: return error; } @@ -237,8 +237,9 @@ asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior) * - different from the way of handling in mlock etc. */ vma = find_vma_prev(current->mm, start, &prev); - if (!vma && prev) - vma = prev->vm_next; + if (vma && start > vma->vm_start) + prev = vma; + for (;;) { /* Still start < end. */ error = -ENOMEM; -- cgit v1.2.3 From 0abf40c1ac3f25d264c019e1cfe155d590defb87 Mon Sep 17 00:00:00 2001 From: Martin Hicks Date: Sat, 3 Sep 2005 15:54:54 -0700 Subject: [PATCH] vm: slab.c spelling correction Fix a small spelling mistake. subtile->subtle Signed-off-by: Martin Hicks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index c9e706db463..ae6cca04de4 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -600,7 +600,7 @@ static inline kmem_cache_t *__find_general_cachep(size_t size, csizep++; /* - * Really subtile: The last entry with cs->cs_size==ULONG_MAX + * Really subtle: The last entry with cs->cs_size==ULONG_MAX * has cs_{dma,}cachep==NULL. Thus no special case * for large kmalloc calls required. */ -- cgit v1.2.3 From d44ed4f86892e350f4b16a3489b7e7c1a9bb7ead Mon Sep 17 00:00:00 2001 From: Paolo 'Blaisorblade' Giarrusso Date: Sat, 3 Sep 2005 15:54:55 -0700 Subject: [PATCH] shmem_populate: avoid an useless check, and some comments Either shmem_getpage returns a failure, or it found a page, or it was told it couldn't do any I/O. So it's useless to check nonblock in the else branch. We could add a BUG() there but I preferred to comment the offending function. This was taken out from one Ingo Molnar's old patch I'm resurrecting. Signed-off-by: Paolo 'Blaisorblade' Giarrusso Cc: Ingo Molnar Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 7 +++++++ mm/shmem.c | 6 +++++- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index edc54436fa9..88611928e71 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1504,8 +1504,12 @@ repeat: return -EINVAL; page = filemap_getpage(file, pgoff, nonblock); + + /* XXX: This is wrong, a filesystem I/O error may have happened. Fix that as + * done in shmem_populate calling shmem_getpage */ if (!page && !nonblock) return -ENOMEM; + if (page) { err = install_page(mm, vma, addr, page, prot); if (err) { @@ -1513,6 +1517,9 @@ repeat: return err; } } else { + /* No page was found just because we can't read it in now (being + * here implies nonblock != 0), but the page may exist, so set + * the PTE to fault it in later. 
*/ err = install_file_pte(mm, vma, addr, pgoff, prot); if (err) return err; diff --git a/mm/shmem.c b/mm/shmem.c index 5a81b1ee4f7..08a3bc2fba6 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1195,6 +1195,7 @@ static int shmem_populate(struct vm_area_struct *vma, err = shmem_getpage(inode, pgoff, &page, sgp, NULL); if (err) return err; + /* Page may still be null, but only if nonblock was set. */ if (page) { mark_page_accessed(page); err = install_page(mm, vma, addr, page, prot); @@ -1202,7 +1203,10 @@ static int shmem_populate(struct vm_area_struct *vma, page_cache_release(page); return err; } - } else if (nonblock) { + } else { + /* No page was found just because we can't read it in + * now (being here implies nonblock != 0), but the page + * may exist, so set the PTE to fault it in later. */ err = install_file_pte(mm, vma, addr, pgoff, prot); if (err) return err; -- cgit v1.2.3 From 4944e76d81801b8e60ed3e7789443f210c16ed65 Mon Sep 17 00:00:00 2001 From: Paolo 'Blaisorblade' Giarrusso Date: Sat, 3 Sep 2005 15:54:56 -0700 Subject: [PATCH] mm: remove implied vm_ops check If !vma->vm_ops we already BUG above, so retesting it is useless. The compiler cannot optimize this because BUG is a macro and is not thus marked noreturn; that should possibly be fixed. Signed-off-by: Paolo 'Blaisorblade' Giarrusso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index a596c117224..b25f5e58a14 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1955,7 +1955,7 @@ static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma, * Fall back to the linear mapping if the fs does not support * ->populate: */ - if (!vma->vm_ops || !vma->vm_ops->populate || + if (!vma->vm_ops->populate || (write_access && !(vma->vm_flags & VM_SHARED))) { pte_clear(mm, address, pte); return do_no_page(mm, vma, address, write_access, pte, pmd); -- cgit v1.2.3 From fd195c49fb17a21e232f50bddb2267150053cf34 Mon Sep 17 00:00:00 2001 From: Deepak Saxena Date: Sat, 3 Sep 2005 15:54:58 -0700 Subject: [PATCH] arm: allow for arch-specific IOREMAP_MAX_ORDER Version 6 of the ARM architecture introduces the concept of 16MB pages (supersections) and 36-bit (40-bit actually, but nobody uses this) physical addresses. 36-bit addressed memory and I/O and ARMv6 can only be mapped using supersections and the requirement on these is that both virtual and physical addresses be 16MB aligned. In trying to add support for ioremap() of 36-bit I/O, we run into the issue that get_vm_area() allows for a maximum of 512K alignment via the IOREMAP_MAX_ORDER constant. To work around this, we can: - Allocate a larger VM area than needed (size + (1ul << IOREMAP_MAX_ORDER)) and then align the pointer ourselves, but this ends up with 512K of wasted VM per ioremap(). - Provide a new __get_vm_area_aligned() API and make __get_vm_area() sit on top of this. I did this and it works but I don't like the idea of adding another VM API just for this one case. - My preferred solution, which is to allow the architecture to override the IOREMAP_MAX_ORDER constant with its own version (see the sketch below).
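What such an override typically looks like (a sketch: the #ifndef guard and the placement of the fallback in a shared header are assumptions for illustration, not quoted from this patch):

/* generic header: provide a default unless the architecture has
 * already defined a stricter alignment requirement */
#ifndef IOREMAP_MAX_ORDER
#define IOREMAP_MAX_ORDER	(7 + PAGE_SHIFT)	/* 128 pages */
#endif

/* hypothetical arch header: ARMv6 supersections want 16MB alignment */
/* #define IOREMAP_MAX_ORDER	24 */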
Signed-off-by: Deepak Saxena Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 8ff16a1eee6..67b358e57ef 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -158,8 +158,6 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) return err; } -#define IOREMAP_MAX_ORDER (7 + PAGE_SHIFT) /* 128 pages */ - struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, unsigned long start, unsigned long end) { -- cgit v1.2.3 From 7bf07f3d4b4358aa6d99a26d7a0165f1e91c3fcc Mon Sep 17 00:00:00 2001 From: Adam Litke Date: Sat, 3 Sep 2005 15:55:00 -0700 Subject: [PATCH] hugetlb: move stale pte check into huge_pte_alloc() Initial Post (Wed, 17 Aug 2005) This patch moves the if (! pte_none(*pte)) hugetlb_clean_stale_pgtable(pte); logic into huge_pte_alloc() so all of its callers can be immune to the bug described by Kenneth Chen at http://lkml.org/lkml/2004/6/16/246 > It turns out there is a bug in hugetlb_prefault(): with 3 level page table, > huge_pte_alloc() might return a pmd that points to a PTE page. It happens > if the virtual address for hugetlb mmap is recycled from previously used > normal page mmap. free_pgtables() might not scrub the pmd entry on > munmap and hugetlb_prefault skips on any pmd presence regardless what type > it is. Unless I am missing something, it seems more correct to place the check inside huge_pte_alloc() to prevent the same bug wherever a huge pte is allocated. It also allows checking for this condition when lazily faulting huge pages later in the series. Signed-off-by: Adam Litke Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6bf720bc662..901ac523a1c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -360,8 +360,6 @@ int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma) ret = -ENOMEM; goto out; } - if (! pte_none(*pte)) - hugetlb_clean_stale_pgtable(pte); idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); -- cgit v1.2.3 From fa5b08d5f818063d18433194f20359ef2ae50254 Mon Sep 17 00:00:00 2001 From: Kyle Moffett Date: Sat, 3 Sep 2005 15:55:03 -0700 Subject: [PATCH] slab: consolidate kmem_bufctl_t This is used only in slab.c and each architecture gets to define which underlying type is to be used. Seems a bit silly - move it to slab.c and use the same type for all architectures: unsigned int. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index ae6cca04de4..59d382fbca1 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -189,6 +189,7 @@ * is less than 512 (PAGE_SIZE<<3), but greater than 256. */ +typedef unsigned int kmem_bufctl_t; #define BUFCTL_END (((kmem_bufctl_t)(~0U))-0) #define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1) #define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-2) -- cgit v1.2.3 From a600388d28419305aad3c4c0af52c223cf6fa0af Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Sat, 3 Sep 2005 15:55:04 -0700 Subject: [PATCH] x86: ptep_clear optimization Add a new accessor for PTEs, which passes the full hint from the mmu_gather struct; this allows architectures with hardware pagetables to optimize away atomic PTE operations when destroying an address space.
Removing the locked operation should allow better pipelining of memory access in this loop. I measured an average savings of 30-35 cycles per zap_pte_range on the first 500 destructions on Pentium-M, but I believe the optimization would win more on older processors which still assert the bus lock on xchg for an exclusive cacheline. Update: I made some new measurements, and this saves exactly 26 cycles over ptep_get_and_clear on Pentium M. On P4, with a PAE kernel, this saves 180 cycles per ptep_get_and_clear, for a whopping 92160 cycles savings for a full address space destruction. pte_clear_full is not yet used, but is provided for future optimizations (in particular, when running inside of a hypervisor that queues page table updates, the full hint allows us to avoid queueing unnecessary page table updates for an address space in the process of being destroyed). This is not a huge win, but it does help a bit, and sets the stage for further hypervisor optimization of the mm layer on all architectures. Signed-off-by: Zachary Amsden Cc: Christoph Lameter Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index b25f5e58a14..788a6281034 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -562,7 +562,8 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd, page->index > details->last_index)) continue; } - ptent = ptep_get_and_clear(tlb->mm, addr, pte); + ptent = ptep_get_and_clear_full(tlb->mm, addr, pte, + tlb->fullmm); tlb_remove_tlb_entry(tlb, pte, addr); if (unlikely(!page)) continue; @@ -590,7 +591,7 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd, continue; if (!pte_file(ptent)) free_swap_and_cache(pte_to_swp_entry(ptent)); - pte_clear(tlb->mm, addr, pte); + pte_clear_full(tlb->mm, addr, pte, tlb->fullmm); } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap(pte - 1); } -- cgit v1.2.3 From 34342e863c3143640c031760140d640a06c6a5f8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 3 Sep 2005 15:55:06 -0700 Subject: [PATCH] mm/slab.c: prefetchw the start of new allocated objects Most objects returned by __cache_alloc() will be written by the caller (though not all callers want to write the whole object, just the beginning). prefetchw() tells a modern CPU to think about the future writes, i.e. start some memory transactions in advance. Signed-off-by: Eric Dumazet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 59d382fbca1..75127a6f1fd 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2166,7 +2166,9 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, unsigned int __nocast fl objp = cache_alloc_refill(cachep, flags); } local_irq_restore(save_flags); - objp = cache_alloc_debugcheck_after(cachep, flags, objp, __builtin_return_address(0)); + objp = cache_alloc_debugcheck_after(cachep, flags, objp, + __builtin_return_address(0)); + prefetchw(objp); return objp; } -- cgit v1.2.3 From 00e145b6d59a16dd7740197a18f7abdb3af004a9 Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Sat, 3 Sep 2005 15:55:07 -0700 Subject: [PATCH] slab: removes local_irq_save()/local_irq_restore() pair Proposed by and based on a patch from Eric Dumazet : This patch removes an unnecessary critical section in the ksize() function, as cli/sti are rather expensive on modern CPUs.
It additionally adds a docbook entry for ksize() and further simplifies the code. Signed-Off-By: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 75127a6f1fd..a9ff4f7f986 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3076,20 +3076,24 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, } #endif +/** + * ksize - get the actual amount of memory allocated for a given object + * @objp: Pointer to the object + * + * kmalloc may internally round up allocations and return more memory + * than requested. ksize() can be used to determine the actual amount of + * memory allocated. The caller may use this additional memory, even though + * a smaller amount of memory was initially specified with the kmalloc call. + * The caller must guarantee that objp points to a valid object previously + * allocated with either kmalloc() or kmem_cache_alloc(). The object + * must not be freed during the duration of the call. + */ unsigned int ksize(const void *objp) { - kmem_cache_t *c; - unsigned long flags; - unsigned int size = 0; - - if (likely(objp != NULL)) { - local_irq_save(flags); - c = GET_PAGE_CACHE(virt_to_page(objp)); - size = kmem_cache_size(c); - local_irq_restore(flags); - } + if (unlikely(objp == NULL)) + return 0; - return size; + return obj_reallen(GET_PAGE_CACHE(virt_to_page(objp))); } -- cgit v1.2.3 From c07e02db76940c75fc92f2f2c9adcdbb09ed70d0 Mon Sep 17 00:00:00 2001 From: Martin Hicks Date: Sat, 3 Sep 2005 15:55:11 -0700 Subject: [PATCH] VM: add page_state info to per-node meminfo Add page_state info to the per-node meminfo file in sysfs. This is mostly just for informational purposes. The lack of this information was brought up recently during a discussion regarding pagecache clearing, and I put this patch together to test out one of the suggestions. It seems like interesting info to have, so I'm submitting the patch. 
Signed-off-by: Martin Hicks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d157dae8c9f..b06a9636d97 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1130,19 +1130,20 @@ EXPORT_SYMBOL(nr_pagecache); DEFINE_PER_CPU(long, nr_pagecache_local) = 0; #endif -void __get_page_state(struct page_state *ret, int nr) +void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) { int cpu = 0; memset(ret, 0, sizeof(*ret)); + cpus_and(*cpumask, *cpumask, cpu_online_map); - cpu = first_cpu(cpu_online_map); + cpu = first_cpu(*cpumask); while (cpu < NR_CPUS) { unsigned long *in, *out, off; in = (unsigned long *)&per_cpu(page_states, cpu); - cpu = next_cpu(cpu, cpu_online_map); + cpu = next_cpu(cpu, *cpumask); if (cpu < NR_CPUS) prefetch(&per_cpu(page_states, cpu)); @@ -1153,19 +1154,33 @@ void __get_page_state(struct page_state *ret, int nr) } } +void get_page_state_node(struct page_state *ret, int node) +{ + int nr; + cpumask_t mask = node_to_cpumask(node); + + nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); + nr /= sizeof(unsigned long); + + __get_page_state(ret, nr+1, &mask); +} + void get_page_state(struct page_state *ret) { int nr; + cpumask_t mask = CPU_MASK_ALL; nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); nr /= sizeof(unsigned long); - __get_page_state(ret, nr + 1); + __get_page_state(ret, nr + 1, &mask); } void get_full_page_state(struct page_state *ret) { - __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long)); + cpumask_t mask = CPU_MASK_ALL; + + __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask); } unsigned long __read_page_state(unsigned long offset) -- cgit v1.2.3 From f549d6c18c0e8e6cf1bf0e7a47acc1daf7e2cec1 Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Sat, 3 Sep 2005 15:55:18 -0700 Subject: [PATCH] Generic VFS fallback for security xattrs This patch modifies the VFS setxattr, getxattr, and listxattr code to fall back to the security module for security xattrs if the filesystem does not support xattrs natively. This allows security modules to export the incore inode security label information to userspace even if the filesystem does not provide xattr storage, and eliminates the need to individually patch various pseudo filesystem types to provide such access. The patch removes the existing xattr code from devpts and tmpfs as it is then no longer needed. The patch restructures the code flow slightly to reduce duplication between the normal path and the fallback path, but this should only have one user-visible side effect - a program may get -EACCES rather than -EOPNOTSUPP if policy denied access but the filesystem didn't support the operation anyway. Note that the post_setxattr hook call is not needed in the fallback case, as the inode_setsecurity hook call handles the incore inode security state update directly. In contrast, we do call fsnotify in both cases. 
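From userspace the effect is simply that the generic xattr syscalls start working on such filesystems; a small sketch follows (the path and the security.selinux attribute name are only examples here, since the attribute actually exposed depends on the active security module):

#include <sys/xattr.h>
#include <stdio.h>

int main(void)
{
	char buf[256];
	/* devpts has no xattr storage; with the fallback the label still
	 * comes back from the security module's in-core inode state */
	ssize_t n = getxattr("/dev/pts/0", "security.selinux",
			     buf, sizeof(buf) - 1);

	if (n < 0) {
		perror("getxattr");
		return 1;
	}
	buf[n] = '\0';
	printf("security.selinux = %s\n", buf);
	return 0;
}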
Signed-off-by: Stephen Smalley Acked-by: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 85 -------------------------------------------------------------- 1 file changed, 85 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 08a3bc2fba6..bdc4bbb6ddb 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -45,7 +45,6 @@ #include #include #include -#include #include #include #include @@ -179,7 +178,6 @@ static struct address_space_operations shmem_aops; static struct file_operations shmem_file_operations; static struct inode_operations shmem_inode_operations; static struct inode_operations shmem_dir_inode_operations; -static struct inode_operations shmem_special_inode_operations; static struct vm_operations_struct shmem_vm_ops; static struct backing_dev_info shmem_backing_dev_info = { @@ -1300,7 +1298,6 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) switch (mode & S_IFMT) { default: - inode->i_op = &shmem_special_inode_operations; init_special_inode(inode, mode, dev); break; case S_IFREG: @@ -1804,12 +1801,6 @@ static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *co static struct inode_operations shmem_symlink_inline_operations = { .readlink = generic_readlink, .follow_link = shmem_follow_link_inline, -#ifdef CONFIG_TMPFS_XATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .listxattr = generic_listxattr, - .removexattr = generic_removexattr, -#endif }; static struct inode_operations shmem_symlink_inode_operations = { @@ -1817,12 +1808,6 @@ static struct inode_operations shmem_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = shmem_follow_link, .put_link = shmem_put_link, -#ifdef CONFIG_TMPFS_XATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .listxattr = generic_listxattr, - .removexattr = generic_removexattr, -#endif }; static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid, unsigned long *blocks, unsigned long *inodes) @@ -1942,12 +1927,6 @@ static void shmem_put_super(struct super_block *sb) sb->s_fs_info = NULL; } -#ifdef CONFIG_TMPFS_XATTR -static struct xattr_handler *shmem_xattr_handlers[]; -#else -#define shmem_xattr_handlers NULL -#endif - static int shmem_fill_super(struct super_block *sb, void *data, int silent) { @@ -1998,7 +1977,6 @@ static int shmem_fill_super(struct super_block *sb, sb->s_blocksize_bits = PAGE_CACHE_SHIFT; sb->s_magic = TMPFS_MAGIC; sb->s_op = &shmem_ops; - sb->s_xattr = shmem_xattr_handlers; inode = shmem_get_inode(sb, S_IFDIR | mode, 0); if (!inode) @@ -2087,12 +2065,6 @@ static struct file_operations shmem_file_operations = { static struct inode_operations shmem_inode_operations = { .truncate = shmem_truncate, .setattr = shmem_notify_change, -#ifdef CONFIG_TMPFS_XATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .listxattr = generic_listxattr, - .removexattr = generic_removexattr, -#endif }; static struct inode_operations shmem_dir_inode_operations = { @@ -2106,21 +2078,6 @@ static struct inode_operations shmem_dir_inode_operations = { .rmdir = shmem_rmdir, .mknod = shmem_mknod, .rename = shmem_rename, -#ifdef CONFIG_TMPFS_XATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .listxattr = generic_listxattr, - .removexattr = generic_removexattr, -#endif -#endif -}; - -static struct inode_operations shmem_special_inode_operations = { -#ifdef CONFIG_TMPFS_XATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .listxattr = 
generic_listxattr, - .removexattr = generic_removexattr, #endif }; @@ -2146,48 +2103,6 @@ static struct vm_operations_struct shmem_vm_ops = { }; -#ifdef CONFIG_TMPFS_SECURITY - -static size_t shmem_xattr_security_list(struct inode *inode, char *list, size_t list_len, - const char *name, size_t name_len) -{ - return security_inode_listsecurity(inode, list, list_len); -} - -static int shmem_xattr_security_get(struct inode *inode, const char *name, void *buffer, size_t size) -{ - if (strcmp(name, "") == 0) - return -EINVAL; - return security_inode_getsecurity(inode, name, buffer, size); -} - -static int shmem_xattr_security_set(struct inode *inode, const char *name, const void *value, size_t size, int flags) -{ - if (strcmp(name, "") == 0) - return -EINVAL; - return security_inode_setsecurity(inode, name, value, size, flags); -} - -static struct xattr_handler shmem_xattr_security_handler = { - .prefix = XATTR_SECURITY_PREFIX, - .list = shmem_xattr_security_list, - .get = shmem_xattr_security_get, - .set = shmem_xattr_security_set, -}; - -#endif /* CONFIG_TMPFS_SECURITY */ - -#ifdef CONFIG_TMPFS_XATTR - -static struct xattr_handler *shmem_xattr_handlers[] = { -#ifdef CONFIG_TMPFS_SECURITY - &shmem_xattr_security_handler, -#endif - NULL -}; - -#endif /* CONFIG_TMPFS_XATTR */ - static struct super_block *shmem_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { -- cgit v1.2.3 From c3d8c1414573be8cf7c8fdc1e076935697c7f6af Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 6 Sep 2005 15:16:33 -0700 Subject: [PATCH] More __read_mostly variables Move some more frequently read variables that showed up during some of our performance tests as sometimes ending up in hot cachelines to the read_mostly section. Fix: Move the __read_mostly from before hpet_usec_quotient to follow the variable like the other uses of __read_mostly. 
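For readers unfamiliar with the annotation, it is just a section attribute (the exact section name below is an assumption for illustration, not taken from this patch); the point is that rarely-written variables are grouped away from frequently-dirtied cachelines:

/* illustrative definition of the annotation */
#define __read_mostly	__attribute__((__section__(".data.read_mostly")))

/* placement follows the variable name, as the fix above notes */
static unsigned long example_quotient __read_mostly;	/* hypothetical variable */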
Signed-off-by: Alok N Kataria Signed-off-by: Christoph Lameter Signed-off-by: Shai Fultheim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 2 +- mm/page_alloc.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 404319477e7..bb43340d397 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -61,7 +61,7 @@ pgprot_t protection_map[16] = { int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio = 50; /* default is 50% */ -int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; +int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; atomic_t vm_committed_space = ATOMIC_INIT(0); /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b06a9636d97..34bba8f1144 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -42,11 +42,11 @@ * MCD - HACK: Find somewhere to initialize this EARLY, or make this * initializer cleaner */ -nodemask_t node_online_map = { { [0] = 1UL } }; +nodemask_t node_online_map __read_mostly = { { [0] = 1UL } }; EXPORT_SYMBOL(node_online_map); -nodemask_t node_possible_map = NODE_MASK_ALL; +nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; EXPORT_SYMBOL(node_possible_map); -struct pglist_data *pgdat_list; +struct pglist_data *pgdat_list __read_mostly; unsigned long totalram_pages; unsigned long totalhigh_pages; long nr_swap_pages; @@ -68,7 +68,7 @@ EXPORT_SYMBOL(nr_swap_pages); * Used by page_zone() to look up the address of the struct zone whose * id is encoded in the upper bits of page->flags */ -struct zone *zone_table[1 << ZONETABLE_SHIFT]; +struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly; EXPORT_SYMBOL(zone_table); static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; -- cgit v1.2.3 From cdb3826b9958c204bc8ffda2cf9bbe2d899ef90c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Sep 2005 15:16:57 -0700 Subject: [PATCH] remove misleading comment above sys_brk Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index bb43340d397..12334aecf8a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -203,13 +203,6 @@ static void remove_vm_struct(struct vm_area_struct *vma) kmem_cache_free(vm_area_cachep, vma); } -/* - * sys_brk() for the most part doesn't need the global kernel - * lock, except when an application is doing something nasty - * like trying to un-brk an area that has already been mapped - * to a regular file. in this case, the unmapping will need - * to invoke file system routines that need the global lock. - */ asmlinkage unsigned long sys_brk(unsigned long brk) { unsigned long rlim, retval; -- cgit v1.2.3 From 3b30bbd963ac2606b0377b39c9d148d6eeef7dce Mon Sep 17 00:00:00 2001 From: Steven Pratt Date: Tue, 6 Sep 2005 15:17:06 -0700 Subject: [PATCH] readahead: reset cache_hit earlier We don't reset the cache hit count until after readahead does a successful readahead. This seems to leave a corner case open where we miss in cache, but don't restart the readhead right away. 
Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/readahead.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/readahead.c b/mm/readahead.c index b840e7c6ea7..d0b50034e24 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -540,6 +540,7 @@ void handle_ra_miss(struct address_space *mapping, { ra->flags |= RA_FLAG_MISS; ra->flags &= ~RA_FLAG_INCACHE; + ra->cache_hit = 0; } /* -- cgit v1.2.3 From 6c231b7bab0aa6860cd9da2de8a064eddc34c146 Mon Sep 17 00:00:00 2001 From: Ravikiran G Thirumalai Date: Tue, 6 Sep 2005 15:17:45 -0700 Subject: [PATCH] Additions to .data.read_mostly section Mark variables which are usually accessed for reads with __readmostly. Signed-off-by: Alok N Kataria Signed-off-by: Shai Fultheim Signed-off-by: Ravikiran Thirumalai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++-- mm/shmem.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 34bba8f1144..14d7032c1d1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -47,8 +47,8 @@ EXPORT_SYMBOL(node_online_map); nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; EXPORT_SYMBOL(node_possible_map); struct pglist_data *pgdat_list __read_mostly; -unsigned long totalram_pages; -unsigned long totalhigh_pages; +unsigned long totalram_pages __read_mostly; +unsigned long totalhigh_pages __read_mostly; long nr_swap_pages; /* diff --git a/mm/shmem.c b/mm/shmem.c index bdc4bbb6ddb..db2c9e8d990 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -180,7 +180,7 @@ static struct inode_operations shmem_inode_operations; static struct inode_operations shmem_dir_inode_operations; static struct vm_operations_struct shmem_vm_ops; -static struct backing_dev_info shmem_backing_dev_info = { +static struct backing_dev_info shmem_backing_dev_info __read_mostly = { .ra_pages = 0, /* No readahead */ .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, .unplug_io_fn = default_unplug_io_fn, -- cgit v1.2.3 From a49335cceab8afb6603152fcc3f7d3b6677366ca Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Tue, 6 Sep 2005 15:18:09 -0700 Subject: [PATCH] cpusets: oom_kill tweaks This patch series extends the use of the cpuset attribute 'mem_exclusive' to support cpuset configurations that: 1) allow GFP_KERNEL allocations to come from a potentially larger set of memory nodes than GFP_USER allocations, and 2) can constrain the oom killer to tasks running in cpusets in a specified subtree of the cpuset hierarchy. Here's an example usage scenario. For a few hours or more, a large NUMA system at a University is to be divided in two halves, with a bunch of student jobs running in half the system under some form of batch manager, and with a big research project running in the other half. Each of the student jobs is placed in a small cpuset, but should share the classic Unix time share facilities, such as buffered pages of files in /bin and /usr/lib. The big research project wants no interference whatsoever from the student jobs, and has highly tuned, unusual memory and i/o patterns that intend to make full use of all the main memory on the nodes available to it. In this example, we have two big sibling cpusets, one of which is further divided into a more dynamic set of child cpusets. We want kernel memory allocations constrained by the two big cpusets, and user allocations constrained by the smaller child cpusets where present. 
And we require that the oom killer not operate across the two halves of this system, or else the first time a student job runs amuck, the big research project will likely be first in line to get shot. Tweaking /proc//oom_adj is not ideal -- if the big research project really does run amuck allocating memory, it should be shot, not some other task outside the research project's mem_exclusive cpuset. I propose to extend the use of the 'mem_exclusive' flag of cpusets to manage such scenarios. Let memory allocations for user space (GFP_USER) be constrained by a task's current cpuset, but memory allocations for kernel space (GFP_KERNEL) be constrained by the nearest mem_exclusive ancestor of the current cpuset, even though kernel space allocations will still _prefer_ to remain within the current task's cpuset, if memory is easily available. Let the oom killer be constrained to consider only tasks that are in overlapping mem_exclusive cpusets (it won't help much to kill a task that normally cannot allocate memory on any of the same nodes as the ones on which the current task can allocate.) The current constraints imposed on setting mem_exclusive are unchanged. A cpuset may only be mem_exclusive if its parent is also mem_exclusive, and a mem_exclusive cpuset may not overlap any of its siblings' memory nodes. This patch was presented on linux-mm in early July 2005, though did not generate much feedback at that time. It has been built for a variety of arch's using cross tools, and built, booted and tested for function on SN2 (ia64). There are 4 patches in this set: 1) Some minor cleanup, and some improvements to the code layout of one routine to make subsequent patches cleaner. 2) Add another GFP flag - __GFP_HARDWALL. It marks memory requests for USER space, which are tightly confined by the current task's cpuset. 3) Now memory requests (such as KERNEL) that are not marked HARDWALL can, if short on memory, look in the potentially larger pool of memory defined by the nearest mem_exclusive ancestor cpuset of the current task's cpuset. 4) Finally, modify the oom killer to skip any task whose mem_exclusive cpuset doesn't overlap ours. Patch (1), the one time I looked on an SN2 (ia64) build, actually saved 32 bytes of kernel text space. Patch (2) has no effect on the size of kernel text space (it just adds a preprocessor flag). Patches (3) and (4) added about 600 bytes each of kernel text space, mostly in kernel/cpuset.c, which matters only if CONFIG_CPUSET is enabled. This patch: This patch applies a few comment and code cleanups to mm/oom_kill.c prior to applying a few small patches to improve cpuset management of memory placement. The comment changed in oom_kill.c was seriously misleading. The code layout change in select_bad_process() makes room for adding another condition on which a process can be spared the oom killer (see the subsequent cpuset_nodes_overlap patch for this addition). Also a couple typos and spellos that bugged me, while I was here. This patch should have no material effect. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 57 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 31 insertions(+), 26 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1e56076672f..3a1d4650293 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -6,8 +6,8 @@ * for goading me into coding this file... * * The routines in this file are used to kill a process when - * we're seriously out of memory.
This gets called from kswapd() - * in linux/mm/vmscan.c when we really run out of memory. + * we're seriously out of memory. This gets called from __alloc_pages() + * in mm/page_alloc.c when we really run out of memory. * * Since we won't call these routines often (on a well-configured * machine) this file will double as a 'coding guide' and a signpost @@ -26,7 +26,7 @@ /** * oom_badness - calculate a numeric value for how bad this task has been * @p: task struct of which task we should calculate - * @p: current uptime in seconds + * @uptime: current uptime in seconds * * The formula used is relatively simple and documented inline in the * function. The main rationale is that we want to select a good task @@ -57,9 +57,9 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) /* * Processes which fork a lot of child processes are likely - * a good choice. We add the vmsize of the childs if they + * a good choice. We add the vmsize of the children if they * have an own mm. This prevents forking servers to flood the - * machine with an endless amount of childs + * machine with an endless amount of children */ list_for_each(tsk, &p->children) { struct task_struct *chld; @@ -143,28 +143,32 @@ static struct task_struct * select_bad_process(void) struct timespec uptime; do_posix_clock_monotonic_gettime(&uptime); - do_each_thread(g, p) + do_each_thread(g, p) { + unsigned long points; + int releasing; + /* skip the init task with pid == 1 */ - if (p->pid > 1 && p->oomkilladj != OOM_DISABLE) { - unsigned long points; - - /* - * This is in the process of releasing memory so wait it - * to finish before killing some other task by mistake. - */ - if ((unlikely(test_tsk_thread_flag(p, TIF_MEMDIE)) || (p->flags & PF_EXITING)) && - !(p->flags & PF_DEAD)) - return ERR_PTR(-1UL); - if (p->flags & PF_SWAPOFF) - return p; - - points = badness(p, uptime.tv_sec); - if (points > maxpoints || !chosen) { - chosen = p; - maxpoints = points; - } + if (p->pid == 1) + continue; + if (p->oomkilladj == OOM_DISABLE) + continue; + /* + * This is in the process of releasing memory so for wait it + * to finish before killing some other task by mistake. + */ + releasing = test_tsk_thread_flag(p, TIF_MEMDIE) || + p->flags & PF_EXITING; + if (releasing && !(p->flags & PF_DEAD)) + return ERR_PTR(-1UL); + if (p->flags & PF_SWAPOFF) + return p; + + points = badness(p, uptime.tv_sec); + if (points > maxpoints || !chosen) { + chosen = p; + maxpoints = points; } - while_each_thread(g, p); + } while_each_thread(g, p); return chosen; } @@ -189,7 +193,8 @@ static void __oom_kill_task(task_t *p) return; } task_unlock(p); - printk(KERN_ERR "Out of Memory: Killed process %d (%s).\n", p->pid, p->comm); + printk(KERN_ERR "Out of Memory: Killed process %d (%s).\n", + p->pid, p->comm); /* * We give our sacrificial lamb high priority and access to -- cgit v1.2.3 From 9bf2229f8817677127a60c177aefce1badd22d7b Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Tue, 6 Sep 2005 15:18:12 -0700 Subject: [PATCH] cpusets: formalize intermediate GFP_KERNEL containment This patch makes use of the previously underutilized cpuset flag 'mem_exclusive' to provide what amounts to another layer of memory placement resolution. 
With this patch, there are now the following four layers of memory placement available: 1) The whole system (interrupt and GFP_ATOMIC allocations can use this), 2) The nearest enclosing mem_exclusive cpuset (GFP_KERNEL allocations can use), 3) The current tasks cpuset (GFP_USER allocations constrained to here), and 4) Specific node placement, using mbind and set_mempolicy. These nest - each layer is a subset (same or within) of the previous. Layer (2) above is new, with this patch. The call used to check whether a zone (its node, actually) is in a cpuset (in its mems_allowed, actually) is extended to take a gfp_mask argument, and its logic is extended, in the case that __GFP_HARDWALL is not set in the flag bits, to look up the cpuset hierarchy for the nearest enclosing mem_exclusive cpuset, to determine if placement is allowed. The definition of GFP_USER, which used to be identical to GFP_KERNEL, is changed to also set the __GFP_HARDWALL bit, in the previous cpuset_gfp_hardwall_flag patch. GFP_ATOMIC and GFP_KERNEL allocations will stay within the current tasks cpuset, so long as any node therein is not too tight on memory, but will escape to the larger layer, if need be. The intended use is to allow something like a batch manager to handle several jobs, each job in its own cpuset, but using common kernel memory for caches and such. Swapper and oom_kill activity is also constrained to Layer (2). A task in or below one mem_exclusive cpuset should not cause swapping on nodes in another non-overlapping mem_exclusive cpuset, nor provoke oom_killing of a task in another such cpuset. Heavy use of kernel memory for i/o caching and such by one job should not impact the memory available to jobs in other non-overlapping mem_exclusive cpusets. This patch enables providing hardwall, inescapable cpusets for memory allocations of each job, while sharing kernel memory allocations between several jobs, in an enclosing mem_exclusive cpuset. Like Dinakar's patch earlier to enable administering sched domains using the cpu_exclusive flag, this patch also provides a useful meaning to a cpuset flag that had previously done nothing much useful other than restrict what cpuset configurations were allowed. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 16 ++++++++++------ mm/vmscan.c | 8 ++++---- 2 files changed, 14 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 14d7032c1d1..3974fd81d27 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -806,11 +806,14 @@ __alloc_pages(unsigned int __nocast gfp_mask, unsigned int order, classzone_idx = zone_idx(zones[0]); restart: - /* Go through the zonelist once, looking for a zone with enough free */ + /* + * Go through the zonelist once, looking for a zone with enough free. + * See also cpuset_zone_allowed() comment in kernel/cpuset.c. + */ for (i = 0; (z = zones[i]) != NULL; i++) { int do_reclaim = should_reclaim_zone(z, gfp_mask); - if (!cpuset_zone_allowed(z)) + if (!cpuset_zone_allowed(z, __GFP_HARDWALL)) continue; /* @@ -845,6 +848,7 @@ zone_reclaim_retry: * * This is the last chance, in general, before the goto nopage. * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. + * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 
*/ for (i = 0; (z = zones[i]) != NULL; i++) { if (!zone_watermark_ok(z, order, z->pages_min, @@ -852,7 +856,7 @@ zone_reclaim_retry: gfp_mask & __GFP_HIGH)) continue; - if (wait && !cpuset_zone_allowed(z)) + if (wait && !cpuset_zone_allowed(z, gfp_mask)) continue; page = buffered_rmqueue(z, order, gfp_mask); @@ -867,7 +871,7 @@ zone_reclaim_retry: if (!(gfp_mask & __GFP_NOMEMALLOC)) { /* go through the zonelist yet again, ignoring mins */ for (i = 0; (z = zones[i]) != NULL; i++) { - if (!cpuset_zone_allowed(z)) + if (!cpuset_zone_allowed(z, gfp_mask)) continue; page = buffered_rmqueue(z, order, gfp_mask); if (page) @@ -903,7 +907,7 @@ rebalance: gfp_mask & __GFP_HIGH)) continue; - if (!cpuset_zone_allowed(z)) + if (!cpuset_zone_allowed(z, gfp_mask)) continue; page = buffered_rmqueue(z, order, gfp_mask); @@ -922,7 +926,7 @@ rebalance: classzone_idx, 0, 0)) continue; - if (!cpuset_zone_allowed(z)) + if (!cpuset_zone_allowed(z, __GFP_HARDWALL)) continue; page = buffered_rmqueue(z, order, gfp_mask); diff --git a/mm/vmscan.c b/mm/vmscan.c index 0095533cdde..a740778f688 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -894,7 +894,7 @@ shrink_caches(struct zone **zones, struct scan_control *sc) if (zone->present_pages == 0) continue; - if (!cpuset_zone_allowed(zone)) + if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) continue; zone->temp_priority = sc->priority; @@ -940,7 +940,7 @@ int try_to_free_pages(struct zone **zones, unsigned int gfp_mask) for (i = 0; zones[i] != NULL; i++) { struct zone *zone = zones[i]; - if (!cpuset_zone_allowed(zone)) + if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) continue; zone->temp_priority = DEF_PRIORITY; @@ -986,7 +986,7 @@ out: for (i = 0; zones[i] != 0; i++) { struct zone *zone = zones[i]; - if (!cpuset_zone_allowed(zone)) + if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) continue; zone->prev_priority = zone->temp_priority; @@ -1256,7 +1256,7 @@ void wakeup_kswapd(struct zone *zone, int order) return; if (pgdat->kswapd_max_order < order) pgdat->kswapd_max_order = order; - if (!cpuset_zone_allowed(zone)) + if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) return; if (!waitqueue_active(&zone->zone_pgdat->kswapd_wait)) return; -- cgit v1.2.3 From ef08e3b4981aebf2ba9bd7025ef7210e8eec07ce Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Tue, 6 Sep 2005 15:18:13 -0700 Subject: [PATCH] cpusets: confine oom_killer to mem_exclusive cpuset Now the real motivation for this cpuset mem_exclusive patch series seems trivial. This patch keeps a task in or under one mem_exclusive cpuset from provoking an oom kill of a task under a non-overlapping mem_exclusive cpuset. Since only interrupt and GFP_ATOMIC allocations are allowed to escape mem_exclusive containment, there is little to gain from oom killing a task under a non-overlapping mem_exclusive cpuset, as almost all kernel and user memory allocation must come from disjoint memory nodes. This patch enables configuring a system so that a runaway job under one mem_exclusive cpuset cannot cause the killing of a job in another such cpuset that might be using very high compute and memory resources for a prolonged time. 
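The placement rule this series builds up can be summarized with a small standalone sketch (a simplified userspace model; the real check lives in kernel/cpuset.c, which is not part of this mm/ log): hardwalled (GFP_USER-style) requests must stay within the task's own cpuset, while other requests may fall back to the nearest mem_exclusive ancestor.

struct cpuset_model {
	struct cpuset_model *parent;
	int mem_exclusive;
	unsigned long mems_allowed;	/* toy node bitmask */
};

static int node_allowed(const struct cpuset_model *cs, int node, int hardwall)
{
	if (!hardwall)
		/* walk up to the nearest mem_exclusive ancestor */
		while (cs->parent && !cs->mem_exclusive)
			cs = cs->parent;
	return (cs->mems_allowed >> node) & 1;
}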
Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3a1d4650293..5ec8da12cfd 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -20,6 +20,7 @@ #include #include #include +#include /* #define DEBUG */ @@ -152,6 +153,10 @@ static struct task_struct * select_bad_process(void) continue; if (p->oomkilladj == OOM_DISABLE) continue; + /* If p's nodes don't overlap ours, it won't help to kill p. */ + if (!cpuset_excl_nodes_overlap(p)) + continue; + /* * This is in the process of releasing memory so for wait it * to finish before killing some other task by mistake. -- cgit v1.2.3 From dd3927105b6f65afb7dac17682172cdfb86d3f00 Mon Sep 17 00:00:00 2001 From: Pekka J Enberg Date: Tue, 6 Sep 2005 15:18:31 -0700 Subject: [PATCH] introduce and use kzalloc This patch introduces a kzalloc wrapper and converts kernel/ to use it. It saves a little program text. Signed-off-by: Pekka Enberg Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index a9ff4f7f986..d7c4443991f 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2558,24 +2558,18 @@ void kmem_cache_free(kmem_cache_t *cachep, void *objp) EXPORT_SYMBOL(kmem_cache_free); /** - * kcalloc - allocate memory for an array. The memory is set to zero. - * @n: number of elements. - * @size: element size. + * kzalloc - allocate memory. The memory is set to zero. + * @size: how many bytes of memory are required. * @flags: the type of memory to allocate. */ -void *kcalloc(size_t n, size_t size, unsigned int __nocast flags) +void *kzalloc(size_t size, unsigned int __nocast flags) { - void *ret = NULL; - - if (n != 0 && size > INT_MAX / n) - return ret; - - ret = kmalloc(n * size, flags); + void *ret = kmalloc(size, flags); if (ret) - memset(ret, 0, n * size); + memset(ret, 0, size); return ret; } -EXPORT_SYMBOL(kcalloc); +EXPORT_SYMBOL(kzalloc); /** * kfree - free previously allocated memory -- cgit v1.2.3
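A typical caller-side conversion the new helper enables looks like this (kernel-context sketch; struct demo is a made-up example, not taken from this patch):

#include <linux/slab.h>

struct demo {			/* hypothetical structure */
	int count;
	void *buf;
};

static struct demo *demo_alloc(void)
{
	/* old style:
	 *	d = kmalloc(sizeof(*d), GFP_KERNEL);
	 *	if (d)
	 *		memset(d, 0, sizeof(*d));
	 */
	return kzalloc(sizeof(struct demo), GFP_KERNEL);
}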